diff --git a/.gitattributes b/.gitattributes
index 277a2e4c12ce11015736c998ee87238a26f36d8b..cff8e4d38e46eca75579188b248c2d6c5c3655a3 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -122,3 +122,9 @@ phivenv/Lib/site-packages/torch/lib/libprotoc.lib filter=lfs diff=lfs merge=lfs
 phivenv/Lib/site-packages/torch/lib/pthreadpool.lib filter=lfs diff=lfs merge=lfs -text
 phivenv/Lib/site-packages/torch/lib/microkernels-prod.lib filter=lfs diff=lfs merge=lfs -text
 phivenv/Lib/site-packages/torch/lib/sleef.lib filter=lfs diff=lfs merge=lfs -text
+phivenv/Lib/site-packages/torch/lib/torch_cpu.lib filter=lfs diff=lfs merge=lfs -text
+phivenv/Lib/site-packages/torch/lib/torch_python.dll filter=lfs diff=lfs merge=lfs -text
+phivenv/Lib/site-packages/torch/lib/torch_python.lib filter=lfs diff=lfs merge=lfs -text
+phivenv/Lib/site-packages/torch/lib/uv.dll filter=lfs diff=lfs merge=lfs -text
+phivenv/Lib/site-packages/torch/lib/XNNPACK.lib filter=lfs diff=lfs merge=lfs -text
+phivenv/Lib/site-packages/torch/linalg/__pycache__/__init__.cpython-39.pyc filter=lfs diff=lfs merge=lfs -text
diff --git a/phivenv/Lib/site-packages/torch/lib/XNNPACK.lib b/phivenv/Lib/site-packages/torch/lib/XNNPACK.lib
new file mode 100644
index 0000000000000000000000000000000000000000..6fb9720705ab12d6c6f047d05f3677ce4a8b9f6e
--- /dev/null
+++ b/phivenv/Lib/site-packages/torch/lib/XNNPACK.lib
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3bf5c98f694f4587f5a191739ea8dd565a0696828448828e7491b9c8ca5d6fe2
+size 14049460
diff --git a/phivenv/Lib/site-packages/torch/lib/torch_cpu.lib b/phivenv/Lib/site-packages/torch/lib/torch_cpu.lib
new file mode 100644
index 0000000000000000000000000000000000000000..e975a29666d4580db102caccc06152740c9ac10d
--- /dev/null
+++ b/phivenv/Lib/site-packages/torch/lib/torch_cpu.lib
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:08b81393191ac47ebf63e92aad8b65ece890d86dd51eb1e7294f1be3e496f3d7
+size 29046564
diff --git a/phivenv/Lib/site-packages/torch/lib/torch_python.dll b/phivenv/Lib/site-packages/torch/lib/torch_python.dll
new file mode 100644
index 0000000000000000000000000000000000000000..d9c3c71ff0777290164817d32b0ebddd6e64d1ec
--- /dev/null
+++ b/phivenv/Lib/site-packages/torch/lib/torch_python.dll
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2cd2897d163f1029341f3ae67fcccd5f4f4fe7b4d62ecae8ca767128e3140f73
+size 16310272
diff --git a/phivenv/Lib/site-packages/torch/lib/torch_python.lib b/phivenv/Lib/site-packages/torch/lib/torch_python.lib
new file mode 100644
index 0000000000000000000000000000000000000000..273ad0f9dce1e3dd0fe9a27545b52495f78b897b
--- /dev/null
+++ b/phivenv/Lib/site-packages/torch/lib/torch_python.lib
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8910178ebb14175b5c8b9ccd27b38b2360297399273b6dd3312bcc733b779529
+size 287836
diff --git a/phivenv/Lib/site-packages/torch/lib/uv.dll b/phivenv/Lib/site-packages/torch/lib/uv.dll
new file mode 100644
index 0000000000000000000000000000000000000000..698fa11ebd9e267ef04d2a9f4dc3ad71a6df9532
--- /dev/null
+++ b/phivenv/Lib/site-packages/torch/lib/uv.dll
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fa569e682fc5fb7a8eb94c6829af9f30a569748dbbc6bce39735d48bc960bcf8
+size 195072
diff --git a/phivenv/Lib/site-packages/torch/linalg/__pycache__/__init__.cpython-39.pyc b/phivenv/Lib/site-packages/torch/linalg/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fd0a9f0b7c2ad0a19cc448243ab9041e9c1b43be
--- /dev/null
+++ b/phivenv/Lib/site-packages/torch/linalg/__pycache__/__init__.cpython-39.pyc
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9aa23740f58a645a003b0df576478da22f00c55c5001053fba842208036eb483
+size 113386
diff --git a/phivenv/Lib/site-packages/transformers/models/d_fine/__init__.py b/phivenv/Lib/site-packages/transformers/models/d_fine/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..879b53709bc673bcf28553a51175f06fa1e362c0
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/d_fine/__init__.py
@@ -0,0 +1,29 @@
+# Copyright 2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:  # static type checkers only: import the real symbols eagerly
+    from .configuration_d_fine import *
+    from .modeling_d_fine import *
+else:  # at runtime: swap this package in `sys.modules` for a `_LazyModule`
+    import sys
+
+    _file = globals()["__file__"]  # path of this `__init__.py`
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/phivenv/Lib/site-packages/transformers/models/d_fine/__pycache__/__init__.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/d_fine/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cee44ef89aeb006c3b24866817bbbb04092f0a31
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/d_fine/__pycache__/__init__.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/d_fine/__pycache__/configuration_d_fine.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/d_fine/__pycache__/configuration_d_fine.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2376460e9d0f1b1cfbdb5d6ddd4f0a6b91003390
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/d_fine/__pycache__/configuration_d_fine.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/d_fine/__pycache__/modeling_d_fine.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/d_fine/__pycache__/modeling_d_fine.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..48d848c4cade54737a3b5edf2787f0f3c29be87c
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/d_fine/__pycache__/modeling_d_fine.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/d_fine/__pycache__/modular_d_fine.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/d_fine/__pycache__/modular_d_fine.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..41ccda9db980d8c81536361a40296cd9ad74d796
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/d_fine/__pycache__/modular_d_fine.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/d_fine/configuration_d_fine.py b/phivenv/Lib/site-packages/transformers/models/d_fine/configuration_d_fine.py
new file mode 100644
index 0000000000000000000000000000000000000000..7484d9a347e534f3ebfff5a0776a0413a5a416dc
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/d_fine/configuration_d_fine.py
@@ -0,0 +1,433 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/d_fine/modular_d_fine.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_d_fine.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# coding=utf-8
+# Copyright 2025 Baidu Inc and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+from ...utils.backbone_utils import verify_backbone_config_arguments
+from ..auto import CONFIG_MAPPING
+
+
+logger = logging.get_logger(__name__)
+
+
+# TODO: Attribute map assignment logic should be fixed in modular
+# as well as super() call parsing because otherwise we cannot re-write args after initialization
+class DFineConfig(PretrainedConfig):
+    """
+    This is the configuration class to store the configuration of a [`DFineModel`]. It is used to instantiate a D-FINE
+    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+    defaults will yield a similar configuration to that of D-FINE-X-COCO [ustc-community/dfine-xlarge-coco](https://huggingface.co/ustc-community/dfine-xlarge-coco).
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        initializer_range (`float`, *optional*, defaults to 0.01):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        initializer_bias_prior_prob (`float`, *optional*):
+            The prior probability used by the bias initializer to initialize biases for `enc_score_head` and `class_embed`.
+            If `None`, `prior_prob` computed as `prior_prob = 1 / (num_labels + 1)` while initializing model weights.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the layer normalization layers.
+        batch_norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the batch normalization layers.
+        backbone_config (`Dict`, *optional*, defaults to an `hgnet_v2` backbone config when `backbone` is also `None`):
+            The configuration of the backbone model.
+        backbone (`str`, *optional*):
+            Name of backbone to use when `backbone_config` is `None`. If `use_pretrained_backbone` is `True`, this
+            will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone`
+            is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights.
+        use_pretrained_backbone (`bool`, *optional*, defaults to `False`):
+            Whether to use pretrained weights for the backbone.
+        use_timm_backbone (`bool`, *optional*, defaults to `False`):
+            Whether to load `backbone` from the timm library. If `False`, the backbone is loaded from the transformers
+            library.
+        freeze_backbone_batch_norms (`bool`, *optional*, defaults to `True`):
+            Whether to freeze the batch normalization layers in the backbone.
+        backbone_kwargs (`dict`, *optional*):
+            Keyword arguments to be passed to AutoBackbone when loading from a checkpoint
+            e.g. `{'out_indices': (0, 1, 2, 3)}`. Cannot be specified if `backbone_config` is set.
+        encoder_hidden_dim (`int`, *optional*, defaults to 256):
+            Dimension of the layers in hybrid encoder.
+        encoder_in_channels (`list`, *optional*, defaults to `[512, 1024, 2048]`):
+            Multi level features input for encoder.
+        feat_strides (`list[int]`, *optional*, defaults to `[8, 16, 32]`):
+            Strides used in each feature map.
+        encoder_layers (`int`, *optional*, defaults to 1):
+            Total number of layers to be used by the encoder.
+        encoder_ffn_dim (`int`, *optional*, defaults to 1024):
+            Dimension of the "intermediate" (often named feed-forward) layer in the encoder.
+        encoder_attention_heads (`int`, *optional*, defaults to 8):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        dropout (`float`, *optional*, defaults to 0.0):
+            The ratio for all dropout layers.
+        activation_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for activations inside the fully connected layer.
+        encode_proj_layers (`list[int]`, *optional*, defaults to `[2]`):
+            Indexes of the projected layers to be used in the encoder.
+        positional_encoding_temperature (`int`, *optional*, defaults to 10000):
+            The temperature parameter used to create the positional encodings.
+        encoder_activation_function (`str`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"silu"` and `"gelu_new"` are supported.
+        activation_function (`str`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the general layer. If string, `"gelu"`,
+            `"relu"`, `"silu"` and `"gelu_new"` are supported.
+        eval_size (`tuple[int, int]`, *optional*):
+            Height and width used to compute the effective height and width of the position embeddings after taking
+            into account the stride.
+        normalize_before (`bool`, *optional*, defaults to `False`):
+            Determine whether to apply layer normalization in the transformer encoder layer before self-attention and
+            feed-forward modules.
+        hidden_expansion (`float`, *optional*, defaults to 1.0):
+            Expansion ratio to enlarge the dimension size of RepVGGBlock and CSPRepLayer.
+        d_model (`int`, *optional*, defaults to 256):
+            Dimension of the layers excluding the hybrid encoder.
+        num_queries (`int`, *optional*, defaults to 300):
+            Number of object queries.
+        decoder_in_channels (`list`, *optional*, defaults to `[256, 256, 256]`):
+            Multi level features dimension for decoder
+        decoder_ffn_dim (`int`, *optional*, defaults to 1024):
+            Dimension of the "intermediate" (often named feed-forward) layer in decoder.
+        num_feature_levels (`int`, *optional*, defaults to 3):
+            The number of input feature levels.
+        decoder_n_points (`int`, *optional*, defaults to 4):
+            The number of sampled keys in each feature level for each attention head in the decoder.
+        decoder_layers (`int`, *optional*, defaults to 6):
+            Number of decoder layers.
+        decoder_attention_heads (`int`, *optional*, defaults to 8):
+            Number of attention heads for each attention layer in the Transformer decoder.
+        decoder_activation_function (`str`, *optional*, defaults to `"relu"`):
+            The non-linear activation function (function or string) in the decoder. If string, `"gelu"`,
+            `"relu"`, `"silu"` and `"gelu_new"` are supported.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        num_denoising (`int`, *optional*, defaults to 100):
+            The total number of denoising tasks or queries to be used for contrastive denoising.
+        label_noise_ratio (`float`, *optional*, defaults to 0.5):
+            The fraction of denoising labels to which random noise should be added.
+        box_noise_scale (`float`, *optional*, defaults to 1.0):
+            Scale or magnitude of noise to be added to the bounding boxes.
+        learn_initial_query (`bool`, *optional*, defaults to `False`):
+            Indicates whether the initial query embeddings for the decoder should be learned during training
+        anchor_image_size (`tuple[int, int]`, *optional*):
+            Height and width of the input image used during evaluation to generate the bounding box anchors. If `None`, anchors are generated automatically.
+        with_box_refine (`bool`, *optional*, defaults to `True`):
+            Whether to apply iterative bounding box refinement, where each decoder layer refines the bounding boxes
+            based on the predictions from the previous layer.
+        is_encoder_decoder (`bool`, *optional*, defaults to `True`):
+            Whether the architecture has an encoder decoder structure.
+        matcher_alpha (`float`, *optional*, defaults to 0.25):
+            Parameter alpha used by the Hungarian Matcher.
+        matcher_gamma (`float`, *optional*, defaults to 2.0):
+            Parameter gamma used by the Hungarian Matcher.
+        matcher_class_cost (`float`, *optional*, defaults to 2.0):
+            The relative weight of the class loss used by the Hungarian Matcher.
+        matcher_bbox_cost (`float`, *optional*, defaults to 5.0):
+            The relative weight of the bounding box loss used by the Hungarian Matcher.
+        matcher_giou_cost (`float`, *optional*, defaults to 2.0):
+            The relative weight of the giou loss used by the Hungarian Matcher.
+        use_focal_loss (`bool`, *optional*, defaults to `True`):
+            Parameter informing if focal loss should be used.
+        auxiliary_loss (`bool`, *optional*, defaults to `True`):
+            Whether auxiliary decoding losses (loss at each decoder layer) are to be used.
+        focal_loss_alpha (`float`, *optional*, defaults to 0.75):
+            Parameter alpha used to compute the focal loss.
+        focal_loss_gamma (`float`, *optional*, defaults to 2.0):
+            Parameter gamma used to compute the focal loss.
+        weight_loss_vfl (`float`, *optional*, defaults to 1.0):
+            Relative weight of the varifocal loss in the object detection loss.
+        weight_loss_bbox (`float`, *optional*, defaults to 5.0):
+            Relative weight of the L1 bounding box loss in the object detection loss.
+        weight_loss_giou (`float`, *optional*, defaults to 2.0):
+            Relative weight of the generalized IoU loss in the object detection loss.
+        weight_loss_fgl (`float`, *optional*, defaults to 0.15):
+            Relative weight of the fine-grained localization loss in the object detection loss.
+        weight_loss_ddf (`float`, *optional*, defaults to 1.5):
+            Relative weight of the decoupled distillation focal loss in the object detection loss.
+        eos_coefficient (`float`, *optional*, defaults to 0.0001):
+            Relative classification weight of the 'no-object' class in the object detection loss.
+        eval_idx (`int`, *optional*, defaults to -1):
+            Index of the decoder layer to use for evaluation. If negative, counts from the end
+            (e.g., -1 means use the last layer). This allows for early prediction in the decoder
+            stack while still training later layers.
+        layer_scale (`float`, *optional*, defaults to 1):
+            Scaling factor for the hidden dimension in later decoder layers. Used to adjust the
+            model capacity after the evaluation layer.
+        max_num_bins (`int`, *optional*, defaults to 32):
+            Maximum number of bins for the distribution-guided bounding box refinement.
+            Higher values allow for more fine-grained localization but increase computation.
+        reg_scale (`float`, *optional*, defaults to 4.0):
+            Scale factor for the regression distribution. Controls the range and granularity
+            of the bounding box refinement process.
+        depth_mult (`float`, *optional*, defaults to 1.0):
+            Multiplier for the number of blocks in RepNCSPELAN4 layers. Used to scale the model's
+            depth while maintaining its architecture.
+        top_prob_values (`int`, *optional*, defaults to 4):
+            Number of top probability values to consider from each corner's distribution.
+        lqe_hidden_dim (`int`, *optional*, defaults to 64):
+            Hidden dimension size for the Location Quality Estimator (LQE) network.
+        lqe_layers (`int`, *optional*, defaults to 2):
+            Number of layers in the Location Quality Estimator MLP.
+        decoder_offset_scale (`float`, *optional*, defaults to 0.5):
+            Offset scale used in deformable attention.
+        decoder_method (`str`, *optional*, defaults to `"default"`):
+            The method to use for the decoder: `"default"` or `"discrete"`.
+        up (`float`, *optional*, defaults to 0.5):
+            Controls the upper bounds of the Weighting Function.
+    """
+
+    model_type = "d_fine"
+    layer_types = ["basic", "bottleneck"]
+    attribute_map = {
+        "hidden_size": "d_model",
+        "num_attention_heads": "encoder_attention_heads",
+    }
+
+    def __init__(
+        self,
+        initializer_range=0.01,
+        initializer_bias_prior_prob=None,
+        layer_norm_eps=1e-5,
+        batch_norm_eps=1e-5,
+        # backbone
+        backbone_config=None,
+        backbone=None,
+        use_pretrained_backbone=False,
+        use_timm_backbone=False,
+        freeze_backbone_batch_norms=True,
+        backbone_kwargs=None,
+        # encoder HybridEncoder
+        encoder_hidden_dim=256,
+        encoder_in_channels=[512, 1024, 2048],
+        feat_strides=[8, 16, 32],
+        encoder_layers=1,
+        encoder_ffn_dim=1024,
+        encoder_attention_heads=8,
+        dropout=0.0,
+        activation_dropout=0.0,
+        encode_proj_layers=[2],
+        positional_encoding_temperature=10000,
+        encoder_activation_function="gelu",
+        activation_function="silu",
+        eval_size=None,
+        normalize_before=False,
+        hidden_expansion=1.0,
+        # decoder DFineTransformer
+        d_model=256,
+        num_queries=300,
+        decoder_in_channels=[256, 256, 256],
+        decoder_ffn_dim=1024,
+        num_feature_levels=3,
+        decoder_n_points=4,
+        decoder_layers=6,
+        decoder_attention_heads=8,
+        decoder_activation_function="relu",
+        attention_dropout=0.0,
+        num_denoising=100,
+        label_noise_ratio=0.5,
+        box_noise_scale=1.0,
+        learn_initial_query=False,
+        anchor_image_size=None,
+        with_box_refine=True,
+        is_encoder_decoder=True,
+        # Loss
+        matcher_alpha=0.25,
+        matcher_gamma=2.0,
+        matcher_class_cost=2.0,
+        matcher_bbox_cost=5.0,
+        matcher_giou_cost=2.0,
+        use_focal_loss=True,
+        auxiliary_loss=True,
+        focal_loss_alpha=0.75,
+        focal_loss_gamma=2.0,
+        weight_loss_vfl=1.0,
+        weight_loss_bbox=5.0,
+        weight_loss_giou=2.0,
+        weight_loss_fgl=0.15,
+        weight_loss_ddf=1.5,
+        eos_coefficient=1e-4,
+        eval_idx=-1,
+        layer_scale=1,
+        max_num_bins=32,
+        reg_scale=4.0,
+        depth_mult=1.0,
+        top_prob_values=4,
+        lqe_hidden_dim=64,
+        lqe_layers=2,
+        decoder_offset_scale=0.5,
+        decoder_method="default",
+        up=0.5,
+        **kwargs,
+    ):
+        self.initializer_range = initializer_range
+        self.initializer_bias_prior_prob = initializer_bias_prior_prob
+        self.layer_norm_eps = layer_norm_eps
+        self.batch_norm_eps = batch_norm_eps
+        # backbone
+        if backbone_config is None and backbone is None:
+            logger.info(
+                "`backbone_config` and `backbone` are `None`. Initializing the config with the default `HGNet-V2` backbone."
+            )
+            backbone_model_type = "hgnet_v2"
+            config_class = CONFIG_MAPPING[backbone_model_type]
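+            # `config_class` is whatever `CONFIG_MAPPING` registers under "hgnet_v2"
+            # NOTE(review): an earlier comment here named RTDetrResNetConfig, which does not
+            # match the "hgnet_v2" key — presumably this resolves to HGNetV2Config; confirm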
+            backbone_config = config_class(
+                num_channels=3,
+                embedding_size=64,
+                hidden_sizes=[256, 512, 1024, 2048],
+                depths=[3, 4, 6, 3],
+                layer_type="bottleneck",
+                hidden_act="relu",
+                downsample_in_first_stage=False,
+                downsample_in_bottleneck=False,
+                out_features=None,
+                out_indices=[2, 3, 4],
+            )
+        elif isinstance(backbone_config, dict):
+            backbone_model_type = backbone_config.pop("model_type")
+            config_class = CONFIG_MAPPING[backbone_model_type]
+            backbone_config = config_class.from_dict(backbone_config)
+
+        verify_backbone_config_arguments(
+            use_timm_backbone=use_timm_backbone,
+            use_pretrained_backbone=use_pretrained_backbone,
+            backbone=backbone,
+            backbone_config=backbone_config,
+            backbone_kwargs=backbone_kwargs,
+        )
+
+        self.backbone_config = backbone_config
+        self.backbone = backbone
+        self.use_pretrained_backbone = use_pretrained_backbone
+        self.use_timm_backbone = use_timm_backbone
+        self.freeze_backbone_batch_norms = freeze_backbone_batch_norms
+        self.backbone_kwargs = backbone_kwargs
+        # encoder
+        self.encoder_hidden_dim = encoder_hidden_dim
+        self.encoder_in_channels = encoder_in_channels
+        self.feat_strides = feat_strides
+        self.encoder_attention_heads = encoder_attention_heads
+        self.encoder_ffn_dim = encoder_ffn_dim
+        self.dropout = dropout
+        self.activation_dropout = activation_dropout
+        self.encode_proj_layers = encode_proj_layers
+        self.encoder_layers = encoder_layers
+        self.positional_encoding_temperature = positional_encoding_temperature
+        self.eval_size = eval_size
+        self.normalize_before = normalize_before
+        self.encoder_activation_function = encoder_activation_function
+        self.activation_function = activation_function
+        self.hidden_expansion = hidden_expansion
+        # decoder
+        self.d_model = d_model
+        self.num_queries = num_queries
+        self.decoder_ffn_dim = decoder_ffn_dim
+        self.decoder_in_channels = decoder_in_channels
+        self.num_feature_levels = num_feature_levels
+        self.decoder_n_points = decoder_n_points
+        self.decoder_layers = decoder_layers
+        self.decoder_attention_heads = decoder_attention_heads
+        self.decoder_activation_function = decoder_activation_function
+        self.attention_dropout = attention_dropout
+        self.num_denoising = num_denoising
+        self.label_noise_ratio = label_noise_ratio
+        self.box_noise_scale = box_noise_scale
+        self.learn_initial_query = learn_initial_query
+        self.anchor_image_size = anchor_image_size
+        self.auxiliary_loss = auxiliary_loss
+        self.with_box_refine = with_box_refine
+        # Loss
+        self.matcher_alpha = matcher_alpha
+        self.matcher_gamma = matcher_gamma
+        self.matcher_class_cost = matcher_class_cost
+        self.matcher_bbox_cost = matcher_bbox_cost
+        self.matcher_giou_cost = matcher_giou_cost
+        self.use_focal_loss = use_focal_loss
+        self.focal_loss_alpha = focal_loss_alpha
+        self.focal_loss_gamma = focal_loss_gamma
+        self.weight_loss_vfl = weight_loss_vfl
+        self.weight_loss_bbox = weight_loss_bbox
+        self.weight_loss_giou = weight_loss_giou
+        self.weight_loss_fgl = weight_loss_fgl
+        self.weight_loss_ddf = weight_loss_ddf
+        self.eos_coefficient = eos_coefficient
+        # add the new attributes with the given values or defaults
+        self.eval_idx = eval_idx
+        self.layer_scale = layer_scale
+        self.max_num_bins = max_num_bins
+        self.reg_scale = reg_scale
+        self.depth_mult = depth_mult
+        self.decoder_offset_scale = decoder_offset_scale
+        self.decoder_method = decoder_method
+        self.top_prob_values = top_prob_values
+        self.lqe_hidden_dim = lqe_hidden_dim
+        self.lqe_layers = lqe_layers
+        self.up = up
+
+        if isinstance(self.decoder_n_points, list):
+            if len(self.decoder_n_points) != self.num_feature_levels:
+                raise ValueError(
+                    f"Length of decoder_n_points list ({len(self.decoder_n_points)}) must match num_feature_levels ({self.num_feature_levels})."
+                )
+
+        head_dim = self.d_model // self.decoder_attention_heads
+        if head_dim * self.decoder_attention_heads != self.d_model:
+            raise ValueError(
+                f"Embedded dimension {self.d_model} must be divisible by decoder_attention_heads {self.decoder_attention_heads}"
+            )
+        super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs)
+
+    @property
+    def num_attention_heads(self) -> int:
+        return self.encoder_attention_heads
+
+    @property
+    def hidden_size(self) -> int:
+        return self.d_model
+
+    @property
+    def sub_configs(self):
+        return (
+            {"backbone_config": type(self.backbone_config)}
+            if getattr(self, "backbone_config", None) is not None
+            else {}
+        )
+
+    @classmethod
+    def from_backbone_configs(cls, backbone_config: PretrainedConfig, **kwargs):
+        """Instantiate a [`DFineConfig`] (or a derived class) from a pre-trained backbone model configuration and D-FINE
+        model configuration keyword arguments.
+
+            Args:
+                backbone_config ([`PretrainedConfig`]):
+                    The backbone configuration.
+
+            Returns:
+                [`DFineConfig`]: An instance of a configuration object
+        """
+        return cls(
+            backbone_config=backbone_config,
+            **kwargs,
+        )
+
+
+__all__ = ["DFineConfig"]
diff --git a/phivenv/Lib/site-packages/transformers/models/d_fine/modeling_d_fine.py b/phivenv/Lib/site-packages/transformers/models/d_fine/modeling_d_fine.py
new file mode 100644
index 0000000000000000000000000000000000000000..5cc2f5e221d10c1150b1200cf24880ad87201ad4
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/d_fine/modeling_d_fine.py
@@ -0,0 +1,2188 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/d_fine/modular_d_fine.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_d_fine.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# coding=utf-8
+# Copyright 2025 Baidu Inc and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import math
+from dataclasses import dataclass
+from typing import Any, Optional, Union
+
+import torch
+import torch.nn.functional as F
+import torch.nn.init as init
+from torch import Tensor, nn
+
+from ...activations import ACT2CLS, ACT2FN
+from ...image_transforms import center_to_corners_format, corners_to_center_format
+from ...modeling_outputs import BaseModelOutput
+from ...modeling_utils import PreTrainedModel
+from ...pytorch_utils import compile_compatible_method_lru_cache
+from ...utils import ModelOutput, auto_docstring, is_torchdynamo_compiling, torch_int
+from ...utils.backbone_utils import load_backbone
+from .configuration_d_fine import DFineConfig
+
+
+def multi_scale_deformable_attention_v2(
+    value: Tensor,
+    value_spatial_shapes: Tensor,
+    sampling_locations: Tensor,
+    attention_weights: Tensor,
+    num_points_list: list[int],
+    method="default",
+) -> Tensor:
+    """
+    Sample `value` at `sampling_locations` on every feature level and sum the samples weighted by `attention_weights`.
+
+    Args:
+        value: `(batch_size, sum(height * width), num_heads, hidden_dim)` features of all levels, flattened.
+        value_spatial_shapes: iterable of `(height, width)` pairs, one per feature level.
+        sampling_locations: `(batch_size, num_queries, num_heads, sum(num_points_list), 2)` sampling coordinates.
+        attention_weights: `(batch_size, num_queries, num_heads, sum(num_points_list))` weight of every sample.
+        num_points_list: number of sampling points used on each feature level.
+        method: `"default"` (bilinear `grid_sample`) or `"discrete"` (lookup at integer feature-map indices).
+
+    Returns:
+        Tensor of shape `(batch_size, num_queries, num_heads * hidden_dim)`.
+
+    Raises:
+        ValueError: If `method` is neither `"default"` nor `"discrete"`.
+    """
+    if method not in ("default", "discrete"):
+        raise ValueError(f"Unsupported sampling method {method!r}; expected 'default' or 'discrete'.")
+    batch_size, _, num_heads, hidden_dim = value.shape
+    _, num_queries, num_heads, _, _ = sampling_locations.shape
+    level_sizes = [height * width for height, width in value_spatial_shapes]
+    value_list = value.permute(0, 2, 3, 1).flatten(0, 1).split(level_sizes, dim=-1)
+    # `grid_sample` addresses the input with coordinates in [-1, 1]; the discrete path uses the locations as given.
+    sampling_grids = 2 * sampling_locations - 1 if method == "default" else sampling_locations
+    sampling_grids = sampling_grids.permute(0, 2, 1, 3, 4).flatten(0, 1)
+    sampling_grids = sampling_grids.split(num_points_list, dim=-2)
+    sampling_value_list = []
+    for level_id, (height, width) in enumerate(value_spatial_shapes):
+        # (batch_size*num_heads, hidden_dim, height*width) -> (batch_size*num_heads, hidden_dim, height, width)
+        value_l_ = value_list[level_id].reshape(batch_size * num_heads, hidden_dim, height, width)
+        # (batch_size*num_heads, num_queries, num_points_list[level_id], 2)
+        sampling_grid_l_ = sampling_grids[level_id]
+        if method == "default":
+            sampling_value_l_ = nn.functional.grid_sample(
+                value_l_, sampling_grid_l_, mode="bilinear", padding_mode="zeros", align_corners=False
+            )
+        else:
+            sampling_coord = (sampling_grid_l_ * torch.tensor([[width, height]], device=value.device) + 0.5).to(
+                torch.int64
+            )
+            # Clamp x and y separately so every index stays inside this level's feature map.
+            sampling_coord_x = sampling_coord[..., 0].clamp(0, width - 1)
+            sampling_coord_y = sampling_coord[..., 1].clamp(0, height - 1)
+            sampling_coord = torch.stack([sampling_coord_x, sampling_coord_y], dim=-1)
+            sampling_coord = sampling_coord.reshape(batch_size * num_heads, num_queries * num_points_list[level_id], 2)
+            # Row index of every sample, so one advanced-indexing call gathers all of them.
+            sampling_idx = (
+                torch.arange(sampling_coord.shape[0], device=value.device)
+                .unsqueeze(-1)
+                .repeat(1, sampling_coord.shape[1])
+            )
+            sampling_value_l_ = value_l_[sampling_idx, :, sampling_coord[..., 1], sampling_coord[..., 0]]
+            sampling_value_l_ = sampling_value_l_.permute(0, 2, 1).reshape(
+                batch_size * num_heads, hidden_dim, num_queries, num_points_list[level_id]
+            )
+        sampling_value_list.append(sampling_value_l_)
+    # (batch_size, num_queries, num_heads, total_points) -> (batch_size*num_heads, 1, num_queries, total_points)
+    attention_weights = attention_weights.permute(0, 2, 1, 3).reshape(
+        batch_size * num_heads, 1, num_queries, sum(num_points_list)
+    )
+    weighted = torch.concat(sampling_value_list, dim=-1) * attention_weights
+    output = weighted.sum(-1).view(batch_size, num_heads * hidden_dim, num_queries)
+    return output.transpose(1, 2).contiguous()
+
+
+class DFineMultiscaleDeformableAttention(nn.Module):
+    def __init__(self, config: DFineConfig):
+        """
+        D-FINE multi-scale deformable attention; the number of sampling points may differ per feature level.
+        """
+        super().__init__()
+        self.d_model = config.d_model
+        self.n_heads = config.decoder_attention_heads
+        self.n_levels = config.num_feature_levels
+        self.offset_scale = config.decoder_offset_scale
+        self.decoder_method = config.decoder_method  # sampling mode handed to `ms_deformable_attn_core`
+        self.n_points = config.decoder_n_points
+        # `decoder_n_points` is either a list (used as the per-level point counts) or one value repeated per level.
+        if isinstance(self.n_points, list):
+            num_points_list = self.n_points
+        else:
+            num_points_list = [self.n_points for _ in range(self.n_levels)]
+
+        self.num_points_list = num_points_list
+        num_points_scale = [1 / n for n in self.num_points_list for _ in range(n)]  # `1 / n`, repeated n times per level
+        self.register_buffer("num_points_scale", torch.tensor(num_points_scale, dtype=torch.float32))
+
+        self.total_points = self.n_heads * sum(self.num_points_list)
+
+        self.sampling_offsets = nn.Linear(self.d_model, self.total_points * 2)  # two offset values per head and point
+        self.attention_weights = nn.Linear(self.d_model, self.total_points)  # one logit per head and point
+
+        self.ms_deformable_attn_core = multi_scale_deformable_attention_v2
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        reference_points=None,
+        encoder_hidden_states=None,
+        spatial_shapes=None,
+        spatial_shapes_list=None,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        batch_size, num_queries, _ = hidden_states.shape  # queries: (batch_size, num_queries, channels)
+        batch_size, sequence_length, _ = encoder_hidden_states.shape  # values: (batch_size, sequence_length, channels)
+        # Unless `is_torchdynamo_compiling()` is true, check that the per-level areas sum to the sequence length.
+        if not is_torchdynamo_compiling() and (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() != sequence_length:
+            raise ValueError(
+                "Make sure to align the spatial shapes with the sequence length of the encoder hidden states"
+            )
+
+        # Split the channel dimension into `n_heads` heads of size `d_model // n_heads`.
+        value = encoder_hidden_states.reshape(batch_size, sequence_length, self.n_heads, self.d_model // self.n_heads)
+        if attention_mask is not None:
+            value = value.masked_fill(~attention_mask[..., None], float(0))  # NOTE(review): `value` is 4-D here; confirm the mask broadcasts
+
+        sampling_offsets: torch.Tensor = self.sampling_offsets(hidden_states)
+        sampling_offsets = sampling_offsets.reshape(
+            batch_size, num_queries, self.n_heads, sum(self.num_points_list), 2
+        )
+
+        attention_weights = self.attention_weights(hidden_states).reshape(
+            batch_size, num_queries, self.n_heads, sum(self.num_points_list)
+        )
+        attention_weights = F.softmax(attention_weights, dim=-1)  # normalized jointly over all points of a head
+        # Sampling locations = reference point + offset; the offset scaling depends on the reference-point format.
+        if reference_points.shape[-1] == 2:  # NOTE(review): branch reshapes with `sequence_length`, not `num_queries`; confirm
+            offset_normalizer = torch.tensor(spatial_shapes)
+            offset_normalizer = offset_normalizer.flip([1]).reshape(1, 1, 1, self.n_levels, 1, 2)
+            sampling_locations = (
+                reference_points.reshape(batch_size, sequence_length, 1, self.n_levels, 1, 2)
+                + sampling_offsets / offset_normalizer
+            )
+        elif reference_points.shape[-1] == 4:
+            # Each offset is scaled by its level's `1 / num_points`, by the last two reference components
+            # (presumably box width/height; TODO confirm) and by `offset_scale`, then added to the first two.
+            num_points_scale = self.num_points_scale.to(dtype=hidden_states.dtype).unsqueeze(-1)
+            offset = sampling_offsets * num_points_scale * reference_points[:, :, None, :, 2:] * self.offset_scale
+            sampling_locations = reference_points[:, :, None, :, :2] + offset
+        else:
+            raise ValueError(
+                f"Last dim of reference_points must be 2 or 4, but get {reference_points.shape[-1]} instead."
+            )
+
+        output = self.ms_deformable_attn_core(
+            value,
+            spatial_shapes_list,
+            sampling_locations,
+            attention_weights,
+            self.num_points_list,
+            self.decoder_method,
+        )
+
+        return output, attention_weights
+
+
+class DFineGate(nn.Module):
+    """Fuse two same-shaped tensors with learned sigmoid gates and layer-normalize the result."""
+    def __init__(self, d_model: int):
+        super().__init__()
+        self.gate = nn.Linear(2 * d_model, 2 * d_model)
+        self.norm = nn.LayerNorm(d_model)
+
+    def forward(self, second_residual: torch.Tensor, hidden_states: torch.Tensor) -> torch.Tensor:
+        """Return `norm(g1 * second_residual + g2 * hidden_states)`, both gates computed from the concatenated inputs."""
+        stacked = torch.cat((second_residual, hidden_states), dim=-1)
+        residual_gate, update_gate = self.gate(stacked).sigmoid().chunk(2, dim=-1)
+        return self.norm(residual_gate * second_residual + update_gate * hidden_states)
+
+
+class DFineMultiheadAttention(nn.Module):
+    """
+    Multi-headed attention from 'Attention Is All You Need' paper.
+
+    Here, we add position embeddings to the queries and keys (as explained in the Deformable DETR paper).
+    """
+
+    def __init__(
+        self,
+        embed_dim: int,
+        num_heads: int,
+        dropout: float = 0.0,
+        bias: bool = True,
+    ):
+        super().__init__()
+        self.embed_dim = embed_dim
+        self.num_heads = num_heads
+        self.dropout = dropout
+        self.head_dim = embed_dim // num_heads
+        if self.head_dim * num_heads != self.embed_dim:
+            raise ValueError(
+                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
+                f" {num_heads})."
+            )
+        self.scaling = self.head_dim**-0.5
+
+        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+
+    def _reshape(self, tensor: torch.Tensor, seq_len: int, batch_size: int):  # -> (batch, heads, seq_len, head_dim)
+        return tensor.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
+
+    def with_pos_embed(self, tensor: torch.Tensor, position_embeddings: Optional[Tensor]):  # no-op when None
+        return tensor if position_embeddings is None else tensor + position_embeddings
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_embeddings: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        """
+        Self-attention over `hidden_states` of shape `(batch_size, target_len, embed_dim)`.
+
+        `position_embeddings` (optional) are added to the input of the query/key projections only. `attention_mask`
+        (optional) is expanded to `(batch_size, 1, target_len, source_len)`; a boolean mask blocks `True` positions.
+        Returns `(attn_output, attn_weights)`; `attn_weights` is `None` unless `output_attentions` is set.
+        """
+        batch_size, target_len, embed_dim = hidden_states.size()
+        # Queries and keys are projected from the position-augmented states, values from the raw ones. Keep the raw
+        # tensor unconditionally so the value projection also works when no position embeddings are passed.
+        hidden_states_original = hidden_states
+        hidden_states = self.with_pos_embed(hidden_states, position_embeddings)
+
+        # get queries, keys and values
+        query_states = self.q_proj(hidden_states) * self.scaling
+        key_states = self._reshape(self.k_proj(hidden_states), -1, batch_size)
+        value_states = self._reshape(self.v_proj(hidden_states_original), -1, batch_size)
+
+        proj_shape = (batch_size * self.num_heads, -1, self.head_dim)
+        query_states = self._reshape(query_states, target_len, batch_size).view(*proj_shape)
+        key_states = key_states.view(*proj_shape)
+        value_states = value_states.view(*proj_shape)
+
+        source_len = key_states.size(1)
+
+        attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))
+
+        if attn_weights.size() != (batch_size * self.num_heads, target_len, source_len):
+            raise ValueError(
+                f"Attention weights should be of size {(batch_size * self.num_heads, target_len, source_len)}, but is"
+                f" {attn_weights.size()}"
+            )
+
+        if attention_mask is not None:
+            # [seq_len, seq_len] -> [batch_size, 1, target_seq_len, source_seq_len]
+            attention_mask = attention_mask.expand(batch_size, 1, *attention_mask.size())
+            if attention_mask.size() != (batch_size, 1, target_len, source_len):
+                raise ValueError(
+                    f"Attention mask should be of size {(batch_size, 1, target_len, source_len)}, but is"
+                    f" {attention_mask.size()}"
+                )
+            if attention_mask.dtype == torch.bool:
+                attention_mask = torch.zeros_like(attention_mask, dtype=attn_weights.dtype).masked_fill_(
+                    attention_mask, -torch.inf
+                )
+            attn_weights = attn_weights.view(batch_size, self.num_heads, target_len, source_len) + attention_mask
+            attn_weights = attn_weights.view(batch_size * self.num_heads, target_len, source_len)
+
+        attn_weights = nn.functional.softmax(attn_weights, dim=-1)
+
+        if output_attentions:
+            # Reshape twice and keep using the re-flattened view below, so that the returned
+            # per-head weights stay connected to the graph and keep their gradient.
+            attn_weights_reshaped = attn_weights.view(batch_size, self.num_heads, target_len, source_len)
+            attn_weights = attn_weights_reshaped.view(batch_size * self.num_heads, target_len, source_len)
+        else:
+            attn_weights_reshaped = None
+
+        attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
+
+        attn_output = torch.bmm(attn_probs, value_states)
+
+        if attn_output.size() != (batch_size * self.num_heads, target_len, self.head_dim):
+            raise ValueError(
+                f"`attn_output` should be of size {(batch_size * self.num_heads, target_len, self.head_dim)}, but is"
+                f" {attn_output.size()}"
+            )
+
+        attn_output = attn_output.view(batch_size, self.num_heads, target_len, self.head_dim)
+        attn_output = attn_output.transpose(1, 2)
+        attn_output = attn_output.reshape(batch_size, target_len, embed_dim)
+
+        attn_output = self.out_proj(attn_output)
+
+        return attn_output, attn_weights_reshaped
+
+
+class DFineDecoderLayer(nn.Module):
+    def __init__(self, config: DFineConfig):
+        super().__init__()
+        # self-attention over the decoder queries
+        self.self_attn = DFineMultiheadAttention(
+            embed_dim=config.d_model,
+            num_heads=config.decoder_attention_heads,
+            dropout=config.attention_dropout,
+        )
+        self.dropout = config.dropout
+        self.activation_fn = ACT2FN[config.decoder_activation_function]
+        self.activation_dropout = config.activation_dropout
+
+        self.self_attn_layer_norm = nn.LayerNorm(config.d_model, eps=config.layer_norm_eps)
+
+        # cross-attention: the D-FINE multi-scale deformable attention over the encoder features
+        self.encoder_attn = DFineMultiscaleDeformableAttention(config=config)
+        # feedforward neural networks
+        self.fc1 = nn.Linear(config.d_model, config.decoder_ffn_dim)
+        self.fc2 = nn.Linear(config.decoder_ffn_dim, config.d_model)
+        self.final_layer_norm = nn.LayerNorm(config.d_model, eps=config.layer_norm_eps)
+        # gate that fuses the self-attention output with the cross-attention output (see `forward`)
+        self.gateway = DFineGate(config.d_model)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: Optional[torch.Tensor] = None,
+        reference_points=None,
+        spatial_shapes=None,
+        spatial_shapes_list=None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = False,
+    ) -> tuple[torch.Tensor, Any, Any]:
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`):
+                Input to the layer of shape `(batch, seq_len, embed_dim)`.
+            position_embeddings (`torch.FloatTensor`, *optional*):
+                Position embeddings that are added to the queries and keys in the self-attention layer and to
+                the queries of the cross-attention layer.
+            reference_points (`torch.FloatTensor`, *optional*):
+                Reference points, forwarded to the cross-attention layer.
+            spatial_shapes (`torch.LongTensor`, *optional*):
+                Spatial shapes, forwarded to the cross-attention layer.
+            spatial_shapes_list (*optional*):
+                Spatial shapes, forwarded to the cross-attention layer (presumably a list of `(height, width)`).
+            encoder_hidden_states (`torch.FloatTensor`):
+                Cross-attention input to the layer of shape `(batch, seq_len, embed_dim)`.
+            encoder_attention_mask (`torch.FloatTensor`):
+                Passed to the self-attention layer as its `attention_mask`; not passed to the cross-attention.
+            output_attentions (`bool`, *optional*):
+                If set, the returned tuple is `(hidden_states, self_attn_weights, cross_attn_weights)`;
+                otherwise it is `(hidden_states,)`.
+        """
+        # Self Attention
+        hidden_states_2, self_attn_weights = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=encoder_attention_mask,
+            position_embeddings=position_embeddings,
+            output_attentions=output_attentions,
+        )
+
+        hidden_states_2 = nn.functional.dropout(hidden_states_2, p=self.dropout, training=self.training)
+        hidden_states = hidden_states + hidden_states_2
+        hidden_states = self.self_attn_layer_norm(hidden_states)
+        residual = hidden_states  # kept without position embeddings; one of the two inputs of the gate below
+
+        # Cross-Attention
+        cross_attn_weights = None
+        hidden_states = hidden_states if position_embeddings is None else hidden_states + position_embeddings
+        hidden_states_2, cross_attn_weights = self.encoder_attn(
+            hidden_states=hidden_states,
+            encoder_hidden_states=encoder_hidden_states,
+            reference_points=reference_points,
+            spatial_shapes=spatial_shapes,
+            spatial_shapes_list=spatial_shapes_list,
+        )
+
+        hidden_states_2 = nn.functional.dropout(hidden_states_2, p=self.dropout, training=self.training)
+        hidden_states = self.gateway(residual, hidden_states_2)  # gated fusion takes the place of a residual add
+
+        # Fully Connected
+        hidden_states_2 = self.activation_fn(self.fc1(hidden_states))
+        hidden_states_2 = nn.functional.dropout(hidden_states_2, p=self.activation_dropout, training=self.training)
+        hidden_states_2 = self.fc2(hidden_states_2)
+        hidden_states_2 = nn.functional.dropout(hidden_states_2, p=self.dropout, training=self.training)
+        hidden_states = hidden_states + hidden_states_2
+        hidden_states = self.final_layer_norm(hidden_states.clamp(min=-65504, max=65504))  # 65504 == largest finite float16
+
+        outputs = (hidden_states,)
+
+        if output_attentions:
+            outputs += (self_attn_weights, cross_attn_weights)
+
+        return outputs
+
+
+@auto_docstring
+class DFinePreTrainedModel(PreTrainedModel):
+    config: DFineConfig
+    base_model_prefix = "d_fine"
+    main_input_name = "pixel_values"
+    _no_split_modules = [r"DFineHybridEncoder", r"DFineDecoderLayer"]
+
+    def _init_weights(self, module):
+        """Initialize the weights of `module` according to its type; a module may match several branches below."""
+        # initialize linear layer bias value according to a given probability value.
+        if isinstance(module, (DFineForObjectDetection, DFineDecoder)):
+            if module.class_embed is not None:
+                for layer in module.class_embed:
+                    prior_prob = self.config.initializer_bias_prior_prob or 1 / (self.config.num_labels + 1)
+                    bias = float(-math.log((1 - prior_prob) / prior_prob))  # logit(prior_prob)
+                    nn.init.xavier_uniform_(layer.weight)
+                    nn.init.constant_(layer.bias, bias)
+
+            if module.bbox_embed is not None:
+                for layer in module.bbox_embed:
+                    nn.init.constant_(layer.layers[-1].weight, 0)  # last layer of every box head starts at zero
+                    nn.init.constant_(layer.layers[-1].bias, 0)
+        # Sampling offsets: zero weight; the bias gives head i the direction of angle 2*pi*i/n_heads (max-normalized).
+        if isinstance(module, DFineMultiscaleDeformableAttention):
+            nn.init.constant_(module.sampling_offsets.weight.data, 0.0)
+            default_dtype = torch.get_default_dtype()
+            thetas = torch.arange(module.n_heads, dtype=torch.int64).to(default_dtype) * (
+                2.0 * math.pi / module.n_heads
+            )
+            grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)
+            grid_init = grid_init / grid_init.abs().max(-1, keepdim=True).values
+            grid_init = grid_init.reshape(module.n_heads, 1, 2).tile([1, sum(module.num_points_list), 1])
+            scaling = torch.concat([torch.arange(1, n + 1) for n in module.num_points_list]).reshape(1, -1, 1)
+            grid_init *= scaling  # the k-th point of a level (1-based) gets k times the head's direction
+            with torch.no_grad():
+                module.sampling_offsets.bias.data[...] = grid_init.flatten()
+
+            nn.init.constant_(module.attention_weights.weight.data, 0.0)
+            nn.init.constant_(module.attention_weights.bias.data, 0.0)
+
+        if isinstance(module, DFineModel):
+            prior_prob = self.config.initializer_bias_prior_prob or 1 / (self.config.num_labels + 1)
+            bias = float(-math.log((1 - prior_prob) / prior_prob))  # logit(prior_prob), as for `class_embed` above
+            nn.init.xavier_uniform_(module.enc_score_head.weight)
+            nn.init.constant_(module.enc_score_head.bias, bias)
+        # Generic layers: weights drawn from normal(0, `initializer_range`), biases set to zero.
+        if isinstance(module, (nn.Linear, nn.Conv2d, nn.BatchNorm2d)):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        # Gate: zero weight and a bias that evaluates to 0, so both sigmoid gates start at 0.5.
+        if isinstance(module, DFineGate):
+            bias = float(-math.log((1 - 0.5) / 0.5))
+            init.constant_(module.gate.bias, bias)
+            init.constant_(module.gate.weight, 0)
+
+        if isinstance(module, DFineLQE):
+            init.constant_(module.reg_conf.layers[-1].bias, 0)  # last `reg_conf` layer starts at zero
+            init.constant_(module.reg_conf.layers[-1].weight, 0)
+
+        if hasattr(module, "weight_embedding") and self.config.learn_initial_query:
+            nn.init.xavier_uniform_(module.weight_embedding.weight)
+        if hasattr(module, "denoising_class_embed") and self.config.num_denoising > 0:
+            nn.init.xavier_uniform_(module.denoising_class_embed.weight)
+
+
+class DFineIntegral(nn.Module):
+    """
+    Parameter-free layer that turns bin logits into expected values of a discrete distribution.
+
+    Every group of `max_num_bins + 1` logits is softmax-normalized into a distribution `Pr(n)` and reduced to
+    `sum{Pr(n) * W(n)}`, where the non-uniform weights `W(n)` are supplied to `forward` as `project`.
+
+    Args:
+        config ([`DFineConfig`]):
+            Only `config.max_num_bins` is read; it fixes the size of the logit groups.
+    """
+
+    def __init__(self, config: DFineConfig):
+        super().__init__()
+        self.max_num_bins = config.max_num_bins
+
+    def forward(self, pred_corners: torch.Tensor, project: torch.Tensor) -> torch.Tensor:
+        """Return the expectations of a 3-D `pred_corners`, regrouped by four and reshaped to `(batch, queries, -1)`."""
+        batch_size, num_queries, _ = pred_corners.shape
+        bin_probs = pred_corners.reshape(-1, self.max_num_bins + 1).softmax(dim=1)
+        # `project` is moved to the device of the probabilities before the weighted sum.
+        expected = F.linear(bin_probs, project.to(bin_probs.device))
+        return expected.reshape(-1, 4).reshape(batch_size, num_queries, -1)
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Base class for outputs of the DFineDecoder. This class adds two attributes to
+    BaseModelOutputWithCrossAttentions, namely:
+    - a stacked tensor of intermediate decoder hidden states (i.e. the output of each decoder layer)
+    - a stacked tensor of intermediate reference points.
+    """
+)
+class DFineDecoderOutput(ModelOutput):
+    r"""
+    intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`):
+        Stacked intermediate hidden states (output of each layer of the decoder).
+    intermediate_logits (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, sequence_length, config.num_labels)`):
+        Stacked intermediate logits (logits of each layer of the decoder).
+    intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, sequence_length, hidden_size)`):
+        Stacked intermediate reference points (reference points of each layer of the decoder).
+    intermediate_predicted_corners (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`):
+        Stacked intermediate predicted corners (predicted corners of each layer of the decoder).
+    initial_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`):
+        Stacked initial reference points (initial reference points of each layer of the decoder).
+    cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`):
+        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+        sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax,
+        used to compute the weighted average in the cross-attention heads.
+    """
+    # Every field is optional and defaults to `None`.
+    last_hidden_state: Optional[torch.FloatTensor] = None  # no entry in the docstring above; TODO(review) confirm meaning
+    intermediate_hidden_states: Optional[torch.FloatTensor] = None
+    intermediate_logits: Optional[torch.FloatTensor] = None
+    intermediate_reference_points: Optional[torch.FloatTensor] = None  # NOTE(review): documented last dim is `hidden_size`; confirm
+    intermediate_predicted_corners: Optional[torch.FloatTensor] = None
+    initial_reference_points: Optional[torch.FloatTensor] = None
+    hidden_states: Optional[tuple[torch.FloatTensor]] = None  # no entry in the docstring above
+    attentions: Optional[tuple[torch.FloatTensor]] = None  # no entry in the docstring above
+    cross_attentions: Optional[tuple[torch.FloatTensor]] = None
+
+
+def inverse_sigmoid(x, eps=1e-5):
+    """Return `log(x / (1 - x))` with `x` clamped to [0, 1] and both terms of the ratio floored at `eps`."""
+    clipped = x.clamp(min=0, max=1)
+    numerator, denominator = clipped.clamp(min=eps), (1 - clipped).clamp(min=eps)
+    return torch.log(numerator / denominator)
+
+
+def weighting_function(max_num_bins: int, up: torch.Tensor, reg_scale: int) -> torch.Tensor:
+    """
+    Build the non-uniform Weighting Function W(n) used for bounding box regression.
+
+    The sequence is symmetric around a central zero: exponentially spaced values `±(step**i - 1)` on each side,
+    closed by `±2 * |up[0]| * |reg_scale|` at the two ends.
+
+    Args:
+        max_num_bins (int): Max number of the discrete bins.
+        up (Tensor): Controls upper bounds of the sequence; only `up[0]` is read.
+        reg_scale (float): Controls the curvature of the Weighting Function.
+    Returns:
+        Tensor: The concatenated sequence of Weighting Function values.
+    """
+    half_bins = max_num_bins // 2
+    inner_bound = abs(up[0]) * abs(reg_scale)
+    outer_bound = abs(up[0]) * abs(reg_scale) * 2
+    step = (inner_bound + 1) ** (2 / (max_num_bins - 2))
+    left_values = [-(step**i) + 1 for i in range(half_bins - 1, 0, -1)]
+    right_values = [step**i - 1 for i in range(1, half_bins)]
+    center = torch.zeros_like(up[0][None])
+    return torch.cat([-outer_bound, *left_values, center, *right_values, outer_bound], 0)
+
+
+def distance2bbox(points, distance: torch.Tensor, reg_scale: float) -> torch.Tensor:
+    """
+    Convert per-side distances, measured relative to reference boxes, into center-format boxes.
+
+    Args:
+        points (`torch.Tensor`):
+            Reference boxes `[x_center, y_center, width, height]`, shape (batch_size, num_boxes, 4) or (num_boxes, 4).
+        distance (`torch.Tensor`):
+            Distances to the left, top, right and bottom sides, shape (batch_size, num_boxes, 4) or (num_boxes, 4).
+        reg_scale (`float`):
+            Distances are in units of `width / reg_scale` (resp. `height / reg_scale`); only `abs(reg_scale)` is used.
+
+    Returns:
+        `torch.Tensor`: Boxes as `[x_center, y_center, width, height]`, with the same leading shape as the inputs.
+    """
+    scale = abs(reg_scale)
+    unit_w = points[..., 2] / scale
+    unit_h = points[..., 3] / scale
+    x_min = points[..., 0] - (0.5 * scale + distance[..., 0]) * unit_w
+    y_min = points[..., 1] - (0.5 * scale + distance[..., 1]) * unit_h
+    x_max = points[..., 0] + (0.5 * scale + distance[..., 2]) * unit_w
+    y_max = points[..., 1] + (0.5 * scale + distance[..., 3]) * unit_h
+    return corners_to_center_format(torch.stack([x_min, y_min, x_max, y_max], -1))
+
+
+class DFineDecoder(DFinePreTrainedModel):
+    """
+    D-FINE Decoder implementing Fine-grained Distribution Refinement (FDR).
+
+    This decoder refines object detection predictions through iterative updates across multiple layers,
+    utilizing attention mechanisms, location quality estimators, and distribution refinement techniques
+    to improve bounding box accuracy and robustness.
+    """
+
+    def __init__(self, config: DFineConfig):
+        super().__init__(config)
+        self.eval_idx = config.eval_idx if config.eval_idx >= 0 else config.decoder_layers + config.eval_idx
+        # a negative config.eval_idx counts back from the last layer; extra layers are appended after the base ones
+        self.dropout = config.dropout
+        self.layers = nn.ModuleList(
+            [DFineDecoderLayer(config) for _ in range(config.decoder_layers)]
+            + [DFineDecoderLayer(config) for _ in range(config.decoder_layers - self.eval_idx - 1)]  # none if eval_idx is last
+        )
+        self.query_pos_head = DFineMLPPredictionHead(config, 4, 2 * config.d_model, config.d_model, num_layers=2)
+
+        # hack implementation for iterative bounding box refinement and two-stage Deformable DETR
+        self.bbox_embed = None  # forward() skips the FDR corner refinement while this is None
+        self.class_embed = None  # forward() collects no logits while this is None
+        self.reg_scale = nn.Parameter(torch.tensor([config.reg_scale]), requires_grad=False)
+        self.max_num_bins = config.max_num_bins
+        self.d_model = config.d_model
+        self.layer_scale = config.layer_scale
+        self.pre_bbox_head = DFineMLP(config.hidden_size, config.hidden_size, 4, 3)
+        self.integral = DFineIntegral(config)
+        self.num_head = config.decoder_attention_heads
+        self.up = nn.Parameter(torch.tensor([config.up]), requires_grad=False)
+        # NOTE(review): sized by decoder_layers while self.layers may be longer — confirm forward() never indexes past it
+        self.lqe_layers = nn.ModuleList([DFineLQE(config) for _ in range(config.decoder_layers)])
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def forward(
+        self,
+        encoder_hidden_states: torch.Tensor,
+        reference_points: torch.Tensor,
+        inputs_embeds: torch.Tensor,
+        spatial_shapes,
+        level_start_index=None,
+        spatial_shapes_list=None,
+        output_hidden_states=None,
+        encoder_attention_mask=None,
+        memory_mask=None,
+        output_attentions=None,
+        return_dict=None,
+    ) -> DFineDecoderOutput:
+        r"""
+        Args:
+            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`):
+                The query embeddings that are passed into the decoder.
+            encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+                Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
+                of the decoder.
+            encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Mask to avoid performing cross-attention on padding pixel_values of the encoder. Mask values selected
+                in `[0, 1]`:
+                - 1 for pixels that are real (i.e. **not masked**),
+                - 0 for pixels that are padding (i.e. **masked**).
+            spatial_shapes_list (*optional*):
+                Forwarded unchanged to every decoder layer; presumably the per-level `(height, width)` pairs.
+            reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
+                Unactivated reference boxes; `F.sigmoid` is applied to them inside this method before the first layer.
+            spatial_shapes (`torch.FloatTensor` of shape `(num_feature_levels, 2)`):
+                Spatial shapes of the feature maps.
+            level_start_index (`torch.LongTensor` of shape `(num_feature_levels)`, *optional*):
+                Indexes for the start of each feature level. In range `[0, sequence_length]`. Not read by this method.
+            memory_mask (*optional*):
+                Accepted but not read by this method.
+
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            output_hidden_states (`bool`, *optional*):
+                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+                for more detail.
+            return_dict (`bool`, *optional*):
+                Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if inputs_embeds is not None:  # NOTE(review): hidden_states stays unbound if inputs_embeds is None
+            hidden_states = inputs_embeds
+
+        # per-layer accumulators; the optional ones stay None when the matching output flag is off
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+        all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None
+        intermediate = ()
+        intermediate_reference_points = ()
+        intermediate_logits = ()
+        intermediate_predicted_corners = ()
+        initial_reference_points = ()
+
+        output_detach = pred_corners_undetach = 0  # zero placeholders, so the first layer's additions are no-ops
+
+        project = weighting_function(self.max_num_bins, self.up, self.reg_scale)
+        ref_points_detach = F.sigmoid(reference_points)
+
+        for i, decoder_layer in enumerate(self.layers):
+            ref_points_input = ref_points_detach.unsqueeze(2)
+            query_pos_embed = self.query_pos_head(ref_points_detach).clamp(min=-10, max=10)
+
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+
+            output = decoder_layer(
+                hidden_states=hidden_states,
+                position_embeddings=query_pos_embed,
+                reference_points=ref_points_input,
+                spatial_shapes=spatial_shapes,
+                spatial_shapes_list=spatial_shapes_list,
+                encoder_hidden_states=encoder_hidden_states,
+                encoder_attention_mask=encoder_attention_mask,
+                output_attentions=output_attentions,
+            )
+
+            hidden_states = output[0]
+
+            if i == 0:
+                # Initial bounding box predictions with inverse sigmoid refinement
+                new_reference_points = F.sigmoid(self.pre_bbox_head(output[0]) + inverse_sigmoid(ref_points_detach))
+                ref_points_initial = new_reference_points.detach()
+
+            # Refine bounding box corners using FDR, integrating previous layer's corrections
+            if self.bbox_embed is not None:
+                pred_corners = self.bbox_embed[i](hidden_states + output_detach) + pred_corners_undetach
+                inter_ref_bbox = distance2bbox(
+                    ref_points_initial, self.integral(pred_corners, project), self.reg_scale
+                )
+                pred_corners_undetach = pred_corners
+                ref_points_detach = inter_ref_bbox.detach()  # reference boxes for the next layer, detached
+
+            output_detach = hidden_states.detach()
+
+            intermediate += (hidden_states,)
+
+            if self.class_embed is not None and (self.training or i == self.eval_idx):  # all layers when training
+                scores = self.class_embed[i](hidden_states)
+                # Add initial logits and reference points with pre-bbox head
+                if i == 0:
+                    intermediate_logits += (scores,)
+                    intermediate_reference_points += (new_reference_points,)
+                # Lqe does not affect the performance here.
+                scores = self.lqe_layers[i](scores, pred_corners)  # NOTE(review): pred_corners is only bound when bbox_embed is set
+                intermediate_logits += (scores,)
+                intermediate_reference_points += (inter_ref_bbox,)
+                initial_reference_points += (ref_points_initial,)
+                intermediate_predicted_corners += (pred_corners,)
+
+            if output_attentions:
+                all_self_attns += (output[1],)
+
+                if encoder_hidden_states is not None:
+                    all_cross_attentions += (output[2],)
+
+        # NOTE(review): stacked on the default dim 0 (layers first), unlike the dim=1 stacks below — confirm callers
+        intermediate = torch.stack(intermediate)
+        if self.class_embed is not None and self.bbox_embed is not None:
+            intermediate_logits = torch.stack(intermediate_logits, dim=1)
+            intermediate_predicted_corners = torch.stack(intermediate_predicted_corners, dim=1)
+            initial_reference_points = torch.stack(initial_reference_points, dim=1)
+            intermediate_reference_points = torch.stack(intermediate_reference_points, dim=1)
+
+        # add hidden states from the last decoder layer
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+
+        if not return_dict:
+            return tuple(
+                v
+                for v in [
+                    hidden_states,
+                    intermediate,
+                    intermediate_logits,
+                    intermediate_reference_points,
+                    intermediate_predicted_corners,
+                    initial_reference_points,
+                    all_hidden_states,
+                    all_self_attns,
+                    all_cross_attentions,
+                ]
+                if v is not None  # optional outputs that were not requested are dropped from the tuple
+            )
+
+        return DFineDecoderOutput(
+            last_hidden_state=hidden_states,
+            intermediate_hidden_states=intermediate,
+            intermediate_logits=intermediate_logits,
+            intermediate_reference_points=intermediate_reference_points,
+            intermediate_predicted_corners=intermediate_predicted_corners,
+            initial_reference_points=initial_reference_points,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+            cross_attentions=all_cross_attentions,
+        )
+
+
+@dataclass
+@auto_docstring(  # NOTE(review): the intro below names RT-DETR although this is the D-FINE model output — confirm
+    custom_intro="""
+    Base class for outputs of the RT-DETR encoder-decoder model.
+    """
+)
+class DFineModelOutput(ModelOutput):
+    r"""
+    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`):
+        Sequence of hidden-states at the output of the last layer of the decoder of the model.
+    intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`):
+        Stacked intermediate hidden states (output of each layer of the decoder).
+    intermediate_logits (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, sequence_length, config.num_labels)`):
+        Stacked intermediate logits (logits of each layer of the decoder).
+    intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`):
+        Stacked intermediate reference points (reference points of each layer of the decoder).
+    intermediate_predicted_corners (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`):
+        Stacked intermediate predicted corners (predicted corners of each layer of the decoder).
+    initial_reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
+        Initial reference points used for the first decoder layer.
+    init_reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
+        Initial reference points sent through the Transformer decoder.
+    enc_topk_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`):
+        Predicted bounding boxes scores where the top `config.two_stage_num_proposals` scoring bounding boxes are
+        picked as region proposals in the encoder stage. Output of bounding box binary classification (i.e.
+        foreground and background).
+    enc_topk_bboxes (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`):
+        Logits of predicted bounding boxes coordinates in the encoder stage.
+    enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`):
+        Predicted bounding boxes scores where the top `config.two_stage_num_proposals` scoring bounding boxes are
+        picked as region proposals in the first stage. Output of bounding box binary classification (i.e.
+        foreground and background).
+    enc_outputs_coord_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`):
+        Logits of predicted bounding boxes coordinates in the first stage.
+    denoising_meta_values (`dict`):
+        Extra dictionary for the denoising related values.
+    """
+
+    last_hidden_state: Optional[torch.FloatTensor] = None
+    intermediate_hidden_states: Optional[torch.FloatTensor] = None
+    intermediate_logits: Optional[torch.FloatTensor] = None
+    intermediate_reference_points: Optional[torch.FloatTensor] = None
+    intermediate_predicted_corners: Optional[torch.FloatTensor] = None
+    initial_reference_points: Optional[torch.FloatTensor] = None  # a separate field from init_reference_points below
+    decoder_hidden_states: Optional[tuple[torch.FloatTensor]] = None  # NOTE(review): this and the next five fields have no entry in the docstring above
+    decoder_attentions: Optional[tuple[torch.FloatTensor]] = None
+    cross_attentions: Optional[tuple[torch.FloatTensor]] = None
+    encoder_last_hidden_state: Optional[torch.FloatTensor] = None
+    encoder_hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    encoder_attentions: Optional[tuple[torch.FloatTensor]] = None
+    init_reference_points: Optional[torch.FloatTensor] = None
+    enc_topk_logits: Optional[torch.FloatTensor] = None
+    enc_topk_bboxes: Optional[torch.FloatTensor] = None
+    enc_outputs_class: Optional[torch.FloatTensor] = None
+    enc_outputs_coord_logits: Optional[torch.FloatTensor] = None
+    denoising_meta_values: Optional[dict] = None
+
+
+class DFineFrozenBatchNorm2d(nn.Module):
+    """
+    BatchNorm2d where the batch statistics and the affine parameters are fixed.
+
+    Copy-paste from torchvision.misc.ops with added eps before rqsrt, without which any other models than
+    torchvision.models.resnet[18,34,50,101] produce nans.
+    """
+
+    def __init__(self, n):
+        super().__init__()
+        self.register_buffer("weight", torch.ones(n))
+        self.register_buffer("bias", torch.zeros(n))
+        self.register_buffer("running_mean", torch.zeros(n))
+        self.register_buffer("running_var", torch.ones(n))
+
+    def _load_from_state_dict(
+        self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
+    ):
+        num_batches_tracked_key = prefix + "num_batches_tracked"
+        if num_batches_tracked_key in state_dict:
+            del state_dict[num_batches_tracked_key]
+
+        super()._load_from_state_dict(
+            state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
+        )
+
+    def forward(self, x):
+        # move reshapes to the beginning
+        # to make it user-friendly
+        weight = self.weight.reshape(1, -1, 1, 1)
+        bias = self.bias.reshape(1, -1, 1, 1)
+        running_var = self.running_var.reshape(1, -1, 1, 1)
+        running_mean = self.running_mean.reshape(1, -1, 1, 1)
+        epsilon = 1e-5
+        scale = weight * (running_var + epsilon).rsqrt()
+        bias = bias - running_mean * scale
+        return x * scale + bias
+
+
+def replace_batch_norm(model):
+    r"""
+    Recursively replace all `torch.nn.BatchNorm2d` with `DFineFrozenBatchNorm2d`.
+
+    Args:
+        model (torch.nn.Module):
+            input model
+    """
+    for name, module in model.named_children():
+        if isinstance(module, nn.BatchNorm2d):
+            new_module = DFineFrozenBatchNorm2d(module.num_features)
+
+            if module.weight.device != torch.device("meta"):
+                new_module.weight.data.copy_(module.weight)
+                new_module.bias.data.copy_(module.bias)
+                new_module.running_mean.data.copy_(module.running_mean)
+                new_module.running_var.data.copy_(module.running_var)
+
+            model._modules[name] = new_module
+
+        if len(list(module.children())) > 0:
+            replace_batch_norm(module)
+
+
+class DFineConvEncoder(nn.Module):
+    """
+    Convolutional backbone using the modeling_d_fine_resnet.py.
+
+    nn.BatchNorm2d layers are replaced by DFineFrozenBatchNorm2d as defined above.
+    https://github.com/lyuwenyu/RT-DETR/blob/main/DFine_pytorch/src/nn/backbone/presnet.py#L142
+    """
+
+    def __init__(self, config):
+        super().__init__()
+
+        backbone = load_backbone(config)
+
+        if config.freeze_backbone_batch_norms:
+            # replace batch norm by frozen batch norm
+            with torch.no_grad():
+                replace_batch_norm(backbone)
+        self.model = backbone
+        self.intermediate_channel_sizes = self.model.channels
+
+    def forward(self, pixel_values: torch.Tensor, pixel_mask: torch.Tensor):
+        # send pixel_values through the model to get list of feature maps
+        features = self.model(pixel_values).feature_maps
+
+        out = []
+        for feature_map in features:
+            # downsample pixel_mask to match shape of corresponding feature_map
+            mask = nn.functional.interpolate(pixel_mask[None].float(), size=feature_map.shape[-2:]).to(torch.bool)[0]
+            out.append((feature_map, mask))
+        return out
+
+
+def get_contrastive_denoising_training_group(
+    targets,
+    num_classes,
+    num_queries,
+    class_embed,
+    num_denoising_queries=100,
+    label_noise_ratio=0.5,
+    box_noise_scale=1.0,
+):
+    """
+    Creates a contrastive denoising training group using ground-truth samples. It adds noise to labels and boxes.
+
+    Args:
+        targets (`list[dict]`):
+            The target objects, each containing 'class_labels' and 'boxes' for objects in an image.
+        num_classes (`int`):
+            Total number of classes in the dataset.
+        num_queries (`int`):
+            Number of query slots in the transformer.
+        class_embed (`callable`):
+            A function or a model layer to embed class labels.
+        num_denoising_queries (`int`, *optional*, defaults to 100):
+            Number of denoising queries.
+        label_noise_ratio (`float`, *optional*, defaults to 0.5):
+            Ratio of noise applied to labels.
+        box_noise_scale (`float`, *optional*, defaults to 1.0):
+            Scale of noise applied to bounding boxes.
+    Returns:
+        `tuple` of four elements (all `None` when `num_denoising_queries <= 0` or no image has a ground truth):
+        - **input_query_class** (`torch.FloatTensor`) --
+          Class queries with applied label noise.
+        - **input_query_bbox** (`torch.FloatTensor`) --
+          Bounding box queries with applied box noise.
+        - **attn_mask** (`torch.FloatTensor`) --
+           Square additive mask over denoising + matching queries: 0 where attention is allowed, -inf where blocked.
+        - **denoising_meta_values** (`dict`) --
+          Metadata including denoising positive indices, number of groups, and split sizes.
+    """
+
+    if num_denoising_queries <= 0:
+        return None, None, None, None
+
+    num_ground_truths = [len(t["class_labels"]) for t in targets]
+    device = targets[0]["class_labels"].device
+
+    max_gt_num = max(num_ground_truths)
+    if max_gt_num == 0:  # no image in the batch has any ground-truth object
+        return None, None, None, None
+
+    num_groups_denoising_queries = num_denoising_queries // max_gt_num
+    num_groups_denoising_queries = 1 if num_groups_denoising_queries == 0 else num_groups_denoising_queries  # at least one group
+    # pad gt to max_num of a batch
+    batch_size = len(num_ground_truths)
+
+    input_query_class = torch.full([batch_size, max_gt_num], num_classes, dtype=torch.int32, device=device)  # num_classes fills padded slots
+    input_query_bbox = torch.zeros([batch_size, max_gt_num, 4], device=device)
+    pad_gt_mask = torch.zeros([batch_size, max_gt_num], dtype=torch.bool, device=device)
+
+    for i in range(batch_size):
+        num_gt = num_ground_truths[i]
+        if num_gt > 0:
+            input_query_class[i, :num_gt] = targets[i]["class_labels"]
+            input_query_bbox[i, :num_gt] = targets[i]["boxes"]
+            pad_gt_mask[i, :num_gt] = 1
+    # each group has positive and negative queries.
+    input_query_class = input_query_class.tile([1, 2 * num_groups_denoising_queries])
+    input_query_bbox = input_query_bbox.tile([1, 2 * num_groups_denoising_queries, 1])
+    pad_gt_mask = pad_gt_mask.tile([1, 2 * num_groups_denoising_queries])
+    # positive and negative mask
+    negative_gt_mask = torch.zeros([batch_size, max_gt_num * 2, 1], device=device)
+    negative_gt_mask[:, max_gt_num:] = 1  # second half of every group is the negative half
+    negative_gt_mask = negative_gt_mask.tile([1, num_groups_denoising_queries, 1])
+    positive_gt_mask = 1 - negative_gt_mask
+    # contrastive denoising training positive index
+    positive_gt_mask = positive_gt_mask.squeeze(-1) * pad_gt_mask
+    denoise_positive_idx = torch.nonzero(positive_gt_mask)[:, 1]  # query-axis index of each non-padded positive query
+    denoise_positive_idx = torch.split(
+        denoise_positive_idx, [n * num_groups_denoising_queries for n in num_ground_truths]
+    )
+    # total denoising queries
+    num_denoising_queries = torch_int(max_gt_num * 2 * num_groups_denoising_queries)
+
+    if label_noise_ratio > 0:
+        mask = torch.rand_like(input_query_class, dtype=torch.float) < (label_noise_ratio * 0.5)
+        # draw a random replacement label for every slot; only selected, non-padded slots take it
+        new_label = torch.randint_like(mask, 0, num_classes, dtype=input_query_class.dtype)
+        input_query_class = torch.where(mask & pad_gt_mask, new_label, input_query_class)
+
+    if box_noise_scale > 0:  # NOTE(review): the inverse_sigmoid below only runs inside this branch
+        known_bbox = center_to_corners_format(input_query_bbox)
+        diff = torch.tile(input_query_bbox[..., 2:] * 0.5, [1, 1, 2]) * box_noise_scale  # scaled half-sizes, one per corner coordinate
+        rand_sign = torch.randint_like(input_query_bbox, 0, 2) * 2.0 - 1.0  # -1.0 or +1.0
+        rand_part = torch.rand_like(input_query_bbox)
+        rand_part = (rand_part + 1.0) * negative_gt_mask + rand_part * (1 - negative_gt_mask)  # [1, 2) for negatives, [0, 1) for positives
+        rand_part *= rand_sign
+        known_bbox += rand_part * diff
+        known_bbox.clip_(min=0.0, max=1.0)
+        input_query_bbox = corners_to_center_format(known_bbox)
+        input_query_bbox = inverse_sigmoid(input_query_bbox)
+
+    input_query_class = class_embed(input_query_class)
+
+    target_size = num_denoising_queries + num_queries
+    attn_mask = torch.full([target_size, target_size], 0, dtype=torch.float, device=device)
+    # match query cannot see the reconstruction
+    attn_mask[num_denoising_queries:, :num_denoising_queries] = -torch.inf
+
+    # reconstructions cannot see each other
+    for i in range(num_groups_denoising_queries):
+        idx_block_start = max_gt_num * 2 * i
+        idx_block_end = max_gt_num * 2 * (i + 1)
+        attn_mask[idx_block_start:idx_block_end, :idx_block_start] = -torch.inf  # block the groups before this one
+        attn_mask[idx_block_start:idx_block_end, idx_block_end:num_denoising_queries] = -torch.inf  # and those after
+
+    denoising_meta_values = {
+        "dn_positive_idx": denoise_positive_idx,
+        "dn_num_group": num_groups_denoising_queries,
+        "dn_num_split": [num_denoising_queries, num_queries],
+    }
+
+    return input_query_class, input_query_bbox, attn_mask, denoising_meta_values
+
+
+@auto_docstring(
+    custom_intro="""
+    RT-DETR Model (consisting of a backbone and encoder-decoder) outputting raw hidden states without any head on top.
+    """
+)
+class DFineModel(DFinePreTrainedModel):
+    def __init__(self, config: DFineConfig):
+        super().__init__(config)
+
+        # Create backbone
+        self.backbone = DFineConvEncoder(config)
+        intermediate_channel_sizes = self.backbone.intermediate_channel_sizes
+        num_backbone_outs = len(config.decoder_in_channels)
+        encoder_input_proj_list = []
+        for _ in range(num_backbone_outs):  # "_" doubles as the level index in this loop
+            in_channels = intermediate_channel_sizes[_]
+            encoder_input_proj_list.append(
+                nn.Sequential(
+                    nn.Conv2d(in_channels, config.encoder_hidden_dim, kernel_size=1, bias=False),
+                    nn.BatchNorm2d(config.encoder_hidden_dim),
+                )
+            )
+        self.encoder_input_proj = nn.ModuleList(encoder_input_proj_list)
+        self.encoder = DFineHybridEncoder(config=config)
+
+        # denoising part
+        if config.num_denoising > 0:
+            self.denoising_class_embed = nn.Embedding(
+                config.num_labels + 1, config.d_model, padding_idx=config.num_labels  # one extra row, used as padding
+            )
+
+        # decoder embedding
+        if config.learn_initial_query:
+            self.weight_embedding = nn.Embedding(config.num_queries, config.d_model)
+
+        # encoder head
+        self.enc_output = nn.Sequential(
+            nn.Linear(config.d_model, config.d_model),
+            nn.LayerNorm(config.d_model, eps=config.layer_norm_eps),
+        )
+        self.enc_score_head = nn.Linear(config.d_model, config.num_labels)
+        self.enc_bbox_head = DFineMLPPredictionHead(config, config.d_model, config.d_model, 4, num_layers=3)
+
+        # init encoder output anchors and valid_mask
+        if config.anchor_image_size:
+            self.anchors, self.valid_mask = self.generate_anchors(dtype=self.dtype)
+        num_backbone_outs = len(config.decoder_in_channels)
+        decoder_input_proj_list = []  # NOTE(review): filled below but never stored on self or read again
+        for _ in range(num_backbone_outs):
+            in_channels = config.decoder_in_channels[_]
+            decoder_input_proj_list.append(
+                nn.Sequential(
+                    nn.Conv2d(in_channels, config.d_model, kernel_size=1, bias=False),
+                    nn.BatchNorm2d(config.d_model, config.batch_norm_eps),
+                )
+            )
+        for _ in range(config.num_feature_levels - num_backbone_outs):
+            decoder_input_proj_list.append(
+                nn.Sequential(
+                    nn.Conv2d(in_channels, config.d_model, kernel_size=3, stride=2, padding=1, bias=False),
+                    nn.BatchNorm2d(config.d_model, config.batch_norm_eps),
+                )
+            )
+            in_channels = config.d_model
+        self.decoder = DFineDecoder(config)
+        decoder_input_proj = []  # this is the list that becomes self.decoder_input_proj
+        in_channels = config.decoder_in_channels[-1]  # every projection below takes the last level's channel count
+        for _ in range(num_backbone_outs):
+            if config.hidden_size == config.decoder_in_channels[-1]:  # widths already match: pass through
+                decoder_input_proj.append(nn.Identity())
+            else:
+                conv = nn.Conv2d(in_channels, config.d_model, kernel_size=1, bias=False)
+                batchnorm = nn.BatchNorm2d(config.d_model, config.batch_norm_eps)
+                decoder_input_proj.append(nn.Sequential(conv, batchnorm))
+        for _ in range(config.num_feature_levels - num_backbone_outs):  # extra levels beyond the backbone outputs
+            if config.hidden_size == config.decoder_in_channels[-1]:
+                decoder_input_proj.append(nn.Identity())
+            else:
+                conv = nn.Conv2d(in_channels, config.d_model, kernel_size=3, stride=2, padding=1, bias=False)
+                batchnorm = nn.BatchNorm2d(config.d_model, config.batch_norm_eps)
+                decoder_input_proj.append(nn.Sequential(conv, batchnorm))
+        self.decoder_input_proj = nn.ModuleList(decoder_input_proj)
+
+        self.post_init()
+
+    def get_encoder(self):  # plain accessor: hands back the encoder module assigned in __init__
+        return self.encoder
+
+    def freeze_backbone(self):
+        for param in self.backbone.parameters():
+            param.requires_grad_(False)
+
+    def unfreeze_backbone(self):
+        for param in self.backbone.parameters():
+            param.requires_grad_(True)
+
+    @compile_compatible_method_lru_cache(maxsize=32)
+    def generate_anchors(self, spatial_shapes=None, grid_size=0.05, device="cpu", dtype=torch.float32):
+        if spatial_shapes is None:
+            spatial_shapes = [
+                [int(self.config.anchor_image_size[0] / s), int(self.config.anchor_image_size[1] / s)]
+                for s in self.config.feat_strides
+            ]
+        anchors = []
+        for level, (height, width) in enumerate(spatial_shapes):
+            grid_y, grid_x = torch.meshgrid(
+                torch.arange(end=height, device=device).to(dtype),
+                torch.arange(end=width, device=device).to(dtype),
+                indexing="ij",
+            )
+            grid_xy = torch.stack([grid_x, grid_y], -1)
+            grid_xy = grid_xy.unsqueeze(0) + 0.5
+            grid_xy[..., 0] /= width
+            grid_xy[..., 1] /= height
+            wh = torch.ones_like(grid_xy) * grid_size * (2.0**level)
+            anchors.append(torch.concat([grid_xy, wh], -1).reshape(-1, height * width, 4))
+        # define the valid range for anchor coordinates
+        eps = 1e-2
+        anchors = torch.concat(anchors, 1)
+        valid_mask = ((anchors > eps) * (anchors < 1 - eps)).all(-1, keepdim=True)
+        anchors = torch.log(anchors / (1 - anchors))
+        anchors = torch.where(valid_mask, anchors, torch.tensor(torch.finfo(dtype).max, dtype=dtype, device=device))
+
+        return anchors, valid_mask
+
+    @auto_docstring
+    def forward(
+        self,
+        pixel_values: torch.FloatTensor,
+        pixel_mask: Optional[torch.LongTensor] = None,
+        encoder_outputs: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[list[dict]] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple[torch.FloatTensor], DFineModelOutput]:
+        r"""
+        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you
+            can choose to directly pass a flattened representation of an image.
+        decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*):
+            Optionally, instead of initializing the queries with a tensor of zeros, you can choose to directly pass an
+            embedded representation.
+        labels (`list[Dict]` of len `(batch_size,)`, *optional*):
+            Labels for computing the bipartite matching loss. List of dicts, each dictionary containing at least the
+            following 2 keys: 'class_labels' and 'boxes' (the class labels and bounding boxes of an image in the batch
+            respectively). The class labels themselves should be a `torch.LongTensor` of len `(number of bounding boxes
+            in the image,)` and the boxes a `torch.FloatTensor` of shape `(number of bounding boxes in the image, 4)`.
+
+        Examples:
+
+        ```python
+        >>> from transformers import AutoImageProcessor, DFineModel
+        >>> from PIL import Image
+        >>> import requests
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> image_processor = AutoImageProcessor.from_pretrained("PekingU/DFine_r50vd")
+        >>> model = DFineModel.from_pretrained("PekingU/DFine_r50vd")
+
+        >>> inputs = image_processor(images=image, return_tensors="pt")
+
+        >>> outputs = model(**inputs)
+
+        >>> last_hidden_states = outputs.last_hidden_state
+        >>> list(last_hidden_states.shape)
+        [1, 300, 256]
+        ```"""
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        batch_size, num_channels, height, width = pixel_values.shape
+        device = pixel_values.device
+
+        if pixel_mask is None:
+            pixel_mask = torch.ones(((batch_size, height, width)), device=device)
+
+        features = self.backbone(pixel_values, pixel_mask)
+
+        proj_feats = [self.encoder_input_proj[level](source) for level, (source, mask) in enumerate(features)]
+
+        if encoder_outputs is None:
+            encoder_outputs = self.encoder(
+                proj_feats,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+            )
+        # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True
+        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
+            encoder_outputs = BaseModelOutput(
+                last_hidden_state=encoder_outputs[0],
+                hidden_states=encoder_outputs[1] if output_hidden_states else None,
+                attentions=encoder_outputs[2]
+                if len(encoder_outputs) > 2
+                else encoder_outputs[1]
+                if output_attentions
+                else None,
+            )
+
+        # Equivalent to def _get_encoder_input
+        # https://github.com/lyuwenyu/RT-DETR/blob/94f5e16708329d2f2716426868ec89aa774af016/DFine_pytorch/src/zoo/DFine/DFine_decoder.py#L412
+        sources = []
+        for level, source in enumerate(encoder_outputs[0]):
+            sources.append(self.decoder_input_proj[level](source))
+
+        # Lowest resolution feature maps are obtained via 3x3 stride 2 convolutions on the final stage
+        if self.config.num_feature_levels > len(sources):
+            _len_sources = len(sources)
+            sources.append(self.decoder_input_proj[_len_sources](encoder_outputs[0])[-1])
+            for i in range(_len_sources + 1, self.config.num_feature_levels):
+                sources.append(self.decoder_input_proj[i](encoder_outputs[0][-1]))
+
+        # Prepare encoder inputs (by flattening)
+        source_flatten = []
+        spatial_shapes_list = []
+        spatial_shapes = torch.empty((len(sources), 2), device=device, dtype=torch.long)
+        for level, source in enumerate(sources):
+            height, width = source.shape[-2:]
+            spatial_shapes[level, 0] = height
+            spatial_shapes[level, 1] = width
+            spatial_shapes_list.append((height, width))
+            source = source.flatten(2).transpose(1, 2)
+            source_flatten.append(source)
+        source_flatten = torch.cat(source_flatten, 1)
+        level_start_index = torch.cat((spatial_shapes.new_zeros((1,)), spatial_shapes.prod(1).cumsum(0)[:-1]))
+
+        # prepare denoising training
+        if self.training and self.config.num_denoising > 0 and labels is not None:
+            (
+                denoising_class,
+                denoising_bbox_unact,
+                attention_mask,
+                denoising_meta_values,
+            ) = get_contrastive_denoising_training_group(
+                targets=labels,
+                num_classes=self.config.num_labels,
+                num_queries=self.config.num_queries,
+                class_embed=self.denoising_class_embed,
+                num_denoising_queries=self.config.num_denoising,
+                label_noise_ratio=self.config.label_noise_ratio,
+                box_noise_scale=self.config.box_noise_scale,
+            )
+        else:
+            denoising_class, denoising_bbox_unact, attention_mask, denoising_meta_values = None, None, None, None
+
+        batch_size = len(source_flatten)
+        device = source_flatten.device
+        dtype = source_flatten.dtype
+
+        # prepare input for decoder
+        if self.training or self.config.anchor_image_size is None:
+            # Pass spatial_shapes as tuple to make it hashable and make sure
+            # lru_cache is working for generate_anchors()
+            spatial_shapes_tuple = tuple(spatial_shapes_list)
+            anchors, valid_mask = self.generate_anchors(spatial_shapes_tuple, device=device, dtype=dtype)
+        else:
+            anchors, valid_mask = self.anchors, self.valid_mask
+            anchors, valid_mask = anchors.to(device, dtype), valid_mask.to(device, dtype)
+
+        # use the valid_mask to selectively retain values in the feature map where the mask is `True`
+        memory = valid_mask.to(source_flatten.dtype) * source_flatten
+
+        output_memory = self.enc_output(memory)
+
+        enc_outputs_class = self.enc_score_head(output_memory)
+        enc_outputs_coord_logits = self.enc_bbox_head(output_memory) + anchors
+
+        _, topk_ind = torch.topk(enc_outputs_class.max(-1).values, self.config.num_queries, dim=1)
+
+        reference_points_unact = enc_outputs_coord_logits.gather(
+            dim=1, index=topk_ind.unsqueeze(-1).repeat(1, 1, enc_outputs_coord_logits.shape[-1])
+        )
+
+        enc_topk_bboxes = F.sigmoid(reference_points_unact)
+        if denoising_bbox_unact is not None:
+            reference_points_unact = torch.concat([denoising_bbox_unact, reference_points_unact], 1)
+
+        enc_topk_logits = enc_outputs_class.gather(
+            dim=1, index=topk_ind.unsqueeze(-1).repeat(1, 1, enc_outputs_class.shape[-1])
+        )
+
+        # extract region features
+        if self.config.learn_initial_query:
+            target = self.weight_embedding.tile([batch_size, 1, 1])
+        else:
+            target = output_memory.gather(dim=1, index=topk_ind.unsqueeze(-1).repeat(1, 1, output_memory.shape[-1]))
+            target = target.detach()
+
+        if denoising_class is not None:
+            target = torch.concat([denoising_class, target], 1)
+
+        init_reference_points = reference_points_unact.detach()
+
+        # decoder
+        decoder_outputs = self.decoder(
+            inputs_embeds=target,
+            encoder_hidden_states=source_flatten,
+            encoder_attention_mask=attention_mask,
+            reference_points=init_reference_points,
+            spatial_shapes=spatial_shapes,
+            spatial_shapes_list=spatial_shapes_list,
+            level_start_index=level_start_index,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        if not return_dict:
+            enc_outputs = tuple(
+                value
+                for value in [enc_topk_logits, enc_topk_bboxes, enc_outputs_class, enc_outputs_coord_logits]
+                if value is not None
+            )
+            dn_outputs = tuple(value if value is not None else None for value in [denoising_meta_values])
+            tuple_outputs = decoder_outputs + encoder_outputs + (init_reference_points,) + enc_outputs + dn_outputs
+
+            return tuple_outputs
+
+        return DFineModelOutput(
+            last_hidden_state=decoder_outputs.last_hidden_state,
+            intermediate_hidden_states=decoder_outputs.intermediate_hidden_states,
+            intermediate_logits=decoder_outputs.intermediate_logits,
+            intermediate_reference_points=decoder_outputs.intermediate_reference_points,
+            intermediate_predicted_corners=decoder_outputs.intermediate_predicted_corners,
+            initial_reference_points=decoder_outputs.initial_reference_points,
+            decoder_hidden_states=decoder_outputs.hidden_states,
+            decoder_attentions=decoder_outputs.attentions,
+            cross_attentions=decoder_outputs.cross_attentions,
+            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
+            encoder_hidden_states=encoder_outputs.hidden_states,
+            encoder_attentions=encoder_outputs.attentions,
+            init_reference_points=init_reference_points,
+            enc_topk_logits=enc_topk_logits,
+            enc_topk_bboxes=enc_topk_bboxes,
+            enc_outputs_class=enc_outputs_class,
+            enc_outputs_coord_logits=enc_outputs_coord_logits,
+            denoising_meta_values=denoising_meta_values,
+        )
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Output type of [`DFineForObjectDetection`].
+    """
+)
+class DFineObjectDetectionOutput(ModelOutput):
+    r"""
+    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided):
+        Total loss as a linear combination of a negative log-likelihood (cross-entropy) for class prediction and a
+        bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized
+        scale-invariant IoU loss.
+    loss_dict (`Dict`, *optional*):
+        A dictionary containing the individual losses. Useful for logging.
+    logits (`torch.FloatTensor` of shape `(batch_size, num_queries, num_classes + 1)`):
+        Classification logits (including no-object) for all queries.
+    pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
+        Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These
+        values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding
+        possible padding). You can use [`~DFineImageProcessor.post_process_object_detection`] to retrieve the
+        unnormalized (absolute) bounding boxes.
+    auxiliary_outputs (`list[Dict]`, *optional*):
+        Optional, only returned when auxiliary losses are activated (i.e. `config.auxiliary_loss` is set to `True`)
+        and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and
+        `pred_boxes`) for each decoder layer.
+    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`):
+        Sequence of hidden-states at the output of the last layer of the decoder of the model.
+    intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`):
+        Stacked intermediate hidden states (output of each layer of the decoder).
+    intermediate_logits (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, config.num_labels)`):
+        Stacked intermediate logits (logits of each layer of the decoder).
+    intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`):
+        Stacked intermediate reference points (reference points of each layer of the decoder).
+    intermediate_predicted_corners (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`):
+        Stacked intermediate predicted corners (predicted corners of each layer of the decoder).
+    initial_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`):
+        Stacked initial reference points (initial reference points of each layer of the decoder).
+    init_reference_points (`torch.FloatTensor` of shape  `(batch_size, num_queries, 4)`):
+        Initial reference points sent through the Transformer decoder.
+    enc_topk_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`):
+        Classification logits of the top-scoring encoder proposals (the ones selected as decoder queries).
+    enc_topk_bboxes (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`):
+        Sigmoid-normalized bounding box coordinates of the top-scoring encoder proposals.
+    enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`):
+        Predicted bounding boxes scores where the top `config.two_stage_num_proposals` scoring bounding boxes are
+        picked as region proposals in the first stage. Output of bounding box binary classification (i.e.
+        foreground and background).
+    enc_outputs_coord_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`):
+        Logits of predicted bounding boxes coordinates in the first stage.
+    denoising_meta_values (`dict`):
+        Extra dictionary for the denoising related values.
+    """
+
+    # Loss-related fields; `DFineForObjectDetection.forward` only fills them when `labels` are passed.
+    loss: Optional[torch.FloatTensor] = None
+    loss_dict: Optional[dict] = None
+    # Final predictions (taken from the last stacked decoder layer by `DFineForObjectDetection.forward`).
+    logits: Optional[torch.FloatTensor] = None
+    pred_boxes: Optional[torch.FloatTensor] = None
+    auxiliary_outputs: Optional[list[dict]] = None
+    # Everything below is copied through from the base model output.
+    last_hidden_state: Optional[torch.FloatTensor] = None
+    intermediate_hidden_states: Optional[torch.FloatTensor] = None
+    intermediate_logits: Optional[torch.FloatTensor] = None
+    intermediate_reference_points: Optional[torch.FloatTensor] = None
+    intermediate_predicted_corners: Optional[torch.FloatTensor] = None
+    initial_reference_points: Optional[torch.FloatTensor] = None
+    decoder_hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    decoder_attentions: Optional[tuple[torch.FloatTensor]] = None
+    cross_attentions: Optional[tuple[torch.FloatTensor]] = None
+    encoder_last_hidden_state: Optional[torch.FloatTensor] = None
+    encoder_hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    encoder_attentions: Optional[tuple[torch.FloatTensor]] = None
+    # NOTE(review): the docstring describes a single tensor, but the annotation says tuple — confirm which is intended.
+    init_reference_points: Optional[tuple[torch.FloatTensor]] = None
+    enc_topk_logits: Optional[torch.FloatTensor] = None
+    enc_topk_bboxes: Optional[torch.FloatTensor] = None
+    enc_outputs_class: Optional[torch.FloatTensor] = None
+    enc_outputs_coord_logits: Optional[torch.FloatTensor] = None
+    denoising_meta_values: Optional[dict] = None
+
+
+# NOTE(review): the intro text below says "RT-DETR" although this class wraps `DFineModel` — presumably carried
+# over from the RT-DETR implementation; confirm before changing the generated documentation.
+@auto_docstring(
+    custom_intro="""
+    RT-DETR Model (consisting of a backbone and encoder-decoder) outputting bounding boxes and logits to be further
+    decoded into scores and classes.
+    """
+)
+class DFineForObjectDetection(DFinePreTrainedModel):
+    # When using clones, all layers > 0 will be clones, but layer 0 *is* required
+    _tied_weights_keys = ["bbox_embed", "class_embed"]
+    # We can't initialize the model on meta device as some weights are modified during the initialization
+    _no_split_modules = None
+
+    def __init__(self, config: DFineConfig):
+        super().__init__(config)
+
+        # D-FINE encoder-decoder model
+        # A negative `config.eval_idx` counts from the end of the decoder stack; convert it to a non-negative index.
+        self.eval_idx = config.eval_idx if config.eval_idx >= 0 else config.decoder_layers + config.eval_idx
+        self.model = DFineModel(config)
+        # Width of the box heads that belong to the decoder layers after `eval_idx`.
+        scaled_dim = round(config.layer_scale * config.hidden_size)
+        num_pred = config.decoder_layers
+        # One classification head per decoder layer.
+        self.class_embed = nn.ModuleList([nn.Linear(config.d_model, config.num_labels) for _ in range(num_pred)])
+        # One box head per decoder layer: the first `eval_idx + 1` heads work at `hidden_size`, the remaining ones
+        # at `scaled_dim`. Every head emits `4 * (max_num_bins + 1)` values.
+        self.bbox_embed = nn.ModuleList(
+            [
+                DFineMLP(config.hidden_size, config.hidden_size, 4 * (config.max_num_bins + 1), 3)
+                for _ in range(self.eval_idx + 1)
+            ]
+            + [
+                DFineMLP(scaled_dim, scaled_dim, 4 * (config.max_num_bins + 1), 3)
+                for _ in range(config.decoder_layers - self.eval_idx - 1)
+            ]
+        )
+
+        # here self.model.decoder.bbox_embed is null, but not self.bbox_embed
+        # The decoder receives the very same module lists (shared objects, not copies).
+        self.model.decoder.class_embed = self.class_embed
+        self.model.decoder.bbox_embed = self.bbox_embed
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @torch.jit.unused
+    def _set_aux_loss(self, outputs_class, outputs_coord):
+        """Pair the entries of `outputs_class` and `outputs_coord` into `{"logits", "pred_boxes"}` dicts."""
+        # this is a workaround to make torchscript happy, as torchscript
+        # doesn't support dictionary with non-homogeneous values, such
+        # as a dict having both a Tensor and a list.
+        return [{"logits": a, "pred_boxes": b} for a, b in zip(outputs_class, outputs_coord)]
+
+    @auto_docstring
+    def forward(
+        self,
+        pixel_values: torch.FloatTensor,
+        pixel_mask: Optional[torch.LongTensor] = None,
+        encoder_outputs: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[list[dict]] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        **kwargs,
+    ) -> Union[tuple[torch.FloatTensor], DFineObjectDetectionOutput]:
+        r"""
+        Example:
+
+        ```python
+        >>> import torch
+        >>> from transformers.image_utils import load_image
+        >>> from transformers import AutoImageProcessor, DFineForObjectDetection
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = load_image(url)
+
+        >>> image_processor = AutoImageProcessor.from_pretrained("ustc-community/dfine-xlarge-coco")
+        >>> model = DFineForObjectDetection.from_pretrained("ustc-community/dfine-xlarge-coco")
+
+        >>> # prepare image for the model
+        >>> inputs = image_processor(images=image, return_tensors="pt")
+
+        >>> # forward pass
+        >>> outputs = model(**inputs)
+
+        >>> logits = outputs.logits
+        >>> list(logits.shape)
+        [1, 300, 80]
+
+        >>> boxes = outputs.pred_boxes
+        >>> list(boxes.shape)
+        [1, 300, 4]
+
+        >>> # convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax)
+        >>> target_sizes = torch.tensor([image.size[::-1]])
+        >>> results = image_processor.post_process_object_detection(outputs, threshold=0.9, target_sizes=target_sizes)
+        >>> result = results[0]  # first image in batch
+
+        >>> for score, label, box in zip(result["scores"], result["labels"], result["boxes"]):
+        ...     box = [round(i, 2) for i in box.tolist()]
+        ...     print(
+        ...         f"Detected {model.config.id2label[label.item()]} with confidence "
+        ...         f"{round(score.item(), 3)} at location {box}"
+        ...     )
+        Detected cat with confidence 0.958 at location [344.49, 23.4, 639.84, 374.27]
+        Detected cat with confidence 0.956 at location [11.71, 53.52, 316.64, 472.33]
+        Detected remote with confidence 0.947 at location [40.46, 73.7, 175.62, 117.57]
+        Detected sofa with confidence 0.918 at location [0.59, 1.88, 640.25, 474.74]
+        ```
+        """
+        # Explicit arguments win; otherwise fall back to the values stored on the config.
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # `labels` are forwarded as well: the base model uses them to build the denoising group while training.
+        outputs = self.model(
+            pixel_values,
+            pixel_mask=pixel_mask,
+            encoder_outputs=encoder_outputs,
+            inputs_embeds=inputs_embeds,
+            decoder_inputs_embeds=decoder_inputs_embeds,
+            labels=labels,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        # In tuple mode the denoising metadata is the last element of the base model output, and it is only read
+        # while training.
+        denoising_meta_values = (
+            outputs.denoising_meta_values if return_dict else outputs[-1] if self.training else None
+        )
+
+        # In tuple mode these four values are read from positions 2-5 of the base model output.
+        # NOTE(review): those positions depend on the decoder's tuple layout, which is defined elsewhere — confirm.
+        outputs_class = outputs.intermediate_logits if return_dict else outputs[2]
+        outputs_coord = outputs.intermediate_reference_points if return_dict else outputs[3]
+        predicted_corners = outputs.intermediate_predicted_corners if return_dict else outputs[4]
+        initial_reference_points = outputs.initial_reference_points if return_dict else outputs[5]
+
+        # Final predictions are the last entry along dim 1 (documented on the output class as the decoder-layer axis).
+        logits = outputs_class[:, -1]
+        pred_boxes = outputs_coord[:, -1]
+
+        loss, loss_dict, auxiliary_outputs, enc_topk_logits, enc_topk_bboxes = None, None, None, None, None
+        if labels is not None:
+            # `DFineModel` appends four encoder outputs followed by the denoising metadata to its tuple output,
+            # which puts the top-k logits and boxes 5th and 4th from the end.
+            enc_topk_logits = outputs.enc_topk_logits if return_dict else outputs[-5]
+            enc_topk_bboxes = outputs.enc_topk_bboxes if return_dict else outputs[-4]
+            loss, loss_dict, auxiliary_outputs = self.loss_function(
+                logits,
+                labels,
+                self.device,
+                pred_boxes,
+                self.config,
+                outputs_class,
+                outputs_coord,
+                enc_topk_logits=enc_topk_logits,
+                enc_topk_bboxes=enc_topk_bboxes,
+                denoising_meta_values=denoising_meta_values,
+                predicted_corners=predicted_corners,
+                initial_reference_points=initial_reference_points,
+                **kwargs,
+            )
+
+        if not return_dict:
+            # Tuple layout: [loss, loss_dict,] logits, pred_boxes, [auxiliary_outputs,] *base model outputs.
+            if auxiliary_outputs is not None:
+                output = (logits, pred_boxes) + (auxiliary_outputs,) + outputs
+            else:
+                output = (logits, pred_boxes) + outputs
+            return ((loss, loss_dict) + output) if loss is not None else output
+
+        return DFineObjectDetectionOutput(
+            loss=loss,
+            loss_dict=loss_dict,
+            logits=logits,
+            pred_boxes=pred_boxes,
+            auxiliary_outputs=auxiliary_outputs,
+            last_hidden_state=outputs.last_hidden_state,
+            intermediate_hidden_states=outputs.intermediate_hidden_states,
+            intermediate_logits=outputs.intermediate_logits,
+            intermediate_reference_points=outputs.intermediate_reference_points,
+            intermediate_predicted_corners=outputs.intermediate_predicted_corners,
+            initial_reference_points=outputs.initial_reference_points,
+            decoder_hidden_states=outputs.decoder_hidden_states,
+            decoder_attentions=outputs.decoder_attentions,
+            cross_attentions=outputs.cross_attentions,
+            encoder_last_hidden_state=outputs.encoder_last_hidden_state,
+            encoder_hidden_states=outputs.encoder_hidden_states,
+            encoder_attentions=outputs.encoder_attentions,
+            init_reference_points=outputs.init_reference_points,
+            enc_topk_logits=outputs.enc_topk_logits,
+            enc_topk_bboxes=outputs.enc_topk_bboxes,
+            enc_outputs_class=outputs.enc_outputs_class,
+            enc_outputs_coord_logits=outputs.enc_outputs_coord_logits,
+            denoising_meta_values=outputs.denoising_meta_values,
+        )
+
+
+# taken from https://github.com/facebookresearch/detr/blob/master/models/detr.py
+class DFineMLPPredictionHead(nn.Module):
+    """
+    Plain multi-layer perceptron (MLP / FFN): a chain of `nn.Linear` layers with a ReLU after every layer except
+    the last one. Intended for regressing the normalized center coordinates, height and width of a bounding box.
+
+    Copied from https://github.com/facebookresearch/detr/blob/master/models/detr.py
+    Origin from https://github.com/lyuwenyu/RT-DETR/blob/94f5e16708329d2f2716426868ec89aa774af016/DFine_paddle/ppdet/modeling/transformers/utils.py#L453
+
+    Args:
+        config: Accepted for signature compatibility; not read by this module.
+        input_dim: Feature size expected by the first linear layer.
+        d_model: Feature size of every hidden layer.
+        output_dim: Feature size produced by the last linear layer.
+        num_layers: Number of linear layers in the chain.
+    """
+
+    def __init__(self, config, input_dim, d_model, output_dim, num_layers):
+        super().__init__()
+        self.num_layers = num_layers
+        hidden_sizes = [d_model] * (num_layers - 1)
+        in_features = [input_dim, *hidden_sizes]
+        out_features = [*hidden_sizes, output_dim]
+        self.layers = nn.ModuleList(
+            [nn.Linear(fan_in, fan_out) for fan_in, fan_out in zip(in_features, out_features)]
+        )
+
+    def forward(self, x):
+        """Run `x` through the chain; the output of the final layer is returned without activation."""
+        last_index = self.num_layers - 1
+        for index, linear in enumerate(self.layers):
+            x = linear(x)
+            if index < last_index:
+                x = nn.functional.relu(x)
+        return x
+
+
+class DFineMLP(nn.Module):
+    """
+    Chain of `nn.Linear` layers with the activation `act` applied after every layer except the last one.
+
+    Args:
+        input_dim: Feature size expected by the first layer.
+        hidden_dim: Feature size of every intermediate layer.
+        output_dim: Feature size produced by the last layer.
+        num_layers: Number of linear layers.
+        act: Key into `ACT2CLS` selecting the activation class.
+    """
+
+    def __init__(self, input_dim: int, hidden_dim: int, output_dim: int, num_layers: int, act: str = "relu"):
+        super().__init__()
+        self.num_layers = num_layers
+        # Consecutive pairs of this list are the (in, out) sizes of the linear layers.
+        widths = [input_dim] + [hidden_dim] * (num_layers - 1) + [output_dim]
+        self.layers = nn.ModuleList(
+            [nn.Linear(fan_in, fan_out) for fan_in, fan_out in zip(widths[:-1], widths[1:])]
+        )
+        self.act = ACT2CLS[act]()
+
+    def forward(self, stat_features: torch.Tensor) -> torch.Tensor:
+        """Apply the layers in order; the last layer's output is returned without activation."""
+        last_index = self.num_layers - 1
+        for index, linear in enumerate(self.layers):
+            stat_features = linear(stat_features)
+            if index < last_index:
+                stat_features = self.act(stat_features)
+        return stat_features
+
+
+class DFineLQE(nn.Module):
+    """
+    Adds a learned quality term to classification scores.
+
+    The term is regressed by a small MLP from statistics of the per-side bin distributions in `pred_corners`:
+    the `top_prob_values` largest probabilities of each side plus their mean.
+    """
+
+    def __init__(self, config: DFineConfig):
+        super().__init__()
+        self.top_prob_values = config.top_prob_values
+        self.max_num_bins = config.max_num_bins
+        # Input width: 4 box sides x (top-k probabilities + their mean).
+        stat_dim = 4 * (self.top_prob_values + 1)
+        self.reg_conf = DFineMLP(stat_dim, config.lqe_hidden_dim, 1, config.lqe_layers)
+
+    def forward(self, scores: torch.Tensor, pred_corners: torch.Tensor) -> torch.Tensor:
+        """Return `scores` plus the quality score predicted from `pred_corners`."""
+        batch_size, length, _ = pred_corners.size()
+        # View the last axis as 4 sides x (max_num_bins + 1) bins and normalise each side over its bins.
+        bin_logits = pred_corners.reshape(batch_size, length, 4, self.max_num_bins + 1)
+        bin_probs = F.softmax(bin_logits, dim=-1)
+        top_probs = bin_probs.topk(self.top_prob_values, dim=-1)[0]
+        top_mean = top_probs.mean(dim=-1, keepdim=True)
+        stats = torch.cat([top_probs, top_mean], dim=-1).reshape(batch_size, length, -1)
+        return scores + self.reg_conf(stats)
+
+
+class DFineConvNormLayer(nn.Module):
+    """
+    Bias-free 2D convolution followed by batch normalization and an optional activation.
+
+    Args:
+        config: Supplies `batch_norm_eps` for the normalization layer.
+        in_channels: Number of input channels of the convolution.
+        out_channels: Number of output channels of the convolution.
+        kernel_size: Convolution kernel size.
+        stride: Convolution stride.
+        groups: Number of convolution groups.
+        padding: Explicit padding; when `None`, `(kernel_size - 1) // 2` is used.
+        activation: Key into `ACT2CLS`; when `None`, no activation is applied.
+    """
+
+    def __init__(
+        self,
+        config: DFineConfig,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: int,
+        stride: int,
+        groups: int = 1,
+        padding: Optional[int] = None,
+        activation: Optional[str] = None,
+    ):
+        super().__init__()
+        if padding is None:
+            padding = (kernel_size - 1) // 2
+        self.conv = nn.Conv2d(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=padding,
+            groups=groups,
+            bias=False,
+        )
+        self.norm = nn.BatchNorm2d(out_channels, eps=config.batch_norm_eps)
+        self.activation = ACT2CLS[activation]() if activation is not None else nn.Identity()
+
+    def forward(self, hidden_state):
+        """Apply convolution, normalization and activation, in that order."""
+        return self.activation(self.norm(self.conv(hidden_state)))
+
+
+class DFineRepVggBlock(nn.Module):
+    """
+    Block from "RepVGG: Making VGG-style ConvNets Great Again": a 3x3 and a 1x1 conv-norm branch are applied to
+    the same input, summed, and passed through the activation named by `config.activation_function`.
+    """
+
+    def __init__(self, config: DFineConfig, in_channels: int, out_channels: int):
+        super().__init__()
+
+        act_name = config.activation_function
+        # Both branches read the same input, so they share `in_channels`.
+        self.conv1 = DFineConvNormLayer(config, in_channels, out_channels, 3, 1, padding=1)
+        self.conv2 = DFineConvNormLayer(config, in_channels, out_channels, 1, 1, padding=0)
+        if act_name is None:
+            self.activation = nn.Identity()
+        else:
+            self.activation = ACT2CLS[act_name]()
+
+    def forward(self, x):
+        """Return the activated sum of the 3x3 and the 1x1 branch."""
+        branch_3x3 = self.conv1(x)
+        branch_1x1 = self.conv2(x)
+        return self.activation(branch_3x3 + branch_1x1)
+
+
+class DFineCSPRepLayer(nn.Module):
+    """
+    Cross Stage Partial (CSP) layer built from RepVGG blocks.
+
+    The input goes through two parallel 1x1 conv-norm layers; one result is refined by `num_blocks` RepVGG blocks,
+    the two results are added, and the sum is projected to `out_channels` when the hidden width differs from it.
+    """
+
+    def __init__(
+        self, config: DFineConfig, in_channels: int, out_channels: int, num_blocks: int, expansion: float = 1.0
+    ):
+        super().__init__()
+        act_name = config.activation_function
+        hidden_channels = int(out_channels * expansion)
+
+        self.conv1 = DFineConvNormLayer(config, in_channels, hidden_channels, 1, 1, activation=act_name)
+        self.conv2 = DFineConvNormLayer(config, in_channels, hidden_channels, 1, 1, activation=act_name)
+        self.bottlenecks = nn.ModuleList(
+            [DFineRepVggBlock(config, hidden_channels, hidden_channels) for _ in range(num_blocks)]
+        )
+        # A final projection is only needed when `expansion` changed the width.
+        if hidden_channels == out_channels:
+            self.conv3 = nn.Identity()
+        else:
+            self.conv3 = DFineConvNormLayer(config, hidden_channels, out_channels, 1, 1, activation=act_name)
+
+    def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
+        """Combine the RepVGG-refined branch with the shortcut branch and project the sum."""
+        refined = self.conv1(hidden_state)
+        for block in self.bottlenecks:
+            refined = block(refined)
+        shortcut = self.conv2(hidden_state)
+        return self.conv3(refined + shortcut)
+
+
+class DFineRepNCSPELAN4(nn.Module):
+    """
+    Feature-fusion block: a 1x1 conv output is split into two halves, the second half is refined by two successive
+    (CSP layer -> 3x3 conv) stages, and both halves plus both stage outputs are concatenated and fused by a final
+    1x1 conv.
+
+    Args:
+        config: Supplies `encoder_hidden_dim` and `hidden_expansion`.
+        act: Activation name used by the conv-norm layers created directly in this block.
+        numb_blocks: Number of RepVGG blocks inside each CSP layer.
+    """
+
+    def __init__(self, config: DFineConfig, act: str = "silu", numb_blocks: int = 3):
+        super().__init__()
+        hidden_dim = config.encoder_hidden_dim
+        in_dim = hidden_dim * 2
+        out_dim = hidden_dim
+        split_total_dim = hidden_dim * 2
+        stage_dim = round(config.hidden_expansion * hidden_dim // 2)
+        # Width of each of the two halves produced from `conv1`'s output.
+        self.conv_dim = split_total_dim // 2
+        self.conv1 = DFineConvNormLayer(config, in_dim, split_total_dim, 1, 1, activation=act)
+        self.csp_rep1 = DFineCSPRepLayer(config, split_total_dim // 2, stage_dim, num_blocks=numb_blocks)
+        self.conv2 = DFineConvNormLayer(config, stage_dim, stage_dim, 3, 1, activation=act)
+        self.csp_rep2 = DFineCSPRepLayer(config, stage_dim, stage_dim, num_blocks=numb_blocks)
+        self.conv3 = DFineConvNormLayer(config, stage_dim, stage_dim, 3, 1, activation=act)
+        # The fusion conv sees both halves (`split_total_dim`) plus the two stage outputs.
+        self.conv4 = DFineConvNormLayer(config, split_total_dim + (2 * stage_dim), out_dim, 1, 1, activation=act)
+
+    def forward(self, input_features: torch.Tensor) -> torch.Tensor:
+        """Return the fused features computed from `input_features`."""
+        first_half, second_half = self.conv1(input_features).split((self.conv_dim, self.conv_dim), 1)
+
+        # The stages are chained: stage two consumes the output of stage one.
+        stage_one = self.conv2(self.csp_rep1(second_half))
+        stage_two = self.conv3(self.csp_rep2(stage_one))
+
+        fused_input = torch.cat([first_half, second_half, stage_one, stage_two], 1)
+        return self.conv4(fused_input)
+
+
+class DFineSCDown(nn.Module):
+    """
+    Two stacked conv-norm layers at `config.encoder_hidden_dim` channels: a 1x1 convolution followed by a
+    convolution with the given `kernel_size` and `stride` whose group count equals the channel count.
+    """
+
+    def __init__(self, config: DFineConfig, kernel_size: int, stride: int):
+        super().__init__()
+        channels = config.encoder_hidden_dim
+        self.conv1 = DFineConvNormLayer(config, channels, channels, 1, 1)
+        self.conv2 = DFineConvNormLayer(config, channels, channels, kernel_size, stride, groups=channels)
+
+    def forward(self, input_features: torch.Tensor) -> torch.Tensor:
+        """Apply the 1x1 layer and then the grouped layer."""
+        return self.conv2(self.conv1(input_features))
+
+
+class DFineEncoderLayer(nn.Module):
+    """
+    Transformer encoder layer: self-attention followed by a two-layer feed-forward network, each combined with
+    dropout, a residual connection and a `LayerNorm`.
+
+    `config.normalize_before` selects whether each `LayerNorm` runs before (pre-norm) or after (post-norm) its
+    sub-block.
+    """
+
+    def __init__(self, config: DFineConfig):
+        super().__init__()
+        embed_dim = config.encoder_hidden_dim
+        self.normalize_before = config.normalize_before
+
+        # self-attention sub-block
+        self.self_attn = DFineMultiheadAttention(
+            embed_dim=embed_dim,
+            num_heads=config.num_attention_heads,
+            dropout=config.dropout,
+        )
+        self.self_attn_layer_norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
+
+        # feed-forward sub-block
+        self.dropout = config.dropout
+        self.activation_fn = ACT2FN[config.encoder_activation_function]
+        self.activation_dropout = config.activation_dropout
+        self.fc1 = nn.Linear(embed_dim, config.encoder_ffn_dim)
+        self.fc2 = nn.Linear(config.encoder_ffn_dim, embed_dim)
+        self.final_layer_norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: torch.Tensor,
+        position_embeddings: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+        **kwargs,
+    ):
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`): layer input of shape `(batch, seq_len, embed_dim)`.
+            attention_mask (`torch.FloatTensor`): mask of size `(batch, 1, target_len, source_len)` in which
+                padding positions hold very large negative values; forwarded unchanged to the attention module.
+            position_embeddings (`torch.FloatTensor`, *optional*):
+                Position embeddings, forwarded unchanged to the attention module.
+            output_attentions (`bool`, *optional*):
+                When `True`, the attention weights are appended to the returned tuple.
+
+        Returns:
+            `(hidden_states,)`, or `(hidden_states, attn_weights)` when `output_attentions` is set.
+        """
+        pre_norm = self.normalize_before
+
+        # --- self-attention sub-block ---
+        attn_input = self.self_attn_layer_norm(hidden_states) if pre_norm else hidden_states
+        attn_output, attn_weights = self.self_attn(
+            hidden_states=attn_input,
+            attention_mask=attention_mask,
+            position_embeddings=position_embeddings,
+            output_attentions=output_attentions,
+        )
+        attn_output = nn.functional.dropout(attn_output, p=self.dropout, training=self.training)
+        hidden_states = hidden_states + attn_output
+        if not pre_norm:
+            hidden_states = self.self_attn_layer_norm(hidden_states)
+
+        # --- feed-forward sub-block ---
+        # In pre-norm mode the residual below is the *normalized* tensor, because the norm is applied before the
+        # residual is captured.
+        if pre_norm:
+            hidden_states = self.final_layer_norm(hidden_states)
+        ffn_output = self.activation_fn(self.fc1(hidden_states))
+        ffn_output = nn.functional.dropout(ffn_output, p=self.activation_dropout, training=self.training)
+        ffn_output = self.fc2(ffn_output)
+        ffn_output = nn.functional.dropout(ffn_output, p=self.dropout, training=self.training)
+        hidden_states = hidden_states + ffn_output
+        if not pre_norm:
+            hidden_states = self.final_layer_norm(hidden_states)
+
+        # While training, bound the activations to the finite range of the dtype if any inf/nan shows up.
+        if self.training and (torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any()):
+            limit = torch.finfo(hidden_states.dtype).max - 1000
+            hidden_states = torch.clamp(hidden_states, min=-limit, max=limit)
+
+        if output_attentions:
+            return (hidden_states, attn_weights)
+        return (hidden_states,)
+
+
+class DFineEncoder(nn.Module):
+    """
+    Stack of `config.encoder_layers` `DFineEncoderLayer` modules applied one after another.
+    """
+
+    def __init__(self, config: DFineConfig):
+        super().__init__()
+
+        self.layers = nn.ModuleList([DFineEncoderLayer(config) for _ in range(config.encoder_layers)])
+
+    def forward(self, src, src_mask=None, pos_embed=None, output_attentions: bool = False):
+        """
+        Apply every layer in `self.layers` in order.
+
+        Args:
+            src: Input hidden states handed to the first layer.
+            src_mask (*optional*): Passed to each layer as `attention_mask`.
+            pos_embed (*optional*): Passed to each layer as `position_embeddings`.
+            output_attentions (`bool`, *optional*): Passed through to each layer.
+
+        Returns:
+            The output of the last layer, i.e. whatever that layer returns (the layers in this file return a tuple
+            whose first element is the hidden states). When the stack is empty, `src` is returned unchanged.
+        """
+        layer_outputs = src
+        for layer in self.layers:
+            # Each layer returns a tuple whose first element is the hidden states; feed only that element to the
+            # next layer rather than the whole tuple, which would break stacks of more than one layer.
+            layer_input = layer_outputs[0] if isinstance(layer_outputs, tuple) else layer_outputs
+            layer_outputs = layer(
+                layer_input,
+                attention_mask=src_mask,
+                position_embeddings=pos_embed,
+                output_attentions=output_attentions,
+            )
+        return layer_outputs
+
+
+class DFineHybridEncoder(nn.Module):
+    """
+    Hybrid encoder consisting of a set of `DFineEncoder` transformer stacks (one per entry of
+    `config.encode_proj_layers`), a top-down Feature Pyramid Network (FPN) and a bottom-up Path Aggregation
+    Network (PAN). More details on the paper: https://huggingface.co/papers/2304.08069
+
+    Args:
+        config: DFineConfig
+    """
+
+    def __init__(self, config: DFineConfig):
+        super().__init__()
+        self.config = config
+        self.in_channels = config.encoder_in_channels
+        self.num_fpn_stages = len(self.in_channels) - 1
+        self.feat_strides = config.feat_strides
+        self.encoder_hidden_dim = config.encoder_hidden_dim
+        self.encode_proj_layers = config.encode_proj_layers
+        self.positional_encoding_temperature = config.positional_encoding_temperature
+        self.eval_size = config.eval_size
+        self.out_channels = [self.encoder_hidden_dim for _ in self.in_channels]
+        self.out_strides = self.feat_strides
+
+        # encoder transformer: one `DFineEncoder` per level index listed in `encode_proj_layers`
+        self.encoder = nn.ModuleList([DFineEncoder(config) for _ in range(len(self.encode_proj_layers))])
+        # top-down fpn: one lateral conv and one fusion block per pair of adjacent levels
+        self.lateral_convs = nn.ModuleList()
+        self.fpn_blocks = nn.ModuleList()
+        for _ in range(len(self.in_channels) - 1, 0, -1):
+            lateral_layer = DFineConvNormLayer(config, self.encoder_hidden_dim, self.encoder_hidden_dim, 1, 1)
+            self.lateral_convs.append(lateral_layer)
+            num_blocks = round(3 * config.depth_mult)
+            fpn_layer = DFineRepNCSPELAN4(config, numb_blocks=num_blocks)
+            self.fpn_blocks.append(fpn_layer)
+
+        # bottom-up pan: one downsampling conv and one fusion block per pair of adjacent levels
+        self.downsample_convs = nn.ModuleList()
+        self.pan_blocks = nn.ModuleList()
+        for _ in range(len(self.in_channels) - 1):
+            self.downsample_convs.append(DFineSCDown(config, 3, 2))
+            num_blocks = round(3 * config.depth_mult)
+            self.pan_blocks.append(DFineRepNCSPELAN4(config, numb_blocks=num_blocks))
+
+    @staticmethod
+    def build_2d_sincos_position_embedding(
+        width, height, embed_dim=256, temperature=10000.0, device="cpu", dtype=torch.float32
+    ):
+        """
+        Build a fixed 2D sine/cosine position embedding for a `width` x `height` grid.
+
+        Args:
+            width: Number of grid positions along the width axis.
+            height: Number of grid positions along the height axis.
+            embed_dim (`int`, defaults to 256): Size of the embedding; must be divisible by 4.
+            temperature (`float`, defaults to 10000.0): Base of the frequency schedule.
+            device: Device on which the tensors are created.
+            dtype: Dtype of the returned tensor.
+
+        Returns:
+            `torch.Tensor` of shape `(1, width * height, embed_dim)`, laid out along the last dimension as
+            `[sin(w), cos(w), sin(h), cos(h)]`, each part of size `embed_dim // 4`.
+
+        Raises:
+            ValueError: If `embed_dim` is not divisible by 4.
+        """
+        grid_w = torch.arange(torch_int(width), device=device).to(dtype)
+        grid_h = torch.arange(torch_int(height), device=device).to(dtype)
+        grid_w, grid_h = torch.meshgrid(grid_w, grid_h, indexing="ij")
+        if embed_dim % 4 != 0:
+            raise ValueError("Embed dimension must be divisible by 4 for 2D sin-cos position embedding")
+        pos_dim = embed_dim // 4
+        # frequencies: 1 / temperature ** (k / pos_dim) for k in [0, pos_dim)
+        omega = torch.arange(pos_dim, device=device).to(dtype) / pos_dim
+        omega = 1.0 / (temperature**omega)
+
+        # outer product of every grid coordinate with every frequency -> (width * height, pos_dim)
+        out_w = grid_w.flatten()[..., None] @ omega[None]
+        out_h = grid_h.flatten()[..., None] @ omega[None]
+
+        return torch.concat([out_w.sin(), out_w.cos(), out_h.sin(), out_h.cos()], dim=1)[None, :, :]
+
+    def forward(
+        self,
+        inputs_embeds=None,
+        attention_mask=None,
+        position_embeddings=None,
+        spatial_shapes=None,
+        level_start_index=None,
+        valid_ratios=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+    ):
+        r"""
+        Args:
+            inputs_embeds (sequence of `torch.FloatTensor`, one per feature level):
+                Multi-scale feature maps, each handled as `(batch, channel, height, width)`. The entries selected by
+                `encode_proj_layers` are overwritten in place with their transformer-encoded version, so the object
+                must support item assignment (presumably a `list` — confirm against the caller).
+            attention_mask (*optional*):
+                Not used by this implementation.
+            position_embeddings (*optional*):
+                Not used by this implementation; position embeddings are built internally.
+            spatial_shapes (*optional*):
+                Not used by this implementation.
+            level_start_index (*optional*):
+                Not used by this implementation.
+            valid_ratios (*optional*):
+                Not used by this implementation.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            output_hidden_states (`bool`, *optional*):
+                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+                for more detail.
+            return_dict (`bool`, *optional*):
+                Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
+
+        Returns:
+            A `BaseModelOutput` whose `last_hidden_state` is the list of PAN feature maps, whose `hidden_states` is
+            the tuple of recorded encoder states (or `None`) and whose `attentions` is the tuple of attention
+            weights (or `None`). When `return_dict` is falsy, a tuple of the non-`None` values among those three.
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        hidden_states = inputs_embeds
+
+        encoder_states = () if output_hidden_states else None
+        all_attentions = () if output_attentions else None
+
+        # encoder
+        if self.config.encoder_layers > 0:
+            for i, enc_ind in enumerate(self.encode_proj_layers):
+                if output_hidden_states:
+                    encoder_states = encoder_states + (hidden_states[enc_ind],)
+                height, width = hidden_states[enc_ind].shape[2:]
+                # flatten [batch, channel, height, width] to [batch, height*width, channel]
+                src_flatten = hidden_states[enc_ind].flatten(2).permute(0, 2, 1)
+                if self.training or self.eval_size is None:
+                    pos_embed = self.build_2d_sincos_position_embedding(
+                        width,
+                        height,
+                        self.encoder_hidden_dim,
+                        self.positional_encoding_temperature,
+                        device=src_flatten.device,
+                        dtype=src_flatten.dtype,
+                    )
+                else:
+                    # NOTE(review): in eval mode with `eval_size` set, no position embedding is passed to the
+                    # encoder — confirm this is intended.
+                    pos_embed = None
+
+                layer_outputs = self.encoder[i](
+                    src_flatten,
+                    pos_embed=pos_embed,
+                    output_attentions=output_attentions,
+                )
+                # restore [batch, height*width, channel] to [batch, channel, height, width]
+                hidden_states[enc_ind] = (
+                    layer_outputs[0].permute(0, 2, 1).reshape(-1, self.encoder_hidden_dim, height, width).contiguous()
+                )
+
+                if output_attentions:
+                    all_attentions = all_attentions + (layer_outputs[1],)
+
+            # Record the encoded map of the last processed level. The index is read from `encode_proj_layers`
+            # explicitly instead of reusing the loop variable, which is unbound when the list is empty.
+            if output_hidden_states and self.encode_proj_layers:
+                encoder_states = encoder_states + (hidden_states[self.encode_proj_layers[-1]],)
+
+        # top-down FPN
+        fpn_feature_maps = [hidden_states[-1]]
+        for idx, (lateral_conv, fpn_block) in enumerate(zip(self.lateral_convs, self.fpn_blocks)):
+            backbone_feature_map = hidden_states[self.num_fpn_stages - idx - 1]
+            top_fpn_feature_map = fpn_feature_maps[-1]
+            # apply lateral block
+            top_fpn_feature_map = lateral_conv(top_fpn_feature_map)
+            fpn_feature_maps[-1] = top_fpn_feature_map
+            # apply fpn block
+            top_fpn_feature_map = F.interpolate(top_fpn_feature_map, scale_factor=2.0, mode="nearest")
+            fused_feature_map = torch.concat([top_fpn_feature_map, backbone_feature_map], dim=1)
+            new_fpn_feature_map = fpn_block(fused_feature_map)
+            fpn_feature_maps.append(new_fpn_feature_map)
+
+        # reverse so that index 0 is the map produced last by the top-down pass
+        fpn_feature_maps = fpn_feature_maps[::-1]
+
+        # bottom-up PAN
+        pan_feature_maps = [fpn_feature_maps[0]]
+        for idx, (downsample_conv, pan_block) in enumerate(zip(self.downsample_convs, self.pan_blocks)):
+            top_pan_feature_map = pan_feature_maps[-1]
+            fpn_feature_map = fpn_feature_maps[idx + 1]
+            downsampled_feature_map = downsample_conv(top_pan_feature_map)
+            fused_feature_map = torch.concat([downsampled_feature_map, fpn_feature_map], dim=1)
+            new_pan_feature_map = pan_block(fused_feature_map)
+            pan_feature_maps.append(new_pan_feature_map)
+
+        if not return_dict:
+            return tuple(v for v in [pan_feature_maps, encoder_states, all_attentions] if v is not None)
+        return BaseModelOutput(
+            last_hidden_state=pan_feature_maps, hidden_states=encoder_states, attentions=all_attentions
+        )
+
+
+__all__ = ["DFineModel", "DFinePreTrainedModel", "DFineForObjectDetection"]  # names exported by `import *`
diff --git a/phivenv/Lib/site-packages/transformers/models/d_fine/modular_d_fine.py b/phivenv/Lib/site-packages/transformers/models/d_fine/modular_d_fine.py
new file mode 100644
index 0000000000000000000000000000000000000000..52ac7fef7b0d9a6b414bbf3c72173ac93520a29b
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/d_fine/modular_d_fine.py
@@ -0,0 +1,1221 @@
+# coding=utf-8
+# Copyright 2025 Baidu Inc and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import math
+from typing import Any, Optional
+
+import torch
+import torch.nn.functional as F
+import torch.nn.init as init
+from torch import nn
+
+from ...activations import ACT2CLS
+from ...configuration_utils import PretrainedConfig
+from ...image_transforms import corners_to_center_format
+from ...utils import is_torchdynamo_compiling, logging
+from ...utils.backbone_utils import verify_backbone_config_arguments
+from ..auto import CONFIG_MAPPING
+from ..rt_detr.modeling_rt_detr import (
+    RTDetrConvNormLayer,
+    RTDetrDecoder,
+    RTDetrDecoderLayer,
+    RTDetrDecoderOutput,
+    RTDetrEncoder,
+    RTDetrForObjectDetection,
+    RTDetrHybridEncoder,
+    RTDetrMLPPredictionHead,
+    RTDetrModel,
+    RTDetrPreTrainedModel,
+    RTDetrRepVggBlock,
+    inverse_sigmoid,
+)
+from ..rt_detr_v2.modeling_rt_detr_v2 import multi_scale_deformable_attention_v2
+
+
+logger = logging.get_logger(__name__)  # module-level logger, keyed by this module's name
+
+
+# TODO: Attribute map assignment logic should be fixed in modular
+# as well as super() call parsing because otherwise we cannot re-write args after initialization
+class DFineConfig(PretrainedConfig):
+    """
+    This is the configuration class to store the configuration of a [`DFineModel`]. It is used to instantiate a D-FINE
+    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+    defaults will yield a similar configuration to that of D-FINE-X-COCO [ustc-community/dfine-xlarge-coco](https://huggingface.co/ustc-community/dfine-xlarge-coco).
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        initializer_range (`float`, *optional*, defaults to 0.01):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        initializer_bias_prior_prob (`float`, *optional*):
+            The prior probability used by the bias initializer to initialize biases for `enc_score_head` and `class_embed`.
+            If `None`, `prior_prob` computed as `prior_prob = 1 / (num_labels + 1)` while initializing model weights.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the layer normalization layers.
+        batch_norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the batch normalization layers.
+        backbone_config (`Dict`, *optional*, defaults to the `"hgnet_v2"` config class from `CONFIG_MAPPING`):
+            The configuration of the backbone model.
+        backbone (`str`, *optional*):
+            Name of backbone to use when `backbone_config` is `None`. If `use_pretrained_backbone` is `True`, this
+            will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone`
+            is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights.
+        use_pretrained_backbone (`bool`, *optional*, defaults to `False`):
+            Whether to use pretrained weights for the backbone.
+        use_timm_backbone (`bool`, *optional*, defaults to `False`):
+            Whether to load `backbone` from the timm library. If `False`, the backbone is loaded from the transformers
+            library.
+        freeze_backbone_batch_norms (`bool`, *optional*, defaults to `True`):
+            Whether to freeze the batch normalization layers in the backbone.
+        backbone_kwargs (`dict`, *optional*):
+            Keyword arguments to be passed to AutoBackbone when loading from a checkpoint
+            e.g. `{'out_indices': (0, 1, 2, 3)}`. Cannot be specified if `backbone_config` is set.
+        encoder_hidden_dim (`int`, *optional*, defaults to 256):
+            Dimension of the layers in hybrid encoder.
+        encoder_in_channels (`list`, *optional*, defaults to `[512, 1024, 2048]`):
+            Multi level features input for encoder.
+        feat_strides (`list[int]`, *optional*, defaults to `[8, 16, 32]`):
+            Strides used in each feature map.
+        encoder_layers (`int`, *optional*, defaults to 1):
+            Total of layers to be used by the encoder.
+        encoder_ffn_dim (`int`, *optional*, defaults to 1024):
+            Dimension of the "intermediate" (often named feed-forward) layer in the encoder.
+        encoder_attention_heads (`int`, *optional*, defaults to 8):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        dropout (`float`, *optional*, defaults to 0.0):
+            The ratio for all dropout layers.
+        activation_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for activations inside the fully connected layer.
+        encode_proj_layers (`list[int]`, *optional*, defaults to `[2]`):
+            Indexes of the projected layers to be used in the encoder.
+        positional_encoding_temperature (`int`, *optional*, defaults to 10000):
+            The temperature parameter used to create the positional encodings.
+        encoder_activation_function (`str`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"silu"` and `"gelu_new"` are supported.
+        activation_function (`str`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the general layer. If string, `"gelu"`,
+            `"relu"`, `"silu"` and `"gelu_new"` are supported.
+        eval_size (`tuple[int, int]`, *optional*):
+            Height and width used to compute the effective height and width of the position embeddings after taking
+            into account the stride.
+        normalize_before (`bool`, *optional*, defaults to `False`):
+            Determine whether to apply layer normalization in the transformer encoder layer before self-attention and
+            feed-forward modules.
+        hidden_expansion (`float`, *optional*, defaults to 1.0):
+            Expansion ratio to enlarge the dimension size of RepVGGBlock and CSPRepLayer.
+        d_model (`int`, *optional*, defaults to 256):
+            Dimension of the layers exclude hybrid encoder.
+        num_queries (`int`, *optional*, defaults to 300):
+            Number of object queries.
+        decoder_in_channels (`list`, *optional*, defaults to `[256, 256, 256]`):
+            Multi level features dimension for decoder
+        decoder_ffn_dim (`int`, *optional*, defaults to 1024):
+            Dimension of the "intermediate" (often named feed-forward) layer in decoder.
+        num_feature_levels (`int`, *optional*, defaults to 3):
+            The number of input feature levels.
+        decoder_n_points (`int`, *optional*, defaults to 4):
+            The number of sampled keys in each feature level for each attention head in the decoder.
+        decoder_layers (`int`, *optional*, defaults to 6):
+            Number of decoder layers.
+        decoder_attention_heads (`int`, *optional*, defaults to 8):
+            Number of attention heads for each attention layer in the Transformer decoder.
+        decoder_activation_function (`str`, *optional*, defaults to `"relu"`):
+            The non-linear activation function (function or string) in the decoder. If string, `"gelu"`,
+            `"relu"`, `"silu"` and `"gelu_new"` are supported.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        num_denoising (`int`, *optional*, defaults to 100):
+            The total number of denoising tasks or queries to be used for contrastive denoising.
+        label_noise_ratio (`float`, *optional*, defaults to 0.5):
+            The fraction of denoising labels to which random noise should be added.
+        box_noise_scale (`float`, *optional*, defaults to 1.0):
+            Scale or magnitude of noise to be added to the bounding boxes.
+        learn_initial_query (`bool`, *optional*, defaults to `False`):
+            Indicates whether the initial query embeddings for the decoder should be learned during training
+        anchor_image_size (`tuple[int, int]`, *optional*):
+            Height and width of the input image used during evaluation to generate the bounding box anchors. If `None`, anchors are generated automatically.
+        with_box_refine (`bool`, *optional*, defaults to `True`):
+            Whether to apply iterative bounding box refinement, where each decoder layer refines the bounding boxes
+            based on the predictions from the previous layer.
+        is_encoder_decoder (`bool`, *optional*, defaults to `True`):
+            Whether the architecture has an encoder decoder structure.
+        matcher_alpha (`float`, *optional*, defaults to 0.25):
+            Parameter alpha used by the Hungarian Matcher.
+        matcher_gamma (`float`, *optional*, defaults to 2.0):
+            Parameter gamma used by the Hungarian Matcher.
+        matcher_class_cost (`float`, *optional*, defaults to 2.0):
+            The relative weight of the class loss used by the Hungarian Matcher.
+        matcher_bbox_cost (`float`, *optional*, defaults to 5.0):
+            The relative weight of the bounding box loss used by the Hungarian Matcher.
+        matcher_giou_cost (`float`, *optional*, defaults to 2.0):
+            The relative weight of the giou loss used by the Hungarian Matcher.
+        use_focal_loss (`bool`, *optional*, defaults to `True`):
+            Parameter informing if focal loss should be used.
+        auxiliary_loss (`bool`, *optional*, defaults to `True`):
+            Whether auxiliary decoding losses (loss at each decoder layer) are to be used.
+        focal_loss_alpha (`float`, *optional*, defaults to 0.75):
+            Parameter alpha used to compute the focal loss.
+        focal_loss_gamma (`float`, *optional*, defaults to 2.0):
+            Parameter gamma used to compute the focal loss.
+        weight_loss_vfl (`float`, *optional*, defaults to 1.0):
+            Relative weight of the varifocal loss in the object detection loss.
+        weight_loss_bbox (`float`, *optional*, defaults to 5.0):
+            Relative weight of the L1 bounding box loss in the object detection loss.
+        weight_loss_giou (`float`, *optional*, defaults to 2.0):
+            Relative weight of the generalized IoU loss in the object detection loss.
+        weight_loss_fgl (`float`, *optional*, defaults to 0.15):
+            Relative weight of the fine-grained localization loss in the object detection loss.
+        weight_loss_ddf (`float`, *optional*, defaults to 1.5):
+            Relative weight of the decoupled distillation focal loss in the object detection loss.
+        eos_coefficient (`float`, *optional*, defaults to 0.0001):
+            Relative classification weight of the 'no-object' class in the object detection loss.
+        eval_idx (`int`, *optional*, defaults to -1):
+            Index of the decoder layer to use for evaluation. If negative, counts from the end
+            (e.g., -1 means use the last layer). This allows for early prediction in the decoder
+            stack while still training later layers.
+        layer_scale (`float`, *optional*, defaults to `1.0`):
+            Scaling factor for the hidden dimension in later decoder layers. Used to adjust the
+            model capacity after the evaluation layer.
+        max_num_bins (`int`, *optional*, defaults to 32):
+            Maximum number of bins for the distribution-guided bounding box refinement.
+            Higher values allow for more fine-grained localization but increase computation.
+        reg_scale (`float`, *optional*, defaults to 4.0):
+            Scale factor for the regression distribution. Controls the range and granularity
+            of the bounding box refinement process.
+        depth_mult (`float`, *optional*, defaults to 1.0):
+            Multiplier for the number of blocks in RepNCSPELAN4 layers. Used to scale the model's
+            depth while maintaining its architecture.
+        top_prob_values (`int`, *optional*, defaults to 4):
+            Number of top probability values to consider from each corner's distribution.
+        lqe_hidden_dim (`int`, *optional*, defaults to 64):
+            Hidden dimension size for the Location Quality Estimator (LQE) network.
+        lqe_layers (`int`, *optional*, defaults to 2):
+            Number of layers in the Location Quality Estimator MLP.
+        decoder_offset_scale (`float`, *optional*, defaults to 0.5):
+            Offset scale used in deformable attention.
+        decoder_method (`str`, *optional*, defaults to `"default"`):
+            The method to use for the decoder: `"default"` or `"discrete"`.
+        up (`float`, *optional*, defaults to 0.5):
+            Controls the upper bounds of the Weighting Function.
+    """
+
+    model_type = "d_fine"
+    layer_types = ["basic", "bottleneck"]
+    attribute_map = {
+        "hidden_size": "d_model",
+        "num_attention_heads": "encoder_attention_heads",
+    }
+
+    def __init__(
+        self,
+        initializer_range=0.01,
+        initializer_bias_prior_prob=None,
+        layer_norm_eps=1e-5,
+        batch_norm_eps=1e-5,
+        # backbone
+        backbone_config=None,
+        backbone=None,
+        use_pretrained_backbone=False,
+        use_timm_backbone=False,
+        freeze_backbone_batch_norms=True,
+        backbone_kwargs=None,
+        # encoder HybridEncoder
+        encoder_hidden_dim=256,
+        encoder_in_channels=[512, 1024, 2048],
+        feat_strides=[8, 16, 32],
+        encoder_layers=1,
+        encoder_ffn_dim=1024,
+        encoder_attention_heads=8,
+        dropout=0.0,
+        activation_dropout=0.0,
+        encode_proj_layers=[2],
+        positional_encoding_temperature=10000,
+        encoder_activation_function="gelu",
+        activation_function="silu",
+        eval_size=None,
+        normalize_before=False,
+        hidden_expansion=1.0,
+        # decoder DFineTransformer
+        d_model=256,
+        num_queries=300,
+        decoder_in_channels=[256, 256, 256],
+        decoder_ffn_dim=1024,
+        num_feature_levels=3,
+        decoder_n_points=4,
+        decoder_layers=6,
+        decoder_attention_heads=8,
+        decoder_activation_function="relu",
+        attention_dropout=0.0,
+        num_denoising=100,
+        label_noise_ratio=0.5,
+        box_noise_scale=1.0,
+        learn_initial_query=False,
+        anchor_image_size=None,
+        with_box_refine=True,
+        is_encoder_decoder=True,
+        # Loss
+        matcher_alpha=0.25,
+        matcher_gamma=2.0,
+        matcher_class_cost=2.0,
+        matcher_bbox_cost=5.0,
+        matcher_giou_cost=2.0,
+        use_focal_loss=True,
+        auxiliary_loss=True,
+        focal_loss_alpha=0.75,
+        focal_loss_gamma=2.0,
+        weight_loss_vfl=1.0,
+        weight_loss_bbox=5.0,
+        weight_loss_giou=2.0,
+        weight_loss_fgl=0.15,
+        weight_loss_ddf=1.5,
+        eos_coefficient=1e-4,
+        eval_idx=-1,
+        layer_scale=1,
+        max_num_bins=32,
+        reg_scale=4.0,
+        depth_mult=1.0,
+        top_prob_values=4,
+        lqe_hidden_dim=64,
+        lqe_layers=2,
+        decoder_offset_scale=0.5,
+        decoder_method="default",
+        up=0.5,
+        **kwargs,
+    ):
+        """
+        Store every argument as an attribute of the same name and resolve the backbone configuration.
+
+        When both `backbone_config` and `backbone` are `None`, a default backbone config is built from the
+        `"hgnet_v2"` entry of `CONFIG_MAPPING`. When `backbone_config` is a `dict`, it is converted to the config
+        class named by its `"model_type"` key; the caller's dict is left untouched.
+
+        Raises:
+            ValueError: If `decoder_n_points` is a list whose length differs from `num_feature_levels`, or if
+                `d_model` is not divisible by `decoder_attention_heads`.
+        """
+
+        def _copy_if_list(value):
+            # The list-valued defaults in the signature are created once and shared by every call. Storing a copy
+            # keeps a mutation of one config's attribute from leaking into other instances or into the defaults.
+            return list(value) if isinstance(value, list) else value
+
+        self.initializer_range = initializer_range
+        self.initializer_bias_prior_prob = initializer_bias_prior_prob
+        self.layer_norm_eps = layer_norm_eps
+        self.batch_norm_eps = batch_norm_eps
+        # backbone
+        if backbone_config is None and backbone is None:
+            logger.info(
+                "`backbone_config` and `backbone` are `None`. Initializing the config with the default `HGNet-V2` backbone."
+            )
+            backbone_model_type = "hgnet_v2"
+            # look up the config class registered for the `hgnet_v2` model type
+            config_class = CONFIG_MAPPING[backbone_model_type]
+            backbone_config = config_class(
+                num_channels=3,
+                embedding_size=64,
+                hidden_sizes=[256, 512, 1024, 2048],
+                depths=[3, 4, 6, 3],
+                layer_type="bottleneck",
+                hidden_act="relu",
+                downsample_in_first_stage=False,
+                downsample_in_bottleneck=False,
+                out_features=None,
+                out_indices=[2, 3, 4],
+            )
+        elif isinstance(backbone_config, dict):
+            # Work on a shallow copy so that popping "model_type" does not alter the caller's dict.
+            backbone_config = dict(backbone_config)
+            backbone_model_type = backbone_config.pop("model_type")
+            config_class = CONFIG_MAPPING[backbone_model_type]
+            backbone_config = config_class.from_dict(backbone_config)
+
+        verify_backbone_config_arguments(
+            use_timm_backbone=use_timm_backbone,
+            use_pretrained_backbone=use_pretrained_backbone,
+            backbone=backbone,
+            backbone_config=backbone_config,
+            backbone_kwargs=backbone_kwargs,
+        )
+
+        self.backbone_config = backbone_config
+        self.backbone = backbone
+        self.use_pretrained_backbone = use_pretrained_backbone
+        self.use_timm_backbone = use_timm_backbone
+        self.freeze_backbone_batch_norms = freeze_backbone_batch_norms
+        self.backbone_kwargs = backbone_kwargs
+        # encoder
+        self.encoder_hidden_dim = encoder_hidden_dim
+        self.encoder_in_channels = _copy_if_list(encoder_in_channels)
+        self.feat_strides = _copy_if_list(feat_strides)
+        self.encoder_attention_heads = encoder_attention_heads
+        self.encoder_ffn_dim = encoder_ffn_dim
+        self.dropout = dropout
+        self.activation_dropout = activation_dropout
+        self.encode_proj_layers = _copy_if_list(encode_proj_layers)
+        self.encoder_layers = encoder_layers
+        self.positional_encoding_temperature = positional_encoding_temperature
+        self.eval_size = eval_size
+        self.normalize_before = normalize_before
+        self.encoder_activation_function = encoder_activation_function
+        self.activation_function = activation_function
+        self.hidden_expansion = hidden_expansion
+        # decoder
+        self.d_model = d_model
+        self.num_queries = num_queries
+        self.decoder_ffn_dim = decoder_ffn_dim
+        self.decoder_in_channels = _copy_if_list(decoder_in_channels)
+        self.num_feature_levels = num_feature_levels
+        self.decoder_n_points = decoder_n_points
+        self.decoder_layers = decoder_layers
+        self.decoder_attention_heads = decoder_attention_heads
+        self.decoder_activation_function = decoder_activation_function
+        self.attention_dropout = attention_dropout
+        self.num_denoising = num_denoising
+        self.label_noise_ratio = label_noise_ratio
+        self.box_noise_scale = box_noise_scale
+        self.learn_initial_query = learn_initial_query
+        self.anchor_image_size = anchor_image_size
+        self.auxiliary_loss = auxiliary_loss
+        self.with_box_refine = with_box_refine
+        # Loss
+        self.matcher_alpha = matcher_alpha
+        self.matcher_gamma = matcher_gamma
+        self.matcher_class_cost = matcher_class_cost
+        self.matcher_bbox_cost = matcher_bbox_cost
+        self.matcher_giou_cost = matcher_giou_cost
+        self.use_focal_loss = use_focal_loss
+        self.focal_loss_alpha = focal_loss_alpha
+        self.focal_loss_gamma = focal_loss_gamma
+        self.weight_loss_vfl = weight_loss_vfl
+        self.weight_loss_bbox = weight_loss_bbox
+        self.weight_loss_giou = weight_loss_giou
+        self.weight_loss_fgl = weight_loss_fgl
+        self.weight_loss_ddf = weight_loss_ddf
+        self.eos_coefficient = eos_coefficient
+        # add the new attributes with the given values or defaults
+        self.eval_idx = eval_idx
+        self.layer_scale = layer_scale
+        self.max_num_bins = max_num_bins
+        self.reg_scale = reg_scale
+        self.depth_mult = depth_mult
+        self.decoder_offset_scale = decoder_offset_scale
+        self.decoder_method = decoder_method
+        self.top_prob_values = top_prob_values
+        self.lqe_hidden_dim = lqe_hidden_dim
+        self.lqe_layers = lqe_layers
+        self.up = up
+
+        # a per-level list of sampling points must provide exactly one entry per feature level
+        if isinstance(self.decoder_n_points, list) and len(self.decoder_n_points) != self.num_feature_levels:
+            raise ValueError(
+                f"Length of decoder_n_points list ({len(self.decoder_n_points)}) must match num_feature_levels ({self.num_feature_levels})."
+            )
+
+        head_dim = self.d_model // self.decoder_attention_heads
+        if head_dim * self.decoder_attention_heads != self.d_model:
+            raise ValueError(
+                f"Embedded dimension {self.d_model} must be divisible by decoder_attention_heads {self.decoder_attention_heads}"
+            )
+        super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs)
+
+    @property
+    def num_attention_heads(self) -> int:
+        """Read-only alias that returns `self.encoder_attention_heads`."""
+        return self.encoder_attention_heads
+
+    @property
+    def hidden_size(self) -> int:
+        """Read-only alias that returns `self.d_model`."""
+        return self.d_model
+
+    @property
+    def sub_configs(self):
+        """
+        Map nested configuration attribute names to their classes.
+
+        Returns:
+            `dict`: `{"backbone_config": <class of the backbone config>}` when a backbone config is set,
+            otherwise an empty dict.
+        """
+        # The attribute may not exist yet (or may be None), so look it up defensively.
+        if getattr(self, "backbone_config", None) is None:
+            return {}
+        return {"backbone_config": type(self.backbone_config)}
+
+    @classmethod
+    def from_backbone_configs(cls, backbone_config: PretrainedConfig, **kwargs):
+        """Instantiate a [`DFineConfig`] (or a derived class) from a pre-trained backbone model configuration.
+
+            Args:
+                backbone_config ([`PretrainedConfig`]):
+                    The backbone configuration.
+                **kwargs:
+                    Additional keyword arguments, forwarded unchanged to the class constructor.
+
+            Returns:
+                [`DFineConfig`]: An instance of a configuration object
+        """
+        return cls(
+            backbone_config=backbone_config,
+            **kwargs,
+        )
+
+
+class DFineMultiscaleDeformableAttention(nn.Module):
+    """
+    D-Fine version of multiscale deformable attention.
+
+    Each query predicts, for every attention head, a set of 2D sampling offsets and one attention weight
+    per sampling point. The number of points per feature level comes from `config.decoder_n_points`
+    (a single int applied to every level, or one entry per level). The actual sampling and aggregation is
+    delegated to `multi_scale_deformable_attention_v2`.
+    """
+
+    def __init__(self, config: DFineConfig):
+        super().__init__()
+        self.d_model = config.d_model
+        self.n_heads = config.decoder_attention_heads
+        self.n_levels = config.num_feature_levels
+        self.offset_scale = config.decoder_offset_scale
+        self.decoder_method = config.decoder_method
+        self.n_points = config.decoder_n_points
+
+        # Normalise `decoder_n_points` to one entry per feature level.
+        if isinstance(self.n_points, list):
+            num_points_list = self.n_points
+        else:
+            num_points_list = [self.n_points for _ in range(self.n_levels)]
+
+        self.num_points_list = num_points_list
+        # One scale factor (1 / points-in-level) per sampling point, laid out level by level.
+        num_points_scale = [1 / n for n in self.num_points_list for _ in range(n)]
+        self.register_buffer("num_points_scale", torch.tensor(num_points_scale, dtype=torch.float32))
+
+        self.total_points = self.n_heads * sum(self.num_points_list)
+
+        self.sampling_offsets = nn.Linear(self.d_model, self.total_points * 2)
+        self.attention_weights = nn.Linear(self.d_model, self.total_points)
+
+        self.ms_deformable_attn_core = multi_scale_deformable_attention_v2
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        reference_points=None,
+        encoder_hidden_states=None,
+        spatial_shapes=None,
+        spatial_shapes_list=None,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """
+        Sample `encoder_hidden_states` around `reference_points` and aggregate the samples.
+
+        Args:
+            hidden_states (`torch.Tensor`):
+                3-D query tensor; its first two dimensions are read as `(batch_size, num_queries)`.
+            attention_mask (`torch.Tensor`, *optional*):
+                Boolean mask; positions where it is `False` are zeroed in the value tensor.
+            reference_points (`torch.Tensor`):
+                Reference locations whose last dimension is 2 or 4.
+            encoder_hidden_states (`torch.Tensor`):
+                3-D tensor; its first two dimensions are read as `(batch_size, sequence_length)`.
+            spatial_shapes:
+                2-D collection with one row per feature level; the product of its two columns, summed over
+                rows, must equal `sequence_length`.
+            spatial_shapes_list:
+                Forwarded unchanged to the attention core.
+
+        Returns:
+            `tuple(torch.Tensor, torch.Tensor)`: the attention core output and the softmax-normalised
+            attention weights.
+
+        Raises:
+            ValueError: if `spatial_shapes` does not add up to `sequence_length`, or if the last dimension of
+                `reference_points` is neither 2 nor 4.
+        """
+        batch_size, num_queries, _ = hidden_states.shape
+        batch_size, sequence_length, _ = encoder_hidden_states.shape
+
+        if not is_torchdynamo_compiling() and (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() != sequence_length:
+            raise ValueError(
+                "Make sure to align the spatial shapes with the sequence length of the encoder hidden states"
+            )
+
+        # Reshape for multi-head attention
+        value = encoder_hidden_states.reshape(batch_size, sequence_length, self.n_heads, self.d_model // self.n_heads)
+        if attention_mask is not None:
+            # NOTE(review): the mask gets a single trailing axis but `value` is already 4-D here, so a
+            # (batch, sequence) mask would presumably not broadcast - confirm the expected mask shape.
+            value = value.masked_fill(~attention_mask[..., None], float(0))
+
+        # Number of sampling points handled by each head, summed over all feature levels.
+        points_per_head = sum(self.num_points_list)
+
+        sampling_offsets: torch.Tensor = self.sampling_offsets(hidden_states)
+        sampling_offsets = sampling_offsets.reshape(batch_size, num_queries, self.n_heads, points_per_head, 2)
+
+        attention_weights = self.attention_weights(hidden_states).reshape(
+            batch_size, num_queries, self.n_heads, points_per_head
+        )
+        attention_weights = F.softmax(attention_weights, dim=-1)
+
+        if reference_points.shape[-1] == 2:
+            # `as_tensor` reuses an existing tensor instead of copy-constructing it, which avoids the
+            # copy-construct warning `torch.tensor` emits when handed a tensor.
+            offset_normalizer = torch.as_tensor(spatial_shapes)
+            offset_normalizer = offset_normalizer.flip([1]).reshape(1, 1, 1, self.n_levels, 1, 2)
+            # NOTE(review): this reshape uses `sequence_length` where `num_queries` might be intended - confirm.
+            sampling_locations = (
+                reference_points.reshape(batch_size, sequence_length, 1, self.n_levels, 1, 2)
+                + sampling_offsets / offset_normalizer
+            )
+        elif reference_points.shape[-1] == 4:
+            # reference_points [8, 480, None, 1,  4]
+            # sampling_offsets [8, 480, 8,    12, 2]
+            num_points_scale = self.num_points_scale.to(dtype=hidden_states.dtype).unsqueeze(-1)
+            # Offsets are scaled by the last two reference coordinates and by `offset_scale`.
+            offset = sampling_offsets * num_points_scale * reference_points[:, :, None, :, 2:] * self.offset_scale
+            sampling_locations = reference_points[:, :, None, :, :2] + offset
+        else:
+            raise ValueError(
+                f"Last dim of reference_points must be 2 or 4, but get {reference_points.shape[-1]} instead."
+            )
+
+        output = self.ms_deformable_attn_core(
+            value,
+            spatial_shapes_list,
+            sampling_locations,
+            attention_weights,
+            self.num_points_list,
+            self.decoder_method,
+        )
+
+        return output, attention_weights
+
+
+class DFineGate(nn.Module):
+    """
+    Learned gate that blends two same-sized tensors and layer-normalises the result.
+
+    A single linear layer sees the concatenation of both inputs and emits two sigmoid gates, one for
+    each input; the gated sum is then passed through `LayerNorm`.
+    """
+
+    def __init__(self, d_model: int):
+        super().__init__()
+        self.gate = nn.Linear(2 * d_model, 2 * d_model)
+        self.norm = nn.LayerNorm(d_model)
+
+    def forward(self, second_residual: torch.Tensor, hidden_states: torch.Tensor) -> torch.Tensor:
+        """Return `norm(g1 * second_residual + g2 * hidden_states)` with gates computed from both inputs."""
+        stacked = torch.cat((second_residual, hidden_states), dim=-1)
+        # First half of the gate vector weighs the residual, second half weighs the new hidden states.
+        residual_gate, update_gate = torch.chunk(torch.sigmoid(self.gate(stacked)), 2, dim=-1)
+        blended = residual_gate * second_residual + update_gate * hidden_states
+        return self.norm(blended)
+
+
+class DFineDecoderLayer(RTDetrDecoderLayer):
+    """
+    Decoder layer that uses the D-FINE deformable cross-attention and a gated residual connection.
+
+    Compared with the parent layer, the cross-attention module is replaced, a `DFineGate` fuses the
+    cross-attention output with the self-attention result, and `encoder_attn_layer_norm` is removed.
+    """
+
+    def __init__(self, config: DFineConfig):
+        super().__init__(config)
+
+        # Replace the inherited cross-attention with the D-FINE implementation.
+        self.encoder_attn = DFineMultiscaleDeformableAttention(config=config)
+        # Gate that merges the cross-attention output with its residual.
+        self.gateway = DFineGate(config.d_model)
+
+        del self.encoder_attn_layer_norm
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: Optional[torch.Tensor] = None,
+        reference_points=None,
+        spatial_shapes=None,
+        spatial_shapes_list=None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = False,
+    ) -> tuple[torch.Tensor, Any, Any]:
+        """
+        Run self-attention, gated deformable cross-attention and the feed-forward block.
+
+        Returns:
+            `(hidden_states,)`, extended with `(self_attn_weights, cross_attn_weights)` when
+            `output_attentions` is truthy.
+        """
+        # --- Self-attention with post-norm residual ---
+        attn_output, self_attn_weights = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=encoder_attention_mask,
+            position_embeddings=position_embeddings,
+            output_attentions=output_attentions,
+        )
+        attn_output = nn.functional.dropout(attn_output, p=self.dropout, training=self.training)
+        hidden_states = self.self_attn_layer_norm(hidden_states + attn_output)
+        residual = hidden_states
+
+        # --- Cross-attention: queries carry the position embeddings when provided ---
+        if position_embeddings is None:
+            queries = hidden_states
+        else:
+            queries = hidden_states + position_embeddings
+        cross_output, cross_attn_weights = self.encoder_attn(
+            hidden_states=queries,
+            encoder_hidden_states=encoder_hidden_states,
+            reference_points=reference_points,
+            spatial_shapes=spatial_shapes,
+            spatial_shapes_list=spatial_shapes_list,
+        )
+        cross_output = nn.functional.dropout(cross_output, p=self.dropout, training=self.training)
+        # The gate replaces the plain residual addition.
+        hidden_states = self.gateway(residual, cross_output)
+
+        # --- Feed-forward block ---
+        ffn_output = self.fc1(hidden_states)
+        ffn_output = self.activation_fn(ffn_output)
+        ffn_output = nn.functional.dropout(ffn_output, p=self.activation_dropout, training=self.training)
+        ffn_output = self.fc2(ffn_output)
+        ffn_output = nn.functional.dropout(ffn_output, p=self.dropout, training=self.training)
+        # Clamp to +/-65504 before the final normalisation.
+        summed = (hidden_states + ffn_output).clamp(min=-65504, max=65504)
+        hidden_states = self.final_layer_norm(summed)
+
+        if output_attentions:
+            return (hidden_states, self_attn_weights, cross_attn_weights)
+        return (hidden_states,)
+
+
+class DFinePreTrainedModel(RTDetrPreTrainedModel):
+    def _init_weights(self, module):
+        """
+        Initialise the parameters of `module` according to its type.
+
+        The checks below are independent `if` statements, so a module matching several of them is
+        processed by each matching branch, in the order written.
+        """
+        # Detection heads: class biases follow a prior probability, the last box layer starts at zero.
+        if isinstance(module, (DFineForObjectDetection, DFineDecoder)):
+            class_heads = module.class_embed
+            if class_heads is not None:
+                for head in class_heads:
+                    prior_prob = self.config.initializer_bias_prior_prob or 1 / (self.config.num_labels + 1)
+                    prior_bias = float(-math.log((1 - prior_prob) / prior_prob))
+                    nn.init.xavier_uniform_(head.weight)
+                    nn.init.constant_(head.bias, prior_bias)
+
+            box_heads = module.bbox_embed
+            if box_heads is not None:
+                for head in box_heads:
+                    final_layer = head.layers[-1]
+                    nn.init.constant_(final_layer.weight, 0)
+                    nn.init.constant_(final_layer.bias, 0)
+
+        # Deformable attention: zero weights, and offset biases pointing in one direction per head.
+        if isinstance(module, DFineMultiscaleDeformableAttention):
+            nn.init.constant_(module.sampling_offsets.weight.data, 0.0)
+            num_heads = module.n_heads
+            float_dtype = torch.get_default_dtype()
+            # Evenly spaced angles around the circle, one per head.
+            angles = torch.arange(num_heads, dtype=torch.int64).to(float_dtype) * (2.0 * math.pi / num_heads)
+            directions = torch.stack([angles.cos(), angles.sin()], -1)
+            # Rescale so the larger absolute coordinate of every direction equals 1.
+            directions = directions / directions.abs().max(-1, keepdim=True).values
+            points_per_head = sum(module.num_points_list)
+            offsets = directions.reshape(num_heads, 1, 2).tile([1, points_per_head, 1])
+            # The k-th point of each level is pushed k steps along the head's direction.
+            point_ranks = torch.concat([torch.arange(1, count + 1) for count in module.num_points_list])
+            offsets *= point_ranks.reshape(1, -1, 1)
+            with torch.no_grad():
+                module.sampling_offsets.bias.data[...] = offsets.flatten()
+
+            nn.init.constant_(module.attention_weights.weight.data, 0.0)
+            nn.init.constant_(module.attention_weights.bias.data, 0.0)
+
+        # Encoder score head of the base model uses the same prior-probability bias.
+        if isinstance(module, DFineModel):
+            prior_prob = self.config.initializer_bias_prior_prob or 1 / (self.config.num_labels + 1)
+            prior_bias = float(-math.log((1 - prior_prob) / prior_prob))
+            nn.init.xavier_uniform_(module.enc_score_head.weight)
+            nn.init.constant_(module.enc_score_head.bias, prior_bias)
+
+        # Generic layers: normal weights with the configured std, zero bias.
+        if isinstance(module, (nn.Linear, nn.Conv2d, nn.BatchNorm2d)):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.bias is not None:
+                module.bias.data.zero_()
+
+        # Gates: zero weight, bias derived from a prior of 0.5.
+        if isinstance(module, DFineGate):
+            gate_bias = float(-math.log((1 - 0.5) / 0.5))
+            init.constant_(module.gate.bias, gate_bias)
+            init.constant_(module.gate.weight, 0)
+
+        # Quality estimator: the last layer of its MLP starts at zero.
+        if isinstance(module, DFineLQE):
+            last_conf_layer = module.reg_conf.layers[-1]
+            init.constant_(last_conf_layer.bias, 0)
+            init.constant_(last_conf_layer.weight, 0)
+
+        # Optional embeddings, initialised only when the matching feature is enabled in the config.
+        if hasattr(module, "weight_embedding") and self.config.learn_initial_query:
+            nn.init.xavier_uniform_(module.weight_embedding.weight)
+        if hasattr(module, "denoising_class_embed") and self.config.num_denoising > 0:
+            nn.init.xavier_uniform_(module.denoising_class_embed.weight)
+
+
+class DFineIntegral(nn.Module):
+    """
+    Parameter-free layer that turns per-edge bin logits into expected offsets.
+
+    The result is `sum{Pr(n) * W(n)}`: `Pr(n)` is the softmax over the `max_num_bins + 1` bins of one
+    edge and `W(n)` is the weighting sequence passed in as `project`.
+
+    Args:
+        config ([`DFineConfig`]): only `config.max_num_bins` is read.
+    """
+
+    def __init__(self, config: DFineConfig):
+        super().__init__()
+        self.max_num_bins = config.max_num_bins
+
+    def forward(self, pred_corners: torch.Tensor, project: torch.Tensor) -> torch.Tensor:
+        """Return the expected offset per edge, shaped `(batch_size, num_queries, -1)`."""
+        batch_size, num_queries, _ = pred_corners.shape
+        # One row per edge distribution.
+        bin_logits = pred_corners.reshape(-1, self.max_num_bins + 1)
+        bin_probs = F.softmax(bin_logits, dim=1)
+        # Weighted sum of the bin probabilities with the weighting sequence.
+        expected = F.linear(bin_probs, project.to(bin_probs.device))
+        return expected.reshape(-1, 4).reshape(batch_size, num_queries, -1)
+
+
+# Output container returned by `DFineDecoder.forward`; it adds nothing to `RTDetrDecoderOutput`.
+class DFineDecoderOutput(RTDetrDecoderOutput):
+    pass
+
+
+class DFineDecoder(RTDetrDecoder):
+    """
+    D-FINE Decoder implementing Fine-grained Distribution Refinement (FDR).
+
+    This decoder refines object detection predictions through iterative updates across multiple layers,
+    utilizing attention mechanisms, location quality estimators, and distribution refinement techniques
+    to improve bounding box accuracy and robustness.
+
+    `class_embed` and `bbox_embed` are read in `forward` but are not created in this `__init__`; see the
+    `is not None` checks below.
+    """
+
+    def __init__(self, config: DFineConfig):
+        # A negative `eval_idx` counts from the end of the decoder layers.
+        self.eval_idx = config.eval_idx if config.eval_idx >= 0 else config.decoder_layers + config.eval_idx
+        super().__init__(config=config)
+        # Stored as a non-trainable parameter (requires_grad=False).
+        self.reg_scale = nn.Parameter(torch.tensor([config.reg_scale]), requires_grad=False)
+        self.max_num_bins = config.max_num_bins
+        self.d_model = config.d_model
+        self.layer_scale = config.layer_scale
+        # Head applied to the first layer's output to produce the initial box estimate.
+        self.pre_bbox_head = DFineMLP(config.hidden_size, config.hidden_size, 4, 3)
+        self.integral = DFineIntegral(config)
+        self.num_head = config.decoder_attention_heads
+        # Stored as a non-trainable parameter (requires_grad=False).
+        self.up = nn.Parameter(torch.tensor([config.up]), requires_grad=False)
+        self.lqe_layers = nn.ModuleList([DFineLQE(config) for _ in range(config.decoder_layers)])
+        # NOTE(review): this builds `decoder_layers + (decoder_layers - eval_idx - 1)` layers, while
+        # `lqe_layers` has only `decoder_layers` entries and `forward` indexes the heads by layer index.
+        # The counts only match when `eval_idx` is the last layer - confirm this is intended.
+        self.layers = nn.ModuleList(
+            [DFineDecoderLayer(config) for _ in range(config.decoder_layers)]
+            + [DFineDecoderLayer(config) for _ in range(config.decoder_layers - self.eval_idx - 1)]
+        )
+
+    def forward(
+        self,
+        encoder_hidden_states: torch.Tensor,
+        reference_points: torch.Tensor,
+        inputs_embeds: torch.Tensor,
+        spatial_shapes,
+        level_start_index=None,
+        spatial_shapes_list=None,
+        output_hidden_states=None,
+        encoder_attention_mask=None,
+        memory_mask=None,
+        output_attentions=None,
+        return_dict=None,
+    ) -> DFineDecoderOutput:
+        """
+        Run every decoder layer and collect the per-layer predictions.
+
+        `level_start_index` and `memory_mask` are accepted but not used in this method.
+
+        Returns:
+            [`DFineDecoderOutput`] when `return_dict` resolves to a truthy value, otherwise a plain tuple of
+            the same values with `None` entries removed.
+        """
+        # Fall back to the config for any flag the caller left as None.
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # NOTE(review): `hidden_states` is only bound in this branch; passing `inputs_embeds=None` would
+        # fail later with an unbound local.
+        if inputs_embeds is not None:
+            hidden_states = inputs_embeds
+
+        # decoder layers
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+        all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None
+        intermediate = ()
+        intermediate_reference_points = ()
+        intermediate_logits = ()
+        intermediate_predicted_corners = ()
+        initial_reference_points = ()
+
+        # Both start as the integer 0 so the first layer's additions below are no-ops.
+        output_detach = pred_corners_undetach = 0
+
+        # Weighting sequence used by `self.integral` to turn bin probabilities into offsets.
+        project = weighting_function(self.max_num_bins, self.up, self.reg_scale)
+        ref_points_detach = F.sigmoid(reference_points)
+
+        for i, decoder_layer in enumerate(self.layers):
+            ref_points_input = ref_points_detach.unsqueeze(2)
+            # Position embedding derived from the current reference points, clamped to [-10, 10].
+            query_pos_embed = self.query_pos_head(ref_points_detach).clamp(min=-10, max=10)
+
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+
+            output = decoder_layer(
+                hidden_states=hidden_states,
+                position_embeddings=query_pos_embed,
+                reference_points=ref_points_input,
+                spatial_shapes=spatial_shapes,
+                spatial_shapes_list=spatial_shapes_list,
+                encoder_hidden_states=encoder_hidden_states,
+                encoder_attention_mask=encoder_attention_mask,
+                output_attentions=output_attentions,
+            )
+
+            hidden_states = output[0]
+
+            if i == 0:
+                # Initial bounding box predictions with inverse sigmoid refinement
+                new_reference_points = F.sigmoid(self.pre_bbox_head(output[0]) + inverse_sigmoid(ref_points_detach))
+                # Detached copy reused by every layer as the base box for `distance2bbox`.
+                ref_points_initial = new_reference_points.detach()
+
+            # Refine bounding box corners using FDR, integrating previous layer's corrections
+            if self.bbox_embed is not None:
+                # The previous layer's detached output and its undetached corner logits are added in.
+                pred_corners = self.bbox_embed[i](hidden_states + output_detach) + pred_corners_undetach
+                inter_ref_bbox = distance2bbox(
+                    ref_points_initial, self.integral(pred_corners, project), self.reg_scale
+                )
+                pred_corners_undetach = pred_corners
+                ref_points_detach = inter_ref_bbox.detach()
+
+            output_detach = hidden_states.detach()
+
+            intermediate += (hidden_states,)
+
+            # Class scores are computed for every layer in training, but only for `eval_idx` otherwise.
+            # NOTE(review): `pred_corners` and `inter_ref_bbox` are only bound when `self.bbox_embed` is
+            # not None, yet this branch uses them whenever `self.class_embed` is not None.
+            if self.class_embed is not None and (self.training or i == self.eval_idx):
+                scores = self.class_embed[i](hidden_states)
+                # Add initial logits and reference points with pre-bbox head
+                if i == 0:
+                    intermediate_logits += (scores,)
+                    intermediate_reference_points += (new_reference_points,)
+                # Lqe does not affect the performance here.
+                scores = self.lqe_layers[i](scores, pred_corners)
+                intermediate_logits += (scores,)
+                intermediate_reference_points += (inter_ref_bbox,)
+                initial_reference_points += (ref_points_initial,)
+                intermediate_predicted_corners += (pred_corners,)
+
+            if output_attentions:
+                all_self_attns += (output[1],)
+
+                if encoder_hidden_states is not None:
+                    all_cross_attentions += (output[2],)
+
+        # Keep batch_size as first dimension
+        # NOTE(review): `intermediate` is stacked along dim 0 (the default), unlike the tensors below that
+        # use dim=1 - confirm against the comment above.
+        intermediate = torch.stack(intermediate)
+        if self.class_embed is not None and self.bbox_embed is not None:
+            intermediate_logits = torch.stack(intermediate_logits, dim=1)
+            intermediate_predicted_corners = torch.stack(intermediate_predicted_corners, dim=1)
+            initial_reference_points = torch.stack(initial_reference_points, dim=1)
+            intermediate_reference_points = torch.stack(intermediate_reference_points, dim=1)
+
+        # add hidden states from the last decoder layer
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+
+        # Tuple output: same order as the dataclass fields below, with None entries dropped.
+        if not return_dict:
+            return tuple(
+                v
+                for v in [
+                    hidden_states,
+                    intermediate,
+                    intermediate_logits,
+                    intermediate_reference_points,
+                    intermediate_predicted_corners,
+                    initial_reference_points,
+                    all_hidden_states,
+                    all_self_attns,
+                    all_cross_attentions,
+                ]
+                if v is not None
+            )
+
+        return DFineDecoderOutput(
+            last_hidden_state=hidden_states,
+            intermediate_hidden_states=intermediate,
+            intermediate_logits=intermediate_logits,
+            intermediate_reference_points=intermediate_reference_points,
+            intermediate_predicted_corners=intermediate_predicted_corners,
+            initial_reference_points=initial_reference_points,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+            cross_attentions=all_cross_attentions,
+        )
+
+
+class DFineModel(RTDetrModel):
+    """
+    Base D-FINE model: the parent model with the encoder, the decoder input projections and the decoder
+    replaced by their D-FINE counterparts.
+    """
+
+    def __init__(self, config: DFineConfig):
+        super().__init__(config)
+        # Drop the inherited projections; they are rebuilt below.
+        del self.decoder_input_proj
+        self.encoder = DFineHybridEncoder(config=config)
+
+        num_backbone_outs = len(config.decoder_in_channels)
+        last_in_channels = config.decoder_in_channels[-1]
+
+        def build_projection(**conv_kwargs):
+            """Return an identity when no channel change is needed, else a bias-free conv + batch norm."""
+            if config.hidden_size == last_in_channels:
+                return nn.Identity()
+            conv = nn.Conv2d(last_in_channels, config.d_model, bias=False, **conv_kwargs)
+            batchnorm = nn.BatchNorm2d(config.d_model, config.batch_norm_eps)
+            return nn.Sequential(conv, batchnorm)
+
+        # One 1x1 projection per entry of `decoder_in_channels` ...
+        projections = [build_projection(kernel_size=1) for _ in range(num_backbone_outs)]
+        # ... followed by strided 3x3 projections for any remaining feature levels.
+        projections += [
+            build_projection(kernel_size=3, stride=2, padding=1)
+            for _ in range(config.num_feature_levels - num_backbone_outs)
+        ]
+        self.decoder_input_proj = nn.ModuleList(projections)
+        self.decoder = DFineDecoder(config)
+
+
+# D-FINE model with classification and box-distribution heads on top of `DFineModel`.
+class DFineForObjectDetection(RTDetrForObjectDetection, DFinePreTrainedModel):
+    def __init__(self, config: DFineConfig):
+        # Only the pretrained-model base initialiser is called here, not the RT-DETR detection one.
+        DFinePreTrainedModel.__init__(self, config)
+
+        # D-FINE encoder-decoder model
+        # A negative `eval_idx` counts from the end of the decoder layers.
+        self.eval_idx = config.eval_idx if config.eval_idx >= 0 else config.decoder_layers + config.eval_idx
+        self.model = DFineModel(config)
+        # Width used by the box heads of the layers after `eval_idx`.
+        scaled_dim = round(config.layer_scale * config.hidden_size)
+        num_pred = config.decoder_layers
+        self.class_embed = nn.ModuleList([nn.Linear(config.d_model, config.num_labels) for _ in range(num_pred)])
+        # Each box head predicts `max_num_bins + 1` bin logits for each of the 4 box edges.
+        self.bbox_embed = nn.ModuleList(
+            [
+                DFineMLP(config.hidden_size, config.hidden_size, 4 * (config.max_num_bins + 1), 3)
+                for _ in range(self.eval_idx + 1)
+            ]
+            + [
+                DFineMLP(scaled_dim, scaled_dim, 4 * (config.max_num_bins + 1), 3)
+                for _ in range(config.decoder_layers - self.eval_idx - 1)
+            ]
+        )
+
+        # here self.model.decoder.bbox_embed is null, but not self.bbox_embed
+        # Share the heads with the decoder so it can use them inside its layer loop.
+        self.model.decoder.class_embed = self.class_embed
+        self.model.decoder.bbox_embed = self.bbox_embed
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    # NOTE(review): `forward` takes no `self` and discards the result of `super().forward`; presumably it
+    # is a placeholder that a code generator expands into the parent's full forward - confirm before
+    # calling this class directly.
+    def forward(**super_kwargs):
+        r"""
+        Example:
+
+        ```python
+        >>> import torch
+        >>> from transformers.image_utils import load_image
+        >>> from transformers import AutoImageProcessor, DFineForObjectDetection
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = load_image(url)
+
+        >>> image_processor = AutoImageProcessor.from_pretrained("ustc-community/dfine-xlarge-coco")
+        >>> model = DFineForObjectDetection.from_pretrained("ustc-community/dfine-xlarge-coco")
+
+        >>> # prepare image for the model
+        >>> inputs = image_processor(images=image, return_tensors="pt")
+
+        >>> # forward pass
+        >>> outputs = model(**inputs)
+
+        >>> logits = outputs.logits
+        >>> list(logits.shape)
+        [1, 300, 80]
+
+        >>> boxes = outputs.pred_boxes
+        >>> list(boxes.shape)
+        [1, 300, 4]
+
+        >>> # convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax)
+        >>> target_sizes = torch.tensor([image.size[::-1]])
+        >>> results = image_processor.post_process_object_detection(outputs, threshold=0.9, target_sizes=target_sizes)
+        >>> result = results[0]  # first image in batch
+
+        >>> for score, label, box in zip(result["scores"], result["labels"], result["boxes"]):
+        ...     box = [round(i, 2) for i in box.tolist()]
+        ...     print(
+        ...         f"Detected {model.config.id2label[label.item()]} with confidence "
+        ...         f"{round(score.item(), 3)} at location {box}"
+        ...     )
+        Detected cat with confidence 0.958 at location [344.49, 23.4, 639.84, 374.27]
+        Detected cat with confidence 0.956 at location [11.71, 53.52, 316.64, 472.33]
+        Detected remote with confidence 0.947 at location [40.46, 73.7, 175.62, 117.57]
+        Detected sofa with confidence 0.918 at location [0.59, 1.88, 640.25, 474.74]
+        ```
+        """
+        super().forward(**super_kwargs)
+
+
+def weighting_function(max_num_bins: int, up: torch.Tensor, reg_scale: float) -> torch.Tensor:
+    """
+    Generates the non-uniform Weighting Function W(n) for bounding box regression.
+
+    The sequence is symmetric around zero: an outer bound, exponentially spaced negative values, a
+    zero, the mirrored positive values and the opposite outer bound.
+
+    Args:
+        max_num_bins (int): Max number of the discrete bins. Must not be 2, because the spacing exponent
+                            divides by `max_num_bins - 2`.
+        up (Tensor): Controls upper bounds of the sequence,
+                     where maximum offset is ±up * H / W. Only `up[0]` is read.
+        reg_scale (float): Controls the curvature of the Weighting Function.
+                           Larger values result in flatter weights near the central axis W(max_num_bins/2)=0
+                           and steeper weights at both ends.
+                           A Python number or a one-element tensor are both accepted.
+    Returns:
+        Tensor: Sequence of Weighting Function.
+    """
+    # `torch.cat` rejects zero-dimensional tensors, which is what a plain-number `reg_scale` would
+    # produce here; flattening makes every piece 1-D and leaves 1-D tensor inputs unchanged.
+    upper_bound1 = (abs(up[0]) * abs(reg_scale)).reshape(-1)
+    upper_bound2 = upper_bound1 * 2
+    # Common ratio of the exponential spacing between neighbouring inner values.
+    step = (upper_bound1 + 1) ** (2 / (max_num_bins - 2))
+    half_bins = max_num_bins // 2
+    left_values = [-(step**i) + 1 for i in range(half_bins - 1, 0, -1)]
+    right_values = [step**i - 1 for i in range(1, half_bins)]
+    center = torch.zeros_like(up[0][None])
+    values = [-upper_bound2] + left_values + [center] + right_values + [upper_bound2]
+    return torch.cat(values, 0)
+
+
+# D-FINE-named prediction head; it adds nothing to `RTDetrMLPPredictionHead`.
+class DFineMLPPredictionHead(RTDetrMLPPredictionHead):
+    pass
+
+
+def distance2bbox(points, distance: torch.Tensor, reg_scale: float) -> torch.Tensor:
+    """
+    Convert per-edge distances around a reference box into a box in center format.
+
+    Args:
+        points (`torch.Tensor`):
+            (batch_size, num_boxes, 4) or (num_boxes, 4) format, representing [x_center, y_center, width, height]
+        distance (`torch.Tensor`):
+            (batch_size, num_boxes, 4) or (num_boxes, 4), representing distances from the point to the left, top, right, and bottom boundaries.
+        reg_scale (`float`):
+            Controls the curvature of the Weighting Function. Only its absolute value is used.
+    Returns:
+        `torch.Tensor`: Bounding boxes in (batch_size, num_boxes, 4) or (num_boxes, 4) format, representing [x_center, y_center, width, height]
+    """
+    reg_scale = abs(reg_scale)
+    half_scale = 0.5 * reg_scale
+
+    center_x = points[..., 0]
+    center_y = points[..., 1]
+    # Width and height expressed in units of `reg_scale`.
+    width_unit = points[..., 2] / reg_scale
+    height_unit = points[..., 3] / reg_scale
+
+    # A zero distance reproduces the reference box: half_scale * (size / reg_scale) is half the size.
+    corners = [
+        center_x - (half_scale + distance[..., 0]) * width_unit,
+        center_y - (half_scale + distance[..., 1]) * height_unit,
+        center_x + (half_scale + distance[..., 2]) * width_unit,
+        center_y + (half_scale + distance[..., 3]) * height_unit,
+    ]
+
+    return corners_to_center_format(torch.stack(corners, -1))
+
+
+class DFineMLP(nn.Module):
+    """
+    Stack of linear layers with an activation after every layer except the last.
+
+    Args:
+        input_dim (int): size of the input features.
+        hidden_dim (int): size of every intermediate layer.
+        output_dim (int): size of the output features.
+        num_layers (int): number of linear layers.
+        act (str): key into `ACT2CLS` selecting the activation class.
+    """
+
+    def __init__(self, input_dim: int, hidden_dim: int, output_dim: int, num_layers: int, act: str = "relu"):
+        super().__init__()
+        self.num_layers = num_layers
+        hidden_sizes = [hidden_dim] * (num_layers - 1)
+        # Consecutive (in, out) pairs: input -> hidden -> ... -> hidden -> output.
+        layer_shapes = zip([input_dim] + hidden_sizes, hidden_sizes + [output_dim])
+        self.layers = nn.ModuleList(nn.Linear(fan_in, fan_out) for fan_in, fan_out in layer_shapes)
+        self.act = ACT2CLS[act]()
+
+    def forward(self, stat_features: torch.Tensor) -> torch.Tensor:
+        """Apply the layers in order; the final layer's output is returned without activation."""
+        last_index = self.num_layers - 1
+        for index, layer in enumerate(self.layers):
+            stat_features = layer(stat_features)
+            if index < last_index:
+                stat_features = self.act(stat_features)
+        return stat_features
+
+
+class DFineLQE(nn.Module):
+    """
+    Adjusts class scores with a quality term computed from the predicted corner distributions.
+
+    For each of the 4 edges the top-k bin probabilities and their mean are collected; a small MLP maps
+    these statistics to one value per query, which is added to the scores.
+    """
+
+    def __init__(self, config: DFineConfig):
+        super().__init__()
+        self.top_prob_values = config.top_prob_values
+        self.max_num_bins = config.max_num_bins
+        # Input: (k top probabilities + their mean) for each of the 4 edges.
+        self.reg_conf = DFineMLP(4 * (self.top_prob_values + 1), config.lqe_hidden_dim, 1, config.lqe_layers)
+
+    def forward(self, scores: torch.Tensor, pred_corners: torch.Tensor) -> torch.Tensor:
+        """Return `scores` plus the quality estimate derived from `pred_corners`."""
+        batch_size, length, _ = pred_corners.size()
+        edge_logits = pred_corners.reshape(batch_size, length, 4, self.max_num_bins + 1)
+        edge_probs = F.softmax(edge_logits, dim=-1)
+        top_probs = edge_probs.topk(self.top_prob_values, dim=-1)[0]
+        top_mean = top_probs.mean(dim=-1, keepdim=True)
+        statistics = torch.cat([top_probs, top_mean], dim=-1).reshape(batch_size, length, -1)
+        return scores + self.reg_conf(statistics)
+
+
+class DFineConvNormLayer(RTDetrConvNormLayer):
+    """
+    Parent conv-norm layer whose convolution is replaced by one that supports `groups` and an explicit
+    `padding`.
+    """
+
+    def __init__(
+        self,
+        config: DFineConfig,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: int,
+        stride: int,
+        groups: int = 1,
+        padding: Optional[int] = None,
+        activation: Optional[str] = None,
+    ):
+        # The parent always receives padding=None; the convolution it creates is overwritten below.
+        super().__init__(config, in_channels, out_channels, kernel_size, stride, padding=None, activation=activation)
+        if padding is None:
+            # Default padding derived from the kernel size.
+            padding = (kernel_size - 1) // 2
+        self.conv = nn.Conv2d(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride,
+            padding=padding,
+            groups=groups,
+            bias=False,
+        )
+
+
+class DFineRepVggBlock(RTDetrRepVggBlock):
+    """RepVGG block whose 3x3 and 1x1 branches are rebuilt with explicit input and output channels."""
+
+    def __init__(self, config: DFineConfig, in_channels: int, out_channels: int):
+        super().__init__(config)
+        # 3x3 branch (padding 1) and 1x1 branch (padding 0), both with stride 1.
+        self.conv1 = DFineConvNormLayer(config, in_channels, out_channels, kernel_size=3, stride=1, padding=1)
+        self.conv2 = DFineConvNormLayer(config, in_channels, out_channels, kernel_size=1, stride=1, padding=0)
+
+
+class DFineCSPRepLayer(nn.Module):
+    """
+    Cross Stage Partial (CSP) network layer with RepVGG blocks.
+
+    The input goes through two parallel 1x1 conv-norm branches; the first is followed by `num_blocks`
+    RepVGG blocks. The branch outputs are summed and passed through a final 1x1 conv-norm layer, or an
+    identity when the hidden width already equals `out_channels`.
+    """
+
+    def __init__(
+        self, config: DFineConfig, in_channels: int, out_channels: int, num_blocks: int, expansion: float = 1.0
+    ):
+        super().__init__()
+        act_name = config.activation_function
+        hidden_channels = int(out_channels * expansion)
+
+        self.conv1 = DFineConvNormLayer(config, in_channels, hidden_channels, 1, 1, activation=act_name)
+        self.conv2 = DFineConvNormLayer(config, in_channels, hidden_channels, 1, 1, activation=act_name)
+        self.bottlenecks = nn.ModuleList(
+            DFineRepVggBlock(config, hidden_channels, hidden_channels) for _ in range(num_blocks)
+        )
+        # Only project back when the hidden width differs from the requested output width.
+        if hidden_channels == out_channels:
+            self.conv3 = nn.Identity()
+        else:
+            self.conv3 = DFineConvNormLayer(config, hidden_channels, out_channels, 1, 1, activation=act_name)
+
+    def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
+        """Return `conv3(bottlenecks(conv1(x)) + conv2(x))`."""
+        deep_branch = self.conv1(hidden_state)
+        for block in self.bottlenecks:
+            deep_branch = block(deep_branch)
+        shortcut_branch = self.conv2(hidden_state)
+        return self.conv3(deep_branch + shortcut_branch)
+
+
+class DFineRepNCSPELAN4(nn.Module):
+    def __init__(self, config: DFineConfig, act: str = "silu", numb_blocks: int = 3):
+        """Size every layer from `config.encoder_hidden_dim` and `config.hidden_expansion`."""
+        super().__init__()
+        hidden_dim = config.encoder_hidden_dim
+        wide_dim = hidden_dim * 2  # channel count of the block input and of `conv1`'s output
+        branch_dim = round(config.hidden_expansion * hidden_dim // 2)
+        # `conv1`'s output is later split into two chunks of `conv_dim` channels each.
+        self.conv_dim = wide_dim // 2
+        self.conv1 = DFineConvNormLayer(config, wide_dim, wide_dim, 1, 1, activation=act)
+        self.csp_rep1 = DFineCSPRepLayer(config, wide_dim // 2, branch_dim, num_blocks=numb_blocks)
+        self.conv2 = DFineConvNormLayer(config, branch_dim, branch_dim, 3, 1, activation=act)
+        self.csp_rep2 = DFineCSPRepLayer(config, branch_dim, branch_dim, num_blocks=numb_blocks)
+        self.conv3 = DFineConvNormLayer(config, branch_dim, branch_dim, 3, 1, activation=act)
+        # `conv4` fuses both chunks of `conv1`'s output with the two branch outputs.
+        self.conv4 = DFineConvNormLayer(config, wide_dim + (2 * branch_dim), hidden_dim, 1, 1, activation=act)
+
+    def forward(self, input_features: torch.Tensor) -> torch.Tensor:
+        """Split `conv1`'s output in two, chain two CSP branches off the second chunk, concatenate and fuse."""
+        first_chunk, second_chunk = self.conv1(input_features).split((self.conv_dim, self.conv_dim), 1)
+
+        # The second branch consumes the output of the first one.
+        branch1 = self.conv2(self.csp_rep1(second_chunk))
+        branch2 = self.conv3(self.csp_rep2(branch1))
+
+        # Concatenate along dim 1 in the order: first chunk, second chunk, branch1, branch2.
+        stacked = [first_chunk, second_chunk, branch1, branch2]
+        merged_features = torch.cat(stacked, 1)
+        return self.conv4(merged_features)
+
+
+class DFineSCDown(nn.Module):
+    """
+    Two stacked `DFineConvNormLayer`s: a 1x1 layer, then a `kernel_size` layer with `stride` and `groups` equal to
+    the channel count. Both keep `config.encoder_hidden_dim` channels.
+    """
+
+    def __init__(self, config: DFineConfig, kernel_size: int, stride: int):
+        super().__init__()
+        channels = config.encoder_hidden_dim
+        self.conv1 = DFineConvNormLayer(config, channels, channels, kernel_size=1, stride=1)
+        # `groups=channels`: one convolution group per channel.
+        self.conv2 = DFineConvNormLayer(config, channels, channels, kernel_size, stride, groups=channels)
+
+    def forward(self, input_features: torch.Tensor) -> torch.Tensor:
+        """Apply `conv1` and then `conv2` to `input_features`."""
+        features = self.conv1(input_features)
+        return self.conv2(features)
+
+
+class DFineEncoder(RTDetrEncoder):
+    pass  # no overrides: everything is inherited from `RTDetrEncoder`, only the class name differs
+
+
+class DFineHybridEncoder(RTDetrHybridEncoder):
+    def __init__(self, config: DFineConfig):
+        nn.Module.__init__(self)  # called directly, so `RTDetrHybridEncoder.__init__` itself is not run
+        self.config = config
+        self.in_channels = config.encoder_in_channels
+        self.num_fpn_stages = len(self.in_channels) - 1
+        self.feat_strides = config.feat_strides
+        self.encoder_hidden_dim = config.encoder_hidden_dim
+        self.encode_proj_layers = config.encode_proj_layers
+        self.positional_encoding_temperature = config.positional_encoding_temperature
+        self.eval_size = config.eval_size
+        self.out_channels = [self.encoder_hidden_dim for _ in self.in_channels]  # one entry per `in_channels` item
+        self.out_strides = self.feat_strides
+
+        # encoder transformer: one `DFineEncoder` per entry of `config.encode_proj_layers`
+        self.encoder = nn.ModuleList([DFineEncoder(config) for _ in range(len(self.encode_proj_layers))])
+        # top-down fpn: `len(in_channels) - 1` pairs of (1x1 lateral conv, `DFineRepNCSPELAN4` block)
+        self.lateral_convs = nn.ModuleList()
+        self.fpn_blocks = nn.ModuleList()
+        for _ in range(len(self.in_channels) - 1, 0, -1):  # only the iteration count is used
+            lateral_layer = DFineConvNormLayer(config, self.encoder_hidden_dim, self.encoder_hidden_dim, 1, 1)
+            self.lateral_convs.append(lateral_layer)
+            num_blocks = round(3 * config.depth_mult)  # block count scales with `config.depth_mult`
+            fpn_layer = DFineRepNCSPELAN4(config, numb_blocks=num_blocks)
+            self.fpn_blocks.append(fpn_layer)
+
+        # bottom-up pan: `len(in_channels) - 1` pairs of (`DFineSCDown`, `DFineRepNCSPELAN4` block)
+        self.downsample_convs = nn.ModuleList()
+        self.pan_blocks = nn.ModuleList()
+        for _ in range(len(self.in_channels) - 1):
+            self.downsample_convs.append(DFineSCDown(config, 3, 2))  # kernel_size=3, stride=2
+            num_blocks = round(3 * config.depth_mult)  # same block count as the fpn blocks
+            self.pan_blocks.append(DFineRepNCSPELAN4(config, numb_blocks=num_blocks))
+
+
+__all__ = [
+    "DFineConfig",
+    "DFineModel",
+    "DFinePreTrainedModel",
+    "DFineForObjectDetection",
+]
diff --git a/phivenv/Lib/site-packages/transformers/models/depth_pro/__init__.py b/phivenv/Lib/site-packages/transformers/models/depth_pro/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..5968aae67b523ef207b1ad9ed00199b412ee6cf5
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/depth_pro/__init__.py
@@ -0,0 +1,29 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:  # only true for static type checkers, so the star imports below never run at runtime
+    from .configuration_depth_pro import *
+    from .image_processing_depth_pro import *
+    from .image_processing_depth_pro_fast import *
+    from .modeling_depth_pro import *
+else:  # at runtime: register a `_LazyModule` under this package's name in `sys.modules`
+    import sys
+
+    _file = globals()["__file__"]  # path of this `__init__.py`, from which the import structure is derived
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/phivenv/Lib/site-packages/transformers/models/depth_pro/__pycache__/__init__.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/depth_pro/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0defd1a07dbfa426f410fbd424fa9f1c6d8fbbe9
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/depth_pro/__pycache__/__init__.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/depth_pro/__pycache__/configuration_depth_pro.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/depth_pro/__pycache__/configuration_depth_pro.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..005ce473a8fb1cc4a24e2832155514884245e606
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/depth_pro/__pycache__/configuration_depth_pro.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/depth_pro/__pycache__/image_processing_depth_pro.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/depth_pro/__pycache__/image_processing_depth_pro.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3e7a82847d0c321f5cb220b32003f9ff714ef1d3
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/depth_pro/__pycache__/image_processing_depth_pro.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/depth_pro/__pycache__/image_processing_depth_pro_fast.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/depth_pro/__pycache__/image_processing_depth_pro_fast.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..06f0c3762bf3cbbd7fb17327438d3f00a38375a4
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/depth_pro/__pycache__/image_processing_depth_pro_fast.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/depth_pro/__pycache__/modeling_depth_pro.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/depth_pro/__pycache__/modeling_depth_pro.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..94d5b7759b771636d4bcf3511723ddab46f314f1
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/depth_pro/__pycache__/modeling_depth_pro.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/depth_pro/configuration_depth_pro.py b/phivenv/Lib/site-packages/transformers/models/depth_pro/configuration_depth_pro.py
new file mode 100644
index 0000000000000000000000000000000000000000..6bc14a0e154f479097e5cf6869c7145374b3e5d9
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/depth_pro/configuration_depth_pro.py
@@ -0,0 +1,205 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""DepthPro model configuration"""
+
+from copy import deepcopy
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+from ..auto.configuration_auto import CONFIG_MAPPING, AutoConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+class DepthProConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`DepthProModel`]. It is used to instantiate a
+    DepthPro model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the DepthPro
+    [apple/DepthPro](https://huggingface.co/apple/DepthPro) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        fusion_hidden_size (`int`, *optional*, defaults to 256):
+            The number of channels before fusion.
+        patch_size (`int`, *optional*, defaults to 384):
+            The size (resolution) of each patch. This is also the image_size for backbone model.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        intermediate_hook_ids (`list[int]`, *optional*, defaults to `[11, 5]`):
+            Indices of the intermediate hidden states from the patch encoder to use for fusion.
+        intermediate_feature_dims (`list[int]`, *optional*, defaults to `[256, 256]`):
+            Hidden state dimensions during upsampling for each intermediate hidden state in `intermediate_hook_ids`.
+        scaled_images_ratios (`list[float]`, *optional*, defaults to `[0.25, 0.5, 1]`):
+            Ratios of scaled images to be used by the patch encoder.
+        scaled_images_overlap_ratios (`list[float]`, *optional*, defaults to `[0.0, 0.5, 0.25]`):
+            Overlap ratios between patches for each scaled image in `scaled_images_ratios`.
+        scaled_images_feature_dims (`list[int]`, *optional*, defaults to `[1024, 1024, 512]`):
+            Hidden state dimensions during upsampling for each scaled image in `scaled_images_ratios`.
+        merge_padding_value (`int`, *optional*, defaults to 3):
+            When merging smaller patches back to the image size, overlapping sections of this size are removed.
+        use_batch_norm_in_fusion_residual (`bool`, *optional*, defaults to `False`):
+            Whether to use batch normalization in the pre-activate residual units of the fusion blocks.
+        use_bias_in_fusion_residual (`bool`, *optional*, defaults to `True`):
+            Whether to use bias in the pre-activate residual units of the fusion blocks.
+        use_fov_model (`bool`, *optional*, defaults to `False`):
+            Whether to use `DepthProFovModel` to generate the field of view.
+        num_fov_head_layers (`int`, *optional*, defaults to 2):
+            Number of convolution layers in the head of `DepthProFovModel`.
+        image_model_config (`Union[dict[str, Any], PretrainedConfig]`, *optional*):
+            The configuration of the image encoder model, which is loaded using the [`AutoModel`] API.
+            By default, Dinov2 model is used as backbone.
+        patch_model_config (`Union[dict[str, Any], PretrainedConfig]`, *optional*):
+            The configuration of the patch encoder model, which is loaded using the [`AutoModel`] API.
+            By default, Dinov2 model is used as backbone.
+        fov_model_config (`Union[dict[str, Any], PretrainedConfig]`, *optional*):
+            The configuration of the fov encoder model, which is loaded using the [`AutoModel`] API.
+            By default, Dinov2 model is used as backbone.
+
+    Example:
+
+    ```python
+    >>> from transformers import DepthProConfig, DepthProModel
+
+    >>> # Initializing a DepthPro apple/DepthPro style configuration
+    >>> configuration = DepthProConfig()
+
+    >>> # Initializing a model (with random weights) from the apple/DepthPro style configuration
+    >>> model = DepthProModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "depth_pro"
+    sub_configs = {"image_model_config": AutoConfig, "patch_model_config": AutoConfig, "fov_model_config": AutoConfig}
+
+    def __init__(
+        self,
+        fusion_hidden_size=256,
+        patch_size=384,
+        initializer_range=0.02,
+        intermediate_hook_ids=[11, 5],
+        intermediate_feature_dims=[256, 256],
+        scaled_images_ratios=[0.25, 0.5, 1],
+        scaled_images_overlap_ratios=[0.0, 0.5, 0.25],
+        scaled_images_feature_dims=[1024, 1024, 512],
+        merge_padding_value=3,
+        use_batch_norm_in_fusion_residual=False,
+        use_bias_in_fusion_residual=True,
+        use_fov_model=False,
+        num_fov_head_layers=2,
+        image_model_config=None,
+        patch_model_config=None,
+        fov_model_config=None,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        # scaled_images_ratios must be sorted; compare as lists so that a sorted tuple is accepted as well
+        if list(scaled_images_ratios) != sorted(scaled_images_ratios):
+            raise ValueError(
+                f"Values in scaled_images_ratios={scaled_images_ratios} should be sorted from low to high"
+            )
+
+        # scaled_images_ratios, scaled_images_overlap_ratios, scaled_images_feature_dims should be consistent
+        if not (len(scaled_images_ratios) == len(scaled_images_overlap_ratios) == len(scaled_images_feature_dims)):
+            raise ValueError(
+                f"len(scaled_images_ratios)={len(scaled_images_ratios)} and "
+                f"len(scaled_images_overlap_ratios)={len(scaled_images_overlap_ratios)} and "
+                f"len(scaled_images_feature_dims)={len(scaled_images_feature_dims)}, "
+                f"should match in config."
+            )
+
+        # intermediate_hook_ids, intermediate_feature_dims should be consistent
+        if not (len(intermediate_hook_ids) == len(intermediate_feature_dims)):
+            raise ValueError(
+                f"len(intermediate_hook_ids)={len(intermediate_hook_ids)} and "
+                f"len(intermediate_feature_dims)={len(intermediate_feature_dims)}, "
+                f"should match in config."
+            )
+
+        # fusion_hidden_size should be consistent with num_fov_head_layers
+        if fusion_hidden_size // 2**num_fov_head_layers == 0:
+            raise ValueError(
+                f"fusion_hidden_size={fusion_hidden_size} should be consistent with num_fov_head_layers={num_fov_head_layers} "
+                "i.e fusion_hidden_size // 2**num_fov_head_layers > 0"
+            )
+
+        self.fusion_hidden_size = fusion_hidden_size
+        self.patch_size = patch_size
+        self.initializer_range = initializer_range
+        self.use_batch_norm_in_fusion_residual = use_batch_norm_in_fusion_residual
+        self.use_bias_in_fusion_residual = use_bias_in_fusion_residual
+        self.use_fov_model = use_fov_model
+        self.num_fov_head_layers = num_fov_head_layers
+        self.intermediate_hook_ids = list(intermediate_hook_ids)  # copies: never alias the shared default lists
+        self.intermediate_feature_dims = list(intermediate_feature_dims)
+        self.scaled_images_ratios = list(scaled_images_ratios)
+        self.scaled_images_overlap_ratios = list(scaled_images_overlap_ratios)
+        self.scaled_images_feature_dims = list(scaled_images_feature_dims)
+        self.merge_padding_value = merge_padding_value
+        self.image_model_config = image_model_config
+        self.patch_model_config = patch_model_config
+        self.fov_model_config = fov_model_config
+
+        for sub_config_key in self.sub_configs:
+            sub_config = getattr(self, sub_config_key)
+
+            if sub_config is None:
+                sub_config = CONFIG_MAPPING["dinov2"](image_size=patch_size)
+                logger.info(
+                    f"`{sub_config_key}` is `None`. Initializing `{sub_config_key}` with the `Dinov2Config` "
+                    f"with default values except `{sub_config_key}.image_size` is set to `config.patch_size`."
+                )
+            elif isinstance(sub_config, dict):
+                sub_config = deepcopy(sub_config)  # the caller's dictionary is left untouched
+                if "model_type" not in sub_config:
+                    raise KeyError(
+                        f"The `model_type` key is missing in the `{sub_config_key}` dictionary. Please provide the model type."
+                    )
+                elif sub_config["model_type"] not in CONFIG_MAPPING:
+                    raise ValueError(
+                        f"The model type `{sub_config['model_type']}` in `{sub_config_key}` is not supported. Please provide a valid model type."
+                    )
+                image_size = sub_config.get("image_size")
+                if image_size != patch_size:
+                    logger.info(
+                        f"The `image_size` in `{sub_config_key}` is set to `{image_size}`, "
+                        f"but it does not match the required `patch_size` of `{patch_size}`. "
+                        f"Updating `image_size` to `{patch_size}` for consistency. "
+                        f"Ensure that `image_size` aligns with `patch_size` in the configuration."
+                    )
+                    sub_config.update({"image_size": patch_size})
+                sub_config = CONFIG_MAPPING[sub_config["model_type"]](**sub_config)
+            elif isinstance(sub_config, PretrainedConfig):
+                # a config instance is used as-is (not overridden like a dict); its `image_size` must already match
+                image_size = getattr(sub_config, "image_size", None)
+                if image_size != patch_size:
+                    raise ValueError(
+                        f"`config.{sub_config_key}.image_size={image_size}` should match `config.patch_size={patch_size}`."
+                    )
+            else:
+                raise TypeError(
+                    f"Invalid type for `sub_config`. Expected `PretrainedConfig`, `dict`, or `None`, but got {type(sub_config)}."
+                )
+
+            setattr(self, sub_config_key, sub_config)
+
+
+__all__ = ["DepthProConfig"]
diff --git a/phivenv/Lib/site-packages/transformers/models/depth_pro/image_processing_depth_pro.py b/phivenv/Lib/site-packages/transformers/models/depth_pro/image_processing_depth_pro.py
new file mode 100644
index 0000000000000000000000000000000000000000..c3686231b4fe35ea9f1ffc61534b60aee37be601
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/depth_pro/image_processing_depth_pro.py
@@ -0,0 +1,389 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Image processor class for DepthPro."""
+
+from typing import TYPE_CHECKING, Optional, Union
+
+import numpy as np
+
+from ...utils.import_utils import requires
+
+
+if TYPE_CHECKING:
+    from .modeling_depth_pro import DepthProDepthEstimatorOutput
+
+from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
+from ...image_transforms import to_channel_dimension_format
+from ...image_utils import (
+    IMAGENET_STANDARD_MEAN,
+    IMAGENET_STANDARD_STD,
+    ChannelDimension,
+    ImageInput,
+    PILImageResampling,
+    infer_channel_dimension_format,
+    is_scaled_image,
+    is_torch_available,
+    make_list_of_images,
+    to_numpy_array,
+    valid_images,
+)
+from ...utils import TensorType, filter_out_non_signature_kwargs, is_torchvision_available, logging, requires_backends
+
+
+if is_torch_available():
+    import torch
+
+if is_torchvision_available():
+    from ...image_utils import pil_torch_interpolation_mapping
+
+
+logger = logging.get_logger(__name__)
+
+
+@requires(backends=("torchvision", "torch"))
+class DepthProImageProcessor(BaseImageProcessor):
+    r"""
+    Constructs a DepthPro image processor.
+
+    Args:
+        do_resize (`bool`, *optional*, defaults to `True`):
+            Whether to resize the image's (height, width) dimensions to the specified `(size["height"],
+            size["width"])`. Can be overridden by the `do_resize` parameter in the `preprocess` method.
+        size (`dict`, *optional*, defaults to `{"height": 1536, "width": 1536}`):
+            Size of the output image after resizing. Can be overridden by the `size` parameter in the `preprocess`
+            method.
+        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`):
+            Resampling filter to use if resizing the image. Can be overridden by the `resample` parameter in the
+            `preprocess` method.
+        do_rescale (`bool`, *optional*, defaults to `True`):
+            Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale`
+            parameter in the `preprocess` method.
+        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
+            Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the
+            `preprocess` method.
+        do_normalize (`bool`, *optional*, defaults to `True`):
+            Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
+            method.
+        image_mean (`float` or `list[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`):
+            Mean to use if normalizing the image. This is a float or list of floats the length of the number of
+            channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
+        image_std (`float` or `list[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`):
+            Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
+            number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
+    """
+
+    model_input_names = ["pixel_values"]
+
+    def __init__(
+        self,
+        do_resize: bool = True,
+        size: Optional[dict[str, int]] = None,
+        resample: PILImageResampling = PILImageResampling.BILINEAR,
+        do_rescale: bool = True,
+        rescale_factor: Union[int, float] = 1 / 255,
+        do_normalize: bool = True,
+        image_mean: Optional[Union[float, list[float]]] = None,
+        image_std: Optional[Union[float, list[float]]] = None,
+        **kwargs,
+    ):
+        """Store the preprocessing defaults; see the class docstring for the meaning of each argument."""
+        super().__init__(**kwargs)
+        # Unset values fall back to a 1536x1536 output size and the ImageNet standard mean / std.
+        self.size = get_size_dict({"height": 1536, "width": 1536} if size is None else size)
+        self.image_mean = IMAGENET_STANDARD_MEAN if image_mean is None else image_mean
+        self.image_std = IMAGENET_STANDARD_STD if image_std is None else image_std
+        self.do_resize = do_resize
+        self.resample = resample
+        self.do_rescale = do_rescale
+        self.rescale_factor = rescale_factor
+        self.do_normalize = do_normalize
+
+    def resize(
+        self,
+        image: np.ndarray,
+        size: dict[str, int],
+        resample: PILImageResampling = PILImageResampling.BILINEAR,
+        data_format: Optional[Union[str, ChannelDimension]] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+        **kwargs,
+    ) -> np.ndarray:
+        """
+        Resize an image to `(size["height"], size["width"])` using `torch.nn.functional.interpolate`.
+
+        Args:
+            image (`np.ndarray`):
+                Image to resize. A leading batch axis is added and the trailing two axes are interpolated, so the
+                array is expected to be in (num_channels, height, width) format.
+            size (`dict[str, int]`):
+                Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image.
+            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
+                `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BILINEAR`. It is
+                translated to a torch interpolation mode through `pil_torch_interpolation_mapping`.
+            data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the output image. NOTE: this argument is not read by the method
+                body; the output keeps the axis order of `image`.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the input image. NOTE: this argument is not read by the method
+                body; no format inference is performed here.
+
+        Returns:
+            `np.ndarray`: The resized image.
+
+        Raises:
+            ValueError: If `size`, once passed through `get_size_dict`, does not contain both the `height` and
+                `width` keys.
+        """
+        requires_backends(self, "torch")
+
+        size = get_size_dict(size)
+        if "height" not in size or "width" not in size:
+            raise ValueError(f"The `size` dictionary must contain the keys `height` and `width`. Got {size.keys()}")
+        output_size = (size["height"], size["width"])
+
+        # we use torch interpolation instead of image.resize because DepthProImageProcessor
+        # rescales, then normalizes, which may cause some values to become negative, before resizing the image.
+        # image.resize expects all values to be in range [0, 1] or [0, 255] and throws an exception otherwise,
+        # however pytorch interpolation works with negative values.
+        # relevant issue here: https://github.com/huggingface/transformers/issues/34920
+        # input should be (B, C, H, W)
+        image_tensor = torch.from_numpy(image).unsqueeze(0)  # prepend the batch axis
+        resized_image = torch.nn.functional.interpolate(
+            input=image_tensor,
+            size=output_size,
+            mode=pil_torch_interpolation_mapping[resample].value,
+        )
+        resized_image = resized_image.squeeze(0).numpy()  # drop the batch axis again and return to numpy
+        return resized_image
+
+    def _validate_input_arguments(
+        self,
+        do_resize: bool,
+        size: dict[str, int],
+        resample: PILImageResampling,
+        do_rescale: bool,
+        rescale_factor: float,
+        do_normalize: bool,
+        image_mean: Union[float, list[float]],
+        image_std: Union[float, list[float]],
+        data_format: Union[str, ChannelDimension],
+    ):
+        """Raise `ValueError` when an enabled step (resize, rescale, normalize) lacks one of its arguments."""
+        # Identity checks: `None in (a, b)` falls back to `==`, which is ambiguous for multi-element arrays.
+        if do_resize and (size is None or resample is None):
+            raise ValueError("Size and resample must be specified if do_resize is True.")
+        if do_rescale and rescale_factor is None:
+            raise ValueError("Rescale factor must be specified if do_rescale is True.")
+        if do_normalize and (image_mean is None or image_std is None):
+            raise ValueError("Image mean and standard deviation must be specified if do_normalize is True.")
+
+    @filter_out_non_signature_kwargs()
+    def preprocess(
+        self,
+        images: ImageInput,
+        do_resize: Optional[bool] = None,
+        size: Optional[dict[str, int]] = None,
+        resample: Optional[PILImageResampling] = None,
+        do_rescale: Optional[bool] = None,
+        rescale_factor: Optional[float] = None,
+        do_normalize: Optional[bool] = None,
+        image_mean: Optional[Union[float, list[float]]] = None,
+        image_std: Optional[Union[float, list[float]]] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    ) -> BatchFeature:
+        """
+        Preprocess an image or batch of images: rescale, normalize and finally resize (each step is optional).
+
+        Args:
+            images (`ImageInput`):
+                Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
+                passing in images with pixel values between 0 and 1, set `do_rescale=False`.
+            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
+                Whether to resize the image.
+            size (`dict[str, int]`, *optional*, defaults to `self.size`):
+                Dictionary in the format `{"height": h, "width": w}` specifying the size of the output image after
+                resizing.
+            resample (`PILImageResampling` filter, *optional*, defaults to `self.resample`):
+                `PILImageResampling` filter to use if resizing the image e.g. `PILImageResampling.BILINEAR`. Only has
+                an effect if `do_resize` is set to `True`.
+            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
+                Whether to rescale the image values between [0 - 1].
+            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
+                Rescale factor to rescale the image by if `do_rescale` is set to `True`.
+            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
+                Whether to normalize the image.
+            image_mean (`float` or `list[float]`, *optional*, defaults to `self.image_mean`):
+                Image mean to use if `do_normalize` is set to `True`.
+            image_std (`float` or `list[float]`, *optional*, defaults to `self.image_std`):
+                Image standard deviation to use if `do_normalize` is set to `True`.
+            return_tensors (`str` or `TensorType`, *optional*):
+                The type of tensors to return. Can be one of:
+                - Unset: Return a list of `np.ndarray`.
+                - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
+                - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
+                - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
+                - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
+            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
+                The channel dimension format for the output image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - Unset: Use the channel dimension format of the input image.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the input image. If unset, the channel dimension format is inferred
+                from the input image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+        """
+        do_resize = do_resize if do_resize is not None else self.do_resize
+        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
+        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
+        resample = resample if resample is not None else self.resample
+        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
+        image_mean = image_mean if image_mean is not None else self.image_mean
+        image_std = image_std if image_std is not None else self.image_std
+
+        size = size if size is not None else self.size
+
+        images = make_list_of_images(images)
+
+        if not valid_images(images):
+            raise ValueError(
+                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
+                "torch.Tensor, tf.Tensor or jax.ndarray."
+            )
+        self._validate_input_arguments(
+            do_resize=do_resize,
+            size=size,
+            resample=resample,
+            do_rescale=do_rescale,
+            rescale_factor=rescale_factor,
+            do_normalize=do_normalize,
+            image_mean=image_mean,
+            image_std=image_std,
+            data_format=data_format,
+        )
+
+        # All transformations expect numpy arrays.
+        images = [to_numpy_array(image) for image in images]
+
+        if is_scaled_image(images[0]) and do_rescale:
+            logger.warning_once(
+                "It looks like you are trying to rescale already rescaled images. If the input"
+                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
+            )
+
+        if input_data_format is None:
+            # Only the first image is inspected: all images are assumed to share one channel dimension format.
+            input_data_format = infer_channel_dimension_format(images[0])
+
+        all_images = []
+        for image in images:
+            if do_rescale:
+                image = self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format)
+
+            if do_normalize:
+                image = self.normalize(
+                    image=image, mean=image_mean, std=image_std, input_data_format=input_data_format
+                )
+
+            # DepthPro rescales and normalizes the image first and resizes it last;
+            # the resize uses torch interpolation, which requires ChannelDimension.FIRST, hence the two conversions
+            if do_resize:
+                image = to_channel_dimension_format(image, ChannelDimension.FIRST, input_channel_dim=input_data_format)
+                image = self.resize(image=image, size=size, resample=resample)
+                image = to_channel_dimension_format(image, data_format, input_channel_dim=ChannelDimension.FIRST)
+            else:
+                image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
+
+            all_images.append(image)
+
+        data = {"pixel_values": all_images}
+        return BatchFeature(data=data, tensor_type=return_tensors)
+
+    def post_process_depth_estimation(
+        self,
+        outputs: "DepthProDepthEstimatorOutput",
+        target_sizes: Optional[Union[TensorType, list[tuple[int, int]], None]] = None,
+    ) -> list[dict[str, TensorType]]:
+        """
+        Post-processes the raw depth predictions from the model to generate
+        final depth predictions which is caliberated using the field of view if provided
+        and resized to specified target sizes if provided.
+
+        Args:
+            outputs ([`DepthProDepthEstimatorOutput`]):
+                Raw outputs of the model.
+            target_sizes (`Optional[Union[TensorType, list[tuple[int, int]], None]]`, *optional*, defaults to `None`):
+                Target sizes to resize the depth predictions. Can be a tensor of shape `(batch_size, 2)`
+                or a list of tuples `(height, width)` for each image in the batch. If `None`, no resizing
+                is performed.
+
+        Returns:
+            `list[dict[str, TensorType]]`: A list of dictionaries of tensors representing the processed depth
+            predictions, and field of view (degrees) and focal length (pixels) if `field_of_view` is given in `outputs`.
+
+        Raises:
+            `ValueError`:
+                If the lengths of `predicted_depths`, `fovs`, or `target_sizes` are mismatched.
+        """
+        requires_backends(self, "torch")
+
+        predicted_depth = outputs.predicted_depth
+        fov = outputs.field_of_view
+
+        batch_size = len(predicted_depth)
+
+        if target_sizes is not None and batch_size != len(target_sizes):
+            raise ValueError(
+                "Make sure that you pass in as many fov values as the batch dimension of the predicted depth"
+            )
+
+        results = []
+        fov = [None] * batch_size if fov is None else fov
+        target_sizes = [None] * batch_size if target_sizes is None else target_sizes
+        for depth, fov_value, target_size in zip(predicted_depth, fov, target_sizes):
+            focal_length = None
+            if target_size is not None:
+                # scale image w.r.t fov
+                if fov_value is not None:
+                    width = target_size[1]
+                    focal_length = 0.5 * width / torch.tan(0.5 * torch.deg2rad(fov_value))
+                    depth = depth * width / focal_length
+
+                # interpolate
+                depth = torch.nn.functional.interpolate(
+                    # input should be (B, C, H, W)
+                    input=depth.unsqueeze(0).unsqueeze(1),
+                    size=target_size,
+                    mode=pil_torch_interpolation_mapping[self.resample].value,
+                ).squeeze()
+
+            # inverse the depth
+            depth = 1.0 / torch.clamp(depth, min=1e-4, max=1e4)
+
+            results.append(
+                {
+                    "predicted_depth": depth,
+                    "field_of_view": fov_value,
+                    "focal_length": focal_length,
+                }
+            )
+
+        return results
+
+
+__all__ = ["DepthProImageProcessor"]
diff --git a/phivenv/Lib/site-packages/transformers/models/depth_pro/image_processing_depth_pro_fast.py b/phivenv/Lib/site-packages/transformers/models/depth_pro/image_processing_depth_pro_fast.py
new file mode 100644
index 0000000000000000000000000000000000000000..581577b5b25fdcb71be9ebaaf7369248f27f2caf
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/depth_pro/image_processing_depth_pro_fast.py
@@ -0,0 +1,177 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Fast Image processor class for DepthPro."""
+
+from typing import TYPE_CHECKING, Optional, Union
+
+from ...image_processing_base import BatchFeature
+from ...image_processing_utils_fast import BaseImageProcessorFast, group_images_by_shape, reorder_images
+from ...image_utils import IMAGENET_STANDARD_MEAN, IMAGENET_STANDARD_STD, PILImageResampling, SizeDict
+from ...utils import (
+    TensorType,
+    auto_docstring,
+    is_torch_available,
+    is_torchvision_available,
+    is_torchvision_v2_available,
+    logging,
+    requires_backends,
+)
+from ...utils.import_utils import requires
+
+
+if TYPE_CHECKING:
+    from .modeling_depth_pro import DepthProDepthEstimatorOutput
+
+logger = logging.get_logger(__name__)
+
+
+if is_torch_available():
+    import torch
+
+
+if is_torchvision_available():
+    from ...image_utils import pil_torch_interpolation_mapping
+
+    if is_torchvision_v2_available():
+        from torchvision.transforms.v2 import functional as F
+    else:
+        from torchvision.transforms import functional as F
+
+
+@auto_docstring
+@requires(backends=("torchvision", "torch"))
+class DepthProImageProcessorFast(BaseImageProcessorFast):
+    resample = PILImageResampling.BILINEAR
+    image_mean = IMAGENET_STANDARD_MEAN
+    image_std = IMAGENET_STANDARD_STD
+    size = {"height": 1536, "width": 1536}
+    do_resize = True
+    do_rescale = True
+    do_normalize = True
+
+    # DepthPro resizes image after rescaling and normalizing,
+    # which makes it different from BaseImageProcessorFast._preprocess
+    def _preprocess(
+        self,
+        images: list["torch.Tensor"],
+        do_resize: bool,
+        size: SizeDict,
+        interpolation: Optional["F.InterpolationMode"],
+        do_center_crop: bool,
+        crop_size: SizeDict,
+        do_rescale: bool,
+        rescale_factor: float,
+        do_normalize: bool,
+        image_mean: Optional[Union[float, list[float]]],
+        image_std: Optional[Union[float, list[float]]],
+        disable_grouping: Optional[bool],
+        return_tensors: Optional[Union[str, TensorType]],
+    ) -> BatchFeature:
+        # Group images by size for batched scaling
+        grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping)
+        processed_images_grouped = {}
+        for shape, stacked_images in grouped_images.items():
+            # Fused rescale and normalize
+            stacked_images = self.rescale_and_normalize(
+                stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std
+            )
+            if do_resize:
+                stacked_images = self.resize(
+                    image=stacked_images,
+                    size=size,
+                    interpolation=interpolation,
+                    antialias=False,
+                )
+            processed_images_grouped[shape] = stacked_images
+
+        processed_images = reorder_images(processed_images_grouped, grouped_images_index)
+        processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images
+
+        return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors)
+
+    # Copied from transformers.models.depth_pro.image_processing_depth_pro.DepthProImageProcessor.post_process_depth_estimation
+    def post_process_depth_estimation(
+        self,
+        outputs: "DepthProDepthEstimatorOutput",
+        target_sizes: Optional[Union[TensorType, list[tuple[int, int]], None]] = None,
+    ) -> list[dict[str, TensorType]]:
+        """
+        Post-processes the raw depth predictions from the model to generate
+        final depth predictions which is caliberated using the field of view if provided
+        and resized to specified target sizes if provided.
+
+        Args:
+            outputs ([`DepthProDepthEstimatorOutput`]):
+                Raw outputs of the model.
+            target_sizes (`Optional[Union[TensorType, list[tuple[int, int]], None]]`, *optional*, defaults to `None`):
+                Target sizes to resize the depth predictions. Can be a tensor of shape `(batch_size, 2)`
+                or a list of tuples `(height, width)` for each image in the batch. If `None`, no resizing
+                is performed.
+
+        Returns:
+            `list[dict[str, TensorType]]`: A list of dictionaries of tensors representing the processed depth
+            predictions, and field of view (degrees) and focal length (pixels) if `field_of_view` is given in `outputs`.
+
+        Raises:
+            `ValueError`:
+                If the lengths of `predicted_depths`, `fovs`, or `target_sizes` are mismatched.
+        """
+        requires_backends(self, "torch")
+
+        predicted_depth = outputs.predicted_depth
+        fov = outputs.field_of_view
+
+        batch_size = len(predicted_depth)
+
+        if target_sizes is not None and batch_size != len(target_sizes):
+            raise ValueError(
+                "Make sure that you pass in as many fov values as the batch dimension of the predicted depth"
+            )
+
+        results = []
+        fov = [None] * batch_size if fov is None else fov
+        target_sizes = [None] * batch_size if target_sizes is None else target_sizes
+        for depth, fov_value, target_size in zip(predicted_depth, fov, target_sizes):
+            focal_length = None
+            if target_size is not None:
+                # scale image w.r.t fov
+                if fov_value is not None:
+                    width = target_size[1]
+                    focal_length = 0.5 * width / torch.tan(0.5 * torch.deg2rad(fov_value))
+                    depth = depth * width / focal_length
+
+                # interpolate
+                depth = torch.nn.functional.interpolate(
+                    # input should be (B, C, H, W)
+                    input=depth.unsqueeze(0).unsqueeze(1),
+                    size=target_size,
+                    mode=pil_torch_interpolation_mapping[self.resample].value,
+                ).squeeze()
+
+            # inverse the depth
+            depth = 1.0 / torch.clamp(depth, min=1e-4, max=1e4)
+
+            results.append(
+                {
+                    "predicted_depth": depth,
+                    "field_of_view": fov_value,
+                    "focal_length": focal_length,
+                }
+            )
+
+        return results
+
+
+__all__ = ["DepthProImageProcessorFast"]
diff --git a/phivenv/Lib/site-packages/transformers/models/depth_pro/modeling_depth_pro.py b/phivenv/Lib/site-packages/transformers/models/depth_pro/modeling_depth_pro.py
new file mode 100644
index 0000000000000000000000000000000000000000..52de04d42df778bc6f4e3ee0ced9d7629a7dfd92
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/depth_pro/modeling_depth_pro.py
@@ -0,0 +1,1132 @@
+# coding=utf-8
+# Copyright 2024 The Apple Research Team Authors and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch DepthPro model."""
+
+import math
+from dataclasses import dataclass
+from typing import Optional, Union
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from ...modeling_utils import PreTrainedModel
+from ...utils import ModelOutput, auto_docstring, logging, torch_int
+from ..auto import AutoModel
+from .configuration_depth_pro import DepthProConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Base class for DepthPro's outputs.
+    """
+)
+class DepthProOutput(ModelOutput):
+    r"""
+    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, n_patches_per_batch, sequence_length, hidden_size)`):
+        Sequence of hidden-states at the output of the last layer of the model.
+    features (`Union[torch.FloatTensor, list[torch.FloatTensor]]`, *optional*):
+        Features from encoders. Can be a single feature or a list of features.
+    """
+
+    last_hidden_state: Optional[torch.FloatTensor] = None
+    features: Optional[Union[torch.FloatTensor, list[torch.FloatTensor]]] = None
+    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
+    attentions: Optional[tuple[torch.FloatTensor, ...]] = None
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Base class for DepthProForDepthEstimation's output.
+    """
+)
+class DepthProDepthEstimatorOutput(ModelOutput):
+    r"""
+    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+        Classification (or regression if config.num_labels==1) loss.
+    field_of_view (`torch.FloatTensor` of shape `(batch_size,)`, *optional*, returned when `use_fov_model` is provided):
+        Field of view of each image in the batch, one scalar per image.
+    """
+
+    loss: Optional[torch.FloatTensor] = None
+    predicted_depth: Optional[torch.FloatTensor] = None
+    field_of_view: Optional[torch.FloatTensor] = None
+    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
+    attentions: Optional[tuple[torch.FloatTensor, ...]] = None
+
+
+def split_to_patches(pixel_values: torch.Tensor, patch_size: int, overlap_ratio: float) -> torch.Tensor:
+    """Creates Patches from Batch."""
+    batch_size, num_channels, height, width = pixel_values.shape
+
+    if height == width == patch_size:
+        # create patches only if scaled image is not already equal to patch size
+        return pixel_values
+
+    stride = torch_int(patch_size * (1 - overlap_ratio))
+
+    patches = F.unfold(pixel_values, kernel_size=(patch_size, patch_size), stride=(stride, stride))
+    patches = patches.permute(2, 0, 1)
+    patches = patches.reshape(-1, num_channels, patch_size, patch_size)
+
+    return patches
+
+
+def reshape_features(hidden_states: torch.Tensor) -> torch.Tensor:
+    """Discard class token and reshape 1D feature map to a 2D grid."""
+    n_samples, seq_len, hidden_size = hidden_states.shape
+    size = torch_int(seq_len**0.5)
+
+    hidden_states = hidden_states[:, -(size**2) :, :]  # remove special tokens if there are any
+    hidden_states = hidden_states.reshape(n_samples, size, size, hidden_size)
+    hidden_states = hidden_states.permute(0, 3, 1, 2)
+
+    return hidden_states
+
+
+def merge_patches(patches: torch.Tensor, batch_size: int, padding: int) -> torch.Tensor:
+    """Merge `(n_patches, hidden_size, out_size, out_size)` patch features into one square map per sample."""
+    n_patches, hidden_size, out_size, out_size = patches.shape
+    n_patches_per_batch = n_patches // batch_size
+    sqrt_n_patches_per_batch = torch_int(n_patches_per_batch**0.5)
+    new_out_size = sqrt_n_patches_per_batch * out_size
+
+    if n_patches == batch_size:
+        # merge only if the patches were created from scaled image
+        # patches are not created when scaled image size is equal to patch size
+        return patches
+
+    if n_patches_per_batch < 4:
+        # for each batch, at least 4 small patches are required to
+        # recreate a large square patch from merging them and later padding is applied
+        # 3 x (8x8) patches becomes 1 x ( 8x8 ) patch (extra patch ignored, no padding)
+        # 4 x (8x8) patches becomes 1 x (16x16) patch (padding later)
+        # 5 x (8x8) patches becomes 1 x (16x16) patch (extra patch ignored, padding later)
+        # 9 x (8x8) patches becomes 1 x (24x24) patch (padding later)
+        # thus the following code only rearranges the patches and removes extra ones
+        padding = 0
+
+    # make sure padding is not large enough to remove more than half of the patch
+    padding = min(out_size // 4, padding)
+
+    if padding == 0:
+        # faster when no padding is required: the patches are regrouped per sample with reshapes/permutes only
+        merged = patches.reshape(n_patches_per_batch, batch_size, hidden_size, out_size, out_size)
+        merged = merged.permute(1, 2, 0, 3, 4)
+        merged = merged[:, :, : sqrt_n_patches_per_batch**2, :, :]  # drop patches beyond the square grid
+        merged = merged.reshape(
+            batch_size, hidden_size, sqrt_n_patches_per_batch, sqrt_n_patches_per_batch, out_size, out_size
+        )
+        merged = merged.permute(0, 1, 2, 4, 3, 5)
+        merged = merged.reshape(batch_size, hidden_size, new_out_size, new_out_size)
+    else:
+        # padding example:
+        # let out_size = 8, new_out_size = 32, padding = 2
+        # each patch is separated by "|"
+        # and padding is applied to the merging edges of each patch
+        # 00 01 02 03 04 05 06 07 | 08 09 10 11 12 13 14 15 | 16 17 18 19 20 21 22 23 | 24 25 26 27 28 29 30 31
+        # 00 01 02 03 04 05 -- -- | -- -- 10 11 12 13 -- -- | -- -- 18 19 20 21 -- -- | -- -- 26 27 28 29 30 31
+        i = 0
+        boxes = []
+        for h in range(sqrt_n_patches_per_batch):
+            boxes_in_row = []
+            for w in range(sqrt_n_patches_per_batch):
+                box = patches[batch_size * i : batch_size * (i + 1)]  # i-th group of `batch_size` patches
+
+                # collect paddings as [top, bottom, left, right]
+                paddings = [0, 0, 0, 0]
+                if h != 0:
+                    # remove pad from height if box is not at top border
+                    paddings[0] = padding
+                if w != 0:
+                    # remove pad from width if box is not at left border
+                    paddings[2] = padding
+                if h != sqrt_n_patches_per_batch - 1:
+                    # remove pad from height if box is not at bottom border
+                    paddings[1] = padding
+                if w != sqrt_n_patches_per_batch - 1:
+                    # remove pad from width if box is not at right border
+                    paddings[3] = padding
+
+                # remove paddings by cropping the last two (height, width) dimensions
+                _, _, box_h, box_w = box.shape
+                pad_top, pad_bottom, pad_left, pad_right = paddings
+                box = box[:, :, pad_top : box_h - pad_bottom, pad_left : box_w - pad_right]
+
+                boxes_in_row.append(box)
+                i += 1
+            boxes_in_row = torch.cat(boxes_in_row, dim=-1)  # join the boxes of one row along the width
+            boxes.append(boxes_in_row)
+        merged = torch.cat(boxes, dim=-2)  # stack the rows along the height
+
+    return merged
+
+
+def reconstruct_feature_maps(
+    hidden_state: torch.Tensor, batch_size: int, padding: int, output_size: tuple[float, float]
+) -> torch.Tensor:
+    """
+    Reconstructs feature maps from the hidden state produced by any of the encoder. Converts the hidden state of shape
+    `(n_patches_per_batch * batch_size, seq_len, hidden_size)` to feature maps of shape
+    `(batch_size, hidden_size, output_size[0], output_size[1])`.
+
+    Args:
+        hidden_state (torch.Tensor): Input tensor of shape `(n_patches_per_batch * batch_size, seq_len, hidden_size)`
+            representing the encoded patches.
+        batch_size (int): The number of samples in a batch.
+        padding (int): The amount of padding to be removed when merging patches.
+        output_size (tuple[float, float]): The desired output size for the feature maps, specified as `(height, width)`.
+
+    Returns:
+        torch.Tensor: Reconstructed feature maps of shape `(batch_size, hidden_size, output_size[0], output_size[1])`.
+    """
+    # reshape back to image like
+    features = reshape_features(hidden_state)
+
+    # merge all patches in a batch to create one large patch per batch
+    features = merge_patches(
+        features,
+        batch_size=batch_size,
+        padding=padding,
+    )
+
+    # interpolate patches to base size
+    features = F.interpolate(
+        features,
+        size=output_size,
+        mode="bilinear",
+        align_corners=False,
+    )
+
+    return features
+
+
+class DepthProPatchEncoder(nn.Module):
+    def __init__(self, config: DepthProConfig):
+        super().__init__()
+        self.config = config
+
+        self.intermediate_hook_ids = config.intermediate_hook_ids
+        self.intermediate_feature_dims = config.intermediate_feature_dims
+        self.scaled_images_ratios = config.scaled_images_ratios
+        self.scaled_images_overlap_ratios = config.scaled_images_overlap_ratios
+        self.scaled_images_feature_dims = config.scaled_images_feature_dims
+        self.merge_padding_value = config.merge_padding_value
+
+        self.n_scaled_images = len(config.scaled_images_ratios)
+        self.n_intermediate_hooks = len(config.intermediate_hook_ids)
+        self.out_size = config.image_model_config.image_size // config.image_model_config.patch_size
+
+        self.model = AutoModel.from_config(config.patch_model_config)
+
+    def forward(
+        self,
+        pixel_values: torch.Tensor,
+        head_mask: Optional[torch.Tensor] = None,
+    ) -> list[torch.Tensor]:
+        batch_size, num_channels, height, width = pixel_values.shape
+
+        if min(self.scaled_images_ratios) * min(height, width) < self.config.patch_size:
+            raise ValueError(
+                f"Image size {height}x{width} is too small to be scaled "
+                f"with scaled_images_ratios={self.scaled_images_ratios} "
+                f"when patch_size={self.config.patch_size}."
+            )
+
+        # STEP 1: create 3-level image
+
+        scaled_images = []
+        for ratio in self.scaled_images_ratios:
+            scaled_images.append(
+                F.interpolate(
+                    pixel_values,
+                    scale_factor=ratio,
+                    mode="bilinear",
+                    align_corners=False,
+                )
+            )
+
+        # STEP 2: create patches
+
+        for i in range(self.n_scaled_images):
+            scaled_images[i] = split_to_patches(
+                scaled_images[i],
+                patch_size=self.config.patch_size,
+                overlap_ratio=self.scaled_images_overlap_ratios[i],
+            )
+        n_patches_per_scaled_image = [len(i) for i in scaled_images]
+        patches = torch.cat(scaled_images[::-1], dim=0)  # -1 as patch encoder expects high res patches first
+
+        # STEP 3: apply patch encoder
+
+        encodings = self.model(
+            # each patch is processed as a separate batch
+            patches,
+            head_mask=head_mask,
+            # required for intermediate features
+            output_hidden_states=self.n_intermediate_hooks > 0,
+        )
+
+        scaled_images_last_hidden_state = torch.split_with_sizes(encodings[0], n_patches_per_scaled_image[::-1])
+        # -1 (reverse list) as patch encoder returns high res patches first, we need low res first
+        scaled_images_last_hidden_state = scaled_images_last_hidden_state[::-1]
+
+        # calculate base height and width
+        # base height and width are the dimensions of the lowest resolution features
+        exponent_value = torch_int(math.log2(width / self.out_size))
+        base_height = height // 2**exponent_value
+        base_width = width // 2**exponent_value
+
+        # STEP 4: get patch features (high_res, med_res, low_res) - (3-5) in diagram
+
+        scaled_images_features = []
+        for i in range(self.n_scaled_images):
+            hidden_state = scaled_images_last_hidden_state[i]
+            batch_size = batch_size
+            padding = torch_int(self.merge_padding_value * (1 / self.scaled_images_ratios[i]))
+            output_height = base_height * 2**i
+            output_width = base_width * 2**i
+            features = reconstruct_feature_maps(
+                hidden_state,
+                batch_size=batch_size,
+                padding=padding,
+                output_size=(output_height, output_width),
+            )
+            scaled_images_features.append(features)
+
+        # STEP 5: get intermediate features - (1-2) in diagram
+
+        intermediate_features = []
+        for i in range(self.n_intermediate_hooks):
+            # +1 to correct index position as hidden_states contain embedding output as well
+            hidden_state = encodings[2][self.intermediate_hook_ids[i] + 1]
+            padding = torch_int(self.merge_padding_value * (1 / self.scaled_images_ratios[-1]))
+            output_height = base_height * 2 ** (self.n_scaled_images - 1)
+            output_width = base_width * 2 ** (self.n_scaled_images - 1)
+            features = reconstruct_feature_maps(
+                hidden_state,
+                batch_size=batch_size,
+                padding=padding,
+                output_size=(output_height, output_width),
+            )
+            intermediate_features.append(features)
+
+        # STEP 7: combine all features
+        features = [*scaled_images_features, *intermediate_features]
+
+        return features
+
+
+class DepthProImageEncoder(nn.Module):
+    """Encode the full image, resized to the backbone's square input, into a single feature map."""
+
+    def __init__(self, config: DepthProConfig):
+        super().__init__()
+        self.config = config
+        image_config = config.image_model_config
+        # patches per side of the square backbone input
+        self.out_size = image_config.image_size // image_config.patch_size
+        self.model = AutoModel.from_config(image_config)
+
+    def forward(
+        self,
+        pixel_values: torch.Tensor,
+        head_mask: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+    ) -> Union[tuple, DepthProOutput]:
+        """
+        Run the backbone on a bilinearly resized copy of `pixel_values` and rebuild a 2D feature map.
+
+        With `return_dict=False` the result is the tuple `(encodings[0], features, *encodings[2:])` of
+        backbone outputs; otherwise the same information is wrapped in a `DepthProOutput`.
+        """
+        batch_size, _, height, width = pixel_values.shape
+
+        # the backbone always receives an `image_size` x `image_size` input
+        side = self.config.image_model_config.image_size
+        resized = F.interpolate(pixel_values, size=(side, side), mode="bilinear", align_corners=False)
+        encodings = self.model(
+            pixel_values=resized,
+            head_mask=head_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+        )
+
+        # base height and width are the dimensions of the lowest resolution features
+        downscale = 2 ** torch_int(math.log2(width / self.out_size))
+        features = reconstruct_feature_maps(
+            encodings[0],
+            batch_size=batch_size,
+            padding=0,
+            output_size=(height // downscale, width // downscale),
+        )
+
+        if return_dict:
+            return DepthProOutput(
+                last_hidden_state=encodings.last_hidden_state,
+                features=features,
+                hidden_states=encodings.hidden_states,
+                attentions=encodings.attentions,
+            )
+        # tuple form: the backbone's entry at index 1 (pooler output) gives way to the feature map
+        return (encodings[0], features) + encodings[2:]
+
+
+class DepthProEncoder(nn.Module):
+    """Combine the patch encoder and the image encoder into a single list of feature maps."""
+
+    def __init__(self, config: DepthProConfig):
+        super().__init__()
+        self.config = config
+        # settings mirrored from the config
+        self.intermediate_hook_ids = config.intermediate_hook_ids
+        self.intermediate_feature_dims = config.intermediate_feature_dims
+        self.scaled_images_ratios = config.scaled_images_ratios
+        self.scaled_images_overlap_ratios = config.scaled_images_overlap_ratios
+        self.scaled_images_feature_dims = config.scaled_images_feature_dims
+        self.merge_padding_value = config.merge_padding_value
+        self.n_scaled_images = len(self.scaled_images_ratios)
+        self.n_intermediate_hooks = len(self.intermediate_hook_ids)
+
+        self.patch_encoder = DepthProPatchEncoder(config)
+        self.image_encoder = DepthProImageEncoder(config)
+
+    def forward(
+        self,
+        pixel_values: torch.Tensor,
+        head_mask: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+    ) -> Union[tuple, DepthProOutput]:
+        """Return the image-encoder outputs with `features` = [image features, *patch features]."""
+        # unpacking into four names rejects anything that is not a 4D input right away
+        _batch_size, _num_channels, _height, _width = pixel_values.shape
+
+        patch_features = self.patch_encoder(pixel_values, head_mask=head_mask)
+        image_encodings = self.image_encoder(
+            pixel_values,
+            head_mask=head_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        # index 1 contains the image features
+        features = [image_encodings[1], *patch_features]
+
+        if return_dict:
+            return DepthProOutput(
+                last_hidden_state=image_encodings.last_hidden_state,
+                features=features,
+                hidden_states=image_encodings.hidden_states,
+                attentions=image_encodings.attentions,
+            )
+
+        return (image_encodings[0], features) + image_encodings[2:]
+
+
+class DepthProFeatureUpsampleBlock(nn.Module):
+    """
+    Optional 1x1 projection followed by `n_upsample_layers` transposed convolutions.
+
+    The projection (built when `use_proj` is true) maps `input_dims` to `intermediate_dims`
+    channels; without it `input_dims` is unused.
+    Every transposed convolution uses `kernel_size=2, stride=2, padding=0`; the first one expects
+    `intermediate_dims` input channels and all of them produce `output_dims` channels.
+    `bias` is forwarded to every layer of the block.
+    """
+
+    def __init__(
+        self,
+        config: DepthProConfig,
+        input_dims: int,
+        intermediate_dims: int,
+        output_dims: int,
+        n_upsample_layers: int,
+        use_proj: bool = True,
+        bias: bool = False,
+    ):
+        super().__init__()
+        self.config = config
+        self.layers = nn.ModuleList()
+
+        if use_proj:
+            # first projection layer: changes the channel count only
+            self.layers.append(
+                nn.Conv2d(input_dims, intermediate_dims, kernel_size=1, stride=1, padding=0, bias=bias)
+            )
+
+        # following upsample layers
+        in_channels = intermediate_dims
+        for _ in range(n_upsample_layers):
+            self.layers.append(
+                nn.ConvTranspose2d(in_channels, output_dims, kernel_size=2, stride=2, padding=0, bias=bias)
+            )
+            in_channels = output_dims
+
+    def forward(self, features: torch.Tensor) -> torch.Tensor:
+        """Pass `features` through the block's layers in order."""
+        for layer in self.layers:
+            features = layer(features)
+        return features
+
+
+class DepthProFeatureUpsample(nn.Module):
+    """Upsample each encoder feature map with its own block; `forward` returns a new list."""
+
+    def __init__(self, config: DepthProConfig):
+        super().__init__()
+        self.config = config
+        self.n_scaled_images = len(self.config.scaled_images_ratios)
+        self.n_intermediate_hooks = len(self.config.intermediate_hook_ids)
+
+        # for image_features: no projection, a single upsample layer with bias
+        self.image_block = DepthProFeatureUpsampleBlock(
+            config=config,
+            input_dims=config.image_model_config.hidden_size,
+            intermediate_dims=config.image_model_config.hidden_size,
+            output_dims=config.scaled_images_feature_dims[0],
+            n_upsample_layers=1,
+            use_proj=False,
+            bias=True,
+        )
+
+        # for scaled_images_features: projection plus one upsample layer each
+        self.scaled_images = nn.ModuleList()
+        for feature_dims in config.scaled_images_feature_dims:
+            block = DepthProFeatureUpsampleBlock(
+                config=config,
+                input_dims=config.patch_model_config.hidden_size,
+                intermediate_dims=feature_dims,
+                output_dims=feature_dims,
+                n_upsample_layers=1,
+            )
+            self.scaled_images.append(block)
+
+        # for intermediate_features: the i-th block stacks `2 + i` upsample layers
+        self.intermediate = nn.ModuleList()
+        for i, feature_dims in enumerate(config.intermediate_feature_dims):
+            block = DepthProFeatureUpsampleBlock(
+                config=config,
+                input_dims=config.patch_model_config.hidden_size,
+                intermediate_dims=config.fusion_hidden_size if i == 0 else feature_dims,
+                output_dims=feature_dims,
+                n_upsample_layers=2 + i,
+            )
+            self.intermediate.append(block)
+
+    def forward(self, features: list[torch.Tensor]) -> list[torch.Tensor]:
+        upsampled = list(features)  # shallow copy: the caller's list is no longer modified in place
+        upsampled[0] = self.image_block(upsampled[0])
+        for i in range(self.n_scaled_images):
+            upsampled[i + 1] = self.scaled_images[i](upsampled[i + 1])
+        offset = self.n_scaled_images + 1  # intermediate features follow the scaled-image features
+        for i in range(self.n_intermediate_hooks):
+            upsampled[offset + i] = self.intermediate[i](upsampled[offset + i])
+        return upsampled
+
+
+class DepthProFeatureProjection(nn.Module):
+    """
+    Project every feature map to `config.fusion_hidden_size` channels.
+
+    One projection is created per entry of `scaled_images_feature_dims + intermediate_feature_dims`:
+    a bias-free 3x3 convolution (stride 1, padding 1), or an identity for the last entry when its
+    channel count already equals the fusion size.
+    """
+
+    def __init__(self, config: DepthProConfig):
+        super().__init__()
+        self.config = config
+
+        fusion_dims = config.fusion_hidden_size
+        combined_feature_dims = config.scaled_images_feature_dims + config.intermediate_feature_dims
+        last_index = len(combined_feature_dims) - 1
+        self.projections = nn.ModuleList()
+        for index, in_channels in enumerate(combined_feature_dims):
+            if index != last_index or in_channels != fusion_dims:
+                projection = nn.Conv2d(in_channels, fusion_dims, kernel_size=3, stride=1, padding=1, bias=False)
+            else:
+                # projection for last layer can be ignored if input and output channels already match
+                projection = nn.Identity()
+            self.projections.append(projection)
+
+    def forward(self, features: list[torch.Tensor]) -> list[torch.Tensor]:
+        """Apply the i-th projection to `features[i]` and collect the results in order."""
+        # indexing (rather than zip) keeps the IndexError for a `features` list that is too short
+        return [projection(features[index]) for index, projection in enumerate(self.projections)]
+
+
+class DepthProNeck(nn.Module):
+    """Upsample the encoder features, merge the first two of them and project everything for fusion."""
+
+    def __init__(self, config: DepthProConfig):
+        super().__init__()
+        self.config = config
+        low_res_dims = config.scaled_images_feature_dims[0]
+
+        self.feature_upsample = DepthProFeatureUpsample(config)
+        # 1x1 convolution mapping the concatenated pair back to `low_res_dims` channels
+        self.fuse_image_with_low_res = nn.Conv2d(
+            low_res_dims * 2, low_res_dims, kernel_size=1, stride=1, padding=0, bias=True
+        )
+        self.feature_projection = DepthProFeatureProjection(config)
+
+    def forward(self, features: list[torch.Tensor]) -> list[torch.Tensor]:
+        """Return the projections of `[fused, *upsampled[2:]]`, where `fused` merges entries 1 and 0."""
+        upsampled = self.feature_upsample(features)
+
+        # global features = low res features + image features, concatenated along dim 1
+        stacked = torch.cat((upsampled[1], upsampled[0]), dim=1)
+        global_features = self.fuse_image_with_low_res(stacked)
+        fusion_inputs = [global_features, *upsampled[2:]]
+        return self.feature_projection(fusion_inputs)
+
+
+# General docstring
+
+
+@auto_docstring
+class DepthProPreTrainedModel(PreTrainedModel):
+    config: DepthProConfig
+    base_model_prefix = "depth_pro"
+    main_input_name = "pixel_values"
+    supports_gradient_checkpointing = True
+    _supports_sdpa = True
+    _no_split_modules = ["DepthProPreActResidualLayer"]
+    _keys_to_ignore_on_load_unexpected = ["fov_model.*"]
+
+    def _init_weights(self, module):
+        """Initialize `module` in place (normal linears, unit layer norms, Kaiming-normal convs, zero biases)."""
+        if isinstance(module, nn.Linear):
+            # Slightly different from the TF version (truncated_normal), cf https://github.com/pytorch/pytorch/pull/5617
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+            return
+        elif isinstance(module, (nn.Conv2d, nn.ConvTranspose2d)):
+            nn.init.kaiming_normal_(module.weight, mode="fan_out", nonlinearity="relu")
+        else:
+            return
+        if module.bias is not None:
+            module.bias.data.zero_()
+
+
+@auto_docstring
+class DepthProModel(DepthProPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.config = config
+        self.encoder = DepthProEncoder(config)
+        self.neck = DepthProNeck(config)
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.encoder.image_encoder.model.get_input_embeddings()
+
+    @auto_docstring
+    def forward(
+        self,
+        pixel_values: torch.FloatTensor,
+        head_mask: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, DepthProOutput]:
+        r"""
+        Examples:
+
+        ```python
+        >>> import torch
+        >>> from PIL import Image
+        >>> import requests
+        >>> from transformers import AutoProcessor, DepthProModel
+
+        >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> checkpoint = "apple/DepthPro-hf"
+        >>> processor = AutoProcessor.from_pretrained(checkpoint)
+        >>> model = DepthProModel.from_pretrained(checkpoint)
+
+        >>> # prepare image for the model
+        >>> inputs = processor(images=image, return_tensors="pt")
+
+        >>> with torch.no_grad():
+        ...     output = model(**inputs)
+
+        >>> output.last_hidden_state.shape
+        torch.Size([1, 35, 577, 1024])
+        ```"""
+        if output_attentions is None:
+            output_attentions = self.config.output_attentions
+        if output_hidden_states is None:
+            output_hidden_states = self.config.output_hidden_states
+        if return_dict is None:
+            return_dict = self.config.use_return_dict
+
+        encodings = self.encoder(
+            pixel_values,
+            head_mask=head_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        # index 1 contains the encoder features; the neck output replaces them in the result
+        features = self.neck(encodings[1])
+
+        if return_dict:
+            return DepthProOutput(
+                last_hidden_state=encodings.last_hidden_state,
+                features=features,
+                hidden_states=encodings.hidden_states,
+                attentions=encodings.attentions,
+            )
+        return (encodings[0], features) + encodings[2:]
+
+
+# Copied from transformers.models.dpt.modeling_dpt.DPTPreActResidualLayer DPT->DepthPro
+class DepthProPreActResidualLayer(nn.Module):
+    """
+    ResidualConvUnit, pre-activate residual unit: twice (ReLU, 3x3 conv, optional batch norm), plus a skip connection.
+
+    Args:
+        config (`[DepthProConfig]`):
+            Model configuration class defining the model architecture.
+    """
+
+    def __init__(self, config: DepthProConfig):
+        super().__init__()
+
+        self.use_batch_norm = config.use_batch_norm_in_fusion_residual
+        use_bias_in_fusion_residual = (
+            config.use_bias_in_fusion_residual
+            if config.use_bias_in_fusion_residual is not None
+            else not self.use_batch_norm  # unset in the config: use a conv bias only without batch norm
+        )
+
+        self.activation1 = nn.ReLU()
+        self.convolution1 = nn.Conv2d(
+            config.fusion_hidden_size,
+            config.fusion_hidden_size,
+            kernel_size=3,
+            stride=1,
+            padding=1,
+            bias=use_bias_in_fusion_residual,
+        )
+
+        self.activation2 = nn.ReLU()
+        self.convolution2 = nn.Conv2d(
+            config.fusion_hidden_size,
+            config.fusion_hidden_size,
+            kernel_size=3,
+            stride=1,
+            padding=1,
+            bias=use_bias_in_fusion_residual,
+        )
+
+        if self.use_batch_norm:  # the batch norm modules only exist when enabled
+            self.batch_norm1 = nn.BatchNorm2d(config.fusion_hidden_size)
+            self.batch_norm2 = nn.BatchNorm2d(config.fusion_hidden_size)
+
+    def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
+        residual = hidden_state  # kept for the skip connection at the end
+        hidden_state = self.activation1(hidden_state)
+
+        hidden_state = self.convolution1(hidden_state)
+
+        if self.use_batch_norm:
+            hidden_state = self.batch_norm1(hidden_state)
+
+        hidden_state = self.activation2(hidden_state)
+        hidden_state = self.convolution2(hidden_state)
+
+        if self.use_batch_norm:
+            hidden_state = self.batch_norm2(hidden_state)
+
+        return hidden_state + residual  # skip connection: add the block input back
+
+
+# Modified from transformers.models.dpt.modeling_dpt.DPTFeatureFusionLayer
+# except it uses deconv and skip_add and needs no interpolation
+class DepthProFeatureFusionLayer(nn.Module):
+    """
+    Add an optional (refined) residual to the hidden state, then refine, upsample and project it.
+
+    With `use_deconv=True` a transposed convolution (`kernel_size=2, stride=2`) doubles the spatial
+    size before the final 1x1 projection; with `use_deconv=False` that step is skipped.
+    """
+
+    def __init__(self, config: DepthProConfig, use_deconv: bool = True):
+        super().__init__()
+        self.config = config
+        self.use_deconv = use_deconv
+        hidden_size = config.fusion_hidden_size
+
+        self.residual_layer1 = DepthProPreActResidualLayer(config)
+        self.residual_layer2 = DepthProPreActResidualLayer(config)
+        if use_deconv:
+            self.deconv = nn.ConvTranspose2d(
+                hidden_size, hidden_size, kernel_size=2, stride=2, padding=0, bias=False
+            )
+        self.projection = nn.Conv2d(hidden_size, hidden_size, kernel_size=1, bias=True)
+
+    def forward(self, hidden_state: torch.Tensor, residual: Optional[torch.Tensor] = None) -> torch.Tensor:
+        """Fuse `residual` (if given) into `hidden_state` and return the processed result."""
+        if residual is not None:
+            # the residual passes through its own residual unit before the addition
+            hidden_state = hidden_state + self.residual_layer1(residual)
+
+        refined = self.residual_layer2(hidden_state)
+        if self.use_deconv:
+            refined = self.deconv(refined)
+        return self.projection(refined)
+
+
+# Modified from transformers.models.dpt.modeling_dpt.DPTFeatureFusionStage with DPT->DepthPro
+# with deconv and reversed layers
+class DepthProFeatureFusionStage(nn.Module):
+    """Fuse a list of feature maps one after another, upsampling after every layer except the last."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+
+        self.num_layers = len(config.intermediate_hook_ids) + len(config.scaled_images_ratios)
+        self.intermediate = nn.ModuleList([DepthProFeatureFusionLayer(config) for _ in range(self.num_layers - 1)])
+        # final layer does not require deconvolution
+        self.final = DepthProFeatureFusionLayer(config, use_deconv=False)
+
+    def forward(self, hidden_states: list[torch.Tensor]) -> list[torch.Tensor]:
+        """
+        Return the fused state after every layer (the last entry is the output of the final layer).
+
+        Raises:
+            ValueError: if `len(hidden_states)` differs from the number of fusion layers.
+        """
+        if self.num_layers != len(hidden_states):
+            raise ValueError(
+                f"num_layers={self.num_layers} in DepthProFeatureFusionStage "
+                f"does not match len(hidden_states)={len(hidden_states)}"
+            )
+
+        fused_hidden_states = []
+        fused_hidden_state = None
+        for layer, hidden_state in zip([*self.intermediate, self.final], hidden_states):
+            if fused_hidden_state is None:
+                # the first layer (even when it is the final one) has nothing to fuse with yet
+                fused_hidden_state = layer(hidden_state)
+            else:
+                fused_hidden_state = layer(fused_hidden_state, hidden_state)
+            fused_hidden_states.append(fused_hidden_state)
+        return fused_hidden_states
+
+
+class DepthProFovEncoder(nn.Module):
+    """
+    Encode the image with the field-of-view backbone and rebuild a 2D feature map from its output.
+
+    The backbone output is passed through a linear layer producing `fusion_hidden_size // 2`
+    features before it is handed to `reconstruct_feature_maps`.
+    """
+
+    def __init__(self, config: DepthProConfig):
+        super().__init__()
+        self.config = config
+        # NOTE(review): derived from `image_model_config` although the backbone below is built from
+        # `fov_model_config` -- presumably both share image size and patch size; confirm.
+        self.out_size = config.image_model_config.image_size // config.image_model_config.patch_size
+
+        self.model = AutoModel.from_config(config.fov_model_config)
+        self.neck = nn.Linear(config.fov_model_config.hidden_size, config.fusion_hidden_size // 2)
+
+    def forward(
+        self,
+        pixel_values: torch.Tensor,
+        head_mask: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        """
+        Return the feature map for `pixel_values`.
+
+        The image is first resized bilinearly to the backbone's square `image_size`; the requested
+        output size is the original height and width divided by the same power of two.
+        """
+        batch_size, _, height, width = pixel_values.shape
+
+        # scale the image for fov_encoder
+        side = self.config.fov_model_config.image_size
+        resized = F.interpolate(pixel_values, size=(side, side), mode="bilinear", align_corners=False)
+        encodings = self.model(pixel_values=resized, head_mask=head_mask)
+        hidden_state = self.neck(encodings[0])
+
+        # base height and width are the dimensions of the lowest resolution features
+        downscale = 2 ** torch_int(math.log2(width / self.out_size))
+        return reconstruct_feature_maps(
+            hidden_state,
+            batch_size=batch_size,
+            padding=0,
+            output_size=(height // downscale, width // downscale),
+        )
+
+
+class DepthProFovHead(nn.Module):
+    """
+    Stack of stride-2 3x3 convolutions (each followed by an in-place ReLU) and a final convolution
+    with a single output channel.
+
+    With `f = fusion_hidden_size`, the i-th strided convolution maps `ceil(f / 2 ** (i + 1))` to
+    `ceil(f / 2 ** (i + 2))` channels. The final kernel size is derived from `out_size` and the
+    number of layers; per the original note it is meant to yield a scalar output.
+    """
+
+    def __init__(self, config: DepthProConfig):
+        super().__init__()
+        self.config = config
+        self.fusion_hidden_size = config.fusion_hidden_size
+        self.out_size = config.image_model_config.image_size // config.image_model_config.patch_size
+        n_layers = config.num_fov_head_layers
+
+        def channels_at(level: int) -> int:
+            # fusion width divided by 2 ** level, rounded up
+            return math.ceil(self.fusion_hidden_size / 2**level)
+
+        # initial head layers: strided convolution + ReLU pairs
+        self.layers = nn.ModuleList()
+        for level in range(1, n_layers + 1):
+            conv = nn.Conv2d(channels_at(level), channels_at(level + 1), kernel_size=3, stride=2, padding=1)
+            self.layers.extend([conv, nn.ReLU(True)])
+
+        # calculate expected shapes to finally generate a scalar output from final head layer
+        final_kernel_size = torch_int((self.out_size - 1) / 2**n_layers + 1)
+        self.layers.append(
+            nn.Conv2d(channels_at(n_layers + 1), 1, kernel_size=final_kernel_size, stride=1, padding=0)
+        )
+
+    def forward(self, features: torch.Tensor) -> torch.Tensor:
+        """Resize `features` bilinearly to `out_size` x `out_size` and apply every layer in order."""
+        resized = F.interpolate(features, size=(self.out_size, self.out_size), mode="bilinear", align_corners=False)
+        for layer in self.layers:
+            resized = layer(resized)
+        return resized
+
+
+class DepthProFovModel(nn.Module):
+    """
+    Field-of-view branch: combines features of its own encoder with a global feature map.
+
+    The global feature map goes through a stride-2 3x3 convolution that halves its channels and a
+    ReLU; the sum of both feature maps is fed to `DepthProFovHead`.
+    """
+
+    def __init__(self, config: DepthProConfig):
+        super().__init__()
+        self.config = config
+        self.fusion_hidden_size = config.fusion_hidden_size
+        half_size = self.fusion_hidden_size // 2
+
+        self.fov_encoder = DepthProFovEncoder(config)
+        self.conv = nn.Conv2d(self.fusion_hidden_size, half_size, kernel_size=3, stride=2, padding=1)
+        self.activation = nn.ReLU(inplace=True)
+        self.head = DepthProFovHead(config)
+
+    def forward(
+        self,
+        pixel_values: torch.Tensor,
+        global_features: torch.Tensor,
+        head_mask: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        """Return the head output for the summed features, flattened to one dimension."""
+        fov_features = self.fov_encoder(pixel_values, head_mask)
+        reduced_global = self.activation(self.conv(global_features))
+        return self.head(fov_features + reduced_global).flatten()
+
+
+class DepthProDepthEstimationHead(nn.Module):
+    """
+    Output head for depth estimation: turns the fused feature map into a single-channel depth map.
+
+    Layer sequence:
+        1. 3x3 convolution halving the channels (`fusion_hidden_size` -> `fusion_hidden_size // 2`)
+        2. transposed convolution (`kernel_size=2, stride=2`) doubling height and width
+        3. 3x3 convolution to 32 channels, followed by an in-place ReLU
+        4. 1x1 convolution to one channel, followed by a ReLU (so the output is non-negative)
+    """
+
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+
+        full = config.fusion_hidden_size
+        half = full // 2
+        self.layers = nn.ModuleList(
+            [
+                nn.Conv2d(full, half, kernel_size=3, stride=1, padding=1),
+                nn.ConvTranspose2d(half, half, kernel_size=2, stride=2, padding=0, bias=True),
+                nn.Conv2d(half, 32, kernel_size=3, stride=1, padding=1),
+                nn.ReLU(True),
+                nn.Conv2d(32, 1, kernel_size=1, stride=1, padding=0),
+                nn.ReLU(),
+            ]
+        )
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        """Apply all layers in order and drop the channel dimension (dim 1) from the result."""
+        depth = hidden_states
+        for layer in self.layers:
+            depth = layer(depth)
+
+        # `squeeze(dim=1)` removes the single output channel
+        predicted_depth = depth.squeeze(dim=1)
+        return predicted_depth
+
+
+@auto_docstring(
+    custom_intro="""
+    DepthPro Model with a depth estimation head on top (consisting of 3 convolutional layers).
+    """
+)
+class DepthProForDepthEstimation(DepthProPreTrainedModel):
+    def __init__(self, config, use_fov_model=None):
+        r"""
+        use_fov_model (bool, *optional*):
+            Whether to use the field of view model.
+        """
+        super().__init__(config)
+        self.config = config
+        self.use_fov_model = use_fov_model if use_fov_model is not None else self.config.use_fov_model
+
+        # dinov2 (vit) like encoders
+        self.depth_pro = DepthProModel(config)
+
+        # dpt (vit) like fusion stage
+        self.fusion_stage = DepthProFeatureFusionStage(config)
+
+        # depth estimation head
+        self.head = DepthProDepthEstimationHead(config)
+
+        # dinov2 (vit) like encoder
+        self.fov_model = DepthProFovModel(config) if self.use_fov_model else None
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        pixel_values: torch.FloatTensor,
+        head_mask: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple[torch.Tensor], DepthProDepthEstimatorOutput]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
+            Ground truth depth estimation maps for computing the loss.
+
+        Examples:
+
+        ```python
+        >>> from transformers import AutoImageProcessor, DepthProForDepthEstimation
+        >>> import torch
+        >>> from PIL import Image
+        >>> import requests
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> checkpoint = "apple/DepthPro-hf"
+        >>> processor = AutoImageProcessor.from_pretrained(checkpoint)
+        >>> model = DepthProForDepthEstimation.from_pretrained(checkpoint)
+
+        >>> device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        >>> model.to(device)
+
+        >>> # prepare image for the model
+        >>> inputs = processor(images=image, return_tensors="pt").to(device)
+
+        >>> with torch.no_grad():
+        ...     outputs = model(**inputs)
+
+        >>> # interpolate to original size
+        >>> post_processed_output = processor.post_process_depth_estimation(
+        ...     outputs, target_sizes=[(image.height, image.width)],
+        ... )
+
+        >>> # get the field of view (fov) predictions
+        >>> field_of_view = post_processed_output[0]["field_of_view"]
+        >>> focal_length = post_processed_output[0]["focal_length"]
+
+        >>> # visualize the prediction
+        >>> predicted_depth = post_processed_output[0]["predicted_depth"]
+        >>> depth = predicted_depth * 255 / predicted_depth.max()
+        >>> depth = depth.detach().cpu().numpy()
+        >>> depth = Image.fromarray(depth.astype("uint8"))
+        ```"""
+        if labels is not None:
+            raise NotImplementedError("Training is not implemented yet")
+        loss = None
+
+        if return_dict is None:
+            return_dict = self.config.use_return_dict
+        if output_hidden_states is None:
+            output_hidden_states = self.config.output_hidden_states
+        if output_attentions is None:
+            output_attentions = self.config.output_attentions
+
+        # the base model is always called with return_dict=True so its fields can be read by name
+        depth_pro_outputs = self.depth_pro(
+            pixel_values=pixel_values,
+            head_mask=head_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=True,
+        )
+        features = depth_pro_outputs.features
+        predicted_depth = self.head(self.fusion_stage(features)[-1])
+
+        fov = None
+        if self.use_fov_model:
+            # frozen features from encoder are used
+            fov = self.fov_model(
+                pixel_values=pixel_values,
+                global_features=features[0].detach(),
+                head_mask=head_mask,
+            )
+
+        hidden_states, attentions = depth_pro_outputs.hidden_states, depth_pro_outputs.attentions
+        if not return_dict:
+            candidates = (loss, predicted_depth, fov, hidden_states, attentions)
+            return tuple(value for value in candidates if value is not None)
+
+        return DepthProDepthEstimatorOutput(
+            loss=loss,
+            predicted_depth=predicted_depth,
+            field_of_view=fov,
+            hidden_states=hidden_states,
+            attentions=attentions,
+        )
+
+
+__all__ = ["DepthProPreTrainedModel", "DepthProModel", "DepthProForDepthEstimation"]  # names exported by `import *`
diff --git a/phivenv/Lib/site-packages/transformers/models/detr/__init__.py b/phivenv/Lib/site-packages/transformers/models/detr/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..4b8aae5e70381b21772bdec395693c007a9c02b7
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/detr/__init__.py
@@ -0,0 +1,31 @@
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:  # branch seen by static type checkers only; `TYPE_CHECKING` is False at runtime
+    from .configuration_detr import *
+    from .feature_extraction_detr import *
+    from .image_processing_detr import *
+    from .image_processing_detr_fast import *
+    from .modeling_detr import *
+else:  # runtime branch: register a `_LazyModule` under this package's name instead of importing eagerly
+    import sys
+
+    _file = globals()["__file__"]  # path of this `__init__.py`, also passed to `define_import_structure`
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/phivenv/Lib/site-packages/transformers/models/detr/__pycache__/__init__.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/detr/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..620be5df7ee9bd57f5e9a6142d467d3e35f6e2f1
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/detr/__pycache__/__init__.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/detr/__pycache__/configuration_detr.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/detr/__pycache__/configuration_detr.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fd7d2c91e0eb8a60ed5101b26d259b6880c90449
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/detr/__pycache__/configuration_detr.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/detr/__pycache__/feature_extraction_detr.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/detr/__pycache__/feature_extraction_detr.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9d9103c5a0aa814d40f29117d29b0406edba6b5f
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/detr/__pycache__/feature_extraction_detr.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/detr/__pycache__/image_processing_detr.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/detr/__pycache__/image_processing_detr.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3c61873b2c61ae3b4bdddc09a2b667afdaba6207
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/detr/__pycache__/image_processing_detr.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/detr/__pycache__/image_processing_detr_fast.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/detr/__pycache__/image_processing_detr_fast.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f3d2a9f7b815e2a7bee23654eccdd84a6c5a3010
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/detr/__pycache__/image_processing_detr_fast.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/detr/__pycache__/modeling_detr.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/detr/__pycache__/modeling_detr.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e9298af5cd688d7fef2f2b8a03c041a002e77e0f
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/detr/__pycache__/modeling_detr.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/detr/configuration_detr.py b/phivenv/Lib/site-packages/transformers/models/detr/configuration_detr.py
new file mode 100644
index 0000000000000000000000000000000000000000..c9540382927cf2c9849b188ed4b53dea3315f4c4
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/detr/configuration_detr.py
@@ -0,0 +1,297 @@
+# coding=utf-8
+# Copyright 2021 Facebook AI Research and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""DETR model configuration"""
+
+from collections import OrderedDict
+from collections.abc import Mapping
+
+from packaging import version
+
+from ...configuration_utils import PretrainedConfig
+from ...onnx import OnnxConfig
+from ...utils import logging
+from ...utils.backbone_utils import verify_backbone_config_arguments
+from ..auto import CONFIG_MAPPING
+
+
+logger = logging.get_logger(__name__)
+
+
+class DetrConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`DetrModel`]. It is used to instantiate a DETR
+    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+    defaults will yield a similar configuration to that of the DETR
+    [facebook/detr-resnet-50](https://huggingface.co/facebook/detr-resnet-50) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        use_timm_backbone (`bool`, *optional*, defaults to `True`):
+            Whether or not to use the `timm` library for the backbone. If set to `False`, will use the [`AutoBackbone`]
+            API.
+        backbone_config (`PretrainedConfig` or `dict`, *optional*):
+            The configuration of the backbone model. Only used in case `use_timm_backbone` is set to `False` in which
+            case it will default to `ResNetConfig()`.
+        num_channels (`int`, *optional*, defaults to 3):
+            The number of input channels.
+        num_queries (`int`, *optional*, defaults to 100):
+            Number of object queries, i.e. detection slots. This is the maximal number of objects [`DetrModel`] can
+            detect in a single image. For COCO, we recommend 100 queries.
+        d_model (`int`, *optional*, defaults to 256):
+            This parameter is a general dimension parameter, defining dimensions for components such as the encoder layer and projection parameters in the decoder layer, among others.
+        encoder_layers (`int`, *optional*, defaults to 6):
+            Number of encoder layers.
+        decoder_layers (`int`, *optional*, defaults to 6):
+            Number of decoder layers.
+        encoder_attention_heads (`int`, *optional*, defaults to 8):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        decoder_attention_heads (`int`, *optional*, defaults to 8):
+            Number of attention heads for each attention layer in the Transformer decoder.
+        decoder_ffn_dim (`int`, *optional*, defaults to 2048):
+            Dimension of the "intermediate" (often named feed-forward) layer in decoder.
+        encoder_ffn_dim (`int`, *optional*, defaults to 2048):
+            Dimension of the "intermediate" (often named feed-forward) layer in encoder.
+        activation_function (`str` or `function`, *optional*, defaults to `"relu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"silu"` and `"gelu_new"` are supported.
+        dropout (`float`, *optional*, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        activation_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for activations inside the fully connected layer.
+        init_std (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        init_xavier_std (`float`, *optional*, defaults to 1):
+            The scaling factor used for the Xavier initialization gain in the HM Attention map module.
+        encoder_layerdrop (`float`, *optional*, defaults to 0.0):
+            The LayerDrop probability for the encoder. See the [LayerDrop paper](https://huggingface.co/papers/1909.11556)
+            for more details.
+        decoder_layerdrop (`float`, *optional*, defaults to 0.0):
+            The LayerDrop probability for the decoder. See the [LayerDrop paper](https://huggingface.co/papers/1909.11556)
+            for more details.
+        auxiliary_loss (`bool`, *optional*, defaults to `False`):
+            Whether auxiliary decoding losses (loss at each decoder layer) are to be used.
+        position_embedding_type (`str`, *optional*, defaults to `"sine"`):
+            Type of position embeddings to be used on top of the image features. One of `"sine"` or `"learned"`.
+        backbone (`str`, *optional*, defaults to `"resnet50"`):
+            Name of backbone to use when `backbone_config` is `None`. If `use_pretrained_backbone` is `True`, this
+            will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone`
+            is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights.
+        use_pretrained_backbone (`bool`, *optional*, defaults to `True`):
+            Whether to use pretrained weights for the backbone.
+        backbone_kwargs (`dict`, *optional*):
+            Keyword arguments to be passed to AutoBackbone when loading from a checkpoint
+            e.g. `{'out_indices': (0, 1, 2, 3)}`. Cannot be specified if `backbone_config` is set.
+        dilation (`bool`, *optional*, defaults to `False`):
+            Whether to replace stride with dilation in the last convolutional block (DC5). Only supported when
+            `use_timm_backbone` = `True`.
+        class_cost (`float`, *optional*, defaults to 1):
+            Relative weight of the classification error in the Hungarian matching cost.
+        bbox_cost (`float`, *optional*, defaults to 5):
+            Relative weight of the L1 error of the bounding box coordinates in the Hungarian matching cost.
+        giou_cost (`float`, *optional*, defaults to 2):
+            Relative weight of the generalized IoU loss of the bounding box in the Hungarian matching cost.
+        mask_loss_coefficient (`float`, *optional*, defaults to 1):
+            Relative weight of the Focal loss in the panoptic segmentation loss.
+        dice_loss_coefficient (`float`, *optional*, defaults to 1):
+            Relative weight of the DICE/F-1 loss in the panoptic segmentation loss.
+        bbox_loss_coefficient (`float`, *optional*, defaults to 5):
+            Relative weight of the L1 bounding box loss in the object detection loss.
+        giou_loss_coefficient (`float`, *optional*, defaults to 2):
+            Relative weight of the generalized IoU loss in the object detection loss.
+        eos_coefficient (`float`, *optional*, defaults to 0.1):
+            Relative classification weight of the 'no-object' class in the object detection loss.
+
+    Examples:
+
+    ```python
+    >>> from transformers import DetrConfig, DetrModel
+
+    >>> # Initializing a DETR facebook/detr-resnet-50 style configuration
+    >>> configuration = DetrConfig()
+
+    >>> # Initializing a model (with random weights) from the facebook/detr-resnet-50 style configuration
+    >>> model = DetrModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "detr"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    attribute_map = {
+        "hidden_size": "d_model",
+        "num_attention_heads": "encoder_attention_heads",
+    }
+
+    def __init__(
+        self,
+        use_timm_backbone=True,
+        backbone_config=None,
+        num_channels=3,
+        num_queries=100,
+        encoder_layers=6,
+        encoder_ffn_dim=2048,
+        encoder_attention_heads=8,
+        decoder_layers=6,
+        decoder_ffn_dim=2048,
+        decoder_attention_heads=8,
+        encoder_layerdrop=0.0,
+        decoder_layerdrop=0.0,
+        is_encoder_decoder=True,
+        activation_function="relu",
+        d_model=256,
+        dropout=0.1,
+        attention_dropout=0.0,
+        activation_dropout=0.0,
+        init_std=0.02,
+        init_xavier_std=1.0,
+        auxiliary_loss=False,
+        position_embedding_type="sine",
+        backbone="resnet50",
+        use_pretrained_backbone=True,
+        backbone_kwargs=None,
+        dilation=False,
+        class_cost=1,
+        bbox_cost=5,
+        giou_cost=2,
+        mask_loss_coefficient=1,
+        dice_loss_coefficient=1,
+        bbox_loss_coefficient=5,
+        giou_loss_coefficient=2,
+        eos_coefficient=0.1,
+        **kwargs,
+    ):
+        # We default to values which were previously hard-coded in the model. This enables configurability of the config
+        # while keeping the default behavior the same.
+        if use_timm_backbone and backbone_kwargs is None:
+            backbone_kwargs = {}
+            if dilation:
+                backbone_kwargs["output_stride"] = 16
+            backbone_kwargs["out_indices"] = [1, 2, 3, 4]
+            backbone_kwargs["in_chans"] = num_channels
+        # Backwards compatibility
+        elif not use_timm_backbone and backbone in (None, "resnet50"):
+            if backbone_config is None:
+                logger.info("`backbone_config` is `None`. Initializing the config with the default `ResNet` backbone.")
+                backbone_config = CONFIG_MAPPING["resnet"](out_features=["stage4"])
+            elif isinstance(backbone_config, dict):
+                backbone_model_type = backbone_config.get("model_type")
+                config_class = CONFIG_MAPPING[backbone_model_type]
+                backbone_config = config_class.from_dict(backbone_config)
+            backbone = None
+            # set timm attributes to None
+            dilation = None
+
+        verify_backbone_config_arguments(
+            use_timm_backbone=use_timm_backbone,
+            use_pretrained_backbone=use_pretrained_backbone,
+            backbone=backbone,
+            backbone_config=backbone_config,
+            backbone_kwargs=backbone_kwargs,
+        )
+
+        self.use_timm_backbone = use_timm_backbone
+        self.backbone_config = backbone_config
+        self.num_channels = num_channels
+        self.num_queries = num_queries
+        self.d_model = d_model
+        self.encoder_ffn_dim = encoder_ffn_dim
+        self.encoder_layers = encoder_layers
+        self.encoder_attention_heads = encoder_attention_heads
+        self.decoder_ffn_dim = decoder_ffn_dim
+        self.decoder_layers = decoder_layers
+        self.decoder_attention_heads = decoder_attention_heads
+        self.dropout = dropout
+        self.attention_dropout = attention_dropout
+        self.activation_dropout = activation_dropout
+        self.activation_function = activation_function
+        self.init_std = init_std
+        self.init_xavier_std = init_xavier_std
+        self.encoder_layerdrop = encoder_layerdrop
+        self.decoder_layerdrop = decoder_layerdrop
+        self.num_hidden_layers = encoder_layers
+        self.auxiliary_loss = auxiliary_loss
+        self.position_embedding_type = position_embedding_type
+        self.backbone = backbone
+        self.use_pretrained_backbone = use_pretrained_backbone
+        self.backbone_kwargs = backbone_kwargs
+        self.dilation = dilation
+        # Hungarian matcher
+        self.class_cost = class_cost
+        self.bbox_cost = bbox_cost
+        self.giou_cost = giou_cost
+        # Loss coefficients
+        self.mask_loss_coefficient = mask_loss_coefficient
+        self.dice_loss_coefficient = dice_loss_coefficient
+        self.bbox_loss_coefficient = bbox_loss_coefficient
+        self.giou_loss_coefficient = giou_loss_coefficient
+        self.eos_coefficient = eos_coefficient
+        super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs)
+
+    @property
+    def num_attention_heads(self) -> int:
+        return self.encoder_attention_heads
+
+    @property
+    def hidden_size(self) -> int:
+        return self.d_model
+
+    @property
+    def sub_configs(self):
+        return (
+            {"backbone_config": type(self.backbone_config)}
+            if getattr(self, "backbone_config", None) is not None
+            else {}
+        )
+
+    @classmethod
+    def from_backbone_config(cls, backbone_config: PretrainedConfig, **kwargs):
+        """Instantiate a [`DetrConfig`] (or a derived class) from a pre-trained backbone model configuration.
+
+        Args:
+            backbone_config ([`PretrainedConfig`]):
+                The backbone configuration.
+        Returns:
+            [`DetrConfig`]: An instance of a configuration object
+        """
+        return cls(backbone_config=backbone_config, **kwargs)
+
+
+class DetrOnnxConfig(OnnxConfig):
+    torch_onnx_minimum_version = version.parse("1.11")
+
+    @property
+    def inputs(self) -> Mapping[str, Mapping[int, str]]:
+        return OrderedDict(
+            [
+                ("pixel_values", {0: "batch", 1: "num_channels", 2: "height", 3: "width"}),
+                ("pixel_mask", {0: "batch"}),
+            ]
+        )
+
+    @property
+    def atol_for_validation(self) -> float:
+        return 1e-5
+
+    @property
+    def default_onnx_opset(self) -> int:
+        return 12
+
+
+__all__ = ["DetrConfig", "DetrOnnxConfig"]
diff --git a/phivenv/Lib/site-packages/transformers/models/detr/feature_extraction_detr.py b/phivenv/Lib/site-packages/transformers/models/detr/feature_extraction_detr.py
new file mode 100644
index 0000000000000000000000000000000000000000..a81f83c8c313bdb8a904f0b359360c0e100a83d9
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/detr/feature_extraction_detr.py
@@ -0,0 +1,48 @@
+# coding=utf-8
+# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Feature extractor class for DETR."""
+
+import warnings
+
+from ...image_transforms import rgb_to_id as _rgb_to_id
+from ...utils import logging
+from ...utils.import_utils import requires
+from .image_processing_detr import DetrImageProcessor
+
+
+logger = logging.get_logger(__name__)
+
+
+def rgb_to_id(x):
+    warnings.warn(
+        "rgb_to_id has moved and will not be importable from this module from v5. "
+        "Please import from transformers.image_transforms instead.",
+        FutureWarning,
+    )
+    return _rgb_to_id(x)
+
+
+@requires(backends=("vision",))
+class DetrFeatureExtractor(DetrImageProcessor):
+    def __init__(self, *args, **kwargs) -> None:
+        warnings.warn(
+            "The class DetrFeatureExtractor is deprecated and will be removed in version 5 of Transformers."
+            " Please use DetrImageProcessor instead.",
+            FutureWarning,
+        )
+        super().__init__(*args, **kwargs)
+
+
+__all__ = ["DetrFeatureExtractor"]
diff --git a/phivenv/Lib/site-packages/transformers/models/detr/image_processing_detr.py b/phivenv/Lib/site-packages/transformers/models/detr/image_processing_detr.py
new file mode 100644
index 0000000000000000000000000000000000000000..fe4d85304ab47e0fe7b3d10fd1e5bb993c2f2a9e
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/detr/image_processing_detr.py
@@ -0,0 +1,2049 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Image processor class for DETR."""
+
+import io
+import pathlib
+from collections import defaultdict
+from collections.abc import Iterable
+from typing import Any, Callable, Optional, Union
+
+import numpy as np
+
+from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
+from ...image_transforms import (
+    PaddingMode,
+    center_to_corners_format,
+    corners_to_center_format,
+    id_to_rgb,
+    pad,
+    rescale,
+    resize,
+    rgb_to_id,
+    to_channel_dimension_format,
+)
+from ...image_utils import (
+    IMAGENET_DEFAULT_MEAN,
+    IMAGENET_DEFAULT_STD,
+    AnnotationFormat,
+    AnnotationType,
+    ChannelDimension,
+    ImageInput,
+    PILImageResampling,
+    get_image_size,
+    infer_channel_dimension_format,
+    is_scaled_image,
+    make_list_of_images,
+    to_numpy_array,
+    valid_images,
+    validate_annotations,
+    validate_kwargs,
+    validate_preprocess_arguments,
+)
+from ...utils import (
+    TensorType,
+    is_flax_available,
+    is_jax_tensor,
+    is_scipy_available,
+    is_tf_available,
+    is_tf_tensor,
+    is_torch_available,
+    is_torch_tensor,
+    is_vision_available,
+    logging,
+)
+from ...utils.import_utils import requires
+
+
+if is_torch_available():
+    import torch
+    from torch import nn
+
+
+if is_vision_available():
+    import PIL
+
+
+if is_scipy_available():
+    import scipy.special
+    import scipy.stats
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC)
+
+
+# From the original repo: https://github.com/facebookresearch/detr/blob/3af9fa878e73b6894ce3596450a8d9b89d918ca9/datasets/transforms.py#L76
+def get_size_with_aspect_ratio(image_size, size, max_size=None) -> tuple[int, int]:
+    """
+    Computes the output image size given the input image size and the desired output size.
+
+    Args:
+        image_size (`tuple[int, int]`):
+            The input image size.
+        size (`int`):
+            The desired output size.
+        max_size (`int`, *optional*):
+            The maximum allowed output size.
+    """
+    height, width = image_size
+    raw_size = None
+    if max_size is not None:
+        min_original_size = float(min((height, width)))
+        max_original_size = float(max((height, width)))
+        if max_original_size / min_original_size * size > max_size:
+            raw_size = max_size * min_original_size / max_original_size
+            size = int(round(raw_size))
+
+    if (height <= width and height == size) or (width <= height and width == size):
+        oh, ow = height, width
+    elif width < height:
+        ow = size
+        if max_size is not None and raw_size is not None:
+            oh = int(raw_size * height / width)
+        else:
+            oh = int(size * height / width)
+    else:
+        oh = size
+        if max_size is not None and raw_size is not None:
+            ow = int(raw_size * width / height)
+        else:
+            ow = int(size * width / height)
+
+    return (oh, ow)
+
+
+def get_image_size_for_max_height_width(
+    input_image: np.ndarray,
+    max_height: int,
+    max_width: int,
+    input_data_format: Optional[Union[str, ChannelDimension]] = None,
+) -> tuple[int, int]:
+    """
+    Computes the output image size given the input image and the maximum allowed height and width. Keep aspect ratio.
+    Important: even if image_height < max_height and image_width < max_width, the image will be resized
+    so that at least one of its edges is equal to max_height or max_width.
+
+    For example:
+        - input_size: (100, 200), max_height: 50, max_width: 50 -> output_size: (25, 50)
+        - input_size: (100, 200), max_height: 200, max_width: 500 -> output_size: (200, 400)
+
+    Args:
+        input_image (`np.ndarray`):
+            The image to resize.
+        max_height (`int`):
+            The maximum allowed height.
+        max_width (`int`):
+            The maximum allowed width.
+        input_data_format (`ChannelDimension` or `str`, *optional*):
+            The channel dimension format of the input image. If not provided, it will be inferred from the input image.
+    """
+    image_size = get_image_size(input_image, input_data_format)
+    height, width = image_size
+    height_scale = max_height / height
+    width_scale = max_width / width
+    min_scale = min(height_scale, width_scale)
+    new_height = int(height * min_scale)
+    new_width = int(width * min_scale)
+    return new_height, new_width
+
+
+def get_resize_output_image_size(
+    input_image: np.ndarray,
+    size: Union[int, tuple[int, int], list[int]],
+    max_size: Optional[int] = None,
+    input_data_format: Optional[Union[str, ChannelDimension]] = None,
+) -> tuple[int, int]:
+    """
+    Computes the output image size given the input image size and the desired output size. If the desired output size
+    is a tuple or list, the output image size is returned as is. If the desired output size is an integer, the output
+    image size is computed by keeping the aspect ratio of the input image size.
+
+    Args:
+        input_image (`np.ndarray`):
+            The image to resize.
+        size (`int` or `tuple[int, int]` or `list[int]`):
+            The desired output size.
+        max_size (`int`, *optional*):
+            The maximum allowed output size.
+        input_data_format (`ChannelDimension` or `str`, *optional*):
+            The channel dimension format of the input image. If not provided, it will be inferred from the input image.
+    """
+    image_size = get_image_size(input_image, input_data_format)
+    if isinstance(size, (list, tuple)):
+        return size
+
+    return get_size_with_aspect_ratio(image_size, size, max_size)
+
+
+def get_numpy_to_framework_fn(arr) -> Callable:
+    """
+    Returns a function that converts a numpy array to the framework of the input array.
+
+    Args:
+        arr (`np.ndarray`): The array to convert.
+    """
+    if isinstance(arr, np.ndarray):
+        return np.array
+    if is_tf_available() and is_tf_tensor(arr):
+        import tensorflow as tf
+
+        return tf.convert_to_tensor
+    if is_torch_available() and is_torch_tensor(arr):
+        import torch
+
+        return torch.tensor
+    if is_flax_available() and is_jax_tensor(arr):
+        import jax.numpy as jnp
+
+        return jnp.array
+    raise ValueError(f"Cannot convert arrays of type {type(arr)}")
+
+
+def safe_squeeze(arr: np.ndarray, axis: Optional[int] = None) -> np.ndarray:
+    """
+    Squeezes an array, but only if the axis specified has dim 1.
+    """
+    if axis is None:
+        return arr.squeeze()
+
+    try:
+        return arr.squeeze(axis=axis)
+    except ValueError:
+        return arr
+
+
+def normalize_annotation(annotation: dict, image_size: tuple[int, int]) -> dict:
+    image_height, image_width = image_size
+    norm_annotation = {}
+    for key, value in annotation.items():
+        if key == "boxes":
+            boxes = value
+            boxes = corners_to_center_format(boxes)
+            boxes /= np.asarray([image_width, image_height, image_width, image_height], dtype=np.float32)
+            norm_annotation[key] = boxes
+        else:
+            norm_annotation[key] = value
+    return norm_annotation
+
+
+# Copied from transformers.models.vilt.image_processing_vilt.max_across_indices
+def max_across_indices(values: Iterable[Any]) -> list[Any]:
+    """
+    Return the maximum value across all indices of an iterable of values.
+    """
+    return [max(values_i) for values_i in zip(*values)]
+
+
+# Copied from transformers.models.vilt.image_processing_vilt.get_max_height_width
+def get_max_height_width(
+    images: list[np.ndarray], input_data_format: Optional[Union[str, ChannelDimension]] = None
+) -> list[int]:
+    """
+    Get the maximum height and width across all images in a batch.
+    """
+    if input_data_format is None:
+        input_data_format = infer_channel_dimension_format(images[0])
+
+    if input_data_format == ChannelDimension.FIRST:
+        _, max_height, max_width = max_across_indices([img.shape for img in images])
+    elif input_data_format == ChannelDimension.LAST:
+        max_height, max_width, _ = max_across_indices([img.shape for img in images])
+    else:
+        raise ValueError(f"Invalid channel dimension format: {input_data_format}")
+    return (max_height, max_width)
+
+
+# Copied from transformers.models.vilt.image_processing_vilt.make_pixel_mask
+def make_pixel_mask(
+    image: np.ndarray, output_size: tuple[int, int], input_data_format: Optional[Union[str, ChannelDimension]] = None
+) -> np.ndarray:
+    """
+    Make a pixel mask for the image, where 1 indicates a valid pixel and 0 indicates padding.
+
+    Args:
+        image (`np.ndarray`):
+            Image to make the pixel mask for.
+        output_size (`tuple[int, int]`):
+            Output size of the mask.
+    """
+    input_height, input_width = get_image_size(image, channel_dim=input_data_format)
+    mask = np.zeros(output_size, dtype=np.int64)
+    mask[:input_height, :input_width] = 1
+    return mask
+
+
+# inspired by https://github.com/facebookresearch/detr/blob/master/datasets/coco.py#L33
+def convert_coco_poly_to_mask(segmentations, height: int, width: int) -> np.ndarray:
+    """
+    Convert a COCO polygon annotation to a mask.
+
+    Args:
+        segmentations (`list[list[float]]`):
+            List of polygons, each polygon represented by a list of x-y coordinates.
+        height (`int`):
+            Height of the mask.
+        width (`int`):
+            Width of the mask.
+    """
+    try:
+        from pycocotools import mask as coco_mask
+    except ImportError:
+        raise ImportError("Pycocotools is not installed in your environment.")
+
+    masks = []
+    for polygons in segmentations:
+        rles = coco_mask.frPyObjects(polygons, height, width)
+        mask = coco_mask.decode(rles)
+        if len(mask.shape) < 3:
+            mask = mask[..., None]
+        mask = np.asarray(mask, dtype=np.uint8)
+        mask = np.any(mask, axis=2)
+        masks.append(mask)
+    if masks:
+        masks = np.stack(masks, axis=0)
+    else:
+        masks = np.zeros((0, height, width), dtype=np.uint8)
+
+    return masks
+
+
+# inspired by https://github.com/facebookresearch/detr/blob/master/datasets/coco.py#L50
+def prepare_coco_detection_annotation(
+    image,
+    target,
+    return_segmentation_masks: bool = False,
+    input_data_format: Optional[Union[ChannelDimension, str]] = None,
+):
+    """
+    Convert the target in COCO format into the format expected by DETR.
+    """
+    image_height, image_width = get_image_size(image, channel_dim=input_data_format)
+
+    image_id = target["image_id"]
+    image_id = np.asarray([image_id], dtype=np.int64)
+
+    # Get all COCO annotations for the given image.
+    annotations = target["annotations"]
+    annotations = [obj for obj in annotations if "iscrowd" not in obj or obj["iscrowd"] == 0]
+
+    classes = [obj["category_id"] for obj in annotations]
+    classes = np.asarray(classes, dtype=np.int64)
+
+    # for conversion to coco api
+    area = np.asarray([obj["area"] for obj in annotations], dtype=np.float32)
+    iscrowd = np.asarray([obj.get("iscrowd", 0) for obj in annotations], dtype=np.int64)
+
+    boxes = [obj["bbox"] for obj in annotations]
+    # guard against no boxes: reshape so an empty list still yields a (0, 4) array
+    boxes = np.asarray(boxes, dtype=np.float32).reshape(-1, 4)
+    boxes[:, 2:] += boxes[:, :2]
+    boxes[:, 0::2] = boxes[:, 0::2].clip(min=0, max=image_width)
+    boxes[:, 1::2] = boxes[:, 1::2].clip(min=0, max=image_height)
+
+    keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0])
+
+    new_target = {}
+    new_target["image_id"] = image_id
+    new_target["class_labels"] = classes[keep]
+    new_target["boxes"] = boxes[keep]
+    new_target["area"] = area[keep]
+    new_target["iscrowd"] = iscrowd[keep]
+    new_target["orig_size"] = np.asarray([int(image_height), int(image_width)], dtype=np.int64)
+
+    if annotations and "keypoints" in annotations[0]:
+        keypoints = [obj["keypoints"] for obj in annotations]
+        # Converting the filtered keypoints list to a numpy array
+        keypoints = np.asarray(keypoints, dtype=np.float32)
+        # Apply the keep mask here to filter the relevant annotations
+        keypoints = keypoints[keep]
+        num_keypoints = keypoints.shape[0]
+        keypoints = keypoints.reshape((-1, 3)) if num_keypoints else keypoints
+        new_target["keypoints"] = keypoints
+
+    if return_segmentation_masks:
+        segmentation_masks = [obj["segmentation"] for obj in annotations]
+        masks = convert_coco_poly_to_mask(segmentation_masks, image_height, image_width)
+        new_target["masks"] = masks[keep]
+
+    return new_target
+
+
+def masks_to_boxes(masks: np.ndarray) -> np.ndarray:
+    """
+    Compute the bounding boxes around the provided panoptic segmentation masks.
+
+    Args:
+        masks: masks in format `[number_masks, height, width]` where `number_masks` is the number of masks
+
+    Returns:
+        boxes: bounding boxes in format `[number_masks, 4]` in xyxy format
+    """
+    if masks.size == 0:
+        return np.zeros((0, 4))
+
+    h, w = masks.shape[-2:]
+    y = np.arange(0, h, dtype=np.float32)
+    x = np.arange(0, w, dtype=np.float32)
+    # see https://github.com/pytorch/pytorch/issues/50276
+    y, x = np.meshgrid(y, x, indexing="ij")
+
+    x_mask = masks * np.expand_dims(x, axis=0)
+    x_max = x_mask.reshape(x_mask.shape[0], -1).max(-1)
+    x = np.ma.array(x_mask, mask=~(np.array(masks, dtype=bool)))
+    x_min = x.filled(fill_value=1e8)
+    x_min = x_min.reshape(x_min.shape[0], -1).min(-1)
+
+    y_mask = masks * np.expand_dims(y, axis=0)
+    y_max = y_mask.reshape(x_mask.shape[0], -1).max(-1)
+    y = np.ma.array(y_mask, mask=~(np.array(masks, dtype=bool)))
+    y_min = y.filled(fill_value=1e8)
+    y_min = y_min.reshape(y_min.shape[0], -1).min(-1)
+
+    return np.stack([x_min, y_min, x_max, y_max], 1)
+
+
+def prepare_coco_panoptic_annotation(
+    image: np.ndarray,
+    target: dict,
+    masks_path: Union[str, pathlib.Path],
+    return_masks: bool = True,
+    input_data_format: Optional[Union[ChannelDimension, str]] = None,
+) -> dict:
+    """
+    Prepare a coco panoptic annotation for DETR.
+
+    Args:
+        image (`np.ndarray`):
+            The image the annotation belongs to; only its height and width are read.
+        target (`dict`):
+            The raw annotation. Must contain `"file_name"` and either `"image_id"` or `"id"`. When it contains
+            `"segments_info"`, each entry must provide `"id"`, `"category_id"`, `"iscrowd"` and `"area"`.
+        masks_path (`str` or `pathlib.Path`):
+            Directory holding the panoptic PNG referenced by `target["file_name"]`.
+        return_masks (`bool`, *optional*, defaults to `True`):
+            Whether to include the per-segment binary masks under `"masks"`.
+        input_data_format (`ChannelDimension` or `str`, *optional*):
+            Channel dimension format of `image`, forwarded to `get_image_size`.
+
+    Returns:
+        `dict`: A new annotation with `"image_id"`, `"size"` and `"orig_size"`, plus `"boxes"`, `"class_labels"`,
+        `"iscrowd"`, `"area"` (and optionally `"masks"`) when `target` carries `"segments_info"`.
+    """
+    image_height, image_width = get_image_size(image, channel_dim=input_data_format)
+    annotation_path = pathlib.Path(masks_path) / target["file_name"]
+
+    new_target = {}
+    new_target["image_id"] = np.asarray([target["image_id"] if "image_id" in target else target["id"]], dtype=np.int64)
+    new_target["size"] = np.asarray([image_height, image_width], dtype=np.int64)
+    new_target["orig_size"] = np.asarray([image_height, image_width], dtype=np.int64)
+
+    if "segments_info" in target:
+        segments_info = target["segments_info"]
+
+        # Open the PNG in a context manager so the file handle is released as soon as the pixels are copied.
+        with PIL.Image.open(annotation_path) as mask_image:
+            masks = np.asarray(mask_image, dtype=np.uint32)
+        masks = rgb_to_id(masks)
+
+        # Broadcast the id map against every segment id to get one binary mask per segment.
+        ids = np.array([segment_info["id"] for segment_info in segments_info])
+        masks = masks == ids[:, None, None]
+        masks = masks.astype(np.uint8)
+        if return_masks:
+            new_target["masks"] = masks
+        new_target["boxes"] = masks_to_boxes(masks)
+        new_target["class_labels"] = np.array(
+            [segment_info["category_id"] for segment_info in segments_info], dtype=np.int64
+        )
+        new_target["iscrowd"] = np.asarray(
+            [segment_info["iscrowd"] for segment_info in segments_info], dtype=np.int64
+        )
+        new_target["area"] = np.asarray(
+            [segment_info["area"] for segment_info in segments_info], dtype=np.float32
+        )
+
+    return new_target
+
+
+def get_segmentation_image(
+    masks: np.ndarray, input_size: tuple, target_size: tuple, stuff_equiv_classes, deduplicate=False
+):
+    """
+    Build an RGB-encoded segmentation image by assigning every pixel to its highest-scoring mask.
+
+    Args:
+        masks (`np.ndarray`):
+            Mask logits flattened to shape `(num_masks, height * width)`.
+        input_size (`tuple`):
+            `(height, width)` the flattened masks were computed at.
+        target_size (`tuple`):
+            `(height, width)` of the returned image.
+        stuff_equiv_classes:
+            Mapping whose values are lists of mask indices belonging to the same stuff class.
+        deduplicate (`bool`, *optional*, defaults to `False`):
+            Whether to merge all masks of an equivalence list into the first index of that list.
+
+    Returns:
+        The id map encoded by `id_to_rgb` and resized to `target_size` with nearest-neighbour resampling.
+    """
+    h, w = input_size
+    final_h, final_w = target_size
+
+    if masks.shape[0] == 0:
+        # We didn't detect any mask :( -- checked up front because a softmax over a zero-length axis is undefined.
+        m_id = np.zeros((h, w), dtype=np.int64)
+    else:
+        # NumPy's `transpose(0, 1)` is the identity permutation (unlike torch), so swap the axes explicitly: the
+        # softmax/argmax must compete across masks for each pixel, not across pixels for each mask.
+        m_id = scipy.special.softmax(np.swapaxes(masks, 0, 1), -1)
+        m_id = m_id.argmax(-1).reshape(h, w)
+
+    if deduplicate:
+        # Merge the masks corresponding to the same stuff class
+        for equiv in stuff_equiv_classes.values():
+            for eq_id in equiv:
+                m_id[m_id == eq_id] = equiv[0]
+
+    seg_img = id_to_rgb(m_id)
+    # `resize` takes `(height, width)`, matching how every other caller in this module invokes it.
+    seg_img = resize(seg_img, (final_h, final_w), resample=PILImageResampling.NEAREST)
+    return seg_img
+
+
+def get_mask_area(seg_img: np.ndarray, target_size: tuple[int, int], n_classes: int) -> np.ndarray:
+    """
+    Count how many pixels of an RGB-encoded segmentation image belong to each segment id.
+
+    Args:
+        seg_img: RGB-encoded segmentation image holding `height * width * 3` values.
+        target_size: `(height, width)` used to reshape `seg_img`.
+        n_classes: number of segment ids to count, starting at 0.
+
+    Returns:
+        A list with one pixel count per id in `range(n_classes)`.
+    """
+    height, width = target_size
+    rgb = seg_img.astype(np.uint8).reshape(height, width, 3)
+    segment_ids = rgb_to_id(rgb)
+
+    areas = []
+    for segment in range(n_classes):
+        areas.append((segment_ids == segment).sum())
+    return areas
+
+
+def score_labels_from_class_probabilities(logits: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
+    """
+    Turn class logits into the most likely label and its probability.
+
+    Args:
+        logits: array whose last axis enumerates the classes.
+
+    Returns:
+        `(scores, labels)`, both shaped like `logits` without its last axis: the softmax probability of the
+        winning class and the index of that class.
+    """
+    probabilities = scipy.special.softmax(logits, axis=-1)
+    # Keep the class axis while gathering so `take_along_axis` can index with it.
+    winners = np.expand_dims(probabilities.argmax(-1), -1)
+    winning_scores = np.take_along_axis(probabilities, winners, axis=-1)
+    return winning_scores.squeeze(-1), winners.squeeze(-1)
+
+
+def post_process_panoptic_sample(
+    out_logits: np.ndarray,
+    masks: np.ndarray,
+    boxes: np.ndarray,
+    processed_size: tuple[int, int],
+    target_size: tuple[int, int],
+    is_thing_map: dict,
+    threshold=0.85,
+) -> dict:
+    """
+    Converts the output of [`DetrForSegmentation`] into panoptic segmentation predictions for a single sample.
+
+    Args:
+        out_logits (`np.ndarray`):
+            The logits for this sample.
+        masks (`np.ndarray`):
+            The predicted segmentation masks for this sample.
+        boxes (`np.ndarray`):
+            The predicted bounding boxes for this sample. The boxes are in the normalized format `(center_x, center_y,
+            width, height)` and values between `[0, 1]`, relative to the size the image (disregarding padding).
+        processed_size (`tuple[int, int]`):
+            The processed size of the image `(height, width)`, as returned by the preprocessing step i.e. the size
+            after data augmentation but before batching.
+        target_size (`tuple[int, int]`):
+            The target size of the image, `(height, width)` corresponding to the requested final size of the
+            prediction.
+        is_thing_map (`Dict`):
+            A dictionary mapping class indices to a boolean value indicating whether the class is a thing or not.
+        threshold (`float`, *optional*, defaults to 0.85):
+            The minimum class probability a query needs in order to be kept.
+
+    Returns:
+        `dict`: `{"png_string": bytes, "segments_info": list[dict]}` where `png_string` is the PNG-encoded
+        segmentation image and each segment entry holds `"id"`, `"isthing"`, `"category_id"` and `"area"`.
+
+    Raises:
+        `ValueError`: If the number of kept boxes differs from the number of kept classes.
+    """
+    # we filter empty queries and detection below threshold
+    scores, labels = score_labels_from_class_probabilities(out_logits)
+    keep = (labels != out_logits.shape[-1] - 1) & (scores > threshold)
+
+    cur_scores = scores[keep]
+    cur_classes = labels[keep]
+    cur_boxes = center_to_corners_format(boxes[keep])
+
+    if len(cur_boxes) != len(cur_classes):
+        raise ValueError("Not as many boxes as there are classes")
+
+    cur_masks = masks[keep]
+    cur_masks = resize(cur_masks[:, None], processed_size, resample=PILImageResampling.BILINEAR)
+    cur_masks = safe_squeeze(cur_masks, 1)
+    b, h, w = cur_masks.shape
+    cur_masks = cur_masks.reshape(b, -1)
+
+    def _segment(current_masks, current_classes):
+        # It may be that we have several predicted masks for the same stuff class. Track the mask indices of each
+        # stuff class so they can be merged. This is rebuilt on every call because filtering shifts the indices.
+        stuff_equiv_classes = defaultdict(list)
+        for k, label in enumerate(current_classes):
+            if not is_thing_map[label]:
+                stuff_equiv_classes[label].append(k)
+        image = get_segmentation_image(current_masks, (h, w), target_size, stuff_equiv_classes, deduplicate=True)
+        # Areas are measured on the final image, hence `target_size` rather than the processed size.
+        return image, get_mask_area(image, target_size, n_classes=len(current_classes))
+
+    seg_img, area = _segment(cur_masks, cur_classes)
+
+    # We filter out any mask that is too small; `size` is an attribute on NumPy arrays, not a method.
+    if cur_classes.size > 0:
+        # We now filter empty masks as long as we find some
+        filtered_small = np.array([a <= 4 for a in area], dtype=bool)
+        while filtered_small.any():
+            cur_masks = cur_masks[~filtered_small]
+            cur_scores = cur_scores[~filtered_small]
+            cur_classes = cur_classes[~filtered_small]
+            seg_img, area = _segment(cur_masks, cur_classes)
+            filtered_small = np.array([a <= 4 for a in area], dtype=bool)
+
+    # With no surviving query `area` is empty, so this list is empty as well.
+    segments_info = [
+        {"id": i, "isthing": is_thing_map[cat], "category_id": int(cat), "area": a}
+        for i, (cat, a) in enumerate(zip(cur_classes, area))
+    ]
+
+    with io.BytesIO() as out:
+        PIL.Image.fromarray(seg_img).save(out, format="PNG")
+        predictions = {"png_string": out.getvalue(), "segments_info": segments_info}
+
+    return predictions
+
+
+def resize_annotation(
+    annotation: dict[str, Any],
+    orig_size: tuple[int, int],
+    target_size: tuple[int, int],
+    threshold: float = 0.5,
+    resample: PILImageResampling = PILImageResampling.NEAREST,
+):
+    """
+    Scale an annotation so it lines up with an image resized from `orig_size` to `target_size`.
+
+    Args:
+        annotation (`dict[str, Any]`):
+            The annotation dictionary. `"boxes"`, `"area"`, `"masks"` and `"size"` are rewritten; every other key is
+            passed through unchanged.
+        orig_size (`tuple[int, int]`):
+            The `(height, width)` of the image before resizing.
+        target_size (`tuple[int, int]`):
+            The `(height, width)` of the image after resizing.
+        threshold (`float`, *optional*, defaults to 0.5):
+            The threshold used to binarize the resized segmentation masks.
+        resample (`PILImageResampling`, defaults to `PILImageResampling.NEAREST`):
+            The resampling filter to use when resizing the masks.
+
+    Returns:
+        A new dictionary; the input `annotation` dictionary itself is not modified.
+    """
+    height_scale, width_scale = (float(new) / float(old) for new, old in zip(target_size, orig_size))
+
+    resized = {"size": target_size}
+    for field, payload in annotation.items():
+        if field == "size":
+            resized["size"] = target_size
+            continue
+
+        if field == "boxes":
+            # Boxes are laid out as x, y, x, y, so widths and heights alternate.
+            box_scale = np.asarray([width_scale, height_scale, width_scale, height_scale], dtype=np.float32)
+            resized["boxes"] = payload * box_scale
+        elif field == "area":
+            resized["area"] = payload * (width_scale * height_scale)
+        elif field == "masks":
+            # Give each mask a leading channel axis for `resize`, then drop it again after binarizing.
+            per_mask = [resize(single, target_size, resample=resample) for single in payload[:, None]]
+            as_float = np.array(per_mask).astype(np.float32)
+            resized["masks"] = as_float[:, 0] > threshold
+        else:
+            resized[field] = payload
+
+    return resized
+
+
+# TODO - (Amy) make compatible with other frameworks
+def binary_mask_to_rle(mask):
+    """
+    Encode a binary mask of shape `(height, width)` in the run-length encoding (RLE) format.
+
+    Args:
+        mask (`torch.Tensor` or `numpy.array`):
+            A binary mask tensor of shape `(height, width)` where 0 denotes background and 1 denotes the target
+            segment_id or class_id.
+    Returns:
+        `List`: Run-length encoded list of the binary mask, as alternating `start, length` values over the
+        flattened mask (starts are 1-based). Refer to COCO API for more information about the RLE format.
+    """
+    if is_torch_tensor(mask):
+        mask = mask.numpy()
+
+    # Zero-pad both ends so runs touching the borders still produce a start and an end transition.
+    padded = np.concatenate([[0], mask.flatten(), [0]])
+    transitions = np.flatnonzero(padded[1:] != padded[:-1]) + 1
+    # Even slots hold run starts, odd slots hold run ends; turn each end into a length.
+    transitions[1::2] -= transitions[::2]
+    return list(transitions)
+
+
+# TODO - (Amy) make compatible with other frameworks
+def convert_segmentation_to_rle(segmentation):
+    """
+    Run-length encode every segment of a segmentation map of shape `(height, width)`.
+
+    Args:
+        segmentation (`torch.Tensor` or `numpy.array`):
+            A segmentation map of shape `(height, width)` where each value denotes a segment or class id.
+    Returns:
+        `list[List]`: One run-length encoding per distinct id, in the order returned by `torch.unique`.
+    """
+    return [
+        binary_mask_to_rle(torch.where(segmentation == segment_id, 1, 0))
+        for segment_id in torch.unique(segmentation)
+    ]
+
+
+def remove_low_and_no_objects(masks, scores, labels, object_mask_threshold, num_labels):
+    """
+    Drop every query that predicts the "no object" label or whose score does not exceed `object_mask_threshold`,
+    returning the surviving entries of `masks`, `scores` and `labels`.
+
+    Args:
+        masks (`torch.Tensor`):
+            A tensor of shape `(num_queries, height, width)`.
+        scores (`torch.Tensor`):
+            A tensor of shape `(num_queries)`.
+        labels (`torch.Tensor`):
+            A tensor of shape `(num_queries)`.
+        object_mask_threshold (`float`):
+            Queries are kept only when their score is strictly greater than this value.
+        num_labels (`int`):
+            The label value that is discarded (queries whose label equals it are removed).
+    Raises:
+        `ValueError`: Raised when the first dimension doesn't match in all input tensors.
+    Returns:
+        `tuple[`torch.Tensor`, `torch.Tensor`, `torch.Tensor`]`: The `masks`, `scores` and `labels` without the region
+        < `object_mask_threshold`.
+    """
+    num_queries = masks.shape[0]
+    if num_queries != scores.shape[0] or scores.shape[0] != labels.shape[0]:
+        raise ValueError("mask, scores and labels must have the same shape!")
+
+    is_object = labels != num_labels
+    is_confident = scores > object_mask_threshold
+    selected = is_object & is_confident
+
+    return masks[selected], scores[selected], labels[selected]
+
+
+def check_segment_validity(mask_labels, mask_probs, k, mask_threshold=0.5, overlap_mask_area_threshold=0.8):
+    """
+    Decide whether query `k` yields a usable segment.
+
+    Args:
+        mask_labels: per-pixel index of the winning query.
+        mask_probs: per-query mask probabilities, indexed by query first.
+        k: index of the query to check.
+        mask_threshold: probability at or above which a pixel counts towards the query's own area.
+        overlap_mask_area_threshold: minimum fraction of the query's own area that it must actually win.
+
+    Returns:
+        `(mask_exists, mask_k)`: a truth value telling whether the segment is kept, and the boolean mask of the
+        pixels won by query `k`.
+    """
+    # Pixels that query k actually won
+    segment_mask = mask_labels == k
+    won_area = segment_mask.sum()
+
+    # Pixels that query k claims on its own, ignoring competition from other queries
+    claimed_area = (mask_probs[k] >= mask_threshold).sum()
+    is_valid = won_area > 0 and claimed_area > 0
+
+    # Eliminate disconnected tiny segments
+    if is_valid and not (won_area / claimed_area).item() > overlap_mask_area_threshold:
+        is_valid = False
+
+    return is_valid, segment_mask
+
+
+def compute_segments(
+    mask_probs,
+    pred_scores,
+    pred_labels,
+    mask_threshold: float = 0.5,
+    overlap_mask_area_threshold: float = 0.8,
+    label_ids_to_fuse: Optional[set[int]] = None,
+    target_size: Optional[tuple[int, int]] = None,
+):
+    """
+    Assemble a segmentation map and its segment descriptions from per-query mask probabilities.
+
+    Args:
+        mask_probs: per-query mask probabilities of shape `(num_queries, height, width)`. Not modified.
+        pred_scores: one score per query, used to weigh that query's mask.
+        pred_labels: one predicted class per query.
+        mask_threshold: forwarded to `check_segment_validity`.
+        overlap_mask_area_threshold: forwarded to `check_segment_validity`.
+        label_ids_to_fuse: classes whose segments share a single id. `None` means no class is fused.
+        target_size: optional `(height, width)` the masks are bilinearly resized to first.
+
+    Returns:
+        `(segmentation, segments)`: an int32 map of segment ids (0 where no segment was assigned) and a list of
+        dicts with `"id"`, `"label_id"`, `"was_fused"` and `"score"`.
+    """
+    height = mask_probs.shape[1] if target_size is None else target_size[0]
+    width = mask_probs.shape[2] if target_size is None else target_size[1]
+
+    segmentation = torch.zeros((height, width), dtype=torch.int32, device=mask_probs.device)
+    segments: list[dict] = []
+
+    # The default `None` would make the membership test below raise, so treat it as "fuse nothing".
+    fuse_ids = label_ids_to_fuse if label_ids_to_fuse is not None else set()
+
+    if target_size is not None:
+        mask_probs = nn.functional.interpolate(
+            mask_probs.unsqueeze(0), size=target_size, mode="bilinear", align_corners=False
+        )[0]
+
+    # Weigh each mask by its prediction score. Out-of-place so the caller's tensor is left untouched.
+    mask_probs = mask_probs * pred_scores.view(-1, 1, 1)
+    mask_labels = mask_probs.argmax(0)  # [height, width]
+
+    # Highest id handed out so far. Kept separate from the id in use so that reusing a fused id never rewinds
+    # the counter and makes a later segment collide with an existing one.
+    last_segment_id = 0
+
+    # Keep track of instances of each class
+    stuff_memory_list: dict[int, int] = {}
+    for k in range(pred_labels.shape[0]):
+        pred_class = pred_labels[k].item()
+        should_fuse = pred_class in fuse_ids
+
+        # Check if mask exists and large enough to be a segment
+        mask_exists, mask_k = check_segment_validity(
+            mask_labels, mask_probs, k, mask_threshold, overlap_mask_area_threshold
+        )
+        if not mask_exists:
+            continue
+
+        if pred_class in stuff_memory_list:
+            current_segment_id = stuff_memory_list[pred_class]
+        else:
+            last_segment_id += 1
+            current_segment_id = last_segment_id
+
+        # Add current object segment to final segmentation map
+        segmentation[mask_k] = current_segment_id
+        segment_score = round(pred_scores[k].item(), 6)
+        segments.append(
+            {
+                "id": current_segment_id,
+                "label_id": pred_class,
+                "was_fused": should_fuse,
+                "score": segment_score,
+            }
+        )
+        if should_fuse:
+            stuff_memory_list[pred_class] = current_segment_id
+
+    return segmentation, segments
+
+
+@requires(backends=("vision",))
+class DetrImageProcessor(BaseImageProcessor):
+    r"""
+    Constructs a Detr image processor.
+
+    Args:
+        format (`str`, *optional*, defaults to `"coco_detection"`):
+            Data format of the annotations. One of "coco_detection" or "coco_panoptic".
+        do_resize (`bool`, *optional*, defaults to `True`):
+            Controls whether to resize the image's `(height, width)` dimensions to the specified `size`. Can be
+            overridden by the `do_resize` parameter in the `preprocess` method.
+        size (`dict[str, int]` *optional*, defaults to `{"shortest_edge": 800, "longest_edge": 1333}`):
+            Size of the image's `(height, width)` dimensions after resizing. Can be overridden by the `size` parameter
+            in the `preprocess` method. Available options are:
+                - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`.
+                    Do NOT keep the aspect ratio.
+                - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting
+                    the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge
+                    less or equal to `longest_edge`.
+                - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the
+                    aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to
+                    `max_width`.
+        resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
+            Resampling filter to use if resizing the image.
+        do_rescale (`bool`, *optional*, defaults to `True`):
+            Controls whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the
+            `do_rescale` parameter in the `preprocess` method.
+        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
+            Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the
+            `preprocess` method.
+        do_normalize (`bool`, *optional*, defaults to True):
+            Controls whether to normalize the image. Can be overridden by the `do_normalize` parameter in the
+            `preprocess` method.
+        image_mean (`float` or `list[float]`, *optional*, defaults to `IMAGENET_DEFAULT_MEAN`):
+            Mean values to use when normalizing the image. Can be a single value or a list of values, one for each
+            channel. Can be overridden by the `image_mean` parameter in the `preprocess` method.
+        image_std (`float` or `list[float]`, *optional*, defaults to `IMAGENET_DEFAULT_STD`):
+            Standard deviation values to use when normalizing the image. Can be a single value or a list of values, one
+            for each channel. Can be overridden by the `image_std` parameter in the `preprocess` method.
+        do_convert_annotations (`bool`, *optional*, defaults to `True`):
+            Controls whether to convert the annotations to the format expected by the DETR model. Converts the
+            bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`.
+            Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
+        do_pad (`bool`, *optional*, defaults to `True`):
+            Controls whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess`
+            method. If `True`, padding will be applied to the bottom and right of the image with zeros.
+            If `pad_size` is provided, the image will be padded to the specified dimensions.
+            Otherwise, the image will be padded to the maximum height and width of the batch.
+        pad_size (`dict[str, int]`, *optional*):
+            The size `{"height": int, "width": int}` to pad the images to. Must be larger than any image size
+            provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
+            height and width in the batch.
+    """
+
+    model_input_names = ["pixel_values", "pixel_mask"]
+
+    def __init__(
+        self,
+        format: Union[str, AnnotationFormat] = AnnotationFormat.COCO_DETECTION,
+        do_resize: bool = True,
+        size: Optional[dict[str, int]] = None,
+        resample: PILImageResampling = PILImageResampling.BILINEAR,
+        do_rescale: bool = True,
+        rescale_factor: Union[int, float] = 1 / 255,
+        do_normalize: bool = True,
+        image_mean: Optional[Union[float, list[float]]] = None,
+        image_std: Optional[Union[float, list[float]]] = None,
+        do_convert_annotations: Optional[bool] = None,
+        do_pad: bool = True,
+        pad_size: Optional[dict[str, int]] = None,
+        **kwargs,
+    ) -> None:
+        # Legacy spelling of `do_pad`; when present it takes precedence.
+        do_pad = kwargs.pop("pad_and_return_pixel_mask", do_pad)
+
+        # Legacy `max_size`: an explicit value wins, otherwise 1333 is used only when a `size` was supplied.
+        if "max_size" not in kwargs:
+            max_size = 1333 if size is not None else None
+        else:
+            logger.warning_once(
+                "The `max_size` parameter is deprecated and will be removed in v4.26. "
+                "Please specify in `size['longest_edge'] instead`.",
+            )
+            max_size = kwargs.pop("max_size")
+
+        if size is None:
+            size = {"shortest_edge": 800, "longest_edge": 1333}
+        size = get_size_dict(size, max_size=max_size, default_to_square=False)
+
+        # Backwards compatibility: annotation conversion follows `do_normalize` unless set explicitly.
+        if do_convert_annotations is None:
+            do_convert_annotations = do_normalize
+
+        super().__init__(**kwargs)
+        self.format = format
+        self.do_resize = do_resize
+        self.size = size
+        self.resample = resample
+        self.do_rescale = do_rescale
+        self.rescale_factor = rescale_factor
+        self.do_normalize = do_normalize
+        self.do_convert_annotations = do_convert_annotations
+        self.image_mean = IMAGENET_DEFAULT_MEAN if image_mean is None else image_mean
+        self.image_std = IMAGENET_DEFAULT_STD if image_std is None else image_std
+        self.do_pad = do_pad
+        self.pad_size = pad_size
+        self._valid_processor_keys = [
+            "images",
+            "annotations",
+            "return_segmentation_masks",
+            "masks_path",
+            "do_resize",
+            "size",
+            "resample",
+            "do_rescale",
+            "rescale_factor",
+            "do_normalize",
+            "do_convert_annotations",
+            "image_mean",
+            "image_std",
+            "do_pad",
+            "pad_size",
+            "format",
+            "return_tensors",
+            "data_format",
+            "input_data_format",
+        ]
+
+    @classmethod
+    def from_dict(cls, image_processor_dict: dict[str, Any], **kwargs):
+        """
+        Overrides the `from_dict` method from the base class so that the legacy `max_size` and
+        `pad_and_return_pixel_mask` keyword arguments are moved into the processor dictionary, e.g. for
+        `DetrImageProcessor.from_pretrained(checkpoint, size=600, max_size=800)`
+        """
+        # Work on a shallow copy so the caller's dictionary is left untouched.
+        merged = image_processor_dict.copy()
+        for legacy_key in ("max_size", "pad_and_return_pixel_mask"):
+            if legacy_key in kwargs:
+                merged[legacy_key] = kwargs.pop(legacy_key)
+        return super().from_dict(merged, **kwargs)
+
+    def prepare_annotation(
+        self,
+        image: np.ndarray,
+        target: dict,
+        format: Optional[AnnotationFormat] = None,
+        return_segmentation_masks: Optional[bool] = None,
+        masks_path: Optional[Union[str, pathlib.Path]] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    ) -> dict:
+        """
+        Convert a raw annotation into the form fed to the DETR model.
+
+        `format` falls back to `self.format`. When `return_segmentation_masks` is unset, masks are omitted for
+        COCO detection and included for COCO panoptic. Any other format raises a `ValueError`.
+        """
+        annotation_format = self.format if format is None else format
+
+        if annotation_format == AnnotationFormat.COCO_DETECTION:
+            with_masks = return_segmentation_masks if return_segmentation_masks is not None else False
+            return prepare_coco_detection_annotation(
+                image, target, with_masks, input_data_format=input_data_format
+            )
+
+        if annotation_format == AnnotationFormat.COCO_PANOPTIC:
+            with_masks = return_segmentation_masks if return_segmentation_masks is not None else True
+            return prepare_coco_panoptic_annotation(
+                image,
+                target,
+                masks_path=masks_path,
+                return_masks=with_masks,
+                input_data_format=input_data_format,
+            )
+
+        raise ValueError(f"Format {annotation_format} is not supported.")
+
+    def resize(
+        self,
+        image: np.ndarray,
+        size: dict[str, int],
+        resample: PILImageResampling = PILImageResampling.BILINEAR,
+        data_format: Optional[ChannelDimension] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+        **kwargs,
+    ) -> np.ndarray:
+        """
+        Resize the image to the size described by the `size` dictionary.
+
+        Args:
+            image (`np.ndarray`):
+                Image to resize.
+            size (`dict[str, int]`):
+                Size of the image's `(height, width)` dimensions after resizing. Available options are:
+                    - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`.
+                        Do NOT keep the aspect ratio.
+                    - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting
+                        the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge
+                        less or equal to `longest_edge`.
+                    - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the
+                        aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to
+                        `max_width`.
+            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
+                Resampling filter to use if resizing the image.
+            data_format (`str` or `ChannelDimension`, *optional*):
+                The channel dimension format for the output image. If unset, the channel dimension format of the input
+                image is used.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format of the input image. If not provided, it will be inferred.
+
+        Returns:
+            `np.ndarray`: The resized image.
+
+        Raises:
+            `ValueError`: If `size` matches none of the supported key combinations.
+        """
+        if "max_size" in kwargs:
+            logger.warning_once(
+                "The `max_size` parameter is deprecated and will be removed in v4.26. "
+                "Please specify in `size['longest_edge'] instead`.",
+            )
+            max_size = kwargs.pop("max_size")
+        else:
+            max_size = None
+        size = get_size_dict(size, max_size=max_size, default_to_square=False)
+        if "shortest_edge" in size and "longest_edge" in size:
+            new_size = get_resize_output_image_size(
+                image, size["shortest_edge"], size["longest_edge"], input_data_format=input_data_format
+            )
+        elif "max_height" in size and "max_width" in size:
+            new_size = get_image_size_for_max_height_width(
+                image, size["max_height"], size["max_width"], input_data_format=input_data_format
+            )
+        elif "height" in size and "width" in size:
+            new_size = (size["height"], size["width"])
+        else:
+            # List every combination handled above so the message does not hide a supported option.
+            raise ValueError(
+                "Size must contain 'height' and 'width' keys, 'shortest_edge' and 'longest_edge' keys, or"
+                f" 'max_height' and 'max_width' keys. Got {size.keys()}."
+            )
+        image = resize(
+            image,
+            size=new_size,
+            resample=resample,
+            data_format=data_format,
+            input_data_format=input_data_format,
+            **kwargs,
+        )
+        return image
+
+    def resize_annotation(
+        self,
+        annotation,
+        orig_size,
+        size,
+        resample: PILImageResampling = PILImageResampling.NEAREST,
+    ) -> dict:
+        """
+        Resize the annotation to match the resized image by delegating to the module-level `resize_annotation`.
+
+        Args:
+            annotation: The annotation dictionary to resize.
+            orig_size: Size of the image before resizing; forwarded as `orig_size`.
+            size: Size of the image after resizing; forwarded as `target_size`.
+            resample: Resampling filter forwarded for resizing the masks.
+
+        Returns:
+            The dictionary produced by the module-level `resize_annotation`.
+        """
+        return resize_annotation(annotation, orig_size=orig_size, target_size=size, resample=resample)
+
+    # TODO (Amy) - update to use `rescale_factor` instead of `scale`
+    def rescale(
+        self,
+        image: np.ndarray,
+        rescale_factor: float,
+        data_format: Optional[Union[str, ChannelDimension]] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    ) -> np.ndarray:
+        """
+        Rescale the image by the given factor. image = image * rescale_factor.
+
+        Thin wrapper that forwards every argument to the module-level `rescale` function.
+
+        Args:
+            image (`np.ndarray`):
+                Image to rescale.
+            rescale_factor (`float`):
+                The value to use for rescaling.
+            data_format (`str` or `ChannelDimension`, *optional*):
+                The channel dimension format for the output image. If unset, the channel dimension format of the input
+                image is used. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+            input_data_format (`str` or `ChannelDimension`, *optional*):
+                The channel dimension format for the input image. If unset, is inferred from the input image. Can be
+                one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+
+        Returns:
+            `np.ndarray`: The value returned by the module-level `rescale`.
+        """
+        return rescale(image, rescale_factor, data_format=data_format, input_data_format=input_data_format)
+
+    def normalize_annotation(self, annotation: dict, image_size: tuple[int, int]) -> dict:
+        """
+        Normalize the boxes in the annotation from `[top_left_x, top_left_y, bottom_right_x, bottom_right_y]` to
+        `[center_x, center_y, width, height]` format and from absolute to relative pixel values.
+
+        Thin wrapper around the module-level `normalize_annotation` function.
+
+        Args:
+            annotation (`dict`):
+                The annotation dictionary whose boxes are normalized.
+            image_size (`tuple[int, int]`):
+                Size of the image the boxes refer to; presumably `(height, width)` -- confirm against the
+                module-level helper.
+
+        Returns:
+            `dict`: The value returned by the module-level `normalize_annotation`.
+        """
+        return normalize_annotation(annotation, image_size=image_size)
+
+    def _update_annotation_for_padded_image(
+        self,
+        annotation: dict,
+        input_image_size: tuple[int, int],
+        output_image_size: tuple[int, int],
+        padding,
+        update_bboxes,
+    ) -> dict:
+        """
+        Update the annotation for a padded image.
+
+        Args:
+            annotation (`dict`):
+                The annotation of the unpadded image. It is not modified.
+            input_image_size (`tuple[int, int]`):
+                `(height, width)` of the image before padding.
+            output_image_size (`tuple[int, int]`):
+                `(height, width)` of the image after padding.
+            padding:
+                The padding applied to the image; applied unchanged to the masks.
+            update_bboxes (`bool`):
+                Whether to rescale `"boxes"` by the input/output size ratio. When falsy, boxes are copied through.
+
+        Returns:
+            `dict`: A new annotation whose `"size"` is `output_image_size`, with masks padded and boxes rescaled.
+        """
+        new_annotation = {}
+        new_annotation["size"] = output_image_size
+
+        for key, value in annotation.items():
+            if key == "masks":
+                masks = value
+                masks = pad(
+                    masks,
+                    padding,
+                    mode=PaddingMode.CONSTANT,
+                    constant_values=0,
+                    input_data_format=ChannelDimension.FIRST,
+                )
+                masks = safe_squeeze(masks, 1)
+                new_annotation["masks"] = masks
+            elif key == "boxes" and update_bboxes:
+                # Boxes are laid out as x, y, x, y, so width and height ratios alternate.
+                scale = np.asarray(
+                    [
+                        input_image_size[1] / output_image_size[1],
+                        input_image_size[0] / output_image_size[0],
+                        input_image_size[1] / output_image_size[1],
+                        input_image_size[0] / output_image_size[0],
+                    ]
+                )
+                # Out-of-place multiply: an in-place `*=` would silently rewrite the caller's array and raises a
+                # casting error when the boxes have an integer dtype.
+                new_annotation["boxes"] = value * scale
+            elif key == "size":
+                new_annotation["size"] = output_image_size
+            else:
+                new_annotation[key] = value
+        return new_annotation
+
+    def _pad_image(
+        self,
+        image: np.ndarray,
+        output_size: tuple[int, int],
+        annotation: Optional[dict[str, Any]] = None,
+        constant_values: Union[float, Iterable[float]] = 0,
+        data_format: Optional[ChannelDimension] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+        update_bboxes: bool = True,
+    ) -> tuple[np.ndarray, Optional[dict[str, Any]]]:
+        """
+        Pad an image on the bottom and right with `constant_values` up to `output_size`.
+
+        Args:
+            image (`np.ndarray`):
+                Image to pad.
+            output_size (`tuple[int, int]`):
+                `(height, width)` of the padded image.
+            annotation (`dict[str, Any]`, *optional*):
+                Annotation to update for the padded image.
+            constant_values (`float` or `Iterable[float]`, *optional*, defaults to 0):
+                The value used to fill the padded area.
+            data_format (`ChannelDimension`, *optional*):
+                The channel dimension format of the output image, forwarded to `pad`.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format of the input image.
+            update_bboxes (`bool`, *optional*, defaults to `True`):
+                Whether the annotation's boxes are rescaled to the padded size.
+
+        Returns:
+            `tuple`: `(padded_image, annotation)`; `annotation` is `None` when none was passed in.
+        """
+        input_height, input_width = get_image_size(image, channel_dim=input_data_format)
+        output_height, output_width = output_size
+
+        # Only the bottom and right edges grow, so existing pixel coordinates stay valid.
+        pad_bottom = output_height - input_height
+        pad_right = output_width - input_width
+        padding = ((0, pad_bottom), (0, pad_right))
+        padded_image = pad(
+            image,
+            padding,
+            mode=PaddingMode.CONSTANT,
+            constant_values=constant_values,
+            data_format=data_format,
+            input_data_format=input_data_format,
+        )
+        if annotation is not None:
+            annotation = self._update_annotation_for_padded_image(
+                annotation, (input_height, input_width), (output_height, output_width), padding, update_bboxes
+            )
+        return padded_image, annotation
+
+    def pad(
+        self,
+        images: list[np.ndarray],
+        annotations: Optional[Union[AnnotationType, list[AnnotationType]]] = None,
+        constant_values: Union[float, Iterable[float]] = 0,
+        return_pixel_mask: bool = True,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        data_format: Optional[ChannelDimension] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+        update_bboxes: bool = True,
+        pad_size: Optional[dict[str, int]] = None,
+    ) -> BatchFeature:
+        """
+        Pads a batch of images on the bottom and right with `constant_values` to a common size (`pad_size`, or the
+        largest height and width in the batch) and optionally returns their corresponding pixel mask.
+
+        Args:
+            images (list[`np.ndarray`]):
+                Images to pad.
+            annotations (`AnnotationType` or `list[AnnotationType]`, *optional*):
+                Annotations to update for the applied padding. A single dict is treated as a one-item list.
+            constant_values (`float` or `Iterable[float]`, *optional*, defaults to 0):
+                The value used to fill the padded area (the padding mode is always constant).
+            return_pixel_mask (`bool`, *optional*, defaults to `True`):
+                Whether to return a pixel mask.
+            return_tensors (`str` or `TensorType`, *optional*):
+                The type of tensors to return. Can be one of:
+                    - Unset: Return a list of `np.ndarray`.
+                    - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
+                    - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
+                    - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
+                    - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
+            data_format (`str` or `ChannelDimension`, *optional*):
+                The channel dimension format of the image. If not provided, it will be the same as the input image.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format of the input image. If not provided, it will be inferred.
+            update_bboxes (`bool`, *optional*, defaults to `True`):
+                Whether to update the bounding boxes in the annotations to match the padded images. If the
+                bounding boxes have not been converted to relative coordinates and `(centre_x, centre_y, width, height)`
+                format, the bounding boxes will not be updated.
+            pad_size (`dict[str, int]`, *optional*):
+                The size `{"height": int, "width": int}` to pad the images to. Must be larger than any image size
+                provided. Falls back to `self.pad_size`, then to the largest height and width in the batch.
+
+        Returns:
+            `BatchFeature`: `pixel_values`, plus `pixel_mask` and `labels` when requested / provided.
+        """
+        pad_size = pad_size if pad_size is not None else self.pad_size
+        if pad_size is not None:
+            padded_size = (pad_size["height"], pad_size["width"])
+        else:
+            padded_size = get_max_height_width(images, input_data_format=input_data_format)
+
+        # A lone annotation dict stands for a batch of one; zipping it directly would iterate over its keys.
+        if isinstance(annotations, dict):
+            annotations = [annotations]
+        annotation_list = annotations if annotations is not None else [None] * len(images)
+        padded_images, padded_annotations = [], []
+        for image, annotation in zip(images, annotation_list):
+            padded_image, padded_annotation = self._pad_image(
+                image,
+                padded_size,
+                annotation,
+                constant_values=constant_values,
+                data_format=data_format,
+                input_data_format=input_data_format,
+                update_bboxes=update_bboxes,
+            )
+            padded_images.append(padded_image)
+            padded_annotations.append(padded_annotation)
+
+        data = {"pixel_values": padded_images}
+        if return_pixel_mask:
+            data["pixel_mask"] = [
+                make_pixel_mask(image=image, output_size=padded_size, input_data_format=input_data_format)
+                for image in images
+            ]
+        encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors)
+
+        if annotations is not None:
+            encoded_inputs["labels"] = [
+                BatchFeature(annotation, tensor_type=return_tensors) for annotation in padded_annotations
+            ]
+        return encoded_inputs
+
+    def preprocess(
+        self,
+        images: ImageInput,
+        annotations: Optional[Union[AnnotationType, list[AnnotationType]]] = None,
+        return_segmentation_masks: Optional[bool] = None,
+        masks_path: Optional[Union[str, pathlib.Path]] = None,
+        do_resize: Optional[bool] = None,
+        size: Optional[dict[str, int]] = None,
+        resample=None,  # PILImageResampling
+        do_rescale: Optional[bool] = None,
+        rescale_factor: Optional[Union[int, float]] = None,
+        do_normalize: Optional[bool] = None,
+        do_convert_annotations: Optional[bool] = None,
+        image_mean: Optional[Union[float, list[float]]] = None,
+        image_std: Optional[Union[float, list[float]]] = None,
+        do_pad: Optional[bool] = None,
+        format: Optional[Union[str, AnnotationFormat]] = None,
+        return_tensors: Optional[Union[TensorType, str]] = None,
+        data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+        pad_size: Optional[dict[str, int]] = None,
+        **kwargs,
+    ) -> BatchFeature:
+        """
+        Preprocess an image or a batch of images so that it can be used by the model.
+
+        Args:
+            images (`ImageInput`):
+                Image or batch of images to preprocess. Expects a single or batch of images with pixel values ranging
+                from 0 to 255. If passing in images with pixel values between 0 and 1, set `do_rescale=False`.
+            annotations (`AnnotationType` or `list[AnnotationType]`, *optional*):
+                List of annotations associated with the image or batch of images. If annotation is for object
+                detection, the annotations should be a dictionary with the following keys:
+                - "image_id" (`int`): The image id.
+                - "annotations" (`list[Dict]`): List of annotations for an image. Each annotation should be a
+                  dictionary. An image can have no annotations, in which case the list should be empty.
+                If annotation is for segmentation, the annotations should be a dictionary with the following keys:
+                - "image_id" (`int`): The image id.
+                - "segments_info" (`list[Dict]`): List of segments for an image. Each segment should be a dictionary.
+                  An image can have no segments, in which case the list should be empty.
+                - "file_name" (`str`): The file name of the image.
+            return_segmentation_masks (`bool`, *optional*, defaults to self.return_segmentation_masks):
+                Whether to return segmentation masks.
+            masks_path (`str` or `pathlib.Path`, *optional*):
+                Path to the directory containing the segmentation masks.
+            do_resize (`bool`, *optional*, defaults to self.do_resize):
+                Whether to resize the image.
+            size (`dict[str, int]`, *optional*, defaults to self.size):
+                Size of the image's `(height, width)` dimensions after resizing. Available options are:
+                    - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`.
+                        Do NOT keep the aspect ratio.
+                    - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting
+                        the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge
+                        less or equal to `longest_edge`.
+                    - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the
+                        aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to
+                        `max_width`.
+            resample (`PILImageResampling`, *optional*, defaults to self.resample):
+                Resampling filter to use when resizing the image.
+            do_rescale (`bool`, *optional*, defaults to self.do_rescale):
+                Whether to rescale the image.
+            rescale_factor (`float`, *optional*, defaults to self.rescale_factor):
+                Rescale factor to use when rescaling the image.
+            do_normalize (`bool`, *optional*, defaults to self.do_normalize):
+                Whether to normalize the image.
+            do_convert_annotations (`bool`, *optional*, defaults to self.do_convert_annotations):
+                Whether to convert the annotations to the format expected by the model. Converts the bounding
+                boxes from the format `(top_left_x, top_left_y, width, height)` to `(center_x, center_y, width, height)`
+                and in relative coordinates.
+            image_mean (`float` or `list[float]`, *optional*, defaults to self.image_mean):
+                Mean to use when normalizing the image.
+            image_std (`float` or `list[float]`, *optional*, defaults to self.image_std):
+                Standard deviation to use when normalizing the image.
+            do_pad (`bool`, *optional*, defaults to self.do_pad):
+                Whether to pad the image. If `True`, padding will be applied to the bottom and right of
+                the image with zeros. If `pad_size` is provided, the image will be padded to the specified
+                dimensions. Otherwise, the image will be padded to the maximum height and width of the batch.
+            format (`str` or `AnnotationFormat`, *optional*, defaults to self.format):
+                Format of the annotations.
+            return_tensors (`str` or `TensorType`, *optional*):
+                Type of tensors to return. If `None`, will return the list of images.
+            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
+                The channel dimension format for the output image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - Unset: Use the channel dimension format of the input image.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the input image. If unset, the channel dimension format is inferred
+                from the input image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+            pad_size (`dict[str, int]`, *optional*, defaults to self.pad_size):
+                The size `{"height": int, "width": int}` to pad the images to. Must be larger than any image size
+                provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
+                height and width in the batch.
+        """
+        if "pad_and_return_pixel_mask" in kwargs:
+            logger.warning_once(
+                "The `pad_and_return_pixel_mask` argument is deprecated and will be removed in a future version, "
+                "use `do_pad` instead."
+            )
+            do_pad = kwargs.pop("pad_and_return_pixel_mask")  # deprecated alias overrides `do_pad`
+
+        if "max_size" in kwargs:
+            logger.warning_once(
+                "The `max_size` argument is deprecated and will be removed in a future version, use"
+                " `size['longest_edge']` instead."
+            )
+            size = kwargs.pop("max_size")  # NOTE(review): overwrites `size` wholesale - confirm intended
+
+        do_resize = self.do_resize if do_resize is None else do_resize
+        size = self.size if size is None else size
+        size = get_size_dict(size=size, default_to_square=False)
+        resample = self.resample if resample is None else resample
+        do_rescale = self.do_rescale if do_rescale is None else do_rescale
+        rescale_factor = self.rescale_factor if rescale_factor is None else rescale_factor
+        do_normalize = self.do_normalize if do_normalize is None else do_normalize
+        image_mean = self.image_mean if image_mean is None else image_mean
+        image_std = self.image_std if image_std is None else image_std
+        do_convert_annotations = (
+            self.do_convert_annotations if do_convert_annotations is None else do_convert_annotations
+        )
+        do_pad = self.do_pad if do_pad is None else do_pad
+        pad_size = self.pad_size if pad_size is None else pad_size
+        format = self.format if format is None else format
+
+        images = make_list_of_images(images)
+
+        if not valid_images(images):
+            raise ValueError(
+                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
+                "torch.Tensor, tf.Tensor or jax.ndarray."
+            )
+        validate_kwargs(captured_kwargs=kwargs.keys(), valid_processor_keys=self._valid_processor_keys)
+
+        # pad() pads to `pad_size` or, when unset, to the batch maximum (height, width); it is not validated here.
+        validate_preprocess_arguments(
+            do_rescale=do_rescale,
+            rescale_factor=rescale_factor,
+            do_normalize=do_normalize,
+            image_mean=image_mean,
+            image_std=image_std,
+            do_resize=do_resize,
+            size=size,
+            resample=resample,
+        )
+
+        if annotations is not None and isinstance(annotations, dict):
+            annotations = [annotations]
+
+        if annotations is not None and len(images) != len(annotations):
+            raise ValueError(
+                f"The number of images ({len(images)}) and annotations ({len(annotations)}) do not match."
+            )
+
+        format = AnnotationFormat(format)
+        if annotations is not None:
+            validate_annotations(format, SUPPORTED_ANNOTATION_FORMATS, annotations)
+
+        if (
+            masks_path is not None
+            and format == AnnotationFormat.COCO_PANOPTIC
+            and not isinstance(masks_path, (pathlib.Path, str))
+        ):
+            raise ValueError(
+                "The path to the directory containing the mask PNG files should be provided as a"
+                f" `pathlib.Path` or string object, but is {type(masks_path)} instead."
+            )
+
+        # All transformations expect numpy arrays
+        images = [to_numpy_array(image) for image in images]
+
+        if do_rescale and is_scaled_image(images[0]):
+            logger.warning_once(
+                "It looks like you are trying to rescale already rescaled images. If the input"
+                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
+            )
+
+        if input_data_format is None:
+            # We assume that all images have the same channel dimension format.
+            input_data_format = infer_channel_dimension_format(images[0])
+
+        # prepare (COCO annotations as a list of Dict -> DETR target as a single Dict per image)
+        if annotations is not None:
+            prepared_images = []
+            prepared_annotations = []
+            for image, target in zip(images, annotations):
+                target = self.prepare_annotation(
+                    image,
+                    target,
+                    format,
+                    return_segmentation_masks=return_segmentation_masks,
+                    masks_path=masks_path,
+                    input_data_format=input_data_format,
+                )
+                prepared_images.append(image)  # the image itself is passed through unchanged
+                prepared_annotations.append(target)
+            images = prepared_images
+            annotations = prepared_annotations
+            del prepared_images, prepared_annotations
+
+        # transformations
+        if do_resize:
+            if annotations is not None:
+                resized_images, resized_annotations = [], []
+                for image, target in zip(images, annotations):
+                    orig_size = get_image_size(image, input_data_format)
+                    resized_image = self.resize(
+                        image, size=size, resample=resample, input_data_format=input_data_format
+                    )
+                    resized_annotation = self.resize_annotation(
+                        target, orig_size, get_image_size(resized_image, input_data_format)
+                    )
+                    resized_images.append(resized_image)
+                    resized_annotations.append(resized_annotation)
+                images = resized_images
+                annotations = resized_annotations
+                del resized_images, resized_annotations
+            else:
+                images = [
+                    self.resize(image, size=size, resample=resample, input_data_format=input_data_format)
+                    for image in images
+                ]
+
+        if do_rescale:
+            images = [self.rescale(image, rescale_factor, input_data_format=input_data_format) for image in images]
+
+        if do_normalize:
+            images = [
+                self.normalize(image, image_mean, image_std, input_data_format=input_data_format) for image in images
+            ]
+
+        if do_convert_annotations and annotations is not None:
+            annotations = [
+                self.normalize_annotation(annotation, get_image_size(image, input_data_format))
+                for annotation, image in zip(annotations, images)
+            ]
+
+        if do_pad:
+            # Pads images and returns their mask: {'pixel_values': ..., 'pixel_mask': ...}
+            encoded_inputs = self.pad(
+                images,
+                annotations=annotations,
+                return_pixel_mask=True,
+                data_format=data_format,
+                input_data_format=input_data_format,
+                update_bboxes=do_convert_annotations,
+                return_tensors=return_tensors,
+                pad_size=pad_size,
+            )
+        else:
+            images = [
+                to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
+                for image in images
+            ]
+            encoded_inputs = BatchFeature(data={"pixel_values": images}, tensor_type=return_tensors)
+            if annotations is not None:
+                encoded_inputs["labels"] = [
+                    BatchFeature(annotation, tensor_type=return_tensors) for annotation in annotations
+                ]
+
+        return encoded_inputs
+
+    # POSTPROCESSING METHODS - TODO: add support for other frameworks
+    # inspired by https://github.com/facebookresearch/detr/blob/master/models/detr.py#L258
+    def post_process(self, outputs, target_sizes):
+        """
+        Turns the raw output of [`DetrForObjectDetection`] into per-image detections whose boxes are in absolute
+        (top_left_x, top_left_y, bottom_right_x, bottom_right_y) format. Only supports PyTorch.
+
+        Args:
+            outputs ([`DetrObjectDetectionOutput`]):
+                Raw outputs of the model.
+            target_sizes (`torch.Tensor` of shape `(batch_size, 2)`):
+                Size (height, width) of each image of the batch. For evaluation, pass the original image size
+                (before any data augmentation). For visualization, pass the image size after data augmentation
+                but before padding.
+        Returns:
+            `list[Dict]`: One dictionary per image of the batch holding the predicted scores, labels and boxes.
+        """
+        logger.warning_once(
+            "`post_process` is deprecated and will be removed in v5 of Transformers, please use"
+            " `post_process_object_detection` instead, with `threshold=0.` for equivalent results.",
+        )
+
+        logits, pred_boxes = outputs.logits, outputs.pred_boxes
+
+        if len(logits) != len(target_sizes):
+            raise ValueError("Make sure that you pass in as many target sizes as the batch dimension of the logits")
+        if target_sizes.shape[1] != 2:
+            raise ValueError("Each element of target_sizes must contain the size (h, w) of each image of the batch")
+
+        # Best class per query; the last class index is left out of the max.
+        scores, labels = nn.functional.softmax(logits, -1)[..., :-1].max(-1)
+
+        # Corner format first, then scale the relative [0, 1] coordinates by each image's width / height.
+        corner_boxes = center_to_corners_format(pred_boxes)
+        heights, widths = target_sizes.unbind(1)
+        scale = torch.stack([widths, heights, widths, heights], dim=1).to(corner_boxes.device)
+        corner_boxes = corner_boxes * scale[:, None, :]
+
+        return [
+            {"scores": score, "labels": label, "boxes": box}
+            for score, label, box in zip(scores, labels, corner_boxes)
+        ]
+
+    def post_process_segmentation(self, outputs, target_sizes, threshold=0.9, mask_threshold=0.5):
+        """
+        Turns the output of [`DetrForSegmentation`] into per-image binary masks with their scores and labels. Only
+        supports PyTorch.
+
+        Args:
+            outputs ([`DetrSegmentationOutput`]):
+                Raw outputs of the model.
+            target_sizes (`torch.Tensor` of shape `(batch_size, 2)` or `list[Tuple]` of length `batch_size`):
+                Torch Tensor (or list) giving the requested final size (h, w) of each prediction; the masks are
+                bilinearly resized to it.
+            threshold (`float`, *optional*, defaults to 0.9):
+                Score a query has to exceed in order to be kept.
+            mask_threshold (`float`, *optional*, defaults to 0.5):
+                Cut-off applied to the mask probabilities when turning them into binary values.
+        Returns:
+            `list[Dict]`: One dictionary per image of the batch holding the scores, labels and masks of the
+            queries that were kept.
+        """
+        logger.warning_once(
+            "`post_process_segmentation` is deprecated and will be removed in v5 of Transformers, please use"
+            " `post_process_semantic_segmentation`.",
+        )
+        logits, pred_masks = outputs.logits, outputs.pred_masks
+        no_object_label = logits.shape[-1] - 1
+        results = []
+
+        def as_size_tuple(size):
+            # Sizes arrive either as plain tuples or as objects exposing `.tolist()` (e.g. tensor rows).
+            return size if isinstance(size, tuple) else tuple(size.tolist())
+
+        for image_logits, image_masks, size in zip(logits, pred_masks, target_sizes):
+            # Keep queries whose best class is not the last ("empty") one and whose score exceeds `threshold`.
+            scores, labels = image_logits.softmax(-1).max(-1)
+            selected = labels.ne(no_object_label) & (scores > threshold)
+            kept_masks = nn.functional.interpolate(
+                image_masks[selected][:, None], as_size_tuple(size), mode="bilinear"
+            ).squeeze(1)
+            # `* 1` converts the boolean mask into integers.
+            binary_masks = (kept_masks.sigmoid() > mask_threshold) * 1
+            results.append({"scores": scores[selected], "labels": labels[selected], "masks": binary_masks})
+
+        return results
+
+    # inspired by https://github.com/facebookresearch/detr/blob/master/models/segmentation.py#L218
+    def post_process_instance(self, results, outputs, orig_target_sizes, max_target_sizes, threshold=0.5):
+        """
+        Adds binary instance masks predicted by [`DetrForSegmentation`] to existing detection results. Only
+        supports PyTorch.
+
+        Args:
+            results (`list[Dict]`):
+                Results list obtained by [`~DetrImageProcessor.post_process`]; a "masks" entry is written into
+                each of its dictionaries.
+            outputs ([`DetrSegmentationOutput`]):
+                Raw outputs of the model.
+            orig_target_sizes (`torch.Tensor` of shape `(batch_size, 2)`):
+                Size (h, w) the masks of each image are finally resized to. For evaluation, this must be the
+                original image size (before any data augmentation).
+            max_target_sizes (`torch.Tensor` of shape `(batch_size, 2)`):
+                Size (h, w) the masks of each image are cropped to before that resize. The maximum over the batch
+                is the size all predicted masks are upsampled to first.
+            threshold (`float`, *optional*, defaults to 0.5):
+                Threshold to use when turning the predicted masks into binary values.
+        Returns:
+            `list[Dict]`: The `results` list, each dictionary now also containing the masks of its image.
+        """
+        logger.warning_once(
+            "`post_process_instance` is deprecated and will be removed in v5 of Transformers, please use"
+            " `post_process_instance_segmentation`.",
+        )
+
+        if len(orig_target_sizes) != len(max_target_sizes):
+            raise ValueError("Make sure to pass in as many orig_target_sizes as max_target_sizes")
+
+        # Upsample every mask to the largest (h, w) found in the batch, then binarize and move to CPU.
+        batch_h, batch_w = max_target_sizes.max(0)[0].tolist()
+        mask_logits = nn.functional.interpolate(
+            outputs.pred_masks.squeeze(2), size=(batch_h, batch_w), mode="bilinear", align_corners=False
+        )
+        binary_masks = (mask_logits.sigmoid() > threshold).cpu()
+
+        for idx, (masks, crop_hw, final_hw) in enumerate(zip(binary_masks, max_target_sizes, orig_target_sizes)):
+            # Crop to this image's own size first, then resize to the requested final size.
+            cropped = masks[:, : crop_hw[0], : crop_hw[1]].unsqueeze(1)
+            resized = nn.functional.interpolate(cropped.float(), size=tuple(final_hw.tolist()), mode="nearest")
+            results[idx]["masks"] = resized.byte()
+
+        return results
+
+    # inspired by https://github.com/facebookresearch/detr/blob/master/models/segmentation.py#L241
+    def post_process_panoptic(self, outputs, processed_sizes, target_sizes=None, is_thing_map=None, threshold=0.85):
+        """
+        Converts the output of [`DetrForSegmentation`] into actual panoptic predictions. Only supports PyTorch.
+
+        Args:
+            outputs ([`DetrSegmentationOutput`]):
+                Raw outputs of the model.
+            processed_sizes (`torch.Tensor` of shape `(batch_size, 2)` or `list[Tuple]` of length `batch_size`):
+                Torch Tensor (or list) containing the size (h, w) of each image of the batch, i.e. the size after data
+                augmentation but before batching.
+            target_sizes (`torch.Tensor` of shape `(batch_size, 2)` or `list[Tuple]` of length `batch_size`, *optional*):
+                Torch Tensor (or list) corresponding to the requested final size `(height, width)` of each prediction.
+                If left to None, it will default to the `processed_sizes`.
+            is_thing_map (`dict[int, bool]`, *optional*):
+                Dictionary mapping class indices to either True or False, depending on whether or not they are a thing.
+                If not set, defaults to the `is_thing_map` of COCO panoptic.
+            threshold (`float`, *optional*, defaults to 0.85):
+                Threshold to use to filter out queries.
+        Returns:
+            `list[Dict]`: A list of dictionaries, each dictionary containing a PNG string and segments_info values for
+            an image in the batch as predicted by the model.
+        """
+        logger.warning_once(
+            "`post_process_panoptic` is deprecated and will be removed in v5 of Transformers, please use"
+            " `post_process_panoptic_segmentation`.",
+        )
+        if target_sizes is None:
+            target_sizes = processed_sizes
+        if len(processed_sizes) != len(target_sizes):
+            raise ValueError("Make sure to pass in as many processed_sizes as target_sizes")
+
+        if is_thing_map is None:
+            # default to is_thing_map of COCO panoptic
+            is_thing_map = {i: i <= 90 for i in range(201)}
+
+        out_logits, raw_masks, raw_boxes = outputs.logits, outputs.pred_masks, outputs.pred_boxes
+        if not len(out_logits) == len(raw_masks) == len(target_sizes):
+            raise ValueError(
+                "Make sure that you pass in as many target sizes as the batch dimension of the logits and masks"
+            )
+        empty_label = out_logits.shape[-1] - 1
+        preds = []
+
+        def to_tuple(tup):
+            if isinstance(tup, tuple):
+                return tup
+            return tuple(tup.tolist())
+
+        for cur_logits, cur_masks, cur_boxes, size, target_size in zip(
+            out_logits, raw_masks, raw_boxes, processed_sizes, target_sizes
+        ):
+            # we filter empty queries and detection below threshold
+            cur_scores, cur_labels = cur_logits.softmax(-1).max(-1)
+            keep = cur_labels.ne(empty_label) & (cur_scores > threshold)
+            cur_scores = cur_scores[keep]
+            cur_labels = cur_labels[keep]
+            cur_masks = cur_masks[keep]
+            cur_masks = nn.functional.interpolate(cur_masks[:, None], to_tuple(size), mode="bilinear").squeeze(1)
+            cur_boxes = center_to_corners_format(cur_boxes[keep])
+
+            h, w = cur_masks.shape[-2:]
+            if len(cur_boxes) != len(cur_labels):
+                raise ValueError("Not as many boxes as there are classes")
+
+            # It may be that we have several predicted masks for the same stuff class.
+            # In the following, we track the list of masks ids for each stuff class (they are merged later on)
+            cur_masks = cur_masks.flatten(1)
+            stuff_equiv_classes = defaultdict(list)
+            for k, label in enumerate(cur_labels):
+                if not is_thing_map[label.item()]:
+                    stuff_equiv_classes[label.item()].append(k)
+
+            def get_ids_area(masks, scores, dedup=False):
+                # This helper function creates the final panoptic segmentation image
+                # It also returns the area of the masks that appears on the image
+
+                m_id = masks.transpose(0, 1).softmax(-1)
+
+                if m_id.shape[-1] == 0:
+                    # We didn't detect any mask :(
+                    m_id = torch.zeros((h, w), dtype=torch.long, device=m_id.device)
+                else:
+                    m_id = m_id.argmax(-1).view(h, w)
+
+                if dedup:
+                    # Merge the masks corresponding to the same stuff class
+                    for equiv in stuff_equiv_classes.values():
+                        if len(equiv) > 1:
+                            for eq_id in equiv:
+                                m_id.masked_fill_(m_id.eq(eq_id), equiv[0])
+
+                final_h, final_w = to_tuple(target_size)
+
+                seg_img = PIL.Image.fromarray(id_to_rgb(m_id.view(h, w).cpu().numpy()))
+                seg_img = seg_img.resize(size=(final_w, final_h), resample=PILImageResampling.NEAREST)
+
+                np_seg_img = torch.ByteTensor(torch.ByteStorage.from_buffer(seg_img.tobytes()))
+                np_seg_img = np_seg_img.view(final_h, final_w, 3)
+                np_seg_img = np_seg_img.numpy()
+
+                m_id = torch.from_numpy(rgb_to_id(np_seg_img))
+
+                # Pixel count of each segment id, measured on the resized segmentation image.
+                area = [m_id.eq(i).sum().item() for i in range(len(scores))]
+                return area, seg_img
+
+            area, seg_img = get_ids_area(cur_masks, cur_scores, dedup=True)
+            if cur_labels.numel() > 0:
+                # We now filter empty masks as long as we find some
+                while True:
+                    # Segments covering at most 4 pixels count as empty.
+                    filtered_small = torch.as_tensor(
+                        [a <= 4 for a in area], dtype=torch.bool, device=keep.device
+                    )
+                    if filtered_small.any().item():
+                        cur_scores = cur_scores[~filtered_small]
+                        cur_labels = cur_labels[~filtered_small]
+                        cur_masks = cur_masks[~filtered_small]
+                        area, seg_img = get_ids_area(cur_masks, cur_scores)
+                    else:
+                        break
+
+            else:
+                cur_labels = torch.ones(1, dtype=torch.long, device=cur_labels.device)
+
+            segments_info = []
+            for i, a in enumerate(area):
+                cat = cur_labels[i].item()
+                segments_info.append({"id": i, "isthing": is_thing_map[cat], "category_id": cat, "area": a})
+            del cur_labels
+
+            with io.BytesIO() as out:
+                seg_img.save(out, format="PNG")
+                predictions = {"png_string": out.getvalue(), "segments_info": segments_info}
+            preds.append(predictions)
+        return preds
+
+    # inspired by https://github.com/facebookresearch/detr/blob/master/models/detr.py#L258
+    def post_process_object_detection(
+        self, outputs, threshold: float = 0.5, target_sizes: Union[TensorType, list[tuple]] = None
+    ):
+        """
+        Converts the raw output of [`DetrForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y,
+        bottom_right_x, bottom_right_y) format. Only supports PyTorch.
+
+        Args:
+            outputs ([`DetrObjectDetectionOutput`]):
+                Raw outputs of the model.
+            threshold (`float`, *optional*):
+                Score threshold to keep object detection predictions.
+            target_sizes (`torch.Tensor` or `list[tuple[int, int]]`, *optional*):
+                Tensor of shape `(batch_size, 2)` or list of tuples (`tuple[int, int]`) containing the target size
+                `(height, width)` of each image in the batch. If unset, predictions will not be resized.
+        Returns:
+            `list[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image
+            in the batch as predicted by the model.
+        """
+        out_logits, out_bbox = outputs.logits, outputs.pred_boxes
+
+        if target_sizes is not None:
+            if len(out_logits) != len(target_sizes):
+                raise ValueError(
+                    "Make sure that you pass in as many target sizes as the batch dimension of the logits"
+                )
+
+        prob = nn.functional.softmax(out_logits, -1)
+        scores, labels = prob[..., :-1].max(-1)
+
+        # Convert to [x0, y0, x1, y1] format
+        boxes = center_to_corners_format(out_bbox)
+
+        # Convert from relative [0, 1] to absolute [0, height] coordinates
+        if target_sizes is not None:
+            if isinstance(target_sizes, list):
+                img_h = torch.Tensor([i[0] for i in target_sizes])
+                img_w = torch.Tensor([i[1] for i in target_sizes])
+            else:
+                img_h, img_w = target_sizes.unbind(1)
+
+            scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(boxes.device)
+            boxes = boxes * scale_fct[:, None, :]
+
+        results = []
+        for s, l, b in zip(scores, labels, boxes):
+            score = s[s > threshold]
+            label = l[s > threshold]
+            box = b[s > threshold]
+            results.append({"scores": score, "labels": label, "boxes": box})
+
+        return results
+
+    def post_process_semantic_segmentation(self, outputs, target_sizes: Optional[list[tuple[int, int]]] = None):
+        """
+        Converts the output of [`DetrForSegmentation`] into semantic segmentation maps. Only supports PyTorch.
+
+        Args:
+            outputs ([`DetrForSegmentation`]):
+                Raw outputs of the model.
+            target_sizes (`list[tuple[int, int]]`, *optional*):
+                A list of tuples (`tuple[int, int]`) containing the target size (height, width) of each image in the
+                batch. If unset, predictions will not be resized.
+        Returns:
+            `list[torch.Tensor]`:
+                A list of length `batch_size`, where each item is a semantic segmentation map of shape (height, width)
+                corresponding to the target_sizes entry (if `target_sizes` is specified). Each entry of each
+                `torch.Tensor` correspond to a semantic class id.
+        """
+        class_queries_logits = outputs.logits  # [batch_size, num_queries, num_classes+1]
+        masks_queries_logits = outputs.pred_masks  # [batch_size, num_queries, height, width]
+
+        # Remove the null class `[..., :-1]`
+        masks_classes = class_queries_logits.softmax(dim=-1)[..., :-1]
+        masks_probs = masks_queries_logits.sigmoid()  # [batch_size, num_queries, height, width]
+
+        # Semantic segmentation logits of shape (batch_size, num_classes, height, width)
+        segmentation = torch.einsum("bqc, bqhw -> bchw", masks_classes, masks_probs)
+        batch_size = class_queries_logits.shape[0]
+
+        # Resize logits and compute semantic segmentation maps
+        if target_sizes is not None:
+            if batch_size != len(target_sizes):
+                raise ValueError(
+                    "Make sure that you pass in as many target sizes as the batch dimension of the logits"
+                )
+
+            semantic_segmentation = []
+            for idx in range(batch_size):
+                resized_logits = nn.functional.interpolate(
+                    segmentation[idx].unsqueeze(dim=0), size=target_sizes[idx], mode="bilinear", align_corners=False
+                )
+                semantic_map = resized_logits[0].argmax(dim=0)
+                semantic_segmentation.append(semantic_map)
+        else:
+            semantic_segmentation = segmentation.argmax(dim=1)
+            semantic_segmentation = [semantic_segmentation[i] for i in range(semantic_segmentation.shape[0])]
+
+        return semantic_segmentation
+
+    # inspired by https://github.com/facebookresearch/detr/blob/master/models/segmentation.py#L218
+    def post_process_instance_segmentation(
+        self,
+        outputs,
+        threshold: float = 0.5,
+        mask_threshold: float = 0.5,
+        overlap_mask_area_threshold: float = 0.8,
+        target_sizes: Optional[list[tuple[int, int]]] = None,
+        return_coco_annotation: Optional[bool] = False,
+    ) -> list[dict]:
+        """
+        Converts the output of [`DetrForSegmentation`] into instance segmentation predictions. Only supports PyTorch.
+
+        Args:
+            outputs ([`DetrForSegmentation`]):
+                Raw outputs of the model.
+            threshold (`float`, *optional*, defaults to 0.5):
+                The probability score threshold to keep predicted instance masks.
+            mask_threshold (`float`, *optional*, defaults to 0.5):
+                Threshold to use when turning the predicted masks into binary values.
+            overlap_mask_area_threshold (`float`, *optional*, defaults to 0.8):
+                The overlap mask area threshold to merge or discard small disconnected parts within each binary
+                instance mask.
+            target_sizes (`list[Tuple]`, *optional*):
+                List of length (batch_size), where each list item (`tuple[int, int]]`) corresponds to the requested
+                final size (height, width) of each prediction. If unset, predictions will not be resized.
+            return_coco_annotation (`bool`, *optional*):
+                Defaults to `False`. If set to `True`, segmentation maps are returned in COCO run-length encoding (RLE)
+                format.
+        Returns:
+            `list[Dict]`: A list of dictionaries, one per image, each dictionary containing two keys:
+            - **segmentation** -- A tensor of shape `(height, width)` where each pixel represents a `segment_id` or
+              `list[List]` run-length encoding (RLE) of the segmentation map if return_coco_annotation is set to
+              `True`. Set to `None` if no mask if found above `threshold`.
+            - **segments_info** -- A dictionary that contains additional information on each segment.
+                - **id** -- An integer representing the `segment_id`.
+                - **label_id** -- An integer representing the label / semantic class id corresponding to `segment_id`.
+                - **score** -- Prediction score of segment with `segment_id`.
+        """
+        class_queries_logits = outputs.logits  # [batch_size, num_queries, num_classes+1]
+        masks_queries_logits = outputs.pred_masks  # [batch_size, num_queries, height, width]
+
+        batch_size = class_queries_logits.shape[0]
+        num_labels = class_queries_logits.shape[-1] - 1
+
+        mask_probs = masks_queries_logits.sigmoid()  # [batch_size, num_queries, height, width]
+
+        # Predicted label and score of each query (batch_size, num_queries)
+        pred_scores, pred_labels = nn.functional.softmax(class_queries_logits, dim=-1).max(-1)
+
+        # Loop over items in batch size
+        results: list[dict[str, TensorType]] = []
+
+        for i in range(batch_size):
+            mask_probs_item, pred_scores_item, pred_labels_item = remove_low_and_no_objects(
+                mask_probs[i], pred_scores[i], pred_labels[i], threshold, num_labels
+            )
+
+            # No mask found
+            if mask_probs_item.shape[0] <= 0:
+                height, width = target_sizes[i] if target_sizes is not None else mask_probs_item.shape[1:]
+                segmentation = torch.zeros((height, width)) - 1
+                results.append({"segmentation": segmentation, "segments_info": []})
+                continue
+
+            # Get segmentation map and segment information of batch item
+            target_size = target_sizes[i] if target_sizes is not None else None
+            segmentation, segments = compute_segments(
+                mask_probs=mask_probs_item,
+                pred_scores=pred_scores_item,
+                pred_labels=pred_labels_item,
+                mask_threshold=mask_threshold,
+                overlap_mask_area_threshold=overlap_mask_area_threshold,
+                label_ids_to_fuse=[],
+                target_size=target_size,
+            )
+
+            # Return segmentation map in run-length encoding (RLE) format
+            if return_coco_annotation:
+                segmentation = convert_segmentation_to_rle(segmentation)
+
+            results.append({"segmentation": segmentation, "segments_info": segments})
+        return results
+
+    # inspired by https://github.com/facebookresearch/detr/blob/master/models/segmentation.py#L241
+    def post_process_panoptic_segmentation(
+        self,
+        outputs,
+        threshold: float = 0.5,
+        mask_threshold: float = 0.5,
+        overlap_mask_area_threshold: float = 0.8,
+        label_ids_to_fuse: Optional[set[int]] = None,
+        target_sizes: Optional[list[tuple[int, int]]] = None,
+    ) -> list[dict]:
+        """
+        Converts the output of [`DetrForSegmentation`] into image panoptic segmentation predictions. Only supports
+        PyTorch.
+
+        Args:
+            outputs ([`DetrForSegmentation`]):
+                The outputs from [`DetrForSegmentation`].
+            threshold (`float`, *optional*, defaults to 0.5):
+                The probability score threshold to keep predicted instance masks.
+            mask_threshold (`float`, *optional*, defaults to 0.5):
+                Threshold to use when turning the predicted masks into binary values.
+            overlap_mask_area_threshold (`float`, *optional*, defaults to 0.8):
+                The overlap mask area threshold to merge or discard small disconnected parts within each binary
+                instance mask.
+            label_ids_to_fuse (`Set[int]`, *optional*):
+                The labels in this state will have all their instances be fused together. For instance we could say
+                there can only be one sky in an image, but several persons, so the label ID for sky would be in that
+                set, but not the one for person.
+            target_sizes (`list[Tuple]`, *optional*):
+                List of length (batch_size), where each list item (`tuple[int, int]]`) corresponds to the requested
+                final size (height, width) of each prediction in batch. If unset, predictions will not be resized.
+        Returns:
+            `list[Dict]`: A list of dictionaries, one per image, each dictionary containing two keys:
+            - **segmentation** -- a tensor of shape `(height, width)` where each pixel represents a `segment_id` or
+              `None` if no mask if found above `threshold`. If `target_sizes` is specified, segmentation is resized to
+              the corresponding `target_sizes` entry.
+            - **segments_info** -- A dictionary that contains additional information on each segment.
+                - **id** -- an integer representing the `segment_id`.
+                - **label_id** -- An integer representing the label / semantic class id corresponding to `segment_id`.
+                - **was_fused** -- a boolean, `True` if `label_id` was in `label_ids_to_fuse`, `False` otherwise.
+                  Multiple instances of the same class / label were fused and assigned a single `segment_id`.
+                - **score** -- Prediction score of segment with `segment_id`.
+        """
+
+        if label_ids_to_fuse is None:
+            logger.warning_once("`label_ids_to_fuse` unset. No instance will be fused.")
+            label_ids_to_fuse = set()
+
+        class_queries_logits = outputs.logits  # [batch_size, num_queries, num_classes+1]
+        masks_queries_logits = outputs.pred_masks  # [batch_size, num_queries, height, width]
+
+        batch_size = class_queries_logits.shape[0]
+        num_labels = class_queries_logits.shape[-1] - 1
+
+        mask_probs = masks_queries_logits.sigmoid()  # [batch_size, num_queries, height, width]
+
+        # Predicted label and score of each query (batch_size, num_queries)
+        pred_scores, pred_labels = nn.functional.softmax(class_queries_logits, dim=-1).max(-1)
+
+        # Loop over items in batch size
+        results: list[dict[str, TensorType]] = []
+
+        for i in range(batch_size):
+            mask_probs_item, pred_scores_item, pred_labels_item = remove_low_and_no_objects(
+                mask_probs[i], pred_scores[i], pred_labels[i], threshold, num_labels
+            )
+
+            # No mask found
+            if mask_probs_item.shape[0] <= 0:
+                height, width = target_sizes[i] if target_sizes is not None else mask_probs_item.shape[1:]
+                segmentation = torch.zeros((height, width)) - 1
+                results.append({"segmentation": segmentation, "segments_info": []})
+                continue
+
+            # Get segmentation map and segment information of batch item
+            target_size = target_sizes[i] if target_sizes is not None else None
+            segmentation, segments = compute_segments(
+                mask_probs=mask_probs_item,
+                pred_scores=pred_scores_item,
+                pred_labels=pred_labels_item,
+                mask_threshold=mask_threshold,
+                overlap_mask_area_threshold=overlap_mask_area_threshold,
+                label_ids_to_fuse=label_ids_to_fuse,
+                target_size=target_size,
+            )
+
+            results.append({"segmentation": segmentation, "segments_info": segments})
+        return results
+
+
+__all__ = ["DetrImageProcessor"]
diff --git a/phivenv/Lib/site-packages/transformers/models/detr/image_processing_detr_fast.py b/phivenv/Lib/site-packages/transformers/models/detr/image_processing_detr_fast.py
new file mode 100644
index 0000000000000000000000000000000000000000..37eef3717a9aecfbbdf3497495b294357ce0c6d7
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/detr/image_processing_detr_fast.py
@@ -0,0 +1,1291 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Fast Image processor class for DETR."""
+
+import io
+import pathlib
+from collections import defaultdict
+from typing import Any, Optional, Union
+
+from ...image_processing_utils import BatchFeature, get_size_dict
+from ...image_processing_utils_fast import (
+    BaseImageProcessorFast,
+    DefaultFastImageProcessorKwargs,
+    SizeDict,
+    get_image_size_for_max_height_width,
+    get_max_height_width,
+    safe_squeeze,
+)
+from ...image_transforms import center_to_corners_format, corners_to_center_format, id_to_rgb
+from ...image_utils import (
+    IMAGENET_DEFAULT_MEAN,
+    IMAGENET_DEFAULT_STD,
+    AnnotationFormat,
+    AnnotationType,
+    ChannelDimension,
+    ImageInput,
+    PILImageResampling,
+    get_image_size,
+    validate_annotations,
+)
+from ...processing_utils import Unpack
+from ...utils import (
+    TensorType,
+    auto_docstring,
+    is_torch_available,
+    is_torchvision_available,
+    is_torchvision_v2_available,
+    is_vision_available,
+    logging,
+)
+from ...utils.import_utils import requires
+from .image_processing_detr import (
+    compute_segments,
+    convert_segmentation_to_rle,
+    get_size_with_aspect_ratio,
+    remove_low_and_no_objects,
+)
+
+
+if is_torch_available():
+    import torch
+    from torch import nn
+
+if is_vision_available():
+    import PIL
+
+
+if is_torchvision_v2_available():
+    from torchvision.io import read_image
+    from torchvision.transforms.v2 import functional as F
+
+elif is_torchvision_available():
+    from torchvision.io import read_image
+    from torchvision.transforms import functional as F
+
+
+logger = logging.get_logger(__name__)
+
+SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC)
+
+
+# inspired by https://github.com/facebookresearch/detr/blob/master/datasets/coco.py#L33
+def convert_coco_poly_to_mask(segmentations, height: int, width: int, device: torch.device) -> torch.Tensor:
+    """
+    Convert a COCO polygon annotation to a mask.
+
+    Args:
+        segmentations (`list[list[float]]`):
+            List of polygons, each polygon represented by a list of x-y coordinates.
+        height (`int`):
+            Height of the mask.
+        width (`int`):
+            Width of the mask.
+    """
+    try:
+        from pycocotools import mask as coco_mask
+    except ImportError:
+        raise ImportError("Pycocotools is not installed in your environment.")
+
+    masks = []
+    for polygons in segmentations:
+        rles = coco_mask.frPyObjects(polygons, height, width)
+        mask = coco_mask.decode(rles)
+        if len(mask.shape) < 3:
+            mask = mask[..., None]
+        mask = torch.as_tensor(mask, dtype=torch.uint8, device=device)
+        mask = torch.any(mask, axis=2)
+        masks.append(mask)
+    if masks:
+        masks = torch.stack(masks, axis=0)
+    else:
+        masks = torch.zeros((0, height, width), dtype=torch.uint8, device=device)
+
+    return masks
+
+
+# inspired by https://github.com/facebookresearch/detr/blob/master/datasets/coco.py#L50
+def prepare_coco_detection_annotation(
+    image,
+    target,
+    return_segmentation_masks: bool = False,
+    input_data_format: Optional[Union[ChannelDimension, str]] = None,
+):
+    """
+    Convert the target in COCO format into the format expected by DETR.
+    """
+    image_height, image_width = image.size()[-2:]
+
+    image_id = target["image_id"]
+    image_id = torch.as_tensor([image_id], dtype=torch.int64, device=image.device)
+
+    # Get all COCO annotations for the given image.
+    annotations = target["annotations"]
+    classes = []
+    area = []
+    boxes = []
+    keypoints = []
+    for obj in annotations:
+        if "iscrowd" not in obj or obj["iscrowd"] == 0:
+            classes.append(obj["category_id"])
+            area.append(obj["area"])
+            boxes.append(obj["bbox"])
+            if "keypoints" in obj:
+                keypoints.append(obj["keypoints"])
+
+    classes = torch.as_tensor(classes, dtype=torch.int64, device=image.device)
+    area = torch.as_tensor(area, dtype=torch.float32, device=image.device)
+    iscrowd = torch.zeros_like(classes, dtype=torch.int64, device=image.device)
+    # guard against no boxes via resizing
+    boxes = torch.as_tensor(boxes, dtype=torch.float32, device=image.device).reshape(-1, 4)
+    boxes[:, 2:] += boxes[:, :2]
+    boxes[:, 0::2] = boxes[:, 0::2].clip(min=0, max=image_width)
+    boxes[:, 1::2] = boxes[:, 1::2].clip(min=0, max=image_height)
+
+    keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0])
+
+    new_target = {
+        "image_id": image_id,
+        "class_labels": classes[keep],
+        "boxes": boxes[keep],
+        "area": area[keep],
+        "iscrowd": iscrowd[keep],
+        "orig_size": torch.as_tensor([int(image_height), int(image_width)], dtype=torch.int64, device=image.device),
+    }
+
+    if keypoints:
+        keypoints = torch.as_tensor(keypoints, dtype=torch.float32, device=image.device)
+        # Apply the keep mask here to filter the relevant annotations
+        keypoints = keypoints[keep]
+        num_keypoints = keypoints.shape[0]
+        keypoints = keypoints.reshape((-1, 3)) if num_keypoints else keypoints
+        new_target["keypoints"] = keypoints
+
+    if return_segmentation_masks:
+        segmentation_masks = [obj["segmentation"] for obj in annotations]
+        masks = convert_coco_poly_to_mask(segmentation_masks, image_height, image_width, device=image.device)
+        new_target["masks"] = masks[keep]
+
+    return new_target
+
+
+def masks_to_boxes(masks: torch.Tensor) -> torch.Tensor:
+    """
+    Compute the bounding boxes around the provided panoptic segmentation masks.
+
+    Args:
+        masks: masks in format `[number_masks, height, width]` where N is the number of masks
+
+    Returns:
+        boxes: bounding boxes in format `[number_masks, 4]` in xyxy format
+    """
+    if masks.numel() == 0:
+        return torch.zeros((0, 4), device=masks.device)
+
+    h, w = masks.shape[-2:]
+    y = torch.arange(0, h, dtype=torch.float32, device=masks.device)
+    x = torch.arange(0, w, dtype=torch.float32, device=masks.device)
+    # see https://github.com/pytorch/pytorch/issues/50276
+    y, x = torch.meshgrid(y, x, indexing="ij")
+
+    x_mask = masks * torch.unsqueeze(x, 0)
+    x_max = x_mask.view(x_mask.shape[0], -1).max(-1)[0]
+    x_min = (
+        torch.where(masks, x.unsqueeze(0), torch.tensor(1e8, device=masks.device)).view(masks.shape[0], -1).min(-1)[0]
+    )
+
+    y_mask = masks * torch.unsqueeze(y, 0)
+    y_max = y_mask.view(y_mask.shape[0], -1).max(-1)[0]
+    y_min = (
+        torch.where(masks, y.unsqueeze(0), torch.tensor(1e8, device=masks.device)).view(masks.shape[0], -1).min(-1)[0]
+    )
+
+    return torch.stack([x_min, y_min, x_max, y_max], 1)
+
+
+# 2 functions below adapted from https://github.com/cocodataset/panopticapi/blob/master/panopticapi/utils.py
+# Copyright (c) 2018, Alexander Kirillov
+# All rights reserved.
+def rgb_to_id(color):
+    """
+    Converts RGB color to unique ID.
+    """
+    if isinstance(color, torch.Tensor) and len(color.shape) == 3:
+        if color.dtype == torch.uint8:
+            color = color.to(torch.int32)
+        return color[:, :, 0] + 256 * color[:, :, 1] + 256 * 256 * color[:, :, 2]
+    return int(color[0] + 256 * color[1] + 256 * 256 * color[2])
+
+
+def prepare_coco_panoptic_annotation(
+    image: torch.Tensor,
+    target: dict,
+    masks_path: Union[str, pathlib.Path],
+    return_masks: bool = True,
+    input_data_format: Union[ChannelDimension, str] = None,
+) -> dict:
+    """
+    Prepare a coco panoptic annotation for DETR.
+    """
+    image_height, image_width = get_image_size(image, channel_dim=input_data_format)
+    annotation_path = pathlib.Path(masks_path) / target["file_name"]
+
+    new_target = {}
+    new_target["image_id"] = torch.as_tensor(
+        [target["image_id"] if "image_id" in target else target["id"]], dtype=torch.int64, device=image.device
+    )
+    new_target["size"] = torch.as_tensor([image_height, image_width], dtype=torch.int64, device=image.device)
+    new_target["orig_size"] = torch.as_tensor([image_height, image_width], dtype=torch.int64, device=image.device)
+
+    if "segments_info" in target:
+        masks = read_image(annotation_path).permute(1, 2, 0).to(dtype=torch.int32, device=image.device)
+        masks = rgb_to_id(masks)
+
+        ids = torch.as_tensor([segment_info["id"] for segment_info in target["segments_info"]], device=image.device)
+        masks = masks == ids[:, None, None]
+        masks = masks.to(torch.bool)
+        if return_masks:
+            new_target["masks"] = masks
+        new_target["boxes"] = masks_to_boxes(masks)
+        new_target["class_labels"] = torch.as_tensor(
+            [segment_info["category_id"] for segment_info in target["segments_info"]],
+            dtype=torch.int64,
+            device=image.device,
+        )
+        new_target["iscrowd"] = torch.as_tensor(
+            [segment_info["iscrowd"] for segment_info in target["segments_info"]],
+            dtype=torch.int64,
+            device=image.device,
+        )
+        new_target["area"] = torch.as_tensor(
+            [segment_info["area"] for segment_info in target["segments_info"]],
+            dtype=torch.float32,
+            device=image.device,
+        )
+
+    return new_target
+
+
+class DetrFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
+    r"""
+    format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`):
+        Data format of the annotations. One of "coco_detection" or "coco_panoptic".
+    do_convert_annotations (`bool`, *optional*, defaults to `True`):
+        Controls whether to convert the annotations to the format expected by the DETR model. Converts the
+        bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`.
+        Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
+    do_pad (`bool`, *optional*, defaults to `True`):
+        Controls whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess`
+        method. If `True`, padding will be applied to the bottom and right of the image with zeros.
+        If `pad_size` is provided, the image will be padded to the specified dimensions.
+        Otherwise, the image will be padded to the maximum height and width of the batch.
+    pad_size (`dict[str, int]`, *optional*):
+        The size `{"height": int, "width": int}` to pad the images to. Must be larger than any image size
+        provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
+        height and width in the batch.
+    return_segmentation_masks (`bool`, *optional*, defaults to `False`):
+        Whether to return segmentation masks.
+    """
+
+    format: Optional[Union[str, AnnotationFormat]]
+    do_convert_annotations: Optional[bool]
+    do_pad: Optional[bool]
+    pad_size: Optional[dict[str, int]]
+    return_segmentation_masks: Optional[bool]
+
+
+@auto_docstring
+@requires(backends=("torchvision", "torch"))
+class DetrImageProcessorFast(BaseImageProcessorFast):
+    resample = PILImageResampling.BILINEAR
+    image_mean = IMAGENET_DEFAULT_MEAN
+    image_std = IMAGENET_DEFAULT_STD
+    format = AnnotationFormat.COCO_DETECTION
+    do_resize = True
+    do_rescale = True
+    do_normalize = True
+    do_pad = True
+    size = {"shortest_edge": 800, "longest_edge": 1333}
+    default_to_square = False
+    model_input_names = ["pixel_values", "pixel_mask"]
+    valid_kwargs = DetrFastImageProcessorKwargs
+
+    def __init__(self, **kwargs: Unpack[DetrFastImageProcessorKwargs]) -> None:
+        """Build the processor, translating the legacy `pad_and_return_pixel_mask` and `max_size` kwargs first."""
+        if "pad_and_return_pixel_mask" in kwargs:
+            kwargs["do_pad"] = kwargs.pop("pad_and_return_pixel_mask")
+
+        size = kwargs.pop("size", None)
+        if "max_size" in kwargs:
+            logger.warning_once(
+                "The `max_size` parameter is deprecated and will be removed in v4.26. "
+                "Please specify in `size['longest_edge'] instead`.",
+            )
+            max_size = kwargs.pop("max_size")
+        else:
+            max_size = None if size is None else 1333
+
+        size = size if size is not None else {"shortest_edge": 800, "longest_edge": 1333}
+        self.size = get_size_dict(size, max_size=max_size, default_to_square=False)
+
+        # Backwards compatibility: if `do_convert_annotations` is unset everywhere, it defaults to `do_normalize`
+        do_convert_annotations = kwargs.get("do_convert_annotations")
+        do_normalize = kwargs.get("do_normalize")
+        if do_convert_annotations is None and getattr(self, "do_convert_annotations", None) is None:
+            self.do_convert_annotations = do_normalize if do_normalize is not None else self.do_normalize
+        super().__init__(**kwargs)
+
+    @classmethod
+    def from_dict(cls, image_processor_dict: dict[str, Any], **kwargs):
+        """
+        Overrides the `from_dict` method from the base class to make sure parameters are updated if image processor is
+        created using from_dict and kwargs e.g. `DetrImageProcessorFast.from_pretrained(checkpoint, size=600,
+        max_size=800)`
+        """
+        image_processor_dict = image_processor_dict.copy()
+        if "max_size" in kwargs:
+            image_processor_dict["max_size"] = kwargs.pop("max_size")
+        if "pad_and_return_pixel_mask" in kwargs:
+            image_processor_dict["pad_and_return_pixel_mask"] = kwargs.pop("pad_and_return_pixel_mask")
+        return super().from_dict(image_processor_dict, **kwargs)
+
+    def prepare_annotation(
+        self,
+        image: torch.Tensor,
+        target: dict,
+        format: Optional[AnnotationFormat] = None,
+        return_segmentation_masks: Optional[bool] = None,
+        masks_path: Optional[Union[str, pathlib.Path]] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    ) -> dict:
+        """
+        Prepare an annotation for feeding into DETR model.
+        """
+        format = format if format is not None else self.format
+
+        if format == AnnotationFormat.COCO_DETECTION:
+            return_segmentation_masks = False if return_segmentation_masks is None else return_segmentation_masks
+            target = prepare_coco_detection_annotation(
+                image, target, return_segmentation_masks, input_data_format=input_data_format
+            )
+        elif format == AnnotationFormat.COCO_PANOPTIC:
+            return_segmentation_masks = True if return_segmentation_masks is None else return_segmentation_masks
+            target = prepare_coco_panoptic_annotation(
+                image,
+                target,
+                masks_path=masks_path,
+                return_masks=return_segmentation_masks,
+                input_data_format=input_data_format,
+            )
+        else:
+            raise ValueError(f"Format {format} is not supported.")
+        return target
+
+    def resize(
+        self,
+        image: torch.Tensor,
+        size: SizeDict,
+        interpolation: "F.InterpolationMode" = None,
+        **kwargs,
+    ) -> torch.Tensor:
+        """
+        Resize the image to the given size. Size can be `min_size` (scalar) or `(height, width)` tuple. If size is an
+        int, smaller edge of the image will be matched to this number.
+
+        Args:
+            image (`torch.Tensor`):
+                Image to resize.
+            size (`SizeDict`):
+                Size of the image's `(height, width)` dimensions after resizing. Available options are:
+                    - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`.
+                        Do NOT keep the aspect ratio.
+                    - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting
+                        the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge
+                        less or equal to `longest_edge`.
+                    - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the
+                        aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to
+                        `max_width`.
+            interpolation (`InterpolationMode`, *optional*, defaults to `InterpolationMode.BILINEAR`):
+                Resampling filter to use if resizing the image.
+        """
+        interpolation = interpolation if interpolation is not None else F.InterpolationMode.BILINEAR
+        if size.shortest_edge and size.longest_edge:
+            # Resize the image so that the shortest edge or the longest edge is of the given size
+            # while maintaining the aspect ratio of the original image.
+            new_size = get_size_with_aspect_ratio(
+                image.size()[-2:],
+                size["shortest_edge"],
+                size["longest_edge"],
+            )
+        elif size.max_height and size.max_width:
+            new_size = get_image_size_for_max_height_width(image.size()[-2:], size["max_height"], size["max_width"])
+        elif size.height and size.width:
+            new_size = (size["height"], size["width"])
+        else:
+            raise ValueError(
+                "Size must contain 'height' and 'width' keys or 'shortest_edge' and 'longest_edge' keys. Got"
+                f" {size.keys()}."
+            )
+
+        image = F.resize(
+            image,
+            size=new_size,
+            interpolation=interpolation,
+            **kwargs,
+        )
+        return image
+
+    def resize_annotation(
+        self,
+        annotation: dict[str, Any],
+        orig_size: tuple[int, int],
+        target_size: tuple[int, int],
+        threshold: float = 0.5,
+        interpolation: "F.InterpolationMode" = None,
+    ):
+        """
+        Resizes an annotation to a target size.
+
+        Args:
+            annotation (`dict[str, Any]`):
+                The annotation dictionary.
+            orig_size (`tuple[int, int]`):
+                The original size of the input image.
+            target_size (`tuple[int, int]`):
+                The target size of the image, as returned by the preprocessing `resize` step.
+            threshold (`float`, *optional*, defaults to 0.5):
+                The threshold used to binarize the segmentation masks.
+            resample (`InterpolationMode`, defaults to `F.InterpolationMode.NEAREST_EXACT`):
+                The resampling filter to use when resizing the masks.
+        """
+        interpolation = (
+            interpolation
+            if interpolation is not None
+            else F.InterpolationMode.NEAREST_EXACT
+            if is_torchvision_v2_available()
+            else F.InterpolationMode.NEAREST
+        )
+        ratio_height, ratio_width = [target / orig for target, orig in zip(target_size, orig_size)]
+
+        new_annotation = {}
+        new_annotation["size"] = target_size
+
+        for key, value in annotation.items():
+            if key == "boxes":
+                boxes = value
+                scaled_boxes = boxes * torch.as_tensor(
+                    [ratio_width, ratio_height, ratio_width, ratio_height], dtype=torch.float32, device=boxes.device
+                )
+                new_annotation["boxes"] = scaled_boxes
+            elif key == "area":
+                area = value
+                scaled_area = area * (ratio_width * ratio_height)
+                new_annotation["area"] = scaled_area
+            elif key == "masks":
+                masks = value[:, None]
+                masks = [F.resize(mask, target_size, interpolation=interpolation) for mask in masks]
+                masks = torch.stack(masks).to(torch.float32)
+                masks = masks[:, 0] > threshold
+                new_annotation["masks"] = masks
+            elif key == "size":
+                new_annotation["size"] = target_size
+            else:
+                new_annotation[key] = value
+
+        return new_annotation
+
+    def normalize_annotation(self, annotation: dict, image_size: tuple[int, int]) -> dict:
+        image_height, image_width = image_size
+        norm_annotation = {}
+        for key, value in annotation.items():
+            if key == "boxes":
+                boxes = value
+                boxes = corners_to_center_format(boxes)
+                boxes /= torch.as_tensor(
+                    [image_width, image_height, image_width, image_height], dtype=torch.float32, device=boxes.device
+                )
+                norm_annotation[key] = boxes
+            else:
+                norm_annotation[key] = value
+        return norm_annotation
+
+    def _update_annotation_for_padded_image(
+        self,
+        annotation: dict,
+        input_image_size: tuple[int, int],
+        output_image_size: tuple[int, int],
+        padding,
+        update_bboxes,
+    ) -> dict:
+        """
+        Update the annotation for a padded image.
+        """
+        new_annotation = {}
+        new_annotation["size"] = output_image_size
+        ratio_height, ratio_width = (input / output for output, input in zip(output_image_size, input_image_size))
+
+        for key, value in annotation.items():
+            if key == "masks":
+                masks = value
+                masks = F.pad(
+                    masks,
+                    padding,
+                    fill=0,
+                )
+                masks = safe_squeeze(masks, 1)
+                new_annotation["masks"] = masks
+            elif key == "boxes" and update_bboxes:
+                boxes = value
+                boxes *= torch.as_tensor([ratio_width, ratio_height, ratio_width, ratio_height], device=boxes.device)
+                new_annotation["boxes"] = boxes
+            elif key == "size":
+                new_annotation["size"] = output_image_size
+            else:
+                new_annotation[key] = value
+        return new_annotation
+
+    def pad(
+        self,
+        image: torch.Tensor,
+        padded_size: tuple[int, int],
+        annotation: Optional[dict[str, Any]] = None,
+        update_bboxes: bool = True,
+        fill: int = 0,
+    ):
+        original_size = image.size()[-2:]
+        padding_bottom = padded_size[0] - original_size[0]
+        padding_right = padded_size[1] - original_size[1]
+        if padding_bottom < 0 or padding_right < 0:
+            raise ValueError(
+                f"Padding dimensions are negative. Please make sure that the padded size is larger than the "
+                f"original size. Got padded size: {padded_size}, original size: {original_size}."
+            )
+        if original_size != padded_size:
+            padding = [0, 0, padding_right, padding_bottom]
+            image = F.pad(image, padding, fill=fill)
+            if annotation is not None:
+                annotation = self._update_annotation_for_padded_image(
+                    annotation, original_size, padded_size, padding, update_bboxes
+                )
+
+        # Make a pixel mask for the image, where 1 indicates a valid pixel and 0 indicates padding.
+        pixel_mask = torch.zeros(padded_size, dtype=torch.int64, device=image.device)
+        pixel_mask[: original_size[0], : original_size[1]] = 1
+
+        return image, pixel_mask, annotation
+
+    @auto_docstring
+    def preprocess(
+        self,
+        images: ImageInput,
+        annotations: Optional[Union[AnnotationType, list[AnnotationType]]] = None,
+        masks_path: Optional[Union[str, pathlib.Path]] = None,
+        **kwargs: Unpack[DetrFastImageProcessorKwargs],
+    ) -> BatchFeature:
+        r"""
+        annotations (`AnnotationType` or `list[AnnotationType]`, *optional*):
+            List of annotations associated with the image or batch of images. If annotation is for object
+            detection, the annotations should be a dictionary with the following keys:
+            - "image_id" (`int`): The image id.
+            - "annotations" (`list[Dict]`): List of annotations for an image. Each annotation should be a
+                dictionary. An image can have no annotations, in which case the list should be empty.
+            If annotation is for segmentation, the annotations should be a dictionary with the following keys:
+            - "image_id" (`int`): The image id.
+            - "segments_info" (`list[Dict]`): List of segments for an image. Each segment should be a dictionary.
+                An image can have no segments, in which case the list should be empty.
+            - "file_name" (`str`): The file name of the image.
+        masks_path (`str` or `pathlib.Path`, *optional*):
+            Path to the directory containing the segmentation masks.
+        """
+        if "pad_and_return_pixel_mask" in kwargs:
+            kwargs["do_pad"] = kwargs.pop("pad_and_return_pixel_mask")
+            logger.warning_once(
+                "The `pad_and_return_pixel_mask` argument is deprecated and will be removed in a future version, "
+                "use `do_pad` instead."
+            )
+
+        if "max_size" in kwargs:
+            logger.warning_once(
+                "The `max_size` argument is deprecated and will be removed in a future version, use"
+                " `size['longest_edge']` instead."
+            )
+            kwargs["size"] = kwargs.pop("max_size")
+
+        return super().preprocess(images, annotations, masks_path, **kwargs)
+
+    def _preprocess(
+        self,
+        images: list["torch.Tensor"],
+        annotations: Optional[Union[AnnotationType, list[AnnotationType]]],
+        masks_path: Optional[Union[str, pathlib.Path]],
+        return_segmentation_masks: bool,
+        do_resize: bool,
+        size: SizeDict,
+        interpolation: Optional["F.InterpolationMode"],
+        do_rescale: bool,
+        rescale_factor: float,
+        do_normalize: bool,
+        do_convert_annotations: bool,
+        image_mean: Optional[Union[float, list[float]]],
+        image_std: Optional[Union[float, list[float]]],
+        do_pad: bool,
+        pad_size: Optional[dict[str, int]],
+        format: Optional[Union[str, AnnotationFormat]],
+        return_tensors: Optional[Union[str, TensorType]],
+        **kwargs,
+    ) -> BatchFeature:
+        """
+        Preprocess an image or a batch of images so that it can be used by the model.
+
+        Each image goes through, in order: annotation preparation (when annotations are given), optional
+        resizing of the image and its annotation, fused rescale/normalize, and optional annotation
+        normalization. When `do_pad` is set, a second pass pads every image to a common size and builds the
+        pixel masks.
+
+        Returns:
+            `BatchFeature`: Holds `"pixel_values"` (the stacked images), `"pixel_mask"` when `do_pad` is set,
+            and `"labels"` (one `BatchFeature` per image) when annotations were provided.
+
+        Raises:
+            ValueError: If the number of images and annotations differ, or if `masks_path` is neither a
+            `pathlib.Path` nor a string while the format is COCO panoptic.
+        """
+        # A single annotation dict is treated as a batch of one.
+        if annotations is not None and isinstance(annotations, dict):
+            annotations = [annotations]
+
+        if annotations is not None and len(images) != len(annotations):
+            raise ValueError(
+                f"The number of images ({len(images)}) and annotations ({len(annotations)}) do not match."
+            )
+
+        format = AnnotationFormat(format)
+        if annotations is not None:
+            validate_annotations(format, SUPPORTED_ANNOTATION_FORMATS, annotations)
+
+        if (
+            masks_path is not None
+            and format == AnnotationFormat.COCO_PANOPTIC
+            and not isinstance(masks_path, (pathlib.Path, str))
+        ):
+            raise ValueError(
+                "The path to the directory containing the mask PNG files should be provided as a"
+                f" `pathlib.Path` or string object, but is {type(masks_path)} instead."
+            )
+
+        data = {}
+
+        processed_images = []
+        processed_annotations = []
+        pixel_masks = []  # Only filled (and returned) when `do_pad` is set
+        # Without annotations, pair every image with `None` so the loop body stays uniform.
+        for image, annotation in zip(images, annotations if annotations is not None else [None] * len(images)):
+            # prepare (COCO annotations as a list of Dict -> DETR target as a single Dict per image)
+            if annotations is not None:
+                annotation = self.prepare_annotation(
+                    image,
+                    annotation,
+                    format,
+                    return_segmentation_masks=return_segmentation_masks,
+                    masks_path=masks_path,
+                    input_data_format=ChannelDimension.FIRST,
+                )
+
+            if do_resize:
+                resized_image = self.resize(image, size=size, interpolation=interpolation)
+                if annotations is not None:
+                    # The annotation is rescaled using the sizes before and after the image resize.
+                    annotation = self.resize_annotation(
+                        annotation,
+                        orig_size=image.size()[-2:],
+                        target_size=resized_image.size()[-2:],
+                    )
+                image = resized_image
+            # Fused rescale and normalize
+            image = self.rescale_and_normalize(image, do_rescale, rescale_factor, do_normalize, image_mean, image_std)
+            if do_convert_annotations and annotations is not None:
+                annotation = self.normalize_annotation(annotation, get_image_size(image, ChannelDimension.FIRST))
+
+            processed_images.append(image)
+            processed_annotations.append(annotation)
+        images = processed_images
+        annotations = processed_annotations if annotations is not None else None
+
+        if do_pad:
+            # depends on all resized image shapes so we need another loop
+            if pad_size is not None:
+                padded_size = (pad_size["height"], pad_size["width"])
+            else:
+                padded_size = get_max_height_width(images)
+
+            padded_images = []
+            padded_annotations = []
+            for image, annotation in zip(images, annotations if annotations is not None else [None] * len(images)):
+                # Pads images and returns their mask: {'pixel_values': ..., 'pixel_mask': ...}
+                if padded_size == image.size()[-2:]:
+                    # Already at the target size: keep the image as is and mark every pixel as valid.
+                    padded_images.append(image)
+                    pixel_masks.append(torch.ones(padded_size, dtype=torch.int64, device=image.device))
+                    padded_annotations.append(annotation)
+                    continue
+                image, pixel_mask, annotation = self.pad(
+                    image, padded_size, annotation=annotation, update_bboxes=do_convert_annotations
+                )
+                padded_images.append(image)
+                padded_annotations.append(annotation)
+                pixel_masks.append(pixel_mask)
+            images = padded_images
+            annotations = padded_annotations if annotations is not None else None
+            data.update({"pixel_mask": torch.stack(pixel_masks, dim=0)})
+
+        data.update({"pixel_values": torch.stack(images, dim=0)})
+        encoded_inputs = BatchFeature(data, tensor_type=return_tensors)
+        if annotations is not None:
+            # Labels stay a per-image list instead of being stacked into one tensor.
+            encoded_inputs["labels"] = [
+                BatchFeature(annotation, tensor_type=return_tensors) for annotation in annotations
+            ]
+        return encoded_inputs
+
+    # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.post_process
+    def post_process(self, outputs, target_sizes):
+        """
+        Converts the raw output of [`DetrForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y,
+        bottom_right_x, bottom_right_y) format. Only supports PyTorch.
+
+        Args:
+            outputs ([`DetrObjectDetectionOutput`]):
+                Raw outputs of the model.
+            target_sizes (`torch.Tensor` of shape `(batch_size, 2)`):
+                Tensor containing the size (height, width) of each image of the batch. For evaluation, this must be the
+                original image size (before any data augmentation). For visualization, this should be the image size
+                after data augment, but before padding.
+        Returns:
+            `list[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image
+            in the batch as predicted by the model.
+        """
+        logger.warning_once(
+            "`post_process` is deprecated and will be removed in v5 of Transformers, please use"
+            " `post_process_object_detection` instead, with `threshold=0.` for equivalent results.",
+        )
+
+        out_logits, out_bbox = outputs.logits, outputs.pred_boxes
+
+        if len(out_logits) != len(target_sizes):
+            raise ValueError("Make sure that you pass in as many target sizes as the batch dimension of the logits")
+        if target_sizes.shape[1] != 2:
+            raise ValueError("Each element of target_sizes must contain the size (h, w) of each image of the batch")
+
+        prob = nn.functional.softmax(out_logits, -1)
+        scores, labels = prob[..., :-1].max(-1)
+
+        # convert to [x0, y0, x1, y1] format
+        boxes = center_to_corners_format(out_bbox)
+        # and from relative [0, 1] to absolute [0, height] coordinates
+        img_h, img_w = target_sizes.unbind(1)
+        scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(boxes.device)
+        boxes = boxes * scale_fct[:, None, :]
+
+        results = [{"scores": s, "labels": l, "boxes": b} for s, l, b in zip(scores, labels, boxes)]
+        return results
+
+    # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.post_process_segmentation
+    def post_process_segmentation(self, outputs, target_sizes, threshold=0.9, mask_threshold=0.5):
+        """
+        Converts the output of [`DetrForSegmentation`] into image segmentation predictions. Only supports PyTorch.
+
+        Args:
+            outputs ([`DetrSegmentationOutput`]):
+                Raw outputs of the model.
+            target_sizes (`torch.Tensor` of shape `(batch_size, 2)` or `list[Tuple]` of length `batch_size`):
+                Torch Tensor (or list) corresponding to the requested final size (h, w) of each prediction.
+            threshold (`float`, *optional*, defaults to 0.9):
+                Threshold to use to filter out queries.
+            mask_threshold (`float`, *optional*, defaults to 0.5):
+                Threshold to use when turning the predicted masks into binary values.
+        Returns:
+            `list[Dict]`: A list of dictionaries, each dictionary containing the scores, labels, and masks for an image
+            in the batch as predicted by the model.
+        """
+        logger.warning_once(
+            "`post_process_segmentation` is deprecated and will be removed in v5 of Transformers, please use"
+            " `post_process_semantic_segmentation`.",
+        )
+        out_logits, raw_masks = outputs.logits, outputs.pred_masks
+        empty_label = out_logits.shape[-1] - 1
+        preds = []
+
+        def to_tuple(tup):
+            if isinstance(tup, tuple):
+                return tup
+            return tuple(tup.tolist())
+
+        for cur_logits, cur_masks, size in zip(out_logits, raw_masks, target_sizes):
+            # we filter empty queries and detection below threshold
+            cur_scores, cur_labels = cur_logits.softmax(-1).max(-1)
+            keep = cur_labels.ne(empty_label) & (cur_scores > threshold)
+            cur_scores = cur_scores[keep]
+            cur_labels = cur_labels[keep]
+            cur_masks = cur_masks[keep]
+            cur_masks = nn.functional.interpolate(cur_masks[:, None], to_tuple(size), mode="bilinear").squeeze(1)
+            cur_masks = (cur_masks.sigmoid() > mask_threshold) * 1
+
+            predictions = {"scores": cur_scores, "labels": cur_labels, "masks": cur_masks}
+            preds.append(predictions)
+        return preds
+
+    # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.post_process_instance
+    def post_process_instance(self, results, outputs, orig_target_sizes, max_target_sizes, threshold=0.5):
+        """
+        Converts the output of [`DetrForSegmentation`] into actual instance segmentation predictions. Only supports
+        PyTorch.
+
+        Args:
+            results (`list[Dict]`):
+                Results list obtained by [`~DetrImageProcessor.post_process`], to which "masks" results will be added.
+            outputs ([`DetrSegmentationOutput`]):
+                Raw outputs of the model.
+            orig_target_sizes (`torch.Tensor` of shape `(batch_size, 2)`):
+                Tensor containing the size (h, w) of each image of the batch. For evaluation, this must be the original
+                image size (before any data augmentation).
+            max_target_sizes (`torch.Tensor` of shape `(batch_size, 2)`):
+                Tensor containing the maximum size (h, w) of each image of the batch. For evaluation, this must be the
+                original image size (before any data augmentation).
+            threshold (`float`, *optional*, defaults to 0.5):
+                Threshold to use when turning the predicted masks into binary values.
+        Returns:
+            `list[Dict]`: A list of dictionaries, each dictionary containing the scores, labels, boxes and masks for an
+            image in the batch as predicted by the model.
+        """
+        logger.warning_once(
+            "`post_process_instance` is deprecated and will be removed in v5 of Transformers, please use"
+            " `post_process_instance_segmentation`.",
+        )
+
+        if len(orig_target_sizes) != len(max_target_sizes):
+            raise ValueError("Make sure to pass in as many orig_target_sizes as max_target_sizes")
+        max_h, max_w = max_target_sizes.max(0)[0].tolist()
+        outputs_masks = outputs.pred_masks.squeeze(2)
+        outputs_masks = nn.functional.interpolate(
+            outputs_masks, size=(max_h, max_w), mode="bilinear", align_corners=False
+        )
+        outputs_masks = (outputs_masks.sigmoid() > threshold).cpu()
+
+        for i, (cur_mask, t, tt) in enumerate(zip(outputs_masks, max_target_sizes, orig_target_sizes)):
+            img_h, img_w = t[0], t[1]
+            results[i]["masks"] = cur_mask[:, :img_h, :img_w].unsqueeze(1)
+            results[i]["masks"] = nn.functional.interpolate(
+                results[i]["masks"].float(), size=tuple(tt.tolist()), mode="nearest"
+            ).byte()
+
+        return results
+
+    # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.post_process_panoptic
+    def post_process_panoptic(self, outputs, processed_sizes, target_sizes=None, is_thing_map=None, threshold=0.85):
+        """
+        Converts the output of [`DetrForSegmentation`] into actual panoptic predictions. Only supports PyTorch.
+        Deprecated: a warning pointing to `post_process_panoptic_segmentation` is logged on use.
+
+        Args:
+            outputs ([`DetrSegmentationOutput`]):
+                Raw outputs of the model.
+            processed_sizes (`torch.Tensor` of shape `(batch_size, 2)` or `list[Tuple]` of length `batch_size`):
+                Torch Tensor (or list) containing the size (h, w) of each image of the batch, i.e. the size after data
+                augmentation but before batching.
+            target_sizes (`torch.Tensor` of shape `(batch_size, 2)` or `list[Tuple]` of length `batch_size`, *optional*):
+                Torch Tensor (or list) corresponding to the requested final size `(height, width)` of each prediction.
+                If left to None, it will default to the `processed_sizes`.
+            is_thing_map (`dict[int, bool]`, *optional*):
+                Dictionary mapping class indices to either True or False, depending on whether or not they are a thing.
+                If not set, defaults to the `is_thing_map` of COCO panoptic.
+            threshold (`float`, *optional*, defaults to 0.85):
+                Threshold to use to filter out queries.
+        Returns:
+            `list[Dict]`: A list of dictionaries, each dictionary containing a PNG string and segments_info values for
+            an image in the batch as predicted by the model.
+        Raises:
+            ValueError: If the batch sizes of `processed_sizes`, `target_sizes`, the logits and the masks disagree,
+            or if the number of kept boxes differs from the number of kept labels.
+        """
+        logger.warning_once(
+            "`post_process_panoptic is deprecated and will be removed in v5 of Transformers, please use"
+            " `post_process_panoptic_segmentation`.",
+        )
+        if target_sizes is None:
+            target_sizes = processed_sizes
+        if len(processed_sizes) != len(target_sizes):
+            raise ValueError("Make sure to pass in as many processed_sizes as target_sizes")
+
+        if is_thing_map is None:
+            # default to is_thing_map of COCO panoptic
+            is_thing_map = {i: i <= 90 for i in range(201)}
+
+        out_logits, raw_masks, raw_boxes = outputs.logits, outputs.pred_masks, outputs.pred_boxes
+        if not len(out_logits) == len(raw_masks) == len(target_sizes):
+            raise ValueError(
+                "Make sure that you pass in as many target sizes as the batch dimension of the logits and masks"
+            )
+        # The last class index marks an empty query.
+        empty_label = out_logits.shape[-1] - 1
+        preds = []
+
+        def to_tuple(tup):
+            # Sizes may be given as tuples or as tensors; normalize to a tuple.
+            if isinstance(tup, tuple):
+                return tup
+            return tuple(tup.tolist())
+
+        for cur_logits, cur_masks, cur_boxes, size, target_size in zip(
+            out_logits, raw_masks, raw_boxes, processed_sizes, target_sizes
+        ):
+            # we filter empty queries and detection below threshold
+            cur_scores, cur_labels = cur_logits.softmax(-1).max(-1)
+            keep = cur_labels.ne(empty_label) & (cur_scores > threshold)
+            cur_scores = cur_scores[keep]
+            cur_labels = cur_labels[keep]
+            cur_masks = cur_masks[keep]
+            cur_masks = nn.functional.interpolate(cur_masks[:, None], to_tuple(size), mode="bilinear").squeeze(1)
+            cur_boxes = center_to_corners_format(cur_boxes[keep])
+
+            h, w = cur_masks.shape[-2:]
+            if len(cur_boxes) != len(cur_labels):
+                raise ValueError("Not as many boxes as there are classes")
+
+            # It may be that we have several predicted masks for the same stuff class.
+            # In the following, we track the list of masks ids for each stuff class (they are merged later on)
+            cur_masks = cur_masks.flatten(1)
+            stuff_equiv_classes = defaultdict(lambda: [])
+            for k, label in enumerate(cur_labels):
+                if not is_thing_map[label.item()]:
+                    stuff_equiv_classes[label.item()].append(k)
+
+            def get_ids_area(masks, scores, dedup=False):
+                # This helper function creates the final panoptic segmentation image
+                # It also returns the area of the masks that appears on the image
+                # It reads `h`, `w`, `target_size` and `stuff_equiv_classes` from the enclosing loop iteration.
+
+                # Softmax across the masks, so that each pixel can be assigned to a single mask id below.
+                m_id = masks.transpose(0, 1).softmax(-1)
+
+                if m_id.shape[-1] == 0:
+                    # We didn't detect any mask :(
+                    m_id = torch.zeros((h, w), dtype=torch.long, device=m_id.device)
+                else:
+                    m_id = m_id.argmax(-1).view(h, w)
+
+                if dedup:
+                    # Merge the masks corresponding to the same stuff class
+                    for equiv in stuff_equiv_classes.values():
+                        if len(equiv) > 1:
+                            for eq_id in equiv:
+                                m_id.masked_fill_(m_id.eq(eq_id), equiv[0])
+
+                final_h, final_w = to_tuple(target_size)
+
+                # Encode the id map as an RGB image and resize it to the requested final size.
+                seg_img = PIL.Image.fromarray(id_to_rgb(m_id.view(h, w).cpu().numpy()))
+                seg_img = seg_img.resize(size=(final_w, final_h), resample=PILImageResampling.NEAREST)
+
+                # Read the resized image back and decode it into an id map again.
+                np_seg_img = torch.ByteTensor(torch.ByteStorage.from_buffer(seg_img.tobytes()))
+                np_seg_img = np_seg_img.view(final_h, final_w, 3)
+                np_seg_img = np_seg_img.numpy()
+
+                m_id = torch.from_numpy(rgb_to_id(np_seg_img))
+
+                # Area of a mask = number of pixels carrying its id in the resized map.
+                area = []
+                for i in range(len(scores)):
+                    area.append(m_id.eq(i).sum().item())
+                return area, seg_img
+
+            area, seg_img = get_ids_area(cur_masks, cur_scores, dedup=True)
+            if cur_labels.numel() > 0:
+                # We now filter out the empty masks for as long as we keep finding some
+                while True:
+                    # A mask covering 4 pixels or fewer is treated as empty.
+                    filtered_small = torch.as_tensor(
+                        [area[i] <= 4 for i, c in enumerate(cur_labels)], dtype=torch.bool, device=keep.device
+                    )
+                    if filtered_small.any().item():
+                        cur_scores = cur_scores[~filtered_small]
+                        cur_labels = cur_labels[~filtered_small]
+                        cur_masks = cur_masks[~filtered_small]
+                        # Removing masks changes the per-pixel assignment, so the areas are recomputed.
+                        area, seg_img = get_ids_area(cur_masks, cur_scores)
+                    else:
+                        break
+
+            else:
+                # Nothing was kept: fall back to a single placeholder label with value 1.
+                cur_labels = torch.ones(1, dtype=torch.long, device=cur_labels.device)
+
+            segments_info = []
+            for i, a in enumerate(area):
+                cat = cur_labels[i].item()
+                segments_info.append({"id": i, "isthing": is_thing_map[cat], "category_id": cat, "area": a})
+            del cur_labels
+
+            # Serialize the segmentation image to PNG bytes in memory.
+            with io.BytesIO() as out:
+                seg_img.save(out, format="PNG")
+                predictions = {"png_string": out.getvalue(), "segments_info": segments_info}
+            preds.append(predictions)
+        return preds
+
+    # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.post_process_object_detection
+    def post_process_object_detection(
+        self, outputs, threshold: float = 0.5, target_sizes: Union[TensorType, list[tuple]] = None
+    ):
+        """
+        Converts the raw output of [`DetrForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y,
+        bottom_right_x, bottom_right_y) format. Only supports PyTorch.
+
+        Args:
+            outputs ([`DetrObjectDetectionOutput`]):
+                Raw outputs of the model.
+            threshold (`float`, *optional*):
+                Score threshold to keep object detection predictions.
+            target_sizes (`torch.Tensor` or `list[tuple[int, int]]`, *optional*):
+                Tensor of shape `(batch_size, 2)` or list of tuples (`tuple[int, int]`) containing the target size
+                `(height, width)` of each image in the batch. If unset, predictions will not be resized.
+        Returns:
+            `list[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image
+            in the batch as predicted by the model.
+        """
+        out_logits, out_bbox = outputs.logits, outputs.pred_boxes
+
+        if target_sizes is not None:
+            if len(out_logits) != len(target_sizes):
+                raise ValueError(
+                    "Make sure that you pass in as many target sizes as the batch dimension of the logits"
+                )
+
+        prob = nn.functional.softmax(out_logits, -1)
+        scores, labels = prob[..., :-1].max(-1)
+
+        # Convert to [x0, y0, x1, y1] format
+        boxes = center_to_corners_format(out_bbox)
+
+        # Convert from relative [0, 1] to absolute [0, height] coordinates
+        if target_sizes is not None:
+            if isinstance(target_sizes, list):
+                img_h = torch.Tensor([i[0] for i in target_sizes])
+                img_w = torch.Tensor([i[1] for i in target_sizes])
+            else:
+                img_h, img_w = target_sizes.unbind(1)
+
+            scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(boxes.device)
+            boxes = boxes * scale_fct[:, None, :]
+
+        results = []
+        for s, l, b in zip(scores, labels, boxes):
+            score = s[s > threshold]
+            label = l[s > threshold]
+            box = b[s > threshold]
+            results.append({"scores": score, "labels": label, "boxes": box})
+
+        return results
+
+    # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.post_process_semantic_segmentation
+    def post_process_semantic_segmentation(self, outputs, target_sizes: Optional[list[tuple[int, int]]] = None):
+        """
+        Converts the output of [`DetrForSegmentation`] into semantic segmentation maps. Only supports PyTorch.
+
+        Args:
+            outputs ([`DetrForSegmentation`]):
+                Raw outputs of the model.
+            target_sizes (`list[tuple[int, int]]`, *optional*):
+                A list of tuples (`tuple[int, int]`) containing the target size (height, width) of each image in the
+                batch. If unset, predictions will not be resized.
+        Returns:
+            `list[torch.Tensor]`:
+                A list of length `batch_size`, where each item is a semantic segmentation map of shape (height, width)
+                corresponding to the target_sizes entry (if `target_sizes` is specified). Each entry of each
+                `torch.Tensor` correspond to a semantic class id.
+        """
+        class_queries_logits = outputs.logits  # [batch_size, num_queries, num_classes+1]
+        masks_queries_logits = outputs.pred_masks  # [batch_size, num_queries, height, width]
+
+        # Remove the null class `[..., :-1]`
+        masks_classes = class_queries_logits.softmax(dim=-1)[..., :-1]
+        masks_probs = masks_queries_logits.sigmoid()  # [batch_size, num_queries, height, width]
+
+        # Semantic segmentation logits of shape (batch_size, num_classes, height, width)
+        segmentation = torch.einsum("bqc, bqhw -> bchw", masks_classes, masks_probs)
+        batch_size = class_queries_logits.shape[0]
+
+        # Resize logits and compute semantic segmentation maps
+        if target_sizes is not None:
+            if batch_size != len(target_sizes):
+                raise ValueError(
+                    "Make sure that you pass in as many target sizes as the batch dimension of the logits"
+                )
+
+            semantic_segmentation = []
+            for idx in range(batch_size):
+                resized_logits = nn.functional.interpolate(
+                    segmentation[idx].unsqueeze(dim=0), size=target_sizes[idx], mode="bilinear", align_corners=False
+                )
+                semantic_map = resized_logits[0].argmax(dim=0)
+                semantic_segmentation.append(semantic_map)
+        else:
+            semantic_segmentation = segmentation.argmax(dim=1)
+            semantic_segmentation = [semantic_segmentation[i] for i in range(semantic_segmentation.shape[0])]
+
+        return semantic_segmentation
+
+    # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.post_process_instance_segmentation
+    def post_process_instance_segmentation(
+        self,
+        outputs,
+        threshold: float = 0.5,
+        mask_threshold: float = 0.5,
+        overlap_mask_area_threshold: float = 0.8,
+        target_sizes: Optional[list[tuple[int, int]]] = None,
+        return_coco_annotation: Optional[bool] = False,
+    ) -> list[dict]:
+        """
+        Converts the output of [`DetrForSegmentation`] into instance segmentation predictions. Only supports PyTorch.
+
+        Args:
+            outputs ([`DetrForSegmentation`]):
+                Raw outputs of the model.
+            threshold (`float`, *optional*, defaults to 0.5):
+                The probability score threshold to keep predicted instance masks.
+            mask_threshold (`float`, *optional*, defaults to 0.5):
+                Threshold to use when turning the predicted masks into binary values.
+            overlap_mask_area_threshold (`float`, *optional*, defaults to 0.8):
+                The overlap mask area threshold to merge or discard small disconnected parts within each binary
+                instance mask.
+            target_sizes (`list[Tuple]`, *optional*):
+                List of length (batch_size), where each list item (`tuple[int, int]]`) corresponds to the requested
+                final size (height, width) of each prediction. If unset, predictions will not be resized.
+            return_coco_annotation (`bool`, *optional*):
+                Defaults to `False`. If set to `True`, segmentation maps are returned in COCO run-length encoding (RLE)
+                format.
+        Returns:
+            `list[Dict]`: A list of dictionaries, one per image, each dictionary containing two keys:
+            - **segmentation** -- A tensor of shape `(height, width)` where each pixel represents a `segment_id` or
+              `list[List]` run-length encoding (RLE) of the segmentation map if return_coco_annotation is set to
+              `True`. Set to `None` if no mask if found above `threshold`.
+            - **segments_info** -- A dictionary that contains additional information on each segment.
+                - **id** -- An integer representing the `segment_id`.
+                - **label_id** -- An integer representing the label / semantic class id corresponding to `segment_id`.
+                - **score** -- Prediction score of segment with `segment_id`.
+        """
+        class_queries_logits = outputs.logits  # [batch_size, num_queries, num_classes+1]
+        masks_queries_logits = outputs.pred_masks  # [batch_size, num_queries, height, width]
+
+        batch_size = class_queries_logits.shape[0]
+        num_labels = class_queries_logits.shape[-1] - 1
+
+        mask_probs = masks_queries_logits.sigmoid()  # [batch_size, num_queries, height, width]
+
+        # Predicted label and score of each query (batch_size, num_queries)
+        pred_scores, pred_labels = nn.functional.softmax(class_queries_logits, dim=-1).max(-1)
+
+        # Loop over items in batch size
+        results: list[dict[str, TensorType]] = []
+
+        for i in range(batch_size):
+            mask_probs_item, pred_scores_item, pred_labels_item = remove_low_and_no_objects(
+                mask_probs[i], pred_scores[i], pred_labels[i], threshold, num_labels
+            )
+
+            # No mask found
+            if mask_probs_item.shape[0] <= 0:
+                height, width = target_sizes[i] if target_sizes is not None else mask_probs_item.shape[1:]
+                segmentation = torch.zeros((height, width)) - 1
+                results.append({"segmentation": segmentation, "segments_info": []})
+                continue
+
+            # Get segmentation map and segment information of batch item
+            target_size = target_sizes[i] if target_sizes is not None else None
+            segmentation, segments = compute_segments(
+                mask_probs=mask_probs_item,
+                pred_scores=pred_scores_item,
+                pred_labels=pred_labels_item,
+                mask_threshold=mask_threshold,
+                overlap_mask_area_threshold=overlap_mask_area_threshold,
+                label_ids_to_fuse=[],
+                target_size=target_size,
+            )
+
+            # Return segmentation map in run-length encoding (RLE) format
+            if return_coco_annotation:
+                segmentation = convert_segmentation_to_rle(segmentation)
+
+            results.append({"segmentation": segmentation, "segments_info": segments})
+        return results
+
+    # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.post_process_panoptic_segmentation
+    def post_process_panoptic_segmentation(
+        self,
+        outputs,
+        threshold: float = 0.5,
+        mask_threshold: float = 0.5,
+        overlap_mask_area_threshold: float = 0.8,
+        label_ids_to_fuse: Optional[set[int]] = None,
+        target_sizes: Optional[list[tuple[int, int]]] = None,
+    ) -> list[dict]:
+        """
+        Converts the output of [`DetrForSegmentation`] into image panoptic segmentation predictions. Only supports
+        PyTorch.
+
+        Args:
+            outputs ([`DetrForSegmentation`]):
+                The outputs from [`DetrForSegmentation`].
+            threshold (`float`, *optional*, defaults to 0.5):
+                The probability score threshold to keep predicted instance masks.
+            mask_threshold (`float`, *optional*, defaults to 0.5):
+                Threshold to use when turning the predicted masks into binary values.
+            overlap_mask_area_threshold (`float`, *optional*, defaults to 0.8):
+                The overlap mask area threshold to merge or discard small disconnected parts within each binary
+                instance mask.
+            label_ids_to_fuse (`Set[int]`, *optional*):
+                The labels in this state will have all their instances be fused together. For instance we could say
+                there can only be one sky in an image, but several persons, so the label ID for sky would be in that
+                set, but not the one for person.
+            target_sizes (`list[Tuple]`, *optional*):
+                List of length (batch_size), where each list item (`tuple[int, int]]`) corresponds to the requested
+                final size (height, width) of each prediction in batch. If unset, predictions will not be resized.
+        Returns:
+            `list[Dict]`: A list of dictionaries, one per image, each dictionary containing two keys:
+            - **segmentation** -- a tensor of shape `(height, width)` where each pixel represents a `segment_id` or
+              `None` if no mask if found above `threshold`. If `target_sizes` is specified, segmentation is resized to
+              the corresponding `target_sizes` entry.
+            - **segments_info** -- A dictionary that contains additional information on each segment.
+                - **id** -- an integer representing the `segment_id`.
+                - **label_id** -- An integer representing the label / semantic class id corresponding to `segment_id`.
+                - **was_fused** -- a boolean, `True` if `label_id` was in `label_ids_to_fuse`, `False` otherwise.
+                  Multiple instances of the same class / label were fused and assigned a single `segment_id`.
+                - **score** -- Prediction score of segment with `segment_id`.
+        """
+
+        if label_ids_to_fuse is None:
+            logger.warning_once("`label_ids_to_fuse` unset. No instance will be fused.")
+            label_ids_to_fuse = set()
+
+        class_queries_logits = outputs.logits  # [batch_size, num_queries, num_classes+1]
+        masks_queries_logits = outputs.pred_masks  # [batch_size, num_queries, height, width]
+
+        batch_size = class_queries_logits.shape[0]
+        num_labels = class_queries_logits.shape[-1] - 1
+
+        mask_probs = masks_queries_logits.sigmoid()  # [batch_size, num_queries, height, width]
+
+        # Predicted label and score of each query (batch_size, num_queries)
+        pred_scores, pred_labels = nn.functional.softmax(class_queries_logits, dim=-1).max(-1)
+
+        # Loop over items in batch size
+        results: list[dict[str, TensorType]] = []
+
+        for i in range(batch_size):
+            mask_probs_item, pred_scores_item, pred_labels_item = remove_low_and_no_objects(
+                mask_probs[i], pred_scores[i], pred_labels[i], threshold, num_labels
+            )
+
+            # No mask found
+            if mask_probs_item.shape[0] <= 0:
+                height, width = target_sizes[i] if target_sizes is not None else mask_probs_item.shape[1:]
+                segmentation = torch.zeros((height, width)) - 1
+                results.append({"segmentation": segmentation, "segments_info": []})
+                continue
+
+            # Get segmentation map and segment information of batch item
+            target_size = target_sizes[i] if target_sizes is not None else None
+            segmentation, segments = compute_segments(
+                mask_probs=mask_probs_item,
+                pred_scores=pred_scores_item,
+                pred_labels=pred_labels_item,
+                mask_threshold=mask_threshold,
+                overlap_mask_area_threshold=overlap_mask_area_threshold,
+                label_ids_to_fuse=label_ids_to_fuse,
+                target_size=target_size,
+            )
+
+            results.append({"segmentation": segmentation, "segments_info": segments})
+        return results
+
+
+__all__ = ["DetrImageProcessorFast"]
diff --git a/phivenv/Lib/site-packages/transformers/models/detr/modeling_detr.py b/phivenv/Lib/site-packages/transformers/models/detr/modeling_detr.py
new file mode 100644
index 0000000000000000000000000000000000000000..86835ca62cfc02f7e2f8f134c9824c3eda1f31e5
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/detr/modeling_detr.py
@@ -0,0 +1,1693 @@
+# coding=utf-8
+# Copyright 2021 Facebook AI Research The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch DETR model."""
+
+import math
+from dataclasses import dataclass
+from typing import Optional, Union
+
+import torch
+from torch import Tensor, nn
+
+from ...activations import ACT2FN
+from ...modeling_attn_mask_utils import _prepare_4d_attention_mask
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithCrossAttentions, Seq2SeqModelOutput
+from ...modeling_utils import PreTrainedModel
+from ...utils import (
+    ModelOutput,
+    auto_docstring,
+    is_timm_available,
+    logging,
+    requires_backends,
+)
+from ...utils.backbone_utils import load_backbone
+from .configuration_detr import DetrConfig
+
+
+if is_timm_available():
+    from timm import create_model
+
+
+logger = logging.get_logger(__name__)
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Base class for outputs of the DETR decoder. This class adds one attribute to BaseModelOutputWithCrossAttentions,
+    namely an optional stack of intermediate decoder activations, i.e. the output of each decoder layer, each of them
+    gone through a layernorm. This is useful when training the model with auxiliary decoding losses.
+    """
+)
+class DetrDecoderOutput(BaseModelOutputWithCrossAttentions):
+    r"""
+    cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`):
+        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+        sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax,
+        used to compute the weighted average in the cross-attention heads.
+    intermediate_hidden_states (`torch.FloatTensor` of shape `(config.decoder_layers, batch_size, num_queries, hidden_size)`, *optional*, returned when `config.auxiliary_loss=True`):
+        Intermediate decoder activations, i.e. the output of each decoder layer, each of them gone through a
+        layernorm.
+    """
+
+    intermediate_hidden_states: Optional[torch.FloatTensor] = None
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Base class for outputs of the DETR encoder-decoder model. This class adds one attribute to Seq2SeqModelOutput,
+    namely an optional stack of intermediate decoder activations, i.e. the output of each decoder layer, each of them
+    gone through a layernorm. This is useful when training the model with auxiliary decoding losses.
+    """
+)
+class DetrModelOutput(Seq2SeqModelOutput):
+    r"""
+    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+        Sequence of hidden-states at the output of the last layer of the decoder of the model.
+    intermediate_hidden_states (`torch.FloatTensor` of shape `(config.decoder_layers, batch_size, sequence_length, hidden_size)`, *optional*, returned when `config.auxiliary_loss=True`):
+        Intermediate decoder activations, i.e. the output of each decoder layer, each of them gone through a
+        layernorm.
+    """
+
+    intermediate_hidden_states: Optional[torch.FloatTensor] = None
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Output type of [`DetrForObjectDetection`].
+    """
+)
+class DetrObjectDetectionOutput(ModelOutput):
+    r"""
+    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided)):
+        Total loss as a linear combination of a negative log-likehood (cross-entropy) for class prediction and a
+        bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized
+        scale-invariant IoU loss.
+    loss_dict (`Dict`, *optional*):
+        A dictionary containing the individual losses. Useful for logging.
+    logits (`torch.FloatTensor` of shape `(batch_size, num_queries, num_classes + 1)`):
+        Classification logits (including no-object) for all queries.
+    pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
+        Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These
+        values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding
+        possible padding). You can use [`~DetrImageProcessor.post_process_object_detection`] to retrieve the
+        unnormalized bounding boxes.
+    auxiliary_outputs (`list[Dict]`, *optional*):
+        Optional, only returned when auxiliary losses are activated (i.e. `config.auxiliary_loss` is set to `True`)
+        and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and
+        `pred_boxes`) for each decoder layer.
+    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+        Sequence of hidden-states at the output of the last layer of the decoder of the model.
+    """
+
+    loss: Optional[torch.FloatTensor] = None
+    loss_dict: Optional[dict] = None
+    logits: Optional[torch.FloatTensor] = None
+    pred_boxes: Optional[torch.FloatTensor] = None
+    auxiliary_outputs: Optional[list[dict]] = None
+    last_hidden_state: Optional[torch.FloatTensor] = None
+    decoder_hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    decoder_attentions: Optional[tuple[torch.FloatTensor]] = None
+    cross_attentions: Optional[tuple[torch.FloatTensor]] = None
+    encoder_last_hidden_state: Optional[torch.FloatTensor] = None
+    encoder_hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    encoder_attentions: Optional[tuple[torch.FloatTensor]] = None
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Output type of [`DetrForSegmentation`].
+    """
+)
+class DetrSegmentationOutput(ModelOutput):
+    r"""
+    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided)):
+        Total loss as a linear combination of a negative log-likehood (cross-entropy) for class prediction and a
+        bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized
+        scale-invariant IoU loss.
+    loss_dict (`Dict`, *optional*):
+        A dictionary containing the individual losses. Useful for logging.
+    logits (`torch.FloatTensor` of shape `(batch_size, num_queries, num_classes + 1)`):
+        Classification logits (including no-object) for all queries.
+    pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
+        Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These
+        values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding
+        possible padding). You can use [`~DetrImageProcessor.post_process_object_detection`] to retrieve the
+        unnormalized bounding boxes.
+    pred_masks (`torch.FloatTensor` of shape `(batch_size, num_queries, height/4, width/4)`):
+        Segmentation masks logits for all queries. See also
+        [`~DetrImageProcessor.post_process_semantic_segmentation`] or
+        [`~DetrImageProcessor.post_process_instance_segmentation`]
+        [`~DetrImageProcessor.post_process_panoptic_segmentation`] to evaluate semantic, instance and panoptic
+        segmentation masks respectively.
+    auxiliary_outputs (`list[Dict]`, *optional*):
+        Optional, only returned when auxiliary losses are activated (i.e. `config.auxiliary_loss` is set to `True`)
+        and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and
+        `pred_boxes`) for each decoder layer.
+    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+        Sequence of hidden-states at the output of the last layer of the decoder of the model.
+    """
+
+    loss: Optional[torch.FloatTensor] = None
+    loss_dict: Optional[dict] = None
+    logits: Optional[torch.FloatTensor] = None
+    pred_boxes: Optional[torch.FloatTensor] = None
+    pred_masks: Optional[torch.FloatTensor] = None
+    auxiliary_outputs: Optional[list[dict]] = None
+    last_hidden_state: Optional[torch.FloatTensor] = None
+    decoder_hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    decoder_attentions: Optional[tuple[torch.FloatTensor]] = None
+    cross_attentions: Optional[tuple[torch.FloatTensor]] = None
+    encoder_last_hidden_state: Optional[torch.FloatTensor] = None
+    encoder_hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    encoder_attentions: Optional[tuple[torch.FloatTensor]] = None
+
+
+# BELOW: utilities copied from
+# https://github.com/facebookresearch/detr/blob/master/backbone.py
+class DetrFrozenBatchNorm2d(nn.Module):
+    """
+    BatchNorm2d where the batch statistics and the affine parameters are fixed.
+
+    Copy-paste from torchvision.misc.ops with added eps before rqsrt, without which any other models than
+    torchvision.models.resnet[18,34,50,101] produce nans.
+    """
+
+    def __init__(self, n):
+        super().__init__()
+        self.register_buffer("weight", torch.ones(n))
+        self.register_buffer("bias", torch.zeros(n))
+        self.register_buffer("running_mean", torch.zeros(n))
+        self.register_buffer("running_var", torch.ones(n))
+
+    def _load_from_state_dict(
+        self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
+    ):
+        num_batches_tracked_key = prefix + "num_batches_tracked"
+        if num_batches_tracked_key in state_dict:
+            del state_dict[num_batches_tracked_key]
+
+        super()._load_from_state_dict(
+            state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
+        )
+
+    def forward(self, x):
+        # move reshapes to the beginning
+        # to make it user-friendly
+        weight = self.weight.reshape(1, -1, 1, 1)
+        bias = self.bias.reshape(1, -1, 1, 1)
+        running_var = self.running_var.reshape(1, -1, 1, 1)
+        running_mean = self.running_mean.reshape(1, -1, 1, 1)
+        epsilon = 1e-5
+        scale = weight * (running_var + epsilon).rsqrt()
+        bias = bias - running_mean * scale
+        return x * scale + bias
+
+
+def replace_batch_norm(model):
+    r"""
+    Recursively replace all `torch.nn.BatchNorm2d` with `DetrFrozenBatchNorm2d`.
+
+    Args:
+        model (torch.nn.Module):
+            input model
+    """
+    for name, module in model.named_children():
+        if isinstance(module, nn.BatchNorm2d):
+            new_module = DetrFrozenBatchNorm2d(module.num_features)
+
+            if module.weight.device != torch.device("meta"):
+                new_module.weight.data.copy_(module.weight)
+                new_module.bias.data.copy_(module.bias)
+                new_module.running_mean.data.copy_(module.running_mean)
+                new_module.running_var.data.copy_(module.running_var)
+
+            model._modules[name] = new_module
+
+        if len(list(module.children())) > 0:
+            replace_batch_norm(module)
+
+
+class DetrConvEncoder(nn.Module):
+    """
+    Convolutional backbone, using either the AutoBackbone API or one from the timm library.
+
+    nn.BatchNorm2d layers are replaced by DetrFrozenBatchNorm2d as defined above.
+
+    """
+
+    def __init__(self, config):
+        super().__init__()
+
+        self.config = config
+
+        # For backwards compatibility we have to use the timm library directly instead of the AutoBackbone API
+        if config.use_timm_backbone:
+            # We default to values which were previously hard-coded. This enables configurability from the config
+            # using backbone arguments, while keeping the default behavior the same.
+            requires_backends(self, ["timm"])
+            kwargs = getattr(config, "backbone_kwargs", {})
+            kwargs = {} if kwargs is None else kwargs.copy()
+            out_indices = kwargs.pop("out_indices", (1, 2, 3, 4))
+            num_channels = kwargs.pop("in_chans", config.num_channels)
+            if config.dilation:
+                kwargs["output_stride"] = kwargs.get("output_stride", 16)
+            backbone = create_model(
+                config.backbone,
+                pretrained=config.use_pretrained_backbone,
+                features_only=True,
+                out_indices=out_indices,
+                in_chans=num_channels,
+                **kwargs,
+            )
+        else:
+            backbone = load_backbone(config)
+
+        # replace batch norm by frozen batch norm
+        with torch.no_grad():
+            replace_batch_norm(backbone)
+        self.model = backbone
+        self.intermediate_channel_sizes = (
+            self.model.feature_info.channels() if config.use_timm_backbone else self.model.channels
+        )
+
+        backbone_model_type = None
+        if config.backbone is not None:
+            backbone_model_type = config.backbone
+        elif config.backbone_config is not None:
+            backbone_model_type = config.backbone_config.model_type
+        else:
+            raise ValueError("Either `backbone` or `backbone_config` should be provided in the config")
+
+        if "resnet" in backbone_model_type:
+            for name, parameter in self.model.named_parameters():
+                if config.use_timm_backbone:
+                    if "layer2" not in name and "layer3" not in name and "layer4" not in name:
+                        parameter.requires_grad_(False)
+                else:
+                    if "stage.1" not in name and "stage.2" not in name and "stage.3" not in name:
+                        parameter.requires_grad_(False)
+
+    def forward(self, pixel_values: torch.Tensor, pixel_mask: torch.Tensor):
+        # send pixel_values through the model to get list of feature maps
+        features = self.model(pixel_values) if self.config.use_timm_backbone else self.model(pixel_values).feature_maps
+
+        out = []
+        for feature_map in features:
+            # downsample pixel_mask to match shape of corresponding feature_map
+            mask = nn.functional.interpolate(pixel_mask[None].float(), size=feature_map.shape[-2:]).to(torch.bool)[0]
+            out.append((feature_map, mask))
+        return out
+
+
+class DetrConvModel(nn.Module):
+    """
+    This module adds 2D position embeddings to all intermediate feature maps of the convolutional encoder.
+    """
+
+    def __init__(self, conv_encoder, position_embedding):
+        super().__init__()
+        self.conv_encoder = conv_encoder
+        self.position_embedding = position_embedding
+
+    def forward(self, pixel_values, pixel_mask):
+        # send pixel_values and pixel_mask through backbone to get list of (feature_map, pixel_mask) tuples
+        out = self.conv_encoder(pixel_values, pixel_mask)
+        pos = []
+        for feature_map, mask in out:
+            # position encoding
+            pos.append(self.position_embedding(feature_map, mask).to(feature_map.dtype))
+
+        return out, pos
+
+
+class DetrSinePositionEmbedding(nn.Module):
+    """
+    Fixed sine/cosine position embedding for feature maps, in the spirit of the encoding used by the "Attention is
+    all you need" paper, generalized to work on images.
+
+    Coordinates are obtained by cumulatively summing `pixel_mask` along dimension 1 and dimension 2. Each coordinate
+    is expanded into `embedding_dim` channels, so the result carries `2 * embedding_dim` channels: the channels
+    derived from dimension 1 come first, followed by those derived from dimension 2.
+    """
+
+    def __init__(self, embedding_dim=64, temperature=10000, normalize=False, scale=None):
+        super().__init__()
+        # a custom scale is only accepted together with normalization
+        if normalize is False and scale is not None:
+            raise ValueError("normalize should be True if scale is passed")
+        self.embedding_dim = embedding_dim
+        self.temperature = temperature
+        self.normalize = normalize
+        self.scale = 2 * math.pi if scale is None else scale
+
+    def _encode_axis(self, coordinates, frequencies):
+        """Expand `coordinates` into channels: sine of the even channels and cosine of the odd ones, interleaved."""
+        angles = coordinates[:, :, :, None] / frequencies
+        sines = angles[:, :, :, 0::2].sin()
+        cosines = angles[:, :, :, 1::2].cos()
+        return torch.stack((sines, cosines), dim=4).flatten(3)
+
+    def forward(self, pixel_values, pixel_mask):
+        """
+        Compute the embedding for `pixel_mask`; `pixel_values` is only used to pick the device.
+
+        Raises:
+            ValueError: if `pixel_mask` is None.
+        """
+        if pixel_mask is None:
+            raise ValueError("No pixel mask provided")
+
+        rows = pixel_mask.cumsum(1, dtype=torch.float32)
+        columns = pixel_mask.cumsum(2, dtype=torch.float32)
+        if self.normalize:
+            # divide by the last cumulative value along each axis (the epsilon guards against division by zero)
+            rows = rows / (rows[:, -1:, :] + 1e-6) * self.scale
+            columns = columns / (columns[:, :, -1:] + 1e-6) * self.scale
+
+        channel_index = torch.arange(self.embedding_dim, dtype=torch.int64, device=pixel_values.device).float()
+        # consecutive channel pairs share one exponent, hence the floor division by 2
+        frequencies = self.temperature ** (2 * torch.div(channel_index, 2, rounding_mode="floor") / self.embedding_dim)
+
+        encoded_rows = self._encode_axis(rows, frequencies)
+        encoded_columns = self._encode_axis(columns, frequencies)
+        # move the channel dimension from the last position to right after the batch dimension
+        return torch.cat((encoded_rows, encoded_columns), dim=3).permute(0, 3, 1, 2)
+
+
+class DetrLearnedPositionEmbedding(nn.Module):
+    """
+    Position embedding assembled from two learned lookup tables, one indexed by row and one by column. Both tables
+    hold 50 entries, which is the largest height/width that can be looked up.
+    """
+
+    def __init__(self, embedding_dim=256):
+        super().__init__()
+        self.row_embeddings = nn.Embedding(50, embedding_dim)
+        self.column_embeddings = nn.Embedding(50, embedding_dim)
+
+    def forward(self, pixel_values, pixel_mask=None):
+        """Return one embedding per spatial location of `pixel_values`, copied for every batch element."""
+        num_rows, num_columns = pixel_values.shape[-2:]
+        batch_size = pixel_values.shape[0]
+        device = pixel_values.device
+
+        column_vectors = self.column_embeddings(torch.arange(num_columns, device=device))
+        row_vectors = self.row_embeddings(torch.arange(num_rows, device=device))
+
+        # tile both tables over the grid: every (row, column) cell holds [column vector, row vector]
+        column_grid = column_vectors.unsqueeze(0).repeat(num_rows, 1, 1)
+        row_grid = row_vectors.unsqueeze(1).repeat(1, num_columns, 1)
+        grid = torch.cat([column_grid, row_grid], dim=-1)
+
+        # channels first, then one copy per batch element
+        return grid.permute(2, 0, 1).unsqueeze(0).repeat(batch_size, 1, 1, 1)
+
+
+def build_position_encoding(config):
+    """
+    Instantiate the position embedding selected by `config.position_embedding_type` ("sine" or "learned").
+
+    The module is created with `config.d_model // 2` as its embedding dimension.
+
+    Raises:
+        ValueError: if the configured type is neither "sine" nor "learned".
+    """
+    half_dim = config.d_model // 2
+    embedding_type = config.position_embedding_type
+
+    if embedding_type == "sine":
+        # TODO find a better way of exposing other arguments
+        return DetrSinePositionEmbedding(half_dim, normalize=True)
+    if embedding_type == "learned":
+        return DetrLearnedPositionEmbedding(half_dim)
+
+    raise ValueError(f"Not supported {config.position_embedding_type}")
+
+
+class DetrAttention(nn.Module):
+    """
+    Multi-headed attention from 'Attention Is All You Need' paper.
+
+    Here, we add position embeddings to the queries and keys (as explained in the DETR paper). The values are always
+    projected from the inputs without position embeddings.
+    """
+
+    def __init__(
+        self,
+        embed_dim: int,
+        num_heads: int,
+        dropout: float = 0.0,
+        bias: bool = True,
+    ):
+        """
+        Args:
+            embed_dim: size of the input and output features.
+            num_heads: number of attention heads; must divide `embed_dim`.
+            dropout: dropout probability applied to the attention probabilities.
+            bias: whether the four linear projections use a bias term.
+
+        Raises:
+            ValueError: if `embed_dim` is not divisible by `num_heads`.
+        """
+        super().__init__()
+        self.embed_dim = embed_dim
+        self.num_heads = num_heads
+        self.dropout = dropout
+        self.head_dim = embed_dim // num_heads
+        if self.head_dim * num_heads != self.embed_dim:
+            raise ValueError(
+                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
+                f" {num_heads})."
+            )
+        # queries are scaled by 1/sqrt(head_dim) before the dot product with the keys
+        self.scaling = self.head_dim**-0.5
+
+        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+
+    def _shape(self, tensor: torch.Tensor, seq_len: int, batch_size: int):
+        """Split the feature dimension into heads: (batch, seq, embed) -> (batch, heads, seq, head_dim)."""
+        return tensor.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
+
+    def with_pos_embed(self, tensor: torch.Tensor, object_queries: Optional[Tensor]):
+        """Return `tensor + object_queries`, or `tensor` unchanged when `object_queries` is None."""
+        return tensor if object_queries is None else tensor + object_queries
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        object_queries: Optional[torch.Tensor] = None,
+        key_value_states: Optional[torch.Tensor] = None,
+        spatial_position_embeddings: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        """
+        Input shape: Batch x Time x Channel
+
+        Args:
+            hidden_states: inputs the queries are computed from; in self-attention also the source of keys/values.
+            attention_mask: optional mask of size `(batch, 1, target_len, source_len)` that is added to the attention
+                scores. A boolean mask is first turned into an additive one holding `-inf` where the mask is True.
+            object_queries: optional position embeddings added to `hidden_states` before the query projection (and
+                before the key projection in self-attention).
+            key_value_states: when given, the layer acts as cross-attention and keys/values come from this tensor.
+            spatial_position_embeddings: optional position embeddings added to `key_value_states` before the key
+                projection.
+            output_attentions: whether to also return the attention probabilities.
+
+        Returns:
+            `(attn_output, attn_weights)`, where `attn_output` has the shape of `hidden_states` and `attn_weights` is
+            of size `(batch, num_heads, target_len, source_len)`, or None when `output_attentions` is False.
+
+        Raises:
+            ValueError: if the attention scores, the attention mask or the attention output have an unexpected size.
+        """
+        # if key_value_states are provided this layer is used as a cross-attention layer
+        # for the decoder
+        is_cross_attention = key_value_states is not None
+        batch_size, target_len, embed_dim = hidden_states.size()
+
+        # Values are projected from the inputs WITHOUT position embeddings, so keep a handle on them. This is bound
+        # unconditionally: the value projection below needs it even when no position embeddings are passed.
+        hidden_states_original = hidden_states
+        # add position embeddings to the hidden states before projecting to queries and keys
+        if object_queries is not None:
+            hidden_states = self.with_pos_embed(hidden_states, object_queries)
+
+        # same for the key/value states used by cross-attention
+        key_value_states_original = key_value_states
+        # add key-value position embeddings to the key value states
+        if spatial_position_embeddings is not None:
+            key_value_states = self.with_pos_embed(key_value_states, spatial_position_embeddings)
+
+        # get query proj
+        query_states = self.q_proj(hidden_states) * self.scaling
+        # get key, value proj
+        if is_cross_attention:
+            # cross_attentions
+            key_states = self._shape(self.k_proj(key_value_states), -1, batch_size)
+            value_states = self._shape(self.v_proj(key_value_states_original), -1, batch_size)
+        else:
+            # self_attention
+            key_states = self._shape(self.k_proj(hidden_states), -1, batch_size)
+            value_states = self._shape(self.v_proj(hidden_states_original), -1, batch_size)
+
+        # fold the heads into the batch dimension so a single bmm handles all of them
+        proj_shape = (batch_size * self.num_heads, -1, self.head_dim)
+        query_states = self._shape(query_states, target_len, batch_size).view(*proj_shape)
+        key_states = key_states.view(*proj_shape)
+        value_states = value_states.view(*proj_shape)
+
+        source_len = key_states.size(1)
+
+        attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))
+
+        if attn_weights.size() != (batch_size * self.num_heads, target_len, source_len):
+            raise ValueError(
+                f"Attention weights should be of size {(batch_size * self.num_heads, target_len, source_len)}, but is"
+                f" {attn_weights.size()}"
+            )
+
+        if attention_mask is not None:
+            if attention_mask.size() != (batch_size, 1, target_len, source_len):
+                raise ValueError(
+                    f"Attention mask should be of size {(batch_size, 1, target_len, source_len)}, but is"
+                    f" {attention_mask.size()}"
+                )
+            if attention_mask.dtype == torch.bool:
+                # boolean mask -> additive mask: -inf where the mask is True, 0 elsewhere
+                attention_mask = torch.zeros_like(attention_mask, dtype=attn_weights.dtype).masked_fill_(
+                    attention_mask, -torch.inf
+                )
+            attn_weights = attn_weights.view(batch_size, self.num_heads, target_len, source_len) + attention_mask
+            attn_weights = attn_weights.view(batch_size * self.num_heads, target_len, source_len)
+
+        attn_weights = nn.functional.softmax(attn_weights, dim=-1)
+
+        if output_attentions:
+            # this operation is a bit awkward, but it's required to
+            # make sure that attn_weights keeps its gradient.
+            # In order to do so, attn_weights have to reshaped
+            # twice and have to be reused in the following
+            attn_weights_reshaped = attn_weights.view(batch_size, self.num_heads, target_len, source_len)
+            attn_weights = attn_weights_reshaped.view(batch_size * self.num_heads, target_len, source_len)
+        else:
+            attn_weights_reshaped = None
+
+        attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
+
+        attn_output = torch.bmm(attn_probs, value_states)
+
+        if attn_output.size() != (batch_size * self.num_heads, target_len, self.head_dim):
+            raise ValueError(
+                f"`attn_output` should be of size {(batch_size * self.num_heads, target_len, self.head_dim)}, but is"
+                f" {attn_output.size()}"
+            )
+
+        # unfold the heads again and merge them back into the feature dimension
+        attn_output = attn_output.view(batch_size, self.num_heads, target_len, self.head_dim)
+        attn_output = attn_output.transpose(1, 2)
+        attn_output = attn_output.reshape(batch_size, target_len, embed_dim)
+
+        attn_output = self.out_proj(attn_output)
+
+        return attn_output, attn_weights_reshaped
+
+
+class DetrEncoderLayer(nn.Module):
+    """
+    Single encoder block: self-attention followed by a two-layer feed-forward network. Each of the two sub-blocks is
+    followed by dropout, a residual addition and a layer norm (applied after the residual addition).
+    """
+
+    def __init__(self, config: DetrConfig):
+        super().__init__()
+        self.embed_dim = config.d_model
+        self.self_attn = DetrAttention(
+            embed_dim=self.embed_dim,
+            num_heads=config.encoder_attention_heads,
+            dropout=config.attention_dropout,
+        )
+        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
+        self.dropout = config.dropout
+        self.activation_fn = ACT2FN[config.activation_function]
+        self.activation_dropout = config.activation_dropout
+        # feed-forward network: embed_dim -> encoder_ffn_dim -> embed_dim
+        self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim)
+        self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim)
+        self.final_layer_norm = nn.LayerNorm(self.embed_dim)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: torch.Tensor,
+        object_queries: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+    ):
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+            attention_mask (`torch.FloatTensor`): attention mask of size
+                `(batch, 1, target_len, source_len)` where padding elements are indicated by very large negative
+                values.
+            object_queries (`torch.FloatTensor`, *optional*):
+                Object queries (also called content embeddings), to be added to the hidden states.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+
+        Returns:
+            A tuple `(hidden_states,)`, extended with the self-attention weights when `output_attentions` is True.
+        """
+        # self-attention sub-block: attention -> dropout -> residual -> layer norm
+        residual = hidden_states
+        hidden_states, attn_weights = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            object_queries=object_queries,
+            output_attentions=output_attentions,
+        )
+
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+        hidden_states = residual + hidden_states
+        hidden_states = self.self_attn_layer_norm(hidden_states)
+
+        # feed-forward sub-block: fc1 -> activation -> dropout -> fc2 -> dropout -> residual -> layer norm
+        residual = hidden_states
+        hidden_states = self.activation_fn(self.fc1(hidden_states))
+        hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
+
+        hidden_states = self.fc2(hidden_states)
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+
+        hidden_states = residual + hidden_states
+        hidden_states = self.final_layer_norm(hidden_states)
+
+        # training only: when any inf/nan shows up, clamp the activations to just inside the dtype's finite range
+        if self.training:
+            if torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any():
+                clamp_value = torch.finfo(hidden_states.dtype).max - 1000
+                hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
+
+        outputs = (hidden_states,)
+
+        if output_attentions:
+            outputs += (attn_weights,)
+
+        return outputs
+
+
+class DetrDecoderLayer(GradientCheckpointingLayer):
+    """
+    Single decoder block: self-attention, optional cross-attention over the encoder output, and a two-layer
+    feed-forward network. Every sub-block is followed by dropout, a residual addition and a layer norm.
+    """
+
+    def __init__(self, config: DetrConfig):
+        super().__init__()
+        self.embed_dim = config.d_model
+
+        self.self_attn = DetrAttention(
+            embed_dim=self.embed_dim,
+            num_heads=config.decoder_attention_heads,
+            dropout=config.attention_dropout,
+        )
+        self.dropout = config.dropout
+        self.activation_fn = ACT2FN[config.activation_function]
+        self.activation_dropout = config.activation_dropout
+
+        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
+        # cross-attention over the encoder hidden states
+        self.encoder_attn = DetrAttention(
+            self.embed_dim,
+            config.decoder_attention_heads,
+            dropout=config.attention_dropout,
+        )
+        self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim)
+        # feed-forward network: embed_dim -> decoder_ffn_dim -> embed_dim
+        self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim)
+        self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim)
+        self.final_layer_norm = nn.LayerNorm(self.embed_dim)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        object_queries: Optional[torch.Tensor] = None,
+        query_position_embeddings: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = False,
+    ):
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+            attention_mask (`torch.FloatTensor`): attention mask of size
+                `(batch, 1, target_len, source_len)` where padding elements are indicated by very large negative
+                values.
+            object_queries (`torch.FloatTensor`, *optional*):
+                object_queries that are added to the encoder hidden states before the key projection in the
+                cross-attention layer.
+            query_position_embeddings (`torch.FloatTensor`, *optional*):
+                position embeddings that are added to the queries and keys in the self-attention layer, and to the
+                queries in the cross-attention layer.
+            encoder_hidden_states (`torch.FloatTensor`):
+                cross attention input to the layer of shape `(batch, seq_len, embed_dim)`
+            encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size
+                `(batch, 1, target_len, source_len)` where padding elements are indicated by very large negative
+                values.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+
+        Returns:
+            A tuple `(hidden_states,)`, extended with `(self_attn_weights, cross_attn_weights)` when
+            `output_attentions` is True; `cross_attn_weights` is None if `encoder_hidden_states` is None.
+        """
+        residual = hidden_states
+
+        # Self Attention
+        hidden_states, self_attn_weights = self.self_attn(
+            hidden_states=hidden_states,
+            object_queries=query_position_embeddings,
+            attention_mask=attention_mask,
+            output_attentions=output_attentions,
+        )
+
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+        hidden_states = residual + hidden_states
+        hidden_states = self.self_attn_layer_norm(hidden_states)
+
+        # Cross-Attention Block
+        cross_attn_weights = None
+        if encoder_hidden_states is not None:
+            residual = hidden_states
+
+            # queries receive the query position embeddings, keys receive `object_queries`
+            hidden_states, cross_attn_weights = self.encoder_attn(
+                hidden_states=hidden_states,
+                object_queries=query_position_embeddings,
+                key_value_states=encoder_hidden_states,
+                attention_mask=encoder_attention_mask,
+                spatial_position_embeddings=object_queries,
+                output_attentions=output_attentions,
+            )
+
+            hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+            hidden_states = residual + hidden_states
+            hidden_states = self.encoder_attn_layer_norm(hidden_states)
+
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.activation_fn(self.fc1(hidden_states))
+        hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
+        hidden_states = self.fc2(hidden_states)
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+        hidden_states = residual + hidden_states
+        hidden_states = self.final_layer_norm(hidden_states)
+
+        outputs = (hidden_states,)
+
+        if output_attentions:
+            outputs += (self_attn_weights, cross_attn_weights)
+
+        return outputs
+
+
+# Common base class of the DETR models defined in this file; it contributes the weight-initialization scheme.
+@auto_docstring
+class DetrPreTrainedModel(PreTrainedModel):
+    config: DetrConfig
+    base_model_prefix = "model"
+    main_input_name = "pixel_values"
+    _no_split_modules = [r"DetrConvEncoder", r"DetrEncoderLayer", r"DetrDecoderLayer"]
+
+    def _init_weights(self, module):
+        """
+        Initialize the parameters of a single `module`.
+
+        - `DetrMHAttentionMap`: zero biases and Xavier-uniform weights (gain `config.init_xavier_std`) for its
+          `k_linear` and `q_linear` projections.
+        - `DetrLearnedPositionEmbedding`: `nn.init.uniform_` with its default bounds for both embedding tables.
+        - `nn.Linear`, `nn.Conv2d`, `nn.BatchNorm2d`: normal weights with std `config.init_std`, zero bias.
+        - `nn.Embedding`: normal weights with std `config.init_std`; the padding row, if any, is zeroed.
+        """
+        std = self.config.init_std
+        xavier_std = self.config.init_xavier_std
+
+        if isinstance(module, DetrMHAttentionMap):
+            nn.init.zeros_(module.k_linear.bias)
+            nn.init.zeros_(module.q_linear.bias)
+            nn.init.xavier_uniform_(module.k_linear.weight, gain=xavier_std)
+            nn.init.xavier_uniform_(module.q_linear.weight, gain=xavier_std)
+        elif isinstance(module, DetrLearnedPositionEmbedding):
+            nn.init.uniform_(module.row_embeddings.weight)
+            nn.init.uniform_(module.column_embeddings.weight)
+        # NOTE: this `if` opens a second, independent chain, so it is evaluated regardless of the branches above
+        if isinstance(module, (nn.Linear, nn.Conv2d, nn.BatchNorm2d)):
+            # Slightly different from the TF version which uses truncated_normal for initialization
+            # cf https://github.com/pytorch/pytorch/pull/5617
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+
+
+class DetrEncoder(DetrPreTrainedModel):
+    """
+    Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
+    [`DetrEncoderLayer`].
+
+    The encoder updates the flattened feature map through multiple self-attention layers.
+
+    Small tweak for DETR:
+
+    - object_queries are added to the forward pass.
+
+    Args:
+        config: DetrConfig
+    """
+
+    def __init__(self, config: DetrConfig):
+        super().__init__(config)
+
+        self.dropout = config.dropout
+        # probability of skipping a whole layer during training (LayerDrop)
+        self.layerdrop = config.encoder_layerdrop
+
+        self.layers = nn.ModuleList([DetrEncoderLayer(config) for _ in range(config.encoder_layers)])
+
+        # in the original DETR, no layernorm is used at the end of the encoder, as "normalize_before" is set to False by default
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def forward(
+        self,
+        inputs_embeds=None,
+        attention_mask=None,
+        object_queries=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+    ):
+        r"""
+        Args:
+            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+                Flattened feature map (output of the backbone + projection layer) that is passed to the encoder.
+
+            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Mask to avoid performing attention on padding pixel features. Mask values selected in `[0, 1]`:
+
+                - 1 for pixel features that are real (i.e. **not masked**),
+                - 0 for pixel features that are padding (i.e. **masked**).
+
+                [What are attention masks?](../glossary#attention-mask)
+
+            object_queries (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+                Object queries that are added to the queries in each self-attention layer.
+
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            output_hidden_states (`bool`, *optional*):
+                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+                for more detail.
+            return_dict (`bool`, *optional*):
+                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+
+        Returns:
+            A [`BaseModelOutput`] when `return_dict` resolves to True, otherwise a tuple holding the non-None values
+            among the last hidden state, the per-layer hidden states and the per-layer attentions.
+        """
+        # explicit arguments win; unset ones fall back to the config
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        hidden_states = inputs_embeds
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+
+        # expand attention_mask
+        if attention_mask is not None:
+            # [batch_size, seq_len] -> [batch_size, 1, target_seq_len, source_seq_len]
+            attention_mask = _prepare_4d_attention_mask(attention_mask, inputs_embeds.dtype)
+
+        encoder_states = () if output_hidden_states else None
+        all_attentions = () if output_attentions else None
+        for i, encoder_layer in enumerate(self.layers):
+            # record the input of each layer; the output of the last layer is appended after the loop
+            if output_hidden_states:
+                encoder_states = encoder_states + (hidden_states,)
+            # add LayerDrop (see https://huggingface.co/papers/1909.11556 for description)
+            to_drop = False
+            if self.training:
+                dropout_probability = torch.rand([])
+                if dropout_probability < self.layerdrop:  # skip the layer
+                    to_drop = True
+
+            if to_drop:
+                # a skipped layer leaves hidden_states untouched and contributes None as its attention entry
+                layer_outputs = (None, None)
+            else:
+                # we add object_queries as extra input to the encoder_layer
+                layer_outputs = encoder_layer(
+                    hidden_states,
+                    attention_mask,
+                    object_queries=object_queries,
+                    output_attentions=output_attentions,
+                )
+
+                hidden_states = layer_outputs[0]
+
+            if output_attentions:
+                all_attentions = all_attentions + (layer_outputs[1],)
+
+        if output_hidden_states:
+            encoder_states = encoder_states + (hidden_states,)
+
+        if not return_dict:
+            return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
+        return BaseModelOutput(
+            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
+        )
+
+
+class DetrDecoder(DetrPreTrainedModel):
+    """
+    Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`DetrDecoderLayer`].
+
+    The decoder updates the query embeddings through multiple self-attention and cross-attention layers.
+
+    Some small tweaks for DETR:
+
+    - object_queries and query_position_embeddings are added to the forward pass.
+    - if self.config.auxiliary_loss is set to True, also returns a stack of activations from all decoding layers.
+
+    Args:
+        config: DetrConfig
+    """
+
+    def __init__(self, config: DetrConfig):
+        super().__init__(config)
+        self.dropout = config.dropout
+        # probability of skipping a whole layer during training (LayerDrop)
+        self.layerdrop = config.decoder_layerdrop
+
+        self.layers = nn.ModuleList([DetrDecoderLayer(config) for _ in range(config.decoder_layers)])
+        # in DETR, the decoder uses layernorm after the last decoder layer output
+        self.layernorm = nn.LayerNorm(config.d_model)
+
+        self.gradient_checkpointing = False
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def forward(
+        self,
+        inputs_embeds=None,
+        attention_mask=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        object_queries=None,
+        query_position_embeddings=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+    ):
+        r"""
+        Args:
+            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+                The query embeddings that are passed into the decoder.
+
+            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Mask to avoid performing attention on certain queries. Mask values selected in `[0, 1]`:
+
+                - 1 for queries that are **not masked**,
+                - 0 for queries that are **masked**.
+
+                [What are attention masks?](../glossary#attention-mask)
+            encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
+                Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
+                of the decoder.
+            encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
+                Mask to avoid performing cross-attention on padding pixel_values of the encoder. Mask values selected
+                in `[0, 1]`:
+
+                - 1 for pixels that are real (i.e. **not masked**),
+                - 0 for pixels that are padding (i.e. **masked**).
+
+            object_queries (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+                Position embeddings that each decoder layer adds to the encoder hidden states before the key
+                projection of its cross-attention.
+            query_position_embeddings (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*):
+                Position embeddings that each decoder layer adds to the queries and keys of its self-attention and
+                to the queries of its cross-attention.
+
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            output_hidden_states (`bool`, *optional*):
+                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+                for more detail.
+            return_dict (`bool`, *optional*):
+                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+
+        Returns:
+            A [`DetrDecoderOutput`] when `return_dict` resolves to True, otherwise a tuple holding the non-None
+            values among the last hidden state, all hidden states, self-attentions, cross-attentions and the stacked
+            intermediate activations.
+        """
+        # explicit arguments win; unset ones fall back to the config
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # NOTE(review): `hidden_states` and `input_shape` are only bound here, so `inputs_embeds` is effectively
+        # required despite its None default.
+        if inputs_embeds is not None:
+            hidden_states = inputs_embeds
+            input_shape = inputs_embeds.size()[:-1]
+
+        combined_attention_mask = None
+
+        # NOTE(review): `combined_attention_mask` was just set to None, so this branch can never run and
+        # `attention_mask` is never applied; the decoder layers always receive None as self-attention mask.
+        if attention_mask is not None and combined_attention_mask is not None:
+            # [batch_size, seq_len] -> [batch_size, 1, target_seq_len, source_seq_len]
+            combined_attention_mask = combined_attention_mask + _prepare_4d_attention_mask(
+                attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]
+            )
+
+        # expand encoder attention mask
+        if encoder_hidden_states is not None and encoder_attention_mask is not None:
+            # [batch_size, seq_len] -> [batch_size, 1, target_seq_len, source_seq_len]
+            encoder_attention_mask = _prepare_4d_attention_mask(
+                encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]
+            )
+
+        # optional intermediate hidden states
+        intermediate = () if self.config.auxiliary_loss else None
+
+        # decoder layers
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+        all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None
+
+        for idx, decoder_layer in enumerate(self.layers):
+            # add LayerDrop (see https://huggingface.co/papers/1909.11556 for description)
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+            if self.training:
+                dropout_probability = torch.rand([])
+                # a skipped layer contributes nothing to the intermediate / attention collections
+                if dropout_probability < self.layerdrop:
+                    continue
+
+            layer_outputs = decoder_layer(
+                hidden_states,
+                combined_attention_mask,
+                object_queries,
+                query_position_embeddings,
+                encoder_hidden_states,  # as a positional argument for gradient checkpointing
+                encoder_attention_mask=encoder_attention_mask,
+                output_attentions=output_attentions,
+            )
+
+            hidden_states = layer_outputs[0]
+
+            # with auxiliary loss, the layer-normed output is both stored and fed to the next layer
+            if self.config.auxiliary_loss:
+                hidden_states = self.layernorm(hidden_states)
+                intermediate += (hidden_states,)
+
+            if output_attentions:
+                all_self_attns += (layer_outputs[1],)
+
+                if encoder_hidden_states is not None:
+                    all_cross_attentions += (layer_outputs[2],)
+
+        # finally, apply layernorm
+        hidden_states = self.layernorm(hidden_states)
+
+        # add hidden states from the last decoder layer
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+
+        # stack intermediate decoder activations
+        if self.config.auxiliary_loss:
+            intermediate = torch.stack(intermediate)
+
+        if not return_dict:
+            return tuple(
+                v
+                for v in [hidden_states, all_hidden_states, all_self_attns, all_cross_attentions, intermediate]
+                if v is not None
+            )
+        return DetrDecoderOutput(
+            last_hidden_state=hidden_states,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+            cross_attentions=all_cross_attentions,
+            intermediate_hidden_states=intermediate,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    The bare DETR Model (consisting of a backbone and encoder-decoder Transformer) outputting raw hidden-states without
+    any specific head on top.
+    """
+)
+class DetrModel(DetrPreTrainedModel):
+    def __init__(self, config: DetrConfig):
+        """
+        Build the backbone together with its position encoding, the 1x1 input projection, the query position
+        embeddings and the encoder/decoder stacks from `config`.
+        """
+        super().__init__(config)
+
+        # Create backbone + positional encoding
+        backbone = DetrConvEncoder(config)
+        object_queries = build_position_encoding(config)
+        self.backbone = DetrConvModel(backbone, object_queries)
+
+        # Create projection layer
+        # (1x1 convolution from the last backbone channel size to d_model)
+        self.input_projection = nn.Conv2d(backbone.intermediate_channel_sizes[-1], config.d_model, kernel_size=1)
+
+        # one learned embedding of size d_model per query (config.num_queries of them)
+        self.query_position_embeddings = nn.Embedding(config.num_queries, config.d_model)
+
+        self.encoder = DetrEncoder(config)
+        self.decoder = DetrDecoder(config)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_encoder(self):
+        """Return the encoder submodule (`self.encoder`)."""
+        return self.encoder
+
+    def freeze_backbone(self):
+        """Turn off gradient tracking for every parameter of the backbone's underlying model."""
+        backbone_model = self.backbone.conv_encoder.model
+        for _, parameter in backbone_model.named_parameters():
+            parameter.requires_grad_(False)
+
+    def unfreeze_backbone(self):
+        """Turn gradient tracking back on for every parameter of the backbone's underlying model."""
+        backbone_model = self.backbone.conv_encoder.model
+        for _, parameter in backbone_model.named_parameters():
+            parameter.requires_grad_(True)
+
+    @auto_docstring
+    def forward(
+        self,
+        pixel_values: torch.FloatTensor,
+        pixel_mask: Optional[torch.LongTensor] = None,
+        decoder_attention_mask: Optional[torch.FloatTensor] = None,
+        encoder_outputs: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple[torch.FloatTensor], DetrModelOutput]:
+        r"""
+        decoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, num_queries)`, *optional*):
+            Not used by default. Can be used to mask object queries.
+        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you
+            can choose to directly pass a flattened representation of an image.
+        decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*):
+            Optionally, instead of initializing the queries with a tensor of zeros, you can choose to directly pass an
+            embedded representation.
+
+        Examples:
+
+        ```python
+        >>> from transformers import AutoImageProcessor, DetrModel
+        >>> from PIL import Image
+        >>> import requests
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> image_processor = AutoImageProcessor.from_pretrained("facebook/detr-resnet-50")
+        >>> model = DetrModel.from_pretrained("facebook/detr-resnet-50")
+
+        >>> # prepare image for the model
+        >>> inputs = image_processor(images=image, return_tensors="pt")
+
+        >>> # forward pass
+        >>> outputs = model(**inputs)
+
+        >>> # the last hidden states are the final query embeddings of the Transformer decoder
+        >>> # these are of shape (batch_size, num_queries, hidden_size)
+        >>> last_hidden_states = outputs.last_hidden_state
+        >>> list(last_hidden_states.shape)
+        [1, 100, 256]
+        ```"""
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        batch_size, num_channels, height, width = pixel_values.shape
+        device = pixel_values.device
+
+        if pixel_mask is None:
+            pixel_mask = torch.ones(((batch_size, height, width)), device=device)  # default: all-ones mask, one entry per pixel
+
+        # First, send pixel_values + pixel_mask through the backbone to obtain the features
+        # pixel_values should be of shape (batch_size, num_channels, height, width)
+        # pixel_mask should be of shape (batch_size, height, width)
+        features, object_queries_list = self.backbone(pixel_values, pixel_mask)
+
+        # get final feature map and downsampled mask
+        feature_map, mask = features[-1]
+
+        if mask is None:
+            raise ValueError("Backbone does not return downsampled pixel mask")
+
+        # Second, apply 1x1 convolution to reduce the channel dimension to d_model (256 by default)
+        projected_feature_map = self.input_projection(feature_map)
+
+        # Third, flatten the feature map + position embeddings of shape NxCxHxW to NxCxHW, and permute it to NxHWxC
+        # In other words, turn their shape into (batch_size, sequence_length, hidden_size)
+        flattened_features = projected_feature_map.flatten(2).permute(0, 2, 1)
+        object_queries = object_queries_list[-1].flatten(2).permute(0, 2, 1)
+
+        flattened_mask = mask.flatten(1)
+
+        # Fourth, send flattened_features + flattened_mask + position embeddings through the encoder
+        # flattened_features is a Tensor of shape (batch_size, height*width, hidden_size)
+        # flattened_mask is a Tensor of shape (batch_size, height*width)
+        if encoder_outputs is None:
+            encoder_outputs = self.encoder(
+                inputs_embeds=flattened_features,
+                attention_mask=flattened_mask,
+                object_queries=object_queries,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+            )
+        # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True
+        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
+            encoder_outputs = BaseModelOutput(
+                last_hidden_state=encoder_outputs[0],
+                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
+                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
+            )
+
+        # Fifth, send query embeddings + object_queries through the decoder (which is conditioned on the encoder output)
+        query_position_embeddings = self.query_position_embeddings.weight.unsqueeze(0).repeat(batch_size, 1, 1)
+        queries = torch.zeros_like(query_position_embeddings)
+
+        # decoder outputs consist of (dec_features, dec_hidden, dec_attn)
+        decoder_outputs = self.decoder(
+            inputs_embeds=queries,  # NOTE(review): the inputs_embeds / decoder_inputs_embeds arguments are never read here
+            attention_mask=None,  # NOTE(review): decoder_attention_mask is accepted by forward() but not passed on
+            object_queries=object_queries,
+            query_position_embeddings=query_position_embeddings,
+            encoder_hidden_states=encoder_outputs[0],
+            encoder_attention_mask=flattened_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        if not return_dict:
+            return decoder_outputs + encoder_outputs  # decoder outputs first, encoder outputs appended after
+
+        return DetrModelOutput(
+            last_hidden_state=decoder_outputs.last_hidden_state,
+            decoder_hidden_states=decoder_outputs.hidden_states,
+            decoder_attentions=decoder_outputs.attentions,
+            cross_attentions=decoder_outputs.cross_attentions,
+            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
+            encoder_hidden_states=encoder_outputs.hidden_states,
+            encoder_attentions=encoder_outputs.attentions,
+            intermediate_hidden_states=decoder_outputs.intermediate_hidden_states,
+        )
+
+
+# taken from https://github.com/facebookresearch/detr/blob/master/models/detr.py
+class DetrMLPPredictionHead(nn.Module):
+    """
+    Very simple multi-layer perceptron (MLP, also called FFN), used to predict the normalized center coordinates,
+    height and width of a bounding box w.r.t. an image.
+
+    Copied from https://github.com/facebookresearch/detr/blob/master/models/detr.py
+
+    """
+
+    def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
+        super().__init__()
+        self.num_layers = num_layers
+        h = [hidden_dim] * (num_layers - 1)  # output widths of every layer except the last
+        self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]))  # chain of (in, out) pairs
+
+    def forward(self, x):
+        for i, layer in enumerate(self.layers):
+            x = nn.functional.relu(layer(x)) if i < self.num_layers - 1 else layer(x)  # ReLU after all but the final layer
+        return x
+
+
+@auto_docstring(
+    custom_intro="""
+    DETR Model (consisting of a backbone and encoder-decoder Transformer) with object detection heads on top, for tasks
+    such as COCO detection.
+    """
+)
+class DetrForObjectDetection(DetrPreTrainedModel):
+    def __init__(self, config: DetrConfig):
+        super().__init__(config)
+
+        # DETR encoder-decoder model
+        self.model = DetrModel(config)
+
+        # Object detection heads
+        self.class_labels_classifier = nn.Linear(
+            config.d_model, config.num_labels + 1
+        )  # We add one for the "no object" class
+        self.bbox_predictor = DetrMLPPredictionHead(
+            input_dim=config.d_model, hidden_dim=config.d_model, output_dim=4, num_layers=3
+        )
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        pixel_values: torch.FloatTensor,
+        pixel_mask: Optional[torch.LongTensor] = None,
+        decoder_attention_mask: Optional[torch.FloatTensor] = None,
+        encoder_outputs: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[list[dict]] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple[torch.FloatTensor], DetrObjectDetectionOutput]:
+        r"""
+        decoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, num_queries)`, *optional*):
+            Not used by default. Can be used to mask object queries.
+        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you
+            can choose to directly pass a flattened representation of an image.
+        decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*):
+            Optionally, instead of initializing the queries with a tensor of zeros, you can choose to directly pass an
+            embedded representation.
+        labels (`list[Dict]` of len `(batch_size,)`, *optional*):
+            Labels for computing the bipartite matching loss. List of dicts, each dictionary containing at least the
+            following 2 keys: 'class_labels' and 'boxes' (the class labels and bounding boxes of an image in the batch
+            respectively). The class labels themselves should be a `torch.LongTensor` of len `(number of bounding boxes
+            in the image,)` and the boxes a `torch.FloatTensor` of shape `(number of bounding boxes in the image, 4)`.
+
+        Examples:
+
+        ```python
+        >>> from transformers import AutoImageProcessor, DetrForObjectDetection
+        >>> import torch
+        >>> from PIL import Image
+        >>> import requests
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> image_processor = AutoImageProcessor.from_pretrained("facebook/detr-resnet-50")
+        >>> model = DetrForObjectDetection.from_pretrained("facebook/detr-resnet-50")
+
+        >>> inputs = image_processor(images=image, return_tensors="pt")
+        >>> outputs = model(**inputs)
+
+        >>> # convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax)
+        >>> target_sizes = torch.tensor([image.size[::-1]])
+        >>> results = image_processor.post_process_object_detection(outputs, threshold=0.9, target_sizes=target_sizes)[
+        ...     0
+        ... ]
+
+        >>> for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
+        ...     box = [round(i, 2) for i in box.tolist()]
+        ...     print(
+        ...         f"Detected {model.config.id2label[label.item()]} with confidence "
+        ...         f"{round(score.item(), 3)} at location {box}"
+        ...     )
+        Detected remote with confidence 0.998 at location [40.16, 70.81, 175.55, 117.98]
+        Detected remote with confidence 0.996 at location [333.24, 72.55, 368.33, 187.66]
+        Detected couch with confidence 0.995 at location [-0.02, 1.15, 639.73, 473.76]
+        Detected cat with confidence 0.999 at location [13.24, 52.05, 314.02, 470.93]
+        Detected cat with confidence 0.999 at location [345.4, 23.85, 640.37, 368.72]
+        ```"""
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # First, send images through the DETR base model to obtain encoder + decoder outputs
+        outputs = self.model(
+            pixel_values,
+            pixel_mask=pixel_mask,
+            decoder_attention_mask=decoder_attention_mask,
+            encoder_outputs=encoder_outputs,
+            inputs_embeds=inputs_embeds,
+            decoder_inputs_embeds=decoder_inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        sequence_output = outputs[0]
+
+        # class logits + predicted bounding boxes
+        logits = self.class_labels_classifier(sequence_output)
+        pred_boxes = self.bbox_predictor(sequence_output).sigmoid()
+
+        loss, loss_dict, auxiliary_outputs = None, None, None
+        if labels is not None:
+            outputs_class, outputs_coord = None, None
+            if self.config.auxiliary_loss:
+                intermediate = outputs.intermediate_hidden_states if return_dict else outputs[4]  # NOTE(review): fixed index 4 presumably assumes all optional decoder entries are present in the tuple - TODO confirm
+                outputs_class = self.class_labels_classifier(intermediate)
+                outputs_coord = self.bbox_predictor(intermediate).sigmoid()
+            loss, loss_dict, auxiliary_outputs = self.loss_function(
+                logits, labels, self.device, pred_boxes, self.config, outputs_class, outputs_coord
+            )
+
+        if not return_dict:
+            if auxiliary_outputs is not None:
+                output = (logits, pred_boxes) + auxiliary_outputs + outputs
+            else:
+                output = (logits, pred_boxes) + outputs
+            return ((loss, loss_dict) + output) if loss is not None else output  # loss entries are prepended only when labels were given
+
+        return DetrObjectDetectionOutput(
+            loss=loss,
+            loss_dict=loss_dict,
+            logits=logits,
+            pred_boxes=pred_boxes,
+            auxiliary_outputs=auxiliary_outputs,
+            last_hidden_state=outputs.last_hidden_state,
+            decoder_hidden_states=outputs.decoder_hidden_states,
+            decoder_attentions=outputs.decoder_attentions,
+            cross_attentions=outputs.cross_attentions,
+            encoder_last_hidden_state=outputs.encoder_last_hidden_state,
+            encoder_hidden_states=outputs.encoder_hidden_states,
+            encoder_attentions=outputs.encoder_attentions,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    DETR Model (consisting of a backbone and encoder-decoder Transformer) with a segmentation head on top, for tasks
+    such as COCO panoptic.
+    """
+)
+class DetrForSegmentation(DetrPreTrainedModel):
+    def __init__(self, config: DetrConfig):
+        super().__init__(config)
+
+        # object detection model
+        self.detr = DetrForObjectDetection(config)
+
+        # segmentation head
+        hidden_size, number_of_heads = config.d_model, config.encoder_attention_heads
+        intermediate_channel_sizes = self.detr.model.backbone.conv_encoder.intermediate_channel_sizes
+
+        self.mask_head = DetrMaskHeadSmallConv(
+            hidden_size + number_of_heads, intermediate_channel_sizes[::-1][-3:], hidden_size
+        )
+
+        self.bbox_attention = DetrMHAttentionMap(
+            hidden_size, hidden_size, number_of_heads, dropout=0.0, std=config.init_xavier_std
+        )
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        pixel_values: torch.FloatTensor,
+        pixel_mask: Optional[torch.LongTensor] = None,
+        decoder_attention_mask: Optional[torch.FloatTensor] = None,
+        encoder_outputs: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[list[dict]] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple[torch.FloatTensor], DetrSegmentationOutput]:
+        r"""
+        decoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, num_queries)`, *optional*):
+            Not used by default. Can be used to mask object queries.
+        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you
+            can choose to directly pass a flattened representation of an image.
+        decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*):
+            Optionally, instead of initializing the queries with a tensor of zeros, you can choose to directly pass an
+            embedded representation.
+        labels (`list[Dict]` of len `(batch_size,)`, *optional*):
+            Labels for computing the bipartite matching loss, DICE/F-1 loss and Focal loss. List of dicts, each
+            dictionary containing at least the following 3 keys: 'class_labels', 'boxes' and 'masks' (the class labels,
+            bounding boxes and segmentation masks of an image in the batch respectively). The class labels themselves
+            should be a `torch.LongTensor` of len `(number of bounding boxes in the image,)`, the boxes a
+            `torch.FloatTensor` of shape `(number of bounding boxes in the image, 4)` and the masks a
+            `torch.FloatTensor` of shape `(number of bounding boxes in the image, height, width)`.
+
+        Examples:
+
+        ```python
+        >>> import io
+        >>> import requests
+        >>> from PIL import Image
+        >>> import torch
+        >>> import numpy
+
+        >>> from transformers import AutoImageProcessor, DetrForSegmentation
+        >>> from transformers.image_transforms import rgb_to_id
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> image_processor = AutoImageProcessor.from_pretrained("facebook/detr-resnet-50-panoptic")
+        >>> model = DetrForSegmentation.from_pretrained("facebook/detr-resnet-50-panoptic")
+
+        >>> # prepare image for the model
+        >>> inputs = image_processor(images=image, return_tensors="pt")
+
+        >>> # forward pass
+        >>> outputs = model(**inputs)
+
+        >>> # Use the `post_process_panoptic_segmentation` method of the `image_processor` to retrieve post-processed panoptic segmentation maps
+        >>> # Segmentation results are returned as a list of dictionaries
+        >>> result = image_processor.post_process_panoptic_segmentation(outputs, target_sizes=[(300, 500)])
+
+        >>> # A tensor of shape (height, width) where each value denotes a segment id, filled with -1 if no segment is found
+        >>> panoptic_seg = result[0]["segmentation"]
+        >>> # Get prediction score and segment_id to class_id mapping of each segment
+        >>> panoptic_segments_info = result[0]["segments_info"]
+        ```"""
+
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        batch_size, num_channels, height, width = pixel_values.shape
+        device = pixel_values.device
+
+        if pixel_mask is None:
+            pixel_mask = torch.ones((batch_size, height, width), device=device)  # default: all-ones mask, one entry per pixel
+
+        # First, get list of feature maps and position embeddings
+        features, object_queries_list = self.detr.model.backbone(pixel_values, pixel_mask=pixel_mask)
+
+        # Second, apply 1x1 convolution to reduce the channel dimension to d_model (256 by default)
+        feature_map, mask = features[-1]
+        batch_size, num_channels, height, width = feature_map.shape  # rebinds height/width to the feature map's size
+        projected_feature_map = self.detr.model.input_projection(feature_map)
+
+        # Third, flatten the feature map + position embeddings of shape NxCxHxW to NxCxHW, and permute it to NxHWxC
+        # In other words, turn their shape into (batch_size, sequence_length, hidden_size)
+        flattened_features = projected_feature_map.flatten(2).permute(0, 2, 1)
+        object_queries = object_queries_list[-1].flatten(2).permute(0, 2, 1)
+
+        flattened_mask = mask.flatten(1)
+
+        # Fourth, send flattened_features + flattened_mask + position embeddings through the encoder
+        # flattened_features is a Tensor of shape (batch_size, height*width, hidden_size)
+        # flattened_mask is a Tensor of shape (batch_size, height*width)
+        if encoder_outputs is None:
+            encoder_outputs = self.detr.model.encoder(
+                inputs_embeds=flattened_features,
+                attention_mask=flattened_mask,
+                object_queries=object_queries,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+            )
+        # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True
+        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
+            encoder_outputs = BaseModelOutput(
+                last_hidden_state=encoder_outputs[0],
+                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
+                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
+            )
+
+        # Fifth, send query embeddings + position embeddings through the decoder (which is conditioned on the encoder output)
+        query_position_embeddings = self.detr.model.query_position_embeddings.weight.unsqueeze(0).repeat(
+            batch_size, 1, 1
+        )
+        queries = torch.zeros_like(query_position_embeddings)
+
+        # decoder outputs consist of (dec_features, dec_hidden, dec_attn)
+        decoder_outputs = self.detr.model.decoder(
+            inputs_embeds=queries,  # NOTE(review): the inputs_embeds / decoder_inputs_embeds arguments are never read here
+            attention_mask=None,  # NOTE(review): decoder_attention_mask is accepted by forward() but not passed on
+            object_queries=object_queries,
+            query_position_embeddings=query_position_embeddings,
+            encoder_hidden_states=encoder_outputs[0],
+            encoder_attention_mask=flattened_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        sequence_output = decoder_outputs[0]
+
+        # Sixth, compute logits, pred_boxes and pred_masks
+        logits = self.detr.class_labels_classifier(sequence_output)
+        pred_boxes = self.detr.bbox_predictor(sequence_output).sigmoid()
+
+        memory = encoder_outputs[0].permute(0, 2, 1).view(batch_size, self.config.d_model, height, width)
+        mask = flattened_mask.view(batch_size, height, width)
+
+        # FIXME h_boxes takes the last one computed, keep this in mind
+        # important: we need to reverse the mask, since in the original implementation the mask works reversed
+        # bbox_mask is of shape (batch_size, num_queries, number_of_attention_heads in bbox_attention, height/32, width/32)
+        bbox_mask = self.bbox_attention(sequence_output, memory, mask=~mask)
+
+        seg_masks = self.mask_head(projected_feature_map, bbox_mask, [features[2][0], features[1][0], features[0][0]])
+
+        pred_masks = seg_masks.view(batch_size, self.detr.config.num_queries, seg_masks.shape[-2], seg_masks.shape[-1])
+
+        loss, loss_dict, auxiliary_outputs = None, None, None
+        if labels is not None:
+            outputs_class, outputs_coord = None, None
+            if self.config.auxiliary_loss:
+                intermediate = decoder_outputs.intermediate_hidden_states if return_dict else decoder_outputs[-1]  # tuple mode: read the last tuple entry
+                outputs_class = self.detr.class_labels_classifier(intermediate)
+                outputs_coord = self.detr.bbox_predictor(intermediate).sigmoid()
+            loss, loss_dict, auxiliary_outputs = self.loss_function(
+                logits, labels, device, pred_boxes, pred_masks, self.config, outputs_class, outputs_coord
+            )
+
+        if not return_dict:
+            if auxiliary_outputs is not None:
+                output = (logits, pred_boxes, pred_masks) + auxiliary_outputs + decoder_outputs + encoder_outputs
+            else:
+                output = (logits, pred_boxes, pred_masks) + decoder_outputs + encoder_outputs
+            return ((loss, loss_dict) + output) if loss is not None else output  # loss entries are prepended only when labels were given
+
+        return DetrSegmentationOutput(
+            loss=loss,
+            loss_dict=loss_dict,
+            logits=logits,
+            pred_boxes=pred_boxes,
+            pred_masks=pred_masks,
+            auxiliary_outputs=auxiliary_outputs,
+            last_hidden_state=decoder_outputs.last_hidden_state,
+            decoder_hidden_states=decoder_outputs.hidden_states,
+            decoder_attentions=decoder_outputs.attentions,
+            cross_attentions=decoder_outputs.cross_attentions,
+            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
+            encoder_hidden_states=encoder_outputs.hidden_states,
+            encoder_attentions=encoder_outputs.attentions,
+        )
+
+
+def _expand(tensor, length: int):  # repeat each entry `length` times along a new dim 1, then merge dims 0 and 1
+    return tensor.unsqueeze(1).repeat(1, int(length), 1, 1, 1).flatten(0, 1)  # presumably expects a 4-D input - TODO confirm
+
+
+# taken from https://github.com/facebookresearch/detr/blob/master/models/segmentation.py
+class DetrMaskHeadSmallConv(nn.Module):
+    """
+    Simple convolutional head, using group norm. Upsampling is done using a FPN approach
+    """
+
+    def __init__(self, dim, fpn_dims, context_dim):
+        super().__init__()
+
+        if dim % 8 != 0:
+            raise ValueError(
+                "The hidden_size + number of attention heads must be divisible by 8 as the number of groups in"
+                " GroupNorm is set to 8"
+            )
+
+        inter_dims = [dim, context_dim // 2, context_dim // 4, context_dim // 8, context_dim // 16, context_dim // 64]  # NOTE: only indices 1-4 are read below
+
+        self.lay1 = nn.Conv2d(dim, dim, 3, padding=1)
+        self.gn1 = nn.GroupNorm(8, dim)
+        self.lay2 = nn.Conv2d(dim, inter_dims[1], 3, padding=1)
+        self.gn2 = nn.GroupNorm(min(8, inter_dims[1]), inter_dims[1])
+        self.lay3 = nn.Conv2d(inter_dims[1], inter_dims[2], 3, padding=1)
+        self.gn3 = nn.GroupNorm(min(8, inter_dims[2]), inter_dims[2])
+        self.lay4 = nn.Conv2d(inter_dims[2], inter_dims[3], 3, padding=1)
+        self.gn4 = nn.GroupNorm(min(8, inter_dims[3]), inter_dims[3])
+        self.lay5 = nn.Conv2d(inter_dims[3], inter_dims[4], 3, padding=1)
+        self.gn5 = nn.GroupNorm(min(8, inter_dims[4]), inter_dims[4])
+        self.out_lay = nn.Conv2d(inter_dims[4], 1, 3, padding=1)  # single output channel
+
+        self.dim = dim
+
+        self.adapter1 = nn.Conv2d(fpn_dims[0], inter_dims[1], 1)  # 1x1 convs mapping each FPN input to the matching stage width
+        self.adapter2 = nn.Conv2d(fpn_dims[1], inter_dims[2], 1)
+        self.adapter3 = nn.Conv2d(fpn_dims[2], inter_dims[3], 1)
+
+        for m in self.modules():  # re-initialise every Conv2d in this head
+            if isinstance(m, nn.Conv2d):
+                nn.init.kaiming_uniform_(m.weight, a=1)
+                nn.init.constant_(m.bias, 0)
+
+    def forward(self, x: Tensor, bbox_mask: Tensor, fpns: list[Tensor]):
+        # here we concatenate x, the projected feature map, of shape (batch_size, d_model, height/32, width/32) with
+        # the bbox_mask = the attention maps of shape (batch_size, n_queries, n_heads, height/32, width/32).
+        # We expand the projected feature map by bbox_mask.shape[1] (n_queries per the shape above) before concatenating.
+        x = torch.cat([_expand(x, bbox_mask.shape[1]), bbox_mask.flatten(0, 1)], 1)
+
+        x = self.lay1(x)
+        x = self.gn1(x)
+        x = nn.functional.relu(x)
+        x = self.lay2(x)
+        x = self.gn2(x)
+        x = nn.functional.relu(x)
+
+        cur_fpn = self.adapter1(fpns[0])
+        if cur_fpn.size(0) != x.size(0):  # repeat the FPN map so its leading dim matches x
+            cur_fpn = _expand(cur_fpn, x.size(0) // cur_fpn.size(0))
+        x = cur_fpn + nn.functional.interpolate(x, size=cur_fpn.shape[-2:], mode="nearest")  # resize x to the FPN map's spatial size
+        x = self.lay3(x)
+        x = self.gn3(x)
+        x = nn.functional.relu(x)
+
+        cur_fpn = self.adapter2(fpns[1])
+        if cur_fpn.size(0) != x.size(0):  # repeat the FPN map so its leading dim matches x
+            cur_fpn = _expand(cur_fpn, x.size(0) // cur_fpn.size(0))
+        x = cur_fpn + nn.functional.interpolate(x, size=cur_fpn.shape[-2:], mode="nearest")  # resize x to the FPN map's spatial size
+        x = self.lay4(x)
+        x = self.gn4(x)
+        x = nn.functional.relu(x)
+
+        cur_fpn = self.adapter3(fpns[2])
+        if cur_fpn.size(0) != x.size(0):  # repeat the FPN map so its leading dim matches x
+            cur_fpn = _expand(cur_fpn, x.size(0) // cur_fpn.size(0))
+        x = cur_fpn + nn.functional.interpolate(x, size=cur_fpn.shape[-2:], mode="nearest")  # resize x to the FPN map's spatial size
+        x = self.lay5(x)
+        x = self.gn5(x)
+        x = nn.functional.relu(x)
+
+        x = self.out_lay(x)
+        return x
+
+
+class DetrMHAttentionMap(nn.Module):
+    """This is a 2D attention module, which only returns the attention softmax (no multiplication by value)"""
+
+    def __init__(self, query_dim, hidden_dim, num_heads, dropout=0.0, bias=True, std=None):  # NOTE: `std` is accepted but not used here
+        super().__init__()
+        self.num_heads = num_heads
+        self.hidden_dim = hidden_dim
+        self.dropout = nn.Dropout(dropout)
+
+        self.q_linear = nn.Linear(query_dim, hidden_dim, bias=bias)
+        self.k_linear = nn.Linear(query_dim, hidden_dim, bias=bias)
+
+        self.normalize_fact = float(hidden_dim / self.num_heads) ** -0.5  # 1 / sqrt(per-head dimension)
+
+    def forward(self, q, k, mask: Optional[Tensor] = None):
+        q = self.q_linear(q)
+        k = nn.functional.conv2d(k, self.k_linear.weight.unsqueeze(-1).unsqueeze(-1), self.k_linear.bias)  # k_linear applied as a 1x1 convolution
+        queries_per_head = q.view(q.shape[0], q.shape[1], self.num_heads, self.hidden_dim // self.num_heads)
+        keys_per_head = k.view(k.shape[0], self.num_heads, self.hidden_dim // self.num_heads, k.shape[-2], k.shape[-1])
+        weights = torch.einsum("bqnc,bnchw->bqnhw", queries_per_head * self.normalize_fact, keys_per_head)  # contract over the per-head channel dim
+
+        if mask is not None:
+            weights = weights.masked_fill(mask.unsqueeze(1).unsqueeze(1), torch.finfo(weights.dtype).min)  # True entries get the dtype's minimum value
+        weights = nn.functional.softmax(weights.flatten(2), dim=-1).view(weights.size())  # softmax jointly over the flattened (n, h, w) dims
+        weights = self.dropout(weights)
+        return weights
+
+
+__all__ = [  # names exported by `from <this module> import *`
+    "DetrForObjectDetection",
+    "DetrForSegmentation",
+    "DetrModel",
+    "DetrPreTrainedModel",
+]
diff --git a/phivenv/Lib/site-packages/transformers/models/dia/__init__.py b/phivenv/Lib/site-packages/transformers/models/dia/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..d738fbc087888597da19735271366d4e35ab708c
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/dia/__init__.py
@@ -0,0 +1,31 @@
+# Copyright 2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:  # only static type checkers take this branch and see the star-imports
+    from .configuration_dia import *
+    from .feature_extraction_dia import *
+    from .generation_dia import *
+    from .modeling_dia import *
+    from .processing_dia import *
+    from .tokenization_dia import *
+else:  # at runtime the package's entry in sys.modules is replaced by a _LazyModule
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/phivenv/Lib/site-packages/transformers/models/dia/__pycache__/__init__.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/dia/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0b86638f91975e849ae15f77827099627ec500ad
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/dia/__pycache__/__init__.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/dia/__pycache__/configuration_dia.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/dia/__pycache__/configuration_dia.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8b995edaef41300cdf31ab0d9d13f349b4ba69aa
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/dia/__pycache__/configuration_dia.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/dia/__pycache__/feature_extraction_dia.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/dia/__pycache__/feature_extraction_dia.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..56d5b4abedbef8267c685dd80020c24c04073d00
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/dia/__pycache__/feature_extraction_dia.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/dia/__pycache__/generation_dia.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/dia/__pycache__/generation_dia.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3d363cc5f0b625a9a7d90c7f348d926f2552dae8
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/dia/__pycache__/generation_dia.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/dia/__pycache__/modeling_dia.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/dia/__pycache__/modeling_dia.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..919d3b1df959ca5be1cda2460707e596ec00f4f3
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/dia/__pycache__/modeling_dia.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/dia/__pycache__/modular_dia.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/dia/__pycache__/modular_dia.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..083ee08bd266208401b7708c1b42df51e1f9416c
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/dia/__pycache__/modular_dia.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/dia/__pycache__/processing_dia.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/dia/__pycache__/processing_dia.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8fb5060b340072039ebd94c8f7797b2102b9cae3
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/dia/__pycache__/processing_dia.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/dia/__pycache__/tokenization_dia.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/dia/__pycache__/tokenization_dia.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5414e8dd102ef830f2b3d1273bdabd7d08ff2fe9
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/dia/__pycache__/tokenization_dia.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/dia/configuration_dia.py b/phivenv/Lib/site-packages/transformers/models/dia/configuration_dia.py
new file mode 100644
index 0000000000000000000000000000000000000000..d4dec60b3e4853574e4d528e7b641507a8c0b414
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/dia/configuration_dia.py
@@ -0,0 +1,376 @@
+# coding=utf-8
+# Copyright 2025 The Nari Labs and HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Dia model configuration"""
+
+from typing import Optional
+
+from ...configuration_utils import PretrainedConfig
+from ...modeling_rope_utils import rope_config_validation
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class DiaEncoderConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`DiaEncoder`]. It is used to instantiate a Dia
+    encoder according to the specified arguments, defining the encoder architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        max_position_embeddings (`int`, *optional*, defaults to 1024):
+            The maximum sequence length that this model might ever be used with.
+        num_hidden_layers (`int`, *optional*, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        hidden_size (`int`, *optional*, defaults to 1024):
+            Dimensionality of the encoder layers and the pooler layer.
+        num_attention_heads (`int`, *optional*, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        num_key_value_heads (`int`, *optional*, defaults to 16):
+            Number of key and value heads for each attention layer in the Transformer encoder.
+        head_dim (`int`, *optional*, defaults to 128):
+            Dimensionality of the attention head.
+        intermediate_size (`int`, *optional*, defaults to 4096):
+            Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
+        norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the normalization layers.
+        vocab_size (`int`, *optional*, defaults to 256):
+            Vocabulary size of the Dia model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`DiaModel`].
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"swish"` and `"gelu_new"` are supported.
+        rope_theta (`float`, *optional*, defaults to 10000.0):
+            The base period of the RoPE embeddings.
+        rope_scaling (`dict`, *optional*):
+            Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
+            and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
+            accordingly.
+            Expected contents:
+                `rope_type` (`str`):
+                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
+                    'llama3'], with 'default' being the original RoPE implementation.
+                `factor` (`float`, *optional*):
+                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
+                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
+                    original maximum pre-trained length.
+                `original_max_position_embeddings` (`int`, *optional*):
+                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
+                    pretraining.
+                `attention_factor` (`float`, *optional*):
+                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
+                    computation. If unspecified, it defaults to value recommended by the implementation, using the
+                    `factor` field to infer the suggested value.
+                `beta_fast` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 32.
+                `beta_slow` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 1.
+                `short_factor` (`List[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `long_factor` (`List[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to long contexts (>
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `low_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
+                `high_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+    """
+
+    model_type = "dia_encoder"
+
+    def __init__(
+        self,
+        max_position_embeddings: int = 1024,
+        num_hidden_layers: int = 12,
+        hidden_size: int = 1024,
+        num_attention_heads: int = 16,
+        num_key_value_heads: int = 16,
+        head_dim: int = 128,
+        intermediate_size: int = 4096,
+        norm_eps: float = 1e-5,
+        vocab_size: int = 256,
+        hidden_act: str = "silu",
+        rope_theta: float = 10000.0,
+        rope_scaling: Optional[dict] = None,
+        initializer_range: float = 0.02,
+        **kwargs,
+    ):
+        self.max_position_embeddings = max_position_embeddings
+        self.num_hidden_layers = num_hidden_layers
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_attention_heads = num_attention_heads
+        self.head_dim = head_dim
+        self.norm_eps = norm_eps
+        self.vocab_size = vocab_size
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        # Validate the correctness of rotary position embeddings parameters
+        # BC: if there is a 'type' field, copy it to 'rope_type'.
+        if self.rope_scaling is not None and "type" in self.rope_scaling:
+            self.rope_scaling["rope_type"] = self.rope_scaling["type"]
+        rope_config_validation(self)
+        self.initializer_range = initializer_range
+        super().__init__(**kwargs)
+
+
+class DiaDecoderConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`DiaDecoder`]. It is used to instantiate a Dia
+    decoder according to the specified arguments, defining the decoder architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        max_position_embeddings (`int`, *optional*, defaults to 3072):
+            The maximum sequence length that this model might ever be used with.
+        num_hidden_layers (`int`, *optional*, defaults to 18):
+            Number of hidden layers in the Transformer decoder.
+        hidden_size (`int`, *optional*, defaults to 2048):
+            Dimensionality of the decoder layers and the pooler layer.
+        intermediate_size (`int`, *optional*, defaults to 8192):
+            Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer decoder.
+        num_attention_heads (`int`, *optional*, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer decoder.
+        num_key_value_heads (`int`, *optional*, defaults to 4):
+            Number of key and value heads for each attention layer in the Transformer decoder.
+        head_dim (`int`, *optional*, defaults to 128):
+            Dimensionality of the attention head.
+        cross_num_attention_heads (`int`, *optional*, defaults to 16):
+            Number of attention heads for each cross-attention layer in the Transformer decoder.
+        cross_head_dim (`int`, *optional*, defaults to 128):
+            Dimensionality of the cross-attention head.
+        cross_num_key_value_heads (`int`, *optional*, defaults to 16):
+            Number of key and value heads for each cross-attention layer in the Transformer decoder.
+        cross_hidden_size (`int`, *optional*, defaults to 1024):
+            Dimensionality of the cross-attention layers.
+        norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the normalization layers.
+        vocab_size (`int`, *optional*, defaults to 1028):
+            Vocabulary size of the Dia model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`DiaModel`].
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the decoder. If string, `"gelu"`, `"relu"`,
+            `"swish"` and `"gelu_new"` are supported.
+        num_channels (`int`, *optional*, defaults to 9):
+            Number of channels for the Dia decoder.
+        rope_theta (`float`, *optional*, defaults to 10000.0):
+            The base period of the RoPE embeddings.
+        rope_scaling (`dict`, *optional*):
+            Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
+            and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
+            accordingly.
+            Expected contents:
+                `rope_type` (`str`):
+                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
+                    'llama3'], with 'default' being the original RoPE implementation.
+                `factor` (`float`, *optional*):
+                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
+                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
+                    original maximum pre-trained length.
+                `original_max_position_embeddings` (`int`, *optional*):
+                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
+                    pretraining.
+                `attention_factor` (`float`, *optional*):
+                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
+                    computation. If unspecified, it defaults to value recommended by the implementation, using the
+                    `factor` field to infer the suggested value.
+                `beta_fast` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 32.
+                `beta_slow` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 1.
+                `short_factor` (`List[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `long_factor` (`List[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to long contexts (>
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `low_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
+                `high_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models).
+        is_encoder_decoder (`bool`, *optional*, defaults to `True`):
+            Indicating that this model is part of an encoder-decoder architecture.
+    """
+
+    model_type = "dia_decoder"
+
+    def __init__(
+        self,
+        max_position_embeddings: int = 3072,
+        num_hidden_layers: int = 18,
+        hidden_size: int = 2048,
+        intermediate_size: int = 8192,
+        num_attention_heads: int = 16,
+        num_key_value_heads: int = 4,
+        head_dim: int = 128,
+        cross_num_attention_heads: int = 16,
+        cross_head_dim: int = 128,
+        cross_num_key_value_heads: int = 16,
+        cross_hidden_size: int = 1024,
+        norm_eps: float = 1e-5,
+        vocab_size: int = 1028,
+        hidden_act: str = "silu",
+        num_channels: int = 9,
+        rope_theta: float = 10000.0,
+        rope_scaling: Optional[dict] = None,
+        initializer_range: float = 0.02,
+        use_cache: bool = True,
+        is_encoder_decoder: bool = True,
+        **kwargs,
+    ):
+        self.max_position_embeddings = max_position_embeddings
+        self.num_hidden_layers = num_hidden_layers
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_attention_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+        self.head_dim = head_dim
+        self.cross_num_key_value_heads = cross_num_key_value_heads
+        self.cross_num_attention_heads = cross_num_attention_heads
+        self.cross_head_dim = cross_head_dim
+        self.cross_hidden_size = cross_hidden_size
+        self.norm_eps = norm_eps
+        self.vocab_size = vocab_size
+        self.hidden_act = hidden_act
+        self.num_channels = num_channels
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        # Validate the correctness of rotary position embeddings parameters
+        # BC: if there is a 'type' field, copy it to 'rope_type'.
+        if self.rope_scaling is not None and "type" in self.rope_scaling:
+            self.rope_scaling["rope_type"] = self.rope_scaling["type"]
+        rope_config_validation(self)
+        self.initializer_range = initializer_range
+        self.use_cache = use_cache
+        super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs)
+
+
+class DiaConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`DiaModel`]. It is used to instantiate a
+    Dia model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the
+    [nari-labs/Dia-1.6B](https://huggingface.co/nari-labs/Dia-1.6B) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        encoder_config (`DiaEncoderConfig`, *optional*):
+            Configuration for the encoder part of the model. If not provided, a default `DiaEncoderConfig` will be used.
+        decoder_config (`DiaDecoderConfig`, *optional*):
+            Configuration for the decoder part of the model. If not provided, a default `DiaDecoderConfig` will be used.
+        norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the normalization layers.
+        is_encoder_decoder (`bool`, *optional*, defaults to `True`):
+            Indicating that this model uses an encoder-decoder architecture.
+        pad_token_id (`int`, *optional*, defaults to 1025):
+            Padding token id.
+        eos_token_id (`int`, *optional*, defaults to 1024):
+            End of stream token id.
+        bos_token_id (`int`, *optional*, defaults to 1026):
+            Beginning of stream token id.
+        delay_pattern (`list[int]`, *optional*, defaults to `[0, 8, 9, 10, 11, 12, 13, 14, 15]`):
+            The delay pattern for the decoder. The length of this list must match `decoder_config.num_channels`.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models).
+
+    Example:
+
+    ```python
+    >>> from transformers import DiaConfig, DiaModel
+
+    >>> # Initializing a DiaConfig with default values
+    >>> configuration = DiaConfig()
+
+    >>> # Initializing a DiaModel (with random weights) from the configuration
+    >>> model = DiaModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```
+    """
+
+    model_type = "dia"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    sub_configs = {"encoder_config": DiaEncoderConfig, "decoder_config": DiaDecoderConfig}
+
+    def __init__(
+        self,
+        encoder_config: Optional[DiaEncoderConfig] = None,
+        decoder_config: Optional[DiaDecoderConfig] = None,
+        norm_eps: float = 1e-5,
+        is_encoder_decoder: bool = True,
+        pad_token_id: int = 1025,
+        eos_token_id: int = 1024,
+        bos_token_id: int = 1026,
+        delay_pattern: Optional[list[int]] = None,
+        initializer_range: float = 0.02,
+        use_cache: bool = True,
+        **kwargs,
+    ):
+        if isinstance(encoder_config, dict):
+            encoder_config = DiaEncoderConfig(**encoder_config)
+        if isinstance(decoder_config, dict):
+            decoder_config = DiaDecoderConfig(**decoder_config)
+        self.encoder_config = encoder_config if encoder_config is not None else DiaEncoderConfig()
+        self.decoder_config = decoder_config if decoder_config is not None else DiaDecoderConfig()
+        self.norm_eps = norm_eps
+        self.delay_pattern = delay_pattern if delay_pattern is not None else [0, 8, 9, 10, 11, 12, 13, 14, 15]
+        self.initializer_range = initializer_range
+        self.use_cache = use_cache
+
+        assert self.decoder_config.num_channels == len(self.delay_pattern), (
+            "Number of channels must match delay pattern length."
+        )
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            eos_token_id=eos_token_id,
+            bos_token_id=bos_token_id,
+            is_encoder_decoder=is_encoder_decoder,
+            **kwargs,
+        )
+
+    def get_text_config(self, *args, **kwargs):
+        """Return the decoder (audio) config, since the decoder plays the role usually filled by a text backbone."""
+        return self.decoder_config
+
+
+__all__ = ["DiaConfig", "DiaEncoderConfig", "DiaDecoderConfig"]
diff --git a/phivenv/Lib/site-packages/transformers/models/dia/feature_extraction_dia.py b/phivenv/Lib/site-packages/transformers/models/dia/feature_extraction_dia.py
new file mode 100644
index 0000000000000000000000000000000000000000..0d03ceff37f96a2fb1d5963c237147fa6975e2b8
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/dia/feature_extraction_dia.py
@@ -0,0 +1,183 @@
+# coding=utf-8
+# Copyright 2025 The Nari Labs and HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Feature extractor class for Dia"""
+
+from typing import Optional, Union
+
+import numpy as np
+
+from ...feature_extraction_sequence_utils import SequenceFeatureExtractor
+from ...feature_extraction_utils import BatchFeature
+from ...utils import PaddingStrategy, TensorType, logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class DiaFeatureExtractor(SequenceFeatureExtractor):
+    r"""
+    Constructs a Dia feature extractor.
+
+    This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains
+    most of the main methods. Users should refer to this superclass for more information regarding those methods.
+
+    Args:
+        feature_size (`int`, *optional*, defaults to 1):
+            The feature dimension of the extracted features. Use 1 for mono, 2 for stereo.
+        sampling_rate (`int`, *optional*, defaults to 16000):
+            The sampling rate at which the audio waveform should be digitalized, expressed in hertz (Hz).
+        padding_value (`float`, *optional*, defaults to 0.0):
+            The value that is used for padding.
+        hop_length (`int`, *optional*, defaults to 512):
+            Overlap length between successive windows.
+    """
+
+    model_input_names = ["input_values", "n_quantizers"]
+
+    def __init__(
+        self,
+        feature_size: int = 1,
+        sampling_rate: int = 16000,
+        padding_value: float = 0.0,
+        hop_length: int = 512,
+        **kwargs,
+    ):
+        super().__init__(feature_size=feature_size, sampling_rate=sampling_rate, padding_value=padding_value, **kwargs)
+        self.hop_length = hop_length
+
+    def __call__(
+        self,
+        raw_audio: Union[np.ndarray, list[float], list[np.ndarray], list[list[float]]],
+        padding: Optional[Union[bool, str, PaddingStrategy]] = None,
+        truncation: Optional[bool] = False,
+        max_length: Optional[int] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        sampling_rate: Optional[int] = None,
+    ) -> BatchFeature:
+        """
+        Main method to featurize and prepare for the model one or several sequence(s).
+
+        Args:
+            raw_audio (`np.ndarray`, `list[float]`, `list[np.ndarray]`, `list[list[float]]`):
+                The sequence or batch of sequences to be processed. Each sequence can be a numpy array, a list of float
+                values, a list of numpy arrays or a list of list of float values. The numpy array must be of shape
+                `(num_samples,)` for mono audio (`feature_size = 1`), or `(2, num_samples)` for stereo audio
+                (`feature_size = 2`).
+            padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`):
+                Select a strategy to pad the returned sequences (according to the model's padding side and padding
+                index) among:
+
+                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
+                  sequence is provided).
+                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
+                  acceptable input length for the model if that argument is not provided.
+                - `False` or `'do_not_pad'`: No padding (i.e., can output a batch with sequences of different
+                  lengths).
+            truncation (`bool`, *optional*, defaults to `False`):
+                Activates truncation to cut input sequences longer than `max_length` to `max_length`.
+            max_length (`int`, *optional*):
+                Maximum length of the returned list and optionally padding length (see above).
+            return_tensors (`str` or [`~utils.TensorType`], *optional*):
+                If set, will return tensors instead of list of python integers. Acceptable values are:
+
+                - `'tf'`: Return TensorFlow `tf.constant` objects.
+                - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                - `'np'`: Return Numpy `np.ndarray` objects.
+            sampling_rate (`int`, *optional*):
+                The sampling rate at which the `raw_audio` input was sampled. It is strongly recommended to pass
+                `sampling_rate` at the forward call to prevent silent errors.
+        """
+        if sampling_rate is not None:
+            if sampling_rate != self.sampling_rate:
+                raise ValueError(
+                    f"The model corresponding to this feature extractor: {self} was trained using a sampling rate of"
+                    f" {self.sampling_rate}. Please make sure that the provided audio input was sampled with"
+                    f" {self.sampling_rate} and not {sampling_rate}."
+                )
+        else:
+            logger.warning(
+                f"It is strongly recommended to pass the `sampling_rate` argument to `{self.__class__.__name__}()`. "
+                "Failing to do so can result in silent errors that might be hard to debug."
+            )
+
+        if padding and truncation:
+            raise ValueError("Both padding and truncation were set. Make sure you only set one.")
+        elif padding is None:
+            # by default let's pad the inputs
+            padding = True
+
+        is_batched = bool(
+            isinstance(raw_audio, (list, tuple)) and (isinstance(raw_audio[0], (np.ndarray, tuple, list)))
+        )
+
+        if is_batched:
+            raw_audio = [np.asarray(audio, dtype=np.float32).T for audio in raw_audio]
+        elif not is_batched and not isinstance(raw_audio, np.ndarray):
+            raw_audio = np.asarray(raw_audio, dtype=np.float32)
+        elif isinstance(raw_audio, np.ndarray) and raw_audio.dtype is np.dtype(np.float64):
+            raw_audio = raw_audio.astype(np.float32)
+
+        # always return batch
+        if not is_batched:
+            raw_audio = [np.asarray(raw_audio).T]
+
+        # convert stereo to mono if necessary, unique to Dia
+        for idx, example in enumerate(raw_audio):
+            if self.feature_size == 2 and example.ndim == 2:
+                raw_audio[idx] = np.mean(example, -1)
+
+        # verify inputs are valid
+        for idx, example in enumerate(raw_audio):
+            if example.ndim > 2:
+                raise ValueError(f"Expected input shape (channels, length) but got shape {example.shape}")
+            if self.feature_size == 1 and example.ndim != 1:
+                raise ValueError(f"Expected mono audio but example has {example.shape[-1]} channels")
+            if self.feature_size == 2 and example.ndim != 1:  # note the conversion before
+                raise ValueError(f"Expected stereo audio but example has {example.shape[-1]} channels")
+
+        input_values = BatchFeature({"input_values": raw_audio})
+
+        # temporarily treat it as if we were mono as we also convert stereo to mono
+        origingal_feature_size = self.feature_size
+        self.feature_size = 1
+
+        # normal padding on batch
+        padded_inputs = self.pad(
+            input_values,
+            max_length=max_length,
+            truncation=truncation,
+            padding=padding,
+            return_attention_mask=True,
+            pad_to_multiple_of=self.hop_length,
+        )
+        padded_inputs["padding_mask"] = padded_inputs.pop("attention_mask")
+
+        input_values = []
+        for example in padded_inputs.pop("input_values"):
+            if self.feature_size == 1:
+                example = example[..., None]
+            input_values.append(example.T)
+
+        padded_inputs["input_values"] = input_values
+        if return_tensors is not None:
+            padded_inputs = padded_inputs.convert_to_tensors(return_tensors)
+
+        # rewrite back to original feature size
+        self.feature_size = origingal_feature_size
+
+        return padded_inputs
+
+
+__all__ = ["DiaFeatureExtractor"]
diff --git a/phivenv/Lib/site-packages/transformers/models/dia/generation_dia.py b/phivenv/Lib/site-packages/transformers/models/dia/generation_dia.py
new file mode 100644
index 0000000000000000000000000000000000000000..7cac22f0d4838af70d2a18241ed14d5d90c925f9
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/dia/generation_dia.py
@@ -0,0 +1,464 @@
+# coding=utf-8
+# Copyright 2025 The Nari Labs and HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Any, Callable, Optional, Union
+
+import torch
+import torch.distributed as dist
+
+from ...generation.logits_process import (
+    DiaClassifierFreeGuidanceLogitsProcessor,
+    DiaEOSChannelFilterLogitsProcessor,
+    DiaEOSDelayPatternLogitsProcessor,
+    LogitsProcessorList,
+    TemperatureLogitsWarper,
+)
+from ...generation.stopping_criteria import StoppingCriteriaList
+from ...generation.streamers import BaseStreamer
+from ...generation.utils import GenerateOutput, GenerationConfig, GenerationMixin, GenerationMode
+from ...integrations.deepspeed import is_deepspeed_zero3_enabled
+from ...integrations.fsdp import is_fsdp_managed_module
+from ...modeling_utils import PreTrainedModel
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class DiaGenerationMixin(GenerationMixin):
+    # Indicates CFG which needs preparation to be properly handled by repeats
+    _uses_cfg = None
+
+    def _get_logits_processor(
+        self,
+        generation_config: GenerationConfig,
+        input_ids_seq_length: Optional[int] = None,
+        encoder_input_ids: torch.LongTensor = None,
+        prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], list[int]]] = None,
+        logits_processor: Optional[LogitsProcessorList] = None,
+        device: Optional[str] = None,
+        model_kwargs: Optional[dict[str, Any]] = None,
+        negative_prompt_ids: Optional[torch.Tensor] = None,
+        negative_prompt_attention_mask: Optional[torch.Tensor] = None,
+    ) -> LogitsProcessorList:
+        """Build Dia's logits processors: CFG (if any) goes first, the EOS delay pattern processor goes last.
+
+        Guidance scale and temperature are hidden from the parent call and restored afterwards, even on error.
+        NOTE(review): `logits_processor` and `prefix_allowed_tokens_fn` are not forwarded to the parent -- intended?
+        """
+        # Need either custom order or custom processor instead, so both values are hidden from the parent
+        original_guidance_scale = generation_config.guidance_scale
+        original_temperature = generation_config.temperature
+        generation_config.guidance_scale = None
+        generation_config.temperature = None
+
+        try:
+            custom_processors = LogitsProcessorList()
+            if original_temperature is not None and original_temperature != 1.0:
+                custom_processors.append(TemperatureLogitsWarper(original_temperature))
+            custom_processors.append(
+                DiaEOSChannelFilterLogitsProcessor(
+                    num_channels=len(self.config.delay_pattern),
+                    eos_token_id=self.config.eos_token_id,
+                )
+            )
+            merged_processors = super()._get_logits_processor(
+                generation_config=generation_config,
+                input_ids_seq_length=input_ids_seq_length,
+                encoder_input_ids=encoder_input_ids,
+                prefix_allowed_tokens_fn=None,
+                logits_processor=custom_processors,
+                device=device,
+                model_kwargs=model_kwargs,
+                negative_prompt_ids=negative_prompt_ids,
+                negative_prompt_attention_mask=negative_prompt_attention_mask,
+            )
+        finally:
+            # Enable temporarily disabled values back, also when building the processors failed
+            generation_config.guidance_scale = original_guidance_scale
+            generation_config.temperature = original_temperature
+
+        # Custom processors we need at specific positions
+        if original_guidance_scale is not None and original_guidance_scale != 1:
+            cfg_processor = DiaClassifierFreeGuidanceLogitsProcessor(
+                guidance_scale=original_guidance_scale,
+                guidance_top_k=generation_config.top_k,
+            )
+            merged_processors.insert(0, cfg_processor)
+
+        merged_processors.append(
+            DiaEOSDelayPatternLogitsProcessor(
+                delay_pattern=self.config.delay_pattern,
+                eos_token_id=self.config.eos_token_id,
+                max_generation_len=generation_config.max_length,
+                device=device,
+            )
+        )
+        return merged_processors
+
+    def _prepare_generation_config(
+        self, generation_config: Optional[GenerationConfig], use_model_defaults: Optional[bool] = None, **kwargs: dict
+    ) -> tuple[GenerationConfig, dict]:
+        """Prepare the config via the parent, extend `max_length` by the largest delay and set the CFG flag."""
+        prepared_config, model_kwargs = super()._prepare_generation_config(
+            generation_config, use_model_defaults, **kwargs
+        )
+
+        # Leave room for the longest channel delay on top of the requested max length
+        prepared_config.max_length += max(self.config.delay_pattern)
+
+        # Internal flag read by the input preparation steps: CFG is on for any guidance scale other than `None`/1
+        guidance_scale = prepared_config.guidance_scale
+        self._uses_cfg = guidance_scale is not None and guidance_scale != 1
+        return prepared_config, model_kwargs
+
+    def _prepare_model_inputs(
+        self,
+        inputs: Optional[torch.Tensor] = None,
+        bos_token_id: Optional[torch.Tensor] = None,
+        model_kwargs: Optional[dict[str, torch.Tensor]] = None,
+    ) -> tuple[torch.Tensor, Optional[str], dict[str, torch.Tensor]]:
+        """Run the parent preparation and, when CFG is active, append an all-zero copy of the inputs.
+
+        With CFG the returned inputs have twice the batch size and a given `attention_mask` is repeated to match.
+        """
+        inputs, input_name, model_kwargs = super()._prepare_model_inputs(
+            inputs=inputs, bos_token_id=bos_token_id, model_kwargs=model_kwargs
+        )
+        if not self._uses_cfg:
+            return inputs, input_name, model_kwargs
+        # Unconditioned half: zeros shaped like the conditioned inputs, stacked along the batch dimension
+        inputs = torch.cat([inputs, torch.zeros_like(inputs)], dim=0)
+        attention_mask = model_kwargs.get("attention_mask", None)
+        if attention_mask is not None:
+            model_kwargs["attention_mask"] = attention_mask.repeat(2, 1)
+        return inputs, input_name, model_kwargs
+
+    def _prepare_decoder_input_ids_for_generation(
+        self,
+        batch_size: int,
+        model_input_name: str,
+        model_kwargs: dict[str, torch.Tensor],
+        decoder_start_token_id: torch.Tensor,
+        device: Optional[torch.device] = None,
+    ) -> tuple[torch.LongTensor, dict[str, torch.Tensor]]:
+        """Pop the decoder inputs from `model_kwargs` and return channel-first ids plus the updated kwargs"""
+        # 1. Pop the user-defined `decoder_input_ids` and `decoder_attention_mask`; missing ones get defaults below
+        decoder_input_ids = decoder_attention_mask = None
+        if model_kwargs is not None and "decoder_input_ids" in model_kwargs:
+            decoder_input_ids = model_kwargs.pop("decoder_input_ids")
+        if model_kwargs is not None and "decoder_attention_mask" in model_kwargs:
+            decoder_attention_mask = model_kwargs.pop("decoder_attention_mask")
+
+        # We allow generating without preparation (no proper delay) but discourage it
+        if decoder_input_ids is None or decoder_attention_mask is None:
+            logger.warning_once(
+                "In order to generate with Dia, we need the processed audio input: Got `decoder_input_ids`:"
+                f" {decoder_input_ids is not None} and got `decoder_attention_mask`={decoder_attention_mask is not None}."
+                f" This can be achieved via the [`DiaProcessor`] but now defaulting to non-delayed generation."
+            )
+
+            num_channels = self.config.decoder_config.num_channels
+            real_batch_size = batch_size // 2 if self._uses_cfg else batch_size  # CFG doubled the incoming batch
+
+            if decoder_input_ids is None:  # start from a single step filled with the start token
+                decoder_input_ids = torch.full(
+                    (real_batch_size, 1, num_channels), decoder_start_token_id, dtype=torch.long, device=device
+                )
+
+            decoder_attention_mask = torch.ones(  # replaces any caller-provided mask in this branch
+                size=(real_batch_size, decoder_input_ids.shape[1]), dtype=torch.long, device=device
+            )
+
+        # 2. Determine the valid input and what works as mask within the input
+        delay_mask = decoder_input_ids.long()  # full-length input, stored below as `decoder_delay_mask`
+        valid_input_size = (  # length minus the largest per-sample count of pad entries in channel 0
+            decoder_input_ids.shape[1] - (decoder_input_ids[:, :, 0] == self.config.pad_token_id).sum(dim=-1).max()
+        )
+        decoder_input_ids = delay_mask[:, :valid_input_size].transpose(1, 2).long()  # (batch, channels, seq)
+        decoder_attention_mask = decoder_attention_mask[:, :valid_input_size].long()
+
+        # 3. Overwrite into model kwargs
+        model_kwargs["decoder_attention_mask"] = decoder_attention_mask
+        model_kwargs["decoder_delay_mask"] = delay_mask
+
+        return decoder_input_ids, model_kwargs
+
+    def prepare_inputs_for_generation(
+        self,
+        input_ids,
+        encoder_outputs=None,  # Using this to easily get the batch size
+        decoder_delay_mask=None,
+        **kwargs,
+    ):
+        """Build the per-step model inputs: 3D decoder ids with the delay mask applied, duplicated under CFG.
+
+        Args:
+            input_ids: Decoder ids; reshaped here to `(batch_size, seq_len, num_channels)`.
+            encoder_outputs: Encoder outputs; the first entry provides the (CFG-doubled) batch size.
+            decoder_delay_mask: Optional mask handed to `apply_delay_mask` to force its non-pad entries.
+        """
+        # Under CFG the encoder batch holds a conditioned and an unconditioned half
+        encoder_batch_size = encoder_outputs[0].shape[0]
+        batch_size = encoder_batch_size // 2 if self._uses_cfg else encoder_batch_size
+        num_channels = self.config.decoder_config.num_channels
+        input_ids = input_ids.reshape(batch_size, num_channels, -1).transpose(1, 2)
+
+        # Base method handles most things except CFG and the delay pattern mask
+        model_inputs = super().prepare_inputs_for_generation(input_ids, encoder_outputs=encoder_outputs, **kwargs)
+
+        # Force the tokens fixed by the delay mask; once the cache is filled only the newest step is passed on
+        decoder_input_ids = self.apply_delay_mask(input_ids, self.config.pad_token_id, decoder_delay_mask)
+        if model_inputs.get("use_cache", False) and model_inputs["cache_position"][0] > 0:
+            decoder_input_ids = decoder_input_ids[:, -1, :][:, None, :]
+        model_inputs["decoder_input_ids"] = decoder_input_ids.contiguous()  # be compile friendly
+
+        if self._uses_cfg:
+            for key in ("decoder_input_ids", "decoder_attention_mask", "decoder_position_ids"):
+                tensor = model_inputs.get(key, None)
+                if tensor is not None:
+                    # double first dimension and keep everything else the same
+                    model_inputs[key] = tensor.repeat(2, *([1] * (tensor.ndim - 1)))
+        return model_inputs
+
+    @staticmethod
+    def apply_delay_mask(input_ids: torch.Tensor, pad_id: int, delay_mask: Optional[torch.Tensor]) -> torch.Tensor:
+        """Overwrite `input_ids` in place with every `delay_mask` entry that differs from `pad_id`.
+
+        Only the leading steps (dim 1) present in both tensors are touched; `input_ids` itself is returned.
+        """
+        if delay_mask is not None:
+            shared_len = min(input_ids.shape[1], delay_mask.shape[1])
+            forced_tokens = delay_mask[:, :shared_len, :]
+            current_tokens = input_ids[:, :shared_len, :]
+            # `pad_id` in the mask means "keep the input"; any other mask value replaces the input
+            input_ids[:, :shared_len, :] = torch.where(forced_tokens == pad_id, current_tokens, forced_tokens)
+        return input_ids
+
+    def _main_generate_loop(
+        self,
+        inputs: Optional[torch.Tensor] = None,
+        generation_config: Optional[GenerationConfig] = None,
+        logits_processor: Optional[LogitsProcessorList] = None,
+        stopping_criteria: Optional[StoppingCriteriaList] = None,
+        prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], list[int]]] = None,
+        synced_gpus: Optional[bool] = None,
+        assistant_model: Optional["PreTrainedModel"] = None,
+        streamer: Optional["BaseStreamer"] = None,
+        negative_prompt_ids: Optional[torch.Tensor] = None,
+        negative_prompt_attention_mask: Optional[torch.Tensor] = None,
+        use_model_defaults: Optional[bool] = None,
+        custom_generate: Optional[str] = None,  # accepted but not read anywhere in this method
+        **kwargs,
+    ):
+        # ***** Dia generate loop: steps 1-9 are mostly taken from the main generate function (see NOTE in step 6) *****
+        # 1. Handle `generation_config` and kwargs that might update it, and validate the `.generate()` call
+        tokenizer = kwargs.pop("tokenizer", None)  # Pull this out first, we only use it for stopping criteria
+        assistant_tokenizer = kwargs.pop("assistant_tokenizer", None)  # only used for assisted generation
+
+        generation_config, model_kwargs = self._prepare_generation_config(
+            generation_config, use_model_defaults, **kwargs
+        )
+        self._validate_model_kwargs(model_kwargs.copy())
+        self._validate_assistant(assistant_model, tokenizer, assistant_tokenizer)
+
+        # 2. Set generation parameters if not already defined
+        if synced_gpus is None:
+            synced_gpus = (is_deepspeed_zero3_enabled() or is_fsdp_managed_module(self)) and dist.get_world_size() > 1
+
+        logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList()
+        stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList()
+
+        # 3. Define model inputs
+        kwargs_has_attention_mask = model_kwargs.get("attention_mask", None) is not None
+        inputs_tensor, model_input_name, model_kwargs = self._prepare_model_inputs(
+            inputs, generation_config.bos_token_id, model_kwargs
+        )
+        batch_size = inputs_tensor.shape[0]
+
+        device = inputs_tensor.device
+        self._prepare_special_tokens(generation_config, kwargs_has_attention_mask, device=device)
+
+        # 4. Define other model kwargs
+        if "encoder_outputs" not in model_kwargs:
+            # if model is encoder decoder encoder_outputs are created and added to `model_kwargs`
+            model_kwargs = self._prepare_encoder_decoder_kwargs_for_generation(
+                inputs_tensor, model_kwargs, model_input_name, generation_config
+            )
+
+        # 5. Prepare `input_ids` which will be used for auto-regressive generation
+        input_ids, model_kwargs = self._prepare_decoder_input_ids_for_generation(
+            batch_size=batch_size,
+            model_input_name=model_input_name,
+            model_kwargs=model_kwargs,
+            decoder_start_token_id=generation_config._decoder_start_token_tensor,
+            device=inputs_tensor.device,
+        )
+
+        if generation_config.token_healing:
+            input_ids = self.heal_tokens(input_ids, tokenizer)
+
+        if streamer is not None:
+            streamer.put(input_ids.cpu())
+
+        # 6. Prepare `max_length` depending on other stopping criteria.
+        # NOTE: `input_ids.shape[1]` would be incorrect here, the sequence length sits on the last dimension
+        input_ids_length = input_ids.shape[-1]
+        has_default_max_length = kwargs.get("max_length") is None and generation_config.max_length is not None
+        has_default_min_length = kwargs.get("min_length") is None and generation_config.min_length is not None
+        generation_config = self._prepare_generated_length(
+            generation_config=generation_config,
+            has_default_max_length=has_default_max_length,
+            has_default_min_length=has_default_min_length,
+            model_input_name=model_input_name,
+            inputs_tensor=inputs_tensor,
+            input_ids_length=input_ids_length,
+        )
+
+        # If the model supports `logits_to_keep` in forward(), set it to 1 to avoid computing the whole
+        # logit matrix. This can save a lot of memory during the first forward pass. Note that assisted decoding
+        # dynamically overrides this value as it can need more than the last token logits
+        if self._supports_logits_to_keep() and "logits_to_keep" not in model_kwargs:
+            model_kwargs["logits_to_keep"] = 1
+
+        self._validate_generated_length(generation_config, input_ids_length, has_default_max_length)
+
+        # 7. Prepare the cache.
+        # - `model_kwargs` may be updated in place with a cache as defined by the parameters in `generation_config`.
+        # - different models have a different cache name expected by the model (default = "past_key_values")
+        # - `max_length`, prepared above, is used to determine the maximum cache length
+        max_cache_length = generation_config.max_length - 1
+        if (
+            inputs_tensor.shape[1] != input_ids_length
+            and model_input_name == "inputs_embeds"
+            and not self.config.is_encoder_decoder
+        ):
+            max_cache_length += inputs_tensor.shape[1]
+        self._prepare_cache_for_generation(
+            generation_config, model_kwargs, assistant_model, batch_size, max_cache_length
+        )
+
+        # 8. determine generation mode
+        generation_mode = generation_config.get_generation_mode(assistant_model)
+
+        if streamer is not None and (generation_config.num_beams > 1):
+            raise ValueError(
+                "`streamer` cannot be used with beam search (yet!). Make sure that `num_beams` is set to 1."
+            )
+
+        # 9. prepare logits processors and stopping criteria
+        prepared_logits_processor = self._get_logits_processor(
+            generation_config=generation_config,
+            input_ids_seq_length=input_ids_length,
+            encoder_input_ids=inputs_tensor,
+            prefix_allowed_tokens_fn=prefix_allowed_tokens_fn,
+            logits_processor=logits_processor,
+            device=inputs_tensor.device,
+            model_kwargs=model_kwargs,
+            negative_prompt_ids=negative_prompt_ids,
+            negative_prompt_attention_mask=negative_prompt_attention_mask,
+        )
+        prepared_stopping_criteria = self._get_stopping_criteria(
+            generation_config=generation_config, stopping_criteria=stopping_criteria, tokenizer=tokenizer, **kwargs
+        )
+
+        # Set model_kwargs `use_cache` so we can use it later in forward runs
+        model_kwargs["use_cache"] = generation_config.use_cache
+        # ******************* taken from main generate function up to calling the different methods *******************
+
+        # Flatten to 2D (keeping the last, sequence dimension) for the inner generation loop
+        input_ids = input_ids.reshape(-1, input_ids.shape[-1])
+
+        # 10. go into different generation modes
+        if generation_mode in (GenerationMode.SAMPLE, GenerationMode.GREEDY_SEARCH):
+            # 11. `num_return_sequences > 1` is not supported, so `input_ids` is never expanded
+            if generation_config.num_return_sequences > 1:
+                raise ValueError("`num_return_sequences>1` is incompatible with Dia.")
+
+            # 12. run sample (it degenerates to greedy search when `generation_config.do_sample=False`)
+            return self._sample(
+                input_ids,
+                logits_processor=prepared_logits_processor,
+                stopping_criteria=prepared_stopping_criteria,
+                generation_config=generation_config,
+                synced_gpus=synced_gpus,
+                streamer=streamer,
+                **model_kwargs,
+            )
+        else:
+            raise ValueError(
+                "Got incompatible mode for generation, should be one of greedy or sampling. "
+                "Ensure that beam search is de-activated by setting `num_beams=1` and `num_beam_groups=1`."
+            )
+
+    @torch.no_grad()
+    def generate(
+        self,
+        inputs: Optional[torch.Tensor] = None,
+        generation_config: Optional[GenerationConfig] = None,
+        logits_processor: Optional[LogitsProcessorList] = None,
+        stopping_criteria: Optional[StoppingCriteriaList] = None,
+        prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], list[int]]] = None,
+        synced_gpus: Optional[bool] = None,
+        assistant_model: Optional["PreTrainedModel"] = None,
+        streamer: Optional["BaseStreamer"] = None,
+        negative_prompt_ids: Optional[torch.Tensor] = None,
+        negative_prompt_attention_mask: Optional[torch.Tensor] = None,
+        use_model_defaults: Optional[bool] = None,
+        custom_generate: Optional[str] = None,
+        **kwargs,
+    ) -> Union[GenerateOutput, torch.LongTensor]:
+        """Generate audio codes and return them as `(bsz, seq_len, channels)` with the delay mask re-applied.
+
+        All arguments are forwarded unchanged to `_main_generate_loop`. If `decoder_input_ids` is passed in
+        `kwargs`, a clone taken before generation is used as the delay mask for the final sequences.
+
+        Returns:
+            The loop's output: either the reshaped tensor, or the output object with `sequences` replaced.
+        """
+        # We expect the initial input ids to be the complete mask (delayed input)
+        initial_decoder_ids = kwargs.get("decoder_input_ids")
+        delay_mask = None if initial_decoder_ids is None else initial_decoder_ids.clone()
+
+        output = self._main_generate_loop(
+            inputs=inputs,
+            generation_config=generation_config,
+            logits_processor=logits_processor,
+            stopping_criteria=stopping_criteria,
+            prefix_allowed_tokens_fn=prefix_allowed_tokens_fn,
+            synced_gpus=synced_gpus,
+            assistant_model=assistant_model,
+            streamer=streamer,
+            negative_prompt_ids=negative_prompt_ids,
+            negative_prompt_attention_mask=negative_prompt_attention_mask,
+            use_model_defaults=use_model_defaults,
+            custom_generate=custom_generate,
+            **kwargs,
+        )
+
+        is_plain_tensor = isinstance(output, torch.Tensor)
+        sequences = output if is_plain_tensor else output.sequences
+
+        # Reshape from 2D (bsz * channels, seq_len) to 3D (bsz, seq_len, channels)
+        num_channels = self.config.decoder_config.num_channels
+        bsz = sequences.shape[0] // num_channels
+        sequences = sequences.reshape(bsz, num_channels, -1).transpose(1, 2)
+        # Re-apply the delay mask captured before generation
+        sequences = self.apply_delay_mask(sequences, self.config.pad_token_id, delay_mask)
+
+        if is_plain_tensor:
+            return sequences
+        output.sequences = sequences
+        return output
diff --git a/phivenv/Lib/site-packages/transformers/models/dia/modeling_dia.py b/phivenv/Lib/site-packages/transformers/models/dia/modeling_dia.py
new file mode 100644
index 0000000000000000000000000000000000000000..cf662b224aabd884521d7e14b8a167886377f4b5
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/dia/modeling_dia.py
@@ -0,0 +1,958 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/dia/modular_dia.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_dia.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# coding=utf-8
+# Copyright 2025 The Nari Labs and HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Callable, Optional, Union
+
+import torch
+from torch import nn
+
+from ...activations import ACT2FN
+from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache
+from ...integrations import use_kernel_forward_from_hub
+from ...masking_utils import create_causal_mask
+from ...modeling_attn_mask_utils import _prepare_4d_attention_mask, _prepare_4d_attention_mask_for_sdpa
+from ...modeling_flash_attention_utils import FlashAttentionKwargs
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import (
+    BaseModelOutput,
+    BaseModelOutputWithPastAndCrossAttentions,
+    Seq2SeqLMOutput,
+    Seq2SeqModelOutput,
+)
+from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from ...processing_utils import Unpack
+from ...utils import (
+    TransformersKwargs,
+    auto_docstring,
+    can_return_tuple,
+    is_torch_flex_attn_available,
+    is_torchdynamo_compiling,
+    logging,
+)
+from ...utils.deprecation import deprecate_kwarg
+from .configuration_dia import DiaConfig, DiaDecoderConfig, DiaEncoderConfig
+from .generation_dia import DiaGenerationMixin
+
+
+if is_torch_flex_attn_available():
+    from ...integrations.flex_attention import make_flex_block_causal_mask
+
+
+logger = logging.get_logger(__name__)
+
+
+@auto_docstring
+class DiaPreTrainedModel(PreTrainedModel):  # class-level settings only; no methods are defined here
+    config: DiaConfig
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _supports_flash_attn = True  # NOTE(review): `_supports_*` presumably advertise attention backends -- confirm
+    _supports_sdpa = True
+    _supports_flex_attn = True
+    _can_compile_fullgraph = True
+    main_input_name = "input_ids"
+    _no_split_modules = ["DiaEncoderLayer", "DiaDecoderLayer"]  # presumably layer classes of this file -- confirm
+
+
+class DiaMultiChannelEmbedding(nn.Module):
+    """Embeds the audio codes of all `num_channels` channels with a single embedding table.
+    Each channel is shifted by its own offset into the shared table so that one lookup covers every channel;
+    the per-channel embeddings are then summed.
+    Example:
+    - 4 codes per channel
+    - vocab_size = 8
+    - num_channels = 3
+    We would have offsets = [0, 8, 16]
+    If audio_codes = [0, 1, 2, 3], [1, 3, 4, 7], [4, 5, 6, 7],
+    then tokens = audio_codes + offsets
+                = [0, 1, 2, 3, 9, 11, 12, 15, 20, 21, 22, 23]
+    """
+
+    def __init__(self, config: DiaDecoderConfig):
+        super().__init__()
+        vocab_size, num_channels = config.vocab_size, config.num_channels
+        self.hidden_size, self.num_channels = config.hidden_size, num_channels
+        self.embed = nn.Embedding(vocab_size * num_channels, self.hidden_size)
+        channel_offsets = torch.arange(num_channels, dtype=torch.long) * vocab_size  # (C,)
+        self.register_buffer("offsets", channel_offsets, persistent=False)
+
+    def forward(self, audio_codes: torch.Tensor) -> torch.Tensor:
+        tokens = (audio_codes + self.offsets.to(audio_codes.device)).squeeze(1)
+        per_channel = self.embed(tokens).view(tokens.shape[0], audio_codes.shape[1], -1, self.hidden_size)
+        return per_channel.sum(dim=2)
+
+
+class DiaMLP(nn.Module):
+    """Gated feed-forward block: one fused projection yields gate and up halves, then a down projection."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        hidden_size, intermediate_size = config.hidden_size, config.intermediate_size
+        self.gate_up_proj = nn.Linear(hidden_size, 2 * intermediate_size, bias=False)
+        self.down_proj = nn.Linear(intermediate_size, hidden_size, bias=False)
+        self.activation_fn = ACT2FN[config.hidden_act]
+
+    def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor:
+        # First half of the fused projection is the gate, second half holds the values it scales
+        gate, up_states = self.gate_up_proj(hidden_states).chunk(2, dim=-1)
+        gated = up_states * self.activation_fn(gate)
+        return self.down_proj(gated)
+
+
+@use_kernel_forward_from_hub("RMSNorm")
+class DiaRMSNorm(nn.Module):
+    def __init__(self, hidden_size, eps=1e-6):
+        """Create the learnable per-feature scale (initialised to ones); equivalent to T5LayerNorm."""
+        super().__init__()
+        self.variance_epsilon = eps
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+
+    def forward(self, hidden_states):
+        """Normalise by the root mean square over the last dim in float32, then scale by `weight`."""
+        orig_dtype = hidden_states.dtype
+        upcast = hidden_states.to(torch.float32)
+        mean_square = upcast.pow(2).mean(-1, keepdim=True)
+        normalized = upcast * torch.rsqrt(mean_square + self.variance_epsilon)
+        return self.weight * normalized.to(orig_dtype)
+
+    def extra_repr(self):
+        """Summarise the weight shape and epsilon for the module repr."""
+        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
+
+
+class DiaRotaryEmbedding(nn.Module):  # produces the rotary (cos, sin) tables for the given position ids
+    inv_freq: torch.Tensor  # fix linting for `register_buffer`
+
+    def __init__(self, config: DiaConfig, device=None):
+        super().__init__()
+        # BC: "rope_type" was originally "type"
+        if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict):
+            self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
+        else:
+            self.rope_type = "default"
+        self.max_seq_len_cached = config.max_position_embeddings  # both lengths start at the configured maximum
+        self.original_max_seq_len = config.max_position_embeddings
+
+        self.config = config
+        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]  # KeyError if the rope type has no init function
+
+        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+        self.original_inv_freq = self.inv_freq  # same tensor object as the buffer at this point (no copy)
+
+    @torch.no_grad()
+    @dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
+    def forward(self, x, position_ids):
+        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
+        position_ids_expanded = position_ids[:, None, :].float()  # assumes `position_ids` is (batch, seq)
+
+        device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
+        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
+            emb = torch.cat((freqs, freqs), dim=-1)  # frequencies duplicated along the last dim
+            cos = emb.cos() * self.attention_scaling
+            sin = emb.sin() * self.attention_scaling
+
+        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)  # `x` only provides the device and output dtype
+
+
+def rotate_half(x):
+    """Split the last dim into halves `(a, b)` and return `(-b, a)` concatenated along that dim."""
+    half = x.shape[-1] // 2
+    first, second = x[..., :half], x[..., half:]
+    return torch.cat((-second, first), dim=-1)
+
+
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
+    """Rotate the query and key tensors with the given rotary cos/sin tables.
+
+    Args:
+        q (`torch.Tensor`): The query tensor.
+        k (`torch.Tensor`): The key tensor.
+        cos (`torch.Tensor`): The cosine part of the rotary embedding.
+        sin (`torch.Tensor`): The sine part of the rotary embedding.
+        position_ids (`torch.Tensor`, *optional*):
+            Deprecated and unused.
+        unsqueeze_dim (`int`, *optional*, defaults to 1):
+            Dimension at which a singleton axis is inserted into `cos` and `sin` so that they broadcast
+            against `q` and `k`. With `cos`/`sin` of shape [batch_size, seq_len, head_dim], use
+            `unsqueeze_dim=1` for `q`/`k` of shape [batch_size, heads, seq_len, head_dim] and
+            `unsqueeze_dim=2` for `q`/`k` of shape [batch_size, seq_len, heads, head_dim].
+
+    Returns:
+        `tuple(torch.Tensor)`: the rotated query and key tensors, computed as `x * cos + rotate_half(x) * sin`.
+    """
+    cos, sin = cos.unsqueeze(unsqueeze_dim), sin.unsqueeze(unsqueeze_dim)
+
+    def _rotate(states):
+        return (states * cos) + (rotate_half(states) * sin)
+
+    return _rotate(q), _rotate(k)
+
+
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """
+    Repeat every key/value head `n_rep` times, like torch.repeat_interleave(x, dim=1, repeats=n_rep): the input
+    (batch, num_key_value_heads, seqlen, head_dim) becomes (batch, num_key_value_heads * n_rep, seqlen, head_dim).
+    """
+    batch, kv_heads, seq_len, head_dim = hidden_states.shape
+    if n_rep == 1:
+        return hidden_states
+    expanded = hidden_states.unsqueeze(2).expand(batch, kv_heads, n_rep, seq_len, head_dim)
+    return expanded.reshape(batch, kv_heads * n_rep, seq_len, head_dim)
+
+
+def eager_attention_forward(
+    module: nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: Optional[torch.Tensor],
+    scaling: float,
+    dropout: float = 0.0,
+    **kwargs: Unpack[TransformersKwargs],
+):
+    """Plain matmul/softmax attention.
+
+    Args:
+        module: Attention module; `num_key_value_groups` and `training` are read from it.
+        query: Query states.
+        key: Key states; heads are repeated `module.num_key_value_groups` times before use.
+        value: Value states; heads are repeated the same way as `key`.
+        attention_mask: Optional additive mask. Its last dimension is cropped to the key length before it is added
+            to the attention scores.
+        scaling: Factor the raw scores are multiplied by.
+        dropout: Dropout probability applied to the attention probabilities (only active in training mode).
+        **kwargs: Accepted for interface compatibility; not used here.
+
+    Returns:
+        `(attn_output, attn_weights)`, where `attn_output` has dimensions 1 and 2 swapped relative to the
+        probabilities-times-values product and is made contiguous.
+    """
+    groups = module.num_key_value_groups
+    expanded_keys = repeat_kv(key, groups)
+    expanded_values = repeat_kv(value, groups)
+
+    scores = torch.matmul(query, expanded_keys.transpose(2, 3)) * scaling
+    if attention_mask is not None:
+        kv_len = expanded_keys.shape[-2]
+        scores = scores + attention_mask[:, :, :, :kv_len]
+
+    # Softmax runs in float32 and is cast back to the query dtype afterwards.
+    probs = nn.functional.softmax(scores, dim=-1, dtype=torch.float32).to(query.dtype)
+    probs = nn.functional.dropout(probs, p=dropout, training=module.training)
+
+    context = torch.matmul(probs, expanded_values).transpose(1, 2).contiguous()
+    return context, probs
+
+
+class DiaSelfAttention(nn.Module):
+    """Self-attention with rotary position embeddings and grouped key/value heads.
+
+    Queries use `config.num_attention_heads` heads; keys and values use `config.num_key_value_heads` heads, falling
+    back to the query head count when that setting is falsy. The score scaling factor is fixed to 1 and the
+    attention dropout probability to 0.
+    """
+
+    def __init__(self, config: Union[DiaEncoderConfig, DiaDecoderConfig], layer_idx: int, is_causal: bool = False):
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+        self.hidden_size = config.hidden_size
+        self.num_heads = heads = config.num_attention_heads
+        self.num_key_value_heads = kv_heads = config.num_key_value_heads or heads
+        self.num_key_value_groups = heads // kv_heads
+        self.head_dim = getattr(config, "head_dim", config.hidden_size // heads)
+        self.scaling = 1
+        self.attention_dropout = 0.0
+        self.is_causal = is_causal
+
+        # Creation order (q, k, v, o) is kept on purpose: it fixes the parameter registration order.
+        query_width = heads * self.head_dim
+        kv_width = kv_heads * self.head_dim
+        self.q_proj = nn.Linear(self.hidden_size, query_width, bias=False)
+        self.k_proj = nn.Linear(self.hidden_size, kv_width, bias=False)
+        self.v_proj = nn.Linear(self.hidden_size, kv_width, bias=False)
+        self.o_proj = nn.Linear(query_width, self.hidden_size, bias=False)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
+        attention_mask: Optional[torch.Tensor],
+        past_key_values: Optional[Cache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """Project to q/k/v, apply RoPE, optionally update the cache, attend, and project back.
+
+        Returns the output-projected attention result together with the attention weights reported by the selected
+        attention implementation.
+        """
+        leading_dims = hidden_states.shape[:-1]
+        per_head_shape = (*leading_dims, -1, self.head_dim)
+
+        def _split_heads(projection):
+            # (..., seq, heads * head_dim) -> (..., seq, heads, head_dim), then swap dims 1 and 2.
+            return projection(hidden_states).view(per_head_shape).transpose(1, 2)
+
+        query_states = _split_heads(self.q_proj)
+        key_states = _split_heads(self.k_proj)
+        value_states = _split_heads(self.v_proj)
+
+        cos, sin = position_embeddings
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+
+        if past_key_values is not None:
+            # sin and cos are specific to RoPE models; cache_position needed for the static cache
+            key_states, value_states = past_key_values.update(
+                key_states,
+                value_states,
+                self.layer_idx,
+                {"sin": sin, "cos": cos, "cache_position": cache_position},
+            )
+
+        if self.config._attn_implementation == "eager":
+            attention_interface: Callable = eager_attention_forward
+        else:
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            dropout=self.attention_dropout if self.training else 0.0,
+            scaling=self.scaling,
+            **kwargs,
+        )
+
+        # Merge the head dimensions back into one feature axis before the output projection.
+        merged = attn_output.reshape(*leading_dims, -1).contiguous()
+        return self.o_proj(merged), attn_weights
+
+
+class DiaCrossAttention(nn.Module):
+    """Cross-attention from decoder states (queries) to `cross_attention_states` (keys and values).
+
+    Head counts, head size and the key/value input width come from the `cross_*` fields of the decoder config.
+    Projected keys/values are stored in the cross-attention part of an `EncoderDecoderCache` on first use and read
+    back from it on later calls.
+    """
+
+    def __init__(self, config: DiaDecoderConfig, layer_idx: int):
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+        self.hidden_size = config.hidden_size
+        self.cross_hidden_size = config.cross_hidden_size
+        self.num_heads = heads = config.cross_num_attention_heads
+        self.num_key_value_heads = kv_heads = config.cross_num_key_value_heads
+        self.num_key_value_groups = heads // kv_heads
+        self.head_dim = config.cross_head_dim
+        self.scaling = 1
+        self.attention_dropout = 0.0
+        self.is_causal = False
+
+        # Creation order (q, k, v, o) is kept on purpose: it fixes the parameter registration order.
+        query_width = heads * self.head_dim
+        kv_width = kv_heads * self.head_dim
+        self.q_proj = nn.Linear(self.hidden_size, query_width, bias=False)
+        self.k_proj = nn.Linear(self.cross_hidden_size, kv_width, bias=False)
+        self.v_proj = nn.Linear(self.cross_hidden_size, kv_width, bias=False)
+        self.o_proj = nn.Linear(query_width, self.hidden_size, bias=False)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        cross_attention_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        past_key_values: Optional[EncoderDecoderCache] = None,
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        """Attend from `hidden_states` to `cross_attention_states` and return `(attn_output, attn_weights)`."""
+        leading_dims = hidden_states.shape[:-1]
+        query_shape = (*leading_dims, -1, self.head_dim)
+        cross_shape = (*cross_attention_states.shape[:-1], -1, self.head_dim)
+
+        query_states = self.q_proj(hidden_states).view(query_shape).transpose(1, 2)
+
+        if past_key_values is not None and past_key_values.is_updated.get(self.layer_idx):
+            # reuse k,v, cross_attentions
+            cached_layer = past_key_values.cross_attention_cache.layers[self.layer_idx]
+            key_states, value_states = cached_layer.keys, cached_layer.values
+        else:
+            key_states = self.k_proj(cross_attention_states).view(cross_shape).transpose(1, 2)
+            value_states = self.v_proj(cross_attention_states).view(cross_shape).transpose(1, 2)
+
+            if past_key_values is not None:
+                # save all states to the cache
+                key_states, value_states = past_key_values.cross_attention_cache.update(
+                    key_states,
+                    value_states,
+                    self.layer_idx,
+                )
+                # set flag that curr layer for cross-attn is already updated so we can re-use in subsequent calls
+                past_key_values.is_updated[self.layer_idx] = True
+
+        if self.config._attn_implementation == "eager":
+            attention_interface: Callable = eager_attention_forward
+        else:
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            scaling=self.scaling,
+            **kwargs,
+        )
+
+        # Merge the head dimensions back into one feature axis before the output projection.
+        merged = attn_output.reshape((*leading_dims, -1)).contiguous()
+        return self.o_proj(merged), attn_weights
+
+
+class DiaEncoderLayer(GradientCheckpointingLayer):
+    """Pre-norm encoder layer: non-causal self-attention followed by an MLP, each wrapped in a residual add."""
+
+    def __init__(self, config: DiaEncoderConfig, layer_idx: int):
+        super().__init__()
+        width, eps = config.hidden_size, config.norm_eps
+        self.pre_sa_norm = DiaRMSNorm(width, eps=eps)
+        self.self_attention = DiaSelfAttention(config, layer_idx, is_causal=False)
+        self.post_sa_norm = DiaRMSNorm(width, eps=eps)
+        self.mlp = DiaMLP(config)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
+        attention_mask: Optional[torch.Tensor] = None,
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        """Run the layer and return `(hidden_states, self_attn_weights)`."""
+        # Self-attention sub-block: normalise, attend, add back onto the un-normalised input.
+        attn_out, self_attn_weights = self.self_attention(
+            self.pre_sa_norm(hidden_states),
+            position_embeddings=position_embeddings,
+            attention_mask=attention_mask,
+            **kwargs,
+        )
+        hidden_states = hidden_states + attn_out
+
+        # Feed-forward sub-block with the same pre-norm + residual shape.
+        hidden_states = hidden_states + self.mlp(self.post_sa_norm(hidden_states))
+
+        return hidden_states, self_attn_weights
+
+
+class DiaEncoder(DiaPreTrainedModel):
+    # Encoder stack: token embedding -> `config.num_hidden_layers` x `DiaEncoderLayer` -> final `DiaRMSNorm`.
+    # Rotary position embeddings are computed once per forward pass and handed to every layer.
+    def __init__(self, config: DiaEncoderConfig):
+        super().__init__(config)
+        self.config = config
+
+        self.embedding = nn.Embedding(config.vocab_size, config.hidden_size)
+        self.layers = nn.ModuleList(
+            [DiaEncoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+        )
+        self.norm = DiaRMSNorm(config.hidden_size, eps=config.norm_eps)
+        self.rotary_embeddings = DiaRotaryEmbedding(config)
+
+    @auto_docstring
+    @can_return_tuple
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = False,
+        output_hidden_states: Optional[bool] = False,
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> Union[BaseModelOutput, tuple]:
+        # Embeds `input_ids`, runs every encoder layer, applies the final norm and returns a `BaseModelOutput`.
+        # `hidden_states` / `attentions` in the output are tuples only when the matching `output_*` flag is truthy,
+        # otherwise they stay `None`.
+        hidden_states = self.embedding(input_ids)
+
+        # RoPE
+        # Note: We expect right padding and hence always generate
+        # the position ids on the fly to reduce preparation overhead
+        # Positions are 0..seq_len-1 with a leading singleton dimension added by `[None, :]`.
+        position_ids = torch.arange(input_ids.shape[-1], device=input_ids.device)[None, :]
+        position_embeddings = self.rotary_embeddings(hidden_states, position_ids)
+
+        # Convert the user-provided mask into the form the configured attention backend expects.
+        attention_mask = self._update_full_mask(
+            attention_mask,
+            hidden_states,
+        )
+
+        encoder_states = () if output_hidden_states else None
+        all_attentions = () if output_attentions else None
+
+        for encoder_layer in self.layers:
+            # Record the INPUT of each layer; the final (normed) output is appended after the loop.
+            if output_hidden_states:
+                encoder_states = encoder_states + (hidden_states,)
+
+            layer_outputs = encoder_layer(
+                hidden_states,
+                position_embeddings=position_embeddings,
+                attention_mask=attention_mask,
+                **kwargs,
+            )
+            hidden_states = layer_outputs[0]
+
+            if output_attentions:
+                all_attentions = all_attentions + (layer_outputs[1],)
+
+        hidden_states = self.norm(hidden_states)
+
+        if output_hidden_states:
+            encoder_states += (hidden_states,)
+
+        return BaseModelOutput(
+            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
+        )
+
+    # Copied from transformers.models.bart.modeling_bart.BartPreTrainedModel._update_full_mask
+    def _update_full_mask(
+        self,
+        attention_mask: Union[torch.Tensor, None],
+        inputs_embeds: torch.Tensor,
+    ):
+        # Adapts `attention_mask` to `self.config._attn_implementation`; a `None` mask is returned unchanged.
+        # `inputs_embeds` is only used for its dtype.
+        if attention_mask is not None:
+            if self.config._attn_implementation == "flash_attention_2":
+                # Keep the mask only if it actually masks something (contains a 0); otherwise drop it.
+                attention_mask = attention_mask if 0 in attention_mask else None
+            elif self.config._attn_implementation == "sdpa":
+                # output_attentions=True & head_mask can not be supported when using SDPA, fall back to
+                # the manual implementation that requires a 4D causal mask in all cases.
+                # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+                attention_mask = _prepare_4d_attention_mask_for_sdpa(attention_mask, inputs_embeds.dtype)
+            elif self.config._attn_implementation == "flex_attention":
+                # Non-tensor masks are passed through untouched.
+                if isinstance(attention_mask, torch.Tensor):
+                    attention_mask = make_flex_block_causal_mask(attention_mask, is_causal=False)
+            else:
+                # Any other implementation (including "eager") gets the generic 4D mask.
+                # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+                attention_mask = _prepare_4d_attention_mask(attention_mask, inputs_embeds.dtype)
+
+        return attention_mask
+
+
+class DiaDecoderLayer(GradientCheckpointingLayer):
+    """Pre-norm decoder layer: causal self-attention, cross-attention, then an MLP, each with a residual add."""
+
+    def __init__(self, config: DiaDecoderConfig, layer_idx: int):
+        super().__init__()
+        width, eps = config.hidden_size, config.norm_eps
+        self.embed_dim = width
+        self.self_attention = DiaSelfAttention(config, layer_idx, is_causal=True)
+        self.cross_attention = DiaCrossAttention(config, layer_idx)
+        self.pre_sa_norm = DiaRMSNorm(width, eps=eps)
+        self.pre_ca_norm = DiaRMSNorm(width, eps=eps)
+        self.pre_mlp_norm = DiaRMSNorm(width, eps=eps)
+        self.mlp = DiaMLP(config)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
+        attention_mask: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        past_key_values: Optional[EncoderDecoderCache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
+        """Run the layer and return `(hidden_states, self_attn_weights, cross_attn_weights)`."""
+        # Self-attention only sees the self-attention half of an encoder-decoder cache.
+        if isinstance(past_key_values, EncoderDecoderCache):
+            self_attn_cache = past_key_values.self_attention_cache
+        else:
+            self_attn_cache = past_key_values
+
+        # Self-attention sub-block.
+        self_attn_output, self_attn_weights = self.self_attention(
+            self.pre_sa_norm(hidden_states),
+            position_embeddings,
+            attention_mask,
+            # Needs to be an arg in order to function properly
+            # on inplace operations to be carried (e.g. compile)
+            self_attn_cache,
+            cache_position=cache_position,
+            **kwargs,
+        )
+        hidden_states = hidden_states + self_attn_output
+
+        # Cross-attention sub-block; it receives the full cache object, not just the self-attention half.
+        cross_states, cross_attn_weights = self.cross_attention(
+            self.pre_ca_norm(hidden_states),
+            encoder_hidden_states,
+            attention_mask=encoder_attention_mask,
+            past_key_values=past_key_values,
+            **kwargs,
+        )
+        hidden_states = hidden_states + cross_states
+
+        # Feed-forward sub-block.
+        hidden_states = hidden_states + self.mlp(self.pre_mlp_norm(hidden_states))
+
+        return hidden_states, self_attn_weights, cross_attn_weights
+
+
+class DiaDecoder(DiaPreTrainedModel):
+    """Transformer Decoder Stack using DenseGeneral."""
+
+    # NOTE(review): no "DenseGeneral" appears in the visible code of this class; the stack is built from
+    # `DiaMultiChannelEmbedding`, `DiaRotaryEmbedding`, `DiaDecoderLayer` and `DiaRMSNorm`. The docstring above
+    # may be a leftover from an earlier implementation - confirm before relying on it.
+
+    def __init__(self, config: DiaDecoderConfig):
+        super().__init__(config)
+        self.num_channels = config.num_channels
+        self.vocab_size = config.vocab_size
+        self.embeddings = DiaMultiChannelEmbedding(config)
+        self.rotary_embeddings = DiaRotaryEmbedding(config)
+        self.layers = nn.ModuleList(
+            [DiaDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+        )
+        self.norm = DiaRMSNorm(config.hidden_size, eps=config.norm_eps)
+
+    @auto_docstring
+    @can_return_tuple
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        position_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        encoder_attention_mask: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[EncoderDecoderCache] = None,
+        output_attentions: Optional[bool] = False,
+        output_hidden_states: Optional[bool] = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
+    ) -> Union[BaseModelOutputWithPastAndCrossAttentions, tuple]:
+        r"""
+        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length, num_codebooks)`):
+            The original `decoder_input_ids` in 3D shape to facilitate more efficient computations.
+
+            [What are input IDs?](../glossary#input-ids)
+        """
+
+        # All dimensions except the last one (the codebook/channel dimension per the docstring above).
+        batch_size, seq_length = input_ids.size()[:-1]
+        past_key_values_length = past_key_values.get_seq_length() if past_key_values is not None else 0
+        # Default cache positions continue right after what is already cached.
+        if cache_position is None:
+            cache_position = torch.arange(
+                past_key_values_length, past_key_values_length + seq_length, device=input_ids.device
+            )
+        # Default position ids are the cache positions with a leading singleton dimension.
+        if position_ids is None:
+            position_ids = cache_position[None, :]
+
+        # RoPE
+        hidden_states = self.embeddings(input_ids)
+        position_embeddings = self.rotary_embeddings(hidden_states, position_ids)
+
+        # Without a user mask, attend to everything (all ones); skipped while torchdynamo is compiling.
+        if attention_mask is None and not is_torchdynamo_compiling():
+            # required mask seq length can be calculated via length of past cache
+            mask_seq_length = past_key_values_length + seq_length
+            attention_mask = torch.ones(batch_size, mask_seq_length, device=input_ids.device)
+
+        # Self-attention mask (causal) and cross-attention mask (non-causal, over the encoder sequence).
+        attention_mask = create_causal_mask(
+            config=self.config,
+            input_embeds=hidden_states,
+            attention_mask=attention_mask,
+            cache_position=cache_position,
+            past_key_values=past_key_values,
+            position_ids=position_ids,
+        )
+        encoder_attention_mask = self._update_cross_attn_mask(
+            encoder_hidden_states,
+            encoder_attention_mask,
+            hidden_states.shape[:2],
+            hidden_states,
+        )
+
+        # Accumulators stay `None` unless the matching output flag is set; cross-attentions additionally require
+        # encoder hidden states.
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+        all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None
+
+        for layer in self.layers:
+            # Record the INPUT of each layer; the final (normed) output is appended after the loop.
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+
+            layer_outputs = layer(
+                hidden_states,
+                position_embeddings,
+                attention_mask,
+                encoder_hidden_states,
+                encoder_attention_mask=encoder_attention_mask,
+                past_key_values=past_key_values,
+                cache_position=cache_position,
+                **kwargs,
+            )
+            hidden_states = layer_outputs[0]
+
+            if output_attentions:
+                all_self_attns = all_self_attns + (layer_outputs[1],)
+
+                if encoder_hidden_states is not None:
+                    all_cross_attentions = all_cross_attentions + (layer_outputs[2],)
+
+        hidden_states = self.norm(hidden_states)
+
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+
+        # `past_key_values` is returned as the same object that was passed in; the layers receive it and may
+        # update it in place.
+        return BaseModelOutputWithPastAndCrossAttentions(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+            cross_attentions=all_cross_attentions,
+        )
+
+    # Copied from transformers.models.bart.modeling_bart.BartPreTrainedModel._update_cross_attn_mask
+    def _update_cross_attn_mask(
+        self,
+        encoder_hidden_states: Union[torch.Tensor, None],
+        encoder_attention_mask: Union[torch.Tensor, None],
+        input_shape: torch.Size,
+        inputs_embeds: torch.Tensor,
+    ):
+        # Adapts the encoder padding mask to `self.config._attn_implementation` for use in cross-attention.
+        # The mask is returned unchanged when either the encoder states or the mask itself is `None`.
+        # `input_shape[-1]` is used as the target (query) length; `inputs_embeds` is only used for its dtype.
+        # expand encoder attention mask
+        if encoder_hidden_states is not None and encoder_attention_mask is not None:
+            if self.config._attn_implementation == "flash_attention_2":
+                # Keep the mask only if it actually masks something (contains a 0); otherwise drop it.
+                encoder_attention_mask = encoder_attention_mask if 0 in encoder_attention_mask else None
+            elif self.config._attn_implementation == "sdpa":
+                # output_attentions=True & cross_attn_head_mask can not be supported when using SDPA, and we fall back on
+                # the manual implementation that requires a 4D causal mask in all cases.
+                # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+                encoder_attention_mask = _prepare_4d_attention_mask_for_sdpa(
+                    encoder_attention_mask,
+                    inputs_embeds.dtype,
+                    tgt_len=input_shape[-1],
+                )
+            elif self.config._attn_implementation == "flex_attention":
+                # Non-tensor masks are passed through untouched.
+                if isinstance(encoder_attention_mask, torch.Tensor):
+                    encoder_attention_mask = make_flex_block_causal_mask(
+                        encoder_attention_mask,
+                        query_length=input_shape[-1],
+                        is_causal=False,
+                    )
+            else:
+                # Any other implementation (including "eager") gets the generic 4D mask.
+                # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+                encoder_attention_mask = _prepare_4d_attention_mask(
+                    encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]
+                )
+
+        return encoder_attention_mask
+
+
+@auto_docstring(
+    custom_intro="""
+    The bare Dia model outputting raw hidden-states without any specific head on top.
+    """
+)
+class DiaModel(DiaPreTrainedModel):
+    # Encoder-decoder wrapper: builds a `DiaEncoder` from `config.encoder_config` and a `DiaDecoder` from
+    # `config.decoder_config`, and wires the encoder output into the decoder's cross-attention.
+    def __init__(self, config: DiaConfig):
+        super().__init__(config)
+        self.config = config
+        self.encoder = DiaEncoder(config.encoder_config)
+        self.decoder = DiaDecoder(config.decoder_config)
+        self.post_init()
+
+    def get_encoder(self):
+        # Accessor for the encoder sub-module.
+        return self.encoder
+
+    @auto_docstring
+    @can_return_tuple
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.LongTensor] = None,
+        decoder_input_ids: Optional[torch.LongTensor] = None,
+        decoder_position_ids: Optional[torch.LongTensor] = None,
+        decoder_attention_mask: Optional[torch.LongTensor] = None,
+        encoder_outputs: Optional[Union[BaseModelOutput, tuple]] = None,
+        past_key_values: Optional[EncoderDecoderCache] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
+    ) -> Union[tuple, Seq2SeqModelOutput]:
+        r"""
+        decoder_input_ids (`torch.LongTensor` of shape `(batch_size * num_codebooks, target_sequence_length)
+        or (batch_size, target_sequence_length, num_codebooks)`, *optional*):
+            1. (batch_size * num_codebooks, target_sequence_length): corresponds to the general use case where
+            the audio input codebooks are flattened into the batch dimension. This also aligns with the flat-
+            tened audio logits which are used to calculate the loss.
+
+            2. (batch_size, sequence_length, num_codebooks): corresponds to the internally used shape of
+            Dia to calculate embeddings and subsequent steps more efficiently.
+
+            If no `decoder_input_ids` are provided, it will create a tensor of `bos_token_id` with shape
+            `(batch_size, 1, num_codebooks)`. Indices can be obtained using the [`DiaProcessor`]. See
+            [`DiaProcessor.__call__`] for more details.
+
+            [What are decoder input IDs?](../glossary#decoder-input-ids)
+        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`):
+            Indices of positions of each input sequence tokens in the position embeddings.
+            Used to calculate the position embeddings up to `config.decoder_config.max_position_embeddings`.
+
+            [What are position IDs?](../glossary#position-ids)
+        """
+
+        # At least one source for the encoder states is required: raw ids or precomputed encoder outputs.
+        if input_ids is None and encoder_outputs is None:
+            raise ValueError(
+                "You should either provide text ids or the cached text encodings. Neither has been found."
+            )
+
+        # Explicit arguments win; `None` falls back to the values stored on the config.
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+
+        # Caching is switched off (with a one-time warning) when training with gradient checkpointing.
+        if self.is_gradient_checkpointing and self.training:
+            if use_cache:
+                logger.warning_once(
+                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+                )
+                use_cache = False
+
+        # Lazily create a cache holding one `DynamicCache` each for self-attention and cross-attention.
+        if use_cache and past_key_values is None:
+            past_key_values = EncoderDecoderCache(DynamicCache(config=self.config), DynamicCache(config=self.config))
+
+        # The encoder only runs when no encoder outputs were supplied by the caller.
+        if encoder_outputs is None:
+            encoder_outputs = self.encoder(
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                **kwargs,
+            )
+        # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput
+        elif not isinstance(encoder_outputs, BaseModelOutput):
+            encoder_outputs = BaseModelOutput(
+                last_hidden_state=encoder_outputs[0],
+                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
+                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
+            )
+
+        # On default we initialize the decoder with bos tokens if nothing has been provided
+        # `seq_len` is the placeholder -1 so that `reshape` below infers the sequence length.
+        bsz, seq_len, channels = (encoder_outputs[0].shape[0], -1, self.config.decoder_config.num_channels)
+        if decoder_input_ids is None:
+            decoder_input_ids = torch.full(
+                size=(bsz, 1, channels), fill_value=self.config.bos_token_id, device=self.device
+            )
+        # Ensure 3D
+        # 2D input is reshaped to (bsz, channels, seq_len) and then transposed to (bsz, seq_len, channels).
+        if decoder_input_ids.ndim == 2:
+            decoder_input_ids = decoder_input_ids.reshape(bsz, channels, seq_len).transpose(1, 2)
+
+        # The encoder-side `attention_mask` doubles as the cross-attention mask of the decoder.
+        # NOTE(review): `DiaDecoder.forward` has no named `use_cache` parameter, so `use_cache` is collected by
+        # its `**kwargs` and forwarded to the decoder layers - confirm this is intended.
+        decoder_outputs = self.decoder(
+            input_ids=decoder_input_ids,
+            position_ids=decoder_position_ids,
+            attention_mask=decoder_attention_mask,
+            encoder_hidden_states=encoder_outputs[0],
+            encoder_attention_mask=attention_mask,
+            past_key_values=past_key_values,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            **kwargs,
+        )
+
+        return Seq2SeqModelOutput(
+            last_hidden_state=decoder_outputs.last_hidden_state,
+            past_key_values=decoder_outputs.past_key_values,
+            decoder_hidden_states=decoder_outputs.hidden_states,
+            decoder_attentions=decoder_outputs.attentions,
+            cross_attentions=decoder_outputs.cross_attentions,
+            encoder_last_hidden_state=encoder_outputs[0],
+            encoder_hidden_states=encoder_outputs.hidden_states,
+            encoder_attentions=encoder_outputs.attentions,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    The Dia model consisting of a (byte) text encoder and audio decoder with a prediction head on top.
+    """
+)
+class DiaForConditionalGeneration(DiaPreTrainedModel, DiaGenerationMixin):
+    # Wraps `DiaModel` and adds a bias-free linear head that maps each decoder hidden state to
+    # `num_channels * vocab_size` logits.
+    base_model_prefix = "model"
+
+    def __init__(self, config: DiaConfig):
+        super().__init__(config)
+        self.config = config
+        self.model = DiaModel(config)
+
+        self.num_channels = config.decoder_config.num_channels
+        self.vocab_size = config.decoder_config.vocab_size
+        # One joint projection for all channels; the output is split per channel in `forward`.
+        self.logits_dense = nn.Linear(
+            config.decoder_config.hidden_size, (self.num_channels * self.vocab_size), bias=False
+        )
+        self.loss_type = "ForMaskedLM"
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_encoder(self):
+        # Delegates to the wrapped `DiaModel`.
+        return self.model.get_encoder()
+
+    def get_decoder(self):
+        # Delegates to the wrapped `DiaModel`.
+        # NOTE(review): `DiaModel` does not define `get_decoder` itself in this file section; presumably it is
+        # inherited from a base class - confirm.
+        return self.model.get_decoder()
+
+    @auto_docstring
+    @can_return_tuple
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.LongTensor] = None,
+        decoder_input_ids: Optional[torch.LongTensor] = None,
+        decoder_position_ids: Optional[torch.LongTensor] = None,
+        decoder_attention_mask: Optional[torch.LongTensor] = None,
+        encoder_outputs: Optional[Union[BaseModelOutput, tuple]] = None,
+        past_key_values: Optional[EncoderDecoderCache] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        labels: Optional[torch.LongTensor] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
+    ) -> Union[tuple, Seq2SeqLMOutput]:
+        r"""
+        decoder_input_ids (`torch.LongTensor` of shape `(batch_size * num_codebooks, target_sequence_length)
+        or (batch_size, target_sequence_length, num_codebooks)`, *optional*):
+            1. (batch_size * num_codebooks, target_sequence_length): corresponds to the general use case where
+            the audio input codebooks are flattened into the batch dimension. This also aligns with the flat-
+            tened audio logits which are used to calculate the loss.
+
+            2. (batch_size, sequence_length, num_codebooks): corresponds to the internally used shape of
+            Dia to calculate embeddings and subsequent steps more efficiently.
+
+            If no `decoder_input_ids` are provided, it will create a tensor of `bos_token_id` with shape
+            `(batch_size, 1, num_codebooks)`. Indices can be obtained using the [`DiaProcessor`]. See
+            [`DiaProcessor.__call__`] for more details.
+
+            [What are decoder input IDs?](../glossary#decoder-input-ids)
+        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`):
+            Indices of positions of each input sequence tokens in the position embeddings.
+            Used to calculate the position embeddings up to `config.decoder_config.max_position_embeddings`.
+
+            [What are position IDs?](../glossary#position-ids)
+        labels (`torch.LongTensor` of shape `(batch_size * num_codebooks,)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should either be in
+            `[0, ..., config.decoder_config.vocab_size - 1]` or -100. Tokens with indices set to `-100`
+            are ignored (masked).
+        """
+
+        # All arguments except `labels` are forwarded unchanged to the base model.
+        outputs = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            decoder_input_ids=decoder_input_ids,
+            decoder_position_ids=decoder_position_ids,
+            decoder_attention_mask=decoder_attention_mask,
+            encoder_outputs=encoder_outputs,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            cache_position=cache_position,
+            **kwargs,
+        )
+
+        last_hidden_state = outputs[0]
+        batch_size = last_hidden_state.shape[0]
+        # 3D <-> 2D makes it necessary to prioritize channel dim
+        # Shape walk-through: head output -> (batch, seq, channels, vocab) -> (batch, channels, seq, vocab)
+        # -> (batch * channels, seq, vocab). `.contiguous()` is required before the final `.view`.
+        audio_logits = (
+            self.logits_dense(last_hidden_state)
+            .view((batch_size, -1, self.num_channels, self.vocab_size))
+            .transpose(1, 2)
+            .contiguous()
+            .view(batch_size * self.num_channels, -1, self.vocab_size)
+        )
+
+        # The loss is only computed when labels are given; extra `**kwargs` are passed on to the loss function.
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(logits=audio_logits, labels=labels, vocab_size=self.vocab_size, **kwargs)
+
+        return Seq2SeqLMOutput(
+            loss=loss,
+            logits=audio_logits,
+            past_key_values=outputs.past_key_values,
+            decoder_hidden_states=outputs.decoder_hidden_states,
+            decoder_attentions=outputs.decoder_attentions,
+            cross_attentions=outputs.cross_attentions,
+            encoder_last_hidden_state=outputs.encoder_last_hidden_state,
+            encoder_hidden_states=outputs.encoder_hidden_states,
+            encoder_attentions=outputs.encoder_attentions,
+        )
+
+
+# Names exported by a star-import of this module.
+__all__ = ["DiaModel", "DiaPreTrainedModel", "DiaForConditionalGeneration"]
diff --git a/phivenv/Lib/site-packages/transformers/models/dia/modular_dia.py b/phivenv/Lib/site-packages/transformers/models/dia/modular_dia.py
new file mode 100644
index 0000000000000000000000000000000000000000..f99d32a01d9cffab79bd72852d776447e084681b
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/dia/modular_dia.py
@@ -0,0 +1,773 @@
+# coding=utf-8
+# Copyright 2025 The Nari Labs and HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch Dia model."""
+
+from typing import Callable, Optional, Union
+
+import torch
+from torch import nn
+
+from ...cache_utils import DynamicCache, EncoderDecoderCache
+from ...masking_utils import create_causal_mask
+from ...modeling_attn_mask_utils import (
+    _prepare_4d_attention_mask,
+    _prepare_4d_attention_mask_for_sdpa,
+)
+from ...modeling_flash_attention_utils import FlashAttentionKwargs
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import (
+    BaseModelOutput,
+    BaseModelOutputWithPastAndCrossAttentions,
+    Seq2SeqLMOutput,
+    Seq2SeqModelOutput,
+)
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from ...processing_utils import Unpack
+from ...utils import auto_docstring, can_return_tuple, is_torch_flex_attn_available, is_torchdynamo_compiling, logging
+from ..llama.modeling_llama import (
+    LlamaAttention,
+    LlamaRMSNorm,
+    LlamaRotaryEmbedding,
+    eager_attention_forward,
+)
+from ..phi3.modeling_phi3 import Phi3MLP
+from .configuration_dia import DiaConfig, DiaDecoderConfig, DiaEncoderConfig
+from .generation_dia import DiaGenerationMixin
+
+
+if is_torch_flex_attn_available():
+    from ...integrations.flex_attention import make_flex_block_causal_mask
+
+
+logger = logging.get_logger(__name__)
+
+
+@auto_docstring
+class DiaPreTrainedModel(PreTrainedModel):  # base class of DiaEncoder, DiaDecoder, DiaModel and DiaForConditionalGeneration
+    config: DiaConfig
+    base_model_prefix = "model"  # DiaForConditionalGeneration stores its DiaModel under `self.model`
+    supports_gradient_checkpointing = True
+    _supports_flash_attn = True
+    _supports_sdpa = True
+    _supports_flex_attn = True
+    _can_compile_fullgraph = True
+    main_input_name = "input_ids"  # DiaModel.forward feeds `input_ids` to the encoder
+    _no_split_modules = ["DiaEncoderLayer", "DiaDecoderLayer"]  # presumably layers that must stay on one device -- confirm in PreTrainedModel
+
+
+class DiaMultiChannelEmbedding(nn.Module):
+    """Embed all audio channels with one shared table instead of one table per channel.
+
+    Every channel owns a contiguous slice of `vocab_size` rows in a single `nn.Embedding`;
+    adding `channel_index * vocab_size` to a code selects the row inside that channel's slice.
+    Example with vocab_size = 8 and num_channels = 3, i.e. offsets = [0, 8, 16]:
+    - channel 0 codes [0, 1, 2, 3] -> rows [0, 1, 2, 3]
+    - channel 1 codes [1, 3, 4, 7] -> rows [9, 11, 12, 15]
+    - channel 2 codes [4, 5, 6, 7] -> rows [20, 21, 22, 23]
+    The per-channel embeddings of one time step are summed into one hidden vector.
+    NOTE: the offsets broadcast over the last dim of `audio_codes`, so channels must be
+    the trailing axis (DiaDecoder documents the input as (batch, seq, num_codebooks)).
+    """
+
+    def __init__(self, config: DiaDecoderConfig):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.num_channels = config.num_channels
+        self.embed = nn.Embedding(config.vocab_size * config.num_channels, config.hidden_size)
+        channel_offsets = config.vocab_size * torch.arange(config.num_channels, dtype=torch.long)  # (C,)
+        self.register_buffer("offsets", channel_offsets, persistent=False)
+
+    def forward(self, audio_codes: torch.Tensor) -> torch.Tensor:
+        shifted = torch.squeeze(audio_codes + self.offsets.to(audio_codes.device), 1)
+        per_channel = self.embed(shifted).view(shifted.shape[0], audio_codes.shape[1], -1, self.hidden_size)
+        return torch.sum(per_channel, dim=2)
+
+
+class DiaMLP(Phi3MLP):
+    pass  # no overrides: reuses the Phi3MLP implementation under a Dia-specific name
+
+
+class DiaRMSNorm(LlamaRMSNorm):
+    pass  # no overrides: reuses the LlamaRMSNorm implementation under a Dia-specific name
+
+
+class DiaRotaryEmbedding(LlamaRotaryEmbedding):
+    pass  # no overrides: reuses the LlamaRotaryEmbedding implementation under a Dia-specific name
+
+
+class DiaSelfAttention(LlamaAttention):
+    """Self-attention for Dia; only `__init__` is overridden here, everything else comes from `LlamaAttention`."""
+
+    def __init__(self, config: Union[DiaEncoderConfig, DiaDecoderConfig], layer_idx: int, is_causal: bool = False):
+        nn.Module.__init__(self)  # deliberately skips LlamaAttention.__init__; all attributes are set below
+        self.config = config
+        self.layer_idx = layer_idx
+        self.hidden_size = config.hidden_size
+        self.num_heads = self.config.num_attention_heads
+        self.num_key_value_heads = self.config.num_key_value_heads or self.num_heads  # falsy config value -> one kv head per query head
+        self.num_key_value_groups = self.num_heads // self.num_key_value_heads  # query heads sharing one kv head
+        self.head_dim = getattr(config, "head_dim", config.hidden_size // self.num_heads)  # explicit `head_dim` wins if the config has one
+        self.scaling = 1  # NOTE(review): presumably the score scaling read by the inherited forward -- confirm
+        self.attention_dropout = 0.0
+        self.is_causal = is_causal  # False in DiaEncoderLayer, True in DiaDecoderLayer
+
+        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
+        self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
+        self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
+        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
+
+
+class DiaCrossAttention(nn.Module):
+    """Cross-attention: queries are projected from `hidden_states`, keys/values from `cross_attention_states`."""
+
+    def __init__(self, config: DiaDecoderConfig, layer_idx: int):
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx  # index of this layer's entry in the key/value cache
+        self.hidden_size = config.hidden_size  # input width of q_proj / output width of o_proj
+        self.cross_hidden_size = config.cross_hidden_size  # input width of k_proj and v_proj
+        self.num_heads = self.config.cross_num_attention_heads
+        self.num_key_value_heads = self.config.cross_num_key_value_heads
+        self.num_key_value_groups = self.num_heads // self.num_key_value_heads  # query heads sharing one kv head
+        self.head_dim = config.cross_head_dim
+        self.scaling = 1  # forwarded as `scaling=` to the attention function in `forward`
+        self.attention_dropout = 0.0
+        self.is_causal = False
+
+        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
+        self.k_proj = nn.Linear(self.cross_hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
+        self.v_proj = nn.Linear(self.cross_hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
+        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        cross_attention_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        past_key_values: Optional[EncoderDecoderCache] = None,
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        input_shape = hidden_states.shape[:-1]  # every dim of the query input except the feature dim
+        hidden_shape = (*input_shape, -1, self.head_dim)  # feature dim split into (heads, head_dim)
+        cross_shape = (*cross_attention_states.shape[:-1], -1, self.head_dim)  # same split for the key/value input
+
+        query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)  # heads moved to dim 1 (assumes 3D input -- TODO confirm)
+
+        is_updated = past_key_values.is_updated.get(self.layer_idx) if past_key_values is not None else False
+        if past_key_values is not None and is_updated:
+            # this layer's keys/values were cached by an earlier call: read them back instead of re-projecting
+            key_states = past_key_values.cross_attention_cache.layers[self.layer_idx].keys
+            value_states = past_key_values.cross_attention_cache.layers[self.layer_idx].values
+        else:
+            key_states = self.k_proj(cross_attention_states).view(cross_shape).transpose(1, 2)
+            value_states = self.v_proj(cross_attention_states).view(cross_shape).transpose(1, 2)
+
+            if past_key_values is not None:
+                # store the freshly projected keys/values in the cross-attention cache
+                key_states, value_states = past_key_values.cross_attention_cache.update(
+                    key_states,
+                    value_states,
+                    self.layer_idx,
+                )
+                # mark this layer as cached so that subsequent calls take the re-use branch above
+                past_key_values.is_updated[self.layer_idx] = True
+
+        attention_interface: Callable = eager_attention_forward  # used when the config selects "eager"
+        if self.config._attn_implementation != "eager":
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            scaling=self.scaling,
+            **kwargs,
+        )
+
+        attn_output = attn_output.reshape((*input_shape, -1)).contiguous()  # merge the head dims back into one feature dim
+        attn_output = self.o_proj(attn_output)
+        return attn_output, attn_weights
+
+
+class DiaEncoderLayer(GradientCheckpointingLayer):
+    def __init__(self, config: DiaEncoderConfig, layer_idx: int):
+        super().__init__()
+        self.pre_sa_norm = DiaRMSNorm(config.hidden_size, eps=config.norm_eps)
+        self.self_attention = DiaSelfAttention(config, layer_idx=layer_idx, is_causal=False)
+        self.post_sa_norm = DiaRMSNorm(config.hidden_size, eps=config.norm_eps)
+        self.mlp = DiaMLP(config)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
+        attention_mask: Optional[torch.Tensor] = None,
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        """Apply pre-norm self-attention and a pre-norm MLP, each wrapped in a residual connection.
+        Returns the updated hidden states together with the weights reported by the attention module.
+        """
+        # Self-attention sub-block: normalise first, then attend.
+        attn_out, attn_weights = self.self_attention(
+            self.pre_sa_norm(hidden_states),
+            position_embeddings=position_embeddings,
+            attention_mask=attention_mask,
+            **kwargs,
+        )
+        after_attention = hidden_states + attn_out  # first skip connection
+
+        # Feed-forward sub-block operates on the normalised post-attention stream.
+        after_mlp = after_attention + self.mlp(self.post_sa_norm(after_attention))  # second skip connection
+
+        return after_mlp, attn_weights
+
+
+class DiaEncoder(DiaPreTrainedModel):  # encoder stack: token embedding -> DiaEncoderLayer x num_hidden_layers -> RMSNorm
+    def __init__(self, config: DiaEncoderConfig):
+        super().__init__(config)
+        self.config = config
+
+        self.embedding = nn.Embedding(config.vocab_size, config.hidden_size)
+        self.layers = nn.ModuleList(
+            [DiaEncoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+        )
+        self.norm = DiaRMSNorm(config.hidden_size, eps=config.norm_eps)  # applied once, after the last layer
+        self.rotary_embeddings = DiaRotaryEmbedding(config)
+
+    @auto_docstring
+    @can_return_tuple
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = False,
+        output_hidden_states: Optional[bool] = False,
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> Union[BaseModelOutput, tuple]:
+        hidden_states = self.embedding(input_ids)
+
+        # RoPE
+        # Note: right padding is expected, so positions are simply 0..seq_len-1 and are
+        # generated on the fly (no `position_ids` argument) to reduce preparation overhead
+        position_ids = torch.arange(input_ids.shape[-1], device=input_ids.device)[None, :]  # shape (1, seq_len)
+        position_embeddings = self.rotary_embeddings(hidden_states, position_ids)
+
+        attention_mask = self._update_full_mask(  # adapts the mask to the configured attention implementation
+            attention_mask,
+            hidden_states,
+        )
+
+        encoder_states = () if output_hidden_states else None  # collected only when requested
+        all_attentions = () if output_attentions else None  # collected only when requested
+
+        for encoder_layer in self.layers:
+            if output_hidden_states:
+                encoder_states = encoder_states + (hidden_states,)  # input of this layer (embedding output for the first)
+
+            layer_outputs = encoder_layer(
+                hidden_states,
+                position_embeddings=position_embeddings,
+                attention_mask=attention_mask,
+                **kwargs,
+            )
+            hidden_states = layer_outputs[0]
+
+            if output_attentions:
+                all_attentions = all_attentions + (layer_outputs[1],)
+
+        hidden_states = self.norm(hidden_states)
+
+        if output_hidden_states:
+            encoder_states += (hidden_states,)  # the final, normalised output is appended last
+
+        return BaseModelOutput(
+            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
+        )
+
+    # Copied from transformers.models.bart.modeling_bart.BartPreTrainedModel._update_full_mask
+    def _update_full_mask(
+        self,
+        attention_mask: Union[torch.Tensor, None],
+        inputs_embeds: torch.Tensor,
+    ):
+        if attention_mask is not None:
+            if self.config._attn_implementation == "flash_attention_2":
+                attention_mask = attention_mask if 0 in attention_mask else None
+            elif self.config._attn_implementation == "sdpa":
+                # output_attentions=True & head_mask can not be supported when using SDPA, fall back to
+                # the manual implementation that requires a 4D causal mask in all cases.
+                # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+                attention_mask = _prepare_4d_attention_mask_for_sdpa(attention_mask, inputs_embeds.dtype)
+            elif self.config._attn_implementation == "flex_attention":
+                if isinstance(attention_mask, torch.Tensor):
+                    attention_mask = make_flex_block_causal_mask(attention_mask, is_causal=False)
+            else:
+                # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+                attention_mask = _prepare_4d_attention_mask(attention_mask, inputs_embeds.dtype)
+
+        return attention_mask
+
+
+class DiaDecoderLayer(GradientCheckpointingLayer):  # pre-norm self-attention -> pre-norm cross-attention -> pre-norm MLP
+    def __init__(self, config: DiaDecoderConfig, layer_idx: int):
+        super().__init__()
+        self.embed_dim = config.hidden_size  # NOTE(review): not read anywhere else in this class
+        self.self_attention = DiaSelfAttention(config, layer_idx, is_causal=True)
+        self.cross_attention = DiaCrossAttention(config, layer_idx)
+        self.pre_sa_norm = DiaRMSNorm(config.hidden_size, eps=config.norm_eps)  # before self-attention
+        self.pre_ca_norm = DiaRMSNorm(config.hidden_size, eps=config.norm_eps)  # before cross-attention
+        self.pre_mlp_norm = DiaRMSNorm(config.hidden_size, eps=config.norm_eps)  # before the MLP
+        self.mlp = DiaMLP(config)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
+        attention_mask: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        past_key_values: Optional[EncoderDecoderCache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
+        self_attn_cache = past_key_values
+        if isinstance(self_attn_cache, EncoderDecoderCache):
+            self_attn_cache = self_attn_cache.self_attention_cache  # self-attention only sees its own half of the cache
+
+        residual = hidden_states
+        normed_states = self.pre_sa_norm(hidden_states)
+        self_attn_output, self_attn_weights = self.self_attention(
+            normed_states,
+            position_embeddings,
+            attention_mask,
+            # The cache is passed positionally on purpose: it needs to be an arg for
+            # in-place operations on it to be carried (e.g. compile)
+            self_attn_cache,
+            cache_position=cache_position,
+            **kwargs,
+        )
+        hidden_states = residual + self_attn_output
+
+        residual = hidden_states
+        normed_states = self.pre_ca_norm(hidden_states)
+        cross_states, cross_attn_weights = self.cross_attention(
+            normed_states,
+            encoder_hidden_states,
+            attention_mask=encoder_attention_mask,
+            past_key_values=past_key_values,  # full cache object: cross-attention reads `.cross_attention_cache` and `.is_updated`
+            **kwargs,
+        )
+        hidden_states = residual + cross_states
+
+        residual = hidden_states
+        normed_states = self.pre_mlp_norm(hidden_states)
+        mlp_out = self.mlp(normed_states)
+        hidden_states = residual + mlp_out
+
+        return hidden_states, self_attn_weights, cross_attn_weights
+
+
+class DiaDecoder(DiaPreTrainedModel):
+    """Decoder stack: multi-channel embedding -> DiaDecoderLayer x num_hidden_layers -> RMSNorm."""
+
+    def __init__(self, config: DiaDecoderConfig):
+        super().__init__(config)
+        self.num_channels = config.num_channels
+        self.vocab_size = config.vocab_size
+        self.embeddings = DiaMultiChannelEmbedding(config)
+        self.rotary_embeddings = DiaRotaryEmbedding(config)
+        self.layers = nn.ModuleList(
+            [DiaDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+        )
+        self.norm = DiaRMSNorm(config.hidden_size, eps=config.norm_eps)  # applied once, after the last layer
+
+    @auto_docstring
+    @can_return_tuple
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        position_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        encoder_attention_mask: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[EncoderDecoderCache] = None,
+        output_attentions: Optional[bool] = False,
+        output_hidden_states: Optional[bool] = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
+    ) -> Union[BaseModelOutputWithPastAndCrossAttentions, tuple]:
+        r"""
+        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length, num_codebooks)`):
+            The original `decoder_input_ids` in 3D shape to facilitate more efficient computations.
+
+            [What are input IDs?](../glossary#input-ids)
+        """
+
+        batch_size, seq_length = input_ids.size()[:-1]  # the trailing dim (codebooks) is dropped, leaving (batch, seq)
+        past_key_values_length = past_key_values.get_seq_length() if past_key_values is not None else 0
+        if cache_position is None:
+            cache_position = torch.arange(  # positions of the new tokens, continuing after the cached length
+                past_key_values_length, past_key_values_length + seq_length, device=input_ids.device
+            )
+        if position_ids is None:
+            position_ids = cache_position[None, :]  # shape (1, seq_length)
+
+        # Embed the codes, then compute RoPE for the selected positions
+        hidden_states = self.embeddings(input_ids)
+        position_embeddings = self.rotary_embeddings(hidden_states, position_ids)
+
+        if attention_mask is None and not is_torchdynamo_compiling():  # default mask is only built outside of dynamo compilation
+            # required mask seq length can be calculated via length of past cache
+            mask_seq_length = past_key_values_length + seq_length
+            attention_mask = torch.ones(batch_size, mask_seq_length, device=input_ids.device)  # all ones over cached + new positions
+
+        attention_mask = create_causal_mask(
+            config=self.config,
+            input_embeds=hidden_states,
+            attention_mask=attention_mask,
+            cache_position=cache_position,
+            past_key_values=past_key_values,
+            position_ids=position_ids,
+        )
+        encoder_attention_mask = self._update_cross_attn_mask(
+            encoder_hidden_states,
+            encoder_attention_mask,
+            hidden_states.shape[:2],  # `input_shape`; its last entry is used as the target length
+            hidden_states,
+        )
+
+        all_hidden_states = () if output_hidden_states else None  # collected only when requested
+        all_self_attns = () if output_attentions else None  # collected only when requested
+        all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None
+
+        for layer in self.layers:
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)  # input of this layer (embedding output for the first)
+
+            layer_outputs = layer(
+                hidden_states,
+                position_embeddings,
+                attention_mask,
+                encoder_hidden_states,
+                encoder_attention_mask=encoder_attention_mask,
+                past_key_values=past_key_values,
+                cache_position=cache_position,
+                **kwargs,
+            )
+            hidden_states = layer_outputs[0]
+
+            if output_attentions:
+                all_self_attns = all_self_attns + (layer_outputs[1],)
+
+                if encoder_hidden_states is not None:
+                    all_cross_attentions = all_cross_attentions + (layer_outputs[2],)
+
+        hidden_states = self.norm(hidden_states)
+
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)  # the final, normalised output is appended last
+
+        return BaseModelOutputWithPastAndCrossAttentions(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values,  # the same cache object that was passed in (None if none was given)
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+            cross_attentions=all_cross_attentions,
+        )
+
+    # Copied from transformers.models.bart.modeling_bart.BartPreTrainedModel._update_cross_attn_mask
+    def _update_cross_attn_mask(
+        self,
+        encoder_hidden_states: Union[torch.Tensor, None],
+        encoder_attention_mask: Union[torch.Tensor, None],
+        input_shape: torch.Size,
+        inputs_embeds: torch.Tensor,
+    ):
+        # expand encoder attention mask
+        if encoder_hidden_states is not None and encoder_attention_mask is not None:
+            if self.config._attn_implementation == "flash_attention_2":
+                encoder_attention_mask = encoder_attention_mask if 0 in encoder_attention_mask else None
+            elif self.config._attn_implementation == "sdpa":
+                # output_attentions=True & cross_attn_head_mask can not be supported when using SDPA, and we fall back on
+                # the manual implementation that requires a 4D causal mask in all cases.
+                # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+                encoder_attention_mask = _prepare_4d_attention_mask_for_sdpa(
+                    encoder_attention_mask,
+                    inputs_embeds.dtype,
+                    tgt_len=input_shape[-1],
+                )
+            elif self.config._attn_implementation == "flex_attention":
+                if isinstance(encoder_attention_mask, torch.Tensor):
+                    encoder_attention_mask = make_flex_block_causal_mask(
+                        encoder_attention_mask,
+                        query_length=input_shape[-1],
+                        is_causal=False,
+                    )
+            else:
+                # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+                encoder_attention_mask = _prepare_4d_attention_mask(
+                    encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]
+                )
+
+        return encoder_attention_mask
+
+
+@auto_docstring(
+    custom_intro="""
+    The bare Dia model outputting raw hidden-states without any specific head on top.
+    """
+)
+class DiaModel(DiaPreTrainedModel):
+    def __init__(self, config: DiaConfig):
+        super().__init__(config)
+        self.config = config
+        self.encoder = DiaEncoder(config.encoder_config)  # each half is built from its own sub-config
+        self.decoder = DiaDecoder(config.decoder_config)
+        self.post_init()
+
+    def get_encoder(self):
+        return self.encoder
+
+    @auto_docstring
+    @can_return_tuple
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.LongTensor] = None,
+        decoder_input_ids: Optional[torch.LongTensor] = None,
+        decoder_position_ids: Optional[torch.LongTensor] = None,
+        decoder_attention_mask: Optional[torch.LongTensor] = None,
+        encoder_outputs: Optional[Union[BaseModelOutput, tuple]] = None,
+        past_key_values: Optional[EncoderDecoderCache] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
+    ) -> Union[tuple, Seq2SeqModelOutput]:
+        r"""
+        decoder_input_ids (`torch.LongTensor` of shape `(batch_size * num_codebooks, target_sequence_length)
+        or (batch_size, target_sequence_length, num_codebooks)`, *optional*):
+            1. (batch_size * num_codebooks, target_sequence_length): corresponds to the general use case where
+            the audio input codebooks are flattened into the batch dimension. This also aligns with the
+            flattened audio logits which are used to calculate the loss.
+
+            2. (batch_size, sequence_length, num_codebooks): corresponds to the internally used shape of
+            Dia to calculate embeddings and subsequent steps more efficiently.
+
+            If no `decoder_input_ids` are provided, it will create a tensor of `bos_token_id` with shape
+            `(batch_size, 1, num_codebooks)`. Indices can be obtained using the [`DiaProcessor`]. See
+            [`DiaProcessor.__call__`] for more details.
+
+            [What are decoder input IDs?](../glossary#decoder-input-ids)
+        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`):
+            Indices of positions of each input sequence tokens in the position embeddings.
+            Used to calculate the position embeddings up to `config.decoder_config.max_position_embeddings`.
+
+            [What are position IDs?](../glossary#position-ids)
+        """
+
+        if input_ids is None and encoder_outputs is None:  # the encoder needs ids unless its output is supplied directly
+            raise ValueError(
+                "You should either provide text ids or the cached text encodings. Neither has been found."
+            )
+
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache  # None falls back to the config value
+
+        if self.is_gradient_checkpointing and self.training:
+            if use_cache:
+                logger.warning_once(
+                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+                )
+                use_cache = False
+
+        if use_cache and past_key_values is None:  # presumably (self-attention cache, cross-attention cache) -- confirm argument order
+            past_key_values = EncoderDecoderCache(DynamicCache(config=self.config), DynamicCache(config=self.config))
+
+        if encoder_outputs is None:
+            encoder_outputs = self.encoder(
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                **kwargs,
+            )
+        # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput
+        elif not isinstance(encoder_outputs, BaseModelOutput):
+            encoder_outputs = BaseModelOutput(
+                last_hidden_state=encoder_outputs[0],
+                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
+                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
+            )
+
+        # By default the decoder is initialized with bos tokens if no decoder input has been provided
+        bsz, seq_len, channels = (encoder_outputs[0].shape[0], -1, self.config.decoder_config.num_channels)  # seq_len=-1: inferred by reshape
+        if decoder_input_ids is None:
+            decoder_input_ids = torch.full(
+                size=(bsz, 1, channels), fill_value=self.config.bos_token_id, device=self.device
+            )
+        # Ensure 3D: (bsz * channels, T) -> (bsz, channels, T) -> (bsz, T, channels)
+        if decoder_input_ids.ndim == 2:
+            decoder_input_ids = decoder_input_ids.reshape(bsz, channels, seq_len).transpose(1, 2)
+
+        decoder_outputs = self.decoder(
+            input_ids=decoder_input_ids,
+            position_ids=decoder_position_ids,
+            attention_mask=decoder_attention_mask,
+            encoder_hidden_states=encoder_outputs[0],
+            encoder_attention_mask=attention_mask,  # the encoder-side mask is reused for cross-attention
+            past_key_values=past_key_values,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            use_cache=use_cache,  # not a named DiaDecoder.forward parameter, so it travels on inside **kwargs
+            cache_position=cache_position,
+            **kwargs,
+        )
+
+        return Seq2SeqModelOutput(
+            last_hidden_state=decoder_outputs.last_hidden_state,
+            past_key_values=decoder_outputs.past_key_values,
+            decoder_hidden_states=decoder_outputs.hidden_states,
+            decoder_attentions=decoder_outputs.attentions,
+            cross_attentions=decoder_outputs.cross_attentions,
+            encoder_last_hidden_state=encoder_outputs[0],
+            encoder_hidden_states=encoder_outputs.hidden_states,
+            encoder_attentions=encoder_outputs.attentions,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    The Dia model consisting of a (byte) text encoder and audio decoder with a prediction head on top.
+    """
+)
+class DiaForConditionalGeneration(DiaPreTrainedModel, DiaGenerationMixin):
+    base_model_prefix = "model"  # name of the attribute holding the wrapped DiaModel
+
+    def __init__(self, config: DiaConfig):
+        super().__init__(config)
+        self.config = config
+        self.model = DiaModel(config)
+
+        self.num_channels = config.decoder_config.num_channels
+        self.vocab_size = config.decoder_config.vocab_size
+        self.logits_dense = nn.Linear(  # one projection yields the logits of all channels: num_channels * vocab_size outputs
+            config.decoder_config.hidden_size, (self.num_channels * self.vocab_size), bias=False
+        )
+        self.loss_type = "ForMaskedLM"  # presumably selects what `self.loss_function` computes -- confirm in PreTrainedModel
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_encoder(self):
+        return self.model.get_encoder()
+
+    def get_decoder(self):
+        return self.model.get_decoder()  # NOTE(review): DiaModel defines no get_decoder in this file; presumably inherited -- confirm
+
+    @auto_docstring
+    @can_return_tuple
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.LongTensor] = None,
+        decoder_input_ids: Optional[torch.LongTensor] = None,
+        decoder_position_ids: Optional[torch.LongTensor] = None,
+        decoder_attention_mask: Optional[torch.LongTensor] = None,
+        encoder_outputs: Optional[Union[BaseModelOutput, tuple]] = None,
+        past_key_values: Optional[EncoderDecoderCache] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        labels: Optional[torch.LongTensor] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
+    ) -> Union[tuple, Seq2SeqLMOutput]:
+        r"""
+        decoder_input_ids (`torch.LongTensor` of shape `(batch_size * num_codebooks, target_sequence_length)
+        or (batch_size, target_sequence_length, num_codebooks)`, *optional*):
+            1. (batch_size * num_codebooks, target_sequence_length): corresponds to the general use case where
+            the audio input codebooks are flattened into the batch dimension. This also aligns with the
+            flattened audio logits which are used to calculate the loss.
+
+            2. (batch_size, sequence_length, num_codebooks): corresponds to the internally used shape of
+            Dia to calculate embeddings and subsequent steps more efficiently.
+
+            If no `decoder_input_ids` are provided, it will create a tensor of `bos_token_id` with shape
+            `(batch_size, 1, num_codebooks)`. Indices can be obtained using the [`DiaProcessor`]. See
+            [`DiaProcessor.__call__`] for more details.
+
+            [What are decoder input IDs?](../glossary#decoder-input-ids)
+        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`):
+            Indices of positions of each input sequence tokens in the position embeddings.
+            Used to calculate the position embeddings up to `config.decoder_config.max_position_embeddings`.
+
+            [What are position IDs?](../glossary#position-ids)
+        labels (`torch.LongTensor` of shape `(batch_size * num_codebooks,)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should either be in
+            `[0, ..., config.decoder_config.vocab_size - 1]` or -100. Tokens with indices set to `-100`
+            are ignored (masked).
+        """
+
+        outputs = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            decoder_input_ids=decoder_input_ids,
+            decoder_position_ids=decoder_position_ids,
+            decoder_attention_mask=decoder_attention_mask,
+            encoder_outputs=encoder_outputs,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            cache_position=cache_position,
+            **kwargs,
+        )
+
+        last_hidden_state = outputs[0]
+        batch_size = last_hidden_state.shape[0]
+        # 3D <-> 2D makes it necessary to prioritize channel dim (B = batch, T = steps, C = channels, V = vocab)
+        audio_logits = (
+            self.logits_dense(last_hidden_state)
+            .view((batch_size, -1, self.num_channels, self.vocab_size))  # (B, T, C * V) -> (B, T, C, V)
+            .transpose(1, 2)  # -> (B, C, T, V)
+            .contiguous()  # materialise the transposed layout before flattening
+            .view(batch_size * self.num_channels, -1, self.vocab_size)  # -> (B * C, T, V)
+        )
+
+        loss = None
+        if labels is not None:  # note: the same **kwargs were already forwarded to `self.model` above
+            loss = self.loss_function(logits=audio_logits, labels=labels, vocab_size=self.vocab_size, **kwargs)
+
+        return Seq2SeqLMOutput(
+            loss=loss,
+            logits=audio_logits,
+            past_key_values=outputs.past_key_values,
+            decoder_hidden_states=outputs.decoder_hidden_states,
+            decoder_attentions=outputs.decoder_attentions,
+            cross_attentions=outputs.cross_attentions,
+            encoder_last_hidden_state=outputs.encoder_last_hidden_state,
+            encoder_hidden_states=outputs.encoder_hidden_states,
+            encoder_attentions=outputs.encoder_attentions,
+        )
+
+
+__all__ = ["DiaModel", "DiaPreTrainedModel", "DiaForConditionalGeneration"]  # names exported by a star-import of this module
diff --git a/phivenv/Lib/site-packages/transformers/models/dia/processing_dia.py b/phivenv/Lib/site-packages/transformers/models/dia/processing_dia.py
new file mode 100644
index 0000000000000000000000000000000000000000..402f5152a64bda378ccdf5edd512c86fe643145c
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/dia/processing_dia.py
@@ -0,0 +1,474 @@
+# coding=utf-8
+# Copyright 2025 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Processor class for Dia"""
+
+import math
+from pathlib import Path
+from typing import Optional, Union
+
+from ...audio_utils import AudioInput, make_list_of_audio
+from ...feature_extraction_utils import BatchFeature
+from ...processing_utils import AudioKwargs, ProcessingKwargs, ProcessorMixin, Unpack
+from ...utils import is_soundfile_available, is_torch_available
+
+
+if is_torch_available():
+    import torch
+
+if is_soundfile_available():
+    import soundfile as sf
+
+
+class DiaAudioKwargs(AudioKwargs, total=False):
+    bos_token_id: int
+    eos_token_id: int
+    pad_token_id: int
+    delay_pattern: list[int]
+    generation: bool
+
+
+class DiaProcessorKwargs(ProcessingKwargs, total=False):
+    audio_kwargs: DiaAudioKwargs
+    _defaults = {
+        "text_kwargs": {
+            "padding": True,
+            "padding_side": "right",
+            "add_special_tokens": False,
+        },
+        "audio_kwargs": {
+            "eos_token_id": 1024,
+            "pad_token_id": 1025,
+            "bos_token_id": 1026,
+            "delay_pattern": [0, 8, 9, 10, 11, 12, 13, 14, 15],
+            "generation": True,
+            "sampling_rate": 44100,
+        },
+        "common_kwargs": {"return_tensors": "pt"},
+    }
+
+
+class DiaProcessor(ProcessorMixin):
+    r"""
+    Constructs a Dia processor which wraps a [`DiaFeatureExtractor`], [`DiaTokenizer`], and a [`DacModel`] into
+    a single processor. It inherits the audio feature extraction, tokenizer, and audio encode/decode
+    functionalities. See [`~DiaProcessor.__call__`], [`~DiaProcessor.encode`], and [`~DiaProcessor.decode`] for
+    more information.
+
+    Args:
+        feature_extractor (`DiaFeatureExtractor`):
+            An instance of [`DiaFeatureExtractor`]. The feature extractor is a required input.
+        tokenizer (`DiaTokenizer`):
+            An instance of [`DiaTokenizer`]. The tokenizer is a required input.
+        audio_tokenizer (`DacModel`):
+            An instance of [`DacModel`] used to encode/decode audio into/from codebooks. It is a required input.
+    """
+
+    feature_extractor_class = "DiaFeatureExtractor"
+    tokenizer_class = "DiaTokenizer"
+    audio_tokenizer_class = "DacModel"
+
+    def __init__(self, feature_extractor, tokenizer, audio_tokenizer):
+        super().__init__(feature_extractor, tokenizer, audio_tokenizer=audio_tokenizer)
+
+    def __call__(
+        self,
+        text: Union[str, list[str]],
+        audio: Optional[AudioInput] = None,
+        output_labels: Optional[bool] = False,
+        **kwargs: Unpack[DiaProcessorKwargs],
+    ):
+        """
+        Main method to prepare text(s) and audio to be fed as input to the model. The `audio` argument is
+        forwarded to the DiaFeatureExtractor's [`~DiaFeatureExtractor.__call__`] and subsequently to the
+        DacModel's [`~DacModel.encode`]. The `text` argument is passed to [`~DiaTokenizer.__call__`]. Please refer
+        to the docstring of the above methods for more information.
+        """
+        if not is_torch_available():
+            raise ValueError(
+                "The `DiaProcessor` relies on the `audio_tokenizer` which requires `torch` but we couldn't "
+                "find it in your environment. You can install torch via `pip install torch`."
+            )
+
+        if text is None:
+            raise ValueError("You need to specify the `text` input to process.")
+
+        output_kwargs = self._merge_kwargs(
+            DiaProcessorKwargs,
+            **kwargs,
+        )
+
+        text_kwargs = output_kwargs["text_kwargs"]
+        audio_kwargs = output_kwargs["audio_kwargs"]
+        common_kwargs = output_kwargs["common_kwargs"]
+
+        return_tensors = common_kwargs.pop("return_tensors", None)
+        if return_tensors != "pt":
+            raise ValueError(f"{self.__class__.__name__} only supports `return_tensors='pt'`.")
+
+        data = {}
+
+        # Text
+        if isinstance(text, str):
+            text = [text]
+        elif not (isinstance(text, (list, tuple)) and all(isinstance(t, str) for t in text)):
+            raise ValueError("Invalid input text. Please provide a string, or a list of strings")
+
+        encodings = self.tokenizer(text, **text_kwargs)
+        data.update(encodings)
+
+        # Audio
+        delay_pattern = audio_kwargs.pop("delay_pattern", None)
+        audio_bos_token_id = audio_kwargs.pop("bos_token_id", None)
+        audio_eos_token_id = audio_kwargs.pop("eos_token_id", None)
+        audio_pad_token_id = audio_kwargs.pop("pad_token_id", None)
+        generation = audio_kwargs.pop("generation", True)
+        if (
+            audio_bos_token_id is None
+            or audio_eos_token_id is None
+            or audio_pad_token_id is None
+            or delay_pattern is None
+        ):
+            raise ValueError(
+                "To enable processing for Dia, we need the `bos_token_id`, `eos_token_id`, "
+                "`pad_token_id`, and `delay_pattern`. You may have accidentally overwritten one of those."
+            )
+
+        if generation and output_labels:
+            raise ValueError(
+                f"Labels with `generation` is incompatible, got generation={generation}, output_labels={output_labels}."
+            )
+
+        batch_size = data["input_ids"].shape[0]
+        num_channels = len(delay_pattern)
+        max_delay = max(delay_pattern)
+
+        # Voice cloning generation / general training
+        if audio is not None:
+            audio = make_list_of_audio(audio)
+            input_audios = self.feature_extractor(audio, **audio_kwargs)
+
+            compression_rate = math.prod(self.audio_tokenizer.config.downsampling_ratios)
+            max_encoded_sequence_len = input_audios["padding_mask"][0].shape[-1] // compression_rate
+
+            decoder_input_ids = []
+            decoder_attention_mask = []
+            # TODO: dac with batching is currently broken, but non-batch is working
+            # refer to https://gist.github.com/vasqu/643a45b680cf39fd7467271ee2eb6f80 for a validation script
+            for padding_mask, audio in zip(input_audios["padding_mask"], input_audios["input_values"]):
+                # get current length with hop length in mind (as if it were sampled as a single audio)
+                base_pad_len = self.feature_extractor.hop_length
+                current_audio_len = math.ceil(padding_mask.sum(dim=-1) / base_pad_len) * base_pad_len
+
+                encoded_sequence_len = current_audio_len // compression_rate
+                padding_len = max_encoded_sequence_len - encoded_sequence_len
+
+                # compute non-padded forward pass; one extra bos (and eos if training) is added
+                with torch.no_grad():
+                    audio = audio[None, ..., :current_audio_len].to(self.audio_tokenizer.device)
+                    input_ids = self.audio_tokenizer.encode(audio).audio_codes.transpose(1, 2)
+
+                if not generation:
+                    input_ids = torch.nn.functional.pad(
+                        input_ids, pad=(0, 0, 0, 1, 0, 0), mode="constant", value=audio_eos_token_id
+                    )
+
+                # apply padding
+                # +1 for the bos within the real sequence
+                input_ids = torch.nn.functional.pad(
+                    input_ids, pad=(0, 0, padding_len + 1, 0, 0, 0), mode="constant", value=audio_bos_token_id
+                )
+                num_valid_inputs = encoded_sequence_len + 1 + max_delay  # sequence + bos + delay
+                num_valid_inputs += 0 if generation else 1  # eos if training
+                attention_mask = torch.tensor([0] * padding_len + [1] * num_valid_inputs, dtype=torch.long)[None, :]
+
+                decoder_input_ids.append(input_ids)
+                decoder_attention_mask.append(attention_mask)
+
+            decoder_input_ids = torch.cat(decoder_input_ids, dim=0)
+            decoder_attention_mask = torch.cat(decoder_attention_mask, dim=0)
+        # TTS generation
+        elif generation:
+            # all bos to start with TTS
+            decoder_input_ids = torch.full((batch_size, 1, num_channels), audio_bos_token_id, dtype=torch.long)
+
+            # we preemptively add the delay
+            decoder_attention_mask = torch.ones(size=(batch_size, 1 + max_delay), dtype=torch.long)
+        else:
+            raise ValueError("If you try to train, you should provide audio data as well.")
+
+        if batch_size != decoder_input_ids.shape[0]:
+            raise ValueError(
+                f"Need the same amount of samples for both text and audio, but got text samples={batch_size} and "
+                f"audio samples = {decoder_input_ids.shape[0]} instead."
+            )
+
+        # prepare shift indices per delay
+        max_seq_len = decoder_attention_mask.shape[-1]
+        max_audio_len = max_seq_len - max_delay
+        precomputed_idx = self.build_indices(
+            bsz=batch_size,
+            seq_len=max_seq_len,
+            num_channels=num_channels,
+            delay_pattern=delay_pattern,
+            revert=False,
+        )
+
+        # create delay pattern input
+        # the pad token will be used for masking which input is valid for prediction during generation
+        prefill = torch.full(
+            (batch_size, max_seq_len, num_channels),
+            fill_value=audio_pad_token_id,
+            dtype=torch.int,
+        )
+        prefill[:, :max_audio_len] = decoder_input_ids
+
+        delayed_decoder_input_ids = self.apply_audio_delay(
+            audio=prefill,
+            pad_token_id=audio_pad_token_id,
+            bos_token_id=audio_bos_token_id,
+            precomputed_idx=precomputed_idx,
+        )
+
+        data.update({"decoder_input_ids": delayed_decoder_input_ids, "decoder_attention_mask": decoder_attention_mask})
+
+        if output_labels:
+            # Base idea is to shift on the sequence dim
+            labels = data["decoder_input_ids"].clone()[:, 1:]
+            labels[labels == audio_pad_token_id] = -100
+            labels[labels == audio_bos_token_id] = -100
+
+            data["labels"] = labels.transpose(1, 2).reshape(batch_size * num_channels, -1).contiguous().long()
+            data["decoder_input_ids"] = data["decoder_input_ids"][:, :-1]
+            data["decoder_attention_mask"] = data["decoder_attention_mask"][:, :-1]
+
+        return BatchFeature(data=data, tensor_type=return_tensors)
+
+    def batch_decode(
+        self,
+        decoder_input_ids: "torch.Tensor",
+        audio_prompt_len: Optional[int] = None,
+        **kwargs: Unpack[DiaProcessorKwargs],
+    ) -> list["torch.Tensor"]:
+        """
+        Decodes a batch of audio codebook sequences into their respective audio waveforms via the
+        `audio_tokenizer`. See [`~DacModel.decode`] for more information.
+
+        Args:
+            decoder_input_ids (`torch.Tensor`): The complete output sequence of the decoder.
+            audio_prompt_len (`int`): The audio prefix length (e.g. when using voice cloning).
+        """
+        output_kwargs = self._merge_kwargs(
+            DiaProcessorKwargs,
+            **kwargs,
+        )
+        audio_kwargs = output_kwargs["audio_kwargs"]
+
+        delay_pattern = audio_kwargs.pop("delay_pattern", None)
+        audio_bos_token_id = audio_kwargs.pop("bos_token_id", None)
+        audio_pad_token_id = audio_kwargs.pop("pad_token_id", None)
+        if audio_bos_token_id is None or audio_pad_token_id is None or delay_pattern is None:
+            raise ValueError(
+                "To enable decoding for Dia, we need the `bos_token_id`, `pad_token_id`, "
+                "and `delay_pattern`. You may have accidentally overwritten one of those."
+            )
+
+        # either decode the whole audio sequence or only the generated parts
+        if audio_prompt_len is not None:
+            audio_prompt_len = torch.tensor(audio_prompt_len, device=decoder_input_ids.device, dtype=torch.long)
+            start_of_generation_idx = audio_prompt_len[None].expand(decoder_input_ids.shape[0])
+        else:
+            start_of_generation_idx = (decoder_input_ids[:, :, 0] == audio_bos_token_id).sum(dim=-1)
+        # -1 for the eos token
+        end_of_generation_idx = (
+            decoder_input_ids.shape[1] - (decoder_input_ids[:, :, 0] == audio_pad_token_id).sum(dim=-1) - 1
+        )
+
+        # revert delay
+        bsz, seq_len, num_channels = decoder_input_ids.shape
+        precomputed_idx = self.build_indices(
+            bsz=bsz,
+            seq_len=seq_len,
+            num_channels=num_channels,
+            delay_pattern=delay_pattern,
+            revert=True,
+        )
+
+        output_sequences = self.apply_audio_delay(
+            audio=decoder_input_ids,
+            # We do not care about these values as we cut them out
+            # with `start_of_generation_idx` and `end_of_generation_idx`
+            pad_token_id=-1,
+            bos_token_id=-1,
+            precomputed_idx=precomputed_idx,
+        ).transpose(1, 2)
+
+        # retrieve the correct sequences each
+        audios = []
+        # TODO: see above, dac doesn't work in batches yet
+        with torch.no_grad():
+            for i in range(start_of_generation_idx.shape[0]):
+                output_i = output_sequences[i, :, start_of_generation_idx[i] : end_of_generation_idx[i]][None, ...]
+                output_i = output_i.to(self.audio_tokenizer.device)
+                audio_i = self.audio_tokenizer.decode(audio_codes=output_i).audio_values.cpu().squeeze()
+                audios.append(audio_i)
+
+        return audios
+
+    def decode(
+        self,
+        decoder_input_ids: "torch.Tensor",
+        audio_prompt_len: Optional[int] = None,
+        **kwargs: Unpack[DiaProcessorKwargs],
+    ) -> "torch.Tensor":
+        """
+        Decodes a single sequence of audio codebooks into the respective audio waveform via the
+        `audio_tokenizer`. See [`~DacModel.decode`] and [`~DiaProcessor.batch_decode`] for more information.
+        """
+        if decoder_input_ids.shape[0] != 1:
+            raise ValueError(
+                f"Expecting a single output to be decoded but received {decoder_input_ids.shape[0]} samples instead."
+            )
+
+        return self.batch_decode(decoder_input_ids, audio_prompt_len, **kwargs)[0]
+
+    def get_audio_prompt_len(
+        self,
+        decoder_attention_mask: "torch.Tensor",
+        **kwargs: Unpack[DiaProcessorKwargs],
+    ) -> int:
+        """Return the audio prompt length: the decoder attention mask length minus the largest delay."""
+        output_kwargs = self._merge_kwargs(
+            DiaProcessorKwargs,
+            **kwargs,
+        )
+        audio_kwargs = output_kwargs["audio_kwargs"]
+
+        delay_pattern = audio_kwargs.pop("delay_pattern", None)
+        if delay_pattern is None:
+            raise ValueError(
+                "To enable the utility of retrieving the prompt length for Dia, we need the "
+                "`delay_pattern`. You may have accidentally overwritten this."
+            )
+        return decoder_attention_mask.shape[1] - max(delay_pattern)
+
+    # Copied from transformers.models.csm.processing_csm.CsmProcessor.save_audio with Csm->Dia
+    def save_audio(
+        self,
+        audio: AudioInput,
+        saving_path: Union[str, Path, list[Union[str, Path]]],
+        **kwargs: Unpack[DiaProcessorKwargs],
+    ):
+        # TODO: @eustlb, this should be in AudioProcessor
+        if not is_soundfile_available():
+            raise ImportError("Please install `soundfile` to save audio files.")
+
+        # ensure correct audio input
+        audio = make_list_of_audio(audio)
+
+        # ensure correct saving path
+        if isinstance(saving_path, (str, Path)):
+            saving_path = [saving_path]
+        elif not (isinstance(saving_path, (list, tuple)) and all(isinstance(p, (str, Path)) for p in saving_path)):
+            raise ValueError("Invalid input path. Please provide a string, or a list of strings")
+
+        if len(audio) != len(saving_path):
+            raise ValueError("The number of audio and saving paths must be the same")
+
+        output_kwargs = self._merge_kwargs(
+            DiaProcessorKwargs,
+            **kwargs,
+        )
+        audio_kwargs = output_kwargs["audio_kwargs"]
+        sampling_rate = audio_kwargs["sampling_rate"]
+
+        for audio_value, p in zip(audio, saving_path):
+            if isinstance(audio_value, torch.Tensor):
+                audio_value = audio_value.cpu().float().numpy()
+            sf.write(p, audio_value, sampling_rate)
+
+    @staticmethod
+    def build_indices(
+        bsz: int,
+        seq_len: int,
+        num_channels: int,
+        delay_pattern: list[int],
+        revert: bool = False,
+    ) -> tuple["torch.Tensor", "torch.Tensor"]:
+        """
+        Precompute (sequence_idx, all_idx) so that out[seq, channel] = in[seq - delay[channel], channel]
+        or in[seq, channel] = out[seq + delay[channel], channel] if `revert`.
+        Negative sequence_idx => BOS; sequence_idx >= seq_len => PAD.
+        """
+        delay_array = torch.tensor(delay_pattern, dtype=torch.int32)
+
+        # (0..seq_len-1)
+        sequence_idx = torch.arange(seq_len, dtype=torch.int32)[None, :].expand(bsz, seq_len)[..., None]
+        # + or - delay depending if we delay or revert the delay
+        if not revert:
+            sequence_idx = sequence_idx - delay_array[None, None, :]
+        else:
+            sequence_idx = sequence_idx + delay_array[None, None, :]
+        # if delay goes over the range we clamp back to valid values
+        valid_sequence_idx = torch.clamp(sequence_idx, 0, seq_len - 1)
+
+        batch_idx = torch.arange(bsz, dtype=torch.int32)[:, None, None].expand(bsz, seq_len, num_channels)
+        channel_idx = torch.arange(num_channels, dtype=torch.int32)[None, None, :].expand(bsz, seq_len, num_channels)
+
+        all_idx = torch.stack(
+            [batch_idx.reshape(-1), valid_sequence_idx.reshape(-1), channel_idx.reshape(-1)],
+            dim=1,
+        ).long()
+
+        return sequence_idx, all_idx
+
+    @staticmethod
+    def apply_audio_delay(
+        audio: "torch.Tensor",
+        pad_token_id: int,
+        bos_token_id: int,
+        precomputed_idx: tuple["torch.Tensor", "torch.Tensor"],
+    ) -> "torch.Tensor":
+        """
+        Applies or reverts the delay pattern to batched audio tokens using precomputed indices,
+        inserting BOS where sequence_idx < 0 and PAD where sequence_idx >= seq_len.
+
+        Args:
+            audio: audio tokens of shape [bsz, seq_len, num_channels]
+            pad_token_id: the PAD token
+            bos_token_id: the BOS token
+            precomputed_idx: from `build_indices`
+
+        Returns:
+            final_audio: delayed or reverted audio tokens of shape [bsz, seq_len, num_channels]
+        """
+        # Move everything to the same device
+        device = audio.device
+        sequence_idx, all_idx = precomputed_idx
+        sequence_idx = sequence_idx.to(device)
+        all_idx = all_idx.to(device)
+
+        # Gather per precomputed indices
+        batch_idx, valid_sequence_idx, channel_idx = torch.unbind(all_idx, dim=-1)
+        gathered_audio = audio[batch_idx, valid_sequence_idx, channel_idx].view(audio.size())
+
+        # Mask according to negative sequence_idx => BOS; sequence_idx >= seq_len => PAD
+        mask_bos = sequence_idx < 0
+        mask_pad = sequence_idx >= audio.shape[1]
+        final_audio = torch.where(mask_bos, bos_token_id, torch.where(mask_pad, pad_token_id, gathered_audio))
+
+        return final_audio
+
+
+__all__ = ["DiaProcessor"]
diff --git a/phivenv/Lib/site-packages/transformers/models/dia/tokenization_dia.py b/phivenv/Lib/site-packages/transformers/models/dia/tokenization_dia.py
new file mode 100644
index 0000000000000000000000000000000000000000..4e205906ea709ee2c20f25b0bf6f4fa66ab1f4a4
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/dia/tokenization_dia.py
@@ -0,0 +1,118 @@
+# coding=utf-8
+# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization class for Dia."""
+
+from typing import Optional
+
+from ...tokenization_utils import AddedToken, PreTrainedTokenizer
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class DiaTokenizer(PreTrainedTokenizer):
+    """
+    Construct a Dia tokenizer. Dia simply uses raw bytes utf-8 encoding except for special tokens `[S1]` and `[S2]`.
+
+    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should
+    refer to this superclass for more information regarding those methods.
+
+    Args:
+        pad_token (`str`, *optional*, defaults to `"<pad>"`):
+            The token used for padding, for example when batching sequences of different lengths.
+        unk_token (`str`, *optional*, defaults to `"<pad>"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        max_length (`int`, *optional*, defaults to 1024):
+            The maximum length of the sequences when encoding. Sequences longer than this will be truncated.
+        offset (`int`, *optional*, defaults to 0):
+            The offset of the tokenizer.
+    """
+
+    model_input_names = ["input_ids", "attention_mask"]
+
+    def __init__(
+        self,
+        pad_token: Optional[str] = "<pad>",
+        unk_token: Optional[str] = "<pad>",
+        max_length: Optional[int] = 1024,
+        offset: int = 0,
+        **kwargs,
+    ):
+        # We have no eos/bos tokens but allow padding -- no l/r strip as we treat them as tokens as well
+        pad_token = AddedToken(pad_token) if isinstance(pad_token, str) else pad_token
+        unk_token = AddedToken(unk_token) if isinstance(unk_token, str) else unk_token
+
+        self._utf_vocab_size = 2**8  # utf is 8 bits
+        self._added_tokens_decoder = {0: pad_token, 1: AddedToken("[S1]"), 2: AddedToken("[S2]")}
+        self.offset = offset
+        super().__init__(
+            unk_token=unk_token,
+            pad_token=pad_token,
+            max_length=max_length,
+            **kwargs,
+        )
+
+    @property
+    def vocab_size(self):
+        return self._utf_vocab_size
+
+    def get_vocab(self):
+        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size + self.offset)}
+        vocab.update(self.added_tokens_encoder)
+        return vocab
+
+    def _tokenize(self, text: str) -> list[str]:
+        """Split `text` into one single-character token per UTF-8 byte (`chr(byte)` for each byte)."""
+        tokens = [chr(i) for i in text.encode("utf-8")]
+        return tokens
+
+    def _convert_token_to_id(self, token):
+        """Return `ord(token) + self.offset` for a single-character token, `None` for any other token."""
+
+        if len(token) != 1:
+            token_id = None
+        else:
+            token_id = ord(token) + self.offset
+
+        return token_id
+
+    def _convert_id_to_token(self, index):
+        """Return the single-character token `chr(index - self.offset)` for an id (integer)."""
+        token = chr(index - self.offset)
+        return token
+
+    def convert_tokens_to_string(self, tokens: list[str]) -> str:
+        """Join tokens into one string; each single-character token is restored to the raw byte it encodes."""
+        bstring = b""
+        for token in tokens:
+            if token in self.added_tokens_decoder:
+                added_token_obj = self.added_tokens_decoder[token]
+                tok_string = str(added_token_obj).encode("utf-8")
+            elif token in self.added_tokens_encoder or len(token) != 1 or ord(token) > 0xFF:
+                tok_string = token.encode("utf-8")  # special token or plain text: keep its UTF-8 form
+            else:
+                tok_string = bytes([ord(token)])  # `_tokenize` emits chr(byte); map it back to that byte
+            bstring += tok_string
+        string = bstring.decode("utf-8", errors="ignore")
+        return string
+
+    # No vocab file
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
+        return ()
+
+
+__all__ = ["DiaTokenizer"]
diff --git a/phivenv/Lib/site-packages/transformers/models/dialogpt/__init__.py b/phivenv/Lib/site-packages/transformers/models/dialogpt/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/phivenv/Lib/site-packages/transformers/models/dialogpt/__pycache__/__init__.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/dialogpt/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4b06d6ad9f14488f0a726ab69a8a9248449aab98
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/dialogpt/__pycache__/__init__.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/diffllama/__init__.py b/phivenv/Lib/site-packages/transformers/models/diffllama/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..c162fce0a48bd164bd0e0a615b942ee4805a12aa
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/diffllama/__init__.py
@@ -0,0 +1,27 @@
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_diffllama import *
+    from .modeling_diffllama import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/phivenv/Lib/site-packages/transformers/models/diffllama/__pycache__/__init__.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/diffllama/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..03f69bd1ed8d0bb58a459e810ff3fa9705115061
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/diffllama/__pycache__/__init__.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/diffllama/__pycache__/configuration_diffllama.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/diffllama/__pycache__/configuration_diffllama.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5d3efac68a74c72a036a9a40336dd0349f1c9e61
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/diffllama/__pycache__/configuration_diffllama.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/diffllama/__pycache__/modeling_diffllama.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/diffllama/__pycache__/modeling_diffllama.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3283f4912c39ac21d45c82cbbfd54507cf0793b2
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/diffllama/__pycache__/modeling_diffllama.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/diffllama/__pycache__/modular_diffllama.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/diffllama/__pycache__/modular_diffllama.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3724f97bf4bc12dbed158ffcc072d58f6577a9aa
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/diffllama/__pycache__/modular_diffllama.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/diffllama/configuration_diffllama.py b/phivenv/Lib/site-packages/transformers/models/diffllama/configuration_diffllama.py
new file mode 100644
index 0000000000000000000000000000000000000000..210607271927ab2f3a7aa1ec1e874fb296c32a73
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/diffllama/configuration_diffllama.py
@@ -0,0 +1,199 @@
+# coding=utf-8
+# Copyright 2024 weak-kajuma and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on Llama implementations in this library and Microsoft's
+# Differential Transformer implementations.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""DiffLlama model configuration"""
+
+from ...configuration_utils import PretrainedConfig
+from ...modeling_rope_utils import rope_config_validation
+
+
+class DiffLlamaConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`DiffLlamaModel`]. It is used to instantiate a DiffLlama
+    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults
+    will yield a similar configuration to that of the [kajuma/DiffLlama-0.3B-handcut](https://huggingface.co/kajuma/DiffLlama-0.3B-handcut).
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 32000):
+            Vocabulary size of the DiffLlama model. Defines the number of different tokens that can be represented by the
+            `input_ids` passed when calling [`DiffLlamaModel`]
+        hidden_size (`int`, *optional*, defaults to 2048):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 8192):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 16):
+            Number of hidden layers in the Transformer decoder.
+        num_attention_heads (`int`, *optional*, defaults to 32):
+            Number of attention heads for each attention layer in the Transformer decoder.
+        num_key_value_heads (`int`, *optional*):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details, check out [this
+            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
+            `num_attention_heads`.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the decoder.
+        max_position_embeddings (`int`, *optional*, defaults to 2048):
+            The maximum sequence length that this model might ever be used with.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        pad_token_id (`int`, *optional*):
+            Padding token id.
+        bos_token_id (`int`, *optional*, defaults to 1):
+            Beginning of stream token id.
+        eos_token_id (`int`, *optional*, defaults to 2):
+            End of stream token id.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether to tie weight embeddings
+        rope_theta (`float`, *optional*, defaults to 10000.0):
+            The base period of the RoPE embeddings.
+        rope_scaling (`Dict`, *optional*):
+            Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
+            and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
+            accordingly.
+            Expected contents:
+                `rope_type` (`str`):
+                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
+                    'llama3'], with 'default' being the original RoPE implementation.
+                `factor` (`float`, *optional*):
+                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
+                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
+                    original maximum pre-trained length.
+                `original_max_position_embeddings` (`int`, *optional*):
+                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
+                    pretraining.
+                `attention_factor` (`float`, *optional*):
+                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
+                    computation. If unspecified, it defaults to value recommended by the implementation, using the
+                    `factor` field to infer the suggested value.
+                `beta_fast` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 32.
+                `beta_slow` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 1.
+                `short_factor` (`list[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `long_factor` (`list[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to long contexts (>
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `low_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
+                `high_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
+        attention_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use a bias in the query, key, value and output projection layers during self-attention.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        lambda_std_dev (`float`, *optional*, defaults to 0.1):
+            The standard deviation for initialization of parameter lambda in attention layer.
+        head_dim (`int`, *optional*):
+            The attention head dimension. If None, it will default to hidden_size // num_attention_heads
+
+    ```python
+    >>> from transformers import DiffLlamaModel, DiffLlamaConfig
+
+    >>> # Initializing a DiffLlama-0.3B-handcut style configuration
+    >>> configuration = DiffLlamaConfig()
+
+    >>> # Initializing a model from the DiffLlama-0.3B-handcut style configuration
+    >>> model = DiffLlamaModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "diffllama"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        vocab_size=32000,
+        hidden_size=2048,
+        intermediate_size=8192,
+        num_hidden_layers=16,
+        num_attention_heads=32,
+        num_key_value_heads=None,
+        hidden_act="silu",
+        max_position_embeddings=2048,
+        initializer_range=0.02,
+        rms_norm_eps=1e-5,
+        use_cache=True,
+        pad_token_id=None,
+        bos_token_id=1,
+        eos_token_id=2,
+        tie_word_embeddings=False,
+        rope_theta=10000.0,
+        rope_scaling=None,
+        attention_bias=False,
+        attention_dropout=0.0,
+        lambda_std_dev=0.1,
+        head_dim=None,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+        self.lambda_std_dev = lambda_std_dev
+        self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads
+        # Validate the correctness of rotary position embeddings parameters
+        # BC: if there is a 'type' field, copy it to 'rope_type'.
+        if self.rope_scaling is not None and "type" in self.rope_scaling:
+            self.rope_scaling["rope_type"] = self.rope_scaling["type"]
+        rope_config_validation(self)
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+
+__all__ = ["DiffLlamaConfig"]
diff --git a/phivenv/Lib/site-packages/transformers/models/diffllama/modeling_diffllama.py b/phivenv/Lib/site-packages/transformers/models/diffllama/modeling_diffllama.py
new file mode 100644
index 0000000000000000000000000000000000000000..094cc375057f71eb51644bf2b49c524613ed22e1
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/diffllama/modeling_diffllama.py
@@ -0,0 +1,767 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/diffllama/modular_diffllama.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_diffllama.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# coding=utf-8
+# Copyright 2024 weak-kajuma and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on Llama implementations in this library and Microsoft's
+# Differential Transformer implementations.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import math
+from typing import Optional, Union
+
+import torch
+from torch import nn
+
+from ...activations import ACT2FN
+from ...cache_utils import Cache, DynamicCache, StaticCache
+from ...generation import GenerationMixin
+from ...integrations import use_kernel_forward_from_hub
+from ...masking_utils import create_causal_mask
+from ...modeling_flash_attention_utils import _flash_attention_forward, flash_attn_supports_top_left_mask
+from ...modeling_layers import (
+    GenericForQuestionAnswering,
+    GenericForSequenceClassification,
+    GenericForTokenClassification,
+    GradientCheckpointingLayer,
+)
+from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
+from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
+from ...modeling_utils import PreTrainedModel
+from ...processing_utils import Unpack
+from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging
+from ...utils.deprecation import deprecate_kwarg
+from ...utils.generic import check_model_inputs
+from .configuration_diffllama import DiffLlamaConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+class DiffLlamaMLP(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.intermediate_size = config.intermediate_size
+        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
+        self.act_fn = ACT2FN[config.hidden_act]
+
+    def forward(self, x):
+        down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
+        return down_proj
+
+
+def rotate_half(x):
+    """Rotates half the hidden dims of the input."""
+    x1 = x[..., : x.shape[-1] // 2]
+    x2 = x[..., x.shape[-1] // 2 :]
+    return torch.cat((-x2, x1), dim=-1)
+
+
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
+    """Applies Rotary Position Embedding to the query and key tensors.
+
+    Args:
+        q (`torch.Tensor`): The query tensor.
+        k (`torch.Tensor`): The key tensor.
+        cos (`torch.Tensor`): The cosine part of the rotary embedding.
+        sin (`torch.Tensor`): The sine part of the rotary embedding.
+        position_ids (`torch.Tensor`, *optional*):
+            Deprecated and unused.
+        unsqueeze_dim (`int`, *optional*, defaults to 1):
+            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
+            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
+            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
+            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
+            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
+            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
+    Returns:
+        `tuple(torch.Tensor)` comprising the query and key tensors rotated using the Rotary Position Embedding.
+    """
+    cos = cos.unsqueeze(unsqueeze_dim)
+    sin = sin.unsqueeze(unsqueeze_dim)
+    q_embed = (q * cos) + (rotate_half(q) * sin)
+    k_embed = (k * cos) + (rotate_half(k) * sin)
+    return q_embed, k_embed
+
+
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """
+    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
+    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
+    """
+    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+    if n_rep == 1:
+        return hidden_states
+    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
+    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
+
+
+def lambda_init_fn(layer_idx):
+    return 0.8 - 0.6 * math.exp(-0.3 * layer_idx)
+
+
+class DiffLlamaAttention(nn.Module):
+    """Multi-headed differential attention: the output of one half of the heads is subtracted, scaled by a learned lambda, from the other half."""
+
+    def __init__(self, config: DiffLlamaConfig, layer_idx: Optional[int] = None):
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+        if layer_idx is None:
+            logger.warning_once(
+                f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
+                "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
+                "when creating this class."
+            )
+
+        self.attention_dropout = config.attention_dropout
+        self.hidden_size = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = getattr(config, "head_dim", self.hidden_size // self.num_heads)
+        self.num_key_value_heads = config.num_key_value_heads
+        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
+        # the two attributes below (max_position_embeddings, rope_theta) are not used
+        self.max_position_embeddings = config.max_position_embeddings
+        self.rope_theta = config.rope_theta
+        self.is_causal = True
+
+        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias)
+        self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
+        self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
+        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.attention_bias)
+
+        self.lambda_init = lambda_init_fn(layer_idx)
+        self.lambda_q1 = nn.Parameter(torch.normal(0, config.lambda_std_dev, size=(self.head_dim,)))
+        self.lambda_k1 = nn.Parameter(torch.normal(0, config.lambda_std_dev, size=(self.head_dim,)))
+        self.lambda_q2 = nn.Parameter(torch.normal(0, config.lambda_std_dev, size=(self.head_dim,)))
+        self.lambda_k2 = nn.Parameter(torch.normal(0, config.lambda_std_dev, size=(self.head_dim,)))
+        self.groupnorm = nn.RMSNorm(2 * self.head_dim, eps=config.rms_norm_eps, elementwise_affine=False)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        use_cache: bool = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
+        bsz, target_len, _ = hidden_states.size()
+        q_len = target_len
+
+        query_states = self.q_proj(hidden_states)
+        key_states = self.k_proj(hidden_states)
+        value_states = self.v_proj(hidden_states)
+
+        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+
+        cos, sin = position_embeddings
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+
+        if past_key_values is not None:
+            # sin and cos are specific to RoPE models; cache_position needed for the static cache
+            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        key_states = repeat_kv(key_states, self.num_key_value_groups)
+        value_states = repeat_kv(value_states, self.num_key_value_groups)
+        value_states = torch.cat(torch.chunk(value_states, 2, dim=1), dim=-1)
+        value_states = value_states.repeat(1, 2, 1, 1)
+
+        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
+
+        if attention_mask is not None:  # no matter the length, we just slice it
+            causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
+            attn_weights = attn_weights + causal_mask
+
+        # upcast attention to fp32
+        attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
+        attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
+        lambda_1 = torch.exp(torch.sum(self.lambda_q1 * self.lambda_k1, dim=-1, dtype=torch.float32)).to(
+            query_states.dtype
+        )
+        lambda_2 = torch.exp(torch.sum(self.lambda_q2 * self.lambda_k2, dim=-1, dtype=torch.float32)).to(
+            query_states.dtype
+        )
+        lambda_full = lambda_1 - lambda_2 + self.lambda_init
+
+        attn_output = torch.matmul(attn_weights, value_states)
+        attn_output1, attn_output2 = torch.chunk(attn_output, 2, dim=1)
+
+        attn_output = attn_output1 - lambda_full * attn_output2
+        attn_output = (1 - self.lambda_init) * self.groupnorm(attn_output)
+        attn_output = attn_output.transpose(1, 2).contiguous()
+        attn_output = attn_output.reshape(bsz, q_len, -1)
+        attn_output = self.o_proj(attn_output)
+        return attn_output, attn_weights
+
+
+class DiffLlamaFlashAttention2(DiffLlamaAttention):
+    """
+    DiffLlama flash attention module. This module inherits from `DiffLlamaAttention` as the weights of the module stay
+    untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
+    flash attention and deal with padding tokens in case the input contains any of them.
+    """
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        # TODO: Should be removed once Flash Attention for ROCm is bumped to 2.1.
+        # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignment, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
+        # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
+        self._flash_attn_uses_top_left_mask = flash_attn_supports_top_left_mask()
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
+        attention_mask: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        use_cache: bool = False,
+        cache_position: Optional[torch.LongTensor] = None,
+    ) -> tuple[torch.Tensor, None]:
+        if isinstance(past_key_values, StaticCache):
+            raise ValueError(
+                "`static` cache implementation is not compatible with `attn_implementation==flash_attention_2` "
+                "make sure to use `sdpa` in the mean time, and open an issue at https://github.com/huggingface/transformers"
+            )
+
+        bsz, q_len, _ = hidden_states.size()
+
+        query_states = self.q_proj(hidden_states)
+        key_states = self.k_proj(hidden_states)
+        value_states = self.v_proj(hidden_states)
+
+        # Flash attention requires the input to have the shape
+        # batch_size x seq_length x num_heads x head_dim
+        # therefore we just need to keep the original shape
+        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+
+        if position_embeddings is None:
+            logger.warning_once(
+                "The attention layers in this model are transitioning from computing the RoPE embeddings internally "
+                "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed "
+                "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be "
+                "removed and `position_embeddings` will be mandatory."
+            )
+            cos, sin = self.rotary_emb(value_states, position_ids)
+        else:
+            cos, sin = position_embeddings
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+
+        if past_key_values is not None:
+            # sin and cos are specific to RoPE models; cache_position needed for the static cache
+            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        # TODO: These transposes are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache
+        # to be able to avoid many of these transpose/reshape/view.
+        query_states = query_states.transpose(1, 2)
+        key_states = key_states.transpose(1, 2)
+        value_states = value_states.transpose(1, 2)
+
+        dropout_rate = self.attention_dropout if self.training else 0.0
+
+        # In PEFT, usually we cast the layer norms to float32 for training stability reasons,
+        # therefore the input hidden states get silently cast to float32. Hence, we need to
+        # cast them back to the correct dtype just to be sure everything works as expected.
+        # This might slow down training & inference, so it is recommended not to cast the LayerNorms
+        # to fp32. (DiffLlamaRMSNorm handles it correctly)
+
+        input_dtype = query_states.dtype
+        device_type = query_states.device.type if query_states.device.type != "mps" else "cpu"
+        if input_dtype == torch.float32:
+            if torch.is_autocast_enabled():
+                target_dtype = (
+                    torch.get_autocast_dtype(device_type)
+                    if hasattr(torch, "get_autocast_dtype")
+                    else torch.get_autocast_gpu_dtype()
+                )
+            # Handle the case where the model is quantized
+            elif hasattr(self.config, "_pre_quantization_dtype"):
+                target_dtype = self.config._pre_quantization_dtype
+            else:
+                target_dtype = self.q_proj.weight.dtype
+
+            logger.warning_once(
+                f"The input hidden states seems to be silently casted in float32, this might be related to"
+                f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
+                f" {target_dtype}."
+            )
+
+            query_states = query_states.to(target_dtype)
+            key_states = key_states.to(target_dtype)
+            value_states = value_states.to(target_dtype)
+
+        value_states1, value_states2 = torch.chunk(value_states, 2, dim=2)
+        value_states1 = value_states1.repeat(1, 1, 2, 1)
+        value_states2 = value_states2.repeat(1, 1, 2, 1)
+
+        attn_output1 = _flash_attention_forward(
+            query_states,
+            key_states,
+            value_states1,
+            attention_mask,
+            q_len,
+            position_ids=position_ids,
+            dropout=dropout_rate,
+            sliding_window=getattr(self, "sliding_window", None),
+            use_top_left_mask=self._flash_attn_uses_top_left_mask,
+            is_causal=self.is_causal,
+        )
+
+        attn_output2 = _flash_attention_forward(
+            query_states,
+            key_states,
+            value_states2,
+            attention_mask,
+            q_len,
+            position_ids=position_ids,
+            dropout=dropout_rate,
+            sliding_window=getattr(self, "sliding_window", None),
+            use_top_left_mask=self._flash_attn_uses_top_left_mask,
+            is_causal=self.is_causal,
+        )
+
+        attn_output = torch.cat([attn_output1, attn_output2], dim=-1)
+        attn_output1, attn_output2 = torch.chunk(attn_output, 2, dim=2)
+
+        lambda_1 = torch.exp(torch.sum(self.lambda_q1 * self.lambda_k1, dim=-1, dtype=torch.float32)).to(
+            query_states.dtype
+        )
+        lambda_2 = torch.exp(torch.sum(self.lambda_q2 * self.lambda_k2, dim=-1, dtype=torch.float32)).to(
+            query_states.dtype
+        )
+        lambda_full = lambda_1 - lambda_2 + self.lambda_init
+
+        attn_output = attn_output1 - lambda_full * attn_output2
+        attn_output = (1 - self.lambda_init) * self.groupnorm(attn_output)
+        attn_output = attn_output.reshape(bsz, q_len, -1).contiguous()
+        attn_output = self.o_proj(attn_output)
+        return attn_output, None
+
+
+class DiffLlamaSdpaAttention(DiffLlamaAttention):
+    """
+    DiffLlama attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
+    `DiffLlamaAttention` as the weights of the module stay untouched. The only changes are on the forward pass to adapt to
+    SDPA API.
+    """
+
+    # Adapted from DiffLlamaAttention.forward
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        use_cache: bool = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
+        bsz, q_len, _ = hidden_states.size()
+
+        query_states = self.q_proj(hidden_states)
+        key_states = self.k_proj(hidden_states)
+        value_states = self.v_proj(hidden_states)
+
+        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+
+        cos, sin = position_embeddings
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+
+        if past_key_values is not None:
+            # sin and cos are specific to RoPE models; cache_position needed for the static cache
+            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        key_states = repeat_kv(key_states, self.num_key_value_groups)
+        value_states = repeat_kv(value_states, self.num_key_value_groups)
+        value_states = torch.cat(torch.chunk(value_states, 2, dim=1), dim=-1)
+        value_states = value_states.repeat(1, 2, 1, 1)
+
+        causal_mask = attention_mask
+        if attention_mask is not None:
+            causal_mask = causal_mask[:, :, :, : key_states.shape[-2]]
+
+        # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
+        # Reference: https://github.com/pytorch/pytorch/issues/112577.
+        if query_states.device.type == "cuda" and causal_mask is not None:
+            query_states = query_states.contiguous()
+            key_states = key_states.contiguous()
+            value_states = value_states.contiguous()
+
+        # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
+        # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
+        is_causal = causal_mask is None and q_len > 1
+
+        attn_output = torch.nn.functional.scaled_dot_product_attention(
+            query_states,
+            key_states,
+            value_states,
+            attn_mask=causal_mask,
+            dropout_p=self.attention_dropout if self.training else 0.0,
+            is_causal=is_causal,
+        )
+
+        attn_output1, attn_output2 = torch.chunk(attn_output, 2, dim=1)
+
+        lambda_1 = torch.exp(torch.sum(self.lambda_q1 * self.lambda_k1, dim=-1, dtype=torch.float32)).to(
+            query_states.dtype
+        )
+        lambda_2 = torch.exp(torch.sum(self.lambda_q2 * self.lambda_k2, dim=-1, dtype=torch.float32)).to(
+            query_states.dtype
+        )
+        lambda_full = lambda_1 - lambda_2 + self.lambda_init
+
+        attn_output = attn_output1 - lambda_full * attn_output2
+        attn_output = (1 - self.lambda_init) * self.groupnorm(attn_output)
+        attn_output = attn_output.transpose(1, 2).contiguous()
+        attn_output = attn_output.view(bsz, q_len, -1)
+        attn_output = self.o_proj(attn_output)
+        return attn_output, None
+
+
+@use_kernel_forward_from_hub("RMSNorm")
+class DiffLlamaRMSNorm(nn.Module):
+    def __init__(self, hidden_size, eps=1e-6):
+        """
+        DiffLlamaRMSNorm is equivalent to T5LayerNorm
+        """
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+
+    def forward(self, hidden_states):
+        input_dtype = hidden_states.dtype
+        hidden_states = hidden_states.to(torch.float32)
+        variance = hidden_states.pow(2).mean(-1, keepdim=True)
+        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+        return self.weight * hidden_states.to(input_dtype)
+
+    def extra_repr(self):
+        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
+
+
+# Lookup from the `config._attn_implementation` string to the attention class that `DiffLlamaDecoderLayer`
+# instantiates for its `self_attn` sub-module.
+DIFFLLAMA_ATTENTION_CLASSES = {
+    "eager": DiffLlamaAttention,
+    "flash_attention_2": DiffLlamaFlashAttention2,
+    "sdpa": DiffLlamaSdpaAttention,
+}
+
+
+class DiffLlamaDecoderLayer(GradientCheckpointingLayer):
+    def __init__(self, config: DiffLlamaConfig, layer_idx: int):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+
+        self.self_attn = DIFFLLAMA_ATTENTION_CLASSES[config._attn_implementation](config=config, layer_idx=layer_idx)
+
+        self.mlp = DiffLlamaMLP(config)
+        self.input_layernorm = DiffLlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = DiffLlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        use_cache: Optional[bool] = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> torch.Tensor:
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+        # Self Attention
+        hidden_states, _ = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            position_embeddings=position_embeddings,
+            **kwargs,
+        )
+        hidden_states = residual + hidden_states
+
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states
+        return hidden_states
+
+
+@auto_docstring
+class DiffLlamaPreTrainedModel(PreTrainedModel):
+    config: DiffLlamaConfig
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["DiffLlamaDecoderLayer"]
+    _skip_keys_device_placement = ["past_key_values"]
+    _supports_flash_attn = True
+    _supports_sdpa = True
+    _supports_flex_attn = False
+
+    _can_compile_fullgraph = True
+    _supports_attention_backend = False
+    _can_record_outputs = {
+        "hidden_states": DiffLlamaDecoderLayer,
+        "attentions": DiffLlamaAttention,
+    }
+
+    def _init_weights(self, module):
+        super()._init_weights(module)
+        if isinstance(module, DiffLlamaAttention):
+            module.lambda_q1.data.normal_(0, self.config.lambda_std_dev)
+            module.lambda_k1.data.normal_(0, self.config.lambda_std_dev)
+            module.lambda_q2.data.normal_(0, self.config.lambda_std_dev)
+            module.lambda_k2.data.normal_(0, self.config.lambda_std_dev)
+
+
+class DiffLlamaRotaryEmbedding(nn.Module):
+    inv_freq: torch.Tensor  # fix linting for `register_buffer`
+
+    def __init__(self, config: DiffLlamaConfig, device=None):
+        super().__init__()
+        # BC: "rope_type" was originally "type"
+        if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict):
+            self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
+        else:
+            self.rope_type = "default"
+        self.max_seq_len_cached = config.max_position_embeddings
+        self.original_max_seq_len = config.max_position_embeddings
+
+        self.config = config
+        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
+
+        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+        self.original_inv_freq = self.inv_freq
+
+    @torch.no_grad()
+    @dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
+    def forward(self, x, position_ids):
+        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
+        position_ids_expanded = position_ids[:, None, :].float()
+
+        device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
+        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
+            emb = torch.cat((freqs, freqs), dim=-1)
+            cos = emb.cos() * self.attention_scaling
+            sin = emb.sin() * self.attention_scaling
+
+        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
+
+
+# Decoder stack without a language-model head: token embedding, `config.num_hidden_layers` decoder layers,
+# a final RMS norm, and one rotary-embedding module whose output is shared by every layer.
+@auto_docstring
+class DiffLlamaModel(DiffLlamaPreTrainedModel):
+    def __init__(self, config: DiffLlamaConfig):
+        super().__init__(config)
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+
+        # `padding_idx` is passed positionally as the third argument of `nn.Embedding`.
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+        self.layers = nn.ModuleList(
+            [DiffLlamaDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+        )
+        self.norm = DiffLlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.rotary_emb = DiffLlamaRotaryEmbedding(config=config)
+        self.gradient_checkpointing = False
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @check_model_inputs
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> BaseModelOutputWithPast:
+        # Raises when both `input_ids` and `inputs_embeds` are given, and when neither is.
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+        if inputs_embeds is None:
+            inputs_embeds: torch.Tensor = self.embed_tokens(input_ids)
+
+        # A fresh cache is only created when caching was requested and the caller supplied none.
+        if use_cache and past_key_values is None:
+            past_key_values = DynamicCache(config=self.config)
+
+        # Default cache positions continue from the number of tokens already held by the cache (0 without one).
+        if cache_position is None:
+            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+            cache_position: torch.Tensor = torch.arange(
+                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
+            )
+
+        # Default position ids are the cache positions with a leading axis of size 1.
+        if position_ids is None:
+            position_ids = cache_position.unsqueeze(0)
+
+        causal_mask = create_causal_mask(
+            config=self.config,
+            input_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            cache_position=cache_position,
+            past_key_values=past_key_values,
+            position_ids=position_ids,
+        )
+
+        hidden_states = inputs_embeds
+        # Rotary tables are computed once here and handed to every decoder layer.
+        position_embeddings = self.rotary_emb(hidden_states, position_ids)
+
+        # NOTE: `use_cache` is not forwarded to the layers; they receive the cache object itself.
+        for decoder_layer in self.layers[: self.config.num_hidden_layers]:
+            hidden_states = decoder_layer(
+                hidden_states,
+                attention_mask=causal_mask,
+                position_ids=position_ids,
+                past_key_values=past_key_values,
+                cache_position=cache_position,
+                position_embeddings=position_embeddings,
+                **kwargs,
+            )
+
+        hidden_states = self.norm(hidden_states)
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values,
+        )
+
+
+# `DiffLlamaModel` followed by a bias-free linear projection from the hidden size to the vocabulary size.
+@auto_docstring
+class DiffLlamaForCausalLM(DiffLlamaPreTrainedModel, GenerationMixin):
+    _tied_weights_keys = ["lm_head.weight"]
+    _tp_plan = {"lm_head": "colwise_rep"}
+    _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.model = DiffLlamaModel(config)
+        self.vocab_size = config.vocab_size
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> CausalLMOutputWithPast:
+        r"""
+        Example:
+
+        ```python
+        >>> from transformers import AutoTokenizer, DiffLlamaForCausalLM
+
+        >>> model = DiffLlamaForCausalLM.from_pretrained("kajuma/DiffLlama-0.3B-handcut")
+        >>> tokenizer = AutoTokenizer.from_pretrained("kajuma/DiffLlama-0.3B-handcut")
+
+        >>> prompt = "What is your favorite condiment?"
+        >>> inputs = tokenizer(prompt, return_tensors="pt")
+
+        >>> # Generate
+        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
+        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "What is your favorite condiment?"
+        ```"""
+        outputs: BaseModelOutputWithPast = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            **kwargs,
+        )
+
+        hidden_states = outputs.last_hidden_state
+        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
+        # An int selects the trailing `logits_to_keep` positions (`slice(-0, None)` keeps every position);
+        # any other value is used directly as the index along the sequence axis.
+        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+        logits = self.lm_head(hidden_states[:, slice_indices, :])
+
+        # The loss is only computed when labels are supplied; extra kwargs are forwarded to the loss function.
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs)
+
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+# Sequence-classification variant: adds nothing of its own, all behaviour is inherited from the two bases.
+class DiffLlamaForSequenceClassification(GenericForSequenceClassification, DiffLlamaPreTrainedModel):
+    pass
+
+
+# Question-answering variant: inherits everything from the two bases and only overrides `base_model_prefix`.
+class DiffLlamaForQuestionAnswering(GenericForQuestionAnswering, DiffLlamaPreTrainedModel):
+    base_model_prefix = "transformer"  # For BC, where `transformer` was used instead of `model`
+
+
+# Token-classification variant: adds nothing of its own, all behaviour is inherited from the two bases.
+class DiffLlamaForTokenClassification(GenericForTokenClassification, DiffLlamaPreTrainedModel):
+    pass
+
+
+# Names exported by `from ... import *`; the attention, norm, rotary-embedding and decoder-layer classes
+# defined in this module are deliberately not listed.
+__all__ = [
+    "DiffLlamaPreTrainedModel",
+    "DiffLlamaModel",
+    "DiffLlamaForCausalLM",
+    "DiffLlamaForSequenceClassification",
+    "DiffLlamaForQuestionAnswering",
+    "DiffLlamaForTokenClassification",
+]
diff --git a/phivenv/Lib/site-packages/transformers/models/diffllama/modular_diffllama.py b/phivenv/Lib/site-packages/transformers/models/diffllama/modular_diffllama.py
new file mode 100644
index 0000000000000000000000000000000000000000..fc0b7a9172d37b78c384177b29959b56ec0437d2
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/diffllama/modular_diffllama.py
@@ -0,0 +1,447 @@
+# coding=utf-8
+# Copyright 2024 weak-kajuma and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on Llama implementations in this library and Microsoft's
+# Differential Transformer implementations.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import math
+from typing import Optional
+
+import torch
+from torch import nn
+
+from ...cache_utils import Cache, StaticCache
+from ...modeling_flash_attention_utils import _flash_attention_forward, flash_attn_supports_top_left_mask
+from ...modeling_utils import PreTrainedModel
+from ...utils import logging
+from ...utils.deprecation import deprecate_kwarg
+from ..gemma.modeling_gemma import GemmaForCausalLM
+from ..llama.modeling_llama import (
+    LlamaDecoderLayer,
+    LlamaForQuestionAnswering,
+    LlamaForSequenceClassification,
+    LlamaForTokenClassification,
+    LlamaModel,
+    LlamaPreTrainedModel,
+    apply_rotary_pos_emb,
+    repeat_kv,
+)
+from ..mistral.modeling_mistral import MistralMLP
+from .configuration_diffllama import DiffLlamaConfig
+
+
+# Module-level logger, used for the one-time warnings emitted by the attention classes below.
+logger = logging.get_logger(__name__)
+
+# NOTE(review): neither constant is referenced in the visible part of this file; presumably they are read by
+# documentation tooling -- confirm before removing.
+_CHECKPOINT_FOR_DOC = "kajuma/DiffLlama-0.3B-handcut"
+_CONFIG_FOR_DOC = "DiffLlamaConfig"
+
+
+# Feed-forward block: reuses `MistralMLP` unchanged under a DiffLlama-specific name.
+class DiffLlamaMLP(MistralMLP):
+    pass
+
+
+def lambda_init_fn(layer_idx):
+    return 0.8 - 0.6 * math.exp(-0.3 * layer_idx)
+
+
+class DiffLlamaAttention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+    def __init__(self, config: DiffLlamaConfig, layer_idx: Optional[int] = None):
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+        if layer_idx is None:
+            logger.warning_once(
+                f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
+                "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
+                "when creating this class."
+            )
+
+        self.attention_dropout = config.attention_dropout
+        self.hidden_size = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = getattr(config, "head_dim", self.hidden_size // self.num_heads)
+        self.num_key_value_heads = config.num_key_value_heads
+        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
+        # under this are not used
+        self.max_position_embeddings = config.max_position_embeddings
+        self.rope_theta = config.rope_theta
+        self.is_causal = True
+
+        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias)
+        self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
+        self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
+        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.attention_bias)
+
+        self.lambda_init = lambda_init_fn(layer_idx)
+        self.lambda_q1 = nn.Parameter(torch.normal(0, config.lambda_std_dev, size=(self.head_dim,)))
+        self.lambda_k1 = nn.Parameter(torch.normal(0, config.lambda_std_dev, size=(self.head_dim,)))
+        self.lambda_q2 = nn.Parameter(torch.normal(0, config.lambda_std_dev, size=(self.head_dim,)))
+        self.lambda_k2 = nn.Parameter(torch.normal(0, config.lambda_std_dev, size=(self.head_dim,)))
+        self.groupnorm = nn.RMSNorm(2 * self.head_dim, eps=config.rms_norm_eps, elementwise_affine=False)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        use_cache: bool = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
+        bsz, target_len, _ = hidden_states.size()
+        q_len = target_len
+
+        query_states = self.q_proj(hidden_states)
+        key_states = self.k_proj(hidden_states)
+        value_states = self.v_proj(hidden_states)
+
+        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+
+        cos, sin = position_embeddings
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+
+        if past_key_values is not None:
+            # sin and cos are specific to RoPE models; cache_position needed for the static cache
+            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        key_states = repeat_kv(key_states, self.num_key_value_groups)
+        value_states = repeat_kv(value_states, self.num_key_value_groups)
+        value_states = torch.cat(torch.chunk(value_states, 2, dim=1), dim=-1)
+        value_states = value_states.repeat(1, 2, 1, 1)
+
+        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
+
+        if attention_mask is not None:  # no matter the length, we just slice it
+            causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
+            attn_weights = attn_weights + causal_mask
+
+        # upcast attention to fp32
+        attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
+        attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
+        lambda_1 = torch.exp(torch.sum(self.lambda_q1 * self.lambda_k1, dim=-1, dtype=torch.float32)).to(
+            query_states.dtype
+        )
+        lambda_2 = torch.exp(torch.sum(self.lambda_q2 * self.lambda_k2, dim=-1, dtype=torch.float32)).to(
+            query_states.dtype
+        )
+        lambda_full = lambda_1 - lambda_2 + self.lambda_init
+
+        attn_output = torch.matmul(attn_weights, value_states)
+        attn_output1, attn_output2 = torch.chunk(attn_output, 2, dim=1)
+
+        attn_output = attn_output1 - lambda_full * attn_output2
+        attn_output = (1 - self.lambda_init) * self.groupnorm(attn_output)
+        attn_output = attn_output.transpose(1, 2).contiguous()
+        attn_output = attn_output.reshape(bsz, q_len, -1)
+        attn_output = self.o_proj(attn_output)
+        return attn_output, attn_weights
+
+
+class DiffLlamaFlashAttention2(DiffLlamaAttention):
+    """
+    DiffLlama flash attention module. This module inherits from `DiffLlamaAttention` as the weights of the module stays
+    untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
+    flash attention and deal with padding tokens in case the input contains any of them.
+    """
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
+        # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignment, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
+        # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
+        self._flash_attn_uses_top_left_mask = flash_attn_supports_top_left_mask()
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
+        attention_mask: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        use_cache: bool = False,
+        cache_position: Optional[torch.LongTensor] = None,
+    ) -> tuple[torch.Tensor, None]:
+        if isinstance(past_key_values, StaticCache):
+            raise ValueError(
+                "`static` cache implementation is not compatible with `attn_implementation==flash_attention_2` "
+                "make sure to use `sdpa` in the mean time, and open an issue at https://github.com/huggingface/transformers"
+            )
+
+        bsz, q_len, _ = hidden_states.size()
+
+        query_states = self.q_proj(hidden_states)
+        key_states = self.k_proj(hidden_states)
+        value_states = self.v_proj(hidden_states)
+
+        # Flash attention requires the input to have the shape
+        # batch_size x seq_length x head_dim x hidden_dim
+        # therefore we just need to keep the original shape
+        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+
+        if position_embeddings is None:
+            logger.warning_once(
+                "The attention layers in this model are transitioning from computing the RoPE embeddings internally "
+                "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed "
+                "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be "
+                "removed and `position_embeddings` will be mandatory."
+            )
+            cos, sin = self.rotary_emb(value_states, position_ids)
+        else:
+            cos, sin = position_embeddings
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+
+        if past_key_values is not None:
+            # sin and cos are specific to RoPE models; cache_position needed for the static cache
+            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache
+        # to be able to avoid many of these transpose/reshape/view.
+        query_states = query_states.transpose(1, 2)
+        key_states = key_states.transpose(1, 2)
+        value_states = value_states.transpose(1, 2)
+
+        dropout_rate = self.attention_dropout if self.training else 0.0
+
+        # In PEFT, usually we cast the layer norms in float32 for training stability reasons
+        # therefore the input hidden states gets silently casted in float32. Hence, we need
+        # cast them back in the correct dtype just to be sure everything works as expected.
+        # This might slowdown training & inference so it is recommended to not cast the LayerNorms
+        # in fp32. (DiffLlamaRMSNorm handles it correctly)
+
+        input_dtype = query_states.dtype
+        device_type = query_states.device.type if query_states.device.type != "mps" else "cpu"
+        if input_dtype == torch.float32:
+            if torch.is_autocast_enabled():
+                target_dtype = (
+                    torch.get_autocast_dtype(device_type)
+                    if hasattr(torch, "get_autocast_dtype")
+                    else torch.get_autocast_gpu_dtype()
+                )
+            # Handle the case where the model is quantized
+            elif hasattr(self.config, "_pre_quantization_dtype"):
+                target_dtype = self.config._pre_quantization_dtype
+            else:
+                target_dtype = self.q_proj.weight.dtype
+
+            logger.warning_once(
+                f"The input hidden states seems to be silently casted in float32, this might be related to"
+                f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
+                f" {target_dtype}."
+            )
+
+            query_states = query_states.to(target_dtype)
+            key_states = key_states.to(target_dtype)
+            value_states = value_states.to(target_dtype)
+
+        value_states1, value_states2 = torch.chunk(value_states, 2, dim=2)
+        value_states1 = value_states1.repeat(1, 1, 2, 1)
+        value_states2 = value_states2.repeat(1, 1, 2, 1)
+
+        attn_output1 = _flash_attention_forward(
+            query_states,
+            key_states,
+            value_states1,
+            attention_mask,
+            q_len,
+            position_ids=position_ids,
+            dropout=dropout_rate,
+            sliding_window=getattr(self, "sliding_window", None),
+            use_top_left_mask=self._flash_attn_uses_top_left_mask,
+            is_causal=self.is_causal,
+        )
+
+        attn_output2 = _flash_attention_forward(
+            query_states,
+            key_states,
+            value_states2,
+            attention_mask,
+            q_len,
+            position_ids=position_ids,
+            dropout=dropout_rate,
+            sliding_window=getattr(self, "sliding_window", None),
+            use_top_left_mask=self._flash_attn_uses_top_left_mask,
+            is_causal=self.is_causal,
+        )
+
+        attn_output = torch.cat([attn_output1, attn_output2], dim=-1)
+        attn_output1, attn_output2 = torch.chunk(attn_output, 2, dim=2)
+
+        lambda_1 = torch.exp(torch.sum(self.lambda_q1 * self.lambda_k1, dim=-1, dtype=torch.float32)).to(
+            query_states.dtype
+        )
+        lambda_2 = torch.exp(torch.sum(self.lambda_q2 * self.lambda_k2, dim=-1, dtype=torch.float32)).to(
+            query_states.dtype
+        )
+        lambda_full = lambda_1 - lambda_2 + self.lambda_init
+
+        attn_output = attn_output1 - lambda_full * attn_output2
+        attn_output = (1 - self.lambda_init) * self.groupnorm(attn_output)
+        attn_output = attn_output.reshape(bsz, q_len, -1).contiguous()
+        attn_output = self.o_proj(attn_output)
+        return attn_output, None
+
+
+class DiffLlamaSdpaAttention(DiffLlamaAttention):
+    """
+    DiffLlama attention module using `torch.nn.functional.scaled_dot_product_attention`. It inherits from
+    `DiffLlamaAttention` because the module weights stay untouched; only the forward pass changes, to adapt to the
+    SDPA API. `forward` returns `(attn_output, None)`: the second slot is always `None` on this path.
+    """
+
+    # Adapted from DiffLlamaAttention.forward
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        use_cache: bool = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
+        bsz, q_len, _ = hidden_states.size()
+
+        query_states = self.q_proj(hidden_states)
+        key_states = self.k_proj(hidden_states)
+        value_states = self.v_proj(hidden_states)
+        # Split each projection into heads and move the head axis forward: (bsz, heads, q_len, head_dim).
+        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+
+        cos, sin = position_embeddings
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+
+        if past_key_values is not None:
+            # sin and cos are specific to RoPE models; cache_position needed for the static cache
+            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        key_states = repeat_kv(key_states, self.num_key_value_groups)
+        value_states = repeat_kv(value_states, self.num_key_value_groups)
+        value_states = torch.cat(torch.chunk(value_states, 2, dim=1), dim=-1)  # pair the two head halves along the feature axis
+        value_states = value_states.repeat(1, 2, 1, 1)  # tile the head axis twice, restoring the pre-chunk head count
+        # When a mask is given, trim its last axis to the key length.
+        causal_mask = attention_mask
+        if attention_mask is not None:
+            causal_mask = causal_mask[:, :, :, : key_states.shape[-2]]
+
+        # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
+        # Reference: https://github.com/pytorch/pytorch/issues/112577.
+        if query_states.device.type == "cuda" and causal_mask is not None:
+            query_states = query_states.contiguous()
+            key_states = key_states.contiguous()
+            value_states = value_states.contiguous()
+
+        # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
+        # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
+        is_causal = causal_mask is None and q_len > 1
+
+        attn_output = torch.nn.functional.scaled_dot_product_attention(
+            query_states,
+            key_states,
+            value_states,
+            attn_mask=causal_mask,
+            dropout_p=self.attention_dropout if self.training else 0.0,
+            is_causal=is_causal,
+        )
+        # Split the head axis in two; the second half is subtracted from the first, scaled by lambda_full.
+        attn_output1, attn_output2 = torch.chunk(attn_output, 2, dim=1)
+        # lambda_i = exp(sum(lambda_qi * lambda_ki)), computed in float32 and cast back to the query dtype.
+        lambda_1 = torch.exp(torch.sum(self.lambda_q1 * self.lambda_k1, dim=-1, dtype=torch.float32)).to(
+            query_states.dtype
+        )
+        lambda_2 = torch.exp(torch.sum(self.lambda_q2 * self.lambda_k2, dim=-1, dtype=torch.float32)).to(
+            query_states.dtype
+        )
+        lambda_full = lambda_1 - lambda_2 + self.lambda_init
+        # Differential output, passed through self.groupnorm and rescaled by (1 - lambda_init).
+        attn_output = attn_output1 - lambda_full * attn_output2
+        attn_output = (1 - self.lambda_init) * self.groupnorm(attn_output)
+        attn_output = attn_output.transpose(1, 2).contiguous()
+        attn_output = attn_output.view(bsz, q_len, -1)
+        attn_output = self.o_proj(attn_output)
+        return attn_output, None  # NOTE(review): two values are returned although the annotation declares three
+
+
+DIFFLLAMA_ATTENTION_CLASSES = {  # lookup table used by DiffLlamaDecoderLayer, keyed by config._attn_implementation
+    "eager": DiffLlamaAttention,
+    "flash_attention_2": DiffLlamaFlashAttention2,
+    "sdpa": DiffLlamaSdpaAttention,
+}
+
+
+class DiffLlamaDecoderLayer(LlamaDecoderLayer):
+    def __init__(self, config: DiffLlamaConfig, layer_idx: int):
+        super().__init__(config, layer_idx)
+        # Set `self_attn` to the class registered for `config._attn_implementation` (a KeyError if it is unknown).
+        self.self_attn = DIFFLLAMA_ATTENTION_CLASSES[config._attn_implementation](config=config, layer_idx=layer_idx)
+
+
+class DiffLlamaPreTrainedModel(LlamaPreTrainedModel):
+    _supports_flex_attn = False
+    _supports_attention_backend = False
+
+    def _init_weights(self, module):
+        """Run the generic `PreTrainedModel` init, then draw attention lambdas from N(0, `config.lambda_std_dev`)."""
+        PreTrainedModel._init_weights(self, module)
+        if not isinstance(module, DiffLlamaAttention):
+            return
+        for lambda_name in ("lambda_q1", "lambda_k1", "lambda_q2", "lambda_k2"):
+            getattr(module, lambda_name).data.normal_(0, self.config.lambda_std_dev)
+
+
+class DiffLlamaModel(LlamaModel):
+    pass  # no overrides are declared here; everything comes from LlamaModel
+
+
+class DiffLlamaForCausalLM(GemmaForCausalLM):
+    pass  # no overrides are declared here; note that the base is GemmaForCausalLM, not a Llama class
+
+
+class DiffLlamaForSequenceClassification(LlamaForSequenceClassification):
+    pass  # no overrides are declared here; everything comes from LlamaForSequenceClassification
+
+
+class DiffLlamaForQuestionAnswering(LlamaForQuestionAnswering):
+    pass  # no overrides are declared here; everything comes from LlamaForQuestionAnswering
+
+
+class DiffLlamaForTokenClassification(LlamaForTokenClassification):
+    pass  # no overrides are declared here; everything comes from LlamaForTokenClassification
+
+
+__all__ = [  # names exported by `from <this module> import *`
+    "DiffLlamaPreTrainedModel",
+    "DiffLlamaModel",  # noqa: F822
+    "DiffLlamaForCausalLM",
+    "DiffLlamaForSequenceClassification",
+    "DiffLlamaForQuestionAnswering",
+    "DiffLlamaForTokenClassification",
+]
diff --git a/phivenv/Lib/site-packages/transformers/models/dinat/__init__.py b/phivenv/Lib/site-packages/transformers/models/dinat/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b64cdbb3c7eb0467f6112225b8c0d9e1f65f9e99
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/dinat/__init__.py
@@ -0,0 +1,27 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_dinat import *
+    from .modeling_dinat import *
+else:
+    import sys
+    # Runtime path: swap this module's sys.modules entry for a _LazyModule (presumably deferring the imports above).
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/phivenv/Lib/site-packages/transformers/models/dinat/__pycache__/__init__.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/dinat/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f9e826cede32345dc026013ef7f83bb634998c2d
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/dinat/__pycache__/__init__.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/dinat/__pycache__/configuration_dinat.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/dinat/__pycache__/configuration_dinat.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..05dd5d569c937e044d304ac17c0914dd4befee4b
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/dinat/__pycache__/configuration_dinat.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/dinat/__pycache__/modeling_dinat.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/dinat/__pycache__/modeling_dinat.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5b21d898b055206d75d6be1e2692170ac97b4f0d
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/dinat/__pycache__/modeling_dinat.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/dinat/configuration_dinat.py b/phivenv/Lib/site-packages/transformers/models/dinat/configuration_dinat.py
new file mode 100644
index 0000000000000000000000000000000000000000..f7d7fa509c5a3b2f5efc3b936cf1761b4ab0e107
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/dinat/configuration_dinat.py
@@ -0,0 +1,152 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Dilated Neighborhood Attention Transformer model configuration"""
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+from ...utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices
+
+
+logger = logging.get_logger(__name__)
+
+
+class DinatConfig(BackboneConfigMixin, PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`DinatModel`]. It is used to instantiate a Dinat
+    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+    defaults will yield a similar configuration to that of the Dinat
+    [shi-labs/dinat-mini-in1k-224](https://huggingface.co/shi-labs/dinat-mini-in1k-224) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        patch_size (`int`, *optional*, defaults to 4):
+            The size (resolution) of each patch. NOTE: Only patch size of 4 is supported at the moment.
+        num_channels (`int`, *optional*, defaults to 3):
+            The number of input channels.
+        embed_dim (`int`, *optional*, defaults to 64):
+            Dimensionality of patch embedding.
+        depths (`list[int]`, *optional*, defaults to `[3, 4, 6, 5]`):
+            Number of layers in each level of the encoder.
+        num_heads (`list[int]`, *optional*, defaults to `[2, 4, 8, 16]`):
+            Number of attention heads in each layer of the Transformer encoder.
+        kernel_size (`int`, *optional*, defaults to 7):
+            Neighborhood Attention kernel size.
+        dilations (`list[list[int]]`, *optional*, defaults to `[[1, 8, 1], [1, 4, 1, 4], [1, 2, 1, 2, 1, 2], [1, 1, 1, 1, 1]]`):
+            Dilation value of each NA layer in the Transformer encoder.
+        mlp_ratio (`float`, *optional*, defaults to 3.0):
+            Ratio of MLP hidden dimensionality to embedding dimensionality.
+        qkv_bias (`bool`, *optional*, defaults to `True`):
+            Whether or not a learnable bias should be added to the queries, keys and values.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
+            The dropout probability for all fully connected layers in the embeddings and encoder.
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        drop_path_rate (`float`, *optional*, defaults to 0.1):
+            Stochastic depth rate.
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder. If string, `"gelu"`, `"relu"`,
+            `"selu"` and `"gelu_new"` are supported.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the layer normalization layers.
+        layer_scale_init_value (`float`, *optional*, defaults to 0.0):
+            The initial value for the layer scale. Disabled if <=0.
+        out_features (`list[str]`, *optional*):
+            If used as backbone, list of features to output. Can be any of `"stem"`, `"stage1"`, `"stage2"`, etc.
+            (depending on how many stages the model has). If unset and `out_indices` is set, will default to the
+            corresponding stages. If unset and `out_indices` is unset, will default to the last stage. Must be in the
+            same order as defined in the `stage_names` attribute.
+        out_indices (`list[int]`, *optional*):
+            If used as backbone, list of indices of features to output. Can be any of 0, 1, 2, etc. (depending on how
+            many stages the model has). If unset and `out_features` is set, will default to the corresponding stages.
+            If unset and `out_features` is unset, will default to the last stage. Must be in the
+            same order as defined in the `stage_names` attribute.
+
+    Example:
+
+    ```python
+    >>> from transformers import DinatConfig, DinatModel
+
+    >>> # Initializing a Dinat shi-labs/dinat-mini-in1k-224 style configuration
+    >>> configuration = DinatConfig()
+
+    >>> # Initializing a model (with random weights) from the shi-labs/dinat-mini-in1k-224 style configuration
+    >>> model = DinatModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "dinat"
+
+    attribute_map = {
+        "num_attention_heads": "num_heads",
+        "num_hidden_layers": "num_layers",
+    }
+
+    def __init__(
+        self,
+        patch_size=4,
+        num_channels=3,
+        embed_dim=64,
+        depths=[3, 4, 6, 5],
+        num_heads=[2, 4, 8, 16],
+        kernel_size=7,
+        dilations=[[1, 8, 1], [1, 4, 1, 4], [1, 2, 1, 2, 1, 2], [1, 1, 1, 1, 1]],
+        mlp_ratio=3.0,
+        qkv_bias=True,
+        hidden_dropout_prob=0.0,
+        attention_probs_dropout_prob=0.0,
+        drop_path_rate=0.1,
+        hidden_act="gelu",
+        initializer_range=0.02,
+        layer_norm_eps=1e-5,
+        layer_scale_init_value=0.0,
+        out_features=None,
+        out_indices=None,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        # List arguments are copied so that no instance aliases the shared mutable defaults of the signature.
+        self.patch_size = patch_size
+        self.num_channels = num_channels
+        self.embed_dim = embed_dim
+        self.depths = list(depths)
+        self.num_layers = len(depths)
+        self.num_heads = list(num_heads)
+        self.kernel_size = kernel_size
+        self.dilations = [list(stage_dilations) for stage_dilations in dilations]
+        self.mlp_ratio = mlp_ratio
+        self.qkv_bias = qkv_bias
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.drop_path_rate = drop_path_rate
+        self.hidden_act = hidden_act
+        self.layer_norm_eps = layer_norm_eps
+        self.initializer_range = initializer_range
+        # we set the hidden_size attribute in order to make Dinat work with VisionEncoderDecoderModel
+        # this indicates the channel dimension after the last stage of the model
+        self.hidden_size = int(embed_dim * 2 ** (len(depths) - 1))
+        self.layer_scale_init_value = layer_scale_init_value
+        self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(depths) + 1)]
+        self._out_features, self._out_indices = get_aligned_output_features_output_indices(
+            out_features=out_features, out_indices=out_indices, stage_names=self.stage_names
+        )
+
+
+__all__ = ["DinatConfig"]
diff --git a/phivenv/Lib/site-packages/transformers/models/dinat/modeling_dinat.py b/phivenv/Lib/site-packages/transformers/models/dinat/modeling_dinat.py
new file mode 100644
index 0000000000000000000000000000000000000000..916fc94a79589067c99bda9fe93178696cf8b436
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/dinat/modeling_dinat.py
@@ -0,0 +1,876 @@
+# coding=utf-8
+# Copyright 2022 SHI Labs and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch Dilated Neighborhood Attention Transformer model."""
+
+import math
+from dataclasses import dataclass
+from typing import Optional, Union
+
+import torch
+import torch.utils.checkpoint
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+
+from ...activations import ACT2FN
+from ...modeling_outputs import BackboneOutput
+from ...modeling_utils import PreTrainedModel
+from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
+from ...utils import (
+    ModelOutput,
+    OptionalDependencyNotAvailable,
+    auto_docstring,
+    is_natten_available,
+    logging,
+    requires_backends,
+)
+from ...utils.backbone_utils import BackboneMixin
+from .configuration_dinat import DinatConfig
+
+
+if is_natten_available():
+    from natten.functional import natten2dav, natten2dqkrpb
+else:
+    # natten is unavailable: define stand-ins under the same names that raise as soon as they are called.
+    def natten2dqkrpb(*args, **kwargs):
+        raise OptionalDependencyNotAvailable()
+
+    def natten2dav(*args, **kwargs):
+        raise OptionalDependencyNotAvailable()
+
+
+logger = logging.get_logger(__name__)
+
+
+# drop_path and DinatDropPath are from the timm library.
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Dinat encoder's outputs, with potential hidden states and attentions.
+    """
+)
+class DinatEncoderOutput(ModelOutput):
+    r"""
+    reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
+        shape `(batch_size, hidden_size, height, width)`.
+
+        Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
+        include the spatial dimensions.
+    """
+    # NOTE(review): fields without an entry above are presumably documented by @auto_docstring — confirm.
+    last_hidden_state: Optional[torch.FloatTensor] = None
+    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
+    attentions: Optional[tuple[torch.FloatTensor, ...]] = None
+    reshaped_hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Dinat model's outputs that also contains a pooling of the last hidden states.
+    """
+)
+class DinatModelOutput(ModelOutput):
+    r"""
+    pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`, *optional*, returned when `add_pooling_layer=True` is passed):
+        Average pooling of the last layer hidden-state.
+    reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
+        shape `(batch_size, hidden_size, height, width)`.
+
+        Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
+        include the spatial dimensions.
+    """
+    # NOTE(review): fields without an entry above are presumably documented by @auto_docstring — confirm.
+    last_hidden_state: Optional[torch.FloatTensor] = None
+    pooler_output: Optional[torch.FloatTensor] = None
+    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
+    attentions: Optional[tuple[torch.FloatTensor, ...]] = None
+    reshaped_hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Dinat outputs for image classification.
+    """
+)
+class DinatImageClassifierOutput(ModelOutput):
+    r"""
+    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+        Classification (or regression if config.num_labels==1) loss.
+    logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
+        Classification (or regression if config.num_labels==1) scores (before SoftMax).
+    reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
+        shape `(batch_size, hidden_size, height, width)`.
+
+        Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
+        include the spatial dimensions.
+    """
+    # NOTE(review): fields without an entry above are presumably documented by @auto_docstring — confirm.
+    loss: Optional[torch.FloatTensor] = None
+    logits: Optional[torch.FloatTensor] = None
+    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
+    attentions: Optional[tuple[torch.FloatTensor, ...]] = None
+    reshaped_hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
+
+
+class DinatEmbeddings(nn.Module):
+    """
+    Turns pixel values into patch embeddings, then applies layer normalization and dropout. No position
+    embedding is added by this module.
+    """
+
+    def __init__(self, config):
+        super().__init__()
+
+        self.patch_embeddings = DinatPatchEmbeddings(config)
+        self.norm = nn.LayerNorm(config.embed_dim)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self, pixel_values: Optional[torch.FloatTensor]) -> tuple[torch.Tensor]:
+        """Return the layer-normalized patch embeddings of `pixel_values`, after dropout."""
+        patches = self.patch_embeddings(pixel_values)
+        # The LayerNorm was built with `config.embed_dim`, so it normalizes the trailing axis.
+        normalized = self.norm(patches)
+        regularized = self.dropout(normalized)
+        return regularized
+
+
+class DinatPatchEmbeddings(nn.Module):
+    """
+    Projects `pixel_values` of shape `(batch_size, num_channels, height, width)` through two stride-2 3x3
+    convolutions and returns the result channels-last, i.e. patch embeddings whose trailing axis has
+    `config.embed_dim` entries, ready to be consumed by a Transformer.
+    """
+
+    def __init__(self, config):
+        super().__init__()
+        patch_size, embed_dim = config.patch_size, config.embed_dim
+        self.num_channels = config.num_channels
+
+        # TODO: Support arbitrary patch sizes.
+        if patch_size != 4:
+            raise ValueError("Dinat only supports patch size of 4 at the moment.")
+
+        # The first convolution produces half of the final channel count, the second one the full `embed_dim`.
+        half_dim = embed_dim // 2
+        stem_conv = nn.Conv2d(self.num_channels, half_dim, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
+        head_conv = nn.Conv2d(half_dim, embed_dim, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
+        self.projection = nn.Sequential(stem_conv, head_conv)
+
+    def forward(self, pixel_values: Optional[torch.FloatTensor]) -> torch.Tensor:
+        # Unpacking four axes also rejects inputs that are not 4D.
+        _, channels_in, _, _ = pixel_values.shape
+        if channels_in != self.num_channels:
+            raise ValueError(
+                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
+            )
+
+        # (batch, channels, height, width) -> (batch, height, width, channels)
+        projected = self.projection(pixel_values)
+        channels_last = projected.permute(0, 2, 3, 1)
+        return channels_last
+
+
+class DinatDownsampler(nn.Module):
+    """
+    Downsampling layer: a bias-free, stride-2 3x3 convolution that doubles the channel count, then a norm.
+
+    Args:
+        dim (`int`):
+            Number of input channels.
+        norm_layer (`nn.Module`, *optional*, defaults to `nn.LayerNorm`):
+            Normalization layer class.
+    """
+
+    def __init__(self, dim: int, norm_layer: nn.Module = nn.LayerNorm) -> None:
+        super().__init__()
+        self.dim = dim
+        self.reduction = nn.Conv2d(dim, 2 * dim, kernel_size=3, stride=2, padding=1, bias=False)
+        self.norm = norm_layer(2 * dim)
+
+    def forward(self, input_feature: torch.Tensor) -> torch.Tensor:
+        # The convolution runs channels-first; the input and the returned tensor are channels-last.
+        reduced = self.reduction(input_feature.permute(0, 3, 1, 2))
+        return self.norm(reduced.permute(0, 2, 3, 1))
+
+
+# Copied from transformers.models.beit.modeling_beit.drop_path
+def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
+    """
+    Stochastic depth: zero the whole residual-branch output of randomly chosen samples and rescale the survivors.
+
+    Nothing happens when `drop_prob` is 0 or `training` is false; the input tensor itself is returned. Otherwise
+    every sample (index along axis 0) is kept with probability `1 - drop_prob`, and kept samples are divided by
+    that probability.
+
+    Adapted from Ross Wightman's timm implementation, where the operation was renamed from "DropConnect" to
+    "drop path"; see https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956.
+    """
+    if not training or drop_prob == 0.0:
+        return input
+    survival = 1 - drop_prob
+    # One random draw per sample, broadcast over all remaining axes (works for any tensor rank).
+    mask_shape = (input.shape[0], *([1] * (input.ndim - 1)))
+    keep_mask = torch.floor(survival + torch.rand(mask_shape, dtype=input.dtype, device=input.device))
+    return input.div(survival) * keep_mask
+
+
+# Copied from transformers.models.beit.modeling_beit.BeitDropPath with Beit->Dinat
+class DinatDropPath(nn.Module):
+    """Module wrapper around `drop_path`: stores the drop probability and follows the module's training flag."""
+
+    def __init__(self, drop_prob: Optional[float] = None) -> None:
+        super().__init__()
+        self.drop_prob = drop_prob
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        return drop_path(hidden_states, drop_prob=self.drop_prob, training=self.training)
+
+    def extra_repr(self) -> str:
+        return f"p={self.drop_prob}"
+
+
+class NeighborhoodAttention(nn.Module):
+    """
+    Multi-head neighborhood attention over a channels-last feature map, built on the NATTEN kernels
+    `natten2dqkrpb` (query-key scores plus relative positional bias) and `natten2dav` (attention-value product).
+
+    Raises:
+        ValueError: if `dim` is not a multiple of `num_heads`.
+    """
+
+    def __init__(self, config, dim, num_heads, kernel_size, dilation):
+        super().__init__()
+        if dim % num_heads != 0:
+            raise ValueError(
+                f"The hidden size ({dim}) is not a multiple of the number of attention heads ({num_heads})"
+            )
+
+        self.num_attention_heads = num_heads
+        self.attention_head_size = dim // num_heads
+        self.all_head_size = self.num_attention_heads * self.attention_head_size
+        self.kernel_size = kernel_size
+        self.dilation = dilation
+
+        # rpb holds the learnable relative positional biases; the same concept is used in Swin.
+        self.rpb = nn.Parameter(torch.zeros(num_heads, (2 * self.kernel_size - 1), (2 * self.kernel_size - 1)))
+
+        self.query = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias)
+        self.key = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias)
+        self.value = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias)
+
+        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        output_attentions: Optional[bool] = False,
+    ) -> tuple[torch.Tensor]:
+        """
+        Args:
+            hidden_states: channels-last feature map of shape `(batch, height, width, all_head_size)`.
+            output_attentions: when truthy, the attention probabilities are appended to the result.
+
+        Returns:
+            `(context_layer,)` or `(context_layer, attention_probs)`; `context_layer` is channels-last again.
+        """
+        # Split the channel axis into heads and move the head axis next to the batch axis:
+        # (batch, height, width, channels) -> (batch, heads, height, width, head_size). The input is a 4D map,
+        # so it must not be unpacked or reshaped as a 3D (batch, seq, channels) sequence.
+        heads_shape = hidden_states.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
+        query_layer = self.query(hidden_states).view(heads_shape).permute(0, 3, 1, 2, 4)
+        key_layer = self.key(hidden_states).view(heads_shape).permute(0, 3, 1, 2, 4)
+        value_layer = self.value(hidden_states).view(heads_shape).permute(0, 3, 1, 2, 4)
+
+        # Scale the queries rather than the (typically larger) score tensor; the result is identical.
+        query_layer = query_layer / math.sqrt(self.attention_head_size)
+
+        # Compute NA between "query" and "key" to get the raw attention scores, and add relative positional biases.
+        attention_scores = natten2dqkrpb(query_layer, key_layer, self.rpb, self.kernel_size, self.dilation)
+
+        # Normalize the attention scores to probabilities; dropout then drops entire tokens to attend to.
+        attention_probs = nn.functional.softmax(attention_scores, dim=-1)
+        attention_probs = self.dropout(attention_probs)
+
+        # Apply the probabilities to the values and merge the heads back into the channel axis.
+        context_layer = natten2dav(attention_probs, value_layer, self.kernel_size, self.dilation)
+        context_layer = context_layer.permute(0, 2, 3, 1, 4).contiguous()
+        context_layer = context_layer.view(context_layer.size()[:-2] + (self.all_head_size,))
+
+        return (context_layer, attention_probs) if output_attentions else (context_layer,)
+
+
+class NeighborhoodAttentionOutput(nn.Module):
+    """Output projection of the attention block: a square linear layer followed by dropout."""
+
+    def __init__(self, config, dim):
+        super().__init__()
+        self.dense = nn.Linear(dim, dim)
+        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
+
+    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
+        # `input_tensor` is accepted but not used by this module.
+        return self.dropout(self.dense(hidden_states))
+
+
+class NeighborhoodAttentionModule(nn.Module):
+    """Neighborhood self-attention followed by its output projection, with support for pruning heads."""
+
+    def __init__(self, config, dim, num_heads, kernel_size, dilation):
+        super().__init__()
+        self.self = NeighborhoodAttention(config, dim, num_heads, kernel_size, dilation)
+        self.output = NeighborhoodAttentionOutput(config, dim)
+        self.pruned_heads = set()
+
+    def prune_heads(self, heads):
+        if len(heads) == 0:
+            return
+        attn = self.self
+        heads, index = find_pruneable_heads_and_indices(
+            heads, attn.num_attention_heads, attn.attention_head_size, self.pruned_heads
+        )
+
+        # Prune q/k/v with the helper's default axis, and the output projection along dim=1.
+        for proj_name in ("query", "key", "value"):
+            setattr(attn, proj_name, prune_linear_layer(getattr(attn, proj_name), index))
+        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
+
+        # Keep the head bookkeeping in sync and remember which heads are gone.
+        attn.num_attention_heads -= len(heads)
+        attn.all_head_size = attn.attention_head_size * attn.num_attention_heads
+        self.pruned_heads = self.pruned_heads.union(heads)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        output_attentions: Optional[bool] = False,
+    ) -> tuple[torch.Tensor]:
+        self_outputs = self.self(hidden_states, output_attentions)
+        return (self.output(self_outputs[0], hidden_states),) + self_outputs[1:]  # attentions follow when present
+
+
+class DinatIntermediate(nn.Module):
+    """First half of the MLP: expands `dim` to `int(config.mlp_ratio * dim)` features, then applies the activation."""
+
+    def __init__(self, config, dim):
+        super().__init__()
+        self.dense = nn.Linear(dim, int(config.mlp_ratio * dim))
+        # `config.hidden_act` is either a key of ACT2FN or the activation callable itself.
+        act = config.hidden_act
+        self.intermediate_act_fn = ACT2FN[act] if isinstance(act, str) else act
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        expanded = self.dense(hidden_states)
+        return self.intermediate_act_fn(expanded)
+
+
+class DinatOutput(nn.Module):
+    """Second half of the MLP: projects `int(config.mlp_ratio * dim)` features back to `dim`, then applies dropout."""
+
+    def __init__(self, config, dim):
+        super().__init__()
+        self.dense = nn.Linear(int(config.mlp_ratio * dim), dim)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        return self.dropout(self.dense(hidden_states))
+
+
+class DinatLayer(nn.Module):
+    """
+    One Dinat block: layer norm -> neighborhood attention -> residual add, then layer norm -> MLP -> residual add.
+
+    Each branch output is multiplied by a learned per-channel scale when `config.layer_scale_init_value > 0`, and
+    passes through `drop_path` before being added back.
+    """
+
+    def __init__(self, config, dim, num_heads, dilation, drop_path_rate=0.0):
+        super().__init__()
+        self.chunk_size_feed_forward = config.chunk_size_feed_forward
+        self.kernel_size = config.kernel_size
+        self.dilation = dilation
+        # Smallest height/width accepted without padding (see `maybe_pad`).
+        self.window_size = self.kernel_size * self.dilation
+        self.layernorm_before = nn.LayerNorm(dim, eps=config.layer_norm_eps)
+        self.attention = NeighborhoodAttentionModule(
+            config, dim, num_heads, kernel_size=self.kernel_size, dilation=self.dilation
+        )
+        self.drop_path = DinatDropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity()
+        self.layernorm_after = nn.LayerNorm(dim, eps=config.layer_norm_eps)
+        self.intermediate = DinatIntermediate(config, dim)
+        self.output = DinatOutput(config, dim)
+        # Row 0 scales the attention branch, row 1 the MLP branch; disabled when the init value is not positive.
+        if config.layer_scale_init_value > 0:
+            self.layer_scale_parameters = nn.Parameter(
+                config.layer_scale_init_value * torch.ones((2, dim)), requires_grad=True
+            )
+        else:
+            self.layer_scale_parameters = None
+
+    def maybe_pad(self, hidden_states, height, width):
+        """Zero-pad the bottom/right of a `(batch, height, width, channels)` map up to `window_size` per side."""
+        extra_rows = max(0, self.window_size - height)
+        extra_cols = max(0, self.window_size - width)
+        if extra_rows == 0 and extra_cols == 0:
+            return hidden_states, (0, 0, 0, 0, 0, 0)
+        pad_values = (0, 0, 0, extra_cols, 0, extra_rows)
+        return nn.functional.pad(hidden_states, pad_values), pad_values
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        output_attentions: Optional[bool] = False,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """Return `(layer_output,)`, or `(layer_output, attention_outputs[1])` when `output_attentions` is set."""
+        _, height, width, _ = hidden_states.size()
+        residual = hidden_states
+
+        # Normalize first, then pad maps smaller than kernel size x dilation.
+        normed, pad_values = self.maybe_pad(self.layernorm_before(hidden_states), height, width)
+        attention_outputs = self.attention(normed, output_attentions=output_attentions)
+        attention_output = attention_outputs[0]
+
+        # Crop back to the input size when right/bottom padding was added.
+        if pad_values[3] > 0 or pad_values[5] > 0:
+            attention_output = attention_output[:, :height, :width, :].contiguous()
+
+        scale = self.layer_scale_parameters
+        if scale is not None:
+            attention_output = scale[0] * attention_output
+        hidden_states = residual + self.drop_path(attention_output)
+
+        mlp_output = self.output(self.intermediate(self.layernorm_after(hidden_states)))
+        if scale is not None:
+            mlp_output = scale[1] * mlp_output
+        layer_output = hidden_states + self.drop_path(mlp_output)
+
+        return (layer_output, attention_outputs[1]) if output_attentions else (layer_output,)
+
+
+class DinatStage(nn.Module):
+    """A stack of `depth` `DinatLayer` blocks, optionally followed by a downsampling module."""
+
+    def __init__(self, config, dim, depth, num_heads, dilations, drop_path_rate, downsample):
+        super().__init__()
+        self.config = config
+        self.dim = dim
+        # `dilations` and `drop_path_rate` are indexed per layer, so each needs at least `depth` entries.
+        blocks = []
+        for layer_idx in range(depth):
+            blocks.append(
+                DinatLayer(
+                    config=config,
+                    dim=dim,
+                    num_heads=num_heads,
+                    dilation=dilations[layer_idx],
+                    drop_path_rate=drop_path_rate[layer_idx],
+                )
+            )
+        self.layers = nn.ModuleList(blocks)
+
+        # patch merging layer
+        self.downsample = None if downsample is None else downsample(dim=dim, norm_layer=nn.LayerNorm)
+
+        self.pointing = False
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        output_attentions: Optional[bool] = False,
+    ) -> tuple[torch.Tensor]:
+        """Return `(output, output_before_downsampling)`, plus the last layer's extra outputs if requested."""
+        _, _height, _width, _ = hidden_states.size()  # unpacking requires a 4-D input
+        for block in self.layers:
+            layer_outputs = block(hidden_states, output_attentions)
+            hidden_states = layer_outputs[0]
+
+        hidden_states_before_downsampling = hidden_states
+        if self.downsample is not None:
+            hidden_states = self.downsample(hidden_states_before_downsampling)
+
+        stage_outputs = (hidden_states, hidden_states_before_downsampling)
+
+        # `layer_outputs` is whatever the final layer returned, so only its attentions are forwarded.
+        return stage_outputs + layer_outputs[1:] if output_attentions else stage_outputs
+
+
+class DinatEncoder(nn.Module):
+    """Sequence of `DinatStage` levels; the channel width doubles per level and all but the last downsample."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.num_levels = len(config.depths)
+        self.config = config
+        # One drop-path rate per layer, rising linearly from 0 to `config.drop_path_rate` over the whole network.
+        dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths), device="cpu")]
+        stages = []
+        first_layer = 0
+        for level, depth in enumerate(config.depths):
+            stages.append(
+                DinatStage(
+                    config=config,
+                    dim=int(config.embed_dim * 2**level),
+                    depth=depth,
+                    num_heads=config.num_heads[level],
+                    dilations=config.dilations[level],
+                    drop_path_rate=dpr[first_layer : first_layer + depth],
+                    downsample=DinatDownsampler if (level < self.num_levels - 1) else None,
+                )
+            )
+            first_layer += depth
+        self.levels = nn.ModuleList(stages)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        output_attentions: Optional[bool] = False,
+        output_hidden_states: Optional[bool] = False,
+        output_hidden_states_before_downsampling: Optional[bool] = False,
+        return_dict: Optional[bool] = True,
+    ) -> Union[tuple, DinatEncoderOutput]:
+        """Run all levels, optionally collecting per-level hidden states (plain and channels-first) and attentions."""
+        all_hidden_states = () if output_hidden_states else None
+        all_reshaped_hidden_states = () if output_hidden_states else None
+        all_self_attentions = () if output_attentions else None
+
+        if output_hidden_states:
+            # The encoder input is recorded first; reshaped copies are b h w c -> b c h w.
+            all_hidden_states += (hidden_states,)
+            all_reshaped_hidden_states += (hidden_states.permute(0, 3, 1, 2),)
+
+        for stage in self.levels:
+            stage_outputs = stage(hidden_states, output_attentions)
+            hidden_states, hidden_states_before_downsampling = stage_outputs[:2]
+
+            if output_hidden_states:
+                if output_hidden_states_before_downsampling:
+                    recorded = hidden_states_before_downsampling
+                else:
+                    recorded = hidden_states
+                all_hidden_states += (recorded,)
+                all_reshaped_hidden_states += (recorded.permute(0, 3, 1, 2),)
+
+            if output_attentions:
+                all_self_attentions += stage_outputs[2:]
+
+        if not return_dict:
+            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
+
+        return DinatEncoderOutput(
+            last_hidden_state=hidden_states,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attentions,
+            reshaped_hidden_states=all_reshaped_hidden_states,
+        )
+
+
+@auto_docstring
+class DinatPreTrainedModel(PreTrainedModel):
+    config: DinatConfig
+    base_model_prefix = "dinat"
+    main_input_name = "pixel_values"
+
+    def _init_weights(self, module):
+        """Reset LayerNorm to weight 1 / bias 0; draw Linear and Conv2d weights from a normal, zero their biases."""
+        if isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+        elif isinstance(module, (nn.Linear, nn.Conv2d)):
+            # Plain normal init; slightly different from the TF version, which uses truncated_normal
+            # (cf https://github.com/pytorch/pytorch/pull/5617).
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.bias is not None:
+                module.bias.data.zero_()
+
+
+@auto_docstring
+class DinatModel(DinatPreTrainedModel):
+    def __init__(self, config, add_pooling_layer=True):
+        r"""
+        add_pooling_layer (bool, *optional*, defaults to `True`):
+            Whether to add a pooling layer
+        """
+        super().__init__(config)
+
+        requires_backends(self, ["natten"])
+        self.config = config
+        self.num_levels = len(config.depths)
+        # Width of the last level: the encoder uses `embed_dim * 2**i` channels at level i.
+        self.num_features = int(config.embed_dim * 2 ** (self.num_levels - 1))
+
+        self.embeddings = DinatEmbeddings(config)
+        self.encoder = DinatEncoder(config)
+
+        self.layernorm = nn.LayerNorm(self.num_features, eps=config.layer_norm_eps)
+        self.pooler = nn.AdaptiveAvgPool1d(1) if add_pooling_layer else None
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.embeddings.patch_embeddings
+
+    def _prune_heads(self, heads_to_prune):
+        """
+        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
+        class PreTrainedModel. `layer_num` counts layers across all encoder levels, in forward order.
+        """
+        # The encoder keeps its layers in `levels[i].layers`; it has no flat `layer` attribute to index into.
+        layers = [layer for level in self.encoder.levels for layer in level.layers]
+        for layer_num, heads in heads_to_prune.items():
+            layers[layer_num].attention.prune_heads(heads)
+
+    @auto_docstring
+    def forward(
+        self,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, DinatModelOutput]:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if pixel_values is None:
+            raise ValueError("You have to specify pixel_values")
+
+        embedding_output = self.embeddings(pixel_values)
+
+        encoder_outputs = self.encoder(
+            embedding_output,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        sequence_output = self.layernorm(encoder_outputs[0])
+
+        pooled_output = None
+        if self.pooler is not None:
+            # Merge the two spatial axes and average over them, leaving one vector per image.
+            pooled_output = self.pooler(sequence_output.flatten(1, 2).transpose(1, 2))
+            pooled_output = torch.flatten(pooled_output, 1)
+
+        if not return_dict:
+            return (sequence_output, pooled_output) + encoder_outputs[1:]
+
+        return DinatModelOutput(
+            last_hidden_state=sequence_output,
+            pooler_output=pooled_output,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+            reshaped_hidden_states=encoder_outputs.reshaped_hidden_states,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    Dinat Model transformer with an image classification head on top (a linear layer on top of the average-pooled
+    final hidden state) e.g. for ImageNet.
+    """
+)
+class DinatForImageClassification(DinatPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+
+        requires_backends(self, ["natten"])
+
+        self.num_labels = config.num_labels
+        self.dinat = DinatModel(config)
+
+        # Classifier head
+        self.classifier = (
+            nn.Linear(self.dinat.num_features, config.num_labels) if config.num_labels > 0 else nn.Identity()
+        )
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, DinatImageClassifierOutput]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.dinat(
+            pixel_values,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        # outputs[1] is the pooled output that DinatModel places right after the sequence output.
+        pooled_output = outputs[1]
+        logits = self.classifier(pooled_output)
+
+        loss = None
+        if labels is not None:
+            if self.config.problem_type is None:
+                if self.num_labels == 1:
+                    self.config.problem_type = "regression"
+                elif self.num_labels > 1 and labels.dtype in (torch.long, torch.int):
+                    self.config.problem_type = "single_label_classification"
+                else:
+                    self.config.problem_type = "multi_label_classification"
+            # The inferred problem type is stored on the config, so later calls skip the inference above.
+            if self.config.problem_type == "regression":
+                loss_fct = MSELoss()
+                if self.num_labels == 1:
+                    loss = loss_fct(logits.squeeze(), labels.squeeze())
+                else:
+                    loss = loss_fct(logits, labels)
+            elif self.config.problem_type == "single_label_classification":
+                loss_fct = CrossEntropyLoss()
+                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+            elif self.config.problem_type == "multi_label_classification":
+                loss_fct = BCEWithLogitsLoss()
+                loss = loss_fct(logits, labels)
+
+        if not return_dict:
+            output = (logits,) + outputs[2:]
+            return ((loss,) + output) if loss is not None else output
+
+        return DinatImageClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            reshaped_hidden_states=outputs.reshaped_hidden_states,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    NAT backbone, to be used with frameworks like DETR and MaskFormer.
+    """
+)
+class DinatBackbone(DinatPreTrainedModel, BackboneMixin):
+    def __init__(self, config):
+        super().__init__(config)
+        super()._init_backbone(config)
+
+        requires_backends(self, ["natten"])
+
+        self.embeddings = DinatEmbeddings(config)
+        self.encoder = DinatEncoder(config)
+        # `embed_dim` first, then `embed_dim * 2**i` for every level i.
+        self.num_features = [config.embed_dim] + [int(config.embed_dim * 2**i) for i in range(len(config.depths))]
+
+        # Add layer norms to hidden states of out_features
+        self.hidden_states_norms = nn.ModuleDict(
+            {stage: nn.LayerNorm(num_channels) for stage, num_channels in zip(self._out_features, self.channels)}
+        )
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        """Return the patch-embedding submodule of `self.embeddings`."""
+        return self.embeddings.patch_embeddings
+
+    @auto_docstring
+    def forward(
+        self,
+        pixel_values: torch.Tensor,
+        output_hidden_states: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> BackboneOutput:
+        r"""
+        Examples:
+
+        ```python
+        >>> from transformers import AutoImageProcessor, AutoBackbone
+        >>> import torch
+        >>> from PIL import Image
+        >>> import requests
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> processor = AutoImageProcessor.from_pretrained("shi-labs/nat-mini-in1k-224")
+        >>> model = AutoBackbone.from_pretrained(
+        ...     "shi-labs/nat-mini-in1k-224", out_features=["stage1", "stage2", "stage3", "stage4"]
+        ... )
+
+        >>> inputs = processor(image, return_tensors="pt")
+
+        >>> outputs = model(**inputs)
+
+        >>> feature_maps = outputs.feature_maps
+        >>> list(feature_maps[-1].shape)
+        [1, 512, 7, 7]
+        ```"""
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+
+        embedding_output = self.embeddings(pixel_values)
+
+        # Hidden states (taken before downsampling) are always requested: the feature maps are built from them.
+        outputs = self.encoder(
+            embedding_output,
+            output_attentions=output_attentions,
+            output_hidden_states=True,
+            output_hidden_states_before_downsampling=True,
+            return_dict=True,
+        )
+
+        stage_maps = outputs.reshaped_hidden_states
+
+        feature_maps = ()
+        for stage, hidden_state in zip(self.stage_names, stage_maps):
+            if stage not in self.out_features:
+                continue
+            batch_size, num_channels, height, width = hidden_state.shape
+            # The norm acts on the last axis: go channels-last, normalize, then restore b c h w.
+            tokens = hidden_state.permute(0, 2, 3, 1).contiguous()
+            tokens = self.hidden_states_norms[stage](tokens.view(batch_size, height * width, num_channels))
+            restored = tokens.view(batch_size, height, width, num_channels)
+            feature_maps += (restored.permute(0, 3, 1, 2).contiguous(),)
+
+        # Tuple form: the feature maps, followed by the hidden states only when the caller asked for them.
+        if not return_dict:
+            return (feature_maps, outputs.hidden_states) if output_hidden_states else (feature_maps,)
+
+        return BackboneOutput(
+            feature_maps=feature_maps,
+            hidden_states=outputs.hidden_states if output_hidden_states else None,
+            attentions=outputs.attentions,
+        )
+
+
+__all__ = ["DinatForImageClassification", "DinatModel", "DinatPreTrainedModel", "DinatBackbone"]
diff --git a/phivenv/Lib/site-packages/transformers/models/dinov2/__init__.py b/phivenv/Lib/site-packages/transformers/models/dinov2/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..3cc316957eac509573bf44785209d0729ea13bb6
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/dinov2/__init__.py
@@ -0,0 +1,28 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:  # evaluated by static type checkers only (False at runtime)
+    from .configuration_dinov2 import *
+    from .modeling_dinov2 import *
+    from .modeling_flax_dinov2 import *
+else:  # runtime path: swap this package's sys.modules entry for a _LazyModule
+    import sys
+
+    _file = globals()["__file__"]  # this file's path; also used to derive the import structure
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/phivenv/Lib/site-packages/transformers/models/dinov2/__pycache__/__init__.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/dinov2/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4de87a6f20c27946d217f2f0fcb0017f2b4519dc
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/dinov2/__pycache__/__init__.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/dinov2/__pycache__/configuration_dinov2.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/dinov2/__pycache__/configuration_dinov2.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cee7c8f57d4c135bd720fe580302fe80a55b94f8
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/dinov2/__pycache__/configuration_dinov2.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/dinov2/__pycache__/modeling_dinov2.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/dinov2/__pycache__/modeling_dinov2.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f2eb89c8ceea309c662464d44b63ce9d2de1c620
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/dinov2/__pycache__/modeling_dinov2.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/dinov2/__pycache__/modeling_flax_dinov2.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/dinov2/__pycache__/modeling_flax_dinov2.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d242dc23453c51119e27e7a293ef933c272f1502
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/dinov2/__pycache__/modeling_flax_dinov2.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/dinov2/configuration_dinov2.py b/phivenv/Lib/site-packages/transformers/models/dinov2/configuration_dinov2.py
new file mode 100644
index 0000000000000000000000000000000000000000..55fa0539a23bf0c02d0079ce41f0c5228b88c904
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/dinov2/configuration_dinov2.py
@@ -0,0 +1,179 @@
+# coding=utf-8
+# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""DINOv2 model configuration"""
+
+from collections import OrderedDict
+from collections.abc import Mapping
+
+from packaging import version
+
+from ...configuration_utils import PretrainedConfig
+from ...onnx import OnnxConfig
+from ...utils import logging
+from ...utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices
+
+
+logger = logging.get_logger(__name__)
+
+
+class Dinov2Config(BackboneConfigMixin, PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Dinov2Model`]. It is used to instantiate a
+    Dinov2 model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the Dinov2
+    [google/dinov2-base-patch16-224](https://huggingface.co/google/dinov2-base-patch16-224) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        hidden_size (`int`, *optional*, defaults to 768):
+            Dimensionality of the encoder layers and the pooler layer.
+        num_hidden_layers (`int`, *optional*, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 12):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        mlp_ratio (`int`, *optional*, defaults to 4):
+            Ratio of the hidden size of the MLPs relative to the `hidden_size`.
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"selu"` and `"gelu_new"` are supported.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-06):
+            The epsilon used by the layer normalization layers.
+        image_size (`int`, *optional*, defaults to 224):
+            The size (resolution) of each image.
+        patch_size (`int`, *optional*, defaults to 14):
+            The size (resolution) of each patch.
+        num_channels (`int`, *optional*, defaults to 3):
+            The number of input channels.
+        qkv_bias (`bool`, *optional*, defaults to `True`):
+            Whether to add a bias to the queries, keys and values.
+        layerscale_value (`float`, *optional*, defaults to 1.0):
+            Initial value to use for layer scale.
+        drop_path_rate (`float`, *optional*, defaults to 0.0):
+            Stochastic depth rate per sample (when applied in the main path of residual layers).
+        use_swiglu_ffn (`bool`, *optional*, defaults to `False`):
+            Whether to use the SwiGLU feedforward neural network.
+        out_features (`list[str]`, *optional*):
+            If used as backbone, list of features to output. Can be any of `"stem"`, `"stage1"`, `"stage2"`, etc.
+            (depending on how many stages the model has). If unset and `out_indices` is set, will default to the
+            corresponding stages. If unset and `out_indices` is unset, will default to the last stage. Must be in the
+            same order as defined in the `stage_names` attribute.
+        out_indices (`list[int]`, *optional*):
+            If used as backbone, list of indices of features to output. Can be any of 0, 1, 2, etc. (depending on how
+            many stages the model has). If unset and `out_features` is set, will default to the corresponding stages.
+            If unset and `out_features` is unset, will default to the last stage. Must be in the
+            same order as defined in the `stage_names` attribute.
+        apply_layernorm (`bool`, *optional*, defaults to `True`):
+            Whether to apply layer normalization to the feature maps in case the model is used as backbone.
+        reshape_hidden_states (`bool`, *optional*, defaults to `True`):
+            Whether to reshape the feature maps to 4D tensors of shape `(batch_size, hidden_size, height, width)` in
+            case the model is used as backbone. If `False`, the feature maps will be 3D tensors of shape `(batch_size,
+            seq_len, hidden_size)`.
+        use_mask_token (`bool`, *optional*, defaults to `True`):
+            Whether to use mask_token in embeddings.
+
+    Example:
+
+    ```python
+    >>> from transformers import Dinov2Config, Dinov2Model
+
+    >>> # Initializing a Dinov2 dinov2-base-patch16-224 style configuration
+    >>> configuration = Dinov2Config()
+
+    >>> # Initializing a model (with random weights) from the dinov2-base-patch16-224 style configuration
+    >>> model = Dinov2Model(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "dinov2"
+
+    def __init__(
+        self,
+        hidden_size=768,
+        num_hidden_layers=12,
+        num_attention_heads=12,
+        mlp_ratio=4,
+        hidden_act="gelu",
+        hidden_dropout_prob=0.0,
+        attention_probs_dropout_prob=0.0,
+        initializer_range=0.02,
+        layer_norm_eps=1e-6,
+        image_size=224,
+        patch_size=14,
+        num_channels=3,
+        qkv_bias=True,
+        layerscale_value=1.0,
+        drop_path_rate=0.0,
+        use_swiglu_ffn=False,
+        out_features=None,
+        out_indices=None,
+        apply_layernorm=True,
+        reshape_hidden_states=True,
+        use_mask_token=True,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.mlp_ratio = mlp_ratio
+        self.hidden_act = hidden_act
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.initializer_range = initializer_range
+        self.layer_norm_eps = layer_norm_eps
+        self.image_size = image_size
+        self.patch_size = patch_size
+        self.num_channels = num_channels
+        self.qkv_bias = qkv_bias
+        self.layerscale_value = layerscale_value
+        self.drop_path_rate = drop_path_rate
+        self.use_swiglu_ffn = use_swiglu_ffn
+        # Stage names: "stem" followed by "stage1" ... "stage{num_hidden_layers}", one per hidden layer.
+        self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, num_hidden_layers + 1)]
+        self._out_features, self._out_indices = get_aligned_output_features_output_indices(
+            out_features=out_features, out_indices=out_indices, stage_names=self.stage_names
+        )
+        self.apply_layernorm = apply_layernorm
+        self.reshape_hidden_states = reshape_hidden_states
+        self.use_mask_token = use_mask_token
+
+
+class Dinov2OnnxConfig(OnnxConfig):
+    """`OnnxConfig` for DINOv2 declaring a single four-axis `pixel_values` input."""
+
+    torch_onnx_minimum_version = version.parse("1.11")
+
+    @property
+    def inputs(self) -> Mapping[str, Mapping[int, str]]:
+        axis_names = {0: "batch", 1: "num_channels", 2: "height", 3: "width"}
+        return OrderedDict(pixel_values=axis_names)
+
+    @property
+    def atol_for_validation(self) -> float:
+        """Absolute tolerance value (1e-4) exposed for validation."""
+        return 1e-4
+
+
+__all__ = ["Dinov2Config", "Dinov2OnnxConfig"]
diff --git a/phivenv/Lib/site-packages/transformers/models/dinov2/modeling_dinov2.py b/phivenv/Lib/site-packages/transformers/models/dinov2/modeling_dinov2.py
new file mode 100644
index 0000000000000000000000000000000000000000..96a051327e01733f96c3001da36f24aa4f36b08b
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/dinov2/modeling_dinov2.py
@@ -0,0 +1,685 @@
+# coding=utf-8
+# Copyright 2023 Meta AI and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch DINOv2 model."""
+
+import collections.abc
+from typing import Callable, Optional, Union
+
+import torch
+import torch.utils.checkpoint
+from torch import nn
+
+from ...activations import ACT2FN
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import BackboneOutput, BaseModelOutput, BaseModelOutputWithPooling, ImageClassifierOutput
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from ...processing_utils import Unpack
+from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
+from ...utils import TransformersKwargs, auto_docstring, logging, torch_int
+from ...utils.backbone_utils import BackboneMixin
+from ...utils.generic import can_return_tuple, check_model_inputs
+from .configuration_dinov2 import Dinov2Config
+
+
+logger = logging.get_logger(__name__)
+
+
+class Dinov2Embeddings(nn.Module):
+    """
+    Construct the CLS token, mask token, position and patch embeddings.
+    """
+
+    def __init__(self, config: Dinov2Config) -> None:
+        super().__init__()
+        # Learnable tensors: one CLS token, an optional mask token, and num_patches + 1 position vectors.
+        self.cls_token = nn.Parameter(torch.randn(1, 1, config.hidden_size))
+        if config.use_mask_token:
+            self.mask_token = nn.Parameter(torch.zeros(1, config.hidden_size))
+        self.patch_embeddings = Dinov2PatchEmbeddings(config)
+        num_patches = self.patch_embeddings.num_patches
+        self.position_embeddings = nn.Parameter(torch.randn(1, num_patches + 1, config.hidden_size))
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.patch_size = config.patch_size
+        self.use_mask_token = config.use_mask_token
+        self.config = config
+
+    def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
+        """
+        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
+        images. This method is also adapted to support torch.jit tracing and interpolation at torch.float32 precision.
+
+        Adapted from:
+        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
+        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
+        """
+        # Token counts without the CLS entry: for the current input and for the stored position table.
+        num_patches = embeddings.shape[1] - 1
+        num_positions = self.position_embeddings.shape[1] - 1
+
+        # always interpolate when tracing to ensure the exported model works for dynamic input shapes
+        if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
+            return self.position_embeddings
+        # Split the table: the first entry (CLS) is kept as-is, only the patch entries are resized.
+        class_pos_embed = self.position_embeddings[:, :1]
+        patch_pos_embed = self.position_embeddings[:, 1:]
+
+        dim = embeddings.shape[-1]
+        # Target grid size, measured in patches.
+        new_height = height // self.patch_size
+        new_width = width // self.patch_size
+        # The stored patch positions are treated as a square grid, then moved to channels-first.
+        sqrt_num_positions = torch_int(num_positions**0.5)
+        patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
+        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)
+        target_dtype = patch_pos_embed.dtype  # restored after interpolating in float32
+        patch_pos_embed = nn.functional.interpolate(
+            patch_pos_embed.to(torch.float32),
+            size=(new_height, new_width),
+            mode="bicubic",
+            align_corners=False,
+        ).to(dtype=target_dtype)
+        # Back to channels-last, flattening the grid into a single token axis.
+        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
+
+        return torch.cat((class_pos_embed, patch_pos_embed), dim=1)
+
+    def forward(self, pixel_values: torch.Tensor, bool_masked_pos: Optional[torch.Tensor] = None) -> torch.Tensor:
+        batch_size, _, height, width = pixel_values.shape
+        target_dtype = self.patch_embeddings.projection.weight.dtype
+        embeddings = self.patch_embeddings(pixel_values.to(dtype=target_dtype))
+        # Where bool_masked_pos is True, substitute the mask token for the patch embedding.
+        if bool_masked_pos is not None and self.use_mask_token:
+            embeddings = torch.where(
+                bool_masked_pos.unsqueeze(-1), self.mask_token.to(embeddings.dtype).unsqueeze(0), embeddings
+            )
+
+        # add the [CLS] token to the embedded patch tokens
+        cls_tokens = self.cls_token.expand(batch_size, -1, -1)
+        embeddings = torch.cat((cls_tokens, embeddings), dim=1)
+
+        # add positional encoding to each token
+        embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
+
+        embeddings = self.dropout(embeddings)
+
+        return embeddings
+
+
+class Dinov2PatchEmbeddings(nn.Module):
+    """
+    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
+    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
+    Transformer.
+    """
+
+    def __init__(self, config):
+        super().__init__()
+        image_size, patch_size = config.image_size, config.patch_size
+        num_channels, hidden_size = config.num_channels, config.hidden_size
+        # Both sizes may be a scalar or an iterable; a scalar becomes a square (size, size) pair.
+        image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
+        patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
+        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
+        self.image_size = image_size
+        self.patch_size = patch_size
+        self.num_channels = num_channels
+        self.num_patches = num_patches
+        # kernel_size == stride == patch_size, so each output position covers one non-overlapping patch.
+        self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)
+
+    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
+        num_channels = pixel_values.shape[1]
+        if num_channels != self.num_channels:
+            raise ValueError(
+                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
+                f" Expected {self.num_channels} but got {num_channels}."
+            )
+        embeddings = self.projection(pixel_values).flatten(2).transpose(1, 2)  # spatial axes -> one token axis
+        return embeddings
+
+
+# Copied from transformers.models.vit.modeling_vit.eager_attention_forward
+def eager_attention_forward(
+    module: nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: Optional[torch.Tensor],
+    scaling: float,
+    dropout: float = 0.0,
+    **kwargs,  # accepted but not used in this implementation
+):
+    # Take the dot product between "query" and "key" to get the raw attention scores.
+    attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling
+
+    # Normalize the attention scores to probabilities (softmax in float32, cast back to the query dtype).
+    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
+
+    # This is actually dropping out entire tokens to attend to, which might
+    # seem a bit unusual, but is taken from the original Transformer paper.
+    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
+
+    # Mask heads if we want to
+    if attention_mask is not None:
+        attn_weights = attn_weights * attention_mask
+    # Weighted sum of the values, then swap axes 1 and 2 and make the result contiguous.
+    attn_output = torch.matmul(attn_weights, value)
+    attn_output = attn_output.transpose(1, 2).contiguous()
+
+    return attn_output, attn_weights
+
+
+# Copied from transformers.models.vit.modeling_vit.ViTSelfAttention with ViT->Dinov2
+class Dinov2SelfAttention(nn.Module):
+    def __init__(self, config: Dinov2Config):
+        super().__init__()
+        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
+            raise ValueError(
+                f"The hidden size {config.hidden_size} is not a multiple of the number of attention "
+                f"heads {config.num_attention_heads}."
+            )
+        # Head bookkeeping: per-head width, combined width, and the head_size ** -0.5 score scaling.
+        self.config = config
+        self.num_attention_heads = config.num_attention_heads
+        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
+        self.all_head_size = self.num_attention_heads * self.attention_head_size
+        self.dropout_prob = config.attention_probs_dropout_prob
+        self.scaling = self.attention_head_size**-0.5
+        self.is_causal = False
+
+        self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
+        self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
+        self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
+
+    def forward(
+        self, hidden_states: torch.Tensor, head_mask: Optional[torch.Tensor] = None
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        batch_size = hidden_states.shape[0]
+        new_shape = batch_size, -1, self.num_attention_heads, self.attention_head_size
+        # Project, split the last axis into (heads, head_size), and move the heads axis to position 1.
+        key_layer = self.key(hidden_states).view(*new_shape).transpose(1, 2)
+        value_layer = self.value(hidden_states).view(*new_shape).transpose(1, 2)
+        query_layer = self.query(hidden_states).view(*new_shape).transpose(1, 2)
+        # Eager implementation by default; otherwise look up the backend named in the config.
+        attention_interface: Callable = eager_attention_forward
+        if self.config._attn_implementation != "eager":
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+        context_layer, attention_probs = attention_interface(
+            self,
+            query_layer,
+            key_layer,
+            value_layer,
+            head_mask,
+            is_causal=self.is_causal,
+            scaling=self.scaling,
+            dropout=0.0 if not self.training else self.dropout_prob,  # dropout only while training
+        )
+        # Collapse the last two axes into a single axis of width all_head_size.
+        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
+        context_layer = context_layer.reshape(new_context_layer_shape)
+
+        return context_layer, attention_probs
+
+
+# Copied from transformers.models.vit.modeling_vit.ViTSelfOutput with ViT->Dinov2
+class Dinov2SelfOutput(nn.Module):
+    """
+    The residual connection is defined in Dinov2Layer instead of here (as is the case with other models), due to the
+    layernorm applied before each block.
+    """
+
+    def __init__(self, config: Dinov2Config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+    # `input_tensor` is accepted but never read: only the dense projection and dropout are applied.
+    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        return hidden_states
+
+
+# Copied from transformers.models.vit.modeling_vit.ViTAttention with ViT->Dinov2
+class Dinov2Attention(nn.Module):
+    def __init__(self, config: Dinov2Config):
+        super().__init__()
+        self.attention = Dinov2SelfAttention(config)
+        self.output = Dinov2SelfOutput(config)
+        self.pruned_heads = set()
+    # Prunes the given heads from the query/key/value and output projections; returns early for an empty set.
+    def prune_heads(self, heads: set[int]):
+        if len(heads) == 0:
+            return
+        heads, index = find_pruneable_heads_and_indices(
+            heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads
+        )
+
+        # Prune linear layers
+        self.attention.query = prune_linear_layer(self.attention.query, index)
+        self.attention.key = prune_linear_layer(self.attention.key, index)
+        self.attention.value = prune_linear_layer(self.attention.value, index)
+        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
+
+        # Update hyper params and store pruned heads
+        self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads)
+        self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads
+        self.pruned_heads = self.pruned_heads.union(heads)
+    # Runs self-attention followed by the output projection; the attention probabilities are discarded.
+    def forward(self, hidden_states: torch.Tensor, head_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
+        self_attn_output, _ = self.attention(hidden_states, head_mask)
+        output = self.output(self_attn_output, hidden_states)
+        return output
+
+
+class Dinov2LayerScale(nn.Module):  # multiplies its input by a learnable vector
+    def __init__(self, config) -> None:
+        super().__init__()
+        self.lambda1 = nn.Parameter(config.layerscale_value * torch.ones(config.hidden_size))
+    # lambda1 has hidden_size entries, each initialised to config.layerscale_value.
+    def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
+        return hidden_state * self.lambda1
+
+
+# Copied from transformers.models.beit.modeling_beit.drop_path
+def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
+    """
+    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
+
+    Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
+    however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
+    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
+    layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
+    argument.
+    """
+    if drop_prob == 0.0 or not training:  # input is returned unchanged
+        return input
+    keep_prob = 1 - drop_prob
+    shape = (input.shape[0],) + (1,) * (input.ndim - 1)  # work with diff dim tensors, not just 2D ConvNets
+    random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
+    random_tensor.floor_()  # binarize: 1 with probability keep_prob, otherwise 0 (one draw per sample)
+    output = input.div(keep_prob) * random_tensor  # kept samples are rescaled by 1 / keep_prob
+    return output
+
+
+# Copied from transformers.models.beit.modeling_beit.BeitDropPath
+class Dinov2DropPath(nn.Module):
+    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
+
+    def __init__(self, drop_prob: Optional[float] = None) -> None:
+        super().__init__()
+        self.drop_prob = drop_prob
+    # Delegates to `drop_path`, passing the stored probability and this module's training flag.
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        return drop_path(hidden_states, self.drop_prob, self.training)
+    # Extra text for the module's printed representation.
+    def extra_repr(self) -> str:
+        return f"p={self.drop_prob}"
+
+
+class Dinov2MLP(nn.Module):  # two-layer feed-forward block: fc1 -> activation -> fc2
+    def __init__(self, config) -> None:
+        super().__init__()
+        in_features = out_features = config.hidden_size
+        hidden_features = int(config.hidden_size * config.mlp_ratio)
+        self.fc1 = nn.Linear(in_features, hidden_features, bias=True)
+        if isinstance(config.hidden_act, str):  # a name is looked up in ACT2FN
+            self.activation = ACT2FN[config.hidden_act]
+        else:  # anything else is used directly as the activation
+            self.activation = config.hidden_act
+        self.fc2 = nn.Linear(hidden_features, out_features, bias=True)
+    # Input and output widths are both hidden_size; only the intermediate width differs.
+    def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
+        hidden_state = self.fc1(hidden_state)
+        hidden_state = self.activation(hidden_state)
+        hidden_state = self.fc2(hidden_state)
+        return hidden_state
+
+
+class Dinov2SwiGLUFFN(nn.Module):
+    def __init__(self, config) -> None:
+        super().__init__()
+        in_features = out_features = config.hidden_size
+        hidden_features = int(config.hidden_size * config.mlp_ratio)
+        hidden_features = (int(hidden_features * 2 / 3) + 7) // 8 * 8  # 2/3 of the width, rounded up to a multiple of 8
+        # weights_in produces 2 * hidden_features so that forward can split it into two equal halves.
+        self.weights_in = nn.Linear(in_features, 2 * hidden_features, bias=True)
+        self.weights_out = nn.Linear(hidden_features, out_features, bias=True)
+    # Gated feed-forward: silu(first half) * second half, followed by the output projection.
+    def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
+        hidden_state = self.weights_in(hidden_state)
+        x1, x2 = hidden_state.chunk(2, dim=-1)
+        hidden = nn.functional.silu(x1) * x2
+        return self.weights_out(hidden)
+
+
+class Dinov2Layer(GradientCheckpointingLayer):
+    """This corresponds to the Block class in the original implementation."""
+
+    def __init__(self, config: Dinov2Config) -> None:
+        super().__init__()
+        # Attention branch: layer norm, attention, layer scale.
+        self.norm1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.attention = Dinov2Attention(config)
+        self.layer_scale1 = Dinov2LayerScale(config)
+        self.drop_path = Dinov2DropPath(config.drop_path_rate) if config.drop_path_rate > 0.0 else nn.Identity()
+        # drop_path is a plain Identity unless drop_path_rate > 0; forward() reuses it for both residual branches.
+        self.norm2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        # The feed-forward variant is selected by config.use_swiglu_ffn.
+        if config.use_swiglu_ffn:
+            self.mlp = Dinov2SwiGLUFFN(config)
+        else:
+            self.mlp = Dinov2MLP(config)
+        self.layer_scale2 = Dinov2LayerScale(config)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        head_mask: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        hidden_states_norm = self.norm1(hidden_states)
+        self_attention_output = self.attention(hidden_states_norm, head_mask)
+        self_attention_output = self.layer_scale1(self_attention_output)
+
+        # first residual connection
+        hidden_states = self.drop_path(self_attention_output) + hidden_states
+
+        # in Dinov2, layernorm is also applied after self-attention
+        layer_output = self.norm2(hidden_states)
+        layer_output = self.mlp(layer_output)
+        layer_output = self.layer_scale2(layer_output)
+
+        # second residual connection
+        layer_output = self.drop_path(layer_output) + hidden_states
+
+        return layer_output
+
+
+class Dinov2Encoder(nn.Module):
+    def __init__(self, config: Dinov2Config):
+        super().__init__()
+        self.config = config
+        self.layer = nn.ModuleList([Dinov2Layer(config) for _ in range(config.num_hidden_layers)])
+        self.gradient_checkpointing = False
+    # Runs the layers in order; with output_hidden_states, also collects the input and every layer's output.
+    def forward(
+        self, hidden_states: torch.Tensor, head_mask: Optional[torch.Tensor] = None, output_hidden_states: bool = False
+    ) -> BaseModelOutput:
+        all_hidden_states = [hidden_states] if output_hidden_states else None  # first entry is the encoder input
+        for i, layer_module in enumerate(self.layer):
+            layer_head_mask = head_mask[i] if head_mask is not None else None  # head_mask is indexed per layer
+            hidden_states = layer_module(hidden_states, layer_head_mask)
+            if all_hidden_states:
+                all_hidden_states.append(hidden_states)
+
+        return BaseModelOutput(
+            last_hidden_state=hidden_states,
+            hidden_states=tuple(all_hidden_states) if all_hidden_states else None,
+        )
+
+
+@auto_docstring
+class Dinov2PreTrainedModel(PreTrainedModel):
+    config: Dinov2Config
+    base_model_prefix = "dinov2"
+    main_input_name = "pixel_values"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["Dinov2Layer"]
+    _supports_sdpa = True
+    _supports_flash_attn = True
+    _supports_flex_attn = True
+    _supports_attention_backend = True
+    _can_record_outputs = {
+        "attentions": Dinov2SelfAttention,
+    }
+    # The initialisation applied depends on the module type; see the branches below.
+    def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None:
+        """Initialize the weights"""
+        if isinstance(module, (nn.Linear, nn.Conv2d)):
+            # Upcast the input in `fp32` and cast it back to desired `dtype` to avoid
+            # `trunc_normal_cpu` not implemented in `half` issues
+            module.weight.data = nn.init.trunc_normal_(
+                module.weight.data.to(torch.float32), mean=0.0, std=self.config.initializer_range
+            ).to(module.weight.dtype)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+        elif isinstance(module, Dinov2Embeddings):
+            module.position_embeddings.data = nn.init.trunc_normal_(
+                module.position_embeddings.data.to(torch.float32),
+                mean=0.0,
+                std=self.config.initializer_range,
+            ).to(module.position_embeddings.dtype)
+            # CLS token: same float32 truncated-normal scheme as the position embeddings.
+            module.cls_token.data = nn.init.trunc_normal_(
+                module.cls_token.data.to(torch.float32),
+                mean=0.0,
+                std=self.config.initializer_range,
+            ).to(module.cls_token.dtype)
+            # The mask token only exists when config.use_mask_token is set; it starts at zero.
+            if self.config.use_mask_token:
+                module.mask_token.data.zero_()
+        elif isinstance(module, Dinov2LayerScale):
+            module.lambda1.data.fill_(self.config.layerscale_value)
+
+
+@auto_docstring
+class Dinov2Model(Dinov2PreTrainedModel):
+    def __init__(self, config: Dinov2Config):
+        super().__init__(config)
+        self.config = config
+        # Embeddings (patch, CLS, position) followed by the stack of transformer layers.
+        self.embeddings = Dinov2Embeddings(config)
+        self.encoder = Dinov2Encoder(config)
+        # Final layer norm, applied to the encoder's last hidden state in forward().
+        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self) -> Dinov2PatchEmbeddings:
+        return self.embeddings.patch_embeddings
+
+    def _prune_heads(self, heads_to_prune: dict[int, list[int]]) -> None:
+        """
+        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
+        class PreTrainedModel
+        """
+        for layer, heads in heads_to_prune.items():
+            self.encoder.layer[layer].attention.prune_heads(heads)
+
+    @check_model_inputs
+    @auto_docstring
+    def forward(
+        self,
+        pixel_values: Optional[torch.Tensor] = None,
+        bool_masked_pos: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        output_hidden_states: Optional[bool] = None,
+        **kwargs,
+    ) -> BaseModelOutputWithPooling:
+        r"""
+        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, sequence_length)`):
+            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0). Only relevant for
+            pre-training.
+        """
+        if output_hidden_states is None:  # fall back to the value stored on the config
+            output_hidden_states = self.config.output_hidden_states
+
+        if pixel_values is None:
+            raise ValueError("You have to specify pixel_values")
+
+        # Prepare head mask if needed
+        # 1.0 in head_mask indicate we keep the head
+        # attention_probs has shape bsz x n_heads x N x N
+        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
+        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
+        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
+
+        embedding_output = self.embeddings(pixel_values, bool_masked_pos=bool_masked_pos)
+
+        encoder_outputs: BaseModelOutput = self.encoder(
+            embedding_output, head_mask=head_mask, output_hidden_states=output_hidden_states
+        )
+        sequence_output = encoder_outputs.last_hidden_state
+        sequence_output = self.layernorm(sequence_output)
+        pooled_output = sequence_output[:, 0, :]  # position 0 holds the CLS token
+
+        return BaseModelOutputWithPooling(
+            last_hidden_state=sequence_output,
+            pooler_output=pooled_output,
+            hidden_states=encoder_outputs.hidden_states,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    Dinov2 Model transformer with an image classification head on top (a linear layer on top of the final hidden state
+    of the [CLS] token) e.g. for ImageNet.
+    """
+)
+class Dinov2ForImageClassification(Dinov2PreTrainedModel):
+    def __init__(self, config: Dinov2Config) -> None:
+        super().__init__(config)
+
+        self.num_labels = config.num_labels
+        self.dinov2 = Dinov2Model(config)
+
+        # Classifier head: its input is hidden_size * 2 wide (CLS token + mean patch token, built in forward)
+        self.classifier = (
+            nn.Linear(config.hidden_size * 2, config.num_labels) if config.num_labels > 0 else nn.Identity()
+        )
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        pixel_values: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> ImageClassifierOutput:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+        """
+        outputs: BaseModelOutputWithPooling = self.dinov2(pixel_values, head_mask=head_mask, **kwargs)
+        # Position 0 on the sequence axis is the CLS token; the remaining positions are patch tokens.
+        sequence_output = outputs.last_hidden_state  # batch_size, sequence_length, hidden_size
+        cls_token = sequence_output[:, 0]
+        patch_tokens = sequence_output[:, 1:]
+        # Concatenate the CLS token with the mean over patch tokens along the feature axis.
+        linear_input = torch.cat([cls_token, patch_tokens.mean(dim=1)], dim=1)
+        logits = self.classifier(linear_input)
+
+        loss = None
+        if labels is not None:  # the loss is only computed when labels are supplied
+            loss = self.loss_function(labels, logits, self.config, **kwargs)
+
+        return ImageClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    Dinov2 backbone, to be used with frameworks like DETR and MaskFormer.
+    """
+)
+class Dinov2Backbone(Dinov2PreTrainedModel, BackboneMixin):
+    def __init__(self, config):
+        super().__init__(config)
+        super()._init_backbone(config)
+        # num_hidden_layers + 1 entries, all equal to hidden_size.
+        self.num_features = [config.hidden_size for _ in range(config.num_hidden_layers + 1)]
+        self.embeddings = Dinov2Embeddings(config)
+        self.encoder = Dinov2Encoder(config)
+
+        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self) -> Dinov2PatchEmbeddings:
+        return self.embeddings.patch_embeddings
+
+    @check_model_inputs
+    @auto_docstring
+    def forward(
+        self, pixel_values: torch.Tensor, output_hidden_states: Optional[bool] = None, **kwargs
+    ) -> BackboneOutput:
+        r"""
+        Examples:
+
+        ```python
+        >>> from transformers import AutoImageProcessor, AutoBackbone
+        >>> import torch
+        >>> from PIL import Image
+        >>> import requests
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> processor = AutoImageProcessor.from_pretrained("facebook/dinov2-base")
+        >>> model = AutoBackbone.from_pretrained(
+        ...     "facebook/dinov2-base", out_features=["stage2", "stage5", "stage8", "stage11"]
+        ... )
+
+        >>> inputs = processor(image, return_tensors="pt")
+
+        >>> outputs = model(**inputs)
+        >>> feature_maps = outputs.feature_maps
+        >>> list(feature_maps[-1].shape)
+        [1, 768, 16, 16]
+        ```"""
+        if output_hidden_states is None:
+            output_hidden_states = self.config.output_hidden_states
+        # The encoder always collects hidden states here, because the feature maps are selected from them.
+        embedding_output = self.embeddings(pixel_values)
+        output: BaseModelOutput = self.encoder(embedding_output, output_hidden_states=True)
+        hidden_states = output.hidden_states
+        # zip pairs hidden_states[i] with stage_names[i]; only stages listed in out_features are kept.
+        feature_maps = []
+        for stage, hidden_state in zip(self.stage_names, hidden_states):
+            if stage in self.out_features:
+                if self.config.apply_layernorm:
+                    hidden_state = self.layernorm(hidden_state)
+                if self.config.reshape_hidden_states:
+                    hidden_state = hidden_state[:, 1:]  # drop the CLS token at position 0
+                    # this was actually a bug in the original implementation that we copied here,
+                    # because normally the order is height, width
+                    batch_size, _, height, width = pixel_values.shape
+                    patch_size = self.config.patch_size
+                    hidden_state = hidden_state.reshape(batch_size, height // patch_size, width // patch_size, -1)
+                    hidden_state = hidden_state.permute(0, 3, 1, 2).contiguous()  # move the feature axis to position 1
+                feature_maps.append(hidden_state)
+
+        return BackboneOutput(
+            feature_maps=tuple(feature_maps),
+            hidden_states=hidden_states if output_hidden_states else None,
+        )
+
+
+__all__ = ["Dinov2ForImageClassification", "Dinov2Model", "Dinov2PreTrainedModel", "Dinov2Backbone"]
diff --git a/phivenv/Lib/site-packages/transformers/models/dinov2/modeling_flax_dinov2.py b/phivenv/Lib/site-packages/transformers/models/dinov2/modeling_flax_dinov2.py
new file mode 100644
index 0000000000000000000000000000000000000000..b9ea2eaa3ebc5ac5db192dde220dfe114ef38235
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/dinov2/modeling_flax_dinov2.py
@@ -0,0 +1,801 @@
+# coding=utf-8
+# Copyright 2023 Meta AI and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Flax DINOv2 model."""
+
+import collections.abc
+import math
+from typing import Optional
+
+import flax.linen as nn
+import jax
+import jax.numpy as jnp
+from flax.core.frozen_dict import FrozenDict, freeze, unfreeze
+from flax.linen.attention import dot_product_attention_weights
+from flax.traverse_util import flatten_dict, unflatten_dict
+
+from ...modeling_flax_outputs import FlaxBaseModelOutput, FlaxBaseModelOutputWithPooling, FlaxSequenceClassifierOutput
+from ...modeling_flax_utils import (
+    ACT2FN,
+    FlaxPreTrainedModel,
+    append_replace_return_docstrings,
+    overwrite_call_docstring,
+)
+from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward
+from .configuration_dinov2 import Dinov2Config
+
+
+DINOV2_START_DOCSTRING = r"""
+
+    This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its model (such as downloading, saving and converting weights from PyTorch models)
+
+    This model is also a
+    [flax.linen.Module](https://flax.readthedocs.io/en/latest/api_reference/flax.linen/module.html) subclass. Use it as
+    a regular Flax linen Module and refer to the Flax documentation for all matter related to general usage and
+    behavior.
+
+    Finally, this model supports inherent JAX features such as:
+
+    - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
+    - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
+    - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
+    - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)
+
+    Parameters:
+        config ([`Dinov2Config`]): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the model weights.
+        dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`):
+            The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on GPUs) and
+            `jax.numpy.bfloat16` (on TPUs).
+
+            This can be used to enable mixed-precision training or half-precision inference on GPUs or TPUs. If
+            specified all the computation will be performed with the given `dtype`.
+
+            **Note that this only specifies the dtype of the computation and does not influence the dtype of model
+            parameters.**
+
+            If you wish to change the dtype of the model parameters, see [`~FlaxPreTrainedModel.to_fp16`] and
+            [`~FlaxPreTrainedModel.to_bf16`].
+"""
+
+DINOV2_INPUTS_DOCSTRING = r"""
+    Args:
+        pixel_values (`numpy.ndarray` of shape `(batch_size, num_channels, height, width)`):
+            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`Dinov2ImageProcessor.__call__`]
+            for details.
+
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+class FlaxDinov2PatchEmbeddings(nn.Module):
+    config: Dinov2Config
+    dtype: jnp.dtype = jnp.float32  # the dtype of the computation
+
+    def setup(self):
+        image_size = self.config.image_size
+        patch_size = self.config.patch_size
+        image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
+        patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
+        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
+
+        self.num_patches = num_patches
+        self.num_channels = self.config.num_channels
+        self.projection = nn.Conv(
+            self.config.hidden_size,
+            kernel_size=patch_size,
+            strides=patch_size,
+            padding="VALID",
+            dtype=self.dtype,
+            kernel_init=jax.nn.initializers.variance_scaling(
+                self.config.initializer_range**2, "fan_in", "truncated_normal"
+            ),
+        )
+
+    # Copied from transformers.models.vit.modeling_flax_vit.FlaxViTPatchEmbeddings.__call__
+    def __call__(self, pixel_values):
+        num_channels = pixel_values.shape[-1]
+        if num_channels != self.num_channels:
+            raise ValueError(
+                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
+            )
+        embeddings = self.projection(pixel_values)
+        batch_size, _, _, channels = embeddings.shape
+        return jnp.reshape(embeddings, (batch_size, -1, channels))
+
+
+class FlaxDinov2Embeddings(nn.Module):
+    """Construct the CLS token, position and patch embeddings."""
+
+    config: Dinov2Config
+    dtype: jnp.dtype = jnp.float32  # the dtype of the computation
+
+    def setup(self):
+        self.cls_token = self.param(
+            "cls_token",
+            jax.nn.initializers.variance_scaling(self.config.initializer_range**2, "fan_in", "truncated_normal"),
+            (1, 1, self.config.hidden_size),
+        )
+        if self.config.use_mask_token:
+            self.mask_token = self.param(
+                "mask_token",
+                jax.nn.initializers.variance_scaling(self.config.initializer_range**2, "fan_in", "truncated_normal"),
+                (1, self.config.hidden_size),
+            )
+        self.patch_embeddings = FlaxDinov2PatchEmbeddings(self.config, dtype=self.dtype)
+        num_patches = self.patch_embeddings.num_patches
+        self.position_embeddings = self.param(
+            "position_embeddings",
+            jax.nn.initializers.variance_scaling(self.config.initializer_range**2, "fan_in", "truncated_normal"),
+            (1, num_patches + 1, self.config.hidden_size),
+        )
+        self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob)
+
+    def interpolate_pos_encoding(self, config, hidden_states, height, width, position_embeddings):
+        num_patches = hidden_states.shape[1] - 1
+        num_positions = position_embeddings.shape[1] - 1
+        if num_patches == num_positions and height == width:
+            return position_embeddings
+        class_pos_embed = position_embeddings[:, 0]
+        patch_pos_embed = position_embeddings[:, 1:]
+        dim = hidden_states.shape[-1]
+
+        h = height // config.patch_size
+        w = width // config.patch_size
+        height, width = h + 0.1, w + 0.1
+
+        patch_pos_embed = patch_pos_embed.reshape(
+            (1, int(math.sqrt(num_positions)), int(math.sqrt(num_positions)), dim)
+        )
+        patch_pos_embed = jnp.transpose(patch_pos_embed, (0, 3, 1, 2))
+        target_dtype = patch_pos_embed.dtype
+        new_height_ratio = jnp.float32(height / math.sqrt(num_positions))
+        new_width_ratio = jnp.float32(width / math.sqrt(num_positions))
+
+        scale = jnp.array([new_height_ratio, new_width_ratio], dtype=jnp.float32)
+        translation = jnp.array([0.0, 0.0], dtype=jnp.float32)
+
+        patch_pos_embed = jax.image.scale_and_translate(
+            patch_pos_embed.astype(jnp.float32),
+            shape=(patch_pos_embed.shape[0], patch_pos_embed.shape[1], h, w),
+            spatial_dims=(2, 3),
+            scale=scale,
+            translation=translation,
+            method="bicubic",
+            antialias=False,
+        )
+        patch_pos_embed = patch_pos_embed.astype(target_dtype)
+        patch_pos_embed = jnp.transpose(patch_pos_embed, (0, 2, 3, 1)).reshape((position_embeddings.shape[0], -1, dim))
+        patch_pos_embed_expanded = jnp.tile(patch_pos_embed, (hidden_states.shape[0], 1, 1))
+        class_pos_embed_expanded = jnp.tile(class_pos_embed, (hidden_states.shape[0], 1, 1))
+
+        return jnp.concatenate((class_pos_embed_expanded, patch_pos_embed_expanded), axis=1)
+
+    def __call__(self, pixel_values, deterministic=True):
+        batch_size = pixel_values.shape[0]
+        target_dtype = self.patch_embeddings.projection.dtype
+        height, width = pixel_values.shape[1], pixel_values.shape[2]
+
+        embeddings = self.patch_embeddings(pixel_values.astype(target_dtype))
+
+        cls_tokens = jnp.broadcast_to(self.cls_token, (batch_size, 1, self.config.hidden_size))
+        embeddings = jnp.concatenate((cls_tokens, embeddings), axis=1)
+
+        embeddings = embeddings + self.interpolate_pos_encoding(
+            self.config, embeddings, height, width, self.position_embeddings
+        )
+
+        embeddings = self.dropout(embeddings, deterministic=deterministic)
+        return embeddings
+
+
+# Copied from transformers.models.vit.modeling_flax_vit.FlaxViTSelfAttention with ViT->Dinov2
+class FlaxDinov2SelfAttention(nn.Module):
+    config: Dinov2Config
+    dtype: jnp.dtype = jnp.float32  # the dtype of the computation
+
+    def setup(self):
+        if self.config.hidden_size % self.config.num_attention_heads != 0:
+            raise ValueError(
+                "`config.hidden_size`: {self.config.hidden_size} has to be a multiple of `config.num_attention_heads`:"
+                " {self.config.num_attention_heads}"
+            )
+
+        self.query = nn.Dense(
+            self.config.hidden_size,
+            dtype=self.dtype,
+            kernel_init=jax.nn.initializers.variance_scaling(
+                self.config.initializer_range**2, mode="fan_in", distribution="truncated_normal"
+            ),
+            use_bias=self.config.qkv_bias,
+        )
+        self.key = nn.Dense(
+            self.config.hidden_size,
+            dtype=self.dtype,
+            kernel_init=jax.nn.initializers.variance_scaling(
+                self.config.initializer_range**2, mode="fan_in", distribution="truncated_normal"
+            ),
+            use_bias=self.config.qkv_bias,
+        )
+        self.value = nn.Dense(
+            self.config.hidden_size,
+            dtype=self.dtype,
+            kernel_init=jax.nn.initializers.variance_scaling(
+                self.config.initializer_range**2, mode="fan_in", distribution="truncated_normal"
+            ),
+            use_bias=self.config.qkv_bias,
+        )
+
+    def __call__(self, hidden_states, deterministic: bool = True, output_attentions: bool = False):
+        head_dim = self.config.hidden_size // self.config.num_attention_heads
+
+        query_states = self.query(hidden_states).reshape(
+            hidden_states.shape[:2] + (self.config.num_attention_heads, head_dim)
+        )
+        value_states = self.value(hidden_states).reshape(
+            hidden_states.shape[:2] + (self.config.num_attention_heads, head_dim)
+        )
+        key_states = self.key(hidden_states).reshape(
+            hidden_states.shape[:2] + (self.config.num_attention_heads, head_dim)
+        )
+
+        dropout_rng = None
+        if not deterministic and self.config.attention_probs_dropout_prob > 0.0:
+            dropout_rng = self.make_rng("dropout")
+
+        attn_weights = dot_product_attention_weights(
+            query_states,
+            key_states,
+            dropout_rng=dropout_rng,
+            dropout_rate=self.config.attention_probs_dropout_prob,
+            broadcast_dropout=True,
+            deterministic=deterministic,
+            dtype=self.dtype,
+            precision=None,
+        )
+
+        attn_output = jnp.einsum("...hqk,...khd->...qhd", attn_weights, value_states)
+        attn_output = attn_output.reshape(attn_output.shape[:2] + (-1,))
+
+        outputs = (attn_output, attn_weights) if output_attentions else (attn_output,)
+        return outputs
+
+
+# Copied from transformers.models.vit.modeling_flax_vit.FlaxViTSelfOutput with ViT->Dinov2
+class FlaxDinov2SelfOutput(nn.Module):
+    config: Dinov2Config
+    dtype: jnp.dtype = jnp.float32  # the dtype of the computation
+
+    def setup(self):
+        self.dense = nn.Dense(
+            self.config.hidden_size,
+            kernel_init=jax.nn.initializers.variance_scaling(
+                self.config.initializer_range**2, "fan_in", "truncated_normal"
+            ),
+            dtype=self.dtype,
+        )
+        self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob)
+
+    def __call__(self, hidden_states, input_tensor, deterministic: bool = True):
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.dropout(hidden_states, deterministic=deterministic)
+        return hidden_states
+
+
+# Copied from transformers.models.vit.modeling_flax_vit.FlaxViTAttention with ViT->Dinov2
+class FlaxDinov2Attention(nn.Module):
+    config: Dinov2Config
+    dtype: jnp.dtype = jnp.float32
+
+    def setup(self):
+        self.attention = FlaxDinov2SelfAttention(self.config, dtype=self.dtype)
+        self.output = FlaxDinov2SelfOutput(self.config, dtype=self.dtype)
+
+    def __call__(self, hidden_states, deterministic=True, output_attentions: bool = False):
+        attn_outputs = self.attention(hidden_states, deterministic=deterministic, output_attentions=output_attentions)
+        attn_output = attn_outputs[0]
+        hidden_states = self.output(attn_output, hidden_states, deterministic=deterministic)
+
+        outputs = (hidden_states,)
+
+        if output_attentions:
+            outputs += (attn_outputs[1],)
+
+        return outputs
+
+
+def ones_with_scale(key, shape, scale, dtype=jnp.float32):
+    return jnp.ones(shape, dtype) * scale
+
+
+class FlaxDinov2LayerScale(nn.Module):
+    config: Dinov2Config
+    dtype: jnp.dtype = jnp.float32  # the dtype of the computation
+
+    def setup(self):
+        self.lambda1 = self.config.layerscale_value * self.param(
+            "lambda1",
+            jax.nn.initializers.ones,
+            (self.config.hidden_size,),
+        )
+        self.lambda1 = self.lambda1 * self.config.layerscale_value
+
+    def __call__(self, hidden_states):
+        return self.lambda1 * hidden_states
+
+
+# Copied from transformers.models.beit.modeling_flax_beit.FlaxBeitDropPath with Beit -> Dinov2
+class FlaxDinov2DropPath(nn.Module):
+    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
+
+    rate: float
+
+    @nn.module.compact
+    def __call__(self, inputs, deterministic: Optional[bool] = True):
+        if self.rate == 0.0:
+            return inputs
+        keep_prob = 1.0 - self.rate
+        if deterministic:
+            return inputs
+        else:
+            shape = (inputs.shape[0],) + (1,) * (inputs.ndim - 1)  # work with diff dim tensors, not just 2D ConvNets
+            rng = self.make_rng("droppath")
+            random_tensor = keep_prob + jax.random.uniform(rng, shape=shape, dtype=inputs.dtype)
+            binary_tensor = jnp.floor(random_tensor)
+            output = inputs / keep_prob * binary_tensor
+            return output
+
+
+class FlaxDinov2MLP(nn.Module):
+    config: Dinov2Config
+    dtype: jnp.dtype = jnp.float32  # the dtype of the computation
+
+    def setup(self):
+        self.fc1 = nn.Dense(
+            self.config.hidden_size * self.config.mlp_ratio,
+            kernel_init=jax.nn.initializers.variance_scaling(
+                self.config.initializer_range**2, "fan_in", "truncated_normal"
+            ),
+            dtype=self.dtype,
+        )
+        self.fc2 = nn.Dense(
+            self.config.hidden_size,
+            kernel_init=jax.nn.initializers.variance_scaling(
+                self.config.initializer_range**2, "fan_in", "truncated_normal"
+            ),
+            dtype=self.dtype,
+        )
+        if isinstance(self.config.hidden_act, str):
+            self.act = ACT2FN[self.config.hidden_act]
+        else:
+            self.act = self.config.hidden_act
+
+    def __call__(self, hidden_states):
+        hidden_states = self.fc1(hidden_states)
+        hidden_states = self.act(hidden_states)
+        hidden_states = self.fc2(hidden_states)
+        return hidden_states
+
+
+class FlaxDinov2SwiGLUFFN(nn.Module):
+    config: Dinov2Config
+    dtype: jnp.dtype = jnp.float32  # the dtype of the computation
+
+    def setup(self):
+        hidden_features = int(self.config.hidden_size * self.config.mlp_ratio)
+        hidden_features = (int(hidden_features * 2 / 3) + 7) // 8 * 8
+
+        self.weights_in = nn.Dense(
+            2 * hidden_features,
+            kernel_init=jax.nn.initializers.variance_scaling(
+                self.config.initializer_range**2, "fan_in", "truncated_normal"
+            ),
+            dtype=self.dtype,
+        )
+        self.weights_out = nn.Dense(
+            self.config.hidden_size,
+            kernel_init=jax.nn.initializers.variance_scaling(
+                self.config.initializer_range**2, "fan_in", "truncated_normal"
+            ),
+            dtype=self.dtype,
+        )
+
+    def __call__(self, hidden_states):
+        hidden_states = self.weights_in(hidden_states)
+        x1, x2 = jnp.split(hidden_states, 2, axis=-1)
+        hidden = nn.silu(x1) * x2
+        return self.weights_out(hidden)
+
+
+class FlaxDinov2Layer(nn.Module):
+    config: Dinov2Config
+    dtype: jnp.dtype = jnp.float32  # the dtype of the computation
+
+    def setup(self):
+        self.norm1 = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype)
+        self.attention = FlaxDinov2Attention(self.config, dtype=self.dtype)
+        self.layer_scale1 = FlaxDinov2LayerScale(self.config, dtype=self.dtype)
+        self.drop_path = FlaxDinov2DropPath(self.config.drop_path_rate)
+        self.norm2 = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype)
+
+        if self.config.use_swiglu_ffn:
+            self.mlp = FlaxDinov2SwiGLUFFN(self.config, dtype=self.dtype)
+        else:
+            self.mlp = FlaxDinov2MLP(self.config, dtype=self.dtype)
+
+        self.layer_scale2 = FlaxDinov2LayerScale(self.config, dtype=self.dtype)
+
+    def __call__(self, hidden_states, deterministic: bool = True, output_attentions: bool = False):
+        self_attention_outputs = self.attention(
+            self.norm1(hidden_states),  # in Dinov2, layernorm is applied before self-attention
+            deterministic=deterministic,
+            output_attentions=output_attentions,
+        )
+
+        attention_output = self_attention_outputs[0]
+
+        attention_output = self.layer_scale1(attention_output)
+
+        outputs = self_attention_outputs[1:]
+
+        # first residual connection
+        hidden_states = self.drop_path(attention_output) + hidden_states
+
+        # in Dinov2, layernorm is also applied after self-attention
+        layer_output = self.norm2(hidden_states)
+        layer_output = self.mlp(layer_output)
+        layer_output = self.layer_scale2(layer_output)
+
+        # second residual connection
+        layer_output = self.drop_path(layer_output) + hidden_states
+
+        outputs = (layer_output,) + outputs
+
+        return outputs
+
+
+# Copied from transformers.models.vit.modeling_flax_vit.FlaxViTLayerCollection with ViT->Dinov2
+class FlaxDinov2LayerCollection(nn.Module):
+    config: Dinov2Config
+    dtype: jnp.dtype = jnp.float32  # the dtype of the computation
+
+    def setup(self):
+        self.layers = [
+            FlaxDinov2Layer(self.config, name=str(i), dtype=self.dtype) for i in range(self.config.num_hidden_layers)
+        ]
+
+    def __call__(
+        self,
+        hidden_states,
+        deterministic: bool = True,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+    ):
+        all_attentions = () if output_attentions else None
+        all_hidden_states = () if output_hidden_states else None
+
+        for i, layer in enumerate(self.layers):
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+
+            layer_outputs = layer(hidden_states, deterministic=deterministic, output_attentions=output_attentions)
+
+            hidden_states = layer_outputs[0]
+
+            if output_attentions:
+                all_attentions += (layer_outputs[1],)
+
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+
+        outputs = (hidden_states,)
+        if not return_dict:
+            return tuple(v for v in outputs if v is not None)
+
+        return FlaxBaseModelOutput(
+            last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions
+        )
+
+
+# Copied from transformers.models.vit.modeling_flax_vit.FlaxViTEncoder with ViT->Dinov2
+class FlaxDinov2Encoder(nn.Module):
+    config: Dinov2Config
+    dtype: jnp.dtype = jnp.float32  # the dtype of the computation
+
+    def setup(self):
+        self.layer = FlaxDinov2LayerCollection(self.config, dtype=self.dtype)
+
+    def __call__(
+        self,
+        hidden_states,
+        deterministic: bool = True,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+    ):
+        return self.layer(
+            hidden_states,
+            deterministic=deterministic,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+
+class FlaxDinov2PreTrainedModel(FlaxPreTrainedModel):
+    """
+    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+    models.
+    """
+
+    config_class = Dinov2Config
+    base_model_prefix = "dinov2"
+    main_input_name = "pixel_values"
+    module_class: nn.Module = None
+
+    def __init__(
+        self,
+        config: Dinov2Config,
+        input_shape=None,
+        seed: int = 0,
+        dtype: jnp.dtype = jnp.float32,
+        _do_init: bool = True,
+        **kwargs,
+    ):
+        module = self.module_class(config=config, dtype=dtype, **kwargs)
+        if input_shape is None:
+            input_shape = (1, config.image_size, config.image_size, config.num_channels)
+        super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init)
+
+    def init_weights(self, rng: jax.random.PRNGKey, input_shape: tuple, params: FrozenDict = None) -> FrozenDict:
+        # init input tensors
+        pixel_values = jnp.zeros(input_shape, dtype=self.dtype)
+
+        params_rng, dropout_rng = jax.random.split(rng)
+        dropout_rng, droppath_rng = jax.random.split(dropout_rng)
+        rngs = {"params": params_rng, "dropout": dropout_rng, "droppath": droppath_rng}
+
+        random_params = self.module.init(rngs, pixel_values, return_dict=False)["params"]
+
+        if params is not None:
+            random_params = flatten_dict(unfreeze(random_params))
+            params = flatten_dict(unfreeze(params))
+            for missing_key in self._missing_keys:
+                params[missing_key] = random_params[missing_key]
+            self._missing_keys = set()
+            return freeze(unflatten_dict(params))
+        else:
+            return random_params
+
+    @add_start_docstrings_to_model_forward(DINOV2_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    def __call__(
+        self,
+        pixel_values,
+        params: Optional[dict] = None,
+        dropout_rng: jax.random.PRNGKey = None,
+        train: bool = False,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ):
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.return_dict
+
+        pixel_values = jnp.transpose(pixel_values, (0, 2, 3, 1))
+        # Handle any PRNG if needed
+        rngs = {}
+        if dropout_rng is not None:
+            dropout_rng, droppath_rng = jax.random.split(dropout_rng)
+            rngs["dropout"] = dropout_rng
+            rngs["droppath"] = droppath_rng
+
+        return self.module.apply(
+            {"params": params or self.params},
+            jnp.array(pixel_values, dtype=jnp.float32),
+            not train,
+            output_attentions,
+            output_hidden_states,
+            return_dict,
+            rngs=rngs,
+        )
+
+
+class FlaxDinov2Module(nn.Module):
+    config: Dinov2Config
+    dtype: jnp.dtype = jnp.float32  # the dtype of the computation
+
+    def setup(self):
+        self.embeddings = FlaxDinov2Embeddings(self.config, dtype=self.dtype)
+        self.encoder = FlaxDinov2Encoder(self.config, dtype=self.dtype)
+        self.layernorm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype)
+
+    def __call__(
+        self,
+        pixel_values,
+        deterministic: bool = True,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+    ):
+        hidden_states = self.embeddings(pixel_values, deterministic=deterministic)
+
+        encoder_outputs = self.encoder(
+            hidden_states,
+            deterministic=deterministic,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        sequence_output = encoder_outputs[0]
+        sequence_output = self.layernorm(sequence_output)
+        pooled_output = sequence_output[:, 0, :]
+
+        if not return_dict:
+            head_outputs = (sequence_output, pooled_output)
+            return head_outputs + encoder_outputs[1:]
+
+        return FlaxBaseModelOutputWithPooling(
+            last_hidden_state=sequence_output,
+            pooler_output=pooled_output,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+        )
+
+
+@add_start_docstrings(
+    "The bare Dinov2 Model transformer outputting raw hidden-states without any specific head on top.",
+    DINOV2_START_DOCSTRING,
+)
+class FlaxDinov2Model(FlaxDinov2PreTrainedModel):
+    module_class = FlaxDinov2Module
+
+
+FLAX_VISION_MODEL_DOCSTRING = """
+    Returns:
+
+    Examples:
+
+    ```python
+    >>> from transformers import AutoImageProcessor, FlaxDinov2Model
+    >>> from PIL import Image
+    >>> import requests
+
+    >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+    >>> image = Image.open(requests.get(url, stream=True).raw)
+
+    >>> image_processor = AutoImageProcessor.from_pretrained("facebook/dinov2-base")
+    >>> model = FlaxDinov2Model.from_pretrained("facebook/dinov2-base")
+
+    >>> inputs = image_processor(images=image, return_tensors="np")
+    >>> outputs = model(**inputs)
+    >>> last_hidden_states = outputs.last_hidden_state
+    ```
+"""
+
+overwrite_call_docstring(FlaxDinov2Model, FLAX_VISION_MODEL_DOCSTRING)
+append_replace_return_docstrings(
+    FlaxDinov2Model, output_type=FlaxBaseModelOutputWithPooling, config_class=Dinov2Config
+)
+
+
+class FlaxDinov2ForImageClassificationModule(nn.Module):
+    config: Dinov2Config
+    dtype: jnp.dtype = jnp.float32
+
+    def setup(self):
+        self.dinov2 = FlaxDinov2Module(config=self.config, dtype=self.dtype)
+        self.classifier = nn.Dense(
+            self.config.num_labels,
+            dtype=self.dtype,
+            kernel_init=jax.nn.initializers.variance_scaling(
+                self.config.initializer_range**2, "fan_in", "truncated_normal"
+            ),
+        )
+
+    def __call__(
+        self,
+        pixel_values=None,
+        deterministic: bool = True,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+    ):
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.dinov2(
+            pixel_values,
+            deterministic=deterministic,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        hidden_states = outputs[0]
+
+        cls_token = hidden_states[:, 0]
+        patch_tokens = hidden_states[:, 1:]
+        linear_input = jnp.concatenate([cls_token, patch_tokens.mean(axis=1)], axis=-1)
+
+        logits = self.classifier(linear_input)
+
+        if not return_dict:
+            output = (logits,) + outputs[2:]
+            return output
+
+        return FlaxSequenceClassifierOutput(
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+@add_start_docstrings(
+    """
+    Dinov2 Model transformer with an image classification head on top (a linear layer on top of the final hidden state of
+    the [CLS] token) e.g. for ImageNet.
+    """,
+    DINOV2_START_DOCSTRING,
+)
+class FlaxDinov2ForImageClassification(FlaxDinov2PreTrainedModel):
+    module_class = FlaxDinov2ForImageClassificationModule
+
+
+FLAX_VISION_CLASSIFICATION_DOCSTRING = """
+    Returns:
+
+    Example:
+
+    ```python
+    >>> from transformers import AutoImageProcessor, FlaxDinov2ForImageClassification
+    >>> from PIL import Image
+    >>> import jax
+    >>> import requests
+
+    >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+    >>> image = Image.open(requests.get(url, stream=True).raw)
+
+    >>> image_processor = AutoImageProcessor.from_pretrained("facebook/dinov2-base-imagenet1k-1-layer")
+    >>> model = FlaxDinov2ForImageClassification.from_pretrained("facebook/dinov2-base-imagenet1k-1-layer", from_pt=True)
+
+    >>> inputs = image_processor(images=image, return_tensors="np")
+    >>> outputs = model(**inputs)
+    >>> logits = outputs.logits
+
+    >>> # model predicts one of the 1000 ImageNet classes
+    >>> predicted_class_idx = jax.numpy.argmax(logits, axis=-1)
+    >>> print("Predicted class:", model.config.id2label[predicted_class_idx.item()])
+    ```
+"""
+
+overwrite_call_docstring(FlaxDinov2ForImageClassification, FLAX_VISION_CLASSIFICATION_DOCSTRING)
+append_replace_return_docstrings(
+    FlaxDinov2ForImageClassification, output_type=FlaxSequenceClassifierOutput, config_class=Dinov2Config
+)
+
+
+__all__ = ["FlaxDinov2ForImageClassification", "FlaxDinov2Model", "FlaxDinov2PreTrainedModel"]
diff --git a/phivenv/Lib/site-packages/transformers/models/dinov2_with_registers/__init__.py b/phivenv/Lib/site-packages/transformers/models/dinov2_with_registers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..2d10027b6a3b6375235a6785df044e8f0ce5fb33
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/dinov2_with_registers/__init__.py
@@ -0,0 +1,27 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_dinov2_with_registers import *
+    from .modeling_dinov2_with_registers import *
+else:
+    import sys
+    # Outside type checking, replace this module with a _LazyModule built from define_import_structure(_file).
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/phivenv/Lib/site-packages/transformers/models/dinov2_with_registers/__pycache__/__init__.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/dinov2_with_registers/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d0984d541cfb3c0e40da7969608d5fb30bb6447e
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/dinov2_with_registers/__pycache__/__init__.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/dinov2_with_registers/__pycache__/configuration_dinov2_with_registers.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/dinov2_with_registers/__pycache__/configuration_dinov2_with_registers.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7d7d92c20db72a6c6b9ac68e098d1356da403034
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/dinov2_with_registers/__pycache__/configuration_dinov2_with_registers.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/dinov2_with_registers/__pycache__/modeling_dinov2_with_registers.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/dinov2_with_registers/__pycache__/modeling_dinov2_with_registers.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9ce7638eba163644ccb212ae9fa7984c5903ba09
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/dinov2_with_registers/__pycache__/modeling_dinov2_with_registers.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/dinov2_with_registers/__pycache__/modular_dinov2_with_registers.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/dinov2_with_registers/__pycache__/modular_dinov2_with_registers.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ec60367ae9fd5eb065435a4cecef7e0c0cd8ffe9
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/dinov2_with_registers/__pycache__/modular_dinov2_with_registers.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/dinov2_with_registers/configuration_dinov2_with_registers.py b/phivenv/Lib/site-packages/transformers/models/dinov2_with_registers/configuration_dinov2_with_registers.py
new file mode 100644
index 0000000000000000000000000000000000000000..ec4f446fc684f40d634927c1e7a52b64c5732b12
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/dinov2_with_registers/configuration_dinov2_with_registers.py
@@ -0,0 +1,159 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/dinov2_with_registers/modular_dinov2_with_registers.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_dinov2_with_registers.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# coding=utf-8
+# Copyright 2024 Meta Inc. and the HuggingFace Inc. team. All rights reserved.
+#
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ...configuration_utils import PretrainedConfig
+from ...utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices
+
+
+class Dinov2WithRegistersConfig(BackboneConfigMixin, PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Dinov2WithRegistersModel`]. It is used to instantiate a
+    Dinov2WithRegisters model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the DINOv2 with Registers
+    [facebook/dinov2-with-registers-base](https://huggingface.co/facebook/dinov2-with-registers-base) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        hidden_size (`int`, *optional*, defaults to 768):
+            Dimensionality of the encoder layers and the pooler layer.
+        num_hidden_layers (`int`, *optional*, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 12):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        mlp_ratio (`int`, *optional*, defaults to 4):
+            Ratio of the hidden size of the MLPs relative to the `hidden_size`.
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"selu"` and `"gelu_new"` are supported.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-06):
+            The epsilon used by the layer normalization layers.
+        image_size (`int`, *optional*, defaults to 224):
+            The size (resolution) of each image.
+        patch_size (`int`, *optional*, defaults to 16):
+            The size (resolution) of each patch.
+        num_channels (`int`, *optional*, defaults to 3):
+            The number of input channels.
+        qkv_bias (`bool`, *optional*, defaults to `True`):
+            Whether to add a bias to the queries, keys and values.
+        layerscale_value (`float`, *optional*, defaults to 1.0):
+            Initial value to use for layer scale.
+        drop_path_rate (`float`, *optional*, defaults to 0.0):
+            Stochastic depth rate per sample (when applied in the main path of residual layers).
+        use_swiglu_ffn (`bool`, *optional*, defaults to `False`):
+            Whether to use the SwiGLU feedforward neural network.
+        num_register_tokens (`int`, *optional*, defaults to 4):
+            Number of register tokens to use.
+        out_features (`list[str]`, *optional*):
+            If used as backbone, list of features to output. Can be any of `"stem"`, `"stage1"`, `"stage2"`, etc.
+            (depending on how many stages the model has). If unset and `out_indices` is set, will default to the
+            corresponding stages. If unset and `out_indices` is unset, will default to the last stage. Must be in the
+            same order as defined in the `stage_names` attribute.
+        out_indices (`list[int]`, *optional*):
+            If used as backbone, list of indices of features to output. Can be any of 0, 1, 2, etc. (depending on how
+            many stages the model has). If unset and `out_features` is set, will default to the corresponding stages.
+            If unset and `out_features` is unset, will default to the last stage. Must be in the
+            same order as defined in the `stage_names` attribute.
+        apply_layernorm (`bool`, *optional*, defaults to `True`):
+            Whether to apply layer normalization to the feature maps in case the model is used as backbone.
+        reshape_hidden_states (`bool`, *optional*, defaults to `True`):
+            Whether to reshape the feature maps to 4D tensors of shape `(batch_size, hidden_size, height, width)` in
+            case the model is used as backbone. If `False`, the feature maps will be 3D tensors of shape `(batch_size,
+            seq_len, hidden_size)`.
+
+    Example:
+
+    ```python
+    >>> from transformers import Dinov2WithRegistersConfig, Dinov2WithRegistersModel
+
+    >>> # Initializing a Dinov2WithRegisters base style configuration
+    >>> configuration = Dinov2WithRegistersConfig()
+
+    >>> # Initializing a model (with random weights) from the base style configuration
+    >>> model = Dinov2WithRegistersModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "dinov2_with_registers"
+
+    def __init__(
+        self,
+        hidden_size=768,
+        num_hidden_layers=12,
+        num_attention_heads=12,
+        mlp_ratio=4,
+        hidden_act="gelu",
+        hidden_dropout_prob=0.0,
+        attention_probs_dropout_prob=0.0,
+        initializer_range=0.02,
+        layer_norm_eps=1e-6,
+        image_size=224,
+        patch_size=16,
+        num_channels=3,
+        qkv_bias=True,
+        layerscale_value=1.0,
+        drop_path_rate=0.0,
+        use_swiglu_ffn=False,
+        num_register_tokens=4,
+        out_features=None,
+        out_indices=None,
+        apply_layernorm=True,
+        reshape_hidden_states=True,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.mlp_ratio = mlp_ratio
+        self.hidden_act = hidden_act
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.initializer_range = initializer_range
+        self.layer_norm_eps = layer_norm_eps
+        self.image_size = image_size
+        self.patch_size = patch_size
+        self.num_channels = num_channels
+        self.qkv_bias = qkv_bias
+        self.layerscale_value = layerscale_value
+        self.drop_path_rate = drop_path_rate
+        self.use_swiglu_ffn = use_swiglu_ffn
+        self.num_register_tokens = num_register_tokens
+        self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, num_hidden_layers + 1)]
+        self._out_features, self._out_indices = get_aligned_output_features_output_indices(
+            out_features=out_features, out_indices=out_indices, stage_names=self.stage_names
+        )
+        self.apply_layernorm = apply_layernorm
+        self.reshape_hidden_states = reshape_hidden_states
+
+
+__all__ = ["Dinov2WithRegistersConfig"]
diff --git a/phivenv/Lib/site-packages/transformers/models/dinov2_with_registers/modeling_dinov2_with_registers.py b/phivenv/Lib/site-packages/transformers/models/dinov2_with_registers/modeling_dinov2_with_registers.py
new file mode 100644
index 0000000000000000000000000000000000000000..a02ac4c58476939a48a96aed9c988b2e1e9425eb
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/dinov2_with_registers/modeling_dinov2_with_registers.py
@@ -0,0 +1,712 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/dinov2_with_registers/modular_dinov2_with_registers.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_dinov2_with_registers.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# coding=utf-8
+# Copyright 2024 Meta Inc. and the HuggingFace Inc. team. All rights reserved.
+#
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import collections.abc
+from typing import Callable, Optional, Union
+
+import torch
+from torch import nn
+
+from ...activations import ACT2FN
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import BackboneOutput, BaseModelOutput, BaseModelOutputWithPooling, ImageClassifierOutput
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from ...processing_utils import Unpack
+from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
+from ...utils import TransformersKwargs, auto_docstring, torch_int
+from ...utils.backbone_utils import BackboneMixin
+from ...utils.generic import can_return_tuple, check_model_inputs
+from .configuration_dinov2_with_registers import Dinov2WithRegistersConfig
+
+
+class Dinov2WithRegistersPatchEmbeddings(nn.Module):
+    """
+    Project `pixel_values` of shape `(batch_size, num_channels, height, width)` into a sequence of patch embeddings
+    of shape `(batch_size, seq_length, hidden_size)` using a single strided convolution (one output per patch).
+    """
+
+    def __init__(self, config):
+        super().__init__()
+        # `image_size` / `patch_size` may be a single int or an iterable; ints are expanded to a square pair.
+        image_hw = config.image_size
+        if not isinstance(image_hw, collections.abc.Iterable):
+            image_hw = (image_hw, image_hw)
+        patch_hw = config.patch_size
+        if not isinstance(patch_hw, collections.abc.Iterable):
+            patch_hw = (patch_hw, patch_hw)
+
+        self.image_size = image_hw
+        self.patch_size = patch_hw
+        self.num_channels = config.num_channels
+        self.num_patches = (image_hw[1] // patch_hw[1]) * (image_hw[0] // patch_hw[0])
+        self.projection = nn.Conv2d(config.num_channels, config.hidden_size, kernel_size=patch_hw, stride=patch_hw)
+
+    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
+        received_channels = pixel_values.shape[1]
+        if received_channels != self.num_channels:
+            raise ValueError(
+                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
+                f" Expected {self.num_channels} but got {received_channels}."
+            )
+        # (B, hidden, H', W') -> (B, hidden, H'*W') -> (B, H'*W', hidden)
+        return self.projection(pixel_values).flatten(2).transpose(1, 2)
+
+
+class Dinov2WithRegistersEmbeddings(nn.Module):
+    """
+    Construct the CLS token, mask token, register tokens, position and patch embeddings.
+    """
+
+    def __init__(self, config: Dinov2WithRegistersConfig) -> None:
+        super().__init__()
+        # Position embeddings cover [CLS] + patches only (num_patches + 1); register tokens get no position.
+        self.cls_token = nn.Parameter(torch.randn(1, 1, config.hidden_size))
+        self.mask_token = nn.Parameter(torch.zeros(1, config.hidden_size))
+        self.register_tokens = nn.Parameter(torch.zeros(1, config.num_register_tokens, config.hidden_size))
+        self.patch_embeddings = Dinov2WithRegistersPatchEmbeddings(config)
+        num_patches = self.patch_embeddings.num_patches
+        self.position_embeddings = nn.Parameter(torch.randn(1, num_patches + 1, config.hidden_size))
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.patch_size = config.patch_size
+        self.config = config
+
+    def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
+        """
+        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher
+        resolution images. This implementation supports torch.jit tracing while maintaining backwards compatibility
+        with the original implementation.
+
+        Adapted from:
+        - https://github.com/facebookresearch/dino/blob/main/vision_transformer.py
+        - https://github.com/facebookresearch/dinov2/blob/main/dinov2/models/vision_transformer.py
+        """
+        num_patches = embeddings.shape[1] - 1  # `forward` passes [CLS] + patches, so drop the [CLS] slot
+        num_positions = self.position_embeddings.shape[1] - 1
+
+        # Skip interpolation for matching dimensions (unless tracing)
+        if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
+            return self.position_embeddings
+
+        # Handle class token and patch embeddings separately
+        class_pos_embed = self.position_embeddings[:, 0]
+        patch_pos_embed = self.position_embeddings[:, 1:]
+        dim = embeddings.shape[-1]
+
+        # Target grid size in patches (integer division; assumes `config.patch_size` is an int)
+        height = height // self.config.patch_size
+        width = width // self.config.patch_size
+
+        # Reshape to (1, dim, s, s) for interpolation; assumes the stored grid is square (s * s == num_positions)
+        sqrt_num_positions = torch_int(num_positions**0.5)
+        patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
+        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)
+
+        # Store original dtype for restoration after interpolation
+        target_dtype = patch_pos_embed.dtype
+
+        # Interpolate at float32 precision
+        patch_pos_embed = nn.functional.interpolate(
+            patch_pos_embed.to(dtype=torch.float32),
+            size=(torch_int(height), torch_int(width)),  # Explicit size instead of scale_factor
+            mode="bicubic",
+            align_corners=False,
+            antialias=True,
+        ).to(dtype=target_dtype)
+
+        # Validate output dimensions if not tracing
+        if not torch.jit.is_tracing():
+            if int(height) != patch_pos_embed.shape[-2] or int(width) != patch_pos_embed.shape[-1]:
+                raise ValueError("Width or height does not match with the interpolated position embeddings")
+
+        # Reshape back to original format
+        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
+
+        # Combine class and patch embeddings
+        return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1)
+
+    def forward(self, pixel_values: torch.Tensor, bool_masked_pos: Optional[torch.Tensor] = None) -> torch.Tensor:
+        """Embed `pixel_values` as `[CLS, registers, patches]`; masked patches use the mask token."""
+        batch_size, _, height, width = pixel_values.shape
+        target_dtype = self.patch_embeddings.projection.weight.dtype
+        embeddings = self.patch_embeddings(pixel_values.to(dtype=target_dtype))
+        # where `bool_masked_pos` is True, swap the patch embedding for the learned mask token
+        if bool_masked_pos is not None:
+            embeddings = torch.where(
+                bool_masked_pos.unsqueeze(-1), self.mask_token.to(embeddings.dtype).unsqueeze(0), embeddings
+            )
+
+        # add the [CLS] token to the embedded patch tokens
+        cls_tokens = self.cls_token.expand(batch_size, -1, -1)
+        embeddings = torch.cat((cls_tokens, embeddings), dim=1)
+
+        # add positional encoding to each token
+        embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
+
+        # insert the register tokens between [CLS] and the patch tokens (after the positional encoding was added)
+        embeddings = torch.cat(
+            (embeddings[:, :1], self.register_tokens.expand(embeddings.shape[0], -1, -1), embeddings[:, 1:]), dim=1
+        )
+
+        embeddings = self.dropout(embeddings)
+        return embeddings
+
+
+def eager_attention_forward(
+    module: nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: Optional[torch.Tensor],
+    scaling: float,
+    dropout: float = 0.0,
+    **kwargs,
+):
+    """
+    Reference (non-fused) attention: `softmax(scaling * Q @ K^T) @ V`.
+
+    `attention_mask`, when given, is multiplied into the post-softmax, post-dropout probabilities (it is used to
+    mask heads); it is not added to the raw scores. Extra `kwargs` are accepted and ignored.
+
+    Returns `(attn_output, attn_weights)`, where `attn_output` has dims 1 and 2 transposed and is contiguous.
+    """
+    scores = torch.matmul(query, key.transpose(-1, -2)) * scaling
+    # Softmax is evaluated in float32 and cast back to the query dtype.
+    probs = nn.functional.softmax(scores, dim=-1, dtype=torch.float32).to(query.dtype)
+    # Dropout on the probabilities drops entire tokens to attend to (as in the original Transformer paper).
+    probs = nn.functional.dropout(probs, p=dropout, training=module.training)
+    if attention_mask is not None:
+        probs = probs * attention_mask
+
+    context = torch.matmul(probs, value).transpose(1, 2).contiguous()
+    return context, probs
+
+
+class Dinov2WithRegistersSelfAttention(nn.Module):
+    """Multi-head self-attention with separate query/key/value projections and a pluggable attention backend."""
+
+    def __init__(self, config: Dinov2WithRegistersConfig):
+        super().__init__()
+        heads_divide_hidden = config.hidden_size % config.num_attention_heads == 0
+        if not heads_divide_hidden and not hasattr(config, "embedding_size"):
+            raise ValueError(
+                f"The hidden size {config.hidden_size} is not a multiple of the number of attention "
+                f"heads {config.num_attention_heads}."
+            )
+
+        self.config = config
+        self.num_attention_heads = config.num_attention_heads
+        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
+        self.all_head_size = self.num_attention_heads * self.attention_head_size
+        self.dropout_prob = config.attention_probs_dropout_prob
+        self.scaling = self.attention_head_size**-0.5
+        self.is_causal = False
+
+        # All three projections share the same geometry: hidden_size -> all_head_size, with an optional bias.
+        projection_kwargs = {"bias": config.qkv_bias}
+        self.query = nn.Linear(config.hidden_size, self.all_head_size, **projection_kwargs)
+        self.key = nn.Linear(config.hidden_size, self.all_head_size, **projection_kwargs)
+        self.value = nn.Linear(config.hidden_size, self.all_head_size, **projection_kwargs)
+
+    def forward(
+        self, hidden_states: torch.Tensor, head_mask: Optional[torch.Tensor] = None
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """Return `(context, attention_probs)`; `head_mask` is forwarded to the backend as its attention mask."""
+        # Split the projected features into heads: (batch, -1, heads, head_size), then move heads to dim 1.
+        split_shape = (hidden_states.shape[0], -1, self.num_attention_heads, self.attention_head_size)
+        key_layer = self.key(hidden_states).view(*split_shape).transpose(1, 2)
+        value_layer = self.value(hidden_states).view(*split_shape).transpose(1, 2)
+        query_layer = self.query(hidden_states).view(*split_shape).transpose(1, 2)
+
+        if self.config._attn_implementation == "eager":
+            attention_interface: Callable = eager_attention_forward
+        else:
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+        # Attention dropout is only active in training mode.
+        attn_dropout = self.dropout_prob if self.training else 0.0
+        context_layer, attention_probs = attention_interface(
+            self, query_layer, key_layer, value_layer, head_mask,
+            is_causal=self.is_causal, scaling=self.scaling, dropout=attn_dropout,
+        )
+
+        # Merge the two trailing (heads, head_size) dims back into a single all_head_size dim.
+        merged_shape = context_layer.size()[:-2] + (self.all_head_size,)
+        return context_layer.reshape(merged_shape), attention_probs
+
+
+class Dinov2WithRegistersSelfOutput(nn.Module):
+    """
+    Output projection (dense + dropout) applied to the attention result. No residual is added here: the residual
+    connection is defined in Dinov2WithRegistersLayer instead, due to the layernorm applied before each block.
+    """
+
+    def __init__(self, config: Dinov2WithRegistersConfig):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
+        # `input_tensor` is accepted but never read in this method.
+        projected = self.dense(hidden_states)
+        return self.dropout(projected)
+
+
+class Dinov2WithRegistersAttention(nn.Module):
+    """Self-attention followed by its output projection, with support for pruning attention heads."""
+
+    def __init__(self, config: Dinov2WithRegistersConfig):
+        super().__init__()
+        self.attention = Dinov2WithRegistersSelfAttention(config)
+        self.output = Dinov2WithRegistersSelfOutput(config)
+        self.pruned_heads = set()
+
+    def prune_heads(self, heads: set[int]):
+        """Remove `heads` from the q/k/v projections and the output projection, then record them as pruned."""
+        if len(heads) == 0:
+            return
+        self_attn = self.attention
+        heads, index = find_pruneable_heads_and_indices(
+            heads, self_attn.num_attention_heads, self_attn.attention_head_size, self.pruned_heads
+        )
+        # q/k/v are pruned along the helper's default dim, the output projection along dim=1.
+        for proj_name in ("query", "key", "value"):
+            setattr(self_attn, proj_name, prune_linear_layer(getattr(self_attn, proj_name), index))
+        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
+        # Update hyper params and store pruned heads
+        self_attn.num_attention_heads = self_attn.num_attention_heads - len(heads)
+        self_attn.all_head_size = self_attn.attention_head_size * self_attn.num_attention_heads
+        self.pruned_heads = self.pruned_heads.union(heads)
+
+    def forward(self, hidden_states: torch.Tensor, head_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
+        self_attn_output, _ = self.attention(hidden_states, head_mask)
+        return self.output(self_attn_output, hidden_states)
+
+
+class Dinov2WithRegistersLayerScale(nn.Module):
+    def __init__(self, config: Dinov2WithRegistersConfig) -> None:
+        super().__init__()
+        self.lambda1 = nn.Parameter(config.layerscale_value * torch.ones(config.hidden_size))  # per-channel scale
+
+    def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
+        return hidden_state * self.lambda1  # 1-D `lambda1` broadcasts along the last dimension
+
+
+def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
+    """
+    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
+
+    Each sample along dim 0 is zeroed with probability `drop_prob`; survivors are scaled by `1 / (1 - drop_prob)`.
+    `input` is returned unchanged when `training` is false or `drop_prob` is 0 or `None` (a `None` rate, the default
+    of `Dinov2WithRegistersDropPath`, is treated as "no drop" instead of failing on `1 - None`).
+
+    Comment by Ross Wightman: this is the same as the DropConnect impl created for EfficientNet etc.; the name
+    'drop path' is used because 'Drop Connect' is a different form of dropout in a separate paper. See discussion:
+    https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956
+    """
+    if not training or drop_prob is None or drop_prob == 0.0:
+        return input
+    keep_prob = 1 - drop_prob
+    shape = (input.shape[0],) + (1,) * (input.ndim - 1)  # one 0/1 keep flag per sample, any number of dims
+    random_tensor = (keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)).floor_()
+    return input.div(keep_prob) * random_tensor
+
+
+class Dinov2WithRegistersDropPath(nn.Module):
+    """Module wrapper around `drop_path`: per-sample stochastic depth, driven by the module's training flag."""
+
+    def __init__(self, drop_prob: Optional[float] = None) -> None:
+        super().__init__()
+        self.drop_prob = drop_prob
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        return drop_path(hidden_states, drop_prob=self.drop_prob, training=self.training)
+
+    def extra_repr(self) -> str:
+        return f"p={self.drop_prob}"
+
+
+class Dinov2WithRegistersMLP(nn.Module):
+    """Two-layer feed-forward block: hidden_size -> int(hidden_size * mlp_ratio) -> hidden_size."""
+
+    def __init__(self, config) -> None:
+        super().__init__()
+        width = config.hidden_size
+        expanded_width = int(config.hidden_size * config.mlp_ratio)
+        self.fc1 = nn.Linear(width, expanded_width, bias=True)
+        # `hidden_act` is either a key into ACT2FN or an activation object that is used as given.
+        act = config.hidden_act
+        self.activation = ACT2FN[act] if isinstance(act, str) else act
+        self.fc2 = nn.Linear(expanded_width, width, bias=True)
+
+    def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
+        expanded = self.fc1(hidden_state)
+        activated = self.activation(expanded)
+        return self.fc2(activated)
+
+
+class Dinov2WithRegistersSwiGLUFFN(nn.Module):
+    """SwiGLU feed-forward block: one fused input projection split into a SiLU-gated half and a linear half."""
+
+    def __init__(self, config) -> None:
+        super().__init__()
+        width = config.hidden_size
+        # Take 2/3 of the plain MLP width (truncated) and round it up to a multiple of 8.
+        gated_width = (int(int(config.hidden_size * config.mlp_ratio) * 2 / 3) + 7) // 8 * 8
+        self.weights_in = nn.Linear(width, 2 * gated_width, bias=True)
+        self.weights_out = nn.Linear(gated_width, width, bias=True)
+
+    def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
+        gate, linear_part = self.weights_in(hidden_state).chunk(2, dim=-1)
+        gated = nn.functional.silu(gate) * linear_part
+        return self.weights_out(gated)
+
+
+class Dinov2WithRegistersLayer(GradientCheckpointingLayer):
+    """
+    Pre-norm transformer block (this corresponds to the Block class in the original implementation):
+
+        x = x + drop_path(layer_scale1(attention(norm1(x))))
+        x = x + drop_path(layer_scale2(mlp(norm2(x))))
+
+    The same `drop_path` module is used by both residual branches.
+    """
+
+    def __init__(self, config: Dinov2WithRegistersConfig) -> None:
+        super().__init__()
+
+        # Attention branch.
+        self.norm1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.attention = Dinov2WithRegistersAttention(config)
+        self.layer_scale1 = Dinov2WithRegistersLayerScale(config)
+        # Stochastic depth is only instantiated for a strictly positive rate; otherwise it is a no-op.
+        if config.drop_path_rate > 0.0:
+            self.drop_path = Dinov2WithRegistersDropPath(config.drop_path_rate)
+        else:
+            self.drop_path = nn.Identity()
+
+        # Feed-forward branch: SwiGLU or plain MLP, selected by `config.use_swiglu_ffn`.
+        self.norm2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        mlp_class = Dinov2WithRegistersSwiGLUFFN if config.use_swiglu_ffn else Dinov2WithRegistersMLP
+        self.mlp = mlp_class(config)
+        self.layer_scale2 = Dinov2WithRegistersLayerScale(config)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        head_mask: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        # first residual connection (attention branch)
+        attention_branch = self.attention(self.norm1(hidden_states), head_mask)
+        attention_branch = self.layer_scale1(attention_branch)
+        hidden_states = self.drop_path(attention_branch) + hidden_states
+
+        # second residual connection (feed-forward branch; layernorm is also applied after self-attention)
+        mlp_branch = self.layer_scale2(self.mlp(self.norm2(hidden_states)))
+        return self.drop_path(mlp_branch) + hidden_states
+
+
+class Dinov2WithRegistersEncoder(nn.Module):
+    """Stack of `config.num_hidden_layers` Dinov2WithRegistersLayer blocks applied in sequence."""
+
+    def __init__(self, config: Dinov2WithRegistersConfig):
+        super().__init__()
+        self.config = config
+        self.layer = nn.ModuleList([Dinov2WithRegistersLayer(config) for _ in range(config.num_hidden_layers)])
+        self.gradient_checkpointing = False
+
+    def forward(
+        self, hidden_states: torch.Tensor, head_mask: Optional[torch.Tensor] = None, output_hidden_states: bool = False
+    ) -> BaseModelOutput:
+        # When requested, collect the incoming hidden states plus the output of every layer.
+        collected = [hidden_states] if output_hidden_states else None
+        for layer_index, layer_module in enumerate(self.layer):
+            layer_head_mask = None if head_mask is None else head_mask[layer_index]
+            hidden_states = layer_module(hidden_states, layer_head_mask)
+            if collected:
+                collected.append(hidden_states)
+        per_layer_states = tuple(collected) if collected else None
+        return BaseModelOutput(last_hidden_state=hidden_states, hidden_states=per_layer_states)
+
+
+@auto_docstring
+class Dinov2WithRegistersPreTrainedModel(PreTrainedModel):
+    config: Dinov2WithRegistersConfig
+    base_model_prefix = "dinov2_with_registers"  # same name as the base-model attribute of the classification model
+    main_input_name = "pixel_values"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["Dinov2WithRegistersLayer"]
+    _supports_sdpa = True
+    _supports_flash_attn = True
+    _supports_flex_attn = True
+    _supports_attention_backend = True
+    _can_record_outputs = {
+        "attentions": Dinov2WithRegistersSelfAttention,
+    }
+
+    def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None:
+        """Initialize the parameters of `module`, dispatching on the module type."""
+        if isinstance(module, (nn.Linear, nn.Conv2d)):
+            # Upcast the input in `fp32` and cast it back to desired `dtype` to avoid
+            # `trunc_normal_cpu` not implemented in `half` issues
+            module.weight.data = nn.init.trunc_normal_(
+                module.weight.data.to(torch.float32), mean=0.0, std=self.config.initializer_range
+            ).to(module.weight.dtype)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)  # unit scale, zero shift
+        elif isinstance(module, Dinov2WithRegistersEmbeddings):  # same fp32 round-trip as above
+            module.position_embeddings.data = nn.init.trunc_normal_(
+                module.position_embeddings.data.to(torch.float32),
+                mean=0.0,
+                std=self.config.initializer_range,
+            ).to(module.position_embeddings.dtype)
+
+            module.cls_token.data = nn.init.trunc_normal_(
+                module.cls_token.data.to(torch.float32),
+                mean=0.0,
+                std=self.config.initializer_range,
+            ).to(module.cls_token.dtype)
+
+            module.mask_token.data.zero_()  # mask and register tokens start at zero
+            module.register_tokens.data.zero_()
+        elif isinstance(module, Dinov2WithRegistersLayerScale):  # noqa: F821
+            module.lambda1.data.fill_(self.config.layerscale_value)  # constant initial layer-scale value
+
+
+@auto_docstring
+class Dinov2WithRegistersModel(Dinov2WithRegistersPreTrainedModel):
+    def __init__(self, config: Dinov2WithRegistersConfig):
+        super().__init__(config)
+        self.config = config
+
+        self.embeddings = Dinov2WithRegistersEmbeddings(config)
+        self.encoder = Dinov2WithRegistersEncoder(config)
+
+        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self) -> Dinov2WithRegistersPatchEmbeddings:
+        return self.embeddings.patch_embeddings
+
+    def _prune_heads(self, heads_to_prune: dict[int, list[int]]) -> None:
+        """
+        Prunes heads of the model. `heads_to_prune` is a dict of {layer_num: list of heads to prune in this layer}.
+        See base class `PreTrainedModel`.
+        """
+        for layer, heads in heads_to_prune.items():
+            self.encoder.layer[layer].attention.prune_heads(heads)
+
+    @check_model_inputs
+    @auto_docstring
+    def forward(
+        self,
+        pixel_values: Optional[torch.Tensor] = None,
+        bool_masked_pos: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        output_hidden_states: Optional[bool] = None,
+        **kwargs,
+    ) -> BaseModelOutputWithPooling:
+        r"""
+        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, sequence_length)`):
+            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0). Only relevant for
+            pre-training.
+        """
+        if output_hidden_states is None:
+            output_hidden_states = self.config.output_hidden_states
+
+        if pixel_values is None:
+            raise ValueError("You have to specify pixel_values")
+
+        # Prepare head mask if needed
+        # 1.0 in head_mask indicates we keep the head
+        # attention_probs has shape bsz x n_heads x N x N
+        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
+        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
+        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
+
+        embedding_output = self.embeddings(pixel_values, bool_masked_pos=bool_masked_pos)
+
+        encoder_outputs: BaseModelOutput = self.encoder(
+            embedding_output, head_mask=head_mask, output_hidden_states=output_hidden_states
+        )
+        sequence_output = encoder_outputs.last_hidden_state
+        sequence_output = self.layernorm(sequence_output)  # final LayerNorm over all tokens
+        pooled_output = sequence_output[:, 0, :]  # pooled output is the token at index 0
+
+        return BaseModelOutputWithPooling(
+            last_hidden_state=sequence_output,
+            pooler_output=pooled_output,
+            hidden_states=encoder_outputs.hidden_states,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    Dinov2WithRegisters Model transformer with an image classification head on top (a linear layer on top of the final hidden state
+    of the [CLS] token) e.g. for ImageNet.
+    """
+)
+class Dinov2WithRegistersForImageClassification(Dinov2WithRegistersPreTrainedModel):
+    def __init__(self, config: Dinov2WithRegistersConfig) -> None:
+        super().__init__(config)
+
+        self.num_labels = config.num_labels
+        self.dinov2_with_registers = Dinov2WithRegistersModel(config)
+
+        # Classifier head: input is the CLS token concatenated with the mean patch token, hence 2 * hidden_size
+        self.classifier = (
+            nn.Linear(config.hidden_size * 2, config.num_labels) if config.num_labels > 0 else nn.Identity()
+        )
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        pixel_values: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> ImageClassifierOutput:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss). If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+        """
+
+        outputs: BaseModelOutputWithPooling = self.dinov2_with_registers(pixel_values, head_mask=head_mask, **kwargs)
+        sequence_output = outputs.last_hidden_state  # batch_size, sequence_length, hidden_size
+
+        cls_token = sequence_output[:, 0]
+        # cls and register tokens should not be included in patch tokens variable
+        patch_tokens = sequence_output[:, 1 + self.config.num_register_tokens :]
+
+        linear_input = torch.cat([cls_token, patch_tokens.mean(dim=1)], dim=1)  # CLS token + mean of patch tokens
+        logits = self.classifier(linear_input)
+
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(labels, logits, self.config, **kwargs)
+
+        return ImageClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    Dinov2WithRegisters backbone, to be used with frameworks like DETR and MaskFormer.
+    """
+)
+class Dinov2WithRegistersBackbone(Dinov2WithRegistersPreTrainedModel, BackboneMixin):
+    def __init__(self, config):
+        super().__init__(config)
+        super()._init_backbone(config)
+        self.num_features = [config.hidden_size for _ in range(config.num_hidden_layers + 1)]  # one per layer, plus one
+        self.embeddings = Dinov2WithRegistersEmbeddings(config)
+        self.encoder = Dinov2WithRegistersEncoder(config)
+
+        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+
+        self.num_register_tokens = config.num_register_tokens
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self) -> Dinov2WithRegistersPatchEmbeddings:
+        return self.embeddings.patch_embeddings
+
+    @check_model_inputs
+    @auto_docstring
+    def forward(
+        self,
+        pixel_values: torch.Tensor,
+        output_hidden_states: Optional[bool] = None,
+        **kwargs,
+    ) -> BackboneOutput:
+        r"""
+        Examples:
+
+        ```python
+        >>> from transformers import AutoImageProcessor, AutoBackbone
+        >>> import torch
+        >>> from PIL import Image
+        >>> import requests
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> processor = AutoImageProcessor.from_pretrained("facebook/dinov2-with-registers-base")
+        >>> model = AutoBackbone.from_pretrained(
+        ...     "facebook/dinov2-with-registers-base", out_features=["stage2", "stage5", "stage8", "stage11"]
+        ... )
+
+        >>> inputs = processor(image, return_tensors="pt")
+
+        >>> outputs = model(**inputs)
+        >>> feature_maps = outputs.feature_maps
+        >>> list(feature_maps[-1].shape)
+        [1, 768, 16, 16]
+        ```"""
+        if output_hidden_states is None:
+            output_hidden_states = self.config.output_hidden_states
+
+        embedding_output = self.embeddings(pixel_values)
+        output: BaseModelOutput = self.encoder(embedding_output, output_hidden_states=True)  # always collect all stages
+        hidden_states = output.hidden_states
+
+        feature_maps = []
+        for stage, hidden_state in zip(self.stage_names, hidden_states):
+            if stage in self.out_features:
+                if self.config.apply_layernorm:
+                    hidden_state = self.layernorm(hidden_state)
+                if self.config.reshape_hidden_states:
+                    hidden_state = hidden_state[:, 1 + self.num_register_tokens :]  # drop CLS and register tokens
+                    # NOTE: this mirrors a bug in the original implementation that was copied here,
+                    # because normally the order is height, width
+                    batch_size, _, height, width = pixel_values.shape
+                    patch_size = self.config.patch_size
+                    hidden_state = hidden_state.reshape(batch_size, height // patch_size, width // patch_size, -1)
+                    hidden_state = hidden_state.permute(0, 3, 1, 2).contiguous()
+                feature_maps.append(hidden_state)
+
+        return BackboneOutput(
+            feature_maps=tuple(feature_maps),
+            hidden_states=hidden_states if output_hidden_states else None,  # the flag only gates what is returned
+        )
+
+
+__all__ = [  # public names exported by this module
+    "Dinov2WithRegistersPreTrainedModel",
+    "Dinov2WithRegistersModel",
+    "Dinov2WithRegistersForImageClassification",
+    "Dinov2WithRegistersBackbone",
+]
diff --git a/phivenv/Lib/site-packages/transformers/models/dinov2_with_registers/modular_dinov2_with_registers.py b/phivenv/Lib/site-packages/transformers/models/dinov2_with_registers/modular_dinov2_with_registers.py
new file mode 100644
index 0000000000000000000000000000000000000000..c23e523e3434401110408e56772c26adb69efab3
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/dinov2_with_registers/modular_dinov2_with_registers.py
@@ -0,0 +1,436 @@
+# coding=utf-8
+# Copyright 2024 Meta Inc. and the HuggingFace Inc. team. All rights reserved.
+#
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Optional, Union
+
+import torch
+import torch.utils.checkpoint
+from torch import nn
+
+from ....transformers.models.dinov2.modeling_dinov2 import (
+    Dinov2Backbone,
+    Dinov2Encoder,
+    Dinov2ForImageClassification,
+    Dinov2Model,
+    Dinov2PatchEmbeddings,
+    Dinov2PreTrainedModel,
+)
+from ...configuration_utils import PretrainedConfig
+from ...modeling_outputs import BackboneOutput, BaseModelOutput, BaseModelOutputWithPooling, ImageClassifierOutput
+from ...processing_utils import Unpack
+from ...utils import TransformersKwargs, logging, torch_int
+from ...utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices
+
+
+logger = logging.get_logger(__name__)  # module-level logger; not referenced elsewhere in this file
+
+
+class Dinov2WithRegistersConfig(BackboneConfigMixin, PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Dinov2WithRegistersModel`]. It is used to instantiate a
+    Dinov2WithRegisters model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the DINOv2 with Registers
+    [facebook/dinov2-with-registers-base](https://huggingface.co/facebook/dinov2-with-registers-base) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        hidden_size (`int`, *optional*, defaults to 768):
+            Dimensionality of the encoder layers and the pooler layer.
+        num_hidden_layers (`int`, *optional*, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 12):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        mlp_ratio (`int`, *optional*, defaults to 4):
+            Ratio of the hidden size of the MLPs relative to the `hidden_size`.
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"selu"` and `"gelu_new"` are supported.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-06):
+            The epsilon used by the layer normalization layers.
+        image_size (`int`, *optional*, defaults to 224):
+            The size (resolution) of each image.
+        patch_size (`int`, *optional*, defaults to 16):
+            The size (resolution) of each patch.
+        num_channels (`int`, *optional*, defaults to 3):
+            The number of input channels.
+        qkv_bias (`bool`, *optional*, defaults to `True`):
+            Whether to add a bias to the queries, keys and values.
+        layerscale_value (`float`, *optional*, defaults to 1.0):
+            Initial value to use for layer scale.
+        drop_path_rate (`float`, *optional*, defaults to 0.0):
+            Stochastic depth rate per sample (when applied in the main path of residual layers).
+        use_swiglu_ffn (`bool`, *optional*, defaults to `False`):
+            Whether to use the SwiGLU feedforward neural network.
+        num_register_tokens (`int`, *optional*, defaults to 4):
+            Number of register tokens to use.
+        out_features (`list[str]`, *optional*):
+            If used as backbone, list of features to output. Can be any of `"stem"`, `"stage1"`, `"stage2"`, etc.
+            (depending on how many stages the model has). If unset and `out_indices` is set, will default to the
+            corresponding stages. If unset and `out_indices` is unset, will default to the last stage. Must be in the
+            same order as defined in the `stage_names` attribute.
+        out_indices (`list[int]`, *optional*):
+            If used as backbone, list of indices of features to output. Can be any of 0, 1, 2, etc. (depending on how
+            many stages the model has). If unset and `out_features` is set, will default to the corresponding stages.
+            If unset and `out_features` is unset, will default to the last stage. Must be in the
+            same order as defined in the `stage_names` attribute.
+        apply_layernorm (`bool`, *optional*, defaults to `True`):
+            Whether to apply layer normalization to the feature maps in case the model is used as backbone.
+        reshape_hidden_states (`bool`, *optional*, defaults to `True`):
+            Whether to reshape the feature maps to 4D tensors of shape `(batch_size, hidden_size, height, width)` in
+            case the model is used as backbone. If `False`, the feature maps will be 3D tensors of shape `(batch_size,
+            seq_len, hidden_size)`.
+
+    Example:
+
+    ```python
+    >>> from transformers import Dinov2WithRegistersConfig, Dinov2WithRegistersModel
+
+    >>> # Initializing a Dinov2WithRegisters base style configuration
+    >>> configuration = Dinov2WithRegistersConfig()
+
+    >>> # Initializing a model (with random weights) from the base style configuration
+    >>> model = Dinov2WithRegistersModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "dinov2_with_registers"
+
+    def __init__(
+        self,
+        hidden_size=768,
+        num_hidden_layers=12,
+        num_attention_heads=12,
+        mlp_ratio=4,
+        hidden_act="gelu",
+        hidden_dropout_prob=0.0,
+        attention_probs_dropout_prob=0.0,
+        initializer_range=0.02,
+        layer_norm_eps=1e-6,
+        image_size=224,
+        patch_size=16,
+        num_channels=3,
+        qkv_bias=True,
+        layerscale_value=1.0,
+        drop_path_rate=0.0,
+        use_swiglu_ffn=False,
+        num_register_tokens=4,
+        out_features=None,
+        out_indices=None,
+        apply_layernorm=True,
+        reshape_hidden_states=True,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.mlp_ratio = mlp_ratio
+        self.hidden_act = hidden_act
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.initializer_range = initializer_range
+        self.layer_norm_eps = layer_norm_eps
+        self.image_size = image_size
+        self.patch_size = patch_size
+        self.num_channels = num_channels
+        self.qkv_bias = qkv_bias
+        self.layerscale_value = layerscale_value
+        self.drop_path_rate = drop_path_rate
+        self.use_swiglu_ffn = use_swiglu_ffn
+        self.num_register_tokens = num_register_tokens
+        self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, num_hidden_layers + 1)]  # "stem" + one per layer
+        self._out_features, self._out_indices = get_aligned_output_features_output_indices(
+            out_features=out_features, out_indices=out_indices, stage_names=self.stage_names
+        )
+        self.apply_layernorm = apply_layernorm
+        self.reshape_hidden_states = reshape_hidden_states
+
+
+class Dinov2WithRegistersPatchEmbeddings(Dinov2PatchEmbeddings):
+    pass  # no overrides: behaves exactly like Dinov2PatchEmbeddings under a model-specific name
+
+
+class Dinov2WithRegistersEmbeddings(nn.Module):
+    """
+    Construct the CLS token, mask token, register tokens, position and patch embeddings.
+    """
+
+    def __init__(self, config: Dinov2WithRegistersConfig) -> None:
+        super().__init__()
+
+        self.cls_token = nn.Parameter(torch.randn(1, 1, config.hidden_size))
+        self.mask_token = nn.Parameter(torch.zeros(1, config.hidden_size))
+        self.register_tokens = nn.Parameter(torch.zeros(1, config.num_register_tokens, config.hidden_size))
+        self.patch_embeddings = Dinov2WithRegistersPatchEmbeddings(config)
+        num_patches = self.patch_embeddings.num_patches
+        self.position_embeddings = nn.Parameter(torch.randn(1, num_patches + 1, config.hidden_size))  # +1: CLS position
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.patch_size = config.patch_size
+        self.config = config
+
+    def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
+        """
+        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher
+        resolution images. This implementation supports torch.jit tracing while maintaining backwards compatibility
+        with the original implementation.
+
+        Adapted from:
+        - https://github.com/facebookresearch/dino/blob/main/vision_transformer.py
+        - https://github.com/facebookresearch/dinov2/blob/main/dinov2/models/vision_transformer.py
+        """
+        num_patches = embeddings.shape[1] - 1  # patch tokens only; the first token is the CLS token
+        num_positions = self.position_embeddings.shape[1] - 1
+
+        # Skip interpolation for matching dimensions (unless tracing)
+        if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
+            return self.position_embeddings
+
+        # Handle class token and patch embeddings separately
+        class_pos_embed = self.position_embeddings[:, 0]
+        patch_pos_embed = self.position_embeddings[:, 1:]
+        dim = embeddings.shape[-1]
+
+        # Calculate new dimensions (pixels -> number of patches per side)
+        height = height // self.config.patch_size
+        width = width // self.config.patch_size
+
+        # Reshape for interpolation: the stored patch positions are laid out as a square grid
+        sqrt_num_positions = torch_int(num_positions**0.5)
+        patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
+        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)
+
+        # Store original dtype for restoration after interpolation
+        target_dtype = patch_pos_embed.dtype
+
+        # Interpolate at float32 precision
+        patch_pos_embed = nn.functional.interpolate(
+            patch_pos_embed.to(dtype=torch.float32),
+            size=(torch_int(height), torch_int(width)),  # Explicit size instead of scale_factor
+            mode="bicubic",
+            align_corners=False,
+            antialias=True,
+        ).to(dtype=target_dtype)
+
+        # Validate output dimensions if not tracing
+        if not torch.jit.is_tracing():
+            if int(height) != patch_pos_embed.shape[-2] or int(width) != patch_pos_embed.shape[-1]:
+                raise ValueError("Width or height does not match with the interpolated position embeddings")
+
+        # Reshape back to original format
+        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
+
+        # Combine class and patch embeddings
+        return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1)
+
+    def forward(self, pixel_values: torch.Tensor, bool_masked_pos: Optional[torch.Tensor] = None) -> torch.Tensor:
+        batch_size, _, height, width = pixel_values.shape
+        target_dtype = self.patch_embeddings.projection.weight.dtype  # cast the input to the projection's dtype
+        embeddings = self.patch_embeddings(pixel_values.to(dtype=target_dtype))
+
+        if bool_masked_pos is not None:  # replace the embeddings of masked patches with the mask token
+            embeddings = torch.where(
+                bool_masked_pos.unsqueeze(-1), self.mask_token.to(embeddings.dtype).unsqueeze(0), embeddings
+            )
+
+        # add the [CLS] token to the embedded patch tokens
+        cls_tokens = self.cls_token.expand(batch_size, -1, -1)
+        embeddings = torch.cat((cls_tokens, embeddings), dim=1)
+
+        # add positional encoding to each token
+        embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
+
+        # insert register tokens between the [CLS] token and the patch tokens (they get no positional encoding)
+        embeddings = torch.cat(
+            (embeddings[:, :1], self.register_tokens.expand(embeddings.shape[0], -1, -1), embeddings[:, 1:]), dim=1
+        )
+
+        embeddings = self.dropout(embeddings)
+
+        return embeddings
+
+
+class Dinov2WithRegistersEncoder(Dinov2Encoder):
+    pass  # no overrides: behaves exactly like Dinov2Encoder under a model-specific name
+
+
+class Dinov2WithRegistersPreTrainedModel(Dinov2PreTrainedModel):
+    def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None:
+        """Initialize the parameters of `module`, dispatching on the module type."""
+        if isinstance(module, (nn.Linear, nn.Conv2d)):
+            # Upcast the input in `fp32` and cast it back to desired `dtype` to avoid
+            # `trunc_normal_cpu` not implemented in `half` issues
+            module.weight.data = nn.init.trunc_normal_(
+                module.weight.data.to(torch.float32), mean=0.0, std=self.config.initializer_range
+            ).to(module.weight.dtype)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)  # unit scale, zero shift
+        elif isinstance(module, Dinov2WithRegistersEmbeddings):  # same fp32 round-trip as above
+            module.position_embeddings.data = nn.init.trunc_normal_(
+                module.position_embeddings.data.to(torch.float32),
+                mean=0.0,
+                std=self.config.initializer_range,
+            ).to(module.position_embeddings.dtype)
+
+            module.cls_token.data = nn.init.trunc_normal_(
+                module.cls_token.data.to(torch.float32),
+                mean=0.0,
+                std=self.config.initializer_range,
+            ).to(module.cls_token.dtype)
+
+            module.mask_token.data.zero_()  # mask and register tokens start at zero
+            module.register_tokens.data.zero_()
+        elif isinstance(module, Dinov2WithRegistersLayerScale):  # noqa: F821
+            module.lambda1.data.fill_(self.config.layerscale_value)  # constant initial layer-scale value
+
+
+class Dinov2WithRegistersModel(Dinov2Model):
+    pass  # no overrides: behaves exactly like Dinov2Model under a model-specific name
+
+
+class Dinov2WithRegistersForImageClassification(Dinov2ForImageClassification):
+    def forward(
+        self,
+        pixel_values: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> ImageClassifierOutput:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss). If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+        """
+
+        outputs: BaseModelOutputWithPooling = self.dinov2_with_registers(pixel_values, head_mask=head_mask, **kwargs)
+        sequence_output = outputs.last_hidden_state  # batch_size, sequence_length, hidden_size
+
+        cls_token = sequence_output[:, 0]
+        # cls and register tokens should not be included in patch tokens variable
+        patch_tokens = sequence_output[:, 1 + self.config.num_register_tokens :]
+
+        linear_input = torch.cat([cls_token, patch_tokens.mean(dim=1)], dim=1)  # CLS token + mean of patch tokens
+        logits = self.classifier(linear_input)
+
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(labels, logits, self.config, **kwargs)
+
+        return ImageClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+class Dinov2WithRegistersBackbone(Dinov2Backbone):
+    def __init__(self, config):
+        super().__init__(config)
+        super()._init_backbone(config)
+
+        self.num_register_tokens = config.num_register_tokens
+        self.num_features = [config.hidden_size for _ in range(config.num_hidden_layers + 1)]  # one per layer, plus one
+        self.embeddings = Dinov2WithRegistersEmbeddings(config)
+        self.encoder = Dinov2WithRegistersEncoder(config)
+
+        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self) -> Dinov2WithRegistersPatchEmbeddings:
+        return self.embeddings.patch_embeddings
+
+    def forward(
+        self,
+        pixel_values: torch.Tensor,
+        output_hidden_states: Optional[bool] = None,
+        **kwargs,
+    ) -> BackboneOutput:
+        r"""
+        Examples:
+
+        ```python
+        >>> from transformers import AutoImageProcessor, AutoBackbone
+        >>> import torch
+        >>> from PIL import Image
+        >>> import requests
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> processor = AutoImageProcessor.from_pretrained("facebook/dinov2-with-registers-base")
+        >>> model = AutoBackbone.from_pretrained(
+        ...     "facebook/dinov2-with-registers-base", out_features=["stage2", "stage5", "stage8", "stage11"]
+        ... )
+
+        >>> inputs = processor(image, return_tensors="pt")
+
+        >>> outputs = model(**inputs)
+        >>> feature_maps = outputs.feature_maps
+        >>> list(feature_maps[-1].shape)
+        [1, 768, 16, 16]
+        ```"""
+        if output_hidden_states is None:
+            output_hidden_states = self.config.output_hidden_states
+
+        embedding_output = self.embeddings(pixel_values)
+        output: BaseModelOutput = self.encoder(embedding_output, output_hidden_states=True)  # always collect all stages
+        hidden_states = output.hidden_states
+
+        feature_maps = []
+        for stage, hidden_state in zip(self.stage_names, hidden_states):
+            if stage in self.out_features:
+                if self.config.apply_layernorm:
+                    hidden_state = self.layernorm(hidden_state)
+                if self.config.reshape_hidden_states:
+                    hidden_state = hidden_state[:, 1 + self.num_register_tokens :]  # drop CLS and register tokens
+                    # NOTE: this mirrors a bug in the original implementation that was copied here,
+                    # because normally the order is height, width
+                    batch_size, _, height, width = pixel_values.shape
+                    patch_size = self.config.patch_size
+                    hidden_state = hidden_state.reshape(batch_size, height // patch_size, width // patch_size, -1)
+                    hidden_state = hidden_state.permute(0, 3, 1, 2).contiguous()
+                feature_maps.append(hidden_state)
+
+        return BackboneOutput(
+            feature_maps=tuple(feature_maps),
+            hidden_states=hidden_states if output_hidden_states else None,  # the flag only gates what is returned
+        )
+
+
+__all__ = [  # public names exported by this module
+    "Dinov2WithRegistersConfig",
+    "Dinov2WithRegistersPreTrainedModel",
+    "Dinov2WithRegistersModel",
+    "Dinov2WithRegistersForImageClassification",
+    "Dinov2WithRegistersBackbone",
+]
diff --git a/phivenv/Lib/site-packages/transformers/models/dinov3_convnext/__init__.py b/phivenv/Lib/site-packages/transformers/models/dinov3_convnext/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..8839dc7cec78158801a1b623c27dd46281aadb0d
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/dinov3_convnext/__init__.py
@@ -0,0 +1,27 @@
+# Copyright 2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+# `TYPE_CHECKING` is only true for static type checkers, which take the eager star-import
+# branch below so they can see every public name of the sub-modules.
+if TYPE_CHECKING:
+    from .configuration_dinov3_convnext import *
+    from .modeling_dinov3_convnext import *
+else:
+    import sys
+
+    # At runtime, replace this package's entry in `sys.modules` with a `_LazyModule` built from
+    # the import structure that `define_import_structure` derives from this file's path.
+    # NOTE(review): presumably this defers the sub-module imports until a name is first
+    # accessed -- confirm against `_LazyModule`.
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/phivenv/Lib/site-packages/transformers/models/dinov3_convnext/__pycache__/__init__.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/dinov3_convnext/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cc4b06215f1572e56b8776207eeab5ca2bd3ea4f
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/dinov3_convnext/__pycache__/__init__.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/dinov3_convnext/__pycache__/configuration_dinov3_convnext.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/dinov3_convnext/__pycache__/configuration_dinov3_convnext.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fa5a9f7a199c0e8043ad30a0ac2d046061fe3ec2
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/dinov3_convnext/__pycache__/configuration_dinov3_convnext.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/dinov3_convnext/__pycache__/modeling_dinov3_convnext.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/dinov3_convnext/__pycache__/modeling_dinov3_convnext.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d81ce68f9aba086ab2a9a4f6ae62f21f4b5d718a
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/dinov3_convnext/__pycache__/modeling_dinov3_convnext.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/dinov3_convnext/configuration_dinov3_convnext.py b/phivenv/Lib/site-packages/transformers/models/dinov3_convnext/configuration_dinov3_convnext.py
new file mode 100644
index 0000000000000000000000000000000000000000..fa593e10ec1a685091234ee7966873570716276e
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/dinov3_convnext/configuration_dinov3_convnext.py
@@ -0,0 +1,103 @@
+# coding=utf-8
+# Copyright 2025 Meta Platforms, Inc. and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""DINOv3 ConvNeXt model configuration"""
+
+from typing import Optional
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class DINOv3ConvNextConfig(PretrainedConfig):
+    r"""
+    Configuration class holding the hyper-parameters of a [`DINOv3ConvNextModel`]. A DINOv3ConvNext model is
+    instantiated from it, and the arguments below define the model architecture. Leaving every argument at its default
+    yields a configuration similar to that of the DINOv3ConvNext
+    [facebook/dinov3-convnext-tiny-pretrain-lvd1689m](https://huggingface.co/facebook/dinov3-convnext-tiny-pretrain-lvd1689m) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. See the
+    documentation of [`PretrainedConfig`] for more information.
+
+    Args:
+        num_channels (`int`, *optional*, defaults to 3):
+            Number of channels of the input images.
+        hidden_sizes (`list[int]`, *optional*, defaults to [96, 192, 384, 768]):
+            Hidden size (number of channels) of each stage. Its length determines `num_stages`.
+        depths (`list[int]`, *optional*, defaults to [3, 3, 9, 3]):
+            Number of layers in each stage.
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+            Non-linear activation (function or string) used in each block. If string, `"gelu"`, `"relu"`,
+            `"selu"` and `"gelu_new"` are supported.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            Standard deviation of the truncated_normal_initializer used to initialize all weight matrices.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-06):
+            Epsilon used by the layer normalization layers.
+        layer_scale_init_value (`float`, *optional*, defaults to 1e-06):
+            Initial value of the layer scale.
+        drop_path_rate (`float`, *optional*, defaults to 0.0):
+            Drop rate for stochastic depth.
+        image_size (`int`, *optional*, defaults to 224):
+            Size (resolution) of the input images.
+
+    Example:
+    ```python
+    >>> from transformers import DINOv3ConvNextConfig, DINOv3ConvNextModel
+
+    >>> # Initializing a DINOv3ConvNext (tiny variant) style configuration
+    >>> config = DINOv3ConvNextConfig()
+
+    >>> # Initializing a model (with random weights)
+    >>> model = DINOv3ConvNextModel(config)
+
+    >>> # Accessing the model config
+    >>> config = model.config
+    ```"""
+
+    model_type = "dinov3_convnext"
+
+    def __init__(
+        self,
+        num_channels: int = 3,
+        hidden_sizes: Optional[list[int]] = None,
+        depths: Optional[list[int]] = None,
+        hidden_act: str = "gelu",
+        initializer_range: float = 0.02,
+        layer_norm_eps: float = 1e-6,
+        layer_scale_init_value: float = 1e-6,
+        drop_path_rate: float = 0.0,
+        image_size: int = 224,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        # `None` stands for "use the documented default"; a fresh list is created on every call
+        # so that instances never share a mutable default.
+        if hidden_sizes is None:
+            hidden_sizes = [96, 192, 384, 768]
+        if depths is None:
+            depths = [3, 3, 9, 3]
+
+        self.num_channels = num_channels
+        self.hidden_sizes = hidden_sizes
+        self.depths = depths
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.layer_norm_eps = layer_norm_eps
+        self.layer_scale_init_value = layer_scale_init_value
+        self.drop_path_rate = drop_path_rate
+        self.image_size = image_size
+
+    @property
+    def num_stages(self) -> int:
+        """Number of stages, i.e. the number of entries in `hidden_sizes`."""
+        stage_count = len(self.hidden_sizes)
+        return stage_count
+
+
+# Names exported by this module when it is star-imported (`from ... import *`).
+__all__ = ["DINOv3ConvNextConfig"]
diff --git a/phivenv/Lib/site-packages/transformers/models/dinov3_convnext/modeling_dinov3_convnext.py b/phivenv/Lib/site-packages/transformers/models/dinov3_convnext/modeling_dinov3_convnext.py
new file mode 100644
index 0000000000000000000000000000000000000000..2318faf1482494fa9da612784fe01db83d6efbec
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/dinov3_convnext/modeling_dinov3_convnext.py
@@ -0,0 +1,261 @@
+# coding=utf-8
+# Copyright 2025 Meta Platforms, Inc. and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch DINOv3 ConvNeXt model."""
+
+from typing import Optional
+
+import numpy as np
+import torch
+import torch.utils.checkpoint
+from torch import nn
+
+from ...activations import ACT2FN
+from ...modeling_outputs import (
+    BaseModelOutputWithPoolingAndNoAttention,
+)
+from ...modeling_utils import PreTrainedModel
+from ...utils import auto_docstring, logging
+from ...utils.generic import can_return_tuple
+from .configuration_dinov3_convnext import DINOv3ConvNextConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+# Adapted from transformers.models.beit.modeling_beit.drop_path (adds a guard for drop_prob >= 1.0)
+def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
+    """
+    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
+
+    Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
+    however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
+    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
+    layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
+    argument.
+
+    Args:
+        input: Tensor whose first dimension indexes the samples; one keep/drop decision is drawn per sample and
+            broadcast over all remaining dimensions.
+        drop_prob: Probability of zeroing a sample. `0.0` disables dropping.
+        training: Dropping is only applied when this is `True`.
+
+    Returns:
+        `input` itself (same object) when `drop_prob == 0.0` or `training` is false. Otherwise a new tensor in which
+        each sample is either zeroed or divided by `1 - drop_prob`. When `drop_prob >= 1.0` every sample is dropped
+        and an all-zero tensor is returned.
+    """
+    if drop_prob == 0.0 or not training:
+        return input
+    keep_prob = 1 - drop_prob
+    if keep_prob <= 0.0:
+        # Every path is dropped. Dividing by a zero keep probability below would turn the output
+        # into inf * 0 = NaN, so return zeros directly instead.
+        return torch.zeros_like(input)
+    shape = (input.shape[0],) + (1,) * (input.ndim - 1)  # work with diff dim tensors, not just 2D ConvNets
+    random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
+    random_tensor.floor_()  # binarize: 1 with probability keep_prob, else 0
+    # Kept samples are rescaled by 1 / keep_prob.
+    output = input.div(keep_prob) * random_tensor
+    return output
+
+
+# Adapted from transformers.models.convnext.modeling_convnext.ConvNextDropPath with ConvNext->DINOv3ConvNext
+class DINOv3ConvNextDropPath(nn.Module):
+    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
+
+    Args:
+        drop_prob (`float`, *optional*):
+            Drop probability passed to `drop_path` together with the module's training flag. `None` (the default)
+            means that nothing is dropped.
+    """
+
+    def __init__(self, drop_prob: Optional[float] = None) -> None:
+        super().__init__()
+        self.drop_prob = drop_prob
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        # `drop_path` computes `1 - drop_prob`, which raises a TypeError for the default `None`
+        # once the module is in training mode. An unset probability therefore short-circuits here.
+        if self.drop_prob is None:
+            return hidden_states
+        return drop_path(hidden_states, self.drop_prob, self.training)
+
+    def extra_repr(self) -> str:
+        # Shown inside the module's repr, e.g. `DINOv3ConvNextDropPath(p=0.1)`.
+        return f"p={self.drop_prob}"
+
+
+class DINOv3ConvNextLayerNorm(nn.LayerNorm):
+    r"""`nn.LayerNorm` that accepts inputs in either of two memory layouts.
+
+    With `data_format="channels_last"` (the default) inputs are shaped (batch_size, height, width, channels) and are
+    passed to `nn.LayerNorm` unchanged. With `data_format="channels_first"` inputs are shaped (batch_size, channels,
+    height, width); they are permuted to channels-last, normalized, and permuted back.
+    """
+
+    def __init__(self, *args, data_format="channels_last", **kwargs):
+        super().__init__(*args, **kwargs)
+        supported_formats = ("channels_last", "channels_first")
+        if data_format not in supported_formats:
+            raise NotImplementedError(f"Unsupported data format: {data_format}")
+        self.data_format = data_format
+
+    def forward(self, features: torch.Tensor) -> torch.Tensor:
+        """
+        Args:
+            features: Tensor of shape (batch_size, channels, height, width) OR (batch_size, height, width, channels)
+
+        Returns:
+            The normalized tensor, in the same layout as `features`.
+        """
+        if self.data_format != "channels_first":
+            return super().forward(features)
+
+        # nn.LayerNorm normalizes over the trailing dimension(s), so the channel axis has to be
+        # moved to the end for the call and restored afterwards.
+        as_channels_last = features.permute(0, 2, 3, 1)
+        normalized = super().forward(as_channels_last)
+        return normalized.permute(0, 3, 1, 2)
+
+
+class DINOv3ConvNextLayer(nn.Module):
+    """Residual block; corresponds to the `Block` class in the original implementation.
+
+    The block can be written in two equivalent ways:
+     1) DwConv, LayerNorm (channels_first), Conv, GELU, Conv (all in (N, C, H, W) format)
+     2) DwConv, Permute, LayerNorm (channels_last), Linear, GELU, Linear, Permute
+
+    Form (2) is the one implemented here; the original authors report it to be slightly faster in PyTorch.
+    The output of the second linear layer is multiplied by the learnable per-channel `gamma` before it goes
+    through drop path and is added back to the block input.
+
+    Args:
+        config ([`DINOv3ConvNextConfig`]):
+            Model config.
+        channels (`int`):
+            Number of input (and output) channels.
+        drop_path (`float`):
+            Drop path rate. Default: 0.0.
+    """
+
+    def __init__(self, config: DINOv3ConvNextConfig, channels: int, drop_path: float = 0.0):
+        super().__init__()
+        expanded_channels = 4 * channels
+
+        # Sub-modules are created in the same order as their use in `forward`.
+        self.depthwise_conv = nn.Conv2d(channels, channels, kernel_size=7, padding=3, groups=channels)
+        self.layer_norm = DINOv3ConvNextLayerNorm(channels, eps=config.layer_norm_eps)
+        # The two linear layers act on the channels-last tensor and can be seen as 1x1 convs.
+        self.pointwise_conv1 = nn.Linear(channels, expanded_channels)
+        self.activation_fn = ACT2FN[config.hidden_act]
+        self.pointwise_conv2 = nn.Linear(expanded_channels, channels)
+
+        initial_scale = torch.full((channels,), config.layer_scale_init_value)
+        self.gamma = nn.Parameter(initial_scale, requires_grad=True)
+
+        # A zero (or negative) rate needs no stochastic depth module at all.
+        if drop_path > 0.0:
+            self.drop_path = DINOv3ConvNextDropPath(drop_path)
+        else:
+            self.drop_path = nn.Identity()
+
+    def forward(self, features: torch.Tensor) -> torch.Tensor:
+        """
+        Args:
+            features: Tensor of shape (batch_size, channels, height, width)
+
+        Returns:
+            Tensor of the same shape as `features`.
+        """
+        shortcut = features
+
+        # Depthwise conv runs channels-first; everything up to the layer scale runs channels-last.
+        branch = self.depthwise_conv(features).permute(0, 2, 3, 1)
+        branch = self.pointwise_conv1(self.layer_norm(branch))
+        branch = self.pointwise_conv2(self.activation_fn(branch))
+        branch = (branch * self.gamma).permute(0, 3, 1, 2)  # scale, then back to channels first
+
+        return shortcut + self.drop_path(branch)
+
+
+class DINOv3ConvNextStage(nn.Module):
+    """One stage of the network: a downsampling step followed by `config.depths[stage_idx]` residual layers.
+
+    Stage 0 downsamples with a 4x4, stride-4 convolution followed by a channels-first layer norm. Every other
+    stage applies the channels-first layer norm first and then a 2x2, stride-2 convolution. Each layer receives its
+    own drop path rate, taken from a linear ramp from 0 to `config.drop_path_rate` over all layers of the model.
+
+    Args:
+        config ([`DINOv3ConvNextConfig`]):
+            Model config.
+        stage_idx (`int`):
+            Index of this stage into `config.hidden_sizes` and `config.depths`.
+    """
+
+    def __init__(self, config: DINOv3ConvNextConfig, stage_idx: int):
+        super().__init__()
+
+        if stage_idx > 0:
+            in_channels = config.hidden_sizes[stage_idx - 1]
+        else:
+            in_channels = config.num_channels
+        out_channels = config.hidden_sizes[stage_idx]
+        norm_eps = config.layer_norm_eps
+
+        if stage_idx == 0:
+            downsample = [
+                nn.Conv2d(config.num_channels, out_channels, kernel_size=4, stride=4),
+                DINOv3ConvNextLayerNorm(out_channels, eps=norm_eps, data_format="channels_first"),
+            ]
+        else:
+            downsample = [
+                DINOv3ConvNextLayerNorm(in_channels, eps=norm_eps, data_format="channels_first"),
+                nn.Conv2d(in_channels, out_channels, kernel_size=2, stride=2),
+            ]
+        self.downsample_layers = nn.ModuleList(downsample)
+
+        # The ramp spans every layer of the model; this stage uses the entries that start
+        # right after the layers of all preceding stages.
+        layers_in_stage = config.depths[stage_idx]
+        first_layer = sum(config.depths[:stage_idx])
+        all_rates = np.linspace(0, config.drop_path_rate, sum(config.depths)).tolist()
+
+        blocks = []
+        for offset in range(layers_in_stage):
+            rate = all_rates[first_layer + offset]
+            blocks.append(DINOv3ConvNextLayer(config, channels=out_channels, drop_path=rate))
+        self.layers = nn.ModuleList(blocks)
+
+    def forward(self, features: torch.Tensor) -> torch.Tensor:
+        """
+        Args:
+            features: Tensor of shape (batch_size, channels, height, width)
+
+        Returns:
+            The tensor obtained by applying the downsampling modules and then the residual layers, in order.
+        """
+        for module in (*self.downsample_layers, *self.layers):
+            features = module(features)
+        return features
+
+
+@auto_docstring
+class DINOv3ConvNextPreTrainedModel(PreTrainedModel):
+    # Shared base class of the DINOv3 ConvNeXt models: declares the class-level settings read by
+    # `PreTrainedModel` and provides the per-module weight initialization.
+    config: DINOv3ConvNextConfig
+    base_model_prefix = "dinov3_convnext"
+    main_input_name = "pixel_values"
+    _no_split_modules = ["DINOv3ConvNextLayer"]
+
+    def _init_weights(self, module):
+        """Initialize the weights of one sub-module according to its type."""
+        if isinstance(module, DINOv3ConvNextLayer):
+            # Layer scale starts from the configured constant.
+            if module.gamma is not None:
+                module.gamma.data.fill_(self.config.layer_scale_init_value)
+            return
+
+        # `DINOv3ConvNextLayerNorm` subclasses `nn.LayerNorm`, so this check covers both.
+        if isinstance(module, nn.LayerNorm):
+            module.weight.data.fill_(1.0)
+            module.bias.data.zero_()
+            return
+
+        if isinstance(module, (nn.Linear, nn.Conv2d)):
+            # Slightly different from the TF version which uses truncated_normal for initialization
+            # cf https://github.com/pytorch/pytorch/pull/5617
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.bias is not None:
+                module.bias.data.zero_()
+
+
+@auto_docstring
+class DINOv3ConvNextModel(DINOv3ConvNextPreTrainedModel):
+    # Bare DINOv3 ConvNeXt encoder. It runs the stages in sequence, then builds a token sequence
+    # from the last feature map: one globally average-pooled token followed by one token per
+    # spatial position, all passed through a final LayerNorm.
+    def __init__(self, config: DINOv3ConvNextConfig):
+        super().__init__(config)
+        self.config = config
+        self.stages = nn.ModuleList([DINOv3ConvNextStage(config, stage_idx) for stage_idx in range(config.num_stages)])
+        self.layer_norm = nn.LayerNorm(config.hidden_sizes[-1], eps=config.layer_norm_eps)  # final norm layer
+        self.pool = nn.AdaptiveAvgPool2d(1)
+        self.post_init()
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self, pixel_values: torch.FloatTensor, output_hidden_states: Optional[bool] = None
+    ) -> BaseModelOutputWithPoolingAndNoAttention:
+        # Output fields:
+        #   last_hidden_state: (batch_size, 1 + height * width, channels), pooled token first
+        #   pooler_output:     the normalized pooled token, i.e. last_hidden_state[:, 0]
+        #   hidden_states:     the input followed by every stage output, or None when not requested
+        hidden_states = pixel_values
+
+        # Fall back to the config only when the caller left the flag unset. Using `or` here would
+        # let a config value of True override an explicit `output_hidden_states=False`.
+        if output_hidden_states is None:
+            output_hidden_states = self.config.output_hidden_states
+        all_hidden_states = [hidden_states] if output_hidden_states else []
+
+        for stage in self.stages:
+            hidden_states = stage(hidden_states)
+
+            # store intermediate stage outputs
+            if output_hidden_states:
+                all_hidden_states.append(hidden_states)
+
+        # make global representation, a.k.a [CLS] token
+        pooled_output = self.pool(hidden_states)
+
+        # (batch_size, channels, height, width) -> (batch_size, height * width, channels)
+        pooled_output = pooled_output.flatten(2).transpose(1, 2)
+        hidden_states = hidden_states.flatten(2).transpose(1, 2)
+
+        # concat "cls" and "patch tokens" as (batch_size, 1 + height * width, channels)
+        hidden_states = torch.cat([pooled_output, hidden_states], dim=1)
+        hidden_states = self.layer_norm(hidden_states)
+
+        return BaseModelOutputWithPoolingAndNoAttention(
+            last_hidden_state=hidden_states,
+            pooler_output=hidden_states[:, 0],
+            hidden_states=tuple(all_hidden_states) if output_hidden_states else None,
+        )
+
+
+# Names exported by this module when it is star-imported (`from ... import *`).
+__all__ = ["DINOv3ConvNextModel", "DINOv3ConvNextPreTrainedModel"]
diff --git a/phivenv/Lib/site-packages/transformers/models/dinov3_vit/__init__.py b/phivenv/Lib/site-packages/transformers/models/dinov3_vit/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a74878b2053cf43fabe19a7fd72e020a0879f8e6
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/dinov3_vit/__init__.py
@@ -0,0 +1,28 @@
+# Copyright 2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+# `TYPE_CHECKING` is only true for static type checkers, which take the eager star-import
+# branch below so they can see every public name of the sub-modules.
+if TYPE_CHECKING:
+    from .configuration_dinov3_vit import *
+    from .image_processing_dinov3_vit_fast import *
+    from .modeling_dinov3_vit import *
+else:
+    import sys
+
+    # At runtime, replace this package's entry in `sys.modules` with a `_LazyModule` built from
+    # the import structure that `define_import_structure` derives from this file's path.
+    # NOTE(review): presumably this defers the sub-module imports until a name is first
+    # accessed -- confirm against `_LazyModule`.
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/phivenv/Lib/site-packages/transformers/models/dinov3_vit/__pycache__/__init__.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/dinov3_vit/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6b1fc290ed51cb576316efff1a5929a63d7c1f9c
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/dinov3_vit/__pycache__/__init__.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/dinov3_vit/__pycache__/configuration_dinov3_vit.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/dinov3_vit/__pycache__/configuration_dinov3_vit.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..801e1f48c88f58e8f225d4b5aabadc156584dcbe
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/dinov3_vit/__pycache__/configuration_dinov3_vit.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/dinov3_vit/__pycache__/image_processing_dinov3_vit_fast.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/dinov3_vit/__pycache__/image_processing_dinov3_vit_fast.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3aa1d2699b8c72611bb6b615a73124878799e30b
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/dinov3_vit/__pycache__/image_processing_dinov3_vit_fast.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/dinov3_vit/__pycache__/modeling_dinov3_vit.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/dinov3_vit/__pycache__/modeling_dinov3_vit.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..09069657eff4288926deae5994e71e6b7d3706ad
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/dinov3_vit/__pycache__/modeling_dinov3_vit.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/dinov3_vit/__pycache__/modular_dinov3_vit.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/dinov3_vit/__pycache__/modular_dinov3_vit.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3c0d37278e6757e7818cffe1188f38774879973d
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/dinov3_vit/__pycache__/modular_dinov3_vit.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/dinov3_vit/configuration_dinov3_vit.py b/phivenv/Lib/site-packages/transformers/models/dinov3_vit/configuration_dinov3_vit.py
new file mode 100644
index 0000000000000000000000000000000000000000..78cbd200ce612e6c778392c85d2f8c97a7d19c82
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/dinov3_vit/configuration_dinov3_vit.py
@@ -0,0 +1,166 @@
+# coding=utf-8
+# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""DINOv3 model configuration"""
+
+from typing import Optional
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class DINOv3ViTConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`DINOv3ViTModel`]. It is used to instantiate a
+    DINOv3 model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the DINOv3
+    [facebook/dinov3-vits16-pretrain-lvd1689m](https://huggingface.co/facebook/dinov3-vits16-pretrain-lvd1689m) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        patch_size (`int`, *optional*, defaults to 16):
+            The size (resolution) of each patch.
+        hidden_size (`int`, *optional*, defaults to 384):
+            Dimensionality of the encoder layers and the pooler layer.
+        intermediate_size (`int`, *optional*, defaults to 1536):
+            Dimensionality of the "intermediate" (i.e., feed-forward) layer.
+        num_hidden_layers (`int`, *optional*, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 6):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"selu"` and `"gelu_new"` are supported.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the layer normalization layers.
+        rope_theta (`float`, *optional*, defaults to 100.0):
+            The base period of the RoPE embeddings.
+        image_size (`int`, *optional*, defaults to 224):
+            The size (resolution) of each image.
+        num_channels (`int`, *optional*, defaults to 3):
+            The number of input channels.
+        query_bias (`bool`, *optional*, defaults to `True`):
+            Whether to add a bias to the query projection.
+        key_bias (`bool`, *optional*, defaults to `False`):
+            Whether to add a bias to the key projection.
+        value_bias (`bool`, *optional*, defaults to `True`):
+            Whether to add a bias to the value projection.
+        proj_bias (`bool`, *optional*, defaults to `True`):
+            Whether to add a bias to the output projection.
+        mlp_bias (`bool`, *optional*, defaults to `True`):
+            Whether to add a bias to the MLP layers.
+        layerscale_value (`float`, *optional*, defaults to 1.0):
+            Initial value to use for layer scale.
+        drop_path_rate (`float`, *optional*, defaults to 0.0):
+            Stochastic depth rate per sample (when applied in the main path of residual layers).
+        use_gated_mlp (`bool`, *optional*, defaults to `False`):
+            Whether to use the SwiGLU feedforward neural network.
+        num_register_tokens (`int`, *optional*, defaults to 0):
+            The number of register tokens.
+        pos_embed_shift (`float`, *optional*):
+            Amount to randomly shift position embedding coordinates in [-shift, shift],
+            applied only in training mode if not `None`.
+        pos_embed_jitter (`float`, *optional*):
+            Amount to randomly jitter position embedding coordinates in log-uniform value in [1/jitter, jitter],
+            applied only in training mode if not `None`.
+        pos_embed_rescale (`float`, *optional*, defaults to 2.0):
+            Amount to randomly rescale position embedding coordinates in log-uniform value in [1/rescale, rescale],
+            applied only in training mode if not `None`.
+
+    Example:
+
+    ```python
+    >>> from transformers import DINOv3ViTConfig, DINOv3ViTModel
+
+    >>> # Initializing a DINOv3 ViT-small style configuration
+    >>> config = DINOv3ViTConfig()
+
+    >>> # Initializing a model (with random weights) from the config
+    >>> model = DINOv3ViTModel(config)
+
+    >>> # Accessing the model config
+    >>> config = model.config
+    ```"""
+
+    # Identifier string for this configuration class.
+    model_type = "dinov3_vit"
+
+    def __init__(
+        self,
+        patch_size: int = 16,
+        hidden_size: int = 384,
+        intermediate_size: int = 1536,
+        num_hidden_layers: int = 12,
+        num_attention_heads: int = 6,
+        hidden_act: str = "gelu",
+        attention_dropout: float = 0.0,
+        initializer_range: float = 0.02,
+        layer_norm_eps: float = 1e-5,
+        rope_theta: float = 100.0,
+        image_size: int = 224,
+        num_channels: int = 3,
+        query_bias: bool = True,
+        key_bias: bool = False,
+        value_bias: bool = True,
+        proj_bias: bool = True,
+        mlp_bias: bool = True,
+        layerscale_value: float = 1.0,
+        drop_path_rate: float = 0.0,
+        use_gated_mlp: bool = False,
+        num_register_tokens: int = 0,
+        # train augs
+        pos_embed_shift: Optional[float] = None,
+        pos_embed_jitter: Optional[float] = None,
+        pos_embed_rescale: Optional[float] = 2.0,
+        **kwargs,
+    ):
+        # Any keyword argument not named above is forwarded to `PretrainedConfig`.
+        super().__init__(**kwargs)
+
+        # Every argument is stored unchanged as an attribute of the same name; nothing is validated here.
+        self.image_size = image_size
+        self.patch_size = patch_size
+        self.num_channels = num_channels
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.hidden_act = hidden_act
+        self.attention_dropout = attention_dropout
+        self.initializer_range = initializer_range
+        self.layer_norm_eps = layer_norm_eps
+        self.layerscale_value = layerscale_value
+        self.drop_path_rate = drop_path_rate
+        self.use_gated_mlp = use_gated_mlp
+        self.rope_theta = rope_theta
+        self.query_bias = query_bias
+        self.key_bias = key_bias
+        self.value_bias = value_bias
+        self.proj_bias = proj_bias
+        self.mlp_bias = mlp_bias
+        self.num_register_tokens = num_register_tokens
+
+        # train augs: position-embedding coordinate augmentations (documented above as training-only)
+        self.pos_embed_shift = pos_embed_shift
+        self.pos_embed_jitter = pos_embed_jitter
+        self.pos_embed_rescale = pos_embed_rescale
+
+
+# Names exported by this module when it is star-imported (`from ... import *`).
+__all__ = ["DINOv3ViTConfig"]
diff --git a/phivenv/Lib/site-packages/transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py b/phivenv/Lib/site-packages/transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py
new file mode 100644
index 0000000000000000000000000000000000000000..3664bdd20ae8f9ffd5419f2a95d80a2a80ebef6e
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py
@@ -0,0 +1,104 @@
+# coding=utf-8
+# Copyright 2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Fast Image processor class for DINOv3."""
+
+from typing import Optional, Union
+
+from transformers.image_processing_base import BatchFeature
+from transformers.image_processing_utils_fast import BaseImageProcessorFast, group_images_by_shape, reorder_images
+from transformers.image_utils import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, PILImageResampling, SizeDict
+from transformers.utils import (
+    TensorType,
+    auto_docstring,
+    is_torch_available,
+    is_torchvision_available,
+    is_torchvision_v2_available,
+    logging,
+)
+from transformers.utils.import_utils import requires
+
+
+logger = logging.get_logger(__name__)
+
+
+if is_torch_available():
+    import torch
+
+if is_torchvision_v2_available():
+    from torchvision.transforms.v2 import functional as F
+elif is_torchvision_available():
+    from torchvision.transforms import functional as F
+
+
+@auto_docstring
+@requires(backends=("torchvision", "torch"))
+class DINOv3ViTImageProcessorFast(BaseImageProcessorFast):
+    resample = PILImageResampling.BILINEAR
+    image_mean = IMAGENET_DEFAULT_MEAN
+    image_std = IMAGENET_DEFAULT_STD
+    size = {"height": 224, "width": 224}
+    do_resize = True
+    do_rescale = True
+    do_normalize = True
+
+    # Overriden for DINOv3 to preserve order of transforms
+    # rescale -> resize -> normalize
+    def _preprocess(
+        self,
+        images: list["torch.Tensor"],
+        do_resize: bool,
+        size: SizeDict,
+        interpolation: Optional["F.InterpolationMode"],
+        do_center_crop: bool,
+        crop_size: SizeDict,
+        do_rescale: bool,
+        rescale_factor: float,
+        do_normalize: bool,
+        image_mean: Optional[Union[float, list[float]]],
+        image_std: Optional[Union[float, list[float]]],
+        disable_grouping: Optional[bool],
+        return_tensors: Optional[Union[str, TensorType]],
+    ) -> BatchFeature:
+        # Group images by size for batched resizing
+        grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping)
+        resized_images_grouped = {}
+        for shape, stacked_images in grouped_images.items():
+            if do_rescale:
+                stacked_images = self.rescale(stacked_images, rescale_factor)
+            if do_resize:
+                stacked_images = self.resize(
+                    image=stacked_images, size=size, interpolation=interpolation, antialias=True
+                )
+            resized_images_grouped[shape] = stacked_images
+        resized_images = reorder_images(resized_images_grouped, grouped_images_index)
+
+        # Group images by size for further processing
+        # Needed in case do_resize is False, or resize returns images with different sizes
+        grouped_images, grouped_images_index = group_images_by_shape(resized_images, disable_grouping=disable_grouping)
+        processed_images_grouped = {}
+        for shape, stacked_images in grouped_images.items():
+            if do_center_crop:
+                stacked_images = self.center_crop(stacked_images, crop_size)
+            if do_normalize:
+                stacked_images = self.normalize(stacked_images, image_mean, image_std)
+            processed_images_grouped[shape] = stacked_images
+
+        processed_images = reorder_images(processed_images_grouped, grouped_images_index)
+        processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images
+
+        return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors)
+
+
+__all__ = ["DINOv3ViTImageProcessorFast"]
diff --git a/phivenv/Lib/site-packages/transformers/models/dinov3_vit/modeling_dinov3_vit.py b/phivenv/Lib/site-packages/transformers/models/dinov3_vit/modeling_dinov3_vit.py
new file mode 100644
index 0000000000000000000000000000000000000000..dbea73e6caf50a31aab79a522b8a9713ff035de8
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/dinov3_vit/modeling_dinov3_vit.py
@@ -0,0 +1,538 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/dinov3_vit/modular_dinov3_vit.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_dinov3_vit.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# coding=utf-8
+# Copyright 2025 Meta AI and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+from typing import Callable, Optional
+
+import numpy as np
+import torch
+from torch import nn
+
+from ...activations import ACT2FN
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import BaseModelOutputWithPooling
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from ...processing_utils import Unpack
+from ...pytorch_utils import compile_compatible_method_lru_cache
+from ...utils import TransformersKwargs, auto_docstring
+from ...utils.generic import check_model_inputs
+from .configuration_dinov3_vit import DINOv3ViTConfig
+
+
+class DINOv3ViTEmbeddings(nn.Module):
+    """
+    Construct the CLS token, mask token, position and patch embeddings.
+    """
+
+    def __init__(self, config: DINOv3ViTConfig):
+        super().__init__()
+        self.config = config
+        self.cls_token = nn.Parameter(torch.randn(1, 1, config.hidden_size))
+        self.mask_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
+        self.register_tokens = nn.Parameter(torch.empty(1, config.num_register_tokens, config.hidden_size))
+        self.patch_embeddings = nn.Conv2d(
+            config.num_channels, config.hidden_size, kernel_size=config.patch_size, stride=config.patch_size
+        )
+
+    def forward(self, pixel_values: torch.Tensor, bool_masked_pos: Optional[torch.Tensor] = None) -> torch.Tensor:
+        batch_size = pixel_values.shape[0]
+        target_dtype = self.patch_embeddings.weight.dtype
+
+        # (batch_size, num_channels, height, width) -> (batch_size, num_patches, hidden_size)
+        patch_embeddings = self.patch_embeddings(pixel_values.to(dtype=target_dtype))
+        patch_embeddings = patch_embeddings.flatten(2).transpose(1, 2)
+
+        if bool_masked_pos is not None:
+            mask_token = self.mask_token.to(patch_embeddings.dtype)
+            patch_embeddings = torch.where(bool_masked_pos.unsqueeze(-1), mask_token, patch_embeddings)
+
+        # Add CLS and register tokens
+        cls_token = self.cls_token.expand(batch_size, -1, -1)
+        register_tokens = self.register_tokens.expand(batch_size, -1, -1)
+        embeddings = torch.cat([cls_token, register_tokens, patch_embeddings], dim=1)
+
+        return embeddings
+
+
+@compile_compatible_method_lru_cache(maxsize=32)
+def get_patches_center_coordinates(
+    num_patches_h: int, num_patches_w: int, dtype: torch.dtype, device: torch.device
+) -> torch.Tensor:
+    """
+    Computes the 2D coordinates of the centers of image patches, normalized to the range [-1, +1].
+    The center of each patch is exactly halfway between its top-left and bottom-right corners.
+
+    Args:
+        num_patches_h (int): Number of patches along the vertical (height) axis.
+        num_patches_w (int): Number of patches along the horizontal (width) axis.
+        dtype (torch.dtype): The desired data type of the returned tensor.
+
+    Returns:
+        torch.Tensor: A tensor of shape (height * width, 2), where each row contains the (y, x)
+            coordinates of a patch center, normalized to [-1, +1].
+    """
+    coords_h = torch.arange(0.5, num_patches_h, dtype=dtype, device=device)
+    coords_w = torch.arange(0.5, num_patches_w, dtype=dtype, device=device)
+    coords_h = coords_h / num_patches_h
+    coords_w = coords_w / num_patches_w
+    # (height, width, 2) -> (height * width, 2)
+    coords = torch.stack(torch.meshgrid(coords_h, coords_w, indexing="ij"), dim=-1)
+    coords = coords.flatten(0, 1)
+    # Shift range [0, 1] to [-1, +1]
+    coords = 2.0 * coords - 1.0
+    return coords
+
+
+def augment_patches_center_coordinates(
+    coords: torch.Tensor,
+    shift: Optional[float] = None,
+    jitter: Optional[float] = None,
+    rescale: Optional[float] = None,
+) -> torch.Tensor:
+    # Shift coords by adding a uniform value in [-shift, shift]
+    if shift is not None:
+        shift_hw = torch.empty((1, 2), device=coords.device, dtype=coords.dtype)
+        shift_hw = shift_hw.uniform_(-shift, shift)
+        coords = coords + shift_hw
+
+    # Jitter coords by multiplying the range [-1, 1] by a log-uniform value in [1/jitter, jitter]
+    if jitter is not None:
+        jitter_range = np.log(jitter)
+        jitter_hw = torch.empty((1, 2), device=coords.device, dtype=coords.dtype)
+        jitter_hw = jitter_hw.uniform_(-jitter_range, jitter_range).exp()
+        coords = coords * jitter_hw
+
+    # Rescale coords by multiplying the range [-1, 1] by a log-uniform value in [1/rescale, rescale]
+    if rescale is not None:
+        rescale_range = np.log(rescale)
+        rescale_hw = torch.empty(1, device=coords.device, dtype=coords.dtype)
+        rescale_hw = rescale_hw.uniform_(-rescale_range, rescale_range).exp()
+        coords = coords * rescale_hw
+
+    return coords
+
+
+class DINOv3ViTRopePositionEmbedding(nn.Module):
+    """
+    Computes the rotary position embedding (cos, sin) tables for the patch tokens of an image.
+
+    The tables are derived from the normalized patch-center coordinates of the input and the
+    non-persistent `inv_freq` buffer; they are recomputed from the input's height and width on
+    every forward call.
+    """
+
+    inv_freq: torch.Tensor
+
+    def __init__(self, config: DINOv3ViTConfig):
+        super().__init__()
+
+        self.config = config
+        self.base = config.rope_theta
+        self.head_dim = config.hidden_size // config.num_attention_heads
+        # Grid size implied by the config; `forward` below derives the grid from the actual input instead.
+        self.num_patches_h = config.image_size // config.patch_size
+        self.num_patches_w = config.image_size // config.patch_size
+
+        inv_freq = 1 / self.base ** torch.arange(0, 1, 4 / self.head_dim, dtype=torch.float32)  # (head_dim / 4,)
+        # persistent=False: the buffer is not saved in the state dict.
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+
+    def forward(self, pixel_values: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+        """
+        Build the cos/sin tables for the patch grid of `pixel_values`.
+
+        Args:
+            pixel_values: 4-D tensor; only its last two dimensions (height, width), its device
+                and its dtype are used.
+
+        Returns:
+            `(cos, sin)`, each of shape (num_patches_h * num_patches_w, head_dim), cast to the
+            dtype of `pixel_values`.
+        """
+        _, _, height, width = pixel_values.shape
+        num_patches_h = height // self.config.patch_size
+        num_patches_w = width // self.config.patch_size
+
+        device = pixel_values.device
+        # "mps" (and a non-string device type) falls back to "cpu" for the autocast context below.
+        device_type = device.type if isinstance(device.type, str) and device.type != "mps" else "cpu"
+
+        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+            # Although we could precompute static patch_coords from image_size and patch_size in the config,
+            # the model was trained with random_scale, so it can process images of varying sizes.
+            # Therefore, it's better to compute patch_coords dynamically (with lru_cache).
+            patch_coords = get_patches_center_coordinates(
+                num_patches_h, num_patches_w, dtype=torch.float32, device=device
+            )
+            # Coordinate augmentations are applied in training mode only.
+            if self.training:
+                patch_coords = augment_patches_center_coordinates(
+                    patch_coords,
+                    shift=self.config.pos_embed_shift,
+                    jitter=self.config.pos_embed_jitter,
+                    rescale=self.config.pos_embed_rescale,
+                )
+
+            # (height * width, 2, head_dim / 4) -> (height * width, head_dim / 2) -> (height * width, head_dim)
+            angles = 2 * math.pi * patch_coords[:, :, None] * self.inv_freq[None, None, :]
+            angles = angles.flatten(1, 2)
+            angles = angles.tile(2)
+
+            cos = torch.cos(angles)
+            sin = torch.sin(angles)
+
+        # Computed in float32 above; returned in the dtype of the input.
+        dtype = pixel_values.dtype
+        return cos.to(dtype=dtype), sin.to(dtype=dtype)
+
+
+def rotate_half(x):
+    """Rotates half the hidden dims of the input."""
+    x1 = x[..., : x.shape[-1] // 2]
+    x2 = x[..., x.shape[-1] // 2 :]
+    return torch.cat((-x2, x1), dim=-1)
+
+
+def eager_attention_forward(
+    module: nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: Optional[torch.Tensor],
+    scaling: float,
+    dropout: float = 0.0,
+    **kwargs,
+):
+    # Take the dot product between "query" and "key" to get the raw attention scores.
+    attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling
+
+    # Normalize the attention scores to probabilities.
+    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
+
+    # This is actually dropping out entire tokens to attend to, which might
+    # seem a bit unusual, but is taken from the original Transformer paper.
+    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
+
+    # Mask heads if we want to
+    if attention_mask is not None:
+        attn_weights = attn_weights * attention_mask
+
+    attn_output = torch.matmul(attn_weights, value)
+    attn_output = attn_output.transpose(1, 2).contiguous()
+
+    return attn_output, attn_weights
+
+
+def apply_rotary_pos_emb(
+    q: torch.Tensor, k: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor, **kwargs
+) -> tuple[torch.Tensor, torch.Tensor]:
+    """Applies Rotary Position Embedding to the query and key tensors, but only to the patch tokens,
+    ignoring the prefix tokens (cls token and register tokens).
+
+    Args:
+        q (`torch.Tensor`): The query tensor.
+        k (`torch.Tensor`): The key tensor.
+        cos (`torch.Tensor`): The cosine part of the rotary embedding.
+        sin (`torch.Tensor`): The sine part of the rotary embedding.
+
+    Returns:
+        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
+    """
+
+    num_tokens = q.shape[-2]
+    num_patches = sin.shape[-2]
+    num_prefix_tokens = num_tokens - num_patches  # cls token + register tokens
+
+    q_prefix_tokens, q_patches = q.split((num_prefix_tokens, num_patches), dim=-2)
+    k_prefix_tokens, k_patches = k.split((num_prefix_tokens, num_patches), dim=-2)
+
+    # apply rope only to patch tokens
+    q_patches = (q_patches * cos) + (rotate_half(q_patches) * sin)
+    k_patches = (k_patches * cos) + (rotate_half(k_patches) * sin)
+
+    q = torch.cat((q_prefix_tokens, q_patches), dim=-2)
+    k = torch.cat((k_prefix_tokens, k_patches), dim=-2)
+
+    return q, k
+
+
+class DINOv3ViTAttention(nn.Module):
+    """
+    Multi-headed attention compatible with ALL_ATTENTION_FUNCTIONS.
+    """
+
+    def __init__(self, config: DINOv3ViTConfig):
+        super().__init__()
+        self.config = config
+        self.embed_dim = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = self.embed_dim // self.num_heads
+        self.is_causal = False
+
+        self.scaling = self.head_dim**-0.5
+        self.is_causal = False
+
+        self.dropout = config.attention_dropout
+        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.key_bias)
+        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.value_bias)
+
+        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.query_bias)
+        self.o_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.proj_bias)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        """Input shape: Batch x Time x Channel"""
+
+        batch_size, patches, _ = hidden_states.size()
+
+        query_states = self.q_proj(hidden_states)
+        key_states = self.k_proj(hidden_states)
+        value_states = self.v_proj(hidden_states)
+
+        query_states = query_states.view(batch_size, patches, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = key_states.view(batch_size, patches, self.num_heads, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(batch_size, patches, self.num_heads, self.head_dim).transpose(1, 2)
+
+        cos, sin = position_embeddings
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+
+        attention_interface: Callable = eager_attention_forward
+        if self.config._attn_implementation != "eager":
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            dropout=0.0 if not self.training else self.dropout,
+            scaling=self.scaling,
+            **kwargs,
+        )
+
+        attn_output = attn_output.reshape(batch_size, patches, -1).contiguous()
+        attn_output = self.o_proj(attn_output)
+
+        return attn_output, attn_weights
+
+
+class DINOv3ViTLayerScale(nn.Module):
+    def __init__(self, config) -> None:
+        super().__init__()
+        self.lambda1 = nn.Parameter(config.layerscale_value * torch.ones(config.hidden_size))
+
+    def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
+        return hidden_state * self.lambda1
+
+
+def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
+    """
+    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
+
+    Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
+    however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
+    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
+    layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
+    argument.
+    """
+    if drop_prob == 0.0 or not training:
+        return input
+    keep_prob = 1 - drop_prob
+    shape = (input.shape[0],) + (1,) * (input.ndim - 1)  # work with diff dim tensors, not just 2D ConvNets
+    random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
+    random_tensor.floor_()  # binarize
+    output = input.div(keep_prob) * random_tensor
+    return output
+
+
+class DINOv3ViTDropPath(nn.Module):
+    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
+
+    def __init__(self, drop_prob: Optional[float] = None) -> None:
+        super().__init__()
+        self.drop_prob = drop_prob
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        return drop_path(hidden_states, self.drop_prob, self.training)
+
+    def extra_repr(self) -> str:
+        return f"p={self.drop_prob}"
+
+
+class DINOv3ViTMLP(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.intermediate_size = config.intermediate_size
+        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
+        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.mlp_bias)
+        self.act_fn = ACT2FN[config.hidden_act]
+
+    def forward(self, x):
+        return self.down_proj(self.act_fn(self.up_proj(x)))
+
+
+class DINOv3ViTGatedMLP(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.intermediate_size = config.intermediate_size
+        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
+        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
+        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.mlp_bias)
+        self.act_fn = ACT2FN[config.hidden_act]
+
+    def forward(self, x):
+        down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
+        return down_proj
+
+
+class DINOv3ViTLayer(GradientCheckpointingLayer):
+    """This corresponds to the Block class in the original implementation."""
+
+    def __init__(self, config: DINOv3ViTConfig):
+        super().__init__()
+
+        self.norm1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.attention = DINOv3ViTAttention(config)
+        self.layer_scale1 = DINOv3ViTLayerScale(config)
+        self.drop_path = DINOv3ViTDropPath(config.drop_path_rate) if config.drop_path_rate > 0.0 else nn.Identity()
+
+        self.norm2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+
+        if config.use_gated_mlp:
+            self.mlp = DINOv3ViTGatedMLP(config)
+        else:
+            self.mlp = DINOv3ViTMLP(config)
+        self.layer_scale2 = DINOv3ViTLayerScale(config)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
+    ) -> torch.Tensor:
+        # Attention with residual connection
+        residual = hidden_states
+        hidden_states = self.norm1(hidden_states)
+        hidden_states, _ = self.attention(
+            hidden_states,
+            attention_mask=attention_mask,
+            position_embeddings=position_embeddings,
+        )
+        hidden_states = self.layer_scale1(hidden_states)
+        hidden_states = self.drop_path(hidden_states) + residual
+
+        # MLP with residual connection
+        residual = hidden_states
+        hidden_states = self.norm2(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = self.layer_scale2(hidden_states)
+        hidden_states = self.drop_path(hidden_states) + residual
+
+        return hidden_states
+
+
+@auto_docstring
+class DINOv3ViTPreTrainedModel(PreTrainedModel):
+    config: DINOv3ViTConfig
+    base_model_prefix = "dinov3_vit"
+    main_input_name = "pixel_values"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["DINOv3ViTLayer"]
+    _supports_sdpa = True
+    _supports_flash_attn = True
+    _supports_flex_attn = True
+    _supports_attention_backend = True
+    _can_record_outputs = {
+        "hidden_states": DINOv3ViTLayer,
+        "attentions": DINOv3ViTAttention,
+    }
+
+    def _init_weights(self, module) -> None:
+        """Initialize the weights"""
+        if isinstance(module, (nn.Linear, nn.Conv2d)):
+            # Upcast the input in `fp32` and cast it back to desired `dtype` to avoid
+            # `trunc_normal_cpu` not implemented in `half` issues
+            module.weight.data = nn.init.trunc_normal_(
+                module.weight.data.to(torch.float32),
+                mean=0.0,
+                std=self.config.initializer_range,
+            ).to(module.weight.dtype)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+        elif isinstance(module, DINOv3ViTEmbeddings):
+            module.cls_token.data = nn.init.trunc_normal_(
+                module.cls_token.data.to(torch.float32),
+                mean=0.0,
+                std=self.config.initializer_range,
+            ).to(module.cls_token.dtype)
+            if module.config.num_register_tokens > 0:
+                module.register_tokens.data = nn.init.trunc_normal_(
+                    module.register_tokens.data.to(torch.float32),
+                    mean=0.0,
+                    std=self.config.initializer_range,
+                ).to(module.register_tokens.dtype)
+            module.mask_token.data.zero_()
+        elif isinstance(module, DINOv3ViTLayerScale):
+            module.lambda1.data.fill_(self.config.layerscale_value)
+
+
+@auto_docstring
+class DINOv3ViTModel(DINOv3ViTPreTrainedModel):
+    # Bare DINOv3 ViT encoder: embeddings -> stack of DINOv3ViTLayer -> final LayerNorm.
+    # The first token of the normalized sequence is exposed as the pooled output.
+    def __init__(self, config: DINOv3ViTConfig):
+        super().__init__(config)
+        self.config = config
+        self.embeddings = DINOv3ViTEmbeddings(config)
+        self.rope_embeddings = DINOv3ViTRopePositionEmbedding(config)
+        self.layer = nn.ModuleList([DINOv3ViTLayer(config) for _ in range(config.num_hidden_layers)])
+        self.norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.gradient_checkpointing = False
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    # The input embedding module is the patch projection convolution.
+    def get_input_embeddings(self):
+        return self.embeddings.patch_embeddings
+
+    @check_model_inputs
+    @auto_docstring
+    def forward(
+        self,
+        pixel_values: torch.Tensor,
+        bool_masked_pos: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> BaseModelOutputWithPooling:
+        r"""
+        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, sequence_length)`):
+            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0). Only relevant for
+            pre-training.
+        """
+
+        # Cast the input to the dtype of the patch projection weights.
+        pixel_values = pixel_values.to(self.embeddings.patch_embeddings.weight.dtype)
+        hidden_states = self.embeddings(pixel_values, bool_masked_pos=bool_masked_pos)
+        # (cos, sin) are computed once from the input resolution and shared by every layer.
+        position_embeddings = self.rope_embeddings(pixel_values)
+
+        for i, layer_module in enumerate(self.layer):
+            # `head_mask` is indexed per layer and handed to the layer as its `attention_mask`;
+            # the eager attention path multiplies it onto the attention probabilities.
+            # NOTE(review): other attention backends may interpret this argument differently - confirm.
+            layer_head_mask = head_mask[i] if head_mask is not None else None
+            # `**kwargs` is not forwarded: the layer's forward takes only these three arguments.
+            hidden_states = layer_module(
+                hidden_states,
+                attention_mask=layer_head_mask,
+                position_embeddings=position_embeddings,
+            )
+
+        sequence_output = self.norm(hidden_states)
+        # Pooled output is the first token of the normalized sequence (the CLS token position).
+        pooled_output = sequence_output[:, 0, :]
+
+        return BaseModelOutputWithPooling(
+            last_hidden_state=sequence_output,
+            pooler_output=pooled_output,
+        )
+
+
+__all__ = ["DINOv3ViTModel", "DINOv3ViTPreTrainedModel"]
diff --git a/phivenv/Lib/site-packages/transformers/models/dinov3_vit/modular_dinov3_vit.py b/phivenv/Lib/site-packages/transformers/models/dinov3_vit/modular_dinov3_vit.py
new file mode 100644
index 0000000000000000000000000000000000000000..f4a1e69beaacea50504d3865f7fc5bfdea1dfdc5
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/dinov3_vit/modular_dinov3_vit.py
@@ -0,0 +1,429 @@
+# coding=utf-8
+# Copyright 2025 Meta AI and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch DINOv3 model."""
+
+import math
+from typing import Callable, Optional
+
+import numpy as np
+import torch
+import torch.utils.checkpoint
+from torch import nn
+
+from transformers.models.arcee.modeling_arcee import ArceeMLP
+from transformers.models.dinov2.modeling_dinov2 import (
+    Dinov2DropPath,
+    Dinov2LayerScale,
+    Dinov2PreTrainedModel,
+    eager_attention_forward,
+)
+from transformers.models.llama.modeling_llama import LlamaMLP
+from transformers.models.pixtral.modeling_pixtral import PixtralAttention, rotate_half
+
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import BaseModelOutputWithPooling
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
+from ...processing_utils import Unpack
+from ...pytorch_utils import compile_compatible_method_lru_cache
+from ...utils import TransformersKwargs, auto_docstring, logging
+from ...utils.generic import check_model_inputs
+from .configuration_dinov3_vit import DINOv3ViTConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+class DINOv3ViTEmbeddings(nn.Module):
+    """
+    Construct the CLS token, mask token, register tokens and patch embeddings.
+    """
+
+    def __init__(self, config: DINOv3ViTConfig):
+        super().__init__()
+        self.config = config
+        self.cls_token = nn.Parameter(torch.randn(1, 1, config.hidden_size))
+        self.mask_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
+        self.register_tokens = nn.Parameter(torch.empty(1, config.num_register_tokens, config.hidden_size))
+        self.patch_embeddings = nn.Conv2d(
+            config.num_channels, config.hidden_size, kernel_size=config.patch_size, stride=config.patch_size
+        )
+
+    def forward(self, pixel_values: torch.Tensor, bool_masked_pos: Optional[torch.Tensor] = None) -> torch.Tensor:
+        batch_size = pixel_values.shape[0]
+        target_dtype = self.patch_embeddings.weight.dtype
+
+        # (batch_size, num_channels, height, width) -> (batch_size, num_patches, hidden_size)
+        patch_embeddings = self.patch_embeddings(pixel_values.to(dtype=target_dtype))
+        patch_embeddings = patch_embeddings.flatten(2).transpose(1, 2)
+
+        if bool_masked_pos is not None:
+            mask_token = self.mask_token.to(patch_embeddings.dtype)
+            patch_embeddings = torch.where(bool_masked_pos.unsqueeze(-1), mask_token, patch_embeddings)
+
+        # Add CLS and register tokens
+        cls_token = self.cls_token.expand(batch_size, -1, -1)
+        register_tokens = self.register_tokens.expand(batch_size, -1, -1)
+        embeddings = torch.cat([cls_token, register_tokens, patch_embeddings], dim=1)
+
+        return embeddings
+
+
+@compile_compatible_method_lru_cache(maxsize=32)
+def get_patches_center_coordinates(
+    num_patches_h: int, num_patches_w: int, dtype: torch.dtype, device: torch.device
+) -> torch.Tensor:
+    """
+    Computes the 2D coordinates of the centers of image patches, normalized to the range [-1, +1].
+    The center of each patch is exactly halfway between its top-left and bottom-right corners.
+
+    Args:
+        num_patches_h (int): Number of patches along the vertical (height) axis.
+        num_patches_w (int): Number of patches along the horizontal (width) axis.
+        dtype (torch.dtype), device (torch.device): The desired data type and device of the returned tensor.
+
+    Returns:
+        torch.Tensor: A tensor of shape (num_patches_h * num_patches_w, 2), where each row contains the (y, x)
+            coordinates of a patch center, normalized to [-1, +1].
+    """
+    coords_h = torch.arange(0.5, num_patches_h, dtype=dtype, device=device)
+    coords_w = torch.arange(0.5, num_patches_w, dtype=dtype, device=device)
+    coords_h = coords_h / num_patches_h
+    coords_w = coords_w / num_patches_w
+    # (height, width, 2) -> (height * width, 2)
+    coords = torch.stack(torch.meshgrid(coords_h, coords_w, indexing="ij"), dim=-1)
+    coords = coords.flatten(0, 1)
+    # Shift range [0, 1] to [-1, +1]
+    coords = 2.0 * coords - 1.0
+    return coords
+
+
+def augment_patches_center_coordinates(
+    coords: torch.Tensor,
+    shift: Optional[float] = None,
+    jitter: Optional[float] = None,
+    rescale: Optional[float] = None,
+) -> torch.Tensor:
+    # Shift coords by adding a uniform value in [-shift, shift]
+    if shift is not None:
+        shift_hw = torch.empty((1, 2), device=coords.device, dtype=coords.dtype)
+        shift_hw = shift_hw.uniform_(-shift, shift)
+        coords = coords + shift_hw
+
+    # Jitter coords by multiplying the range [-1, 1] by a log-uniform value in [1/jitter, jitter]
+    if jitter is not None:
+        jitter_range = np.log(jitter)
+        jitter_hw = torch.empty((1, 2), device=coords.device, dtype=coords.dtype)
+        jitter_hw = jitter_hw.uniform_(-jitter_range, jitter_range).exp()
+        coords = coords * jitter_hw
+
+    # Rescale coords by multiplying the range [-1, 1] by a log-uniform value in [1/rescale, rescale]
+    if rescale is not None:
+        rescale_range = np.log(rescale)
+        rescale_hw = torch.empty(1, device=coords.device, dtype=coords.dtype)
+        rescale_hw = rescale_hw.uniform_(-rescale_range, rescale_range).exp()
+        coords = coords * rescale_hw
+
+    return coords
+
+
+class DINOv3ViTRopePositionEmbedding(nn.Module):
+    inv_freq: torch.Tensor
+
+    def __init__(self, config: DINOv3ViTConfig):
+        super().__init__()
+
+        self.config = config
+        self.base = config.rope_theta
+        self.head_dim = config.hidden_size // config.num_attention_heads
+        self.num_patches_h = config.image_size // config.patch_size
+        self.num_patches_w = config.image_size // config.patch_size
+
+        inv_freq = 1 / self.base ** torch.arange(0, 1, 4 / self.head_dim, dtype=torch.float32)  # (head_dim / 4,)
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+
+    def forward(self, pixel_values: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+        _, _, height, width = pixel_values.shape
+        num_patches_h = height // self.config.patch_size
+        num_patches_w = width // self.config.patch_size
+
+        device = pixel_values.device
+        device_type = device.type if isinstance(device.type, str) and device.type != "mps" else "cpu"
+
+        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+            # Although we could precompute static patch_coords from image_size and patch_size in the config,
+            # the model was trained with random_scale, so it can process images of varying sizes.
+            # Therefore, it's better to compute patch_coords dynamically (with lru_cache).
+            patch_coords = get_patches_center_coordinates(
+                num_patches_h, num_patches_w, dtype=torch.float32, device=device
+            )
+            if self.training:
+                patch_coords = augment_patches_center_coordinates(
+                    patch_coords,
+                    shift=self.config.pos_embed_shift,
+                    jitter=self.config.pos_embed_jitter,
+                    rescale=self.config.pos_embed_rescale,
+                )
+
+            # (height * width, 2, head_dim / 4) -> (height * width, head_dim / 2) -> (height * width, head_dim)
+            angles = 2 * math.pi * patch_coords[:, :, None] * self.inv_freq[None, None, :]
+            angles = angles.flatten(1, 2)
+            angles = angles.tile(2)
+
+            cos = torch.cos(angles)
+            sin = torch.sin(angles)
+
+        dtype = pixel_values.dtype
+        return cos.to(dtype=dtype), sin.to(dtype=dtype)
+
+
+def apply_rotary_pos_emb(
+    q: torch.Tensor, k: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor, **kwargs
+) -> tuple[torch.Tensor, torch.Tensor]:
+    """Applies Rotary Position Embedding to the query and key tensors, but only to the patch tokens,
+    ignoring the prefix tokens (cls token and register tokens).
+
+    Args:
+        q (`torch.Tensor`): The query tensor.
+        k (`torch.Tensor`): The key tensor.
+        cos (`torch.Tensor`): The cosine part of the rotary embedding.
+        sin (`torch.Tensor`): The sine part of the rotary embedding.
+
+    Returns:
+        `tuple(torch.Tensor)` comprising the query and key tensors rotated using the Rotary Position Embedding.
+    """
+
+    num_tokens = q.shape[-2]
+    num_patches = sin.shape[-2]
+    num_prefix_tokens = num_tokens - num_patches  # cls token + register tokens
+
+    q_prefix_tokens, q_patches = q.split((num_prefix_tokens, num_patches), dim=-2)
+    k_prefix_tokens, k_patches = k.split((num_prefix_tokens, num_patches), dim=-2)
+
+    # apply rope only to patch tokens
+    q_patches = (q_patches * cos) + (rotate_half(q_patches) * sin)
+    k_patches = (k_patches * cos) + (rotate_half(k_patches) * sin)
+
+    q = torch.cat((q_prefix_tokens, q_patches), dim=-2)
+    k = torch.cat((k_prefix_tokens, k_patches), dim=-2)
+
+    return q, k
+
+
+class DINOv3ViTAttention(PixtralAttention):
+    def __init__(self, config: DINOv3ViTConfig):
+        super().__init__(config)
+
+        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.query_bias)
+        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.key_bias)
+        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.value_bias)
+        self.o_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.proj_bias)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        """Input shape: Batch x Time x Channel"""
+
+        batch_size, patches, _ = hidden_states.size()
+
+        query_states = self.q_proj(hidden_states)
+        key_states = self.k_proj(hidden_states)
+        value_states = self.v_proj(hidden_states)
+
+        query_states = query_states.view(batch_size, patches, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = key_states.view(batch_size, patches, self.num_heads, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(batch_size, patches, self.num_heads, self.head_dim).transpose(1, 2)
+
+        cos, sin = position_embeddings
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+
+        attention_interface: Callable = eager_attention_forward
+        if self.config._attn_implementation != "eager":
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            dropout=0.0 if not self.training else self.dropout,
+            scaling=self.scaling,
+            **kwargs,
+        )
+
+        attn_output = attn_output.reshape(batch_size, patches, -1).contiguous()
+        attn_output = self.o_proj(attn_output)
+
+        return attn_output, attn_weights
+
+
+class DINOv3ViTLayerScale(Dinov2LayerScale):
+    pass
+
+
+class DINOv3ViTDropPath(Dinov2DropPath):
+    pass
+
+
+class DINOv3ViTMLP(ArceeMLP):
+    pass
+
+
+class DINOv3ViTGatedMLP(LlamaMLP):
+    pass
+
+
+class DINOv3ViTLayer(GradientCheckpointingLayer):
+    """This corresponds to the Block class in the original implementation."""
+
+    def __init__(self, config: DINOv3ViTConfig):
+        super().__init__()
+
+        self.norm1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.attention = DINOv3ViTAttention(config)
+        self.layer_scale1 = DINOv3ViTLayerScale(config)
+        self.drop_path = DINOv3ViTDropPath(config.drop_path_rate) if config.drop_path_rate > 0.0 else nn.Identity()
+
+        self.norm2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+
+        if config.use_gated_mlp:
+            self.mlp = DINOv3ViTGatedMLP(config)
+        else:
+            self.mlp = DINOv3ViTMLP(config)
+        self.layer_scale2 = DINOv3ViTLayerScale(config)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
+    ) -> torch.Tensor:
+        # Attention with residual connection
+        residual = hidden_states
+        hidden_states = self.norm1(hidden_states)
+        hidden_states, _ = self.attention(
+            hidden_states,
+            attention_mask=attention_mask,
+            position_embeddings=position_embeddings,
+        )
+        hidden_states = self.layer_scale1(hidden_states)
+        hidden_states = self.drop_path(hidden_states) + residual
+
+        # MLP with residual connection
+        residual = hidden_states
+        hidden_states = self.norm2(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = self.layer_scale2(hidden_states)
+        hidden_states = self.drop_path(hidden_states) + residual
+
+        return hidden_states
+
+
+@auto_docstring
+class DINOv3ViTPreTrainedModel(Dinov2PreTrainedModel):
+    _can_record_outputs = {
+        "hidden_states": DINOv3ViTLayer,
+        "attentions": DINOv3ViTAttention,
+    }
+
+    def _init_weights(self, module):
+        """Initialize the weights"""
+        if isinstance(module, (nn.Linear, nn.Conv2d)):
+            # Upcast the weights to `fp32` and cast them back to the original `dtype` to avoid the
+            # `trunc_normal_cpu` not implemented for `half` error
+            module.weight.data = nn.init.trunc_normal_(
+                module.weight.data.to(torch.float32),
+                mean=0.0,
+                std=self.config.initializer_range,
+            ).to(module.weight.dtype)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+        elif isinstance(module, DINOv3ViTEmbeddings):
+            module.cls_token.data = nn.init.trunc_normal_(
+                module.cls_token.data.to(torch.float32),
+                mean=0.0,
+                std=self.config.initializer_range,
+            ).to(module.cls_token.dtype)
+            if module.config.num_register_tokens > 0:
+                module.register_tokens.data = nn.init.trunc_normal_(
+                    module.register_tokens.data.to(torch.float32),
+                    mean=0.0,
+                    std=self.config.initializer_range,
+                ).to(module.register_tokens.dtype)
+            module.mask_token.data.zero_()
+        elif isinstance(module, DINOv3ViTLayerScale):
+            module.lambda1.data.fill_(self.config.layerscale_value)
+
+
+@auto_docstring
+class DINOv3ViTModel(DINOv3ViTPreTrainedModel):
+    def __init__(self, config: DINOv3ViTConfig):
+        super().__init__(config)
+        self.config = config
+        self.embeddings = DINOv3ViTEmbeddings(config)
+        self.rope_embeddings = DINOv3ViTRopePositionEmbedding(config)
+        self.layer = nn.ModuleList([DINOv3ViTLayer(config) for _ in range(config.num_hidden_layers)])
+        self.norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.gradient_checkpointing = False
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.embeddings.patch_embeddings
+
+    @check_model_inputs
+    @auto_docstring
+    def forward(
+        self,
+        pixel_values: torch.Tensor,
+        bool_masked_pos: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> BaseModelOutputWithPooling:
+        r"""
+        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`):
+            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0). Only relevant for
+            pre-training.
+        """
+
+        pixel_values = pixel_values.to(self.embeddings.patch_embeddings.weight.dtype)
+        hidden_states = self.embeddings(pixel_values, bool_masked_pos=bool_masked_pos)
+        position_embeddings = self.rope_embeddings(pixel_values)
+
+        for i, layer_module in enumerate(self.layer):
+            layer_head_mask = head_mask[i] if head_mask is not None else None
+            hidden_states = layer_module(
+                hidden_states,
+                attention_mask=layer_head_mask,
+                position_embeddings=position_embeddings,
+            )
+
+        sequence_output = self.norm(hidden_states)
+        pooled_output = sequence_output[:, 0, :]
+
+        return BaseModelOutputWithPooling(
+            last_hidden_state=sequence_output,
+            pooler_output=pooled_output,
+        )
+
+
+__all__ = ["DINOv3ViTModel", "DINOv3ViTPreTrainedModel"]
diff --git a/phivenv/Lib/site-packages/transformers/models/distilbert/__init__.py b/phivenv/Lib/site-packages/transformers/models/distilbert/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..4d6fae2e0236e7619988f0cfa3502ed49d0f90b0
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/distilbert/__init__.py
@@ -0,0 +1,31 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_distilbert import *
+    from .modeling_distilbert import *
+    from .modeling_flax_distilbert import *
+    from .modeling_tf_distilbert import *
+    from .tokenization_distilbert import *
+    from .tokenization_distilbert_fast import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/phivenv/Lib/site-packages/transformers/models/distilbert/__pycache__/__init__.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/distilbert/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..797f32e7b1e9122733c74948ba94a1fc76034945
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/distilbert/__pycache__/__init__.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/distilbert/__pycache__/configuration_distilbert.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/distilbert/__pycache__/configuration_distilbert.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..172d6a2f758f5bc6d06bd61197628005cf4006c3
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/distilbert/__pycache__/configuration_distilbert.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/distilbert/__pycache__/modeling_distilbert.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/distilbert/__pycache__/modeling_distilbert.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..506aebbe82bf436f6d5e0560d732e5aea565be2f
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/distilbert/__pycache__/modeling_distilbert.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/distilbert/__pycache__/modeling_flax_distilbert.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/distilbert/__pycache__/modeling_flax_distilbert.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2baff5de1bbe2a6aed0826fb245fe13b2e14f145
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/distilbert/__pycache__/modeling_flax_distilbert.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/distilbert/__pycache__/modeling_tf_distilbert.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/distilbert/__pycache__/modeling_tf_distilbert.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2e5f920634568d01f276ca3f61d7bd5d8a4071f3
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/distilbert/__pycache__/modeling_tf_distilbert.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/distilbert/__pycache__/tokenization_distilbert.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/distilbert/__pycache__/tokenization_distilbert.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b5dae836fd141874be259bf5932caf3ea6ffce7c
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/distilbert/__pycache__/tokenization_distilbert.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/distilbert/__pycache__/tokenization_distilbert_fast.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/distilbert/__pycache__/tokenization_distilbert_fast.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b7dd40f64a37749c1731537e693d7b7c380f6b50
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/distilbert/__pycache__/tokenization_distilbert_fast.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/distilbert/configuration_distilbert.py b/phivenv/Lib/site-packages/transformers/models/distilbert/configuration_distilbert.py
new file mode 100644
index 0000000000000000000000000000000000000000..0aa6d2dfd7c589c891204b8301d7677b097e29d8
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/distilbert/configuration_distilbert.py
@@ -0,0 +1,141 @@
+# coding=utf-8
+# Copyright 2019-present, the HuggingFace Inc. team, The Google AI Language Team and Facebook, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""DistilBERT model configuration"""
+
+from collections import OrderedDict
+from collections.abc import Mapping
+
+from ...configuration_utils import PretrainedConfig
+from ...onnx import OnnxConfig
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class DistilBertConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`DistilBertModel`] or a [`TFDistilBertModel`]. It
+    is used to instantiate a DistilBERT model according to the specified arguments, defining the model architecture.
+    Instantiating a configuration with the defaults will yield a similar configuration to that of the DistilBERT
+    [distilbert-base-uncased](https://huggingface.co/distilbert-base-uncased) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 30522):
+            Vocabulary size of the DistilBERT model. Defines the number of different tokens that can be represented by
+            the `input_ids` passed when calling [`DistilBertModel`] or [`TFDistilBertModel`].
+        max_position_embeddings (`int`, *optional*, defaults to 512):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            just in case (e.g., 512 or 1024 or 2048).
+        sinusoidal_pos_embds (`bool`, *optional*, defaults to `False`):
+            Whether to use sinusoidal positional embeddings.
+        n_layers (`int`, *optional*, defaults to 6):
+            Number of hidden layers in the Transformer encoder.
+        n_heads (`int`, *optional*, defaults to 12):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        dim (`int`, *optional*, defaults to 768):
+            Dimensionality of the encoder layers and the pooler layer.
+        hidden_dim (`int`, *optional*, defaults to 3072):
+            The size of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
+        dropout (`float`, *optional*, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_dropout (`float`, *optional*, defaults to 0.1):
+            The dropout ratio for the attention probabilities.
+        activation (`str` or `Callable`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"silu"` and `"gelu_new"` are supported.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        qa_dropout (`float`, *optional*, defaults to 0.1):
+            The dropout probabilities used in the question answering model [`DistilBertForQuestionAnswering`].
+        seq_classif_dropout (`float`, *optional*, defaults to 0.2):
+            The dropout probabilities used in the sequence classification and the multiple choice model
+            [`DistilBertForSequenceClassification`].
+
+    Examples:
+
+    ```python
+    >>> from transformers import DistilBertConfig, DistilBertModel
+
+    >>> # Initializing a DistilBERT configuration
+    >>> configuration = DistilBertConfig()
+
+    >>> # Initializing a model (with random weights) from the configuration
+    >>> model = DistilBertModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "distilbert"
+    attribute_map = {
+        "hidden_size": "dim",
+        "num_attention_heads": "n_heads",
+        "num_hidden_layers": "n_layers",
+    }
+
+    def __init__(
+        self,
+        vocab_size=30522,
+        max_position_embeddings=512,
+        sinusoidal_pos_embds=False,
+        n_layers=6,
+        n_heads=12,
+        dim=768,
+        hidden_dim=4 * 768,
+        dropout=0.1,
+        attention_dropout=0.1,
+        activation="gelu",
+        initializer_range=0.02,
+        qa_dropout=0.1,
+        seq_classif_dropout=0.2,
+        pad_token_id=0,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.sinusoidal_pos_embds = sinusoidal_pos_embds
+        self.n_layers = n_layers
+        self.n_heads = n_heads
+        self.dim = dim
+        self.hidden_dim = hidden_dim
+        self.dropout = dropout
+        self.attention_dropout = attention_dropout
+        self.activation = activation
+        self.initializer_range = initializer_range
+        self.qa_dropout = qa_dropout
+        self.seq_classif_dropout = seq_classif_dropout
+        super().__init__(**kwargs, pad_token_id=pad_token_id)
+
+
+class DistilBertOnnxConfig(OnnxConfig):
+    @property
+    def inputs(self) -> Mapping[str, Mapping[int, str]]:
+        if self.task == "multiple-choice":
+            dynamic_axis = {0: "batch", 1: "choice", 2: "sequence"}
+        else:
+            dynamic_axis = {0: "batch", 1: "sequence"}
+        return OrderedDict(
+            [
+                ("input_ids", dynamic_axis),
+                ("attention_mask", dynamic_axis),
+            ]
+        )
+
+
+__all__ = ["DistilBertConfig", "DistilBertOnnxConfig"]
diff --git a/phivenv/Lib/site-packages/transformers/models/distilbert/modeling_distilbert.py b/phivenv/Lib/site-packages/transformers/models/distilbert/modeling_distilbert.py
new file mode 100644
index 0000000000000000000000000000000000000000..58f5aed348f0782fdeef29b0a5ed8447cfc73ad6
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/distilbert/modeling_distilbert.py
@@ -0,0 +1,1289 @@
+# coding=utf-8
+# Copyright 2019-present, the HuggingFace Inc. team, The Google AI Language Team and Facebook, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+PyTorch DistilBERT model adapted in part from Facebook, Inc XLM model (https://github.com/facebookresearch/XLM) and in
+part from HuggingFace PyTorch version of Google AI Bert model (https://github.com/google-research/bert)
+"""
+
+import math
+from typing import Optional, Union
+
+import numpy as np
+import torch
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+
+from ...activations import get_activation
+from ...configuration_utils import PretrainedConfig
+from ...integrations.deepspeed import is_deepspeed_zero3_enabled
+from ...modeling_attn_mask_utils import _prepare_4d_attention_mask_for_sdpa
+from ...modeling_flash_attention_utils import flash_attn_supports_top_left_mask, is_flash_attn_available
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import (
+    BaseModelOutput,
+    MaskedLMOutput,
+    MultipleChoiceModelOutput,
+    QuestionAnsweringModelOutput,
+    SequenceClassifierOutput,
+    TokenClassifierOutput,
+)
+from ...modeling_utils import PreTrainedModel
+from ...pytorch_utils import (
+    apply_chunking_to_forward,
+    find_pruneable_heads_and_indices,
+    prune_linear_layer,
+)
+from ...utils import (
+    auto_docstring,
+    logging,
+)
+from .configuration_distilbert import DistilBertConfig
+
+
+if is_flash_attn_available():
+    from ...modeling_flash_attention_utils import _flash_attention_forward
+
+
+logger = logging.get_logger(__name__)
+
+
+# UTILS AND BUILDING BLOCKS OF THE ARCHITECTURE #
+
+
+def create_sinusoidal_embeddings(n_pos: int, dim: int, out: torch.Tensor):
+    if is_deepspeed_zero3_enabled():
+        import deepspeed
+
+        with deepspeed.zero.GatheredParameters(out, modifier_rank=0):
+            if torch.distributed.get_rank() == 0:
+                _create_sinusoidal_embeddings(n_pos=n_pos, dim=dim, out=out)
+    else:
+        _create_sinusoidal_embeddings(n_pos=n_pos, dim=dim, out=out)
+
+
+def _create_sinusoidal_embeddings(n_pos: int, dim: int, out: torch.Tensor):
+    position_enc = np.array([[pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)] for pos in range(n_pos)])
+    out.requires_grad = False
+    out[:, 0::2] = torch.FloatTensor(np.sin(position_enc[:, 0::2]))
+    out[:, 1::2] = torch.FloatTensor(np.cos(position_enc[:, 1::2]))
+    out.detach_()
+
+
+class Embeddings(nn.Module):
+    def __init__(self, config: PretrainedConfig):
+        super().__init__()
+        self.word_embeddings = nn.Embedding(config.vocab_size, config.dim, padding_idx=config.pad_token_id)
+        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.dim)
+
+        self.LayerNorm = nn.LayerNorm(config.dim, eps=1e-12)
+        self.dropout = nn.Dropout(config.dropout)
+        self.register_buffer(
+            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
+        )
+
+    def forward(self, input_ids: torch.Tensor, input_embeds: Optional[torch.Tensor] = None) -> torch.Tensor:
+        """
+        Parameters:
+            input_ids (torch.Tensor):
+                torch.tensor(bs, max_seq_length) The token ids to embed.
+            input_embeds (*optional*, torch.Tensor):
+                The pre-computed word embeddings. Can only be passed if the input ids are `None`.
+
+
+        Returns: torch.tensor(bs, max_seq_length, dim) The embedded tokens (plus position embeddings, no token_type
+        embeddings)
+        """
+        if input_ids is not None:
+            input_embeds = self.word_embeddings(input_ids)  # (bs, max_seq_length, dim)
+
+        seq_length = input_embeds.size(1)
+
+        # Setting the position-ids to the registered buffer in constructor, it helps
+        # when tracing the model without passing position-ids, solves
+        # issues similar to issue #5664
+        if hasattr(self, "position_ids"):
+            position_ids = self.position_ids[:, :seq_length]
+        else:
+            position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device)  # (max_seq_length)
+            position_ids = position_ids.unsqueeze(0).expand_as(input_ids)  # (bs, max_seq_length)
+
+        position_embeddings = self.position_embeddings(position_ids)  # (bs, max_seq_length, dim)
+
+        embeddings = input_embeds + position_embeddings  # (bs, max_seq_length, dim)
+        embeddings = self.LayerNorm(embeddings)  # (bs, max_seq_length, dim)
+        embeddings = self.dropout(embeddings)  # (bs, max_seq_length, dim)
+        return embeddings
+
+
+class MultiHeadSelfAttention(nn.Module):
+    def __init__(self, config: PretrainedConfig):
+        super().__init__()
+        self.config = config
+
+        self.n_heads = config.n_heads
+        self.dim = config.dim
+        self.dropout = nn.Dropout(p=config.attention_dropout)
+        self.is_causal = False
+
+        # Have an even number of multi heads that divide the dimensions
+        if self.dim % self.n_heads != 0:
+            # Raise value errors for even multi-head attention nodes
+            raise ValueError(f"self.n_heads: {self.n_heads} must divide self.dim: {self.dim} evenly")
+
+        self.q_lin = nn.Linear(in_features=config.dim, out_features=config.dim)
+        self.k_lin = nn.Linear(in_features=config.dim, out_features=config.dim)
+        self.v_lin = nn.Linear(in_features=config.dim, out_features=config.dim)
+        self.out_lin = nn.Linear(in_features=config.dim, out_features=config.dim)
+
+        self.pruned_heads: set[int] = set()
+        self.attention_head_size = self.dim // self.n_heads
+
+    def prune_heads(self, heads: list[int]):
+        if len(heads) == 0:
+            return
+        heads, index = find_pruneable_heads_and_indices(
+            heads, self.n_heads, self.attention_head_size, self.pruned_heads
+        )
+        # Prune linear layers
+        self.q_lin = prune_linear_layer(self.q_lin, index)
+        self.k_lin = prune_linear_layer(self.k_lin, index)
+        self.v_lin = prune_linear_layer(self.v_lin, index)
+        self.out_lin = prune_linear_layer(self.out_lin, index, dim=1)
+        # Update hyper params
+        self.n_heads = self.n_heads - len(heads)
+        self.dim = self.attention_head_size * self.n_heads
+        self.pruned_heads = self.pruned_heads.union(heads)
+
+    def forward(
+        self,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        mask: torch.Tensor,
+        head_mask: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+    ) -> tuple[torch.Tensor, ...]:
+        """
+        Parameters:
+            query: torch.tensor(bs, seq_length, dim)
+            key: torch.tensor(bs, seq_length, dim)
+            value: torch.tensor(bs, seq_length, dim)
+            mask: torch.tensor(bs, seq_length)
+
+        Returns:
+            weights: torch.tensor(bs, n_heads, seq_length, seq_length) Attention weights context: torch.tensor(bs,
+            seq_length, dim) Contextualized layer. Optional: only if `output_attentions=True`
+        """
+        bs, q_length, dim = query.size()
+        k_length = key.size(1)
+        # assert dim == self.dim, f'Dimensions do not match: {dim} input vs {self.dim} configured'
+        # assert key.size() == value.size()
+
+        dim_per_head = self.dim // self.n_heads
+
+        mask_reshp = (bs, 1, 1, k_length)
+
+        def shape(x: torch.Tensor) -> torch.Tensor:
+            """separate heads"""
+            return x.view(bs, -1, self.n_heads, dim_per_head).transpose(1, 2)
+
+        def unshape(x: torch.Tensor) -> torch.Tensor:
+            """group heads"""
+            return x.transpose(1, 2).contiguous().view(bs, -1, self.n_heads * dim_per_head)
+
+        q = shape(self.q_lin(query))  # (bs, n_heads, q_length, dim_per_head)
+        k = shape(self.k_lin(key))  # (bs, n_heads, k_length, dim_per_head)
+        v = shape(self.v_lin(value))  # (bs, n_heads, k_length, dim_per_head)
+
+        q = q / math.sqrt(dim_per_head)  # (bs, n_heads, q_length, dim_per_head)
+        scores = torch.matmul(q, k.transpose(2, 3))  # (bs, n_heads, q_length, k_length)
+        mask = (mask == 0).view(mask_reshp).expand_as(scores)  # (bs, n_heads, q_length, k_length)
+        scores = scores.masked_fill(
+            mask, torch.tensor(torch.finfo(scores.dtype).min)
+        )  # (bs, n_heads, q_length, k_length)
+
+        weights = nn.functional.softmax(scores, dim=-1)  # (bs, n_heads, q_length, k_length)
+        weights = self.dropout(weights)  # (bs, n_heads, q_length, k_length)
+
+        # Mask heads if we want to
+        if head_mask is not None:
+            weights = weights * head_mask
+
+        context = torch.matmul(weights, v)  # (bs, n_heads, q_length, dim_per_head)
+        context = unshape(context)  # (bs, q_length, dim)
+        context = self.out_lin(context)  # (bs, q_length, dim)
+
+        if output_attentions:
+            return (context, weights)
+        else:
+            return (context,)
+
+
+class DistilBertFlashAttention2(MultiHeadSelfAttention):
+    """
+    DistilBert flash attention module. This module inherits from `MultiHeadSelfAttention` because the weights of the
+    module stay untouched. The only change is in the forward pass, which calls `_flash_attention_forward` instead of
+    computing the attention scores explicitly and passes the padding mask on to that helper.
+    """
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
+        # flash_attn<2.1 generates a top-left aligned causal mask, while what is needed here is bottom-right
+        # alignment, which became the default in flash_attn>=2.1. This attribute is used to handle this difference.
+        # Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
+        # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces
+        # a wrong mask (top-left).
+        self._flash_attn_uses_top_left_mask = flash_attn_supports_top_left_mask()
+
+    def forward(
+        self,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        mask: torch.Tensor,
+        head_mask: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+    ) -> tuple[torch.Tensor, ...]:
+        """
+        Parameters:
+            query: torch.tensor(bs, seq_length, dim)
+            key: torch.tensor(bs, seq_length, dim)
+            value: torch.tensor(bs, seq_length, dim)
+            mask: torch.tensor(bs, seq_length), forwarded unchanged to `_flash_attention_forward`
+            head_mask: accepted for interface compatibility; it is not used by this implementation
+
+        Returns:
+            A tuple `(attn_output,)`, or `(attn_output, attn_weights)` if `output_attentions=True`. `attn_output` is
+            torch.tensor(bs, seq_length, dim). `attn_weights` is the raw return value of `_flash_attention_forward`
+            (see the note in the body).
+        """
+        batch_size, q_length, dim = query.size()
+
+        dim_per_head = self.dim // self.n_heads
+
+        def reshape(x: torch.Tensor) -> torch.Tensor:
+            """separate heads"""
+            return x.view(batch_size, -1, self.n_heads, dim_per_head)
+
+        # The projections are reshaped to (batch_size, seq_length, n_heads, dim_per_head); unlike the eager
+        # implementation, the head axis is not transposed in front of the sequence axis.
+        query_states = reshape(self.q_lin(query))
+        key_states = reshape(self.k_lin(key))
+        value_states = reshape(self.v_lin(value))
+
+        # Dropout is only applied in training mode.
+        attn_dropout = self.config.attention_dropout if self.training else 0.0
+
+        # In PEFT, usually we cast the layer norms in float32 for training stability reasons
+        # therefore the input hidden states gets silently casted in float32. Hence, we need
+        # cast them back in the correct dtype just to be sure everything works as expected.
+        # This might slowdown training & inference so it is recommended to not cast the LayerNorms
+        # in fp32.
+
+        # Autocast dtype lookups for "mps" are redirected to "cpu".
+        device_type = query_states.device.type if query_states.device.type != "mps" else "cpu"
+        if query_states.dtype == torch.float32:
+            # Pick the dtype to cast back to: autocast dtype first, then the pre-quantization dtype, and finally
+            # the dtype of the query projection weights.
+            if torch.is_autocast_enabled():
+                target_dtype = (
+                    torch.get_autocast_dtype(device_type)
+                    if hasattr(torch, "get_autocast_dtype")
+                    else torch.get_autocast_gpu_dtype()
+                )
+            # Handle the case where the model is quantized
+            elif hasattr(self.config, "_pre_quantization_dtype"):
+                target_dtype = self.config._pre_quantization_dtype
+            else:
+                target_dtype = self.q_lin.weight.dtype
+
+            logger.warning_once(
+                f"The input hidden states seems to be silently casted in float32, this might be related to"
+                f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
+                f" {target_dtype}."
+            )
+
+            query_states = query_states.to(target_dtype)
+            key_states = key_states.to(target_dtype)
+            value_states = value_states.to(target_dtype)
+
+        # NOTE(review): despite its name, `attn_weights` is reshaped to (batch_size, q_length, dim) below and fed to
+        # the output projection, so it presumably holds the attention output rather than attention probabilities.
+        # Confirm against `_flash_attention_forward` before relying on the `output_attentions=True` return value.
+        attn_weights = _flash_attention_forward(
+            query_states,
+            key_states,
+            value_states,
+            mask,
+            q_length,
+            dropout=attn_dropout,
+            use_top_left_mask=self._flash_attn_uses_top_left_mask,
+            is_causal=self.is_causal,
+        )
+
+        # Merge the head axes back into the model width and apply the output projection.
+        attn_weights_reshaped = attn_weights.reshape(batch_size, q_length, self.n_heads * dim_per_head)
+        attn_output = self.out_lin(attn_weights_reshaped)
+
+        if output_attentions:
+            return (attn_output, attn_weights)
+        else:
+            return (attn_output,)
+
+
+class DistilBertSdpaAttention(MultiHeadSelfAttention):
+    def __init__(self, config: PretrainedConfig):
+        super().__init__(config=config)
+        self.dropout_prob = config.attention_dropout
+
+    def forward(
+        self,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        mask: torch.Tensor,
+        head_mask: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+    ) -> tuple[torch.Tensor, ...]:
+        """
+        Parameters:
+            query: torch.tensor(bs, seq_length, dim)
+            key: torch.tensor(bs, seq_length, dim)
+            value: torch.tensor(bs, seq_length, dim)
+            mask: torch.tensor(bs, seq_length)
+
+        Returns:
+            weights: torch.tensor(bs, n_heads, seq_length, seq_length) Attention weights context: torch.tensor(bs,
+            seq_length, dim) Contextualized layer. Optional: only if `output_attentions=True`
+        """
+        if output_attentions or head_mask is not None:
+            logger.warning_once(
+                "DistilBertSdpaAttention is used but `torch.nn.functional.scaled_dot_product_attention` does not support"
+                " `output_attentions=True` or `head_mask`. Falling back to the manual attention implementation, but specifying"
+                " the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be"
+                ' removed using the argument `attn_implementation="eager"` when loading the model.'
+            )
+            return super().forward(
+                query,
+                key,
+                value,
+                mask,
+                head_mask,
+                output_attentions,
+            )
+
+        batch_size, _, _ = query.size()
+        dim_per_head = self.dim // self.n_heads
+
+        def shape(x: torch.Tensor) -> torch.Tensor:
+            """separate heads"""
+            return x.view(batch_size, -1, self.n_heads, dim_per_head).transpose(1, 2)
+
+        def unshape(x: torch.Tensor) -> torch.Tensor:
+            """group heads"""
+            return x.transpose(1, 2).contiguous().view(batch_size, -1, self.n_heads * dim_per_head)
+
+        q = shape(self.q_lin(query))  # (bs, n_heads, q_length, dim_per_head)
+        k = shape(self.k_lin(key))  # (bs, n_heads, k_length, dim_per_head)
+        v = shape(self.v_lin(value))  # (bs, n_heads, k_length, dim_per_head)
+
+        attn_output = torch.nn.functional.scaled_dot_product_attention(
+            q,
+            k,
+            v,
+            attn_mask=mask,
+            dropout_p=self.dropout_prob if self.training else 0.0,
+            is_causal=False,
+        )
+
+        attn_output = unshape(attn_output)
+        attn_output = self.out_lin(attn_output)
+
+        return (attn_output,)
+
+
+class FFN(nn.Module):
+    def __init__(self, config: PretrainedConfig):
+        super().__init__()
+        self.dropout = nn.Dropout(p=config.dropout)
+        self.chunk_size_feed_forward = config.chunk_size_feed_forward
+        self.seq_len_dim = 1
+        self.lin1 = nn.Linear(in_features=config.dim, out_features=config.hidden_dim)
+        self.lin2 = nn.Linear(in_features=config.hidden_dim, out_features=config.dim)
+        self.activation = get_activation(config.activation)
+
+    def forward(self, input: torch.Tensor) -> torch.Tensor:
+        return apply_chunking_to_forward(self.ff_chunk, self.chunk_size_feed_forward, self.seq_len_dim, input)
+
+    def ff_chunk(self, input: torch.Tensor) -> torch.Tensor:
+        x = self.lin1(input)
+        x = self.activation(x)
+        x = self.lin2(x)
+        x = self.dropout(x)
+        return x
+
+
+DISTILBERT_ATTENTION_CLASSES = {
+    "eager": MultiHeadSelfAttention,
+    "flash_attention_2": DistilBertFlashAttention2,
+    "sdpa": DistilBertSdpaAttention,
+}
+
+
+class TransformerBlock(GradientCheckpointingLayer):
+    def __init__(self, config: PretrainedConfig):
+        super().__init__()
+
+        # Have an even number of Configure multi-heads
+        if config.dim % config.n_heads != 0:
+            raise ValueError(f"config.n_heads {config.n_heads} must divide config.dim {config.dim} evenly")
+
+        self.attention = DISTILBERT_ATTENTION_CLASSES[config._attn_implementation](config)
+        self.sa_layer_norm = nn.LayerNorm(normalized_shape=config.dim, eps=1e-12)
+
+        self.ffn = FFN(config)
+        self.output_layer_norm = nn.LayerNorm(normalized_shape=config.dim, eps=1e-12)
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        attn_mask: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+    ) -> tuple[torch.Tensor, ...]:
+        """
+        Parameters:
+            x: torch.tensor(bs, seq_length, dim)
+            attn_mask: torch.tensor(bs, seq_length)
+
+        Returns:
+            sa_weights: torch.tensor(bs, n_heads, seq_length, seq_length) The attention weights ffn_output:
+            torch.tensor(bs, seq_length, dim) The output of the transformer block contextualization.
+        """
+        # Self-Attention
+        sa_output = self.attention(
+            query=x,
+            key=x,
+            value=x,
+            mask=attn_mask,
+            head_mask=head_mask,
+            output_attentions=output_attentions,
+        )
+        if output_attentions:
+            sa_output, sa_weights = sa_output  # (bs, seq_length, dim), (bs, n_heads, seq_length, seq_length)
+        else:  # To handle these `output_attentions` or `output_hidden_states` cases returning tuples
+            if type(sa_output) is not tuple:
+                raise TypeError(f"sa_output must be a tuple but it is {type(sa_output)} type")
+
+            sa_output = sa_output[0]
+        sa_output = self.sa_layer_norm(sa_output + x)  # (bs, seq_length, dim)
+
+        # Feed Forward Network
+        ffn_output = self.ffn(sa_output)  # (bs, seq_length, dim)
+        ffn_output: torch.Tensor = self.output_layer_norm(ffn_output + sa_output)  # (bs, seq_length, dim)
+
+        output = (ffn_output,)
+        if output_attentions:
+            output = (sa_weights,) + output
+        return output
+
+
+class Transformer(nn.Module):
+    def __init__(self, config: PretrainedConfig):
+        super().__init__()
+        self.n_layers = config.n_layers
+        self.layer = nn.ModuleList([TransformerBlock(config) for _ in range(config.n_layers)])
+        self.gradient_checkpointing = False
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        attn_mask: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: Optional[bool] = None,
+    ) -> Union[BaseModelOutput, tuple[torch.Tensor, ...]]:  # docstyle-ignore
+        """
+        Parameters:
+            x: torch.tensor(bs, seq_length, dim) Input sequence embedded.
+            attn_mask: torch.tensor(bs, seq_length) Attention mask on the sequence.
+
+        Returns:
+            hidden_state: torch.tensor(bs, seq_length, dim) Sequence of hidden states in the last (top)
+            layer all_hidden_states: tuple[torch.tensor(bs, seq_length, dim)]
+                Tuple of length n_layers with the hidden states from each layer.
+                Optional: only if output_hidden_states=True
+            all_attentions: tuple[torch.tensor(bs, n_heads, seq_length, seq_length)]
+                Tuple of length n_layers with the attention weights from each layer
+                Optional: only if output_attentions=True
+        """
+        all_hidden_states = () if output_hidden_states else None
+        all_attentions = () if output_attentions else None
+
+        hidden_state = x
+        for i, layer_module in enumerate(self.layer):
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_state,)
+
+            layer_outputs = layer_module(
+                hidden_state,
+                attn_mask,
+                head_mask[i],
+                output_attentions,
+            )
+
+            hidden_state = layer_outputs[-1]
+
+            if output_attentions:
+                if len(layer_outputs) != 2:
+                    raise ValueError(f"The length of the layer_outputs should be 2, but it is {len(layer_outputs)}")
+
+                attentions = layer_outputs[0]
+                all_attentions = all_attentions + (attentions,)
+            else:
+                if len(layer_outputs) != 1:
+                    raise ValueError(f"The length of the layer_outputs should be 1, but it is {len(layer_outputs)}")
+
+        # Add last layer
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_state,)
+
+        if not return_dict:
+            return tuple(v for v in [hidden_state, all_hidden_states, all_attentions] if v is not None)
+        return BaseModelOutput(
+            last_hidden_state=hidden_state, hidden_states=all_hidden_states, attentions=all_attentions
+        )
+
+
+# INTERFACE FOR ENCODER AND TASK SPECIFIC MODEL #
+@auto_docstring
+class DistilBertPreTrainedModel(PreTrainedModel):
+    config: DistilBertConfig
+    load_tf_weights = None
+    base_model_prefix = "distilbert"
+    supports_gradient_checkpointing = True
+    _supports_flash_attn = True
+    _supports_sdpa = True
+
+    def _init_weights(self, module: nn.Module):
+        """Initialize the weights."""
+        if isinstance(module, nn.Linear):
+            # Slightly different from the TF version which uses truncated_normal for initialization
+            # cf https://github.com/pytorch/pytorch/pull/5617
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+        elif isinstance(module, Embeddings) and self.config.sinusoidal_pos_embds:
+            create_sinusoidal_embeddings(
+                self.config.max_position_embeddings, self.config.dim, module.position_embeddings.weight
+            )
+
+
+@auto_docstring
+class DistilBertModel(DistilBertPreTrainedModel):
+    def __init__(self, config: PretrainedConfig):
+        super().__init__(config)
+
+        self.embeddings = Embeddings(config)  # Embeddings
+        self.transformer = Transformer(config)  # Encoder
+        self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
+        self._use_sdpa = config._attn_implementation == "sdpa"
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_position_embeddings(self) -> nn.Embedding:
+        """
+        Returns the position embeddings
+        """
+        return self.embeddings.position_embeddings
+
+    def resize_position_embeddings(self, new_num_position_embeddings: int):
+        """
+        Resizes position embeddings of the model if `new_num_position_embeddings != config.max_position_embeddings`.
+
+        Arguments:
+            new_num_position_embeddings (`int`):
+                The number of new position embedding matrix. If position embeddings are learned, increasing the size
+                will add newly initialized vectors at the end, whereas reducing the size will remove vectors from the
+                end. If position embeddings are not learned (*e.g.* sinusoidal position embeddings), increasing the
+                size will add correct vectors at the end following the position encoding algorithm, whereas reducing
+                the size will remove vectors from the end.
+        """
+        num_position_embeds_diff = new_num_position_embeddings - self.config.max_position_embeddings
+
+        # no resizing needs to be done if the length stays the same
+        if num_position_embeds_diff == 0:
+            return
+
+        logger.info(f"Setting `config.max_position_embeddings={new_num_position_embeddings}`...")
+        self.config.max_position_embeddings = new_num_position_embeddings
+
+        old_position_embeddings_weight = self.embeddings.position_embeddings.weight.clone()
+
+        self.embeddings.position_embeddings = nn.Embedding(self.config.max_position_embeddings, self.config.dim)
+
+        if self.config.sinusoidal_pos_embds:
+            create_sinusoidal_embeddings(
+                n_pos=self.config.max_position_embeddings, dim=self.config.dim, out=self.position_embeddings.weight
+            )
+        else:
+            with torch.no_grad():
+                if num_position_embeds_diff > 0:
+                    self.embeddings.position_embeddings.weight[:-num_position_embeds_diff] = nn.Parameter(
+                        old_position_embeddings_weight
+                    )
+                else:
+                    self.embeddings.position_embeddings.weight = nn.Parameter(
+                        old_position_embeddings_weight[:num_position_embeds_diff]
+                    )
+        # move position_embeddings to correct device
+        self.embeddings.position_embeddings.to(self.device)
+
+    def get_input_embeddings(self) -> nn.Embedding:
+        return self.embeddings.word_embeddings
+
+    def set_input_embeddings(self, new_embeddings: nn.Embedding):
+        self.embeddings.word_embeddings = new_embeddings
+
+    def _prune_heads(self, heads_to_prune: dict[int, list[list[int]]]):
+        """
+        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
+        class PreTrainedModel
+        """
+        for layer, heads in heads_to_prune.items():
+            self.transformer.layer[layer].attention.prune_heads(heads)
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[BaseModelOutput, tuple[torch.Tensor, ...]]:
+        r"""
+        input_ids (`torch.LongTensor` of shape `(batch_size, num_choices)`):
+            Indices of input sequence tokens in the vocabulary.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, hidden_size)`, *optional*):
+            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+            model's internal embedding lookup matrix.
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+        elif input_ids is not None:
+            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
+            input_shape = input_ids.size()
+        elif inputs_embeds is not None:
+            input_shape = inputs_embeds.size()[:-1]
+        else:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+        device = input_ids.device if input_ids is not None else inputs_embeds.device
+
+        head_mask_is_none = head_mask is None
+        # Prepare head mask if needed
+        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
+
+        embeddings = self.embeddings(input_ids, inputs_embeds)  # (bs, seq_length, dim)
+
+        if self._use_flash_attention_2:
+            attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
+        else:
+            if attention_mask is None:
+                attention_mask = torch.ones(input_shape, device=device)  # (bs, seq_length)
+
+            if self._use_sdpa and head_mask_is_none and not output_attentions:
+                attention_mask = _prepare_4d_attention_mask_for_sdpa(
+                    attention_mask, embeddings.dtype, tgt_len=input_shape[1]
+                )
+
+        return self.transformer(
+            x=embeddings,
+            attn_mask=attention_mask,
+            head_mask=head_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    DistilBert Model with a `masked language modeling` head on top.
+    """
+)
+class DistilBertForMaskedLM(DistilBertPreTrainedModel):
+    _tied_weights_keys = ["vocab_projector.weight"]  # NOTE(review): presumably tied to the input embeddings - confirm
+
+    def __init__(self, config: PretrainedConfig):
+        super().__init__(config)
+        # MLM head (applied in forward()): dense -> activation -> LayerNorm -> projection to vocabulary logits.
+        self.activation = get_activation(config.activation)
+
+        self.distilbert = DistilBertModel(config)
+        self.vocab_transform = nn.Linear(config.dim, config.dim)
+        self.vocab_layer_norm = nn.LayerNorm(config.dim, eps=1e-12)
+        self.vocab_projector = nn.Linear(config.dim, config.vocab_size)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+        self.mlm_loss_fct = nn.CrossEntropyLoss()  # default ignore_index=-100: labels of -100 do not contribute
+
+    def get_position_embeddings(self) -> nn.Embedding:
+        """
+        Returns the position embeddings of the wrapped `DistilBertModel` (`self.distilbert`).
+        """
+        return self.distilbert.get_position_embeddings()
+
+    def resize_position_embeddings(self, new_num_position_embeddings: int):
+        """
+        Resizes position embeddings of the model if `new_num_position_embeddings != config.max_position_embeddings`.
+
+        Arguments:
+            new_num_position_embeddings (`int`):
+                The new number of position embeddings. If position embeddings are learned, increasing the size
+                will add newly initialized vectors at the end, whereas reducing the size will remove vectors from the
+                end. If position embeddings are not learned (*e.g.* sinusoidal position embeddings), increasing the
+                size will add correct vectors at the end following the position encoding algorithm, whereas reducing
+                the size will remove vectors from the end.
+        """
+        self.distilbert.resize_position_embeddings(new_num_position_embeddings)
+
+    def get_output_embeddings(self) -> nn.Module:
+        return self.vocab_projector  # the final dim -> vocab_size projection layer
+
+    def set_output_embeddings(self, new_embeddings: nn.Module):
+        self.vocab_projector = new_embeddings  # replaces the projection layer wholesale
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[MaskedLMOutput, tuple[torch.Tensor, ...]]:
+        r"""
+        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+            Indices of input sequence tokens in the vocabulary.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+            model's internal embedding lookup matrix.
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
+            config.vocab_size - 1]` (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+            (masked); the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size - 1]`.
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        dlbrt_output = self.distilbert(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        hidden_states = dlbrt_output[0]  # (bs, seq_length, dim)
+        prediction_logits = self.vocab_transform(hidden_states)  # (bs, seq_length, dim)
+        prediction_logits = self.activation(prediction_logits)  # (bs, seq_length, dim)
+        prediction_logits = self.vocab_layer_norm(prediction_logits)  # (bs, seq_length, dim)
+        prediction_logits = self.vocab_projector(prediction_logits)  # (bs, seq_length, vocab_size)
+
+        mlm_loss = None
+        if labels is not None:
+            mlm_loss = self.mlm_loss_fct(prediction_logits.view(-1, prediction_logits.size(-1)), labels.view(-1))
+
+        if not return_dict:
+            output = (prediction_logits,) + dlbrt_output[1:]  # tuple form: logits, then the remaining encoder outputs
+            return ((mlm_loss,) + output) if mlm_loss is not None else output
+
+        return MaskedLMOutput(
+            loss=mlm_loss,
+            logits=prediction_logits,
+            hidden_states=dlbrt_output.hidden_states,
+            attentions=dlbrt_output.attentions,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    DistilBert Model transformer with a sequence classification/regression head on top (a linear layer on top of the
+    pooled output) e.g. for GLUE tasks.
+    """
+)
+class DistilBertForSequenceClassification(DistilBertPreTrainedModel):
+    def __init__(self, config: PretrainedConfig):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+        self.config = config
+        # Head (applied in forward()): first-token state -> linear -> ReLU -> dropout -> linear to `num_labels` logits.
+        self.distilbert = DistilBertModel(config)
+        self.pre_classifier = nn.Linear(config.dim, config.dim)
+        self.classifier = nn.Linear(config.dim, config.num_labels)
+        self.dropout = nn.Dropout(config.seq_classif_dropout)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_position_embeddings(self) -> nn.Embedding:
+        """
+        Returns the position embeddings of the wrapped `DistilBertModel` (`self.distilbert`).
+        """
+        return self.distilbert.get_position_embeddings()
+
+    def resize_position_embeddings(self, new_num_position_embeddings: int):
+        """
+        Resizes position embeddings of the model if `new_num_position_embeddings != config.max_position_embeddings`.
+
+        Arguments:
+            new_num_position_embeddings (`int`):
+                The new number of position embeddings. If position embeddings are learned, increasing the size
+                will add newly initialized vectors at the end, whereas reducing the size will remove vectors from the
+                end. If position embeddings are not learned (*e.g.* sinusoidal position embeddings), increasing the
+                size will add correct vectors at the end following the position encoding algorithm, whereas reducing
+                the size will remove vectors from the end.
+        """
+        self.distilbert.resize_position_embeddings(new_num_position_embeddings)
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[SequenceClassifierOutput, tuple[torch.Tensor, ...]]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss); if
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy; BCE if labels are not long/int).
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        distilbert_output = self.distilbert(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        hidden_state = distilbert_output[0]  # (bs, seq_len, dim)
+        pooled_output = hidden_state[:, 0]  # (bs, dim): hidden state of the first token
+        pooled_output = self.pre_classifier(pooled_output)  # (bs, dim)
+        pooled_output = nn.ReLU()(pooled_output)  # (bs, dim)
+        pooled_output = self.dropout(pooled_output)  # (bs, dim)
+        logits = self.classifier(pooled_output)  # (bs, num_labels)
+
+        loss = None
+        if labels is not None:
+            if self.config.problem_type is None:  # not preset: infer it from num_labels/label dtype, cache on config
+                if self.num_labels == 1:
+                    self.config.problem_type = "regression"
+                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
+                    self.config.problem_type = "single_label_classification"
+                else:
+                    self.config.problem_type = "multi_label_classification"
+
+            if self.config.problem_type == "regression":
+                loss_fct = MSELoss()
+                if self.num_labels == 1:
+                    loss = loss_fct(logits.squeeze(), labels.squeeze())
+                else:
+                    loss = loss_fct(logits, labels)
+            elif self.config.problem_type == "single_label_classification":
+                loss_fct = CrossEntropyLoss()
+                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+            elif self.config.problem_type == "multi_label_classification":
+                loss_fct = BCEWithLogitsLoss()
+                loss = loss_fct(logits, labels)
+
+        if not return_dict:
+            output = (logits,) + distilbert_output[1:]  # tuple form: logits, then the remaining encoder outputs
+            return ((loss,) + output) if loss is not None else output
+
+        return SequenceClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=distilbert_output.hidden_states,
+            attentions=distilbert_output.attentions,
+        )
+
+
+@auto_docstring
+class DistilBertForQuestionAnswering(DistilBertPreTrainedModel):
+    def __init__(self, config: PretrainedConfig):
+        super().__init__(config)
+        # Span-extraction head (applied in forward()): dropout, then one linear layer giving start/end logits per token.
+        self.distilbert = DistilBertModel(config)
+        self.qa_outputs = nn.Linear(config.dim, config.num_labels)
+        if config.num_labels != 2:
+            raise ValueError(f"config.num_labels should be 2, but it is {config.num_labels}")
+
+        self.dropout = nn.Dropout(config.qa_dropout)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_position_embeddings(self) -> nn.Embedding:
+        """
+        Returns the position embeddings of the wrapped `DistilBertModel` (`self.distilbert`).
+        """
+        return self.distilbert.get_position_embeddings()
+
+    def resize_position_embeddings(self, new_num_position_embeddings: int):
+        """
+        Resizes position embeddings of the model if `new_num_position_embeddings != config.max_position_embeddings`.
+
+        Arguments:
+            new_num_position_embeddings (`int`):
+                The new number of position embeddings. If position embeddings are learned, increasing the size
+                will add newly initialized vectors at the end, whereas reducing the size will remove vectors from the
+                end. If position embeddings are not learned (*e.g.* sinusoidal position embeddings), increasing the
+                size will add correct vectors at the end following the position encoding algorithm, whereas reducing
+                the size will remove vectors from the end.
+        """
+        self.distilbert.resize_position_embeddings(new_num_position_embeddings)
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        start_positions: Optional[torch.Tensor] = None,
+        end_positions: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[QuestionAnsweringModelOutput, tuple[torch.Tensor, ...]]:
+        r"""
+        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+            Indices of input sequence tokens in the vocabulary.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+            model's internal embedding lookup matrix.
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        distilbert_output = self.distilbert(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        hidden_states = distilbert_output[0]  # (bs, max_query_len, dim)
+
+        hidden_states = self.dropout(hidden_states)  # (bs, max_query_len, dim)
+        logits = self.qa_outputs(hidden_states)  # (bs, max_query_len, 2)
+        start_logits, end_logits = logits.split(1, dim=-1)
+        start_logits = start_logits.squeeze(-1).contiguous()  # (bs, max_query_len)
+        end_logits = end_logits.squeeze(-1).contiguous()  # (bs, max_query_len)
+
+        total_loss = None
+        if start_positions is not None and end_positions is not None:
+            # On multi-GPU, splitting the batch can add a trailing dimension; squeeze it away
+            if len(start_positions.size()) > 1:
+                start_positions = start_positions.squeeze(-1)
+            if len(end_positions.size()) > 1:
+                end_positions = end_positions.squeeze(-1)
+            # Positions past the sequence end are clamped to seq_len and skipped by the loss (negatives clamp to 0)
+            ignored_index = start_logits.size(1)
+            start_positions = start_positions.clamp(0, ignored_index)
+            end_positions = end_positions.clamp(0, ignored_index)
+
+            loss_fct = nn.CrossEntropyLoss(ignore_index=ignored_index)
+            start_loss = loss_fct(start_logits, start_positions)
+            end_loss = loss_fct(end_logits, end_positions)
+            total_loss = (start_loss + end_loss) / 2  # average of the start and end losses
+
+        if not return_dict:
+            output = (start_logits, end_logits) + distilbert_output[1:]
+            return ((total_loss,) + output) if total_loss is not None else output
+
+        return QuestionAnsweringModelOutput(
+            loss=total_loss,
+            start_logits=start_logits,
+            end_logits=end_logits,
+            hidden_states=distilbert_output.hidden_states,
+            attentions=distilbert_output.attentions,
+        )
+
+
+@auto_docstring
+class DistilBertForTokenClassification(DistilBertPreTrainedModel):
+    def __init__(self, config: PretrainedConfig):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+        # Per-token head (applied in forward()): dropout followed by a linear layer over every hidden state.
+        self.distilbert = DistilBertModel(config)
+        self.dropout = nn.Dropout(config.dropout)
+        self.classifier = nn.Linear(config.hidden_size, config.num_labels)  # NOTE(review): siblings use config.dim
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_position_embeddings(self) -> nn.Embedding:
+        """
+        Returns the position embeddings of the wrapped `DistilBertModel` (`self.distilbert`).
+        """
+        return self.distilbert.get_position_embeddings()
+
+    def resize_position_embeddings(self, new_num_position_embeddings: int):
+        """
+        Resizes position embeddings of the model if `new_num_position_embeddings != config.max_position_embeddings`.
+
+        Arguments:
+            new_num_position_embeddings (`int`):
+                The new number of position embeddings. If position embeddings are learned, increasing the size
+                will add newly initialized vectors at the end, whereas reducing the size will remove vectors from the
+                end. If position embeddings are not learned (*e.g.* sinusoidal position embeddings), increasing the
+                size will add correct vectors at the end following the position encoding algorithm, whereas reducing
+                the size will remove vectors from the end.
+        """
+        self.distilbert.resize_position_embeddings(new_num_position_embeddings)
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[TokenClassifierOutput, tuple[torch.Tensor, ...]]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.distilbert(
+            input_ids,
+            attention_mask=attention_mask,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        sequence_output = outputs[0]  # last hidden states from the encoder, one vector per token
+
+        sequence_output = self.dropout(sequence_output)
+        logits = self.classifier(sequence_output)  # last dimension has size num_labels
+
+        loss = None
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()
+            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))  # flatten batch and sequence axes
+
+        if not return_dict:
+            output = (logits,) + outputs[1:]  # tuple form: logits, then the remaining encoder outputs
+            return ((loss,) + output) if loss is not None else output
+
+        return TokenClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+@auto_docstring
+class DistilBertForMultipleChoice(DistilBertPreTrainedModel):
+    def __init__(self, config: PretrainedConfig):
+        super().__init__(config)
+        # Choice-scoring head (in forward()): first-token state -> linear -> ReLU -> dropout -> linear to one score.
+        self.distilbert = DistilBertModel(config)
+        self.pre_classifier = nn.Linear(config.dim, config.dim)
+        self.classifier = nn.Linear(config.dim, 1)
+        self.dropout = nn.Dropout(config.seq_classif_dropout)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_position_embeddings(self) -> nn.Embedding:
+        """
+        Returns the position embeddings of the wrapped `DistilBertModel` (`self.distilbert`).
+        """
+        return self.distilbert.get_position_embeddings()
+
+    def resize_position_embeddings(self, new_num_position_embeddings: int):
+        """
+        Resizes position embeddings of the model if `new_num_position_embeddings != config.max_position_embeddings`.
+
+        Arguments:
+            new_num_position_embeddings (`int`):
+                The number of new position embeddings. If position embeddings are learned, increasing the size will add
+                newly initialized vectors at the end, whereas reducing the size will remove vectors from the end. If
+                position embeddings are not learned (*e.g.* sinusoidal position embeddings), increasing the size will
+                add correct vectors at the end following the position encoding algorithm, whereas reducing the size
+                will remove vectors from the end.
+        """
+        self.distilbert.resize_position_embeddings(new_num_position_embeddings)
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[MultipleChoiceModelOutput, tuple[torch.Tensor, ...]]:
+        r"""
+        input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
+            Indices of input sequence tokens in the vocabulary.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
+            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+            model's internal embedding lookup matrix.
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
+            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
+            `input_ids` above)
+
+        Examples:
+
+        ```python
+        >>> from transformers import AutoTokenizer, DistilBertForMultipleChoice
+        >>> import torch
+
+        >>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased")
+        >>> model = DistilBertForMultipleChoice.from_pretrained("distilbert-base-cased")
+
+        >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
+        >>> choice0 = "It is eaten with a fork and a knife."
+        >>> choice1 = "It is eaten while held in the hand."
+        >>> labels = torch.tensor(0).unsqueeze(0)  # choice0 is correct (according to Wikipedia ;)), batch size 1
+
+        >>> encoding = tokenizer([[prompt, choice0], [prompt, choice1]], return_tensors="pt", padding=True)
+        >>> outputs = model(**{k: v.unsqueeze(0) for k, v in encoding.items()}, labels=labels)  # batch size is 1
+
+        >>> # the linear classifier still needs to be trained
+        >>> loss = outputs.loss
+        >>> logits = outputs.logits
+        ```"""
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
+        # Fold the choice axis into the batch axis so the encoder receives (bs * num_choices, seq_len[, dim]) inputs.
+        input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
+        attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
+        inputs_embeds = (
+            inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
+            if inputs_embeds is not None
+            else None
+        )
+
+        outputs = self.distilbert(
+            input_ids,
+            attention_mask=attention_mask,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        hidden_state = outputs[0]  # (bs * num_choices, seq_len, dim)
+        pooled_output = hidden_state[:, 0]  # (bs * num_choices, dim)
+        pooled_output = self.pre_classifier(pooled_output)  # (bs * num_choices, dim)
+        pooled_output = nn.ReLU()(pooled_output)  # (bs * num_choices, dim)
+        pooled_output = self.dropout(pooled_output)  # (bs * num_choices, dim)
+        logits = self.classifier(pooled_output)  # (bs * num_choices, 1)
+
+        reshaped_logits = logits.view(-1, num_choices)  # (bs, num_choices)
+
+        loss = None
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()
+            loss = loss_fct(reshaped_logits, labels)
+
+        if not return_dict:
+            output = (reshaped_logits,) + outputs[1:]  # tuple form: logits, then the remaining encoder outputs
+            return ((loss,) + output) if loss is not None else output
+
+        return MultipleChoiceModelOutput(
+            loss=loss,
+            logits=reshaped_logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+__all__ = [  # names exported by a star-import of this module
+    "DistilBertForMaskedLM",
+    "DistilBertForMultipleChoice",
+    "DistilBertForQuestionAnswering",
+    "DistilBertForSequenceClassification",
+    "DistilBertForTokenClassification",
+    "DistilBertModel",
+    "DistilBertPreTrainedModel",
+]
diff --git a/phivenv/Lib/site-packages/transformers/models/distilbert/modeling_flax_distilbert.py b/phivenv/Lib/site-packages/transformers/models/distilbert/modeling_flax_distilbert.py
new file mode 100644
index 0000000000000000000000000000000000000000..fba3dfd9d332d5f34dfea23a20cec7a7871de9e4
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/distilbert/modeling_flax_distilbert.py
@@ -0,0 +1,906 @@
+# coding=utf-8
+# Copyright 2019-present, the HuggingFace Inc. team, The Google AI Language Team and Facebook, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+from typing import Callable, Optional
+
+import flax.linen as nn
+import jax
+import jax.numpy as jnp
+import numpy as np
+from flax.core.frozen_dict import FrozenDict, freeze, unfreeze
+from flax.traverse_util import flatten_dict, unflatten_dict
+from jax import lax
+
+from ...modeling_flax_outputs import (
+    FlaxBaseModelOutput,
+    FlaxMaskedLMOutput,
+    FlaxMultipleChoiceModelOutput,
+    FlaxQuestionAnsweringModelOutput,
+    FlaxSequenceClassifierOutput,
+    FlaxTokenClassifierOutput,
+)
+from ...modeling_flax_utils import ACT2FN, FlaxPreTrainedModel, append_call_sample_docstring, overwrite_call_docstring
+from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging
+from .configuration_distilbert import DistilBertConfig
+
+
+# Module-level logger.
+logger = logging.get_logger(__name__)
+
+# Checkpoint and config names passed to `append_call_sample_docstring` for every public model class in this file.
+_CHECKPOINT_FOR_DOC = "distilbert-base-uncased"
+_CONFIG_FOR_DOC = "DistilBertConfig"
+
+
+# Shared class-level documentation text; passed as the trailing argument to `add_start_docstrings` on each public
+# model class in this file.
+FLAX_DISTILBERT_START_DOCSTRING = r"""
+
+    This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its model (such as downloading, saving and converting weights from PyTorch models)
+
+    This model is also a
+    [flax.linen.Module](https://flax.readthedocs.io/en/latest/api_reference/flax.linen/module.html) subclass. Use it as
+    a regular Flax linen Module and refer to the Flax documentation for all matter related to general usage and
+    behavior.
+
+    Finally, this model supports inherent JAX features such as:
+
+    - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
+    - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
+    - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
+    - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)
+
+    Parameters:
+        config ([`DistilBertConfig`]): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+# Argument documentation template. The `{0}` placeholder is filled with the expected input shape through
+# `.format(...)` before the text is attached to a model's `__call__`.
+DISTILBERT_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`numpy.ndarray` of shape `({0})`):
+            Indices of input sequence tokens in the vocabulary.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        attention_mask (`numpy.ndarray` of shape `({0})`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+def get_angles(pos, i, d_model):
+    angle_rates = 1 / np.power(10000, (2 * (i // 2)) / np.float32(d_model))
+    return pos * angle_rates
+
+
+def positional_encoding(position, d_model):
+    # create the sinusoidal pattern for the positional encoding
+    angle_rads = get_angles(np.arange(position)[:, np.newaxis], np.arange(d_model)[np.newaxis, :], d_model)
+
+    # apply sin to even indices in the array; 2i
+    angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])
+
+    # apply cos to odd indices in the array; 2i+1
+    angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])
+
+    pos_encoding = angle_rads[np.newaxis, ...]
+
+    return jnp.array(pos_encoding)
+
+
+class FlaxEmbeddings(nn.Module):
+    """Construct the embeddings from word, position and token_type embeddings."""
+
+    config: DistilBertConfig
+    dtype: jnp.dtype = jnp.float32  # the dtype of the computation
+
+    def setup(self):
+        self.word_embeddings = nn.Embed(
+            self.config.vocab_size,
+            self.config.dim,
+            embedding_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
+        )
+        if not self.config.sinusoidal_pos_embds:
+            self.position_embeddings = nn.Embed(
+                self.config.max_position_embeddings,
+                self.config.dim,
+                embedding_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
+            )
+        else:
+            self.pos_encoding = positional_encoding(self.config.max_position_embeddings, self.config.dim)
+        self.LayerNorm = nn.LayerNorm(epsilon=1e-12, dtype=self.dtype)
+        self.dropout = nn.Dropout(rate=self.config.dropout)
+
+    def __call__(self, input_ids, deterministic: bool = True):
+        # Embed
+        batch_size, seq_length = input_ids.shape
+        inputs_embeds = self.word_embeddings(input_ids.astype("i4"))
+        if not self.config.sinusoidal_pos_embds:
+            position_ids = jnp.arange(seq_length).astype("i4")
+            position_ids = jnp.broadcast_to(position_ids, shape=(batch_size, seq_length))
+            position_embeds = self.position_embeddings(position_ids.astype("i4"))
+        else:
+            position_embeds = self.pos_encoding[:, :seq_length, :]
+            # explicitly cast the positions here, since self.embed_positions are not registered as parameters
+            position_embeds = position_embeds.astype(inputs_embeds.dtype)
+
+        # Sum all embeddings
+        hidden_states = inputs_embeds + position_embeds
+
+        # Layer Norm
+        hidden_states = self.LayerNorm(hidden_states)
+        hidden_states = self.dropout(hidden_states, deterministic=deterministic)
+        return hidden_states
+
+
+class FlaxMultiHeadSelfAttention(nn.Module):
+    config: DistilBertConfig
+    dtype: jnp.dtype = jnp.float32  # the dtype of the computation
+
+    def setup(self):
+        self.n_heads = self.config.n_heads
+        self.dim = self.config.dim
+        self.dropout = nn.Dropout(rate=self.config.attention_dropout)
+
+        if not (self.dim % self.n_heads == 0):
+            raise ValueError(f"Hidden size {self.dim} not dividable by number of heads {self.n_heads}")
+
+        self.q_lin = nn.Dense(
+            self.dim,
+            dtype=self.dtype,
+            kernel_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
+        )
+        self.k_lin = nn.Dense(
+            self.dim,
+            dtype=self.dtype,
+            kernel_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
+        )
+        self.v_lin = nn.Dense(
+            self.dim,
+            dtype=self.dtype,
+            kernel_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
+        )
+        self.out_lin = nn.Dense(
+            self.dim,
+            dtype=self.dtype,
+            kernel_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
+        )
+
+    def __call__(
+        self,
+        query,
+        key,
+        value,
+        mask,
+        deterministic: bool = True,
+        output_attentions: bool = False,
+    ):
+        bs, q_len, dim = query.shape
+        k_len = key.shape[1]
+        # assert dim == self.dim, f'Dimensions do not match: {dim} input vs {self.dim} configured'
+        # assert key.size() == value.size()
+
+        dim_per_head = self.dim // self.n_heads
+
+        mask_reshp = (bs, 1, 1, k_len)
+
+        def shape(x):
+            """separate heads"""
+            return x.reshape(bs, -1, self.n_heads, dim_per_head).transpose(0, 2, 1, 3)
+
+        def unshape(x):
+            """group heads"""
+            return x.transpose(0, 2, 1, 3).reshape(bs, -1, self.n_heads * dim_per_head)
+
+        q = shape(self.q_lin(query))  # (bs, n_heads, q_len, dim_per_head)
+        k = shape(self.k_lin(key))  # (bs, n_heads, k_len, dim_per_head)
+        v = shape(self.v_lin(value))  # (bs, n_heads, k_len, dim_per_head)
+
+        q = q / math.sqrt(dim_per_head)  # (bs, n_heads, q_len, dim_per_head)
+        scores = jnp.matmul(q, k.transpose(0, 1, 3, 2))  # (bs, n_heads, q_len, k_len)
+        mask = jnp.reshape(mask, mask_reshp)
+
+        mask = mask.astype(scores.dtype)
+        scores = scores - 1e30 * (1.0 - mask)
+
+        weights = nn.softmax(scores, axis=-1)  # (bs, n_heads, q_len, k_len)
+        weights = self.dropout(weights, deterministic=deterministic)
+
+        context = jnp.matmul(weights, v)  # (bs, n_heads, q_len, dim_per_head)
+        context = unshape(context)  # (bs, q_len, dim)
+        context = self.out_lin(context)  # (bs, q_len, dim)
+
+        if output_attentions:
+            return (context, weights)
+        else:
+            return (context,)
+
+
+class FlaxFFN(nn.Module):
+    config: DistilBertConfig
+    dtype: jnp.dtype = jnp.float32  # the dtype of the computation
+
+    def setup(self):
+        self.dropout = nn.Dropout(rate=self.config.dropout)
+        self.chunk_size_feed_forward = self.config.chunk_size_feed_forward
+        self.seq_len_dim = 1
+        self.lin1 = nn.Dense(
+            self.config.hidden_dim,
+            dtype=self.dtype,
+            kernel_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
+        )
+        self.lin2 = nn.Dense(
+            self.config.dim,
+            dtype=self.dtype,
+            kernel_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
+        )
+
+        self.activation = ACT2FN[self.config.activation]
+
+    def __call__(self, hidden_states, deterministic: bool = True):
+        hidden_states = self.lin1(hidden_states)
+        hidden_states = self.activation(hidden_states)
+        hidden_states = self.lin2(hidden_states)
+        hidden_states = self.dropout(hidden_states, deterministic=deterministic)
+        return hidden_states
+
+
+class FlaxTransformerBlock(nn.Module):
+    config: DistilBertConfig
+    dtype: jnp.dtype = jnp.float32  # the dtype of the computation
+
+    def setup(self):
+        assert self.config.dim % self.config.n_heads == 0, (
+            f"Hidden size {self.config.dim} not dividable by number of heads {self.config.n_heads}"
+        )
+
+        self.attention = FlaxMultiHeadSelfAttention(self.config, dtype=self.dtype)
+        self.sa_layer_norm = nn.LayerNorm(epsilon=1e-12, dtype=self.dtype)
+
+        self.ffn = FlaxFFN(self.config, dtype=self.dtype)
+        self.output_layer_norm = nn.LayerNorm(epsilon=1e-12, dtype=self.dtype)
+
+    def __call__(
+        self,
+        hidden_states,
+        attn_mask,
+        output_attentions: bool = False,
+        deterministic: bool = True,
+    ):
+        # Self-Attention
+        sa_output = self.attention(
+            query=hidden_states,
+            key=hidden_states,
+            value=hidden_states,
+            mask=attn_mask,
+            output_attentions=output_attentions,
+            deterministic=deterministic,
+        )
+        if output_attentions:
+            sa_output, sa_weights = sa_output
+        else:
+            assert type(sa_output) is tuple
+            sa_output = sa_output[0]
+        sa_output = self.sa_layer_norm(sa_output + hidden_states)
+
+        # Feed Forward Network
+        ffn_output = self.ffn(sa_output, deterministic=deterministic)
+        ffn_output = self.output_layer_norm(ffn_output + sa_output)
+        output = (ffn_output,)
+        if output_attentions:
+            output = (sa_weights,) + output
+        return output
+
+
+class FlaxTransformer(nn.Module):
+    config: DistilBertConfig
+    dtype: jnp.dtype = jnp.float32  # the dtype of the computation
+
+    def setup(self):
+        self.layers = [
+            FlaxTransformerBlock(self.config, name=str(i), dtype=self.dtype) for i in range(self.config.n_layers)
+        ]
+
+    def __call__(
+        self,
+        hidden_states,
+        attention_mask,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        deterministic: bool = True,
+        return_dict: bool = False,
+    ):
+        all_hidden_states = () if output_hidden_states else None
+        all_attentions = () if output_attentions else None
+
+        for layer_module in self.layers:
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)
+
+            layer_outputs = layer_module(
+                hidden_states=hidden_states,
+                attn_mask=attention_mask,
+                output_attentions=output_attentions,
+                deterministic=deterministic,
+            )
+            hidden_states = layer_outputs[-1]
+
+            if output_attentions:
+                assert len(layer_outputs) == 2
+                attentions = layer_outputs[0]
+                all_attentions = all_attentions + (attentions,)
+            else:
+                assert len(layer_outputs) == 1
+
+        # Add last layer
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+
+        if not return_dict:
+            return tuple(v for v in [hidden_states, all_attentions, all_hidden_states] if v is not None)
+        return FlaxBaseModelOutput(
+            last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions
+        )
+
+
+class FlaxTransformerEncoder(nn.Module):
+    config: DistilBertConfig
+    dtype: jnp.dtype = jnp.float32  # the dtype of the computation
+
+    def setup(self):
+        self.layer = FlaxTransformer(self.config, dtype=self.dtype)
+
+    def __call__(
+        self,
+        hidden_states,
+        attention_mask,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        deterministic: bool = True,
+        return_dict: bool = False,
+    ):
+        return self.layer(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            deterministic=deterministic,
+            return_dict=return_dict,
+        )
+
+
+class FlaxDistilBertLMDecoder(nn.Module):
+    config: DistilBertConfig
+    dtype: jnp.dtype = jnp.float32  # the dtype of the computation
+    bias_init: Callable[..., np.ndarray] = jax.nn.initializers.zeros
+
+    def setup(self):
+        self.bias = self.param("bias", self.bias_init, (self.config.vocab_size,))
+
+    def __call__(self, inputs, kernel):
+        inputs = jnp.asarray(inputs, self.dtype)
+        kernel = jnp.asarray(kernel, self.dtype)
+        y = lax.dot_general(inputs, kernel, (((inputs.ndim - 1,), (0,)), ((), ())))
+        bias = jnp.asarray(self.bias, self.dtype)
+        y = y + bias
+        return y
+
+
+class FlaxDistilBertPreTrainedModel(FlaxPreTrainedModel):
+    """
+    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+    models.
+    """
+
+    config_class = DistilBertConfig
+    base_model_prefix = "distilbert"
+    module_class: nn.Module = None
+
+    def __init__(
+        self,
+        config: DistilBertConfig,
+        input_shape: tuple = (1, 1),
+        seed: int = 0,
+        dtype: jnp.dtype = jnp.float32,
+        _do_init: bool = True,
+        **kwargs,
+    ):
+        module = self.module_class(config=config, dtype=dtype, **kwargs)
+        super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init)
+
+    def init_weights(self, rng: jax.random.PRNGKey, input_shape: tuple, params: FrozenDict = None) -> FrozenDict:
+        # init input tensors
+        input_ids = jnp.zeros(input_shape, dtype="i4")
+        attention_mask = jnp.ones_like(input_ids)
+
+        params_rng, dropout_rng = jax.random.split(rng)
+        rngs = {"params": params_rng, "dropout": dropout_rng}
+
+        random_params = self.module.init(rngs, input_ids, attention_mask, return_dict=False)["params"]
+
+        if params is not None:
+            random_params = flatten_dict(unfreeze(random_params))
+            params = flatten_dict(unfreeze(params))
+            for missing_key in self._missing_keys:
+                params[missing_key] = random_params[missing_key]
+            self._missing_keys = set()
+            return freeze(unflatten_dict(params))
+        else:
+            return random_params
+
+    @add_start_docstrings_to_model_forward(DISTILBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    def __call__(
+        self,
+        input_ids,
+        attention_mask=None,
+        head_mask=None,
+        params: Optional[dict] = None,
+        dropout_rng: jax.random.PRNGKey = None,
+        train: bool = False,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ):
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.return_dict
+
+        if attention_mask is None:
+            attention_mask = jnp.ones_like(input_ids)
+
+        # Handle any PRNG if needed
+        rngs = {}
+        if dropout_rng is not None:
+            rngs["dropout"] = dropout_rng
+
+        return self.module.apply(
+            {"params": params or self.params},
+            jnp.array(input_ids, dtype="i4"),
+            jnp.array(attention_mask, dtype="i4"),
+            not train,
+            output_attentions,
+            output_hidden_states,
+            return_dict,
+            rngs=rngs,
+        )
+
+
+class FlaxDistilBertModule(nn.Module):
+    config: DistilBertConfig
+    dtype: jnp.dtype = jnp.float32  # the dtype of the computation
+
+    def setup(self):
+        self.embeddings = FlaxEmbeddings(self.config, dtype=self.dtype)
+        self.transformer = FlaxTransformerEncoder(self.config, dtype=self.dtype)
+
+    def __call__(
+        self,
+        input_ids,
+        attention_mask,
+        deterministic: bool = True,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+    ):
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.return_dict
+
+        input_embeds = self.embeddings(input_ids, deterministic=deterministic)
+        return self.transformer(
+            hidden_states=input_embeds,
+            attention_mask=attention_mask,
+            deterministic=deterministic,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+
+@add_start_docstrings(
+    "The bare DistilBert Model transformer outputting raw hidden-states without any specific head on top.",
+    FLAX_DISTILBERT_START_DOCSTRING,
+)
+class FlaxDistilBertModel(FlaxDistilBertPreTrainedModel):
+    # Only the wrapped module is specified; `__init__`, `init_weights` and `__call__` are inherited.
+    module_class = FlaxDistilBertModule
+
+
+# The output-type argument is `None` here, unlike the task-specific models further down in this file.
+append_call_sample_docstring(FlaxDistilBertModel, _CHECKPOINT_FOR_DOC, None, _CONFIG_FOR_DOC)
+
+
+class FlaxDistilBertForMaskedLMModule(nn.Module):
+    config: DistilBertConfig
+    dtype: jnp.dtype = jnp.float32  # the dtype of the computation
+
+    def setup(self):
+        self.distilbert = FlaxDistilBertModule(self.config, dtype=self.dtype)
+        self.vocab_transform = nn.Dense(
+            self.config.dim,
+            dtype=self.dtype,
+            kernel_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
+        )
+        self.vocab_layer_norm = nn.LayerNorm(epsilon=1e-12, dtype=self.dtype)
+        if self.config.tie_word_embeddings:
+            self.vocab_projector = FlaxDistilBertLMDecoder(
+                self.config,
+                dtype=self.dtype,
+            )
+        else:
+            self.vocab_projector = nn.Dense(
+                self.config.vocab_size,
+                dtype=self.dtype,
+                kernel_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
+            )
+
+    def __call__(
+        self,
+        input_ids,
+        attention_mask,
+        deterministic: bool = True,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+    ):
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        dlbrt_output = self.distilbert(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            deterministic=deterministic,
+            return_dict=return_dict,
+        )
+        hidden_states = dlbrt_output[0]
+        prediction_logits = self.vocab_transform(hidden_states)
+        prediction_logits = ACT2FN[self.config.activation](prediction_logits)
+        prediction_logits = self.vocab_layer_norm(prediction_logits)
+
+        if self.config.tie_word_embeddings:
+            shared_embedding = self.distilbert.variables["params"]["embeddings"]["word_embeddings"]["embedding"]
+            prediction_logits = self.vocab_projector(prediction_logits, shared_embedding.T)
+        else:
+            prediction_logits = self.vocab_projector(prediction_logits)
+
+        if not return_dict:
+            output = (prediction_logits,) + dlbrt_output[1:]
+            return output
+
+        return FlaxMaskedLMOutput(
+            logits=prediction_logits,
+            hidden_states=dlbrt_output.hidden_states,
+            attentions=dlbrt_output.attentions,
+        )
+
+
+@add_start_docstrings("""DistilBert Model with a `language modeling` head on top.""", FLAX_DISTILBERT_START_DOCSTRING)
+class FlaxDistilBertForMaskedLM(FlaxDistilBertPreTrainedModel):
+    # Only the wrapped module is specified; `__init__`, `init_weights` and `__call__` are inherited.
+    module_class = FlaxDistilBertForMaskedLMModule
+
+
+# Registers FlaxMaskedLMOutput as the output type used by the call-docstring helper.
+append_call_sample_docstring(FlaxDistilBertForMaskedLM, _CHECKPOINT_FOR_DOC, FlaxMaskedLMOutput, _CONFIG_FOR_DOC)
+
+
+class FlaxDistilBertForSequenceClassificationModule(nn.Module):
+    config: DistilBertConfig
+    dtype: jnp.dtype = jnp.float32
+
+    def setup(self):
+        self.distilbert = FlaxDistilBertModule(config=self.config, dtype=self.dtype)
+        self.pre_classifier = nn.Dense(
+            self.config.dim,
+            dtype=self.dtype,
+            kernel_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
+        )
+        self.dropout = nn.Dropout(rate=self.config.seq_classif_dropout)
+        self.classifier = nn.Dense(
+            self.config.num_labels,
+            dtype=self.dtype,
+        )
+
+    def __call__(
+        self,
+        input_ids,
+        attention_mask,
+        deterministic: bool = True,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+    ):
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        # Model
+        distilbert_output = self.distilbert(
+            input_ids,
+            attention_mask,
+            deterministic=deterministic,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        hidden_state = distilbert_output[0]  # (bs, seq_len, dim)
+        pooled_output = hidden_state[:, 0]  # (bs, dim)
+        pooled_output = self.pre_classifier(pooled_output)  # (bs, dim)
+        pooled_output = ACT2FN["relu"](pooled_output)
+        pooled_output = self.dropout(pooled_output, deterministic=deterministic)
+        logits = self.classifier(pooled_output)  # (bs, dim)
+
+        if not return_dict:
+            return (logits,) + distilbert_output[1:]
+
+        return FlaxSequenceClassifierOutput(
+            logits=logits,
+            hidden_states=distilbert_output.hidden_states,
+            attentions=distilbert_output.attentions,
+        )
+
+
+@add_start_docstrings(
+    """
+    DistilBert Model transformer with a sequence classification/regression head on top (a linear layer on top of the
+    pooled output) e.g. for GLUE tasks.
+    """,
+    FLAX_DISTILBERT_START_DOCSTRING,
+)
+class FlaxDistilBertForSequenceClassification(FlaxDistilBertPreTrainedModel):
+    # Only the wrapped module is specified; `__init__`, `init_weights` and `__call__` are inherited.
+    module_class = FlaxDistilBertForSequenceClassificationModule
+
+
+# Registers FlaxSequenceClassifierOutput as the output type used by the call-docstring helper.
+append_call_sample_docstring(
+    FlaxDistilBertForSequenceClassification,
+    _CHECKPOINT_FOR_DOC,
+    FlaxSequenceClassifierOutput,
+    _CONFIG_FOR_DOC,
+)
+
+
+class FlaxDistilBertForMultipleChoiceModule(nn.Module):
+    config: DistilBertConfig
+    dtype: jnp.dtype = jnp.float32
+
+    def setup(self):
+        self.distilbert = FlaxDistilBertModule(config=self.config, dtype=self.dtype)
+        self.pre_classifier = nn.Dense(
+            self.config.dim,
+            dtype=self.dtype,
+            kernel_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
+        )
+        self.dropout = nn.Dropout(rate=self.config.seq_classif_dropout)
+        self.classifier = nn.Dense(
+            1,
+            dtype=self.dtype,
+        )
+
+    def __call__(
+        self,
+        input_ids,
+        attention_mask,
+        deterministic: bool = True,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+    ):
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        num_choices = input_ids.shape[1]
+        input_ids = input_ids.reshape(-1, input_ids.shape[-1]) if input_ids is not None else None
+        attention_mask = attention_mask.reshape(-1, attention_mask.shape[-1]) if attention_mask is not None else None
+
+        # Model
+        outputs = self.distilbert(
+            input_ids,
+            attention_mask,
+            deterministic=deterministic,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        hidden_state = outputs[0]
+        pooled_output = hidden_state[:, 0]
+        pooled_output = self.pre_classifier(pooled_output)
+        pooled_output = ACT2FN["relu"](pooled_output)
+        pooled_output = self.dropout(pooled_output, deterministic=deterministic)
+        logits = self.classifier(pooled_output)
+
+        reshaped_logits = logits.reshape(-1, num_choices)
+
+        if not return_dict:
+            return (reshaped_logits,) + outputs[2:]
+
+        return FlaxMultipleChoiceModelOutput(
+            logits=reshaped_logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+@add_start_docstrings(
+    """
+    DistilBert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and
+    a softmax) e.g. for RocStories/SWAG tasks.
+    """,
+    FLAX_DISTILBERT_START_DOCSTRING,
+)
+class FlaxDistilBertForMultipleChoice(FlaxDistilBertPreTrainedModel):
+    # Only the wrapped module is specified; `__init__`, `init_weights` and `__call__` are inherited.
+    module_class = FlaxDistilBertForMultipleChoiceModule
+
+
+# The inherited `__call__` is documented for 2-D inputs; this model takes a 3-D shape with a `num_choices` axis,
+# so the inputs template is re-formatted with that shape here.
+overwrite_call_docstring(
+    FlaxDistilBertForMultipleChoice, DISTILBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")
+)
+# Registers FlaxMultipleChoiceModelOutput as the output type used by the call-docstring helper.
+append_call_sample_docstring(
+    FlaxDistilBertForMultipleChoice,
+    _CHECKPOINT_FOR_DOC,
+    FlaxMultipleChoiceModelOutput,
+    _CONFIG_FOR_DOC,
+)
+
+
+class FlaxDistilBertForTokenClassificationModule(nn.Module):
+    config: DistilBertConfig
+    dtype: jnp.dtype = jnp.float32
+
+    def setup(self):
+        self.distilbert = FlaxDistilBertModule(config=self.config, dtype=self.dtype)
+        self.dropout = nn.Dropout(rate=self.config.dropout)
+        self.classifier = nn.Dense(self.config.num_labels, dtype=self.dtype)
+
+    def __call__(
+        self,
+        input_ids,
+        attention_mask,
+        deterministic: bool = True,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+    ):
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        # Model
+        outputs = self.distilbert(
+            input_ids,
+            attention_mask,
+            deterministic=deterministic,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        hidden_states = outputs[0]
+        hidden_states = self.dropout(hidden_states, deterministic=deterministic)
+        logits = self.classifier(hidden_states)
+
+        if not return_dict:
+            return (logits,) + outputs[1:]
+
+        return FlaxTokenClassifierOutput(
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+@add_start_docstrings(
+    """
+    DistilBert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g.
+    for Named-Entity-Recognition (NER) tasks.
+    """,
+    FLAX_DISTILBERT_START_DOCSTRING,
+)
+class FlaxDistilBertForTokenClassification(FlaxDistilBertPreTrainedModel):
+    # Only the wrapped module is specified; `__init__`, `init_weights` and `__call__` are inherited.
+    module_class = FlaxDistilBertForTokenClassificationModule
+
+
+# Registers FlaxTokenClassifierOutput as the output type used by the call-docstring helper.
+append_call_sample_docstring(
+    FlaxDistilBertForTokenClassification,
+    _CHECKPOINT_FOR_DOC,
+    FlaxTokenClassifierOutput,
+    _CONFIG_FOR_DOC,
+)
+
+
+class FlaxDistilBertForQuestionAnsweringModule(nn.Module):
+    config: DistilBertConfig
+    dtype: jnp.dtype = jnp.float32
+
+    def setup(self):
+        self.distilbert = FlaxDistilBertModule(config=self.config, dtype=self.dtype)
+        self.qa_outputs = nn.Dense(self.config.num_labels, dtype=self.dtype)
+        assert self.config.num_labels == 2
+        self.dropout = nn.Dropout(rate=self.config.qa_dropout)
+
+    def __call__(
+        self,
+        input_ids,
+        attention_mask,
+        deterministic: bool = True,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+    ):
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # Model
+        distilbert_output = self.distilbert(
+            input_ids,
+            attention_mask,
+            deterministic=deterministic,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        hidden_states = distilbert_output[0]
+
+        hidden_states = self.dropout(hidden_states, deterministic=deterministic)
+        logits = self.qa_outputs(hidden_states)
+        start_logits, end_logits = jnp.split(logits, self.config.num_labels, axis=-1)
+        start_logits = start_logits.squeeze(-1)
+        end_logits = end_logits.squeeze(-1)
+
+        if not return_dict:
+            return (start_logits, end_logits) + distilbert_output[1:]
+
+        return FlaxQuestionAnsweringModelOutput(
+            start_logits=start_logits,
+            end_logits=end_logits,
+            hidden_states=distilbert_output.hidden_states,
+            attentions=distilbert_output.attentions,
+        )
+
+
+@add_start_docstrings(
+    """
+    DistilBert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a
+    linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
+    """,
+    FLAX_DISTILBERT_START_DOCSTRING,
+)
+class FlaxDistilBertForQuestionAnswering(FlaxDistilBertPreTrainedModel):
+    # Only the wrapped module is specified; `__init__`, `init_weights` and `__call__` are inherited.
+    module_class = FlaxDistilBertForQuestionAnsweringModule
+
+
+# Registers FlaxQuestionAnsweringModelOutput as the output type used by the call-docstring helper.
+append_call_sample_docstring(
+    FlaxDistilBertForQuestionAnswering,
+    _CHECKPOINT_FOR_DOC,
+    FlaxQuestionAnsweringModelOutput,
+    _CONFIG_FOR_DOC,
+)
+
+
+# Names exported by `from <this module> import *`; the internal `*Module` classes are intentionally not listed.
+__all__ = [
+    "FlaxDistilBertForMaskedLM",
+    "FlaxDistilBertForMultipleChoice",
+    "FlaxDistilBertForQuestionAnswering",
+    "FlaxDistilBertForSequenceClassification",
+    "FlaxDistilBertForTokenClassification",
+    "FlaxDistilBertModel",
+    "FlaxDistilBertPreTrainedModel",
+]
diff --git a/phivenv/Lib/site-packages/transformers/models/distilbert/modeling_tf_distilbert.py b/phivenv/Lib/site-packages/transformers/models/distilbert/modeling_tf_distilbert.py
new file mode 100644
index 0000000000000000000000000000000000000000..a2efa1105c1cde9236e9c32627a5773cb38cc862
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/distilbert/modeling_tf_distilbert.py
@@ -0,0 +1,1146 @@
+# coding=utf-8
+# Copyright 2019-present, the HuggingFace Inc. team, The Google AI Language Team and Facebook, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+TF 2.0 DistilBERT model
+"""
+
+from __future__ import annotations
+
+import warnings
+
+import numpy as np
+import tensorflow as tf
+
+from ...activations_tf import get_tf_activation
+from ...modeling_tf_outputs import (
+    TFBaseModelOutput,
+    TFMaskedLMOutput,
+    TFMultipleChoiceModelOutput,
+    TFQuestionAnsweringModelOutput,
+    TFSequenceClassifierOutput,
+    TFTokenClassifierOutput,
+)
+from ...modeling_tf_utils import (
+    TFMaskedLanguageModelingLoss,
+    TFModelInputType,
+    TFMultipleChoiceLoss,
+    TFPreTrainedModel,
+    TFQuestionAnsweringLoss,
+    TFSequenceClassificationLoss,
+    TFTokenClassificationLoss,
+    get_initializer,
+    keras,
+    keras_serializable,
+    unpack_inputs,
+)
+from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax
+from ...utils import (
+    add_code_sample_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+)
+from .configuration_distilbert import DistilBertConfig
+
+
+logger = logging.get_logger(__name__)
+
+_CHECKPOINT_FOR_DOC = "distilbert-base-uncased"
+_CONFIG_FOR_DOC = "DistilBertConfig"
+
+
+class TFEmbeddings(keras.layers.Layer):
+    """Construct the embeddings from word and position embeddings (no token_type embeddings are created)."""
+
+    def __init__(self, config, **kwargs):
+        super().__init__(**kwargs)
+        self.config = config
+        self.dim = config.dim
+        self.initializer_range = config.initializer_range
+        self.max_position_embeddings = config.max_position_embeddings
+        self.LayerNorm = keras.layers.LayerNormalization(epsilon=1e-12, name="LayerNorm")
+        self.dropout = keras.layers.Dropout(rate=config.dropout)
+
+    def build(self, input_shape=None):
+        if self.built:  # guard first so a repeated build() cannot re-create the embedding weights
+            return
+        self.built = True
+        with tf.name_scope("word_embeddings"):
+            self.weight = self.add_weight(
+                name="weight",
+                shape=[self.config.vocab_size, self.dim],
+                initializer=get_initializer(initializer_range=self.initializer_range),
+            )
+
+        with tf.name_scope("position_embeddings"):
+            self.position_embeddings = self.add_weight(
+                name="embeddings",
+                shape=[self.max_position_embeddings, self.dim],
+                initializer=get_initializer(initializer_range=self.initializer_range),
+            )
+
+        if getattr(self, "LayerNorm", None) is not None:
+            with tf.name_scope(self.LayerNorm.name):
+                self.LayerNorm.build([None, None, self.config.dim])
+
+    def call(self, input_ids=None, position_ids=None, inputs_embeds=None, training=False):
+        """
+        Sum word and position embeddings, then apply LayerNorm and dropout.
+
+        Returns:
+            final_embeddings (`tf.Tensor`): output embedding tensor.
+        """
+        assert not (input_ids is None and inputs_embeds is None)
+
+        if input_ids is not None:  # when both are given, the lookup below overwrites `inputs_embeds`
+            check_embeddings_within_bounds(input_ids, self.config.vocab_size)
+            inputs_embeds = tf.gather(params=self.weight, indices=input_ids)
+
+        input_shape = shape_list(inputs_embeds)[:-1]  # every axis except the trailing embedding axis
+
+        if position_ids is None:  # default positions: 0 .. seq_length - 1 with a leading axis of size 1
+            position_ids = tf.expand_dims(tf.range(start=0, limit=input_shape[-1]), axis=0)
+
+        position_embeds = tf.gather(params=self.position_embeddings, indices=position_ids)
+        final_embeddings = inputs_embeds + position_embeds
+        final_embeddings = self.LayerNorm(inputs=final_embeddings)
+        final_embeddings = self.dropout(inputs=final_embeddings, training=training)
+
+        return final_embeddings
+
+
+class TFMultiHeadSelfAttention(keras.layers.Layer):
+    def __init__(self, config, **kwargs):
+        super().__init__(**kwargs)
+
+        self.n_heads = config.n_heads
+        self.dim = config.dim
+        self.dropout = keras.layers.Dropout(config.attention_dropout)
+        self.output_attentions = config.output_attentions
+
+        assert self.dim % self.n_heads == 0, f"Hidden size {self.dim} not dividable by number of heads {self.n_heads}"
+
+        self.q_lin = keras.layers.Dense(
+            config.dim, kernel_initializer=get_initializer(config.initializer_range), name="q_lin"
+        )
+        self.k_lin = keras.layers.Dense(
+            config.dim, kernel_initializer=get_initializer(config.initializer_range), name="k_lin"
+        )
+        self.v_lin = keras.layers.Dense(
+            config.dim, kernel_initializer=get_initializer(config.initializer_range), name="v_lin"
+        )
+        self.out_lin = keras.layers.Dense(
+            config.dim, kernel_initializer=get_initializer(config.initializer_range), name="out_lin"
+        )
+
+        self.pruned_heads = set()
+        self.config = config
+
+    def prune_heads(self, heads):
+        raise NotImplementedError
+
+    def call(self, query, key, value, mask, head_mask, output_attentions, training=False):
+        """
+        Parameters:
+            query: tf.Tensor(bs, seq_length, dim)
+            key: tf.Tensor(bs, seq_length, dim)
+            value: tf.Tensor(bs, seq_length, dim)
+            mask: tf.Tensor(bs, seq_length)
+
+        Returns:
+            `(context,)`, or `(context, weights)` if `output_attentions` is truthy. context: tf.Tensor(bs,
+            seq_length, dim) Contextualized layer. weights: tf.Tensor(bs, n_heads, seq_length, seq_length)
+        """
+        bs, q_length, dim = shape_list(query)
+        k_length = shape_list(key)[1]
+        # NOTE: the runtime `dim` of `query` is not checked against `self.dim`,
+        # and `key`/`value` are not checked to have matching shapes.
+        dim_per_head = self.dim // self.n_heads  # exact: __init__ asserts dim % n_heads == 0
+        dim_per_head = tf.cast(dim_per_head, dtype=tf.int32)
+        mask_reshape = [bs, 1, 1, k_length]
+
+        def shape(x):
+            """separate heads"""
+            return tf.transpose(tf.reshape(x, (bs, -1, self.n_heads, dim_per_head)), perm=(0, 2, 1, 3))
+
+        def unshape(x):
+            """group heads"""
+            return tf.reshape(tf.transpose(x, perm=(0, 2, 1, 3)), (bs, -1, self.n_heads * dim_per_head))
+
+        q = shape(self.q_lin(query))  # (bs, n_heads, q_length, dim_per_head)
+        k = shape(self.k_lin(key))  # (bs, n_heads, k_length, dim_per_head)
+        v = shape(self.v_lin(value))  # (bs, n_heads, k_length, dim_per_head)
+        q = tf.cast(q, dtype=tf.float32)
+        q = tf.multiply(q, tf.math.rsqrt(tf.cast(dim_per_head, dtype=tf.float32)))
+        k = tf.cast(k, dtype=q.dtype)
+        scores = tf.matmul(q, k, transpose_b=True)  # (bs, n_heads, q_length, k_length)
+        mask = tf.reshape(mask, mask_reshape)  # (bs, 1, 1, k_length), broadcasts over heads and queries
+        # masking is additive: positions where mask == 0 get a large negative score before the softmax
+
+        mask = tf.cast(mask, dtype=scores.dtype)
+        scores = scores - 1e30 * (1.0 - mask)
+        weights = stable_softmax(scores, axis=-1)  # (bs, n_heads, qlen, klen)
+        weights = self.dropout(weights, training=training)  # (bs, n_heads, qlen, klen)
+
+        # Mask heads if we want to
+        if head_mask is not None:
+            weights = weights * head_mask
+
+        context = tf.matmul(weights, v)  # (bs, n_heads, qlen, dim_per_head)
+        context = unshape(context)  # (bs, q_length, dim)
+        context = self.out_lin(context)  # (bs, q_length, dim)
+
+        if output_attentions:
+            return (context, weights)
+        else:
+            return (context,)
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "q_lin", None) is not None:
+            with tf.name_scope(self.q_lin.name):
+                self.q_lin.build([None, None, self.config.dim])
+        if getattr(self, "k_lin", None) is not None:
+            with tf.name_scope(self.k_lin.name):
+                self.k_lin.build([None, None, self.config.dim])
+        if getattr(self, "v_lin", None) is not None:
+            with tf.name_scope(self.v_lin.name):
+                self.v_lin.build([None, None, self.config.dim])
+        if getattr(self, "out_lin", None) is not None:
+            with tf.name_scope(self.out_lin.name):
+                self.out_lin.build([None, None, self.config.dim])
+
+
+class TFFFN(keras.layers.Layer):
+    def __init__(self, config, **kwargs):
+        super().__init__(**kwargs)
+        self.dropout = keras.layers.Dropout(config.dropout)
+        self.lin1 = keras.layers.Dense(
+            config.hidden_dim, kernel_initializer=get_initializer(config.initializer_range), name="lin1"
+        )
+        self.lin2 = keras.layers.Dense(
+            config.dim, kernel_initializer=get_initializer(config.initializer_range), name="lin2"
+        )
+        self.activation = get_tf_activation(config.activation)
+        self.config = config
+
+    def call(self, input, training=False):
+        x = self.lin1(input)  # Dense with `config.hidden_dim` units
+        x = self.activation(x)  # activation resolved from `config.activation`
+        x = self.lin2(x)  # Dense with `config.dim` units
+        x = self.dropout(x, training=training)  # Dropout layer created with rate `config.dropout`
+        return x
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "lin1", None) is not None:
+            with tf.name_scope(self.lin1.name):
+                self.lin1.build([None, None, self.config.dim])
+        if getattr(self, "lin2", None) is not None:
+            with tf.name_scope(self.lin2.name):
+                self.lin2.build([None, None, self.config.hidden_dim])
+
+
+class TFTransformerBlock(keras.layers.Layer):
+    def __init__(self, config, **kwargs):
+        super().__init__(**kwargs)
+
+        self.n_heads = config.n_heads
+        self.dim = config.dim
+        self.hidden_dim = config.hidden_dim
+        self.dropout = keras.layers.Dropout(config.dropout)
+        self.activation = config.activation
+        self.output_attentions = config.output_attentions
+
+        assert config.dim % config.n_heads == 0, (
+            f"Hidden size {config.dim} not dividable by number of heads {config.n_heads}"
+        )
+
+        self.attention = TFMultiHeadSelfAttention(config, name="attention")
+        self.sa_layer_norm = keras.layers.LayerNormalization(epsilon=1e-12, name="sa_layer_norm")
+
+        self.ffn = TFFFN(config, name="ffn")
+        self.output_layer_norm = keras.layers.LayerNormalization(epsilon=1e-12, name="output_layer_norm")
+        self.config = config
+
+    def call(self, x, attn_mask, head_mask, output_attentions, training=False):  # removed: src_enc=None, src_len=None
+        """
+        Parameters:
+            x: tf.Tensor(bs, seq_length, dim)
+            attn_mask: tf.Tensor(bs, seq_length)
+
+        Outputs: `(ffn_output,)`, or `(sa_weights, ffn_output)` if `output_attentions` is truthy. ffn_output:
+        tf.Tensor(bs, seq_length, dim). sa_weights: tf.Tensor(bs, n_heads, seq_length, seq_length).
+        """
+        # Self-Attention
+        sa_output = self.attention(x, x, x, attn_mask, head_mask, output_attentions, training=training)
+        if output_attentions:
+            sa_output, sa_weights = sa_output  # (bs, seq_length, dim), (bs, n_heads, seq_length, seq_length)
+        else:  # To handle these `output_attentions` or `output_hidden_states` cases returning tuples
+            # the attention layer returns a 1-tuple `(context,)` in this branch
+            sa_output = sa_output[0]
+        sa_output = self.sa_layer_norm(sa_output + x)  # (bs, seq_length, dim)
+
+        # Feed Forward Network
+        ffn_output = self.ffn(sa_output, training=training)  # (bs, seq_length, dim)
+        ffn_output = self.output_layer_norm(ffn_output + sa_output)  # (bs, seq_length, dim)
+
+        output = (ffn_output,)
+        if output_attentions:
+            output = (sa_weights,) + output
+        return output
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "attention", None) is not None:
+            with tf.name_scope(self.attention.name):
+                self.attention.build(None)
+        if getattr(self, "sa_layer_norm", None) is not None:
+            with tf.name_scope(self.sa_layer_norm.name):
+                self.sa_layer_norm.build([None, None, self.config.dim])
+        if getattr(self, "ffn", None) is not None:
+            with tf.name_scope(self.ffn.name):
+                self.ffn.build(None)
+        if getattr(self, "output_layer_norm", None) is not None:
+            with tf.name_scope(self.output_layer_norm.name):
+                self.output_layer_norm.build([None, None, self.config.dim])
+
+
+class TFTransformer(keras.layers.Layer):
+    def __init__(self, config, **kwargs):
+        super().__init__(**kwargs)
+        self.n_layers = config.n_layers
+        self.output_hidden_states = config.output_hidden_states
+        self.output_attentions = config.output_attentions
+
+        self.layer = [TFTransformerBlock(config, name=f"layer_._{i}") for i in range(config.n_layers)]
+
+    def call(self, x, attn_mask, head_mask, output_attentions, output_hidden_states, return_dict, training=False):
+        # docstyle-ignore
+        """
+        Parameters:
+            x: tf.Tensor(bs, seq_length, dim) Input sequence embedded.
+            attn_mask: tf.Tensor(bs, seq_length) Attention mask on the sequence.
+
+        Returns:
+            hidden_state: tf.Tensor(bs, seq_length, dim)
+                Sequence of hidden states in the last (top) layer
+            all_hidden_states: tuple[tf.Tensor(bs, seq_length, dim)]
+                Tuple of length n_layers + 1: the input `x` followed by the output of each layer.
+                Optional: only if output_hidden_states=True
+            all_attentions: tuple[tf.Tensor(bs, n_heads, seq_length, seq_length)]
+                Tuple of length n_layers with the attention weights from each layer
+                Optional: only if output_attentions=True
+        """
+        all_hidden_states = () if output_hidden_states else None
+        all_attentions = () if output_attentions else None
+
+        hidden_state = x
+        for i, layer_module in enumerate(self.layer):
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_state,)
+
+            layer_outputs = layer_module(hidden_state, attn_mask, head_mask[i], output_attentions, training=training)
+            hidden_state = layer_outputs[-1]
+
+            if output_attentions:
+                assert len(layer_outputs) == 2
+                attentions = layer_outputs[0]
+                all_attentions = all_attentions + (attentions,)
+            else:
+                assert len(layer_outputs) == 1, f"Incorrect number of outputs {len(layer_outputs)} instead of 1"
+
+        # Add last layer
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_state,)
+
+        if not return_dict:
+            return tuple(v for v in [hidden_state, all_hidden_states, all_attentions] if v is not None)
+        return TFBaseModelOutput(
+            last_hidden_state=hidden_state, hidden_states=all_hidden_states, attentions=all_attentions
+        )
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "layer", None) is not None:
+            for layer in self.layer:
+                with tf.name_scope(layer.name):
+                    layer.build(None)
+
+
+@keras_serializable
+class TFDistilBertMainLayer(keras.layers.Layer):
+    config_class = DistilBertConfig
+
+    def __init__(self, config, **kwargs):
+        super().__init__(**kwargs)
+
+        self.config = config
+        self.num_hidden_layers = config.num_hidden_layers
+        self.output_attentions = config.output_attentions
+        self.output_hidden_states = config.output_hidden_states
+        self.return_dict = config.use_return_dict
+
+        self.embeddings = TFEmbeddings(config, name="embeddings")  # Embeddings
+        self.transformer = TFTransformer(config, name="transformer")  # Encoder
+
+    def get_input_embeddings(self):
+        return self.embeddings
+
+    def set_input_embeddings(self, value):
+        self.embeddings.weight = value
+        self.embeddings.vocab_size = value.shape[0]
+
+    def _prune_heads(self, heads_to_prune):
+        raise NotImplementedError
+
+    @unpack_inputs
+    def call(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        head_mask=None,
+        inputs_embeds=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+        training=False,
+    ):
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+        elif input_ids is not None:
+            input_shape = shape_list(input_ids)
+        elif inputs_embeds is not None:
+            input_shape = shape_list(inputs_embeds)[:-1]
+        else:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+        if attention_mask is None:
+            attention_mask = tf.ones(input_shape)  # (bs, seq_length)
+
+        attention_mask = tf.cast(attention_mask, dtype=tf.float32)
+
+        # Prepare head mask if needed
+        # 1.0 in head_mask indicate we keep the head
+        # attention_probs has shape bsz x n_heads x N x N
+        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
+        # head masking is not implemented in this layer: any non-None value is rejected below
+        if head_mask is not None:
+            raise NotImplementedError("`head_mask` is not supported by TFDistilBertMainLayer")
+        else:
+            head_mask = [None] * self.num_hidden_layers
+
+        embedding_output = self.embeddings(input_ids, inputs_embeds=inputs_embeds, training=training)  # (bs, seq, dim)
+        tfmr_output = self.transformer(
+            embedding_output,
+            attention_mask,
+            head_mask,
+            output_attentions,
+            output_hidden_states,
+            return_dict,
+            training=training,
+        )
+
+        return tfmr_output  # last-layer hidden-state, (all hidden_states), (all attentions)
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "embeddings", None) is not None:
+            with tf.name_scope(self.embeddings.name):
+                self.embeddings.build(None)
+        if getattr(self, "transformer", None) is not None:
+            with tf.name_scope(self.transformer.name):
+                self.transformer.build(None)
+
+
+# INTERFACE FOR ENCODER AND TASK SPECIFIC MODEL #
+class TFDistilBertPreTrainedModel(TFPreTrainedModel):
+    """
+    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+    models.
+    """
+
+    config_class = DistilBertConfig
+    base_model_prefix = "distilbert"
+
+
+DISTILBERT_START_DOCSTRING = r"""
+
+    This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.)
+
+    This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
+    as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
+    behavior.
+
+    <Tip>
+
+    TensorFlow models and layers in `transformers` accept two formats as input:
+
+    - having all inputs as keyword arguments (like PyTorch models), or
+    - having all inputs as a list, tuple or dict in the first positional argument.
+
+    The reason the second format is supported is that Keras methods prefer this format when passing inputs to models
+    and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just
+    pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second
+    format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with
+    the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first
+    positional argument:
+
+    - a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
+    - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
+    `model([input_ids])` or `model([input_ids, attention_mask])`
+    - a dictionary with one or several input Tensors associated to the input names given in the docstring:
+    `model({"input_ids": input_ids, "attention_mask": attention_mask})`
+
+    Note that when creating models and layers with
+    [subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry
+    about any of this, as you can just pass inputs like you would to any other Python function!
+
+    </Tip>
+
+    Parameters:
+        config ([`DistilBertConfig`]): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+DISTILBERT_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`Numpy array` or `tf.Tensor` of shape `({0})`):
+            Indices of input sequence tokens in the vocabulary.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and
+            [`PreTrainedTokenizer.encode`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        attention_mask (`Numpy array` or `tf.Tensor` of shape `({0})`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
+        head_mask (`Numpy array` or `tf.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+
+        inputs_embeds (`tf.Tensor` of shape `({0}, hidden_size)`, *optional*):
+            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+            model's internal embedding lookup matrix.
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the
+            config will be used instead.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
+            used instead.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in
+            eager mode, in graph mode the value will always be set to True.
+        training (`bool`, *optional*, defaults to `False`):
+            Whether or not to use the model in training mode (some modules like dropout modules have different
+            behaviors between training and evaluation).
+"""
+
+
+@add_start_docstrings(
+    "The bare DistilBERT encoder/transformer outputting raw hidden-states without any specific head on top.",
+    DISTILBERT_START_DOCSTRING,
+)
+class TFDistilBertModel(TFDistilBertPreTrainedModel):
+    def __init__(self, config, *inputs, **kwargs):
+        super().__init__(config, *inputs, **kwargs)
+        self.distilbert = TFDistilBertMainLayer(config, name="distilbert")  # Embeddings + transformer encoder
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(DISTILBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_code_sample_docstrings(
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=TFBaseModelOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def call(
+        self,
+        input_ids: TFModelInputType | None = None,
+        attention_mask: np.ndarray | tf.Tensor | None = None,
+        head_mask: np.ndarray | tf.Tensor | None = None,
+        inputs_embeds: np.ndarray | tf.Tensor | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        training: bool | None = False,
+    ) -> TFBaseModelOutput | tuple[tf.Tensor]:
+        outputs = self.distilbert(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+        return outputs
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "distilbert", None) is not None:
+            with tf.name_scope(self.distilbert.name):
+                self.distilbert.build(None)
+
+
+class TFDistilBertLMHead(keras.layers.Layer):
+    def __init__(self, config, input_embeddings, **kwargs):
+        super().__init__(**kwargs)
+
+        self.config = config
+        self.dim = config.dim
+
+        # The output weights are the same as the input embeddings, but there is
+        # an output-only bias for each token.
+        self.input_embeddings = input_embeddings
+
+    def build(self, input_shape):
+        self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias")
+
+        super().build(input_shape)
+
+    def get_output_embeddings(self):
+        return self.input_embeddings
+
+    def set_output_embeddings(self, value):
+        self.input_embeddings.weight = value
+        self.input_embeddings.vocab_size = shape_list(value)[0]
+
+    def get_bias(self):
+        return {"bias": self.bias}
+
+    def set_bias(self, value):
+        self.bias = value["bias"]
+        self.config.vocab_size = shape_list(value["bias"])[0]
+
+    def call(self, hidden_states):
+        seq_length = shape_list(tensor=hidden_states)[1]  # kept to restore axis 1 after the flattened matmul
+        hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.dim])  # flatten to (-1, dim)
+        hidden_states = tf.matmul(a=hidden_states, b=self.input_embeddings.weight, transpose_b=True)
+        hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.config.vocab_size])
+        hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.bias)  # bias of shape (vocab_size,)
+
+        return hidden_states
+
+
+@add_start_docstrings(
+    """DistilBert Model with a `masked language modeling` head on top.""",
+    DISTILBERT_START_DOCSTRING,
+)
+class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel, TFMaskedLanguageModelingLoss):
+    def __init__(self, config, *inputs, **kwargs):
+        super().__init__(config, *inputs, **kwargs)
+        self.config = config
+
+        self.distilbert = TFDistilBertMainLayer(config, name="distilbert")
+        self.vocab_transform = keras.layers.Dense(
+            config.dim, kernel_initializer=get_initializer(config.initializer_range), name="vocab_transform"
+        )
+        self.act = get_tf_activation(config.activation)
+        self.vocab_layer_norm = keras.layers.LayerNormalization(epsilon=1e-12, name="vocab_layer_norm")
+        self.vocab_projector = TFDistilBertLMHead(config, self.distilbert.embeddings, name="vocab_projector")
+
+    def get_lm_head(self):
+        return self.vocab_projector
+
+    def get_prefix_bias_name(self):
+        warnings.warn("The method get_prefix_bias_name is deprecated. Please use `get_bias` instead.", FutureWarning)
+        return self.name + "/" + self.vocab_projector.name
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(DISTILBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_code_sample_docstrings(
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=TFMaskedLMOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def call(
+        self,
+        input_ids: TFModelInputType | None = None,
+        attention_mask: np.ndarray | tf.Tensor | None = None,
+        head_mask: np.ndarray | tf.Tensor | None = None,
+        inputs_embeds: np.ndarray | tf.Tensor | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        labels: np.ndarray | tf.Tensor | None = None,
+        training: bool | None = False,
+    ) -> TFMaskedLMOutput | tuple[tf.Tensor]:
+        r"""
+        labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
+            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
+            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
+        """
+        distilbert_output = self.distilbert(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+        hidden_states = distilbert_output[0]  # (bs, seq_length, dim)
+        prediction_logits = self.vocab_transform(hidden_states)  # (bs, seq_length, dim)
+        prediction_logits = self.act(prediction_logits)  # (bs, seq_length, dim)
+        prediction_logits = self.vocab_layer_norm(prediction_logits)  # (bs, seq_length, dim)
+        prediction_logits = self.vocab_projector(prediction_logits)
+
+        loss = None if labels is None else self.hf_compute_loss(labels, prediction_logits)
+
+        if not return_dict:
+            output = (prediction_logits,) + distilbert_output[1:]
+            return ((loss,) + output) if loss is not None else output
+
+        return TFMaskedLMOutput(
+            loss=loss,
+            logits=prediction_logits,
+            hidden_states=distilbert_output.hidden_states,
+            attentions=distilbert_output.attentions,
+        )
+
+    def build(self, input_shape=None):  # builds every sublayer; input_shape is never read here
+        if self.built:
+            return  # already built: nothing to do
+        self.built = True  # flag is set before the sublayers are built
+        if getattr(self, "distilbert", None) is not None:  # each sublayer is skipped if absent or None
+            with tf.name_scope(self.distilbert.name):  # build under a scope named after the sublayer
+                self.distilbert.build(None)
+        if getattr(self, "vocab_transform", None) is not None:
+            with tf.name_scope(self.vocab_transform.name):
+                self.vocab_transform.build([None, None, self.config.dim])  # input feature size is config.dim
+        if getattr(self, "vocab_layer_norm", None) is not None:
+            with tf.name_scope(self.vocab_layer_norm.name):
+                self.vocab_layer_norm.build([None, None, self.config.dim])  # same feature size as vocab_transform
+        if getattr(self, "vocab_projector", None) is not None:
+            with tf.name_scope(self.vocab_projector.name):
+                self.vocab_projector.build(None)  # no shape passed, same as for the encoder above
+
+
+@add_start_docstrings(
+    """
+    DistilBert Model transformer with a sequence classification/regression head on top (a linear layer on top of the
+    pooled output) e.g. for GLUE tasks.
+    """,
+    DISTILBERT_START_DOCSTRING,
+)
+class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel, TFSequenceClassificationLoss):
+    def __init__(self, config, *inputs, **kwargs):
+        super().__init__(config, *inputs, **kwargs)
+        self.num_labels = config.num_labels
+
+        self.distilbert = TFDistilBertMainLayer(config, name="distilbert")  # shared encoder
+        self.pre_classifier = keras.layers.Dense(  # dim -> dim projection with ReLU, applied before the classifier
+            config.dim,
+            kernel_initializer=get_initializer(config.initializer_range),
+            activation="relu",
+            name="pre_classifier",
+        )
+        self.classifier = keras.layers.Dense(  # final projection to config.num_labels outputs
+            config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
+        )
+        self.dropout = keras.layers.Dropout(config.seq_classif_dropout)
+        self.config = config  # read again in build() for the layer input sizes
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(DISTILBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_code_sample_docstrings(
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=TFSequenceClassifierOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def call(
+        self,
+        input_ids: TFModelInputType | None = None,
+        attention_mask: np.ndarray | tf.Tensor | None = None,
+        head_mask: np.ndarray | tf.Tensor | None = None,
+        inputs_embeds: np.ndarray | tf.Tensor | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        labels: np.ndarray | tf.Tensor | None = None,
+        training: bool | None = False,
+    ) -> TFSequenceClassifierOutput | tuple[tf.Tensor]:
+        r"""
+        labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+        """
+        distilbert_output = self.distilbert(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+        hidden_state = distilbert_output[0]  # (bs, seq_len, dim)
+        pooled_output = hidden_state[:, 0]  # (bs, dim) - hidden state of the first token only
+        pooled_output = self.pre_classifier(pooled_output)  # (bs, dim)
+        pooled_output = self.dropout(pooled_output, training=training)  # (bs, dim)
+        logits = self.classifier(pooled_output)  # (bs, num_labels)
+
+        loss = None if labels is None else self.hf_compute_loss(labels, logits)  # loss only when labels are given
+
+        if not return_dict:  # tuple form: optional loss, then logits, then the remaining encoder outputs
+            output = (logits,) + distilbert_output[1:]
+            return ((loss,) + output) if loss is not None else output
+
+        return TFSequenceClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=distilbert_output.hidden_states,
+            attentions=distilbert_output.attentions,
+        )
+
+    def build(self, input_shape=None):  # builds every sublayer; input_shape is never read here
+        if self.built:
+            return  # already built: nothing to do
+        self.built = True
+        if getattr(self, "distilbert", None) is not None:
+            with tf.name_scope(self.distilbert.name):  # build under a scope named after the sublayer
+                self.distilbert.build(None)
+        if getattr(self, "pre_classifier", None) is not None:
+            with tf.name_scope(self.pre_classifier.name):
+                # NOTE(review): call() feeds a rank-2 (bs, dim) tensor; presumably only the last dim matters - confirm
+                self.pre_classifier.build([None, None, self.config.dim])
+        if getattr(self, "classifier", None) is not None:
+            with tf.name_scope(self.classifier.name):
+                self.classifier.build([None, None, self.config.dim])  # classifier input is the dim-sized pooled output
+
+
+@add_start_docstrings(
+    """
+    DistilBert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g.
+    for Named-Entity-Recognition (NER) tasks.
+    """,
+    DISTILBERT_START_DOCSTRING,
+)
+class TFDistilBertForTokenClassification(TFDistilBertPreTrainedModel, TFTokenClassificationLoss):
+    def __init__(self, config, *inputs, **kwargs):
+        super().__init__(config, *inputs, **kwargs)
+        self.num_labels = config.num_labels
+
+        self.distilbert = TFDistilBertMainLayer(config, name="distilbert")  # shared encoder
+        self.dropout = keras.layers.Dropout(config.dropout)  # applied to the encoder output before classification
+        self.classifier = keras.layers.Dense(  # per-token projection to config.num_labels outputs
+            config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
+        )
+        self.config = config  # read again in build() for the classifier input size
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(DISTILBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_code_sample_docstrings(
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=TFTokenClassifierOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def call(
+        self,
+        input_ids: TFModelInputType | None = None,
+        attention_mask: np.ndarray | tf.Tensor | None = None,
+        head_mask: np.ndarray | tf.Tensor | None = None,
+        inputs_embeds: np.ndarray | tf.Tensor | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        labels: np.ndarray | tf.Tensor | None = None,
+        training: bool | None = False,
+    ) -> TFTokenClassifierOutput | tuple[tf.Tensor]:
+        r"""
+        labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
+        """
+        outputs = self.distilbert(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+        sequence_output = outputs[0]  # per-token hidden states; presumably (bs, seq_len, dim) as in the sibling heads
+        sequence_output = self.dropout(sequence_output, training=training)
+        logits = self.classifier(sequence_output)  # one num_labels-sized score vector per token
+        loss = None if labels is None else self.hf_compute_loss(labels, logits)  # loss only when labels are given
+
+        if not return_dict:  # tuple form: optional loss, then logits, then the remaining encoder outputs
+            output = (logits,) + outputs[1:]
+            return ((loss,) + output) if loss is not None else output
+
+        return TFTokenClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+    def build(self, input_shape=None):  # builds every sublayer; input_shape is never read here
+        if self.built:
+            return  # already built: nothing to do
+        self.built = True
+        if getattr(self, "distilbert", None) is not None:
+            with tf.name_scope(self.distilbert.name):  # build under a scope named after the sublayer
+                self.distilbert.build(None)
+        if getattr(self, "classifier", None) is not None:
+            with tf.name_scope(self.classifier.name):
+                # NOTE(review): sibling heads use config.dim here; presumably hidden_size is an alias of dim - confirm
+                self.classifier.build([None, None, self.config.hidden_size])
+
+
+@add_start_docstrings(
+    """
+    DistilBert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and
+    a softmax) e.g. for RocStories/SWAG tasks.
+    """,
+    DISTILBERT_START_DOCSTRING,
+)
+class TFDistilBertForMultipleChoice(TFDistilBertPreTrainedModel, TFMultipleChoiceLoss):
+    def __init__(self, config, *inputs, **kwargs):
+        super().__init__(config, *inputs, **kwargs)
+
+        self.distilbert = TFDistilBertMainLayer(config, name="distilbert")  # shared encoder
+        self.dropout = keras.layers.Dropout(config.seq_classif_dropout)
+        self.pre_classifier = keras.layers.Dense(  # dim -> dim projection with ReLU, applied before the classifier
+            config.dim,
+            kernel_initializer=get_initializer(config.initializer_range),
+            activation="relu",
+            name="pre_classifier",
+        )
+        self.classifier = keras.layers.Dense(  # a single score per (example, choice) row
+            1, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
+        )
+        self.config = config  # read again in build() for the layer input sizes
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(
+        DISTILBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")
+    )
+    @add_code_sample_docstrings(
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=TFMultipleChoiceModelOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def call(
+        self,
+        input_ids: TFModelInputType | None = None,
+        attention_mask: np.ndarray | tf.Tensor | None = None,
+        head_mask: np.ndarray | tf.Tensor | None = None,
+        inputs_embeds: np.ndarray | tf.Tensor | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        labels: np.ndarray | tf.Tensor | None = None,
+        training: bool | None = False,
+    ) -> TFMultipleChoiceModelOutput | tuple[tf.Tensor]:
+        r"""
+        labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., num_choices-1]`
+            where `num_choices` is the size of the second dimension of the input tensors. (See `input_ids` above)
+        """
+        if input_ids is not None:  # sizes come from input_ids when given, otherwise from inputs_embeds
+            num_choices = shape_list(input_ids)[1]
+            seq_length = shape_list(input_ids)[2]
+        else:
+            num_choices = shape_list(inputs_embeds)[1]
+            seq_length = shape_list(inputs_embeds)[2]
+
+        # Fold the choice axis into the batch axis so the encoder sees ordinary (rows, seq_length) inputs.
+        flat_input_ids = tf.reshape(input_ids, (-1, seq_length)) if input_ids is not None else None
+        flat_attention_mask = tf.reshape(attention_mask, (-1, seq_length)) if attention_mask is not None else None
+        flat_inputs_embeds = (
+            tf.reshape(inputs_embeds, (-1, seq_length, shape_list(inputs_embeds)[3]))  # trailing axis 3 is kept as-is
+            if inputs_embeds is not None
+            else None
+        )
+        distilbert_output = self.distilbert(  # positional here; presumably the same order the sibling heads pass by keyword
+            flat_input_ids,
+            flat_attention_mask,
+            head_mask,
+            flat_inputs_embeds,
+            output_attentions,
+            output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+        hidden_state = distilbert_output[0]  # (bs * num_choices, seq_len, dim)
+        pooled_output = hidden_state[:, 0]  # (bs * num_choices, dim) - hidden state of the first token only
+        pooled_output = self.pre_classifier(pooled_output)  # (bs * num_choices, dim)
+        pooled_output = self.dropout(pooled_output, training=training)  # (bs * num_choices, dim)
+        logits = self.classifier(pooled_output)  # (bs * num_choices, 1)
+        reshaped_logits = tf.reshape(logits, (-1, num_choices))  # (bs, num_choices): one score per choice
+
+        loss = None if labels is None else self.hf_compute_loss(labels, reshaped_logits)
+
+        if not return_dict:  # tuple form: optional loss, then logits, then the remaining encoder outputs
+            output = (reshaped_logits,) + distilbert_output[1:]
+            return ((loss,) + output) if loss is not None else output
+
+        return TFMultipleChoiceModelOutput(
+            loss=loss,
+            logits=reshaped_logits,
+            hidden_states=distilbert_output.hidden_states,
+            attentions=distilbert_output.attentions,
+        )
+
+    def build(self, input_shape=None):  # builds every sublayer; input_shape is never read here
+        if self.built:
+            return  # already built: nothing to do
+        self.built = True
+        if getattr(self, "distilbert", None) is not None:
+            with tf.name_scope(self.distilbert.name):  # build under a scope named after the sublayer
+                self.distilbert.build(None)
+        if getattr(self, "pre_classifier", None) is not None:
+            with tf.name_scope(self.pre_classifier.name):
+                self.pre_classifier.build([None, None, self.config.dim])  # input feature size is config.dim
+        if getattr(self, "classifier", None) is not None:
+            with tf.name_scope(self.classifier.name):
+                self.classifier.build([None, None, self.config.dim])  # classifier input is the dim-sized pooled output
+
+
+@add_start_docstrings(
+    """
+    DistilBert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a
+    linear layer on top of the hidden-states output to compute `span start logits` and `span end logits`).
+    """,
+    DISTILBERT_START_DOCSTRING,
+)
+class TFDistilBertForQuestionAnswering(TFDistilBertPreTrainedModel, TFQuestionAnsweringLoss):
+    def __init__(self, config, *inputs, **kwargs):
+        super().__init__(config, *inputs, **kwargs)
+
+        self.distilbert = TFDistilBertMainLayer(config, name="distilbert")  # shared encoder
+        self.qa_outputs = keras.layers.Dense(  # per-token projection; its two outputs are split into start/end in call()
+            config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
+        )
+        # NOTE(review): assert is stripped under `python -O` and only runs after qa_outputs is created; consider raising.
+        assert config.num_labels == 2, f"Incorrect number of labels {config.num_labels} instead of 2"
+        self.dropout = keras.layers.Dropout(config.qa_dropout)
+        self.config = config  # read again in build() for the qa_outputs input size
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(DISTILBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_code_sample_docstrings(
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=TFQuestionAnsweringModelOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def call(
+        self,
+        input_ids: TFModelInputType | None = None,
+        attention_mask: np.ndarray | tf.Tensor | None = None,
+        head_mask: np.ndarray | tf.Tensor | None = None,
+        inputs_embeds: np.ndarray | tf.Tensor | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        start_positions: np.ndarray | tf.Tensor | None = None,
+        end_positions: np.ndarray | tf.Tensor | None = None,
+        training: bool | None = False,
+    ) -> TFQuestionAnsweringModelOutput | tuple[tf.Tensor]:
+        r"""
+        start_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*):
+            Labels for position (index) of the start of the labelled span for computing the token classification loss.
+            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
+            are not taken into account for computing the loss.
+        end_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*):
+            Labels for position (index) of the end of the labelled span for computing the token classification loss.
+            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
+            are not taken into account for computing the loss.
+        """
+        distilbert_output = self.distilbert(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+        hidden_states = distilbert_output[0]  # (bs, max_query_len, dim)
+        hidden_states = self.dropout(hidden_states, training=training)  # (bs, max_query_len, dim)
+        logits = self.qa_outputs(hidden_states)  # (bs, max_query_len, 2)
+        start_logits, end_logits = tf.split(logits, 2, axis=-1)  # two (bs, max_query_len, 1) halves
+        start_logits = tf.squeeze(start_logits, axis=-1)  # (bs, max_query_len)
+        end_logits = tf.squeeze(end_logits, axis=-1)  # (bs, max_query_len)
+
+        loss = None
+        if start_positions is not None and end_positions is not None:  # loss needs both span ends
+            labels = {"start_position": start_positions}
+            labels["end_position"] = end_positions
+            loss = self.hf_compute_loss(labels, (start_logits, end_logits))
+
+        if not return_dict:  # tuple form: optional loss, start/end logits, then the remaining encoder outputs
+            output = (start_logits, end_logits) + distilbert_output[1:]
+            return ((loss,) + output) if loss is not None else output
+
+        return TFQuestionAnsweringModelOutput(
+            loss=loss,
+            start_logits=start_logits,
+            end_logits=end_logits,
+            hidden_states=distilbert_output.hidden_states,
+            attentions=distilbert_output.attentions,
+        )
+
+    def build(self, input_shape=None):  # builds every sublayer; input_shape is never read here
+        if self.built:
+            return  # already built: nothing to do
+        self.built = True
+        if getattr(self, "distilbert", None) is not None:
+            with tf.name_scope(self.distilbert.name):  # build under a scope named after the sublayer
+                self.distilbert.build(None)
+        if getattr(self, "qa_outputs", None) is not None:
+            with tf.name_scope(self.qa_outputs.name):
+                self.qa_outputs.build([None, None, self.config.dim])  # input feature size is config.dim
+
+
+__all__ = [  # names exported by `from <module> import *`
+    "TFDistilBertForMaskedLM",
+    "TFDistilBertForMultipleChoice",
+    "TFDistilBertForQuestionAnswering",
+    "TFDistilBertForSequenceClassification",
+    "TFDistilBertForTokenClassification",
+    "TFDistilBertMainLayer",
+    "TFDistilBertModel",
+    "TFDistilBertPreTrainedModel",
+]
diff --git a/phivenv/Lib/site-packages/transformers/models/distilbert/tokenization_distilbert.py b/phivenv/Lib/site-packages/transformers/models/distilbert/tokenization_distilbert.py
new file mode 100644
index 0000000000000000000000000000000000000000..4e44468ab1d574666053b0ba772f2f0e098d5263
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/distilbert/tokenization_distilbert.py
@@ -0,0 +1,492 @@
+# coding=utf-8
+# Copyright 2018 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization classes for DistilBERT."""
+
+import collections
+import os
+import unicodedata
+from typing import Optional
+
+from ...tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)  # module-level logger, used by save_vocabulary for its warning
+
+VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}  # file name save_vocabulary writes inside a target directory
+
+
+# Copied from transformers.models.bert.tokenization_bert.load_vocab
+def load_vocab(vocab_file):
+    """Loads a vocabulary file (one token per line, UTF-8) into an ordered token -> line-index dictionary."""
+    vocab = collections.OrderedDict()  # keeps tokens in file order
+    with open(vocab_file, "r", encoding="utf-8") as reader:
+        tokens = reader.readlines()  # the whole file is read into memory at once
+    for index, token in enumerate(tokens):  # ids are zero-based line numbers
+        token = token.rstrip("\n")  # only trailing newlines are removed; other whitespace is kept
+        vocab[token] = index  # a repeated token keeps its first position but takes the later index
+    return vocab
+
+
+# Copied from transformers.models.bert.tokenization_bert.whitespace_tokenize
+def whitespace_tokenize(text):
+    """Runs basic whitespace cleaning and splitting on a piece of text; returns a list of tokens."""
+    text = text.strip()
+    if not text:
+        return []  # empty or whitespace-only input yields no tokens
+    tokens = text.split()  # split() with no argument splits on runs of any whitespace
+    return tokens
+
+
+class DistilBertTokenizer(PreTrainedTokenizer):
+    r"""
+    Construct a DistilBERT tokenizer. Based on WordPiece.
+
+    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
+    this superclass for more information regarding those methods.
+
+    Args:
+        vocab_file (`str`):
+            File containing the vocabulary.
+        do_lower_case (`bool`, *optional*, defaults to `True`):
+            Whether or not to lowercase the input when tokenizing.
+        do_basic_tokenize (`bool`, *optional*, defaults to `True`):
+            Whether or not to do basic tokenization before WordPiece.
+        never_split (`Iterable`, *optional*):
+            Collection of tokens which will never be split during tokenization. Only has an effect when
+            `do_basic_tokenize=True`
+        unk_token (`str`, *optional*, defaults to `"[UNK]"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        sep_token (`str`, *optional*, defaults to `"[SEP]"`):
+            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
+            sequence classification or for a text and a question for question answering. It is also used as the last
+            token of a sequence built with special tokens.
+        pad_token (`str`, *optional*, defaults to `"[PAD]"`):
+            The token used for padding, for example when batching sequences of different lengths.
+        cls_token (`str`, *optional*, defaults to `"[CLS]"`):
+            The classifier token which is used when doing sequence classification (classification of the whole sequence
+            instead of per-token classification). It is the first token of the sequence when built with special tokens.
+        mask_token (`str`, *optional*, defaults to `"[MASK]"`):
+            The token used for masking values. This is the token used when training this model with masked language
+            modeling. This is the token which the model will try to predict.
+        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
+            Whether or not to tokenize Chinese characters.
+
+            This should likely be deactivated for Japanese (see this
+            [issue](https://github.com/huggingface/transformers/issues/328)).
+        strip_accents (`bool`, *optional*):
+            Whether or not to strip all accents. If this option is not specified, then it will be determined by the
+            value for `lowercase` (as in the original BERT).
+        clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):
+            Whether or not to cleanup spaces after decoding, cleanup consists in removing potential artifacts like
+            extra spaces.
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    model_input_names = ["input_ids", "attention_mask"]
+
+    def __init__(
+        self,
+        vocab_file,
+        do_lower_case=True,
+        do_basic_tokenize=True,
+        never_split=None,
+        unk_token="[UNK]",
+        sep_token="[SEP]",
+        pad_token="[PAD]",
+        cls_token="[CLS]",
+        mask_token="[MASK]",
+        tokenize_chinese_chars=True,
+        strip_accents=None,
+        clean_up_tokenization_spaces=True,
+        **kwargs,
+    ):
+        if not os.path.isfile(vocab_file):  # fail early when the path is not an existing regular file
+            raise ValueError(
+                f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
+                " model use `tokenizer = DistilBertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
+            )
+        self.vocab = load_vocab(vocab_file)  # token -> id
+        self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])  # id -> token
+        self.do_basic_tokenize = do_basic_tokenize
+        if do_basic_tokenize:  # basic_tokenizer is only created when basic tokenization is enabled
+            self.basic_tokenizer = BasicTokenizer(
+                do_lower_case=do_lower_case,
+                never_split=never_split,
+                tokenize_chinese_chars=tokenize_chinese_chars,
+                strip_accents=strip_accents,
+            )
+        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token))
+
+        super().__init__(  # runs after the vocab and sub-tokenizers exist; presumably the base init relies on them
+            do_lower_case=do_lower_case,
+            do_basic_tokenize=do_basic_tokenize,
+            never_split=never_split,
+            unk_token=unk_token,
+            sep_token=sep_token,
+            pad_token=pad_token,
+            cls_token=cls_token,
+            mask_token=mask_token,
+            tokenize_chinese_chars=tokenize_chinese_chars,
+            strip_accents=strip_accents,
+            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+            **kwargs,
+        )
+
+    @property
+    # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.do_lower_case
+    def do_lower_case(self):  # NOTE(review): raises AttributeError when built with do_basic_tokenize=False
+        return self.basic_tokenizer.do_lower_case
+
+    @property
+    # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.vocab_size
+    def vocab_size(self):  # size of the base vocabulary only; added tokens are not counted
+        return len(self.vocab)
+
+    # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.get_vocab
+    def get_vocab(self):  # returns a new dict: base vocab plus added tokens (added tokens win on a key clash)
+        return dict(self.vocab, **self.added_tokens_encoder)
+
+    # Copied from transformers.models.bert.tokenization_bert.BertTokenizer._tokenize
+    def _tokenize(self, text, split_special_tokens=False):  # returns the list of sub-tokens for `text`
+        split_tokens = []
+        if self.do_basic_tokenize:
+            for token in self.basic_tokenizer.tokenize(
+                text, never_split=self.all_special_tokens if not split_special_tokens else None  # protect specials
+            ):
+                # If the token is part of the never_split set
+                if token in self.basic_tokenizer.never_split:
+                    split_tokens.append(token)  # kept whole, not passed through WordPiece
+                else:
+                    split_tokens += self.wordpiece_tokenizer.tokenize(token)
+        else:
+            split_tokens = self.wordpiece_tokenizer.tokenize(text)  # WordPiece is applied to the raw text directly
+        return split_tokens
+
+    # Copied from transformers.models.bert.tokenization_bert.BertTokenizer._convert_token_to_id
+    def _convert_token_to_id(self, token):
+        """Converts a token (str) in an id using the vocab."""
+        return self.vocab.get(token, self.vocab.get(self.unk_token))  # unk id as fallback; None if unk is absent too
+
+    # Copied from transformers.models.bert.tokenization_bert.BertTokenizer._convert_id_to_token
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) in a token (str) using the vocab."""
+        return self.ids_to_tokens.get(index, self.unk_token)  # ids not in the vocab map to the unk token string
+
+    # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.convert_tokens_to_string
+    def convert_tokens_to_string(self, tokens):
+        """Converts a sequence of tokens (string) in a single string."""
+        out_string = " ".join(tokens).replace(" ##", "").strip()  # glue "##" pieces back onto the preceding token
+        return out_string
+
+    # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.build_inputs_with_special_tokens
+    def build_inputs_with_special_tokens(
+        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
+    ) -> list[int]:
+        """
+        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
+        adding special tokens. A BERT sequence has the following format:
+
+        - single sequence: `[CLS] X [SEP]`
+        - pair of sequences: `[CLS] A [SEP] B [SEP]`
+
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs to which the special tokens will be added.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
+        """
+        if token_ids_1 is None:  # single sequence
+            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
+        cls = [self.cls_token_id]
+        sep = [self.sep_token_id]
+        return cls + token_ids_0 + sep + token_ids_1 + sep  # new list; the input lists are not modified
+
+    # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.get_special_tokens_mask
+    def get_special_tokens_mask(
+        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False
+    ) -> list[int]:
+        """
+        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
+        special tokens using the tokenizer `prepare_for_model` method.
+
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
+                Whether or not the token list is already formatted with special tokens for the model.
+
+        Returns:
+            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+        """
+
+        if already_has_special_tokens:  # delegate to the base class when the ids already contain special tokens
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+            )
+
+        if token_ids_1 is not None:  # mask layout mirrors [CLS] A [SEP] B [SEP]
+            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
+        return [1] + ([0] * len(token_ids_0)) + [1]  # mask layout mirrors [CLS] X [SEP]
+
+    # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.save_vocabulary
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
+        index = 0  # next id expected while writing; used to detect gaps in the vocabulary ids
+        if os.path.isdir(save_directory):  # existing directory: write "<prefix>-vocab.txt" (or "vocab.txt") inside it
+            vocab_file = os.path.join(
+                save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+            )
+        else:  # otherwise save_directory is used as the output file path itself
+            # NOTE(review): the prefix is prepended to the whole path string, not to its basename - confirm intent
+            vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory
+        with open(vocab_file, "w", encoding="utf-8") as writer:
+            for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):  # ascending id order
+                if index != token_index:  # gap or repeat in ids: line numbers will no longer match the ids
+                    logger.warning(
+                        f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive."
+                        " Please check that the vocabulary is not corrupted!"
+                    )
+                    index = token_index  # resync so a single gap is reported once
+                writer.write(token + "\n")  # one token per line, the format load_vocab reads
+                index += 1
+        return (vocab_file,)  # one-element tuple holding the written path
+
+
+# Copied from transformers.models.bert.tokenization_bert.BasicTokenizer
+class BasicTokenizer:
+    """
+    Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.).
+
+    Args:
+        do_lower_case (`bool`, *optional*, defaults to `True`):
+            Whether or not to lowercase the input when tokenizing.
+        never_split (`Iterable`, *optional*):
+            Collection of tokens which will never be split during tokenization. Only has an effect when
+            `do_basic_tokenize=True`
+        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
+            Whether or not to tokenize Chinese characters.
+
+            This should likely be deactivated for Japanese (see this
+            [issue](https://github.com/huggingface/transformers/issues/328)).
+        strip_accents (`bool`, *optional*):
+            Whether or not to strip all accents. If this option is not specified, then it will be determined by the
+            value for `do_lower_case` (as in the original BERT).
+        do_split_on_punc (`bool`, *optional*, defaults to `True`):
+            In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
+            the full context of the words, such as contractions.
+    """
+
+    def __init__(
+        self,
+        do_lower_case=True,
+        never_split=None,
+        tokenize_chinese_chars=True,
+        strip_accents=None,
+        do_split_on_punc=True,
+    ):
+        if never_split is None:
+            never_split = []
+        self.do_lower_case = do_lower_case
+        self.never_split = set(never_split)
+        self.tokenize_chinese_chars = tokenize_chinese_chars
+        self.strip_accents = strip_accents
+        self.do_split_on_punc = do_split_on_punc
+
+    def tokenize(self, text, never_split=None):
+        """
+        Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.
+
+        Args:
+            never_split (`List[str]`, *optional*)
+                Kept for backward compatibility purposes. Now implemented directly at the base class level (see
+                [`PreTrainedTokenizer.tokenize`]). List of tokens not to split.
+        """
+        # union() returns a new set containing the elements of both sets.
+        never_split = self.never_split.union(set(never_split)) if never_split else self.never_split
+        text = self._clean_text(text)
+
+        # This was added on November 1st, 2018 for the multilingual and Chinese
+        # models. This is also applied to the English models now, but it doesn't
+        # matter since the English models were not trained on any Chinese data
+        # and generally don't have any Chinese data in them (there are Chinese
+        # characters in the vocabulary because Wikipedia does have some Chinese
+        # words in the English Wikipedia.).
+        if self.tokenize_chinese_chars:
+            text = self._tokenize_chinese_chars(text)
+        # prevents treating the same character with different unicode codepoints as different characters
+        unicode_normalized_text = unicodedata.normalize("NFC", text)
+        orig_tokens = whitespace_tokenize(unicode_normalized_text)
+        split_tokens = []
+        for token in orig_tokens:
+            if token not in never_split:
+                if self.do_lower_case:
+                    token = token.lower()
+                    if self.strip_accents is not False:
+                        token = self._run_strip_accents(token)
+                elif self.strip_accents:
+                    token = self._run_strip_accents(token)
+            split_tokens.extend(self._run_split_on_punc(token, never_split))
+
+        output_tokens = whitespace_tokenize(" ".join(split_tokens))
+        return output_tokens
+
+    def _run_strip_accents(self, text):
+        """Strips accents from a piece of text."""
+        text = unicodedata.normalize("NFD", text)
+        output = []
+        for char in text:
+            cat = unicodedata.category(char)
+            if cat == "Mn":
+                continue
+            output.append(char)
+        return "".join(output)
+
+    def _run_split_on_punc(self, text, never_split=None):
+        """Splits punctuation on a piece of text."""
+        if not self.do_split_on_punc or (never_split is not None and text in never_split):
+            return [text]
+        chars = list(text)
+        i = 0
+        start_new_word = True
+        output = []
+        while i < len(chars):
+            char = chars[i]
+            if _is_punctuation(char):
+                output.append([char])
+                start_new_word = True
+            else:
+                if start_new_word:
+                    output.append([])
+                start_new_word = False
+                output[-1].append(char)
+            i += 1
+
+        return ["".join(x) for x in output]
+
+    def _tokenize_chinese_chars(self, text):
+        """Adds whitespace around any CJK character."""
+        output = []
+        for char in text:
+            cp = ord(char)
+            if self._is_chinese_char(cp):
+                output.append(" ")
+                output.append(char)
+                output.append(" ")
+            else:
+                output.append(char)
+        return "".join(output)
+
+    def _is_chinese_char(self, cp):
+        """Checks whether CP is the codepoint of a CJK character."""
+        # This defines a "chinese character" as anything in the CJK Unicode block:
+        #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
+        #
+        # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
+        # despite its name. The modern Korean Hangul alphabet is a different block,
+        # as is Japanese Hiragana and Katakana. Those alphabets are used to write
+        # space-separated words, so they are not treated specially and handled
+        # like all of the other languages.
+        if (
+            (cp >= 0x4E00 and cp <= 0x9FFF)
+            or (cp >= 0x3400 and cp <= 0x4DBF)
+            or (cp >= 0x20000 and cp <= 0x2A6DF)
+            or (cp >= 0x2A700 and cp <= 0x2B73F)
+            or (cp >= 0x2B740 and cp <= 0x2B81F)
+            or (cp >= 0x2B820 and cp <= 0x2CEAF)
+            or (cp >= 0xF900 and cp <= 0xFAFF)
+            or (cp >= 0x2F800 and cp <= 0x2FA1F)
+        ):
+            return True
+
+        return False
+
+    def _clean_text(self, text):
+        """Performs invalid character removal and whitespace cleanup on text."""
+        output = []
+        for char in text:
+            cp = ord(char)
+            if cp == 0 or cp == 0xFFFD or _is_control(char):
+                continue
+            if _is_whitespace(char):
+                output.append(" ")
+            else:
+                output.append(char)
+        return "".join(output)
+
+
+# Copied from transformers.models.bert.tokenization_bert.WordpieceTokenizer
+class WordpieceTokenizer:
+    """Runs WordPiece tokenization."""
+
+    def __init__(self, vocab, unk_token, max_input_chars_per_word=100):
+        self.vocab = vocab
+        self.unk_token = unk_token
+        self.max_input_chars_per_word = max_input_chars_per_word
+
+    def tokenize(self, text):
+        """
+        Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform
+        tokenization using the given vocabulary.
+
+        For example, `input = "unaffable"` will return as output `["un", "##aff", "##able"]`.
+
+        Args:
+            text: A single token or whitespace separated tokens. This should have
+                already been passed through *BasicTokenizer*.
+
+        Returns:
+            A list of wordpiece tokens.
+        """
+
+        output_tokens = []
+        for token in whitespace_tokenize(text):
+            chars = list(token)
+            if len(chars) > self.max_input_chars_per_word:
+                output_tokens.append(self.unk_token)
+                continue
+
+            is_bad = False
+            start = 0
+            sub_tokens = []
+            while start < len(chars):
+                end = len(chars)
+                cur_substr = None
+                while start < end:
+                    substr = "".join(chars[start:end])
+                    if start > 0:
+                        substr = "##" + substr
+                    if substr in self.vocab:
+                        cur_substr = substr
+                        break
+                    end -= 1
+                if cur_substr is None:
+                    is_bad = True
+                    break
+                sub_tokens.append(cur_substr)
+                start = end
+
+            if is_bad:
+                output_tokens.append(self.unk_token)
+            else:
+                output_tokens.extend(sub_tokens)
+        return output_tokens
+
+
+__all__ = ["DistilBertTokenizer"]
diff --git a/phivenv/Lib/site-packages/transformers/models/distilbert/tokenization_distilbert_fast.py b/phivenv/Lib/site-packages/transformers/models/distilbert/tokenization_distilbert_fast.py
new file mode 100644
index 0000000000000000000000000000000000000000..c174804dc530c863b14c3ab6e56c18117d3fa4c2
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/distilbert/tokenization_distilbert_fast.py
@@ -0,0 +1,149 @@
+# coding=utf-8
+# Copyright 2018 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization classes for DistilBERT."""
+
+import json
+from typing import Optional
+
+from tokenizers import normalizers
+
+from ...tokenization_utils_fast import PreTrainedTokenizerFast
+from ...utils import logging
+from .tokenization_distilbert import DistilBertTokenizer
+
+
+logger = logging.get_logger(__name__)
+
+VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"}
+
+
+class DistilBertTokenizerFast(PreTrainedTokenizerFast):
+    r"""
+    Construct a "fast" DistilBERT tokenizer (backed by HuggingFace's *tokenizers* library). Based on WordPiece.
+
+    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
+    refer to this superclass for more information regarding those methods.
+
+    Args:
+        vocab_file (`str`):
+            File containing the vocabulary.
+        do_lower_case (`bool`, *optional*, defaults to `True`):
+            Whether or not to lowercase the input when tokenizing.
+        unk_token (`str`, *optional*, defaults to `"[UNK]"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        sep_token (`str`, *optional*, defaults to `"[SEP]"`):
+            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
+            sequence classification or for a text and a question for question answering. It is also used as the last
+            token of a sequence built with special tokens.
+        pad_token (`str`, *optional*, defaults to `"[PAD]"`):
+            The token used for padding, for example when batching sequences of different lengths.
+        cls_token (`str`, *optional*, defaults to `"[CLS]"`):
+            The classifier token which is used when doing sequence classification (classification of the whole sequence
+            instead of per-token classification). It is the first token of the sequence when built with special tokens.
+        mask_token (`str`, *optional*, defaults to `"[MASK]"`):
+            The token used for masking values. This is the token used when training this model with masked language
+            modeling. This is the token which the model will try to predict.
+        clean_text (`bool`, *optional*, defaults to `True`):
+            Whether or not to clean the text before tokenization by removing any control characters and replacing all
+            whitespaces by the classic one.
+        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
+            Whether or not to tokenize Chinese characters. This should likely be deactivated for Japanese (see [this
+            issue](https://github.com/huggingface/transformers/issues/328)).
+        strip_accents (`bool`, *optional*):
+            Whether or not to strip all accents. If this option is not specified, then it will be determined by the
+            value for `do_lower_case` (as in the original BERT).
+        wordpieces_prefix (`str`, *optional*, defaults to `"##"`):
+            The prefix for subwords.
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    model_input_names = ["input_ids", "attention_mask"]
+    slow_tokenizer_class = DistilBertTokenizer
+
+    def __init__(
+        self,
+        vocab_file=None,
+        tokenizer_file=None,
+        do_lower_case=True,
+        unk_token="[UNK]",
+        sep_token="[SEP]",
+        pad_token="[PAD]",
+        cls_token="[CLS]",
+        mask_token="[MASK]",
+        tokenize_chinese_chars=True,
+        strip_accents=None,
+        **kwargs,
+    ):
+        super().__init__(
+            vocab_file,
+            tokenizer_file=tokenizer_file,
+            do_lower_case=do_lower_case,
+            unk_token=unk_token,
+            sep_token=sep_token,
+            pad_token=pad_token,
+            cls_token=cls_token,
+            mask_token=mask_token,
+            tokenize_chinese_chars=tokenize_chinese_chars,
+            strip_accents=strip_accents,
+            **kwargs,
+        )
+
+        normalizer_state = json.loads(self.backend_tokenizer.normalizer.__getstate__())
+        if (
+            normalizer_state.get("lowercase", do_lower_case) != do_lower_case
+            or normalizer_state.get("strip_accents", strip_accents) != strip_accents
+            or normalizer_state.get("handle_chinese_chars", tokenize_chinese_chars) != tokenize_chinese_chars
+        ):
+            normalizer_class = getattr(normalizers, normalizer_state.pop("type"))
+            normalizer_state["lowercase"] = do_lower_case
+            normalizer_state["strip_accents"] = strip_accents
+            normalizer_state["handle_chinese_chars"] = tokenize_chinese_chars
+            self.backend_tokenizer.normalizer = normalizer_class(**normalizer_state)
+
+        self.do_lower_case = do_lower_case
+
+    # Copied from transformers.models.bert.tokenization_bert_fast.BertTokenizerFast.build_inputs_with_special_tokens
+    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
+        """
+        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
+        adding special tokens. A BERT sequence has the following format:
+
+        - single sequence: `[CLS] X [SEP]`
+        - pair of sequences: `[CLS] A [SEP] B [SEP]`
+
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs to which the special tokens will be added.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
+        """
+        output = [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
+
+        if token_ids_1 is not None:
+            output += token_ids_1 + [self.sep_token_id]
+
+        return output
+
+    # Copied from transformers.models.bert.tokenization_bert_fast.BertTokenizerFast.save_vocabulary
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
+        files = self._tokenizer.model.save(save_directory, name=filename_prefix)
+        return tuple(files)
+
+
+__all__ = ["DistilBertTokenizerFast"]
diff --git a/phivenv/Lib/site-packages/transformers/models/dit/__init__.py b/phivenv/Lib/site-packages/transformers/models/dit/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/phivenv/Lib/site-packages/transformers/models/dit/__pycache__/__init__.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/dit/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f71268cb7f0d87b6c9966dec51f59f7c9ef4492e
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/dit/__pycache__/__init__.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/doge/__init__.py b/phivenv/Lib/site-packages/transformers/models/doge/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..91aeca0340f162439b6b7485f325f5343802dd6e
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/doge/__init__.py
@@ -0,0 +1,28 @@
+# coding=utf-8
+# Copyright 2025 Jingze Shi and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_doge import *
+    from .modeling_doge import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/phivenv/Lib/site-packages/transformers/models/doge/__pycache__/__init__.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/doge/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9aa0daa84817d8e2beea656100c4287cd56193f9
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/doge/__pycache__/__init__.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/doge/__pycache__/configuration_doge.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/doge/__pycache__/configuration_doge.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..79346a32d48bdf075735bd02767bfd513d608d1a
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/doge/__pycache__/configuration_doge.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/doge/__pycache__/modeling_doge.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/doge/__pycache__/modeling_doge.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..816fc9e6a35eaa1c9772c12ef507dcb1b9a7364b
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/doge/__pycache__/modeling_doge.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/doge/__pycache__/modular_doge.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/doge/__pycache__/modular_doge.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..34213efe8025a43516201953ff2887924448cd07
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/doge/__pycache__/modular_doge.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/doge/configuration_doge.py b/phivenv/Lib/site-packages/transformers/models/doge/configuration_doge.py
new file mode 100644
index 0000000000000000000000000000000000000000..f3a93fa198f2073ca0e34e613a7d9bed01f892d0
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/doge/configuration_doge.py
@@ -0,0 +1,241 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/doge/modular_doge.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_doge.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# coding=utf-8
+# Copyright 2025 Jingze Shi and the HuggingFace Inc. team. All rights reserved.
+#
+# The Doge family of small language models is trained by SmallDoge Team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from ...configuration_utils import PretrainedConfig
+from ...modeling_rope_utils import rope_config_validation
+
+
+class DogeConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`DogeModel`]. It is used to instantiate a Doge
+    model according to the specified arguments, defining the model architecture like [SmallDoge/Doge-320M](https://huggingface.co/SmallDoge/Doge-320M).
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 32768):
+            Vocabulary size of the Doge model. Defines the number of different tokens that can be represented by the `input_ids` passed when calling [`DogeModel`].
+        hidden_size (`int`, *optional*, defaults to 1024):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 2048):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 32):
+            Number of hidden layers in the Transformer decoder.
+        hidden_dropout (`float`, *optional*, defaults to 0.0):
+            Dropout probability for each sequence transformation and state transformation module.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the decoder.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether the model's input and output word embeddings should be tied.
+        max_position_embeddings (`int`, *optional*, defaults to 2048):
+            The maximum sequence length that this model might ever be used with.
+        rope_theta (`float`, *optional*, defaults to 10000.0):
+            The base period of the RoPE embeddings.
+        rope_scaling (`Dict`, *optional*):
+            Dictionary containing the scaling configuration for the RoPE embeddings.
+            NOTE: if you apply new rope type and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value accordingly.
+            Doge family of small models use `{ 'rope_type': 'dynamic', 'factor': 4.0, 'original_max_position_embeddings': 2048 }` as the default value.
+            Expected contents:
+                `rope_type` (`str`):
+                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', 'llama3'], with 'default' being the original RoPE implementation.
+                `factor` (`float`, *optional*):
+                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings.
+                    In most scaling types, a `factor` of x will enable the model to handle sequences of length x * original maximum pre-trained length.
+                `original_max_position_embeddings` (`int`, *optional*):
+                    Used with 'dynamic', 'longrope' and 'llama3'.
+                    The original max position embeddings used during pretraining.
+                `attention_factor` (`float`, *optional*):
+                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
+                    computation.
+                    If unspecified, it defaults to value recommended by the implementation, using the `factor` field to infer the suggested value.
+                `beta_fast` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 32.
+                `beta_slow` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 1.
+                `short_factor` (`List[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<`original_max_position_embeddings`).
+                    Must be a list of numbers with the same length as the hidden size divided by the number of attention heads divided by 2
+                `long_factor` (`List[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to long contexts (>`original_max_position_embeddings`).
+                    Must be a list of numbers with the same length as the hidden size divided by the number of attention heads divided by 2
+                `low_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
+                `high_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
+        num_attention_heads (`int`, *optional*, defaults to 8):
+            Number of attention heads for each attention layer in the Transformer decoder.
+        num_key_value_heads (`int`, *optional*):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention.
+            If `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used.
+            When converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed by mean-pooling all the original heads within that group.
+            For more details, check out [this paper](https://huggingface.co/papers/2305.13245).
+            If it is not specified, will default to `num_attention_heads`.
+        attention_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use a bias in the query, key, value and output projection layers during self-attention.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        mlp_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use a bias in up_proj, down_proj and gate_proj layers in the MLP layers.
+        sliding_window (`int`, *optional*):
+            Sliding window attention window size. If not specified, will default to `None`.
+        keep_window_size (`int`, *optional*, defaults to 2048):
+            The window size of tokens that are not dynamically masked, and dynamic masking is only performed when the sequence length exceeds this value.
+        is_moe (`bool`, *optional*, defaults to `False`):
+            Whether to use the Cross Domain Mixture of Experts. If `True`, the MoE will inherit the MLP to initialize.
+        num_experts (`int`, *optional*, defaults to 16384):
+            Number of routed experts in the model. This is only used when `is_moe=True`.
+        num_experts_per_tok (`int`, *optional*, defaults to 64):
+            Number of selected experts to route per-token.
+        norm_topk_prob (`bool`, *optional*, defaults to `False`):
+            Whether to normalize the topk probabilities.
+        output_router_logits (`bool`, *optional*, defaults to `False`):
+            Whether or not the router logits should be returned by the model. Enabling this will also
+            allow the model to output the auxiliary loss, including load balancing loss and router z-loss.
+        router_aux_loss_coef (`float`, *optional*, defaults to 0.001):
+            The aux loss factor for the total loss.
+
+    ```python
+    >>> from transformers import DogeConfig, DogeModel
+
+    >>> # Initializing a Doge-320M style configuration
+    >>> configuration = DogeConfig()
+
+    >>> # Initializing a model from the Doge-320M style configuration
+    >>> model = DogeModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "doge"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    # Default tensor parallel plan for base model `DogeModel`
+    base_model_tp_plan = {
+        "layers.*.self_attn.q_proj": "colwise",
+        "layers.*.self_attn.k_proj": "colwise",
+        "layers.*.self_attn.v_proj": "colwise",
+        "layers.*.self_attn.dt_proj": "rowwise",
+        "layers.*.self_attn.o_proj": "rowwise",
+        "layers.*.input_layernorm.weight": "sequence_parallel",
+        "layers.*.input_residual.weight": "sequence_parallel",
+        "layers.*.post_attention_layernorm.weight": "sequence_parallel",
+        "layers.*.post_attention_residual.weight": "sequence_parallel",
+        "norm.weight": "sequence_parallel",
+        "layers.*.mlp.gate_proj": "colwise",
+        "layers.*.mlp.up_proj": "colwise",
+        "layers.*.mlp.down_proj": "rowwise",
+        "layers.*.mlp.router_gate": "colwise_rep",
+        "layers.*.mlp.down_embed": "rowwise_rep",
+        "layers.*.mlp.up_embed": "rowwise_rep",
+    }
+    base_model_pp_plan = {
+        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
+        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
+        "norm": (["hidden_states"], ["hidden_states"]),
+    }
+
+    def __init__(
+        self,
+        vocab_size=32768,
+        hidden_size=1024,
+        intermediate_size=2048,
+        num_hidden_layers=32,
+        hidden_dropout=0.0,
+        hidden_act="silu",
+        initializer_range=0.02,
+        rms_norm_eps=1e-06,
+        use_cache=True,
+        tie_word_embeddings=False,
+        max_position_embeddings=2048,
+        rope_theta=10000.0,
+        rope_scaling=None,
+        num_attention_heads=8,
+        num_key_value_heads=None,
+        attention_bias=False,
+        attention_dropout=0.0,
+        mlp_bias=False,
+        sliding_window=None,
+        keep_window_size=2048,
+        is_moe=False,
+        num_experts=16384,
+        num_experts_per_tok=64,
+        norm_topk_prob=False,
+        output_router_logits=False,
+        router_aux_loss_coef=0.001,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+
+        self.hidden_dropout = hidden_dropout
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+
+        self.max_position_embeddings = max_position_embeddings
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self.num_attention_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+        self.mlp_bias = mlp_bias
+        self.sliding_window = sliding_window
+        self.keep_window_size = keep_window_size
+        self.is_moe = is_moe
+        self.num_experts = num_experts
+        self.num_experts_per_tok = num_experts_per_tok
+        self.norm_topk_prob = norm_topk_prob
+        self.output_router_logits = output_router_logits
+        self.router_aux_loss_coef = router_aux_loss_coef
+
+        # Validate the correctness of rotary position embeddings parameters
+        # BC: if there is a 'type' field, copy it to 'rope_type'.
+        if self.rope_scaling is not None and "type" in self.rope_scaling:
+            self.rope_scaling["rope_type"] = self.rope_scaling["type"]
+        rope_config_validation(self)
+
+        # for backward compatibility
+        if num_key_value_heads is None:
+            self.num_key_value_heads = num_attention_heads
+
+        super().__init__(
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+
+__all__ = ["DogeConfig"]
diff --git a/phivenv/Lib/site-packages/transformers/models/doge/modeling_doge.py b/phivenv/Lib/site-packages/transformers/models/doge/modeling_doge.py
new file mode 100644
index 0000000000000000000000000000000000000000..b94667398507e8b8a57b451e0a4573dfa69cb9d8
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/doge/modeling_doge.py
@@ -0,0 +1,812 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/doge/modular_doge.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_doge.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# coding=utf-8
+# Copyright 2025 Jingze Shi and the HuggingFace Inc. team. All rights reserved.
+#
+# The Doge family of small language models is trained by SmallDoge Team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+from typing import Callable, Optional, Union
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from ...activations import ACT2FN
+from ...cache_utils import Cache, DynamicCache
+from ...generation import GenerationMixin
+from ...integrations import use_kernel_forward_from_hub
+from ...integrations.flex_attention import compile_friendly_flex_attention
+from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask
+from ...modeling_layers import GenericForSequenceClassification, GradientCheckpointingLayer
+from ...modeling_outputs import MoeCausalLMOutputWithPast, MoeModelOutputWithPast
+from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
+from ...modeling_utils import AttentionInterface, PreTrainedModel
+from ...processing_utils import Unpack
+from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, is_torch_flex_attn_available
+from ...utils.deprecation import deprecate_kwarg
+from ...utils.generic import OutputRecorder, check_model_inputs
+from .configuration_doge import DogeConfig
+
+
+if is_torch_flex_attn_available():  # `BlockMask` is only bound when flex attention is available
+    from torch.nn.attention.flex_attention import BlockMask
+
+
+@use_kernel_forward_from_hub("RMSNorm")
+class DogeRMSNorm(nn.Module):
+    """RMS normalization with a learnable per-feature scale (equivalent to T5LayerNorm)."""
+
+    def __init__(self, hidden_size, eps=1e-6):
+        super().__init__()
+        self.variance_epsilon = eps
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+
+    def forward(self, hidden_states):
+        # The statistics are computed in float32; the result is cast back before scaling.
+        original_dtype = hidden_states.dtype
+        upcast = hidden_states.to(torch.float32)
+        mean_square = upcast.pow(2).mean(-1, keepdim=True)
+        normalized = upcast * torch.rsqrt(mean_square + self.variance_epsilon)
+        return self.weight * normalized.to(original_dtype)
+
+    def extra_repr(self):
+        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
+
+
+class DogeRotaryEmbedding(nn.Module):
+    """Builds the cos/sin rotary position tables for a batch of position ids."""
+
+    inv_freq: torch.Tensor  # fix linting for `register_buffer`
+
+    def __init__(self, config: DogeConfig, device=None):
+        super().__init__()
+        # BC: older configs keep the RoPE variant under "type" rather than "rope_type"
+        scaling_cfg = getattr(config, "rope_scaling", None)
+        if isinstance(scaling_cfg, dict):
+            self.rope_type = scaling_cfg.get("rope_type", scaling_cfg.get("type"))
+        else:
+            self.rope_type = "default"
+        self.max_seq_len_cached = self.original_max_seq_len = config.max_position_embeddings
+
+        self.config = config
+        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
+        base_inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
+        self.register_buffer("inv_freq", base_inv_freq, persistent=False)
+        self.original_inv_freq = self.inv_freq
+
+    @torch.no_grad()
+    @dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
+    def forward(self, x, position_ids):
+        batch = position_ids.shape[0]
+        inv_freq = self.inv_freq[None, :, None].float().expand(batch, -1, 1).to(x.device)
+        positions = position_ids[:, None, :].float()
+        # "mps" and non-string device types fall back to "cpu" for the autocast context
+        device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
+        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+            angles = (inv_freq.float() @ positions.float()).transpose(1, 2)
+            table = torch.cat((angles, angles), dim=-1)
+            cos, sin = table.cos() * self.attention_scaling, table.sin() * self.attention_scaling
+        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
+
+
+def rotate_half(x):
+    """Rotates half the hidden dims of the input: (first, second) -> (-second, first)."""
+    midpoint = x.shape[-1] // 2
+    first, second = x[..., :midpoint], x[..., midpoint:]
+    return torch.cat((-second, first), dim=-1)
+
+
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
+    """Rotate the query and key tensors with Rotary Position Embedding.
+
+    Args:
+        q (`torch.Tensor`): The query tensor.
+        k (`torch.Tensor`): The key tensor.
+        cos (`torch.Tensor`): The cosine part of the rotary embedding.
+        sin (`torch.Tensor`): The sine part of the rotary embedding.
+        position_ids (`torch.Tensor`, *optional*):
+            Deprecated and unused.
+        unsqueeze_dim (`int`, *optional*, defaults to 1):
+            Dimension at which a singleton axis is inserted into `cos` and `sin` so that they broadcast against
+            `q` and `k`. With `cos`/`sin` of shape [batch_size, seq_len, head_dim], use 1 when `q` and `k` are
+            [batch_size, heads, seq_len, head_dim] and 2 when they are [batch_size, seq_len, heads, head_dim].
+
+    Returns:
+        `tuple(torch.Tensor)`: the rotated query and key tensors, in that order.
+    """
+
+    def _rotate(states, cos_term, sin_term):
+        return (states * cos_term) + (rotate_half(states) * sin_term)
+
+    cos_b = cos.unsqueeze(unsqueeze_dim)
+    sin_b = sin.unsqueeze(unsqueeze_dim)
+    return _rotate(q, cos_b, sin_b), _rotate(k, cos_b, sin_b)
+
+
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """
+    Equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep): takes the hidden states from (batch,
+    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim).
+    """
+    bsz, kv_heads, seq_len, dim = hidden_states.shape
+    if n_rep != 1:
+        expanded = hidden_states[:, :, None, :, :].expand(bsz, kv_heads, n_rep, seq_len, dim)
+        hidden_states = expanded.reshape(bsz, kv_heads * n_rep, seq_len, dim)
+    return hidden_states
+
+
+def eager_attention_forward(
+    module: nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: Optional[torch.Tensor],
+    scaling: float,
+    dropout: float = 0.0,
+    **kwargs: Unpack[TransformersKwargs],
+):
+    """Matmul/softmax attention; returns the output (dims 1 and 2 transposed) and the attention weights."""
+    groups = module.num_key_value_groups
+    expanded_key, expanded_value = repeat_kv(key, groups), repeat_kv(value, groups)
+
+    scores = torch.matmul(query, expanded_key.transpose(2, 3)) * scaling
+    if attention_mask is not None:
+        # additive mask, cropped to the key length
+        scores = scores + attention_mask[:, :, :, : expanded_key.shape[-2]]
+
+    # softmax runs in float32 and is cast back to the query dtype
+    probs = nn.functional.softmax(scores, dim=-1, dtype=torch.float32).to(query.dtype)
+    probs = nn.functional.dropout(probs, p=dropout, training=module.training)
+    context = torch.matmul(probs, expanded_value).transpose(1, 2).contiguous()
+    return context, probs
+
+
+def flex_attention_forward(
+    module: nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: Union[torch.Tensor, "BlockMask"],
+    scaling: Optional[float] = None,
+    softcap: Optional[float] = None,
+    head_mask: Optional[torch.Tensor] = None,
+    **kwargs,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    """Doge flex-attention backend: runs flex attention with either a `BlockMask` or an additive dense mask."""
+    block_mask = None
+    causal_mask = None
+    if isinstance(attention_mask, BlockMask):
+        block_mask = attention_mask
+    else:
+        causal_mask = attention_mask
+    if causal_mask is not None:
+        causal_mask = causal_mask[:, :, :, : key.shape[-2]]
+    # score_mod applies, in order: optional tanh soft-capping, the dense mask, then the head mask.
+    def score_mod(score, batch_idx, head_idx, q_idx, kv_idx):
+        if softcap is not None:
+            score = softcap * torch.tanh(score / softcap)
+        if causal_mask is not None:
+            score = score + causal_mask[batch_idx][head_idx][q_idx][kv_idx]
+        if head_mask is not None:
+            score = score + head_mask[batch_idx][head_idx][0][0]
+        return score
+
+    attn_output, attention_weights = compile_friendly_flex_attention(
+        query,
+        key,
+        value,
+        score_mod=score_mod,
+        block_mask=block_mask,
+        enable_gqa=True,
+        scale=scaling,
+        # Last time checked on PyTorch == 2.5.1: Flex Attention always computes the lse regardless.
+        # For simplification, we thus always return it as no additional computations are introduced.
+        return_lse=True,
+    )
+    # lse is returned in float32, so the second return value is the lse cast to the value dtype
+    attention_weights = attention_weights.to(value.dtype)
+    attn_output = attn_output.transpose(1, 2).contiguous()
+
+    return attn_output, attention_weights
+
+
+ALL_ATTENTION_FUNCTIONS = AttentionInterface()  # backends looked up by `config._attn_implementation`
+ALL_ATTENTION_FUNCTIONS["doge_flex_attention"] = flex_attention_forward  # register the Doge flex backend
+
+
+class DogeAttention(nn.Module):
+    """Attention layer whose additive mask is derived from the value states (dynamic mask) before attending."""
+
+    def __init__(self, config: DogeConfig, layer_idx: Optional[int] = None):
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+        self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
+        self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
+        self.scaling = self.head_dim**-0.5
+        self.attention_dropout = config.attention_dropout
+        self.keep_window_size = config.keep_window_size
+
+        self.q_proj = nn.Linear(
+            config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.k_proj = nn.Linear(
+            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.v_proj = nn.Linear(
+            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
+        )
+        # dynamic mask for the QK^T attention weights matrix
+        self.A = nn.Parameter(torch.zeros(config.num_key_value_heads))
+        self.dt_proj = nn.Linear(
+            config.num_key_value_heads * self.head_dim, config.num_key_value_heads, bias=config.attention_bias
+        )
+        self.o_proj = nn.Linear(
+            config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
+        )
+        self.q_norm = DogeRMSNorm(self.head_dim, eps=config.rms_norm_eps)
+        self.k_norm = DogeRMSNorm(self.head_dim, eps=config.rms_norm_eps)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
+        attention_mask: Optional[torch.Tensor] = None,
+        past_key_values: Optional[Cache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
+        """Return `(attn_output, attn_weights)` for `hidden_states`, updating `past_key_values` when given."""
+        input_shape = hidden_states.shape[:-1]
+        hidden_shape = (*input_shape, -1, self.head_dim)
+
+        query_states = self.q_norm(self.q_proj(hidden_states).view(hidden_shape)).transpose(1, 2)
+        key_states = self.k_norm(self.k_proj(hidden_states).view(hidden_shape)).transpose(1, 2)
+        value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+
+        cos, sin = position_embeddings
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+
+        if past_key_values is not None:
+            # sin and cos are specific to RoPE models; cache_position needed for the static cache
+            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        # calculate dynamic mask from value_states
+        dt_states = self.dt_proj(
+            value_states.transpose(1, 2).reshape(value_states.shape[0], value_states.shape[-2], -1)
+        )
+        dt_states = torch.exp(self.A * F.softplus(dt_states)).transpose(-1, -2)
+        attn_mask = self.prepare_dynamic_mask(
+            hidden_states=hidden_states,
+            dt_states=dt_states,
+            keep_window_size=self.keep_window_size,
+            attention_mask=attention_mask,
+        )
+        attn_mask = repeat_kv(attn_mask, self.num_key_value_groups)
+
+        attention_interface: Callable = eager_attention_forward
+        if self.config._attn_implementation != "eager":
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask=attn_mask,
+            dropout=0.0 if not self.training else self.attention_dropout,
+            scaling=self.scaling,
+            **kwargs,
+        )
+
+        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
+        attn_output = self.o_proj(attn_output)
+        return attn_output, attn_weights
+
+    def prepare_dynamic_mask(
+        self,
+        hidden_states: torch.Tensor,
+        dt_states: torch.Tensor,
+        keep_window_size: int = 2048,
+        attention_mask: Optional[torch.Tensor] = None,
+    ):
+        """
+        The core idea of DMA is to calculate the dynamic attention mask to mask the tokens that should be masked, so as to form sparse attention.
+
+        Combine `dt_states` with `attention_mask` to generate the final `attn_mask`.
+
+        Args:
+            hidden_states (`torch.Tensor`): The input hidden_states, used to determine the minimum value of the current input precision.
+            dt_states (`torch.Tensor`): dt_states of shape `(batch_size, num_heads, key_sequence_length)`.
+            keep_window_size (`int`): The window size of tokens that are not dynamically masked, and dynamic masking is only performed when the sequence length exceeds this value.
+            attention_mask (`torch.Tensor`, *optional*): attention mask of shape `(batch_size, 1, query_sequence_length, key_sequence_length)`.
+        """
+        min_dtype = torch.finfo(hidden_states.dtype).min
+        dtype = hidden_states.dtype
+        attn_mask = dt_states[:, :, None, :].expand(-1, -1, hidden_states.shape[1], -1)  # [batch, heads, q_len, k_len]
+        # `BlockMask` is only imported when flex attention is available, so test for a dense tensor instead.
+        if isinstance(attention_mask, torch.Tensor):
+            if attention_mask.dtype == torch.bool:
+                zero = torch.tensor(0.0, device=attention_mask.device, dtype=dtype)
+                attention_mask = torch.where(attention_mask, zero, min_dtype)
+            attn_mask = attn_mask.masked_fill(attention_mask[:, :, :, : attn_mask.shape[-1]] != 0, min_dtype)
+        if attn_mask.shape[-1] > keep_window_size:
+            active_mask = torch.zeros_like(attn_mask, dtype=dtype, device=attn_mask.device)
+            topk_indices = torch.topk(attn_mask, keep_window_size, dim=-1, largest=True, sorted=False).indices
+            active_mask = active_mask.scatter(-1, topk_indices, 1.0)
+            attn_mask = attn_mask.masked_fill(active_mask == 0.0, min_dtype)
+        return attn_mask
+
+
+class DogeMLP(nn.Module):
+    """Gated feed-forward block: down_proj(act_fn(gate_proj(x)) * up_proj(x))."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.hidden_size, self.intermediate_size = config.hidden_size, config.intermediate_size
+        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
+        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
+        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.mlp_bias)
+        self.act_fn = ACT2FN[config.hidden_act]
+
+    def forward(self, x):
+        return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
+
+
+class DogeCDMoE(nn.Module):
+    """Mixture of experts: a shared gated MLP plus `top_k` embedding experts chosen by a two-key router."""
+
+    def __init__(self, config: DogeConfig):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.intermediate_size = config.intermediate_size
+        self.act_fn = ACT2FN[config.hidden_act]
+
+        self.num_experts = config.num_experts
+        self.num_keys = math.floor(math.sqrt(self.num_experts))
+        self.top_k = config.num_experts_per_tok
+        self.norm_topk_prob = config.norm_topk_prob
+
+        # shared expert
+        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
+        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
+        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.mlp_bias)
+
+        # router gate for retrieval experts
+        self.router_gate = nn.Linear(self.hidden_size, self.num_keys * 2, bias=False)
+
+        # routed experts
+        self.down_embed = nn.Embedding(self.num_experts, self.hidden_size)
+        self.up_embed = nn.Embedding(self.num_experts, self.hidden_size)
+
+    def forward(self, hidden_states: torch.Tensor, **kwargs) -> tuple[torch.Tensor, torch.Tensor]:
+        """Return `(hidden_states, router_logits)`: shared-expert output plus the routed experts' output."""
+        bsz, seq_len, _ = hidden_states.shape
+
+        # get routing logits with router gate
+        router_logits = self.router_gate(hidden_states).view(2, bsz * seq_len, -1)
+
+        # get experts with the highest routing logits
+        (scores_x, scores_y), (indices_x, indices_y) = router_logits.topk(self.num_keys, dim=-1)
+        all_scores = scores_x.unsqueeze(-1) + scores_y.unsqueeze(-2)
+        all_indices = indices_x.unsqueeze(-1) * self.num_keys + indices_y.unsqueeze(-2)
+        all_scores = all_scores.view(*all_scores.shape[:-2], -1)
+        all_indices = all_indices.view(*all_indices.shape[:-2], -1)
+        scores, position_indices = all_scores.topk(self.top_k, dim=-1)
+        indices = all_indices.gather(-1, position_indices)
+        routing_weights = F.softmax(scores, dim=-1)
+        if self.norm_topk_prob:
+            # out-of-place on purpose: autograd saves the softmax output, so it must not be modified in place
+            routing_weights = routing_weights / routing_weights.sum(dim=-1, keepdim=True)
+
+        # mix routed experts states with shared expert states
+        down_embed = self.down_embed(indices)
+        up_embed = self.up_embed(indices)
+        experts_weights = torch.matmul(down_embed, hidden_states.view(bsz * seq_len, -1, 1)).view(bsz * seq_len, -1)
+        experts_weights = self.act_fn(experts_weights) * routing_weights
+        experts_states = torch.matmul(experts_weights.view(bsz * seq_len, 1, -1), up_embed).view(bsz, seq_len, -1)
+        hidden_states = self.down_proj(self.act_fn(self.gate_proj(hidden_states)) * self.up_proj(hidden_states))
+        hidden_states = hidden_states + experts_states
+        return hidden_states, router_logits
+
+
+class DogeDecoderLayer(GradientCheckpointingLayer):
+    """Decoder block: self-attention, then an MLP or MoE, each followed by dropout and a scaled residual."""
+
+    def __init__(self, config: DogeConfig, layer_idx: Optional[int] = None):
+        super().__init__()
+        self.hidden_dropout = config.hidden_dropout
+        self.input_layernorm = DogeRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.self_attn = DogeAttention(config=config, layer_idx=layer_idx)
+        self.input_residual = nn.Parameter(torch.ones(config.hidden_size))
+        self.post_attention_layernorm = DogeRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.mlp = DogeMLP(config) if not config.is_moe else DogeCDMoE(config)
+        self.post_attention_residual = nn.Parameter(torch.ones(config.hidden_size))
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[tuple[torch.Tensor]] = None,
+        use_cache: Optional[bool] = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> torch.FloatTensor:
+        """Run the attention and MLP/MoE sub-layers on `hidden_states` and return the updated hidden states."""
+        # sequence transformation
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+        hidden_states, _ = self.self_attn(
+            hidden_states=hidden_states,
+            position_embeddings=position_embeddings,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            **kwargs,
+        )
+        hidden_states = F.dropout(hidden_states, p=self.hidden_dropout, training=self.training)
+        hidden_states = self.input_residual * residual + hidden_states
+
+        # state transformation
+        residual = hidden_states
+        mlp_output = self.mlp(self.post_attention_layernorm(hidden_states))
+        hidden_states = mlp_output[0] if isinstance(mlp_output, tuple) else mlp_output  # MoE returns a tuple
+        hidden_states = F.dropout(hidden_states, p=self.hidden_dropout, training=self.training)
+        hidden_states = self.post_attention_residual * residual + hidden_states
+        return hidden_states
+
+
+@auto_docstring
+class DogePreTrainedModel(PreTrainedModel):
+    config: DogeConfig
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["DogeDecoderLayer"]
+    _skip_keys_device_placement = ["past_key_values"]
+    _supports_flash_attn = False
+    _supports_sdpa = True
+    _supports_flex_attn = True
+    _can_compile_fullgraph = False
+    _supports_attention_backend = True
+    _can_record_outputs = {
+        "router_logits": OutputRecorder(DogeCDMoE, index=1),  # second element of the MoE forward output
+        "hidden_states": DogeDecoderLayer,
+        "attentions": DogeAttention,
+    }
+
+    def _init_weights(self, module):
+        """Run the parent initialization, then reset `A` to zero and the residual scales to one."""
+        super()._init_weights(module)
+        if isinstance(module, DogeAttention):
+            if hasattr(module, "A"):
+                module.A.data.zero_()
+            return
+        if isinstance(module, DogeDecoderLayer):
+            for scale_name in ("input_residual", "post_attention_residual"):
+                if hasattr(module, scale_name):
+                    getattr(module, scale_name).data.fill_(1.0)
+
+
+@auto_docstring
+class DogeModel(DogePreTrainedModel):
+    # Token embedding -> `num_hidden_layers` DogeDecoderLayer blocks -> final RMS norm.
+    def __init__(self, config: DogeConfig):
+        super().__init__(config)
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+        decoder_layers = [DogeDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+        self.layers = nn.ModuleList(decoder_layers)
+        self.norm = DogeRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.rotary_emb = DogeRotaryEmbedding(config=config)
+        self.gradient_checkpointing = False
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @check_model_inputs
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> MoeModelOutputWithPast:
+        if (input_ids is None) == (inputs_embeds is None):
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+        if use_cache and past_key_values is None:
+            past_key_values = DynamicCache(config=self.config)
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+
+        # default positions continue after the tokens already held in the cache
+        if cache_position is None:
+            seen = past_key_values.get_seq_length() if past_key_values is not None else 0
+            cache_position = torch.arange(seen, seen + inputs_embeds.shape[1], device=inputs_embeds.device)
+        if position_ids is None:
+            position_ids = cache_position.unsqueeze(0)
+
+        if self.config.sliding_window is None:
+            build_mask = create_causal_mask
+        else:
+            build_mask = create_sliding_window_causal_mask
+        causal_mask = build_mask(
+            config=self.config,
+            input_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            cache_position=cache_position,
+            past_key_values=past_key_values,
+            position_ids=position_ids,
+        )
+
+        # position embeddings are computed once and shared across the decoder layers
+        position_embeddings = self.rotary_emb(inputs_embeds, position_ids)
+
+        hidden_states = inputs_embeds
+        for decoder_layer in self.layers[: self.config.num_hidden_layers]:
+            hidden_states = decoder_layer(
+                hidden_states,
+                position_embeddings=position_embeddings,
+                attention_mask=causal_mask,
+                position_ids=position_ids,
+                past_key_values=past_key_values,
+                use_cache=use_cache,
+                cache_position=cache_position,
+                **kwargs,
+            )
+
+        hidden_states = self.norm(hidden_states)
+
+        return MoeModelOutputWithPast(  # only diff with Mistral is the output type, we need MoE
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values,
+        )
+
+
+def load_balancing_loss_func(
+    gate_logits: Union[torch.Tensor, tuple[torch.Tensor], None],
+    num_experts: Optional[int] = None,
+    num_keys: Optional[int] = None,
+    top_k: int = 2,
+    attention_mask: Optional[torch.Tensor] = None,
+) -> Union[torch.Tensor, int]:
+    r"""
+    Computes auxiliary load balancing loss as in Switch Transformer - implemented in Pytorch.
+
+    See Switch Transformer (https://huggingface.co/papers/2101.03961) for more details. This function implements the loss
+    function presented in equations (4) - (6) of the paper. It aims at penalizing cases where the routing between
+    experts is too unbalanced.
+
+    Args:
+        gate_logits:
+            Logits from the `router_gate`, should be a tuple of model.config.num_hidden_layers tensors of
+            shape [2, batch_size * sequence_length, num_keys].
+        num_experts:
+            Number of experts
+        num_keys:
+            Number of keys
+        top_k:
+            The number of experts to route per-token, can be also interpreted as the `top-k` routing
+            parameter.
+        attention_mask (`torch.Tensor`, *optional*):
+            The attention_mask used in forward function
+            shape [batch_size X sequence_length] if not None.
+
+    Returns:
+        The auxiliary loss, or the integer `0` when `gate_logits` is `None` or not a tuple.
+    """
+    if gate_logits is None or not isinstance(gate_logits, tuple):
+        return 0
+
+    compute_dtype = gate_logits[0].dtype
+    compute_device = gate_logits[0].device
+    all_expert_indices = []
+    all_routing_weights = []
+    # Per layer: rebuild the selected expert indices and the routing distribution from the raw logits.
+    for layer_gate_logits in gate_logits:
+        layer_gate_logits = layer_gate_logits.to(compute_device)
+        # top `num_keys` scores/indices for each of the two key sets (first dimension of size 2)
+        (scores_x, scores_y), (indices_x, indices_y) = layer_gate_logits.topk(num_keys, dim=-1)
+        # combine every (x, y) key pair: scores add up, indices become x * num_keys + y
+        all_scores = scores_x.unsqueeze(-1) + scores_y.unsqueeze(-2)
+        all_indices = indices_x.unsqueeze(-1) * num_keys + indices_y.unsqueeze(-2)
+        all_scores = all_scores.view(*all_scores.shape[:-2], -1)
+        all_indices = all_indices.view(*all_indices.shape[:-2], -1)
+
+        _, position_indices = all_scores.topk(top_k, dim=-1)
+        expert_indices = all_indices.gather(-1, position_indices)
+        # NOTE: the softmax covers all combined pair scores, not only the selected `top_k`
+        routing_weights = F.softmax(all_scores, dim=-1)
+
+        all_expert_indices.append(expert_indices)
+        all_routing_weights.append(routing_weights)
+    all_expert_indices = torch.cat(all_expert_indices, dim=0)
+    all_routing_weights = torch.cat(all_routing_weights, dim=0)
+
+    if attention_mask is None:
+        # Compute the percentage of tokens routed to each expert
+        all_expert_indices = all_expert_indices.view(-1)
+        tokens_per_expert = torch.zeros(num_experts, dtype=compute_dtype, device=compute_device)
+        pad = torch.ones_like(all_expert_indices, dtype=compute_dtype, device=compute_device)
+        tokens_per_expert = tokens_per_expert.scatter_add_(0, all_expert_indices, pad) / all_expert_indices.shape[0]
+
+        # Compute the average probability of routing to these experts
+        router_prob_per_expert = torch.mean(all_routing_weights, dim=0)
+    else:
+        batch_size, sequence_length = attention_mask.shape
+        num_hidden_layers = len(gate_logits)
+
+        #  Compute the mask that masks all padding tokens as 0 with the same shape of expert_mask
+        expert_attention_mask = (
+            attention_mask[None, :, :, None]
+            .expand((num_hidden_layers, batch_size, sequence_length, top_k))
+            .reshape(-1)
+            .to(compute_device)
+        )
+        all_expert_indices = all_expert_indices.view(-1)[expert_attention_mask.bool()]
+
+        # Compute the percentage of tokens routed to each expert (padding positions are excluded)
+        tokens_per_expert = torch.zeros(num_experts, dtype=compute_dtype, device=compute_device)
+        pad = torch.ones_like(all_expert_indices, dtype=compute_dtype, device=compute_device)
+        tokens_per_expert = tokens_per_expert.scatter_add_(0, all_expert_indices, pad) / torch.sum(
+            expert_attention_mask
+        )
+
+        # Compute the mask that masks all padding tokens as 0 with the same shape of tokens_per_expert
+        router_per_expert_attention_mask = (
+            attention_mask[None, :, :, None]
+            .expand((num_hidden_layers, batch_size, sequence_length, num_experts))
+            .reshape(-1, num_experts)
+            .to(compute_device)
+        )
+
+        # Compute the average probability of routing to these experts
+        router_prob_per_expert = torch.sum(all_routing_weights * router_per_expert_attention_mask, dim=0) / torch.sum(
+            router_per_expert_attention_mask, dim=0
+        )
+
+    overall_loss = torch.sum(tokens_per_expert * router_prob_per_expert)
+    return overall_loss * num_experts
+
+
+@auto_docstring
+class DogeForCausalLM(DogePreTrainedModel, GenerationMixin):
+    # Causal language-modeling wrapper: a `DogeModel` backbone followed by a bias-free linear head that maps
+    # hidden states to vocabulary logits, plus an optional router load-balancing auxiliary loss.
+    _tied_weights_keys = ["lm_head.weight"]
+    _tp_plan = {"lm_head": "colwise_rep"}
+    _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.model = DogeModel(config)
+        self.vocab_size = config.vocab_size
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+        # Routing hyper-parameters copied from the config; in `forward` they are only read when the auxiliary
+        # load-balancing loss is computed.
+        self.router_aux_loss_coef = config.router_aux_loss_coef
+        self.num_experts = config.num_experts
+        self.num_experts_per_tok = config.num_experts_per_tok
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[list[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
+        output_router_logits: Optional[bool] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> MoeCausalLMOutputWithPast:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+        Example:
+
+        ```python
+        >>> from transformers import AutoTokenizer, DogeForCausalLM
+
+        >>> model = DogeForCausalLM.from_pretrained("SmallDoge/Doge-320M")
+        >>> tokenizer = AutoTokenizer.from_pretrained("SmallDoge/Doge-320M")
+
+        >>> prompt = "Hey, are you conscious? Can you talk to me?"
+        >>> inputs = tokenizer(prompt, return_tensors="pt")
+
+        >>> # Generate
+        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
+        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
+        ```"""
+        # An explicit argument wins; otherwise fall back to the config default.
+        output_router_logits = (
+            output_router_logits if output_router_logits is not None else self.config.output_router_logits
+        )
+
+        # Run the backbone; the fields read below are `last_hidden_state`, `past_key_values`, `hidden_states`,
+        # `attentions` and `router_logits`.
+        outputs: MoeModelOutputWithPast = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            **kwargs,
+        )
+
+        hidden_states = outputs.last_hidden_state
+        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
+        # An int keeps the last `logits_to_keep` positions (0 keeps all of them, because `slice(-0, None)` is
+        # `slice(0, None)`); a tensor is used directly as the index along the sequence dimension.
+        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+        logits = self.lm_head(hidden_states[:, slice_indices, :])
+
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(logits, labels, self.vocab_size, **kwargs)
+
+        aux_loss = None
+        if output_router_logits:
+            # The third argument, floor(sqrt(num_experts)), is the same expression `DogeCDMoE` uses for `num_keys`.
+            aux_loss = load_balancing_loss_func(
+                outputs.router_logits,
+                self.num_experts,
+                math.floor(math.sqrt(self.num_experts)),
+                self.num_experts_per_tok,
+                attention_mask,
+            )
+            # The auxiliary loss is always returned, but it is only folded into `loss` when labels were given.
+            if labels is not None:
+                loss += self.router_aux_loss_coef * aux_loss.to(loss.device)  # make sure to reside in the same device
+
+        return MoeCausalLMOutputWithPast(
+            loss=loss,
+            aux_loss=aux_loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            router_logits=outputs.router_logits,
+        )
+
+
+class DogeForSequenceClassification(GenericForSequenceClassification, DogePreTrainedModel):
+    # No Doge-specific code: all behavior comes from the two base classes.
+    pass
+
+
+# Names exported by `from <this module> import *`.
+__all__ = ["DogeForCausalLM", "DogeModel", "DogePreTrainedModel", "DogeForSequenceClassification"]
diff --git a/phivenv/Lib/site-packages/transformers/models/doge/modular_doge.py b/phivenv/Lib/site-packages/transformers/models/doge/modular_doge.py
new file mode 100644
index 0000000000000000000000000000000000000000..fdee6e09c465894255eaa8cb81eaf527d68bd64f
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/doge/modular_doge.py
@@ -0,0 +1,800 @@
+# coding=utf-8
+# Copyright 2025 Jingze Shi and the HuggingFace Inc. team. All rights reserved.
+#
+# The Doge family of small language models is trained by SmallDoge Team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch Doge model."""
+
+import math
+from typing import Callable, Optional, Union
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from ...activations import ACT2FN
+from ...cache_utils import Cache
+from ...configuration_utils import PretrainedConfig
+from ...integrations.flex_attention import compile_friendly_flex_attention
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import MoeCausalLMOutputWithPast, MoeModelOutputWithPast
+from ...modeling_rope_utils import rope_config_validation
+from ...modeling_utils import AttentionInterface, PreTrainedModel
+from ...processing_utils import Unpack
+from ...utils import TransformersKwargs, is_torch_flex_attn_available
+from ...utils.deprecation import deprecate_kwarg
+from ...utils.generic import OutputRecorder
+from ..llama.modeling_llama import (
+    LlamaForSequenceClassification,
+    LlamaMLP,
+    LlamaPreTrainedModel,
+    LlamaRMSNorm,
+    LlamaRotaryEmbedding,
+    apply_rotary_pos_emb,
+    eager_attention_forward,
+    repeat_kv,
+)
+from ..mixtral.modeling_mixtral import MixtralForCausalLM, MixtralModel
+
+
+# `BlockMask` is only bound when flex attention is available. Any runtime reference to the name (for example an
+# `isinstance` check) raises `NameError` on installations where this import is skipped.
+if is_torch_flex_attn_available():
+    from torch.nn.attention.flex_attention import BlockMask
+
+
+class DogeConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`DogeModel`]. It is used to instantiate a Doge
+    model according to the specified arguments, defining the model architecture like [SmallDoge/Doge-320M](https://huggingface.co/SmallDoge/Doge-320M).
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 32768):
+            Vocabulary size of the Doge model. Defines the number of different tokens that can be represented by the `inputs_ids` passed when calling [`DogeModel`].
+        hidden_size (`int`, *optional*, defaults to 1024):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 2048):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 32):
+            Number of hidden layers in the Transformer decoder.
+        hidden_dropout (`float`, *optional*, defaults to 0.0):
+            Dropout probability for each sequence transformation and state transformation module.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the decoder.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether the model's input and output word embeddings should be tied.
+        max_position_embeddings (`int`, *optional*, defaults to 2048):
+            The maximum sequence length that this model might ever be used with.
+        rope_theta (`float`, *optional*, defaults to 10000.0):
+            The base period of the RoPE embeddings.
+        rope_scaling (`Dict`, *optional*):
+            Dictionary containing the scaling configuration for the RoPE embeddings.
+            NOTE: if you apply new rope type and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value accordingly.
+            Doge family of small models use `{ 'rope_type': 'dynamic', 'factor': 4.0, 'original_max_position_embeddings': 2048 }` as the default value.
+            Expected contents:
+                `rope_type` (`str`):
+                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', 'llama3'], with 'default' being the original RoPE implementation.
+                `factor` (`float`, *optional*):
+                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings.
+                    In most scaling types, a `factor` of x will enable the model to handle sequences of length x * original maximum pre-trained length.
+                `original_max_position_embeddings` (`int`, *optional*):
+                    Used with 'dynamic', 'longrope' and 'llama3'.
+                    The original max position embeddings used during pretraining.
+                `attention_factor` (`float`, *optional*):
+                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
+                    computation.
+                    If unspecified, it defaults to value recommended by the implementation, using the `factor` field to infer the suggested value.
+                `beta_fast` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 32.
+                `beta_slow` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 1.
+                `short_factor` (`List[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<`original_max_position_embeddings`).
+                    Must be a list of numbers with the same length as the hidden size divided by the number of attention heads divided by 2
+                `long_factor` (`List[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to long contexts (>`original_max_position_embeddings`).
+                    Must be a list of numbers with the same length as the hidden size divided by the number of attention heads divided by 2
+                `low_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
+                `high_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
+        num_attention_heads (`int`, *optional*, defaults to 8):
+            Number of attention heads for each attention layer in the Transformer decoder.
+        num_key_value_heads (`int`, *optional*):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention.
+            If `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used.
+            When converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed by meanpooling all the original heads within that group.
+            For more details checkout [this paper](https://huggingface.co/papers/2305.13245).
+            If it is not specified, will default to `num_attention_heads`.
+        attention_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use a bias in the query, key, value and output projection layers during self-attention.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        mlp_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use a bias in up_proj, down_proj and gate_proj layers in the MLP layers.
+        sliding_window (`int`, *optional*):
+            Sliding window attention window size. If not specified, will default to `None`.
+        keep_window_size (`int`, *optional*, defaults to 2048):
+            The window size of tokens that are not dynamically masked, and dynamic masking is only performed when the sequence length exceeds this value.
+        is_moe (`bool`, *optional*, defaults to `False`):
+            Whether to use the Cross Domain Mixture of Experts, if `True`, the MoE will inherit the MLP to initialize.
+        num_experts (`int`, *optional*, defaults to 16384):
+            Number of routed experts in the model. This is only used when `is_moe=True`.
+        num_experts_per_tok (`int`, *optional*, defaults to 64):
+            Number of selected experts to route per-token.
+        norm_topk_prob (`bool`, *optional*, defaults to `False`):
+            Whether to normalize the topk probabilities.
+        output_router_logits (`bool`, *optional*, defaults to `False`):
+            Whether or not the router logits should be returned by the model. Enabling this will also
+            allow the model to output the auxiliary loss, including load balancing loss and router z-loss.
+        router_aux_loss_coef (`float`, *optional*, defaults to 0.001):
+            The aux loss factor for the total loss.
+
+    ```python
+    >>> from transformers import DogeConfig, DogeModel
+
+    >>> # Initializing a Doge-320M style configuration
+    >>> configuration = DogeConfig()
+
+    >>> # Initializing a model from the Doge-320M style configuration
+    >>> model = DogeModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "doge"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    # Default tensor parallel plan for base model `DogeModel`
+    base_model_tp_plan = {
+        "layers.*.self_attn.q_proj": "colwise",
+        "layers.*.self_attn.k_proj": "colwise",
+        "layers.*.self_attn.v_proj": "colwise",
+        "layers.*.self_attn.dt_proj": "rowwise",
+        "layers.*.self_attn.o_proj": "rowwise",
+        "layers.*.input_layernorm.weight": "sequence_parallel",
+        "layers.*.input_residual.weight": "sequence_parallel",
+        "layers.*.post_attention_layernorm.weight": "sequence_parallel",
+        "layers.*.post_attention_residual.weight": "sequence_parallel",
+        "norm.weight": "sequence_parallel",
+        "layers.*.mlp.gate_proj": "colwise",
+        "layers.*.mlp.up_proj": "colwise",
+        "layers.*.mlp.down_proj": "rowwise",
+        "layers.*.mlp.router_gate": "colwise_rep",
+        "layers.*.mlp.down_embed": "rowwise_rep",
+        "layers.*.mlp.up_embed": "rowwise_rep",
+    }
+    # Default pipeline parallel plan: module name -> (input names, output names)
+    base_model_pp_plan = {
+        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
+        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
+        "norm": (["hidden_states"], ["hidden_states"]),
+    }
+
+    def __init__(
+        self,
+        vocab_size=32768,
+        hidden_size=1024,
+        intermediate_size=2048,
+        num_hidden_layers=32,
+        hidden_dropout=0.0,
+        hidden_act="silu",
+        initializer_range=0.02,
+        rms_norm_eps=1e-06,
+        use_cache=True,
+        tie_word_embeddings=False,
+        max_position_embeddings=2048,
+        rope_theta=10000.0,
+        rope_scaling=None,
+        num_attention_heads=8,
+        num_key_value_heads=None,
+        attention_bias=False,
+        attention_dropout=0.0,
+        mlp_bias=False,
+        sliding_window=None,
+        keep_window_size=2048,
+        is_moe=False,
+        num_experts=16384,
+        num_experts_per_tok=64,
+        norm_topk_prob=False,
+        output_router_logits=False,
+        router_aux_loss_coef=0.001,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+
+        self.hidden_dropout = hidden_dropout
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+
+        self.max_position_embeddings = max_position_embeddings
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self.num_attention_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+        self.mlp_bias = mlp_bias
+        self.sliding_window = sliding_window
+        self.keep_window_size = keep_window_size
+        self.is_moe = is_moe
+        self.num_experts = num_experts
+        self.num_experts_per_tok = num_experts_per_tok
+        self.norm_topk_prob = norm_topk_prob
+        self.output_router_logits = output_router_logits
+        self.router_aux_loss_coef = router_aux_loss_coef
+
+        # Validate the correctness of rotary position embeddings parameters
+        # BC: if there is a 'type' field, copy it to 'rope_type'.
+        # NOTE: this writes into the `rope_scaling` dict object that the caller passed in.
+        if self.rope_scaling is not None and "type" in self.rope_scaling:
+            self.rope_scaling["rope_type"] = self.rope_scaling["type"]
+        rope_config_validation(self)
+
+        # For backward compatibility: when `num_key_value_heads` is not given, use one key/value head per
+        # attention head.
+        if num_key_value_heads is None:
+            self.num_key_value_heads = num_attention_heads
+
+        # `tie_word_embeddings` and any remaining keyword arguments are handled by the base configuration class.
+        super().__init__(
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+
+class DogeRMSNorm(LlamaRMSNorm):
+    # Inherits `LlamaRMSNorm` unchanged; the subclass only provides a Doge-specific class name.
+    pass
+
+
+class DogeRotaryEmbedding(LlamaRotaryEmbedding):
+    # Inherits `LlamaRotaryEmbedding` unchanged; the subclass only provides a Doge-specific class name.
+    pass
+
+
+def flex_attention_forward(
+    module: nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: Union[torch.Tensor, "BlockMask"],
+    scaling: Optional[float] = None,
+    softcap: Optional[float] = None,
+    head_mask: Optional[torch.Tensor] = None,
+    **kwargs,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    """
+    Attention backend that calls `compile_friendly_flex_attention` with a custom score modification.
+
+    Args:
+        module (`nn.Module`): The calling attention module. Not read in this function body.
+        query (`torch.Tensor`): Query states, forwarded unchanged.
+        key (`torch.Tensor`): Key states, forwarded unchanged. `key.shape[-2]` is used as the key length.
+        value (`torch.Tensor`): Value states, forwarded unchanged. Its dtype is the dtype of the second return value.
+        attention_mask (`torch.Tensor` or `BlockMask`): A `BlockMask` is forwarded as `block_mask`. Any other
+            non-`None` value is treated as a 4-D mask, cropped on its last dimension to the key length and
+            added to the scores.
+        scaling (`float`, *optional*): Forwarded as `scale`.
+        softcap (`float`, *optional*): When set, scores are replaced by `softcap * tanh(score / softcap)` before
+            the masks are added.
+        head_mask (`torch.Tensor`, *optional*): When set, `head_mask[batch][head][0][0]` is added to every score
+            of that batch element and head.
+        **kwargs: Accepted and ignored.
+
+    Returns:
+        `tuple[torch.Tensor, torch.Tensor]`: The attention output with dimensions 1 and 2 swapped (made
+        contiguous), and the second value returned by flex attention with `return_lse=True`, cast to `value.dtype`.
+
+    NOTE(review): `BlockMask` is imported conditionally at module level, so the `isinstance` check below needs
+    flex attention to be available.
+    """
+    block_mask = None
+    causal_mask = None
+    if isinstance(attention_mask, BlockMask):
+        block_mask = attention_mask
+    else:
+        causal_mask = attention_mask
+
+    # Crop the mask to the number of key positions actually present.
+    if causal_mask is not None:
+        causal_mask = causal_mask[:, :, :, : key.shape[-2]]
+
+    # Per-score hook handed to flex attention: soft-cap first, then add the mask terms.
+    def score_mod(score, batch_idx, head_idx, q_idx, kv_idx):
+        if softcap is not None:
+            score = softcap * torch.tanh(score / softcap)
+        if causal_mask is not None:
+            score = score + causal_mask[batch_idx][head_idx][q_idx][kv_idx]
+        if head_mask is not None:
+            score = score + head_mask[batch_idx][head_idx][0][0]
+        return score
+
+    attn_output, attention_weights = compile_friendly_flex_attention(
+        query,
+        key,
+        value,
+        score_mod=score_mod,
+        block_mask=block_mask,
+        enable_gqa=True,
+        scale=scaling,
+        # Last time checked on PyTorch == 2.5.1: Flex Attention always computes the lse regardless.
+        # For simplification, we thus always return it as no additional computations are introduced.
+        return_lse=True,
+    )
+    # lse is returned in float32
+    attention_weights = attention_weights.to(value.dtype)
+    attn_output = attn_output.transpose(1, 2).contiguous()
+
+    return attn_output, attention_weights
+
+
+# Attention registry instance created in this module; the flex implementation defined above is registered under
+# the key "doge_flex_attention".
+ALL_ATTENTION_FUNCTIONS = AttentionInterface()
+ALL_ATTENTION_FUNCTIONS["doge_flex_attention"] = flex_attention_forward
+
+
+class DogeAttention(nn.Module):
+    def __init__(self, config: DogeConfig, layer_idx: Optional[int] = None):
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+        self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
+        self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
+        self.scaling = self.head_dim**-0.5
+        self.attention_dropout = config.attention_dropout
+        self.keep_window_size = config.keep_window_size
+
+        self.q_proj = nn.Linear(
+            config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.k_proj = nn.Linear(
+            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.v_proj = nn.Linear(
+            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
+        )
+        # dynamic mask for the QK^T attention weights matrix
+        self.A = nn.Parameter(torch.zeros(config.num_key_value_heads))
+        self.dt_proj = nn.Linear(
+            config.num_key_value_heads * self.head_dim, config.num_key_value_heads, bias=config.attention_bias
+        )
+        self.o_proj = nn.Linear(
+            config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
+        )
+        self.q_norm = DogeRMSNorm(self.head_dim, eps=config.rms_norm_eps)
+        self.k_norm = DogeRMSNorm(self.head_dim, eps=config.rms_norm_eps)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
+        attention_mask: Optional[torch.Tensor] = None,
+        past_key_values: Optional[Cache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
+        input_shape = hidden_states.shape[:-1]
+        hidden_shape = (*input_shape, -1, self.head_dim)
+
+        query_states = self.q_norm(self.q_proj(hidden_states).view(hidden_shape)).transpose(1, 2)
+        key_states = self.k_norm(self.k_proj(hidden_states).view(hidden_shape)).transpose(1, 2)
+        value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+
+        cos, sin = position_embeddings
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+
+        if past_key_values is not None:
+            # sin and cos are specific to RoPE models; cache_position needed for the static cache
+            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        # calculate dynamic mask from value_states
+        dt_states = self.dt_proj(
+            value_states.transpose(1, 2).reshape(value_states.shape[0], value_states.shape[-2], -1)
+        )
+        dt_states = torch.exp(self.A * F.softplus(dt_states)).transpose(-1, -2)
+        attn_mask = self.prepare_dynamic_mask(
+            hidden_states=hidden_states,
+            dt_states=dt_states,
+            keep_window_size=self.keep_window_size,
+            attention_mask=attention_mask,
+        )
+        attn_mask = repeat_kv(attn_mask, self.num_key_value_groups)
+
+        attention_interface: Callable = eager_attention_forward
+        if self.config._attn_implementation != "eager":
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask=attn_mask,
+            dropout=0.0 if not self.training else self.attention_dropout,
+            scaling=self.scaling,
+            **kwargs,
+        )
+
+        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
+        attn_output = self.o_proj(attn_output)
+        return attn_output, attn_weights
+
+    def prepare_dynamic_mask(
+        self,
+        hidden_states: torch.Tensor,
+        dt_states: torch.Tensor,
+        keep_window_size: int = 2048,
+        attention_mask: Optional[torch.Tensor] = None,
+    ):
+        """
+        The core idea of DMA is to calculate the dynamic attention mask to mask the tokens that should be masked, so as to form sparse attention.
+
+        Combine `dt_states` with `attention_mask` to generate the final `attn_mask`.
+
+        Args:
+            hidden_states (`torch.Tensor`): The input hidden_states, used to determine the minimum value of the current input precision.
+            dt_states (`torch.Tensor`): dt_states of shape `(batch_size, num_heads, key_sequence_length)`.
+            keep_window_size (`int`): The window size of tokens that are not dynamically masked, and dynamic masking is only performed when the sequence length exceeds this value.
+            attention_mask (`torch.Tensor`, *optional*): attention mask of shape `(batch_size, 1, query_sequence_length, key_sequence_length)`.
+        """
+        min_dtype = torch.finfo(hidden_states.dtype).min
+        dtype = hidden_states.dtype
+        attn_mask = dt_states[:, :, None, :].expand(
+            -1, -1, hidden_states.shape[1], -1
+        )  # [batch_size, num_heads, query_len, key_len]
+        if attention_mask is not None and not isinstance(attention_mask, BlockMask):
+            if attention_mask.dtype == torch.bool:
+                dtype = hidden_states.dtype
+                attention_mask = torch.where(
+                    attention_mask, torch.tensor(0.0, device=attention_mask.device, dtype=dtype), min_dtype
+                )
+            attn_mask = attn_mask.masked_fill(attention_mask[:, :, :, : attn_mask.shape[-1]] != 0, min_dtype)
+        if attn_mask.shape[-1] > keep_window_size:
+            active_mask = torch.zeros_like(attn_mask, dtype=dtype, device=attn_mask.device)
+            topk_indices = torch.topk(attn_mask, keep_window_size, dim=-1, largest=True, sorted=False).indices
+            active_mask = active_mask.scatter(-1, topk_indices, 1.0)
+            attn_mask = attn_mask.masked_fill(active_mask == 0.0, min_dtype)
+        return attn_mask
+
+
+class DogeMLP(LlamaMLP):
+    # Inherits `LlamaMLP` unchanged; the subclass only provides a Doge-specific class name.
+    pass
+
+
+class DogeCDMoE(nn.Module):
+    def __init__(self, config: DogeConfig):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.intermediate_size = config.intermediate_size
+        self.act_fn = ACT2FN[config.hidden_act]
+
+        self.num_experts = config.num_experts
+        self.num_keys = math.floor(math.sqrt(self.num_experts))
+        self.top_k = config.num_experts_per_tok
+        self.norm_topk_prob = config.norm_topk_prob
+
+        # shared expert
+        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
+        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
+        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.mlp_bias)
+
+        # router gate for retrieval experts
+        self.router_gate = nn.Linear(self.hidden_size, self.num_keys * 2, bias=False)
+
+        # routed experts
+        self.down_embed = nn.Embedding(self.num_experts, self.hidden_size)
+        self.up_embed = nn.Embedding(self.num_experts, self.hidden_size)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        **kwargs,
+    ) -> torch.Tensor:
+        bsz, seq_len, _ = hidden_states.shape
+
+        # get routing logits with router gate
+        router_logits = self.router_gate(hidden_states).view(2, bsz * seq_len, -1)
+
+        # get experts with the highest routing logits
+        (scores_x, scores_y), (indices_x, indices_y) = router_logits.topk(self.num_keys, dim=-1)
+        all_scores = scores_x.unsqueeze(-1) + scores_y.unsqueeze(-2)
+        all_indices = indices_x.unsqueeze(-1) * self.num_keys + indices_y.unsqueeze(-2)
+        all_scores = all_scores.view(*all_scores.shape[:-2], -1)
+        all_indices = all_indices.view(*all_indices.shape[:-2], -1)
+        scores, position_indices = all_scores.topk(self.top_k, dim=-1)
+        indices = all_indices.gather(-1, position_indices)
+        routing_weights = F.softmax(scores, dim=-1)
+        if self.norm_topk_prob:
+            routing_weights /= routing_weights.sum(dim=-1, keepdim=True)
+
+        # mix routed experts states with shared expert states
+        down_embed = self.down_embed(indices)
+        up_embed = self.up_embed(indices)
+        experts_weights = torch.matmul(down_embed, hidden_states.view(bsz * seq_len, -1, 1)).view(bsz * seq_len, -1)
+        experts_weights = self.act_fn(experts_weights) * routing_weights
+        experts_states = torch.matmul(experts_weights.view(bsz * seq_len, 1, -1), up_embed).view(bsz, seq_len, -1)
+        hidden_states = self.down_proj(self.act_fn(self.gate_proj(hidden_states)) * self.up_proj(hidden_states))
+        hidden_states = hidden_states + experts_states
+        return hidden_states, router_logits
+
+
+class DogeDecoderLayer(GradientCheckpointingLayer):
+    def __init__(self, config: DogeConfig, layer_idx: Optional[int] = None):
+        super().__init__()
+        self.hidden_dropout = config.hidden_dropout
+
+        self.input_layernorm = DogeRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.self_attn = DogeAttention(config=config, layer_idx=layer_idx)
+        self.input_residual = nn.Parameter(torch.ones(config.hidden_size))
+
+        self.post_attention_layernorm = DogeRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.mlp = DogeMLP(config) if not config.is_moe else DogeCDMoE(config)
+        self.post_attention_residual = nn.Parameter(torch.ones(config.hidden_size))
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[tuple[torch.Tensor]] = None,
+        use_cache: Optional[bool] = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
+        # sequence transformation
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+        hidden_states, self_attn_weights = self.self_attn(
+            hidden_states=hidden_states,
+            position_embeddings=position_embeddings,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            **kwargs,
+        )
+        hidden_states = F.dropout(hidden_states, p=self.hidden_dropout, training=self.training)
+        hidden_states = self.input_residual * residual + hidden_states
+
+        # state transformation
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = F.dropout(hidden_states, p=self.hidden_dropout, training=self.training)
+        hidden_states = self.post_attention_residual * residual + hidden_states
+
+        return hidden_states
+
+
+class DogePreTrainedModel(LlamaPreTrainedModel):
+    _supports_flash_attn = False
+    _can_compile_fullgraph = False
+    _can_record_outputs = {
+        "router_logits": OutputRecorder(DogeCDMoE, index=1),
+        "hidden_states": DogeDecoderLayer,
+        "attentions": DogeAttention,
+    }
+
+    def _init_weights(self, module):
+        """Initialize the weights"""
+        PreTrainedModel._init_weights(self, module)
+        if isinstance(module, DogeAttention):
+            if hasattr(module, "A"):
+                module.A.data.zero_()
+        elif isinstance(module, DogeDecoderLayer):
+            if hasattr(module, "input_residual"):
+                module.input_residual.data.fill_(1.0)
+            if hasattr(module, "post_attention_residual"):
+                module.post_attention_residual.data.fill_(1.0)
+
+
+class DogeModel(MixtralModel):
+    # Empty subclass: nothing is overridden here, so all behaviour comes from `MixtralModel`.
+    pass
+
+
+def load_balancing_loss_func(
+    gate_logits: Union[torch.Tensor, tuple[torch.Tensor], None],
+    num_experts: Optional[int] = None,
+    num_keys: Optional[int] = None,
+    top_k: int = 2,
+    attention_mask: Optional[torch.Tensor] = None,
+) -> Union[torch.Tensor, int]:
+    r"""
+    Computes auxiliary load balancing loss as in Switch Transformer - implemented in Pytorch.
+
+    See Switch Transformer (https://huggingface.co/papers/2101.03961) for more details. This function implements the loss
+    function presented in equations (4) - (6) of the paper. It aims at penalizing cases where the routing between
+    experts is too unbalanced.
+
+    Args:
+        gate_logits:
+            Logits from the `router_gate`, should be a tuple of model.config.num_hidden_layers tensors of
+            shape [2, batch_size * sequence_length, num_keys].
+        num_experts:
+            Number of experts. Although the default is `None`, the value is used unconditionally once
+            `gate_logits` is a tuple, so callers must supply it.
+        num_keys:
+            Number of keys. Although the default is `None`, the value is used unconditionally once
+            `gate_logits` is a tuple, so callers must supply it.
+        top_k:
+            The number of experts to route per-token, can be also interpreted as the `top-k` routing
+            parameter.
+        attention_mask (`torch.Tensor`, *optional*):
+            The attention_mask used in forward function
+            shape [batch_size X sequence_length] if not None.
+
+    Returns:
+        The auxiliary loss as a scalar tensor, or the integer `0` when `gate_logits` is `None` or is not a tuple.
+    """
+    # NOTE(review): an empty tuple passes this guard and `gate_logits[0]` below then raises IndexError.
+    if gate_logits is None or not isinstance(gate_logits, tuple):
+        return 0
+
+    # The first layer's logits decide the dtype and device used for all accumulators.
+    compute_dtype = gate_logits[0].dtype
+    compute_device = gate_logits[0].device
+    all_expert_indices = []
+    all_routing_weights = []
+
+    for layer_gate_logits in gate_logits:
+        layer_gate_logits = layer_gate_logits.to(compute_device)
+
+        # `topk` yields (values, indices); unpacking each into an (x, y) pair requires the leading dimension
+        # of the logits to have size 2.
+        (scores_x, scores_y), (indices_x, indices_y) = layer_gate_logits.topk(num_keys, dim=-1)
+
+        # Combine every x candidate with every y candidate: scores add, indices form `x * num_keys + y`.
+        all_scores = scores_x.unsqueeze(-1) + scores_y.unsqueeze(-2)
+        all_indices = indices_x.unsqueeze(-1) * num_keys + indices_y.unsqueeze(-2)
+        # Flatten the (num_keys, num_keys) grid into a single trailing dimension.
+        all_scores = all_scores.view(*all_scores.shape[:-2], -1)
+        all_indices = all_indices.view(*all_indices.shape[:-2], -1)
+
+        # Pick the `top_k` best combined scores and look up the expert index of each.
+        _, position_indices = all_scores.topk(top_k, dim=-1)
+        expert_indices = all_indices.gather(-1, position_indices)
+
+        # NOTE(review): the softmax spans all num_keys * num_keys combinations, so the final product with
+        # `tokens_per_expert` presumably requires num_keys ** 2 == num_experts - TODO confirm.
+        routing_weights = F.softmax(all_scores, dim=-1)
+
+        all_expert_indices.append(expert_indices)
+        all_routing_weights.append(routing_weights)
+    # Stack the per-layer results along dim 0.
+    all_expert_indices = torch.cat(all_expert_indices, dim=0)
+    all_routing_weights = torch.cat(all_routing_weights, dim=0)
+
+    if attention_mask is None:
+        # Compute the percentage of tokens routed to each experts
+        all_expert_indices = all_expert_indices.view(-1)
+        tokens_per_expert = torch.zeros(num_experts, dtype=compute_dtype, device=compute_device)
+        pad = torch.ones_like(all_expert_indices, dtype=compute_dtype, device=compute_device)
+        # Count one per selected expert slot, then normalise by the total number of slots.
+        tokens_per_expert = tokens_per_expert.scatter_add_(0, all_expert_indices, pad) / all_expert_indices.shape[0]
+
+        # Compute the average probability of routing to these experts
+        router_prob_per_expert = torch.mean(all_routing_weights, dim=0)
+    else:
+        batch_size, sequence_length = attention_mask.shape
+        num_hidden_layers = len(gate_logits)
+
+        #  Compute the mask that masks all padding tokens as 0 with the same shape of expert_mask
+        expert_attention_mask = (
+            attention_mask[None, :, :, None]
+            .expand((num_hidden_layers, batch_size, sequence_length, top_k))
+            .reshape(-1)
+            .to(compute_device)
+        )
+        # Keep only the expert slots that belong to non-masked positions.
+        all_expert_indices = all_expert_indices.view(-1)[expert_attention_mask.bool()]
+
+        # Compute the percentage of tokens routed to each experts
+        tokens_per_expert = torch.zeros(num_experts, dtype=compute_dtype, device=compute_device)
+        pad = torch.ones_like(all_expert_indices, dtype=compute_dtype, device=compute_device)
+        tokens_per_expert = tokens_per_expert.scatter_add_(0, all_expert_indices, pad) / torch.sum(
+            expert_attention_mask
+        )
+
+        # Compute the mask that masks all padding tokens as 0 with the same shape of tokens_per_expert
+        router_per_expert_attention_mask = (
+            attention_mask[None, :, :, None]
+            .expand((num_hidden_layers, batch_size, sequence_length, num_experts))
+            .reshape(-1, num_experts)
+            .to(compute_device)
+        )
+
+        # Compute the average probability of routing to these experts
+        router_prob_per_expert = torch.sum(all_routing_weights * router_per_expert_attention_mask, dim=0) / torch.sum(
+            router_per_expert_attention_mask, dim=0
+        )
+
+    # Sum over experts of (fraction of slots routed) * (mean routing probability), scaled by `num_experts`.
+    overall_loss = torch.sum(tokens_per_expert * router_prob_per_expert)
+    return overall_loss * num_experts
+
+
+class DogeForCausalLM(MixtralForCausalLM):
+    def __init__(self, config):
+        super().__init__(config)
+        self.model = DogeModel(config)
+        self.num_experts = config.num_experts
+
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[list[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
+        output_router_logits: Optional[bool] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> MoeCausalLMOutputWithPast:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+        Example:
+
+        ```python
+        >>> from transformers import AutoTokenizer, DogeForCausalLM
+
+        >>> model = DogeForCausalLM.from_pretrained("SmallDoge/Doge-320M")
+        >>> tokenizer = AutoTokenizer.from_pretrained("SmallDoge/Doge-320M")
+
+        >>> prompt = "Hey, are you conscious? Can you talk to me?"
+        >>> inputs = tokenizer(prompt, return_tensors="pt")
+
+        >>> # Generate
+        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
+        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
+        ```"""
+        output_router_logits = (
+            output_router_logits if output_router_logits is not None else self.config.output_router_logits
+        )
+
+        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
+        outputs: MoeModelOutputWithPast = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            **kwargs,
+        )
+
+        hidden_states = outputs.last_hidden_state
+        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
+        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+        logits = self.lm_head(hidden_states[:, slice_indices, :])
+
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(logits, labels, self.vocab_size, **kwargs)
+
+        aux_loss = None
+        if output_router_logits:
+            aux_loss = load_balancing_loss_func(
+                outputs.router_logits,
+                self.num_experts,
+                math.floor(math.sqrt(self.num_experts)),
+                self.num_experts_per_tok,
+                attention_mask,
+            )
+            if labels is not None:
+                loss += self.router_aux_loss_coef * aux_loss.to(loss.device)  # make sure to reside in the same device
+
+        return MoeCausalLMOutputWithPast(
+            loss=loss,
+            aux_loss=aux_loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            router_logits=outputs.router_logits,
+        )
+
+
+class DogeForSequenceClassification(LlamaForSequenceClassification):
+    # Empty subclass: nothing is overridden here, so all behaviour comes from `LlamaForSequenceClassification`.
+    pass
+
+
+# Names exported by `from <this module> import *`.
+__all__ = [
+    "DogeConfig",
+    "DogeForCausalLM",
+    "DogeModel",
+    "DogePreTrainedModel",
+    "DogeForSequenceClassification",
+]
diff --git a/phivenv/Lib/site-packages/transformers/models/donut/__init__.py b/phivenv/Lib/site-packages/transformers/models/donut/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..834c451f78fa0d4c5fe91f59719b6505c4c4e4e5
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/donut/__init__.py
@@ -0,0 +1,31 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+# Under static type checking, import every submodule's public names directly.
+if TYPE_CHECKING:
+    from .configuration_donut_swin import *
+    from .feature_extraction_donut import *
+    from .image_processing_donut import *
+    from .image_processing_donut_fast import *
+    from .modeling_donut_swin import *
+    from .processing_donut import *
+else:
+    import sys
+
+    # At runtime, replace this package's entry in `sys.modules` with a `_LazyModule` built from the import
+    # structure derived from this file (presumably deferring submodule imports until first use - confirm
+    # against `_LazyModule`).
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/phivenv/Lib/site-packages/transformers/models/donut/__pycache__/__init__.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/donut/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..630a7b1c1937d3082bef256248f7391ba7c25477
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/donut/__pycache__/__init__.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/donut/__pycache__/configuration_donut_swin.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/donut/__pycache__/configuration_donut_swin.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..04fd9e0820b04a93de960e646c35c141174ed5f8
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/donut/__pycache__/configuration_donut_swin.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/donut/__pycache__/feature_extraction_donut.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/donut/__pycache__/feature_extraction_donut.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..67beab55d6507fd54113d36492c892a5339c3a41
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/donut/__pycache__/feature_extraction_donut.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/donut/__pycache__/image_processing_donut.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/donut/__pycache__/image_processing_donut.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b5e9ad823b60859aaaeb005f369d116de9bd4648
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/donut/__pycache__/image_processing_donut.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/donut/__pycache__/image_processing_donut_fast.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/donut/__pycache__/image_processing_donut_fast.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1f5332b84065980ffc4d440fa480ba9cb07a82fe
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/donut/__pycache__/image_processing_donut_fast.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/donut/__pycache__/modeling_donut_swin.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/donut/__pycache__/modeling_donut_swin.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3db62d03acbb76b5cc21ffe1fb6a6c08ba6f074f
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/donut/__pycache__/modeling_donut_swin.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/donut/__pycache__/processing_donut.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/donut/__pycache__/processing_donut.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..837c7ae7c8ca5caa453204c8f5ecdff70a02b344
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/donut/__pycache__/processing_donut.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/donut/configuration_donut_swin.py b/phivenv/Lib/site-packages/transformers/models/donut/configuration_donut_swin.py
new file mode 100644
index 0000000000000000000000000000000000000000..9aac07dace7688273be0bdc57da0a12663c2fb5b
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/donut/configuration_donut_swin.py
@@ -0,0 +1,135 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Donut Swin Transformer model configuration"""
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class DonutSwinConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`DonutSwinModel`]. It is used to instantiate a
+    Donut model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the Donut
+    [naver-clova-ix/donut-base](https://huggingface.co/naver-clova-ix/donut-base) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        image_size (`int`, *optional*, defaults to 224):
+            The size (resolution) of each image.
+        patch_size (`int`, *optional*, defaults to 4):
+            The size (resolution) of each patch.
+        num_channels (`int`, *optional*, defaults to 3):
+            The number of input channels.
+        embed_dim (`int`, *optional*, defaults to 96):
+            Dimensionality of patch embedding.
+        depths (`list(int)`, *optional*, defaults to `[2, 2, 6, 2]`):
+            Depth of each layer in the Transformer encoder.
+        num_heads (`list(int)`, *optional*, defaults to `[3, 6, 12, 24]`):
+            Number of attention heads in each layer of the Transformer encoder.
+        window_size (`int`, *optional*, defaults to 7):
+            Size of windows.
+        mlp_ratio (`float`, *optional*, defaults to 4.0):
+            Ratio of MLP hidden dimensionality to embedding dimensionality.
+        qkv_bias (`bool`, *optional*, defaults to `True`):
+            Whether or not a learnable bias should be added to the queries, keys and values.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
+            The dropout probability for all fully connected layers in the embeddings and encoder.
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        drop_path_rate (`float`, *optional*, defaults to 0.1):
+            Stochastic depth rate.
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder. If string, `"gelu"`, `"relu"`,
+            `"selu"` and `"gelu_new"` are supported.
+        use_absolute_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether or not to add absolute position embeddings to the patch embeddings.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the layer normalization layers.
+
+    Example:
+
+    ```python
+    >>> from transformers import DonutSwinConfig, DonutSwinModel
+
+    >>> # Initializing a Donut naver-clova-ix/donut-base style configuration
+    >>> configuration = DonutSwinConfig()
+
+    >>> # Randomly initializing a model from the naver-clova-ix/donut-base style configuration
+    >>> model = DonutSwinModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "donut-swin"
+
+    attribute_map = {
+        "num_attention_heads": "num_heads",
+        "num_hidden_layers": "num_layers",
+    }
+
+    def __init__(
+        self,
+        image_size=224,
+        patch_size=4,
+        num_channels=3,
+        embed_dim=96,
+        depths=[2, 2, 6, 2],
+        num_heads=[3, 6, 12, 24],
+        window_size=7,
+        mlp_ratio=4.0,
+        qkv_bias=True,
+        hidden_dropout_prob=0.0,
+        attention_probs_dropout_prob=0.0,
+        drop_path_rate=0.1,
+        hidden_act="gelu",
+        use_absolute_embeddings=False,
+        initializer_range=0.02,
+        layer_norm_eps=1e-5,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        self.image_size = image_size
+        self.patch_size = patch_size
+        self.num_channels = num_channels
+        self.embed_dim = embed_dim
+        self.depths = depths
+        self.num_layers = len(depths)
+        self.num_heads = num_heads
+        self.window_size = window_size
+        self.mlp_ratio = mlp_ratio
+        self.qkv_bias = qkv_bias
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.drop_path_rate = drop_path_rate
+        self.hidden_act = hidden_act
+        self.use_absolute_embeddings = use_absolute_embeddings
+        self.layer_norm_eps = layer_norm_eps
+        self.initializer_range = initializer_range
+        # we set the hidden_size attribute in order to make Swin work with VisionEncoderDecoderModel
+        # this indicates the channel dimension after the last stage of the model
+        self.hidden_size = int(embed_dim * 2 ** (len(depths) - 1))
+
+
+# Names exported by `from <this module> import *`.
+__all__ = ["DonutSwinConfig"]
diff --git a/phivenv/Lib/site-packages/transformers/models/donut/feature_extraction_donut.py b/phivenv/Lib/site-packages/transformers/models/donut/feature_extraction_donut.py
new file mode 100644
index 0000000000000000000000000000000000000000..e37a58ddd3055e040c6c29cbd5f5cc3c34270cbe
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/donut/feature_extraction_donut.py
@@ -0,0 +1,38 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Feature extractor class for Donut."""
+
+import warnings
+
+from ...utils import logging
+from ...utils.import_utils import requires
+from .image_processing_donut import DonutImageProcessor
+
+
+logger = logging.get_logger(__name__)
+
+
+@requires(backends=("vision",))
+class DonutFeatureExtractor(DonutImageProcessor):
+    def __init__(self, *args, **kwargs) -> None:
+        warnings.warn(
+            "The class DonutFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please"
+            " use DonutImageProcessor instead.",
+            FutureWarning,
+        )
+        super().__init__(*args, **kwargs)
+
+
+# Names exported by `from <this module> import *`.
+__all__ = ["DonutFeatureExtractor"]
diff --git a/phivenv/Lib/site-packages/transformers/models/donut/image_processing_donut.py b/phivenv/Lib/site-packages/transformers/models/donut/image_processing_donut.py
new file mode 100644
index 0000000000000000000000000000000000000000..e8a48dcabef664abf04d6f66cb4285a4da8c2620
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/donut/image_processing_donut.py
@@ -0,0 +1,477 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Image processor class for Donut."""
+
+from typing import Optional, Union
+
+import numpy as np
+
+from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
+from ...image_transforms import (
+    convert_to_rgb,
+    get_resize_output_image_size,
+    pad,
+    resize,
+    to_channel_dimension_format,
+)
+from ...image_utils import (
+    IMAGENET_STANDARD_MEAN,
+    IMAGENET_STANDARD_STD,
+    ChannelDimension,
+    ImageInput,
+    PILImageResampling,
+    get_image_size,
+    infer_channel_dimension_format,
+    is_scaled_image,
+    make_list_of_images,
+    to_numpy_array,
+    valid_images,
+    validate_preprocess_arguments,
+)
+from ...utils import TensorType, filter_out_non_signature_kwargs, logging
+from ...utils.import_utils import is_vision_available, requires
+
+
+logger = logging.get_logger(__name__)
+
+
+if is_vision_available():
+    import PIL
+
+
+@requires(backends=("vision",))
+class DonutImageProcessor(BaseImageProcessor):
+    r"""
+    Constructs a Donut image processor.
+
+    Args:
+        do_resize (`bool`, *optional*, defaults to `True`):
+            Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by
+            `do_resize` in the `preprocess` method.
+        size (`dict[str, int]` *optional*, defaults to `{"height": 2560, "width": 1920}`):
+            Size of the image after resizing. The shortest edge of the image is resized to size["shortest_edge"], with
+            the longest edge resized to keep the input aspect ratio. Can be overridden by `size` in the `preprocess`
+            method.
+        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`):
+            Resampling filter to use if resizing the image. Can be overridden by `resample` in the `preprocess` method.
+        do_thumbnail (`bool`, *optional*, defaults to `True`):
+            Whether to resize the image using thumbnail method.
+        do_align_long_axis (`bool`, *optional*, defaults to `False`):
+            Whether to align the long axis of the image with the long axis of `size` by rotating by 90 degrees.
+        do_pad (`bool`, *optional*, defaults to `True`):
+            Whether to pad the image. If `random_padding` is set to `True` in `preprocess`, each image is padded with a
+            random amount of padding on each side, up to the largest image size in the batch. Otherwise, all images are
+            padded to the largest image size in the batch.
+        do_rescale (`bool`, *optional*, defaults to `True`):
+            Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in
+            the `preprocess` method.
+        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
+            Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess`
+            method.
+        do_normalize (`bool`, *optional*, defaults to `True`):
+            Whether to normalize the image. Can be overridden by `do_normalize` in the `preprocess` method.
+        image_mean (`float` or `list[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`):
+            Mean to use if normalizing the image. This is a float or list of floats the length of the number of
+            channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
+        image_std (`float` or `list[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`):
+            Image standard deviation.
+    """
+
+    model_input_names = ["pixel_values"]
+
+    def __init__(
+        self,
+        do_resize: bool = True,
+        size: Optional[dict[str, int]] = None,
+        resample: PILImageResampling = PILImageResampling.BILINEAR,
+        do_thumbnail: bool = True,
+        do_align_long_axis: bool = False,
+        do_pad: bool = True,
+        do_rescale: bool = True,
+        rescale_factor: Union[int, float] = 1 / 255,
+        do_normalize: bool = True,
+        image_mean: Optional[Union[float, list[float]]] = None,
+        image_std: Optional[Union[float, list[float]]] = None,
+        **kwargs,
+    ) -> None:
+        super().__init__(**kwargs)
+
+        size = size if size is not None else {"height": 2560, "width": 1920}
+        if isinstance(size, (tuple, list)):
+            # The previous feature extractor size parameter was in (width, height) format
+            size = size[::-1]
+        size = get_size_dict(size)
+
+        self.do_resize = do_resize
+        self.size = size
+        self.resample = resample
+        self.do_thumbnail = do_thumbnail
+        self.do_align_long_axis = do_align_long_axis
+        self.do_pad = do_pad
+        self.do_rescale = do_rescale
+        self.rescale_factor = rescale_factor
+        self.do_normalize = do_normalize
+        self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN
+        self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD
+
+    def align_long_axis(
+        self,
+        image: np.ndarray,
+        size: dict[str, int],
+        data_format: Optional[Union[str, ChannelDimension]] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    ) -> np.ndarray:
+        """
+        Rotate the image by 90 degrees when its orientation (portrait/landscape) differs from that of `size`.
+
+        Args:
+            image (`np.ndarray`):
+                The image to be aligned.
+            size (`dict[str, int]`):
+                The target size `{"height": h, "width": w}` whose long axis the image is aligned to.
+            data_format (`str` or `ChannelDimension`, *optional*):
+                The channel dimension format of the output image. If unset, the input format is kept.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format of the input image. If not provided, it is inferred from `image`.
+
+        Returns:
+            `np.ndarray`: The image, rotated over its height/width axes if needed and otherwise unchanged.
+        """
+        in_height, in_width = get_image_size(image, channel_dim=input_data_format)
+        target_height, target_width = size["height"], size["width"]
+
+        if input_data_format is None:
+            input_data_format = infer_channel_dimension_format(image)
+
+        # Locate the (height, width) axes for the channel layout; any other layout is rejected.
+        if input_data_format == ChannelDimension.FIRST:
+            spatial_axes = (1, 2)
+        elif input_data_format == ChannelDimension.LAST:
+            spatial_axes = (0, 1)
+        else:
+            raise ValueError(f"Unsupported data format: {input_data_format}")
+
+        target_is_portrait = target_width < target_height
+        target_is_landscape = target_width > target_height
+        if (target_is_portrait and in_width > in_height) or (target_is_landscape and in_width < in_height):
+            # Three quarter turns in the height/width plane swap the two spatial dimensions.
+            image = np.rot90(image, 3, axes=spatial_axes)
+
+        if data_format is None:
+            return image
+        return to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
+
+    def pad_image(
+        self,
+        image: np.ndarray,
+        size: dict[str, int],
+        random_padding: bool = False,
+        data_format: Optional[Union[str, ChannelDimension]] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    ) -> np.ndarray:
+        """
+        Pad the image up to the specified size, either centered or at a random offset.
+
+        Args:
+            image (`np.ndarray`):
+                The image to be padded.
+            size (`dict[str, int]`):
+                The size `{"height": h, "width": w}` to pad the image to.
+            random_padding (`bool`, *optional*, defaults to `False`):
+                Whether to draw the top/left padding at random instead of centering the image.
+            data_format (`str` or `ChannelDimension`, *optional*):
+                The data format of the output image. If unset, the same format as the input image is used.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format of the input image. If not provided, it will be inferred.
+
+        Returns:
+            `np.ndarray`: The padded image.
+        """
+        target_height, target_width = size["height"], size["width"]
+        image_height, image_width = get_image_size(image, channel_dim=input_data_format)
+        extra_rows = target_height - image_height
+        extra_cols = target_width - image_width
+
+        if not random_padding:
+            # Center the image; when the surplus is odd, the extra pixel goes to the bottom/right side.
+            pad_top, pad_left = extra_rows // 2, extra_cols // 2
+        else:
+            # Draw the top offset first, then the left one, each from the inclusive range [0, surplus].
+            pad_top = np.random.randint(low=0, high=extra_rows + 1)
+            pad_left = np.random.randint(low=0, high=extra_cols + 1)
+
+        padding = ((pad_top, extra_rows - pad_top), (pad_left, extra_cols - pad_left))
+        return pad(image, padding, data_format=data_format, input_data_format=input_data_format)
+
+    def pad(self, *args, **kwargs):  # Deprecated alias: logs a notice and forwards all arguments to `pad_image`.
+        logger.info("pad is deprecated and will be removed in version 4.27. Please use pad_image instead.")
+        return self.pad_image(*args, **kwargs)
+
+    def thumbnail(
+        self,
+        image: np.ndarray,
+        size: dict[str, int],
+        resample: PILImageResampling = PILImageResampling.BICUBIC,
+        data_format: Optional[Union[str, ChannelDimension]] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+        **kwargs,
+    ) -> np.ndarray:
+        """
+        Shrink the image to a thumbnail for `size`, keeping the aspect ratio of non-square inputs; never upscales.
+
+        Args:
+            image (`np.ndarray`):
+                The image to be resized.
+            size (`dict[str, int]`):
+                The size `{"height": h, "width": w}` to resize the image to.
+            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
+                The resampling filter to use.
+            data_format (`Optional[Union[str, ChannelDimension]]`, *optional*):
+                The data format of the output image. If unset, the same format as the input image is used.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format of the input image. If not provided, it will be inferred.
+
+        Returns:
+            `np.ndarray`: The resized image, or the input itself when neither dimension exceeds `size`.
+        """
+        in_height, in_width = get_image_size(image, channel_dim=input_data_format)
+        new_height = min(in_height, size["height"])
+        new_width = min(in_width, size["width"])
+
+        if (new_height, new_width) == (in_height, in_width):
+            return image
+
+        # Recompute the shorter side from the longer one so the aspect ratio is kept (truncating to int).
+        if in_height > in_width:
+            new_width = int(in_width * new_height / in_height)
+        elif in_width > in_height:
+            new_height = int(in_height * new_width / in_width)
+
+        return resize(
+            image,
+            size=(new_height, new_width),
+            resample=resample,
+            reducing_gap=2.0,
+            data_format=data_format,
+            input_data_format=input_data_format,
+            **kwargs,
+        )
+
+    def resize(
+        self,
+        image: np.ndarray,
+        size: dict[str, int],
+        resample: PILImageResampling = PILImageResampling.BICUBIC,
+        data_format: Optional[Union[str, ChannelDimension]] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+        **kwargs,
+    ) -> np.ndarray:
+        """
+        Resize `image` with the PIL library, using `min(size["height"], size["width"])` as the target for its
+        shortest edge and returning the resized image.
+
+        Args:
+            image (`np.ndarray`):
+                Image to resize.
+            size (`dict[str, int]`):
+                Target size; only the smaller of its `"height"` and `"width"` values is used.
+            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
+                Resampling filter to use when resizing the image.
+            data_format (`str` or `ChannelDimension`, *optional*):
+                The channel dimension format of the image. If not provided, it will be the same as the input image.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format of the input image. If not provided, it will be inferred.
+        """
+        size_dict = get_size_dict(size)
+        target_shortest_edge = min(size_dict["height"], size_dict["width"])
+        new_size = get_resize_output_image_size(
+            image, size=target_shortest_edge, default_to_square=False, input_data_format=input_data_format
+        )
+        return resize(
+            image,
+            size=new_size,
+            resample=resample,
+            data_format=data_format,
+            input_data_format=input_data_format,
+            **kwargs,
+        )
+
+    @filter_out_non_signature_kwargs()
+    def preprocess(
+        self,
+        images: ImageInput,
+        do_resize: Optional[bool] = None,
+        size: Optional[dict[str, int]] = None,
+        resample: Optional[PILImageResampling] = None,
+        do_thumbnail: Optional[bool] = None,
+        do_align_long_axis: Optional[bool] = None,
+        do_pad: Optional[bool] = None,
+        random_padding: bool = False,
+        do_rescale: Optional[bool] = None,
+        rescale_factor: Optional[float] = None,
+        do_normalize: Optional[bool] = None,
+        image_mean: Optional[Union[float, list[float]]] = None,
+        image_std: Optional[Union[float, list[float]]] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    ) -> BatchFeature:
+        """
+        Preprocess an image or batch of images.
+
+        Args:
+            images (`ImageInput`):
+                Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
+                passing in images with pixel values between 0 and 1, set `do_rescale=False`.
+            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
+                Whether to resize the image.
+            size (`dict[str, int]`, *optional*, defaults to `self.size`):
+                Size of the image after resizing. Shortest edge of the image is resized to min(size["height"],
+                size["width"]) with the longest edge resized to keep the input aspect ratio.
+            resample (`int`, *optional*, defaults to `self.resample`):
+                Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only
+                has an effect if `do_resize` is set to `True`.
+            do_thumbnail (`bool`, *optional*, defaults to `self.do_thumbnail`):
+                Whether to resize the image using thumbnail method.
+            do_align_long_axis (`bool`, *optional*, defaults to `self.do_align_long_axis`):
+                Whether to align the long axis of the image with the long axis of `size` by rotating by 90 degrees.
+            do_pad (`bool`, *optional*, defaults to `self.do_pad`):
+                Whether to pad the image to `size`. The image is centered unless `random_padding` is `True`.
+            random_padding (`bool`, *optional*, defaults to `False`):
+                Whether to use random padding when padding the image. If `True`, the top and left padding of each
+                image are drawn at random instead of centering the image within `size`.
+            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
+                Whether to rescale the image pixel values.
+            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
+                Rescale factor to rescale the image by if `do_rescale` is set to `True`.
+            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
+                Whether to normalize the image.
+            image_mean (`float` or `list[float]`, *optional*, defaults to `self.image_mean`):
+                Image mean to use for normalization.
+            image_std (`float` or `list[float]`, *optional*, defaults to `self.image_std`):
+                Image standard deviation to use for normalization.
+            return_tensors (`str` or `TensorType`, *optional*):
+                The type of tensors to return. Can be one of:
+                - Unset: Return a list of `np.ndarray`.
+                - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
+                - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
+                - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
+                - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
+            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
+                The channel dimension format for the output image. Can be one of:
+                - `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - Unset: defaults to the channel dimension format of the input image.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the input image. If unset, the channel dimension format is inferred
+                from the input image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+
+        Returns:
+            `BatchFeature`: A feature batch holding the processed images under the `"pixel_values"` key.
+        """
+        do_resize = do_resize if do_resize is not None else self.do_resize
+        size = size if size is not None else self.size
+        if isinstance(size, (tuple, list)):
+            # Previous feature extractor had size in (width, height) format
+            size = size[::-1]
+        size = get_size_dict(size)
+        resample = resample if resample is not None else self.resample
+        do_thumbnail = do_thumbnail if do_thumbnail is not None else self.do_thumbnail
+        do_align_long_axis = do_align_long_axis if do_align_long_axis is not None else self.do_align_long_axis
+        do_pad = do_pad if do_pad is not None else self.do_pad
+        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
+        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
+        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
+        image_mean = image_mean if image_mean is not None else self.image_mean
+        image_std = image_std if image_std is not None else self.image_std
+
+        images = make_list_of_images(images)
+
+        if not valid_images(images):
+            raise ValueError(
+                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
+                "torch.Tensor, tf.Tensor or jax.ndarray."
+            )
+        validate_preprocess_arguments(
+            do_rescale=do_rescale,
+            rescale_factor=rescale_factor,
+            do_normalize=do_normalize,
+            image_mean=image_mean,
+            image_std=image_std,
+            do_pad=do_pad,
+            size_divisibility=size,  # There is no pad divisibility in this processor, but pad requires the size arg.
+            do_resize=do_resize,
+            size=size,
+            resample=resample,
+        )
+
+        images = [convert_to_rgb(image) for image in images]
+
+        # All transformations expect numpy arrays.
+        images = [to_numpy_array(image) for image in images]
+
+        if do_rescale and is_scaled_image(images[0]):
+            logger.warning_once(
+                "It looks like you are trying to rescale already rescaled images. If the input"
+                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
+            )
+
+        if input_data_format is None:
+            # We assume that all images have the same channel dimension format.
+            input_data_format = infer_channel_dimension_format(images[0])
+
+        if do_align_long_axis:
+            images = [self.align_long_axis(image, size=size, input_data_format=input_data_format) for image in images]
+
+        if do_resize:
+            images = [
+                self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format)
+                for image in images
+            ]
+
+        if do_thumbnail:
+            images = [self.thumbnail(image=image, size=size, input_data_format=input_data_format) for image in images]
+
+        if do_pad:
+            images = [
+                self.pad_image(
+                    image=image, size=size, random_padding=random_padding, input_data_format=input_data_format
+                )
+                for image in images
+            ]
+
+        if do_rescale:
+            images = [
+                self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format)
+                for image in images
+            ]
+
+        if do_normalize:
+            images = [
+                self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format)
+                for image in images
+            ]
+
+        images = [
+            to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images
+        ]
+
+        return BatchFeature(data={"pixel_values": images}, tensor_type=return_tensors)
+
+
+__all__ = ["DonutImageProcessor"]
diff --git a/phivenv/Lib/site-packages/transformers/models/donut/image_processing_donut_fast.py b/phivenv/Lib/site-packages/transformers/models/donut/image_processing_donut_fast.py
new file mode 100644
index 0000000000000000000000000000000000000000..8ec0235544170b941a63930816370c1f7934cf48
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/donut/image_processing_donut_fast.py
@@ -0,0 +1,265 @@
+# coding=utf-8
+# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Fast Image processor class for Donut."""
+
+from typing import Optional, Union
+
+from ...image_processing_utils_fast import BaseImageProcessorFast, BatchFeature, DefaultFastImageProcessorKwargs
+from ...image_transforms import group_images_by_shape, reorder_images
+from ...image_utils import IMAGENET_STANDARD_MEAN, IMAGENET_STANDARD_STD, ImageInput, PILImageResampling, SizeDict
+from ...processing_utils import Unpack
+from ...utils import (
+    TensorType,
+    auto_docstring,
+    is_torch_available,
+    is_torchvision_available,
+    is_torchvision_v2_available,
+    logging,
+)
+
+
+logger = logging.get_logger(__name__)
+
+if is_torch_available():
+    import torch
+
+if is_torchvision_available():
+    if is_torchvision_v2_available():
+        from torchvision.transforms.v2 import functional as F
+    else:
+        from torchvision.transforms import functional as F
+
+
+class DonutFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
+    """
+    Args:
+        do_thumbnail (`bool`, *optional*, defaults to `self.do_thumbnail`):
+            Whether to resize the image using thumbnail method.
+        do_align_long_axis (`bool`, *optional*, defaults to `self.do_align_long_axis`):
+            Whether to align the long axis of the image with the long axis of `size` by rotating by 90 degrees.
+        do_pad (`bool`, *optional*, defaults to `self.do_pad`):
+            Whether to pad the image to `size`. The fast image processor splits the padding evenly between
+            opposite sides (its `_preprocess` always calls `pad_image` with `random_padding=False`); random
+            padding is not exposed as a keyword argument here.
+    """
+
+    do_thumbnail: Optional[bool]
+    do_align_long_axis: Optional[bool]
+    do_pad: Optional[bool]
+
+
+@auto_docstring
+class DonutImageProcessorFast(BaseImageProcessorFast):
+    resample = PILImageResampling.BILINEAR
+    image_mean = IMAGENET_STANDARD_MEAN
+    image_std = IMAGENET_STANDARD_STD
+    size = {"height": 2560, "width": 1920}
+    do_resize = True
+    do_rescale = True
+    do_normalize = True
+    do_thumbnail = True
+    do_align_long_axis = False
+    do_pad = True
+    valid_kwargs = DonutFastImageProcessorKwargs
+
+    def __init__(self, **kwargs: Unpack[DonutFastImageProcessorKwargs]):
+        size = kwargs.pop("size", None)
+        # Legacy configurations give `size` as a (width, height) sequence; flip it to (height, width).
+        if isinstance(size, (tuple, list)):
+            size = size[::-1]
+        kwargs["size"] = size
+        super().__init__(**kwargs)
+
+    @auto_docstring
+    def preprocess(self, images: ImageInput, **kwargs: Unpack[DonutFastImageProcessorKwargs]) -> BatchFeature:
+        if "size" in kwargs:
+            size = kwargs.pop("size")
+            # Same tuple/list reversal as in `__init__`, applied to a per-call `size` override.
+            if isinstance(size, (tuple, list)):
+                size = size[::-1]
+            kwargs["size"] = size
+        return super().preprocess(images, **kwargs)
+
+    def align_long_axis(
+        self,
+        image: "torch.Tensor",
+        size: SizeDict,
+    ) -> "torch.Tensor":
+        """
+        Align the long axis of the image to the longest axis of the specified size.
+
+        Args:
+            image (`torch.Tensor`):
+                The image to be aligned.
+            size (`SizeDict`):
+                The size (`height`, `width`) to align the long axis to.
+
+        Returns:
+            `torch.Tensor`: The aligned image.
+        """
+        input_height, input_width = image.shape[-2:]
+        output_height, output_width = size.height, size.width
+
+        if (output_width < output_height and input_width > input_height) or (
+            output_width > output_height and input_width < input_height
+        ):
+            height_dim, width_dim = image.dim() - 2, image.dim() - 1
+            image = torch.rot90(image, 3, dims=[height_dim, width_dim])
+
+        return image
+
+    def pad_image(
+        self,
+        image: "torch.Tensor",
+        size: SizeDict,
+        random_padding: bool = False,
+    ) -> "torch.Tensor":
+        """
+        Pad the image to the specified size.
+
+        Args:
+            image (`torch.Tensor`):
+                The image to be padded.
+            size (`SizeDict`):
+                The size (`height`, `width`) to pad the image to.
+            random_padding (`bool`, *optional*, defaults to `False`):
+                Whether to use random padding or not.
+
+        Returns:
+            `torch.Tensor`: The padded image.
+        """
+        output_height, output_width = size.height, size.width
+        input_height, input_width = image.shape[-2:]
+
+        delta_width = output_width - input_width
+        delta_height = output_height - input_height
+
+        if random_padding:
+            # `torch.randint` requires an explicit shape; draw one value and unwrap it to a Python int.
+            pad_top = torch.randint(0, delta_height + 1, (1,)).item()
+            pad_left = torch.randint(0, delta_width + 1, (1,)).item()
+        else:
+            pad_top = delta_height // 2
+            pad_left = delta_width // 2
+
+        pad_bottom = delta_height - pad_top
+        pad_right = delta_width - pad_left
+
+        padding = (pad_left, pad_top, pad_right, pad_bottom)
+        return F.pad(image, padding)
+
+    def pad(self, *args, **kwargs):
+        """Deprecated alias of `pad_image`; logs a notice and forwards all arguments."""
+        logger.info("pad is deprecated and will be removed in version 4.27. Please use pad_image instead.")
+        return self.pad_image(*args, **kwargs)
+
+    def thumbnail(
+        self,
+        image: "torch.Tensor",
+        size: SizeDict,
+    ) -> "torch.Tensor":
+        """
+        Resize the image to make a thumbnail. The image is never upscaled, and for non-square inputs the shorter
+        side is recomputed from the longer one so that the aspect ratio is kept.
+
+        Args:
+            image (`torch.Tensor`):
+                The image to be resized.
+            size (`SizeDict`):
+                The size (`height`, `width`) to resize the image to.
+
+        Returns:
+            `torch.Tensor`: The image resized with bicubic interpolation, or the input itself if it already fits.
+        """
+        input_height, input_width = image.shape[-2:]
+        output_height, output_width = size.height, size.width
+
+        # We always resize to the smallest of either the input or output size.
+        height = min(input_height, output_height)
+        width = min(input_width, output_width)
+
+        if height == input_height and width == input_width:
+            return image
+
+        if input_height > input_width:
+            width = int(input_width * height / input_height)
+        elif input_width > input_height:
+            height = int(input_height * width / input_width)
+
+        return self.resize(
+            image,
+            size=SizeDict(width=width, height=height),
+            interpolation=F.InterpolationMode.BICUBIC,
+        )
+
+    def _preprocess(
+        self,
+        images: list["torch.Tensor"],
+        do_resize: bool,
+        do_thumbnail: bool,
+        do_align_long_axis: bool,
+        do_pad: bool,
+        size: SizeDict,
+        interpolation: Optional["F.InterpolationMode"],
+        do_center_crop: bool,
+        crop_size: SizeDict,
+        do_rescale: bool,
+        rescale_factor: float,
+        do_normalize: bool,
+        image_mean: Optional[Union[float, list[float]]],
+        image_std: Optional[Union[float, list[float]]],
+        disable_grouping: Optional[bool],
+        return_tensors: Optional[Union[str, TensorType]],
+        **kwargs,
+    ) -> BatchFeature:
+        # Group images by size for batched resizing
+        grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping)
+        resized_images_grouped = {}
+        for shape, stacked_images in grouped_images.items():
+            if do_align_long_axis:
+                stacked_images = self.align_long_axis(image=stacked_images, size=size)
+            if do_resize:
+                shortest_edge = min(size.height, size.width)
+                stacked_images = self.resize(
+                    image=stacked_images, size=SizeDict(shortest_edge=shortest_edge), interpolation=interpolation
+                )
+            if do_thumbnail:
+                stacked_images = self.thumbnail(image=stacked_images, size=size)
+            if do_pad:
+                stacked_images = self.pad_image(image=stacked_images, size=size, random_padding=False)
+
+            resized_images_grouped[shape] = stacked_images
+        resized_images = reorder_images(resized_images_grouped, grouped_images_index)
+
+        # Group images by size for further processing
+        # Needed in case do_resize is False, or resize returns images with different sizes
+        grouped_images, grouped_images_index = group_images_by_shape(resized_images, disable_grouping=disable_grouping)
+        processed_images_grouped = {}
+        for shape, stacked_images in grouped_images.items():
+            if do_center_crop:
+                stacked_images = self.center_crop(stacked_images, crop_size)
+            # Fused rescale and normalize
+            stacked_images = self.rescale_and_normalize(
+                stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std
+            )
+            processed_images_grouped[shape] = stacked_images
+
+        processed_images = reorder_images(processed_images_grouped, grouped_images_index)
+        processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images
+
+        return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors)
+
+
+__all__ = ["DonutImageProcessorFast"]
diff --git a/phivenv/Lib/site-packages/transformers/models/donut/modeling_donut_swin.py b/phivenv/Lib/site-packages/transformers/models/donut/modeling_donut_swin.py
new file mode 100644
index 0000000000000000000000000000000000000000..3126e88f251a7cba81da633dddd6974a2e01bd4a
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/donut/modeling_donut_swin.py
@@ -0,0 +1,1032 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch Donut Swin Transformer model.
+
+This implementation is identical to a regular Swin Transformer, without final layer norm on top of the final hidden
+states."""
+
+import collections.abc
+import math
+from dataclasses import dataclass
+from typing import Optional, Union
+
+import torch
+import torch.utils.checkpoint
+from torch import nn
+
+from ...activations import ACT2FN
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_utils import PreTrainedModel
+from ...pytorch_utils import find_pruneable_heads_and_indices, meshgrid, prune_linear_layer
+from ...utils import ModelOutput, auto_docstring, logging, torch_int
+from .configuration_donut_swin import DonutSwinConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    DonutSwin encoder's outputs, with potential hidden states and attentions.
+    """
+)
+# Copied from transformers.models.swin.modeling_swin.SwinEncoderOutput with Swin->DonutSwin
+class DonutSwinEncoderOutput(ModelOutput):
+    r"""
+    reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
+        shape `(batch_size, hidden_size, height, width)`.
+
+        Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
+        include the spatial dimensions.
+    """
+
+    # Every field is optional and defaults to None, so an instance can be built with only the outputs
+    # that were actually computed.
+    # NOTE(review): the code that fills these fields is outside this block; only `reshaped_hidden_states`
+    # is described explicitly in the docstring above.
+    last_hidden_state: Optional[torch.FloatTensor] = None
+    # Tuple of tensors when present.
+    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
+    # Tuple of tensors when present.
+    attentions: Optional[tuple[torch.FloatTensor, ...]] = None
+    # Same content as described in the class docstring: hidden states carrying explicit spatial dimensions.
+    reshaped_hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    DonutSwin model's outputs that also contains a pooling of the last hidden states.
+    """
+)
+# Copied from transformers.models.swin.modeling_swin.SwinModelOutput with Swin->DonutSwin
+class DonutSwinModelOutput(ModelOutput):
+    r"""
+    pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`, *optional*, returned when `add_pooling_layer=True` is passed):
+        Average pooling of the last layer hidden-state.
+    reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
+        shape `(batch_size, hidden_size, height, width)`.
+
+        Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
+        include the spatial dimensions.
+    """
+
+    # Every field is optional and defaults to None, so an instance can be built with only the outputs
+    # that were actually computed.
+    # NOTE(review): the code that fills these fields is outside this block; `pooler_output` and
+    # `reshaped_hidden_states` are the two fields described in the docstring above.
+    last_hidden_state: Optional[torch.FloatTensor] = None
+    # Pooled summary of the last hidden state (see docstring).
+    pooler_output: Optional[torch.FloatTensor] = None
+    # Tuple of tensors when present.
+    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
+    # Tuple of tensors when present.
+    attentions: Optional[tuple[torch.FloatTensor, ...]] = None
+    # Hidden states carrying explicit spatial dimensions (see docstring).
+    reshaped_hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    DonutSwin outputs for image classification.
+    """
+)
+# Copied from transformers.models.swin.modeling_swin.SwinImageClassifierOutput with Swin->DonutSwin
+class DonutSwinImageClassifierOutput(ModelOutput):
+    r"""
+    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+        Classification (or regression if config.num_labels==1) loss.
+    logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
+        Classification (or regression if config.num_labels==1) scores (before SoftMax).
+    reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
+        shape `(batch_size, hidden_size, height, width)`.
+
+        Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
+        include the spatial dimensions.
+    """
+
+    # Every field is optional and defaults to None, so an instance can be built with only the outputs
+    # that were actually computed.
+    # Loss value (see docstring); stays None when it was not computed.
+    loss: Optional[torch.FloatTensor] = None
+    # Raw, un-normalised scores (see docstring).
+    logits: Optional[torch.FloatTensor] = None
+    # Tuple of tensors when present.
+    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
+    # Tuple of tensors when present.
+    attentions: Optional[tuple[torch.FloatTensor, ...]] = None
+    # Hidden states carrying explicit spatial dimensions (see docstring).
+    reshaped_hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
+
+
+# Copied from transformers.models.swin.modeling_swin.window_partition
+def window_partition(input_feature, window_size):
+    """
+    Partitions the given input into windows.
+
+    Args:
+        input_feature (`torch.Tensor`):
+            Channels-last feature map, unpacked here as `(batch_size, height, width, num_channels)`.
+        window_size:
+            Side length of the square windows. `height` and `width` must be multiples of it, otherwise the
+            `view` below cannot factor the spatial axes.
+
+    Returns:
+        `torch.Tensor`: windows of shape `(-1, window_size, window_size, num_channels)`; the leading axis
+        enumerates every window of every batch element.
+    """
+    batch_size, height, width, num_channels = input_feature.shape
+    # Factor each spatial axis into (number of windows, offset inside the window).
+    input_feature = input_feature.view(
+        batch_size, height // window_size, window_size, width // window_size, window_size, num_channels
+    )
+    # Bring the two window-grid axes next to each other, then fold them into the leading axis together with the batch.
+    windows = input_feature.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, num_channels)
+    return windows
+
+
+# Copied from transformers.models.swin.modeling_swin.window_reverse
+def window_reverse(windows, window_size, height, width):
+    """
+    Merges windows to produce higher resolution features.
+
+    This undoes the reshaping performed by `window_partition`.
+
+    Args:
+        windows (`torch.Tensor`):
+            Windows whose last axis is the channel axis; the tensor is viewed as
+            `(-1, height // window_size, width // window_size, window_size, window_size, num_channels)`.
+        window_size:
+            Side length of the square windows.
+        height:
+            Height of the feature map to rebuild (a multiple of `window_size`).
+        width:
+            Width of the feature map to rebuild (a multiple of `window_size`).
+
+    Returns:
+        `torch.Tensor`: feature map of shape `(-1, height, width, num_channels)`.
+    """
+    num_channels = windows.shape[-1]
+    # Recover the (rows of windows, columns of windows) grid from the flat leading axis.
+    windows = windows.view(-1, height // window_size, width // window_size, window_size, window_size, num_channels)
+    # Interleave the grid axes with the in-window offsets again and merge them into full height / width axes.
+    windows = windows.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, height, width, num_channels)
+    return windows
+
+
+# Copied from transformers.models.swin.modeling_swin.SwinEmbeddings with Swin->DonutSwin
+class DonutSwinEmbeddings(nn.Module):
+    """
+    Construct the patch and position embeddings. Optionally, also the mask token.
+
+    Args:
+        config:
+            Model configuration. This class reads `embed_dim`, `use_absolute_embeddings`, `hidden_dropout_prob`
+            and `patch_size` from it; the patch-embedding submodule reads further fields.
+        use_mask_token (`bool`, *optional*, defaults to `False`):
+            Whether to create the learnable `mask_token` that `forward` substitutes at masked positions.
+    """
+
+    def __init__(self, config, use_mask_token=False):
+        super().__init__()
+
+        self.patch_embeddings = DonutSwinPatchEmbeddings(config)
+        num_patches = self.patch_embeddings.num_patches
+        self.patch_grid = self.patch_embeddings.grid_size
+        # Zero-initialised learnable token; stays None unless explicitly requested.
+        self.mask_token = nn.Parameter(torch.zeros(1, 1, config.embed_dim)) if use_mask_token else None
+
+        if config.use_absolute_embeddings:
+            # NOTE(review): the table holds `num_patches + 1` positions, while `forward` adds it to the patch
+            # embeddings without prepending any extra token — confirm the shapes broadcast when this is enabled.
+            self.position_embeddings = nn.Parameter(torch.zeros(1, num_patches + 1, config.embed_dim))
+        else:
+            self.position_embeddings = None
+
+        self.norm = nn.LayerNorm(config.embed_dim)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.patch_size = config.patch_size
+        self.config = config
+
+    # Copied from transformers.models.vit.modeling_vit.ViTEmbeddings.interpolate_pos_encoding
+    def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
+        """
+        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
+        images. This method is also adapted to support torch.jit tracing.
+
+        Adapted from:
+        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
+        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
+        """
+
+        # Both counts drop one leading entry, i.e. this routine treats position 0 as a special (class) slot.
+        num_patches = embeddings.shape[1] - 1
+        num_positions = self.position_embeddings.shape[1] - 1
+
+        # always interpolate when tracing to ensure the exported model works for dynamic input shapes
+        if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
+            return self.position_embeddings
+
+        class_pos_embed = self.position_embeddings[:, :1]
+        patch_pos_embed = self.position_embeddings[:, 1:]
+
+        dim = embeddings.shape[-1]
+
+        # Target grid size, in patches.
+        # NOTE(review): `self.patch_size` is used as a plain divisor here — assumes `config.patch_size` is an int.
+        new_height = height // self.patch_size
+        new_width = width // self.patch_size
+
+        # The stored positions are laid out as a square grid before resampling.
+        sqrt_num_positions = torch_int(num_positions**0.5)
+        patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
+        # Channels-first layout, as expected by `nn.functional.interpolate`.
+        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)
+
+        patch_pos_embed = nn.functional.interpolate(
+            patch_pos_embed,
+            size=(new_height, new_width),
+            mode="bicubic",
+            align_corners=False,
+        )
+
+        # Back to a flat (1, new_height * new_width, dim) sequence.
+        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
+
+        return torch.cat((class_pos_embed, patch_pos_embed), dim=1)
+
+    def forward(
+        self,
+        pixel_values: Optional[torch.FloatTensor],
+        bool_masked_pos: Optional[torch.BoolTensor] = None,
+        interpolate_pos_encoding: bool = False,
+    ) -> tuple[torch.Tensor]:
+        """
+        Embed an image batch into a normalised patch sequence.
+
+        Args:
+            pixel_values (`torch.FloatTensor`):
+                Image batch, unpacked here as `(batch_size, num_channels, height, width)`.
+            bool_masked_pos (`torch.BoolTensor`, *optional*):
+                Mask selecting the patches to replace with `mask_token`; presumably of shape
+                `(batch_size, seq_len)` — it is given a trailing axis and broadcast against the embeddings.
+            interpolate_pos_encoding (`bool`, *optional*, defaults to `False`):
+                Whether to resample the absolute position table for the input resolution. Only consulted when
+                absolute position embeddings exist.
+
+        Returns:
+            A 2-tuple `(embeddings, output_dimensions)` where `embeddings` has shape
+            `(batch_size, seq_len, embed_dim)` and `output_dimensions` is the `(height, width)` of the patch grid
+            reported by the patch-embedding module.
+        """
+        _, num_channels, height, width = pixel_values.shape
+        embeddings, output_dimensions = self.patch_embeddings(pixel_values)
+        embeddings = self.norm(embeddings)
+        batch_size, seq_len, _ = embeddings.size()
+
+        if bool_masked_pos is not None:
+            # NOTE(review): `self.mask_token` is None unless the module was built with `use_mask_token=True`,
+            # in which case this line raises — confirm callers only pass a mask to mask-enabled models.
+            mask_tokens = self.mask_token.expand(batch_size, seq_len, -1)
+            # replace the masked visual tokens by mask_tokens
+            mask = bool_masked_pos.unsqueeze(-1).type_as(mask_tokens)
+            embeddings = embeddings * (1.0 - mask) + mask_tokens * mask
+
+        if self.position_embeddings is not None:
+            if interpolate_pos_encoding:
+                embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
+            else:
+                embeddings = embeddings + self.position_embeddings
+
+        embeddings = self.dropout(embeddings)
+
+        return embeddings, output_dimensions
+
+
+# Copied from transformers.models.swin.modeling_swin.SwinPatchEmbeddings with Swin->DonutSwin
+class DonutSwinPatchEmbeddings(nn.Module):
+    """
+    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
+    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
+    Transformer.
+
+    Reads `image_size`, `patch_size`, `num_channels` and `embed_dim` from `config`. Scalar `image_size` /
+    `patch_size` values are expanded to `(value, value)` pairs.
+    """
+
+    def __init__(self, config):
+        super().__init__()
+        image_size, patch_size = config.image_size, config.patch_size
+        num_channels, hidden_size = config.num_channels, config.embed_dim
+        # Normalise both sizes to 2-element (height, width) pairs.
+        image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
+        patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
+        # Number of whole patches that fit in the configured image size.
+        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
+        self.image_size = image_size
+        self.patch_size = patch_size
+        self.num_channels = num_channels
+        self.num_patches = num_patches
+        self.grid_size = (image_size[0] // patch_size[0], image_size[1] // patch_size[1])
+
+        # Non-overlapping convolution: kernel and stride both equal the patch size, so each patch yields one vector.
+        self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)
+
+    def maybe_pad(self, pixel_values, height, width):
+        """Pad `pixel_values` on the right / bottom so `width` / `height` become multiples of the patch size."""
+        if width % self.patch_size[1] != 0:
+            # A 2-element pad tuple addresses the last axis only: (left, right).
+            pad_values = (0, self.patch_size[1] - width % self.patch_size[1])
+            pixel_values = nn.functional.pad(pixel_values, pad_values)
+        if height % self.patch_size[0] != 0:
+            # A 4-element pad tuple addresses the last two axes: (left, right, top, bottom).
+            pad_values = (0, 0, 0, self.patch_size[0] - height % self.patch_size[0])
+            pixel_values = nn.functional.pad(pixel_values, pad_values)
+        return pixel_values
+
+    def forward(self, pixel_values: Optional[torch.FloatTensor]) -> tuple[torch.Tensor, tuple[int]]:
+        """
+        Project an image batch to patch embeddings.
+
+        Args:
+            pixel_values (`torch.FloatTensor`):
+                Image batch, unpacked here as `(batch_size, num_channels, height, width)`.
+
+        Returns:
+            A 2-tuple `(embeddings, output_dimensions)`: `embeddings` has shape
+            `(batch_size, height * width, hidden_size)` for the projected grid, and `output_dimensions` is that
+            grid's `(height, width)`.
+        """
+        _, num_channels, height, width = pixel_values.shape
+        # pad the input to be divisible by self.patch_size, if needed
+        pixel_values = self.maybe_pad(pixel_values, height, width)
+        embeddings = self.projection(pixel_values)
+        # From here on `height` / `width` describe the patch grid, not the input image.
+        _, _, height, width = embeddings.shape
+        output_dimensions = (height, width)
+        # (batch, hidden, H, W) -> (batch, H * W, hidden)
+        embeddings = embeddings.flatten(2).transpose(1, 2)
+
+        return embeddings, output_dimensions
+
+
+# Copied from transformers.models.swin.modeling_swin.SwinPatchMerging
+class DonutSwinPatchMerging(nn.Module):
+    """
+    Patch Merging Layer.
+
+    Halves each spatial axis by concatenating every 2x2 neighbourhood along the channel axis (4 * dim channels),
+    normalising, and projecting linearly to 2 * dim channels.
+
+    Args:
+        input_resolution (`tuple[int]`):
+            Resolution of input feature.
+        dim (`int`):
+            Number of input channels.
+        norm_layer (`nn.Module`, *optional*, defaults to `nn.LayerNorm`):
+            Normalization layer class.
+    """
+
+    def __init__(self, input_resolution: tuple[int], dim: int, norm_layer: nn.Module = nn.LayerNorm) -> None:
+        super().__init__()
+        # Stored for reference only; `forward` takes the actual resolution as an argument.
+        self.input_resolution = input_resolution
+        self.dim = dim
+        self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False)
+        self.norm = norm_layer(4 * dim)
+
+    def maybe_pad(self, input_feature, height, width):
+        """Pad a `(batch, height, width, channels)` tensor by one row / column where the size is odd."""
+        should_pad = (height % 2 == 1) or (width % 2 == 1)
+        if should_pad:
+            # Pad pairs run from the last axis backwards: channels (0, 0), width (0, width % 2), height (0, height % 2).
+            pad_values = (0, 0, 0, width % 2, 0, height % 2)
+            input_feature = nn.functional.pad(input_feature, pad_values)
+
+        return input_feature
+
+    def forward(self, input_feature: torch.Tensor, input_dimensions: tuple[int, int]) -> torch.Tensor:
+        """
+        Merge 2x2 patch neighbourhoods.
+
+        Args:
+            input_feature (`torch.Tensor`):
+                Sequence of shape `(batch_size, height * width, num_channels)`.
+            input_dimensions (`tuple[int, int]`):
+                The `(height, width)` grid that the sequence axis flattens.
+
+        Returns:
+            `torch.Tensor`: sequence of shape `(batch_size, ceil(height / 2) * ceil(width / 2), 2 * dim)`.
+        """
+        height, width = input_dimensions
+        # `dim` is height * width
+        batch_size, dim, num_channels = input_feature.shape
+
+        input_feature = input_feature.view(batch_size, height, width, num_channels)
+        # pad input to be divisible by width and height, if needed
+        input_feature = self.maybe_pad(input_feature, height, width)
+        # The four strided slices below pick the four corners of every 2x2 neighbourhood.
+        # [batch_size, height/2, width/2, num_channels]
+        input_feature_0 = input_feature[:, 0::2, 0::2, :]
+        # [batch_size, height/2, width/2, num_channels]
+        input_feature_1 = input_feature[:, 1::2, 0::2, :]
+        # [batch_size, height/2, width/2, num_channels]
+        input_feature_2 = input_feature[:, 0::2, 1::2, :]
+        # [batch_size, height/2, width/2, num_channels]
+        input_feature_3 = input_feature[:, 1::2, 1::2, :]
+        # batch_size height/2 width/2 4*num_channels
+        input_feature = torch.cat([input_feature_0, input_feature_1, input_feature_2, input_feature_3], -1)
+        input_feature = input_feature.view(batch_size, -1, 4 * num_channels)  # batch_size height/2*width/2 4*C
+
+        input_feature = self.norm(input_feature)
+        input_feature = self.reduction(input_feature)
+
+        return input_feature
+
+
+# Copied from transformers.models.beit.modeling_beit.drop_path
+def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
+    """
+    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
+
+    Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
+    however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
+    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
+    layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
+    argument.
+
+    Args:
+        input (`torch.Tensor`):
+            Tensor whose first axis indexes samples; any number of trailing axes is supported.
+        drop_prob (`float`, *optional*, defaults to 0.0):
+            Probability of zeroing a whole sample.
+        training (`bool`, *optional*, defaults to `False`):
+            The input is returned untouched unless this is `True`.
+
+    Returns:
+        `torch.Tensor`: `input` itself when nothing is dropped, otherwise a tensor of the same shape in which
+        surviving samples are scaled by `1 / (1 - drop_prob)`.
+    """
+    if drop_prob == 0.0 or not training:
+        return input
+    # NOTE(review): `drop_prob == 1.0` makes `keep_prob` zero and the division below degenerate — confirm
+    # callers never pass it.
+    keep_prob = 1 - drop_prob
+    shape = (input.shape[0],) + (1,) * (input.ndim - 1)  # work with diff dim tensors, not just 2D ConvNets
+    # Uniform noise in [keep_prob, 1 + keep_prob): flooring yields 1 with probability `keep_prob`, else 0.
+    random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
+    random_tensor.floor_()  # binarize
+    # Rescale survivors so the expected value of the output matches the input.
+    output = input.div(keep_prob) * random_tensor
+    return output
+
+
+# Copied from transformers.models.swin.modeling_swin.SwinDropPath
+class DonutSwinDropPath(nn.Module):
+    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
+
+    Thin module wrapper around `drop_path` that supplies the module's own `training` flag.
+    """
+
+    def __init__(self, drop_prob: Optional[float] = None) -> None:
+        super().__init__()
+        # NOTE(review): with the default `None`, `drop_path` only works outside training (it returns early);
+        # in training mode it would evaluate `1 - None`. Confirm the module is always built with a float.
+        self.drop_prob = drop_prob
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        """Apply stochastic depth to `hidden_states`; a no-op when the module is not in training mode."""
+        return drop_path(hidden_states, self.drop_prob, self.training)
+
+    def extra_repr(self) -> str:
+        """Show the drop probability in the module's printed representation."""
+        return f"p={self.drop_prob}"
+
+
+# Copied from transformers.models.swin.modeling_swin.SwinSelfAttention with Swin->DonutSwin
+class DonutSwinSelfAttention(nn.Module):
+    """
+    Multi-head self-attention over the tokens of one window, with a learned relative position bias.
+
+    Args:
+        config:
+            Model configuration; `qkv_bias` and `attention_probs_dropout_prob` are read here.
+        dim (`int`):
+            Channel width of the incoming tokens. Must be a multiple of `num_heads`.
+        num_heads (`int`):
+            Number of attention heads.
+        window_size:
+            Window side length; a scalar is expanded to `(window_size, window_size)`.
+
+    Raises:
+        ValueError: if `dim` is not divisible by `num_heads`.
+    """
+
+    def __init__(self, config, dim, num_heads, window_size):
+        super().__init__()
+        if dim % num_heads != 0:
+            raise ValueError(
+                f"The hidden size ({dim}) is not a multiple of the number of attention heads ({num_heads})"
+            )
+
+        self.num_attention_heads = num_heads
+        self.attention_head_size = int(dim / num_heads)
+        # Equals `dim`, given the divisibility check above.
+        self.all_head_size = self.num_attention_heads * self.attention_head_size
+        self.window_size = (
+            window_size if isinstance(window_size, collections.abc.Iterable) else (window_size, window_size)
+        )
+
+        # One learnable bias per head for every possible (row offset, column offset) between two tokens of a
+        # window: offsets span [-(W - 1), W - 1] on each axis, hence (2 * Wh - 1) * (2 * Ww - 1) rows.
+        self.relative_position_bias_table = nn.Parameter(
+            torch.zeros((2 * self.window_size[0] - 1) * (2 * self.window_size[1] - 1), num_heads)
+        )
+
+        # get pair-wise relative position index for each token inside the window
+        coords_h = torch.arange(self.window_size[0])
+        coords_w = torch.arange(self.window_size[1])
+        # (2, Wh, Ww): row and column coordinate of every token.
+        coords = torch.stack(meshgrid([coords_h, coords_w], indexing="ij"))
+        # (2, Wh * Ww)
+        coords_flatten = torch.flatten(coords, 1)
+        # (2, Wh * Ww, Wh * Ww): coordinate difference between every pair of tokens.
+        relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]
+        # (Wh * Ww, Wh * Ww, 2)
+        relative_coords = relative_coords.permute(1, 2, 0).contiguous()
+        # Shift both offsets so they start at 0 ...
+        relative_coords[:, :, 0] += self.window_size[0] - 1
+        relative_coords[:, :, 1] += self.window_size[1] - 1
+        # ... then scale the row offset by the number of distinct column offsets so that row + column
+        # forms a unique flat index into the bias table.
+        relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
+        relative_position_index = relative_coords.sum(-1)
+        # Stored as a buffer: it is fixed index data, not a trainable parameter.
+        self.register_buffer("relative_position_index", relative_position_index)
+
+        self.query = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias)
+        self.key = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias)
+        self.value = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias)
+
+        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = False,
+    ) -> tuple[torch.Tensor]:
+        """
+        Attend within each window.
+
+        Args:
+            hidden_states (`torch.Tensor`):
+                3-D tensor of window tokens. Its second axis (named `dim` below) is the number of tokens per
+                window and must equal `window_size[0] * window_size[1]` for the position bias to broadcast.
+            attention_mask (`torch.FloatTensor`, *optional*):
+                Additive mask whose first axis indexes windows; the leading axis of `hidden_states` must be a
+                multiple of it.
+            head_mask (`torch.FloatTensor`, *optional*):
+                Multiplied into the attention probabilities.
+            output_attentions (`bool`, *optional*, defaults to `False`):
+                Whether to also return the attention probabilities.
+
+        Returns:
+            `(context_layer,)` or `(context_layer, attention_probs)`, where `context_layer` has the heads merged
+            back into a single channel axis of size `all_head_size`.
+        """
+        # NOTE: despite its name, `dim` is the sequence length (tokens per window), not a channel width.
+        batch_size, dim, num_channels = hidden_states.shape
+        hidden_shape = (batch_size, dim, -1, self.attention_head_size)
+
+        # (batch, tokens, channels) -> (batch, heads, tokens, head_size)
+        query_layer = self.query(hidden_states).view(hidden_shape).transpose(1, 2)
+        key_layer = self.key(hidden_states).view(hidden_shape).transpose(1, 2)
+        value_layer = self.value(hidden_states).view(hidden_shape).transpose(1, 2)
+
+        # Take the dot product between "query" and "key" to get the raw attention scores.
+        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
+
+        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
+
+        # Look up the bias of every token pair: (tokens * tokens, heads) -> (tokens, tokens, heads).
+        relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)]
+        relative_position_bias = relative_position_bias.view(
+            self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1
+        )
+
+        # (heads, tokens, tokens), broadcast over the leading axis of the scores.
+        relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous()
+        attention_scores = attention_scores + relative_position_bias.unsqueeze(0)
+
+        if attention_mask is not None:
+            # Apply the attention mask (computed by the calling layer for the shifted-window case)
+            mask_shape = attention_mask.shape[0]
+            # Expose the per-image window axis so the mask can broadcast over images and heads.
+            attention_scores = attention_scores.view(
+                batch_size // mask_shape, mask_shape, self.num_attention_heads, dim, dim
+            )
+            attention_scores = attention_scores + attention_mask.unsqueeze(1).unsqueeze(0)
+            attention_scores = attention_scores.view(-1, self.num_attention_heads, dim, dim)
+
+        # Normalize the attention scores to probabilities.
+        attention_probs = nn.functional.softmax(attention_scores, dim=-1)
+
+        # This is actually dropping out entire tokens to attend to, which might
+        # seem a bit unusual, but is taken from the original Transformer paper.
+        attention_probs = self.dropout(attention_probs)
+
+        # Mask heads if we want to
+        if head_mask is not None:
+            attention_probs = attention_probs * head_mask
+
+        context_layer = torch.matmul(attention_probs, value_layer)
+        # (batch, heads, tokens, head_size) -> (batch, tokens, heads, head_size) -> (batch, tokens, all_head_size)
+        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
+        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
+        context_layer = context_layer.view(new_context_layer_shape)
+
+        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
+
+        return outputs
+
+
+# Copied from transformers.models.swin.modeling_swin.SwinSelfOutput
+class DonutSwinSelfOutput(nn.Module):
+    """Output projection of the attention block: a `dim -> dim` linear layer followed by dropout."""
+
+    def __init__(self, config, dim):
+        super().__init__()
+        self.dense = nn.Linear(dim, dim)
+        # The dropout rate is taken from `attention_probs_dropout_prob`.
+        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
+
+    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
+        """
+        Project and regularise the attention context.
+
+        `input_tensor` is accepted but not used: no residual connection is applied here.
+        """
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+
+        return hidden_states
+
+
+# Copied from transformers.models.swin.modeling_swin.SwinAttention with Swin->DonutSwin
+class DonutSwinAttention(nn.Module):
+    """Window self-attention (`self.self`) followed by its output projection (`self.output`), with head pruning."""
+
+    def __init__(self, config, dim, num_heads, window_size):
+        super().__init__()
+        self.self = DonutSwinSelfAttention(config, dim, num_heads, window_size)
+        self.output = DonutSwinSelfOutput(config, dim)
+        # Heads removed so far; passed to `find_pruneable_heads_and_indices` on later pruning calls.
+        self.pruned_heads = set()
+
+    def prune_heads(self, heads):
+        """
+        Remove the given attention heads in place.
+
+        The query / key / value projections and the input side (`dim=1`) of the output projection are replaced
+        by pruned layers, and the head bookkeeping is updated. Does nothing when `heads` is empty.
+        """
+        if len(heads) == 0:
+            return
+        heads, index = find_pruneable_heads_and_indices(
+            heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
+        )
+
+        # Prune linear layers
+        self.self.query = prune_linear_layer(self.self.query, index)
+        self.self.key = prune_linear_layer(self.self.key, index)
+        self.self.value = prune_linear_layer(self.self.value, index)
+        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
+
+        # Update hyper params and store pruned heads
+        self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
+        self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
+        self.pruned_heads = self.pruned_heads.union(heads)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = False,
+    ) -> tuple[torch.Tensor]:
+        """
+        Run self-attention and the output projection.
+
+        Returns:
+            `(attention_output,)`, extended with whatever extra entries the self-attention module returned
+            (the attention probabilities when `output_attentions` is set).
+        """
+        self_outputs = self.self(hidden_states, attention_mask, head_mask, output_attentions)
+        attention_output = self.output(self_outputs[0], hidden_states)
+        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
+        return outputs
+
+
+# Copied from transformers.models.swin.modeling_swin.SwinIntermediate
+class DonutSwinIntermediate(nn.Module):
+    """Expanding half of the feed-forward block: linear `dim -> int(mlp_ratio * dim)` followed by an activation."""
+
+    def __init__(self, config, dim):
+        super().__init__()
+        self.dense = nn.Linear(dim, int(config.mlp_ratio * dim))
+        # `hidden_act` may be a name looked up in ACT2FN, or anything else, which is then used as the
+        # activation directly.
+        if isinstance(config.hidden_act, str):
+            self.intermediate_act_fn = ACT2FN[config.hidden_act]
+        else:
+            self.intermediate_act_fn = config.hidden_act
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        """Apply the linear expansion and the activation to `hidden_states`."""
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.intermediate_act_fn(hidden_states)
+        return hidden_states
+
+
+# Copied from transformers.models.swin.modeling_swin.SwinOutput
+class DonutSwinOutput(nn.Module):
+    """Contracting half of the feed-forward block: linear `int(mlp_ratio * dim) -> dim` followed by dropout."""
+
+    def __init__(self, config, dim):
+        super().__init__()
+        self.dense = nn.Linear(int(config.mlp_ratio * dim), dim)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        """Project `hidden_states` back to `dim` channels and apply dropout; no residual is added here."""
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        return hidden_states
+
+
+# Copied from transformers.models.swin.modeling_swin.SwinLayer with Swin->DonutSwin
+class DonutSwinLayer(nn.Module):
+    """
+    One Swin block: (optionally shifted) window attention and a feed-forward network, each preceded by a
+    LayerNorm and wrapped in a residual connection.
+
+    Args:
+        config:
+            Model configuration; `chunk_size_feed_forward`, `window_size` and `layer_norm_eps` are read here.
+        dim (`int`):
+            Channel width of the tokens.
+        input_resolution:
+            Stored on the module; `forward` receives the actual resolution as an argument.
+        num_heads (`int`):
+            Number of attention heads.
+        drop_path_rate (`float`, *optional*, defaults to 0.0):
+            Stochastic-depth rate for the attention branch; values `<= 0` install an identity instead.
+        shift_size (`int`, *optional*, defaults to 0):
+            Cyclic shift applied before window partitioning; 0 disables shifting.
+    """
+
+    def __init__(self, config, dim, input_resolution, num_heads, drop_path_rate=0.0, shift_size=0):
+        super().__init__()
+        self.chunk_size_feed_forward = config.chunk_size_feed_forward
+        self.shift_size = shift_size
+        self.window_size = config.window_size
+        self.input_resolution = input_resolution
+        self.layernorm_before = nn.LayerNorm(dim, eps=config.layer_norm_eps)
+        self.attention = DonutSwinAttention(config, dim, num_heads, window_size=self.window_size)
+        self.drop_path = DonutSwinDropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity()
+        self.layernorm_after = nn.LayerNorm(dim, eps=config.layer_norm_eps)
+        self.intermediate = DonutSwinIntermediate(config, dim)
+        self.output = DonutSwinOutput(config, dim)
+
+    def set_shift_and_window_size(self, input_resolution):
+        """
+        Disable shifting and shrink the window when the input is no larger than the window.
+
+        NOTE(review): this overwrites `self.shift_size` / `self.window_size` on the module, so the change
+        persists for subsequent calls — confirm that is intended when input sizes vary between calls.
+        """
+        if min(input_resolution) <= self.window_size:
+            # if window size is larger than input resolution, we don't partition windows
+            self.shift_size = torch_int(0)
+            self.window_size = (
+                torch.min(torch.tensor(input_resolution)) if torch.jit.is_tracing() else min(input_resolution)
+            )
+
+    def get_attn_mask(self, height, width, dtype, device):
+        """
+        Build the additive attention mask for shifted windows.
+
+        Returns `None` when no shift is applied. Otherwise returns a tensor with one
+        `(window_size**2, window_size**2)` matrix per window, holding 0.0 for token pairs that carry the same
+        region label and -100.0 for all other pairs.
+        """
+        if self.shift_size > 0:
+            # calculate attention mask for SW-MSA
+            img_mask = torch.zeros((1, height, width, 1), dtype=dtype, device=device)
+            # Three bands per axis: everything before the last window, the last window minus the shifted strip,
+            # and the shifted strip itself.
+            height_slices = (
+                slice(0, -self.window_size),
+                slice(-self.window_size, -self.shift_size),
+                slice(-self.shift_size, None),
+            )
+            width_slices = (
+                slice(0, -self.window_size),
+                slice(-self.window_size, -self.shift_size),
+                slice(-self.shift_size, None),
+            )
+            # Label each of the 3 x 3 regions with a distinct id.
+            count = 0
+            for height_slice in height_slices:
+                for width_slice in width_slices:
+                    img_mask[:, height_slice, width_slice, :] = count
+                    count += 1
+
+            mask_windows = window_partition(img_mask, self.window_size)
+            mask_windows = mask_windows.view(-1, self.window_size * self.window_size)
+            # Pairwise label difference: non-zero means the two tokens come from different regions.
+            attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
+            attn_mask = attn_mask.masked_fill(attn_mask != 0, -100.0).masked_fill(attn_mask == 0, 0.0)
+        else:
+            attn_mask = None
+        return attn_mask
+
+    def maybe_pad(self, hidden_states, height, width):
+        """
+        Pad a `(batch, height, width, channels)` tensor on the right / bottom to multiples of the window size.
+
+        Returns the padded tensor together with the pad tuple; index 3 is the right padding and index 5 the
+        bottom padding (both 0 when the size already fits).
+        """
+        pad_right = (self.window_size - width % self.window_size) % self.window_size
+        pad_bottom = (self.window_size - height % self.window_size) % self.window_size
+        # Pad pairs run from the last axis backwards: channels, width, height.
+        pad_values = (0, 0, 0, pad_right, 0, pad_bottom)
+        hidden_states = nn.functional.pad(hidden_states, pad_values)
+        return hidden_states, pad_values
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        input_dimensions: tuple[int, int],
+        head_mask: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = False,
+        always_partition: Optional[bool] = False,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """
+        Apply the block to a token sequence.
+
+        Args:
+            hidden_states (`torch.Tensor`):
+                Sequence of shape `(batch_size, height * width, channels)`.
+            input_dimensions (`tuple[int, int]`):
+                The `(height, width)` grid that the sequence axis flattens.
+            head_mask (`torch.FloatTensor`, *optional*):
+                Forwarded to the attention module.
+            output_attentions (`bool`, *optional*, defaults to `False`):
+                Whether to also return the attention probabilities.
+            always_partition (`bool`, *optional*, defaults to `False`):
+                When `True`, the window / shift sizes are left as configured even for small inputs.
+
+        Returns:
+            `(layer_output,)` or `(layer_output, attention_probs)`; `layer_output` has the same shape as
+            `hidden_states`.
+        """
+        if not always_partition:
+            self.set_shift_and_window_size(input_dimensions)
+        else:
+            pass
+        height, width = input_dimensions
+        batch_size, _, channels = hidden_states.size()
+        # Kept for the first residual connection.
+        shortcut = hidden_states
+
+        hidden_states = self.layernorm_before(hidden_states)
+
+        hidden_states = hidden_states.view(batch_size, height, width, channels)
+
+        # pad hidden_states to multiples of window size
+        hidden_states, pad_values = self.maybe_pad(hidden_states, height, width)
+
+        _, height_pad, width_pad, _ = hidden_states.shape
+        # cyclic shift
+        if self.shift_size > 0:
+            shifted_hidden_states = torch.roll(hidden_states, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2))
+        else:
+            shifted_hidden_states = hidden_states
+
+        # partition windows
+        hidden_states_windows = window_partition(shifted_hidden_states, self.window_size)
+        # Flatten each window into a token sequence for attention.
+        hidden_states_windows = hidden_states_windows.view(-1, self.window_size * self.window_size, channels)
+        attn_mask = self.get_attn_mask(
+            height_pad, width_pad, dtype=hidden_states.dtype, device=hidden_states_windows.device
+        )
+
+        attention_outputs = self.attention(
+            hidden_states_windows, attn_mask, head_mask, output_attentions=output_attentions
+        )
+
+        attention_output = attention_outputs[0]
+
+        # Back from token sequences to spatial windows, then to the full (padded) feature map.
+        attention_windows = attention_output.view(-1, self.window_size, self.window_size, channels)
+        shifted_windows = window_reverse(attention_windows, self.window_size, height_pad, width_pad)
+
+        # reverse cyclic shift
+        if self.shift_size > 0:
+            attention_windows = torch.roll(shifted_windows, shifts=(self.shift_size, self.shift_size), dims=(1, 2))
+        else:
+            attention_windows = shifted_windows
+
+        # Crop away the right / bottom padding added by `maybe_pad`.
+        was_padded = pad_values[3] > 0 or pad_values[5] > 0
+        if was_padded:
+            attention_windows = attention_windows[:, :height, :width, :].contiguous()
+
+        attention_windows = attention_windows.view(batch_size, height * width, channels)
+
+        # First residual: attention branch (with stochastic depth).
+        hidden_states = shortcut + self.drop_path(attention_windows)
+
+        # Second residual: feed-forward branch.
+        layer_output = self.layernorm_after(hidden_states)
+        layer_output = self.intermediate(layer_output)
+        layer_output = hidden_states + self.output(layer_output)
+
+        layer_outputs = (layer_output, attention_outputs[1]) if output_attentions else (layer_output,)
+        return layer_outputs
+
+
+# Copied from transformers.models.swin.modeling_swin.SwinStage with Swin->DonutSwin
+class DonutSwinStage(GradientCheckpointingLayer):
+    """
+    A stack of `depth` DonutSwin layers, optionally followed by a downsampling module.
+
+    Args:
+        config:
+            Model configuration; `window_size` is read here to derive the shift of odd-indexed layers.
+        dim (`int`):
+            Channel width of the tokens in this stage.
+        input_resolution:
+            Forwarded to every layer and to the downsampling module.
+        depth (`int`):
+            Number of layers in the stage.
+        num_heads (`int`):
+            Number of attention heads per layer.
+        drop_path:
+            Indexable collection of per-layer stochastic-depth rates (`drop_path[i]` for layer `i`).
+        downsample:
+            Callable building the downsampling module as `downsample(input_resolution, dim=..., norm_layer=...)`,
+            or `None` for no downsampling.
+    """
+
+    def __init__(self, config, dim, input_resolution, depth, num_heads, drop_path, downsample):
+        super().__init__()
+        self.config = config
+        self.dim = dim
+        # Even-indexed layers use regular windows, odd-indexed layers shift by half a window.
+        self.blocks = nn.ModuleList(
+            [
+                DonutSwinLayer(
+                    config=config,
+                    dim=dim,
+                    input_resolution=input_resolution,
+                    num_heads=num_heads,
+                    drop_path_rate=drop_path[i],
+                    shift_size=0 if (i % 2 == 0) else config.window_size // 2,
+                )
+                for i in range(depth)
+            ]
+        )
+
+        # patch merging layer
+        if downsample is not None:
+            self.downsample = downsample(input_resolution, dim=dim, norm_layer=nn.LayerNorm)
+        else:
+            self.downsample = None
+
+        # NOTE(review): not read anywhere in this class — presumably a legacy flag; confirm before relying on it.
+        self.pointing = False
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        input_dimensions: tuple[int, int],
+        head_mask: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = False,
+        always_partition: Optional[bool] = False,
+    ) -> tuple[torch.Tensor]:
+        """
+        Run every layer of the stage, then downsample if configured.
+
+        Args:
+            hidden_states (`torch.Tensor`):
+                Token sequence passed through the layers.
+            input_dimensions (`tuple[int, int]`):
+                The `(height, width)` grid of the tokens; the same value is passed to every layer.
+            head_mask (`torch.FloatTensor`, *optional*):
+                Indexed per layer (`head_mask[i]`) when provided.
+            output_attentions (`bool`, *optional*, defaults to `False`):
+                Whether to append attention probabilities to the result.
+            always_partition (`bool`, *optional*, defaults to `False`):
+                Forwarded to every layer.
+
+        Returns:
+            `(hidden_states, hidden_states_before_downsampling, output_dimensions)`, where `output_dimensions`
+            is `(height, width, height_after, width_after)`. With `output_attentions`, the extra outputs of the
+            last layer only are appended.
+        """
+        height, width = input_dimensions
+        for i, layer_module in enumerate(self.blocks):
+            layer_head_mask = head_mask[i] if head_mask is not None else None
+
+            layer_outputs = layer_module(
+                hidden_states, input_dimensions, layer_head_mask, output_attentions, always_partition
+            )
+
+            hidden_states = layer_outputs[0]
+
+        hidden_states_before_downsampling = hidden_states
+        if self.downsample is not None:
+            # Ceil-halving of each axis.
+            height_downsampled, width_downsampled = (height + 1) // 2, (width + 1) // 2
+            output_dimensions = (height, width, height_downsampled, width_downsampled)
+            hidden_states = self.downsample(hidden_states_before_downsampling, input_dimensions)
+        else:
+            output_dimensions = (height, width, height, width)
+
+        stage_outputs = (hidden_states, hidden_states_before_downsampling, output_dimensions)
+
+        if output_attentions:
+            # `layer_outputs` still refers to the last layer of the loop above.
+            stage_outputs += layer_outputs[1:]
+        return stage_outputs
+
+
+# Copied from transformers.models.swin.modeling_swin.SwinEncoder with Swin->DonutSwin
+class DonutSwinEncoder(nn.Module):
+    """
+    Stack of [`DonutSwinStage`] modules, one per entry of `config.depths`.
+
+    Stage `i` is built with `dim = int(config.embed_dim * 2**i)` and an input resolution of `grid_size` floor-divided
+    by `2**i`. Every stage except the last receives [`DonutSwinPatchMerging`] as its downsampling module.
+    """
+
+    def __init__(self, config, grid_size):
+        super().__init__()
+        self.num_layers = len(config.depths)
+        self.config = config
+
+        # One drop-path rate per block, evenly spaced from 0 to `config.drop_path_rate` over all stages.
+        block_drop_rates = [
+            rate.item() for rate in torch.linspace(0, config.drop_path_rate, sum(config.depths), device="cpu")
+        ]
+
+        stages = []
+        last_stage_index = self.num_layers - 1
+        for stage_index in range(self.num_layers):
+            scale = 2**stage_index
+            # Slice of `block_drop_rates` that belongs to the blocks of this stage.
+            first_block = sum(config.depths[:stage_index])
+            end_block = sum(config.depths[: stage_index + 1])
+            stages.append(
+                DonutSwinStage(
+                    config=config,
+                    dim=int(config.embed_dim * scale),
+                    input_resolution=(grid_size[0] // scale, grid_size[1] // scale),
+                    depth=config.depths[stage_index],
+                    num_heads=config.num_heads[stage_index],
+                    drop_path=block_drop_rates[first_block:end_block],
+                    downsample=DonutSwinPatchMerging if (stage_index < last_stage_index) else None,
+                )
+            )
+        self.layers = nn.ModuleList(stages)
+
+        self.gradient_checkpointing = False
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        input_dimensions: tuple[int, int],
+        head_mask: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = False,
+        output_hidden_states: Optional[bool] = False,
+        output_hidden_states_before_downsampling: Optional[bool] = False,
+        always_partition: Optional[bool] = False,
+        return_dict: Optional[bool] = True,
+    ) -> Union[tuple, DonutSwinEncoderOutput]:
+        """
+        Run every stage in order, optionally collecting hidden states and attention outputs.
+
+        When `output_hidden_states` is truthy, the input is recorded first and one entry is added per stage: the
+        pre-downsampling states if `output_hidden_states_before_downsampling` is truthy, otherwise the stage output.
+        Each recorded state is also stored reshaped from `b (h w) c` to `b c h w`.
+
+        Returns:
+            A [`DonutSwinEncoderOutput`] when `return_dict` is truthy; otherwise a tuple of the non-`None` values
+            among the last hidden state, the collected hidden states and the collected attentions.
+        """
+
+        def to_channels_first(states, spatial_dims):
+            # rearrange b (h w) c -> b c h w
+            batch_size, _, hidden_size = states.shape
+            return states.view(batch_size, *spatial_dims, hidden_size).permute(0, 3, 1, 2)
+
+        all_hidden_states = () if output_hidden_states else None
+        all_reshaped_hidden_states = () if output_hidden_states else None
+        all_self_attentions = () if output_attentions else None
+
+        if output_hidden_states:
+            reshaped_hidden_state = to_channels_first(hidden_states, input_dimensions)
+            all_hidden_states += (hidden_states,)
+            all_reshaped_hidden_states += (reshaped_hidden_state,)
+
+        for stage_index, stage in enumerate(self.layers):
+            stage_head_mask = None if head_mask is None else head_mask[stage_index]
+
+            stage_outputs = stage(
+                hidden_states, input_dimensions, stage_head_mask, output_attentions, always_partition
+            )
+
+            hidden_states = stage_outputs[0]
+            hidden_states_before_downsampling = stage_outputs[1]
+            output_dimensions = stage_outputs[2]
+
+            # The last two entries are the spatial size seen by the next stage.
+            input_dimensions = (output_dimensions[-2], output_dimensions[-1])
+
+            if output_hidden_states:
+                if output_hidden_states_before_downsampling:
+                    # here we use the original (not downsampled) height and width
+                    recorded_state = hidden_states_before_downsampling
+                    recorded_dims = (output_dimensions[0], output_dimensions[1])
+                else:
+                    recorded_state = hidden_states
+                    recorded_dims = input_dimensions
+                reshaped_hidden_state = to_channels_first(recorded_state, recorded_dims)
+                all_hidden_states += (recorded_state,)
+                all_reshaped_hidden_states += (reshaped_hidden_state,)
+
+            if output_attentions:
+                all_self_attentions += stage_outputs[3:]
+
+        if return_dict:
+            return DonutSwinEncoderOutput(
+                last_hidden_state=hidden_states,
+                hidden_states=all_hidden_states,
+                attentions=all_self_attentions,
+                reshaped_hidden_states=all_reshaped_hidden_states,
+            )
+
+        # NOTE: the reshaped hidden states are not part of the tuple output.
+        candidates = (hidden_states, all_hidden_states, all_self_attentions)
+        return tuple(item for item in candidates if item is not None)
+
+
+@auto_docstring
+# Copied from transformers.models.swin.modeling_swin.SwinPreTrainedModel with Swin->DonutSwin,swin->donut
+class DonutSwinPreTrainedModel(PreTrainedModel):
+    # Shared base class of the DonutSwin models in this file: declares the class-level settings below and the
+    # weight-initialization rule applied per module.
+    config: DonutSwinConfig
+    base_model_prefix = "donut"
+    main_input_name = "pixel_values"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["DonutSwinStage"]
+
+    def _init_weights(self, module):
+        """
+        Initialize the parameters of a single `module` in place.
+
+        Linear and 2D convolution weights are drawn from a normal distribution with standard deviation
+        `config.initializer_range` and their biases are zeroed; layer norms are reset to weight 1 and bias 0; the
+        optional mask token and position embeddings of [`DonutSwinEmbeddings`] and the relative position bias table
+        of [`DonutSwinSelfAttention`] are zeroed. Any other module is left untouched.
+        """
+        if isinstance(module, (nn.Linear, nn.Conv2d)):
+            # Slightly different from the TF version which uses truncated_normal for initialization
+            # cf https://github.com/pytorch/pytorch/pull/5617
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.bias is not None:
+                module.bias.data.zero_()
+            return
+
+        if isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+            return
+
+        if isinstance(module, DonutSwinEmbeddings):
+            # Both parameters are optional on the embeddings module.
+            for optional_parameter in (module.mask_token, module.position_embeddings):
+                if optional_parameter is not None:
+                    optional_parameter.data.zero_()
+            return
+
+        if isinstance(module, DonutSwinSelfAttention):
+            module.relative_position_bias_table.data.zero_()
+
+
+@auto_docstring
+class DonutSwinModel(DonutSwinPreTrainedModel):
+    # Bare DonutSwin backbone: patch embeddings -> encoder -> optional average pooling over the sequence.
+    # The `r"""` docstrings on `__init__` and `forward` are left exactly as written because this class and its
+    # `forward` are wrapped by `auto_docstring`.
+    def __init__(self, config, add_pooling_layer=True, use_mask_token=False):
+        r"""
+        add_pooling_layer (bool, *optional*, defaults to `True`):
+            Whether to add a pooling layer
+        use_mask_token (`bool`, *optional*, defaults to `False`):
+            Whether to use a mask token for masked image modeling.
+        """
+        super().__init__(config)
+        self.config = config
+        self.num_layers = len(config.depths)
+        # The embedding width is doubled once per stage after the first.
+        self.num_features = int(config.embed_dim * 2 ** (self.num_layers - 1))
+
+        self.embeddings = DonutSwinEmbeddings(config, use_mask_token=use_mask_token)
+        self.encoder = DonutSwinEncoder(config, self.embeddings.patch_grid)
+
+        if add_pooling_layer:
+            self.pooler = nn.AdaptiveAvgPool1d(1)
+        else:
+            self.pooler = None
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        """Return the patch-embedding module held by `self.embeddings`."""
+        return self.embeddings.patch_embeddings
+
+    def _prune_heads(self, heads_to_prune):
+        """
+        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
+        class PreTrainedModel
+        """
+        # NOTE(review): `DonutSwinEncoder` in this file registers its stages as `self.layers`, not `self.layer`,
+        # so this lookup looks like it would raise AttributeError -- confirm before relying on head pruning.
+        for layer_index, layer_heads in heads_to_prune.items():
+            self.encoder.layer[layer_index].attention.prune_heads(layer_heads)
+
+    @auto_docstring
+    def forward(
+        self,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        bool_masked_pos: Optional[torch.BoolTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        interpolate_pos_encoding: bool = False,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, DonutSwinModelOutput]:
+        r"""
+        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`):
+            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
+        """
+        # Flags left as `None` fall back to the values stored on the config.
+        if output_attentions is None:
+            output_attentions = self.config.output_attentions
+        if output_hidden_states is None:
+            output_hidden_states = self.config.output_hidden_states
+        if return_dict is None:
+            return_dict = self.config.use_return_dict
+
+        if pixel_values is None:
+            raise ValueError("You have to specify pixel_values")
+
+        # Prepare head mask if needed
+        # 1.0 in head_mask indicate we keep the head
+        # attention_probs has shape bsz x n_heads x N x N
+        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
+        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
+        head_mask = self.get_head_mask(head_mask, len(self.config.depths))
+
+        embedding_output, input_dimensions = self.embeddings(
+            pixel_values, bool_masked_pos=bool_masked_pos, interpolate_pos_encoding=interpolate_pos_encoding
+        )
+
+        encoder_outputs = self.encoder(
+            embedding_output,
+            input_dimensions,
+            head_mask=head_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        sequence_output = encoder_outputs[0]
+
+        if self.pooler is None:
+            pooled_output = None
+        else:
+            # Swap dims 1 and 2 so the adaptive pool averages over what was dim 1, then flatten from dim 1 on.
+            pooled_output = torch.flatten(self.pooler(sequence_output.transpose(1, 2)), 1)
+
+        if return_dict:
+            return DonutSwinModelOutput(
+                last_hidden_state=sequence_output,
+                pooler_output=pooled_output,
+                hidden_states=encoder_outputs.hidden_states,
+                attentions=encoder_outputs.attentions,
+                reshaped_hidden_states=encoder_outputs.reshaped_hidden_states,
+            )
+
+        return (sequence_output, pooled_output) + encoder_outputs[1:]
+
+
+@auto_docstring(
+    custom_intro="""
+    DonutSwin Model transformer with an image classification head on top (a linear layer on top of the final hidden state of
+    the [CLS] token) e.g. for ImageNet.
+
+    <Tip>
+
+        Note that it's possible to fine-tune DonutSwin on higher resolution images than the ones it has been trained on, by
+        setting `interpolate_pos_encoding` to `True` in the forward of the model. This will interpolate the pre-trained
+        position embeddings to the higher resolution.
+
+    </Tip>
+    """
+)
+# Copied from transformers.models.swin.modeling_swin.SwinForImageClassification with Swin->DonutSwin,swin->donut
+class DonutSwinForImageClassification(DonutSwinPreTrainedModel):
+    # Wraps a `DonutSwinModel` and feeds its pooled output (index 1 of the backbone outputs) to a classifier head.
+    def __init__(self, config):
+        super().__init__(config)
+
+        self.num_labels = config.num_labels
+        self.donut = DonutSwinModel(config)
+
+        # Classifier head: a linear projection, or a pass-through when no labels are configured.
+        if config.num_labels > 0:
+            self.classifier = nn.Linear(self.donut.num_features, config.num_labels)
+        else:
+            self.classifier = nn.Identity()
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        interpolate_pos_encoding: bool = False,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, DonutSwinImageClassifierOutput]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+        """
+        if return_dict is None:
+            return_dict = self.config.use_return_dict
+
+        outputs = self.donut(
+            pixel_values,
+            head_mask=head_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            interpolate_pos_encoding=interpolate_pos_encoding,
+            return_dict=return_dict,
+        )
+
+        # Index 1 is the pooled output in both the tuple and the output-object form.
+        logits = self.classifier(outputs[1])
+
+        # The loss is only computed when labels are supplied.
+        if labels is None:
+            loss = None
+        else:
+            loss = self.loss_function(logits=logits, labels=labels, pooled_logits=logits, config=self.config)
+
+        if return_dict:
+            return DonutSwinImageClassifierOutput(
+                loss=loss,
+                logits=logits,
+                hidden_states=outputs.hidden_states,
+                attentions=outputs.attentions,
+                reshaped_hidden_states=outputs.reshaped_hidden_states,
+            )
+
+        # Tuple form: logits replace the first two backbone outputs, and the loss is prepended when available.
+        tuple_output = (logits,) + outputs[2:]
+        if loss is None:
+            return tuple_output
+        return (loss,) + tuple_output
+
+
+__all__ = ["DonutSwinModel", "DonutSwinPreTrainedModel", "DonutSwinForImageClassification"]
diff --git a/phivenv/Lib/site-packages/transformers/models/donut/processing_donut.py b/phivenv/Lib/site-packages/transformers/models/donut/processing_donut.py
new file mode 100644
index 0000000000000000000000000000000000000000..288ba1107dd8f4ff2fdbfe1b2d56a6d1cb0b165a
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/donut/processing_donut.py
@@ -0,0 +1,216 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Processor class for Donut.
+"""
+
+import re
+import warnings
+from contextlib import contextmanager
+from typing import Optional, Union
+
+from ...image_utils import ImageInput
+from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
+from ...tokenization_utils_base import PreTokenizedInput, TextInput
+from ...utils import logging
+
+
+# Keyword-argument container passed to `_merge_kwargs` by `DonutProcessor.__call__`.
+class DonutProcessorKwargs(ProcessingKwargs, total=False):
+    # No Donut-specific default values are declared here.
+    _defaults = {}
+
+
+logger = logging.get_logger(__name__)
+
+
+class DonutProcessor(ProcessorMixin):
+    r"""
+    Constructs a Donut processor which wraps a Donut image processor and an XLMRoBERTa tokenizer into a single
+    processor.
+
+    [`DonutProcessor`] offers all the functionalities of [`DonutImageProcessor`] and
+    [`XLMRobertaTokenizer`/`XLMRobertaTokenizerFast`]. See the [`~DonutProcessor.__call__`] and
+    [`~DonutProcessor.decode`] for more information.
+
+    Args:
+        image_processor ([`DonutImageProcessor`], *optional*):
+            An instance of [`DonutImageProcessor`]. The image processor is a required input.
+        tokenizer ([`XLMRobertaTokenizer`/`XLMRobertaTokenizerFast`], *optional*):
+            An instance of [`XLMRobertaTokenizer`/`XLMRobertaTokenizerFast`]. The tokenizer is a required input.
+    """
+
+    attributes = ["image_processor", "tokenizer"]
+    image_processor_class = "AutoImageProcessor"
+    tokenizer_class = "AutoTokenizer"
+
+    def __init__(self, image_processor=None, tokenizer=None, **kwargs):
+        """
+        Store the image processor and tokenizer and start in normal (image-processor) mode.
+
+        A deprecated `feature_extractor` keyword is accepted as a fallback for `image_processor`.
+
+        Raises:
+            ValueError: If no image processor (or `feature_extractor`) or no tokenizer is given.
+        """
+        feature_extractor = None
+        if "feature_extractor" in kwargs:
+            warnings.warn(
+                "The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`"
+                " instead.",
+                FutureWarning,
+            )
+            feature_extractor = kwargs.pop("feature_extractor")
+
+        image_processor = image_processor if image_processor is not None else feature_extractor
+        if image_processor is None:
+            raise ValueError("You need to specify an `image_processor`.")
+        if tokenizer is None:
+            raise ValueError("You need to specify a `tokenizer`.")
+
+        super().__init__(image_processor, tokenizer)
+        # State used by the deprecated `as_target_processor` context manager.
+        self.current_processor = self.image_processor
+        self._in_target_context_manager = False
+
+    def __call__(
+        self,
+        images: ImageInput = None,
+        text: Optional[Union[str, list[str], TextInput, PreTokenizedInput]] = None,
+        audio=None,
+        videos=None,
+        **kwargs: Unpack[DonutProcessorKwargs],
+    ):
+        """
+        Process `images` with the image processor and/or `text` with the tokenizer.
+
+        - Only `images`: returns the image processor output.
+        - Only `text`: returns the tokenizer output.
+        - Both: the text is tokenized with `add_special_tokens` defaulting to `False`, and its `input_ids` are added
+          to the image processor output under both `"labels"` and `"input_ids"`.
+
+        Inside the deprecated [`~DonutProcessor.as_target_processor`] context, `images`, `text` and `kwargs` are
+        forwarded unchanged to the current processor (the tokenizer) instead. `audio` and `videos` are accepted but
+        not used by this method.
+
+        Raises:
+            ValueError: If both `images` and `text` are `None` outside the target-processor context.
+        """
+        if self._in_target_context_manager:
+            return self.current_processor(images, text, **kwargs)
+
+        if images is None and text is None:
+            raise ValueError("You need to specify either an `images` or `text` input to process.")
+
+        output_kwargs = self._merge_kwargs(
+            DonutProcessorKwargs,
+            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
+            **kwargs,
+        )
+
+        if images is not None:
+            inputs = self.image_processor(images, **output_kwargs["images_kwargs"])
+        if text is not None:
+            if images is not None:
+                # When paired with images, special tokens are only added if the caller asked for them.
+                output_kwargs["text_kwargs"].setdefault("add_special_tokens", False)
+            encodings = self.tokenizer(text, **output_kwargs["text_kwargs"])
+
+        if text is None:
+            return inputs
+        elif images is None:
+            return encodings
+        else:
+            inputs["labels"] = encodings["input_ids"]  # for BC
+            inputs["input_ids"] = encodings["input_ids"]
+            return inputs
+
+    @property
+    def model_input_names(self):
+        """Input names of the image processor followed by `"input_ids"` and `"labels"`."""
+        image_processor_input_names = self.image_processor.model_input_names
+
+        return list(image_processor_input_names + ["input_ids", "labels"])
+
+    @contextmanager
+    def as_target_processor(self):
+        """
+        Temporarily sets the tokenizer for processing the input. Useful for encoding the labels when fine-tuning
+        Donut.
+
+        Deprecated. The image processor is restored as the current processor when the `with` block exits, including
+        when the block raises.
+        """
+        warnings.warn(
+            "`as_target_processor` is deprecated and will be removed in v5 of Transformers. You can process your "
+            "labels by using the argument `text` of the regular `__call__` method (either in the same call as "
+            "your images inputs, or in a separate call."
+        )
+        self._in_target_context_manager = True
+        self.current_processor = self.tokenizer
+        try:
+            yield
+        finally:
+            # Always leave target mode; without this an exception inside the `with` block would leave every later
+            # `__call__` routed to the tokenizer.
+            self.current_processor = self.image_processor
+            self._in_target_context_manager = False
+
+    def token2json(self, tokens, is_inner_value=False, added_vocab=None):
+        """
+        Convert a (generated) token sequence into an ordered JSON format.
+
+        Fields are delimited by `<s_key>` ... `</s_key>` tag pairs (matched case-insensitively). A field whose
+        content itself contains both `<s_` and `</s_` is parsed recursively; otherwise the content is split on
+        `<sep/>` into leaf values. A `<sep/>` directly after a closing tag starts a sibling object.
+
+        Args:
+            tokens (`str`): The text to parse.
+            is_inner_value (`bool`, defaults to `False`): Set on recursive calls; makes the method return a list.
+            added_vocab: Collection of added tokens used to recognise categorical leaves of the form `<name/>`.
+                Defaults to `self.tokenizer.get_added_vocab()`.
+
+        Returns:
+            For the top-level call, a dict of parsed fields, a list of dicts when sibling objects are present, or
+            `{"text_sequence": <remaining text>}` when nothing could be parsed. For inner calls, a (possibly empty)
+            list of dicts.
+        """
+        if added_vocab is None:
+            added_vocab = self.tokenizer.get_added_vocab()
+
+        output = {}
+
+        while tokens:
+            # We want r"<s_(.*?)>" but without ReDOS risk, so do it manually in two parts
+            potential_start = re.search(r"<s_", tokens, re.IGNORECASE)
+            if potential_start is None:
+                break
+            start_token = tokens[potential_start.start() :]
+            if ">" not in start_token:
+                break
+            start_token = start_token[: start_token.index(">") + 1]
+            key = start_token[len("<s_") : -len(">")]
+            key_escaped = re.escape(key)
+
+            end_token = re.search(rf"</s_{key_escaped}>", tokens, re.IGNORECASE)
+            if end_token is None:
+                # Unclosed tag: drop every occurrence of it and keep scanning.
+                tokens = tokens.replace(start_token, "")
+            else:
+                end_token = end_token.group()
+                start_token_escaped = re.escape(start_token)
+                end_token_escaped = re.escape(end_token)
+                content = re.search(
+                    f"{start_token_escaped}(.*?){end_token_escaped}", tokens, re.IGNORECASE | re.DOTALL
+                )
+                if content is not None:
+                    content = content.group(1).strip()
+                    if r"<s_" in content and r"</s_" in content:  # non-leaf node
+                        value = self.token2json(content, is_inner_value=True, added_vocab=added_vocab)
+                        if value:
+                            if len(value) == 1:
+                                value = value[0]
+                            output[key] = value
+                    else:  # leaf nodes
+                        output[key] = []
+                        for leaf in content.split(r"<sep/>"):
+                            leaf = leaf.strip()
+                            if leaf in added_vocab and leaf[0] == "<" and leaf[-2:] == "/>":
+                                leaf = leaf[1:-2]  # for categorical special tokens
+                            output[key].append(leaf)
+                        # A single leaf is stored as a scalar rather than a one-element list.
+                        if len(output[key]) == 1:
+                            output[key] = output[key][0]
+
+                # Continue after the closing tag of the field just consumed.
+                tokens = tokens[tokens.find(end_token) + len(end_token) :].strip()
+                if tokens[:6] == r"<sep/>":  # non-leaf nodes
+                    return [output] + self.token2json(tokens[6:], is_inner_value=True, added_vocab=added_vocab)
+
+        if output:
+            return [output] if is_inner_value else output
+        else:
+            return [] if is_inner_value else {"text_sequence": tokens}
+
+    @property
+    def feature_extractor_class(self):
+        """Deprecated alias of `image_processor_class`; emits a `FutureWarning`."""
+        warnings.warn(
+            "`feature_extractor_class` is deprecated and will be removed in v5. Use `image_processor_class` instead.",
+            FutureWarning,
+        )
+        return self.image_processor_class
+
+    @property
+    def feature_extractor(self):
+        """Deprecated alias of `image_processor`; emits a `FutureWarning`."""
+        warnings.warn(
+            "`feature_extractor` is deprecated and will be removed in v5. Use `image_processor` instead.",
+            FutureWarning,
+        )
+        return self.image_processor
+
+
+__all__ = ["DonutProcessor"]
diff --git a/phivenv/Lib/site-packages/transformers/models/dots1/__init__.py b/phivenv/Lib/site-packages/transformers/models/dots1/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..60223e4df87f1d925c94f4737c215e904a47ac36
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/dots1/__init__.py
@@ -0,0 +1,27 @@
+# Copyright 2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+# Static type checkers get the real names via star-imports; at runtime the module object is swapped out instead.
+if TYPE_CHECKING:
+    from .configuration_dots1 import *
+    from .modeling_dots1 import *
+else:
+    import sys
+
+    # Replace this package's entry in `sys.modules` with a `_LazyModule` built from the import structure that
+    # `define_import_structure` derives from this file's path (presumably so submodules load on first
+    # attribute access -- see `_LazyModule`).
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/phivenv/Lib/site-packages/transformers/models/dots1/__pycache__/__init__.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/dots1/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fa146b4d559a940d620aad3911840e08376c744a
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/dots1/__pycache__/__init__.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/dots1/__pycache__/configuration_dots1.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/dots1/__pycache__/configuration_dots1.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..55944ff4ad8bdf0ba480a843f9048c9985a613d8
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/dots1/__pycache__/configuration_dots1.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/dots1/__pycache__/modeling_dots1.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/dots1/__pycache__/modeling_dots1.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b204341ad59c13b1610be97177bda5abe13296d0
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/dots1/__pycache__/modeling_dots1.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/dots1/__pycache__/modular_dots1.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/dots1/__pycache__/modular_dots1.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..de0cfdc22d7a55a29957364d96809e26abdcdf18
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/dots1/__pycache__/modular_dots1.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/dots1/configuration_dots1.py b/phivenv/Lib/site-packages/transformers/models/dots1/configuration_dots1.py
new file mode 100644
index 0000000000000000000000000000000000000000..ca198e71d09e8f579e6415f375db3b96348d358e
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/dots1/configuration_dots1.py
@@ -0,0 +1,211 @@
+# coding=utf-8
+# Copyright 2025 The rednote-hilab team and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from ...configuration_utils import PretrainedConfig, layer_type_validation
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class Dots1Config(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Dots1Model`]. It is used to instantiate a
+    `dots.llm1` model according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of
+    [rednote-hilab/dots.llm1.base](https://huggingface.co/rednote-hilab/dots.llm1.base).
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 152064):
+            Vocabulary size of the model. Defines the number of different tokens that can be represented by the
+            `input_ids` passed when calling [`Dots1Model`].
+        hidden_size (`int`, *optional*, defaults to 4608):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 10944):
+            Dimension of the MLP representations.
+        moe_intermediate_size (`int`, *optional*, defaults to 1408):
+            Dimension of the MoE representations.
+        num_hidden_layers (`int`, *optional*, defaults to 62):
+            Number of hidden layers in the Transformer decoder.
+        num_attention_heads (`int`, *optional*, defaults to 32):
+            Number of attention heads for each attention layer in the Transformer decoder.
+        num_key_value_heads (`int`, *optional*, defaults to 32):
+            Number of key/value heads for Grouped Query Attention. If `num_key_value_heads=num_attention_heads`, Multi
+            Head Attention (MHA) is used. If `num_key_value_heads=1`, Multi Query Attention (MQA) is used. Otherwise,
+            Grouped Query Attention (GQA) is used. If not specified, defaults to `num_attention_heads`.
+        n_shared_experts (`int`, *optional*, defaults to `None`):
+            Number of shared experts. None means dense model.
+        n_routed_experts (`int`, *optional*, defaults to `None`):
+            Number of routed experts. None means dense model.
+        n_group (`int`, *optional*, defaults to 1):
+            Number of groups for routed experts.
+        topk_group (`int`, *optional*, defaults to 1):
+            Number of selected groups for each token (selected experts only within `topk_group` groups).
+        num_experts_per_tok (`int`, *optional*, defaults to `None`):
+            Number of selected experts. None means dense model.
+        first_k_dense_replace (`int`, *optional*, defaults to 0):
+            Number of dense layers at the beginning of the model before the first MoE layer.
+        norm_topk_prob (`bool`, *optional*, defaults to `False`):
+            Whether to normalize the weights of the routed experts.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string).
+        max_position_embeddings (`int`, *optional*, defaults to 2048):
+            Maximum sequence length the model might ever be used with.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            Standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
+            Epsilon used by the RMS normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions. Only relevant if `config.is_decoder=True`.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether to tie the input and output word embeddings.
+        rope_theta (`float`, *optional*, defaults to 10000.0):
+            The base period of the RoPE embeddings.
+        rope_scaling (`dict`, *optional*):
+            Dictionary for scaling RoPE embeddings. Supports `{"type": strategy name, "factor": scaling factor}`.
+        attention_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use a bias in the self-attention projections.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            Dropout ratio for the attention probabilities.
+        routed_scaling_factor (`float`, *optional*, defaults to 1.0):
+            Scaling factor for routed experts.
+        sliding_window (`int`, *optional*, defaults to 4096):
+            Size of the sliding window for attention. If not specified, defaults to `4096`.
+        max_window_layers (`int`, *optional*, defaults to 62):
+            The number of layers using full attention. The first `max_window_layers` layers will use full attention, while any
+            additional layer afterwards will use SWA (Sliding Window Attention).
+        layer_types (`list`, *optional*):
+            Attention pattern for each layer.
+
+    Examples:
+        ```python
+        >>> from transformers import Dots1Model, Dots1Config
+
+        >>> # Initializing a Dots1 style configuration
+        >>> configuration = Dots1Config()
+
+        >>> # Accessing the model configuration (assumes `model` is an existing `Dots1Model` instance)
+        >>> configuration = model.config
+        ```
+    """
+
+    model_type = "dots1"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    base_model_tp_plan = {  # TODO: only replicate attention layers when > first_k_dense_replace
+        "layers.*.self_attn.q_proj": "colwise",
+        "layers.*.self_attn.k_proj": "colwise",
+        "layers.*.self_attn.v_proj": "colwise",
+        "layers.*.self_attn.o_proj": "rowwise",
+        "layers.*.mlp.experts.*.gate_proj": "local_colwise",
+        "layers.*.mlp.experts.*.up_proj": "local_colwise",
+        "layers.*.mlp.experts.*.down_proj": "local_rowwise",
+        "layers.*.mlp.experts.*": "local",  # each expert is wrapped in a module list
+        "layers.*.mlp.shared_experts.gate_proj": "local_colwise",
+        "layers.*.mlp.shared_experts.up_proj": "local_colwise",
+        "layers.*.mlp.shared_experts.down_proj": "local_rowwise",
+        "layers.*.mlp.shared_experts": "local",
+        "layers.*.mlp.gate_proj": "local_colwise",
+        "layers.*.mlp.up_proj": "local_colwise",
+        "layers.*.mlp.down_proj": "local_rowwise",
+        "layers.*.mlp": "gather",  # This is the only moment where results are gathered
+    }
+
+    base_model_pp_plan = {
+        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
+        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
+        "norm": (["hidden_states"], ["hidden_states"]),
+    }
+
+    def __init__(
+        self,
+        vocab_size=152064,
+        hidden_size=4608,
+        intermediate_size=10944,
+        moe_intermediate_size=1408,
+        num_hidden_layers=62,
+        num_attention_heads=32,
+        num_key_value_heads=32,
+        n_shared_experts=None,
+        n_routed_experts=None,
+        n_group=1,
+        topk_group=1,
+        num_experts_per_tok=None,
+        first_k_dense_replace=0,
+        norm_topk_prob=False,
+        hidden_act="silu",
+        max_position_embeddings=2048,
+        initializer_range=0.02,
+        rms_norm_eps=1e-6,
+        use_cache=True,
+        tie_word_embeddings=False,
+        rope_theta=10000.0,
+        rope_scaling=None,
+        attention_bias=False,
+        attention_dropout=0.0,
+        routed_scaling_factor=1.0,
+        sliding_window=4096,
+        max_window_layers=62,
+        layer_types=None,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.moe_intermediate_size = moe_intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.n_shared_experts = n_shared_experts
+        self.n_routed_experts = n_routed_experts
+        self.num_experts_per_tok = num_experts_per_tok
+        self.first_k_dense_replace = first_k_dense_replace
+        self.norm_topk_prob = norm_topk_prob
+        if num_key_value_heads is None:  # explicit None falls back to one KV head per attention head
+            num_key_value_heads = num_attention_heads
+        self.n_group = n_group
+        self.topk_group = topk_group
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+        self.routed_scaling_factor = routed_scaling_factor
+        self.sliding_window = sliding_window
+        self.max_window_layers = max_window_layers
+
+        self.layer_types = layer_types
+        if self.layer_types is None:  # derive one entry per layer from sliding_window / max_window_layers
+            self.layer_types = [
+                "sliding_attention"
+                if self.sliding_window is not None and i >= self.max_window_layers
+                else "full_attention"
+                for i in range(self.num_hidden_layers)
+            ]
+        layer_type_validation(self.layer_types)
+
+        super().__init__(
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+
+__all__ = ["Dots1Config"]
diff --git a/phivenv/Lib/site-packages/transformers/models/dots1/modeling_dots1.py b/phivenv/Lib/site-packages/transformers/models/dots1/modeling_dots1.py
new file mode 100644
index 0000000000000000000000000000000000000000..ea500c064512ddd1813e8b34e302e38fb05cfa57
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/dots1/modeling_dots1.py
@@ -0,0 +1,612 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/dots1/modular_dots1.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_dots1.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# coding=utf-8
+# Copyright 2025 The rednote-hilab team and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Callable, Optional, Union
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from ...activations import ACT2FN
+from ...cache_utils import Cache, DynamicCache
+from ...generation import GenerationMixin
+from ...integrations import use_kernel_forward_from_hub
+from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask
+from ...modeling_flash_attention_utils import FlashAttentionKwargs
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
+from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from ...processing_utils import Unpack
+from ...utils import TransformersKwargs, auto_docstring, can_return_tuple
+from ...utils.deprecation import deprecate_kwarg
+from ...utils.generic import check_model_inputs
+from .configuration_dots1 import Dots1Config
+
+
+@use_kernel_forward_from_hub("RMSNorm")
+class Dots1RMSNorm(nn.Module):
+    def __init__(self, hidden_size, eps: float = 1e-6) -> None:
+        """
+        Dots1RMSNorm is equivalent to T5LayerNorm
+        """
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        input_dtype = hidden_states.dtype  # remembered so the result can be cast back
+        hidden_states = hidden_states.to(torch.float32)  # the normalization itself runs in float32
+        variance = hidden_states.pow(2).mean(-1, keepdim=True)  # mean of squares over the last dimension
+        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+        return self.weight * hidden_states.to(input_dtype)  # cast back first, then apply the learned scale
+
+    def extra_repr(self):
+        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
+
+
+class Dots1RotaryEmbedding(nn.Module):
+    inv_freq: torch.Tensor  # fix linting for `register_buffer`
+
+    def __init__(self, config: Dots1Config, device=None):
+        super().__init__()
+        # Backward compatibility: the "rope_type" key was originally named "type", so both are checked
+        if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict):
+            self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
+        else:
+            self.rope_type = "default"
+        self.max_seq_len_cached = config.max_position_embeddings
+        self.original_max_seq_len = config.max_position_embeddings
+
+        self.config = config
+        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
+
+        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
+        self.register_buffer("inv_freq", inv_freq, persistent=False)  # non-persistent: not part of the state dict
+        self.original_inv_freq = self.inv_freq  # keeps a reference to the initial inverse frequencies
+
+    @torch.no_grad()
+    @dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
+    def forward(self, x, position_ids):
+        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
+        position_ids_expanded = position_ids[:, None, :].float()
+
+        device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"  # "mps" -> "cpu"
+        with torch.autocast(device_type=device_type, enabled=False):  # autocast off so the matmul stays in float32
+            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
+            emb = torch.cat((freqs, freqs), dim=-1)  # frequencies duplicated along the last dimension
+            cos = emb.cos() * self.attention_scaling
+            sin = emb.sin() * self.attention_scaling
+
+        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
+
+
+def rotate_half(x):
+    """Rotates half the hidden dims of the input."""
+    x1 = x[..., : x.shape[-1] // 2]  # first half of the last dimension
+    x2 = x[..., x.shape[-1] // 2 :]  # second half of the last dimension
+    return torch.cat((-x2, x1), dim=-1)  # negated second half, followed by the first half
+
+
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
+    """Applies Rotary Position Embedding to the query and key tensors.
+
+    Args:
+        q (`torch.Tensor`): The query tensor.
+        k (`torch.Tensor`): The key tensor.
+        cos (`torch.Tensor`): The cosine part of the rotary embedding.
+        sin (`torch.Tensor`): The sine part of the rotary embedding.
+        position_ids (`torch.Tensor`, *optional*):
+            Deprecated and unused.
+        unsqueeze_dim (`int`, *optional*, defaults to 1):
+            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
+            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
+            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
+            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
+            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
+            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
+    Returns:
+        `tuple(torch.Tensor)` comprising the query and key tensors rotated using the Rotary Position Embedding.
+    """
+    cos = cos.unsqueeze(unsqueeze_dim)  # insert the broadcast axis described above
+    sin = sin.unsqueeze(unsqueeze_dim)
+    q_embed = (q * cos) + (rotate_half(q) * sin)
+    k_embed = (k * cos) + (rotate_half(k) * sin)
+    return q_embed, k_embed
+
+
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """
+    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
+    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
+    """
+    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+    if n_rep == 1:  # nothing to repeat: the input tensor is returned as is
+        return hidden_states
+    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)  # new repeat axis
+    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)  # fold the repeat axis into the heads
+
+
+def eager_attention_forward(
+    module: nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: Optional[torch.Tensor],
+    scaling: float,
+    dropout: float = 0.0,
+    **kwargs: Unpack[TransformersKwargs],
+):
+    key_states = repeat_kv(key, module.num_key_value_groups)  # repeat KV heads by the module's group count
+    value_states = repeat_kv(value, module.num_key_value_groups)
+
+    attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling  # scaled query-key scores
+    if attention_mask is not None:
+        causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]  # crop the mask's last dim to the key length
+        attn_weights = attn_weights + causal_mask  # the mask is added to the scores
+
+    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)  # fp32 softmax
+    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
+    attn_output = torch.matmul(attn_weights, value_states)
+    attn_output = attn_output.transpose(1, 2).contiguous()  # swap dims 1 and 2 before returning
+
+    return attn_output, attn_weights
+
+
+class Dots1Attention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+    def __init__(self, config: Dots1Config, layer_idx: int):
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+        self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)  # config.head_dim wins
+        self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
+        self.scaling = self.head_dim**-0.5
+        self.attention_dropout = config.attention_dropout
+        self.is_causal = True
+
+        self.q_proj = nn.Linear(
+            config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.k_proj = nn.Linear(
+            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.v_proj = nn.Linear(
+            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.o_proj = nn.Linear(
+            config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
+        )
+        self.q_norm = Dots1RMSNorm(self.head_dim, eps=config.rms_norm_eps)  # unlike olmo, only on the head dim!
+        self.k_norm = Dots1RMSNorm(self.head_dim, eps=config.rms_norm_eps)  # thus post q_norm does not need reshape
+        self.sliding_window = config.sliding_window if config.layer_types[layer_idx] == "sliding_attention" else None
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
+        attention_mask: Optional[torch.Tensor],
+        past_key_values: Optional[Cache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        input_shape = hidden_states.shape[:-1]
+        hidden_shape = (*input_shape, -1, self.head_dim)  # splits each projection's last dim into (heads, head_dim)
+
+        query_states = self.q_norm(self.q_proj(hidden_states).view(hidden_shape)).transpose(1, 2)
+        key_states = self.k_norm(self.k_proj(hidden_states).view(hidden_shape)).transpose(1, 2)
+        value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)  # values are not normalized
+
+        cos, sin = position_embeddings
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+
+        if past_key_values is not None:
+            # sin and cos are specific to RoPE models; cache_position needed for the static cache
+            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        attention_interface: Callable = eager_attention_forward
+        if self.config._attn_implementation != "eager":  # any other backend is looked up by name
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            dropout=0.0 if not self.training else self.attention_dropout,  # dropout only while training
+            scaling=self.scaling,
+            sliding_window=self.sliding_window,  # diff with Llama
+            **kwargs,
+        )
+
+        attn_output = attn_output.reshape(*input_shape, -1).contiguous()  # merge the trailing dims back into one
+        attn_output = self.o_proj(attn_output)
+        return attn_output, attn_weights
+
+
+class Dots1MLP(nn.Module):
+    def __init__(self, config, hidden_size=None, intermediate_size=None):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size if hidden_size is None else hidden_size  # argument overrides config
+        self.intermediate_size = config.intermediate_size if intermediate_size is None else intermediate_size
+
+        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
+        self.act_fn = ACT2FN[config.hidden_act]
+
+    def forward(self, x):
+        down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))  # act(gate) * up, then down
+        return down_proj
+
+
+class Dots1MoE(nn.Module):
+    """
+    A mixed expert module containing shared experts.
+    """
+
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.experts = nn.ModuleList(
+            [Dots1MLP(config, intermediate_size=config.moe_intermediate_size) for _ in range(config.n_routed_experts)]
+        )
+        self.gate = Dots1TopkRouter(config)
+        self.shared_experts = Dots1MLP(
+            config=config, intermediate_size=config.moe_intermediate_size * config.n_shared_experts
+        )
+
+    def moe(self, hidden_states: torch.Tensor, topk_indices: torch.Tensor, topk_weights: torch.Tensor):
+        r"""
+        CALL FOR CONTRIBUTION! I don't have time to optimise this right now, but expert weights need to be fused
+        to not have to do a loop here (deepseek has 256 experts soooo yeah).
+        """
+        final_hidden_states = torch.zeros_like(hidden_states, dtype=topk_weights.dtype)  # accumulator in the weight dtype
+        expert_mask = torch.nn.functional.one_hot(topk_indices, num_classes=len(self.experts))
+        expert_mask = expert_mask.permute(2, 0, 1)  # expert axis first, so it can be indexed per expert
+
+        for expert_idx in range(len(self.experts)):
+            expert = self.experts[expert_idx]
+            mask = expert_mask[expert_idx]
+            token_indices, weight_indices = torch.where(mask)  # rows routed to this expert and their top-k slot
+
+            if token_indices.numel() > 0:  # skip experts that received no tokens
+                expert_weights = topk_weights[token_indices, weight_indices]
+                expert_input = hidden_states[token_indices]
+                expert_output = expert(expert_input)
+                weighted_output = expert_output * expert_weights.unsqueeze(-1)
+                final_hidden_states.index_add_(0, token_indices, weighted_output)  # accumulate into the token rows
+
+        # in the original deepseek, the outputs of the experts are gathered once we leave this module
+        # thus the moe module is itself an IsolatedParallel module
+        # and all experts are "local", meaning we shard but we don't gather
+        return final_hidden_states.type(hidden_states.dtype)
+
+    def forward(self, hidden_states):
+        residuals = hidden_states
+        orig_shape = hidden_states.shape
+        topk_indices, topk_weights = self.gate(hidden_states)
+        hidden_states = hidden_states.view(-1, hidden_states.shape[-1])  # flatten all leading dims into one
+        hidden_states = self.moe(hidden_states, topk_indices, topk_weights).view(*orig_shape)
+        hidden_states = hidden_states + self.shared_experts(residuals)  # shared experts see the unflattened input
+        return hidden_states
+
+
+class Dots1TopkRouter(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.top_k = config.num_experts_per_tok
+        self.n_routed_experts = config.n_routed_experts
+        self.routed_scaling_factor = config.routed_scaling_factor
+        self.n_group = config.n_group
+        self.topk_group = config.topk_group
+        self.norm_topk_prob = config.norm_topk_prob
+
+        self.weight = nn.Parameter(torch.empty((self.n_routed_experts, config.hidden_size)))  # uninitialized here
+        self.register_buffer("e_score_correction_bias", torch.zeros(self.n_routed_experts))
+
+    @torch.no_grad()
+    def get_topk_indices(self, scores):
+        scores_for_choice = scores.view(-1, self.n_routed_experts) + self.e_score_correction_bias.unsqueeze(0)
+        group_scores = (
+            scores_for_choice.view(-1, self.n_group, self.n_routed_experts // self.n_group)
+            .topk(2, dim=-1)[0]  # the two highest scores inside each group
+            .sum(dim=-1)  # group score = their sum
+        )
+        group_idx = torch.topk(group_scores, k=self.topk_group, dim=-1, sorted=False)[1]  # best topk_group groups
+        group_mask = torch.zeros_like(group_scores)
+        group_mask.scatter_(1, group_idx, 1)
+        score_mask = (
+            group_mask.unsqueeze(-1)
+            .expand(-1, self.n_group, self.n_routed_experts // self.n_group)
+            .reshape(-1, self.n_routed_experts)
+        )
+        scores_for_choice = scores_for_choice.masked_fill(~score_mask.bool(), 0.0)  # zero experts of unselected groups
+        topk_indices = torch.topk(scores_for_choice, k=self.top_k, dim=-1, sorted=False)[1]
+        return topk_indices
+
+    def forward(self, hidden_states):
+        hidden_states = hidden_states.view(-1, self.config.hidden_size)
+        router_logits = F.linear(hidden_states.type(torch.float32), self.weight.type(torch.float32))  # float32 logits
+        scores = router_logits.sigmoid()
+        topk_indices = self.get_topk_indices(scores)
+        topk_weights = scores.gather(1, topk_indices)  # gathered from `scores`, i.e. without the correction bias
+        if self.norm_topk_prob:
+            denominator = topk_weights.sum(dim=-1, keepdim=True) + 1e-20  # small epsilon avoids division by zero
+            topk_weights /= denominator
+        topk_weights = topk_weights * self.routed_scaling_factor
+        return topk_indices, topk_weights
+
+
+class Dots1DecoderLayer(GradientCheckpointingLayer):
+    def __init__(self, config: Dots1Config, layer_idx: int):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+
+        self.self_attn = Dots1Attention(config=config, layer_idx=layer_idx)
+
+        if layer_idx >= config.first_k_dense_replace:  # only the first first_k_dense_replace layers stay dense
+            self.mlp = Dots1MoE(config)
+        else:
+            self.mlp = Dots1MLP(config)
+
+        self.input_layernorm = Dots1RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = Dots1RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.attention_type = config.layer_types[layer_idx]
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        use_cache: Optional[bool] = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> torch.Tensor:
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+        # Self Attention (the returned attention weights are discarded here)
+        hidden_states, _ = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            position_ids=position_ids,  # not a named parameter of Dots1Attention.forward; lands in its **kwargs
+            past_key_values=past_key_values,
+            use_cache=use_cache,  # likewise forwarded through **kwargs
+            cache_position=cache_position,
+            position_embeddings=position_embeddings,
+            **kwargs,
+        )
+        hidden_states = residual + hidden_states
+
+        # Feed-forward: dense MLP or MoE, as chosen in __init__
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states
+        return hidden_states
+
+
+@auto_docstring
+class Dots1PreTrainedModel(PreTrainedModel):
+    config: Dots1Config
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["Dots1DecoderLayer"]
+    _skip_keys_device_placement = ["past_key_values"]
+    _supports_flash_attn = True
+    _supports_sdpa = True
+    _supports_flex_attn = True
+    _can_compile_fullgraph = False
+    _supports_attention_backend = True
+    _can_record_outputs = {
+        "hidden_states": Dots1DecoderLayer,
+        "attentions": Dots1Attention,
+    }
+
+    def _init_weights(self, module):
+        super()._init_weights(module)
+        if isinstance(module, Dots1TopkRouter):  # the router weight is created with torch.empty, so fill it here
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+
+
+@auto_docstring
+class Dots1Model(Dots1PreTrainedModel):
+    def __init__(self, config: Dots1Config):
+        super().__init__(config)
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+        self.layers = nn.ModuleList(
+            [Dots1DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+        )
+        self.norm = Dots1RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.rotary_emb = Dots1RotaryEmbedding(config=config)
+        self.gradient_checkpointing = False
+        self.has_sliding_layers = "sliding_attention" in self.config.layer_types  # gates the sliding mask below
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @check_model_inputs
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> BaseModelOutputWithPast:
+        if (input_ids is None) ^ (inputs_embeds is not None):  # true when both are given, or neither
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+
+        if use_cache and past_key_values is None:  # caching requested but no cache supplied: create one
+            past_key_values = DynamicCache(config=self.config)
+
+        if cache_position is None:  # default: positions that follow whatever the cache already holds
+            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+            cache_position = torch.arange(
+                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
+            )
+
+        if position_ids is None:  # default: the cache positions with a leading dimension of size 1
+            position_ids = cache_position.unsqueeze(0)
+
+        # `attention_mask` may already be a dict of prepared masks (e.g. built by `generate`); then use it as is
+        if not isinstance(causal_mask_mapping := attention_mask, dict):
+            # Prepare mask arguments
+            mask_kwargs = {
+                "config": self.config,
+                "input_embeds": inputs_embeds,
+                "attention_mask": attention_mask,
+                "cache_position": cache_position,
+                "past_key_values": past_key_values,
+                "position_ids": position_ids,
+            }
+            # Create the masks, keyed by the attention type of the layers that consume them
+            causal_mask_mapping = {
+                "full_attention": create_causal_mask(**mask_kwargs),
+            }
+            # Only build the sliding-window mask when the config actually lists sliding-attention layers
+            if self.has_sliding_layers:
+                causal_mask_mapping["sliding_attention"] = create_sliding_window_causal_mask(**mask_kwargs)
+
+        hidden_states = inputs_embeds
+
+        # create position embeddings to be shared across the decoder layers
+        position_embeddings = self.rotary_emb(hidden_states, position_ids)
+
+        for decoder_layer in self.layers[: self.config.num_hidden_layers]:
+            hidden_states = decoder_layer(
+                hidden_states,
+                attention_mask=causal_mask_mapping[decoder_layer.attention_type],  # mask for this layer's type
+                position_ids=position_ids,
+                past_key_values=past_key_values,
+                use_cache=use_cache,
+                cache_position=cache_position,
+                position_embeddings=position_embeddings,
+                **kwargs,
+            )
+
+        hidden_states = self.norm(hidden_states)
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values if use_cache else None,  # the cache is only returned on request
+        )
+
+
+@auto_docstring
+class Dots1ForCausalLM(Dots1PreTrainedModel, GenerationMixin):
+    _tied_weights_keys = ["lm_head.weight"]
+    _tp_plan = {"lm_head": "colwise_rep"}
+    _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.model = Dots1Model(config)
+        self.vocab_size = config.vocab_size
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)  # hidden size -> vocab logits
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> CausalLMOutputWithPast:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+        Example:
+
+        ```python
+        >>> from transformers import AutoTokenizer, Dots1ForCausalLM
+
+        >>> model = Dots1ForCausalLM.from_pretrained("rednote-hilab/dots1.llm1.inst")
+        >>> tokenizer = AutoTokenizer.from_pretrained("rednote-hilab/dots1.llm1.inst")
+
+        >>> prompt = "Hey, are you conscious? Can you talk to me?"
+        >>> inputs = tokenizer(prompt, return_tensors="pt")
+
+        >>> # Generate
+        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
+        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
+        ```"""
+        outputs: BaseModelOutputWithPast = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            **kwargs,
+        )
+
+        hidden_states = outputs.last_hidden_state
+        # Only compute the logits that are needed: an int keeps the last `logits_to_keep` positions (0 keeps all)
+        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+        logits = self.lm_head(hidden_states[:, slice_indices, :])
+
+        loss = None
+        if labels is not None:  # the loss is only computed when labels are supplied
+            loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs)
+
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+__all__ = ["Dots1PreTrainedModel", "Dots1Model", "Dots1ForCausalLM"]
diff --git a/phivenv/Lib/site-packages/transformers/models/dots1/modular_dots1.py b/phivenv/Lib/site-packages/transformers/models/dots1/modular_dots1.py
new file mode 100644
index 0000000000000000000000000000000000000000..345265a14080c7e8978c37b34124e208536b1da2
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/dots1/modular_dots1.py
@@ -0,0 +1,111 @@
+# coding=utf-8
+# Copyright 2025 The rednote-hilab team and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from ...modeling_outputs import CausalLMOutputWithPast
+from ...processing_utils import Unpack
+from ...utils import logging
+from ..deepseek_v3.modeling_deepseek_v3 import (
+    DeepseekV3DecoderLayer,
+    DeepseekV3MLP,
+    DeepseekV3MoE,
+    DeepseekV3PreTrainedModel,
+    DeepseekV3TopkRouter,
+)
+from ..qwen3.modeling_qwen3 import (
+    Qwen3Attention,
+    Qwen3ForCausalLM,
+    Qwen3Model,
+    Qwen3RMSNorm,
+    Qwen3RotaryEmbedding,
+    TransformersKwargs,
+)
+from .configuration_dots1 import Dots1Config
+
+
+logger = logging.get_logger(__name__)
+
+
+class Dots1RMSNorm(Qwen3RMSNorm):  # Qwen3's RMSNorm, inherited unchanged under the Dots1 name
+    pass
+
+
+class Dots1RotaryEmbedding(Qwen3RotaryEmbedding):  # Qwen3's rotary embedding, inherited unchanged
+    pass
+
+
+class Dots1Attention(Qwen3Attention):  # Qwen3's attention module, inherited unchanged
+    pass
+
+
+class Dots1MLP(DeepseekV3MLP):  # DeepseekV3's MLP, inherited unchanged
+    pass
+
+
+class Dots1MoE(DeepseekV3MoE):  # DeepseekV3's MoE block, inherited unchanged
+    pass
+
+
+class Dots1TopkRouter(DeepseekV3TopkRouter):  # DeepseekV3's top-k router, inherited unchanged
+    pass
+
+
+class Dots1DecoderLayer(DeepseekV3DecoderLayer):  # DeepseekV3 decoder layer plus a per-layer attention type
+    def __init__(self, config: Dots1Config, layer_idx: int):
+        super().__init__(config, layer_idx)
+        self.attention_type = config.layer_types[layer_idx]  # this layer's entry in `config.layer_types`
+
+
+class Dots1PreTrainedModel(DeepseekV3PreTrainedModel):  # DeepseekV3's pretrained base, inherited unchanged
+    pass
+
+
+class Dots1Model(Qwen3Model):  # Qwen3's model class, inherited unchanged under the Dots1 name
+    pass
+
+
+class Dots1ForCausalLM(Qwen3ForCausalLM):  # Qwen3's causal-LM head model with a Dots1-specific docstring
+    def forward(
+        self,
+        **super_kwargs: Unpack[TransformersKwargs],
+    ) -> CausalLMOutputWithPast:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+        Example:
+
+        ```python
+        >>> from transformers import AutoTokenizer, Dots1ForCausalLM
+
+        >>> model = Dots1ForCausalLM.from_pretrained("rednote-hilab/dots1.llm1.inst")
+        >>> tokenizer = AutoTokenizer.from_pretrained("rednote-hilab/dots1.llm1.inst")
+
+        >>> prompt = "Hey, are you conscious? Can you talk to me?"
+        >>> inputs = tokenizer(prompt, return_tensors="pt")
+
+        >>> # Generate
+        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
+        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
+        ```"""
+        return super().forward(**super_kwargs)  # pure pass-through to the Qwen3 implementation
+
+
+__all__ = [
+    "Dots1PreTrainedModel",
+    "Dots1Model",
+    "Dots1ForCausalLM",
+]
diff --git a/phivenv/Lib/site-packages/transformers/models/dpr/__init__.py b/phivenv/Lib/site-packages/transformers/models/dpr/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..9aeadbeaf416575570c280a3e15a52422a007103
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/dpr/__init__.py
@@ -0,0 +1,30 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:  # star imports are only evaluated by static type checkers
+    from .configuration_dpr import *
+    from .modeling_dpr import *
+    from .modeling_tf_dpr import *
+    from .tokenization_dpr import *
+    from .tokenization_dpr_fast import *
+else:  # at runtime this module's entry in `sys.modules` is replaced by a `_LazyModule` instance
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/phivenv/Lib/site-packages/transformers/models/dpr/__pycache__/__init__.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/dpr/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5bd635711f8c6cf8a4a46255f5ec81493323e7aa
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/dpr/__pycache__/__init__.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/dpr/__pycache__/configuration_dpr.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/dpr/__pycache__/configuration_dpr.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d925cdc14ff3cc5038212855e789493d11931278
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/dpr/__pycache__/configuration_dpr.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/dpr/__pycache__/modeling_dpr.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/dpr/__pycache__/modeling_dpr.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bc938de8bd47217311bb2b40d965f65be1e0d6ab
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/dpr/__pycache__/modeling_dpr.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/dpr/__pycache__/modeling_tf_dpr.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/dpr/__pycache__/modeling_tf_dpr.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..39994aafe820d9104ecd3fb3c8fde3b8b5212a74
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/dpr/__pycache__/modeling_tf_dpr.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/dpr/__pycache__/tokenization_dpr.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/dpr/__pycache__/tokenization_dpr.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..db42fcf67595094c41bf55151819fa241bf21e1c
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/dpr/__pycache__/tokenization_dpr.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/dpr/__pycache__/tokenization_dpr_fast.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/dpr/__pycache__/tokenization_dpr_fast.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..165a51f54d7e2c2be8ddd3bdc3762446ef8eb472
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/dpr/__pycache__/tokenization_dpr_fast.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/dpr/configuration_dpr.py b/phivenv/Lib/site-packages/transformers/models/dpr/configuration_dpr.py
new file mode 100644
index 0000000000000000000000000000000000000000..03b16900249329ad867ae6b13b58b89d7722a25a
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/dpr/configuration_dpr.py
@@ -0,0 +1,131 @@
+# coding=utf-8
+# Copyright 2010, DPR authors, The Hugging Face Team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""DPR model configuration"""
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class DPRConfig(PretrainedConfig):
+    r"""
+    [`DPRConfig`] is the configuration class to store the configuration of a *DPRModel*.
+
+    This is the configuration class to store the configuration of a [`DPRContextEncoder`], [`DPRQuestionEncoder`], or a
+    [`DPRReader`]. It is used to instantiate the components of the DPR model according to the specified arguments,
+    defining the model component architectures. Instantiating a configuration with the defaults will yield a similar
+    configuration to that of the DPRContextEncoder
+    [facebook/dpr-ctx_encoder-single-nq-base](https://huggingface.co/facebook/dpr-ctx_encoder-single-nq-base)
+    architecture.
+
+    This class is a subclass of [`PretrainedConfig`]. Please check the superclass for the documentation of all kwargs.
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 30522):
+            Vocabulary size of the DPR model. Defines the different tokens that can be represented by the *input_ids*
+            passed to the forward method of [`BertModel`].
+        hidden_size (`int`, *optional*, defaults to 768):
+            Dimensionality of the encoder layers and the pooler layer.
+        num_hidden_layers (`int`, *optional*, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 12):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        intermediate_size (`int`, *optional*, defaults to 3072):
+            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"silu"` and `"gelu_new"` are supported.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout ratio for the attention probabilities.
+        max_position_embeddings (`int`, *optional*, defaults to 512):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            just in case (e.g., 512 or 1024 or 2048).
+        type_vocab_size (`int`, *optional*, defaults to 2):
+            The vocabulary size of the *token_type_ids* passed into [`BertModel`].
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
+            The epsilon used by the layer normalization layers.
+        pad_token_id (`int`, *optional*, defaults to 0):
+            Padding token id.
+        position_embedding_type (`str`, *optional*, defaults to `"absolute"`):
+            Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For
+            positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to
+            [Self-Attention with Relative Position Representations (Shaw et al.)](https://huggingface.co/papers/1803.02155).
+            For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models
+            with Better Relative Position Embeddings (Huang et al.)](https://huggingface.co/papers/2009.13658).
+        projection_dim (`int`, *optional*, defaults to 0):
+            Dimension of the projection for the context and question encoders. If it is set to zero (default), then no
+            projection is done.
+
+    Example:
+
+    ```python
+    >>> from transformers import DPRConfig, DPRContextEncoder
+
+    >>> # Initializing a DPR facebook/dpr-ctx_encoder-single-nq-base style configuration
+    >>> configuration = DPRConfig()
+
+    >>> # Initializing a model (with random weights) from the facebook/dpr-ctx_encoder-single-nq-base style configuration
+    >>> model = DPRContextEncoder(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "dpr"
+
+    def __init__(
+        self,
+        vocab_size=30522,
+        hidden_size=768,
+        num_hidden_layers=12,
+        num_attention_heads=12,
+        intermediate_size=3072,
+        hidden_act="gelu",
+        hidden_dropout_prob=0.1,
+        attention_probs_dropout_prob=0.1,
+        max_position_embeddings=512,
+        type_vocab_size=2,
+        initializer_range=0.02,
+        layer_norm_eps=1e-12,
+        pad_token_id=0,
+        position_embedding_type="absolute",
+        projection_dim: int = 0,
+        **kwargs,
+    ):
+        super().__init__(pad_token_id=pad_token_id, **kwargs)  # pad id and extra kwargs go to the base class
+
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.hidden_act = hidden_act
+        self.intermediate_size = intermediate_size
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.max_position_embeddings = max_position_embeddings
+        self.type_vocab_size = type_vocab_size
+        self.initializer_range = initializer_range
+        self.layer_norm_eps = layer_norm_eps
+        self.projection_dim = projection_dim
+        self.position_embedding_type = position_embedding_type
+
+
+__all__ = ["DPRConfig"]
diff --git a/phivenv/Lib/site-packages/transformers/models/dpr/modeling_dpr.py b/phivenv/Lib/site-packages/transformers/models/dpr/modeling_dpr.py
new file mode 100644
index 0000000000000000000000000000000000000000..f1ae00a02e07a0f3ee4c0ca064e7e9818568e605
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/dpr/modeling_dpr.py
@@ -0,0 +1,592 @@
+# coding=utf-8
+# Copyright 2018 DPR Authors, The Hugging Face Team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch DPR model for Open Domain Question Answering."""
+
+from dataclasses import dataclass
+from typing import Optional, Union
+
+import torch
+from torch import Tensor, nn
+
+from ...modeling_outputs import BaseModelOutputWithPooling
+from ...modeling_utils import PreTrainedModel
+from ...utils import (
+    ModelOutput,
+    auto_docstring,
+    logging,
+)
+from ..bert.modeling_bert import BertModel
+from .configuration_dpr import DPRConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+##########
+# Outputs
+##########
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Class for outputs of [`DPRContextEncoder`].
+    """
+)
+class DPRContextEncoderOutput(ModelOutput):
+    r"""
+    pooler_output (`torch.FloatTensor` of shape `(batch_size, embeddings_size)`):
+        The DPR encoder outputs the *pooler_output* that corresponds to the context representation. Last layer
+        hidden-state of the first token of the sequence (classification token) further processed by a Linear layer.
+        This output is to be used to embed contexts for nearest neighbors queries with questions embeddings.
+    """
+
+    pooler_output: torch.FloatTensor  # the only required field; the remaining fields default to None
+    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
+    attentions: Optional[tuple[torch.FloatTensor, ...]] = None
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Class for outputs of [`DPRQuestionEncoder`].
+    """
+)
+class DPRQuestionEncoderOutput(ModelOutput):
+    r"""
+    pooler_output (`torch.FloatTensor` of shape `(batch_size, embeddings_size)`):
+        The DPR encoder outputs the *pooler_output* that corresponds to the question representation. Last layer
+        hidden-state of the first token of the sequence (classification token) further processed by a Linear layer.
+        This output is to be used to embed questions for nearest neighbors queries with context embeddings.
+    """
+
+    pooler_output: torch.FloatTensor  # the only required field; the remaining fields default to None
+    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
+    attentions: Optional[tuple[torch.FloatTensor, ...]] = None
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Class for outputs of [`DPRReader`].
+    """
+)
+class DPRReaderOutput(ModelOutput):
+    r"""
+    start_logits (`torch.FloatTensor` of shape `(n_passages, sequence_length)`):
+        Logits of the start index of the span for each passage.
+    end_logits (`torch.FloatTensor` of shape `(n_passages, sequence_length)`):
+        Logits of the end index of the span for each passage.
+    relevance_logits (`torch.FloatTensor` of shape `(n_passages, )`):
+        Outputs of the QA classifier of the DPRReader that corresponds to the scores of each passage to answer the
+        question, compared to all the other passages.
+    """
+
+    start_logits: torch.FloatTensor  # the only required field; the remaining fields default to None
+    end_logits: Optional[torch.FloatTensor] = None
+    relevance_logits: Optional[torch.FloatTensor] = None
+    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
+    attentions: Optional[tuple[torch.FloatTensor, ...]] = None
+
+
+@auto_docstring
+class DPRPreTrainedModel(PreTrainedModel):
+    _supports_sdpa = True
+
+    def _init_weights(self, module):
+        """Initialize `module` in place based on its layer type."""
+        if isinstance(module, (nn.Linear, nn.Embedding)):
+            # Plain normal init (TF uses truncated_normal), cf https://github.com/pytorch/pytorch/pull/5617
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if isinstance(module, nn.Linear):
+                if module.bias is not None:
+                    module.bias.data.zero_()
+            elif module.padding_idx is not None:
+                # The padding row stays at zero.
+                module.weight.data[module.padding_idx].zero_()
+        elif isinstance(module, nn.LayerNorm):
+            # Zero shift, unit scale.
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+
+
+class DPREncoder(DPRPreTrainedModel):
+    base_model_prefix = "bert_model"
+
+    def __init__(self, config: DPRConfig):
+        super().__init__(config)
+        self.bert_model = BertModel(config, add_pooling_layer=False)  # pooling is done in `forward` instead
+        if self.bert_model.config.hidden_size <= 0:
+            raise ValueError("Encoder hidden_size can't be zero")
+        self.projection_dim = config.projection_dim
+        if self.projection_dim > 0:  # the projection layer only exists for a positive `projection_dim`
+            self.encode_proj = nn.Linear(self.bert_model.config.hidden_size, config.projection_dim)
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def forward(
+        self,
+        input_ids: Tensor,
+        attention_mask: Optional[Tensor] = None,
+        token_type_ids: Optional[Tensor] = None,
+        inputs_embeds: Optional[Tensor] = None,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = False,
+    ) -> Union[BaseModelOutputWithPooling, tuple[Tensor, ...]]:
+        outputs = self.bert_model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        sequence_output = outputs[0]
+        pooled_output = sequence_output[:, 0, :]  # hidden state at the first sequence position
+
+        if self.projection_dim > 0:
+            pooled_output = self.encode_proj(pooled_output)
+
+        if not return_dict:
+            return (sequence_output, pooled_output) + outputs[2:]  # append BERT's outputs from index 2 onwards
+
+        return BaseModelOutputWithPooling(
+            last_hidden_state=sequence_output,
+            pooler_output=pooled_output,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+    @property
+    def embeddings_size(self) -> int:  # size of the last dimension of the pooled output
+        if self.projection_dim > 0:
+            return self.encode_proj.out_features
+        return self.bert_model.config.hidden_size
+
+
+class DPRSpanPredictor(DPRPreTrainedModel):
+    base_model_prefix = "encoder"
+
+    def __init__(self, config: DPRConfig):
+        super().__init__(config)
+        self.encoder = DPREncoder(config)
+        self.qa_outputs = nn.Linear(self.encoder.embeddings_size, 2)  # two outputs per token: start and end
+        self.qa_classifier = nn.Linear(self.encoder.embeddings_size, 1)  # one relevance score per passage
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def forward(
+        self,
+        input_ids: Tensor,  # may be None when `inputs_embeds` is given (see the size lookup below)
+        attention_mask: Tensor,
+        inputs_embeds: Optional[Tensor] = None,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = False,
+    ) -> Union[DPRReaderOutput, tuple[Tensor, ...]]:
+        # notations: N - number of questions in a batch, M - number of passages per questions, L - sequence length
+        n_passages, sequence_length = input_ids.size() if input_ids is not None else inputs_embeds.size()[:2]
+        # feed encoder
+        outputs = self.encoder(
+            input_ids,
+            attention_mask=attention_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        sequence_output = outputs[0]
+
+        # compute logits
+        logits = self.qa_outputs(sequence_output)
+        start_logits, end_logits = logits.split(1, dim=-1)  # split the last dimension into start / end
+        start_logits = start_logits.squeeze(-1).contiguous()
+        end_logits = end_logits.squeeze(-1).contiguous()
+        relevance_logits = self.qa_classifier(sequence_output[:, 0, :])  # scored from the first position only
+
+        # reshape to (n_passages, sequence_length) for the span logits and (n_passages,) for relevance
+        start_logits = start_logits.view(n_passages, sequence_length)
+        end_logits = end_logits.view(n_passages, sequence_length)
+        relevance_logits = relevance_logits.view(n_passages)
+
+        if not return_dict:
+            return (start_logits, end_logits, relevance_logits) + outputs[2:]  # skip sequence and pooled outputs
+
+        return DPRReaderOutput(
+            start_logits=start_logits,
+            end_logits=end_logits,
+            relevance_logits=relevance_logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+##################
+# PreTrainedModel
+##################
+
+
+class DPRPretrainedContextEncoder(DPRPreTrainedModel):
+    """
+    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+    models.
+    """
+
+    config: DPRConfig
+    load_tf_weights = None
+    base_model_prefix = "ctx_encoder"  # matches the attribute `DPRContextEncoder` stores its `DPREncoder` under
+
+
+class DPRPretrainedQuestionEncoder(DPRPreTrainedModel):
+    """
+    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+    models.
+    """
+
+    config: DPRConfig
+    load_tf_weights = None
+    base_model_prefix = "question_encoder"  # NOTE(review): presumably DPRQuestionEncoder's submodule name — confirm
+
+
+class DPRPretrainedReader(DPRPreTrainedModel):
+    """
+    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+    models.
+    """
+
+    config: DPRConfig
+    load_tf_weights = None
+    base_model_prefix = "span_predictor"  # NOTE(review): presumably DPRReader's submodule name — confirm
+
+
+###############
+# Actual Models
+###############
+
+
+@auto_docstring(
+    custom_intro="""
+    The bare DPRContextEncoder transformer outputting pooler outputs as context representations.
+    """
+)
+class DPRContextEncoder(DPRPretrainedContextEncoder):
+    def __init__(self, config: DPRConfig):
+        super().__init__(config)
+        self.config = config
+        self.ctx_encoder = DPREncoder(config)
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[Tensor] = None,
+        attention_mask: Optional[Tensor] = None,
+        token_type_ids: Optional[Tensor] = None,
+        inputs_embeds: Optional[Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[DPRContextEncoderOutput, tuple[Tensor, ...]]:
+        r"""
+        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+            Indices of input sequence tokens in the vocabulary. To match pretraining, DPR input sequence should be
+            formatted with [CLS] and [SEP] tokens as follows:
+
+            (a) For sequence pairs (for a pair title+text for example):
+
+            ```
+            tokens:         [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
+            token_type_ids:   0   0  0    0    0     0       0   0   1  1  1  1   1   1
+            ```
+
+            (b) For single sequences (for a question for example):
+
+            ```
+            tokens:         [CLS] the dog is hairy . [SEP]
+            token_type_ids:   0   0   0   0  0     0   0
+            ```
+
+            DPR is a model with absolute position embeddings so it's usually advised to pad the inputs on the right
+            rather than the left.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+
+        Examples:
+
+        ```python
+        >>> from transformers import DPRContextEncoder, DPRContextEncoderTokenizer
+
+        >>> tokenizer = DPRContextEncoderTokenizer.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
+        >>> model = DPRContextEncoder.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
+        >>> input_ids = tokenizer("Hello, is my dog cute ?", return_tensors="pt")["input_ids"]
+        >>> embeddings = model(input_ids).pooler_output
+        ```"""
+
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+        elif input_ids is not None:
+            input_shape = input_ids.size()
+        elif inputs_embeds is not None:
+            input_shape = inputs_embeds.size()[:-1]
+        else:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+        device = input_ids.device if input_ids is not None else inputs_embeds.device
+
+        if attention_mask is None:
+            attention_mask = (
+                torch.ones(input_shape, device=device)
+                if input_ids is None
+                else (input_ids != self.config.pad_token_id)
+            )
+        if token_type_ids is None:
+            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
+
+        outputs = self.ctx_encoder(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        if not return_dict:
+            return outputs[1:]
+        return DPRContextEncoderOutput(
+            pooler_output=outputs.pooler_output, hidden_states=outputs.hidden_states, attentions=outputs.attentions
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    The bare DPRQuestionEncoder transformer outputting pooler outputs as question representations.
+    """
+)
+class DPRQuestionEncoder(DPRPretrainedQuestionEncoder):
+    def __init__(self, config: DPRConfig):
+        super().__init__(config)
+        self.config = config
+        self.question_encoder = DPREncoder(config)
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[Tensor] = None,
+        attention_mask: Optional[Tensor] = None,
+        token_type_ids: Optional[Tensor] = None,
+        inputs_embeds: Optional[Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[DPRQuestionEncoderOutput, tuple[Tensor, ...]]:
+        r"""
+        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+            Indices of input sequence tokens in the vocabulary. To match pretraining, DPR input sequence should be
+            formatted with [CLS] and [SEP] tokens as follows:
+
+            (a) For sequence pairs (for a pair title+text for example):
+
+            ```
+            tokens:         [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
+            token_type_ids:   0   0  0    0    0     0       0   0   1  1  1  1   1   1
+            ```
+
+            (b) For single sequences (for a question for example):
+
+            ```
+            tokens:         [CLS] the dog is hairy . [SEP]
+            token_type_ids:   0   0   0   0  0     0   0
+            ```
+
+            DPR is a model with absolute position embeddings so it's usually advised to pad the inputs on the right
+            rather than the left.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+
+        Examples:
+
+        ```python
+        >>> from transformers import DPRQuestionEncoder, DPRQuestionEncoderTokenizer
+
+        >>> tokenizer = DPRQuestionEncoderTokenizer.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
+        >>> model = DPRQuestionEncoder.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
+        >>> input_ids = tokenizer("Hello, is my dog cute ?", return_tensors="pt")["input_ids"]
+        >>> embeddings = model(input_ids).pooler_output
+        ```
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+        elif input_ids is not None:
+            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
+            input_shape = input_ids.size()
+        elif inputs_embeds is not None:
+            input_shape = inputs_embeds.size()[:-1]
+        else:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+        device = input_ids.device if input_ids is not None else inputs_embeds.device
+
+        if attention_mask is None:
+            attention_mask = (
+                torch.ones(input_shape, device=device)
+                if input_ids is None
+                else (input_ids != self.config.pad_token_id)
+            )
+        if token_type_ids is None:
+            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
+
+        outputs = self.question_encoder(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        if not return_dict:
+            return outputs[1:]
+        return DPRQuestionEncoderOutput(
+            pooler_output=outputs.pooler_output, hidden_states=outputs.hidden_states, attentions=outputs.attentions
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    The bare DPRReader transformer outputting span predictions.
+    """
+)
+class DPRReader(DPRPretrainedReader):
+    def __init__(self, config: DPRConfig):
+        super().__init__(config)
+        self.config = config
+        self.span_predictor = DPRSpanPredictor(config)
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[Tensor] = None,
+        attention_mask: Optional[Tensor] = None,
+        inputs_embeds: Optional[Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[DPRReaderOutput, tuple[Tensor, ...]]:
+        r"""
+        input_ids (`tuple[torch.LongTensor]` of shapes `(n_passages, sequence_length)`):
+            Indices of input sequence tokens in the vocabulary. It has to be a sequence triplet with 1) the question
+            and 2) the passages titles and 3) the passages texts To match pretraining, DPR `input_ids` sequence should
+            be formatted with [CLS] and [SEP] with the format:
+
+            `[CLS] <question token ids> [SEP] <titles ids> [SEP] <texts ids>`
+
+            DPR is a model with absolute position embeddings so it's usually advised to pad the inputs on the right
+            rather than the left.
+
+            Indices can be obtained using [`DPRReaderTokenizer`]. See this class documentation for more details.
+
+            [What are input IDs?](../glossary#input-ids)
+        inputs_embeds (`torch.FloatTensor` of shape `(n_passages, sequence_length, hidden_size)`, *optional*):
+            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+            model's internal embedding lookup matrix.
+
+        Examples:
+
+        ```python
+        >>> from transformers import DPRReader, DPRReaderTokenizer
+
+        >>> tokenizer = DPRReaderTokenizer.from_pretrained("facebook/dpr-reader-single-nq-base")
+        >>> model = DPRReader.from_pretrained("facebook/dpr-reader-single-nq-base")
+        >>> encoded_inputs = tokenizer(
+        ...     questions=["What is love ?"],
+        ...     titles=["Haddaway"],
+        ...     texts=["'What Is Love' is a song recorded by the artist Haddaway"],
+        ...     return_tensors="pt",
+        ... )
+        >>> outputs = model(**encoded_inputs)
+        >>> start_logits = outputs.start_logits
+        >>> end_logits = outputs.end_logits
+        >>> relevance_logits = outputs.relevance_logits
+        ```
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+        elif input_ids is not None:
+            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
+            input_shape = input_ids.size()
+        elif inputs_embeds is not None:
+            input_shape = inputs_embeds.size()[:-1]
+        else:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+        device = input_ids.device if input_ids is not None else inputs_embeds.device
+
+        if attention_mask is None:
+            attention_mask = torch.ones(input_shape, device=device)
+
+        return self.span_predictor(
+            input_ids,
+            attention_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+
+# Names exported by `from <module> import *`: the three concrete models and their pretrained base classes.
+__all__ = [
+    "DPRContextEncoder",
+    "DPRPretrainedContextEncoder",
+    "DPRPreTrainedModel",
+    "DPRPretrainedQuestionEncoder",
+    "DPRPretrainedReader",
+    "DPRQuestionEncoder",
+    "DPRReader",
+]
diff --git a/phivenv/Lib/site-packages/transformers/models/dpr/modeling_tf_dpr.py b/phivenv/Lib/site-packages/transformers/models/dpr/modeling_tf_dpr.py
new file mode 100644
index 0000000000000000000000000000000000000000..aef83e6c55fbe27ea57e48bf2baca515999010cb
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/dpr/modeling_tf_dpr.py
@@ -0,0 +1,799 @@
+# coding=utf-8
+# Copyright 2018 DPR Authors, The Hugging Face Team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""TensorFlow DPR model for Open Domain Question Answering."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+
+import tensorflow as tf
+
+from ...modeling_tf_outputs import TFBaseModelOutputWithPooling
+from ...modeling_tf_utils import TFModelInputType, TFPreTrainedModel, get_initializer, keras, shape_list, unpack_inputs
+from ...utils import (
+    ModelOutput,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+    replace_return_docstrings,
+)
+from ..bert.modeling_tf_bert import TFBertMainLayer
+from .configuration_dpr import DPRConfig
+
+
+# Module-level logger, obtained through the library's own `logging` helper.
+logger = logging.get_logger(__name__)
+
+# Name of the configuration class as a string; presumably consumed by the docstring decorators imported above
+# (TODO confirm against the rest of the file).
+_CONFIG_FOR_DOC = "DPRConfig"
+
+
+##########
+# Outputs
+##########
+
+
+@dataclass
+class TFDPRContextEncoderOutput(ModelOutput):
+    r"""
+    Class for outputs of [`TFDPRContextEncoder`].
+
+    Args:
+        pooler_output (`tf.Tensor` of shape `(batch_size, embeddings_size)`):
+            The DPR encoder outputs the *pooler_output* that corresponds to the context representation. Last layer
+            hidden-state of the first token of the sequence (classification token) further processed by a Linear layer.
+            This output is to be used to embed contexts for nearest neighbors queries with questions embeddings.
+        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
+            `(batch_size, sequence_length, hidden_size)`.
+
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`.
+
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+            heads.
+    """
+
+    # Every field defaults to `None`.
+    # NOTE(review): the declaration order presumably defines the tuple order of the output — confirm against
+    # `ModelOutput` before reordering fields.
+    pooler_output: tf.Tensor | None = None
+    hidden_states: tuple[tf.Tensor, ...] | None = None
+    attentions: tuple[tf.Tensor, ...] | None = None
+
+
+@dataclass
+class TFDPRQuestionEncoderOutput(ModelOutput):
+    """
+    Class for outputs of [`TFDPRQuestionEncoder`].
+
+    Args:
+        pooler_output (`tf.Tensor` of shape `(batch_size, embeddings_size)`):
+            The DPR encoder outputs the *pooler_output* that corresponds to the question representation. Last layer
+            hidden-state of the first token of the sequence (classification token) further processed by a Linear layer.
+            This output is to be used to embed questions for nearest neighbors queries with context embeddings.
+        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
+            `(batch_size, sequence_length, hidden_size)`.
+
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`.
+
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+            heads.
+    """
+
+    # Every field defaults to `None`.
+    # NOTE(review): the declaration order presumably defines the tuple order of the output — confirm against
+    # `ModelOutput` before reordering fields.
+    pooler_output: tf.Tensor | None = None
+    hidden_states: tuple[tf.Tensor, ...] | None = None
+    attentions: tuple[tf.Tensor, ...] | None = None
+
+
+@dataclass
+class TFDPRReaderOutput(ModelOutput):
+    """
+    Class for outputs of [`TFDPRReader`].
+
+    Args:
+        start_logits (`tf.Tensor` of shape `(n_passages, sequence_length)`):
+            Logits of the start index of the span for each passage.
+        end_logits (`tf.Tensor` of shape `(n_passages, sequence_length)`):
+            Logits of the end index of the span for each passage.
+        relevance_logits (`tf.Tensor` of shape `(n_passages, )`):
+            Outputs of the QA classifier of the DPRReader that corresponds to the scores of each passage to answer the
+            question, compared to all the other passages.
+        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
+            `(batch_size, sequence_length, hidden_size)`.
+
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`.
+
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+            heads.
+    """
+
+    # Every field defaults to `None`.
+    # NOTE(review): the declaration order presumably defines the tuple order of the output — confirm against
+    # `ModelOutput` before reordering fields.
+    start_logits: tf.Tensor | None = None
+    end_logits: tf.Tensor | None = None
+    relevance_logits: tf.Tensor | None = None
+    hidden_states: tuple[tf.Tensor, ...] | None = None
+    attentions: tuple[tf.Tensor, ...] | None = None
+
+
+class TFDPREncoderLayer(keras.layers.Layer):
+    base_model_prefix = "bert_model"
+
+    def __init__(self, config: DPRConfig, **kwargs):
+        super().__init__(**kwargs)
+
+        # resolve name conflict with TFBertMainLayer instead of TFBertModel
+        self.bert_model = TFBertMainLayer(config, add_pooling_layer=False, name="bert_model")
+        self.config = config
+
+        if self.config.hidden_size <= 0:
+            raise ValueError("Encoder hidden_size can't be zero")
+        self.projection_dim = config.projection_dim
+        if self.projection_dim > 0:
+            self.encode_proj = keras.layers.Dense(
+                config.projection_dim, kernel_initializer=get_initializer(config.initializer_range), name="encode_proj"
+            )
+
+    @unpack_inputs
+    def call(
+        self,
+        input_ids: tf.Tensor | None = None,
+        attention_mask: tf.Tensor | None = None,
+        token_type_ids: tf.Tensor | None = None,
+        inputs_embeds: tf.Tensor | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        training: bool = False,
+    ) -> TFBaseModelOutputWithPooling | tuple[tf.Tensor, ...]:
+        outputs = self.bert_model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+
+        sequence_output = outputs[0]
+        pooled_output = sequence_output[:, 0, :]
+        if self.projection_dim > 0:
+            pooled_output = self.encode_proj(pooled_output)
+
+        if not return_dict:
+            return (sequence_output, pooled_output) + outputs[1:]
+
+        return TFBaseModelOutputWithPooling(
+            last_hidden_state=sequence_output,
+            pooler_output=pooled_output,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+    @property
+    def embeddings_size(self) -> int:
+        if self.projection_dim > 0:
+            return self.projection_dim
+        return self.bert_model.config.hidden_size
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "bert_model", None) is not None:
+            with tf.name_scope(self.bert_model.name):
+                self.bert_model.build(None)
+        if getattr(self, "encode_proj", None) is not None:
+            with tf.name_scope(self.encode_proj.name):
+                self.encode_proj.build(None)
+
+
+class TFDPRSpanPredictorLayer(keras.layers.Layer):
+    base_model_prefix = "encoder"
+
+    def __init__(self, config: DPRConfig, **kwargs):
+        super().__init__(**kwargs)
+        self.config = config
+        self.encoder = TFDPREncoderLayer(config, name="encoder")
+
+        self.qa_outputs = keras.layers.Dense(
+            2, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
+        )
+        self.qa_classifier = keras.layers.Dense(
+            1, kernel_initializer=get_initializer(config.initializer_range), name="qa_classifier"
+        )
+
+    @unpack_inputs
+    def call(
+        self,
+        input_ids: tf.Tensor | None = None,
+        attention_mask: tf.Tensor | None = None,
+        inputs_embeds: tf.Tensor | None = None,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = False,
+        training: bool = False,
+    ) -> TFDPRReaderOutput | tuple[tf.Tensor, ...]:
+        # notations: N - number of questions in a batch, M - number of passages per questions, L - sequence length
+        n_passages, sequence_length = shape_list(input_ids) if input_ids is not None else shape_list(inputs_embeds)[:2]
+        # feed encoder
+        outputs = self.encoder(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+        sequence_output = outputs[0]
+
+        # compute logits
+        logits = self.qa_outputs(sequence_output)
+        start_logits, end_logits = tf.split(logits, 2, axis=-1)
+        start_logits = tf.squeeze(start_logits, axis=-1)
+        end_logits = tf.squeeze(end_logits, axis=-1)
+        relevance_logits = self.qa_classifier(sequence_output[:, 0, :])
+
+        # resize
+        start_logits = tf.reshape(start_logits, [n_passages, sequence_length])
+        end_logits = tf.reshape(end_logits, [n_passages, sequence_length])
+        relevance_logits = tf.reshape(relevance_logits, [n_passages])
+
+        if not return_dict:
+            return (start_logits, end_logits, relevance_logits) + outputs[2:]
+
+        return TFDPRReaderOutput(
+            start_logits=start_logits,
+            end_logits=end_logits,
+            relevance_logits=relevance_logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "encoder", None) is not None:
+            with tf.name_scope(self.encoder.name):
+                self.encoder.build(None)
+        if getattr(self, "qa_outputs", None) is not None:
+            with tf.name_scope(self.qa_outputs.name):
+                self.qa_outputs.build([None, None, self.encoder.embeddings_size])
+        if getattr(self, "qa_classifier", None) is not None:
+            with tf.name_scope(self.qa_classifier.name):
+                self.qa_classifier.build([None, None, self.encoder.embeddings_size])
+
+
+class TFDPRSpanPredictor(TFPreTrainedModel):
+    base_model_prefix = "encoder"
+
+    def __init__(self, config: DPRConfig, **kwargs):
+        super().__init__(config, **kwargs)
+        self.encoder = TFDPRSpanPredictorLayer(config)
+
+    @unpack_inputs
+    def call(
+        self,
+        input_ids: tf.Tensor | None = None,
+        attention_mask: tf.Tensor | None = None,
+        token_type_ids: tf.Tensor | None = None,
+        inputs_embeds: tf.Tensor | None = None,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = False,
+        training: bool = False,
+    ) -> TFDPRReaderOutput | tuple[tf.Tensor, ...]:
+        outputs = self.encoder(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+
+        return outputs
+
+
+class TFDPREncoder(TFPreTrainedModel):
+    base_model_prefix = "encoder"
+
+    def __init__(self, config: DPRConfig, **kwargs):
+        super().__init__(config, **kwargs)
+
+        self.encoder = TFDPREncoderLayer(config)
+
+    @unpack_inputs
+    def call(
+        self,
+        input_ids: tf.Tensor | None = None,
+        attention_mask: tf.Tensor | None = None,
+        token_type_ids: tf.Tensor | None = None,
+        inputs_embeds: tf.Tensor | None = None,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = False,
+        training: bool = False,
+    ) -> TFDPRReaderOutput | tuple[tf.Tensor, ...]:
+        outputs = self.encoder(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+        return outputs
+
+
+##################
+# PreTrainedModel
+##################
+
+
+class TFDPRPretrainedContextEncoder(TFPreTrainedModel):
+    """
+    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+    models.
+    """
+
+    config_class = DPRConfig
+    base_model_prefix = "ctx_encoder"
+
+
+class TFDPRPretrainedQuestionEncoder(TFPreTrainedModel):
+    """
+    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+    models.
+    """
+
+    config_class = DPRConfig
+    base_model_prefix = "question_encoder"
+
+
+class TFDPRPretrainedReader(TFPreTrainedModel):
+    """
+    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+    models.
+    """
+
+    config_class = DPRConfig
+    base_model_prefix = "reader"
+
+
+###############
+# Actual Models
+###############
+
+
+TF_DPR_START_DOCSTRING = r"""
+
+    This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.)
+
+    This model is also a TensorFlow [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model)
+    subclass. Use it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matters related to
+    general usage and behavior.
+
+    <Tip>
+
+    TensorFlow models and layers in `transformers` accept two formats as input:
+
+    - having all inputs as keyword arguments (like PyTorch models), or
+    - having all inputs as a list, tuple or dict in the first positional argument.
+
+    The reason the second format is supported is that Keras methods prefer this format when passing inputs to models
+    and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just
+    pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second
+    format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with
+    the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first
+    positional argument:
+
+    - a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
+    - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
+    `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
+    - a dictionary with one or several input Tensors associated to the input names given in the docstring:
+    `model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
+
+    Note that when creating models and layers with
+    [subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry
+    about any of this, as you can just pass inputs like you would to any other Python function!
+
+    </Tip>
+
+    Parameters:
+        config ([`DPRConfig`]): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+TF_DPR_ENCODERS_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`Numpy array` or `tf.Tensor` of shape `(batch_size, sequence_length)`):
+            Indices of input sequence tokens in the vocabulary. To match pretraining, DPR input sequence should be
+            formatted with [CLS] and [SEP] tokens as follows:
+
+            (a) For sequence pairs (for a pair title+text for example):
+
+            ```
+            tokens:         [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
+            token_type_ids:   0   0  0    0    0     0       0   0   1  1  1  1   1   1
+            ```
+
+            (b) For single sequences (for a question for example):
+
+            ```
+            tokens:         [CLS] the dog is hairy . [SEP]
+            token_type_ids:   0   0   0   0  0     0   0
+            ```
+
+            DPR is a model with absolute position embeddings so it's usually advised to pad the inputs on the right
+            rather than the left.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        attention_mask (`Numpy array` or `tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
+        token_type_ids (`Numpy array` or `tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
+            1]`:
+
+            - 0 corresponds to a *sentence A* token,
+            - 1 corresponds to a *sentence B* token.
+
+            [What are token type IDs?](../glossary#token-type-ids)
+        inputs_embeds (`Numpy array` or `tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+            model's internal embedding lookup matrix.
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the
+            config will be used instead.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
+            used instead.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in
+            eager mode, in graph mode the value will always be set to True.
+        training (`bool`, *optional*, defaults to `False`):
+            Whether or not to use the model in training mode (some modules like dropout modules have different
+            behaviors between training and evaluation).
+"""
+
+TF_DPR_READER_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`Numpy array` or `tf.Tensor` of shape `(n_passages, sequence_length)`):
+            Indices of input sequence tokens in the vocabulary. It has to be a sequence triplet with 1) the question,
+            2) the passage titles and 3) the passage texts. To match pretraining, the DPR `input_ids` sequence should
+            be formatted with [CLS] and [SEP] tokens with the format:
+
+                `[CLS] <question token ids> [SEP] <titles ids> [SEP] <texts ids>`
+
+            DPR is a model with absolute position embeddings so it's usually advised to pad the inputs on the right
+            rather than the left.
+
+            Indices can be obtained using [`DPRReaderTokenizer`]. See this class documentation for more details.
+        attention_mask (`Numpy array` or `tf.Tensor` of shape `(n_passages, sequence_length)`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
+        inputs_embeds (`Numpy array` or `tf.Tensor` of shape `(n_passages, sequence_length, hidden_size)`, *optional*):
+            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+            model's internal embedding lookup matrix.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
+            used instead.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in
+            eager mode, in graph mode the value will always be set to True.
+        training (`bool`, *optional*, defaults to `False`):
+            Whether or not to use the model in training mode (some modules like dropout modules have different
+            behaviors between training and evaluation).
+"""
+
+
+@add_start_docstrings(
+    "The bare DPRContextEncoder transformer outputting pooler outputs as context representations.",
+    TF_DPR_START_DOCSTRING,
+)
+class TFDPRContextEncoder(TFDPRPretrainedContextEncoder):
+    # Wraps a single `TFDPREncoderLayer`, stored as `self.ctx_encoder`.
+    def __init__(self, config: DPRConfig, *args, **kwargs):
+        super().__init__(config, *args, **kwargs)
+        self.ctx_encoder = TFDPREncoderLayer(config, name="ctx_encoder")
+
+    def get_input_embeddings(self):
+        """Return the wrapped BERT model's input embeddings, retrying after `build()` on `AttributeError`."""
+        try:
+            embeddings = self.ctx_encoder.bert_model.get_input_embeddings()
+        except AttributeError:
+            self.build()
+            embeddings = self.ctx_encoder.bert_model.get_input_embeddings()
+        return embeddings
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(TF_DPR_ENCODERS_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=TFDPRContextEncoderOutput, config_class=_CONFIG_FOR_DOC)
+    def call(
+        self,
+        input_ids: TFModelInputType | None = None,
+        attention_mask: tf.Tensor | None = None,
+        token_type_ids: tf.Tensor | None = None,
+        inputs_embeds: tf.Tensor | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        training: bool = False,
+    ) -> TFDPRContextEncoderOutput | tuple[tf.Tensor, ...]:
+        r"""
+        Return:
+
+        Examples:
+
+        ```python
+        >>> from transformers import TFDPRContextEncoder, DPRContextEncoderTokenizer
+
+        >>> tokenizer = DPRContextEncoderTokenizer.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
+        >>> model = TFDPRContextEncoder.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base", from_pt=True)
+        >>> input_ids = tokenizer("Hello, is my dog cute ?", return_tensors="tf")["input_ids"]
+        >>> embeddings = model(input_ids).pooler_output
+        ```
+        """
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+        if input_ids is None and inputs_embeds is None:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")
+        input_shape = shape_list(input_ids) if input_ids is not None else shape_list(inputs_embeds)[:-1]
+
+        if attention_mask is None:
+            if input_ids is None:
+                attention_mask = tf.ones(input_shape, dtype=tf.dtypes.int32)
+            else:
+                attention_mask = input_ids != self.config.pad_token_id
+        if token_type_ids is None:
+            token_type_ids = tf.zeros(input_shape, dtype=tf.dtypes.int32)
+
+        outputs = self.ctx_encoder(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+
+        if return_dict:
+            return TFDPRContextEncoderOutput(
+                pooler_output=outputs.pooler_output, hidden_states=outputs.hidden_states, attentions=outputs.attentions
+            )
+        return outputs[1:]  # tuple output: everything after the first element
+
+    def build(self, input_shape=None):
+        """Build the wrapped encoder once, under its own name scope; `input_shape` is not used."""
+        if self.built:
+            return
+        self.built = True
+        encoder = getattr(self, "ctx_encoder", None)
+        if encoder is not None:
+            with tf.name_scope(encoder.name):
+                encoder.build(None)
+
+
+@add_start_docstrings(
+    "The bare DPRQuestionEncoder transformer outputting pooler outputs as question representations.",
+    TF_DPR_START_DOCSTRING,
+)
+class TFDPRQuestionEncoder(TFDPRPretrainedQuestionEncoder):
+    def __init__(self, config: DPRConfig, *args, **kwargs):
+        super().__init__(config, *args, **kwargs)
+        self.question_encoder = TFDPREncoderLayer(config, name="question_encoder")
+
+    def get_input_embeddings(self):
+        """Return the wrapped BERT model's input embeddings, retrying after `build()` on `AttributeError`."""
+        try:
+            embeddings = self.question_encoder.bert_model.get_input_embeddings()
+        except AttributeError:
+            self.build()
+            embeddings = self.question_encoder.bert_model.get_input_embeddings()
+        return embeddings
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(TF_DPR_ENCODERS_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=TFDPRQuestionEncoderOutput, config_class=_CONFIG_FOR_DOC)
+    def call(
+        self,
+        input_ids: TFModelInputType | None = None,
+        attention_mask: tf.Tensor | None = None,
+        token_type_ids: tf.Tensor | None = None,
+        inputs_embeds: tf.Tensor | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        training: bool = False,
+    ) -> TFDPRQuestionEncoderOutput | tuple[tf.Tensor, ...]:
+        r"""
+        Return:
+
+        Examples:
+
+        ```python
+        >>> from transformers import TFDPRQuestionEncoder, DPRQuestionEncoderTokenizer
+
+        >>> tokenizer = DPRQuestionEncoderTokenizer.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
+        >>> model = TFDPRQuestionEncoder.from_pretrained("facebook/dpr-question_encoder-single-nq-base", from_pt=True)
+        >>> input_ids = tokenizer("Hello, is my dog cute ?", return_tensors="tf")["input_ids"]
+        >>> embeddings = model(input_ids).pooler_output
+        ```
+        """
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+        if input_ids is None and inputs_embeds is None:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")
+        input_shape = shape_list(input_ids) if input_ids is not None else shape_list(inputs_embeds)[:-1]
+
+        if attention_mask is None:
+            if input_ids is None:
+                attention_mask = tf.ones(input_shape, dtype=tf.dtypes.int32)
+            else:
+                attention_mask = input_ids != self.config.pad_token_id
+        if token_type_ids is None:
+            token_type_ids = tf.zeros(input_shape, dtype=tf.dtypes.int32)
+
+        outputs = self.question_encoder(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+
+        if return_dict:
+            return TFDPRQuestionEncoderOutput(
+                pooler_output=outputs.pooler_output, hidden_states=outputs.hidden_states, attentions=outputs.attentions
+            )
+        return outputs[1:]  # tuple output: everything after the first element
+
+    def build(self, input_shape=None):
+        """Build the wrapped encoder once, under its own name scope; `input_shape` is not used."""
+        if self.built:
+            return
+        self.built = True
+        encoder = getattr(self, "question_encoder", None)
+        if encoder is not None:
+            with tf.name_scope(encoder.name):
+                encoder.build(None)
+
+
+@add_start_docstrings(
+    "The bare DPRReader transformer outputting span predictions.",
+    TF_DPR_START_DOCSTRING,
+)
+class TFDPRReader(TFDPRPretrainedReader):
+    def __init__(self, config: DPRConfig, *args, **kwargs):
+        super().__init__(config, *args, **kwargs)
+        self.span_predictor = TFDPRSpanPredictorLayer(config, name="span_predictor")
+
+    def get_input_embeddings(self):
+        """Return the wrapped BERT model's input embeddings, retrying after `build()` on `AttributeError`."""
+        try:
+            embeddings = self.span_predictor.encoder.bert_model.get_input_embeddings()
+        except AttributeError:
+            self.build()
+            embeddings = self.span_predictor.encoder.bert_model.get_input_embeddings()
+        return embeddings
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(TF_DPR_READER_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=TFDPRReaderOutput, config_class=_CONFIG_FOR_DOC)
+    def call(
+        self,
+        input_ids: TFModelInputType | None = None,
+        attention_mask: tf.Tensor | None = None,
+        inputs_embeds: tf.Tensor | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        training: bool = False,
+    ) -> TFDPRReaderOutput | tuple[tf.Tensor, ...]:
+        r"""
+        Return:
+
+        Examples:
+
+        ```python
+        >>> from transformers import TFDPRReader, DPRReaderTokenizer
+
+        >>> tokenizer = DPRReaderTokenizer.from_pretrained("facebook/dpr-reader-single-nq-base")
+        >>> model = TFDPRReader.from_pretrained("facebook/dpr-reader-single-nq-base", from_pt=True)
+        >>> encoded_inputs = tokenizer(
+        ...     questions=["What is love ?"],
+        ...     titles=["Haddaway"],
+        ...     texts=["'What Is Love' is a song recorded by the artist Haddaway"],
+        ...     return_tensors="tf",
+        ... )
+        >>> outputs = model(encoded_inputs)
+        >>> start_logits = outputs.start_logits
+        >>> end_logits = outputs.end_logits
+        >>> relevance_logits = outputs.relevance_logits
+        ```
+        """
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+        if input_ids is None and inputs_embeds is None:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")
+        input_shape = shape_list(input_ids) if input_ids is not None else shape_list(inputs_embeds)[:-1]
+
+        if attention_mask is None:
+            attention_mask = tf.ones(input_shape, dtype=tf.dtypes.int32)
+
+        return self.span_predictor(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True  # flagged first, so a repeated call returns immediately
+        predictor = getattr(self, "span_predictor", None)
+        if predictor is not None:
+            with tf.name_scope(predictor.name):
+                predictor.build(None)
+
+
+__all__ = [  # names exported by a star-import of this module
+    "TFDPRContextEncoder",
+    "TFDPRPretrainedContextEncoder",
+    "TFDPRPretrainedQuestionEncoder",
+    "TFDPRPretrainedReader",
+    "TFDPRQuestionEncoder",
+    "TFDPRReader",
+]
diff --git a/phivenv/Lib/site-packages/transformers/models/dpr/tokenization_dpr.py b/phivenv/Lib/site-packages/transformers/models/dpr/tokenization_dpr.py
new file mode 100644
index 0000000000000000000000000000000000000000..020b235cb6bd97bda74f2e067294a9391617e00f
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/dpr/tokenization_dpr.py
@@ -0,0 +1,321 @@
+# coding=utf-8
+# Copyright 2018 The HuggingFace Inc. team, The Hugging Face Team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization classes for DPR."""
+
+import collections
+from typing import Optional, Union
+
+from ...tokenization_utils_base import BatchEncoding
+from ...utils import TensorType, add_end_docstrings, add_start_docstrings, logging
+from ..bert.tokenization_bert import BertTokenizer
+
+
+logger = logging.get_logger(__name__)
+# File names shared by all tokenizer classes in this module through their `vocab_files_names` attribute.
+VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"}
+
+
+class DPRContextEncoderTokenizer(BertTokenizer):
+    r"""
+    Construct a DPRContextEncoder tokenizer.
+
+    [`DPRContextEncoderTokenizer`] is identical to [`BertTokenizer`] and runs end-to-end tokenization: punctuation
+    splitting and wordpiece.
+
+    Refer to superclass [`BertTokenizer`] for usage examples and documentation concerning parameters.
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES  # the only override; everything else is inherited from BertTokenizer
+
+
+class DPRQuestionEncoderTokenizer(BertTokenizer):
+    r"""
+    Construct a DPRQuestionEncoder tokenizer.
+
+    [`DPRQuestionEncoderTokenizer`] is identical to [`BertTokenizer`] and runs end-to-end tokenization: punctuation
+    splitting and wordpiece.
+
+    Refer to superclass [`BertTokenizer`] for usage examples and documentation concerning parameters.
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES  # the only override; everything else is inherited from BertTokenizer
+
+
+DPRSpanPrediction = collections.namedtuple(  # one answer span, as built by `decode_best_spans`
+    "DPRSpanPrediction", ["span_score", "relevance_score", "doc_id", "start_index", "end_index", "text"]
+)
+# Triple of logits that `decode_best_spans` unpacks from its `reader_output` argument.
+DPRReaderOutput = collections.namedtuple("DPRReaderOutput", ["start_logits", "end_logits", "relevance_logits"])
+
+
+CUSTOM_DPR_READER_DOCSTRING = r"""
+    Return a dictionary with the token ids of the input strings and other information to give to `.decode_best_spans`.
+    It converts the strings of a question and different passages (title and text) in a sequence of IDs (integers),
+    using the tokenizer and vocabulary. The resulting `input_ids` is a matrix of size `(n_passages, sequence_length)`
+    with the format:
+
+    ```
+    [CLS] <question token ids> [SEP] <titles ids> [SEP] <texts ids>
+    ```
+
+    Args:
+        questions (`str` or `list[str]`):
+            The questions to be encoded. You can specify one question for many passages. In this case, the question
+            will be duplicated like `[questions] * n_passages`. Otherwise you have to specify as many questions as in
+            `titles` or `texts`.
+        titles (`str` or `list[str]`):
+            The passages titles to be encoded. This can be a string or a list of strings if there are several passages.
+        texts (`str` or `list[str]`):
+            The passages texts to be encoded. This can be a string or a list of strings if there are several passages.
+        padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`):
+            Activates and controls padding. Accepts the following values:
+
+            - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single sequence
+              is provided).
+            - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
+              acceptable input length for the model if that argument is not provided.
+            - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
+              lengths).
+        truncation (`bool`, `str` or [`~tokenization_utils_base.TruncationStrategy`], *optional*, defaults to `False`):
+            Activates and controls truncation. Accepts the following values:
+
+            - `True` or `'longest_first'`: Truncate to a maximum length specified with the argument `max_length` or to
+              the maximum acceptable input length for the model if that argument is not provided. This will truncate
+              token by token, removing a token from the longest sequence in the pair if a pair of sequences (or a batch
+              of pairs) is provided.
+            - `'only_first'`: Truncate to a maximum length specified with the argument `max_length` or to the maximum
+              acceptable input length for the model if that argument is not provided. This will only truncate the first
+              sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
+            - `'only_second'`: Truncate to a maximum length specified with the argument `max_length` or to the maximum
+              acceptable input length for the model if that argument is not provided. This will only truncate the
+              second sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
+            - `False` or `'do_not_truncate'` (default): No truncation (i.e., can output batch with sequence lengths
+              greater than the model maximum admissible input size).
+        max_length (`int`, *optional*):
+                Controls the maximum length to use by one of the truncation/padding parameters.
+
+                If left unset or set to `None`, this will use the predefined model maximum length if a maximum length
+                is required by one of the truncation/padding parameters. If the model has no specific maximum input
+                length (like XLNet) truncation/padding to a maximum length will be deactivated.
+        return_tensors (`str` or [`~utils.TensorType`], *optional*):
+                If set, will return tensors instead of list of python integers. Acceptable values are:
+
+                - `'tf'`: Return TensorFlow `tf.constant` objects.
+                - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                - `'np'`: Return Numpy `np.ndarray` objects.
+        return_attention_mask (`bool`, *optional*):
+            Whether or not to return the attention mask. If not set, will return the attention mask according to the
+            specific tokenizer's default, defined by the `return_outputs` attribute.
+
+            [What are attention masks?](../glossary#attention-mask)
+
+    Returns:
+        `dict[str, list[list[int]]]`: A dictionary with the following keys:
+
+        - `input_ids`: List of token ids to be fed to a model.
+        - `attention_mask`: List of indices specifying which tokens should be attended to by the model.
+    """
+
+
+@add_start_docstrings(CUSTOM_DPR_READER_DOCSTRING)
+class CustomDPRReaderTokenizerMixin:
+    # Mixin: reaches the real tokenizer via `super()` and uses the host's `pad`, `decode` and special-token ids.
+    def __call__(
+        self,
+        questions: Union[str, list[str]],
+        titles: Optional[Union[str, list[str]]] = None,
+        texts: Optional[Union[str, list[str]]] = None,
+        padding: Union[bool, str] = False,
+        truncation: Union[bool, str] = False,
+        max_length: Optional[int] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        return_attention_mask: Optional[bool] = None,
+        **kwargs,
+    ) -> BatchEncoding:
+        """Encode question/title/text triplets for the reader; the arguments are described in the class docstring."""
+        if titles is None or texts is None:
+            # Fewer than two passage fields: plain parent call with the given field (if any) as `text_pair`.
+            text_pair = titles if texts is None else texts
+            return super().__call__(
+                questions,
+                text_pair,
+                padding=padding,
+                truncation=truncation,
+                max_length=max_length,
+                return_tensors=return_tensors,
+                return_attention_mask=return_attention_mask,
+                **kwargs,
+            )
+        titles = titles if not isinstance(titles, str) else [titles]
+        texts = texts if not isinstance(texts, str) else [texts]
+        questions = questions if not isinstance(questions, str) else [questions] * len(titles)
+        if len(titles) != len(texts):
+            raise ValueError(
+                f"There should be as many titles than texts but got {len(titles)} titles and {len(texts)} texts."
+            )
+        encoded_question_and_titles = super().__call__(questions, titles, padding=False, truncation=False)["input_ids"]
+        encoded_texts = super().__call__(texts, add_special_tokens=False, padding=False, truncation=False)["input_ids"]
+        merged = [head + tail for head, tail in zip(encoded_question_and_titles, encoded_texts)]
+        input_ids = [ids[:max_length] for ids in merged] if max_length is not None and truncation else merged
+        encoded_inputs = {"input_ids": input_ids}
+        if return_attention_mask is not False:
+            encoded_inputs["attention_mask"] = [[int(tok != self.pad_token_id) for tok in ids] for ids in input_ids]
+        # Forward the flag: left at its default, `pad` may add a mask even though the caller passed `False`.
+        return self.pad(
+            encoded_inputs,
+            padding=padding,
+            max_length=max_length,
+            return_tensors=return_tensors,
+            return_attention_mask=return_attention_mask,
+        )
+
+    def decode_best_spans(
+        self,
+        reader_input: BatchEncoding,
+        reader_output: DPRReaderOutput,
+        num_spans: int = 16,
+        max_answer_length: int = 64,
+        num_spans_per_passage: int = 4,
+    ) -> list[DPRSpanPrediction]:
+        """
+        Get the span predictions for the extractive Q&A model.
+
+        Args:
+            reader_input: Encoding whose `input_ids` entry holds the token ids of every passage.
+            reader_output: Reader output; its first three items are the start, end and relevance logits.
+            num_spans: Maximum number of spans returned overall.
+            max_answer_length: Maximum number of tokens in a span.
+            num_spans_per_passage: Maximum number of spans kept for a single passage.
+
+        Returns: *List* of *DPRSpanPrediction* sorted by descending *(relevance_score, span_score)*. Each
+        *DPRSpanPrediction* is a *Tuple* with:
+
+            - **span_score**: `float` that corresponds to the score given by the reader for this span compared to other
+              spans in the same passage. It corresponds to the sum of the start and end logits of the span.
+            - **relevance_score**: `float` that corresponds to the score of each passage to answer the question,
+              compared to all the other passages. It corresponds to the output of the QA classifier of the DPRReader.
+            - **doc_id**: `int` the id of the passage.
+            - **start_index**: `int` the start index of the span (inclusive).
+            - **end_index**: `int` the end index of the span (inclusive).
+            - **text**: `str` the decoded tokens of the span.
+
+        Examples:
+
+        ```python
+        >>> from transformers import DPRReader, DPRReaderTokenizer
+
+        >>> tokenizer = DPRReaderTokenizer.from_pretrained("facebook/dpr-reader-single-nq-base")
+        >>> model = DPRReader.from_pretrained("facebook/dpr-reader-single-nq-base")
+        >>> encoded_inputs = tokenizer(
+        ...     questions=["What is love ?"],
+        ...     titles=["Haddaway"],
+        ...     texts=["'What Is Love' is a song recorded by the artist Haddaway"],
+        ...     return_tensors="pt",
+        ... )
+        >>> outputs = model(**encoded_inputs)
+        >>> predicted_spans = tokenizer.decode_best_spans(encoded_inputs, outputs)
+        >>> print(predicted_spans[0].text)  # best span
+        a song
+        ```"""
+        input_ids = reader_input["input_ids"]
+        start_logits, end_logits, relevance_logits = reader_output[:3]
+        n_passages = len(relevance_logits)
+        sorted_docs = sorted(range(n_passages), reverse=True, key=relevance_logits.__getitem__)
+        nbest_spans_predictions: list[DPRSpanPrediction] = []
+        for doc_id in sorted_docs:
+            sequence_ids = list(input_ids[doc_id])
+            # Assumes question & title come first. NOTE(review): `index(sep, 2)` returns the first [SEP] found at
+            # position >= 2 (normally the one closing the question), not a second one -- confirm the offset.
+            passage_offset = sequence_ids.index(self.sep_token_id, 2) + 1  # second sep id
+            sequence_len = len(sequence_ids)
+            if sequence_ids[-1] == self.pad_token_id:
+                sequence_len = sequence_ids.index(self.pad_token_id)
+
+            best_spans = self._get_best_spans(
+                start_logits=start_logits[doc_id][passage_offset:sequence_len],
+                end_logits=end_logits[doc_id][passage_offset:sequence_len],
+                max_answer_length=max_answer_length,
+                top_spans=num_spans_per_passage,
+            )
+            for start_index, end_index in best_spans:
+                # spans are relative to the sliced logits; shift them back to positions in the full sequence
+                start_index, end_index = start_index + passage_offset, end_index + passage_offset
+                nbest_spans_predictions.append(
+                    DPRSpanPrediction(
+                        span_score=start_logits[doc_id][start_index] + end_logits[doc_id][end_index],
+                        relevance_score=relevance_logits[doc_id],
+                        doc_id=doc_id,
+                        start_index=start_index,
+                        end_index=end_index,
+                        text=self.decode(sequence_ids[start_index : end_index + 1]),
+                    )
+                )
+            if len(nbest_spans_predictions) >= num_spans:
+                break
+        return nbest_spans_predictions[:num_spans]
+
+    def _get_best_spans(
+        self,
+        start_logits: list[float],
+        end_logits: list[float],
+        max_answer_length: int,
+        top_spans: int,
+    ) -> list[tuple[int, int]]:
+        """
+        Finds the best answer spans for the extractive Q&A model for one passage. It returns at most `top_spans`
+        inclusive `(start_index, end_index)` pairs by descending `span_score`. Spans longer than `max_answer_length`
+        are not considered, and a span nested in (or enclosing) an already chosen span is skipped.
+        """
+        scores = []
+        for start_index, start_score in enumerate(start_logits):
+            for answer_length, end_score in enumerate(end_logits[start_index : start_index + max_answer_length]):
+                scores.append(((start_index, start_index + answer_length), start_score + end_score))
+        scores = sorted(scores, key=lambda x: x[1], reverse=True)
+        chosen_span_intervals = []
+        for (start_index, end_index), score in scores:
+            if start_index > end_index:
+                raise ValueError(f"Wrong span indices: [{start_index}:{end_index}]")
+            length = end_index - start_index + 1
+            if length > max_answer_length:
+                raise ValueError(f"Span is too long: {length} > {max_answer_length}")
+            if any(
+                start_index <= prev_start_index <= prev_end_index <= end_index
+                or prev_start_index <= start_index <= end_index <= prev_end_index
+                for (prev_start_index, prev_end_index) in chosen_span_intervals
+            ):
+                continue  # contains, or is contained in, a span that was already chosen
+            chosen_span_intervals.append((start_index, end_index))
+            if len(chosen_span_intervals) == top_spans:
+                break
+        return chosen_span_intervals
+
+
+@add_end_docstrings(CUSTOM_DPR_READER_DOCSTRING)
+class DPRReaderTokenizer(CustomDPRReaderTokenizerMixin, BertTokenizer):
+    r"""
+    Construct a DPRReader tokenizer.
+
+    [`DPRReaderTokenizer`] is almost identical to [`BertTokenizer`] and runs end-to-end tokenization: punctuation
+    splitting and wordpiece. The difference is that it has three input strings: question, titles and texts that are
+    combined to be fed to the [`DPRReader`] model.
+
+    Refer to superclass [`BertTokenizer`] for usage examples and documentation concerning parameters.
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    model_input_names = ["input_ids", "attention_mask"]  # `token_type_ids` is not listed
+
+# Names exported by a star-import of this module; `DPRSpanPrediction` is not among them.
+__all__ = ["DPRContextEncoderTokenizer", "DPRQuestionEncoderTokenizer", "DPRReaderOutput", "DPRReaderTokenizer"]
diff --git a/phivenv/Lib/site-packages/transformers/models/dpr/tokenization_dpr_fast.py b/phivenv/Lib/site-packages/transformers/models/dpr/tokenization_dpr_fast.py
new file mode 100644
index 0000000000000000000000000000000000000000..dbf745291745c3ac29472391822b09ba68d933a4
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/dpr/tokenization_dpr_fast.py
@@ -0,0 +1,321 @@
+# coding=utf-8
+# Copyright 2018 The HuggingFace Inc. team, The Hugging Face Team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization classes for DPR."""
+
+import collections
+from typing import Optional, Union
+
+from ...tokenization_utils_base import BatchEncoding
+from ...utils import TensorType, add_end_docstrings, add_start_docstrings, logging
+from ..bert.tokenization_bert_fast import BertTokenizerFast
+from .tokenization_dpr import DPRContextEncoderTokenizer, DPRQuestionEncoderTokenizer, DPRReaderTokenizer
+
+
+logger = logging.get_logger(__name__)  # module-level logger, named after this module
+
+VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"}  # reused by the tokenizers below
+
+
+class DPRContextEncoderTokenizerFast(BertTokenizerFast):
+    r"""
+    "Fast" tokenizer for the DPRContextEncoder, backed by HuggingFace's *tokenizers* library.
+
+    [`DPRContextEncoderTokenizerFast`] behaves exactly like [`BertTokenizerFast`]: it performs end-to-end
+    tokenization, i.e. punctuation splitting followed by wordpiece.
+
+    See the superclass [`BertTokenizerFast`] for usage examples and the documentation of the parameters.
+    """
+
+    slow_tokenizer_class = DPRContextEncoderTokenizer
+    vocab_files_names = VOCAB_FILES_NAMES
+
+
+class DPRQuestionEncoderTokenizerFast(BertTokenizerFast):
+    r"""
+    "Fast" tokenizer for the DPRQuestionEncoder, backed by HuggingFace's *tokenizers* library.
+
+    [`DPRQuestionEncoderTokenizerFast`] behaves exactly like [`BertTokenizerFast`]: it performs end-to-end
+    tokenization, i.e. punctuation splitting followed by wordpiece.
+
+    See the superclass [`BertTokenizerFast`] for usage examples and the documentation of the parameters.
+    """
+
+    slow_tokenizer_class = DPRQuestionEncoderTokenizer
+    vocab_files_names = VOCAB_FILES_NAMES
+
+
+# Plain records: one scored answer span, and the (start, end, relevance) logits triple consumed when decoding spans.
+DPRSpanPrediction = collections.namedtuple(
+    "DPRSpanPrediction", ("span_score", "relevance_score", "doc_id", "start_index", "end_index", "text")
+)
+DPRReaderOutput = collections.namedtuple("DPRReaderOutput", ("start_logits", "end_logits", "relevance_logits"))
+
+
+CUSTOM_DPR_READER_DOCSTRING = r"""
+    Return a dictionary with the token ids of the input strings and other information to give to `.decode_best_spans`.
+    It converts the strings of a question and different passages (title and text) in a sequence of IDs (integers),
+    using the tokenizer and vocabulary. The resulting `input_ids` is a matrix of size `(n_passages, sequence_length)`
+    with the format:
+
+    [CLS] <question token ids> [SEP] <titles ids> [SEP] <texts ids>
+
+    Args:
+        questions (`str` or `list[str]`):
+            The questions to be encoded. You can specify one question for many passages. In this case, the question
+            will be duplicated like `[questions] * n_passages`. Otherwise you have to specify as many questions as in
+            `titles` or `texts`.
+        titles (`str` or `list[str]`):
+            The passages titles to be encoded. This can be a string or a list of strings if there are several passages.
+        texts (`str` or `list[str]`):
+            The passages texts to be encoded. This can be a string or a list of strings if there are several passages.
+        padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`):
+            Activates and controls padding. Accepts the following values:
+
+            - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single sequence
+              is provided).
+            - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
+              acceptable input length for the model if that argument is not provided.
+            - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
+              lengths).
+        truncation (`bool`, `str` or [`~tokenization_utils_base.TruncationStrategy`], *optional*, defaults to `False`):
+            Activates and controls truncation. Accepts the following values:
+
+            - `True` or `'longest_first'`: Truncate to a maximum length specified with the argument `max_length` or to
+              the maximum acceptable input length for the model if that argument is not provided. This will truncate
+              token by token, removing a token from the longest sequence in the pair if a pair of sequences (or a batch
+              of pairs) is provided.
+            - `'only_first'`: Truncate to a maximum length specified with the argument `max_length` or to the maximum
+              acceptable input length for the model if that argument is not provided. This will only truncate the first
+              sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
+            - `'only_second'`: Truncate to a maximum length specified with the argument `max_length` or to the maximum
+              acceptable input length for the model if that argument is not provided. This will only truncate the
+              second sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
+            - `False` or `'do_not_truncate'` (default): No truncation (i.e., can output batch with sequence lengths
+              greater than the model maximum admissible input size).
+        max_length (`int`, *optional*):
+            Controls the maximum length to use by one of the truncation/padding parameters.
+
+            If left unset or set to `None`, this will use the predefined model maximum length if a maximum length
+            is required by one of the truncation/padding parameters. If the model has no specific maximum input
+            length (like XLNet) truncation/padding to a maximum length will be deactivated.
+        return_tensors (`str` or [`~utils.TensorType`], *optional*):
+            If set, will return tensors instead of list of python integers. Acceptable values are:
+
+            - `'tf'`: Return TensorFlow `tf.constant` objects.
+            - `'pt'`: Return PyTorch `torch.Tensor` objects.
+            - `'np'`: Return Numpy `np.ndarray` objects.
+        return_attention_mask (`bool`, *optional*):
+            Whether or not to return the attention mask. If not set, will return the attention mask according to the
+            specific tokenizer's default, defined by the `return_outputs` attribute.
+
+            [What are attention masks?](../glossary#attention-mask)
+
+    Return:
+        `dict[str, list[list[int]]]`: A dictionary with the following keys:
+
+        - `input_ids`: List of token ids to be fed to a model.
+        - `attention_mask`: List of indices specifying which tokens should be attended to by the model.
+    """
+
+
+@add_start_docstrings(CUSTOM_DPR_READER_DOCSTRING)
+class CustomDPRReaderTokenizerMixin:
+    def __call__(
+        self,
+        questions,
+        titles: Optional[Union[str, list[str]]] = None,
+        texts: Optional[Union[str, list[str]]] = None,
+        padding: Union[bool, str] = False,
+        truncation: Union[bool, str] = False,
+        max_length: Optional[int] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        return_attention_mask: Optional[bool] = None,
+        **kwargs,
+    ) -> BatchEncoding:
+        """
+        Encode questions together with passage titles and texts, producing one input row per passage.
+
+        When `titles` and/or `texts` is `None`, the call is simply forwarded to the parent tokenizer. Otherwise each
+        row is the encoded `(question, title)` pair followed by the text ids (no special tokens added to the text),
+        cut to `max_length` when `truncation` is set, and the batch is finally padded with `self.pad`.
+
+        Raises:
+            ValueError: If `titles` and `texts` do not contain the same number of passages.
+        """
+        if titles is None or texts is None:
+            # At most one of them is set; when both are `None` this is a plain single-sequence call.
+            text_pair = titles if texts is None else texts
+            return super().__call__(
+                questions,
+                text_pair,
+                padding=padding,
+                truncation=truncation,
+                max_length=max_length,
+                return_tensors=return_tensors,
+                return_attention_mask=return_attention_mask,
+                **kwargs,
+            )
+        titles = [titles] if isinstance(titles, str) else titles
+        texts = [texts] if isinstance(texts, str) else texts
+        n_passages = len(titles)
+        if n_passages != len(texts):
+            raise ValueError(
+                f"There should be as many titles than texts but got {len(titles)} titles and {len(texts)} texts."
+            )
+        # A single question string is repeated so that it is paired with every passage.
+        questions = [questions] * n_passages if isinstance(questions, str) else questions
+        encoded_question_and_titles = super().__call__(questions, titles, padding=False, truncation=False)["input_ids"]
+        encoded_texts = super().__call__(texts, add_special_tokens=False, padding=False, truncation=False)["input_ids"]
+        should_truncate = max_length is not None and truncation
+        input_ids = [
+            (head_ids + text_ids)[:max_length] if should_truncate else head_ids + text_ids
+            for head_ids, text_ids in zip(encoded_question_and_titles, encoded_texts)
+        ]
+        encoded_inputs = {"input_ids": input_ids}
+        if return_attention_mask is not False:
+            encoded_inputs["attention_mask"] = [[int(i != self.pad_token_id) for i in row] for row in input_ids]
+        return self.pad(encoded_inputs, padding=padding, max_length=max_length, return_tensors=return_tensors)
+
+    def decode_best_spans(
+        self,
+        reader_input: BatchEncoding,
+        reader_output: DPRReaderOutput,
+        num_spans: int = 16,
+        max_answer_length: int = 64,
+        num_spans_per_passage: int = 4,
+    ) -> list[DPRSpanPrediction]:
+        """
+        Get the span predictions for the extractive Q&A model.
+
+        Returns: *List* of *DPRSpanPrediction* sorted by descending *(relevance_score, span_score)*. Each
+        *DPRSpanPrediction* is a *Tuple* with:
+
+            - **span_score**: `float` that corresponds to the score given by the reader for this span compared to other
+              spans in the same passage. It corresponds to the sum of the start and end logits of the span.
+            - **relevance_score**: `float` that corresponds to the score of each passage to answer the question,
+              compared to all the other passages. It corresponds to the output of the QA classifier of the DPRReader.
+            - **doc_id**: `int` the id of the passage.
+            - **start_index**: `int` the start index of the span (inclusive).
+            - **end_index**: `int` the end index of the span (inclusive).
+            - **text**: the tokens of the span decoded with `self.decode`.
+
+        Examples:
+
+        ```python
+        >>> from transformers import DPRReader, DPRReaderTokenizer
+
+        >>> tokenizer = DPRReaderTokenizer.from_pretrained("facebook/dpr-reader-single-nq-base")
+        >>> model = DPRReader.from_pretrained("facebook/dpr-reader-single-nq-base")
+        >>> encoded_inputs = tokenizer(
+        ...     questions=["What is love ?"],
+        ...     titles=["Haddaway"],
+        ...     texts=["'What Is Love' is a song recorded by the artist Haddaway"],
+        ...     return_tensors="pt",
+        ... )
+        >>> outputs = model(**encoded_inputs)
+        >>> predicted_spans = tokenizer.decode_best_spans(encoded_inputs, outputs)
+        >>> print(predicted_spans[0].text)  # best span
+        a song
+        ```"""
+        input_ids = reader_input["input_ids"]
+        start_logits, end_logits, relevance_logits = reader_output[:3]
+        sorted_docs = sorted(range(len(relevance_logits)), reverse=True, key=relevance_logits.__getitem__)
+        nbest_spans_predictions: list[DPRSpanPrediction] = []
+        for doc_id in sorted_docs:
+            sequence_ids = list(input_ids[doc_id])
+            # assuming question & title information is at the beginning of the sequence
+            # NOTE(review): `index(sep, 2)` yields the first separator at position >= 2 - confirm it is the intended one
+            passage_offset = sequence_ids.index(self.sep_token_id, 2) + 1  # second sep id
+            is_padded = sequence_ids[-1] == self.pad_token_id
+            sequence_len = sequence_ids.index(self.pad_token_id) if is_padded else len(sequence_ids)
+            best_spans = self._get_best_spans(
+                start_logits=start_logits[doc_id][passage_offset:sequence_len],
+                end_logits=end_logits[doc_id][passage_offset:sequence_len],
+                max_answer_length=max_answer_length,
+                top_spans=num_spans_per_passage,
+            )
+            for start_index, end_index in best_spans:
+                # `_get_best_spans` works on the sliced logits: shift back to positions in the full sequence.
+                start_index += passage_offset
+                end_index += passage_offset
+                prediction = DPRSpanPrediction(
+                    span_score=start_logits[doc_id][start_index] + end_logits[doc_id][end_index],
+                    relevance_score=relevance_logits[doc_id],
+                    doc_id=doc_id,
+                    start_index=start_index,
+                    end_index=end_index,
+                    text=self.decode(sequence_ids[start_index : end_index + 1]),
+                )
+                nbest_spans_predictions.append(prediction)
+            if len(nbest_spans_predictions) >= num_spans:
+                break
+        return nbest_spans_predictions[:num_spans]
+
+    def _get_best_spans(
+        self,
+        start_logits: list[float],
+        end_logits: list[float],
+        max_answer_length: int,
+        top_spans: int,
+    ) -> list[tuple[int, int]]:
+        """
+        Finds the best answer spans for one passage of the extractive Q&A model. It returns `(start, end)` index
+        pairs (both inclusive) by descending `span_score` order, keeping max `top_spans` spans and skipping any span
+        that contains, or is contained in, an already chosen one. No span exceeds `max_answer_length` tokens.
+        """
+        scores = []
+        for start_index, start_score in enumerate(start_logits):
+            for answer_length, end_score in enumerate(end_logits[start_index : start_index + max_answer_length]):
+                scores.append(((start_index, start_index + answer_length), start_score + end_score))
+        scores = sorted(scores, key=lambda x: x[1], reverse=True)
+        chosen_span_intervals = []
+        for (start_index, end_index), _ in scores:
+            # Invariants of the candidates built above; raised explicitly so they still hold under `python -O`.
+            if start_index > end_index:
+                raise ValueError(f"Wrong span indices: [{start_index}:{end_index}]")
+            length = end_index - start_index + 1
+            if length > max_answer_length:
+                raise ValueError(f"Span is too long: {length} > {max_answer_length}")
+            if any(
+                start_index <= prev_start_index <= prev_end_index <= end_index
+                or prev_start_index <= start_index <= end_index <= prev_end_index
+                for (prev_start_index, prev_end_index) in chosen_span_intervals
+            ):
+                continue
+            chosen_span_intervals.append((start_index, end_index))
+            if len(chosen_span_intervals) == top_spans:
+                break
+        return chosen_span_intervals
+
+
+@add_end_docstrings(CUSTOM_DPR_READER_DOCSTRING)
+class DPRReaderTokenizerFast(CustomDPRReaderTokenizerMixin, BertTokenizerFast):
+    r"""
+    "Fast" tokenizer for the DPRReader model, backed by HuggingFace's *tokenizers* library.
+
+    [`DPRReaderTokenizerFast`] tokenizes almost exactly like [`BertTokenizerFast`] (punctuation splitting followed
+    by wordpiece). What it adds is a three-part input: a question, passage titles and passage texts, which are
+    merged into the sequences fed to the [`DPRReader`] model.
+
+    See the superclass [`BertTokenizerFast`] for usage examples and the documentation of the parameters.
+
+    """
+
+    slow_tokenizer_class = DPRReaderTokenizer
+    model_input_names = ["input_ids", "attention_mask"]
+    vocab_files_names = VOCAB_FILES_NAMES
+
+
+__all__ = ["DPRContextEncoderTokenizerFast", "DPRQuestionEncoderTokenizerFast", "DPRReaderTokenizerFast"]
diff --git a/phivenv/Lib/site-packages/transformers/models/dpt/__init__.py b/phivenv/Lib/site-packages/transformers/models/dpt/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..ce0070f270f3604afd0661e0cd8aaa4fa2141217
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/dpt/__init__.py
@@ -0,0 +1,30 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:  # `typing.TYPE_CHECKING` is False at runtime, so only static analyzers take this branch
+    from .configuration_dpt import *
+    from .feature_extraction_dpt import *
+    from .image_processing_dpt import *
+    from .image_processing_dpt_fast import *
+    from .modeling_dpt import *
+else:  # at runtime, the `sys.modules` entry of this package is replaced by a `_LazyModule` object
+    import sys
+
+    _file = globals()["__file__"]  # this module's `__file__`, handed to both helper calls below
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/phivenv/Lib/site-packages/transformers/models/dpt/__pycache__/__init__.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/dpt/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c8d2aa3a308d7dac1f870ebcd6815aac7ef80dba
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/dpt/__pycache__/__init__.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/dpt/__pycache__/configuration_dpt.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/dpt/__pycache__/configuration_dpt.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..622e3de01d6ce814e50d0f84cc4219d24371a30d
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/dpt/__pycache__/configuration_dpt.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/dpt/__pycache__/feature_extraction_dpt.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/dpt/__pycache__/feature_extraction_dpt.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5c189ab1cf540f106f5b0befe73b5c019f327568
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/dpt/__pycache__/feature_extraction_dpt.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/dpt/__pycache__/image_processing_dpt.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/dpt/__pycache__/image_processing_dpt.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..62a33038e20565f6866967347d8448ab6265813f
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/dpt/__pycache__/image_processing_dpt.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/dpt/__pycache__/image_processing_dpt_fast.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/dpt/__pycache__/image_processing_dpt_fast.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8c155bebd0e708498aa8780ca1e3544eed49c843
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/dpt/__pycache__/image_processing_dpt_fast.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/dpt/__pycache__/modeling_dpt.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/dpt/__pycache__/modeling_dpt.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..52cd6e4c6b5572c6a306ea095ce1d5cc96bf4c68
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/dpt/__pycache__/modeling_dpt.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/dpt/__pycache__/modular_dpt.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/dpt/__pycache__/modular_dpt.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8c0ba2036a0fc4e2d2554914d3f9a5db2bb29e91
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/dpt/__pycache__/modular_dpt.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/dpt/configuration_dpt.py b/phivenv/Lib/site-packages/transformers/models/dpt/configuration_dpt.py
new file mode 100644
index 0000000000000000000000000000000000000000..70e46f2320223b595797cff6946bd9987f477a1f
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/dpt/configuration_dpt.py
@@ -0,0 +1,304 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""DPT model configuration"""
+
+import copy
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+from ...utils.backbone_utils import verify_backbone_config_arguments
+from ..auto.configuration_auto import CONFIG_MAPPING
+from ..bit import BitConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+class DPTConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`DPTModel`]. It is used to instantiate an DPT
+    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+    defaults will yield a similar configuration to that of the DPT
+    [Intel/dpt-large](https://huggingface.co/Intel/dpt-large) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        hidden_size (`int`, *optional*, defaults to 768):
+            Dimensionality of the encoder layers and the pooler layer.
+        num_hidden_layers (`int`, *optional*, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 12):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        intermediate_size (`int`, *optional*, defaults to 3072):
+            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"selu"` and `"gelu_new"` are supported.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
+            The epsilon used by the layer normalization layers.
+        image_size (`int`, *optional*, defaults to 384):
+            The size (resolution) of each image.
+        patch_size (`int`, *optional*, defaults to 16):
+            The size (resolution) of each patch.
+        num_channels (`int`, *optional*, defaults to 3):
+            The number of input channels.
+        is_hybrid (`bool`, *optional*, defaults to `False`):
+            Whether to use a hybrid backbone. Useful in the context of loading DPT-Hybrid models.
+        qkv_bias (`bool`, *optional*, defaults to `True`):
+            Whether to add a bias to the queries, keys and values.
+        backbone_out_indices (`list[int]`, *optional*, defaults to `[2, 5, 8, 11]`):
+            Indices of the intermediate hidden states to use from backbone.
+        readout_type (`str`, *optional*, defaults to `"project"`):
+            The readout type to use when processing the readout token (CLS token) of the intermediate hidden states of
+            the ViT backbone. Can be one of [`"ignore"`, `"add"`, `"project"`].
+
+            - "ignore" simply ignores the CLS token.
+            - "add" passes the information from the CLS token to all other tokens by adding the representations.
+            - "project" passes information to the other tokens by concatenating the readout to all other tokens before
+              projecting the
+            representation to the original feature dimension D using a linear layer followed by a GELU non-linearity.
+        reassemble_factors (`list[int]`, *optional*, defaults to `[4, 2, 1, 0.5]`):
+            The up/downsampling factors of the reassemble layers.
+        neck_hidden_sizes (`list[str]`, *optional*, defaults to `[96, 192, 384, 768]`):
+            The hidden sizes to project to for the feature maps of the backbone.
+        fusion_hidden_size (`int`, *optional*, defaults to 256):
+            The number of channels before fusion.
+        head_in_index (`int`, *optional*, defaults to -1):
+            The index of the features to use in the heads.
+        use_batch_norm_in_fusion_residual (`bool`, *optional*, defaults to `False`):
+            Whether to use batch normalization in the pre-activate residual units of the fusion blocks.
+        use_bias_in_fusion_residual (`bool`, *optional*, defaults to `True`):
+            Whether to use bias in the pre-activate residual units of the fusion blocks.
+        add_projection (`bool`, *optional*, defaults to `False`):
+            Whether to add a projection layer before the depth estimation head.
+        use_auxiliary_head (`bool`, *optional*, defaults to `True`):
+            Whether to use an auxiliary head during training.
+        auxiliary_loss_weight (`float`, *optional*, defaults to 0.4):
+            Weight of the cross-entropy loss of the auxiliary head.
+        semantic_loss_ignore_index (`int`, *optional*, defaults to 255):
+            The index that is ignored by the loss function of the semantic segmentation model.
+        semantic_classifier_dropout (`float`, *optional*, defaults to 0.1):
+            The dropout ratio for the semantic classification head.
+        backbone_featmap_shape (`list[int]`, *optional*, defaults to `[1, 1024, 24, 24]`):
+            Used only for the `hybrid` embedding type. The shape of the feature maps of the backbone.
+        neck_ignore_stages (`list[int]`, *optional*, defaults to `[0, 1]`):
+            Used only for the `hybrid` embedding type. The stages of the readout layers to ignore.
+        backbone_config (`Union[dict[str, Any], PretrainedConfig]`, *optional*):
+            The configuration of the backbone model. Only used in case `is_hybrid` is `True` or in case you want to
+            leverage the [`AutoBackbone`] API.
+        backbone (`str`, *optional*):
+            Name of backbone to use when `backbone_config` is `None`. If `use_pretrained_backbone` is `True`, this
+            will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone`
+            is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights.
+        use_pretrained_backbone (`bool`, *optional*, defaults to `False`):
+            Whether to use pretrained weights for the backbone.
+        use_timm_backbone (`bool`, *optional*, defaults to `False`):
+            Whether to load `backbone` from the timm library. If `False`, the backbone is loaded from the transformers
+            library.
+        backbone_kwargs (`dict`, *optional*):
+            Keyword arguments to be passed to AutoBackbone when loading from a checkpoint
+            e.g. `{'out_indices': (0, 1, 2, 3)}`. Cannot be specified if `backbone_config` is set.
+        pooler_output_size (`int`, *optional*):
+           Dimensionality of the pooler layer. If None, defaults to `hidden_size`.
+        pooler_act (`str`, *optional*, defaults to `"tanh"`):
+           The activation function to be used by the pooler. Keys of ACT2FN are supported for Flax and
+           Pytorch, and elements of https://www.tensorflow.org/api_docs/python/tf/keras/activations are
+           supported for Tensorflow.
+
+    Example:
+
+    ```python
+    >>> from transformers import DPTModel, DPTConfig
+
+    >>> # Initializing a DPT dpt-large style configuration
+    >>> configuration = DPTConfig()
+
+    >>> # Initializing a model from the dpt-large style configuration
+    >>> model = DPTModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "dpt"
+
+    def __init__(
+        self,
+        hidden_size=768,
+        num_hidden_layers=12,
+        num_attention_heads=12,
+        intermediate_size=3072,
+        hidden_act="gelu",
+        hidden_dropout_prob=0.0,
+        attention_probs_dropout_prob=0.0,
+        initializer_range=0.02,
+        layer_norm_eps=1e-12,
+        image_size=384,
+        patch_size=16,
+        num_channels=3,
+        is_hybrid=False,
+        qkv_bias=True,
+        backbone_out_indices=[2, 5, 8, 11],
+        readout_type="project",
+        reassemble_factors=[4, 2, 1, 0.5],
+        neck_hidden_sizes=[96, 192, 384, 768],
+        fusion_hidden_size=256,
+        head_in_index=-1,
+        use_batch_norm_in_fusion_residual=False,
+        use_bias_in_fusion_residual=None,
+        add_projection=False,
+        use_auxiliary_head=True,
+        auxiliary_loss_weight=0.4,
+        semantic_loss_ignore_index=255,
+        semantic_classifier_dropout=0.1,
+        backbone_featmap_shape=[1, 1024, 24, 24],
+        neck_ignore_stages=[0, 1],
+        backbone_config=None,
+        backbone=None,
+        use_pretrained_backbone=False,
+        use_timm_backbone=False,
+        backbone_kwargs=None,
+        pooler_output_size=None,
+        pooler_act="tanh",
+        **kwargs,
+    ):
+        # Store every argument on the instance. The backbone-related attributes depend on which of three modes is
+        # selected below: hybrid, explicit backbone, or no backbone.
+        # NOTE(review): the list defaults in the signature are stored by reference, so instances created with the
+        # defaults share the same list objects unless the parent class copies on assignment; confirm before mutating.
+
+        # Keyword arguments not named in the signature are forwarded to the parent initializer.
+        super().__init__(**kwargs)
+
+        self.hidden_size = hidden_size
+        self.is_hybrid = is_hybrid
+
+        # Only switched to True in the non-hybrid branch, when `backbone` or `backbone_config` is supplied.
+        use_autobackbone = False
+        if self.is_hybrid:
+            # Hybrid mode: use this built-in backbone description when the caller gave none.
+            if backbone_config is None:
+                backbone_config = {
+                    "global_padding": "same",
+                    "layer_type": "bottleneck",
+                    "depths": [3, 4, 9],
+                    "out_features": ["stage1", "stage2", "stage3"],
+                    "embedding_dynamic_padding": True,
+                }
+
+            # A dict is expanded into a `BitConfig`; an existing `PretrainedConfig` is kept unchanged.
+            if isinstance(backbone_config, dict):
+                logger.info("Initializing the config with a `BiT` backbone.")
+                backbone_config = BitConfig(**backbone_config)
+            elif isinstance(backbone_config, PretrainedConfig):
+                # Self-assignment: a no-op kept from the original code.
+                backbone_config = backbone_config
+            else:
+                raise ValueError(
+                    f"backbone_config must be a dictionary or a `PretrainedConfig`, got {backbone_config.__class__}."
+                )
+            self.backbone_config = backbone_config
+            self.backbone_featmap_shape = backbone_featmap_shape
+            self.neck_ignore_stages = neck_ignore_stages
+
+            # Hybrid mode accepts only the 'project' readout; this check runs before the general validation below.
+            if readout_type != "project":
+                raise ValueError("Readout type must be 'project' when using `DPT-hybrid` mode.")
+
+        elif backbone is not None or backbone_config is not None:
+            # Non-hybrid mode with an explicit backbone name and/or backbone configuration.
+            use_autobackbone = True
+            if isinstance(backbone_config, dict):
+                # The config class is looked up from the dict's "model_type" entry.
+                # NOTE(review): a dict without "model_type" yields `None` as the lookup key; confirm how
+                # CONFIG_MAPPING reports that case.
+                backbone_model_type = backbone_config.get("model_type")
+                config_class = CONFIG_MAPPING[backbone_model_type]
+                backbone_config = config_class.from_dict(backbone_config)
+
+            self.backbone_config = backbone_config
+            self.backbone_featmap_shape = None
+            self.neck_ignore_stages = []
+
+            # We only use load_backbone when config.is_hybrid is False
+            verify_backbone_config_arguments(
+                use_timm_backbone=use_timm_backbone,
+                use_pretrained_backbone=use_pretrained_backbone,
+                backbone=backbone,
+                backbone_config=backbone_config,
+                backbone_kwargs=backbone_kwargs,
+            )
+        else:
+            # Neither hybrid mode nor an explicit backbone: no backbone attributes are populated.
+            self.backbone_config = None
+            self.backbone_featmap_shape = None
+            self.neck_ignore_stages = []
+
+        # These four are stored as given, whichever branch ran above.
+        self.backbone = backbone
+        self.use_pretrained_backbone = use_pretrained_backbone
+        self.use_timm_backbone = use_timm_backbone
+        self.backbone_kwargs = backbone_kwargs
+
+        # ViT parameters used if not using a hybrid backbone
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.intermediate_size = intermediate_size
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.layer_norm_eps = layer_norm_eps
+        self.image_size = image_size
+        self.patch_size = patch_size
+        self.num_channels = num_channels
+        self.qkv_bias = qkv_bias
+        self.use_autobackbone = use_autobackbone
+        # `backbone_out_indices` is dropped (stored as None) whenever an explicit backbone is in use.
+        self.backbone_out_indices = None if use_autobackbone else backbone_out_indices
+
+        # General readout validation, applied in every mode.
+        if readout_type not in ["ignore", "add", "project"]:
+            raise ValueError("Readout_type must be one of ['ignore', 'add', 'project']")
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.readout_type = readout_type
+        self.reassemble_factors = reassemble_factors
+        self.neck_hidden_sizes = neck_hidden_sizes
+        self.fusion_hidden_size = fusion_hidden_size
+        self.head_in_index = head_in_index
+        self.use_batch_norm_in_fusion_residual = use_batch_norm_in_fusion_residual
+        self.use_bias_in_fusion_residual = use_bias_in_fusion_residual
+        self.add_projection = add_projection
+
+        # auxiliary head attributes (semantic segmentation)
+        self.use_auxiliary_head = use_auxiliary_head
+        self.auxiliary_loss_weight = auxiliary_loss_weight
+        self.semantic_loss_ignore_index = semantic_loss_ignore_index
+        self.semantic_classifier_dropout = semantic_classifier_dropout
+        # Any falsy `pooler_output_size` (None or 0) falls back to `hidden_size`.
+        self.pooler_output_size = pooler_output_size if pooler_output_size else hidden_size
+        self.pooler_act = pooler_act
+
+    def to_dict(self):
+        """
+        Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`]. Returns:
+            `dict[str, any]`: Dictionary of all the attributes that make up this configuration instance,
+        """
+        output = copy.deepcopy(self.__dict__)
+
+        if output["backbone_config"] is not None:
+            output["backbone_config"] = self.backbone_config.to_dict()
+
+        output["model_type"] = self.__class__.model_type
+        return output
+
+    @property
+    def sub_configs(self):
+        return (
+            {"backbone_config": type(self.backbone_config)}
+            if getattr(self, "backbone_config", None) is not None
+            else {}
+        )
+
+
+__all__ = ["DPTConfig"]
diff --git a/phivenv/Lib/site-packages/transformers/models/dpt/feature_extraction_dpt.py b/phivenv/Lib/site-packages/transformers/models/dpt/feature_extraction_dpt.py
new file mode 100644
index 0000000000000000000000000000000000000000..b6ab8ccbed8d33b1e5b15d429b6cb057ff781f78
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/dpt/feature_extraction_dpt.py
@@ -0,0 +1,38 @@
+# coding=utf-8
+# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Feature extractor class for DPT."""
+
+import warnings
+
+from ...utils import logging
+from ...utils.import_utils import requires
+from .image_processing_dpt import DPTImageProcessor
+
+
+logger = logging.get_logger(__name__)
+
+
+@requires(backends=("vision",))
+class DPTFeatureExtractor(DPTImageProcessor):
+    def __init__(self, *args, **kwargs) -> None:
+        warnings.warn(
+            "The class DPTFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please"
+            " use DPTImageProcessor instead.",
+            FutureWarning,
+        )
+        super().__init__(*args, **kwargs)
+
+
+__all__ = ["DPTFeatureExtractor"]
diff --git a/phivenv/Lib/site-packages/transformers/models/dpt/image_processing_dpt.py b/phivenv/Lib/site-packages/transformers/models/dpt/image_processing_dpt.py
new file mode 100644
index 0000000000000000000000000000000000000000..9535a700c728cd9ff2f9b9dd48cc53185db9cde4
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/dpt/image_processing_dpt.py
@@ -0,0 +1,679 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Image processor class for DPT."""
+
+import math
+from collections.abc import Iterable
+from typing import TYPE_CHECKING, Optional, Union
+
+from ...utils.import_utils import requires
+
+
+if TYPE_CHECKING:
+    from ...modeling_outputs import DepthEstimatorOutput
+
+import numpy as np
+
+from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
+from ...image_transforms import pad, resize, to_channel_dimension_format
+from ...image_utils import (
+    IMAGENET_STANDARD_MEAN,
+    IMAGENET_STANDARD_STD,
+    ChannelDimension,
+    ImageInput,
+    PILImageResampling,
+    get_image_size,
+    infer_channel_dimension_format,
+    is_scaled_image,
+    is_torch_available,
+    is_torch_tensor,
+    make_list_of_images,
+    to_numpy_array,
+    valid_images,
+    validate_preprocess_arguments,
+)
+from ...utils import (
+    TensorType,
+    filter_out_non_signature_kwargs,
+    is_vision_available,
+    logging,
+    requires_backends,
+)
+
+
+if is_torch_available():
+    import torch
+
+if is_vision_available():
+    import PIL
+
+
+logger = logging.get_logger(__name__)
+
+
+def get_resize_output_image_size(
+    input_image: np.ndarray,
+    output_size: Union[int, Iterable[int]],
+    keep_aspect_ratio: bool,
+    multiple: int,
+    input_data_format: Optional[Union[str, ChannelDimension]] = None,
+) -> tuple[int, int]:
+    def constrain_to_multiple_of(val, multiple, min_val=0, max_val=None):
+        x = round(val / multiple) * multiple
+
+        if max_val is not None and x > max_val:
+            x = math.floor(val / multiple) * multiple
+
+        if x < min_val:
+            x = math.ceil(val / multiple) * multiple
+
+        return x
+
+    output_size = (output_size, output_size) if isinstance(output_size, int) else output_size
+
+    input_height, input_width = get_image_size(input_image, input_data_format)
+    output_height, output_width = output_size
+
+    # determine new height and width
+    scale_height = output_height / input_height
+    scale_width = output_width / input_width
+
+    if keep_aspect_ratio:
+        # scale as little as possible
+        if abs(1 - scale_width) < abs(1 - scale_height):
+            # fit width
+            scale_height = scale_width
+        else:
+            # fit height
+            scale_width = scale_height
+
+    new_height = constrain_to_multiple_of(scale_height * input_height, multiple=multiple)
+    new_width = constrain_to_multiple_of(scale_width * input_width, multiple=multiple)
+
+    return (new_height, new_width)
+
+
+@requires(backends=("vision",))
+class DPTImageProcessor(BaseImageProcessor):
+    r"""
+    Constructs a DPT image processor.
+
+    Args:
+        do_resize (`bool`, *optional*, defaults to `True`):
+            Whether to resize the image's (height, width) dimensions. Can be overridden by `do_resize` in `preprocess`.
+        size (`dict[str, int]` *optional*, defaults to `{"height": 384, "width": 384}`):
+            Size of the image after resizing. Can be overridden by `size` in `preprocess`.
+        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
+            Defines the resampling filter to use if resizing the image. Can be overridden by `resample` in `preprocess`.
+        keep_aspect_ratio (`bool`, *optional*, defaults to `False`):
+            If `True`, the image is resized to the largest possible size such that the aspect ratio is preserved. Can
+            be overridden by `keep_aspect_ratio` in `preprocess`.
+        ensure_multiple_of (`int`, *optional*, defaults to 1):
+            If `do_resize` is `True`, the image is resized to a size that is a multiple of this value. Can be overridden
+            by `ensure_multiple_of` in `preprocess`.
+        do_rescale (`bool`, *optional*, defaults to `True`):
+            Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in
+            `preprocess`.
+        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
+            Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in `preprocess`.
+        do_normalize (`bool`, *optional*, defaults to `True`):
+            Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
+            method.
+        image_mean (`float` or `list[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`):
+            Mean to use if normalizing the image. This is a float or list of floats the length of the number of
+            channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
+        image_std (`float` or `list[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`):
+            Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
+            number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
+        do_pad (`bool`, *optional*, defaults to `False`):
+            Whether to apply center padding. This was introduced in the DINOv2 paper, which uses the model in
+            combination with DPT.
+        size_divisor (`int`, *optional*):
+            If `do_pad` is `True`, pads the image dimensions to be divisible by this value. This was introduced in the
+            DINOv2 paper, which uses the model in combination with DPT.
+        do_reduce_labels (`bool`, *optional*, defaults to `False`):
+            Whether or not to reduce all label values of segmentation maps by 1. Usually used for datasets where 0 is
+            used for background, and background itself is not included in all classes of a dataset (e.g. ADE20k). The
+            background label will be replaced by 255. Can be overridden by the `do_reduce_labels` parameter in the
+            `preprocess` method.
+    """
+
+    model_input_names = ["pixel_values"]
+
+    def __init__(
+        self,
+        do_resize: bool = True,
+        size: Optional[dict[str, int]] = None,
+        resample: PILImageResampling = PILImageResampling.BICUBIC,
+        keep_aspect_ratio: bool = False,
+        ensure_multiple_of: int = 1,
+        do_rescale: bool = True,
+        rescale_factor: Union[int, float] = 1 / 255,
+        do_normalize: bool = True,
+        image_mean: Optional[Union[float, list[float]]] = None,
+        image_std: Optional[Union[float, list[float]]] = None,
+        do_pad: bool = False,
+        size_divisor: Optional[int] = None,
+        do_reduce_labels: bool = False,
+        **kwargs,
+    ) -> None:
+        super().__init__(**kwargs)
+        size = size if size is not None else {"height": 384, "width": 384}
+        size = get_size_dict(size)
+        self.do_resize = do_resize
+        self.size = size
+        self.keep_aspect_ratio = keep_aspect_ratio
+        self.ensure_multiple_of = ensure_multiple_of
+        self.resample = resample
+        self.do_rescale = do_rescale
+        self.rescale_factor = rescale_factor
+        self.do_normalize = do_normalize
+        self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN
+        self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD
+        self.do_pad = do_pad
+        self.size_divisor = size_divisor
+        self.do_reduce_labels = do_reduce_labels
+
+    def resize(
+        self,
+        image: np.ndarray,
+        size: dict[str, int],
+        keep_aspect_ratio: bool = False,
+        ensure_multiple_of: int = 1,
+        resample: PILImageResampling = PILImageResampling.BICUBIC,
+        data_format: Optional[Union[str, ChannelDimension]] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+        **kwargs,
+    ) -> np.ndarray:
+        """
+        Resize an image to target size `(size["height"], size["width"])`. If `keep_aspect_ratio` is `True`, the image
+        is resized to the largest possible size such that the aspect ratio is preserved. If `ensure_multiple_of` is
+        set, the image is resized to a size that is a multiple of this value.
+
+        Args:
+            image (`np.ndarray`):
+                Image to resize.
+            size (`dict[str, int]`):
+                Target size of the output image.
+            keep_aspect_ratio (`bool`, *optional*, defaults to `False`):
+                If `True`, the image is resized to the largest possible size such that the aspect ratio is preserved.
+            ensure_multiple_of (`int`, *optional*, defaults to 1):
+                The image is resized to a size that is a multiple of this value.
+            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
+                Defines the resampling filter to use if resizing the image. Otherwise, the image is resized to size
+                specified in `size`.
+            data_format (`str` or `ChannelDimension`, *optional*):
+                The channel dimension format of the image. If not provided, it will be the same as the input image.
+            input_data_format (`str` or `ChannelDimension`, *optional*):
+                The channel dimension format of the input image. If not provided, it will be inferred.
+        """
+        size = get_size_dict(size)
+        if "height" not in size or "width" not in size:
+            raise ValueError(f"The size dictionary must contain the keys 'height' and 'width'. Got {size.keys()}")
+
+        output_size = get_resize_output_image_size(
+            image,
+            output_size=(size["height"], size["width"]),
+            keep_aspect_ratio=keep_aspect_ratio,
+            multiple=ensure_multiple_of,
+            input_data_format=input_data_format,
+        )
+        return resize(
+            image,
+            size=output_size,
+            resample=resample,
+            data_format=data_format,
+            input_data_format=input_data_format,
+            **kwargs,
+        )
+
+    def pad_image(
+        self,
+        image: np.array,
+        size_divisor: int,
+        data_format: Optional[Union[str, ChannelDimension]] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    ):
+        """
+        Center pad an image to be a multiple of `multiple`.
+
+        Args:
+            image (`np.ndarray`):
+                Image to pad.
+            size_divisor (`int`):
+                The width and height of the image will be padded to a multiple of this number.
+            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
+                The channel dimension format for the output image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - Unset: Use the channel dimension format of the input image.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the input image. If unset, the channel dimension format is inferred
+                from the input image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+        """
+
+        def _get_pad(size, size_divisor):
+            new_size = math.ceil(size / size_divisor) * size_divisor
+            pad_size = new_size - size
+            pad_size_left = pad_size // 2
+            pad_size_right = pad_size - pad_size_left
+            return pad_size_left, pad_size_right
+
+        if input_data_format is None:
+            input_data_format = infer_channel_dimension_format(image)
+
+        height, width = get_image_size(image, input_data_format)
+
+        pad_size_left, pad_size_right = _get_pad(height, size_divisor)
+        pad_size_top, pad_size_bottom = _get_pad(width, size_divisor)
+
+        return pad(image, ((pad_size_left, pad_size_right), (pad_size_top, pad_size_bottom)), data_format=data_format)
+
+    # Copied from transformers.models.beit.image_processing_beit.BeitImageProcessor.reduce_label
+    def reduce_label(self, label: ImageInput) -> np.ndarray:
+        """
+        Shift every label value down by one, mapping the value 0 to 255.
+
+        Args:
+            label (`ImageInput`):
+                Label map; it is converted with `to_numpy_array` before being modified.
+
+        Returns:
+            `np.ndarray`: The shifted label array.
+        """
+        label = to_numpy_array(label)
+        # NOTE(review): the masked assignment below writes into the array returned by `to_numpy_array`; if that
+        # helper returns its input without copying, the caller's array is modified too. Confirm.
+        # Avoid using underflow conversion
+        # Zeros are moved to 255 first, so the subtraction below never has to go below zero.
+        label[label == 0] = 255
+        label = label - 1
+        # After the shift, 254 is what every 255 entry (the former zeros included) became; restore it to 255.
+        label[label == 254] = 255
+        return label
+
+    def _preprocess(
+        self,
+        image: ImageInput,
+        do_reduce_labels: Optional[bool] = None,
+        do_resize: Optional[bool] = None,
+        size: Optional[dict[str, int]] = None,
+        resample: PILImageResampling = None,
+        keep_aspect_ratio: Optional[bool] = None,
+        ensure_multiple_of: Optional[int] = None,
+        do_rescale: Optional[bool] = None,
+        rescale_factor: Optional[float] = None,
+        do_normalize: Optional[bool] = None,
+        image_mean: Optional[Union[float, list[float]]] = None,
+        image_std: Optional[Union[float, list[float]]] = None,
+        do_pad: Optional[bool] = None,
+        size_divisor: Optional[int] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    ):
+        if do_reduce_labels:
+            image = self.reduce_label(image)
+
+        if do_resize:
+            image = self.resize(
+                image=image,
+                size=size,
+                resample=resample,
+                keep_aspect_ratio=keep_aspect_ratio,
+                ensure_multiple_of=ensure_multiple_of,
+                input_data_format=input_data_format,
+            )
+
+        if do_rescale:
+            image = self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format)
+
+        if do_normalize:
+            image = self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format)
+
+        if do_pad:
+            image = self.pad_image(image=image, size_divisor=size_divisor, input_data_format=input_data_format)
+
+        return image
+
+    def _preprocess_image(
+        self,
+        image: ImageInput,
+        do_resize: Optional[bool] = None,
+        size: Optional[dict[str, int]] = None,
+        resample: PILImageResampling = None,
+        keep_aspect_ratio: Optional[bool] = None,
+        ensure_multiple_of: Optional[int] = None,
+        do_rescale: Optional[bool] = None,
+        rescale_factor: Optional[float] = None,
+        do_normalize: Optional[bool] = None,
+        image_mean: Optional[Union[float, list[float]]] = None,
+        image_std: Optional[Union[float, list[float]]] = None,
+        do_pad: Optional[bool] = None,
+        size_divisor: Optional[int] = None,
+        data_format: Optional[Union[str, ChannelDimension]] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    ) -> np.ndarray:
+        """Preprocesses a single image."""
+        # All transformations expect numpy arrays.
+        image = to_numpy_array(image)
+        if do_rescale and is_scaled_image(image):
+            logger.warning_once(
+                "It looks like you are trying to rescale already rescaled images. If the input"
+                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
+            )
+        if input_data_format is None:
+            # We assume that all images have the same channel dimension format.
+            input_data_format = infer_channel_dimension_format(image)
+
+        image = self._preprocess(
+            image,
+            do_reduce_labels=False,
+            do_resize=do_resize,
+            size=size,
+            resample=resample,
+            keep_aspect_ratio=keep_aspect_ratio,
+            ensure_multiple_of=ensure_multiple_of,
+            do_rescale=do_rescale,
+            rescale_factor=rescale_factor,
+            do_normalize=do_normalize,
+            image_mean=image_mean,
+            image_std=image_std,
+            do_pad=do_pad,
+            size_divisor=size_divisor,
+            input_data_format=input_data_format,
+        )
+        if data_format is not None:
+            image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
+        return image
+
+    def _preprocess_segmentation_map(
+        self,
+        segmentation_map: ImageInput,
+        do_resize: Optional[bool] = None,
+        size: Optional[dict[str, int]] = None,
+        resample: PILImageResampling = None,
+        keep_aspect_ratio: Optional[bool] = None,
+        ensure_multiple_of: Optional[int] = None,
+        do_reduce_labels: Optional[bool] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    ):
+        """Preprocesses a single segmentation map."""
+        # All transformations expect numpy arrays.
+        segmentation_map = to_numpy_array(segmentation_map)
+        # Add an axis to the segmentation maps for transformations.
+        if segmentation_map.ndim == 2:
+            segmentation_map = segmentation_map[None, ...]
+            added_dimension = True
+            input_data_format = ChannelDimension.FIRST
+        else:
+            added_dimension = False
+            if input_data_format is None:
+                input_data_format = infer_channel_dimension_format(segmentation_map, num_channels=1)
+        segmentation_map = self._preprocess(
+            image=segmentation_map,
+            do_reduce_labels=do_reduce_labels,
+            do_resize=do_resize,
+            size=size,
+            resample=resample,
+            keep_aspect_ratio=keep_aspect_ratio,
+            ensure_multiple_of=ensure_multiple_of,
+            do_normalize=False,
+            do_rescale=False,
+            input_data_format=input_data_format,
+        )
+        # Remove extra axis if added
+        if added_dimension:
+            segmentation_map = np.squeeze(segmentation_map, axis=0)
+        segmentation_map = segmentation_map.astype(np.int64)
+        return segmentation_map
+
+    # Copied from transformers.models.beit.image_processing_beit.BeitImageProcessor.__call__
+    def __call__(self, images, segmentation_maps=None, **kwargs):
+        """
+        Forward `images`, `segmentation_maps` and any extra keyword arguments to the parent class `__call__` and
+        return its result.
+        """
+        # Overrides the `__call__` method of the `Preprocessor` class such that the images and segmentation maps can both
+        # be passed in as positional arguments.
+        return super().__call__(images, segmentation_maps=segmentation_maps, **kwargs)
+
+    @filter_out_non_signature_kwargs()
+    def preprocess(
+        self,
+        images: ImageInput,
+        segmentation_maps: Optional[ImageInput] = None,
+        do_resize: Optional[bool] = None,
+        size: Optional[int] = None,
+        keep_aspect_ratio: Optional[bool] = None,
+        ensure_multiple_of: Optional[int] = None,
+        resample: PILImageResampling = None,
+        do_rescale: Optional[bool] = None,
+        rescale_factor: Optional[float] = None,
+        do_normalize: Optional[bool] = None,
+        image_mean: Optional[Union[float, list[float]]] = None,
+        image_std: Optional[Union[float, list[float]]] = None,
+        do_pad: Optional[bool] = None,
+        size_divisor: Optional[int] = None,
+        do_reduce_labels: Optional[bool] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        data_format: ChannelDimension = ChannelDimension.FIRST,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    ) -> PIL.Image.Image:
+        """
+        Preprocess an image or batch of images.
+
+        Args:
+            images (`ImageInput`):
+                Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
+                passing in images with pixel values between 0 and 1, set `do_rescale=False`.
+            segmentation_maps (`ImageInput`, *optional*):
+                Segmentation map to preprocess.
+            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
+                Whether to resize the image.
+            size (`dict[str, int]`, *optional*, defaults to `self.size`):
+                Size of the image after resizing. If `keep_aspect_ratio` is `True`, the image is resized to the largest
+                possible size such that the aspect ratio is preserved. If `ensure_multiple_of` is set, the image is
+                resized to a size that is a multiple of this value.
+            keep_aspect_ratio (`bool`, *optional*, defaults to `self.keep_aspect_ratio`):
+                Whether to keep the aspect ratio of the image. If False, the image will be resized to (size, size). If
+                True, the image will be resized to keep the aspect ratio and the size will be the maximum possible.
+            ensure_multiple_of (`int`, *optional*, defaults to `self.ensure_multiple_of`):
+                Ensure that the image size is a multiple of this value.
+            resample (`int`, *optional*, defaults to `self.resample`):
+                Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only
+                has an effect if `do_resize` is set to `True`.
+            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
+                Whether to rescale the image values between [0 - 1].
+            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
+                Rescale factor to rescale the image by if `do_rescale` is set to `True`.
+            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
+                Whether to normalize the image.
+            image_mean (`float` or `list[float]`, *optional*, defaults to `self.image_mean`):
+                Image mean.
+            image_std (`float` or `list[float]`, *optional*, defaults to `self.image_std`):
+                Image standard deviation.
+            do_reduce_labels (`bool`, *optional*, defaults to `self.do_reduce_labels`):
+                Whether or not to reduce all label values of segmentation maps by 1. Usually used for datasets where 0
+                is used for background, and background itself is not included in all classes of a dataset (e.g.
+                ADE20k). The background label will be replaced by 255.
+            return_tensors (`str` or `TensorType`, *optional*):
+                The type of tensors to return. Can be one of:
+                    - Unset: Return a list of `np.ndarray`.
+                    - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
+                    - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
+                    - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
+                    - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
+            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
+                The channel dimension format for the output image. Can be one of:
+                    - `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                    - `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the input image. If unset, the channel dimension format is inferred
+                from the input image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+        """
+        do_resize = do_resize if do_resize is not None else self.do_resize
+        size = size if size is not None else self.size
+        size = get_size_dict(size)
+        keep_aspect_ratio = keep_aspect_ratio if keep_aspect_ratio is not None else self.keep_aspect_ratio
+        ensure_multiple_of = ensure_multiple_of if ensure_multiple_of is not None else self.ensure_multiple_of
+        resample = resample if resample is not None else self.resample
+        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
+        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
+        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
+        image_mean = image_mean if image_mean is not None else self.image_mean
+        image_std = image_std if image_std is not None else self.image_std
+        do_pad = do_pad if do_pad is not None else self.do_pad
+        size_divisor = size_divisor if size_divisor is not None else self.size_divisor
+        do_reduce_labels = do_reduce_labels if do_reduce_labels is not None else self.do_reduce_labels
+
+        images = make_list_of_images(images)
+
+        if segmentation_maps is not None:
+            segmentation_maps = make_list_of_images(segmentation_maps, expected_ndims=2)
+
+        if not valid_images(images):
+            raise ValueError(
+                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
+                "torch.Tensor, tf.Tensor or jax.ndarray."
+            )
+        validate_preprocess_arguments(
+            do_rescale=do_rescale,
+            rescale_factor=rescale_factor,
+            do_normalize=do_normalize,
+            image_mean=image_mean,
+            image_std=image_std,
+            do_pad=do_pad,
+            size_divisibility=size_divisor,
+            do_resize=do_resize,
+            size=size,
+            resample=resample,
+        )
+
+        images = [
+            self._preprocess_image(
+                image=img,
+                do_resize=do_resize,
+                do_rescale=do_rescale,
+                do_normalize=do_normalize,
+                do_pad=do_pad,
+                size=size,
+                resample=resample,
+                keep_aspect_ratio=keep_aspect_ratio,
+                ensure_multiple_of=ensure_multiple_of,
+                rescale_factor=rescale_factor,
+                image_mean=image_mean,
+                image_std=image_std,
+                size_divisor=size_divisor,
+                data_format=data_format,
+                input_data_format=input_data_format,
+            )
+            for img in images
+        ]
+
+        data = {"pixel_values": images}
+
+        if segmentation_maps is not None:
+            segmentation_maps = [
+                self._preprocess_segmentation_map(
+                    segmentation_map=segmentation_map,
+                    do_reduce_labels=do_reduce_labels,
+                    do_resize=do_resize,
+                    size=size,
+                    resample=resample,
+                    keep_aspect_ratio=keep_aspect_ratio,
+                    ensure_multiple_of=ensure_multiple_of,
+                    input_data_format=input_data_format,
+                )
+                for segmentation_map in segmentation_maps
+            ]
+
+            data["labels"] = segmentation_maps
+
+        return BatchFeature(data=data, tensor_type=return_tensors)
+
+    # Copied from transformers.models.beit.image_processing_beit.BeitImageProcessor.post_process_semantic_segmentation with Beit->DPT
+    def post_process_semantic_segmentation(self, outputs, target_sizes: Optional[list[tuple]] = None):
+        """
+        Turn the logits produced by [`DPTForSemanticSegmentation`] into per-image class-id maps. PyTorch only.
+
+        Args:
+            outputs ([`DPTForSemanticSegmentation`]):
+                Raw model outputs; only `outputs.logits` is read.
+            target_sizes (`list[Tuple]` of length `batch_size`, *optional*):
+                Requested final size (height, width) of each prediction. When omitted, the maps keep the
+                spatial size of the logits. A torch tensor is accepted and converted with `.numpy()`.
+
+        Returns:
+            semantic_segmentation: `list[torch.Tensor]` of length `batch_size`. Each item holds, for every
+            pixel, the index of the highest-scoring class, at the requested target size when one was given.
+
+        Raises:
+            ValueError: If `target_sizes` is given and its length differs from the batch size of the logits.
+        """
+        # TODO: add support for other frameworks
+        logits = outputs.logits
+
+        if target_sizes is None:
+            # No resizing requested: one arg-max over the class dimension for the whole batch, then split per item
+            class_ids = logits.argmax(dim=1)
+            return [class_ids[item] for item in range(class_ids.shape[0])]
+
+        if len(logits) != len(target_sizes):
+            raise ValueError(
+                "Make sure that you pass in as many target sizes as the batch dimension of the logits"
+            )
+
+        if is_torch_tensor(target_sizes):
+            target_sizes = target_sizes.numpy()
+
+        # Sizes may differ per item, so each item's logits are upsampled on their own before the arg-max
+        semantic_segmentation = []
+        for item_logits, item_size in zip(logits, target_sizes):
+            upsampled = torch.nn.functional.interpolate(
+                item_logits.unsqueeze(dim=0), size=item_size, mode="bilinear", align_corners=False
+            )
+            semantic_segmentation.append(upsampled[0].argmax(dim=0))
+
+        return semantic_segmentation
+
+    def post_process_depth_estimation(
+        self,
+        outputs: "DepthEstimatorOutput",
+        target_sizes: Optional[Union[TensorType, list[tuple[int, int]], None]] = None,
+    ) -> list[dict[str, TensorType]]:
+        """
+        Split the batched depth prediction of a [`DepthEstimatorOutput`] into one entry per image, optionally
+        resizing each depth map. Only supports PyTorch.
+
+        Args:
+            outputs ([`DepthEstimatorOutput`]):
+                Raw outputs of the model; only `outputs.predicted_depth` is read.
+            target_sizes (`TensorType` or `list[tuple[int, int]]`, *optional*):
+                Tensor of shape `(batch_size, 2)` or list of tuples (`tuple[int, int]`) containing the target size
+                (height, width) of each image in the batch. If left to None, predictions will not be resized.
+
+        Returns:
+            `list[dict[str, TensorType]]`: One dictionary per prediction, each with a single key
+            `"predicted_depth"`.
+
+        Raises:
+            ValueError: If `target_sizes` is given and its length differs from the number of predictions.
+        """
+        requires_backends(self, "torch")
+
+        depth_maps = outputs.predicted_depth
+
+        if target_sizes is None:
+            # One placeholder per prediction, so the loop below leaves every map untouched
+            target_sizes = [None] * len(depth_maps)
+        elif len(depth_maps) != len(target_sizes):
+            raise ValueError(
+                "Make sure that you pass in as many target sizes as the batch dimension of the predicted depth"
+            )
+
+        results = []
+        for depth_map, wanted_size in zip(depth_maps, target_sizes):
+            if wanted_size is not None:
+                # Two leading singleton dimensions are added for `interpolate`; `squeeze()` afterwards
+                # drops every size-1 dimension again
+                expanded = depth_map.unsqueeze(0).unsqueeze(1)
+                depth_map = torch.nn.functional.interpolate(
+                    expanded, size=wanted_size, mode="bicubic", align_corners=False
+                ).squeeze()
+
+            results.append({"predicted_depth": depth_map})
+
+        return results
+
+
+# Names exported when this module is star-imported.
+__all__ = ["DPTImageProcessor"]
diff --git a/phivenv/Lib/site-packages/transformers/models/dpt/image_processing_dpt_fast.py b/phivenv/Lib/site-packages/transformers/models/dpt/image_processing_dpt_fast.py
new file mode 100644
index 0000000000000000000000000000000000000000..acfa82c4694af9e29d8b168238628be2f5ec5744
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/dpt/image_processing_dpt_fast.py
@@ -0,0 +1,422 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/dpt/modular_dpt.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_dpt.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# coding=utf-8
+# Copyright 2025 HuggingFace Inc. team. All rights reserved.
+#
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+from collections.abc import Iterable
+from typing import TYPE_CHECKING, Optional, Union
+
+from ...image_processing_base import BatchFeature
+from ...image_processing_utils_fast import BaseImageProcessorFast, DefaultFastImageProcessorKwargs
+from ...image_transforms import group_images_by_shape, reorder_images
+from ...image_utils import (
+    IMAGENET_STANDARD_MEAN,
+    IMAGENET_STANDARD_STD,
+    ChannelDimension,
+    ImageInput,
+    PILImageResampling,
+    SizeDict,
+    is_torch_tensor,
+)
+from ...processing_utils import Unpack
+from ...utils import (
+    TensorType,
+    auto_docstring,
+    is_torch_available,
+    is_torchvision_available,
+    is_torchvision_v2_available,
+    requires_backends,
+)
+
+
+if TYPE_CHECKING:
+    from ...modeling_outputs import DepthEstimatorOutput
+
+if is_torch_available():
+    import torch
+
+if is_torchvision_v2_available():
+    from torchvision.transforms.v2 import functional as F
+elif is_torchvision_available():
+    from torchvision.transforms import functional as F
+
+
+# Keyword arguments specific to the DPT fast image processor, declared on top of those inherited from
+# `DefaultFastImageProcessorKwargs`. Each field below is annotated as `Optional`.
+class DPTFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
+    """
+    ensure_multiple_of (`int`, *optional*, defaults to 1):
+        If `do_resize` is `True`, the image is resized to a size that is a multiple of this value. Can be overridden
+        by `ensure_multiple_of` in `preprocess`.
+    do_pad (`bool`, *optional*, defaults to `False`):
+        Whether to apply center padding. This was introduced in the DINOv2 paper, which uses the model in
+        combination with DPT.
+    size_divisor (`int`, *optional*):
+        If `do_pad` is `True`, pads the image dimensions to be divisible by this value. This was introduced in the
+        DINOv2 paper, which uses the model in combination with DPT.
+    keep_aspect_ratio (`bool`, *optional*, defaults to `False`):
+        If `True`, the image is resized to the largest possible size such that the aspect ratio is preserved. Can
+        be overridden by `keep_aspect_ratio` in `preprocess`.
+    do_reduce_labels (`bool`, *optional*, defaults to `self.do_reduce_labels`):
+        Whether or not to reduce all label values of segmentation maps by 1. Usually used for datasets where 0
+        is used for background, and background itself is not included in all classes of a dataset (e.g.
+        ADE20k). The background label will be replaced by 255.
+    """
+
+    # Resized height/width are snapped to a multiple of this value.
+    ensure_multiple_of: Optional[int]
+    # Padded height/width are made divisible by this value when `do_pad` is set.
+    size_divisor: Optional[int]
+    # Switches center padding on or off.
+    do_pad: Optional[bool]
+    # Use one common scale factor for height and width when resizing.
+    keep_aspect_ratio: Optional[bool]
+    # Shift segmentation labels down by one, sending background (0) to 255.
+    do_reduce_labels: Optional[bool]
+
+
+def get_resize_output_image_size(
+    input_image: "torch.Tensor",
+    output_size: Union[int, Iterable[int]],
+    keep_aspect_ratio: bool,
+    multiple: int,
+) -> SizeDict:
+    """
+    Compute the height and width an image should be resized to.
+
+    The requested size is turned into per-axis scale factors relative to the input. With `keep_aspect_ratio`,
+    the factor closer to 1 (the one that changes the image least) is used for both axes. Each scaled dimension
+    is then rounded to a multiple of `multiple`.
+
+    Args:
+        input_image (`torch.Tensor`):
+            Image or batch of images; its last two dimensions are read as (height, width).
+        output_size (`int` or `Iterable[int]`):
+            Requested (height, width). A single int is treated as a square size.
+        keep_aspect_ratio (`bool`):
+            Whether to apply the same scale factor to both axes.
+        multiple (`int`):
+            Both output dimensions are rounded to a multiple of this value.
+
+    Returns:
+        `SizeDict`: The computed `height` and `width`.
+    """
+
+    def constrain_to_multiple_of(val, multiple, min_val=0, max_val=None):
+        # Nearest multiple first (Python's `round`, so exact ties go to the even quotient) ...
+        x = round(val / multiple) * multiple
+
+        # ... rounded down instead when that overshoots the upper bound ...
+        if max_val is not None and x > max_val:
+            x = math.floor(val / multiple) * multiple
+
+        # ... and rounded up instead when it falls below the lower bound.
+        if x < min_val:
+            x = math.ceil(val / multiple) * multiple
+
+        return x
+
+    # The annotation allows a bare int; expand it to a square size so the unpacking below does not raise.
+    if isinstance(output_size, int):
+        output_size = (output_size, output_size)
+
+    input_height, input_width = input_image.shape[-2:]
+    output_height, output_width = output_size
+
+    # determine new height and width
+    scale_height = output_height / input_height
+    scale_width = output_width / input_width
+
+    if keep_aspect_ratio:
+        # scale as little as possible
+        if abs(1 - scale_width) < abs(1 - scale_height):
+            # fit width
+            scale_height = scale_width
+        else:
+            # fit height
+            scale_width = scale_height
+
+    new_height = constrain_to_multiple_of(scale_height * input_height, multiple=multiple)
+    new_width = constrain_to_multiple_of(scale_width * input_width, multiple=multiple)
+
+    return SizeDict(height=new_height, width=new_width)
+
+
+# Fast (torch-tensor based) DPT image processor. This span holds the class-level defaults and the
+# constructor; the methods that follow add DPT-specific resizing, padding and label handling.
+@auto_docstring
+class DPTImageProcessorFast(BaseImageProcessorFast):
+    # Class-level defaults.
+    # NOTE(review): presumably picked up by `BaseImageProcessorFast` as fallbacks for the matching
+    # `preprocess` keyword arguments; confirm against the base class.
+    resample = PILImageResampling.BICUBIC
+    image_mean = IMAGENET_STANDARD_MEAN
+    image_std = IMAGENET_STANDARD_STD
+    size = {"height": 384, "width": 384}
+    default_to_square = True
+    # No crop size and no center-crop flag are set by default.
+    crop_size = None
+    do_resize = True
+    do_center_crop = None
+    do_rescale = True
+    do_normalize = True
+    do_reduce_labels = None
+
+    # Keyword-argument schema for this processor (adds the DPT-specific options).
+    valid_kwargs = DPTFastImageProcessorKwargs
+    do_pad = False
+    rescale_factor = 1 / 255
+    # A multiple of 1 puts no constraint on the resized dimensions.
+    ensure_multiple_of = 1
+    keep_aspect_ratio = False
+
+    def __init__(self, **kwargs: Unpack[DPTFastImageProcessorKwargs]):
+        # Nothing DPT-specific happens here: all keyword arguments go straight to the base class.
+        super().__init__(**kwargs)
+
+    def reduce_label(self, labels: list["torch.Tensor"]):
+        """
+        Shift every label down by one and send the background label to the value 255.
+
+        Each tensor goes through three steps: 0 becomes 255, every value is decreased by 1, and 254 becomes
+        255. An input of 0 or 255 therefore ends up as 255, and any other value `v` becomes `v - 1`.
+
+        Args:
+            labels (`list[torch.Tensor]`):
+                Label maps to reduce. The list is updated in place.
+
+        Returns:
+            `list[torch.Tensor]`: The same list, now holding the reduced label maps.
+        """
+        for idx, label in enumerate(labels):
+            label = torch.where(label == 0, torch.tensor(255, dtype=label.dtype), label)
+            label = label - 1
+            label = torch.where(label == 254, torch.tensor(255, dtype=label.dtype), label)
+            labels[idx] = label
+
+        # Hand back the whole list so the caller keeps every map, and gets an empty list for empty input.
+        return labels
+
+    @auto_docstring
+    def preprocess(
+        self,
+        images: ImageInput,
+        segmentation_maps: Optional[ImageInput] = None,
+        **kwargs: Unpack[DPTFastImageProcessorKwargs],
+    ) -> BatchFeature:
+        r"""
+        segmentation_maps (`ImageInput`, *optional*):
+            The segmentation maps to preprocess.
+        """
+        # Thin override: it adds `segmentation_maps` to the signature and hands everything to the
+        # base-class `preprocess`, passing the maps as the second positional argument.
+        return super().preprocess(images, segmentation_maps, **kwargs)
+
+    def _preprocess_image_like_inputs(
+        self,
+        images: ImageInput,
+        segmentation_maps: Optional[ImageInput],
+        do_convert_rgb: bool,
+        input_data_format: ChannelDimension,
+        device: Optional[Union[str, "torch.device"]] = None,
+        **kwargs: Unpack[DPTFastImageProcessorKwargs],
+    ) -> BatchFeature:
+        """
+        Run the preprocessing pipeline on the images and, when provided, on their segmentation maps.
+
+        The images are prepared and passed through `_preprocess` with label reduction disabled. Segmentation
+        maps are prepared as 2-dimensional inputs and passed through `_preprocess` with rescaling and
+        normalization disabled; the result is stored under `"labels"` as `torch.int64`.
+
+        Returns:
+            `BatchFeature`: Holds `pixel_values`, plus `labels` when segmentation maps were given.
+        """
+        prepared_images = self._prepare_image_like_inputs(
+            images=images, do_convert_rgb=do_convert_rgb, input_data_format=input_data_format, device=device
+        )
+        # Label reduction is meant for the segmentation maps only, so it is forced off for the images
+        batch_feature = self._preprocess(prepared_images, **{**kwargs, "do_reduce_labels": False})
+
+        if segmentation_maps is None:
+            return batch_feature
+
+        prepared_maps = self._prepare_image_like_inputs(
+            images=segmentation_maps,
+            expected_ndims=2,
+            do_convert_rgb=False,
+            input_data_format=ChannelDimension.FIRST,
+        )
+
+        # Same settings as for the images, except that the label values are neither rescaled nor normalized
+        map_kwargs = {**kwargs, "do_normalize": False, "do_rescale": False}
+        label_maps = self._preprocess(images=prepared_maps, **map_kwargs).pixel_values
+        # NOTE(review): `squeeze(1)` assumes `pixel_values` is a stacked tensor here, i.e. that
+        # `return_tensors` was set; confirm against the callers.
+        batch_feature["labels"] = label_maps.squeeze(1).to(torch.int64)
+
+        return batch_feature
+
+    def _preprocess(
+        self,
+        images: list["torch.Tensor"],
+        do_reduce_labels: bool,
+        do_resize: bool,
+        size: SizeDict,
+        interpolation: Optional["F.InterpolationMode"],
+        do_center_crop: bool,
+        crop_size: SizeDict,
+        do_rescale: bool,
+        rescale_factor: float,
+        do_normalize: bool,
+        image_mean: Optional[Union[float, list[float]]],
+        image_std: Optional[Union[float, list[float]]],
+        keep_aspect_ratio: bool,
+        ensure_multiple_of: Optional[int],
+        do_pad: bool,
+        size_divisor: Optional[int],
+        disable_grouping: Optional[bool],
+        return_tensors: Optional[Union[str, TensorType]],
+        **kwargs,
+    ) -> BatchFeature:
+        """
+        Apply the enabled preprocessing steps to a list of image tensors.
+
+        Steps, in order: optional label reduction, optional resize, optional center crop, optional padding,
+        then the fused rescale/normalize call. Images are grouped by shape before the resize and again before
+        the remaining steps, and restored to their original order after each pass.
+
+        Returns:
+            `BatchFeature`: `pixel_values` holds the processed images, stacked into one tensor when
+            `return_tensors` is set and left as a list otherwise.
+        """
+        if do_reduce_labels:
+            images = self.reduce_label(images)
+
+        # First pass: resize, handling images of identical shape together
+        by_shape, shape_index = group_images_by_shape(images, disable_grouping=disable_grouping)
+        resized_by_shape = {
+            shape: (
+                self.resize(
+                    image=batch,
+                    size=size,
+                    interpolation=interpolation,
+                    ensure_multiple_of=ensure_multiple_of,
+                    keep_aspect_ratio=keep_aspect_ratio,
+                )
+                if do_resize
+                else batch
+            )
+            for shape, batch in by_shape.items()
+        }
+        resized_images = reorder_images(resized_by_shape, shape_index)
+
+        # Second pass: regroup first, because shapes may still differ (resize skipped, or resize produced
+        # different output sizes)
+        by_shape, shape_index = group_images_by_shape(resized_images, disable_grouping=disable_grouping)
+        processed_by_shape = {}
+        for shape, batch in by_shape.items():
+            if do_center_crop:
+                batch = self.center_crop(batch, crop_size)
+            if do_pad:
+                batch = self.pad_image(batch, size_divisor)
+            # Rescaling and normalization are done in a single fused call
+            processed_by_shape[shape] = self.rescale_and_normalize(
+                batch, do_rescale, rescale_factor, do_normalize, image_mean, image_std
+            )
+
+        processed_images = reorder_images(processed_by_shape, shape_index)
+        if return_tensors:
+            processed_images = torch.stack(processed_images, dim=0)
+        return BatchFeature(data={"pixel_values": processed_images})
+
+    def post_process_semantic_segmentation(self, outputs, target_sizes: Optional[list[tuple]] = None):
+        """
+        Convert the logits of a [`DPTForSemanticSegmentation`] output into semantic segmentation maps, one per
+        batch item. Only supports PyTorch.
+
+        Args:
+            outputs ([`DPTForSemanticSegmentation`]):
+                Raw outputs of the model; only `outputs.logits` is read.
+            target_sizes (`list[Tuple]` of length `batch_size`, *optional*):
+                Requested final size (height, width) of each prediction. If unset, predictions keep the
+                spatial size of the logits. A torch tensor is accepted and converted with `.numpy()`.
+
+        Returns:
+            semantic_segmentation: `list[torch.Tensor]` of length `batch_size`; every entry of each tensor is
+            the index of the class with the largest logit at that position.
+
+        Raises:
+            ValueError: If `target_sizes` is given and its length differs from the batch size of the logits.
+        """
+        # TODO: add support for other frameworks
+        logits = outputs.logits
+
+        if target_sizes is None:
+            # Without target sizes the arg-max is taken for the whole batch at once and then split per item
+            batch_map = logits.argmax(dim=1)
+            return [batch_map[position] for position in range(batch_map.shape[0])]
+
+        if len(logits) != len(target_sizes):
+            raise ValueError(
+                "Make sure that you pass in as many target sizes as the batch dimension of the logits"
+            )
+
+        if is_torch_tensor(target_sizes):
+            target_sizes = target_sizes.numpy()
+
+        def _resized_class_map(single_logits, single_size):
+            # Bilinearly resize one item's logits to its target size, then pick the best class per pixel
+            resized = torch.nn.functional.interpolate(
+                single_logits.unsqueeze(dim=0), size=single_size, mode="bilinear", align_corners=False
+            )
+            return resized[0].argmax(dim=0)
+
+        return [_resized_class_map(logits[idx], target_sizes[idx]) for idx in range(len(logits))]
+
+    def resize(
+        self,
+        image: "torch.Tensor",
+        size: SizeDict,
+        interpolation: "F.InterpolationMode" = None,
+        antialias: bool = True,
+        ensure_multiple_of: Optional[int] = 1,
+        keep_aspect_ratio: bool = False,
+    ) -> "torch.Tensor":
+        """
+        Resize an image to a size derived from `(size["height"], size["width"])`.
+
+        The final size is computed by `get_resize_output_image_size`: it honours `keep_aspect_ratio` and rounds
+        both dimensions to a multiple of `ensure_multiple_of`, so it can differ from the requested size. The
+        resize itself is done by the base-class `resize`.
+
+        Args:
+            image (`torch.Tensor`):
+                Image to resize.
+            size (`SizeDict`):
+                Dictionary in the format `{"height": int, "width": int}` specifying the requested size.
+            interpolation (`InterpolationMode`, *optional*):
+                `InterpolationMode` filter to use when resizing the image e.g. `InterpolationMode.BICUBIC`.
+                Passed through unchanged, so `None` leaves the choice to the base class.
+            antialias (`bool`, *optional*, defaults to `True`):
+                Whether to use antialiasing when resizing the image
+            ensure_multiple_of (`int`, *optional*, defaults to 1):
+                The image is resized to a size that is a multiple of this value. `None` is treated as 1,
+                i.e. no constraint.
+            keep_aspect_ratio (`bool`, *optional*, defaults to `False`):
+                If `True`, one common scale factor is used for height and width so the aspect ratio is
+                preserved.
+
+        Returns:
+            `torch.Tensor`: The resized image.
+
+        Raises:
+            ValueError: If `size` has no (or a zero) `height` or `width`.
+        """
+        if not size.height or not size.width:
+            # Format the object itself: `size` is not guaranteed to offer a dict-style `keys()` method
+            raise ValueError(f"The size dictionary must contain the keys 'height' and 'width'. Got {size}")
+
+        output_size = get_resize_output_image_size(
+            image,
+            output_size=(size.height, size.width),
+            keep_aspect_ratio=keep_aspect_ratio,
+            # The annotation allows `None`; fall back to 1 so the rounding does not divide by `None`
+            multiple=1 if ensure_multiple_of is None else ensure_multiple_of,
+        )
+        return super().resize(image, output_size, interpolation=interpolation, antialias=antialias)
+
+    def pad_image(
+        self,
+        image: "torch.Tensor",
+        size_divisor: int = 1,
+    ) -> "torch.Tensor":
+        r"""
+        Pad an image, or a batch of images, so that height and width become multiples of `size_divisor`.
+
+        The padding needed along each axis is split between both sides; when it is odd, the extra pixel goes
+        to the bottom/right side.
+
+        Args:
+            image (`torch.Tensor`):
+                Image to pad.  Can be a batch of images of dimensions (N, C, H, W) or a single image of dimensions (C, H, W).
+            size_divisor (`int`):
+                The width and height of the image will be padded to a multiple of this number.
+
+        Returns:
+            `torch.Tensor`: The padded image.
+        """
+
+        def _split_padding(extent):
+            # Distance from `extent` up to the next multiple of `size_divisor`, split into (before, after)
+            padded_extent = math.ceil(extent / size_divisor) * size_divisor
+            total = padded_extent - extent
+            before = total // 2
+            return before, total - before
+
+        height, width = image.shape[-2:]
+        pad_top, pad_bottom = _split_padding(height)
+        pad_left, pad_right = _split_padding(width)
+
+        # Padding is passed in the order (left, top, right, bottom)
+        return F.pad(image, (pad_left, pad_top, pad_right, pad_bottom))
+
+    def post_process_depth_estimation(
+        self,
+        outputs: "DepthEstimatorOutput",
+        target_sizes: Optional[Union[TensorType, list[tuple[int, int]], None]] = None,
+    ) -> list[dict[str, TensorType]]:
+        """
+        Split the batched depth prediction of a [`DepthEstimatorOutput`] into one entry per image, optionally
+        resizing each depth map. Only supports PyTorch.
+
+        Args:
+            outputs ([`DepthEstimatorOutput`]):
+                Raw outputs of the model; only `outputs.predicted_depth` is read.
+            target_sizes (`TensorType` or `List[Tuple[int, int]]`, *optional*):
+                Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size
+                (height, width) of each image in the batch. If left to None, predictions will not be resized.
+
+        Returns:
+            `List[Dict[str, TensorType]]`: One dictionary per prediction, each with a single key
+            `"predicted_depth"`.
+
+        Raises:
+            ValueError: If `target_sizes` is given and its length differs from the number of predictions.
+        """
+        requires_backends(self, "torch")
+
+        depth_batch = outputs.predicted_depth
+
+        sizes_given = target_sizes is not None
+        if sizes_given and len(depth_batch) != len(target_sizes):
+            raise ValueError(
+                "Make sure that you pass in as many target sizes as the batch dimension of the predicted depth"
+            )
+        if not sizes_given:
+            # A `None` entry per prediction means "keep this map as it is" in the loop below
+            target_sizes = [None] * len(depth_batch)
+
+        results = []
+        for single_depth, single_size in zip(depth_batch, target_sizes):
+            if single_size is None:
+                results.append({"predicted_depth": single_depth})
+                continue
+
+            # Two leading singleton dimensions are added for `interpolate`; `squeeze()` afterwards drops
+            # every size-1 dimension again
+            resized = torch.nn.functional.interpolate(
+                single_depth.unsqueeze(0).unsqueeze(1), size=single_size, mode="bicubic", align_corners=False
+            ).squeeze()
+            results.append({"predicted_depth": resized})
+
+        return results
+
+
+# Names exported when this module is star-imported.
+__all__ = ["DPTImageProcessorFast"]
diff --git a/phivenv/Lib/site-packages/transformers/models/dpt/modeling_dpt.py b/phivenv/Lib/site-packages/transformers/models/dpt/modeling_dpt.py
new file mode 100644
index 0000000000000000000000000000000000000000..f797f53aa4b660ecfdba024f3302d09f65368629
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/dpt/modeling_dpt.py
@@ -0,0 +1,1226 @@
+# coding=utf-8
+# Copyright 2022 Intel Labs, OpenMMLab and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch DPT (Dense Prediction Transformers) model.
+
+This implementation is heavily inspired by OpenMMLab's implementation, found here:
+https://github.com/open-mmlab/mmsegmentation/blob/master/mmseg/models/decode_heads/dpt_head.py.
+
+"""
+
+import collections.abc
+from dataclasses import dataclass
+from typing import Callable, Optional
+
+import torch
+import torch.utils.checkpoint
+from torch import nn
+from torch.nn import CrossEntropyLoss
+
+from ...activations import ACT2FN
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import BaseModelOutput, DepthEstimatorOutput, SemanticSegmenterOutput
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
+from ...utils import ModelOutput, auto_docstring, logging, torch_int
+from ...utils.backbone_utils import load_backbone
+from ...utils.generic import can_return_tuple, check_model_inputs
+from .configuration_dpt import DPTConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Base class for model's outputs that also contains intermediate activations that can be used at later stages. Useful
+    in the context of Vision models.
+    """
+)
+class BaseModelOutputWithIntermediateActivations(ModelOutput):
+    r"""
+    last_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+        Sequence of hidden-states at the output of the last layer of the model.
+    intermediate_activations (`tuple(torch.FloatTensor)`, *optional*):
+        Intermediate activations that can be used to compute hidden states of the model at various layers.
+    """
+
+    last_hidden_states: Optional[torch.FloatTensor] = None
+    intermediate_activations: Optional[tuple[torch.FloatTensor, ...]] = None
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Base class for model's outputs that also contains a pooling of the last hidden states as well as intermediate
+    activations that can be used by the model at later stages.
+    """
+)
+class BaseModelOutputWithPoolingAndIntermediateActivations(ModelOutput):
+    r"""
+    pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
+        Last layer hidden-state of the first token of the sequence (classification token) after further processing
+        through the layers used for the auxiliary pretraining task. E.g. for BERT-family of models, this returns
+        the classification token after processing through a linear layer and a tanh activation function. The linear
+        layer weights are trained from the next sentence prediction (classification) objective during pretraining.
+    intermediate_activations (`tuple(torch.FloatTensor)`, *optional*):
+        Intermediate activations that can be used to compute hidden states of the model at various layers.
+    """
+
+    last_hidden_state: Optional[torch.FloatTensor] = None
+    pooler_output: Optional[torch.FloatTensor] = None
+    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
+    attentions: Optional[tuple[torch.FloatTensor, ...]] = None
+    intermediate_activations: Optional[tuple[torch.FloatTensor, ...]] = None
+
+
+class DPTViTHybridEmbeddings(nn.Module):
+    """
+    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
+    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
+    Transformer.
+    """
+
+    def __init__(self, config: DPTConfig, feature_size: Optional[tuple[int, int]] = None):
+        super().__init__()
+        image_size, patch_size = config.image_size, config.patch_size
+        num_channels, hidden_size = config.num_channels, config.hidden_size
+
+        image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
+        patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
+        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
+
+        self.backbone = load_backbone(config)
+        feature_dim = self.backbone.channels[-1]  # provisional: reassigned in both branches below
+        if len(self.backbone.channels) != 3:
+            raise ValueError(f"Expected backbone to have 3 output features, got {len(self.backbone.channels)}")
+        self.residual_feature_map_index = [0, 1]  # Always take the output of the first and second backbone stage
+
+        if feature_size is None:
+            feat_map_shape = config.backbone_featmap_shape
+            feature_size = feat_map_shape[-2:]  # NOTE: feature_size is not read again within this class
+            feature_dim = feat_map_shape[1]
+        else:
+            feature_size = (
+                feature_size if isinstance(feature_size, collections.abc.Iterable) else (feature_size, feature_size)
+            )
+            feature_dim = self.backbone.channels[-1]
+
+        self.image_size = image_size
+        self.patch_size = patch_size[0]  # only the first entry is kept; forward() divides both height and width by it
+        self.num_channels = num_channels
+
+        self.projection = nn.Conv2d(feature_dim, hidden_size, kernel_size=1)
+
+        self.cls_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
+        self.position_embeddings = nn.Parameter(torch.zeros(1, num_patches + 1, config.hidden_size))
+
+    def _resize_pos_embed(self, posemb: torch.Tensor, grid_size_height: int, grid_size_width: int, start_index: int = 1) -> torch.Tensor:
+        posemb_tok = posemb[:, :start_index]  # leading token embeddings, passed through unchanged
+        posemb_grid = posemb[0, start_index:]  # remaining embeddings of the first batch entry, to be resized
+
+        old_grid_size = torch_int(len(posemb_grid) ** 0.5)  # the reshape below treats the stored grid as square
+
+        posemb_grid = posemb_grid.reshape(1, old_grid_size, old_grid_size, -1).permute(0, 3, 1, 2)
+        posemb_grid = nn.functional.interpolate(posemb_grid, size=(grid_size_height, grid_size_width), mode="bilinear")
+        posemb_grid = posemb_grid.permute(0, 2, 3, 1).reshape(1, grid_size_height * grid_size_width, -1)
+
+        posemb = torch.cat([posemb_tok, posemb_grid], dim=1)
+
+        return posemb
+
+    def forward(
+        self, pixel_values: torch.Tensor, interpolate_pos_encoding: bool = False
+    ) -> BaseModelOutputWithIntermediateActivations:
+        batch_size, num_channels, height, width = pixel_values.shape
+        if num_channels != self.num_channels:
+            raise ValueError(
+                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
+            )
+        if not interpolate_pos_encoding:
+            if height != self.image_size[0] or width != self.image_size[1]:
+                raise ValueError(
+                    f"Input image size ({height}*{width}) doesn't match model"
+                    f" ({self.image_size[0]}*{self.image_size[1]})."
+                )
+
+        position_embeddings = self._resize_pos_embed(
+            self.position_embeddings, height // self.patch_size, width // self.patch_size
+        )
+
+        backbone_output = self.backbone(pixel_values)
+
+        features = backbone_output.feature_maps[-1]
+
+        # Retrieve also the intermediate activations to use them at later stages
+        output_hidden_states = [backbone_output.feature_maps[index] for index in self.residual_feature_map_index]
+
+        embeddings = self.projection(features).flatten(2).transpose(1, 2)
+
+        cls_tokens = self.cls_token.expand(batch_size, -1, -1)
+        embeddings = torch.cat((cls_tokens, embeddings), dim=1)
+
+        # add positional encoding to each token
+        embeddings = embeddings + position_embeddings
+
+        # Return hidden states and intermediate activations
+        return BaseModelOutputWithIntermediateActivations(
+            last_hidden_states=embeddings,
+            intermediate_activations=output_hidden_states,
+        )
+
+
+class DPTViTEmbeddings(nn.Module):
+    """
+    Construct the CLS token, position and patch embeddings.
+    The position embeddings are bilinearly resized to the input's patch grid on every forward pass.
+    """
+
+    def __init__(self, config: DPTConfig):
+        super().__init__()
+
+        self.cls_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
+        self.patch_embeddings = DPTViTPatchEmbeddings(config)
+        num_patches = self.patch_embeddings.num_patches
+        self.position_embeddings = nn.Parameter(torch.zeros(1, num_patches + 1, config.hidden_size))
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.config = config
+
+    def _resize_pos_embed(self, posemb: torch.Tensor, grid_size_height: int, grid_size_width: int, start_index: int = 1) -> torch.Tensor:
+        posemb_tok = posemb[:, :start_index]  # leading token embeddings, passed through unchanged
+        posemb_grid = posemb[0, start_index:]  # remaining embeddings of the first batch entry, to be resized
+
+        old_grid_size = torch_int(posemb_grid.size(0) ** 0.5)  # the reshape below treats the stored grid as square
+
+        posemb_grid = posemb_grid.reshape(1, old_grid_size, old_grid_size, -1).permute(0, 3, 1, 2)
+        posemb_grid = nn.functional.interpolate(posemb_grid, size=(grid_size_height, grid_size_width), mode="bilinear")
+        posemb_grid = posemb_grid.permute(0, 2, 3, 1).reshape(1, grid_size_height * grid_size_width, -1)
+
+        posemb = torch.cat([posemb_tok, posemb_grid], dim=1)
+
+        return posemb
+
+    def forward(self, pixel_values: torch.Tensor) -> BaseModelOutputWithIntermediateActivations:
+        batch_size, num_channels, height, width = pixel_values.shape
+
+        # possibly interpolate position encodings to handle varying image sizes
+        patch_size = self.config.patch_size  # NOTE(review): used as a scalar divisor below, so an int is assumed
+        position_embeddings = self._resize_pos_embed(
+            self.position_embeddings, height // patch_size, width // patch_size
+        )
+
+        embeddings = self.patch_embeddings(pixel_values)
+
+        batch_size, seq_len, _ = embeddings.size()  # seq_len is not used below
+
+        # add the [CLS] token to the embedded patch tokens
+        cls_tokens = self.cls_token.expand(batch_size, -1, -1)
+        embeddings = torch.cat((cls_tokens, embeddings), dim=1)
+
+        # add positional encoding to each token
+        embeddings = embeddings + position_embeddings
+
+        embeddings = self.dropout(embeddings)
+
+        return BaseModelOutputWithIntermediateActivations(last_hidden_states=embeddings)
+
+
+class DPTViTPatchEmbeddings(nn.Module):
+    """
+    Image to Patch Embedding.
+    A `nn.Conv2d` with kernel size and stride equal to the patch size, flattened to one embedding per patch.
+    """
+
+    def __init__(self, config: DPTConfig):
+        super().__init__()
+        image_size, patch_size = config.image_size, config.patch_size
+        num_channels, hidden_size = config.num_channels, config.hidden_size
+
+        image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
+        patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
+        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
+        self.image_size = image_size
+        self.patch_size = patch_size
+        self.num_channels = num_channels
+        self.num_patches = num_patches  # patch count for the configured image size (floor division per axis)
+
+        self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)
+
+    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
+        batch_size, num_channels, height, width = pixel_values.shape  # only num_channels is checked below
+        if num_channels != self.num_channels:
+            raise ValueError(
+                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
+            )
+        embeddings = self.projection(pixel_values).flatten(2).transpose(1, 2)  # merge spatial dims, then put them before channels
+        return embeddings
+
+
+# Copied from transformers.models.vit.modeling_vit.eager_attention_forward
+def eager_attention_forward(
+    module: nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: Optional[torch.Tensor],
+    scaling: float,
+    dropout: float = 0.0,
+    **kwargs,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    # Take the dot product between "query" and "key" to get the raw attention scores.
+    attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling
+
+    # Normalize the attention scores to probabilities (softmax computed in float32, then cast back to the query dtype).
+    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
+
+    # This is actually dropping out entire tokens to attend to, which might
+    # seem a bit unusual, but is taken from the original Transformer paper.
+    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
+
+    # Mask heads if we want to: the mask is multiplied into the probabilities after softmax and dropout
+    if attention_mask is not None:
+        attn_weights = attn_weights * attention_mask
+
+    attn_output = torch.matmul(attn_weights, value)
+    attn_output = attn_output.transpose(1, 2).contiguous()  # swap dims 1 and 2 back for the caller's reshape
+
+    return attn_output, attn_weights
+
+
+# Copied from transformers.models.vit.modeling_vit.ViTSelfAttention with ViT->DPT
+class DPTSelfAttention(nn.Module):
+    def __init__(self, config: DPTConfig):
+        super().__init__()
+        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
+            raise ValueError(
+                f"The hidden size {config.hidden_size} is not a multiple of the number of attention "
+                f"heads {config.num_attention_heads}."
+            )
+
+        self.config = config
+        self.num_attention_heads = config.num_attention_heads
+        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
+        self.all_head_size = self.num_attention_heads * self.attention_head_size
+        self.dropout_prob = config.attention_probs_dropout_prob
+        self.scaling = self.attention_head_size**-0.5  # 1 / sqrt(head size)
+        self.is_causal = False
+
+        self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
+        self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
+        self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
+
+    def forward(
+        self, hidden_states: torch.Tensor, head_mask: Optional[torch.Tensor] = None
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        batch_size = hidden_states.shape[0]
+        new_shape = batch_size, -1, self.num_attention_heads, self.attention_head_size  # -1 infers the sequence length
+
+        key_layer = self.key(hidden_states).view(*new_shape).transpose(1, 2)  # -> (batch, heads, seq, head_size)
+        value_layer = self.value(hidden_states).view(*new_shape).transpose(1, 2)
+        query_layer = self.query(hidden_states).view(*new_shape).transpose(1, 2)
+
+        attention_interface: Callable = eager_attention_forward
+        if self.config._attn_implementation != "eager":
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+        context_layer, attention_probs = attention_interface(
+            self,
+            query_layer,
+            key_layer,
+            value_layer,
+            head_mask,  # passed in the `attention_mask` position of the attention function
+            is_causal=self.is_causal,
+            scaling=self.scaling,
+            dropout=0.0 if not self.training else self.dropout_prob,
+        )
+
+        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)  # merge the last two dims
+        context_layer = context_layer.reshape(new_context_layer_shape)
+
+        return context_layer, attention_probs
+
+
+# Copied from transformers.models.vit.modeling_vit.ViTSelfOutput with ViTConfig->DPTConfig, ViTSelfOutput->DPTViTSelfOutput
+class DPTViTSelfOutput(nn.Module):
+    """
+    The residual connection is defined in ViTLayer instead of here (as is the case with other models), due to the
+    layernorm applied before each block.
+    """
+
+    def __init__(self, config: DPTConfig):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        return hidden_states  # input_tensor is not used: no residual is added here (see class docstring)
+
+
+# Copied from transformers.models.vit.modeling_vit.ViTAttention with ViTConfig->DPTConfig, ViTSelfAttention->DPTSelfAttention, ViTSelfOutput->DPTViTSelfOutput
+class DPTViTAttention(nn.Module):
+    def __init__(self, config: DPTConfig):
+        super().__init__()
+        self.attention = DPTSelfAttention(config)
+        self.output = DPTViTSelfOutput(config)
+        self.pruned_heads: set[int] = set()
+
+    def prune_heads(self, heads: set[int]) -> None:
+        if len(heads) == 0:
+            return
+        heads, index = find_pruneable_heads_and_indices(
+            heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads
+        )
+
+        # Prune linear layers (query/key/value with the helper's default dim, the output projection along dim=1)
+        self.attention.query = prune_linear_layer(self.attention.query, index)
+        self.attention.key = prune_linear_layer(self.attention.key, index)
+        self.attention.value = prune_linear_layer(self.attention.value, index)
+        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
+
+        # Update hyper params and store pruned heads
+        self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads)
+        self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads
+        self.pruned_heads = self.pruned_heads.union(heads)
+
+    def forward(self, hidden_states: torch.Tensor, head_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
+        self_attn_output, _ = self.attention(hidden_states, head_mask)  # attention probabilities are discarded
+        output = self.output(self_attn_output, hidden_states)
+        return output
+
+
+# Copied from transformers.models.vit.modeling_vit.ViTIntermediate with ViTConfig->DPTConfig, ViTIntermediate->DPTViTIntermediate
+class DPTViTIntermediate(nn.Module):
+    def __init__(self, config: DPTConfig):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
+        if isinstance(config.hidden_act, str):  # activation given by name: look it up in ACT2FN
+            self.intermediate_act_fn = ACT2FN[config.hidden_act]
+        else:  # otherwise the configured value itself is used as the activation
+            self.intermediate_act_fn = config.hidden_act
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.intermediate_act_fn(hidden_states)
+        return hidden_states
+
+
+# Copied from transformers.models.vit.modeling_vit.ViTOutput with ViTConfig->DPTConfig, ViTOutput->DPTViTOutput
+class DPTViTOutput(nn.Module):
+    def __init__(self, config: DPTConfig):
+        super().__init__()
+        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = hidden_states + input_tensor  # residual connection with the tensor supplied by the caller
+        return hidden_states
+
+
+# Copied from transformers.models.vit.modeling_vit.ViTLayer with ViTConfig->DPTConfig, ViTAttention->DPTViTAttention, ViTIntermediate->DPTViTIntermediate, ViTOutput->DPTViTOutput, ViTLayer->DPTViTLayer
+class DPTViTLayer(GradientCheckpointingLayer):
+    """This corresponds to the Block class in the timm implementation."""
+
+    def __init__(self, config: DPTConfig):
+        super().__init__()
+        self.chunk_size_feed_forward = config.chunk_size_feed_forward  # not read elsewhere in this class
+        self.seq_len_dim = 1  # not read elsewhere in this class
+        self.attention = DPTViTAttention(config)
+        self.intermediate = DPTViTIntermediate(config)
+        self.output = DPTViTOutput(config)
+        self.layernorm_before = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.layernorm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+
+    def forward(self, hidden_states: torch.Tensor, head_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
+        hidden_states_norm = self.layernorm_before(hidden_states)  # layernorm is applied before self-attention
+        attention_output = self.attention(hidden_states_norm, head_mask)
+
+        # first residual connection
+        hidden_states = attention_output + hidden_states
+
+        # in ViT, layernorm is also applied after self-attention
+        layer_output = self.layernorm_after(hidden_states)
+        layer_output = self.intermediate(layer_output)
+
+        # second residual connection is done here
+        layer_output = self.output(layer_output, hidden_states)
+
+        return layer_output
+
+
+# Copied from transformers.models.dinov2.modeling_dinov2.Dinov2Encoder with Dinov2Config->DPTConfig, Dinov2->DPTViT
+class DPTViTEncoder(nn.Module):
+    def __init__(self, config: DPTConfig):
+        super().__init__()
+        self.config = config
+        self.layer = nn.ModuleList([DPTViTLayer(config) for _ in range(config.num_hidden_layers)])
+        self.gradient_checkpointing = False
+
+    def forward(
+        self, hidden_states: torch.Tensor, head_mask: Optional[torch.Tensor] = None, output_hidden_states: bool = False
+    ) -> BaseModelOutput:
+        all_hidden_states = [hidden_states] if output_hidden_states else None  # seeded with the encoder input
+        for i, layer_module in enumerate(self.layer):
+            layer_head_mask = head_mask[i] if head_mask is not None else None  # one mask entry per layer
+            hidden_states = layer_module(hidden_states, layer_head_mask)
+            if all_hidden_states:  # either None (collection disabled) or a non-empty list
+                all_hidden_states.append(hidden_states)
+
+        return BaseModelOutput(
+            last_hidden_state=hidden_states,
+            hidden_states=tuple(all_hidden_states) if all_hidden_states else None,
+        )
+
+
+class DPTReassembleStage(nn.Module):
+    """
+    This class reassembles the hidden states of the backbone into image-like feature representations at various
+    resolutions.
+
+    This happens in 3 stages:
+    1. Map the N + 1 tokens to a set of N tokens, by taking into account the readout ([CLS]) token according to
+       `config.readout_type`.
+    2. Project the channel dimension of the hidden states according to `config.neck_hidden_sizes`.
+    3. Resizing the spatial dimensions (height, width).
+
+    Args:
+        config (`[DPTConfig]`):
+            Model configuration class defining the model architecture.
+    """
+
+    def __init__(self, config: DPTConfig):
+        super().__init__()
+
+        self.config = config
+        self.layers = nn.ModuleList()
+        if config.is_hybrid:
+            self._init_reassemble_dpt_hybrid(config)
+        else:
+            self._init_reassemble_dpt(config)
+
+        self.neck_ignore_stages = config.neck_ignore_stages
+
+    def _init_reassemble_dpt_hybrid(self, config: DPTConfig) -> None:
+        r"""
+        For DPT-Hybrid the first 2 reassemble layers are set to `nn.Identity()`, please check the official
+        implementation: https://github.com/isl-org/DPT/blob/f43ef9e08d70a752195028a51be5e1aff227b913/dpt/vit.py#L438
+        for more details.
+        """
+        for i, factor in zip(range(len(config.neck_hidden_sizes)), config.reassemble_factors):
+            if i <= 1:
+                self.layers.append(nn.Identity())
+            elif i > 1:  # complement of the branch above
+                self.layers.append(DPTReassembleLayer(config, channels=config.neck_hidden_sizes[i], factor=factor))
+
+        if config.readout_type != "project":
+            raise ValueError(f"Readout type {config.readout_type} is not supported for DPT-Hybrid.")
+
+        # When using DPT-Hybrid the readout type is set to "project". The sanity check is done on the config file
+        self.readout_projects = nn.ModuleList()
+        hidden_size = _get_backbone_hidden_size(config)
+        for i in range(len(config.neck_hidden_sizes)):
+            if i <= 1:
+                self.readout_projects.append(nn.Sequential(nn.Identity()))
+            elif i > 1:  # complement of the branch above
+                self.readout_projects.append(
+                    nn.Sequential(nn.Linear(2 * hidden_size, hidden_size), ACT2FN[config.hidden_act])
+                )
+
+    def _init_reassemble_dpt(self, config: DPTConfig) -> None:
+        for i, factor in zip(range(len(config.neck_hidden_sizes)), config.reassemble_factors):
+            self.layers.append(DPTReassembleLayer(config, channels=config.neck_hidden_sizes[i], factor=factor))
+
+        if config.readout_type == "project":  # readout_projects is only created for this readout type
+            self.readout_projects = nn.ModuleList()
+            hidden_size = _get_backbone_hidden_size(config)
+            for _ in range(len(config.neck_hidden_sizes)):
+                self.readout_projects.append(
+                    nn.Sequential(nn.Linear(2 * hidden_size, hidden_size), ACT2FN[config.hidden_act])
+                )
+
+    def forward(self, hidden_states: list[torch.Tensor], patch_height: Optional[int] = None, patch_width: Optional[int] = None) -> list[torch.Tensor]:
+        """
+        Args:
+            hidden_states (`list[torch.FloatTensor]`, each of shape `(batch_size, sequence_length + 1, hidden_size)`):
+                List of hidden states from the backbone.
+        """
+        out = []
+
+        for i, hidden_state in enumerate(hidden_states):
+            if i not in self.neck_ignore_stages:
+                # reshape to (batch_size, num_channels, height, width)
+                cls_token, hidden_state = hidden_state[:, 0], hidden_state[:, 1:]
+                batch_size, sequence_length, num_channels = hidden_state.shape
+                if patch_height is not None and patch_width is not None:
+                    hidden_state = hidden_state.reshape(batch_size, patch_height, patch_width, num_channels)
+                else:  # no explicit grid given: treat the sequence as a square grid
+                    size = torch_int(sequence_length**0.5)
+                    hidden_state = hidden_state.reshape(batch_size, size, size, num_channels)
+                hidden_state = hidden_state.permute(0, 3, 1, 2).contiguous()
+
+                feature_shape = hidden_state.shape
+                if self.config.readout_type == "project":
+                    # reshape to (batch_size, height*width, num_channels)
+                    hidden_state = hidden_state.flatten(2).permute((0, 2, 1))
+                    readout = cls_token.unsqueeze(1).expand_as(hidden_state)
+                    # concatenate the readout token to the hidden states and project
+                    hidden_state = self.readout_projects[i](torch.cat((hidden_state, readout), -1))
+                    # reshape back to (batch_size, num_channels, height, width)
+                    hidden_state = hidden_state.permute(0, 2, 1).reshape(feature_shape)
+                elif self.config.readout_type == "add":
+                    hidden_state = hidden_state.flatten(2) + cls_token.unsqueeze(-1)  # add the readout token to every position
+                    hidden_state = hidden_state.reshape(feature_shape)
+                hidden_state = self.layers[i](hidden_state)
+            out.append(hidden_state)  # stages listed in neck_ignore_stages are appended unchanged
+
+        return out
+
+
+def _get_backbone_hidden_size(config: DPTConfig):
+    if config.backbone_config is not None and config.is_hybrid is False:  # non-hybrid model with a backbone config
+        return config.backbone_config.hidden_size
+    else:  # hybrid model, or no backbone config: use the model's own hidden size
+        return config.hidden_size
+
+
+class DPTReassembleLayer(nn.Module):
+    def __init__(self, config: DPTConfig, channels: int, factor: int):
+        super().__init__()
+        # projection: 1x1 convolution from the backbone hidden size to `channels`
+        hidden_size = _get_backbone_hidden_size(config)
+        self.projection = nn.Conv2d(in_channels=hidden_size, out_channels=channels, kernel_size=1)
+
+        # up/down sampling depending on factor
+        if factor > 1:
+            self.resize = nn.ConvTranspose2d(channels, channels, kernel_size=factor, stride=factor, padding=0)
+        elif factor == 1:
+            self.resize = nn.Identity()
+        elif factor < 1:
+            # so should downsample, using a 3x3 convolution with stride int(1 / factor)
+            self.resize = nn.Conv2d(channels, channels, kernel_size=3, stride=int(1 / factor), padding=1)
+
+    def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
+        hidden_state = self.projection(hidden_state)
+        hidden_state = self.resize(hidden_state)
+        return hidden_state
+
+
+class DPTFeatureFusionStage(nn.Module):
+    def __init__(self, config: DPTConfig):
+        super().__init__()
+        self.layers = nn.ModuleList()
+        for _ in range(len(config.neck_hidden_sizes)):  # one fusion layer per neck hidden size
+            self.layers.append(DPTFeatureFusionLayer(config))
+
+    def forward(self, hidden_states: list[torch.Tensor]) -> list[torch.Tensor]:
+        # reversing the hidden_states, we start from the last
+        hidden_states = hidden_states[::-1]
+
+        fused_hidden_states = []
+        fused_hidden_state = None
+        for hidden_state, layer in zip(hidden_states, self.layers):
+            if fused_hidden_state is None:
+                # first layer only uses the last hidden_state
+                fused_hidden_state = layer(hidden_state)
+            else:  # later layers fuse the running result with the next hidden state
+                fused_hidden_state = layer(fused_hidden_state, hidden_state)
+            fused_hidden_states.append(fused_hidden_state)
+
+        return fused_hidden_states
+
+
+class DPTPreActResidualLayer(nn.Module):
+    """
+    ResidualConvUnit, pre-activate residual unit.
+
+    Args:
+        config (`[DPTConfig]`):
+            Model configuration class defining the model architecture.
+    """
+
+    def __init__(self, config: DPTConfig):
+        super().__init__()
+
+        self.use_batch_norm = config.use_batch_norm_in_fusion_residual
+        use_bias_in_fusion_residual = (
+            config.use_bias_in_fusion_residual
+            if config.use_bias_in_fusion_residual is not None
+            else not self.use_batch_norm  # unset: use a bias only when batch norm is off
+        )
+
+        self.activation1 = nn.ReLU()
+        self.convolution1 = nn.Conv2d(
+            config.fusion_hidden_size,
+            config.fusion_hidden_size,
+            kernel_size=3,
+            stride=1,
+            padding=1,
+            bias=use_bias_in_fusion_residual,
+        )
+
+        self.activation2 = nn.ReLU()
+        self.convolution2 = nn.Conv2d(
+            config.fusion_hidden_size,
+            config.fusion_hidden_size,
+            kernel_size=3,
+            stride=1,
+            padding=1,
+            bias=use_bias_in_fusion_residual,
+        )
+
+        if self.use_batch_norm:  # batch norm modules only exist when enabled in the config
+            self.batch_norm1 = nn.BatchNorm2d(config.fusion_hidden_size)
+            self.batch_norm2 = nn.BatchNorm2d(config.fusion_hidden_size)
+
+    def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
+        residual = hidden_state  # kept for the skip connection added at the end
+        hidden_state = self.activation1(hidden_state)
+
+        hidden_state = self.convolution1(hidden_state)
+
+        if self.use_batch_norm:
+            hidden_state = self.batch_norm1(hidden_state)
+
+        hidden_state = self.activation2(hidden_state)
+        hidden_state = self.convolution2(hidden_state)
+
+        if self.use_batch_norm:
+            hidden_state = self.batch_norm2(hidden_state)
+
+        return hidden_state + residual
+
+
+class DPTFeatureFusionLayer(nn.Module):
+    """Feature fusion layer, merges feature maps from different stages.
+
+    Args:
+        config (`[DPTConfig]`):
+            Model configuration class defining the model architecture.
+        align_corners (`bool`, *optional*, defaults to `True`):
+            The align_corner setting for bilinear upsample.
+    """
+
+    def __init__(self, config: DPTConfig, align_corners: bool = True):
+        super().__init__()
+
+        self.align_corners = align_corners
+
+        self.projection = nn.Conv2d(config.fusion_hidden_size, config.fusion_hidden_size, kernel_size=1, bias=True)
+
+        self.residual_layer1 = DPTPreActResidualLayer(config)
+        self.residual_layer2 = DPTPreActResidualLayer(config)
+
+    def forward(self, hidden_state: torch.Tensor, residual: Optional[torch.Tensor] = None) -> torch.Tensor:
+        if residual is not None:
+            if hidden_state.shape != residual.shape:  # resize the residual to hidden_state's spatial size
+                residual = nn.functional.interpolate(
+                    residual, size=(hidden_state.shape[2], hidden_state.shape[3]), mode="bilinear", align_corners=False
+                )  # NOTE: align_corners is fixed to False here, independent of self.align_corners
+            hidden_state = hidden_state + self.residual_layer1(residual)
+
+        hidden_state = self.residual_layer2(hidden_state)
+        hidden_state = nn.functional.interpolate(
+            hidden_state, scale_factor=2, mode="bilinear", align_corners=self.align_corners
+        )  # 2x bilinear upsampling
+        hidden_state = self.projection(hidden_state)
+
+        return hidden_state
+
+
+@auto_docstring
+class DPTPreTrainedModel(PreTrainedModel):
+    config: DPTConfig
+    base_model_prefix = "dpt"
+    main_input_name = "pixel_values"
+    supports_gradient_checkpointing = True
+    _supports_sdpa = True
+    _supports_flash_attn = True
+    _supports_flex_attn = True
+    _supports_attention_backend = True
+    _can_record_outputs = {
+        "attentions": DPTSelfAttention,
+    }
+
+    def _init_weights(self, module):
+        """Initialize the weights"""
+        if isinstance(module, (nn.Linear, nn.Conv2d, nn.ConvTranspose2d)):
+            # Slightly different from the TF version which uses truncated_normal for initialization
+            # cf https://github.com/pytorch/pytorch/pull/5617
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, (nn.LayerNorm, nn.BatchNorm2d)):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+        if isinstance(module, (DPTViTEmbeddings, DPTViTHybridEmbeddings)):
+            module.cls_token.data.zero_()
+            module.position_embeddings.data.zero_()
+
+
+@auto_docstring
+class DPTModel(DPTPreTrainedModel):
+    def __init__(self, config: DPTConfig, add_pooling_layer: bool = True):
+        r"""
+        add_pooling_layer (bool, *optional*, defaults to `True`):
+            Whether to add a pooling layer
+        """
+        super().__init__(config)
+        self.config = config
+
+        # vit encoder
+        if config.is_hybrid:
+            self.embeddings = DPTViTHybridEmbeddings(config)
+        else:
+            self.embeddings = DPTViTEmbeddings(config)
+        self.encoder = DPTViTEncoder(config)
+
+        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.pooler = DPTViTPooler(config) if add_pooling_layer else None
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        if self.config.is_hybrid:
+            return self.embeddings
+        else:
+            return self.embeddings.patch_embeddings
+
+    def _prune_heads(self, heads_to_prune):
+        """
+        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
+        class PreTrainedModel
+        """
+        for layer, heads in heads_to_prune.items():
+            self.encoder.layer[layer].attention.prune_heads(heads)
+
+    @check_model_inputs
+    @auto_docstring
+    def forward(
+        self,
+        pixel_values: torch.FloatTensor,
+        head_mask: Optional[torch.FloatTensor] = None,
+        output_hidden_states: Optional[bool] = None,
+        **kwargs,
+    ) -> BaseModelOutputWithPoolingAndIntermediateActivations:
+        if output_hidden_states is None:
+            output_hidden_states = self.config.output_hidden_states
+
+        # Prepare head mask if needed
+        # 1.0 in head_mask indicate we keep the head
+        # attention_probs has shape bsz x n_heads x N x N
+        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
+        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
+        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
+
+        embedding_output: BaseModelOutputWithIntermediateActivations = self.embeddings(pixel_values)
+        embedding_last_hidden_states = embedding_output.last_hidden_states
+
+        encoder_outputs: BaseModelOutput = self.encoder(
+            embedding_last_hidden_states, head_mask=head_mask, output_hidden_states=output_hidden_states
+        )
+        sequence_output = encoder_outputs.last_hidden_state
+
+        sequence_output = self.layernorm(sequence_output)
+        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None
+
+        return BaseModelOutputWithPoolingAndIntermediateActivations(
+            last_hidden_state=sequence_output,
+            pooler_output=pooled_output,
+            intermediate_activations=embedding_output.intermediate_activations,
+            hidden_states=encoder_outputs.hidden_states,
+        )
+
+
+# Copied from transformers.models.vit.modeling_vit.ViTPooler with ViTConfig->DPTConfig, ViTPooler->DPTViTPooler
+class DPTViTPooler(nn.Module):
+    """Pool a token sequence by projecting and activating the hidden state of its first token."""
+
+    def __init__(self, config: DPTConfig):
+        super().__init__()
+        # Linear projection from `hidden_size` to `pooler_output_size`.
+        self.dense = nn.Linear(config.hidden_size, config.pooler_output_size)
+        # Activation function looked up by name in ACT2FN.
+        self.activation = ACT2FN[config.pooler_act]
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        # We "pool" the model by simply taking the hidden state corresponding
+        # to the first token.
+        first_token_tensor = hidden_states[:, 0]
+        pooled_output = self.dense(first_token_tensor)
+        pooled_output = self.activation(pooled_output)
+        return pooled_output
+
+
+class DPTNeck(nn.Module):
+    """
+    DPTNeck. A neck is a module that is normally used between the backbone and the head. It takes a list of tensors as
+    input and produces another list of tensors as output. For DPT, it includes 2 stages:
+
+    * DPTReassembleStage
+    * DPTFeatureFusionStage.
+
+    Args:
+        config (dict): config dict.
+    """
+
+    def __init__(self, config: DPTConfig):
+        super().__init__()
+        self.config = config
+
+        # postprocessing: only required in case of a non-hierarchical backbone (e.g. ViT, BEiT)
+        if config.backbone_config is not None and config.backbone_config.model_type in ["swinv2"]:
+            self.reassemble_stage = None
+        else:
+            self.reassemble_stage = DPTReassembleStage(config)
+
+        self.convs = nn.ModuleList()
+        for channel in config.neck_hidden_sizes:
+            self.convs.append(nn.Conv2d(channel, config.fusion_hidden_size, kernel_size=3, padding=1, bias=False))
+
+        # fusion
+        self.fusion_stage = DPTFeatureFusionStage(config)
+
+    def forward(
+        self,
+        hidden_states: list[torch.Tensor],
+        patch_height: Optional[int] = None,
+        patch_width: Optional[int] = None,
+    ) -> list[torch.Tensor]:
+        """
+        Args:
+            hidden_states (`list[torch.FloatTensor]`, each of shape `(batch_size, sequence_length, hidden_size)` or `(batch_size, hidden_size, height, width)`):
+                List of hidden states from the backbone.
+        """
+        if not isinstance(hidden_states, (tuple, list)):
+            raise TypeError("hidden_states should be a tuple or list of tensors")
+
+        if len(hidden_states) != len(self.config.neck_hidden_sizes):
+            raise ValueError("The number of hidden states should be equal to the number of neck hidden sizes.")
+
+        # postprocess hidden states
+        if self.reassemble_stage is not None:
+            hidden_states = self.reassemble_stage(hidden_states, patch_height, patch_width)
+
+        features = [self.convs[i](feature) for i, feature in enumerate(hidden_states)]
+
+        # fusion blocks
+        output = self.fusion_stage(features)
+
+        return output
+
+
+class DPTDepthEstimationHead(nn.Module):
+    """
+    Output head consisting of 3 convolutional layers. It progressively halves the feature dimension and upsamples
+    the predictions to the input resolution after the first convolutional layer (details can be found in the paper's
+    supplementary material).
+    """
+
+    def __init__(self, config: DPTConfig):
+        super().__init__()
+
+        self.config = config
+
+        self.projection = None
+        if config.add_projection:
+            self.projection = nn.Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
+
+        features = config.fusion_hidden_size
+        self.head = nn.Sequential(
+            nn.Conv2d(features, features // 2, kernel_size=3, stride=1, padding=1),
+            nn.Upsample(scale_factor=2, mode="bilinear", align_corners=True),
+            nn.Conv2d(features // 2, 32, kernel_size=3, stride=1, padding=1),
+            nn.ReLU(),
+            nn.Conv2d(32, 1, kernel_size=1, stride=1, padding=0),
+            nn.ReLU(),
+        )
+
+    def forward(self, hidden_states: list[torch.Tensor]) -> torch.Tensor:
+        # use last features
+        hidden_states = hidden_states[self.config.head_in_index]
+
+        if self.projection is not None:
+            hidden_states = self.projection(hidden_states)
+            hidden_states = nn.ReLU()(hidden_states)
+
+        predicted_depth = self.head(hidden_states)
+        predicted_depth = predicted_depth.squeeze(dim=1)
+
+        return predicted_depth
+
+
+@auto_docstring(
+    custom_intro="""
+    DPT Model with a depth estimation head on top (consisting of 3 convolutional layers) e.g. for KITTI, NYUv2.
+    """
+)
+class DPTForDepthEstimation(DPTPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+
+        self.backbone = None
+        if config.is_hybrid is False and (config.backbone_config is not None or config.backbone is not None):
+            self.backbone = load_backbone(config)
+        else:
+            self.dpt = DPTModel(config, add_pooling_layer=False)
+
+        # Neck
+        self.neck = DPTNeck(config)
+
+        # Depth estimation head
+        self.head = DPTDepthEstimationHead(config)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        pixel_values: torch.FloatTensor,
+        head_mask: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        output_hidden_states: Optional[bool] = None,
+        **kwargs,
+    ) -> DepthEstimatorOutput:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
+            Ground truth depth estimation maps for computing the loss.
+
+        Examples:
+        ```python
+        >>> from transformers import AutoImageProcessor, DPTForDepthEstimation
+        >>> import torch
+        >>> import numpy as np
+        >>> from PIL import Image
+        >>> import requests
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> image_processor = AutoImageProcessor.from_pretrained("Intel/dpt-large")
+        >>> model = DPTForDepthEstimation.from_pretrained("Intel/dpt-large")
+
+        >>> # prepare image for the model
+        >>> inputs = image_processor(images=image, return_tensors="pt")
+
+        >>> with torch.no_grad():
+        ...     outputs = model(**inputs)
+
+        >>> # interpolate to original size
+        >>> post_processed_output = image_processor.post_process_depth_estimation(
+        ...     outputs,
+        ...     target_sizes=[(image.height, image.width)],
+        ... )
+
+        >>> # visualize the prediction
+        >>> predicted_depth = post_processed_output[0]["predicted_depth"]
+        >>> depth = predicted_depth * 255 / predicted_depth.max()
+        >>> depth = depth.detach().cpu().numpy()
+        >>> depth = Image.fromarray(depth.astype("uint8"))
+        ```"""
+
+        if output_hidden_states is None:
+            output_hidden_states = self.config.output_hidden_states
+
+        loss = None
+        if labels is not None:
+            raise NotImplementedError("Training is not implemented yet")
+
+        if self.backbone is not None:
+            outputs = self.backbone.forward_with_filtered_kwargs(pixel_values, output_hidden_states=True, **kwargs)
+            hidden_states = outputs.feature_maps
+        else:
+            outputs = self.dpt(pixel_values, head_mask=head_mask, output_hidden_states=True, **kwargs)
+            hidden_states = outputs.hidden_states
+            # only keep certain features based on config.backbone_out_indices
+            # note that the hidden_states also include the initial embeddings
+            if not self.config.is_hybrid:
+                hidden_states = [
+                    feature for idx, feature in enumerate(hidden_states[1:]) if idx in self.config.backbone_out_indices
+                ]
+            else:
+                backbone_hidden_states = outputs.intermediate_activations
+                backbone_hidden_states.extend(
+                    feature
+                    for idx, feature in enumerate(hidden_states[1:])
+                    if idx in self.config.backbone_out_indices[2:]
+                )
+                hidden_states = backbone_hidden_states
+
+        patch_height, patch_width = None, None
+        if self.config.backbone_config is not None and self.config.is_hybrid is False:
+            _, _, height, width = pixel_values.shape
+            patch_size = self.config.backbone_config.patch_size
+            patch_height = height // patch_size
+            patch_width = width // patch_size
+
+        hidden_states = self.neck(hidden_states, patch_height, patch_width)
+        predicted_depth = self.head(hidden_states)
+
+        return DepthEstimatorOutput(
+            loss=loss,
+            predicted_depth=predicted_depth,
+            hidden_states=outputs.hidden_states if output_hidden_states else None,
+            attentions=outputs.attentions,
+        )
+
+
+class DPTSemanticSegmentationHead(nn.Module):
+    def __init__(self, config: DPTConfig):
+        super().__init__()
+
+        self.config = config
+        features = config.fusion_hidden_size
+        self.head = nn.Sequential(
+            nn.Conv2d(features, features, kernel_size=3, padding=1, bias=False),
+            nn.BatchNorm2d(features),
+            nn.ReLU(),
+            nn.Dropout(config.semantic_classifier_dropout),
+            nn.Conv2d(features, config.num_labels, kernel_size=1),
+            nn.Upsample(scale_factor=2, mode="bilinear", align_corners=True),
+        )
+
+    def forward(self, hidden_states: list[torch.Tensor]) -> torch.Tensor:
+        # use last features
+        hidden_states = hidden_states[self.config.head_in_index]
+        logits = self.head(hidden_states)
+        return logits
+
+
+class DPTAuxiliaryHead(nn.Module):
+    def __init__(self, config: DPTConfig):
+        super().__init__()
+
+        features = config.fusion_hidden_size
+        self.head = nn.Sequential(
+            nn.Conv2d(features, features, kernel_size=3, padding=1, bias=False),
+            nn.BatchNorm2d(features),
+            nn.ReLU(),
+            nn.Dropout(0.1, False),
+            nn.Conv2d(features, config.num_labels, kernel_size=1),
+        )
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        logits = self.head(hidden_states)
+        return logits
+
+
+@auto_docstring
+class DPTForSemanticSegmentation(DPTPreTrainedModel):
+    def __init__(self, config: DPTConfig):
+        super().__init__(config)
+
+        self.dpt = DPTModel(config, add_pooling_layer=False)
+
+        # Neck
+        self.neck = DPTNeck(config)
+
+        # Segmentation head(s)
+        self.head = DPTSemanticSegmentationHead(config)
+        self.auxiliary_head = DPTAuxiliaryHead(config) if config.use_auxiliary_head else None
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        output_hidden_states: Optional[bool] = None,
+        **kwargs,
+    ) -> SemanticSegmenterOutput:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
+            Ground truth semantic segmentation maps for computing the loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels > 1`, a classification loss is computed (Cross-Entropy).
+
+        Examples:
+        ```python
+        >>> from transformers import AutoImageProcessor, DPTForSemanticSegmentation
+        >>> from PIL import Image
+        >>> import requests
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> image_processor = AutoImageProcessor.from_pretrained("Intel/dpt-large-ade")
+        >>> model = DPTForSemanticSegmentation.from_pretrained("Intel/dpt-large-ade")
+
+        >>> inputs = image_processor(images=image, return_tensors="pt")
+
+        >>> outputs = model(**inputs)
+        >>> logits = outputs.logits
+        ```"""
+        if output_hidden_states is None:
+            output_hidden_states = self.config.output_hidden_states
+
+        if labels is not None and self.config.num_labels == 1:
+            raise ValueError("The number of labels should be greater than one")
+
+        outputs: BaseModelOutputWithPoolingAndIntermediateActivations = self.dpt(
+            pixel_values, head_mask=head_mask, output_hidden_states=True, **kwargs
+        )
+        hidden_states = outputs.hidden_states
+
+        # only keep certain features based on config.backbone_out_indices
+        # note that the hidden_states also include the initial embeddings
+        if not self.config.is_hybrid:
+            hidden_states = [
+                feature for idx, feature in enumerate(hidden_states[1:]) if idx in self.config.backbone_out_indices
+            ]
+        else:
+            backbone_hidden_states = outputs.intermediate_activations
+            backbone_hidden_states.extend(
+                feature for idx, feature in enumerate(hidden_states[1:]) if idx in self.config.backbone_out_indices[2:]
+            )
+
+            hidden_states = backbone_hidden_states
+
+        hidden_states = self.neck(hidden_states=hidden_states)
+        logits = self.head(hidden_states)
+
+        auxiliary_logits = None
+        if self.auxiliary_head is not None:
+            auxiliary_logits = self.auxiliary_head(hidden_states[-1])
+
+        loss = None
+        if labels is not None:
+            # upsample logits to the images' original size
+            upsampled_logits = nn.functional.interpolate(
+                logits, size=labels.shape[-2:], mode="bilinear", align_corners=False
+            )
+            if auxiliary_logits is not None:
+                upsampled_auxiliary_logits = nn.functional.interpolate(
+                    auxiliary_logits, size=labels.shape[-2:], mode="bilinear", align_corners=False
+                )
+            # compute weighted loss
+            loss_fct = CrossEntropyLoss(ignore_index=self.config.semantic_loss_ignore_index)
+            main_loss = loss_fct(upsampled_logits, labels)
+            auxiliary_loss = loss_fct(upsampled_auxiliary_logits, labels)
+            loss = main_loss + self.config.auxiliary_loss_weight * auxiliary_loss
+
+        return SemanticSegmenterOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states if output_hidden_states else None,
+            attentions=outputs.attentions,
+        )
+
+
+# Names exported by `from ... import *` for this module.
+__all__ = ["DPTForDepthEstimation", "DPTForSemanticSegmentation", "DPTModel", "DPTPreTrainedModel"]
diff --git a/phivenv/Lib/site-packages/transformers/models/dpt/modular_dpt.py b/phivenv/Lib/site-packages/transformers/models/dpt/modular_dpt.py
new file mode 100644
index 0000000000000000000000000000000000000000..9c74b4c570ae57785068333c0cbdfc61d12b4e50
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/dpt/modular_dpt.py
@@ -0,0 +1,310 @@
+# coding=utf-8
+# Copyright 2025 HuggingFace Inc. team. All rights reserved.
+#
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+from collections.abc import Iterable
+from typing import TYPE_CHECKING, Optional, Union
+
+from ...image_processing_base import BatchFeature
+from ...image_processing_utils_fast import BaseImageProcessorFast, DefaultFastImageProcessorKwargs
+from ...image_transforms import group_images_by_shape, reorder_images
+from ...image_utils import (
+    IMAGENET_STANDARD_MEAN,
+    IMAGENET_STANDARD_STD,
+    PILImageResampling,
+    SizeDict,
+)
+from ...utils import (
+    TensorType,
+    auto_docstring,
+    is_torch_available,
+    is_torchvision_available,
+    is_torchvision_v2_available,
+    requires_backends,
+)
+from ..beit.image_processing_beit_fast import BeitImageProcessorFast
+
+
+if TYPE_CHECKING:
+    from ...modeling_outputs import DepthEstimatorOutput
+
+if is_torch_available():
+    import torch
+
+if is_torchvision_v2_available():
+    from torchvision.transforms.v2 import functional as F
+elif is_torchvision_available():
+    from torchvision.transforms import functional as F
+
+
+def get_resize_output_image_size(
+    input_image: "torch.Tensor",
+    output_size: Union[int, Iterable[int]],
+    keep_aspect_ratio: bool,
+    multiple: int,
+) -> SizeDict:
+    def constrain_to_multiple_of(val, multiple, min_val=0, max_val=None):
+        x = round(val / multiple) * multiple
+
+        if max_val is not None and x > max_val:
+            x = math.floor(val / multiple) * multiple
+
+        if x < min_val:
+            x = math.ceil(val / multiple) * multiple
+
+        return x
+
+    input_height, input_width = input_image.shape[-2:]
+    output_height, output_width = output_size
+
+    # determine new height and width
+    scale_height = output_height / input_height
+    scale_width = output_width / input_width
+
+    if keep_aspect_ratio:
+        # scale as little as possible
+        if abs(1 - scale_width) < abs(1 - scale_height):
+            # fit width
+            scale_height = scale_width
+        else:
+            # fit height
+            scale_width = scale_height
+
+    new_height = constrain_to_multiple_of(scale_height * input_height, multiple=multiple)
+    new_width = constrain_to_multiple_of(scale_width * input_width, multiple=multiple)
+
+    return SizeDict(height=new_height, width=new_width)
+
+
+class DPTFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
+    """
+    ensure_multiple_of (`int`, *optional*, defaults to 1):
+        If `do_resize` is `True`, the image is resized to a size that is a multiple of this value. Can be overridden
+        by `ensure_multiple_of` in `preprocess`.
+    size_divisor (`int`, *optional*):
+        If `do_pad` is `True`, pads the image dimensions to be divisible by this value. This was introduced in the
+        DINOv2 paper, which uses the model in combination with DPT.
+    do_pad (`bool`, *optional*, defaults to `False`):
+        Whether to apply center padding. This was introduced in the DINOv2 paper, which uses the model in
+        combination with DPT.
+    keep_aspect_ratio (`bool`, *optional*, defaults to `False`):
+        If `True`, the image is resized to the largest possible size such that the aspect ratio is preserved. Can
+        be overridden by `keep_aspect_ratio` in `preprocess`.
+    do_reduce_labels (`bool`, *optional*, defaults to `self.do_reduce_labels`):
+        Whether or not to reduce all label values of segmentation maps by 1. Usually used for datasets where 0
+        is used for background, and background itself is not included in all classes of a dataset (e.g.
+        ADE20k). The background label will be replaced by 255.
+    """
+
+    ensure_multiple_of: Optional[int]
+    size_divisor: Optional[int]
+    do_pad: Optional[bool]
+    keep_aspect_ratio: Optional[bool]
+    do_reduce_labels: Optional[bool]
+
+
+@auto_docstring
+class DPTImageProcessorFast(BeitImageProcessorFast):
+    resample = PILImageResampling.BICUBIC
+    image_mean = IMAGENET_STANDARD_MEAN
+    image_std = IMAGENET_STANDARD_STD
+    size = {"height": 384, "width": 384}
+    do_resize = True
+    do_rescale = True
+    do_normalize = True
+    do_pad = False
+    rescale_factor = 1 / 255
+    ensure_multiple_of = 1
+    keep_aspect_ratio = False
+    do_reduce_labels = False
+    crop_size = None
+    do_center_crop = None
+    do_reduce_labels = None
+
+    valid_kwargs = DPTFastImageProcessorKwargs
+
+    def resize(
+        self,
+        image: "torch.Tensor",
+        size: SizeDict,
+        interpolation: "F.InterpolationMode" = None,
+        antialias: bool = True,
+        ensure_multiple_of: Optional[int] = 1,
+        keep_aspect_ratio: bool = False,
+    ) -> "torch.Tensor":
+        """
+        Resize an image to `(size["height"], size["width"])`.
+
+        Args:
+            image (`torch.Tensor`):
+                Image to resize.
+            size (`SizeDict`):
+                Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image.
+            interpolation (`InterpolationMode`, *optional*, defaults to `InterpolationMode.BILINEAR`):
+                `InterpolationMode` filter to use when resizing the image e.g. `InterpolationMode.BICUBIC`.
+            antialias (`bool`, *optional*, defaults to `True`):
+                Whether to use antialiasing when resizing the image
+            ensure_multiple_of (`int`, *optional*):
+                If `do_resize` is `True`, the image is resized to a size that is a multiple of this value
+            keep_aspect_ratio (`bool`, *optional*, defaults to `False`):
+                If `True`, and `do_resize` is `True`, the image is resized to the largest possible size such that the aspect ratio is preserved.
+
+        Returns:
+            `torch.Tensor`: The resized image.
+        """
+        if not size.height or not size.width:
+            raise ValueError(f"The size dictionary must contain the keys 'height' and 'width'. Got {size.keys()}")
+
+        output_size = get_resize_output_image_size(
+            image,
+            output_size=(size.height, size.width),
+            keep_aspect_ratio=keep_aspect_ratio,
+            multiple=ensure_multiple_of,
+        )
+        return BaseImageProcessorFast.resize(
+            self, image, output_size, interpolation=interpolation, antialias=antialias
+        )
+
+    def pad_image(
+        self,
+        image: "torch.Tensor",
+        size_divisor: int = 1,
+    ) -> "torch.Tensor":
+        r"""
+        Center pad a batch of images to be a multiple of `size_divisor`.
+
+        Args:
+            image (`torch.Tensor`):
+                Image to pad.  Can be a batch of images of dimensions (N, C, H, W) or a single image of dimensions (C, H, W).
+            size_divisor (`int`):
+                The width and height of the image will be padded to a multiple of this number.
+        """
+        height, width = image.shape[-2:]
+
+        def _get_pad(size, size_divisor):
+            new_size = math.ceil(size / size_divisor) * size_divisor
+            pad_size = new_size - size
+            pad_size_left = pad_size // 2
+            pad_size_right = pad_size - pad_size_left
+            return pad_size_left, pad_size_right
+
+        pad_top, pad_bottom = _get_pad(height, size_divisor)
+        pad_left, pad_right = _get_pad(width, size_divisor)
+        padding = (pad_left, pad_top, pad_right, pad_bottom)
+        return F.pad(image, padding)
+
+    def _preprocess(
+        self,
+        images: list["torch.Tensor"],
+        do_reduce_labels: bool,
+        do_resize: bool,
+        size: SizeDict,
+        interpolation: Optional["F.InterpolationMode"],
+        do_center_crop: bool,
+        crop_size: SizeDict,
+        do_rescale: bool,
+        rescale_factor: float,
+        do_normalize: bool,
+        image_mean: Optional[Union[float, list[float]]],
+        image_std: Optional[Union[float, list[float]]],
+        keep_aspect_ratio: bool,
+        ensure_multiple_of: Optional[int],
+        do_pad: bool,
+        size_divisor: Optional[int],
+        disable_grouping: Optional[bool],
+        return_tensors: Optional[Union[str, TensorType]],
+        **kwargs,
+    ) -> BatchFeature:
+        if do_reduce_labels:
+            images = self.reduce_label(images)
+
+        # Group images by size for batched resizing
+        grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping)
+        resized_images_grouped = {}
+        for shape, stacked_images in grouped_images.items():
+            if do_resize:
+                stacked_images = self.resize(
+                    image=stacked_images,
+                    size=size,
+                    interpolation=interpolation,
+                    ensure_multiple_of=ensure_multiple_of,
+                    keep_aspect_ratio=keep_aspect_ratio,
+                )
+            resized_images_grouped[shape] = stacked_images
+        resized_images = reorder_images(resized_images_grouped, grouped_images_index)
+
+        # Group images by size for further processing
+        # Needed in case do_resize is False, or resize returns images with different sizes
+        grouped_images, grouped_images_index = group_images_by_shape(resized_images, disable_grouping=disable_grouping)
+        processed_images_grouped = {}
+        for shape, stacked_images in grouped_images.items():
+            if do_center_crop:
+                stacked_images = self.center_crop(stacked_images, crop_size)
+            if do_pad:
+                stacked_images = self.pad_image(stacked_images, size_divisor)
+            # Fused rescale and normalize
+            stacked_images = self.rescale_and_normalize(
+                stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std
+            )
+            processed_images_grouped[shape] = stacked_images
+
+        processed_images = reorder_images(processed_images_grouped, grouped_images_index)
+        processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images
+        return BatchFeature(data={"pixel_values": processed_images})
+
+    def post_process_depth_estimation(
+        self,
+        outputs: "DepthEstimatorOutput",
+        target_sizes: Optional[Union[TensorType, list[tuple[int, int]], None]] = None,
+    ) -> list[dict[str, TensorType]]:
+        """
+        Converts the raw output of [`DepthEstimatorOutput`] into final depth predictions and depth PIL images.
+        Only supports PyTorch.
+
+        Args:
+            outputs ([`DepthEstimatorOutput`]):
+                Raw outputs of the model.
+            target_sizes (`TensorType` or `List[Tuple[int, int]]`, *optional*):
+                Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size
+                (height, width) of each image in the batch. If left to None, predictions will not be resized.
+
+        Returns:
+            `List[Dict[str, TensorType]]`: A list of dictionaries of tensors representing the processed depth
+            predictions.
+        """
+        requires_backends(self, "torch")
+
+        predicted_depth = outputs.predicted_depth
+
+        if (target_sizes is not None) and (len(predicted_depth) != len(target_sizes)):
+            raise ValueError(
+                "Make sure that you pass in as many target sizes as the batch dimension of the predicted depth"
+            )
+
+        results = []
+        target_sizes = [None] * len(predicted_depth) if target_sizes is None else target_sizes
+        for depth, target_size in zip(predicted_depth, target_sizes):
+            if target_size is not None:
+                depth = torch.nn.functional.interpolate(
+                    depth.unsqueeze(0).unsqueeze(1), size=target_size, mode="bicubic", align_corners=False
+                ).squeeze()
+
+            results.append({"predicted_depth": depth})
+
+        return results
+
+
+# Names exported by `from ... import *` for this module.
+__all__ = ["DPTImageProcessorFast"]
diff --git a/phivenv/Lib/site-packages/transformers/models/efficientloftr/__init__.py b/phivenv/Lib/site-packages/transformers/models/efficientloftr/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..5ded8084c320035afa9f1ba7108c74f2721f0d8d
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/efficientloftr/__init__.py
@@ -0,0 +1,28 @@
+# Copyright 2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:  # static type checkers only: expose the real public names via the star-imports below
+    from .configuration_efficientloftr import *
+    from .image_processing_efficientloftr import *
+    from .modeling_efficientloftr import *
+else:
+    import sys
+    # At runtime, swap this package's `sys.modules` entry for a `_LazyModule` built from this file's import structure.
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/phivenv/Lib/site-packages/transformers/models/efficientloftr/__pycache__/__init__.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/efficientloftr/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..619258ac5b2a6599e7e6290a44efa03815f8a118
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/efficientloftr/__pycache__/__init__.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/efficientloftr/__pycache__/configuration_efficientloftr.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/efficientloftr/__pycache__/configuration_efficientloftr.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bc5de28114a98ea378bce885ba4aa602f036f75d
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/efficientloftr/__pycache__/configuration_efficientloftr.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/efficientloftr/__pycache__/image_processing_efficientloftr.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/efficientloftr/__pycache__/image_processing_efficientloftr.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..753d411393497bb53334af7411acda14f227d0cd
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/efficientloftr/__pycache__/image_processing_efficientloftr.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/efficientloftr/__pycache__/modeling_efficientloftr.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/efficientloftr/__pycache__/modeling_efficientloftr.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..72e0305b1d71610c3e8ce41f89c5742c7d0e7af9
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/efficientloftr/__pycache__/modeling_efficientloftr.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/efficientloftr/configuration_efficientloftr.py b/phivenv/Lib/site-packages/transformers/models/efficientloftr/configuration_efficientloftr.py
new file mode 100644
index 0000000000000000000000000000000000000000..1e7a84d7f53995afdd3398acaf178b1d310bd738
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/efficientloftr/configuration_efficientloftr.py
@@ -0,0 +1,199 @@
+# Copyright 2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Optional
+
+from ...configuration_utils import PretrainedConfig
+from ...modeling_rope_utils import rope_config_validation
+
+
+class EfficientLoFTRConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of an [`EfficientLoFTRForKeypointMatching`].
+    It is used to instantiate an EfficientLoFTR model according to the specified arguments, defining the model
+    architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the
+    EfficientLoFTR [zju-community/efficientloftr](https://huggingface.co/zju-community/efficientloftr) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        stage_num_blocks (`List`, *optional*, defaults to [1, 2, 4, 14]):
+            The number of blocks in each stage.
+        out_features (`List`, *optional*, defaults to [64, 64, 128, 256]):
+            The number of output channels of each stage. The last value must be equal to `hidden_size`.
+        stage_stride (`List`, *optional*, defaults to [2, 1, 2, 2]):
+            The stride of the first block of each stage; the remaining blocks of a stage use a stride of 1.
+        hidden_size (`int`, *optional*, defaults to 256):
+            The dimension of the descriptors. Must be equal to `out_features[-1]`, otherwise a `ValueError` is raised.
+        activation_function (`str`, *optional*, defaults to `"relu"`):
+            The activation function used in the backbone
+        q_aggregation_kernel_size (`int`, *optional*, defaults to 4):
+            The kernel size of the aggregation of query states in the fusion network
+        kv_aggregation_kernel_size (`int`, *optional*, defaults to 4):
+            The kernel size of the aggregation of key and value states in the fusion network
+        q_aggregation_stride (`int`, *optional*, defaults to 4):
+            The stride of the aggregation of query states in the fusion network
+        kv_aggregation_stride (`int`, *optional*, defaults to 4):
+            The stride of the aggregation of key and value states in the fusion network
+        num_attention_layers (`int`, *optional*, defaults to 4):
+            Number of attention layers in the LocalFeatureTransformer
+        num_attention_heads (`int`, *optional*, defaults to 8):
+            The number of attention heads. The same value is also stored as `num_key_value_heads`.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        attention_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use a bias in the query, key, value and output projection layers during attention.
+        mlp_activation_function (`str`, *optional*, defaults to `"leaky_relu"`):
+            Activation function used in the attention mlp layer.
+        coarse_matching_skip_softmax (`bool`, *optional*, defaults to `False`):
+            Whether to skip softmax or not at the coarse matching step.
+        coarse_matching_threshold (`float`, *optional*, defaults to 0.2):
+            The threshold for the minimum score required for a match.
+        coarse_matching_temperature (`float`, *optional*, defaults to 0.1):
+            The temperature to apply to the coarse similarity matrix
+        coarse_matching_border_removal (`int`, *optional*, defaults to 2):
+            The size of the border to remove during coarse matching
+        fine_kernel_size (`int`, *optional*, defaults to 8):
+            Kernel size used for the fine feature matching
+        batch_norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the batch normalization layers.
+        rope_theta (`float`, *optional*, defaults to 10000.0):
+            The base period of the RoPE embeddings.
+        partial_rotary_factor (`float`, *optional*, defaults to 4.0):
+            Dim factor for the RoPE embeddings, in EfficientLoFTR, frequencies should be generated for
+            the whole hidden_size, so this factor is used to compensate.
+        rope_scaling (`Dict`, *optional*):
+            Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
+            and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
+            accordingly.
+            Expected contents:
+                `rope_type` (`str`):
+                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
+                    'llama3', '2d'], with 'default' being the original RoPE implementation.
+                `dim` (`int`): The dimension of the RoPE embeddings.
+        fine_matching_slice_dim (`int`, *optional*, defaults to 8):
+            The size of the slice used to divide the fine features for the first and second fine matching stages.
+        fine_matching_regress_temperature (`float`, *optional*, defaults to 10.0):
+            The temperature to apply to the fine similarity matrix
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+
+    Examples:
+        ```python
+        >>> from transformers import EfficientLoFTRConfig, EfficientLoFTRForKeypointMatching
+
+        >>> # Initializing an EfficientLoFTR configuration
+        >>> configuration = EfficientLoFTRConfig()
+
+        >>> # Initializing a model from the EfficientLoFTR configuration
+        >>> model = EfficientLoFTRForKeypointMatching(configuration)
+
+        >>> # Accessing the model configuration
+        >>> configuration = model.config
+        ```
+    """
+
+    model_type = "efficientloftr"
+
+    def __init__(
+        self,
+        stage_num_blocks: Optional[list[int]] = None,
+        out_features: Optional[list[int]] = None,
+        stage_stride: Optional[list[int]] = None,
+        hidden_size: int = 256,
+        activation_function: str = "relu",
+        q_aggregation_kernel_size: int = 4,
+        kv_aggregation_kernel_size: int = 4,
+        q_aggregation_stride: int = 4,
+        kv_aggregation_stride: int = 4,
+        num_attention_layers: int = 4,
+        num_attention_heads: int = 8,
+        attention_dropout: float = 0.0,
+        attention_bias: bool = False,
+        mlp_activation_function: str = "leaky_relu",
+        coarse_matching_skip_softmax: bool = False,
+        coarse_matching_threshold: float = 0.2,
+        coarse_matching_temperature: float = 0.1,
+        coarse_matching_border_removal: int = 2,
+        fine_kernel_size: int = 8,
+        batch_norm_eps: float = 1e-5,
+        rope_theta: float = 10000.0,
+        partial_rotary_factor: float = 4.0,
+        rope_scaling: Optional[dict] = None,
+        fine_matching_slice_dim: int = 8,
+        fine_matching_regress_temperature: float = 10.0,
+        initializer_range: float = 0.02,
+        **kwargs,
+    ):
+        # Stage level of RepVGG
+        self.stage_num_blocks = stage_num_blocks if stage_num_blocks is not None else [1, 2, 4, 14]
+        self.stage_stride = stage_stride if stage_stride is not None else [2, 1, 2, 2]
+        self.out_features = out_features if out_features is not None else [64, 64, 128, 256]
+        self.stage_in_channels = [1] + self.out_features[:-1]  # 1 input channel, then each previous stage's output
+
+        # Block level of RepVGG
+        self.stage_block_stride = [
+            [stride] + [1] * (num_blocks - 1) for stride, num_blocks in zip(self.stage_stride, self.stage_num_blocks)
+        ]
+        self.stage_block_out_channels = [
+            [self.out_features[stage_idx]] * num_blocks for stage_idx, num_blocks in enumerate(self.stage_num_blocks)
+        ]
+        self.stage_block_in_channels = [
+            [self.stage_in_channels[stage_idx]] + self.stage_block_out_channels[stage_idx][:-1]
+            for stage_idx in range(len(self.stage_num_blocks))
+        ]
+
+        # Fine matching level of EfficientLoFTR
+        self.fine_fusion_dims = list(reversed(self.out_features))[:-1]  # out_features reversed, first stage dropped
+
+        self.hidden_size = hidden_size
+        if self.hidden_size != self.out_features[-1]:
+            raise ValueError(
+                f"hidden_size should be equal to the last value in out_features. hidden_size = {self.hidden_size}, out_features = {self.out_features[-1]}"
+            )
+
+        self.activation_function = activation_function
+        self.q_aggregation_kernel_size = q_aggregation_kernel_size
+        self.kv_aggregation_kernel_size = kv_aggregation_kernel_size
+        self.q_aggregation_stride = q_aggregation_stride
+        self.kv_aggregation_stride = kv_aggregation_stride
+        self.num_attention_layers = num_attention_layers
+        self.num_attention_heads = num_attention_heads
+        self.attention_dropout = attention_dropout
+        self.attention_bias = attention_bias
+        self.intermediate_size = self.hidden_size * 2  # derived from hidden_size, not a constructor argument
+        self.mlp_activation_function = mlp_activation_function
+        self.coarse_matching_skip_softmax = coarse_matching_skip_softmax
+        self.coarse_matching_threshold = coarse_matching_threshold
+        self.coarse_matching_temperature = coarse_matching_temperature
+        self.coarse_matching_border_removal = coarse_matching_border_removal
+        self.fine_kernel_size = fine_kernel_size
+        self.batch_norm_eps = batch_norm_eps
+        self.fine_matching_slice_dim = fine_matching_slice_dim
+        self.fine_matching_regress_temperature = fine_matching_regress_temperature
+
+        self.num_key_value_heads = num_attention_heads  # always equal to num_attention_heads
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling if rope_scaling is not None else {"rope_type": "default"}
+
+        # for compatibility with "default" rope type
+        self.partial_rotary_factor = partial_rotary_factor
+        rope_config_validation(self)
+
+        self.initializer_range = initializer_range
+
+        super().__init__(**kwargs)
+
+
+__all__ = ["EfficientLoFTRConfig"]
diff --git a/phivenv/Lib/site-packages/transformers/models/efficientloftr/image_processing_efficientloftr.py b/phivenv/Lib/site-packages/transformers/models/efficientloftr/image_processing_efficientloftr.py
new file mode 100644
index 0000000000000000000000000000000000000000..32a351ea2355933de10fe622e9896ca6ba6291df
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/efficientloftr/image_processing_efficientloftr.py
@@ -0,0 +1,461 @@
+# Copyright 2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Image processor class for EfficientLoFTR."""
+
+from typing import Optional, Union
+
+import numpy as np
+
+from ... import is_torch_available, is_vision_available
+from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
+from ...image_transforms import resize, to_channel_dimension_format
+from ...image_utils import (
+    ChannelDimension,
+    ImageInput,
+    ImageType,
+    PILImageResampling,
+    get_image_type,
+    infer_channel_dimension_format,
+    is_pil_image,
+    is_scaled_image,
+    is_valid_image,
+    to_numpy_array,
+    valid_images,
+    validate_preprocess_arguments,
+)
+from ...utils import TensorType, logging, requires_backends
+
+
+if is_torch_available():
+    import torch
+
+if is_vision_available():
+    import PIL
+    from PIL import Image, ImageDraw
+
+    from .modeling_efficientloftr import KeypointMatchingOutput
+
+logger = logging.get_logger(__name__)
+
+
+# Copied from transformers.models.superpoint.image_processing_superpoint.is_grayscale
+def is_grayscale(
+    image: np.ndarray,
+    input_data_format: Optional[Union[str, ChannelDimension]] = None,
+):  # truthy when `image` has a single channel or its channels 0, 1 and 2 hold identical values
+    if input_data_format == ChannelDimension.FIRST:
+        if image.shape[0] == 1:
+            return True
+        return np.all(image[0, ...] == image[1, ...]) and np.all(image[1, ...] == image[2, ...])
+    elif input_data_format == ChannelDimension.LAST:  # (any format other than FIRST/LAST implicitly returns None)
+        if image.shape[-1] == 1:
+            return True
+        return np.all(image[..., 0] == image[..., 1]) and np.all(image[..., 1] == image[..., 2])
+
+
+# Copied from transformers.models.superpoint.image_processing_superpoint.convert_to_grayscale
+def convert_to_grayscale(
+    image: ImageInput,
+    input_data_format: Optional[Union[str, ChannelDimension]] = None,
+) -> ImageInput:
+    """
+    Converts an image to grayscale format using the NTSC formula. Only numpy arrays and PIL images are converted;
+    any other input is returned unchanged. TODO support torch and tensorflow grayscale conversion
+
+    For arrays, this function is supposed to return a 1-channel image, but it returns a 3-channel image with the same
+    value in each channel, because of an issue that is discussed in :
+    https://github.com/huggingface/transformers/pull/25786#issuecomment-1730176446
+
+    Args:
+        image (Image):
+            The image to convert.
+        input_data_format (`ChannelDimension` or `str`, *optional*):
+            The channel dimension format for the input image. It is required for numpy arrays that are not already
+            grayscale: a missing or unsupported format raises a `ValueError`.
+    """
+    requires_backends(convert_to_grayscale, ["vision"])
+
+    if isinstance(image, np.ndarray):
+        if is_grayscale(image, input_data_format=input_data_format):
+            return image
+        if input_data_format == ChannelDimension.FIRST:
+            gray_image = image[0, ...] * 0.2989 + image[1, ...] * 0.5870 + image[2, ...] * 0.1140
+            gray_image = np.stack([gray_image] * 3, axis=0)
+        elif input_data_format == ChannelDimension.LAST:
+            gray_image = image[..., 0] * 0.2989 + image[..., 1] * 0.5870 + image[..., 2] * 0.1140
+            gray_image = np.stack([gray_image] * 3, axis=-1)
+        else:  # without this branch `gray_image` would be unbound at the return below
+            raise ValueError(f"Unsupported channel dimension format: {input_data_format}")
+        return gray_image
+
+    # PIL images are converted to single-channel mode "L"; anything else is passed through untouched.
+    return image.convert("L") if isinstance(image, PIL.Image.Image) else image
+
+
+# Copied from transformers.models.superglue.image_processing_superglue.validate_and_format_image_pairs
+def validate_and_format_image_pairs(images: ImageInput):
+    """
+    Validate `images` and return a flat list of images: a pair (list of 2 valid images) is returned unchanged, a
+    list of pairs is flattened in order, and anything else raises a `ValueError`. A valid image is a PIL image or
+    a non-PIL image with exactly 3 dimensions.
+    """
+    error_message = (  # adjacent literals form ONE string; comma-separated they were a tuple printed as its repr
+        "Input images must be one of the following:\n"
+        " - A pair of PIL images.\n"
+        " - A pair of 3D arrays.\n"
+        " - A list of pairs of PIL images.\n"
+        " - A list of pairs of 3D arrays."
+    )
+
+    def _is_valid_image(image):
+        """images is a PIL Image or a 3D array."""
+        return is_pil_image(image) or (
+            is_valid_image(image) and get_image_type(image) != ImageType.PIL and len(image.shape) == 3
+        )
+
+    if isinstance(images, list):
+        if len(images) == 2 and all(_is_valid_image(image) for image in images):
+            return images
+        if all(isinstance(pair, list) and len(pair) == 2 and all(map(_is_valid_image, pair)) for pair in images):
+            return [image for pair in images for image in pair]
+    raise ValueError(error_message)
+
+
+class EfficientLoFTRImageProcessor(BaseImageProcessor):
+    r"""
+    Constructs an EfficientLoFTR image processor.
+
+    Args:
+        do_resize (`bool`, *optional*, defaults to `True`):
+            Controls whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden
+            by `do_resize` in the `preprocess` method.
+        size (`Dict[str, int]` *optional*, defaults to `{"height": 480, "width": 640}`):
+            Resolution of the output image after `resize` is applied. Only has an effect if `do_resize` is set to
+            `True`. Can be overridden by `size` in the `preprocess` method.
+        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`):
+            Resampling filter to use if resizing the image. Can be overridden by `resample` in the `preprocess` method.
+        do_rescale (`bool`, *optional*, defaults to `True`):
+            Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in
+            the `preprocess` method.
+        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
+            Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess`
+            method.
+        do_grayscale (`bool`, *optional*, defaults to `True`):
+            Whether to convert the image to grayscale. Can be overridden by `do_grayscale` in the `preprocess` method.
+    """
+
+    model_input_names = ["pixel_values"]
+
+    def __init__(
+        self,
+        do_resize: bool = True,
+        size: Optional[dict[str, int]] = None,
+        resample: PILImageResampling = PILImageResampling.BILINEAR,
+        do_rescale: bool = True,
+        rescale_factor: float = 1 / 255,
+        do_grayscale: bool = True,
+        **kwargs,
+    ) -> None:
+        super().__init__(**kwargs)  # remaining keyword arguments are handed to the base image processor
+        size = size if size is not None else {"height": 480, "width": 640}
+        size = get_size_dict(size, default_to_square=False)
+        # Stored as defaults: `preprocess` falls back to these whenever the matching argument is None.
+        self.do_resize = do_resize
+        self.size = size
+        self.resample = resample
+        self.do_rescale = do_rescale
+        self.rescale_factor = rescale_factor
+        self.do_grayscale = do_grayscale
+
+    # Copied from transformers.models.superpoint.image_processing_superpoint.SuperPointImageProcessor.resize
+    def resize(
+        self,
+        image: np.ndarray,
+        size: dict[str, int],
+        data_format: Optional[Union[str, ChannelDimension]] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+        **kwargs,
+    ):
+        """
+        Resize an image to the `(height, width)` given by `size`; extra `kwargs` are forwarded to `resize`.
+
+        Args:
+            image (`np.ndarray`):
+                Image to resize.
+            size (`dict[str, int]`):
+                Dictionary of the form `{"height": int, "width": int}`, specifying the size of the output image.
+            data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format of the output image. If not provided, it will be inferred from the input
+                image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the input image. If unset, the channel dimension format is inferred
+                from the input image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+        """
+        size = get_size_dict(size, default_to_square=False)
+
+        return resize(
+            image,
+            size=(size["height"], size["width"]),
+            data_format=data_format,
+            input_data_format=input_data_format,
+            **kwargs,
+        )
+
+    # Copied from transformers.models.superglue.image_processing_superglue.SuperGlueImageProcessor.preprocess
+    def preprocess(
+        self,
+        images,
+        do_resize: Optional[bool] = None,
+        size: Optional[dict[str, int]] = None,
+        resample: Optional[PILImageResampling] = None,
+        do_rescale: Optional[bool] = None,
+        rescale_factor: Optional[float] = None,
+        do_grayscale: Optional[bool] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        data_format: ChannelDimension = ChannelDimension.FIRST,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+        **kwargs,
+    ) -> BatchFeature:
+        """
+        Preprocess a pair of images or a batch of image pairs.
+
+        Args:
+            images (`ImageInput`):
+                Image pairs to preprocess. Expects either a list of 2 images or a list of lists of 2 images, with
+                pixel values ranging from 0 to 255. If passing in images with pixel values between 0 and 1, set
+                `do_rescale=False`.
+            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
+                Whether to resize the image.
+            size (`dict[str, int]`, *optional*, defaults to `self.size`):
+                Size of the output image after `resize` has been applied. Must be a dictionary of the form
+                `{"height": int, "width": int}`: every image is resized to exactly that `(height, width)`
+                (`self.resize` reads `size["height"]` and `size["width"]`). Only has an effect if `do_resize` is set
+                to `True`.
+            resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
+                Resampling filter to use if resizing the image. This can be one of `PILImageResampling`, filters. Only
+                has an effect if `do_resize` is set to `True`.
+            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
+                Whether to rescale the image values between [0 - 1].
+            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
+                Rescale factor to rescale the image by if `do_rescale` is set to `True`.
+            do_grayscale (`bool`, *optional*, defaults to `self.do_grayscale`):
+                Whether to convert the image to grayscale.
+            return_tensors (`str` or `TensorType`, *optional*):
+                The type of tensors to return. Can be one of:
+                    - Unset: Return a list of `np.ndarray`.
+                    - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
+                    - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
+                    - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
+                    - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
+            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
+                The channel dimension format for the output image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - Unset: Use the channel dimension format of the input image.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the input image. If unset, the channel dimension format is inferred
+                from the input image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+        """
+
+        do_resize = do_resize if do_resize is not None else self.do_resize
+        resample = resample if resample is not None else self.resample
+        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
+        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
+        do_grayscale = do_grayscale if do_grayscale is not None else self.do_grayscale
+
+        size = size if size is not None else self.size
+        size = get_size_dict(size, default_to_square=False)
+
+        # Validate and convert the input images into a flattened list of images for all subsequent processing steps.
+        images = validate_and_format_image_pairs(images)
+
+        if not valid_images(images):
+            raise ValueError(
+                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
+                "torch.Tensor, tf.Tensor or jax.ndarray."
+            )
+
+        validate_preprocess_arguments(
+            do_resize=do_resize,
+            size=size,
+            resample=resample,
+            do_rescale=do_rescale,
+            rescale_factor=rescale_factor,
+        )
+
+        # All transformations expect numpy arrays.
+        images = [to_numpy_array(image) for image in images]
+
+        if is_scaled_image(images[0]) and do_rescale:
+            logger.warning_once(
+                "It looks like you are trying to rescale already rescaled images. If the input"
+                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
+            )
+
+        if input_data_format is None:
+            # We assume that all images have the same channel dimension format.
+            input_data_format = infer_channel_dimension_format(images[0])
+
+        all_images = []
+        for image in images:
+            if do_resize:
+                image = self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format)
+
+            if do_rescale:
+                image = self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format)
+
+            if do_grayscale:
+                image = convert_to_grayscale(image, input_data_format=input_data_format)
+
+            image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
+            all_images.append(image)
+
+        # Convert back the flattened list of images into a list of pairs of images.
+        image_pairs = [all_images[i : i + 2] for i in range(0, len(all_images), 2)]
+
+        data = {"pixel_values": image_pairs}
+
+        return BatchFeature(data=data, tensor_type=return_tensors)
+
+    def post_process_keypoint_matching(
+        self,
+        outputs: "KeypointMatchingOutput",
+        target_sizes: Union[TensorType, list[tuple]],
+        threshold: float = 0.0,
+    ) -> list[dict[str, "torch.Tensor"]]:
+        """
+        Converts the raw output of [`KeypointMatchingOutput`] into lists of matched keypoints and matching scores,
+        with keypoint coordinates scaled by the given image sizes and cast to `int32`.
+        Args:
+            outputs ([`KeypointMatchingOutput`]):
+                Raw outputs of the model.
+            target_sizes (`torch.Tensor` or `List[Tuple[Tuple[int, int]]]`):
+                Tensor of shape `(batch_size, 2, 2)` or list of tuples of tuples (`Tuple[int, int]`) containing the
+                target size `(height, width)` of each image in the batch. This must be the original image size (before
+                any processing).
+            threshold (`float`, *optional*, defaults to 0.0):
+                Threshold to filter out the matches with low scores.
+        Returns:
+            `List[Dict]`: A list of dictionaries, one per image pair, with the keys `keypoints0` and `keypoints1`
+            (kept keypoints of the first and second image) and `matching_scores`.
+        """
+        if outputs.matches.shape[0] != len(target_sizes):
+            raise ValueError("Make sure that you pass in as many target sizes as the batch dimension of the mask")
+        if not all(len(target_size) == 2 for target_size in target_sizes):
+            raise ValueError("Each element of target_sizes must contain the size (h, w) of each image of the batch")
+
+        if isinstance(target_sizes, list):
+            image_pair_sizes = torch.tensor(target_sizes, device=outputs.matches.device)
+        else:
+            if target_sizes.shape[1] != 2 or target_sizes.shape[2] != 2:
+                raise ValueError(
+                    "Each element of target_sizes must contain the size (h, w) of each image of the batch"
+                )
+            image_pair_sizes = target_sizes
+
+        keypoints = outputs.keypoints.clone()
+        keypoints = keypoints * image_pair_sizes.flip(-1).reshape(-1, 2, 1, 2)
+        keypoints = keypoints.to(torch.int32)
+
+        results = []
+        for keypoints_pair, matches, scores in zip(keypoints, outputs.matches, outputs.matching_scores):
+            # Filter out matches with low scores
+            valid_matches = torch.logical_and(scores > threshold, matches > -1)
+
+            matched_keypoints0 = keypoints_pair[0][valid_matches[0]]
+            matched_keypoints1 = keypoints_pair[1][valid_matches[1]]
+            matching_scores = scores[0][valid_matches[0]]
+
+            results.append(
+                {
+                    "keypoints0": matched_keypoints0,
+                    "keypoints1": matched_keypoints1,
+                    "matching_scores": matching_scores,
+                }
+            )
+
+        return results
+
+    def visualize_keypoint_matching(
+        self,
+        images: ImageInput,
+        keypoint_matching_output: list[dict[str, torch.Tensor]],
+    ) -> list["Image.Image"]:
+        """
+        Draws each image pair side by side and links every pair of matched keypoints with a colored line.
+
+        Args:
+            images (`ImageInput`):
+                Image pairs to plot. Same as `EfficientLoFTRImageProcessor.preprocess`. Expects either a list of 2
+                images or a list of list of 2 images list with pixel values ranging from 0 to 255.
+            keypoint_matching_output (List[Dict[str, torch.Tensor]]]):
+                A post processed keypoint matching output
+
+        Returns:
+            `List[PIL.Image.Image]`: One PIL image per pair, with the first image on the left, the second on the
+            right, black dots on the keypoints and a line per match whose color comes from `_get_color`.
+        """
+        flat_images = [to_numpy_array(image) for image in validate_and_format_image_pairs(images)]
+        pairs = [flat_images[start : start + 2] for start in range(0, len(flat_images), 2)]
+
+        rendered = []
+        for pair, matching in zip(pairs, keypoint_matching_output):
+            left, right = pair[0], pair[1]
+            left_h, left_w = left.shape[:2]
+            right_h, right_w = right.shape[:2]
+
+            # Black canvas tall enough for the taller image and wide enough for both, images aligned at the top.
+            canvas = np.zeros((max(left_h, right_h), left_w + right_w, 3), dtype=np.uint8)
+            canvas[:left_h, :left_w] = left
+            canvas[:right_h, left_w:] = right
+
+            canvas_pil = Image.fromarray(canvas)
+            draw = ImageDraw.Draw(canvas_pil)
+
+            xs0, ys0 = matching["keypoints0"].unbind(1)
+            xs1, ys1 = matching["keypoints1"].unbind(1)
+            for x0, y0, x1, y1, score in zip(xs0, ys0, xs1, ys1, matching["matching_scores"]):
+                line_color = self._get_color(score)
+                # Keypoints of the second image are shifted right by the width of the first one.
+                shifted_x1 = x1 + left_w
+                draw.line((x0, y0, shifted_x1, y1), fill=line_color, width=3)
+                draw.ellipse((x0 - 2, y0 - 2, x0 + 2, y0 + 2), fill="black")
+                draw.ellipse((shifted_x1 - 2, y1 - 2, shifted_x1 + 2, y1 + 2), fill="black")
+
+            rendered.append(canvas_pil)
+        return rendered
+
+    def _get_color(self, score):
+        """Returns the `(red, green, blue)` tuple of a score: red scales with `1 - score`, green with `score`."""
+        red, green = (int(255 * weight) for weight in (1 - score, score))
+        return (red, green, 0)
+
+
+__all__ = ["EfficientLoFTRImageProcessor"]
diff --git a/phivenv/Lib/site-packages/transformers/models/efficientloftr/modeling_efficientloftr.py b/phivenv/Lib/site-packages/transformers/models/efficientloftr/modeling_efficientloftr.py
new file mode 100644
index 0000000000000000000000000000000000000000..f5a20a7cc87dea233db5f4ddc3313df44f72c9d5
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/efficientloftr/modeling_efficientloftr.py
@@ -0,0 +1,1325 @@
+# Copyright 2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from dataclasses import dataclass
+from typing import Callable, Optional, Union
+
+import torch
+from torch import nn
+
+from ...activations import ACT2CLS, ACT2FN
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import BackboneOutput
+from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from ...processing_utils import Unpack
+from ...pytorch_utils import compile_compatible_method_lru_cache
+from ...utils import (
+    ModelOutput,
+    TransformersKwargs,
+    auto_docstring,
+    can_return_tuple,
+    torch_int,
+)
+from ...utils.generic import check_model_inputs
+from .configuration_efficientloftr import EfficientLoFTRConfig
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Base class for outputs of keypoint matching models. Due to the nature of keypoint detection and matching, the number
+    of keypoints is not fixed and can vary from image to image, which makes batching non-trivial. In the batch of
+    images, the maximum number of matches is set as the dimension of the matches and matching scores. The mask tensor is
+    used to indicate which values in the keypoints, matches and matching_scores tensors are keypoint matching
+    information.
+    """
+)
+class KeypointMatchingOutput(ModelOutput):
+    r"""
+    matches (`torch.FloatTensor` of shape `(batch_size, 2, num_matches)`):
+        Index of keypoint matched in the other image.
+    matching_scores (`torch.FloatTensor` of shape `(batch_size, 2, num_matches)`):
+        Scores of predicted matches.
+    keypoints (`torch.FloatTensor` of shape `(batch_size, num_keypoints, 2)`):
+        Absolute (x, y) coordinates of predicted keypoints in a given image.
+    hidden_states (`tuple[torch.FloatTensor, ...]`, *optional*):
+        Tuple of `torch.FloatTensor` (one for the output of each stage) of shape `(batch_size, 2, num_channels,
+        num_keypoints)`, returned when `output_hidden_states=True` is passed or when
+        `config.output_hidden_states=True`)
+    attentions (`tuple[torch.FloatTensor, ...]`, *optional*):
+        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, 2, num_heads, num_keypoints,
+        num_keypoints)`, returned when `output_attentions=True` is passed or when `config.output_attentions=True`)
+    """
+
+    # Every field defaults to `None`, so an instance can be created with any subset of them.
+    # NOTE(review): the intro text mentions a "mask tensor", but no mask field is declared here.
+    # NOTE(review): the image processor's `post_process_keypoint_matching` keeps entries with `matches > -1`,
+    # so -1 presumably marks "no match" — confirm against the model's forward pass.
+    matches: Optional[torch.FloatTensor] = None
+    matching_scores: Optional[torch.FloatTensor] = None
+    # NOTE(review): the docstring says `(batch_size, num_keypoints, 2)` absolute coordinates, yet the image
+    # processor multiplies `keypoints` by image sizes reshaped to `(-1, 2, 1, 2)` and indexes each batch item
+    # with `[0]` / `[1]`; presumably the real layout is `(batch_size, 2, num_keypoints, 2)` with relative
+    # coordinates — TODO confirm.
+    keypoints: Optional[torch.FloatTensor] = None
+    hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    attentions: Optional[tuple[torch.FloatTensor]] = None
+
+
+@compile_compatible_method_lru_cache(maxsize=32)
+def compute_embeddings(inv_freq: torch.Tensor, embed_height: int, embed_width: int, hidden_size: int) -> torch.Tensor:
+    """
+    Builds the 2D rotary angles of an `embed_height x embed_width` grid.
+
+    Row and column positions are counted from 1 (cumulative sums of ones) and each multiplied by `inv_freq`.
+    Row angles fill the even channels of the result and column angles the odd ones.
+
+    Args:
+        inv_freq (`torch.Tensor`): Inverse frequencies; also fixes the dtype and device of the result.
+        embed_height (`int`): Number of grid rows.
+        embed_width (`int`): Number of grid columns.
+        hidden_size (`int`): Model hidden size; the result has `hidden_size // 2` channels.
+
+    Returns:
+        `torch.Tensor` of shape `(1, embed_height, embed_width, hidden_size // 2)` holding the angles.
+    """
+    tensor_kwargs = {"dtype": inv_freq.dtype, "device": inv_freq.device}
+    grid = torch.ones(embed_height, embed_width, **tensor_kwargs)
+    row_positions = grid.cumsum(0).unsqueeze(-1)
+    col_positions = grid.cumsum(1).unsqueeze(-1)
+
+    angles = torch.zeros(1, embed_height, embed_width, hidden_size // 2, **tensor_kwargs)
+    angles[..., 0::2] = row_positions * inv_freq
+    angles[..., 1::2] = col_positions * inv_freq
+    return angles
+
+
+class EfficientLoFTRRotaryEmbedding(nn.Module):
+    """
+    Produces the cosine and sine tables of the 2D rotary position embedding for the aggregated feature grid.
+
+    The inverse frequencies come from the rope init function selected by `config.rope_scaling["rope_type"]` and
+    are kept in a non-persistent buffer.
+    """
+
+    inv_freq: torch.Tensor  # fix linting for `register_buffer`
+
+    def __init__(self, config: EfficientLoFTRConfig, device=None):
+        super().__init__()
+        self.config = config
+        self.rope_type = config.rope_scaling["rope_type"]
+        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
+
+        # The second value returned by the init function is not used here.
+        base_inv_freq, _ = self.rope_init_fn(self.config, device)
+        broadcastable_inv_freq = base_inv_freq[None, None, None, :].float().expand(1, 1, 1, -1)
+        self.register_buffer("inv_freq", broadcastable_inv_freq, persistent=False)
+
+    @torch.no_grad()
+    def forward(
+        self, x: torch.Tensor, position_ids: Optional[tuple[torch.LongTensor, torch.LongTensor]] = None
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """
+        Computes `(cos, sin)` for the grid obtained after query aggregation of `x`.
+
+        The grid size is derived from the last two dimensions of `x` with the query aggregation kernel size and
+        stride. `position_ids` is accepted but not read. Both outputs use the device and dtype of `x`.
+        """
+        feature_rows, feature_cols = x.shape[-2:]
+        kernel_size = self.config.q_aggregation_kernel_size
+        stride = self.config.q_aggregation_stride
+        grid_rows = (feature_rows - kernel_size) // stride + 1
+        grid_cols = (feature_cols - kernel_size) // stride + 1
+
+        device_type = x.device.type
+        if not isinstance(device_type, str) or device_type == "mps":
+            device_type = "cpu"
+        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+            angles = compute_embeddings(self.inv_freq, grid_rows, grid_cols, self.config.hidden_size)
+            sin, cos = angles.sin(), angles.cos()
+
+        # Duplicate every angle so that both values of a rotated pair share it.
+        sin = sin.repeat_interleave(2, dim=-1).to(device=x.device, dtype=x.dtype)
+        cos = cos.repeat_interleave(2, dim=-1).to(device=x.device, dtype=x.dtype)
+        return cos, sin
+
+
+# Copied from transformers.models.rt_detr_v2.modeling_rt_detr_v2.RTDetrV2ConvNormLayer with RTDetrV2->EfficientLoFTR
+class EfficientLoFTRConvNormLayer(nn.Module):
+    """
+    Bias-free 2D convolution followed by batch normalization and an optional activation.
+
+    Args:
+        config: Configuration object; only `config.batch_norm_eps` is read here.
+        in_channels: Number of input channels of the convolution.
+        out_channels: Number of output channels of the convolution and of the batch norm.
+        kernel_size: Convolution kernel size.
+        stride: Convolution stride.
+        padding: Convolution padding; when `None`, `(kernel_size - 1) // 2` is used.
+        activation: Key into `ACT2CLS`; when `None`, no activation is applied (`nn.Identity`).
+    """
+
+    def __init__(self, config, in_channels, out_channels, kernel_size, stride, padding=None, activation=None):
+        super().__init__()
+        self.conv = nn.Conv2d(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride,
+            padding=(kernel_size - 1) // 2 if padding is None else padding,
+            bias=False,
+        )
+        # Second positional argument of `nn.BatchNorm2d` is `eps`.
+        self.norm = nn.BatchNorm2d(out_channels, config.batch_norm_eps)
+        self.activation = nn.Identity() if activation is None else ACT2CLS[activation]()
+
+    def forward(self, hidden_state):
+        """Applies convolution, batch norm and activation, in that order."""
+        hidden_state = self.conv(hidden_state)
+        hidden_state = self.norm(hidden_state)
+        hidden_state = self.activation(hidden_state)
+        return hidden_state
+
+
+class EfficientLoFTRRepVGGBlock(GradientCheckpointingLayer):
+    """
+    RepVGG block, from "RepVGG: Making VGG-style ConvNets Great Again".
+
+    The output is the activation of the sum of a 3x3 conv-norm branch, a 1x1 conv-norm branch and, only when
+    the block keeps the channel count and uses stride 1, a batch-norm branch applied to the input.
+    """
+
+    def __init__(self, config: EfficientLoFTRConfig, stage_idx: int, block_idx: int):
+        super().__init__()
+        in_channels = config.stage_block_in_channels[stage_idx][block_idx]
+        out_channels = config.stage_block_out_channels[stage_idx][block_idx]
+        stride = config.stage_block_stride[stage_idx][block_idx]
+        keeps_shape = in_channels == out_channels and stride == 1
+
+        self.conv1 = EfficientLoFTRConvNormLayer(
+            config, in_channels, out_channels, kernel_size=3, stride=stride, padding=1
+        )
+        self.conv2 = EfficientLoFTRConvNormLayer(
+            config, in_channels, out_channels, kernel_size=1, stride=stride, padding=0
+        )
+        self.identity = nn.BatchNorm2d(in_channels) if keeps_shape else None
+        if config.activation_function is None:
+            self.activation = nn.Identity()
+        else:
+            self.activation = ACT2FN[config.activation_function]
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        """Sums the available branches and applies the activation."""
+        shortcut = 0 if self.identity is None else self.identity(hidden_states)
+        fused = self.conv1(hidden_states) + self.conv2(hidden_states) + shortcut
+        return self.activation(fused)
+
+
+class EfficientLoFTRRepVGGStage(nn.Module):
+    """One backbone stage: `config.stage_num_blocks[stage_idx]` RepVGG blocks applied one after the other."""
+
+    def __init__(self, config: EfficientLoFTRConfig, stage_idx: int):
+        super().__init__()
+        num_blocks = config.stage_num_blocks[stage_idx]
+        self.blocks = nn.ModuleList(
+            [EfficientLoFTRRepVGGBlock(config, stage_idx, block_idx) for block_idx in range(num_blocks)]
+        )
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        """Feeds `hidden_states` through every block of the stage and returns the last output."""
+        for repvgg_block in self.blocks:
+            hidden_states = repvgg_block(hidden_states)
+        return hidden_states
+
+
+class EfficientLoFTRepVGG(nn.Module):
+    """RepVGG backbone made of one stage per entry of `config.stage_stride`."""
+
+    def __init__(self, config: EfficientLoFTRConfig):
+        super().__init__()
+
+        num_stages = len(config.stage_stride)
+        self.stages = nn.ModuleList([EfficientLoFTRRepVGGStage(config, stage_idx) for stage_idx in range(num_stages)])
+
+    def forward(self, hidden_states: torch.Tensor) -> list[torch.Tensor]:
+        """Runs all stages in sequence and returns the outputs of every stage except the first."""
+        stage_outputs = []
+        for stage_idx, stage in enumerate(self.stages):
+            hidden_states = stage(hidden_states)
+            # The first stage only feeds the next one; its output is not returned.
+            if stage_idx > 0:
+                stage_outputs.append(hidden_states)
+        return stage_outputs
+
+
+class EfficientLoFTRAggregationLayer(nn.Module):
+    """
+    Downsamples feature maps before attention.
+
+    Queries go through a bias-free grouped convolution (one group per channel), keys/values through a max
+    pooling. Both results are moved to channels-last and normalized by the same `LayerNorm`.
+    """
+
+    def __init__(self, config: EfficientLoFTRConfig):
+        super().__init__()
+
+        channels = config.hidden_size
+
+        self.q_aggregation = nn.Conv2d(
+            channels,
+            channels,
+            kernel_size=config.q_aggregation_kernel_size,
+            stride=config.q_aggregation_stride,
+            padding=0,
+            groups=channels,
+            bias=False,
+        )
+        self.kv_aggregation = nn.MaxPool2d(
+            kernel_size=config.kv_aggregation_kernel_size, stride=config.kv_aggregation_stride
+        )
+        self.norm = nn.LayerNorm(channels)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """
+        Returns the normalized aggregated queries and keys/values.
+
+        Queries always come from `hidden_states`. Keys/values come from `encoder_hidden_states` when it is
+        given (cross-attention) and from `hidden_states` otherwise.
+        """
+        kv_source = hidden_states if encoder_hidden_states is None else encoder_hidden_states
+
+        # permute(0, 2, 3, 1) moves dim 1 last so that LayerNorm normalizes over it.
+        aggregated_query = self.q_aggregation(hidden_states).permute(0, 2, 3, 1)
+        aggregated_kv = self.kv_aggregation(kv_source).permute(0, 2, 3, 1)
+        return self.norm(aggregated_query), self.norm(aggregated_kv)
+
+
+# Copied from transformers.models.cohere.modeling_cohere.rotate_half
+def rotate_half(x):
+    """
+    Rotates every consecutive pair of values along the last dimension: `(a, b)` becomes `(-b, a)`.
+
+    The even-indexed and odd-indexed values are taken apart, recombined as `(-odd, even)` pairs and flattened
+    back, so the output has the same shape as `x`.
+    """
+    # Split and rotate. Note that this function is different from e.g. Llama.
+    x1 = x[..., ::2]
+    x2 = x[..., 1::2]
+    # Stacking on a new last axis and flattening the last two axes interleaves the pairs again.
+    rot_x = torch.stack([-x2, x1], dim=-1).flatten(-2)
+    return rot_x
+
+
+# Copied from transformers.models.cohere.modeling_cohere.apply_rotary_pos_emb
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
+    """Applies Rotary Position Embedding to the query and key tensors.
+
+    Args:
+        q (`torch.Tensor`): The query tensor.
+        k (`torch.Tensor`): The key tensor.
+        cos (`torch.Tensor`): The cosine part of the rotary embedding.
+        sin (`torch.Tensor`): The sine part of the rotary embedding.
+        position_ids (`torch.Tensor`, *optional*):
+            Deprecated and unused.
+        unsqueeze_dim (`int`, *optional*, defaults to 1):
+            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
+            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
+            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
+            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
+            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
+            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
+    Returns:
+        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
+    """
+    # The rotation is computed in float32; the results are cast back to the dtype of `q` at the end.
+    dtype = q.dtype
+    q = q.float()
+    k = k.float()
+    cos = cos.unsqueeze(unsqueeze_dim)
+    sin = sin.unsqueeze(unsqueeze_dim)
+    q_embed = (q * cos) + (rotate_half(q) * sin)
+    k_embed = (k * cos) + (rotate_half(k) * sin)
+    return q_embed.to(dtype=dtype), k_embed.to(dtype=dtype)
+
+
+# Copied from transformers.models.cohere.modeling_cohere.repeat_kv
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """
+    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
+    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
+    """
+    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+    # Nothing to repeat: the input tensor itself is returned.
+    if n_rep == 1:
+        return hidden_states
+    # Insert a repeat axis after the head axis, expand it, then merge it into the head axis.
+    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
+    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
+
+
+# Copied from transformers.models.llama.modeling_llama.eager_attention_forward
+def eager_attention_forward(
+    module: nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: Optional[torch.Tensor],
+    scaling: float,
+    dropout: float = 0.0,
+    **kwargs: Unpack[TransformersKwargs],
+):
+    """
+    Scaled dot-product attention written with plain matrix multiplications.
+
+    Args:
+        module: Attention module; `module.num_key_value_groups` and `module.training` are read.
+        query: Query tensor. Presumably `(batch, heads, seq_len, head_dim)` — TODO confirm against callers.
+        key: Key tensor; its heads are repeated `module.num_key_value_groups` times.
+        value: Value tensor; its heads are repeated `module.num_key_value_groups` times.
+        attention_mask: Optional additive mask, cropped on its last axis to the key length before being added.
+        scaling: Factor multiplied with the raw attention scores.
+        dropout: Dropout probability on the attention weights, active only when `module.training` is true.
+        **kwargs: Accepted for interface compatibility; not read in this function.
+
+    Returns:
+        Tuple `(attn_output, attn_weights)`; `attn_output` has dims 1 and 2 swapped and is contiguous.
+    """
+    key_states = repeat_kv(key, module.num_key_value_groups)
+    value_states = repeat_kv(value, module.num_key_value_groups)
+
+    attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
+    if attention_mask is not None:
+        causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
+        attn_weights = attn_weights + causal_mask
+
+    # Softmax is evaluated in float32 and the result is cast back to the dtype of `query`.
+    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
+    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
+    attn_output = torch.matmul(attn_weights, value_states)
+    attn_output = attn_output.transpose(1, 2).contiguous()
+
+    return attn_output, attn_weights
+
+
+class EfficientLoFTRAttention(nn.Module):
+    """
+    Multi-headed attention from 'Attention Is All You Need' paper
+
+    Works as self-attention, or as cross-attention when `encoder_hidden_states` is given. Rotary position
+    embeddings, when provided, are applied to queries and keys before they are split into heads.
+    """
+
+    def __init__(self, config: EfficientLoFTRConfig, layer_idx: int):
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+        self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
+        self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
+        self.scaling = self.head_dim**-0.5
+        self.attention_dropout = config.attention_dropout
+        self.is_causal = False
+
+        self.q_proj = nn.Linear(
+            config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.k_proj = nn.Linear(
+            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.v_proj = nn.Linear(
+            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.o_proj = nn.Linear(
+            config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
+        )
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        """
+        Args:
+            hidden_states (`torch.Tensor` of shape `(batch_size, seq_len, hidden_size)`):
+                Sequence the queries are computed from.
+            encoder_hidden_states (`torch.Tensor` of shape `(batch_size, kv_seq_len, hidden_size)`, *optional*):
+                Sequence the keys and values are computed from. When `None`, `hidden_states` is used.
+            position_embeddings (`tuple[torch.Tensor, torch.Tensor]`, *optional*):
+                `(cos, sin)` rotary tables applied to queries and keys. When `None`, no rotation is applied.
+
+        Returns:
+            `tuple`: the attention output, with the leading dimensions of `hidden_states`, after the output
+            projection, and the attention weights returned by the attention implementation.
+        """
+        batch_size, seq_len, dim = hidden_states.shape
+        input_shape = hidden_states.shape[:-1]
+
+        # Keys/values come from the other sequence in cross-attention, from `hidden_states` otherwise.
+        current_states = hidden_states if encoder_hidden_states is None else encoder_hidden_states
+        # The key/value sequence can differ in length from the query sequence (e.g. when queries and
+        # keys/values are aggregated with different kernels), so it is reshaped with its own length.
+        kv_seq_len = current_states.shape[1]
+
+        query_states = self.q_proj(hidden_states).view(batch_size, seq_len, -1, dim)
+        key_states = self.k_proj(current_states).view(batch_size, kv_seq_len, -1, dim)
+        value_states = self.v_proj(current_states).view(batch_size, kv_seq_len, -1, self.head_dim).transpose(1, 2)
+
+        # Rotary embeddings act on the un-split channel dimension, hence the late split into heads below.
+        if position_embeddings is not None:
+            cos, sin = position_embeddings
+            query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, unsqueeze_dim=2)
+
+        query_states = query_states.view(batch_size, seq_len, -1, self.head_dim).transpose(1, 2)
+        key_states = key_states.view(batch_size, kv_seq_len, -1, self.head_dim).transpose(1, 2)
+
+        attention_interface: Callable = eager_attention_forward
+        if self.config._attn_implementation != "eager":
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask=None,
+            dropout=0.0 if not self.training else self.attention_dropout,
+            scaling=self.scaling,
+            **kwargs,
+        )
+
+        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
+        attn_output = self.o_proj(attn_output)
+        return attn_output, attn_weights
+
+
+class EfficientLoFTRMLP(nn.Module):
+    """
+    Bias-free two-layer perceptron followed by a layer norm.
+
+    It maps inputs of width `2 * config.hidden_size` to `config.intermediate_size` and then to
+    `config.hidden_size`.
+    """
+
+    def __init__(self, config: EfficientLoFTRConfig):
+        super().__init__()
+        self.fc1 = nn.Linear(2 * config.hidden_size, config.intermediate_size, bias=False)
+        self.activation = ACT2FN[config.mlp_activation_function]
+        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size, bias=False)
+        self.layer_norm = nn.LayerNorm(config.hidden_size)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        """Applies `fc1`, the activation, `fc2` and the layer norm, in that order."""
+        projected = self.activation(self.fc1(hidden_states))
+        return self.layer_norm(self.fc2(projected))
+
+
+class EfficientLoFTRAggregatedAttention(nn.Module):
+    """
+    Attention computed on aggregated (downsampled) features.
+
+    The attention output is upsampled bilinearly, concatenated with the input on the channel axis, passed
+    through an MLP and added back to the input.
+    """
+
+    def __init__(self, config: EfficientLoFTRConfig, layer_idx: int):
+        super().__init__()
+
+        self.q_aggregation_kernel_size = config.q_aggregation_kernel_size
+        self.aggregation = EfficientLoFTRAggregationLayer(config)
+        self.attention = EfficientLoFTRAttention(config, layer_idx)
+        self.mlp = EfficientLoFTRMLP(config)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> torch.Tensor:
+        """
+        Returns `hidden_states` plus the attention-derived update, with the same layout as the input.
+
+        `encoder_hidden_states`, `position_embeddings` and `kwargs` are forwarded to the aggregation and
+        attention sub-modules.
+        """
+        batch_size, embed_dim, _height, _width = hidden_states.shape
+
+        # 1. Aggregate: both outputs are channels-last grids.
+        queries, keys_values = self.aggregation(hidden_states, encoder_hidden_states)
+        aggregated_h, aggregated_w = queries.shape[1:3]
+
+        # 2. Attend over the flattened grids.
+        attn_output, _ = self.attention(
+            queries.reshape(batch_size, -1, embed_dim),
+            keys_values.reshape(batch_size, -1, embed_dim),
+            position_embeddings=position_embeddings,
+            **kwargs,
+        )
+
+        # 3. Upsample:
+        # (batch_size, seq_len, embed_dim) -> (batch_size, embed_dim, h, w) with seq_len = h * w
+        attn_output = attn_output.permute(0, 2, 1).reshape(batch_size, embed_dim, aggregated_h, aggregated_w)
+        attn_output = nn.functional.interpolate(
+            attn_output, scale_factor=self.q_aggregation_kernel_size, mode="bilinear", align_corners=False
+        )
+
+        # 4. Fuse with the input in channels-last layout, then restore channels-first for the residual sum.
+        fused = torch.cat([hidden_states, attn_output], dim=1).permute(0, 2, 3, 1)
+        update = self.mlp(fused).permute(0, 3, 1, 2)
+        return hidden_states + update
+
+
+class EfficientLoFTRLocalFeatureTransformerLayer(GradientCheckpointingLayer):
+    """
+    One transformer layer: aggregated self-attention on every image, then aggregated cross-attention between
+    the two images of each pair.
+    """
+
+    def __init__(self, config: EfficientLoFTRConfig, layer_idx: int):
+        super().__init__()
+
+        self.self_attention = EfficientLoFTRAggregatedAttention(config, layer_idx)
+        self.cross_attention = EfficientLoFTRAggregatedAttention(config, layer_idx)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> torch.Tensor:
+        """
+        Args:
+            hidden_states: 5-D tensor unpacked as `(batch_size, 2, embed_dim, height, width)`; the second
+                axis indexes the two images of a pair.
+            position_embeddings: `(cos, sin)` tables, passed to the self-attention only.
+
+        Returns:
+            Tensor with the same 5-D layout as the input.
+        """
+        batch_size, _, embed_dim, height, width = hidden_states.shape
+
+        # Fold the pair axis into the batch axis so that self-attention treats every image independently.
+        hidden_states = hidden_states.reshape(-1, embed_dim, height, width)
+        hidden_states = self.self_attention(hidden_states, position_embeddings=position_embeddings, **kwargs)
+
+        ###
+        # Implementation of a bug in the original implementation regarding the cross-attention
+        # See : https://github.com/zju3dv/MatchAnything/issues/26
+        hidden_states = hidden_states.reshape(-1, 2, embed_dim, height, width)
+        features_0 = hidden_states[:, 0]
+        features_1 = hidden_states[:, 1]
+        # Order matters: the second call attends to the already updated `features_0`, not to the original one.
+        # No position embeddings are passed to the cross-attention.
+        features_0 = self.cross_attention(features_0, features_1, **kwargs)
+        features_1 = self.cross_attention(features_1, features_0, **kwargs)
+        hidden_states = torch.stack((features_0, features_1), dim=1)
+        ###
+
+        return hidden_states
+
+
+class EfficientLoFTRLocalFeatureTransformer(nn.Module):
+    """Stack of `config.num_attention_layers` local feature transformer layers."""
+
+    def __init__(self, config: EfficientLoFTRConfig):
+        super().__init__()
+        transformer_layers = [
+            EfficientLoFTRLocalFeatureTransformerLayer(config, layer_idx=layer_idx)
+            for layer_idx in range(config.num_attention_layers)
+        ]
+        self.layers = nn.ModuleList(transformer_layers)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> torch.Tensor:
+        """Applies every layer in order, giving each one the same `position_embeddings` and `kwargs`."""
+        for transformer_layer in self.layers:
+            hidden_states = transformer_layer(hidden_states, position_embeddings=position_embeddings, **kwargs)
+        return hidden_states
+
+
+class EfficientLoFTROutConvBlock(nn.Module):
+    """
+    Fusion step of the fine feature pyramid.
+
+    A residual map is projected with a 1x1 convolution, added to the incoming features, refined by two 3x3
+    convolutions (with batch norm and activation in between) and upsampled by a factor of 2.
+    """
+
+    def __init__(self, config: EfficientLoFTRConfig, hidden_size: int, intermediate_size: int):
+        super().__init__()
+
+        # Both 3x3 convolutions keep the spatial size and have no bias.
+        conv3x3_kwargs = {"kernel_size": 3, "stride": 1, "padding": 1, "bias": False}
+
+        self.out_conv1 = nn.Conv2d(hidden_size, intermediate_size, kernel_size=1, stride=1, padding=0, bias=False)
+        self.out_conv2 = nn.Conv2d(intermediate_size, intermediate_size, **conv3x3_kwargs)
+        self.batch_norm = nn.BatchNorm2d(intermediate_size)
+        self.activation = ACT2CLS[config.mlp_activation_function]()
+        self.out_conv3 = nn.Conv2d(intermediate_size, hidden_size, **conv3x3_kwargs)
+
+    def forward(self, hidden_states: torch.Tensor, residual_states: torch.Tensor) -> torch.Tensor:
+        """Fuses `residual_states` with `hidden_states` and returns the result upsampled 2x (bilinear)."""
+        fused = self.out_conv1(residual_states) + hidden_states
+        fused = self.activation(self.batch_norm(self.out_conv2(fused)))
+        fused = self.out_conv3(fused)
+        return nn.functional.interpolate(fused, scale_factor=2.0, mode="bilinear", align_corners=False)
+
+
+class EfficientLoFTRFineFusionLayer(nn.Module):
+    """
+    Builds fine-level features from the coarse features and the backbone residual features, then cuts them
+    into local windows for both images of each pair.
+    """
+
+    def __init__(self, config: EfficientLoFTRConfig):
+        super().__init__()
+
+        self.fine_kernel_size = config.fine_kernel_size
+
+        fine_fusion_dims = config.fine_fusion_dims
+        # 1x1 projection applied to the coarse features before the first upsampling.
+        self.out_conv = nn.Conv2d(
+            fine_fusion_dims[0], fine_fusion_dims[0], kernel_size=1, stride=1, padding=0, bias=False
+        )
+        # One fusion block per consecutive pair of `fine_fusion_dims`: block `i` is built with
+        # `hidden_size=fine_fusion_dims[i]` and `intermediate_size=fine_fusion_dims[i - 1]`.
+        self.out_conv_layers = nn.ModuleList()
+        for i in range(1, len(fine_fusion_dims)):
+            out_conv = EfficientLoFTROutConvBlock(config, fine_fusion_dims[i], fine_fusion_dims[i - 1])
+            self.out_conv_layers.append(out_conv)
+
+    def forward_pyramid(
+        self,
+        hidden_states: torch.Tensor,
+        residual_states: list[torch.Tensor],
+    ) -> torch.Tensor:
+        """
+        Projects `hidden_states`, upsamples them 2x, then applies fusion block `i` with `residual_states[i]`.
+
+        `residual_states` must be ordered to match `self.out_conv_layers`; `forward` passes them reversed.
+        """
+        hidden_states = self.out_conv(hidden_states)
+        hidden_states = nn.functional.interpolate(
+            hidden_states, scale_factor=2.0, mode="bilinear", align_corners=False
+        )
+        for i, layer in enumerate(self.out_conv_layers):
+            hidden_states = layer(hidden_states, residual_states[i])
+
+        return hidden_states
+
+    def forward(
+        self,
+        coarse_features: torch.Tensor,
+        residual_features: list[torch.Tensor],
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """
+        For each image pair, compute the fine features of pixels.
+        In both images, extract a window of fine features for every coarse pixel.
+        In the first image, each window is `fine_kernel_size x fine_kernel_size`.
+        In the second image, it is `(fine_kernel_size + 2) x (fine_kernel_size + 2)` (one pixel of padding).
+
+        Args:
+            coarse_features: 5-D tensor unpacked as `(batch_size, 2, embed_dim, coarse_height, coarse_width)`.
+            residual_features: Residual feature maps; they are used in reversed order.
+
+        Returns:
+            Tuple of the windows of the first and second image, each laid out as
+            `(batch_size, num_windows, window_area, channels)`.
+        """
+        batch_size, _, embed_dim, coarse_height, coarse_width = coarse_features.shape
+
+        # Fold the pair axis into the batch axis so that both images go through the pyramid together.
+        coarse_features = coarse_features.reshape(-1, embed_dim, coarse_height, coarse_width)
+        residual_features = list(reversed(residual_features))
+
+        # 1. Fine feature extraction
+        fine_features = self.forward_pyramid(coarse_features, residual_features)
+        _, fine_embed_dim, fine_height, fine_width = fine_features.shape
+
+        fine_features = fine_features.reshape(batch_size, 2, fine_embed_dim, fine_height, fine_width)
+        fine_features_0 = fine_features[:, 0]
+        fine_features_1 = fine_features[:, 1]
+
+        # 2. Unfold all local windows in crops
+        # The window stride is the fine/coarse resolution ratio (computed from the heights only).
+        stride = int(fine_height // coarse_height)
+        fine_features_0 = nn.functional.unfold(
+            fine_features_0, kernel_size=self.fine_kernel_size, stride=stride, padding=0
+        )
+        _, _, seq_len = fine_features_0.shape
+        # (batch, channels * k*k, windows) -> (batch, channels, k*k, windows) -> (batch, windows, k*k, channels)
+        fine_features_0 = fine_features_0.reshape(batch_size, -1, self.fine_kernel_size**2, seq_len)
+        fine_features_0 = fine_features_0.permute(0, 3, 2, 1)
+
+        # The larger kernel is paired with padding=1; the reshape below reuses `seq_len` from the first
+        # image, so it relies on both unfolds yielding the same number of windows.
+        fine_features_1 = nn.functional.unfold(
+            fine_features_1, kernel_size=self.fine_kernel_size + 2, stride=stride, padding=1
+        )
+        fine_features_1 = fine_features_1.reshape(batch_size, -1, (self.fine_kernel_size + 2) ** 2, seq_len)
+        fine_features_1 = fine_features_1.permute(0, 3, 2, 1)
+
+        return fine_features_0, fine_features_1
+
+
+@auto_docstring
+class EfficientLoFTRPreTrainedModel(PreTrainedModel):
+    """
+    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+    models.
+    """
+
+    config_class = EfficientLoFTRConfig
+    base_model_prefix = "efficientloftr"
+    main_input_name = "pixel_values"
+    supports_gradient_checkpointing = True
+    _supports_flash_attn = True
+    _supports_sdpa = True
+    _can_record_outputs = {
+        "hidden_states": EfficientLoFTRRepVGGBlock,
+        "attentions": EfficientLoFTRAttention,
+    }
+
+    def _init_weights(self, module: nn.Module) -> None:
+        """Initialize `module` in place: unit weight / zero bias for LayerNorm, normal weight / zero bias otherwise"""
+        if isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+        elif isinstance(module, (nn.Linear, nn.Conv2d, nn.Conv1d, nn.BatchNorm2d)):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.bias is not None:
+                module.bias.data.zero_()
+
+    # Copied from transformers.models.superpoint.modeling_superpoint.SuperPointPreTrainedModel.extract_one_channel_pixel_values with SuperPoint->EfficientLoFTR
+    def extract_one_channel_pixel_values(self, pixel_values: torch.FloatTensor) -> torch.FloatTensor:
+        """
+        Assuming pixel_values has shape (batch_size, 3, height, width), and that all channels values are the same,
+        extract the first channel value to get a tensor of shape (batch_size, 1, height, width) for EfficientLoFTR. This is
+        a workaround for the issue discussed in :
+        https://github.com/huggingface/transformers/pull/25786#issuecomment-1730176446
+
+        Args:
+            pixel_values: torch.FloatTensor of shape (batch_size, 3, height, width)
+
+        Returns:
+            pixel_values: torch.FloatTensor of shape (batch_size, 1, height, width)
+
+        """
+        return pixel_values[:, 0, :, :][:, None, :, :]
+
+
+@auto_docstring(
+    custom_intro="""
+    EfficientLoFTR model taking images as inputs and outputting the features of the images.
+    """
+)
+class EfficientLoFTRModel(EfficientLoFTRPreTrainedModel):
+    def __init__(self, config: EfficientLoFTRConfig):
+        super().__init__(config)
+
+        self.config = config
+        self.backbone = EfficientLoFTRepVGG(config)
+        self.local_feature_transformer = EfficientLoFTRLocalFeatureTransformer(config)
+        self.rotary_emb = EfficientLoFTRRotaryEmbedding(config=config)
+
+        self.post_init()
+
+    @check_model_inputs
+    @auto_docstring
+    def forward(
+        self,
+        pixel_values: torch.FloatTensor,
+        labels: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> BackboneOutput:
+        r"""
+        Examples:
+
+        ```python
+        >>> from transformers import AutoImageProcessor, AutoModel
+        >>> import torch
+        >>> from PIL import Image
+        >>> import requests
+
+        >>> url = "https://github.com/magicleap/SuperGluePretrainedNetwork/blob/master/assets/phototourism_sample_images/london_bridge_78916675_4568141288.jpg?raw=true"
+        >>> image1 = Image.open(requests.get(url, stream=True).raw)
+        >>> url = "https://github.com/magicleap/SuperGluePretrainedNetwork/blob/master/assets/phototourism_sample_images/london_bridge_19481797_2295892421.jpg?raw=true"
+        >>> image2 = Image.open(requests.get(url, stream=True).raw)
+        >>> images = [image1, image2]
+
+        >>> processor = AutoImageProcessor.from_pretrained("zju-community/efficient_loftr")
+        >>> model = AutoModel.from_pretrained("zju-community/efficient_loftr")
+
+        >>> with torch.no_grad():
+        ...     inputs = processor(images, return_tensors="pt")
+        ...     outputs = model(**inputs)
+        ```"""
+        if labels is not None:
+            raise ValueError("EfficientLoFTR is not trainable, no labels should be provided.")
+
+        if pixel_values.ndim != 5 or pixel_values.size(1) != 2:
+            raise ValueError("Input must be a 5D tensor of shape (batch_size, 2, num_channels, height, width)")
+        # Fold the image pair into the batch dimension, then keep a single channel
+        batch_size, _, channels, height, width = pixel_values.shape
+        pixel_values = pixel_values.reshape(batch_size * 2, channels, height, width)
+        pixel_values = self.extract_one_channel_pixel_values(pixel_values)
+
+        # 1. Local Feature CNN
+        features = self.backbone(pixel_values)
+        # Last stage outputs are coarse outputs
+        coarse_features = features[-1]
+        # Rest is residual features used in EfficientLoFTRFineFusionLayer
+        residual_features = features[:-1]
+        coarse_embed_dim, coarse_height, coarse_width = coarse_features.shape[-3:]
+
+        # 2. Coarse-level LoFTR module (cos/sin are expanded to one entry per image, then flattened to 3 dims)
+        cos, sin = self.rotary_emb(coarse_features)
+        cos = cos.expand(batch_size * 2, -1, -1, -1).reshape(batch_size * 2, -1, coarse_embed_dim)
+        sin = sin.expand(batch_size * 2, -1, -1, -1).reshape(batch_size * 2, -1, coarse_embed_dim)
+        position_embeddings = (cos, sin)
+        # Restore the explicit pair dimension: (batch_size, 2, embed_dim, coarse_height, coarse_width)
+        coarse_features = coarse_features.reshape(batch_size, 2, coarse_embed_dim, coarse_height, coarse_width)
+        coarse_features = self.local_feature_transformer(
+            coarse_features, position_embeddings=position_embeddings, **kwargs
+        )
+        # Transformed coarse features come first, followed by the residual features in their original order
+        features = (coarse_features,) + tuple(residual_features)
+
+        return BackboneOutput(feature_maps=features)
+
+
+def mask_border(tensor: torch.Tensor, border_margin: int, value: Union[bool, float, int]) -> torch.Tensor:
+    """
+    Overwrite the borders of the four non-batch axes of `tensor` with `value`.
+
+    The tensor is modified in place and the very same object is returned. Nothing is written when
+    `border_margin` is zero or negative.
+
+    Args:
+        tensor (`torch.Tensor` of shape `(batch_size, height_0, width_0, height_1, width_1)`):
+            The tensor to mask
+        border_margin (`int`) :
+            The size of the border
+        value (`Union[bool, int, float]`):
+            The value to place in the tensor's borders
+
+    Returns:
+        tensor (`torch.Tensor` of shape `(batch_size, height_0, width_0, height_1, width_1)`):
+            The masked tensor
+    """
+    if border_margin > 0:
+        leading_edge = slice(None, border_margin)
+        trailing_edge = slice(-border_margin, None)
+        # Axis 0 is the batch axis and is left untouched; leading edges are written first, then trailing edges.
+        for edge in (leading_edge, trailing_edge):
+            for axis in range(1, 5):
+                index = (slice(None),) * axis + (edge,)
+                tensor[index] = value
+
+    return tensor
+
+
+def create_meshgrid(
+    height: Union[int, torch.Tensor],
+    width: Union[int, torch.Tensor],
+    normalized_coordinates: bool = False,
+    device: Optional[torch.device] = None,
+    dtype: Optional[torch.dtype] = None,
+) -> torch.Tensor:
+    """
+    Adapted from kornia library : kornia/kornia/utils/grid.py:26
+
+    Generate a coordinate grid for an image. Entry `[0, row, col]` of the grid holds the pair `(x, y)`.
+
+    When the flag ``normalized_coordinates`` is set to True, the grid is
+    normalized to be in the range :math:`[-1,1]` to be consistent with the pytorch
+    function :py:func:`torch.nn.functional.grid_sample`.
+
+    Args:
+        height (`int`):
+            The image height (rows).
+        width (`int`):
+            The image width (cols).
+        normalized_coordinates (`bool`, defaults to `False`):
+            Whether to normalize coordinates in the range :math:`[-1,1]` in order to be consistent with the
+            PyTorch function :py:func:`torch.nn.functional.grid_sample`.
+        device (`torch.device`):
+            The device on which the grid will be generated.
+        dtype (`torch.dtype`):
+            The data type of the generated grid.
+
+    Return:
+        grid (`torch.Tensor` of shape `(1, height, width, 2)`):
+            The grid tensor.
+
+    Example:
+        >>> create_meshgrid(2, 2, normalized_coordinates=True)
+        tensor([[[[-1., -1.],
+                  [ 1., -1.]],
+        <BLANKLINE>
+                 [[-1.,  1.],
+                  [ 1.,  1.]]]])
+
+        >>> create_meshgrid(2, 2, normalized_coordinates=False)
+        tensor([[[[0., 0.],
+                  [1., 0.]],
+        <BLANKLINE>
+                 [[0., 1.],
+                  [1., 1.]]]])
+
+    """
+    xs = torch.linspace(0, width - 1, width, device=device, dtype=dtype)
+    ys = torch.linspace(0, height - 1, height, device=device, dtype=dtype)
+    if normalized_coordinates:
+        xs = (xs / (width - 1) - 0.5) * 2
+        ys = (ys / (height - 1) - 0.5) * 2
+    grid = torch.stack(torch.meshgrid(xs, ys, indexing="ij"), dim=-1)  # (width, height, 2), last dim is (x, y)
+    grid = grid.permute(1, 0, 2).unsqueeze(0)  # -> (1, height, width, 2), also for non-square grids
+    return grid
+
+
+def spatial_expectation2d(input: torch.Tensor, normalized_coordinates: bool = True) -> torch.Tensor:
+    r"""
+    Adapted from kornia library : kornia/geometry/subpix/dsnt.py:76
+    Compute the expectation of coordinate values using spatial probabilities.
+
+    The input heatmap is assumed to represent a valid spatial probability distribution,
+    which can be achieved using :func:`~kornia.geometry.subpixel.spatial_softmax2d`.
+
+    Args:
+        input (`torch.Tensor` of shape `(batch_size, embed_dim, height, width)`):
+            The input tensor representing dense spatial probabilities. It does not need to be contiguous.
+        normalized_coordinates (`bool`):
+            Whether to return the coordinates normalized in the range of :math:`[-1, 1]`. Otherwise, it will return
+            the coordinates in the range of the input shape.
+
+    Returns:
+        output (`torch.Tensor` of shape `(batch_size, embed_dim, 2)`):
+            Expected value of the 2D coordinates. Output order of the coordinates is (x, y).
+
+    Examples:
+        >>> heatmaps = torch.tensor([[[
+        ... [0., 0., 0.],
+        ... [0., 0., 0.],
+        ... [0., 1., 0.]]]])
+        >>> spatial_expectation2d(heatmaps, False)
+        tensor([[[1., 2.]]])
+
+    """
+    batch_size, embed_dim, height, width = input.shape
+
+    # Create coordinates grid.
+    grid = create_meshgrid(height, width, normalized_coordinates, input.device)
+    grid = grid.to(input.dtype)
+
+    pos_x = grid[..., 0].reshape(-1)
+    pos_y = grid[..., 1].reshape(-1)
+    # `reshape` (rather than `view`) also accepts non-contiguous heatmaps, e.g. permuted or sliced tensors
+    input_flat = input.reshape(batch_size, embed_dim, -1)
+
+    # Compute the expectation of the coordinates.
+    expected_y = torch.sum(pos_y * input_flat, -1, keepdim=True)
+    expected_x = torch.sum(pos_x * input_flat, -1, keepdim=True)
+
+    output = torch.cat([expected_x, expected_y], -1)
+
+    return output.view(batch_size, embed_dim, 2)
+
+
+@auto_docstring(
+    custom_intro="""
+    EfficientLoFTR model taking images as inputs and outputting the matching of them.
+    """
+)
+class EfficientLoFTRForKeypointMatching(EfficientLoFTRPreTrainedModel):
+    """EfficientLoFTR dense image matcher
+
+    Given two images, we determine the correspondences by:
+      1. Extracting coarse and fine features through a backbone
+      2. Transforming coarse features through self and cross attention
+      3. Matching coarse features to obtain coarse coordinates of matches
+      4. Obtaining full resolution fine features by fusing transformed and backbone coarse features
+      5. Refining the coarse matches using fine feature patches centered at each coarse match in a two-stage refinement
+
+    Yifan Wang, Xingyi He, Sida Peng, Dongli Tan and Xiaowei Zhou.
+    Efficient LoFTR: Semi-Dense Local Feature Matching with Sparse-Like Speed
+    In CVPR, 2024. https://huggingface.co/papers/2403.04765
+    """
+
+    def __init__(self, config: EfficientLoFTRConfig):
+        super().__init__(config)
+
+        self.config = config
+        self.efficientloftr = EfficientLoFTRModel(config)
+        self.refinement_layer = EfficientLoFTRFineFusionLayer(config)
+
+        self.post_init()
+
+    def _get_matches_from_scores(self, scores: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+        """
+        Based on a keypoint score matrix, compute the best keypoint matches between the first and second image.
+        Every coarse position is kept in the output so that all image pairs of the batch share the same shape;
+        positions whose best masked score is not strictly positive get the index -1 (no match found).
+
+        Note:
+            This step can be done as a postprocessing step, because does not involve any model weights/params.
+            However, we keep it in the modeling code for consistency with other keypoint matching models AND for
+            easier torch.compile/torch.export (all ops are in torch).
+
+        Args:
+            scores (`torch.Tensor` of shape `(batch_size, height_0, width_0, height_1, width_1)`):
+                Scores of keypoints
+
+        Returns:
+            matching_indices (`torch.Tensor` of shape `(batch_size, 2, height * width)`):
+                Indices representing which pixel in the first image matches which pixel in the second image
+            matching_scores (`torch.Tensor` of shape `(batch_size, 2, height * width)`):
+                Scores of each match
+        """
+        batch_size, height0, width0, height1, width1 = scores.shape
+
+        scores = scores.view(batch_size, height0 * width0, height1 * width1)
+
+        # For each keypoint, get the best match
+        max_0 = scores.max(2, keepdim=True).values
+        max_1 = scores.max(1, keepdim=True).values
+
+        # 1. Thresholding
+        mask = scores > self.config.coarse_matching_threshold
+
+        # 2. Border removal
+        mask = mask.reshape(batch_size, height0, width0, height1, width1)
+        mask = mask_border(mask, self.config.coarse_matching_border_removal, False)
+        mask = mask.reshape(batch_size, height0 * width0, height1 * width1)
+
+        # 3. Mutual nearest neighbors
+        mask = mask * (scores == max_0) * (scores == max_1)
+
+        # 4. Fine coarse matches
+        masked_scores = scores * mask
+        matching_scores_0, max_indices_0 = masked_scores.max(1)
+        matching_scores_1, max_indices_1 = masked_scores.max(2)
+        # Stack along dim 1 (like the scores) so that every image pair keeps its own indices when batch_size > 1
+        matching_indices = torch.stack([max_indices_0, max_indices_1], dim=1)
+        matching_scores = torch.stack([matching_scores_0, matching_scores_1], dim=1)
+
+        # For the keypoints not meeting the threshold score, set the indices to -1 which corresponds to no matches found
+        matching_indices = torch.where(matching_scores > 0, matching_indices, -1)
+
+        return matching_indices, matching_scores
+
+    def _coarse_matching(
+        self, coarse_features: torch.Tensor, coarse_scale: float
+    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        """
+        For each image pair, compute the matching confidence between each coarse element (by default (image_height / 8)
+        * (image_width / 8 elements)) from the first image to the second image.
+
+        Note:
+            This step can be done as a postprocessing step, because does not involve any model weights/params.
+            However, we keep it in the modeling code for consistency with other keypoint matching models AND for
+            easier torch.compile/torch.export (all ops are in torch).
+
+        Args:
+            coarse_features (`torch.Tensor` of shape `(batch_size, 2, hidden_size, coarse_height, coarse_width)`):
+                Coarse features
+            coarse_scale (`float`): Scale between the image size and the coarse size
+
+        Returns:
+            keypoints (`torch.Tensor` of shape `(batch_size, 2, num_matches, 2)`):
+                Keypoints coordinates.
+            matching_scores (`torch.Tensor` of shape `(batch_size, 2, num_matches)`):
+                The confidence matching score of each keypoint.
+            matched_indices (`torch.Tensor` of shape `(batch_size, 2, num_matches)`):
+                Indices which indicates which keypoint in an image matched with which keypoint in the other image. For
+                both image in the pair.
+        """
+        batch_size, _, embed_dim, height, width = coarse_features.shape
+
+        # (batch_size, 2, embed_dim, height, width) -> (batch_size, 2, height * width, embed_dim)
+        coarse_features = coarse_features.permute(0, 1, 3, 4, 2)
+        coarse_features = coarse_features.reshape(batch_size, 2, -1, embed_dim)
+
+        coarse_features = coarse_features / coarse_features.shape[-1] ** 0.5
+        coarse_features_0 = coarse_features[:, 0]
+        coarse_features_1 = coarse_features[:, 1]
+
+        similarity = coarse_features_0 @ coarse_features_1.transpose(-1, -2)
+        similarity = similarity / self.config.coarse_matching_temperature
+
+        if self.config.coarse_matching_skip_softmax:
+            confidence = similarity
+        else:
+            confidence = nn.functional.softmax(similarity, 1) * nn.functional.softmax(similarity, 2)
+
+        confidence = confidence.view(batch_size, height, width, height, width)
+        matched_indices, matching_scores = self._get_matches_from_scores(confidence)
+
+        keypoints = torch.stack([matched_indices % width, matched_indices // width], dim=-1) * coarse_scale
+
+        return keypoints, matching_scores, matched_indices
+
+    def _get_first_stage_fine_matching(
+        self,
+        fine_confidence: torch.Tensor,
+        coarse_matched_keypoints: torch.Tensor,
+        fine_window_size: int,
+        fine_scale: float,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """
+        For each coarse pixel, retrieve the highest fine confidence score and index.
+        The index represents the matching between a pixel position in the fine window in the first image and a pixel
+        position in the fine window of the second image.
+        For example, for a fine_window_size of 64 (8 * 8), the index 2474 represents the matching between the index 38
+        (2474 // 64) in the fine window of the first image, and the index 42 (2474 % 64) in the second image. This means
+        that 38 which corresponds to the position (4, 6) (38 // 8 and 38 % 8) is matched with the position (5, 2). In
+        this example the coarse matched coordinate will be shifted to the matched fine coordinates in both images.
+
+        Note:
+            This step can be done as a postprocessing step, because does not involve any model weights/params.
+            However, we keep it in the modeling code for consistency with other keypoint matching models AND for
+            easier torch.compile/torch.export (all ops are in torch).
+
+        Args:
+            fine_confidence (`torch.Tensor` of shape `(batch_size, num_matches, fine_window_size, fine_window_size)`):
+                First stage confidence of matching fine features between the first and the second image
+            coarse_matched_keypoints (`torch.Tensor` of shape `(batch_size, 2, num_matches, 2)`):
+                Coarse matched keypoint between the first and the second image.
+            fine_window_size (`int`):
+                Size of the window used to refine matches
+            fine_scale (`float`):
+                Scale between the size of fine features and coarse features
+
+        Returns:
+            indices (`torch.Tensor` of shape `(batch_size, 2, num_matches, 1)`):
+                Indices of the fine coordinate matched in the fine window
+            fine_matches (`torch.Tensor` of shape `(batch_size, 2, num_matches, 2)`):
+                Coordinates of matched keypoints after the first fine stage
+        """
+        batch_size, num_keypoints, _, _ = fine_confidence.shape
+        fine_kernel_size = torch_int(fine_window_size**0.5)
+
+        fine_confidence = fine_confidence.reshape(batch_size, num_keypoints, -1)
+        _, indices = torch.max(fine_confidence, dim=-1)
+        indices = indices[..., None]
+        indices_0 = indices // fine_window_size
+        indices_1 = indices % fine_window_size
+
+        grid = create_meshgrid(
+            fine_kernel_size,
+            fine_kernel_size,
+            normalized_coordinates=False,
+            device=fine_confidence.device,
+            dtype=fine_confidence.dtype,
+        )
+        grid = grid - (fine_kernel_size // 2) + 0.5
+        grid = grid.reshape(1, 1, -1, 2).expand(batch_size, num_keypoints, -1, -1)  # window positions live on dim 2
+        delta_0 = torch.gather(grid, 2, indices_0.unsqueeze(-1).expand(-1, -1, -1, 2)).squeeze(2)
+        delta_1 = torch.gather(grid, 2, indices_1.unsqueeze(-1).expand(-1, -1, -1, 2)).squeeze(2)
+
+        fine_matches_0 = coarse_matched_keypoints[:, 0] + delta_0 * fine_scale
+        fine_matches_1 = coarse_matched_keypoints[:, 1] + delta_1 * fine_scale
+
+        indices = torch.stack([indices_0, indices_1], dim=1)
+        fine_matches = torch.stack([fine_matches_0, fine_matches_1], dim=1)
+
+        return indices, fine_matches
+
+    def _get_second_stage_fine_matching(
+        self,
+        indices: torch.Tensor,
+        fine_matches: torch.Tensor,
+        fine_confidence: torch.Tensor,
+        fine_window_size: int,
+        fine_scale: float,
+    ) -> torch.Tensor:
+        """
+        For the given position in their respective fine windows, retrieve the 3x3 fine confidences around this position.
+        After applying softmax to these confidences, compute the 2D spatial expected coordinates.
+        Shift the first stage fine matching with these expected coordinates.
+
+        Note:
+            This step can be done as a postprocessing step, because does not involve any model weights/params.
+            However, we keep it in the modeling code for consistency with other keypoint matching models AND for
+            easier torch.compile/torch.export (all ops are in torch).
+
+        Args:
+            indices (`torch.Tensor` of shape `(batch_size, 2, num_matches, 1)`):
+                Indices representing the position of each keypoint in the fine window
+            fine_matches (`torch.Tensor` of shape `(batch_size, 2, num_matches, 2)`):
+                Coordinates of matched keypoints after the first fine stage
+            fine_confidence (`torch.Tensor` of shape `(batch_size, num_matches, fine_window_size, (fine_kernel_size + 2) ** 2)`):
+                Second stage confidence of matching fine features between the first and the second image
+            fine_window_size (`int`):
+                Size of the window used to refine matches
+            fine_scale (`float`):
+                Scale between the size of fine features and coarse features
+
+        Returns:
+            fine_matches (`torch.Tensor` of shape `(batch_size, 2, num_matches, 2)`):
+                Coordinates of matched keypoints after the second fine stage
+        """
+        batch_size, num_keypoints, _, _ = fine_confidence.shape
+        fine_kernel_size = torch_int(fine_window_size**0.5)
+
+        indices_0 = indices[:, 0]
+        indices_1 = indices[:, 1]
+        indices_1_i = indices_1 // fine_kernel_size
+        indices_1_j = indices_1 % fine_kernel_size
+
+        # The index tensors below broadcast together to (batch_size, num_matches, 3, 3)
+        batch_indices = torch.arange(batch_size, device=indices_0.device).reshape(batch_size, 1, 1, 1)
+        matches_indices = torch.arange(num_keypoints, device=indices_0.device).reshape(1, num_keypoints, 1, 1)
+        indices_0 = indices_0[..., None]
+        indices_1_i = indices_1_i[..., None]
+        indices_1_j = indices_1_j[..., None]
+
+        delta = create_meshgrid(3, 3, normalized_coordinates=True, device=indices_0.device).to(torch.long)
+        delta = delta[None, ...]
+        # NOTE(review): indices_1 comes from the unpadded window but indexes the padded one; confirm no +1 is missing
+        indices_1_i = indices_1_i + delta[..., 1]
+        indices_1_j = indices_1_j + delta[..., 0]
+
+        fine_confidence = fine_confidence.reshape(
+            batch_size, num_keypoints, fine_window_size, fine_kernel_size + 2, fine_kernel_size + 2
+        )
+        # (batch_size, seq_len, fine_window_size, fine_kernel_size + 2, fine_kernel_size + 2) -> (batch_size, seq_len, 3, 3)
+        fine_confidence = fine_confidence[batch_indices, matches_indices, indices_0, indices_1_i, indices_1_j]
+        fine_confidence = fine_confidence.reshape(batch_size, num_keypoints, 9)
+        fine_confidence = nn.functional.softmax(
+            fine_confidence / self.config.fine_matching_regress_temperature, dim=-1
+        )
+        # Keep the batch dimension: every image pair must be shifted by its own sub-pixel expectation
+        heatmap = fine_confidence.reshape(batch_size, num_keypoints, 3, 3)
+        fine_coordinates_normalized = spatial_expectation2d(heatmap, True)
+
+        fine_matches_0 = fine_matches[:, 0]
+        fine_matches_1 = fine_matches[:, 1] + (fine_coordinates_normalized * (3 // 2) * fine_scale)
+
+        fine_matches = torch.stack([fine_matches_0, fine_matches_1], dim=1)
+
+        return fine_matches
+
+    def _fine_matching(
+        self,
+        fine_features_0: torch.Tensor,
+        fine_features_1: torch.Tensor,
+        coarse_matched_keypoints: torch.Tensor,
+        fine_scale: float,
+    ) -> torch.Tensor:
+        """
+        For each coarse pixel with a corresponding window of fine features, compute the matching confidence between fine
+        features in the first image and the second image.
+
+        Fine features are sliced in two part :
+        - The first part used for the first stage are the first fine_hidden_size - config.fine_matching_slice_dim (64 - 8
+         = 56 by default) features.
+        - The second part used for the second stage are the last config.fine_matching_slice_dim (8 by default) features.
+
+        Each part is used to compute a fine confidence tensor of the following shape :
+        (batch_size, (coarse_height * coarse_width), fine_window_size, (fine_kernel_size + 2) ** 2)
+        They correspond to the score between each fine pixel in the first image and each fine pixel in the second image.
+
+        Args:
+            fine_features_0 (`torch.Tensor` of shape `(batch_size, num_matches, fine_kernel_size ** 2, fine_hidden_size)`):
+                Fine features from the first image
+            fine_features_1 (`torch.Tensor` of shape `(batch_size, num_matches, (fine_kernel_size + 2) ** 2,
+            fine_hidden_size)`):
+                Fine features from the second image
+            coarse_matched_keypoints (`torch.Tensor` of shape `(batch_size, 2, num_matches, 2)`):
+                Keypoint coordinates found in coarse matching for the first and second image
+            fine_scale (`float`):
+                Scale between the size of fine features and coarse features
+
+        Returns:
+            fine_coordinates (`torch.Tensor` of shape `(batch_size, 2, num_matches, 2)`):
+                Matched keypoint between the first and the second image, refined by the two fine stages. The first
+                image coordinates are only changed by the first stage.
+
+        """
+        batch_size, num_keypoints, fine_window_size, fine_embed_dim = fine_features_0.shape
+        fine_matching_slice_dim = self.config.fine_matching_slice_dim
+
+        fine_kernel_size = torch_int(fine_window_size**0.5)
+
+        # Split fine features into first and second stage features
+        split_fine_features_0 = torch.split(fine_features_0, fine_embed_dim - fine_matching_slice_dim, -1)
+        split_fine_features_1 = torch.split(fine_features_1, fine_embed_dim - fine_matching_slice_dim, -1)
+
+        # Retrieve first stage fine features
+        fine_features_0 = split_fine_features_0[0]
+        fine_features_1 = split_fine_features_1[0]
+
+        # Normalize first stage fine features
+        fine_features_0 = fine_features_0 / fine_features_0.shape[-1] ** 0.5
+        fine_features_1 = fine_features_1 / fine_features_1.shape[-1] ** 0.5
+
+        # Compute first stage confidence: dual-softmax over the two window dimensions, not over the keypoints
+        fine_confidence = fine_features_0 @ fine_features_1.transpose(-1, -2)
+        fine_confidence = nn.functional.softmax(fine_confidence, -2) * nn.functional.softmax(fine_confidence, -1)
+        fine_confidence = fine_confidence.reshape(
+            batch_size, num_keypoints, fine_window_size, fine_kernel_size + 2, fine_kernel_size + 2
+        )
+        fine_confidence = fine_confidence[..., 1:-1, 1:-1]
+        first_stage_fine_confidence = fine_confidence.reshape(
+            batch_size, num_keypoints, fine_window_size, fine_window_size
+        )
+
+        fine_indices, fine_matches = self._get_first_stage_fine_matching(
+            first_stage_fine_confidence,
+            coarse_matched_keypoints,
+            fine_window_size,
+            fine_scale,
+        )
+
+        # Retrieve second stage fine features
+        fine_features_0 = split_fine_features_0[1]
+        fine_features_1 = split_fine_features_1[1]
+
+        # Normalize second stage fine features
+        fine_features_1 = fine_features_1 / fine_matching_slice_dim**0.5
+
+        # Compute second stage fine confidence
+        second_stage_fine_confidence = fine_features_0 @ fine_features_1.transpose(-1, -2)
+
+        fine_coordinates = self._get_second_stage_fine_matching(
+            fine_indices,
+            fine_matches,
+            second_stage_fine_confidence,
+            fine_window_size,
+            fine_scale,
+        )
+
+        return fine_coordinates
+
+    @auto_docstring
+    @can_return_tuple
+    def forward(
+        self,
+        pixel_values: torch.FloatTensor,
+        labels: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> KeypointMatchingOutput:
+        r"""
+        Examples:
+
+        ```python
+        >>> from transformers import AutoImageProcessor, AutoModel
+        >>> import torch
+        >>> from PIL import Image
+        >>> import requests
+
+        >>> url = "https://github.com/magicleap/SuperGluePretrainedNetwork/blob/master/assets/phototourism_sample_images/london_bridge_78916675_4568141288.jpg?raw=true"
+        >>> image1 = Image.open(requests.get(url, stream=True).raw)
+        >>> url = "https://github.com/magicleap/SuperGluePretrainedNetwork/blob/master/assets/phototourism_sample_images/london_bridge_19481797_2295892421.jpg?raw=true"
+        >>> image2 = Image.open(requests.get(url, stream=True).raw)
+        >>> images = [image1, image2]
+
+        >>> processor = AutoImageProcessor.from_pretrained("zju-community/efficient_loftr")
+        >>> model = AutoModel.from_pretrained("zju-community/efficient_loftr")
+
+        >>> with torch.no_grad():
+        ...     inputs = processor(images, return_tensors="pt")
+        ...     outputs = model(**inputs)
+        ```"""
+        if labels is not None:
+            raise ValueError("EfficientLoFTR is not trainable, no labels should be provided.")
+
+        # 1. Extract coarse and residual features
+        model_outputs: BackboneOutput = self.efficientloftr(pixel_values, **kwargs)
+        features = model_outputs.feature_maps
+
+        # 2. Compute coarse-level matching
+        coarse_features = features[0]
+        _, coarse_height, coarse_width = coarse_features.shape[-3:]
+        batch_size, _, _, height, width = pixel_values.shape
+        coarse_scale = height / coarse_height
+        coarse_keypoints, coarse_matching_scores, coarse_matched_indices = self._coarse_matching(
+            coarse_features, coarse_scale
+        )
+
+        # 3. Fine-level refinement
+        residual_features = features[1:]
+        coarse_features = coarse_features / self.config.hidden_size**0.5
+        fine_features_0, fine_features_1 = self.refinement_layer(coarse_features, residual_features)
+
+        # Filter fine features with coarse matches indices (an index of -1, i.e. "no match", selects the last window)
+        _, _, num_keypoints = coarse_matching_scores.shape
+        batch_indices = torch.arange(batch_size, device=coarse_matched_indices.device)[..., None]
+        fine_features_0 = fine_features_0[batch_indices, coarse_matched_indices[:, 0]]
+        fine_features_1 = fine_features_1[batch_indices, coarse_matched_indices[:, 1]]
+
+        # 4. Compute fine-level matching
+        fine_height = torch_int(coarse_height * coarse_scale)
+        fine_scale = height / fine_height
+        matching_keypoints = self._fine_matching(fine_features_0, fine_features_1, coarse_keypoints, fine_scale)
+        # Express the keypoints relatively to the image size: x is divided by the width, y by the height
+        matching_keypoints[:, :, :, 0] = matching_keypoints[:, :, :, 0] / width
+        matching_keypoints[:, :, :, 1] = matching_keypoints[:, :, :, 1] / height
+
+        return KeypointMatchingOutput(
+            matches=coarse_matched_indices,
+            matching_scores=coarse_matching_scores,
+            keypoints=matching_keypoints,
+            hidden_states=model_outputs.hidden_states,
+            attentions=model_outputs.attentions,
+        )
+
+
+__all__ = ["EfficientLoFTRPreTrainedModel", "EfficientLoFTRModel", "EfficientLoFTRForKeypointMatching"]
diff --git a/phivenv/Lib/site-packages/transformers/models/efficientnet/__init__.py b/phivenv/Lib/site-packages/transformers/models/efficientnet/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..24d58e81167ec8729d04fa52ce96ebc1737a5982
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/efficientnet/__init__.py
@@ -0,0 +1,29 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_efficientnet import *
+    from .image_processing_efficientnet import *
+    from .image_processing_efficientnet_fast import *
+    from .modeling_efficientnet import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/phivenv/Lib/site-packages/transformers/models/efficientnet/__pycache__/__init__.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/efficientnet/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a694d467a48167547ad58725473b0ee57a508c55
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/efficientnet/__pycache__/__init__.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/efficientnet/__pycache__/configuration_efficientnet.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/efficientnet/__pycache__/configuration_efficientnet.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..67426a1158816338c08133ed1a65ecb3b1d3efc6
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/efficientnet/__pycache__/configuration_efficientnet.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/efficientnet/__pycache__/image_processing_efficientnet.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/efficientnet/__pycache__/image_processing_efficientnet.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e1e07d4204e076bca18424a8ee57ad6bb663f857
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/efficientnet/__pycache__/image_processing_efficientnet.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/efficientnet/__pycache__/image_processing_efficientnet_fast.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/efficientnet/__pycache__/image_processing_efficientnet_fast.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6f5f429288cbbb8fe846bce2e7a067ac12c6bbc8
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/efficientnet/__pycache__/image_processing_efficientnet_fast.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/efficientnet/__pycache__/modeling_efficientnet.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/efficientnet/__pycache__/modeling_efficientnet.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9e7a115703ffa99ec4011082242388264638162c
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/efficientnet/__pycache__/modeling_efficientnet.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/efficientnet/configuration_efficientnet.py b/phivenv/Lib/site-packages/transformers/models/efficientnet/configuration_efficientnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..549311902903357604bcf09bd28ddb391d007793
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/efficientnet/configuration_efficientnet.py
@@ -0,0 +1,169 @@
+# coding=utf-8
+# Copyright 2023 Google Research, Inc. and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""EfficientNet model configuration"""
+
+from collections import OrderedDict
+from collections.abc import Mapping
+
+from packaging import version
+
+from ...configuration_utils import PretrainedConfig
+from ...onnx import OnnxConfig
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class EfficientNetConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of an [`EfficientNetModel`]. It is used to instantiate an
+    EfficientNet model according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of the EfficientNet
+    [google/efficientnet-b7](https://huggingface.co/google/efficientnet-b7) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        num_channels (`int`, *optional*, defaults to 3):
+            The number of input channels.
+        image_size (`int`, *optional*, defaults to 600):
+            The input image size.
+        width_coefficient (`float`, *optional*, defaults to 2.0):
+            Scaling coefficient for network width at each stage.
+        depth_coefficient (`float`, *optional*, defaults to 3.1):
+            Scaling coefficient for network depth at each stage.
+        depth_divisor (`int`, *optional*, defaults to 8):
+            A unit of network width.
+        kernel_sizes (`list[int]`, *optional*, defaults to `[3, 3, 5, 3, 5, 5, 3]`):
+            List of kernel sizes to be used in each block.
+        in_channels (`list[int]`, *optional*, defaults to `[32, 16, 24, 40, 80, 112, 192]`):
+            List of input channel sizes to be used in each block for convolutional layers.
+        out_channels (`list[int]`, *optional*, defaults to `[16, 24, 40, 80, 112, 192, 320]`):
+            List of output channel sizes to be used in each block for convolutional layers.
+        depthwise_padding (`list[int]`, *optional*, defaults to `[]`):
+            List of block indices with square padding.
+        strides (`list[int]`, *optional*, defaults to `[1, 2, 2, 2, 1, 2, 1]`):
+            List of stride sizes to be used in each block for convolutional layers.
+        num_block_repeats (`list[int]`, *optional*, defaults to `[1, 2, 2, 3, 3, 4, 1]`):
+            List of the number of times each block is to be repeated.
+        expand_ratios (`list[int]`, *optional*, defaults to `[1, 6, 6, 6, 6, 6, 6]`):
+            List of scaling coefficient of each block.
+        squeeze_expansion_ratio (`float`, *optional*, defaults to 0.25):
+            Squeeze expansion ratio.
+        hidden_act (`str` or `function`, *optional*, defaults to `"swish"`):
+            The non-linear activation function (function or string) in each block. If string, `"gelu"`, `"relu"`,
+            `"selu"`, `"gelu_new"`, `"silu"` and `"mish"` are supported.
+        hidden_dim (`int`, *optional*, defaults to 2560):
+            The hidden dimension of the layer before the classification head.
+        pooling_type (`str` or `function`, *optional*, defaults to `"mean"`):
+            Type of final pooling to be applied before the dense classification head. Available options are [`"mean"`,
+            `"max"`]
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        batch_norm_eps (`float`, *optional*, defaults to 1e-3):
+            The epsilon used by the batch normalization layers.
+        batch_norm_momentum (`float`, *optional*, defaults to 0.99):
+            The momentum used by the batch normalization layers.
+        dropout_rate (`float`, *optional*, defaults to 0.5):
+            The dropout rate to be applied before final classifier layer.
+        drop_connect_rate (`float`, *optional*, defaults to 0.2):
+            The drop rate for skip connections.
+
+    Example:
+    ```python
+    >>> from transformers import EfficientNetConfig, EfficientNetModel
+
+    >>> # Initializing an EfficientNet efficientnet-b7 style configuration
+    >>> configuration = EfficientNetConfig()
+
+    >>> # Initializing a model (with random weights) from the efficientnet-b7 style configuration
+    >>> model = EfficientNetModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "efficientnet"
+
+    def __init__(
+        self,
+        num_channels: int = 3,
+        image_size: int = 600,
+        width_coefficient: float = 2.0,
+        depth_coefficient: float = 3.1,
+        depth_divisor: int = 8,
+        kernel_sizes: list[int] = [3, 3, 5, 3, 5, 5, 3],
+        in_channels: list[int] = [32, 16, 24, 40, 80, 112, 192],
+        out_channels: list[int] = [16, 24, 40, 80, 112, 192, 320],
+        depthwise_padding: list[int] = [],
+        strides: list[int] = [1, 2, 2, 2, 1, 2, 1],
+        num_block_repeats: list[int] = [1, 2, 2, 3, 3, 4, 1],
+        expand_ratios: list[int] = [1, 6, 6, 6, 6, 6, 6],
+        squeeze_expansion_ratio: float = 0.25,
+        hidden_act: str = "swish",
+        hidden_dim: int = 2560,
+        pooling_type: str = "mean",
+        initializer_range: float = 0.02,
+        batch_norm_eps: float = 0.001,
+        batch_norm_momentum: float = 0.99,
+        dropout_rate: float = 0.5,
+        drop_connect_rate: float = 0.2,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        self.num_channels = num_channels
+        self.image_size = image_size
+        self.width_coefficient = width_coefficient
+        self.depth_coefficient = depth_coefficient
+        self.depth_divisor = depth_divisor
+        self.kernel_sizes = kernel_sizes
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.depthwise_padding = depthwise_padding
+        self.strides = strides
+        self.num_block_repeats = num_block_repeats
+        self.expand_ratios = expand_ratios
+        self.squeeze_expansion_ratio = squeeze_expansion_ratio
+        self.hidden_act = hidden_act
+        self.hidden_dim = hidden_dim
+        self.pooling_type = pooling_type
+        self.initializer_range = initializer_range
+        self.batch_norm_eps = batch_norm_eps
+        self.batch_norm_momentum = batch_norm_momentum
+        self.dropout_rate = dropout_rate
+        self.drop_connect_rate = drop_connect_rate
+        self.num_hidden_layers = sum(num_block_repeats) * 4
+
+
+class EfficientNetOnnxConfig(OnnxConfig):
+    torch_onnx_minimum_version = version.parse("1.11")
+
+    @property
+    def inputs(self) -> Mapping[str, Mapping[int, str]]:
+        return OrderedDict(
+            [
+                ("pixel_values", {0: "batch", 1: "num_channels", 2: "height", 3: "width"}),
+            ]
+        )
+
+    @property
+    def atol_for_validation(self) -> float:
+        return 1e-5
+
+
+__all__ = ["EfficientNetConfig", "EfficientNetOnnxConfig"]
diff --git a/phivenv/Lib/site-packages/transformers/models/efficientnet/image_processing_efficientnet.py b/phivenv/Lib/site-packages/transformers/models/efficientnet/image_processing_efficientnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..880bfd21995edb162762d827c75ae380ca5aa594
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/efficientnet/image_processing_efficientnet.py
@@ -0,0 +1,369 @@
+# coding=utf-8
+# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Image processor class for EfficientNet."""
+
+from typing import Optional, Union
+
+import numpy as np
+
+from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
+from ...image_transforms import rescale, resize, to_channel_dimension_format
+from ...image_utils import (
+    IMAGENET_STANDARD_MEAN,
+    IMAGENET_STANDARD_STD,
+    ChannelDimension,
+    ImageInput,
+    PILImageResampling,
+    infer_channel_dimension_format,
+    is_scaled_image,
+    make_list_of_images,
+    to_numpy_array,
+    valid_images,
+    validate_preprocess_arguments,
+)
+from ...utils import TensorType, filter_out_non_signature_kwargs, is_vision_available, logging
+
+
+if is_vision_available():
+    import PIL
+
+
+logger = logging.get_logger(__name__)
+
+
+class EfficientNetImageProcessor(BaseImageProcessor):
+    r"""
+    Constructs an EfficientNet image processor.
+
+    Args:
+        do_resize (`bool`, *optional*, defaults to `True`):
+            Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by
+            `do_resize` in `preprocess`.
+        size (`dict[str, int]`, *optional*, defaults to `{"height": 346, "width": 346}`):
+            Size of the image after `resize`. Can be overridden by `size` in `preprocess`.
+        resample (`PILImageResampling` filter, *optional*, defaults to 0):
+            Resampling filter to use if resizing the image. Can be overridden by `resample` in `preprocess`.
+        do_center_crop (`bool`, *optional*, defaults to `False`):
+            Whether to center crop the image. If the input size is smaller than `crop_size` along any edge, the image
+            is padded with 0's and then center cropped. Can be overridden by `do_center_crop` in `preprocess`.
+        crop_size (`dict[str, int]`, *optional*, defaults to `{"height": 289, "width": 289}`):
+            Desired output size when applying center-cropping. Can be overridden by `crop_size` in `preprocess`.
+        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
+            Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the
+            `preprocess` method.
+        rescale_offset (`bool`, *optional*, defaults to `False`):
+            Whether to rescale the image between [-scale_range, scale_range] instead of [0, scale_range]. Can be
+            overridden by the `rescale_offset` parameter in the `preprocess` method.
+        do_rescale (`bool`, *optional*, defaults to `True`):
+            Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale`
+            parameter in the `preprocess` method.
+        do_normalize (`bool`, *optional*, defaults to `True`):
+            Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
+            method.
+        image_mean (`float` or `list[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`):
+            Mean to use if normalizing the image. This is a float or list of floats the length of the number of
+            channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
+        image_std (`float` or `list[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`):
+            Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
+            number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
+        include_top (`bool`, *optional*, defaults to `True`):
+            Whether to rescale the image again. Should be set to True if the inputs are used for image classification.
+    """
+
+    model_input_names = ["pixel_values"]
+
+    def __init__(
+        self,
+        do_resize: bool = True,
+        size: Optional[dict[str, int]] = None,
+        resample: PILImageResampling = PIL.Image.NEAREST,
+        do_center_crop: bool = False,
+        crop_size: Optional[dict[str, int]] = None,
+        rescale_factor: Union[int, float] = 1 / 255,
+        rescale_offset: bool = False,
+        do_rescale: bool = True,
+        do_normalize: bool = True,
+        image_mean: Optional[Union[float, list[float]]] = None,
+        image_std: Optional[Union[float, list[float]]] = None,
+        include_top: bool = True,
+        **kwargs,
+    ) -> None:
+        super().__init__(**kwargs)
+        size = size if size is not None else {"height": 346, "width": 346}
+        size = get_size_dict(size)
+        crop_size = crop_size if crop_size is not None else {"height": 289, "width": 289}
+        crop_size = get_size_dict(crop_size, param_name="crop_size")
+
+        self.do_resize = do_resize
+        self.size = size
+        self.resample = resample
+        self.do_center_crop = do_center_crop
+        self.crop_size = crop_size
+        self.do_rescale = do_rescale
+        self.rescale_factor = rescale_factor
+        self.rescale_offset = rescale_offset
+        self.do_normalize = do_normalize
+        self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN
+        self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD
+        self.include_top = include_top
+
+    # Copied from transformers.models.vit.image_processing_vit.ViTImageProcessor.resize with PILImageResampling.BILINEAR->PILImageResampling.NEAREST
+    def resize(
+        self,
+        image: np.ndarray,
+        size: dict[str, int],
+        resample: PILImageResampling = PILImageResampling.NEAREST,
+        data_format: Optional[Union[str, ChannelDimension]] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+        **kwargs,
+    ) -> np.ndarray:
+        """
+        Resize an image to `(size["height"], size["width"])`.
+
+        Args:
+            image (`np.ndarray`):
+                Image to resize.
+            size (`dict[str, int]`):
+                Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image.
+            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.NEAREST`):
+                `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.NEAREST`.
+            data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the output image. If unset, the channel dimension format of the input
+                image is used. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the input image. If unset, the channel dimension format is inferred
+                from the input image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+
+        Returns:
+            `np.ndarray`: The resized image.
+        """
+        size = get_size_dict(size)
+        if "height" not in size or "width" not in size:
+            raise ValueError(f"The `size` dictionary must contain the keys `height` and `width`. Got {size.keys()}")
+        output_size = (size["height"], size["width"])
+        return resize(
+            image,
+            size=output_size,
+            resample=resample,
+            data_format=data_format,
+            input_data_format=input_data_format,
+            **kwargs,
+        )
+
+    def rescale(
+        self,
+        image: np.ndarray,
+        scale: Union[int, float],
+        offset: bool = True,
+        data_format: Optional[Union[str, ChannelDimension]] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+        **kwargs,
+    ):
+        """
+        Rescale an image by a scale factor.
+
+        If `offset` is `True`, the image has its values rescaled by `scale` and then offset by 1. If `scale` is
+        1/127.5, the image is rescaled between [-1, 1].
+            image = image * scale - 1
+
+        If `offset` is `False`, and `scale` is 1/255, the image is rescaled between [0, 1].
+            image = image * scale
+
+        Args:
+            image (`np.ndarray`):
+                Image to rescale.
+            scale (`int` or `float`):
+                Scale to apply to the image.
+            offset (`bool`, *optional*):
+                Whether to scale the image in both negative and positive directions.
+            data_format (`str` or `ChannelDimension`, *optional*):
+                The channel dimension format of the image. If not provided, it will be the same as the input image.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format of the input image. If not provided, it will be inferred.
+        """
+        rescaled_image = rescale(
+            image, scale=scale, data_format=data_format, input_data_format=input_data_format, **kwargs
+        )
+
+        if offset:
+            rescaled_image = rescaled_image - 1
+
+        return rescaled_image
+
+    @filter_out_non_signature_kwargs()
+    def preprocess(
+        self,
+        images: ImageInput,
+        do_resize: Optional[bool] = None,
+        size: Optional[dict[str, int]] = None,
+        resample=None,
+        do_center_crop: Optional[bool] = None,
+        crop_size: Optional[dict[str, int]] = None,
+        do_rescale: Optional[bool] = None,
+        rescale_factor: Optional[float] = None,
+        rescale_offset: Optional[bool] = None,
+        do_normalize: Optional[bool] = None,
+        image_mean: Optional[Union[float, list[float]]] = None,
+        image_std: Optional[Union[float, list[float]]] = None,
+        include_top: Optional[bool] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        data_format: ChannelDimension = ChannelDimension.FIRST,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    ) -> PIL.Image.Image:
+        """
+        Preprocess an image or batch of images.
+
+        Args:
+            images (`ImageInput`):
+                Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
+                passing in images with pixel values between 0 and 1, set `do_rescale=False`.
+            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
+                Whether to resize the image.
+            size (`dict[str, int]`, *optional*, defaults to `self.size`):
+                Size of the image after `resize`.
+            resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
+                PILImageResampling filter to use if resizing the image. Only has an effect if `do_resize` is set to
+                `True`.
+            do_center_crop (`bool`, *optional*, defaults to `self.do_center_crop`):
+                Whether to center crop the image.
+            crop_size (`dict[str, int]`, *optional*, defaults to `self.crop_size`):
+                Size of the image after center crop. If one edge of the image is smaller than `crop_size`, it will be
+                padded with zeros and then cropped.
+            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
+                Whether to rescale the image values between [0 - 1].
+            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
+                Rescale factor to rescale the image by if `do_rescale` is set to `True`.
+            rescale_offset (`bool`, *optional*, defaults to `self.rescale_offset`):
+                Whether to rescale the image between [-scale_range, scale_range] instead of [0, scale_range].
+            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
+                Whether to normalize the image.
+            image_mean (`float` or `list[float]`, *optional*, defaults to `self.image_mean`):
+                Image mean.
+            image_std (`float` or `list[float]`, *optional*, defaults to `self.image_std`):
+                Image standard deviation.
+            include_top (`bool`, *optional*, defaults to `self.include_top`):
+                Rescales the image again for image classification if set to True.
+            return_tensors (`str` or `TensorType`, *optional*):
+                The type of tensors to return. Can be one of:
+                    - `None`: Return a list of `np.ndarray`.
+                    - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
+                    - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
+                    - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
+                    - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
+            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
+                The channel dimension format for the output image. Can be one of:
+                    - `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                    - `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the input image. If unset, the channel dimension format is inferred
+                from the input image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+        """
+        do_resize = do_resize if do_resize is not None else self.do_resize
+        resample = resample if resample is not None else self.resample
+        do_center_crop = do_center_crop if do_center_crop is not None else self.do_center_crop
+        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
+        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
+        rescale_offset = rescale_offset if rescale_offset is not None else self.rescale_offset
+        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
+        image_mean = image_mean if image_mean is not None else self.image_mean
+        image_std = image_std if image_std is not None else self.image_std
+        include_top = include_top if include_top is not None else self.include_top
+
+        size = size if size is not None else self.size
+        size = get_size_dict(size)
+        crop_size = crop_size if crop_size is not None else self.crop_size
+        crop_size = get_size_dict(crop_size, param_name="crop_size")
+
+        images = make_list_of_images(images)
+
+        if not valid_images(images):
+            raise ValueError(
+                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
+                "torch.Tensor, tf.Tensor or jax.ndarray."
+            )
+        validate_preprocess_arguments(
+            do_rescale=do_rescale,
+            rescale_factor=rescale_factor,
+            do_normalize=do_normalize,
+            image_mean=image_mean,
+            image_std=image_std,
+            do_center_crop=do_center_crop,
+            crop_size=crop_size,
+            do_resize=do_resize,
+            size=size,
+            resample=resample,
+        )
+        # All transformations expect numpy arrays.
+        images = [to_numpy_array(image) for image in images]
+
+        if do_rescale and is_scaled_image(images[0]):
+            logger.warning_once(
+                "It looks like you are trying to rescale already rescaled images. If the input"
+                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
+            )
+
+        if input_data_format is None:
+            # We assume that all images have the same channel dimension format.
+            input_data_format = infer_channel_dimension_format(images[0])
+
+        if do_resize:
+            images = [
+                self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format)
+                for image in images
+            ]
+
+        if do_center_crop:
+            images = [
+                self.center_crop(image=image, size=crop_size, input_data_format=input_data_format) for image in images
+            ]
+
+        if do_rescale:
+            images = [
+                self.rescale(
+                    image=image, scale=rescale_factor, offset=rescale_offset, input_data_format=input_data_format
+                )
+                for image in images
+            ]
+
+        if do_normalize:
+            images = [
+                self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format)
+                for image in images
+            ]
+
+        if include_top:
+            images = [
+                self.normalize(image=image, mean=0, std=image_std, input_data_format=input_data_format)
+                for image in images
+            ]
+
+        images = [
+            to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images
+        ]
+
+        data = {"pixel_values": images}
+        return BatchFeature(data=data, tensor_type=return_tensors)
+
+
+__all__ = ["EfficientNetImageProcessor"]
diff --git a/phivenv/Lib/site-packages/transformers/models/efficientnet/image_processing_efficientnet_fast.py b/phivenv/Lib/site-packages/transformers/models/efficientnet/image_processing_efficientnet_fast.py
new file mode 100644
index 0000000000000000000000000000000000000000..41689e3dc080e7795fc967bc63671aae704e07d7
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/efficientnet/image_processing_efficientnet_fast.py
@@ -0,0 +1,212 @@
+# coding=utf-8
+# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Fast Image processor class for EfficientNet."""
+
+from functools import lru_cache
+from typing import Optional, Union
+
+from ...image_processing_utils_fast import BaseImageProcessorFast, BatchFeature, DefaultFastImageProcessorKwargs
+from ...image_transforms import group_images_by_shape, reorder_images
+from ...image_utils import IMAGENET_STANDARD_MEAN, IMAGENET_STANDARD_STD, ImageInput, PILImageResampling, SizeDict
+from ...processing_utils import Unpack
+from ...utils import (
+    TensorType,
+    auto_docstring,
+    is_torch_available,
+    is_torchvision_available,
+    is_torchvision_v2_available,
+)
+
+
+if is_torch_available():
+    import torch
+
+if is_torchvision_available():
+    if is_torchvision_v2_available():
+        from torchvision.transforms.v2 import functional as F
+    else:
+        from torchvision.transforms import functional as F
+
+
+class EfficientNetFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
+    """
+    Args:
+        rescale_offset (`bool`, *optional*, defaults to `self.rescale_offset`):
+            Whether to subtract 1 after scaling (`image * scale - 1`), e.g. [-1, 1] when `rescale_factor` is 1/127.5.
+        include_top (`bool`, *optional*, defaults to `self.include_top`):
+            Whether to apply an extra mean-0 normalization by `image_std` at the end (for image classification).
+    """
+
+    rescale_offset: bool
+    include_top: bool
+
+
+@auto_docstring
+class EfficientNetImageProcessorFast(BaseImageProcessorFast):
+    resample = PILImageResampling.NEAREST
+    image_mean = IMAGENET_STANDARD_MEAN
+    image_std = IMAGENET_STANDARD_STD
+    size = {"height": 346, "width": 346}
+    crop_size = {"height": 289, "width": 289}
+    do_resize = True
+    do_center_crop = False
+    do_rescale = True
+    rescale_factor = 1 / 255
+    rescale_offset = False
+    do_normalize = True
+    include_top = True
+    valid_kwargs = EfficientNetFastImageProcessorKwargs
+
+    def __init__(self, **kwargs: Unpack[EfficientNetFastImageProcessorKwargs]):
+        super().__init__(**kwargs)
+
+    def rescale(
+        self,
+        image: "torch.Tensor",
+        scale: float,
+        offset: Optional[bool] = True,
+        **kwargs,
+    ) -> "torch.Tensor":
+        """
+        Rescale an image by a scale factor.
+
+        If `offset` is `True`, the image has its values rescaled by `scale` and then offset by 1. If `scale` is
+        1/127.5, the image is rescaled between [-1, 1].
+            image = image * scale - 1
+
+        If `offset` is `False`, and `scale` is 1/255, the image is rescaled between [0, 1].
+            image = image * scale
+
+        Args:
+            image (`torch.Tensor`):
+                Image to rescale.
+            scale (`float`):
+                The scaling factor to rescale pixel values by.
+            offset (`bool`, *optional*):
+                Whether to scale the image in both negative and positive directions.
+
+        Returns:
+            `torch.Tensor`: The rescaled image.
+        """
+
+        rescaled_image = image * scale
+
+        if offset:
+            rescaled_image -= 1
+
+        return rescaled_image
+
+    @lru_cache(maxsize=10)
+    def _fuse_mean_std_and_rescale_factor(
+        self,
+        do_normalize: Optional[bool] = None,
+        image_mean: Optional[Union[float, list[float]]] = None,
+        image_std: Optional[Union[float, list[float]]] = None,
+        do_rescale: Optional[bool] = None,
+        rescale_factor: Optional[float] = None,
+        device: Optional["torch.device"] = None,
+        rescale_offset: Optional[bool] = False,
+    ) -> tuple:
+        if do_rescale and do_normalize and not rescale_offset:
+            # Fused rescale and normalize
+            image_mean = torch.tensor(image_mean, device=device) * (1.0 / rescale_factor)
+            image_std = torch.tensor(image_std, device=device) * (1.0 / rescale_factor)
+            do_rescale = False
+        return image_mean, image_std, do_rescale
+
+    def rescale_and_normalize(
+        self,
+        images: "torch.Tensor",
+        do_rescale: bool,
+        rescale_factor: float,
+        do_normalize: bool,
+        image_mean: Union[float, list[float]],
+        image_std: Union[float, list[float]],
+        rescale_offset: bool = False,
+    ) -> "torch.Tensor":
+        """
+        Rescale and/or normalize images; both steps are fused into one normalize call unless `rescale_offset` is set.
+        """
+        image_mean, image_std, do_rescale = self._fuse_mean_std_and_rescale_factor(
+            do_normalize=do_normalize,
+            image_mean=image_mean,
+            image_std=image_std,
+            do_rescale=do_rescale,
+            rescale_factor=rescale_factor,
+            device=images.device,
+            rescale_offset=rescale_offset,
+        )
+        # `do_rescale` is already False here when rescaling was fused into mean/std, so images are scaled only once
+        if do_rescale:
+            images = self.rescale(images, rescale_factor, rescale_offset)
+        if do_normalize:
+            images = self.normalize(images.to(dtype=torch.float32), image_mean, image_std)
+
+        return images
+
+    def _preprocess(
+        self,
+        images: list["torch.Tensor"],
+        do_resize: bool,
+        size: SizeDict,
+        interpolation: Optional["F.InterpolationMode"],
+        do_center_crop: bool,
+        crop_size: SizeDict,
+        do_rescale: bool,
+        rescale_factor: float,
+        rescale_offset: bool,
+        do_normalize: bool,
+        include_top: bool,
+        image_mean: Optional[Union[float, list[float]]],
+        image_std: Optional[Union[float, list[float]]],
+        disable_grouping: Optional[bool],
+        return_tensors: Optional[Union[str, TensorType]],
+        **kwargs,
+    ) -> BatchFeature:
+        # Group images by size for batched resizing
+        grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping)
+        resized_images_grouped = {}
+        for shape, stacked_images in grouped_images.items():
+            if do_resize:
+                stacked_images = self.resize(image=stacked_images, size=size, interpolation=interpolation)
+            resized_images_grouped[shape] = stacked_images
+        resized_images = reorder_images(resized_images_grouped, grouped_images_index)
+
+        # Group images by size for further processing
+        # Needed in case do_resize is False, or resize returns images with different sizes
+        grouped_images, grouped_images_index = group_images_by_shape(resized_images, disable_grouping=disable_grouping)
+        processed_images_grouped = {}
+        for shape, stacked_images in grouped_images.items():
+            if do_center_crop:
+                stacked_images = self.center_crop(stacked_images, crop_size)
+            # Fused rescale and normalize
+            stacked_images = self.rescale_and_normalize(
+                stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std, rescale_offset
+            )
+            if include_top:
+                stacked_images = self.normalize(stacked_images, 0, image_std)
+            processed_images_grouped[shape] = stacked_images
+
+        processed_images = reorder_images(processed_images_grouped, grouped_images_index)
+        processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images
+
+        return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors)
+
+    @auto_docstring
+    def preprocess(self, images: ImageInput, **kwargs: Unpack[EfficientNetFastImageProcessorKwargs]) -> BatchFeature:
+        return super().preprocess(images, **kwargs)
+
+
+__all__ = ["EfficientNetImageProcessorFast"]
diff --git a/phivenv/Lib/site-packages/transformers/models/efficientnet/modeling_efficientnet.py b/phivenv/Lib/site-packages/transformers/models/efficientnet/modeling_efficientnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..4de89316b759c211d6318040962b609f7f60bab9
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/efficientnet/modeling_efficientnet.py
@@ -0,0 +1,582 @@
+# coding=utf-8
+# Copyright 2023 Google Research, Inc. and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch EfficientNet model."""
+
+import math
+from typing import Optional, Union
+
+import torch
+import torch.utils.checkpoint
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+
+from ...activations import ACT2FN
+from ...modeling_outputs import (
+    BaseModelOutputWithNoAttention,
+    BaseModelOutputWithPoolingAndNoAttention,
+    ImageClassifierOutputWithNoAttention,
+)
+from ...modeling_utils import PreTrainedModel
+from ...utils import auto_docstring, logging
+from .configuration_efficientnet import EfficientNetConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+def round_filters(config: EfficientNetConfig, num_channels: int):
+    r"""
+    Scale the number of filters by the width coefficient and round to a multiple of `config.depth_divisor`.
+    """
+    divisor = config.depth_divisor
+    num_channels *= config.width_coefficient
+    new_dim = max(divisor, int(num_channels + divisor / 2) // divisor * divisor)
+
+    # Make sure that round down does not go down by more than 10%.
+    if new_dim < 0.9 * num_channels:
+        new_dim += divisor
+
+    return int(new_dim)
+
+
+def correct_pad(kernel_size: Union[int, tuple], adjust: bool = True):
+    r"""
+    Utility function to get the tuple padding value for the depthwise convolution.
+
+    Args:
+        kernel_size (`int` or `tuple`):
+            Kernel size of the convolution layers.
+        adjust (`bool`, *optional*, defaults to `True`):
+            If `True`, the first and third entries (left/top for `nn.ZeroPad2d`) are one less than the right/bottom.
+    """
+    if isinstance(kernel_size, int):
+        kernel_size = (kernel_size, kernel_size)
+
+    correct = (kernel_size[0] // 2, kernel_size[1] // 2)
+    if adjust:
+        return (correct[1] - 1, correct[1], correct[0] - 1, correct[0])
+    else:
+        return (correct[1], correct[1], correct[0], correct[0])
+
+
+class EfficientNetEmbeddings(nn.Module):
+    r"""
+    A module that corresponds to the stem module of the original work.
+    """
+
+    def __init__(self, config: EfficientNetConfig):
+        super().__init__()
+
+        self.out_dim = round_filters(config, 32)
+        self.padding = nn.ZeroPad2d(padding=(0, 1, 0, 1))
+        self.convolution = nn.Conv2d(
+            config.num_channels, self.out_dim, kernel_size=3, stride=2, padding="valid", bias=False
+        )
+        self.batchnorm = nn.BatchNorm2d(self.out_dim, eps=config.batch_norm_eps, momentum=config.batch_norm_momentum)
+        self.activation = ACT2FN[config.hidden_act]
+
+    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
+        features = self.padding(pixel_values)
+        features = self.convolution(features)
+        features = self.batchnorm(features)
+        features = self.activation(features)
+
+        return features
+
+
+class EfficientNetDepthwiseConv2d(nn.Conv2d):
+    def __init__(
+        self,
+        in_channels,
+        depth_multiplier=1,
+        kernel_size=3,
+        stride=1,
+        padding=0,
+        dilation=1,
+        bias=True,
+        padding_mode="zeros",
+    ):
+        out_channels = in_channels * depth_multiplier
+        super().__init__(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            groups=in_channels,
+            bias=bias,
+            padding_mode=padding_mode,
+        )
+
+
+class EfficientNetExpansionLayer(nn.Module):
+    r"""
+    This corresponds to the expansion phase of each block in the original implementation.
+    """
+
+    def __init__(self, config: EfficientNetConfig, in_dim: int, out_dim: int, stride: int):
+        super().__init__()
+        self.expand_conv = nn.Conv2d(
+            in_channels=in_dim,
+            out_channels=out_dim,
+            kernel_size=1,
+            padding="same",
+            bias=False,
+        )
+        self.expand_bn = nn.BatchNorm2d(num_features=out_dim, eps=config.batch_norm_eps)
+        self.expand_act = ACT2FN[config.hidden_act]
+
+    def forward(self, hidden_states: torch.FloatTensor) -> torch.Tensor:
+        # Expand phase
+        hidden_states = self.expand_conv(hidden_states)
+        hidden_states = self.expand_bn(hidden_states)
+        hidden_states = self.expand_act(hidden_states)
+
+        return hidden_states
+
+
+class EfficientNetDepthwiseLayer(nn.Module):
+    r"""
+    This corresponds to the depthwise convolution phase of each block in the original implementation.
+    """
+
+    def __init__(
+        self,
+        config: EfficientNetConfig,
+        in_dim: int,
+        stride: int,
+        kernel_size: int,
+        adjust_padding: bool,
+    ):
+        super().__init__()
+        self.stride = stride
+        conv_pad = "valid" if self.stride == 2 else "same"
+        padding = correct_pad(kernel_size, adjust=adjust_padding)
+
+        self.depthwise_conv_pad = nn.ZeroPad2d(padding=padding)
+        self.depthwise_conv = EfficientNetDepthwiseConv2d(
+            in_dim, kernel_size=kernel_size, stride=stride, padding=conv_pad, bias=False
+        )
+        self.depthwise_norm = nn.BatchNorm2d(
+            num_features=in_dim, eps=config.batch_norm_eps, momentum=config.batch_norm_momentum
+        )
+        self.depthwise_act = ACT2FN[config.hidden_act]
+
+    def forward(self, hidden_states: torch.FloatTensor) -> torch.Tensor:
+        # Depthwise convolution
+        if self.stride == 2:
+            hidden_states = self.depthwise_conv_pad(hidden_states)
+
+        hidden_states = self.depthwise_conv(hidden_states)
+        hidden_states = self.depthwise_norm(hidden_states)
+        hidden_states = self.depthwise_act(hidden_states)
+
+        return hidden_states
+
+
+class EfficientNetSqueezeExciteLayer(nn.Module):
+    r"""
+    This corresponds to the Squeeze and Excitement phase of each block in the original implementation.
+    """
+
+    def __init__(self, config: EfficientNetConfig, in_dim: int, expand_dim: int, expand: bool = False):
+        super().__init__()
+        self.dim = expand_dim if expand else in_dim
+        self.dim_se = max(1, int(in_dim * config.squeeze_expansion_ratio))
+
+        self.squeeze = nn.AdaptiveAvgPool2d(output_size=1)
+        self.reduce = nn.Conv2d(
+            in_channels=self.dim,
+            out_channels=self.dim_se,
+            kernel_size=1,
+            padding="same",
+        )
+        self.expand = nn.Conv2d(
+            in_channels=self.dim_se,
+            out_channels=self.dim,
+            kernel_size=1,
+            padding="same",
+        )
+        self.act_reduce = ACT2FN[config.hidden_act]
+        self.act_expand = nn.Sigmoid()
+
+    def forward(self, hidden_states: torch.FloatTensor) -> torch.Tensor:
+        inputs = hidden_states
+        hidden_states = self.squeeze(hidden_states)
+        hidden_states = self.reduce(hidden_states)
+        hidden_states = self.act_reduce(hidden_states)
+
+        hidden_states = self.expand(hidden_states)
+        hidden_states = self.act_expand(hidden_states)
+        hidden_states = torch.mul(inputs, hidden_states)
+
+        return hidden_states
+
+
+class EfficientNetFinalBlockLayer(nn.Module):
+    r"""
+    This corresponds to the final phase of each block in the original implementation.
+    """
+
+    def __init__(
+        self, config: EfficientNetConfig, in_dim: int, out_dim: int, stride: int, drop_rate: float, id_skip: bool
+    ):
+        super().__init__()
+        self.apply_dropout = stride == 1 and not id_skip
+        self.project_conv = nn.Conv2d(
+            in_channels=in_dim,
+            out_channels=out_dim,
+            kernel_size=1,
+            padding="same",
+            bias=False,
+        )
+        self.project_bn = nn.BatchNorm2d(
+            num_features=out_dim, eps=config.batch_norm_eps, momentum=config.batch_norm_momentum
+        )
+        self.dropout = nn.Dropout(p=drop_rate)
+
+    def forward(self, embeddings: torch.FloatTensor, hidden_states: torch.FloatTensor) -> torch.Tensor:
+        hidden_states = self.project_conv(hidden_states)
+        hidden_states = self.project_bn(hidden_states)
+
+        if self.apply_dropout:
+            hidden_states = self.dropout(hidden_states)
+            hidden_states = hidden_states + embeddings
+
+        return hidden_states
+
+
+class EfficientNetBlock(nn.Module):
+    r"""
+    A full EfficientNet block: optional expansion, depthwise convolution, squeeze-and-excite and final projection.
+
+    Args:
+        config ([`EfficientNetConfig`]):
+            Model configuration class.
+        in_dim (`int`):
+            Number of input channels.
+        out_dim (`int`):
+            Number of output channels.
+        stride (`int`):
+            Stride size to be used in convolution layers.
+        expand_ratio (`int`):
+            Expand ratio to set the output dimensions for the expansion and squeeze-excite layers.
+        kernel_size (`int`):
+            Kernel size for the depthwise convolution layer.
+        drop_rate (`float`):
+            Dropout rate to be used in the final phase of each block.
+        id_skip (`bool`):
+            Whether to skip the dropout and residual sum with the input embeddings in the final phase (they are only
+            applied when `stride == 1` and `id_skip` is `False`). Set to `True` for the first block of each stage.
+        adjust_padding (`bool`):
+            Whether to apply padding to only right and bottom side of the input kernel before the depthwise convolution
+            operation, set to `True` for inputs with odd input sizes.
+    """
+
+    def __init__(
+        self,
+        config: EfficientNetConfig,
+        in_dim: int,
+        out_dim: int,
+        stride: int,
+        expand_ratio: int,
+        kernel_size: int,
+        drop_rate: float,
+        id_skip: bool,
+        adjust_padding: bool,
+    ):
+        super().__init__()
+        self.expand_ratio = expand_ratio
+        self.expand = self.expand_ratio != 1
+        expand_in_dim = in_dim * expand_ratio
+
+        if self.expand:
+            self.expansion = EfficientNetExpansionLayer(
+                config=config, in_dim=in_dim, out_dim=expand_in_dim, stride=stride
+            )
+
+        self.depthwise_conv = EfficientNetDepthwiseLayer(
+            config=config,
+            in_dim=expand_in_dim if self.expand else in_dim,
+            stride=stride,
+            kernel_size=kernel_size,
+            adjust_padding=adjust_padding,
+        )
+        self.squeeze_excite = EfficientNetSqueezeExciteLayer(
+            config=config, in_dim=in_dim, expand_dim=expand_in_dim, expand=self.expand
+        )
+        self.projection = EfficientNetFinalBlockLayer(
+            config=config,
+            in_dim=expand_in_dim if self.expand else in_dim,
+            out_dim=out_dim,
+            stride=stride,
+            drop_rate=drop_rate,
+            id_skip=id_skip,
+        )
+
+    def forward(self, hidden_states: torch.FloatTensor) -> torch.Tensor:
+        embeddings = hidden_states
+        # Expansion and depthwise convolution phase
+        if self.expand_ratio != 1:
+            hidden_states = self.expansion(hidden_states)
+        hidden_states = self.depthwise_conv(hidden_states)
+
+        # Squeeze and excite phase
+        hidden_states = self.squeeze_excite(hidden_states)
+        hidden_states = self.projection(embeddings, hidden_states)
+        return hidden_states
+
+
+class EfficientNetEncoder(nn.Module):
+    r"""
+    Forward propagates the embeddings through each EfficientNet block.
+
+    Args:
+        config ([`EfficientNetConfig`]):
+            Model configuration class.
+    """
+
+    def __init__(self, config: EfficientNetConfig):
+        super().__init__()
+        self.config = config
+        self.depth_coefficient = config.depth_coefficient
+
+        def round_repeats(repeats):
+            # Round number of block repeats based on depth multiplier.
+            return int(math.ceil(self.depth_coefficient * repeats))
+
+        num_base_blocks = len(config.in_channels)
+        num_blocks = sum(round_repeats(n) for n in config.num_block_repeats)
+
+        curr_block_num = 0
+        blocks = []
+        for i in range(num_base_blocks):
+            in_dim = round_filters(config, config.in_channels[i])
+            out_dim = round_filters(config, config.out_channels[i])
+            stride = config.strides[i]
+            kernel_size = config.kernel_sizes[i]
+            expand_ratio = config.expand_ratios[i]
+
+            for j in range(round_repeats(config.num_block_repeats[i])):
+                id_skip = j == 0
+                stride = 1 if j > 0 else stride
+                in_dim = out_dim if j > 0 else in_dim
+                adjust_padding = curr_block_num not in config.depthwise_padding
+                drop_rate = config.drop_connect_rate * curr_block_num / num_blocks
+
+                block = EfficientNetBlock(
+                    config=config,
+                    in_dim=in_dim,
+                    out_dim=out_dim,
+                    stride=stride,
+                    kernel_size=kernel_size,
+                    expand_ratio=expand_ratio,
+                    drop_rate=drop_rate,
+                    id_skip=id_skip,
+                    adjust_padding=adjust_padding,
+                )
+                blocks.append(block)
+                curr_block_num += 1
+
+        self.blocks = nn.ModuleList(blocks)
+        self.top_conv = nn.Conv2d(
+            in_channels=out_dim,
+            out_channels=round_filters(config, 1280),
+            kernel_size=1,
+            padding="same",
+            bias=False,
+        )
+        self.top_bn = nn.BatchNorm2d(
+            num_features=config.hidden_dim, eps=config.batch_norm_eps, momentum=config.batch_norm_momentum
+        )
+        self.top_activation = ACT2FN[config.hidden_act]
+
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        output_hidden_states: Optional[bool] = False,
+        return_dict: Optional[bool] = True,
+    ) -> BaseModelOutputWithNoAttention:
+        all_hidden_states = (hidden_states,) if output_hidden_states else None
+
+        for block in self.blocks:
+            hidden_states = block(hidden_states)
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+
+        hidden_states = self.top_conv(hidden_states)
+        hidden_states = self.top_bn(hidden_states)
+        hidden_states = self.top_activation(hidden_states)
+
+        if not return_dict:
+            return tuple(v for v in [hidden_states, all_hidden_states] if v is not None)
+
+        return BaseModelOutputWithNoAttention(
+            last_hidden_state=hidden_states,
+            hidden_states=all_hidden_states,
+        )
+
+
+@auto_docstring
+class EfficientNetPreTrainedModel(PreTrainedModel):
+    config: EfficientNetConfig
+    base_model_prefix = "efficientnet"
+    main_input_name = "pixel_values"
+    _no_split_modules = []
+
+    def _init_weights(self, module: nn.Module):
+        """Initialize the weights"""
+        if isinstance(module, (nn.Linear, nn.Conv2d, nn.BatchNorm2d)):
+            # Slightly different from the TF version which uses truncated_normal for initialization
+            # cf https://github.com/pytorch/pytorch/pull/5617
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.bias is not None:
+                module.bias.data.zero_()
+
+
+@auto_docstring
+class EfficientNetModel(EfficientNetPreTrainedModel):
+    def __init__(self, config: EfficientNetConfig):
+        super().__init__(config)
+        self.config = config
+        self.embeddings = EfficientNetEmbeddings(config)
+        self.encoder = EfficientNetEncoder(config)
+
+        # Final pooling layer: `config.pooling_type` selects average or max pooling over the encoder output
+        if config.pooling_type == "mean":
+            self.pooler = nn.AvgPool2d(config.hidden_dim, ceil_mode=True)
+        elif config.pooling_type == "max":
+            self.pooler = nn.MaxPool2d(config.hidden_dim, ceil_mode=True)
+        else:
+            raise ValueError(f"config.pooling_type must be one of ['mean', 'max'] got {config.pooling_type}")
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, BaseModelOutputWithPoolingAndNoAttention]:
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if pixel_values is None:
+            raise ValueError("You have to specify pixel_values")
+
+        embedding_output = self.embeddings(pixel_values)
+
+        encoder_outputs = self.encoder(
+            embedding_output,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        # Apply pooling
+        last_hidden_state = encoder_outputs[0]
+        pooled_output = self.pooler(last_hidden_state)
+        # Reshape (batch_size, 1280, 1 , 1) -> (batch_size, 1280)
+        pooled_output = pooled_output.reshape(pooled_output.shape[:2])
+
+        if not return_dict:
+            return (last_hidden_state, pooled_output) + encoder_outputs[1:]
+
+        return BaseModelOutputWithPoolingAndNoAttention(
+            last_hidden_state=last_hidden_state,
+            pooler_output=pooled_output,
+            hidden_states=encoder_outputs.hidden_states,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    EfficientNet Model with an image classification head on top (a linear layer on top of the pooled features), e.g.
+    for ImageNet.
+    """
+)
+class EfficientNetForImageClassification(EfficientNetPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+        self.config = config
+        self.efficientnet = EfficientNetModel(config)
+        # Classifier head
+        self.dropout = nn.Dropout(p=config.dropout_rate)
+        self.classifier = nn.Linear(config.hidden_dim, self.num_labels) if self.num_labels > 0 else nn.Identity()
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, ImageClassifierOutputWithNoAttention]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.efficientnet(pixel_values, output_hidden_states=output_hidden_states, return_dict=return_dict)
+
+        pooled_output = outputs.pooler_output if return_dict else outputs[1]
+        pooled_output = self.dropout(pooled_output)
+        logits = self.classifier(pooled_output)
+
+        loss = None
+        if labels is not None:
+            if self.config.problem_type is None:
+                if self.num_labels == 1:
+                    self.config.problem_type = "regression"
+                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
+                    self.config.problem_type = "single_label_classification"
+                else:
+                    self.config.problem_type = "multi_label_classification"
+
+            if self.config.problem_type == "regression":
+                loss_fct = MSELoss()
+                if self.num_labels == 1:
+                    loss = loss_fct(logits.squeeze(), labels.squeeze())
+                else:
+                    loss = loss_fct(logits, labels)
+            elif self.config.problem_type == "single_label_classification":
+                loss_fct = CrossEntropyLoss()
+                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+            elif self.config.problem_type == "multi_label_classification":
+                loss_fct = BCEWithLogitsLoss()
+                loss = loss_fct(logits, labels)
+
+        if not return_dict:
+            output = (logits,) + outputs[2:]
+            return ((loss,) + output) if loss is not None else output
+
+        return ImageClassifierOutputWithNoAttention(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+        )
+
+
+__all__ = ["EfficientNetForImageClassification", "EfficientNetModel", "EfficientNetPreTrainedModel"]
diff --git a/phivenv/Lib/site-packages/transformers/models/electra/__init__.py b/phivenv/Lib/site-packages/transformers/models/electra/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a78ed5c42aea51038335efabde5b03e333592ed6
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/electra/__init__.py
@@ -0,0 +1,31 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_electra import *
+    from .modeling_electra import *
+    from .modeling_flax_electra import *
+    from .modeling_tf_electra import *
+    from .tokenization_electra import *
+    from .tokenization_electra_fast import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/phivenv/Lib/site-packages/transformers/models/electra/__pycache__/__init__.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/electra/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..01a5af257a393cd14790e2b560f039ae9cb46d3a
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/electra/__pycache__/__init__.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/electra/__pycache__/configuration_electra.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/electra/__pycache__/configuration_electra.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3e4cd78c608069b8561fe138a9a57920fdb0a15d
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/electra/__pycache__/configuration_electra.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/electra/__pycache__/modeling_electra.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/electra/__pycache__/modeling_electra.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..73cd70e5c95d98263016ee49eb7ea593c5f92f41
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/electra/__pycache__/modeling_electra.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/electra/__pycache__/modeling_flax_electra.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/electra/__pycache__/modeling_flax_electra.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bdd2f6a9a987138ecff56950c4e236cfca57bbcb
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/electra/__pycache__/modeling_flax_electra.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/electra/__pycache__/modeling_tf_electra.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/electra/__pycache__/modeling_tf_electra.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..100a14e99d340d95ee9b97038ca119ad46ac57b7
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/electra/__pycache__/modeling_tf_electra.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/electra/__pycache__/tokenization_electra.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/electra/__pycache__/tokenization_electra.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..63f2c423161a8bf9793304356e594e837cc9fe6b
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/electra/__pycache__/tokenization_electra.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/electra/__pycache__/tokenization_electra_fast.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/electra/__pycache__/tokenization_electra_fast.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..70404565445e8a603ed4f8615099984559e8ea02
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/electra/__pycache__/tokenization_electra_fast.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/electra/configuration_electra.py b/phivenv/Lib/site-packages/transformers/models/electra/configuration_electra.py
new file mode 100644
index 0000000000000000000000000000000000000000..f12756d976b35ee3a4f333483b1b4e6e1a07fb7e
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/electra/configuration_electra.py
@@ -0,0 +1,187 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""ELECTRA model configuration"""
+
+from collections import OrderedDict
+from collections.abc import Mapping
+
+from ...configuration_utils import PretrainedConfig
+from ...onnx import OnnxConfig
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class ElectraConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of an [`ElectraModel`] or a [`TFElectraModel`]. It is
+    used to instantiate an ELECTRA model according to the specified arguments, defining the model architecture.
+    Instantiating a configuration with the defaults will yield a similar configuration to that of the ELECTRA
+    [google/electra-small-discriminator](https://huggingface.co/google/electra-small-discriminator) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 30522):
+            Vocabulary size of the ELECTRA model. Defines the number of different tokens that can be represented by the
+            `input_ids` passed when calling [`ElectraModel`] or [`TFElectraModel`].
+        embedding_size (`int`, *optional*, defaults to 128):
+            Dimensionality of the word, position and token type embeddings.
+        hidden_size (`int`, *optional*, defaults to 256):
+            Dimensionality of the encoder layers and the pooler layer.
+        num_hidden_layers (`int`, *optional*, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 4):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        intermediate_size (`int`, *optional*, defaults to 1024):
+            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+        hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"silu"` and `"gelu_new"` are supported.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout ratio for the attention probabilities.
+        max_position_embeddings (`int`, *optional*, defaults to 512):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            just in case (e.g., 512 or 1024 or 2048).
+        type_vocab_size (`int`, *optional*, defaults to 2):
+            The vocabulary size of the `token_type_ids` passed when calling [`ElectraModel`] or [`TFElectraModel`].
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
+            The epsilon used by the layer normalization layers.
+        summary_type (`str`, *optional*, defaults to `"first"`):
+            Argument used when doing sequence summary. Used in the sequence classification and multiple choice models.
+
+            Has to be one of the following options:
+
+                - `"last"`: Take the last token hidden state (like XLNet).
+                - `"first"`: Take the first token hidden state (like BERT).
+                - `"mean"`: Take the mean of all tokens hidden states.
+                - `"cls_index"`: Supply a Tensor of classification token position (like GPT/GPT-2).
+                - `"attn"`: Not implemented now, use multi-head attention.
+        summary_use_proj (`bool`, *optional*, defaults to `True`):
+            Argument used when doing sequence summary. Used in the sequence classification and multiple choice models.
+
+            Whether or not to add a projection after the vector extraction.
+        summary_activation (`str`, *optional*, defaults to `"gelu"`):
+            Argument used when doing sequence summary. Used in the sequence classification and multiple choice models.
+
+            Pass `"gelu"` for a gelu activation to the output, any other value will result in no activation.
+        summary_last_dropout (`float`, *optional*, defaults to 0.1):
+            Argument used when doing sequence summary. Used in the sequence classification and multiple choice models.
+
+            The dropout ratio to be used after the projection and activation.
+        position_embedding_type (`str`, *optional*, defaults to `"absolute"`):
+            Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For
+            positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to
+            [Self-Attention with Relative Position Representations (Shaw et al.)](https://huggingface.co/papers/1803.02155).
+            For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models
+            with Better Relative Position Embeddings (Huang et al.)](https://huggingface.co/papers/2009.13658).
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        classifier_dropout (`float`, *optional*):
+            The dropout ratio for the classification head.
+
+    Examples:
+
+    ```python
+    >>> from transformers import ElectraConfig, ElectraModel
+
+    >>> # Initializing an ELECTRA google/electra-small-discriminator style configuration
+    >>> configuration = ElectraConfig()
+
+    >>> # Initializing a model (with random weights) from the google/electra-small-discriminator style configuration
+    >>> model = ElectraModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "electra"
+
+    def __init__(
+        self,
+        vocab_size=30522,
+        embedding_size=128,
+        hidden_size=256,
+        num_hidden_layers=12,
+        num_attention_heads=4,
+        intermediate_size=1024,
+        hidden_act="gelu",
+        hidden_dropout_prob=0.1,
+        attention_probs_dropout_prob=0.1,
+        max_position_embeddings=512,
+        type_vocab_size=2,
+        initializer_range=0.02,
+        layer_norm_eps=1e-12,
+        summary_type="first",
+        summary_use_proj=True,
+        summary_activation="gelu",
+        summary_last_dropout=0.1,
+        pad_token_id=0,
+        position_embedding_type="absolute",
+        use_cache=True,
+        classifier_dropout=None,
+        **kwargs,
+    ):
+        super().__init__(pad_token_id=pad_token_id, **kwargs)
+
+        self.vocab_size = vocab_size
+        self.embedding_size = embedding_size
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.intermediate_size = intermediate_size
+        self.hidden_act = hidden_act
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.max_position_embeddings = max_position_embeddings
+        self.type_vocab_size = type_vocab_size
+        self.initializer_range = initializer_range
+        self.layer_norm_eps = layer_norm_eps
+
+        self.summary_type = summary_type
+        self.summary_use_proj = summary_use_proj
+        self.summary_activation = summary_activation
+        self.summary_last_dropout = summary_last_dropout
+        self.position_embedding_type = position_embedding_type
+        self.use_cache = use_cache
+        self.classifier_dropout = classifier_dropout
+
+
+class ElectraOnnxConfig(OnnxConfig):
+    @property
+    def inputs(self) -> Mapping[str, Mapping[int, str]]:
+        if self.task == "multiple-choice":
+            dynamic_axis = {0: "batch", 1: "choice", 2: "sequence"}
+        else:
+            dynamic_axis = {0: "batch", 1: "sequence"}
+        return OrderedDict(
+            [
+                ("input_ids", dynamic_axis),
+                ("attention_mask", dynamic_axis),
+                ("token_type_ids", dynamic_axis),
+            ]
+        )
+
+
+__all__ = ["ElectraConfig", "ElectraOnnxConfig"]
diff --git a/phivenv/Lib/site-packages/transformers/models/electra/modeling_electra.py b/phivenv/Lib/site-packages/transformers/models/electra/modeling_electra.py
new file mode 100644
index 0000000000000000000000000000000000000000..565cb67e05f4201365b9fe6d6ff672f99656c9b1
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/electra/modeling_electra.py
@@ -0,0 +1,1586 @@
+# coding=utf-8
+# Copyright 2019 The Google AI Language Team Authors and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch ELECTRA model."""
+
+import math
+import os
+from dataclasses import dataclass
+from typing import Callable, Optional, Union
+
+import torch
+import torch.utils.checkpoint
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+
+from ...activations import ACT2FN, get_activation
+from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache
+from ...generation import GenerationMixin
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import (
+    BaseModelOutputWithCrossAttentions,
+    BaseModelOutputWithPastAndCrossAttentions,
+    CausalLMOutputWithCrossAttentions,
+    MaskedLMOutput,
+    MultipleChoiceModelOutput,
+    QuestionAnsweringModelOutput,
+    SequenceClassifierOutput,
+    TokenClassifierOutput,
+)
+from ...modeling_utils import PreTrainedModel
+from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
+from ...utils import ModelOutput, auto_docstring, logging
+from ...utils.deprecation import deprecate_kwarg
+from .configuration_electra import ElectraConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+def load_tf_weights_in_electra(model, config, tf_checkpoint_path, discriminator_or_generator="discriminator"):
+    """Load tf checkpoints in a pytorch model."""
+    try:
+        import re
+
+        import numpy as np
+        import tensorflow as tf
+    except ImportError:
+        logger.error(
+            "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
+            "https://www.tensorflow.org/install/ for installation instructions."
+        )
+        raise
+    tf_path = os.path.abspath(tf_checkpoint_path)
+    logger.info(f"Converting TensorFlow checkpoint from {tf_path}")
+    # Load weights from TF model
+    init_vars = tf.train.list_variables(tf_path)
+    names = []
+    arrays = []
+    for name, shape in init_vars:
+        logger.info(f"Loading TF weight {name} with shape {shape}")
+        array = tf.train.load_variable(tf_path, name)
+        names.append(name)
+        arrays.append(array)
+    for name, array in zip(names, arrays):
+        original_name: str = name
+
+        try:
+            if isinstance(model, ElectraForMaskedLM):
+                name = name.replace("electra/embeddings/", "generator/embeddings/")
+
+            if discriminator_or_generator == "generator":
+                name = name.replace("electra/", "discriminator/")
+                name = name.replace("generator/", "electra/")
+
+            name = name.replace("dense_1", "dense_prediction")
+            name = name.replace("generator_predictions/output_bias", "generator_lm_head/bias")
+
+            name = name.split("/")
+            # print(original_name, name)
+            # Skip checkpoint variables named global_step or temperature (the names filtered below),
+            # which are not required for using pretrained model
+            if any(n in ["global_step", "temperature"] for n in name):
+                logger.info(f"Skipping {original_name}")
+                continue
+            pointer = model
+            for m_name in name:
+                if re.fullmatch(r"[A-Za-z]+_\d+", m_name):
+                    scope_names = re.split(r"_(\d+)", m_name)
+                else:
+                    scope_names = [m_name]
+                if scope_names[0] == "kernel" or scope_names[0] == "gamma":
+                    pointer = getattr(pointer, "weight")
+                elif scope_names[0] == "output_bias" or scope_names[0] == "beta":
+                    pointer = getattr(pointer, "bias")
+                elif scope_names[0] == "output_weights":
+                    pointer = getattr(pointer, "weight")
+                elif scope_names[0] == "squad":
+                    pointer = getattr(pointer, "classifier")
+                else:
+                    pointer = getattr(pointer, scope_names[0])
+                if len(scope_names) >= 2:
+                    num = int(scope_names[1])
+                    pointer = pointer[num]
+            if m_name.endswith("_embeddings"):
+                pointer = getattr(pointer, "weight")
+            elif m_name == "kernel":
+                array = np.transpose(array)
+            try:
+                if pointer.shape != array.shape:
+                    raise ValueError(f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched")
+            except ValueError as e:
+                e.args += (pointer.shape, array.shape)
+                raise
+            print(f"Initialize PyTorch weight {name}", original_name)
+            pointer.data = torch.from_numpy(array)
+        except AttributeError as e:
+            print(f"Skipping {original_name}", name, e)
+            continue
+    return model
+
+
+class ElectraEmbeddings(nn.Module):
+    """Construct the embeddings from word, position and token_type embeddings."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.word_embeddings = nn.Embedding(config.vocab_size, config.embedding_size, padding_idx=config.pad_token_id)
+        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.embedding_size)
+        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.embedding_size)
+
+        # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
+        # any TensorFlow checkpoint file
+        self.LayerNorm = nn.LayerNorm(config.embedding_size, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+        # position_ids (1, len position emb) is contiguous in memory; registered as non-persistent, so it is not saved in the state dict
+        self.register_buffer(
+            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
+        )
+        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
+        self.register_buffer(
+            "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
+        )
+
+    # Copied from transformers.models.bert.modeling_bert.BertEmbeddings.forward
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        past_key_values_length: int = 0,
+    ) -> torch.Tensor:
+        if input_ids is not None:
+            input_shape = input_ids.size()
+        else:
+            input_shape = inputs_embeds.size()[:-1]
+
+        seq_length = input_shape[1]
+
+        if position_ids is None:
+            position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length]
+
+        # When token_type_ids is not passed, fall back to the all-zeros buffer registered in the constructor. Using the
+        # registered buffer helps users trace the model without passing token_type_ids (solves
+        # issue #5664).
+        if token_type_ids is None:
+            if hasattr(self, "token_type_ids"):
+                buffered_token_type_ids = self.token_type_ids[:, :seq_length]
+                buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length)
+                token_type_ids = buffered_token_type_ids_expanded
+            else:
+                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)
+
+        if inputs_embeds is None:
+            inputs_embeds = self.word_embeddings(input_ids)
+        token_type_embeddings = self.token_type_embeddings(token_type_ids)
+
+        embeddings = inputs_embeds + token_type_embeddings
+        if self.position_embedding_type == "absolute":
+            position_embeddings = self.position_embeddings(position_ids)
+            embeddings += position_embeddings
+        embeddings = self.LayerNorm(embeddings)
+        embeddings = self.dropout(embeddings)
+        return embeddings
+
+
+# Copied from transformers.models.bert.modeling_bert.BertSelfAttention with Bert->Electra
+class ElectraSelfAttention(nn.Module):
+    def __init__(self, config, position_embedding_type=None, layer_idx=None):
+        super().__init__()
+        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
+            raise ValueError(
+                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
+                f"heads ({config.num_attention_heads})"
+            )
+
+        self.num_attention_heads = config.num_attention_heads
+        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
+        self.all_head_size = self.num_attention_heads * self.attention_head_size
+
+        self.query = nn.Linear(config.hidden_size, self.all_head_size)
+        self.key = nn.Linear(config.hidden_size, self.all_head_size)
+        self.value = nn.Linear(config.hidden_size, self.all_head_size)
+
+        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
+        self.position_embedding_type = position_embedding_type or getattr(
+            config, "position_embedding_type", "absolute"
+        )
+        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
+            self.max_position_embeddings = config.max_position_embeddings
+            self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)
+
+        self.is_decoder = config.is_decoder
+        self.layer_idx = layer_idx
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        output_attentions: Optional[bool] = False,
+        cache_position: Optional[torch.Tensor] = None,
+    ) -> tuple[torch.Tensor]:
+        batch_size, seq_length, _ = hidden_states.shape
+        query_layer = self.query(hidden_states)
+        query_layer = query_layer.view(batch_size, -1, self.num_attention_heads, self.attention_head_size).transpose(
+            1, 2
+        )
+
+        is_cross_attention = encoder_hidden_states is not None
+        if past_key_values is not None:
+            if isinstance(past_key_values, EncoderDecoderCache):
+                is_updated = past_key_values.is_updated.get(self.layer_idx)
+                if is_cross_attention:
+                    # after the first generated id, we can subsequently re-use all key/value_layer from cache
+                    curr_past_key_value = past_key_values.cross_attention_cache
+                else:
+                    curr_past_key_value = past_key_values.self_attention_cache
+            else:
+                curr_past_key_value = past_key_values
+
+        current_states = encoder_hidden_states if is_cross_attention else hidden_states
+        if is_cross_attention and past_key_values is not None and is_updated:
+            # reuse k,v, cross_attentions
+            key_layer = curr_past_key_value.layers[self.layer_idx].keys
+            value_layer = curr_past_key_value.layers[self.layer_idx].values
+        else:
+            key_layer = self.key(current_states)
+            key_layer = key_layer.view(batch_size, -1, self.num_attention_heads, self.attention_head_size).transpose(
+                1, 2
+            )
+            value_layer = self.value(current_states)
+            value_layer = value_layer.view(
+                batch_size, -1, self.num_attention_heads, self.attention_head_size
+            ).transpose(1, 2)
+
+            if past_key_values is not None:
+                # save all key/value_layer to cache to be re-used for fast auto-regressive generation
+                cache_position = cache_position if not is_cross_attention else None
+                key_layer, value_layer = curr_past_key_value.update(
+                    key_layer, value_layer, self.layer_idx, {"cache_position": cache_position}
+                )
+                # set flag that curr layer for cross-attn is already updated so we can re-use in subsequent calls
+                if is_cross_attention:
+                    past_key_values.is_updated[self.layer_idx] = True
+
+        # Take the dot product between "query" and "key" to get the raw attention scores.
+        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
+
+        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
+            query_length, key_length = query_layer.shape[2], key_layer.shape[2]
+            if past_key_values is not None:
+                position_ids_l = torch.tensor(key_length - 1, dtype=torch.long, device=hidden_states.device).view(
+                    -1, 1
+                )
+            else:
+                position_ids_l = torch.arange(query_length, dtype=torch.long, device=hidden_states.device).view(-1, 1)
+            position_ids_r = torch.arange(key_length, dtype=torch.long, device=hidden_states.device).view(1, -1)
+            distance = position_ids_l - position_ids_r
+
+            positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1)
+            positional_embedding = positional_embedding.to(dtype=query_layer.dtype)  # fp16 compatibility
+
+            if self.position_embedding_type == "relative_key":
+                relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
+                attention_scores = attention_scores + relative_position_scores
+            elif self.position_embedding_type == "relative_key_query":
+                relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
+                relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding)
+                attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key
+
+        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
+        if attention_mask is not None:
+            # Apply the attention mask (precomputed for all layers in ElectraModel forward() function)
+            attention_scores = attention_scores + attention_mask
+
+        # Normalize the attention scores to probabilities.
+        attention_probs = nn.functional.softmax(attention_scores, dim=-1)
+
+        # This is actually dropping out entire tokens to attend to, which might
+        # seem a bit unusual, but is taken from the original Transformer paper.
+        attention_probs = self.dropout(attention_probs)
+
+        # Mask heads if we want to
+        if head_mask is not None:
+            attention_probs = attention_probs * head_mask
+
+        context_layer = torch.matmul(attention_probs, value_layer)
+
+        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
+        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
+        context_layer = context_layer.view(new_context_layer_shape)
+
+        return context_layer, attention_probs
+
+
+# Copied from transformers.models.bert.modeling_bert.BertSelfOutput
+class ElectraSelfOutput(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = self.LayerNorm(hidden_states + input_tensor)
+        return hidden_states
+
+
+ELECTRA_SELF_ATTENTION_CLASSES = {
+    "eager": ElectraSelfAttention,
+}
+
+
+# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->Electra,BERT->ELECTRA
+class ElectraAttention(nn.Module):  # pairs a self-attention module (`self.self`) with its output block (`self.output`)
+    def __init__(self, config, position_embedding_type=None, layer_idx=None):
+        super().__init__()
+        self.self = ELECTRA_SELF_ATTENTION_CLASSES[config._attn_implementation](  # plain dict lookup: an unregistered key raises KeyError
+            config,
+            position_embedding_type=position_embedding_type,
+            layer_idx=layer_idx,
+        )
+        self.output = ElectraSelfOutput(config)
+        self.pruned_heads = set()  # heads removed so far; extended by prune_heads
+
+    def prune_heads(self, heads):
+        if len(heads) == 0:  # nothing requested, leave the module untouched
+            return
+        heads, index = find_pruneable_heads_and_indices(  # helper defined elsewhere; presumably drops already-pruned heads and returns the indices to keep — TODO confirm
+            heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
+        )
+
+        # Prune linear layers
+        self.self.query = prune_linear_layer(self.self.query, index)
+        self.self.key = prune_linear_layer(self.self.key, index)
+        self.self.value = prune_linear_layer(self.self.value, index)
+        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)  # output projection is pruned along dim=1; q/k/v use the helper's default dim
+
+        # Update hyper params and store pruned heads
+        self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
+        self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads  # recomputed from the reduced head count
+        self.pruned_heads = self.pruned_heads.union(heads)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        output_attentions: Optional[bool] = False,
+        cache_position: Optional[torch.Tensor] = None,
+    ) -> tuple[torch.Tensor]:
+        self_outputs = self.self(  # every argument is forwarded unchanged to the self-attention module
+            hidden_states,
+            attention_mask=attention_mask,
+            head_mask=head_mask,
+            encoder_hidden_states=encoder_hidden_states,
+            past_key_values=past_key_values,
+            output_attentions=output_attentions,
+            cache_position=cache_position,
+        )
+        attention_output = self.output(self_outputs[0], hidden_states)  # the layer input `hidden_states` serves as the residual
+        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
+        return outputs
+
+
+# Copied from transformers.models.bert.modeling_bert.BertIntermediate
+class ElectraIntermediate(nn.Module):  # feed-forward expansion: hidden_size -> intermediate_size followed by the activation
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
+        if isinstance(config.hidden_act, str):  # a string is resolved through the ACT2FN table
+            self.intermediate_act_fn = ACT2FN[config.hidden_act]
+        else:  # any non-string value is stored and later called directly as the activation
+            self.intermediate_act_fn = config.hidden_act
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.intermediate_act_fn(hidden_states)
+        return hidden_states
+
+
+# Copied from transformers.models.bert.modeling_bert.BertOutput
+class ElectraOutput(nn.Module):  # feed-forward contraction: intermediate_size -> hidden_size, dropout, residual add, LayerNorm
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)  # projects back to the model width
+        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.dropout(hidden_states)  # dropout is applied before the residual add
+        hidden_states = self.LayerNorm(hidden_states + input_tensor)  # `input_tensor` is the residual; normalization follows the sum
+        return hidden_states
+
+
+# Copied from transformers.models.bert.modeling_bert.BertLayer with Bert->Electra
+class ElectraLayer(GradientCheckpointingLayer):  # one block: self-attention, optional cross-attention, then chunked feed-forward
+    def __init__(self, config, layer_idx=None):
+        super().__init__()
+        self.chunk_size_feed_forward = config.chunk_size_feed_forward
+        self.seq_len_dim = 1  # handed to apply_chunking_to_forward as the dimension to chunk over
+        self.attention = ElectraAttention(config, layer_idx=layer_idx)
+        self.is_decoder = config.is_decoder
+        self.add_cross_attention = config.add_cross_attention
+        if self.add_cross_attention:
+            if not self.is_decoder:  # cross-attention is rejected for non-decoder configs
+                raise ValueError(f"{self} should be used as a decoder model if cross attention is added")
+            self.crossattention = ElectraAttention(config, position_embedding_type="absolute", layer_idx=layer_idx)  # created only when requested; forward checks for it with hasattr
+        self.intermediate = ElectraIntermediate(config)
+        self.output = ElectraOutput(config)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        output_attentions: Optional[bool] = False,
+        cache_position: Optional[torch.Tensor] = None,
+    ) -> tuple[torch.Tensor]:
+        self_attention_outputs = self.attention(  # self-attention pass: encoder_hidden_states is deliberately not passed here
+            hidden_states,
+            attention_mask=attention_mask,
+            head_mask=head_mask,
+            output_attentions=output_attentions,
+            past_key_values=past_key_values,
+            cache_position=cache_position,
+        )
+        attention_output = self_attention_outputs[0]
+        outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights
+
+        if self.is_decoder and encoder_hidden_states is not None:  # cross-attention runs only for decoders that were given encoder states
+            if not hasattr(self, "crossattention"):
+                raise ValueError(
+                    f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers"
+                    " by setting `config.add_cross_attention=True`"
+                )
+
+            cross_attention_outputs = self.crossattention(  # queries come from the self-attention output
+                attention_output,
+                attention_mask=encoder_attention_mask,
+                head_mask=head_mask,
+                encoder_hidden_states=encoder_hidden_states,
+                past_key_values=past_key_values,  # the same cache object that was given to self-attention
+                output_attentions=output_attentions,
+                cache_position=cache_position,
+            )
+            attention_output = cross_attention_outputs[0]
+            outputs = outputs + cross_attention_outputs[1:]  # add cross attentions if we output attention weights
+
+        layer_output = apply_chunking_to_forward(  # helper defined elsewhere; receives the chunk size and the dimension to split on
+            self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
+        )
+        outputs = (layer_output,) + outputs  # layer output first, attention extras after it
+
+        return outputs
+
+    def feed_forward_chunk(self, attention_output):
+        intermediate_output = self.intermediate(attention_output)
+        layer_output = self.output(intermediate_output, attention_output)  # `attention_output` doubles as the residual input
+        return layer_output
+
+
+# Copied from transformers.models.bert.modeling_bert.BertEncoder with Bert->Electra
+class ElectraEncoder(nn.Module):  # stack of `config.num_hidden_layers` ElectraLayer modules run in sequence
+    def __init__(self, config, layer_idx=None):  # NOTE(review): `layer_idx` is accepted but never read in this constructor
+        super().__init__()
+        self.config = config
+        self.layer = nn.ModuleList([ElectraLayer(config, layer_idx=i) for i in range(config.num_hidden_layers)])  # each layer gets its own position as layer_idx
+        self.gradient_checkpointing = False
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+        past_key_values: Optional[tuple[tuple[torch.FloatTensor]]] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = False,
+        output_hidden_states: Optional[bool] = False,
+        return_dict: Optional[bool] = True,
+        cache_position: Optional[torch.Tensor] = None,
+    ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]:
+        all_hidden_states = () if output_hidden_states else None  # accumulators stay None unless the matching output was requested
+        all_self_attentions = () if output_attentions else None
+        all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
+
+        if self.gradient_checkpointing and self.training:
+            if use_cache:  # caching is switched off (with a one-time warning) under gradient checkpointing
+                logger.warning_once(
+                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+                )
+                use_cache = False
+
+        if use_cache and self.config.is_decoder and past_key_values is None:  # decoder asked to cache but supplied no cache: create one
+            past_key_values = EncoderDecoderCache(DynamicCache(config=self.config), DynamicCache(config=self.config))
+
+        if use_cache and self.config.is_decoder and isinstance(past_key_values, tuple):  # legacy tuple cache: warn once, then convert
+            logger.warning_once(
+                "Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. "
+                "You should pass an instance of `EncoderDecoderCache` instead, e.g. "
+                "`past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`."
+            )
+            past_key_values = EncoderDecoderCache.from_legacy_cache(past_key_values)
+
+        for i, layer_module in enumerate(self.layer):
+            if output_hidden_states:  # record the input of each layer before running it
+                all_hidden_states = all_hidden_states + (hidden_states,)
+
+            layer_head_mask = head_mask[i] if head_mask is not None else None  # per-layer entry of the head mask
+
+            layer_outputs = layer_module(
+                hidden_states,
+                attention_mask,
+                layer_head_mask,
+                encoder_hidden_states,  # as a positional argument for gradient checkpointing
+                encoder_attention_mask=encoder_attention_mask,
+                past_key_values=past_key_values,
+                output_attentions=output_attentions,
+                cache_position=cache_position,
+            )
+
+            hidden_states = layer_outputs[0]  # output of this layer feeds the next one
+            if output_attentions:
+                all_self_attentions = all_self_attentions + (layer_outputs[1],)
+                if self.config.add_cross_attention:  # NOTE(review): index 2 exists only if the layer ran cross-attention (encoder_hidden_states given) — confirm callers
+                    all_cross_attentions = all_cross_attentions + (layer_outputs[2],)
+
+        if output_hidden_states:  # append the final layer's output as well
+            all_hidden_states = all_hidden_states + (hidden_states,)
+
+        if not return_dict:
+            return tuple(  # tuple form: None entries are dropped, so positions depend on which outputs were requested
+                v
+                for v in [
+                    hidden_states,
+                    past_key_values,
+                    all_hidden_states,
+                    all_self_attentions,
+                    all_cross_attentions,
+                ]
+                if v is not None
+            )
+        return BaseModelOutputWithPastAndCrossAttentions(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attentions,
+            cross_attentions=all_cross_attentions,
+        )
+
+
+class ElectraDiscriminatorPredictions(nn.Module):
+    """Prediction module for the discriminator, made up of two dense layers."""
+
+    def __init__(self, config):
+        super().__init__()
+
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        self.activation = get_activation(config.hidden_act)  # activation follows the config (the generator head below hard-codes gelu)
+        self.dense_prediction = nn.Linear(config.hidden_size, 1)  # a single output feature per input position
+        self.config = config
+
+    def forward(self, discriminator_hidden_states):
+        hidden_states = self.dense(discriminator_hidden_states)
+        hidden_states = self.activation(hidden_states)
+        logits = self.dense_prediction(hidden_states).squeeze(-1)  # drops the trailing size-1 dim produced by dense_prediction
+
+        return logits
+
+
+class ElectraGeneratorPredictions(nn.Module):
+    """Prediction module for the generator, made up of two dense layers."""
+
+    def __init__(self, config):
+        super().__init__()
+
+        self.activation = get_activation("gelu")  # fixed to gelu; config.hidden_act is not consulted here
+        self.LayerNorm = nn.LayerNorm(config.embedding_size, eps=config.layer_norm_eps)  # normalizes over embedding_size, the dense layer's output width
+        self.dense = nn.Linear(config.hidden_size, config.embedding_size)  # maps hidden_size -> embedding_size
+
+    def forward(self, generator_hidden_states):
+        hidden_states = self.dense(generator_hidden_states)
+        hidden_states = self.activation(hidden_states)
+        hidden_states = self.LayerNorm(hidden_states)  # order is dense -> activation -> LayerNorm
+
+        return hidden_states
+
+
+@auto_docstring
+class ElectraPreTrainedModel(PreTrainedModel):  # shared base: config type, TF weight loader, checkpoint prefix and weight init
+    config: ElectraConfig
+    load_tf_weights = load_tf_weights_in_electra
+    base_model_prefix = "electra"  # matches the `self.electra` attribute used by the task models in this file
+    supports_gradient_checkpointing = True
+
+    def _init_weights(self, module):
+        """Initialize the weights"""
+        if isinstance(module, nn.Linear):
+            # Slightly different from the TF version which uses truncated_normal for initialization
+            # cf https://github.com/pytorch/pytorch/pull/5617
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.bias is not None:  # the bias is zeroed only when the layer has one
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.padding_idx is not None:  # the padding row is reset to zeros after the normal init
+                module.weight.data[module.padding_idx].zero_()
+        elif isinstance(module, nn.LayerNorm):  # LayerNorm is set to bias 0 and weight 1
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Output type of [`ElectraForPreTraining`].
+    """
+)
+class ElectraForPreTrainingOutput(ModelOutput):
+    r"""
+    loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
+        Total loss of the ELECTRA objective.
+    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
+        Prediction scores of the head (scores for each token before SoftMax).
+    """
+
+    loss: Optional[torch.FloatTensor] = None  # every field defaults to None
+    logits: Optional[torch.FloatTensor] = None
+    hidden_states: Optional[tuple[torch.FloatTensor]] = None  # not described in the docstring above; ElectraForPreTraining fills it from the backbone output's `hidden_states`
+    attentions: Optional[tuple[torch.FloatTensor]] = None  # not described in the docstring above; presumably the backbone's attention weights — TODO confirm
+
+
+@auto_docstring
+class ElectraModel(ElectraPreTrainedModel):  # backbone: embeddings, optional embedding->hidden projection, then the encoder stack
+    def __init__(self, config):
+        super().__init__(config)
+        self.embeddings = ElectraEmbeddings(config)
+
+        if config.embedding_size != config.hidden_size:  # the projection exists only when the widths differ; forward checks for it with hasattr
+            self.embeddings_project = nn.Linear(config.embedding_size, config.hidden_size)
+
+        self.encoder = ElectraEncoder(config)
+        self.config = config
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.embeddings.word_embeddings
+
+    def set_input_embeddings(self, value):
+        self.embeddings.word_embeddings = value
+
+    def _prune_heads(self, heads_to_prune):
+        """
+        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
+        class PreTrainedModel
+        """
+        for layer, heads in heads_to_prune.items():
+            self.encoder.layer[layer].attention.prune_heads(heads)  # only the self-attention module of each layer is pruned here
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        past_key_values: Optional[list[torch.FloatTensor]] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple[torch.Tensor], BaseModelOutputWithCrossAttentions]:  # NOTE(review): the encoder actually builds BaseModelOutputWithPastAndCrossAttentions
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions  # None means "use the config default"
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if input_ids is not None and inputs_embeds is not None:  # exactly one of input_ids / inputs_embeds must be given
+            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+        elif input_ids is not None:
+            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
+            input_shape = input_ids.size()
+        elif inputs_embeds is not None:
+            input_shape = inputs_embeds.size()[:-1]  # drop the trailing embedding dimension
+        else:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+        batch_size, seq_length = input_shape  # unpacking requires exactly two dimensions
+        device = input_ids.device if input_ids is not None else inputs_embeds.device
+
+        past_key_values_length = 0
+        if past_key_values is not None:
+            past_key_values_length = (
+                past_key_values[0][0].shape[-2]  # legacy (non-Cache) input: length is read from the first entry's second-to-last dim
+                if not isinstance(past_key_values, Cache)
+                else past_key_values.get_seq_length()
+            )
+
+        if attention_mask is None:  # default mask: all ones over the input shape
+            attention_mask = torch.ones(input_shape, device=device)
+        if token_type_ids is None:
+            if hasattr(self.embeddings, "token_type_ids"):  # reuse the embeddings module's stored token_type_ids, sliced to seq_length
+                buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length]
+                buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length)
+                token_type_ids = buffered_token_type_ids_expanded
+            else:  # otherwise fall back to all zeros
+                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
+
+        extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape)
+
+        # If a 2D or 3D attention mask is provided for the cross-attention
+        # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
+        if self.config.is_decoder and encoder_hidden_states is not None:
+            encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()  # unpacking requires a 3-D tensor
+            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
+            if encoder_attention_mask is None:  # default: all ones over the encoder positions
+                encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
+            encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
+        else:
+            encoder_extended_attention_mask = None
+
+        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)  # the encoder later indexes the result per layer when it is not None
+
+        hidden_states = self.embeddings(
+            input_ids=input_ids,
+            position_ids=position_ids,
+            token_type_ids=token_type_ids,
+            inputs_embeds=inputs_embeds,
+            past_key_values_length=past_key_values_length,
+        )
+
+        if hasattr(self, "embeddings_project"):  # present only when embedding_size != hidden_size
+            hidden_states = self.embeddings_project(hidden_states)
+
+        hidden_states = self.encoder(  # despite the name, this now holds the encoder's whole output (tuple or ModelOutput)
+            hidden_states,
+            attention_mask=extended_attention_mask,
+            head_mask=head_mask,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_extended_attention_mask,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,  # cache_position is not forwarded, so the encoder keeps its default of None
+        )
+
+        return hidden_states
+
+
+class ElectraClassificationHead(nn.Module):
+    """Head for sentence-level classification tasks."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        classifier_dropout = (  # classifier-specific rate, falling back to the general hidden dropout when unset
+            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
+        )
+        self.activation = get_activation("gelu")
+        self.dropout = nn.Dropout(classifier_dropout)  # one module, applied twice in forward
+        self.out_proj = nn.Linear(config.hidden_size, config.num_labels)
+
+    def forward(self, features, **kwargs):  # **kwargs is accepted but not used in the body
+        x = features[:, 0, :]  # take <s> token (equiv. to [CLS])
+        x = self.dropout(x)
+        x = self.dense(x)
+        x = self.activation(x)  # although BERT uses tanh here, it seems Electra authors used gelu here
+        x = self.dropout(x)
+        x = self.out_proj(x)  # final projection to config.num_labels outputs
+        return x
+
+
+# Copied from transformers.models.xlm.modeling_xlm.XLMSequenceSummary with XLM->Electra
+class ElectraSequenceSummary(nn.Module):
+    r"""
+    Compute a single vector summary of a sequence hidden states.
+
+    Args:
+        config ([`ElectraConfig`]):
+            The config used by the model. Relevant arguments in the config class of the model are (refer to the actual
+            config class of your model for the default values it uses):
+
+            - **summary_type** (`str`) -- The method to use to make this summary. Accepted values are:
+
+                - `"last"` -- Take the last token hidden state (like XLNet)
+                - `"first"` -- Take the first token hidden state (like Bert)
+                - `"mean"` -- Take the mean of all tokens hidden states
+                - `"cls_index"` -- Supply a Tensor of classification token position (GPT/GPT-2)
+                - `"attn"` -- Not implemented now, use multi-head attention
+
+            - **summary_use_proj** (`bool`) -- Add a projection after the vector extraction.
+            - **summary_proj_to_labels** (`bool`) -- If `True`, the projection outputs to `config.num_labels` classes
+              (otherwise to `config.hidden_size`).
+            - **summary_activation** (`Optional[str]`) -- Set to `"tanh"` to add a tanh activation to the output,
+              another string or `None` will add no activation.
+            - **summary_first_dropout** (`float`) -- Optional dropout probability before the projection and activation.
+            - **summary_last_dropout** (`float`)-- Optional dropout probability after the projection and activation.
+    """
+
+    def __init__(self, config: ElectraConfig):
+        super().__init__()
+
+        self.summary_type = getattr(config, "summary_type", "last")  # falls back to "last" when the config has no such attribute
+        if self.summary_type == "attn":
+            # We should use a standard multi-head attention module with absolute positional embedding for that.
+            # Cf. https://github.com/zihangdai/xlnet/blob/master/modeling.py#L253-L276
+            # We can probably just use the multi-head attention module of PyTorch >=1.1.0
+            raise NotImplementedError
+
+        self.summary = nn.Identity()  # replaced by a Linear below when summary_use_proj is set and truthy
+        if hasattr(config, "summary_use_proj") and config.summary_use_proj:
+            if hasattr(config, "summary_proj_to_labels") and config.summary_proj_to_labels and config.num_labels > 0:
+                num_classes = config.num_labels
+            else:  # without a label projection the output width stays hidden_size
+                num_classes = config.hidden_size
+            self.summary = nn.Linear(config.hidden_size, num_classes)
+
+        activation_string = getattr(config, "summary_activation", None)
+        self.activation: Callable = get_activation(activation_string) if activation_string else nn.Identity()  # any truthy string goes through get_activation
+
+        self.first_dropout = nn.Identity()  # both dropouts stay Identity unless the config attribute exists and is > 0
+        if hasattr(config, "summary_first_dropout") and config.summary_first_dropout > 0:
+            self.first_dropout = nn.Dropout(config.summary_first_dropout)
+
+        self.last_dropout = nn.Identity()
+        if hasattr(config, "summary_last_dropout") and config.summary_last_dropout > 0:
+            self.last_dropout = nn.Dropout(config.summary_last_dropout)
+
+    def forward(
+        self, hidden_states: torch.FloatTensor, cls_index: Optional[torch.LongTensor] = None
+    ) -> torch.FloatTensor:
+        """
+        Compute a single vector summary of a sequence hidden states.
+
+        Args:
+            hidden_states (`torch.FloatTensor` of shape `[batch_size, seq_len, hidden_size]`):
+                The hidden states of the last layer.
+            cls_index (`torch.LongTensor` of shape `[batch_size]` or `[batch_size, ...]` where ... are optional leading dimensions of `hidden_states`, *optional*):
+                Used if `summary_type == "cls_index"` and takes the last token of the sequence as classification token.
+
+        Returns:
+            `torch.FloatTensor`: The summary of the sequence hidden states.
+        """
+        if self.summary_type == "last":
+            output = hidden_states[:, -1]
+        elif self.summary_type == "first":
+            output = hidden_states[:, 0]
+        elif self.summary_type == "mean":
+            output = hidden_states.mean(dim=1)
+        elif self.summary_type == "cls_index":
+            if cls_index is None:  # no index supplied: build one that points at the last position of dim -2
+                cls_index = torch.full_like(
+                    hidden_states[..., :1, :],
+                    hidden_states.shape[-2] - 1,
+                    dtype=torch.long,
+                )
+            else:  # add two trailing dims, then expand the last one to hidden size so gather can select whole vectors
+                cls_index = cls_index.unsqueeze(-1).unsqueeze(-1)
+                cls_index = cls_index.expand((-1,) * (cls_index.dim() - 1) + (hidden_states.size(-1),))
+            # shape of cls_index: (bsz, XX, 1, hidden_size) where XX are optional leading dim of hidden_states
+            output = hidden_states.gather(-2, cls_index).squeeze(-2)  # shape (bsz, XX, hidden_size)
+        elif self.summary_type == "attn":  # NOTE(review): any other summary_type matches no branch and leaves `output` unbound below
+            raise NotImplementedError
+
+        output = self.first_dropout(output)  # pipeline order: first dropout -> projection -> activation -> last dropout
+        output = self.summary(output)
+        output = self.activation(output)
+        output = self.last_dropout(output)
+
+        return output
+
+
+@auto_docstring(
+    custom_intro="""
+    ELECTRA Model transformer with a sequence classification/regression head on top (a linear layer on top of the
+    pooled output) e.g. for GLUE tasks.
+    """
+)
+class ElectraForSequenceClassification(ElectraPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+        self.config = config
+        self.electra = ElectraModel(config)  # backbone
+        self.classifier = ElectraClassificationHead(config)  # head that reads position 0 of the sequence output
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple[torch.Tensor], SequenceClassifierOutput]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict  # None means "use the config default"
+
+        discriminator_hidden_states = self.electra(  # full backbone output (tuple or ModelOutput), not only hidden states
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        sequence_output = discriminator_hidden_states[0]  # first element of the backbone output (the encoder's final hidden states)
+        logits = self.classifier(sequence_output)
+
+        loss = None  # stays None when no labels are given
+        if labels is not None:
+            if self.config.problem_type is None:  # inferred from num_labels / label dtype and written back onto the shared config
+                if self.num_labels == 1:
+                    self.config.problem_type = "regression"
+                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
+                    self.config.problem_type = "single_label_classification"
+                else:  # every remaining case, e.g. several labels with non-integer dtype
+                    self.config.problem_type = "multi_label_classification"
+
+            if self.config.problem_type == "regression":
+                loss_fct = MSELoss()
+                if self.num_labels == 1:  # squeeze both sides so logits and labels line up for a single target
+                    loss = loss_fct(logits.squeeze(), labels.squeeze())
+                else:
+                    loss = loss_fct(logits, labels)
+            elif self.config.problem_type == "single_label_classification":
+                loss_fct = CrossEntropyLoss()
+                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))  # flattened to (N, num_labels) against (N,)
+            elif self.config.problem_type == "multi_label_classification":
+                loss_fct = BCEWithLogitsLoss()
+                loss = loss_fct(logits, labels)
+
+        if not return_dict:  # tuple form: optional loss, then logits, then the backbone's remaining outputs
+            output = (logits,) + discriminator_hidden_states[1:]
+            return ((loss,) + output) if loss is not None else output
+
+        return SequenceClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=discriminator_hidden_states.hidden_states,
+            attentions=discriminator_hidden_states.attentions,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    Electra model with a binary classification head on top as used during pretraining for identifying generated tokens.
+
+    It is recommended to load the discriminator checkpoint into that model.
+    """
+)
+class ElectraForPreTraining(ElectraPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+
+        # Backbone encoder, followed by the discriminator head applied to its sequence output.
+        self.electra = ElectraModel(config)
+        self.discriminator_predictions = ElectraDiscriminatorPredictions(config)
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple[torch.Tensor], ElectraForPreTrainingOutput]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the ELECTRA loss. Input should be a sequence of tokens (see `input_ids` docstring)
+            Indices should be in `[0, 1]`:
+
+            - 0 indicates the token is an original token,
+            - 1 indicates the token was replaced.
+
+        Examples:
+
+        ```python
+        >>> from transformers import ElectraForPreTraining, AutoTokenizer
+        >>> import torch
+
+        >>> discriminator = ElectraForPreTraining.from_pretrained("google/electra-base-discriminator")
+        >>> tokenizer = AutoTokenizer.from_pretrained("google/electra-base-discriminator")
+
+        >>> sentence = "The quick brown fox jumps over the lazy dog"
+        >>> fake_sentence = "The quick brown fox fake over the lazy dog"
+
+        >>> fake_tokens = tokenizer.tokenize(fake_sentence, add_special_tokens=True)
+        >>> fake_inputs = tokenizer.encode(fake_sentence, return_tensors="pt")
+        >>> discriminator_outputs = discriminator(fake_inputs)
+        >>> predictions = torch.round((torch.sign(discriminator_outputs[0]) + 1) / 2)
+
+        >>> fake_tokens
+        ['[CLS]', 'the', 'quick', 'brown', 'fox', 'fake', 'over', 'the', 'lazy', 'dog', '[SEP]']
+
+        >>> predictions.squeeze().tolist()
+        [0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0]
+        ```"""
+        # Fall back to the config default when the caller did not choose an output format.
+        if return_dict is None:
+            return_dict = self.config.use_return_dict
+
+        encoder_outputs = self.electra(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        sequence_output = encoder_outputs[0]
+        logits = self.discriminator_predictions(sequence_output)
+
+        loss = None
+        if labels is not None:
+            seq_len = sequence_output.shape[1]
+            bce = nn.BCEWithLogitsLoss()
+            if attention_mask is None:
+                loss = bce(logits.view(-1, seq_len), labels.float())
+            else:
+                # Only positions whose attention-mask entry equals 1 contribute to the loss.
+                keep = attention_mask.view(-1, seq_len) == 1
+                kept_logits = logits.view(-1, seq_len)[keep]
+                kept_labels = labels[keep]
+                loss = bce(kept_logits, kept_labels.float())
+
+        if return_dict:
+            return ElectraForPreTrainingOutput(
+                loss=loss,
+                logits=logits,
+                hidden_states=encoder_outputs.hidden_states,
+                attentions=encoder_outputs.attentions,
+            )
+
+        # Tuple output: logits followed by whatever the encoder returned after its first element.
+        output = (logits,) + encoder_outputs[1:]
+        return output if loss is None else (loss,) + output
+
+
+@auto_docstring(
+    custom_intro="""
+    Electra model with a language modeling head on top.
+
+    Even though both the discriminator and generator may be loaded into this model, the generator is the only model of
+    the two to have been trained for the masked language modeling task.
+    """
+)
+class ElectraForMaskedLM(ElectraPreTrainedModel):
+    _tied_weights_keys = ["generator_lm_head.weight"]
+
+    def __init__(self, config):
+        super().__init__(config)
+
+        self.electra = ElectraModel(config)
+        self.generator_predictions = ElectraGeneratorPredictions(config)
+
+        # Final projection from the embedding width onto the vocabulary.
+        self.generator_lm_head = nn.Linear(config.embedding_size, config.vocab_size)
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_output_embeddings(self):
+        # The LM head acts as the output embedding layer of this model.
+        return self.generator_lm_head
+
+    def set_output_embeddings(self, word_embeddings):
+        self.generator_lm_head = word_embeddings
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple[torch.Tensor], MaskedLMOutput]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
+            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
+            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
+        """
+        # Fall back to the config default when the caller did not choose an output format.
+        if return_dict is None:
+            return_dict = self.config.use_return_dict
+
+        encoder_outputs = self.electra(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        # Generator head first, then the vocabulary projection.
+        generator_features = self.generator_predictions(encoder_outputs[0])
+        prediction_scores = self.generator_lm_head(generator_features)
+
+        # Masked language modeling softmax layer
+        loss = None
+        if labels is not None:
+            loss_fct = nn.CrossEntropyLoss()  # -100 index = padding token
+            flat_scores = prediction_scores.view(-1, self.config.vocab_size)
+            loss = loss_fct(flat_scores, labels.view(-1))
+
+        if return_dict:
+            return MaskedLMOutput(
+                loss=loss,
+                logits=prediction_scores,
+                hidden_states=encoder_outputs.hidden_states,
+                attentions=encoder_outputs.attentions,
+            )
+
+        output = (prediction_scores,) + encoder_outputs[1:]
+        return output if loss is None else (loss,) + output
+
+
+@auto_docstring(
+    custom_intro="""
+    Electra model with a token classification head on top.
+
+    Both the discriminator and generator may be loaded into this model.
+    """
+)
+class ElectraForTokenClassification(ElectraPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+
+        self.electra = ElectraModel(config)
+
+        # A dedicated classifier dropout takes precedence; otherwise reuse the hidden dropout rate.
+        dropout_rate = config.classifier_dropout
+        if dropout_rate is None:
+            dropout_rate = config.hidden_dropout_prob
+        self.dropout = nn.Dropout(dropout_rate)
+        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple[torch.Tensor], TokenClassifierOutput]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
+        """
+        # Fall back to the config default when the caller did not choose an output format.
+        if return_dict is None:
+            return_dict = self.config.use_return_dict
+
+        encoder_outputs = self.electra(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        # Dropout on the encoder's sequence output, then one score vector per token.
+        token_features = self.dropout(encoder_outputs[0])
+        logits = self.classifier(token_features)
+
+        loss = None
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()
+            flat_logits = logits.view(-1, self.num_labels)
+            loss = loss_fct(flat_logits, labels.view(-1))
+
+        if return_dict:
+            return TokenClassifierOutput(
+                loss=loss,
+                logits=logits,
+                hidden_states=encoder_outputs.hidden_states,
+                attentions=encoder_outputs.attentions,
+            )
+
+        output = (logits,) + encoder_outputs[1:]
+        return output if loss is None else (loss,) + output
+
+
+@auto_docstring
+class ElectraForQuestionAnswering(ElectraPreTrainedModel):
+    config: ElectraConfig
+    base_model_prefix = "electra"
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+
+        self.electra = ElectraModel(config)
+        # One linear layer scores every token. `forward` unpacks its last dimension into exactly two tensors
+        # (start / end), so `config.num_labels` has to be 2 for that unpacking to succeed.
+        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        start_positions: Optional[torch.Tensor] = None,
+        end_positions: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple[torch.Tensor], QuestionAnsweringModelOutput]:
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # Forward the resolved `return_dict` so the encoder's output format matches what the code below expects:
+        # the dict branch reads `.hidden_states` / `.attentions`, which a plain tuple does not provide. The
+        # other task heads in this file pass it the same way.
+        discriminator_hidden_states = self.electra(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        sequence_output = discriminator_hidden_states[0]
+
+        # Split the per-token scores into start and end logits and drop the trailing singleton dimension.
+        logits = self.qa_outputs(sequence_output)
+        start_logits, end_logits = logits.split(1, dim=-1)
+        start_logits = start_logits.squeeze(-1).contiguous()
+        end_logits = end_logits.squeeze(-1).contiguous()
+
+        total_loss = None
+        if start_positions is not None and end_positions is not None:
+            # If we are on multi-GPU, split add a dimension
+            if len(start_positions.size()) > 1:
+                start_positions = start_positions.squeeze(-1)
+            if len(end_positions.size()) > 1:
+                end_positions = end_positions.squeeze(-1)
+            # sometimes the start/end positions are outside our model inputs, we ignore these terms
+            # (out-of-range targets are clamped to `ignored_index`, which the loss then skips)
+            ignored_index = start_logits.size(1)
+            start_positions = start_positions.clamp(0, ignored_index)
+            end_positions = end_positions.clamp(0, ignored_index)
+
+            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
+            start_loss = loss_fct(start_logits, start_positions)
+            end_loss = loss_fct(end_logits, end_positions)
+            # The reported loss is the mean of the start and end losses.
+            total_loss = (start_loss + end_loss) / 2
+
+        if not return_dict:
+            output = (
+                start_logits,
+                end_logits,
+            ) + discriminator_hidden_states[1:]
+            return ((total_loss,) + output) if total_loss is not None else output
+
+        return QuestionAnsweringModelOutput(
+            loss=total_loss,
+            start_logits=start_logits,
+            end_logits=end_logits,
+            hidden_states=discriminator_hidden_states.hidden_states,
+            attentions=discriminator_hidden_states.attentions,
+        )
+
+
+@auto_docstring
+class ElectraForMultipleChoice(ElectraPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+
+        self.electra = ElectraModel(config)
+        self.sequence_summary = ElectraSequenceSummary(config)
+        # One scalar score per flattened (example, choice) row.
+        self.classifier = nn.Linear(config.hidden_size, 1)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple[torch.Tensor], MultipleChoiceModelOutput]:
+        r"""
+        input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
+            Indices of input sequence tokens in the vocabulary.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
+            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
+            1]`:
+
+            - 0 corresponds to a *sentence A* token,
+            - 1 corresponds to a *sentence B* token.
+
+            [What are token type IDs?](../glossary#token-type-ids)
+        position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
+            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+            config.max_position_embeddings - 1]`.
+
+            [What are position IDs?](../glossary#position-ids)
+        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
+            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+            model's internal embedding lookup matrix.
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
+            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
+            `input_ids` above)
+        """
+
+        def _merge_choices(tensor):
+            # Collapse every leading dimension so each choice becomes its own row; `None` passes through.
+            if tensor is None:
+                return None
+            return tensor.view(-1, tensor.size(-1))
+
+        # Fall back to the config default when the caller did not choose an output format.
+        if return_dict is None:
+            return_dict = self.config.use_return_dict
+
+        # The number of choices is the second dimension of whichever input was supplied.
+        if input_ids is not None:
+            num_choices = input_ids.shape[1]
+        else:
+            num_choices = inputs_embeds.shape[1]
+
+        input_ids = _merge_choices(input_ids)
+        attention_mask = _merge_choices(attention_mask)
+        token_type_ids = _merge_choices(token_type_ids)
+        position_ids = _merge_choices(position_ids)
+        if inputs_embeds is not None:
+            # Embeddings carry one extra trailing dimension, so keep the last two sizes.
+            inputs_embeds = inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
+
+        encoder_outputs = self.electra(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        # Summarize each row, score it, then regroup the scores per example.
+        summary = self.sequence_summary(encoder_outputs[0])
+        logits = self.classifier(summary)
+        reshaped_logits = logits.view(-1, num_choices)
+
+        loss = None
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()
+            loss = loss_fct(reshaped_logits, labels)
+
+        if return_dict:
+            return MultipleChoiceModelOutput(
+                loss=loss,
+                logits=reshaped_logits,
+                hidden_states=encoder_outputs.hidden_states,
+                attentions=encoder_outputs.attentions,
+            )
+
+        output = (reshaped_logits,) + encoder_outputs[1:]
+        return output if loss is None else (loss,) + output
+
+
+@auto_docstring(
+    custom_intro="""
+    ELECTRA Model with a `language modeling` head on top for CLM fine-tuning.
+    """
+)
+class ElectraForCausalLM(ElectraPreTrainedModel, GenerationMixin):
+    _tied_weights_keys = ["generator_lm_head.weight"]
+
+    def __init__(self, config):
+        super().__init__(config)
+
+        # Construction still proceeds for a non-decoder config; the user is only warned.
+        if not config.is_decoder:
+            logger.warning("If you want to use `ElectraForCausalLM` as a standalone, add `is_decoder=True.`")
+
+        self.electra = ElectraModel(config)
+        self.generator_predictions = ElectraGeneratorPredictions(config)
+        self.generator_lm_head = nn.Linear(config.embedding_size, config.vocab_size)
+
+        # Initialize weights and apply final processing (same finalization hook as the other heads in this file)
+        self.post_init()
+
+    def get_output_embeddings(self):
+        # The LM head acts as the output embedding layer of this model.
+        return self.generator_lm_head
+
+    def set_output_embeddings(self, new_embeddings):
+        self.generator_lm_head = new_embeddings
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        past_key_values: Optional[list[torch.Tensor]] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        **kwargs,
+    ) -> Union[tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
+            `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
+            ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
+
+        Example:
+
+        ```python
+        >>> from transformers import AutoTokenizer, ElectraForCausalLM, ElectraConfig
+        >>> import torch
+
+        >>> tokenizer = AutoTokenizer.from_pretrained("google/electra-base-generator")
+        >>> config = ElectraConfig.from_pretrained("google/electra-base-generator")
+        >>> config.is_decoder = True
+        >>> model = ElectraForCausalLM.from_pretrained("google/electra-base-generator", config=config)
+
+        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+        >>> outputs = model(**inputs)
+
+        >>> prediction_logits = outputs.logits
+        ```"""
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        # Caching is switched off whenever labels are supplied, i.e. when a loss is being computed.
+        if labels is not None:
+            use_cache = False
+
+        outputs = self.electra(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_attention_mask,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        # Generator head first, then the vocabulary projection.
+        sequence_output = outputs[0]
+        prediction_scores = self.generator_lm_head(self.generator_predictions(sequence_output))
+
+        lm_loss = None
+        if labels is not None:
+            # Extra keyword arguments received by `forward` are passed on to the loss function unchanged.
+            lm_loss = self.loss_function(
+                prediction_scores,
+                labels,
+                vocab_size=self.config.vocab_size,
+                **kwargs,
+            )
+
+        if not return_dict:
+            output = (prediction_scores,) + outputs[1:]
+            return ((lm_loss,) + output) if lm_loss is not None else output
+
+        return CausalLMOutputWithCrossAttentions(
+            loss=lm_loss,
+            logits=prediction_scores,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            cross_attentions=outputs.cross_attentions,
+        )
+
+
+# Names exported by `from <this module> import *`; several of them are defined earlier in the module.
+__all__ = [
+    "ElectraForCausalLM",
+    "ElectraForMaskedLM",
+    "ElectraForMultipleChoice",
+    "ElectraForPreTraining",
+    "ElectraForQuestionAnswering",
+    "ElectraForSequenceClassification",
+    "ElectraForTokenClassification",
+    "ElectraModel",
+    "ElectraPreTrainedModel",
+    "load_tf_weights_in_electra",
+]
diff --git a/phivenv/Lib/site-packages/transformers/models/electra/modeling_flax_electra.py b/phivenv/Lib/site-packages/transformers/models/electra/modeling_flax_electra.py
new file mode 100644
index 0000000000000000000000000000000000000000..14d845476d62f9defb2de4392742037762fb959f
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/electra/modeling_flax_electra.py
@@ -0,0 +1,1614 @@
+# coding=utf-8
+# Copyright 2021 The Google Flax Team Authors and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Callable, Optional
+
+import flax
+import flax.linen as nn
+import jax
+import jax.numpy as jnp
+import numpy as np
+from flax.core.frozen_dict import FrozenDict, freeze, unfreeze
+from flax.linen import combine_masks, make_causal_mask
+from flax.linen import partitioning as nn_partitioning
+from flax.linen.attention import dot_product_attention_weights
+from flax.traverse_util import flatten_dict, unflatten_dict
+from jax import lax
+
+from ...modeling_flax_outputs import (
+    FlaxBaseModelOutput,
+    FlaxBaseModelOutputWithPastAndCrossAttentions,
+    FlaxCausalLMOutputWithCrossAttentions,
+    FlaxMaskedLMOutput,
+    FlaxMultipleChoiceModelOutput,
+    FlaxQuestionAnsweringModelOutput,
+    FlaxSequenceClassifierOutput,
+    FlaxTokenClassifierOutput,
+)
+from ...modeling_flax_utils import (
+    ACT2FN,
+    FlaxPreTrainedModel,
+    append_call_sample_docstring,
+    append_replace_return_docstrings,
+    overwrite_call_docstring,
+)
+from ...utils import ModelOutput, add_start_docstrings, add_start_docstrings_to_model_forward, logging
+from .configuration_electra import ElectraConfig
+
+
+# Module-level logger obtained from the library's `logging` helper.
+logger = logging.get_logger(__name__)
+
+# Checkpoint and config names; presumably consumed by the docstring helpers imported above (their use lies
+# outside this excerpt -- TODO confirm).
+_CHECKPOINT_FOR_DOC = "google/electra-small-discriminator"
+_CONFIG_FOR_DOC = "ElectraConfig"
+
+# Short alias for `flax.linen.partitioning.remat`.
+remat = nn_partitioning.remat
+
+
+@flax.struct.dataclass
+class FlaxElectraForPreTrainingOutput(ModelOutput):
+    """
+    Output type of [`ElectraForPreTraining`].
+
+    Args:
+        logits (`jnp.ndarray` of shape `(batch_size, sequence_length, config.vocab_size)`):
+            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+        hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape
+            `(batch_size, sequence_length, hidden_size)`.
+
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`.
+
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+            heads.
+    """
+
+    # NOTE(review): the `logits` entry above reads like it was copied from a masked-LM output. A pre-training
+    # (discriminator) head would presumably emit one score per token, i.e. `(batch_size, sequence_length)` --
+    # confirm against the Flax pre-training module before relying on the documented shape.
+    # Every field defaults to None, so an instance can be built with any subset of them.
+    logits: jnp.ndarray = None
+    hidden_states: Optional[tuple[jnp.ndarray]] = None
+    attentions: Optional[tuple[jnp.ndarray]] = None
+
+
+ELECTRA_START_DOCSTRING = r"""
+
+    This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its models (such as downloading, saving and converting weights from PyTorch models).
+
+    This model is also a Flax Linen
+    [flax.nn.Module](https://flax.readthedocs.io/en/latest/_autosummary/flax.nn.module.html) subclass. Use it as a
+    regular Flax Module and refer to the Flax documentation for all matter related to general usage and behavior.
+
+    Finally, this model supports inherent JAX features such as:
+
+    - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
+    - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
+    - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
+    - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)
+
+    Parameters:
+        config ([`ElectraConfig`]): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+ELECTRA_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`numpy.ndarray` of shape `({0})`):
+            Indices of input sequence tokens in the vocabulary.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        attention_mask (`numpy.ndarray` of shape `({0})`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
+        token_type_ids (`numpy.ndarray` of shape `({0})`, *optional*):
+            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
+            1]`:
+
+            - 0 corresponds to a *sentence A* token,
+            - 1 corresponds to a *sentence B* token.
+
+            [What are token type IDs?](../glossary#token-type-ids)
+        position_ids (`numpy.ndarray` of shape `({0})`, *optional*):
+            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+            config.max_position_embeddings - 1]`.
+        head_mask (`numpy.ndarray` of shape `({0})`, *optional*):
+            Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+
+"""
+
+
+class FlaxElectraEmbeddings(nn.Module):
+    """
+    Construct the embeddings from word, position and token_type embeddings.
+
+    The three lookups are summed, layer-normalized and then passed through dropout. All three tables have width
+    `config.embedding_size`. `__call__` accepts `attention_mask` but does not use it.
+    """
+
+    config: ElectraConfig
+    dtype: jnp.dtype = jnp.float32  # the dtype of the computation
+
+    def setup(self):
+        # Each table is initialized from a normal distribution with stddev `config.initializer_range`.
+        # Note that `self.dtype` is forwarded only to the LayerNorm below; the `nn.Embed` tables are created
+        # without an explicit dtype argument.
+        self.word_embeddings = nn.Embed(
+            self.config.vocab_size,
+            self.config.embedding_size,
+            embedding_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
+        )
+        self.position_embeddings = nn.Embed(
+            self.config.max_position_embeddings,
+            self.config.embedding_size,
+            embedding_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
+        )
+        self.token_type_embeddings = nn.Embed(
+            self.config.type_vocab_size,
+            self.config.embedding_size,
+            embedding_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
+        )
+        self.LayerNorm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype)
+        self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob)
+
+    # Copied from transformers.models.bert.modeling_flax_bert.FlaxBertEmbeddings.__call__
+    def __call__(self, input_ids, token_type_ids, position_ids, attention_mask, deterministic: bool = True):
+        # Embed
+        inputs_embeds = self.word_embeddings(input_ids.astype("i4"))
+        position_embeds = self.position_embeddings(position_ids.astype("i4"))
+        token_type_embeddings = self.token_type_embeddings(token_type_ids.astype("i4"))
+
+        # Sum all embeddings
+        hidden_states = inputs_embeds + token_type_embeddings + position_embeds
+
+        # Layer Norm
+        hidden_states = self.LayerNorm(hidden_states)
+        hidden_states = self.dropout(hidden_states, deterministic=deterministic)
+        return hidden_states
+
+
+# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertSelfAttention with Bert->Electra
+class FlaxElectraSelfAttention(nn.Module):
+    """Multi-head attention with separate query/key/value projections.
+
+    The module acts as cross-attention when `key_value_states` is passed to `__call__`, and as
+    self-attention otherwise. When `causal` is true, a causal mask is built in `setup` and combined with
+    any supplied attention mask, and key/value states can be accumulated in the "cache" variable
+    collection for step-by-step decoding.
+    """
+
+    config: ElectraConfig
+    causal: bool = False
+    dtype: jnp.dtype = jnp.float32  # the dtype of the computation
+
+    def setup(self):
+        """Validate the head configuration and create the projections (plus the causal mask if needed).
+
+        Raises:
+            ValueError: if `config.hidden_size` is not divisible by `config.num_attention_heads`.
+        """
+        self.head_dim = self.config.hidden_size // self.config.num_attention_heads
+        if self.config.hidden_size % self.config.num_attention_heads != 0:
+            # f-strings so that the offending values are actually interpolated into the message.
+            raise ValueError(
+                f"`config.hidden_size`: {self.config.hidden_size} has to be a multiple of "
+                f"`config.num_attention_heads`: {self.config.num_attention_heads}"
+            )
+
+        self.query = nn.Dense(
+            self.config.hidden_size,
+            dtype=self.dtype,
+            kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
+        )
+        self.key = nn.Dense(
+            self.config.hidden_size,
+            dtype=self.dtype,
+            kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
+        )
+        self.value = nn.Dense(
+            self.config.hidden_size,
+            dtype=self.dtype,
+            kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
+        )
+
+        if self.causal:
+            # Built once for `max_position_embeddings` positions and sliced per call in `__call__`.
+            self.causal_mask = make_causal_mask(
+                jnp.ones((1, self.config.max_position_embeddings), dtype="bool"), dtype="bool"
+            )
+
+    def _split_heads(self, hidden_states):
+        """Reshape the trailing axis into `(num_attention_heads, head_dim)`, keeping the first two axes."""
+        return hidden_states.reshape(hidden_states.shape[:2] + (self.config.num_attention_heads, self.head_dim))
+
+    def _merge_heads(self, hidden_states):
+        """Reshape everything after the first two axes back into a single `hidden_size` axis."""
+        return hidden_states.reshape(hidden_states.shape[:2] + (self.config.hidden_size,))
+
+    @nn.compact
+    # Copied from transformers.models.bart.modeling_flax_bart.FlaxBartAttention._concatenate_to_cache
+    def _concatenate_to_cache(self, key, value, query, attention_mask):
+        """
+        This function takes projected key, value states from a single input token and concatenates the states to cached
+        states from previous steps. This function is slightly adapted from the official Flax repository:
+        https://github.com/google/flax/blob/491ce18759622506588784b4fca0e4bf05f8c8cd/flax/linen/attention.py#L252
+
+        Returns the (possibly cache-extended) `key` and `value` together with the updated `attention_mask`.
+        On the initializing call (no existing cache variable) the inputs are returned unchanged after the
+        cache variables have been created.
+        """
+        # detect if we're initializing by absence of existing cache data.
+        is_initialized = self.has_variable("cache", "cached_key")
+        cached_key = self.variable("cache", "cached_key", jnp.zeros, key.shape, key.dtype)
+        cached_value = self.variable("cache", "cached_value", jnp.zeros, value.shape, value.dtype)
+        cache_index = self.variable("cache", "cache_index", lambda: jnp.array(0, dtype=jnp.int32))
+
+        if is_initialized:
+            *batch_dims, max_length, num_heads, depth_per_head = cached_key.value.shape
+            # update key, value caches with our new 1d spatial slices
+            cur_index = cache_index.value
+            indices = (0,) * len(batch_dims) + (cur_index, 0, 0)
+            key = lax.dynamic_update_slice(cached_key.value, key, indices)
+            value = lax.dynamic_update_slice(cached_value.value, value, indices)
+            cached_key.value = key
+            cached_value.value = value
+            num_updated_cache_vectors = query.shape[1]
+            cache_index.value = cache_index.value + num_updated_cache_vectors
+            # causal mask for cached decoder self-attention: our single query position should only attend to those key positions that have already been generated and cached, not the remaining zero elements.
+            pad_mask = jnp.broadcast_to(
+                jnp.arange(max_length) < cur_index + num_updated_cache_vectors,
+                tuple(batch_dims) + (1, num_updated_cache_vectors, max_length),
+            )
+            attention_mask = combine_masks(pad_mask, attention_mask)
+        return key, value, attention_mask
+
+    def __call__(
+        self,
+        hidden_states,
+        attention_mask,
+        layer_head_mask,
+        key_value_states: Optional[jnp.ndarray] = None,
+        init_cache: bool = False,
+        deterministic=True,
+        output_attentions: bool = False,
+    ):
+        """Compute attention over `hidden_states` (or over `key_value_states` for cross-attention).
+
+        Args:
+            hidden_states: input from which the queries are projected; also the source of keys and values
+                when `key_value_states` is None.
+            attention_mask: optional mask; positions with a value `> 0` are attended to, all others receive
+                the most negative finite bias of `self.dtype`.
+            layer_head_mask: optional per-head multiplier applied to the attention weights.
+            key_value_states: when given, keys and values are projected from this array instead.
+            init_cache: when true (and `causal` is set) the key/value cache variables are created.
+            deterministic: when false and the configured attention dropout is positive, a "dropout" RNG is
+                requested and dropout is applied to the attention weights.
+            output_attentions: whether to also return the attention weights.
+
+        Returns:
+            `(attn_output, attn_weights)` if `output_attentions` else `(attn_output,)`.
+        """
+        # if key_value_states are provided this layer is used as a cross-attention layer
+        # for the decoder
+        is_cross_attention = key_value_states is not None
+        batch_size = hidden_states.shape[0]
+
+        # get query proj
+        query_states = self.query(hidden_states)
+        # get key, value proj
+        if is_cross_attention:
+            # cross_attentions
+            key_states = self.key(key_value_states)
+            value_states = self.value(key_value_states)
+        else:
+            # self_attention
+            key_states = self.key(hidden_states)
+            value_states = self.value(hidden_states)
+
+        query_states = self._split_heads(query_states)
+        key_states = self._split_heads(key_states)
+        value_states = self._split_heads(value_states)
+
+        # handle cache prepare causal attention mask
+        if self.causal:
+            query_length, key_length = query_states.shape[1], key_states.shape[1]
+            if self.has_variable("cache", "cached_key"):
+                # With an existing cache, slice the mask rows starting at the current cache index and keep
+                # as many key columns as the cache holds.
+                mask_shift = self.variables["cache"]["cache_index"]
+                max_decoder_length = self.variables["cache"]["cached_key"].shape[1]
+                causal_mask = lax.dynamic_slice(
+                    self.causal_mask, (0, 0, mask_shift, 0), (1, 1, query_length, max_decoder_length)
+                )
+            else:
+                causal_mask = self.causal_mask[:, :, :query_length, :key_length]
+            causal_mask = jnp.broadcast_to(causal_mask, (batch_size,) + causal_mask.shape[1:])
+
+        # combine masks if needed
+        if attention_mask is not None and self.causal:
+            attention_mask = jnp.broadcast_to(jnp.expand_dims(attention_mask, axis=(-3, -2)), causal_mask.shape)
+            attention_mask = combine_masks(attention_mask, causal_mask)
+        elif self.causal:
+            attention_mask = causal_mask
+        elif attention_mask is not None:
+            attention_mask = jnp.expand_dims(attention_mask, axis=(-3, -2))
+
+        # During fast autoregressive decoding, we feed one position at a time,
+        # and cache the keys and values step by step.
+        if self.causal and (self.has_variable("cache", "cached_key") or init_cache):
+            key_states, value_states, attention_mask = self._concatenate_to_cache(
+                key_states, value_states, query_states, attention_mask
+            )
+
+        # Convert the boolean attention mask to an attention bias.
+        if attention_mask is not None:
+            # attention mask in the form of attention bias
+            attention_bias = lax.select(
+                attention_mask > 0,
+                jnp.full(attention_mask.shape, 0.0).astype(self.dtype),
+                jnp.full(attention_mask.shape, jnp.finfo(self.dtype).min).astype(self.dtype),
+            )
+        else:
+            attention_bias = None
+
+        dropout_rng = None
+        if not deterministic and self.config.attention_probs_dropout_prob > 0.0:
+            dropout_rng = self.make_rng("dropout")
+
+        attn_weights = dot_product_attention_weights(
+            query_states,
+            key_states,
+            bias=attention_bias,
+            dropout_rng=dropout_rng,
+            dropout_rate=self.config.attention_probs_dropout_prob,
+            broadcast_dropout=True,
+            deterministic=deterministic,
+            dtype=self.dtype,
+            precision=None,
+        )
+
+        # Mask heads if we want to
+        if layer_head_mask is not None:
+            attn_weights = jnp.einsum("...hqk,h->...hqk", attn_weights, layer_head_mask)
+
+        attn_output = jnp.einsum("...hqk,...khd->...qhd", attn_weights, value_states)
+        # Collapse the per-head axes back into a single trailing feature axis.
+        attn_output = attn_output.reshape(attn_output.shape[:2] + (-1,))
+
+        outputs = (attn_output, attn_weights) if output_attentions else (attn_output,)
+        return outputs
+
+
+# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertSelfOutput with Bert->Electra
+class FlaxElectraSelfOutput(nn.Module):
+    """Dense projection and dropout, followed by a residual add and layer normalization."""
+
+    config: ElectraConfig
+    dtype: jnp.dtype = jnp.float32  # the dtype of the computation
+
+    def setup(self):
+        cfg = self.config
+        self.dense = nn.Dense(
+            cfg.hidden_size,
+            kernel_init=jax.nn.initializers.normal(cfg.initializer_range),
+            dtype=self.dtype,
+        )
+        self.LayerNorm = nn.LayerNorm(epsilon=cfg.layer_norm_eps, dtype=self.dtype)
+        self.dropout = nn.Dropout(rate=cfg.hidden_dropout_prob)
+
+    def __call__(self, hidden_states, input_tensor, deterministic: bool = True):
+        """Project `hidden_states`, apply dropout, add `input_tensor` and normalize the sum."""
+        projected = self.dense(hidden_states)
+        projected = self.dropout(projected, deterministic=deterministic)
+        return self.LayerNorm(projected + input_tensor)
+
+
+# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertAttention with Bert->Electra
+class FlaxElectraAttention(nn.Module):
+    """Attention sub-layer: `FlaxElectraSelfAttention` followed by `FlaxElectraSelfOutput`."""
+
+    config: ElectraConfig
+    causal: bool = False
+    dtype: jnp.dtype = jnp.float32
+
+    def setup(self):
+        self.self = FlaxElectraSelfAttention(self.config, causal=self.causal, dtype=self.dtype)
+        self.output = FlaxElectraSelfOutput(self.config, dtype=self.dtype)
+
+    def __call__(
+        self,
+        hidden_states,
+        attention_mask,
+        layer_head_mask,
+        key_value_states=None,
+        init_cache=False,
+        deterministic=True,
+        output_attentions: bool = False,
+    ):
+        """Run attention and the output projection.
+
+        Returns `(hidden_states,)`, with the attention weights appended when `output_attentions` is true.
+        """
+        # Attention mask comes in as attention_mask.shape == (*batch_sizes, kv_length)
+        # FLAX expects: attention_mask.shape == (*batch_sizes, 1, 1, kv_length) such that it is broadcastable
+        # with attn_weights.shape == (*batch_sizes, num_heads, q_length, kv_length)
+        self_outputs = self.self(
+            hidden_states,
+            attention_mask,
+            layer_head_mask=layer_head_mask,
+            key_value_states=key_value_states,
+            init_cache=init_cache,
+            deterministic=deterministic,
+            output_attentions=output_attentions,
+        )
+        # The original input is handed to the output block as its residual term.
+        projected = self.output(self_outputs[0], hidden_states, deterministic=deterministic)
+
+        if output_attentions:
+            return (projected, self_outputs[1])
+        return (projected,)
+
+
+# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertIntermediate with Bert->Electra
+class FlaxElectraIntermediate(nn.Module):
+    """Dense layer of width `config.intermediate_size` followed by the configured activation."""
+
+    config: ElectraConfig
+    dtype: jnp.dtype = jnp.float32  # the dtype of the computation
+
+    def setup(self):
+        cfg = self.config
+        self.dense = nn.Dense(
+            cfg.intermediate_size,
+            kernel_init=jax.nn.initializers.normal(cfg.initializer_range),
+            dtype=self.dtype,
+        )
+        # Activation function selected by name from ACT2FN.
+        self.activation = ACT2FN[cfg.hidden_act]
+
+    def __call__(self, hidden_states):
+        """Return `activation(dense(hidden_states))`."""
+        return self.activation(self.dense(hidden_states))
+
+
+# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertOutput with Bert->Electra
+class FlaxElectraOutput(nn.Module):
+    """Dense projection to `config.hidden_size` with dropout, residual add and layer normalization."""
+
+    config: ElectraConfig
+    dtype: jnp.dtype = jnp.float32  # the dtype of the computation
+
+    def setup(self):
+        cfg = self.config
+        self.dense = nn.Dense(
+            cfg.hidden_size,
+            kernel_init=jax.nn.initializers.normal(cfg.initializer_range),
+            dtype=self.dtype,
+        )
+        self.dropout = nn.Dropout(rate=cfg.hidden_dropout_prob)
+        self.LayerNorm = nn.LayerNorm(epsilon=cfg.layer_norm_eps, dtype=self.dtype)
+
+    def __call__(self, hidden_states, attention_output, deterministic: bool = True):
+        """Project `hidden_states`, apply dropout, add `attention_output` and normalize the sum."""
+        projected = self.dense(hidden_states)
+        projected = self.dropout(projected, deterministic=deterministic)
+        return self.LayerNorm(projected + attention_output)
+
+
+# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertLayer with Bert->Electra
+class FlaxElectraLayer(nn.Module):
+    """One transformer layer: self-attention, optional cross-attention, then the feed-forward block."""
+
+    config: ElectraConfig
+    dtype: jnp.dtype = jnp.float32  # the dtype of the computation
+
+    def setup(self):
+        cfg = self.config
+        # Self-attention is causal exactly when the config marks the model as a decoder.
+        self.attention = FlaxElectraAttention(cfg, causal=cfg.is_decoder, dtype=self.dtype)
+        self.intermediate = FlaxElectraIntermediate(cfg, dtype=self.dtype)
+        self.output = FlaxElectraOutput(cfg, dtype=self.dtype)
+        if cfg.add_cross_attention:
+            self.crossattention = FlaxElectraAttention(cfg, causal=False, dtype=self.dtype)
+
+    def __call__(
+        self,
+        hidden_states,
+        attention_mask,
+        layer_head_mask,
+        encoder_hidden_states: Optional[jnp.ndarray] = None,
+        encoder_attention_mask: Optional[jnp.ndarray] = None,
+        init_cache: bool = False,
+        deterministic: bool = True,
+        output_attentions: bool = False,
+    ):
+        """Run the layer.
+
+        Returns a tuple whose first element is the layer output. When `output_attentions` is true the
+        self-attention weights follow, and the cross-attention weights follow those if
+        `encoder_hidden_states` was given.
+        """
+        use_cross_attention = encoder_hidden_states is not None
+
+        self_attn = self.attention(
+            hidden_states,
+            attention_mask,
+            layer_head_mask=layer_head_mask,
+            init_cache=init_cache,
+            deterministic=deterministic,
+            output_attentions=output_attentions,
+        )
+        attention_output = self_attn[0]
+
+        if use_cross_attention:
+            # Keys and values come from the encoder; queries from the self-attention output.
+            cross_attn = self.crossattention(
+                attention_output,
+                attention_mask=encoder_attention_mask,
+                layer_head_mask=layer_head_mask,
+                key_value_states=encoder_hidden_states,
+                deterministic=deterministic,
+                output_attentions=output_attentions,
+            )
+            attention_output = cross_attn[0]
+
+        expanded = self.intermediate(attention_output)
+        layer_output = self.output(expanded, attention_output, deterministic=deterministic)
+
+        if not output_attentions:
+            return (layer_output,)
+        if use_cross_attention:
+            return (layer_output, self_attn[1], cross_attn[1])
+        return (layer_output, self_attn[1])
+
+
+# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertLayerCollection with Bert->Electra
+class FlaxElectraLayerCollection(nn.Module):
+    """Stack of `config.num_hidden_layers` `FlaxElectraLayer` modules applied in sequence.
+
+    When `gradient_checkpointing` is true each layer is wrapped with `remat`.
+    """
+
+    config: ElectraConfig
+    dtype: jnp.dtype = jnp.float32  # the dtype of the computation
+    gradient_checkpointing: bool = False
+
+    def setup(self):
+        """Create the layers, named "0", "1", ... in order."""
+        if self.gradient_checkpointing:
+            # NOTE(review): static_argnums refers to positional arguments of the layer call, which is why
+            # `__call__` below passes everything positionally. Which parameters (5, 6, 7) select depends
+            # on Flax's indexing convention for `remat` - confirm against the installed Flax version.
+            FlaxElectraCheckpointLayer = remat(FlaxElectraLayer, static_argnums=(5, 6, 7))
+            self.layers = [
+                FlaxElectraCheckpointLayer(self.config, name=str(i), dtype=self.dtype)
+                for i in range(self.config.num_hidden_layers)
+            ]
+        else:
+            self.layers = [
+                FlaxElectraLayer(self.config, name=str(i), dtype=self.dtype)
+                for i in range(self.config.num_hidden_layers)
+            ]
+
+    def __call__(
+        self,
+        hidden_states,
+        attention_mask,
+        head_mask,
+        encoder_hidden_states: Optional[jnp.ndarray] = None,
+        encoder_attention_mask: Optional[jnp.ndarray] = None,
+        init_cache: bool = False,
+        deterministic: bool = True,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+    ):
+        """Apply every layer in order and optionally collect intermediate results.
+
+        Args:
+            hidden_states: input to the first layer.
+            attention_mask: forwarded unchanged to every layer.
+            head_mask: optional array indexed by layer along its first axis; row `i` is passed to layer `i`.
+            encoder_hidden_states: optional encoder output, forwarded to every layer.
+            encoder_attention_mask: optional mask for `encoder_hidden_states`, forwarded to every layer.
+            init_cache: forwarded to every layer.
+            deterministic: forwarded to every layer.
+            output_attentions: collect the attention weights of every layer (and the cross-attention
+                weights when `encoder_hidden_states` is given).
+            output_hidden_states: collect the input of every layer plus the final output.
+            return_dict: return a `FlaxBaseModelOutputWithPastAndCrossAttentions` instead of a tuple.
+
+        Returns:
+            The model output object, or - when `return_dict` is false - a tuple of
+            `(hidden_states, all_hidden_states, all_attentions, all_cross_attentions)` with the entries
+            that are `None` removed.
+
+        Raises:
+            ValueError: if `head_mask` is given and its first dimension differs from the number of layers.
+        """
+        all_attentions = () if output_attentions else None
+        all_hidden_states = () if output_hidden_states else None
+        all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None
+
+        # Check if head_mask has a correct number of layers specified if desired
+        if head_mask is not None:
+            if head_mask.shape[0] != (len(self.layers)):
+                raise ValueError(
+                    f"The head_mask should be specified for {len(self.layers)} layers, but it is for "
+                    f"{head_mask.shape[0]}."
+                )
+
+        for i, layer in enumerate(self.layers):
+            if output_hidden_states:
+                # Record the input of this layer.
+                all_hidden_states += (hidden_states,)
+
+            # Positional call on purpose: see the static_argnums note in `setup`.
+            layer_outputs = layer(
+                hidden_states,
+                attention_mask,
+                head_mask[i] if head_mask is not None else None,
+                encoder_hidden_states,
+                encoder_attention_mask,
+                init_cache,
+                deterministic,
+                output_attentions,
+            )
+
+            hidden_states = layer_outputs[0]
+
+            if output_attentions:
+                all_attentions += (layer_outputs[1],)
+
+                if encoder_hidden_states is not None:
+                    all_cross_attentions += (layer_outputs[2],)
+
+        if output_hidden_states:
+            # Record the output of the last layer as well.
+            all_hidden_states += (hidden_states,)
+
+        outputs = (hidden_states, all_hidden_states, all_attentions, all_cross_attentions)
+
+        if not return_dict:
+            return tuple(v for v in outputs if v is not None)
+
+        return FlaxBaseModelOutputWithPastAndCrossAttentions(
+            last_hidden_state=hidden_states,
+            hidden_states=all_hidden_states,
+            attentions=all_attentions,
+            cross_attentions=all_cross_attentions,
+        )
+
+
+# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertEncoder with Bert->Electra
+class FlaxElectraEncoder(nn.Module):
+    """Thin wrapper that owns a `FlaxElectraLayerCollection` under the attribute name `layer`."""
+
+    config: ElectraConfig
+    dtype: jnp.dtype = jnp.float32  # the dtype of the computation
+    gradient_checkpointing: bool = False
+
+    def setup(self):
+        self.layer = FlaxElectraLayerCollection(
+            self.config,
+            dtype=self.dtype,
+            gradient_checkpointing=self.gradient_checkpointing,
+        )
+
+    def __call__(
+        self,
+        hidden_states,
+        attention_mask,
+        head_mask,
+        encoder_hidden_states: Optional[jnp.ndarray] = None,
+        encoder_attention_mask: Optional[jnp.ndarray] = None,
+        init_cache: bool = False,
+        deterministic: bool = True,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+    ):
+        """Forward every argument unchanged to the layer collection and return its result."""
+        forwarded = {
+            "head_mask": head_mask,
+            "encoder_hidden_states": encoder_hidden_states,
+            "encoder_attention_mask": encoder_attention_mask,
+            "init_cache": init_cache,
+            "deterministic": deterministic,
+            "output_attentions": output_attentions,
+            "output_hidden_states": output_hidden_states,
+            "return_dict": return_dict,
+        }
+        return self.layer(hidden_states, attention_mask, **forwarded)
+
+
+class FlaxElectraGeneratorPredictions(nn.Module):
+    """Generator head transform: dense to `config.embedding_size`, activation, then layer norm."""
+
+    config: ElectraConfig
+    dtype: jnp.dtype = jnp.float32
+
+    def setup(self):
+        self.LayerNorm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype)
+        self.dense = nn.Dense(self.config.embedding_size, dtype=self.dtype)
+
+    def __call__(self, hidden_states):
+        """Return `LayerNorm(activation(dense(hidden_states)))`."""
+        projected = self.dense(hidden_states)
+        activation_fn = ACT2FN[self.config.hidden_act]
+        return self.LayerNorm(activation_fn(projected))
+
+
+class FlaxElectraDiscriminatorPredictions(nn.Module):
+    """Discriminator head: a hidden dense layer with activation, then a single-unit dense layer."""
+
+    config: ElectraConfig
+    dtype: jnp.dtype = jnp.float32
+
+    def setup(self):
+        self.dense = nn.Dense(self.config.hidden_size, dtype=self.dtype)
+        self.dense_prediction = nn.Dense(1, dtype=self.dtype)
+
+    def __call__(self, hidden_states):
+        """Return the single-unit prediction with its trailing size-1 axis squeezed away."""
+        projected = self.dense(hidden_states)
+        activation_fn = ACT2FN[self.config.hidden_act]
+        scores = self.dense_prediction(activation_fn(projected))
+        return scores.squeeze(-1)
+
+
+class FlaxElectraPreTrainedModel(FlaxPreTrainedModel):
+    """
+    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+    models.
+
+    Concrete model classes set `module_class` to the Flax module they wrap; this class builds that module,
+    initializes its parameters and exposes a `__call__` that fills in default inputs before applying it.
+    """
+
+    config_class = ElectraConfig
+    base_model_prefix = "electra"
+    # Flax module type instantiated in `__init__`; overridden by each concrete model class.
+    module_class: nn.Module = None
+
+    def __init__(
+        self,
+        config: ElectraConfig,
+        input_shape: tuple = (1, 1),
+        seed: int = 0,
+        dtype: jnp.dtype = jnp.float32,
+        _do_init: bool = True,
+        gradient_checkpointing: bool = False,
+        **kwargs,
+    ):
+        """Build `module_class` from the given settings and hand it to the base-class constructor.
+
+        Extra `**kwargs` are forwarded to the module constructor.
+        """
+        module = self.module_class(config=config, dtype=dtype, gradient_checkpointing=gradient_checkpointing, **kwargs)
+        super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init)
+
+    # Copied from transformers.models.bert.modeling_flax_bert.FlaxBertPreTrainedModel.enable_gradient_checkpointing
+    def enable_gradient_checkpointing(self):
+        """Replace the wrapped module with a new instance that has `gradient_checkpointing=True`.
+
+        Only `config` and `dtype` are passed to the new module; extra keyword arguments given to
+        `__init__` are not re-applied here.
+        """
+        self._module = self.module_class(
+            config=self.config,
+            dtype=self.dtype,
+            gradient_checkpointing=True,
+        )
+
+    # Copied from transformers.models.bert.modeling_flax_bert.FlaxBertPreTrainedModel.init_weights
+    def init_weights(self, rng: jax.random.PRNGKey, input_shape: tuple, params: FrozenDict = None) -> FrozenDict:
+        """Initialize module parameters from dummy inputs of shape `input_shape`.
+
+        Args:
+            rng: PRNG key; split into one key for "params" and one for "dropout".
+            input_shape: shape of the dummy `input_ids` used to trace the module.
+            params: optional existing parameters. When given, only the entries listed in
+                `self._missing_keys` are filled in from the freshly initialized values.
+
+        Returns:
+            The freshly initialized parameters, or `params` completed with the missing keys (frozen).
+        """
+        # init input tensors
+        input_ids = jnp.zeros(input_shape, dtype="i4")
+        token_type_ids = jnp.zeros_like(input_ids)
+        position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_shape)
+        attention_mask = jnp.ones_like(input_ids)
+        head_mask = jnp.ones((self.config.num_hidden_layers, self.config.num_attention_heads))
+
+        params_rng, dropout_rng = jax.random.split(rng)
+        rngs = {"params": params_rng, "dropout": dropout_rng}
+
+        if self.config.add_cross_attention:
+            # Cross-attention parameters are only created when encoder inputs are supplied, so dummy
+            # encoder states with the same leading shape as the inputs are passed during init.
+            encoder_hidden_states = jnp.zeros(input_shape + (self.config.hidden_size,))
+            encoder_attention_mask = attention_mask
+            module_init_outputs = self.module.init(
+                rngs,
+                input_ids,
+                attention_mask,
+                token_type_ids,
+                position_ids,
+                head_mask,
+                encoder_hidden_states,
+                encoder_attention_mask,
+                return_dict=False,
+            )
+        else:
+            module_init_outputs = self.module.init(
+                rngs, input_ids, attention_mask, token_type_ids, position_ids, head_mask, return_dict=False
+            )
+
+        random_params = module_init_outputs["params"]
+
+        if params is not None:
+            # Work on flattened dicts so missing entries can be copied key by key.
+            random_params = flatten_dict(unfreeze(random_params))
+            params = flatten_dict(unfreeze(params))
+            for missing_key in self._missing_keys:
+                params[missing_key] = random_params[missing_key]
+            # All missing keys have been filled in; reset the bookkeeping set.
+            self._missing_keys = set()
+            return freeze(unflatten_dict(params))
+        else:
+            return random_params
+
+    # Copied from transformers.models.bart.modeling_flax_bart.FlaxBartDecoderPreTrainedModel.init_cache
+    def init_cache(self, batch_size, max_length):
+        r"""
+        Initialize the module with `init_cache=True` on dummy inputs and return the resulting "cache" collection.
+
+        Args:
+            batch_size (`int`):
+                batch_size used for fast auto-regressive decoding. Defines the batch size of the initialized cache.
+            max_length (`int`):
+                maximum possible length for auto-regressive decoding. Defines the sequence length of the initialized
+                cache.
+        """
+        # init input variables to retrieve cache
+        input_ids = jnp.ones((batch_size, max_length), dtype="i4")
+        attention_mask = jnp.ones_like(input_ids, dtype="i4")
+        position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_ids.shape)
+
+        # NOTE(review): `position_ids` is passed as the third positional argument, so the wrapped module's
+        # `__call__` must accept (input_ids, attention_mask, position_ids, ...) in that order - confirm for
+        # each `module_class` this method is used with.
+        init_variables = self.module.init(
+            jax.random.PRNGKey(0), input_ids, attention_mask, position_ids, return_dict=False, init_cache=True
+        )
+        return unfreeze(init_variables["cache"])
+
+    @add_start_docstrings_to_model_forward(ELECTRA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    def __call__(
+        self,
+        input_ids,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        head_mask=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        params: Optional[dict] = None,
+        dropout_rng: jax.random.PRNGKey = None,
+        train: bool = False,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        past_key_values: Optional[dict] = None,
+    ):
+        # Output flags left as None fall back to the values stored on the config.
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.return_dict
+
+        # init input tensors if not passed
+        if token_type_ids is None:
+            # NOTE(review): this default is all ones, whereas `init_weights` traces the module with
+            # all-zero token_type_ids - confirm which default is intended.
+            token_type_ids = jnp.ones_like(input_ids)
+
+        if position_ids is None:
+            position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_ids.shape)
+
+        if attention_mask is None:
+            attention_mask = jnp.ones_like(input_ids)
+
+        if head_mask is None:
+            head_mask = jnp.ones((self.config.num_hidden_layers, self.config.num_attention_heads))
+
+        # Handle any PRNG if needed
+        rngs = {}
+        if dropout_rng is not None:
+            rngs["dropout"] = dropout_rng
+
+        # Explicitly passed params take precedence over the parameters stored on the model.
+        inputs = {"params": params or self.params}
+
+        if self.config.add_cross_attention:
+            # If past_key_values are passed, the cache is already initialized and is supplied to the module
+            # through the "cache" collection. The collection must be marked as mutable so that the attention
+            # modules can update it during the call.
+            # NOTE(review): this branch tests truthiness, while the post-processing below tests
+            # `is not None`; an empty dict would take different paths - confirm callers never pass one.
+            if past_key_values:
+                inputs["cache"] = past_key_values
+                mutable = ["cache"]
+            else:
+                mutable = False
+
+            outputs = self.module.apply(
+                inputs,
+                jnp.array(input_ids, dtype="i4"),
+                jnp.array(attention_mask, dtype="i4"),
+                token_type_ids=jnp.array(token_type_ids, dtype="i4"),
+                position_ids=jnp.array(position_ids, dtype="i4"),
+                head_mask=jnp.array(head_mask, dtype="i4"),
+                encoder_hidden_states=encoder_hidden_states,
+                encoder_attention_mask=encoder_attention_mask,
+                deterministic=not train,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+                rngs=rngs,
+                mutable=mutable,
+            )
+
+            # add updated cache to model output
+            # With a mutable collection, `apply` returns a pair of (outputs, updated variables).
+            if past_key_values is not None and return_dict:
+                outputs, past_key_values = outputs
+                outputs["past_key_values"] = unfreeze(past_key_values["cache"])
+                return outputs
+            elif past_key_values is not None and not return_dict:
+                outputs, past_key_values = outputs
+                # In tuple form the cache is inserted right after the first output element.
+                outputs = outputs[:1] + (unfreeze(past_key_values["cache"]),) + outputs[1:]
+
+        else:
+            # Without cross-attention no encoder inputs or cache are passed to the module.
+            outputs = self.module.apply(
+                inputs,
+                jnp.array(input_ids, dtype="i4"),
+                jnp.array(attention_mask, dtype="i4"),
+                token_type_ids=jnp.array(token_type_ids, dtype="i4"),
+                position_ids=jnp.array(position_ids, dtype="i4"),
+                head_mask=jnp.array(head_mask, dtype="i4"),
+                deterministic=not train,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+                rngs=rngs,
+            )
+
+        return outputs
+
+
+class FlaxElectraModule(nn.Module):
+    """Electra backbone: embeddings, an optional projection to the hidden size, and the encoder."""
+
+    config: ElectraConfig
+    dtype: jnp.dtype = jnp.float32  # the dtype of the computation
+    gradient_checkpointing: bool = False
+
+    def setup(self):
+        cfg = self.config
+        self.embeddings = FlaxElectraEmbeddings(cfg, dtype=self.dtype)
+        # The projection only exists when the embedding width differs from the encoder width.
+        if cfg.embedding_size != cfg.hidden_size:
+            self.embeddings_project = nn.Dense(cfg.hidden_size, dtype=self.dtype)
+        self.encoder = FlaxElectraEncoder(
+            cfg, dtype=self.dtype, gradient_checkpointing=self.gradient_checkpointing
+        )
+
+    def __call__(
+        self,
+        input_ids,
+        attention_mask,
+        token_type_ids,
+        position_ids,
+        head_mask: Optional[np.ndarray] = None,
+        encoder_hidden_states: Optional[jnp.ndarray] = None,
+        encoder_attention_mask: Optional[jnp.ndarray] = None,
+        init_cache: bool = False,
+        deterministic: bool = True,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+    ):
+        """Embed the inputs, project them if a projection layer exists, and run the encoder."""
+        encoder_input = self.embeddings(
+            input_ids, token_type_ids, position_ids, attention_mask, deterministic=deterministic
+        )
+        if hasattr(self, "embeddings_project"):
+            encoder_input = self.embeddings_project(encoder_input)
+
+        encoder_options = {
+            "head_mask": head_mask,
+            "deterministic": deterministic,
+            "encoder_hidden_states": encoder_hidden_states,
+            "encoder_attention_mask": encoder_attention_mask,
+            "init_cache": init_cache,
+            "output_attentions": output_attentions,
+            "output_hidden_states": output_hidden_states,
+            "return_dict": return_dict,
+        }
+        return self.encoder(encoder_input, attention_mask, **encoder_options)
+
+
+@add_start_docstrings(
+    "The bare Electra Model transformer outputting raw hidden-states without any specific head on top.",
+    ELECTRA_START_DOCSTRING,
+)
+class FlaxElectraModel(FlaxElectraPreTrainedModel):
+    # The pretrained-model wrapper instantiates this module type in its constructor.
+    module_class = FlaxElectraModule
+
+
+# Attach a usage example to the model's call docstring, parameterized by the documentation
+# checkpoint, the output class and the config name.
+append_call_sample_docstring(FlaxElectraModel, _CHECKPOINT_FOR_DOC, FlaxBaseModelOutput, _CONFIG_FOR_DOC)
+
+
+class FlaxElectraTiedDense(nn.Module):
+    """Dense layer that owns only a bias; the kernel is supplied by the caller on every call."""
+
+    embedding_size: int
+    dtype: jnp.dtype = jnp.float32
+    precision = None
+    bias_init: Callable[..., np.ndarray] = jax.nn.initializers.zeros
+
+    def setup(self):
+        self.bias = self.param("bias", self.bias_init, (self.embedding_size,))
+
+    def __call__(self, x, kernel):
+        """Return `x @ kernel + bias`, with all operands converted to `self.dtype`."""
+        inputs = jnp.asarray(x, self.dtype)
+        weights = jnp.asarray(kernel, self.dtype)
+        # Contract the last axis of the input with the first axis of the kernel; no batch dimensions.
+        contracting_dims = ((inputs.ndim - 1,), (0,))
+        batch_dims = ((), ())
+        projected = lax.dot_general(
+            inputs,
+            weights,
+            (contracting_dims, batch_dims),
+            precision=self.precision,
+        )
+        return projected + jnp.asarray(self.bias, self.dtype)
+
+
+class FlaxElectraForMaskedLMModule(nn.Module):
+    """Electra backbone with the generator prediction transform and a vocabulary-sized LM head."""
+
+    config: ElectraConfig
+    dtype: jnp.dtype = jnp.float32
+    gradient_checkpointing: bool = False
+
+    def setup(self):
+        self.electra = FlaxElectraModule(
+            config=self.config, dtype=self.dtype, gradient_checkpointing=self.gradient_checkpointing
+        )
+        self.generator_predictions = FlaxElectraGeneratorPredictions(config=self.config, dtype=self.dtype)
+        # With tied embeddings the head has only a bias and borrows the word-embedding matrix at call time.
+        head_cls = FlaxElectraTiedDense if self.config.tie_word_embeddings else nn.Dense
+        self.generator_lm_head = head_cls(self.config.vocab_size, dtype=self.dtype)
+
+    def __call__(
+        self,
+        input_ids,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        head_mask=None,
+        deterministic: bool = True,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+    ):
+        """Run the backbone and map its first output to vocabulary logits.
+
+        Returns a `FlaxMaskedLMOutput`, or `(logits,) + outputs[1:]` when `return_dict` is false.
+        """
+        outputs = self.electra(
+            input_ids,
+            attention_mask,
+            token_type_ids,
+            position_ids,
+            head_mask,
+            deterministic=deterministic,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        transformed = self.generator_predictions(outputs[0])
+
+        if self.config.tie_word_embeddings:
+            # Reuse the (transposed) word-embedding matrix as the output kernel.
+            shared_embedding = self.electra.variables["params"]["embeddings"]["word_embeddings"]["embedding"]
+            logits = self.generator_lm_head(transformed, shared_embedding.T)
+        else:
+            logits = self.generator_lm_head(transformed)
+
+        if return_dict:
+            return FlaxMaskedLMOutput(
+                logits=logits,
+                hidden_states=outputs.hidden_states,
+                attentions=outputs.attentions,
+            )
+        return (logits,) + outputs[1:]
+
+
+@add_start_docstrings("""Electra Model with a `language modeling` head on top.""", ELECTRA_START_DOCSTRING)
+class FlaxElectraForMaskedLM(FlaxElectraPreTrainedModel):  # only sets module_class; all behaviour comes from the base class
+    module_class = FlaxElectraForMaskedLMModule  # Flax module paired with this model class
+
+
+append_call_sample_docstring(FlaxElectraForMaskedLM, _CHECKPOINT_FOR_DOC, FlaxMaskedLMOutput, _CONFIG_FOR_DOC)  # runs at import time; helper is defined outside this chunk
+
+
+class FlaxElectraForPreTrainingModule(nn.Module):  # ELECTRA backbone followed by the discriminator prediction head
+    config: ElectraConfig  # passed to both submodules
+    dtype: jnp.dtype = jnp.float32  # passed to both submodules
+    gradient_checkpointing: bool = False  # forwarded unchanged to FlaxElectraModule
+
+    def setup(self):  # creates the backbone and the discriminator head
+        self.electra = FlaxElectraModule(
+            config=self.config, dtype=self.dtype, gradient_checkpointing=self.gradient_checkpointing
+        )
+        self.discriminator_predictions = FlaxElectraDiscriminatorPredictions(config=self.config, dtype=self.dtype)
+
+    def __call__(  # returns a plain tuple when return_dict is falsy, otherwise a FlaxElectraForPreTrainingOutput
+        self,
+        input_ids,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        head_mask=None,
+        deterministic: bool = True,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+    ):
+        # Run the backbone; every argument is handed through unchanged
+        outputs = self.electra(
+            input_ids,
+            attention_mask,
+            token_type_ids,
+            position_ids,
+            head_mask,
+            deterministic=deterministic,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        hidden_states = outputs[0]  # first entry of the backbone output
+
+        logits = self.discriminator_predictions(hidden_states)
+
+        if not return_dict:
+            return (logits,) + outputs[1:]  # logits first, then the remaining backbone outputs
+
+        return FlaxElectraForPreTrainingOutput(
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+@add_start_docstrings(  # receives the description below and ELECTRA_START_DOCSTRING; decorator is defined outside this chunk
+    """
+    Electra model with a binary classification head on top as used during pretraining for identifying generated tokens.
+
+    It is recommended to load the discriminator checkpoint into that model.
+    """,
+    ELECTRA_START_DOCSTRING,
+)
+class FlaxElectraForPreTraining(FlaxElectraPreTrainedModel):  # only sets module_class; all behaviour comes from the base class
+    module_class = FlaxElectraForPreTrainingModule  # Flax module paired with this model class
+
+
+FLAX_ELECTRA_FOR_PRETRAINING_DOCSTRING = """
+    Returns:
+
+    Example:
+
+    ```python
+    >>> from transformers import AutoTokenizer, FlaxElectraForPreTraining
+
+    >>> tokenizer = AutoTokenizer.from_pretrained("google/electra-small-discriminator")
+    >>> model = FlaxElectraForPreTraining.from_pretrained("google/electra-small-discriminator")
+
+    >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="np")
+    >>> outputs = model(**inputs)
+
+    >>> prediction_logits = outputs.logits
+    ```
+"""  # end of the example text; it is concatenated onto the inputs docstring just below
+
+overwrite_call_docstring(  # second argument: formatted ELECTRA_INPUTS_DOCSTRING + the example text above
+    FlaxElectraForPreTraining,
+    ELECTRA_INPUTS_DOCSTRING.format("batch_size, sequence_length") + FLAX_ELECTRA_FOR_PRETRAINING_DOCSTRING,
+)
+append_replace_return_docstrings(  # helper defined outside this chunk; presumably fills the empty "Returns:" section - confirm
+    FlaxElectraForPreTraining, output_type=FlaxElectraForPreTrainingOutput, config_class=_CONFIG_FOR_DOC
+)
+
+
+class FlaxElectraForTokenClassificationModule(nn.Module):  # ELECTRA backbone + dropout + Dense layer with num_labels outputs
+    config: ElectraConfig  # read below for classifier_dropout, hidden_dropout_prob and num_labels
+    dtype: jnp.dtype = jnp.float32  # passed to the backbone and the classifier
+    gradient_checkpointing: bool = False  # forwarded unchanged to FlaxElectraModule
+
+    def setup(self):  # creates the backbone, the dropout layer and the classifier
+        self.electra = FlaxElectraModule(
+            config=self.config, dtype=self.dtype, gradient_checkpointing=self.gradient_checkpointing
+        )
+        classifier_dropout = (  # falls back to hidden_dropout_prob when classifier_dropout is None
+            self.config.classifier_dropout
+            if self.config.classifier_dropout is not None
+            else self.config.hidden_dropout_prob
+        )
+        self.dropout = nn.Dropout(classifier_dropout)
+        self.classifier = nn.Dense(self.config.num_labels, dtype=self.dtype)
+
+    def __call__(  # returns a plain tuple when return_dict is falsy, otherwise a FlaxTokenClassifierOutput
+        self,
+        input_ids,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        head_mask=None,
+        deterministic: bool = True,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+    ):
+        # Run the backbone; every argument is handed through unchanged
+        outputs = self.electra(
+            input_ids,
+            attention_mask,
+            token_type_ids,
+            position_ids,
+            head_mask,
+            deterministic=deterministic,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        hidden_states = outputs[0]  # first entry of the backbone output
+
+        hidden_states = self.dropout(hidden_states, deterministic=deterministic)  # same `deterministic` flag as the backbone
+        logits = self.classifier(hidden_states)
+
+        if not return_dict:
+            return (logits,) + outputs[1:]  # logits first, then the remaining backbone outputs
+
+        return FlaxTokenClassifierOutput(
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+@add_start_docstrings(  # receives the description below and ELECTRA_START_DOCSTRING; decorator is defined outside this chunk
+    """
+    Electra model with a token classification head on top.
+
+    Both the discriminator and generator may be loaded into this model.
+    """,
+    ELECTRA_START_DOCSTRING,
+)
+class FlaxElectraForTokenClassification(FlaxElectraPreTrainedModel):  # only sets module_class; all behaviour comes from the base class
+    module_class = FlaxElectraForTokenClassificationModule  # Flax module paired with this model class
+
+
+append_call_sample_docstring(  # runs at import time; helper is defined outside this chunk
+    FlaxElectraForTokenClassification,
+    _CHECKPOINT_FOR_DOC,
+    FlaxTokenClassifierOutput,
+    _CONFIG_FOR_DOC,
+)
+
+
+def identity(x, **kwargs):  # no-op stand-in for a layer: accepts and ignores any keyword arguments
+    return x  # the input object itself is returned, unchanged
+
+
+class FlaxElectraSequenceSummary(nn.Module):  # summary = index 0 along axis 1, then optional dropout / projection / activation / dropout
+    r"""
+    Compute a single vector summary of a sequence hidden states.
+
+    Args:
+        config ([`PretrainedConfig`]):
+            The config used by the model. Relevant arguments in the config class of the model are (refer to the actual
+            config class of your model for the default values it uses):
+
+            - **summary_use_proj** (`bool`) -- Add a projection after the vector extraction.
+            - **summary_proj_to_labels** (`bool`) -- If `True`, the projection outputs to `config.num_labels` classes
+              (otherwise to `config.hidden_size`).
+            - **summary_activation** (`Optional[str]`) -- Set to `"tanh"` to add a tanh activation to the output,
+              another string or `None` will add no activation.
+            - **summary_first_dropout** (`float`) -- Optional dropout probability before the projection and activation.
+            - **summary_last_dropout** (`float`)-- Optional dropout probability after the projection and activation.
+    """
+
+    config: ElectraConfig  # optional summary_* attributes are probed with hasattr/getattr in setup()
+    dtype: jnp.dtype = jnp.float32  # used only for the optional projection layer
+
+    def setup(self):  # every stage defaults to a pass-through and is replaced only when the config asks for it
+        self.summary = identity
+        if hasattr(self.config, "summary_use_proj") and self.config.summary_use_proj:
+            if (  # project to num_labels only when explicitly requested and num_labels is positive
+                hasattr(self.config, "summary_proj_to_labels")
+                and self.config.summary_proj_to_labels
+                and self.config.num_labels > 0
+            ):
+                num_classes = self.config.num_labels
+            else:
+                num_classes = self.config.hidden_size
+            self.summary = nn.Dense(num_classes, dtype=self.dtype)
+
+        activation_string = getattr(self.config, "summary_activation", None)  # NOTE: any non-empty string is looked up in ACT2FN below, not only "tanh"
+        self.activation = ACT2FN[activation_string] if activation_string else lambda x: x  # noqa F407
+
+        self.first_dropout = identity
+        if hasattr(self.config, "summary_first_dropout") and self.config.summary_first_dropout > 0:
+            self.first_dropout = nn.Dropout(self.config.summary_first_dropout)
+
+        self.last_dropout = identity
+        if hasattr(self.config, "summary_last_dropout") and self.config.summary_last_dropout > 0:
+            self.last_dropout = nn.Dropout(self.config.summary_last_dropout)
+
+    def __call__(self, hidden_states, cls_index=None, deterministic: bool = True):  # NOTE: cls_index is accepted but never read in this body
+        """
+        Compute a single vector summary of a sequence hidden states.
+
+        Args:
+            hidden_states (`jnp.ndarray` of shape `[batch_size, seq_len, hidden_size]`):
+                The hidden states of the last layer.
+            cls_index (`jnp.ndarray` of shape `[batch_size]` or `[batch_size, ...]` where ... are optional leading dimensions of `hidden_states`, *optional*):
+                Used if `summary_type == "cls_index"` and takes the last token of the sequence as classification token.
+
+        Returns:
+            `jnp.ndarray`: The summary of the sequence hidden states.
+        """
+        # NOTE: this does "first" type summary always
+        output = hidden_states[:, 0]  # index 0 along the second axis
+        output = self.first_dropout(output, deterministic=deterministic)  # `identity` swallows the keyword when dropout is disabled
+        output = self.summary(output)
+        output = self.activation(output)
+        output = self.last_dropout(output, deterministic=deterministic)
+        return output
+
+
+class FlaxElectraForMultipleChoiceModule(nn.Module):  # ELECTRA backbone + sequence summary + Dense(1) score, regrouped per choice
+    config: ElectraConfig  # passed to the backbone and the sequence summary
+    dtype: jnp.dtype = jnp.float32  # passed to every submodule created in setup()
+    gradient_checkpointing: bool = False  # forwarded unchanged to FlaxElectraModule
+
+    def setup(self):  # creates the backbone, the summary module and the scoring layer
+        self.electra = FlaxElectraModule(
+            config=self.config, dtype=self.dtype, gradient_checkpointing=self.gradient_checkpointing
+        )
+        self.sequence_summary = FlaxElectraSequenceSummary(config=self.config, dtype=self.dtype)
+        self.classifier = nn.Dense(1, dtype=self.dtype)  # a single output unit per summarized row
+
+    def __call__(  # returns a plain tuple when return_dict is falsy, otherwise a FlaxMultipleChoiceModelOutput
+        self,
+        input_ids,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        head_mask=None,
+        deterministic: bool = True,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+    ):
+        num_choices = input_ids.shape[1]  # size of the second axis, captured before the reshape below
+        input_ids = input_ids.reshape(-1, input_ids.shape[-1]) if input_ids is not None else None  # NOTE: the None branch is unreachable, .shape was already read above
+        attention_mask = attention_mask.reshape(-1, attention_mask.shape[-1]) if attention_mask is not None else None
+        token_type_ids = token_type_ids.reshape(-1, token_type_ids.shape[-1]) if token_type_ids is not None else None
+        position_ids = position_ids.reshape(-1, position_ids.shape[-1]) if position_ids is not None else None
+
+        # Run the backbone on the inputs reshaped to two axes (last axis kept, all others merged)
+        outputs = self.electra(
+            input_ids,
+            attention_mask,
+            token_type_ids,
+            position_ids,
+            head_mask,
+            deterministic=deterministic,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        hidden_states = outputs[0]  # first entry of the backbone output
+        pooled_output = self.sequence_summary(hidden_states, deterministic=deterministic)
+        logits = self.classifier(pooled_output)
+
+        reshaped_logits = logits.reshape(-1, num_choices)  # regroup the scores into rows of num_choices entries
+
+        if not return_dict:
+            return (reshaped_logits,) + outputs[1:]  # logits first, then the remaining backbone outputs
+
+        return FlaxMultipleChoiceModelOutput(
+            logits=reshaped_logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+@add_start_docstrings(  # receives the description below and ELECTRA_START_DOCSTRING; decorator is defined outside this chunk
+    """
+    ELECTRA Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
+    softmax) e.g. for RocStories/SWAG tasks.
+    """,
+    ELECTRA_START_DOCSTRING,
+)
+class FlaxElectraForMultipleChoice(FlaxElectraPreTrainedModel):  # only sets module_class; all behaviour comes from the base class
+    module_class = FlaxElectraForMultipleChoiceModule  # Flax module paired with this model class
+
+
+# The inputs docstring is formatted with an extra `num_choices` dimension for this model
+overwrite_call_docstring(
+    FlaxElectraForMultipleChoice, ELECTRA_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")
+)
+append_call_sample_docstring(  # runs at import time; helper is defined outside this chunk
+    FlaxElectraForMultipleChoice,
+    _CHECKPOINT_FOR_DOC,
+    FlaxMultipleChoiceModelOutput,
+    _CONFIG_FOR_DOC,
+)
+
+
+class FlaxElectraForQuestionAnsweringModule(nn.Module):  # ELECTRA backbone + Dense layer whose outputs are split into start/end logits
+    config: ElectraConfig  # read below for num_labels
+    dtype: jnp.dtype = jnp.float32  # passed to the backbone and the output layer
+    gradient_checkpointing: bool = False  # forwarded unchanged to FlaxElectraModule
+
+    def setup(self):  # creates the backbone and the output projection
+        self.electra = FlaxElectraModule(
+            config=self.config, dtype=self.dtype, gradient_checkpointing=self.gradient_checkpointing
+        )
+        self.qa_outputs = nn.Dense(self.config.num_labels, dtype=self.dtype)
+
+    def __call__(  # returns a plain tuple when return_dict is falsy, otherwise a FlaxQuestionAnsweringModelOutput
+        self,
+        input_ids,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        head_mask=None,
+        deterministic: bool = True,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+    ):
+        # Run the backbone; every argument is handed through unchanged
+        outputs = self.electra(
+            input_ids,
+            attention_mask,
+            token_type_ids,
+            position_ids,
+            head_mask,
+            deterministic=deterministic,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        hidden_states = outputs[0]  # first entry of the backbone output
+        logits = self.qa_outputs(hidden_states)
+        start_logits, end_logits = jnp.split(logits, self.config.num_labels, axis=-1)  # NOTE: the two-name unpack requires the split to yield exactly two pieces
+        start_logits = start_logits.squeeze(-1)  # drop the trailing axis left by the split
+        end_logits = end_logits.squeeze(-1)
+
+        if not return_dict:
+            return (start_logits, end_logits) + outputs[1:]  # both logit arrays first, then the remaining backbone outputs
+
+        return FlaxQuestionAnsweringModelOutput(
+            start_logits=start_logits,
+            end_logits=end_logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+@add_start_docstrings(  # receives the description below and ELECTRA_START_DOCSTRING; decorator is defined outside this chunk
+    """
+    ELECTRA Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
+    layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
+    """,
+    ELECTRA_START_DOCSTRING,
+)
+class FlaxElectraForQuestionAnswering(FlaxElectraPreTrainedModel):  # only sets module_class; all behaviour comes from the base class
+    module_class = FlaxElectraForQuestionAnsweringModule  # Flax module paired with this model class
+
+
+append_call_sample_docstring(  # runs at import time; helper is defined outside this chunk
+    FlaxElectraForQuestionAnswering,
+    _CHECKPOINT_FOR_DOC,
+    FlaxQuestionAnsweringModelOutput,
+    _CONFIG_FOR_DOC,
+)
+
+
+class FlaxElectraClassificationHead(nn.Module):  # dropout -> Dense(hidden_size) -> gelu -> dropout -> Dense(num_labels)
+    """Head for sentence-level classification tasks."""
+
+    config: ElectraConfig  # read below for hidden_size, classifier_dropout, hidden_dropout_prob and num_labels
+    dtype: jnp.dtype = jnp.float32  # passed to both Dense layers
+
+    def setup(self):  # creates the two Dense layers and the single dropout layer shared by both dropout steps
+        self.dense = nn.Dense(self.config.hidden_size, dtype=self.dtype)
+        classifier_dropout = (  # falls back to hidden_dropout_prob when classifier_dropout is None
+            self.config.classifier_dropout
+            if self.config.classifier_dropout is not None
+            else self.config.hidden_dropout_prob
+        )
+        self.dropout = nn.Dropout(classifier_dropout)
+        self.out_proj = nn.Dense(self.config.num_labels, dtype=self.dtype)
+
+    def __call__(self, hidden_states, deterministic: bool = True):  # returns the output of out_proj for the first-position vectors
+        x = hidden_states[:, 0, :]  # take <s> token (equiv. to [CLS])
+        x = self.dropout(x, deterministic=deterministic)
+        x = self.dense(x)
+        x = ACT2FN["gelu"](x)  # although BERT uses tanh here, it seems Electra authors used gelu
+        x = self.dropout(x, deterministic=deterministic)  # the same dropout layer is applied a second time
+        x = self.out_proj(x)
+        return x
+
+
+class FlaxElectraForSequenceClassificationModule(nn.Module):  # ELECTRA backbone followed by FlaxElectraClassificationHead
+    config: ElectraConfig  # passed to both submodules
+    dtype: jnp.dtype = jnp.float32  # passed to both submodules
+    gradient_checkpointing: bool = False  # forwarded unchanged to FlaxElectraModule
+
+    def setup(self):  # creates the backbone and the classification head
+        self.electra = FlaxElectraModule(
+            config=self.config, dtype=self.dtype, gradient_checkpointing=self.gradient_checkpointing
+        )
+        self.classifier = FlaxElectraClassificationHead(config=self.config, dtype=self.dtype)
+
+    def __call__(  # returns a plain tuple when return_dict is falsy, otherwise a FlaxSequenceClassifierOutput
+        self,
+        input_ids,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        head_mask=None,
+        deterministic: bool = True,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+    ):
+        # Run the backbone; every argument is handed through unchanged
+        outputs = self.electra(
+            input_ids,
+            attention_mask,
+            token_type_ids,
+            position_ids,
+            head_mask,
+            deterministic=deterministic,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        hidden_states = outputs[0]  # first entry of the backbone output
+        logits = self.classifier(hidden_states, deterministic=deterministic)  # same `deterministic` flag as the backbone
+
+        if not return_dict:
+            return (logits,) + outputs[1:]  # logits first, then the remaining backbone outputs
+
+        return FlaxSequenceClassifierOutput(
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+@add_start_docstrings(  # receives the description below and ELECTRA_START_DOCSTRING; decorator is defined outside this chunk
+    """
+    Electra Model transformer with a sequence classification/regression head on top (a linear layer on top of the
+    pooled output) e.g. for GLUE tasks.
+    """,
+    ELECTRA_START_DOCSTRING,
+)
+class FlaxElectraForSequenceClassification(FlaxElectraPreTrainedModel):  # only sets module_class; all behaviour comes from the base class
+    module_class = FlaxElectraForSequenceClassificationModule  # Flax module paired with this model class
+
+
+append_call_sample_docstring(  # runs at import time; helper is defined outside this chunk
+    FlaxElectraForSequenceClassification,
+    _CHECKPOINT_FOR_DOC,
+    FlaxSequenceClassifierOutput,
+    _CONFIG_FOR_DOC,
+)
+
+
+class FlaxElectraForCausalLMModule(nn.Module):  # same head as the masked-LM module, plus encoder-state and cache arguments
+    config: ElectraConfig  # read below for `tie_word_embeddings` and `vocab_size`
+    dtype: jnp.dtype = jnp.float32  # forwarded to every submodule created in setup()
+    gradient_checkpointing: bool = False  # forwarded unchanged to FlaxElectraModule
+
+    def setup(self):  # creates the backbone, the prediction transform and the output projection
+        self.electra = FlaxElectraModule(
+            config=self.config, dtype=self.dtype, gradient_checkpointing=self.gradient_checkpointing
+        )
+        self.generator_predictions = FlaxElectraGeneratorPredictions(config=self.config, dtype=self.dtype)
+        if self.config.tie_word_embeddings:  # tied head: its kernel is supplied at call time (see __call__)
+            self.generator_lm_head = FlaxElectraTiedDense(self.config.vocab_size, dtype=self.dtype)
+        else:  # untied head: a regular Dense layer with vocab_size outputs
+            self.generator_lm_head = nn.Dense(self.config.vocab_size, dtype=self.dtype)
+
+    def __call__(  # returns a plain tuple when return_dict is falsy, otherwise a FlaxCausalLMOutputWithCrossAttentions
+        self,
+        input_ids,
+        attention_mask: Optional[jnp.ndarray] = None,
+        token_type_ids: Optional[jnp.ndarray] = None,
+        position_ids: Optional[jnp.ndarray] = None,
+        head_mask: Optional[jnp.ndarray] = None,
+        encoder_hidden_states: Optional[jnp.ndarray] = None,
+        encoder_attention_mask: Optional[jnp.ndarray] = None,
+        init_cache: bool = False,
+        deterministic: bool = True,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+    ):
+        outputs = self.electra(  # every argument is handed to the backbone unchanged
+            input_ids,
+            attention_mask,
+            token_type_ids,
+            position_ids,
+            head_mask,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_attention_mask,
+            init_cache=init_cache,
+            deterministic=deterministic,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        hidden_states = outputs[0]  # first entry of the backbone output
+        prediction_scores = self.generator_predictions(hidden_states)
+
+        if self.config.tie_word_embeddings:  # reuse the backbone's word-embedding table, transposed, as the head's kernel
+            shared_embedding = self.electra.variables["params"]["embeddings"]["word_embeddings"]["embedding"]
+            prediction_scores = self.generator_lm_head(prediction_scores, shared_embedding.T)
+        else:
+            prediction_scores = self.generator_lm_head(prediction_scores)
+
+        if not return_dict:
+            return (prediction_scores,) + outputs[1:]  # logits first, then the remaining backbone outputs
+
+        return FlaxCausalLMOutputWithCrossAttentions(
+            logits=prediction_scores,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            cross_attentions=outputs.cross_attentions,
+        )
+
+
+@add_start_docstrings(  # receives the description below and ELECTRA_START_DOCSTRING; decorator is defined outside this chunk
+    """
+    Electra Model with a language modeling head on top (a linear layer on top of the hidden-states output) e.g for
+    autoregressive tasks.
+    """,
+    ELECTRA_START_DOCSTRING,
+)
+# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertForCausalLM with Bert->Electra
+class FlaxElectraForCausalLM(FlaxElectraPreTrainedModel):  # sets module_class and adds two generation helper methods
+    module_class = FlaxElectraForCausalLMModule  # Flax module paired with this model class
+
+    def prepare_inputs_for_generation(self, input_ids, max_length, attention_mask: Optional[jax.Array] = None):  # returns a dict with cache, mask and position ids
+        # initializing the cache
+        batch_size, seq_length = input_ids.shape  # the two-name unpack requires input_ids to have exactly two axes
+
+        past_key_values = self.init_cache(batch_size, max_length)
+        # Note that usually one would have to put 0's in the attention_mask for x > input_ids.shape[-1] and x < cache_length.
+        # But since the decoder uses a causal mask, those positions are masked anyway.
+        # Thus, we can create a single static attention_mask here, which is more efficient for compilation
+        extended_attention_mask = jnp.ones((batch_size, max_length), dtype="i4")
+        if attention_mask is not None:
+            position_ids = attention_mask.cumsum(axis=-1) - 1  # running count of mask entries along the last axis, shifted to start at 0
+            extended_attention_mask = lax.dynamic_update_slice(extended_attention_mask, attention_mask, (0, 0))  # write the caller's mask at offset (0, 0)
+        else:
+            position_ids = jnp.broadcast_to(jnp.arange(seq_length, dtype="i4")[None, :], (batch_size, seq_length))  # 0..seq_length-1 for every row
+
+        return {
+            "past_key_values": past_key_values,
+            "attention_mask": extended_attention_mask,
+            "position_ids": position_ids,
+        }
+
+    def update_inputs_for_generation(self, model_outputs, model_kwargs):  # mutates model_kwargs in place and also returns it
+        model_kwargs["past_key_values"] = model_outputs.past_key_values  # carry the cache from the latest model output forward
+        model_kwargs["position_ids"] = model_kwargs["position_ids"][:, -1:] + 1  # keep only the last position and advance it by one
+        return model_kwargs
+
+
+append_call_sample_docstring(  # runs at import time; helper is defined outside this chunk
+    FlaxElectraForCausalLM,
+    _CHECKPOINT_FOR_DOC,
+    FlaxCausalLMOutputWithCrossAttentions,
+    _CONFIG_FOR_DOC,
+)
+
+
+__all__ = [  # names exported by `from <this module> import *`
+    "FlaxElectraForCausalLM",
+    "FlaxElectraForMaskedLM",
+    "FlaxElectraForMultipleChoice",
+    "FlaxElectraForPreTraining",
+    "FlaxElectraForQuestionAnswering",
+    "FlaxElectraForSequenceClassification",
+    "FlaxElectraForTokenClassification",
+    "FlaxElectraModel",
+    "FlaxElectraPreTrainedModel",
+]
diff --git a/phivenv/Lib/site-packages/transformers/models/electra/modeling_tf_electra.py b/phivenv/Lib/site-packages/transformers/models/electra/modeling_tf_electra.py
new file mode 100644
index 0000000000000000000000000000000000000000..3a5c33e503d7386df5c2be0fc10a079ee4fe014a
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/electra/modeling_tf_electra.py
@@ -0,0 +1,1775 @@
+# coding=utf-8
+# Copyright 2019 The Google AI Language Team Authors and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""TF Electra model."""
+
+from __future__ import annotations
+
+import math
+import warnings
+from dataclasses import dataclass
+
+import numpy as np
+import tensorflow as tf
+
+from ...activations_tf import get_tf_activation
+from ...modeling_tf_outputs import (
+    TFBaseModelOutputWithPastAndCrossAttentions,
+    TFMaskedLMOutput,
+    TFMultipleChoiceModelOutput,
+    TFQuestionAnsweringModelOutput,
+    TFSequenceClassifierOutput,
+    TFTokenClassifierOutput,
+)
+from ...modeling_tf_utils import (
+    TFMaskedLanguageModelingLoss,
+    TFModelInputType,
+    TFMultipleChoiceLoss,
+    TFPreTrainedModel,
+    TFQuestionAnsweringLoss,
+    TFSequenceClassificationLoss,
+    TFSequenceSummary,
+    TFTokenClassificationLoss,
+    get_initializer,
+    keras,
+    keras_serializable,
+    unpack_inputs,
+)
+from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax
+from ...utils import (
+    ModelOutput,
+    add_code_sample_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+    replace_return_docstrings,
+)
+from .configuration_electra import ElectraConfig
+
+
+logger = logging.get_logger(__name__)  # module-level logger keyed by this module's name
+
+_CHECKPOINT_FOR_DOC = "google/electra-small-discriminator"  # presumably consumed by docstring helpers later in the file - uses are outside this chunk
+_CONFIG_FOR_DOC = "ElectraConfig"  # presumably consumed by docstring helpers later in the file - uses are outside this chunk
+
+
+# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfAttention with Bert->Electra
+class TFElectraSelfAttention(keras.layers.Layer):  # multi-head scaled dot-product attention; also covers cross-attention and cached keys/values
+    def __init__(self, config: ElectraConfig, **kwargs):  # raises ValueError when hidden_size is not divisible by num_attention_heads
+        super().__init__(**kwargs)
+
+        if config.hidden_size % config.num_attention_heads != 0:
+            raise ValueError(
+                f"The hidden size ({config.hidden_size}) is not a multiple of the number "
+                f"of attention heads ({config.num_attention_heads})"
+            )
+
+        self.num_attention_heads = config.num_attention_heads
+        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)  # exact, divisibility was checked above
+        self.all_head_size = self.num_attention_heads * self.attention_head_size
+        self.sqrt_att_head_size = math.sqrt(self.attention_head_size)  # divisor applied to the raw scores in call()
+
+        self.query = keras.layers.Dense(
+            units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query"
+        )
+        self.key = keras.layers.Dense(
+            units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key"
+        )
+        self.value = keras.layers.Dense(
+            units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value"
+        )
+        self.dropout = keras.layers.Dropout(rate=config.attention_probs_dropout_prob)
+
+        self.is_decoder = config.is_decoder  # when true, call() appends the (key, value) pair to its outputs
+        self.config = config  # kept for build(), which reads hidden_size
+
+    def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor:  # splits the last axis into heads and moves heads before sequence
+        # Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size]
+        tensor = tf.reshape(tensor=tensor, shape=(batch_size, -1, self.num_attention_heads, self.attention_head_size))
+
+        # Transpose the tensor from [batch_size, seq_length, num_attention_heads, attention_head_size] to [batch_size, num_attention_heads, seq_length, attention_head_size]
+        return tf.transpose(tensor, perm=[0, 2, 1, 3])
+
+    def call(  # returns (attention_output[, attention_probs][, past_key_value]) depending on output_attentions and is_decoder
+        self,
+        hidden_states: tf.Tensor,
+        attention_mask: tf.Tensor,
+        head_mask: tf.Tensor,
+        encoder_hidden_states: tf.Tensor,
+        encoder_attention_mask: tf.Tensor,
+        past_key_value: tuple[tf.Tensor],
+        output_attentions: bool,
+        training: bool = False,
+    ) -> tuple[tf.Tensor]:
+        batch_size = shape_list(hidden_states)[0]
+        mixed_query_layer = self.query(inputs=hidden_states)  # queries always come from hidden_states
+
+        # If this is instantiated as a cross-attention module, the keys
+        # and values come from an encoder; the attention mask needs to be
+        # such that the encoder's padding tokens are not attended to.
+        is_cross_attention = encoder_hidden_states is not None
+
+        if is_cross_attention and past_key_value is not None:
+            # reuse k,v, cross_attentions
+            key_layer = past_key_value[0]
+            value_layer = past_key_value[1]
+            attention_mask = encoder_attention_mask
+        elif is_cross_attention:  # cross-attention without a cache: project the encoder states
+            key_layer = self.transpose_for_scores(self.key(inputs=encoder_hidden_states), batch_size)
+            value_layer = self.transpose_for_scores(self.value(inputs=encoder_hidden_states), batch_size)
+            attention_mask = encoder_attention_mask
+        elif past_key_value is not None:  # self-attention with a cache: project, then append to the cached tensors on axis 2
+            key_layer = self.transpose_for_scores(self.key(inputs=hidden_states), batch_size)
+            value_layer = self.transpose_for_scores(self.value(inputs=hidden_states), batch_size)
+            key_layer = tf.concat([past_key_value[0], key_layer], axis=2)
+            value_layer = tf.concat([past_key_value[1], value_layer], axis=2)
+        else:  # plain self-attention without a cache
+            key_layer = self.transpose_for_scores(self.key(inputs=hidden_states), batch_size)
+            value_layer = self.transpose_for_scores(self.value(inputs=hidden_states), batch_size)
+
+        query_layer = self.transpose_for_scores(mixed_query_layer, batch_size)
+
+        if self.is_decoder:
+            # if cross_attention save Tuple(tf.Tensor, tf.Tensor) of all cross attention key/value_states.
+            # Further calls to cross_attention layer can then reuse all cross-attention
+            # key/value_states (first "if" case)
+            # if uni-directional self-attention (decoder) save Tuple(tf.Tensor, tf.Tensor) of
+            # all previous decoder key/value_states. Further calls to uni-directional self-attention
+            # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
+            # if encoder bi-directional self-attention `past_key_value` is always `None`
+            past_key_value = (key_layer, value_layer)
+
+        # Take the dot product between "query" and "key" to get the raw attention scores.
+        # (batch size, num_heads, seq_len_q, seq_len_k)
+        attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)
+        dk = tf.cast(self.sqrt_att_head_size, dtype=attention_scores.dtype)  # cast so the division keeps the scores' dtype
+        attention_scores = tf.divide(attention_scores, dk)
+
+        if attention_mask is not None:
+            # Apply the attention mask (precomputed for all layers in TFElectraModel call() function)
+            attention_scores = tf.add(attention_scores, attention_mask)  # the mask is added to the scores, not multiplied
+
+        # Normalize the attention scores to probabilities.
+        attention_probs = stable_softmax(logits=attention_scores, axis=-1)
+
+        # This is actually dropping out entire tokens to attend to, which might
+        # seem a bit unusual, but is taken from the original Transformer paper.
+        attention_probs = self.dropout(inputs=attention_probs, training=training)
+
+        # Mask heads if we want to
+        if head_mask is not None:
+            attention_probs = tf.multiply(attention_probs, head_mask)
+
+        attention_output = tf.matmul(attention_probs, value_layer)
+        attention_output = tf.transpose(attention_output, perm=[0, 2, 1, 3])  # undo the head/sequence swap from transpose_for_scores
+
+        # (batch_size, seq_len_q, all_head_size)
+        attention_output = tf.reshape(tensor=attention_output, shape=(batch_size, -1, self.all_head_size))
+        outputs = (attention_output, attention_probs) if output_attentions else (attention_output,)
+
+        if self.is_decoder:
+            outputs = outputs + (past_key_value,)  # the (key, value) pair is always the last element for decoders
+        return outputs
+
+    def build(self, input_shape=None):  # builds the three Dense sublayers once; input_shape is not used
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "query", None) is not None:
+            with tf.name_scope(self.query.name):  # build inside the sublayer's own name scope
+                self.query.build([None, None, self.config.hidden_size])
+        if getattr(self, "key", None) is not None:
+            with tf.name_scope(self.key.name):
+                self.key.build([None, None, self.config.hidden_size])
+        if getattr(self, "value", None) is not None:
+            with tf.name_scope(self.value.name):
+                self.value.build([None, None, self.config.hidden_size])
+
+
+# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfOutput with Bert->Electra
+class TFElectraSelfOutput(keras.layers.Layer):
+    def __init__(self, config: ElectraConfig, **kwargs):
+        super().__init__(**kwargs)
+        # Projection to config.hidden_size; call() then applies dropout, a residual add and LayerNorm.
+        self.dense = keras.layers.Dense(
+            units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
+        )
+        self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
+        self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
+        self.config = config
+
+    def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
+        """Return `LayerNorm(dropout(dense(hidden_states)) + input_tensor)`."""
+        projected = self.dense(inputs=hidden_states)
+        projected = self.dropout(inputs=projected, training=training)
+        return self.LayerNorm(inputs=projected + input_tensor)
+
+    def build(self, input_shape=None):
+        """Build `dense` and `LayerNorm` under their own name scopes; repeated calls do nothing."""
+        if self.built:
+            return
+        self.built = True
+        for attr_name in ("dense", "LayerNorm"):
+            sublayer = getattr(self, attr_name, None)
+            if sublayer is None:
+                continue
+            with tf.name_scope(sublayer.name):
+                sublayer.build([None, None, self.config.hidden_size])
+
+
+# Copied from transformers.models.bert.modeling_tf_bert.TFBertAttention with Bert->Electra
+class TFElectraAttention(keras.layers.Layer):
+    def __init__(self, config: ElectraConfig, **kwargs):
+        super().__init__(**kwargs)
+        # Attention core (named "self") followed by its output block (named "output").
+        self.self_attention = TFElectraSelfAttention(config, name="self")
+        self.dense_output = TFElectraSelfOutput(config, name="output")
+
+    def prune_heads(self, heads):
+        raise NotImplementedError
+
+    def call(
+        self,
+        input_tensor: tf.Tensor,
+        attention_mask: tf.Tensor,
+        head_mask: tf.Tensor,
+        encoder_hidden_states: tf.Tensor,
+        encoder_attention_mask: tf.Tensor,
+        past_key_value: tuple[tf.Tensor],
+        output_attentions: bool,
+        training: bool = False,
+    ) -> tuple[tf.Tensor]:
+        """Run the attention core, then the output block; return `(output, *extra attention outputs)`."""
+        attn_results = self.self_attention(
+            hidden_states=input_tensor,
+            attention_mask=attention_mask,
+            head_mask=head_mask,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_attention_mask,
+            past_key_value=past_key_value,
+            output_attentions=output_attentions,
+            training=training,
+        )
+        context = attn_results[0]
+        attended = self.dense_output(hidden_states=context, input_tensor=input_tensor, training=training)
+        # Forward whatever follows the first element (attentions, possibly with past_key_value) unchanged.
+        return (attended,) + attn_results[1:]
+
+    def build(self, input_shape=None):
+        """Build both sub-layers inside their own name scopes; repeated calls do nothing."""
+        if self.built:
+            return
+        self.built = True
+        for attr_name in ("self_attention", "dense_output"):
+            sublayer = getattr(self, attr_name, None)
+            if sublayer is None:
+                continue
+            # No input shape is handed down; each child is asked to build with `None`.
+            with tf.name_scope(sublayer.name):
+                sublayer.build(None)
+
+
+# Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate with Bert->Electra
+class TFElectraIntermediate(keras.layers.Layer):
+    def __init__(self, config: ElectraConfig, **kwargs):
+        super().__init__(**kwargs)
+        # Projection to config.intermediate_size units.
+        self.dense = keras.layers.Dense(
+            units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
+        )
+        # `hidden_act` is either an activation name (resolved via get_tf_activation) or used as given.
+        self.intermediate_act_fn = (
+            get_tf_activation(config.hidden_act) if isinstance(config.hidden_act, str) else config.hidden_act
+        )
+        self.config = config
+
+    def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
+        """Apply the dense projection followed by the configured activation."""
+        return self.intermediate_act_fn(self.dense(inputs=hidden_states))
+
+    def build(self, input_shape=None):
+        """Build `dense` for inputs whose last dimension is config.hidden_size; repeated calls do nothing."""
+        if self.built:
+            return
+        self.built = True
+        dense_layer = getattr(self, "dense", None)
+        if dense_layer is None:
+            return
+        with tf.name_scope(dense_layer.name):
+            dense_layer.build([None, None, self.config.hidden_size])
+
+
+# Copied from transformers.models.bert.modeling_tf_bert.TFBertOutput with Bert->Electra
+class TFElectraOutput(keras.layers.Layer):
+    def __init__(self, config: ElectraConfig, **kwargs):
+        super().__init__(**kwargs)
+        # Projection to config.hidden_size; call() then applies dropout, a residual add and LayerNorm.
+        self.dense = keras.layers.Dense(
+            units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
+        )
+        self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
+        self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
+        self.config = config
+
+    def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
+        """Return `LayerNorm(dropout(dense(hidden_states)) + input_tensor)`."""
+        projected = self.dense(inputs=hidden_states)
+        projected = self.dropout(inputs=projected, training=training)
+        return self.LayerNorm(inputs=projected + input_tensor)
+
+    def build(self, input_shape=None):
+        """Build `dense` (intermediate_size-wide input) and `LayerNorm` (hidden_size-wide input) once."""
+        if self.built:
+            return
+        self.built = True
+        for attr_name, width_field in (("dense", "intermediate_size"), ("LayerNorm", "hidden_size")):
+            sublayer = getattr(self, attr_name, None)
+            if sublayer is None:
+                continue
+            with tf.name_scope(sublayer.name):
+                sublayer.build([None, None, getattr(self.config, width_field)])
+
+
+# Copied from transformers.models.bert.modeling_tf_bert.TFBertLayer with Bert->Electra
+class TFElectraLayer(keras.layers.Layer):
+    def __init__(self, config: ElectraConfig, **kwargs):
+        super().__init__(**kwargs)
+        # Sub-layers: self-attention, optional cross-attention (decoder only), then intermediate + output.
+        self.attention = TFElectraAttention(config, name="attention")
+        self.is_decoder = config.is_decoder
+        self.add_cross_attention = config.add_cross_attention
+        if self.add_cross_attention and not self.is_decoder:
+            raise ValueError(f"{self} should be used as a decoder model if cross attention is added")
+        if self.add_cross_attention:
+            self.crossattention = TFElectraAttention(config, name="crossattention")
+        self.intermediate = TFElectraIntermediate(config, name="intermediate")
+        self.bert_output = TFElectraOutput(config, name="output")
+
+    def call(
+        self,
+        hidden_states: tf.Tensor,
+        attention_mask: tf.Tensor,
+        head_mask: tf.Tensor,
+        encoder_hidden_states: tf.Tensor | None,
+        encoder_attention_mask: tf.Tensor | None,
+        past_key_value: tuple[tf.Tensor] | None,
+        output_attentions: bool,
+        training: bool = False,
+    ) -> tuple[tf.Tensor]:
+        """
+        Run self-attention, optional cross-attention, then the intermediate and output blocks.
+
+        Returns a tuple starting with the layer output, followed by any attention outputs and, when the layer is a
+        decoder, the present key/value cache as the last element.
+        """
+        # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
+        self_cache = past_key_value[:2] if past_key_value is not None else None
+        self_attention_outputs = self.attention(
+            input_tensor=hidden_states,
+            attention_mask=attention_mask,
+            head_mask=head_mask,
+            encoder_hidden_states=None,
+            encoder_attention_mask=None,
+            past_key_value=self_cache,
+            output_attentions=output_attentions,
+            training=training,
+        )
+        attention_output = self_attention_outputs[0]
+
+        # if decoder, the last output is tuple of self-attn cache
+        if self.is_decoder:
+            outputs = self_attention_outputs[1:-1]
+            present_key_value = self_attention_outputs[-1]
+        else:
+            outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights
+
+        if self.is_decoder and encoder_hidden_states is not None:
+            if not hasattr(self, "crossattention"):
+                raise ValueError(
+                    f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers"
+                    " by setting `config.add_cross_attention=True`"
+                )
+
+            # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple
+            cross_cache = past_key_value[-2:] if past_key_value is not None else None
+            cross_attention_outputs = self.crossattention(
+                input_tensor=attention_output,
+                attention_mask=attention_mask,
+                head_mask=head_mask,
+                encoder_hidden_states=encoder_hidden_states,
+                encoder_attention_mask=encoder_attention_mask,
+                past_key_value=cross_cache,
+                output_attentions=output_attentions,
+                training=training,
+            )
+            attention_output = cross_attention_outputs[0]
+            outputs = outputs + cross_attention_outputs[1:-1]  # add cross attentions if we output attention weights
+
+            # add cross-attn cache to positions 3,4 of present_key_value tuple
+            present_key_value = present_key_value + cross_attention_outputs[-1]
+
+        intermediate_output = self.intermediate(hidden_states=attention_output)
+        layer_output = self.bert_output(
+            hidden_states=intermediate_output, input_tensor=attention_output, training=training
+        )
+        outputs = (layer_output,) + outputs  # add attentions if we output them
+
+        # if decoder, return the attn key/values as the last output
+        if self.is_decoder:
+            outputs = outputs + (present_key_value,)
+
+        return outputs
+
+    def build(self, input_shape=None):
+        """Build every sub-layer inside its own name scope; repeated calls do nothing."""
+        if self.built:
+            return
+        self.built = True
+        # Build order: attention, intermediate, output, then cross-attention when it exists.
+        for attr_name in ("attention", "intermediate", "bert_output", "crossattention"):
+            sublayer = getattr(self, attr_name, None)
+            if sublayer is None:
+                continue
+            with tf.name_scope(sublayer.name):
+                sublayer.build(None)
+
+
+# Copied from transformers.models.bert.modeling_tf_bert.TFBertEncoder with Bert->Electra
+class TFElectraEncoder(keras.layers.Layer):
+    def __init__(self, config: ElectraConfig, **kwargs):
+        super().__init__(**kwargs)
+        self.config = config
+        self.layer = [TFElectraLayer(config, name=f"layer_._{i}") for i in range(config.num_hidden_layers)]
+
+    def call(
+        self,
+        hidden_states: tf.Tensor,
+        attention_mask: tf.Tensor,
+        head_mask: tf.Tensor,
+        encoder_hidden_states: tf.Tensor | None,
+        encoder_attention_mask: tf.Tensor | None,
+        past_key_values: tuple[tuple[tf.Tensor]] | None,
+        use_cache: bool | None,
+        output_attentions: bool,
+        output_hidden_states: bool,
+        return_dict: bool,
+        training: bool = False,
+    ) -> TFBaseModelOutputWithPastAndCrossAttentions | tuple[tf.Tensor]:
+        """Run every layer in order, optionally collecting hidden states, attentions and the key/value cache."""
+        all_hidden_states = () if output_hidden_states else None
+        all_attentions = () if output_attentions else None
+        all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
+        next_decoder_cache = () if use_cache else None
+        for i, layer_module in enumerate(self.layer):
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+            layer_past = past_key_values[i] if past_key_values is not None else None
+            layer_outputs = layer_module(
+                hidden_states=hidden_states,
+                attention_mask=attention_mask,
+                head_mask=head_mask[i],
+                encoder_hidden_states=encoder_hidden_states,
+                encoder_attention_mask=encoder_attention_mask,
+                past_key_value=layer_past,
+                output_attentions=output_attentions,
+                training=training,
+            )
+            hidden_states = layer_outputs[0]
+
+            if use_cache:
+                # The cache is the last element of each layer's output tuple.
+                next_decoder_cache += (layer_outputs[-1],)
+
+            if output_attentions:
+                all_attentions += (layer_outputs[1],)
+                if self.config.add_cross_attention and encoder_hidden_states is not None:
+                    all_cross_attentions += (layer_outputs[2],)
+
+        # Add last layer
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+
+        if not return_dict:
+            # Tuple form: entries that were not requested (None) are dropped; the cache is not included.
+            collected = [hidden_states, all_hidden_states, all_attentions, all_cross_attentions]
+            return tuple(v for v in collected if v is not None)
+
+        return TFBaseModelOutputWithPastAndCrossAttentions(
+            last_hidden_state=hidden_states,
+            past_key_values=next_decoder_cache,
+            hidden_states=all_hidden_states,
+            attentions=all_attentions,
+            cross_attentions=all_cross_attentions,
+        )
+
+    def build(self, input_shape=None):
+        """Build each layer inside its own name scope; repeated calls do nothing."""
+        if self.built:
+            return
+        self.built = True
+        stacked_layers = getattr(self, "layer", None)
+        for block in stacked_layers if stacked_layers is not None else ():
+            with tf.name_scope(block.name):
+                block.build(None)
+
+
+# Copied from transformers.models.bert.modeling_tf_bert.TFBertPooler with Bert->Electra
+class TFElectraPooler(keras.layers.Layer):
+    def __init__(self, config: ElectraConfig, **kwargs):
+        super().__init__(**kwargs)
+        # Dense projection with a tanh activation, applied to the first token's hidden state in call().
+        self.dense = keras.layers.Dense(
+            units=config.hidden_size,
+            kernel_initializer=get_initializer(config.initializer_range),
+            activation="tanh",
+            name="dense",
+        )
+        self.config = config
+
+    def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
+        """Return the dense+tanh projection of the first token's hidden state."""
+        # "Pooling" here simply means picking index 0 along the second axis.
+        first_token_state = hidden_states[:, 0]
+        return self.dense(inputs=first_token_state)
+
+    def build(self, input_shape=None):
+        """Build `dense` for inputs whose last dimension is config.hidden_size; repeated calls do nothing."""
+        if self.built:
+            return
+        self.built = True
+        dense_layer = getattr(self, "dense", None)
+        if dense_layer is not None:
+            with tf.name_scope(dense_layer.name):
+                dense_layer.build([None, None, self.config.hidden_size])
+
+
+# Copied from transformers.models.albert.modeling_tf_albert.TFAlbertEmbeddings with Albert->Electra
+class TFElectraEmbeddings(keras.layers.Layer):
+    """Construct the embeddings from word, position and token_type embeddings."""
+
+    def __init__(self, config: ElectraConfig, **kwargs):
+        super().__init__(**kwargs)
+        self.config = config
+        self.embedding_size = config.embedding_size
+        self.max_position_embeddings = config.max_position_embeddings
+        self.initializer_range = config.initializer_range
+        self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
+        self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
+
+    def build(self, input_shape=None):
+        """Create the three embedding tables and build LayerNorm once; calling build() again is a no-op."""
+        if self.built:  # checked before any add_weight call so the tables are never created twice
+            return
+        self.built = True
+        with tf.name_scope("word_embeddings"):
+            self.weight = self.add_weight(
+                name="weight",
+                shape=[self.config.vocab_size, self.embedding_size],
+                initializer=get_initializer(self.initializer_range),
+            )
+
+        with tf.name_scope("token_type_embeddings"):
+            self.token_type_embeddings = self.add_weight(
+                name="embeddings",
+                shape=[self.config.type_vocab_size, self.embedding_size],
+                initializer=get_initializer(self.initializer_range),
+            )
+
+        with tf.name_scope("position_embeddings"):
+            self.position_embeddings = self.add_weight(
+                name="embeddings",
+                shape=[self.max_position_embeddings, self.embedding_size],
+                initializer=get_initializer(self.initializer_range),
+            )
+
+        if getattr(self, "LayerNorm", None) is not None:
+            with tf.name_scope(self.LayerNorm.name):
+                self.LayerNorm.build([None, None, self.config.embedding_size])
+
+    # Copied from transformers.models.bert.modeling_tf_bert.TFBertEmbeddings.call
+    def call(
+        self,
+        input_ids: tf.Tensor | None = None,
+        position_ids: tf.Tensor | None = None,
+        token_type_ids: tf.Tensor | None = None,
+        inputs_embeds: tf.Tensor | None = None,
+        past_key_values_length=0,
+        training: bool = False,
+    ) -> tf.Tensor:
+        """
+        Applies embedding based on inputs tensor.
+
+        Returns:
+            final_embeddings (`tf.Tensor`): output embedding tensor.
+        """
+        if input_ids is None and inputs_embeds is None:
+            raise ValueError("Need to provide either `input_ids` or `input_embeds`.")
+
+        if input_ids is not None:
+            check_embeddings_within_bounds(input_ids, self.config.vocab_size)
+            inputs_embeds = tf.gather(params=self.weight, indices=input_ids)
+
+        input_shape = shape_list(inputs_embeds)[:-1]
+
+        if token_type_ids is None:
+            token_type_ids = tf.fill(dims=input_shape, value=0)
+
+        if position_ids is None:
+            position_ids = tf.expand_dims(
+                tf.range(start=past_key_values_length, limit=input_shape[1] + past_key_values_length), axis=0
+            )
+
+        position_embeds = tf.gather(params=self.position_embeddings, indices=position_ids)
+        token_type_embeds = tf.gather(params=self.token_type_embeddings, indices=token_type_ids)
+        final_embeddings = inputs_embeds + position_embeds + token_type_embeds
+        final_embeddings = self.LayerNorm(inputs=final_embeddings)
+        final_embeddings = self.dropout(inputs=final_embeddings, training=training)
+
+        return final_embeddings
+
+
+class TFElectraDiscriminatorPredictions(keras.layers.Layer):
+    def __init__(self, config, **kwargs):
+        super().__init__(**kwargs)
+        # hidden_size-wide projection followed by a single-unit prediction layer.
+        self.dense = keras.layers.Dense(config.hidden_size, name="dense")
+        self.dense_prediction = keras.layers.Dense(1, name="dense_prediction")
+        self.config = config
+
+    def call(self, discriminator_hidden_states, training=False):
+        """Return logits from dense -> `config.hidden_act` -> single-unit dense, with the last axis squeezed."""
+        projected = self.dense(discriminator_hidden_states)
+        projected = get_tf_activation(self.config.hidden_act)(projected)
+        # The prediction layer has one unit, so its trailing axis is removed.
+        return tf.squeeze(self.dense_prediction(projected), -1)
+
+    def build(self, input_shape=None):
+        """Build both dense layers for hidden_size-wide inputs; repeated calls do nothing."""
+        if self.built:
+            return
+        self.built = True
+        for attr_name in ("dense", "dense_prediction"):
+            sublayer = getattr(self, attr_name, None)
+            if sublayer is not None:
+                with tf.name_scope(sublayer.name):
+                    sublayer.build([None, None, self.config.hidden_size])
+
+
+class TFElectraGeneratorPredictions(keras.layers.Layer):
+    def __init__(self, config, **kwargs):
+        super().__init__(**kwargs)
+        # LayerNorm is created first, then the embedding_size-wide projection.
+        self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
+        self.dense = keras.layers.Dense(config.embedding_size, name="dense")
+        self.config = config
+
+    def call(self, generator_hidden_states, training=False):
+        """Return `LayerNorm(gelu(dense(generator_hidden_states)))`."""
+        projected = self.dense(generator_hidden_states)
+        projected = get_tf_activation("gelu")(projected)
+        return self.LayerNorm(projected)
+
+    def build(self, input_shape=None):
+        """Build `LayerNorm` (embedding_size-wide input) and `dense` (hidden_size-wide input) once."""
+        if self.built:
+            return
+        self.built = True
+        for attr_name, width_field in (("LayerNorm", "embedding_size"), ("dense", "hidden_size")):
+            sublayer = getattr(self, attr_name, None)
+            if sublayer is None:
+                continue
+            with tf.name_scope(sublayer.name):
+                sublayer.build([None, None, getattr(self.config, width_field)])
+
+
+class TFElectraPreTrainedModel(TFPreTrainedModel):
+    """
+    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+    models. This subclass only sets class-level attributes: the config class, the base-model prefix and load filters.
+    """
+
+    config_class = ElectraConfig
+    base_model_prefix = "electra"
+    # When the model is loaded from a PT model (presumably PyTorch; the raw strings below look like key patterns)
+    _keys_to_ignore_on_load_unexpected = [r"generator_lm_head.weight"]
+    _keys_to_ignore_on_load_missing = [r"dropout"]
+
+
+@keras_serializable
+class TFElectraMainLayer(keras.layers.Layer):
+    config_class = ElectraConfig
+
+    def __init__(self, config, **kwargs):
+        super().__init__(**kwargs)
+
+        self.config = config
+        self.is_decoder = config.is_decoder
+
+        self.embeddings = TFElectraEmbeddings(config, name="embeddings")
+
+        if config.embedding_size != config.hidden_size:
+            self.embeddings_project = keras.layers.Dense(config.hidden_size, name="embeddings_project")
+
+        self.encoder = TFElectraEncoder(config, name="encoder")
+
+    def get_input_embeddings(self):
+        return self.embeddings
+
+    def set_input_embeddings(self, value):
+        self.embeddings.weight = value
+        self.embeddings.vocab_size = shape_list(value)[0]  # NOTE(review): embeddings read config.vocab_size instead
+
+    def _prune_heads(self, heads_to_prune):
+        """
+        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
+        class PreTrainedModel
+        """
+        raise NotImplementedError
+
+    def get_extended_attention_mask(self, attention_mask, input_shape, dtype, past_key_values_length=0):
+        """Turn a padding mask into an additive 4D mask (0.0 = attend, -10000.0 = masked); causal if decoder."""
+        batch_size, seq_length = input_shape
+        if attention_mask is None:
+            attention_mask = tf.fill(dims=(batch_size, seq_length + past_key_values_length), value=1)
+
+        # We create a 4D attention mask from a 2D tensor mask.
+        # Sizes are [batch_size, 1, 1, to_seq_length]
+        # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
+        # this attention mask is more simple than the triangular masking of causal attention
+        # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
+        attention_mask_shape = shape_list(attention_mask)
+
+        mask_seq_length = seq_length + past_key_values_length
+        # Copied from `modeling_tf_t5.py`
+        # Provided a padding mask of dimensions [batch_size, mask_seq_length]
+        # - if the model is a decoder, apply a causal mask in addition to the padding mask
+        # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, mask_seq_length, mask_seq_length]
+        if self.is_decoder:
+            seq_ids = tf.range(mask_seq_length)
+            causal_mask = tf.less_equal(
+                tf.tile(seq_ids[None, None, :], (batch_size, mask_seq_length, 1)),
+                seq_ids[None, :, None],
+            )
+            causal_mask = tf.cast(causal_mask, dtype=attention_mask.dtype)
+            extended_attention_mask = causal_mask * attention_mask[:, None, :]
+            attention_mask_shape = shape_list(extended_attention_mask)
+            extended_attention_mask = tf.reshape(
+                extended_attention_mask, (attention_mask_shape[0], 1, attention_mask_shape[1], attention_mask_shape[2])
+            )
+            if past_key_values_length > 0:
+                extended_attention_mask = extended_attention_mask[:, :, -seq_length:, :]
+        else:
+            extended_attention_mask = tf.reshape(
+                attention_mask, (attention_mask_shape[0], 1, 1, attention_mask_shape[1])
+            )
+
+        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
+        # masked positions, this operation will create a tensor which is 0.0 for
+        # positions we want to attend and -10000.0 for masked positions.
+        # Since we are adding it to the raw scores before the softmax, this is
+        # effectively the same as removing these entirely.
+        extended_attention_mask = tf.cast(extended_attention_mask, dtype=dtype)
+        one_cst = tf.constant(1.0, dtype=dtype)
+        ten_thousand_cst = tf.constant(-10000.0, dtype=dtype)
+        extended_attention_mask = tf.multiply(tf.subtract(one_cst, extended_attention_mask), ten_thousand_cst)
+
+        return extended_attention_mask
+
+    def get_head_mask(self, head_mask):
+        if head_mask is not None:
+            raise NotImplementedError  # a caller-supplied head mask is not supported here
+        else:
+            head_mask = [None] * self.config.num_hidden_layers
+
+        return head_mask
+
+    @unpack_inputs
+    def call(
+        self,
+        input_ids: TFModelInputType | None = None,
+        attention_mask: np.ndarray | tf.Tensor | None = None,
+        token_type_ids: np.ndarray | tf.Tensor | None = None,
+        position_ids: np.ndarray | tf.Tensor | None = None,
+        head_mask: np.ndarray | tf.Tensor | None = None,
+        inputs_embeds: np.ndarray | tf.Tensor | None = None,
+        encoder_hidden_states: np.ndarray | tf.Tensor | None = None,
+        encoder_attention_mask: np.ndarray | tf.Tensor | None = None,
+        past_key_values: tuple[tuple[np.ndarray | tf.Tensor]] | None = None,
+        use_cache: bool | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        training: bool | None = False,
+    ) -> TFBaseModelOutputWithPastAndCrossAttentions | tuple[tf.Tensor]:
+        if not self.config.is_decoder:
+            use_cache = False
+
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+        elif input_ids is not None:
+            input_shape = shape_list(input_ids)
+        elif inputs_embeds is not None:
+            input_shape = shape_list(inputs_embeds)[:-1]
+        else:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+        batch_size, seq_length = input_shape
+
+        if past_key_values is None:
+            past_key_values_length = 0
+            past_key_values = [None] * len(self.encoder.layer)
+        else:
+            past_key_values_length = shape_list(past_key_values[0][0])[-2]
+
+        if attention_mask is None:
+            attention_mask = tf.fill(dims=(batch_size, seq_length + past_key_values_length), value=1)
+
+        if token_type_ids is None:
+            token_type_ids = tf.fill(dims=input_shape, value=0)
+
+        hidden_states = self.embeddings(
+            input_ids=input_ids,
+            position_ids=position_ids,
+            token_type_ids=token_type_ids,
+            inputs_embeds=inputs_embeds,
+            past_key_values_length=past_key_values_length,
+            training=training,
+        )
+        extended_attention_mask = self.get_extended_attention_mask(
+            attention_mask, input_shape, hidden_states.dtype, past_key_values_length
+        )
+
+        # Copied from `modeling_tf_t5.py` with -1e9 -> -10000
+        if self.is_decoder and encoder_attention_mask is not None:
+            # If a 2D or 3D attention mask is provided for the cross-attention
+            # we need to make broadcastable to [batch_size, num_heads, mask_seq_length, mask_seq_length]
+            encoder_attention_mask = tf.cast(encoder_attention_mask, dtype=extended_attention_mask.dtype)
+            num_dims_encoder_attention_mask = len(shape_list(encoder_attention_mask))
+            if num_dims_encoder_attention_mask == 3:
+                encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :]
+            elif num_dims_encoder_attention_mask == 2:
+                encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :]
+            else:
+                raise ValueError("`encoder_attention_mask` must be a 2D or 3D tensor")  # was an UnboundLocalError
+            # T5 has a mask that can compare sequence ids, we can simulate this here with this transposition
+            # Cf. https://github.com/tensorflow/mesh/blob/8d2465e9bc93129b913b5ccc6a59aa97abd96ec6/mesh_tensorflow/transformer/transformer_layers.py#L270
+            # encoder_extended_attention_mask = tf.math.equal(encoder_extended_attention_mask,
+            #                                         tf.transpose(encoder_extended_attention_mask, perm=(-1, -2)))
+
+            encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -10000.0
+        else:
+            encoder_extended_attention_mask = None
+
+        head_mask = self.get_head_mask(head_mask)
+
+        if hasattr(self, "embeddings_project"):
+            hidden_states = self.embeddings_project(hidden_states, training=training)
+
+        hidden_states = self.encoder(
+            hidden_states=hidden_states,
+            attention_mask=extended_attention_mask,
+            head_mask=head_mask,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_extended_attention_mask,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+
+        return hidden_states
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "embeddings", None) is not None:
+            with tf.name_scope(self.embeddings.name):
+                self.embeddings.build(None)
+        if getattr(self, "encoder", None) is not None:
+            with tf.name_scope(self.encoder.name):
+                self.encoder.build(None)
+        if getattr(self, "embeddings_project", None) is not None:
+            with tf.name_scope(self.embeddings_project.name):
+                self.embeddings_project.build([None, None, self.config.embedding_size])
+
+
+@dataclass
+class TFElectraForPreTrainingOutput(ModelOutput):
+    """
+    Output type of [`TFElectraForPreTraining`].
+
+    Args:
+        loss (*optional*, returned when `labels` is provided, `tf.Tensor` of shape `(1,)`):
+            Total loss of the ELECTRA objective.
+        logits (`tf.Tensor` of shape `(batch_size, sequence_length)`):
+            Prediction scores of the head (scores for each token before SoftMax).
+        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
+            `(batch_size, sequence_length, hidden_size)`.
+
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`.
+
+            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
+            heads.
+    """
+    # NOTE(review): the docstring describes a `loss` entry, but no `loss` field is declared below — confirm intent.
+    logits: tf.Tensor | None = None
+    hidden_states: tuple[tf.Tensor] | None = None
+    attentions: tuple[tf.Tensor] | None = None
+
+
+ELECTRA_START_DOCSTRING = r"""
+
+    This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.)
+
+    This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
+    as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matters related to general usage and
+    behavior.
+
+    <Tip>
+
+    TensorFlow models and layers in `transformers` accept two formats as input:
+
+    - having all inputs as keyword arguments (like PyTorch models), or
+    - having all inputs as a list, tuple or dict in the first positional argument.
+
+    The reason the second format is supported is that Keras methods prefer this format when passing inputs to models
+    and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just
+    pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second
+    format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with
+    the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first
+    positional argument:
+
+    - a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
+    - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
+    `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
+    - a dictionary with one or several input Tensors associated to the input names given in the docstring:
+    `model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
+
+    Note that when creating models and layers with
+    [subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry
+    about any of this, as you can just pass inputs like you would to any other Python function!
+
+    </Tip>
+
+    Parameters:
+        config ([`ElectraConfig`]): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+ELECTRA_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`Numpy array` or `tf.Tensor` of shape `({0})`):
+            Indices of input sequence tokens in the vocabulary.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and
+            [`PreTrainedTokenizer.encode`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        attention_mask (`Numpy array` or `tf.Tensor` of shape `({0})`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
+        position_ids (`Numpy array` or `tf.Tensor` of shape `({0})`, *optional*):
+            Indices of positions of each input sequence token in the position embeddings. Selected in the range `[0,
+            config.max_position_embeddings - 1]`.
+
+            [What are position IDs?](../glossary#position-ids)
+        head_mask (`Numpy array` or `tf.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+
+        inputs_embeds (`tf.Tensor` of shape `({0}, hidden_size)`, *optional*):
+            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+            model's internal embedding lookup matrix.
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the
+            config will be used instead.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
+            used instead.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in
+            eager mode, in graph mode the value will always be set to True.
+        training (`bool`, *optional*, defaults to `False`):
+            Whether or not to use the model in training mode (some modules like dropout modules have different
+            behaviors between training and evaluation).
+"""
+
+
+@add_start_docstrings(
+    "The bare Electra Model transformer outputting raw hidden-states without any specific head on top. Identical to "
+    "the BERT model except that it uses an additional linear layer between the embedding layer and the encoder if the "
+    "hidden size and embedding size are different. "
+    ""
+    "Both the generator and discriminator checkpoints may be loaded into this model.",
+    ELECTRA_START_DOCSTRING,
+)
+class TFElectraModel(TFElectraPreTrainedModel):
+    def __init__(self, config, *inputs, **kwargs):
+        super().__init__(config, *inputs, **kwargs)
+
+        self.electra = TFElectraMainLayer(config, name="electra")  # the only sublayer; `call` forwards to it
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(ELECTRA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_code_sample_docstrings(
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=TFBaseModelOutputWithPastAndCrossAttentions,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def call(
+        self,
+        input_ids: TFModelInputType | None = None,
+        attention_mask: np.ndarray | tf.Tensor | None = None,
+        token_type_ids: np.ndarray | tf.Tensor | None = None,
+        position_ids: np.ndarray | tf.Tensor | None = None,
+        head_mask: np.ndarray | tf.Tensor | None = None,
+        inputs_embeds: np.ndarray | tf.Tensor | None = None,
+        encoder_hidden_states: np.ndarray | tf.Tensor | None = None,
+        encoder_attention_mask: np.ndarray | tf.Tensor | None = None,
+        past_key_values: tuple[tuple[np.ndarray | tf.Tensor]] | None = None,
+        use_cache: bool | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        training: bool | None = False,
+    ) -> TFBaseModelOutputWithPastAndCrossAttentions | tuple[tf.Tensor]:
+        r"""
+        encoder_hidden_states (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
+            the model is configured as a decoder.
+        encoder_attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
+            the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+        past_key_values (`tuple[tuple[tf.Tensor]]` of length `config.n_layers`):
+            Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
+            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
+            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
+            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+            `past_key_values`). Set to `False` during training, `True` during generation.
+        """
+        # Every argument is forwarded unchanged, by keyword, to the main layer.
+        outputs = self.electra(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_attention_mask,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+
+        return outputs
+
+    def build(self, input_shape=None):
+        if self.built:  # already built: nothing to do
+            return
+        self.built = True  # flag is set before the sublayer is built
+        if getattr(self, "electra", None) is not None:
+            with tf.name_scope(self.electra.name):  # build under the sublayer's own name scope
+                self.electra.build(None)
+
+
+@add_start_docstrings(
+    """
+    Electra model with a binary classification head on top as used during pretraining for identifying generated tokens.
+
+    Even though both the discriminator and generator may be loaded into this model, the discriminator is the only model
+    of the two to have the correct classification head to be used for this model.
+    """,
+    ELECTRA_START_DOCSTRING,
+)
+class TFElectraForPreTraining(TFElectraPreTrainedModel):
+    def __init__(self, config, **kwargs):
+        super().__init__(config, **kwargs)
+
+        self.electra = TFElectraMainLayer(config, name="electra")
+        self.discriminator_predictions = TFElectraDiscriminatorPredictions(config, name="discriminator_predictions")
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(ELECTRA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @replace_return_docstrings(output_type=TFElectraForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
+    def call(
+        self,
+        input_ids: TFModelInputType | None = None,
+        attention_mask: np.ndarray | tf.Tensor | None = None,
+        token_type_ids: np.ndarray | tf.Tensor | None = None,
+        position_ids: np.ndarray | tf.Tensor | None = None,
+        head_mask: np.ndarray | tf.Tensor | None = None,
+        inputs_embeds: np.ndarray | tf.Tensor | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        training: bool | None = False,
+    ) -> TFElectraForPreTrainingOutput | tuple[tf.Tensor]:
+        r"""
+        Returns:
+
+        Examples:
+
+        ```python
+        >>> import tensorflow as tf
+        >>> from transformers import AutoTokenizer, TFElectraForPreTraining
+
+        >>> tokenizer = AutoTokenizer.from_pretrained("google/electra-small-discriminator")
+        >>> model = TFElectraForPreTraining.from_pretrained("google/electra-small-discriminator")
+        >>> input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
+        >>> outputs = model(input_ids)
+        >>> scores = outputs[0]
+        ```"""
+        discriminator_hidden_states = self.electra(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+        discriminator_sequence_output = discriminator_hidden_states[0]  # element 0 feeds the discriminator head
+        logits = self.discriminator_predictions(discriminator_sequence_output)
+
+        if not return_dict:  # tuple form: logits first, then the main layer's remaining outputs
+            return (logits,) + discriminator_hidden_states[1:]
+
+        return TFElectraForPreTrainingOutput(
+            logits=logits,
+            hidden_states=discriminator_hidden_states.hidden_states,
+            attentions=discriminator_hidden_states.attentions,
+        )
+
+    def build(self, input_shape=None):
+        if self.built:  # already built: nothing to do
+            return
+        self.built = True  # flag is set before any sublayer is built
+        if getattr(self, "electra", None) is not None:
+            with tf.name_scope(self.electra.name):  # each sublayer is built under its own name scope
+                self.electra.build(None)
+        if getattr(self, "discriminator_predictions", None) is not None:
+            with tf.name_scope(self.discriminator_predictions.name):
+                self.discriminator_predictions.build(None)
+
+
+class TFElectraMaskedLMHead(keras.layers.Layer):
+    def __init__(self, config, input_embeddings, **kwargs):
+        super().__init__(**kwargs)
+
+        self.config = config
+        self.embedding_size = config.embedding_size
+        self.input_embeddings = input_embeddings
+
+    def build(self, input_shape):
+        self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias")
+
+        super().build(input_shape)
+
+    def get_output_embeddings(self):
+        return self.input_embeddings
+
+    def set_output_embeddings(self, value):
+        self.input_embeddings.weight = value
+        self.input_embeddings.vocab_size = shape_list(value)[0]
+
+    def get_bias(self):
+        return {"bias": self.bias}
+
+    def set_bias(self, value):
+        self.bias = value["bias"]
+        self.config.vocab_size = shape_list(value["bias"])[0]
+
+    def call(self, hidden_states):
+        seq_length = shape_list(tensor=hidden_states)[1]
+        hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.embedding_size])
+        hidden_states = tf.matmul(a=hidden_states, b=self.input_embeddings.weight, transpose_b=True)
+        hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.config.vocab_size])
+        hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.bias)
+
+        return hidden_states
+
+
+@add_start_docstrings(
+    """
+    Electra model with a language modeling head on top.
+
+    Even though both the discriminator and generator may be loaded into this model, the generator is the only model of
+    the two to have been trained for the masked language modeling task.
+    """,
+    ELECTRA_START_DOCSTRING,
+)
+class TFElectraForMaskedLM(TFElectraPreTrainedModel, TFMaskedLanguageModelingLoss):
+    def __init__(self, config, **kwargs):
+        super().__init__(config, **kwargs)
+
+        self.config = config
+        self.electra = TFElectraMainLayer(config, name="electra")
+        self.generator_predictions = TFElectraGeneratorPredictions(config, name="generator_predictions")
+
+        # NOTE(review): `self.activation` is assigned here but not referenced elsewhere in this class.
+        if isinstance(config.hidden_act, str):
+            self.activation = get_tf_activation(config.hidden_act)
+        else:
+            self.activation = config.hidden_act
+
+        self.generator_lm_head = TFElectraMaskedLMHead(config, self.electra.embeddings, name="generator_lm_head")
+
+    def get_lm_head(self):
+        return self.generator_lm_head
+
+    def get_prefix_bias_name(self):
+        warnings.warn("The method get_prefix_bias_name is deprecated. Please use `get_bias` instead.", FutureWarning)
+        return self.name + "/" + self.generator_lm_head.name
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(ELECTRA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_code_sample_docstrings(
+        checkpoint="google/electra-small-generator",
+        output_type=TFMaskedLMOutput,
+        config_class=_CONFIG_FOR_DOC,
+        mask="[MASK]",
+        expected_output="'paris'",
+        expected_loss=1.22,
+    )
+    def call(
+        self,
+        input_ids: TFModelInputType | None = None,
+        attention_mask: np.ndarray | tf.Tensor | None = None,
+        token_type_ids: np.ndarray | tf.Tensor | None = None,
+        position_ids: np.ndarray | tf.Tensor | None = None,
+        head_mask: np.ndarray | tf.Tensor | None = None,
+        inputs_embeds: np.ndarray | tf.Tensor | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        labels: np.ndarray | tf.Tensor | None = None,
+        training: bool | None = False,
+    ) -> TFMaskedLMOutput | tuple[tf.Tensor]:
+        r"""
+        labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
+            config.vocab_size - 1]` (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+            (masked); the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size - 1]`.
+        """
+        generator_hidden_states = self.electra(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+        generator_sequence_output = generator_hidden_states[0]  # element 0 feeds the prediction layers
+        prediction_scores = self.generator_predictions(generator_sequence_output, training=training)
+        prediction_scores = self.generator_lm_head(prediction_scores, training=training)
+        loss = None if labels is None else self.hf_compute_loss(labels, prediction_scores)
+
+        if not return_dict:
+            output = (prediction_scores,) + generator_hidden_states[1:]
+
+            return ((loss,) + output) if loss is not None else output  # loss is prepended only when computed
+
+        return TFMaskedLMOutput(
+            loss=loss,
+            logits=prediction_scores,
+            hidden_states=generator_hidden_states.hidden_states,
+            attentions=generator_hidden_states.attentions,
+        )
+
+    def build(self, input_shape=None):
+        if self.built:  # already built: nothing to do
+            return
+        self.built = True  # flag is set before any sublayer is built
+        if getattr(self, "electra", None) is not None:
+            with tf.name_scope(self.electra.name):  # each sublayer is built under its own name scope
+                self.electra.build(None)
+        if getattr(self, "generator_predictions", None) is not None:
+            with tf.name_scope(self.generator_predictions.name):
+                self.generator_predictions.build(None)
+        if getattr(self, "generator_lm_head", None) is not None:
+            with tf.name_scope(self.generator_lm_head.name):
+                self.generator_lm_head.build(None)
+
+
+class TFElectraClassificationHead(keras.layers.Layer):
+    """Head for sentence-level classification tasks."""
+
+    def __init__(self, config, **kwargs):
+        super().__init__(**kwargs)
+
+        self.dense = keras.layers.Dense(
+            config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
+        )
+        classifier_dropout = (
+            config.classifhidden_dropout_probier_dropout
+            if config.classifier_dropout is not None
+            else config.hidden_dropout_prob
+        )
+        self.dropout = keras.layers.Dropout(classifier_dropout)
+        self.out_proj = keras.layers.Dense(
+            config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="out_proj"
+        )
+        self.config = config
+
+    def call(self, inputs, **kwargs):
+        x = inputs[:, 0, :]  # take <s> token (equiv. to [CLS])
+        x = self.dropout(x)
+        x = self.dense(x)
+        x = get_tf_activation("gelu")(x)  # although BERT uses tanh here, it seems Electra authors used gelu here
+        x = self.dropout(x)
+        x = self.out_proj(x)
+
+        return x
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "dense", None) is not None:
+            with tf.name_scope(self.dense.name):
+                self.dense.build([None, None, self.config.hidden_size])
+        if getattr(self, "out_proj", None) is not None:
+            with tf.name_scope(self.out_proj.name):
+                self.out_proj.build([None, None, self.config.hidden_size])
+
+
+@add_start_docstrings(
+    """
+    ELECTRA Model transformer with a sequence classification/regression head on top (a linear layer on top of the
+    pooled output) e.g. for GLUE tasks.
+    """,
+    ELECTRA_START_DOCSTRING,
+)
+class TFElectraForSequenceClassification(TFElectraPreTrainedModel, TFSequenceClassificationLoss):
+    def __init__(self, config, *inputs, **kwargs):
+        super().__init__(config, *inputs, **kwargs)
+        self.num_labels = config.num_labels
+        self.electra = TFElectraMainLayer(config, name="electra")
+        self.classifier = TFElectraClassificationHead(config, name="classifier")
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(ELECTRA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_code_sample_docstrings(
+        checkpoint="bhadresh-savani/electra-base-emotion",
+        output_type=TFSequenceClassifierOutput,
+        config_class=_CONFIG_FOR_DOC,
+        expected_output="'joy'",
+        expected_loss=0.06,
+    )
+    def call(
+        self,
+        input_ids: TFModelInputType | None = None,
+        attention_mask: np.ndarray | tf.Tensor | None = None,
+        token_type_ids: np.ndarray | tf.Tensor | None = None,
+        position_ids: np.ndarray | tf.Tensor | None = None,
+        head_mask: np.ndarray | tf.Tensor | None = None,
+        inputs_embeds: np.ndarray | tf.Tensor | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        labels: np.ndarray | tf.Tensor | None = None,
+        training: bool | None = False,
+    ) -> TFSequenceClassifierOutput | tuple[tf.Tensor]:
+        r"""
+        labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss); if
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+        """
+        outputs = self.electra(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+        logits = self.classifier(outputs[0])  # the head receives element 0 of the main layer's output
+        loss = None if labels is None else self.hf_compute_loss(labels, logits)
+
+        if not return_dict:
+            output = (logits,) + outputs[1:]
+
+            return ((loss,) + output) if loss is not None else output  # loss is prepended only when computed
+
+        return TFSequenceClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+    def build(self, input_shape=None):
+        if self.built:  # already built: nothing to do
+            return
+        self.built = True  # flag is set before any sublayer is built
+        if getattr(self, "electra", None) is not None:
+            with tf.name_scope(self.electra.name):  # each sublayer is built under its own name scope
+                self.electra.build(None)
+        if getattr(self, "classifier", None) is not None:
+            with tf.name_scope(self.classifier.name):
+                self.classifier.build(None)
+
+
+@add_start_docstrings(
+    """
+    ELECTRA Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
+    softmax) e.g. for RocStories/SWAG tasks.
+    """,
+    ELECTRA_START_DOCSTRING,
+)
+class TFElectraForMultipleChoice(TFElectraPreTrainedModel, TFMultipleChoiceLoss):
+    def __init__(self, config, *inputs, **kwargs):
+        super().__init__(config, *inputs, **kwargs)
+
+        self.electra = TFElectraMainLayer(config, name="electra")
+        self.sequence_summary = TFSequenceSummary(
+            config, initializer_range=config.initializer_range, name="sequence_summary"
+        )
+        self.classifier = keras.layers.Dense(
+            1, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
+        )
+        self.config = config  # kept so `build` can read `hidden_size`
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(ELECTRA_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
+    @add_code_sample_docstrings(
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=TFMultipleChoiceModelOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def call(
+        self,
+        input_ids: TFModelInputType | None = None,
+        attention_mask: np.ndarray | tf.Tensor | None = None,
+        token_type_ids: np.ndarray | tf.Tensor | None = None,
+        position_ids: np.ndarray | tf.Tensor | None = None,
+        head_mask: np.ndarray | tf.Tensor | None = None,
+        inputs_embeds: np.ndarray | tf.Tensor | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        labels: np.ndarray | tf.Tensor | None = None,
+        training: bool | None = False,
+    ) -> TFMultipleChoiceModelOutput | tuple[tf.Tensor]:
+        r"""
+        labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
+            num_choices - 1]`, where `num_choices` is the size of the inputs' second dimension (see `input_ids`).
+        """
+
+        # Axis 1 is read as the number of choices and axis 2 as the sequence length.
+        if input_ids is not None:
+            num_choices = shape_list(input_ids)[1]
+            seq_length = shape_list(input_ids)[2]
+        else:
+            num_choices = shape_list(inputs_embeds)[1]
+            seq_length = shape_list(inputs_embeds)[2]
+
+        # Fold the leading axes together so each choice becomes its own row for the main layer.
+        flat_input_ids = tf.reshape(input_ids, (-1, seq_length)) if input_ids is not None else None
+        flat_attention_mask = tf.reshape(attention_mask, (-1, seq_length)) if attention_mask is not None else None
+        flat_token_type_ids = tf.reshape(token_type_ids, (-1, seq_length)) if token_type_ids is not None else None
+        flat_position_ids = tf.reshape(position_ids, (-1, seq_length)) if position_ids is not None else None
+        flat_inputs_embeds = (
+            tf.reshape(inputs_embeds, (-1, seq_length, shape_list(inputs_embeds)[3]))
+            if inputs_embeds is not None
+            else None
+        )
+        outputs = self.electra(
+            input_ids=flat_input_ids,
+            attention_mask=flat_attention_mask,
+            token_type_ids=flat_token_type_ids,
+            position_ids=flat_position_ids,
+            head_mask=head_mask,
+            inputs_embeds=flat_inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+        logits = self.sequence_summary(outputs[0])
+        logits = self.classifier(logits)  # Dense(1): one score per flattened row
+        reshaped_logits = tf.reshape(logits, (-1, num_choices))  # regroup scores into rows of `num_choices`
+        loss = None if labels is None else self.hf_compute_loss(labels, reshaped_logits)
+
+        if not return_dict:
+            output = (reshaped_logits,) + outputs[1:]
+
+            return ((loss,) + output) if loss is not None else output  # loss is prepended only when computed
+
+        return TFMultipleChoiceModelOutput(
+            loss=loss,
+            logits=reshaped_logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+    def build(self, input_shape=None):
+        if self.built:  # already built: nothing to do
+            return
+        self.built = True  # flag is set before any sublayer is built
+        if getattr(self, "electra", None) is not None:
+            with tf.name_scope(self.electra.name):  # each sublayer is built under its own name scope
+                self.electra.build(None)
+        if getattr(self, "sequence_summary", None) is not None:
+            with tf.name_scope(self.sequence_summary.name):
+                self.sequence_summary.build(None)
+        if getattr(self, "classifier", None) is not None:
+            with tf.name_scope(self.classifier.name):
+                self.classifier.build([None, None, self.config.hidden_size])
+
+
+@add_start_docstrings(
+    """
+    Electra model with a token classification head on top.
+
+    Both the discriminator and generator may be loaded into this model.
+    """,
+    ELECTRA_START_DOCSTRING,
+)
+class TFElectraForTokenClassification(TFElectraPreTrainedModel, TFTokenClassificationLoss):
+    def __init__(self, config, **kwargs):
+        super().__init__(config, **kwargs)
+
+        self.electra = TFElectraMainLayer(config, name="electra")
+        # Use `classifier_dropout` when it is set, otherwise fall back to `hidden_dropout_prob`.
+        classifier_dropout = (
+            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
+        )
+        self.dropout = keras.layers.Dropout(classifier_dropout)
+        self.classifier = keras.layers.Dense(
+            config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
+        )
+        self.config = config  # kept so `build` can read `hidden_size`
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(ELECTRA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_code_sample_docstrings(
+        checkpoint="bhadresh-savani/electra-base-discriminator-finetuned-conll03-english",
+        output_type=TFTokenClassifierOutput,
+        config_class=_CONFIG_FOR_DOC,
+        expected_output="['B-LOC', 'B-ORG', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'B-LOC', 'I-LOC']",
+        expected_loss=0.11,
+    )
+    def call(
+        self,
+        input_ids: TFModelInputType | None = None,
+        attention_mask: np.ndarray | tf.Tensor | None = None,
+        token_type_ids: np.ndarray | tf.Tensor | None = None,
+        position_ids: np.ndarray | tf.Tensor | None = None,
+        head_mask: np.ndarray | tf.Tensor | None = None,
+        inputs_embeds: np.ndarray | tf.Tensor | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        labels: np.ndarray | tf.Tensor | None = None,
+        training: bool | None = False,
+    ) -> TFTokenClassifierOutput | tuple[tf.Tensor]:
+        r"""
+        labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
+        """
+        discriminator_hidden_states = self.electra(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+        discriminator_sequence_output = discriminator_hidden_states[0]  # element 0 feeds dropout + classifier
+        discriminator_sequence_output = self.dropout(discriminator_sequence_output)
+        logits = self.classifier(discriminator_sequence_output)  # Dense layer with `config.num_labels` units
+        loss = None if labels is None else self.hf_compute_loss(labels, logits)
+
+        if not return_dict:
+            output = (logits,) + discriminator_hidden_states[1:]
+
+            return ((loss,) + output) if loss is not None else output  # loss is prepended only when computed
+
+        return TFTokenClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=discriminator_hidden_states.hidden_states,
+            attentions=discriminator_hidden_states.attentions,
+        )
+
+    def build(self, input_shape=None):
+        if self.built:  # already built: nothing to do
+            return
+        self.built = True  # flag is set before any sublayer is built
+        if getattr(self, "electra", None) is not None:
+            with tf.name_scope(self.electra.name):  # each sublayer is built under its own name scope
+                self.electra.build(None)
+        if getattr(self, "classifier", None) is not None:
+            with tf.name_scope(self.classifier.name):
+                self.classifier.build([None, None, self.config.hidden_size])
+
+
+@add_start_docstrings(
+    """
+    Electra Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
+    layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
+    """,
+    ELECTRA_START_DOCSTRING,
+)
+class TFElectraForQuestionAnswering(TFElectraPreTrainedModel, TFQuestionAnsweringLoss):
+    def __init__(self, config, *inputs, **kwargs):
+        super().__init__(config, *inputs, **kwargs)
+
+        self.num_labels = config.num_labels
+        self.electra = TFElectraMainLayer(config, name="electra")
+        # Dense head applied to every hidden state; its output is later split into start and end logits.
+        kernel_init = get_initializer(config.initializer_range)
+        self.qa_outputs = keras.layers.Dense(config.num_labels, kernel_initializer=kernel_init, name="qa_outputs")
+        self.config = config
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(ELECTRA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_code_sample_docstrings(
+        checkpoint="bhadresh-savani/electra-base-squad2",
+        output_type=TFQuestionAnsweringModelOutput,
+        config_class=_CONFIG_FOR_DOC,
+        qa_target_start_index=11,
+        qa_target_end_index=12,
+        expected_output="'a nice puppet'",
+        expected_loss=2.64,
+    )
+    def call(
+        self,
+        input_ids: TFModelInputType | None = None,
+        attention_mask: np.ndarray | tf.Tensor | None = None,
+        token_type_ids: np.ndarray | tf.Tensor | None = None,
+        position_ids: np.ndarray | tf.Tensor | None = None,
+        head_mask: np.ndarray | tf.Tensor | None = None,
+        inputs_embeds: np.ndarray | tf.Tensor | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        start_positions: np.ndarray | tf.Tensor | None = None,
+        end_positions: np.ndarray | tf.Tensor | None = None,
+        training: bool | None = False,
+    ) -> TFQuestionAnsweringModelOutput | tuple[tf.Tensor]:
+        r"""
+        start_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*):
+            Labels for position (index) of the start of the labelled span for computing the token classification loss.
+            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
+            are not taken into account for computing the loss.
+        end_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*):
+            Labels for position (index) of the end of the labelled span for computing the token classification loss.
+            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
+            are not taken into account for computing the loss.
+        """
+        encoder_outputs = self.electra(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+        # Project every hidden state to `config.num_labels` values, then cut the last axis into two halves.
+        span_logits = self.qa_outputs(encoder_outputs[0])
+        # Squeezing each half drops its trailing axis. NOTE(review): `tf.squeeze(..., axis=-1)` presumably needs
+        # that axis to have size 1, which would mean `config.num_labels == 2` is required here - confirm.
+        start_logits, end_logits = [tf.squeeze(half, axis=-1) for half in tf.split(span_logits, 2, axis=-1)]
+
+        loss = None
+        if start_positions is not None and end_positions is not None:
+            # The loss is only computed when both ends of the labelled span are supplied.
+            labels = {"start_position": start_positions, "end_position": end_positions}
+            loss = self.hf_compute_loss(labels, (start_logits, end_logits))
+
+        # `return_dict` selects between the structured output object and a plain tuple.
+        if return_dict:
+            return TFQuestionAnsweringModelOutput(
+                loss=loss,
+                start_logits=start_logits,
+                end_logits=end_logits,
+                hidden_states=encoder_outputs.hidden_states,
+                attentions=encoder_outputs.attentions,
+            )
+
+        # Tuple form: both logits, then everything the encoder returned after its first element; loss first if set.
+        tuple_output = (start_logits, end_logits) + encoder_outputs[1:]
+        return tuple_output if loss is None else (loss,) + tuple_output
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        # Flag the model as built first, then build each sub-layer under its own name scope.
+        self.built = True
+        if (encoder := getattr(self, "electra", None)) is not None:
+            with tf.name_scope(encoder.name):
+                encoder.build(None)
+        if (qa_head := getattr(self, "qa_outputs", None)) is not None:
+            with tf.name_scope(qa_head.name):
+                # The head is built for inputs whose last dimension is `config.hidden_size`.
+                qa_head.build([None, None, self.config.hidden_size])
+
+
+__all__ = [  # names exported by `from <this module> import *`
+    "TFElectraForMaskedLM",
+    "TFElectraForMultipleChoice",
+    "TFElectraForPreTraining",
+    "TFElectraForQuestionAnswering",
+    "TFElectraForSequenceClassification",
+    "TFElectraForTokenClassification",
+    "TFElectraModel",
+    "TFElectraPreTrainedModel",
+]
diff --git a/phivenv/Lib/site-packages/transformers/models/electra/tokenization_electra.py b/phivenv/Lib/site-packages/transformers/models/electra/tokenization_electra.py
new file mode 100644
index 0000000000000000000000000000000000000000..d8971dd6f40374e1d8a6e8ec479cd9da79b64da3
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/electra/tokenization_electra.py
@@ -0,0 +1,482 @@
+# coding=utf-8
+# Copyright 2020 The Google AI Team, Stanford University and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import collections
+import os
+import unicodedata
+from typing import Optional
+
+from ...tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)  # module-level logger; `ElectraTokenizer.save_vocabulary` warns through it
+
+VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}  # file name `save_vocabulary` uses when given a directory
+
+
+# Copied from transformers.models.bert.tokenization_bert.load_vocab
+def load_vocab(vocab_file):
+    """Read a vocabulary file into an ordered mapping from token to line index.
+
+    Each line contributes one token (trailing newlines removed); the first line maps to index 0.
+    """
+    with open(vocab_file, "r", encoding="utf-8") as handle:
+        lines = handle.readlines()
+    # A token that occurs on several lines ends up mapped to the last of them.
+    return collections.OrderedDict((line.rstrip("\n"), position) for position, line in enumerate(lines))
+
+
+# Copied from transformers.models.bert.tokenization_bert.whitespace_tokenize
+def whitespace_tokenize(text):
+    """Split `text` on runs of whitespace and return the pieces as a list.
+
+    Leading and trailing whitespace is ignored; empty or all-whitespace text yields `[]`.
+    """
+    # `str.split()` with no separator already returns [] for an empty string.
+    return text.strip().split()
+
+
+# Copied from transformers.models.bert.tokenization_bert.BertTokenizer with Bert->Electra,BERT->Electra
+class ElectraTokenizer(PreTrainedTokenizer):
+    r"""
+    Construct an Electra tokenizer. Based on WordPiece.
+
+    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
+    this superclass for more information regarding those methods.
+
+    Args:
+        vocab_file (`str`):
+            File containing the vocabulary.
+        do_lower_case (`bool`, *optional*, defaults to `True`):
+            Whether or not to lowercase the input when tokenizing.
+        do_basic_tokenize (`bool`, *optional*, defaults to `True`):
+            Whether or not to do basic tokenization before WordPiece.
+        never_split (`Iterable`, *optional*):
+            Collection of tokens which will never be split during tokenization. Only has an effect when
+            `do_basic_tokenize=True`
+        unk_token (`str`, *optional*, defaults to `"[UNK]"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        sep_token (`str`, *optional*, defaults to `"[SEP]"`):
+            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
+            sequence classification or for a text and a question for question answering. It is also used as the last
+            token of a sequence built with special tokens.
+        pad_token (`str`, *optional*, defaults to `"[PAD]"`):
+            The token used for padding, for example when batching sequences of different lengths.
+        cls_token (`str`, *optional*, defaults to `"[CLS]"`):
+            The classifier token which is used when doing sequence classification (classification of the whole sequence
+            instead of per-token classification). It is the first token of the sequence when built with special tokens.
+        mask_token (`str`, *optional*, defaults to `"[MASK]"`):
+            The token used for masking values. This is the token used when training this model with masked language
+            modeling. This is the token which the model will try to predict.
+        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
+            Whether or not to tokenize Chinese characters.
+
+            This should likely be deactivated for Japanese (see this
+            [issue](https://github.com/huggingface/transformers/issues/328)).
+        strip_accents (`bool`, *optional*):
+            Whether or not to strip all accents. If this option is not specified, then it will be determined by the
+            value for `lowercase` (as in the original Electra).
+        clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):
+            Whether or not to cleanup spaces after decoding, cleanup consists in removing potential artifacts like
+            extra spaces.
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+
+    def __init__(
+        self,
+        vocab_file,
+        do_lower_case=True,
+        do_basic_tokenize=True,
+        never_split=None,
+        unk_token="[UNK]",
+        sep_token="[SEP]",
+        pad_token="[PAD]",
+        cls_token="[CLS]",
+        mask_token="[MASK]",
+        tokenize_chinese_chars=True,
+        strip_accents=None,
+        clean_up_tokenization_spaces=True,
+        **kwargs,
+    ):
+        if not os.path.isfile(vocab_file):
+            raise ValueError(
+                f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
+                " model use `tokenizer = ElectraTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
+            )
+        self.vocab = load_vocab(vocab_file)
+        # Reverse lookup (id -> token), in the same order as the vocabulary itself.
+        self.ids_to_tokens = collections.OrderedDict((index, token) for token, index in self.vocab.items())
+        self.do_basic_tokenize = do_basic_tokenize
+        if do_basic_tokenize:
+            self.basic_tokenizer = BasicTokenizer(
+                do_lower_case=do_lower_case,
+                never_split=never_split,
+                tokenize_chinese_chars=tokenize_chinese_chars,
+                strip_accents=strip_accents,
+            )
+
+        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token))
+
+        super().__init__(
+            do_lower_case=do_lower_case,
+            do_basic_tokenize=do_basic_tokenize,
+            never_split=never_split,
+            unk_token=unk_token,
+            sep_token=sep_token,
+            pad_token=pad_token,
+            cls_token=cls_token,
+            mask_token=mask_token,
+            tokenize_chinese_chars=tokenize_chinese_chars,
+            strip_accents=strip_accents,
+            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+            **kwargs,
+        )
+
+    @property
+    def do_lower_case(self):
+        """Lowercasing flag of the basic tokenizer (which only exists when `do_basic_tokenize` is set)."""
+        return self.basic_tokenizer.do_lower_case
+
+    @property
+    def vocab_size(self):
+        """Number of entries in the base vocabulary loaded from `vocab_file`."""
+        return len(self.vocab)
+
+    def get_vocab(self):
+        """Return a new dict combining the base vocabulary with `self.added_tokens_encoder`."""
+        return dict(self.vocab, **self.added_tokens_encoder)
+
+    def _tokenize(self, text, split_special_tokens=False):
+        """Tokenize `text` into WordPiece tokens, running the basic tokenizer first when it is enabled."""
+        if not self.do_basic_tokenize:
+            return self.wordpiece_tokenizer.tokenize(text)
+        protected = None if split_special_tokens else self.all_special_tokens
+        pieces = []
+        for token in self.basic_tokenizer.tokenize(text, never_split=protected):
+            # Tokens in the basic tokenizer's own never_split set bypass WordPiece entirely.
+            if token in self.basic_tokenizer.never_split:
+                pieces.append(token)
+            else:
+                pieces.extend(self.wordpiece_tokenizer.tokenize(token))
+        return pieces
+
+    def _convert_token_to_id(self, token):
+        """Converts a token (str) to an id using the vocab, falling back to the id of the unknown token."""
+        return self.vocab.get(token, self.vocab.get(self.unk_token))
+
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) to a token (str) using the vocab, falling back to the unknown token."""
+        return self.ids_to_tokens.get(index, self.unk_token)
+
+    def convert_tokens_to_string(self, tokens):
+        """Joins tokens with spaces, then glues each "##" continuation piece onto the token before it."""
+        return " ".join(tokens).replace(" ##", "").strip()
+
+    def build_inputs_with_special_tokens(
+        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
+    ) -> list[int]:
+        """
+        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
+        adding special tokens. An Electra sequence has the following format:
+
+        - single sequence: `[CLS] X [SEP]`
+        - pair of sequences: `[CLS] A [SEP] B [SEP]`
+
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs to which the special tokens will be added.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
+        """
+        cls, sep = [self.cls_token_id], [self.sep_token_id]
+        single = cls + token_ids_0 + sep
+        return single if token_ids_1 is None else single + token_ids_1 + sep
+
+    def get_special_tokens_mask(
+        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False
+    ) -> list[int]:
+        """
+        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
+        special tokens using the tokenizer `prepare_for_model` method.
+
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
+                Whether or not the token list is already formatted with special tokens for the model.
+
+        Returns:
+            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+        """
+        if already_has_special_tokens:
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+            )
+
+        if token_ids_1 is None:
+            return [1] + [0] * len(token_ids_0) + [1]
+        return [1] + [0] * len(token_ids_0) + [1] + [0] * len(token_ids_1) + [1]
+
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
+        """Write the vocabulary, one token per line in index order, and return the path as a 1-tuple."""
+        prefix = filename_prefix + "-" if filename_prefix else ""
+        if os.path.isdir(save_directory):
+            vocab_file = os.path.join(save_directory, prefix + VOCAB_FILES_NAMES["vocab_file"])
+        else:
+            # Not an existing directory: the argument itself is used as the file path (after the prefix).
+            vocab_file = prefix + save_directory
+        expected_index = 0
+        with open(vocab_file, "w", encoding="utf-8") as writer:
+            for token, token_index in sorted(self.vocab.items(), key=lambda item: item[1]):
+                if token_index != expected_index:
+                    logger.warning(
+                        f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive."
+                        " Please check that the vocabulary is not corrupted!"
+                    )
+                writer.write(token + "\n")
+                # Resynchronise on the index just written so that one gap produces a single warning.
+                expected_index = token_index + 1
+        return (vocab_file,)
+
+
+# Copied from transformers.models.bert.tokenization_bert.BasicTokenizer
+class BasicTokenizer:
+    """
+    Constructs a BasicTokenizer that runs the rule-based steps preceding WordPiece: text cleanup, whitespace
+    splitting, optional lowercasing and accent stripping, and punctuation splitting.
+
+    Args:
+        do_lower_case (`bool`, *optional*, defaults to `True`):
+            Whether or not to lowercase the input when tokenizing.
+        never_split (`Iterable`, *optional*):
+            Collection of tokens which will never be split during tokenization. Only has an effect when
+            `do_basic_tokenize=True`
+        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
+            Whether or not to tokenize Chinese characters.
+
+            This should likely be deactivated for Japanese (see this
+            [issue](https://github.com/huggingface/transformers/issues/328)).
+        strip_accents (`bool`, *optional*):
+            Whether or not to strip all accents. When left as `None`, accents are stripped exactly when
+            `do_lower_case` is enabled.
+        do_split_on_punc (`bool`, *optional*, defaults to `True`):
+            In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
+            the full context of the words, such as contractions.
+    """
+
+    def __init__(
+        self,
+        do_lower_case=True,
+        never_split=None,
+        tokenize_chinese_chars=True,
+        strip_accents=None,
+        do_split_on_punc=True,
+    ):
+        self.do_lower_case = do_lower_case
+        # Kept as a set; `tokenize` unions it with any tokens passed per call.
+        self.never_split = set() if never_split is None else set(never_split)
+        self.tokenize_chinese_chars = tokenize_chinese_chars
+        self.strip_accents = strip_accents
+        self.do_split_on_punc = do_split_on_punc
+
+    def tokenize(self, text, never_split=None):
+        """
+        Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.
+
+        Args:
+            text (`str`):
+                The text to split.
+            never_split (`List[str]`, *optional*)
+                Kept for backward compatibility purposes. Now implemented directly at the base class level (see
+                [`PreTrainedTokenizer.tokenize`]) List of token not to split.
+
+        Returns:
+            `List[str]`: The tokens, in order of appearance.
+        """
+        # Per-call tokens extend, rather than replace, the set given at construction time.
+        protected = self.never_split.union(never_split) if never_split else self.never_split
+        cleaned = self._clean_text(text)
+
+        # This was added on November 1st, 2018 for the multilingual and Chinese
+        # models. This is also applied to the English models now, but it doesn't
+        # matter since the English models were not trained on any Chinese data
+        # and generally don't have any Chinese data in them (there are Chinese
+        # characters in the vocabulary because Wikipedia does have some Chinese
+        # words in the English Wikipedia.).
+        if self.tokenize_chinese_chars:
+            cleaned = self._tokenize_chinese_chars(cleaned)
+        # NFC keeps one character written with different code point sequences from yielding different tokens.
+        words = whitespace_tokenize(unicodedata.normalize("NFC", cleaned))
+
+        split_tokens = []
+        for word in words:
+            if word not in protected:
+                if self.do_lower_case:
+                    word = word.lower()
+                    # When lowercasing, accents are stripped unless `strip_accents` is explicitly False.
+                    should_strip = self.strip_accents is not False
+                else:
+                    should_strip = bool(self.strip_accents)
+                if should_strip:
+                    word = self._run_strip_accents(word)
+            split_tokens.extend(self._run_split_on_punc(word, protected))
+
+        # Joining and re-splitting on whitespace also discards any empty pieces.
+        return whitespace_tokenize(" ".join(split_tokens))
+
+    def _run_strip_accents(self, text):
+        """Strips accents from a piece of text.
+
+        The text is NFD-decomposed and every character of Unicode category "Mn" is dropped.
+        """
+        decomposed = unicodedata.normalize("NFD", text)
+        return "".join(char for char in decomposed if unicodedata.category(char) != "Mn")
+
+    def _run_split_on_punc(self, text, never_split=None):
+        """Splits punctuation on a piece of text.
+
+        Every punctuation character becomes a token of its own and each run of other characters between them
+        forms one token. `text` is returned whole when punctuation splitting is disabled or when it is listed
+        in `never_split`.
+        """
+        if not self.do_split_on_punc or (never_split is not None and text in never_split):
+            return [text]
+        groups = []
+        in_word = False
+        for char in text:
+            if _is_punctuation(char):
+                groups.append([char])
+                in_word = False
+            else:
+                if not in_word:
+                    # First non-punctuation character after a boundary opens a new group.
+                    groups.append([])
+                    in_word = True
+                groups[-1].append(char)
+        return ["".join(group) for group in groups]
+
+    def _tokenize_chinese_chars(self, text):
+        """Adds whitespace around any CJK character.
+
+        Surrounding each such character with spaces makes the later whitespace split isolate it as a token.
+        """
+        return "".join(f" {char} " if self._is_chinese_char(ord(char)) else char for char in text)
+
+    def _is_chinese_char(self, cp):
+        """Checks whether CP is the codepoint of a CJK character.
+
+        `cp` is an integer code point (as returned by `ord`); the result is a `bool`.
+        """
+        # This defines a "chinese character" as anything in the CJK Unicode block:
+        #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
+        #
+        # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
+        # despite its name. The modern Korean Hangul alphabet is a different block,
+        # as is Japanese Hiragana and Katakana. Those alphabets are used to write
+        # space-separated words, so they are not treated specially and are handled
+        # like all of the other languages.
+        cjk_ranges = (
+            (0x4E00, 0x9FFF),
+            (0x3400, 0x4DBF),
+            (0x20000, 0x2A6DF),
+            (0x2A700, 0x2B73F),
+            (0x2B740, 0x2B81F),
+            (0x2B820, 0x2CEAF),
+            (0xF900, 0xFAFF),
+            (0x2F800, 0x2FA1F),
+        )
+        return any(low <= cp <= high for low, high in cjk_ranges)
+
+    def _clean_text(self, text):
+        """Performs invalid character removal and whitespace cleanup on text.
+
+        NUL (U+0000), the replacement character (U+FFFD) and anything `_is_control` flags are removed; each
+        character `_is_whitespace` flags is replaced by a single space.
+        """
+        kept = []
+        for char in text:
+            if ord(char) in (0, 0xFFFD) or _is_control(char):
+                continue
+            kept.append(" " if _is_whitespace(char) else char)
+        return "".join(kept)
+
+
+# Copied from transformers.models.bert.tokenization_bert.WordpieceTokenizer
+class WordpieceTokenizer:
+    """Runs WordPiece tokenization."""
+
+    def __init__(self, vocab, unk_token, max_input_chars_per_word=100):
+        # `vocab` is only used for membership tests (`piece in vocab`) by this class.
+        self.vocab = vocab
+        self.unk_token = unk_token
+        # Words longer than this many characters are emitted as `unk_token` without being split.
+        self.max_input_chars_per_word = max_input_chars_per_word
+
+    def tokenize(self, text):
+        """
+        Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform
+        tokenization using the given vocabulary.
+
+        For example, `input = "unaffable"` will return as output `["un", "##aff", "##able"]`.
+
+        Args:
+            text: A single token or whitespace separated tokens. This should have
+                already been passed through *BasicTokenizer*.
+
+        Returns:
+            A list of wordpiece tokens.
+        """
+
+        output_tokens = []
+        for word in whitespace_tokenize(text):
+            # Over-long words are not searched for pieces at all.
+            if len(word) > self.max_input_chars_per_word:
+                output_tokens.append(self.unk_token)
+                continue
+            pieces = self._split_word(word)
+            if pieces is None:
+                # One unmatched remainder turns the whole word into a single unknown token.
+                output_tokens.append(self.unk_token)
+            else:
+                output_tokens.extend(pieces)
+        return output_tokens
+
+    def _split_word(self, word):
+        """Greedily split one word into vocabulary pieces, or return `None` if some remainder has no match."""
+        pieces = []
+        start = 0
+        while start < len(word):
+            # Try the longest remaining substring first and shrink it from the right until it is in the vocab.
+            for end in range(len(word), start, -1):
+                # Pieces that do not start the word carry the "##" continuation prefix.
+                candidate = word[start:end] if start == 0 else "##" + word[start:end]
+                if candidate in self.vocab:
+                    break
+            else:
+                return None
+            pieces.append(candidate)
+            start = end
+        return pieces
+
+
+__all__ = ["ElectraTokenizer"]  # only the tokenizer class is exported by `from <this module> import *`
diff --git a/phivenv/Lib/site-packages/transformers/models/electra/tokenization_electra_fast.py b/phivenv/Lib/site-packages/transformers/models/electra/tokenization_electra_fast.py
new file mode 100644
index 0000000000000000000000000000000000000000..db0285581ed1eea5b903a3bed573bbf6408e0167
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/electra/tokenization_electra_fast.py
@@ -0,0 +1,143 @@
+# coding=utf-8
+# Copyright 2020 The Google AI Team, Stanford University and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+from typing import Optional
+
+from tokenizers import normalizers
+
+from ...tokenization_utils_fast import PreTrainedTokenizerFast
+from .tokenization_electra import ElectraTokenizer
+
+
+VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"}
+
+
+# Copied from transformers.models.bert.tokenization_bert_fast.BertTokenizerFast with Bert->Electra , BERT->ELECTRA
+class ElectraTokenizerFast(PreTrainedTokenizerFast):
+    r"""
+    Construct a "fast" ELECTRA tokenizer (backed by HuggingFace's *tokenizers* library). Based on WordPiece.
+
+    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
+    refer to this superclass for more information regarding those methods.
+
+    Args:
+        vocab_file (`str`):
+            File containing the vocabulary.
+        do_lower_case (`bool`, *optional*, defaults to `True`):
+            Whether or not to lowercase the input when tokenizing.
+        unk_token (`str`, *optional*, defaults to `"[UNK]"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        sep_token (`str`, *optional*, defaults to `"[SEP]"`):
+            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
+            sequence classification or for a text and a question for question answering. It is also used as the last
+            token of a sequence built with special tokens.
+        pad_token (`str`, *optional*, defaults to `"[PAD]"`):
+            The token used for padding, for example when batching sequences of different lengths.
+        cls_token (`str`, *optional*, defaults to `"[CLS]"`):
+            The classifier token which is used when doing sequence classification (classification of the whole sequence
+            instead of per-token classification). It is the first token of the sequence when built with special tokens.
+        mask_token (`str`, *optional*, defaults to `"[MASK]"`):
+            The token used for masking values. This is the token used when training this model with masked language
+            modeling. This is the token which the model will try to predict.
+        clean_text (`bool`, *optional*, defaults to `True`):
+            Whether or not to clean the text before tokenization by removing any control characters and replacing all
+            whitespaces by the classic one.
+        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
+            Whether or not to tokenize Chinese characters. This should likely be deactivated for Japanese (see [this
+            issue](https://github.com/huggingface/transformers/issues/328)).
+        strip_accents (`bool`, *optional*):
+            Whether or not to strip all accents. If this option is not specified, then it will be determined by the
+            value for `lowercase` (as in the original ELECTRA).
+        wordpieces_prefix (`str`, *optional*, defaults to `"##"`):
+            The prefix for subwords.
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    slow_tokenizer_class = ElectraTokenizer
+
+    def __init__(
+        self,
+        vocab_file: Optional[str] = None,
+        tokenizer_file: Optional[str] = None,
+        do_lower_case: bool = True,
+        unk_token: str = "[UNK]",
+        sep_token: str = "[SEP]",
+        pad_token: str = "[PAD]",
+        cls_token: str = "[CLS]",
+        mask_token: str = "[MASK]",
+        tokenize_chinese_chars: bool = True,
+        strip_accents: Optional[bool] = None,
+        **kwargs,
+    ):
+        super().__init__(
+            vocab_file,
+            tokenizer_file=tokenizer_file,
+            do_lower_case=do_lower_case,
+            unk_token=unk_token,
+            sep_token=sep_token,
+            pad_token=pad_token,
+            cls_token=cls_token,
+            mask_token=mask_token,
+            tokenize_chinese_chars=tokenize_chinese_chars,
+            strip_accents=strip_accents,
+            **kwargs,
+        )
+
+        normalizer_state = json.loads(self.backend_tokenizer.normalizer.__getstate__())
+        if (  # rebuild the backend normalizer only when it disagrees with the requested options
+            normalizer_state.get("lowercase", do_lower_case) != do_lower_case
+            or normalizer_state.get("strip_accents", strip_accents) != strip_accents
+            or normalizer_state.get("handle_chinese_chars", tokenize_chinese_chars) != tokenize_chinese_chars
+        ):  # a key missing from the serialized state counts as "already matching"
+            normalizer_class = getattr(normalizers, normalizer_state.pop("type"))  # "type" names the class
+            normalizer_state["lowercase"] = do_lower_case
+            normalizer_state["strip_accents"] = strip_accents
+            normalizer_state["handle_chinese_chars"] = tokenize_chinese_chars
+            self.backend_tokenizer.normalizer = normalizer_class(**normalizer_state)  # remaining keys are kwargs
+
+        self.do_lower_case = do_lower_case
+
+    def build_inputs_with_special_tokens(self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None):
+        """
+        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating
+        and adding special tokens. An ELECTRA sequence has the following format:
+
+        - single sequence: `[CLS] X [SEP]`
+        - pair of sequences: `[CLS] A [SEP] B [SEP]`
+
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs to which the special tokens will be added.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
+        """
+        output = [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
+
+        if token_ids_1 is not None:
+            output += token_ids_1 + [self.sep_token_id]
+
+        return output
+
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
+        files = self._tokenizer.model.save(save_directory, name=filename_prefix)  # saving is done by the backend model
+        return tuple(files)  # whatever the backend returned, as a tuple
+
+
+__all__ = ["ElectraTokenizerFast"]
diff --git a/phivenv/Lib/site-packages/transformers/models/emu3/__init__.py b/phivenv/Lib/site-packages/transformers/models/emu3/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..d8555f58d1866451c38abb5559ef5bef9545f0b0
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/emu3/__init__.py
@@ -0,0 +1,29 @@
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:  # False at runtime: only static type checkers follow these star imports
+    from .configuration_emu3 import *
+    from .image_processing_emu3 import *
+    from .modeling_emu3 import *
+    from .processing_emu3 import *
+else:  # at runtime, replace this module's entry in sys.modules with a _LazyModule
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/phivenv/Lib/site-packages/transformers/models/emu3/__pycache__/__init__.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/emu3/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..19f134b11de2e7eaa9280e15481734774babd099
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/emu3/__pycache__/__init__.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/emu3/__pycache__/configuration_emu3.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/emu3/__pycache__/configuration_emu3.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..04d343d0289d5ce8f9a56a1628cd37cf7c9fe174
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/emu3/__pycache__/configuration_emu3.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/emu3/__pycache__/image_processing_emu3.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/emu3/__pycache__/image_processing_emu3.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c6d511a2c6446108df3de6bd89bbbd2527b115e9
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/emu3/__pycache__/image_processing_emu3.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/emu3/__pycache__/modeling_emu3.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/emu3/__pycache__/modeling_emu3.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5fde5974570cefc4c3936b59066a3386f0fb40ee
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/emu3/__pycache__/modeling_emu3.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/emu3/__pycache__/modular_emu3.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/emu3/__pycache__/modular_emu3.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0b8449acac31821ac4a086b98793b7cd533adf55
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/emu3/__pycache__/modular_emu3.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/emu3/__pycache__/processing_emu3.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/emu3/__pycache__/processing_emu3.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..22f84baaef9fab6300497bcb3b0f628b4a2bb29e
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/emu3/__pycache__/processing_emu3.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/emu3/configuration_emu3.py b/phivenv/Lib/site-packages/transformers/models/emu3/configuration_emu3.py
new file mode 100644
index 0000000000000000000000000000000000000000..35c83937c8b5adbce69f70d74233bceff5d8d9b9
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/emu3/configuration_emu3.py
@@ -0,0 +1,328 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc. team. All rights reserved.
+#
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Optional, Union
+
+from ...configuration_utils import PretrainedConfig
+from ...modeling_rope_utils import rope_config_validation
+
+
+class Emu3VQVAEConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Emu3VQVAE`]. It is used to instantiate a VQ-VAE
+    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+    defaults will yield a configuration similar to that of the VQ model presented in the Emu3 paper.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+    Args:
+        codebook_size (`int`, *optional*, defaults to 32768):
+            Codebook size of the VQ model.
+        embed_dim (`int`, *optional*, defaults to 4):
+            Dimension of the quantized vector in codebook.
+        latent_channels (`int`, *optional*, defaults to 4):
+            Dimension of the output channel of encoder and the input channel of decoder
+        double_latent (`bool`, *optional*, defaults to `False`):
+            Whether to double the output dimension of the encoder.
+        in_channels (`int`, *optional*, defaults to 3):
+            Input channel of encoder.
+        out_channels (`int`, *optional*, defaults to 3):
+            Output channel of decoder.
+        temporal_downsample_factor (`int`, *optional*, defaults to 4):
+            Temporal downsample factor.
+        base_channels (`int`, *optional*, defaults to 256):
+            Basic channel number of the intermediate blocks.
+        channel_multiplier (`list[int]`, *optional*, defaults to `[1, 2, 2, 4]`):
+            Channel scaling factor of the intermediate blocks.
+        num_res_blocks (`int`, *optional*, defaults to 2):
+            Residual block number in each stage.
+        attn_resolutions (`list[int]`, *optional*, defaults to `[3]`):
+            Stage indices to apply attention.
+        hidden_size (`int`, *optional*, defaults to 1024):
+            Dimension of the hidden representations in the attention layer.
+        num_attention_heads (`int`, *optional*, defaults to 1):
+            Number of attention heads for each attention layer.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+
+    ```python
+    >>> from transformers import Emu3VQVAE, Emu3VQVAEConfig
+
+    >>> # Initializing a video VQ model of Emu3 configuration
+    >>> configuration = Emu3VQVAEConfig()
+
+    >>> # Initializing a model from the Emu3 VQ model style configuration
+    >>> model = Emu3VQVAE(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "emu3_vqgan"
+    base_config_key = "vq_config"
+
+    def __init__(
+        self,
+        codebook_size: int = 32768,
+        embed_dim: int = 4,
+        latent_channels: int = 4,
+        double_latent: bool = False,
+        in_channels: int = 3,
+        out_channels: int = 3,
+        temporal_downsample_factor: int = 4,
+        base_channels: int = 256,
+        channel_multiplier: list[int] = [1, 2, 2, 4],
+        num_res_blocks: int = 2,
+        attn_resolutions: list[int] = [3],
+        hidden_size: int = 1024,
+        num_attention_heads: int = 1,
+        attention_dropout: float = 0.0,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        self.codebook_size = codebook_size
+        self.embed_dim = embed_dim
+        self.latent_channels = latent_channels
+        self.double_latent = double_latent
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.temporal_downsample_factor = temporal_downsample_factor
+        self.base_channels = base_channels
+        self.channel_multiplier = channel_multiplier
+        self.num_res_blocks = num_res_blocks
+        self.attn_resolutions = attn_resolutions
+        self.hidden_size = hidden_size
+        self.num_attention_heads = num_attention_heads
+        self.attention_dropout = attention_dropout
+
+
+class Emu3TextConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Emu3TextModel`]. It is used to instantiate an
+    Emu3 model according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of the
+    [Emu3-community/Emu3-Chat-hf](https://huggingface.co/Emu3-community/Emu3-Chat-hf).
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 184622):
+            Vocabulary size of the Emu3 model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`Emu3Model`]
+        hidden_size (`int`, *optional*, defaults to 4096):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 14336):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 32):
+            Number of hidden layers in the Transformer decoder.
+        num_attention_heads (`int`, *optional*, defaults to 32):
+            Number of attention heads for each attention layer in the Transformer decoder.
+        num_key_value_heads (`int`, *optional*, defaults to 8):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1`, the model will use Multi Query Attention (MQA); otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details, check out [this
+            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
+            `num_attention_heads`.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the decoder.
+        max_position_embeddings (`int`, *optional*, defaults to 9216):
+            The maximum sequence length that this model might ever be used with. Emu supports up to 9216 tokens.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        pad_token_id (`int`, *optional*, defaults to 151643):
+            Padding token id.
+        bos_token_id (`int`, *optional*, defaults to 151849):
+            Beginning of stream token id.
+        eos_token_id (`int`, *optional*, defaults to 151850):
+            End of stream token id.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether to tie weight embeddings
+        rope_theta (`float`, *optional*, defaults to 1000000.0):
+            The base period of the RoPE embeddings.
+        rope_scaling (`Dict`, *optional*):
+            Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
+            and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
+            accordingly.
+            Expected contents:
+                `rope_type` (`str`):
+                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
+                    'llama3'], with 'default' being the original RoPE implementation.
+                `factor` (`float`, *optional*):
+                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
+                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
+                    original maximum pre-trained length.
+                `original_max_position_embeddings` (`int`, *optional*):
+                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
+                    pretraining.
+                `attention_factor` (`float`, *optional*):
+                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
+                    computation. If unspecified, it defaults to value recommended by the implementation, using the
+                    `factor` field to infer the suggested value.
+                `beta_fast` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 32.
+                `beta_slow` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 1.
+                `short_factor` (`list[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `long_factor` (`list[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to long contexts (>
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `low_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
+                `high_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
+        mlp_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use a bias in up_proj, down_proj and gate_proj layers in the MLP layers.
+        attention_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use a bias in the query, key, value and output projection layers during self-attention.
+        attention_dropout (`float`, *optional*, defaults to 0.1):
+            The dropout ratio for the attention probabilities.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+
+
+    ```python
+    >>> from transformers import Emu3Model, Emu3Config
+
+    >>> # Initializing a Emu3-community/Emu3-Chat-hf style configuration
+    >>> configuration = Emu3Config()
+
+    >>> # Initializing a model from the Emu3-community/Emu3-Chat-hf style configuration
+    >>> model = Emu3Model(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "emu3_text_model"
+    base_config_key = "text_config"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        vocab_size: int = 184622,
+        hidden_size: int = 4096,
+        intermediate_size: int = 14336,
+        num_hidden_layers: int = 32,
+        num_attention_heads: int = 32,
+        num_key_value_heads: Optional[int] = 8,
+        hidden_act: str = "silu",
+        max_position_embeddings: int = 9216,
+        rms_norm_eps: float = 1e-5,
+        use_cache: bool = True,
+        pad_token_id: int = 151643,
+        bos_token_id: int = 151849,
+        eos_token_id: int = 151850,
+        tie_word_embeddings: bool = False,
+        rope_theta: float = 1000000.0,
+        rope_scaling: Optional[dict] = None,
+        mlp_bias: bool = False,
+        attention_bias: bool = False,
+        attention_dropout: float = 0.1,
+        initializer_range: float = 0.02,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self.mlp_bias = mlp_bias
+        self.attention_bias = attention_bias
+        self.initializer_range = initializer_range
+        rope_config_validation(self)  # NOTE(review): presumably validates rope_scaling set above — confirm
+
+        self.attention_dropout = attention_dropout
+
+        super().__init__(  # token ids and weight tying are handed to the base config
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+
+class Emu3Config(PretrainedConfig):
+    """
+    This is the configuration class to store the configuration of a [`Emu3Model`]. It is used to instantiate an
+    Emu3 model according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of the
+    [Emu3-community/Emu3-Chat-hf](https://huggingface.co/Emu3-community/Emu3-Chat-hf).
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vq_config (`Union[Dict, Emu3VQVAEConfig]`, *optional*):
+            Emu3VQVAEConfig instance containing the configuration for the VQ-VAE model.
+        text_config (`Union[Dict, Emu3TextConfig]`, *optional*):
+            Emu3TextConfig instance containing the configuration for the language model.
+        vocabulary_map (`dict`, *optional*):
+            A dictionary containing the vocabulary map from the tokenizer. Used to obtain tokens from the image inputs.
+    """
+
+    model_type = "emu3"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    sub_configs = {"text_config": Emu3TextConfig, "vq_config": Emu3VQVAEConfig}
+
+    def __init__(
+        self,
+        vq_config: Optional[Union[dict, Emu3VQVAEConfig]] = None,
+        text_config: Optional[Union[dict, Emu3TextConfig]] = None,
+        vocabulary_map: Optional[dict[str, int]] = None,
+        **kwargs,
+    ):
+        if vq_config is None:  # nothing given: use the default VQ-VAE configuration
+            vq_config = Emu3VQVAEConfig()
+        elif isinstance(vq_config, dict):  # plain dict: build the sub-config from it
+            vq_config = Emu3VQVAEConfig(**vq_config)
+
+        if text_config is None:  # nothing given: use the default text configuration
+            text_config = Emu3TextConfig()
+        elif isinstance(text_config, dict):  # plain dict: build the sub-config from it
+            text_config = Emu3TextConfig(**text_config)
+
+        self.vq_config = vq_config
+        self.text_config = text_config
+        self.vocabulary_map = vocabulary_map
+        self.image_token_id = vocabulary_map.get("<image>") if vocabulary_map is not None else None
+
+        super().__init__(**kwargs)
+
+
+__all__ = ["Emu3Config", "Emu3TextConfig", "Emu3VQVAEConfig"]
diff --git a/phivenv/Lib/site-packages/transformers/models/emu3/image_processing_emu3.py b/phivenv/Lib/site-packages/transformers/models/emu3/image_processing_emu3.py
new file mode 100644
index 0000000000000000000000000000000000000000..5a480351307ab7b346d17d85d6072ee2cd32eb09
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/emu3/image_processing_emu3.py
@@ -0,0 +1,551 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc. team. All rights reserved.
+#
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+from collections.abc import Iterable
+from typing import Optional, Union
+
+import numpy as np
+
+from ...image_processing_utils import BaseImageProcessor, BatchFeature
+from ...image_transforms import convert_to_rgb, pad, resize, to_channel_dimension_format
+from ...image_utils import (
+    OPENAI_CLIP_MEAN,
+    OPENAI_CLIP_STD,
+    ChannelDimension,
+    ImageInput,
+    PILImageResampling,
+    get_image_size,
+    infer_channel_dimension_format,
+    is_scaled_image,
+    is_valid_image,
+    make_list_of_images,
+    to_numpy_array,
+    valid_images,
+    validate_preprocess_arguments,
+)
+from ...utils import TensorType, is_vision_available, logging
+
+
+if is_vision_available():
+    from PIL import Image
+
+logger = logging.get_logger(__name__)
+
+
+def make_batched_images(images) -> list[ImageInput]:
+    """
+    Flattens a single image, a list of images or a nested list of images into one flat list of images.
+    Raises a `ValueError` when `images` is empty, starts with an empty sequence or holds no valid image.
+
+    Args:
+        images (`Union[list[list[ImageInput]], list[ImageInput], ImageInput]`):
+            A single image, a list/tuple of images, or a list/tuple of lists/tuples of images.
+
+    Returns:
+        `list[ImageInput]`: The images as a flat list. A non-nested list/tuple input is returned unchanged.
+    """
+    if isinstance(images, (list, tuple)) and len(images) > 0:
+        first = images[0]
+        if isinstance(first, (list, tuple)) and len(first) > 0 and is_valid_image(first[0]):
+            return [img for img_list in images for img in img_list]
+        if is_valid_image(first):
+            return images
+    elif is_valid_image(images):
+        return [images]
+    raise ValueError(f"Could not make batched images from {images}")
+
+
+def smart_resize(
+    height: int, width: int, factor: int = 28, min_pixels: int = 56 * 56, max_pixels: int = 14 * 14 * 4 * 1280
+):
+    """Computes a target `(height, width)` for resizing an image under three constraints:
+
+    1. Each returned dimension is a multiple of `factor`.
+    2. The returned area is brought towards the range [`min_pixels`, `max_pixels`].
+    3. The aspect ratio stays as close to the input's as the rounding to `factor` allows.
+
+    Raises a `ValueError` when the longer side is more than 200 times the shorter one.
+    """
+    aspect_ratio = max(height, width) / min(height, width)
+    if aspect_ratio > 200:
+        raise ValueError(f"absolute aspect ratio must be smaller than 200, got {aspect_ratio}")
+    new_height, new_width = (round(side / factor) * factor for side in (height, width))
+    area = new_height * new_width
+    if area > max_pixels:
+        # Too large: shrink both sides by the same ratio and round down, never below one `factor`.
+        shrink = math.sqrt((height * width) / max_pixels)
+        new_height = max(factor, math.floor(height / shrink / factor) * factor)
+        new_width = max(factor, math.floor(width / shrink / factor) * factor)
+    elif area < min_pixels:
+        # Too small: grow both sides by the same ratio and round up.
+        grow = math.sqrt(min_pixels / (height * width))
+        new_height = math.ceil(height * grow / factor) * factor
+        new_width = math.ceil(width * grow / factor) * factor
+    return new_height, new_width
+
+
+class Emu3ImageProcessor(BaseImageProcessor):
+    r"""
+    Constructs an Emu3 image processor that dynamically resizes images based on the original images.
+
+    Args:
+        do_resize (`bool`, *optional*, defaults to `True`):
+            Whether to resize the image's (height, width) dimensions.
+        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
+            Resampling filter to use when resizing the image.
+        do_rescale (`bool`, *optional*, defaults to `True`):
+            Whether to rescale the image by the specified scale `rescale_factor`.
+        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
+            Scale factor to use if rescaling the image.
+        do_normalize (`bool`, *optional*, defaults to `True`):
+            Whether to normalize the image.
+        image_mean (`float` or `list[float]`, *optional*, defaults to `[0.48145466, 0.4578275, 0.40821073]`):
+            Mean to use if normalizing the image. This is a float or list of floats for each channel in the image.
+        image_std (`float` or `list[float]`, *optional*, defaults to `[0.26862954, 0.26130258, 0.27577711]`):
+            Standard deviation to use if normalizing the image. This is a float or list of floats for each channel in the image.
+        do_convert_rgb (`bool`, *optional*, defaults to `True`):
+            Whether to convert the image to RGB.
+        do_pad (`bool`, *optional*, defaults to `True`):
+            Whether to pad the images. If `True`, every image in the batch is padded up to the largest height and
+            width found in the batch. Padding will be applied to the bottom and right with zeros.
+        min_pixels (`int`, *optional*, defaults to `512 * 512`):
+            The min pixels of the image to resize the image.
+        max_pixels (`int`, *optional*, defaults to `1024 * 1024`):
+            The max pixels of the image to resize the image.
+        spatial_factor (`int`, *optional*, defaults to 8):
+            The spatial downsample factor of the feature extracting phase. Resized sides are multiples of it.
+    """
+
+    model_input_names = ["pixel_values", "image_sizes"]
+
+    def __init__(
+        self,
+        do_resize: bool = True,
+        resample: PILImageResampling = PILImageResampling.BICUBIC,
+        do_rescale: bool = True,
+        rescale_factor: Union[int, float] = 1 / 255,
+        do_normalize: bool = True,
+        image_mean: Optional[Union[float, list[float]]] = None,
+        image_std: Optional[Union[float, list[float]]] = None,
+        do_convert_rgb: bool = True,
+        do_pad: bool = True,
+        min_pixels: int = 512 * 512,
+        max_pixels: int = 1024 * 1024,
+        spatial_factor: int = 8,
+        **kwargs,
+    ) -> None:
+        super().__init__(**kwargs)
+        self.do_resize = do_resize
+        self.resample = resample
+        self.do_rescale = do_rescale
+        self.rescale_factor = rescale_factor
+        self.do_normalize = do_normalize
+        self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
+        self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
+        self.min_pixels = min_pixels
+        self.max_pixels = max_pixels
+        self.spatial_factor = spatial_factor
+        self.size = {"min_pixels": min_pixels, "max_pixels": max_pixels}
+        self.do_convert_rgb = do_convert_rgb
+        # Keep `do_pad` so that `preprocess` can fall back to it when called without an explicit value.
+        self.do_pad = do_pad
+
+    def _preprocess(
+        self,
+        images: ImageInput,
+        do_resize: Optional[bool] = None,
+        resample: Optional[PILImageResampling] = None,
+        do_rescale: Optional[bool] = None,
+        rescale_factor: Optional[float] = None,
+        do_normalize: Optional[bool] = None,
+        image_mean: Optional[Union[float, list[float]]] = None,
+        image_std: Optional[Union[float, list[float]]] = None,
+        do_convert_rgb: Optional[bool] = None,
+        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    ):
+        """
+        Preprocess an image or batch of images.
+
+        Args:
+            images (`ImageInput`):
+                Image or batch of images to preprocess. Expects pixel values ranging from 0 to 255. If pixel values range from 0 to 1, set `do_rescale=False`.
+            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
+                Whether to resize the image.
+            resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
+                Resampling filter to use if resizing the image. This can be one of the `PILImageResampling` enums.
+            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
+                Whether to rescale the image.
+            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
+                Scale factor to use if rescaling the image.
+            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
+                Whether to normalize the image.
+            image_mean (`float` or `list[float]`, *optional*, defaults to `self.image_mean`):
+                Mean to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image.
+            image_std (`float` or `list[float]`, *optional*, defaults to `self.image_std`):
+                Standard deviation to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image.
+            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
+                Whether to convert the image to RGB.
+            data_format (`ChannelDimension`, *optional*, defaults to `ChannelDimension.FIRST`):
+                The channel dimension format for the output image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - Unset: Use the channel dimension format of the input image.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the input image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+        """
+        images = make_list_of_images(images)
+
+        if do_convert_rgb:
+            images = [convert_to_rgb(image) for image in images]
+
+        # All transformations expect numpy arrays.
+        images = [to_numpy_array(image) for image in images]
+
+        if is_scaled_image(images[0]) and do_rescale:
+            logger.warning_once(
+                "It looks like you are trying to rescale already rescaled images. If the input"
+                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
+            )
+        if input_data_format is None:
+            # We assume that all images have the same channel dimension format.
+            input_data_format = infer_channel_dimension_format(images[0])
+
+        height, width = get_image_size(images[0], channel_dim=input_data_format)
+        resized_height, resized_width = height, width
+        processed_images = []
+        for image in images:
+            if do_resize:
+                resized_height, resized_width = smart_resize(
+                    height,
+                    width,
+                    factor=self.spatial_factor,
+                    min_pixels=self.min_pixels,
+                    max_pixels=self.max_pixels,
+                )
+                image = resize(
+                    image, size=(resized_height, resized_width), resample=resample, input_data_format=input_data_format
+                )
+
+            if do_rescale:
+                image = self.rescale(image, scale=rescale_factor, input_data_format=input_data_format)
+
+            if do_normalize:
+                image = self.normalize(
+                    image=image, mean=image_mean, std=image_std, input_data_format=input_data_format
+                )
+
+            image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
+            processed_images.append(image)
+
+        images = np.array(processed_images)
+        return images
+
+    def _pad_for_batching(
+        self,
+        pixel_values: list[np.ndarray],
+        image_sizes: list[list[int]],
+        data_format: Optional[Union[str, ChannelDimension]] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    ):
+        """
+        Pads every image at the bottom and right so that all images share the largest height and width in the batch.
+
+        Args:
+            pixel_values (`list[np.ndarray]`):
+                The images to pad, one array per image.
+            image_sizes (`list[list[int]]`):
+                A list of sizes for each image in `pixel_values` in (height, width) format.
+            data_format (`str` or `ChannelDimension`, *optional*):
+                The channel dimension format for the output image. Can be one of:
+                    - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                    - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                If unset, will use same as the input image.
+            input_data_format (`str` or `ChannelDimension`, *optional*):
+                The channel dimension format for the input image. Can be one of:
+                    - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                    - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                If unset, will use the inferred format of the input image.
+
+        Returns:
+            list[`np.ndarray`]: The padded images.
+        """
+
+        max_shape = (
+            max([size[0] for size in image_sizes]),
+            max([size[1] for size in image_sizes]),
+        )
+        pixel_values = [
+            pad(
+                image,
+                padding=((0, max_shape[0] - size[0]), (0, max_shape[1] - size[1])),
+                data_format=data_format,
+                input_data_format=input_data_format,
+            )
+            for image, size in zip(pixel_values, image_sizes)
+        ]
+        return pixel_values
+
+    def preprocess(
+        self,
+        images: ImageInput,
+        do_resize: Optional[bool] = None,
+        size: Optional[dict[str, int]] = None,
+        resample: Optional[PILImageResampling] = None,
+        do_rescale: Optional[bool] = None,
+        rescale_factor: Optional[float] = None,
+        do_normalize: Optional[bool] = None,
+        image_mean: Optional[Union[float, list[float]]] = None,
+        image_std: Optional[Union[float, list[float]]] = None,
+        do_convert_rgb: Optional[bool] = None,
+        do_pad: Optional[bool] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    ):
+        """
+        Preprocess an image or a batch of images and return them, with their sizes, as a `BatchFeature`.
+
+        Args:
+            images (`ImageInput`):
+                Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
+                passing in images with pixel values between 0 and 1, set `do_rescale=False`.
+            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
+                Whether to resize the image.
+            size (`dict[str, int]`, *optional*, defaults to `self.size`):
+                Only forwarded to argument validation. The target size itself is computed by `smart_resize` from
+                `self.min_pixels`, `self.max_pixels` and `self.spatial_factor`.
+            resample (`int`, *optional*, defaults to `self.resample`):
+                Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only
+                has an effect if `do_resize` is set to `True`.
+            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
+                Whether to rescale the image.
+            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
+                Rescale factor to rescale the image by if `do_rescale` is set to `True`.
+            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
+                Whether to normalize the image.
+            image_mean (`float` or `list[float]`, *optional*, defaults to `self.image_mean`):
+                Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.
+            image_std (`float` or `list[float]`, *optional*, defaults to `self.image_std`):
+                Image standard deviation to use for normalization. Only has an effect if `do_normalize` is `True`.
+            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
+                Whether to convert the image to RGB.
+            do_pad (`bool`, *optional*, defaults to `self.do_pad`):
+                Whether to pad the images. If `True`, every image in the batch is padded up to the largest height and
+                width found in the batch. Padding will be applied to the bottom and right with zeros.
+            return_tensors (`str` or `TensorType`, *optional*):
+                The type of tensors to return. Can be one of:
+                - Unset: Return a list of `np.ndarray`.
+                - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
+                - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
+                - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
+                - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
+            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
+                The channel dimension format for the output image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - Unset: Use the channel dimension format of the input image.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the input image. If unset, the channel dimension format is inferred
+                from the input image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+        """
+        do_resize = do_resize if do_resize is not None else self.do_resize
+        size = size if size is not None else self.size
+        resample = resample if resample is not None else self.resample
+        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
+        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
+        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
+        image_mean = image_mean if image_mean is not None else self.image_mean
+        image_std = image_std if image_std is not None else self.image_std
+        do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
+        do_pad = do_pad if do_pad is not None else self.do_pad
+
+        if images is not None:
+            images = self.fetch_images(images)
+            images = make_batched_images(images)
+
+        if images is not None and not valid_images(images):
+            raise ValueError(
+                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
+                "torch.Tensor, tf.Tensor or jax.ndarray."
+            )
+
+        validate_preprocess_arguments(
+            rescale_factor=rescale_factor,
+            do_normalize=do_normalize,
+            image_mean=image_mean,
+            image_std=image_std,
+            do_resize=do_resize,
+            size=size,
+            resample=resample,
+        )
+
+        pixel_values = []
+        for image in images:
+            image = self._preprocess(
+                image,
+                do_resize=do_resize,
+                resample=resample,
+                do_rescale=do_rescale,
+                rescale_factor=rescale_factor,
+                do_normalize=do_normalize,
+                image_mean=image_mean,
+                image_std=image_std,
+                data_format=data_format,
+                do_convert_rgb=do_convert_rgb,
+                input_data_format=input_data_format,
+            )
+            pixel_values.extend(image)
+
+        image_sizes = [image.shape[-2:] for image in pixel_values]
+        if do_pad:
+            pixel_values = self._pad_for_batching(pixel_values, image_sizes)
+            pixel_values = np.array(pixel_values)
+
+        return BatchFeature(
+            data={"pixel_values": pixel_values, "image_sizes": image_sizes}, tensor_type=return_tensors
+        )
+
+    def postprocess(
+        self,
+        images: ImageInput,
+        do_rescale: Optional[bool] = None,
+        rescale_factor: Optional[float] = None,
+        do_normalize: Optional[bool] = None,
+        image_mean: Optional[Union[float, list[float]]] = None,
+        image_std: Optional[Union[float, list[float]]] = None,
+        return_tensors: Union[str, TensorType] = "PIL.Image.Image",
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    ):
+        """
+        Postprocess an image or batch of images tensor. Postprocess is the reverse process of preprocess.
+        The parameters should be same as in preprocess. PIL images given as input are returned unchanged.
+        Args:
+            images (`ImageInput`):
+                Image to postprocess. Expects a single or batch of images with pixel values ranging from -1 to 1.
+            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
+                Whether to rescale the image back and convert it to `uint8` values clipped to [0, 255].
+            rescale_factor (`float`, *optional*, defaults to `1 / self.rescale_factor`):
+                Rescale factor to rescale the image by if `do_rescale` is set to `True`.
+            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
+                Whether to undo the normalization of the image.
+            image_mean (`float` or `list[float]`, *optional*, defaults to `self.image_mean`):
+                Image mean that was used for normalization. Only has an effect if `do_normalize` is set to `True`.
+            image_std (`float` or `list[float]`, *optional*, defaults to `self.image_std`):
+                Image standard deviation that was used for normalization. Only has an effect if `do_normalize` is set to `True`.
+            return_tensors (`str` or `TensorType`, *optional*, defaults to `"PIL.Image.Image"`):
+                The type of tensors to return. Can be one of:
+                - `"PIL.Image.Image"`: Return PIL images (only if both `do_normalize` and `do_rescale` are set).
+                - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
+                - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the input image. If unset, the channel dimension format is inferred
+                from the input image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+        """
+        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
+        rescale_factor = 1.0 / self.rescale_factor if rescale_factor is None else rescale_factor
+        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
+        image_mean = image_mean if image_mean is not None else self.image_mean
+        image_std = image_std if image_std is not None else self.image_std
+
+        images = make_list_of_images(images)
+        if isinstance(images[0], Image.Image):
+            return images if len(images) > 1 else images[0]
+
+        if input_data_format is None:
+            # We assume that all images have the same channel dimension format.
+            input_data_format = infer_channel_dimension_format(images[0])
+
+        pixel_values = []
+        for image in images:
+            image = to_numpy_array(image)
+            if do_normalize:
+                image = self.unnormalize(
+                    image=image, image_mean=image_mean, image_std=image_std, input_data_format=input_data_format
+                )
+
+            if do_rescale:
+                image = self.rescale(image, scale=rescale_factor, input_data_format=input_data_format)
+                image = image.clip(0, 255).astype(np.uint8)
+
+            if do_normalize and do_rescale and return_tensors == "PIL.Image.Image":
+                image = to_channel_dimension_format(image, ChannelDimension.LAST, input_channel_dim=input_data_format)
+                pixel_values.append(Image.fromarray(image))
+            else:
+                pixel_values.extend(image)
+
+        data = {"pixel_values": pixel_values}
+        return_tensors = return_tensors if return_tensors != "PIL.Image.Image" else None
+
+        return BatchFeature(data=data, tensor_type=return_tensors)
+
+    def unnormalize(
+        self,
+        image: np.ndarray,
+        image_mean: Union[float, Iterable[float]],
+        image_std: Union[float, Iterable[float]],
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    ) -> np.ndarray:
+        """
+        Unnormalizes `image` using the mean and standard deviation specified by `image_mean` and `image_std`.
+        image = (image * image_std) + image_mean
+        Args:
+            image (`np.ndarray`):
+                Pixel values of a 3-channel image that was normalized with `image_mean` and `image_std`.
+            image_mean (`float` or `Iterable[float]`):
+                The mean to use for unnormalization.
+            image_std (`float` or `Iterable[float]`):
+                The standard deviation to use for unnormalization.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the input image. If unset, the channel dimension format is inferred
+                from the input image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+        """
+        num_channels = 3
+
+        if isinstance(image_mean, Iterable):
+            if len(image_mean) != num_channels:
+                raise ValueError(f"mean must have {num_channels} elements if it is an iterable, got {len(image_mean)}")
+        else:
+            image_mean = [image_mean] * num_channels
+
+        if isinstance(image_std, Iterable):
+            if len(image_std) != num_channels:
+                raise ValueError(f"std must have {num_channels} elements if it is an iterable, got {len(image_std)}")
+        else:
+            image_std = [image_std] * num_channels
+
+        # Normalizing with mean' = -mean / std and std' = 1 / std computes image * std + mean.
+        rev_image_mean = tuple(-mean / std for mean, std in zip(image_mean, image_std))
+        rev_image_std = tuple(1 / std for std in image_std)
+        image = self.normalize(
+            image=image, mean=rev_image_mean, std=rev_image_std, input_data_format=input_data_format
+        )
+        return image
+
+
+__all__ = ["Emu3ImageProcessor"]
diff --git a/phivenv/Lib/site-packages/transformers/models/emu3/modeling_emu3.py b/phivenv/Lib/site-packages/transformers/models/emu3/modeling_emu3.py
new file mode 100644
index 0000000000000000000000000000000000000000..d50f85283fb0e2fc78943126cb61b017355dfed8
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/emu3/modeling_emu3.py
@@ -0,0 +1,1638 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/emu3/modular_emu3.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_emu3.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc. team. All rights reserved.
+#
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+from functools import cached_property
+from typing import Callable, Optional, Union
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from ...activations import ACT2FN
+from ...cache_utils import Cache, DynamicCache
+from ...generation import GenerationMixin
+from ...integrations import use_kernel_forward_from_hub
+from ...masking_utils import create_causal_mask
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
+from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from ...processing_utils import Unpack
+from ...utils import TransformersKwargs, auto_docstring, can_return_tuple
+from ...utils.deprecation import deprecate_kwarg
+from ...utils.generic import check_model_inputs
+from .configuration_emu3 import Emu3Config, Emu3TextConfig, Emu3VQVAEConfig
+
+
+def rotate_half(x):
+    """Rotates half the hidden dims of the input."""
+    x1 = x[..., : x.shape[-1] // 2]
+    x2 = x[..., x.shape[-1] // 2 :]
+    return torch.cat((-x2, x1), dim=-1)
+
+
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
+    """Applies Rotary Position Embedding to the query and key tensors.
+
+    Args:
+        q (`torch.Tensor`): The query tensor.
+        k (`torch.Tensor`): The key tensor.
+        cos (`torch.Tensor`): The cosine part of the rotary embedding.
+        sin (`torch.Tensor`): The sine part of the rotary embedding.
+        position_ids (`torch.Tensor`, *optional*):
+            Deprecated and unused.
+        unsqueeze_dim (`int`, *optional*, defaults to 1):
+            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
+            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
+            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
+            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
+            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
+            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
+    Returns:
+        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
+    """
+    cos = cos.unsqueeze(unsqueeze_dim)
+    sin = sin.unsqueeze(unsqueeze_dim)
+    q_embed = (q * cos) + (rotate_half(q) * sin)
+    k_embed = (k * cos) + (rotate_half(k) * sin)
+    return q_embed, k_embed
+
+
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """
+    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
+    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
+    """
+    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+    if n_rep == 1:
+        return hidden_states
+    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
+    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
+
+
+def eager_attention_forward(
+    module: nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: Optional[torch.Tensor],
+    scaling: float,
+    dropout: float = 0.0,
+    **kwargs: Unpack[TransformersKwargs],
+):
+    key_states = repeat_kv(key, module.num_key_value_groups)
+    value_states = repeat_kv(value, module.num_key_value_groups)
+
+    attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
+    if attention_mask is not None:
+        causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
+        attn_weights = attn_weights + causal_mask
+
+    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
+    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
+    attn_output = torch.matmul(attn_weights, value_states)
+    attn_output = attn_output.transpose(1, 2).contiguous()
+
+    return attn_output, attn_weights
+
+
+class Emu3Attention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+    def __init__(self, config: Emu3Config, layer_idx: int):
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+        self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
+        self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
+        self.scaling = self.head_dim**-0.5
+        self.attention_dropout = config.attention_dropout
+        self.is_causal = True
+
+        self.q_proj = nn.Linear(
+            config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.k_proj = nn.Linear(
+            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.v_proj = nn.Linear(
+            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.o_proj = nn.Linear(
+            config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
+        )
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
+        attention_mask: Optional[torch.Tensor],
+        past_key_values: Optional[Cache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        input_shape = hidden_states.shape[:-1]
+        hidden_shape = (*input_shape, -1, self.head_dim)
+
+        query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+        key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+        value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+
+        cos, sin = position_embeddings
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+
+        if past_key_values is not None:
+            # sin and cos are specific to RoPE models; cache_position needed for the static cache
+            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        attention_interface: Callable = eager_attention_forward
+        if self.config._attn_implementation != "eager":
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            dropout=0.0 if not self.training else self.attention_dropout,
+            scaling=self.scaling,
+            **kwargs,
+        )
+
+        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
+        attn_output = self.o_proj(attn_output)
+        return attn_output, attn_weights
+
+
+@use_kernel_forward_from_hub("RMSNorm")
+class Emu3RMSNorm(nn.Module):
+    def __init__(self, hidden_size, eps=1e-6):
+        """
+        Emu3RMSNorm is equivalent to T5LayerNorm
+        """
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+
+    def forward(self, hidden_states):
+        input_dtype = hidden_states.dtype
+        hidden_states = hidden_states.to(torch.float32)
+        variance = hidden_states.pow(2).mean(-1, keepdim=True)
+        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+        return self.weight * hidden_states.to(input_dtype)
+
+    def extra_repr(self):
+        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
+
+
+class Emu3MLP(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.intermediate_size = config.intermediate_size
+        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
+        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
+        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.mlp_bias)
+        self.act_fn = ACT2FN[config.hidden_act]
+
+    def forward(self, x):
+        down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
+        return down_proj
+
+
+class Emu3DecoderLayer(GradientCheckpointingLayer):
+    def __init__(self, config: Emu3Config, layer_idx: int):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+
+        self.self_attn = Emu3Attention(config=config, layer_idx=layer_idx)
+
+        self.mlp = Emu3MLP(config)
+        self.input_layernorm = Emu3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = Emu3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.dropout = nn.Dropout(config.attention_dropout)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        use_cache: Optional[bool] = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> torch.Tensor:
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+
+        hidden_states, _ = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            position_embeddings=position_embeddings,
+            **kwargs,
+        )
+        hidden_states = residual + self.dropout(hidden_states)
+
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + self.dropout(hidden_states)
+        return hidden_states
+
+
+class Emu3VQVAEVectorQuantizer(nn.Module):
+    """
+    A module for vector quantization using learned embedding vectors.
+
+    This module implements the quantization process similar to te one described in
+    the VQ-VAE (Vector Quantized Variational AutoEncoder) paper. It quantizes continuous
+    input vectors into discrete codebook vectors, which are learned during training.
+    Current implementation improves over previous ones by avoiding costly matrix multiplications
+    and allowing for post-hoc remapping of indices.
+    """
+
+    def __init__(self, config: Emu3VQVAEConfig):
+        super().__init__()
+        self.embedding = nn.Embedding(config.codebook_size, config.embed_dim)
+        self.embedding.weight.data.uniform_(-1.0 / config.codebook_size, 1.0 / config.codebook_size)
+
+    def forward(self, hidden_state: torch.Tensor):
+        batch_size, temporal, channels, height, width = hidden_state.shape
+        hidden_state = hidden_state.permute(0, 1, 3, 4, 2).contiguous()
+        hidden_state_flattened = hidden_state.view(-1, channels)
+
+        # distances from z to embeddings e_j (z - e)^2 = z^2 + e^2 - 2 e * z
+        hidden_state_sum = torch.sum(hidden_state_flattened**2, dim=1, keepdim=True)
+        embedding_sum = torch.sum(self.embedding.weight**2, dim=1)
+
+        # "bd,dn->bn",
+        distances = 2 * torch.matmul(hidden_state_flattened, self.embedding.weight.transpose(0, 1))
+        distances = hidden_state_sum + embedding_sum - distances
+
+        min_encoding_indices = torch.argmin(distances, dim=1)
+        min_encoding_indices = min_encoding_indices.view(batch_size, temporal, height, width)
+        return min_encoding_indices
+
+
+class Emu3VQVAEEncoderConvDownsample(nn.Module):
+    def __init__(self, in_channels):
+        super().__init__()
+        self.conv = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=2, padding=0)
+
+    def forward(self, hidden_states):
+        # no asymmetric padding in torch conv, must do it ourselves
+        hidden_states = F.pad(hidden_states, pad=(0, 1, 0, 1), mode="constant", value=0)
+        hidden_states = self.conv(hidden_states)
+        return hidden_states
+
+
+class Emu3VQVAEEncoderConvUpsample(nn.Module):
+    def __init__(self, in_channels):
+        super().__init__()
+        self.conv = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1)
+
+    def forward(self, hidden_states):
+        hidden_states = F.interpolate(hidden_states, scale_factor=2.0, mode="nearest")
+        hidden_states = self.conv(hidden_states)
+        return hidden_states
+
+
+class Emu3VQVAEConv3d(nn.Module):
+    def __init__(
+        self,
+        in_channel: int,
+        out_channel: int,
+        kernel_size: tuple[int],
+        stride: tuple[int],
+    ):
+        super().__init__()
+
+        padding_sizes = [one_kernel - one_stride for one_kernel, one_stride in zip(kernel_size[1:], stride[1:])]
+        self.padding = ()
+        for pad_size in padding_sizes[::-1]:
+            self.padding += (pad_size // 2 + pad_size % 2, pad_size // 2)
+        self.padding += (2, 0)
+
+        self.conv = nn.Conv3d(
+            in_channel,
+            out_channel,
+            kernel_size,
+            stride=stride,
+        )
+
+    def forward(self, hidden_states: torch.Tensor):
+        hidden_states = F.pad(hidden_states, self.padding)
+        hidden_states = self.conv(hidden_states)
+        return hidden_states
+
+
+class Emu3VQVAESpatialNorm(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+    ):
+        super().__init__()
+        self.norm_layer = nn.GroupNorm(
+            num_channels=out_channels,
+            num_groups=32,
+            eps=1e-6,
+            affine=True,
+        )
+
+        self.conv_y = nn.Conv2d(
+            in_channels,
+            out_channels,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+        )
+        self.conv_b = nn.Conv2d(
+            in_channels,
+            out_channels,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+        )
+
+    def forward(self, hidden_states: torch.Tensor, quant_states: torch.Tensor):
+        quant_states = F.interpolate(quant_states, size=hidden_states.shape[-2:], mode="nearest")
+        hidden_states = self.norm_layer(hidden_states)
+        hidden_states = hidden_states * self.conv_y(quant_states) + self.conv_b(quant_states)
+        return hidden_states
+
+
+class Emu3VQVAETemporalUpsample(nn.Module):
+    def __init__(
+        self,
+        in_channel: int,
+        out_channel: int,
+    ):
+        super().__init__()
+        self.conv = Emu3VQVAEConv3d(
+            in_channel,
+            out_channel,
+            kernel_size=(3, 3, 3),
+            stride=(1, 1, 1),
+        )
+
+    def forward(self, hidden_states: torch.Tensor):
+        batch_size, channels, temporal, height, width = hidden_states.shape
+        hidden_states = hidden_states.permute(0, 1, 3, 4, 2).contiguous().view(batch_size, -1, temporal)
+        hidden_states = F.interpolate(hidden_states, scale_factor=2.0, mode="nearest")
+        hidden_states = hidden_states.view(batch_size, channels, height, width, -1).permute(0, 1, 4, 2, 3).contiguous()
+        hidden_states = self.conv(hidden_states)
+        return hidden_states
+
+
+class Emu3VQVAETemporalDownsample(nn.Module):
+    def __init__(
+        self,
+        in_channel: int,
+        out_channel: int,
+    ):
+        super().__init__()
+        self.conv = Emu3VQVAEConv3d(
+            in_channel,
+            out_channel,
+            kernel_size=(4, 3, 3),
+            stride=(2, 1, 1),
+        )
+
+    def forward(self, hidden_states: torch.Tensor):
+        hidden_states = self.conv(hidden_states)
+        return hidden_states
+
+
+class Emu3VQVAETemporalResnetBlock(nn.Module):
+    def __init__(
+        self,
+        in_channels,
+        out_channels=None,
+    ):
+        super().__init__()
+        self.in_channels = in_channels
+        self.out_channels = in_channels if out_channels is None else out_channels
+
+        self.norm1 = nn.BatchNorm3d(in_channels)
+        self.conv1 = Emu3VQVAEConv3d(
+            in_channels,
+            out_channels,
+            kernel_size=(3, 3, 3),
+            stride=(1, 1, 1),
+        )
+        self.norm2 = nn.BatchNorm3d(out_channels)
+        self.conv2 = Emu3VQVAEConv3d(
+            out_channels,
+            out_channels,
+            kernel_size=(3, 3, 3),
+            stride=(1, 1, 1),
+        )
+        if self.in_channels != self.out_channels:
+            self.nin_shortcut = nn.Conv3d(
+                in_channels,
+                out_channels,
+                kernel_size=1,
+                stride=1,
+                padding=0,
+            )
+
+    def forward(self, hidden_states):
+        residual = hidden_states
+        hidden_states = self.norm1(hidden_states)
+        hidden_states *= torch.sigmoid(hidden_states)
+        hidden_states = self.conv1(hidden_states)
+
+        hidden_states = self.norm2(hidden_states)
+        hidden_states *= torch.sigmoid(hidden_states)
+        hidden_states = self.conv2(hidden_states)
+
+        if self.in_channels != self.out_channels:
+            residual = self.nin_shortcut(residual)
+
+        return residual + hidden_states
+
+
+class Emu3VQVAEResnetBlock(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: Optional[int] = None,
+        quant_channels: Optional[int] = None,
+    ):
+        super().__init__()
+        self.in_channels = in_channels
+        out_channels = in_channels if out_channels is None else out_channels
+        self.out_channels = out_channels
+        self.quant_channels = quant_channels
+
+        if quant_channels is None:
+            self.norm1 = nn.GroupNorm(num_channels=in_channels, num_groups=32, eps=1e-6, affine=True)
+            self.norm2 = nn.GroupNorm(num_channels=out_channels, num_groups=32, eps=1e-6, affine=True)
+        else:
+            self.norm1 = Emu3VQVAESpatialNorm(quant_channels, in_channels)
+            self.norm2 = Emu3VQVAESpatialNorm(quant_channels, out_channels)
+
+        self.conv1 = nn.Conv2d(
+            in_channels,
+            out_channels,
+            kernel_size=3,
+            stride=1,
+            padding=1,
+        )
+
+        self.conv2 = nn.Conv2d(
+            out_channels,
+            out_channels,
+            kernel_size=3,
+            stride=1,
+            padding=1,
+        )
+
+        if self.in_channels != self.out_channels:
+            self.nin_shortcut = nn.Conv2d(
+                in_channels,
+                out_channels,
+                kernel_size=1,
+                stride=1,
+                padding=0,
+            )
+
+    def forward(self, hidden_states: torch.Tensor, quant_channels: Optional[torch.Tensor] = None):
+        norm_args = () if self.quant_channels is None else (quant_channels,)
+
+        residual = hidden_states
+        hidden_states = self.norm1(hidden_states, *norm_args)
+        hidden_states *= torch.sigmoid(hidden_states)
+        hidden_states = self.conv1(hidden_states)
+
+        hidden_states = self.norm2(hidden_states, *norm_args)
+        hidden_states *= torch.sigmoid(hidden_states)
+        hidden_states = self.conv2(hidden_states)
+
+        if self.in_channels != self.out_channels:
+            residual = self.nin_shortcut(residual)
+
+        return residual + hidden_states
+
+
+class Emu3VQVAEAttentionBlock(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+    def __init__(self, config: Emu3VQVAEConfig):
+        super().__init__()
+        self.config = config
+        self.embed_dim = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = self.embed_dim // self.num_heads
+        if self.head_dim * self.num_heads != self.embed_dim:
+            raise ValueError(
+                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
+                f" {self.num_heads})."
+            )
+        self.scale = self.head_dim**-0.5
+        self.dropout = config.attention_dropout
+        self.is_causal = False
+
+        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
+        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
+        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
+        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)
+
+        # for compatibility with the attention interface
+        self.num_key_value_groups = 1
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        **kwargs,
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        """Input shape: Batch x Time x Channel"""
+
+        batch_size, seq_length, embed_dim = hidden_states.shape
+
+        queries = self.q_proj(hidden_states)
+        keys = self.k_proj(hidden_states)
+        values = self.v_proj(hidden_states)
+
+        queries = queries.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
+        keys = keys.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
+        values = values.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
+
+        attention_interface: Callable = eager_attention_forward
+        if self.config._attn_implementation != "eager":
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+        attn_output, attn_weights = attention_interface(
+            self,
+            queries,
+            keys,
+            values,
+            attention_mask,
+            is_causal=self.is_causal,
+            scaling=self.scale,
+            dropout=0.0 if not self.training else self.dropout,
+        )
+
+        attn_output = attn_output.reshape(batch_size, seq_length, embed_dim).contiguous()
+        attn_output = self.out_proj(attn_output)
+
+        return attn_output, attn_weights
+
+
+class Emu3VQVAEGroupNorm(nn.GroupNorm):
+    """
+    Same as the torch GroupNorm with the only difference that this ones accepts
+    an optional kwarg `quant_states` which is not used. This class makes it easier to
+    use SpatialNorm or GroupNorm without conditionals
+    """
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+    def forward(self, input, quant_states=None):
+        return F.group_norm(input, self.num_groups, self.weight, self.bias, self.eps)
+
+
+class Emu3VQVAEMiddleBlock(nn.Module):
+    def __init__(self, config, in_channels, quant_channels=None):
+        super().__init__()
+
+        self.block_1 = Emu3VQVAEResnetBlock(
+            in_channels=in_channels,
+            out_channels=in_channels,
+            quant_channels=quant_channels,
+        )
+        self.attn_1 = Emu3VQVAEAttentionBlock(config)
+        if quant_channels is None:
+            self.attn_norm = Emu3VQVAEGroupNorm(num_channels=in_channels, num_groups=32, eps=1e-6, affine=True)
+        else:
+            self.attn_norm = Emu3VQVAESpatialNorm(quant_channels, in_channels)
+
+        self.block_2 = Emu3VQVAEResnetBlock(
+            in_channels=in_channels,
+            out_channels=in_channels,
+            quant_channels=quant_channels,
+        )
+
+    def forward(self, hidden_states: torch.FloatTensor, quant_states: Optional[torch.FloatTensor] = None):
+        hidden_states = self.block_1(hidden_states, quant_states)
+        residual = hidden_states
+        hidden_states = self.attn_norm(hidden_states, quant_states)
+        batch_size, channels, height, width = hidden_states.shape
+        hidden_states = hidden_states.view(batch_size, channels, height * width).transpose(1, 2)
+        hidden_states = self.attn_1(hidden_states)[0]
+        hidden_states = hidden_states.reshape(batch_size, height, width, channels).permute(0, 3, 1, 2)
+        hidden_states = residual + hidden_states
+        hidden_states = self.block_2(hidden_states, quant_states)
+        return hidden_states
+
+
+class Emu3VQVAEDownBlock(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+
+        self.num_resolutions = len(config.channel_multiplier)
+        self.num_res_blocks = config.num_res_blocks
+        base_channels = config.base_channels
+        channel_multiplier = config.channel_multiplier
+
+        in_channel_multiplier = (1,) + tuple(channel_multiplier)
+        self.in_channel_multiplier = in_channel_multiplier
+        self.down = nn.ModuleList()
+        for i_level in range(self.num_resolutions):
+            block = nn.ModuleList()
+            attn = nn.ModuleList()
+            attn_norms = nn.ModuleList()
+            block_in = base_channels * in_channel_multiplier[i_level]
+            block_out = base_channels * channel_multiplier[i_level]
+            for i_block in range(self.num_res_blocks):
+                block.append(
+                    Emu3VQVAEResnetBlock(
+                        in_channels=block_in,
+                        out_channels=block_out,
+                    )
+                )
+                block_in = block_out
+                if config.attn_resolutions is not None and i_level in config.attn_resolutions:
+                    attn.append(Emu3VQVAEAttentionBlock(config))
+                    attn_norms.append(nn.GroupNorm(num_channels=block_in, num_groups=32, eps=1e-6, affine=True))
+
+            down = nn.Module()
+            down.block = block
+            down.attn = attn
+            down.attn_norms = attn_norms
+            if i_level != self.num_resolutions - 1:
+                down.downsample = Emu3VQVAEEncoderConvDownsample(block_in)
+            self.down.append(down)
+
+    def forward(self, hidden_states: torch.FloatTensor):
+        for i_level, blocks in enumerate(self.down):
+            for i_block in range(self.num_res_blocks):
+                hidden_states = blocks.block[i_block](hidden_states)
+                if len(blocks.attn) > 0:
+                    residual = hidden_states
+                    hidden_states = blocks.attn_norms[i_block](hidden_states)
+
+                    batch_size, channels, height, width = hidden_states.shape
+                    hidden_states = hidden_states.view(batch_size, channels, height * width).transpose(1, 2)
+                    hidden_states = blocks.attn[i_block](hidden_states)[0]
+
+                    hidden_states = hidden_states.reshape(batch_size, height, width, channels).permute(0, 3, 1, 2)
+                    hidden_states = residual + hidden_states
+
+            if i_level != self.num_resolutions - 1:
+                hidden_states = blocks.downsample(hidden_states)
+
+        return hidden_states
+
+
+class Emu3VQVAEUpBlock(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+
+        self.num_resolutions = len(config.channel_multiplier)
+        self.num_res_blocks = config.num_res_blocks
+
+        quant_channels = config.embed_dim
+        block_in = config.base_channels * config.channel_multiplier[-1]
+
+        self.up = nn.ModuleList()
+        for i_level in reversed(range(self.num_resolutions)):
+            block = nn.ModuleList()
+            attn = nn.ModuleList()
+            attn_norms = nn.ModuleList()
+            block_out = config.base_channels * config.channel_multiplier[i_level]
+            for i_block in range(self.num_res_blocks + 1):
+                block.append(
+                    Emu3VQVAEResnetBlock(
+                        in_channels=block_in,
+                        out_channels=block_out,
+                        quant_channels=quant_channels,
+                    )
+                )
+                block_in = block_out
+                if i_level in config.attn_resolutions:
+                    attn.append(Emu3VQVAEAttentionBlock(config))
+                    attn_norms.append(Emu3VQVAESpatialNorm(quant_channels, block_in))
+
+            up = nn.Module()
+            up.block = block
+            up.attn = attn
+            up.attn_norms = attn_norms
+            if i_level != 0:
+                up.upsample = Emu3VQVAEEncoderConvUpsample(block_in)
+
+            self.up.insert(0, up)
+
+    def forward(self, hidden_states: torch.FloatTensor, quant_states: torch.FloatTensor):
+        for i_level, blocks in enumerate(self.up[::-1]):
+            for i_block in range(self.num_res_blocks + 1):
+                hidden_states = blocks.block[i_block](hidden_states, quant_states)
+                if len(blocks.attn) > 0:
+                    residual = hidden_states
+                    hidden_states = blocks.attn_norms[i_block](hidden_states, quant_states)
+
+                    batch_size, channels, height, width = hidden_states.shape
+                    hidden_states = hidden_states.view(batch_size, channels, height * width).transpose(1, 2)
+                    hidden_states = blocks.attn[i_block](hidden_states)[0]
+
+                    hidden_states = hidden_states.reshape(batch_size, height, width, channels).permute(0, 3, 1, 2)
+                    hidden_states = residual + hidden_states
+            if i_level != len(self.up) - 1:
+                hidden_states = blocks.upsample(hidden_states)
+
+        return hidden_states
+
+
+class Emu3VQVAEEncoder(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+
+        base_channels = config.base_channels
+        in_channels = config.in_channels
+        double_latent = config.double_latent
+        latent_channels = config.latent_channels
+        channel_multiplier = config.channel_multiplier
+        out_channels = 2 * latent_channels if double_latent else latent_channels
+        block_in = base_channels * channel_multiplier[-1]
+
+        self.conv_in = torch.nn.Conv2d(in_channels, base_channels, kernel_size=3, stride=1, padding=1)
+        self.down_block = Emu3VQVAEDownBlock(config)
+        self.middle_block = Emu3VQVAEMiddleBlock(config, block_in)
+
+        self.norm_out = torch.nn.GroupNorm(num_groups=32, num_channels=block_in, eps=1e-6, affine=True)
+        self.conv_out = torch.nn.Conv2d(
+            block_in,
+            out_channels,
+            kernel_size=3,
+            stride=1,
+            padding=1,
+        )
+
+        temporal_down_blocks = int(math.log2(config.temporal_downsample_factor))
+        self.time_conv = nn.ModuleList()
+        self.time_res_stack = nn.ModuleList()
+
+        for i in range(temporal_down_blocks):
+            conv = Emu3VQVAETemporalDownsample(out_channels, out_channels)
+            self.time_conv.append(conv)
+
+        for _ in range(config.num_res_blocks):
+            time_res_conv = Emu3VQVAETemporalResnetBlock(
+                in_channels=out_channels,
+                out_channels=out_channels,
+            )
+            self.time_res_stack.append(time_res_conv)
+
+    def forward(self, pixel_values: torch.FloatTensor):
+        """Encode a 5-D clip `(batch, time, channels, height, width)` frame by frame, then convolve over time."""
+        temporal_dim = pixel_values.shape[1]
+        # Fold the time axis into the batch axis so the 2D convolutions see one frame per sample.
+        pixel_values = pixel_values.reshape(-1, *pixel_values.shape[2:])
+
+        # downsampling & middle
+        hidden_states = self.conv_in(pixel_values)
+        hidden_states = self.down_block(hidden_states)
+        hidden_states = self.middle_block(hidden_states)
+
+        # end: normalize, apply x * sigmoid(x) in place, then project with `conv_out`
+        hidden_states = self.norm_out(hidden_states)
+        hidden_states *= torch.sigmoid(hidden_states)
+        hidden_states = self.conv_out(hidden_states)
+
+        # Restore the time axis, then move channels in front of it for the temporal layers.
+        hidden_states = hidden_states.reshape(-1, temporal_dim, *hidden_states.shape[1:])
+        hidden_states = hidden_states.permute(0, 2, 1, 3, 4)
+
+        # temporal convs
+        for conv in self.time_conv:
+            hidden_states = conv(hidden_states)
+            hidden_states *= torch.sigmoid(hidden_states)
+        for layer in self.time_res_stack:
+            hidden_states = layer(hidden_states)
+
+        return hidden_states.permute(0, 2, 1, 3, 4)  # swap dims 1 and 2 back
+
+
+class Emu3VQVAEDecoder(nn.Module):
+    def __init__(self, config: Emu3VQVAEConfig):
+        super().__init__()
+
+        quant_channels = config.embed_dim
+        block_in = config.base_channels * config.channel_multiplier[-1]
+        self.time_res_stack = nn.ModuleList()
+        for _ in range(config.num_res_blocks):
+            time_res_conv = Emu3VQVAETemporalResnetBlock(
+                in_channels=config.latent_channels, out_channels=config.latent_channels
+            )
+            self.time_res_stack.append(time_res_conv)
+
+        temp_upsample_block_num = int(math.log2(config.temporal_downsample_factor))
+        self.time_conv = nn.ModuleList()
+        for i in range(temp_upsample_block_num):
+            conv = Emu3VQVAETemporalUpsample(config.latent_channels, config.latent_channels)
+            self.time_conv.append(conv)
+
+        self.conv_in = nn.Conv2d(
+            config.latent_channels,
+            block_in,
+            kernel_size=3,
+            stride=1,
+            padding=1,
+        )
+
+        self.middle_block = Emu3VQVAEMiddleBlock(config, block_in, quant_channels=quant_channels)
+        self.up_block = Emu3VQVAEUpBlock(config)
+
+        block_in = config.base_channels * config.channel_multiplier[0]
+        self.norm_out = Emu3VQVAESpatialNorm(quant_channels, block_in)
+        self.conv_out = nn.Conv2d(
+            block_in,
+            config.out_channels,
+            kernel_size=3,
+            stride=1,
+            padding=1,
+        )
+
+    def forward(self, hidden_states: torch.Tensor, quant_states: torch.Tensor):
+        """Apply the temporal layers to both inputs, then decode `hidden_states` using `quant_states` as side input."""
+        # Stack on dim 0 so one pass covers both; `torch.cat` needs every other dim (channels too) to match.
+        hidden_quant_states = torch.cat((hidden_states, quant_states), dim=0)
+        hidden_quant_states = hidden_quant_states.permute(0, 2, 1, 3, 4)
+
+        # temporal convs
+        for layer in self.time_res_stack:
+            hidden_quant_states = layer(hidden_quant_states)
+        for layer in self.time_conv:
+            hidden_quant_states = layer(hidden_quant_states)
+            hidden_quant_states *= torch.sigmoid(hidden_quant_states)
+
+        # Undo the axis swap, split the stacked batch in two, and merge dims 0 and 1 of each half.
+        hidden_quant_states = hidden_quant_states.permute(0, 2, 1, 3, 4)
+        hidden_states, quant_states = torch.chunk(hidden_quant_states, 2, dim=0)
+        hidden_states = hidden_states.reshape(-1, *hidden_states.shape[2:])
+        quant_states = quant_states.reshape(-1, *quant_states.shape[2:])
+
+        hidden_states = self.conv_in(hidden_states)
+        # middle & upsampling
+        hidden_states = self.middle_block(hidden_states, quant_states)
+        hidden_states = self.up_block(hidden_states, quant_states)
+
+        hidden_states = self.norm_out(hidden_states, quant_states)
+        hidden_states *= torch.sigmoid(hidden_states)
+        hidden_states = self.conv_out(hidden_states)
+        return hidden_states
+
+
+@auto_docstring(
+    custom_intro="""
+    The VQ-VAE model used in Emu3 for encoding/decoding images into discrete tokens.
+    This model follows the "Make-a-scene: Scene-based text-to-image generation with human priors" paper from
+    [ Oran Gafni, Adam Polyak, Oron Ashual, Shelly Sheynin, Devi Parikh, and Yaniv
+    Taigman](https://huggingface.co/papers/2203.13131).
+    """
+)
+class Emu3VQVAE(PreTrainedModel):
+    config: Emu3VQVAEConfig
+    base_model_prefix = "emuvideovq"
+    main_input_name = "pixel_values"
+    _supports_sdpa = True
+    _supports_flash_attn = True
+    _supports_flex_attn = True
+    _supports_attention_backend = True
+    _no_split_modules = [
+        "Emu3VQVAETemporalResnetBlock",
+        "Emu3VQVAEAttentionBlock",
+        "Emu3VQVAEResnetBlock",
+        "Emu3VQVAEVectorQuantizer",
+    ]
+
+    def _init_weights(self, module):
+        if isinstance(module, (nn.Conv2d, nn.Conv3d)):
+            nn.init.kaiming_normal_(module.weight, mode="fan_out", nonlinearity="relu")
+            if module.bias is not None:
+                fan_in, _ = nn.init._calculate_fan_in_and_fan_out(module.weight)
+                bound = 1 / math.sqrt(fan_in)
+                nn.init.uniform_(module.bias, -bound, bound)
+        elif isinstance(module, nn.Linear):
+            nn.init.kaiming_uniform_(module.weight, a=math.sqrt(5))
+            if module.bias is not None:
+                fan_in, _ = nn.init._calculate_fan_in_and_fan_out(module.weight)
+                bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0
+                nn.init.uniform_(module.bias, -bound, bound)
+        elif isinstance(module, (nn.BatchNorm2d, nn.BatchNorm3d, nn.GroupNorm)):
+            nn.init.constant_(module.weight, 1.0)
+            nn.init.constant_(module.bias, 0.0)
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_()
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+
+    def __init__(self, config: Emu3VQVAEConfig):
+        super().__init__(config)
+
+        self.config = config
+
+        self.encoder = Emu3VQVAEEncoder(config)
+        self.decoder = Emu3VQVAEDecoder(config)
+        self.quantize = Emu3VQVAEVectorQuantizer(config)
+        self.vision_spatial_factor = 2 ** (len(config.channel_multiplier) - 1)
+
+        self.quant_conv = Emu3VQVAEConv3d(
+            config.latent_channels, config.embed_dim, kernel_size=(3, 1, 1), stride=(1, 1, 1)
+        )
+        self.post_quant_conv = Emu3VQVAEConv3d(
+            config.embed_dim, config.latent_channels, kernel_size=(3, 1, 1), stride=(1, 1, 1)
+        )
+        self.spatial_scale_factor = 2 ** (len(config.channel_multiplier) - 1)
+        self.eval()  # Emu3's VQ model is frozen
+
+        self.post_init()
+
+    def encode(self, pixel_values: torch.Tensor, image_sizes: torch.Tensor):
+        """Encode images (4-D) or clips (5-D) into `self.quantize` codes, cropped per `image_sizes`."""
+        is_image = pixel_values.ndim == 4
+        if is_image:
+            temporal = self.config.temporal_downsample_factor
+            batch_size, channels, height, width = pixel_values.shape
+            # Tile a still image `temporal_downsample_factor` times along a new time axis (dim 1).
+            pixel_values = pixel_values.unsqueeze(1).repeat(1, temporal, 1, 1, 1)
+        else:
+            batch_size, temporal, channels, height, width = pixel_values.shape  # raises unless input is 5-D
+
+        hidden_states = self.encoder(pixel_values)
+        # b t c h w -> b c t h w
+        hidden_states = hidden_states.permute(0, 2, 1, 3, 4)
+        hidden_states = self.quant_conv(hidden_states)
+
+        # b c t h w -> b t c h w
+        hidden_states = hidden_states.permute(0, 2, 1, 3, 4)
+        codes = self.quantize(hidden_states)
+        image_tokens = codes.squeeze(1) if is_image else codes
+
+        # Crop each code grid to the image's (height, width) divided by `vision_spatial_factor`.
+        image_tokens = [
+            single_image[: int(size[0] / self.vision_spatial_factor), : int(size[1] / self.vision_spatial_factor)]
+            for single_image, size in zip(image_tokens, image_sizes)
+        ]
+        return image_tokens
+
+    def decode(self, hidden_states: torch.Tensor):
+        """Decode codes shaped (batch, height, width) or (batch, time, height, width) into pixel values."""
+        is_image = hidden_states.ndim == 3
+        if is_image:
+            hidden_states = hidden_states.unsqueeze(1)
+
+        batch_size, temporal, height, width = hidden_states.shape
+        # One embedding vector per code; `view` restores the grid and `permute` moves channels to dim 1.
+        quant = self.quantize.embedding(hidden_states.flatten())
+        channels = quant.shape[-1]
+        quant = quant.view(batch_size, temporal, height, width, channels).permute(0, 4, 1, 2, 3).contiguous()
+        post_quant = self.post_quant_conv(quant)
+
+        quant = quant.permute(0, 2, 1, 3, 4)
+        post_quant = post_quant.permute(0, 2, 1, 3, 4)
+        video = self.decoder(post_quant, quant)
+        video = video.reshape(
+            batch_size,
+            temporal * self.config.temporal_downsample_factor,
+            self.config.out_channels,
+            height * self.spatial_scale_factor,
+            width * self.spatial_scale_factor,
+        )
+        return video[:, 0] if is_image else video  # 3-D input: keep only the first frame
+
+
+class Emu3ImageVocabularyMapping:
+    """
+    A class for mapping discrete image tokens from VQGAN to BPE tokens.
+    """
+
+    def __init__(self, vocab_map):
+        self.vocab_map = vocab_map
+        self.eol_token_id = vocab_map.get("<|extra_200|>")
+        self.image_token_id = vocab_map.get("<image>")
+
+    @cached_property
+    def image_tokens(self):
+        return sorted([val for name, val in self.vocab_map.items() if name.startswith("<|visual token")])
+
+    @cached_property
+    def image_tokens_str(self):
+        return sorted([name for name, val in self.vocab_map.items() if name.startswith("<|visual token")])
+
+    @cached_property
+    def img2bpe(self):
+        return {int(token[-8:-2]): self.vocab_map[token] for token in self.image_tokens_str}
+
+    @cached_property
+    def bpe2img(self):
+        return {v: k for k, v in self.img2bpe.items()}
+
+    @cached_property
+    def bpe2img_mapping_tensor(self):
+        mapping = torch.zeros(max(self.bpe2img.keys()) + 1, dtype=torch.int)
+        for k, v in self.bpe2img.items():
+            mapping[k] = v
+        return mapping
+
+    @cached_property
+    def img2bpe_mapping_tensor(self):
+        mapping = torch.zeros(max(self.img2bpe.keys()) + 1, dtype=torch.int)
+        for k, v in self.img2bpe.items():
+            mapping[k] = v
+        return mapping
+
+    def convert_img2bpe(self, img_batch: torch.Tensor) -> torch.Tensor:
+        """Map a 2-D grid of image codes to BPE ids and append one EOL token id to every row."""
+        device = img_batch.device
+        eol_row = torch.ones((img_batch.shape[0], 1), dtype=torch.int) * self.eol_token_id
+        img_tokens = self.img2bpe_mapping_tensor[img_batch.to("cpu")]  # index on CPU, result moved back below
+        return torch.cat([img_tokens, eol_row], dim=-1).to(device)
+
+    def convert_bpe2img(self, img_batch: torch.Tensor) -> torch.Tensor:
+        device = img_batch.device  # remembered so the result can be moved back to the input's device
+        img_batch = img_batch[..., :-1]  # drop the last entry along the final dim (the EOL token of each row)
+        img_tokens = self.bpe2img_mapping_tensor[img_batch.to("cpu")]  # index on CPU
+        return img_tokens.to(device)
+
+
+@auto_docstring
+class Emu3PreTrainedModel(PreTrainedModel):
+    config: Emu3Config
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _no_split_modules = [
+        "Emu3DecoderLayer",
+    ]
+    _skip_keys_device_placement = ["past_key_values", "causal_mask"]
+    _supports_flash_attn = True
+    _supports_sdpa = True
+
+    _can_compile_fullgraph = True
+    _supports_param_buffer_assignment = False
+    _supports_flex_attn = True
+    _supports_attention_backend = True
+
+
+class Emu3RotaryEmbedding(nn.Module):
+    inv_freq: torch.Tensor  # fix linting for `register_buffer`
+
+    def __init__(self, config: Emu3Config, device=None):
+        super().__init__()
+        # BC: "rope_type" was originally "type"
+        if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict):
+            self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
+        else:
+            self.rope_type = "default"
+        self.max_seq_len_cached = config.max_position_embeddings
+        self.original_max_seq_len = config.max_position_embeddings
+
+        self.config = config
+        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
+
+        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+        self.original_inv_freq = self.inv_freq
+
+    @torch.no_grad()
+    @dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
+    def forward(self, x, position_ids):
+        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
+        position_ids_expanded = position_ids[:, None, :].float()
+
+        device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
+        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
+            emb = torch.cat((freqs, freqs), dim=-1)
+            cos = emb.cos() * self.attention_scaling
+            sin = emb.sin() * self.attention_scaling
+
+        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
+
+
+@auto_docstring
+class Emu3TextModel(Emu3PreTrainedModel):
+    _can_record_outputs = {
+        "hidden_states": Emu3DecoderLayer,
+        "attentions": Emu3Attention,
+    }
+
+    def __init__(self, config: Emu3Config):
+        super().__init__(config)
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+        self.layers = nn.ModuleList(
+            [Emu3DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+        )
+        self.norm = Emu3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.rotary_emb = Emu3RotaryEmbedding(config=config)
+        self.gradient_checkpointing = False
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @check_model_inputs
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> BaseModelOutputWithPast:
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+        if inputs_embeds is None:
+            inputs_embeds: torch.Tensor = self.embed_tokens(input_ids)
+
+        if use_cache and past_key_values is None:
+            past_key_values = DynamicCache(config=self.config)
+
+        if cache_position is None:
+            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+            cache_position: torch.Tensor = torch.arange(
+                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
+            )
+
+        if position_ids is None:
+            position_ids = cache_position.unsqueeze(0)
+
+        causal_mask = create_causal_mask(
+            config=self.config,
+            input_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            cache_position=cache_position,
+            past_key_values=past_key_values,
+            position_ids=position_ids,
+        )
+
+        hidden_states = inputs_embeds
+        position_embeddings = self.rotary_emb(hidden_states, position_ids)
+
+        for decoder_layer in self.layers[: self.config.num_hidden_layers]:
+            hidden_states = decoder_layer(
+                hidden_states,
+                attention_mask=causal_mask,
+                position_ids=position_ids,
+                past_key_values=past_key_values,
+                cache_position=cache_position,
+                position_embeddings=position_embeddings,
+                **kwargs,
+            )
+
+        hidden_states = self.norm(hidden_states)
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values,
+        )
+
+
+@auto_docstring
+class Emu3ForCausalLM(Emu3PreTrainedModel, GenerationMixin):
+    _tied_weights_keys = ["lm_head.weight"]
+    _tp_plan = {"lm_head": "colwise_rep"}
+    _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
+    config: Emu3TextConfig
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.model = Emu3TextModel(config)
+        self.vocab_size = config.vocab_size
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> CausalLMOutputWithPast:
+        r"""
+        Example:
+
+        ```python
+        >>> from transformers import Emu3Processor, Emu3ForCausalLM
+        >>> import torch
+        >>> import requests
+        >>> from PIL import Image
+
+        >>> model = Emu3ForCausalLM.from_pretrained("BAAI/Emu3-Chat-hf", dtype=torch.bfloat16)
+        >>> processor = Emu3Processor.from_pretrained("BAAI/Emu3-Chat-hf")
+
+        >>> inputs = processor(text=["Can you write me a poem about winter."], return_tensors="pt").to(model.device)
+
+        >>> generated_ids = model.generate(**inputs, max_new_tokens=100, do_sample=False)
+        >>> processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+        ```"""
+        outputs: BaseModelOutputWithPast = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            **kwargs,
+        )
+
+        hidden_states = outputs.last_hidden_state
+        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
+        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+        logits = self.lm_head(hidden_states[:, slice_indices, :])
+
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs)
+
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+class Emu3Model(Emu3PreTrainedModel):
+    _checkpoint_conversion_mapping = {"text_model.model": "text_model"}
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.text_model = Emu3TextModel._from_config(config.text_config)
+        self.vqmodel = Emu3VQVAE(config.vq_config)
+        self.vocabulary_mapping = Emu3ImageVocabularyMapping(config.vocabulary_map)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.text_model.get_input_embeddings()
+
+    def set_input_embeddings(self, value):
+        self.text_model.set_input_embeddings(value)
+
+    def set_decoder(self, decoder):
+        self.text_model = decoder
+
+    def get_decoder(self):
+        return self.text_model
+
+    def get_image_tokens(self, pixel_values: torch.FloatTensor, image_sizes: torch.LongTensor):
+        """
+        Tokenizes images into discrete tokens with VQGAN module. Converts
+        obtained image tokens into BPE tokens and wraps with "boi" and "eoi"
+        special tokens.
+
+        Args:
+            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
+                The tensors corresponding to the input images.
+            image_sizes (`torch.LongTensor` of shape `(batch_size, 2)`):
+                The sizes of the images in the batch, being (height, width) for each image.
+        """
+        image_tokens_list = self.vqmodel.encode(pixel_values, image_sizes)
+        bpe_tokens_list = [self.vocabulary_mapping.convert_img2bpe(tokens).flatten() for tokens in image_tokens_list]
+        bpe_tokens = torch.cat(bpe_tokens_list)
+        return bpe_tokens
+
+    def get_image_features(self, pixel_values: torch.FloatTensor, image_sizes: torch.LongTensor):
+        """
+        Tokenizes images into discrete tokens with VQGAN module and embeds them with text embeddings layer.
+
+        Args:
+            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
+                The tensors corresponding to the input images.
+            image_sizes (`torch.LongTensor` of shape `(batch_size, 2)`): (height, width) of each image.
+        """
+        image_tokens = self.get_image_tokens(pixel_values, image_sizes)
+        split_sizes = [
+            (height // self.vqmodel.vision_spatial_factor) * (width // self.vqmodel.vision_spatial_factor + 1)
+            for height, width in image_sizes
+        ]
+        image_features = self.get_input_embeddings()(image_tokens)
+        image_features = torch.split(image_features, split_sizes)
+        return image_features
+
+    @torch.no_grad
+    def decode_image_tokens(self, image_tokens: torch.LongTensor, height: int, width: int):
+        """
+        Decodes generated image tokens from language model to continuous pixel values
+        with VQGAN module via upsampling.
+
+        Args:
+            image_tokens (`torch.LongTensor` of shape `(batch_size, num_of_tokens)`):
+                Generated BPE ids; the last 3 per sequence are dropped, the rest is viewed as rows of `width + 1`.
+            height (`int`):
+                Height of the generated image before upsampling.
+            width (`int`):
+                Width of the generated image before upsampling.
+        """
+        sequences = image_tokens[:, :-3].view(-1, height, width + 1)
+        image_tokens = self.vocabulary_mapping.convert_bpe2img(sequences)  # also strips each row's last token
+        image = self.vqmodel.decode(image_tokens)
+        return image
+
+    def get_placeholder_mask(
+        self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor, image_features: torch.FloatTensor
+    ):
+        """
+        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
+        equal to the length of multimodal features. If the lengths are different, an error is raised.
+        """
+        if input_ids is None:
+            special_image_mask = inputs_embeds == self.get_input_embeddings()(
+                torch.tensor(self.vocabulary_mapping.image_token_id, dtype=torch.long, device=inputs_embeds.device)
+            )
+            special_image_mask = special_image_mask.all(-1)
+        else:
+            special_image_mask = input_ids == self.vocabulary_mapping.image_token_id
+        n_image_tokens = special_image_mask.sum()
+        special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
+        # Count feature vectors for the message: every dim except the last (hidden size) indexes a feature.
+        n_image_features = math.prod(image_features.shape[:-1])
+        if inputs_embeds[special_image_mask].numel() != image_features.numel():
+            raise ValueError(
+                f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
+            )
+        return special_image_mask
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        image_sizes: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> Union[tuple, BaseModelOutputWithPast]:
+        r"""
+        image_sizes (`torch.LongTensor` of shape `(batch_size, 2)`):
+            The sizes of the images in the batch, being (height, width) for each image. Image sizes can be obtained using
+            [`AutoImageProcessor`]. See [`Emu3ImageProcessor.__call__`] for details ([`Emu3Processor`] uses
+            [`Emu3ImageProcessor`] for processing images).
+        """
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError(
+                "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"
+            )
+
+        if inputs_embeds is None:
+            inputs_embeds = self.get_input_embeddings()(input_ids)
+
+        if pixel_values is not None:
+            image_embeds = self.get_image_features(pixel_values, image_sizes)
+            image_embeds = torch.cat(image_embeds, dim=0)
+            special_image_mask = self.get_placeholder_mask(
+                input_ids, inputs_embeds=inputs_embeds, image_features=image_embeds
+            )
+            inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_embeds)
+
+        # The text model receives embeddings only; image placeholder positions were filled in above.
+        outputs = self.text_model(
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            **kwargs,
+        )
+
+        return outputs
+
+
+class Emu3ForConditionalGeneration(Emu3PreTrainedModel, GenerationMixin):
+    base_model_prefix = ""
+    _tied_weights_keys = ["lm_head.weight"]
+    _checkpoint_conversion_mapping = {
+        "^text_model.model": "model.text_model",
+        "^vqmodel": "model.vqmodel",
+        "^text_model.lm_head": "lm_head",
+    }
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.model = Emu3Model(config)
+        self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)
+
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.model.get_input_embeddings()
+
+    def set_input_embeddings(self, value):
+        self.model.set_input_embeddings(value)
+
+    def get_output_embeddings(self) -> nn.Module:
+        return self.lm_head
+
+    def set_decoder(self, decoder):
+        self.model.set_decoder(decoder)
+
+    def get_decoder(self):
+        return self.model.get_decoder()
+
+    # Make modules available through conditional class for BC
+    @property
+    def text_model(self):
+        return self.model.text_model
+
+    @property
+    def vqmodel(self):
+        return self.model.vqmodel
+
+    @property
+    def vocabulary_mapping(self):
+        return self.model.vocabulary_mapping
+
+    def decode_image_tokens(self, **kwargs):
+        return self.model.decode_image_tokens(**kwargs)
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        image_sizes: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> Union[tuple, CausalLMOutputWithPast]:
+        r"""
+        image_sizes (`torch.LongTensor` of shape `(batch_size, 2)`):
+            The sizes of the images in the batch, being (height, width) for each image. Image sizes can be obtained using
+            [`AutoImageProcessor`]. See [`Emu3ImageProcessor.__call__`] for details ([`Emu3Processor`] uses
+            [`Emu3ImageProcessor`] for processing images).
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+        Example:
+
+        ```python
+        >>> from transformers import Emu3Processor, Emu3ForConditionalGeneration
+        >>> import torch
+        >>> import requests
+        >>> from PIL import Image
+
+        >>> model = Emu3ForConditionalGeneration.from_pretrained("BAAI/Emu3-Chat-hf", dtype=torch.bfloat16)
+        >>> processor = Emu3Processor.from_pretrained("BAAI/Emu3-Chat-hf")
+
+        >>> conversation = [
+        ...     {
+        ...     "role": "system",
+        ...     "content": [
+        ...         {"type": "text", "text": "You are a helpful assistant."},
+        ...         ],
+        ...     },
+        ...     {
+        ...     "role": "user",
+        ...     "content": [
+        ...         {"type": "image"},
+        ...         {"type": "text", "text": "Please describe the image."},
+        ...         ],
+        ...     },
+        ... ]
+
+        >>> prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
+        >>> image = Image.open(requests.get("https://www.ilankelman.org/stopsigns/australia.jpg", stream=True).raw)
+
+        >>> inputs = processor(images=[image], text=[prompt], return_tensors="pt").to(model.device, torch.bfloat16)
+
+        >>> generated_ids = model.generate(**inputs, max_new_tokens=100, do_sample=False)
+        >>> processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+        ```"""
+        outputs = self.model(
+            input_ids=input_ids,
+            pixel_values=pixel_values,  # must reach `Emu3Model.forward`, which embeds the images
+            image_sizes=image_sizes,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            **kwargs,
+        )
+
+        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
+        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+        logits = self.lm_head(outputs[0][:, slice_indices, :])
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(
+                logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **kwargs
+            )
+
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+    def prepare_inputs_for_generation(
+        self,
+        input_ids,
+        past_key_values=None,
+        attention_mask=None,
+        inputs_embeds=None,
+        cache_position=None,
+        position_ids=None,
+        use_cache=True,
+        pixel_values=None,
+        **kwargs,
+    ):
+        # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
+
+        model_inputs = super().prepare_inputs_for_generation(
+            input_ids,
+            past_key_values=past_key_values,
+            attention_mask=attention_mask,
+            inputs_embeds=inputs_embeds,
+            cache_position=cache_position,
+            position_ids=position_ids,
+            pixel_values=pixel_values,
+            use_cache=use_cache,
+            **kwargs,
+        )
+
+        if cache_position[0] != 0:  # any step that does not start at cache position 0 runs without images
+            model_inputs["pixel_values"] = None
+
+        return model_inputs
+
+
+__all__ = [
+    "Emu3ForConditionalGeneration",
+    "Emu3ForCausalLM",
+    "Emu3TextModel",
+    "Emu3PreTrainedModel",
+    "Emu3VQVAE",
+    "Emu3Model",
+]
diff --git a/phivenv/Lib/site-packages/transformers/models/emu3/modular_emu3.py b/phivenv/Lib/site-packages/transformers/models/emu3/modular_emu3.py
new file mode 100644
index 0000000000000000000000000000000000000000..7f5ee236ef06a5b653d0bbbe985725b2971a6c83
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/emu3/modular_emu3.py
@@ -0,0 +1,1223 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc. team. All rights reserved.
+#
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+from functools import cached_property
+from typing import Optional, Union
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.utils.checkpoint
+
+from ...cache_utils import Cache
+from ...generation import GenerationMixin
+from ...modeling_outputs import CausalLMOutputWithPast
+from ...modeling_utils import PreTrainedModel
+from ...processing_utils import Unpack
+from ...utils import auto_docstring, can_return_tuple, logging
+from ...utils.deprecation import deprecate_kwarg
+from ..chameleon.modeling_chameleon import (
+    ChameleonPreTrainedModel,
+    ChameleonVQVAEEncoderConvDownsample,
+)
+from ..llama.modeling_llama import LlamaAttention, LlamaDecoderLayer, LlamaForCausalLM, LlamaModel, TransformersKwargs
+from ..siglip.modeling_siglip import SiglipAttention
+from .configuration_emu3 import Emu3Config, Emu3TextConfig, Emu3VQVAEConfig
+
+
+# Module-level logger, named after this module.
+logger = logging.get_logger(__name__)
+
+
+class Emu3Attention(LlamaAttention):
+    # No overrides: the Llama attention implementation is reused as-is under an Emu3-specific name.
+    pass
+
+
+# Has extra dropout which no other model in the library has
+class Emu3DecoderLayer(LlamaDecoderLayer):
+    """
+    Llama decoder layer with one extra `nn.Dropout` that is applied to the output of the self-attention
+    branch and to the output of the MLP branch, right before each residual addition.
+    """
+
+    def __init__(self, config: Emu3Config, layer_idx: int):
+        super().__init__(config, layer_idx)
+        # A single dropout module (rate `config.attention_dropout`) is shared by both branches.
+        self.dropout = nn.Dropout(config.attention_dropout)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        use_cache: Optional[bool] = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> torch.Tensor:
+        """Run the attention branch and then the MLP branch, each as `x + dropout(branch(norm(x)))`."""
+        # Attention branch (the second value returned by the attention module is discarded).
+        attn_input = self.input_layernorm(hidden_states)
+        attn_output, _ = self.self_attn(
+            hidden_states=attn_input,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            position_embeddings=position_embeddings,
+            **kwargs,
+        )
+        hidden_states = hidden_states + self.dropout(attn_output)
+
+        # MLP branch.
+        mlp_output = self.mlp(self.post_attention_layernorm(hidden_states))
+        return hidden_states + self.dropout(mlp_output)
+
+
+class Emu3VQVAEVectorQuantizer(nn.Module):
+    """
+    Vector quantizer backed by a learned codebook (`nn.Embedding`).
+
+    Implements the quantization step described in the VQ-VAE (Vector Quantized Variational
+    AutoEncoder) paper: each continuous input vector is replaced by the index of the closest
+    codebook vector under squared Euclidean distance. `forward` returns only those indices.
+    """
+
+    def __init__(self, config: Emu3VQVAEConfig):
+        super().__init__()
+        self.embedding = nn.Embedding(config.codebook_size, config.embed_dim)
+        # Codebook entries start uniformly in [-1/codebook_size, 1/codebook_size].
+        init_bound = 1.0 / config.codebook_size
+        self.embedding.weight.data.uniform_(-init_bound, init_bound)
+
+    def forward(self, hidden_state: torch.Tensor):
+        """
+        Args:
+            hidden_state: 5D tensor unpacked as (batch, temporal, channels, height, width).
+
+        Returns:
+            Tensor of codebook indices with shape (batch, temporal, height, width).
+        """
+        batch_size, temporal, channels, height, width = hidden_state.shape
+        # Channels last, then one row per (batch, time, y, x) position.
+        flat_inputs = hidden_state.permute(0, 1, 3, 4, 2).contiguous().view(-1, channels)
+        codebook = self.embedding.weight
+
+        # Squared distance via the expansion (z - e)^2 = z^2 + e^2 - 2 e * z
+        input_sq_norm = torch.sum(flat_inputs**2, dim=1, keepdim=True)
+        codebook_sq_norm = torch.sum(codebook**2, dim=1)
+        cross_term = 2 * torch.matmul(flat_inputs, codebook.transpose(0, 1))
+        distances = input_sq_norm + codebook_sq_norm - cross_term
+
+        nearest_code = torch.argmin(distances, dim=1)
+        return nearest_code.view(batch_size, temporal, height, width)
+
+
+class Emu3VQVAEEncoderConvDownsample(ChameleonVQVAEEncoderConvDownsample):
+    # No overrides: the Chameleon conv-downsample layer is reused under an Emu3-specific name.
+    pass
+
+
+class Emu3VQVAEEncoderConvUpsample(nn.Module):
+    """Upsample by a factor of 2 with nearest-neighbour interpolation, then apply a 3x3 convolution."""
+
+    def __init__(self, in_channels):
+        super().__init__()
+        # Channel-preserving conv; padding=1 with kernel 3 keeps the spatial size.
+        self.conv = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1)
+
+    def forward(self, hidden_states):
+        upsampled = F.interpolate(hidden_states, scale_factor=2.0, mode="nearest")
+        return self.conv(upsampled)
+
+
+class Emu3VQVAEConv3d(nn.Module):
+    """
+    3D convolution preceded by explicit padding.
+
+    The two trailing (spatial) axes are padded by `kernel - stride` in total, with the larger half
+    placed first; the temporal axis is always padded with 2 leading and 0 trailing frames.
+    """
+
+    def __init__(
+        self,
+        in_channel: int,
+        out_channel: int,
+        kernel_size: tuple[int],
+        stride: tuple[int],
+    ):
+        super().__init__()
+
+        # `F.pad` expects pairs starting from the LAST axis, so walk the spatial axes in reverse.
+        spatial_padding = []
+        for one_kernel, one_stride in zip(reversed(kernel_size[1:]), reversed(stride[1:])):
+            total = one_kernel - one_stride
+            spatial_padding.extend((total // 2 + total % 2, total // 2))
+        self.padding = tuple(spatial_padding) + (2, 0)
+
+        self.conv = nn.Conv3d(in_channel, out_channel, kernel_size, stride=stride)
+
+    def forward(self, hidden_states: torch.Tensor):
+        padded = F.pad(hidden_states, self.padding)
+        return self.conv(padded)
+
+
+class Emu3VQVAESpatialNorm(nn.Module):
+    """
+    Group normalization whose output is modulated by the quantized states:
+    `norm(hidden_states) * conv_y(quant_states) + conv_b(quant_states)`.
+
+    `in_channels` is the channel count of `quant_states`, `out_channels` the channel count of
+    `hidden_states` (normalized with 32 groups).
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+    ):
+        super().__init__()
+        self.norm_layer = nn.GroupNorm(num_groups=32, num_channels=out_channels, eps=1e-6, affine=True)
+
+        # Both modulation branches are 1x1 convolutions with identical hyper-parameters.
+        pointwise = {"kernel_size": 1, "stride": 1, "padding": 0}
+        self.conv_y = nn.Conv2d(in_channels, out_channels, **pointwise)
+        self.conv_b = nn.Conv2d(in_channels, out_channels, **pointwise)
+
+    def forward(self, hidden_states: torch.Tensor, quant_states: torch.Tensor):
+        # Bring the quantized states to the spatial size of the hidden states.
+        quant_states = F.interpolate(quant_states, size=hidden_states.shape[-2:], mode="nearest")
+        normalized = self.norm_layer(hidden_states)
+        scale = self.conv_y(quant_states)
+        shift = self.conv_b(quant_states)
+        return normalized * scale + shift
+
+
+class Emu3VQVAETemporalUpsample(nn.Module):
+    """Double the temporal axis with nearest-neighbour interpolation, then apply a 3x3x3 padded conv."""
+
+    def __init__(
+        self,
+        in_channel: int,
+        out_channel: int,
+    ):
+        super().__init__()
+        self.conv = Emu3VQVAEConv3d(in_channel, out_channel, kernel_size=(3, 3, 3), stride=(1, 1, 1))
+
+    def forward(self, hidden_states: torch.Tensor):
+        """`hidden_states` is unpacked as (batch, channels, temporal, height, width)."""
+        batch_size, channels, temporal, height, width = hidden_states.shape
+
+        # Move time to the last axis and merge (channels, height, width) so that the 1D
+        # interpolation below only stretches the temporal axis.
+        time_last = hidden_states.permute(0, 1, 3, 4, 2).contiguous().view(batch_size, -1, temporal)
+        time_last = F.interpolate(time_last, scale_factor=2.0, mode="nearest")
+
+        # Undo the merge and put time back right after the channel axis.
+        restored = time_last.view(batch_size, channels, height, width, -1).permute(0, 1, 4, 2, 3).contiguous()
+        return self.conv(restored)
+
+
+class Emu3VQVAETemporalDownsample(nn.Module):
+    """Padded 3D convolution with kernel (4, 3, 3) and stride (2, 1, 1), i.e. strided along time only."""
+
+    def __init__(
+        self,
+        in_channel: int,
+        out_channel: int,
+    ):
+        super().__init__()
+        self.conv = Emu3VQVAEConv3d(in_channel, out_channel, kernel_size=(4, 3, 3), stride=(2, 1, 1))
+
+    def forward(self, hidden_states: torch.Tensor):
+        return self.conv(hidden_states)
+
+
+class Emu3VQVAETemporalResnetBlock(nn.Module):
+    """
+    Residual block over 5D inputs: two (BatchNorm3d -> x * sigmoid(x) -> padded 3x3x3 conv) stages
+    plus a skip connection. When the channel count changes, the skip path goes through a 1x1x1 conv.
+
+    Args:
+        in_channels: number of input channels.
+        out_channels: number of output channels; defaults to `in_channels` when `None`.
+    """
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels=None,
+    ):
+        super().__init__()
+        # Resolve the default before any layer is built: the layers below must receive an actual
+        # channel count, never `None`.
+        out_channels = in_channels if out_channels is None else out_channels
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+
+        self.norm1 = nn.BatchNorm3d(in_channels)
+        self.conv1 = Emu3VQVAEConv3d(
+            in_channels,
+            out_channels,
+            kernel_size=(3, 3, 3),
+            stride=(1, 1, 1),
+        )
+        self.norm2 = nn.BatchNorm3d(out_channels)
+        self.conv2 = Emu3VQVAEConv3d(
+            out_channels,
+            out_channels,
+            kernel_size=(3, 3, 3),
+            stride=(1, 1, 1),
+        )
+        if self.in_channels != self.out_channels:
+            # Projection for the skip path so that it matches the main path's channel count.
+            self.nin_shortcut = nn.Conv3d(
+                in_channels,
+                out_channels,
+                kernel_size=1,
+                stride=1,
+                padding=0,
+            )
+
+    def forward(self, hidden_states):
+        residual = hidden_states
+        hidden_states = self.norm1(hidden_states)
+        # x * sigmoid(x), applied in place on the freshly normalized tensor
+        hidden_states *= torch.sigmoid(hidden_states)
+        hidden_states = self.conv1(hidden_states)
+
+        hidden_states = self.norm2(hidden_states)
+        hidden_states *= torch.sigmoid(hidden_states)
+        hidden_states = self.conv2(hidden_states)
+
+        if self.in_channels != self.out_channels:
+            residual = self.nin_shortcut(residual)
+
+        return residual + hidden_states
+
+
+class Emu3VQVAEResnetBlock(nn.Module):
+    """
+    2D residual block: two (norm -> x * sigmoid(x) -> 3x3 conv) stages plus a skip connection.
+
+    With `quant_channels=None` the norms are plain `nn.GroupNorm` layers; otherwise they are
+    `Emu3VQVAESpatialNorm` layers that additionally receive a conditioning tensor in `forward`.
+    When input and output channel counts differ, the skip path goes through a 1x1 conv.
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: Optional[int] = None,
+        quant_channels: Optional[int] = None,
+    ):
+        super().__init__()
+        if out_channels is None:
+            out_channels = in_channels
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.quant_channels = quant_channels
+
+        if quant_channels is not None:
+            self.norm1 = Emu3VQVAESpatialNorm(quant_channels, in_channels)
+            self.norm2 = Emu3VQVAESpatialNorm(quant_channels, out_channels)
+        else:
+            self.norm1 = nn.GroupNorm(num_channels=in_channels, num_groups=32, eps=1e-6, affine=True)
+            self.norm2 = nn.GroupNorm(num_channels=out_channels, num_groups=32, eps=1e-6, affine=True)
+
+        # Both main-path convolutions preserve the spatial size (kernel 3, padding 1).
+        same_size_conv = {"kernel_size": 3, "stride": 1, "padding": 1}
+        self.conv1 = nn.Conv2d(in_channels, out_channels, **same_size_conv)
+        self.conv2 = nn.Conv2d(out_channels, out_channels, **same_size_conv)
+
+        if in_channels != out_channels:
+            self.nin_shortcut = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0)
+
+    def forward(self, hidden_states: torch.Tensor, quant_channels: Optional[torch.Tensor] = None):
+        """
+        Args:
+            hidden_states: input feature map.
+            quant_channels: despite its name this is the conditioning tensor handed to the spatial
+                norms; it is ignored when the block was built with `quant_channels=None`.
+        """
+        extra_norm_args = (quant_channels,) if self.quant_channels is not None else ()
+
+        shortcut = hidden_states
+        hidden_states = self.norm1(hidden_states, *extra_norm_args)
+        hidden_states *= torch.sigmoid(hidden_states)
+        hidden_states = self.conv1(hidden_states)
+
+        hidden_states = self.norm2(hidden_states, *extra_norm_args)
+        hidden_states *= torch.sigmoid(hidden_states)
+        hidden_states = self.conv2(hidden_states)
+
+        if self.in_channels != self.out_channels:
+            shortcut = self.nin_shortcut(shortcut)
+        return shortcut + hidden_states
+
+
+class Emu3VQVAEAttentionBlock(SiglipAttention):
+    # Siglip attention reused for the VQ-VAE; the only addition is the attribute set below.
+    def __init__(self, config: Emu3VQVAEConfig):
+        super().__init__(config)
+
+        # for compatibility with the attention interface
+        self.num_key_value_groups = 1
+
+
+class Emu3VQVAEGroupNorm(nn.GroupNorm):
+    """
+    `torch.nn.GroupNorm` whose `forward` additionally accepts an optional `quant_states`
+    argument and ignores it. This gives it the same call signature as the spatial norm, so
+    callers can use either one without branching.
+    """
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+    def forward(self, input, quant_states=None):
+        # `quant_states` is accepted only for signature compatibility and is never read.
+        return F.group_norm(
+            input,
+            num_groups=self.num_groups,
+            weight=self.weight,
+            bias=self.bias,
+            eps=self.eps,
+        )
+
+
+class Emu3VQVAEMiddleBlock(nn.Module):
+    """
+    ResNet block -> (norm + self-attention with residual) -> ResNet block, all at `in_channels`.
+
+    When `quant_channels` is given, the ResNet blocks and the attention norm are the variants
+    that take the quantized states as an extra input; otherwise plain group norms are used.
+    """
+
+    def __init__(self, config, in_channels, quant_channels=None):
+        super().__init__()
+
+        resnet_kwargs = {
+            "in_channels": in_channels,
+            "out_channels": in_channels,
+            "quant_channels": quant_channels,
+        }
+        self.block_1 = Emu3VQVAEResnetBlock(**resnet_kwargs)
+        self.attn_1 = Emu3VQVAEAttentionBlock(config)
+        if quant_channels is not None:
+            self.attn_norm = Emu3VQVAESpatialNorm(quant_channels, in_channels)
+        else:
+            self.attn_norm = Emu3VQVAEGroupNorm(num_channels=in_channels, num_groups=32, eps=1e-6, affine=True)
+        self.block_2 = Emu3VQVAEResnetBlock(**resnet_kwargs)
+
+    def forward(self, hidden_states: torch.FloatTensor, quant_states: Optional[torch.FloatTensor] = None):
+        hidden_states = self.block_1(hidden_states, quant_states)
+
+        attn_input = self.attn_norm(hidden_states, quant_states)
+        batch_size, channels, height, width = attn_input.shape
+        # Flatten the spatial grid into a sequence: (batch, height * width, channels).
+        tokens = attn_input.view(batch_size, channels, height * width).transpose(1, 2)
+        attn_output = self.attn_1(tokens)[0]
+        # Back to a channels-first feature map.
+        attn_output = attn_output.reshape(batch_size, height, width, channels).permute(0, 3, 1, 2)
+        hidden_states = hidden_states + attn_output
+
+        return self.block_2(hidden_states, quant_states)
+
+
+class Emu3VQVAEDownBlock(nn.Module):
+    """
+    Downsampling stack: one level per entry of `config.channel_multiplier`.
+
+    Each level holds `config.num_res_blocks` ResNet blocks. If the level index is listed in
+    `config.attn_resolutions`, every ResNet block is followed by a GroupNorm + self-attention
+    step with a residual connection. Every level except the last ends with a conv downsample.
+    """
+
+    def __init__(self, config):
+        super().__init__()
+
+        self.num_resolutions = len(config.channel_multiplier)
+        self.num_res_blocks = config.num_res_blocks
+        base_channels = config.base_channels
+        channel_multiplier = config.channel_multiplier
+
+        # Input multiplier of level i is the output multiplier of level i - 1 (1 for the first level).
+        in_channel_multiplier = (1,) + tuple(channel_multiplier)
+        self.in_channel_multiplier = in_channel_multiplier
+        self.down = nn.ModuleList()
+        for i_level in range(self.num_resolutions):
+            block = nn.ModuleList()
+            attn = nn.ModuleList()
+            attn_norms = nn.ModuleList()
+            block_in = base_channels * in_channel_multiplier[i_level]
+            block_out = base_channels * channel_multiplier[i_level]
+            for i_block in range(self.num_res_blocks):
+                block.append(
+                    Emu3VQVAEResnetBlock(
+                        in_channels=block_in,
+                        out_channels=block_out,
+                    )
+                )
+                # Only the first block of a level changes the channel count.
+                block_in = block_out
+                # `attn_resolutions` is matched against the level index.
+                if config.attn_resolutions is not None and i_level in config.attn_resolutions:
+                    attn.append(Emu3VQVAEAttentionBlock(config))
+                    attn_norms.append(nn.GroupNorm(num_channels=block_in, num_groups=32, eps=1e-6, affine=True))
+
+            # Bare nn.Module used as a container for this level's submodules.
+            down = nn.Module()
+            down.block = block
+            down.attn = attn
+            down.attn_norms = attn_norms
+            if i_level != self.num_resolutions - 1:
+                down.downsample = Emu3VQVAEEncoderConvDownsample(block_in)
+            self.down.append(down)
+
+    def forward(self, hidden_states: torch.FloatTensor):
+        for i_level, blocks in enumerate(self.down):
+            for i_block in range(self.num_res_blocks):
+                hidden_states = blocks.block[i_block](hidden_states)
+                # Levels without attention have an empty `attn` list.
+                if len(blocks.attn) > 0:
+                    residual = hidden_states
+                    hidden_states = blocks.attn_norms[i_block](hidden_states)
+
+                    # Flatten the spatial grid into a sequence: (batch, height * width, channels).
+                    batch_size, channels, height, width = hidden_states.shape
+                    hidden_states = hidden_states.view(batch_size, channels, height * width).transpose(1, 2)
+                    hidden_states = blocks.attn[i_block](hidden_states)[0]
+
+                    # Back to a channels-first feature map before the residual add.
+                    hidden_states = hidden_states.reshape(batch_size, height, width, channels).permute(0, 3, 1, 2)
+                    hidden_states = residual + hidden_states
+
+            # The last level has no `downsample` attribute.
+            if i_level != self.num_resolutions - 1:
+                hidden_states = blocks.downsample(hidden_states)
+
+        return hidden_states
+
+
+class Emu3VQVAEUpBlock(nn.Module):
+    """
+    Upsampling stack conditioned on the quantized states: one level per entry of
+    `config.channel_multiplier`, built from the last entry down to the first.
+
+    Each level holds `config.num_res_blocks + 1` ResNet blocks. If the level index is listed in
+    `config.attn_resolutions`, every ResNet block is followed by a spatial norm + self-attention
+    step with a residual connection. Every level except level 0 ends with a conv upsample.
+    Levels are stored in `self.up` by level index, so `self.up[0]` is level 0.
+    """
+
+    def __init__(self, config):
+        super().__init__()
+
+        self.num_resolutions = len(config.channel_multiplier)
+        self.num_res_blocks = config.num_res_blocks
+
+        quant_channels = config.embed_dim
+        block_in = config.base_channels * config.channel_multiplier[-1]
+        # A missing `attn_resolutions` means "no attention at any level", as in the down block;
+        # without this guard the membership test below raises TypeError on None.
+        attn_resolutions = () if config.attn_resolutions is None else config.attn_resolutions
+
+        self.up = nn.ModuleList()
+        for i_level in reversed(range(self.num_resolutions)):
+            block = nn.ModuleList()
+            attn = nn.ModuleList()
+            attn_norms = nn.ModuleList()
+            block_out = config.base_channels * config.channel_multiplier[i_level]
+            for _ in range(self.num_res_blocks + 1):
+                block.append(
+                    Emu3VQVAEResnetBlock(
+                        in_channels=block_in,
+                        out_channels=block_out,
+                        quant_channels=quant_channels,
+                    )
+                )
+                # Only the first block of a level changes the channel count.
+                block_in = block_out
+                if i_level in attn_resolutions:
+                    attn.append(Emu3VQVAEAttentionBlock(config))
+                    attn_norms.append(Emu3VQVAESpatialNorm(quant_channels, block_in))
+
+            # Bare nn.Module used as a container for this level's submodules.
+            up = nn.Module()
+            up.block = block
+            up.attn = attn
+            up.attn_norms = attn_norms
+            if i_level != 0:
+                up.upsample = Emu3VQVAEEncoderConvUpsample(block_in)
+
+            # Levels are created from last to first; inserting at the front keeps `self.up`
+            # ordered by level index.
+            self.up.insert(0, up)
+
+    def forward(self, hidden_states: torch.FloatTensor, quant_states: torch.FloatTensor):
+        # Run the levels in creation order (last level first, level 0 last).
+        for i_level, blocks in enumerate(self.up[::-1]):
+            for i_block in range(self.num_res_blocks + 1):
+                hidden_states = blocks.block[i_block](hidden_states, quant_states)
+                # Levels without attention have an empty `attn` list.
+                if len(blocks.attn) > 0:
+                    residual = hidden_states
+                    hidden_states = blocks.attn_norms[i_block](hidden_states, quant_states)
+
+                    # Flatten the spatial grid into a sequence: (batch, height * width, channels).
+                    batch_size, channels, height, width = hidden_states.shape
+                    hidden_states = hidden_states.view(batch_size, channels, height * width).transpose(1, 2)
+                    hidden_states = blocks.attn[i_block](hidden_states)[0]
+
+                    # Back to a channels-first feature map before the residual add.
+                    hidden_states = hidden_states.reshape(batch_size, height, width, channels).permute(0, 3, 1, 2)
+                    hidden_states = residual + hidden_states
+            # The level visited last (level 0) has no `upsample` attribute.
+            if i_level != len(self.up) - 1:
+                hidden_states = blocks.upsample(hidden_states)
+
+        return hidden_states
+
+
+class Emu3VQVAEEncoder(nn.Module):
+    """
+    VQ-VAE encoder: a per-frame 2D stack (`conv_in`, down block, middle block, norm + `conv_out`)
+    followed by temporal downsampling convolutions and temporal ResNet blocks.
+    """
+
+    def __init__(self, config):
+        super().__init__()
+
+        base_channels = config.base_channels
+        in_channels = config.in_channels
+        double_latent = config.double_latent
+        latent_channels = config.latent_channels
+        channel_multiplier = config.channel_multiplier
+        # `double_latent` doubles the number of channels produced by the encoder.
+        out_channels = 2 * latent_channels if double_latent else latent_channels
+        # Channel count at the end of the down block.
+        block_in = base_channels * channel_multiplier[-1]
+
+        self.conv_in = torch.nn.Conv2d(in_channels, base_channels, kernel_size=3, stride=1, padding=1)
+        self.down_block = Emu3VQVAEDownBlock(config)
+        self.middle_block = Emu3VQVAEMiddleBlock(config, block_in)
+
+        self.norm_out = torch.nn.GroupNorm(num_groups=32, num_channels=block_in, eps=1e-6, affine=True)
+        self.conv_out = torch.nn.Conv2d(
+            block_in,
+            out_channels,
+            kernel_size=3,
+            stride=1,
+            padding=1,
+        )
+
+        # One temporal downsample conv per factor of two in `temporal_downsample_factor`.
+        temporal_down_blocks = int(math.log2(config.temporal_downsample_factor))
+        self.time_conv = nn.ModuleList()
+        self.time_res_stack = nn.ModuleList()
+
+        for i in range(temporal_down_blocks):
+            conv = Emu3VQVAETemporalDownsample(out_channels, out_channels)
+            self.time_conv.append(conv)
+
+        for _ in range(config.num_res_blocks):
+            time_res_conv = Emu3VQVAETemporalResnetBlock(
+                in_channels=out_channels,
+                out_channels=out_channels,
+            )
+            self.time_res_stack.append(time_res_conv)
+
+    def forward(self, pixel_values: torch.LongTensor):
+        # Axis 1 is taken as the temporal axis and folded into the batch axis, so the 2D layers
+        # below process every frame independently.
+        # NOTE(review): the `LongTensor` annotation looks odd for convolution input - confirm.
+        temporal_dim = pixel_values.shape[1]
+        pixel_values = pixel_values.reshape(-1, *pixel_values.shape[2:])
+
+        # downsampling & middle
+        hidden_states = self.conv_in(pixel_values)
+        hidden_states = self.down_block(hidden_states)
+        hidden_states = self.middle_block(hidden_states)
+
+        # end
+        hidden_states = self.norm_out(hidden_states)
+        # x * sigmoid(x), applied in place
+        hidden_states *= torch.sigmoid(hidden_states)
+        hidden_states = self.conv_out(hidden_states)
+
+        # Split the temporal axis back out of the batch axis, then swap axes 1 and 2 so the
+        # channel axis comes before the temporal axis for the 3D layers.
+        hidden_states = hidden_states.reshape(-1, temporal_dim, *hidden_states.shape[1:])
+        hidden_states = hidden_states.permute(0, 2, 1, 3, 4)
+
+        # temporal convs
+        for conv in self.time_conv:
+            hidden_states = conv(hidden_states)
+            hidden_states *= torch.sigmoid(hidden_states)
+
+        for layer in self.time_res_stack:
+            hidden_states = layer(hidden_states)
+
+        # Swap axes 1 and 2 back, so the temporal axis is again axis 1 in the result.
+        hidden_states = hidden_states.permute(0, 2, 1, 3, 4)
+
+        return hidden_states
+
+
+class Emu3VQVAEDecoder(nn.Module):
+    """
+    VQ-VAE decoder: temporal ResNet blocks and temporal upsampling convolutions, followed by a
+    per-frame 2D stack (`conv_in`, middle block, up block, spatial norm + `conv_out`) that is
+    conditioned on the quantized states.
+    """
+
+    def __init__(self, config: Emu3VQVAEConfig):
+        super().__init__()
+
+        quant_channels = config.embed_dim
+        # Channel count at the start of the 2D stack (last entry of the channel multipliers).
+        block_in = config.base_channels * config.channel_multiplier[-1]
+        self.time_res_stack = nn.ModuleList()
+        for _ in range(config.num_res_blocks):
+            time_res_conv = Emu3VQVAETemporalResnetBlock(
+                in_channels=config.latent_channels, out_channels=config.latent_channels
+            )
+            self.time_res_stack.append(time_res_conv)
+
+        # One temporal upsample conv per factor of two in `temporal_downsample_factor`.
+        temp_upsample_block_num = int(math.log2(config.temporal_downsample_factor))
+        self.time_conv = nn.ModuleList()
+        for i in range(temp_upsample_block_num):
+            conv = Emu3VQVAETemporalUpsample(config.latent_channels, config.latent_channels)
+            self.time_conv.append(conv)
+
+        self.conv_in = nn.Conv2d(
+            config.latent_channels,
+            block_in,
+            kernel_size=3,
+            stride=1,
+            padding=1,
+        )
+
+        self.middle_block = Emu3VQVAEMiddleBlock(config, block_in, quant_channels=quant_channels)
+        self.up_block = Emu3VQVAEUpBlock(config)
+
+        # Channel count at the end of the up block (first entry of the channel multipliers).
+        block_in = config.base_channels * config.channel_multiplier[0]
+        self.norm_out = Emu3VQVAESpatialNorm(quant_channels, block_in)
+        self.conv_out = nn.Conv2d(
+            block_in,
+            config.out_channels,
+            kernel_size=3,
+            stride=1,
+            padding=1,
+        )
+
+    def forward(self, hidden_states: torch.Tensor, quant_states: torch.Tensor):
+        # Stack both inputs along the batch axis so they share one pass through the temporal
+        # layers; this requires them to have identical non-batch shapes.
+        hidden_quant_states = torch.cat((hidden_states, quant_states), dim=0)
+        # Swap axes 1 and 2 for the 3D layers.
+        hidden_quant_states = hidden_quant_states.permute(0, 2, 1, 3, 4)
+
+        # temporal convs
+        for layer in self.time_res_stack:
+            hidden_quant_states = layer(hidden_quant_states)
+
+        for layer in self.time_conv:
+            hidden_quant_states = layer(hidden_quant_states)
+            # x * sigmoid(x), applied in place
+            hidden_quant_states *= torch.sigmoid(hidden_quant_states)
+
+        # Swap axes 1 and 2 back, split the two halves apart again, and fold axis 1 of each
+        # half into the batch axis for the 2D layers.
+        hidden_quant_states = hidden_quant_states.permute(0, 2, 1, 3, 4)
+        hidden_states, quant_states = torch.chunk(hidden_quant_states, 2, dim=0)
+        hidden_states = hidden_states.reshape(-1, *hidden_states.shape[2:])
+        quant_states = quant_states.reshape(-1, *quant_states.shape[2:])
+
+        hidden_states = self.conv_in(hidden_states)
+
+        # middle & upsampling
+        hidden_states = self.middle_block(hidden_states, quant_states)
+        hidden_states = self.up_block(hidden_states, quant_states)
+
+        hidden_states = self.norm_out(hidden_states, quant_states)
+        hidden_states *= torch.sigmoid(hidden_states)
+        hidden_states = self.conv_out(hidden_states)
+
+        return hidden_states
+
+
+@auto_docstring(
+    custom_intro="""
+    The VQ-VAE model used in Emu3 for encoding/decoding images into discrete tokens.
+    This model follows the "Make-a-scene: Scene-based text-to-image generation with human priors" paper from
+    [ Oran Gafni, Adam Polyak, Oron Ashual, Shelly Sheynin, Devi Parikh, and Yaniv
+    Taigman](https://huggingface.co/papers/2203.13131).
+    """
+)
+class Emu3VQVAE(PreTrainedModel):
+    config: Emu3VQVAEConfig
+    base_model_prefix = "emuvideovq"
+    main_input_name = "pixel_values"
+    _supports_sdpa = True
+    _supports_flash_attn = True
+    _supports_flex_attn = True
+    _supports_attention_backend = True
+    _no_split_modules = [
+        "Emu3VQVAETemporalResnetBlock",
+        "Emu3VQVAEAttentionBlock",
+        "Emu3VQVAEResnetBlock",
+        "Emu3VQVAEVectorQuantizer",
+    ]
+
+    # Per-module weight initialisation, dispatched on the module type.
+    def _init_weights(self, module):
+        if isinstance(module, (nn.Conv2d, nn.Conv3d)):
+            # Kaiming-normal weights (fan_out, relu gain); bias uniform in +/- 1/sqrt(fan_in).
+            nn.init.kaiming_normal_(module.weight, mode="fan_out", nonlinearity="relu")
+            if module.bias is not None:
+                fan_in, _ = nn.init._calculate_fan_in_and_fan_out(module.weight)
+                bound = 1 / math.sqrt(fan_in)
+                nn.init.uniform_(module.bias, -bound, bound)
+        elif isinstance(module, nn.Linear):
+            # Kaiming-uniform weights with a=sqrt(5); bias uniform in +/- 1/sqrt(fan_in).
+            nn.init.kaiming_uniform_(module.weight, a=math.sqrt(5))
+            if module.bias is not None:
+                fan_in, _ = nn.init._calculate_fan_in_and_fan_out(module.weight)
+                bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0
+                nn.init.uniform_(module.bias, -bound, bound)
+        elif isinstance(module, (nn.BatchNorm2d, nn.BatchNorm3d, nn.GroupNorm)):
+            # Normalisation layers: weight 1, bias 0.
+            nn.init.constant_(module.weight, 1.0)
+            nn.init.constant_(module.bias, 0.0)
+        elif isinstance(module, nn.Embedding):
+            # Standard-normal embeddings, with the padding row (if any) zeroed.
+            module.weight.data.normal_()
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+
+    def __init__(self, config: Emu3VQVAEConfig):
+        super().__init__(config)
+
+        self.config = config
+
+        self.encoder = Emu3VQVAEEncoder(config)
+        self.decoder = Emu3VQVAEDecoder(config)
+        self.quantize = Emu3VQVAEVectorQuantizer(config)
+        # Factor of two per channel-multiplier level except one; used by `encode` to crop token grids.
+        self.vision_spatial_factor = 2 ** (len(config.channel_multiplier) - 1)
+
+        # Kernel (3, 1, 1): these convolutions mix along the temporal axis only.
+        self.quant_conv = Emu3VQVAEConv3d(
+            config.latent_channels, config.embed_dim, kernel_size=(3, 1, 1), stride=(1, 1, 1)
+        )
+        self.post_quant_conv = Emu3VQVAEConv3d(
+            config.embed_dim, config.latent_channels, kernel_size=(3, 1, 1), stride=(1, 1, 1)
+        )
+        # Same value as `vision_spatial_factor`; used by `decode` to compute the output size.
+        self.spatial_scale_factor = 2 ** (len(config.channel_multiplier) - 1)
+        self.eval()  # Emu3's VQ model is frozen
+
+        self.post_init()
+
+    def encode(self, pixel_values: torch.Tensor, image_sizes: torch.Tensor):
+        """
+        Encode images (4D input) or videos (5D input) into grids of codebook indices.
+
+        A 4D input is treated as a batch of single images: each one is repeated
+        `config.temporal_downsample_factor` times along a new temporal axis before encoding, and
+        that axis is squeezed out of the resulting codes again.
+
+        Returns a list with one token grid per batch element, cropped on its first two axes to
+        `size / vision_spatial_factor`, where `size` is the matching entry of `image_sizes`.
+        """
+        is_image = pixel_values.ndim == 4
+        if is_image:
+            temporal = self.config.temporal_downsample_factor
+            batch_size, channels, height, width = pixel_values.shape
+            pixel_values = pixel_values.unsqueeze(1).repeat(1, temporal, 1, 1, 1)
+        else:
+            batch_size, temporal, channels, height, width = pixel_values.shape
+
+        hidden_states = self.encoder(pixel_values)
+
+        # b t c h w -> b c t h w
+        hidden_states = hidden_states.permute(0, 2, 1, 3, 4)
+        hidden_states = self.quant_conv(hidden_states)
+
+        # b c t h w -> b t c h w
+        hidden_states = hidden_states.permute(0, 2, 1, 3, 4)
+        codes = self.quantize(hidden_states)
+
+        image_tokens = codes.squeeze(1) if is_image else codes
+
+        # presumably each `size` is (height, width) in pixels - TODO confirm against the caller
+        image_tokens = [
+            single_image[: int(size[0] / self.vision_spatial_factor), : int(size[1] / self.vision_spatial_factor)]
+            for single_image, size in zip(image_tokens, image_sizes)
+        ]
+
+        return image_tokens
+
+    def decode(self, hidden_states: torch.Tensor):
+        """
+        Decode grids of codebook indices back into pixel space.
+
+        A 3D input is treated as a batch of single images and gets a temporal axis of length 1;
+        in that case only the first decoded frame is returned. Otherwise the full reshaped
+        (batch, frames, channels, height, width) tensor is returned.
+        """
+        is_image = hidden_states.ndim == 3
+        if is_image:
+            hidden_states = hidden_states.unsqueeze(1)
+
+        batch_size, temporal, height, width = hidden_states.shape
+        # Look up the codebook vector of every index.
+        quant = self.quantize.embedding(hidden_states.flatten())
+
+        channels = quant.shape[-1]
+        # b t h w c -> b c t h w
+        quant = quant.view(batch_size, temporal, height, width, channels).permute(0, 4, 1, 2, 3).contiguous()
+        post_quant = self.post_quant_conv(quant)
+
+        # b c t h w -> b t c h w
+        quant = quant.permute(0, 2, 1, 3, 4)
+        post_quant = post_quant.permute(0, 2, 1, 3, 4)
+
+        video = self.decoder(post_quant, quant)
+        video = video.reshape(
+            batch_size,
+            temporal * self.config.temporal_downsample_factor,
+            self.config.out_channels,
+            height * self.spatial_scale_factor,
+            width * self.spatial_scale_factor,
+        )
+        return video[:, 0] if is_image else video
+
+
+class Emu3ImageVocabularyMapping:
+    """
+    A class for mapping discrete image tokens from VQGAN to BPE tokens.
+
+    `vocab_map` maps token strings to BPE ids. Entries whose name starts with `"<|visual token"`
+    are the image tokens; the image-token index is parsed from the six characters that precede
+    the final two characters of the name. All derived tables are computed lazily and cached.
+    """
+
+    def __init__(self, vocab_map):
+        self.vocab_map = vocab_map
+        # Both ids are `None` when the corresponding token is missing from the vocabulary.
+        self.eol_token_id = vocab_map.get("<|extra_200|>")
+        self.image_token_id = vocab_map.get("<image>")
+
+    @staticmethod
+    def _dense_lookup(id_map):
+        # Turn an {int: int} dict into a 1D int tensor indexed by key (unmapped slots stay 0).
+        lookup = torch.zeros(max(id_map) + 1, dtype=torch.int)
+        for key, value in id_map.items():
+            lookup[key] = value
+        return lookup
+
+    @cached_property
+    def image_tokens(self):
+        """Sorted BPE ids of all visual tokens."""
+        return sorted(val for name, val in self.vocab_map.items() if name.startswith("<|visual token"))
+
+    @cached_property
+    def image_tokens_str(self):
+        """Sorted names of all visual tokens."""
+        return sorted(name for name in self.vocab_map if name.startswith("<|visual token"))
+
+    @cached_property
+    def img2bpe(self):
+        """Dict from image-token index (parsed from the token name) to BPE id."""
+        return {int(token[-8:-2]): self.vocab_map[token] for token in self.image_tokens_str}
+
+    @cached_property
+    def bpe2img(self):
+        """Inverse of `img2bpe`: dict from BPE id to image-token index."""
+        return {v: k for k, v in self.img2bpe.items()}
+
+    @cached_property
+    def bpe2img_mapping_tensor(self):
+        """`bpe2img` as a dense lookup tensor indexed by BPE id."""
+        return self._dense_lookup(self.bpe2img)
+
+    @cached_property
+    def img2bpe_mapping_tensor(self):
+        """`img2bpe` as a dense lookup tensor indexed by image-token index."""
+        return self._dense_lookup(self.img2bpe)
+
+    def convert_img2bpe(self, img_batch: torch.Tensor) -> torch.Tensor:
+        """
+        Map a 2D tensor of image-token indices to BPE ids and append one end-of-line token
+        (`eol_token_id`) to every row. The lookup runs on CPU; the result is moved back to the
+        device of `img_batch`.
+        """
+        device = img_batch.device
+        eol_row = torch.full((img_batch.shape[0], 1), self.eol_token_id, dtype=torch.int)
+        img_tokens = self.img2bpe_mapping_tensor[img_batch.to("cpu")]
+        img_tokens = torch.cat([img_tokens, eol_row], dim=-1)
+        return img_tokens.to(device)
+
+    def convert_bpe2img(self, img_batch: torch.Tensor) -> torch.Tensor:
+        """
+        Inverse of `convert_img2bpe`: drop the last entry along the final axis (the end-of-line
+        tokens) and map the remaining BPE ids back to image-token indices.
+        """
+        device = img_batch.device
+        img_batch = img_batch[..., :-1]  # remove last row of EOL tokens
+        img_tokens = self.bpe2img_mapping_tensor[img_batch.to("cpu")]
+        return img_tokens.to(device)
+
+
+class Emu3PreTrainedModel(ChameleonPreTrainedModel, Emu3VQVAE):
+    _no_split_modules = [
+        "Emu3DecoderLayer",
+    ]
+    _supports_flex_attn = True
+    _supports_attention_backend = True
+
+
+class Emu3TextModel(LlamaModel, Emu3PreTrainedModel):
+    _can_record_outputs = {
+        "hidden_states": Emu3DecoderLayer,
+        "attentions": Emu3Attention,
+    }
+
+    def __init__(self, config: Emu3Config):
+        super().__init__(config)
+        self.layers = nn.ModuleList(
+            [Emu3DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+        )
+
+
+class Emu3ForCausalLM(LlamaForCausalLM, Emu3PreTrainedModel, GenerationMixin):
+    config: Emu3TextConfig
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.model = Emu3TextModel(config)
+
+    def forward(**super_kwargs):
+        r"""
+        Example:
+
+        ```python
+        >>> from transformers import Emu3Processor, Emu3ForCausalLM
+        >>> import torch
+        >>> import requests
+        >>> from PIL import Image
+
+        >>> model = Emu3ForCausalLM.from_pretrained("BAAI/Emu3-Chat-hf", dtype=torch.bfloat16)
+        >>> processor = Emu3Processor.from_pretrained("BAAI/Emu3-Chat-hf")
+
+        >>> inputs = processor(text=["Can you write me a poem about winter."], return_tensors="pt").to(model.device)
+
+        >>> generated_ids = model.generate(**inputs, max_new_tokens=100, do_sample=False)
+        >>> processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+        ```"""
+        super().forward()
+
+
+class Emu3Model(Emu3PreTrainedModel):
+    _checkpoint_conversion_mapping = {"text_model.model": "text_model"}
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.text_model = Emu3TextModel._from_config(config.text_config)
+        self.vqmodel = Emu3VQVAE(config.vq_config)
+        self.vocabulary_mapping = Emu3ImageVocabularyMapping(config.vocabulary_map)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.text_model.get_input_embeddings()
+
+    def set_input_embeddings(self, value):
+        self.text_model.set_input_embeddings(value)
+
+    def set_decoder(self, decoder):
+        self.text_model = decoder
+
+    def get_decoder(self):
+        return self.text_model
+
+    def get_image_tokens(self, pixel_values: torch.FloatTensor, image_sizes: torch.LongTensor):
+        """
+        Tokenizes images into discrete tokens with VQGAN module. Converts
+        obtained image tokens into BPE tokens, appends an end-of-line token to
+        every row of each image and returns all tokens as one flat tensor.
+
+        Args:
+            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
+                The tensors corresponding to the input images.
+            image_sizes (`torch.LongTensor` of shape `(batch_size, 2)`):
+                The sizes of the images in the batch, being (height, width) for each image.
+        """
+        image_tokens_list = self.vqmodel.encode(pixel_values, image_sizes)
+        bpe_tokens_list = [self.vocabulary_mapping.convert_img2bpe(tokens).flatten() for tokens in image_tokens_list]
+        bpe_tokens = torch.cat(bpe_tokens_list)
+        return bpe_tokens
+
+    def get_image_features(self, pixel_values: torch.FloatTensor, image_sizes: torch.LongTensor):
+        """
+        Tokenizes images into discrete tokens with VQGAN module and embeds
+        them with text embeddings layer
+
+        Args:
+            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
+                The tensors corresponding to the input images.
+        """
+        image_tokens = self.get_image_tokens(pixel_values, image_sizes)
+        split_sizes = [
+            (height // self.vqmodel.vision_spatial_factor) * (width // self.vqmodel.vision_spatial_factor + 1)
+            for height, width in image_sizes
+        ]
+        image_features = self.get_input_embeddings()(image_tokens)
+        image_features = torch.split(image_features, split_sizes)
+        return image_features
+
+    @torch.no_grad
+    def decode_image_tokens(self, image_tokens: torch.LongTensor, height: int, width: int):
+        """
+        Decodes generated image tokens from language model to continuous pixel values
+        with VQGAN module via upsampling.
+
+        Args:
+            image_tokens (`torch.LongTensor` of shape `(batch_size, num_of_tokens)`):
+                The tensors corresponding to the input images.
+            height (`int`):
+                Height of the generated image before upsampling.
+            width (`int`):
+                Width of the generated image before upsampling.
+        """
+        sequences = image_tokens[:, :-3].view(-1, height, width + 1)
+        image_tokens = self.vocabulary_mapping.convert_bpe2img(sequences)
+        image = self.vqmodel.decode(image_tokens)
+        return image
+
+    def get_placeholder_mask(
+        self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor, image_features: torch.FloatTensor
+    ):
+        """
+        Builds a boolean mask, expanded to the shape of `inputs_embeds`, that marks the image placeholder positions
+        (from `input_ids`, or by comparing embeddings when it is None). Raises `ValueError` on a size mismatch.
+        """
+        if input_ids is None:
+            special_image_mask = inputs_embeds == self.get_input_embeddings()(
+                torch.tensor(self.vocabulary_mapping.image_token_id, dtype=torch.long, device=inputs_embeds.device)
+            )
+            special_image_mask = special_image_mask.all(-1)
+        else:
+            special_image_mask = input_ids == self.vocabulary_mapping.image_token_id
+
+        n_image_tokens = special_image_mask.sum()
+        special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
+        n_image_features = image_features.shape[:-1].numel()  # number of feature vectors, whatever the rank
+        if inputs_embeds[special_image_mask].numel() != image_features.numel():
+            raise ValueError(
+                f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
+            )
+        return special_image_mask
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        pixel_values: torch.FloatTensor = None,
+        image_sizes: torch.Tensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> Union[tuple, CausalLMOutputWithPast]:
+        r"""
+        image_sizes (`torch.LongTensor` of shape `(batch_size, 2)`):
+            The sizes of the images in the batch, being (height, width) for each image. Image sizes can be obtained using
+            [`AutoImageProcessor`]. See [`Emu3ImageProcessor.__call__`] for details ([`Emu3Processor`] uses
+            [`Emu3ImageProcessor`] for processing images).
+        """
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError(
+                "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"
+            )
+
+        if inputs_embeds is None:
+            inputs_embeds = self.get_input_embeddings()(input_ids)
+
+        if pixel_values is not None:
+            image_embeds = self.get_image_features(pixel_values, image_sizes)
+            image_embeds = torch.cat(image_embeds, dim=0)
+            special_image_mask = self.get_placeholder_mask(
+                input_ids, inputs_embeds=inputs_embeds, image_features=image_embeds
+            )
+            inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_embeds)
+
+        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
+        outputs = self.text_model(
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            **kwargs,
+        )
+
+        return outputs
+
+
+class Emu3ForConditionalGeneration(Emu3PreTrainedModel, GenerationMixin):
+    base_model_prefix = ""
+    _tied_weights_keys = ["lm_head.weight"]
+    _checkpoint_conversion_mapping = {
+        "^text_model.model": "model.text_model",
+        "^vqmodel": "model.vqmodel",
+        "^text_model.lm_head": "lm_head",
+    }
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.model = Emu3Model(config)
+        self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)
+
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.model.get_input_embeddings()
+
+    def set_input_embeddings(self, value):
+        self.model.set_input_embeddings(value)
+
+    def get_output_embeddings(self) -> nn.Module:
+        return self.lm_head
+
+    def set_decoder(self, decoder):
+        self.model.set_decoder(decoder)
+
+    def get_decoder(self):
+        return self.model.get_decoder()
+
+    # Make modules available through conditional class for BC
+    @property
+    def text_model(self):
+        return self.model.text_model
+
+    @property
+    def vqmodel(self):
+        return self.model.vqmodel
+
+    @property
+    def vocabulary_mapping(self):
+        return self.model.vocabulary_mapping
+
+    def decode_image_tokens(self, **kwargs):
+        return self.model.decode_image_tokens(**kwargs)
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        pixel_values: torch.FloatTensor = None,
+        image_sizes: torch.Tensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> Union[tuple, CausalLMOutputWithPast]:
+        r"""
+        image_sizes (`torch.LongTensor` of shape `(batch_size, 2)`):
+            The sizes of the images in the batch, being (height, width) for each image. Image sizes can be obtained using
+            [`AutoImageProcessor`]. See [`Emu3ImageProcessor.__call__`] for details ([`Emu3Processor`] uses
+            [`Emu3ImageProcessor`] for processing images).
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+        Example:
+
+        ```python
+        >>> from transformers import Emu3Processor, Emu3ForConditionalGeneration
+        >>> import torch
+        >>> import requests
+        >>> from PIL import Image
+
+        >>> model = Emu3ForConditionalGeneration.from_pretrained("BAAI/Emu3-Chat-hf", dtype=torch.bfloat16)
+        >>> processor = Emu3Processor.from_pretrained("BAAI/Emu3-Chat-hf")
+
+        >>> conversation = [
+        ...     {
+        ...     "role": "system",
+        ...     "content": [
+        ...         {"type": "text", "text": "You are a helpful assistant."},
+        ...         ],
+        ...     },
+        ...     {
+        ...     "role": "user",
+        ...     "content": [
+        ...         {"type": "image"},
+        ...         {"type": "text", "text": "Please describe the image."},
+        ...         ],
+        ...     },
+        ... ]
+
+        >>> prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
+        >>> image = Image.open(requests.get("https://www.ilankelman.org/stopsigns/australia.jpg", stream=True).raw)
+
+        >>> inputs = processor(images=[image], text=[prompt], return_tensors="pt").to(model.device, torch.bfloat16)
+
+        >>> generated_ids = model.generate(**inputs, max_new_tokens=100, do_sample=False)
+        >>> processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+        ```"""
+        # Image inputs must reach `Emu3Model.forward`, which replaces the image placeholder embeddings.
+        outputs = self.model(
+            input_ids=input_ids,
+            pixel_values=pixel_values,
+            image_sizes=image_sizes,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            **kwargs,
+        )
+        hidden_states = outputs[0]
+        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
+        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+        logits = self.lm_head(hidden_states[:, slice_indices, :])
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(
+                logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **kwargs
+            )
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+    def prepare_inputs_for_generation(
+        self,
+        input_ids,
+        past_key_values=None,
+        attention_mask=None,
+        inputs_embeds=None,
+        cache_position=None,
+        position_ids=None,
+        use_cache=True,
+        pixel_values=None,
+        **kwargs,
+    ):
+        # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
+
+        model_inputs = super().prepare_inputs_for_generation(
+            input_ids,
+            past_key_values=past_key_values,
+            attention_mask=attention_mask,
+            inputs_embeds=inputs_embeds,
+            cache_position=cache_position,
+            position_ids=position_ids,
+            pixel_values=pixel_values,
+            use_cache=use_cache,
+            **kwargs,
+        )
+
+        if cache_position[0] != 0:
+            model_inputs["pixel_values"] = None
+
+        return model_inputs
+
+
+__all__ = [
+    "Emu3ForConditionalGeneration",
+    "Emu3ForCausalLM",
+    "Emu3TextModel",
+    "Emu3PreTrainedModel",
+    "Emu3VQVAE",
+    "Emu3Model",
+]
diff --git a/phivenv/Lib/site-packages/transformers/models/emu3/processing_emu3.py b/phivenv/Lib/site-packages/transformers/models/emu3/processing_emu3.py
new file mode 100644
index 0000000000000000000000000000000000000000..19ad898737691dfa8ea6c9a8fca21a3d5c9cf88d
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/emu3/processing_emu3.py
@@ -0,0 +1,248 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc. team. All rights reserved.
+#
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Optional, Union
+
+import numpy as np
+
+from ...image_processing_utils import BatchFeature
+from ...image_utils import ImageInput
+from ...processing_utils import ImagesKwargs, MultiModalData, ProcessingKwargs, ProcessorMixin, TextKwargs, Unpack
+from ...tokenization_utils_base import PreTokenizedInput, TextInput
+from ...utils import is_vision_available
+
+
+if is_vision_available():
+    from .image_processing_emu3 import smart_resize
+
+
+class Emu3TextKwargs(TextKwargs, total=False):
+    return_for_image_generation: bool
+
+
+class Emu3ImagesKwargs(ImagesKwargs, total=False):
+    ratio: str
+    image_area: int
+
+
+class Emu3ProcessorKwargs(ProcessingKwargs, total=False):
+    text_kwargs: Emu3TextKwargs
+    images_kwargs: Emu3ImagesKwargs
+    _defaults = {
+        "text_kwargs": {
+            "return_for_image_generation": False,
+            "return_mm_token_type_ids": False,
+        },
+        "images_kwargs": {
+            "ratio": "1:1",
+            "image_area": 518400,
+        },
+    }
+
+
+class Emu3Processor(ProcessorMixin):
+    r"""
+    Constructs an Emu3 processor which wraps an Emu3 image processor and a GPT2 tokenizer into a single
+    processor.
+
+    [`Emu3Processor`] offers all the functionalities of [`Emu3ImageProcessor`] and [`GPT2TokenizerFast`].
+    See the [`~Emu3Processor.__call__`] and [`~Emu3Processor.decode`] for more information.
+
+    Args:
+        image_processor ([`Emu3ImageProcessor`]):
+            The image processor is a required input.
+        tokenizer ([`Emu3TokenizerFast`]):
+            The tokenizer is a required input.
+        chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
+            in a chat into a tokenizable string.
+    """
+
+    attributes = ["image_processor", "tokenizer"]
+    tokenizer_class = ("GPT2Tokenizer", "GPT2TokenizerFast")
+    image_processor_class = "Emu3ImageProcessor"
+
+    def __init__(
+        self,
+        image_processor,
+        tokenizer,
+        chat_template=None,
+        **kwargs,
+    ):
+        self.image_token = tokenizer.image_token  # image_token as placeholder to be replaced by vq-vae tokens
+        self.image_token_id = tokenizer.image_token_id
+        self.image_start_token = tokenizer.boi_token  # "<|image start|>" fixed tokens for start and end of image
+        self.image_end_token = tokenizer.eoi_token  # "<|image end|>"
+        self.fake_token_around_image = tokenizer.image_wrapper_token  # "<|image token|>"  every image starts with it
+        self.eof_token = tokenizer.eof_token  # "<|extra_201|>"
+        self.bos_token = tokenizer.bos_token
+        self.downsample_ratio = 8
+        super().__init__(image_processor, tokenizer, chat_template=chat_template)
+
+    def __call__(
+        self,
+        images: Optional[ImageInput] = None,
+        text: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] = None,
+        audio=None,
+        videos=None,
+        **kwargs: Unpack[Emu3ProcessorKwargs],
+    ) -> BatchFeature:
+        """
+        Main method to prepare for the model one or several sequence(s) and image(s). This method forwards the `text`
+        and `kwargs` arguments to Emu3TokenizerFast's [`~Emu3TokenizerFast.__call__`] if `text` is not `None` to encode
+        the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
+        Emu3ImageProcessor's [`~Emu3ImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
+        of the above two methods for more information.
+
+        Args:
+            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`):
+                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
+                tensor. Both channels-first and channels-last formats are supported.
+            text (`str`, `list[str]`, `list[list[str]]`):
+                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
+                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
+                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
+            return_tensors (`str` or [`~utils.TensorType`], *optional*):
+                If set, will return tensors of a particular framework. Acceptable values are:
+
+                - `'tf'`: Return TensorFlow `tf.constant` objects.
+                - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                - `'np'`: Return NumPy `np.ndarray` objects.
+                - `'jax'`: Return JAX `jnp.ndarray` objects.
+
+        Returns:
+            [`BatchFeature`]: A [`BatchFeature`] with the following fields:
+
+            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
+            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
+              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
+              `None`).
+            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
+        """
+        # Normalize `text` to a list of prompts. `None` is rejected explicitly: every branch below iterates over
+        # or tokenizes `text`, and subscripting `None` in the check would only surface an unrelated error message.
+        if isinstance(text, str):
+            text = [text]
+        elif text is None or not (isinstance(text, list) or isinstance(text[0], str)):
+            raise TypeError("Invalid input text. Please provide a string, or a list of strings")
+
+        output_kwargs = self._merge_kwargs(
+            Emu3ProcessorKwargs,
+            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
+            **kwargs,
+        )
+        return_for_image_generation = output_kwargs["text_kwargs"].pop("return_for_image_generation", False)
+        ratio = output_kwargs["images_kwargs"].pop("ratio", None)
+        image_area = output_kwargs["images_kwargs"].pop("image_area", None)
+
+        if return_for_image_generation and images is not None:
+            raise ValueError("You should not provide `images` when `return_for_image_generation=True`")
+
+        if not return_for_image_generation and text is None and images is None:
+            raise ValueError("You must provide either text or images when `return_for_image_generation=False`")
+
+        image_features = {}
+        image_start_tokens = f"{self.image_start_token}"
+        image_end_tokens = f"{self.eof_token}{self.image_end_token}"
+
+        # generate text from image + text input, so we add placeholders for image tokens
+        if not return_for_image_generation and images is not None:
+            image_features = self.image_processor(images, **output_kwargs["images_kwargs"])
+            image_sizes = iter(image_features.image_sizes)
+
+            prompt_strings = []
+            for sample in text:
+                while self.image_token in sample:
+                    image_size = next(image_sizes)
+                    height, width = image_size
+                    height = height // self.downsample_ratio
+                    width = width // self.downsample_ratio
+                    image_seq_length = height * (width + 1)  # +1 for extra row when converting to BPE in modeling code
+
+                    image_placeholder = f"{image_start_tokens}{height}*{width}{self.fake_token_around_image}{'<placeholder>' * image_seq_length}{image_end_tokens}"
+                    sample = sample.replace(self.image_token, image_placeholder, 1)
+                    sample = f"{self.bos_token}{sample}"  # add BOS because GPT tokenizer doesn't add it
+                prompt_strings.append(sample)
+            text = [sample.replace("<placeholder>", self.image_token) for sample in prompt_strings]
+
+        # generate image from text input, so we add begin-of-image tokens from where image generation starts
+        elif return_for_image_generation:
+            height, width = self.calculate_generate_size(ratio, image_area, self.downsample_ratio)
+            image_prompt = f"{image_start_tokens}{height}*{width}{self.fake_token_around_image}"
+            text = [f"{self.bos_token}{sample}{image_prompt}" for sample in text]
+            image_features["image_sizes"] = [[height, width]] * len(text)
+
+        # else just generate from text-only input, and we do no special treatment for text
+        return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
+        return_mm_token_type_ids = output_kwargs["text_kwargs"].pop("return_mm_token_type_ids", False)
+        text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"], return_tensors=None)
+        self._check_special_mm_tokens(text, text_inputs, modalities=["image"])
+
+        if return_mm_token_type_ids:
+            array_ids = np.array(text_inputs["input_ids"])
+            mm_token_type_ids = np.zeros_like(text_inputs["input_ids"])
+            mm_token_type_ids[array_ids == self.image_token_id] = 1
+            text_inputs["mm_token_type_ids"] = mm_token_type_ids.tolist()
+
+        return BatchFeature(data={**text_inputs, **image_features}, tensor_type=return_tensors)
+
+    def _get_num_multimodal_tokens(self, image_sizes=None, **kwargs):
+        """
+        Computes the number of placeholder tokens needed for multimodal inputs with the given sizes.
+
+        Args:
+            image_sizes (`list[list[int]]`, *optional*):
+                The input sizes formatted as (height, width) per each image.
+
+        Returns:
+            `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided
+            input modalities, along with other useful data.
+        """
+
+        vision_data = {}
+        if image_sizes is not None:
+            num_image_tokens = []
+            for height, width in image_sizes:
+                height, width = smart_resize(
+                    height,
+                    width,
+                    self.image_processor.spatial_factor,
+                    self.image_processor.min_pixels,
+                    self.image_processor.max_pixels,
+                )
+                height = height // self.downsample_ratio
+                width = width // self.downsample_ratio
+                image_seq_length = height * (width + 1)  # +1 for extra row when converting to BPE in modeling code
+                num_image_tokens.append(image_seq_length)
+
+            num_image_patches = [1] * len(image_sizes)
+            vision_data.update({"num_image_tokens": num_image_tokens, "num_image_patches": num_image_patches})
+
+        return MultiModalData(**vision_data)
+
+    def calculate_generate_size(self, ratio, image_area, spatial_factor):
+        """Return `(token_height, token_width)` for a "W:H" `ratio` string scaled to about `image_area` pixels."""
+        ratio_width, ratio_height = (int(part) for part in ratio.split(":"))
+        # Factor that stretches the W x H ratio rectangle to the requested pixel area.
+        scale = (image_area / (ratio_width * ratio_height)) ** 0.5
+
+        rows, cols = (int(round(side * scale / spatial_factor)) for side in (ratio_height, ratio_width))
+        return rows, cols
+
+    def postprocess(self, images: ImageInput, **kwargs):
+        return self.image_processor.postprocess(images, **kwargs)
+
+
+__all__ = ["Emu3Processor"]
diff --git a/phivenv/Lib/site-packages/transformers/models/encodec/__init__.py b/phivenv/Lib/site-packages/transformers/models/encodec/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..3adeea056604d1d31f946a5cd0bf53ea590ea3aa
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/encodec/__init__.py
@@ -0,0 +1,28 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_encodec import *
+    from .feature_extraction_encodec import *
+    from .modeling_encodec import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/phivenv/Lib/site-packages/transformers/models/encodec/__pycache__/__init__.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/encodec/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fc1c337e008baa6ce2a39fdd8ab1421d428b4e08
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/encodec/__pycache__/__init__.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/encodec/__pycache__/configuration_encodec.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/encodec/__pycache__/configuration_encodec.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d804f948d34df6bb9f0d3343b4760050d9659745
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/encodec/__pycache__/configuration_encodec.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/encodec/__pycache__/feature_extraction_encodec.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/encodec/__pycache__/feature_extraction_encodec.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ba24960177447673e3bb9fb5fb1bc23b99b27634
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/encodec/__pycache__/feature_extraction_encodec.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/encodec/__pycache__/modeling_encodec.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/encodec/__pycache__/modeling_encodec.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fffb5b4bec7d49138808a3d30b476a6452dc2b16
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/encodec/__pycache__/modeling_encodec.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/encodec/configuration_encodec.py b/phivenv/Lib/site-packages/transformers/models/encodec/configuration_encodec.py
new file mode 100644
index 0000000000000000000000000000000000000000..b4bb7b11a40c34d013e13fe3e82e466a0d863a5c
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/encodec/configuration_encodec.py
@@ -0,0 +1,199 @@
+# coding=utf-8
+# Copyright 2023 Meta Platforms, Inc. and affiliates, and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""EnCodec model configuration"""
+
+import math
+from typing import Optional
+
+import numpy as np
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class EncodecConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of an [`EncodecModel`]. It is used to instantiate an
+    Encodec model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the
+    [facebook/encodec_24khz](https://huggingface.co/facebook/encodec_24khz) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        target_bandwidths (`list[float]`, *optional*, defaults to `[1.5, 3.0, 6.0, 12.0, 24.0]`):
+            The range of different bandwidths the model can encode audio with.
+        sampling_rate (`int`, *optional*, defaults to 24000):
+            The sampling rate at which the audio waveform should be digitalized expressed in hertz (Hz).
+        audio_channels (`int`, *optional*, defaults to 1):
+            Number of channels in the audio data. Either 1 for mono or 2 for stereo.
+        normalize (`bool`, *optional*, defaults to `False`):
+            Whether the audio shall be normalized when passed.
+        chunk_length_s (`float`, *optional*):
+            If defined the audio is pre-processed into chunks of lengths `chunk_length_s` and then encoded.
+        overlap (`float`, *optional*):
+            Defines the overlap between each chunk. It is used to compute the `chunk_stride` using the following
+            formula: `int((1.0 - self.overlap) * self.chunk_length)`.
+        hidden_size (`int`, *optional*, defaults to 128):
+            Intermediate representation dimension.
+        num_filters (`int`, *optional*, defaults to 32):
+            Number of convolution kernels of first `EncodecConv1d` down sampling layer.
+        num_residual_layers (`int`,  *optional*, defaults to 1):
+            Number of residual layers.
+        upsampling_ratios (`Sequence[int]` , *optional*, defaults to `[8, 5, 4, 2]`):
+            Kernel size and stride ratios. The encoder uses downsampling ratios instead of upsampling ratios, hence it
+            will use the ratios in the reverse order to the ones specified here that must match the decoder order.
+        norm_type (`str`, *optional*, defaults to `"weight_norm"`):
+            Normalization method. Should be in `["weight_norm", "time_group_norm"]`
+        kernel_size (`int`, *optional*, defaults to 7):
+            Kernel size for the initial convolution.
+        last_kernel_size (`int`, *optional*, defaults to 7):
+            Kernel size for the last convolution layer.
+        residual_kernel_size (`int`, *optional*, defaults to 3):
+            Kernel size for the residual layers.
+        dilation_growth_rate (`int`, *optional*, defaults to 2):
+            How much to increase the dilation with each layer.
+        use_causal_conv (`bool`, *optional*, defaults to `True`):
+            Whether to use fully causal convolution.
+        pad_mode (`str`, *optional*, defaults to `"reflect"`):
+            Padding mode for the convolutions.
+        compress (`int`, *optional*, defaults to 2):
+            Reduced dimensionality in residual branches (from Demucs v3).
+        num_lstm_layers (`int`, *optional*, defaults to 2):
+            Number of LSTM layers at the end of the encoder.
+        trim_right_ratio (`float`, *optional*, defaults to 1.0):
+            Ratio for trimming at the right of the transposed convolution under the `use_causal_conv = True` setup. If
+            equal to 1.0, it means that all the trimming is done at the right.
+        codebook_size (`int`, *optional*, defaults to 1024):
+            Number of discrete codes that make up VQVAE.
+        codebook_dim (`int`, *optional*):
+            Dimension of the codebook vectors. If not defined, uses `hidden_size`.
+        use_conv_shortcut (`bool`, *optional*, defaults to `True`):
+            Whether to use a convolutional layer as the 'skip' connection in the `EncodecResnetBlock` block. If False,
+            an identity function will be used, giving a generic residual connection.
+
+    Example:
+
+    ```python
+    >>> from transformers import EncodecModel, EncodecConfig
+
+    >>> # Initializing a "facebook/encodec_24khz" style configuration
+    >>> configuration = EncodecConfig()
+
+    >>> # Initializing a model (with random weights) from the "facebook/encodec_24khz" style configuration
+    >>> model = EncodecModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "encodec"
+
+    def __init__(
+        self,
+        target_bandwidths=None,
+        sampling_rate=24_000,
+        audio_channels=1,
+        normalize=False,
+        chunk_length_s=None,
+        overlap=None,
+        hidden_size=128,
+        num_filters=32,
+        num_residual_layers=1,
+        upsampling_ratios=None,
+        norm_type="weight_norm",
+        kernel_size=7,
+        last_kernel_size=7,
+        residual_kernel_size=3,
+        dilation_growth_rate=2,
+        use_causal_conv=True,
+        pad_mode="reflect",
+        compress=2,
+        num_lstm_layers=2,
+        trim_right_ratio=1.0,
+        codebook_size=1024,
+        codebook_dim=None,
+        use_conv_shortcut=True,
+        **kwargs,
+    ):
+        # `None` stands in for the list defaults so that no two instances share one mutable default list.
+        self.target_bandwidths = [1.5, 3.0, 6.0, 12.0, 24.0] if target_bandwidths is None else target_bandwidths
+        self.sampling_rate = sampling_rate
+        self.audio_channels = audio_channels
+        self.normalize = normalize
+        self.chunk_length_s = chunk_length_s
+        self.overlap = overlap
+        self.hidden_size = hidden_size
+        self.num_filters = num_filters
+        self.num_residual_layers = num_residual_layers
+        # Same `None` sentinel handling as for `target_bandwidths`.
+        self.upsampling_ratios = [8, 5, 4, 2] if upsampling_ratios is None else upsampling_ratios
+        self.norm_type = norm_type
+        self.kernel_size = kernel_size
+        self.last_kernel_size = last_kernel_size
+        self.residual_kernel_size = residual_kernel_size
+        self.dilation_growth_rate = dilation_growth_rate
+        self.use_causal_conv = use_causal_conv
+        self.pad_mode = pad_mode
+        self.compress = compress
+        self.num_lstm_layers = num_lstm_layers
+        self.trim_right_ratio = trim_right_ratio
+        self.codebook_size = codebook_size
+        self.codebook_dim = codebook_dim if codebook_dim is not None else hidden_size
+        self.use_conv_shortcut = use_conv_shortcut
+
+        if self.norm_type not in ["weight_norm", "time_group_norm"]:
+            raise ValueError(
+                f'self.norm_type must be one of `"weight_norm"`, `"time_group_norm"`), got {self.norm_type}'
+            )
+
+        super().__init__(**kwargs)
+
+    # This is a property because you might want to change the chunk_length_s on the fly
+    @property
+    def chunk_length(self) -> Optional[int]:
+        if self.chunk_length_s is None:
+            return None
+        return int(self.chunk_length_s * self.sampling_rate)
+
+    # This is a property because you might want to change the chunk_length_s on the fly
+    @property
+    def chunk_stride(self) -> Optional[int]:
+        if self.chunk_length_s is None or self.overlap is None:
+            return None
+        return max(1, int((1.0 - self.overlap) * self.chunk_length))
+
+    @property
+    def hop_length(self) -> int:
+        return int(np.prod(self.upsampling_ratios))
+
+    @property
+    def codebook_nbits(self) -> int:
+        return math.ceil(math.log2(self.codebook_size))
+
+    @property
+    def frame_rate(self) -> int:
+        return math.ceil(self.sampling_rate / self.hop_length)
+
+    @property
+    def num_quantizers(self) -> int:
+        return int(1000 * self.target_bandwidths[-1] // (self.frame_rate * self.codebook_nbits))
+
+
+__all__ = ["EncodecConfig"]
diff --git a/phivenv/Lib/site-packages/transformers/models/encodec/feature_extraction_encodec.py b/phivenv/Lib/site-packages/transformers/models/encodec/feature_extraction_encodec.py
new file mode 100644
index 0000000000000000000000000000000000000000..3cc8d523f7f056a68639e5eefd7c1be6b196c7d6
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/encodec/feature_extraction_encodec.py
@@ -0,0 +1,209 @@
+# coding=utf-8
+# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Feature extractor class for EnCodec."""
+
+from typing import Optional, Union
+
+import numpy as np
+
+from ...feature_extraction_sequence_utils import SequenceFeatureExtractor
+from ...feature_extraction_utils import BatchFeature
+from ...utils import PaddingStrategy, TensorType, logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class EncodecFeatureExtractor(SequenceFeatureExtractor):
+    r"""
+    Constructs an EnCodec feature extractor.
+
+    This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains
+    most of the main methods. Users should refer to this superclass for more information regarding those methods.
+
+    Instantiating a feature extractor with the defaults will yield a similar configuration to that of the
+    [facebook/encodec_24khz](https://huggingface.co/facebook/encodec_24khz) architecture.
+
+    Args:
+        feature_size (`int`, *optional*, defaults to 1):
+            The feature dimension of the extracted features. Use 1 for mono, 2 for stereo.
+        sampling_rate (`int`, *optional*, defaults to 24000):
+            The sampling rate at which the audio waveform should be digitalized expressed in hertz (Hz).
+        padding_value (`float`, *optional*, defaults to 0.0):
+            The value that is used to fill the padding values.
+        chunk_length_s (`float`, *optional*):
+            If defined the audio is pre-processed into chunks of lengths `chunk_length_s` and then encoded.
+        overlap (`float`, *optional*):
+            Defines the overlap between each chunk. It is used to compute the `chunk_stride` using the following
+            formula: `int((1.0 - self.overlap) * self.chunk_length)`.
+    """
+
+    model_input_names = ["input_values", "padding_mask"]
+
+    def __init__(
+        self,
+        feature_size: int = 1,
+        sampling_rate: int = 24000,
+        padding_value: float = 0.0,
+        chunk_length_s: Optional[float] = None,
+        overlap: Optional[float] = None,
+        **kwargs,
+    ):
+        super().__init__(feature_size=feature_size, sampling_rate=sampling_rate, padding_value=padding_value, **kwargs)
+        self.chunk_length_s = chunk_length_s
+        self.overlap = overlap
+
+    # This is a property because you might want to change the chunk_length_s on the fly
+    @property
+    def chunk_length(self) -> Optional[int]:
+        if self.chunk_length_s is None:
+            return None
+        return int(self.chunk_length_s * self.sampling_rate)
+
+    # This is a property because you might want to change the chunk_length_s on the fly
+    @property
+    def chunk_stride(self) -> Optional[int]:
+        if self.chunk_length_s is None or self.overlap is None:
+            return None
+        return max(1, int((1.0 - self.overlap) * self.chunk_length))
+
+    def __call__(
+        self,
+        raw_audio: Union[np.ndarray, list[float], list[np.ndarray], list[list[float]]],
+        padding: Optional[Union[bool, str, PaddingStrategy]] = None,
+        truncation: Optional[bool] = False,
+        max_length: Optional[int] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        sampling_rate: Optional[int] = None,
+    ) -> BatchFeature:
+        """
+        Main method to featurize and prepare for the model one or several sequence(s).
+
+        Args:
+            raw_audio (`np.ndarray`, `list[float]`, `list[np.ndarray]`, `list[list[float]]`):
+                The sequence or batch of sequences to be processed. Each sequence can be a numpy array, a list of float
+                values, a list of numpy arrays or a list of list of float values. The numpy array must be of shape
+                `(num_samples,)` for mono audio (`feature_size = 1`), or `(2, num_samples)` for stereo audio
+                (`feature_size = 2`).
+            padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`):
+                Select a strategy to pad the returned sequences (according to the model's padding side and padding
+                index) among:
+
+                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
+                  sequence if provided).
+                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
+                  acceptable input length for the model if that argument is not provided.
+                - `False` or `'do_not_pad'`: No padding (i.e., can output a batch with sequences of different
+                  lengths).
+            truncation (`bool`, *optional*, defaults to `False`):
+                Activates truncation to cut input sequences longer than `max_length` to `max_length`.
+            max_length (`int`, *optional*):
+                Maximum length of the returned list and optionally padding length (see above).
+            return_tensors (`str` or [`~utils.TensorType`], *optional*):
+                If set, will return tensors instead of list of python integers. Acceptable values are:
+
+                - `'tf'`: Return TensorFlow `tf.constant` objects.
+                - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                - `'np'`: Return Numpy `np.ndarray` objects.
+            sampling_rate (`int`, *optional*):
+                The sampling rate at which the `raw_audio` input was sampled. It is strongly recommended to pass
+                `sampling_rate` at the forward call to prevent silent errors.
+        """
+        if sampling_rate is not None:
+            if sampling_rate != self.sampling_rate:
+                raise ValueError(
+                    f"The model corresponding to this feature extractor: {self} was trained using a sampling rate of"
+                    f" {self.sampling_rate}. Please make sure that the provided audio input was sampled with"
+                    f" {self.sampling_rate} and not {sampling_rate}."
+                )
+        else:
+            logger.warning(
+                f"It is strongly recommended to pass the `sampling_rate` argument to `{self.__class__.__name__}()`. "
+                "Failing to do so can result in silent errors that might be hard to debug."
+            )
+
+        if padding and truncation:
+            raise ValueError("Both padding and truncation were set. Make sure you only set one.")
+        elif padding is None:
+            # by default let's pad the inputs
+            padding = True
+
+        is_batched = bool(
+            isinstance(raw_audio, (list, tuple)) and (isinstance(raw_audio[0], (np.ndarray, tuple, list)))
+        )
+
+        if is_batched:
+            raw_audio = [np.asarray(audio, dtype=np.float32).T for audio in raw_audio]
+        elif not isinstance(raw_audio, np.ndarray):
+            raw_audio = np.asarray(raw_audio, dtype=np.float32)
+        elif raw_audio.dtype == np.dtype(np.float64):
+            # dtypes are compared by value; an identity test relies on numpy handing back a cached dtype object
+            raw_audio = raw_audio.astype(np.float32)
+
+        # always return batch
+        if not is_batched:
+            raw_audio = [np.asarray(raw_audio).T]
+
+        # verify inputs are valid
+        for example in raw_audio:
+            if example.ndim > 2:
+                raise ValueError(f"Expected input shape (channels, length) but got shape {example.shape}")
+            if self.feature_size == 1 and example.ndim != 1:
+                raise ValueError(f"Expected mono audio but example has {example.shape[-1]} channels")
+            if self.feature_size == 2 and example.shape[-1] != 2:
+                raise ValueError(f"Expected stereo audio but example has {example.shape[-1]} channels")
+
+        padded_inputs = None
+        input_values = BatchFeature({"input_values": raw_audio})
+        if self.chunk_stride is not None and self.chunk_length is not None and max_length is None:
+            if truncation:
+                max_length = min(array.shape[0] for array in raw_audio)
+                nb_step = int(np.floor(max_length / self.chunk_stride))
+                max_length = (nb_step - 1) * self.chunk_stride + self.chunk_length
+            elif padding:
+                max_length = max(array.shape[0] for array in raw_audio)
+                nb_step = int(np.ceil(max_length / self.chunk_stride))
+                max_length = (nb_step - 1) * self.chunk_stride + self.chunk_length
+                # pad up to the chunk-aligned `max_length` computed above rather than to the longest example
+                padding = "max_length"
+            else:
+                padded_inputs = input_values
+
+        # normal padding on batch
+        if padded_inputs is None:
+            padded_inputs = self.pad(
+                input_values,
+                max_length=max_length,
+                truncation=truncation,
+                padding=padding,
+                return_attention_mask=padding,
+            )
+            if padding:
+                padded_inputs["padding_mask"] = padded_inputs.pop("attention_mask")
+
+        input_values = []
+        for example in padded_inputs.pop("input_values"):
+            if self.feature_size == 1:
+                example = example[..., None]
+            input_values.append(example.T)
+
+        padded_inputs["input_values"] = input_values
+        if return_tensors is not None:
+            padded_inputs = padded_inputs.convert_to_tensors(return_tensors)
+
+        return padded_inputs
+
+
+__all__ = ["EncodecFeatureExtractor"]
diff --git a/phivenv/Lib/site-packages/transformers/models/encodec/modeling_encodec.py b/phivenv/Lib/site-packages/transformers/models/encodec/modeling_encodec.py
new file mode 100644
index 0000000000000000000000000000000000000000..c3c32f5bd61d03f680fbc291323b7161244ed025
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/encodec/modeling_encodec.py
@@ -0,0 +1,813 @@
+# coding=utf-8
+# Copyright 2023 Meta Platforms, Inc. and affiliates, and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch EnCodec model."""
+
+import math
+from dataclasses import dataclass
+from typing import Optional, Union
+
+import torch
+from torch import nn
+
+from ...modeling_utils import PreTrainedAudioTokenizerBase
+from ...utils import (
+    ModelOutput,
+    auto_docstring,
+    logging,
+)
+from .configuration_encodec import EncodecConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+# General docstring
+
+
+@dataclass
+@auto_docstring
+class EncodecOutput(ModelOutput):
+    r"""
+    audio_codes (`torch.LongTensor` of shape `(nb_frames, batch_size, nb_quantizers, frame_len)`, *optional*):
+        Discrete code embeddings computed using `model.encode`.
+    audio_values (`torch.FloatTensor` of shape `(batch_size, segment_length)`, *optional*):
+        Decoded audio values, obtained using the decoder part of Encodec.
+    """
+
+    audio_codes: Optional[torch.LongTensor] = None
+    audio_values: Optional[torch.FloatTensor] = None
+
+
+@dataclass
+@auto_docstring
+class EncodecEncoderOutput(ModelOutput):
+    r"""
+    audio_codes (`torch.LongTensor` of shape `(nb_frames, batch_size, nb_quantizers, frame_len)`, *optional*):
+        Discrete code embeddings computed using `model.encode`.
+    audio_scales (list of length `nb_frames` of `torch.Tensor` of shape `(batch_size, 1)`, *optional*):
+        Scaling factor for each `audio_codes` input. This is used to unscale each chunk of audio when decoding.
+    last_frame_pad_length (`int`, *optional*):
+        The length of the padding in the last frame, if any. This is used to ensure that the encoded frames can be
+        outputted as a tensor. This value should be passed during decoding to ensure padding is removed from the
+        encoded frames.
+    """
+
+    audio_codes: Optional[torch.LongTensor] = None
+    audio_scales: Optional[torch.FloatTensor] = None
+    last_frame_pad_length: Optional[int] = None
+
+
+@dataclass
+@auto_docstring
+class EncodecDecoderOutput(ModelOutput):
+    r"""
+    audio_values (`torch.FloatTensor` of shape `(batch_size, segment_length)`, *optional*):
+        Decoded audio values, obtained using the decoder part of Encodec.
+    """
+
+    audio_values: Optional[torch.FloatTensor] = None
+
+
+class EncodecConv1d(nn.Module):
+    """Conv1d with asymmetric or causal padding and normalization."""
+
+    def __init__(
+        self, config, in_channels: int, out_channels: int, kernel_size: int, stride: int = 1, dilation: int = 1
+    ):
+        super().__init__()
+        self.causal = config.use_causal_conv
+        self.pad_mode = config.pad_mode
+        self.norm_type = config.norm_type
+
+        if self.norm_type not in ["weight_norm", "time_group_norm"]:
+            raise ValueError(
+                f'self.norm_type must be one of `"weight_norm"`, `"time_group_norm"`), got {self.norm_type}'
+            )
+
+        # warn user on unusual setup between dilation and stride
+        if stride > 1 and dilation > 1:
+            logger.warning(
+                "EncodecConv1d has been initialized with stride > 1 and dilation > 1"
+                f" (kernel_size={kernel_size} stride={stride}, dilation={dilation})."
+            )
+
+        self.conv = nn.Conv1d(in_channels, out_channels, kernel_size, stride, dilation=dilation)
+        weight_norm = nn.utils.weight_norm
+        if hasattr(nn.utils.parametrizations, "weight_norm"):
+            weight_norm = nn.utils.parametrizations.weight_norm
+
+        if self.norm_type == "weight_norm":
+            self.conv = weight_norm(self.conv)
+        elif self.norm_type == "time_group_norm":
+            self.norm = nn.GroupNorm(1, out_channels)
+
+        kernel_size = self.conv.kernel_size[0]
+        stride = torch.tensor(self.conv.stride[0], dtype=torch.int64)
+        dilation = self.conv.dilation[0]
+
+        # Effective kernel size with dilations.
+        kernel_size = torch.tensor((kernel_size - 1) * dilation + 1, dtype=torch.int64)
+
+        # Registered as non-persistent buffers: they are derived from the conv settings, not saved in the state dict.
+        self.register_buffer("stride", stride, persistent=False)
+        self.register_buffer("kernel_size", kernel_size, persistent=False)
+        self.register_buffer("padding_total", kernel_size - stride, persistent=False)
+
+    def _get_extra_padding_for_conv1d(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        """Return how many samples must be appended so the padded length holds a whole number of strided windows."""
+        length = hidden_states.shape[-1]
+        n_frames = (length - self.kernel_size + self.padding_total) / self.stride + 1
+        n_frames = torch.ceil(n_frames).to(torch.int64) - 1
+        ideal_length = n_frames * self.stride + self.kernel_size - self.padding_total
+
+        return ideal_length - length
+
+    @staticmethod
+    def _pad1d(hidden_states: torch.Tensor, paddings: tuple[int, int], mode: str = "constant", value: float = 0.0):
+        """Tiny wrapper around torch.nn.functional.pad, just to allow for reflect padding on small input.
+        If this is the case, we insert extra 0 padding to the right before the reflection happens.
+
+        `mode` defaults to `"constant"` (fill with `value`); `"zero"` is not a mode `torch.nn.functional.pad` accepts.
+        """
+        length = hidden_states.shape[-1]
+        padding_left, padding_right = paddings
+        if mode != "reflect":
+            return nn.functional.pad(hidden_states, paddings, mode, value)
+
+        max_pad = max(padding_left, padding_right)
+        extra_pad = 0
+        if length <= max_pad:
+            extra_pad = max_pad - length + 1
+            hidden_states = nn.functional.pad(hidden_states, (0, extra_pad))
+        padded = nn.functional.pad(hidden_states, paddings, mode, value)
+        end = padded.shape[-1] - extra_pad
+        return padded[..., :end]
+
+    def forward(self, hidden_states):
+        extra_padding = self._get_extra_padding_for_conv1d(hidden_states)
+
+        if self.causal:
+            # Left padding for causal
+            hidden_states = self._pad1d(hidden_states, (self.padding_total, extra_padding), mode=self.pad_mode)
+        else:
+            # Asymmetric padding required for odd strides
+            padding_right = self.padding_total // 2
+            padding_left = self.padding_total - padding_right
+            hidden_states = self._pad1d(
+                hidden_states, (padding_left, padding_right + extra_padding), mode=self.pad_mode
+            )
+
+        hidden_states = self.conv(hidden_states)
+
+        if self.norm_type == "time_group_norm":
+            hidden_states = self.norm(hidden_states)
+
+        return hidden_states
+
+
+class EncodecConvTranspose1d(nn.Module):
+    """Transposed 1D convolution whose fixed padding is trimmed causally or asymmetrically, plus normalization."""
+
+    def __init__(self, config, in_channels: int, out_channels: int, kernel_size: int, stride: int = 1):
+        super().__init__()
+        self.causal = config.use_causal_conv
+        self.trim_right_ratio = config.trim_right_ratio
+        self.norm_type = config.norm_type
+
+        supported_norms = ["weight_norm", "time_group_norm"]
+        if self.norm_type not in supported_norms:
+            raise ValueError(
+                f'self.norm_type must be one of `"weight_norm"`, `"time_group_norm"`), got {self.norm_type}'
+            )
+
+        # Plain transposed convolution; its fixed padding is removed by hand in `forward`.
+        self.conv = nn.ConvTranspose1d(in_channels, out_channels, kernel_size, stride)
+
+        # Use the parametrization-based weight norm when this torch build offers it, the legacy helper otherwise.
+        apply_weight_norm = getattr(nn.utils.parametrizations, "weight_norm", nn.utils.weight_norm)
+
+        if self.norm_type == "weight_norm":
+            self.conv = apply_weight_norm(self.conv)
+        else:
+            # `time_group_norm` (the only other value accepted above): a single group over all channels.
+            self.norm = nn.GroupNorm(1, out_channels)
+
+        non_causal_trim = self.trim_right_ratio != 1.0 and not self.causal
+        if non_causal_trim:
+            raise ValueError("`trim_right_ratio` != 1.0 only makes sense for causal convolutions")
+
+    def forward(self, hidden_states):
+        """Upsample `hidden_states` and strip the fixed padding from the result."""
+        # Fixed padding produced by the transposed convolution: kernel size minus stride.
+        padding_total = self.conv.kernel_size[0] - self.conv.stride[0]
+
+        hidden_states = self.conv(hidden_states)
+        if self.norm_type == "time_group_norm":
+            hidden_states = self.norm(hidden_states)
+
+        # Only the fixed padding is trimmed here. Extra padding from `pad_for_conv1d` is
+        # dropped at the very end, when the output is cut to the right length; removing
+        # it here would require passing the length seen by the matching encoder layer.
+        if self.causal:
+            # Trim on the right according to `trim_right_ratio` (1.0 trims everything from the right).
+            padding_right = math.ceil(padding_total * self.trim_right_ratio)
+        else:
+            # Asymmetric padding required for odd strides
+            padding_right = padding_total // 2
+
+        padding_left = padding_total - padding_right
+
+        # unpad
+        trimmed_end = hidden_states.shape[-1] - padding_right
+        return hidden_states[..., padding_left:trimmed_end]
+
+
+class EncodecLSTM(nn.Module):
+    """
+    Residual LSTM that accepts and returns the convolutional layout; the recurrent state is created and dropped here.
+    """
+
+    def __init__(self, config: EncodecConfig, dimension: int):
+        super().__init__()
+        self.lstm = nn.LSTM(input_size=dimension, hidden_size=dimension, num_layers=config.num_lstm_layers)
+
+    def forward(self, hidden_states):
+        # Move the last axis to the front for the LSTM, add the skip connection, then restore the input layout.
+        sequence_first = hidden_states.permute(2, 0, 1)
+        lstm_output, _ = self.lstm(sequence_first)
+        return (lstm_output + sequence_first).permute(1, 2, 0)
+
+
+class EncodecResnetBlock(nn.Module):
+    """
+    SEANet-style residual block used by EnCodec: a stack of ELU + convolution pairs added to a shortcut branch.
+    """
+
+    def __init__(self, config: EncodecConfig, dim: int, dilations: list[int]):
+        super().__init__()
+        kernel_sizes = (config.residual_kernel_size, 1)
+        if len(dilations) != len(kernel_sizes):
+            raise ValueError("Number of kernel sizes should match number of dilations")
+
+        # Channel width inside the residual branch, reduced by `config.compress`.
+        bottleneck = dim // config.compress
+        last_index = len(kernel_sizes) - 1
+        layers = []
+        for index, (kernel_size, dilation) in enumerate(zip(kernel_sizes, dilations)):
+            in_chs = bottleneck if index > 0 else dim
+            out_chs = bottleneck if index < last_index else dim
+            layers.append(nn.ELU())
+            layers.append(EncodecConv1d(config, in_chs, out_chs, kernel_size, dilation=dilation))
+        self.block = nn.ModuleList(layers)
+
+        # Either a learned kernel-size-1 convolution or a plain identity carries the input around the block.
+        use_conv = config.use_conv_shortcut
+        self.shortcut = EncodecConv1d(config, dim, dim, kernel_size=1) if use_conv else nn.Identity()
+
+    def forward(self, hidden_states):
+        branch = hidden_states
+        for layer in self.block:
+            branch = layer(branch)
+        return self.shortcut(hidden_states) + branch
+
+
+class EncodecEncoder(nn.Module):
+    """SEANet encoder used by EnCodec: input conv, residual + strided-conv stages, LSTM and output conv."""
+
+    def __init__(self, config: EncodecConfig):
+        super().__init__()
+        layers = [EncodecConv1d(config, config.audio_channels, config.num_filters, config.kernel_size)]
+
+        # One stage per ratio, walked in reverse of the configured order; channels double at every stage.
+        channels = config.num_filters
+        for ratio in reversed(config.upsampling_ratios):
+            # Residual layers, dilation growing with the layer index
+            layers.extend(
+                EncodecResnetBlock(config, channels, [config.dilation_growth_rate**j, 1])
+                for j in range(config.num_residual_layers)
+            )
+            # Strided convolution that downsamples by `ratio`
+            layers.append(nn.ELU())
+            layers.append(EncodecConv1d(config, channels, channels * 2, kernel_size=ratio * 2, stride=ratio))
+            channels *= 2
+
+        layers.append(EncodecLSTM(config, channels))
+        layers.append(nn.ELU())
+        layers.append(EncodecConv1d(config, channels, config.hidden_size, config.last_kernel_size))
+        self.layers = nn.ModuleList(layers)
+
+    def forward(self, hidden_states):
+        for module in self.layers:
+            hidden_states = module(hidden_states)
+        return hidden_states
+
+
+class EncodecDecoder(nn.Module):
+    """SEANet decoder as used by EnCodec."""
+
+    def __init__(self, config: EncodecConfig):
+        super().__init__()
+        scaling = int(2 ** len(config.upsampling_ratios))  # widest channel multiplier: one doubling per ratio
+        model = [EncodecConv1d(config, config.hidden_size, scaling * config.num_filters, config.kernel_size)]
+
+        model += [EncodecLSTM(config, scaling * config.num_filters)]
+
+        # Upsample to raw audio scale
+        for ratio in config.upsampling_ratios:
+            current_scale = scaling * config.num_filters  # channel count entering this stage
+            # Add upsampling layers
+            model += [nn.ELU()]
+            model += [
+                EncodecConvTranspose1d(config, current_scale, current_scale // 2, kernel_size=ratio * 2, stride=ratio)  # 1/2 channels
+            ]
+            # Add residual layers
+            for j in range(config.num_residual_layers):
+                model += [EncodecResnetBlock(config, current_scale // 2, (config.dilation_growth_rate**j, 1))]
+            scaling //= 2  # keep the multiplier in step with the halved channel count
+
+        # Add final layers
+        model += [nn.ELU()]
+        model += [EncodecConv1d(config, config.num_filters, config.audio_channels, config.last_kernel_size)]  # to audio_channels
+        self.layers = nn.ModuleList(model)
+
+    def forward(self, hidden_states):  # runs the input through every layer in order
+        for layer in self.layers:
+            hidden_states = layer(hidden_states)
+        return hidden_states
+
+
+class EncodecEuclideanCodebook(nn.Module):
+    """Codebook with Euclidean distance."""
+
+    def __init__(self, config: EncodecConfig):
+        super().__init__()
+        embed = torch.zeros(config.codebook_size, config.codebook_dim)  # (codebook_size, codebook_dim), all zeros here
+
+        self.codebook_size = config.codebook_size
+
+        self.register_buffer("inited", torch.Tensor([True]))  # one-element tensor; `torch.Tensor(...)` uses the default dtype
+        self.register_buffer("cluster_size", torch.zeros(config.codebook_size))  # one entry per codebook row
+        self.register_buffer("embed", embed)  # the codebook itself; registered as a buffer, not as a parameter
+        self.register_buffer("embed_avg", embed.clone())  # separate copy so it does not alias `embed`
+
+    def quantize(self, hidden_states):  # returns, for every input row, the index of the closest codebook row
+        embed = self.embed.t()  # (codebook_dim, codebook_size)
+        scaled_states = hidden_states.pow(2).sum(1, keepdim=True)  # squared norm of each row (input is 2-D when called from `encode`)
+        dist = -(scaled_states - 2 * hidden_states @ embed + embed.pow(2).sum(0, keepdim=True))  # -(|x|^2 - 2 x.e + |e|^2)
+        embed_ind = dist.max(dim=-1).indices  # largest negated distance == nearest codebook entry
+        return embed_ind
+
+    def encode(self, hidden_states):  # maps (..., dim) vectors to (...) integer codebook indices
+        shape = hidden_states.shape  # remembered so the leading dims can be restored below
+        # pre-process
+        hidden_states = hidden_states.reshape((-1, shape[-1]))
+        # quantize
+        embed_ind = self.quantize(hidden_states)
+        # post-process
+        embed_ind = embed_ind.view(*shape[:-1])
+        return embed_ind
+
+    def decode(self, embed_ind):  # inverse lookup: indices -> codebook vectors
+        quantize = nn.functional.embedding(embed_ind, self.embed)  # gathers the `embed` row for every index
+        return quantize
+
+
+class EncodecVectorQuantization(nn.Module):
+    """
+    Vector quantization implementation. Currently supports only euclidean distance.
+    """
+
+    def __init__(self, config: EncodecConfig):
+        super().__init__()
+        self.codebook = EncodecEuclideanCodebook(config)
+
+    def encode(self, hidden_states):  # expects a 3-D input (the permute below names three axes)
+        hidden_states = hidden_states.permute(0, 2, 1)  # swap the last two axes: the codebook treats the last axis as features
+        embed_in = self.codebook.encode(hidden_states)
+        return embed_in
+
+    def decode(self, embed_ind):
+        quantize = self.codebook.decode(embed_ind)
+        quantize = quantize.permute(0, 2, 1)  # undo the axis swap performed in `encode`
+        return quantize
+
+
+class EncodecResidualVectorQuantizer(nn.Module):
+    """Residual Vector Quantizer."""
+
+    def __init__(self, config: EncodecConfig):
+        super().__init__()
+        self.codebook_size = config.codebook_size
+        self.frame_rate = config.frame_rate
+        self.num_quantizers = config.num_quantizers
+        self.layers = nn.ModuleList([EncodecVectorQuantization(config) for _ in range(config.num_quantizers)])
+
+    def get_num_quantizers_for_bandwidth(self, bandwidth: Optional[float] = None) -> int:
+        """Return num_quantizers based on specified target bandwidth."""
+        bw_per_q = math.log2(self.codebook_size) * self.frame_rate  # bits per codebook index times `frame_rate`
+        num_quantizers = self.num_quantizers  # default: use every quantizer
+        if bandwidth is not None and bandwidth > 0.0:
+            num_quantizers = int(max(1, math.floor(bandwidth * 1000 / bw_per_q)))  # bandwidth is scaled by 1000; never below 1
+        return num_quantizers
+
+    def encode(self, embeddings: torch.Tensor, bandwidth: Optional[float] = None) -> torch.Tensor:
+        """
+        Encode a given input tensor with the specified frame rate at the given bandwidth. The RVQ encode method sets
+        the appropriate number of quantizers to use and returns indices for each quantizer.
+        """
+        num_quantizers = self.get_num_quantizers_for_bandwidth(bandwidth)
+        residual = embeddings
+        all_indices = []
+        for layer in self.layers[:num_quantizers]:  # each layer quantizes what the previous layers left over
+            indices = layer.encode(residual)
+            quantized = layer.decode(indices)
+            residual = residual - quantized  # the remaining error is handed to the next layer
+            all_indices.append(indices)
+        out_indices = torch.stack(all_indices)  # new leading axis indexes the quantizer
+        return out_indices
+
+    def decode(self, codes: torch.Tensor) -> torch.Tensor:
+        """Decode the given codes to the quantized representation."""
+        quantized_out = torch.tensor(0.0, device=codes.device)  # scalar accumulator, broadcast on the first addition
+        for i, indices in enumerate(codes):  # the first axis of `codes` selects the quantizer layer
+            layer = self.layers[i]
+            quantized = layer.decode(indices)
+            quantized_out = quantized_out + quantized
+        return quantized_out
+
+
+@auto_docstring
+class EncodecPreTrainedModel(PreTrainedAudioTokenizerBase):
+    config: EncodecConfig
+    base_model_prefix = "encodec"
+    main_input_name = "input_values"
+
+    def _init_weights(self, module):
+        """Initialize the weights"""
+        if isinstance(module, nn.GroupNorm):
+            module.bias.data.zero_()  # bias 0 and weight 1: the affine part starts as the identity
+            module.weight.data.fill_(1.0)
+        elif isinstance(module, nn.Conv1d):
+            nn.init.kaiming_normal_(module.weight)
+            if module.bias is not None:
+                k = math.sqrt(module.groups / (module.in_channels * module.kernel_size[0]))  # bound for the bias range
+                nn.init.uniform_(module.bias, a=-k, b=k)  # bias drawn uniformly from [-k, k]
+        elif isinstance(module, nn.ConvTranspose1d):
+            module.reset_parameters()  # delegate to the module's own default initialisation
+        elif isinstance(module, nn.LSTM):
+            for name, param in module.named_parameters():  # dispatch on the parameter name
+                if "weight" in name:
+                    nn.init.xavier_uniform_(param)
+                elif "bias" in name:
+                    nn.init.constant_(param, 0.0)
+
+
+@auto_docstring(
+    custom_intro="""
+    The EnCodec neural audio codec model.
+    """
+)
+class EncodecModel(EncodecPreTrainedModel):
+    def __init__(self, config: EncodecConfig):
+        super().__init__(config)
+        self.config = config
+
+        self.encoder = EncodecEncoder(config)
+        self.decoder = EncodecDecoder(config)
+
+        self.quantizer = EncodecResidualVectorQuantizer(config)
+
+        self.bits_per_codebook = int(math.log2(self.config.codebook_size))  # truncated; the check below rejects non-powers of 2
+        if 2**self.bits_per_codebook != self.config.codebook_size:
+            raise ValueError("The codebook_size must be a power of 2.")
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_encoder(self):  # accessor for the `EncodecEncoder` submodule built in `__init__`
+        return self.encoder
+
+    def _encode_frame(
+        self, input_values: torch.Tensor, bandwidth: float
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        """
+        Encodes the given input using the underlying VQVAE. If `config.normalize` is set to `True` the input is first
+        divided by the RMS of its channel-averaged signal; that scale is returned with the codes (`None` otherwise).
+        """
+        length = input_values.shape[-1]  # number of samples along the last axis
+        duration = length / self.config.sampling_rate  # same time unit as `config.chunk_length_s`
+
+        if self.config.chunk_length_s is not None and duration > 1e-5 + self.config.chunk_length_s:  # small tolerance
+            raise RuntimeError(f"Duration of frame ({duration}) is longer than chunk {self.config.chunk_length_s}")
+
+        scale = None
+        if self.config.normalize:
+            mono = torch.sum(input_values, 1, keepdim=True) / input_values.shape[1]  # mean over axis 1
+            scale = mono.pow(2).mean(dim=-1, keepdim=True).sqrt() + 1e-8  # RMS over the last axis; epsilon keeps it non-zero
+            input_values = input_values / scale
+            scale = scale.view(-1, 1)  # flattened to two dims, one row per batch entry
+
+        embeddings = self.encoder(input_values)
+        codes = self.quantizer.encode(embeddings, bandwidth)  # stacked with the quantizer axis first
+        codes = codes.transpose(0, 1)  # swap the first two axes so the quantizer axis comes second
+        return codes, scale
+
+    def encode(
+        self,
+        input_values: torch.Tensor,
+        padding_mask: Optional[torch.Tensor] = None,
+        bandwidth: Optional[float] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple[torch.Tensor, Optional[torch.Tensor], int], EncodecEncoderOutput]:
+        """
+        Encodes the input audio waveform into discrete codes of shape
+        `(nb_frames, batch_size, nb_quantizers, frame_len)`.
+
+        - `nb_frames=1` if `self.config.chunk_length=None` (as the encoder is applied on the full audio), which is the
+        case for the 24kHz model. Otherwise, `nb_frames=ceil(input_length/self.config.chunk_stride)`, which is the case
+        for the 48kHz model.
+        - `frame_len` is the length of each frame, which is equal to `ceil(input_length/self.config.hop_length)` if
+        `self.config.chunk_length=None` (e.g., for the 24kHz model). Otherwise, if `self.config.chunk_length` is
+        defined, `frame_len=self.config.chunk_length/self.config.hop_length`, e.g., the case for the 48kHz model with
+        `frame_len=150`.
+
+        Args:
+            input_values (`torch.Tensor` of shape `(batch_size, channels, sequence_length)`):
+                Float values of the input audio waveform.
+            padding_mask (`torch.Tensor` of shape `(batch_size, channels, sequence_length)`):
+                Padding mask used to pad the `input_values`.
+            bandwidth (`float`, *optional*):
+                The target bandwidth. Must be one of `config.target_bandwidths`. If `None`, uses the smallest possible
+                bandwidth. bandwidth is represented as a thousandth of what it is, e.g. 6kbps bandwidth is represented
+                as bandwidth == 6.0
+
+        Returns:
+            EncodecEncoderOutput dict or a tuple containing:
+            - audio_codes (`torch.LongTensor`  of shape `(nb_frames, batch_size, nb_quantizers, frame_len)`, *optional*),
+            - audio_scales (list of length `nb_frames` of `torch.Tensor` of shape `(batch_size, 1)`, *optional*),
+            - last_frame_pad_length (`int`, *optional*).
+        """
+        return_dict = return_dict if return_dict is not None else self.config.return_dict
+
+        if bandwidth is None:
+            bandwidth = self.config.target_bandwidths[0]  # default: the first configured bandwidth
+        if bandwidth not in self.config.target_bandwidths:
+            raise ValueError(
+                f"This model doesn't support the bandwidth {bandwidth}. Select one of {self.config.target_bandwidths}."
+            )
+
+        _, channels, input_length = input_values.shape  # the input must be exactly 3-D
+
+        if channels < 1 or channels > 2:
+            raise ValueError(f"Number of audio channels must be 1 or 2, but got {channels}")
+
+        chunk_length = self.config.chunk_length
+        if chunk_length is None:
+            chunk_length = input_length  # no chunking: the whole input is one frame
+            stride = input_length
+        else:
+            stride = self.config.chunk_stride
+
+        if padding_mask is None:
+            padding_mask = torch.ones_like(input_values).bool()  # nothing is treated as padding
+        else:
+            padding_mask = padding_mask.view(padding_mask.shape[0], -1, padding_mask.shape[-1])  # force a 3-D mask
+
+        encoded_frames = []
+        scales = []
+        for offset in range(0, input_length, stride):  # one frame per stride step; the last slice may be shorter
+            mask = padding_mask[..., offset : offset + chunk_length].bool()
+            frame = mask * input_values[..., offset : offset + chunk_length]  # zero out the masked samples
+            encoded_frame, scale = self._encode_frame(frame, bandwidth)
+            encoded_frames.append(encoded_frame)
+            scales.append(scale)
+
+        # pad last frame (if necessary) to be able to apply `torch.stack`
+        last_frame_pad_length = encoded_frames[0].shape[-1] - encoded_frames[-1].shape[-1]
+        if last_frame_pad_length > 0:
+            last_frame = nn.functional.pad(encoded_frames[-1], (0, last_frame_pad_length), value=0)  # right-pad with code 0
+            encoded_frames[-1] = last_frame
+        encoded_frames = torch.stack(encoded_frames)  # new leading axis indexes the frame
+
+        if not return_dict:
+            return (encoded_frames, scales, last_frame_pad_length)
+        return EncodecEncoderOutput(encoded_frames, scales, last_frame_pad_length)
+
+    @staticmethod
+    def _linear_overlap_add(frames: list[torch.Tensor], stride: int):
+        # Generic overlap add, with linear fade-in/fade-out, supporting complex scenario
+        # e.g., more than 2 frames per position.
+        # The core idea is to use a weight function that is a triangle,
+        # with a maximum value at the middle of the chunk.
+        # We use this weighting when summing the frames, and divide by the sum of weights
+        # for each positions at the end. Thus:
+        #   - if a frame is the only one to cover a position, the weighting is a no-op.
+        #   - if 2 frames cover a position:
+        #          ...  ...
+        #         /   \/   \
+        #        /    /\    \
+        #            S  T       , i.e. S offset of second frame starts, T end of first frame.
+        # Then the weight function for each one is: (t - S), (T - t), with `t` a given offset.
+        # After the final normalization, the weight of the second frame at position `t` is
+        # (t - S) / (t - S + (T - t)) = (t - S) / (T - S), which is exactly what we want.
+        #
+        #   - if more than 2 frames overlap at a given point, we hope that by induction
+        #      something sensible happens.
+        if len(frames) == 0:
+            raise ValueError("`frames` cannot be an empty list.")
+
+        device = frames[0].device
+        dtype = frames[0].dtype
+        shape = frames[0].shape[:-1]  # leading dims, taken from the first frame
+        total_size = stride * (len(frames) - 1) + frames[-1].shape[-1]  # the last frame starts at stride * (n - 1)
+
+        frame_length = frames[0].shape[-1]
+        time_vec = torch.linspace(0, 1, frame_length + 2, device=device, dtype=dtype)[1:-1]  # endpoints 0 and 1 dropped
+        weight = 0.5 - (time_vec - 0.5).abs()  # triangle window peaking at the middle of the frame
+
+        sum_weight = torch.zeros(total_size, device=device, dtype=dtype)
+        out = torch.zeros(*shape, total_size, device=device, dtype=dtype)
+        offset: int = 0
+
+        for frame in frames:
+            frame_length = frame.shape[-1]
+            out[..., offset : offset + frame_length] += weight[:frame_length] * frame  # shorter frames use the window's head
+            sum_weight[offset : offset + frame_length] += weight[:frame_length]
+            offset += stride
+
+        if sum_weight.min() == 0:  # a position no frame covered would divide by zero below
+            raise ValueError(f"`sum_weight` minimum element must be bigger than zero: {sum_weight}`")
+
+        return out / sum_weight  # normalise by the total weight accumulated at each position
+
+    def _decode_frame(self, codes: torch.Tensor, scale: Optional[torch.Tensor] = None) -> torch.Tensor:
+        codes = codes.transpose(0, 1)  # swap the first two axes; `quantizer.decode` iterates over the first one
+        embeddings = self.quantizer.decode(codes)
+        outputs = self.decoder(embeddings)
+        if scale is not None:
+            outputs = outputs * scale.view(-1, 1, 1)  # one factor per batch entry, broadcast over the other two axes
+        return outputs
+
+    def decode(
+        self,
+        audio_codes: torch.LongTensor,
+        audio_scales: torch.Tensor,
+        padding_mask: Optional[torch.Tensor] = None,
+        return_dict: Optional[bool] = None,
+        last_frame_pad_length: Optional[int] = 0,
+    ) -> Union[tuple[torch.Tensor, torch.Tensor], EncodecDecoderOutput]:
+        """
+        Decodes the given frames into an output audio waveform.
+
+        Note that the output might be a bit bigger than the input. In that case, any extra steps at the end can be
+        trimmed.
+
+        Args:
+            audio_codes (`torch.LongTensor`  of shape `(nb_frames, batch_size, nb_quantizers, frame_len)`, *optional*):
+                Discrete code embeddings computed using `model.encode`.
+            audio_scales (list of length `nb_frames` of `torch.Tensor` of shape `(batch_size, 1)`, *optional*):
+                Scaling factor for each `audio_codes` input.
+            padding_mask (`torch.Tensor` of shape `(channels, sequence_length)`):
+                Padding mask used to pad the `input_values`.
+            return_dict (`bool`, *optional*):
+                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+            last_frame_pad_length (`int`, *optional*):
+                Length of the padding in the last frame, which is removed during decoding. `None` is treated as `0`.
+
+        """
+        return_dict = return_dict if return_dict is not None else self.config.return_dict
+
+        chunk_length = self.config.chunk_length
+        if chunk_length is None:
+            if len(audio_codes) != 1:  # without chunking there is exactly one frame
+                raise ValueError(f"Expected one frame, got {len(audio_codes)}")
+            frame = audio_codes[0]
+            if (last_frame_pad_length or 0) > 0:  # `or 0`: the parameter is Optional, and `None > 0` would raise
+                frame = frame[..., :-last_frame_pad_length]
+            audio_values = self._decode_frame(frame, audio_scales[0])
+        else:
+            decoded_frames = []
+            for i, (frame, scale) in enumerate(zip(audio_codes, audio_scales)):
+                if i == len(audio_codes) - 1 and (last_frame_pad_length or 0) > 0:  # only the last frame carries padding
+                    frame = frame[..., :-last_frame_pad_length]
+                frames = self._decode_frame(frame, scale)
+                decoded_frames.append(frames)
+
+            audio_values = self._linear_overlap_add(decoded_frames, self.config.chunk_stride or 1)  # falsy stride -> 1
+
+        # truncate based on padding mask
+        if padding_mask is not None and padding_mask.shape[-1] < audio_values.shape[-1]:
+            audio_values = audio_values[..., : padding_mask.shape[-1]]
+
+        if not return_dict:
+            return (audio_values,)  # one-element tuple
+        return EncodecDecoderOutput(audio_values)
+
+    @auto_docstring
+    def forward(
+        self,
+        input_values: torch.FloatTensor,
+        padding_mask: Optional[torch.BoolTensor] = None,
+        bandwidth: Optional[float] = None,
+        audio_codes: Optional[torch.LongTensor] = None,
+        audio_scales: Optional[torch.Tensor] = None,
+        return_dict: Optional[bool] = None,
+        last_frame_pad_length: Optional[int] = 0,
+    ) -> Union[tuple[torch.Tensor, torch.Tensor], EncodecOutput]:
+        r"""
+        input_values (`torch.FloatTensor` of shape `(batch_size, channels, sequence_length)`, *optional*):
+            Raw audio input converted to Float and padded to the appropriate length in order to be encoded using chunks
+            of length self.chunk_length and a stride of `config.chunk_stride`.
+        padding_mask (`torch.BoolTensor` of shape `(batch_size, channels, sequence_length)`, *optional*):
+            Mask to avoid computing scaling factors on padding token indices (can we avoid computing conv on these?).
+            Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            <Tip warning={true}>
+
+            `padding_mask` should always be passed, unless the input was truncated or not padded. This is because in
+            order to process tensors effectively, the input audio should be padded so that `input_length % stride =
+            step` with `step = chunk_length-stride`. This ensures that all chunks are of the same shape
+
+            </Tip>
+        bandwidth (`float`, *optional*):
+            The target bandwidth. Must be one of `config.target_bandwidths`. If `None`, uses the smallest possible
+            bandwidth. bandwidth is represented as a thousandth of what it is, e.g. 6kbps bandwidth is represented as
+            `bandwidth == 6.0`
+        audio_codes (`torch.LongTensor`  of shape `(nb_frames, batch_size, nb_quantizers, frame_len)`, *optional*):
+            Discrete code embeddings computed using `model.encode`.
+        audio_scales (list of length `nb_frames` of `torch.Tensor` of shape `(batch_size, 1)`, *optional*):
+            Scaling factor for each `audio_codes` input.
+        return_dict (`bool`, *optional*):
+            Whether to return outputs as a dict.
+        last_frame_pad_length (`int`, *optional*):
+            The length of the padding in the last frame, if any. This is used to ensure that the encoded frames can be
+            outputted as a tensor. This value should be passed during decoding to ensure padding is removed from the
+            encoded frames.
+
+        Examples:
+
+        ```python
+        >>> from datasets import load_dataset
+        >>> from transformers import AutoProcessor, EncodecModel
+
+        >>> dataset = load_dataset("hf-internal-testing/ashraq-esc50-1-dog-example")
+        >>> audio_sample = dataset["train"]["audio"][0]["array"]
+
+        >>> model_id = "facebook/encodec_24khz"
+        >>> model = EncodecModel.from_pretrained(model_id)
+        >>> processor = AutoProcessor.from_pretrained(model_id)
+
+        >>> inputs = processor(raw_audio=audio_sample, return_tensors="pt")
+
+        >>> outputs = model(**inputs)
+        >>> audio_codes = outputs.audio_codes
+        >>> audio_values = outputs.audio_values
+        ```"""
+        return_dict = return_dict if return_dict is not None else self.config.return_dict
+
+        if padding_mask is None:
+            padding_mask = torch.ones_like(input_values).bool()  # nothing is treated as padding
+        else:
+            # ensure that channel dimension is present
+            padding_mask = padding_mask.view(padding_mask.shape[0], -1, padding_mask.shape[-1])
+
+        if audio_codes is not None and audio_scales is None:  # codes and scales must be supplied together
+            raise ValueError("You specified `audio_codes` but did not specify the `audio_scales`")
+
+        if audio_scales is not None and audio_codes is None:
+            raise ValueError("You specified `audio_scales` but did not specify the `audio_codes`")
+
+        if audio_scales is None and audio_codes is None:  # neither given: encode the input first
+            audio_codes, audio_scales, last_frame_pad_length = self.encode(
+                input_values, padding_mask, bandwidth, False  # return_dict=False so the result unpacks as a tuple
+            )
+
+        audio_values = self.decode(
+            audio_codes,
+            audio_scales,
+            padding_mask,
+            return_dict=return_dict,
+            last_frame_pad_length=last_frame_pad_length,
+        )[0]  # the waveform is the first element of what `decode` returns
+        if not return_dict:
+            return (audio_codes, audio_values)
+
+        return EncodecOutput(audio_codes=audio_codes, audio_values=audio_values)
+
+
+__all__ = ["EncodecModel", "EncodecPreTrainedModel"]  # names exported by `from <module> import *`
diff --git a/phivenv/Lib/site-packages/transformers/models/encoder_decoder/__init__.py b/phivenv/Lib/site-packages/transformers/models/encoder_decoder/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..c786feb9213fdd31640c0fdeaead5164026ad37a
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/encoder_decoder/__init__.py
@@ -0,0 +1,29 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:  # only static type checkers take this branch; they see the real submodule symbols
+    from .configuration_encoder_decoder import *
+    from .modeling_encoder_decoder import *
+    from .modeling_flax_encoder_decoder import *
+    from .modeling_tf_encoder_decoder import *
+else:  # at runtime the submodules above are not imported here
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)  # swap this module for a `_LazyModule`
diff --git a/phivenv/Lib/site-packages/transformers/models/encoder_decoder/__pycache__/__init__.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/encoder_decoder/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..73ceb0418ad862f60f8fe064797c36e9318f6935
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/encoder_decoder/__pycache__/__init__.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/encoder_decoder/__pycache__/configuration_encoder_decoder.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/encoder_decoder/__pycache__/configuration_encoder_decoder.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..45daa710d09cb9629488dba17dc38a0a569a4365
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/encoder_decoder/__pycache__/configuration_encoder_decoder.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/encoder_decoder/__pycache__/modeling_encoder_decoder.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/encoder_decoder/__pycache__/modeling_encoder_decoder.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..29599714032dc0cacc79d23695974419b3ccdccc
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/encoder_decoder/__pycache__/modeling_encoder_decoder.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/encoder_decoder/__pycache__/modeling_flax_encoder_decoder.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/encoder_decoder/__pycache__/modeling_flax_encoder_decoder.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..795dd1efb41afbae89c07cc87936e9c2cbd56a6b
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/encoder_decoder/__pycache__/modeling_flax_encoder_decoder.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/encoder_decoder/__pycache__/modeling_tf_encoder_decoder.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/encoder_decoder/__pycache__/modeling_tf_encoder_decoder.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6b4af488d24c7260c19f4b123513be0b320e9793
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/encoder_decoder/__pycache__/modeling_tf_encoder_decoder.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/encoder_decoder/configuration_encoder_decoder.py b/phivenv/Lib/site-packages/transformers/models/encoder_decoder/configuration_encoder_decoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..af57b2596cee99eefe0493cc4aea51c845036d2e
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/encoder_decoder/configuration_encoder_decoder.py
@@ -0,0 +1,111 @@
+# coding=utf-8
+# Copyright 2020 The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+from ..auto import AutoConfig
+
+
+logger = logging.get_logger(__name__)  # module-level logger, used by `from_encoder_decoder_configs`
+
+
+class EncoderDecoderConfig(PretrainedConfig):
+    r"""
+    [`EncoderDecoderConfig`] is the configuration class to store the configuration of a [`EncoderDecoderModel`]. It is
+    used to instantiate an Encoder Decoder model according to the specified arguments, defining the encoder and decoder
+    configs.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        kwargs (*optional*):
+            Dictionary of keyword arguments. Notably:
+
+                - **encoder** ([`PretrainedConfig`], *optional*) -- An instance of a configuration object that defines
+                  the encoder config.
+                - **decoder** ([`PretrainedConfig`], *optional*) -- An instance of a configuration object that defines
+                  the decoder config.
+
+    Examples:
+
+    ```python
+    >>> from transformers import BertConfig, EncoderDecoderConfig, EncoderDecoderModel
+
+    >>> # Initializing a BERT google-bert/bert-base-uncased style configuration
+    >>> config_encoder = BertConfig()
+    >>> config_decoder = BertConfig()
+
+    >>> config = EncoderDecoderConfig.from_encoder_decoder_configs(config_encoder, config_decoder)
+
+    >>> # Initializing a Bert2Bert model (with random weights) from the google-bert/bert-base-uncased style configurations
+    >>> model = EncoderDecoderModel(config=config)
+
+    >>> # Accessing the model configuration
+    >>> config_encoder = model.config.encoder
+    >>> config_decoder = model.config.decoder
+    >>> # set decoder config to causal lm
+    >>> config_decoder.is_decoder = True
+    >>> config_decoder.add_cross_attention = True
+
+    >>> # Saving the model, including its configuration
+    >>> model.save_pretrained("my-model")
+
+    >>> # loading model and config from pretrained folder
+    >>> encoder_decoder_config = EncoderDecoderConfig.from_pretrained("my-model")
+    >>> model = EncoderDecoderModel.from_pretrained("my-model", config=encoder_decoder_config)
+    ```"""
+
+    model_type = "encoder-decoder"
+    sub_configs = {"encoder": AutoConfig, "decoder": AutoConfig}
+    has_no_defaults_at_init = True
+
+    def __init__(self, **kwargs):  # requires `encoder` and `decoder` kwargs, each a dict that contains "model_type"
+        super().__init__(**kwargs)
+        if "encoder" not in kwargs or "decoder" not in kwargs:
+            raise ValueError(
+                f"A configuration of type {self.model_type} cannot be instantiated because "
+                f"both `encoder` and `decoder` sub-configurations were not passed, only {kwargs}"
+            )
+        encoder_config = kwargs.pop("encoder").copy()  # shallow copy: the pop below must not strip the caller's dict
+        encoder_model_type = encoder_config.pop("model_type")
+        decoder_config = kwargs.pop("decoder").copy()  # same reason; lets one dict build several configs
+        decoder_model_type = decoder_config.pop("model_type")
+
+        self.encoder = AutoConfig.for_model(encoder_model_type, **encoder_config)  # rebuild typed sub-configs from the dicts
+        self.decoder = AutoConfig.for_model(decoder_model_type, **decoder_config)
+        self.is_encoder_decoder = True
+
+    @classmethod
+    def from_encoder_decoder_configs(
+        cls, encoder_config: PretrainedConfig, decoder_config: PretrainedConfig, **kwargs
+    ) -> PretrainedConfig:
+        r"""
+        Instantiate a [`EncoderDecoderConfig`] (or a derived class) from a pre-trained encoder model configuration and
+        decoder model configuration.
+
+        Returns:
+            [`EncoderDecoderConfig`]: An instance of a configuration object
+        """
+        logger.info("Set `config.is_decoder=True` and `config.add_cross_attention=True` for decoder_config")
+        decoder_config.is_decoder = True  # NOTE: modifies the `decoder_config` object passed in by the caller
+        decoder_config.add_cross_attention = True
+
+        return cls(encoder=encoder_config.to_dict(), decoder=decoder_config.to_dict(), **kwargs)
+
+
+__all__ = ["EncoderDecoderConfig"]  # names exported by `from <module> import *`
diff --git a/phivenv/Lib/site-packages/transformers/models/encoder_decoder/modeling_encoder_decoder.py b/phivenv/Lib/site-packages/transformers/models/encoder_decoder/modeling_encoder_decoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..b5ce450a9c7fd9cb843840ff2383bee80b137de7
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/encoder_decoder/modeling_encoder_decoder.py
@@ -0,0 +1,608 @@
+# coding=utf-8
+# Copyright 2018 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Classes to support Encoder-Decoder architectures"""
+
+import gc
+import inspect
+import os
+import tempfile
+import warnings
+from typing import Optional, Union
+
+import torch
+from torch import nn
+from torch.nn import CrossEntropyLoss
+
+from ...configuration_utils import PretrainedConfig
+from ...generation import GenerationMixin
+from ...modeling_outputs import BaseModelOutput, Seq2SeqLMOutput
+from ...modeling_utils import PreTrainedModel
+from ...utils import auto_docstring, logging
+from ..auto.configuration_auto import AutoConfig
+from ..auto.modeling_auto import AutoModel, AutoModelForCausalLM
+from .configuration_encoder_decoder import EncoderDecoderConfig
+
+
+# Module-level logger, named after this module.
+logger = logging.get_logger(__name__)
+
+
+# Text emitted as a `FutureWarning` by `EncoderDecoderModel.forward` whenever `labels` are passed.
+DEPRECATION_WARNING = (
+    "Version v4.12.0 introduces a better way to train encoder-decoder models by computing the loss inside the"
+    " encoder-decoder framework rather than in the decoder itself. You may observe training discrepancies if"
+    " fine-tuning a model trained with versions anterior to 4.12.0. The decoder_input_ids are now created based on the"
+    " labels, no need to pass them yourself anymore."
+)
+
+
+def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int):
+    """
+    Return a copy of `input_ids` shifted one position to the right along the second dimension.
+
+    The first column of the result is filled with `decoder_start_token_id`, the last column of `input_ids` is
+    dropped, and every `-100` left in the result is replaced by `pad_token_id`. `input_ids` itself is not modified.
+
+    Raises:
+        ValueError: If `decoder_start_token_id` or `pad_token_id` is `None`.
+    """
+    shifted = input_ids.new_zeros(input_ids.shape)
+    # Everything except the last column moves one slot to the right.
+    shifted[:, 1:].copy_(input_ids[:, :-1])
+
+    if decoder_start_token_id is None:
+        raise ValueError("Make sure to set the decoder_start_token_id attribute of the model's configuration.")
+    shifted[:, 0] = decoder_start_token_id
+
+    if pad_token_id is None:
+        raise ValueError("Make sure to set the pad_token_id attribute of the model's configuration.")
+
+    # -100 marks ignored label positions; it is not a valid token id, so swap it for padding.
+    ignored_positions = shifted.eq(-100)
+    shifted.masked_fill_(ignored_positions, pad_token_id)
+    return shifted
+
+
+@auto_docstring
+class EncoderDecoderModel(PreTrainedModel, GenerationMixin):
+    r"""
+    [`EncoderDecoderModel`] is a generic model class that will be instantiated as a transformer architecture with one
+    of the base model classes of the library as encoder and another one as decoder when created with the
+    [`~transformers.AutoModel.from_pretrained`] class method for the encoder and
+    [`~transformers.AutoModelForCausalLM.from_pretrained`] class method for the decoder.
+    """
+
+    config: EncoderDecoderConfig
+    base_model_prefix = "encoder_decoder"
+    main_input_name = "input_ids"
+    supports_gradient_checkpointing = True
+    _supports_param_buffer_assignment = False
+    _supports_flash_attn = True
+    _supports_sdpa = True
+
+    def __init__(
+        self,
+        config: Optional[PretrainedConfig] = None,
+        encoder: Optional[PreTrainedModel] = None,
+        decoder: Optional[PreTrainedModel] = None,
+    ):
+        r"""
+        encoder (`PreTrainedModel`, *optional*):
+            The encoder model to use.
+        decoder (`PreTrainedModel`, *optional*):
+            The decoder model to use.
+        """
+        # Either a ready-made config, or both sub-models (from which a config is derived), must be given.
+        if config is None and (encoder is None or decoder is None):
+            raise ValueError("Either a configuration or an encoder and a decoder has to be provided.")
+        if config is None:
+            config = EncoderDecoderConfig.from_encoder_decoder_configs(encoder.config, decoder.config)
+        else:
+            if not isinstance(config, self.config_class):
+                raise ValueError(f"Config: {config} has to be of type {self.config_class}")
+
+        # An explicit cross-attention width on the decoder must match the encoder's hidden size,
+        # because no projection layer is created in that case (see `enc_to_dec_proj` below).
+        if config.decoder.cross_attention_hidden_size is not None:
+            if config.decoder.cross_attention_hidden_size != config.encoder.hidden_size:
+                raise ValueError(
+                    "If `cross_attention_hidden_size` is specified in the decoder's configuration, it has to be equal"
+                    f" to the encoder's `hidden_size`. Got {config.decoder.cross_attention_hidden_size} for"
+                    f" `config.decoder.cross_attention_hidden_size` and {config.encoder.hidden_size} for"
+                    " `config.encoder.hidden_size`."
+                )
+
+        # initialize with config
+        super().__init__(config)
+
+        # Sub-models that were not supplied are built (untrained) from the corresponding sub-config.
+        if encoder is None:
+            from ..auto.modeling_auto import AutoModel
+
+            encoder = AutoModel.from_config(config.encoder)
+
+        if decoder is None:
+            from ..auto.modeling_auto import AutoModelForCausalLM
+
+            decoder = AutoModelForCausalLM.from_config(config.decoder)
+
+        self.encoder = encoder
+        self.decoder = decoder
+
+        # Warn when a supplied sub-model carries a config that differs from the shared one that replaces it below.
+        if self.encoder.config.to_dict() != self.config.encoder.to_dict():
+            logger.warning(
+                f"Config of the encoder: {self.encoder.__class__} is overwritten by shared encoder config:"
+                f" {self.config.encoder}"
+            )
+        if self.decoder.config.to_dict() != self.config.decoder.to_dict():
+            logger.warning(
+                f"Config of the decoder: {self.decoder.__class__} is overwritten by shared decoder config:"
+                f" {self.config.decoder}"
+            )
+
+        # make sure that the individual model's config refers to the shared config
+        # so that the updates to the config will be synced
+        # update `_attn_implementation` because the attn is set in a deepcopied config within PreTrainedModel
+        self.config.encoder._attn_implementation = self.encoder.config._attn_implementation
+        self.config.decoder._attn_implementation = self.decoder.config._attn_implementation
+        self.encoder.config = self.config.encoder
+        self.decoder.config = self.config.decoder
+
+        # encoder outputs might need to be projected to different dimension for decoder
+        if (
+            self.encoder.config.hidden_size != self.decoder.config.hidden_size
+            and self.decoder.config.cross_attention_hidden_size is None
+        ):
+            self.enc_to_dec_proj = nn.Linear(self.encoder.config.hidden_size, self.decoder.config.hidden_size)
+
+        if self.encoder.get_output_embeddings() is not None:
+            raise ValueError(
+                f"The encoder {self.encoder} should not have a LM Head. Please use a model without LM Head"
+            )
+
+        # The decoder's `forward` must accept `encoder_hidden_states`, otherwise it cannot receive encoder output.
+        decoder_signature = set(inspect.signature(self.decoder.forward).parameters.keys())
+        if "encoder_hidden_states" not in decoder_signature:
+            raise ValueError(
+                "The selected decoder is not prepared for the encoder hidden states to be passed. Please see the "
+                "following discussion on GitHub: https://github.com/huggingface/transformers/issues/23350"
+            )
+
+        # tie encoder, decoder weights if config set accordingly
+        self.tie_weights()
+
+    def tie_weights(self):
+        """
+        Tie weights inside the encoder and inside the decoder and, when `config.tie_encoder_decoder` is set, tie the
+        encoder to the decoder's base model. The keys returned by that last step are stored on the instance as
+        `_dynamic_tied_weights_keys`.
+        """
+        self.encoder.tie_weights()
+        self.decoder.tie_weights()
+        # tie encoder & decoder if needed
+        if self.config.tie_encoder_decoder:
+            # tie encoder and decoder base model
+            decoder_base_model_prefix = self.decoder.base_model_prefix
+            tied_weights = self._tie_encoder_decoder_weights(
+                self.encoder,
+                self.decoder._modules[decoder_base_model_prefix],
+                self.decoder.base_model_prefix,
+                "encoder",
+            )
+            # Setting a dynamic variable instead of `_tied_weights_keys` because it's a class
+            # attributed not an instance member, therefore modifying it will modify the entire class
+            # Leading to issues on subsequent calls by different tests or subsequent calls.
+            self._dynamic_tied_weights_keys = tied_weights
+
+    def _init_weights(self, module):
+        """
+        Delegate initialization of `module` to the wrapped model (encoder first, then decoder) that contains it.
+        Modules belonging to neither sub-model are left untouched by this method.
+        """
+        if module in self.encoder.modules():
+            self.encoder._init_weights(module)
+        elif module in self.decoder.modules():
+            self.decoder._init_weights(module)
+
+    def get_encoder(self):
+        """Return the wrapped encoder model."""
+        return self.encoder
+
+    def get_input_embeddings(self):
+        """Return the input embeddings of the encoder."""
+        return self.encoder.get_input_embeddings()
+
+    def get_output_embeddings(self):
+        """Return the output embeddings of the decoder."""
+        return self.decoder.get_output_embeddings()
+
+    def set_output_embeddings(self, new_embeddings):
+        """Replace the decoder's output embeddings with `new_embeddings`."""
+        return self.decoder.set_output_embeddings(new_embeddings)
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
+        r"""
+        Load a pretrained [`EncoderDecoderModel`].
+
+        When `from_tf=True` is passed, the checkpoint is first loaded as a `TFEncoderDecoderModel`; its encoder and
+        decoder are saved separately to a temporary directory and the PyTorch model is rebuilt from those two
+        checkpoints with [`~EncoderDecoderModel.from_encoder_decoder_pretrained`]. Otherwise loading is delegated to
+        the parent class.
+
+        Example:
+
+        ```python
+        >>> from transformers import EncoderDecoderModel
+
+        >>> model = EncoderDecoderModel.from_pretrained("patrickvonplaten/bert2bert-cnn_dailymail-fp16")
+        ```"""
+
+        from_tf = kwargs.pop("from_tf", False)
+        if from_tf:
+            from transformers import TFEncoderDecoderModel
+
+            # a workaround to load from tensorflow checkpoint
+            # Using `_tf_model` won't work, because the weight names in the encoder/decoder of `_tf_model` get
+            # extended before saving those components. For example, The name of `_tf_model.encoder.vit` is
+            # `[top model name]/encoder/vit`, but the name of `tf_model.encoder.vit` is `[top model name]/vit`. The
+            # [top model name] is handled (stripped) by the conversion method, and the former case gets extra `encoder`,
+            # which should not occur when we want to save the components alone.
+            # There was a (very) ugly potential fix, which wasn't integrated to `transformers`: see
+            #   https://github.com/huggingface/transformers/pull/13222/commits/dbb3c9de76eee235791d2064094654637c99f36d#r697304245
+            #   (the change in `src/transformers/modeling_tf_utils.py`)
+            _tf_model = TFEncoderDecoderModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+            config = _tf_model.config
+
+            # Using `tf_model` instead
+            encoder = _tf_model.encoder.__class__(_tf_model.config.encoder)
+            decoder = _tf_model.decoder.__class__(_tf_model.config.decoder)
+            # Make sure models are built
+            encoder(encoder.dummy_inputs)
+            decoder(decoder.dummy_inputs)
+
+            # Get the variable correspondence between `_tf_model` and `encoder` and `decoder`
+            # (standalone models: drop the first name component; nested models: drop the first two)
+            encoder_variables = {}
+            for v in encoder.trainable_variables + encoder.non_trainable_variables:
+                encoder_variables["/".join(v.name.split("/")[1:])] = v
+            decoder_variables = {}
+            for v in decoder.trainable_variables + decoder.non_trainable_variables:
+                decoder_variables["/".join(v.name.split("/")[1:])] = v
+
+            _encoder_variables = {}
+            for v in _tf_model.encoder.trainable_variables + _tf_model.encoder.non_trainable_variables:
+                _encoder_variables["/".join(v.name.split("/")[2:])] = v
+            _decoder_variables = {}
+            for v in _tf_model.decoder.trainable_variables + _tf_model.decoder.non_trainable_variables:
+                _decoder_variables["/".join(v.name.split("/")[2:])] = v
+
+            # assign weight values to `encoder` and `decoder` from `_tf_model`
+            for name, v in encoder_variables.items():
+                v.assign(_encoder_variables[name])
+            for name, v in decoder_variables.items():
+                v.assign(_decoder_variables[name])
+
+            tf_model = TFEncoderDecoderModel(encoder=encoder, decoder=decoder)
+
+            # Deal with `enc_to_dec_proj`
+            if hasattr(_tf_model, "enc_to_dec_proj"):
+                tf_model(tf_model.dummy_inputs)
+                tf_model.enc_to_dec_proj.kernel.assign(_tf_model.enc_to_dec_proj.kernel)
+                tf_model.enc_to_dec_proj.bias.assign(_tf_model.enc_to_dec_proj.bias)
+
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                encoder_dir = os.path.join(tmpdirname, "encoder")
+                decoder_dir = os.path.join(tmpdirname, "decoder")
+                tf_model.encoder.save_pretrained(encoder_dir)
+                tf_model.decoder.save_pretrained(decoder_dir)
+
+                # The projection layer is not part of either saved sub-model, so carry it over by hand.
+                # The kernel is transposed before being used as the `nn.Linear` weight.
+                if hasattr(tf_model, "enc_to_dec_proj"):
+                    enc_to_dec_proj_weight = torch.transpose(
+                        torch.from_numpy(tf_model.enc_to_dec_proj.kernel.numpy()), 1, 0
+                    )
+                    enc_to_dec_proj_bias = torch.from_numpy(tf_model.enc_to_dec_proj.bias.numpy())
+
+                # Drop the TensorFlow models before the PyTorch model is loaded.
+                del _tf_model
+                del tf_model
+                gc.collect()
+
+                model = EncoderDecoderModel.from_encoder_decoder_pretrained(
+                    encoder_dir, decoder_dir, encoder_from_tf=True, decoder_from_tf=True
+                )
+                # This is only for copying some specific attributes of this particular model.
+                model.config = config
+
+                if hasattr(model, "enc_to_dec_proj"):
+                    model.enc_to_dec_proj.weight.data = enc_to_dec_proj_weight.contiguous()
+                    model.enc_to_dec_proj.bias.data = enc_to_dec_proj_bias.contiguous()
+
+                return model
+
+        return super().from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+
+    @classmethod
+    def from_encoder_decoder_pretrained(
+        cls,
+        encoder_pretrained_model_name_or_path: Optional[str] = None,
+        decoder_pretrained_model_name_or_path: Optional[str] = None,
+        *model_args,
+        **kwargs,
+    ) -> PreTrainedModel:
+        r"""
+        Instantiate an encoder and a decoder from one or two base classes of the library from pretrained model
+        checkpoints.
+
+
+        The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated). To train
+        the model, you need to first set it back in training mode with `model.train()`.
+
+        Params:
+            encoder_pretrained_model_name_or_path (`str`, *optional*):
+                Information necessary to initiate the encoder. Can be either:
+
+                    - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
+                    - A path to a *directory* containing model weights saved using
+                      [`~PreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
+                    - A path or url to a *tensorflow index checkpoint file* (e.g, `./tf_model/model.ckpt.index`). In
+                      this case, `from_tf` should be set to `True` and a configuration object should be provided as
+                      `config` argument. This loading path is slower than converting the TensorFlow checkpoint in a
+                      PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
+
+            decoder_pretrained_model_name_or_path (`str`, *optional*, defaults to `None`):
+                Information necessary to initiate the decoder. Can be either:
+
+                    - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
+                    - A path to a *directory* containing model weights saved using
+                      [`~PreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
+                    - A path or url to a *tensorflow index checkpoint file* (e.g, `./tf_model/model.ckpt.index`). In
+                      this case, `from_tf` should be set to `True` and a configuration object should be provided as
+                      `config` argument. This loading path is slower than converting the TensorFlow checkpoint in a
+                      PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
+
+            model_args (remaining positional arguments, *optional*):
+                All remaining positional arguments will be passed to the underlying model's `__init__` method.
+
+            kwargs (remaining dictionary of keyword arguments, *optional*):
+                Can be used to update the configuration object (after it being loaded) and initiate the model (e.g.,
+                `output_attentions=True`).
+
+                - To update the encoder configuration, use the prefix *encoder_* for each configuration parameter.
+                - To update the decoder configuration, use the prefix *decoder_* for each configuration parameter.
+                - To update the parent model configuration, do not use a prefix for each configuration parameter.
+
+                Behaves differently depending on whether a `config` is provided or automatically loaded.
+
+        Example:
+
+        ```python
+        >>> from transformers import EncoderDecoderModel
+
+        >>> # initialize a bert2bert from two pretrained BERT models. Note that the cross-attention layers will be randomly initialized
+        >>> model = EncoderDecoderModel.from_encoder_decoder_pretrained("google-bert/bert-base-uncased", "google-bert/bert-base-uncased")
+        >>> # saving model after fine-tuning
+        >>> model.save_pretrained("./bert2bert")
+        >>> # load fine-tuned model
+        >>> model = EncoderDecoderModel.from_pretrained("./bert2bert")
+        ```"""
+
+        # Split the keyword arguments by prefix; the prefix is stripped from the keys handed to each sub-model.
+        kwargs_encoder = {
+            argument[len("encoder_") :]: value for argument, value in kwargs.items() if argument.startswith("encoder_")
+        }
+
+        kwargs_decoder = {
+            argument[len("decoder_") :]: value for argument, value in kwargs.items() if argument.startswith("decoder_")
+        }
+
+        # remove encoder, decoder kwargs from kwargs
+        for key in kwargs_encoder:
+            del kwargs["encoder_" + key]
+        for key in kwargs_decoder:
+            del kwargs["decoder_" + key]
+
+        # Load and initialize the encoder and decoder
+        # The distinction between encoder and decoder at the model level is made
+        # by the value of the flag `is_decoder` that we need to set correctly.
+        # An already instantiated encoder may be passed as `encoder_model`.
+        encoder = kwargs_encoder.pop("model", None)
+        if encoder is None:
+            if encoder_pretrained_model_name_or_path is None:
+                raise ValueError(
+                    "If `encoder_model` is not defined as an argument, a `encoder_pretrained_model_name_or_path` has "
+                    "to be defined."
+                )
+
+            if "config" not in kwargs_encoder:
+                encoder_config, kwargs_encoder = AutoConfig.from_pretrained(
+                    encoder_pretrained_model_name_or_path, **kwargs_encoder, return_unused_kwargs=True
+                )
+
+                if encoder_config.is_decoder is True or encoder_config.add_cross_attention is True:
+                    logger.info(
+                        f"Initializing {encoder_pretrained_model_name_or_path} as a encoder model "
+                        "from a decoder model. Cross-attention and causal mask are disabled."
+                    )
+                    encoder_config.is_decoder = False
+                    encoder_config.add_cross_attention = False
+
+                kwargs_encoder["config"] = encoder_config
+
+            encoder = AutoModel.from_pretrained(encoder_pretrained_model_name_or_path, *model_args, **kwargs_encoder)
+
+        # An already instantiated decoder may be passed as `decoder_model`.
+        decoder = kwargs_decoder.pop("model", None)
+        if decoder is None:
+            if decoder_pretrained_model_name_or_path is None:
+                raise ValueError(
+                    "If `decoder_model` is not defined as an argument, a `decoder_pretrained_model_name_or_path` has "
+                    "to be defined."
+                )
+
+            if "config" not in kwargs_decoder:
+                decoder_config, kwargs_decoder = AutoConfig.from_pretrained(
+                    decoder_pretrained_model_name_or_path, **kwargs_decoder, return_unused_kwargs=True
+                )
+
+                if decoder_config.is_decoder is False or decoder_config.add_cross_attention is False:
+                    logger.info(
+                        f"Initializing {decoder_pretrained_model_name_or_path} as a decoder model. Cross attention"
+                        f" layers are added to {decoder_pretrained_model_name_or_path} and randomly initialized if"
+                        f" {decoder_pretrained_model_name_or_path}'s architecture allows for cross attention layers."
+                    )
+                    decoder_config.is_decoder = True
+                    decoder_config.add_cross_attention = True
+
+                kwargs_decoder["config"] = decoder_config
+
+            # A caller-supplied decoder config is not modified; only a warning is logged if its flags are off.
+            if kwargs_decoder["config"].is_decoder is False or kwargs_decoder["config"].add_cross_attention is False:
+                logger.warning(
+                    f"Decoder model {decoder_pretrained_model_name_or_path} is not initialized as a decoder. "
+                    f"In order to initialize {decoder_pretrained_model_name_or_path} as a decoder, "
+                    "make sure that the attributes `is_decoder` and `add_cross_attention` of `decoder_config` "
+                    "passed to `.from_encoder_decoder_pretrained(...)` are set to `True` or do not pass a "
+                    "`decoder_config` to `.from_encoder_decoder_pretrained(...)`"
+                )
+
+            decoder = AutoModelForCausalLM.from_pretrained(decoder_pretrained_model_name_or_path, **kwargs_decoder)
+
+        # instantiate config with corresponding kwargs
+        config = EncoderDecoderConfig.from_encoder_decoder_configs(encoder.config, decoder.config, **kwargs)
+        return cls(encoder=encoder, decoder=decoder, config=config)
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        decoder_input_ids: Optional[torch.LongTensor] = None,
+        decoder_attention_mask: Optional[torch.BoolTensor] = None,
+        encoder_outputs: Optional[tuple[torch.FloatTensor]] = None,
+        past_key_values: Optional[tuple[tuple[torch.FloatTensor]]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        **kwargs,
+    ) -> Union[tuple, Seq2SeqLMOutput]:
+        r"""
+        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
+            Indices of decoder input sequence tokens in the vocabulary.
+
+            Indices can be obtained using [`PreTrainedTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+
+            If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
+            `past_key_values`).
+
+            For training, `decoder_input_ids` are automatically created by the model by shifting the `labels` to the
+            right, replacing -100 by the `pad_token_id` and prepending them with the `decoder_start_token_id`.
+        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
+            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
+            be used by default.
+        decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, target_sequence_length, hidden_size)`, *optional*):
+            Optionally, instead of passing `decoder_input_ids` you can choose to directly pass an embedded
+            representation. This is useful if you want more control over how to convert `decoder_input_ids` indices
+            into associated vectors than the model's internal embedding lookup matrix.
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss for the decoder. Indices should be in `[-100, 0,
+            ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored
+            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
+
+        Examples:
+
+        ```python
+        >>> from transformers import EncoderDecoderModel, BertTokenizer
+        >>> import torch
+
+        >>> tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")
+        >>> model = EncoderDecoderModel.from_encoder_decoder_pretrained(
+        ...     "google-bert/bert-base-uncased", "google-bert/bert-base-uncased"
+        ... )  # initialize Bert2Bert from pre-trained checkpoints
+
+        >>> # training
+        >>> model.config.decoder_start_token_id = tokenizer.cls_token_id
+        >>> model.config.pad_token_id = tokenizer.pad_token_id
+        >>> model.config.vocab_size = model.config.decoder.vocab_size
+
+        >>> input_ids = tokenizer("This is a really long text", return_tensors="pt").input_ids
+        >>> labels = tokenizer("This is the corresponding summary", return_tensors="pt").input_ids
+        >>> outputs = model(input_ids=input_ids, labels=labels)
+        >>> loss, logits = outputs.loss, outputs.logits
+
+        >>> # save and load from pretrained
+        >>> model.save_pretrained("bert2bert")
+        >>> model = EncoderDecoderModel.from_pretrained("bert2bert")
+
+        >>> # generation
+        >>> generated = model.generate(input_ids)
+        ```"""
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # Extra keyword arguments prefixed with `decoder_` go to the decoder (prefix stripped), the rest to the encoder.
+        kwargs_encoder = {argument: value for argument, value in kwargs.items() if not argument.startswith("decoder_")}
+
+        kwargs_decoder = {
+            argument[len("decoder_") :]: value for argument, value in kwargs.items() if argument.startswith("decoder_")
+        }
+        # `num_items_in_batch` is unprefixed but is handed to the decoder, not the encoder.
+        if "num_items_in_batch" in kwargs_encoder:
+            kwargs_decoder["num_items_in_batch"] = kwargs_encoder.pop("num_items_in_batch", None)
+
+        if encoder_outputs is None:
+            encoder_outputs = self.encoder(
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+                inputs_embeds=inputs_embeds,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+                **kwargs_encoder,
+            )
+        elif isinstance(encoder_outputs, tuple):
+            encoder_outputs = BaseModelOutput(*encoder_outputs)
+
+        encoder_hidden_states = encoder_outputs[0]
+
+        # optionally project encoder_hidden_states
+        if (
+            self.encoder.config.hidden_size != self.decoder.config.hidden_size
+            and self.decoder.config.cross_attention_hidden_size is None
+        ):
+            encoder_hidden_states = self.enc_to_dec_proj(encoder_hidden_states)
+
+        # Derive decoder inputs from the labels when the caller supplied neither ids nor embeddings.
+        if (labels is not None) and (decoder_input_ids is None and decoder_inputs_embeds is None):
+            decoder_input_ids = shift_tokens_right(
+                labels, self.config.pad_token_id, self.config.decoder_start_token_id
+            )
+            if decoder_attention_mask is None:
+                # Same dtype/device as `decoder_input_ids`, as before, but cast directly instead of
+                # copy-constructing through `new_tensor(<tensor>)`, which PyTorch discourages and warns about.
+                decoder_attention_mask = (decoder_input_ids != self.config.pad_token_id).to(decoder_input_ids.dtype)
+
+        # Decode
+        decoder_outputs = self.decoder(
+            input_ids=decoder_input_ids,
+            attention_mask=decoder_attention_mask,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=attention_mask,
+            inputs_embeds=decoder_inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            use_cache=use_cache,
+            past_key_values=past_key_values,
+            return_dict=return_dict,
+            **kwargs_decoder,
+        )
+
+        # Compute loss independent from decoder (as some shift the logits inside them)
+        loss = None
+        if labels is not None:
+            warnings.warn(DEPRECATION_WARNING, FutureWarning)
+            logits = decoder_outputs.logits if return_dict else decoder_outputs[0]
+            loss_fct = CrossEntropyLoss()
+            # `reshape` (not `view`) so that non-contiguous label tensors are accepted, matching the logits.
+            loss = loss_fct(logits.reshape(-1, self.decoder.config.vocab_size), labels.reshape(-1))
+
+        if not return_dict:
+            # Caller-supplied `encoder_outputs` is a `BaseModelOutput` at this point, which cannot be
+            # concatenated to a tuple; convert it first. Outputs produced by the encoder above are already tuples.
+            if not isinstance(encoder_outputs, tuple):
+                encoder_outputs = encoder_outputs.to_tuple()
+            if loss is not None:
+                return (loss,) + decoder_outputs + encoder_outputs
+            else:
+                return decoder_outputs + encoder_outputs
+
+        return Seq2SeqLMOutput(
+            loss=loss,
+            logits=decoder_outputs.logits,
+            past_key_values=decoder_outputs.past_key_values,
+            decoder_hidden_states=decoder_outputs.hidden_states,
+            decoder_attentions=decoder_outputs.attentions,
+            cross_attentions=decoder_outputs.cross_attentions,
+            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
+            encoder_hidden_states=encoder_outputs.hidden_states,
+            encoder_attentions=encoder_outputs.attentions,
+        )
+
+    def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
+        """Return decoder input ids built from `labels` with `shift_tokens_right` and the config's token ids."""
+        return shift_tokens_right(labels, self.config.pad_token_id, self.config.decoder_start_token_id)
+
+    def resize_token_embeddings(self, *args, **kwargs):
+        """Always raise `NotImplementedError`; embeddings must be resized on the wrapped encoder or decoder."""
+        raise NotImplementedError(
+            "Resizing the embedding layers via the EncoderDecoderModel directly is not supported. Please use the"
+            " respective methods of the wrapped objects (model.encoder.resize_token_embeddings(...) or"
+            " model.decoder.resize_token_embeddings(...))"
+        )
+
+
+# Public names of this module (what a star-import exposes).
+__all__ = ["EncoderDecoderModel"]
diff --git a/phivenv/Lib/site-packages/transformers/models/encoder_decoder/modeling_flax_encoder_decoder.py b/phivenv/Lib/site-packages/transformers/models/encoder_decoder/modeling_flax_encoder_decoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..4a27c23c3c69ae928c73273c9397d5f5aad2b1c0
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/encoder_decoder/modeling_flax_encoder_decoder.py
@@ -0,0 +1,901 @@
+# coding=utf-8
+# Copyright 2021 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Classes to support Flax Encoder-Decoder architectures"""
+
+import os
+from typing import Optional, Union
+
+import flax.linen as nn
+import jax
+import jax.numpy as jnp
+from flax.core.frozen_dict import FrozenDict, freeze, unfreeze
+from flax.traverse_util import flatten_dict, unflatten_dict
+from jax import lax
+from jax.random import PRNGKey
+
+from ...modeling_flax_outputs import FlaxBaseModelOutput, FlaxCausalLMOutputWithCrossAttentions, FlaxSeq2SeqLMOutput
+from ...modeling_flax_utils import FlaxPreTrainedModel
+from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
+from ..auto.configuration_auto import AutoConfig
+from ..auto.modeling_flax_auto import FlaxAutoModel, FlaxAutoModelForCausalLM
+from .configuration_encoder_decoder import EncoderDecoderConfig
+
+
+# Module-level logger obtained from the library's logging utilities.
+logger = logging.get_logger(__name__)
+
+# Config class name handed to `replace_return_docstrings` by the model methods in this file.
+_CONFIG_FOR_DOC = "EncoderDecoderConfig"
+
+ENCODER_DECODER_START_DOCSTRING = r"""
+    This class can be used to initialize a sequence-to-sequence model with any pretrained autoencoding model as the
+    encoder and any pretrained autoregressive model as the decoder. The encoder is loaded via
+    [`~AutoModel.from_pretrained`] function and the decoder is loaded via [`~AutoModelForCausalLM.from_pretrained`]
+    function. Cross-attention layers are automatically added to the decoder and should be fine-tuned on a downstream
+    generative task, like summarization.
+
+    The effectiveness of initializing sequence-to-sequence models with pretrained checkpoints for sequence generation
+    tasks was shown in [Leveraging Pre-trained Checkpoints for Sequence Generation
+    Tasks](https://huggingface.co/papers/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn. Michael Matena, Yanqi
+    Zhou, Wei Li, Peter J. Liu.
+
+    After such an Encoder Decoder model has been trained/fine-tuned, it can be saved/loaded just like any other models
+    (see the examples for more information).
+
+    This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.)
+
+    This model is also a Flax Linen
+    [flax.nn.Module](https://flax.readthedocs.io/en/latest/_autosummary/flax.nn.module.html) subclass. Use it as a
+    regular Flax Module and refer to the Flax documentation for all matter related to general usage and behavior.
+
+    Parameters:
+        config ([`EncoderDecoderConfig`]): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the model weights.
+        dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`):
+            The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on GPUs) and
+            `jax.numpy.bfloat16` (on TPUs).
+
+            This can be used to enable mixed-precision training or half-precision inference on GPUs or TPUs. If
+            specified all the computation will be performed with the given `dtype`.
+
+            **Note that this only specifies the dtype of the computation and does not influence the dtype of model
+            parameters.**
+
+            If you wish to change the dtype of the model parameters, see [`~FlaxPreTrainedModel.to_fp16`] and
+            [`~FlaxPreTrainedModel.to_bf16`].
+"""
+
+# Argument documentation attached to `FlaxEncoderDecoderModel.__call__` through
+# `add_start_docstrings_to_model_forward`.
+ENCODER_DECODER_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`jnp.ndarray` of shape `(batch_size, sequence_length)`):
+            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
+            it.
+
+            Indices can be obtained using [`PreTrainedTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        attention_mask (`jnp.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
+        decoder_input_ids (`jnp.ndarray` of shape `(batch_size, target_sequence_length)`, *optional*):
+            Indices of decoder input sequence tokens in the vocabulary.
+
+            Indices can be obtained using [`PreTrainedTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are decoder input IDs?](../glossary#decoder-input-ids)
+
+            For sequence to sequence training, `decoder_input_ids` should be provided. `decoder_input_ids` should be
+            created outside of the model by shifting the `labels` to the right, replacing -100 by the `pad_token_id`
+            and prepending them with the `decoder_start_token_id`.
+        decoder_attention_mask (`jnp.ndarray` of shape `(batch_size, target_sequence_length)`, *optional*):
+            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
+            be used by default.
+        position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+            config.encoder.max_position_embeddings - 1]`.
+        decoder_position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+            Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the
+            range `[0, config.decoder.max_position_embeddings - 1]`.
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            If set to `True`, the model will return a [`~utils.FlaxSeq2SeqLMOutput`] instead of a plain tuple.
+"""
+
+# Argument documentation attached to `FlaxEncoderDecoderModel.encode` through `add_start_docstrings`.
+ENCODER_DECODER_ENCODE_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`jnp.ndarray` of shape `(batch_size, sequence_length)`):
+            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
+            it.
+
+            Indices can be obtained using [`PreTrainedTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        attention_mask (`jnp.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
+        position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+            config.encoder.max_position_embeddings - 1]`.
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            If set to `True`, the model will return a [`~utils.FlaxBaseModelOutput`] instead of a plain tuple.
+"""
+
+ENCODER_DECODER_DECODE_INPUTS_DOCSTRING = r"""
+    Args:
+        decoder_input_ids (`jnp.ndarray` of shape `(batch_size, target_sequence_length)`, *optional*):
+            Indices of decoder input sequence tokens in the vocabulary.
+
+            Indices can be obtained using [`PreTrainedTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are decoder input IDs?](../glossary#decoder-input-ids)
+
+            If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
+            `past_key_values`).
+
+            For sequence to sequence training, `decoder_input_ids` should be provided. `decoder_input_ids` should be
+            created outside of the model by shifting the `labels` to the right, replacing -100 by the `pad_token_id`
+            and prepending them with the `decoder_start_token_id`.
+        encoder_outputs (`tuple(tuple(jnp.ndarray)`):
+            Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`)
+            `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`, *optional*) is a sequence of
+            hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder.
+        encoder_attention_mask (`jnp.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
+        decoder_attention_mask (`jnp.ndarray` of shape `(batch_size, target_sequence_length)`, *optional*):
+            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
+            be used by default.
+        decoder_position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+            Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the
+            range `[0, config.decoder.max_position_embeddings - 1]`.
+        past_key_values (`dict[str, np.ndarray]`, *optional*, returned by `init_cache` or when passing previous `past_key_values`):
+            Dictionary of pre-computed hidden-states (key and values in the attention blocks) that can be used for fast
+            auto-regressive decoding. Pre-computed key and value hidden-states are of shape *[batch_size, max_length]*.
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            If set to `True`, the model will return a [`~utils.FlaxCausalLMOutputWithCrossAttentions`] instead of a
+            plain tuple.
+"""
+
+
+class FlaxEncoderDecoderModule(nn.Module):
+    """
+    Flax Linen module that chains an encoder module and a causal-LM decoder module, both looked up in the auto
+    mappings from the classes of `config.encoder` and `config.decoder`.
+    """
+
+    # Composite configuration exposing the `encoder` and `decoder` sub-configurations.
+    config: EncoderDecoderConfig
+    # dtype forwarded to the encoder, the decoder and the optional projection layer.
+    dtype: jnp.dtype = jnp.float32
+
+    def setup(self):
+        """Instantiate the encoder, the decoder and, when required, the encoder-to-decoder projection."""
+        encoder_config = self.config.encoder
+        decoder_config = self.config.decoder
+
+        # Copied from `modeling_hybrid_clip.py` with modifications.
+        from ...models.auto.modeling_flax_auto import FLAX_MODEL_FOR_CAUSAL_LM_MAPPING, FLAX_MODEL_MAPPING
+
+        # Pick the Flax module classes registered for the concrete encoder/decoder config classes.
+        encoder_module = FLAX_MODEL_MAPPING[encoder_config.__class__].module_class
+        decoder_module = FLAX_MODEL_FOR_CAUSAL_LM_MAPPING[decoder_config.__class__].module_class
+
+        self.encoder = encoder_module(encoder_config, dtype=self.dtype)
+        self.decoder = decoder_module(decoder_config, dtype=self.dtype)
+
+        # encoder outputs might need to be projected to different dimension for decoder
+        # (only when the hidden sizes differ and the decoder declares no `cross_attention_hidden_size`)
+        if (
+            self.encoder.config.hidden_size != self.decoder.config.hidden_size
+            and self.decoder.config.cross_attention_hidden_size is None
+        ):
+            self.enc_to_dec_proj = nn.Dense(
+                self.decoder.config.hidden_size,
+                kernel_init=jax.nn.initializers.normal(self.decoder.config.initializer_range),
+                dtype=self.dtype,
+            )
+        else:
+            # No projection: encoder hidden states are handed to the decoder unchanged.
+            self.enc_to_dec_proj = None
+
+    def _get_encoder_module(self):
+        """Return the encoder submodule."""
+        return self.encoder
+
+    def _get_projection_module(self):
+        """Return the encoder-to-decoder projection layer, or `None` when no projection was created."""
+        return self.enc_to_dec_proj
+
+    def _get_decoder_module(self):
+        """Return the decoder submodule."""
+        return self.decoder
+
+    def __call__(
+        self,
+        input_ids,
+        attention_mask,
+        decoder_input_ids,
+        decoder_attention_mask,
+        position_ids,
+        decoder_position_ids,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+        deterministic: bool = True,
+    ):
+        """
+        Run the encoder, optionally project its first output, then run the decoder on the decoder inputs with the
+        (projected) encoder hidden states as cross-attention input.
+
+        Returns:
+            The decoder outputs concatenated with the encoder outputs when `return_dict` is falsy, otherwise a
+            `FlaxSeq2SeqLMOutput` assembled from both.
+        """
+        encoder_outputs = self.encoder(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            deterministic=deterministic,
+        )
+
+        # Index 0 is used for both the tuple and the dict-style return of the encoder.
+        encoder_hidden_states = encoder_outputs[0]
+
+        # optionally project encoder_hidden_states
+        if self.enc_to_dec_proj is not None:
+            encoder_hidden_states = self.enc_to_dec_proj(encoder_hidden_states)
+
+        # The encoder's attention mask is reused as the decoder's `encoder_attention_mask`.
+        decoder_outputs = self.decoder(
+            input_ids=decoder_input_ids,
+            attention_mask=decoder_attention_mask,
+            position_ids=decoder_position_ids,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            deterministic=deterministic,
+        )
+
+        if not return_dict:
+            return decoder_outputs + encoder_outputs
+
+        # `encoder_last_hidden_state` is the raw encoder output, not the projected tensor fed to the decoder.
+        return FlaxSeq2SeqLMOutput(
+            logits=decoder_outputs.logits,
+            decoder_hidden_states=decoder_outputs.hidden_states,
+            decoder_attentions=decoder_outputs.attentions,
+            cross_attentions=decoder_outputs.cross_attentions,
+            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
+            encoder_hidden_states=encoder_outputs.hidden_states,
+            encoder_attentions=encoder_outputs.attentions,
+        )
+
+
+@add_start_docstrings(ENCODER_DECODER_START_DOCSTRING)
+class FlaxEncoderDecoderModel(FlaxPreTrainedModel):
+    r"""
+    [`FlaxEncoderDecoderModel`] is a generic model class that will be instantiated as a transformer architecture with
+    the module (flax.nn.Module) of one of the base model classes of the library as encoder module and another one as
+    decoder module when created with the [`~FlaxAutoModel.from_pretrained`] class method for the encoder and the
+    [`~FlaxAutoModelForCausalLM.from_pretrained`] class method for the decoder.
+    """
+
+    config_class = EncoderDecoderConfig
+    base_model_prefix = "encoder_decoder"
+    module_class = FlaxEncoderDecoderModule
+
+    def __init__(
+        self,
+        config: EncoderDecoderConfig,
+        input_shape: Optional[tuple] = None,
+        seed: int = 0,
+        dtype: jnp.dtype = jnp.float32,
+        _do_init: bool = True,
+        **kwargs,
+    ):
+        if input_shape is None:
+            input_shape = ((1, 1), (1, 1))
+
+        if not _do_init:
+            raise ValueError(
+                "`FlaxEncoderDecoderModel` cannot be created without initializing, `_do_init` must be `True`."
+            )
+
+        if config.decoder.cross_attention_hidden_size is not None:
+            if config.decoder.cross_attention_hidden_size != config.encoder.hidden_size:
+                raise ValueError(
+                    "If `cross_attention_hidden_size` is specified in the decoder's configuration, it has to be equal"
+                    f" to the encoder's `hidden_size`. Got {config.decoder.cross_attention_hidden_size} for"
+                    f" `config.decoder.cross_attention_hidden_size` and {config.encoder.hidden_size} for"
+                    " `config.encoder.hidden_size`."
+                )
+
+        module = self.module_class(config=config, dtype=dtype, **kwargs)
+        super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init)
+
+    def init_weights(self, rng: jax.random.PRNGKey, input_shape: tuple, params: FrozenDict = None) -> FrozenDict:
+        encoder_input_shape, decoder_input_shape = input_shape
+
+        # init input tensors
+        input_ids = jnp.zeros(encoder_input_shape, dtype="i4")
+        attention_mask = jnp.ones_like(input_ids)
+        decoder_input_ids = jnp.zeros(decoder_input_shape, dtype="i4")
+        decoder_attention_mask = jnp.ones_like(decoder_input_ids)
+
+        batch_size, sequence_length = input_ids.shape
+        position_ids = jnp.broadcast_to(jnp.arange(sequence_length)[None, :], (batch_size, sequence_length))
+
+        decoder_batch_size, decoder_sequence_length = decoder_input_ids.shape
+        if not decoder_batch_size == batch_size:
+            raise ValueError(
+                f"The inputs of encoder and decoder should have the same batch size, but got {batch_size} for encoder"
+                f" and {decoder_batch_size} for decoder."
+            )
+        decoder_position_ids = jnp.broadcast_to(
+            jnp.arange(decoder_sequence_length)[None, :], (decoder_batch_size, decoder_sequence_length)
+        )
+
+        params_rng, dropout_rng = jax.random.split(rng)
+        rngs = {"params": params_rng, "dropout": dropout_rng}
+
+        random_params = self.module.init(
+            rngs,
+            input_ids,
+            attention_mask,
+            decoder_input_ids,
+            decoder_attention_mask,
+            position_ids,
+            decoder_position_ids,
+        )["params"]
+
+        if params is not None:
+            random_params = flatten_dict(unfreeze(random_params))
+            params = flatten_dict(unfreeze(params))
+            for missing_key in self._missing_keys:
+                params[missing_key] = random_params[missing_key]
+            self._missing_keys = set()
+            return freeze(unflatten_dict(params))
+        else:
+            return random_params
+
+    def init_cache(self, batch_size, max_length, encoder_outputs):
+        r"""
+        Initialize the decoder on dummy inputs with `init_cache=True` and return the resulting `cache` collection.
+
+        Args:
+            batch_size (`int`):
+                batch_size used for fast auto-regressive decoding. Defines the batch size of the initialized cache.
+            max_length (`int`):
+                maximum possible length for auto-regressive decoding. Defines the sequence length of the initialized
+                cache.
+            encoder_outputs (`Union[FlaxBaseModelOutput, tuple(tuple(jnp.ndarray))]`):
+                `encoder_outputs` consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*:
+                `attentions`). `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`, *optional*)
+                is a sequence of hidden-states at the output of the last layer of the encoder. Used in the
+                cross-attention of the decoder.
+
+        Returns:
+            The unfrozen `cache` collection of the initialized variables.
+        """
+        # init input variables to retrieve cache
+        # (all-ones ids and mask of shape `(batch_size, max_length)`, positions 0..max_length-1 per row)
+        decoder_input_ids = jnp.ones((batch_size, max_length), dtype="i4")
+        decoder_attention_mask = jnp.ones_like(decoder_input_ids)
+        decoder_position_ids = jnp.broadcast_to(
+            jnp.arange(jnp.atleast_2d(decoder_input_ids).shape[-1]), decoder_input_ids.shape
+        )
+
+        def _decoder_forward(module, decoder_input_ids, decoder_attention_mask, decoder_position_ids, **kwargs):
+            # Passed as `method=` below so that `init` runs only the decoder submodule.
+            decoder_module = module._get_decoder_module()
+            return decoder_module(
+                input_ids=decoder_input_ids,
+                attention_mask=decoder_attention_mask,
+                position_ids=decoder_position_ids,
+                **kwargs,
+            )
+
+        # NOTE(review): `encoder_outputs[0]` is passed without going through the projection module, unlike
+        # `decode` -- presumably harmless because only the `cache` collection is kept; confirm.
+        # A fixed `PRNGKey(0)` is used for this initialization.
+        init_variables = self.module.init(
+            jax.random.PRNGKey(0),
+            decoder_input_ids=decoder_input_ids,
+            decoder_attention_mask=decoder_attention_mask,
+            decoder_position_ids=decoder_position_ids,
+            encoder_hidden_states=encoder_outputs[0],
+            init_cache=True,
+            method=_decoder_forward,  # we only need to call the decoder to init the cache
+        )
+        return unfreeze(init_variables["cache"])
+
+    @add_start_docstrings(ENCODER_DECODER_ENCODE_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=FlaxBaseModelOutput, config_class=_CONFIG_FOR_DOC)
+    def encode(
+        self,
+        input_ids: jnp.ndarray,
+        attention_mask: Optional[jnp.ndarray] = None,
+        position_ids: Optional[jnp.ndarray] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        train: bool = False,
+        params: Optional[dict] = None,
+        dropout_rng: PRNGKey = None,
+    ):
+        r"""
+        Returns:
+
+        Example:
+
+        ```python
+        >>> from transformers import FlaxEncoderDecoderModel, BertTokenizer
+
+        >>> # initialize a bert2gpt2 from pretrained BERT and GPT2 models. Note that the cross-attention layers will be randomly initialized
+        >>> model = FlaxEncoderDecoderModel.from_encoder_decoder_pretrained("google-bert/bert-base-cased", "openai-community/gpt2")
+
+        >>> tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-cased")
+
+        >>> text = "My friends are cool but they eat too many carbs."
+        >>> input_ids = tokenizer.encode(text, return_tensors="np")
+        >>> encoder_outputs = model.encode(input_ids)
+        ```"""
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.return_dict
+
+        if attention_mask is None:
+            attention_mask = jnp.ones_like(input_ids)
+        if position_ids is None:
+            batch_size, sequence_length = input_ids.shape
+            position_ids = jnp.broadcast_to(jnp.arange(sequence_length)[None, :], (batch_size, sequence_length))
+
+        # Handle any PRNG if needed
+        rngs = {}
+        if dropout_rng is not None:
+            rngs["dropout"] = dropout_rng
+
+        def _encoder_forward(module, input_ids, attention_mask, position_ids, **kwargs):
+            encode_module = module._get_encoder_module()
+            return encode_module(input_ids, attention_mask, position_ids, **kwargs)
+
+        outputs = self.module.apply(
+            {"params": params or self.params},
+            input_ids=jnp.array(input_ids, dtype="i4"),
+            attention_mask=jnp.array(attention_mask, dtype="i4"),
+            position_ids=jnp.array(position_ids, dtype="i4"),
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            deterministic=not train,
+            rngs=rngs,
+            method=_encoder_forward,
+        )
+
+        if return_dict:
+            outputs = FlaxBaseModelOutput(
+                last_hidden_state=outputs.last_hidden_state,
+                hidden_states=outputs.hidden_states,
+                attentions=outputs.attentions,
+            )
+
+        return outputs
+
+    @add_start_docstrings(ENCODER_DECODER_DECODE_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=FlaxCausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC)
+    def decode(
+        self,
+        decoder_input_ids,
+        encoder_outputs,
+        encoder_attention_mask: Optional[jnp.ndarray] = None,
+        decoder_attention_mask: Optional[jnp.ndarray] = None,
+        decoder_position_ids: Optional[jnp.ndarray] = None,
+        past_key_values: Optional[dict] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        train: bool = False,
+        params: Optional[dict] = None,
+        dropout_rng: PRNGKey = None,
+    ):
+        r"""
+        Returns:
+
+        Example:
+
+        ```python
+        >>> from transformers import FlaxEncoderDecoderModel, BertTokenizer
+        >>> import jax.numpy as jnp
+
+        >>> # initialize a bert2gpt2 from pretrained BERT and GPT2 models. Note that the cross-attention layers will be randomly initialized
+        >>> model = FlaxEncoderDecoderModel.from_encoder_decoder_pretrained("google-bert/bert-base-cased", "openai-community/gpt2")
+
+        >>> tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-cased")
+
+        >>> text = "My friends are cool but they eat too many carbs."
+        >>> input_ids = tokenizer.encode(text, max_length=1024, return_tensors="np")
+        >>> encoder_outputs = model.encode(input_ids)
+
+        >>> decoder_start_token_id = model.config.decoder.bos_token_id
+        >>> decoder_input_ids = jnp.ones((input_ids.shape[0], 1), dtype="i4") * decoder_start_token_id
+
+        >>> outputs = model.decode(decoder_input_ids, encoder_outputs)
+        >>> logits = outputs.logits
+        ```"""
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.return_dict
+
+        encoder_hidden_states = encoder_outputs[0]
+        if encoder_attention_mask is None:
+            batch_size, sequence_length = encoder_hidden_states.shape[:2]
+            encoder_attention_mask = jnp.ones((batch_size, sequence_length))
+
+        batch_size, sequence_length = decoder_input_ids.shape
+        if decoder_attention_mask is None:
+            decoder_attention_mask = jnp.ones((batch_size, sequence_length))
+
+        if decoder_position_ids is None:
+            if past_key_values is not None:
+                raise ValueError("Make sure to provide `decoder_position_ids` when passing `past_key_values`.")
+
+            decoder_position_ids = jnp.broadcast_to(
+                jnp.arange(sequence_length)[None, :], (batch_size, sequence_length)
+            )
+
+        # Handle any PRNG if needed
+        rngs = {}
+        if dropout_rng is not None:
+            rngs["dropout"] = dropout_rng
+
+        inputs = {"params": params or self.params}
+
+        # if past_key_values are passed then cache is already initialized a private flag init_cache has to be
+        # passed down to ensure cache is used. It has to be made sure that cache is marked as mutable so that
+        # it can be changed by FlaxBartAttention module
+        if past_key_values:
+            inputs["cache"] = past_key_values
+            mutable = ["cache"]
+        else:
+            mutable = False
+
+        def _decoder_forward(
+            module, decoder_input_ids, decoder_attention_mask, decoder_position_ids, encoder_hidden_states, **kwargs
+        ):
+            projection_module = module._get_projection_module()
+            decoder_module = module._get_decoder_module()
+
+            # optionally project encoder_hidden_states
+            if projection_module is not None:
+                encoder_hidden_states = projection_module(encoder_hidden_states)
+
+            return decoder_module(
+                decoder_input_ids,
+                decoder_attention_mask,
+                decoder_position_ids,
+                encoder_hidden_states=encoder_hidden_states,
+                **kwargs,
+            )
+
+        outputs = self.module.apply(
+            inputs,
+            decoder_input_ids=jnp.array(decoder_input_ids, dtype="i4"),
+            decoder_attention_mask=jnp.array(decoder_attention_mask, dtype="i4"),
+            decoder_position_ids=jnp.array(decoder_position_ids, dtype="i4"),
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=jnp.array(encoder_attention_mask, dtype="i4"),
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            deterministic=not train,
+            rngs=rngs,
+            mutable=mutable,
+            method=_decoder_forward,
+        )
+
+        # add updated cache to model output
+        if past_key_values is not None and return_dict:
+            outputs, past = outputs
+            outputs["past_key_values"] = unfreeze(past["cache"])
+            return outputs
+        elif past_key_values is not None and not return_dict:
+            outputs, past = outputs
+            outputs = outputs[:1] + (unfreeze(past["cache"]),) + outputs[1:]
+
+        return outputs
+
+    @add_start_docstrings_to_model_forward(ENCODER_DECODER_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=FlaxSeq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
+    def __call__(
+        self,
+        input_ids: jnp.ndarray,
+        attention_mask: Optional[jnp.ndarray] = None,
+        decoder_input_ids: Optional[jnp.ndarray] = None,
+        decoder_attention_mask: Optional[jnp.ndarray] = None,
+        position_ids: Optional[jnp.ndarray] = None,
+        decoder_position_ids: Optional[jnp.ndarray] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        train: bool = False,
+        params: Optional[dict] = None,
+        dropout_rng: PRNGKey = None,
+    ):
+        r"""
+        Returns:
+
+        Examples:
+
+        ```python
+        >>> from transformers import FlaxEncoderDecoderModel, BertTokenizer, GPT2Tokenizer
+
+        >>> # load a fine-tuned bert2gpt2 model
+        >>> model = FlaxEncoderDecoderModel.from_pretrained("patrickvonplaten/bert2gpt2-cnn_dailymail-fp16")
+        >>> # load input & output tokenizer
+        >>> tokenizer_input = BertTokenizer.from_pretrained("google-bert/bert-base-cased")
+        >>> tokenizer_output = GPT2Tokenizer.from_pretrained("openai-community/gpt2")
+
+        >>> article = '''Sigma Alpha Epsilon is under fire for a video showing party-bound fraternity members
+        >>> singing a racist chant. SAE's national chapter suspended the students,
+        >>> but University of Oklahoma President David Boren took it a step further,
+        >>> saying the university's affiliation with the fraternity is permanently done.'''
+
+        >>> input_ids = tokenizer_input(article, add_special_tokens=True, return_tensors="np").input_ids
+
+        >>> # use GPT2's eos_token as the pad as well as eos token
+        >>> model.config.eos_token_id = model.config.decoder.eos_token_id
+        >>> model.config.pad_token_id = model.config.eos_token_id
+
+        >>> sequences = model.generate(input_ids, num_beams=4, max_length=12).sequences
+
+        >>> summary = tokenizer_output.batch_decode(sequences, skip_special_tokens=True)[0]
+        >>> assert summary == "SAS Alpha Epsilon suspended Sigma Alpha Epsilon members"
+        ```
+        """
+
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.return_dict
+
+        # prepare encoder inputs
+        if attention_mask is None:
+            attention_mask = jnp.ones_like(input_ids)
+        if position_ids is None:
+            batch_size, sequence_length = input_ids.shape
+            position_ids = jnp.broadcast_to(jnp.arange(sequence_length)[None, :], (batch_size, sequence_length))
+
+        # prepare decoder inputs
+        if decoder_input_ids is None:
+            raise ValueError(
+                "`decoder_input_ids` cannot be `None`. For sequence to sequence training, `decoder_input_ids` must"
+                " be specified as an input argument."
+            )
+        if decoder_attention_mask is None:
+            decoder_attention_mask = jnp.ones_like(decoder_input_ids)
+        if decoder_position_ids is None:
+            batch_size, sequence_length = decoder_input_ids.shape
+            decoder_position_ids = jnp.broadcast_to(
+                jnp.arange(sequence_length)[None, :], (batch_size, sequence_length)
+            )
+
+        # Handle any PRNG if needed
+        rngs = {"dropout": dropout_rng} if dropout_rng is not None else {}
+
+        return self.module.apply(
+            {"params": params or self.params},
+            input_ids=jnp.array(input_ids, dtype="i4"),
+            attention_mask=jnp.array(attention_mask, dtype="i4"),
+            decoder_input_ids=jnp.array(decoder_input_ids, dtype="i4"),
+            decoder_attention_mask=jnp.array(decoder_attention_mask, dtype="i4"),
+            position_ids=jnp.array(position_ids, dtype="i4"),
+            decoder_position_ids=jnp.array(decoder_position_ids, dtype="i4"),
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            deterministic=not train,
+            rngs=rngs,
+        )
+
+    def prepare_inputs_for_generation(
+        self,
+        decoder_input_ids,
+        max_length,
+        attention_mask: Optional[jax.Array] = None,
+        decoder_attention_mask: Optional[jax.Array] = None,
+        encoder_outputs=None,
+        **kwargs,
+    ):
+        # initializing the cache
+        batch_size, seq_length = decoder_input_ids.shape
+
+        past_key_values = self.init_cache(batch_size, max_length, encoder_outputs)
+        # Note that usually one would have to put 0's in the attention_mask for x > input_ids.shape[-1] and x < cache_length.
+        # But since the decoder uses a causal mask, those positions are masked anyways.
+        # Thus we can create a single static attention_mask here, which is more efficient for compilation
+        extended_attention_mask = jnp.ones((batch_size, max_length), dtype="i4")
+        if decoder_attention_mask is not None:
+            decoder_position_ids = decoder_attention_mask.cumsum(axis=-1) - 1
+            extended_attention_mask = lax.dynamic_update_slice(extended_attention_mask, decoder_attention_mask, (0, 0))
+        else:
+            decoder_position_ids = jnp.broadcast_to(
+                jnp.arange(seq_length, dtype="i4")[None, :], (batch_size, seq_length)
+            )
+
+        return {
+            "past_key_values": past_key_values,
+            "encoder_outputs": encoder_outputs,
+            "encoder_attention_mask": attention_mask,
+            "decoder_attention_mask": extended_attention_mask,
+            "decoder_position_ids": decoder_position_ids,
+        }
+
+    def update_inputs_for_generation(self, model_outputs, model_kwargs):
+        model_kwargs["past_key_values"] = model_outputs.past_key_values
+        model_kwargs["decoder_position_ids"] = model_kwargs["decoder_position_ids"][:, -1:] + 1
+        return model_kwargs
+
+    @classmethod
+    def from_encoder_decoder_pretrained(
+        cls,
+        encoder_pretrained_model_name_or_path: Optional[Union[str, os.PathLike]] = None,
+        decoder_pretrained_model_name_or_path: Optional[Union[str, os.PathLike]] = None,
+        *model_args,
+        **kwargs,
+    ) -> FlaxPreTrainedModel:
+        r"""
+        Instantiate an encoder and a decoder from one or two base classes of the library from pretrained model
+        checkpoints.
+
+        Params:
+            encoder_pretrained_model_name_or_path (`Union[str, os.PathLike]`, *optional*):
+                Information necessary to initiate the encoder. Can be either:
+
+                    - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
+                    - A path to a *directory* containing model weights saved using
+                      [`~FlaxPreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
+
+            decoder_pretrained_model_name_or_path (`Union[str, os.PathLike]`, *optional*, defaults to `None`):
+                Information necessary to initiate the decoder. Can be either:
+
+                    - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
+                    - A path to a *directory* containing model weights saved using
+                      [`~FlaxPreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
+
+            model_args (remaining positional arguments, *optional*):
+                All remaining positional arguments will be passed to the underlying model's `__init__` method.
+
+            kwargs (remaining dictionary of keyword arguments, *optional*):
+                Can be used to update the configuration object (after it being loaded) and initiate the model (e.g.,
+                `output_attentions=True`).
+
+                - To update the encoder configuration, use the prefix *encoder_* for each configuration parameter.
+                - To update the decoder configuration, use the prefix *decoder_* for each configuration parameter.
+                - To update the parent model configuration, do not use a prefix for each configuration parameter.
+
+                Behaves differently depending on whether a `config` is provided or automatically loaded.
+
+        Example:
+
+        ```python
+        >>> from transformers import FlaxEncoderDecoderModel
+
+        >>> # initialize a bert2gpt2 from pretrained BERT and GPT2 models. Note that the cross-attention layers will be randomly initialized
+        >>> model = FlaxEncoderDecoderModel.from_encoder_decoder_pretrained("google-bert/bert-base-cased", "openai-community/gpt2")
+        >>> # saving model after fine-tuning
+        >>> model.save_pretrained("./bert2gpt2")
+        >>> # load fine-tuned model
+        >>> model = FlaxEncoderDecoderModel.from_pretrained("./bert2gpt2")
+        ```"""
+
+        kwargs_encoder = {
+            argument[len("encoder_") :]: value for argument, value in kwargs.items() if argument.startswith("encoder_")
+        }
+
+        kwargs_decoder = {
+            argument[len("decoder_") :]: value for argument, value in kwargs.items() if argument.startswith("decoder_")
+        }
+
+        # remove encoder, decoder kwargs from kwargs
+        for key in kwargs_encoder:
+            del kwargs["encoder_" + key]
+        for key in kwargs_decoder:
+            del kwargs["decoder_" + key]
+
+        # Load and initialize the encoder and decoder
+        # The distinction between encoder and decoder at the model level is made
+        # by the value of the flag `is_decoder` that we need to set correctly.
+        encoder = kwargs_encoder.pop("model", None)
+        if encoder is None:
+            if encoder_pretrained_model_name_or_path is None:
+                raise ValueError(
+                    "If `encoder_model` is not defined as an argument, a `encoder_pretrained_model_name_or_path` has "
+                    "to be defined."
+                )
+
+            if "config" not in kwargs_encoder:
+                encoder_config, kwargs_encoder = AutoConfig.from_pretrained(
+                    encoder_pretrained_model_name_or_path, **kwargs_encoder, return_unused_kwargs=True
+                )
+                if encoder_config.is_decoder is True or encoder_config.add_cross_attention is True:
+                    logger.info(
+                        f"Initializing {encoder_pretrained_model_name_or_path} as a encoder model "
+                        "from a decoder model. Cross-attention and causal mask are disabled."
+                    )
+                    encoder_config.is_decoder = False
+                    encoder_config.add_cross_attention = False
+
+                kwargs_encoder["config"] = encoder_config
+
+            encoder = FlaxAutoModel.from_pretrained(
+                encoder_pretrained_model_name_or_path, *model_args, **kwargs_encoder
+            )
+
+        decoder = kwargs_decoder.pop("model", None)
+        if decoder is None:
+            if decoder_pretrained_model_name_or_path is None:
+                raise ValueError(
+                    "If `decoder_model` is not defined as an argument, a `decoder_pretrained_model_name_or_path` has "
+                    "to be defined."
+                )
+
+            if "config" not in kwargs_decoder:
+                decoder_config, kwargs_decoder = AutoConfig.from_pretrained(
+                    decoder_pretrained_model_name_or_path, **kwargs_decoder, return_unused_kwargs=True
+                )
+                if decoder_config.is_decoder is False or decoder_config.add_cross_attention is False:
+                    logger.info(
+                        f"Initializing {decoder_pretrained_model_name_or_path} as a decoder model. Cross attention"
+                        f" layers are added to {decoder_pretrained_model_name_or_path} and randomly initialized if"
+                        f" {decoder_pretrained_model_name_or_path}'s architecture allows for cross attention layers."
+                    )
+                    decoder_config.is_decoder = True
+                    decoder_config.add_cross_attention = True
+
+                kwargs_decoder["config"] = decoder_config
+
+            if kwargs_decoder["config"].is_decoder is False or kwargs_decoder["config"].add_cross_attention is False:
+                logger.warning(
+                    f"Decoder model {decoder_pretrained_model_name_or_path} is not initialized as a decoder. "
+                    f"In order to initialize {decoder_pretrained_model_name_or_path} as a decoder, "
+                    "make sure that the attributes `is_decoder` and `add_cross_attention` of `decoder_config` "
+                    "passed to `.from_encoder_decoder_pretrained(...)` are set to `True` or do not pass a "
+                    "`decoder_config` to `.from_encoder_decoder_pretrained(...)`"
+                )
+
+            decoder = FlaxAutoModelForCausalLM.from_pretrained(decoder_pretrained_model_name_or_path, **kwargs_decoder)
+
+        # instantiate config with corresponding kwargs
+        dtype = kwargs.pop("dtype", jnp.float32)
+        config = EncoderDecoderConfig.from_encoder_decoder_configs(encoder.config, decoder.config, **kwargs)
+
+        # init model
+        model = cls(config, dtype=dtype)
+        model.params["encoder"] = encoder.params
+        model.params["decoder"] = decoder.params
+
+        return model
+
+
+__all__ = ["FlaxEncoderDecoderModel"]
diff --git a/phivenv/Lib/site-packages/transformers/models/encoder_decoder/modeling_tf_encoder_decoder.py b/phivenv/Lib/site-packages/transformers/models/encoder_decoder/modeling_tf_encoder_decoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..7e5343d200499e1f3b8ba26f8d70924c2999a2fc
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/encoder_decoder/modeling_tf_encoder_decoder.py
@@ -0,0 +1,661 @@
+# coding=utf-8
+# Copyright 2021 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Classes to support TF Encoder-Decoder architectures"""
+
+from __future__ import annotations
+
+import inspect
+import re
+import warnings
+
+import numpy as np
+import tensorflow as tf
+
+from ...configuration_utils import PretrainedConfig
+from ...modeling_tf_outputs import TFBaseModelOutput, TFSeq2SeqLMOutput
+from ...modeling_tf_utils import (
+    TFCausalLanguageModelingLoss,
+    TFModelInputType,
+    TFPreTrainedModel,
+    get_initializer,
+    keras,
+    unpack_inputs,
+)
+from ...tf_utils import shape_list
+from ...utils import (
+    ModelOutput,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+    replace_return_docstrings,
+)
+from ..auto.configuration_auto import AutoConfig
+from ..auto.modeling_tf_auto import TFAutoModel, TFAutoModelForCausalLM
+from .configuration_encoder_decoder import EncoderDecoderConfig
+
+
+logger = logging.get_logger(__name__)
+
+_CONFIG_FOR_DOC = "EncoderDecoderConfig"
+
+DEPRECATION_WARNING = (
+    "Version v4.17.0 introduces a better way to train encoder-decoder models by computing the loss inside the"
+    " encoder-decoder framework rather than in the decoder itself. You may observe training discrepancies if"
+    " fine-tuning a model trained with versions anterior to 4.17.0. The decoder_input_ids are now created based on the"
+    " labels, no need to pass them yourself anymore."
+)
+
+ENCODER_DECODER_START_DOCSTRING = r"""
+    This class can be used to initialize a sequence-to-sequence model with any pretrained autoencoding model as the
+    encoder and any pretrained autoregressive model as the decoder. The encoder is loaded via
+    [`~TFAutoModel.from_pretrained`] function and the decoder is loaded via [`~TFAutoModelForCausalLM.from_pretrained`]
+    function. Cross-attention layers are automatically added to the decoder and should be fine-tuned on a downstream
+    generative task, like summarization.
+
+    The effectiveness of initializing sequence-to-sequence models with pretrained checkpoints for sequence generation
+    tasks was shown in [Leveraging Pre-trained Checkpoints for Sequence Generation
+    Tasks](https://huggingface.co/papers/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn. Michael Matena, Yanqi
+    Zhou, Wei Li, Peter J. Liu.
+
+    After such an Encoder Decoder model has been trained/fine-tuned, it can be saved/loaded just like any other models
+    (see the examples for more information).
+
+    This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.)
+
+    This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
+    as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matters related to general usage and
+    behavior.
+
+    Parameters:
+        config ([`EncoderDecoderConfig`]): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+ENCODER_DECODER_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`np.ndarray`, `tf.Tensor`, `list[tf.Tensor]`, `dict[str, tf.Tensor]` or `dict[str, np.ndarray]` and each example must have the shape `({0})`):
+            Indices of input sequence tokens in the vocabulary.
+
+            Indices can be obtained using [`PreTrainedTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        attention_mask (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
+        decoder_input_ids (`np.ndarray` or `tf.Tensor` of shape `(batch_size, target_sequence_length)`, *optional*):
+            Indices of decoder input sequence tokens in the vocabulary.
+
+            Indices can be obtained using [`PreTrainedTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+
+            If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
+            `past_key_values`).
+
+            Provide for sequence to sequence training to the decoder. Indices can be obtained using
+            [`PreTrainedTokenizer`]. See [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
+            details.
+        decoder_attention_mask (`np.ndarray` or `tf.Tensor` of shape `(batch_size, target_sequence_length)`, *optional*):
+            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
+            be used by default.
+        encoder_outputs (`tuple(tuple(tf.Tensor))`, *optional*):
+            This tuple must consist of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`)
+            `last_hidden_state` (`tf.Tensor` of shape `({0}, hidden_size)`) is a tensor of hidden-states at the output
+            of the last layer of the encoder. Used in the cross-attention of the decoder.
+        past_key_values (`tuple(tuple(tf.Tensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
+            Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
+
+            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
+            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
+            `decoder_input_ids` of shape `({0})`.
+        inputs_embeds (`np.ndarray` or `tf.Tensor` of shape `({0}, hidden_size)`, *optional*):
+            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+            model's internal embedding lookup matrix.
+        decoder_inputs_embeds (`np.ndarray` or `tf.Tensor` of shape `(batch_size, target_sequence_length, hidden_size)`, *optional*):
+            Optionally, instead of passing `decoder_input_ids` you can choose to directly pass an embedded
+            representation. This is useful if you want more control over how to convert `decoder_input_ids` indices
+            into associated vectors than the model's internal embedding lookup matrix.
+        labels (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*):
+            Labels for computing the masked language modeling loss for the decoder. Indices should be in `[-100, 0,
+            ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored
+            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
+        use_cache (`bool`, *optional*):
+            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+            `past_key_values`).
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            If set to `True`, the model will return a [`~utils.Seq2SeqLMOutput`] instead of a plain tuple.
+        training (`bool`, *optional*, defaults to `False`):
+            Whether or not to use the model in training mode (some modules like dropout modules have different
+            behaviors between training and evaluation).
+        kwargs (*optional*): Remaining dictionary of keyword arguments. Keyword arguments come in two flavors:
+
+            - Without a prefix which will be input as `**encoder_kwargs` for the encoder forward function.
+            - With a *decoder_* prefix which will be input as `**decoder_kwargs` for the decoder forward function.
+"""
+
+
+def shift_tokens_right(input_ids: tf.Tensor, pad_token_id: int, decoder_start_token_id: int):
+    if pad_token_id is None:
+        raise ValueError("Make sure to set the pad_token_id attribute of the model's configuration.")
+    pad_token_id = tf.cast(pad_token_id, input_ids.dtype)
+
+    if decoder_start_token_id is None:
+        raise ValueError("Make sure to set the decoder_start_token_id attribute of the model's configuration.")
+    decoder_start_token_id = tf.cast(decoder_start_token_id, input_ids.dtype)
+
+    start_tokens = tf.fill((shape_list(input_ids)[0], 1), decoder_start_token_id)
+    shifted_input_ids = tf.concat([start_tokens, input_ids[:, :-1]], -1)
+    # replace possible -100 values in labels by `pad_token_id`
+    shifted_input_ids = tf.where(
+        shifted_input_ids == -100, tf.fill(shape_list(shifted_input_ids), pad_token_id), shifted_input_ids
+    )
+
+    # Verify that `labels` has only non-negative values and -100
+    assert_gte0 = tf.debugging.assert_greater_equal(shifted_input_ids, tf.constant(0, dtype=input_ids.dtype))
+
+    # Make sure the assertion op is called by wrapping the result in an identity no-op
+    with tf.control_dependencies([assert_gte0]):
+        shifted_input_ids = tf.identity(shifted_input_ids)
+
+    return shifted_input_ids
+
+
+@add_start_docstrings(ENCODER_DECODER_START_DOCSTRING)
+class TFEncoderDecoderModel(TFPreTrainedModel, TFCausalLanguageModelingLoss):
+    r"""
+    [`TFEncoderDecoderModel`] is a generic model class that will be instantiated as a transformer architecture with one
+    of the base model classes of the library as encoder and another one as decoder when created with the
+    [`~TFAutoModel.from_pretrained`] class method for the encoder and [`~TFAutoModelForCausalLM.from_pretrained`] class
+    method for the decoder.
+    """
+
+    config_class = EncoderDecoderConfig
+    base_model_prefix = "encoder_decoder"
+    load_weight_prefix = "tf_encoder_decoder_model"
+
+    def __init__(
+        self,
+        config: PretrainedConfig | None = None,
+        encoder: TFPreTrainedModel | None = None,
+        decoder: TFPreTrainedModel | None = None,
+    ):
+        if config is None and (encoder is None or decoder is None):
+            raise ValueError("Either a configuration or an encoder and a decoder has to be provided.")
+        if config is None:
+            config = EncoderDecoderConfig.from_encoder_decoder_configs(encoder.config, decoder.config)
+        else:
+            if not isinstance(config, self.config_class):
+                raise ValueError(f"config: {config} has to be of type {self.config_class}")
+
+        if config.decoder.cross_attention_hidden_size is not None:
+            if config.decoder.cross_attention_hidden_size != config.encoder.hidden_size:
+                raise ValueError(
+                    "If `cross_attention_hidden_size` is specified in the decoder's configuration, it has to be equal"
+                    f" to the encoder's `hidden_size`. Got {config.decoder.cross_attention_hidden_size} for"
+                    f" `config.decoder.cross_attention_hidden_size` and {config.encoder.hidden_size} for"
+                    " `config.encoder.hidden_size`."
+                )
+
+        # initialize with config
+        super().__init__(config)
+
+        if encoder is None:
+            encoder = TFAutoModel.from_config(config.encoder, name="encoder")
+
+        if decoder is None:
+            decoder = TFAutoModelForCausalLM.from_config(config.decoder, name="decoder")
+
+        self.encoder = encoder
+        self.decoder = decoder
+
+        if self.encoder.config.to_dict() != self.config.encoder.to_dict():
+            logger.warning(
+                f"Config of the encoder: {self.encoder.__class__} is overwritten by shared encoder config:"
+                f" {self.config.encoder}"
+            )
+        if self.decoder.config.to_dict() != self.config.decoder.to_dict():
+            logger.warning(
+                f"Config of the decoder: {self.decoder.__class__} is overwritten by shared decoder config:"
+                f" {self.config.decoder}"
+            )
+
+        # make sure that the individual model's config refers to the shared config
+        # so that the updates to the config will be synced
+        self.encoder.config = self.config.encoder
+        self.decoder.config = self.config.decoder
+
+        # encoder outputs might need to be projected to different dimension for decoder
+        if (
+            self.encoder.config.hidden_size != self.decoder.config.hidden_size
+            and self.decoder.config.cross_attention_hidden_size is None
+        ):
+            self.enc_to_dec_proj = keras.layers.Dense(
+                units=self.decoder.config.hidden_size,
+                kernel_initializer=get_initializer(config.encoder.initializer_range),
+                name="enc_to_dec_proj",
+            )
+
+        if self.encoder.get_output_embeddings() is not None:
+            raise ValueError(
+                f"The encoder {self.encoder} should not have a LM Head. Please use a model without LM Head"
+            )
+
+        decoder_signature = set(inspect.signature(self.decoder.call).parameters.keys())
+        if "encoder_hidden_states" not in decoder_signature:
+            raise ValueError(
+                "The selected decoder is not prepared for the encoder hidden states to be passed. Please see the "
+                "following discussion on GitHub: https://github.com/huggingface/transformers/issues/23350"
+            )
+
+    def get_encoder(self):
+        return self.encoder
+
+    def get_input_embeddings(self):
+        return self.encoder.get_input_embeddings()
+
+    def get_output_embeddings(self):
+        return self.decoder.get_output_embeddings()
+
+    def set_output_embeddings(self, new_embeddings):
+        return self.decoder.set_output_embeddings(new_embeddings)
+
+    def tf_to_pt_weight_rename(self, tf_weight):
+        # Matt: The TF and PT weights don't align because our TF base classes have an extra layer compared to PT models
+        # (the main model stem is in the MainLayer class). If we remove that layer, then weight names sync up as normal.
+        # However, the name of that extra layer is the name of the MainLayer in the base model. We make the assumption
+        # here that the config model_type is the same as the name of the MainLayer. I don't know of anywhere that's
+        # not the case, and I wasn't sure how else to go from the config to the correct MainLayer name!
+
+        # This override is only needed in the case where we're crossloading weights from PT. However, since weights are
+        # often safetensors now, we don't know if we're going to be crossloading until we sniff the weights file.
+        # Therefore, we specify tf_to_pt_weight_rename anyway, and let the super method figure out if it needs it
+        # or not.
+        encoder_model_type = self.config.encoder.model_type
+        if "encoder" in tf_weight and "decoder" not in tf_weight:
+            return (re.sub(rf"encoder\.{encoder_model_type}\.", "encoder.", tf_weight),)
+        else:
+            return (tf_weight,)
+
+    @classmethod
+    def from_encoder_decoder_pretrained(
+        cls,
+        encoder_pretrained_model_name_or_path: str | None = None,
+        decoder_pretrained_model_name_or_path: str | None = None,
+        *model_args,
+        **kwargs,
+    ) -> TFPreTrainedModel:
+        r"""
+        Instantiate an encoder and a decoder from one or two base classes of the library from pretrained model
+        checkpoints.
+
+
+        Params:
+            encoder_pretrained_model_name_or_path (`str`, *optional*):
+                Information necessary to initiate the encoder. Can be either:
+
+                    - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
+                    - A path to a *directory* containing model weights saved using
+                      [`~TFPreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
+                    - A path or url to a *pytorch index checkpoint file* (e.g, `./pt_model/`). In this case,
+                      `encoder_from_pt` should be set to `True`.
+
+            decoder_pretrained_model_name_or_path (`str`, *optional*, defaults to `None`):
+                Information necessary to initiate the decoder. Can be either:
+
+                    - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
+                    - A path to a *directory* containing model weights saved using
+                      [`~TFPreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
+                    - A path or url to a *pytorch checkpoint file* (e.g, `./pt_model/`). In this case,
+                      `decoder_from_pt` should be set to `True`.
+
+            model_args (remaining positional arguments, *optional*):
+                All remaining positional arguments will be passed to the underlying model's `__init__` method.
+
+            kwargs (remaining dictionary of keyword arguments, *optional*):
+                Can be used to update the configuration object (after it being loaded) and initiate the model (e.g.,
+                `output_attentions=True`).
+
+                - To update the encoder configuration, use the prefix *encoder_* for each configuration parameter.
+                - To update the decoder configuration, use the prefix *decoder_* for each configuration parameter.
+                - To update the parent model configuration, do not use a prefix for each configuration parameter.
+
+                Behaves differently depending on whether a `config` is provided or automatically loaded.
+
+        Example:
+
+        ```python
+        >>> from transformers import TFEncoderDecoderModel
+
+        >>> # initialize a bert2gpt2 from two pretrained BERT models. Note that the cross-attention layers will be randomly initialized
+        >>> model = TFEncoderDecoderModel.from_encoder_decoder_pretrained("google-bert/bert-base-uncased", "openai-community/gpt2")
+        >>> # saving model after fine-tuning
+        >>> model.save_pretrained("./bert2gpt2")
+        >>> # load fine-tuned model
+        >>> model = TFEncoderDecoderModel.from_pretrained("./bert2gpt2")
+        ```"""
+
+        kwargs_encoder = {
+            argument[len("encoder_") :]: value for argument, value in kwargs.items() if argument.startswith("encoder_")
+        }
+
+        kwargs_decoder = {
+            argument[len("decoder_") :]: value for argument, value in kwargs.items() if argument.startswith("decoder_")
+        }
+
+        # remove encoder, decoder kwargs from kwargs
+        for key in kwargs_encoder:
+            del kwargs["encoder_" + key]
+        for key in kwargs_decoder:
+            del kwargs["decoder_" + key]
+
+        # Load and initialize the encoder and decoder
+        # The distinction between encoder and decoder at the model level is made
+        # by the value of the flag `is_decoder` that we need to set correctly.
+        encoder = kwargs_encoder.pop("model", None)
+        if encoder is None:
+            if encoder_pretrained_model_name_or_path is None:
+                raise ValueError(
+                    "If `encoder_model` is not defined as an argument, a `encoder_pretrained_model_name_or_path` has "
+                    "to be defined."
+                )
+
+            if "config" not in kwargs_encoder:
+                encoder_config = AutoConfig.from_pretrained(encoder_pretrained_model_name_or_path)
+                if encoder_config.is_decoder is True or encoder_config.add_cross_attention is True:
+                    logger.info(
+                        f"Initializing {encoder_pretrained_model_name_or_path} as a encoder model "
+                        "from a decoder model. Cross-attention and causal mask are disabled."
+                    )
+                    encoder_config.is_decoder = False
+                    encoder_config.add_cross_attention = False
+
+                kwargs_encoder["config"] = encoder_config
+
+            kwargs_encoder["name"] = "encoder"
+            kwargs_encoder["load_weight_prefix"] = cls.load_weight_prefix
+            encoder = TFAutoModel.from_pretrained(encoder_pretrained_model_name_or_path, *model_args, **kwargs_encoder)
+
+        decoder = kwargs_decoder.pop("model", None)
+        if decoder is None:
+            if decoder_pretrained_model_name_or_path is None:
+                raise ValueError(
+                    "If `decoder_model` is not defined as an argument, a `decoder_pretrained_model_name_or_path` has "
+                    "to be defined."
+                )
+
+            if "config" not in kwargs_decoder:
+                decoder_config = AutoConfig.from_pretrained(decoder_pretrained_model_name_or_path)
+                if decoder_config.is_decoder is False or decoder_config.add_cross_attention is False:
+                    logger.info(
+                        f"Initializing {decoder_pretrained_model_name_or_path} as a decoder model. Cross attention"
+                        f" layers are added to {decoder_pretrained_model_name_or_path} and randomly initialized if"
+                        f" {decoder_pretrained_model_name_or_path}'s architecture allows for cross attention layers."
+                    )
+                    decoder_config.is_decoder = True
+                    decoder_config.add_cross_attention = True
+
+                kwargs_decoder["config"] = decoder_config
+
+            if kwargs_decoder["config"].is_decoder is False or kwargs_decoder["config"].add_cross_attention is False:
+                logger.warning(
+                    f"Decoder model {decoder_pretrained_model_name_or_path} is not initialized as a decoder. "
+                    f"In order to initialize {decoder_pretrained_model_name_or_path} as a decoder, "
+                    "make sure that the attributes `is_decoder` and `add_cross_attention` of `decoder_config` "
+                    "passed to `.from_encoder_decoder_pretrained(...)` are set to `True` or do not pass a "
+                    "`decoder_config` to `.from_encoder_decoder_pretrained(...)`"
+                )
+
+            kwargs_decoder["name"] = "decoder"
+            kwargs_decoder["load_weight_prefix"] = cls.load_weight_prefix
+            decoder = TFAutoModelForCausalLM.from_pretrained(decoder_pretrained_model_name_or_path, **kwargs_decoder)
+
+        # Make sure these 2 `keras.Model` have fixed names so `from_pretrained` could load model weights correctly.
+        if encoder.name != "encoder":
+            raise ValueError("encoder model must be created with the name `encoder`.")
+        if decoder.name != "decoder":
+            raise ValueError("decoder model must be created with the name `decoder`.")
+
+        # instantiate config with corresponding kwargs
+        config = EncoderDecoderConfig.from_encoder_decoder_configs(encoder.config, decoder.config, **kwargs)
+        return cls(encoder=encoder, decoder=decoder, config=config)
+
+    # Forward pass: run the encoder (unless `encoder_outputs` is supplied), optionally project its hidden states,
+    # run the decoder on them, and compute the loss from `labels` when they are given.
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(ENCODER_DECODER_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @replace_return_docstrings(output_type=TFSeq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
+    def call(
+        self,
+        input_ids: TFModelInputType | None = None,
+        attention_mask: np.ndarray | tf.Tensor | None = None,
+        decoder_input_ids: np.ndarray | tf.Tensor | None = None,
+        decoder_attention_mask: np.ndarray | tf.Tensor | None = None,
+        encoder_outputs: np.ndarray | tf.Tensor | None = None,
+        past_key_values: tuple[tuple[tf.Tensor]] | None = None,
+        inputs_embeds: np.ndarray | tf.Tensor | None = None,
+        decoder_inputs_embeds: np.ndarray | tf.Tensor | None = None,
+        labels: np.ndarray | tf.Tensor | None = None,
+        use_cache: bool | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        training: bool = False,
+        **kwargs,
+    ) -> TFSeq2SeqLMOutput | tuple[tf.Tensor]:
+        r"""
+        Returns:
+
+        Examples:
+
+        ```python
+        >>> from transformers import TFEncoderDecoderModel, BertTokenizer
+
+        >>> # initialize a bert2gpt2 from a pretrained BERT and GPT2 models. Note that the cross-attention layers will be randomly initialized
+        >>> model = TFEncoderDecoderModel.from_encoder_decoder_pretrained("google-bert/bert-base-cased", "openai-community/gpt2")
+
+        >>> tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-cased")
+
+        >>> # forward
+        >>> input_ids = tokenizer.encode(
+        ...     "Hello, my dog is cute", add_special_tokens=True, return_tensors="tf"
+        ... )  # Batch size 1
+        >>> outputs = model(input_ids=input_ids, decoder_input_ids=input_ids)
+
+        >>> # training
+        >>> outputs = model(input_ids=input_ids, decoder_input_ids=input_ids, labels=input_ids)
+        >>> loss, logits = outputs.loss, outputs.logits
+
+        >>> # save and load from pretrained
+        >>> model.save_pretrained("bert2gpt2")
+        >>> model = TFEncoderDecoderModel.from_pretrained("bert2gpt2")
+
+        >>> # generation
+        >>> generated = model.generate(input_ids, decoder_start_token_id=model.config.decoder.bos_token_id)
+        ```"""
+        # Fall back to the configuration default when the caller leaves `return_dict` unset.
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # Extra keyword arguments prefixed with `decoder_` go to the decoder (prefix stripped); all others go to
+        # the encoder.
+        kwargs_encoder = {argument: value for argument, value in kwargs.items() if not argument.startswith("decoder_")}
+
+        kwargs_decoder = {
+            argument[len("decoder_") :]: value for argument, value in kwargs.items() if argument.startswith("decoder_")
+        }
+
+        # Let the user be responsible for the expected format.
+        if encoder_outputs is not None:
+            if return_dict and not isinstance(encoder_outputs, ModelOutput):
+                raise ValueError(
+                    "If `return_dict=True` and `encoder_outputs` is provided, it should be an instance of "
+                    f"`ModelOutput`. Got an instance {type(encoder_outputs)} for `encoder_outputs`."
+                )
+
+        # Only run the encoder when the caller did not supply precomputed `encoder_outputs`.
+        if encoder_outputs is None:
+            encoder_inputs = {
+                "input_ids": input_ids,
+                "attention_mask": attention_mask,
+                "inputs_embeds": inputs_embeds,
+                "output_attentions": output_attentions,
+                "output_hidden_states": output_hidden_states,
+                "return_dict": return_dict,
+                "training": training,
+            }
+
+            # Add arguments to encoder from `kwargs_encoder`
+            encoder_inputs.update(kwargs_encoder)
+
+            # Handle the case where the inputs are passed as a single dict which contains `labels`.
+            # The `labels` shouldn't be passed to `self.encoder` below, because it is a based model without this
+            # parameter (otherwise, an error occurs when `input_processing` is called inside `self.encoder.call()`).
+            if "labels" in encoder_inputs:
+                labels = encoder_inputs.pop("labels")
+
+            # handle the init case where `dummy_inputs` returns a dict containing `decoder_input_ids`.
+            if "decoder_input_ids" in encoder_inputs:
+                decoder_input_ids = encoder_inputs.pop("decoder_input_ids")
+            # likewise, pull `decoder_attention_mask` out of the encoder inputs if it ended up there.
+            if "decoder_attention_mask" in encoder_inputs:
+                decoder_attention_mask = encoder_inputs.pop("decoder_attention_mask")
+
+            encoder_outputs = self.encoder(**encoder_inputs)
+
+        # The first element of the encoder output is what the decoder receives as `encoder_hidden_states`.
+        encoder_hidden_states = encoder_outputs[0]
+
+        # optionally project encoder_hidden_states
+        if (
+            self.encoder.config.hidden_size != self.decoder.config.hidden_size
+            and self.decoder.config.cross_attention_hidden_size is None
+        ):
+            encoder_hidden_states = self.enc_to_dec_proj(encoder_hidden_states)
+
+        # With labels but no explicit decoder inputs, derive `decoder_input_ids` from the labels.
+        if (labels is not None) and (decoder_input_ids is None and decoder_inputs_embeds is None):
+            decoder_input_ids = shift_tokens_right(
+                labels, self.config.pad_token_id, self.config.decoder_start_token_id
+            )
+
+        # Note: the encoder-side `attention_mask` is forwarded to the decoder as `encoder_attention_mask`, and
+        # `labels` are deliberately not forwarded (the loss is computed below).
+        decoder_inputs = {
+            "input_ids": decoder_input_ids,
+            "attention_mask": decoder_attention_mask,
+            "encoder_hidden_states": encoder_hidden_states,
+            "encoder_attention_mask": attention_mask,
+            "inputs_embeds": decoder_inputs_embeds,
+            "output_attentions": output_attentions,
+            "output_hidden_states": output_hidden_states,
+            "use_cache": use_cache,
+            "past_key_values": past_key_values,
+            "return_dict": return_dict,
+            "training": training,
+        }
+
+        # Add arguments to decoder from `kwargs_decoder`
+        decoder_inputs.update(kwargs_decoder)
+
+        decoder_outputs = self.decoder(**decoder_inputs)
+
+        logits = decoder_outputs[0]
+
+        # Compute loss independent from decoder (as some shift the logits inside them)
+        loss = None
+        if labels is not None:
+            warnings.warn(DEPRECATION_WARNING, FutureWarning)
+            loss = self.hf_compute_loss(labels, logits)
+
+        if not return_dict:
+            past_key_values = None
+            if use_cache:
+                past_key_values = decoder_outputs[1]
+            # The starting index of the remaining elements in `decoder_outputs`. `loss` is computed in this method
+            # and is never an element of `decoder_outputs` (the decoder is called without labels and its first
+            # element is read as the logits above), so only `logits` and, when present, `past_key_values` precede
+            # the remaining elements. Counting `loss` here would silently drop one decoder output.
+            start_index = sum(x is not None for x in (logits, past_key_values))
+
+            if not isinstance(encoder_outputs, tuple):
+                encoder_outputs = encoder_outputs.to_tuple()
+            output = (loss, logits, past_key_values) + decoder_outputs[start_index:] + encoder_outputs
+            # Drop the placeholders that are `None` (e.g. `loss` without labels).
+            output = tuple(x for x in output if x is not None)
+            return output
+
+        return TFSeq2SeqLMOutput(
+            loss=loss,
+            logits=decoder_outputs.logits,
+            past_key_values=decoder_outputs.past_key_values,
+            decoder_hidden_states=decoder_outputs.hidden_states,
+            decoder_attentions=decoder_outputs.attentions,
+            cross_attentions=decoder_outputs.cross_attentions,
+            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
+            encoder_hidden_states=encoder_outputs.hidden_states,
+            encoder_attentions=encoder_outputs.attentions,
+        )
+
+    def prepare_inputs_for_generation(
+        self, input_ids, past_key_values=None, attention_mask=None, use_cache=None, encoder_outputs=None, **kwargs
+    ):
+        """
+        Assemble the keyword arguments for one generation step.
+
+        The wrapped decoder prepares its own inputs from `input_ids` and `past_key_values`; the result is
+        re-keyed with the `decoder_` names this model's `call` expects, and the encoder output is wrapped in a
+        `TFBaseModelOutput`. Extra `**kwargs` are accepted and ignored.
+        """
+        prepared = self.decoder.prepare_inputs_for_generation(input_ids, past_key_values=past_key_values)
+
+        # Prefer the `past_key_values` entry and fall back to `past` when it is absent or `None`.
+        cache = prepared.get("past_key_values")
+        if cache is None:
+            cache = prepared.get("past")  # e.g. on TF GPT2
+
+        return {
+            "input_ids": None,  # needs to be passed to make Keras.layer.__call__ happy
+            "attention_mask": attention_mask,
+            "decoder_attention_mask": prepared.get("attention_mask", None),
+            "decoder_input_ids": prepared["input_ids"],
+            # TODO (joao): the `TFBaseModelOutput` wrapper should not be needed after the generate refactor is complete
+            "encoder_outputs": TFBaseModelOutput(last_hidden_state=encoder_outputs[0]),
+            "past_key_values": cache,
+            "use_cache": use_cache,
+        }
+
+    def prepare_decoder_input_ids_from_labels(self, labels: tf.Tensor):
+        """Return `shift_tokens_right` applied to `labels` with this model's pad and decoder-start token ids."""
+        model_config = self.config
+        pad_id = model_config.pad_token_id
+        start_id = model_config.decoder_start_token_id
+        return shift_tokens_right(labels, pad_id, start_id)
+
+    def resize_token_embeddings(self, *args, **kwargs):
+        """
+        Always raise: resizing embeddings through the composite model is not supported.
+
+        All arguments are accepted and ignored.
+
+        Raises:
+            NotImplementedError: Unconditionally. The message points to the `resize_token_embeddings` methods of
+                `model.encoder` and `model.decoder` instead.
+        """
+        raise NotImplementedError(
+            "Resizing the embedding layers via the TFEncoderDecoderModel directly is not supported. Please use the"
+            " respective methods of the wrapped objects (model.encoder.resize_token_embeddings(...) or"
+            " model.decoder.resize_token_embeddings(...))"
+        )
+
+    def _reorder_cache(self, past, beam_idx):
+        """Delegate cache reordering to the wrapped decoder and return its result."""
+        decoder_reorder = self.decoder._reorder_cache
+        return decoder_reorder(past, beam_idx)
+
+    def build(self, input_shape=None):
+        """
+        Build the projection layer (if any), then the encoder, then the decoder, each under its own name scope.
+
+        Does nothing when `self.built` is already set; `input_shape` is not used.
+        """
+        if self.built:
+            return
+        # Flag the model as built before touching the sub-layers.
+        self.built = True
+
+        projection = getattr(self, "enc_to_dec_proj", None)
+        if projection is not None:
+            with tf.name_scope(projection.name):
+                projection.build([None, None, self.encoder.config.hidden_size])
+
+        # Encoder and decoder are built the same way, in this order.
+        for attribute in ("encoder", "decoder"):
+            submodel = getattr(self, attribute, None)
+            if submodel is not None:
+                with tf.name_scope(submodel.name):
+                    submodel.build(None)
+
+
+# Names exported by a star-import of this module.
+__all__ = ["TFEncoderDecoderModel"]
diff --git a/phivenv/Lib/site-packages/transformers/models/eomt/__init__.py b/phivenv/Lib/site-packages/transformers/models/eomt/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..9f4fe6327b312ff5f60ffb08c4b76566bf63f3f9
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/eomt/__init__.py
@@ -0,0 +1,29 @@
+# Copyright 2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+# Under static type checking the submodules are star-imported eagerly; at runtime this module's entry in
+# `sys.modules` is replaced by a `_LazyModule` instead.
+if TYPE_CHECKING:
+    from .configuration_eomt import *
+    from .image_processing_eomt import *
+    from .image_processing_eomt_fast import *
+    from .modeling_eomt import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    # The import structure handed to `_LazyModule` is computed by `define_import_structure` from this file's
+    # path. NOTE(review): presumably this defers the submodule imports until first attribute access — confirm
+    # against `_LazyModule`.
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/phivenv/Lib/site-packages/transformers/models/eomt/__pycache__/__init__.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/eomt/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5d57a4af9cb5f9ef77193c2a0ee898ec3dce1c75
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/eomt/__pycache__/__init__.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/eomt/__pycache__/configuration_eomt.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/eomt/__pycache__/configuration_eomt.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e8f646828a2fc381f2967aa69f10216d64e9f70f
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/eomt/__pycache__/configuration_eomt.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/eomt/__pycache__/image_processing_eomt.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/eomt/__pycache__/image_processing_eomt.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0010e6b6593742c199c545909b3aaa9f3cd534a2
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/eomt/__pycache__/image_processing_eomt.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/eomt/__pycache__/image_processing_eomt_fast.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/eomt/__pycache__/image_processing_eomt_fast.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bd81b3e84e3208b24745971f6bc65b6f6b617ef5
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/eomt/__pycache__/image_processing_eomt_fast.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/eomt/__pycache__/modeling_eomt.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/eomt/__pycache__/modeling_eomt.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5c560669c2749a9d505a71d59b3ad49d23485e56
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/eomt/__pycache__/modeling_eomt.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/eomt/__pycache__/modular_eomt.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/eomt/__pycache__/modular_eomt.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4fcd701830d69230abab3feb13a3191c18639e13
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/eomt/__pycache__/modular_eomt.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/eomt/configuration_eomt.py b/phivenv/Lib/site-packages/transformers/models/eomt/configuration_eomt.py
new file mode 100644
index 0000000000000000000000000000000000000000..670250721150e60df3d5da9280197cdad461beef
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/eomt/configuration_eomt.py
@@ -0,0 +1,168 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/eomt/modular_eomt.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_eomt.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# coding=utf-8
+# Copyright 2025 Mobile Perception Systems Lab at TU/e and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ...configuration_utils import PretrainedConfig
+
+
+class EomtConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of an [`EomtForUniversalSegmentation`]. It is used to instantiate an EoMT model
+    according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+    defaults will yield a similar configuration to that of the EoMT
+    [tue-mps/coco_panoptic_eomt_large_640](https://huggingface.co/tue-mps/coco_panoptic_eomt_large_640)
+    architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        hidden_size (`int`, *optional*, defaults to 1024):
+            Dimensionality of the hidden representations.
+        num_hidden_layers (`int`, *optional*, defaults to 24):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 16):
+            Number of attention heads in each attention layer.
+        mlp_ratio (`int`, *optional*, defaults to 4):
+            Ratio of the MLP hidden dimensionality to the hidden size.
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
+            The dropout probability for all fully connected layers in the embeddings and encoder.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-06):
+            The epsilon used by the layer normalization layers.
+        image_size (`int`, *optional*, defaults to 640):
+            The size (resolution) of each input image.
+        patch_size (`int`, *optional*, defaults to 16):
+            The size (resolution) of each patch.
+        num_channels (`int`, *optional*, defaults to 3):
+            The number of input channels.
+        layerscale_value (`float`, *optional*, defaults to 1.0):
+            Initial value for the LayerScale parameter.
+        drop_path_rate (`float`, *optional*, defaults to 0.0):
+            The stochastic depth rate (drop path) used during training.
+        num_upscale_blocks (`int`, *optional*, defaults to 2):
+            Number of upsampling blocks used in the decoder or segmentation head.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            Dropout probability applied after attention projection.
+        use_swiglu_ffn (`bool`, *optional*, defaults to `False`):
+            Whether to use the SwiGLU feedforward neural network.
+        num_blocks (`int`, *optional*, defaults to 4):
+            Number of feature blocks or stages in the architecture.
+        no_object_weight (`float`, *optional*, defaults to 0.1):
+            Loss weight for the 'no object' class in panoptic/instance segmentation.
+        class_weight (`float`, *optional*, defaults to 2.0):
+            Loss weight for classification targets.
+        mask_weight (`float`, *optional*, defaults to 5.0):
+            Loss weight for mask prediction.
+        dice_weight (`float`, *optional*, defaults to 5.0):
+            Loss weight for the dice loss component.
+        train_num_points (`int`, *optional*, defaults to 12544):
+            Number of points to sample for mask loss computation during training.
+        oversample_ratio (`float`, *optional*, defaults to 3.0):
+            Oversampling ratio used in point sampling for mask training.
+        importance_sample_ratio (`float`, *optional*, defaults to 0.75):
+            Ratio of points to sample based on importance during training.
+        num_queries (`int`, *optional*, defaults to 200):
+            Number of object queries in the Transformer.
+        num_register_tokens (`int`, *optional*, defaults to 4):
+            Number of learnable register tokens added to the transformer input.
+
+    Example:
+
+    ```python
+    >>> from transformers import EomtConfig, EomtForUniversalSegmentation
+
+    >>> # Initialize configuration
+    >>> config = EomtConfig()
+
+    >>> # Initialize model
+    >>> model = EomtForUniversalSegmentation(config)
+
+    >>> # Access config
+    >>> config = model.config
+    ```"""
+
+    # Model-type identifier, stored as a class attribute.
+    model_type = "eomt"
+
+    def __init__(
+        self,
+        hidden_size=1024,
+        num_hidden_layers=24,
+        num_attention_heads=16,
+        mlp_ratio=4,
+        hidden_act="gelu",
+        hidden_dropout_prob=0.0,
+        initializer_range=0.02,
+        layer_norm_eps=1e-6,
+        image_size=640,
+        patch_size=16,
+        num_channels=3,
+        layerscale_value=1.0,
+        drop_path_rate=0.0,
+        num_upscale_blocks=2,
+        attention_dropout=0.0,
+        use_swiglu_ffn=False,
+        num_blocks=4,
+        no_object_weight: float = 0.1,
+        class_weight: float = 2.0,
+        mask_weight: float = 5.0,
+        dice_weight: float = 5.0,
+        train_num_points: int = 12544,
+        oversample_ratio: float = 3.0,
+        importance_sample_ratio: float = 0.75,
+        num_queries=200,
+        num_register_tokens=4,
+        **kwargs,
+    ):
+        # Keyword arguments not named above are handed to `PretrainedConfig.__init__`.
+        super().__init__(**kwargs)
+
+        # Every named constructor argument is stored unchanged as an attribute of the same name.
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.hidden_act = hidden_act
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.initializer_range = initializer_range
+        self.layer_norm_eps = layer_norm_eps
+        self.image_size = image_size
+        self.patch_size = patch_size
+        self.num_channels = num_channels
+
+        self.mlp_ratio = mlp_ratio
+        self.attention_dropout = attention_dropout
+        self.layerscale_value = layerscale_value
+        self.drop_path_rate = drop_path_rate
+        self.num_upscale_blocks = num_upscale_blocks
+        self.use_swiglu_ffn = use_swiglu_ffn
+        self.num_blocks = num_blocks
+        # Loss weights and point-sampling settings (see the class docstring).
+        self.no_object_weight = no_object_weight
+        self.class_weight = class_weight
+        self.mask_weight = mask_weight
+        self.dice_weight = dice_weight
+        self.train_num_points = train_num_points
+        self.oversample_ratio = oversample_ratio
+        self.importance_sample_ratio = importance_sample_ratio
+        self.num_queries = num_queries
+        self.num_register_tokens = num_register_tokens
+
+
+# Names exported by a star-import of this module.
+__all__ = ["EomtConfig"]
diff --git a/phivenv/Lib/site-packages/transformers/models/eomt/image_processing_eomt.py b/phivenv/Lib/site-packages/transformers/models/eomt/image_processing_eomt.py
new file mode 100644
index 0000000000000000000000000000000000000000..05131c543d83b3d0276b68e6d66cfb943de1b7fb
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/eomt/image_processing_eomt.py
@@ -0,0 +1,974 @@
+# coding=utf-8
+# Copyright 2025 Mobile Perception Systems Lab at TU/e and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Image processor class for EoMT."""
+
+import math
+from typing import Optional, Union
+
+import numpy as np
+
+from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
+from ...image_transforms import (
+    PaddingMode,
+    pad,
+    resize,
+)
+from ...image_utils import (
+    ChannelDimension,
+    ImageInput,
+    PILImageResampling,
+    get_image_size,
+    infer_channel_dimension_format,
+    make_flat_list_of_images,
+    make_list_of_images,
+    to_numpy_array,
+    valid_images,
+    validate_preprocess_arguments,
+)
+from ...utils import (
+    IMAGENET_DEFAULT_MEAN,
+    IMAGENET_DEFAULT_STD,
+    TensorType,
+    filter_out_non_signature_kwargs,
+    is_torch_available,
+    logging,
+)
+
+
+logger = logging.get_logger(__name__)
+
+if is_torch_available():
+    import torch
+    import torch.nn.functional as F
+
+
+# Adapted from transformers.models.maskformer.image_processing_maskformer.convert_segmentation_map_to_binary_masks
+def convert_segmentation_map_to_binary_masks(
+    segmentation_map: "np.ndarray",
+    instance_id_to_semantic_id: Optional[dict[int, int]] = None,
+    ignore_index: Optional[int] = None,
+):
+    """
+    Splits a segmentation map into one binary mask per id and returns the masks together with their labels.
+
+    Args:
+        segmentation_map (`np.ndarray`):
+            Map whose values are class ids, or instance ids when `instance_id_to_semantic_id` is given.
+        instance_id_to_semantic_id (`dict[int, int]`, *optional*):
+            Mapping from the ids found in the original `segmentation_map` to class ids. When omitted, the ids
+            themselves are returned as labels.
+        ignore_index (`int`, *optional*):
+            When set, pixels equal to 0 are replaced by `ignore_index`, every other id is shifted down by one and
+            no mask is produced for `ignore_index`.
+
+    Returns:
+        `tuple[np.ndarray, np.ndarray]`: `float32` masks of shape `(num_ids, *segmentation_map.shape)` and the
+        matching `int64` labels of shape `(num_ids,)`.
+    """
+    shift_ids = ignore_index is not None
+
+    if shift_ids:
+        segmentation_map = np.where(segmentation_map == 0, ignore_index, segmentation_map - 1)
+
+    # Sorted unique ids present in the map, without the ignored one
+    unique_ids = np.unique(segmentation_map)
+    if shift_ids:
+        unique_ids = unique_ids[unique_ids != ignore_index]
+
+    if unique_ids.shape[0] > 0:
+        binary_masks = np.stack([segmentation_map == uid for uid in unique_ids], axis=0)
+    else:
+        # Nothing left to segment: keep the spatial shape but return zero masks
+        binary_masks = np.zeros((0, *segmentation_map.shape))
+
+    if instance_id_to_semantic_id is None:
+        labels = unique_ids
+    else:
+        # The mapping is keyed by the un-shifted ids and its class ids are shifted the same way as the map
+        offset = 1 if shift_ids else 0
+        labels = np.zeros(unique_ids.shape[0])
+        for position, uid in enumerate(unique_ids):
+            labels[position] = instance_id_to_semantic_id[uid + offset] - offset
+
+    return binary_masks.astype(np.float32), labels.astype(np.int64)
+
+
+def get_size_with_aspect_ratio(image_size, size, max_size=None) -> tuple[int, int]:
+    """
+    Computes the `(height, width)` obtained by matching the shorter image edge to `size` while keeping the aspect
+    ratio, optionally limiting the longer edge to `max_size`.
+
+    Args:
+        image_size (`tuple[int, int]`):
+            The input image size as `(height, width)`.
+        size (`int`):
+            The desired length of the shorter edge.
+        max_size (`int`, *optional*):
+            The maximum allowed length of the longer edge. When the longer edge would exceed it, the shorter edge
+            is reduced accordingly.
+
+    Returns:
+        `tuple[int, int]`: The output size as `(height, width)`.
+    """
+    height, width = image_size
+
+    # Un-rounded short-edge length, only set when `max_size` forces the target down
+    capped_size = None
+    if max_size is not None:
+        short_side = float(min((height, width)))
+        long_side = float(max((height, width)))
+        if long_side / short_side * size > max_size:
+            capped_size = max_size * short_side / long_side
+            size = int(round(capped_size))
+
+    # The shorter edge already has the requested length: keep the image size as is
+    if (height <= width and height == size) or (width <= height and width == size):
+        return (height, width)
+
+    # Scale the other edge from the un-rounded value when available to avoid compounding rounding errors
+    base = size if capped_size is None else capped_size
+    if width < height:
+        return (round(base * height / width), size)
+    return (size, round(base * width / height))
+
+
+# Copied from transformers.models.detr.image_processing_detr.remove_low_and_no_objects
+def remove_low_and_no_objects(masks, scores, labels, object_mask_threshold, num_labels):
+    """
+    Filters the queries, keeping only those whose label differs from `num_labels` and whose score is strictly above
+    `object_mask_threshold`. It returns the associated values of `masks`, `scores` and `labels`.
+
+    Args:
+        masks (`torch.Tensor`):
+            A tensor of shape `(num_queries, height, width)`.
+        scores (`torch.Tensor`):
+            A tensor of shape `(num_queries)`.
+        labels (`torch.Tensor`):
+            A tensor of shape `(num_queries)`.
+        object_mask_threshold (`float`):
+            Score threshold; queries with a score less than or equal to it are dropped.
+        num_labels (`int`):
+            Label value to discard; queries whose label equals it are dropped.
+    Raises:
+        `ValueError`: Raised when the first dimension doesn't match in all input tensors.
+    Returns:
+        `tuple[`torch.Tensor`, `torch.Tensor`, `torch.Tensor`]`: The `masks`, `scores` and `labels` of the queries
+        that were kept.
+    """
+    if not (masks.shape[0] == scores.shape[0] == labels.shape[0]):
+        raise ValueError("mask, scores and labels must have the same shape!")
+
+    # Boolean selector over the query dimension, applied identically to all three tensors
+    to_keep = labels.ne(num_labels) & (scores > object_mask_threshold)
+
+    return masks[to_keep], scores[to_keep], labels[to_keep]
+
+
+def check_segment_validity(mask_labels, mask_probs, k, mask_threshold=0.5, overlap_mask_area_threshold=0.8):
+    """
+    Decides whether query `k` yields a usable segment and returns the pixels that belong to it.
+
+    Args:
+        mask_labels (`torch.Tensor`):
+            Per-pixel index of the query each pixel is assigned to.
+        mask_probs (`torch.Tensor`):
+            Per-query mask probabilities, indexed by query on the first dimension.
+        k (`int`):
+            Index of the query to check.
+        mask_threshold (`float`, *optional*, defaults to 0.5):
+            Minimum probability for a pixel to count as part of the query's own mask.
+        overlap_mask_area_threshold (`float`, *optional*, defaults to 0.8):
+            The ratio between the area assigned to `k` and the area of its own thresholded mask must be strictly
+            greater than this value.
+
+    Returns:
+        A pair `(mask_exists, final_mask)`, where `final_mask` marks the pixels that are both assigned to `k` and
+        above `mask_threshold`.
+    """
+    # Pixels whose winning query is k
+    assigned_to_k = mask_labels == k
+    assigned_area = assigned_to_k.sum()
+
+    # Pixels that query k itself predicts with enough confidence
+    confident_pixels = mask_probs[k] >= mask_threshold
+    confident_area = confident_pixels.sum()
+
+    final_mask = assigned_to_k & confident_pixels
+    final_area = final_mask.sum()
+
+    mask_exists = assigned_area > 0 and confident_area > 0 and final_area > 0
+
+    # Reject segments that kept too small a share of their own predicted mask
+    if mask_exists and not (assigned_area / confident_area).item() > overlap_mask_area_threshold:
+        mask_exists = False
+
+    return mask_exists, final_mask
+
+
+def compute_segments(
+    mask_probs,
+    pred_scores,
+    pred_labels,
+    stuff_classes,
+    mask_threshold: float = 0.5,
+    overlap_mask_area_threshold: float = 0.8,
+    target_size: Optional[tuple[int, int]] = None,
+):
+    """
+    Builds a segmentation map and the list of its segments from per-query mask predictions.
+
+    Args:
+        mask_probs (`torch.Tensor`):
+            Per-query mask logits; a sigmoid is applied to them inside this function.
+        pred_scores (`torch.Tensor`):
+            Score of each query, used to weight its mask when assigning pixels.
+        pred_labels (`torch.Tensor`):
+            Predicted class of each query.
+        stuff_classes:
+            Classes whose queries are merged into a single segment per class. May be empty or `None`.
+        mask_threshold (`float`, *optional*, defaults to 0.5):
+            Forwarded to `check_segment_validity`.
+        overlap_mask_area_threshold (`float`, *optional*, defaults to 0.8):
+            Forwarded to `check_segment_validity`.
+        target_size (`tuple[int, int]`, *optional*):
+            `(height, width)` of the returned map. Defaults to the spatial size of `mask_probs`.
+
+    Returns:
+        `tuple`: The segmentation map (`torch.long`, -1 where no segment was assigned) and a list of dicts with
+        the keys `"id"`, `"label_id"` and `"score"`.
+    """
+    if target_size is None:
+        height, width = mask_probs.shape[1], mask_probs.shape[2]
+    else:
+        height, width = target_size[0], target_size[1]
+
+    # -1 marks pixels that do not belong to any segment
+    segmentation = torch.full((height, width), -1, dtype=torch.long, device=mask_probs.device)
+    segments: list[dict] = []
+
+    # Each pixel goes to the query with the highest score-weighted mask probability
+    mask_probs = mask_probs.sigmoid()
+    mask_labels = (pred_scores[:, None, None] * mask_probs).argmax(0)
+
+    next_segment_id = 0
+    # Segment id already handed out to each stuff class
+    stuff_segment_ids: dict[int, int] = {}
+
+    for k in range(pred_labels.shape[0]):
+        pred_class = pred_labels[k].item()
+
+        mask_exists, final_mask = check_segment_validity(
+            mask_labels, mask_probs, k, mask_threshold, overlap_mask_area_threshold
+        )
+        if not mask_exists:
+            continue
+
+        if stuff_classes and pred_class in stuff_classes:
+            known_id = stuff_segment_ids.get(pred_class)
+            if known_id is not None:
+                # Later queries of a stuff class extend the existing segment instead of opening a new one
+                segmentation[final_mask] = known_id
+                continue
+            stuff_segment_ids[pred_class] = next_segment_id
+
+        segmentation[final_mask] = next_segment_id
+        segments.append(
+            {
+                "id": next_segment_id,
+                "label_id": pred_class,
+                "score": round(pred_scores[k].item(), 6),
+            }
+        )
+        next_segment_id += 1
+
+    return segmentation, segments
+
+
+def get_target_size(size_dict: dict[str, int]) -> tuple[int, int]:
+    """
+    Returns the target `(height, width)` described by a size dict.
+
+    The height is `size_dict["shortest_edge"]`. The width is `size_dict["longest_edge"]`, falling back to the
+    height when that key is missing or falsy.
+    """
+    height = size_dict["shortest_edge"]
+    width = size_dict.get("longest_edge")
+    if not width:
+        width = height
+    return height, width
+
+
+class EomtImageProcessor(BaseImageProcessor):
+    r"""
+    Constructs a EoMT image processor. The image processor can be used to prepare image(s) and optional targets
+    for the model.
+
+    This image processor inherits from [`BaseImageProcessor`] which contains most of the main methods. Users should
+    refer to this superclass for more information regarding those methods.
+
+    Args:
+        do_resize (`bool`, *optional*, defaults to `True`):
+            Whether to resize the input to a certain `size`.
+        size (`dict[str, int]`, *optional*, defaults to `{"shortest_edge": 640, "longest_edge": 640}`):
+            Target size used when resizing. Only has an effect if `do_resize` is set to `True`. The shorter edge of
+            the image is matched to `size["shortest_edge"]` while keeping the aspect ratio; if the longer edge would
+            then exceed `size["longest_edge"]`, the image is scaled down so that the longer edge matches
+            `size["longest_edge"]` instead.
+        resample (`int`, *optional*, defaults to `Resampling.BILINEAR`):
+            An optional resampling filter. This can be one of `PIL.Image.Resampling.NEAREST`,
+            `PIL.Image.Resampling.BOX`, `PIL.Image.Resampling.BILINEAR`, `PIL.Image.Resampling.HAMMING`,
+            `PIL.Image.Resampling.BICUBIC` or `PIL.Image.Resampling.LANCZOS`. Only has an effect if `do_resize` is set
+            to `True`.
+        do_rescale (`bool`, *optional*, defaults to `True`):
+            Whether to rescale the input to a certain `scale`.
+        rescale_factor (`float`, *optional*, defaults to `1/ 255`):
+            Rescale the input by the given factor. Only has an effect if `do_rescale` is set to `True`.
+        do_normalize (`bool`, *optional*, defaults to `True`):
+            Whether or not to normalize the input with mean and standard deviation.
+        do_split_image (`bool`, *optional*, defaults to `False`):
+            Whether to split the input images into overlapping patches for semantic segmentation. If set to `True`, the
+            input images will be split into patches of size `size["shortest_edge"]` with an overlap between patches.
+            Otherwise, the input images will be padded to the target size.
+        do_pad (`bool`, *optional*, defaults to `False`):
+            Whether to pad the image. If `True`, each image is padded with zeros at the bottom and right up to the
+            target size given by `size`.
+        image_mean (`float` or `list[float]`, *optional*, defaults to `[0.485, 0.456, 0.406]`):
+            The sequence of means for each channel, to be used when normalizing images. Defaults to the ImageNet mean.
+        image_std (`float` or `list[float]`, *optional*, defaults to `[0.229, 0.224, 0.225]`):
+            The sequence of standard deviations for each channel, to be used when normalizing images. Defaults to the
+            ImageNet std.
+        ignore_index (`int`, *optional*):
+            Label to be assigned to background pixels in segmentation maps. If provided, segmentation map pixels
+            denoted with 0 (background) will be replaced with `ignore_index`.
+        num_labels (`int`, *optional*):
+            The number of labels in the segmentation map.
+    """
+
+    model_input_names = ["pixel_values"]
+
+    def __init__(
+        self,
+        do_resize: bool = True,
+        size: Optional[dict[str, int]] = None,
+        resample: PILImageResampling = PILImageResampling.BILINEAR,
+        do_rescale: bool = True,
+        rescale_factor: float = 1 / 255,
+        do_normalize: bool = True,
+        do_split_image: bool = False,
+        do_pad: bool = False,
+        image_mean: Optional[Union[float, list[float]]] = None,
+        image_std: Optional[Union[float, list[float]]] = None,
+        ignore_index: Optional[int] = None,
+        num_labels: Optional[int] = None,
+        **kwargs,
+    ):
+        """Stores the preprocessing defaults; extra keyword arguments are forwarded to the parent class."""
+        super().__init__(**kwargs)
+
+        # Without an explicit size, both the shortest and the longest edge default to 640
+        if size is None:
+            size = {"shortest_edge": 640, "longest_edge": 640}
+        size = get_size_dict(size, default_to_square=False)
+
+        # Normalization falls back to the ImageNet statistics
+        if image_mean is None:
+            image_mean = IMAGENET_DEFAULT_MEAN
+        if image_std is None:
+            image_std = IMAGENET_DEFAULT_STD
+
+        self.do_resize = do_resize
+        self.size = size
+        self.resample = resample
+        self.do_rescale = do_rescale
+        self.rescale_factor = rescale_factor
+        self.do_normalize = do_normalize
+        self.do_split_image = do_split_image
+        self.do_pad = do_pad
+        self.image_mean = image_mean
+        self.image_std = image_std
+        self.ignore_index = ignore_index
+        self.num_labels = num_labels
+
+    def resize(
+        self,
+        image: np.ndarray,
+        size: dict,
+        resample: PILImageResampling = PILImageResampling.BILINEAR,
+        data_format=None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+        **kwargs,
+    ) -> np.ndarray:
+        """
+        Resize an image. The shortest edge of the image is resized to size["shortest_edge"], with the longest edge
+        resized to keep the input aspect ratio and limited to size["longest_edge"] when that key is present.
+
+        Args:
+            image (`np.ndarray`):
+                Image to resize.
+            size (`dict`):
+                Size dict with a `"shortest_edge"` key and an optional `"longest_edge"` key.
+            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
+                Resampling filter to use when resizing the image.
+            data_format (`str` or `ChannelDimension`, *optional*):
+                The channel dimension format of the image. If not provided, it will be the same as the input image.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format of the input image. If not provided, it will be inferred.
+
+        Returns:
+            `np.ndarray`: The resized image.
+        """
+        image_size = get_image_size(image)
+        # `longest_edge` is optional (as in `get_target_size`): without it the longer side is not capped
+        output_size = get_size_with_aspect_ratio(image_size, size["shortest_edge"], size.get("longest_edge"))
+
+        image = resize(
+            image=image,
+            size=output_size,
+            resample=resample,
+            data_format=data_format,
+            input_data_format=input_data_format,
+            return_numpy=True,
+            **kwargs,
+        )
+
+        return image
+
+    def _split_image(self, image: ImageInput, size: dict, image_index: int) -> tuple[list, list]:
+        """
+        Slices an image into overlapping patches for semantic segmentation.
+
+        The image is cut along its longer side into patches of length `size["shortest_edge"]`, spread evenly so
+        that the first patch starts at 0 and the patches together span the whole side. The slicing indexes the
+        array as `(channels, height, width)`.
+
+        Returns:
+            `tuple[list, list]`: The patches and, for each patch, `[image_index, start, end]` along the split side.
+        """
+        image_size = get_image_size(image)
+        patch_size = size["shortest_edge"]
+
+        longer_side = max(image_size)
+        num_patches = math.ceil(longer_side / patch_size)
+
+        # Spread the surplus coverage evenly over the gaps between consecutive patches
+        total_overlap = num_patches * patch_size - longer_side
+        if num_patches > 1:
+            overlap_per_patch = total_overlap / (num_patches - 1)
+        else:
+            overlap_per_patch = 0
+        stride = patch_size - overlap_per_patch
+
+        split_along_height = image_size[0] > image_size[1]
+
+        patches, patch_offsets = [], []
+        for patch_number in range(num_patches):
+            start = int(patch_number * stride)
+            end = start + patch_size
+
+            patches.append(image[:, start:end, :] if split_along_height else image[:, :, start:end])
+            patch_offsets.append([image_index, start, end])
+
+        return patches, patch_offsets
+
+    def _pad(self, image: ImageInput, size: dict) -> np.ndarray:
+        """
+        Pads the image to the target size using zero padding.
+
+        Padding is only added at the bottom and right; a dimension that already reaches the target is not padded.
+        """
+        height, width = get_image_size(image)
+        target_height, target_width = get_target_size(size)
+
+        # Clamp at zero so an image larger than the target is never cropped
+        bottom = max(0, target_height - height)
+        right = max(0, target_width - width)
+
+        # NOTE(review): no channel format is passed, so `pad` has to work out the layout of `image` itself
+        return pad(
+            image=image,
+            padding=((0, bottom), (0, right)),
+            mode=PaddingMode.CONSTANT,
+            constant_values=0.0,
+        )
+
+    def _preprocess_images(
+        self,
+        images: ImageInput,
+        do_resize: Optional[bool] = None,
+        size: Optional[dict[str, int]] = None,
+        resample: PILImageResampling = None,
+        do_split_image: Optional[bool] = None,
+        do_pad: Optional[bool] = None,
+        do_rescale: Optional[bool] = None,
+        rescale_factor: Optional[float] = None,
+        do_normalize: Optional[bool] = None,
+        image_mean: Optional[Union[float, list[float]]] = None,
+        image_std: Optional[Union[float, list[float]]] = None,
+        data_format: Optional[Union[str, ChannelDimension]] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    ) -> tuple[list[np.ndarray], list[list[int]]]:
+        """
+        Preprocesses a batch of images.
+
+        The steps run in this order, each only when its flag is set: resize, split into patches, pad, rescale,
+        normalize.
+
+        Args:
+            images (`ImageInput`):
+                Images to preprocess; each one is converted to a NumPy array first.
+            do_resize, size, resample:
+                Control the resize step, see `resize`.
+            do_split_image (`bool`, *optional*):
+                Whether to replace every image by its overlapping patches, see `_split_image`.
+            do_pad (`bool`, *optional*):
+                Whether to zero-pad every image (or patch) to the target size, see `_pad`.
+            do_rescale, rescale_factor:
+                Control the rescale step.
+            do_normalize, image_mean, image_std:
+                Control the normalization step.
+            data_format (`ChannelDimension` or `str`, *optional*):
+                Channel format the resize step should output.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                Channel format of the incoming images.
+
+        Returns:
+            `tuple`: The list of processed images and the list of patch offsets `[image_index, start, end]`. The
+            offsets list is empty unless `do_split_image` is set.
+        """
+        images = [to_numpy_array(image) for image in images]
+
+        if do_resize:
+            images = [
+                self.resize(
+                    image,
+                    size=size,
+                    resample=resample,
+                    data_format=data_format,
+                    input_data_format=input_data_format,
+                )
+                for image in images
+            ]
+            # The resize step outputs `data_format` when one is given, so the later steps must read that layout
+            # rather than the caller's original `input_data_format`.
+            if data_format is not None:
+                input_data_format = data_format
+
+        processed_images, patch_offsets = [], []
+
+        if do_split_image:
+            for idx, img in enumerate(images):
+                patches, offsets = self._split_image(img, size, idx)
+                processed_images.extend(patches)
+                patch_offsets.extend(offsets)
+
+            images = processed_images
+
+        if do_pad:
+            images = [self._pad(img, size) for img in images]
+
+        if do_rescale:
+            images = [self.rescale(img, scale=rescale_factor, input_data_format=input_data_format) for img in images]
+
+        if do_normalize:
+            images = [
+                self.normalize(
+                    image,
+                    mean=image_mean,
+                    std=image_std,
+                    input_data_format=input_data_format,
+                )
+                for image in images
+            ]
+
+        return images, patch_offsets
+
+    def _preprocess_mask(
+        self,
+        segmentation_map: ImageInput,
+        do_resize: Optional[bool] = False,
+        do_pad: Optional[bool] = False,
+        size: Optional[dict[str, int]] = None,
+        resample: PILImageResampling = None,
+        data_format: Union[str, ChannelDimension] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    ) -> "torch.Tensor":
+        """
+        Preprocesses a single mask.
+
+        Args:
+            segmentation_map (`np.ndarray`):
+                The mask to process. A 2D map temporarily gets a leading channel dimension for the transformations.
+            do_resize (`bool`, *optional*, defaults to `False`):
+                Whether to resize the mask with `resize`.
+            do_pad (`bool`, *optional*, defaults to `False`):
+                Whether to zero-pad the mask with `_pad`.
+            size (`dict[str, int]`, *optional*):
+                Size dict used by the resize and pad steps.
+            resample (`PILImageResampling`, *optional*):
+                Resampling filter used when resizing.
+            data_format (`ChannelDimension` or `str`, *optional*):
+                Channel format the resize step should output.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                Channel format of the mask. Ignored for 2D maps; inferred for other maps when not given.
+
+        Returns:
+            `torch.Tensor`: The processed mask, without the temporary channel dimension.
+        """
+        # Add channel dimension if missing - needed for certain transformations
+        if segmentation_map.ndim == 2:
+            added_channel_dim = True
+            segmentation_map = segmentation_map[None, ...]
+            input_data_format = ChannelDimension.FIRST
+        else:
+            added_channel_dim = False
+            if input_data_format is None:
+                input_data_format = infer_channel_dimension_format(segmentation_map)
+
+        if do_resize:
+            segmentation_map = self.resize(
+                segmentation_map,
+                size=size,
+                resample=resample,
+                data_format=data_format,
+                # Hand over the layout determined above instead of leaving the resize step to infer it again
+                input_data_format=input_data_format,
+            )
+
+        if do_pad:
+            segmentation_map = self._pad(segmentation_map, size)
+
+        # Remove extra channel dimension if added for processing
+        if added_channel_dim:
+            segmentation_map = segmentation_map.squeeze(0)
+        return torch.from_numpy(segmentation_map)
+
+    @filter_out_non_signature_kwargs()
+    def preprocess(
+        self,
+        images: ImageInput,
+        segmentation_maps: Optional[Union[list[dict[int, int]], dict[int, int]]] = None,
+        instance_id_to_semantic_id: Optional[dict[int, int]] = None,
+        do_split_image: Optional[bool] = None,
+        do_resize: Optional[bool] = None,
+        size: Optional[dict[str, int]] = None,
+        resample: PILImageResampling = None,
+        do_rescale: Optional[bool] = None,
+        rescale_factor: Optional[float] = None,
+        do_normalize: Optional[bool] = None,
+        do_pad: Optional[bool] = None,
+        image_mean: Optional[Union[float, list[float]]] = None,
+        image_std: Optional[Union[float, list[float]]] = None,
+        ignore_index: Optional[int] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    ) -> BatchFeature:
+        """
+        Preprocesses images or a batch of images, and optionally their segmentation maps.
+
+        Every argument left as `None` falls back to the value stored on the image processor.
+
+        Args:
+            images (`ImageInput`):
+                Image or batch of images to preprocess.
+            segmentation_maps (`ImageInput`, *optional*):
+                The corresponding semantic segmentation maps with the pixel-wise annotations. They are resized with
+                nearest-neighbour resampling and padded with the same `do_resize`, `do_pad` and `size` as the images.
+            instance_id_to_semantic_id (`list[dict[int, int]]` or `dict[int, int]`, *optional*):
+                A mapping between object instance ids and class ids.
+            do_split_image (`bool`, *optional*, defaults to `self.do_split_image`):
+                Whether to split the input images into overlapping patches for semantic segmentation.
+            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
+                Whether to resize the input images.
+            size (`dict[str, int]`, *optional*, defaults to `self.size`):
+                Target size as a dictionary with `"shortest_edge"` and `"longest_edge"` keys.
+            resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
+                Resampling filter to use when resizing.
+            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
+                Whether to rescale the input images by `rescale_factor`.
+            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
+                Factor to scale image pixel values.
+            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
+                Whether to normalize the input images.
+            do_pad (`bool`, *optional*, defaults to `self.do_pad`):
+                Whether to pad the image. If `True`, each image is padded with zeros at the bottom and right up to
+                the target size given by `size`.
+            image_mean (`float` or `list[float]`, *optional*, defaults to `self.image_mean`):
+                Mean for normalization. Single value or list for each channel.
+            image_std (`float` or `list[float]`, *optional*, defaults to `self.image_std`):
+                Standard deviation for normalization. Single value or list for each channel.
+            ignore_index (`int`, *optional*, defaults to `self.ignore_index`):
+                Label to be assigned to background pixels in segmentation maps. If provided, segmentation map pixels
+                denoted with 0 (background) will be replaced with `ignore_index`.
+            return_tensors (`str` or `TensorType`, *optional*):
+                The type of tensors to return. Can be `"pt"`, `"tf"`, `"np"`, or `"jax"`.
+            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
+                Channel format of the output image. Either `"channels_first"` or `"channels_last"`.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                Channel format of the input image.
+
+        Returns:
+            [`BatchFeature`]: The output of `encode_inputs`, with an extra `patch_offsets` entry (one tensor
+            `[image_index, start, end]` per patch) when `do_split_image` is set and patches were produced.
+        """
+
+        # Resolve per-call overrides against the defaults stored on the processor
+        do_split_image = do_split_image if do_split_image is not None else self.do_split_image
+        do_resize = do_resize if do_resize is not None else self.do_resize
+        size = size if size is not None else self.size
+        size = get_size_dict(size, default_to_square=False)
+        resample = resample if resample is not None else self.resample
+        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
+        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
+        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
+        do_pad = do_pad if do_pad is not None else self.do_pad
+        image_mean = image_mean if image_mean is not None else self.image_mean
+        image_std = image_std if image_std is not None else self.image_std
+        ignore_index = ignore_index if ignore_index is not None else self.ignore_index
+
+        images = self.fetch_images(images)
+        images = make_flat_list_of_images(images)
+
+        if not valid_images(images):
+            raise ValueError(
+                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
+                "torch.Tensor, tf.Tensor or jax.ndarray."
+            )
+
+        validate_preprocess_arguments(
+            do_rescale=do_rescale,
+            rescale_factor=rescale_factor,
+            do_normalize=do_normalize,
+            image_mean=image_mean,
+            image_std=image_std,
+            do_resize=do_resize,
+            size=size,
+            resample=resample,
+        )
+
+        pixel_values_list, patch_offsets = self._preprocess_images(
+            images=images,
+            do_resize=do_resize,
+            size=size,
+            resample=resample,
+            do_split_image=do_split_image,
+            do_pad=do_pad,
+            do_rescale=do_rescale,
+            rescale_factor=rescale_factor,
+            do_normalize=do_normalize,
+            image_mean=image_mean,
+            image_std=image_std,
+            data_format=data_format,
+            input_data_format=input_data_format,
+        )
+
+        if segmentation_maps is not None:
+            segmentation_maps = make_list_of_images(segmentation_maps, expected_ndims=2)
+            segmentation_maps = [to_numpy_array(mask) for mask in segmentation_maps]
+
+            # Masks always use nearest-neighbour resampling, regardless of the `resample` chosen for the images
+            segmentation_maps = [
+                self._preprocess_mask(
+                    segmentation_map,
+                    do_resize=do_resize,
+                    do_pad=do_pad,
+                    size=size,
+                    resample=PILImageResampling.NEAREST,
+                    data_format=data_format,
+                    input_data_format=input_data_format,
+                )
+                for segmentation_map in segmentation_maps
+            ]
+
+        encoded_inputs = self.encode_inputs(
+            pixel_values_list,
+            segmentation_maps,
+            instance_id_to_semantic_id,
+            ignore_index,
+            return_tensors,
+            input_data_format=data_format,
+        )
+
+        # Patch offsets are only reported when splitting actually produced patches
+        if do_split_image and patch_offsets:
+            encoded_inputs["patch_offsets"] = [torch.tensor(offsets) for offsets in patch_offsets]
+
+        return encoded_inputs
+
+    def encode_inputs(
+        self,
+        pixel_values_list: list[ImageInput],
+        segmentation_maps: ImageInput = None,
+        instance_id_to_semantic_id: Optional[Union[list[dict[int, int]], dict[int, int]]] = None,
+        ignore_index: Optional[int] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    ):
+        """
+        Packs the pixel values into a [`BatchFeature`] and converts the segmentation maps into binary masks.
+
+        EoMT addresses semantic segmentation with a mask classification paradigm, thus input segmentation maps
+        will be converted to lists of binary masks and their respective labels. Let's see an example, assuming
+        `segmentation_maps = [[2,6,7,9]]`, the output will contain `mask_labels =
+        [[1,0,0,0],[0,1,0,0],[0,0,1,0],[0,0,0,1]]` (four binary masks) and `class_labels = [2,6,7,9]`, the labels for
+        each mask.
+
+        Args:
+            pixel_values_list (`list[ImageInput]`):
+                List of images (pixel values). Each image is converted to a NumPy array.
+
+            segmentation_maps (`ImageInput`, *optional*):
+                The corresponding semantic segmentation maps with the pixel-wise annotations.
+
+            instance_id_to_semantic_id (`list[dict[int, int]]` or `dict[int, int]`, *optional*):
+                A mapping between object instance ids and class ids. If passed, `segmentation_maps` is treated as an
+                instance segmentation map where each pixel represents an instance id. Can be provided as a single
+                dictionary with a global/dataset-level mapping or as a list of dictionaries (one per image), to map
+                instance ids in each image separately.
+
+            ignore_index (`int`, *optional*, defaults to `self.ignore_index`):
+                Forwarded to `convert_segmentation_map_to_binary_masks` for every segmentation map.
+
+            return_tensors (`str` or [`~file_utils.TensorType`], *optional*):
+                Tensor type passed to the [`BatchFeature`] that holds `pixel_values`.
+
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format of the input image. If not provided, it will be inferred.
+
+        Returns:
+            [`BatchFeature`]: A [`BatchFeature`] with the following fields:
+
+            - **pixel_values** -- Pixel values to be fed to a model.
+            - **mask_labels** -- Optional list of mask labels of shape `(labels, height, width)` to be fed to a model
+              (when `segmentation_maps` are provided).
+            - **class_labels** -- Optional list of class labels of shape `(labels)` to be fed to a model (when
+              `segmentation_maps` are provided). They identify the labels of `mask_labels`, e.g. the label of
+              `mask_labels[i][j]` is `class_labels[i][j]`.
+        """
+        if ignore_index is None:
+            ignore_index = self.ignore_index
+
+        pixel_values_list = [to_numpy_array(pixel_values) for pixel_values in pixel_values_list]
+
+        # The inferred format is not used further down in this method
+        if input_data_format is None:
+            input_data_format = infer_channel_dimension_format(pixel_values_list[0])
+
+        encoded_inputs = BatchFeature({"pixel_values": pixel_values_list}, tensor_type=return_tensors)
+        if segmentation_maps is None:
+            return encoded_inputs
+
+        # A list holds one mapping per image; anything else is shared by all images
+        mapping_per_image = isinstance(instance_id_to_semantic_id, list)
+
+        mask_labels, class_labels = [], []
+        for idx, segmentation_map in enumerate(segmentation_maps):
+            masks, classes = convert_segmentation_map_to_binary_masks(
+                to_numpy_array(segmentation_map),
+                instance_id_to_semantic_id[idx] if mapping_per_image else instance_id_to_semantic_id,
+                ignore_index=ignore_index,
+            )
+            mask_labels.append(torch.from_numpy(masks))
+            class_labels.append(torch.from_numpy(classes))
+
+        # Kept as per-image lists: the number of masks differs between images, so they cannot be batched
+        encoded_inputs["mask_labels"] = mask_labels
+        encoded_inputs["class_labels"] = class_labels
+
+        return encoded_inputs
+
+    def merge_image_patches(
+        self,
+        segmentation_logits: torch.Tensor,
+        patch_offsets: list[tuple[int, int, int]],
+        target_sizes: list[tuple[int, int]],
+        size: dict[str, int],
+    ) -> list[torch.Tensor]:
+        """
+        Reconstructs full-size semantic segmentation logits from patch predictions.
+
+        Args:
+            segmentation_logits (`torch.Tensor`):
+                A tensor of shape `(num_patches, num_classes, patch_height, patch_width)` representing predicted logits
+                for each image patch.
+            patch_offsets (`list[tuple[int, int, int]]`):
+                A list of tuples where each tuple contains:
+                - `image_index` (int): Index of the original image this patch belongs to.
+                - `start` (int): Start pixel index of the patch along the long dimension (height or width).
+                - `end` (int): End pixel index of the patch along the long dimension.
+            target_sizes (`list[tuple[int, int]]`):
+                list of original (height, width) dimensions for each image before preprocessing.
+            size (`dict[str, int]`):
+                A size dict which was used to resize.
+        """
+        num_classes = segmentation_logits.shape[1]
+        aggregated_logits = []
+        patch_counts = []
+
+        for image_size in target_sizes:
+            height, width = get_size_with_aspect_ratio(image_size, size["shortest_edge"], size["longest_edge"])
+            aggregated_logits.append(torch.zeros((num_classes, height, width), device=segmentation_logits.device))
+            patch_counts.append(torch.zeros((num_classes, height, width), device=segmentation_logits.device))
+
+        # Stitch patches back into full-sized logit maps
+        for patch_idx, (image_idx, patch_start, patch_end) in enumerate(patch_offsets):
+            if target_sizes[image_idx][0] > target_sizes[image_idx][1]:
+                aggregated_logits[image_idx][:, patch_start:patch_end, :] += segmentation_logits[patch_idx]
+                patch_counts[image_idx][:, patch_start:patch_end, :] += 1
+            else:
+                aggregated_logits[image_idx][:, :, patch_start:patch_end] += segmentation_logits[patch_idx]
+                patch_counts[image_idx][:, :, patch_start:patch_end] += 1
+
+        # Normalize and resize logits to original image size
+        reconstructed_logits = []
+        for idx, (logit_sum, count) in enumerate(zip(aggregated_logits, patch_counts)):
+            averaged_logits = logit_sum / count.clamp(min=1)
+            resized_logits = F.interpolate(
+                averaged_logits[None, ...],
+                size=target_sizes[idx],
+                mode="bilinear",
+                align_corners=False,
+            )[0]
+
+            reconstructed_logits.append(resized_logits)
+
+        return reconstructed_logits
+
+    def unpad_image(
+        self,
+        segmentation_logits: torch.Tensor,
+        target_sizes: list[tuple[int, int]],
+        size: dict[str, int],
+    ) -> list[torch.Tensor]:
+        """Restores panoptic segmentation logits to their original image resolutions."""
+
+        resized_logits = []
+
+        for idx, original_size in enumerate(target_sizes):
+            target_height, target_width = get_size_with_aspect_ratio(
+                original_size, size["shortest_edge"], size["longest_edge"]
+            )
+            cropped_logits = segmentation_logits[idx][:, :target_height, :target_width]
+            upsampled_logits = F.interpolate(
+                cropped_logits[None, ...], size=original_size, mode="bilinear", align_corners=False
+            )[0]
+            resized_logits.append(upsampled_logits)
+        return resized_logits
+
+    def post_process_semantic_segmentation(
+        self,
+        outputs,
+        target_sizes: list[tuple[int, int]],
+        size: Optional[dict[str, int]] = None,
+    ) -> np.ndarray:
+        """Post-processes model outputs into final semantic segmentation prediction."""
+
+        size = size if size is not None else self.size
+
+        masks_queries_logits = outputs.masks_queries_logits  # [batch_size, num_queries, height, width]
+        class_queries_logits = outputs.class_queries_logits  # [batch_size, num_queries, num_classes+1]
+        patch_offsets = outputs.patch_offsets
+
+        output_size = get_target_size(size)
+        masks_queries_logits = F.interpolate(
+            masks_queries_logits,
+            size=output_size,
+            mode="bilinear",
+        )
+
+        # Remove the null class `[..., :-1]`
+        masks_classes = class_queries_logits.softmax(dim=-1)[..., :-1]
+        masks_probs = masks_queries_logits.sigmoid()  # [batch_size, num_queries, height, width]
+
+        segmentation_logits = torch.einsum("bqc, bqhw -> bchw", masks_classes, masks_probs)
+
+        output_logits = self.merge_image_patches(segmentation_logits, patch_offsets, target_sizes, size)
+
+        preds = [logit.argmax(dim=0) for logit in output_logits]
+        return preds
+
+    def post_process_panoptic_segmentation(
+        self,
+        outputs,
+        target_sizes: list[tuple[int, int]],
+        threshold: float = 0.8,
+        mask_threshold: float = 0.5,
+        overlap_mask_area_threshold: float = 0.8,
+        stuff_classes: Optional[list[int]] = None,
+        size: Optional[dict[str, int]] = None,
+    ):
+        """Post-processes model outputs into final panoptic segmentation prediction."""
+
+        size = size if size is not None else self.size
+
+        masks_queries_logits = outputs.masks_queries_logits  # [batch_size, num_queries, height, width]
+        class_queries_logits = outputs.class_queries_logits  # [batch_size, num_queries, num_classes+1]
+
+        batch_size = class_queries_logits.shape[0]
+        num_labels = class_queries_logits.shape[-1] - 1
+
+        output_size = get_target_size(size)
+        masks_queries_logits = F.interpolate(
+            masks_queries_logits,
+            size=output_size,
+            mode="bilinear",
+        )
+
+        mask_probs_batch = self.unpad_image(masks_queries_logits, target_sizes, size)
+        pred_scores_batch, pred_labels_batch = class_queries_logits.softmax(dim=-1).max(-1)
+
+        results: list = []
+
+        for i in range(batch_size):
+            mask_probs, pred_scores, pred_labels = remove_low_and_no_objects(
+                mask_probs_batch[i], pred_scores_batch[i], pred_labels_batch[i], threshold, num_labels
+            )
+
+            # No mask found
+            if mask_probs.shape[0] <= 0:
+                height, width = target_sizes[i] if target_sizes is not None else mask_probs.shape[1:]
+                segmentation = torch.zeros((height, width)) - 1
+                results.append({"segmentation": segmentation, "segments_info": []})
+                continue
+
+            segmentation, segments = compute_segments(
+                mask_probs=mask_probs,
+                pred_scores=pred_scores,
+                pred_labels=pred_labels,
+                stuff_classes=stuff_classes,
+                mask_threshold=mask_threshold,
+                overlap_mask_area_threshold=overlap_mask_area_threshold,
+                target_size=target_sizes[i] if target_sizes is not None else None,
+            )
+
+            results.append({"segmentation": segmentation, "segments_info": segments})
+        return results
+
+    @filter_out_non_signature_kwargs()
+    def post_process_instance_segmentation(
+        self,
+        outputs,
+        target_sizes: list[tuple[int, int]],
+        threshold: float = 0.5,
+        size: Optional[dict[str, int]] = None,
+    ):
+        """Post-processes model outputs into Instance Segmentation Predictions."""
+
+        size = size if size is not None else self.size
+
+        class_queries_logits = outputs.class_queries_logits
+        masks_queries_logits = outputs.masks_queries_logits
+
+        output_size = get_target_size(size)
+        masks_queries_logits = F.interpolate(
+            masks_queries_logits,
+            size=output_size,
+            mode="bilinear",
+        )
+
+        mask_probs_batch = self.unpad_image(masks_queries_logits, target_sizes, size)
+
+        device = masks_queries_logits.device
+        batch_size = class_queries_logits.shape[0]
+        num_queries = class_queries_logits.shape[-2]
+
+        results = []
+
+        for i in range(batch_size):
+            mask_pred = mask_probs_batch[i]
+            mask_class = class_queries_logits[i]
+
+            # Remove the null class `[..., :-1]`
+            scores, pred_classes = mask_class.softmax(dim=-1)[..., :-1].max(-1)
+            pred_masks = (mask_pred > 0).float()
+
+            # Calculate average mask prob
+            mask_scores = (mask_pred.sigmoid().flatten(1) * pred_masks.flatten(1)).sum(1) / (
+                pred_masks.flatten(1).sum(1) + 1e-6
+            )
+            pred_scores = scores * mask_scores
+
+            segmentation = torch.zeros(target_sizes[i], device=device) - 1
+
+            instance_maps, segments = [], []
+            current_segment_id = 0
+            for j in range(num_queries):
+                score = pred_scores[j].item()
+
+                if not torch.all(pred_masks[j] == 0) and score >= threshold:
+                    segmentation[pred_masks[j] == 1] = current_segment_id
+                    segments.append(
+                        {
+                            "id": current_segment_id,
+                            "label_id": pred_classes[j].item(),
+                            "score": round(score, 6),
+                        }
+                    )
+                    current_segment_id += 1
+                    instance_maps.append(pred_masks[j])
+
+            results.append({"segmentation": segmentation, "segments_info": segments})
+        return results
+
+
+# Names exported by `from ... import *` for this module.
+__all__ = ["EomtImageProcessor"]
diff --git a/phivenv/Lib/site-packages/transformers/models/eomt/image_processing_eomt_fast.py b/phivenv/Lib/site-packages/transformers/models/eomt/image_processing_eomt_fast.py
new file mode 100644
index 0000000000000000000000000000000000000000..58457064412dcf181c5af7a79130a7a91ee2bfaf
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/eomt/image_processing_eomt_fast.py
@@ -0,0 +1,549 @@
+# coding=utf-8
+# Copyright 2025 Mobile Perception Systems Lab at TU/e and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Fast Image processor class for EoMT."""
+
+import math
+from typing import Optional, Union
+
+import numpy as np
+
+from ...image_processing_utils import BatchFeature
+from ...image_processing_utils_fast import (
+    BaseImageProcessorFast,
+    DefaultFastImageProcessorKwargs,
+    group_images_by_shape,
+    reorder_images,
+)
+from ...image_utils import (
+    IMAGENET_DEFAULT_MEAN,
+    IMAGENET_DEFAULT_STD,
+    ChannelDimension,
+    ImageInput,
+    PILImageResampling,
+    SizeDict,
+)
+from ...processing_utils import Unpack
+from ...utils import (
+    TensorType,
+    auto_docstring,
+    filter_out_non_signature_kwargs,
+    is_torch_available,
+    is_torchvision_available,
+    is_torchvision_v2_available,
+)
+from .image_processing_eomt import (
+    compute_segments,
+    convert_segmentation_map_to_binary_masks,
+    get_size_with_aspect_ratio,
+    remove_low_and_no_objects,
+)
+
+
+if is_torch_available():
+    import torch
+
+if is_torchvision_available():
+    if is_torchvision_v2_available():
+        from torchvision.transforms.v2 import functional as F
+    else:
+        from torchvision.transforms import functional as F
+
+
+class EomtImageProcessorFastKwargs(DefaultFastImageProcessorKwargs):
+    """
+    do_split_image (`bool`, *optional*, defaults to `False`):
+            Whether to split the input images into overlapping patches for semantic segmentation. If set to `True`, the
+            input images will be split into patches of size `size["shortest_edge"]` with an overlap between patches.
+            Otherwise, the input images will be padded to the target size.
+    do_pad (`bool`, *optional*, defaults to `False`):
+            Whether to pad the image. If `True`, each image is zero-padded on the bottom and right up to the target
+            height and width derived from `size`.
+    ignore_index (`int`, *optional*):
+            Label to be assigned to background pixels in segmentation maps. If provided, segmentation map pixels
+            denoted with 0 (background) will be replaced with `ignore_index`.
+    """
+
+    # EoMT-specific keyword arguments accepted on top of the default fast-processor kwargs.
+    do_split_image: bool
+    do_pad: bool
+    ignore_index: Optional[int] = None
+
+
+def get_target_size(size_dict: dict[str, int]) -> tuple[int, int]:
+    """Returns the height and width from a size dict."""
+    target_height = size_dict["shortest_edge"]
+    target_width = size_dict["longest_edge"] or target_height
+
+    return target_height, target_width
+
+
+def reorder_patches_and_offsets(
+    patches: list[torch.Tensor], offsets: list[list[int]]
+) -> tuple[list[torch.Tensor], list[list[int]]]:
+    """Sorts patches and offsets according to the original image index."""
+
+    combined = list(zip(offsets, patches))
+    combined.sort(key=lambda x: x[0][0])
+    sorted_offsets, sorted_patches = zip(*combined)
+
+    return list(sorted_patches), list(sorted_offsets)
+
+
+@auto_docstring
+class EomtImageProcessorFast(BaseImageProcessorFast):
+    resample = PILImageResampling.BILINEAR
+    image_mean = IMAGENET_DEFAULT_MEAN
+    image_std = IMAGENET_DEFAULT_STD
+    size = {"shortest_edge": 640, "longest_edge": 640}
+    default_to_square = False
+    do_resize = True
+    do_rescale = True
+    do_normalize = True
+    do_split_image = False
+    do_pad = False
+    ignore_index = None
+    valid_kwargs = EomtImageProcessorFastKwargs
+
+    def __init__(self, **kwargs: Unpack[EomtImageProcessorFastKwargs]):
+        # No EoMT-specific setup here: every keyword argument is forwarded unchanged to the base class.
+        super().__init__(**kwargs)
+
+    def _split_image(self, images: torch.Tensor, size: dict, image_indices: int) -> tuple[list, list]:
+        """Slices an image into overlapping patches for semantic segmentation."""
+
+        patches, patch_offsets = [], []
+
+        _, _, height, width = images.shape
+        patch_size = size["shortest_edge"]
+
+        longer_side = max(height, width)
+        num_patches = math.ceil(longer_side / patch_size)
+        total_overlap = num_patches * patch_size - longer_side
+        overlap_per_patch = total_overlap / (num_patches - 1) if num_patches > 1 else 0
+
+        for i in range(num_patches):
+            start = int(i * (patch_size - overlap_per_patch))
+            end = start + patch_size
+
+            if height > width:
+                batch_patch = images[:, :, start:end, :]
+            else:
+                batch_patch = images[:, :, :, start:end]
+
+            for batch_idx, single in enumerate(torch.unbind(batch_patch, dim=0)):
+                patches.append(single)
+                patch_offsets.append([image_indices[batch_idx], start, end])
+
+        return patches, patch_offsets
+
+    def _pad(self, images: torch.Tensor, size: dict) -> torch.Tensor:
+        """Pads the image to the target size using zero padding."""
+        _, _, height, width = images.shape
+
+        target_height, target_width = get_target_size(size)
+        pad_h = max(0, target_height - height)
+        pad_w = max(0, target_width - width)
+        padding = (0, pad_w, 0, pad_h)
+
+        padded_images = torch.nn.functional.pad(images, padding, mode="constant", value=0.0)
+        return padded_images
+
+    @auto_docstring
+    def preprocess(
+        self,
+        images: ImageInput,
+        segmentation_maps: Optional[list[torch.Tensor]] = None,
+        instance_id_to_semantic_id: Optional[Union[list[dict[int, int]], dict[int, int]]] = None,
+        **kwargs: Unpack[EomtImageProcessorFastKwargs],
+    ) -> BatchFeature:
+        r"""
+        segmentation_maps (`ImageInput`, *optional*):
+            The segmentation maps to preprocess for corresponding images.
+        instance_id_to_semantic_id (`list[dict[int, int]]` or `dict[int, int]`, *optional*):
+            A mapping between object instance ids and class ids.
+        """
+        # Thin wrapper: the two segmentation-specific arguments are forwarded positionally to the base class.
+        return super().preprocess(images, segmentation_maps, instance_id_to_semantic_id, **kwargs)
+
+    def _preprocess_image_like_inputs(
+        self,
+        images: ImageInput,
+        segmentation_maps: Optional[ImageInput],
+        instance_id_to_semantic_id: Optional[Union[list[dict[int, int]], dict[int, int]]],
+        do_convert_rgb: bool,
+        input_data_format: ChannelDimension,
+        device: Optional[Union[str, "torch.device"]] = None,
+        **kwargs: Unpack[EomtImageProcessorFastKwargs],
+    ) -> BatchFeature:
+        """
+        Preprocess image-like inputs.
+
+        Runs `_preprocess` on the images and, when given, on the segmentation maps (with rescaling and
+        normalization disabled and nearest-neighbour interpolation). Each processed segmentation map is then
+        converted into binary masks and class labels.
+
+        Returns:
+            `BatchFeature` with `"pixel_values"`, plus `"mask_labels"` and `"class_labels"` (lists with one entry
+            per segmentation map) when `segmentation_maps` is given, plus `"patch_offsets"` when `_preprocess`
+            returned any.
+        """
+        images = self._prepare_image_like_inputs(
+            images=images, do_convert_rgb=do_convert_rgb, input_data_format=input_data_format, device=device
+        )
+        # `ignore_index` only matters for the mask conversion below, so it is removed before `_preprocess` sees kwargs.
+        ignore_index = kwargs.pop("ignore_index", None)
+        images_kwargs = kwargs.copy()
+        processed_images, patch_offsets = self._preprocess(images, **images_kwargs)
+        outputs = BatchFeature({"pixel_values": processed_images})
+
+        if segmentation_maps is not None:
+            # NOTE(review): `device` is not forwarded for the segmentation maps - confirm this is intended.
+            processed_segmentation_maps = self._prepare_image_like_inputs(
+                images=segmentation_maps,
+                expected_ndims=2,
+                do_convert_rgb=False,
+                input_data_format=ChannelDimension.FIRST,
+            )
+
+            segmentation_maps_kwargs = kwargs.copy()
+            segmentation_maps_kwargs.update(
+                {
+                    "do_normalize": False,
+                    "do_rescale": False,
+                    # Nearest interpolation is used for segmentation maps instead of BILINEAR.
+                    "interpolation": F.InterpolationMode.NEAREST_EXACT
+                    if is_torchvision_v2_available()
+                    else F.InterpolationMode.NEAREST,
+                }
+            )
+
+            # The patch offsets of the maps are discarded; only those computed for the images are returned.
+            processed_segmentation_maps, _ = self._preprocess(
+                images=processed_segmentation_maps, **segmentation_maps_kwargs
+            )
+            # Drop dimension 1 (presumably the singleton channel axis - TODO confirm) and use integer ids.
+            processed_segmentation_maps = processed_segmentation_maps.squeeze(1).to(torch.int64)
+            # Convert to list of binary masks and labels
+            mask_labels, class_labels = [], []
+            for idx, segmentation_map in enumerate(processed_segmentation_maps):
+                if isinstance(instance_id_to_semantic_id, list):
+                    instance_id = instance_id_to_semantic_id[idx]
+                else:
+                    instance_id = instance_id_to_semantic_id
+                # Use instance2class_id mapping per image
+                masks, classes = convert_segmentation_map_to_binary_masks(
+                    segmentation_map,
+                    instance_id,
+                    ignore_index=ignore_index,
+                )
+
+                mask_labels.append(torch.from_numpy(masks))
+                class_labels.append(torch.from_numpy(classes))
+
+            # we cannot batch them since they don't share a common class size
+            outputs["mask_labels"] = mask_labels
+            outputs["class_labels"] = class_labels
+
+        # Only set when `_preprocess` produced offsets, i.e. when images were split into patches.
+        if patch_offsets:
+            outputs["patch_offsets"] = [torch.tensor(offsets) for offsets in patch_offsets]
+
+        return outputs
+
+    def _preprocess(
+        self,
+        images: list["torch.Tensor"],
+        do_resize: bool,
+        size: SizeDict,
+        interpolation: Optional["F.InterpolationMode"],
+        do_rescale: bool,
+        rescale_factor: float,
+        do_normalize: bool,
+        do_split_image: bool,
+        do_pad: bool,
+        image_mean: Optional[Union[float, list[float]]],
+        image_std: Optional[Union[float, list[float]]],
+        disable_grouping: Optional[bool],
+        return_tensors: Optional[Union[str, TensorType]],
+        **kwargs,
+    ):
+        """Preprocesses the input images and masks if provided."""
+        processed_images, patch_offsets = [], []
+
+        grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping)
+        resized_images_grouped = {}
+
+        for shape, stacked_images in grouped_images.items():
+            if do_resize:
+                stacked_images = self.resize(image=stacked_images, size=size, interpolation=interpolation)
+                resized_images_grouped[shape] = stacked_images
+        images = reorder_images(resized_images_grouped, grouped_images_index)
+
+        # Group images by size for batched resizing, Needed in case do_resize is False.
+        grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping)
+        processed_images_grouped = {}
+
+        for shape, stacked_images in grouped_images.items():
+            original_indices = [
+                original_idx for original_idx, (img_shape, _) in grouped_images_index.items() if img_shape == shape
+            ]
+
+            if do_split_image:
+                patches, offsets = self._split_image(stacked_images, size, original_indices)
+                processed_images.extend(patches)
+                patch_offsets.extend(offsets)
+
+            if do_pad:
+                stacked_images = self._pad(stacked_images, size)
+                processed_images_grouped[shape] = stacked_images
+
+        if do_split_image:
+            images, patch_offsets = reorder_patches_and_offsets(processed_images, patch_offsets)
+
+        if do_pad:
+            images = reorder_images(processed_images_grouped, grouped_images_index)
+
+        grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping)
+        processed_images_grouped = {}
+
+        for shape, stacked_images in grouped_images.items():
+            stacked_images = self.rescale_and_normalize(
+                stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std
+            )
+            processed_images_grouped[shape] = stacked_images
+        images = reorder_images(processed_images_grouped, grouped_images_index)
+
+        processed_images = torch.stack(images, dim=0) if return_tensors else images
+
+        return processed_images, patch_offsets
+
+    def merge_image_patches(
+        self,
+        segmentation_logits: torch.Tensor,
+        patch_offsets: list[tuple[int, int, int]],
+        target_sizes: list[tuple[int, int]],
+        size: dict[str, int],
+    ) -> list[torch.Tensor]:
+        """
+        Reconstructs full-size semantic segmentation logits from patch predictions.
+
+        Args:
+            segmentation_logits (`torch.Tensor`):
+                A tensor of shape `(num_patches, num_classes, patch_height, patch_width)` representing predicted logits
+                for each image patch.
+            patch_offsets (`list[tuple[int, int, int]]`):
+                A list of tuples where each tuple contains:
+                - `image_index` (int): Index of the original image this patch belongs to.
+                - `start` (int): Start pixel index of the patch along the long dimension (height or width).
+                - `end` (int): End pixel index of the patch along the long dimension.
+            target_sizes (`list[tuple[int, int]]`):
+                list of original (height, width) dimensions for each image before preprocessing.
+            size (`dict[str, int]`):
+                A size dict which was used to resize.
+        """
+        num_classes = segmentation_logits.shape[1]
+        aggregated_logits = []
+        patch_counts = []
+
+        for image_size in target_sizes:
+            height, width = get_size_with_aspect_ratio(image_size, size["shortest_edge"], size["longest_edge"])
+            aggregated_logits.append(torch.zeros((num_classes, height, width), device=segmentation_logits.device))
+            patch_counts.append(torch.zeros((num_classes, height, width), device=segmentation_logits.device))
+
+        # Stitch patches back into full-sized logit maps
+        for patch_idx, (image_idx, patch_start, patch_end) in enumerate(patch_offsets):
+            if target_sizes[image_idx][0] > target_sizes[image_idx][1]:
+                aggregated_logits[image_idx][:, patch_start:patch_end, :] += segmentation_logits[patch_idx]
+                patch_counts[image_idx][:, patch_start:patch_end, :] += 1
+            else:
+                aggregated_logits[image_idx][:, :, patch_start:patch_end] += segmentation_logits[patch_idx]
+                patch_counts[image_idx][:, :, patch_start:patch_end] += 1
+
+        # Normalize and resize logits to original image size
+        reconstructed_logits = []
+        for idx, (logit_sum, count) in enumerate(zip(aggregated_logits, patch_counts)):
+            averaged_logits = logit_sum / count.clamp(min=1)
+            resized_logits = torch.nn.functional.interpolate(
+                averaged_logits[None, ...],
+                size=target_sizes[idx],
+                mode="bilinear",
+                align_corners=False,
+            )[0]
+
+            reconstructed_logits.append(resized_logits)
+
+        return reconstructed_logits
+
+    def unpad_image(
+        self,
+        segmentation_logits: torch.Tensor,
+        target_sizes: list[tuple[int, int]],
+        size: dict[str, int],
+    ) -> list[torch.Tensor]:
+        """Restores panoptic segmentation logits to their original image resolutions."""
+
+        resized_logits = []
+
+        for idx, original_size in enumerate(target_sizes):
+            target_height, target_width = get_size_with_aspect_ratio(
+                original_size, size["shortest_edge"], size["longest_edge"]
+            )
+            cropped_logits = segmentation_logits[idx][:, :target_height, :target_width]
+            upsampled_logits = torch.nn.functional.interpolate(
+                cropped_logits[None, ...], size=original_size, mode="bilinear", align_corners=False
+            )[0]
+            resized_logits.append(upsampled_logits)
+        return resized_logits
+
+    def post_process_semantic_segmentation(
+        self,
+        outputs,
+        target_sizes: list[tuple[int, int]],
+        size: Optional[dict[str, int]] = None,
+    ) -> np.ndarray:
+        """Post-processes model outputs into final semantic segmentation prediction."""
+
+        size = size if size is not None else self.size
+
+        masks_queries_logits = outputs.masks_queries_logits  # [batch_size, num_queries, height, width]
+        class_queries_logits = outputs.class_queries_logits  # [batch_size, num_queries, num_classes+1]
+        patch_offsets = outputs.patch_offsets
+
+        output_size = get_target_size(size)
+        masks_queries_logits = torch.nn.functional.interpolate(
+            masks_queries_logits,
+            size=output_size,
+            mode="bilinear",
+        )
+
+        # Remove the null class `[..., :-1]`
+        masks_classes = class_queries_logits.softmax(dim=-1)[..., :-1]
+        masks_probs = masks_queries_logits.sigmoid()  # [batch_size, num_queries, height, width]
+
+        segmentation_logits = torch.einsum("bqc, bqhw -> bchw", masks_classes, masks_probs)
+
+        output_logits = self.merge_image_patches(segmentation_logits, patch_offsets, target_sizes, size)
+
+        preds = [logit.argmax(dim=0) for logit in output_logits]
+        return preds
+
+    def post_process_panoptic_segmentation(
+        self,
+        outputs,
+        target_sizes: list[tuple[int, int]],
+        threshold: float = 0.8,
+        mask_threshold: float = 0.5,
+        overlap_mask_area_threshold: float = 0.8,
+        stuff_classes: Optional[list[int]] = None,
+        size: Optional[dict[str, int]] = None,
+    ):
+        """Post-processes model outputs into final panoptic segmentation prediction."""
+
+        size = size if size is not None else self.size
+
+        masks_queries_logits = outputs.masks_queries_logits  # [batch_size, num_queries, height, width]
+        class_queries_logits = outputs.class_queries_logits  # [batch_size, num_queries, num_classes+1]
+
+        batch_size = class_queries_logits.shape[0]
+        num_labels = class_queries_logits.shape[-1] - 1
+
+        output_size = get_target_size(size)
+        masks_queries_logits = torch.nn.functional.interpolate(
+            masks_queries_logits,
+            size=output_size,
+            mode="bilinear",
+        )
+
+        mask_probs_batch = self.unpad_image(masks_queries_logits, target_sizes, size)
+        pred_scores_batch, pred_labels_batch = class_queries_logits.softmax(dim=-1).max(-1)
+
+        results: list = []
+
+        for i in range(batch_size):
+            mask_probs, pred_scores, pred_labels = remove_low_and_no_objects(
+                mask_probs_batch[i], pred_scores_batch[i], pred_labels_batch[i], threshold, num_labels
+            )
+
+            # No mask found
+            if mask_probs.shape[0] <= 0:
+                height, width = target_sizes[i] if target_sizes is not None else mask_probs.shape[1:]
+                segmentation = torch.zeros((height, width)) - 1
+                results.append({"segmentation": segmentation, "segments_info": []})
+                continue
+
+            segmentation, segments = compute_segments(
+                mask_probs=mask_probs,
+                pred_scores=pred_scores,
+                pred_labels=pred_labels,
+                stuff_classes=stuff_classes,
+                mask_threshold=mask_threshold,
+                overlap_mask_area_threshold=overlap_mask_area_threshold,
+                target_size=target_sizes[i] if target_sizes is not None else None,
+            )
+
+            results.append({"segmentation": segmentation, "segments_info": segments})
+        return results
+
+    @filter_out_non_signature_kwargs()
+    def post_process_instance_segmentation(
+        self,
+        outputs,
+        target_sizes: list[tuple[int, int]],
+        threshold: float = 0.8,
+        size: Optional[dict[str, int]] = None,
+    ):
+        """Post-processes model outputs into Instance Segmentation Predictions."""
+
+        size = size if size is not None else self.size
+
+        masks_queries_logits = outputs.masks_queries_logits
+        class_queries_logits = outputs.class_queries_logits
+
+        output_size = get_target_size(size)
+        masks_queries_logits = torch.nn.functional.interpolate(
+            masks_queries_logits,
+            size=output_size,
+            mode="bilinear",
+        )
+
+        mask_probs_batch = self.unpad_image(masks_queries_logits, target_sizes, size)
+
+        device = masks_queries_logits.device
+        batch_size = class_queries_logits.shape[0]
+        num_queries = class_queries_logits.shape[-2]
+
+        results = []
+
+        for i in range(batch_size):
+            mask_pred = mask_probs_batch[i]
+            mask_class = class_queries_logits[i]
+
+            # Remove the null class `[..., :-1]`
+            scores, pred_classes = mask_class.softmax(dim=-1)[..., :-1].max(-1)
+            pred_masks = (mask_pred > 0).float()
+
+            # Calculate average mask prob
+            mask_scores = (mask_pred.sigmoid().flatten(1) * pred_masks.flatten(1)).sum(1) / (
+                pred_masks.flatten(1).sum(1) + 1e-6
+            )
+            pred_scores = scores * mask_scores
+
+            segmentation = torch.zeros(target_sizes[i], device=device) - 1
+
+            instance_maps, segments = [], []
+            current_segment_id = 0
+            for j in range(num_queries):
+                score = pred_scores[j].item()
+
+                if not torch.all(pred_masks[j] == 0) and score >= threshold:
+                    segmentation[pred_masks[j] == 1] = current_segment_id
+                    segments.append(
+                        {
+                            "id": current_segment_id,
+                            "label_id": pred_classes[j].item(),
+                            "score": round(score, 6),
+                        }
+                    )
+                    current_segment_id += 1
+                    instance_maps.append(pred_masks[j])
+
+            results.append({"segmentation": segmentation, "segments_info": segments})
+        return results
+
+
+# Names exported by `from ... import *` for this module.
+__all__ = ["EomtImageProcessorFast"]
diff --git a/phivenv/Lib/site-packages/transformers/models/eomt/modeling_eomt.py b/phivenv/Lib/site-packages/transformers/models/eomt/modeling_eomt.py
new file mode 100644
index 0000000000000000000000000000000000000000..ff04a55614e636baea941d900ba1a6141486bd3e
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/eomt/modeling_eomt.py
@@ -0,0 +1,1227 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/eomt/modular_eomt.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_eomt.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# coding=utf-8
+# Copyright 2025 Mobile Perception Systems Lab at TU/e and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import collections.abc
+import math
+from dataclasses import dataclass
+from typing import Callable, Optional
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+from torch import Tensor, nn
+
+from ...activations import ACT2FN
+from ...file_utils import ModelOutput, is_scipy_available, requires_backends
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from ...processing_utils import Unpack
+from ...utils import TransformersKwargs, auto_docstring, is_accelerate_available
+from ...utils.generic import check_model_inputs
+from .configuration_eomt import EomtConfig
+
+
+if is_scipy_available():
+    from scipy.optimize import linear_sum_assignment
+
+if is_accelerate_available():
+    from accelerate import PartialState
+    from accelerate.utils import reduce
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Class for outputs of [`EomtForUniversalSegmentationOutput`].
+
+    This output can be directly passed to [`~EomtImageProcessor.post_process_semantic_segmentation`] or
+    [`~EomtImageProcessor.post_process_instance_segmentation`] or
+    [`~EomtImageProcessor.post_process_panoptic_segmentation`] to compute final segmentation maps. Please, see
+    [`~EomtImageProcessor] for details regarding usage.
+    """
+)
+class EomtForUniversalSegmentationOutput(ModelOutput):
+    r"""
+    loss (`torch.Tensor`, *optional*):
+        The computed loss, returned when labels are present.
+    class_queries_logits (`torch.FloatTensor`):
+        A tensor of shape `(batch_size, num_queries, num_labels + 1)` representing the proposed classes for each
+        query. Note the `+ 1` is needed because we incorporate the null class.
+    masks_queries_logits (`torch.FloatTensor`):
+        A tensor of shape `(batch_size, num_queries, height, width)` representing the proposed masks for each
+        query.
+    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
+        Last hidden states (final feature map) of the last layer.
+    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
+        shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of all layers of the model.
+    attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+        Tuple of `tuple(torch.FloatTensor)` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+        sequence_length)`. Self and Cross Attentions weights from transformer decoder.
+    patch_offsets (`list[torch.Tensor]`, *optional*):
+        List of tuples indicating the image index and start and end positions of patches for semantic segmentation.
+    """
+
+    loss: Optional[torch.FloatTensor] = None
+    class_queries_logits: Optional[torch.FloatTensor] = None
+    masks_queries_logits: Optional[torch.FloatTensor] = None
+    last_hidden_state: Optional[torch.FloatTensor] = None
+    hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    attentions: Optional[tuple[torch.FloatTensor]] = None
+    patch_offsets: Optional[list[torch.Tensor]] = None
+
+
+# Adapted from https://github.com/facebookresearch/detectron2/blob/main/projects/PointRend/point_rend/point_features.py
+def sample_point(
+    input_features: torch.Tensor, point_coordinates: torch.Tensor, add_dim=False, **kwargs
+) -> torch.Tensor:
+    """
+    A wrapper around `torch.nn.functional.grid_sample` to support 3D point_coordinates tensors.
+
+    Args:
+        input_features (`torch.Tensor` of shape (batch_size, channels, height, width)):
+            A tensor that contains features map on a height * width grid
+        point_coordinates (`torch.Tensor` of shape (batch_size, num_points, 2) or (batch_size, grid_height, grid_width, 2)):
+            A tensor that contains [0, 1] * [0, 1] normalized point coordinates. They are rescaled to [-1, 1] before
+            being handed to `grid_sample`.
+        add_dim (`bool`, *optional*, defaults to `False`):
+            If `True`, dim 3 of the result is squeezed; forced to `True` for 3D `point_coordinates`. `**kwargs` are forwarded to `grid_sample`.
+
+    Returns:
+        point_features (`torch.Tensor` of shape (batch_size, channels, num_points) or (batch_size, channels,
+        height_grid, width_grid)):
+            A tensor that contains features for points in `point_coordinates`.
+    """
+    if point_coordinates.dim() == 3:
+        add_dim = True  # remember that a grid dimension is inserted so it can be removed from the output
+        point_coordinates = point_coordinates.unsqueeze(2)
+
+    # use nn.functional.grid_sample to get features for points in `point_coordinates` via bilinear interpolation
+    point_features = torch.nn.functional.grid_sample(input_features, 2.0 * point_coordinates - 1.0, **kwargs)
+    if add_dim:
+        point_features = point_features.squeeze(3)
+
+    return point_features
+
+
+def pair_wise_dice_loss(inputs: Tensor, labels: Tensor) -> Tensor:
+    """
+    A pair wise version of the dice loss, see `dice_loss` for usage.
+
+    Args:
+        inputs (`torch.Tensor`):
+            A tensor of mask logits; `sigmoid` is applied and all dims from dim 1 onwards are flattened.
+        labels (`torch.Tensor`):
+            Expected to be 2D (it is transposed with `.T`), with a last dim matching the flattened inputs. Stores the
+            binary classification labels (0 for the negative class and 1 for the positive class), one row per mask.
+
+    Returns:
+        `torch.Tensor`: The loss of every (input, label) pair, of shape `(inputs.shape[0], labels.shape[0])`.
+    """
+    inputs = inputs.sigmoid().flatten(1)
+    numerator = 2 * torch.matmul(inputs, labels.T)
+    # broadcast the per-row sums into an `(inputs.shape[0], labels.shape[0])` matrix
+    denominator = inputs.sum(-1)[:, None] + labels.sum(-1)[None, :]
+    loss = 1 - (numerator + 1) / (denominator + 1)  # `+ 1` on both sides is additive smoothing
+    return loss
+
+
+def pair_wise_sigmoid_cross_entropy_loss(inputs: torch.Tensor, labels: torch.Tensor) -> torch.Tensor:
+    r"""
+    A pair wise version of the cross entropy loss, see `sigmoid_cross_entropy_loss` for usage.
+
+    Args:
+        inputs (`torch.Tensor`):
+            A tensor of mask logits, one row per predicted mask (dim 1 indexes the elements of a mask).
+        labels (`torch.Tensor`):
+            Expected to be 2D (it is transposed with `.T`), with a last dim equal to `inputs.shape[1]`. Stores the
+            binary classification labels (0 for the negative class and 1 for the positive class), one row per mask.
+
+    Returns:
+        loss (`torch.Tensor`): The loss of every (input, label) pair, of shape `(inputs.shape[0], labels.shape[0])`.
+    """
+
+    height_and_width = inputs.shape[1]  # number of elements per mask, used to average the element-wise loss
+
+    criterion = nn.BCEWithLogitsLoss(reduction="none")
+    cross_entropy_loss_pos = criterion(inputs, torch.ones_like(inputs))  # element-wise loss if the label were 1
+    cross_entropy_loss_neg = criterion(inputs, torch.zeros_like(inputs))  # element-wise loss if the label were 0
+
+    loss_pos = torch.matmul(cross_entropy_loss_pos / height_and_width, labels.T)  # keeps the terms where label == 1
+    loss_neg = torch.matmul(cross_entropy_loss_neg / height_and_width, (1 - labels).T)  # keeps the terms where label == 0
+    loss = loss_pos + loss_neg
+    return loss
+
+
+# Adapted from https://github.com/facebookresearch/Eomt/blob/main/eomt/modeling/matcher.py
+class EomtHungarianMatcher(nn.Module):
+    """This class computes an assignment between the labels and the predictions of the network.
+
+    For efficiency reasons, the labels don't include the no_object. Because of this, in general, there are more
+    predictions than labels. In this case, we do a 1-to-1 matching of the best predictions, while the others are
+    un-matched (and thus treated as non-objects).
+    """
+
+    def __init__(
+        self, cost_class: float = 1.0, cost_mask: float = 1.0, cost_dice: float = 1.0, num_points: int = 12544
+    ):
+        """Creates the matcher. Raises a `ValueError` when all three cost weights are 0.
+
+        Params:
+            cost_class (`float`, *optional*, defaults to 1.0):
+                Relative weight of the classification error in the matching cost.
+            cost_mask (`float`, *optional*,  defaults to 1.0):
+                This is the relative weight of the sigmoid cross entropy loss of the binary mask in the matching cost.
+            cost_dice (`float`, *optional*, defaults to 1.0):
+                This is the relative weight of the dice loss of the binary mask in the matching cost.
+            num_points (`int`, *optional*, defaults to 12544):
+                No. of points to sample on which the mask loss will be calculated. The same set of K points are
+                uniformly sampled for all prediction and ground truth masks to construct the cost matrix for bipartite
+                matching.
+        """
+        super().__init__()
+        if cost_class == 0 and cost_mask == 0 and cost_dice == 0:
+            raise ValueError("All costs can't be 0")
+
+        self.num_points = num_points
+        self.cost_class = cost_class
+        self.cost_mask = cost_mask
+        self.cost_dice = cost_dice
+
+    @torch.no_grad()
+    def forward(
+        self,
+        masks_queries_logits: torch.Tensor,
+        class_queries_logits: torch.Tensor,
+        mask_labels: torch.Tensor,
+        class_labels: torch.Tensor,
+    ) -> list[tuple[Tensor]]:
+        """
+        Params:
+            masks_queries_logits (`torch.Tensor`):
+                A tensor of dim `batch_size, num_queries, height, width` with the predicted masks.
+            class_queries_logits (`torch.Tensor`):
+                A tensor of dim `batch_size, num_queries, num_labels` with the classification logits.
+            class_labels (`torch.Tensor`):
+                Indexed per batch element; each entry is a tensor of dim `num_target_boxes` (where num_target_boxes is
+                the number of ground-truth objects in the target) containing the class labels.
+            mask_labels (`torch.Tensor`):
+                Indexed per batch element; each entry is of dim `num_target_boxes, height, width` with the target masks.
+
+        Returns:
+            matched_indices (`list[tuple[Tensor]]`): A list of size batch_size, containing tuples of (index_i, index_j)
+            where:
+                - index_i is the indices of the selected predictions (in order)
+                - index_j is the indices of the corresponding selected labels (in order)
+            For each batch element, it holds:
+                len(index_i) = len(index_j) = min(num_queries, num_target_boxes).
+        """
+        indices: list[tuple[np.array]] = []
+
+        # iterate through batch size
+        batch_size = masks_queries_logits.shape[0]
+        for i in range(batch_size):
+            pred_probs = class_queries_logits[i].softmax(-1)
+            pred_mask = masks_queries_logits[i]
+
+            # Compute the classification cost. Contrary to the loss, we don't use the NLL, but approximate it by 1 - proba[target class]. The 1 is a constant that doesn't change the matching, so it is omitted.
+            cost_class = -pred_probs[:, class_labels[i]]
+            target_mask = mask_labels[i].to(pred_mask)  # match device and dtype of the predictions
+            target_mask = target_mask[:, None]  # add a channel dim, as expected by `sample_point`
+            pred_mask = pred_mask[:, None]
+
+            # Sample ground truth and predicted masks at one shared set of random points
+            point_coordinates = torch.rand(1, self.num_points, 2, device=pred_mask.device)
+
+            target_coordinates = point_coordinates.repeat(target_mask.shape[0], 1, 1)
+            target_mask = sample_point(target_mask, target_coordinates, align_corners=False).squeeze(1)
+
+            pred_coordinates = point_coordinates.repeat(pred_mask.shape[0], 1, 1)
+            pred_mask = sample_point(pred_mask, pred_coordinates, align_corners=False).squeeze(1)
+
+            # compute the cross entropy loss between each mask pairs -> shape (num_queries, num_labels)
+            cost_mask = pair_wise_sigmoid_cross_entropy_loss(pred_mask, target_mask)
+            # Compute the dice loss between each mask pairs -> shape (num_queries, num_labels)
+            cost_dice = pair_wise_dice_loss(pred_mask, target_mask)
+            # final cost matrix: weighted sum of the three pairwise costs
+            cost_matrix = self.cost_mask * cost_mask + self.cost_class * cost_class + self.cost_dice * cost_dice
+            # eliminate infinite values in cost_matrix to avoid the error ``ValueError: cost matrix is infeasible``
+            cost_matrix = torch.minimum(cost_matrix, torch.tensor(1e10))
+            cost_matrix = torch.maximum(cost_matrix, torch.tensor(-1e10))
+            cost_matrix = torch.nan_to_num(cost_matrix, 0)  # NaN entries become 0
+            # do the assignment using the hungarian algorithm in scipy
+            assigned_indices: tuple[np.array] = linear_sum_assignment(cost_matrix.cpu())
+            indices.append(assigned_indices)
+
+        # It could be stacked in one tensor
+        matched_indices = [
+            (torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices
+        ]
+        return matched_indices
+
+
+def dice_loss(inputs: Tensor, labels: Tensor, num_masks: int) -> Tensor:
+    r"""
+    Compute the DICE loss, similar to generalized IOU for masks as follows:
+
+    $$ \mathcal{L}_{\text{dice}}(x, y) = 1 - \frac{2 * |x \cap y| + 1}{|x| + |y| + 1} $$
+
+    In practice, since `labels` is a binary mask, (only 0s and 1s), dice can be computed as follow
+
+    $$ \mathcal{L}_{\text{dice}}(x, y) = 1 - \frac{2 * \sum (x * y) + 1}{\sum x + \sum y + 1} $$
+
+    Args:
+        inputs (`torch.Tensor`):
+            A tensor of mask logits; `sigmoid` is applied and all dims from dim 1 onwards are flattened.
+        labels (`torch.Tensor`):
+            A tensor that multiplies element-wise with the flattened inputs. Stores the binary classification labels
+            for each element (0 for the negative class and 1 for the positive class).
+        num_masks (`int`):
+            The number of masks present in the current batch, used for normalization.
+
+    Returns:
+        `torch.Tensor`: The per-mask losses summed and divided by `num_masks`.
+    """
+    probs = inputs.sigmoid().flatten(1)
+    numerator = 2 * (probs * labels).sum(-1)
+    denominator = probs.sum(-1) + labels.sum(-1)
+    loss = 1 - (numerator + 1) / (denominator + 1)  # `+ 1` on both sides is additive smoothing
+    loss = loss.sum() / num_masks
+    return loss
+
+
+def sigmoid_cross_entropy_loss(inputs: torch.Tensor, labels: torch.Tensor, num_masks: int) -> torch.Tensor:
+    r"""
+    Args:
+        inputs (`torch.Tensor`):
+            A float tensor of logits; the element-wise loss is averaged over its dim 1.
+        labels (`torch.Tensor`):
+            Same shape as `inputs`. Binary labels per element (0 for the negative class and 1 for the positive class).
+        num_masks (`int`): The number the summed per-mask loss is divided by, used for normalization.
+
+    Returns:
+        loss (`torch.Tensor`): Binary cross entropy on logits, averaged over dim 1, summed and divided by `num_masks`.
+    """
+    criterion = nn.BCEWithLogitsLoss(reduction="none")
+    cross_entropy_loss = criterion(inputs, labels)
+
+    loss = cross_entropy_loss.mean(1).sum() / num_masks
+    return loss
+
+
+# Adapted from https://github.com/facebookresearch/Eomt/blob/main/eomt/modeling/criterion.py
+class EomtLoss(nn.Module):
+    def __init__(self, config: EomtConfig, weight_dict: dict[str, float]):
+        """
+        The Eomt Loss. The loss is computed very similar to DETR. The process happens in two steps: 1) we
+        compute hungarian assignment between ground truth masks and the outputs of the model 2) we supervise each pair
+        of matched ground-truth / prediction (supervise class and mask)
+
+        Args:
+            config (`EomtConfig`):
+                The configuration for Eomt model also containing loss calculation specific parameters.
+            weight_dict (`dict[str, float]`):
+                A dictionary of weights for the different losses. It is only stored here, not applied by this class.
+        """
+        super().__init__()
+        requires_backends(self, ["scipy"])  # the Hungarian matcher calls scipy's `linear_sum_assignment`
+        self.num_labels = config.num_labels
+        self.weight_dict = weight_dict
+
+        # Weight to apply to the null class
+        self.eos_coef = config.no_object_weight
+        empty_weight = torch.ones(self.num_labels + 1)
+        empty_weight[-1] = self.eos_coef  # the null class is the last index
+        self.register_buffer("empty_weight", empty_weight)
+
+        # pointwise mask loss parameters
+        self.num_points = config.train_num_points
+        self.oversample_ratio = config.oversample_ratio
+        self.importance_sample_ratio = config.importance_sample_ratio
+
+        self.matcher = EomtHungarianMatcher(
+            cost_class=config.class_weight,
+            cost_dice=config.dice_weight,
+            cost_mask=config.mask_weight,
+            num_points=self.num_points,
+        )
+
+    def _max_by_axis(self, sizes: list[list[int]]) -> list[int]:
+        maxes = sizes[0]  # NOTE: aliases `sizes[0]`, so the first sublist is updated in place
+        for sublist in sizes[1:]:
+            for index, item in enumerate(sublist):
+                maxes[index] = max(maxes[index], item)  # element-wise maximum over all sublists
+        return maxes
+
+    # Adapted from nested_tensor_from_tensor_list() in original implementation
+    def _pad_images_to_max_in_batch(self, tensors: list[Tensor]) -> tuple[Tensor, Tensor]:
+        # get the maximum size in the batch
+        max_size = self._max_by_axis([list(tensor.shape) for tensor in tensors])
+        # compute final size
+        batch_shape = [len(tensors)] + max_size
+        batch_size, _, height, width = batch_shape  # the unpacking requires 3D input tensors
+        dtype = tensors[0].dtype
+        device = tensors[0].device
+        padded_tensors = torch.zeros(batch_shape, dtype=dtype, device=device)
+        padding_masks = torch.ones((batch_size, height, width), dtype=torch.bool, device=device)
+        # pad the tensors to the size of the biggest one; in the mask, `True` marks padding and `False` real content
+        for tensor, padded_tensor, padding_mask in zip(tensors, padded_tensors, padding_masks):
+            padded_tensor[: tensor.shape[0], : tensor.shape[1], : tensor.shape[2]].copy_(tensor)
+            padding_mask[: tensor.shape[1], : tensor.shape[2]] = False
+
+        return padded_tensors, padding_masks
+
+    def loss_labels(
+        self, class_queries_logits: Tensor, class_labels: list[Tensor], indices: tuple[np.array]
+    ) -> dict[str, Tensor]:
+        """Compute the losses related to the labels using cross entropy.
+
+        Args:
+            class_queries_logits (`torch.Tensor`):
+                A tensor of shape `batch_size, num_queries, num_labels + 1` (the last class is the null class).
+            class_labels (`list[torch.Tensor]`):
+                List of class labels of shape `(labels)`.
+            indices (`tuple[np.array])`:
+                The indices computed by the Hungarian matcher.
+
+        Returns:
+            `dict[str, Tensor]`: A dict of `torch.Tensor` containing the following key:
+            - **loss_cross_entropy** -- The loss computed using cross entropy on the predicted and ground truth labels.
+        """
+        pred_logits = class_queries_logits
+        batch_size, num_queries, _ = pred_logits.shape
+        criterion = nn.CrossEntropyLoss(weight=self.empty_weight)
+        idx = self._get_predictions_permutation_indices(indices)  # tuple of (batch indices, query indices) of the matches
+        target_classes_o = torch.cat(
+            [target[j] for target, (_, j) in zip(class_labels, indices)]
+        )  # 1D: the matched ground-truth classes of all images, concatenated
+        target_classes = torch.full(
+            (batch_size, num_queries), fill_value=self.num_labels, dtype=torch.int64, device=pred_logits.device
+        )  # every query starts as the null class; matched ones are overwritten below
+        target_classes[idx] = target_classes_o
+        # Transpose pred_logits (batch_size, num_queries, num_labels + 1) -> (batch_size, num_labels + 1, num_queries)
+        pred_logits_transposed = pred_logits.transpose(1, 2)
+        loss_ce = criterion(pred_logits_transposed, target_classes)
+        losses = {"loss_cross_entropy": loss_ce}
+        return losses
+
+    def loss_masks(
+        self,
+        masks_queries_logits: torch.Tensor,
+        mask_labels: list[torch.Tensor],
+        indices: tuple[np.array],
+        num_masks: int,
+    ) -> dict[str, torch.Tensor]:
+        """Compute the losses related to the masks using sigmoid_cross_entropy_loss and dice loss.
+
+        Args:
+            masks_queries_logits (`torch.Tensor`):
+                A tensor of shape `(batch_size, num_queries, height, width)`.
+            mask_labels (`torch.Tensor`):
+                List of mask labels of shape `(labels, height, width)`.
+            indices (`tuple[np.array])`:
+                The indices computed by the Hungarian matcher.
+            num_masks (`int)`:
+                The number of masks, used for normalization.
+
+        Returns:
+            losses (`dict[str, Tensor]`): A dict of `torch.Tensor` containing two keys:
+            - **loss_mask** -- The loss computed using sigmoid cross entropy loss on the predicted and ground truth
+              masks.
+            - **loss_dice** -- The loss computed using dice loss on the predicted and ground truth
+              masks.
+        """
+        src_idx = self._get_predictions_permutation_indices(indices)
+        tgt_idx = self._get_targets_permutation_indices(indices)
+        # shape (num_matched, height, width): only the matched predictions are kept
+        pred_masks = masks_queries_logits[src_idx]
+        # pad all target masks to the biggest size in the batch and stack them,
+        # then keep only the matched targets, in the same order as `pred_masks`
+        target_masks, _ = self._pad_images_to_max_in_batch(mask_labels)
+        target_masks = target_masks[tgt_idx]
+
+        # No need to upsample predictions as we are using normalized coordinates
+        pred_masks = pred_masks[:, None]
+        target_masks = target_masks[:, None]
+
+        # Sample point coordinates (no gradient flows through the choice of points or the sampled labels)
+        with torch.no_grad():
+            point_coordinates = self.sample_points_using_uncertainty(
+                pred_masks,
+                lambda logits: self.calculate_uncertainty(logits),
+                self.num_points,
+                self.oversample_ratio,
+                self.importance_sample_ratio,
+            )
+
+            point_labels = sample_point(target_masks, point_coordinates, align_corners=False).squeeze(1)
+
+        point_logits = sample_point(pred_masks, point_coordinates, align_corners=False).squeeze(1)
+
+        losses = {
+            "loss_mask": sigmoid_cross_entropy_loss(point_logits, point_labels, num_masks),
+            "loss_dice": dice_loss(point_logits, point_labels, num_masks),
+        }
+
+        del pred_masks
+        del target_masks
+        return losses
+
+    def _get_predictions_permutation_indices(self, indices):
+        # Permute predictions following indices: returns (batch index, prediction index) of every match
+        batch_indices = torch.cat([torch.full_like(src, i) for i, (src, _) in enumerate(indices)])
+        predictions_indices = torch.cat([src for (src, _) in indices])
+        return batch_indices, predictions_indices
+
+    def _get_targets_permutation_indices(self, indices):
+        # Permute labels following indices: returns (batch index, target index) of every match
+        batch_indices = torch.cat([torch.full_like(tgt, i) for i, (_, tgt) in enumerate(indices)])
+        target_indices = torch.cat([tgt for (_, tgt) in indices])
+        return batch_indices, target_indices
+
+    def calculate_uncertainty(self, logits: torch.Tensor) -> torch.Tensor:
+        """
+        Uncertainty is estimated as the negated L1 distance between 0.0 and the logit prediction in `logits`, so
+        logits closest to 0 get the highest score.
+
+        Args:
+            logits (`torch.Tensor`):
+                A tensor of shape (R, 1, ...), where R is the total number of predicted masks in all images.
+                The values are logits.
+
+        Returns:
+            scores (`torch.Tensor`): A tensor of shape (R, 1, ...) that contains uncertainty scores with the most
+            uncertain locations having the highest uncertainty score.
+        """
+        uncertainty_scores = -(torch.abs(logits))
+        return uncertainty_scores
+
+    def sample_points_using_uncertainty(
+        self,
+        logits: torch.Tensor,
+        uncertainty_function,
+        num_points: int,
+        oversample_ratio: int,
+        importance_sample_ratio: float,
+    ) -> torch.Tensor:
+        """
+        This function is meant for sampling points in [0, 1] * [0, 1] coordinate space based on their uncertainty. The
+        uncertainty is calculated for each point using the passed `uncertainty function` that takes points logit
+        prediction as input.
+
+        Args:
+            logits (`torch.Tensor`):
+                Logit predictions the points are sampled from; dim 0 indexes the masks.
+            uncertainty_function:
+                A function that takes logit predictions for P points and returns their uncertainties.
+            num_points (`int`):
+                The number of points P to sample.
+            oversample_ratio (`int`):
+                Oversampling parameter: `int(num_points * oversample_ratio)` random candidates are drawn first.
+            importance_sample_ratio (`float`):
+                Ratio of points that are sampled via importance sampling.
+
+        Returns:
+            point_coordinates (`torch.Tensor`):
+                Coordinates for P sampled points.
+        """
+
+        num_boxes = logits.shape[0]
+        num_points_sampled = int(num_points * oversample_ratio)
+
+        # Get random point coordinates
+        point_coordinates = torch.rand(num_boxes, num_points_sampled, 2, device=logits.device)
+        # Get sampled prediction value for the point coordinates
+        point_logits = sample_point(logits, point_coordinates, align_corners=False)
+        # Calculate the uncertainties based on the sampled prediction values of the points
+        point_uncertainties = uncertainty_function(point_logits)
+
+        num_uncertain_points = int(importance_sample_ratio * num_points)
+        num_random_points = num_points - num_uncertain_points
+
+        idx = torch.topk(point_uncertainties[:, 0, :], k=num_uncertain_points, dim=1)[1]  # most uncertain candidates per mask
+        shift = num_points_sampled * torch.arange(num_boxes, dtype=torch.long, device=logits.device)
+        idx += shift[:, None]  # per-mask indices -> indices into the flattened candidate list
+        point_coordinates = point_coordinates.view(-1, 2)[idx.view(-1), :].view(num_boxes, num_uncertain_points, 2)
+
+        if num_random_points > 0:  # fill the remaining points with fresh uniformly random coordinates
+            point_coordinates = torch.cat(
+                [point_coordinates, torch.rand(num_boxes, num_random_points, 2, device=logits.device)],
+                dim=1,
+            )
+        return point_coordinates
+
+    def forward(
+        self,
+        masks_queries_logits: torch.Tensor,
+        class_queries_logits: torch.Tensor,
+        mask_labels: list[torch.Tensor],
+        class_labels: list[torch.Tensor],
+        auxiliary_predictions: Optional[dict[str, torch.Tensor]] = None,
+    ) -> dict[str, torch.Tensor]:
+        """
+        This performs the loss computation.
+
+        Args:
+            masks_queries_logits (`torch.Tensor`):
+                A tensor of shape `(batch_size, num_queries, height, width)`.
+            class_queries_logits (`torch.Tensor`):
+                A tensor of shape `(batch_size, num_queries, num_labels + 1)`.
+            mask_labels (`torch.Tensor`):
+                List of mask labels of shape `(labels, height, width)`.
+            class_labels (`list[torch.Tensor]`):
+                List of class labels of shape `(labels)`.
+            auxiliary_predictions (`dict[str, torch.Tensor]`, *optional*):
+                if `use_auxiliary_loss` was set to `true` in [`EomtConfig`], then it contains the logits from
+                the inner layers. Each item must provide `"masks_queries_logits"` and `"class_queries_logits"`.
+
+        Returns:
+            losses (`dict[str, Tensor]`): A dict of `torch.Tensor` containing three keys:
+            - **loss_cross_entropy** -- The loss computed using cross entropy on the predicted and ground truth labels.
+            - **loss_mask** -- The loss computed using sigmoid cross_entropy loss on the predicted and ground truth
+              masks.
+            - **loss_dice** -- The loss computed using dice loss on the predicted and ground truth
+              masks.
+            if `use_auxiliary_loss` was set to `true` in [`EomtConfig`], the dictionary contains additional
+            losses for each auxiliary predictions, with the keys suffixed by `_{index}`.
+        """
+
+        # retrieve the matching between the outputs of the last layer and the labels
+        indices = self.matcher(masks_queries_logits, class_queries_logits, mask_labels, class_labels)
+        # compute the average number of target masks for normalization purposes
+        num_masks = self.get_num_masks(class_labels, device=class_labels[0].device)
+        # get all the losses
+        losses: dict[str, Tensor] = {
+            **self.loss_masks(masks_queries_logits, mask_labels, indices, num_masks),
+            **self.loss_labels(class_queries_logits, class_labels, indices),
+        }
+        # in case of auxiliary losses, we repeat this process with the output of each intermediate layer.
+        if auxiliary_predictions is not None:
+            for idx, aux_outputs in enumerate(auxiliary_predictions):  # NOTE(review): used as a sequence of dicts, despite the `dict` annotation
+                masks_queries_logits = aux_outputs["masks_queries_logits"]
+                class_queries_logits = aux_outputs["class_queries_logits"]
+                loss_dict = self.forward(masks_queries_logits, class_queries_logits, mask_labels, class_labels)
+                loss_dict = {f"{key}_{idx}": value for key, value in loss_dict.items()}
+                losses.update(loss_dict)
+
+        return losses
+
+    def get_num_masks(self, class_labels: torch.Tensor, device: torch.device) -> torch.Tensor:
+        """
+        Counts the target masks of the batch (reduced over processes and divided by their number when an accelerate state exists), clamped to a minimum of 1.
+        """
+        num_masks = sum([len(classes) for classes in class_labels])
+        num_masks = torch.as_tensor(num_masks, dtype=torch.float, device=device)
+        world_size = 1
+        if is_accelerate_available():
+            if PartialState._shared_state != {}:  # only when an accelerate state has already been initialised
+                num_masks = reduce(num_masks)  # NOTE(review): uses `reduce`'s default reduction; if that is a mean, the division below normalizes twice
+                world_size = PartialState().num_processes
+
+        num_masks = torch.clamp(num_masks / world_size, min=1)  # min=1 avoids dividing the losses by 0
+        return num_masks
+
+
+class EomtPatchEmbeddings(nn.Module):
+    """
+    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
+    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
+    Transformer.
+    """
+
+    def __init__(self, config):
+        super().__init__()
+        image_size, patch_size = config.image_size, config.patch_size
+        num_channels, hidden_size = config.num_channels, config.hidden_size
+
+        image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)  # scalar -> square pair
+        patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)  # scalar -> square pair
+        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])  # whole patches only (floor division)
+        self.image_size = image_size
+        self.patch_size = patch_size
+        self.num_channels = num_channels
+        self.num_patches = num_patches
+
+        self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)  # kernel == stride: patches do not overlap
+
+    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
+        num_channels = pixel_values.shape[1]
+        if num_channels != self.num_channels:  # only the channel count is validated here, not the image size
+            raise ValueError(
+                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
+                f" Expected {self.num_channels} but got {num_channels}."
+            )
+        embeddings = self.projection(pixel_values).flatten(2).transpose(1, 2)  # (batch, hidden, h, w) -> (batch, h * w, hidden)
+        return embeddings
+
+
+class EomtEmbeddings(nn.Module):
+    """
+    Build the input sequence: CLS token, register tokens, then patch embeddings with position embeddings added.
+    """
+
+    def __init__(self, config: EomtConfig) -> None:
+        super().__init__()
+        self.config = config
+        self.patch_size = config.patch_size
+        hidden_size, num_registers = config.hidden_size, config.num_register_tokens
+
+        self.cls_token = nn.Parameter(torch.randn(1, 1, hidden_size))
+        self.register_tokens = nn.Parameter(torch.zeros(1, num_registers, hidden_size))
+        self.num_prefix_tokens = 1 + num_registers  # 1 for [CLS]
+
+        self.patch_embeddings = EomtPatchEmbeddings(config)
+        num_patches = self.patch_embeddings.num_patches
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.position_embeddings = nn.Embedding(num_patches, hidden_size)
+        self.register_buffer("position_ids", torch.arange(num_patches).expand((1, -1)), persistent=False)
+
+    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
+        batch_size, _, _, _ = pixel_values.shape
+        # Cast the input to the dtype of the projection weights before patchifying.
+        weight_dtype = self.patch_embeddings.projection.weight.dtype
+        patch_tokens = self.patch_embeddings(pixel_values.to(dtype=weight_dtype))
+        # Position embeddings are added to the patch tokens only, not to the CLS/register tokens.
+        patch_tokens = patch_tokens + self.position_embeddings(self.position_ids)
+
+        prefix_tokens = [
+            self.cls_token.expand(batch_size, -1, -1),
+            self.register_tokens.expand(batch_size, -1, -1),
+        ]
+        sequence = torch.cat([*prefix_tokens, patch_tokens], dim=1)
+        return self.dropout(sequence)
+
+
+def eager_attention_forward(
+    module: nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: Optional[torch.Tensor],
+    scaling: float,
+    dropout: float = 0.0,
+    **kwargs,
+):
+    """Scaled dot-product attention; returns `(attn_output, attn_weights)`, output dims 1 and 2 swapped."""
+    scores = torch.matmul(query, key.transpose(-1, -2)) * scaling
+    if attention_mask is not None:
+        scores = scores + attention_mask  # the mask is additive
+
+    # Softmax is computed in float32 and the result cast back to the query dtype.
+    probs = nn.functional.softmax(scores, dim=-1, dtype=torch.float32).to(query.dtype)
+    probs = nn.functional.dropout(probs, p=dropout, training=module.training)
+
+    context = torch.matmul(probs, value).transpose(1, 2).contiguous()
+    return context, probs
+
+
+class EomtAttention(nn.Module):
+    """Multi-head self-attention with separate q/k/v/out projections and a configurable attention kernel."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.embed_dim = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim, remainder = divmod(self.embed_dim, self.num_heads)
+        if remainder != 0:
+            raise ValueError(
+                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
+                f" {self.num_heads})."
+            )
+        self.scale = self.head_dim**-0.5
+        self.dropout = config.attention_dropout
+        self.is_causal = False
+
+        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
+        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
+        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
+        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        **kwargs,
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        """Self-attention over `hidden_states` (Batch x Time x Channel); returns `(output, attention weights)`."""
+        batch_size, seq_length, embed_dim = hidden_states.shape
+        per_head_shape = (batch_size, seq_length, self.num_heads, self.head_dim)
+
+        # Project first, then split the channels into heads and move the head dim ahead of the sequence dim.
+        queries = self.q_proj(hidden_states).view(per_head_shape).transpose(1, 2)
+        keys = self.k_proj(hidden_states).view(per_head_shape).transpose(1, 2)
+        values = self.v_proj(hidden_states).view(per_head_shape).transpose(1, 2)
+
+        # "eager" uses the function defined in this file; anything else is looked up in the shared registry.
+        if self.config._attn_implementation == "eager":
+            attention_interface: Callable = eager_attention_forward
+        else:
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+        attn_output, attn_weights = attention_interface(
+            self,
+            queries,
+            keys,
+            values,
+            attention_mask,
+            is_causal=self.is_causal,
+            scaling=self.scale,
+            dropout=self.dropout if self.training else 0.0,
+        )
+
+        # Merge the heads back into a single channel dim before the output projection.
+        attn_output = attn_output.reshape(batch_size, seq_length, embed_dim).contiguous()
+        attn_output = self.out_proj(attn_output)
+
+        return attn_output, attn_weights
+
+
+class EomtLayerScale(nn.Module):
+    def __init__(self, config) -> None:
+        super().__init__()  # below: one learnable scale per hidden channel, starting at `config.layerscale_value`
+        self.lambda1 = nn.Parameter(torch.ones(config.hidden_size) * config.layerscale_value)
+
+    def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
+        return self.lambda1 * hidden_state  # broadcasts over the last dimension
+
+
+def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
+    """
+    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
+
+    During training each sample along dim 0 is zeroed with probability `drop_prob` and the survivors are divided
+    by `1 - drop_prob`. `input` is returned as-is when `training` is False or `drop_prob` is 0 or `None`; an
+    all-zero tensor is returned when `drop_prob >= 1`. On the naming ('drop path' rather than 'DropConnect'), see
+    https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956.
+    """
+    if not drop_prob or not training:
+        return input
+    keep_prob = 1 - drop_prob
+    if keep_prob <= 0:
+        # Everything is dropped; dividing by `keep_prob` below would yield NaN/inf instead of zeros.
+        return torch.zeros_like(input)
+    shape = (input.shape[0],) + (1,) * (input.ndim - 1)  # work with diff dim tensors, not just 2D ConvNets
+    keep_mask = (keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)).floor_()  # binarize
+    return input.div(keep_prob) * keep_mask
+
+
+class EomtDropPath(nn.Module):
+    """Module wrapper around `drop_path`: per-sample stochastic depth that is only active in training mode."""
+
+    def __init__(self, drop_prob: Optional[float] = None) -> None:
+        super().__init__()
+        self.drop_prob = drop_prob
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        return drop_path(hidden_states, drop_prob=self.drop_prob, training=self.training)
+
+    def extra_repr(self) -> str:
+        return f"p={self.drop_prob}"
+
+
+class EomtMLP(nn.Module):
+    def __init__(self, config) -> None:
+        super().__init__()
+        hidden_size = config.hidden_size
+        intermediate_size = int(hidden_size * config.mlp_ratio)
+        self.fc1 = nn.Linear(hidden_size, intermediate_size, bias=True)
+        # `hidden_act` is either an activation name (resolved through ACT2FN) or already a callable.
+        act = config.hidden_act
+        self.activation = ACT2FN[act] if isinstance(act, str) else act
+        self.fc2 = nn.Linear(intermediate_size, hidden_size, bias=True)
+
+    def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
+        # Expand to the intermediate width, apply the non-linearity, project back to the hidden size.
+        expanded = self.fc1(hidden_state)
+        activated = self.activation(expanded)
+        projected = self.fc2(activated)
+        return projected
+
+
+class EomtSwiGLUFFN(nn.Module):
+    def __init__(self, config) -> None:
+        super().__init__()
+        hidden_size = config.hidden_size
+        # Take 2/3 of the nominal MLP width and round it up to a multiple of 8.
+        gated_width = (int(int(hidden_size * config.mlp_ratio) * 2 / 3) + 7) // 8 * 8
+
+        self.weights_in = nn.Linear(hidden_size, 2 * gated_width, bias=True)
+        self.weights_out = nn.Linear(gated_width, hidden_size, bias=True)
+
+    def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
+        # A single projection yields both halves: the SiLU-activated gate and the values it multiplies.
+        gate, value = self.weights_in(hidden_state).chunk(2, dim=-1)
+        gated = nn.functional.silu(gate) * value
+        return self.weights_out(gated)
+
+
+class EomtLayer(GradientCheckpointingLayer):
+    """
+    Pre-norm Transformer block: LayerNorm -> attention -> LayerScale and LayerNorm -> MLP -> LayerScale, each
+    wrapped in a residual connection. This corresponds to the Block class in the original implementation.
+    """
+
+    def __init__(self, config: EomtConfig) -> None:
+        super().__init__()
+
+        self.norm1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.attention = EomtAttention(config)
+        self.layer_scale1 = EomtLayerScale(config)
+        # One drop-path module is shared by both residual branches; it is an identity when the rate is not > 0.
+        self.drop_path = EomtDropPath(config.drop_path_rate) if config.drop_path_rate > 0.0 else nn.Identity()
+
+        self.norm2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        mlp_cls = EomtSwiGLUFFN if config.use_swiglu_ffn else EomtMLP
+        self.mlp = mlp_cls(config)
+        self.layer_scale2 = EomtLayerScale(config)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        head_mask: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        # `head_mask` is handed to the attention module as its `attention_mask` argument.
+        normed_states = self.norm1(hidden_states)
+        attn_branch, _ = self.attention(normed_states, head_mask)
+        attn_branch = self.layer_scale1(attn_branch)
+
+        # first residual connection
+        hidden_states = self.drop_path(attn_branch) + hidden_states
+
+        # in Eomt, layernorm is also applied after self-attention
+        mlp_branch = self.norm2(hidden_states)
+        mlp_branch = self.mlp(mlp_branch)
+        mlp_branch = self.layer_scale2(mlp_branch)
+
+        # second residual connection
+        return self.drop_path(mlp_branch) + hidden_states
+
+
+class EomtLayerNorm2d(nn.LayerNorm):
+    def __init__(self, num_channels, eps=1e-6, affine=True):
+        super().__init__(num_channels, eps=eps, elementwise_affine=affine)
+
+    def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
+        # Normalise over dim 1: move it last, apply layer norm, then restore the original layout.
+        channels_last = hidden_state.permute(0, 2, 3, 1)
+        normed = F.layer_norm(channels_last, self.normalized_shape, self.weight, self.bias, self.eps)
+        return normed.permute(0, 3, 1, 2)
+
+
+class EomtScaleLayer(nn.Module):
+    """
+    Upscaling stage: stride-2 transposed conv -> activation -> 3x3 depthwise conv -> channel-wise LayerNorm.
+    """
+
+    def __init__(self, config: EomtConfig):
+        super().__init__()
+        hidden_size = config.hidden_size
+        self.conv1 = nn.ConvTranspose2d(hidden_size, hidden_size, kernel_size=2, stride=2)
+        # `hidden_act` may be an activation name or a callable, resolved the same way `EomtMLP` resolves it.
+        act = config.hidden_act
+        self.activation = ACT2FN[act] if isinstance(act, str) else act
+        self.conv2 = nn.Conv2d(hidden_size, hidden_size, kernel_size=3, padding=1, groups=hidden_size, bias=False)
+
+        self.layernorm2d = EomtLayerNorm2d(hidden_size)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        """Run the four stages in order on a `(batch, channels, height, width)` feature map."""
+        hidden_states = self.conv1(hidden_states)
+        hidden_states = self.activation(hidden_states)
+        hidden_states = self.conv2(hidden_states)
+        hidden_states = self.layernorm2d(hidden_states)
+        return hidden_states
+
+
+class EomtScaleBlock(nn.Module):
+    def __init__(self, config: EomtConfig):
+        super().__init__()
+        self.num_blocks = config.num_upscale_blocks  # how many `EomtScaleLayer`s are stacked
+        self.block = nn.ModuleList([EomtScaleLayer(config) for _ in range(self.num_blocks)])
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        for scale_layer in self.block:  # apply the stacked layers in order
+            hidden_states = scale_layer(hidden_states)
+        return hidden_states
+
+
+class EomtMaskHead(nn.Module):
+    """Three-layer MLP that keeps the hidden size; the activation follows the first two layers only."""
+
+    def __init__(self, config: EomtConfig):
+        super().__init__()
+        hidden_size = config.hidden_size
+        self.fc1 = nn.Linear(hidden_size, hidden_size)
+        self.fc2 = nn.Linear(hidden_size, hidden_size)
+        self.fc3 = nn.Linear(hidden_size, hidden_size)
+        # Accept an activation name or a callable, as `EomtMLP` does; a callable is not a valid ACT2FN key.
+        self.activation = ACT2FN[config.hidden_act] if isinstance(config.hidden_act, str) else config.hidden_act
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.activation(self.fc2(self.activation(self.fc1(hidden_states))))
+        return self.fc3(hidden_states)
+
+
+@auto_docstring
+class EomtPreTrainedModel(PreTrainedModel):
+    """
+    Base class for the EoMT models: declares the capability flags read by the framework and implements the
+    per-module weight initialization, on top of the loading interface inherited from `PreTrainedModel`.
+    """
+
+    config: EomtConfig
+    base_model_prefix = "eomt"
+    main_input_name = "pixel_values"
+    supports_gradient_checkpointing = False
+    _no_split_modules = ["EomtLayer"]
+    _supports_sdpa = True
+    _supports_flash_attn = True
+    _can_record_outputs = {
+        "hidden_states": EomtLayer,
+        "attentions": EomtAttention,
+    }
+
+    def _init_weights(self, module: nn.Module) -> None:
+        std = self.config.initializer_range  # only used for the CLS token in the last branch
+        if isinstance(module, (nn.Linear, nn.Conv2d, nn.ConvTranspose2d)):
+            nn.init.kaiming_uniform_(module.weight, a=math.sqrt(5))
+            if module.bias is not None:
+                fan_in, _ = nn.init._calculate_fan_in_and_fan_out(module.weight)
+                bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0  # avoid dividing by zero when fan_in is 0
+                nn.init.uniform_(module.bias, -bound, bound)
+        elif isinstance(module, nn.LayerNorm):  # also matches the `EomtLayerNorm2d` subclass
+            module.weight.data.fill_(1.0)
+            module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=1)  # unit std, independent of `initializer_range`
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+        elif isinstance(module, EomtLayerScale):
+            if hasattr(module, "lambda1"):
+                module.lambda1.data.fill_(self.config.layerscale_value)
+        elif isinstance(module, EomtEmbeddings):
+            module.cls_token.data = nn.init.trunc_normal_(  # drawn in float32, then cast back to the param dtype
+                module.cls_token.data.to(torch.float32), mean=0.0, std=std
+            ).to(module.cls_token.dtype)
+            module.register_tokens.data.zero_()
+
+
+@auto_docstring(
+    custom_intro="""
+    The EoMT Model with head on top for instance/semantic/panoptic segmentation.
+    """
+)
+class EomtForUniversalSegmentation(EomtPreTrainedModel):
+    main_input_name = "pixel_values"
+
+    def __init__(self, config: EomtConfig):
+        super().__init__(config)
+        self.config = config
+        self.num_hidden_layers = config.num_hidden_layers
+        self.embeddings = EomtEmbeddings(config)
+        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+
+        self.query = nn.Embedding(config.num_queries, config.hidden_size)
+        self.layers = nn.ModuleList([EomtLayer(config) for _ in range(config.num_hidden_layers)])
+
+        self.upscale_block = EomtScaleBlock(config)
+        self.mask_head = EomtMaskHead(config)
+
+        self.class_predictor = nn.Linear(config.hidden_size, config.num_labels + 1)
+
+        patch_embeddings = self.embeddings.patch_embeddings  # its sizes are pairs even for int config values
+        self.grid_size = tuple(s // p for s, p in zip(patch_embeddings.image_size, patch_embeddings.patch_size))
+        self.weight_dict: dict[str, float] = {
+            "loss_cross_entropy": config.class_weight,
+            "loss_mask": config.mask_weight,
+            "loss_dice": config.dice_weight,
+        }
+        self.criterion = EomtLoss(config=config, weight_dict=self.weight_dict)
+
+        self.register_buffer("attn_mask_probs", torch.ones(config.num_blocks))
+
+        self.post_init()
+
+    def get_loss_dict(
+        self,
+        masks_queries_logits: Tensor,
+        class_queries_logits: Tensor,
+        mask_labels: Tensor,
+        class_labels: Tensor,
+        auxiliary_predictions: Optional[dict[str, Tensor]],
+    ) -> dict[str, Tensor]:
+        """Run the criterion and scale every returned loss by its matching entry of `self.weight_dict`."""
+        loss_dict: dict[str, Tensor] = self.criterion(
+            masks_queries_logits=masks_queries_logits,
+            class_queries_logits=class_queries_logits,
+            mask_labels=mask_labels,
+            class_labels=class_labels,
+            auxiliary_predictions=auxiliary_predictions,
+        )
+
+        # weight each loss by `self.weight_dict[<LOSS_NAME>]` including auxiliary losses
+        for key, weight in self.weight_dict.items():
+            for loss_key, loss in loss_dict.items():
+                if key in loss_key:  # substring match, so suffixed auxiliary keys are weighted too
+                    loss *= weight  # in place: the tensor stored in `loss_dict` is updated
+        return loss_dict
+
+    def get_loss(self, loss_dict: dict[str, Tensor]) -> Tensor:
+        return sum(loss_dict.values())  # total loss is the plain sum of the entries
+
+    @check_model_inputs
+    @auto_docstring
+    def forward(
+        self,
+        pixel_values: Tensor,
+        mask_labels: Optional[list[Tensor]] = None,
+        class_labels: Optional[list[Tensor]] = None,
+        patch_offsets: Optional[list[Tensor]] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> EomtForUniversalSegmentationOutput:
+        r"""
+        mask_labels (`list[torch.Tensor]`, *optional*):
+            list of mask labels of shape `(num_labels, height, width)` to be fed to a model
+        class_labels (`list[torch.LongTensor]`, *optional*):
+            list of target class labels of shape `(num_labels,)` to be fed to a model. They identify the
+            labels of `mask_labels`, e.g. the label of `mask_labels[i][j]` is `class_labels[i][j]`.
+        patch_offsets (`list[torch.Tensor]`, *optional*):
+            list of tuples indicating the image index and start and end positions of patches for semantic segmentation.
+        """
+
+        masks_queries_logits_per_layer, class_queries_logits_per_layer = (), ()
+        attention_mask = None
+
+        if pixel_values is None:
+            raise ValueError("You have to specify pixel_values")
+
+        hidden_states = self.embeddings(pixel_values)
+
+        for idx, layer_module in enumerate(self.layers):
+            if idx == self.num_hidden_layers - self.config.num_blocks:  # queries join for the last `num_blocks`
+                query = self.query.weight[None, :, :].expand(hidden_states.shape[0], -1, -1).to(hidden_states.device)
+                hidden_states = torch.cat((query, hidden_states), dim=1)
+
+            if idx >= self.num_hidden_layers - self.config.num_blocks and (
+                self.training or self.attn_mask_probs[idx - self.num_hidden_layers + self.config.num_blocks] > 0
+            ):
+                norm_hidden_states = self.layernorm(hidden_states)
+                masks_queries_logits, class_queries_logits = self.predict(norm_hidden_states)
+
+                masks_queries_logits_per_layer += (masks_queries_logits,)
+                class_queries_logits_per_layer += (class_queries_logits,)
+
+                attention_mask = torch.ones(
+                    hidden_states.shape[0],
+                    hidden_states.shape[1],
+                    hidden_states.shape[1],
+                    device=hidden_states.device,
+                    dtype=torch.bool,
+                )
+
+                interpolated_logits = F.interpolate(masks_queries_logits, size=self.grid_size, mode="bilinear")
+                interpolated_logits = interpolated_logits.view(
+                    interpolated_logits.size(0), interpolated_logits.size(1), -1
+                )
+
+                num_query_tokens = self.config.num_queries
+                encoder_start_tokens = num_query_tokens + self.embeddings.num_prefix_tokens
+
+                # Set attention mask for queries to focus on encoder tokens based on interpolated logits
+                attention_mask[:, :num_query_tokens, encoder_start_tokens:] = interpolated_logits > 0
+
+                # Disable attention mask for random query tokens.
+                attention_mask = self._disable_attention_mask(
+                    attention_mask,
+                    prob=self.attn_mask_probs[idx - self.num_hidden_layers + self.config.num_blocks],
+                    num_query_tokens=num_query_tokens,
+                    encoder_start_tokens=encoder_start_tokens,
+                    device=attention_mask.device,
+                )
+
+                # Expand attention mask to 4d mask.
+                attention_mask = attention_mask[:, None, ...].expand(-1, self.config.num_attention_heads, -1, -1)
+                attention_mask = attention_mask.float().masked_fill(~attention_mask, -1e9)
+
+            hidden_states = layer_module(hidden_states, attention_mask)
+
+        sequence_output = self.layernorm(hidden_states)
+
+        masks_queries_logits, class_queries_logits = self.predict(sequence_output)
+        masks_queries_logits_per_layer += (masks_queries_logits,)
+        class_queries_logits_per_layer += (class_queries_logits,)
+
+        loss = None
+        if mask_labels is not None and class_labels is not None:
+            loss = 0.0
+            for masks_queries_logits, class_queries_logits in zip(
+                masks_queries_logits_per_layer, class_queries_logits_per_layer
+            ):
+                loss_dict = self.get_loss_dict(
+                    masks_queries_logits=masks_queries_logits,
+                    class_queries_logits=class_queries_logits,
+                    mask_labels=mask_labels,
+                    class_labels=class_labels,
+                    auxiliary_predictions=None,
+                )
+                loss += self.get_loss(loss_dict)
+
+        return EomtForUniversalSegmentationOutput(
+            loss=loss,
+            masks_queries_logits=masks_queries_logits,
+            class_queries_logits=class_queries_logits,
+            last_hidden_state=sequence_output,
+            patch_offsets=patch_offsets,
+        )
+
+    def get_input_embeddings(self):
+        return self.embeddings.patch_embeddings  # the patch projection module acts as the input embedding
+
+    def predict(self, logits: torch.Tensor):
+        """Compute `(mask_logits, class_logits)` from a normalized `[queries, prefix tokens, patches]` sequence."""
+        query_tokens = logits[:, : self.config.num_queries, :]
+        class_logits = self.class_predictor(query_tokens)
+
+        # Despite its name, `prefix_tokens` holds the tokens that come after the queries and the prefix tokens.
+        prefix_tokens = logits[:, self.config.num_queries + self.embeddings.num_prefix_tokens :, :]
+        prefix_tokens = prefix_tokens.transpose(1, 2)
+        prefix_tokens = prefix_tokens.reshape(prefix_tokens.shape[0], -1, *self.grid_size)
+
+        query_tokens = self.mask_head(query_tokens)
+        prefix_tokens = self.upscale_block(prefix_tokens)
+
+        mask_logits = torch.einsum("bqc, bchw -> bqhw", query_tokens, prefix_tokens)
+        return mask_logits, class_logits
+
+    @staticmethod
+    def _disable_attention_mask(attn_mask, prob, num_query_tokens, encoder_start_tokens, device):
+        """With probability `1 - prob` per (sample, query), unmask that query's row; edits `attn_mask` in place."""
+        if prob < 1:
+            # Generate random queries to disable based on the probs
+            random_queries = torch.rand(attn_mask.shape[0], num_query_tokens, device=device) > prob
+            # Disable attention to the query tokens, considering the prefix tokens
+            attn_mask[:, :num_query_tokens, encoder_start_tokens:][random_queries] = 1
+
+        return attn_mask
+
+
+__all__ = ["EomtPreTrainedModel", "EomtForUniversalSegmentation"]  # names exported by `from ... import *`
diff --git a/phivenv/Lib/site-packages/transformers/models/eomt/modular_eomt.py b/phivenv/Lib/site-packages/transformers/models/eomt/modular_eomt.py
new file mode 100644
index 0000000000000000000000000000000000000000..7d9a6100a08277cd721ee2aecec17ee79d4a6445
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/eomt/modular_eomt.py
@@ -0,0 +1,602 @@
+# coding=utf-8
+# Copyright 2025 Mobile Perception Systems Lab at TU/e and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch EoMT model."""
+
+import math
+from dataclasses import dataclass
+from typing import Optional
+
+import torch
+import torch.nn.functional as F
+from torch import Tensor, nn
+
+from ...activations import ACT2FN
+from ...file_utils import (
+    ModelOutput,
+)
+from ...modeling_utils import PreTrainedModel
+from ...processing_utils import Unpack
+from ...utils import (
+    TransformersKwargs,
+    auto_docstring,
+    logging,
+)
+from ...utils.generic import check_model_inputs
+from ..dinov2.modeling_dinov2 import (
+    Dinov2Embeddings,
+    Dinov2Layer,
+    Dinov2LayerScale,
+    Dinov2PatchEmbeddings,
+)
+from ..mask2former.modeling_mask2former import Mask2FormerForUniversalSegmentation, Mask2FormerLoss
+from ..siglip.modeling_siglip import SiglipAttention
+from ..vit.configuration_vit import ViTConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+class EomtConfig(ViTConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`EomtForUniversalSegmentation`]. It is used to instantiate an EoMT model
+    according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+    defaults will yield a similar configuration to that of the EoMT
+    [tue-mps/coco_panoptic_eomt_large_640](https://huggingface.co/tue-mps/coco_panoptic_eomt_large_640)
+    architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        hidden_size (`int`, *optional*, defaults to 1024):
+            Dimensionality of the hidden representations.
+        num_hidden_layers (`int`, *optional*, defaults to 24):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 16):
+            Number of attention heads in each attention layer.
+        mlp_ratio (`int`, *optional*, defaults to 4):
+            Ratio of the MLP hidden dimensionality to the hidden size.
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
+            The dropout probability for all fully connected layers in the embeddings and encoder.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-06):
+            The epsilon used by the layer normalization layers.
+        image_size (`int`, *optional*, defaults to 640):
+            The size (resolution) of each input image.
+        patch_size (`int`, *optional*, defaults to 16):
+            The size (resolution) of each patch.
+        num_channels (`int`, *optional*, defaults to 3):
+            The number of input channels.
+        layerscale_value (`float`, *optional*, defaults to 1.0):
+            Initial value for the LayerScale parameter.
+        drop_path_rate (`float`, *optional*, defaults to 0.0):
+            The stochastic depth rate (drop path) used during training.
+        num_upscale_blocks (`int`, *optional*, defaults to 2):
+            Number of upsampling blocks used in the decoder or segmentation head.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            Dropout probability applied to the attention weights during training.
+        use_swiglu_ffn (`bool`, *optional*, defaults to `False`):
+            Whether to use the SwiGLU feedforward neural network.
+        num_blocks (`int`, *optional*, defaults to 4):
+            Number of final Transformer layers that process the query tokens together with the image tokens.
+        no_object_weight (`float`, *optional*, defaults to 0.1):
+            Loss weight for the 'no object' class in panoptic/instance segmentation.
+        class_weight (`float`, *optional*, defaults to 2.0):
+            Loss weight for classification targets.
+        mask_weight (`float`, *optional*, defaults to 5.0):
+            Loss weight for mask prediction.
+        dice_weight (`float`, *optional*, defaults to 5.0):
+            Loss weight for the dice loss component.
+        train_num_points (`int`, *optional*, defaults to 12544):
+            Number of points to sample for mask loss computation during training.
+        oversample_ratio (`float`, *optional*, defaults to 3.0):
+            Oversampling ratio used in point sampling for mask training.
+        importance_sample_ratio (`float`, *optional*, defaults to 0.75):
+            Ratio of points to sample based on importance during training.
+        num_queries (`int`, *optional*, defaults to 200):
+            Number of object queries in the Transformer.
+        num_register_tokens (`int`, *optional*, defaults to 4):
+            Number of learnable register tokens added to the transformer input.
+
+    Example:
+
+    ```python
+    >>> from transformers import EomtConfig, EomtForUniversalSegmentation
+
+    >>> # Initialize configuration
+    >>> config = EomtConfig()
+
+    >>> # Initialize model
+    >>> model = EomtForUniversalSegmentation(config)
+
+    >>> # Access config
+    >>> config = model.config
+    ```"""
+
+    model_type = "eomt"
+
+    def __init__(
+        self,
+        hidden_size=1024,
+        num_hidden_layers=24,
+        num_attention_heads=16,
+        mlp_ratio=4,
+        hidden_act="gelu",
+        hidden_dropout_prob=0.0,
+        initializer_range=0.02,
+        layer_norm_eps=1e-6,
+        image_size=640,
+        patch_size=16,
+        num_channels=3,
+        layerscale_value=1.0,
+        drop_path_rate=0.0,
+        num_upscale_blocks=2,
+        attention_dropout=0.0,
+        use_swiglu_ffn=False,
+        num_blocks=4,
+        no_object_weight: float = 0.1,
+        class_weight: float = 2.0,
+        mask_weight: float = 5.0,
+        dice_weight: float = 5.0,
+        train_num_points: int = 12544,
+        oversample_ratio: float = 3.0,
+        importance_sample_ratio: float = 0.75,
+        num_queries=200,
+        num_register_tokens=4,
+        **kwargs,
+    ):
+        super().__init__(
+            hidden_size=hidden_size,
+            num_hidden_layers=num_hidden_layers,
+            num_attention_heads=num_attention_heads,
+            hidden_dropout_prob=hidden_dropout_prob,
+            hidden_act=hidden_act,
+            initializer_range=initializer_range,
+            layer_norm_eps=layer_norm_eps,
+            image_size=image_size,
+            patch_size=patch_size,
+            num_channels=num_channels,
+            **kwargs,
+        )
+        # Remove attributes left on the instance by the parent initializer that this config does not keep.
+        del self.intermediate_size
+        del self.qkv_bias
+        del self.pooler_act
+        del self.pooler_output_size
+        del self.encoder_stride
+        del self.attention_probs_dropout_prob
+
+        self.mlp_ratio = mlp_ratio
+        self.attention_dropout = attention_dropout
+        self.layerscale_value = layerscale_value
+        self.drop_path_rate = drop_path_rate
+        self.num_upscale_blocks = num_upscale_blocks
+        self.use_swiglu_ffn = use_swiglu_ffn
+        self.num_blocks = num_blocks
+        self.no_object_weight = no_object_weight
+        self.class_weight = class_weight
+        self.mask_weight = mask_weight
+        self.dice_weight = dice_weight
+        self.train_num_points = train_num_points
+        self.oversample_ratio = oversample_ratio
+        self.importance_sample_ratio = importance_sample_ratio
+        self.num_queries = num_queries
+        self.num_register_tokens = num_register_tokens
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Class for outputs of [`EomtForUniversalSegmentationOutput`].
+
+    This output can be directly passed to [`~EomtImageProcessor.post_process_semantic_segmentation`] or
+    [`~EomtImageProcessor.post_process_instance_segmentation`] or
+    [`~EomtImageProcessor.post_process_panoptic_segmentation`] to compute final segmentation maps. Please, see
+    [`~EomtImageProcessor] for details regarding usage.
+    """
+)
+class EomtForUniversalSegmentationOutput(ModelOutput):
+    r"""
+    loss (`torch.Tensor`, *optional*):
+        The computed loss, returned when labels are present.
+    class_queries_logits (`torch.FloatTensor`):
+        A tensor of shape `(batch_size, num_queries, num_labels + 1)` representing the proposed classes for each
+        query. Note the `+ 1` is needed because we incorporate the null class.
+    masks_queries_logits (`torch.FloatTensor`):
+        A tensor of shape `(batch_size, num_queries, height, width)` representing the proposed masks for each
+        query.
+    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
+        Last hidden states (final feature map) of the last layer.
+    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
+        shape `(batch_size, sequence_length, hidden_size)`. Hidden-states all layers of the model.
+    attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+        Tuple of `tuple(torch.FloatTensor)` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+        sequence_length)`. Self and Cross Attentions weights from transformer decoder.
+    patch_offsets (`list[torch.Tensor]`, *optional*):
+        list of tuples indicating the image index and start and end positions of patches for semantic segementation.
+    """
+
+    loss: Optional[torch.FloatTensor] = None
+    class_queries_logits: Optional[torch.FloatTensor] = None
+    masks_queries_logits: Optional[torch.FloatTensor] = None
+    last_hidden_state: Optional[torch.FloatTensor] = None
+    hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    attentions: Optional[tuple[torch.FloatTensor]] = None
+    patch_offsets: Optional[list[torch.Tensor]] = None
+
+
+class EomtLoss(Mask2FormerLoss):
+    pass
+
+
+class EomtPatchEmbeddings(Dinov2PatchEmbeddings):
+    pass
+
+
+class EomtEmbeddings(Dinov2Embeddings):
+    def __init__(self, config: EomtConfig) -> None:
+        nn.Module.__init__(self)
+
+        self.config = config
+        self.patch_size = config.patch_size
+
+        self.cls_token = nn.Parameter(torch.randn(1, 1, config.hidden_size))
+        self.register_tokens = nn.Parameter(torch.zeros(1, config.num_register_tokens, config.hidden_size))
+
+        self.patch_embeddings = EomtPatchEmbeddings(config)
+        num_patches = self.patch_embeddings.num_patches
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.num_prefix_tokens = 1 + config.num_register_tokens  # 1 for [CLS]
+        self.position_embeddings = nn.Embedding(num_patches, config.hidden_size)
+        self.register_buffer("position_ids", torch.arange(num_patches).expand((1, -1)), persistent=False)
+
+    def interpolate_pos_encoding(self):
+        raise AttributeError("Not needed for Eomt Model")
+
+    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
+        batch_size, _, _, _ = pixel_values.shape
+        target_dtype = self.patch_embeddings.projection.weight.dtype
+        embeddings = self.patch_embeddings(pixel_values.to(dtype=target_dtype))
+
+        cls_tokens = self.cls_token.expand(batch_size, -1, -1)
+        register_tokens = self.register_tokens.expand(batch_size, -1, -1)
+
+        embeddings = embeddings + self.position_embeddings(self.position_ids)
+        embeddings = torch.cat([cls_tokens, register_tokens, embeddings], dim=1)
+
+        embeddings = self.dropout(embeddings)
+
+        return embeddings
+
+
+class EomtAttention(SiglipAttention):
+    pass
+
+
+class EomtLayerScale(Dinov2LayerScale):
+    pass
+
+
+class EomtLayer(Dinov2Layer):
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        head_mask: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        hidden_states_norm = self.norm1(hidden_states)
+        self_attention_output, _ = self.attention(hidden_states_norm, head_mask)
+        self_attention_output = self.layer_scale1(self_attention_output)
+
+        # first residual connection
+        hidden_states = self.drop_path(self_attention_output) + hidden_states
+
+        # in Eomt, layernorm is also applied after self-attention
+        layer_output = self.norm2(hidden_states)
+        layer_output = self.mlp(layer_output)
+        layer_output = self.layer_scale2(layer_output)
+
+        # second residual connection
+        layer_output = self.drop_path(layer_output) + hidden_states
+
+        return layer_output
+
+
+class EomtLayerNorm2d(nn.LayerNorm):
+    def __init__(self, num_channels, eps=1e-6, affine=True):
+        super().__init__(num_channels, eps=eps, elementwise_affine=affine)
+
+    def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
+        hidden_state = hidden_state.permute(0, 2, 3, 1)
+        hidden_state = F.layer_norm(hidden_state, self.normalized_shape, self.weight, self.bias, self.eps)
+        hidden_state = hidden_state.permute(0, 3, 1, 2)
+        return hidden_state
+
+
+class EomtScaleLayer(nn.Module):
+    def __init__(self, config: EomtConfig):
+        super().__init__()
+        hidden_size = config.hidden_size
+        self.conv1 = nn.ConvTranspose2d(hidden_size, hidden_size, kernel_size=2, stride=2)
+        self.activation = ACT2FN[config.hidden_act]
+        self.conv2 = nn.Conv2d(
+            hidden_size,
+            hidden_size,
+            kernel_size=3,
+            padding=1,
+            groups=hidden_size,
+            bias=False,
+        )
+
+        self.layernorm2d = EomtLayerNorm2d(hidden_size)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.conv1(hidden_states)
+        hidden_states = self.activation(hidden_states)
+        hidden_states = self.conv2(hidden_states)
+        hidden_states = self.layernorm2d(hidden_states)
+        return hidden_states
+
+
+class EomtScaleBlock(nn.Module):
+    def __init__(self, config: EomtConfig):
+        super().__init__()
+        self.num_blocks = config.num_upscale_blocks
+        self.block = nn.ModuleList([EomtScaleLayer(config) for _ in range(self.num_blocks)])
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        for block in self.block:
+            hidden_states = block(hidden_states)
+        return hidden_states
+
+
+class EomtMaskHead(nn.Module):
+    def __init__(self, config: EomtConfig):
+        super().__init__()
+
+        hidden_size = config.hidden_size
+        self.fc1 = nn.Linear(hidden_size, hidden_size)
+        self.fc2 = nn.Linear(hidden_size, hidden_size)
+        self.fc3 = nn.Linear(hidden_size, hidden_size)
+        self.activation = ACT2FN[config.hidden_act]
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.activation(self.fc1(hidden_states))
+        hidden_states = self.activation(self.fc2(hidden_states))
+        hidden_states = self.fc3(hidden_states)
+        return hidden_states
+
+
+@auto_docstring
+class EomtPreTrainedModel(PreTrainedModel):
+    """
+    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+    models.
+    """
+
+    config: EomtConfig
+    base_model_prefix = "eomt"
+    main_input_name = "pixel_values"
+    supports_gradient_checkpointing = False
+    _no_split_modules = ["EomtLayer"]
+    _supports_sdpa = True
+    _supports_flash_attn = True
+    _can_record_outputs = {
+        "hidden_states": EomtLayer,
+        "attentions": EomtAttention,
+    }
+
+    def _init_weights(self, module: nn.Module) -> None:
+        std = self.config.initializer_range
+        if isinstance(module, (nn.Linear, nn.Conv2d, nn.ConvTranspose2d)):
+            nn.init.kaiming_uniform_(module.weight, a=math.sqrt(5))
+            if module.bias is not None:
+                fan_in, _ = nn.init._calculate_fan_in_and_fan_out(module.weight)
+                bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0
+                nn.init.uniform_(module.bias, -bound, bound)
+        elif isinstance(module, nn.LayerNorm):
+            module.weight.data.fill_(1.0)
+            module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=1)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+        elif isinstance(module, EomtLayerScale):
+            if hasattr(module, "lambda1"):
+                module.lambda1.data.fill_(self.config.layerscale_value)
+        elif isinstance(module, EomtEmbeddings):
+            module.cls_token.data = nn.init.trunc_normal_(
+                module.cls_token.data.to(torch.float32), mean=0.0, std=std
+            ).to(module.cls_token.dtype)
+            module.register_tokens.data.zero_()
+
+
+@auto_docstring(
+    custom_intro="""
+    The EoMT Model with head on top for instance/semantic/panoptic segmentation.
+    """
+)
+class EomtForUniversalSegmentation(Mask2FormerForUniversalSegmentation):
+    def __init__(self, config: EomtConfig):
+        PreTrainedModel.__init__(self, config)
+        self.config = config
+        self.num_hidden_layers = config.num_hidden_layers
+        self.embeddings = EomtEmbeddings(config)
+        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+
+        self.query = nn.Embedding(config.num_queries, config.hidden_size)
+        self.layers = nn.ModuleList([EomtLayer(config) for _ in range(config.num_hidden_layers)])
+
+        self.upscale_block = EomtScaleBlock(config)
+        self.mask_head = EomtMaskHead(config)
+
+        self.class_predictor = nn.Linear(config.hidden_size, config.num_labels + 1)
+
+        self.grid_size = (config.image_size // config.patch_size, config.image_size // config.patch_size)
+        self.weight_dict: dict[str, float] = {
+            "loss_cross_entropy": config.class_weight,
+            "loss_mask": config.mask_weight,
+            "loss_dice": config.dice_weight,
+        }
+
+        self.criterion = EomtLoss(config=config, weight_dict=self.weight_dict)
+
+        self.register_buffer("attn_mask_probs", torch.ones(config.num_blocks))
+
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.embeddings.patch_embeddings
+
+    def get_auxiliary_logits(self):
+        raise AttributeError("Note needed for Eomt Model.")
+
+    def predict(self, logits: torch.Tensor):
+        query_tokens = logits[:, : self.config.num_queries, :]
+        class_logits = self.class_predictor(query_tokens)
+
+        prefix_tokens = logits[:, self.config.num_queries + self.embeddings.num_prefix_tokens :, :]
+        prefix_tokens = prefix_tokens.transpose(1, 2)
+
+        prefix_tokens = prefix_tokens.reshape(prefix_tokens.shape[0], -1, *self.grid_size)
+
+        query_tokens = self.mask_head(query_tokens)
+        prefix_tokens = self.upscale_block(prefix_tokens)
+
+        mask_logits = torch.einsum("bqc, bchw -> bqhw", query_tokens, prefix_tokens)
+
+        return mask_logits, class_logits
+
+    @staticmethod
+    def _disable_attention_mask(attn_mask, prob, num_query_tokens, encoder_start_tokens, device):
+        if prob < 1:
+            # Generate random queries to disable based on the probs
+            random_queries = torch.rand(attn_mask.shape[0], num_query_tokens, device=device) > prob
+
+            # Disable attention to the query tokens, considering the prefix tokens
+            attn_mask[:, :num_query_tokens, encoder_start_tokens:][random_queries] = 1
+
+        return attn_mask
+
+    @check_model_inputs
+    @auto_docstring
+    def forward(
+        self,
+        pixel_values: Tensor,
+        mask_labels: Optional[list[Tensor]] = None,
+        class_labels: Optional[list[Tensor]] = None,
+        patch_offsets: Optional[list[Tensor]] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> EomtForUniversalSegmentationOutput:
+        r"""
+        mask_labels (`list[torch.Tensor]`, *optional*):
+            list of mask labels of shape `(num_labels, height, width)` to be fed to a model
+        class_labels (`list[torch.LongTensor]`, *optional*):
+            list of target class labels of shape `(num_labels, height, width)` to be fed to a model. They identify the
+            labels of `mask_labels`, e.g. the label of `mask_labels[i][j]` if `class_labels[i][j]`.
+        patch_offsets (`list[torch.Tensor]`, *optional*):
+            list of tuples indicating the image index and start and end positions of patches for semantic segementation.
+        """
+
+        masks_queries_logits_per_layer, class_queries_logits_per_layer = (), ()
+        attention_mask = None
+
+        if pixel_values is None:
+            raise ValueError("You have to specify pixel_values")
+
+        hidden_states = self.embeddings(pixel_values)
+
+        for idx, layer_module in enumerate(self.layers):
+            if idx == self.num_hidden_layers - self.config.num_blocks:
+                query = self.query.weight[None, :, :].expand(hidden_states.shape[0], -1, -1).to(hidden_states.device)
+                hidden_states = torch.cat((query, hidden_states), dim=1)
+
+            if idx >= self.num_hidden_layers - self.config.num_blocks and (
+                self.training or self.attn_mask_probs[idx - self.num_hidden_layers + self.config.num_blocks] > 0
+            ):
+                norm_hidden_states = self.layernorm(hidden_states)
+                masks_queries_logits, class_queries_logits = self.predict(norm_hidden_states)
+
+                masks_queries_logits_per_layer += (masks_queries_logits,)
+                class_queries_logits_per_layer += (class_queries_logits,)
+
+                attention_mask = torch.ones(
+                    hidden_states.shape[0],
+                    hidden_states.shape[1],
+                    hidden_states.shape[1],
+                    device=hidden_states.device,
+                    dtype=torch.bool,
+                )
+
+                interpolated_logits = F.interpolate(masks_queries_logits, size=self.grid_size, mode="bilinear")
+                interpolated_logits = interpolated_logits.view(
+                    interpolated_logits.size(0), interpolated_logits.size(1), -1
+                )
+
+                num_query_tokens = self.config.num_queries
+                encoder_start_tokens = num_query_tokens + self.embeddings.num_prefix_tokens
+
+                # Set attention mask for queries to focus on encoder tokens based on interpolated logits
+                attention_mask[:, :num_query_tokens, encoder_start_tokens:] = interpolated_logits > 0
+
+                # Disable attention mask for random query tokens.
+                attention_mask = self._disable_attention_mask(
+                    attention_mask,
+                    prob=self.attn_mask_probs[idx - self.num_hidden_layers + self.config.num_blocks],
+                    num_query_tokens=num_query_tokens,
+                    encoder_start_tokens=encoder_start_tokens,
+                    device=attention_mask.device,
+                )
+
+                # Expand attention mask to 4d mask.
+                attention_mask = attention_mask[:, None, ...].expand(-1, self.config.num_attention_heads, -1, -1)
+                attention_mask = attention_mask.float().masked_fill(~attention_mask, -1e9)
+
+            hidden_states = layer_module(hidden_states, attention_mask)
+
+        sequence_output = self.layernorm(hidden_states)
+
+        masks_queries_logits, class_queries_logits = self.predict(sequence_output)
+        masks_queries_logits_per_layer += (masks_queries_logits,)
+        class_queries_logits_per_layer += (class_queries_logits,)
+
+        loss = None
+        if mask_labels is not None and class_labels is not None:
+            loss = 0.0
+            for masks_queries_logits, class_queries_logits in zip(
+                masks_queries_logits_per_layer, class_queries_logits_per_layer
+            ):
+                loss_dict = self.get_loss_dict(
+                    masks_queries_logits=masks_queries_logits,
+                    class_queries_logits=class_queries_logits,
+                    mask_labels=mask_labels,
+                    class_labels=class_labels,
+                    auxiliary_predictions=None,
+                )
+                loss += self.get_loss(loss_dict)
+
+        return EomtForUniversalSegmentationOutput(
+            loss=loss,
+            masks_queries_logits=masks_queries_logits,
+            class_queries_logits=class_queries_logits,
+            last_hidden_state=sequence_output,
+            patch_offsets=patch_offsets,
+        )
+
+
+# Public names of this module (the ones picked up by `from <module> import *`).
+__all__ = ["EomtConfig", "EomtPreTrainedModel", "EomtForUniversalSegmentation"]
diff --git a/phivenv/Lib/site-packages/transformers/models/ernie/__init__.py b/phivenv/Lib/site-packages/transformers/models/ernie/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..9bb8983063ddb0117e8b0d7cd6603aa6ac3056b6
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/ernie/__init__.py
@@ -0,0 +1,27 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+# Static type checkers take the first branch and see the submodules' names through ordinary star imports.
+if TYPE_CHECKING:
+    from .configuration_ernie import *
+    from .modeling_ernie import *
+else:
+    import sys
+
+    # At runtime this package's entry in `sys.modules` is replaced by a `_LazyModule` built from the import
+    # structure of this file (presumably so submodules are only imported on first attribute access -- see
+    # `_LazyModule` for the exact behaviour).
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/phivenv/Lib/site-packages/transformers/models/ernie/__pycache__/__init__.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/ernie/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..60fdd8c652546a861f914644eec74bc896a14963
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/ernie/__pycache__/__init__.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/ernie/__pycache__/configuration_ernie.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/ernie/__pycache__/configuration_ernie.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a2aac1cb0b7b368b92741938c511f0b0bc6509af
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/ernie/__pycache__/configuration_ernie.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/ernie/__pycache__/modeling_ernie.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/ernie/__pycache__/modeling_ernie.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b39a38dd8313bf5fadad02c305264ee2e69c6efd
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/ernie/__pycache__/modeling_ernie.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/ernie/configuration_ernie.py b/phivenv/Lib/site-packages/transformers/models/ernie/configuration_ernie.py
new file mode 100644
index 0000000000000000000000000000000000000000..abf300f0ce51696dbdab756a6095689c8344ad08
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/ernie/configuration_ernie.py
@@ -0,0 +1,163 @@
+# coding=utf-8
+# Copyright 2022 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""ERNIE model configuration"""
+
+from collections import OrderedDict
+from collections.abc import Mapping
+
+from ...configuration_utils import PretrainedConfig
+from ...onnx import OnnxConfig
+from ...utils import logging
+
+
+# Module-level logger, named after this module.
+logger = logging.get_logger(__name__)
+
+
+class ErnieConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`ErnieModel`] or a [`TFErnieModel`]. It is used to
+    instantiate an ERNIE model according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of the ERNIE
+    [nghuyong/ernie-3.0-base-zh](https://huggingface.co/nghuyong/ernie-3.0-base-zh) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 30522):
+            Vocabulary size of the ERNIE model. Defines the number of different tokens that can be represented by the
+            `input_ids` passed when calling [`ErnieModel`] or [`TFErnieModel`].
+        hidden_size (`int`, *optional*, defaults to 768):
+            Dimensionality of the encoder layers and the pooler layer.
+        num_hidden_layers (`int`, *optional*, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 12):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        intermediate_size (`int`, *optional*, defaults to 3072):
+            Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
+        hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"silu"` and `"gelu_new"` are supported.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout ratio for the attention probabilities.
+        max_position_embeddings (`int`, *optional*, defaults to 512):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            just in case (e.g., 512 or 1024 or 2048).
+        type_vocab_size (`int`, *optional*, defaults to 2):
+            The vocabulary size of the `token_type_ids` passed when calling [`ErnieModel`] or [`TFErnieModel`].
+        task_type_vocab_size (`int`, *optional*, defaults to 3):
+            The vocabulary size of the `task_type_ids` for ERNIE2.0/ERNIE3.0 model
+        use_task_id (`bool`, *optional*, defaults to `False`):
+            Whether or not the model supports `task_type_ids`
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
+            The epsilon used by the layer normalization layers.
+        pad_token_id (`int`, *optional*, defaults to 0):
+            Padding token id.
+        position_embedding_type (`str`, *optional*, defaults to `"absolute"`):
+            Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For
+            positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to
+            [Self-Attention with Relative Position Representations (Shaw et al.)](https://huggingface.co/papers/1803.02155).
+            For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models
+            with Better Relative Position Embeddings (Huang et al.)](https://huggingface.co/papers/2009.13658).
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        classifier_dropout (`float`, *optional*):
+            The dropout ratio for the classification head.
+
+    Examples:
+
+    ```python
+    >>> from transformers import ErnieConfig, ErnieModel
+
+    >>> # Initializing an ERNIE nghuyong/ernie-3.0-base-zh style configuration
+    >>> configuration = ErnieConfig()
+
+    >>> # Initializing a model (with random weights) from the nghuyong/ernie-3.0-base-zh style configuration
+    >>> model = ErnieModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    # Identifier string for this configuration type.
+    model_type = "ernie"
+
+    def __init__(
+        self,
+        vocab_size=30522,
+        hidden_size=768,
+        num_hidden_layers=12,
+        num_attention_heads=12,
+        intermediate_size=3072,
+        hidden_act="gelu",
+        hidden_dropout_prob=0.1,
+        attention_probs_dropout_prob=0.1,
+        max_position_embeddings=512,
+        type_vocab_size=2,
+        task_type_vocab_size=3,
+        use_task_id=False,
+        initializer_range=0.02,
+        layer_norm_eps=1e-12,
+        pad_token_id=0,
+        position_embedding_type="absolute",
+        use_cache=True,
+        classifier_dropout=None,
+        **kwargs,
+    ):
+        # `pad_token_id` and any extra keyword arguments are handled by the parent initializer.
+        super().__init__(pad_token_id=pad_token_id, **kwargs)
+
+        # Every remaining argument is stored verbatim under its own name.
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.hidden_act = hidden_act
+        self.intermediate_size = intermediate_size
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.max_position_embeddings = max_position_embeddings
+        self.type_vocab_size = type_vocab_size
+        self.task_type_vocab_size = task_type_vocab_size
+        self.use_task_id = use_task_id
+        self.initializer_range = initializer_range
+        self.layer_norm_eps = layer_norm_eps
+        self.position_embedding_type = position_embedding_type
+        self.use_cache = use_cache
+        self.classifier_dropout = classifier_dropout
+
+
+class ErnieOnnxConfig(OnnxConfig):
+    @property
+    def inputs(self) -> Mapping[str, Mapping[int, str]]:
+        if self.task == "multiple-choice":
+            dynamic_axis = {0: "batch", 1: "choice", 2: "sequence"}
+        else:
+            dynamic_axis = {0: "batch", 1: "sequence"}
+        return OrderedDict(
+            [
+                ("input_ids", dynamic_axis),
+                ("attention_mask", dynamic_axis),
+                ("token_type_ids", dynamic_axis),
+                ("task_type_ids", dynamic_axis),
+            ]
+        )
+
+
+# Public names of this module (the ones picked up by `from <module> import *`).
+__all__ = ["ErnieConfig", "ErnieOnnxConfig"]
diff --git a/phivenv/Lib/site-packages/transformers/models/ernie/modeling_ernie.py b/phivenv/Lib/site-packages/transformers/models/ernie/modeling_ernie.py
new file mode 100644
index 0000000000000000000000000000000000000000..67ca76c97256801a89ca43ec561cfb931a485ece
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/ernie/modeling_ernie.py
@@ -0,0 +1,1689 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch ERNIE model."""
+
+import math
+import warnings
+from dataclasses import dataclass
+from typing import Optional, Union
+
+import torch
+import torch.utils.checkpoint
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+
+from ...activations import ACT2FN
+from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache
+from ...generation import GenerationMixin
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import (
+    BaseModelOutputWithPastAndCrossAttentions,
+    BaseModelOutputWithPoolingAndCrossAttentions,
+    CausalLMOutputWithCrossAttentions,
+    MaskedLMOutput,
+    MultipleChoiceModelOutput,
+    NextSentencePredictorOutput,
+    QuestionAnsweringModelOutput,
+    SequenceClassifierOutput,
+    TokenClassifierOutput,
+)
+from ...modeling_utils import PreTrainedModel
+from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
+from ...utils import ModelOutput, auto_docstring, logging
+from ...utils.deprecation import deprecate_kwarg
+from .configuration_ernie import ErnieConfig
+
+
+# Module-level logger obtained for this module's `__name__`.
+logger = logging.get_logger(__name__)
+
+
+class ErnieEmbeddings(nn.Module):
+    """Sum word, token-type, absolute-position and optional task-type embeddings, then normalize."""
+
+    def __init__(self, config):
+        super().__init__()
+        hidden = config.hidden_size
+        self.word_embeddings = nn.Embedding(config.vocab_size, hidden, padding_idx=config.pad_token_id)
+        self.position_embeddings = nn.Embedding(config.max_position_embeddings, hidden)
+        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, hidden)
+        self.use_task_id = config.use_task_id
+        if self.use_task_id:
+            # The task-type table only exists when the config enables task ids.
+            self.task_type_embeddings = nn.Embedding(config.task_type_vocab_size, hidden)
+
+        # `LayerNorm` deliberately keeps its TensorFlow-style name so that TensorFlow
+        # checkpoint files can be loaded without renaming variables.
+        self.LayerNorm = nn.LayerNorm(hidden, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
+        # Non-persistent helper buffers of shape (1, max_position_embeddings): the position
+        # index row and an all-zero token-type row used as defaults in `forward`.
+        self.register_buffer(
+            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
+        )
+        self.register_buffer(
+            "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
+        )
+
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        task_type_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        past_key_values_length: int = 0,
+    ) -> torch.Tensor:
+        """Build the input embeddings.
+
+        Either `input_ids` or `inputs_embeds` must be provided. Missing
+        `position_ids` are sliced from the registered buffer starting at
+        `past_key_values_length`; missing `token_type_ids` and `task_type_ids`
+        default to zeros. Returns the layer-normalized, dropped-out sum.
+        """
+        input_shape = input_ids.size() if input_ids is not None else inputs_embeds.size()[:-1]
+        batch_size, seq_length = input_shape[0], input_shape[1]
+
+        if position_ids is None:
+            start = past_key_values_length
+            position_ids = self.position_ids[:, start : seq_length + start]
+
+        # Default to the all-zero buffer registered in the constructor; having it as a
+        # buffer helps users who trace the model without passing token_type_ids (issue #5664).
+        if token_type_ids is None:
+            if hasattr(self, "token_type_ids"):
+                token_type_ids = self.token_type_ids[:, :seq_length].expand(batch_size, seq_length)
+            else:
+                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)
+
+        if inputs_embeds is None:
+            inputs_embeds = self.word_embeddings(input_ids)
+
+        embeddings = inputs_embeds + self.token_type_embeddings(token_type_ids)
+        if self.position_embedding_type == "absolute":
+            embeddings += self.position_embeddings(position_ids)
+
+        # ERNIE-specific: add the task-type embedding when enabled.
+        if self.use_task_id:
+            if task_type_ids is None:
+                task_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)
+            embeddings += self.task_type_embeddings(task_type_ids)
+
+        return self.dropout(self.LayerNorm(embeddings))
+
+
+# Copied from transformers.models.bert.modeling_bert.BertSelfAttention with Bert->Ernie
+class ErnieSelfAttention(nn.Module):
+    """Eager multi-head attention.
+
+    Acts as self-attention by default and as cross-attention when
+    `encoder_hidden_states` is passed to `forward`. Optionally adds relative
+    position scores (`"relative_key"` / `"relative_key_query"`) and reads/writes
+    a key/value cache.
+    """
+
+    def __init__(self, config, position_embedding_type=None, layer_idx=None):
+        super().__init__()
+        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
+            raise ValueError(
+                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
+                f"heads ({config.num_attention_heads})"
+            )
+
+        self.num_attention_heads = config.num_attention_heads
+        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
+        self.all_head_size = self.num_attention_heads * self.attention_head_size
+
+        self.query = nn.Linear(config.hidden_size, self.all_head_size)
+        self.key = nn.Linear(config.hidden_size, self.all_head_size)
+        self.value = nn.Linear(config.hidden_size, self.all_head_size)
+
+        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
+        # An explicit argument wins over the config value; "absolute" is the fallback.
+        self.position_embedding_type = position_embedding_type or getattr(
+            config, "position_embedding_type", "absolute"
+        )
+        if self.position_embedding_type in ("relative_key", "relative_key_query"):
+            self.max_position_embeddings = config.max_position_embeddings
+            # One embedding per signed distance in [-(max - 1), max - 1].
+            self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)
+
+        self.is_decoder = config.is_decoder
+        self.layer_idx = layer_idx
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        output_attentions: Optional[bool] = False,
+        cache_position: Optional[torch.Tensor] = None,
+    ) -> tuple[torch.Tensor]:
+        """Compute attention over `hidden_states` (or over `encoder_hidden_states` for cross-attention).
+
+        Args:
+            hidden_states: 3-D tensor whose first dimension is the batch; queries are projected from it.
+            attention_mask: Optional additive mask, added to the scaled attention scores.
+            head_mask: Optional multiplicative mask applied to the attention probabilities.
+            encoder_hidden_states: When given, keys and values are projected from it (cross-attention).
+            past_key_values: Optional cache. With an `EncoderDecoderCache`, the self- or
+                cross-attention sub-cache is used as appropriate. Any other cache is used for
+                self-attention only; cross-attention then recomputes keys and values on each call
+                and leaves the cache untouched.
+            output_attentions: Accepted for interface compatibility; the probabilities are always returned.
+            cache_position: Forwarded to the cache update for self-attention.
+
+        Returns:
+            `(context_layer, attention_probs)` where `context_layer` has its heads merged back
+            into a last dimension of size `all_head_size`.
+        """
+        batch_size, _, _ = hidden_states.shape
+        query_layer = self.query(hidden_states)
+        query_layer = query_layer.view(batch_size, -1, self.num_attention_heads, self.attention_head_size).transpose(
+            1, 2
+        )
+
+        is_cross_attention = encoder_hidden_states is not None
+        # Defaults for every path that has no usable cache for this attention type. Without them,
+        # cross-attention combined with a cache that is not an `EncoderDecoderCache` would hit
+        # unbound local variables below.
+        is_updated = False
+        curr_past_key_value = None
+        if past_key_values is not None:
+            if isinstance(past_key_values, EncoderDecoderCache):
+                is_updated = past_key_values.is_updated.get(self.layer_idx)
+                if is_cross_attention:
+                    # after the first generated id, we can subsequently re-use all key/value_layer from cache
+                    curr_past_key_value = past_key_values.cross_attention_cache
+                else:
+                    curr_past_key_value = past_key_values.self_attention_cache
+            elif not is_cross_attention:
+                curr_past_key_value = past_key_values
+            # else: a single cache holds self-attention states for this layer index; writing
+            # encoder keys/values into it would mix the two, so cross-attention skips caching.
+
+        current_states = encoder_hidden_states if is_cross_attention else hidden_states
+        if is_cross_attention and curr_past_key_value is not None and is_updated:
+            # reuse k,v, cross_attentions
+            key_layer = curr_past_key_value.layers[self.layer_idx].keys
+            value_layer = curr_past_key_value.layers[self.layer_idx].values
+        else:
+            key_layer = self.key(current_states)
+            key_layer = key_layer.view(batch_size, -1, self.num_attention_heads, self.attention_head_size).transpose(
+                1, 2
+            )
+            value_layer = self.value(current_states)
+            value_layer = value_layer.view(
+                batch_size, -1, self.num_attention_heads, self.attention_head_size
+            ).transpose(1, 2)
+
+            if curr_past_key_value is not None:
+                # save all key/value_layer to cache to be re-used for fast auto-regressive generation
+                cache_position = cache_position if not is_cross_attention else None
+                key_layer, value_layer = curr_past_key_value.update(
+                    key_layer, value_layer, self.layer_idx, {"cache_position": cache_position}
+                )
+                # set flag that curr layer for cross-attn is already updated so we can re-use in subsequent calls
+                if is_cross_attention:
+                    past_key_values.is_updated[self.layer_idx] = True
+
+        # Take the dot product between "query" and "key" to get the raw attention scores.
+        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
+
+        if self.position_embedding_type in ("relative_key", "relative_key_query"):
+            query_length, key_length = query_layer.shape[2], key_layer.shape[2]
+            if past_key_values is not None:
+                # With a cache, a single query position (the last key index) is used.
+                position_ids_l = torch.tensor(key_length - 1, dtype=torch.long, device=hidden_states.device).view(
+                    -1, 1
+                )
+            else:
+                position_ids_l = torch.arange(query_length, dtype=torch.long, device=hidden_states.device).view(-1, 1)
+            position_ids_r = torch.arange(key_length, dtype=torch.long, device=hidden_states.device).view(1, -1)
+            distance = position_ids_l - position_ids_r
+
+            # Shift distances so that they index into the non-negative embedding table.
+            positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1)
+            positional_embedding = positional_embedding.to(dtype=query_layer.dtype)  # fp16 compatibility
+
+            if self.position_embedding_type == "relative_key":
+                relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
+                attention_scores = attention_scores + relative_position_scores
+            elif self.position_embedding_type == "relative_key_query":
+                relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
+                relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding)
+                attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key
+
+        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
+        if attention_mask is not None:
+            # Apply the attention mask (precomputed for all layers in ErnieModel forward() function)
+            attention_scores = attention_scores + attention_mask
+
+        # Normalize the attention scores to probabilities.
+        attention_probs = nn.functional.softmax(attention_scores, dim=-1)
+
+        # This is actually dropping out entire tokens to attend to, which might
+        # seem a bit unusual, but is taken from the original Transformer paper.
+        attention_probs = self.dropout(attention_probs)
+
+        # Mask heads if we want to
+        if head_mask is not None:
+            attention_probs = attention_probs * head_mask
+
+        context_layer = torch.matmul(attention_probs, value_layer)
+
+        # Move the head axis next to the head-size axis and merge the two.
+        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
+        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
+        context_layer = context_layer.view(new_context_layer_shape)
+
+        return context_layer, attention_probs
+
+
+# Copied from transformers.models.bert.modeling_bert.BertSelfOutput with Bert->Ernie
+class ErnieSelfOutput(nn.Module):
+    """Project the attention output, apply dropout, then layer-normalize the residual sum."""
+
+    def __init__(self, config):
+        super().__init__()
+        width = config.hidden_size
+        self.dense = nn.Linear(width, width)
+        self.LayerNorm = nn.LayerNorm(width, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
+        """Return `LayerNorm(dropout(dense(hidden_states)) + input_tensor)`."""
+        projected = self.dropout(self.dense(hidden_states))
+        return self.LayerNorm(projected + input_tensor)
+
+
+# Maps an attention-implementation name to the self-attention class to instantiate;
+# only the "eager" implementation is registered here.
+ERNIE_SELF_ATTENTION_CLASSES = {
+    "eager": ErnieSelfAttention,
+}
+
+
+# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->Ernie,BERT->ERNIE
+class ErnieAttention(nn.Module):
+    """Attention sub-layer: the configured self-attention module followed by its output projection."""
+
+    def __init__(self, config, position_embedding_type=None, layer_idx=None):
+        super().__init__()
+        attention_cls = ERNIE_SELF_ATTENTION_CLASSES[config._attn_implementation]
+        self.self = attention_cls(config, position_embedding_type=position_embedding_type, layer_idx=layer_idx)
+        self.output = ErnieSelfOutput(config)
+        self.pruned_heads = set()
+
+    def prune_heads(self, heads):
+        """Remove the given attention heads from the projections and record them as pruned."""
+        if len(heads) == 0:
+            return
+
+        attn = self.self
+        heads, index = find_pruneable_heads_and_indices(
+            heads, attn.num_attention_heads, attn.attention_head_size, self.pruned_heads
+        )
+
+        # Shrink the three input projections, then the input side of the output projection.
+        for projection in ("query", "key", "value"):
+            setattr(attn, projection, prune_linear_layer(getattr(attn, projection), index))
+        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
+
+        # Keep the bookkeeping in sync with the smaller projections.
+        attn.num_attention_heads = attn.num_attention_heads - len(heads)
+        attn.all_head_size = attn.attention_head_size * attn.num_attention_heads
+        self.pruned_heads = self.pruned_heads.union(heads)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        output_attentions: Optional[bool] = False,
+        cache_position: Optional[torch.Tensor] = None,
+    ) -> tuple[torch.Tensor]:
+        """Run attention and the output projection.
+
+        Returns a tuple whose first item is the projected attention output; any further
+        items returned by the inner attention module are appended unchanged.
+        """
+        inner = self.self(
+            hidden_states,
+            attention_mask=attention_mask,
+            head_mask=head_mask,
+            encoder_hidden_states=encoder_hidden_states,
+            past_key_values=past_key_values,
+            output_attentions=output_attentions,
+            cache_position=cache_position,
+        )
+        projected = self.output(inner[0], hidden_states)
+        return (projected,) + inner[1:]
+
+
+# Copied from transformers.models.bert.modeling_bert.BertIntermediate with Bert->Ernie
+class ErnieIntermediate(nn.Module):
+    """Feed-forward expansion: a linear layer to `intermediate_size` followed by the activation."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
+        activation = config.hidden_act
+        # A string is looked up in ACT2FN; anything else is used as the activation directly.
+        self.intermediate_act_fn = ACT2FN[activation] if isinstance(activation, str) else activation
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        """Return `activation(dense(hidden_states))`."""
+        return self.intermediate_act_fn(self.dense(hidden_states))
+
+
+# Copied from transformers.models.bert.modeling_bert.BertOutput with Bert->Ernie
+class ErnieOutput(nn.Module):
+    """Feed-forward contraction: project back to `hidden_size`, apply dropout, add the residual, normalize."""
+
+    def __init__(self, config):
+        super().__init__()
+        width = config.hidden_size
+        self.dense = nn.Linear(config.intermediate_size, width)
+        self.LayerNorm = nn.LayerNorm(width, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
+        """Return `LayerNorm(dropout(dense(hidden_states)) + input_tensor)`."""
+        contracted = self.dropout(self.dense(hidden_states))
+        return self.LayerNorm(contracted + input_tensor)
+
+
+# Copied from transformers.models.bert.modeling_bert.BertLayer with Bert->Ernie
+class ErnieLayer(GradientCheckpointingLayer):
+    """One transformer layer: self-attention, optional cross-attention, then the chunked feed-forward block."""
+
+    def __init__(self, config, layer_idx=None):
+        super().__init__()
+        self.chunk_size_feed_forward = config.chunk_size_feed_forward
+        self.seq_len_dim = 1
+        self.attention = ErnieAttention(config, layer_idx=layer_idx)
+        self.is_decoder = config.is_decoder
+        self.add_cross_attention = config.add_cross_attention
+        if self.add_cross_attention and not self.is_decoder:
+            raise ValueError(f"{self} should be used as a decoder model if cross attention is added")
+        if self.add_cross_attention:
+            # Cross-attention always uses absolute position embeddings.
+            self.crossattention = ErnieAttention(config, position_embedding_type="absolute", layer_idx=layer_idx)
+        self.intermediate = ErnieIntermediate(config)
+        self.output = ErnieOutput(config)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        output_attentions: Optional[bool] = False,
+        cache_position: Optional[torch.Tensor] = None,
+    ) -> tuple[torch.Tensor]:
+        """Apply the layer.
+
+        Returns a tuple starting with the layer output, followed by the extra items
+        returned by self-attention and, when cross-attention ran, by cross-attention.
+
+        Raises:
+            ValueError: if `encoder_hidden_states` is passed to a decoder layer that was
+                built without cross-attention.
+        """
+        self_attn_result = self.attention(
+            hidden_states,
+            attention_mask=attention_mask,
+            head_mask=head_mask,
+            output_attentions=output_attentions,
+            past_key_values=past_key_values,
+            cache_position=cache_position,
+        )
+        attention_output = self_attn_result[0]
+        extras = self_attn_result[1:]  # self-attention weights, when returned
+
+        wants_cross_attention = self.is_decoder and encoder_hidden_states is not None
+        if wants_cross_attention:
+            if not hasattr(self, "crossattention"):
+                raise ValueError(
+                    f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers"
+                    " by setting `config.add_cross_attention=True`"
+                )
+
+            cross_attn_result = self.crossattention(
+                attention_output,
+                attention_mask=encoder_attention_mask,
+                head_mask=head_mask,
+                encoder_hidden_states=encoder_hidden_states,
+                past_key_values=past_key_values,
+                output_attentions=output_attentions,
+                cache_position=cache_position,
+            )
+            attention_output = cross_attn_result[0]
+            extras = extras + cross_attn_result[1:]  # cross-attention weights, when returned
+
+        layer_output = apply_chunking_to_forward(
+            self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
+        )
+        return (layer_output,) + extras
+
+    def feed_forward_chunk(self, attention_output):
+        """Run the intermediate and output blocks on one chunk of the attention output."""
+        return self.output(self.intermediate(attention_output), attention_output)
+
+
+# Copied from transformers.models.bert.modeling_bert.BertEncoder with Bert->Ernie
+class ErnieEncoder(nn.Module):
+    """Stack of `config.num_hidden_layers` `ErnieLayer` modules applied in sequence."""
+
+    def __init__(self, config, layer_idx=None):
+        super().__init__()
+        # `layer_idx` is accepted but unused: each layer receives its own index below.
+        self.config = config
+        self.layer = nn.ModuleList([ErnieLayer(config, layer_idx=i) for i in range(config.num_hidden_layers)])
+        self.gradient_checkpointing = False
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+        past_key_values: Optional[Union[Cache, tuple[tuple[torch.FloatTensor]]]] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = False,
+        output_hidden_states: Optional[bool] = False,
+        return_dict: Optional[bool] = True,
+        cache_position: Optional[torch.Tensor] = None,
+    ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]:
+        """Run all layers over `hidden_states`.
+
+        Args:
+            hidden_states: Input to the first layer.
+            attention_mask: Passed unchanged to every layer.
+            head_mask: Optional; indexed per layer (`head_mask[i]`).
+            encoder_hidden_states: Passed to every layer; enables cross-attention in decoder layers.
+            encoder_attention_mask: Mask used by cross-attention.
+            past_key_values: Cache object, or a legacy tuple that is converted (with a
+                deprecation warning) when caching is active on a decoder.
+            use_cache: Whether to create/use a cache; forced to `False` (with a warning) when
+                gradient checkpointing is active during training.
+            output_attentions: Collect attention weights of every layer.
+            output_hidden_states: Collect the hidden states before each layer and after the last.
+            return_dict: Return a `BaseModelOutputWithPastAndCrossAttentions` instead of a tuple.
+            cache_position: Passed unchanged to every layer.
+
+        Returns:
+            The model output object, or, when `return_dict` is false, a tuple of the
+            non-`None` values among (last hidden state, cache, all hidden states, self-attentions,
+            cross-attentions).
+        """
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attentions = () if output_attentions else None
+        all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
+
+        if self.gradient_checkpointing and self.training:
+            if use_cache:
+                logger.warning_once(
+                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+                )
+                use_cache = False
+
+        if use_cache and self.config.is_decoder and past_key_values is None:
+            past_key_values = EncoderDecoderCache(DynamicCache(config=self.config), DynamicCache(config=self.config))
+
+        if use_cache and self.config.is_decoder and isinstance(past_key_values, tuple):
+            logger.warning_once(
+                "Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. "
+                "You should pass an instance of `EncoderDecoderCache` instead, e.g. "
+                "`past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`."
+            )
+            past_key_values = EncoderDecoderCache.from_legacy_cache(past_key_values)
+
+        for i, layer_module in enumerate(self.layer):
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)
+
+            layer_head_mask = head_mask[i] if head_mask is not None else None
+
+            layer_outputs = layer_module(
+                hidden_states,
+                attention_mask,
+                layer_head_mask,
+                encoder_hidden_states,  # as a positional argument for gradient checkpointing
+                encoder_attention_mask=encoder_attention_mask,
+                past_key_values=past_key_values,
+                output_attentions=output_attentions,
+                cache_position=cache_position,
+            )
+
+            hidden_states = layer_outputs[0]
+            if output_attentions:
+                all_self_attentions = all_self_attentions + (layer_outputs[1],)
+                # A layer only returns cross-attention weights when it actually ran cross-attention
+                # (i.e. `encoder_hidden_states` was given). Guard the index so that a model built
+                # with `add_cross_attention=True` can still be called without encoder states.
+                if self.config.add_cross_attention and len(layer_outputs) > 2:
+                    all_cross_attentions = all_cross_attentions + (layer_outputs[2],)
+
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+
+        if not return_dict:
+            return tuple(
+                v
+                for v in [
+                    hidden_states,
+                    past_key_values,
+                    all_hidden_states,
+                    all_self_attentions,
+                    all_cross_attentions,
+                ]
+                if v is not None
+            )
+        return BaseModelOutputWithPastAndCrossAttentions(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attentions,
+            cross_attentions=all_cross_attentions,
+        )
+
+
+# Copied from transformers.models.bert.modeling_bert.BertPooler with Bert->Ernie
+class ErniePooler(nn.Module):
+    """Pool a sequence by passing its first token's hidden state through a dense layer and tanh."""
+
+    def __init__(self, config):
+        super().__init__()
+        width = config.hidden_size
+        self.dense = nn.Linear(width, width)
+        self.activation = nn.Tanh()
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        """Return `tanh(dense(hidden_states[:, 0]))`."""
+        # "Pooling" here simply means selecting the hidden state of the first token.
+        first_token = hidden_states[:, 0]
+        return self.activation(self.dense(first_token))
+
+
+# Copied from transformers.models.bert.modeling_bert.BertPredictionHeadTransform with Bert->Ernie
+class ErniePredictionHeadTransform(nn.Module):
+    """Dense layer, activation and layer norm applied before the vocabulary projection."""
+
+    def __init__(self, config):
+        super().__init__()
+        width = config.hidden_size
+        self.dense = nn.Linear(width, width)
+        activation = config.hidden_act
+        # A string is looked up in ACT2FN; anything else is used as the activation directly.
+        self.transform_act_fn = ACT2FN[activation] if isinstance(activation, str) else activation
+        self.LayerNorm = nn.LayerNorm(width, eps=config.layer_norm_eps)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        """Return `LayerNorm(activation(dense(hidden_states)))`."""
+        activated = self.transform_act_fn(self.dense(hidden_states))
+        return self.LayerNorm(activated)
+
+
+# Copied from transformers.models.bert.modeling_bert.BertLMPredictionHead with Bert->Ernie
+class ErnieLMPredictionHead(nn.Module):
+    """Language-modeling head: the prediction transform followed by a projection to the vocabulary."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.transform = ErniePredictionHeadTransform(config)
+
+        # The projection is created without its own bias: its weights are the same as the
+        # input embeddings, and the output-only per-token bias is kept as a separate parameter.
+        self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+        self.bias = nn.Parameter(torch.zeros(config.vocab_size))
+
+        # Link the two so that `resize_token_embeddings` resizes the bias correctly.
+        self.decoder.bias = self.bias
+
+    def _tie_weights(self):
+        """Re-attach the standalone bias parameter to the decoder."""
+        self.decoder.bias = self.bias
+
+    def forward(self, hidden_states):
+        """Return the vocabulary scores `decoder(transform(hidden_states))`."""
+        return self.decoder(self.transform(hidden_states))
+
+
+# Copied from transformers.models.bert.modeling_bert.BertOnlyMLMHead with Bert->Ernie
+class ErnieOnlyMLMHead(nn.Module):
+    """Wrapper exposing only the masked-language-modeling prediction head."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.predictions = ErnieLMPredictionHead(config)
+
+    def forward(self, sequence_output: torch.Tensor) -> torch.Tensor:
+        """Return the prediction scores computed from `sequence_output`."""
+        return self.predictions(sequence_output)
+
+
+# Copied from transformers.models.bert.modeling_bert.BertOnlyNSPHead with Bert->Ernie
+class ErnieOnlyNSPHead(nn.Module):
+    """Two-way linear classifier applied to the pooled output."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.seq_relationship = nn.Linear(config.hidden_size, 2)
+
+    def forward(self, pooled_output):
+        """Return the two sequence-relationship scores for each pooled vector."""
+        return self.seq_relationship(pooled_output)
+
+
+# Copied from transformers.models.bert.modeling_bert.BertPreTrainingHeads with Bert->Ernie
+class ErniePreTrainingHeads(nn.Module):
+    """Both pre-training heads: vocabulary prediction and two-way sequence relationship."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.predictions = ErnieLMPredictionHead(config)
+        self.seq_relationship = nn.Linear(config.hidden_size, 2)
+
+    def forward(self, sequence_output, pooled_output):
+        """Return `(prediction_scores, seq_relationship_score)` for the two inputs."""
+        return self.predictions(sequence_output), self.seq_relationship(pooled_output)
+
+
+@auto_docstring
+class ErniePreTrainedModel(PreTrainedModel):
+    # Common base class of the ERNIE models: declares the config type, the base-model
+    # prefix, gradient-checkpointing support and the weight-initialization rule.
+    config: ErnieConfig
+    base_model_prefix = "ernie"
+    supports_gradient_checkpointing = True
+
+    def _init_weights(self, module):
+        """Initialize the weights of a single sub-module in place.
+
+        Linear and embedding weights are drawn from a normal distribution with standard
+        deviation `config.initializer_range`; linear biases and the embedding padding row
+        are zeroed. Layer norms are reset to weight 1 and bias 0. Other modules are left alone.
+        """
+        if isinstance(module, nn.Linear):
+            # Slightly different from the TF version which uses truncated_normal for initialization
+            # cf https://github.com/pytorch/pytorch/pull/5617
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            bias = module.bias
+            if bias is not None:
+                bias.data.zero_()
+            return
+
+        if isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            padding_idx = module.padding_idx
+            if padding_idx is not None:
+                module.weight.data[padding_idx].zero_()
+            return
+
+        if isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Output type of [`ErnieForPreTraining`].
+    """
+)
+# Copied from transformers.models.bert.modeling_bert.BertForPreTrainingOutput with Bert->Ernie
+class ErnieForPreTrainingOutput(ModelOutput):
+    r"""
+    loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
+        Total loss as the sum of the masked language modeling loss and the next sequence prediction
+        (classification) loss.
+    prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
+        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+    seq_relationship_logits (`torch.FloatTensor` of shape `(batch_size, 2)`):
+        Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
+        before SoftMax).
+    """
+
+    # Every field defaults to `None`, so an instance can be built with any subset of them.
+    loss: Optional[torch.FloatTensor] = None
+    prediction_logits: Optional[torch.FloatTensor] = None
+    seq_relationship_logits: Optional[torch.FloatTensor] = None
+    # `hidden_states` and `attentions` are not described in the docstring above;
+    # NOTE(review): presumably their documentation is supplied by `auto_docstring` - confirm.
+    hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    attentions: Optional[tuple[torch.FloatTensor]] = None
+
+
+@auto_docstring(
+    custom_intro="""
+    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
+    cross-attention is added between the self-attention layers, following the architecture described in [Attention is
+    all you need](https://huggingface.co/papers/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
+    Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.
+
+    To behave as a decoder the model needs to be initialized with the `is_decoder` argument of the configuration set
+    to `True`. For Seq2Seq use, also set `add_cross_attention=True` and pass `encoder_hidden_states` to `forward`.
+    """
+)
+class ErnieModel(ErniePreTrainedModel):
+    # Copied from transformers.models.clap.modeling_clap.ClapTextModel.__init__ with ClapText->Ernie
+    def __init__(self, config, add_pooling_layer=True):
+        r"""
+        add_pooling_layer (bool, *optional*, defaults to `True`):
+            Whether to add a pooling layer; when `False`, `self.pooler` is `None` and no pooled output is produced.
+        """
+        super().__init__(config)
+        self.config = config
+
+        self.embeddings = ErnieEmbeddings(config)
+        self.encoder = ErnieEncoder(config)
+
+        self.pooler = ErniePooler(config) if add_pooling_layer else None
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    # Copied from transformers.models.bert.modeling_bert.BertModel.get_input_embeddings
+    def get_input_embeddings(self):
+        return self.embeddings.word_embeddings
+
+    # Copied from transformers.models.bert.modeling_bert.BertModel.set_input_embeddings
+    def set_input_embeddings(self, value):
+        self.embeddings.word_embeddings = value
+
+    # Copied from transformers.models.bert.modeling_bert.BertModel._prune_heads
+    def _prune_heads(self, heads_to_prune):
+        """
+        Prunes heads of the model. `heads_to_prune` maps a layer index to the list of heads to prune in that layer;
+        see the base class `PreTrainedModel`.
+        """
+        for layer, heads in heads_to_prune.items():
+            self.encoder.layer[layer].attention.prune_heads(heads)
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        task_type_ids: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        past_key_values: Optional[list[torch.FloatTensor]] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]:
+        r"""
+        task_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Task type embedding is a special embedding to represent the characteristic of different tasks, such as
+            word-aware pre-training task, structure-aware pre-training task and semantic-aware pre-training task. We
+            assign a `task_type_id` to each task and the `task_type_id` is in the range `[0,
+            config.task_type_vocab_size-1]`.
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if self.config.is_decoder:
+            use_cache = use_cache if use_cache is not None else self.config.use_cache
+        else:
+            use_cache = False  # caching is forced off unless the model is configured as a decoder
+
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+        elif input_ids is not None:
+            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
+            input_shape = input_ids.size()
+        elif inputs_embeds is not None:
+            input_shape = inputs_embeds.size()[:-1]  # every dimension except the last one
+        else:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+        batch_size, seq_length = input_shape
+        device = input_ids.device if input_ids is not None else inputs_embeds.device
+
+        past_key_values_length = 0
+        if past_key_values is not None:
+            past_key_values_length = (
+                past_key_values[0][0].shape[-2]  # legacy (non-`Cache`) format: length read from the first entry
+                if not isinstance(past_key_values, Cache)
+                else past_key_values.get_seq_length()
+            )
+
+        if attention_mask is None:
+            attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device)
+
+        if token_type_ids is None:
+            if hasattr(self.embeddings, "token_type_ids"):
+                buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length]
+                buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length)
+                token_type_ids = buffered_token_type_ids_expanded
+            else:
+                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
+
+        # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
+        # ourselves in which case we just need to make it broadcastable to all heads.
+        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape)
+
+        # If a 2D or 3D attention mask is provided for the cross-attention
+        # we need to make it broadcastable to [batch_size, num_heads, seq_length, seq_length]
+        if self.config.is_decoder and encoder_hidden_states is not None:
+            encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
+            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
+            if encoder_attention_mask is None:
+                encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
+            encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
+        else:
+            encoder_extended_attention_mask = None
+
+        # Prepare head mask if needed
+        # 1.0 in head_mask indicate we keep the head
+        # attention_probs has shape bsz x n_heads x N x N
+        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
+        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
+        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
+
+        embedding_output = self.embeddings(
+            input_ids=input_ids,
+            position_ids=position_ids,
+            token_type_ids=token_type_ids,
+            task_type_ids=task_type_ids,
+            inputs_embeds=inputs_embeds,
+            past_key_values_length=past_key_values_length,
+        )
+        encoder_outputs = self.encoder(
+            embedding_output,
+            attention_mask=extended_attention_mask,
+            head_mask=head_mask,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_extended_attention_mask,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        sequence_output = encoder_outputs[0]
+        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None
+
+        if not return_dict:
+            return (sequence_output, pooled_output) + encoder_outputs[1:]  # pooled_output may be None here
+
+        return BaseModelOutputWithPoolingAndCrossAttentions(
+            last_hidden_state=sequence_output,
+            pooler_output=pooled_output,
+            past_key_values=encoder_outputs.past_key_values,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+            cross_attentions=encoder_outputs.cross_attentions,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    Ernie Model with two heads on top as done during the pretraining: a `masked language modeling` head and a `next
+    sentence prediction (classification)` head.
+    """
+)
+class ErnieForPreTraining(ErniePreTrainedModel):
+    _tied_weights_keys = ["cls.predictions.decoder.bias", "cls.predictions.decoder.weight"]
+
+    # Copied from transformers.models.bert.modeling_bert.BertForPreTraining.__init__ with Bert->Ernie,bert->ernie
+    def __init__(self, config):
+        super().__init__(config)
+
+        self.ernie = ErnieModel(config)
+        self.cls = ErniePreTrainingHeads(config)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    # Copied from transformers.models.bert.modeling_bert.BertForPreTraining.get_output_embeddings
+    def get_output_embeddings(self):
+        return self.cls.predictions.decoder
+
+    # Copied from transformers.models.bert.modeling_bert.BertForPreTraining.set_output_embeddings
+    def set_output_embeddings(self, new_embeddings):
+        self.cls.predictions.decoder = new_embeddings
+        self.cls.predictions.bias = new_embeddings.bias  # keep the head's own bias reference on the new decoder's bias
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        task_type_ids: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        next_sentence_label: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple[torch.Tensor], ErnieForPreTrainingOutput]:
+        r"""
+        task_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Task type embedding is a special embedding to represent the characteristic of different tasks, such as
+            word-aware pre-training task, structure-aware pre-training task and semantic-aware pre-training task. We
+            assign a `task_type_id` to each task and the `task_type_id` is in the range `[0,
+            config.task_type_vocab_size-1]`.
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
+            config.vocab_size - 1]` (see `input_ids` docstring). Tokens with indices set to `-100` are ignored (masked),
+            the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size - 1]`.
+        next_sentence_label (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the next sentence prediction (classification) loss. Input should be a sequence
+            pair (see `input_ids` docstring). Indices should be in `[0, 1]`:
+
+            - 0 indicates sequence B is a continuation of sequence A,
+            - 1 indicates sequence B is a random sequence.
+
+        Example:
+
+        ```python
+        >>> from transformers import AutoTokenizer, ErnieForPreTraining
+        >>> import torch
+
+        >>> tokenizer = AutoTokenizer.from_pretrained("nghuyong/ernie-1.0-base-zh")
+        >>> model = ErnieForPreTraining.from_pretrained("nghuyong/ernie-1.0-base-zh")
+
+        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+        >>> outputs = model(**inputs)
+
+        >>> prediction_logits = outputs.prediction_logits
+        >>> seq_relationship_logits = outputs.seq_relationship_logits
+        ```
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.ernie(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            task_type_ids=task_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        sequence_output, pooled_output = outputs[:2]
+        prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output)
+
+        total_loss = None
+        if labels is not None and next_sentence_label is not None:  # both label sets are required for a loss
+            loss_fct = CrossEntropyLoss()
+            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
+            next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1))
+            total_loss = masked_lm_loss + next_sentence_loss
+
+        if not return_dict:
+            output = (prediction_scores, seq_relationship_score) + outputs[2:]  # replace the first two base outputs
+            return ((total_loss,) + output) if total_loss is not None else output
+
+        return ErnieForPreTrainingOutput(
+            loss=total_loss,
+            prediction_logits=prediction_scores,
+            seq_relationship_logits=seq_relationship_score,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    Ernie Model with a `language modeling` head on top for CLM fine-tuning.
+    """
+)
+class ErnieForCausalLM(ErniePreTrainedModel, GenerationMixin):
+    _tied_weights_keys = ["cls.predictions.decoder.bias", "cls.predictions.decoder.weight"]
+
+    # Copied from transformers.models.bert.modeling_bert.BertLMHeadModel.__init__ with BertLMHeadModel->ErnieForCausalLM,Bert->Ernie,bert->ernie
+    def __init__(self, config):
+        super().__init__(config)
+
+        if not config.is_decoder:
+            logger.warning("If you want to use `ErnieForCausalLM` as a standalone, add `is_decoder=True.`")
+
+        self.ernie = ErnieModel(config, add_pooling_layer=False)
+        self.cls = ErnieOnlyMLMHead(config)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    # Copied from transformers.models.bert.modeling_bert.BertLMHeadModel.get_output_embeddings
+    def get_output_embeddings(self):
+        return self.cls.predictions.decoder
+
+    # Copied from transformers.models.bert.modeling_bert.BertLMHeadModel.set_output_embeddings
+    def set_output_embeddings(self, new_embeddings):
+        self.cls.predictions.decoder = new_embeddings
+        self.cls.predictions.bias = new_embeddings.bias  # keep the head's own bias reference on the new decoder's bias
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        task_type_ids: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        past_key_values: Optional[list[torch.Tensor]] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        **kwargs,
+    ) -> Union[tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]:
+        r"""
+        task_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Task type embedding is a special embedding to represent the characteristic of different tasks, such as
+            word-aware pre-training task, structure-aware pre-training task and semantic-aware pre-training task. We
+            assign a `task_type_id` to each task and the `task_type_id` is in the range `[0,
+            config.task_type_vocab_size-1]`.
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
+            `[-100, 0, ..., config.vocab_size - 1]` (see `input_ids` docstring). Tokens with indices set to `-100` are
+            ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size - 1]`.
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        if labels is not None:
+            use_cache = False  # caching is turned off whenever labels are supplied
+
+        outputs = self.ernie(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            task_type_ids=task_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_attention_mask,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        sequence_output = outputs[0]
+        prediction_scores = self.cls(sequence_output)
+
+        lm_loss = None
+        if labels is not None:
+            lm_loss = self.loss_function(
+                prediction_scores,
+                labels,
+                vocab_size=self.config.vocab_size,
+                **kwargs,  # extra keyword arguments of `forward` are handed to the loss function
+            )
+
+        if not return_dict:
+            output = (prediction_scores,) + outputs[2:]  # skip the sequence output and the pooled-output slot
+            return ((lm_loss,) + output) if lm_loss is not None else output
+
+        return CausalLMOutputWithCrossAttentions(
+            loss=lm_loss,
+            logits=prediction_scores,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            cross_attentions=outputs.cross_attentions,
+        )
+
+
+@auto_docstring
+class ErnieForMaskedLM(ErniePreTrainedModel):
+    _tied_weights_keys = ["cls.predictions.decoder.bias", "cls.predictions.decoder.weight"]
+
+    # Copied from transformers.models.bert.modeling_bert.BertForMaskedLM.__init__ with Bert->Ernie,bert->ernie
+    def __init__(self, config):
+        super().__init__(config)
+
+        if config.is_decoder:
+            logger.warning(
+                "If you want to use `ErnieForMaskedLM` make sure `config.is_decoder=False` for "
+                "bi-directional self-attention."
+            )
+
+        self.ernie = ErnieModel(config, add_pooling_layer=False)
+        self.cls = ErnieOnlyMLMHead(config)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    # Copied from transformers.models.bert.modeling_bert.BertForMaskedLM.get_output_embeddings
+    def get_output_embeddings(self):
+        return self.cls.predictions.decoder
+
+    # Copied from transformers.models.bert.modeling_bert.BertForMaskedLM.set_output_embeddings
+    def set_output_embeddings(self, new_embeddings):
+        self.cls.predictions.decoder = new_embeddings
+        self.cls.predictions.bias = new_embeddings.bias  # keep the head's own bias reference on the new decoder's bias
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        task_type_ids: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple[torch.Tensor], MaskedLMOutput]:
+        r"""
+        task_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Task type embedding is a special embedding to represent the characteristic of different tasks, such as
+            word-aware pre-training task, structure-aware pre-training task and semantic-aware pre-training task. We
+            assign a `task_type_id` to each task and the `task_type_id` is in the range `[0,
+            config.task_type_vocab_size-1]`.
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
+            config.vocab_size - 1]` (see `input_ids` docstring). Tokens with indices set to `-100` are ignored (masked), the
+            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size - 1]`.
+        """
+
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.ernie(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            task_type_ids=task_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        sequence_output = outputs[0]
+        prediction_scores = self.cls(sequence_output)
+
+        masked_lm_loss = None
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()  # labels set to -100 (the default `ignore_index`) do not contribute
+            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
+
+        if not return_dict:
+            output = (prediction_scores,) + outputs[2:]  # skip the sequence output and the pooled-output slot
+            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
+
+        return MaskedLMOutput(
+            loss=masked_lm_loss,
+            logits=prediction_scores,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+    # Copied from transformers.models.bert.modeling_bert.BertForMaskedLM.prepare_inputs_for_generation
+    def prepare_inputs_for_generation(self, input_ids, attention_mask=None, **model_kwargs):
+        input_shape = input_ids.shape
+        effective_batch_size = input_shape[0]
+
+        # Append a dummy PAD token; NOTE(review): `attention_mask` defaults to None but is dereferenced below.
+        if self.config.pad_token_id is None:
+            raise ValueError("The PAD token should be defined for generation")
+
+        attention_mask = torch.cat([attention_mask, attention_mask.new_zeros((attention_mask.shape[0], 1))], dim=-1)
+        dummy_token = torch.full(
+            (effective_batch_size, 1), self.config.pad_token_id, dtype=torch.long, device=input_ids.device
+        )
+        input_ids = torch.cat([input_ids, dummy_token], dim=1)
+
+        return {"input_ids": input_ids, "attention_mask": attention_mask}  # `model_kwargs` are not forwarded
+
+    @classmethod
+    def can_generate(cls) -> bool:
+        """
+        Legacy correction: ErnieForMaskedLM can't call `generate()` from `GenerationMixin`, even though it has a
+        `prepare_inputs_for_generation` method.
+        """
+        return False
+
+
+@auto_docstring(
+    custom_intro="""
+    Ernie Model with a `next sentence prediction (classification)` head on top.
+    """
+)
+class ErnieForNextSentencePrediction(ErniePreTrainedModel):
+    # Copied from transformers.models.bert.modeling_bert.BertForNextSentencePrediction.__init__ with Bert->Ernie,bert->ernie
+    def __init__(self, config):
+        super().__init__(config)
+
+        self.ernie = ErnieModel(config)
+        self.cls = ErnieOnlyNSPHead(config)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        task_type_ids: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        **kwargs,
+    ) -> Union[tuple[torch.Tensor], NextSentencePredictorOutput]:
+        r"""
+        task_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Task type embedding is a special embedding to represent the characteristic of different tasks, such as
+            word-aware pre-training task, structure-aware pre-training task and semantic-aware pre-training task. We
+            assign a `task_type_id` to each task and the `task_type_id` is in the range `[0,
+            config.task_type_vocab_size-1]`.
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the next sentence prediction (classification) loss. Input should be a sequence pair
+            (see `input_ids` docstring). Indices should be in `[0, 1]`:
+
+            - 0 indicates sequence B is a continuation of sequence A,
+            - 1 indicates sequence B is a random sequence.
+
+        Example:
+
+        ```python
+        >>> from transformers import AutoTokenizer, ErnieForNextSentencePrediction
+        >>> import torch
+
+        >>> tokenizer = AutoTokenizer.from_pretrained("nghuyong/ernie-1.0-base-zh")
+        >>> model = ErnieForNextSentencePrediction.from_pretrained("nghuyong/ernie-1.0-base-zh")
+
+        >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
+        >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
+        >>> encoding = tokenizer(prompt, next_sentence, return_tensors="pt")
+
+        >>> outputs = model(**encoding, labels=torch.LongTensor([1]))
+        >>> logits = outputs.logits
+        >>> assert logits[0, 0] < logits[0, 1]  # next sentence was random
+        ```
+        """
+
+        if "next_sentence_label" in kwargs:  # deprecated keyword; when present it overrides `labels`
+            warnings.warn(
+                "The `next_sentence_label` argument is deprecated and will be removed in a future version, use"
+                " `labels` instead.",
+                FutureWarning,
+            )
+            labels = kwargs.pop("next_sentence_label")
+
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.ernie(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            task_type_ids=task_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        pooled_output = outputs[1]  # pooled representation produced by the base model's pooler
+
+        seq_relationship_scores = self.cls(pooled_output)
+
+        next_sentence_loss = None
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()
+            next_sentence_loss = loss_fct(seq_relationship_scores.view(-1, 2), labels.view(-1))
+
+        if not return_dict:
+            output = (seq_relationship_scores,) + outputs[2:]  # drop the sequence and pooled outputs
+            return ((next_sentence_loss,) + output) if next_sentence_loss is not None else output
+
+        return NextSentencePredictorOutput(
+            loss=next_sentence_loss,
+            logits=seq_relationship_scores,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    Ernie Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
+    output) e.g. for GLUE tasks.
+    """
+)
+class ErnieForSequenceClassification(ErniePreTrainedModel):
+    # Copied from transformers.models.bert.modeling_bert.BertForSequenceClassification.__init__ with Bert->Ernie,bert->ernie
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+        self.config = config
+
+        self.ernie = ErnieModel(config)
+        classifier_dropout = (  # fall back to the hidden dropout rate when no classifier-specific rate is set
+            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
+        )
+        self.dropout = nn.Dropout(classifier_dropout)
+        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        task_type_ids: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple[torch.Tensor], SequenceClassifierOutput]:
+        r"""
+        task_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Task type embedding is a special embedding to represent the characteristic of different tasks, such as
+            word-aware pre-training task, structure-aware pre-training task and semantic-aware pre-training task. We
+            assign a `task_type_id` to each task and the `task_type_id` is in the range `[0,
+            config.task_type_vocab_size-1]`.
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss). If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.ernie(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            task_type_ids=task_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        pooled_output = outputs[1]  # pooled representation produced by the base model's pooler
+
+        pooled_output = self.dropout(pooled_output)
+        logits = self.classifier(pooled_output)
+
+        loss = None
+        if labels is not None:
+            if self.config.problem_type is None:  # inferred on first use and stored on the config
+                if self.num_labels == 1:
+                    self.config.problem_type = "regression"
+                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
+                    self.config.problem_type = "single_label_classification"
+                else:
+                    self.config.problem_type = "multi_label_classification"
+
+            if self.config.problem_type == "regression":
+                loss_fct = MSELoss()
+                if self.num_labels == 1:
+                    loss = loss_fct(logits.squeeze(), labels.squeeze())  # size-1 dims removed on both sides
+                else:
+                    loss = loss_fct(logits, labels)
+            elif self.config.problem_type == "single_label_classification":
+                loss_fct = CrossEntropyLoss()
+                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+            elif self.config.problem_type == "multi_label_classification":
+                loss_fct = BCEWithLogitsLoss()
+                loss = loss_fct(logits, labels)
+        if not return_dict:
+            output = (logits,) + outputs[2:]  # drop the sequence and pooled outputs
+            return ((loss,) + output) if loss is not None else output
+
+        return SequenceClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+@auto_docstring
+class ErnieForMultipleChoice(ErniePreTrainedModel):
+    # Copied from transformers.models.bert.modeling_bert.BertForMultipleChoice.__init__ with Bert->Ernie,bert->ernie
+    def __init__(self, config):
+        super().__init__(config)
+
+        self.ernie = ErnieModel(config)
+        classifier_dropout = (
+            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
+        )
+        self.dropout = nn.Dropout(classifier_dropout)
+        self.classifier = nn.Linear(config.hidden_size, 1)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        task_type_ids: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple[torch.Tensor], MultipleChoiceModelOutput]:
+        r"""
+        input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
+            Indices of input sequence tokens in the vocabulary.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
+            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
+            1]`:
+
+            - 0 corresponds to a *sentence A* token,
+            - 1 corresponds to a *sentence B* token.
+
+            [What are token type IDs?](../glossary#token-type-ids)
+        task_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
+            Task type embedding is a special embedding to represent the characteristic of different tasks, such as
+            word-aware pre-training task, structure-aware pre-training task and semantic-aware pre-training task. We
+            assign a `task_type_id` to each task and the `task_type_id` is in the range `[0,
+            config.task_type_vocab_size-1]`.
+        position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
+            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+            config.max_position_embeddings - 1]`.
+
+            [What are position IDs?](../glossary#position-ids)
+        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
+            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+            model's internal embedding lookup matrix.
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
+            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
+            `input_ids` above)
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
+        # Fold the choice axis into the batch axis: (batch, choices, seq) -> (batch * choices, seq).
+        input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
+        attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
+        token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
+        # Flatten task_type_ids like its siblings so it lines up with them (a 2-D tensor is left unchanged).
+        task_type_ids = task_type_ids.view(-1, task_type_ids.size(-1)) if task_type_ids is not None else None
+        position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
+        inputs_embeds = (
+            inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
+            if inputs_embeds is not None
+            else None
+        )
+
+        outputs = self.ernie(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            task_type_ids=task_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        pooled_output = self.dropout(outputs[1])
+        logits = self.classifier(pooled_output)
+        reshaped_logits = logits.view(-1, num_choices)  # regroup the per-row scores as (batch, num_choices)
+
+        loss = None
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()
+            loss = loss_fct(reshaped_logits, labels)
+
+        if not return_dict:
+            output = (reshaped_logits,) + outputs[2:]
+            return ((loss,) + output) if loss is not None else output
+
+        return MultipleChoiceModelOutput(
+            loss=loss,
+            logits=reshaped_logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+@auto_docstring
+class ErnieForTokenClassification(ErniePreTrainedModel):
+    # Copied from transformers.models.bert.modeling_bert.BertForTokenClassification.__init__ with Bert->Ernie,bert->ernie
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+
+        self.ernie = ErnieModel(config, add_pooling_layer=False)
+        classifier_dropout = (
+            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
+        )
+        self.dropout = nn.Dropout(classifier_dropout)
+        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        task_type_ids: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple[torch.Tensor], TokenClassifierOutput]:
+        r"""
+        task_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Task type embedding is a special embedding to represent the characteristic of different tasks, such as
+            word-aware pre-training task, structure-aware pre-training task and semantic-aware pre-training task. We
+            assign a `task_type_id` to each task and the `task_type_id` is in the range `[0,
+            config.task_type_vocab_size-1]`.
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
+        """
+        if return_dict is None:
+            return_dict = self.config.use_return_dict
+
+        encoder_outputs = self.ernie(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            task_type_ids=task_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        # First encoder output -> dropout -> one score per label for every token.
+        token_states = self.dropout(encoder_outputs[0])
+        logits = self.classifier(token_states)
+
+        loss = None
+        if labels is not None:
+            criterion = CrossEntropyLoss()
+            loss = criterion(logits.view(-1, self.num_labels), labels.view(-1))
+
+        if return_dict:
+            return TokenClassifierOutput(
+                loss=loss,
+                logits=logits,
+                hidden_states=encoder_outputs.hidden_states,
+                attentions=encoder_outputs.attentions,
+            )
+
+        tuple_output = (logits,) + encoder_outputs[2:]
+        return tuple_output if loss is None else (loss,) + tuple_output
+
+
+@auto_docstring
+class ErnieForQuestionAnswering(ErniePreTrainedModel):
+    # Copied from transformers.models.bert.modeling_bert.BertForQuestionAnswering.__init__ with Bert->Ernie,bert->ernie
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+
+        self.ernie = ErnieModel(config, add_pooling_layer=False)
+        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        task_type_ids: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        start_positions: Optional[torch.Tensor] = None,
+        end_positions: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple[torch.Tensor], QuestionAnsweringModelOutput]:
+        r"""
+        task_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Task type embedding is a special embedding to represent the characteristic of different tasks, such as
+            word-aware pre-training task, structure-aware pre-training task and semantic-aware pre-training task. We
+            assign a `task_type_id` to each task and the `task_type_id` is in the range `[0,
+            config.task_type_vocab_size-1]`.
+        """
+        if return_dict is None:
+            return_dict = self.config.use_return_dict
+
+        encoder_outputs = self.ernie(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            task_type_ids=task_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        # Project every token onto the span head, then peel the last axis apart into start / end scores.
+        span_logits = self.qa_outputs(encoder_outputs[0])
+        raw_start, raw_end = span_logits.split(1, dim=-1)
+        start_logits = raw_start.squeeze(-1).contiguous()
+        end_logits = raw_end.squeeze(-1).contiguous()
+
+        total_loss = None
+        if not (start_positions is None or end_positions is None):
+            # Under multi-GPU the targets can arrive with an extra trailing dimension; drop it.
+            if start_positions.dim() > 1:
+                start_positions = start_positions.squeeze(-1)
+            if end_positions.dim() > 1:
+                end_positions = end_positions.squeeze(-1)
+            # Targets past the end of the input are clamped to `seq_len`, the index the loss is told to ignore.
+            seq_len = start_logits.size(1)
+            start_positions = start_positions.clamp(0, seq_len)
+            end_positions = end_positions.clamp(0, seq_len)
+
+            criterion = CrossEntropyLoss(ignore_index=seq_len)
+            start_loss = criterion(start_logits, start_positions)
+            end_loss = criterion(end_logits, end_positions)
+            total_loss = (start_loss + end_loss) / 2
+
+        if return_dict:
+            return QuestionAnsweringModelOutput(
+                loss=total_loss,
+                start_logits=start_logits,
+                end_logits=end_logits,
+                hidden_states=encoder_outputs.hidden_states,
+                attentions=encoder_outputs.attentions,
+            )
+
+        tuple_output = (start_logits, end_logits) + encoder_outputs[2:]
+        return tuple_output if total_loss is None else (total_loss,) + tuple_output
+
+
+__all__ = [  # names exported by `from <this module> import *`
+    "ErnieForCausalLM",
+    "ErnieForMaskedLM",
+    "ErnieForMultipleChoice",
+    "ErnieForNextSentencePrediction",
+    "ErnieForPreTraining",
+    "ErnieForQuestionAnswering",
+    "ErnieForSequenceClassification",
+    "ErnieForTokenClassification",
+    "ErnieModel",
+    "ErniePreTrainedModel",
+]
diff --git a/phivenv/Lib/site-packages/transformers/models/ernie4_5/__init__.py b/phivenv/Lib/site-packages/transformers/models/ernie4_5/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..5d6e69432c9a8ac0471c3dd91660eaf912dc149c
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/ernie4_5/__init__.py
@@ -0,0 +1,27 @@
+# Copyright 2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:  # only true for static type checkers: give them the real symbols via eager imports
+    from .configuration_ernie4_5 import *
+    from .modeling_ernie4_5 import *
+else:  # at runtime: replace this module's `sys.modules` entry with a `_LazyModule`
+    import sys
+
+    _file = globals()["__file__"]  # path of this __init__.py, handed to `define_import_structure` below
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/phivenv/Lib/site-packages/transformers/models/ernie4_5/__pycache__/__init__.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/ernie4_5/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..545323745c458625ea14498207994250c03d5aa2
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/ernie4_5/__pycache__/__init__.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/ernie4_5/__pycache__/configuration_ernie4_5.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/ernie4_5/__pycache__/configuration_ernie4_5.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9dbda2fe81ea8b26fc9a153bf32fbf29b3212868
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/ernie4_5/__pycache__/configuration_ernie4_5.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/ernie4_5/__pycache__/modeling_ernie4_5.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/ernie4_5/__pycache__/modeling_ernie4_5.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..045a59c4462725db493c9cc3c538a646721ec408
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/ernie4_5/__pycache__/modeling_ernie4_5.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/ernie4_5/__pycache__/modular_ernie4_5.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/ernie4_5/__pycache__/modular_ernie4_5.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fe9c91e75165a9d8dcc2c556d59106debdab5b83
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/ernie4_5/__pycache__/modular_ernie4_5.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/ernie4_5/configuration_ernie4_5.py b/phivenv/Lib/site-packages/transformers/models/ernie4_5/configuration_ernie4_5.py
new file mode 100644
index 0000000000000000000000000000000000000000..e6e2795b5daa2bcd2a873997ed8890fa5703b115
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/ernie4_5/configuration_ernie4_5.py
@@ -0,0 +1,202 @@
+# Copyright (c) 2025 Baidu, Inc. and HuggingFace Inc. team. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Ernie 4.5 model configuration"""
+
+from ...configuration_utils import PretrainedConfig
+from ...modeling_rope_utils import rope_config_validation
+
+
+class Ernie4_5Config(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Ernie4_5Model`]. It is used to instantiate an Ernie 4.5
+    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+    defaults will yield a similar configuration to that of the Ernie 4.5 0.3B.
+    e.g. [baidu/ERNIE-4.5-0.3B-PT](https://huggingface.co/baidu/ERNIE-4.5-0.3B-PT)
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 103424):
+            Vocabulary size of the Ernie 4.5 model. Defines the number of different tokens that can be represented by the
+            `input_ids` passed when calling [`Ernie4_5Model`]
+        hidden_size (`int`, *optional*, defaults to 1024):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 3072):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 18):
+            Number of hidden layers in the Transformer decoder.
+        num_attention_heads (`int`, *optional*, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer decoder.
+        num_key_value_heads (`int`, *optional*, defaults to 2):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details, check out [this
+            paper](https://huggingface.co/papers/2305.13245). If it is `None`, it will default to
+            `num_attention_heads`.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the decoder.
+        max_position_embeddings (`int`, *optional*, defaults to 131072):
+            The maximum sequence length that this model might ever be used with.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions.
+        pad_token_id (`int`, *optional*, defaults to 0):
+            Padding token id.
+        bos_token_id (`int`, *optional*, defaults to 1):
+            Beginning of stream token id.
+        eos_token_id (`int`, *optional*, defaults to 2):
+            End of stream token id.
+        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
+            Whether to tie weight embeddings
+        rope_theta (`float`, *optional*, defaults to 500000.0):
+            The base period of the RoPE embeddings.
+        rope_scaling (`Dict`, *optional*):
+            Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
+            and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
+            accordingly.
+            Expected contents:
+                `rope_type` (`str`):
+                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
+                    'llama3'], with 'default' being the original RoPE implementation.
+                `factor` (`float`, *optional*):
+                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
+                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
+                    original maximum pre-trained length.
+                `original_max_position_embeddings` (`int`, *optional*):
+                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
+                    pretraining.
+                `attention_factor` (`float`, *optional*):
+                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
+                    computation. If unspecified, it defaults to value recommended by the implementation, using the
+                    `factor` field to infer the suggested value.
+                `beta_fast` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 32.
+                `beta_slow` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 1.
+                `short_factor` (`list[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `long_factor` (`list[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to long contexts (>
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `low_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
+                `high_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
+        use_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use a bias in any of the projections including mlp and attention for example.
+        head_dim (`int`, *optional*, defaults to 128):
+            The attention head dimension. If None, it will default to hidden_size // num_attention_heads
+
+    ```python
+    >>> from transformers import Ernie4_5Model, Ernie4_5Config
+
+    >>> # Initializing a Ernie4_5 0.3B style configuration
+    >>> configuration = Ernie4_5Config()
+
+    >>> # Initializing a model from the 0.3B style configuration
+    >>> model = Ernie4_5Model(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "ernie4_5"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    # Default tensor parallel plan for base model `Ernie4_5Model`
+    base_model_tp_plan = {
+        "layers.*.self_attn.q_proj": "colwise",
+        "layers.*.self_attn.k_proj": "colwise",
+        "layers.*.self_attn.v_proj": "colwise",
+        "layers.*.self_attn.o_proj": "rowwise",
+        "layers.*.mlp.gate_proj": "colwise",
+        "layers.*.mlp.up_proj": "colwise",
+        "layers.*.mlp.down_proj": "rowwise",
+    }
+    base_model_pp_plan = {
+        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
+        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
+        "norm": (["hidden_states"], ["hidden_states"]),
+    }
+
+    def __init__(
+        self,
+        vocab_size=103424,
+        hidden_size=1024,
+        intermediate_size=3072,
+        num_hidden_layers=18,
+        num_attention_heads=16,
+        num_key_value_heads=2,
+        hidden_act="silu",
+        max_position_embeddings=131072,
+        initializer_range=0.02,
+        rms_norm_eps=1e-05,
+        use_cache=True,
+        pad_token_id=0,
+        bos_token_id=1,
+        eos_token_id=2,
+        tie_word_embeddings=True,
+        rope_theta=500000.0,
+        rope_scaling=None,
+        use_bias=False,
+        head_dim=128,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+
+        # for backward compatibility: `None` means one key/value head per attention head
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self.use_bias = use_bias
+        self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads
+        # Validate the correctness of rotary position embeddings parameters
+        # BC: if there is a legacy 'type' field, copy it to 'rope_type' (this writes into the dict the caller passed).
+        if self.rope_scaling is not None and "type" in self.rope_scaling:
+            self.rope_scaling["rope_type"] = self.rope_scaling["type"]
+        rope_config_validation(self)  # note: invoked before super().__init__() runs
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+
+__all__ = ["Ernie4_5Config"]  # the only name exported by `from <this module> import *`
diff --git a/phivenv/Lib/site-packages/transformers/models/ernie4_5/modeling_ernie4_5.py b/phivenv/Lib/site-packages/transformers/models/ernie4_5/modeling_ernie4_5.py
new file mode 100644
index 0000000000000000000000000000000000000000..13ec6fb3a3b619e2e6315745ae3a6f37038fdbcd
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/ernie4_5/modeling_ernie4_5.py
@@ -0,0 +1,471 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/ernie4_5/modular_ernie4_5.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_ernie4_5.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# Copyright (c) 2025 Baidu, Inc. and HuggingFace Inc. team. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Callable, Optional, Union
+
+import torch
+from torch import nn
+
+from ...activations import ACT2FN
+from ...cache_utils import Cache, DynamicCache
+from ...generation import GenerationMixin
+from ...integrations import use_kernel_forward_from_hub
+from ...masking_utils import create_causal_mask
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
+from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from ...processing_utils import Unpack
+from ...utils import TransformersKwargs, auto_docstring, can_return_tuple
+from ...utils.deprecation import deprecate_kwarg
+from ...utils.generic import check_model_inputs
+from .configuration_ernie4_5 import Ernie4_5Config
+
+
+class Ernie4_5RotaryEmbedding(nn.Module):  # produces the (cos, sin) pair that Ernie4_5Model hands to its decoder layers
+    inv_freq: torch.Tensor  # fix linting for `register_buffer`
+
+    def __init__(self, config: Ernie4_5Config, device=None):
+        super().__init__()
+        # BC: "rope_type" was originally "type"
+        if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict):
+            self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
+        else:
+            self.rope_type = "default"  # no rope_scaling dict on the config: use the "default" initializer
+        self.max_seq_len_cached = config.max_position_embeddings
+        self.original_max_seq_len = config.max_position_embeddings
+
+        self.config = config
+        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]  # initializer looked up by rope type name
+
+        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
+        self.register_buffer("inv_freq", inv_freq, persistent=False)  # persistent=False keeps it out of the state_dict
+        self.original_inv_freq = self.inv_freq  # same tensor object as the buffer registered above
+
+    @torch.no_grad()
+    @dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
+    def forward(self, x, position_ids):  # within this body `x` is only read for its device; returns (cos, sin)
+        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
+        position_ids_expanded = position_ids[:, None, :].float()
+
+        device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
+        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
+            emb = torch.cat((freqs, freqs), dim=-1)  # two copies of freqs side by side on the last dim
+            cos = emb.cos() * self.attention_scaling
+            sin = emb.sin() * self.attention_scaling
+
+        # keeping it in full precision
+        return cos, sin
+
+
+class Ernie4_5MLP(nn.Module):  # gated feed-forward block: down_proj(act_fn(gate_proj(x)) * up_proj(x))
+    def __init__(self, config: Ernie4_5Config):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.intermediate_size = config.intermediate_size
+
+        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.use_bias)
+        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.use_bias)
+        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.use_bias)
+        self.act_fn = ACT2FN[config.hidden_act]  # activation looked up by the name stored in the config
+
+    def forward(self, x):
+        down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))  # gate and up branches both read the same x
+        return down_proj
+
+
+def rotate_half(x):
+    """Rotates half the hidden dims of the input."""
+    x1 = x[..., 0::2]  # even-indexed entries of the last dim
+    x2 = x[..., 1::2]  # odd-indexed entries of the last dim
+    return torch.stack((-x2, x1), dim=-1).flatten(-2)  # interleaved result: (-x[1], x[0], -x[3], x[2], ...)
+
+
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """
+    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
+    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
+    """
+    batch, num_key_value_heads, slen, head_dim = hidden_states.shape  # unpacking requires a 4-D input
+    if n_rep == 1:
+        return hidden_states  # nothing to repeat: the input tensor itself is returned
+    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)  # new axis of size n_rep after the head axis
+    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)  # fold the repeat axis into the head axis
+
+
+def eager_attention_forward(  # plain-PyTorch attention; returns (attn_output, attn_weights)
+    module: nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: Optional[torch.Tensor],
+    scaling: float,
+    dropout: float = 0.0,
+    **kwargs: Unpack[TransformersKwargs],
+):
+    key_states = repeat_kv(key, module.num_key_value_groups)  # `module` must expose num_key_value_groups
+    value_states = repeat_kv(value, module.num_key_value_groups)
+
+    attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
+    if attention_mask is not None:
+        causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]  # trim the mask's last dim to the key length
+        attn_weights = attn_weights + causal_mask  # the mask is added to the scores, not multiplied
+
+    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)  # softmax in float32, then cast back
+    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
+    attn_output = torch.matmul(attn_weights, value_states)
+    attn_output = attn_output.transpose(1, 2).contiguous()  # swap dims 1 and 2 before handing the result back
+
+    return attn_output, attn_weights
+
+
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
+    """Applies Rotary Position Embedding to the query and key tensors.
+
+    Args:
+        q (`torch.Tensor`): The query tensor.
+        k (`torch.Tensor`): The key tensor.
+        cos (`torch.Tensor`): The cosine part of the rotary embedding.
+        sin (`torch.Tensor`): The sine part of the rotary embedding.
+        position_ids (`torch.Tensor`, *optional*):
+            Deprecated and unused.
+        unsqueeze_dim (`int`, *optional*, defaults to 1):
+            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
+            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
+            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
+            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
+            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
+            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
+    Returns:
+        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
+    """
+    # glm rope style (with full dim) and full precision
+    original_dtype = q.dtype  # both outputs (k included) are cast back to q's dtype
+
+    cos = cos.unsqueeze(unsqueeze_dim)
+    sin = sin.unsqueeze(unsqueeze_dim)
+
+    # Interleave them instead of usual shape
+    cos = cos[..., : cos.shape[-1] // 2].repeat_interleave(2, dim=-1)  # keep the first half, repeat each entry twice
+    sin = sin[..., : sin.shape[-1] // 2].repeat_interleave(2, dim=-1)
+
+    q_embed = (q.float() * cos) + (rotate_half(q).float() * sin)  # arithmetic is done on float32 copies
+    k_embed = (k.float() * cos) + (rotate_half(k).float() * sin)
+
+    return q_embed.to(original_dtype), k_embed.to(original_dtype)
+
+
+class Ernie4_5Attention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+    def __init__(self, config: Ernie4_5Config, layer_idx: int):
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx  # forwarded to past_key_values.update() in forward
+        self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
+        self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads  # repeat factor read by eager_attention_forward
+        self.scaling = self.head_dim**-0.5  # 1 / sqrt(head_dim)
+
+        self.attention_dropout = 0.0  # fixed at 0.0, so forward always passes dropout=0.0
+        self.is_causal = True
+
+        self.q_proj = nn.Linear(config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.use_bias)
+        self.k_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.use_bias)
+        self.v_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.use_bias)
+        self.o_proj = nn.Linear(config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.use_bias)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
+        attention_mask: Optional[torch.Tensor],
+        past_key_values: Optional[Cache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        input_shape = hidden_states.shape[:-1]  # every dim except the last one
+        hidden_shape = (*input_shape, -1, self.head_dim)  # -1 lets view() infer the number of heads
+
+        query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+        key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+        value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+
+        cos, sin = position_embeddings
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)  # values are not rotated
+
+        if past_key_values is not None:
+            # sin and cos are specific to RoPE models; cache_position needed for the static cache
+            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        attention_interface: Callable = eager_attention_forward  # default; replaced below for non-eager implementations
+        if self.config._attn_implementation != "eager":
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            dropout=0.0 if not self.training else self.attention_dropout,
+            scaling=self.scaling,
+            **kwargs,
+        )
+
+        attn_output = attn_output.reshape(*input_shape, -1).contiguous()  # merge the trailing per-head dims back into one
+        attn_output = self.o_proj(attn_output)
+        return attn_output, attn_weights
+
+
+@use_kernel_forward_from_hub("RMSNorm")
+class Ernie4_5RMSNorm(nn.Module):
+    def __init__(self, hidden_size, eps=1e-6):
+        """
+        Ernie4_5RMSNorm is equivalent to T5LayerNorm
+        """
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))  # learned per-feature scale, initialised to ones
+        self.variance_epsilon = eps  # added under the rsqrt in forward
+
+    def forward(self, hidden_states):
+        input_dtype = hidden_states.dtype
+        hidden_states = hidden_states.to(torch.float32)  # statistics are computed in float32
+        variance = hidden_states.pow(2).mean(-1, keepdim=True)  # mean of squares over the last dim; no mean subtraction
+        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+        return self.weight * hidden_states.to(input_dtype)  # cast back to the input dtype, then apply the scale
+
+    def extra_repr(self):
+        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
+
+
+class Ernie4_5DecoderLayer(GradientCheckpointingLayer):  # norm -> attention -> residual add, then norm -> MLP -> residual add
+    def __init__(self, config: Ernie4_5Config, layer_idx: int):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+
+        self.self_attn = Ernie4_5Attention(config=config, layer_idx=layer_idx)
+
+        self.mlp = Ernie4_5MLP(config)
+        self.input_layernorm = Ernie4_5RMSNorm(config.hidden_size, eps=config.rms_norm_eps)  # applied before attention
+        self.post_attention_layernorm = Ernie4_5RMSNorm(config.hidden_size, eps=config.rms_norm_eps)  # applied before the MLP
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        use_cache: Optional[bool] = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> torch.Tensor:
+        residual = hidden_states  # un-normalised input, added back after attention
+        hidden_states = self.input_layernorm(hidden_states)
+        # Self Attention
+        hidden_states, _ = self.self_attn(  # the attention weights (second return value) are discarded here
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            position_embeddings=position_embeddings,
+            **kwargs,
+        )
+        hidden_states = residual + hidden_states
+
+        # Fully Connected
+        residual = hidden_states  # attention output, added back after the MLP
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states
+        return hidden_states
+
+
+@auto_docstring
+class Ernie4_5PreTrainedModel(PreTrainedModel):  # shared base class: declares class-level settings only, no methods
+    config: Ernie4_5Config
+    base_model_prefix = "model"  # same name as the `self.model` attribute of Ernie4_5ForCausalLM
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["Ernie4_5DecoderLayer"]
+    _skip_keys_device_placement = ["past_key_values"]
+    _supports_flash_attn = True
+    _supports_sdpa = True
+    _supports_flex_attn = True
+
+    _can_compile_fullgraph = True
+    _supports_attention_backend = True
+    _can_record_outputs = {  # NOTE(review): presumably read by `check_model_inputs` to capture these outputs — confirm
+        "hidden_states": Ernie4_5DecoderLayer,
+        "attentions": Ernie4_5Attention,
+    }
+
+
+@auto_docstring
+class Ernie4_5Model(Ernie4_5PreTrainedModel):  # token embedding -> stack of decoder layers -> final RMSNorm
+    def __init__(self, config: Ernie4_5Config):
+        super().__init__(config)
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)  # third positional arg is padding_idx
+        self.layers = nn.ModuleList(
+            [Ernie4_5DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+        )
+        self.norm = Ernie4_5RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.rotary_emb = Ernie4_5RotaryEmbedding(config=config)  # one instance shared by all layers
+        self.gradient_checkpointing = False
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @check_model_inputs
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> BaseModelOutputWithPast:
+        if (input_ids is None) ^ (inputs_embeds is not None):  # true when both or neither are given
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+        if inputs_embeds is None:
+            inputs_embeds: torch.Tensor = self.embed_tokens(input_ids)
+
+        if use_cache and past_key_values is None:
+            past_key_values = DynamicCache(config=self.config)  # create a cache only when caching is requested
+
+        if cache_position is None:
+            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+            cache_position: torch.Tensor = torch.arange(  # positions continue after what the cache already holds
+                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
+            )
+
+        if position_ids is None:
+            position_ids = cache_position.unsqueeze(0)  # add a leading dim of size 1
+
+        causal_mask = create_causal_mask(
+            config=self.config,
+            input_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            cache_position=cache_position,
+            past_key_values=past_key_values,
+            position_ids=position_ids,
+        )
+
+        hidden_states = inputs_embeds
+        position_embeddings = self.rotary_emb(hidden_states, position_ids)  # computed once, reused by every layer below
+
+        for decoder_layer in self.layers[: self.config.num_hidden_layers]:
+            hidden_states = decoder_layer(
+                hidden_states,
+                attention_mask=causal_mask,
+                position_ids=position_ids,
+                past_key_values=past_key_values,
+                cache_position=cache_position,
+                position_embeddings=position_embeddings,
+                **kwargs,
+            )
+
+        hidden_states = self.norm(hidden_states)  # final normalisation after the last layer
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values,
+        )
+
+
+@auto_docstring
+class Ernie4_5ForCausalLM(Ernie4_5PreTrainedModel, GenerationMixin):  # Ernie4_5Model plus a linear head over the vocabulary
+    _tied_weights_keys = ["lm_head.weight"]
+    _tp_plan = {"lm_head": "colwise_rep"}
+    _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.model = Ernie4_5Model(config)
+        self.vocab_size = config.vocab_size
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)  # maps hidden states to vocabulary logits
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> CausalLMOutputWithPast:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+        """
+        outputs: BaseModelOutputWithPast = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            **kwargs,
+        )
+
+        hidden_states = outputs.last_hidden_state
+        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
+        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep  # int 0 -> slice(0, None): keep all positions
+        logits = self.lm_head(hidden_states[:, slice_indices, :])
+
+        loss = None
+        if labels is not None:  # the loss is only computed when labels are supplied
+            loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs)
+
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+__all__ = ["Ernie4_5ForCausalLM", "Ernie4_5Model", "Ernie4_5PreTrainedModel"]  # names exported by a star-import of this module
diff --git a/phivenv/Lib/site-packages/transformers/models/ernie4_5/modular_ernie4_5.py b/phivenv/Lib/site-packages/transformers/models/ernie4_5/modular_ernie4_5.py
new file mode 100644
index 0000000000000000000000000000000000000000..7cec0232ca6842e72b453f53560ff5da017f794d
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/ernie4_5/modular_ernie4_5.py
@@ -0,0 +1,123 @@
+# Copyright (c) 2025 Baidu, Inc. and HuggingFace Inc. team. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch Ernie 4.5 model"""
+
+import torch
+from torch import nn
+
+from ...modeling_rope_utils import dynamic_rope_update
+from ...utils import auto_docstring, can_return_tuple
+from ..glm.modeling_glm import rotate_half
+from ..llama.modeling_llama import (
+    LlamaAttention,
+    LlamaForCausalLM,
+    LlamaMLP,
+    LlamaRotaryEmbedding,
+)
+from .configuration_ernie4_5 import Ernie4_5Config
+
+
+class Ernie4_5RotaryEmbedding(LlamaRotaryEmbedding):  # only forward is redefined here; __init__ comes from the Llama class
+    @torch.no_grad()
+    @dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
+    def forward(self, x, position_ids):  # within this body `x` is only read for its device; returns (cos, sin)
+        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
+        position_ids_expanded = position_ids[:, None, :].float()
+
+        device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
+        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
+            emb = torch.cat((freqs, freqs), dim=-1)  # two copies of freqs side by side on the last dim
+            cos = emb.cos() * self.attention_scaling
+            sin = emb.sin() * self.attention_scaling
+
+        # keeping it in full precision
+        return cos, sin
+
+
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
+    """Applies Rotary Position Embedding to the query and key tensors.
+
+    Args:
+        q (`torch.Tensor`): The query tensor.
+        k (`torch.Tensor`): The key tensor.
+        cos (`torch.Tensor`): The cosine part of the rotary embedding.
+        sin (`torch.Tensor`): The sine part of the rotary embedding.
+        position_ids (`torch.Tensor`, *optional*):
+            Deprecated and unused.
+        unsqueeze_dim (`int`, *optional*, defaults to 1):
+            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
+            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
+            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
+            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
+            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
+            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
+    Returns:
+        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
+    """
+    # glm rope style (with full dim) and full precision
+    original_dtype = q.dtype  # both outputs (k included) are cast back to q's dtype
+
+    cos = cos.unsqueeze(unsqueeze_dim)
+    sin = sin.unsqueeze(unsqueeze_dim)
+
+    # Interleave them instead of usual shape
+    cos = cos[..., : cos.shape[-1] // 2].repeat_interleave(2, dim=-1)  # keep the first half, repeat each entry twice
+    sin = sin[..., : sin.shape[-1] // 2].repeat_interleave(2, dim=-1)
+
+    q_embed = (q.float() * cos) + (rotate_half(q).float() * sin)  # arithmetic is done on float32 copies
+    k_embed = (k.float() * cos) + (rotate_half(k).float() * sin)
+
+    return q_embed.to(original_dtype), k_embed.to(original_dtype)
+
+
+class Ernie4_5MLP(LlamaMLP):  # LlamaMLP with the three projections re-created so their bias follows config.use_bias
+    def __init__(self, config: Ernie4_5Config):
+        super().__init__(config)
+
+        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.use_bias)
+        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.use_bias)
+        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.use_bias)
+
+
+class Ernie4_5Attention(LlamaAttention):  # LlamaAttention with dropout pinned to 0.0 and projection bias from config.use_bias
+    def __init__(self, config: Ernie4_5Config, layer_idx: int):
+        super().__init__(config, layer_idx)
+
+        self.attention_dropout = 0.0  # assigned after the parent constructor, so it replaces any value set there
+
+        self.q_proj = nn.Linear(config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.use_bias)
+        self.k_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.use_bias)
+        self.v_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.use_bias)
+        self.o_proj = nn.Linear(config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.use_bias)
+
+
+class Ernie4_5ForCausalLM(LlamaForCausalLM):
+    @can_return_tuple
+    @auto_docstring
+    def forward(self, **super_kwargs):  # thin override: every keyword is forwarded unchanged to LlamaForCausalLM.forward
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+        """
+        return super().forward(**super_kwargs)  # hand back the parent's output; the bare call made this method return None
+
+
+__all__ = [  # the two F822-suppressed names below are not defined in this modular file
+    "Ernie4_5ForCausalLM",
+    "Ernie4_5Model",  # noqa: F822
+    "Ernie4_5PreTrainedModel",  # noqa: F822
+]
diff --git a/phivenv/Lib/site-packages/transformers/models/ernie4_5_moe/__init__.py b/phivenv/Lib/site-packages/transformers/models/ernie4_5_moe/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..eb30318fa25ec560d549b08421ea08f02b0dce6f
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/ernie4_5_moe/__init__.py
@@ -0,0 +1,27 @@
+# Copyright 2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:  # this branch is for static type checkers; TYPE_CHECKING is False at runtime
+    from .configuration_ernie4_5_moe import *
+    from .modeling_ernie4_5_moe import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)  # replace this module's sys.modules entry with a _LazyModule
diff --git a/phivenv/Lib/site-packages/transformers/models/ernie4_5_moe/__pycache__/__init__.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/ernie4_5_moe/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..243d7ad315122fe98791fa31b960d0e29b0c6c56
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/ernie4_5_moe/__pycache__/__init__.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/ernie4_5_moe/__pycache__/configuration_ernie4_5_moe.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/ernie4_5_moe/__pycache__/configuration_ernie4_5_moe.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..980598cdf70f33c94b784d0c502de77c124d1028
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/ernie4_5_moe/__pycache__/configuration_ernie4_5_moe.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/ernie4_5_moe/__pycache__/modeling_ernie4_5_moe.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/ernie4_5_moe/__pycache__/modeling_ernie4_5_moe.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..42e3801f2e250185075425e21d53939f2cdc17cf
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/ernie4_5_moe/__pycache__/modeling_ernie4_5_moe.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/ernie4_5_moe/__pycache__/modular_ernie4_5_moe.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/ernie4_5_moe/__pycache__/modular_ernie4_5_moe.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fbdd8dc6a718a038f560798e9d782347d8380529
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/ernie4_5_moe/__pycache__/modular_ernie4_5_moe.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/ernie4_5_moe/configuration_ernie4_5_moe.py b/phivenv/Lib/site-packages/transformers/models/ernie4_5_moe/configuration_ernie4_5_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..294ccfc638cf9683f15f94c3d2faee2a5f19cd44
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/ernie4_5_moe/configuration_ernie4_5_moe.py
@@ -0,0 +1,254 @@
+# Copyright (c) 2025 Baidu, Inc. and HuggingFace Inc. team. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Ernie 4.5 MoE model configuration"""
+
+from ...configuration_utils import PretrainedConfig
+from ...modeling_rope_utils import rope_config_validation
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)  # module-level logger keyed by this module's import name
+
+
+class Ernie4_5_MoeConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Ernie4_5_MoeModel`]. It is used to instantiate an
+    Ernie 4.5 MoE model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of [baidu/ERNIE-4.5-21B-A3B-PT](https://huggingface.co/baidu/ERNIE-4.5-21B-A3B-PT).
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 103424):
+            Vocabulary size of the Ernie 4.5 MoE model. Defines the number of different tokens that can be represented by the
+            `input_ids` passed when calling [`Ernie4_5_MoeModel`]
+        pad_token_id (`int`, *optional*, defaults to 0):
+            Padding token id.
+        bos_token_id (`int`, *optional*, defaults to 1):
+            Beginning of stream token id.
+        eos_token_id (`int`, *optional*, defaults to 2):
+            End of stream token id.
+        hidden_size (`int`, *optional*, defaults to 2560):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 12288):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 28):
+            Number of hidden layers in the Transformer decoder.
+        num_attention_heads (`int`, *optional*, defaults to 20):
+            Number of attention heads for each attention layer in the Transformer decoder.
+        num_key_value_heads (`int`, *optional*, defaults to 4):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details, check out [this
+            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to `4`.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the decoder.
+        max_position_embeddings (`int`, *optional*, defaults to 131072):
+            The maximum sequence length that this model might ever be used with.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
+            Whether the model's input and output word embeddings should be tied.
+        rope_theta (`float`, *optional*, defaults to 500000.0):
+            The base period of the RoPE embeddings.
+        rope_scaling (`Dict`, *optional*):
+            Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
+            and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
+            accordingly.
+            Expected contents:
+                `rope_type` (`str`):
+                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
+                    'llama3'], with 'default' being the original RoPE implementation.
+                `factor` (`float`, *optional*):
+                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
+                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
+                    original maximum pre-trained length.
+                `original_max_position_embeddings` (`int`, *optional*):
+                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
+                    pretraining.
+                `attention_factor` (`float`, *optional*):
+                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
+                    computation. If unspecified, it defaults to value recommended by the implementation, using the
+                    `factor` field to infer the suggested value.
+                `beta_fast` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 32.
+                `beta_slow` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 1.
+                `short_factor` (`list[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2.
+                `long_factor` (`list[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to long contexts (>
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2.
+                `low_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
+                `high_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
+        use_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use a bias in any of the projections including mlp and attention for example.
+        moe_intermediate_size (`int`, *optional*, defaults to 1536):
+            Intermediate size of the routed expert.
+        moe_k (`int`, *optional*, defaults to 6):
+            Number of selected experts.
+        moe_num_experts (`int`, *optional*, defaults to 64):
+            Number of routed experts.
+        moe_num_shared_experts (`int`, *optional*, defaults to 2):
+            The number of experts that are shared for all MoE forwards.
+        moe_layer_start_index (`int`, *optional*, defaults to 1):
+            The first index at which MoE layers start to appear.
+        moe_layer_end_index (`int`, *optional*, defaults to -1):
+            The last possible index for a MoE layer; `-1` is stored as `num_hidden_layers - 1`.
+        moe_layer_interval (`int`, *optional*, defaults to 1):
+            The interval at which MoE layers appear.
+        moe_norm_min (`float`, *optional*, defaults to 1e-12):
+            Minimum division value during routing normalization.
+        output_router_logits (`bool`, *optional*, defaults to `False`):
+            Whether or not the router logits should be returned by the model. Enabling this will also
+            allow the model to output the auxiliary loss, including load balancing loss and router z-loss.
+        router_aux_loss_coef (`float`, *optional*, defaults to 0.001):
+            The aux loss factor for the total loss.
+
+    ```python
+    >>> from transformers import Ernie4_5_MoeModel, Ernie4_5_MoeConfig
+
+    >>> # Initializing an Ernie4_5_Moe style configuration
+    >>> configuration = Ernie4_5_MoeConfig()
+
+    >>> # Initializing a model from the ERNIE-4.5-21B-A3B style configuration
+    >>> model = Ernie4_5_MoeModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "ernie4_5_moe"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    attribute_map = {"num_experts": "moe_num_experts", "num_experts_per_tok": "moe_k"}
+
+    # Default tensor parallel plan for base model `Ernie4_5_MoE`
+    base_model_tp_plan = {
+        "layers.*.self_attn.q_proj": "colwise",
+        "layers.*.self_attn.k_proj": "colwise",
+        "layers.*.self_attn.v_proj": "colwise",
+        "layers.*.self_attn.o_proj": "rowwise",
+        # sequence parallel is pretty slow
+        # "norm.weight": "sequence_parallel",
+        # "layers.*.input_layernorm.weight": "sequence_parallel",
+        # "layers.*.post_attention_layernorm.weight": "sequence_parallel",
+        "layers.*.mlp.shared_experts.gate_proj": "local_colwise",
+        "layers.*.mlp.shared_experts.up_proj": "local_colwise",
+        "layers.*.mlp.shared_experts.down_proj": "local_rowwise",
+        "layers.*.mlp.experts.*.gate_proj": "local_colwise",
+        "layers.*.mlp.experts.*.up_proj": "local_colwise",
+        "layers.*.mlp.experts.*.down_proj": "local_rowwise",
+        "layers.*.mlp.experts": "local",
+        "layers.*.mlp.gate_proj": "local_colwise",
+        "layers.*.mlp.up_proj": "local_colwise",
+        "layers.*.mlp.down_proj": "local_rowwise",
+        "layers.*.mlp": "gather",
+    }
+    base_model_pp_plan = {
+        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
+        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
+        "norm": (["hidden_states"], ["hidden_states"]),
+    }
+
+    def __init__(
+        self,
+        vocab_size=103424,
+        pad_token_id=0,
+        bos_token_id=1,
+        eos_token_id=2,
+        hidden_size=2560,
+        intermediate_size=12288,
+        num_hidden_layers=28,
+        num_attention_heads=20,
+        num_key_value_heads=4,
+        hidden_act="silu",
+        max_position_embeddings=131072,
+        initializer_range=0.02,
+        rms_norm_eps=1e-5,
+        use_cache=True,
+        tie_word_embeddings=True,
+        rope_theta=500000.0,
+        rope_scaling=None,
+        use_bias=False,
+        moe_intermediate_size=1536,
+        moe_k=6,
+        moe_num_experts=64,
+        moe_num_shared_experts=2,
+        moe_layer_start_index=1,
+        moe_layer_end_index=-1,
+        moe_layer_interval=1,
+        moe_norm_min=1e-12,
+        output_router_logits=False,
+        router_aux_loss_coef=0.001,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.max_position_embeddings = max_position_embeddings
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.use_bias = use_bias
+
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        # Validate the correctness of rotary position embeddings parameters
+        # BC: if there is a 'type' field, copy it to 'rope_type' (the dict passed by the caller is updated in place).
+        if self.rope_scaling is not None and "type" in self.rope_scaling:
+            self.rope_scaling["rope_type"] = self.rope_scaling["type"]
+        rope_config_validation(self)
+
+        # MoE arguments
+        self.moe_intermediate_size = moe_intermediate_size
+        self.moe_k = moe_k
+        self.moe_num_experts = moe_num_experts
+        self.moe_num_shared_experts = moe_num_shared_experts
+        self.moe_layer_start_index = moe_layer_start_index
+        self.moe_layer_end_index = self.num_hidden_layers - 1 if moe_layer_end_index == -1 else moe_layer_end_index  # -1 = last layer
+        self.moe_layer_interval = moe_layer_interval
+        self.moe_norm_min = moe_norm_min
+        self.output_router_logits = output_router_logits
+        self.router_aux_loss_coef = router_aux_loss_coef
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+
+__all__ = ["Ernie4_5_MoeConfig"]  # public names of this module: the config class only
diff --git a/phivenv/Lib/site-packages/transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py b/phivenv/Lib/site-packages/transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..36f1f25c32be12663f8c386be5a30527b4f40f02
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py
@@ -0,0 +1,749 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_ernie4_5_moe.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# Copyright (c) 2025 Baidu, Inc. and HuggingFace Inc. team. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Callable, Optional, Union
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from ...activations import ACT2FN
+from ...cache_utils import Cache, DynamicCache
+from ...generation import GenerationMixin
+from ...integrations import use_kernel_forward_from_hub
+from ...masking_utils import create_causal_mask
+from ...modeling_flash_attention_utils import FlashAttentionKwargs
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import MoeCausalLMOutputWithPast, MoeModelOutputWithPast
+from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from ...processing_utils import Unpack
+from ...utils import TransformersKwargs, auto_docstring, can_return_tuple
+from ...utils.deprecation import deprecate_kwarg
+from ...utils.generic import OutputRecorder, check_model_inputs
+from .configuration_ernie4_5_moe import Ernie4_5_MoeConfig
+
+
+@use_kernel_forward_from_hub("RMSNorm")
+class Ernie4_5_MoeRMSNorm(nn.Module):
+    def __init__(self, hidden_size, eps=1e-6):
+        """
+        Ernie4_5_MoeRMSNorm is equivalent to T5LayerNorm: root-mean-square scaling with a learnable weight and no bias.
+        """
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))  # learnable scale, initialised to ones
+        self.variance_epsilon = eps  # added to the mean square before the reciprocal square root
+
+    def forward(self, hidden_states):
+        input_dtype = hidden_states.dtype  # remembered so the result can be cast back
+        hidden_states = hidden_states.to(torch.float32)  # statistics are computed in float32
+        variance = hidden_states.pow(2).mean(-1, keepdim=True)  # mean of squares over the last dim (no mean subtraction)
+        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+        return self.weight * hidden_states.to(input_dtype)  # cast back first, then apply the scale
+
+    def extra_repr(self):
+        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
+
+
+class Ernie4_5_MoeMLP(nn.Module):
+    def __init__(self, config, intermediate_size=None):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.intermediate_size = intermediate_size if intermediate_size is not None else config.intermediate_size  # explicit value wins
+
+        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.use_bias)
+        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.use_bias)
+        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.use_bias)
+        self.act_fn = ACT2FN[config.hidden_act]  # activation looked up by its config name
+
+    def forward(self, x):
+        down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))  # gated MLP: act(gate(x)) * up(x), then down
+        return down_proj
+
+
+class Ernie4_5_MoeRotaryEmbedding(nn.Module):
+    inv_freq: torch.Tensor  # fix linting for `register_buffer`
+
+    def __init__(self, config: Ernie4_5_MoeConfig, device=None):
+        super().__init__()
+        # BC: "rope_type" was originally "type"
+        if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict):
+            self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
+        else:
+            self.rope_type = "default"  # no scaling dict configured
+        self.max_seq_len_cached = config.max_position_embeddings
+        self.original_max_seq_len = config.max_position_embeddings
+
+        self.config = config
+        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]  # raises KeyError for an unknown rope type
+
+        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
+        self.register_buffer("inv_freq", inv_freq, persistent=False)  # non-persistent: not saved in the state dict
+        self.original_inv_freq = self.inv_freq  # presumably used by `dynamic_rope_update` to restore the buffer - TODO confirm
+
+    @torch.no_grad()
+    @dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
+    def forward(self, x, position_ids):
+        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
+        position_ids_expanded = position_ids[:, None, :].float()  # assumes position_ids is (batch, seq_len) - TODO confirm
+
+        device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"  # "mps" -> "cpu"
+        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
+            emb = torch.cat((freqs, freqs), dim=-1)  # the frequencies are duplicated along the last dim
+            cos = emb.cos() * self.attention_scaling
+            sin = emb.sin() * self.attention_scaling
+
+        # keeping it in full precision
+        return cos, sin
+
+
+def rotate_half(x):
+    """Rotates interleaved pairs of the last dim: each (even, odd) pair (x1, x2) becomes (-x2, x1)."""
+    x1 = x[..., 0::2]  # even-indexed entries
+    x2 = x[..., 1::2]  # odd-indexed entries
+    return torch.stack((-x2, x1), dim=-1).flatten(-2)  # re-interleave into the original last-dim layout
+
+
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
+    """Applies Rotary Position Embedding to the query and key tensors.
+
+    Args:
+        q (`torch.Tensor`): The query tensor.
+        k (`torch.Tensor`): The key tensor.
+        cos (`torch.Tensor`): The cosine part of the rotary embedding.
+        sin (`torch.Tensor`): The sine part of the rotary embedding.
+        position_ids (`torch.Tensor`, *optional*):
+            Deprecated and unused.
+        unsqueeze_dim (`int`, *optional*, defaults to 1):
+            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
+            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
+            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
+            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
+            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
+            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
+    Returns:
+        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
+    """
+    # glm rope style (with full dim) and full precision
+    original_dtype = q.dtype  # both outputs are cast back to the dtype of `q`
+
+    cos = cos.unsqueeze(unsqueeze_dim)
+    sin = sin.unsqueeze(unsqueeze_dim)
+
+    # Keep the first half of the last dim and repeat every entry twice, matching the interleaved pairs of `rotate_half`
+    cos = cos[..., : cos.shape[-1] // 2].repeat_interleave(2, dim=-1)
+    sin = sin[..., : sin.shape[-1] // 2].repeat_interleave(2, dim=-1)
+
+    q_embed = (q.float() * cos) + (rotate_half(q).float() * sin)
+    k_embed = (k.float() * cos) + (rotate_half(k).float() * sin)
+
+    return q_embed.to(original_dtype), k_embed.to(original_dtype)
+
+
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """
+    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
+    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
+    """
+    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+    if n_rep == 1:
+        return hidden_states  # nothing to repeat: the input tensor itself is returned
+    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
+    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)  # fold the repeat axis into the head axis
+
+
+def eager_attention_forward(
+    module: nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: Optional[torch.Tensor],
+    scaling: float,
+    dropout: float = 0.0,
+    **kwargs: Unpack[TransformersKwargs],
+):
+    key_states = repeat_kv(key, module.num_key_value_groups)  # expand the KV heads to match the query heads
+    value_states = repeat_kv(value, module.num_key_value_groups)
+
+    attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling  # scaled dot-product scores
+    if attention_mask is not None:
+        causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]  # crop the mask to the key length
+        attn_weights = attn_weights + causal_mask  # the mask is added to the scores
+
+    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)  # softmax in float32
+    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
+    attn_output = torch.matmul(attn_weights, value_states)
+    attn_output = attn_output.transpose(1, 2).contiguous()  # swap dims 1 and 2 back for the caller's reshape
+
+    return attn_output, attn_weights
+
+
+class Ernie4_5_MoeAttention(nn.Module):
+    """Multi-headed attention with grouped key/value heads and rotary position embeddings applied to queries and keys"""
+
+    def __init__(self, config: Ernie4_5_MoeConfig, layer_idx: int):
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx  # passed to the cache update in `forward`
+        self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
+        self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads  # query heads per KV head
+        self.scaling = self.head_dim**-0.5
+
+        self.attention_dropout = 0.0  # hard-coded: this module never applies attention dropout
+        self.is_causal = True
+
+        self.q_proj = nn.Linear(config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.use_bias)
+        self.k_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.use_bias)
+        self.v_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.use_bias)
+        self.o_proj = nn.Linear(config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.use_bias)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
+        attention_mask: Optional[torch.Tensor],
+        past_key_values: Optional[Cache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        input_shape = hidden_states.shape[:-1]  # every dim except the hidden one
+        hidden_shape = (*input_shape, -1, self.head_dim)  # the head count is inferred per projection
+
+        query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+        key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+        value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+
+        cos, sin = position_embeddings
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+
+        if past_key_values is not None:
+            # sin and cos are specific to RoPE models; cache_position needed for the static cache
+            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        attention_interface: Callable = eager_attention_forward  # default; replaced below for any other implementation
+        if self.config._attn_implementation != "eager":
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            dropout=0.0 if not self.training else self.attention_dropout,
+            scaling=self.scaling,
+            **kwargs,
+        )
+
+        attn_output = attn_output.reshape(*input_shape, -1).contiguous()  # merge the heads back into one hidden dim
+        attn_output = self.o_proj(attn_output)
+        return attn_output, attn_weights
+
+
+class Ernie4_5_MoeStatics(nn.Module):
+    """
+    Stores MoE (Mixture of Experts) statistics
+        - Bias for the gating
+        - Additionally, usage per expert in the original codebase
+    """
+
+    def __init__(self, config):
+        super().__init__()
+
+        num_experts_groups = 1  # fixed to a single group here
+        num_experts = config.moe_num_experts
+
+        self.e_score_correction_bias = nn.Parameter(
+            torch.zeros(num_experts_groups, num_experts, dtype=torch.float32),
+            requires_grad=False,  # stored as a parameter but excluded from gradient updates
+        )
+
+    def forward(self, hidden_states):
+        # NOTE: This is a workaround to enable TP with a module that only has parameters
+        #
+        # Otherwise, it stays as `DTensor` when called in the "super" forward
+        #   1. All other tensors are local (`torch.Tensor`)
+        #   2. Isolate does not work on `nn.Module` which only has parameters
+        return hidden_states + self.e_score_correction_bias.squeeze()  # `squeeze` drops the size-1 group dim before adding
+
+
+class Ernie4_5_MoeSparseMoeBlock(nn.Module):
+    """
+    This implementation is
+    strictly equivalent to standard MoE with full capacity (no
+    dropped tokens). It's faster since it formulates MoE operations
+    in terms of block-sparse operations to accommodate imbalanced
+    assignments of tokens to experts, whereas standard MoE either
+    (1) drop tokens at the cost of reduced performance or (2) set
+    capacity factor to number of experts and thus waste computation
+    and memory on padding.
+
+    Ernie 4.5 MoE's original formula is based on case (2) with
+    (optional) shared experts and a corrections bias during gating.
+    """
+
+    def __init__(self, config):
+        super().__init__()
+        self.num_experts = config.moe_num_experts
+        self.top_k = config.moe_k
+
+        # correction bias (yes it seems to be a typo with statics <> statistics)
+        self.moe_statics = Ernie4_5_MoeStatics(config)
+
+        # gating
+        self.gate = nn.Linear(config.hidden_size, config.moe_num_experts, bias=False, dtype=torch.float32)
+        self.experts = nn.ModuleList(
+            [Ernie4_5_MoeMLP(config, config.moe_intermediate_size) for _ in range(config.moe_num_experts)]
+        )
+        self.norm_min = config.moe_norm_min  # lower bound for the normalisation denominator in `forward`
+
+        # (optional) shared experts for all forwards
+        self.shared_experts = None
+        if config.moe_num_shared_experts > 0:
+            self.shared_experts = Ernie4_5_MoeMLP(config, config.moe_intermediate_size * config.moe_num_shared_experts)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        batch_size, sequence_length, hidden_dim = hidden_states.shape
+        hidden_states = hidden_states.view(-1, hidden_dim)  # flatten to (batch * sequence_length, hidden_dim)
+
+        # (Optional) shared experts
+        if self.shared_experts is not None:
+            shared_output = self.shared_experts(hidden_states)
+
+        device_type = (
+            hidden_states.device.type
+            if isinstance(hidden_states.device.type, str) and hidden_states.device.type != "mps"
+            else "cpu"
+        )
+        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+            # router_logits: (batch * sequence_length, n_experts)
+            router_logits = self.gate(hidden_states.float())
+
+            routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)
+            _, selected_experts = torch.topk(self.moe_statics(routing_weights), self.top_k, dim=-1)  # bias affects selection only
+            routing_weights = torch.gather(routing_weights, dim=-1, index=selected_experts)  # weights from the unbiased softmax
+            routing_weights = routing_weights / torch.clamp(
+                routing_weights.sum(dim=-1, keepdim=True), min=self.norm_min
+            )
+            routing_weights = routing_weights.to(hidden_states.dtype)
+
+        final_hidden_states = torch.zeros(
+            (batch_size * sequence_length, hidden_dim), dtype=hidden_states.dtype, device=hidden_states.device
+        )
+
+        # One hot encode the selected experts to create an expert mask
+        # this will be used to easily index which expert is going to be solicited
+        expert_mask = torch.nn.functional.one_hot(selected_experts, num_classes=self.num_experts).permute(2, 1, 0)
+
+        # Loop over the experts that were selected by at least one token and run each of them on its tokens
+        expert_hit = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero()
+        for expert_idx in expert_hit:
+            expert_layer = self.experts[expert_idx]
+            idx, top_x = torch.where(expert_mask[expert_idx].squeeze(0))  # idx: top-k slot, top_x: flattened token row
+
+            # Index the correct hidden states and compute the expert hidden state for
+            # the current expert. We need to make sure to multiply the output hidden
+            # states by `routing_weights` on the corresponding tokens (for the matching top-k slot)
+            current_state = hidden_states[None, top_x].reshape(-1, hidden_dim)
+            current_hidden_states = expert_layer(current_state) * routing_weights[top_x, idx, None]
+
+            # However `index_add_` only support torch tensors for indexing so we'll use
+            # the `top_x` tensor here.
+            final_hidden_states.index_add_(0, top_x, current_hidden_states.to(hidden_states.dtype))
+
+        # Add (optional) shared experts to the result
+        if self.shared_experts is not None:
+            final_hidden_states = final_hidden_states + shared_output
+
+        final_hidden_states = final_hidden_states.reshape(batch_size, sequence_length, hidden_dim)
+        return final_hidden_states, router_logits  # router_logits stay flattened: (batch * sequence_length, n_experts)
+
+
+class Ernie4_5_MoeDecoderLayer(GradientCheckpointingLayer):
+    def __init__(self, config, layer_idx):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+
+        self.self_attn = Ernie4_5_MoeAttention(config, layer_idx)
+        # Sparse MoE only if (layer_idx + 1) % moe_layer_interval == 0 and start_index <= layer_idx <= end_index
+        if (
+            ((layer_idx + 1) % config.moe_layer_interval == 0)
+            and layer_idx >= config.moe_layer_start_index
+            and layer_idx <= config.moe_layer_end_index
+        ):
+            self.mlp = Ernie4_5_MoeSparseMoeBlock(config)
+        else:
+            self.mlp = Ernie4_5_MoeMLP(config)  # every other layer gets the dense MLP
+
+        self.input_layernorm = Ernie4_5_MoeRMSNorm(config.hidden_size, config.rms_norm_eps)
+        self.post_attention_layernorm = Ernie4_5_MoeRMSNorm(config.hidden_size, config.rms_norm_eps)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[tuple[torch.Tensor]] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> torch.FloatTensor:
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
+                `(batch, sequence_length)` where padding elements are indicated by 0.
+            output_attentions (`bool`, *optional*):
+                Not a named parameter of this method; when supplied it arrives through `**kwargs`. Whether or not
+                to return the attentions tensors of all attention layers.
+            output_router_logits (`bool`, *optional*):
+                Not a named parameter of this method; when supplied it arrives through `**kwargs`. This method
+                itself drops the router logits returned by an MoE `mlp` and returns only the hidden states.
+            use_cache (`bool`, *optional*):
+                Not a named parameter of this method; when supplied it arrives through `**kwargs`
+                (see `past_key_values`).
+            past_key_values (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
+            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
+                Indices depicting the position of the input sequence tokens in the sequence.
+            position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
+                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
+                with `head_dim` being the embedding dimension of each attention head.
+            kwargs (`dict`, *optional*):
+                Extra keyword arguments; they are forwarded unchanged to `self.self_attn` (for example the
+                flash-attention arguments named by `FlashAttentionKwargs`).
+        """
+        residual = hidden_states
+
+        hidden_states = self.input_layernorm(hidden_states)
+
+        # Self Attention (the second element returned by the attention module is discarded)
+        hidden_states, _ = self.self_attn(
+            hidden_states=hidden_states,
+            position_embeddings=position_embeddings,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            cache_position=cache_position,
+            **kwargs,
+        )
+        hidden_states = residual + hidden_states
+
+        # Feed-forward part: `self.mlp` is either the dense MLP or the sparse MoE block chosen in `__init__`
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        # The MoE block returns a tuple `(hidden_states, router_logits)`; keep only the hidden states
+        if isinstance(hidden_states, tuple):
+            hidden_states, _ = hidden_states
+        hidden_states = residual + hidden_states
+
+        return hidden_states
+
+
+@auto_docstring
+class Ernie4_5_MoePreTrainedModel(PreTrainedModel):
+    config: Ernie4_5_MoeConfig
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["Ernie4_5_MoeDecoderLayer"]
+    _skip_keys_device_placement = ["past_key_values"]
+    _supports_flash_attn = True
+    _supports_sdpa = True
+    _supports_flex_attn = True
+    _can_compile_fullgraph = False  # MoE models don't work with torch.compile (`torch.where(condition)` not supported)
+    _supports_attention_backend = True
+    # NOTE(review): `index=1` presumably picks element 1 of the MoE block's `(hidden_states, router_logits)` output
+    _can_record_outputs = {
+        "router_logits": OutputRecorder(Ernie4_5_MoeSparseMoeBlock, index=1),
+        "hidden_states": Ernie4_5_MoeDecoderLayer,
+        "attentions": Ernie4_5_MoeAttention,
+    }
+    _keep_in_fp32_modules_strict = ["gate", "moe_statics"]
+    # Not supporting multi-token prediction (MTP) atm
+    _keys_to_ignore_on_load_unexpected = ["mtp"]
+
+    def _init_weights(self, module):
+        super()._init_weights(module)  # inherited initialisation runs first
+        if isinstance(module, Ernie4_5_MoeStatics):
+            module.e_score_correction_bias.data.zero_()  # the gating correction bias always starts at zero
+
+
+@auto_docstring
+class Ernie4_5_MoeModel(Ernie4_5_MoePreTrainedModel):
+    def __init__(self, config: Ernie4_5_MoeConfig):
+        super().__init__(config)
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+        self.layers = nn.ModuleList(
+            [Ernie4_5_MoeDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+        )
+        self.norm = Ernie4_5_MoeRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.rotary_emb = Ernie4_5_MoeRotaryEmbedding(config=config)
+        self.gradient_checkpointing = False
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @check_model_inputs
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> MoeModelOutputWithPast:
+        if (input_ids is None) ^ (inputs_embeds is not None):  # true when both or neither input is given
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+        if use_cache and past_key_values is None:  # caching requested but no cache passed: start a fresh one
+            past_key_values = DynamicCache(config=self.config)
+
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+
+        if cache_position is None:  # default positions continue after the tokens already held by the cache
+            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+            cache_position = torch.arange(
+                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
+            )
+        if position_ids is None:
+            position_ids = cache_position.unsqueeze(0)  # add a leading dimension of size 1
+
+        causal_mask = create_causal_mask(
+            config=self.config,
+            input_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            cache_position=cache_position,
+            past_key_values=past_key_values,
+            position_ids=position_ids,
+        )
+
+        hidden_states = inputs_embeds
+
+        # create position embeddings to be shared across the decoder layers
+        position_embeddings = self.rotary_emb(hidden_states, position_ids)
+
+        for decoder_layer in self.layers[: self.config.num_hidden_layers]:
+            hidden_states = decoder_layer(  # each layer returns only the updated hidden states
+                hidden_states,
+                position_embeddings=position_embeddings,
+                attention_mask=causal_mask,
+                position_ids=position_ids,
+                past_key_values=past_key_values,
+                use_cache=use_cache,
+                cache_position=cache_position,
+                **kwargs,
+            )
+
+        hidden_states = self.norm(hidden_states)  # final normalisation after the last decoder layer
+
+        return MoeModelOutputWithPast(  # only diff with Mistral is the output type, we need MoE
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values,
+        )
+
+
+def load_balancing_loss_func(
+    gate_logits: Union[torch.Tensor, tuple[torch.Tensor], None],
+    num_experts: Optional[int] = None,
+    top_k=2,
+    attention_mask: Optional[torch.Tensor] = None,
+) -> Union[torch.Tensor, int]:
+    r"""
+    Computes auxiliary load balancing loss as in Switch Transformer - implemented in Pytorch.
+
+    See Switch Transformer (https://huggingface.co/papers/2101.03961) for more details. This function implements the loss
+    function presented in equations (4) - (6) of the paper. It aims at penalizing cases where the routing between
+    experts is too unbalanced.
+
+    Args:
+        gate_logits:
+            Logits from the `gate`, should be a tuple of model.config.num_hidden_layers tensors of
+            shape [batch_size X sequence_length, num_experts]. `None`, a non-tuple or an empty tuple yields `0`.
+        num_experts:
+            Number of experts. When `None`, it is inferred from the last dimension of the gate logits.
+        top_k:
+            The number of experts to route per-token, can be also interpreted as the `top-k` routing
+            parameter.
+        attention_mask (`torch.Tensor`, *optional*):
+            The attention_mask used in forward function
+            shape [batch_size X sequence_length] if not None.
+
+    Returns:
+        The auxiliary loss as a scalar tensor, or the integer `0` when there are no gate logits to score.
+    """
+    if not isinstance(gate_logits, tuple) or not gate_logits:
+        return 0
+
+    compute_device = gate_logits[0].device  # all layers are gathered on the first layer's device
+    concatenated_gate_logits = torch.cat([layer_gate.to(compute_device) for layer_gate in gate_logits], dim=0)
+    num_experts = concatenated_gate_logits.shape[-1] if num_experts is None else num_experts
+
+    routing_weights = torch.nn.functional.softmax(concatenated_gate_logits, dim=-1)
+
+    _, selected_experts = torch.topk(routing_weights, top_k, dim=-1)
+
+    expert_mask = torch.nn.functional.one_hot(selected_experts, num_experts)
+
+    if attention_mask is None:
+        # Compute the percentage of tokens routed to each experts
+        tokens_per_expert = torch.mean(expert_mask.float(), dim=0)
+
+        # Compute the average probability of routing to these experts
+        router_prob_per_expert = torch.mean(routing_weights, dim=0)
+    else:
+        batch_size, sequence_length = attention_mask.shape
+        num_hidden_layers = concatenated_gate_logits.shape[0] // (batch_size * sequence_length)
+
+        # Compute the mask that masks all padding tokens as 0 with the same shape of expert_mask
+        expert_attention_mask = (
+            attention_mask[None, :, :, None, None]
+            .expand((num_hidden_layers, batch_size, sequence_length, top_k, num_experts))
+            .reshape(-1, top_k, num_experts)
+            .to(compute_device)
+        )
+
+        # Compute the percentage of tokens routed to each experts
+        tokens_per_expert = torch.sum(expert_mask.float() * expert_attention_mask, dim=0) / torch.sum(
+            expert_attention_mask, dim=0
+        )
+
+        # Compute the mask that masks all padding tokens as 0 with the same shape of tokens_per_expert
+        router_per_expert_attention_mask = (
+            attention_mask[None, :, :, None]
+            .expand((num_hidden_layers, batch_size, sequence_length, num_experts))
+            .reshape(-1, num_experts)
+            .to(compute_device)
+        )
+
+        # Compute the average probability of routing to these experts
+        router_prob_per_expert = torch.sum(routing_weights * router_per_expert_attention_mask, dim=0) / torch.sum(
+            router_per_expert_attention_mask, dim=0
+        )
+
+    overall_loss = torch.sum(tokens_per_expert * router_prob_per_expert.unsqueeze(0))
+    return overall_loss * num_experts
+
+
+@auto_docstring
+class Ernie4_5_MoeForCausalLM(Ernie4_5_MoePreTrainedModel, GenerationMixin):
+    _tied_weights_keys = ["lm_head.weight"]
+    _tp_plan = {"lm_head": "colwise_rep"}
+    _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.model = Ernie4_5_MoeModel(config)
+        self.vocab_size = config.vocab_size
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=config.use_bias)
+        # Settings read by `forward` when it computes the auxiliary load-balancing loss
+        self.router_aux_loss_coef = config.router_aux_loss_coef
+        self.num_experts = config.moe_num_experts
+        self.num_experts_per_tok = config.moe_k
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_router_logits: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> MoeCausalLMOutputWithPast:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+        """
+        # An explicit `output_router_logits` argument wins; otherwise fall back to the config value
+        output_router_logits = (
+            output_router_logits if output_router_logits is not None else self.config.output_router_logits
+        )
+
+        # The base model returns a `MoeModelOutputWithPast`; its fields are read by attribute below
+        outputs: MoeModelOutputWithPast = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_router_logits=output_router_logits,
+            cache_position=cache_position,
+            **kwargs,
+        )
+
+        hidden_states = outputs.last_hidden_state
+        # Only compute necessary logits (`logits_to_keep=0` -> `slice(0, None)`, i.e. all positions); no upcast here
+        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+        logits = self.lm_head(hidden_states[:, slice_indices, :])
+
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(logits, labels, self.vocab_size, **kwargs)
+        # NOTE(review): the helper below returns int 0 for None/non-tuple router logits; `.to(...)` would then fail
+        aux_loss = None
+        if output_router_logits:
+            aux_loss = load_balancing_loss_func(
+                outputs.router_logits,
+                self.num_experts,
+                self.num_experts_per_tok,
+                attention_mask,
+            )
+            if labels is not None:
+                loss += self.router_aux_loss_coef * aux_loss.to(loss.device)  # make sure to reside in the same device
+
+        return MoeCausalLMOutputWithPast(
+            loss=loss,
+            aux_loss=aux_loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            router_logits=outputs.router_logits,
+        )
+
+
+__all__ = ["Ernie4_5_MoeForCausalLM", "Ernie4_5_MoeModel", "Ernie4_5_MoePreTrainedModel"]  # public API
diff --git a/phivenv/Lib/site-packages/transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py b/phivenv/Lib/site-packages/transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..7bddb745df007f85fbbd6f72eacc723035d021c0
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py
@@ -0,0 +1,346 @@
+# Copyright (c) 2025 Baidu, Inc. and HuggingFace Inc. team. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch Ernie 4.5 MoE model."""
+
+from typing import Optional
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from ...cache_utils import Cache, DynamicCache
+from ...masking_utils import create_causal_mask
+from ...modeling_outputs import MoeModelOutputWithPast
+from ...modeling_utils import PreTrainedModel
+from ...processing_utils import Unpack
+from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging
+from ...utils.generic import OutputRecorder, check_model_inputs
+from ..ernie4_5.modeling_ernie4_5 import Ernie4_5RotaryEmbedding, apply_rotary_pos_emb, rotate_half  # noqa: F401
+from ..llama.modeling_llama import LlamaAttention, LlamaRMSNorm
+from ..mixtral.modeling_mixtral import (
+    MixtralForCausalLM,
+    MixtralPreTrainedModel,
+)
+from ..qwen3_moe.modeling_qwen3_moe import Qwen3MoeDecoderLayer, Qwen3MoeMLP
+from .configuration_ernie4_5_moe import Ernie4_5_MoeConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+class Ernie4_5_MoeRMSNorm(LlamaRMSNorm):
+    pass  # no overrides: `LlamaRMSNorm` behaviour under an Ernie 4.5 MoE class name
+
+
+class Ernie4_5_MoeMLP(Qwen3MoeMLP):
+    def __init__(self, config, intermediate_size=None):
+        super().__init__(config, intermediate_size)
+        # The projections are assigned here so that their bias follows `config.use_bias`
+        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.use_bias)
+        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.use_bias)
+        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.use_bias)
+
+
+class Ernie4_5_MoeRotaryEmbedding(Ernie4_5RotaryEmbedding):
+    def __init__(self, config: Ernie4_5_MoeConfig, device=None):
+        super().__init__(config, device)  # pure delegation; only the `config` annotation differs
+
+
+class Ernie4_5_MoeAttention(LlamaAttention):
+    def __init__(self, config: Ernie4_5_MoeConfig, layer_idx: int):
+        super().__init__(config, layer_idx)
+
+        self.attention_dropout = 0.0  # hard-coded: overrides whatever the parent `__init__` stored
+        # Projections are assigned with `bias=config.use_bias`; `self.head_dim` is not set here, so it comes from the parent
+        self.q_proj = nn.Linear(config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.use_bias)
+        self.k_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.use_bias)
+        self.v_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.use_bias)
+        self.o_proj = nn.Linear(config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.use_bias)
+
+
+class Ernie4_5_MoeStatics(nn.Module):
+    """
+    Stores MoE (Mixture of Experts) statistics
+        - Bias for the gating
+        - Additionally, usage per expert in the original codebase
+    """
+
+    def __init__(self, config):
+        super().__init__()
+
+        num_experts_groups = 1
+        num_experts = config.moe_num_experts
+        # Non-trainable float32 bias of shape (num_experts_groups, num_experts), initialised to zeros
+        self.e_score_correction_bias = nn.Parameter(
+            torch.zeros(num_experts_groups, num_experts, dtype=torch.float32),
+            requires_grad=False,
+        )
+
+    def forward(self, hidden_states):
+        # NOTE: This is a workaround to enable TP with a module that only has parameters
+        #
+        # Otherwise, it stays as `DTensor` when called in the "super" forward
+        #   1. All other tensors are local (`torch.Tensor`)
+        #   2. Isolate does not work on `nn.Module` which only has parameters
+        return hidden_states + self.e_score_correction_bias.squeeze()  # `.squeeze()` drops the size-1 group dim
+
+
+class Ernie4_5_MoeSparseMoeBlock(nn.Module):
+    """
+    This implementation is
+    strictly equivalent to standard MoE with full capacity (no
+    dropped tokens). It's faster since it formulates MoE operations
+    in terms of block-sparse operations to accommodate imbalanced
+    assignments of tokens to experts, whereas standard MoE either
+    (1) drop tokens at the cost of reduced performance or (2) set
+    capacity factor to number of experts and thus waste computation
+    and memory on padding.
+
+    Ernie 4.5 MoE's original formula is based on case (2) with
+    (optional) shared experts and a corrections bias during gating.
+    """
+
+    def __init__(self, config):
+        super().__init__()
+        self.num_experts = config.moe_num_experts
+        self.top_k = config.moe_k
+
+        # correction bias (yes it seems to be a typo with statics <> statistics)
+        self.moe_statics = Ernie4_5_MoeStatics(config)
+
+        # gating
+        self.gate = nn.Linear(config.hidden_size, config.moe_num_experts, bias=False, dtype=torch.float32)
+        self.experts = nn.ModuleList(
+            [Ernie4_5_MoeMLP(config, config.moe_intermediate_size) for _ in range(config.moe_num_experts)]
+        )
+        self.norm_min = config.moe_norm_min  # lower clamp for the top-k weight sum used as normaliser in `forward`
+
+        # (optional) shared experts for all forwards
+        self.shared_experts = None
+        if config.moe_num_shared_experts > 0:
+            self.shared_experts = Ernie4_5_MoeMLP(config, config.moe_intermediate_size * config.moe_num_shared_experts)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        batch_size, sequence_length, hidden_dim = hidden_states.shape
+        hidden_states = hidden_states.view(-1, hidden_dim)  # flatten to (batch * sequence_length, hidden_dim)
+
+        # (Optional) shared experts
+        if self.shared_experts is not None:
+            shared_output = self.shared_experts(hidden_states)
+
+        device_type = (
+            hidden_states.device.type
+            if isinstance(hidden_states.device.type, str) and hidden_states.device.type != "mps"
+            else "cpu"
+        )
+        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+            # router_logits: (batch * sequence_length, n_experts)
+            router_logits = self.gate(hidden_states.float())
+
+            routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)
+            # Experts are picked on the bias-corrected scores, but the weights gathered next are the uncorrected ones
+            _, selected_experts = torch.topk(self.moe_statics(routing_weights), self.top_k, dim=-1)
+            routing_weights = torch.gather(routing_weights, dim=-1, index=selected_experts)
+            routing_weights = routing_weights / torch.clamp(
+                routing_weights.sum(dim=-1, keepdim=True), min=self.norm_min
+            )
+            routing_weights = routing_weights.to(hidden_states.dtype)
+        final_hidden_states = torch.zeros(
+            (batch_size * sequence_length, hidden_dim), dtype=hidden_states.dtype, device=hidden_states.device
+        )
+
+        # One hot encode the selected experts to create an expert mask
+        # this will be used to easily index which expert is going to be solicited
+        expert_mask = torch.nn.functional.one_hot(selected_experts, num_classes=self.num_experts).permute(2, 1, 0)
+
+        # Loop only over the experts that received at least one token and run each on its own tokens
+        expert_hit = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero()
+        for expert_idx in expert_hit:
+            expert_layer = self.experts[expert_idx]
+            idx, top_x = torch.where(expert_mask[expert_idx].squeeze(0))
+
+            # Index the correct hidden states and compute the expert hidden state for
+            # the current expert. We need to make sure to multiply the output hidden
+            # states by `routing_weights` on the corresponding tokens (one entry per selected top-k slot)
+            current_state = hidden_states[None, top_x].reshape(-1, hidden_dim)
+            current_hidden_states = expert_layer(current_state) * routing_weights[top_x, idx, None]
+
+            # However `index_add_` only support torch tensors for indexing so we'll use
+            # the `top_x` tensor here.
+            final_hidden_states.index_add_(0, top_x, current_hidden_states.to(hidden_states.dtype))
+
+        # Add (optional) shared experts to the result
+        if self.shared_experts is not None:
+            final_hidden_states = final_hidden_states + shared_output
+
+        final_hidden_states = final_hidden_states.reshape(batch_size, sequence_length, hidden_dim)
+        return final_hidden_states, router_logits  # 2-tuple: mixed hidden states and the raw (unnormalised) router logits
+
+
+class Ernie4_5_MoeDecoderLayer(Qwen3MoeDecoderLayer):
+    def __init__(self, config, layer_idx):
+        nn.Module.__init__(self)  # calls `nn.Module.__init__` directly instead of `super().__init__()`
+        self.hidden_size = config.hidden_size
+
+        self.self_attn = Ernie4_5_MoeAttention(config, layer_idx)
+        # Sparse MoE only if (layer_idx + 1) % moe_layer_interval == 0 and start_index <= layer_idx <= end_index
+        if (
+            ((layer_idx + 1) % config.moe_layer_interval == 0)
+            and layer_idx >= config.moe_layer_start_index
+            and layer_idx <= config.moe_layer_end_index
+        ):
+            self.mlp = Ernie4_5_MoeSparseMoeBlock(config)
+        else:
+            self.mlp = Ernie4_5_MoeMLP(config)  # every other layer gets the dense MLP
+
+        self.input_layernorm = Ernie4_5_MoeRMSNorm(config.hidden_size, config.rms_norm_eps)
+        self.post_attention_layernorm = Ernie4_5_MoeRMSNorm(config.hidden_size, config.rms_norm_eps)
+
+
+@auto_docstring
+class Ernie4_5_MoePreTrainedModel(MixtralPreTrainedModel):
+    config: Ernie4_5_MoeConfig
+    _no_split_modules = ["Ernie4_5_MoeDecoderLayer"]
+    _keep_in_fp32_modules_strict = ["gate", "moe_statics"]
+    # Not supporting multi-token prediction (MTP) atm
+    _keys_to_ignore_on_load_unexpected = ["mtp"]
+    _can_record_outputs = {
+        "router_logits": OutputRecorder(Ernie4_5_MoeSparseMoeBlock, index=1),
+        "hidden_states": Ernie4_5_MoeDecoderLayer,
+        "attentions": Ernie4_5_MoeAttention,
+    }
+
+    def _init_weights(self, module):
+        PreTrainedModel._init_weights(self, module)  # calls the `PreTrainedModel` version directly, not `super()`
+        if isinstance(module, Ernie4_5_MoeStatics):
+            module.e_score_correction_bias.data.zero_()  # the gating correction bias always starts at zero
+
+
+@auto_docstring
+class Ernie4_5_MoeModel(Ernie4_5_MoePreTrainedModel):
+    def __init__(self, config: Ernie4_5_MoeConfig):
+        super().__init__(config)
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+        self.layers = nn.ModuleList(
+            [Ernie4_5_MoeDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+        )
+        self.norm = Ernie4_5_MoeRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.rotary_emb = Ernie4_5_MoeRotaryEmbedding(config=config)
+        self.gradient_checkpointing = False
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @check_model_inputs
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> MoeModelOutputWithPast:
+        if (input_ids is None) ^ (inputs_embeds is not None):  # true when both or neither input is given
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+        if use_cache and past_key_values is None:  # caching requested but no cache passed: start a fresh one
+            past_key_values = DynamicCache(config=self.config)
+
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+
+        if cache_position is None:  # default positions continue after the tokens already held by the cache
+            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+            cache_position = torch.arange(
+                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
+            )
+        if position_ids is None:
+            position_ids = cache_position.unsqueeze(0)  # add a leading dimension of size 1
+
+        causal_mask = create_causal_mask(
+            config=self.config,
+            input_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            cache_position=cache_position,
+            past_key_values=past_key_values,
+            position_ids=position_ids,
+        )
+
+        hidden_states = inputs_embeds
+
+        # create position embeddings to be shared across the decoder layers
+        position_embeddings = self.rotary_emb(hidden_states, position_ids)
+
+        for decoder_layer in self.layers[: self.config.num_hidden_layers]:
+            hidden_states = decoder_layer(  # the result is used directly as the next layer's hidden states
+                hidden_states,
+                position_embeddings=position_embeddings,
+                attention_mask=causal_mask,
+                position_ids=position_ids,
+                past_key_values=past_key_values,
+                use_cache=use_cache,
+                cache_position=cache_position,
+                **kwargs,
+            )
+
+        hidden_states = self.norm(hidden_states)  # final normalisation after the last decoder layer
+
+        return MoeModelOutputWithPast(  # only diff with Mistral is the output type, we need MoE
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values,
+        )
+
+
+@auto_docstring
+class Ernie4_5_MoeForCausalLM(MixtralForCausalLM):
+    def __init__(self, config):
+        PreTrainedModel.__init__(self, config)  # calls `PreTrainedModel.__init__` directly, not `super().__init__()`
+        self.model = Ernie4_5_MoeModel(config)
+        self.vocab_size = config.vocab_size
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=config.use_bias)
+        # Router settings stored on the instance from the Ernie config (`moe_num_experts`, `moe_k`)
+        self.router_aux_loss_coef = config.router_aux_loss_coef
+        self.num_experts = config.moe_num_experts
+        self.num_experts_per_tok = config.moe_k
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(self, **super_kwargs):
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+        """
+        return super().forward(**super_kwargs)  # hand the parent's output back instead of implicitly returning None
+
+
+__all__ = [  # public API
+    "Ernie4_5_MoeForCausalLM",
+    "Ernie4_5_MoeModel",
+    "Ernie4_5_MoePreTrainedModel",
+]
diff --git a/phivenv/Lib/site-packages/transformers/models/esm/__init__.py b/phivenv/Lib/site-packages/transformers/models/esm/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..8eac54d6ddcbdae2b8ca3771ae5540522f6f29da
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/esm/__init__.py
@@ -0,0 +1,30 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:  # type checkers only: import every public name from the submodules eagerly
+    from .configuration_esm import *
+    from .modeling_esm import *
+    from .modeling_esmfold import *
+    from .modeling_tf_esm import *
+    from .tokenization_esm import *
+else:  # at runtime: replace this module's `sys.modules` entry with a `_LazyModule` built from this file
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/phivenv/Lib/site-packages/transformers/models/esm/__pycache__/__init__.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/esm/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..595f036a8029574ceb4bceecef809d3017aec4a6
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/esm/__pycache__/__init__.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/esm/__pycache__/configuration_esm.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/esm/__pycache__/configuration_esm.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e4d4957fa1f67ff8a6e1e64ccb414a52313f89f2
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/esm/__pycache__/configuration_esm.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/esm/__pycache__/modeling_esm.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/esm/__pycache__/modeling_esm.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..32d3d856496352af910044d8c1b66755c7e69363
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/esm/__pycache__/modeling_esm.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/esm/__pycache__/modeling_esmfold.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/esm/__pycache__/modeling_esmfold.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cdf54cc1c5b6ddc4689868f2212f3c4bfafa6a4e
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/esm/__pycache__/modeling_esmfold.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/esm/__pycache__/modeling_tf_esm.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/esm/__pycache__/modeling_tf_esm.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..08aa780b055d0f58f0eecbbb8107ef3dede6b260
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/esm/__pycache__/modeling_tf_esm.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/esm/__pycache__/tokenization_esm.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/esm/__pycache__/tokenization_esm.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..90d1f3187fdd2df03427e6fc84b7690f810e58f4
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/esm/__pycache__/tokenization_esm.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/esm/configuration_esm.py b/phivenv/Lib/site-packages/transformers/models/esm/configuration_esm.py
new file mode 100644
index 0000000000000000000000000000000000000000..fabfb4ebd6d34a7f212af5e74a90c18d4a038156
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/esm/configuration_esm.py
@@ -0,0 +1,365 @@
+# coding=utf-8
+# Copyright 2022 Meta and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""ESM model configuration"""
+
+from dataclasses import asdict, dataclass
+from typing import Optional
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+# TODO Update this
+
+
+class EsmConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of an [`EsmModel`]. It is used to instantiate an ESM model
+    according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+    defaults will yield a similar configuration to that of the ESM
+    [facebook/esm-1b](https://huggingface.co/facebook/esm-1b) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*):
+            Vocabulary size of the ESM model. Defines the number of different tokens that can be represented by the
+            `input_ids` passed when calling [`EsmModel`].
+        mask_token_id (`int`, *optional*):
+            The index of the mask token in the vocabulary. This must be included in the config because of the
+            "mask-dropout" scaling trick, which will scale the inputs depending on the number of masked tokens.
+        pad_token_id (`int`, *optional*):
+            The index of the padding token in the vocabulary. This must be included in the config because certain parts
+            of the ESM code use this instead of the attention mask.
+        hidden_size (`int`, *optional*, defaults to 768):
+            Dimensionality of the encoder layers and the pooler layer.
+        num_hidden_layers (`int`, *optional*, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 12):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        intermediate_size (`int`, *optional*, defaults to 3072):
+            Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout ratio for the attention probabilities.
+        max_position_embeddings (`int`, *optional*, defaults to 1026):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            just in case (e.g., 512 or 1024 or 2048).
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
+            The epsilon used by the layer normalization layers.
+        position_embedding_type (`str`, *optional*, defaults to `"absolute"`):
+            Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`, `"rotary"`.
+            For positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to
+            [Self-Attention with Relative Position Representations (Shaw et al.)](https://huggingface.co/papers/1803.02155).
+            For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models
+            with Better Relative Position Embeddings (Huang et al.)](https://huggingface.co/papers/2009.13658).
+        is_decoder (`bool`, *optional*, defaults to `False`):
+            Whether the model is used as a decoder or not. If `False`, the model is used as an encoder.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        emb_layer_norm_before (`bool`, *optional*):
+            Whether to apply layer normalization after embeddings but before the main stem of the network.
+        token_dropout (`bool`, *optional*, defaults to `False`):
+            When this is enabled, masked tokens are treated as if they had been dropped out by input dropout.
+
+    Examples:
+
+    ```python
+    >>> from transformers import EsmModel, EsmConfig
+
+    >>> # Initializing an ESM facebook/esm-1b style configuration
+    >>> configuration = EsmConfig(vocab_size=33)
+
+    >>> # Initializing a model from the configuration
+    >>> model = EsmModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "esm"
+
+    def __init__(
+        self,
+        vocab_size=None,
+        mask_token_id=None,
+        pad_token_id=None,
+        hidden_size=768,
+        num_hidden_layers=12,
+        num_attention_heads=12,
+        intermediate_size=3072,
+        hidden_dropout_prob=0.1,
+        attention_probs_dropout_prob=0.1,
+        max_position_embeddings=1026,
+        initializer_range=0.02,
+        layer_norm_eps=1e-12,
+        position_embedding_type="absolute",
+        use_cache=True,
+        emb_layer_norm_before=None,
+        token_dropout=False,
+        is_folding_model=False,  # when True, `esmfold_config` and `vocab_list` are populated below
+        esmfold_config=None,  # `EsmFoldConfig`, a dict of its fields, or None for defaults (folding models only)
+        vocab_list=None,  # token list for folding models; None falls back to `get_default_vocab_list()`
+        **kwargs,
+    ):
+        super().__init__(pad_token_id=pad_token_id, mask_token_id=mask_token_id, **kwargs)
+
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.intermediate_size = intermediate_size
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.max_position_embeddings = max_position_embeddings
+        self.initializer_range = initializer_range
+        self.layer_norm_eps = layer_norm_eps
+        self.position_embedding_type = position_embedding_type
+        self.use_cache = use_cache
+        self.emb_layer_norm_before = emb_layer_norm_before
+        self.token_dropout = token_dropout
+        self.is_folding_model = is_folding_model
+        if is_folding_model:
+            if esmfold_config is None:
+                logger.info("No esmfold_config supplied for folding model, using default values.")
+                esmfold_config = EsmFoldConfig()
+            elif isinstance(esmfold_config, dict):
+                esmfold_config = EsmFoldConfig(**esmfold_config)
+            self.esmfold_config = esmfold_config
+            if vocab_list is None:
+                logger.warning("No vocab_list supplied for folding model, assuming the ESM-2 vocabulary!")
+                self.vocab_list = get_default_vocab_list()
+            else:
+                self.vocab_list = vocab_list
+        else:  # non-folding models ignore any `esmfold_config` / `vocab_list` that was passed in
+            self.esmfold_config = None
+            self.vocab_list = None
+        if self.esmfold_config is not None and getattr(self.esmfold_config, "use_esm_attn_map", False):
+            raise ValueError("The HuggingFace port of ESMFold does not support use_esm_attn_map at this time!")
+
+    def to_dict(self):
+        """
+        Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`].
+
+        Returns:
+            `dict[str, Any]`: All attributes of this configuration; an `EsmFoldConfig` is nested as a plain dict.
+        """
+        output = super().to_dict()
+        if isinstance(self.esmfold_config, EsmFoldConfig):
+            output["esmfold_config"] = self.esmfold_config.to_dict()
+        return output
+
+
+@dataclass
+class EsmFoldConfig:
+    esm_type: Optional[str] = None
+    fp16_esm: bool = True
+    use_esm_attn_map: bool = False
+    esm_ablate_pairwise: bool = False
+    esm_ablate_sequence: bool = False
+    esm_input_dropout: float = 0
+
+    embed_aa: bool = True
+    bypass_lm: bool = False
+
+    lddt_head_hid_dim: int = 128
+    trunk: "TrunkConfig" = None
+
+    def __post_init__(self):
+        if self.trunk is None:
+            self.trunk = TrunkConfig()
+        elif isinstance(self.trunk, dict):
+            self.trunk = TrunkConfig(**self.trunk)
+
+    def to_dict(self):
+        """
+        Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`].
+
+        Returns:
+            `dict[str, any]`: Dictionary of all the attributes that make up this configuration instance,
+        """
+        output = asdict(self)
+        output["trunk"] = self.trunk.to_dict()
+        return output
+
+
+@dataclass
+class TrunkConfig:
+    num_blocks: int = 48
+    sequence_state_dim: int = 1024
+    pairwise_state_dim: int = 128
+    sequence_head_width: int = 32
+    pairwise_head_width: int = 32
+    position_bins: int = 32
+    dropout: float = 0
+    layer_drop: float = 0
+    cpu_grad_checkpoint: bool = False
+    max_recycles: int = 4
+    chunk_size: Optional[int] = 128
+    structure_module: "StructureModuleConfig" = None
+
+    def __post_init__(self):
+        if self.structure_module is None:
+            self.structure_module = StructureModuleConfig()
+        elif isinstance(self.structure_module, dict):
+            self.structure_module = StructureModuleConfig(**self.structure_module)
+
+        if self.max_recycles <= 0:
+            raise ValueError(f"`max_recycles` should be positive, got {self.max_recycles}.")
+        if self.sequence_state_dim % self.sequence_state_dim != 0:
+            raise ValueError(
+                "`sequence_state_dim` should be a round multiple of `sequence_state_dim`, got"
+                f" {self.sequence_state_dim} and {self.sequence_state_dim}."
+            )
+        if self.pairwise_state_dim % self.pairwise_state_dim != 0:
+            raise ValueError(
+                "`pairwise_state_dim` should be a round multiple of `pairwise_state_dim`, got"
+                f" {self.pairwise_state_dim} and {self.pairwise_state_dim}."
+            )
+
+        sequence_num_heads = self.sequence_state_dim // self.sequence_head_width
+        pairwise_num_heads = self.pairwise_state_dim // self.pairwise_head_width
+
+        if self.sequence_state_dim != sequence_num_heads * self.sequence_head_width:
+            raise ValueError(
+                "`sequence_state_dim` should be equal to `sequence_num_heads * sequence_head_width, got"
+                f" {self.sequence_state_dim} != {sequence_num_heads} * {self.sequence_head_width}."
+            )
+        if self.pairwise_state_dim != pairwise_num_heads * self.pairwise_head_width:
+            raise ValueError(
+                "`pairwise_state_dim` should be equal to `pairwise_num_heads * pairwise_head_width, got"
+                f" {self.pairwise_state_dim} != {pairwise_num_heads} * {self.pairwise_head_width}."
+            )
+        if self.pairwise_state_dim % 2 != 0:
+            raise ValueError(f"`pairwise_state_dim` should be even, got {self.pairwise_state_dim}.")
+
+        if self.dropout >= 0.4:
+            raise ValueError(f"`dropout` should not be greater than 0.4, got {self.dropout}.")
+
+    def to_dict(self):
+        """
+        Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`].
+
+        Returns:
+            `dict[str, any]`: Dictionary of all the attributes that make up this configuration instance,
+        """
+        output = asdict(self)
+        output["structure_module"] = self.structure_module.to_dict()
+        return output
+
+
+@dataclass
+class StructureModuleConfig:  # default-constructed by `TrunkConfig` when no `structure_module` is supplied
+    """
+    Args:
+        sequence_dim:
+            Single representation channel dimension
+        pairwise_dim:
+            Pair representation channel dimension
+        ipa_dim:
+            IPA hidden channel dimension
+        resnet_dim:
+            Angle resnet (Alg. 23 lines 11-14) hidden channel dimension
+        num_heads_ipa:
+            Number of IPA heads
+        num_qk_points:
+            Number of query/key points to generate during IPA
+        num_v_points:
+            Number of value points to generate during IPA
+        dropout_rate:
+            Dropout rate used throughout the layer
+        num_blocks:
+            Number of structure module blocks
+        num_transition_layers:
+            Number of layers in the single representation transition (Alg. 23 lines 8-9)
+        num_resnet_blocks:
+            Number of blocks in the angle resnet
+        num_angles:
+            Number of angles to generate in the angle resnet
+        trans_scale_factor:
+            Scale of single representation transition hidden dimension
+        epsilon:
+            Small number used in angle resnet normalization
+        inf:
+            Large number used for attention masking
+    """
+
+    sequence_dim: int = 384
+    pairwise_dim: int = 128
+    ipa_dim: int = 16
+    resnet_dim: int = 128
+    num_heads_ipa: int = 12
+    num_qk_points: int = 4
+    num_v_points: int = 8
+    dropout_rate: float = 0.1
+    num_blocks: int = 8
+    num_transition_layers: int = 1
+    num_resnet_blocks: int = 2
+    num_angles: int = 7
+    trans_scale_factor: int = 10
+    epsilon: float = 1e-8
+    inf: float = 1e5
+
+    def to_dict(self):
+        return asdict(self)  # field name -> value for every dataclass field declared above
+
+
+def get_default_vocab_list():
+    return (
+        "<cls>",
+        "<pad>",
+        "<eos>",
+        "<unk>",
+        "L",
+        "A",
+        "G",
+        "V",
+        "S",
+        "E",
+        "R",
+        "T",
+        "I",
+        "D",
+        "P",
+        "K",
+        "Q",
+        "N",
+        "F",
+        "Y",
+        "M",
+        "H",
+        "W",
+        "C",
+        "X",
+        "B",
+        "U",
+        "Z",
+        "O",
+        ".",
+        "-",
+        "<null_1>",
+        "<mask>",
+    )
+
+
+__all__ = ["EsmConfig"]
diff --git a/phivenv/Lib/site-packages/transformers/models/esm/modeling_esm.py b/phivenv/Lib/site-packages/transformers/models/esm/modeling_esm.py
new file mode 100644
index 0000000000000000000000000000000000000000..5db366aa61974d93ced97c6103821bea59d97c3c
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/esm/modeling_esm.py
@@ -0,0 +1,1059 @@
+# coding=utf-8
+# Copyright 2022 Meta and The HuggingFace Inc. team. All rights reserved.
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch ESM model."""
+
+import math
+from typing import Callable, Optional, Union
+
+import torch
+import torch.utils.checkpoint
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import (
+    BaseModelOutputWithCrossAttentions,
+    BaseModelOutputWithPoolingAndCrossAttentions,
+    MaskedLMOutput,
+    SequenceClassifierOutput,
+    TokenClassifierOutput,
+)
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from ...processing_utils import Unpack
+from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
+from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging
+from ...utils.generic import OutputRecorder, check_model_inputs
+from .configuration_esm import EsmConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+def rotate_half(x):
+    x1, x2 = x.chunk(2, dim=-1)
+    return torch.cat((-x2, x1), dim=-1)
+
+
+def apply_rotary_pos_emb(x, cos, sin):
+    cos = cos[:, :, : x.shape[-2], :]
+    sin = sin[:, :, : x.shape[-2], :]
+
+    return (x * cos) + (rotate_half(x) * sin)
+
+
+def gelu(x):
+    """
+    This is the gelu implementation from the original ESM repo. Using F.gelu yields subtly wrong results.
+    """
+    return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))
+
+
+def symmetrize(x):
+    "Make layer symmetric in final two dimensions, used for contact prediction."
+    return x + x.transpose(-1, -2)
+
+
+def average_product_correct(x):
+    "Perform average product correct, used for contact prediction."
+    a1 = x.sum(-1, keepdims=True)
+    a2 = x.sum(-2, keepdims=True)
+    a12 = x.sum((-1, -2), keepdims=True)
+
+    avg = a1 * a2
+    avg.div_(a12)  # in-place to reduce memory
+    normalized = x - avg
+    return normalized
+
+
+class RotaryEmbedding(torch.nn.Module):
+    """
+    Rotary position embeddings based on those in
+    [RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer). Query and keys are transformed by rotation
+    matrices which depend on their relative positions.
+    """
+
+    inv_freq: torch.Tensor  # fix linting for `register_buffer`
+
+    def __init__(self, dim: int):
+        super().__init__()
+        # Generate and save the inverse frequency buffer (non trainable)
+        inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2, dtype=torch.int64).float() / dim))
+        inv_freq = inv_freq
+        self.register_buffer("inv_freq", inv_freq)
+
+        self._seq_len_cached = None
+        self._cos_cached = None
+        self._sin_cached = None
+
+    def _update_cos_sin_tables(self, x, seq_dimension=2):
+        seq_len = x.shape[seq_dimension]
+
+        # Reset the tables if the sequence length has changed,
+        # or if we're on a new device (possibly due to tracing for instance)
+        if seq_len != self._seq_len_cached or self._cos_cached.device != x.device:
+            self._seq_len_cached = seq_len
+            t = torch.arange(x.shape[seq_dimension], device=x.device).type_as(self.inv_freq)
+            freqs = torch.outer(t, self.inv_freq)
+            emb = torch.cat((freqs, freqs), dim=-1).to(x.device)
+
+            self._cos_cached = emb.cos()[None, None, :, :]
+            self._sin_cached = emb.sin()[None, None, :, :]
+
+        return self._cos_cached, self._sin_cached
+
+    def forward(self, q: torch.Tensor, k: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+        self._cos_cached, self._sin_cached = self._update_cos_sin_tables(k, seq_dimension=-2)
+
+        return (
+            apply_rotary_pos_emb(q, self._cos_cached, self._sin_cached).to(dtype=q.dtype),
+            apply_rotary_pos_emb(k, self._cos_cached, self._sin_cached).to(dtype=k.dtype),
+        )
+
+
+class EsmContactPredictionHead(nn.Module):
+    """Performs symmetrization, apc, and computes a logistic regression on the output features"""
+
+    def __init__(
+        self,
+        in_features: int,
+        bias=True,
+        eos_idx: int = 2,
+    ):
+        super().__init__()
+        self.in_features = in_features
+        self.eos_idx = eos_idx
+        self.regression = nn.Linear(in_features, 1, bias)
+        self.activation = nn.Sigmoid()
+
+    def forward(self, tokens, attentions):
+        # remove eos token attentions
+        eos_mask = tokens.ne(self.eos_idx).to(attentions)
+        eos_mask = eos_mask.unsqueeze(1) * eos_mask.unsqueeze(2)
+        attentions = attentions * eos_mask[:, None, None, :, :]
+        attentions = attentions[..., :-1, :-1]
+        # remove cls token attentions
+        attentions = attentions[..., 1:, 1:]
+        batch_size, layers, heads, seqlen, _ = attentions.size()
+        attentions = attentions.view(batch_size, layers * heads, seqlen, seqlen)
+
+        # features: batch x channels x tokens x tokens (symmetric)
+        attentions = attentions.to(
+            self.regression.weight.device
+        )  # attentions always float32, may need to convert to float16
+        attentions = average_product_correct(symmetrize(attentions))
+        attentions = attentions.permute(0, 2, 3, 1)
+        return self.activation(self.regression(attentions).squeeze(3))
+
+
+class EsmEmbeddings(nn.Module):
+    """
+    Same as BertEmbeddings with a tiny tweak for positional embeddings indexing, plus ESM's optional token dropout.
+    """
+
+    def __init__(self, config):
+        super().__init__()
+        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
+
+        if config.emb_layer_norm_before:
+            self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        else:
+            self.layer_norm = None
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        # position_ids (1, max_position_embeddings) is registered below with persistent=False (non-persistent buffer)
+        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
+        self.register_buffer(
+            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
+        )
+
+        self.padding_idx = config.pad_token_id
+        if self.position_embedding_type == "absolute":  # other position types get no learned position table here
+            self.position_embeddings = nn.Embedding(
+                config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx
+            )
+        self.token_dropout = config.token_dropout
+        self.mask_token_id = config.mask_token_id
+
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        position_ids=None,
+        inputs_embeds=None,
+    ):
+        if position_ids is None:
+            if input_ids is not None:
+                # Create the position ids from the input token ids. Any padded tokens remain padded.
+                position_ids = create_position_ids_from_input_ids(input_ids, self.padding_idx)
+            else:
+                position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds)
+
+        if inputs_embeds is None:
+            inputs_embeds = self.word_embeddings(input_ids)
+
+        # Note that if we want to support ESM-1 (not 1b!) in future then we need to support an
+        # embedding_scale factor here.
+        embeddings = inputs_embeds
+
+        # Matt: ESM has the option to handle masking in MLM in a slightly unusual way. If the token_dropout
+        # flag is False then it is handled in the same way as BERT/RoBERTa. If it is set to True, however,
+        # masked tokens are treated as if they were selected for input dropout and zeroed out.
+        # This "mask-dropout" is compensated for when masked tokens are not present, by scaling embeddings by
+        # a factor of (fraction of unmasked tokens during training) / (fraction of unmasked tokens in sample).
+        # This is analogous to the way that dropout layers scale down outputs during evaluation when not
+        # actually dropping out values (or, equivalently, scale up their un-dropped outputs in training).
+        if self.token_dropout and input_ids is not None:
+            embeddings = embeddings.masked_fill((input_ids == self.mask_token_id).unsqueeze(-1), 0.0)
+            mask_ratio_train = 0.15 * 0.8  # Hardcoded as the ratio used in all ESM model training runs
+            src_lengths = attention_mask.sum(-1) if attention_mask is not None else input_ids.shape[1]
+            mask_ratio_observed = (input_ids == self.mask_token_id).sum(-1).float() / src_lengths
+            embeddings = (embeddings * (1 - mask_ratio_train) / (1 - mask_ratio_observed)[:, None, None]).to(
+                embeddings.dtype
+            )
+
+        if self.position_embedding_type == "absolute":
+            position_embeddings = self.position_embeddings(position_ids)
+            embeddings = embeddings + position_embeddings
+
+        if self.layer_norm is not None:
+            embeddings = self.layer_norm(embeddings)
+        if attention_mask is not None:  # multiply by the mask so positions where it is 0 become zero vectors
+            embeddings = (embeddings * attention_mask.unsqueeze(-1)).to(embeddings.dtype)
+        # Matt: I think this line was copied incorrectly from BERT, disabling it for now.
+        # embeddings = self.dropout(embeddings)
+        return embeddings
+
+    def create_position_ids_from_inputs_embeds(self, inputs_embeds):
+        """
+        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.
+
+        Args:
+            inputs_embeds: torch.Tensor
+
+        Returns: torch.Tensor
+        """
+        input_shape = inputs_embeds.size()[:-1]
+        sequence_length = input_shape[1]
+
+        position_ids = torch.arange(  # ids run from padding_idx + 1 to padding_idx + sequence_length inclusive
+            self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
+        )
+        return position_ids.unsqueeze(0).expand(input_shape)
+
+
+def eager_attention_forward(
+    module: nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: Optional[torch.Tensor],
+    scaling: float,
+    dropout: float = 0.0,
+    head_mask: Optional[torch.Tensor] = None,
+    **kwargs: Unpack[TransformersKwargs],
+):
+    # ESM applies relative position embeddings and we don't copy from Llama
+    attn_weights = torch.matmul(query, key.transpose(2, 3)) * scaling
+
+    if hasattr(module, "position_embedding_type") and module.position_embedding_type in [
+        "relative_key",
+        "relative_key_query",
+    ]:
+        seq_length = query.shape[2]
+        position_ids_l = torch.arange(seq_length, dtype=torch.long, device=attn_weights.device).view(-1, 1)
+        position_ids_r = torch.arange(seq_length, dtype=torch.long, device=attn_weights.device).view(1, -1)
+        distance = position_ids_l - position_ids_r
+        positional_embedding = module.distance_embedding(distance + module.max_position_embeddings - 1)
+        positional_embedding = positional_embedding.to(dtype=query.dtype)  # fp16 compatibility
+
+        if module.position_embedding_type == "relative_key":
+            relative_position_scores = torch.einsum("bhld,lrd->bhlr", query, positional_embedding)
+        elif module.position_embedding_type == "relative_key_query":
+            relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query, positional_embedding)
+            relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key, positional_embedding)
+            relative_position_scores = relative_position_scores_query + relative_position_scores_key
+
+        attn_weights = attn_weights + relative_position_scores
+
+    if attention_mask is not None:
+        causal_mask = attention_mask[:, :, :, : key.shape[-2]]
+        attn_weights = attn_weights + causal_mask
+
+    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
+    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
+
+    if head_mask is not None:
+        attn_weights = attn_weights * head_mask
+
+    attn_output = torch.matmul(attn_weights, value)
+    attn_output = attn_output.transpose(1, 2).contiguous()
+
+    return attn_output, attn_weights
+
+
+class EsmSelfAttention(nn.Module):
+    def __init__(self, config, position_embedding_type=None, layer_idx=None, is_cross_attention=False):
+        super().__init__()
+        self.config = config
+
+        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
+            raise ValueError(
+                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
+                f"heads ({config.num_attention_heads})"
+            )
+
+        self.num_attention_heads = config.num_attention_heads
+        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
+        self.all_head_size = self.num_attention_heads * self.attention_head_size
+
+        self.query = nn.Linear(config.hidden_size, self.all_head_size)
+        self.key = nn.Linear(config.hidden_size, self.all_head_size)
+        self.value = nn.Linear(config.hidden_size, self.all_head_size)
+
+        self.dropout = config.attention_probs_dropout_prob
+        self.position_embedding_type = position_embedding_type or getattr(
+            config, "position_embedding_type", "absolute"
+        )
+        self.rotary_embeddings = None
+        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
+            self.max_position_embeddings = config.max_position_embeddings
+            self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)
+        elif self.position_embedding_type == "rotary":
+            self.rotary_embeddings = RotaryEmbedding(dim=self.attention_head_size)
+
+        self.scaling = 1.0  # For BC we apply scaling before RoPE
+        self.is_decoder = config.is_decoder
+        self.layer_idx = layer_idx
+        self.is_causal = self.is_decoder and not is_cross_attention
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> tuple[torch.Tensor]:
+        batch_size, seq_length = hidden_states.shape[:-1]
+        hidden_shape = (batch_size, seq_length, -1, self.attention_head_size)
+
+        query_layer = self.query(hidden_states).view(hidden_shape).transpose(1, 2)
+
+        is_cross_attention = encoder_hidden_states is not None
+        current_states = encoder_hidden_states if is_cross_attention else hidden_states
+        attention_mask = encoder_attention_mask if is_cross_attention else attention_mask
+        key_layer = self.key(current_states).view(hidden_shape).transpose(1, 2)
+        value_layer = self.value(current_states).view(hidden_shape).transpose(1, 2)
+
+        # Matt: Our BERT model (which this code was derived from) scales attention logits down by sqrt(head_dim).
+        # ESM scales the query down by the same factor instead. Modulo numerical stability these are equivalent,
+        # but not when rotary embeddings get involved. Therefore, we scale the query here to match the original
+        # ESM code and fix rotary embeddings.
+        query_layer = query_layer * self.attention_head_size**-0.5
+
+        if self.position_embedding_type == "rotary":
+            query_layer, key_layer = self.rotary_embeddings(query_layer, key_layer)
+
+        attention_interface: Callable = eager_attention_forward
+        if self.config._attn_implementation != "eager":
+            if self.position_embedding_type in ["relative_key", "relative_key_query"]:
+                raise ValueError(
+                    f"ESM {self.config._attn_implementation} attention does not support {self.position_embedding_type} embeddings. "
+                    "Set attention explicitly to 'eager' with `model.set_attn_implementation('eager')`"
+                )
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_layer,
+            key_layer,
+            value_layer,
+            attention_mask,
+            dropout=0.0 if not self.training else self.dropout,
+            scaling=self.scaling,
+            head_mask=head_mask,
+            **kwargs,
+        )
+
+        attn_output = attn_output.reshape(batch_size, seq_length, -1).contiguous()
+        return attn_output, attn_weights
+
+
+class EsmSelfOutput(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self, hidden_states, input_tensor):
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = hidden_states + input_tensor
+        return hidden_states
+
+
+class EsmAttention(nn.Module):
+    def __init__(self, config, layer_idx=None, is_cross_attention=False):
+        super().__init__()
+        self.self = EsmSelfAttention(config, layer_idx=layer_idx, is_cross_attention=is_cross_attention)
+        self.output = EsmSelfOutput(config)
+        self.pruned_heads = set()
+        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+
+    def prune_heads(self, heads):
+        if len(heads) == 0:
+            return
+        heads, index = find_pruneable_heads_and_indices(
+            heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
+        )
+
+        # Prune linear layers
+        self.self.query = prune_linear_layer(self.self.query, index)
+        self.self.key = prune_linear_layer(self.self.key, index)
+        self.self.value = prune_linear_layer(self.self.value, index)
+        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
+
+        # Update hyper params and store pruned heads
+        self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
+        self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
+        self.pruned_heads = self.pruned_heads.union(heads)
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask=None,
+        head_mask=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        **kwargs: Unpack[TransformersKwargs],
+    ):
+        hidden_states_ln = self.LayerNorm(hidden_states)
+        attn_output, _ = self.self(
+            hidden_states_ln,
+            attention_mask=attention_mask,
+            head_mask=head_mask,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_attention_mask,
+            **kwargs,
+        )
+        attn_output = self.output(attn_output, hidden_states)
+        return attn_output
+
+
+class EsmIntermediate(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.dense(hidden_states)
+        hidden_states = gelu(hidden_states)
+        return hidden_states
+
+
+class EsmOutput(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self, hidden_states, input_tensor):
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = hidden_states + input_tensor
+        return hidden_states
+
+
+class EsmLayer(GradientCheckpointingLayer):
+    def __init__(self, config):
+        super().__init__()
+        self.chunk_size_feed_forward = config.chunk_size_feed_forward
+        self.seq_len_dim = 1
+        self.attention = EsmAttention(config)
+        self.is_decoder = config.is_decoder
+        self.add_cross_attention = config.add_cross_attention
+        if self.add_cross_attention:
+            if not self.is_decoder:
+                raise RuntimeError(f"{self} should be used as a decoder model if cross attention is added")
+            self.crossattention = EsmAttention(config, is_cross_attention=True)
+        self.intermediate = EsmIntermediate(config)
+        self.output = EsmOutput(config)
+        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask=None,
+        head_mask=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        **kwargs: Unpack[TransformersKwargs],
+    ):
+        attention_output = self.attention(
+            hidden_states,
+            attention_mask=attention_mask,
+            head_mask=head_mask,
+            **kwargs,
+        )
+
+        if self.is_decoder and encoder_hidden_states is not None:
+            if not hasattr(self, "crossattention"):
+                raise AttributeError(
+                    f"If `encoder_hidden_states` are passed, {self} has to be instantiated"
+                    " with cross-attention layers by setting `config.add_cross_attention=True`"
+                )
+
+            attention_output = self.crossattention(
+                attention_output,
+                attention_mask=attention_mask,
+                head_mask=head_mask,
+                encoder_hidden_states=encoder_hidden_states,
+                encoder_attention_mask=encoder_attention_mask,
+                **kwargs,
+            )
+
+        layer_output = self.feed_forward_chunk(attention_output)
+        return layer_output
+
+    def feed_forward_chunk(self, attention_output):
+        attention_output_ln = self.LayerNorm(attention_output)
+        intermediate_output = self.intermediate(attention_output_ln)
+        layer_output = self.output(intermediate_output, attention_output)
+        return layer_output
+
+
+class EsmEncoder(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.layer = nn.ModuleList([EsmLayer(config) for _ in range(config.num_hidden_layers)])
+        self.emb_layer_norm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.gradient_checkpointing = False
+
+    @can_return_tuple
+    def forward(
+        self,
+        hidden_states,
+        attention_mask=None,
+        head_mask=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        **kwargs: Unpack[TransformersKwargs],
+    ):
+        for i, layer_module in enumerate(self.layer):
+            layer_head_mask = head_mask[i] if head_mask is not None else None
+            hidden_states = layer_module(
+                hidden_states,
+                attention_mask=attention_mask,
+                head_mask=layer_head_mask,
+                encoder_hidden_states=encoder_hidden_states,
+                encoder_attention_mask=encoder_attention_mask,
+                **kwargs,
+            )
+
+        if self.emb_layer_norm_after:
+            hidden_states = self.emb_layer_norm_after(hidden_states)
+
+        return BaseModelOutputWithCrossAttentions(last_hidden_state=hidden_states)
+
+
+# Copied from transformers.models.bert.modeling_bert.BertPooler
+class EsmPooler(nn.Module):
+    # Pools a sequence by passing the first token's hidden state through a dense layer and tanh.
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        self.activation = nn.Tanh()
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        # We "pool" the model by simply taking the hidden state corresponding
+        # to the first token.
+        first_token_tensor = hidden_states[:, 0]
+        pooled_output = self.dense(first_token_tensor)
+        pooled_output = self.activation(pooled_output)
+        return pooled_output
+
+
+@auto_docstring
+class EsmPreTrainedModel(PreTrainedModel):
+    # Common base class for the ESM models in this file: class-level flags, weight initialization,
+    # and a `get_output_embeddings` default that returns None.
+    config: EsmConfig
+    base_model_prefix = "esm"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["EsmLayer", "EsmFoldTriangularSelfAttentionBlock", "EsmEmbeddings"]
+    _keys_to_ignore_on_load_unexpected = ["position_embeddings.weight"]
+    _supports_flash_attn = True
+    _supports_sdpa = True
+    _supports_flex_attn = True
+    _supports_attention_backend = True
+
+    # Maps output names to the modules they are recorded from. Attention weights are taken from index 1 of
+    # the `EsmSelfAttention` output, separately for the "attention" and "crossattention" sub-modules.
+    _can_record_outputs = {
+        "hidden_states": EsmLayer,
+        "attentions": [OutputRecorder(EsmSelfAttention, index=1, layer_name="attention")],
+        "cross_attentions": [
+            OutputRecorder(EsmSelfAttention, index=1, layer_name="crossattention"),
+        ],
+    }
+
+    # Copied from transformers.models.bert.modeling_bert.BertPreTrainedModel._init_weights with BertLMPredictionHead->EsmLMHead
+    def _init_weights(self, module):
+        """Initialize the weights"""
+        if isinstance(module, nn.Linear):
+            # Slightly different from the TF version which uses truncated_normal for initialization
+            # cf https://github.com/pytorch/pytorch/pull/5617
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            # The padding row, if any, is reset to zeros after the normal init.
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+        elif isinstance(module, nn.LayerNorm):
+            # LayerNorm starts as the identity transform: zero bias, unit weight.
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+        elif isinstance(module, EsmLMHead):
+            # The LM head keeps its vocabulary bias as a separate parameter; zero it.
+            module.bias.data.zero_()
+
+    def get_output_embeddings(self):
+        # NOTE: get_output_embeddings() must return None to prevent accidental weight tying.
+        # See e.g. https://github.com/huggingface/transformers/pull/39339#discussion_r2219126400
+        return None
+
+
+@auto_docstring
+class EsmModel(EsmPreTrainedModel):
+    """
+
+    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
+    cross-attention is added between the self-attention layers, following the architecture described in [Attention is
+    all you need](https://huggingface.co/papers/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
+    Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.
+
+    To behave as a decoder the model needs to be initialized with the `is_decoder` argument of the configuration set
+    to `True`. To be used in a Seq2Seq model, the model needs to be initialized with both `is_decoder` argument and
+    `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.
+    """
+
+    def __init__(self, config, add_pooling_layer=True):
+        r"""
+        add_pooling_layer (bool, *optional*, defaults to `True`):
+            Whether to add a pooling layer
+        """
+        super().__init__(config)
+        self.config = config
+
+        self.embeddings = EsmEmbeddings(config)
+        self.encoder = EsmEncoder(config)
+
+        self.pooler = EsmPooler(config) if add_pooling_layer else None
+
+        # The contact head takes one input feature per (layer, head) pair.
+        self.contact_head = EsmContactPredictionHead(
+            in_features=config.num_hidden_layers * config.num_attention_heads, bias=True
+        )
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        # Returns the word embedding module of the embeddings block.
+        return self.embeddings.word_embeddings
+
+    def set_input_embeddings(self, value):
+        # Replaces the word embedding module of the embeddings block.
+        self.embeddings.word_embeddings = value
+
+    def _prune_heads(self, heads_to_prune):
+        """
+        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
+        class PreTrainedModel
+        """
+        for layer, heads in heads_to_prune.items():
+            self.encoder.layer[layer].attention.prune_heads(heads)
+
+    @check_model_inputs
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]:
+        r"""
+        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+            Indices of input sequence tokens in the vocabulary.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+            config.max_position_embeddings - 1]`.
+
+            [What are position IDs?](../glossary#position-ids)
+        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+            model's internal embedding lookup matrix.
+        """
+        # Exactly one of `input_ids` / `inputs_embeds` must be provided (both or neither is an error).
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+        # When `inputs_embeds` is passed it goes to the encoder as-is; the embeddings module is not called.
+        if inputs_embeds is None:
+            inputs_embeds = self.embeddings(
+                input_ids=input_ids,
+                position_ids=position_ids,
+            )
+
+        # For flash_attention_2 the mask is left exactly as the caller passed it (possibly None);
+        # every other implementation gets a default all-ones mask and the extended mask.
+        if self.config._attn_implementation != "flash_attention_2":
+            batch_size, seq_length = inputs_embeds.shape[:-1]
+            if attention_mask is None:
+                attention_mask = torch.ones(((batch_size, seq_length)), device=inputs_embeds.device)
+
+            attention_mask: torch.Tensor = self.get_extended_attention_mask(
+                attention_mask, input_shape=(batch_size, seq_length)
+            )
+
+        # If a 2D or 3D attention mask is provided for the cross-attention
+        # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
+        if self.config.is_decoder and encoder_hidden_states is not None:
+            encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
+            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
+            if encoder_attention_mask is None:
+                encoder_attention_mask = torch.ones(encoder_hidden_shape, device=inputs_embeds.device)
+            encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
+        else:
+            encoder_extended_attention_mask = None
+
+        # Prepare head mask if needed
+        # 1.0 in head_mask indicate we keep the head
+        # attention_probs has shape bsz x n_heads x N x N
+        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
+        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
+        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
+
+        encoder_outputs = self.encoder(
+            inputs_embeds,
+            attention_mask=attention_mask,
+            head_mask=head_mask,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_extended_attention_mask,
+            **kwargs,
+        )
+        sequence_output = encoder_outputs[0]
+        # The pooled output is None when the model was built with `add_pooling_layer=False`.
+        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None
+
+        return BaseModelOutputWithPoolingAndCrossAttentions(
+            last_hidden_state=sequence_output,
+            pooler_output=pooled_output,
+        )
+
+    def predict_contacts(self, tokens, attention_mask):
+        # Runs the model with attention outputs enabled and feeds the stacked, padding-masked
+        # attention maps to the contact prediction head.
+        attns = self(tokens, attention_mask=attention_mask, return_dict=True, output_attentions=True).attentions
+        attns = torch.stack(attns, dim=1)  # Matches the original model layout
+        # In the original model, attentions for padding tokens are completely zeroed out.
+        # This makes no difference most of the time because the other tokens won't attend to them,
+        # but it does for the contact prediction task, which takes attentions as input,
+        # so we have to mimic that here.
+        # The mask is applied twice: broadcast along the last axis, then along the second-to-last axis.
+        attns *= attention_mask.unsqueeze(1).unsqueeze(2).unsqueeze(3)
+        attns *= attention_mask.unsqueeze(1).unsqueeze(2).unsqueeze(4)
+        return self.contact_head(tokens, attns)
+
+
+@auto_docstring
+class EsmForMaskedLM(EsmPreTrainedModel):
+    _tied_weights_keys = ["lm_head.decoder.weight"]
+
+    def __init__(self, config):
+        super().__init__(config)
+
+        if config.is_decoder:
+            logger.warning(
+                "If you want to use `EsmForMaskedLM` make sure `config.is_decoder=False` for "
+                "bi-directional self-attention."
+            )
+
+        self.esm = EsmModel(config, add_pooling_layer=False)
+        self.lm_head = EsmLMHead(config)
+
+        self.init_weights()
+
+        self.post_init()
+
+    def get_output_embeddings(self):
+        return self.lm_head.decoder
+
+    def set_output_embeddings(self, new_embeddings):
+        self.lm_head.decoder = new_embeddings
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> Union[tuple, MaskedLMOutput]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
+            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
+            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
+        """
+
+        outputs = self.esm(
+            input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_attention_mask,
+            **kwargs,
+        )
+        sequence_output = outputs[0]
+        prediction_scores = self.lm_head(sequence_output)
+
+        masked_lm_loss = None
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()
+
+            labels = labels.to(prediction_scores.device)
+            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
+
+        return MaskedLMOutput(
+            loss=masked_lm_loss,
+            logits=prediction_scores,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+    def predict_contacts(self, tokens, attention_mask):
+        return self.esm.predict_contacts(tokens, attention_mask=attention_mask)
+
+
+class EsmLMHead(nn.Module):
+    """ESM Head for masked language modeling."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+
+        self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+        self.bias = nn.Parameter(torch.zeros(config.vocab_size))
+
+    def forward(self, features, **kwargs):
+        x = self.dense(features)
+        x = gelu(x)
+        x = self.layer_norm(x)
+
+        # project back to size of vocabulary with bias
+        x = self.decoder(x) + self.bias
+        return x
+
+
+@auto_docstring(
+    custom_intro="""
+    ESM Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
+    output) e.g. for GLUE tasks.
+    """
+)
+class EsmForSequenceClassification(EsmPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+        self.config = config
+
+        # The backbone is built without its pooler; the classification head reads the first token itself.
+        self.esm = EsmModel(config, add_pooling_layer=False)
+        self.classifier = EsmClassificationHead(config)
+
+        # NOTE(review): `post_init()` presumably also runs weight initialization - confirm whether the
+        # explicit `init_weights()` call is redundant.
+        self.init_weights()
+
+        self.post_init()
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> Union[tuple, SequenceClassifierOutput]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+        """
+
+        outputs = self.esm(
+            input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            **kwargs,
+        )
+        sequence_output = outputs[0]
+        logits = self.classifier(sequence_output)
+
+        loss = None
+        if labels is not None:
+            labels = labels.to(logits.device)
+
+            # Infer the problem type once from `num_labels` and the label dtype, and cache it on the config
+            # so later calls reuse the same choice.
+            if self.config.problem_type is None:
+                if self.num_labels == 1:
+                    self.config.problem_type = "regression"
+                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
+                    self.config.problem_type = "single_label_classification"
+                else:
+                    self.config.problem_type = "multi_label_classification"
+
+            if self.config.problem_type == "regression":
+                loss_fct = MSELoss()
+                if self.num_labels == 1:
+                    # Single-target regression: squeeze both sides so their shapes line up.
+                    loss = loss_fct(logits.squeeze(), labels.squeeze())
+                else:
+                    loss = loss_fct(logits, labels)
+            elif self.config.problem_type == "single_label_classification":
+                loss_fct = CrossEntropyLoss()
+                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+            elif self.config.problem_type == "multi_label_classification":
+                loss_fct = BCEWithLogitsLoss()
+                loss = loss_fct(logits, labels)
+
+        return SequenceClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+@auto_docstring
+class EsmForTokenClassification(EsmPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+
+        self.esm = EsmModel(config, add_pooling_layer=False)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
+
+        self.init_weights()
+
+        self.post_init()
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> Union[tuple, TokenClassifierOutput]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
+        """
+
+        outputs = self.esm(
+            input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            **kwargs,
+        )
+
+        sequence_output = outputs[0]
+
+        sequence_output = self.dropout(sequence_output)
+        logits = self.classifier(sequence_output)
+
+        loss = None
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()
+
+            labels = labels.to(logits.device)
+            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+
+        return TokenClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+class EsmClassificationHead(nn.Module):
+    """Head for sentence-level classification tasks."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.out_proj = nn.Linear(config.hidden_size, config.num_labels)
+
+    def forward(self, features, **kwargs):
+        x = features[:, 0, :]  # take <s> token (equiv. to [CLS])
+        x = self.dropout(x)
+        x = self.dense(x)
+        x = torch.tanh(x)
+        x = self.dropout(x)
+        x = self.out_proj(x)
+        return x
+
+
+def create_position_ids_from_input_ids(input_ids, padding_idx):
+    """
+    Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
+    are ignored. This is modified from fairseq's `utils.make_positions`.
+
+    Args:
+        x: torch.Tensor x:
+
+    Returns: torch.Tensor
+    """
+    # The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA.
+    mask = input_ids.ne(padding_idx).int()
+    incremental_indices = torch.cumsum(mask, dim=1).type_as(mask) * mask
+    return incremental_indices.long() + padding_idx
+
+
+# Names exported by a star-import of this module.
+__all__ = [
+    "EsmForMaskedLM",
+    "EsmForSequenceClassification",
+    "EsmForTokenClassification",
+    "EsmModel",
+    "EsmPreTrainedModel",
+]
diff --git a/phivenv/Lib/site-packages/transformers/models/esm/modeling_esmfold.py b/phivenv/Lib/site-packages/transformers/models/esm/modeling_esmfold.py
new file mode 100644
index 0000000000000000000000000000000000000000..dbff29fade87b7a656da760beb8dfda4a6a14830
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/esm/modeling_esmfold.py
@@ -0,0 +1,2309 @@
+# coding=utf-8
+# Copyright 2022 Meta and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import math
+import sys
+from collections.abc import Sequence
+from dataclasses import dataclass
+from functools import partial
+from typing import Callable, Optional, Union
+
+import numpy as np
+import torch
+import torch.nn as nn
+from torch.nn import LayerNorm
+
+from ...integrations.deepspeed import is_deepspeed_available
+from ...modeling_outputs import ModelOutput
+from ...utils import (
+    ContextManagers,
+    auto_docstring,
+    is_scipy_available,
+    logging,
+)
+from .modeling_esm import EsmModel, EsmPreTrainedModel
+from .openfold_utils import (
+    OFProtein,
+    Rigid,
+    Rotation,
+    atom14_to_atom37,
+    chunk_layer,
+    compute_predicted_aligned_error,
+    compute_tm,
+    frames_and_literature_positions_to_atom14_pos,
+    make_atom14_masks,
+    residue_constants,
+    to_pdb,
+    torsion_angles_to_frames,
+)
+
+
+logger = logging.get_logger(__name__)
+
+
+# Output container for the protein folding model. Every field is optional and defaults to None.
+# NOTE(review): the intro string below links to this output class itself; it presumably should name the model
+# class that returns this output -- confirm against the model definition before changing the string.
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Output type of [`EsmForProteinFoldingOutput`].
+    """
+)
+class EsmForProteinFoldingOutput(ModelOutput):
+    r"""
+    frames (`torch.FloatTensor`):
+        Output frames.
+    sidechain_frames (`torch.FloatTensor`):
+        Output sidechain frames.
+    unnormalized_angles (`torch.FloatTensor`):
+        Predicted unnormalized backbone and side chain torsion angles.
+    angles (`torch.FloatTensor`):
+        Predicted backbone and side chain torsion angles.
+    positions (`torch.FloatTensor`):
+        Predicted positions of the backbone and side chain atoms.
+    states (`torch.FloatTensor`):
+        Hidden states from the protein folding trunk.
+    s_s (`torch.FloatTensor`):
+        Per-residue embeddings derived by concatenating the hidden states of each layer of the ESM-2 LM stem.
+    s_z (`torch.FloatTensor`):
+        Pairwise residue embeddings.
+    distogram_logits (`torch.FloatTensor`):
+        Input logits to the distogram used to compute residue distances.
+    lm_logits (`torch.FloatTensor`):
+        Logits output by the ESM-2 protein language model stem.
+    aatype (`torch.FloatTensor`):
+        Input amino acids (AlphaFold2 indices).
+    atom14_atom_exists (`torch.FloatTensor`):
+        Whether each atom exists in the atom14 representation.
+    residx_atom14_to_atom37 (`torch.FloatTensor`):
+        Mapping between atoms in the atom14 and atom37 representations.
+    residx_atom37_to_atom14 (`torch.FloatTensor`):
+        Mapping between atoms in the atom37 and atom14 representations.
+    atom37_atom_exists (`torch.FloatTensor`):
+        Whether each atom exists in the atom37 representation.
+    residue_index (`torch.FloatTensor`):
+        The index of each residue in the protein chain. Unless internal padding tokens are used, this will just be
+        a sequence of integers from 0 to `sequence_length`.
+    lddt_head (`torch.FloatTensor`):
+        Raw outputs from the lddt head used to compute plddt.
+    plddt (`torch.FloatTensor`):
+        Per-residue confidence scores. Regions of low confidence may indicate areas where the model's prediction is
+        uncertain, or where the protein structure is disordered.
+    ptm_logits (`torch.FloatTensor`):
+        Raw logits used for computing ptm.
+    ptm (`torch.FloatTensor`):
+        TM-score output representing the model's high-level confidence in the overall structure.
+    aligned_confidence_probs (`torch.FloatTensor`):
+        Per-residue confidence scores for the aligned structure.
+    predicted_aligned_error (`torch.FloatTensor`):
+        Predicted error between the model's prediction and the ground truth.
+    max_predicted_aligned_error (`torch.FloatTensor`):
+        Per-sample maximum predicted error.
+    """
+
+    # Field order is part of the interface; keep it in sync with the docstring above.
+    frames: Optional[torch.FloatTensor] = None
+    sidechain_frames: Optional[torch.FloatTensor] = None
+    unnormalized_angles: Optional[torch.FloatTensor] = None
+    angles: Optional[torch.FloatTensor] = None
+    positions: Optional[torch.FloatTensor] = None
+    states: Optional[torch.FloatTensor] = None
+    s_s: Optional[torch.FloatTensor] = None
+    s_z: Optional[torch.FloatTensor] = None
+    distogram_logits: Optional[torch.FloatTensor] = None
+    lm_logits: Optional[torch.FloatTensor] = None
+    aatype: Optional[torch.FloatTensor] = None
+    atom14_atom_exists: Optional[torch.FloatTensor] = None
+    residx_atom14_to_atom37: Optional[torch.FloatTensor] = None
+    residx_atom37_to_atom14: Optional[torch.FloatTensor] = None
+    atom37_atom_exists: Optional[torch.FloatTensor] = None
+    residue_index: Optional[torch.FloatTensor] = None
+    lddt_head: Optional[torch.FloatTensor] = None
+    plddt: Optional[torch.FloatTensor] = None
+    ptm_logits: Optional[torch.FloatTensor] = None
+    ptm: Optional[torch.FloatTensor] = None
+    aligned_confidence_probs: Optional[torch.FloatTensor] = None
+    predicted_aligned_error: Optional[torch.FloatTensor] = None
+    max_predicted_aligned_error: Optional[torch.FloatTensor] = None
+
+
+def is_fp16_enabled(device_type):
+    """
+    Tell whether autocast is currently enabled with float16 as its target dtype.
+
+    Args:
+        device_type: device type string forwarded to `torch.get_autocast_dtype` when that function exists; older
+            torch builds fall back to `torch.get_autocast_gpu_dtype()`, which takes no device argument.
+    """
+    # Autocast world
+    if hasattr(torch, "get_autocast_dtype"):
+        target_dtype = torch.get_autocast_dtype(device_type)
+    else:
+        target_dtype = torch.get_autocast_gpu_dtype()
+
+    return target_dtype == torch.float16 and torch.is_autocast_enabled()
+
+
+def is_deepspeed_initialized():
+    """
+    Report whether DeepSpeed is installed and has been initialized.
+
+    Returns:
+        `False` when DeepSpeed is not available, or when querying it fails (the query is not available in all
+        DeepSpeed versions); otherwise the value of `deepspeed.utils.is_initialized()`.
+    """
+    # Only query DeepSpeed when it can actually be imported. With the check the other way round this function
+    # could never return True: an available DeepSpeed returned early, a missing one failed the import below.
+    if not is_deepspeed_available():
+        return False
+
+    try:
+        import deepspeed
+
+        # This is not available in all DeepSpeed versions.
+        return deepspeed.utils.is_initialized()
+    except Exception:
+        # Best effort: any failure to query DeepSpeed is treated as "not initialized".
+        return False
+
+
+def collate_dense_tensors(samples: list[torch.Tensor], pad_v: float = 0) -> torch.Tensor:
+    """
+    Stack tensors of equal rank into one batch, padding every dimension up to the largest size seen.
+
+    Given N tensors shaped
+        [(d_11, ..., d_1K),
+         (d_21, ..., d_2K), ..., (d_N1, ..., d_NK)]
+    the result has shape (N, max_i d_i1, ..., max_i d_iK); each sample sits at the origin of its slot and the
+    remaining entries hold `pad_v`. The result uses the dtype of the first sample.
+
+    An empty list yields an empty `torch.Tensor()`. Samples of differing rank raise `RuntimeError`.
+    """
+    if not len(samples):
+        return torch.Tensor()
+
+    ranks = {sample.dim() for sample in samples}
+    if len(ranks) != 1:
+        raise RuntimeError(f"Samples has varying dimensions: {[x.dim() for x in samples]}")
+
+    # Unpacking a one-element collection: assumes all samples are on the same device.
+    (device,) = tuple({sample.device for sample in samples})
+    padded_shape = [max(sizes) for sizes in zip(*[sample.shape for sample in samples])]
+
+    batch = torch.empty(len(samples), *padded_shape, dtype=samples[0].dtype, device=device)
+    batch.fill_(pad_v)
+    for index, sample in enumerate(samples):
+        region = tuple(slice(0, size) for size in sample.shape)
+        batch[index][region] = sample
+    return batch
+
+
+def flatten_final_dims(t: torch.Tensor, no_dims: int):
+    """Collapse the last `no_dims` dimensions of `t` into a single trailing dimension."""
+    leading_shape = t.shape[:-no_dims]
+    return t.reshape(leading_shape + (-1,))
+
+
+def permute_final_dims(tensor: torch.Tensor, inds: list[int]):
+    """
+    Reorder the last `len(inds)` dimensions of `tensor` according to `inds`, leaving the leading dimensions in place.
+
+    `inds` is expressed relative to the trailing block, e.g. `(2, 0, 1)` moves the last dimension in front of the two
+    before it.
+    """
+    offset = -len(inds)
+    leading = list(range(len(tensor.shape[:offset])))
+    trailing = [offset + i for i in inds]
+    return tensor.permute(leading + trailing)
+
+
+def dict_multimap(fn, dicts):
+    """
+    Apply `fn` key-wise across a list of dicts that share the layout of `dicts[0]`.
+
+    For every key of the first dict, the values found under that key in all dicts are gathered into a list. Nested
+    dicts are processed recursively; any other value list is passed to `fn`.
+    """
+    template = dicts[0]
+    combined = {}
+    for key, sample_value in template.items():
+        gathered = [entry[key] for entry in dicts]
+        combined[key] = dict_multimap(fn, gathered) if isinstance(sample_value, dict) else fn(gathered)
+
+    return combined
+
+
+def trunc_normal_init_(weights, scale=1.0, fan="fan_in"):
+    """
+    Fill `weights` in place with zero-mean samples from a normal distribution truncated at +/- 2 standard
+    deviations.
+
+    Args:
+        weights: weight tensor with at least two dimensions; `weights.shape[1]` is used as the fan-in.
+        scale: variance scale. The target variance is `scale / max(1, weights.shape[1])`.
+        fan: accepted for interface compatibility but not used; the fan-in is always taken from `shape[1]`.
+
+    With scipy installed the samples come from `scipy.stats.truncnorm`, with the standard deviation corrected for
+    the truncation. Without scipy, a warning is logged and a clamped normal distribution is used as an
+    approximation.
+    """
+    shape = weights.shape
+    scale = scale / max(1, shape[1])
+
+    if not is_scipy_available():
+        logger.warning(
+            "This init requires scipy, but scipy was not found, default to an approximation that might not be"
+            " equivalent."
+        )
+        std = math.sqrt(scale)
+        # The clamp has to be in place (`clamp_`), otherwise its result is discarded and nothing is truncated.
+        # The bounds are symmetric to mirror `truncnorm(a=-2, b=2)` in the scipy branch. `no_grad` is needed
+        # because `weights` may be a leaf that requires grad.
+        with torch.no_grad():
+            torch.nn.init.normal_(weights, std=std).clamp_(min=-2.0 * std, max=2.0 * std)
+
+    else:
+        from scipy.stats import truncnorm
+
+        # Dividing by the std of the truncated unit normal makes the drawn samples have std sqrt(scale).
+        std = math.sqrt(scale) / truncnorm.std(a=-2, b=2, loc=0, scale=1)
+        samples = truncnorm.rvs(a=-2, b=2, loc=0, scale=std, size=weights.numel())
+        samples = np.reshape(samples, shape)
+        weights.copy_(torch.tensor(samples, device=weights.device))
+
+
+def ipa_point_weights_init_(weights):
+    """Set every entry of `weights` in place to softplus^-1(1), i.e. the input for which softplus returns 1."""
+    softplus_inverse_1 = 0.541324854612918
+    with torch.no_grad():
+        weights.fill_(softplus_inverse_1)
+
+
+class EsmFoldLinear(nn.Linear):
+    """
+    A Linear layer with built-in nonstandard initializations. Called just like torch.nn.Linear.
+
+    Implements the initializers in 1.11.4, plus some additional ones found in the code.
+
+    The constructor itself only zeroes the bias and records `init` / `init_fn` on the instance.
+    """
+
+    def __init__(
+        self,
+        in_dim: int,
+        out_dim: int,
+        bias: bool = True,
+        init: str = "default",
+        init_fn: Optional[Callable[[torch.Tensor, torch.Tensor], None]] = None,
+    ):
+        """
+        Args:
+            in_dim:
+                The final dimension of inputs to the layer
+            out_dim:
+                The final dimension of layer outputs
+            bias:
+                Whether to learn an additive bias. True by default
+            init:
+                The initializer to use. Choose from:
+
+                - "default": LeCun fan-in truncated normal initialization
+                - "relu": He initialization w/ truncated normal distribution
+                - "glorot": Fan-average Glorot uniform initialization
+                - "gating": Weights=0, Bias=1
+                - "normal": Normal initialization with std=1/sqrt(fan_in)
+                - "final": Weights=0, Bias=0
+
+                Overridden by init_fn if the latter is not None.
+            init_fn:
+                A custom initializer taking weight and bias as inputs. Overrides init if not None.
+
+        Raises:
+            ValueError: if `init` is not one of the names listed above.
+        """
+        super().__init__(in_dim, out_dim, bias=bias)
+
+        # nn.Linear only creates a bias parameter when `bias` is truthy.
+        if self.bias is not None:
+            with torch.no_grad():
+                self.bias.zero_()
+
+        self.init = init
+        self.init_fn = init_fn
+
+        known_inits = ("default", "relu", "glorot", "gating", "normal", "final")
+        if init not in known_inits:
+            raise ValueError("Invalid init string.")
+
+
+class EsmFoldLayerNorm(nn.Module):
+    """
+    Layer normalization over the last dimension with learnable weight and bias.
+
+    For bfloat16 inputs (when DeepSpeed is not initialized) the normalization runs with CUDA autocast disabled and
+    the parameters cast to bfloat16; every other case calls `layer_norm` with the parameters as stored.
+    """
+
+    def __init__(self, c_in, eps=1e-5):
+        super().__init__()
+
+        # Stored as a 1-tuple because it is passed as `normalized_shape` to `layer_norm`.
+        self.c_in = (c_in,)
+        self.eps = eps
+
+        self.weight = nn.Parameter(torch.ones(c_in))
+        self.bias = nn.Parameter(torch.zeros(c_in))
+
+    def forward(self, x):
+        dtype = x.dtype
+        if dtype is not torch.bfloat16 or is_deepspeed_initialized():
+            return nn.functional.layer_norm(x, self.c_in, self.weight, self.bias, self.eps)
+
+        with torch.cuda.amp.autocast(enabled=False):
+            weight = self.weight.to(dtype=dtype)
+            bias = self.bias.to(dtype=dtype)
+            return nn.functional.layer_norm(x, self.c_in, weight, bias, self.eps)
+
+
+@torch.jit.ignore
+def softmax_no_cast(t: torch.Tensor, dim: int = -1) -> torch.Tensor:
+    """
+    Softmax, but without automatic casting to fp32 when the input is of type bfloat16
+
+    bfloat16 inputs are handled with CUDA autocast disabled, unless DeepSpeed is initialized; all other inputs go
+    through a plain softmax call.
+    """
+    if t.dtype is not torch.bfloat16 or is_deepspeed_initialized():
+        return torch.nn.functional.softmax(t, dim=dim)
+
+    with torch.cuda.amp.autocast(enabled=False):
+        return torch.nn.functional.softmax(t, dim=dim)
+
+
+class EsmFoldAttention(nn.Module):
+    """
+    Standard multi-head attention using AlphaFold's default layer initialization. Allows multiple bias vectors.
+    """
+
+    def __init__(
+        self,
+        c_q: int,
+        c_k: int,
+        c_v: int,
+        c_hidden: int,
+        no_heads: int,
+        gating: bool = True,
+    ):
+        """
+        Args:
+            c_q:
+                Input dimension of query data
+            c_k:
+                Input dimension of key data
+            c_v:
+                Input dimension of value data
+            c_hidden:
+                Per-head hidden dimension
+            no_heads:
+                Number of attention heads
+            gating:
+                Whether the output should be gated using query data
+        """
+        super().__init__()
+
+        self.c_q = c_q
+        self.c_k = c_k
+        self.c_v = c_v
+        self.c_hidden = c_hidden
+        self.no_heads = no_heads
+        self.gating = gating
+
+        # Every projection is `c_hidden * no_heads` wide, i.e. `c_hidden` acts as the per-head width here.
+        inner_dim = self.c_hidden * self.no_heads
+
+        self.linear_q = EsmFoldLinear(self.c_q, inner_dim, bias=False, init="glorot")
+        self.linear_k = EsmFoldLinear(self.c_k, inner_dim, bias=False, init="glorot")
+        self.linear_v = EsmFoldLinear(self.c_v, inner_dim, bias=False, init="glorot")
+        self.linear_o = EsmFoldLinear(inner_dim, self.c_q, init="final")
+
+        # The gate projection only exists when gating is requested.
+        self.linear_g = EsmFoldLinear(self.c_q, inner_dim, init="gating") if self.gating else None
+
+        self.sigmoid = nn.Sigmoid()
+
+    def _prep_qkv(self, q_x: torch.Tensor, kv_x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        """Project the inputs and split them into heads; the query is pre-scaled by 1/sqrt(c_hidden)."""
+        # [*, Q/K/V, H * C_hidden] -> [*, Q/K, H, C_hidden] -> [*, H, Q/K, C_hidden]
+        q = self.linear_q(q_x)
+        q = q.view(q.shape[:-1] + (self.no_heads, -1)).transpose(-2, -3)
+
+        k = self.linear_k(kv_x)
+        k = k.view(k.shape[:-1] + (self.no_heads, -1)).transpose(-2, -3)
+
+        v = self.linear_v(kv_x)
+        v = v.view(v.shape[:-1] + (self.no_heads, -1)).transpose(-2, -3)
+
+        # Scaling is applied in place on the query.
+        q /= math.sqrt(self.c_hidden)
+
+        return q, k, v
+
+    def _wrap_up(self, o: torch.Tensor, q_x: torch.Tensor) -> torch.Tensor:
+        """Apply the optional sigmoid gate, merge the heads and project back to `c_q` channels."""
+        if self.linear_g is not None:
+            # [*, Q, H * C_hidden] -> [*, Q, H, C_hidden]
+            gate = self.sigmoid(self.linear_g(q_x))
+            gate = gate.view(gate.shape[:-1] + (self.no_heads, -1))
+            o = o * gate
+
+        # [*, Q, H, C_hidden] -> [*, Q, H * C_hidden] -> [*, Q, C_q]
+        merged = flatten_final_dims(o, 2)
+        return self.linear_o(merged)
+
+    def forward(
+        self,
+        q_x: torch.Tensor,
+        kv_x: torch.Tensor,
+        biases: Optional[list[torch.Tensor]] = None,
+        use_memory_efficient_kernel: bool = False,
+        use_lma: bool = False,
+        lma_q_chunk_size: int = 1024,
+        lma_kv_chunk_size: int = 4096,
+        use_flash: bool = False,
+        flash_mask: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        """
+        Args:
+            q_x:
+                [*, Q, C_q] query data
+            kv_x:
+                [*, K, C_k] key data
+            biases:
+                List of biases that broadcast to [*, H, Q, K]
+            use_memory_efficient_kernel:
+                Whether to use a custom memory-efficient attention kernel. This should be the default choice for most.
+                If none of the "use_<...>" flags are True, a stock PyTorch implementation is used instead
+            use_lma:
+                Whether to use low-memory attention (Staats & Rabe 2021). If none of the "use_<...>" flags are True, a
+                stock PyTorch implementation is used instead
+            lma_q_chunk_size:
+                Query chunk size (for LMA)
+            lma_kv_chunk_size:
+                Key/Value chunk size (for LMA)
+            use_flash:
+                Flag for a flash-attention path; incompatible with `biases`
+            flash_mask:
+                Mask intended for the flash-attention path; not read by this method
+        Returns
+            [*, Q, C_q] attention update
+
+        Note: the "use_<...>" flags are only validated against each other here. Whatever their values, the
+        computation below is the stock matmul / softmax / matmul implementation.
+        """
+        if use_lma and (lma_q_chunk_size is None or lma_kv_chunk_size is None):
+            raise ValueError("If use_lma is specified, lma_q_chunk_size and lma_kv_chunk_size must be provided")
+
+        if use_flash and biases is not None:
+            raise ValueError("use_flash is incompatible with the bias option. For masking, use flash_mask instead")
+
+        if sum([use_memory_efficient_kernel, use_lma, use_flash]) > 1:
+            raise ValueError("Choose at most one alternative attention algorithm")
+
+        bias_terms = [] if biases is None else biases
+
+        # [*, H, Q/K, C_hidden]
+        query, key, value = self._prep_qkv(q_x, kv_x)
+
+        # [*, H, Q, K]
+        scores = torch.matmul(query, permute_final_dims(key, (1, 0)))
+        for bias in bias_terms:
+            # In place, so each bias must broadcast to the shape of `scores`.
+            scores += bias
+        attn = softmax_no_cast(scores, -1)
+
+        # [*, H, Q, C_hidden] -> [*, Q, H, C_hidden]
+        context = torch.matmul(attn, value).transpose(-2, -3)
+
+        return self._wrap_up(context, q_x)
+
+
+class EsmFoldTriangleAttention(nn.Module):
+    """
+    Self-attention over a pair representation, biased by the pair mask and by a learned per-head projection of the
+    (layer-normalized) input itself. With `starting=False` the two pair axes are swapped before and after attention.
+    """
+
+    def __init__(self, c_in, c_hidden, no_heads, starting=True, inf=1e9):
+        """
+        Args:
+            c_in:
+                Input channel dimension
+            c_hidden:
+                Overall hidden channel dimension (not per-head)
+            no_heads:
+                Number of attention heads
+            starting:
+                If False, the input and mask are transposed over the two pair axes before attention, and the result
+                is transposed back
+            inf:
+                Magnitude of the additive bias applied where the mask is 0
+        """
+        super().__init__()
+
+        self.c_in = c_in
+        self.c_hidden = c_hidden
+        self.no_heads = no_heads
+        self.starting = starting
+        self.inf = inf
+
+        self.layer_norm = LayerNorm(self.c_in)
+
+        # Produces one bias value per head for every pair position.
+        self.linear = EsmFoldLinear(c_in, self.no_heads, bias=False, init="normal")
+
+        self.mha = EsmFoldAttention(self.c_in, self.c_in, self.c_in, self.c_hidden, self.no_heads)
+
+    @torch.jit.ignore
+    def _chunk(
+        self,
+        x: torch.Tensor,
+        biases: list[torch.Tensor],
+        chunk_size: int,
+        use_memory_efficient_kernel: bool = False,
+        use_lma: bool = False,
+        inplace_safe: bool = False,
+    ) -> torch.Tensor:
+        """
+        Run `self.mha` through `chunk_layer` with `x` as both query and key/value input.
+
+        When `inplace_safe` is True, `x` itself is handed to `chunk_layer` as the output buffer.
+        """
+        attend = partial(self.mha, use_memory_efficient_kernel=use_memory_efficient_kernel, use_lma=use_lma)
+        attend_inputs = {"q_x": x, "kv_x": x, "biases": biases}
+        out_buffer = x if inplace_safe else None
+
+        return chunk_layer(
+            attend,
+            attend_inputs,
+            chunk_size=chunk_size,
+            no_batch_dims=len(x.shape[:-2]),
+            _out=out_buffer,
+        )
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        mask: Optional[torch.Tensor] = None,
+        chunk_size: Optional[int] = None,
+        use_memory_efficient_kernel: bool = False,
+        use_lma: bool = False,
+        inplace_safe: bool = False,
+    ) -> torch.Tensor:
+        """
+        Args:
+            x:
+                [*, I, J, C_in] input tensor (e.g. the pair representation)
+            mask:
+                [*, I, J] mask; defaults to all ones
+            chunk_size:
+                If not None, attention is evaluated through `_chunk` with this chunk size
+        Returns:
+            [*, I, J, C_in] output tensor
+        """
+        if mask is None:
+            # [*, I, J]
+            mask = x.new_ones(x.shape[:-1])
+
+        transposed = not self.starting
+        if transposed:
+            x = x.transpose(-2, -3)
+            mask = mask.transpose(-1, -2)
+
+        # [*, I, J, C_in]
+        x = self.layer_norm(x)
+
+        # [*, I, 1, 1, J]: 0 where mask == 1, -inf-like where mask == 0
+        mask_bias = (self.inf * (mask - 1))[..., :, None, None, :]
+
+        # [*, H, I, J] -> [*, 1, H, I, J]
+        triangle_bias = permute_final_dims(self.linear(x), (2, 0, 1)).unsqueeze(-4)
+
+        biases = [mask_bias, triangle_bias]
+
+        if chunk_size is None:
+            x = self.mha(
+                q_x=x, kv_x=x, biases=biases, use_memory_efficient_kernel=use_memory_efficient_kernel, use_lma=use_lma
+            )
+        else:
+            x = self._chunk(
+                x,
+                biases,
+                chunk_size,
+                use_memory_efficient_kernel=use_memory_efficient_kernel,
+                use_lma=use_lma,
+                inplace_safe=inplace_safe,
+            )
+
+        return x.transpose(-2, -3) if transposed else x
+
+
+class EsmFoldTriangleMultiplicativeUpdate(nn.Module):
+    """
+    Implements Algorithms 11 and 12.
+    """
+
+    def __init__(self, config, _outgoing=True):
+        """
+        Args:
+            config:
+                Configuration object; only `config.pairwise_state_dim` is read, and it is used as both the input
+                and the output width of every linear layer and layer norm created here
+            _outgoing:
+                Selects which of the two permutation patterns `_combine_projections` applies
+        """
+        super().__init__()
+        pair_dim = config.pairwise_state_dim
+        self._outgoing = _outgoing
+
+        # Projection ("_p") and gate ("_g") layers for the two operands a and b.
+        self.linear_a_p = EsmFoldLinear(pair_dim, pair_dim)
+        self.linear_a_g = EsmFoldLinear(pair_dim, pair_dim, init="gating")
+        self.linear_b_p = EsmFoldLinear(pair_dim, pair_dim)
+        self.linear_b_g = EsmFoldLinear(pair_dim, pair_dim, init="gating")
+        # Output gate and final projection.
+        self.linear_g = EsmFoldLinear(pair_dim, pair_dim, init="gating")
+        self.linear_z = EsmFoldLinear(pair_dim, pair_dim, init="final")
+
+        self.layer_norm_in = LayerNorm(pair_dim)
+        self.layer_norm_out = LayerNorm(pair_dim)
+
+        self.sigmoid = nn.Sigmoid()
+
+    def _combine_projections(
+        self, a: torch.Tensor, b: torch.Tensor, _inplace_chunk_size: Optional[int] = None
+    ) -> torch.Tensor:
+        """
+        Multiply the two projections channel by channel and return the product in channel-last layout.
+
+        Both inputs are moved to a channel-first layout (`b` with its two pair axes swapped relative to `a`; which
+        of the two is swapped depends on `self._outgoing`), matrix-multiplied, and permuted back.
+
+        When `_inplace_chunk_size` is given, the product is computed in chunks along the channel axis and written
+        into the permuted view of `a`, so the storage behind `a` is overwritten.
+        """
+        if self._outgoing:
+            a_order, b_order = (2, 0, 1), (2, 1, 0)
+        else:
+            a_order, b_order = (2, 1, 0), (2, 0, 1)
+        a = permute_final_dims(a, a_order)
+        b = permute_final_dims(b, b_order)
+
+        if _inplace_chunk_size is None:
+            product = torch.matmul(a, b)
+        else:
+            # To be replaced by torch vmap
+            for start in range(0, a.shape[-3], _inplace_chunk_size):
+                stop = start + _inplace_chunk_size
+                a[..., start:stop, :, :] = torch.matmul(
+                    a[..., start:stop, :, :],
+                    b[..., start:stop, :, :],
+                )
+
+            product = a
+
+        return permute_final_dims(product, (1, 2, 0))
+
+    def _inference_forward(
+        self,
+        z: torch.Tensor,
+        mask: Optional[torch.Tensor] = None,
+        inplace_chunk_size: Optional[int] = None,
+        with_add: bool = True,
+    ):
+        """
+        Args:
+            z:
+                A [*, N, N, C_z] pair representation
+            mask:
+                A [*, N, N] pair mask
+            inplace_chunk_size:
+                Size of chunks used in the main computation. Increase to trade memory for speed. If None, the update
+                is computed in one piece without chunking.
+            with_add:
+                If True, z is overwritten with (z + update). Otherwise, it is overwritten with (update).
+        Returns:
+            A reference to the overwritten z
+
+        More memory-efficient, inference-only version of the forward function. Uses in-place operations, fusion of the
+        addition that happens after this module in the Evoformer, a smidge of recomputation, and a cache of overwritten
+        values to lower peak memory consumption of this module from 5x the size of the input tensor z to 2.5x its size.
+        Useful for inference on extremely long sequences.
+
+        It works as follows. We will make reference to variables used in the default forward implementation below.
+        Naively, triangle multiplication attention requires the manifestation of 5 tensors the size of z: 1) z, the
+        "square" input tensor, 2) a, the first projection of z, 3) b, the second projection of z, 4) g, a z-sized mask,
+        and 5) a z-sized tensor for intermediate computations. For large N, this is prohibitively expensive; for
+        N=4000, for example, z is more than 8GB alone. To avoid this problem, we compute b, g, and all intermediate
+        tensors in small chunks, noting that the chunks required to compute a chunk of the output depend only on the
+        tensor a and corresponding vertical and horizontal chunks of z. This suggests an algorithm that loops over
+        pairs of chunks of z: hereafter "columns" and "rows" of z, even though each "column" and "row" in fact contains
+        inplace_chunk_size contiguous true columns and rows of z. Writing output chunks to a new tensor would bring
+        total memory consumption down to 3x the size of z. However, more memory can be saved by writing output chunks
+        directly to z in-place. WLOG, we choose to write output chunks vertically, overwriting the ith "column" of z at
+        the end of the ith iteration of the main loop. Despite this overwriting, the ith column is always one column
+        ahead of previously overwritten columns and can be recovered directly from z. After the first iteration,
+        however, the ith row of z is always at least partially overwritten. For this reason, we introduce the z-cache,
+        a tensor one-half the size of z. The z-cache initially contains the left half (2nd and 3rd quadrants) of z. For
+        0 < i < N/2, the missing left part of the ith row of z is recovered from this cache at the beginning of the ith
+        iteration. Once i exceeds n/2, the cache is "reoriented" to encompass the 3rd and 4th quadrants of z instead.
+        Though the 3rd quadrant of the original z is entirely overwritten at this point, it can be recovered from the
+        z-cache itself. Thereafter, the ith row of z can be recovered in its entirety from the reoriented z-cache.
+        After the final iteration, z has been completely overwritten and contains the triangular multiplicative update.
+        If with_add is True, it instead contains the sum of z and the triangular multiplicative update. In either case,
+        peak memory consumption is just 2.5x the size of z, disregarding memory used for chunks and other small
+        variables.
+        """
+        if mask is None:
+            mask = z.new_ones(z.shape[:-1])
+
+        # [*, N, N, 1] so the mask broadcasts over the channel dimension
+        mask = mask.unsqueeze(-1)
+
+        def compute_projection_helper(pair, mask, a=True):
+            # Gated, masked projection of `pair`, returned channel-first: [*, C, rows, cols]
+            if a:
+                linear_g = self.linear_a_g
+                linear_p = self.linear_a_p
+            else:
+                linear_g = self.linear_b_g
+                linear_p = self.linear_b_p
+
+            pair = self.layer_norm_in(pair)
+            p = linear_g(pair)
+            p.sigmoid_()
+            p *= linear_p(pair)
+            p *= mask
+            p = permute_final_dims(p, (2, 0, 1))
+            return p
+
+        def compute_projection(pair, mask, a=True, chunked=True):
+            # Exactly one of the two operands has its pair axes swapped; which one depends on the direction.
+            need_transpose = self._outgoing ^ a
+            if not chunked:
+                p = compute_projection_helper(pair, mask, a)
+                if need_transpose:
+                    p = p.transpose(-1, -2)
+            else:
+                # This computation is chunked so as not to exceed our 2.5x
+                # budget with a large intermediate tensor
+                linear_g = self.linear_a_g if a else self.linear_b_g
+                c = linear_g.bias.shape[-1]
+                out_shape = pair.shape[:-3] + (c,) + pair.shape[-3:-1]
+                p = pair.new_zeros(out_shape)
+                for i in range(0, pair.shape[-3], inplace_chunk_size):
+                    pair_chunk = compute_projection_helper(
+                        pair[..., i : i + inplace_chunk_size, :, :],
+                        mask[..., i : i + inplace_chunk_size, :, :],
+                        a,
+                    )
+                    if need_transpose:
+                        pair_chunk = pair_chunk.transpose(-1, -2)
+                        p[..., i : i + inplace_chunk_size] = pair_chunk
+                    else:
+                        p[..., i : i + inplace_chunk_size, :] = pair_chunk
+
+                    del pair_chunk
+
+            return p
+
+        # We start by fully manifesting a. In addition to the input, this
+        # brings total memory consumption to 2x z (disregarding size of chunks)
+        # [*, N, N, c]
+        # The chunked projection steps through `range(..., inplace_chunk_size)`, which cannot take None, so it is
+        # only used when a chunk size was actually given.
+        a = compute_projection(z, mask, True, chunked=inplace_chunk_size is not None)
+
+        if inplace_chunk_size is not None:
+            n = a.shape[-1]
+            half_n = n // 2 + n % 2
+            row_dim = -3
+            col_dim = -2
+            b_chunk_dim = row_dim if self._outgoing else col_dim
+
+            # NOTE: the slicers below are built as lists so that single entries can be replaced, but they are
+            # always converted to tuples before indexing. Indexing a tensor with a non-tuple sequence is
+            # deprecated in PyTorch.
+            def empty_slicer(t):
+                return [slice(None) for _ in t.shape]
+
+            def slice_tensor(t, start, end, dim):
+                # Slices start:end from the dim dimension of t
+                s = empty_slicer(t)
+                s[dim] = slice(start, end)
+                return t[tuple(s)]
+
+            def flip_z_cache_(z_cache, z):
+                # "Reorient" the z_cache (see below), filling it with quadrants
+                # 3---recovered from the z_cache---and 4---recovered from z---
+                # of the input tensor z.
+                quadrant_3 = slice_tensor(z_cache, half_n, None, row_dim)
+                z_cache = z_cache.transpose(row_dim, col_dim)
+
+                # If n is odd, we need to shrink the z_cache by one row
+                z_cache = z_cache[..., : (n // 2), :, :]
+
+                # Move the 3rd quadrant of z into the first half of the rotated z-cache
+                first_half_slicer = empty_slicer(z_cache)
+                first_half_slicer[col_dim] = slice(0, half_n)
+                z_cache[tuple(first_half_slicer)] = quadrant_3
+
+                # Get the fourth quadrant of z
+                quadrant_4 = slice_tensor(z, half_n, None, row_dim)
+                quadrant_4 = slice_tensor(quadrant_4, half_n, None, col_dim)
+
+                # Insert said quadrant into the rotated z-cache
+                quadrant_3_slicer = empty_slicer(z_cache)
+                quadrant_3_slicer[col_dim] = slice(half_n, None)
+
+                z_cache[tuple(quadrant_3_slicer)] = quadrant_4
+
+                return z_cache
+
+            # Initialize the z cache to the left half of z.
+            z_cache_shape = list(z.shape)
+            z_cache_shape[col_dim] = half_n
+            z_cache = z.new_zeros(z_cache_shape)
+            z_cache_slicer = empty_slicer(z_cache)
+            z_cache_slicer[col_dim] = slice(0, half_n)
+            z_cache.copy_(z[tuple(z_cache_slicer)])
+            z_cache_rotated = False
+
+            # We need to reorient the z-cache at the halfway point, and we
+            # don't want a single chunk to straddle that point. We contract one
+            # of the chunks in the middle to address that problem.
+            i_range = list(range(0, half_n, inplace_chunk_size))
+            initial_offsets = [i_2 - i_1 for i_1, i_2 in zip(i_range, i_range[1:] + [half_n])]
+            after_half = list(range(half_n, n, inplace_chunk_size))
+            after_half_offsets = [inplace_chunk_size for _ in after_half]
+            combined_range_with_offsets = zip(i_range + after_half, initial_offsets + after_half_offsets)
+            for i, offset in combined_range_with_offsets:
+                if not z_cache_rotated and i >= half_n:
+                    z_cache = flip_z_cache_(z_cache, z)
+                    z_cache_rotated = True
+
+                z_chunk_b = slice_tensor(z, i, i + offset, b_chunk_dim)
+                mask_chunk = slice_tensor(mask, i, i + offset, b_chunk_dim)
+
+                # Work on a copy: the row_dim branch below writes into this chunk and must not touch z.
+                z_chunk_b = z_chunk_b.clone()
+                if b_chunk_dim == col_dim:
+                    z_chunk_b = slice_tensor(z, i, i + offset, col_dim)
+                else:  # b_chunk_dim == row_dim
+                    # In this case, the b-dimension (b_chunk_dim) is partially
+                    # overwritten at the end of each iteration. We need to
+                    # restore the missing component from the z-cache.
+                    if not z_cache_rotated:
+                        z_chunk_slicer = empty_slicer(z_chunk_b)
+                        z_chunk_slicer[col_dim] = slice(0, half_n)
+                        z_chunk_b[tuple(z_chunk_slicer)] = slice_tensor(z_cache, i, i + offset, row_dim)
+                    else:
+                        z_cache_offset = i - half_n
+                        z_chunk_b = slice_tensor(z_cache, z_cache_offset, z_cache_offset + offset, row_dim)
+
+                b_chunk = compute_projection(z_chunk_b, mask_chunk, a=False, chunked=False)
+                del z_chunk_b
+
+                x_chunk = torch.matmul(a, b_chunk)
+                x_chunk = permute_final_dims(x_chunk, (1, 2, 0))
+                x_chunk = self.layer_norm_out(x_chunk)
+                x_chunk = self.linear_z(x_chunk)
+
+                # The g dimension (col_dim) is parallel to and ahead of the
+                # overwrites in z. We can extract the g chunk normally.
+                z_chunk_g = slice_tensor(z, i, i + offset, col_dim)
+                g_chunk = self.linear_g(self.layer_norm_in(z_chunk_g))
+                g_chunk.sigmoid_()
+                del z_chunk_g
+
+                x_chunk *= g_chunk
+
+                # Write the columns into z in-place
+                z_slicer = empty_slicer(z)
+                z_slicer[col_dim] = slice(i, i + offset)
+                if with_add:
+                    z[tuple(z_slicer)] += x_chunk
+                else:
+                    z[tuple(z_slicer)] = x_chunk
+        else:
+            b = compute_projection(z, mask, False, False)
+            # [*, C, N, N] -> [*, N, N, C]: the output layer norm and projection act on the channel dimension,
+            # exactly as in the chunked path above.
+            x = permute_final_dims(torch.matmul(a, b), (1, 2, 0))
+            x = self.layer_norm_out(x)
+            x = self.linear_z(x)
+            # The gate is computed from the layer-normalized input, as in the chunked path above.
+            g = self.linear_g(self.layer_norm_in(z))
+            g.sigmoid_()
+            x *= g
+            if with_add:
+                z += x
+            else:
+                z = x
+
+        return z
+
+    def forward(
+        self,
+        z: torch.Tensor,
+        mask: Optional[torch.Tensor] = None,
+        inplace_safe: bool = False,
+        _add_with_inplace: bool = False,
+        _inplace_chunk_size: Optional[int] = 256,
+    ) -> torch.Tensor:
+        """
+        Run the masked, gated multiplicative update on the pair tensor `z`.
+
+        Args:
+            z:
+                [*, N_res, N_res, C_z] input tensor
+            mask:
+                [*, N_res, N_res] input mask; replaced by all ones when omitted on the regular path
+            inplace_safe:
+                If True, the whole computation is delegated to `self._inference_forward`
+            _add_with_inplace:
+                Forwarded as `with_add` on the `inplace_safe` path
+            _inplace_chunk_size:
+                Forwarded as `inplace_chunk_size` on the `inplace_safe` path
+        Returns:
+            [*, N_res, N_res, C_z] output tensor
+        """
+        if inplace_safe:
+            return self._inference_forward(
+                z, mask, inplace_chunk_size=_inplace_chunk_size, with_add=_add_with_inplace
+            )
+
+        if mask is None:
+            mask = z.new_ones(z.shape[:-1])
+        pair_mask = mask.unsqueeze(-1)
+        z = self.layer_norm_in(z)
+
+        # Masked, sigmoid-gated projections of the normalized input.
+        a = pair_mask * self.sigmoid(self.linear_a_g(z)) * self.linear_a_p(z)
+        b = pair_mask * self.sigmoid(self.linear_b_g(z)) * self.linear_b_p(z)
+
+        # The "mps" device type is looked up as "cpu".
+        device_type = "cpu" if a.device.type == "mps" else a.device.type
+        if not is_fp16_enabled(device_type):
+            x = self._combine_projections(a, b)
+        else:
+            # Combine in float32 with autocast switched off.
+            with torch.autocast(device_type=device_type, enabled=False):
+                x = self._combine_projections(a.float(), b.float())
+
+        del a, b
+
+        # Normalize and project the combined tensor, then apply the output gate.
+        x = self.linear_z(self.layer_norm_out(x))
+        return x * self.sigmoid(self.linear_g(z))
+
+
+class EsmFoldPreTrainedModel(EsmPreTrainedModel):
+    """
+    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+    models.
+    """
+
+    # Subclass `EsmPreTrainedModel` to deal with special init
+    def _init_weights(self, module):
+        """
+        Initialize the weights of `module`: `EsmFoldLinear` by its `init_fn`/`init`, IPA head weights, and zeroed
+        output layers for `EsmFoldTriangularSelfAttentionBlock`; any other module goes to the parent class.
+        """
+        if isinstance(module, EsmFoldLinear):
+            with torch.no_grad():
+                if module.init_fn is not None:
+                    module.init_fn(module.weight, module.bias)
+                elif module.init == "default":
+                    trunc_normal_init_(module.weight, scale=1.0)
+                elif module.init == "relu":
+                    trunc_normal_init_(module.weight, scale=2.0)
+                elif module.init == "glorot":
+                    nn.init.xavier_uniform_(module.weight, gain=1)
+                elif module.init == "gating":
+                    module.weight.fill_(0.0)
+                    # Test for presence: truth-testing a tensor with several elements raises a RuntimeError.
+                    if module.bias is not None:
+                        module.bias.fill_(1.0)
+                elif module.init == "normal":
+                    torch.nn.init.kaiming_normal_(module.weight, nonlinearity="linear")
+                elif module.init == "final":
+                    module.weight.fill_(0.0)
+        elif isinstance(module, EsmFoldInvariantPointAttention):
+            ipa_point_weights_init_(module.head_weights)
+        elif isinstance(module, EsmFoldTriangularSelfAttentionBlock):
+            for layer in (
+                module.tri_mul_in.linear_z,
+                module.tri_mul_out.linear_z,
+                module.tri_att_start.mha.linear_o,
+                module.tri_att_end.mha.linear_o,
+                module.sequence_to_pair.o_proj,
+                module.seq_attention.o_proj,
+                module.mlp_seq.mlp[-2],
+                module.mlp_pair.mlp[-2],
+            ):
+                torch.nn.init.zeros_(layer.weight)
+                torch.nn.init.zeros_(layer.bias)
+            # For `pair_to_sequence.linear` only the weight is zeroed.
+            torch.nn.init.zeros_(module.pair_to_sequence.linear.weight)
+        else:
+            super()._init_weights(module)
+
+
+class EsmFoldSelfAttention(nn.Module):
+    """
+    Multi-head self attention over a sequence with optional pairwise bias, padding mask and output gating.
+    """
+
+    def __init__(self, embed_dim, num_heads, head_width, gated=False):
+        super().__init__()
+        assert embed_dim == num_heads * head_width
+
+        self.embed_dim = embed_dim
+        self.num_heads = num_heads
+        self.head_width = head_width
+        self.gated = gated
+        # Queries are scaled by 1 / sqrt(head_width).
+        self.rescale_factor = self.head_width**-0.5
+
+        # One fused projection produces queries, keys and values.
+        self.proj = nn.Linear(embed_dim, 3 * embed_dim, bias=False)
+        self.o_proj = nn.Linear(embed_dim, embed_dim, bias=True)
+        if gated:
+            # The gate projection starts with zero weight and unit bias.
+            self.g_proj = nn.Linear(embed_dim, embed_dim)
+            nn.init.zeros_(self.g_proj.weight)
+            nn.init.ones_(self.g_proj.bias)
+
+        nn.init.zeros_(self.o_proj.bias)
+
+    def forward(self, x, mask=None, bias=None, indices=None):
+        """
+        Basic self attention with optional mask and external pairwise bias. To handle sequences of different lengths,
+        use mask. `indices` is accepted but not used.
+
+        Inputs:
+            x: batch of input sequences (.. x L x C)
+            mask: batch of boolean masks where 1=valid, 0=padding position (.. x L_k)
+            bias: batch of scalar pairwise attention biases (.. x Lq x Lk x num_heads)
+
+        Outputs:
+            sequence projection (B x L x embed_dim), attention weights permuted with `(0, 3, 1, 2)`
+        """
+        # Split into heads first, then cut each head's features into q, k and v.
+        heads = self.proj(x).view(*x.shape[:2], self.num_heads, -1).permute(0, 2, 1, 3)
+        q, k, v = heads.chunk(3, dim=-1)
+
+        q = self.rescale_factor * q
+        scores = torch.einsum("...qc,...kc->...qk", q, k)
+        if bias is not None:
+            # Add external attention bias.
+            scores = scores + bias.permute(0, 3, 1, 2)
+        if mask is not None:
+            # Do not attend to padding tokens.
+            scores = scores.masked_fill(mask[:, None, None].logical_not(), float("-inf"))
+        attn = scores.softmax(dim=-1)
+
+        # Weighted sum of the values, then merge the heads back into one feature axis.
+        y = torch.einsum("...hqk,...hkc->...qhc", attn, v)
+        y = y.reshape(*y.shape[:2], -1)
+        if self.gated:
+            y = self.g_proj(x).sigmoid() * y
+        return self.o_proj(y), attn.permute(0, 3, 1, 2)
+
+
+class EsmFoldDropout(nn.Module):
+    """
+    Dropout whose mask is shared (broadcast) along the dimension(s) listed in `batch_dim`.
+    """
+
+    def __init__(self, r: float, batch_dim: Union[int, list[int]]):
+        super().__init__()
+        self.r = r
+        # A single int is stored as a one-element list.
+        self.batch_dim = [batch_dim] if isinstance(batch_dim, int) else batch_dim
+        self.dropout = nn.Dropout(self.r)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Multiply `x` by a dropped-out ones tensor that has size 1 along every shared dimension."""
+        mask_shape = list(x.shape)
+        shared_dims = () if self.batch_dim is None else self.batch_dim
+        for dim in shared_dims:
+            mask_shape[dim] = 1
+        keep_mask = self.dropout(x.new_ones(mask_shape))
+        return x * keep_mask
+
+
+class EsmFoldSequenceToPair(nn.Module):
+    """
+    Builds a pairwise representation from a sequence representation via products and differences of projections.
+    """
+
+    def __init__(self, sequence_state_dim, inner_dim, pairwise_state_dim):
+        super().__init__()
+
+        self.layernorm = nn.LayerNorm(sequence_state_dim)
+        # A single projection of width 2 * inner_dim, later split into `q` and `k` halves.
+        self.proj = nn.Linear(sequence_state_dim, 2 * inner_dim, bias=True)
+        self.o_proj = nn.Linear(2 * inner_dim, pairwise_state_dim, bias=True)
+
+        for linear in (self.proj, self.o_proj):
+            nn.init.zeros_(linear.bias)
+
+    def forward(self, sequence_state):
+        """
+        Inputs:
+          sequence_state: B x L x sequence_state_dim
+
+        Output:
+          pairwise_state: B x L x L x pairwise_state_dim
+
+        Intermediate state:
+          B x L x L x 2*inner_dim
+        """
+        assert len(sequence_state.shape) == 3
+
+        q, k = self.proj(self.layernorm(sequence_state)).chunk(2, dim=-1)
+
+        # Entry [b, i, j] combines q[b, j] with k[b, i].
+        q_row = q.unsqueeze(1)
+        k_col = k.unsqueeze(2)
+        pair_features = torch.cat([q_row * k_col, q_row - k_col], dim=-1)
+        return self.o_proj(pair_features)
+
+
+class EsmFoldPairToSequence(nn.Module):
+    """Maps a pairwise state to per-head attention biases (layer norm, then a bias-free linear layer)."""
+
+    def __init__(self, pairwise_state_dim, num_heads):
+        super().__init__()
+
+        self.layernorm = nn.LayerNorm(pairwise_state_dim)
+        self.linear = nn.Linear(pairwise_state_dim, num_heads, bias=False)
+
+    def forward(self, pairwise_state):
+        """
+        Inputs:
+          pairwise_state: B x L x L x pairwise_state_dim
+
+        Output:
+          pairwise_bias: B x L x L x num_heads
+        """
+        assert len(pairwise_state.shape) == 4
+        return self.linear(self.layernorm(pairwise_state))
+
+
+class EsmFoldResidueMLP(nn.Module):
+    """Residual block: LayerNorm -> Linear -> ReLU -> Linear -> Dropout, added back onto the input."""
+
+    def __init__(self, embed_dim, inner_dim, dropout=0):
+        super().__init__()
+
+        # The two linear layers are created in the order in which they are applied.
+        expand, contract = nn.Linear(embed_dim, inner_dim), nn.Linear(inner_dim, embed_dim)
+        self.mlp = nn.Sequential(nn.LayerNorm(embed_dim), expand, nn.ReLU(), contract, nn.Dropout(dropout))
+
+    def forward(self, x):
+        """Return `x` plus the output of `self.mlp` applied to `x`."""
+        update = self.mlp(x)
+        return x + update
+
+
+class EsmFoldTriangularSelfAttentionBlock(nn.Module):
+    """
+    One block that updates the sequence state with pair-biased self attention and the pairwise state with triangle
+    multiplicative updates, triangle attention and a residual MLP.
+    """
+
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+
+        seq_width = config.sequence_state_dim
+        pair_width = config.pairwise_state_dim
+        seq_heads = seq_width // config.sequence_head_width
+        pair_heads = pair_width // config.pairwise_head_width
+
+        self.layernorm_1 = nn.LayerNorm(seq_width)
+
+        self.sequence_to_pair = EsmFoldSequenceToPair(seq_width, pair_width // 2, pair_width)
+        self.pair_to_sequence = EsmFoldPairToSequence(pair_width, seq_heads)
+
+        self.seq_attention = EsmFoldSelfAttention(seq_width, seq_heads, config.sequence_head_width, gated=True)
+        self.tri_mul_out = EsmFoldTriangleMultiplicativeUpdate(config, _outgoing=True)
+        self.tri_mul_in = EsmFoldTriangleMultiplicativeUpdate(config, _outgoing=False)
+
+        # The two triangle attention modules differ only in the `starting` flag.
+        tri_att_args = (pair_width, config.pairwise_head_width, pair_heads)
+        self.tri_att_start = EsmFoldTriangleAttention(*tri_att_args, inf=1e9, starting=True)
+        self.tri_att_end = EsmFoldTriangleAttention(*tri_att_args, inf=1e9, starting=False)
+
+        self.mlp_seq = EsmFoldResidueMLP(seq_width, 4 * seq_width, dropout=config.dropout)
+        self.mlp_pair = EsmFoldResidueMLP(pair_width, 4 * pair_width, dropout=config.dropout)
+
+        self.drop = nn.Dropout(config.dropout)
+        self.row_drop = EsmFoldDropout(config.dropout * 2, 2)
+        self.col_drop = EsmFoldDropout(config.dropout * 2, 1)
+
+    def forward(self, sequence_state, pairwise_state, mask=None, chunk_size=None, **__kwargs):
+        """
+        Inputs:
+          sequence_state: B x L x sequence_state_dim
+          pairwise_state: B x L x L x pairwise_state_dim
+          mask: B x L boolean tensor of valid positions
+
+        Output:
+          sequence_state: B x L x sequence_state_dim
+          pairwise_state: B x L x L x pairwise_state_dim
+        """
+        if len(sequence_state.shape) != 3:
+            raise ValueError(f"`sequence_state` should be a 3d-tensor, got {len(sequence_state.shape)} dims.")
+        if len(pairwise_state.shape) != 4:
+            raise ValueError(f"`pairwise_state` should be a 4d-tensor, got {len(pairwise_state.shape)} dims.")
+        if mask is not None and len(mask.shape) != 2:
+            raise ValueError(f"`mask` should be a 2d-tensor, got {len(mask.shape)} dims.")
+
+        batch_dim, seq_dim, sequence_state_dim = sequence_state.shape
+        pairwise_state_dim = pairwise_state.shape[-1]
+
+        if self.config.sequence_state_dim != sequence_state_dim:
+            raise ValueError(
+                "`sequence_state` last dimension should be equal to `self.sequence_state_dim`. Got "
+                f"{sequence_state_dim} != {self.config.sequence_state_dim}."
+            )
+        if self.config.pairwise_state_dim != pairwise_state_dim:
+            raise ValueError(
+                "`pairwise_state` last dimension should be equal to `self.pairwise_state_dim`. Got "
+                f"{pairwise_state_dim} != {self.config.pairwise_state_dim}."
+            )
+        if pairwise_state.shape[0] != batch_dim:
+            raise ValueError(
+                f"`sequence_state` and `pairwise_state` have inconsistent batch size: {batch_dim} != "
+                f"{pairwise_state.shape[0]}."
+            )
+        if not (pairwise_state.shape[1] == seq_dim and pairwise_state.shape[2] == seq_dim):
+            raise ValueError(
+                f"`sequence_state` and `pairwise_state` have inconsistent sequence length: {seq_dim} != "
+                f"{pairwise_state.shape[1]} or {pairwise_state.shape[2]}."
+            )
+
+        # Update sequence state: self attention with a bias derived from the pairwise state, then the MLP.
+        attn_bias = self.pair_to_sequence(pairwise_state)
+        attn_out, _ = self.seq_attention(self.layernorm_1(sequence_state), mask=mask, bias=attn_bias)
+        sequence_state = sequence_state + self.drop(attn_out)
+        sequence_state = self.mlp_seq(sequence_state)
+
+        # Update pairwise state
+        pairwise_state = pairwise_state + self.sequence_to_pair(sequence_state)
+
+        # Axial attention with triangular bias.
+        tri_mask = None
+        if mask is not None:
+            tri_mask = mask.unsqueeze(2) * mask.unsqueeze(1)
+        update = self.tri_mul_out(pairwise_state, mask=tri_mask)
+        pairwise_state = pairwise_state + self.row_drop(update)
+        update = self.tri_mul_in(pairwise_state, mask=tri_mask)
+        pairwise_state = pairwise_state + self.col_drop(update)
+        update = self.tri_att_start(pairwise_state, mask=tri_mask, chunk_size=chunk_size)
+        pairwise_state = pairwise_state + self.row_drop(update)
+        update = self.tri_att_end(pairwise_state, mask=tri_mask, chunk_size=chunk_size)
+        pairwise_state = pairwise_state + self.col_drop(update)
+
+        # MLP over pairs.
+        return sequence_state, self.mlp_pair(pairwise_state)
+
+
+class EsmCategoricalMixture:
+    """Categorical distribution over `bins` equal-width bins spanning [start, end], parameterized by logits."""
+
+    def __init__(self, param, bins=50, start=0, end=1):
+        self.logits = param  # All tensors are of shape ..., bins.
+        edges = torch.linspace(start, end, bins + 1, device=self.logits.device, dtype=self.logits.dtype)
+        self.v_bins = (edges[:-1] + edges[1:]) / 2  # bin centers
+
+    def log_prob(self, true):
+        """Return the log-probability of the bin whose center is closest to each value of `true` (shape ...)."""
+        # Index with a tuple: a list of `None` is non-tuple sequence indexing, and fails outright for 0-d `true`.
+        true_index = (true.unsqueeze(-1) - self.v_bins[(None,) * true.ndim]).abs().argmin(-1)
+        log_probs = self.logits.log_softmax(-1)
+        return torch.take_along_dim(log_probs, true_index.unsqueeze(-1), dim=-1).squeeze(-1)
+
+    def mean(self):
+        return (self.logits.softmax(-1) @ self.v_bins.unsqueeze(1)).squeeze(-1)  # expected bin center
+
+
+def categorical_lddt(logits, bins=50):
+    """Return the mean of an `EsmCategoricalMixture` built from `logits` (shape ..., 37, bins)."""
+    return EsmCategoricalMixture(param=logits, bins=bins).mean()
+
+
+def get_axial_mask(mask):
+    """
+    Helper that repeats each row of a B x L mask of valid positions L times, for use in row column attentions.
+
+    Input:
+      mask: B x L tensor of booleans, or None
+
+    Output:
+      mask: (B * L) x L tensor of booleans (None when the input is None)
+
+    Raises:
+      ValueError: if `mask` is not 2-dimensional
+    """
+    if mask is not None:
+        if len(mask.shape) != 2:
+            raise ValueError(f"`mask` should be a 2d-tensor, got {len(mask.shape)} dims.")
+        num_seqs, length = mask.shape
+        # Row b * L + i of the result is a copy of mask[b].
+        return mask.unsqueeze(1).expand(num_seqs, length, length).reshape(num_seqs * length, length)
+    return None
+
+
+class EsmFoldRelativePosition(nn.Module):
+    """Embeds clamped pairwise residue-index offsets; embedding row 0 is reserved for masked pairs."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.bins = config.position_bins
+
+        # Note an additional offset is used so that the 0th position
+        # is reserved for masked pairs.
+        self.embedding = torch.nn.Embedding(2 * self.bins + 2, config.pairwise_state_dim)
+
+    def forward(self, residue_index, mask=None):
+        """
+        Input:
+          residue_index: B x L tensor of indices (dtype=torch.long)
+          mask: B x L tensor of booleans
+
+        Output:
+          pairwise_state: B x L x L x pairwise_state_dim tensor of embeddings
+        """
+        if residue_index.dtype != torch.long:
+            raise ValueError(f"`residue_index` has dtype {residue_index.dtype}, it should be `torch.long`.")
+        if mask is not None and residue_index.shape != mask.shape:
+            raise ValueError(
+                f"`residue_index` and `mask` have inconsistent shapes: {residue_index.shape} != {mask.shape}."
+            )
+
+        # Offset from residue i to residue j, clamped to [-bins, bins] and shifted into [1, 2 * bins + 1].
+        offsets = (residue_index.unsqueeze(1) - residue_index.unsqueeze(2)).clamp(-self.bins, self.bins)
+        offsets = offsets + (self.bins + 1)
+        if mask is not None:
+            pair_mask = mask.unsqueeze(1) * mask.unsqueeze(2)
+            offsets[pair_mask.logical_not()] = 0
+        return self.embedding(offsets)
+
+
+class EsmFoldAngleResnetBlock(nn.Module):
+    """Residual block of two linear layers, each preceded by a ReLU."""
+
+    def __init__(self, config):
+        super().__init__()
+
+        width = config.resnet_dim
+        self.linear_1 = EsmFoldLinear(width, width, init="relu")
+        self.linear_2 = EsmFoldLinear(width, width, init="final")
+
+        self.relu = nn.ReLU()
+
+    def forward(self, a: torch.Tensor) -> torch.Tensor:
+        """Return `linear_2(relu(linear_1(relu(a))))` plus the input `a`."""
+        residual = a
+        for linear in (self.linear_1, self.linear_2):
+            a = linear(self.relu(a))
+        return a + residual
+
+
+class EsmFoldAngleResnet(nn.Module):
+    """
+    Implements Algorithm 20, lines 11-14
+
+    Maps the current and initial single embeddings to angles expressed as 2-vectors.
+    """
+
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+
+        in_dim, hidden_dim = config.sequence_dim, config.resnet_dim
+
+        self.linear_in = EsmFoldLinear(in_dim, hidden_dim)
+        self.linear_initial = EsmFoldLinear(in_dim, hidden_dim)
+
+        self.layers = nn.ModuleList(
+            [EsmFoldAngleResnetBlock(config) for _ in range(config.num_resnet_blocks)]
+        )
+
+        # Two output values per angle.
+        self.linear_out = EsmFoldLinear(hidden_dim, config.num_angles * 2)
+
+        self.relu = nn.ReLU()
+
+    def forward(self, s: torch.Tensor, s_initial: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+        """
+        Predict angles from the single embeddings.
+
+        Args:
+            s:
+                [*, C_hidden] single embedding
+            s_initial:
+                [*, C_hidden] single embedding as of the start of the StructureModule
+        Returns:
+            Tuple of two [*, no_angles, 2] tensors: the predicted angles as produced by the output layer, and the
+            same angles divided by their norm over the last dimension
+        """
+        # NOTE: The ReLU's applied to the inputs are absent from the supplement
+        # pseudocode but present in the source. For maximal compatibility with
+        # the pretrained weights, this follows the source.
+
+        # [*, C_hidden]
+        s_initial = self.linear_initial(self.relu(s_initial))
+        s = self.linear_in(self.relu(s))
+        s = s + s_initial
+
+        # Residual stack.
+        for block in self.layers:
+            s = block(s)
+
+        # [*, no_angles * 2]
+        s = self.linear_out(self.relu(s))
+
+        # [*, no_angles, 2]
+        unnormalized_s = s.view(s.shape[:-1] + (-1, 2))
+
+        # The squared norm is clamped from below by `config.epsilon` before the square root.
+        sq_norm = (unnormalized_s**2).sum(dim=-1, keepdim=True)
+        norm_denom = sq_norm.clamp(min=self.config.epsilon).sqrt()
+        normalized_s = unnormalized_s / norm_denom
+
+        return unnormalized_s, normalized_s
+
+
+class EsmFoldInvariantPointAttention(nn.Module):
+    """
+    Implements Algorithm 22.
+    """
+
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+
+        c_s = config.sequence_dim
+        c_z = config.pairwise_dim
+        self.hidden_dim = config.ipa_dim
+        self.num_heads = config.num_heads_ipa
+        self.num_qk_points = config.num_qk_points
+        self.num_v_points = config.num_v_points
+
+        # These linear layers differ from their specifications in the
+        # supplement. There, they lack bias and use Glorot initialization.
+        # Here as in the official source, they have bias and use the default
+        # Lecun initialization.
+        hc = config.ipa_dim * config.num_heads_ipa
+        self.linear_q = EsmFoldLinear(c_s, hc)
+        self.linear_kv = EsmFoldLinear(c_s, 2 * hc)
+
+        hpq = config.num_heads_ipa * config.num_qk_points * 3
+        self.linear_q_points = EsmFoldLinear(c_s, hpq)
+
+        hpkv = config.num_heads_ipa * (config.num_qk_points + config.num_v_points) * 3
+        self.linear_kv_points = EsmFoldLinear(c_s, hpkv)
+
+        self.linear_b = EsmFoldLinear(c_z, config.num_heads_ipa)
+
+        self.head_weights = nn.Parameter(torch.zeros(config.num_heads_ipa))
+
+        concat_out_dim = config.num_heads_ipa * (c_z + config.ipa_dim + config.num_v_points * 4)
+        self.linear_out = EsmFoldLinear(concat_out_dim, c_s, init="final")
+
+        self.softmax = nn.Softmax(dim=-1)
+        self.softplus = nn.Softplus()
+
+    def forward(
+        self,
+        s: torch.Tensor,
+        z: Optional[torch.Tensor],
+        r: Rigid,
+        mask: torch.Tensor,
+        _offload_inference: bool = False,
+        _z_reference_list: Optional[Sequence[torch.Tensor]] = None,
+    ) -> torch.Tensor:
+        """
+        Args:
+            s:
+                [*, N_res, C_s] single representation
+            z:
+                [*, N_res, N_res, C_z] pair representation
+            r:
+                [*, N_res] transformation object
+            mask:
+                [*, N_res] mask
+            _offload_inference:
+                If True, the pair representation is moved to the CPU while it is not needed
+            _z_reference_list:
+                One-element list holding the pair representation; used when `z` is None
+        Returns:
+            [*, N_res, C_s] single representation update
+        """
+        # When offloading, the caller may pass `z=None` and hand the pair representation over inside
+        # `_z_reference_list`; reuse that list so device moves of the tensor are visible to the caller.
+        z = _z_reference_list if (z is None and _z_reference_list is not None) else [z]
+
+        # ---- Generate scalar and point activations ----
+        # [*, N_res, H * C_hidden]
+        q = self.linear_q(s)
+        kv = self.linear_kv(s)
+
+        # [*, N_res, H, C_hidden]
+        q = q.view(q.shape[:-1] + (self.num_heads, -1))
+
+        # [*, N_res, H, 2 * C_hidden]
+        kv = kv.view(kv.shape[:-1] + (self.num_heads, -1))
+
+        # [*, N_res, H, C_hidden]
+        k, v = torch.split(kv, self.hidden_dim, dim=-1)
+
+        # [*, N_res, H * P_q * 3]
+        q_pts = self.linear_q_points(s)
+
+        # This is kind of clunky, but it's how the original does it
+        # [*, N_res, H * P_q, 3]
+        q_pts = torch.split(q_pts, q_pts.shape[-1] // 3, dim=-1)
+        q_pts = torch.stack(q_pts, dim=-1)
+        q_pts = r[..., None].apply(q_pts)
+
+        # [*, N_res, H, P_q, 3]
+        q_pts = q_pts.view(q_pts.shape[:-2] + (self.num_heads, self.num_qk_points, 3))
+
+        # [*, N_res, H * (P_q + P_v) * 3]
+        kv_pts = self.linear_kv_points(s)
+
+        # [*, N_res, H * (P_q + P_v), 3]
+        kv_pts = torch.split(kv_pts, kv_pts.shape[-1] // 3, dim=-1)
+        kv_pts = torch.stack(kv_pts, dim=-1)
+        kv_pts = r[..., None].apply(kv_pts)
+
+        # [*, N_res, H, (P_q + P_v), 3]
+        kv_pts = kv_pts.view(kv_pts.shape[:-2] + (self.num_heads, -1, 3))
+
+        # [*, N_res, H, P_q/P_v, 3]
+        k_pts, v_pts = torch.split(kv_pts, [self.num_qk_points, self.num_v_points], dim=-2)
+
+        # ---- Compute attention scores ----
+        # [*, N_res, N_res, H]
+        b = self.linear_b(z[0])
+
+        if _offload_inference:
+            assert sys.getrefcount(z[0]) == 2
+            z[0] = z[0].cpu()
+
+        # [*, H, N_res, N_res]
+        device_type = q.device.type if q.device.type != "mps" else "cpu"
+        if is_fp16_enabled(device_type):
+            with torch.autocast(device_type=device_type, enabled=False):
+                a = torch.matmul(
+                    permute_final_dims(q.float(), (1, 0, 2)),  # [*, H, N_res, C_hidden]
+                    permute_final_dims(k.float(), (1, 2, 0)),  # [*, H, C_hidden, N_res]
+                )
+        else:
+            a = torch.matmul(
+                permute_final_dims(q, (1, 0, 2)),  # [*, H, N_res, C_hidden]
+                permute_final_dims(k, (1, 2, 0)),  # [*, H, C_hidden, N_res]
+            )
+
+        a *= math.sqrt(1.0 / (3 * self.hidden_dim))
+        a += math.sqrt(1.0 / 3) * permute_final_dims(b, (2, 0, 1))
+
+        # [*, N_res, N_res, H, P_q, 3]
+        pt_att = q_pts.unsqueeze(-4) - k_pts.unsqueeze(-5)
+        pt_att = pt_att**2
+
+        # [*, N_res, N_res, H, P_q]
+        pt_att = sum(torch.unbind(pt_att, dim=-1))
+        head_weights = self.softplus(self.head_weights).view(*((1,) * len(pt_att.shape[:-2]) + (-1, 1)))
+        head_weights = head_weights * math.sqrt(1.0 / (3 * (self.num_qk_points * 9.0 / 2)))
+        pt_att = pt_att * head_weights
+
+        # [*, N_res, N_res, H]
+        pt_att = torch.sum(pt_att, dim=-1) * (-0.5)
+        # [*, N_res, N_res]
+        square_mask = mask.unsqueeze(-1) * mask.unsqueeze(-2)
+        square_mask = self.config.inf * (square_mask - 1)
+
+        # [*, H, N_res, N_res]
+        pt_att = permute_final_dims(pt_att, (2, 0, 1))
+
+        a = a + pt_att
+        a = a + square_mask.unsqueeze(-3)
+        a = self.softmax(a)
+
+        # ---- Compute output ----
+        # [*, N_res, H, C_hidden]
+        o = torch.matmul(a, v.transpose(-2, -3).to(dtype=a.dtype)).transpose(-2, -3)
+
+        # [*, N_res, H * C_hidden]
+        o = flatten_final_dims(o, 2)
+
+        # [*, H, 3, N_res, P_v]
+        o_pt = torch.sum(
+            (a[..., None, :, :, None] * permute_final_dims(v_pts, (1, 3, 0, 2))[..., None, :, :]),
+            dim=-2,
+        )
+
+        # [*, N_res, H, P_v, 3]
+        o_pt = permute_final_dims(o_pt, (2, 0, 3, 1))
+        o_pt = r[..., None, None].invert_apply(o_pt)
+
+        # [*, N_res, H * P_v]
+        o_pt_norm = flatten_final_dims(torch.sqrt(torch.sum(o_pt**2, dim=-1) + self.config.epsilon), 2)
+
+        # [*, N_res, H * P_v, 3]
+        o_pt = o_pt.reshape(*o_pt.shape[:-3], -1, 3)
+
+        if _offload_inference:
+            z[0] = z[0].to(o_pt.device)
+
+        # [*, N_res, H, C_z]
+        o_pair = torch.matmul(a.transpose(-2, -3), z[0].to(dtype=a.dtype))
+
+        # [*, N_res, H * C_z]
+        o_pair = flatten_final_dims(o_pair, 2)
+
+        # [*, N_res, C_s]
+        s = self.linear_out(
+            torch.cat((o, *torch.unbind(o_pt, dim=-1), o_pt_norm, o_pair), dim=-1).to(dtype=z[0].dtype)
+        )
+
+        return s
+
+
+class EsmFoldBackboneUpdate(nn.Module):
+    """
+    Implements part of Algorithm 23.
+    """
+
+    def __init__(self, config):
+        super().__init__()
+
+        self.linear = EsmFoldLinear(config.sequence_dim, 6, init="final")
+
+    def forward(self, s: torch.Tensor) -> torch.Tensor:
+        """
+        Args:
+            s:
+                [*, N_res, C_s] single representation
+        Returns:
+            [*, N_res, 6] update vector (a single tensor, not a tuple)
+        """
+        # [*, N_res, 6]
+        update = self.linear(s)
+        return update
+
+
+class EsmFoldStructureModuleTransitionLayer(nn.Module):
+    """Residual block of three linear layers with a ReLU after each of the first two."""
+
+    def __init__(self, config):
+        super().__init__()
+        dim = config.sequence_dim
+
+        self.linear_1 = EsmFoldLinear(dim, dim, init="relu")
+        self.linear_2 = EsmFoldLinear(dim, dim, init="relu")
+        self.linear_3 = EsmFoldLinear(dim, dim, init="final")
+
+        self.relu = nn.ReLU()
+
+    def forward(self, s):
+        """Return `linear_3(relu(linear_2(relu(linear_1(s)))))` plus the input `s`."""
+        hidden = self.relu(self.linear_1(s))
+        hidden = self.relu(self.linear_2(hidden))
+        update = self.linear_3(hidden)
+
+        # Operand order is kept as update + input.
+        return update + s
+
+
+class EsmFoldStructureModuleTransition(nn.Module):
+    """Stack of `config.num_transition_layers` transition layers followed by dropout and layer norm."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+
+        self.layers = nn.ModuleList(
+            [EsmFoldStructureModuleTransitionLayer(config) for _ in range(config.num_transition_layers)]
+        )
+
+        self.dropout = nn.Dropout(config.dropout_rate)
+        self.layer_norm = LayerNorm(config.sequence_dim)
+
+    def forward(self, s):
+        """Apply every transition layer in order, then dropout, then layer norm."""
+        for transition_layer in self.layers:
+            s = transition_layer(s)
+
+        s = self.dropout(s)
+        return self.layer_norm(s)
+
+
+class EsmFoldStructureModule(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+
+        # Buffers to be lazily initialized later
+        # self.default_frames
+        # self.group_idx
+        # self.atom_mask
+        # self.lit_positions
+
+        self.layer_norm_s = LayerNorm(config.sequence_dim)
+        self.layer_norm_z = LayerNorm(config.pairwise_dim)
+
+        self.linear_in = EsmFoldLinear(config.sequence_dim, config.sequence_dim)
+
+        self.ipa = EsmFoldInvariantPointAttention(config)
+
+        self.ipa_dropout = nn.Dropout(config.dropout_rate)
+        self.layer_norm_ipa = LayerNorm(config.sequence_dim)
+
+        self.transition = EsmFoldStructureModuleTransition(config)
+        self.bb_update = EsmFoldBackboneUpdate(config)
+        self.angle_resnet = EsmFoldAngleResnet(config)
+
+    def forward(
+        self,
+        evoformer_output_dict,
+        aatype,
+        mask=None,
+        _offload_inference=False,
+    ):
+        """
+        Run `config.num_blocks` rounds of IPA, transition, backbone update and angle prediction.
+
+        Args:
+            evoformer_output_dict:
+                Dictionary containing:
+                    "single":
+                        [*, N_res, C_s] single representation
+                    "pair":
+                        [*, N_res, N_res, C_z] pair representation
+            aatype:
+                [*, N_res] amino acid indices
+            mask:
+                Optional [*, N_res] sequence mask. When `None`, a mask of ones shaped like `s` without its last
+                dimension is used.
+            _offload_inference:
+                If `True`, the "pair" entry of `evoformer_output_dict` is replaced by a CPU copy while the blocks
+                run and moved back to `s.device` afterwards. The normalized pair tensor is then handed to the IPA
+                layer through `_z_reference_list` instead of the `z` argument (which is `None` in that case).
+        Returns:
+            A dictionary of outputs. The per-block entries ("frames", "sidechain_frames", "unnormalized_angles",
+            "angles", "positions", "states") are combined over all blocks with `torch.stack` through
+            `dict_multimap`; "single" holds the value of `s` after the last block.
+        """
+        s = evoformer_output_dict["single"]
+
+        if mask is None:
+            # [*, N]
+            mask = s.new_ones(s.shape[:-1])
+
+        # [*, N, C_s]
+        s = self.layer_norm_s(s)
+
+        # [*, N, N, C_z]
+        z = self.layer_norm_z(evoformer_output_dict["pair"])
+
+        z_reference_list = None
+        if _offload_inference:
+            # The offload path insists that the dict holds the only reference to the pair tensor
+            # (`sys.getrefcount` reports 2 in that case) -- presumably so that replacing it with the CPU
+            # copy actually releases the device copy.
+            assert sys.getrefcount(evoformer_output_dict["pair"]) == 2
+            evoformer_output_dict["pair"] = evoformer_output_dict["pair"].cpu()
+            # Hand `z` over inside a one-element list and drop the local name.
+            z_reference_list = [z]
+            z = None
+
+        # [*, N, C_s]
+        s_initial = s
+        s = self.linear_in(s)
+
+        # [*, N]
+        rigids = Rigid.identity(
+            s.shape[:-1],
+            s.dtype,
+            s.device,
+            self.training,
+            fmt="quat",
+        )
+        outputs = []
+        for i in range(self.config.num_blocks):
+            # [*, N, C_s]
+            # Residual IPA update, followed by dropout, layer norm and the transition stack.
+            s = s + self.ipa(
+                s,
+                z,
+                rigids,
+                mask,
+                _offload_inference=_offload_inference,
+                _z_reference_list=z_reference_list,
+            )
+            s = self.ipa_dropout(s)
+            s = self.layer_norm_ipa(s)
+            s = self.transition(s)
+
+            # [*, N]
+            rigids = rigids.compose_q_update_vec(self.bb_update(s))
+
+            # To hew as closely as possible to AlphaFold, we convert our
+            # quaternion-based transformations to rotation-matrix ones
+            # here
+            backb_to_global = Rigid(
+                Rotation(rot_mats=rigids.get_rots().get_rot_mats(), quats=None),
+                rigids.get_trans(),
+            )
+
+            backb_to_global = backb_to_global.scale_translation(self.config.trans_scale_factor)
+
+            # [*, N, 7, 2]
+            unnormalized_angles, angles = self.angle_resnet(s, s_initial)
+
+            all_frames_to_global = self.torsion_angles_to_frames(backb_to_global, angles, aatype)
+
+            pred_xyz = self.frames_and_literature_positions_to_atom14_pos(all_frames_to_global, aatype)
+
+            # The reported frames use the scaled translation; `rigids` itself stays unscaled for the next block.
+            scaled_rigids = rigids.scale_translation(self.config.trans_scale_factor)
+
+            preds = {
+                "frames": scaled_rigids.to_tensor_7(),
+                "sidechain_frames": all_frames_to_global.to_tensor_4x4(),
+                "unnormalized_angles": unnormalized_angles,
+                "angles": angles,
+                "positions": pred_xyz,
+                "states": s,
+            }
+
+            outputs.append(preds)
+
+            rigids = rigids.stop_rot_gradient()
+
+        # Drop the local references to the normalized pair tensor before (optionally) restoring the original.
+        del z, z_reference_list
+
+        if _offload_inference:
+            # Undo the CPU offload performed above.
+            evoformer_output_dict["pair"] = evoformer_output_dict["pair"].to(s.device)
+
+        outputs = dict_multimap(torch.stack, outputs)
+        outputs["single"] = s
+
+        return outputs
+
+    def _init_residue_constants(self, float_dtype, device):
+        """
+        Register the residue-constant buffers that do not exist on this module yet.
+
+        Each buffer is built with `torch.tensor` from the matching `residue_constants` table and registered as
+        non-persistent. A buffer that is already present (checked with `hasattr`) is left untouched.
+
+        Args:
+            float_dtype: dtype used for `default_frames`, `atom_mask` and `lit_positions`. `group_idx` is created
+                without an explicit dtype.
+            device: Device on which the tensors are created.
+        """
+        # (buffer name, attribute of `residue_constants`, whether `float_dtype` is applied)
+        buffer_specs = (
+            ("default_frames", "restype_rigid_group_default_frame", True),
+            ("group_idx", "restype_atom14_to_rigid_group", False),
+            ("atom_mask", "restype_atom14_mask", True),
+            ("lit_positions", "restype_atom14_rigid_group_positions", True),
+        )
+        for buffer_name, constant_name, is_float in buffer_specs:
+            if hasattr(self, buffer_name):
+                continue
+            tensor_kwargs = {"device": device, "requires_grad": False}
+            if is_float:
+                tensor_kwargs["dtype"] = float_dtype
+            values = torch.tensor(getattr(residue_constants, constant_name), **tensor_kwargs)
+            self.register_buffer(buffer_name, values, persistent=False)
+
+    def torsion_angles_to_frames(self, r, alpha, f):
+        """
+        Method wrapper around the module-level `torsion_angles_to_frames`.
+
+        The residue-constant buffers are created on first use with the dtype and device of `alpha`; the call is
+        then forwarded together with `self.default_frames`. Kept as a separate method purely to make testing less
+        annoying.
+        """
+        self._init_residue_constants(alpha.dtype, alpha.device)
+        frames = torsion_angles_to_frames(r, alpha, f, self.default_frames)
+        return frames
+
+    def frames_and_literature_positions_to_atom14_pos(self, r, f):  # [*, N, 8]  # [*, N]
+        """
+        Method wrapper around the module-level `frames_and_literature_positions_to_atom14_pos`.
+
+        The residue-constant buffers are created on first use with the dtype and device of the rotations of `r`,
+        and are then passed along with `r` and `f`.
+        """
+        rotations = r.get_rots()
+        self._init_residue_constants(rotations.dtype, rotations.device)
+        residue_buffers = (self.default_frames, self.group_idx, self.atom_mask, self.lit_positions)
+        return frames_and_literature_positions_to_atom14_pos(r, f, *residue_buffers)
+
+
+class EsmFoldingTrunk(nn.Module):
+    def __init__(self, config):
+        """
+        Create the trunk blocks, the recycling layers and the structure module.
+
+        Args:
+            config: Trunk configuration. This constructor reads `sequence_state_dim`, `pairwise_state_dim`,
+                `num_blocks`, `chunk_size` and the nested `structure_module` config (its `sequence_dim` and
+                `pairwise_dim`). It is also stored as `self.config`; `forward` reads `max_recycles` from it.
+        """
+        super().__init__()
+        self.config = config
+
+        c_s = config.sequence_state_dim
+        c_z = config.pairwise_state_dim
+
+        self.pairwise_positional_embedding = EsmFoldRelativePosition(config)
+
+        self.blocks = nn.ModuleList([EsmFoldTriangularSelfAttentionBlock(config) for _ in range(config.num_blocks)])
+
+        # Recycling: layer norms for the recycled states and an embedding of the binned distances
+        # produced by `distogram`.
+        self.recycle_bins = 15
+        self.recycle_s_norm = nn.LayerNorm(c_s)
+        self.recycle_z_norm = nn.LayerNorm(c_z)
+        self.recycle_disto = nn.Embedding(self.recycle_bins, c_z)
+        # Zero the embedding row for bin 0 in place (the detached view shares storage with the weight).
+        self.recycle_disto.weight[0].detach().zero_()
+
+        # Structure module plus the linear maps from trunk widths to structure-module widths.
+        self.structure_module = EsmFoldStructureModule(config.structure_module)
+        self.trunk2sm_s = nn.Linear(c_s, config.structure_module.sequence_dim)
+        self.trunk2sm_z = nn.Linear(c_z, config.structure_module.pairwise_dim)
+
+        self.chunk_size = config.chunk_size
+
+    def set_chunk_size(self, chunk_size):
+        """
+        Store the chunk size that `forward` passes to every trunk block.
+
+        This parameter means the axial attention will be computed in a chunked manner. This should make the memory
+        used more or less O(L) instead of O(L^2). It's equivalent to running a for loop over chunks of the dimension
+        we're iterating over, where the chunk_size is the size of the chunks, so 128 would mean to parse 128-length
+        chunks.
+
+        Args:
+            chunk_size: New value for `self.chunk_size`.
+        """
+        self.chunk_size = chunk_size
+
+    def forward(self, seq_feats, pair_feats, true_aa, residx, mask, no_recycles):
+        """
+        Run the trunk blocks and the structure module, recycling the result `no_recycles` times.
+
+        Inputs:
+          seq_feats: B x L x C tensor of sequence features
+          pair_feats: B x L x L x C tensor of pair features
+          true_aa: passed unchanged to the structure module as its `aatype` argument
+          residx: B x L long tensor giving the position in the sequence
+          mask: B x L boolean tensor indicating valid residues
+          no_recycles: number of recycles. With `None`, the loop runs `self.config.max_recycles` times; otherwise
+            the value must not be negative and the loop runs `no_recycles + 1` times.
+
+        Output:
+          The dictionary returned by the structure module in the last iteration, with the final trunk states added
+          under "s_s" and "s_z".
+
+        Raises:
+          ValueError: if `no_recycles` is negative.
+        """
+
+        device = seq_feats.device
+        s_s_0 = seq_feats
+        s_z_0 = pair_feats
+
+        if no_recycles is None:
+            no_recycles = self.config.max_recycles
+        else:
+            if no_recycles < 0:
+                raise ValueError("Number of recycles must not be negative.")
+            no_recycles += 1  # First 'recycle' is just the standard forward pass through the model.
+
+        def trunk_iter(s, z, residx, mask):
+            # One pass through all trunk blocks, after adding the relative positional embedding to `z`.
+            z = z + self.pairwise_positional_embedding(residx, mask=mask)
+
+            for block in self.blocks:
+                s, z = block(s, z, mask=mask, residue_index=residx, chunk_size=self.chunk_size)
+            return s, z
+
+        s_s = s_s_0
+        s_z = s_z_0
+        # The first iteration recycles all-zero states and bin index 0 everywhere.
+        recycle_s = torch.zeros_like(s_s)
+        recycle_z = torch.zeros_like(s_z)
+        recycle_bins = torch.zeros(*s_z.shape[:-1], device=device, dtype=torch.int64)
+
+        for recycle_idx in range(no_recycles):
+            # Only the last iteration gets an empty context list; all earlier ones are wrapped in
+            # `torch.no_grad()`.
+            with ContextManagers([] if recycle_idx == no_recycles - 1 else [torch.no_grad()]):
+                # === Recycling ===
+                recycle_s = self.recycle_s_norm(recycle_s.detach()).to(device)
+                recycle_z = self.recycle_z_norm(recycle_z.detach()).to(device)
+                recycle_z += self.recycle_disto(recycle_bins.detach()).to(device)
+
+                # Every iteration restarts from the original inputs plus the recycled states.
+                s_s, s_z = trunk_iter(s_s_0 + recycle_s, s_z_0 + recycle_z, residx, mask)
+
+                # === Structure module ===
+                structure = self.structure_module(
+                    {"single": self.trunk2sm_s(s_s), "pair": self.trunk2sm_z(s_z)},
+                    true_aa,
+                    mask.float(),
+                )
+
+                recycle_s = s_s
+                recycle_z = s_z
+                # Distogram needs the N, CA, C coordinates, and bin constants same as alphafold.
+                recycle_bins = EsmFoldingTrunk.distogram(
+                    structure["positions"][-1][:, :, :3],
+                    3.375,
+                    21.375,
+                    self.recycle_bins,
+                )
+
+        structure["s_s"] = s_s
+        structure["s_z"] = s_z
+
+        return structure
+
+    @staticmethod
+    def distogram(coords, min_bin, max_bin, num_bins):
+        """
+        Bin the pairwise distances between inferred CB atoms.
+
+        Args:
+            coords: [... L x 3 x 3] tensor, where it's [N, CA, C] x 3 coordinates.
+            min_bin: First bin boundary (not squared).
+            max_bin: Last bin boundary (not squared).
+            num_bins: Number of bins; `num_bins - 1` evenly spaced boundaries are created.
+
+        Returns:
+            [..., L, L] integer tensor holding, per residue pair, how many squared boundaries the squared CB-CB
+            distance exceeds.
+        """
+        # Boundaries are squared once so they can be compared against squared distances.
+        squared_edges = torch.linspace(min_bin, max_bin, num_bins - 1, device=coords.device) ** 2
+
+        n_pos, ca_pos, c_pos = (atom.squeeze(-2) for atom in coords.chunk(3, dim=-2))
+
+        # Infer CB coordinates.
+        n_to_ca = ca_pos - n_pos
+        ca_to_c = c_pos - ca_pos
+        normal = n_to_ca.cross(ca_to_c, dim=-1)
+        cb_pos = -0.58273431 * normal + 0.56802827 * n_to_ca - 0.54067466 * ca_to_c + ca_pos
+
+        pairwise_offsets = cb_pos[..., None, :, :] - cb_pos[..., :, None, :]
+        squared_dists = pairwise_offsets.pow(2).sum(dim=-1, keepdims=True)
+        return torch.sum(squared_dists > squared_edges, dim=-1)  # [..., L, L]
+
+
+# TODO Add information to the docstring about any methods that convert to PDB format, or otherwise prepare
+#      the outputs for downstream use.
+
+
+@auto_docstring(
+    custom_intro="""
+    ESMForProteinFolding is the HuggingFace port of the original ESMFold model. It consists of an ESM-2 "stem" followed
+    by a protein folding "head", although unlike most other output heads, this "head" is similar in size and runtime to
+    the rest of the model combined! It outputs a dictionary containing predicted structural information about the input
+    protein(s).
+    """
+)
+class EsmForProteinFolding(EsmPreTrainedModel):
+    _no_split_modules = ["EsmFoldStructureModule", "EsmFoldTriangularSelfAttentionBlock"]
+    _supports_flash_attn = False
+    _supports_sdpa = False
+    _supports_attention_backend = False
+
+    _can_record_outputs = None
+
+    def __init__(self, config):
+        """
+        Build the ESM language model, the folding trunk and the output heads.
+
+        Args:
+            config: Model configuration. Besides the values consumed by `EsmModel`, this constructor reads
+                `hidden_size`, `num_hidden_layers`, `num_attention_heads`, `vocab_list` and the nested
+                `esmfold_config` (`fp16_esm`, `embed_aa`, `lddt_head_hid_dim`, `trunk`).
+        """
+        super().__init__(config)
+
+        self.config = config
+
+        self.distogram_bins = 64
+
+        self.esm = EsmModel(config, add_pooling_layer=False)
+
+        # Gradients are disabled for every parameter of the language model; it is optionally cast to fp16.
+        self.esm.requires_grad_(False)
+        if self.config.esmfold_config.fp16_esm:
+            self.esm.half()
+
+        self.esm_feats = self.config.hidden_size
+        self.esm_attns = self.config.num_hidden_layers * self.config.num_attention_heads
+        self.esm_layers = self.config.num_hidden_layers
+        # Lookup table used by `af2_idx_to_esm_idx` to translate residue indices into ESM vocabulary indices.
+        self.register_buffer("af2_to_esm", self._af2_to_esm_from_vocab_list(config.vocab_list))
+        # One weight per stacked hidden state (`esm_layers + 1` entries); `forward` applies a softmax to it.
+        self.esm_s_combine = nn.Parameter(torch.zeros(self.esm_layers + 1))
+
+        trunk_config = self.config.esmfold_config.trunk
+        c_s = trunk_config.sequence_state_dim
+        c_z = trunk_config.pairwise_state_dim
+        # Maps the combined language-model features (width `esm_feats`) to the trunk sequence width `c_s`.
+        self.esm_s_mlp = nn.Sequential(
+            LayerNorm(self.esm_feats),
+            nn.Linear(self.esm_feats, c_s),
+            nn.ReLU(),
+            nn.Linear(c_s, c_s),
+        )
+
+        # 0 is padding, N is unknown residues, N + 1 is mask.
+        self.n_tokens_embed = residue_constants.restype_num + 3
+        self.pad_idx = 0
+        self.unk_idx = self.n_tokens_embed - 2
+        self.mask_idx = self.n_tokens_embed - 1
+        # Positions of the special tokens in the ESM vocabulary.
+        self.esm_dict_cls_idx = self.config.vocab_list.index("<cls>")
+        self.esm_dict_mask_idx = self.config.vocab_list.index("<mask>")
+        self.esm_dict_eos_idx = self.config.vocab_list.index("<eos>")
+        self.esm_dict_padding_idx = self.config.vocab_list.index("<pad>")
+        # The amino-acid embedding only exists when `embed_aa` is set; `forward` checks the same flag.
+        if self.config.esmfold_config.embed_aa:
+            self.embedding = nn.Embedding(self.n_tokens_embed, c_s, padding_idx=0)
+
+        self.trunk = EsmFoldingTrunk(trunk_config)
+
+        # Output heads: two on the pair state (width `c_z`) and one on the sequence state (width `c_s`).
+        self.distogram_head = nn.Linear(c_z, self.distogram_bins)
+        self.ptm_head = nn.Linear(c_z, self.distogram_bins)
+        self.lm_head = nn.Linear(c_s, self.n_tokens_embed)
+        self.lddt_bins = 50
+        structure_module_config = trunk_config.structure_module
+        # The final layer emits `37 * lddt_bins` values; `forward` reshapes the last axis to (-1, lddt_bins).
+        self.lddt_head = nn.Sequential(
+            nn.LayerNorm(structure_module_config.sequence_dim),
+            nn.Linear(structure_module_config.sequence_dim, self.config.esmfold_config.lddt_head_hid_dim),
+            nn.Linear(self.config.esmfold_config.lddt_head_hid_dim, self.config.esmfold_config.lddt_head_hid_dim),
+            nn.Linear(self.config.esmfold_config.lddt_head_hid_dim, 37 * self.lddt_bins),
+        )
+
+    @staticmethod
+    def _af2_to_esm_from_vocab_list(vocab_list: list[str]) -> torch.Tensor:
+        """
+        Build the lookup table from shifted residue indices to positions in `vocab_list`.
+
+        Entry 0 is the position of "<pad>"; entries 1.. follow the order of `residue_constants.restypes_with_x`.
+        """
+        # Remember that t is shifted from residue_constants by 1 (0 is padding).
+        ordered_tokens = ["<pad>", *residue_constants.restypes_with_x]
+        return torch.tensor([vocab_list.index(token) for token in ordered_tokens])
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        masking_pattern: Optional[torch.Tensor] = None,
+        num_recycles: Optional[int] = None,
+        output_hidden_states: Optional[bool] = False,
+    ) -> EsmForProteinFoldingOutput:
+        r"""
+        masking_pattern (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Locations of tokens to mask during training as a form of regularization. Mask values selected in `[0, 1]`.
+        num_recycles (`int`, *optional*, defaults to `None`):
+            Number of times to recycle the input sequence. If `None`, the folding trunk falls back to the
+            `max_recycles` value of its config. "Recycling" consists of passing the output of the folding trunk
+            back in as input to the trunk. During training, the number of recycles should vary with each batch, to
+            ensure that the model learns to output valid predictions after each recycle. During inference,
+            num_recycles should be set to the highest value that the model was trained with for maximum accuracy.
+
+        Example:
+
+        ```python
+        >>> from transformers import AutoTokenizer, EsmForProteinFolding
+
+        >>> model = EsmForProteinFolding.from_pretrained("facebook/esmfold_v1")
+        >>> tokenizer = AutoTokenizer.from_pretrained("facebook/esmfold_v1")
+        >>> inputs = tokenizer(["MLKNVQVQLV"], return_tensors="pt", add_special_tokens=False)  # A tiny random peptide
+        >>> outputs = model(**inputs)
+        >>> folded_positions = outputs.positions
+        ```
+
+        """
+        cfg = self.config.esmfold_config
+
+        aa = input_ids  # B x L
+        B = aa.shape[0]
+        L = aa.shape[1]
+        device = input_ids.device
+        if attention_mask is None:
+            attention_mask = torch.ones_like(aa, device=device)
+        if position_ids is None:
+            position_ids = torch.arange(L, device=device).expand_as(input_ids)
+
+        # === ESM ===
+        esmaa = self.af2_idx_to_esm_idx(aa, attention_mask)
+
+        if masking_pattern is not None:
+            masked_aa, esmaa, mlm_targets = self.bert_mask(aa, esmaa, attention_mask, masking_pattern)
+        else:
+            masked_aa = aa
+            mlm_targets = None
+
+        # The language model only provides the sequence representation esm_s here; the pair
+        # embedding (esm_z) of the original implementation is not supported by this port.
+        esm_s = self.compute_language_model_representations(esmaa)
+
+        # Convert esm_s to the precision used by the trunk and the structure module. It may be
+        # in a lower precision if, for example, we're running the language model in fp16 precision.
+        esm_s = esm_s.to(self.esm_s_combine.dtype)
+
+        if cfg.esm_ablate_sequence:
+            esm_s = esm_s * 0
+
+        esm_s = esm_s.detach()
+
+        # === preprocessing ===
+        # Softmax-weighted combination of the stacked hidden states (axis 2 of esm_s).
+        esm_s = (self.esm_s_combine.softmax(0).unsqueeze(0) @ esm_s).squeeze(2)
+        s_s_0 = self.esm_s_mlp(esm_s)
+
+        # The pair representation starts out as all zeros.
+        s_z_0 = s_s_0.new_zeros(B, L, L, cfg.trunk.pairwise_state_dim)
+
+        if self.config.esmfold_config.embed_aa:
+            s_s_0 += self.embedding(masked_aa)
+
+        structure: dict = self.trunk(s_s_0, s_z_0, aa, position_ids, attention_mask, no_recycles=num_recycles)
+        # Documenting what we expect:
+        structure = {
+            k: v
+            for k, v in structure.items()
+            if k
+            in [
+                "s_z",
+                "s_s",
+                "frames",
+                "sidechain_frames",
+                "unnormalized_angles",
+                "angles",
+                "positions",
+                "states",
+            ]
+        }
+
+        # Add BERT mask for the loss to use, if available. The check has to be `is not None`: taking the
+        # truth value of a tensor with more than one element raises a RuntimeError.
+        # NOTE(review): confirm that EsmForProteinFoldingOutput accepts an `mlm_targets` field.
+        if mlm_targets is not None:
+            structure["mlm_targets"] = mlm_targets
+
+        # Average the distogram logits with their transpose over the two residue axes.
+        disto_logits = self.distogram_head(structure["s_z"])
+        disto_logits = (disto_logits + disto_logits.transpose(1, 2)) / 2
+        structure["distogram_logits"] = disto_logits
+
+        lm_logits = self.lm_head(structure["s_s"])
+        structure["lm_logits"] = lm_logits
+
+        structure["aatype"] = aa
+        make_atom14_masks(structure)
+        # Of course, this doesn't respect the true mask because it doesn't know about it...
+        # We're not going to properly mask change of index tensors:
+        #    "residx_atom14_to_atom37",
+        #    "residx_atom37_to_atom14",
+        for k in [
+            "atom14_atom_exists",
+            "atom37_atom_exists",
+        ]:
+            structure[k] *= attention_mask.unsqueeze(-1)
+        structure["residue_index"] = position_ids
+
+        # "states" carries a leading axis (kept as-is by the reshape); pLDDT is computed from its last entry.
+        lddt_head = self.lddt_head(structure["states"]).reshape(structure["states"].shape[0], B, L, -1, self.lddt_bins)
+        structure["lddt_head"] = lddt_head
+        plddt = categorical_lddt(lddt_head[-1], bins=self.lddt_bins)
+        structure["plddt"] = plddt
+
+        ptm_logits = self.ptm_head(structure["s_z"])
+        structure["ptm_logits"] = ptm_logits
+        structure["ptm"] = compute_tm(ptm_logits, max_bin=31, no_bins=self.distogram_bins)
+        structure.update(compute_predicted_aligned_error(ptm_logits, max_bin=31, no_bins=self.distogram_bins))
+
+        return EsmForProteinFoldingOutput(**structure)
+
+    def af2_idx_to_esm_idx(self, aa, mask):
+        """
+        Translate residue indices into ESM vocabulary indices.
+
+        Indices are shifted up by one, positions where `mask != 1` are set to 0, and the result is used to index
+        the `af2_to_esm` lookup table.
+        """
+        # avoid indexing on different devices
+        lookup = self.af2_to_esm
+        if lookup.device != aa.device:
+            lookup = lookup.to(aa.device)
+            self.af2_to_esm = lookup
+        shifted = (aa + 1).masked_fill(mask != 1, 0)
+        return lookup[shifted]
+
+    def compute_language_model_representations(self, esmaa: torch.Tensor) -> torch.Tensor:
+        """
+        Run the ESM language model and stack its hidden states.
+
+        Args:
+            esmaa: `(B, L)` tensor of ESM vocabulary indices without BOS/EOS tokens.
+
+        Returns:
+            Tensor of shape `(B, L, n_states, C)` holding every hidden state returned by the language model, with
+            the BOS and EOS/padding positions removed. When `esmfold_config.bypass_lm` is set, the language model
+            is skipped and a zero tensor of shape `(B, L, self.esm_s_combine.size(0), self.esm_feats)` is
+            returned instead.
+        """
+        device = next(self.parameters()).device
+        B, L = esmaa.shape  # B = batch size, L = sequence length.
+
+        if self.config.esmfold_config.bypass_lm:
+            # `size` is a method, so it has to be called, and the result must be 4-D with one slice per
+            # entry of `esm_s_combine` so that the weighted combination in `forward` still works.
+            return torch.zeros(B, L, self.esm_s_combine.size(0), self.esm_feats, device=device)
+
+        # Use the padding index looked up from the vocabulary everywhere instead of a hard-coded 1.
+        pad_idx = self.esm_dict_padding_idx
+        bosi, eosi = self.esm_dict_cls_idx, self.esm_dict_eos_idx
+        bos = esmaa.new_full((B, 1), bosi)
+        eos = esmaa.new_full((B, 1), pad_idx)
+        esmaa = torch.cat([bos, esmaa, eos], dim=1)
+        # Use the first padding index as eos during inference.
+        esmaa[range(B), (esmaa != pad_idx).sum(1)] = eosi
+
+        # _, esm_z, esm_s = self.esm(esmaa, return_pairs=self.config.esmfold_config.use_esm_attn_map)
+        # Because we do not support use_esm_attn_map in the HF port as it is not used in any public models,
+        # esm_z is always None
+        esm_outputs = self.esm(esmaa, attention_mask=esmaa != pad_idx, output_hidden_states=True)
+        esm_s = torch.stack(esm_outputs["hidden_states"], dim=2)
+
+        esm_s = esm_s[:, 1:-1]  # B, L, nLayers, C
+
+        return esm_s
+
+    def bert_mask(self, aa, esmaa, mask, pattern):
+        """
+        Apply a BERT-style masking pattern to both index tensors.
+
+        Args:
+            aa: Residue indices; left unmodified (clones are edited).
+            esmaa: ESM vocabulary indices; left unmodified (a clone is edited).
+            mask: Not used by this method.
+            pattern: Tensor whose entries equal to 1 select the positions to mask.
+
+        Returns:
+            Tuple `(new_aa, new_esmaa, target)`: `aa` with selected positions set to `self.mask_idx`, `esmaa` with
+            selected positions set to `self.esm_dict_mask_idx`, and `aa` with all non-selected positions set to 0.
+        """
+        selected = pattern == 1
+
+        masked_aa = aa.clone()
+        masked_aa[selected] = self.mask_idx
+
+        masked_esmaa = esmaa.clone()
+        masked_esmaa[selected] = self.esm_dict_mask_idx
+
+        mlm_target = aa.clone()
+        mlm_target[pattern != 1] = 0
+
+        return masked_aa, masked_esmaa, mlm_target
+
+    @torch.no_grad()
+    def infer(
+        self,
+        seqs: Union[str, list[str]],
+        position_ids=None,
+    ):
+        """
+        Returns the raw outputs of the model given an input sequence.
+
+        Args:
+            seqs: One sequence string or a list of them; a single string is treated as a batch of one.
+            position_ids: Optional position ids. When `None`, `0..L-1` is used for every sequence; a 1-D tensor
+                gets a leading batch axis.
+
+        Returns:
+            Whatever `self.forward` returns for the encoded batch.
+        """
+        sequences = [seqs] if isinstance(seqs, str) else seqs
+        device = next(self.parameters()).device
+
+        def encode(sequence):
+            # One-hot encode the sequence and reduce it to one index per residue.
+            one_hot = residue_constants.sequence_to_onehot(
+                sequence=sequence,
+                mapping=residue_constants.restype_order_with_x,
+                map_unknown_to_x=True,
+            )
+            return torch.from_numpy(one_hot).to(device).argmax(dim=1)
+
+        aatype = collate_dense_tensors([encode(sequence) for sequence in sequences])  # B=1 x L
+        mask = collate_dense_tensors([aatype.new_ones(len(sequence)) for sequence in sequences])
+
+        if position_ids is None:
+            position_ids = torch.arange(aatype.shape[1], device=device).expand(len(sequences), -1)
+        else:
+            position_ids = position_ids.to(device)
+        if position_ids.ndim == 1:
+            position_ids = position_ids.unsqueeze(0)
+
+        return self.forward(aatype, mask, position_ids=position_ids)
+
+    @staticmethod
+    def output_to_pdb(output: dict) -> list[str]:
+        """
+        Returns the pdb (file) strings from the model given the model output.
+
+        Every value of `output` is moved to the CPU and converted to a NumPy array first. One PDB string is
+        produced per entry along the first axis of `output["aatype"]`, using the last entry of
+        `output["positions"]`, one-based residue indices, and `output["plddt"]` as B-factors.
+        """
+        arrays = {key: value.to("cpu").numpy() for key, value in output.items()}
+        atom37_positions = atom14_to_atom37(arrays["positions"][-1], arrays)
+        atom37_mask = arrays["atom37_atom_exists"]
+        num_sequences = arrays["aatype"].shape[0]
+        return [
+            to_pdb(
+                OFProtein(
+                    aatype=arrays["aatype"][idx],
+                    atom_positions=atom37_positions[idx],
+                    atom_mask=atom37_mask[idx],
+                    residue_index=arrays["residue_index"][idx] + 1,
+                    b_factors=arrays["plddt"][idx],
+                )
+            )
+            for idx in range(num_sequences)
+        ]
+
+    def infer_pdb(self, seqs, *args, **kwargs) -> str:
+        """
+        Returns the pdb (file) string from the model given an input sequence.
+
+        `seqs` must be a single string; extra arguments are forwarded to `infer`.
+        """
+        assert isinstance(seqs, str)
+        pdb_strings = self.output_to_pdb(self.infer(seqs, *args, **kwargs))
+        return pdb_strings[0]
+
+    def infer_pdbs(self, seqs: list[str], *args, **kwargs) -> list[str]:
+        """
+        Returns the pdb (file) strings from the model given a list of input sequences.
+
+        Extra arguments are forwarded to `infer`.
+        """
+        model_output = self.infer(seqs, *args, **kwargs)
+        pdb_strings = self.output_to_pdb(model_output)
+        return pdb_strings
+
+
+__all__ = ["EsmForProteinFolding", "EsmFoldPreTrainedModel"]
diff --git a/phivenv/Lib/site-packages/transformers/models/esm/modeling_tf_esm.py b/phivenv/Lib/site-packages/transformers/models/esm/modeling_tf_esm.py
new file mode 100644
index 0000000000000000000000000000000000000000..3fd066868f0e86cd2e130ad170eb38c4791f4f88
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/esm/modeling_tf_esm.py
@@ -0,0 +1,1574 @@
+# coding=utf-8
+# Copyright 2022 Meta and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""TensorFlow ESM model."""
+
+from __future__ import annotations
+
+import os
+
+import numpy as np
+import tensorflow as tf
+
+from ...file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward
+from ...modeling_tf_outputs import (
+    TFBaseModelOutputWithPastAndCrossAttentions,
+    TFBaseModelOutputWithPoolingAndCrossAttentions,
+    TFMaskedLMOutput,
+    TFSequenceClassifierOutput,
+    TFTokenClassifierOutput,
+)
+from ...modeling_tf_utils import (
+    TFMaskedLanguageModelingLoss,
+    TFModelInputType,
+    TFPreTrainedModel,
+    TFSequenceClassificationLoss,
+    TFTokenClassificationLoss,
+    get_initializer,
+    keras,
+    shape_list,
+    unpack_inputs,
+)
+from ...tf_utils import check_embeddings_within_bounds, stable_softmax
+from ...utils import logging
+from .configuration_esm import EsmConfig
+
+
+logger = logging.get_logger(__name__)
+
+_CHECKPOINT_FOR_DOC = "facebook/esm2_t6_8M_UR50D"
+_CONFIG_FOR_DOC = "EsmConfig"
+
+
+def rotate_half(x):
+    """Split `x` in two along the last axis and return `concat(-second_half, first_half)`."""
+    first_half, second_half = tf.split(x, 2, axis=-1)
+    return tf.concat((-second_half, first_half), axis=-1)
+
+
+def apply_rotary_pos_emb(x, cos, sin):
+    """
+    Apply rotary position embeddings to `x`.
+
+    `cos` and `sin` are truncated along their third axis to the size of the second-to-last axis of `x` before
+    computing `x * cos + rotate_half(x) * sin`.
+    """
+    seq_len = tf.shape(x)[-2]
+    trimmed_cos = cos[:, :, :seq_len, :]
+    trimmed_sin = sin[:, :, :seq_len, :]
+
+    return (x * trimmed_cos) + (rotate_half(x) * trimmed_sin)
+
+
+def symmetrize(x):
+    """Make layer symmetric in final two dimensions, used for contact prediction."""
+    transposed = tf.linalg.matrix_transpose(x)  # Transposes last two dimensions only
+    return x + transposed
+
+
+def average_product_correct(x):
+    """
+    Perform average product correct, used for contact prediction.
+
+    Subtracts `(row_sums * column_sums) / total_sum` from `x`, where the sums run over the last axis, the
+    second-to-last axis, and both of them.
+    """
+    row_sums = tf.reduce_sum(x, -1, keepdims=True)
+    column_sums = tf.reduce_sum(x, -2, keepdims=True)
+    total_sum = tf.reduce_sum(x, (-1, -2), keepdims=True)
+
+    correction = (row_sums * column_sums) / total_sum
+    return x - correction
+
+
+class TFRotaryEmbedding(keras.layers.Layer):
+    """
+    Rotary position embeddings based on those in
+    [RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer). Query and keys are transformed by rotation
+    matrices which depend on their relative positions.
+    """
+
+    def __init__(self, dim: int, name=None):
+        """Store `dim`; the `inv_freq` weight (of length `dim // 2`) is created in `build`."""
+        super().__init__(name=name)
+        # Matt: The PyTorch version of this layer does a lot of work to cache values, but we just rely on TF compilation
+        # and/or XLA to sort out constants like that. The layer still has to be stateful, though: self.inv_freq is a
+        # buffer in the original implementation, and all the shared ESM checkpoints were trained with fp16 params.
+        # The inv_freq tensor was therefore stored as a float16, and we need to replicate those lower-precision
+        # values or our models give different outputs from the original.
+        self.dim = dim
+
+    def build(self, input_shape):
+        """Create the non-trainable `inv_freq` weight and assign `1 / 10000 ** (even_index / dim)` to it."""
+        super().build(input_shape)
+        self.inv_freq = self.add_weight(
+            "inv_freq", shape=(self.dim // 2,), dtype=tf.float32, initializer=get_initializer(1.0), trainable=False
+        )
+        exponents = tf.range(start=0, limit=self.dim, delta=2, dtype=tf.float32) / self.dim
+        self.inv_freq.assign(1.0 / (10000**exponents))
+
+    def _compute_cos_sin(self, x, seq_dimension=2):
+        """Return the cosine and sine tables for the length of `x` along `seq_dimension`, with two leading axes."""
+        positions = tf.range(tf.shape(x)[seq_dimension], dtype=self.inv_freq.dtype)
+        angles = tf.einsum("i, j -> ij", positions, self.inv_freq)  # Outer multiplication
+        angles = tf.concat((angles, angles), axis=-1)[None, None, :, :]
+
+        return tf.cos(angles), tf.sin(angles)
+
+    def call(self, q: tf.Tensor, k: tf.Tensor) -> tuple[tf.Tensor, tf.Tensor]:
+        """Apply the rotary embedding to `q` and `k`; the tables are sized from the second-to-last axis of `k`."""
+        cos_emb, sin_emb = self._compute_cos_sin(k, seq_dimension=-2)
+
+        rotated_q = apply_rotary_pos_emb(q, cos_emb, sin_emb)
+        rotated_k = apply_rotary_pos_emb(k, cos_emb, sin_emb)
+        return rotated_q, rotated_k
+
+
+class TFEsmContactPredictionHead(keras.layers.Layer):
+    """Performs symmetrization, apc, and computes a logistic regression on the output features"""
+
+    def __init__(
+        self,
+        in_features: int,
+        bias=True,
+        eos_idx: int = 2,
+        name=None,
+    ):
+        """Store `eos_idx` and `in_features` and create the single-unit sigmoid `regression` layer."""
+        super().__init__(name=name)
+        self.eos_idx = eos_idx
+        self.in_features = in_features
+        self.regression = keras.layers.Dense(1, use_bias=bias, activation="sigmoid", name="regression")
+
+    def build(self, input_shape=None):
+        """Build the `regression` layer once, for inputs whose last axis has `in_features` entries."""
+        if self.built:
+            return
+        self.built = True
+        regression = getattr(self, "regression", None)
+        if regression is None:
+            return
+        with tf.name_scope(regression.name):
+            regression.build((None, self.in_features))
+
+    def call(self, tokens, attentions):
+        """Turn stacked attention maps into one sigmoid output per residue pair."""
+        # remove eos token attentions
+        keep = tf.cast(tokens != self.eos_idx, attentions.dtype)
+        pair_keep = tf.expand_dims(keep, 1) * tf.expand_dims(keep, 2)
+        masked = attentions * pair_keep[:, None, None, :, :]
+        trimmed = masked[..., :-1, :-1]
+        # remove cls token attentions
+        trimmed = trimmed[..., 1:, 1:]
+        batch_size, layers, heads, seqlen, _ = shape_list(trimmed)
+        features = tf.reshape(trimmed, (batch_size, layers * heads, seqlen, seqlen))
+
+        # features: batch x channels x tokens x tokens (symmetric)
+        features = average_product_correct(symmetrize(features))
+        # Move the channel axis last so the dense layer sees it as the feature axis.
+        features = tf.transpose(features, perm=(0, 2, 3, 1))
+        return tf.squeeze(self.regression(features), 3)
+
+
+class TFEsmEmbeddings(keras.layers.Layer):
+    """
+    Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
+    """
+
+    def __init__(self, config, name=None):
+        """
+        Create the word and position embedding layers and store several config values as attributes.
+
+        Args:
+            config: Model configuration. Reads `vocab_size`, `hidden_size`, `max_position_embeddings`,
+                `initializer_range`, `emb_layer_norm_before`, `layer_norm_eps` (only when `emb_layer_norm_before`
+                is truthy), `pad_token_id`, `token_dropout` and `mask_token_id`; `position_embedding_type` is
+                optional and defaults to "absolute".
+            name: Optional layer name forwarded to the Keras base class.
+        """
+        super().__init__(name=name)
+        self.word_embeddings = keras.layers.Embedding(
+            config.vocab_size,
+            config.hidden_size,
+            embeddings_initializer=get_initializer(config.initializer_range),
+            name="word_embeddings",
+        )
+        self.position_embeddings = keras.layers.Embedding(
+            config.max_position_embeddings,
+            config.hidden_size,
+            embeddings_initializer=get_initializer(config.initializer_range),
+            name="position_embeddings",
+        )
+
+        # The layer norm is optional; `self.layer_norm` is `None` when the config disables it.
+        if config.emb_layer_norm_before:
+            self.layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm")
+        else:
+            self.layer_norm = None
+        # Matt: I think this line was copied incorrectly from BERT, disabling for now
+        # self.dropout = Dropout(config.hidden_dropout_prob)
+        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
+
+        # Constant tensor 0..max_position_embeddings-1 with a leading axis of size 1.
+        self.position_ids = tf.range(config.max_position_embeddings)[None, :]
+
+        self.padding_idx = config.pad_token_id
+        self.token_dropout = config.token_dropout
+        self.mask_token_id = config.mask_token_id
+        self.config = config
+
+    def call(
+        self, input_ids=None, attention_mask=None, position_ids=None, inputs_embeds=None, past_key_values_length=0
+    ):
+        if position_ids is None:
+            if input_ids is not None:
+                # Create the position ids from the input token ids. Any padded tokens remain padded.
+                position_ids = create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length)
+            else:
+                position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds)
+
+        if inputs_embeds is None:
+            check_embeddings_within_bounds(input_ids, self.config.vocab_size)
+            inputs_embeds = self.word_embeddings(input_ids)
+
+        # Note that if we want to support ESM-1 (not 1b!) in future then we need to support an
+        # embedding_scale factor here.
+        embeddings = inputs_embeds
+
+        # Matt: ESM has the option to handle masking in MLM in a slightly unusual way. If the token_dropout
+        # flag is False then it is handled in the same was as BERT/RoBERTa. If it is set to True, however,
+        # masked tokens are treated as if they were selected for input dropout and zeroed out.
+        # This "mask-dropout" is compensated for when masked tokens are not present, by scaling embeddings by
+        # a factor of (fraction of unmasked tokens during training) / (fraction of unmasked tokens in sample).
+        # This is analogous to the way that dropout layers scale down outputs during evaluation when not
+        # actually dropping out values (or, equivalently, scale up their un-dropped outputs in training).
+        if self.token_dropout:
+            embeddings = tf.where((input_ids == self.mask_token_id)[:, :, None], 0.0, embeddings)
+            mask_ratio_train = 0.15 * 0.8  # Hardcoded as the ratio used in all ESM model training runs
+            src_lengths = tf.cast(tf.reduce_sum(attention_mask, axis=-1), tf.float32)
+            masked_tokens = input_ids == self.mask_token_id
+            mask_ratio_observed = tf.math.count_nonzero(masked_tokens, dtype=tf.float32, axis=-1) / src_lengths
+            embeddings = embeddings * (1 - mask_ratio_train) / (1 - mask_ratio_observed)[:, None, None]
+
+        if self.position_embedding_type == "absolute":
+            position_embeddings = self.position_embeddings(position_ids)
+            embeddings += position_embeddings
+
+        if self.layer_norm is not None:
+            embeddings = self.layer_norm(embeddings)
+        if attention_mask is not None:
+            embeddings = embeddings * tf.cast(tf.expand_dims(attention_mask, -1), embeddings.dtype)
+        # Matt: I think this line was copied incorrectly from BERT, disabling it for now.
+        # embeddings = self.dropout(embeddings)
+        return embeddings
+
+    def create_position_ids_from_inputs_embeds(self, inputs_embeds):
+        """
+        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.
+
+        Args:
+            inputs_embeds: tf.Tensor
+
+        Returns: tf.Tensor
+        """
+        input_shape = shape_list(inputs_embeds)[:-1]
+        sequence_length = input_shape[1]
+
+        position_ids = tf.range(
+            start=self.padding_idx + 1, limit=sequence_length + self.padding_idx + 1, dtype=tf.int64
+        )
+        return tf.broadcast_to(tf.expand_dims(position_ids, 0), input_shape)
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "word_embeddings", None) is not None:
+            with tf.name_scope(self.word_embeddings.name):
+                self.word_embeddings.build(None)
+        if getattr(self, "position_embeddings", None) is not None:
+            with tf.name_scope(self.position_embeddings.name):
+                self.position_embeddings.build(None)
+        if getattr(self, "layer_norm", None) is not None:
+            with tf.name_scope(self.layer_norm.name):
+                self.layer_norm.build([None, None, self.config.hidden_size])
+
+
+class TFEsmSelfAttention(keras.layers.Layer):
+    """
+    Multi-head attention used for both self-attention and cross-attention.
+
+    Supports key/value caching when `config.is_decoder` is set, and the position embedding types "absolute",
+    "relative_key", "relative_key_query" and "rotary".
+    """
+
+    def __init__(self, config, position_embedding_type=None, name=None):
+        super().__init__(name=name)
+        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
+            raise ValueError(
+                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
+                f"heads ({config.num_attention_heads})"
+            )
+
+        self.num_attention_heads = config.num_attention_heads
+        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
+        self.all_head_size = self.num_attention_heads * self.attention_head_size
+
+        self.query = keras.layers.Dense(
+            self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query"
+        )
+        self.key = keras.layers.Dense(
+            self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key"
+        )
+        self.value = keras.layers.Dense(
+            self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value"
+        )
+
+        self.dropout = keras.layers.Dropout(config.attention_probs_dropout_prob)
+        # An explicit constructor argument wins over the config value; the fallback default is "absolute".
+        self.position_embedding_type = position_embedding_type or getattr(
+            config, "position_embedding_type", "absolute"
+        )
+        self.rotary_embeddings = None
+        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
+            self.max_position_embeddings = config.max_position_embeddings
+            # One embedding per possible signed distance in [-(max - 1), max - 1].
+            # NOTE(review): this layer is created without a name and is not handled in build() below.
+            self.distance_embedding = keras.layers.Embedding(
+                2 * config.max_position_embeddings - 1,
+                self.attention_head_size,
+                embeddings_initializer=get_initializer(config.initializer_range),
+            )
+        elif self.position_embedding_type == "rotary":
+            self.rotary_embeddings = TFRotaryEmbedding(dim=self.attention_head_size, name="rotary_embeddings")
+
+        self.is_decoder = config.is_decoder
+        self.config = config
+
+    def transpose_for_scores(self, x: tf.Tensor) -> tf.Tensor:
+        """Split the last axis into (num_heads, head_size) and swap axes 1 and 2, giving (batch, heads, seq, head_size)."""
+        new_x_shape = shape_list(x)[:-1] + [self.num_attention_heads, self.attention_head_size]
+        x = tf.reshape(x, new_x_shape)
+        return tf.transpose(x, perm=(0, 2, 1, 3))
+
+    def call(
+        self,
+        hidden_states: tf.Tensor,
+        attention_mask: tf.Tensor | None = None,
+        head_mask: tf.Tensor | None = None,
+        encoder_hidden_states: tf.Tensor | None = None,
+        encoder_attention_mask: tf.Tensor | None = None,
+        past_key_value: tuple[tuple[tf.Tensor]] | None = None,
+        output_attentions: bool | None = False,
+        training: bool = False,
+    ) -> tuple[tf.Tensor]:
+        """
+        Compute attention over `hidden_states` (or over `encoder_hidden_states` for cross-attention).
+
+        Returns:
+            A tuple `(context_layer,)`, followed by the attention probabilities when `output_attentions` is
+            truthy, followed by the `(key, value)` cache tuple when `self.is_decoder` is set.
+        """
+        mixed_query_layer = self.query(hidden_states)
+
+        # If this is instantiated as a cross-attention module, the keys
+        # and values come from an encoder; the attention mask needs to be
+        # such that the encoder's padding tokens are not attended to.
+        is_cross_attention = encoder_hidden_states is not None
+
+        if is_cross_attention and past_key_value is not None:
+            # reuse k,v, cross_attentions
+            key_layer = past_key_value[0]
+            value_layer = past_key_value[1]
+            attention_mask = encoder_attention_mask
+        elif is_cross_attention:
+            key_layer = self.transpose_for_scores(self.key(encoder_hidden_states))
+            value_layer = self.transpose_for_scores(self.value(encoder_hidden_states))
+            attention_mask = encoder_attention_mask
+        elif past_key_value is not None:
+            # Self-attention with a cache: append the new keys/values along the sequence axis (axis 2).
+            key_layer = self.transpose_for_scores(self.key(hidden_states))
+            value_layer = self.transpose_for_scores(self.value(hidden_states))
+            key_layer = tf.concat([past_key_value[0], key_layer], axis=2)
+            value_layer = tf.concat([past_key_value[1], value_layer], axis=2)
+        else:
+            key_layer = self.transpose_for_scores(self.key(hidden_states))
+            value_layer = self.transpose_for_scores(self.value(hidden_states))
+
+        query_layer = self.transpose_for_scores(mixed_query_layer)
+
+        # Matt: Our BERT model (which this code was derived from) scales attention logits down by sqrt(head_dim).
+        # ESM scales the query down by the same factor instead. Modulo numerical stability these are equivalent,
+        # but not when rotary embeddings get involved. Therefore, we scale the query here to match the original
+        # ESM code and fix rotary embeddings.
+        query_layer = query_layer * self.attention_head_size**-0.5
+
+        if self.is_decoder:
+            # if cross_attention save Tuple(tf.Tensor, tf.Tensor) of all cross attention key/value_states.
+            # Further calls to cross_attention layer can then reuse all cross-attention
+            # key/value_states (first "if" case)
+            # if uni-directional self-attention (decoder) save Tuple(tf.Tensor, tf.Tensor) of
+            # all previous decoder key/value_states. Further calls to uni-directional self-attention
+            # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
+            # if encoder bi-directional self-attention `past_key_value` is always `None`
+            # Note: the cache is captured here, before rotary embeddings are applied to the keys below.
+            past_key_value = (key_layer, value_layer)
+
+        if self.position_embedding_type == "rotary":
+            query_layer, key_layer = self.rotary_embeddings(query_layer, key_layer)
+
+        # Take the dot product between "query" and "key" to get the raw attention scores.
+        attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)
+
+        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
+            # Pairwise signed distances l - r, shifted by (max_position_embeddings - 1) so indices are non-negative.
+            seq_length = shape_list(hidden_states)[1]
+            position_ids_l = tf.expand_dims(tf.range(seq_length, dtype=tf.int64), -1)
+            position_ids_r = tf.expand_dims(tf.range(seq_length, dtype=tf.int64), 0)
+            distance = position_ids_l - position_ids_r
+            positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1)
+            positional_embedding = tf.cast(positional_embedding, query_layer.dtype)  # fp16 compatibility
+
+            if self.position_embedding_type == "relative_key":
+                relative_position_scores = tf.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
+                attention_scores = attention_scores + relative_position_scores
+            elif self.position_embedding_type == "relative_key_query":
+                relative_position_scores_query = tf.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
+                relative_position_scores_key = tf.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding)
+                attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key
+
+        if attention_mask is not None:
+            # Apply the attention mask (precomputed for all layers in the EsmModel forward() function).
+            # The mask is additive: it is summed with the raw scores before the softmax.
+            attention_scores = attention_scores + attention_mask
+
+        # Normalize the attention scores to probabilities.
+        attention_probs = stable_softmax(attention_scores, axis=-1)
+
+        # This is actually dropping out entire tokens to attend to, which might
+        # seem a bit unusual, but is taken from the original Transformer paper.
+        attention_probs = self.dropout(attention_probs, training=training)
+
+        # Mask heads if we want to
+        if head_mask is not None:
+            attention_probs = attention_probs * head_mask
+
+        context_layer = attention_probs @ value_layer
+
+        # Undo transpose_for_scores: back to (batch, seq, heads, head_size), then merge the two head axes.
+        context_layer = tf.transpose(context_layer, perm=(0, 2, 1, 3))
+        new_context_layer_shape = shape_list(context_layer)[:-2] + [self.all_head_size]
+        context_layer = tf.reshape(context_layer, new_context_layer_shape)
+
+        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
+
+        if self.is_decoder:
+            outputs = outputs + (past_key_value,)
+        return outputs
+
+    def build(self, input_shape=None):
+        """Build the query/key/value projections and, when present, the rotary embeddings; no-op if already built."""
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "query", None) is not None:
+            with tf.name_scope(self.query.name):
+                self.query.build([None, None, self.config.hidden_size])
+        if getattr(self, "key", None) is not None:
+            with tf.name_scope(self.key.name):
+                self.key.build([None, None, self.config.hidden_size])
+        if getattr(self, "value", None) is not None:
+            with tf.name_scope(self.value.name):
+                self.value.build([None, None, self.config.hidden_size])
+        if getattr(self, "rotary_embeddings", None) is not None:
+            with tf.name_scope(self.rotary_embeddings.name):
+                self.rotary_embeddings.build(None)
+
+
+class TFEsmSelfOutput(keras.layers.Layer):
+    """Attention output projection: dense to `config.hidden_size`, dropout, then a residual add."""
+
+    def __init__(self, config, name=None):
+        super().__init__(name=name)
+        self.config = config
+        projection_init = get_initializer(config.initializer_range)
+        self.dense = keras.layers.Dense(config.hidden_size, kernel_initializer=projection_init, name="dense")
+        self.dropout = keras.layers.Dropout(config.hidden_dropout_prob)
+
+    def call(self, hidden_states, input_tensor, training=False):
+        """Project `hidden_states`, apply dropout, and add `input_tensor` as the residual."""
+        projected = self.dense(hidden_states)
+        dropped = self.dropout(projected, training=training)
+        return dropped + input_tensor
+
+    def build(self, input_shape=None):
+        """Build the dense projection under its own name scope; does nothing if already built."""
+        if self.built:
+            return
+        self.built = True
+        dense = getattr(self, "dense", None)
+        if dense is None:
+            return
+        with tf.name_scope(dense.name):
+            dense.build([None, None, self.config.hidden_size])
+
+
+class TFEsmAttention(keras.layers.Layer):
+    """
+    Attention block: LayerNorm on the input, `TFEsmSelfAttention` on the normalized states, then
+    `TFEsmSelfOutput`, which adds the *un-normalized* input back as the residual.
+    """
+
+    def __init__(self, config, name=None):
+        super().__init__(name=name)
+        self.self = TFEsmSelfAttention(config, name="self")
+        self.output_layer = TFEsmSelfOutput(config, name="output")
+        self.pruned_heads = set()
+        self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
+        self.config = config
+
+    def prune_heads(self, heads):
+        """Head pruning is not implemented for this layer."""
+        raise NotImplementedError
+
+    def call(
+        self,
+        hidden_states,
+        attention_mask=None,
+        head_mask=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        past_key_value=None,
+        output_attentions=False,
+        training=False,
+    ):
+        """
+        Run the attention block.
+
+        Returns:
+            A tuple whose first element is the attention output; the remaining elements are whatever
+            `TFEsmSelfAttention` returned after its context tensor (attention probabilities and/or cache).
+        """
+        hidden_states_ln = self.LayerNorm(hidden_states)
+        self_outputs = self.self(
+            hidden_states_ln,
+            attention_mask,
+            head_mask,
+            encoder_hidden_states,
+            encoder_attention_mask,
+            past_key_value,
+            output_attentions,
+            training,
+        )
+        # Forward `training` explicitly so the output dropout follows the caller's mode rather than relying
+        # on implicit propagation; this matches how TFEsmLayer calls its own output layer.
+        attention_output = self.output_layer(self_outputs[0], hidden_states, training=training)
+        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
+        return outputs
+
+    def build(self, input_shape=None):
+        """Build the attention, output and LayerNorm sub-layers under their own name scopes; no-op if built."""
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "self", None) is not None:
+            with tf.name_scope(self.self.name):
+                self.self.build(None)
+        if getattr(self, "output_layer", None) is not None:
+            with tf.name_scope(self.output_layer.name):
+                self.output_layer.build(None)
+        if getattr(self, "LayerNorm", None) is not None:
+            with tf.name_scope(self.LayerNorm.name):
+                self.LayerNorm.build([None, None, self.config.hidden_size])
+
+
+class TFEsmIntermediate(keras.layers.Layer):
+    """Feed-forward expansion: a dense projection to `config.intermediate_size` followed by GELU."""
+
+    def __init__(self, config: EsmConfig, **kwargs):
+        super().__init__(**kwargs)
+
+        self.config = config
+        expansion_init = get_initializer(config.initializer_range)
+        self.dense = keras.layers.Dense(
+            units=config.intermediate_size,
+            kernel_initializer=expansion_init,
+            name="dense",
+        )
+
+    def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
+        """Apply the dense projection and then `tf.nn.gelu`."""
+        expanded = self.dense(inputs=hidden_states)
+        return tf.nn.gelu(expanded)
+
+    def build(self, input_shape=None):
+        """Build the dense projection for inputs whose last dimension is `config.hidden_size`; no-op if built."""
+        if self.built:
+            return
+        self.built = True
+        dense = getattr(self, "dense", None)
+        if dense is None:
+            return
+        with tf.name_scope(dense.name):
+            dense.build([None, None, self.config.hidden_size])
+
+
+class TFEsmOutput(keras.layers.Layer):
+    """Feed-forward contraction: dense back to `config.hidden_size`, dropout, then a residual add."""
+
+    def __init__(self, config, name=None):
+        super().__init__(name=name)
+        self.config = config
+        contraction_init = get_initializer(config.initializer_range)
+        self.dense = keras.layers.Dense(config.hidden_size, kernel_initializer=contraction_init, name="dense")
+        self.dropout = keras.layers.Dropout(config.hidden_dropout_prob)
+
+    def call(self, hidden_states, input_tensor, training=False):
+        """Project `hidden_states`, apply dropout, and add `input_tensor` as the residual."""
+        projected = self.dense(hidden_states)
+        dropped = self.dropout(projected, training=training)
+        return dropped + input_tensor
+
+    def build(self, input_shape=None):
+        """Build the dense projection for inputs whose last dimension is `config.intermediate_size`; no-op if built."""
+        if self.built:
+            return
+        self.built = True
+        dense = getattr(self, "dense", None)
+        if dense is None:
+            return
+        with tf.name_scope(dense.name):
+            dense.build([None, None, self.config.intermediate_size])
+
+
+class TFEsmLayer(keras.layers.Layer):
+    """
+    One transformer layer: self-attention, optional cross-attention (decoder only), then a LayerNorm-ed
+    feed-forward block (`TFEsmIntermediate` + `TFEsmOutput`) with a residual connection.
+    """
+
+    def __init__(self, config, name=None):
+        super().__init__(name=name)
+        self.chunk_size_feed_forward = config.chunk_size_feed_forward
+        self.seq_len_dim = 1
+        self.attention = TFEsmAttention(config, name="attention")
+        self.is_decoder = config.is_decoder
+        self.add_cross_attention = config.add_cross_attention
+        if self.add_cross_attention:
+            if not self.is_decoder:
+                raise RuntimeError(f"{self} should be used as a decoder model if cross attention is added")
+            # NOTE(review): created without an explicit name and not handled in build() below.
+            self.crossattention = TFEsmAttention(config)
+        self.intermediate = TFEsmIntermediate(config, name="intermediate")
+        self.output_layer = TFEsmOutput(config, name="output")
+        self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
+        self.config = config
+
+    def call(
+        self,
+        hidden_states,
+        attention_mask=None,
+        head_mask=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        past_key_value=None,
+        output_attentions=False,
+        training=False,
+    ):
+        """
+        Run the layer.
+
+        Returns:
+            A tuple starting with the layer output, followed by any attention tensors returned by the
+            attention sub-layers, and ending with the present key/value cache when `self.is_decoder` is set.
+
+        Raises:
+            AttributeError: if `encoder_hidden_states` is passed to a decoder layer that has no
+                `crossattention` sub-layer.
+        """
+        # decoder uni-directional self-attention cached key/values are the first two entries of past_key_value
+        self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
+        self_attention_outputs = self.attention(
+            hidden_states,
+            attention_mask,
+            head_mask,
+            output_attentions=output_attentions,
+            past_key_value=self_attn_past_key_value,
+            training=training,
+        )
+        attention_output = self_attention_outputs[0]
+
+        # if decoder, the last output is tuple of self-attn cache
+        if self.is_decoder:
+            outputs = self_attention_outputs[1:-1]
+            present_key_value = self_attention_outputs[-1]
+        else:
+            outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights
+
+        cross_attn_present_key_value = None
+        if self.is_decoder and encoder_hidden_states is not None:
+            if not hasattr(self, "crossattention"):
+                raise AttributeError(
+                    f"If `encoder_hidden_states` are passed, {self} has to be instantiated"
+                    " with cross-attention layers by setting `config.add_cross_attention=True`"
+                )
+
+            # cross_attn cached key/values are the last two entries of the past_key_value tuple
+            cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
+            cross_attention_outputs = self.crossattention(
+                attention_output,
+                attention_mask,
+                head_mask,
+                encoder_hidden_states,
+                encoder_attention_mask,
+                cross_attn_past_key_value,
+                output_attentions,
+                training=training,
+            )
+            attention_output = cross_attention_outputs[0]
+            outputs = outputs + cross_attention_outputs[1:-1]  # add cross attentions if we output attention weights
+
+            # append the cross-attn cache after the self-attn cache in the present_key_value tuple
+            cross_attn_present_key_value = cross_attention_outputs[-1]
+            present_key_value = present_key_value + cross_attn_present_key_value
+
+        # Feed-forward block: the LayerNorm output feeds the MLP, while the residual uses the un-normalized
+        # attention output.
+        layernorm_output = self.LayerNorm(attention_output)
+        intermediate_output = self.intermediate(hidden_states=layernorm_output)
+        layer_output = self.output_layer(
+            hidden_states=intermediate_output, input_tensor=attention_output, training=training
+        )
+        outputs = (layer_output,) + outputs  # add attentions if we output them
+
+        # if decoder, return the attn key/values as the last output
+        if self.is_decoder:
+            outputs = outputs + (present_key_value,)
+
+        return outputs
+
+    def build(self, input_shape=None):
+        """Build the attention, intermediate, output and LayerNorm sub-layers under their own name scopes."""
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "attention", None) is not None:
+            with tf.name_scope(self.attention.name):
+                self.attention.build(None)
+        if getattr(self, "intermediate", None) is not None:
+            with tf.name_scope(self.intermediate.name):
+                self.intermediate.build(None)
+        if getattr(self, "output_layer", None) is not None:
+            with tf.name_scope(self.output_layer.name):
+                self.output_layer.build(None)
+        if getattr(self, "LayerNorm", None) is not None:
+            with tf.name_scope(self.LayerNorm.name):
+                self.LayerNorm.build([None, None, self.config.hidden_size])
+
+
+class TFEsmEncoder(keras.layers.Layer):
+    """Stack of `config.num_hidden_layers` `TFEsmLayer`s followed by a final LayerNorm (`emb_layer_norm_after`)."""
+
+    def __init__(self, config, name=None):
+        super().__init__(name=name)
+        self.config = config
+        self.layer = [TFEsmLayer(config, name=f"layer_._{i}") for i in range(config.num_hidden_layers)]
+        self.emb_layer_norm_after = keras.layers.LayerNormalization(
+            epsilon=config.layer_norm_eps, name="emb_layer_norm_after"
+        )
+
+    def call(
+        self,
+        hidden_states,
+        attention_mask=None,
+        head_mask=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        past_key_values=None,
+        use_cache=None,
+        output_attentions=False,
+        output_hidden_states=False,
+        return_dict=True,
+        training=False,
+    ):
+        """
+        Run every layer in order and apply the final LayerNorm.
+
+        Args:
+            head_mask: optional; indexed per layer (`head_mask[i]`).
+            past_key_values: optional; indexed per layer (`past_key_values[i]`).
+            use_cache: when truthy, the last element of each layer's output is collected as the new cache.
+            output_attentions / output_hidden_states: when truthy, collect the per-layer tensors.
+            return_dict: when falsy, return a plain tuple containing only the non-`None` results.
+
+        Returns:
+            `TFBaseModelOutputWithPastAndCrossAttentions`, or a tuple when `return_dict` is falsy.
+        """
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attentions = () if output_attentions else None
+        all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
+
+        next_decoder_cache = () if use_cache else None
+        for i, layer_module in enumerate(self.layer):
+            if output_hidden_states:
+                # Record the input to layer i (the embedding output for i == 0).
+                all_hidden_states = all_hidden_states + (hidden_states,)
+
+            layer_head_mask = head_mask[i] if head_mask is not None else None
+            past_key_value = past_key_values[i] if past_key_values is not None else None
+
+            layer_outputs = layer_module(
+                hidden_states,
+                attention_mask,
+                layer_head_mask,
+                encoder_hidden_states,
+                encoder_attention_mask,
+                past_key_value,
+                output_attentions,
+                training,
+            )
+
+            hidden_states = layer_outputs[0]
+            if use_cache:
+                next_decoder_cache += (layer_outputs[-1],)
+            if output_attentions:
+                all_self_attentions = all_self_attentions + (layer_outputs[1],)
+                if self.config.add_cross_attention:
+                    all_cross_attentions = all_cross_attentions + (layer_outputs[2],)
+
+        # Explicit None check (consistent with build()) rather than relying on the truthiness of a layer object.
+        if self.emb_layer_norm_after is not None:
+            hidden_states = self.emb_layer_norm_after(hidden_states)
+
+        if output_hidden_states:
+            # The final entry is the post-LayerNorm output.
+            all_hidden_states = all_hidden_states + (hidden_states,)
+
+        if not return_dict:
+            return tuple(
+                v
+                for v in [
+                    hidden_states,
+                    next_decoder_cache,
+                    all_hidden_states,
+                    all_self_attentions,
+                    all_cross_attentions,
+                ]
+                if v is not None
+            )
+        return TFBaseModelOutputWithPastAndCrossAttentions(
+            last_hidden_state=hidden_states,
+            past_key_values=next_decoder_cache,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attentions,
+            cross_attentions=all_cross_attentions,
+        )
+
+    def build(self, input_shape=None):
+        """Build the final LayerNorm and every layer under their own name scopes; does nothing if already built."""
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "emb_layer_norm_after", None) is not None:
+            with tf.name_scope(self.emb_layer_norm_after.name):
+                self.emb_layer_norm_after.build([None, None, self.config.hidden_size])
+        if getattr(self, "layer", None) is not None:
+            for layer in self.layer:
+                with tf.name_scope(layer.name):
+                    layer.build(None)
+
+
+# Copied from transformers.models.bert.modeling_tf_bert.TFBertPooler with Bert->Esm
+class TFEsmPooler(keras.layers.Layer):
+    """Pools a sequence by passing the hidden state of its first token through a tanh-activated dense layer."""
+
+    def __init__(self, config: EsmConfig, **kwargs):
+        super().__init__(**kwargs)
+
+        self.dense = keras.layers.Dense(
+            units=config.hidden_size,
+            kernel_initializer=get_initializer(config.initializer_range),
+            activation="tanh",
+            name="dense",
+        )
+        self.config = config
+
+    def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
+        """Return `dense(hidden_states[:, 0])`, i.e. the projected first-token state."""
+        # We "pool" the model by simply taking the hidden state corresponding
+        # to the first token.
+        first_token_tensor = hidden_states[:, 0]
+        pooled_output = self.dense(inputs=first_token_tensor)
+
+        return pooled_output
+
+    def build(self, input_shape=None):
+        """Build the dense layer for inputs whose last dimension is `config.hidden_size`; no-op if already built."""
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "dense", None) is not None:
+            with tf.name_scope(self.dense.name):
+                self.dense.build([None, None, self.config.hidden_size])
+
+
+class TFEsmPreTrainedModel(TFPreTrainedModel):
+    """
+    Abstract base for the TF ESM models: handles weights initialization and provides a simple interface for
+    downloading and loading pretrained models.
+    """
+
+    # Prefix string associated with the base model.
+    base_model_prefix = "esm"
+    # Configuration class paired with this model family.
+    config_class = EsmConfig
+
+
+# Shared introductory docstring text; passed to `add_start_docstrings` where a model class is decorated.
+ESM_START_DOCSTRING = r"""
+
+    This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.)
+
+    This model is also a Keras [Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it as a
+    regular Keras model and refer to the TF/Keras documentation for all matters related to general usage and behavior.
+
+    Parameters:
+        config ([`EsmConfig`]): Model configuration class with all the parameters of the
+            model. Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+# Shared description of the model inputs. The `{0}` placeholders stand for the input shape and are
+# presumably filled in via `str.format` by the docstring decorators - TODO confirm against the callers.
+ESM_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`tf.Tensor` of shape `({0})`):
+            Indices of input sequence tokens in the vocabulary.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        attention_mask (`tf.Tensor` of shape `({0})`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
+        position_ids (`tf.Tensor` of shape `({0})`, *optional*):
+            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+            config.max_position_embeddings - 1]`.
+
+            [What are position IDs?](../glossary#position-ids)
+        head_mask (`tf.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+
+        inputs_embeds (`tf.Tensor` of shape `({0}, hidden_size)`, *optional*):
+            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+            model's internal embedding lookup matrix.
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+@add_start_docstrings(
+    "The bare ESM Model transformer outputting raw hidden-states without any specific head on top.",
+    ESM_START_DOCSTRING,
+)
+class TFEsmMainLayer(keras.layers.Layer):
+    """
+
+    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
+    cross-attention is added between the self-attention layers, following the architecture described in [Attention is
+    all you need](https://huggingface.co/papers/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
+    Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.
+
+    To behave as an decoder the model needs to be initialized with the `is_decoder` argument of the configuration set
+    to `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and
+    `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.
+    """
+
+    _keys_to_ignore_on_load_missing = [r"position_ids"]
+
+    def __init__(self, config, add_pooling_layer=True, name=None, **kwargs):
+        super().__init__(name=name, **kwargs)
+
+        self.config = config
+        self.is_decoder = config.is_decoder
+
+        self.embeddings = TFEsmEmbeddings(config, name="embeddings")
+        self.encoder = TFEsmEncoder(config, name="encoder")
+        self.pooler = TFEsmPooler(config, name="pooler") if add_pooling_layer else None
+
+        self.contact_head = TFEsmContactPredictionHead(
+            in_features=self.config.num_hidden_layers * self.config.num_attention_heads, bias=True, name="contact_head"
+        )
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "embeddings", None) is not None:
+            with tf.name_scope(self.embeddings.name):
+                self.embeddings.build(None)
+        if getattr(self, "encoder", None) is not None:
+            with tf.name_scope(self.encoder.name):
+                self.encoder.build(None)
+        if getattr(self, "pooler", None) is not None:
+            with tf.name_scope(self.pooler.name):
+                self.pooler.build(None)
+        if getattr(self, "contact_head", None) is not None:
+            with tf.name_scope(self.contact_head.name):
+                self.contact_head.build(None)
+
+    def get_input_embeddings(self):
+        return self.embeddings.word_embeddings
+
+    def set_input_embeddings(self, value: tf.Variable):
+        self.embeddings.word_embeddings.weight = value
+        self.embeddings.vocab_size = shape_list(value)[0]
+
+    def _prune_heads(self, heads_to_prune):
+        raise NotImplementedError
+
+    def call(
+        self,
+        input_ids: TFModelInputType | None = None,
+        attention_mask: np.ndarray | tf.Tensor | None = None,
+        position_ids: np.ndarray | tf.Tensor | None = None,
+        head_mask: np.ndarray | tf.Tensor | None = None,
+        inputs_embeds: np.ndarray | tf.Tensor | None = None,
+        encoder_hidden_states: np.ndarray | tf.Tensor | None = None,
+        encoder_attention_mask: np.ndarray | tf.Tensor | None = None,
+        past_key_values: tuple[tuple[np.ndarray | tf.Tensor]] | None = None,
+        use_cache: bool | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        training: bool = False,
+    ) -> TFBaseModelOutputWithPoolingAndCrossAttentions | tuple[tf.Tensor]:
+        if not self.config.is_decoder:
+            use_cache = False
+
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+        elif input_ids is not None:
+            input_shape = shape_list(input_ids)
+        elif inputs_embeds is not None:
+            input_shape = shape_list(inputs_embeds)[:-1]
+        else:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+        batch_size, seq_length = input_shape
+
+        if past_key_values is None:
+            past_key_values_length = 0
+            past_key_values = [None] * len(self.encoder.layer)
+        else:
+            past_key_values_length = shape_list(past_key_values[0][0])[-2]
+
+        if attention_mask is None:
+            attention_mask = tf.fill(dims=(batch_size, seq_length + past_key_values_length), value=1)
+
+        embedding_output = self.embeddings(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            inputs_embeds=inputs_embeds,
+            past_key_values_length=past_key_values_length,
+            training=training,
+        )
+
+        # We create a 4D attention mask from a 2D tensor mask.
+        # Sizes are [batch_size, 1, 1, to_seq_length]
+        # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
+        # this attention mask is more simple than the triangular masking of causal attention
+        # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
+        attention_mask_shape = shape_list(attention_mask)
+
+        mask_seq_length = seq_length + past_key_values_length
+        # Copied from `modeling_tf_t5.py`
+        # Provided a padding mask of dimensions [batch_size, mask_seq_length]
+        # - if the model is a decoder, apply a causal mask in addition to the padding mask
+        # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, mask_seq_length, mask_seq_length]
+        if self.is_decoder:
+            seq_ids = tf.range(mask_seq_length)
+            causal_mask = tf.less_equal(
+                tf.tile(seq_ids[None, None, :], (batch_size, mask_seq_length, 1)),
+                seq_ids[None, :, None],
+            )
+            causal_mask = tf.cast(causal_mask, dtype=attention_mask.dtype)
+            extended_attention_mask = causal_mask * attention_mask[:, None, :]
+            attention_mask_shape = shape_list(extended_attention_mask)
+            extended_attention_mask = tf.reshape(
+                extended_attention_mask, (attention_mask_shape[0], 1, attention_mask_shape[1], attention_mask_shape[2])
+            )
+            if past_key_values[0] is not None:
+                # attention_mask needs to be sliced to the shape `[batch_size, 1, from_seq_length - cached_seq_length, to_seq_length]`
+                extended_attention_mask = extended_attention_mask[:, :, -seq_length:, :]
+        else:
+            extended_attention_mask = tf.reshape(
+                attention_mask, (attention_mask_shape[0], 1, 1, attention_mask_shape[1])
+            )
+
+        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
+        # masked positions, this operation will create a tensor which is 0.0 for
+        # positions we want to attend and -10000.0 for masked positions.
+        # Since we are adding it to the raw scores before the softmax, this is
+        # effectively the same as removing these entirely.
+        extended_attention_mask = tf.cast(extended_attention_mask, dtype=embedding_output.dtype)
+        one_cst = tf.constant(1.0, dtype=embedding_output.dtype)
+        ten_thousand_cst = tf.constant(-10000.0, dtype=embedding_output.dtype)
+        extended_attention_mask = tf.multiply(tf.subtract(one_cst, extended_attention_mask), ten_thousand_cst)
+
+        # Copied from `modeling_tf_t5.py` with -1e9 -> -10000
+        if self.is_decoder and encoder_attention_mask is not None:
+            # If a 2D or 3D attention mask is provided for the cross-attention
+            # we need to make broadcastable to [batch_size, num_heads, mask_seq_length, mask_seq_length]
+            # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
+            encoder_attention_mask = tf.cast(encoder_attention_mask, dtype=extended_attention_mask.dtype)
+            num_dims_encoder_attention_mask = len(shape_list(encoder_attention_mask))
+            if num_dims_encoder_attention_mask == 3:
+                encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :]
+            if num_dims_encoder_attention_mask == 2:
+                encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :]
+
+            # T5 has a mask that can compare sequence ids, we can simulate this here with this transposition
+            # Cf. https://github.com/tensorflow/mesh/blob/8d2465e9bc93129b913b5ccc6a59aa97abd96ec6/mesh_tensorflow/transformer/transformer_layers.py#L270
+            # encoder_extended_attention_mask = tf.math.equal(encoder_extended_attention_mask,
+            #                                         tf.transpose(encoder_extended_attention_mask, perm=(-1, -2)))
+
+            encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -10000.0
+        else:
+            encoder_extended_attention_mask = None
+
+        # Prepare head mask if needed
+        # 1.0 in head_mask indicate we keep the head
+        # attention_probs has shape bsz x n_heads x N x N
+        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
+        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
+        if head_mask is not None:
+            raise NotImplementedError
+        else:
+            head_mask = [None] * self.config.num_hidden_layers
+
+        encoder_outputs = self.encoder(
+            hidden_states=embedding_output,
+            attention_mask=extended_attention_mask,
+            head_mask=head_mask,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_extended_attention_mask,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+
+        sequence_output = encoder_outputs[0]
+        pooled_output = self.pooler(hidden_states=sequence_output) if self.pooler is not None else None
+
+        if not return_dict:
+            return (
+                sequence_output,
+                pooled_output,
+            ) + encoder_outputs[1:]
+
+        return TFBaseModelOutputWithPoolingAndCrossAttentions(
+            last_hidden_state=sequence_output,
+            pooler_output=pooled_output,
+            past_key_values=encoder_outputs.past_key_values,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+            cross_attentions=encoder_outputs.cross_attentions,
+        )
+
+    def predict_contacts(self, tokens, attention_mask):
+        attns = self(tokens, attention_mask=attention_mask, return_dict=True, output_attentions=True).attentions
+        attns = tf.stack(attns, axis=1)  # Matches the original model layout
+        # In the original model, attentions for padding tokens are completely zeroed out.
+        # This makes no difference most of the time because the other tokens won't attend to them,
+        # but it does for the contact prediction task, which takes attentions as input,
+        # so we have to mimic that here.
+        attention_mask = tf.cast(attention_mask, attns.dtype)
+        attns *= attention_mask[:, None, None, None]
+        attns *= attention_mask[:, None, None, :, None]
+        return self.contact_head(tokens, attns)
+
+
+@add_start_docstrings(
+    "The bare ESM Model transformer outputting raw hidden-states without any specific head on top.",
+    ESM_START_DOCSTRING,
+)
+class TFEsmModel(TFEsmPreTrainedModel):
+    def __init__(self, config: EsmConfig, add_pooling_layer=True, *inputs, **kwargs):
+        super().__init__(config, *inputs, **kwargs)
+
+        self.esm = TFEsmMainLayer(config, add_pooling_layer=add_pooling_layer, name="esm")
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(ESM_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_code_sample_docstrings(
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=TFBaseModelOutputWithPoolingAndCrossAttentions,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def call(
+        self,
+        input_ids: TFModelInputType | None = None,
+        attention_mask: np.ndarray | tf.Tensor | None = None,
+        position_ids: np.ndarray | tf.Tensor | None = None,
+        head_mask: np.ndarray | tf.Tensor | None = None,
+        inputs_embeds: np.ndarray | tf.Tensor | None = None,
+        encoder_hidden_states: np.ndarray | tf.Tensor | None = None,
+        encoder_attention_mask: np.ndarray | tf.Tensor | None = None,
+        past_key_values: tuple[tuple[np.ndarray | tf.Tensor]] | None = None,
+        use_cache: bool | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        training: bool | None = False,
+    ) -> TFBaseModelOutputWithPoolingAndCrossAttentions | tuple[tf.Tensor]:
+        r"""
+        encoder_hidden_states  (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
+            the model is configured as a decoder.
+        encoder_attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
+            the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+        past_key_values (`tuple[tuple[tf.Tensor]]` of length `config.n_layers`, *optional*):
+            contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
+            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
+            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
+            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+            `past_key_values`). Set to `False` during training, `True` during generation
+        """
+        outputs = self.esm(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_attention_mask,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+        return outputs
+
+    def predict_contacts(self, tokens, attention_mask):
+        return self.esm.predict_contacts(tokens, attention_mask)
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "esm", None) is not None:
+            with tf.name_scope(self.esm.name):
+                self.esm.build(None)
+
+
+@add_start_docstrings("""ESM Model with a `language modeling` head on top.""", ESM_START_DOCSTRING)
+class TFEsmForMaskedLM(TFEsmPreTrainedModel, TFMaskedLanguageModelingLoss):
+    _keys_to_ignore_on_load_missing = [r"position_ids"]
+    _keys_to_ignore_on_load_unexpected = [r"pooler"]
+
+    def __init__(self, config):
+        super().__init__(config)
+
+        if config.is_decoder:
+            logger.warning(
+                "If you want to use `EsmForMaskedLM` make sure `config.is_decoder=False` for "
+                "bi-directional self-attention."
+            )
+
+        self.esm = TFEsmMainLayer(config, add_pooling_layer=False, name="esm")
+        self.lm_head = TFEsmLMHead(config, name="lm_head")
+        if config.tie_word_embeddings:
+            # Ensure word embeddings are built so that we actually have something to tie
+            with tf.name_scope(os.path.join(self._name_scope(), "esm", "embeddings", "word_embeddings")):
+                self.esm.embeddings.word_embeddings.build((None, None))
+            self.lm_head.decoder = self.esm.embeddings.word_embeddings.weights[0]
+
+    def get_output_embeddings(self):
+        return self.lm_head.decoder
+
+    def set_output_embeddings(self, new_embeddings):
+        self.lm_head.decoder = new_embeddings
+
+    def get_lm_head(self):
+        return self.lm_head
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(ESM_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_code_sample_docstrings(
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=TFMaskedLMOutput,
+        config_class=_CONFIG_FOR_DOC,
+        mask="<mask>",
+    )
+    def call(
+        self,
+        input_ids: TFModelInputType | None = None,
+        attention_mask: np.ndarray | tf.Tensor | None = None,
+        position_ids: np.ndarray | tf.Tensor | None = None,
+        head_mask: np.ndarray | tf.Tensor | None = None,
+        inputs_embeds: np.ndarray | tf.Tensor | None = None,
+        encoder_hidden_states: np.ndarray | tf.Tensor | None = None,
+        encoder_attention_mask: np.ndarray | tf.Tensor | None = None,
+        labels: np.ndarray | tf.Tensor | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        training: bool = False,
+    ) -> TFMaskedLMOutput | tuple[tf.Tensor]:
+        r"""
+        labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
+            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
+            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
+        kwargs (`dict[str, any]`, *optional*, defaults to `{}`):
+            Used to hide legacy arguments that have been deprecated.
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.esm(
+            input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+        sequence_output = outputs[0]
+        prediction_scores = self.lm_head(sequence_output)
+
+        masked_lm_loss = None
+        if labels is not None:
+            masked_lm_loss = self.hf_compute_loss(labels=labels, logits=prediction_scores)
+
+        if not return_dict:
+            output = (prediction_scores,) + outputs[2:]
+            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
+
+        return TFMaskedLMOutput(
+            loss=masked_lm_loss,
+            logits=prediction_scores,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+    def predict_contacts(self, tokens, attention_mask):
+        return self.esm.predict_contacts(tokens, attention_mask)
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "esm", None) is not None:
+            with tf.name_scope(self.esm.name):
+                self.esm.build(None)
+        if getattr(self, "lm_head", None) is not None:
+            with tf.name_scope(self.lm_head.name):
+                self.lm_head.build(None)
+
+
+class TFEsmLMHead(keras.layers.Layer):
+    """ESM Head for masked language modeling."""
+
+    def __init__(self, config, name=None):
+        super().__init__(name=name)
+        self.dense = keras.layers.Dense(
+            config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
+        )
+
+        self.layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm")
+        if config.tie_word_embeddings:
+            self.decoder = None
+        else:
+            self.decoder = keras.layers.Dense(
+                config.vocab_size,
+                kernel_initializer=get_initializer(config.initializer_range),
+                name="decoder",
+                use_bias=False,
+            )
+        self.config = config
+
+    def build(self, input_shape=None):
+        # Separate bias to match the PT model and allow weight cross-loading to work
+        # Put it in the build so it gets the right name when adding it as a weight
+        if self.built:
+            return
+        self.built = True
+        self.bias = self.add_weight("bias", shape=(self.config.vocab_size,), initializer="zeros", trainable=True)
+        if getattr(self, "dense", None) is not None:
+            with tf.name_scope(self.dense.name):
+                self.dense.build([None, None, self.config.hidden_size])
+        if getattr(self, "layer_norm", None) is not None:
+            with tf.name_scope(self.layer_norm.name):
+                self.layer_norm.build([None, None, self.config.hidden_size])
+        if getattr(self, "decoder", None) is not None and not self.config.tie_word_embeddings:
+            with tf.name_scope(self.decoder.name):
+                self.decoder.build([None, None, self.config.hidden_size])
+
+    def get_bias(self):
+        return {"bias": self.bias}
+
+    def call(self, features):
+        x = self.dense(features)
+        x = tf.nn.gelu(x)
+        x = self.layer_norm(x)
+
+        # project back to size of vocabulary with bias
+        if self.config.tie_word_embeddings:
+            x = tf.matmul(x, self.decoder, transpose_b=True) + self.bias
+        else:
+            x = self.decoder(x) + self.bias
+        return x
+
+
+@add_start_docstrings(
+    """
+    ESM Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
+    output) e.g. for GLUE tasks.
+    """,
+    ESM_START_DOCSTRING,
+)
+class TFEsmForSequenceClassification(TFEsmPreTrainedModel, TFSequenceClassificationLoss):
+    _keys_to_ignore_on_load_missing = [r"position_ids"]
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+        self.config = config
+
+        self.esm = TFEsmMainLayer(config, add_pooling_layer=False, name="esm")
+        self.classifier = TFEsmClassificationHead(config, name="classifier")
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(ESM_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_code_sample_docstrings(
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=TFSequenceClassifierOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def call(
+        self,
+        input_ids: TFModelInputType | None = None,
+        attention_mask: np.ndarray | tf.Tensor | None = None,
+        position_ids: np.ndarray | tf.Tensor | None = None,
+        head_mask: np.ndarray | tf.Tensor | None = None,
+        inputs_embeds: np.ndarray | tf.Tensor | None = None,
+        labels: np.ndarray | tf.Tensor | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        training: bool = False,
+    ) -> TFSequenceClassifierOutput | tuple[tf.Tensor]:
+        r"""
+        labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.esm(
+            input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+        sequence_output = outputs[0]
+        logits = self.classifier(sequence_output)
+
+        loss = None if labels is None else self.hf_compute_loss(labels, logits)
+
+        if not return_dict:
+            output = (logits,) + outputs[2:]
+            return ((loss,) + output) if loss is not None else output
+
+        return TFSequenceClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "esm", None) is not None:
+            with tf.name_scope(self.esm.name):
+                self.esm.build(None)
+        if getattr(self, "classifier", None) is not None:
+            with tf.name_scope(self.classifier.name):
+                self.classifier.build(None)
+
+
+@add_start_docstrings(
+    """
+    ESM Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
+    Named-Entity-Recognition (NER) tasks.
+    """,
+    ESM_START_DOCSTRING,
+)
+class TFEsmForTokenClassification(TFEsmPreTrainedModel, TFTokenClassificationLoss):
+    _keys_to_ignore_on_load_unexpected = [r"pooler"]
+    _keys_to_ignore_on_load_missing = [r"position_ids"]
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+
+        self.esm = TFEsmMainLayer(config, add_pooling_layer=False, name="esm")
+        self.dropout = keras.layers.Dropout(config.hidden_dropout_prob)
+        self.classifier = keras.layers.Dense(config.num_labels, name="classifier")
+        self.config = config
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(ESM_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_code_sample_docstrings(
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=TFTokenClassifierOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def call(
+        self,
+        input_ids: TFModelInputType | None = None,
+        attention_mask: np.ndarray | tf.Tensor | None = None,
+        position_ids: np.ndarray | tf.Tensor | None = None,
+        head_mask: np.ndarray | tf.Tensor | None = None,
+        inputs_embeds: np.ndarray | tf.Tensor | None = None,
+        labels: np.ndarray | tf.Tensor | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        training: bool = False,
+    ) -> TFTokenClassifierOutput | tuple[tf.Tensor]:
+        r"""
+        labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.esm(
+            input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+
+        sequence_output = outputs[0]
+
+        sequence_output = self.dropout(sequence_output, training=training)
+        logits = self.classifier(sequence_output)
+
+        loss = None if labels is None else self.hf_compute_loss(labels, logits)
+
+        if not return_dict:
+            output = (logits,) + outputs[2:]
+            return ((loss,) + output) if loss is not None else output
+
+        return TFTokenClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "esm", None) is not None:
+            with tf.name_scope(self.esm.name):
+                self.esm.build(None)
+        if getattr(self, "classifier", None) is not None:
+            with tf.name_scope(self.classifier.name):
+                self.classifier.build([None, None, self.config.hidden_size])
+
+
+class TFEsmClassificationHead(keras.layers.Layer):
+    """Head for sentence-level classification tasks."""
+
+    def __init__(self, config, name=None):
+        super().__init__(name=name)
+        self.dense = keras.layers.Dense(
+            config.hidden_size,
+            kernel_initializer=get_initializer(config.initializer_range),
+            activation="tanh",
+            name="dense",
+        )
+        self.dropout = keras.layers.Dropout(config.hidden_dropout_prob)
+        self.out_proj = keras.layers.Dense(
+            config.num_labels,
+            kernel_initializer=get_initializer(config.initializer_range),
+            activation="linear",
+            name="out_proj",
+        )
+        self.config = config
+
+    def call(self, features, training=False):
+        x = features[:, 0, :]  # take <s> token (equiv. to [CLS])
+        x = self.dropout(x, training=training)
+        x = self.dense(x)
+        x = self.dropout(x, training=training)
+        x = self.out_proj(x)
+        return x
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "dense", None) is not None:
+            with tf.name_scope(self.dense.name):
+                self.dense.build([None, None, self.config.hidden_size])
+        if getattr(self, "out_proj", None) is not None:
+            with tf.name_scope(self.out_proj.name):
+                self.out_proj.build([None, None, self.config.hidden_size])
+
+
+def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
+    """
+    Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
+    are ignored. This is modified from fairseq's `utils.make_positions`.
+
+    Args:
+        input_ids: tf.Tensor of input token ids; entries equal to `padding_idx` are treated as padding.
+
+    Returns: tf.Tensor
+    """
+    # The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA.
+    mask = tf.cast(input_ids != padding_idx, tf.int64)
+    incremental_indices = (tf.cumsum(mask, axis=1) + past_key_values_length) * mask
+    return incremental_indices + padding_idx
+
+
+__all__ = [
+    "TFEsmForMaskedLM",
+    "TFEsmForSequenceClassification",
+    "TFEsmForTokenClassification",
+    "TFEsmModel",
+    "TFEsmPreTrainedModel",
+]
diff --git a/phivenv/Lib/site-packages/transformers/models/esm/openfold_utils/__init__.py b/phivenv/Lib/site-packages/transformers/models/esm/openfold_utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..02a8c149ae320dd9b045edc5df31760a4eebefd9
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/esm/openfold_utils/__init__.py
@@ -0,0 +1,8 @@
+from .chunk_utils import chunk_layer
+from .data_transforms import make_atom14_masks
+from .feats import atom14_to_atom37, frames_and_literature_positions_to_atom14_pos, torsion_angles_to_frames
+from .loss import compute_predicted_aligned_error, compute_tm
+from .protein import Protein as OFProtein
+from .protein import to_pdb
+from .rigid_utils import Rigid, Rotation
+from .tensor_utils import dict_multimap, flatten_final_dims, permute_final_dims
diff --git a/phivenv/Lib/site-packages/transformers/models/esm/openfold_utils/__pycache__/__init__.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/esm/openfold_utils/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e86310ac5c7bca38527099ddd58fa16b22753f85
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/esm/openfold_utils/__pycache__/__init__.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/esm/openfold_utils/__pycache__/chunk_utils.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/esm/openfold_utils/__pycache__/chunk_utils.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..af12e3c1efb18b75608fd084c4ac87a6f8fd57f0
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/esm/openfold_utils/__pycache__/chunk_utils.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/esm/openfold_utils/__pycache__/data_transforms.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/esm/openfold_utils/__pycache__/data_transforms.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a9ebf84fe1357c10afb4c00003e1169008ebcc8b
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/esm/openfold_utils/__pycache__/data_transforms.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/esm/openfold_utils/__pycache__/feats.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/esm/openfold_utils/__pycache__/feats.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ee434fda142922e7d8770a7d361124226f3fc67a
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/esm/openfold_utils/__pycache__/feats.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/esm/openfold_utils/__pycache__/loss.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/esm/openfold_utils/__pycache__/loss.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a840ae5e529975dedf98e427fda1e95b33035b65
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/esm/openfold_utils/__pycache__/loss.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/esm/openfold_utils/__pycache__/protein.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/esm/openfold_utils/__pycache__/protein.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..da4137dd308ebabf3c440018b5675a6fa755eb5f
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/esm/openfold_utils/__pycache__/protein.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/esm/openfold_utils/__pycache__/residue_constants.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/esm/openfold_utils/__pycache__/residue_constants.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..249866baf5e4289695bc3ba03fa755c59e477139
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/esm/openfold_utils/__pycache__/residue_constants.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/esm/openfold_utils/__pycache__/rigid_utils.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/esm/openfold_utils/__pycache__/rigid_utils.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4b5f072167f1789dbffac4b66db487f224d98228
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/esm/openfold_utils/__pycache__/rigid_utils.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/esm/openfold_utils/__pycache__/tensor_utils.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/esm/openfold_utils/__pycache__/tensor_utils.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..08e7a20a31096a75045284b544a38f2bfd91936d
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/esm/openfold_utils/__pycache__/tensor_utils.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/esm/openfold_utils/chunk_utils.py b/phivenv/Lib/site-packages/transformers/models/esm/openfold_utils/chunk_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..14703ba7d605dd9dbdbb55789d0ea960588048da
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/esm/openfold_utils/chunk_utils.py
@@ -0,0 +1,398 @@
+# Copyright 2021 AlQuraishi Laboratory
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import logging
+import math
+from collections.abc import Iterable, Sequence
+from functools import partial
+from typing import Any, Callable, Optional, Union
+
+import torch
+
+from .tensor_utils import tensor_tree_map, tree_map
+
+
+def _fetch_dims(tree: Union[dict, list, tuple, torch.Tensor]) -> list[tuple[int, ...]]:
+    """
+    Collect the shape of every tensor leaf in a nested structure of dicts, lists and tuples.
+
+    Shapes are returned in traversal order (dict values in insertion order, sequence items left to right).
+
+    Raises:
+        TypeError: if a node is neither a dict, list, tuple nor `torch.Tensor`.
+    """
+    if isinstance(tree, torch.Tensor):
+        return [tree.shape]
+
+    if isinstance(tree, dict):
+        children = tree.values()
+    elif isinstance(tree, (list, tuple)):
+        children = tree
+    else:
+        raise TypeError("Not supported")
+
+    collected = []
+    for child in children:
+        collected.extend(_fetch_dims(child))
+    return collected
+
+
+@torch.jit.ignore
+def _flat_idx_to_idx(flat_idx: int, dims: tuple[int, ...]) -> tuple[int, ...]:
+    """
+    Convert a flat index into one index per dimension for a tensor of shape `dims`, with the last dimension varying
+    fastest.
+    """
+    remaining = flat_idx
+    coords = []
+    # Peel coordinates off starting from the last dimension, then restore the natural order.
+    for size in dims[::-1]:
+        remaining, coord = divmod(remaining, size)
+        coords.append(coord)
+    coords.reverse()
+    return tuple(coords)
+
+
+@torch.jit.ignore
+def _get_minimal_slice_set(
+    start: Sequence[int],
+    end: Sequence[int],
+    dims: Sequence[int],
+    start_edges: Optional[Sequence[bool]] = None,
+    end_edges: Optional[Sequence[bool]] = None,
+) -> list[tuple[slice, ...]]:
+    """
+    Produces an ordered sequence of tensor slices that, when used in sequence on a tensor with shape dims, yields
+    tensors that contain every leaf in the contiguous range [start, end]. Care is taken to yield a short sequence of
+    slices, and perhaps even the shortest possible (I'm pretty sure it's the latter).
+
+    end is INCLUSIVE.
+
+    Args:
+        start: Per-dimension index of the first element of the range.
+        end: Per-dimension index of the last element of the range (inclusive).
+        dims: Sizes of the dimensions being indexed.
+        start_edges: Optional precomputed flags; entry i is True when `start[j] == 0` for every j >= i. Computed
+            from `start` when omitted.
+        end_edges: Optional precomputed flags; entry i is True when `end[j] == dims[j] - 1` for every j >= i.
+            Computed from `end` and `dims` when omitted.
+
+    Returns:
+        A list of tuples of slices, each tuple indexing the leading dimensions of the tensor.
+    """
+
+    # start_edges and end_edges both indicate whether, starting from any given
+    # dimension, the start/end index is at the top/bottom edge of the
+    # corresponding tensor, modeled as a tree
+    def reduce_edge_list(l: list[bool]) -> None:
+        # In-place suffix AND: walking from the last entry to the first, each
+        # entry becomes True only if it and every later entry were True.
+        tally = True
+        for i in range(len(l)):
+            reversed_idx = -1 * (i + 1)
+            l[reversed_idx] &= tally
+            tally = l[reversed_idx]
+
+    if start_edges is None:
+        start_edges = [s == 0 for s in start]
+        reduce_edge_list(start_edges)
+    if end_edges is None:
+        end_edges = [e == (d - 1) for e, d in zip(end, dims)]
+        reduce_edge_list(end_edges)
+
+    # Base cases. Either start/end are empty and we're done, or the final,
+    # one-dimensional tensor can be simply sliced
+    if len(start) == 0:
+        return [()]
+    elif len(start) == 1:
+        return [(slice(start[0], end[0] + 1),)]
+
+    slices: list[tuple[slice, ...]] = []
+    path_list: list[slice] = []
+
+    # Dimensions common to start and end can be selected directly
+    for s, e in zip(start, end):
+        if s == e:
+            path_list.append(slice(s, s + 1))
+        else:
+            break
+
+    path: tuple[slice, ...] = tuple(path_list)
+    # Index of the first dimension at which start and end differ.
+    divergence_idx = len(path)
+
+    # start == end, and we're done
+    if divergence_idx == len(dims):
+        return [path]
+
+    def upper() -> tuple[tuple[slice, ...], ...]:
+        # Slices for the row selected by start[divergence_idx]: from start's
+        # position to the very end of that sub-tensor.
+        assert start_edges is not None
+        assert end_edges is not None
+
+        sdi = start[divergence_idx]
+        return tuple(
+            path + (slice(sdi, sdi + 1),) + s
+            for s in _get_minimal_slice_set(
+                start[divergence_idx + 1 :],
+                [d - 1 for d in dims[divergence_idx + 1 :]],
+                dims[divergence_idx + 1 :],
+                start_edges=start_edges[divergence_idx + 1 :],
+                end_edges=[True for _ in end_edges[divergence_idx + 1 :]],
+            )
+        )
+
+    def lower() -> tuple[tuple[slice, ...], ...]:
+        # Slices for the row selected by end[divergence_idx]: from the very
+        # beginning of that sub-tensor up to end's position.
+        assert start_edges is not None
+        assert end_edges is not None
+
+        edi = end[divergence_idx]
+        return tuple(
+            path + (slice(edi, edi + 1),) + s
+            for s in _get_minimal_slice_set(
+                [0 for _ in start[divergence_idx + 1 :]],
+                end[divergence_idx + 1 :],
+                dims[divergence_idx + 1 :],
+                start_edges=[True for _ in start_edges[divergence_idx + 1 :]],
+                end_edges=end_edges[divergence_idx + 1 :],
+            )
+        )
+
+    # If both start and end are at the edges of the subtree rooted at
+    # divergence_idx, we can just select the whole subtree at once
+    if start_edges[divergence_idx] and end_edges[divergence_idx]:
+        slices.append(path + (slice(start[divergence_idx], end[divergence_idx] + 1),))
+    # If just start is at the edge, we can grab almost all of the subtree,
+    # treating only the ragged bottom edge as an edge case
+    elif start_edges[divergence_idx]:
+        slices.append(path + (slice(start[divergence_idx], end[divergence_idx]),))
+        slices.extend(lower())
+    # Analogous to the previous case, but the top is ragged this time
+    elif end_edges[divergence_idx]:
+        slices.extend(upper())
+        slices.append(path + (slice(start[divergence_idx] + 1, end[divergence_idx] + 1),))
+    # If both sides of the range are ragged, we need to handle both sides
+    # separately. If there's contiguous meat in between them, we can index it
+    # in one big chunk
+    else:
+        slices.extend(upper())
+        middle_ground = end[divergence_idx] - start[divergence_idx]
+        # A difference of exactly 1 means the two ragged rows are adjacent,
+        # so there is nothing between them to add.
+        if middle_ground > 1:
+            slices.append(path + (slice(start[divergence_idx] + 1, end[divergence_idx]),))
+        slices.extend(lower())
+
+    return slices
+
+
+@torch.jit.ignore
+def _chunk_slice(t: torch.Tensor, flat_start: int, flat_end: int, no_batch_dims: int) -> torch.Tensor:
+    """
+    Equivalent to
+
+        t.reshape((-1,) + t.shape[no_batch_dims:])[flat_start:flat_end]
+
+    but the full tensor is never reshaped, which can be memory-intensive in certain situations. Only the sub-tensors
+    selected for the chunk (whose size scales with flat_end - flat_start) are reshaped and concatenated.
+    """
+
+    leading_shape = t.shape[:no_batch_dims]
+    first_idx = list(_flat_idx_to_idx(flat_start, leading_shape))
+    # _get_minimal_slice_set treats its end index as inclusive, hence the - 1
+    last_idx = list(_flat_idx_to_idx(flat_end - 1, leading_shape))
+
+    # Ordered slices that together cover the requested range
+    slice_tuples = _get_minimal_slice_set(first_idx, last_idx, leading_shape)
+
+    pieces = [t[slc] for slc in slice_tuples]
+    flattened = [piece.view((-1,) + t.shape[no_batch_dims:]) for piece in pieces]
+
+    return torch.cat(flattened)
+
+
+def chunk_layer(
+    layer: Callable,
+    inputs: dict[str, Any],
+    chunk_size: int,
+    no_batch_dims: int,
+    low_mem: bool = False,
+    _out: Any = None,
+    _add_into_out: bool = False,
+) -> Any:
+    """
+    Implements the "chunking" procedure described in section 1.11.8.
+
+    Layer outputs and inputs are assumed to be simple "pytrees," consisting only of (arbitrarily nested) lists, tuples,
+    and dicts with torch.Tensor leaves.
+
+    Args:
+        layer:
+            The layer to be applied chunk-wise
+        inputs:
+            A (non-nested) dictionary of keyworded inputs. All leaves must be tensors and must share the same batch
+            dimensions.
+        chunk_size:
+            The number of sub-batches per chunk. If multiple batch dimensions are specified, a "sub-batch" is defined
+            as a single indexing of all batch dimensions simultaneously (s.t. the number of sub-batches is the product
+            of the batch dimensions).
+        no_batch_dims:
+            How many of the initial dimensions of each input tensor can be considered batch dimensions.
+        low_mem:
+            Avoids flattening potentially large input tensors. Unnecessary in most cases, and is ever so slightly
+            slower than the default setting.
+        _out:
+            Optional existing output structure. When given, its tensors are viewed with the batch dimensions
+            flattened and chunk results are written into them instead of into newly allocated tensors.
+        _add_into_out:
+            If True, each chunk's output is added (`+=`) to the output storage rather than assigned to it.
+    Returns:
+        The reassembled output of the layer on the inputs.
+
+    Raises:
+        ValueError: if `inputs` is empty.
+        TypeError: if the layer's output is not a dict, tuple or torch.Tensor.
+    """
+    if not (len(inputs) > 0):
+        raise ValueError("Must provide at least one input")
+
+    # Per batch dimension, take the largest size found across all input leaves.
+    initial_dims = [shape[:no_batch_dims] for shape in _fetch_dims(inputs)]
+    orig_batch_dims = tuple(max(s) for s in zip(*initial_dims))
+
+    def _prep_inputs(t: torch.Tensor) -> torch.Tensor:
+        if not low_mem:
+            # The expand is skipped when the leading sizes sum to no_batch_dims
+            # (e.g. all of them are 1); such a tensor flattens to a leading size
+            # of 1 and is later passed through whole by _select_chunk.
+            if sum(t.shape[:no_batch_dims]) != no_batch_dims:
+                t = t.expand(orig_batch_dims + t.shape[no_batch_dims:])
+            t = t.reshape(-1, *t.shape[no_batch_dims:])
+        else:
+            # Batch dimensions are kept unflattened; _chunk_slice indexes them directly.
+            t = t.expand(orig_batch_dims + t.shape[no_batch_dims:])
+        return t
+
+    prepped_inputs: dict[str, Any] = tensor_tree_map(_prep_inputs, inputs)
+    prepped_outputs = None
+    if _out is not None:
+        prepped_outputs = tensor_tree_map(lambda t: t.view([-1] + list(t.shape[no_batch_dims:])), _out)
+
+    # Total number of sub-batches (product of the batch dimensions).
+    flat_batch_dim = 1
+    for d in orig_batch_dims:
+        flat_batch_dim *= d
+
+    # Ceiling division: a trailing partial chunk counts as one more chunk.
+    no_chunks = flat_batch_dim // chunk_size + (flat_batch_dim % chunk_size != 0)
+
+    def _select_chunk(t: torch.Tensor) -> torch.Tensor:
+        # Reads the current value of `i` from the enclosing scope at call time.
+        return t[i : i + chunk_size] if t.shape[0] != 1 else t
+
+    i = 0
+    out = prepped_outputs
+    for _ in range(no_chunks):
+        # Chunk the input
+        if not low_mem:
+            select_chunk = _select_chunk
+        else:
+            select_chunk = partial(
+                _chunk_slice,
+                flat_start=i,
+                flat_end=min(flat_batch_dim, i + chunk_size),
+                no_batch_dims=len(orig_batch_dims),
+            )
+
+        chunks: dict[str, Any] = tensor_tree_map(select_chunk, prepped_inputs)
+
+        # Run the layer on the chunk
+        output_chunk = layer(**chunks)
+
+        # Allocate space for the output
+        if out is None:
+            out = tensor_tree_map(lambda t: t.new_zeros((flat_batch_dim,) + t.shape[1:]), output_chunk)
+
+        # Put the chunk in its pre-allocated space
+        if isinstance(output_chunk, dict):
+
+            def assign(d1: dict, d2: dict) -> None:
+                # d1 is the output storage, d2 the chunk result; nested dicts are walked recursively.
+                for k, v in d1.items():
+                    if isinstance(v, dict):
+                        assign(v, d2[k])
+                    else:
+                        if _add_into_out:
+                            v[i : i + chunk_size] += d2[k]
+                        else:
+                            v[i : i + chunk_size] = d2[k]
+
+            assign(out, output_chunk)
+        elif isinstance(output_chunk, tuple):
+            for x1, x2 in zip(out, output_chunk):
+                if _add_into_out:
+                    x1[i : i + chunk_size] += x2
+                else:
+                    x1[i : i + chunk_size] = x2
+        elif isinstance(output_chunk, torch.Tensor):
+            if _add_into_out:
+                out[i : i + chunk_size] += output_chunk
+            else:
+                out[i : i + chunk_size] = output_chunk
+        else:
+            raise TypeError("Not supported")
+
+        i += chunk_size
+
+    # Restore the original (unflattened) batch dimensions on every output tensor.
+    out = tensor_tree_map(lambda t: t.view(orig_batch_dims + t.shape[1:]), out)
+
+    return out
+
+
+class ChunkSizeTuner:
+    """
+    Chooses a chunk size for a function by trial execution and caches the result.
+
+    The chosen size is stored together with a summary of the arguments it was tuned for (tensors are summarized by
+    their shape); tuning is repeated only when that summary changes.
+    """
+
+    def __init__(
+        self,
+        # Heuristically, runtimes for most of the modules in the network
+        # plateau earlier than this on all GPUs I've run the model on.
+        max_chunk_size: int = 512,
+    ):
+        self.max_chunk_size = max_chunk_size
+        # Result of the most recent tuning run; None until tuning has happened.
+        self.cached_chunk_size: Optional[int] = None
+        # Argument summary that `cached_chunk_size` was tuned for.
+        self.cached_arg_data: Optional[tuple] = None
+
+    def _determine_favorable_chunk_size(self, fn: Callable, args: tuple, min_chunk_size: int) -> int:
+        """
+        Bisect over candidate chunk sizes for one at which `fn(*args, chunk_size=...)` does not raise `RuntimeError`.
+
+        Candidates are `min_chunk_size` followed by the powers of two above it, up to `max_chunk_size`. The search
+        assumes that if a size fails, every larger size fails too. `min_chunk_size` itself is never executed; it is
+        returned when no larger candidate succeeds, or immediately when it is already >= `max_chunk_size`.
+        """
+        logging.info("Tuning chunk size...")
+
+        if min_chunk_size >= self.max_chunk_size:
+            return min_chunk_size
+
+        candidates: list[int] = [2**l for l in range(int(math.log(self.max_chunk_size, 2)) + 1)]
+        candidates = [c for c in candidates if c > min_chunk_size]
+        candidates = [min_chunk_size] + candidates
+        # NOTE(review): the largest candidate is bumped by 4; the reason is not evident from this file.
+        candidates[-1] += 4
+
+        def test_chunk_size(chunk_size: int) -> bool:
+            try:
+                with torch.no_grad():
+                    fn(*args, chunk_size=chunk_size)
+                return True
+            except RuntimeError:
+                return False
+
+        min_viable_chunk_size_index = 0
+        i = len(candidates) - 1
+        while i > min_viable_chunk_size_index:
+            viable = test_chunk_size(candidates[i])
+            if not viable:
+                # Move halfway back towards the largest size known to work.
+                i = (min_viable_chunk_size_index + i) // 2
+            else:
+                # Record the success and probe halfway towards the largest candidate.
+                min_viable_chunk_size_index = i
+                i = (i + len(candidates) - 1) // 2
+
+        return candidates[min_viable_chunk_size_index]
+
+    def _compare_arg_caches(self, ac1: Iterable, ac2: Iterable) -> bool:
+        """
+        Return True when two argument summaries match element by element.
+
+        Lists and tuples (including `torch.Size`) are compared recursively, dicts by key set and then by value,
+        and everything else with `==`. Differing lengths, element types or dict keys count as a mismatch.
+        """
+        items1, items2 = list(ac1), list(ac2)
+        # zip() alone would silently ignore the tail of the longer sequence.
+        if len(items1) != len(items2):
+            return False
+
+        consistent = True
+        for a1, a2 in zip(items1, items2):
+            # Dispatch on the elements, not on the enclosing containers: checking the containers sends
+            # every leaf (e.g. the ints inside a torch.Size) back into this method, where zip() fails.
+            if type(a1) is not type(a2):
+                return False
+            if isinstance(a1, (list, tuple)):
+                consistent &= self._compare_arg_caches(a1, a2)
+            elif isinstance(a1, dict):
+                if a1.keys() != a2.keys():
+                    return False
+                a1_items = [v for _, v in sorted(a1.items(), key=lambda x: x[0])]
+                a2_items = [v for _, v in sorted(a2.items(), key=lambda x: x[0])]
+                consistent &= self._compare_arg_caches(a1_items, a2_items)
+            else:
+                consistent &= a1 == a2
+
+        return consistent
+
+    def tune_chunk_size(
+        self,
+        representative_fn: Callable,
+        args: tuple,
+        min_chunk_size: int,
+    ) -> int:
+        """
+        Return a chunk size for `representative_fn`, re-tuning only when `args` differ from the cached summary.
+
+        Args:
+            representative_fn: Callable invoked as `representative_fn(*args, chunk_size=...)` during tuning.
+            args: Positional arguments for `representative_fn`; tensors are summarized by their shape.
+            min_chunk_size: Smallest chunk size to consider; used as the fallback result.
+        """
+        consistent = True
+        arg_data: tuple = tree_map(lambda a: a.shape if isinstance(a, torch.Tensor) else a, args, object)
+        if self.cached_arg_data is not None:
+            # A previous result exists: it can be reused only if the args have not changed shape/value
+            assert len(self.cached_arg_data) == len(arg_data)
+            consistent = self._compare_arg_caches(self.cached_arg_data, arg_data)
+        else:
+            # Nothing has been tuned yet, so tuning is required
+            consistent = False
+
+        if not consistent:
+            self.cached_chunk_size = self._determine_favorable_chunk_size(
+                representative_fn,
+                args,
+                min_chunk_size,
+            )
+            self.cached_arg_data = arg_data
+
+        assert self.cached_chunk_size is not None
+
+        return self.cached_chunk_size
diff --git a/phivenv/Lib/site-packages/transformers/models/esm/openfold_utils/data_transforms.py b/phivenv/Lib/site-packages/transformers/models/esm/openfold_utils/data_transforms.py
new file mode 100644
index 0000000000000000000000000000000000000000..bcd67aacab8e5ee6d141166d1ec7fa94cba84e6d
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/esm/openfold_utils/data_transforms.py
@@ -0,0 +1,90 @@
+# Copyright 2021 AlQuraishi Laboratory
+# Copyright 2021 DeepMind Technologies Limited
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import numpy as np
+import torch
+
+from . import residue_constants as rc
+from .tensor_utils import tensor_tree_map, tree_map
+
+
+def make_atom14_masks(protein: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]:
+    """Construct denser atom positions (14 dimensions instead of 37).
+
+    Reads `protein["aatype"]` and adds four entries to `protein` in place, each obtained by indexing a
+    per-residue-type lookup table with `aatype`:
+
+    - "atom14_atom_exists": float mask over the 14 atom slots
+    - "residx_atom14_to_atom37": long indices from the 14-slot layout into the 37-slot layout
+    - "residx_atom37_to_atom14": long indices from the 37-slot layout into the 14-slot layout
+    - "atom37_atom_exists": float mask over the 37 atom slots
+
+    Returns:
+        The same `protein` dict that was passed in.
+    """
+    restype_atom14_to_atom37_list = []
+    restype_atom37_to_atom14_list = []
+    restype_atom14_mask_list = []
+
+    # One row per residue type; empty atom names mark unused slots and map to index 0 with mask 0.
+    for rt in rc.restypes:
+        atom_names = rc.restype_name_to_atom14_names[rc.restype_1to3[rt]]
+        restype_atom14_to_atom37_list.append([(rc.atom_order[name] if name else 0) for name in atom_names])
+        atom_name_to_idx14 = {name: i for i, name in enumerate(atom_names)}
+        restype_atom37_to_atom14_list.append([(atom_name_to_idx14.get(name, 0)) for name in rc.atom_types])
+
+        restype_atom14_mask_list.append([(1.0 if name else 0.0) for name in atom_names])
+
+    # Add dummy mapping for restype 'UNK'
+    restype_atom14_to_atom37_list.append([0] * 14)
+    restype_atom37_to_atom14_list.append([0] * 37)
+    restype_atom14_mask_list.append([0.0] * 14)
+
+    # The lookup tables are created on the same device as `aatype`.
+    restype_atom14_to_atom37 = torch.tensor(
+        restype_atom14_to_atom37_list,
+        dtype=torch.int32,
+        device=protein["aatype"].device,
+    )
+    restype_atom37_to_atom14 = torch.tensor(
+        restype_atom37_to_atom14_list,
+        dtype=torch.int32,
+        device=protein["aatype"].device,
+    )
+    restype_atom14_mask = torch.tensor(
+        restype_atom14_mask_list,
+        dtype=torch.float32,
+        device=protein["aatype"].device,
+    )
+    # Cast to long so that `aatype` can be used as an index below.
+    protein_aatype = protein["aatype"].to(torch.long)
+
+    # create the mapping for (residx, atom14) --> atom37, i.e. an array
+    # with shape (num_res, 14) containing the atom37 indices for this protein
+    residx_atom14_to_atom37 = restype_atom14_to_atom37[protein_aatype]
+    residx_atom14_mask = restype_atom14_mask[protein_aatype]
+
+    protein["atom14_atom_exists"] = residx_atom14_mask
+    protein["residx_atom14_to_atom37"] = residx_atom14_to_atom37.long()
+
+    # create the gather indices for mapping back
+    residx_atom37_to_atom14 = restype_atom37_to_atom14[protein_aatype]
+    protein["residx_atom37_to_atom14"] = residx_atom37_to_atom14.long()
+
+    # create the corresponding mask
+    # NOTE(review): the 21 rows presumably correspond to len(rc.restypes) + 1 (the extra 'UNK' row) - confirm.
+    restype_atom37_mask = torch.zeros([21, 37], dtype=torch.float32, device=protein["aatype"].device)
+    for restype, restype_letter in enumerate(rc.restypes):
+        restype_name = rc.restype_1to3[restype_letter]
+        atom_names = rc.residue_atoms[restype_name]
+        for atom_name in atom_names:
+            atom_type = rc.atom_order[atom_name]
+            restype_atom37_mask[restype, atom_type] = 1
+
+    residx_atom37_mask = restype_atom37_mask[protein_aatype]
+    protein["atom37_atom_exists"] = residx_atom37_mask
+
+    return protein
+
+
+def make_atom14_masks_np(batch: dict[str, torch.Tensor]) -> dict[str, np.ndarray]:
+    """NumPy-facing wrapper around `make_atom14_masks`.
+
+    `np.ndarray` entries of `batch` are turned into tensors, `make_atom14_masks` is run on the result, and
+    `np.array` is applied to the tensors of its output.
+    """
+    # NOTE(review): `batch["aatype"].device` is evaluated inside the lambda on the incoming batch, where "aatype"
+    # may still be a NumPy array; `ndarray.device` is only available in newer NumPy releases - confirm the
+    # supported versions.
+    batch = tree_map(lambda n: torch.tensor(n, device=batch["aatype"].device), batch, np.ndarray)
+    out = tensor_tree_map(lambda t: np.array(t), make_atom14_masks(batch))
+    return out
diff --git a/phivenv/Lib/site-packages/transformers/models/esm/openfold_utils/feats.py b/phivenv/Lib/site-packages/transformers/models/esm/openfold_utils/feats.py
new file mode 100644
index 0000000000000000000000000000000000000000..366f3c47d9466d0c5556b6eb7c365393785a82aa
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/esm/openfold_utils/feats.py
@@ -0,0 +1,253 @@
+# Copyright 2021 AlQuraishi Laboratory
+# Copyright 2021 DeepMind Technologies Limited
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import overload
+
+import torch
+import torch.types
+from torch import nn
+
+from . import residue_constants as rc
+from .rigid_utils import Rigid, Rotation
+from .tensor_utils import batched_gather
+
+
+@overload
+def pseudo_beta_fn(aatype: torch.Tensor, all_atom_positions: torch.Tensor, all_atom_masks: None) -> torch.Tensor: ...
+
+
+@overload
+def pseudo_beta_fn(
+    aatype: torch.Tensor, all_atom_positions: torch.Tensor, all_atom_masks: torch.Tensor
+) -> tuple[torch.Tensor, torch.Tensor]: ...
+
+
+def pseudo_beta_fn(aatype, all_atom_positions, all_atom_masks):
+    """
+    Select, per residue, the CA position where `aatype` equals the "G" residue index and the CB position elsewhere.
+
+    Returns the selected positions alone when `all_atom_masks` is None; otherwise returns a tuple of the positions
+    and the mask values selected the same way from `all_atom_masks`.
+    """
+    glycine = aatype == rc.restype_order["G"]
+    ca_idx, cb_idx = rc.atom_order["CA"], rc.atom_order["CB"]
+
+    # Repeat the per-residue flag over a trailing axis of size 3 to match the coordinates.
+    glycine_xyz = glycine[..., None].expand(*((-1,) * len(glycine.shape)), 3)
+    positions = torch.where(
+        glycine_xyz,
+        all_atom_positions[..., ca_idx, :],
+        all_atom_positions[..., cb_idx, :],
+    )
+
+    if all_atom_masks is None:
+        return positions
+
+    position_mask = torch.where(
+        glycine,
+        all_atom_masks[..., ca_idx],
+        all_atom_masks[..., cb_idx],
+    )
+    return positions, position_mask
+
+
+def atom14_to_atom37(atom14: torch.Tensor, batch: dict[str, torch.Tensor]) -> torch.Tensor:
+    """
+    Gather `atom14` along its second-to-last axis with `batch["residx_atom37_to_atom14"]`, then multiply the result
+    by `batch["atom37_atom_exists"]` (broadcast over the last axis).
+    """
+    gathered = batched_gather(
+        atom14,
+        batch["residx_atom37_to_atom14"],
+        dim=-2,
+        no_batch_dims=len(atom14.shape[:-2]),
+    )
+    exists = batch["atom37_atom_exists"][..., None]
+    return gathered * exists
+
+
+def build_template_angle_feat(template_feats: dict[str, torch.Tensor]) -> torch.Tensor:
+    """
+    Concatenate, along the last axis: a 22-class one-hot of "template_aatype", the two sin/cos torsion tensors with
+    their last two axes reshaped to a single axis of size 14, and "template_torsion_angles_mask".
+    """
+    aatype = template_feats["template_aatype"]
+    sin_cos = template_feats["template_torsion_angles_sin_cos"]
+    alt_sin_cos = template_feats["template_alt_torsion_angles_sin_cos"]
+    angle_mask = template_feats["template_torsion_angles_mask"]
+
+    def _flatten_angles(angles: torch.Tensor) -> torch.Tensor:
+        # Merge the last two axes into one axis of size 14.
+        return angles.reshape(*angles.shape[:-2], 14)
+
+    pieces = [
+        nn.functional.one_hot(aatype, 22),
+        _flatten_angles(sin_cos),
+        _flatten_angles(alt_sin_cos),
+        angle_mask,
+    ]
+    return torch.cat(pieces, dim=-1)
+
+
+def build_template_pair_feat(
+    batch: dict[str, torch.Tensor],
+    min_bin: torch.types.Number,
+    max_bin: torch.types.Number,
+    no_bins: int,
+    use_unit_vector: bool = False,
+    eps: float = 1e-20,
+    inf: float = 1e8,
+) -> torch.Tensor:
+    """
+    Build pairwise template features by concatenating, along the last axis: a binned squared-distance one-hot of
+    the pseudo-beta positions, a pairwise pseudo-beta mask, the residue-type one-hot expanded along each of the two
+    pair axes, the three components of a per-pair vector expressed in each residue's backbone frame, and a pairwise
+    backbone mask. The result is multiplied by the pairwise backbone mask.
+
+    Args:
+        batch: Must contain "template_pseudo_beta_mask", "template_pseudo_beta", "template_aatype",
+            "template_all_atom_positions" and "template_all_atom_mask".
+        min_bin: First value passed to `torch.linspace` for the bin edges (squared afterwards).
+        max_bin: Last value passed to `torch.linspace` for the bin edges (squared afterwards).
+        no_bins: Number of bins.
+        use_unit_vector: When False, the vector components are multiplied by 0.0.
+        eps: Small constant passed to the frame construction and added before `rsqrt`.
+        inf: Upper edge used for the last bin.
+    """
+    template_mask = batch["template_pseudo_beta_mask"]
+    template_mask_2d = template_mask[..., None] * template_mask[..., None, :]
+
+    # Compute distogram (this seems to differ slightly from Alg. 5)
+    tpb = batch["template_pseudo_beta"]
+    dgram = torch.sum((tpb[..., None, :] - tpb[..., None, :, :]) ** 2, dim=-1, keepdim=True)
+    # Bin edges are squared because dgram holds squared distances.
+    lower = torch.linspace(min_bin, max_bin, no_bins, device=tpb.device) ** 2
+    upper = torch.cat([lower[1:], lower.new_tensor([inf])], dim=-1)
+    dgram = ((dgram > lower) * (dgram < upper)).type(dgram.dtype)
+
+    # At this point template_mask_2d is still the pseudo-beta pair mask.
+    to_concat = [dgram, template_mask_2d[..., None]]
+
+    aatype_one_hot: torch.LongTensor = nn.functional.one_hot(
+        batch["template_aatype"],
+        rc.restype_num + 2,
+    )
+
+    n_res = batch["template_aatype"].shape[-1]
+    to_concat.append(aatype_one_hot[..., None, :, :].expand(*aatype_one_hot.shape[:-2], n_res, -1, -1))
+    to_concat.append(aatype_one_hot[..., None, :].expand(*aatype_one_hot.shape[:-2], -1, n_res, -1))
+
+    n, ca, c = [rc.atom_order[a] for a in ["N", "CA", "C"]]
+    rigids = Rigid.make_transform_from_reference(
+        n_xyz=batch["template_all_atom_positions"][..., n, :],
+        ca_xyz=batch["template_all_atom_positions"][..., ca, :],
+        c_xyz=batch["template_all_atom_positions"][..., c, :],
+        eps=eps,
+    )
+    points = rigids.get_trans()[..., None, :, :]
+    rigid_vec = rigids[..., None].invert_apply(points)
+
+    inv_distance_scalar = torch.rsqrt(eps + torch.sum(rigid_vec**2, dim=-1))
+
+    # template_mask and template_mask_2d are rebound here: from now on they describe
+    # residues for which the N, CA and C mask entries are all non-zero.
+    t_aa_masks = batch["template_all_atom_mask"]
+    template_mask = t_aa_masks[..., n] * t_aa_masks[..., ca] * t_aa_masks[..., c]
+    template_mask_2d = template_mask[..., None] * template_mask[..., None, :]
+
+    inv_distance_scalar = inv_distance_scalar * template_mask_2d
+    unit_vector = rigid_vec * inv_distance_scalar[..., None]
+
+    if not use_unit_vector:
+        unit_vector = unit_vector * 0.0
+
+    # Append each of the vector components as its own trailing-size-1 feature.
+    to_concat.extend(torch.unbind(unit_vector[..., None, :], dim=-1))
+    to_concat.append(template_mask_2d[..., None])
+
+    act = torch.cat(to_concat, dim=-1)
+    act = act * template_mask_2d[..., None]
+
+    return act
+
+
+def build_extra_msa_feat(batch: dict[str, torch.Tensor]) -> torch.Tensor:
+    """
+    Concatenate, along the last axis, a 23-class one-hot of "extra_msa" with "extra_has_deletion" and
+    "extra_deletion_value", each of the latter given a trailing axis of size 1.
+    """
+    one_hot_msa: torch.LongTensor = nn.functional.one_hot(batch["extra_msa"], 23)
+    has_deletion = batch["extra_has_deletion"].unsqueeze(-1)
+    deletion_value = batch["extra_deletion_value"].unsqueeze(-1)
+    return torch.cat([one_hot_msa, has_deletion, deletion_value], dim=-1)
+
+
+def torsion_angles_to_frames(
+    r: Rigid,
+    alpha: torch.Tensor,
+    aatype: torch.Tensor,
+    rrgdf: torch.Tensor,
+) -> Rigid:
+    """
+    Compose per-residue default frames with rotations built from `alpha`, chain the chi frames together, and
+    express all eight frames through the frame `r`.
+
+    Args:
+        r: Frames that the per-residue frames are finally composed with.
+        alpha: Rotation parameters; a fixed (0, 1) pair is prepended along the second-to-last axis.
+        aatype: Residue-type indices used to index `rrgdf`.
+        rrgdf: Table of default 4x4 transforms, indexed by residue type.
+
+    Returns:
+        The composed frames as a `Rigid`.
+    """
+    # [*, N, 8, 4, 4]
+    default_4x4 = rrgdf[aatype, ...]
+
+    # [*, N, 8] transformations, i.e.
+    #   One [*, N, 8, 3, 3] rotation matrix and
+    #   One [*, N, 8, 3]    translation matrix
+    default_r = r.from_tensor_4x4(default_4x4)
+
+    # Fixed (0, 1) entry prepended to alpha; with the matrix layout below it yields the identity rotation.
+    bb_rot = alpha.new_zeros((*((1,) * len(alpha.shape[:-1])), 2))
+    bb_rot[..., 1] = 1
+
+    # [*, N, 8, 2]
+    alpha = torch.cat([bb_rot.expand(*alpha.shape[:-2], -1, -1), alpha], dim=-2)
+
+    # [*, N, 8, 3, 3]
+    # Produces rotation matrices of the form:
+    # [
+    #   [1, 0  , 0  ],
+    #   [0, a_2,-a_1],
+    #   [0, a_1, a_2]
+    # ]
+    # This follows the original code rather than the supplement, which uses
+    # different indices.
+
+    all_rots = alpha.new_zeros(default_r.get_rots().get_rot_mats().shape)
+    all_rots[..., 0, 0] = 1
+    all_rots[..., 1, 1] = alpha[..., 1]
+    all_rots[..., 1, 2] = -alpha[..., 0]
+    all_rots[..., 2, 1:] = alpha
+
+    all_frames = default_r.compose(Rigid(Rotation(rot_mats=all_rots), None))
+
+    # Frames 5-7 are relative to the preceding chi frame and are chained onto frame 4 below.
+    chi2_frame_to_frame = all_frames[..., 5]
+    chi3_frame_to_frame = all_frames[..., 6]
+    chi4_frame_to_frame = all_frames[..., 7]
+
+    chi1_frame_to_bb = all_frames[..., 4]
+    chi2_frame_to_bb = chi1_frame_to_bb.compose(chi2_frame_to_frame)
+    chi3_frame_to_bb = chi2_frame_to_bb.compose(chi3_frame_to_frame)
+    chi4_frame_to_bb = chi3_frame_to_bb.compose(chi4_frame_to_frame)
+
+    # Frames 0-4 are kept as they are; frames 5-7 are replaced by their chained versions.
+    all_frames_to_bb = Rigid.cat(
+        [
+            all_frames[..., :5],
+            chi2_frame_to_bb.unsqueeze(-1),
+            chi3_frame_to_bb.unsqueeze(-1),
+            chi4_frame_to_bb.unsqueeze(-1),
+        ],
+        dim=-1,
+    )
+
+    all_frames_to_global = r[..., None].compose(all_frames_to_bb)
+
+    return all_frames_to_global
+
+
+def frames_and_literature_positions_to_atom14_pos(
+    r: Rigid,
+    aatype: torch.Tensor,
+    default_frames: torch.Tensor,
+    group_idx: torch.Tensor,
+    atom_mask: torch.Tensor,
+    lit_positions: torch.Tensor,
+) -> torch.Tensor:
+    """
+    Apply to each atom's reference position the frame from `r` that `group_idx` assigns to it, then multiply by the
+    per-residue-type atom mask.
+
+    `group_idx`, `atom_mask` and `lit_positions` are lookup tables indexed by `aatype`; `default_frames` is only
+    used for the number of frames (its third-to-last dimension).
+    """
+    # [*, N, 14]
+    frame_of_atom = group_idx[aatype, ...]
+
+    # [*, N, 14, 8]
+    frame_selector: torch.LongTensor = nn.functional.one_hot(
+        frame_of_atom,
+        num_classes=default_frames.shape[-3],
+    )
+
+    # [*, N, 14, 8] -> [*, N, 14]: the one-hot zeroes every frame except the selected one, the sum picks it out
+    per_atom_frames = (r[..., None, :] * frame_selector).map_tensor_fn(lambda x: torch.sum(x, dim=-1))
+
+    # [*, N, 14, 1]
+    residue_atom_mask = atom_mask[aatype, ...].unsqueeze(-1)
+
+    # [*, N, 14, 3]
+    reference_positions = lit_positions[aatype, ...]
+    return per_atom_frames.apply(reference_positions) * residue_atom_mask
diff --git a/phivenv/Lib/site-packages/transformers/models/esm/openfold_utils/loss.py b/phivenv/Lib/site-packages/transformers/models/esm/openfold_utils/loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..3cd9f8c6b42f3091b05bcb2fb6550c36b40e9943
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/esm/openfold_utils/loss.py
@@ -0,0 +1,105 @@
+# Copyright 2021 AlQuraishi Laboratory
+# Copyright 2021 DeepMind Technologies Limited
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Optional
+
+import torch
+
+
+def _calculate_bin_centers(boundaries: torch.Tensor) -> torch.Tensor:
+    """Returns the centers of the bins delimited by evenly spaced `boundaries`.
+
+    The bin width is taken from the first two boundaries. Every boundary is shifted up by half a width, and one
+    more center is appended a full width past the last one, so the output has one element more than the input.
+    """
+    bin_width = boundaries[1] - boundaries[0]
+    centers = boundaries + bin_width / 2
+    # Center of the extra bin that lies beyond the final boundary.
+    overflow_center = (centers[-1] + bin_width).unsqueeze(-1)
+    return torch.cat([centers, overflow_center], dim=0)
+
+
+def _calculate_expected_aligned_error(
+    alignment_confidence_breaks: torch.Tensor,
+    aligned_distance_error_probs: torch.Tensor,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    """Returns the probability-weighted error and the largest representable error.
+
+    Args:
+      alignment_confidence_breaks: Bin boundaries, converted to bin centers with `_calculate_bin_centers`.
+      aligned_distance_error_probs: Probabilities over bins on the last axis.
+
+    Returns:
+      A pair: the sum over the last axis of `probs * bin_centers`, and the last bin center.
+    """
+    centers = _calculate_bin_centers(alignment_confidence_breaks)
+    expected_error = torch.sum(aligned_distance_error_probs * centers, dim=-1)
+    largest_error = centers[-1]
+    return expected_error, largest_error
+
+
+def compute_predicted_aligned_error(
+    logits: torch.Tensor,
+    max_bin: int = 31,
+    no_bins: int = 64,
+    **kwargs,
+) -> dict[str, torch.Tensor]:
+    """Computes aligned confidence metrics from logits.
+
+    Bin boundaries are `no_bins - 1` evenly spaced values from 0 to `max_bin`, created on the device of `logits`.
+
+    Args:
+      logits: [*, num_res, num_res, num_bins] the logits output from
+        PredictedAlignedErrorHead.
+      max_bin: Maximum bin value
+      no_bins: Number of bins
+      **kwargs: Accepted and ignored.
+    Returns:
+      A dict with the keys:
+      aligned_confidence_probs: [*, num_res, num_res, num_bins] softmax of `logits` over the bin axis.
+      predicted_aligned_error: [*, num_res, num_res] the expected aligned distance
+        error for each pair of residues.
+      max_predicted_aligned_error: scalar tensor holding the center of the last bin, i.e. the maximum
+        predicted error possible.
+    """
+    breaks = torch.linspace(0, max_bin, steps=(no_bins - 1), device=logits.device)
+    probs = torch.nn.functional.softmax(logits, dim=-1)
+
+    expected_error, largest_error = _calculate_expected_aligned_error(
+        alignment_confidence_breaks=breaks,
+        aligned_distance_error_probs=probs,
+    )
+
+    outputs: dict[str, torch.Tensor] = {}
+    outputs["aligned_confidence_probs"] = probs
+    outputs["predicted_aligned_error"] = expected_error
+    outputs["max_predicted_aligned_error"] = largest_error
+    return outputs
+
+
+def compute_tm(
+    logits: torch.Tensor,
+    residue_weights: Optional[torch.Tensor] = None,
+    max_bin: int = 31,
+    no_bins: int = 64,
+    eps: float = 1e-8,
+    **kwargs,
+) -> torch.Tensor:
+    """Computes a predicted TM-style score from aligned-error logits.
+
+    Each error bin is scored as `1 / (1 + (center / d0) ** 2)`, averaged under the softmax of `logits`, and then
+    averaged over the last residue axis using the normalized `residue_weights`. The value returned is the
+    per-alignment score at the first position where `per_alignment * residue_weights` is maximal.
+
+    Args:
+      logits: Logits over error bins on the last axis; `logits.shape[-2]` is used as the residue count.
+      residue_weights: Optional per-residue weights. Defaults to ones of length `logits.shape[-2]`.
+      max_bin: Upper end of the bin boundaries.
+      no_bins: Number of bins; `no_bins - 1` boundaries are generated.
+      eps: Added to the weight sum to avoid division by zero.
+      **kwargs: Accepted and ignored.
+
+    Returns:
+      A scalar tensor with the selected per-alignment score.
+    """
+    if residue_weights is None:
+        residue_weights = logits.new_ones(logits.shape[-2])
+
+    boundaries = torch.linspace(0, max_bin, steps=(no_bins - 1), device=logits.device)
+
+    bin_centers = _calculate_bin_centers(boundaries)
+
+    # NOTE(review): a discarded `torch.sum(residue_weights)` used to sit here, which suggests `n` may once have
+    # been meant to be the weighted residue count. The sequence length is what was actually used, so it is kept.
+    n = logits.shape[-2]
+    # Clipping keeps `clipped_n - 15` positive so the cube root below is well defined.
+    clipped_n = max(n, 19)
+
+    d0 = 1.24 * (clipped_n - 15) ** (1.0 / 3) - 1.8
+
+    probs = torch.nn.functional.softmax(logits, dim=-1)
+
+    tm_per_bin = 1.0 / (1 + (bin_centers**2) / (d0**2))
+    predicted_tm_term = torch.sum(probs * tm_per_bin, dim=-1)
+
+    normed_residue_mask = residue_weights / (eps + residue_weights.sum())
+    per_alignment = torch.sum(predicted_tm_term * normed_residue_mask, dim=-1)
+
+    weighted = per_alignment * residue_weights
+
+    # First index (over all leading dimensions) at which the weighted score attains its maximum.
+    argmax = (weighted == torch.max(weighted)).nonzero()[0]
+    return per_alignment[tuple(argmax)]
diff --git a/phivenv/Lib/site-packages/transformers/models/esm/openfold_utils/protein.py b/phivenv/Lib/site-packages/transformers/models/esm/openfold_utils/protein.py
new file mode 100644
index 0000000000000000000000000000000000000000..a943eb7acf72edf09974c250f21d74361b26484d
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/esm/openfold_utils/protein.py
@@ -0,0 +1,331 @@
+# Copyright 2021 AlQuraishi Laboratory
+# Copyright 2021 DeepMind Technologies Limited
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Protein data type."""
+
+import dataclasses
+import re
+import string
+from collections.abc import Iterator, Mapping, Sequence
+from typing import Any, Optional
+
+import numpy as np
+
+from . import residue_constants
+
+
+# Alias for model input features: a mapping from feature name to array (see `from_prediction`).
+FeatureDict = Mapping[str, np.ndarray]
+# Alias for model outputs, keyed by output name (see `from_prediction`).
+ModelOutput = Mapping[str, Any]  # Is a nested dict.
+# Multiplier applied to the [TERTIARY] coordinates in `from_proteinnet_string` to express them in angstroms
+# (1 picometer = 0.01 angstrom).
+PICO_TO_ANGSTROM = 0.01
+
+
+@dataclasses.dataclass(frozen=True)
+class Protein:
+    """Protein structure representation.
+
+    Frozen dataclass: fields cannot be reassigned after construction. The first five fields are required; the
+    remaining ones default to None.
+    """
+
+    # Cartesian coordinates of atoms in angstroms. The atom types correspond to
+    # residue_constants.atom_types, i.e. the first three are N, CA, CB.
+    # NOTE(review): `from_proteinnet_string` fills the N, CA and C slots; verify the ordering claim above against
+    # residue_constants.atom_types.
+    atom_positions: np.ndarray  # [num_res, num_atom_type, 3]
+
+    # Amino-acid type for each residue represented as an integer between 0 and
+    # 20, where 20 is 'X'.
+    aatype: np.ndarray  # [num_res]
+
+    # Binary float mask to indicate presence of a particular atom. 1.0 if an atom
+    # is present and 0.0 if not. This should be used for loss masking.
+    atom_mask: np.ndarray  # [num_res, num_atom_type]
+
+    # Residue index as used in PDB. It is not necessarily continuous or 0-indexed.
+    residue_index: np.ndarray  # [num_res]
+
+    # B-factors, or temperature factors, of each residue (in sq. angstroms units),
+    # representing the displacement of the residue from its ground truth mean
+    # value.
+    # NOTE: `from_proteinnet_string` passes None here despite the annotation.
+    b_factors: np.ndarray  # [num_res, num_atom_type]
+
+    # Chain indices for multi-chain predictions. `to_pdb` indexes it per residue and uses each value as an
+    # index into the uppercase ASCII letters.
+    chain_index: Optional[np.ndarray] = None
+
+    # Optional remark about the protein. Included as a comment in output PDB
+    # files
+    remark: Optional[str] = None
+
+    # Templates used to generate this protein (prediction-only)
+    parents: Optional[Sequence[str]] = None
+
+    # Chain corresponding to each parent (paired element-wise with `parents`)
+    parents_chain_index: Optional[Sequence[int]] = None
+
+
+def from_proteinnet_string(proteinnet_str: str) -> Protein:
+    """Parses a ProteinNet-style text record into a `Protein`.
+
+    The record is split on `[TAG]` header lines. Three sections are interpreted; any other section is ignored:
+
+    * `[PRIMARY]`: one-letter residue string. Letters that are not in `residue_constants.restypes` are replaced
+      by "X" before being mapped to residue type indices.
+    * `[TERTIARY]`: three lines (one per coordinate axis) of whitespace-separated floats, interleaved as N, CA, C
+      for each residue. Values are multiplied by `PICO_TO_ANGSTROM`.
+    * `[MASK]`: a string of "+" / "-" flags, one per residue, applied to the N, CA and C atom slots.
+
+    Args:
+      proteinnet_str: The record text.
+
+    Returns:
+      A `Protein` whose `residue_index` is `0 .. num_res - 1` and whose `b_factors` is None. `atom_positions` and
+      `atom_mask` are None when the corresponding section is missing.
+
+    Raises:
+      AssertionError: If the record contains no `[PRIMARY]` section.
+    """
+    tag_re = r"(\[[A-Z]+\]\n)"
+    tags: list[str] = [tag.strip() for tag in re.split(tag_re, proteinnet_str) if len(tag) > 0]
+    groups: Iterator[tuple[str, list[str]]] = zip(tags[0::2], [l.split("\n") for l in tags[1::2]])
+
+    atoms: list[str] = ["N", "CA", "C"]
+    aatype = None
+    atom_positions = None
+    atom_mask = None
+    for g in groups:
+        if "[PRIMARY]" == g[0]:
+            raw_seq = g[1][0].strip()
+            # `str` is immutable, so unknown residue letters are replaced while building a new string
+            # (item assignment on the string used to raise TypeError here).
+            seq = "".join(res if res in residue_constants.restypes else "X" for res in raw_seq)
+            aatype = np.array(
+                [residue_constants.restype_order.get(res_symbol, residue_constants.restype_num) for res_symbol in seq]
+            )
+        elif "[TERTIARY]" == g[0]:
+            tertiary: list[list[float]] = []
+            for axis in range(3):
+                tertiary.append(list(map(float, g[1][axis].split())))
+            tertiary_np = np.array(tertiary)
+            atom_positions = np.zeros((len(tertiary[0]) // 3, residue_constants.atom_type_num, 3)).astype(np.float32)
+            # Every third column belongs to the same backbone atom; transpose (3, num_res) -> (num_res, 3).
+            for i, atom in enumerate(atoms):
+                atom_positions[:, residue_constants.atom_order[atom], :] = np.transpose(tertiary_np[:, i::3])
+            atom_positions *= PICO_TO_ANGSTROM
+        elif "[MASK]" == g[0]:
+            mask = np.array(list(map({"-": 0, "+": 1}.get, g[1][0].strip())))
+            atom_mask = np.zeros(
+                (
+                    len(mask),
+                    residue_constants.atom_type_num,
+                )
+            ).astype(np.float32)
+            for atom in atoms:
+                atom_mask[:, residue_constants.atom_order[atom]] = 1
+            # Zero the backbone slots of residues flagged "-".
+            atom_mask *= mask[..., None]
+
+    assert aatype is not None
+
+    return Protein(
+        atom_positions=atom_positions,
+        atom_mask=atom_mask,
+        aatype=aatype,
+        residue_index=np.arange(len(aatype)),
+        b_factors=None,
+    )
+
+
+def get_pdb_headers(prot: Protein, chain_id: int = 0) -> list[str]:
+    """Builds the REMARK / PARENT header lines for one chain of `prot`.
+
+    Args:
+      prot: Source of `remark`, `parents` and `parents_chain_index`.
+      chain_id: When both `parents` and `parents_chain_index` are set, only parents whose chain index equals
+        this value are listed.
+
+    Returns:
+      An optional "REMARK ..." line followed by one "PARENT ..." line; the parent list is "N/A" when empty.
+    """
+    headers: list[str] = []
+
+    if prot.remark is not None:
+        headers.append(f"REMARK {prot.remark}")
+
+    chain_parents = prot.parents
+    chain_of_parent = prot.parents_chain_index
+    if chain_parents is not None and chain_of_parent is not None:
+        chain_parents = [parent for idx, parent in zip(chain_of_parent, chain_parents) if idx == chain_id]
+
+    has_parents = chain_parents is not None and len(chain_parents) > 0
+    parent_names = chain_parents if has_parents else ["N/A"]
+
+    headers.append(f"PARENT {' '.join(parent_names)}")
+    return headers
+
+
+def add_pdb_headers(prot: Protein, pdb_str: str) -> str:
+    """Add pdb headers to an existing PDB string. Useful during multi-chain
+    recycling
+
+    Existing lines containing "PARENT" or "REMARK" are dropped. A fresh REMARK line (if `prot.remark` is set) and
+    the PARENT line of the first chain are placed at the top, and a PARENT line for the next chain is inserted
+    after every line containing "TER" that is followed by a line not containing "END".
+
+    Args:
+      prot: Source of `remark`, `parents` and `parents_chain_index`.
+      pdb_str: The PDB text to re-header.
+
+    Returns:
+      The PDB text with regenerated headers. Chains without known parents get "PARENT N/A".
+    """
+    out_pdb_lines: list[str] = []
+    lines = pdb_str.split("\n")
+
+    remark = prot.remark
+    if remark is not None:
+        out_pdb_lines.append(f"REMARK {remark}")
+
+    parents_per_chain: list[list[str]]
+    if prot.parents is not None and len(prot.parents) > 0:
+        if prot.parents_chain_index is not None:
+            # Group parents by chain index; string keys normalize whatever integer type the indices have.
+            parent_dict: dict[str, list[str]] = {}
+            for p, i in zip(prot.parents, prot.parents_chain_index):
+                parent_dict.setdefault(str(i), []).append(p)
+
+            max_idx = max(int(chain_idx) for chain_idx in parent_dict)
+            parents_per_chain = [parent_dict.get(str(i), ["N/A"]) for i in range(max_idx + 1)]
+        else:
+            parents_per_chain = [list(prot.parents)]
+    else:
+        parents_per_chain = [["N/A"]]
+
+    def make_parent_line(p: Sequence[str]) -> str:
+        return f"PARENT {' '.join(p)}"
+
+    out_pdb_lines.append(make_parent_line(parents_per_chain[0]))
+
+    chain_counter = 0
+    last_idx = len(lines) - 1
+    for i, line in enumerate(lines):
+        if "PARENT" not in line and "REMARK" not in line:
+            out_pdb_lines.append(line)
+        # A TER on the very last line has no following chain, so it needs no PARENT line (and looking one line
+        # ahead there would run off the end of the list).
+        if "TER" in line and i < last_idx and "END" not in lines[i + 1]:
+            chain_counter += 1
+            if chain_counter < len(parents_per_chain):
+                chain_parents = parents_per_chain[chain_counter]
+            else:
+                chain_parents = ["N/A"]
+
+            out_pdb_lines.append(make_parent_line(chain_parents))
+
+    return "\n".join(out_pdb_lines)
+
+
+def to_pdb(prot: Protein) -> str:
+    """Converts a `Protein` instance to a PDB string.
+
+    Header lines from `get_pdb_headers` come first, then one ATOM record per atom whose mask is at least 0.5, a
+    TER record at the end of every chain (followed by the next chain's headers), and a final END record.
+
+    Args:
+      prot: The protein to convert to PDB. If `prot.b_factors` is None, zeros are written in the B-factor
+        column. If `prot.chain_index` is None, every residue is written to chain "A".
+
+    Returns:
+      PDB string.
+
+    Raises:
+      ValueError: If any entry of `prot.aatype` exceeds `residue_constants.restype_num`.
+    """
+    restypes = residue_constants.restypes + ["X"]
+
+    def res_1to3(r: int) -> str:
+        return residue_constants.restype_1to3.get(restypes[r], "UNK")
+
+    atom_types = residue_constants.atom_types
+
+    pdb_lines: list[str] = []
+
+    atom_mask = prot.atom_mask
+    aatype = prot.aatype
+    atom_positions = prot.atom_positions
+    residue_index = prot.residue_index.astype(np.int32)
+    b_factors = prot.b_factors
+    chain_index = prot.chain_index
+
+    if np.any(aatype > residue_constants.restype_num):
+        raise ValueError("Invalid aatypes.")
+
+    if b_factors is None:
+        # Proteins built without B-factors (e.g. by `from_proteinnet_string`) get zeros, matching the default
+        # used by `from_prediction`.
+        b_factors = np.zeros_like(atom_mask)
+
+    pdb_lines.extend(get_pdb_headers(prot))
+
+    n = aatype.shape[0]
+    atom_index = 1
+    # NOTE(review): starting at 0 assumes the first chain has index 0; with a non-zero first chain index a TER
+    # record is emitted right after the first residue.
+    prev_chain_index = 0
+    chain_tags = string.ascii_uppercase
+    # Add all atom sites.
+    for i in range(n):
+        res_name_3 = res_1to3(aatype[i])
+        # Resolved once per residue (not per written atom) so the TER record below carries the right chain
+        # letter even when every atom of this residue is masked out.
+        chain_tag = "A" if chain_index is None else chain_tags[chain_index[i]]
+
+        for atom_name, pos, mask, b_factor in zip(atom_types, atom_positions[i], atom_mask[i], b_factors[i]):
+            if mask < 0.5:
+                continue
+
+            record_type = "ATOM"
+            name = atom_name if len(atom_name) == 4 else f" {atom_name}"
+            alt_loc = ""
+            insertion_code = ""
+            occupancy = 1.00
+            element = atom_name[0]  # Protein supports only C, N, O, S, this works.
+            charge = ""
+
+            # PDB is a columnar format, every space matters here!
+            atom_line = (
+                f"{record_type:<6}{atom_index:>5} {name:<4}{alt_loc:>1}"
+                f"{res_name_3:>3} {chain_tag:>1}"
+                f"{residue_index[i]:>4}{insertion_code:>1}   "
+                f"{pos[0]:>8.3f}{pos[1]:>8.3f}{pos[2]:>8.3f}"
+                f"{occupancy:>6.2f}{b_factor:>6.2f}          "
+                f"{element:>2}{charge:>2}"
+            )
+            pdb_lines.append(atom_line)
+            atom_index += 1
+
+        should_terminate = i == n - 1
+        if chain_index is not None:
+            if i != n - 1 and chain_index[i + 1] != prev_chain_index:
+                should_terminate = True
+                prev_chain_index = chain_index[i + 1]
+
+        if should_terminate:
+            # Close the chain.
+            chain_end = "TER"
+            chain_termination_line = (
+                f"{chain_end:<6}{atom_index:>5}      {res_name_3:>3} {chain_tag:>1}{residue_index[i]:>4}"
+            )
+            pdb_lines.append(chain_termination_line)
+            atom_index += 1
+
+            if i != n - 1:
+                # "prev" is a misnomer here. This happens at the beginning of
+                # each new chain.
+                pdb_lines.extend(get_pdb_headers(prot, prev_chain_index))
+
+    pdb_lines.append("END")
+    pdb_lines.append("")
+    return "\n".join(pdb_lines)
+
+
+def ideal_atom_mask(prot: Protein) -> np.ndarray:
+    """Looks up the standard atom mask for every residue of `prot`.
+
+    Unlike `Protein.atom_mask`, which typically reflects the atoms actually reported in the PDB, the result
+    depends only on the amino-acid sequence: it marks the heavy atoms each residue type should have.
+
+    Args:
+      prot: `Protein` whose fields are `numpy.ndarray` objects.
+
+    Returns:
+      `residue_constants.STANDARD_ATOM_MASK` indexed by `prot.aatype`.
+    """
+    standard_masks = residue_constants.STANDARD_ATOM_MASK
+    residue_types = prot.aatype
+    return standard_masks[residue_types]
+
+
+def from_prediction(
+    features: FeatureDict,
+    result: ModelOutput,
+    b_factors: Optional[np.ndarray] = None,
+    chain_index: Optional[np.ndarray] = None,
+    remark: Optional[str] = None,
+    parents: Optional[Sequence[str]] = None,
+    parents_chain_index: Optional[Sequence[int]] = None,
+) -> Protein:
+    """Builds a `Protein` out of model inputs and model outputs.
+
+    Args:
+      features: Model inputs; the "aatype" and "residue_index" entries are read.
+      result: Model outputs; the "final_atom_positions" and "final_atom_mask" entries are read.
+      b_factors: (Optional) B-factors to use for the protein. Defaults to zeros shaped like the final atom mask.
+      chain_index: (Optional) Chain indices for multi-chain predictions
+      remark: (Optional) Remark about the prediction
+      parents: (Optional) List of template names
+      parents_chain_index: (Optional) Chain index of each entry of `parents`
+    Returns:
+      A protein instance whose residue index is the input residue index shifted up by one.
+    """
+    residue_types = features["aatype"]
+    positions = result["final_atom_positions"]
+    mask = result["final_atom_mask"]
+    shifted_residue_index = features["residue_index"] + 1
+
+    if b_factors is None:
+        b_factors = np.zeros_like(mask)
+
+    return Protein(
+        aatype=residue_types,
+        atom_positions=positions,
+        atom_mask=mask,
+        residue_index=shifted_residue_index,
+        b_factors=b_factors,
+        chain_index=chain_index,
+        remark=remark,
+        parents=parents,
+        parents_chain_index=parents_chain_index,
+    )
diff --git a/phivenv/Lib/site-packages/transformers/models/esm/openfold_utils/residue_constants.py b/phivenv/Lib/site-packages/transformers/models/esm/openfold_utils/residue_constants.py
new file mode 100644
index 0000000000000000000000000000000000000000..e92e65d29bfb2e7a0ad640e71d983c1c467d3a54
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/esm/openfold_utils/residue_constants.py
@@ -0,0 +1,979 @@
+# Copyright 2021 AlQuraishi Laboratory
+# Copyright 2021 DeepMind Technologies Limited
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Constants used in AlphaFold."""
+
+import collections
+import copy
+import functools
+from collections.abc import Mapping, Sequence
+from importlib import resources
+
+import numpy as np
+
+
+# Internal import (35fd).
+
+
+# Distance from one CA to next CA [trans configuration: omega = 180].
+ca_ca = 3.80209737096
+
+# Format: The list for each AA type contains chi1, chi2, chi3, chi4 in
+# this order (or a relevant subset from chi1 onwards). ALA and GLY don't have
+# chi angles so their chi angle lists are empty.
+chi_angles_atoms: dict[str, list[list[str]]] = {
+    "ALA": [],
+    # Chi5 in arginine is always 0 +- 5 degrees, so ignore it.
+    "ARG": [["N", "CA", "CB", "CG"], ["CA", "CB", "CG", "CD"], ["CB", "CG", "CD", "NE"], ["CG", "CD", "NE", "CZ"]],
+    "ASN": [["N", "CA", "CB", "CG"], ["CA", "CB", "CG", "OD1"]],
+    "ASP": [["N", "CA", "CB", "CG"], ["CA", "CB", "CG", "OD1"]],
+    "CYS": [["N", "CA", "CB", "SG"]],
+    "GLN": [["N", "CA", "CB", "CG"], ["CA", "CB", "CG", "CD"], ["CB", "CG", "CD", "OE1"]],
+    "GLU": [["N", "CA", "CB", "CG"], ["CA", "CB", "CG", "CD"], ["CB", "CG", "CD", "OE1"]],
+    "GLY": [],
+    "HIS": [["N", "CA", "CB", "CG"], ["CA", "CB", "CG", "ND1"]],
+    "ILE": [["N", "CA", "CB", "CG1"], ["CA", "CB", "CG1", "CD1"]],
+    "LEU": [["N", "CA", "CB", "CG"], ["CA", "CB", "CG", "CD1"]],
+    "LYS": [["N", "CA", "CB", "CG"], ["CA", "CB", "CG", "CD"], ["CB", "CG", "CD", "CE"], ["CG", "CD", "CE", "NZ"]],
+    "MET": [["N", "CA", "CB", "CG"], ["CA", "CB", "CG", "SD"], ["CB", "CG", "SD", "CE"]],
+    "PHE": [["N", "CA", "CB", "CG"], ["CA", "CB", "CG", "CD1"]],
+    "PRO": [["N", "CA", "CB", "CG"], ["CA", "CB", "CG", "CD"]],
+    "SER": [["N", "CA", "CB", "OG"]],
+    "THR": [["N", "CA", "CB", "OG1"]],
+    "TRP": [["N", "CA", "CB", "CG"], ["CA", "CB", "CG", "CD1"]],
+    "TYR": [["N", "CA", "CB", "CG"], ["CA", "CB", "CG", "CD1"]],
+    "VAL": [["N", "CA", "CB", "CG1"]],
+}
+
+# If chi angles given in fixed-length array, this matrix determines how to mask
+# them for each AA type. The order is as per restype_order (see below).
+# Each row is [chi1, chi2, chi3, chi4]; 1.0 marks a chi angle that exists for the residue type.
+# NOTE: there are 20 rows and no trailing UNK row, unlike chi_pi_periodic, which has 21.
+chi_angles_mask: list[list[float]] = [
+    [0.0, 0.0, 0.0, 0.0],  # ALA
+    [1.0, 1.0, 1.0, 1.0],  # ARG
+    [1.0, 1.0, 0.0, 0.0],  # ASN
+    [1.0, 1.0, 0.0, 0.0],  # ASP
+    [1.0, 0.0, 0.0, 0.0],  # CYS
+    [1.0, 1.0, 1.0, 0.0],  # GLN
+    [1.0, 1.0, 1.0, 0.0],  # GLU
+    [0.0, 0.0, 0.0, 0.0],  # GLY
+    [1.0, 1.0, 0.0, 0.0],  # HIS
+    [1.0, 1.0, 0.0, 0.0],  # ILE
+    [1.0, 1.0, 0.0, 0.0],  # LEU
+    [1.0, 1.0, 1.0, 1.0],  # LYS
+    [1.0, 1.0, 1.0, 0.0],  # MET
+    [1.0, 1.0, 0.0, 0.0],  # PHE
+    [1.0, 1.0, 0.0, 0.0],  # PRO
+    [1.0, 0.0, 0.0, 0.0],  # SER
+    [1.0, 0.0, 0.0, 0.0],  # THR
+    [1.0, 1.0, 0.0, 0.0],  # TRP
+    [1.0, 1.0, 0.0, 0.0],  # TYR
+    [1.0, 0.0, 0.0, 0.0],  # VAL
+]
+
+# The following chi angles are pi periodic: they can be rotated by a multiple
+# of pi without affecting the structure.
+chi_pi_periodic: list[list[float]] = [
+    [0.0, 0.0, 0.0, 0.0],  # ALA
+    [0.0, 0.0, 0.0, 0.0],  # ARG
+    [0.0, 0.0, 0.0, 0.0],  # ASN
+    [0.0, 1.0, 0.0, 0.0],  # ASP
+    [0.0, 0.0, 0.0, 0.0],  # CYS
+    [0.0, 0.0, 0.0, 0.0],  # GLN
+    [0.0, 0.0, 1.0, 0.0],  # GLU
+    [0.0, 0.0, 0.0, 0.0],  # GLY
+    [0.0, 0.0, 0.0, 0.0],  # HIS
+    [0.0, 0.0, 0.0, 0.0],  # ILE
+    [0.0, 0.0, 0.0, 0.0],  # LEU
+    [0.0, 0.0, 0.0, 0.0],  # LYS
+    [0.0, 0.0, 0.0, 0.0],  # MET
+    [0.0, 1.0, 0.0, 0.0],  # PHE
+    [0.0, 0.0, 0.0, 0.0],  # PRO
+    [0.0, 0.0, 0.0, 0.0],  # SER
+    [0.0, 0.0, 0.0, 0.0],  # THR
+    [0.0, 0.0, 0.0, 0.0],  # TRP
+    [0.0, 1.0, 0.0, 0.0],  # TYR
+    [0.0, 0.0, 0.0, 0.0],  # VAL
+    [0.0, 0.0, 0.0, 0.0],  # UNK
+]
+
+# Atoms positions relative to the 8 rigid groups, defined by the pre-omega, phi,
+# psi and chi angles:
+# 0: 'backbone group',
+# 1: 'pre-omega-group', (empty)
+# 2: 'phi-group', (currently empty, because it defines only hydrogens)
+# 3: 'psi-group',
+# 4,5,6,7: 'chi1,2,3,4-group'
+# The atom positions are relative to the axis-end-atom of the corresponding
+# rotation axis. The x-axis is in direction of the rotation axis, and the y-axis
+# is defined such that the dihedral-angle-defining atom (the last entry in
+# chi_angles_atoms above) is in the xy-plane (with a positive y-coordinate).
+# format: [atomname, group_idx, rel_position]
+rigid_group_atom_positions: dict[str, list[tuple[str, int, tuple[float, float, float]]]] = {
+    "ALA": [
+        ("N", 0, (-0.525, 1.363, 0.000)),
+        ("CA", 0, (0.000, 0.000, 0.000)),
+        ("C", 0, (1.526, -0.000, -0.000)),
+        ("CB", 0, (-0.529, -0.774, -1.205)),
+        ("O", 3, (0.627, 1.062, 0.000)),
+    ],
+    "ARG": [
+        ("N", 0, (-0.524, 1.362, -0.000)),
+        ("CA", 0, (0.000, 0.000, 0.000)),
+        ("C", 0, (1.525, -0.000, -0.000)),
+        ("CB", 0, (-0.524, -0.778, -1.209)),
+        ("O", 3, (0.626, 1.062, 0.000)),
+        ("CG", 4, (0.616, 1.390, -0.000)),
+        ("CD", 5, (0.564, 1.414, 0.000)),
+        ("NE", 6, (0.539, 1.357, -0.000)),
+        ("NH1", 7, (0.206, 2.301, 0.000)),
+        ("NH2", 7, (2.078, 0.978, -0.000)),
+        ("CZ", 7, (0.758, 1.093, -0.000)),
+    ],
+    "ASN": [
+        ("N", 0, (-0.536, 1.357, 0.000)),
+        ("CA", 0, (0.000, 0.000, 0.000)),
+        ("C", 0, (1.526, -0.000, -0.000)),
+        ("CB", 0, (-0.531, -0.787, -1.200)),
+        ("O", 3, (0.625, 1.062, 0.000)),
+        ("CG", 4, (0.584, 1.399, 0.000)),
+        ("ND2", 5, (0.593, -1.188, 0.001)),
+        ("OD1", 5, (0.633, 1.059, 0.000)),
+    ],
+    "ASP": [
+        ("N", 0, (-0.525, 1.362, -0.000)),
+        ("CA", 0, (0.000, 0.000, 0.000)),
+        ("C", 0, (1.527, 0.000, -0.000)),
+        ("CB", 0, (-0.526, -0.778, -1.208)),
+        ("O", 3, (0.626, 1.062, -0.000)),
+        ("CG", 4, (0.593, 1.398, -0.000)),
+        ("OD1", 5, (0.610, 1.091, 0.000)),
+        ("OD2", 5, (0.592, -1.101, -0.003)),
+    ],
+    "CYS": [
+        ("N", 0, (-0.522, 1.362, -0.000)),
+        ("CA", 0, (0.000, 0.000, 0.000)),
+        ("C", 0, (1.524, 0.000, 0.000)),
+        ("CB", 0, (-0.519, -0.773, -1.212)),
+        ("O", 3, (0.625, 1.062, -0.000)),
+        ("SG", 4, (0.728, 1.653, 0.000)),
+    ],
+    "GLN": [
+        ("N", 0, (-0.526, 1.361, -0.000)),
+        ("CA", 0, (0.000, 0.000, 0.000)),
+        ("C", 0, (1.526, 0.000, 0.000)),
+        ("CB", 0, (-0.525, -0.779, -1.207)),
+        ("O", 3, (0.626, 1.062, -0.000)),
+        ("CG", 4, (0.615, 1.393, 0.000)),
+        ("CD", 5, (0.587, 1.399, -0.000)),
+        ("NE2", 6, (0.593, -1.189, -0.001)),
+        ("OE1", 6, (0.634, 1.060, 0.000)),
+    ],
+    "GLU": [
+        ("N", 0, (-0.528, 1.361, 0.000)),
+        ("CA", 0, (0.000, 0.000, 0.000)),
+        ("C", 0, (1.526, -0.000, -0.000)),
+        ("CB", 0, (-0.526, -0.781, -1.207)),
+        ("O", 3, (0.626, 1.062, 0.000)),
+        ("CG", 4, (0.615, 1.392, 0.000)),
+        ("CD", 5, (0.600, 1.397, 0.000)),
+        ("OE1", 6, (0.607, 1.095, -0.000)),
+        ("OE2", 6, (0.589, -1.104, -0.001)),
+    ],
+    "GLY": [
+        ("N", 0, (-0.572, 1.337, 0.000)),
+        ("CA", 0, (0.000, 0.000, 0.000)),
+        ("C", 0, (1.517, -0.000, -0.000)),
+        ("O", 3, (0.626, 1.062, -0.000)),
+    ],
+    "HIS": [
+        ("N", 0, (-0.527, 1.360, 0.000)),
+        ("CA", 0, (0.000, 0.000, 0.000)),
+        ("C", 0, (1.525, 0.000, 0.000)),
+        ("CB", 0, (-0.525, -0.778, -1.208)),
+        ("O", 3, (0.625, 1.063, 0.000)),
+        ("CG", 4, (0.600, 1.370, -0.000)),
+        ("CD2", 5, (0.889, -1.021, 0.003)),
+        ("ND1", 5, (0.744, 1.160, -0.000)),
+        ("CE1", 5, (2.030, 0.851, 0.002)),
+        ("NE2", 5, (2.145, -0.466, 0.004)),
+    ],
+    "ILE": [
+        ("N", 0, (-0.493, 1.373, -0.000)),
+        ("CA", 0, (0.000, 0.000, 0.000)),
+        ("C", 0, (1.527, -0.000, -0.000)),
+        ("CB", 0, (-0.536, -0.793, -1.213)),
+        ("O", 3, (0.627, 1.062, -0.000)),
+        ("CG1", 4, (0.534, 1.437, -0.000)),
+        ("CG2", 4, (0.540, -0.785, -1.199)),
+        ("CD1", 5, (0.619, 1.391, 0.000)),
+    ],
+    "LEU": [
+        ("N", 0, (-0.520, 1.363, 0.000)),
+        ("CA", 0, (0.000, 0.000, 0.000)),
+        ("C", 0, (1.525, -0.000, -0.000)),
+        ("CB", 0, (-0.522, -0.773, -1.214)),
+        ("O", 3, (0.625, 1.063, -0.000)),
+        ("CG", 4, (0.678, 1.371, 0.000)),
+        ("CD1", 5, (0.530, 1.430, -0.000)),
+        ("CD2", 5, (0.535, -0.774, 1.200)),
+    ],
+    "LYS": [
+        ("N", 0, (-0.526, 1.362, -0.000)),
+        ("CA", 0, (0.000, 0.000, 0.000)),
+        ("C", 0, (1.526, 0.000, 0.000)),
+        ("CB", 0, (-0.524, -0.778, -1.208)),
+        ("O", 3, (0.626, 1.062, -0.000)),
+        ("CG", 4, (0.619, 1.390, 0.000)),
+        ("CD", 5, (0.559, 1.417, 0.000)),
+        ("CE", 6, (0.560, 1.416, 0.000)),
+        ("NZ", 7, (0.554, 1.387, 0.000)),
+    ],
+    "MET": [
+        ("N", 0, (-0.521, 1.364, -0.000)),
+        ("CA", 0, (0.000, 0.000, 0.000)),
+        ("C", 0, (1.525, 0.000, 0.000)),
+        ("CB", 0, (-0.523, -0.776, -1.210)),
+        ("O", 3, (0.625, 1.062, -0.000)),
+        ("CG", 4, (0.613, 1.391, -0.000)),
+        ("SD", 5, (0.703, 1.695, 0.000)),
+        ("CE", 6, (0.320, 1.786, -0.000)),
+    ],
+    "PHE": [
+        ("N", 0, (-0.518, 1.363, 0.000)),
+        ("CA", 0, (0.000, 0.000, 0.000)),
+        ("C", 0, (1.524, 0.000, -0.000)),
+        ("CB", 0, (-0.525, -0.776, -1.212)),
+        ("O", 3, (0.626, 1.062, -0.000)),
+        ("CG", 4, (0.607, 1.377, 0.000)),
+        ("CD1", 5, (0.709, 1.195, -0.000)),
+        ("CD2", 5, (0.706, -1.196, 0.000)),
+        ("CE1", 5, (2.102, 1.198, -0.000)),
+        ("CE2", 5, (2.098, -1.201, -0.000)),
+        ("CZ", 5, (2.794, -0.003, -0.001)),
+    ],
+    "PRO": [
+        ("N", 0, (-0.566, 1.351, -0.000)),
+        ("CA", 0, (0.000, 0.000, 0.000)),
+        ("C", 0, (1.527, -0.000, 0.000)),
+        ("CB", 0, (-0.546, -0.611, -1.293)),
+        ("O", 3, (0.621, 1.066, 0.000)),
+        ("CG", 4, (0.382, 1.445, 0.0)),
+        # ('CD', 5, (0.427, 1.440, 0.0)),
+        ("CD", 5, (0.477, 1.424, 0.0)),  # manually made angle 2 degrees larger
+    ],
+    "SER": [
+        ("N", 0, (-0.529, 1.360, -0.000)),
+        ("CA", 0, (0.000, 0.000, 0.000)),
+        ("C", 0, (1.525, -0.000, -0.000)),
+        ("CB", 0, (-0.518, -0.777, -1.211)),
+        ("O", 3, (0.626, 1.062, -0.000)),
+        ("OG", 4, (0.503, 1.325, 0.000)),
+    ],
+    "THR": [
+        ("N", 0, (-0.517, 1.364, 0.000)),
+        ("CA", 0, (0.000, 0.000, 0.000)),
+        ("C", 0, (1.526, 0.000, -0.000)),
+        ("CB", 0, (-0.516, -0.793, -1.215)),
+        ("O", 3, (0.626, 1.062, 0.000)),
+        ("CG2", 4, (0.550, -0.718, -1.228)),
+        ("OG1", 4, (0.472, 1.353, 0.000)),
+    ],
+    "TRP": [
+        ("N", 0, (-0.521, 1.363, 0.000)),
+        ("CA", 0, (0.000, 0.000, 0.000)),
+        ("C", 0, (1.525, -0.000, 0.000)),
+        ("CB", 0, (-0.523, -0.776, -1.212)),
+        ("O", 3, (0.627, 1.062, 0.000)),
+        ("CG", 4, (0.609, 1.370, -0.000)),
+        ("CD1", 5, (0.824, 1.091, 0.000)),
+        ("CD2", 5, (0.854, -1.148, -0.005)),
+        ("CE2", 5, (2.186, -0.678, -0.007)),
+        ("CE3", 5, (0.622, -2.530, -0.007)),
+        ("NE1", 5, (2.140, 0.690, -0.004)),
+        ("CH2", 5, (3.028, -2.890, -0.013)),
+        ("CZ2", 5, (3.283, -1.543, -0.011)),
+        ("CZ3", 5, (1.715, -3.389, -0.011)),
+    ],
+    "TYR": [
+        ("N", 0, (-0.522, 1.362, 0.000)),
+        ("CA", 0, (0.000, 0.000, 0.000)),
+        ("C", 0, (1.524, -0.000, -0.000)),
+        ("CB", 0, (-0.522, -0.776, -1.213)),
+        ("O", 3, (0.627, 1.062, -0.000)),
+        ("CG", 4, (0.607, 1.382, -0.000)),
+        ("CD1", 5, (0.716, 1.195, -0.000)),
+        ("CD2", 5, (0.713, -1.194, -0.001)),
+        ("CE1", 5, (2.107, 1.200, -0.002)),
+        ("CE2", 5, (2.104, -1.201, -0.003)),
+        ("OH", 5, (4.168, -0.002, -0.005)),
+        ("CZ", 5, (2.791, -0.001, -0.003)),
+    ],
+    "VAL": [
+        ("N", 0, (-0.494, 1.373, -0.000)),
+        ("CA", 0, (0.000, 0.000, 0.000)),
+        ("C", 0, (1.527, -0.000, -0.000)),
+        ("CB", 0, (-0.533, -0.795, -1.213)),
+        ("O", 3, (0.627, 1.062, -0.000)),
+        ("CG1", 4, (0.540, 1.429, -0.000)),
+        ("CG2", 4, (0.533, -0.776, 1.203)),
+    ],
+}
+
+# A list of atoms (excluding hydrogen) for each AA type. PDB naming convention.
+residue_atoms: dict[str, list[str]] = {
+    "ALA": ["C", "CA", "CB", "N", "O"],
+    "ARG": ["C", "CA", "CB", "CG", "CD", "CZ", "N", "NE", "O", "NH1", "NH2"],
+    "ASP": ["C", "CA", "CB", "CG", "N", "O", "OD1", "OD2"],
+    "ASN": ["C", "CA", "CB", "CG", "N", "ND2", "O", "OD1"],
+    "CYS": ["C", "CA", "CB", "N", "O", "SG"],
+    "GLU": ["C", "CA", "CB", "CG", "CD", "N", "O", "OE1", "OE2"],
+    "GLN": ["C", "CA", "CB", "CG", "CD", "N", "NE2", "O", "OE1"],
+    "GLY": ["C", "CA", "N", "O"],
+    "HIS": ["C", "CA", "CB", "CG", "CD2", "CE1", "N", "ND1", "NE2", "O"],
+    "ILE": ["C", "CA", "CB", "CG1", "CG2", "CD1", "N", "O"],
+    "LEU": ["C", "CA", "CB", "CG", "CD1", "CD2", "N", "O"],
+    "LYS": ["C", "CA", "CB", "CG", "CD", "CE", "N", "NZ", "O"],
+    "MET": ["C", "CA", "CB", "CG", "CE", "N", "O", "SD"],
+    "PHE": ["C", "CA", "CB", "CG", "CD1", "CD2", "CE1", "CE2", "CZ", "N", "O"],
+    "PRO": ["C", "CA", "CB", "CG", "CD", "N", "O"],
+    "SER": ["C", "CA", "CB", "N", "O", "OG"],
+    "THR": ["C", "CA", "CB", "CG2", "N", "O", "OG1"],
+    "TRP": ["C", "CA", "CB", "CG", "CD1", "CD2", "CE2", "CE3", "CZ2", "CZ3", "CH2", "N", "NE1", "O"],
+    "TYR": ["C", "CA", "CB", "CG", "CD1", "CD2", "CE1", "CE2", "CZ", "N", "O", "OH"],
+    "VAL": ["C", "CA", "CB", "CG1", "CG2", "N", "O"],
+}
+
+# Naming swaps for ambiguous atom names.
+# Due to symmetries in the amino acids the naming of atoms is ambiguous in
+# 4 of the 20 amino acids.
+# (The LDDT paper lists 7 amino acids as ambiguous, but the naming ambiguities
+# in LEU, VAL and ARG can be resolved by using the 3d constellations of
+# the 'ambiguous' atoms and their neighbours)
+# TODO: ^ interpret this
+residue_atom_renaming_swaps: dict[str, dict[str, str]] = {
+    "ASP": {"OD1": "OD2"},
+    "GLU": {"OE1": "OE2"},
+    "PHE": {"CD1": "CD2", "CE1": "CE2"},
+    "TYR": {"CD1": "CD2", "CE1": "CE2"},
+}
+
+# Van der Waals radii [Angstrom] of the atoms (from Wikipedia)
+van_der_waals_radius: dict[str, float] = {
+    "C": 1.7,
+    "N": 1.55,
+    "O": 1.52,
+    "S": 1.8,
+}
+
+# Record for one bond: the two atom names plus its `length` and `stddev`
+# (`load_stereo_chemical_props` fills the last two with floats).
+Bond = collections.namedtuple("Bond", ["atom1_name", "atom2_name", "length", "stddev"])
+# Record for one bond angle over three atoms, with the angle in `angle_rad` and its `stddev`.
+# NOTE: the third field is spelled `atom3name` (no underscore), unlike `atom1_name` / `atom2_name`; the
+# spelling is part of the tuple's interface.
+BondAngle = collections.namedtuple(
+    "BondAngle",
+    ["atom1_name", "atom2_name", "atom3name", "angle_rad", "stddev"],
+)
+
+
+def map_structure_with_atom_order(in_list: list, first_call: bool = True) -> list:
+    """Replaces every string in a nested list structure by its index in `atom_order`.
+
+    Args:
+      in_list: Arbitrarily nested lists whose leaves are strings.
+      first_call: When True (the default) the input is deep-copied first, so the caller's structure is left
+        untouched. Recursive calls pass False and modify the copy in place.
+
+    Returns:
+      The mapped structure.
+
+    Raises:
+      TypeError: If an element is neither a list nor a string.
+    """
+    if first_call:
+        in_list = copy.deepcopy(in_list)
+    for idx, item in enumerate(in_list):
+        if isinstance(item, list):
+            in_list[idx] = map_structure_with_atom_order(item, first_call=False)
+        elif isinstance(item, str):
+            in_list[idx] = atom_order[item]
+        else:
+            raise TypeError("Unexpected type when mapping nested lists!")
+    return in_list
+
+
+@functools.cache
+def load_stereo_chemical_props() -> tuple[
+    Mapping[str, list[Bond]],
+    Mapping[str, list[Bond]],
+    Mapping[str, list[BondAngle]],
+]:
+    """Load stereo_chemical_props.txt into a nice structure.
+
+    Load literature values for bond lengths and bond angles and translate bond angles into the length of the opposite
+    edge of the triangle ("residue_virtual_bonds").
+
+    Returns:
+      residue_bonds: dict that maps resname --> list of Bond tuples
+      residue_virtual_bonds: dict that maps resname --> list of Bond tuples
+      residue_bond_angles: dict that maps resname --> list of BondAngle tuples
+    """
+    # TODO: this file should be downloaded in a setup script
+    stereo_chemical_props = resources.read_text("openfold.resources", "stereo_chemical_props.txt")
+
+    lines_iter = iter(stereo_chemical_props.splitlines())
+    # Load bond lengths.
+    residue_bonds: dict[str, list[Bond]] = {}
+    next(lines_iter)  # Skip header line.
+    for line in lines_iter:
+        if line.strip() == "-":  # A lone "-" ends the bond-length section.
+            break
+        bond, resname, bond_length, stddev = line.split()
+        atom1, atom2 = bond.split("-")
+        if resname not in residue_bonds:
+            residue_bonds[resname] = []
+        residue_bonds[resname].append(Bond(atom1, atom2, float(bond_length), float(stddev)))
+    residue_bonds["UNK"] = []
+
+    # Load bond angles.
+    residue_bond_angles: dict[str, list[BondAngle]] = {}
+    next(lines_iter)  # Skip empty line.
+    next(lines_iter)  # Skip header line.
+    for line in lines_iter:
+        if line.strip() == "-":  # A lone "-" ends the bond-angle section.
+            break
+        bond, resname, angle_degree, stddev_degree = line.split()
+        atom1, atom2, atom3 = bond.split("-")
+        if resname not in residue_bond_angles:
+            residue_bond_angles[resname] = []
+        residue_bond_angles[resname].append(
+            BondAngle(
+                atom1,
+                atom2,
+                atom3,
+                float(angle_degree) / 180.0 * np.pi,  # Degrees to radians.
+                float(stddev_degree) / 180.0 * np.pi,  # Degrees to radians.
+            )
+        )
+    residue_bond_angles["UNK"] = []
+
+    def make_bond_key(atom1_name: str, atom2_name: str) -> str:
+        """Unique key to lookup bonds."""
+        return "-".join(sorted([atom1_name, atom2_name]))  # Sorting makes the key independent of atom order.
+
+    # Translate bond angles into distances ("virtual bonds").
+    residue_virtual_bonds: dict[str, list[Bond]] = {}
+    for resname, bond_angles in residue_bond_angles.items():
+        # Create a fast lookup dict for bond lengths.
+        bond_cache: dict[str, Bond] = {}
+        for b in residue_bonds[resname]:
+            bond_cache[make_bond_key(b.atom1_name, b.atom2_name)] = b
+        residue_virtual_bonds[resname] = []
+        for ba in bond_angles:
+            bond1 = bond_cache[make_bond_key(ba.atom1_name, ba.atom2_name)]
+            bond2 = bond_cache[make_bond_key(ba.atom2_name, ba.atom3name)]
+
+            # Law of cosines gives the atom1-atom3 distance: c^2 = a^2 + b^2 - 2ab*cos(gamma).
+            gamma = ba.angle_rad
+            length = np.sqrt(bond1.length**2 + bond2.length**2 - 2 * bond1.length * bond2.length * np.cos(gamma))
+
+            # Propagation of uncertainty assuming uncorrelated errors.
+            dl_outer = 0.5 / length  # Shared factor from differentiating the square root.
+            dl_dgamma = (2 * bond1.length * bond2.length * np.sin(gamma)) * dl_outer
+            dl_db1 = (2 * bond1.length - 2 * bond2.length * np.cos(gamma)) * dl_outer
+            dl_db2 = (2 * bond2.length - 2 * bond1.length * np.cos(gamma)) * dl_outer
+            stddev = np.sqrt(
+                (dl_dgamma * ba.stddev) ** 2 + (dl_db1 * bond1.stddev) ** 2 + (dl_db2 * bond2.stddev) ** 2
+            )
+            residue_virtual_bonds[resname].append(Bond(ba.atom1_name, ba.atom3name, length, stddev))
+
+    return (residue_bonds, residue_virtual_bonds, residue_bond_angles)
+
+
+# Between-residue bond lengths for general bonds (first element) and for Proline
+# (second element).
+between_res_bond_length_c_n: tuple[float, float] = (1.329, 1.341)
+between_res_bond_length_stddev_c_n: tuple[float, float] = (0.014, 0.016)
+
+# Between-residue cos_angles.
+between_res_cos_angles_c_n_ca: tuple[float, float] = (-0.5203, 0.0353)  # degrees: 121.352 +- 2.315
+between_res_cos_angles_ca_c_n: tuple[float, float] = (-0.4473, 0.0311)  # degrees: 116.568 +- 1.995
+
+# This mapping is used when we need to store atom data in a format that requires
+# fixed atom data size for every residue (e.g. a numpy array).
+atom_types: list[str] = [
+    "N",
+    "CA",
+    "C",
+    "CB",
+    "O",
+    "CG",
+    "CG1",
+    "CG2",
+    "OG",
+    "OG1",
+    "SG",
+    "CD",
+    "CD1",
+    "CD2",
+    "ND1",
+    "ND2",
+    "OD1",
+    "OD2",
+    "SD",
+    "CE",
+    "CE1",
+    "CE2",
+    "CE3",
+    "NE",
+    "NE1",
+    "NE2",
+    "OE1",
+    "OE2",
+    "CH2",
+    "NH1",
+    "NH2",
+    "OH",
+    "CZ",
+    "CZ2",
+    "CZ3",
+    "NZ",
+    "OXT",
+]
+atom_order: dict[str, int] = {atom_type: i for i, atom_type in enumerate(atom_types)}
+atom_type_num = len(atom_types)  # := 37.
+
+# A compact atom encoding with 14 columns
+# pylint: disable=line-too-long
+restype_name_to_atom14_names: dict[str, list[str]] = {
+    "ALA": ["N", "CA", "C", "O", "CB", "", "", "", "", "", "", "", "", ""],
+    "ARG": ["N", "CA", "C", "O", "CB", "CG", "CD", "NE", "CZ", "NH1", "NH2", "", "", ""],
+    "ASN": ["N", "CA", "C", "O", "CB", "CG", "OD1", "ND2", "", "", "", "", "", ""],
+    "ASP": ["N", "CA", "C", "O", "CB", "CG", "OD1", "OD2", "", "", "", "", "", ""],
+    "CYS": ["N", "CA", "C", "O", "CB", "SG", "", "", "", "", "", "", "", ""],
+    "GLN": ["N", "CA", "C", "O", "CB", "CG", "CD", "OE1", "NE2", "", "", "", "", ""],
+    "GLU": ["N", "CA", "C", "O", "CB", "CG", "CD", "OE1", "OE2", "", "", "", "", ""],
+    "GLY": ["N", "CA", "C", "O", "", "", "", "", "", "", "", "", "", ""],
+    "HIS": ["N", "CA", "C", "O", "CB", "CG", "ND1", "CD2", "CE1", "NE2", "", "", "", ""],
+    "ILE": ["N", "CA", "C", "O", "CB", "CG1", "CG2", "CD1", "", "", "", "", "", ""],
+    "LEU": ["N", "CA", "C", "O", "CB", "CG", "CD1", "CD2", "", "", "", "", "", ""],
+    "LYS": ["N", "CA", "C", "O", "CB", "CG", "CD", "CE", "NZ", "", "", "", "", ""],
+    "MET": ["N", "CA", "C", "O", "CB", "CG", "SD", "CE", "", "", "", "", "", ""],
+    "PHE": ["N", "CA", "C", "O", "CB", "CG", "CD1", "CD2", "CE1", "CE2", "CZ", "", "", ""],
+    "PRO": ["N", "CA", "C", "O", "CB", "CG", "CD", "", "", "", "", "", "", ""],
+    "SER": ["N", "CA", "C", "O", "CB", "OG", "", "", "", "", "", "", "", ""],
+    "THR": ["N", "CA", "C", "O", "CB", "OG1", "CG2", "", "", "", "", "", "", ""],
+    "TRP": ["N", "CA", "C", "O", "CB", "CG", "CD1", "CD2", "NE1", "CE2", "CE3", "CZ2", "CZ3", "CH2"],
+    "TYR": ["N", "CA", "C", "O", "CB", "CG", "CD1", "CD2", "CE1", "CE2", "CZ", "OH", "", ""],
+    "VAL": ["N", "CA", "C", "O", "CB", "CG1", "CG2", "", "", "", "", "", "", ""],
+    "UNK": ["", "", "", "", "", "", "", "", "", "", "", "", "", ""],
+}
+# pylint: enable=line-too-long
+
+
+# This is the standard residue order when coding AA type as a number.
+# Reproduce it by taking 3-letter AA codes and sorting them alphabetically.
+restypes: list[str] = [
+    "A",
+    "R",
+    "N",
+    "D",
+    "C",
+    "Q",
+    "E",
+    "G",
+    "H",
+    "I",
+    "L",
+    "K",
+    "M",
+    "F",
+    "P",
+    "S",
+    "T",
+    "W",
+    "Y",
+    "V",
+]
+restype_order: dict[str, int] = {restype: i for i, restype in enumerate(restypes)}
+restype_num = len(restypes)  # := 20.
+unk_restype_index = restype_num  # Catch-all index for unknown restypes.
+
+restypes_with_x: list[str] = restypes + ["X"]
+restype_order_with_x: dict[str, int] = {restype: i for i, restype in enumerate(restypes_with_x)}
+
+
+def sequence_to_onehot(sequence: str, mapping: Mapping[str, int], map_unknown_to_x: bool = False) -> np.ndarray:
+    """One-hot encode an amino acid sequence.
+
+    Args:
+      sequence: The amino acid sequence to encode, one character per residue.
+      mapping: Maps each amino acid character to its column index.
+      map_unknown_to_x: When True, upper-case letters absent from the mapping are encoded as 'X' (so the mapping
+        must contain 'X') and any other character is rejected. When False, every character is looked up directly
+        and a character missing from the mapping raises an error.
+
+    Returns:
+      An int32 numpy array of shape (seq_len, num_unique_aas) holding a single 1 per row.
+
+    Raises:
+      ValueError: If the mapping values are not exactly 0..num_unique_aas - 1 with no gaps, or if
+        map_unknown_to_x is True and the sequence contains a character that is not an upper-case letter.
+    """
+    width = max(mapping.values()) + 1
+    used_ids = sorted(set(mapping.values()))
+
+    if used_ids != list(range(width)):
+        raise ValueError(
+            "The mapping must have values from 0 to num_unique_aas-1 without any gaps. Got: %s"
+            % sorted(mapping.values())
+        )
+
+    encoded = np.zeros((len(sequence), width), dtype=np.int32)
+    for row, residue in enumerate(sequence):
+        if not map_unknown_to_x:
+            column = mapping[residue]
+        elif residue.isalpha() and residue.isupper():
+            # The fallback mapping["X"] is evaluated eagerly, so it is looked up even for known residues.
+            column = mapping.get(residue, mapping["X"])
+        else:
+            raise ValueError(f"Invalid character in the sequence: {residue}")
+        encoded[row, column] = 1
+
+    return encoded
+
+
+restype_1to3: dict[str, str] = {
+    "A": "ALA",
+    "R": "ARG",
+    "N": "ASN",
+    "D": "ASP",
+    "C": "CYS",
+    "Q": "GLN",
+    "E": "GLU",
+    "G": "GLY",
+    "H": "HIS",
+    "I": "ILE",
+    "L": "LEU",
+    "K": "LYS",
+    "M": "MET",
+    "F": "PHE",
+    "P": "PRO",
+    "S": "SER",
+    "T": "THR",
+    "W": "TRP",
+    "Y": "TYR",
+    "V": "VAL",
+}
+
+
+# NB: restype_3to1 differs from Bio.PDB.protein_letters_3to1 by being a simple
+# 1-to-1 mapping of 3 letter names to one letter names. The latter contains
+# many more, and less common, three letter names as keys and maps many of these
+# to the same one letter name (including 'X' and 'U' which we don't use here).
+restype_3to1: dict[str, str] = {v: k for k, v in restype_1to3.items()}
+
+# Define a restype name for all unknown residues.
+unk_restype = "UNK"
+
+resnames: list[str] = [restype_1to3[r] for r in restypes] + [unk_restype]
+resname_to_idx: dict[str, int] = {resname: i for i, resname in enumerate(resnames)}
+
+
+# The mapping here uses hhblits convention, so that B is mapped to D, J and O
+# are mapped to X, U is mapped to C, and Z is mapped to E. Other than that the
+# remaining 20 amino acids are kept in alphabetical order.
+# There are 2 non-amino acid codes, X (representing any amino acid) and
+# "-" representing a missing amino acid in an alignment.  The id for these
+# codes is put at the end (20 and 21) so that they can easily be ignored if
+# desired.
+HHBLITS_AA_TO_ID: dict[str, int] = {
+    "A": 0,
+    "B": 2,
+    "C": 1,
+    "D": 2,
+    "E": 3,
+    "F": 4,
+    "G": 5,
+    "H": 6,
+    "I": 7,
+    "J": 20,
+    "K": 8,
+    "L": 9,
+    "M": 10,
+    "N": 11,
+    "O": 20,
+    "P": 12,
+    "Q": 13,
+    "R": 14,
+    "S": 15,
+    "T": 16,
+    "U": 1,
+    "V": 17,
+    "W": 18,
+    "X": 20,
+    "Y": 19,
+    "Z": 3,
+    "-": 21,
+}
+
+# Partial inversion of HHBLITS_AA_TO_ID.
+ID_TO_HHBLITS_AA: dict[int, str] = {
+    0: "A",
+    1: "C",  # Also U.
+    2: "D",  # Also B.
+    3: "E",  # Also Z.
+    4: "F",
+    5: "G",
+    6: "H",
+    7: "I",
+    8: "K",
+    9: "L",
+    10: "M",
+    11: "N",
+    12: "P",
+    13: "Q",
+    14: "R",
+    15: "S",
+    16: "T",
+    17: "V",
+    18: "W",
+    19: "Y",
+    20: "X",  # Includes J and O.
+    21: "-",
+}
+
+restypes_with_x_and_gap: list[str] = restypes + ["X", "-"]
+MAP_HHBLITS_AATYPE_TO_OUR_AATYPE: tuple[int, ...] = tuple(
+    restypes_with_x_and_gap.index(ID_TO_HHBLITS_AA[i]) for i in range(len(restypes_with_x_and_gap))
+)
+
+
+def _make_standard_atom_mask() -> np.ndarray:
+    """Build the [num_res_types, num_atom_types] 0/1 mask of the atoms present in each residue type."""
+    # One row per known residue type plus a trailing all-zero row for the unknown type.
+    num_rows = restype_num + 1
+    atom_mask = np.zeros([num_rows, atom_type_num], dtype=np.int32)
+    for row, letter in enumerate(restypes):
+        three_letter_name = restype_1to3[letter]
+        for name in residue_atoms[three_letter_name]:
+            # Flag the atom-type column of every atom this residue contains.
+            atom_mask[row, atom_order[name]] = 1
+    return atom_mask
+
+
+STANDARD_ATOM_MASK = _make_standard_atom_mask()
+
+
+# A one hot representation for the first and second atoms defining the axis
+# of rotation for each chi-angle in each residue.
+def chi_angle_atom(atom_index: int) -> np.ndarray:
+    """One-hot encode, per residue type, the atom at position `atom_index` of each chi-angle atom group.
+
+    The result has one entry per residue in `restypes` plus a final all-zero entry for residue `X`.
+    """
+    identity = np.eye(atom_type_num)
+
+    padded_indices = {}
+    for resname, chi_groups in chi_angles_atoms.items():
+        picked = [atom_types.index(group[atom_index]) for group in chi_groups]
+        # Pad to 4 chi angles with -1. NOTE: -1 selects the last row of the identity matrix, not a zero row.
+        padded_indices[resname] = picked + [-1] * (4 - len(picked))
+
+    per_residue = [identity[padded_indices[restype_1to3[letter]]] for letter in restypes]
+    per_residue.append(np.zeros([4, atom_type_num]))  # Add zeros for residue `X`.
+
+    # Stack to [residue, chi, atom type] and swap the last two axes.
+    stacked = np.stack(per_residue, axis=0)
+
+    return np.transpose(stacked, [0, 2, 1])
+
+
+chi_atom_1_one_hot = chi_angle_atom(1)
+chi_atom_2_one_hot = chi_angle_atom(2)
+
+# An array like chi_angles_atoms but using atom_order indices rather than names, zero-padded to 4 chi angles.
+chi_angles_atom_indices_list: list[list[list[str]]] = [chi_angles_atoms[restype_1to3[r]] for r in restypes]
+chi_angles_atom_indices_ours: list = map_structure_with_atom_order(chi_angles_atom_indices_list)
+chi_angles_atom_indices = np.array(
+    [chi_atoms + ([[0, 0, 0, 0]] * (4 - len(chi_atoms))) for chi_atoms in chi_angles_atom_indices_ours]
+)
+
+# Mapping from (res_name, atom_name) pairs to the atom's chi group index
+# and atom index within that group.
+chi_groups_for_atom: dict[tuple[str, str], list[tuple[int, int]]] = collections.defaultdict(list)
+for res_name, chi_angle_atoms_for_res in chi_angles_atoms.items():
+    for chi_group_i, chi_group in enumerate(chi_angle_atoms_for_res):
+        for atom_i, atom in enumerate(chi_group):
+            chi_groups_for_atom[(res_name, atom)].append((chi_group_i, atom_i))
+chi_groups_for_atom = dict(chi_groups_for_atom)
+
+
+def _make_rigid_transformation_4x4(ex: np.ndarray, ey: np.ndarray, translation: np.ndarray) -> np.ndarray:
+    """Build a homogeneous 4x4 transform whose columns are an orthonormal frame followed by `translation`."""
+    # Unit vector along ex.
+    x_axis = ex / np.linalg.norm(ex)
+
+    # Gram-Schmidt step: strip the x component from ey, then rescale to unit length.
+    y_raw = ey - np.dot(ey, x_axis) * x_axis
+    y_axis = y_raw / np.linalg.norm(y_raw)
+
+    # The third axis is the cross product of the first two.
+    z_axis = np.cross(x_axis, y_axis)
+    upper_rows = np.stack([x_axis, y_axis, z_axis, translation]).transpose()
+    bottom_row = [[0.0, 0.0, 0.0, 1.0]]
+    return np.concatenate([upper_rows, bottom_row], axis=0)
+
+
+# create an array with (restype, atomtype) --> rigid_group_idx
+# and an array with (restype, atomtype, coord) for the atom positions
+# and compute affine transformation matrices (4,4) from one rigid group to the
+# previous group
+restype_atom37_to_rigid_group = np.zeros([21, 37], dtype=int)
+restype_atom37_mask = np.zeros([21, 37], dtype=np.float32)
+restype_atom37_rigid_group_positions = np.zeros([21, 37, 3], dtype=np.float32)
+restype_atom14_to_rigid_group = np.zeros([21, 14], dtype=int)
+restype_atom14_mask = np.zeros([21, 14], dtype=np.float32)
+restype_atom14_rigid_group_positions = np.zeros([21, 14, 3], dtype=np.float32)
+restype_rigid_group_default_frame = np.zeros([21, 8, 4, 4], dtype=np.float32)
+
+
+def _make_rigid_group_constants() -> None:
+    """Fill the restype_atom37_*/restype_atom14_* tables and restype_rigid_group_default_frame in place."""
+    for restype, restype_letter in enumerate(restypes):  # Only rows of residues in `restypes` are written.
+        resname = restype_1to3[restype_letter]
+        for atomname, group_idx, atom_position in rigid_group_atom_positions[resname]:
+            atomtype = atom_order[atomname]  # Column in the atom37 layout.
+            restype_atom37_to_rigid_group[restype, atomtype] = group_idx
+            restype_atom37_mask[restype, atomtype] = 1
+            restype_atom37_rigid_group_positions[restype, atomtype, :] = atom_position
+
+            atom14idx = restype_name_to_atom14_names[resname].index(atomname)  # Column in the atom14 layout.
+            restype_atom14_to_rigid_group[restype, atom14idx] = group_idx
+            restype_atom14_mask[restype, atom14idx] = 1
+            restype_atom14_rigid_group_positions[restype, atom14idx, :] = atom_position
+
+    for restype, restype_letter in enumerate(restypes):
+        resname = restype_1to3[restype_letter]
+        atom_positions: dict[str, np.ndarray] = {
+            name: np.array(pos) for name, _, pos in rigid_group_atom_positions[resname]
+        }
+
+        # backbone to backbone is the identity transform
+        restype_rigid_group_default_frame[restype, 0, :, :] = np.eye(4)
+
+        # pre-omega-frame to backbone (currently dummy identity matrix)
+        restype_rigid_group_default_frame[restype, 1, :, :] = np.eye(4)
+
+        # phi-frame to backbone
+        mat = _make_rigid_transformation_4x4(
+            ex=atom_positions["N"] - atom_positions["CA"],
+            ey=np.array([1.0, 0.0, 0.0]),
+            translation=atom_positions["N"],
+        )
+        restype_rigid_group_default_frame[restype, 2, :, :] = mat
+
+        # psi-frame to backbone
+        mat = _make_rigid_transformation_4x4(
+            ex=atom_positions["C"] - atom_positions["CA"],
+            ey=atom_positions["CA"] - atom_positions["N"],
+            translation=atom_positions["C"],
+        )
+        restype_rigid_group_default_frame[restype, 3, :, :] = mat
+
+        # chi1-frame to backbone
+        if chi_angles_mask[restype][0]:
+            base_atom_names = chi_angles_atoms[resname][0]
+            base_atom_positions = [atom_positions[name] for name in base_atom_names]
+            mat = _make_rigid_transformation_4x4(
+                ex=base_atom_positions[2] - base_atom_positions[1],
+                ey=base_atom_positions[0] - base_atom_positions[1],
+                translation=base_atom_positions[2],
+            )
+            restype_rigid_group_default_frame[restype, 4, :, :] = mat
+
+        # chi2-frame to chi1-frame
+        # chi3-frame to chi2-frame
+        # chi4-frame to chi3-frame
+        # luckily all rotation axes for the next frame start at (0,0,0) of the
+        # previous frame
+        for chi_idx in range(1, 4):
+            if chi_angles_mask[restype][chi_idx]:
+                axis_end_atom_name = chi_angles_atoms[resname][chi_idx][2]
+                axis_end_atom_position = atom_positions[axis_end_atom_name]
+                mat = _make_rigid_transformation_4x4(
+                    ex=axis_end_atom_position,
+                    ey=np.array([-1.0, 0.0, 0.0]),
+                    translation=axis_end_atom_position,
+                )
+                restype_rigid_group_default_frame[restype, 4 + chi_idx, :, :] = mat  # Frame slots 5, 6 and 7.
+
+
+_make_rigid_group_constants()
+
+
+def make_atom14_dists_bounds(
+    overlap_tolerance: float = 1.5,
+    bond_length_tolerance_factor: int = 15,
+) -> dict[str, np.ndarray]:
+    """Compute per-residue-type distance bounds between atom14 atoms for violation checks.
+
+    Returns a dict with "lower_bound", "upper_bound" and "stddev" arrays, each of shape (21, 14, 14).
+    """
+    shape = [21, 14, 14]
+    lower_bounds = np.zeros(shape, np.float32)
+    upper_bounds = np.zeros(shape, np.float32)
+    stddevs = np.zeros(shape, np.float32)
+    residue_bonds, residue_virtual_bonds, _ = load_stereo_chemical_props()
+    for restype, restype_letter in enumerate(restypes):
+        resname = restype_1to3[restype_letter]
+        atom_names = restype_name_to_atom14_names[resname]
+
+        # Non-bonded default: atoms may not come closer than the sum of their van der Waals radii minus the
+        # tolerance; the upper bound is set to a very large value.
+        for i, name_i in enumerate(atom_names):
+            if not name_i:
+                continue
+            radius_i = van_der_waals_radius[name_i[0]]
+            for j, name_j in enumerate(atom_names):
+                if i == j or not name_j:
+                    continue
+                clash_limit = radius_i + van_der_waals_radius[name_j[0]] - overlap_tolerance
+                for a, b in ((i, j), (j, i)):
+                    lower_bounds[restype, a, b] = clash_limit
+                    upper_bounds[restype, a, b] = 1e10
+
+        # Bonded and angle-derived ("virtual bond") pairs override the defaults with
+        # length +- bond_length_tolerance_factor * stddev.
+        for bond in residue_bonds[resname] + residue_virtual_bonds[resname]:
+            i = atom_names.index(bond.atom1_name)
+            j = atom_names.index(bond.atom2_name)
+            margin = bond_length_tolerance_factor * bond.stddev
+            for a, b in ((i, j), (j, i)):
+                lower_bounds[restype, a, b] = bond.length - margin
+                upper_bounds[restype, a, b] = bond.length + margin
+                stddevs[restype, a, b] = bond.stddev
+    return {
+        "lower_bound": lower_bounds,  # shape (21,14,14)
+        "upper_bound": upper_bounds,  # shape (21,14,14)
+        "stddev": stddevs,  # shape (21,14,14)
+    }
+
+
+restype_atom14_ambiguous_atoms = np.zeros((21, 14), dtype=np.float32)
+restype_atom14_ambiguous_atoms_swap_idx: np.ndarray = np.tile(np.arange(14, dtype=int), (21, 1))
+
+
+def _make_atom14_ambiguity_feats() -> None:
+    """Mark swappable atom pairs in the atom14 ambiguity mask and record each atom's swap partner index."""
+    for resname, swaps in residue_atom_renaming_swaps.items():
+        row = restype_order[restype_3to1[resname]]
+        atom14_names = restype_name_to_atom14_names[resname]
+        for first, second in swaps.items():
+            i, j = atom14_names.index(first), atom14_names.index(second)
+            restype_atom14_ambiguous_atoms[row, [i, j]] = 1
+            # Each atom of the pair points at the other one.
+            restype_atom14_ambiguous_atoms_swap_idx[row, [i, j]] = [j, i]
+
+
+_make_atom14_ambiguity_feats()
+
+
+def aatype_to_str_sequence(aatype: Sequence[int]) -> str:
+    return "".join(restypes_with_x[residue_id] for residue_id in aatype)
diff --git a/phivenv/Lib/site-packages/transformers/models/esm/openfold_utils/rigid_utils.py b/phivenv/Lib/site-packages/transformers/models/esm/openfold_utils/rigid_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..e2bc25cb6d53f0428be3f6c1c25e67d67edab739
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/esm/openfold_utils/rigid_utils.py
@@ -0,0 +1,1243 @@
+# Copyright 2021 AlQuraishi Laboratory
+# Copyright 2021 DeepMind Technologies Limited
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+from collections.abc import Sequence
+from functools import cache
+from typing import Any, Callable
+
+import numpy as np
+import torch
+
+
+def rot_matmul(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
+    """
+    Multiplies two tensors of rotation matrices. The products are spelled out elementwise to avoid AMP downcasting.
+
+    Args:
+        a: [*, 3, 3] left multiplicand
+        b: [*, 3, 3] right multiplicand
+    Returns:
+        The product ab
+    """
+
+    def entry(i: int, j: int) -> torch.Tensor:
+        # (ab)[i, j] is row i of a dotted with column j of b,
+        # accumulated left to right.
+        return (
+            a[..., i, 0] * b[..., 0, j]
+            + a[..., i, 1] * b[..., 1, j]
+            + a[..., i, 2] * b[..., 2, j]
+        )
+
+    def row(i: int) -> torch.Tensor:
+        # The three entries of row i are laid out along the last axis.
+        return torch.stack([entry(i, 0), entry(i, 1), entry(i, 2)], dim=-1)
+
+    # The three rows are laid out along the second-to-last axis.
+    return torch.stack(
+        [row(0), row(1), row(2)],
+        dim=-2,
+    )
+
+
+def rot_vec_mul(r: torch.Tensor, t: torch.Tensor) -> torch.Tensor:
+    """
+    Rotates vectors by rotation matrices. The products are spelled out elementwise to avoid AMP downcasting.
+
+    Args:
+        r: [*, 3, 3] rotation matrices
+        t: [*, 3] coordinate tensors
+    Returns:
+        [*, 3] rotated coordinates
+    """
+    # Split the vectors into their three coordinate tensors.
+    x, y, z = torch.unbind(t, dim=-1)
+
+    def rotated(i: int) -> torch.Tensor:
+        # Component i is row i of r dotted with (x, y, z), accumulated left to right.
+        return r[..., i, 0] * x + r[..., i, 1] * y + r[..., i, 2] * z
+
+    components = [rotated(0), rotated(1), rotated(2)]
+    return torch.stack(components, dim=-1)
+
+
+@cache
+def identity_rot_mats(
+    batch_dims: tuple[int, ...],
+    dtype: torch.dtype | None = None,
+    device: torch.device | None = None,
+    requires_grad: bool = True,
+) -> torch.Tensor:
+    """Return a contiguous [*batch_dims, 3, 3] tensor of identity matrices (memoized per argument set)."""
+    eye = torch.eye(3, dtype=dtype, device=device, requires_grad=requires_grad)
+    # Prepend one singleton axis per batch dimension so the matrix can be expanded across the batch.
+    leading_ones = (1,) * len(batch_dims)
+    broadcast = eye.view(*leading_ones, 3, 3).expand(*batch_dims, -1, -1)
+    return broadcast.contiguous()
+
+
+@cache
+def identity_trans(
+    batch_dims: tuple[int, ...],
+    dtype: torch.dtype | None = None,
+    device: torch.device | None = None,
+    requires_grad: bool = True,
+) -> torch.Tensor:
+    """Return an all-zero [*batch_dims, 3] translation tensor (memoized per argument set)."""
+    return torch.zeros((*batch_dims, 3), dtype=dtype, device=device, requires_grad=requires_grad)
+
+
+@cache
+def identity_quats(
+    batch_dims: tuple[int, ...],
+    dtype: torch.dtype | None = None,
+    device: torch.device | None = None,
+    requires_grad: bool = True,
+) -> torch.Tensor:
+    """Return a [*batch_dims, 4] tensor of identity quaternions (1, 0, 0, 0), memoized per argument set."""
+    identity = torch.zeros((*batch_dims, 4), dtype=dtype, device=device, requires_grad=requires_grad)
+    # Set the first component to 1 inside no_grad so the in-place write is not tracked by autograd.
+    with torch.no_grad():
+        identity[..., 0] = 1
+    return identity
+
+
+_quat_elements: list[str] = ["a", "b", "c", "d"]
+_qtr_keys: list[str] = [l1 + l2 for l1 in _quat_elements for l2 in _quat_elements]
+_qtr_ind_dict: dict[str, int] = {key: ind for ind, key in enumerate(_qtr_keys)}
+
+
+def _to_mat(pairs: list[tuple[str, int]]) -> np.ndarray:
+    """Scatter (two-letter key, coefficient) pairs into a 4x4 matrix addressed via `_qtr_ind_dict`."""
+    coeffs = np.zeros((4, 4))
+    for key, coefficient in pairs:
+        row, col = divmod(_qtr_ind_dict[key], 4)
+        coeffs[row, col] = coefficient
+    return coeffs
+
+
+_QTR_MAT = np.zeros((4, 4, 3, 3))
+_QTR_MAT[..., 0, 0] = _to_mat([("aa", 1), ("bb", 1), ("cc", -1), ("dd", -1)])
+_QTR_MAT[..., 0, 1] = _to_mat([("bc", 2), ("ad", -2)])
+_QTR_MAT[..., 0, 2] = _to_mat([("bd", 2), ("ac", 2)])
+_QTR_MAT[..., 1, 0] = _to_mat([("bc", 2), ("ad", 2)])
+_QTR_MAT[..., 1, 1] = _to_mat([("aa", 1), ("bb", -1), ("cc", 1), ("dd", -1)])
+_QTR_MAT[..., 1, 2] = _to_mat([("cd", 2), ("ab", -2)])
+_QTR_MAT[..., 2, 0] = _to_mat([("bd", 2), ("ac", -2)])
+_QTR_MAT[..., 2, 1] = _to_mat([("cd", 2), ("ab", 2)])
+_QTR_MAT[..., 2, 2] = _to_mat([("aa", 1), ("bb", -1), ("cc", -1), ("dd", 1)])
+
+
+def quat_to_rot(quat: torch.Tensor) -> torch.Tensor:
+    """
+    Builds rotation matrices from quaternions.
+
+    Args:
+        quat: [*, 4] quaternions
+    Returns:
+        [*, 3, 3] rotation matrices
+    """
+    # Outer product of each quaternion with itself: [*, 4, 4]
+    outer = quat[..., None] * quat[..., None, :]
+
+    # Constant coefficient table: [4, 4, 3, 3]
+    coeffs = _get_quat("_QTR_MAT", dtype=outer.dtype, device=outer.device)
+
+    # Prepend singleton batch axes so the table broadcasts: [*, 4, 4, 3, 3]
+    batch_rank = outer.dim() - 2
+    weighted = outer[..., None, None] * coeffs.view((1,) * batch_rank + coeffs.shape)
+
+    # Sum out both quaternion axes: [*, 3, 3]
+    return torch.sum(weighted, dim=(-3, -4))
+
+
+def rot_to_quat(rot: torch.Tensor) -> torch.Tensor:
+    """
+    Converts rotation matrices to quaternions.
+
+    Each quaternion is the last eigenvector returned by `torch.linalg.eigh` for a symmetric 4x4 matrix
+    assembled from the rotation entries (eigh orders eigenvalues ascending, so this is the largest one).
+
+    Args:
+        rot: [*, 3, 3] rotation matrices
+    Returns:
+        [*, 4] quaternions
+    Raises:
+        ValueError: If the last two dimensions of `rot` are not (3, 3)
+    """
+    if rot.shape[-2:] != (3, 3):
+        raise ValueError("Input rotation is incorrectly shaped")
+
+    xx, xy, xz = (rot[..., 0, j] for j in range(3))
+    yx, yy, yz = (rot[..., 1, j] for j in range(3))
+    zx, zy, zz = (rot[..., 2, j] for j in range(3))
+
+    # Off-diagonal entries, shared between the upper and lower triangle.
+    k01, k02, k03 = zy - yz, xz - zx, yx - xy
+    k12, k13, k23 = xy + yx, xz + zx, yz + zy
+
+    rows = [
+        torch.stack([xx + yy + zz, k01, k02, k03], dim=-1),
+        torch.stack([k01, xx - yy - zz, k12, k13], dim=-1),
+        torch.stack([k02, k12, yy - xx - zz, k23], dim=-1),
+        torch.stack([k03, k13, k23, zz - xx - yy], dim=-1),
+    ]
+    k = (1.0 / 3.0) * torch.stack(rows, dim=-2)
+
+    _, vectors = torch.linalg.eigh(k)
+    return vectors[..., -1]
+
+
+_QUAT_MULTIPLY = np.zeros((4, 4, 4))
+_QUAT_MULTIPLY[:, :, 0] = [[1, 0, 0, 0], [0, -1, 0, 0], [0, 0, -1, 0], [0, 0, 0, -1]]
+
+_QUAT_MULTIPLY[:, :, 1] = [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, -1, 0]]
+
+_QUAT_MULTIPLY[:, :, 2] = [[0, 0, 1, 0], [0, 0, 0, -1], [1, 0, 0, 0], [0, 1, 0, 0]]
+
+_QUAT_MULTIPLY[:, :, 3] = [[0, 0, 0, 1], [0, 0, 1, 0], [0, -1, 0, 0], [1, 0, 0, 0]]
+
+_QUAT_MULTIPLY_BY_VEC = _QUAT_MULTIPLY[:, 1:, :]
+
+_CACHED_QUATS: dict[str, np.ndarray] = {
+    "_QTR_MAT": _QTR_MAT,
+    "_QUAT_MULTIPLY": _QUAT_MULTIPLY,
+    "_QUAT_MULTIPLY_BY_VEC": _QUAT_MULTIPLY_BY_VEC,
+}
+
+
+@cache  # Memoized: one tensor is built per distinct (quat_key, dtype, device) combination.
+def _get_quat(quat_key: str, dtype: torch.dtype, device: torch.device) -> torch.Tensor:
+    return torch.tensor(_CACHED_QUATS[quat_key], dtype=dtype, device=device)  # Looks up the numpy table by key.
+
+
+def quat_multiply(quat1: torch.Tensor, quat2: torch.Tensor) -> torch.Tensor:
+    """Multiply quaternion `quat1` by quaternion `quat2` using the `_QUAT_MULTIPLY` coefficient table."""
+    table = _get_quat("_QUAT_MULTIPLY", dtype=quat1.dtype, device=quat1.device)
+    table = table.view((1,) * (quat1.dim() - 1) + table.shape)
+    return torch.sum(table * quat1[..., :, None, None] * quat2[..., None, :, None], dim=(-3, -2))
+
+
+def quat_multiply_by_vec(quat: torch.Tensor, vec: torch.Tensor) -> torch.Tensor:
+    """Multiply quaternion `quat` by the pure-vector quaternion `vec` using `_QUAT_MULTIPLY_BY_VEC`."""
+    table = _get_quat("_QUAT_MULTIPLY_BY_VEC", dtype=quat.dtype, device=quat.device)
+    table = table.view((1,) * (quat.dim() - 1) + table.shape)
+    return torch.sum(table * quat[..., :, None, None] * vec[..., None, :, None], dim=(-3, -2))
+
+
+def invert_rot_mat(rot_mat: torch.Tensor) -> torch.Tensor:
+    return torch.transpose(rot_mat, -2, -1)  # Swaps the last two axes, i.e. transposes each matrix.
+
+
+def invert_quat(quat: torch.Tensor) -> torch.Tensor:
+    """Return the inverse quaternion: components 1..3 negated, then divided by the squared norm."""
+    conjugate = quat.clone()
+    conjugate[..., 1:] *= -1
+    return conjugate / torch.sum(quat**2, dim=-1, keepdim=True)
+
+
+class Rotation:
+    """
+    A 3D rotation. Depending on how the object is initialized, the rotation is represented by either a rotation matrix
+    or a quaternion, though both formats are made available by helper functions. To simplify gradient computation, the
+    underlying format of the rotation cannot be changed in-place. Like Rigid, the class is designed to mimic the
+    behavior of a torch Tensor, almost as if each Rotation object were a tensor of rotations, in one format or another.
+    """
+
+    def __init__(
+        self,
+        rot_mats: torch.Tensor | None = None,
+        quats: torch.Tensor | None = None,
+        normalize_quats: bool = True,
+    ):
+        """
+        Args:
+            rot_mats:
+                A [*, 3, 3] rotation matrix tensor. Mutually exclusive with quats
+            quats:
+                A [*, 4] quaternion. Mutually exclusive with rot_mats. If normalize_quats is not True, must be a unit
+                quaternion
+            normalize_quats:
+                If quats is specified, whether to normalize quats
+        """
+        if (rot_mats is None and quats is None) or (rot_mats is not None and quats is not None):
+            raise ValueError("Exactly one input argument must be specified")
+
+        if (rot_mats is not None and rot_mats.shape[-2:] != (3, 3)) or (quats is not None and quats.shape[-1] != 4):
+            raise ValueError("Incorrectly shaped rotation matrix or quaternion")
+
+        # Force full-precision
+        if quats is not None:
+            quats = quats.to(dtype=torch.float32)
+        if rot_mats is not None:
+            rot_mats = rot_mats.to(dtype=torch.float32)
+
+        if quats is not None and normalize_quats:
+            quats = quats / torch.linalg.norm(quats, dim=-1, keepdim=True)
+
+        self._rot_mats = rot_mats
+        self._quats = quats
+
+    @staticmethod
+    def identity(
+        shape,
+        dtype: torch.dtype | None = None,
+        device: torch.device | None = None,
+        requires_grad: bool = True,
+        fmt: str = "quat",
+    ) -> Rotation:
+        """
+        Returns an identity Rotation.
+
+        Args:
+            shape:
+                The "shape" of the resulting Rotation object. See documentation for the shape property
+            dtype:
+                The torch dtype for the rotation
+            device:
+                The torch device for the new rotation
+            requires_grad:
+                Whether the underlying tensors in the new rotation object should require gradient computation
+            fmt:
+                One of "quat" or "rot_mat". Determines the underlying format of the new object's rotation
+        Returns:
+            A new identity rotation
+        Raises:
+            ValueError:
+                If fmt is neither "quat" nor "rot_mat"
+        """
+        if fmt == "rot_mat":
+            rot_mats = identity_rot_mats(shape, dtype, device, requires_grad)
+            return Rotation(rot_mats=rot_mats, quats=None)
+        elif fmt == "quat":
+            quats = identity_quats(shape, dtype, device, requires_grad)
+            # identity_quats is expected to yield unit quaternions already, so normalization is skipped
+            return Rotation(rot_mats=None, quats=quats, normalize_quats=False)
+        else:
+            # The placeholder was previously written "f{fmt}", which leaked a literal "f" into the message
+            raise ValueError(f"Invalid format: {fmt}")
+
+    # Magic methods
+
+    def __getitem__(self, index: Any) -> Rotation:
+        """
+        Allows torch-style indexing over the virtual shape of the rotation object. See documentation for the shape
+        property.
+
+        Args:
+            index:
+                A torch index. E.g. (1, 3, 2), or (slice(None,))
+        Returns:
+            The indexed rotation
+        """
+        if type(index) is not tuple:
+            index = (index,)
+
+        if self._rot_mats is not None:
+            rot_mats = self._rot_mats[index + (slice(None), slice(None))]
+            return Rotation(rot_mats=rot_mats)
+        elif self._quats is not None:
+            quats = self._quats[index + (slice(None),)]
+            return Rotation(quats=quats, normalize_quats=False)
+        else:
+            raise ValueError("Both rotations are None")
+
+    def __mul__(self, right: torch.Tensor) -> Rotation:
+        """
+        Pointwise left multiplication of the rotation with a tensor. Can be used to e.g. mask the Rotation.
+
+        Args:
+            right:
+                The tensor multiplicand
+        Returns:
+            The product
+        """
+        if not (isinstance(right, torch.Tensor)):
+            raise TypeError("The other multiplicand must be a Tensor")
+
+        if self._rot_mats is not None:
+            rot_mats = self._rot_mats * right[..., None, None]
+            return Rotation(rot_mats=rot_mats, quats=None)
+        elif self._quats is not None:
+            quats = self._quats * right[..., None]
+            return Rotation(rot_mats=None, quats=quats, normalize_quats=False)
+        else:
+            raise ValueError("Both rotations are None")
+
+    def __rmul__(self, left: torch.Tensor) -> Rotation:
+        """
+        Reverse pointwise multiplication of the rotation with a tensor.
+
+        Args:
+            left:
+                The left multiplicand
+        Returns:
+            The product
+        """
+        return self.__mul__(left)
+
+    # Properties
+
+    @property
+    def shape(self) -> torch.Size:
+        """
+        Returns the virtual shape of the rotation object. This shape is defined as the batch dimensions of the
+        underlying rotation matrix or quaternion. If the Rotation was initialized with a [10, 3, 3] rotation matrix
+        tensor, for example, the resulting shape would be [10].
+
+        Returns:
+            The virtual shape of the rotation object
+        """
+        if self._rot_mats is not None:
+            return self._rot_mats.shape[:-2]
+        elif self._quats is not None:
+            return self._quats.shape[:-1]
+        else:
+            raise ValueError("Both rotations are None")
+
+    @property
+    def dtype(self) -> torch.dtype:
+        """
+        Returns the dtype of the underlying rotation.
+
+        Returns:
+            The dtype of the underlying rotation
+        """
+        if self._rot_mats is not None:
+            return self._rot_mats.dtype
+        elif self._quats is not None:
+            return self._quats.dtype
+        else:
+            raise ValueError("Both rotations are None")
+
+    @property
+    def device(self) -> torch.device:
+        """
+        The device of the underlying rotation
+
+        Returns:
+            The device of the underlying rotation
+        """
+        if self._rot_mats is not None:
+            return self._rot_mats.device
+        elif self._quats is not None:
+            return self._quats.device
+        else:
+            raise ValueError("Both rotations are None")
+
+    @property
+    def requires_grad(self) -> bool:
+        """
+        Returns the requires_grad property of the underlying rotation
+
+        Returns:
+            The requires_grad property of the underlying tensor
+        """
+        if self._rot_mats is not None:
+            return self._rot_mats.requires_grad
+        elif self._quats is not None:
+            return self._quats.requires_grad
+        else:
+            raise ValueError("Both rotations are None")
+
+    def get_rot_mats(self) -> torch.Tensor:
+        """
+        Returns the underlying rotation as a rotation matrix tensor.
+
+        Returns:
+            The rotation as a rotation matrix tensor
+        """
+        if self._rot_mats is not None:
+            return self._rot_mats
+        elif self._quats is not None:
+            return quat_to_rot(self._quats)
+        else:
+            raise ValueError("Both rotations are None")
+
+    def get_quats(self) -> torch.Tensor:
+        """
+        Returns the underlying rotation as a quaternion tensor.
+
+        Depending on whether the Rotation was initialized with a quaternion, this function may call torch.linalg.eigh.
+
+        Returns:
+            The rotation as a quaternion tensor.
+        """
+        if self._rot_mats is not None:
+            return rot_to_quat(self._rot_mats)
+        elif self._quats is not None:
+            return self._quats
+        else:
+            raise ValueError("Both rotations are None")
+
+    def get_cur_rot(self) -> torch.Tensor:
+        """
+        Return the underlying rotation in its current form
+
+        Returns:
+            The stored rotation
+        """
+        if self._rot_mats is not None:
+            return self._rot_mats
+        elif self._quats is not None:
+            return self._quats
+        else:
+            raise ValueError("Both rotations are None")
+
+    # Rotation functions
+
+    def compose_q_update_vec(self, q_update_vec: torch.Tensor, normalize_quats: bool = True) -> Rotation:
+        """
+        Returns a new quaternion Rotation after updating the current object's underlying rotation with a quaternion
+        update, formatted as a [*, 3] tensor whose final three columns represent x, y, z such that (1, x, y, z) is the
+        desired (not necessarily unit) quaternion update.
+
+        Args:
+            q_update_vec:
+                A [*, 3] quaternion update tensor
+            normalize_quats:
+                Whether to normalize the output quaternion
+        Returns:
+            An updated Rotation
+        """
+        quats = self.get_quats()
+        new_quats = quats + quat_multiply_by_vec(quats, q_update_vec)
+        return Rotation(
+            rot_mats=None,
+            quats=new_quats,
+            normalize_quats=normalize_quats,
+        )
+
+    def compose_r(self, r: Rotation) -> Rotation:
+        """
+        Compose the rotation matrices of the current Rotation object with those of another.
+
+        Args:
+            r:
+                An update rotation object
+        Returns:
+            An updated rotation object
+        """
+        r1 = self.get_rot_mats()
+        r2 = r.get_rot_mats()
+        new_rot_mats = rot_matmul(r1, r2)
+        return Rotation(rot_mats=new_rot_mats, quats=None)
+
+    def compose_q(self, r: Rotation, normalize_quats: bool = True) -> Rotation:
+        """
+        Compose the quaternions of the current Rotation object with those of another.
+
+        Depending on whether either Rotation was initialized with quaternions, this function may call
+        torch.linalg.eigh.
+
+        Args:
+            r: An update rotation object
+            normalize_quats: Whether to normalize the output quaternion
+        Returns:
+            An updated rotation object
+        """
+        q1 = self.get_quats()
+        q2 = r.get_quats()
+        new_quats = quat_multiply(q1, q2)
+        return Rotation(rot_mats=None, quats=new_quats, normalize_quats=normalize_quats)
+
+    def apply(self, pts: torch.Tensor) -> torch.Tensor:
+        """
+        Apply the current Rotation as a rotation matrix to a set of 3D coordinates.
+
+        Args:
+            pts:
+                A [*, 3] set of points
+        Returns:
+            [*, 3] rotated points
+        """
+        rot_mats = self.get_rot_mats()
+        return rot_vec_mul(rot_mats, pts)
+
+    def invert_apply(self, pts: torch.Tensor) -> torch.Tensor:
+        """
+        The inverse of the apply() method.
+
+        Args:
+            pts:
+                A [*, 3] set of points
+        Returns:
+            [*, 3] inverse-rotated points
+        """
+        rot_mats = self.get_rot_mats()
+        inv_rot_mats = invert_rot_mat(rot_mats)
+        return rot_vec_mul(inv_rot_mats, pts)
+
+    def invert(self) -> Rotation:
+        """
+        Returns the inverse of the current Rotation.
+
+        Returns:
+            The inverse of the current Rotation
+        """
+        if self._rot_mats is not None:
+            return Rotation(rot_mats=invert_rot_mat(self._rot_mats), quats=None)
+        elif self._quats is not None:
+            return Rotation(
+                rot_mats=None,
+                quats=invert_quat(self._quats),
+                normalize_quats=False,
+            )
+        else:
+            raise ValueError("Both rotations are None")
+
+    # "Tensor" stuff
+
+    def unsqueeze(self, dim: int) -> Rotation:
+        """
+        Analogous to torch.unsqueeze. The dimension is relative to the shape of the Rotation object.
+
+        Args:
+            dim: A positive or negative dimension index.
+        Returns:
+            The unsqueezed Rotation.
+        """
+        if dim >= len(self.shape):
+            raise ValueError("Invalid dimension")
+
+        if self._rot_mats is not None:
+            rot_mats = self._rot_mats.unsqueeze(dim if dim >= 0 else dim - 2)
+            return Rotation(rot_mats=rot_mats, quats=None)
+        elif self._quats is not None:
+            quats = self._quats.unsqueeze(dim if dim >= 0 else dim - 1)
+            return Rotation(rot_mats=None, quats=quats, normalize_quats=False)
+        else:
+            raise ValueError("Both rotations are None")
+
+    @staticmethod
+    def cat(rs: Sequence[Rotation], dim: int) -> Rotation:
+        """
+        Concatenates rotations along one of the batch dimensions. Analogous to torch.cat().
+
+        Note that the output of this operation is always a rotation matrix, regardless of the format of input
+        rotations.
+
+        Args:
+            rs:
+                A list of rotation objects
+            dim:
+                The dimension along which the rotations should be concatenated
+        Returns:
+            A concatenated Rotation object in rotation matrix format
+        """
+        rot_mats = torch.cat(
+            [r.get_rot_mats() for r in rs],
+            dim=dim if dim >= 0 else dim - 2,
+        )
+
+        return Rotation(rot_mats=rot_mats, quats=None)
+
+    def map_tensor_fn(self, fn: Callable[[torch.Tensor], torch.Tensor]) -> Rotation:
+        """
+        Apply a Tensor -> Tensor function to underlying rotation tensors, mapping over the rotation dimension(s). Can
+        be used e.g. to sum out a one-hot batch dimension.
+
+        Args:
+            fn:
+                A Tensor -> Tensor function to be mapped over the Rotation
+        Returns:
+            The transformed Rotation object
+        """
+        if self._rot_mats is not None:
+            rot_mats = self._rot_mats.view(self._rot_mats.shape[:-2] + (9,))
+            rot_mats = torch.stack(list(map(fn, torch.unbind(rot_mats, dim=-1))), dim=-1)
+            rot_mats = rot_mats.view(rot_mats.shape[:-1] + (3, 3))
+            return Rotation(rot_mats=rot_mats, quats=None)
+        elif self._quats is not None:
+            quats = torch.stack(list(map(fn, torch.unbind(self._quats, dim=-1))), dim=-1)
+            return Rotation(rot_mats=None, quats=quats, normalize_quats=False)
+        else:
+            raise ValueError("Both rotations are None")
+
+    def cuda(self) -> Rotation:
+        """
+        Analogous to the cuda() method of torch Tensors
+
+        Returns:
+            A copy of the Rotation in CUDA memory
+        """
+        if self._rot_mats is not None:
+            return Rotation(rot_mats=self._rot_mats.cuda(), quats=None)
+        elif self._quats is not None:
+            return Rotation(rot_mats=None, quats=self._quats.cuda(), normalize_quats=False)
+        else:
+            raise ValueError("Both rotations are None")
+
+    def to(self, device: torch.device | None, dtype: torch.dtype | None) -> Rotation:
+        """
+        Analogous to the to() method of torch Tensors
+
+        Args:
+            device:
+                A torch device
+            dtype:
+                A torch dtype. Note that the Rotation constructor re-casts the tensor to float32 afterwards
+        Returns:
+            A copy of the Rotation on the new device
+        """
+        if self._rot_mats is not None:
+            return Rotation(
+                rot_mats=self._rot_mats.to(device=device, dtype=dtype),
+                quats=None,
+            )
+        elif self._quats is not None:
+            return Rotation(
+                rot_mats=None,
+                quats=self._quats.to(device=device, dtype=dtype),
+                normalize_quats=False,
+            )
+        else:
+            raise ValueError("Both rotations are None")
+
+    def detach(self) -> Rotation:
+        """
+        Returns a copy of the Rotation whose underlying Tensor has been detached from its torch graph.
+
+        Returns:
+            A copy of the Rotation whose underlying Tensor has been detached from its torch graph
+        """
+        if self._rot_mats is not None:
+            return Rotation(rot_mats=self._rot_mats.detach(), quats=None)
+        elif self._quats is not None:
+            return Rotation(
+                rot_mats=None,
+                quats=self._quats.detach(),
+                normalize_quats=False,
+            )
+        else:
+            raise ValueError("Both rotations are None")
+
+
+class Rigid:
+    """
+    A class representing a rigid transformation. Little more than a wrapper around two objects: a Rotation object and a
+    [*, 3] translation Designed to behave approximately like a single torch tensor with the shape of the shared batch
+    dimensions of its component parts.
+    """
+
+    def __init__(self, rots: Rotation | None, trans: torch.Tensor | None):
+        """
+        Args:
+            rots: A Rotation object with batch shape [*]. If None, an identity Rotation matching trans is used
+            trans: A corresponding [*, 3] translation tensor. If None, identity_trans() matching rots is used
+        """
+        # (we need device, dtype, etc. from at least one input; ValueError is raised when both are None)
+
+        batch_dims, dtype, device, requires_grad = None, None, None, None
+        if trans is not None:
+            batch_dims = trans.shape[:-1]
+            dtype = trans.dtype
+            device = trans.device
+            requires_grad = trans.requires_grad
+        elif rots is not None:
+            batch_dims = rots.shape
+            dtype = rots.dtype
+            device = rots.device
+            requires_grad = rots.requires_grad
+        else:
+            raise ValueError("At least one input argument must be specified")
+
+        if rots is None:
+            rots = Rotation.identity(
+                batch_dims,
+                dtype,
+                device,
+                requires_grad,
+            )
+        elif trans is None:
+            trans = identity_trans(
+                batch_dims,
+                dtype,
+                device,
+                requires_grad,
+            )
+
+        assert rots is not None
+        assert trans is not None
+
+        if (rots.shape != trans.shape[:-1]) or (rots.device != trans.device):
+            raise ValueError("Rots and trans incompatible")
+
+        # Force full precision. Happens to the rotations automatically.
+        trans = trans.to(dtype=torch.float32)
+
+        self._rots = rots
+        self._trans = trans
+
+    @staticmethod
+    def identity(
+        shape: tuple[int, ...],
+        dtype: torch.dtype | None = None,
+        device: torch.device | None = None,
+        requires_grad: bool = True,
+        fmt: str = "quat",
+    ) -> Rigid:
+        """
+        Constructs an identity transformation.
+
+        The rotation comes from Rotation.identity and the translation from identity_trans, both built with the same
+        shape, dtype, device and requires_grad arguments.
+
+        Args:
+            shape: The desired shape
+            dtype: The dtype of both internal tensors
+            device: The device of both internal tensors
+            requires_grad: Whether grad should be enabled for the internal tensors
+            fmt: One of "quat" or "rot_mat". Forwarded to Rotation.identity to pick the rotation format
+        Returns:
+            The identity transformation
+        """
+        return Rigid(
+            Rotation.identity(shape, dtype, device, requires_grad, fmt=fmt),
+            identity_trans(shape, dtype, device, requires_grad),
+        )
+
+    def __getitem__(self, index: Any) -> Rigid:
+        """
+        Indexes the shared batch dimensions of the rotation and the translation with PyTorch-style indices.
+
+        E.g.::
+
+            t = Rigid(Rotation(rot_mats=torch.rand(10, 10, 3, 3), quats=None), torch.rand(10, 10, 3))
+            indexed = t[3, 4:6]
+            assert indexed.shape == (2,)
+            assert indexed.get_rots().shape == (2,)
+            assert indexed.get_trans().shape == (2, 3)
+
+        Args:
+            index: A standard torch tensor index. E.g. 8, (10, None, 3), or (3, slice(0, 1, None))
+        Returns:
+            The indexed Rigid
+        """
+        if type(index) is not tuple:
+            index = (index,)
+
+        return Rigid(
+            self._rots[index],
+            self._trans[index + (slice(None),)],
+        )
+
+    def __mul__(self, right: torch.Tensor) -> Rigid:
+        """
+        Pointwise left multiplication of the transformation with a tensor. Can be used to e.g. mask the Rigid.
+
+        Args:
+            right:
+                The tensor multiplicand
+        Returns:
+            The product
+        """
+        if not (isinstance(right, torch.Tensor)):
+            raise TypeError("The other multiplicand must be a Tensor")
+
+        new_rots = self._rots * right
+        new_trans = self._trans * right[..., None]
+
+        return Rigid(new_rots, new_trans)
+
+    def __rmul__(self, left: torch.Tensor) -> Rigid:
+        """
+        Reverse pointwise multiplication of the transformation with a tensor.
+
+        Args:
+            left:
+                The left multiplicand
+        Returns:
+            The product
+        """
+        return self.__mul__(left)
+
+    @property
+    def shape(self) -> torch.Size:
+        """
+        Returns the shape of the shared dimensions of the rotation and the translation.
+
+        Returns:
+            The shape of the transformation
+        """
+        return self._trans.shape[:-1]
+
+    @property
+    def device(self) -> torch.device:
+        """
+        Returns the device on which the Rigid's tensors are located.
+
+        Returns:
+            The device on which the Rigid's tensors are located
+        """
+        return self._trans.device
+
+    def get_rots(self) -> Rotation:
+        """
+        Getter for the rotation.
+
+        Returns:
+            The rotation object
+        """
+        return self._rots
+
+    def get_trans(self) -> torch.Tensor:
+        """
+        Getter for the translation.
+
+        Returns:
+            The stored translation
+        """
+        return self._trans
+
+    def compose_q_update_vec(self, q_update_vec: torch.Tensor) -> Rigid:
+        """
+        Composes the transformation with a quaternion update vector of shape [*, 6], where the final 6 columns
+        represent the x, y, and z values of a quaternion of form (1, x, y, z) followed by a 3D translation.
+
+        Args:
+            q_update_vec: The [*, 6] update vector (three quaternion components, then the translation update).
+        Returns:
+            The composed transformation.
+        """
+        q_vec, t_vec = q_update_vec[..., :3], q_update_vec[..., 3:]
+        new_rots = self._rots.compose_q_update_vec(q_vec)
+
+        trans_update = self._rots.apply(t_vec)  # rotated by the current (pre-update) rotation
+        new_translation = self._trans + trans_update
+
+        return Rigid(new_rots, new_translation)
+
+    def compose(self, r: Rigid) -> Rigid:
+        """
+        Composes the current rigid object with another.
+
+        Args:
+            r:
+                Another Rigid object
+        Returns:
+            The composition of the two transformations
+        """
+        new_rot = self._rots.compose_r(r._rots)
+        new_trans = self._rots.apply(r._trans) + self._trans
+        return Rigid(new_rot, new_trans)
+
+    def apply(self, pts: torch.Tensor) -> torch.Tensor:
+        """
+        Applies the transformation to a coordinate tensor.
+
+        Args:
+            pts: A [*, 3] coordinate tensor.
+        Returns:
+            The transformed points.
+        """
+        rotated = self._rots.apply(pts)
+        return rotated + self._trans
+
+    def invert_apply(self, pts: torch.Tensor) -> torch.Tensor:
+        """
+        Applies the inverse of the transformation to a coordinate tensor.
+
+        Args:
+            pts: A [*, 3] coordinate tensor
+        Returns:
+            The transformed points.
+        """
+        pts = pts - self._trans
+        return self._rots.invert_apply(pts)
+
+    def invert(self) -> Rigid:
+        """
+        Inverts the transformation.
+
+        Returns:
+            The inverse transformation.
+        """
+        rot_inv = self._rots.invert()
+        trn_inv = rot_inv.apply(self._trans)
+
+        return Rigid(rot_inv, -1 * trn_inv)
+
+    def map_tensor_fn(self, fn: Callable[[torch.Tensor], torch.Tensor]) -> Rigid:
+        """
+        Apply a Tensor -> Tensor function to underlying translation and rotation tensors, mapping over the
+        translation/rotation dimensions respectively.
+
+        Args:
+            fn:
+                A Tensor -> Tensor function to be mapped over the Rigid
+        Returns:
+            The transformed Rigid object
+        """
+        new_rots = self._rots.map_tensor_fn(fn)
+        new_trans = torch.stack(list(map(fn, torch.unbind(self._trans, dim=-1))), dim=-1)
+
+        return Rigid(new_rots, new_trans)
+
+    def to_tensor_4x4(self) -> torch.Tensor:
+        """
+        Converts a transformation to a homogeneous transformation tensor.
+
+        Returns:
+            A [*, 4, 4] homogeneous transformation tensor
+        """
+        tensor = self._trans.new_zeros((*self.shape, 4, 4))
+        tensor[..., :3, :3] = self._rots.get_rot_mats()
+        tensor[..., :3, 3] = self._trans
+        tensor[..., 3, 3] = 1
+        return tensor
+
+    @staticmethod
+    def from_tensor_4x4(t: torch.Tensor) -> Rigid:
+        """
+        Constructs a transformation from a homogeneous transformation tensor.
+
+        Args:
+            t: [*, 4, 4] homogeneous transformation tensor
+        Returns:
+            A Rigid object with shape [*]; raises ValueError if the last two dims of t are not (4, 4)
+        """
+        if t.shape[-2:] != (4, 4):
+            raise ValueError("Incorrectly shaped input tensor")
+
+        rots = Rotation(rot_mats=t[..., :3, :3], quats=None)
+        trans = t[..., :3, 3]
+
+        return Rigid(rots, trans)
+
+    def to_tensor_7(self) -> torch.Tensor:
+        """
+        Converts a transformation to a tensor with 7 final columns, four for the quaternion followed by three for the
+        translation.
+
+        Returns:
+            A [*, 7] tensor representation of the transformation
+        """
+        tensor = self._trans.new_zeros((*self.shape, 7))
+        tensor[..., :4] = self._rots.get_quats()
+        tensor[..., 4:] = self._trans
+
+        return tensor
+
+    @staticmethod
+    def from_tensor_7(t: torch.Tensor, normalize_quats: bool = False) -> Rigid:
+        """Builds a Rigid from a [*, 7] tensor: a quaternion in the first four columns, then a 3D translation."""
+        if t.shape[-1] != 7:
+            raise ValueError("Incorrectly shaped input tensor")
+
+        quats, trans = t[..., :4], t[..., 4:]
+        rots = Rotation(rot_mats=None, quats=quats, normalize_quats=normalize_quats)
+
+        return Rigid(rots, trans)
+
+    @staticmethod
+    def from_3_points(
+        p_neg_x_axis: torch.Tensor, origin: torch.Tensor, p_xy_plane: torch.Tensor, eps: float = 1e-8
+    ) -> Rigid:
+        """
+        Implements algorithm 21. Constructs transformations from sets of 3 points using the Gram-Schmidt algorithm.
+
+        Args:
+            p_neg_x_axis: [*, 3] coordinates
+            origin: [*, 3] coordinates used as frame origins
+            p_xy_plane: [*, 3] coordinates
+            eps: Small epsilon value
+        Returns:
+            A transformation object of shape [*]
+        """
+        p_neg_x_axis_unbound = torch.unbind(p_neg_x_axis, dim=-1)
+        origin_unbound = torch.unbind(origin, dim=-1)
+        p_xy_plane_unbound = torch.unbind(p_xy_plane, dim=-1)
+
+        e0 = [c1 - c2 for c1, c2 in zip(origin_unbound, p_neg_x_axis_unbound)]
+        e1 = [c1 - c2 for c1, c2 in zip(p_xy_plane_unbound, origin_unbound)]
+
+        denom = torch.sqrt(sum(c * c for c in e0) + eps * torch.ones_like(e0[0]))
+        e0 = [c / denom for c in e0]
+        dot = sum((c1 * c2 for c1, c2 in zip(e0, e1)))
+        e1 = [c2 - c1 * dot for c1, c2 in zip(e0, e1)]
+        denom = torch.sqrt(sum(c * c for c in e1) + eps * torch.ones_like(e1[0]))
+        e1 = [c / denom for c in e1]
+        e2 = [
+            e0[1] * e1[2] - e0[2] * e1[1],
+            e0[2] * e1[0] - e0[0] * e1[2],
+            e0[0] * e1[1] - e0[1] * e1[0],
+        ]
+
+        rots = torch.stack([c for tup in zip(e0, e1, e2) for c in tup], dim=-1)
+        rots = rots.reshape(rots.shape[:-1] + (3, 3))
+
+        rot_obj = Rotation(rot_mats=rots, quats=None)
+
+        return Rigid(rot_obj, torch.stack(origin_unbound, dim=-1))
+
+    def unsqueeze(self, dim: int) -> Rigid:
+        """
+        Analogous to torch.unsqueeze. The dimension is relative to the shared dimensions of the rotation/translation.
+
+        Args:
+            dim: A positive or negative dimension index.
+        Returns:
+            The unsqueezed transformation.
+        """
+        if dim >= len(self.shape):
+            raise ValueError("Invalid dimension")
+        rots = self._rots.unsqueeze(dim)
+        trans = self._trans.unsqueeze(dim if dim >= 0 else dim - 1)
+
+        return Rigid(rots, trans)
+
+    @staticmethod
+    def cat(ts: Sequence[Rigid], dim: int) -> Rigid:
+        """
+        Concatenates transformations along one of their shared batch dimensions. Analogous to torch.cat().
+
+        Args:
+            ts:
+                A sequence of Rigid objects
+            dim:
+                The batch dimension along which the transformations should be concatenated
+        Returns:
+            A concatenated transformation object (Rotation.cat returns its rotation in rotation matrix format)
+        """
+        rots = Rotation.cat([t._rots for t in ts], dim)
+        trans = torch.cat([t._trans for t in ts], dim=dim if dim >= 0 else dim - 1)
+
+        return Rigid(rots, trans)
+
+    def apply_rot_fn(self, fn: Callable[[Rotation], Rotation]) -> Rigid:
+        """
+        Applies a Rotation -> Rotation function to the stored rotation object.
+
+        Args:
+            fn: A function of type Rotation -> Rotation
+        Returns:
+            A transformation object with a transformed rotation.
+        """
+        return Rigid(fn(self._rots), self._trans)
+
+    def apply_trans_fn(self, fn: Callable[[torch.Tensor], torch.Tensor]) -> Rigid:
+        """
+        Applies a Tensor -> Tensor function to the stored translation.
+
+        Args:
+            fn:
+                A function of type Tensor -> Tensor to be applied to the translation
+        Returns:
+            A transformation object with a transformed translation.
+        """
+        return Rigid(self._rots, fn(self._trans))
+
+    def scale_translation(self, trans_scale_factor: float) -> Rigid:
+        """
+        Scales the translation by a constant factor.
+
+        Args:
+            trans_scale_factor:
+                The constant factor
+        Returns:
+            A transformation object with a scaled translation.
+        """
+        return self.apply_trans_fn(lambda t: t * trans_scale_factor)
+
+    def stop_rot_gradient(self) -> Rigid:
+        """
+        Detaches the underlying rotation object
+
+        Returns:
+            A transformation object with detached rotations
+        """
+        return self.apply_rot_fn(lambda r: r.detach())
+
+    @staticmethod
+    def make_transform_from_reference(
+        n_xyz: torch.Tensor, ca_xyz: torch.Tensor, c_xyz: torch.Tensor, eps: float = 1e-20
+    ) -> Rigid:
+        """
+        Returns a transformation object from reference coordinates.
+
+        Note that this method does not take care of symmetries. If you provide the atom positions in the non-standard
+        way, the N atom will end up not at [-0.527250, 1.359329, 0.0] but instead at [-0.527250, -1.359329, 0.0]. You
+        need to take care of such cases in your code.
+
+        Args:
+            n_xyz: A [*, 3] tensor of nitrogen xyz coordinates.
+            ca_xyz: A [*, 3] tensor of carbon alpha xyz coordinates.
+            c_xyz: A [*, 3] tensor of carbon xyz coordinates.
+        Returns:
+            A transformation object. After applying the translation and rotation to the reference backbone, the
+            coordinates will approximately equal to the input coordinates.
+        """
+        translation = -1 * ca_xyz
+        n_xyz = n_xyz + translation
+        c_xyz = c_xyz + translation
+
+        c_x, c_y, c_z = [c_xyz[..., i] for i in range(3)]
+        norm = torch.sqrt(eps + c_x**2 + c_y**2)
+        sin_c1 = -c_y / norm
+        cos_c1 = c_x / norm
+
+        c1_rots = sin_c1.new_zeros((*sin_c1.shape, 3, 3))
+        c1_rots[..., 0, 0] = cos_c1
+        c1_rots[..., 0, 1] = -1 * sin_c1
+        c1_rots[..., 1, 0] = sin_c1
+        c1_rots[..., 1, 1] = cos_c1
+        c1_rots[..., 2, 2] = 1
+
+        norm = torch.sqrt(eps + c_x**2 + c_y**2 + c_z**2)
+        sin_c2 = c_z / norm
+        cos_c2 = torch.sqrt(c_x**2 + c_y**2) / norm
+
+        c2_rots = sin_c2.new_zeros((*sin_c2.shape, 3, 3))
+        c2_rots[..., 0, 0] = cos_c2
+        c2_rots[..., 0, 2] = sin_c2
+        c2_rots[..., 1, 1] = 1
+        c2_rots[..., 2, 0] = -1 * sin_c2
+        c2_rots[..., 2, 2] = cos_c2
+
+        c_rots = rot_matmul(c2_rots, c1_rots)
+        n_xyz = rot_vec_mul(c_rots, n_xyz)
+
+        _, n_y, n_z = [n_xyz[..., i] for i in range(3)]
+        norm = torch.sqrt(eps + n_y**2 + n_z**2)
+        sin_n = -n_z / norm
+        cos_n = n_y / norm
+
+        n_rots = sin_c2.new_zeros((*sin_c2.shape, 3, 3))
+        n_rots[..., 0, 0] = 1
+        n_rots[..., 1, 1] = cos_n
+        n_rots[..., 1, 2] = -1 * sin_n
+        n_rots[..., 2, 1] = sin_n
+        n_rots[..., 2, 2] = cos_n
+
+        rots = rot_matmul(n_rots, c_rots)
+
+        rots = rots.transpose(-1, -2)
+        translation = -1 * translation
+
+        rot_obj = Rotation(rot_mats=rots, quats=None)
+
+        return Rigid(rot_obj, translation)
+
+    def cuda(self) -> Rigid:
+        """
+        Moves the transformation object to GPU memory
+
+        Returns:
+            A version of the transformation on GPU
+        """
+        return Rigid(self._rots.cuda(), self._trans.cuda())
diff --git a/phivenv/Lib/site-packages/transformers/models/esm/openfold_utils/tensor_utils.py b/phivenv/Lib/site-packages/transformers/models/esm/openfold_utils/tensor_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..449c810aed3f51b34a96674f29d5dc8bdfc35103
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/esm/openfold_utils/tensor_utils.py
@@ -0,0 +1,140 @@
+# Copyright 2021 AlQuraishi Laboratory
+# Copyright 2021 DeepMind Technologies Limited
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from functools import partial
+from typing import Any, Callable, TypeVar, Union, overload
+
+import torch
+import torch.nn as nn
+import torch.types
+
+
+def add(m1: torch.Tensor, m2: torch.Tensor, inplace: bool) -> torch.Tensor:
+    """Return `m1 + m2`; with `inplace=True` the sum is accumulated into `m1` itself."""
+    # The first operation in a checkpoint can't be in-place, but in-place
+    # addition is nice to have during inference, so the caller chooses.
+    if inplace:
+        m1 += m2
+        return m1
+
+    return m1 + m2
+
+
+def permute_final_dims(tensor: torch.Tensor, inds: list[int]) -> torch.Tensor:
+    """Permute only the last `len(inds)` dims of `tensor`; `inds` indexes within that trailing group."""
+    lead_dims = range(len(tensor.shape[: -len(inds)]))
+    return tensor.permute(*lead_dims, *(axis - len(inds) for axis in inds))
+
+
+def flatten_final_dims(t: torch.Tensor, no_dims: int) -> torch.Tensor:
+    return t.reshape(*t.shape[:-no_dims], -1)  # keep the leading dims, merge the last `no_dims` into one
+
+
+def masked_mean(mask: torch.Tensor, value: torch.Tensor, dim: int, eps: float = 1e-4) -> torch.Tensor:
+    weights = mask.expand(*value.shape)  # broadcast the mask so both sums reduce over the same shape
+    return (weights * value).sum(dim=dim) / (weights.sum(dim=dim) + eps)
+
+
+def pts_to_distogram(
+    pts: torch.Tensor, min_bin: torch.types.Number = 2.3125, max_bin: torch.types.Number = 21.6875, no_bins: int = 64
+) -> torch.Tensor:
+    # Bucket index of every pairwise Euclidean distance, using `no_bins - 1` evenly spaced boundaries.
+    pairwise = (pts.unsqueeze(-2) - pts.unsqueeze(-3)).pow(2).sum(dim=-1).sqrt()
+    return torch.bucketize(pairwise, torch.linspace(min_bin, max_bin, no_bins - 1, device=pts.device))
+
+
+def dict_multimap(fn: Callable[[list], Any], dicts: list[dict]) -> dict:
+    """Combine equally-structured dicts key by key.
+
+    The keys and nesting of `dicts[0]` define the result; at each non-dict key, `fn` receives the
+    list of that key's values taken from every dict in `dicts`, in order.
+    """
+    merged = {}
+    for key, template in dicts[0].items():
+        column = [entry[key] for entry in dicts]
+        merged[key] = dict_multimap(fn, column) if isinstance(template, dict) else fn(column)
+    return merged
+
+
+def one_hot(x: torch.Tensor, v_bins: torch.Tensor) -> torch.Tensor:
+    """One-hot encode each element of `x` by the index of the `v_bins` entry with the smallest |difference|."""
+    n_bins = len(v_bins)
+    nearest = (x.unsqueeze(-1) - v_bins.view(*([1] * x.dim()), n_bins)).abs().argmin(dim=-1)
+    return nn.functional.one_hot(nearest, num_classes=n_bins).float()
+
+
+def batched_gather(data: torch.Tensor, inds: torch.Tensor, dim: int = 0, no_batch_dims: int = 0) -> torch.Tensor:
+    """Index `data` with `inds` at dimension `dim`, after indexing the first `no_batch_dims` dims with aranges."""
+    # One arange per batch dim, reshaped so it broadcasts against `inds` along that dim only.
+    index: list[Union[slice, torch.Tensor]] = [
+        torch.arange(size).view(*([1] * axis), -1, *([1] * (inds.dim() - axis - 1)))
+        for axis, size in enumerate(data.shape[:no_batch_dims])
+    ]
+    # Every non-batch dim is taken whole, except the gathered one, which is indexed by `inds`.
+    tail: list[Union[slice, torch.Tensor]] = [slice(None)] * (data.dim() - no_batch_dims)
+    tail[dim - no_batch_dims if dim >= 0 else dim] = inds
+    # Matt note: Editing this to get around the behaviour of using a list as an array index changing
+    # in recent Numpy versions
+    return data[tuple(index + tail)]
+
+
+T = TypeVar("T")
+
+
+# With tree_map, a poor man's JAX tree_map
+def dict_map(
+    fn: Callable[[T], Any], dic: dict[Any, Union[dict, list, tuple, T]], leaf_type: type[T]
+) -> dict[Any, Union[dict, list, tuple, Any]]:
+    """Apply `fn` to every `leaf_type` leaf under `dic`.
+
+    Returns a new dict with the same keys; nested dicts recurse here, other values go through `tree_map`.
+    """
+    return {
+        key: dict_map(fn, value, leaf_type) if isinstance(value, dict) else tree_map(fn, value, leaf_type)
+        for key, value in dic.items()
+    }
+
+
+@overload
+def tree_map(fn: Callable[[T], Any], tree: T, leaf_type: type[T]) -> Any: ...  # bare leaf -> result of `fn`
+
+
+@overload
+def tree_map(fn: Callable[[T], Any], tree: dict, leaf_type: type[T]) -> dict: ...  # dict in -> dict out
+
+
+@overload
+def tree_map(fn: Callable[[T], Any], tree: list, leaf_type: type[T]) -> list: ...  # list in -> list out
+
+
+@overload
+def tree_map(fn: Callable[[T], Any], tree: tuple, leaf_type: type[T]) -> tuple: ...  # tuple in -> tuple out
+
+
+def tree_map(fn, tree, leaf_type):
+    """Apply `fn` to every `leaf_type` leaf of nested dicts/lists/tuples, keeping the container layout."""
+    if isinstance(tree, dict):
+        return dict_map(fn, tree, leaf_type)
+    if isinstance(tree, list):
+        return [tree_map(fn, x, leaf_type) for x in tree]
+    if isinstance(tree, tuple):
+        return tuple(tree_map(fn, x, leaf_type) for x in tree)
+    if isinstance(tree, leaf_type):
+        return fn(tree)
+    # Name the offending type in the exception itself rather than printing it to stdout.
+    raise TypeError(f"Not supported: {type(tree)}")
+
+
+tensor_tree_map = partial(tree_map, leaf_type=torch.Tensor)  # `tree_map` with `torch.Tensor` as the leaf type
diff --git a/phivenv/Lib/site-packages/transformers/models/esm/tokenization_esm.py b/phivenv/Lib/site-packages/transformers/models/esm/tokenization_esm.py
new file mode 100644
index 0000000000000000000000000000000000000000..7d9705f7dbd33216a327eab04415ec57fe8e858d
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/esm/tokenization_esm.py
@@ -0,0 +1,147 @@
+# coding=utf-8
+# Copyright 2022 Meta and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization classes for ESM."""
+
+import os
+from typing import Optional
+
+from ...tokenization_utils import PreTrainedTokenizer
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}
+
+
+def load_vocab_file(vocab_file):
+    """Read `vocab_file` and return its lines in file order, each stripped of surrounding whitespace."""
+    with open(vocab_file, "r") as handle:
+        return [entry.strip() for entry in handle.read().splitlines()]
+
+
+class EsmTokenizer(PreTrainedTokenizer):
+    """
+    Constructs an ESM tokenizer backed by a plain-text vocabulary file (one token per line).
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    model_input_names = ["input_ids", "attention_mask"]
+
+    def __init__(
+        self,
+        vocab_file,
+        unk_token="<unk>",
+        cls_token="<cls>",
+        pad_token="<pad>",
+        mask_token="<mask>",
+        eos_token="<eos>",
+        **kwargs,
+    ):
+        self.all_tokens = load_vocab_file(vocab_file)
+        self._id_to_token = dict(enumerate(self.all_tokens))
+        self._token_to_id = {tok: ind for ind, tok in enumerate(self.all_tokens)}
+        super().__init__(
+            unk_token=unk_token,
+            cls_token=cls_token,
+            pad_token=pad_token,
+            mask_token=mask_token,
+            eos_token=eos_token,
+            **kwargs,
+        )
+
+        # TODO, all the tokens are added? But they are also part of the vocab... bit strange.
+        # none of them are special, but they all need special splitting.
+        self.unique_no_split_tokens = self.all_tokens
+        self._update_trie(self.unique_no_split_tokens)
+
+    def _convert_id_to_token(self, index: int) -> str:
+        return self._id_to_token.get(index, self.unk_token)
+
+    def _convert_token_to_id(self, token: str) -> int:
+        return self._token_to_id.get(token, self._token_to_id.get(self.unk_token))
+
+    def _tokenize(self, text, **kwargs):
+        return text.split()
+
+    def get_vocab(self):
+        """Return the token-to-id mapping of the vocab file, overlaid with any added tokens."""
+        return {**self._token_to_id, **self.added_tokens_encoder}
+
+    def token_to_id(self, token: str) -> int:
+        return self._token_to_id.get(token, self._token_to_id.get(self.unk_token))
+
+    def id_to_token(self, index: int) -> str:
+        return self._id_to_token.get(index, self.unk_token)
+
+    def build_inputs_with_special_tokens(
+        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
+    ) -> list[int]:
+        cls = [self.cls_token_id]
+        sep = [self.eos_token_id]  # No sep token in ESM vocabulary
+        if token_ids_1 is None:
+            if self.eos_token_id is None:
+                return cls + token_ids_0
+            return cls + token_ids_0 + sep
+        elif self.eos_token_id is None:
+            raise ValueError("Cannot tokenize multiple sequences when EOS token is not set!")
+        return cls + token_ids_0 + sep + token_ids_1 + sep  # Multiple inputs always have an EOS token
+
+    def get_special_tokens_mask(
+        self, token_ids_0: list, token_ids_1: Optional[list] = None, already_has_special_tokens: bool = False
+    ) -> list[int]:
+        """
+        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
+        special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods.
+
+        Args:
+            token_ids_0 (`list[int]`):
+                List of ids of the first sequence.
+            token_ids_1 (`list[int]`, *optional*):
+                List of ids of the second sequence.
+            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
+                Whether or not the token list is already formatted with special tokens for the model.
+
+        Returns:
+            A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+        """
+        if already_has_special_tokens:
+            if token_ids_1 is not None:
+                raise ValueError(
+                    "You should not supply a second sequence if the provided sequence of "
+                    "ids is already formatted with special tokens for the model."
+                )
+            return [1 if token in self.all_special_ids else 0 for token in token_ids_0]
+        # Mirror `build_inputs_with_special_tokens`: the trailing EOS slot exists only when an EOS token is set.
+        mask = [1] + [0] * len(token_ids_0)
+        if self.eos_token_id is not None:
+            mask.append(1)
+        if token_ids_1 is not None:
+            mask += [0] * len(token_ids_1) + [1]
+        return mask
+
+    def save_vocabulary(self, save_directory, filename_prefix=None):
+        """Write the vocabulary, one token per line, to `[<filename_prefix>-]vocab.txt` in `save_directory`."""
+        vocab_file = os.path.join(save_directory, (filename_prefix + "-" if filename_prefix else "") + "vocab.txt")
+        with open(vocab_file, "w") as f:
+            f.write("\n".join(self.all_tokens))
+        return (vocab_file,)
+
+    @property
+    def vocab_size(self) -> int:
+        return len(self.all_tokens)
+
+
+__all__ = ["EsmTokenizer"]
diff --git a/phivenv/Lib/site-packages/transformers/models/evolla/__init__.py b/phivenv/Lib/site-packages/transformers/models/evolla/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..09be74f03397074c49bf0df9a34b67f695856131
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/evolla/__init__.py
@@ -0,0 +1,28 @@
+# Copyright 2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:  # star-imports below are only evaluated by static type checkers
+    from .configuration_evolla import *
+    from .modeling_evolla import *
+    from .processing_evolla import *
+else:  # at runtime, replace this module's `sys.modules` entry with a `_LazyModule` built from this file
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/phivenv/Lib/site-packages/transformers/models/evolla/__pycache__/__init__.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/evolla/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e33a4db758f99a4955b940dbb333af6c5309eece
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/evolla/__pycache__/__init__.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/evolla/__pycache__/configuration_evolla.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/evolla/__pycache__/configuration_evolla.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fbf6476c65aec8383fdb80dbe55ce09c71baef3c
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/evolla/__pycache__/configuration_evolla.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/evolla/__pycache__/modeling_evolla.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/evolla/__pycache__/modeling_evolla.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..51cdc44cf28b12c05170b63d85e298f4e444ae4e
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/evolla/__pycache__/modeling_evolla.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/evolla/__pycache__/modular_evolla.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/evolla/__pycache__/modular_evolla.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1304d24be849f061d9e77b516d948d74268cb446
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/evolla/__pycache__/modular_evolla.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/evolla/__pycache__/processing_evolla.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/evolla/__pycache__/processing_evolla.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f2470870de11ec03bc487e3789944e1e0533ac9d
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/evolla/__pycache__/processing_evolla.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/evolla/configuration_evolla.py b/phivenv/Lib/site-packages/transformers/models/evolla/configuration_evolla.py
new file mode 100644
index 0000000000000000000000000000000000000000..f6d0361e95fb996075f2c14e7531b14164d5545e
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/evolla/configuration_evolla.py
@@ -0,0 +1,277 @@
+# coding=utf-8
+# Copyright 2025 Westlake Representational Learning Lab (Fajie Yuan Lab) team and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Evolla model configuration"""
+
+from ...configuration_utils import PretrainedConfig
+from ...modeling_rope_utils import rope_config_validation
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class SaProtConfig(PretrainedConfig):
+    r"""This is the configuration class to store the configuration of a [`EvollaSaProtProteinEncoder`]. It is used to instantiate a
+    SaProt model according to the specified arguments, defining the model architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 446):
+            Vocabulary size of the protein sequence model, i.e. the number of different tokens it can represent.
+        mask_token_id (`int`, *optional*, defaults to 4):
+            The id of the *mask* token in the protein sequence model.
+        pad_token_id (`int`, *optional*, defaults to 1):
+            The id of the *padding* token in the protein sequence model.
+        hidden_size (`int`, *optional*, defaults to 1280):
+            Dimensionality of the protein sequence model layers and the pooler layer.
+        num_hidden_layers (`int`, *optional*, defaults to 33):
+            Number of hidden layers in the protein sequence model.
+        num_attention_heads (`int`, *optional*, defaults to 20):
+            Number of attention heads for each attention layer in the protein sequence model.
+        intermediate_size (`int`, *optional*, defaults to 5120):
+            Dimensionality of the intermediate layers in the protein sequence model.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout ratio for the hidden layers in the protein sequence model.
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout ratio for the attention probabilities in the protein sequence model.
+        max_position_embeddings (`int`, *optional*, defaults to 1026):
+            Maximum sequence length for the protein sequence model; typically set large (e.g., 512 or 1024 or 2048).
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            Stored unchanged as `initializer_range`; see the parameter of the same name on [`EvollaConfig`].
+        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon value for the layer normalization layer in the protein sequence model.
+        position_embedding_type (`str`, *optional*, defaults to `"rotary"`):
+            The type of position embedding to use in the protein sequence model. Currently only `"rotary"` is supported.
+        emb_layer_norm_before (`bool`, *optional*, defaults to `False`):
+            Whether to apply layer normalization before the position embedding in the protein sequence model.
+        token_dropout (`bool`, *optional*, defaults to `True`):
+            Whether to apply dropout to the tokens in the protein sequence model."""
+
+    def __init__(
+        self,
+        vocab_size=446,
+        mask_token_id=4,
+        pad_token_id=1,
+        hidden_size=1280,
+        num_hidden_layers=33,
+        num_attention_heads=20,
+        intermediate_size=5120,
+        hidden_dropout_prob=0.1,
+        attention_probs_dropout_prob=0.1,
+        max_position_embeddings=1026,
+        initializer_range=0.02,
+        layer_norm_eps=1e-05,
+        position_embedding_type="rotary",
+        emb_layer_norm_before=False,
+        token_dropout=True,
+        **kwargs,
+    ):
+        super().__init__(pad_token_id=pad_token_id, mask_token_id=mask_token_id, **kwargs)
+
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.intermediate_size = intermediate_size
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.max_position_embeddings = max_position_embeddings
+        self.initializer_range = initializer_range
+        self.layer_norm_eps = layer_norm_eps
+        self.position_embedding_type = position_embedding_type
+        self.emb_layer_norm_before = emb_layer_norm_before
+        self.token_dropout = token_dropout
+
+
+class EvollaConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`EvollaModel`]. It is used to instantiate an
+    Evolla model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the Evolla-10B.
+
+    e.g. [westlake-repl/Evolla-10B-hf](https://huggingface.co/westlake-repl/Evolla-10B-hf)
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        protein_encoder_config (`dict`, *optional*):
+            Dictionary of configuration options used to initialize [`SaProtConfig`].
+        vocab_size (`int`, *optional*, defaults to 128256):
+            Vocabulary size of the Evolla llama model. Defines the number of different tokens that can be represented by the
+            `input_ids` passed when calling [`EvollaModel`].
+        hidden_size (`int`, *optional*, defaults to 4096):
+            Dimensionality of the llama layers and the pooler layer.
+        intermediate_size (`int`, *optional*, defaults to 14336):
+            Dimensionality of the intermediate layers in the llama model.
+        num_hidden_layers (`int`, *optional*, defaults to 32):
+            Number of hidden layers in the llama model.
+        num_attention_heads (`int`, *optional*, defaults to 32):
+            Number of attention heads for each attention layer in the llama model.
+        num_key_value_heads (`int`, *optional*, defaults to 8):
+            Number of key-value heads for each attention layer in the llama model.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the llama model. If string, `"gelu"`, `"relu"`,
+            `"selu"` and `"silu"` are supported.
+        max_position_embeddings (`int`, *optional*, defaults to 8192):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            just in case (e.g., 512 or 1024 or 2048).
+        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon value for the RMS-norm layer in the llama model.
+        rope_theta (`float`, *optional*, defaults to 500000.0):
+            The threshold value for the RoPE layer in the llama model.
+        rope_scaling (`dict`, *optional*):
+            RoPE scaling options for the llama model; a legacy `"type"` key, if present, is copied to `"rope_type"`.
+        attention_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use bias in the attention layer.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention layer.
+        mlp_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use bias in the MLP layer.
+        aligner_ffn_mult (`int`, *optional*, defaults to 4):
+            The FFN multiplier for the aligner layer.
+        aligner_enable_bias (`bool`, *optional*, defaults to `True`):
+            Whether to use bias in the aligner layer.
+        aligner_attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout ratio for the attention probabilities in the aligner layer.
+        aligner_num_add_layers (`int`, *optional*, defaults to 8):
+            The number of additional layers for the aligner layer.
+        resampler_depth (`int`, *optional*, defaults to 6):
+            The depth of the resampler layer in the llama model.
+        resampler_dim_head (`int`, *optional*, defaults to 64):
+            The dimension of the heads in the resampler layer in the llama model.
+        resampler_heads (`int`, *optional*, defaults to 8):
+            The number of heads in the resampler layer in the llama model.
+        resampler_num_latents (`int`, *optional*, defaults to 64):
+            The number of latents in the resampler layer in the llama model.
+        resampler_ff_mult (`int`, *optional*, defaults to 4):
+            The FFN multiplier for the resampler layer.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        pad_token_id (`int`, *optional*):
+            The id of the *padding* token.
+        bos_token_id (`int`, *optional*, defaults to 128000):
+            The id of the *beginning-of-sequence* token.
+        eos_token_id (`int`, *optional*, defaults to 128009):
+            The id of the *end-of-sequence* token.
+        use_cache (`bool`, *optional*, defaults to `False`):
+            Whether or not the model should return the last key/values attentions (not used by all models).
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether or not to tie the input and output word embeddings.
+
+    Example:
+
+    ```python
+    >>> from transformers import EvollaModel, EvollaConfig
+
+    >>> # Initializing an Evolla evolla-10b style configuration
+    >>> configuration = EvollaConfig()
+
+    >>> # Initializing a model from the evolla-10b style configuration
+    >>> model = EvollaModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "EvollaModel"
+    sub_configs = {"protein_encoder_config": SaProtConfig}
+
+    def __init__(
+        self,
+        protein_encoder_config=None,
+        vocab_size=128256,  # llama vocab size
+        hidden_size=4096,  # llama hidden size
+        intermediate_size=14336,  # llama intermediate size
+        num_hidden_layers=32,  # llama num layers
+        num_attention_heads=32,  # llama num heads
+        num_key_value_heads=8,  # llama num key-value heads
+        hidden_act="silu",  # llama activation function
+        max_position_embeddings=8192,  # llama rope max length
+        rms_norm_eps=1e-05,
+        rope_theta=500000.0,
+        rope_scaling=None,
+        attention_bias=False,
+        attention_dropout=0.0,
+        mlp_bias=False,
+        aligner_ffn_mult=4,
+        aligner_enable_bias=True,
+        aligner_attention_probs_dropout_prob=0.1,
+        aligner_num_add_layers=8,
+        resampler_depth=6,
+        resampler_dim_head=64,
+        resampler_heads=8,
+        resampler_num_latents=64,
+        resampler_ff_mult=4,
+        initializer_range=0.02,
+        pad_token_id=None,
+        bos_token_id=128000,
+        eos_token_id=128009,
+        use_cache=False,
+        tie_word_embeddings=False,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.max_position_embeddings = max_position_embeddings
+        self.rms_norm_eps = rms_norm_eps
+        self.tie_word_embeddings = tie_word_embeddings
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+        self.mlp_bias = mlp_bias
+        self.aligner_ffn_mult = aligner_ffn_mult
+        self.aligner_enable_bias = aligner_enable_bias
+        self.aligner_attention_probs_dropout_prob = aligner_attention_probs_dropout_prob
+        self.aligner_num_add_layers = aligner_num_add_layers
+        self.use_cache = use_cache
+        self.initializer_range = initializer_range
+
+        self.resampler_depth = resampler_depth
+        self.resampler_dim_head = resampler_dim_head
+        self.resampler_heads = resampler_heads
+        self.resampler_num_latents = resampler_num_latents
+        self.resampler_ff_mult = resampler_ff_mult
+
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        # Validate the correctness of rotary position embeddings parameters
+        # BC: if there is a 'type' field, copy it to 'rope_type'.
+        if self.rope_scaling is not None and "type" in self.rope_scaling:
+            self.rope_scaling["rope_type"] = self.rope_scaling["type"]
+        rope_config_validation(self)
+
+        # Subconfig
+        if protein_encoder_config is None:
+            protein_encoder_config = {}
+            logger.info("`protein_encoder_config` is `None`. Initializing the `SaProtConfig` with default values.")
+        self.protein_encoder_config = SaProtConfig(**protein_encoder_config)
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+
+__all__ = ["EvollaConfig"]
diff --git a/phivenv/Lib/site-packages/transformers/models/evolla/modeling_evolla.py b/phivenv/Lib/site-packages/transformers/models/evolla/modeling_evolla.py
new file mode 100644
index 0000000000000000000000000000000000000000..f0677d5f600d3c78cf12cb39409b1e5084216c12
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/evolla/modeling_evolla.py
@@ -0,0 +1,1582 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/evolla/modular_evolla.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_evolla.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# coding=utf-8
+# Copyright 2025 Westlake Representational Learning Lab (Fajie Yuan Lab) team and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+import warnings
+from dataclasses import dataclass
+from typing import Callable, Optional, Union
+
+import torch
+from torch import Tensor, nn
+
+from ...activations import ACT2FN
+from ...cache_utils import Cache, DynamicCache
+from ...generation import GenerationMixin
+from ...integrations import use_kernel_forward_from_hub
+from ...masking_utils import create_causal_mask
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import (
+    BaseModelOutputWithCrossAttentions,
+    BaseModelOutputWithPast,
+    BaseModelOutputWithPoolingAndCrossAttentions,
+    CausalLMOutputWithPast,
+    ModelOutput,
+)
+from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, ModuleUtilsMixin, PreTrainedModel, get_parameter_dtype
+from ...processing_utils import Unpack
+from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
+from ...utils import TransformersKwargs, auto_docstring, can_return_tuple
+from ...utils.deprecation import deprecate_kwarg
+from ...utils.generic import OutputRecorder, check_model_inputs
+from .configuration_evolla import EvollaConfig, SaProtConfig
+
+
+def create_position_ids_from_input_ids(input_ids, padding_idx):
+    """
+    Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
+    keep the value `padding_idx`. This is modified from fairseq's `utils.make_positions`.
+
+    Args:
+        input_ids: torch.Tensor of token ids (positions run along dim 1); padding_idx: the id that marks padding.
+
+    Returns: torch.Tensor of dtype long with the same shape as `input_ids`.
+    """
+    # The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA.
+    mask = input_ids.ne(padding_idx).int()
+    incremental_indices = torch.cumsum(mask, dim=1).type_as(mask) * mask
+    return incremental_indices.long() + padding_idx
+
+
+class EvollaSaProtEmbeddings(nn.Module):
+    """
+    Token embeddings (plus optional learned absolute positions) for the SaProt encoder, with token dropout.
+    Position ids, when not supplied by the caller, are counted from `padding_idx + 1`.
+    """
+
+    def __init__(self, config):
+        super().__init__()
+        self.padding_idx = config.pad_token_id
+        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=self.padding_idx)
+
+        # Optional LayerNorm over the final embeddings; `None` disables it.
+        use_layer_norm = config.emb_layer_norm_before
+        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) if use_layer_norm else None
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)  # constructed but not applied in `forward`
+        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
+
+        # `position_ids` is registered as a non-persistent buffer and immediately cleared, leaving it set to `None`.
+        arange_ids = torch.arange(config.max_position_embeddings).expand((1, -1))
+        self.register_buffer("position_ids", arange_ids, persistent=False)
+        self.position_ids = None
+
+        # A learned absolute position table exists only for the "absolute" embedding type.
+        if self.position_embedding_type == "absolute":
+            self.position_embeddings = nn.Embedding(
+                config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx
+            )
+        self.token_dropout = config.token_dropout
+        self.mask_token_id = config.mask_token_id
+
+    def forward(self, input_ids=None, attention_mask=None, position_ids=None, inputs_embeds=None):
+        """
+        Embed `input_ids` (or use `inputs_embeds` as given) and add position information.
+
+        Args:
+            input_ids: token ids; also needed for token dropout and padding-aware position ids.
+            attention_mask: optional mask multiplied into the output and used to count sequence lengths.
+            position_ids: optional explicit position ids; derived from the inputs when `None`.
+            inputs_embeds: optional precomputed token embeddings used instead of the embedding lookup.
+
+        Returns: torch.Tensor with the final embeddings.
+        """
+        if position_ids is None:
+            # With token ids, padded tokens stay at the padding position; raw embeddings get sequential ids.
+            position_ids = (
+                create_position_ids_from_input_ids(input_ids, self.padding_idx)
+                if input_ids is not None
+                else self.create_position_ids_from_inputs_embeds(inputs_embeds)
+            )
+
+        # Note: supporting EVOLLA_SA_PROT-1 (not 1b!) would additionally require an embedding_scale factor here.
+        embeddings = self.word_embeddings(input_ids) if inputs_embeds is None else inputs_embeds
+
+        # Token dropout: when the flag is set, embeddings of mask tokens are zeroed and everything is rescaled
+        # by (1 - training mask ratio) / (1 - observed mask ratio), analogous to how dropout rescales outputs.
+        # When the flag is unset, mask tokens are embedded like any other token (as in BERT/RoBERTa).
+        if self.token_dropout and input_ids is not None:
+            is_mask_token = input_ids == self.mask_token_id
+            embeddings = embeddings.masked_fill(is_mask_token.unsqueeze(-1), 0.0)
+            mask_ratio_train = 0.15 * 0.8  # Hardcoded as the ratio used in all EVOLLA_SA_PROT model training runs
+            if attention_mask is None:
+                src_lengths = input_ids.shape[1]
+            else:
+                src_lengths = attention_mask.sum(-1)
+            mask_ratio_observed = is_mask_token.sum(-1).float() / src_lengths
+            keep_ratio_observed = (1 - mask_ratio_observed)[:, None, None]
+            embeddings = (embeddings * (1 - mask_ratio_train) / keep_ratio_observed).to(embeddings.dtype)
+
+        # Non-absolute position embedding types contribute nothing at the embedding stage.
+        if self.position_embedding_type == "absolute":
+            embeddings = embeddings + self.position_embeddings(position_ids)
+
+        if self.layer_norm is not None:
+            embeddings = self.layer_norm(embeddings)
+        if attention_mask is not None:
+            # Zero out positions where the mask is 0, then cast back to the embeddings' dtype.
+            embeddings = (embeddings * attention_mask.unsqueeze(-1)).to(embeddings.dtype)
+        # Dropout is deliberately skipped: upstream suspects the BERT-style dropout line was copied in by mistake.
+        return embeddings
+
+    def create_position_ids_from_inputs_embeds(self, inputs_embeds):
+        """
+        Build sequential position ids for directly supplied embeddings, where padding cannot be inferred.
+
+        Args:
+            inputs_embeds: torch.Tensor
+
+        Returns: torch.Tensor of ids starting at `padding_idx + 1`, expanded to `inputs_embeds.shape[:-1]`.
+        """
+        input_shape = inputs_embeds.size()[:-1]
+        first_position = self.padding_idx + 1
+        position_ids = torch.arange(
+            first_position, input_shape[1] + first_position, dtype=torch.long, device=inputs_embeds.device
+        )
+        return position_ids.unsqueeze(0).expand(input_shape)
+
+
+def rotate_half_esm(x):
+    first_half, second_half = torch.chunk(x, 2, dim=-1)  # split the last dim; result is (-second, first)
+    return torch.cat((second_half.neg(), first_half), dim=-1)
+
+
+def apply_rotary_pos_emb_esm(x, cos, sin):
+    # Crop the cos/sin tables along dim 2 to x's sequence length (dim -2), then rotate x.
+    seq_len = x.shape[-2]
+    cos, sin = cos[:, :, :seq_len, :], sin[:, :, :seq_len, :]
+    return x * cos + rotate_half_esm(x) * sin
+
+
+class EvollaSaProtRotaryEmbedding(nn.Module):
+    """
+    Rotary position embeddings based on those in
+    [RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer). Queries and keys are rotated by
+    position-dependent angles; the cos/sin tables are cached and rebuilt when sequence length or device changes.
+    """
+
+    inv_freq: torch.Tensor  # fix linting for `register_buffer`
+
+    def __init__(self, dim: int):
+        super().__init__()
+        # Inverse frequencies 1 / 10000^(i / dim) for even i, stored as a (non trainable) buffer.
+        exponents = torch.arange(0, dim, 2, dtype=torch.int64).float() / dim
+        self.register_buffer("inv_freq", 1.0 / (10000**exponents))
+
+        # Lazily filled cache, see `_update_cos_sin_tables`.
+        self._seq_len_cached = self._cos_cached = self._sin_cached = None
+
+    def _update_cos_sin_tables(self, x, seq_dimension=2):
+        """Return `(cos, sin)` of shape (1, 1, seq_len, 2 * len(inv_freq)) for `x`, recomputed only when stale."""
+        seq_len = x.shape[seq_dimension]
+
+        # The cache is stale when the sequence length changed or when we are on a new device
+        # (possibly due to tracing for instance).
+        cache_is_fresh = seq_len == self._seq_len_cached and self._cos_cached.device == x.device
+        if not cache_is_fresh:
+            self._seq_len_cached = seq_len
+            positions = torch.arange(seq_len, device=x.device).type_as(self.inv_freq)
+            angles = torch.outer(positions, self.inv_freq)
+            table = torch.cat((angles, angles), dim=-1).to(x.device)
+
+            self._cos_cached = table.cos()[None, None, :, :]
+            self._sin_cached = table.sin()[None, None, :, :]
+
+        return self._cos_cached, self._sin_cached
+
+    def forward(self, q: torch.Tensor, k: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+        """Rotate `q` and `k` with tables sized to `k`'s sequence length (dim -2); input dtypes are preserved."""
+        cos, sin = self._update_cos_sin_tables(k, seq_dimension=-2)
+        self._cos_cached, self._sin_cached = cos, sin
+        rotated_q = apply_rotary_pos_emb_esm(q, cos, sin).to(dtype=q.dtype)
+        rotated_k = apply_rotary_pos_emb_esm(k, cos, sin).to(dtype=k.dtype)
+        return rotated_q, rotated_k
+
+
+def eager_attention_forward(
+    module: nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: Optional[torch.Tensor],
+    scaling: float,
+    dropout: float = 0.0,
+    head_mask: Optional[torch.Tensor] = None,
+    **kwargs: Unpack[TransformersKwargs],
+):
+    """
+    Reference (non-fused) scaled dot-product attention with optional relative position scores.
+
+    `query`, `key` and `value` are indexed as (batch, heads, seq, head_dim). `attention_mask`, when given, is
+    added to the scores before the softmax; `head_mask`, when given, multiplies the attention probabilities.
+    Relative position scores are only added if `module.position_embedding_type` is "relative_key" or
+    "relative_key_query", in which case `module.distance_embedding` and `module.max_position_embeddings` are used.
+
+    Returns: `(attn_output, attn_weights)`, with `attn_output` transposed to (batch, seq, heads, head_dim).
+    """
+    scores = torch.matmul(query, key.transpose(2, 3)) * scaling
+
+    relative_mode = getattr(module, "position_embedding_type", None)
+    if relative_mode in ("relative_key", "relative_key_query"):
+        # Pairwise distances l - r, shifted by max_position_embeddings - 1 to index the distance embedding table.
+        positions = torch.arange(query.shape[2], dtype=torch.long, device=scores.device)
+        distance = positions.view(-1, 1) - positions.view(1, -1)
+        positional_embedding = module.distance_embedding(distance + module.max_position_embeddings - 1)
+        positional_embedding = positional_embedding.to(dtype=query.dtype)  # fp16 compatibility
+
+        relative_position_scores = torch.einsum("bhld,lrd->bhlr", query, positional_embedding)
+        if relative_mode == "relative_key_query":
+            key_scores = torch.einsum("bhrd,lrd->bhlr", key, positional_embedding)
+            relative_position_scores = relative_position_scores + key_scores
+        scores = scores + relative_position_scores
+
+    if attention_mask is not None:
+        # Crop the mask's last dim to the key length before adding it.
+        scores = scores + attention_mask[:, :, :, : key.shape[-2]]
+
+    # The softmax is computed in float32 and cast back to the query dtype.
+    attn_weights = nn.functional.softmax(scores, dim=-1, dtype=torch.float32).to(query.dtype)
+    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
+    if head_mask is not None:
+        attn_weights = attn_weights * head_mask
+    attn_output = torch.matmul(attn_weights, value).transpose(1, 2).contiguous()
+    return attn_output, attn_weights
+
+
+class EvollaSaProtSelfAttention(nn.Module):
+    """Multi-head (self- or cross-) attention of the SaProt encoder; queries are pre-scaled by head_dim**-0.5."""
+
+    def __init__(self, config, position_embedding_type=None, layer_idx=None, is_cross_attention=False):
+        super().__init__()
+        self.config = config
+
+        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
+            raise ValueError(
+                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
+                f"heads ({config.num_attention_heads})"
+            )
+
+        self.num_attention_heads = config.num_attention_heads
+        self.attention_head_size = config.hidden_size // config.num_attention_heads
+        self.all_head_size = self.num_attention_heads * self.attention_head_size
+
+        self.query = nn.Linear(config.hidden_size, self.all_head_size)
+        self.key = nn.Linear(config.hidden_size, self.all_head_size)
+        self.value = nn.Linear(config.hidden_size, self.all_head_size)
+
+        self.dropout = config.attention_probs_dropout_prob
+        default_type = getattr(config, "position_embedding_type", "absolute")
+        self.position_embedding_type = position_embedding_type or default_type
+        self.rotary_embeddings = None
+        if self.position_embedding_type in ("relative_key", "relative_key_query"):
+            # One embedding row per signed distance in [-(max_pos - 1), max_pos - 1].
+            self.max_position_embeddings = config.max_position_embeddings
+            self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)
+        elif self.position_embedding_type == "rotary":
+            self.rotary_embeddings = EvollaSaProtRotaryEmbedding(dim=self.attention_head_size)
+
+        self.is_decoder = config.is_decoder
+        self.layer_idx = layer_idx
+        self.scaling = 1.0  # the query is scaled explicitly in `forward`
+        self.is_causal = self.is_decoder and not is_cross_attention
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> tuple[torch.Tensor]:
+        """
+        Attend from `hidden_states` to themselves, or to `encoder_hidden_states` when given (in which case
+        `encoder_attention_mask` replaces `attention_mask`).
+
+        Returns: `(attn_output, attn_weights)`; `attn_output` has shape (batch_size, seq_length, -1).
+        Raises: ValueError if a non-eager attention implementation is used with relative position embeddings.
+        """
+        batch_size, seq_length = hidden_states.shape[:-1]
+        query_layer = self.query(hidden_states).view(batch_size, seq_length, -1, self.attention_head_size)
+
+        is_cross_attention = encoder_hidden_states is not None
+        current_states = encoder_hidden_states if is_cross_attention else hidden_states
+        attention_mask = encoder_attention_mask if is_cross_attention else attention_mask
+        # Keys/values must be reshaped with their own length: in cross-attention it can differ from `seq_length`.
+        kv_shape = (batch_size, current_states.shape[1], -1, self.attention_head_size)
+        key_layer = self.key(current_states).view(kv_shape).transpose(1, 2)
+        value_layer = self.value(current_states).view(kv_shape).transpose(1, 2)
+
+        # EVOLLA_SA_PROT scales the query (rather than the logits) by head_dim**-0.5; the two only differ once
+        # rotary embeddings are involved, so the scaling has to happen before the rotation.
+        query_layer = query_layer.transpose(1, 2) * self.attention_head_size**-0.5
+        if self.position_embedding_type == "rotary":
+            query_layer, key_layer = self.rotary_embeddings(query_layer, key_layer)
+
+        attention_interface: Callable = eager_attention_forward
+        if self.config._attn_implementation != "eager":
+            if self.position_embedding_type in ["relative_key", "relative_key_query"]:
+                raise ValueError(
+                    f"ESM {self.config._attn_implementation} attention does not support {self.position_embedding_type} embeddings. "
+                    "Set attention explicitly to 'eager' with `model.set_attn_implementation('eager')`"
+                )
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+        dropout = self.dropout if self.training else 0.0
+        attn_output, attn_weights = attention_interface(
+            self, query_layer, key_layer, value_layer, attention_mask,
+            dropout=dropout, scaling=self.scaling, head_mask=head_mask, **kwargs,
+        )
+
+        attn_output = attn_output.reshape(batch_size, seq_length, -1).contiguous()
+        return attn_output, attn_weights
+
+
+class EvollaSaProtSelfOutput(nn.Module):
+    """Attention output head: dense projection, dropout, then a residual add of `input_tensor`."""
+
+    def __init__(self, config):
+        super().__init__()
+        hidden_size = config.hidden_size
+        self.dense = nn.Linear(hidden_size, hidden_size)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self, hidden_states, input_tensor):
+        return self.dropout(self.dense(hidden_states)) + input_tensor
+
+
+class EvollaSaProtAttention(nn.Module):
+    """Pre-LayerNorm attention block: LayerNorm -> attention -> output projection with residual connection."""
+
+    def __init__(self, config, layer_idx=None, is_cross_attention=False):
+        super().__init__()
+        self.self = EvollaSaProtSelfAttention(config, layer_idx=layer_idx, is_cross_attention=is_cross_attention)
+        self.output = EvollaSaProtSelfOutput(config)
+        self.pruned_heads = set()
+        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+
+    def prune_heads(self, heads):
+        """Remove the attention heads in `heads` from the q/k/v and output projections and record them."""
+        if len(heads) == 0:
+            return
+        attn = self.self
+        heads, index = find_pruneable_heads_and_indices(
+            heads, attn.num_attention_heads, attn.attention_head_size, self.pruned_heads
+        )
+        # Shrink q/k/v with the default pruning dim and the output projection along dim=1.
+        for name in ("query", "key", "value"):
+            setattr(attn, name, prune_linear_layer(getattr(attn, name), index))
+        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
+        # Keep the bookkeeping in sync with the pruned projections.
+        attn.num_attention_heads -= len(heads)
+        attn.all_head_size = attn.attention_head_size * attn.num_attention_heads
+        self.pruned_heads = self.pruned_heads.union(heads)
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask=None,
+        head_mask=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        **kwargs: Unpack[TransformersKwargs],
+    ):
+        """Apply attention to the layer-normed input; the residual uses the un-normalized `hidden_states`."""
+        attn_output, _ = self.self(
+            self.LayerNorm(hidden_states),
+            attention_mask=attention_mask,
+            head_mask=head_mask,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_attention_mask,
+            **kwargs,
+        )
+        return self.output(attn_output, hidden_states)
+
+
+def gelu(x):
+    """Exact erf-based GELU from the original EVOLLA_SA_PROT repo (upstream notes F.gelu is subtly different)."""
+    # Keep the operation order: reordering the products could change the floating-point result.
+    erf_term = torch.erf(x / math.sqrt(2.0))
+    return x * 0.5 * (1.0 + erf_term)
+
+
+class EvollaSaProtIntermediate(nn.Module):
+    """Feed-forward expansion: linear map from `hidden_size` to `intermediate_size` followed by `gelu`."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        return gelu(self.dense(hidden_states))
+
+
+class EvollaSaProtOutput(nn.Module):
+    """Feed-forward contraction: dense projection back to `hidden_size`, dropout, then a residual add."""
+
+    def __init__(self, config):
+        super().__init__()
+        in_features, out_features = config.intermediate_size, config.hidden_size
+        self.dense = nn.Linear(in_features, out_features)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self, hidden_states, input_tensor):
+        return self.dropout(self.dense(hidden_states)) + input_tensor
+
+
+class EvollaSaProtLayer(GradientCheckpointingLayer):
+    """Encoder layer: self-attention, optional cross-attention (decoder only), then a pre-LayerNorm MLP."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.chunk_size_feed_forward = config.chunk_size_feed_forward
+        self.seq_len_dim = 1
+        self.attention = EvollaSaProtAttention(config)
+        self.is_decoder = config.is_decoder
+        self.add_cross_attention = config.add_cross_attention
+        if self.add_cross_attention and not self.is_decoder:
+            raise RuntimeError(f"{self} should be used as a decoder model if cross attention is added")
+        if self.add_cross_attention:
+            self.crossattention = EvollaSaProtAttention(config, is_cross_attention=True)
+        self.intermediate = EvollaSaProtIntermediate(config)
+        self.output = EvollaSaProtOutput(config)
+        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask=None,
+        head_mask=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        **kwargs: Unpack[TransformersKwargs],
+    ):
+        """
+        Run self-attention, cross-attention (decoder layers given `encoder_hidden_states` only) and the MLP.
+
+        Raises: AttributeError if cross-attention is requested but the layer was built without it.
+        """
+        attention_output = self.attention(hidden_states, attention_mask=attention_mask, head_mask=head_mask, **kwargs)
+
+        wants_cross_attention = self.is_decoder and encoder_hidden_states is not None
+        if wants_cross_attention and not hasattr(self, "crossattention"):
+            raise AttributeError(
+                f"If `encoder_hidden_states` are passed, {self} has to be instantiated"
+                " with cross-attention layers by setting `config.add_cross_attention=True`"
+            )
+        if wants_cross_attention:
+            attention_output = self.crossattention(
+                attention_output,
+                attention_mask=attention_mask,
+                head_mask=head_mask,
+                encoder_hidden_states=encoder_hidden_states,
+                encoder_attention_mask=encoder_attention_mask,
+                **kwargs,
+            )
+
+        return self.feed_forward_chunk(attention_output)
+
+    def feed_forward_chunk(self, attention_output):
+        """LayerNorm -> intermediate -> output projection, with the residual taken from the un-normalized input."""
+        normed = self.LayerNorm(attention_output)
+        return self.output(self.intermediate(normed), attention_output)
+
+
+class EvollaSaProtEncoder(nn.Module):
+    """Stack of `config.num_hidden_layers` `EvollaSaProtLayer`s followed by a final LayerNorm."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.layer = nn.ModuleList([EvollaSaProtLayer(config) for _ in range(config.num_hidden_layers)])
+        self.emb_layer_norm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.gradient_checkpointing = False
+
+    @can_return_tuple
+    def forward(
+        self,
+        hidden_states,
+        attention_mask=None,
+        head_mask=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        **kwargs: Unpack[TransformersKwargs],
+    ):
+        """Run every layer in order (`head_mask[i]` goes to layer i) and wrap the normalized result."""
+        for i, layer_module in enumerate(self.layer):
+            hidden_states = layer_module(
+                hidden_states,
+                attention_mask=attention_mask,
+                head_mask=None if head_mask is None else head_mask[i],
+                encoder_hidden_states=encoder_hidden_states,
+                encoder_attention_mask=encoder_attention_mask,
+                **kwargs,
+            )
+        if self.emb_layer_norm_after is not None:  # explicit check instead of relying on module truthiness
+            hidden_states = self.emb_layer_norm_after(hidden_states)
+        return BaseModelOutputWithCrossAttentions(last_hidden_state=hidden_states)
+
+
+class EvollaSaProtPooler(nn.Module):
+    """Pools a sequence by passing the first token's hidden state through a dense layer and tanh."""
+
+    def __init__(self, config):
+        super().__init__()
+        hidden_size = config.hidden_size
+        self.dense = nn.Linear(hidden_size, hidden_size)
+        self.activation = nn.Tanh()
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        # "Pooling" here simply means selecting index 0 along dim 1 (the first token).
+        first_token = hidden_states[:, 0]
+        return self.activation(self.dense(first_token))
+
+
+@auto_docstring
+class EvollaSaProtPreTrainedModel(PreTrainedModel):
+    config: SaProtConfig
+    _no_split_modules = ["EvollaSaProtLayer"]
+    _supports_flash_attn = True
+    _supports_sdpa = True
+    _supports_attention_backend = True
+
+    # Output recording spec; index=1 presumably selects `attn_weights` from the attention return tuple.
+    _can_record_outputs = {
+        "hidden_states": EvollaSaProtLayer,
+        "attentions": [OutputRecorder(EvollaSaProtSelfAttention, index=1, layer_name="attention")],
+        "cross_attentions": [OutputRecorder(EvollaSaProtSelfAttention, index=1, layer_name="crossattention")],
+    }
+
+    def _init_weights(self, module):
+        """Initialize `module`: N(0, initializer_range) weights, zero biases/padding rows, identity LayerNorm."""
+        std = self.config.initializer_range
+        if isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+            return
+        if not isinstance(module, (nn.Linear, nn.Embedding)):
+            return
+        module.weight.data.normal_(mean=0.0, std=std)
+        if isinstance(module, nn.Linear):
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif module.padding_idx is not None:
+            module.weight.data[module.padding_idx].zero_()
+
+
+class EvollaSaProtProteinEncoder(EvollaSaProtPreTrainedModel):
+    """SaProt protein encoder: token embeddings followed by the transformer encoder stack."""
+
+    def __init__(self, config: SaProtConfig):
+        super().__init__(config)
+        self.embeddings = EvollaSaProtEmbeddings(config)
+        self.encoder = EvollaSaProtEncoder(config)
+
+    def get_input_embeddings(self):
+        return self.embeddings.word_embeddings
+
+    def set_input_embeddings(self, value):
+        self.embeddings.word_embeddings = value
+
+    def _prune_heads(self, heads_to_prune):
+        """Prune attention heads; `heads_to_prune` is a dict {layer_num: list of heads to prune in that layer}."""
+        for layer_index, layer_heads in heads_to_prune.items():
+            self.encoder.layer[layer_index].attention.prune_heads(layer_heads)
+
+    @check_model_inputs
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor],
+        attention_mask: Optional[torch.Tensor] = None,
+    ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]:
+        """
+        Encode `input_ids` of shape (batch_size, seq_length); `attention_mask` defaults to all ones.
+
+        Returns: model output whose `last_hidden_state` is the encoder's final hidden states.
+        """
+        input_shape = input_ids.size()
+        batch_size, seq_length = input_shape  # the unpacking also requires a 2-D input
+        if attention_mask is None:
+            attention_mask = torch.ones((batch_size, seq_length), device=input_ids.device)
+
+        embedded = self.embeddings(input_ids=input_ids, attention_mask=attention_mask)
+        additive_mask = self.get_extended_attention_mask(attention_mask, input_shape)
+        encoder_outputs = self.encoder(embedded, attention_mask=additive_mask)
+
+        return BaseModelOutputWithPoolingAndCrossAttentions(
+            last_hidden_state=encoder_outputs[0],
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+            cross_attentions=encoder_outputs.cross_attentions,
+        )
+
+    def get_extended_attention_mask(
+        self, attention_mask: Tensor, input_shape: tuple[int], device: torch.device = None, dtype: torch.float = None
+    ) -> Tensor:
+        """
+        Turn a 0/1 attention mask into an additive mask that broadcasts over attention scores.
+
+        Arguments:
+            attention_mask (`torch.Tensor`):
+                Mask with ones indicating tokens to attend to, zeros for tokens to ignore. Either 2-D
+                [batch_size, seq_length] or 3-D [batch_size, from_seq_length, to_seq_length].
+            input_shape (`Tuple[int]`):
+                The shape of the input to the model.
+            device: deprecated; only forwarded to the decoder mask helper.
+            dtype: dtype of the returned mask; defaults to the dtype of the model parameters.
+
+        Returns:
+            `torch.Tensor` of dtype `dtype` that is 0.0 where attention is allowed and `torch.finfo(dtype).min`
+            where it is not.
+        Raises: ValueError if `attention_mask` is neither 2-D nor 3-D.
+        """
+        if dtype is None:
+            dtype = get_parameter_dtype(self)
+
+        mask_rank = attention_mask.dim()
+        # Warn here only if `create_extended_attention_mask_for_decoder` will not be reached.
+        if device is not None and not (mask_rank == 2 and self.config.is_decoder):
+            warnings.warn(
+                "The `device` argument is deprecated and will be removed in v5 of Transformers.", FutureWarning
+            )
+
+        if mask_rank == 3:
+            # A full [batch_size, from_seq_length, to_seq_length] mask only needs a broadcastable head dim.
+            extended_attention_mask = attention_mask[:, None, :, :]
+        elif mask_rank == 2 and self.config.is_decoder:
+            # Padding mask on a decoder: a causal mask is applied in addition to the padding mask.
+            extended_attention_mask = ModuleUtilsMixin.create_extended_attention_mask_for_decoder(
+                input_shape, attention_mask, device
+            )
+        elif mask_rank == 2:
+            # Padding mask on an encoder: broadcast over heads and query positions.
+            extended_attention_mask = attention_mask[:, None, None, :]
+        else:
+            raise ValueError(
+                f"Wrong shape for input_ids (shape {input_shape}) or attention_mask (shape {attention_mask.shape})"
+            )
+
+        # (1 - mask) is 1 for masked positions; scaling by the dtype's most negative value makes adding the
+        # result to the raw scores before the softmax effectively remove those positions.
+        extended_attention_mask = extended_attention_mask.to(dtype=dtype)  # fp16 compatibility
+        return (1.0 - extended_attention_mask) * torch.finfo(dtype).min
+
+
+class EvollaSequenceCompressorAttention(nn.Module):
+    """
+    Cross-attention in which learned latents attend to the input sequence concatenated with the latents.
+    """
+
+    def __init__(self, dim, dim_head=64, heads=8):
+        super().__init__()
+        self.scale = dim_head**-0.5
+        self.heads = heads
+        inner_dim = dim_head * heads
+
+        self.norm_media = nn.LayerNorm(dim)
+        self.norm_latents = nn.LayerNorm(dim)
+
+        self.to_q = nn.Linear(dim, inner_dim, bias=False)
+        self.to_kv = nn.Linear(dim, inner_dim * 2, bias=False)
+        self.to_out = nn.Linear(inner_dim, dim, bias=False)
+
+    def forward(self, x, latents, mask):
+        """
+        Args:
+            x (torch.Tensor): features of the sequence being compressed
+                shape (b, n1, D)
+            latents (torch.Tensor): latent features
+                shape (b, n2, D);  n2: num of latent tokens
+            mask (torch.Tensor): shape (b, n1 + n2); 1 marks key positions that may be attended to. It must
+                cover the concatenation of `x` and `latents`, in that order.
+
+        Returns:
+            torch.Tensor: attended latent features, shape (b, n2, D)
+        """
+        x = self.norm_media(x)
+        latents = self.norm_latents(latents)
+
+        q = self.to_q(latents)
+        # Keys/values come from the input sequence followed by the latents themselves.
+        k, v = self.to_kv(torch.cat((x, latents), dim=-2)).chunk(2, dim=-1)
+
+        # (b, n, heads * dim_head) -> (b, heads, n, dim_head)
+        q, k, v = (t.view(t.size(0), t.size(1), self.heads, -1).permute(0, 2, 1, 3) for t in (q, k, v))
+        q = q * self.scale
+
+        sim = torch.matmul(q, k.transpose(-1, -2))
+        # Subtract the (detached) row maximum before masking and softmax.
+        sim = sim - sim.amax(dim=-1, keepdim=True).detach()
+
+        # Every key position whose mask value is not 1 is suppressed. The (b, 1, 1, n1 + n2) view broadcasts
+        # over heads and latents, so no full-size mask has to be materialized.
+        blocked = mask[:, None, None, :] != 1
+        attn = sim.masked_fill(blocked, -1e4).softmax(dim=-1)
+
+        # (b, heads, n2, dim_head) -> (b, n2, heads * dim_head)
+        out = torch.matmul(attn, v).permute(0, 2, 1, 3)
+        out = out.reshape(out.size(0), out.size(1), -1)
+
+        return self.to_out(out)
+
+
+class EvollaFeedForward(nn.Module):
+    """Pre-norm MLP: LayerNorm, bias-free Linear to `int(dim * mult)` features, GELU, bias-free Linear back."""
+
+    def __init__(self, dim, mult=4):
+        super().__init__()
+        self.norm = nn.LayerNorm(dim)
+        self.fc1 = nn.Linear(dim, int(dim * mult), bias=False)
+        self.activation = nn.GELU()
+        self.fc2 = nn.Linear(int(dim * mult), dim, bias=False)
+
+    def forward(self, x):
+        return self.fc2(self.activation(self.fc1(self.norm(x))))
+
+
+class EvollaSequenceCompressorResampler(nn.Module):
+    """Compresses an embedding sequence into `config.resampler_num_latents` learned latent tokens.
+
+    The latents go through `config.resampler_depth` (attention, feed-forward) pairs, each with a
+    residual connection, and are then projected to `config.hidden_size` and layer-normalised.
+    """
+
+    def __init__(self, config: EvollaConfig):
+        super().__init__()
+        protein_repr_dim = config.protein_encoder_config.hidden_size
+        self.num_latents = config.resampler_num_latents
+        self.latents = nn.Parameter(torch.randn(self.num_latents, protein_repr_dim), requires_grad=True)
+        self.layers = nn.ModuleList([])
+        for _ in range(config.resampler_depth):
+            attn = EvollaSequenceCompressorAttention(
+                dim=protein_repr_dim, dim_head=config.resampler_dim_head, heads=config.resampler_heads
+            )
+            ff = EvollaFeedForward(dim=protein_repr_dim, mult=config.resampler_ff_mult)
+            self.layers.append(nn.ModuleList([attn, ff]))
+
+        self.norm = nn.LayerNorm(config.hidden_size)
+        self.protein_projector = nn.Linear(protein_repr_dim, config.hidden_size)
+
+    def forward(self, embeds, mask):
+        """
+        Args:
+            embeds (torch.Tensor): embeddings to compress, shape (bs, max_protein_length, dim)
+            mask (torch.Tensor): mask over the positions of `embeds`, shape (bs, max_protein_length)
+        """
+        bs, _ = mask.shape
+        # Latent tokens are always attendable: extend the mask to (bs, max_protein_length + num_latents).
+        # The ones are created directly on `mask.device` instead of on CPU followed by a copy.
+        mask = torch.cat((mask, torch.ones(bs, self.num_latents, device=mask.device)), dim=1)
+
+        # Share the learned latents across the batch as a [b, n, d] view rather than multiplying by ones.
+        latents = self.latents[None].expand(embeds.shape[0], -1, -1).to(embeds.dtype)
+        for attn, ff in self.layers:
+            latents = attn(embeds, latents, mask) + latents
+            latents = ff(latents) + latents
+        return self.norm(self.protein_projector(latents))
+
+
+@dataclass
+@auto_docstring
+class EvollaProteinEncoderModelOutput(ModelOutput):
+    sequence_compressor_output: torch.FloatTensor = None  # resampler output; None default despite non-Optional hint
+    last_hidden_state: Optional[torch.FloatTensor] = None  # last hidden state of the wrapped protein encoder
+    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None  # not set by EvollaProteinEncoder.forward
+    attentions: Optional[tuple[torch.FloatTensor, ...]] = None  # not set by EvollaProteinEncoder.forward
+
+
+class EvollaProteinEncoder(nn.Module):
+    """Runs the SaProt protein encoder and compresses its last hidden state with the resampler."""
+
+    def __init__(self, config: EvollaConfig):
+        super().__init__()
+        self.model = EvollaSaProtProteinEncoder(config=config.protein_encoder_config)
+        self.sequence_compressor_resampler = EvollaSequenceCompressorResampler(config=config)
+
+    @can_return_tuple
+    def forward(self, input_ids: torch.LongTensor, attention_mask: torch.FloatTensor, **kwargs):
+        # `kwargs` are accepted but not forwarded to either sub-module.
+        last_hidden_state = self.model(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state
+        return EvollaProteinEncoderModelOutput(
+            sequence_compressor_output=self.sequence_compressor_resampler(last_hidden_state, attention_mask),
+            last_hidden_state=last_hidden_state,
+        )
+
+
+class EvollaSequenceAlignerCrossAttention(nn.Module):
+    """Gated cross-attention adapter in which text hidden states attend over protein/structure/MSA features.
+
+    Key/value projections are created only for the modalities whose `*_encoder_dim` is given. The
+    attention and feed-forward branches are each multiplied by `tanh` of a learnable gate created at 0.
+
+    Args:
+        config: model configuration; `hidden_size`, `num_attention_heads`, `aligner_enable_bias`,
+            `aligner_ffn_mult` and `aligner_attention_probs_dropout_prob` are read from it.
+        protein_encoder_dim (`int`, *optional*): feature size of the protein key/value states.
+        structure_encoder_dim (`int`, *optional*): feature size of the structure key/value states.
+        msa_encoder_dim (`int`, *optional*): feature size of the MSA key/value states.
+    """
+
+    def __init__(
+        self,
+        config,
+        protein_encoder_dim: Optional[int] = None,
+        structure_encoder_dim: Optional[int] = None,
+        msa_encoder_dim: Optional[int] = None,
+    ):
+        super().__init__()
+
+        self.hidden_size = config.hidden_size
+        self.num_attention_heads = config.num_attention_heads
+        # NOTE(review): queries are scaled by num_attention_heads**-0.5 rather than by
+        # attention_head_size**-0.5; kept as is so that the numerical behaviour does not change.
+        self.scale = self.num_attention_heads**-0.5
+        self.attention_head_size = int(self.hidden_size / self.num_attention_heads)
+        self.all_head_size = self.num_attention_heads * self.attention_head_size
+
+        attention_probs_dropout_prob = config.aligner_attention_probs_dropout_prob
+        enable_bias = config.aligner_enable_bias
+        ffn_mult = config.aligner_ffn_mult
+
+        self.query = nn.Linear(self.hidden_size, self.all_head_size)
+        if protein_encoder_dim is not None:
+            self.key_protein = nn.Linear(protein_encoder_dim, self.all_head_size)
+            self.value_protein = nn.Linear(protein_encoder_dim, self.all_head_size)
+        else:
+            self.key_protein = None
+            self.value_protein = None
+
+        if structure_encoder_dim is not None:
+            self.key_structure = nn.Linear(structure_encoder_dim, self.all_head_size)
+            self.value_structure = nn.Linear(structure_encoder_dim, self.all_head_size)
+        else:
+            self.key_structure = None
+            self.value_structure = None
+
+        if msa_encoder_dim is not None:
+            self.key_msa = nn.Linear(msa_encoder_dim, self.all_head_size)
+            self.value_msa = nn.Linear(msa_encoder_dim, self.all_head_size)
+        else:
+            self.key_msa = None
+            self.value_msa = None
+
+        self.attention_norm = EvollaRMSNorm(self.hidden_size)
+
+        self.dropout = nn.Dropout(attention_probs_dropout_prob)
+
+        self.out_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=enable_bias)
+
+        self.ff = EvollaFeedForward(self.hidden_size, ffn_mult)
+        self.gate_attention = nn.Parameter(torch.tensor([0.0]))
+        self.gate_ffw = nn.Parameter(torch.tensor([0.0]))
+
+    @staticmethod
+    def _kv_mask_from_batch_mask(batch_mask, kv_states):
+        """Repeat a per-sample `batch_mask` along the sequence axis of `kv_states`.
+
+        `batch_mask` must be expandable to (kv_seq_len, bs), e.g. of shape (bs,). The result has shape
+        (bs, kv_seq_len) and is placed on `kv_states.device`.
+        """
+        bs, kv_seq_len, _ = kv_states.shape
+        ones = torch.ones(bs, kv_seq_len, device=batch_mask.device)
+        return (ones * batch_mask.expand(size=(kv_seq_len, bs)).T).to(kv_states.device)
+
+    def cross_attention(
+        self,
+        query_states,
+        protein_key_value_states,
+        structure_key_value_states,
+        msa_key_value_states,
+        query_attn_mask,
+        protein_kv_attn_mask,
+        structure_kv_attn_mask,
+        msa_kv_attn_mask,
+    ):
+        """
+        Attend from the text queries to the concatenated keys/values of all usable modalities.
+
+        query_states: text, [bs, query_seq_len, dim]
+        *_key_value_states: per-modality features, [bs, kv_seq_len, dim], or None
+        query_attn_mask: [bs, query_seq_len]; treated as all ones when None
+        *_kv_attn_mask: per-modality masks, [bs, kv_seq_len], or None
+
+        A modality is used only when its projections, its states and its mask are all present, which
+        keeps the concatenated keys, values and mask aligned along the key axis.
+
+        Returns the output projection of the attention context, [bs, query_seq_len, hidden_size].
+        Raises ValueError when no modality is usable.
+        """
+        modalities = (
+            (self.key_protein, self.value_protein, protein_key_value_states, protein_kv_attn_mask),
+            (self.key_structure, self.value_structure, structure_key_value_states, structure_kv_attn_mask),
+            (self.key_msa, self.value_msa, msa_key_value_states, msa_kv_attn_mask),
+        )
+        key_parts, value_parts, mask_parts = [], [], []
+        for key_proj, value_proj, kv_states, kv_mask in modalities:
+            if key_proj is None or value_proj is None or kv_states is None or kv_mask is None:
+                continue
+            kv_states = kv_states.to(query_states)
+            key_parts.append(key_proj(kv_states))  # [bs, keylength, dim]
+            value_parts.append(value_proj(kv_states))  # [bs, keylength, dim]
+            mask_parts.append(kv_mask)
+        if not key_parts:
+            raise ValueError("At least one modality should be provided for cross attention.")
+        key_layer = torch.cat(key_parts, dim=1)
+        value_layer = torch.cat(value_parts, dim=1)
+        kv_attn_mask = torch.cat(mask_parts, dim=1)
+
+        # Warning: This place might cause issues, refers to
+        # https://discuss.pytorch.org/t/cuda-error-cublas-status-not-supported-when-calling-cublasltmatmul-from-torch-nn-functional-linear/170214/13
+        # Solution: add `DISABLE_ADDMM_CUDA_LT=1` as environment variable
+        query_layer = self.query(self.attention_norm(query_states))  # [bs, querylength, dim]
+
+        # Split the last dimension into heads: [bs, length, dim] -> [bs, numheads, length, dim/numheads]
+        head_shape = (self.num_attention_heads, self.attention_head_size)
+        query_layer = query_layer.view(*query_layer.size()[:-1], *head_shape).permute(0, 2, 1, 3)
+        key_layer = key_layer.view(*key_layer.size()[:-1], *head_shape).permute(0, 2, 1, 3)
+        value_layer = value_layer.view(*value_layer.size()[:-1], *head_shape).permute(0, 2, 1, 3)
+
+        query_layer = query_layer * self.scale
+
+        # attention_mask: [bs, 1, querylength, keylength]
+        if query_attn_mask is None:
+            query_attn_mask = torch.ones(query_states.size(0), query_states.size(1), device=query_states.device)
+        attention_mask = query_attn_mask[:, None, :, None] * kv_attn_mask[:, None, None, :]
+        # Compute the scaled dot-product attention scores
+        attn_weights = torch.matmul(query_layer, key_layer.transpose(-1, -2))  # [bs, numheads, querylength, keylength]
+        attn_weights = attn_weights - attn_weights.amax(dim=-1, keepdim=True).detach()  # To stabilize score
+        # `!= 1` marks the same entries as `(1 - attention_mask).bool()` and also accepts boolean
+        # masks, which the subtraction `1 - mask` rejects.
+        attention_scores = attn_weights.masked_fill(
+            attention_mask != 1, torch.finfo(attn_weights.dtype).min
+        )  # [bs, numheads, querylength, keylength]
+
+        attention_probs = attention_scores.softmax(dim=-1)
+        # NOTE: `self.dropout` is created in `__init__` but is not applied to `attention_probs` here.
+
+        context_layer = torch.matmul(attention_probs, value_layer)  # [bs, numheads, querylength, dim/numheads]
+        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
+        context_layer = context_layer.view(*context_layer.size()[:-2], self.all_head_size)
+
+        return self.out_proj(context_layer)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        query_states,
+        protein_kv_states,
+        structure_kv_states,
+        msa_kv_states,
+        query_attn_mask,
+        protein_kv_attn_mask=None,
+        structure_kv_attn_mask=None,
+        msa_kv_attn_mask=None,
+        protein_batch_mask=None,
+        structure_batch_mask=None,
+        msa_batch_mask=None,
+        past_key_values=None,
+    ):
+        """
+        Apply the gated cross-attention and the gated feed-forward, each with a residual connection.
+
+        For every modality whose `*_kv_states` is given without a `*_kv_attn_mask`, the mask is built
+        from the matching `*_batch_mask`; the mask of a modality without states is reset to None.
+        `query_states` is returned unchanged when no supplied modality has a non-zero mask entry.
+
+        Args:
+            query_states: text hidden states, [bs, query_seq_len, dim]
+            protein_kv_states / structure_kv_states / msa_kv_states: per-modality features,
+                [bs, kv_seq_len, dim], or None when the modality is absent
+            query_attn_mask: mask over the query positions, [bs, query_seq_len], or None
+            *_kv_attn_mask: optional explicit masks, [bs, kv_seq_len]
+            *_batch_mask: per-sample masks used to build a missing `*_kv_attn_mask`
+            past_key_values: accepted but not used here
+
+        Returns:
+            Tensor with the same shape as `query_states`.
+        """
+        # Each mask is built on the device of its own batch mask, so structure or MSA inputs do not
+        # require `protein_batch_mask` to be provided.
+        if protein_kv_states is None:
+            protein_kv_attn_mask = None
+        elif protein_kv_attn_mask is None:
+            protein_kv_attn_mask = self._kv_mask_from_batch_mask(protein_batch_mask, protein_kv_states)
+
+        if structure_kv_states is None:
+            structure_kv_attn_mask = None
+        elif structure_kv_attn_mask is None:
+            structure_kv_attn_mask = self._kv_mask_from_batch_mask(structure_batch_mask, structure_kv_states)
+
+        if msa_kv_states is None:
+            msa_kv_attn_mask = None
+        elif msa_kv_attn_mask is None:
+            msa_kv_attn_mask = self._kv_mask_from_batch_mask(msa_batch_mask, msa_kv_states)
+
+        hidden_states = query_states
+        # only when there's at least one valid modality, crossattention will be performed
+        if (
+            (protein_kv_states is not None and protein_kv_attn_mask.any())
+            or (structure_kv_states is not None and structure_kv_attn_mask.any())
+            or (msa_kv_states is not None and msa_kv_attn_mask.any())
+        ):
+            residual = hidden_states
+            hidden_states = self.cross_attention(
+                query_states=hidden_states,
+                protein_key_value_states=protein_kv_states,
+                structure_key_value_states=structure_kv_states,
+                msa_key_value_states=msa_kv_states,
+                query_attn_mask=query_attn_mask,
+                protein_kv_attn_mask=protein_kv_attn_mask,
+                structure_kv_attn_mask=structure_kv_attn_mask,
+                msa_kv_attn_mask=msa_kv_attn_mask,
+            )  # [bs, query_seq_len, dim]
+            # tanh gate
+            hidden_states = torch.tanh(self.gate_attention) * hidden_states
+
+            hidden_states = residual + hidden_states  # input_query
+
+            residual = hidden_states
+            hidden_states = self.ff(hidden_states) * torch.tanh(self.gate_ffw)
+            hidden_states = residual + hidden_states
+
+        return hidden_states
+
+
+@use_kernel_forward_from_hub("RMSNorm")
+class EvollaRMSNorm(nn.Module):
+    """Root-mean-square layer norm with a learnable per-feature scale; equivalent to T5LayerNorm."""
+
+    def __init__(self, hidden_size, eps=1e-6):
+        super().__init__()
+        self.variance_epsilon = eps
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+
+    def forward(self, hidden_states):
+        # The statistics are computed in float32; the normalised values are cast back to the
+        # input dtype before the learnable scale is applied.
+        upcast = hidden_states.to(torch.float32)
+        mean_square = upcast.pow(2).mean(-1, keepdim=True)
+        normalized = upcast * torch.rsqrt(mean_square + self.variance_epsilon)
+        return self.weight * normalized.to(hidden_states.dtype)
+
+    def extra_repr(self):
+        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
+
+
+class EvollaRotaryEmbedding(nn.Module):
+    """Computes the rotary position embedding cos/sin tables for a batch of position ids."""
+
+    inv_freq: torch.Tensor  # fix linting for `register_buffer`
+
+    def __init__(self, config: EvollaConfig, device=None):
+        super().__init__()
+        self.config = config
+        # BC: "rope_type" was originally "type"
+        rope_scaling = getattr(config, "rope_scaling", None)
+        if isinstance(rope_scaling, dict):
+            self.rope_type = rope_scaling.get("rope_type", rope_scaling.get("type"))
+        else:
+            self.rope_type = "default"
+        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
+        self.max_seq_len_cached = self.original_max_seq_len = config.max_position_embeddings
+
+        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+        self.original_inv_freq = self.inv_freq
+
+    @torch.no_grad()
+    @dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
+    def forward(self, x, position_ids):
+        """Return `(cos, sin)` cast to `x.dtype`; they are computed in float32 with autocast disabled."""
+        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
+        position_ids_expanded = position_ids[:, None, :].float()
+
+        device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
+        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
+            emb = torch.cat((freqs, freqs), dim=-1)
+            cos, sin = emb.cos() * self.attention_scaling, emb.sin() * self.attention_scaling
+        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
+
+
+class EvollaMLP(nn.Module):
+    """Gated MLP computing `down_proj(act_fn(gate_proj(x)) * up_proj(x))`."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.hidden_size, self.intermediate_size = config.hidden_size, config.intermediate_size
+        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
+        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
+        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.mlp_bias)
+        self.act_fn = ACT2FN[config.hidden_act]
+
+    def forward(self, x):
+        return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
+
+
+def rotate_half(x):
+    """Return `x` with the two halves of its last dimension swapped and the moved-up half negated."""
+    half = x.shape[-1] // 2
+    first, second = x[..., :half], x[..., half:]
+    return torch.cat((-second, first), dim=-1)
+
+
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
+    """Rotate the query and key tensors with the given rotary position embedding.
+
+    Args:
+        q (`torch.Tensor`): The query tensor.
+        k (`torch.Tensor`): The key tensor.
+        cos (`torch.Tensor`): The cosine part of the rotary embedding.
+        sin (`torch.Tensor`): The sine part of the rotary embedding.
+        position_ids (`torch.Tensor`, *optional*):
+            Deprecated and unused.
+        unsqueeze_dim (`int`, *optional*, defaults to 1):
+            Dimension inserted into `cos` and `sin` so that they broadcast against `q` and `k`. With `cos`
+            and `sin` of shape [batch_size, seq_len, head_dim], use `unsqueeze_dim=1` when `q` and `k` are
+            [batch_size, heads, seq_len, head_dim] and `unsqueeze_dim=2` when they are
+            [batch_size, seq_len, heads, head_dim].
+
+    Returns:
+        `tuple(torch.Tensor)`: the rotated query and key tensors, in that order.
+    """
+    cos, sin = cos.unsqueeze(unsqueeze_dim), sin.unsqueeze(unsqueeze_dim)
+
+    def rotate(t):
+        return (t * cos) + (rotate_half(t) * sin)
+
+    return rotate(q), rotate(k)
+
+
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """
+    Repeat every key/value head `n_rep` times, like torch.repeat_interleave(x, dim=1, repeats=n_rep): (batch,
+    num_key_value_heads, seqlen, head_dim) -> (batch, num_key_value_heads * n_rep, seqlen, head_dim).
+    """
+    batch, kv_heads, seq_len, head_dim = hidden_states.shape
+    if n_rep == 1:
+        return hidden_states
+    tiled = hidden_states.unsqueeze(2).expand(batch, kv_heads, n_rep, seq_len, head_dim)
+    return tiled.reshape(batch, kv_heads * n_rep, seq_len, head_dim)
+
+
+class EvollaAttention(nn.Module):
+    """Multi-headed self-attention with rotary position embeddings and an optional key/value cache."""
+
+    def __init__(self, config: EvollaConfig, layer_idx: int):
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+        self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
+        self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
+        self.scaling = self.head_dim**-0.5
+        self.attention_dropout = config.attention_dropout
+        self.is_causal = True
+
+        # Queries use `num_attention_heads` heads, keys and values use `num_key_value_heads` heads.
+        q_width = config.num_attention_heads * self.head_dim
+        kv_width = config.num_key_value_heads * self.head_dim
+        self.q_proj = nn.Linear(config.hidden_size, q_width, bias=config.attention_bias)
+        self.k_proj = nn.Linear(config.hidden_size, kv_width, bias=config.attention_bias)
+        self.v_proj = nn.Linear(config.hidden_size, kv_width, bias=config.attention_bias)
+        self.o_proj = nn.Linear(q_width, config.hidden_size, bias=config.attention_bias)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
+        attention_mask: Optional[torch.Tensor],
+        past_key_values: Optional[Cache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """
+        Project `hidden_states` to queries/keys/values, apply the rotary `(cos, sin)` pair, update the
+        cache when one is given and run the configured attention implementation.
+
+        Returns `(attn_output, attn_weights)`: the output-projected attention result and the weights
+        as returned by the attention implementation.
+        """
+        batch_dims = hidden_states.shape[:-1]
+        per_head_shape = (*batch_dims, -1, self.head_dim)
+
+        query_states = self.q_proj(hidden_states).view(per_head_shape).transpose(1, 2)
+        key_states = self.k_proj(hidden_states).view(per_head_shape).transpose(1, 2)
+        value_states = self.v_proj(hidden_states).view(per_head_shape).transpose(1, 2)
+
+        cos, sin = position_embeddings
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+
+        if past_key_values is not None:
+            # sin and cos are specific to RoPE models; cache_position needed for the static cache
+            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        if self.config._attn_implementation == "eager":
+            attention_interface: Callable = eager_attention_forward
+        else:
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            dropout=0.0 if not self.training else self.attention_dropout,
+            scaling=self.scaling,
+            **kwargs,
+        )
+        return self.o_proj(attn_output.reshape(*batch_dims, -1).contiguous()), attn_weights
+
+
+class EvollaDecoderLayer(GradientCheckpointingLayer):
+    """Pre-norm decoder layer (self-attention, then MLP) with an optional cross-attention adapter.
+
+    The adapter is created when `layer_idx + 1` is a multiple of
+    `max(config.num_hidden_layers // config.aligner_num_add_layers, 1)`.
+    """
+
+    def __init__(self, config: EvollaConfig, layer_idx: int):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.self_attn = EvollaAttention(config=config, layer_idx=layer_idx)
+        self.mlp = EvollaMLP(config)
+        self.input_layernorm = EvollaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = EvollaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+        # One adapter every `adapter_stride` layers; the stride is clamped to 1 when the quotient is 0.
+        adapter_stride = max(config.num_hidden_layers // config.aligner_num_add_layers, 1)
+        if (layer_idx + 1) % adapter_stride == 0:
+            self.adapter = EvollaSequenceAlignerCrossAttention(config, protein_encoder_dim=config.hidden_size)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        use_cache: Optional[bool] = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        protein_kv_states: Optional[torch.Tensor] = None,
+        structure_kv_states: Optional[torch.Tensor] = None,
+        msa_kv_states: Optional[torch.Tensor] = None,
+        protein_batch_mask: Optional[torch.Tensor] = None,
+        structure_batch_mask: Optional[torch.Tensor] = None,
+        msa_batch_mask: Optional[torch.Tensor] = None,
+        query_attn_mask: Optional[torch.Tensor] = None,
+        **kwargs,
+    ) -> torch.Tensor:
+        """Return the hidden states after self-attention, the MLP and, when present, the adapter."""
+        # Self Attention
+        # `position_ids` and `use_cache` are not named parameters of `EvollaAttention.forward`.
+        attn_output, _ = self.self_attn(
+            hidden_states=self.input_layernorm(hidden_states),
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            position_embeddings=position_embeddings,
+            **kwargs,
+        )
+        hidden_states = hidden_states + attn_output
+
+        # Fully Connected
+        hidden_states = hidden_states + self.mlp(self.post_attention_layernorm(hidden_states))
+
+        # `adapter` only exists on the layers selected in `__init__`.
+        if hasattr(self, "adapter"):
+            hidden_states = self.adapter(
+                query_states=hidden_states,
+                protein_kv_states=protein_kv_states,
+                structure_kv_states=structure_kv_states,
+                msa_kv_states=msa_kv_states,
+                query_attn_mask=query_attn_mask,
+                protein_batch_mask=protein_batch_mask,
+                structure_batch_mask=structure_batch_mask,
+                msa_batch_mask=msa_batch_mask,
+            )
+
+        return hidden_states
+
+
+@auto_docstring
+class EvollaPreTrainedModel(PreTrainedModel):
+    config: EvollaConfig
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _no_split_modules = [
+        "EvollaDecoderLayer",
+        "EvollaSequenceCompressorResampler",
+        "EvollaSequenceAlignerCrossAttention",
+    ]
+    _skip_keys_device_placement = ["past_key_values"]
+    _supports_flash_attn = False  # see dependency on `EvollaSaProtProteinEncoder`
+    _supports_sdpa = True
+    _supports_flex_attn = False  # see dependency on `EvollaSaProtProteinEncoder`
+
+    _can_compile_fullgraph = True
+    _supports_attention_backend = False
+    _can_record_outputs = {
+        "hidden_states": EvollaDecoderLayer,
+        "attentions": EvollaAttention,
+    }
+
+    def _init_weights(self, module):
+        """Run the parent initialisation, then reset the Evolla-specific parameters of `module`."""
+        super()._init_weights(module)
+        if isinstance(module, EvollaSequenceAlignerCrossAttention):
+            module.gate_attention.data.zero_()  # via `.data` so the in-place reset also works outside no_grad
+            module.gate_ffw.data.zero_()
+            module.attention_norm.weight.data.fill_(1.0)
+        elif isinstance(module, EvollaSequenceCompressorResampler):
+            module.latents.data.normal_(mean=0.0, std=self.config.initializer_range)
+
+
+class EvollaModel(EvollaPreTrainedModel):
+    def __init__(self, config: EvollaConfig):
+        super().__init__(config)
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+        self.embed_tokens = nn.Embedding(self.vocab_size, config.hidden_size, self.padding_idx)
+        self.protein_encoder = EvollaProteinEncoder(config=config)
+        self.layers = nn.ModuleList(
+            [
+                EvollaDecoderLayer(
+                    config=config,
+                    layer_idx=layer_idx,
+                )
+                for layer_idx in range(config.num_hidden_layers)
+            ]
+        )
+
+        self.norm = EvollaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.rotary_emb = EvollaRotaryEmbedding(config=config)
+        self.gradient_checkpointing = getattr(config, "gradient_checkpointing", False)
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.embed_tokens
+
+    def set_input_embeddings(self, value):
+        self.embed_tokens = value
+
+    @auto_docstring
+    @check_model_inputs
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        protein_input_ids: Optional[torch.LongTensor] = None,
+        protein_attention_mask: Optional[torch.Tensor] = None,
+        structure_feats: Optional[torch.FloatTensor] = None,
+        msa_feats: Optional[torch.FloatTensor] = None,
+        structure_batch_mask: Optional[torch.Tensor] = None,
+        msa_batch_mask: Optional[torch.Tensor] = None,
+        **kwargs,
+    ) -> Union[tuple, BaseModelOutputWithPast]:
+        r"""
+        protein_input_ids (torch.LongTensor):
+            The input IDs for the protein sequence in structure-aware tokens. Should be of shape `(batch_size, protein_seq_length)` and type `torch.LongTensor`.
+        protein_attention_mask (torch.Tensor):
+            The attention mask for the protein sequence. Should be of shape `(batch_size, protein_seq_length)` and type `torch.Tensor`.
+        structure_feats (torch.FloatTensor):
+            The input IDs for purely structure-based features. Should be of shape `(batch_size, structure_seq_length, structure_feat_dim)` and type `torch.FloatTensor`. Dummy input for now.
+        msa_feats (torch.FloatTensor):
+            The input IDs for purely MSA-based features. Should be of shape `(batch_size, msa_seq_length, msa_feat_dim)` and type `torch.FloatTensor`. Dummy input for now.
+        structure_batch_mask (torch.Tensor):
+            The batch mask to decide which protein sequences are purely structure-based. Should be of shape `(batch_size)` and type `torch.Tensor`. Should be paired with `structure_feats`. Dummpy input for now.
+        msa_batch_mask (torch.Tensor):
+            The batch mask to decide which protein sequences are purely MSA-based. Should be of shape `(batch_size)` and type `torch.Tensor`. Should be paired with `msa_feats`. Dummpy input for now.
+        """
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+
+        if use_cache and past_key_values is None:
+            past_key_values = DynamicCache(config=self.config)
+
+        if cache_position is None:
+            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+            cache_position = torch.arange(
+                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
+            )
+
+        if position_ids is None:
+            position_ids = cache_position.unsqueeze(0)
+
+        protein_feats = None
+        protein_batch_mask = None
+        # If provided, actually compute them
+        if protein_input_ids is not None and protein_attention_mask is not None:
+            protein_outputs = self.protein_encoder(
+                input_ids=protein_input_ids,
+                attention_mask=protein_attention_mask,
+            )
+            protein_feats = protein_outputs.sequence_compressor_output
+            protein_batch_mask = torch.tensor([True] * protein_input_ids.shape[0], device=protein_input_ids.device)
+
+        causal_mask = create_causal_mask(
+            config=self.config,
+            input_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            cache_position=cache_position,
+            past_key_values=past_key_values,
+        )
+
+        hidden_states = inputs_embeds
+
+        # create position embeddings to be shared across the decoder layers
+        position_embeddings = self.rotary_emb(hidden_states, position_ids)
+
+        for decoder_layer in self.layers:
+            hidden_states = decoder_layer(
+                hidden_states,
+                attention_mask=causal_mask,
+                position_ids=position_ids,
+                past_key_values=past_key_values,
+                use_cache=use_cache,
+                cache_position=cache_position,
+                position_embeddings=position_embeddings,
+                protein_kv_states=protein_feats,
+                structure_kv_states=structure_feats,
+                msa_kv_states=msa_feats,
+                protein_batch_mask=protein_batch_mask,
+                structure_batch_mask=structure_batch_mask,
+                msa_batch_mask=msa_batch_mask,
+                query_attn_mask=attention_mask,
+                **kwargs,
+            )
+
+        hidden_states = self.norm(hidden_states)
+
+        output = BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values,
+        )
+        return output
+
+
+class EvollaForProteinText2Text(EvollaPreTrainedModel, GenerationMixin):  # EvollaModel decoder + LM head
+    def __init__(self, config):
+        super().__init__(config)
+        self.model = EvollaModel(config)
+        self.vocab_size = config.vocab_size
+        self.lm_head = nn.Linear(config.hidden_size, self.vocab_size, bias=False)  # hidden states -> vocab logits
+
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.model.get_input_embeddings()  # delegated to the wrapped EvollaModel
+
+    def set_input_embeddings(self, value):
+        return self.model.set_input_embeddings(value)  # delegated to the wrapped EvollaModel
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,  # text input ids
+        attention_mask: Optional[torch.Tensor] = None,  # text attention mask
+        inputs_embeds: Optional[torch.FloatTensor] = None,  # text input embeddings
+        labels: Optional[torch.LongTensor] = None,
+        protein_input_ids: Optional[torch.LongTensor] = None,
+        protein_attention_mask: Optional[torch.Tensor] = None,
+        use_cache: Optional[bool] = None,
+        **kwargs,
+    ):
+        r"""
+        protein_input_ids (torch.LongTensor):
+            The input IDs for the protein sequence. Should be of shape `(batch_size, protein_seq_length)` and type `torch.LongTensor`.
+        protein_attention_mask (torch.Tensor):
+            The attention mask for the protein sequence. Should be of shape `(batch_size, protein_seq_length)` and type `torch.Tensor`.
+
+        Example:
+
+        ```python
+        >>> from transformers import EvollaProcessor, EvollaForProteinText2Text
+        >>> model = EvollaForProteinText2Text.from_pretrained("westlake/Evolla-10B-hf")
+        >>> processor = EvollaProcessor.from_pretrained("westlake/Evolla-10B-hf")
+
+        >>> protein_information = {
+        ...     "aa_seq": "your amino acid sequence",
+        ...     "foldseek": "your foldseek sequence",
+        ... }
+        >>> question = "What is the function of this protein?"
+        >>> message = [
+        ...     {"role": "system", "content": "You are an AI expert that can answer any questions about protein."},
+        ...     {"role": "user", "content": question},
+        ... ]
+
+        >>> inputs = processor(proteins=[protein_information], messages_list=[message], return_tensors="pt", padding="longest")
+        >>> outputs = model.generate(**inputs)
+
+        >>> print(processor.batch_decode(outputs, skip_special_tokens=True))
+        ```"""
+
+        outputs = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            inputs_embeds=inputs_embeds,
+            protein_input_ids=protein_input_ids,
+            protein_attention_mask=protein_attention_mask,
+            use_cache=use_cache,
+            **kwargs,
+        )
+        hidden_states = outputs[0]  # first output field: the decoder's last hidden state
+        logits = self.lm_head(hidden_states)
+
+        loss = None
+        if labels is not None:  # the same `kwargs` given to the decoder above are also forwarded to the loss
+            loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.vocab_size, **kwargs)
+
+        lm_outputs = CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+        return lm_outputs
+
+
+__all__ = ["EvollaForProteinText2Text", "EvollaModel", "EvollaPreTrainedModel"]
diff --git a/phivenv/Lib/site-packages/transformers/models/evolla/modular_evolla.py b/phivenv/Lib/site-packages/transformers/models/evolla/modular_evolla.py
new file mode 100644
index 0000000000000000000000000000000000000000..c2790a2ca93f61c8e8d11865b1bb974dbc7f219d
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/evolla/modular_evolla.py
@@ -0,0 +1,1021 @@
+# coding=utf-8
+# Copyright 2025 Westlake Representational Learning Lab (Fajie Yuan Lab) team and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import warnings
+from dataclasses import dataclass
+from typing import Optional, Union
+
+import torch
+import torch.utils.checkpoint
+from torch import Tensor, nn
+
+from ...cache_utils import Cache, DynamicCache
+from ...generation import GenerationMixin
+from ...masking_utils import create_causal_mask
+from ...modeling_outputs import (
+    BaseModelOutputWithPast,
+    BaseModelOutputWithPoolingAndCrossAttentions,
+    CausalLMOutputWithPast,
+    ModelOutput,
+)
+from ...modeling_utils import ModuleUtilsMixin, PreTrainedModel, get_parameter_dtype
+from ...utils import (
+    auto_docstring,
+    can_return_tuple,
+    logging,
+)
+from ...utils.deprecation import deprecate_kwarg
+from ...utils.generic import OutputRecorder, check_model_inputs
+from ..esm.modeling_esm import (
+    EsmAttention,
+    EsmEmbeddings,
+    EsmEncoder,
+    EsmIntermediate,
+    EsmLayer,
+    EsmOutput,
+    EsmPooler,
+    EsmSelfAttention,
+    EsmSelfOutput,
+)
+from ..llama.modeling_llama import (
+    LlamaAttention,
+    LlamaDecoderLayer,
+    LlamaMLP,
+    LlamaPreTrainedModel,
+    LlamaRMSNorm,
+    LlamaRotaryEmbedding,
+)
+from .configuration_evolla import EvollaConfig, SaProtConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+class EvollaSaProtEmbeddings(EsmEmbeddings):
+    def __init__(self, config):
+        super().__init__(config)
+        # Remove the `position_ids` inherited from the `EsmEmbeddings` setup by overwriting the attribute with None.
+        self.position_ids = None
+
+
+def rotate_half_esm(x):  # maps the two last-dim halves (x1, x2) to (-x2, x1)
+    first_half, second_half = torch.chunk(x, 2, dim=-1)
+    return torch.cat([-second_half, first_half], dim=-1)
+
+
+def apply_rotary_pos_emb_esm(x, cos, sin):
+    seq_len = x.shape[-2]  # the cos/sin tables are cropped along dim 2 to the sequence length of `x`
+    cos_crop, sin_crop = cos[:, :, :seq_len, :], sin[:, :, :seq_len, :]
+    rotated = rotate_half_esm(x)
+    return (x * cos_crop) + (rotated * sin_crop)
+
+
+class EvollaSaProtRotaryEmbedding(nn.Module):
+    """
+    Rotary position embeddings based on those in
+    [RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer). Query and keys are transformed by rotation
+    matrices which depend on their relative positions.
+    """
+
+    inv_freq: torch.Tensor  # fix linting for `register_buffer`
+
+    def __init__(self, dim: int):
+        super().__init__()
+        # Generate and save the inverse frequency buffer (non trainable)
+        inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2, dtype=torch.int64).float() / dim))
+        # Stored as a buffer rather than an nn.Parameter, so it is not trainable.
+        self.register_buffer("inv_freq", inv_freq)
+
+        self._seq_len_cached = None
+        self._cos_cached = None
+        self._sin_cached = None
+
+    def _update_cos_sin_tables(self, x, seq_dimension=2):
+        """Return cached `(cos, sin)` tables of shape `(1, 1, seq_len, 2 * len(inv_freq))`, rebuilt when stale."""
+        seq_len = x.shape[seq_dimension]
+        # Reset the tables if the sequence length has changed,
+        # or if we're on a new device (possibly due to tracing for instance)
+        if seq_len != self._seq_len_cached or self._cos_cached.device != x.device:
+            self._seq_len_cached = seq_len
+            t = torch.arange(seq_len, device=x.device).type_as(self.inv_freq)
+            freqs = torch.outer(t, self.inv_freq)
+            emb = torch.cat((freqs, freqs), dim=-1).to(x.device)
+
+            self._cos_cached = emb.cos()[None, None, :, :]
+            self._sin_cached = emb.sin()[None, None, :, :]
+
+        return self._cos_cached, self._sin_cached
+
+    def forward(self, q: torch.Tensor, k: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+        """Rotate `q` and `k` with tables sized from `k`'s second-to-last dimension; outputs keep the input dtypes."""
+        cos, sin = self._update_cos_sin_tables(k, seq_dimension=-2)  # the helper already refreshes the cache
+        return (
+            apply_rotary_pos_emb_esm(q, cos, sin).to(dtype=q.dtype),
+            apply_rotary_pos_emb_esm(k, cos, sin).to(dtype=k.dtype),
+        )
+
+
+class EvollaSaProtSelfAttention(EsmSelfAttention):
+    def __init__(self, config, position_embedding_type=None, layer_idx=None, is_cross_attention=False):
+        """Build the q/k/v projections and the optional relative or rotary position modules from `config`."""
+        nn.Module.__init__(self)  # calls nn.Module directly instead of `super().__init__`
+        hidden_size, num_heads = config.hidden_size, config.num_attention_heads
+        if hidden_size % num_heads != 0 and not hasattr(config, "embedding_size"):
+            raise ValueError(
+                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
+                f"heads ({config.num_attention_heads})"
+            )
+
+        self.config = config
+        self.layer_idx = layer_idx
+        self.num_attention_heads = num_heads
+        self.attention_head_size = int(hidden_size / num_heads)
+        self.all_head_size = num_heads * self.attention_head_size
+
+        self.query = nn.Linear(hidden_size, self.all_head_size)
+        self.key = nn.Linear(hidden_size, self.all_head_size)
+        self.value = nn.Linear(hidden_size, self.all_head_size)
+
+        self.dropout = config.attention_probs_dropout_prob  # the config value itself; no nn.Dropout is created here
+        fallback_type = getattr(config, "position_embedding_type", "absolute")
+        self.position_embedding_type = position_embedding_type or fallback_type
+        self.rotary_embeddings = None
+        if self.position_embedding_type in ("relative_key", "relative_key_query"):
+            self.max_position_embeddings = config.max_position_embeddings
+            self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)
+        elif self.position_embedding_type == "rotary":
+            self.rotary_embeddings = EvollaSaProtRotaryEmbedding(dim=self.attention_head_size)
+
+        self.scaling = 1.0
+        self.is_decoder = config.is_decoder
+        self.is_causal = self.is_decoder and not is_cross_attention
+
+
+class EvollaSaProtSelfOutput(EsmSelfOutput):  # Evolla-named subclass; inherits `EsmSelfOutput` with no overrides
+    pass
+
+
+class EvollaSaProtAttention(EsmAttention):  # Evolla-named subclass; inherits `EsmAttention` with no overrides
+    pass
+
+
+class EvollaSaProtIntermediate(EsmIntermediate):  # Evolla-named subclass; inherits `EsmIntermediate` unchanged
+    pass
+
+
+class EvollaSaProtOutput(EsmOutput):  # Evolla-named subclass; inherits `EsmOutput` with no overrides
+    pass
+
+
+class EvollaSaProtLayer(EsmLayer):  # Evolla-named subclass; inherits `EsmLayer` with no overrides
+    pass
+
+
+class EvollaSaProtEncoder(EsmEncoder):  # Evolla-named subclass; inherits `EsmEncoder` with no overrides
+    pass
+
+
+class EvollaSaProtPooler(EsmPooler):  # Evolla-named subclass; inherits `EsmPooler` with no overrides
+    pass
+
+
+@auto_docstring
+class EvollaSaProtPreTrainedModel(PreTrainedModel):
+    config: SaProtConfig
+    _no_split_modules = ["EvollaSaProtLayer"]
+    _supports_flash_attn = True
+    _supports_sdpa = True
+    _supports_attention_backend = True
+
+    _can_record_outputs = {
+        "hidden_states": EvollaSaProtLayer,
+        "attentions": [OutputRecorder(EvollaSaProtSelfAttention, index=1, layer_name="attention")],
+        "cross_attentions": [
+            OutputRecorder(EvollaSaProtSelfAttention, index=1, layer_name="crossattention"),
+        ],
+    }
+
+    def _init_weights(self, module):
+        """Initialize `module` in place: normal(0, initializer_range) weights, zeroed biases, reset LayerNorm."""
+        init_std = self.config.initializer_range
+        if isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+        elif isinstance(module, nn.Linear):
+            module.weight.data.normal_(mean=0.0, std=init_std)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=init_std)
+            if module.padding_idx is not None:  # keep the padding row at zero
+                module.weight.data[module.padding_idx].zero_()
+
+
+class EvollaSaProtProteinEncoder(EvollaSaProtPreTrainedModel):
+    def __init__(self, config: SaProtConfig):
+        super().__init__(config)
+        self.embeddings = EvollaSaProtEmbeddings(config)
+        self.encoder = EvollaSaProtEncoder(config)
+
+    def get_input_embeddings(self):
+        return self.embeddings.word_embeddings
+
+    def set_input_embeddings(self, value):
+        self.embeddings.word_embeddings = value
+
+    def _prune_heads(self, heads_to_prune):
+        """
+        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
+        class PreTrainedModel
+        """
+        for layer, heads in heads_to_prune.items():
+            self.encoder.layer[layer].attention.prune_heads(heads)
+
+    @check_model_inputs
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+    ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]:
+        """Embed 2-D `input_ids`, run the encoder under the extended attention mask, and repackage its outputs."""
+        input_shape = input_ids.size()
+        batch_size, seq_length = input_shape
+        device = input_ids.device
+        if attention_mask is None:  # default: attend to every position
+            attention_mask = torch.ones((batch_size, seq_length), device=device)
+
+        inputs_embeds = self.embeddings(input_ids=input_ids, attention_mask=attention_mask)
+        extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape)
+        encoder_outputs = self.encoder(inputs_embeds, attention_mask=extended_attention_mask)
+        sequence_output = encoder_outputs[0]
+
+        return BaseModelOutputWithPoolingAndCrossAttentions(
+            last_hidden_state=sequence_output,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+            cross_attentions=encoder_outputs.cross_attentions,
+        )
+
+    def get_extended_attention_mask(
+        self, attention_mask: Tensor, input_shape: tuple[int, ...], device: Optional[torch.device] = None, dtype: Optional[torch.dtype] = None
+    ) -> Tensor:
+        """
+        Makes broadcastable attention and causal masks so that future and masked tokens are ignored.
+
+        Arguments:
+            attention_mask (`torch.Tensor`):
+                Mask with ones indicating tokens to attend to, zeros for tokens to ignore.
+            input_shape (`tuple[int]`):
+                The shape of the input to the model.
+            device (`torch.device`, *optional*): Deprecated; only forwarded to the decoder causal-mask helper.
+            dtype (`torch.dtype`, *optional*): Dtype of the returned mask; defaults to `get_parameter_dtype(self)`.
+
+        Returns:
+            `torch.Tensor`: The extended mask cast to `dtype`, 0.0 where attending and `torch.finfo(dtype).min` where masked.
+        """
+        if dtype is None:
+            dtype = get_parameter_dtype(self)
+        # show warning only if it won't be shown in `create_extended_attention_mask_for_decoder`
+        if device is not None and not (attention_mask.dim() == 2 and self.config.is_decoder):
+            warnings.warn(
+                "The `device` argument is deprecated and will be removed in v5 of Transformers.", FutureWarning
+            )
+        # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
+        # ourselves in which case we just need to make it broadcastable to all heads.
+        if attention_mask.dim() == 3:
+            extended_attention_mask = attention_mask[:, None, :, :]
+        elif attention_mask.dim() == 2:
+            # Provided a padding mask of dimensions [batch_size, seq_length]
+            # - if the model is a decoder, apply a causal mask in addition to the padding mask
+            # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length]
+            if self.config.is_decoder:
+                extended_attention_mask = ModuleUtilsMixin.create_extended_attention_mask_for_decoder(
+                    input_shape, attention_mask, device
+                )
+            else:
+                extended_attention_mask = attention_mask[:, None, None, :]
+        else:
+            raise ValueError(
+                f"Wrong shape for input_ids (shape {input_shape}) or attention_mask (shape {attention_mask.shape})"
+            )
+
+        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
+        # masked positions, this operation will create a tensor which is 0.0 for
+        # positions we want to attend and the dtype's smallest value for masked positions.
+        # Since we are adding it to the raw scores before the softmax, this is
+        # effectively the same as removing these entirely.
+        extended_attention_mask = extended_attention_mask.to(dtype=dtype)  # fp16 compatibility
+        extended_attention_mask = (1.0 - extended_attention_mask) * torch.finfo(dtype).min
+        return extended_attention_mask
+
+
+class EvollaSequenceCompressorAttention(nn.Module):
+    def __init__(self, dim, dim_head=64, heads=8):
+        super().__init__()
+        self.scale = dim_head**-0.5
+        self.heads = heads
+        inner_dim = dim_head * heads
+
+        self.norm_media = nn.LayerNorm(dim)
+        self.norm_latents = nn.LayerNorm(dim)
+
+        self.to_q = nn.Linear(dim, inner_dim, bias=False)
+        self.to_kv = nn.Linear(dim, inner_dim * 2, bias=False)
+        self.to_out = nn.Linear(inner_dim, dim, bias=False)
+
+    def forward(self, x, latents, mask):
+        """
+        Args:
+            x (torch.Tensor): features to compress (the resampler in this file passes protein encoder states)
+                shape (b, n1, D)
+            latents (torch.Tensor): latent features, shape (b, n2, D);  n2: num of latent tokens
+            mask (torch.Tensor): shape (b, n1 + n2); 1 marks key positions to keep, any other value is masked out
+        """
+        x = self.norm_media(x)
+        latents = self.norm_latents(latents)
+        h = self.heads
+
+        q = self.to_q(latents)
+        kv_input = torch.cat((x, latents), dim=-2)  # keys/values cover the inputs and the latents themselves
+        k, v = self.to_kv(kv_input).chunk(
+            2, dim=-1
+        )  # each: batch_size, max_protein_length+num_latents, dim_head*num_heads
+
+        q = q.view(q.size(0), q.size(1), h, -1).permute(0, 2, 1, 3)
+        k = k.view(k.size(0), k.size(1), h, -1).permute(0, 2, 1, 3)
+        v = v.view(v.size(0), v.size(1), h, -1).permute(0, 2, 1, 3)
+        q = q * self.scale  # batch_size, num_heads, num_latents, dim_head
+
+        # attention
+        sim = torch.matmul(q, k.transpose(-1, -2))
+        sim = sim - sim.amax(dim=-1, keepdim=True).detach()
+        _, nh, skd, _ = sim.shape
+        ones = torch.ones(nh, skd, device=mask.device)  # Create a tensor of ones with shape (nh, skd)
+        mask_exp = mask[:, None, None, :]
+        ones_exp = ones[None, :, :, None]
+        mask = mask_exp * ones_exp  # (b, nh, skd, n1 + n2); float ones also promote bool/int masks for `1 - mask`
+
+        sim = sim.masked_fill((1 - mask).bool(), -1e4)
+        attn = sim.softmax(dim=-1)
+        out = torch.matmul(attn, v)
+        out = out.permute(0, 2, 1, 3)
+
+        # [batch, seq, head, features] -> [batch, seq, head*features]
+        out = out.reshape(out.size(0), out.size(1), -1)
+
+        return self.to_out(out)
+
+
+class EvollaFeedForward(nn.Module):  # LayerNorm -> Linear(dim, dim * mult) -> GELU -> Linear back to dim
+    def __init__(self, dim, mult=4):
+        super().__init__()
+        hidden_dim = int(dim * mult)
+        self.norm = nn.LayerNorm(dim)
+        self.fc1 = nn.Linear(dim, hidden_dim, bias=False)
+        self.activation = nn.GELU()
+        self.fc2 = nn.Linear(hidden_dim, dim, bias=False)
+
+    def forward(self, x):
+        expanded = self.fc1(self.norm(x))  # normalize first, then widen
+        return self.fc2(self.activation(expanded))
+
+
+class EvollaSequenceCompressorResampler(nn.Module):
+    def __init__(self, config: EvollaConfig):
+        """Create the learned latents, `resampler_depth` (attention, feed-forward) pairs and the output projection."""
+        super().__init__()
+        encoder_dim = config.protein_encoder_config.hidden_size
+        self.num_latents = config.resampler_num_latents
+        # Learned query vectors; the same parameter is broadcast to every sample in `forward`.
+        self.latents = nn.Parameter(torch.randn(self.num_latents, encoder_dim), requires_grad=True)
+        # Each entry is a two-element ModuleList that `forward` unpacks as (attention, feed_forward).
+        self.layers = nn.ModuleList([])
+        for _ in range(config.resampler_depth):
+            attention = EvollaSequenceCompressorAttention(
+                dim=encoder_dim, dim_head=config.resampler_dim_head, heads=config.resampler_heads
+            )
+            feed_forward = EvollaFeedForward(dim=encoder_dim, mult=config.resampler_ff_mult)
+            self.layers.append(nn.ModuleList([attention, feed_forward]))
+
+        # The LayerNorm acts on the projected features, hence `config.hidden_size` rather than the encoder width.
+        self.norm = nn.LayerNorm(config.hidden_size)
+        self.protein_projector = nn.Linear(encoder_dim, config.hidden_size)
+
+    def forward(self, embeds, mask):
+        """
+        Compress `embeds` into `num_latents` vectors per sample, projected to `config.hidden_size` and normalized.
+
+        `mask` must be 2-D (batch, protein length); the latent positions are appended to it as always visible.
+        """
+        batch_size = embeds.shape[0]
+        mask_batch, _ = mask.shape  # bs, max_protein_length
+        always_visible = torch.ones(mask_batch, self.num_latents).to(mask.device)
+        mask = torch.cat((mask, always_visible), dim=1)  # bs, max_protein_length + num_latents
+
+        # Broadcast the shared latents to one copy per sample, then match the dtype of `embeds`.
+        batch_ones = torch.ones(batch_size).to(self.latents.device)
+        latents = (self.latents[None] * batch_ones.view(-1, 1, 1)).to(embeds.dtype)  # [b,n,d]
+        for attn, ff in self.layers:
+            latents = attn(embeds, latents, mask) + latents  # residual around attention
+            latents = ff(latents) + latents  # residual around feed-forward
+
+        return self.norm(self.protein_projector(latents))
+
+
+@dataclass
+@auto_docstring
+class EvollaProteinEncoderModelOutput(ModelOutput):
+    sequence_compressor_output: torch.FloatTensor = None  # resampler output, as set by `EvollaProteinEncoder.forward`
+    last_hidden_state: Optional[torch.FloatTensor] = None  # last hidden state of the wrapped protein encoder
+    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None  # not passed by `EvollaProteinEncoder.forward`
+    attentions: Optional[tuple[torch.FloatTensor, ...]] = None  # not passed by `EvollaProteinEncoder.forward`
+
+
+class EvollaProteinEncoder(nn.Module):
+    def __init__(self, config: EvollaConfig):
+        super().__init__()
+        self.model = EvollaSaProtProteinEncoder(config=config.protein_encoder_config)
+        self.sequence_compressor_resampler = EvollaSequenceCompressorResampler(config=config)
+
+    @can_return_tuple
+    def forward(self, input_ids: torch.LongTensor, attention_mask: torch.FloatTensor, **kwargs):
+        """Encode the protein tokens, then compress the encoder states with the resampler (`kwargs` are unused)."""
+        encoded = self.model(input_ids=input_ids, attention_mask=attention_mask)
+        last_hidden_state = encoded.last_hidden_state
+        compressed = self.sequence_compressor_resampler(last_hidden_state, attention_mask)
+        return EvollaProteinEncoderModelOutput(
+            sequence_compressor_output=compressed,
+            last_hidden_state=last_hidden_state,
+        )
+
+
+class EvollaSequenceAlignerCrossAttention(nn.Module):
+    def __init__(
+        self,
+        config,
+        protein_encoder_dim: Optional[int] = None,
+        structure_encoder_dim: Optional[int] = None,
+        msa_encoder_dim: Optional[int] = None,
+    ):
+        super().__init__()
+
+        self.hidden_size = config.hidden_size
+        self.num_attention_heads = config.num_attention_heads
+        self.scale = self.num_attention_heads**-0.5
+        self.attention_head_size = int(self.hidden_size / self.num_attention_heads)
+        self.all_head_size = self.num_attention_heads * self.attention_head_size
+
+        attention_probs_dropout_prob = config.aligner_attention_probs_dropout_prob
+        enable_bias = config.aligner_enable_bias
+        ffn_mult = config.aligner_ffn_mult
+
+        self.query = nn.Linear(self.hidden_size, self.all_head_size)
+        if protein_encoder_dim is not None:
+            self.key_protein = nn.Linear(protein_encoder_dim, self.all_head_size)
+            self.value_protein = nn.Linear(protein_encoder_dim, self.all_head_size)
+        else:
+            self.key_protein = None
+            self.value_protein = None
+
+        if structure_encoder_dim is not None:
+            self.key_structure = nn.Linear(structure_encoder_dim, self.all_head_size)
+            self.value_structure = nn.Linear(structure_encoder_dim, self.all_head_size)
+        else:
+            self.key_structure = None
+            self.value_structure = None
+
+        if msa_encoder_dim is not None:
+            self.key_msa = nn.Linear(msa_encoder_dim, self.all_head_size)
+            self.value_msa = nn.Linear(msa_encoder_dim, self.all_head_size)
+        else:
+            self.key_msa = None
+            self.value_msa = None
+
+        self.attention_norm = EvollaRMSNorm(self.hidden_size)
+
+        self.dropout = nn.Dropout(attention_probs_dropout_prob)
+
+        self.out_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=enable_bias)
+
+        self.ff = EvollaFeedForward(self.hidden_size, ffn_mult)
+        self.gate_attention = nn.Parameter(torch.tensor([0.0]))
+        self.gate_ffw = nn.Parameter(torch.tensor([0.0]))
+
+    def cross_attention(
+        self,
+        query_states,
+        protein_key_value_states,
+        structure_key_value_states,
+        msa_key_value_states,
+        query_attn_mask,
+        protein_kv_attn_mask,
+        structure_kv_attn_mask,
+        msa_kv_attn_mask,
+    ):
+        """
+        query_states: text
+        key_value_states: protein
+        query_states: [bs, query_seq_len, dim]
+        key_value_states: [bs, kv_seq_len, dim]
+        query_attn_mask: [bs, query_seq_len]
+        kv_attn_mask: [bs, kv_seq_len]
+        """
+
+        # Concatenate protein and structure
+        kv_attn_mask = [protein_kv_attn_mask, structure_kv_attn_mask, msa_kv_attn_mask]
+        kv_attn_mask = [_ for _ in kv_attn_mask if _ is not None]
+        if not kv_attn_mask:
+            raise ValueError("At least one modality should be provided for cross attention.")
+        kv_attn_mask = torch.cat(kv_attn_mask, dim=1)
+
+        query_layer = self.attention_norm(query_states)
+
+        # Warning: This place might cause issues, refers to
+        # https://discuss.pytorch.org/t/cuda-error-cublas-status-not-supported-when-calling-cublasltmatmul-from-torch-nn-functional-linear/170214/13
+        # Solution: add `DISABLE_ADDMM_CUDA_LT=1` as environment variable
+        # Apply linear transformation to input_query, input_key, and input_value
+        query_layer = self.query(query_layer)  # [bs, querylength, dim]
+
+        if self.key_protein is not None and self.value_protein is not None:
+            protein_key_value_states = protein_key_value_states.to(query_states)
+            key_layer_protein = self.key_protein(protein_key_value_states)  # [bs, keylength, dim]
+            value_layer_protein = self.value_protein(protein_key_value_states)  # [bs, keylength, dim]
+        else:
+            key_layer_protein = None
+            value_layer_protein = None
+
+        if self.key_structure is not None and self.value_structure is not None:
+            structure_key_value_states = structure_key_value_states.to(query_states)
+            key_layer_structure = self.key_structure(structure_key_value_states)  # [bs, keylength, dim]
+            value_layer_structure = self.value_structure(structure_key_value_states)  # [bs, keylength, dim]
+        else:
+            key_layer_structure = None
+            value_layer_structure = None
+
+        if self.key_msa is not None and self.value_msa is not None:
+            msa_key_value_states = msa_key_value_states.to(query_states)
+            key_layer_msa = self.key_msa(msa_key_value_states)  # [bs, keylength, dim]
+            value_layer_msa = self.value_msa(msa_key_value_states)  # [bs, keylength, dim]
+        else:
+            key_layer_msa = None
+            value_layer_msa = None
+
+        key_layer = [key_layer_protein, key_layer_structure, key_layer_msa]
+        key_layer = [_ for _ in key_layer if _ is not None]
+        key_layer = torch.cat(key_layer, dim=1)
+
+        value_layer = [value_layer_protein, value_layer_structure, value_layer_msa]
+        value_layer = [_ for _ in value_layer if _ is not None]
+        value_layer = torch.cat(value_layer, dim=1)
+
+        new_query_layer_shape = query_layer.size()[:-1] + (
+            self.num_attention_heads,
+            self.attention_head_size,
+        )
+        query_layer = query_layer.view(*new_query_layer_shape).permute(0, 2, 1, 3)
+
+        new_key_layer_shape = key_layer.size()[:-1] + (
+            self.num_attention_heads,
+            self.attention_head_size,
+        )
+        key_layer = key_layer.view(*new_key_layer_shape).permute(0, 2, 1, 3)
+
+        new_value_layer_shape = value_layer.size()[:-1] + (
+            self.num_attention_heads,
+            self.attention_head_size,
+        )
+        value_layer = value_layer.view(*new_value_layer_shape).permute(0, 2, 1, 3)
+
+        query_layer = query_layer * self.scale
+
+        # attention_mask: [bs, 1, querylength, keylength]
+        if query_attn_mask is None:
+            query_attn_mask = torch.ones(query_states.size(0), query_states.size(1)).to(query_states.device)
+        attention_mask = query_attn_mask[:, None, :, None] * kv_attn_mask[:, None, None, :]
+        # Compute the scaled dot-product attention scores
+        attn_weights = torch.matmul(query_layer, key_layer.transpose(-1, -2))  # [bs, numheads, querylength, keylength]
+        attn_weights = attn_weights - attn_weights.amax(dim=-1, keepdim=True).detach()  # To stablize score
+        attention_scores = attn_weights.masked_fill(
+            (1 - attention_mask).bool(), torch.finfo(attn_weights.dtype).min
+        )  # [bs, numheads, querylength, keylength]
+
+        attention_probs = nn.Softmax(dim=-1)(attention_scores)
+
+        # attention_probs_dropped = self.dropout(attention_probs)
+
+        context_layer = torch.matmul(attention_probs, value_layer)  # [bs, numheads, querylength, dim/numheads]
+
+        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
+        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
+        context_layer = context_layer.view(*new_context_layer_shape)
+
+        context_layer = self.out_proj(context_layer)
+
+        return context_layer
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        query_states,
+        protein_kv_states,
+        structure_kv_states,
+        msa_kv_states,
+        query_attn_mask,
+        protein_kv_attn_mask=None,
+        structure_kv_attn_mask=None,
+        msa_kv_attn_mask=None,
+        protein_batch_mask=None,
+        structure_batch_mask=None,
+        msa_batch_mask=None,
+        past_key_values=None,
+    ):
+        if protein_kv_states is not None:
+            bs, protein_kv_seq_len, dim = protein_kv_states.shape
+            if protein_kv_attn_mask is None:
+                protein_kv_attn_mask = (
+                    torch.ones(bs, protein_kv_seq_len).to(protein_batch_mask.device)
+                    * protein_batch_mask.expand(size=(protein_kv_seq_len, bs)).T
+                ).to(protein_kv_states.device)
+        else:
+            protein_kv_attn_mask = None
+
+        if structure_kv_states is not None:
+            bs, structure_kv_seq_len, dim = structure_kv_states.shape
+            if structure_kv_attn_mask is None:
+                structure_kv_attn_mask = (
+                    torch.ones(bs, structure_kv_seq_len).to(protein_batch_mask.device)
+                    * structure_batch_mask.expand(size=(structure_kv_seq_len, bs)).T
+                ).to(structure_kv_states.device)
+        else:
+            structure_kv_attn_mask = None
+
+        if msa_kv_states is not None:
+            bs, msa_kv_seq_len, dim = msa_kv_states.shape
+            if msa_kv_attn_mask is None:
+                msa_kv_attn_mask = (
+                    torch.ones(bs, msa_kv_seq_len).to(protein_batch_mask.device)
+                    * msa_batch_mask.expand(size=(msa_kv_seq_len, bs)).T
+                ).to(msa_kv_states.device)
+        else:
+            msa_kv_attn_mask = None
+        hidden_states = query_states
+        # only when there's at least one valid modality, crossattention will be performed
+        if (
+            (protein_kv_states is not None and protein_kv_attn_mask.any())
+            or (structure_kv_states is not None and structure_kv_attn_mask.any())
+            or (msa_kv_states is not None and msa_kv_attn_mask.any())
+        ):
+            residual = hidden_states
+            hidden_states = self.cross_attention(
+                query_states=hidden_states,
+                protein_key_value_states=protein_kv_states,
+                structure_key_value_states=structure_kv_states,
+                msa_key_value_states=msa_kv_states,
+                query_attn_mask=query_attn_mask,
+                protein_kv_attn_mask=protein_kv_attn_mask,
+                structure_kv_attn_mask=structure_kv_attn_mask,
+                msa_kv_attn_mask=msa_kv_attn_mask,
+            )  # [bs, query_seq_len, dim]
+            # tanh gate
+            hidden_states = torch.tanh(self.gate_attention) * hidden_states
+
+            hidden_states = residual + hidden_states  # input_query
+
+            residual = hidden_states
+            hidden_states = self.ff(hidden_states) * torch.tanh(self.gate_ffw)
+            hidden_states = residual + hidden_states
+
+        return hidden_states
+
+
+class EvollaRMSNorm(LlamaRMSNorm):
+    pass
+
+
+class EvollaRotaryEmbedding(LlamaRotaryEmbedding):
+    pass
+
+
+class EvollaMLP(LlamaMLP):
+    pass
+
+
+class EvollaAttention(LlamaAttention):
+    pass
+
+
+class EvollaDecoderLayer(LlamaDecoderLayer):
+    def __init__(self, config: EvollaConfig, layer_idx: int):
+        super().__init__(config, layer_idx)
+        if (layer_idx + 1) % max(config.num_hidden_layers // config.aligner_num_add_layers, 1) == 0:
+            self.adapter = EvollaSequenceAlignerCrossAttention(
+                config,
+                protein_encoder_dim=config.hidden_size,
+            )
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        use_cache: Optional[bool] = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        protein_kv_states: Optional[torch.Tensor] = None,
+        structure_kv_states: Optional[torch.Tensor] = None,
+        msa_kv_states: Optional[torch.Tensor] = None,
+        protein_batch_mask: Optional[torch.Tensor] = None,
+        structure_batch_mask: Optional[torch.Tensor] = None,
+        msa_batch_mask: Optional[torch.Tensor] = None,
+        query_attn_mask: Optional[torch.Tensor] = None,
+        **kwargs,
+    ):
+        residual = hidden_states
+
+        hidden_states = self.input_layernorm(hidden_states)
+
+        # Self Attention
+        hidden_states, _ = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            position_embeddings=position_embeddings,
+            **kwargs,
+        )
+        hidden_states = residual + hidden_states
+
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states
+
+        if hasattr(self, "adapter"):
+            hidden_states = self.adapter(
+                query_states=hidden_states,
+                protein_kv_states=protein_kv_states,
+                structure_kv_states=structure_kv_states,
+                msa_kv_states=msa_kv_states,
+                query_attn_mask=query_attn_mask,
+                protein_batch_mask=protein_batch_mask,
+                structure_batch_mask=structure_batch_mask,
+                msa_batch_mask=msa_batch_mask,
+            )
+
+        return hidden_states
+
+
+class EvollaPreTrainedModel(LlamaPreTrainedModel):
+    _supports_flash_attn = False  # see dependency on `EvollaSaProtProteinEncoder`
+    _supports_flex_attn = False  # see dependency on `EvollaSaProtProteinEncoder`
+    _supports_attention_backend = False
+    _no_split_modules = [
+        "EvollaDecoderLayer",
+        "EvollaSequenceCompressorResampler",
+        "EvollaSequenceAlignerCrossAttention",
+    ]
+
+    def _init_weights(self, module):
+        std = self.config.initializer_range
+        PreTrainedModel._init_weights(self, module)
+        if isinstance(module, EvollaSequenceAlignerCrossAttention):
+            module.gate_attention.zero_()
+            module.gate_ffw.zero_()
+            module.attention_norm.weight.data.fill_(1.0)
+        elif isinstance(module, EvollaSequenceCompressorResampler):
+            module.latents.data.normal_(mean=0.0, std=std)
+
+
+class EvollaModel(EvollaPreTrainedModel):
+    def __init__(self, config: EvollaConfig):
+        super().__init__(config)
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+        self.embed_tokens = nn.Embedding(self.vocab_size, config.hidden_size, self.padding_idx)
+        self.protein_encoder = EvollaProteinEncoder(config=config)
+        self.layers = nn.ModuleList(
+            [
+                EvollaDecoderLayer(
+                    config=config,
+                    layer_idx=layer_idx,
+                )
+                for layer_idx in range(config.num_hidden_layers)
+            ]
+        )
+
+        self.norm = EvollaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.rotary_emb = EvollaRotaryEmbedding(config=config)
+        self.gradient_checkpointing = getattr(config, "gradient_checkpointing", False)
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.embed_tokens
+
+    def set_input_embeddings(self, value):
+        self.embed_tokens = value
+
+    @auto_docstring
+    @check_model_inputs
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        protein_input_ids: Optional[torch.LongTensor] = None,
+        protein_attention_mask: Optional[torch.Tensor] = None,
+        structure_feats: Optional[torch.FloatTensor] = None,
+        msa_feats: Optional[torch.FloatTensor] = None,
+        structure_batch_mask: Optional[torch.Tensor] = None,
+        msa_batch_mask: Optional[torch.Tensor] = None,
+        **kwargs,
+    ) -> Union[tuple, BaseModelOutputWithPast]:
+        r"""
+        protein_input_ids (torch.LongTensor):
+            The input IDs for the protein sequence in structure-aware tokens. Should be of shape `(batch_size, protein_seq_length)` and type `torch.LongTensor`.
+        protein_attention_mask (torch.Tensor):
+            The attention mask for the protein sequence. Should be of shape `(batch_size, protein_seq_length)` and type `torch.Tensor`.
+        structure_feats (torch.FloatTensor):
+            The input IDs for purely structure-based features. Should be of shape `(batch_size, structure_seq_length, structure_feat_dim)` and type `torch.FloatTensor`. Dummy input for now.
+        msa_feats (torch.FloatTensor):
+            The input IDs for purely MSA-based features. Should be of shape `(batch_size, msa_seq_length, msa_feat_dim)` and type `torch.FloatTensor`. Dummy input for now.
+        structure_batch_mask (torch.Tensor):
+            The batch mask to decide which protein sequences are purely structure-based. Should be of shape `(batch_size)` and type `torch.Tensor`. Should be paired with `structure_feats`. Dummy input for now.
+        msa_batch_mask (torch.Tensor):
+            The batch mask to decide which protein sequences are purely MSA-based. Should be of shape `(batch_size)` and type `torch.Tensor`. Should be paired with `msa_feats`. Dummy input for now.
+        """
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+
+        if use_cache and past_key_values is None:
+            past_key_values = DynamicCache(config=self.config)
+
+        if cache_position is None:
+            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+            cache_position = torch.arange(
+                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
+            )
+
+        if position_ids is None:
+            position_ids = cache_position.unsqueeze(0)
+
+        protein_feats = None
+        protein_batch_mask = None
+        # If provided, actually compute them
+        if protein_input_ids is not None and protein_attention_mask is not None:
+            protein_outputs = self.protein_encoder(
+                input_ids=protein_input_ids,
+                attention_mask=protein_attention_mask,
+            )
+            protein_feats = protein_outputs.sequence_compressor_output
+            protein_batch_mask = torch.tensor([True] * protein_input_ids.shape[0], device=protein_input_ids.device)
+
+        causal_mask = create_causal_mask(
+            config=self.config,
+            input_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            cache_position=cache_position,
+            past_key_values=past_key_values,
+        )
+
+        hidden_states = inputs_embeds
+
+        # create position embeddings to be shared across the decoder layers
+        position_embeddings = self.rotary_emb(hidden_states, position_ids)
+
+        for decoder_layer in self.layers:
+            hidden_states = decoder_layer(
+                hidden_states,
+                attention_mask=causal_mask,
+                position_ids=position_ids,
+                past_key_values=past_key_values,
+                use_cache=use_cache,
+                cache_position=cache_position,
+                position_embeddings=position_embeddings,
+                protein_kv_states=protein_feats,
+                structure_kv_states=structure_feats,
+                msa_kv_states=msa_feats,
+                protein_batch_mask=protein_batch_mask,
+                structure_batch_mask=structure_batch_mask,
+                msa_batch_mask=msa_batch_mask,
+                query_attn_mask=attention_mask,
+                **kwargs,
+            )
+
+        hidden_states = self.norm(hidden_states)
+
+        output = BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values,
+        )
+        return output
+
+
+class EvollaForProteinText2Text(EvollaPreTrainedModel, GenerationMixin):
+    def __init__(self, config):
+        super().__init__(config)
+        self.model = EvollaModel(config)
+        self.vocab_size = config.vocab_size
+        self.lm_head = nn.Linear(config.hidden_size, self.vocab_size, bias=False)
+
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.model.get_input_embeddings()
+
+    def set_input_embeddings(self, value):
+        return self.model.set_input_embeddings(value)
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,  # text input ids
+        attention_mask: Optional[torch.Tensor] = None,  # text attention mask
+        inputs_embeds: Optional[torch.FloatTensor] = None,  # text input embeddings
+        labels: Optional[torch.LongTensor] = None,
+        protein_input_ids: torch.LongTensor = None,
+        protein_attention_mask: Optional[torch.Tensor] = None,
+        use_cache: Optional[bool] = None,
+        **kwargs,
+    ):
+        r"""
+        protein_input_ids (torch.LongTensor):
+            The input IDs for the protein sequence. Should be of shape `(batch_size, protein_seq_length)` and type `torch.LongTensor`.
+        protein_attention_mask (torch.Tensor):
+            The attention mask for the protein sequence. Should be of shape `(batch_size, protein_seq_length)` and type `torch.Tensor`.
+
+        Example:
+
+        ```python
+        >>> from transformers import EvollaProcessor, EvollaForProteinText2Text
+        >>> model = EvollaForProteinText2Text.from_pretrained("westlake/Evolla-10B-hf")
+        >>> processor = EvollaProcessor.from_pretrained("westlake/Evolla-10B-hf")
+
+        >>> protein_information = {
+        ...     "aa_seq": "your amino acid sequence",
+        ...     "foldseek": "your foldseek sequence",
+        ... }
+        >>> question = "What is the function of this protein?"
+        >>> message = [
+        ...     {"role": "system", "content": "You are an AI expert that can answer any questions about protein."},
+        ...     {"role": "user", "content": question},
+        ... ]
+
+        >>> inputs = processor(proteins=[protein_information], messages_list=[message], return_tensors="pt", padding="longest")
+        >>> outputs = model.generate(**inputs)
+
+        >>> print(processor.batch_decode(outputs, skip_special_tokens=True))
+        ```"""
+
+        outputs = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            inputs_embeds=inputs_embeds,
+            protein_input_ids=protein_input_ids,
+            protein_attention_mask=protein_attention_mask,
+            use_cache=use_cache,
+            **kwargs,
+        )
+        hidden_states = outputs[0]
+        logits = self.lm_head(hidden_states)
+
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.vocab_size, **kwargs)
+
+        lm_outputs = CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+        return lm_outputs
+
+
+__all__ = ["EvollaForProteinText2Text", "EvollaModel", "EvollaPreTrainedModel"]
diff --git a/phivenv/Lib/site-packages/transformers/models/evolla/processing_evolla.py b/phivenv/Lib/site-packages/transformers/models/evolla/processing_evolla.py
new file mode 100644
index 0000000000000000000000000000000000000000..a421287676112b73cdfa4adc67a9b1dd13ecc861
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/evolla/processing_evolla.py
@@ -0,0 +1,247 @@
+# coding=utf-8
+# Copyright 2025 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Processor class for EVOLLA.
+"""
+
+import os
+from typing import Optional, Union
+
+from ...feature_extraction_utils import BatchFeature
+from ...processing_utils import (
+    ProcessorMixin,
+)
+from ..auto import AutoTokenizer
+
+
+PROTEIN_VALID_KEYS = ["aa_seq", "foldseek", "msa"]
+
+
+class EvollaProcessor(ProcessorMixin):
+    r"""
+    Constructs an EVOLLA processor which wraps a Llama tokenizer and a SaProt tokenizer (EsmTokenizer) into a single processor.
+
+    [`EvollaProcessor`] offers all the functionalities of [`EsmTokenizer`] and [`LlamaTokenizerFast`]. See the
+    docstring of [`~EvollaProcessor.__call__`] and [`~EvollaProcessor.decode`] for more information.
+
+    Args:
+        protein_tokenizer (`EsmTokenizer`):
+            An instance of [`EsmTokenizer`]. The protein tokenizer is a required input.
+        tokenizer (`LlamaTokenizerFast`, *optional*):
+            An instance of [`LlamaTokenizerFast`]. Although the argument defaults to `None`, a tokenizer must be provided; a `ValueError` is raised otherwise.
+        protein_max_length (`int`, *optional*, defaults to 1024):
+            The maximum length of the tokenized protein sequence; longer sequences are truncated.
+        text_max_length (`int`, *optional*, defaults to 512):
+            The maximum length of the tokenized text prompt; longer prompts are truncated.
+    """
+
+    attributes = ["protein_tokenizer", "tokenizer"]
+    valid_kwargs = ["sequence_max_length"]
+    # protein_tokenizer_class = "EsmTokenizer"
+    # tokenizer_class = "LlamaTokenizerFast"
+    protein_tokenizer_class = "AutoTokenizer"
+    tokenizer_class = "AutoTokenizer"
+    protein_tokenizer_dir_name = "protein_tokenizer"
+    # tokenizer_dir_name = "text_tokenizer"
+
+    def __init__(self, protein_tokenizer, tokenizer=None, protein_max_length=1024, text_max_length=512, **kwargs):
+        if protein_tokenizer is None:
+            raise ValueError("You need to specify an `protein_tokenizer`.")
+        if tokenizer is None:
+            raise ValueError("You need to specify a `tokenizer`.")
+
+        super().__init__(protein_tokenizer, tokenizer)
+
+        self.tokenizer.pad_token = "<|reserved_special_token_0|>"
+        self.protein_max_length = protein_max_length
+        self.text_max_length = text_max_length
+
+    def process_proteins(self, proteins, protein_max_length=1024):
+        sa_sequences = []
+        for protein in proteins:
+            aa_seq = protein.get("aa_seq")
+            foldseek = protein.get("foldseek")
+            sa_sequence = "".join([s.upper() + f.lower() for s, f in zip(aa_seq, foldseek)])
+            sa_sequences.append(sa_sequence)
+
+        sa_tokens = self.protein_tokenizer.batch_encode_plus(
+            sa_sequences, return_tensors="pt", truncation=True, max_length=protein_max_length, padding=True
+        )
+        return sa_tokens
+
+    def process_text(
+        self,
+        texts,
+        text_max_length: int = 512,
+    ):
+        prompts = []
+        for messages in texts:
+            prompt = self.tokenizer.apply_chat_template(
+                messages,
+                tokenize=False,
+                add_generation_prompt=True,
+            )
+            prompts.append(prompt)
+
+        prompt_inputs = self.tokenizer(
+            prompts,
+            add_special_tokens=False,
+            return_tensors="pt",
+            padding="longest",
+            truncation=True,
+            max_length=text_max_length,
+        )
+        return prompt_inputs
+
+    def __call__(
+        self,
+        proteins: Optional[Union[list[dict], dict]] = None,
+        messages_list: Optional[Union[list[list[dict]], list[dict]]] = None,
+        protein_max_length: Optional[int] = None,
+        text_max_length: Optional[int] = None,
+        **kwargs,
+    ):
+        r"""This method takes batched or non-batched proteins and messages_list and converts them into a format that can be used by
+        the model.
+
+        Args:
+            proteins (`Union[List[dict], dict]`):
+                A list of dictionaries or a single dictionary containing the following keys:
+                    - `"aa_seq"` (`str`) -- The amino acid sequence of the protein.
+                    - `"foldseek"` (`str`) -- The foldseek string of the protein.
+            messages_list (`Union[List[List[dict]], List[dict]]`):
+                A list of lists of dictionaries or a list of dictionaries containing the following keys:
+                    - `"role"` (`str`) -- The role of the message.
+                    - `"content"` (`str`) -- The content of the message.
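+            protein_max_length (`int`, *optional*):
+                The maximum length of the tokenized protein sequence; longer sequences are truncated. Falls back to `self.protein_max_length` when `None`.
+            text_max_length (`int`, *optional*):
+                The maximum length of the tokenized text prompt; longer prompts are truncated. Falls back to `self.text_max_length` when `None`.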
+
+        Return:
+            a [`BatchFeature`] with the following keys:
+                - `protein_input_ids` (`torch.Tensor` of shape `(batch_size, sequence_length)`) -- The input IDs for the protein sequence.
+                - `protein_attention_mask` (`torch.Tensor` of shape `(batch_size, sequence_length)`) -- The attention mask for the protein sequence.
+                - `input_ids` (`torch.Tensor` of shape `(batch_size, sequence_length)`) -- The input IDs for the text sequence.
+                - `attention_mask` (`torch.Tensor` of shape `(batch_size, sequence_length)`) -- The attention mask for the text sequence.
+        """
+        # proteins and messages_list should be provided
+        if proteins is None or messages_list is None:
+            raise ValueError("You need to specify `messages_list` and `proteins`.")
+
+        protein_max_length = protein_max_length if protein_max_length is not None else self.protein_max_length
+        text_max_length = text_max_length if text_max_length is not None else self.text_max_length
+
+        # proteins should be List[dict]
+        if isinstance(proteins, dict):
+            proteins = [proteins]
+        # messages_list should be List[List[dict]]
+        if isinstance(messages_list, (list, tuple)) and not isinstance(messages_list[0], (list, tuple)):
+            messages_list = [messages_list]
+        # Check if batched proteins are in the correct format
+        if isinstance(proteins, (list, tuple)) and not all(isinstance(p, dict) for p in proteins):
+            raise ValueError("The proteins should be a list of dictionaries, but not all elements are dictionaries.")
+        if isinstance(proteins, (list, tuple)) and not all(
+            all(k in PROTEIN_VALID_KEYS for k in p.keys()) for p in proteins
+        ):
+            raise ValueError(
+                "There should be a list of dictionaries with keys: "
+                f"{', '.join(PROTEIN_VALID_KEYS)} for each protein."
+                f"But got: {proteins}"
+            )
+        # Check if batched messages_list is in the correct format
+        if isinstance(messages_list, (list, tuple)):
+            for messages in messages_list:
+                if not isinstance(messages, (list, tuple)):
+                    raise ValueError(f"Each messages in messages_list should be a list instead of {type(messages)}.")
+                if not all(isinstance(m, dict) for m in messages):
+                    raise ValueError(
+                        "Each message in messages_list should be a list of dictionaries, but not all elements are dictionaries."
+                    )
+                if any(len(m.keys()) != 2 for m in messages) or any(
+                    set(m.keys()) != {"role", "content"} for m in messages
+                ):
+                    raise ValueError(
+                        "Each message in messages_list should be a list of dictionaries with two keys: 'role' and 'content'."
+                        f"But got: {messages}"
+                    )
+        else:
+            raise ValueError(
+                f"The messages_list should be a list of lists of dictionaries, but it's {type(messages_list)}."
+            )
+        sa_tokens = self.process_proteins(proteins, protein_max_length)
+
+        text_tokens = self.process_text(messages_list, text_max_length)
+
+        return BatchFeature(
+            data={
+                "protein_input_ids": sa_tokens["input_ids"],
+                "protein_attention_mask": sa_tokens["attention_mask"],
+                "input_ids": text_tokens["input_ids"],
+                "attention_mask": text_tokens["attention_mask"],
+            }
+        )
+
+    def batch_decode(self, *args, **kwargs):
+        return self.tokenizer.batch_decode(*args, **kwargs)
+
+    def decode(self, *args, **kwargs):
+        return self.tokenizer.decode(*args, **kwargs)
+
+    def protein_batch_decode(self, *args, **kwargs):
+        return self.protein_tokenizer.batch_decode(*args, **kwargs)
+
+    def protein_decode(self, *args, **kwargs):
+        return self.protein_tokenizer.decode(*args, **kwargs)
+
+    # overwrite to save the protein tokenizer in a separate folder
+    # Adapted from instructblip.processing_instructblip.py (https://github.com/huggingface/transformers/blob/9b479a245b793cac2a8b2e87c6d8e81bb24e20c4/src/transformers/models/instructblip/processing_instructblip.py#L191-L221)
+    def save_pretrained(self, save_directory, **kwargs):
+        # only save the protein tokenizer in sub_dir
+        self.protein_tokenizer.save_pretrained(os.path.join(save_directory, self.protein_tokenizer_dir_name))
+
+        # we modify the attributes so that only the text tokenizer is saved in the main folder
+        protein_tokenizer_present = "protein_tokenizer" in self.attributes
+        # find the correct position of it in the attributes list
+        protein_tokenizer_index = self.attributes.index("protein_tokenizer") if protein_tokenizer_present else None
+        if protein_tokenizer_present and protein_tokenizer_index is not None:
+            self.attributes.remove("protein_tokenizer")
+
+        outputs = super().save_pretrained(save_directory, **kwargs)
+
+        if protein_tokenizer_present and protein_tokenizer_index is not None:
+            self.attributes.insert(protein_tokenizer_index, "protein_tokenizer")
+
+        return outputs
+
+    # overwrite to load the protein tokenizer from a separate folder
+    # Adapted from instructblip.processing_instructblip.py (https://github.com/huggingface/transformers/blob/9b479a245b793cac2a8b2e87c6d8e81bb24e20c4/src/transformers/models/instructblip/processing_instructblip.py#L191-L221)
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
+        processor = super().from_pretrained(pretrained_model_name_or_path, **kwargs)
+
+        # if return_unused_kwargs a tuple is returned where the second element is 'unused_kwargs'
+        if isinstance(processor, tuple):
+            processor = processor[0]
+        protein_tokenizer = AutoTokenizer.from_pretrained(
+            pretrained_model_name_or_path, subfolder=cls.protein_tokenizer_dir_name
+        )
+
+        processor.protein_tokenizer = protein_tokenizer
+
+        return processor
+
+
+__all__ = ["EvollaProcessor"]
diff --git a/phivenv/Lib/site-packages/transformers/models/exaone4/__init__.py b/phivenv/Lib/site-packages/transformers/models/exaone4/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..c646c4e75273560116ae230d672ba10d305517de
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/exaone4/__init__.py
@@ -0,0 +1,27 @@
+# Copyright 2025 The LG AI Research and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_exaone4 import *
+    from .modeling_exaone4 import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/phivenv/Lib/site-packages/transformers/models/exaone4/__pycache__/__init__.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/exaone4/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ba8cb8a051fac688927a026abcf5a2fc934d2338
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/exaone4/__pycache__/__init__.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/exaone4/__pycache__/configuration_exaone4.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/exaone4/__pycache__/configuration_exaone4.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cc632f1f412c82f060669aaad1afa6d2f35faa21
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/exaone4/__pycache__/configuration_exaone4.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/exaone4/__pycache__/modeling_exaone4.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/exaone4/__pycache__/modeling_exaone4.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b1fa2adb8ddc2137fcf81e4dfe188b02aa151c1a
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/exaone4/__pycache__/modeling_exaone4.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/exaone4/__pycache__/modular_exaone4.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/exaone4/__pycache__/modular_exaone4.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4fea304d4ac188aa32b3fb0d1f19b529ba09ad20
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/exaone4/__pycache__/modular_exaone4.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/exaone4/configuration_exaone4.py b/phivenv/Lib/site-packages/transformers/models/exaone4/configuration_exaone4.py
new file mode 100644
index 0000000000000000000000000000000000000000..d80d7d0ca381e7d3f9270ff3fc2d6eda66f51b14
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/exaone4/configuration_exaone4.py
@@ -0,0 +1,223 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/exaone4/modular_exaone4.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_exaone4.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# coding=utf-8
+# Copyright 2025 The LG AI Research and HuggingFace Inc. team. All rights reserved.
+#
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from ...configuration_utils import PretrainedConfig, layer_type_validation
+
+
+class Exaone4Config(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of an [`Exaone4Model`]. It is used to
+    instantiate an EXAONE 4.0 model according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of the EXAONE-4.0-Instruct [LGAI-EXAONE/EXAONE-4.0-Instruct](https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-Instruct)
+    NOTE: `EXAONE-4.0-Instruct` is a placeholder model ID. The exact model ID will be updated in the future.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+    outputs. Read the documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 102400):
+            Vocabulary size of the EXAONE 4.0 model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`Exaone4Model`].
+        hidden_size (`int`, *optional*, defaults to 4096):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 16384):
+            Dimensionality of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 32):
+            Number of hidden layers in the Transformer decoder.
+        num_attention_heads (`int`, *optional*, defaults to 32):
+            Number of attention heads for each attention layer in the Transformer decoder.
+        num_key_value_heads (`int`, *optional*, defaults to 32):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details checkout [this
+            paper](https://huggingface.co/papers/2305.13245). If `None` is passed, it falls back to
+            `num_attention_heads`.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the decoder.
+        max_position_embeddings (`int`, *optional*, defaults to 2048):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            just in case (e.g., 32768 for EXAONE 3.5).
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the layer normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if ``config.is_decoder=True``.
+        bos_token_id (`int`, *optional*, defaults to 0):
+            Beginning of stream token id.
+        eos_token_id (`int`, *optional*, defaults to 2):
+            End of stream token id.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether to tie weight embeddings
+        rope_theta (`float`, *optional*, defaults to 10000.0):
+            The base period of the RoPE embeddings.
+        rope_scaling (`Dict`, *optional*):
+            Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
+            and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
+            accordingly.
+            Expected contents:
+                `rope_type` (`str`):
+                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
+                    'llama3'], with 'default' being the original RoPE implementation.
+                `factor` (`float`, *optional*):
+                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
+                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
+                    original maximum pre-trained length.
+                `original_max_position_embeddings` (`int`, *optional*):
+                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
+                    pretraining.
+                `attention_factor` (`float`, *optional*):
+                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
+                    computation. If unspecified, it defaults to value recommended by the implementation, using the
+                    `factor` field to infer the suggested value.
+                `beta_fast` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 32.
+                `beta_slow` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 1.
+                `short_factor` (`List[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `long_factor` (`List[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to long contexts (>
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `low_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
+                `high_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        sliding_window (`int`, *optional*, defaults to 4096):
+            The size of the sliding window for the sliding window attention. `None` makes every derived layer global.
+        sliding_window_pattern (`int` or `str`, *optional*, defaults to 4):
+            The pattern to use for sliding window attention. Can be one of:
+                - `None` or `0`: No sliding window attention is used
+                - `int`: Every `sliding_window_pattern`-th layer uses global attention, all others local attention.
+                - `str`: A sequence of "L" (local attention) and "G" (global attention) characters that defines the
+                  attention pattern. NOTE: this constructor only derives `layer_types` from an `int` pattern; with a
+                  `str` pattern, pass `layer_types` explicitly.
+            For instance, sliding_window_pattern=4 (documented as equivalent to "LLLG") means:
+                - Layer 0, 1, 2: local attention,
+                - Layer 3: global attention,
+                ...(repeated)
+        layer_types (`list`, *optional*):
+            Attention pattern for each layer. Prioritized over `sliding_window_pattern`.
+
+    Example:
+
+    ```python
+    >>> from transformers import Exaone4Model, Exaone4Config
+
+    >>> # Initializing a EXAONE configuration
+    >>> configuration = Exaone4Config()
+
+    >>> # Initializing a model from configuration
+    >>> model = Exaone4Model(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "exaone4"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    # Default tensor parallel plan for the base model `Exaone4Model`
+    base_model_tp_plan = {
+        "layers.*.self_attn.q_proj": "colwise",
+        "layers.*.self_attn.k_proj": "colwise",
+        "layers.*.self_attn.v_proj": "colwise",
+        "layers.*.self_attn.o_proj": "rowwise",
+        "layers.*.mlp.gate_proj": "colwise",
+        "layers.*.mlp.up_proj": "colwise",
+        "layers.*.mlp.down_proj": "rowwise",
+    }
+    base_model_pp_plan = {
+        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
+        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
+        "norm": (["hidden_states"], ["hidden_states"]),
+    }
+
+    def __init__(
+        self,
+        vocab_size=102400,
+        hidden_size=4096,
+        intermediate_size=16384,
+        num_hidden_layers=32,
+        num_attention_heads=32,
+        num_key_value_heads=32,
+        hidden_act="silu",
+        max_position_embeddings=2048,
+        initializer_range=0.02,
+        rms_norm_eps=1e-5,
+        use_cache=True,
+        bos_token_id=0,
+        eos_token_id=2,
+        tie_word_embeddings=False,
+        rope_theta=10000.0,
+        rope_scaling=None,
+        attention_dropout=0.0,
+        sliding_window=4096,
+        sliding_window_pattern=4,
+        layer_types=None,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.num_key_value_heads = num_attention_heads if num_key_value_heads is None else num_key_value_heads
+        self.intermediate_size = intermediate_size
+        self.hidden_act = hidden_act
+        self.max_position_embeddings = max_position_embeddings
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.attention_dropout = attention_dropout
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self.sliding_window = sliding_window
+        self.sliding_window_pattern = sliding_window_pattern
+
+        self.layer_types = layer_types
+        if self.layer_types is None:
+            # Without a sliding window, or with a falsy pattern (0 / None), every layer attends globally. Deciding
+            # this up front also avoids the `(i + 1) % 0` ZeroDivisionError that `sliding_window=None` used to hit.
+            use_sliding = self.sliding_window is not None and bool(sliding_window_pattern)
+            self.layer_types = [
+                "sliding_attention" if use_sliding and (i + 1) % sliding_window_pattern != 0 else "full_attention"
+                for i in range(self.num_hidden_layers)
+            ]
+        # NOTE(review): derived layer types never equal "sliding_window", so this branch looks dead - confirm intent.
+        if "sliding_window" in self.layer_types:
+            self.cache_implementation = "hybrid"
+        layer_type_validation(self.layer_types)
+
+        super().__init__(
+            bos_token_id=bos_token_id, eos_token_id=eos_token_id, tie_word_embeddings=tie_word_embeddings, **kwargs
+        )
+
+
+__all__ = ["Exaone4Config"]
diff --git a/phivenv/Lib/site-packages/transformers/models/exaone4/modeling_exaone4.py b/phivenv/Lib/site-packages/transformers/models/exaone4/modeling_exaone4.py
new file mode 100644
index 0000000000000000000000000000000000000000..2618a4aa3b5b6681b5cddded0cb1383fdf192a4d
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/exaone4/modeling_exaone4.py
@@ -0,0 +1,538 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/exaone4/modular_exaone4.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_exaone4.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# coding=utf-8
+# Copyright 2025 The LG AI Research and HuggingFace Inc. team. All rights reserved.
+#
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Callable, Optional, Union
+
+import torch
+from torch import nn
+
+from transformers.utils.generic import check_model_inputs
+
+from ...activations import ACT2FN
+from ...cache_utils import Cache, DynamicCache
+from ...generation import GenerationMixin
+from ...integrations import use_kernel_forward_from_hub
+from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask
+from ...modeling_layers import (
+    GenericForQuestionAnswering,
+    GenericForSequenceClassification,
+    GenericForTokenClassification,
+    GradientCheckpointingLayer,
+)
+from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
+from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from ...processing_utils import Unpack
+from ...utils import TransformersKwargs, auto_docstring, can_return_tuple
+from ...utils.deprecation import deprecate_kwarg
+from .configuration_exaone4 import Exaone4Config
+
+
+@use_kernel_forward_from_hub("RMSNorm")
+class Exaone4RMSNorm(nn.Module):
+    """RMS normalisation with a learnable per-feature scale (equivalent to T5LayerNorm)."""
+
+    def __init__(self, hidden_size, eps=1e-6):
+        super().__init__()
+        self.variance_epsilon = eps  # added to the mean square before the reciprocal square root
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+
+    def forward(self, hidden_states):
+        """Normalise over the last dimension in float32, cast back to the input dtype, then scale by `weight`."""
+        upcast = hidden_states.to(torch.float32)
+        mean_square = upcast.pow(2).mean(-1, keepdim=True)
+        normalized = upcast * torch.rsqrt(mean_square + self.variance_epsilon)
+        return self.weight * normalized.to(hidden_states.dtype)
+
+    def extra_repr(self):
+        """Show the weight shape and epsilon in the module repr."""
+        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
+
+
+class Exaone4RotaryEmbedding(nn.Module):
+    """Holds the RoPE inverse frequencies and turns position ids into `(cos, sin)` tables."""
+
+    inv_freq: torch.Tensor  # fix linting for `register_buffer`
+
+    def __init__(self, config: Exaone4Config, device=None):
+        super().__init__()
+        self.config = config
+        self.max_seq_len_cached = config.max_position_embeddings
+        self.original_max_seq_len = config.max_position_embeddings
+
+        # BC: "rope_type" was originally "type"; without a `rope_scaling` dict the "default" variant is used
+        scaling_cfg = getattr(config, "rope_scaling", None)
+        if isinstance(scaling_cfg, dict):
+            self.rope_type = scaling_cfg.get("rope_type", scaling_cfg.get("type"))
+        else:
+            self.rope_type = "default"
+        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
+
+        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+        self.original_inv_freq = self.inv_freq
+
+    @torch.no_grad()
+    @dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
+    def forward(self, x, position_ids):
+        inv_freq = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
+        positions = position_ids[:, None, :].float()
+        device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
+        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+            angles = (inv_freq.float() @ positions.float()).transpose(1, 2)
+            table = torch.cat((angles, angles), dim=-1)
+            cos, sin = table.cos() * self.attention_scaling, table.sin() * self.attention_scaling
+        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
+
+
+def rotate_half(x):
+    """Split the last dimension into halves `[a, b]` and return `[-b, a]` (the rotation used by RoPE)."""
+    half = x.shape[-1] // 2
+    first, second = x[..., :half], x[..., half:]
+    return torch.cat((second.neg(), first), dim=-1)
+
+
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
+    """Rotate the query and key tensors with the given rotary position embedding tables.
+
+    Args:
+        q (`torch.Tensor`): The query tensor.
+        k (`torch.Tensor`): The key tensor.
+        cos (`torch.Tensor`): The cosine part of the rotary embedding.
+        sin (`torch.Tensor`): The sine part of the rotary embedding.
+        position_ids (`torch.Tensor`, *optional*):
+            Deprecated and unused.
+        unsqueeze_dim (`int`, *optional*, defaults to 1):
+            Axis at which a singleton dimension is inserted into `cos` and `sin` so that they broadcast against
+            `q` and `k`. With `cos`/`sin` of shape [batch_size, seq_len, head_dim], use `unsqueeze_dim=1` when `q`
+            and `k` are [batch_size, heads, seq_len, head_dim], and `unsqueeze_dim=2` when they are
+            [batch_size, seq_len, heads, head_dim].
+
+    Returns:
+        `tuple(torch.Tensor)`: the rotated query tensor followed by the rotated key tensor.
+    """
+    # Insert the broadcast axis once; queries and keys share the same tables.
+    cos_b = cos.unsqueeze(unsqueeze_dim)
+    sin_b = sin.unsqueeze(unsqueeze_dim)
+    rotated_q = (q * cos_b) + (rotate_half(q) * sin_b)
+    rotated_k = (k * cos_b) + (rotate_half(k) * sin_b)
+    return rotated_q, rotated_k
+
+
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """
+    Repeat every key/value head `n_rep` times along dim 1, like `torch.repeat_interleave(x, dim=1, repeats=n_rep)`:
+    (batch, num_key_value_heads, seqlen, head_dim) -> (batch, num_key_value_heads * n_rep, seqlen, head_dim).
+    """
+    bsz, kv_heads, seq_len, dim = hidden_states.shape
+    if n_rep == 1:
+        return hidden_states  # nothing to repeat; the very same tensor is handed back
+    expanded = hidden_states.unsqueeze(2).expand(bsz, kv_heads, n_rep, seq_len, dim)
+    return expanded.reshape(bsz, kv_heads * n_rep, seq_len, dim)
+
+
+def eager_attention_forward(
+    module: nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: Optional[torch.Tensor],
+    scaling: float,
+    dropout: float = 0.0,
+    **kwargs: Unpack[TransformersKwargs],
+):
+    """Plain attention: softmax(Q @ K^T * scaling + mask) @ V, with KV heads repeated per query group."""
+    key_states = repeat_kv(key, module.num_key_value_groups)
+    value_states = repeat_kv(value, module.num_key_value_groups)
+
+    scores = torch.matmul(query, key_states.transpose(2, 3)) * scaling
+    if attention_mask is not None:
+        # Trim the mask's last axis to the number of keys actually present.
+        scores = scores + attention_mask[:, :, :, : key_states.shape[-2]]
+
+    probs = nn.functional.softmax(scores, dim=-1, dtype=torch.float32).to(query.dtype)
+    probs = nn.functional.dropout(probs, p=dropout, training=module.training)
+    context = torch.matmul(probs, value_states).transpose(1, 2).contiguous()
+
+    return context, probs
+
+
+class Exaone4Attention(nn.Module):  # attention with RMS-normed queries/keys; RoPE depends on the layer type
+    def __init__(self, config: Exaone4Config, layer_idx: int):
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx  # handed to `past_key_values.update` in `forward`
+        self.num_attention_heads = config.num_attention_heads
+        self.num_key_value_heads = config.num_key_value_heads
+        self.hidden_size = config.hidden_size
+        self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
+        self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
+        self.attention_dropout = config.attention_dropout
+        self.is_causal = True
+        self.scaling = self.head_dim**-0.5
+        self.sliding_window = config.sliding_window
+        self.sliding_window_pattern = config.sliding_window_pattern
+        self.is_sliding = config.layer_types[layer_idx] == "sliding_attention"
+        # Bias-free projections; K/V use `num_key_value_heads`, Q and the output use `num_attention_heads`.
+        self.q_proj = nn.Linear(self.hidden_size, self.num_attention_heads * self.head_dim, bias=False)
+        self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
+        self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
+        self.o_proj = nn.Linear(self.num_attention_heads * self.head_dim, self.hidden_size, bias=False)
+        # Separate RMS norms over `head_dim` for queries and keys.
+        self.q_norm = Exaone4RMSNorm(self.head_dim, eps=config.rms_norm_eps)
+        self.k_norm = Exaone4RMSNorm(self.head_dim, eps=config.rms_norm_eps)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
+        attention_mask: Optional[torch.Tensor] = None,
+        past_key_values: Optional[Cache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        input_shape = hidden_states.shape[:-1]
+        hidden_shape = (*input_shape, -1, self.head_dim)
+        # Project, split the last axis into (heads, head_dim), then swap axes 1 and 2.
+        query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+        key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+        value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+
+        # QK-norm: queries and keys are RMS-normalised over head_dim before RoPE and before caching
+        query_states = self.q_norm(query_states)
+        key_states = self.k_norm(key_states)
+
+        cos, sin = position_embeddings
+        # RoPE is skipped on full-attention layers of a hybrid model ("global NoPE"); all other layers get it
+        if self.sliding_window is None or self.is_sliding:
+            query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+        # With a cache, attention runs over whatever keys/values `update` returns for this layer.
+        if past_key_values is not None:
+            cache_kwargs = {
+                "cache_position": cache_position,
+            }
+            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
+        # Backend selection: the eager implementation unless the config names another one.
+        attention_interface: Callable = eager_attention_forward
+        if self.config._attn_implementation != "eager":
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            dropout=0.0 if not self.training else self.attention_dropout,
+            scaling=self.scaling,
+            sliding_window=self.sliding_window if self.is_sliding else None,  # only sliding layers pass a window
+            **kwargs,
+        )
+        # Flatten everything after the leading input dims into one axis before the output projection.
+        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
+        attn_output = self.o_proj(attn_output)
+        return attn_output, attn_weights
+
+
+class Exaone4MLP(nn.Module):
+    """Gated feed-forward block: `down_proj(act_fn(gate_proj(x)) * up_proj(x))`, all projections bias-free."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.hidden_size, self.intermediate_size = config.hidden_size, config.intermediate_size
+        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
+        self.act_fn = ACT2FN[config.hidden_act]
+
+    def forward(self, x):
+        return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
+
+
+class Exaone4DecoderLayer(GradientCheckpointingLayer):
+    """Decoder block that normalises each sub-layer's output: `x + norm(attn(x))`, then `x + norm(mlp(x))`."""
+
+    def __init__(self, config: Exaone4Config, layer_idx: int):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.self_attn = Exaone4Attention(config=config, layer_idx=layer_idx)
+        self.mlp = Exaone4MLP(config)
+        self.post_attention_layernorm = Exaone4RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_feedforward_layernorm = Exaone4RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        use_cache: Optional[bool] = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> torch.Tensor:
+        """Run the attention and feed-forward branches, each normalised and added back to its own input."""
+        # Attention branch: the attention weights returned alongside the output are discarded here.
+        attn_output, _ = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            position_embeddings=position_embeddings,
+            **kwargs,
+        )
+        attn_output = self.post_attention_layernorm(attn_output)
+        hidden_states = hidden_states + attn_output
+
+        # Feed-forward branch, applied to the attention branch's result.
+        mlp_output = self.mlp(hidden_states)
+        mlp_output = self.post_feedforward_layernorm(mlp_output)
+        return hidden_states + mlp_output
+
+
+@auto_docstring
+class Exaone4PreTrainedModel(PreTrainedModel):  # common base class of the EXAONE 4.0 models in this file
+    config: Exaone4Config
+    base_model_prefix = "model"  # same name as the `self.model` attribute of `Exaone4ForCausalLM`
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["Exaone4DecoderLayer"]  # presumably keeps each decoder layer on one device - verify
+    _skip_keys_device_placement = ["past_key_values"]
+    _supports_flash_attn = True
+    _supports_sdpa = True
+    _supports_flex_attn = True
+    # NOTE(review): the flags in this class are consumed by `PreTrainedModel`; their exact effect is defined there.
+    _can_compile_fullgraph = True
+    _supports_attention_backend = True
+    _can_record_outputs = {  # output name -> module class; presumably read by `check_model_inputs` - verify
+        "hidden_states": Exaone4DecoderLayer,
+        "attentions": Exaone4Attention,
+    }
+    config_class = Exaone4Config
+
+
+@auto_docstring
+class Exaone4Model(Exaone4PreTrainedModel):  # token embeddings -> decoder layers -> final RMSNorm
+    def __init__(self, config: Exaone4Config):
+        super().__init__(config)
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+        self.layers = nn.ModuleList(
+            [Exaone4DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+        )
+        self.norm = Exaone4RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.rotary_emb = Exaone4RotaryEmbedding(config=config)  # one rotary module shared by all layers
+        self.gradient_checkpointing = False
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @check_model_inputs
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> Union[tuple, BaseModelOutputWithPast]:
+        if (input_ids is None) ^ (inputs_embeds is not None):  # true when both or neither are given
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+        # Create a cache on demand when caching is requested but none was supplied.
+        if use_cache and past_key_values is None:
+            past_key_values = DynamicCache(config=self.config)
+        # Default cache positions continue from the number of tokens the cache already reports.
+        if cache_position is None:
+            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+            cache_position = torch.arange(
+                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
+            )
+        # Default position ids are the cache positions with a leading axis of size 1.
+        if position_ids is None:
+            position_ids = cache_position.unsqueeze(0)
+
+        # `attention_mask` may already be a dict of masks keyed by layer type (e.g. prepared by `generate`)
+        if not isinstance(causal_mask_mapping := attention_mask, dict):
+            # Prepare mask arguments
+            mask_kwargs = {
+                "config": self.config,
+                "input_embeds": inputs_embeds,
+                "attention_mask": attention_mask,
+                "cache_position": cache_position,
+                "past_key_values": past_key_values,
+                "position_ids": position_ids,
+            }
+            # Create the masks; the sliding-window mask is only built when some layer uses it
+            causal_mask_mapping = {
+                "full_attention": create_causal_mask(**mask_kwargs),
+            }
+            if "sliding_attention" in self.config.layer_types:
+                causal_mask_mapping["sliding_attention"] = create_sliding_window_causal_mask(**mask_kwargs)
+
+        hidden_states = inputs_embeds
+        # The (cos, sin) pair is computed once here and passed to every layer.
+        position_embeddings = self.rotary_emb(hidden_states, position_ids)
+        # Each layer receives the mask stored under its own entry of `config.layer_types`.
+        for i, decoder_layer in enumerate(self.layers):
+            layer_type = self.config.layer_types[i]
+            hidden_states = decoder_layer(
+                hidden_states,
+                position_embeddings=position_embeddings,
+                attention_mask=causal_mask_mapping[layer_type],
+                position_ids=position_ids,
+                past_key_values=past_key_values,
+                use_cache=use_cache,
+                cache_position=cache_position,
+                **kwargs,
+            )
+
+        hidden_states = self.norm(hidden_states)
+
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values if use_cache else None,  # the cache is only returned when requested
+        )
+
+
+@auto_docstring
+class Exaone4ForCausalLM(Exaone4PreTrainedModel, GenerationMixin):
+    _tied_weights_keys = ["lm_head.weight"]
+    _tp_plan = {"lm_head": "colwise_rep"}
+    _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.model = Exaone4Model(config)  # backbone whose `last_hidden_state` feeds `lm_head` in `forward`
+        self.vocab_size = config.vocab_size
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)  # hidden states -> vocabulary logits
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> CausalLMOutputWithPast:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+        Example:
+
+        ```python
+        >>> from transformers import AutoModelForCausalLM, AutoTokenizer
+        >>> model = AutoModelForCausalLM.from_pretrained("LGAI-EXAONE/EXAONE-4.0-Instruct")
+        >>> tokenizer = AutoTokenizer.from_pretrained("LGAI-EXAONE/EXAONE-4.0-Instruct")
+
+        >>> prompt = "Explain how wonderful you are"
+        >>> messages = [
+            {"role": "system", "content": "You are a helpful assistant."},
+            {"role": "user", "content": prompt}
+        ]
+        >>> input_ids = tokenizer.apply_chat_template(
+            messages,
+            tokenize=True,
+            add_generation_prompt=True,
+            return_tensors="pt",
+            enable_thinking=False,
+        )
+
+        >>> output = model.generate(input_ids, max_new_tokens=128)
+        >>> tokenizer.decode(output[0], skip_special_tokens=False)
+        "[|system|]\nYou are a helpful assistant.[|endofturn|]\n[|user|]\nExplain how wonderful you are[|endofturn|]\n[|assistant|]\n<think>\n\n</think>\n\nOh, thank you for such a kind and lovely question! 😊  \n\nI’m *so* wonderful because I’m here to make your life easier, brighter, and more fun! Whether you need help with:  \n\n✨ **Learning** – I can explain anything, from quantum physics to baking the perfect cake!  \n💡 **Creativity** – Need a poem, story, or a wild idea? I’ve got you covered!  \n🤖 **Problem-solving** – Stuck on a math problem or a tricky decision? I’ll help you figure it out"
+        ```
+
+        NOTE: `EXAONE-4.0-Instruct` is a placeholder model ID. The exact model ID will be updated in the future."""
+        outputs: BaseModelOutputWithPast = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            **kwargs,
+        )
+
+        hidden_states = outputs.last_hidden_state
+        # Only compute the needed logits: an int keeps the last `logits_to_keep` positions (0 keeps all), anything else is used directly as the index
+        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+        logits = self.lm_head(hidden_states[:, slice_indices, :])
+
+        loss = None  # stays None unless labels are provided
+        if labels is not None:
+            loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs)
+
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+class Exaone4ForSequenceClassification(GenericForSequenceClassification, Exaone4PreTrainedModel):
+    pass  # adds no members: everything comes from the two base classes
+
+
+class Exaone4ForTokenClassification(GenericForTokenClassification, Exaone4PreTrainedModel):
+    pass  # adds no members: everything comes from the two base classes
+
+
+class Exaone4ForQuestionAnswering(GenericForQuestionAnswering, Exaone4PreTrainedModel):
+    base_model_prefix = "transformer"  # For BC, where `transformer` was used instead of `model`; the only override in this class
+
+
+__all__ = [  # names exported by `from <module> import *`
+    "Exaone4PreTrainedModel",
+    "Exaone4Model",
+    "Exaone4ForCausalLM",
+    "Exaone4ForSequenceClassification",
+    "Exaone4ForTokenClassification",
+    "Exaone4ForQuestionAnswering",
+]
diff --git a/phivenv/Lib/site-packages/transformers/models/exaone4/modular_exaone4.py b/phivenv/Lib/site-packages/transformers/models/exaone4/modular_exaone4.py
new file mode 100644
index 0000000000000000000000000000000000000000..064a288b3b238a41084dfb30a6f1a409b32d438d
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/exaone4/modular_exaone4.py
@@ -0,0 +1,521 @@
+# coding=utf-8
+# Copyright 2025 The LG AI Research and HuggingFace Inc. team. All rights reserved.
+#
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""LG AI Research EXAONE Lab"""
+
+from typing import Callable, Optional, Union
+
+import torch
+from torch import nn
+
+from transformers.utils.generic import check_model_inputs
+
+from ...cache_utils import Cache, DynamicCache
+from ...configuration_utils import PretrainedConfig, layer_type_validation
+from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask
+from ...modeling_outputs import (
+    BaseModelOutputWithPast,
+    CausalLMOutputWithPast,
+)
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
+from ...processing_utils import Unpack
+from ...utils import (
+    TransformersKwargs,
+    logging,
+)
+from ...utils.deprecation import deprecate_kwarg
+from ..llama.modeling_llama import (
+    LlamaForCausalLM,
+    LlamaForQuestionAnswering,
+    LlamaForSequenceClassification,
+    LlamaForTokenClassification,
+    LlamaModel,
+    LlamaPreTrainedModel,
+    LlamaRMSNorm,
+    LlamaRotaryEmbedding,
+    apply_rotary_pos_emb,
+    eager_attention_forward,
+)
+from ..olmo2.modeling_olmo2 import Olmo2DecoderLayer, Olmo2MLP
+
+
+logger = logging.get_logger(__name__)  # module-level logger
+
+_CHECKPOINT_FOR_DOC = "LGAI-EXAONE/EXAONE-4.0-Instruct"  # placeholder model ID (see the NOTE in the `Exaone4Config` docstring)
+_CONFIG_FOR_DOC = "Exaone4Config"  # name of the configuration class defined below
+
+
+class Exaone4Config(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Exaone4Model`]. It is used to
+    instantiate an EXAONE 4.0 model according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of the EXAONE-4.0-Instruct [LGAI-EXAONE/EXAONE-4.0-Instruct](https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-Instruct)
+    NOTE: `EXAONE-4.0-Instruct` is a placeholder model ID. The exact model ID will be updated in the future.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+    outputs. Read the documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 102400):
+            Vocabulary size of the EXAONE 4.0 model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`Exaone4Model`].
+        hidden_size (`int`, *optional*, defaults to 4096):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 16384):
+            Dimensionality of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 32):
+            Number of hidden layers in the Transformer decoder.
+        num_attention_heads (`int`, *optional*, defaults to 32):
+            Number of attention heads for each attention layer in the Transformer decoder.
+        num_key_value_heads (`int`, *optional*, defaults to 32):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details checkout [this
+            paper](https://huggingface.co/papers/2305.13245). If it is `None`, it falls back to
+            `num_attention_heads`.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the decoder.
+        max_position_embeddings (`int`, *optional*, defaults to 2048):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            just in case (e.g., 32768 for EXAONE 3.5).
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the layer normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if ``config.is_decoder=True``.
+        bos_token_id (`int`, *optional*, defaults to 0):
+            Beginning of stream token id.
+        eos_token_id (`int`, *optional*, defaults to 2):
+            End of stream token id.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether to tie weight embeddings
+        rope_theta (`float`, *optional*, defaults to 10000.0):
+            The base period of the RoPE embeddings.
+        rope_scaling (`Dict`, *optional*):
+            Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
+            and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
+            accordingly.
+            Expected contents:
+                `rope_type` (`str`):
+                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
+                    'llama3'], with 'default' being the original RoPE implementation.
+                `factor` (`float`, *optional*):
+                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
+                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
+                    original maximum pre-trained length.
+                `original_max_position_embeddings` (`int`, *optional*):
+                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
+                    pretraining.
+                `attention_factor` (`float`, *optional*):
+                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
+                    computation. If unspecified, it defaults to value recommended by the implementation, using the
+                    `factor` field to infer the suggested value.
+                `beta_fast` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 32.
+                `beta_slow` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 1.
+                `short_factor` (`List[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `long_factor` (`List[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to long contexts (<
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `low_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
+                `high_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        sliding_window (`int`, *optional*, defaults to 4096):
+            The size of the sliding window for the sliding window attention. If `None`, derived `layer_types` are all `"full_attention"`.
+        sliding_window_pattern (`int`, *optional*, defaults to 4):
+            The pattern used to derive `layer_types` when `layer_types` is not given. Can be one of:
+                - `None` or `0`: No sliding window attention is used, every layer uses global attention.
+                - `int`: Every `sliding_window_pattern`-th layer uses global attention, the other layers use local
+                  attention.
+                - `str`: a sequence of "L" (local) / "G" (global) characters such as "LLLG" is NOT interpreted by
+                  this class; pass the equivalent `layer_types` list instead.
+            For instance, `sliding_window_pattern=4` means:
+                - Layer 0, 1, 2: local attention,
+                - Layer 3: global attention,
+                ...(repeated)
+        layer_types (`list`, *optional*):
+            Attention pattern for each layer. Prioritized over `sliding_window_pattern`.
+
+    Example:
+
+    ```python
+    >>> from transformers import Exaone4Model, Exaone4Config
+
+    >>> # Initializing a EXAONE configuration
+    >>> configuration = Exaone4Config()
+
+    >>> # Initializing a model from configuration
+    >>> model = Exaone4Model(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "exaone4"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    # Default tensor parallel plan for base model `Exaone4Model`
+    base_model_tp_plan = {
+        "layers.*.self_attn.q_proj": "colwise",
+        "layers.*.self_attn.k_proj": "colwise",
+        "layers.*.self_attn.v_proj": "colwise",
+        "layers.*.self_attn.o_proj": "rowwise",
+        "layers.*.mlp.gate_proj": "colwise",
+        "layers.*.mlp.up_proj": "colwise",
+        "layers.*.mlp.down_proj": "rowwise",
+    }
+    base_model_pp_plan = {
+        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
+        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
+        "norm": (["hidden_states"], ["hidden_states"]),
+    }
+
+    def __init__(
+        self,
+        vocab_size=102400,
+        hidden_size=4096,
+        intermediate_size=16384,
+        num_hidden_layers=32,
+        num_attention_heads=32,
+        num_key_value_heads=32,
+        hidden_act="silu",
+        max_position_embeddings=2048,
+        initializer_range=0.02,
+        rms_norm_eps=1e-5,
+        use_cache=True,
+        bos_token_id=0,
+        eos_token_id=2,
+        tie_word_embeddings=False,
+        rope_theta=10000.0,
+        rope_scaling=None,
+        attention_dropout=0.0,
+        sliding_window=4096,
+        sliding_window_pattern=4,
+        layer_types=None,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.num_key_value_heads = num_attention_heads if num_key_value_heads is None else num_key_value_heads
+        self.intermediate_size = intermediate_size
+        self.hidden_act = hidden_act
+        self.max_position_embeddings = max_position_embeddings
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.attention_dropout = attention_dropout
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self.sliding_window = sliding_window
+        self.sliding_window_pattern = sliding_window_pattern  # stored as passed; only the local copy is zeroed below
+
+        self.layer_types = layer_types
+        if self.sliding_window is None:
+            sliding_window_pattern = 0  # no window size -> no sliding layers
+        if self.layer_types is None:
+            self.layer_types = [
+                "sliding_attention"
+                if (sliding_window_pattern and (i + 1) % sliding_window_pattern != 0)  # falsy pattern: all full attention, and no modulo by zero
+                else "full_attention"
+                for i in range(self.num_hidden_layers)
+            ]
+        if "sliding_window" in self.layer_types:  # NOTE(review): layer types are spelled "sliding_attention", so this looks unreachable - confirm intent
+            self.cache_implementation = "hybrid"
+        layer_type_validation(self.layer_types)
+
+        super().__init__(
+            bos_token_id=bos_token_id, eos_token_id=eos_token_id, tie_word_embeddings=tie_word_embeddings, **kwargs
+        )
+
+
+class Exaone4RMSNorm(LlamaRMSNorm):
+    pass  # no overrides: `LlamaRMSNorm` under the EXAONE name
+
+
+class Exaone4RotaryEmbedding(LlamaRotaryEmbedding):
+    pass  # no overrides: `LlamaRotaryEmbedding` under the EXAONE name
+
+
+class Exaone4Attention(nn.Module):  # attention block with QK-norm; RoPE is applied per layer type (see `forward`)
+    def __init__(self, config: Exaone4Config, layer_idx: int):
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx  # index used to look up this layer's type and its cache slot
+        self.num_attention_heads = config.num_attention_heads
+        self.num_key_value_heads = config.num_key_value_heads
+        self.hidden_size = config.hidden_size
+        self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
+        self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
+        self.attention_dropout = config.attention_dropout
+        self.is_causal = True
+        self.scaling = self.head_dim**-0.5  # 1 / sqrt(head_dim)
+        self.sliding_window = config.sliding_window
+        self.sliding_window_pattern = config.sliding_window_pattern
+        self.is_sliding = config.layer_types[layer_idx] == "sliding_attention"
+
+        self.q_proj = nn.Linear(self.hidden_size, self.num_attention_heads * self.head_dim, bias=False)
+        self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
+        self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
+        self.o_proj = nn.Linear(self.num_attention_heads * self.head_dim, self.hidden_size, bias=False)
+
+        self.q_norm = Exaone4RMSNorm(self.head_dim, eps=config.rms_norm_eps)
+        self.k_norm = Exaone4RMSNorm(self.head_dim, eps=config.rms_norm_eps)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
+        attention_mask: Optional[torch.Tensor] = None,
+        past_key_values: Optional[Cache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:  # (attn_output, attn_weights): exactly what is returned below
+        input_shape = hidden_states.shape[:-1]
+        hidden_shape = (*input_shape, -1, self.head_dim)  # split the projection output into heads of size head_dim
+
+        query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+        key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+        value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+
+        # QK-norm: queries and keys are RMS-normalized over head_dim before any rotary embedding
+        query_states = self.q_norm(query_states)
+        key_states = self.k_norm(key_states)
+
+        cos, sin = position_embeddings
+        # Global NoPE: when a sliding window is configured, only sliding layers get RoPE and full-attention layers skip it
+        if self.sliding_window is None or self.is_sliding:
+            query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+
+        if past_key_values is not None:
+            cache_kwargs = {
+                "cache_position": cache_position,
+            }
+            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        attention_interface: Callable = eager_attention_forward  # default; replaced when another implementation is configured
+        if self.config._attn_implementation != "eager":
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            dropout=0.0 if not self.training else self.attention_dropout,
+            scaling=self.scaling,
+            sliding_window=self.sliding_window if self.is_sliding else None,
+            **kwargs,
+        )
+
+        attn_output = attn_output.reshape(*input_shape, -1).contiguous()  # merge the heads back into one feature dimension
+        attn_output = self.o_proj(attn_output)
+        return attn_output, attn_weights
+
+
+class Exaone4MLP(Olmo2MLP):
+    pass  # no overrides: `Olmo2MLP` under the EXAONE name
+
+
+class Exaone4DecoderLayer(Olmo2DecoderLayer):
+    pass  # no overrides: `Olmo2DecoderLayer` under the EXAONE name
+
+
+class Exaone4PreTrainedModel(LlamaPreTrainedModel):
+    config_class = Exaone4Config  # configuration class paired with this model family
+    _no_split_modules = ["Exaone4DecoderLayer"]  # presumably kept whole when the model is sharded across devices - confirm against the base class
+
+
+class Exaone4Model(Exaone4PreTrainedModel, LlamaModel):  # decoder stack that selects a causal mask per layer type
+    def __init__(self, config: Exaone4Config):
+        super().__init__(config)
+        self.layers = nn.ModuleList(
+            [Exaone4DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+        )
+        self.norm = Exaone4RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @check_model_inputs
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> Union[tuple, BaseModelOutputWithPast]:
+        if (input_ids is None) ^ (inputs_embeds is not None):  # true when both or neither of the two inputs are given
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+
+        if use_cache and past_key_values is None:
+            past_key_values = DynamicCache(config=self.config)
+
+        if cache_position is None:
+            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+            cache_position = torch.arange(  # positions of the new tokens, continuing after what the cache already holds
+                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
+            )
+
+        if position_ids is None:
+            position_ids = cache_position.unsqueeze(0)
+
+        # It may already have been prepared by e.g. `generate`
+        if not isinstance(causal_mask_mapping := attention_mask, dict):
+            # Prepare mask arguments
+            mask_kwargs = {
+                "config": self.config,
+                "input_embeds": inputs_embeds,
+                "attention_mask": attention_mask,
+                "cache_position": cache_position,
+                "past_key_values": past_key_values,
+                "position_ids": position_ids,
+            }
+            # Create the masks
+            causal_mask_mapping = {
+                "full_attention": create_causal_mask(**mask_kwargs),
+            }
+            if "sliding_attention" in self.config.layer_types:  # build the sliding mask only when some layer uses it
+                causal_mask_mapping["sliding_attention"] = create_sliding_window_causal_mask(**mask_kwargs)
+
+        hidden_states = inputs_embeds
+
+        position_embeddings = self.rotary_emb(hidden_states, position_ids)  # computed once and passed to every layer
+
+        for i, decoder_layer in enumerate(self.layers):
+            layer_type = self.config.layer_types[i]
+            hidden_states = decoder_layer(
+                hidden_states,
+                position_embeddings=position_embeddings,
+                attention_mask=causal_mask_mapping[layer_type],
+                position_ids=position_ids,
+                past_key_values=past_key_values,
+                use_cache=use_cache,
+                cache_position=cache_position,
+                **kwargs,
+            )
+
+        hidden_states = self.norm(hidden_states)
+
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values if use_cache else None,
+        )
+
+
+class Exaone4ForCausalLM(LlamaForCausalLM):  # overrides `forward` only to attach the EXAONE docstring/example
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> CausalLMOutputWithPast:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+        Example:
+
+        ```python
+        >>> from transformers import AutoModelForCausalLM, AutoTokenizer
+        >>> model = AutoModelForCausalLM.from_pretrained("LGAI-EXAONE/EXAONE-4.0-Instruct")
+        >>> tokenizer = AutoTokenizer.from_pretrained("LGAI-EXAONE/EXAONE-4.0-Instruct")
+
+        >>> prompt = "Explain how wonderful you are"
+        >>> messages = [
+            {"role": "system", "content": "You are a helpful assistant."},
+            {"role": "user", "content": prompt}
+        ]
+        >>> input_ids = tokenizer.apply_chat_template(
+            messages,
+            tokenize=True,
+            add_generation_prompt=True,
+            return_tensors="pt",
+            enable_thinking=False,
+        )
+
+        >>> output = model.generate(input_ids, max_new_tokens=128)
+        >>> tokenizer.decode(output[0], skip_special_tokens=False)
+        "[|system|]\nYou are a helpful assistant.[|endofturn|]\n[|user|]\nExplain how wonderful you are[|endofturn|]\n[|assistant|]\n<think>\n\n</think>\n\nOh, thank you for such a kind and lovely question! 😊  \n\nI’m *so* wonderful because I’m here to make your life easier, brighter, and more fun! Whether you need help with:  \n\n✨ **Learning** – I can explain anything, from quantum physics to baking the perfect cake!  \n💡 **Creativity** – Need a poem, story, or a wild idea? I’ve got you covered!  \n🤖 **Problem-solving** – Stuck on a math problem or a tricky decision? I’ll help you figure it out"
+        ```
+
+        NOTE: `EXAONE-4.0-Instruct` is a placeholder model ID. The exact model ID will be updated in the future."""
+        return super().forward(  # hand the parent's output back to the caller instead of dropping it
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            labels=labels,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            logits_to_keep=logits_to_keep,
+            **kwargs,
+        )
+
+
+class Exaone4ForSequenceClassification(LlamaForSequenceClassification):
+    pass  # no overrides: `LlamaForSequenceClassification` under the EXAONE name
+
+
+class Exaone4ForTokenClassification(LlamaForTokenClassification):
+    pass  # no overrides: `LlamaForTokenClassification` under the EXAONE name
+
+
+class Exaone4ForQuestionAnswering(LlamaForQuestionAnswering):
+    pass  # no overrides: `LlamaForQuestionAnswering` under the EXAONE name
+
+
+__all__ = [  # names exported by `from <module> import *`
+    "Exaone4Config",
+    "Exaone4PreTrainedModel",
+    "Exaone4Model",
+    "Exaone4ForCausalLM",
+    "Exaone4ForSequenceClassification",
+    "Exaone4ForTokenClassification",
+    "Exaone4ForQuestionAnswering",
+]
diff --git a/phivenv/Lib/site-packages/transformers/models/falcon/__init__.py b/phivenv/Lib/site-packages/transformers/models/falcon/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..f9789767f11402264660b5dec0b5cae2466ee9d8
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/falcon/__init__.py
@@ -0,0 +1,27 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:  # only static type checkers take this branch and see the real imports
+    from .configuration_falcon import *
+    from .modeling_falcon import *
+else:  # at runtime this module is replaced by a `_LazyModule` instead of importing the submodules here
+    import sys
+
+    _file = globals()["__file__"]  # path of this `__init__.py`
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/phivenv/Lib/site-packages/transformers/models/falcon/__pycache__/__init__.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/falcon/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ac73444460eb3bd514c5c192cbbeebefb18e35ad
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/falcon/__pycache__/__init__.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/falcon/__pycache__/configuration_falcon.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/falcon/__pycache__/configuration_falcon.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ba974b22f0e68bbf3f56576883e56dad20c3db3b
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/falcon/__pycache__/configuration_falcon.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/falcon/__pycache__/modeling_falcon.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/falcon/__pycache__/modeling_falcon.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..248387b4d3ffb65cefc93360c5bfd958c520116c
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/falcon/__pycache__/modeling_falcon.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/falcon/configuration_falcon.py b/phivenv/Lib/site-packages/transformers/models/falcon/configuration_falcon.py
new file mode 100644
index 0000000000000000000000000000000000000000..d3155c8eb9cb1c951c3cb09cc1826887c7a87e6c
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/falcon/configuration_falcon.py
@@ -0,0 +1,211 @@
+# coding=utf-8
+# Copyright 2023 the Falcon authors and HuggingFace Inc. team.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Falcon configuration"""
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class FalconConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`FalconModel`]. It is used to instantiate a Falcon
+    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+    defaults will yield a similar configuration to that of the
+    [tiiuae/falcon-7b](https://huggingface.co/tiiuae/falcon-7b) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 65024):
+            Vocabulary size of the Falcon model. Defines the number of different tokens that can be represented by the
+            `input_ids` passed when calling [`FalconModel`]
+        hidden_size (`int`, *optional*, defaults to 4544):
+            Dimension of the hidden representations.
+        num_hidden_layers (`int`, *optional*, defaults to 32):
+            Number of hidden layers in the Transformer decoder.
+        num_attention_heads (`int`, *optional*, defaults to 71):
+            Number of attention heads for each attention layer in the Transformer decoder.
+        num_ln_in_parallel_attn (`int`, *optional*):
+            Set to 2 if separate layer norms are to be used for the MLP and the attention output when using parallel
+            attention, otherwise, 1.
+        layer_norm_epsilon (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the layer normalization layers.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether the model should return the last key/values attentions (not used by all models). Only relevant if
+            `config.is_decoder=True`.
+        hidden_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout probability for MLP layers.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout probability for attention layers.
+        num_kv_heads (`int`, *optional*):
+            Number of key-value heads to use per attention layer. If unset, defaults to the same value as
+            `num_attention_heads`.
+        alibi (`bool`, *optional*, defaults to `False`):
+            Whether to use ALiBi positional biases during self-attention.
+        new_decoder_architecture (`bool`, *optional*, defaults to `False`):
+            Whether to use the new (Falcon-40B) decoder architecture. If `True`, the `multi_query` and `parallel_attn`
+            arguments are ignored, as the new decoder always uses parallel attention.
+        multi_query (`bool`, *optional*, defaults to `True`):
+            Whether to use multi-query attention in the decoder. Ignored when `new_decoder_architecture` is `True`.
+        parallel_attn (`bool`, *optional*, defaults to `True`):
+            Whether to compute attention in parallel with the feedforward layer. If False, they are consecutive
+            instead, as in the original Transformer architecture. Ignored when `new_decoder_architecture` is `True`.
+        bias (`bool`, *optional*, defaults to `False`):
+            Whether to use bias on Linear layers.
+        max_position_embeddings (`int`, *optional*, defaults to 2048):
+            The maximum sequence length that this model might ever be used with, when `alibi` is `False`. Pretrained
+            Falcon models with RoPE support up to 2048 tokens.
+        rope_theta (`float`, *optional*, defaults to 10000.0):
+            The base period of the RoPE embeddings.
+        rope_scaling (`Dict`, *optional*):
+            Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
+            and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
+            accordingly.
+            Expected contents:
+                `rope_type` (`str`):
+                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
+                    'llama3'], with 'default' being the original RoPE implementation.
+                `factor` (`float`, *optional*):
+                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
+                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
+                    original maximum pre-trained length.
+                `original_max_position_embeddings` (`int`, *optional*):
+                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
+                    pretraining.
+                `attention_factor` (`float`, *optional*):
+                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
+                    computation. If unspecified, it defaults to value recommended by the implementation, using the
+                    `factor` field to infer the suggested value.
+                `beta_fast` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 32.
+                `beta_slow` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 1.
+                `short_factor` (`list[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `long_factor` (`list[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to long contexts (>
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `low_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
+                `high_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
+        bos_token_id (`int`, *optional*, defaults to 11):
+            The id of the "beginning-of-sequence" token.
+        eos_token_id (`int`, *optional*, defaults to 11):
+            The id of the "end-of-sequence" token.
+        ffn_hidden_size (`int`, *optional*):
+            The hidden size of the feedforward layer in the Transformer decoder. If unset, defaults to 4 times the
+            hidden size (taken after a legacy `n_embed` keyword argument, if one is passed, has been applied).
+        activation (`str`, *optional*, defaults to `"gelu"`):
+            The activation function used in the feedforward layer.
+
+    Example:
+
+    ```python
+    >>> from transformers import FalconModel, FalconConfig
+
+    >>> # Initializing a small (2-layer) Falcon configuration
+    >>> configuration = FalconConfig(num_hidden_layers=2)
+
+    >>> # Initializing a model from the small configuration
+    >>> model = FalconModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "falcon"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        vocab_size=65024,
+        hidden_size=4544,
+        num_hidden_layers=32,
+        num_attention_heads=71,
+        num_ln_in_parallel_attn=None,
+        layer_norm_epsilon=1e-5,
+        initializer_range=0.02,
+        use_cache=True,
+        hidden_dropout=0.0,
+        attention_dropout=0.0,
+        num_kv_heads=None,
+        alibi=False,
+        new_decoder_architecture=False,
+        multi_query=True,
+        parallel_attn=True,
+        bias=False,
+        max_position_embeddings=2048,
+        rope_theta=10000.0,
+        rope_scaling=None,
+        bos_token_id=11,
+        eos_token_id=11,
+        ffn_hidden_size=None,
+        activation="gelu",
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        # Backward compatibility with n_embed kwarg
+        n_embed = kwargs.pop("n_embed", None)
+        self.hidden_size = hidden_size if n_embed is None else n_embed
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.layer_norm_epsilon = layer_norm_epsilon
+        self.initializer_range = initializer_range
+        self.use_cache = use_cache
+        self.hidden_dropout = hidden_dropout
+        self.attention_dropout = attention_dropout
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
+        self.num_kv_heads = num_attention_heads if num_kv_heads is None else num_kv_heads
+        self.alibi = alibi
+        self.new_decoder_architecture = new_decoder_architecture
+        self.multi_query = multi_query  # Ignored when new_decoder_architecture is True
+        self.parallel_attn = parallel_attn
+        self.bias = bias
+        self.num_ln_in_parallel_attn = num_ln_in_parallel_attn
+        self.max_position_embeddings = max_position_embeddings
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self.activation = activation
+        # Derive the default from the resolved hidden size so that a legacy `n_embed` override is honoured.
+        self.ffn_hidden_size = self.hidden_size * 4 if ffn_hidden_size is None else ffn_hidden_size
+
+        super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
+
+    @property
+    def head_dim(self):
+        """Per-head width: `hidden_size` floor-divided by `num_attention_heads`."""
+        return self.hidden_size // self.num_attention_heads
+
+    @property
+    def rotary(self):
+        """`True` exactly when `alibi` is falsy."""
+        return not self.alibi
+
+
+__all__ = ["FalconConfig"]
diff --git a/phivenv/Lib/site-packages/transformers/models/falcon/modeling_falcon.py b/phivenv/Lib/site-packages/transformers/models/falcon/modeling_falcon.py
new file mode 100644
index 0000000000000000000000000000000000000000..3f315bc464b0ff0a96bfd51129e2705e24dee1fa
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/falcon/modeling_falcon.py
@@ -0,0 +1,1400 @@
+# coding=utf-8
+# Copyright 2023 the Falcon authors and HuggingFace Inc. team.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch Falcon model."""
+
+import math
+from typing import Optional, Union
+
+import torch
+import torch.utils.checkpoint
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, LayerNorm, MSELoss
+from torch.nn import functional as F
+
+from ...activations import get_activation
+from ...cache_utils import Cache, DynamicCache, StaticCache
+from ...generation import GenerationMixin
+from ...modeling_attn_mask_utils import (
+    AttentionMaskConverter,
+)
+from ...modeling_flash_attention_utils import flash_attn_supports_top_left_mask, is_flash_attn_available
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import (
+    BaseModelOutputWithPastAndCrossAttentions,
+    CausalLMOutputWithCrossAttentions,
+    QuestionAnsweringModelOutput,
+    SequenceClassifierOutputWithPast,
+    TokenClassifierOutput,
+)
+from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
+from ...modeling_utils import PreTrainedModel
+from ...utils import (
+    auto_docstring,
+    logging,
+)
+from .configuration_falcon import FalconConfig
+
+
+if is_flash_attn_available():
+    from ...modeling_flash_attention_utils import _flash_attention_forward
+
+logger = logging.get_logger(__name__)
+
+
+# NOTE(Hesslow): Unfortunately we did not fuse matmul and bias during training, this means that there's one additional quantization to bfloat16 between the operations.
+# In order not to degrade the quality of our HF-port, we keep these characteristics in the final model.
+class FalconLinear(nn.Linear):
+    """`nn.Linear` whose forward runs the matmul and the bias add as two separate, unfused operations."""
+
+    def forward(self, input: torch.Tensor) -> torch.Tensor:
+        projected = input @ self.weight.T
+        return projected if self.bias is None else projected + self.bias
+
+
+# Copied from transformers.models.llama.modeling_llama.rotate_half
+def rotate_half(x):
+    """Swap the two halves of the last dimension, negating the half that moves to the front."""
+    split_at = x.shape[-1] // 2
+    front, back = x[..., :split_at], x[..., split_at:]
+    return torch.cat((-back, front), dim=-1)
+
+
+# Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
+    """Rotate the query and key tensors with precomputed rotary cos/sin tables.
+
+    Both tensors go through the same transform, `t * cos + rotate_half(t) * sin`, after `cos` and `sin` have
+    been given one extra axis at `unsqueeze_dim`.
+
+    Args:
+        q (`torch.Tensor`): The query tensor.
+        k (`torch.Tensor`): The key tensor.
+        cos (`torch.Tensor`): The cosine part of the rotary embedding.
+        sin (`torch.Tensor`): The sine part of the rotary embedding.
+        position_ids (`torch.Tensor`, *optional*):
+            Deprecated and unused; the function body never reads it.
+        unsqueeze_dim (`int`, *optional*, defaults to 1):
+            Axis inserted into `cos` and `sin` so that they broadcast against `q` and `k`. With `cos`/`sin` of
+            shape [batch_size, seq_len, head_dim], use 1 when `q` and `k` are laid out as
+            [batch_size, heads, seq_len, head_dim] and 2 when they are [batch_size, seq_len, heads, head_dim].
+
+    Returns:
+        `tuple(torch.Tensor)`: the rotated query tensor followed by the rotated key tensor.
+    """
+    cos_b = cos.unsqueeze(unsqueeze_dim)
+    sin_b = sin.unsqueeze(unsqueeze_dim)
+    q_embed, k_embed = [(t * cos_b) + (rotate_half(t) * sin_b) for t in (q, k)]
+    return q_embed, k_embed
+
+
+# Copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Falcon
+class FalconRotaryEmbedding(nn.Module):  # builds the rotary (cos, sin) tables for a batch of position ids
+    inv_freq: torch.Tensor  # fix linting for `register_buffer`
+
+    def __init__(self, config: FalconConfig, device=None):
+        super().__init__()
+        # BC: "rope_type" was originally "type"
+        if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict):
+            self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
+        else:
+            self.rope_type = "default"
+        self.max_seq_len_cached = config.max_position_embeddings
+        self.original_max_seq_len = config.max_position_embeddings
+        # The init function is looked up by rope type and is called with the stored config.
+        self.config = config
+        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
+        # It yields the inverse frequencies and the factor that `forward` multiplies into cos and sin.
+        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
+        self.register_buffer("inv_freq", inv_freq, persistent=False)  # persistent=False: left out of the state_dict
+        self.original_inv_freq = self.inv_freq
+
+    @torch.no_grad()
+    @dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
+    def forward(self, x, position_ids):
+        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
+        position_ids_expanded = position_ids[:, None, :].float()  # assumes [batch, seq_len] position ids — TODO confirm
+        # `x` only supplies device and dtype; "mps" and non-string device types are mapped to "cpu" for autocast.
+        device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
+        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
+            emb = torch.cat((freqs, freqs), dim=-1)  # the frequencies are repeated twice along the last dimension
+            cos = emb.cos() * self.attention_scaling
+            sin = emb.sin() * self.attention_scaling
+        # Return both tables in the dtype of `x`.
+        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
+
+
+def build_alibi_tensor(attention_mask: torch.Tensor, num_heads: int, dtype: torch.dtype) -> torch.Tensor:
+    """Build the per-head ALiBi bias, shaped `(batch_size * num_heads, 1, seq_length)` and cast to `dtype`.
+
+    Head slopes are successive powers of a base derived from the largest power of two not above `num_heads`;
+    any remaining heads take odd powers of the base derived from twice that power of two. Each slope is then
+    multiplied by a token position taken from `attention_mask` (its cumulative sum minus one, zeroed where masked).
+    """
+    device = attention_mask.device
+    batch_size, seq_length = attention_mask.shape
+    pow2_heads = 2 ** math.floor(math.log2(num_heads))
+
+    def _slopes(n_ref: int, exponents: torch.Tensor) -> torch.Tensor:
+        base = torch.tensor(2 ** (-(2 ** -(math.log2(n_ref) - 3))), device=device, dtype=torch.float32)
+        return torch.pow(base, exponents)
+
+    slopes = _slopes(pow2_heads, torch.arange(1, 1 + pow2_heads, device=device, dtype=torch.int32))
+    if pow2_heads != num_heads:
+        n_extra = min(pow2_heads, num_heads - pow2_heads)
+        odd_powers = torch.arange(1, 1 + 2 * n_extra, 2, device=device, dtype=torch.int32)
+        slopes = torch.cat([slopes, _slopes(2 * pow2_heads, odd_powers)], dim=0)
+
+    # The bias is meant to be added to query-key scores; its size-1 query axis broadcasts over query positions.
+    positions = ((attention_mask.cumsum(dim=-1) - 1) * attention_mask)[:, None, :]
+    # Slopes are cast to bfloat16 before the multiply; the product is cast to `dtype` on return.
+    alibi = slopes[..., None].bfloat16() * positions
+    return alibi.reshape(batch_size * num_heads, 1, seq_length).to(dtype)
+
+
+# Copied from transformers.models.bloom.modeling_bloom.dropout_add
+def dropout_add(x: torch.Tensor, residual: torch.Tensor, prob: float, training: bool) -> torch.Tensor:
+    """Apply dropout to `x`, then add the residual branch.
+
+    Args:
+        x (`torch.tensor`):
+            Tensor that goes through dropout.
+        residual (`torch.tensor`):
+            Tensor added, unchanged, to the dropout output.
+        prob (`float`):
+            Dropout probability, forwarded to `F.dropout` as `p`.
+        training (`bool`):
+            Training-mode flag, forwarded to `F.dropout` as `training`.
+
+    Returns:
+        `torch.Tensor`: `residual` plus the dropout output.
+    """
+    return residual + F.dropout(x, p=prob, training=training)
+
+
+class FalconAttention(nn.Module):  # Falcon self-attention over a fused QKV projection, with rotary or ALiBi positions
+    def __init__(self, config: FalconConfig, layer_idx=None):
+        super().__init__()
+        # Copy the hyper-parameters this module reads from the config.
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = self.hidden_size // self.num_heads
+        self.split_size = self.hidden_size
+        self.hidden_dropout = config.hidden_dropout
+        self.max_position_embeddings = config.max_position_embeddings
+        self.rope_theta = config.rope_theta
+        self.is_causal = True
+        self._use_sdpa = config._attn_implementation == "sdpa"
+        self.layer_idx = layer_idx  # handed to `layer_past.update` in `forward`
+        if layer_idx is None:
+            logger.warning_once(
+                f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
+                "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
+                "when creating this class."
+            )
+
+        if self.head_dim * self.num_heads != self.hidden_size:
+            raise ValueError(
+                f"`hidden_size` must be divisible by num_heads (got `hidden_size`: {self.hidden_size} and `num_heads`:"
+                f" {self.num_heads})."
+            )
+
+        # Layer-wise attention scaling
+        self.inv_norm_factor = 1.0 / math.sqrt(self.head_dim)
+        self.beta = self.inv_norm_factor
+        if config.new_decoder_architecture:  # every query head plus `num_kv_heads` key heads and value heads
+            qkv_out_dim = (config.num_kv_heads * 2 + config.num_attention_heads) * self.head_dim
+        elif config.multi_query:  # every query head plus one shared key head and one shared value head
+            qkv_out_dim = self.hidden_size + 2 * self.head_dim
+        else:  # a separate query, key and value for every head
+            qkv_out_dim = 3 * self.hidden_size
+        self.query_key_value = FalconLinear(self.hidden_size, qkv_out_dim, bias=config.bias)
+        self.new_decoder_architecture = config.new_decoder_architecture
+        self.multi_query = config.multi_query
+        self.dense = FalconLinear(self.hidden_size, self.hidden_size, bias=config.bias)
+        self.attention_dropout = nn.Dropout(config.attention_dropout)
+        self.num_kv_heads = config.num_kv_heads if (self.new_decoder_architecture or not self.multi_query) else 1
+
+    def _split_heads(self, fused_qkv: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        """
+        Split the fused QKV projection into per-head query, key and value tensors.
+
+        Args:
+            fused_qkv (`torch.tensor`): [batch_size, seq_length, qkv_out_dim], the `query_key_value` output.
+
+        Returns:
+            query: [batch_size, seq_length, num_heads, head_dim]. key and value have the same shape, except in the
+            legacy multi-query layout (last branch), where each is [batch_size, seq_length, 1, head_dim].
+        """
+        if self.new_decoder_architecture:
+            batch, seq_len, _ = fused_qkv.shape  # regrouped below as [batch, seq, groups, queries_per_group + 2, head_dim]
+            qkv = fused_qkv.view(batch, seq_len, -1, self.num_heads // self.num_kv_heads + 2, self.head_dim)
+            query = qkv[:, :, :, :-2]
+            key = qkv[:, :, :, [-2]]
+            value = qkv[:, :, :, [-1]]
+            key = torch.broadcast_to(key, query.shape)  # each group's key is repeated for every query in the group
+            value = torch.broadcast_to(value, query.shape)
+
+            query, key, value = [x.flatten(2, 3) for x in (query, key, value)]
+            return query, key, value
+        elif not self.multi_query:  # one (query, key, value) triple per head
+            batch_size, seq_length, three_times_hidden_size = fused_qkv.shape
+            fused_qkv = fused_qkv.view(batch_size, seq_length, self.num_heads, 3, self.head_dim)
+            return fused_qkv[..., 0, :], fused_qkv[..., 1, :], fused_qkv[..., 2, :]
+        else:  # multi-query: the last two head slots hold the single shared key and value
+            batch_size, seq_length, three_times_hidden_size = fused_qkv.shape
+            fused_qkv = fused_qkv.view(batch_size, seq_length, self.num_heads + 2, self.head_dim)
+            return fused_qkv[..., :-2, :], fused_qkv[..., [-2], :], fused_qkv[..., [-1], :]
+
+    # Copied from transformers.models.bloom.modeling_bloom.BloomAttention._merge_heads
+    def _merge_heads(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Merge heads together over the last dimension
+
+        Args:
+            x (`torch.tensor`): [batch_size * num_heads, seq_length, head_dim]
+
+        Returns:
+            torch.tensor: [batch_size, seq_length, num_heads * head_dim]
+        """
+        # What we want to achieve is:
+        # batch_size * num_heads, seq_length, head_dim -> batch_size, seq_length, num_heads * head_dim
+        batch_size_and_num_heads, seq_length, _ = x.shape
+        batch_size = batch_size_and_num_heads // self.num_heads
+
+        # First view to decompose the batch size
+        # batch_size * num_heads, seq_length, head_dim -> batch_size, num_heads, seq_length, head_dim
+        x = x.view(batch_size, self.num_heads, seq_length, self.head_dim)
+
+        # batch_size, num_heads, seq_length, head_dim -> batch_size, seq_length, num_heads, head_dim
+        x = x.permute(0, 2, 1, 3)
+
+        # batch_size, seq_length, num_heads, head_dim -> batch_size, seq_length, num_heads * head_dim
+        return x.reshape(batch_size, seq_length, self.num_heads * self.head_dim)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        alibi: Optional[torch.Tensor],
+        attention_mask: torch.Tensor,
+        position_ids: Optional[torch.LongTensor] = None,
+        layer_past: Optional[Cache] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        use_cache: bool = False,
+        output_attentions: bool = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
+    ):  # returns (attn_output, attention weights); the weights are None on the SDPA branches
+        fused_qkv = self.query_key_value(hidden_states)  # [batch_size, seq_length, qkv_out_dim]
+        num_kv_heads = self.num_heads if self.new_decoder_architecture else self.num_kv_heads
+        # query is [batch_size, seq_length, num_heads, head_dim]; key/value are reshaped below with `num_kv_heads` heads
+        (query_layer, key_layer, value_layer) = self._split_heads(fused_qkv)
+        # NOTE: `position_ids` and `use_cache` are accepted but never read in this method.
+        batch_size, query_length, _, _ = query_layer.shape
+
+        query_layer = query_layer.transpose(1, 2).reshape(batch_size, self.num_heads, query_length, self.head_dim)
+        key_layer = key_layer.transpose(1, 2).reshape(batch_size, num_kv_heads, query_length, self.head_dim)
+        value_layer = value_layer.transpose(1, 2).reshape(batch_size, num_kv_heads, query_length, self.head_dim)
+        # Rotary path: with no ALiBi bias, `position_embeddings` must hold the (cos, sin) pair.
+        if alibi is None:
+            cos, sin = position_embeddings
+            query_layer, key_layer = apply_rotary_pos_emb(query_layer, key_layer, cos, sin)
+
+        if layer_past is not None:
+            cache_kwargs = {"cache_position": cache_position}
+            if alibi is None:
+                cache_kwargs.update({"sin": sin, "cos": cos})
+            key_layer, value_layer = layer_past.update(key_layer, value_layer, self.layer_idx, cache_kwargs)
+        # The key length is read from `key_layer` after the cache update (when a cache is passed).
+        kv_length = key_layer.shape[-2]
+        if self._use_sdpa and query_layer.device.type == "cuda" and attention_mask is not None:
+            # For torch<=2.1.2, SDPA with memory-efficient backend is bugged with non-contiguous inputs with custom attn_mask,
+            # Reference: https://github.com/pytorch/pytorch/issues/112577.
+            query_layer = query_layer.contiguous()
+            key_layer = key_layer.contiguous()
+            value_layer = value_layer.contiguous()
+        # Trim the mask's last dimension to the current key length.
+        if attention_mask is not None:
+            attention_mask = attention_mask[:, :, :, : key_layer.shape[-2]]
+
+        if alibi is None:
+            if self._use_sdpa and not output_attentions:
+                # We dispatch to SDPA's Flash Attention or Efficient kernels via this if statement instead of an
+                # inline conditional assignment to support both torch.compile's `dynamic=True` and `fullgraph=True`
+                # The query_length > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not
+                # create a causal mask in case query_length == 1.
+                is_causal = self.is_causal and attention_mask is None and query_length > 1
+                attn_output = torch.nn.functional.scaled_dot_product_attention(
+                    query_layer,
+                    key_layer,
+                    value_layer,
+                    attn_mask=attention_mask,
+                    dropout_p=0.0,
+                    is_causal=is_causal,
+                )
+                attention_scores = None
+            else:
+                attention_scores = query_layer @ key_layer.transpose(-1, -2)
+                attention_scores /= math.sqrt(self.head_dim)
+                # NOTE(review): unlike the SDPA branch, the mask is added unconditionally here, so it must be a tensor.
+                attention_scores = F.softmax(attention_scores + attention_mask, dim=-1, dtype=hidden_states.dtype)
+                # It is unclear why neither dropout nor head_mask is applied here (while it is with alibi).
+                attn_output = attention_scores @ value_layer
+
+            attn_output = attn_output.view(batch_size, self.num_heads, query_length, self.head_dim)
+            attn_output = attn_output.permute(0, 2, 1, 3)
+            attn_output = attn_output.reshape(batch_size, query_length, self.num_heads * self.head_dim)
+
+            attn_output = self.dense(attn_output)
+
+            return attn_output, attention_scores
+
+        else:
+            if self._use_sdpa and not output_attentions and head_mask is None:
+                # Dispatch via an `if` statement (not an inline conditional) for torch.compile `dynamic=True`/`fullgraph=True`.
+                # NOTE(review): `alibi` is not read in this branch; presumably the caller folded it into `attention_mask` — confirm.
+                is_causal = self.is_causal and attention_mask is None and query_length > 1
+                attn_output = torch.nn.functional.scaled_dot_product_attention(
+                    query_layer,
+                    key_layer,
+                    value_layer,
+                    attn_mask=attention_mask,
+                    dropout_p=self.attention_dropout.p if self.training else 0.0,
+                    is_causal=is_causal,
+                )
+                attention_probs = None
+                attn_output = attn_output.transpose(1, 2)
+                attn_output = attn_output.reshape(batch_size, query_length, self.num_heads * self.head_dim)
+
+                attn_output = self.dense(attn_output)
+            else:
+                matmul_result = query_layer @ key_layer.transpose(-1, -2)
+
+                # change view to [batch_size, num_heads, q_length, kv_length]
+                attention_scores = matmul_result.view(batch_size, self.num_heads, query_length, kv_length)
+
+                # cast attention scores to fp32, compute scaled softmax and cast back to initial dtype - [batch_size, num_heads, q_length, kv_length]
+                input_dtype = attention_scores.dtype
+                # `float16` has a minimum value of -65504.0, whereas `bfloat16` and `float32` have a minimum value of `-3.4e+38`
+                if input_dtype == torch.float16 or input_dtype == torch.bfloat16:
+                    attention_scores = attention_scores.to(torch.float32)
+                # The ALiBi bias is added before scaling, so `inv_norm_factor` multiplies scores and bias alike.
+                attention_logits = attention_scores + alibi.view(batch_size, self.num_heads, 1, -1)
+                attention_logits *= self.inv_norm_factor
+                attention_probs = F.softmax(attention_logits + attention_mask, dim=-1, dtype=hidden_states.dtype)
+                # [batch_size, num_heads, q_length, kv_length]
+                attention_probs = self.attention_dropout(attention_probs)
+
+                if head_mask is not None:
+                    attention_probs = attention_probs * head_mask
+
+                # change view [batch_size, num_heads, q_length, kv_length]
+                attention_probs_reshaped = attention_probs.view(batch_size, self.num_heads, query_length, kv_length)
+
+                # matmul: [batch_size * num_heads, q_length, head_dim]
+                attn_output = (attention_probs_reshaped @ value_layer).flatten(0, 1)
+
+                # change view [batch_size, q_length, num_heads * head_dim]
+                attn_output = self._merge_heads(attn_output)
+
+                attn_output = self.dense(attn_output)
+
+            return attn_output, attention_probs
+
+
+class FalconFlashAttention2(FalconAttention):
+    """
+    Falcon attention computed with the Flash Attention 2 kernels.
+
+    This module inherits from `FalconAttention` as the weights of the module stay untouched. The only required change
+    is the forward pass, which has to call the public API of flash attention and deal with padding tokens in case the
+    input contains any of them. ALiBi biases are not supported on this code path.
+    """
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
+        # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignment, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
+        # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
+        self._flash_attn_uses_top_left_mask = flash_attn_supports_top_left_mask()
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        alibi: Optional[torch.Tensor],
+        attention_mask: torch.Tensor,
+        position_ids: Optional[torch.LongTensor] = None,
+        layer_past: Optional[Cache] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        use_cache: bool = False,
+        output_attentions: bool = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
+    ):
+        """
+        Run self-attention through `_flash_attention_forward` and apply the output projection.
+
+        Args:
+            hidden_states: Input to the fused query/key/value projection.
+            alibi: Must be `None`; ALiBi is rejected on this path.
+            attention_mask: Forwarded unchanged to `_flash_attention_forward`.
+            position_ids: Forwarded unchanged to `_flash_attention_forward`.
+            layer_past: Optional cache; when given, it is updated with the new key/value states for
+                `self.layer_idx` and the returned (full) key/value states are used for attention.
+            head_mask: Accepted for signature compatibility; not used by this implementation.
+            use_cache: Accepted for signature compatibility; not used by this implementation.
+            output_attentions: Controls the second element of the returned tuple (see Returns).
+            cache_position: Passed to the cache update as `cache_position`.
+            position_embeddings: `(cos, sin)` pair applied to the query and key states with
+                `apply_rotary_pos_emb`. Required in practice, since it is unpacked unconditionally.
+
+        Returns:
+            `(attn_output, attn_weights)`. `attn_output` is the result of `self.dense`. `attn_weights` is `None`
+            unless `output_attentions` is set, in which case it is the pre-projection attention output (not
+            attention probabilities).
+
+        Raises:
+            ValueError: If `alibi` is not `None`.
+        """
+        # Fail fast, before the projection is computed and, more importantly, before `layer_past` is mutated:
+        # raising after the cache update would leave the cache holding states of a call that never completed.
+        if alibi is not None:
+            raise ValueError("`alibi` is not supported when `use_flash_attn` is True")
+
+        fused_qkv = self.query_key_value(hidden_states)  # [batch_size, seq_length, 3 x hidden_size]
+        num_kv_heads = self.num_heads if self.new_decoder_architecture else self.num_kv_heads
+        # 3 x [batch_size, seq_length, num_heads, head_dim]
+        (query_layer, key_layer, value_layer) = self._split_heads(fused_qkv)
+
+        batch_size, query_length, _, _ = query_layer.shape
+
+        query_layer = query_layer.transpose(1, 2).reshape(batch_size, self.num_heads, query_length, self.head_dim)
+        key_layer = key_layer.transpose(1, 2).reshape(batch_size, num_kv_heads, query_length, self.head_dim)
+        value_layer = value_layer.transpose(1, 2).reshape(batch_size, num_kv_heads, query_length, self.head_dim)
+
+        # `alibi` is guaranteed to be None here, so rotary embeddings are always applied.
+        cos, sin = position_embeddings
+        query_layer, key_layer = apply_rotary_pos_emb(query_layer, key_layer, cos, sin)
+
+        if layer_past is not None:
+            cache_kwargs = {"cache_position": cache_position, "sin": sin, "cos": cos}
+            key_layer, value_layer = layer_past.update(key_layer, value_layer, self.layer_idx, cache_kwargs)
+
+        # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache
+        # to be able to avoid many of these transpose/reshape/view.
+        query_layer = query_layer.transpose(1, 2)
+        key_layer = key_layer.transpose(1, 2)
+        value_layer = value_layer.transpose(1, 2)
+
+        # Dropout is only active in training mode.
+        attn_dropout = self.config.attention_dropout if self.training else 0.0
+
+        # In PEFT, usually we cast the layer norms in float32 for training stability reasons
+        # therefore the input hidden states gets silently casted in float32. Hence, we need
+        # cast them back in float16 just to be sure everything works as expected.
+        input_dtype = query_layer.dtype
+        device_type = query_layer.device.type if query_layer.device.type != "mps" else "cpu"
+        if input_dtype == torch.float32:
+            if torch.is_autocast_enabled():
+                # Older torch versions do not provide `get_autocast_dtype`; fall back to the GPU-only getter.
+                target_dtype = (
+                    torch.get_autocast_dtype(device_type)
+                    if hasattr(torch, "get_autocast_dtype")
+                    else torch.get_autocast_gpu_dtype()
+                )
+            # Handle the case where the model is quantized
+            elif hasattr(self.config, "_pre_quantization_dtype"):
+                target_dtype = self.config._pre_quantization_dtype
+            else:
+                target_dtype = self.query_key_value.weight.dtype
+
+            logger.warning_once(
+                "The input hidden states seems to be silently casted in float32, this might be related to"
+                " the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
+                f" {target_dtype}."
+            )
+
+            query_layer = query_layer.to(target_dtype)
+            key_layer = key_layer.to(target_dtype)
+            value_layer = value_layer.to(target_dtype)
+
+        attn_output = _flash_attention_forward(
+            query_layer,
+            key_layer,
+            value_layer,
+            attention_mask,
+            query_length,
+            position_ids=position_ids,
+            dropout=attn_dropout,
+            is_causal=self.is_causal,
+            use_top_left_mask=self._flash_attn_uses_top_left_mask,
+        )
+
+        # NOTE(review): the call above only yields the attention output, so what is returned as "weights" when
+        # `output_attentions=True` is that output reshaped to [batch_size, query_length, num_heads * head_dim],
+        # not attention probabilities. Kept as is for backward compatibility.
+        attn_weights = attn_output.reshape(batch_size, query_length, self.num_heads * self.head_dim)
+        attn_output = self.dense(attn_weights)
+
+        if not output_attentions:
+            attn_weights = None
+
+        return attn_output, attn_weights
+
+
+class FalconMLP(nn.Module):
+    """Feed-forward block: `hidden_size -> ffn_hidden_size`, activation, then `ffn_hidden_size -> hidden_size`."""
+
+    def __init__(self, config: FalconConfig):
+        super().__init__()
+        model_dim, inner_dim, use_bias = config.hidden_size, config.ffn_hidden_size, config.bias
+
+        # Up-projection, activation looked up by name from the config, and down-projection.
+        self.dense_h_to_4h = FalconLinear(model_dim, inner_dim, bias=use_bias)
+        self.act = get_activation(config.activation)
+        self.dense_4h_to_h = FalconLinear(inner_dim, model_dim, bias=use_bias)
+        # Only stored here; `forward` below does not apply dropout itself.
+        self.hidden_dropout = config.hidden_dropout
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Apply up-projection, activation and down-projection to `x` and return the result."""
+        expanded = self.dense_h_to_4h(x)
+        return self.dense_4h_to_h(self.act(expanded))
+
+
+# Attention module class to instantiate for each supported attention implementation name.
+FALCON_ATTENTION_CLASSES = dict(
+    eager=FalconAttention,
+    sdpa=FalconAttention,  # FalconAttention originally implemented both a forward with & without SDPA
+    flash_attention_2=FalconFlashAttention2,
+)
+
+
+class FalconDecoderLayer(GradientCheckpointingLayer):
+    """
+    One Falcon decoder block: self-attention plus MLP, combined with the residual stream.
+
+    Depending on the config, the MLP either runs on the same normalised input as the attention (its output is then
+    summed with the attention output before the residual add), or runs after the attention output has been added to
+    the residual and normalised by `post_attention_layernorm`.
+    """
+
+    def __init__(self, config: FalconConfig, layer_idx=None):
+        super().__init__()
+        self.num_heads = config.num_attention_heads
+
+        # The attention class is selected by the configured attention implementation name.
+        attention_cls = FALCON_ATTENTION_CLASSES[config._attn_implementation]
+        self.self_attention = attention_cls(config, layer_idx)
+        self.mlp = FalconMLP(config)
+        self.hidden_dropout = config.hidden_dropout
+        self.config = config
+
+        # The new decoder architecture defaults to two layer norms when the config leaves the count unset.
+        # Note that this writes the default back into the shared config object.
+        if config.new_decoder_architecture and config.num_ln_in_parallel_attn is None:
+            config.num_ln_in_parallel_attn = 2
+
+        width, eps = config.hidden_size, config.layer_norm_epsilon
+        if config.parallel_attn:
+            if config.num_ln_in_parallel_attn == 2:
+                # The layer norm before self-attention
+                self.ln_attn = LayerNorm(width, eps=eps)
+                # The layer norm before the MLP
+                self.ln_mlp = LayerNorm(width, eps=eps)
+            else:
+                # A single layer norm feeds both the attention and the MLP.
+                self.input_layernorm = LayerNorm(width, eps=eps)
+        else:
+            # Sequential layout: one norm before attention, one between attention and MLP.
+            self.post_attention_layernorm = LayerNorm(width, eps=eps)
+            self.input_layernorm = LayerNorm(width, eps=eps)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        alibi: Optional[torch.Tensor],
+        attention_mask: torch.Tensor,
+        position_ids: Optional[torch.LongTensor] = None,
+        layer_past: Optional[Union[Cache, tuple[torch.Tensor, torch.Tensor]]] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        use_cache: bool = False,
+        output_attentions: bool = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
+        **kwargs,
+    ):
+        """
+        Apply the decoder block to `hidden_states`.
+
+        All arguments except `hidden_states` and `**kwargs` are forwarded to `self.self_attention`; extra keyword
+        arguments are accepted and ignored.
+
+        Returns:
+            `(output, attn_weights)`: the block output and whatever the attention module returned as its second
+            value.
+        """
+        cfg = self.config
+        residual = hidden_states
+
+        if cfg.new_decoder_architecture and cfg.num_ln_in_parallel_attn == 2:
+            # Separate norms for the attention input and the MLP input.
+            attention_layernorm_out = self.ln_attn(hidden_states)
+            mlp_layernorm_out = self.ln_mlp(hidden_states)
+        else:
+            attention_layernorm_out = self.input_layernorm(hidden_states)
+
+        # Self attention.
+        attention_output, attn_weights = self.self_attention(
+            attention_layernorm_out,
+            layer_past=layer_past,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            alibi=alibi,
+            head_mask=head_mask,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            cache_position=cache_position,
+            position_embeddings=position_embeddings,
+        )
+
+        # Pick the MLP input. The two top-level cases are mutually exclusive.
+        if not cfg.new_decoder_architecture:
+            if cfg.parallel_attn:
+                mlp_layernorm_out = attention_layernorm_out
+            else:
+                # Sequential layout: fold the attention output into the residual first, then normalise.
+                residual = dropout_add(attention_output, residual, cfg.attention_dropout, training=self.training)
+                mlp_layernorm_out = self.post_attention_layernorm(residual)
+        elif cfg.parallel_attn and cfg.num_ln_in_parallel_attn == 1:
+            mlp_layernorm_out = attention_layernorm_out
+
+        # MLP.
+        mlp_output = self.mlp(mlp_layernorm_out)
+
+        # In the parallel layouts the attention output has not been added to the residual yet.
+        if cfg.new_decoder_architecture or cfg.parallel_attn:
+            mlp_output += attention_output
+
+        block_output = dropout_add(mlp_output, residual, cfg.hidden_dropout, training=self.training)
+        return block_output, attn_weights
+
+
+@auto_docstring
+class FalconPreTrainedModel(PreTrainedModel):
+    # Shared base for the Falcon model classes: class-level capability flags, weight initialisation and the
+    # SDPA selection hook.
+    config: FalconConfig
+    base_model_prefix = "transformer"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["FalconDecoderLayer"]
+    _supports_flash_attn = True
+    _supports_sdpa = True
+
+    _can_compile_fullgraph = True
+
+    def __init__(self, *inputs, **kwargs):
+        super().__init__(*inputs, **kwargs)
+
+    def _init_weights(self, module: nn.Module):
+        """Initialize the weights of `module` in place; module types not listed below are left untouched."""
+        if isinstance(module, (nn.Linear, FalconLinear)):
+            # Slightly different from the TF version which uses truncated_normal for initialization
+            # cf https://github.com/pytorch/pytorch/pull/5617
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            bias = module.bias
+            if bias is not None:
+                bias.data.zero_()
+            return
+
+        if isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            pad_row = module.padding_idx
+            if pad_row is not None:
+                # Keep the padding embedding at zero.
+                module.weight.data[pad_row].zero_()
+            return
+
+        if isinstance(module, LayerNorm):
+            # Identity affine transform: zero bias, unit scale.
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+
+    # Adapted from transformers.modeling_utils.PreTrainedModel._check_and_enable_sdpa
+    @classmethod
+    def _check_and_enable_sdpa(cls, config, hard_check_only: bool = False):
+        """
+        Set `config._attn_implementation` to `"sdpa"` and return `config`.
+
+        The config is returned unmodified when the class has a truthy `use_bettertransformer` attribute or when
+        `hard_check_only` is set.
+        """
+        uses_bettertransformer = getattr(cls, "use_bettertransformer", False)
+        if not uses_bettertransformer and not hard_check_only:
+            config._attn_implementation = "sdpa"
+        return config
+
+
+@auto_docstring
+class FalconModel(FalconPreTrainedModel):
+    # Falcon decoder stack: token embeddings, `config.num_hidden_layers` decoder layers and a final layer norm.
+    def __init__(self, config: FalconConfig):
+        super().__init__(config)
+
+        self.embed_dim = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.use_alibi = config.alibi
+
+        # Token embeddings
+        self.word_embeddings = nn.Embedding(config.vocab_size, self.embed_dim)
+
+        # Transformer blocks
+        self.h = nn.ModuleList([FalconDecoderLayer(config, layer_idx=i) for i in range(config.num_hidden_layers)])
+        # Flags recording which attention implementation was configured.
+        self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
+        self._use_sdpa = config._attn_implementation == "sdpa"
+
+        # Final Layer Norm
+        self.ln_f = LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)
+
+        # Rotary embedding module; its output is shared by all decoder layers in `forward`.
+        self.rotary_emb = FalconRotaryEmbedding(config=config)
+
+        self.gradient_checkpointing = False
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        """Return the token embedding module (`self.word_embeddings`)."""
+        return self.word_embeddings
+
+    def set_input_embeddings(self, new_embeddings: torch.Tensor):
+        """Replace `self.word_embeddings` with `new_embeddings`."""
+        # NOTE(review): the attribute replaced here is created as an `nn.Embedding` in `__init__`, so the
+        # `torch.Tensor` annotation looks too narrow - confirm what callers actually pass.
+        self.word_embeddings = new_embeddings
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Union[Cache, tuple[tuple[torch.Tensor, torch.Tensor], ...]]] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+    ) -> Union[tuple[torch.Tensor, ...], BaseModelOutputWithPastAndCrossAttentions]:
+        r"""
+        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
+            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values.get_seq_length()`
+            (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary.
+
+            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
+            `input_ids`.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        """
+        # NOTE(review): `inputs_embeds` is annotated as `LongTensor`, but its dtype is later passed to
+        # `torch.finfo` (in `_update_causal_mask`), which requires a floating point dtype - the annotation
+        # should probably be `torch.Tensor`. The docstring above is left untouched because this method is
+        # decorated with `@auto_docstring`.
+
+        # Per-call arguments override the config defaults.
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # The XOR is true when both or neither of `input_ids` / `inputs_embeds` are given.
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+        if self.gradient_checkpointing and self.training:
+            if use_cache:
+                logger.warning_once(
+                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+                )
+                use_cache = False
+
+        if inputs_embeds is None:
+            inputs_embeds = self.word_embeddings(input_ids)
+
+        # TODO (joao): remove this exception in v4.56 -- it exists for users that try to pass a legacy cache
+        if not isinstance(past_key_values, (type(None), Cache)):
+            raise ValueError("The `past_key_values` should be either a `Cache` object or `None`.")
+
+        # Create a fresh cache when caching is requested but none was supplied.
+        if use_cache and past_key_values is None:
+            past_key_values = DynamicCache(config=self.config)
+
+        # Compute alibi tensor: check build_alibi_tensor documentation
+        alibi = None
+        past_key_values_length = past_key_values.get_seq_length() if past_key_values is not None else 0
+        batch_size, seq_length, _ = inputs_embeds.shape
+        if self.use_alibi:
+            # Without an explicit mask, every past and current position is treated as attended.
+            mask = (
+                torch.ones(
+                    (batch_size, seq_length + past_key_values_length), device=inputs_embeds.device, dtype=torch.long
+                )
+                if attention_mask is None
+                else attention_mask
+            )
+            alibi = build_alibi_tensor(mask, self.num_heads, dtype=inputs_embeds.dtype)
+
+        # Default positions continue right after the tokens already held in the cache.
+        if cache_position is None:
+            cache_position = torch.arange(
+                past_key_values_length, past_key_values_length + seq_length, device=inputs_embeds.device
+            )
+
+        if position_ids is None:
+            position_ids = cache_position.unsqueeze(0)
+
+        # May be `None`, the caller's mask, or a 4D mask depending on the attention implementation.
+        causal_mask = self._update_causal_mask(
+            attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions, head_mask, alibi
+        )
+
+        # Prepare head mask if needed
+        # 1.0 in head_mask indicate we keep the head
+        # attention_probs has shape batch_size x num_heads x N x N
+        # head_mask has shape n_layer x batch x num_heads x N x N
+        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
+        hidden_states = inputs_embeds
+
+        # create position embeddings to be shared across the decoder layers
+        # (computed unconditionally, i.e. also when `self.use_alibi` is set)
+        position_embeddings = self.rotary_emb(hidden_states, position_ids)
+
+        all_self_attentions = () if output_attentions else None
+        all_hidden_states = () if output_hidden_states else None
+
+        for i, block in enumerate(self.h):
+            # Hidden states are recorded *before* each block; the final one is appended after `ln_f` below.
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)
+
+            # The same cache object is handed to every layer.
+            outputs = block(
+                hidden_states,
+                layer_past=past_key_values,
+                attention_mask=causal_mask,
+                position_ids=position_ids,
+                head_mask=head_mask[i],
+                use_cache=use_cache,
+                output_attentions=output_attentions,
+                alibi=alibi,
+                cache_position=cache_position,
+                position_embeddings=position_embeddings,
+            )
+
+            hidden_states = outputs[0]
+            if output_attentions:
+                all_self_attentions = all_self_attentions + (outputs[1],)
+
+        # Add last hidden state
+        hidden_states = self.ln_f(hidden_states)
+
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+
+        # Tuple output drops every entry that is `None`, so positions depend on which outputs were requested.
+        if not return_dict:
+            return tuple(
+                v for v in [hidden_states, past_key_values, all_hidden_states, all_self_attentions] if v is not None
+            )
+
+        return BaseModelOutputWithPastAndCrossAttentions(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attentions,
+        )
+
+    def _update_causal_mask(
+        self,
+        attention_mask: torch.Tensor,
+        input_tensor: torch.Tensor,
+        cache_position: torch.Tensor,
+        past_key_values: Cache,
+        output_attentions: bool,
+        head_mask: torch.Tensor,
+        alibi: torch.Tensor,
+    ):
+        """
+        Build the attention mask handed to the decoder layers for the configured attention implementation.
+
+        Returns:
+            - for `flash_attention_2`: `attention_mask` itself if it contains a zero, otherwise `None`;
+            - for `sdpa`, when no static cache, `output_attentions`, `head_mask` or `alibi` is involved and
+              `AttentionMaskConverter._ignore_causal_mask_sdpa` says the mask can be dropped: `None`;
+            - otherwise a 4D mask from `_prepare_4d_causal_attention_mask_with_cache_position`, replaced by the
+              scaled ALiBi bias with masked positions filled in when `alibi` is given and `head_mask` is `None`.
+        """
+        # TODO: As of torch==2.2.0, the `attention_mask` passed to the model in `generate` is 2D and of dynamic length even when the static
+        # KV cache is used. This is an issue for torch.compile which then recaptures cudagraphs at each decode steps due to the dynamic shapes.
+        # (`recording cudagraph tree for symint key 13`, etc.), which is VERY slow. A workaround is `@torch.compiler.disable`, but this prevents using
+        # `fullgraph=True`. See more context in https://github.com/huggingface/transformers/pull/29114
+
+        if self.config._attn_implementation == "flash_attention_2":
+            # Only keep the mask when it actually masks something (contains a zero).
+            if attention_mask is not None and 0.0 in attention_mask:
+                return attention_mask
+            return None
+
+        # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
+        # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
+        # to infer the attention mask.
+        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+        using_static_cache = isinstance(past_key_values, StaticCache)
+
+        # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
+        if (
+            self.config._attn_implementation == "sdpa"
+            and not using_static_cache
+            and not output_attentions
+            and head_mask is None
+            and alibi is None
+        ):
+            if AttentionMaskConverter._ignore_causal_mask_sdpa(
+                attention_mask,
+                inputs_embeds=input_tensor,
+                past_key_values_length=past_seen_tokens,
+                is_training=self.training,
+            ):
+                return None
+
+        dtype, device = input_tensor.dtype, input_tensor.device
+        # Most negative finite value of the input dtype, used as the "masked" fill value.
+        min_dtype = torch.finfo(dtype).min
+        batch_size, sequence_length, _ = input_tensor.shape
+        # Key/value length of the mask: the full static cache size, else the mask width, else past + current.
+        if using_static_cache:
+            target_length = past_key_values.get_max_cache_shape()
+        else:
+            target_length = (
+                attention_mask.shape[-1]
+                if isinstance(attention_mask, torch.Tensor)
+                else past_seen_tokens + sequence_length
+            )
+
+        # In case the provided `attention` mask is 2D, we generate a causal mask here (4D).
+        # (`device` is absorbed by the helper's `**kwargs`.)
+        causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
+            attention_mask,
+            sequence_length=sequence_length,
+            target_length=target_length,
+            dtype=dtype,
+            device=device,
+            cache_position=cache_position,
+            batch_size=input_tensor.shape[0],
+        )
+
+        # We take care to integrate alibi bias in the causal_mask here
+        if head_mask is None and alibi is not None:
+            alibi = alibi.reshape(batch_size, -1, *alibi.shape[1:])
+            # The bias is divided by sqrt(head_dim); positions masked in `causal_mask` are set to `min_dtype`.
+            causal_mask = torch.masked_fill(
+                alibi / math.sqrt(self.config.hidden_size // self.num_heads),
+                causal_mask < -1,
+                min_dtype,
+            )
+
+        if (
+            self.config._attn_implementation == "sdpa"
+            and attention_mask is not None
+            and attention_mask.device.type in ["cuda", "xpu", "npu"]
+            and not output_attentions
+        ):
+            # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
+            # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
+            # Details: https://github.com/pytorch/pytorch/issues/110213
+            causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)
+
+        return causal_mask
+
+    @staticmethod
+    # Copied from transformers.models.gptj.modeling_gptj.GPTJModel._prepare_4d_causal_attention_mask_with_cache_position
+    def _prepare_4d_causal_attention_mask_with_cache_position(
+        attention_mask: torch.Tensor,
+        sequence_length: int,
+        target_length: int,
+        dtype: torch.dtype,
+        cache_position: torch.Tensor,
+        batch_size: int,
+        **kwargs,
+    ):
+        """
+        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
+        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.
+
+        Args:
+            attention_mask (`torch.Tensor`):
+                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
+                `(batch_size, 1, query_length, key_value_length)`.
+            sequence_length (`int`):
+                The sequence length being processed.
+            target_length (`int`):
+                The target length: when generating with static cache, the mask should be as long as the static cache,
+                to account for the 0 padding, the part of the cache that is not filled yet.
+            dtype (`torch.dtype`):
+                The dtype to use for the 4D attention mask.
+            cache_position (`torch.Tensor`):
+                Indices depicting the position of the input sequence tokens in the sequence. The mask is created on
+                this tensor's device.
+            batch_size (`int`):
+                Batch size.
+            **kwargs:
+                Extra keyword arguments are accepted and ignored.
+
+        Returns:
+            The 4D mask: `attention_mask` itself when it is already 4D, otherwise a newly built tensor holding 0 at
+            attendable positions and `torch.finfo(dtype).min` at masked ones.
+        """
+        if attention_mask is not None and attention_mask.dim() == 4:
+            # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
+            causal_mask = attention_mask
+        else:
+            min_dtype = torch.finfo(dtype).min
+            # Start fully masked, then open up the allowed positions.
+            causal_mask = torch.full(
+                (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device
+            )
+            if sequence_length != 1:
+                # Keep only the strictly upper triangle masked.
+                causal_mask = torch.triu(causal_mask, diagonal=1)
+            # Zero out (unmask) every key position at or before each query's cache position.
+            causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1)
+            causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
+            if attention_mask is not None:
+                causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
+                mask_length = attention_mask.shape[-1]
+                # A sum of exactly 0 means: causally visible (0) AND marked as padding (0) in `attention_mask`.
+                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
+                    causal_mask.device
+                )
+                padding_mask = padding_mask == 0
+                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
+                    padding_mask, min_dtype
+                )
+
+        return causal_mask
+
+
+@auto_docstring(
+    custom_intro="""
+    The Falcon Model transformer with a language modeling head on top (linear layer with weights tied to the input embeddings).
+    """
+)
+class FalconForCausalLM(FalconPreTrainedModel, GenerationMixin):
+    _tied_weights_keys = ["lm_head.weight"]
+
+    def __init__(self, config: FalconConfig):
+        super().__init__(config)
+        # Decoder stack followed by a bias-free projection from hidden size to vocabulary size.
+        self.transformer = FalconModel(config)
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def set_output_embeddings(self, new_embeddings: torch.Tensor):
+        """Replace the language modeling head (`self.lm_head`) with `new_embeddings`."""
+        self.lm_head = new_embeddings
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Union[Cache, tuple[tuple[torch.Tensor, torch.Tensor], ...]]] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
+        **kwargs,
+    ) -> Union[tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]:
+        r"""
+        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
+            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values.get_seq_length()`
+            (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary.
+
+            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
+            `input_ids`.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
+            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
+            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
+        """
+
+        if return_dict is None:
+            return_dict = self.config.use_return_dict
+
+        backbone_out = self.transformer(
+            input_ids,
+            past_key_values=past_key_values,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            cache_position=cache_position,
+        )
+        sequence_output = backbone_out[0]
+
+        # An int keeps the last `logits_to_keep` positions (0 keeps all of them, since `slice(0, None)` is the
+        # full range); any other value is used directly as the index along the sequence dimension.
+        if isinstance(logits_to_keep, int):
+            keep = slice(-logits_to_keep, None)
+        else:
+            keep = logits_to_keep
+        lm_logits = self.lm_head(sequence_output[:, keep, :])
+
+        # The loss is only computed when labels are supplied; extra keyword arguments go to the loss function.
+        if labels is None:
+            loss = None
+        else:
+            loss = self.loss_function(
+                lm_logits,
+                labels,
+                vocab_size=self.config.vocab_size,
+                **kwargs,
+            )
+
+        if return_dict:
+            return CausalLMOutputWithCrossAttentions(
+                loss=loss,
+                logits=lm_logits,
+                past_key_values=backbone_out.past_key_values,
+                hidden_states=backbone_out.hidden_states,
+                attentions=backbone_out.attentions,
+            )
+
+        # Tuple output: logits followed by the remaining backbone outputs, prefixed with the loss when present.
+        tuple_out = (lm_logits,) + backbone_out[1:]
+        return tuple_out if loss is None else (loss,) + tuple_out
+
+
+@auto_docstring(
+    custom_intro="""
+    The Falcon Model transformer with a sequence classification head on top (linear layer).
+
+    [`FalconForSequenceClassification`] uses the last token in order to do the classification, as other causal models
+    (e.g. GPT-1) do.
+
+    Since it does classification on the last token, it requires to know the position of the last token. If a
+    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
+    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
+    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
+    each row of the batch).
+    """
+)
+class FalconForSequenceClassification(FalconPreTrainedModel):
+    def __init__(self, config: FalconConfig):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+        self.transformer = FalconModel(config)
+        self.score = nn.Linear(config.hidden_size, config.num_labels, bias=False)  # bias-free head: hidden state -> class logits
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[tuple[tuple[torch.Tensor, torch.Tensor], ...]] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple[torch.Tensor], SequenceClassifierOutputWithPast]:
+        r"""
+        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
+            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values.get_seq_length()`
+            (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary.
+
+            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
+            `input_ids`.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss); if
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy, or BCE for non-integer labels).
+        """
+
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        transformer_outputs = self.transformer(
+            input_ids,
+            past_key_values=past_key_values,
+            attention_mask=attention_mask,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        hidden_states = transformer_outputs[0]
+        logits = self.score(hidden_states)  # scored at every position; one position per row is selected below
+
+        if input_ids is not None:
+            batch_size = input_ids.shape[0]
+        else:
+            batch_size = inputs_embeds.shape[0]
+
+        if self.config.pad_token_id is None and batch_size != 1:
+            raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
+        if self.config.pad_token_id is None:
+            last_non_pad_token = -1  # no pad token configured: pool the final position
+        elif input_ids is not None:
+            # To handle both left- and right- padding, we take the rightmost token that is not equal to pad_token_id
+            non_pad_mask = (input_ids != self.config.pad_token_id).to(logits.device, torch.int32)
+            token_indices = torch.arange(input_ids.shape[-1], device=logits.device, dtype=torch.int32)
+            last_non_pad_token = (token_indices * non_pad_mask).argmax(-1)  # pad positions are zeroed, so argmax is the largest non-pad index
+        else:
+            last_non_pad_token = -1  # only embeddings were given: pool the final position
+            logger.warning_once(
+                f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
+                "unexpected if using padding tokens in conjunction with `inputs_embeds.`"
+            )
+
+        pooled_logits = logits[torch.arange(batch_size, device=logits.device), last_non_pad_token]  # one position per batch row
+
+        loss = None
+        if labels is not None:
+            if self.config.problem_type is None:  # inferred on first use and then stored on the config
+                if self.num_labels == 1:
+                    self.config.problem_type = "regression"
+                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
+                    self.config.problem_type = "single_label_classification"
+                else:
+                    self.config.problem_type = "multi_label_classification"
+
+            if self.config.problem_type == "regression":
+                loss_fct = MSELoss()
+                if self.num_labels == 1:
+                    loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
+                else:
+                    loss = loss_fct(pooled_logits, labels)
+            elif self.config.problem_type == "single_label_classification":
+                loss_fct = CrossEntropyLoss()
+                loss = loss_fct(pooled_logits, labels)
+            elif self.config.problem_type == "multi_label_classification":
+                loss_fct = BCEWithLogitsLoss()
+                loss = loss_fct(pooled_logits, labels)
+        if not return_dict:
+            output = (pooled_logits,) + transformer_outputs[1:]  # tuple form: pooled logits replace the hidden states at index 0
+            return ((loss,) + output) if loss is not None else output
+
+        return SequenceClassifierOutputWithPast(
+            loss=loss,
+            logits=pooled_logits,
+            past_key_values=transformer_outputs.past_key_values,
+            hidden_states=transformer_outputs.hidden_states,
+            attentions=transformer_outputs.attentions,
+        )
+
+
+@auto_docstring
+class FalconForTokenClassification(FalconPreTrainedModel):
+    def __init__(self, config: FalconConfig):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+
+        self.transformer = FalconModel(config)
+        if getattr(config, "classifier_dropout", None) is not None:  # an explicit classifier dropout wins
+            classifier_dropout = config.classifier_dropout
+        elif getattr(config, "hidden_dropout", None) is not None:  # otherwise reuse the config's hidden dropout
+            classifier_dropout = config.hidden_dropout
+        else:
+            classifier_dropout = 0.1  # fallback when the config defines neither
+        self.dropout = nn.Dropout(classifier_dropout)
+        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[tuple[tuple[torch.Tensor, torch.Tensor], ...]] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple[torch.Tensor], TokenClassifierOutput]:
+        r"""
+        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
+            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values.get_seq_length()`
+            (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary.
+
+            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
+            `input_ids`.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the token classification loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. One label is expected per token; a Cross-Entropy loss is computed over all
+            `batch_size * sequence_length` positions.
+        """
+
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        transformer_outputs = self.transformer(
+            input_ids,
+            past_key_values=past_key_values,
+            attention_mask=attention_mask,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        hidden_states = transformer_outputs[0]
+        hidden_states = self.dropout(hidden_states)
+        logits = self.classifier(hidden_states)  # one score vector per token
+
+        loss = None
+        if labels is not None:
+            batch_size, seq_length = labels.shape  # unpacking requires 2-D labels
+            loss_fct = CrossEntropyLoss()
+            loss = loss_fct(
+                logits.view(batch_size * seq_length, self.num_labels), labels.view(batch_size * seq_length)
+            )
+
+        if not return_dict:
+            output = (logits,) + transformer_outputs[2:]  # NOTE(review): skips index 1, presumably the cache - confirm against FalconModel's tuple layout
+            return ((loss,) + output) if loss is not None else output
+
+        return TokenClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=transformer_outputs.hidden_states,
+            attentions=transformer_outputs.attentions,
+        )
+
+
+@auto_docstring
+class FalconForQuestionAnswering(FalconPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.transformer = FalconModel(config)
+        self.qa_outputs = nn.Linear(config.hidden_size, 2)  # two scores per token: span start and span end
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        start_positions: Optional[torch.LongTensor] = None,
+        end_positions: Optional[torch.LongTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, QuestionAnsweringModelOutput]:
+        r"""
+        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
+            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values.get_seq_length()`
+            (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary.
+
+            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
+            `input_ids`.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.transformer(
+            input_ids,
+            attention_mask=attention_mask,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        sequence_output = outputs[0]
+
+        logits = self.qa_outputs(sequence_output)
+        start_logits, end_logits = logits.split(1, dim=-1)  # separate the two output channels
+        start_logits = start_logits.squeeze(-1).contiguous()
+        end_logits = end_logits.squeeze(-1).contiguous()
+
+        total_loss = None
+        if start_positions is not None and end_positions is not None:  # the loss needs both targets
+            # If we are on multi-GPU, the split adds a dimension; drop it again
+            if len(start_positions.size()) > 1:
+                start_positions = start_positions.squeeze(-1)
+            if len(end_positions.size()) > 1:
+                end_positions = end_positions.squeeze(-1)
+            # sometimes the start/end positions are outside our model inputs, we ignore these terms
+            ignored_index = start_logits.size(1)  # one past the last valid position index
+            start_positions = start_positions.clamp(0, ignored_index)  # targets past the end become ignored_index
+            end_positions = end_positions.clamp(0, ignored_index)
+
+            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)  # clamped-out targets contribute no loss
+            start_loss = loss_fct(start_logits, start_positions)
+            end_loss = loss_fct(end_logits, end_positions)
+            total_loss = (start_loss + end_loss) / 2  # mean of the start and end losses
+
+        if not return_dict:
+            output = (start_logits, end_logits) + outputs[2:]
+            return ((total_loss,) + output) if total_loss is not None else output
+
+        return QuestionAnsweringModelOutput(
+            loss=total_loss,
+            start_logits=start_logits,
+            end_logits=end_logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+__all__ = [  # names exported by a star-import of this module
+    "FalconForCausalLM",
+    "FalconModel",
+    "FalconPreTrainedModel",
+    "FalconForSequenceClassification",
+    "FalconForTokenClassification",
+    "FalconForQuestionAnswering",
+]
diff --git a/phivenv/Lib/site-packages/transformers/models/falcon_h1/__init__.py b/phivenv/Lib/site-packages/transformers/models/falcon_h1/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..9749c5e1e982cd1ac49db2d560238e4d45907c62
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/falcon_h1/__init__.py
@@ -0,0 +1,27 @@
+# Copyright 2025 TII and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:  # taken only by static type checkers, which then see the real symbols
+    from .configuration_falcon_h1 import *
+    from .modeling_falcon_h1 import *
+else:  # runtime path: the submodules above are not imported here
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)  # importers now receive the _LazyModule instance instead of this module
diff --git a/phivenv/Lib/site-packages/transformers/models/falcon_h1/__pycache__/__init__.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/falcon_h1/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d27b6763102a8423973f8ffcb00b988b6ac63f14
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/falcon_h1/__pycache__/__init__.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/falcon_h1/__pycache__/configuration_falcon_h1.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/falcon_h1/__pycache__/configuration_falcon_h1.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..03f749eae503d79b23c0fa84e4e1f4f2e966a5b4
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/falcon_h1/__pycache__/configuration_falcon_h1.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/falcon_h1/__pycache__/modeling_falcon_h1.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/falcon_h1/__pycache__/modeling_falcon_h1.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..98410b91db5ab62806e2106fca8ce0d178673e4c
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/falcon_h1/__pycache__/modeling_falcon_h1.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/falcon_h1/__pycache__/modular_falcon_h1.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/falcon_h1/__pycache__/modular_falcon_h1.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bb48841d045d6bf1e45911282677af4ad57d50ec
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/falcon_h1/__pycache__/modular_falcon_h1.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/falcon_h1/configuration_falcon_h1.py b/phivenv/Lib/site-packages/transformers/models/falcon_h1/configuration_falcon_h1.py
new file mode 100644
index 0000000000000000000000000000000000000000..8e9aaaf3405f066d90731a3e84fd41cb3b352876
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/falcon_h1/configuration_falcon_h1.py
@@ -0,0 +1,282 @@
+# coding=utf-8
+# Copyright 2025 TII and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""FalconH1 model configuration"""
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class FalconH1Config(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`FalconH1Model`]. It is used to instantiate a
+    FalconH1Model model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with defaults taken from [ibm-fms/FalconH1-9.8b-2.2T-hf](https://huggingface.co/ibm-fms/FalconH1-9.8b-2.2T-hf).
+    The FalconH1Model is a hybrid [mamba2](https://github.com/state-spaces/mamba) architecture with SwiGLU.
+    The checkpoints are jointly trained by IBM, Princeton, and UIUC.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+    Args:
+        vocab_size (`int`, *optional*, defaults to 128000):
+            Vocabulary size of the FalconH1 model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`FalconH1Model`]
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether the model's input and output word embeddings should be tied. Note that this is only relevant if the
+            model has an output word embedding layer.
+        hidden_size (`int`, *optional*, defaults to 4096):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 14336):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 32):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 32):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        num_key_value_heads (`int`, *optional*, defaults to 8):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details, check out [this
+            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to `8`.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the decoder.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        num_logits_to_keep (`int` or `None`, *optional*, defaults to 1):
+            Number of prompt logits to calculate during generation. If `None`, all logits will be calculated. If an
+            integer value, only last `num_logits_to_keep` logits will be calculated. Default is 1 because only the
+            logits of the last prompt token are needed for generation. For long sequences, the logits for the entire
+            sequence may use a lot of memory so, setting `num_logits_to_keep=1` will reduce memory footprint
+            significantly.
+        pad_token_id (`int`, *optional*, defaults to 0):
+            The id of the padding token.
+        bos_token_id (`int`, *optional*, defaults to 1):
+            The id of the "beginning-of-sequence" token.
+        eos_token_id (`int`, *optional*, defaults to 2):
+            The id of the "end-of-sequence" token.
+        max_position_embeddings (`int`, *optional*, defaults to 8192):
+            Max cached sequence length for the model
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        mamba_d_ssm (`int`, *optional*, defaults to 1024):
+            The dimension of the SSM state space latents. If `None`, `mamba_expand * hidden_size` is used instead.
+        mamba_n_heads (`int`, *optional*, defaults to 128):
+            The number of mamba heads used in the v2 implementation. Must divide the mamba intermediate size, else a `ValueError` is raised.
+        mamba_d_head (`int` or `str`, *optional*, defaults to `"auto"`):
+            Head embedding dimension size. `"auto"` derives it as the mamba intermediate size divided by `mamba_n_heads`.
+        mamba_n_groups (`int`, *optional*, defaults to 1):
+            The number of the mamba groups used in the v2 implementation.
+        mamba_d_state (`int`, *optional*, defaults to 256):
+            The dimension of the mamba state space latents
+        mamba_d_conv (`int`, *optional*, defaults to 4):
+            The size of the mamba convolution kernel
+        mamba_expand (`int`, *optional*, defaults to 2):
+            Expanding factor (relative to hidden_size) used to determine the mamba intermediate size
+        mamba_chunk_size (`int`, *optional*, defaults to 256):
+            The chunks in which to break the sequence when doing prefill/training
+        mamba_conv_bias (`bool`, *optional*, defaults to `True`):
+            Flag indicating whether or not to use bias in the convolution layer of the mamba mixer block.
+        mamba_proj_bias (`bool`, *optional*, defaults to `False`):
+            Flag indicating whether or not to use bias in the input and output projections (["in_proj", "out_proj"]) of the mamba mixer block
+        mamba_norm_before_gate (`bool`, *optional*, defaults to `True`):
+            Whether to use RMSNorm before the gate in the Mamba block
+        mamba_rms_norm (`bool`, *optional*, defaults to `False`):
+            Whether to use RMSNorm instead of LayerNorm in the Mamba block
+        projectors_bias (`bool`, *optional*, defaults to `False`):
+            Flag indicating whether or not to use bias in the input and output projections (["in_proj", "out_proj"]) of the attention block
+        rope_theta (`float`, *optional*, defaults to 100000.0):
+            The theta value used for the RoPE embeddings.
+        rope_scaling (`float`, *optional*):
+            The scaling value used for the RoPE embeddings. If `None`, no scaling is applied.
+        lm_head_multiplier (`float`, *optional*, defaults to 1.0):
+            The multiplier for the LM head. This is used to scale the output of the LM head.
+        embedding_multiplier (`float`, *optional*, defaults to 1.0):
+            The multiplier for the embedding layer. This is used to scale the output of the embedding layer.
+        mlp_multipliers (`list[float]`, *optional*):
+            The multipliers for the MLP layers. This is used to scale the output of the MLP layers. The first value is
+            the multiplier of gate layer, the second value is the multiplier of the down_proj layer. Falls back to `[1.0, 1.0]` when `None`.
+        key_multiplier (`float`, *optional*):
+            The multiplier for the key layer. This is used to scale the output of the key layer. Falls back to `1.0` when `None`.
+        attention_out_multiplier (`float`, *optional*):
+            The multiplier for the attention output layer. This is used to scale the output of the attention output layer. Falls back to `1.0` when `None`.
+        attention_in_multiplier (`float`, *optional*):
+            The multiplier for the attention input layer. This is used to scale the output of the attention input layer. Falls back to `1.0` when `None`.
+        ssm_multipliers (`list[float]`, *optional*):
+            The multipliers for the SSM layers. This is used to scale the output of the SSM layers. Falls back to five `1.0` values when `None`.
+        ssm_in_multiplier (`float`, *optional*):
+            The multiplier for the SSM input layer. This is used to scale the output of the SSM input layer. Falls back to `1.0` when `None`.
+        ssm_out_multiplier (`float`, *optional*):
+            The multiplier for the SSM output layer. This is used to scale the output of the SSM output layer. Falls back to `1.0` when `None`.
+    """
+
+    model_type = "falcon_h1"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        vocab_size=128000,
+        tie_word_embeddings=False,
+        hidden_size=4096,
+        intermediate_size=14336,
+        num_hidden_layers=32,
+        num_attention_heads=32,
+        num_key_value_heads=8,
+        hidden_act="silu",
+        initializer_range=0.02,
+        rms_norm_eps=1e-5,
+        use_cache=True,
+        num_logits_to_keep=1,
+        pad_token_id=0,
+        bos_token_id=1,
+        eos_token_id=2,
+        max_position_embeddings=8192,
+        attention_dropout=0.0,
+        mamba_d_ssm=1024,
+        mamba_n_heads=128,
+        mamba_d_head="auto",
+        mamba_n_groups=1,
+        mamba_d_state=256,
+        mamba_d_conv=4,
+        mamba_expand=2,
+        mamba_chunk_size=256,
+        mamba_conv_bias=True,
+        mamba_proj_bias=False,
+        mamba_norm_before_gate=True,
+        mamba_rms_norm=False,
+        projectors_bias=False,
+        rope_theta=100000.0,
+        rope_scaling=None,
+        lm_head_multiplier=1.0,
+        embedding_multiplier=1.0,
+        mlp_multipliers=None,
+        key_multiplier=None,
+        attention_out_multiplier=None,
+        attention_in_multiplier=None,
+        ssm_multipliers=None,
+        ssm_in_multiplier=None,
+        ssm_out_multiplier=None,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.max_position_embeddings = max_position_embeddings
+        self.attention_dropout = attention_dropout
+        self.attention_bias = False  # fixed: not exposed as a constructor argument
+        self.mlp_bias = False  # fixed: not exposed as a constructor argument
+
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+
+        self.use_cache = use_cache
+        self.num_logits_to_keep = num_logits_to_keep
+
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self.projectors_bias = projectors_bias
+        # An explicit mamba_d_ssm takes precedence over the width derived from mamba_expand.
+        mamba_intermediate = mamba_expand * hidden_size if mamba_d_ssm is None else mamba_d_ssm
+
+        if mamba_intermediate % mamba_n_heads != 0:
+            raise ValueError(f"mamba_n_heads ({mamba_n_heads}) must divide the mamba intermediate size ({mamba_intermediate})")
+
+        # for the mamba_v2, must satisfy the following
+        if mamba_d_head == "auto":
+            mamba_d_head = mamba_intermediate // mamba_n_heads
+
+        if mamba_d_head * mamba_n_heads != mamba_intermediate:
+            raise ValueError(f"mamba_d_head ({mamba_d_head}) * mamba_n_heads ({mamba_n_heads}) must equal the mamba intermediate size ({mamba_intermediate})")
+
+        self.mamba_d_ssm = mamba_d_ssm
+        self.mamba_n_heads = mamba_n_heads
+        self.mamba_d_head = mamba_d_head
+        self.mamba_n_groups = mamba_n_groups
+        self.mamba_d_state = mamba_d_state
+        self.mamba_d_conv = mamba_d_conv
+        self.mamba_expand = mamba_expand
+        self.mamba_chunk_size = mamba_chunk_size
+        self.mamba_conv_bias = mamba_conv_bias
+        self.mamba_proj_bias = mamba_proj_bias
+
+        self.mamba_norm_before_gate = mamba_norm_before_gate
+        self.mamba_rms_norm = mamba_rms_norm
+
+        self.lm_head_multiplier = lm_head_multiplier
+        self.embedding_multiplier = embedding_multiplier
+
+        if mlp_multipliers is not None:  # each optional multiplier below falls back to a neutral 1.0
+            self.mlp_multipliers = mlp_multipliers
+        else:
+            self.mlp_multipliers = [1.0, 1.0]
+
+        if attention_out_multiplier is not None:
+            self.attention_out_multiplier = attention_out_multiplier
+        else:
+            self.attention_out_multiplier = 1.0
+
+        if attention_in_multiplier is not None:
+            self.attention_in_multiplier = attention_in_multiplier
+        else:
+            self.attention_in_multiplier = 1.0
+
+        if key_multiplier is not None:
+            self.key_multiplier = key_multiplier
+        else:
+            self.key_multiplier = 1.0
+
+        if ssm_multipliers is not None:
+            self.ssm_multipliers = ssm_multipliers
+        else:
+            self.ssm_multipliers = [1.0, 1.0, 1.0, 1.0, 1.0]
+
+        if ssm_in_multiplier is not None:
+            self.ssm_in_multiplier = ssm_in_multiplier
+        else:
+            self.ssm_in_multiplier = 1.0
+
+        if ssm_out_multiplier is not None:
+            self.ssm_out_multiplier = ssm_out_multiplier
+        else:
+            self.ssm_out_multiplier = 1.0
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+    @property
+    def layers_block_type(self):  # a fresh list on every access: "attention" once per hidden layer
+        return ["attention"] * self.num_hidden_layers
+
+
+__all__ = ["FalconH1Config"]  # the only name exported by a star-import of this module
diff --git a/phivenv/Lib/site-packages/transformers/models/falcon_h1/modeling_falcon_h1.py b/phivenv/Lib/site-packages/transformers/models/falcon_h1/modeling_falcon_h1.py
new file mode 100644
index 0000000000000000000000000000000000000000..fc90ac62248680e9831012b3d9865a0bd1573408
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/falcon_h1/modeling_falcon_h1.py
@@ -0,0 +1,1615 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/falcon_h1/modular_falcon_h1.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_falcon_h1.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# coding=utf-8
+# Copyright 2025 Technology Innovation Institute and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Any, Callable, Optional, Union
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from transformers.activations import ACT2FN
+
+from ...cache_utils import Cache
+from ...generation import GenerationMixin
+from ...integrations import use_kernel_forward_from_hub
+from ...modeling_attn_mask_utils import AttentionMaskConverter
+from ...modeling_flash_attention_utils import FlashAttentionKwargs
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
+from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from ...processing_utils import Unpack
+from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, is_torchdynamo_compiling, logging
+from ...utils.deprecation import deprecate_kwarg
+from ...utils.import_utils import is_causal_conv1d_available, is_mamba_2_ssm_available
+from .configuration_falcon_h1 import FalconH1Config
+
+
+if is_mamba_2_ssm_available():
+    from mamba_ssm.ops.triton.selective_state_update import selective_state_update
+    from mamba_ssm.ops.triton.ssd_combined import mamba_chunk_scan_combined, mamba_split_conv1d_scan_combined
+else:
+    selective_state_update = None
+
+if is_causal_conv1d_available():
+    from causal_conv1d import causal_conv1d_fn, causal_conv1d_update
+else:
+    causal_conv1d_update, causal_conv1d_fn = None, None
+
+
+logger = logging.get_logger(__name__)
+
+
+class FalconHybridMambaAttentionDynamicCache:
+    """
+    A dynamic cache that can handle both the attention cache (which has a seq_len dimension) and the mamba cache
+    (which has a constant shape regardless of seq_len).
+
+    The attention cache lives in two lists, `key_cache` and `value_cache`. Both start empty and are filled lazily by
+    `update()`, which concatenates new states along `dim=-2`; layers that were skipped are held as empty-list
+    placeholders until they receive their first states.
+
+    The mamba cache lives in two dicts keyed by layer index, both pre-allocated with zeros in `__init__`:
+    `conv_states[i]` has shape `(batch_size, intermediate_size + 2 * mamba_n_groups * mamba_d_state, mamba_d_conv)`
+    and `ssm_states[i]` has shape `(batch_size, mamba_n_heads, mamba_d_head, mamba_d_state)`.
+    """
+
+    is_compileable = False
+
+    def __init__(
+        self,
+        config: FalconH1Config,
+        batch_size: int,
+        dtype: torch.dtype = torch.float16,
+        devices: Optional[list[str]] = None,
+    ):
+        """
+        Pre-allocate the per-layer mamba states and create the (initially empty) attention cache.
+
+        Args:
+            config (`FalconH1Config`):
+                Model configuration; the `mamba_*`, `hidden_size` and `num_hidden_layers` fields are read.
+            batch_size (`int`):
+                Leading dimension of every pre-allocated state tensor.
+            dtype (`torch.dtype`, *optional*, defaults to `torch.float16`):
+                dtype of the pre-allocated state tensors.
+            devices (`list[str]`, *optional*):
+                One device per layer for the state tensors. When `None`, every state is allocated on torch's
+                current default device.
+        """
+        self.seqlen_offset = 0
+        self.dtype = dtype
+        self.has_previous_state = False
+        self.conv_kernel_size = config.mamba_d_conv
+
+        self.intermediate_size = (
+            config.mamba_d_ssm if config.mamba_d_ssm is not None else int(config.mamba_expand * config.hidden_size)
+        )
+
+        if devices is None:
+            # `device=None` lets `torch.zeros` pick the default device instead of failing on `devices[i]`.
+            devices = [None] * config.num_hidden_layers
+
+        conv_dim = self.intermediate_size + 2 * config.mamba_n_groups * config.mamba_d_state
+        self.conv_states = {
+            i: torch.zeros(
+                batch_size,
+                conv_dim,
+                self.conv_kernel_size,
+                device=devices[i],
+                dtype=dtype,
+            )
+            for i in range(config.num_hidden_layers)
+        }
+        self.ssm_states = {
+            i: torch.zeros(
+                batch_size,
+                config.mamba_n_heads,
+                config.mamba_d_head,
+                config.mamba_d_state,
+                device=devices[i],
+                dtype=dtype,
+            )
+            for i in range(config.num_hidden_layers)
+        }
+
+        # Every layer index is registered as a layer that owns an attention cache.
+        self.transformer_layers = list(range(config.num_hidden_layers))
+
+        self.key_cache: list[torch.Tensor] = []
+        self.value_cache: list[torch.Tensor] = []
+
+    def update(
+        self,
+        key_states: torch.Tensor,
+        value_states: torch.Tensor,
+        layer_idx: int,
+        cache_kwargs: Optional[dict[str, Any]] = None,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """
+        Updates the cache with the new `key_states` and `value_states` for the layer `layer_idx`.
+
+        Parameters:
+            key_states (`torch.Tensor`):
+                The new key states to cache.
+            value_states (`torch.Tensor`):
+                The new value states to cache.
+            layer_idx (`int`):
+                The index of the layer to cache the states for.
+            cache_kwargs (`dict[str, Any]`, `optional`):
+                Accepted for interface compatibility; this method does not read it.
+
+        Return:
+            A tuple containing the updated key and value states.
+        """
+        # Update the cache
+        if len(self.key_cache) <= layer_idx:
+            # There may be skipped layers, fill them with empty lists
+            for _ in range(len(self.key_cache), layer_idx):
+                self.key_cache.append([])
+                self.value_cache.append([])
+            self.key_cache.append(key_states)
+            self.value_cache.append(value_states)
+        elif len(self.key_cache[layer_idx]) == 0:  # fills previously skipped layers; checking for tensor causes errors
+            self.key_cache[layer_idx] = key_states
+            self.value_cache[layer_idx] = value_states
+        else:
+            self.key_cache[layer_idx] = torch.cat([self.key_cache[layer_idx], key_states], dim=-2)
+            self.value_cache[layer_idx] = torch.cat([self.value_cache[layer_idx], value_states], dim=-2)
+
+        return self.key_cache[layer_idx], self.value_cache[layer_idx]
+
+    def reorder_cache(self, beam_idx: torch.LongTensor):
+        """Reorders the cache for beam search, given the selected beam indices."""
+        for layer_idx in range(len(self.key_cache)):
+            # Skipped layers are empty-list placeholders (see `update`); they have no rows to reorder.
+            if isinstance(self.key_cache[layer_idx], torch.Tensor):
+                device = self.key_cache[layer_idx].device
+                self.key_cache[layer_idx] = self.key_cache[layer_idx].index_select(0, beam_idx.to(device))
+                device = self.value_cache[layer_idx].device
+                self.value_cache[layer_idx] = self.value_cache[layer_idx].index_select(0, beam_idx.to(device))
+
+            device = self.conv_states[layer_idx].device
+            self.conv_states[layer_idx] = self.conv_states[layer_idx].index_select(0, beam_idx.to(device))
+            device = self.ssm_states[layer_idx].device
+            self.ssm_states[layer_idx] = self.ssm_states[layer_idx].index_select(0, beam_idx.to(device))
+
+    def get_seq_length(self, layer_idx: Optional[int] = 0) -> int:
+        """Returns the sequence length of the cached states. A layer index can be optionally passed."""
+        # take any layer that contains cache and not empty tensor
+        layer_idx = self.transformer_layers[0] if layer_idx not in self.transformer_layers else layer_idx
+        # Nothing cached yet for this layer: either never reached, or still an empty-list placeholder.
+        if len(self.key_cache) <= layer_idx or not isinstance(self.key_cache[layer_idx], torch.Tensor):
+            return 0
+        return self.key_cache[layer_idx].shape[-2]
+
+    def update_conv_state(
+        self,
+        layer_idx: int,
+        new_conv_state: torch.Tensor,
+        cache_position: torch.LongTensor,
+    ) -> torch.Tensor:
+        """
+        Write `new_conv_state` into the stored convolution state of `layer_idx`, in place, and return it.
+
+        When `cache_position` holds more than one position the whole state is overwritten; otherwise the stored
+        state is shifted left by one along the last dim and only its last column is replaced.
+        """
+        conv_state = self.conv_states[layer_idx]
+        # Only the number of positions is used below; clamping does not change it.
+        cache_position = cache_position.clamp(0, self.conv_kernel_size - 1)
+
+        conv_state = conv_state.roll(shifts=-1, dims=-1)
+        if len(cache_position) > 1:
+            conv_state[:, :, :] = new_conv_state.to(conv_state.device)
+        else:
+            conv_state[:, :, -1] = new_conv_state[:, :, -1].to(conv_state.device)
+        # Update the stored tensor in place so existing references to it observe the new state.
+        self.conv_states[layer_idx].zero_()
+        self.conv_states[layer_idx] += conv_state
+        return self.conv_states[layer_idx]
+
+    def reset(self):
+        """Zero every pre-allocated conv and ssm state in place; the attention key/value cache is not touched."""
+        # The states are stored in dicts, so each tensor has to be zeroed individually.
+        for state in self.conv_states.values():
+            state.zero_()
+        for state in self.ssm_states.values():
+            state.zero_()
+
+
+class FalconH1RotaryEmbedding(nn.Module):
+    """
+    Produces the rotary position embedding `(cos, sin)` pair for a batch of position ids.
+
+    The inverse frequencies and the `attention_scaling` factor come from the `ROPE_INIT_FUNCTIONS` entry selected by
+    the config's `rope_scaling` dict (`"default"` when the config has no such dict).
+    """
+
+    inv_freq: torch.Tensor  # fix linting for `register_buffer`
+
+    def __init__(self, config: FalconH1Config, device=None):
+        super().__init__()
+        # BC: "rope_type" was originally "type"
+        if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict):
+            self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
+        else:
+            self.rope_type = "default"
+        self.max_seq_len_cached = config.max_position_embeddings
+        self.original_max_seq_len = config.max_position_embeddings
+
+        self.config = config
+        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
+
+        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
+        # Non-persistent: the buffer is not written to the state dict.
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+        # Keep a handle on the initial frequencies (presumably consumed by `dynamic_rope_update` - TODO confirm).
+        self.original_inv_freq = self.inv_freq
+
+    @torch.no_grad()
+    @dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
+    def forward(self, x, position_ids):
+        """
+        Compute the cosine and sine tables for `position_ids`.
+
+        `x` is only used for its device and dtype. Returns `(cos, sin)`, both cast to `x.dtype`.
+        """
+        # One copy of the frequency vector per batch row, with a trailing singleton dim for the matmul below.
+        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
+        # assumes position_ids is (batch, seq_len) - TODO confirm against callers
+        position_ids_expanded = position_ids[:, None, :].float()
+
+        # "mps" (and any non-string device type) is mapped to "cpu" for the autocast context.
+        device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
+        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
+            # Duplicate the frequencies along the last dim before taking cos/sin.
+            emb = torch.cat((freqs, freqs), dim=-1)
+            cos = emb.cos() * self.attention_scaling
+            sin = emb.sin() * self.attention_scaling
+
+        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
+
+
+def rotate_half(x):
+    """Split the last dim of `x` at its midpoint and return `[-second_half, first_half]` joined on that dim."""
+    midpoint = x.shape[-1] // 2
+    first_half, second_half = x[..., :midpoint], x[..., midpoint:]
+    return torch.cat([second_half.neg(), first_half], dim=-1)
+
+
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
+    """Rotate the query and key tensors with the given rotary position embedding.
+
+    Args:
+        q (`torch.Tensor`): The query tensor.
+        k (`torch.Tensor`): The key tensor.
+        cos (`torch.Tensor`): The cosine part of the rotary embedding.
+        sin (`torch.Tensor`): The sine part of the rotary embedding.
+        position_ids (`torch.Tensor`, *optional*):
+            Deprecated and unused.
+        unsqueeze_dim (`int`, *optional*, defaults to 1):
+            Dimension along which `cos` and `sin` get an extra singleton axis so that they broadcast against `q` and
+            `k`. With `cos`/`sin` of shape [batch_size, seq_len, head_dim], use `unsqueeze_dim=1` when `q` and `k` are
+            [batch_size, heads, seq_len, head_dim], and `unsqueeze_dim=2` when they are
+            [batch_size, seq_len, heads, head_dim].
+    Returns:
+        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
+    """
+    cos_b = cos.unsqueeze(unsqueeze_dim)
+    sin_b = sin.unsqueeze(unsqueeze_dim)
+
+    def _rotate(states):
+        # Same formula for queries and keys: x * cos + rotate_half(x) * sin.
+        return (states * cos_b) + (rotate_half(states) * sin_b)
+
+    return _rotate(q), _rotate(k)
+
+
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """
+    Repeat every key/value head `n_rep` times, like torch.repeat_interleave(x, dim=1, repeats=n_rep): the hidden states
+    go from (batch, num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim).
+    The input itself is returned when `n_rep == 1`.
+    """
+    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+    if n_rep == 1:
+        return hidden_states
+    # Insert a repeat axis right after the head axis, then fold it into the head axis.
+    repeated = hidden_states.unsqueeze(2).expand(batch, num_key_value_heads, n_rep, slen, head_dim)
+    return repeated.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
+
+
+def eager_attention_forward(
+    module: nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: Optional[torch.Tensor],
+    scaling: float,
+    dropout: float = 0.0,
+    **kwargs: Unpack[TransformersKwargs],
+):
+    """
+    Plain matmul/softmax attention.
+
+    Keys and values are first repeated `module.num_key_value_groups` times along the head axis. The scaled scores get
+    `attention_mask` added (when given), are soft-maxed in float32, cast back to the query dtype and passed through
+    dropout (active only while `module.training`). Extra `kwargs` are accepted and ignored.
+
+    Returns:
+        `(attn_output, attn_weights)`, where `attn_output` has dims 1 and 2 swapped relative to `query` and is
+        contiguous.
+    """
+    key_states = repeat_kv(key, module.num_key_value_groups)
+    value_states = repeat_kv(value, module.num_key_value_groups)
+
+    scores = torch.matmul(query, key_states.transpose(2, 3)) * scaling
+    if attention_mask is not None:
+        # Crop the mask's last dim to the number of key positions before adding it.
+        scores = scores + attention_mask[:, :, :, : key_states.shape[-2]]
+
+    probs = nn.functional.softmax(scores, dim=-1, dtype=torch.float32).to(query.dtype)
+    probs = nn.functional.dropout(probs, p=dropout, training=module.training)
+    context = torch.matmul(probs, value_states).transpose(1, 2).contiguous()
+
+    return context, probs
+
+
+class FalconH1Attention(nn.Module):
+    """
+    Multi-headed attention from 'Attention Is All You Need' paper.
+
+    Queries use `config.num_attention_heads` heads while keys/values use `config.num_key_value_heads` heads. Keys are
+    additionally multiplied by `config.key_multiplier` right after their projection.
+    """
+
+    def __init__(self, config: FalconH1Config, layer_idx: int):
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+        # Use `config.head_dim` when the config defines it, else derive it from the hidden size.
+        self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
+        # Number of query heads per key/value head.
+        self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
+        self.scaling = self.head_dim**-0.5
+        self.attention_dropout = config.attention_dropout
+        self.is_causal = True
+
+        self.q_proj = nn.Linear(
+            config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.k_proj = nn.Linear(
+            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.v_proj = nn.Linear(
+            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.o_proj = nn.Linear(
+            config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
+        )
+        self.key_multiplier = config.key_multiplier
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
+        attention_mask: Optional[torch.Tensor],
+        past_key_values: Optional[Cache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        """
+        Run attention over `hidden_states`.
+
+        Args:
+            hidden_states: input whose last dim is `config.hidden_size` (presumably `(batch, seq_len, hidden_size)`
+                - TODO confirm against callers).
+            position_embeddings: the `(cos, sin)` pair applied to queries and keys.
+            attention_mask: forwarded unchanged to the attention implementation.
+            past_key_values: when given, its `update()` receives the new keys/values for `self.layer_idx` and the
+                returned states are the ones attended over.
+            cache_position: forwarded to the cache inside `cache_kwargs`.
+            **kwargs: forwarded to the attention implementation.
+
+        Returns:
+            `(attn_output, attn_weights)`; `attn_output` has gone through `o_proj`.
+        """
+        input_shape = hidden_states.shape[:-1]
+        # Split the projected features into heads of size `head_dim`.
+        hidden_shape = (*input_shape, -1, self.head_dim)
+
+        query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+        # Keys are scaled by `key_multiplier` before the rotary embedding is applied.
+        key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2) * self.key_multiplier
+        value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+
+        cos, sin = position_embeddings
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+
+        if past_key_values is not None:
+            # sin and cos are specific to RoPE models; cache_position needed for the static cache
+            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        # Eager attention is the default; any other configured implementation is looked up in the registry.
+        attention_interface: Callable = eager_attention_forward
+        if self.config._attn_implementation != "eager":
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            dropout=0.0 if not self.training else self.attention_dropout,
+            scaling=self.scaling,
+            **kwargs,
+        )
+
+        # Merge the head dims back into a single feature dim before the output projection.
+        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
+        attn_output = self.o_proj(attn_output)
+        return attn_output, attn_weights
+
+
+class FalconH1RMSNormGated(torch.nn.Module):
+    """
+    Grouped RMS normalization with an optional SiLU gate.
+
+    The last dim of the input is split into `n_groups` equal groups and each group is RMS-normalized on its own,
+    then scaled by the learned `weight`. When a `gate` is passed, the states are multiplied by `silu(gate)` either
+    after the normalization (`norm_before_gate=True`) or before it (`norm_before_gate=False`).
+    """
+
+    def __init__(self, hidden_size, eps=1e-6, n_groups=1, norm_before_gate=True):
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+        self.n_groups = n_groups
+        self.norm_before_gate = norm_before_gate
+
+    def forward(self, hidden_states, gate=None):
+        """
+        Normalize `hidden_states` and optionally gate them.
+
+        Args:
+            hidden_states: tensor of shape `(batch, seq_len, dim)` or `(batch, dim)`.
+            gate: optional tensor multiplied in as `silu(gate)`; it must broadcast against `hidden_states`.
+
+        Returns:
+            A tensor with the same shape and dtype as `hidden_states`. The computation itself runs in float32.
+        """
+        input_dtype = hidden_states.dtype
+
+        if not self.norm_before_gate and gate is not None:
+            hidden_states = hidden_states * F.silu(gate.to(torch.float32))
+
+        # Remember whether the caller passed a 2-D tensor: only then is the temporary seq_len axis removed again.
+        # Squeezing whenever seq_len == 1 would turn a (batch, 1, dim) input into (batch, dim) and make a 3-D gate
+        # broadcast across the batch.
+        input_is_2d = len(hidden_states.shape) != 3
+        if input_is_2d:
+            batch_size, dim = hidden_states.shape
+            seq_len = 1
+        else:
+            batch_size, seq_len, dim = hidden_states.shape
+        hidden_states = hidden_states.to(torch.float32)
+
+        group_dim = int(dim // self.n_groups)
+        hidden_states = hidden_states.view(batch_size, seq_len, self.n_groups, group_dim)
+        variance = hidden_states.pow(2).mean(-1, keepdim=True)
+
+        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+
+        hidden_states = self.weight.view(self.n_groups, group_dim) * hidden_states
+        hidden_states = hidden_states.view(batch_size, seq_len, dim)
+
+        if input_is_2d:
+            hidden_states = hidden_states.squeeze(1)
+
+        if self.norm_before_gate and gate is not None:
+            hidden_states = hidden_states * F.silu(gate.to(torch.float32))
+        return hidden_states.to(input_dtype)
+
+
+# Helper methods for segment sum computation
+
+
+def pad_tensor_by_size(input_tensor: torch.Tensor, pad_size: int):
+    """
+    Zero-pad `input_tensor` with `pad_size` extra trailing entries on the seq_len dim (dim=1).
+
+    Only 4-dim and 3-dim tensors are expected; anything that is not 4-dim gets the 3-dim padding spec.
+    """
+    # `pad` takes (left, right) pairs starting from the LAST dim, so the pair for dim=1 comes second to last.
+    if len(input_tensor.shape) == 4:
+        pad_spec = (0, 0, 0, 0, 0, pad_size, 0, 0)
+    else:
+        pad_spec = (0, 0, 0, pad_size, 0, 0)
+
+    return torch.nn.functional.pad(input_tensor, pad_spec, mode="constant", value=0)
+
+
+def reshape_into_chunks(input_tensor, pad_size, chunk_size):
+    """
+    Pad `input_tensor` with `pad_size` on the seq_len dim (dim=1), then split that dim into chunks of `chunk_size`.
+
+    Assumes that we only have tensors of either size 4 or 3
+    """
+    # [bsz, seq_len, ...] -> [bsz, seq_len multiple of chunk_size, ...]
+    padded = pad_tensor_by_size(input_tensor, pad_size)
+
+    if len(padded.shape) == 3:
+        # [bsz, seq_len multiple of chunk_size, num_heads] -> [bsz, -1, chunk_size, num_heads]
+        target_shape = (padded.shape[0], -1, chunk_size, padded.shape[2])
+    else:
+        # [bsz, seq_len multiple of chunk_size, num_heads, head_dim or state_size]
+        #   -> [bsz, -1, chunk_size, num_heads, head_dim or state_size]
+        target_shape = (padded.shape[0], -1, chunk_size, padded.shape[2], padded.shape[3])
+    return padded.reshape(target_shape)
+
+
+def segment_sum(input_tensor):
+    """
+    More stable segment sum calculation. Uses cumulative sums and masking instead of direct subtractions.
+
+    For an input of shape [..., chunk_size] the result has shape [..., chunk_size, chunk_size]: entry (i, j) holds the
+    sum of the inputs at positions j+1..i when i >= j, and -inf above the diagonal.
+    """
+    chunk_size = input_tensor.size(-1)
+    all_true = torch.ones(chunk_size, chunk_size, device=input_tensor.device, dtype=torch.bool)
+    strictly_lower = torch.tril(all_true, diagonal=-1)
+    lower_with_diag = torch.tril(all_true, diagonal=0)
+
+    # [..., chunk_size] -> [..., chunk_size, chunk_size], repeating each value along the new last dim
+    repeated = input_tensor.unsqueeze(-1).expand(*input_tensor.size(), chunk_size)
+    # zero the diagonal and everything above it so the running sum only collects strictly-lower entries
+    repeated = repeated.masked_fill(~strictly_lower, 0)
+    running = torch.cumsum(repeated, dim=-2)
+
+    # keep the lower triangle (diagonal included this time) and mark the rest as -inf
+    return running.masked_fill(~lower_with_diag, -torch.inf)
+
+
+# True only when all three optional kernel entry points were imported, i.e. none of them is the `None` fallback.
+is_fast_path_available = all((selective_state_update, causal_conv1d_fn, causal_conv1d_update))
+
+
+def apply_mask_to_padding_states(hidden_states, attention_mask):
+    """
+    Zeroes out the hidden states for padding tokens, see https://github.com/state-spaces/mamba/issues/66
+
+    The mask is applied only when it has more than one row and more than one column; otherwise `hidden_states` is
+    returned unchanged. The result keeps the dtype of `hidden_states`.
+    """
+    if attention_mask is None or attention_mask.shape[1] <= 1 or attention_mask.shape[0] <= 1:
+        return hidden_states
+
+    original_dtype = hidden_states.dtype
+    # Add a trailing axis so the 2-D mask broadcasts over the feature dim.
+    return (hidden_states * attention_mask[:, :, None]).to(original_dtype)
+
+
+# Adapted from transformers.models.mamba2.modeling_mamba2.Mamba2Mixer
+class FalconH1Mixer(nn.Module):
+    """
+    FalconH1Mixer is identical to classic Mamba2 mixer classes but differs on two different things
+    - Users can pass custom intermediate_size through `config.mamba_d_ssm`
+    - The use of gated RMS normalization layer is optional
+    """
+
+    def __init__(self, config: FalconH1Config, layer_idx: int):
+        """
+        Build the mixer's parameters and sub-modules from `config`.
+
+        Args:
+            config: model configuration; the `mamba_*`, `hidden_size`, `hidden_act`, `rms_norm_eps`,
+                `projectors_bias`, `ssm_multipliers` and `ssm_in_multiplier` fields are read.
+            layer_idx: index of this layer, stored on the module.
+        """
+        super().__init__()
+        self.num_heads = config.mamba_n_heads
+        self.hidden_size = config.hidden_size
+        self.ssm_state_size = config.mamba_d_state
+        self.conv_kernel_size = config.mamba_d_conv
+        # An explicit `mamba_d_ssm` wins; otherwise the size is derived from the expansion factor.
+        self.intermediate_size = (
+            int(config.mamba_expand * self.hidden_size) if config.mamba_d_ssm is None else config.mamba_d_ssm
+        )
+        self.layer_idx = layer_idx
+        self.use_conv_bias = config.mamba_conv_bias
+        self.activation = config.hidden_act
+        self.act = ACT2FN[config.hidden_act]
+        self.use_bias = config.mamba_proj_bias
+
+        self.layer_norm_epsilon = config.rms_norm_eps
+        self.groups_time_state_size = config.mamba_n_groups * self.ssm_state_size
+
+        self.n_groups = config.mamba_n_groups
+        self.head_dim = config.mamba_d_head
+        self.chunk_size = config.mamba_chunk_size
+
+        # FIXME:
+        self.time_step_limit = (0.0, float("inf"))
+        self.time_step_min = 0.001
+        self.time_step_max = 0.1
+
+        # Channels seen by the convolution: the hidden part plus two blocks of n_groups * ssm_state_size.
+        self.conv_dim = self.intermediate_size + 2 * self.n_groups * self.ssm_state_size
+        # groups == in_channels == out_channels, i.e. one filter per channel.
+        self.conv1d = nn.Conv1d(
+            in_channels=self.conv_dim,
+            out_channels=self.conv_dim,
+            bias=config.mamba_conv_bias,
+            kernel_size=self.conv_kernel_size,
+            groups=self.conv_dim,
+            padding=self.conv_kernel_size - 1,
+        )
+
+        # projection of the input hidden states
+        projection_size = self.intermediate_size + self.conv_dim + self.num_heads
+        self.in_proj = nn.Linear(
+            self.hidden_size,
+            projection_size,
+            bias=self.use_bias,
+        )
+        # selective projection used to make dt, B and C input dependent
+
+        # time step projection (discretization)
+        # instantiate once and copy inv_dt in init_weights of PretrainedModel
+        self.dt_bias = nn.Parameter(torch.ones(self.num_heads))
+
+        # S4D real initialization. These are not discretized!
+        # The core is to load them, compute the discrete states, then write the updated state. Keeps the memory bounded
+        # A_log starts as log(1), log(2), ..., log(num_heads).
+        A = torch.arange(1, self.num_heads + 1)
+        self.A_log = nn.Parameter(torch.log(A))
+        self.A_log._no_weight_decay = True
+        self.mamba_rms_norm = config.mamba_rms_norm
+
+        # The gated RMS norm only exists when the config enables it.
+        if self.mamba_rms_norm:
+            self.norm = FalconH1RMSNormGated(
+                self.intermediate_size,
+                eps=self.layer_norm_epsilon,
+                n_groups=self.n_groups,
+                norm_before_gate=config.mamba_norm_before_gate,
+            )
+        self.D = nn.Parameter(torch.ones(self.num_heads))
+        self.D._no_weight_decay = True
+
+        self.out_proj = nn.Linear(self.intermediate_size, config.hidden_size, bias=config.projectors_bias)
+
+        # Tell the user once which implementation will run.
+        if not is_fast_path_available:
+            logger.warning_once(
+                "The fast path is not available because on of `(selective_state_update, causal_conv1d_fn, causal_conv1d_update)`"
+                " is None. Falling back to the naive implementation. To install follow https://github.com/state-spaces/mamba/#installation and"
+                " https://github.com/Dao-AILab/causal-conv1d"
+            )
+        else:
+            logger.warning_once("The fast path for FalconH1 will be used when running the model on a GPU")
+
+        self.zxbcdt_multipliers = config.ssm_multipliers
+        self.ssm_in_multiplier = config.ssm_in_multiplier
+
+    def cuda_kernels_forward(
+        self,
+        hidden_states: torch.Tensor,
+        cache_params: Optional[FalconHybridMambaAttentionDynamicCache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+    ):
+        # 1. Gated MLP's linear projection
+        hidden_states = apply_mask_to_padding_states(hidden_states, attention_mask)
+        # Add Multipliers
+        hidden_states = hidden_states * self.ssm_in_multiplier
+        projected_states = self.in_proj(hidden_states)
+        projected_states = projected_states * self.mup_vector  # ADD Mup Multipliers
+        d_to_remove = 2 * self.intermediate_size + 2 * self.n_groups * self.ssm_state_size + self.num_heads
+
+        # Set up dimensions for reshapes later
+        batch_size, seq_len, _ = hidden_states.shape
+        groups_time_state_size = self.n_groups * self.ssm_state_size
+
+        use_precomputed_states = (
+            cache_params is not None
+            and cache_params.has_previous_state
+            and seq_len == 1
+            and cache_params.conv_states[self.layer_idx].shape[0]
+            == cache_params.ssm_states[self.layer_idx].shape[0]
+            == batch_size
+            and cache_position is not None
+            and cache_position[0] > 0
+        )
+
+        # getting projected states from cache if it exists
+        if use_precomputed_states:
+            d_mlp = (projected_states.squeeze(1).shape[-1] - d_to_remove) // 2
+
+            z0, x0, gate, hidden_states_B_C, dt = projected_states.squeeze(1).split(
+                [d_mlp, d_mlp, self.intermediate_size, self.conv_dim, self.num_heads], dim=-1
+            )
+
+            # 2. Convolution sequence transformation
+            hidden_states_B_C = causal_conv1d_update(
+                hidden_states_B_C,
+                cache_params.conv_states[self.layer_idx],
+                self.conv1d.weight.squeeze(1),
+                self.conv1d.bias,
+                self.activation,
+            )
+
+            hidden_states, B, C = torch.split(
+                hidden_states_B_C,
+                [self.intermediate_size, groups_time_state_size, groups_time_state_size],
+                dim=-1,
+            )
+
+            # 3. SSM transformation
+            A = -torch.exp(self.A_log.float())  # (nheads,)
+            A = A[:, None, ...][:, :, None].expand(-1, self.head_dim, self.ssm_state_size).to(dtype=torch.float32)
+            dt = dt[:, :, None].expand(-1, -1, self.head_dim)
+            dt_bias = self.dt_bias[:, None, ...].expand(-1, self.head_dim)
+            D = self.D[:, None, ...].expand(-1, self.head_dim)
+            B = B.view(batch_size, self.n_groups, B.shape[1] // self.n_groups)
+            C = C.view(batch_size, self.n_groups, C.shape[1] // self.n_groups)
+            hidden_states_reshaped = hidden_states.view(batch_size, self.num_heads, self.head_dim)
+            hidden_states = selective_state_update(
+                cache_params.ssm_states[self.layer_idx],
+                hidden_states_reshaped,
+                dt,
+                A,
+                B,
+                C,
+                D,
+                z=gate.view(batch_size, self.num_heads, self.head_dim) if not self.mamba_rms_norm else None,
+                dt_bias=dt_bias,
+                dt_softplus=True,
+            )
+            hidden_states = hidden_states.view(batch_size, self.num_heads * self.head_dim)
+
+            if self.mamba_rms_norm:
+                hidden_states = self.norm(hidden_states, gate)
+
+            if d_mlp > 0:
+                hidden_states = torch.cat([F.silu(z0) * x0, hidden_states], dim=-1)
+
+            # 4. Final linear projection
+            out = self.out_proj(hidden_states[:, None, ...])
+        # Fused calculations or step by step if no initialized cache is found
+        else:
+            A = -torch.exp(self.A_log.float())  # (num_heads) or (intermediate_size, state_size)
+            dt_limit_kwargs = {} if self.time_step_limit == (0.0, float("inf")) else {"dt_limit": self.time_step_limit}
+
+            # 2-4. Fused kernel for conv1d, SSM, and the final projection
+            if self.training and cache_params is None:
+                out = mamba_split_conv1d_scan_combined(
+                    projected_states,
+                    self.conv1d.weight.squeeze(1),
+                    self.conv1d.bias,
+                    self.dt_bias,
+                    A,
+                    D=self.D,
+                    chunk_size=self.chunk_size,
+                    seq_idx=None,  # was seq_idx
+                    activation=self.activation,
+                    rmsnorm_weight=self.norm.weight if self.mamba_rms_norm else None,
+                    rmsnorm_eps=self.norm.variance_epsilon if self.mamba_rms_norm else None,
+                    outproj_weight=self.out_proj.weight,
+                    outproj_bias=self.out_proj.bias,
+                    headdim=self.head_dim,
+                    ngroups=self.n_groups,
+                    norm_before_gate=False,
+                    return_final_states=False,
+                    **dt_limit_kwargs,
+                )
+
+            else:
+                d_mlp = (
+                    projected_states.shape[-1]
+                    - 2 * self.intermediate_size
+                    - 2 * self.n_groups * self.ssm_state_size
+                    - self.num_heads
+                ) // 2
+                if attention_mask is not None:
+                    projected_states = projected_states * attention_mask[..., None]
+                _, gate, hidden_states_B_C, dt = projected_states.split(
+                    [
+                        2 * d_mlp,
+                        self.intermediate_size,
+                        self.conv_dim,
+                        self.num_heads,
+                    ],
+                    dim=-1,
+                )
+
+                if cache_params is not None:
+                    conv_states = F.pad(
+                        hidden_states_B_C.permute(0, 2, 1),
+                        (self.conv_kernel_size - hidden_states_B_C.shape[-2], 0),
+                    )
+                    cache_params.update_conv_state(self.layer_idx, conv_states, cache_position)
+
+                time_step = nn.functional.softplus(dt + self.dt_bias)
+                # 1D Convolution
+                if causal_conv1d_fn is None or self.activation not in ["silu", "swish"]:
+                    hidden_states_B_C = self.act(
+                        self.conv1d(hidden_states_B_C.transpose(1, 2)).transpose(1, 2)[:, :seq_len]
+                    )  # (B, L, self.d_inner + 2 * ngroups * d_state)
+                else:
+                    hidden_states_B_C = causal_conv1d_fn(
+                        x=hidden_states_B_C.transpose(1, 2),
+                        weight=self.conv1d.weight.squeeze(1),
+                        bias=self.conv1d.bias,
+                        activation=self.activation,
+                    ).transpose(1, 2)[:, :seq_len]
+
+                hidden_states, B, C = torch.split(
+                    hidden_states_B_C,
+                    [
+                        self.intermediate_size,
+                        groups_time_state_size,
+                        groups_time_state_size,
+                    ],
+                    dim=-1,
+                )
+
+                if attention_mask is not None and attention_mask.shape[1] > 1 and attention_mask.shape[0] > 1:
+                    # tune out hidden states for pad tokens, see https://github.com/state-spaces/mamba/issues/66
+                    dtype = hidden_states.dtype
+                    hidden_states = (hidden_states * attention_mask[:, :, None]).to(dtype)
+                # This is a hack to make sure multi-GPU inference works with HF accelerate
+                # see: https://github.com/Dao-AILab/flash-attention/issues/523 for more details
+                with torch.cuda.device(hidden_states.device):
+                    scan_output, ssm_state = mamba_chunk_scan_combined(
+                        hidden_states.view(batch_size, seq_len, -1, self.head_dim),
+                        time_step,
+                        A,
+                        B.view(batch_size, seq_len, self.n_groups, -1),
+                        C.view(batch_size, seq_len, self.n_groups, -1),
+                        chunk_size=self.chunk_size,
+                        D=self.D,
+                        z=None,
+                        seq_idx=None,
+                        return_final_states=True,
+                        **dt_limit_kwargs,
+                    )
+                if ssm_state is not None and cache_params is not None:
+                    cache_params.ssm_states[self.layer_idx].copy_(ssm_state)
+                scan_output = scan_output.view(batch_size, seq_len, -1)
+                # Multiply "gate" branch and apply extra normalization layer
+                if self.mamba_rms_norm:
+                    out = self.norm(scan_output, gate)
+                else:
+                    out = scan_output * torch.nn.functional.silu(gate)
+                out = self.out_proj(out)
+        return out
+
+    # fmt: off
+    def torch_forward(
+        self,
+        input_states,
+        cache_params: Optional[FalconHybridMambaAttentionDynamicCache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+    ):
+        """
+        Pure-PyTorch implementation of the mixer forward pass (no fused kernels).
+
+        Two regimes are handled:
+          * single-token decoding that reuses the cached conv/SSM states
+            (`use_precomputed_states` is True), and
+          * full-sequence processing with the chunked "naive SSD" algorithm, which
+            also writes the conv/SSM states into the cache when `cache_params` is given.
+
+        Args:
+            input_states: tensor unpacked as `(batch_size, seq_len, _)`.
+            cache_params: optional cache exposing `conv_states`, `ssm_states` and
+                `has_previous_state`; the entries for `self.layer_idx` are updated in place.
+            cache_position: optional position tensor; its first entry must be > 0 for the
+                cached single-token path to be taken.
+            attention_mask: optional mask forwarded to `apply_mask_to_padding_states`.
+
+        Returns:
+            `self.out_proj` applied to the gated (or gated + normalised) SSM output, which is
+            cast back to the dtype of `input_states` right before the projection.
+        """
+        batch_size, seq_len, _ = input_states.shape
+        dtype = input_states.dtype
+
+        # 1. Gated MLP's linear projection
+        input_states = apply_mask_to_padding_states(input_states, attention_mask)
+        # Add Multipliers
+        input_states = input_states * self.ssm_in_multiplier
+        projected_states = self.in_proj(input_states)
+        projected_states = projected_states * self.mup_vector  # ADD Mup Multipliers
+        # Split the projection into the gate, the conv input (x, B, C packed together) and dt.
+        gate, hidden_states_B_C, dt = projected_states.split([
+                self.intermediate_size, self.conv_dim, self.num_heads
+            ], dim=-1)
+
+        # The cached single-step path is only valid when a previous state exists, exactly one
+        # new token is fed, the cached batch dimension matches, and we are past position 0.
+        use_precomputed_states = (
+            cache_params is not None
+            and cache_params.has_previous_state
+            and seq_len == 1
+            and cache_params.conv_states[self.layer_idx].shape[0]
+            == cache_params.ssm_states[self.layer_idx].shape[0]
+            == batch_size
+            and cache_position is not None
+            and cache_position[0] > 0
+        )
+
+        # 2. Convolution sequence transformation
+        if use_precomputed_states:
+            # Shift the rolling conv window left by one and write the new token into the last slot.
+            cache_params.conv_states[self.layer_idx] = cache_params.conv_states[self.layer_idx].roll(shifts=-1, dims=-1)
+            cache_params.conv_states[self.layer_idx][:, :, -1] = hidden_states_B_C[:, 0, :].to(cache_params.conv_states[self.layer_idx].device)
+
+            # We need to guarantee that anything regarding the cache is on the same device
+            conv_states = cache_params.conv_states[self.layer_idx].to(device=self.conv1d.weight.device)
+
+            # Single-position convolution: weighted sum over the cached window.
+            hidden_states_B_C = torch.sum(
+                conv_states * self.conv1d.weight.squeeze(1), dim=-1
+            )
+            if self.use_conv_bias:
+                hidden_states_B_C = hidden_states_B_C + self.conv1d.bias
+            hidden_states_B_C = self.act(hidden_states_B_C)
+        else:
+            # Init cache
+            if cache_params is not None:
+                hidden_states_B_C_transposed = hidden_states_B_C.transpose(1, 2)
+                # Left-pad by (kernel size - sequence length); F.pad with a negative amount crops instead.
+                conv_states = nn.functional.pad(
+                    hidden_states_B_C_transposed, (self.conv_kernel_size - hidden_states_B_C_transposed.shape[-1], 0)
+                )
+                cache_params.conv_states[self.layer_idx].copy_(conv_states)
+
+            # Run the conv over the whole sequence and keep only the first `seq_len` outputs.
+            hidden_states_B_C = self.act(self.conv1d(hidden_states_B_C.transpose(1, 2))[..., :seq_len].transpose(1, 2))
+
+        hidden_states_B_C = apply_mask_to_padding_states(hidden_states_B_C, attention_mask)
+        hidden_states, B, C = torch.split(
+            hidden_states_B_C,
+            [self.intermediate_size, self.n_groups * self.ssm_state_size, self.n_groups * self.ssm_state_size],
+            dim=-1
+        )
+
+        # 3. SSM transformation
+        A = -torch.exp(self.A_log.float())                            # [num_heads]
+        if use_precomputed_states:
+            # We need to guarantee that anything regarding the cache is on the same device
+            cache_device = cache_params.ssm_states[self.layer_idx].device
+
+            # Note: there is no need to pad parameter matrices here, as there is just one new token
+            # for batched generation
+            dt = dt[:, 0, :][:, None, ...]
+            dt = dt.transpose(1, 2).expand(batch_size, dt.shape[-1], self.head_dim)
+            # [num_heads] -> [num_heads, head_dim]
+            dt_bias = self.dt_bias[..., None].expand(self.dt_bias.shape[0], self.head_dim)
+
+            dt = torch.nn.functional.softplus(dt + dt_bias.to(dt.dtype))
+            dt = torch.clamp(dt, self.time_step_limit[0], self.time_step_limit[1])
+            A = A[..., None, None].expand(self.num_heads, self.head_dim, self.ssm_state_size).to(dtype=torch.float32)
+            # [bsz, num_heads, head_dim, state_size]
+            dA = (torch.exp(dt[..., None] * A)).to(device=cache_device)
+
+            # Discretize B
+            # [bsz, n_groups * state_size] -> [bsz, n_groups, 1, state_size] ->
+            # -> [bsz, n_groups, group to head repetition factor, state_size] -> [bsz, num_heads, state_size]
+            B = B.reshape(batch_size, self.n_groups, -1)[..., None, :]
+            B = B.expand(batch_size, self.n_groups, self.num_heads // self.n_groups, B.shape[-1]).contiguous()
+            B = B.reshape(batch_size, -1, B.shape[-1])
+            # [bsz, num_heads, head_dim, state_size]
+            dB = dt[..., None] * B[..., None, :]
+
+            # Discretize x into dB
+            # [bsz, intermediate_size] -> [bsz, num_heads, head_dim]
+            hidden_states = hidden_states.reshape(batch_size, -1, self.head_dim)
+            dBx = (dB * hidden_states[..., None]).to(device=cache_device)
+
+            # State calculation
+            # Recurrence step: new_state = old_state * dA + dBx, written back into the cache in place.
+            cache_params.ssm_states[self.layer_idx].copy_(
+                cache_params.ssm_states[self.layer_idx] * dA + dBx
+            )
+
+            # Subsequent output
+            # [bsz, n_groups * state_size] -> [bsz, num_heads, state_size]
+            C = C.reshape(batch_size, self.n_groups, -1)[..., None, :]
+            C = C.expand(batch_size, self.n_groups, self.num_heads // self.n_groups, C.shape[-1]).contiguous()
+            C = C.reshape(batch_size, -1, C.shape[-1])
+            # [bsz, num_heads, head_dim]
+
+            ssm_states = cache_params.ssm_states[self.layer_idx].to(device=C.device, dtype=C.dtype)  # Shape: [b, h, d, n]
+            # Reshape ssm_states to merge the first two dimensions
+            ssm_states_reshaped = ssm_states.view(batch_size * self.num_heads, self.head_dim, self.ssm_state_size)  # Shape: [b*h, d, n]
+            C_reshaped = C.view(batch_size * self.num_heads, self.ssm_state_size, 1)  # Shape: [b*h, n, 1]
+            # Contract the state dimension: y = state @ C for every (batch, head) pair.
+            y = torch.bmm(ssm_states_reshaped, C_reshaped)
+            y = y.view(batch_size, self.num_heads, self.head_dim)
+
+            # D skip connection
+            # [num_heads] -> [num_heads, head_dim]
+            D = self.D[..., None].expand(self.D.shape[0], self.head_dim)
+            y = (y + hidden_states * D).to(y.dtype)
+
+            # [bsz, num_heads, head_dim] -> [bsz, 1, intermediate_size]
+            y = y.reshape(batch_size, -1)[:, None, ...]
+        else:
+            # begin ssd naive implementation without einsums
+            dt = nn.functional.softplus(dt + self.dt_bias)
+            dt = torch.clamp(dt, self.time_step_limit[0], self.time_step_limit[1])
+            hidden_states = hidden_states.reshape(batch_size, seq_len, -1, self.head_dim).float()
+            B = B.reshape(batch_size, seq_len, -1, self.ssm_state_size).float()
+            C = C.reshape(batch_size, seq_len, -1, self.ssm_state_size).float()
+            # Repeat each group's B/C so that every head has its own copy.
+            B = B.repeat_interleave(self.num_heads // self.n_groups, dim=2, output_size=self.num_heads)
+            C = C.repeat_interleave(self.num_heads // self.n_groups, dim=2, output_size=self.num_heads)
+            # Number of positions needed to round seq_len up to a multiple of chunk_size (0 if already aligned).
+            pad_size = (self.chunk_size - seq_len % self.chunk_size) % self.chunk_size
+
+            # Skip-connection term, computed from the un-discretized hidden states.
+            D_residual = self.D[..., None] * pad_tensor_by_size(hidden_states, pad_size)
+
+            # Discretize x and A
+            hidden_states = hidden_states * dt[..., None]
+            A = A.to(hidden_states.dtype) * dt
+
+            # Rearrange into blocks/chunks
+            hidden_states, A, B, C = [reshape_into_chunks(t, pad_size, self.chunk_size) for t in (hidden_states, A, B, C)]
+
+            # [bsz, -1, chunk_size, num_heads] -> [bsz, num_heads, -1, chunk_size]
+            A = A.permute(0, 3, 1, 2)
+            A_cumsum = torch.cumsum(A, dim=-1)
+
+            # 1. Compute the output for each intra-chunk (diagonal blocks)
+            # This is the analog of a causal mask
+            L = torch.exp(segment_sum(A))
+
+            # Contraction of C and B to get G (attention-weights like)
+            G_intermediate = C[:, :, :, None, :, :] * B[:, :, None, :, :, :]  # shape: (b, c, l, s, h, n)
+            G = G_intermediate.sum(dim=-1)  # shape: (b, c, l, s, h)
+
+            # Compute M, equivalent to applying attention mask to weights
+            M_intermediate = G[..., None] * L.permute(0, 2, 3, 4, 1)[..., None]
+            M = M_intermediate.sum(dim=-1)
+
+            # Compute Y_diag (apply to values)
+            Y_diag = (M[..., None] * hidden_states[:, :, None]).sum(dim=3)
+
+            # 2. Compute the state for each intra-chunk
+            # (right term of low-rank factorization of off-diagonal blocks; B terms)
+            decay_states = torch.exp(A_cumsum[:, :, :, -1:] - A_cumsum)
+            B_decay = B * decay_states.permute(0, -2, -1, 1)[..., None]
+            states = (B_decay[..., None, :] * hidden_states[..., None]).sum(dim=2)
+
+            # 3. Compute the inter-chunk SSM recurrence; produces correct SSM states at chunk boundaries
+            # (middle term of factorization of off-diag blocks; A terms)
+            # NOTE(review): this code lives in the `else` of `if use_precomputed_states:` above, so the
+            # flag is always False here and the first branch below is unreachable as written.
+            if use_precomputed_states:
+                previous_states = cache_params.ssm_states[self.layer_idx][:, None, ...].to(device=states.device)
+            else:
+                previous_states = torch.zeros_like(states[:, :1])
+            states = torch.cat([previous_states, states], dim=1)
+            decay_chunk = torch.exp(segment_sum(nn.functional.pad(A_cumsum[:, :, :, -1], (1, 0))))
+            decay_chunk = decay_chunk.transpose(1, 3)
+            new_states = (decay_chunk[..., None, None] * states[:, :, None, ...]).sum(dim=1)
+            # All but the last entry feed the per-chunk outputs; the last entry is the final SSM state.
+            states, ssm_state = new_states[:, :-1], new_states[:, -1]
+
+            # 4. Compute state -> output conversion per chunk
+            # (left term of low-rank factorization of off-diagonal blocks; C terms)
+            state_decay_out = torch.exp(A_cumsum)
+            C_times_states = (C[..., None, :] * states[:, :, None, ...])
+            state_decay_out_permuted = state_decay_out.permute(0, 2, 3, 1)
+            Y_off = (C_times_states.sum(-1) * state_decay_out_permuted[..., None])
+
+            # Add output of intra-chunk and inter-chunk terms (diagonal and off-diagonal blocks)
+            y = Y_diag + Y_off
+            # [bsz, -1, self.chunk_size, num_heads, head_dim] -> [bsz, (padded) seq_len, num_heads, head_dim]
+            y = y.reshape(batch_size, -1, self.num_heads, self.head_dim)
+
+            y = y + D_residual
+            # Cutting off padded chunks
+            if pad_size > 0:
+                y = y[:, :seq_len, :, :]
+            y = y.reshape(batch_size, seq_len, -1)
+
+            # Init cache
+            if ssm_state is not None and cache_params is not None:
+                cache_params.ssm_states[self.layer_idx].copy_(ssm_state)
+
+        # Gate the SSM output: either through the gated norm module or a plain SiLU gate.
+        if self.mamba_rms_norm:
+            scan_output = self.norm(y, gate)
+        else:
+            scan_output = y * torch.nn.functional.silu(gate)
+
+        # end ssd naive
+
+        # 4. Final linear projection
+        contextualized_states = self.out_proj(scan_output.to(dtype))  # [batch, seq_len, hidden_size]
+        return contextualized_states
+    # fmt: on
+
+    def forward(
+        self,
+        hidden_states,
+        cache_params: Optional[FalconHybridMambaAttentionDynamicCache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+    ):
+        if is_fast_path_available and "cuda" in self.in_proj.weight.device.type:
+            return self.cuda_kernels_forward(hidden_states, cache_params, cache_position, attention_mask)
+        dtype = hidden_states.dtype
+        if attention_mask is not None and attention_mask.shape[1] > 1 and attention_mask.shape[0] > 1:
+            # tune out hidden states for pad tokens, see https://github.com/state-spaces/mamba/issues/66
+            hidden_states = (hidden_states * attention_mask[:, :, None]).to(dtype)
+
+        return self.torch_forward(hidden_states, cache_params, cache_position, attention_mask)
+
+
+class FalconH1MLP(nn.Module):
+    def __init__(self, config: FalconH1Config):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.intermediate_size = config.intermediate_size
+        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
+        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
+        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.mlp_bias)
+        self.act_fn = ACT2FN[config.hidden_act]
+        self.gate_multiplier, self.down_multiplier = config.mlp_multipliers
+
+    def forward(self, x):
+        y = self.up_proj(x) * self.act_fn(self.gate_proj(x) * self.gate_multiplier)
+        y = self.down_proj(y) * self.down_multiplier
+        return y
+
+
+@use_kernel_forward_from_hub("RMSNorm")
+class FalconH1RMSNorm(nn.Module):
+    def __init__(self, hidden_size, eps=1e-6):
+        """
+        FalconH1RMSNorm is equivalent to T5LayerNorm
+        """
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+
+    def forward(self, hidden_states):
+        input_dtype = hidden_states.dtype
+        hidden_states = hidden_states.to(torch.float32)
+        variance = hidden_states.pow(2).mean(-1, keepdim=True)
+        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+        return self.weight * hidden_states.to(input_dtype)
+
+    def extra_repr(self):
+        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
+
+
+class FalconH1DecoderLayer(GradientCheckpointingLayer):
+    def __init__(self, config: FalconH1Config, layer_idx: int):
+        super().__init__()
+        self.feed_forward = FalconH1MLP(config)
+
+        head_dim = config.hidden_size // config.num_attention_heads
+        self.channels_attn = config.num_attention_heads * head_dim + 2 * config.num_key_value_heads * head_dim
+
+        self.mamba = FalconH1Mixer(config=config, layer_idx=layer_idx)
+
+        self.self_attn = FalconH1Attention(config, layer_idx)
+
+        self.attention_in_multiplier = config.attention_in_multiplier
+        self.ssm_out_multiplier = config.ssm_out_multiplier
+        self.attn_out_multiplier = config.attention_out_multiplier
+
+        self.input_layernorm = FalconH1RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.pre_ff_layernorm = FalconH1RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        mamba_attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[FalconHybridMambaAttentionDynamicCache] = None,
+        output_attentions: Optional[bool] = False,
+        use_cache: Optional[bool] = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
+        **kwargs,
+    ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
+                `(batch, sequence_length)` where padding elements are indicated by 0.
+            past_key_values (`FalconHybridMambaAttentionDynamicCache`, *optional*): cached past key and value projection states
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            use_cache (`bool`, *optional*):
+                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
+                (see `past_key_values`).
+            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
+                Indices depicting the position of the input sequence tokens in the sequence.
+            position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
+                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
+                with `head_dim` being the embedding dimension of each attention head.
+            kwargs (`dict`, *optional*):
+                Arbitrary kwargs to be ignored, used for FSDP and other methods that injects code
+                into the model
+        """
+
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+
+        mamba_hidden_states = self.mamba(
+            hidden_states=hidden_states,
+            cache_params=past_key_values,
+            cache_position=cache_position,
+            attention_mask=mamba_attention_mask,
+        )
+        mamba_hidden_states = mamba_hidden_states * self.ssm_out_multiplier
+
+        attention_hidden_states, self_attn_weights = self.self_attn(
+            hidden_states=hidden_states * self.attention_in_multiplier,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            output_attentions=output_attentions,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            position_embeddings=position_embeddings,
+            **kwargs,
+        )
+        attention_hidden_states = attention_hidden_states * self.attn_out_multiplier
+
+        hidden_states = mamba_hidden_states + attention_hidden_states
+
+        # residual connection after attention
+        hidden_states = residual + hidden_states
+
+        # feed-forward
+        residual = hidden_states
+        hidden_states = self.pre_ff_layernorm(hidden_states)
+        hidden_states = self.feed_forward(hidden_states)
+        hidden_states = residual + hidden_states
+
+        outputs = (hidden_states,)
+
+        if output_attentions:
+            outputs += (self_attn_weights,)
+
+        return outputs
+
+
+@auto_docstring
+class FalconH1PreTrainedModel(PreTrainedModel):
+    config: FalconH1Config
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["FalconH1DecoderLayer"]
+    _skip_keys_device_placement = "past_key_values"
+    _supports_flash_attn = True
+    _supports_sdpa = True
+    _is_stateful = True
+
+    def _init_weights(self, module):
+        std = self.config.initializer_range
+        for name, param in module.named_parameters(recurse=True):
+            if not param.requires_grad:
+                continue
+            if "layernorm" in name.lower() and "weight" in name:
+                # LayerNorm weights usually initialized to 1
+                param.data.fill_(1.0)
+            elif "bias" in name:
+                param.data.zero_()
+            else:
+                try:
+                    param.data.normal_(mean=0.0, std=std)
+                except Exception as e:
+                    print(f"Skipping init for {name} due to error: {e}")
+
+
+def compute_mup_vector(config):
+    """
+    Computes the MuP vector based on model configuration.
+
+    FalconH1 applies different MuP multiplier for each dimension of the hidden states.
+    The MuP vector is partitioned into chunks, and each chunk is multiplied with its
+    corresponding projected dimension.
+
+    Args:
+        config: FalconH1Config object
+
+    Returns:
+        torch.Tensor: The computed MuP vector
+    """
+    # We'll need some values from the config to compute the vector dimensions
+    intermediate_size = (
+        config.mamba_d_ssm if config.mamba_d_ssm is not None else int(config.mamba_expand * config.hidden_size)
+    )
+    groups_time_state_size = config.mamba_n_groups * config.mamba_d_state
+    num_heads = config.mamba_n_heads
+    zxbcdt_multipliers = config.ssm_multipliers
+
+    vector_shape = 2 * intermediate_size + 2 * groups_time_state_size + num_heads
+    mup_vector = torch.ones(1, 1, vector_shape)
+
+    # Apply multipliers to different sections of the vector
+    mup_vector[:, :, :intermediate_size] *= zxbcdt_multipliers[0]
+    mup_vector[:, :, intermediate_size : 2 * intermediate_size] *= zxbcdt_multipliers[1]
+    mup_vector[:, :, 2 * intermediate_size : 2 * intermediate_size + groups_time_state_size] *= zxbcdt_multipliers[2]
+    mup_vector[
+        :, :, 2 * intermediate_size + groups_time_state_size : 2 * intermediate_size + 2 * groups_time_state_size
+    ] *= zxbcdt_multipliers[3]
+    mup_vector[:, :, 2 * intermediate_size + 2 * groups_time_state_size :] *= zxbcdt_multipliers[4]
+
+    return mup_vector
+
+
+@auto_docstring
+# Adapted from transformers.models.jamba.modeling_jamba.JambaModel
+class FalconH1Model(FalconH1PreTrainedModel):
+    def __init__(self, config: FalconH1Config):
+        super().__init__(config)
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+        decoder_layers = []
+        for i in range(config.num_hidden_layers):
+            decoder_layers.append(FalconH1DecoderLayer(config, layer_idx=i))
+        self.layers = nn.ModuleList(decoder_layers)
+
+        self._attn_implementation = config._attn_implementation
+        self.final_layernorm = FalconH1RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.rotary_emb = FalconH1RotaryEmbedding(config=config)
+
+        self.embedding_multiplier = config.embedding_multiplier
+        self.lm_head_multiplier = config.lm_head_multiplier
+
+        self.gradient_checkpointing = False
+        # Compute the MuP vector once and register it for all layers
+        mup_vector = compute_mup_vector(config)
+        for layer in self.layers:
+            layer.mamba.register_buffer("mup_vector", mup_vector, persistent=False)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[FalconHybridMambaAttentionDynamicCache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,  # NOOP kwargs, for now
+    ) -> Union[tuple, BaseModelOutputWithPast]:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+        if self.gradient_checkpointing and self.training and use_cache:
+            logger.warning_once(
+                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
+            )
+            use_cache = False
+
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids) * self.embedding_multiplier
+        hidden_states = inputs_embeds
+
+        if use_cache and past_key_values is None:
+            logger.warning_once(
+                "FalconH1 requires an initialized `FalconHybridMambaAttentionDynamicCache` to return a cache. None was "
+                "provided, so no cache will be returned."
+            )
+
+        if cache_position is None:
+            cache_position = torch.arange(hidden_states.shape[1], device=hidden_states.device)
+        if position_ids is None:
+            position_ids = cache_position.unsqueeze(0)
+
+        causal_mask = self._update_causal_mask(
+            attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
+        )
+        mamba_mask = self._update_mamba_mask(attention_mask, cache_position)
+
+        # create position embeddings to be shared across the decoder layers
+        position_embeddings = self.rotary_emb(hidden_states, position_ids)
+
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+
+        for decoder_layer in self.layers:
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+
+            layer_outputs = decoder_layer(
+                hidden_states,
+                attention_mask=causal_mask,
+                mamba_attention_mask=mamba_mask,
+                position_ids=position_ids,
+                past_key_values=past_key_values,
+                output_attentions=output_attentions,
+                use_cache=use_cache,
+                cache_position=cache_position,
+                position_embeddings=position_embeddings,
+            )
+
+            hidden_states = layer_outputs[0]
+
+            if output_attentions:
+                if layer_outputs[1] is not None:
+                    # append attentions only of attention layers. Mamba layers return `None` as the attention weights
+                    all_self_attns += (layer_outputs[1],)
+
+        hidden_states = self.final_layernorm(hidden_states)
+
+        # add hidden states from the last decoder layer
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+
+        if past_key_values and not past_key_values.has_previous_state:
+            past_key_values.has_previous_state = True
+
+        next_cache = None if not use_cache else past_key_values
+
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=next_cache,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+        )
+
+    def _update_mamba_mask(self, attention_mask, cache_position):
+        """
+        No need for zeroing states when
+            1. Cached forward
+            2. Attending to all inputs
+        """
+        mamba_mask = attention_mask
+        if cache_position[0] > 0 or (attention_mask is not None and torch.all(attention_mask == 1)):
+            mamba_mask = None
+        return mamba_mask
+
+    def _update_causal_mask(
+        self,
+        attention_mask: torch.Tensor,
+        input_tensor: torch.Tensor,
+        cache_position: torch.Tensor,
+        past_key_values: FalconHybridMambaAttentionDynamicCache,
+        output_attentions: bool,
+    ):  # Returns `None`, `attention_mask` itself, or a 4D causal mask, depending on the attention implementation.
+        if self.config._attn_implementation == "flash_attention_2":
+            if attention_mask is not None and 0.0 in attention_mask:
+                return attention_mask  # at least one entry is 0: hand the mask through unchanged
+            return None  # no mask given, or the mask has no 0 entry
+
+        # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
+        # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
+        # to infer the attention mask.
+        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+
+        # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
+        if self.config._attn_implementation == "sdpa" and not output_attentions:
+            if AttentionMaskConverter._ignore_causal_mask_sdpa(
+                attention_mask,
+                inputs_embeds=input_tensor,
+                past_key_values_length=past_seen_tokens,
+                is_training=self.training,
+            ):
+                return None  # the helper returned a truthy value: no mask is built
+
+        dtype = input_tensor.dtype
+        sequence_length = input_tensor.shape[1]
+        target_length = (
+            attention_mask.shape[-1]
+            if isinstance(attention_mask, torch.Tensor)
+            else past_seen_tokens + sequence_length + 1  # no tensor mask: derive the length from cache and input sizes
+        )
+
+        # In case the provided `attention_mask` is 2D, we generate a causal mask here (4D).
+        causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
+            attention_mask,
+            sequence_length=sequence_length,
+            target_length=target_length,
+            dtype=dtype,
+            cache_position=cache_position,
+            batch_size=input_tensor.shape[0],
+        )
+
+        if (
+            self.config._attn_implementation == "sdpa"
+            and attention_mask is not None
+            and attention_mask.device.type in ["cuda", "xpu", "npu"]
+            and not output_attentions
+        ):
+            # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
+            # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
+            # Details: https://github.com/pytorch/pytorch/issues/110213
+            min_dtype = torch.finfo(dtype).min
+            causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)
+
+        return causal_mask
+
+    @staticmethod
+    def _prepare_4d_causal_attention_mask_with_cache_position(
+        attention_mask: torch.Tensor,
+        sequence_length: int,
+        target_length: int,
+        dtype: torch.dtype,
+        cache_position: torch.Tensor,
+        batch_size: int,
+        **kwargs,
+    ):
+        """
+        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
+        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.
+
+        Args:
+            attention_mask (`torch.Tensor`):
+                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
+                `(batch_size, 1, query_length, key_value_length)`.
+            sequence_length (`int`):
+                The sequence length being processed.
+            target_length (`int`):
+                The target length: when generating with static cache, the mask should be as long as the static cache,
+                to account for the 0 padding, the part of the cache that is not filled yet.
+            dtype (`torch.dtype`):
+                The dtype to use for the 4D attention mask.
+            cache_position (`torch.Tensor`):
+                Indices depicting the position of the input sequence tokens in the sequence.
+            batch_size (`int`):
+                Batch size.
+        """
+        if attention_mask is not None and attention_mask.dim() == 4:
+            # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
+            causal_mask = attention_mask
+        else:
+            min_dtype = torch.finfo(dtype).min
+            causal_mask = torch.full(
+                (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device
+            )
+            if sequence_length != 1:
+                causal_mask = torch.triu(causal_mask, diagonal=1)  # keep `min_dtype` strictly above the diagonal only
+            causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1)
+            causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)  # (batch, 1, query, key) view
+            if attention_mask is not None:
+                causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
+                mask_length = attention_mask.shape[-1]
+                padding_attention_mask = (attention_mask[:, None, None, :] == attention_mask[:, None, :, None])[
+                    :, :, -sequence_length:, :
+                ].to(dtype)  # 1 where the two positions carry the same mask value, 0 where they differ
+                padding_mask = causal_mask[:, :, :, :mask_length] + padding_attention_mask
+                padding_mask = padding_mask == 0  # causally visible (0) but the two mask values differ
+                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
+                    padding_mask, min_dtype
+                )
+        # In the generated branch the mask is additive: 0 where attention is allowed, `min_dtype` where it is blocked.
+        return causal_mask
+
+
+@auto_docstring
+class FalconH1ForCausalLM(FalconH1PreTrainedModel, GenerationMixin):
+    _tied_weights_keys = ["lm_head.weight"]
+    _tp_plan = {"lm_head": "colwise_rep"}
+    _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.model = FalconH1Model(config)
+        self.vocab_size = config.vocab_size
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)  # hidden states -> vocab logits
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[FalconHybridMambaAttentionDynamicCache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
+        **kwargs,
+    ) -> Union[tuple, CausalLMOutputWithPast]:
+        r"""
+        Example:
+
+        ```python
+        >>> from transformers import AutoTokenizer, FalconH1ForCausalLM
+
+        >>> model = FalconH1ForCausalLM.from_pretrained("...")
+        >>> tokenizer = AutoTokenizer.from_pretrained("...")
+
+        >>> prompt = "Hey, are you conscious? Can you talk to me?"
+        >>> inputs = tokenizer(prompt, return_tensors="pt")
+
+        >>> # Generate
+        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
+        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
+        ```"""
+        # Explicit arguments win; `None` falls back to the value stored on the config.
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+
+        # decoder outputs consist of (dec_features, layer_state, dec_hidden, dec_attn)
+        outputs = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            cache_position=cache_position,
+            **kwargs,
+        )
+
+        hidden_states = outputs[0]
+        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
+        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+        logits = self.lm_head(hidden_states[:, slice_indices, :]) * self.model.lm_head_multiplier
+        # An int `logits_to_keep` of 0 gives `slice(0, None)`, i.e. every position is kept.
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs)
+
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+    def prepare_inputs_for_generation(
+        self,
+        input_ids,
+        past_key_values=None,
+        attention_mask=None,
+        inputs_embeds=None,
+        cache_position=None,
+        position_ids=None,
+        use_cache=True,
+        **kwargs,
+    ):
+        # Overwritten -- has a unique cache type, `FalconHybridMambaAttentionDynamicCache`
+
+        empty_past_kv = past_key_values is None  # no cache was passed in: one is created in the `else` branch below
+
+        # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens
+        # Exception 1: when passing input_embeds, input_ids may be missing entries
+        # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here
+        # Exception 3: with synced GPUs cache_position may go out of bounds, but we only want dummy token in that case.
+        #              (we can't check exception 3 while compiling)
+        if not empty_past_kv:
+            if (
+                inputs_embeds is not None  # Exception 1
+                or (is_torchdynamo_compiling() or cache_position[-1] >= input_ids.shape[1])  # Exception 3
+            ):
+                input_ids = input_ids[:, -cache_position.shape[0] :]  # keep the last `len(cache_position)` tokens
+            elif input_ids.shape[1] != cache_position.shape[0]:  # Default case (the "else", a no op, is Exception 2)
+                input_ids = input_ids[:, cache_position]  # select exactly the positions listed in `cache_position`
+        else:
+            past_key_values = FalconHybridMambaAttentionDynamicCache(
+                self.config,
+                input_ids.shape[0],
+                self.dtype,
+                devices=[  # one device per layer, read from that layer's mamba conv1d weight
+                    self.model.layers[i].mamba.conv1d.weight.device for i in range(self.config.num_hidden_layers)
+                ],
+            )
+
+        if attention_mask is not None and position_ids is None:
+            # create position_ids on the fly for batch generation
+            position_ids = attention_mask.long().cumsum(-1) - 1
+            position_ids.masked_fill_(attention_mask == 0, 1)  # masked-out positions get position id 1
+            if not empty_past_kv:
+                position_ids = position_ids[:, -input_ids.shape[1] :]  # keep ids for the current input tokens only
+
+        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
+        if inputs_embeds is not None and empty_past_kv:
+            model_inputs = {"inputs_embeds": inputs_embeds}
+        else:
+            model_inputs = {"input_ids": input_ids.contiguous()}  # `contiguous()` needed for compilation use cases
+
+        model_inputs.update(
+            {
+                "position_ids": position_ids,
+                "past_key_values": past_key_values,
+                "use_cache": use_cache,
+                "attention_mask": attention_mask,
+                "logits_to_keep": self.config.num_logits_to_keep,
+                "cache_position": cache_position,
+            }
+        )
+        return model_inputs
+
+
+__all__ = ["FalconH1Model", "FalconH1ForCausalLM", "FalconH1PreTrainedModel"]  # names exported by `import *`
diff --git a/phivenv/Lib/site-packages/transformers/models/falcon_h1/modular_falcon_h1.py b/phivenv/Lib/site-packages/transformers/models/falcon_h1/modular_falcon_h1.py
new file mode 100644
index 0000000000000000000000000000000000000000..1ff8288f9c4f54da573f63af1a15c4beea470567
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/falcon_h1/modular_falcon_h1.py
@@ -0,0 +1,1380 @@
+# coding=utf-8
+# Copyright 2025 Technology Innovation Institute and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch FalconH1 model."""
+
+from typing import Any, Callable, Optional, Union
+
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint
+from torch import nn
+
+from transformers.activations import ACT2FN
+from transformers.models.jamba.modeling_jamba import HybridMambaAttentionDynamicCache
+from transformers.models.llama.modeling_llama import (
+    LlamaAttention,
+    LlamaForCausalLM,
+    LlamaMLP,
+    LlamaRMSNorm,
+    LlamaRotaryEmbedding,
+    apply_rotary_pos_emb,
+    eager_attention_forward,
+)
+from transformers.models.mamba2.modeling_mamba2 import (
+    MambaRMSNormGated,
+    pad_tensor_by_size,
+    reshape_into_chunks,
+    segment_sum,
+)
+
+from ...cache_utils import Cache
+from ...modeling_attn_mask_utils import AttentionMaskConverter
+from ...modeling_flash_attention_utils import FlashAttentionKwargs
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from ...processing_utils import Unpack
+from ...utils import auto_docstring, can_return_tuple, is_torchdynamo_compiling, logging
+from ...utils.deprecation import deprecate_kwarg
+from ...utils.import_utils import is_causal_conv1d_available, is_mamba_2_ssm_available
+from .configuration_falcon_h1 import FalconH1Config
+
+
+if is_mamba_2_ssm_available():
+    from mamba_ssm.ops.triton.selective_state_update import selective_state_update
+    from mamba_ssm.ops.triton.ssd_combined import mamba_chunk_scan_combined, mamba_split_conv1d_scan_combined
+else:  # bind every optional kernel name so a missing package never surfaces later as a NameError
+    selective_state_update, mamba_chunk_scan_combined, mamba_split_conv1d_scan_combined = None, None, None
+
+if is_causal_conv1d_available():
+    from causal_conv1d import causal_conv1d_fn, causal_conv1d_update
+else:
+    causal_conv1d_update, causal_conv1d_fn = None, None
+
+is_fast_path_available = all((selective_state_update, causal_conv1d_fn, causal_conv1d_update))  # any None disables it
+
+
+logger = logging.get_logger(__name__)
+
+
+class FalconHybridMambaAttentionDynamicCache(HybridMambaAttentionDynamicCache):
+    """
+    A dynamic cache that can handle both the attention cache (which has a seq_len dimension) and the mamba cache
+    (which has a constant shape regardless of seq_len).
+
+    Attention states live in the lists `key_cache` and `value_cache`. Both start empty, are filled by `update()`
+    (one entry per layer index) and grow along dim `-2` on every later call for the same layer.
+    Mamba states live in the dicts `conv_states` and `ssm_states`, keyed by layer index and pre-allocated with zeros
+    for each of the `config.num_hidden_layers` layers:
+    `conv_states[i]` has shape `(batch_size, intermediate_size + 2 * mamba_n_groups * mamba_d_state, mamba_d_conv)`,
+    `ssm_states[i]` has shape `(batch_size, mamba_n_heads, mamba_d_head, mamba_d_state)`.
+    `devices` optionally gives one device per layer; when it is `None` the default torch device is used.
+    """
+
+    def __init__(
+        self,
+        config: FalconH1Config,
+        batch_size: int,
+        dtype: torch.dtype = torch.float16,
+        devices: Optional[list[str]] = None,
+    ):
+        self.seqlen_offset = 0
+        self.dtype = dtype
+        self.has_previous_state = False
+        self.conv_kernel_size = config.mamba_d_conv
+
+        self.intermediate_size = (
+            config.mamba_d_ssm if config.mamba_d_ssm is not None else int(config.mamba_expand * config.hidden_size)
+        )
+
+        if devices is None:  # indexing the `None` default would raise; `device=None` selects torch's default device
+            devices = [None] * config.num_hidden_layers
+        self.conv_states = {
+            i: torch.zeros(
+                batch_size,
+                self.intermediate_size + 2 * config.mamba_n_groups * config.mamba_d_state,
+                self.conv_kernel_size,
+                device=devices[i],
+                dtype=dtype,
+            )
+            for i in range(config.num_hidden_layers)
+        }
+        self.ssm_states = {
+            i: torch.zeros(
+                batch_size,
+                config.mamba_n_heads,
+                config.mamba_d_head,
+                config.mamba_d_state,
+                device=devices[i],
+                dtype=dtype,
+            )
+            for i in range(config.num_hidden_layers)
+        }
+
+        self.transformer_layers = list(range(config.num_hidden_layers))
+
+        self.key_cache: list[torch.Tensor] = []
+        self.value_cache: list[torch.Tensor] = []
+
+    def update(
+        self,
+        key_states: torch.Tensor,
+        value_states: torch.Tensor,
+        layer_idx: int,
+        cache_kwargs: Optional[dict[str, Any]] = None,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """
+        Updates the cache with the new `key_states` and `value_states` for the layer `layer_idx`.
+
+        Parameters:
+            key_states (`torch.Tensor`):
+                The new key states to cache.
+            value_states (`torch.Tensor`):
+                The new value states to cache.
+            layer_idx (`int`):
+                The index of the layer to cache the states for.
+            cache_kwargs (`dict[str, Any]`, `optional`):
+                Additional arguments for the cache subclass. No additional arguments are used in `DynamicCache`.
+
+        Return:
+            A tuple containing the updated key and value states.
+        """
+        # Update the cache
+        if len(self.key_cache) <= layer_idx:
+            # There may be skipped layers, fill them with empty lists
+            for _ in range(len(self.key_cache), layer_idx):
+                self.key_cache.append([])
+                self.value_cache.append([])
+            self.key_cache.append(key_states)
+            self.value_cache.append(value_states)
+        elif len(self.key_cache[layer_idx]) == 0:  # fills previously skipped layers; checking for tensor causes errors
+            self.key_cache[layer_idx] = key_states
+            self.value_cache[layer_idx] = value_states
+        else:
+            self.key_cache[layer_idx] = torch.cat([self.key_cache[layer_idx], key_states], dim=-2)
+            self.value_cache[layer_idx] = torch.cat([self.value_cache[layer_idx], value_states], dim=-2)
+
+        return self.key_cache[layer_idx], self.value_cache[layer_idx]
+
+    def update_conv_state(
+        self,
+        layer_idx: int,
+        new_conv_state: torch.Tensor,
+        cache_position: torch.LongTensor,
+    ) -> torch.Tensor:
+        """Write `new_conv_state` into the conv state of layer `layer_idx` in place and return that state tensor."""
+        # `roll` returns a new tensor, so the cached tensor itself is only modified by the in-place ops further down.
+        conv_state = self.conv_states[layer_idx].roll(shifts=-1, dims=-1)
+        if len(cache_position) > 1:  # several positions at once: take the whole new window
+            conv_state[:, :, :] = new_conv_state.to(conv_state.device)
+        else:  # single position: only the last column is replaced after the shift
+            conv_state[:, :, -1] = new_conv_state[:, :, -1].to(conv_state.device)
+        self.conv_states[layer_idx].zero_()
+        self.conv_states[layer_idx] += conv_state
+        return self.conv_states[layer_idx]
+
+    def reset(self):
+        """Zero every mamba state tensor in place (`conv_states` and `ssm_states` are dicts of per-layer tensors)."""
+        for state in (*self.conv_states.values(), *self.ssm_states.values()):
+            state.zero_()
+
+
+class FalconH1RotaryEmbedding(LlamaRotaryEmbedding):
+    pass  # no overrides: everything is inherited from LlamaRotaryEmbedding
+
+
+class FalconH1Attention(LlamaAttention):
+    def __init__(self, config: FalconH1Config, layer_idx: int):
+        super().__init__(config, layer_idx)
+        self.key_multiplier = config.key_multiplier
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
+        attention_mask: Optional[torch.Tensor],
+        past_key_values: Optional[Cache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        """Attend over `hidden_states` (keys scaled by `key_multiplier`) and return `(attn_output, attn_weights)`."""
+        leading_shape = hidden_states.shape[:-1]
+        per_head_shape = (*leading_shape, -1, self.head_dim)
+
+        queries = self.q_proj(hidden_states).view(per_head_shape).transpose(1, 2)
+        keys = self.k_proj(hidden_states).view(per_head_shape).transpose(1, 2) * self.key_multiplier
+        values = self.v_proj(hidden_states).view(per_head_shape).transpose(1, 2)
+        cos, sin = position_embeddings
+        queries, keys = apply_rotary_pos_emb(queries, keys, cos, sin)
+
+        if past_key_values is not None:
+            # sin and cos are specific to RoPE models; cache_position needed for the static cache
+            rope_cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+            keys, values = past_key_values.update(keys, values, self.layer_idx, rope_cache_kwargs)
+
+        # "eager" maps to the local eager function; every other implementation is looked up in the registry
+        if self.config._attn_implementation == "eager":
+            attention_interface: Callable = eager_attention_forward
+        else:
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+        attn_output, attn_weights = attention_interface(
+            self,
+            queries,
+            keys,
+            values,
+            attention_mask,
+            dropout=self.attention_dropout if self.training else 0.0,
+            scaling=self.scaling,
+            **kwargs,
+        )
+
+        attn_output = attn_output.reshape(*leading_shape, -1).contiguous()
+        return self.o_proj(attn_output), attn_weights
+
+
+class FalconH1RMSNormGated(MambaRMSNormGated):
+    """Grouped RMS norm with an optional SiLU gate applied after the norm (`norm_before_gate=True`) or before it."""
+
+    def __init__(self, hidden_size, eps=1e-6, n_groups=1, norm_before_gate=True):
+        super().__init__(hidden_size=hidden_size, eps=eps)
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+        self.n_groups = n_groups
+        self.norm_before_gate = norm_before_gate
+
+    def forward(self, hidden_states, gate=None):
+        """
+        Normalize `hidden_states` of shape `(batch, dim)` or `(batch, seq_len, dim)` over `n_groups` equal slices of
+        `dim` and return a tensor of the input rank and dtype. `gate`, when given, is applied as `silu(gate)`.
+        """
+        input_dtype = hidden_states.dtype
+        if not self.norm_before_gate and gate is not None:
+            hidden_states = hidden_states * F.silu(gate.to(torch.float32))
+
+        # Only a 2D input gets its temporary sequence axis removed again. Squeezing on `seq_len == 1` instead would
+        # turn `(batch, 1, dim)` into `(batch, dim)`, which mis-broadcasts against a 3D `gate` when batch > 1.
+        squeeze_seq_dim = hidden_states.dim() == 2
+        if squeeze_seq_dim:
+            hidden_states = hidden_states.unsqueeze(1)
+        batch_size, seq_len, dim = hidden_states.shape
+        group_dim = dim // self.n_groups
+        hidden_states = hidden_states.to(torch.float32).view(batch_size, seq_len, self.n_groups, group_dim)
+        variance = hidden_states.pow(2).mean(-1, keepdim=True)
+        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+        hidden_states = (self.weight.view(self.n_groups, group_dim) * hidden_states).view(batch_size, seq_len, dim)
+        if squeeze_seq_dim:
+            hidden_states = hidden_states.squeeze(1)
+        if self.norm_before_gate and gate is not None:
+            hidden_states = hidden_states * F.silu(gate.to(torch.float32))
+        return hidden_states.to(input_dtype)
+
+
+def apply_mask_to_padding_states(hidden_states, attention_mask):
+    """
+    Multiply the hidden states by the attention mask so that padding tokens are tuned out, see
+    https://github.com/state-spaces/mamba/issues/66. Without a mask, or with a single-row or single-column mask,
+    the input is returned untouched.
+    """
+    if attention_mask is None or attention_mask.shape[1] <= 1 or attention_mask.shape[0] <= 1:
+        return hidden_states
+    return (hidden_states * attention_mask[:, :, None]).to(hidden_states.dtype)
+
+
+# Adapted from transformers.models.mamba2.modeling_mamba2.Mamba2Mixer
+class FalconH1Mixer(nn.Module):
+    """
+    FalconH1Mixer is identical to classic Mamba2 mixer classes but differs on two different things
+    - Users can pass custom intermediate_size through `config.mamba_d_ssm`
+    - The use of gated RMS normalization layer is optional
+    """
+
+    def __init__(self, config: FalconH1Config, layer_idx: int):
+        super().__init__()
+        self.num_heads = config.mamba_n_heads
+        self.hidden_size = config.hidden_size
+        self.ssm_state_size = config.mamba_d_state
+        self.conv_kernel_size = config.mamba_d_conv
+        self.intermediate_size = (
+            int(config.mamba_expand * self.hidden_size) if config.mamba_d_ssm is None else config.mamba_d_ssm
+        )  # an explicit `config.mamba_d_ssm` wins over `mamba_expand * hidden_size`
+        self.layer_idx = layer_idx
+        self.use_conv_bias = config.mamba_conv_bias
+        self.activation = config.hidden_act
+        self.act = ACT2FN[config.hidden_act]
+        self.use_bias = config.mamba_proj_bias
+
+        self.layer_norm_epsilon = config.rms_norm_eps
+        self.groups_time_state_size = config.mamba_n_groups * self.ssm_state_size
+
+        self.n_groups = config.mamba_n_groups
+        self.head_dim = config.mamba_d_head
+        self.chunk_size = config.mamba_chunk_size
+
+        # FIXME: the time-step settings below are hard-coded here instead of being read from the config
+        self.time_step_limit = (0.0, float("inf"))
+        self.time_step_min = 0.001
+        self.time_step_max = 0.1
+
+        self.conv_dim = self.intermediate_size + 2 * self.n_groups * self.ssm_state_size
+        self.conv1d = nn.Conv1d(
+            in_channels=self.conv_dim,
+            out_channels=self.conv_dim,
+            bias=config.mamba_conv_bias,
+            kernel_size=self.conv_kernel_size,
+            groups=self.conv_dim,  # groups == channels: every channel is convolved on its own
+            padding=self.conv_kernel_size - 1,
+        )
+
+        # projection of the input hidden states
+        projection_size = self.intermediate_size + self.conv_dim + self.num_heads
+        self.in_proj = nn.Linear(
+            self.hidden_size,
+            projection_size,
+            bias=self.use_bias,
+        )
+        # `in_proj` is the selective projection used to make dt, B and C input dependent
+
+        # time step projection (discretization)
+        # instantiate once and copy inv_dt in init_weights of PretrainedModel
+        self.dt_bias = nn.Parameter(torch.ones(self.num_heads))
+
+        # S4D real initialization. These are not discretized!
+        # The core is to load them, compute the discrete states, then write the updated state. Keeps the memory bounded
+        A = torch.arange(1, self.num_heads + 1)
+        self.A_log = nn.Parameter(torch.log(A))  # starts as log(1..num_heads), one value per head
+        self.A_log._no_weight_decay = True
+        self.mamba_rms_norm = config.mamba_rms_norm
+
+        if self.mamba_rms_norm:  # `self.norm` only exists when the gated RMS norm is enabled in the config
+            self.norm = FalconH1RMSNormGated(
+                self.intermediate_size,
+                eps=self.layer_norm_epsilon,
+                n_groups=self.n_groups,
+                norm_before_gate=config.mamba_norm_before_gate,
+            )
+        self.D = nn.Parameter(torch.ones(self.num_heads))
+        self.D._no_weight_decay = True
+
+        self.out_proj = nn.Linear(self.intermediate_size, config.hidden_size, bias=config.projectors_bias)
+
+        if not is_fast_path_available:
+            logger.warning_once(
+                "The fast path is not available because on of `(selective_state_update, causal_conv1d_fn, causal_conv1d_update)`"
+                " is None. Falling back to the naive implementation. To install follow https://github.com/state-spaces/mamba/#installation and"
+                " https://github.com/Dao-AILab/causal-conv1d"
+            )
+        else:
+            logger.warning_once("The fast path for FalconH1 will be used when running the model on a GPU")
+
+        self.zxbcdt_multipliers = config.ssm_multipliers
+        self.ssm_in_multiplier = config.ssm_in_multiplier  # scales the hidden states before `in_proj` in the forward
+
+    def cuda_kernels_forward(
+        self,
+        hidden_states: torch.Tensor,
+        cache_params: Optional[FalconHybridMambaAttentionDynamicCache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+    ):
+        # 1. Gated MLP's linear projection
+        hidden_states = apply_mask_to_padding_states(hidden_states, attention_mask)
+        # Add Multipliers
+        hidden_states = hidden_states * self.ssm_in_multiplier
+        projected_states = self.in_proj(hidden_states)
+        projected_states = projected_states * self.mup_vector  # ADD Mup Multipliers
+        d_to_remove = 2 * self.intermediate_size + 2 * self.n_groups * self.ssm_state_size + self.num_heads
+
+        # Set up dimensions for reshapes later
+        batch_size, seq_len, _ = hidden_states.shape
+        groups_time_state_size = self.n_groups * self.ssm_state_size
+
+        use_precomputed_states = (
+            cache_params is not None
+            and cache_params.has_previous_state
+            and seq_len == 1
+            and cache_params.conv_states[self.layer_idx].shape[0]
+            == cache_params.ssm_states[self.layer_idx].shape[0]
+            == batch_size
+            and cache_position is not None
+            and cache_position[0] > 0
+        )
+
+        # getting projected states from cache if it exists
+        if use_precomputed_states:
+            d_mlp = (projected_states.squeeze(1).shape[-1] - d_to_remove) // 2
+
+            z0, x0, gate, hidden_states_B_C, dt = projected_states.squeeze(1).split(
+                [d_mlp, d_mlp, self.intermediate_size, self.conv_dim, self.num_heads], dim=-1
+            )
+
+            # 2. Convolution sequence transformation
+            hidden_states_B_C = causal_conv1d_update(
+                hidden_states_B_C,
+                cache_params.conv_states[self.layer_idx],
+                self.conv1d.weight.squeeze(1),
+                self.conv1d.bias,
+                self.activation,
+            )
+
+            hidden_states, B, C = torch.split(
+                hidden_states_B_C,
+                [self.intermediate_size, groups_time_state_size, groups_time_state_size],
+                dim=-1,
+            )
+
+            # 3. SSM transformation
+            A = -torch.exp(self.A_log.float())  # (nheads,)
+            A = A[:, None, ...][:, :, None].expand(-1, self.head_dim, self.ssm_state_size).to(dtype=torch.float32)
+            dt = dt[:, :, None].expand(-1, -1, self.head_dim)
+            dt_bias = self.dt_bias[:, None, ...].expand(-1, self.head_dim)
+            D = self.D[:, None, ...].expand(-1, self.head_dim)
+            B = B.view(batch_size, self.n_groups, B.shape[1] // self.n_groups)
+            C = C.view(batch_size, self.n_groups, C.shape[1] // self.n_groups)
+            hidden_states_reshaped = hidden_states.view(batch_size, self.num_heads, self.head_dim)
+            hidden_states = selective_state_update(
+                cache_params.ssm_states[self.layer_idx],
+                hidden_states_reshaped,
+                dt,
+                A,
+                B,
+                C,
+                D,
+                z=gate.view(batch_size, self.num_heads, self.head_dim) if not self.mamba_rms_norm else None,
+                dt_bias=dt_bias,
+                dt_softplus=True,
+            )
+            hidden_states = hidden_states.view(batch_size, self.num_heads * self.head_dim)
+
+            if self.mamba_rms_norm:
+                hidden_states = self.norm(hidden_states, gate)
+
+            if d_mlp > 0:
+                hidden_states = torch.cat([F.silu(z0) * x0, hidden_states], dim=-1)
+
+            # 4. Final linear projection
+            out = self.out_proj(hidden_states[:, None, ...])
+        # Fused calculations or step by step if no initialized cache is found
+        else:
+            A = -torch.exp(self.A_log.float())  # (num_heads) or (intermediate_size, state_size)
+            dt_limit_kwargs = {} if self.time_step_limit == (0.0, float("inf")) else {"dt_limit": self.time_step_limit}
+
+            # 2-4. Fused kernel for conv1d, SSM, and the final projection
+            if self.training and cache_params is None:
+                out = mamba_split_conv1d_scan_combined(
+                    projected_states,
+                    self.conv1d.weight.squeeze(1),
+                    self.conv1d.bias,
+                    self.dt_bias,
+                    A,
+                    D=self.D,
+                    chunk_size=self.chunk_size,
+                    seq_idx=None,  # was seq_idx
+                    activation=self.activation,
+                    rmsnorm_weight=self.norm.weight if self.mamba_rms_norm else None,
+                    rmsnorm_eps=self.norm.variance_epsilon if self.mamba_rms_norm else None,
+                    outproj_weight=self.out_proj.weight,
+                    outproj_bias=self.out_proj.bias,
+                    headdim=self.head_dim,
+                    ngroups=self.n_groups,
+                    norm_before_gate=False,
+                    return_final_states=False,
+                    **dt_limit_kwargs,
+                )
+
+            else:
+                d_mlp = (
+                    projected_states.shape[-1]
+                    - 2 * self.intermediate_size
+                    - 2 * self.n_groups * self.ssm_state_size
+                    - self.num_heads
+                ) // 2
+                if attention_mask is not None:
+                    projected_states = projected_states * attention_mask[..., None]
+                _, gate, hidden_states_B_C, dt = projected_states.split(
+                    [
+                        2 * d_mlp,
+                        self.intermediate_size,
+                        self.conv_dim,
+                        self.num_heads,
+                    ],
+                    dim=-1,
+                )
+
+                if cache_params is not None:
+                    conv_states = F.pad(
+                        hidden_states_B_C.permute(0, 2, 1),
+                        (self.conv_kernel_size - hidden_states_B_C.shape[-2], 0),
+                    )
+                    cache_params.update_conv_state(self.layer_idx, conv_states, cache_position)
+
+                time_step = nn.functional.softplus(dt + self.dt_bias)
+                # 1D Convolution
+                if causal_conv1d_fn is None or self.activation not in ["silu", "swish"]:
+                    hidden_states_B_C = self.act(
+                        self.conv1d(hidden_states_B_C.transpose(1, 2)).transpose(1, 2)[:, :seq_len]
+                    )  # (B, L, self.d_inner + 2 * ngroups * d_state)
+                else:
+                    hidden_states_B_C = causal_conv1d_fn(
+                        x=hidden_states_B_C.transpose(1, 2),
+                        weight=self.conv1d.weight.squeeze(1),
+                        bias=self.conv1d.bias,
+                        activation=self.activation,
+                    ).transpose(1, 2)[:, :seq_len]
+
+                hidden_states, B, C = torch.split(
+                    hidden_states_B_C,
+                    [
+                        self.intermediate_size,
+                        groups_time_state_size,
+                        groups_time_state_size,
+                    ],
+                    dim=-1,
+                )
+
+                if attention_mask is not None and attention_mask.shape[1] > 1 and attention_mask.shape[0] > 1:
+                    # tune out hidden states for pad tokens, see https://github.com/state-spaces/mamba/issues/66
+                    dtype = hidden_states.dtype
+                    hidden_states = (hidden_states * attention_mask[:, :, None]).to(dtype)
+                # This is a hack to make sure multi-GPU inference works with HF accelerate
+                # see: https://github.com/Dao-AILab/flash-attention/issues/523 for more details
+                with torch.cuda.device(hidden_states.device):
+                    scan_output, ssm_state = mamba_chunk_scan_combined(
+                        hidden_states.view(batch_size, seq_len, -1, self.head_dim),
+                        time_step,
+                        A,
+                        B.view(batch_size, seq_len, self.n_groups, -1),
+                        C.view(batch_size, seq_len, self.n_groups, -1),
+                        chunk_size=self.chunk_size,
+                        D=self.D,
+                        z=None,
+                        seq_idx=None,
+                        return_final_states=True,
+                        **dt_limit_kwargs,
+                    )
+                if ssm_state is not None and cache_params is not None:
+                    cache_params.ssm_states[self.layer_idx].copy_(ssm_state)
+                scan_output = scan_output.view(batch_size, seq_len, -1)
+                # Multiply "gate" branch and apply extra normalization layer
+                if self.mamba_rms_norm:
+                    out = self.norm(scan_output, gate)
+                else:
+                    out = scan_output * torch.nn.functional.silu(gate)
+                out = self.out_proj(out)
+        return out
+
+    # fmt: off
+    def torch_forward(
+        self,
+        input_states,
+        cache_params: Optional[FalconHybridMambaAttentionDynamicCache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+    ):
+        """
+        Forward pass of the mixer written with plain PyTorch ops (no fused kernels).
+
+        The input is scaled by `ssm_in_multiplier`, projected with `in_proj`, scaled by `mup_vector` and split
+        into `gate`, `hidden_states_B_C` and `dt`. Two paths follow:
+
+        - Cached single-token step (`use_precomputed_states`): the cached convolution window is rolled and
+          updated, and the cached SSM state is advanced by one step with `copy_`.
+        - Otherwise: a chunked scan over the whole sequence; when `cache_params` is given, the convolution
+          window and the final SSM state are written into the cache.
+
+        The scan output is then gated (through `self.norm` when `mamba_rms_norm` is set, otherwise by
+        `silu(gate)`), cast back to the input dtype and passed through `out_proj`.
+
+        Args:
+            input_states: 3-D tensor unpacked as `(batch_size, seq_len, _)`.
+            cache_params: optional cache holding per-layer `conv_states` and `ssm_states`; mutated when given.
+            cache_position: optional positions tensor; only `cache_position[0] > 0` is inspected here.
+            attention_mask: optional mask forwarded to `apply_mask_to_padding_states`.
+
+        Returns:
+            The output of `out_proj` for the gated scan result.
+        """
+        batch_size, seq_len, _ = input_states.shape
+        dtype = input_states.dtype
+
+        # 1. Gated MLP's linear projection
+        input_states = apply_mask_to_padding_states(input_states, attention_mask)
+        # Add Multipliers
+        input_states = input_states * self.ssm_in_multiplier
+        projected_states = self.in_proj(input_states)
+        projected_states = projected_states * self.mup_vector  # ADD Mup Multipliers
+        gate, hidden_states_B_C, dt = projected_states.split([
+                self.intermediate_size, self.conv_dim, self.num_heads
+            ], dim=-1)
+
+        # The recurrent single-step path is only valid when a populated cache exists, exactly one new token is
+        # fed, the cached conv/SSM states match the current batch size, and we are past the first position.
+        use_precomputed_states = (
+            cache_params is not None
+            and cache_params.has_previous_state
+            and seq_len == 1
+            and cache_params.conv_states[self.layer_idx].shape[0]
+            == cache_params.ssm_states[self.layer_idx].shape[0]
+            == batch_size
+            and cache_position is not None
+            and cache_position[0] > 0
+        )
+
+        # 2. Convolution sequence transformation
+        if use_precomputed_states:
+            # Shift the cached window one step to the left and write the new token into the last slot
+            cache_params.conv_states[self.layer_idx] = cache_params.conv_states[self.layer_idx].roll(shifts=-1, dims=-1)
+            cache_params.conv_states[self.layer_idx][:, :, -1] = hidden_states_B_C[:, 0, :].to(cache_params.conv_states[self.layer_idx].device)
+
+            # We need to guarantee that anything regarding the cache is on the same device
+            conv_states = cache_params.conv_states[self.layer_idx].to(device=self.conv1d.weight.device)
+
+            # Convolution output for the newest position: window times kernel weights, summed over the last axis
+            hidden_states_B_C = torch.sum(
+                conv_states * self.conv1d.weight.squeeze(1), dim=-1
+            )
+            if self.use_conv_bias:
+                hidden_states_B_C = hidden_states_B_C + self.conv1d.bias
+            hidden_states_B_C = self.act(hidden_states_B_C)
+        else:
+            # Init cache
+            if cache_params is not None:
+                hidden_states_B_C_transposed = hidden_states_B_C.transpose(1, 2)
+                # Left-pad (or crop, when the pad amount is negative) the last axis to `conv_kernel_size` entries
+                conv_states = nn.functional.pad(
+                    hidden_states_B_C_transposed, (self.conv_kernel_size - hidden_states_B_C_transposed.shape[-1], 0)
+                )
+                cache_params.conv_states[self.layer_idx].copy_(conv_states)
+
+            # Keep only the first `seq_len` outputs of the convolution
+            hidden_states_B_C = self.act(self.conv1d(hidden_states_B_C.transpose(1, 2))[..., :seq_len].transpose(1, 2))
+
+        hidden_states_B_C = apply_mask_to_padding_states(hidden_states_B_C, attention_mask)
+        hidden_states, B, C = torch.split(
+            hidden_states_B_C,
+            [self.intermediate_size, self.n_groups * self.ssm_state_size, self.n_groups * self.ssm_state_size],
+            dim=-1
+        )
+
+        # 3. SSM transformation
+        A = -torch.exp(self.A_log.float())                            # [num_heads]
+        if use_precomputed_states:
+            # We need to guarantee that anything regarding the cache is on the same device
+            cache_device = cache_params.ssm_states[self.layer_idx].device
+
+            # Note: there is no need to pad parameter matrices here, as there is just one new token
+            # for batched generation
+            dt = dt[:, 0, :][:, None, ...]
+            dt = dt.transpose(1, 2).expand(batch_size, dt.shape[-1], self.head_dim)
+            # [num_heads] -> [num_heads, head_dim]
+            dt_bias = self.dt_bias[..., None].expand(self.dt_bias.shape[0], self.head_dim)
+
+            dt = torch.nn.functional.softplus(dt + dt_bias.to(dt.dtype))
+            dt = torch.clamp(dt, self.time_step_limit[0], self.time_step_limit[1])
+            A = A[..., None, None].expand(self.num_heads, self.head_dim, self.ssm_state_size).to(dtype=torch.float32)
+            # [bsz, num_heads, head_dim, state_size]
+            dA = (torch.exp(dt[..., None] * A)).to(device=cache_device)
+
+            # Discretize B
+            # [bsz, n_groups * state_size] -> [bsz, n_groups, 1, state_size] ->
+            # -> [bsz, n_groups, group to head repetition factor, state_size] -> [bsz, num_heads, state_size]
+            B = B.reshape(batch_size, self.n_groups, -1)[..., None, :]
+            B = B.expand(batch_size, self.n_groups, self.num_heads // self.n_groups, B.shape[-1]).contiguous()
+            B = B.reshape(batch_size, -1, B.shape[-1])
+            # [bsz, num_heads, head_dim, state_size]
+            dB = dt[..., None] * B[..., None, :]
+
+            # Discretize x into dB
+            # [bsz, intermediate_size] -> [bsz, num_heads, head_dim]
+            hidden_states = hidden_states.reshape(batch_size, -1, self.head_dim)
+            dBx = (dB * hidden_states[..., None]).to(device=cache_device)
+
+            # State calculation
+            # One recurrence step, written back into the cache: state <- state * dA + dBx
+            cache_params.ssm_states[self.layer_idx].copy_(
+                cache_params.ssm_states[self.layer_idx] * dA + dBx
+            )
+
+            # Subsequent output
+            # [bsz, n_groups * state_size] -> [bsz, num_heads, state_size]
+            C = C.reshape(batch_size, self.n_groups, -1)[..., None, :]
+            C = C.expand(batch_size, self.n_groups, self.num_heads // self.n_groups, C.shape[-1]).contiguous()
+            C = C.reshape(batch_size, -1, C.shape[-1])
+            # [bsz, num_heads, head_dim]
+
+            ssm_states = cache_params.ssm_states[self.layer_idx].to(device=C.device, dtype=C.dtype)  # Shape: [b, h, d, n]
+            # Reshape ssm_states to merge the first two dimensions
+            ssm_states_reshaped = ssm_states.view(batch_size * self.num_heads, self.head_dim, self.ssm_state_size)  # Shape: [b*h, d, n]
+            C_reshaped = C.view(batch_size * self.num_heads, self.ssm_state_size, 1)  # Shape: [b*h, n, 1]
+            # Contract the state dimension of the updated state against C
+            y = torch.bmm(ssm_states_reshaped, C_reshaped)
+            y = y.view(batch_size, self.num_heads, self.head_dim)
+
+            # D skip connection
+            # [num_heads] -> [num_heads, head_dim]
+            D = self.D[..., None].expand(self.D.shape[0], self.head_dim)
+            y = (y + hidden_states * D).to(y.dtype)
+
+            # [bsz, num_heads, head_dim] -> [bsz, 1, intermediate_size]
+            y = y.reshape(batch_size, -1)[:, None, ...]
+        else:
+            # begin ssd naive implementation without einsums
+            dt = nn.functional.softplus(dt + self.dt_bias)
+            dt = torch.clamp(dt, self.time_step_limit[0], self.time_step_limit[1])
+            hidden_states = hidden_states.reshape(batch_size, seq_len, -1, self.head_dim).float()
+            B = B.reshape(batch_size, seq_len, -1, self.ssm_state_size).float()
+            C = C.reshape(batch_size, seq_len, -1, self.ssm_state_size).float()
+            # Repeat each group's B/C so that every head has its own copy
+            B = B.repeat_interleave(self.num_heads // self.n_groups, dim=2, output_size=self.num_heads)
+            C = C.repeat_interleave(self.num_heads // self.n_groups, dim=2, output_size=self.num_heads)
+            # Number of positions needed to round seq_len up to a multiple of chunk_size (0 if already aligned)
+            pad_size = (self.chunk_size - seq_len % self.chunk_size) % self.chunk_size
+
+            # Skip-connection term, computed from the not-yet-discretized hidden states
+            D_residual = self.D[..., None] * pad_tensor_by_size(hidden_states, pad_size)
+
+            # Discretize x and A
+            hidden_states = hidden_states * dt[..., None]
+            A = A.to(hidden_states.dtype) * dt
+
+            # Rearrange into blocks/chunks
+            hidden_states, A, B, C = [reshape_into_chunks(t, pad_size, self.chunk_size) for t in (hidden_states, A, B, C)]
+
+            # [bsz, -1, chunk_size, num_heads] -> [bsz, num_heads, -1, chunk_size]
+            A = A.permute(0, 3, 1, 2)
+            A_cumsum = torch.cumsum(A, dim=-1)
+
+            # 1. Compute the output for each intra-chunk (diagonal blocks)
+            # This is the analog of a causal mask
+            L = torch.exp(segment_sum(A))
+
+            # Contraction of C and B to get G (attention-weights like)
+            G_intermediate = C[:, :, :, None, :, :] * B[:, :, None, :, :, :]  # shape: (b, c, l, s, h, n)
+            G = G_intermediate.sum(dim=-1)  # shape: (b, c, l, s, h)
+
+            # Compute M, equivalent to applying attention mask to weights
+            M_intermediate = G[..., None] * L.permute(0, 2, 3, 4, 1)[..., None]
+            M = M_intermediate.sum(dim=-1)
+
+            # Compute Y_diag (apply to values)
+            Y_diag = (M[..., None] * hidden_states[:, :, None]).sum(dim=3)
+
+            # 2. Compute the state for each intra-chunk
+            # (right term of low-rank factorization of off-diagonal blocks; B terms)
+            decay_states = torch.exp(A_cumsum[:, :, :, -1:] - A_cumsum)
+            B_decay = B * decay_states.permute(0, -2, -1, 1)[..., None]
+            states = (B_decay[..., None, :] * hidden_states[..., None]).sum(dim=2)
+
+            # 3. Compute the inter-chunk SSM recurrence; produces correct SSM states at chunk boundaries
+            # (middle term of factorization of off-diag blocks; A terms)
+            # NOTE(review): we are inside the `else` of `if use_precomputed_states`, so this condition is
+            # always False here and the scan always starts from a zero state.
+            if use_precomputed_states:
+                previous_states = cache_params.ssm_states[self.layer_idx][:, None, ...].to(device=states.device)
+            else:
+                previous_states = torch.zeros_like(states[:, :1])
+            states = torch.cat([previous_states, states], dim=1)
+            decay_chunk = torch.exp(segment_sum(nn.functional.pad(A_cumsum[:, :, :, -1], (1, 0))))
+            decay_chunk = decay_chunk.transpose(1, 3)
+            new_states = (decay_chunk[..., None, None] * states[:, :, None, ...]).sum(dim=1)
+            # The last entry is the state after the final chunk; the others are the states entering each chunk
+            states, ssm_state = new_states[:, :-1], new_states[:, -1]
+
+            # 4. Compute state -> output conversion per chunk
+            # (left term of low-rank factorization of off-diagonal blocks; C terms)
+            state_decay_out = torch.exp(A_cumsum)
+            C_times_states = (C[..., None, :] * states[:, :, None, ...])
+            state_decay_out_permuted = state_decay_out.permute(0, 2, 3, 1)
+            Y_off = (C_times_states.sum(-1) * state_decay_out_permuted[..., None])
+
+            # Add output of intra-chunk and inter-chunk terms (diagonal and off-diagonal blocks)
+            y = Y_diag + Y_off
+            # [bsz, -1, self.chunk_size, num_heads, head_dim] -> [bsz, (padded) seq_len, num_heads, head_dim]
+            y = y.reshape(batch_size, -1, self.num_heads, self.head_dim)
+
+            y = y + D_residual
+            # Cutting off padded chunks
+            if pad_size > 0:
+                y = y[:, :seq_len, :, :]
+            y = y.reshape(batch_size, seq_len, -1)
+
+            # Init cache
+            if ssm_state is not None and cache_params is not None:
+                cache_params.ssm_states[self.layer_idx].copy_(ssm_state)
+
+        # Gate the scan output: either inside the norm layer or with a plain SiLU gate
+        if self.mamba_rms_norm:
+            scan_output = self.norm(y, gate)
+        else:
+            scan_output = y * torch.nn.functional.silu(gate)
+
+        # end ssd naive
+
+        # 4. Final linear projection
+        contextualized_states = self.out_proj(scan_output.to(dtype))  # [batch, seq_len, hidden_size]
+        return contextualized_states
+    # fmt: on
+
+    def forward(
+        self,
+        hidden_states,
+        cache_params: Optional[FalconHybridMambaAttentionDynamicCache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+    ):
+        """
+        Run the mixer, choosing between the fused-kernel path and the pure-PyTorch path.
+
+        The fused path is used when `is_fast_path_available` is set and the `in_proj` weights live on a CUDA
+        device. Otherwise padded positions are zeroed (only for masks with more than one row and more than
+        one column) and `torch_forward` is called.
+        """
+        if is_fast_path_available and "cuda" in self.in_proj.weight.device.type:
+            return self.cuda_kernels_forward(hidden_states, cache_params, cache_position, attention_mask)
+
+        needs_pad_zeroing = (
+            attention_mask is not None and attention_mask.shape[1] > 1 and attention_mask.shape[0] > 1
+        )
+        if needs_pad_zeroing:
+            # tune out hidden states for pad tokens, see https://github.com/state-spaces/mamba/issues/66
+            input_dtype = hidden_states.dtype
+            masked_states = hidden_states * attention_mask[:, :, None]
+            hidden_states = masked_states.to(input_dtype)
+
+        return self.torch_forward(hidden_states, cache_params, cache_position, attention_mask)
+
+
+class FalconH1MLP(LlamaMLP):
+    """`LlamaMLP` variant with scalar multipliers on the gate pre-activation and on the block output."""
+
+    def __init__(self, config: FalconH1Config):
+        super().__init__(config)
+        # `config.mlp_multipliers` unpacks into exactly two values: (gate, down)
+        gate_scale, down_scale = config.mlp_multipliers
+        self.gate_multiplier = gate_scale
+        self.down_multiplier = down_scale
+
+    def forward(self, x):
+        """Return `down_proj(up_proj(x) * act_fn(gate_proj(x) * gate_multiplier)) * down_multiplier`."""
+        up_branch = self.up_proj(x)
+        gate_branch = self.act_fn(self.gate_proj(x) * self.gate_multiplier)
+        projected = self.down_proj(up_branch * gate_branch)
+        return projected * self.down_multiplier
+
+
+class FalconH1RMSNorm(LlamaRMSNorm):
+    # No overrides: this subclass only gives `LlamaRMSNorm` a FalconH1-specific name.
+    pass
+
+
+class FalconH1DecoderLayer(GradientCheckpointingLayer):
+    """
+    Decoder block that feeds the same normalized input to a Mamba mixer and to self-attention, adds the two
+    scaled outputs to the residual stream, and then applies a pre-normed feed-forward sub-block.
+    """
+
+    def __init__(self, config: FalconH1Config, layer_idx: int):
+        super().__init__()
+        # Sub-module creation order determines parameter registration order; keep it stable.
+        self.feed_forward = FalconH1MLP(config)
+
+        per_head_dim = config.hidden_size // config.num_attention_heads
+        query_channels = config.num_attention_heads * per_head_dim
+        key_value_channels = 2 * config.num_key_value_heads * per_head_dim
+        self.channels_attn = query_channels + key_value_channels
+
+        self.mamba = FalconH1Mixer(config=config, layer_idx=layer_idx)
+
+        self.self_attn = FalconH1Attention(config, layer_idx)
+
+        # Scalar multipliers applied around the two parallel branches
+        self.attention_in_multiplier = config.attention_in_multiplier
+        self.ssm_out_multiplier = config.ssm_out_multiplier
+        self.attn_out_multiplier = config.attention_out_multiplier
+
+        self.input_layernorm = FalconH1RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.pre_ff_layernorm = FalconH1RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        mamba_attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[FalconHybridMambaAttentionDynamicCache] = None,
+        output_attentions: Optional[bool] = False,
+        use_cache: Optional[bool] = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
+        **kwargs,
+    ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+            attention_mask (`torch.FloatTensor`, *optional*): mask handed to the self-attention branch.
+            mamba_attention_mask (`torch.Tensor`, *optional*): mask handed to the Mamba branch as its
+                `attention_mask`.
+            position_ids (`torch.LongTensor`, *optional*): position indices forwarded to the self-attention branch.
+            past_key_values (`FalconHybridMambaAttentionDynamicCache`, *optional*): cache object shared by both
+                branches (the Mamba branch receives it as `cache_params`).
+            output_attentions (`bool`, *optional*):
+                Whether to append the attention weights to the returned tuple.
+            use_cache (`bool`, *optional*):
+                Forwarded to the self-attention branch.
+            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
+                Indices depicting the position of the input sequence tokens in the sequence.
+            position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
+                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
+                with `head_dim` being the embedding dimension of each attention head.
+            kwargs (`dict`, *optional*):
+                Extra keyword arguments, passed through to the self-attention branch.
+
+        Returns:
+            `(hidden_states,)`, or `(hidden_states, self_attn_weights)` when `output_attentions` is truthy.
+        """
+
+        residual = hidden_states
+        normed_states = self.input_layernorm(hidden_states)
+
+        # Mamba branch
+        ssm_branch = self.mamba(
+            hidden_states=normed_states,
+            cache_params=past_key_values,
+            cache_position=cache_position,
+            attention_mask=mamba_attention_mask,
+        )
+        ssm_branch = ssm_branch * self.ssm_out_multiplier
+
+        # Attention branch, fed with a rescaled copy of the same normalized input
+        attn_branch, self_attn_weights = self.self_attn(
+            hidden_states=normed_states * self.attention_in_multiplier,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            output_attentions=output_attentions,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            position_embeddings=position_embeddings,
+            **kwargs,
+        )
+        attn_branch = attn_branch * self.attn_out_multiplier
+
+        # Both branches share a single residual connection
+        hidden_states = residual + (ssm_branch + attn_branch)
+
+        # Feed-forward sub-block with its own pre-norm and residual connection
+        ff_output = self.feed_forward(self.pre_ff_layernorm(hidden_states))
+        hidden_states = hidden_states + ff_output
+
+        if output_attentions:
+            return (hidden_states, self_attn_weights)
+        return (hidden_states,)
+
+
+@auto_docstring
+class FalconH1PreTrainedModel(PreTrainedModel):
+    config: FalconH1Config
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["FalconH1DecoderLayer"]
+    _skip_keys_device_placement = "past_key_values"
+    _supports_flash_attn = True
+    _supports_sdpa = True
+    _is_stateful = True
+
+    def _init_weights(self, module):
+        """
+        Initialize every trainable parameter reachable from `module`, choosing the scheme from the parameter name.
+
+        - name contains "layernorm" (case-insensitive) and "weight": filled with 1.0
+        - name contains "bias": zeroed
+        - anything else: drawn from a normal distribution with std `config.initializer_range`
+
+        Parameters with `requires_grad=False` are left untouched. Initialization is best-effort: a parameter
+        whose normal init raises is skipped and reported through the module logger instead of aborting.
+        """
+        std = self.config.initializer_range
+        for name, param in module.named_parameters(recurse=True):
+            if not param.requires_grad:
+                continue
+            if "layernorm" in name.lower() and "weight" in name:
+                # LayerNorm weights usually initialized to 1
+                param.data.fill_(1.0)
+            elif "bias" in name:
+                param.data.zero_()
+            else:
+                try:
+                    param.data.normal_(mean=0.0, std=std)
+                except Exception as e:
+                    # Report through the library logger (not stdout) so the message honours the
+                    # configured verbosity and handlers.
+                    logger.warning(f"Skipping init for {name} due to error: {e}")
+
+
+def compute_mup_vector(config):
+    """
+    Build the per-channel MuP scaling vector used on the mixer's projected states.
+
+    The vector is made of five consecutive sections, each filled with one entry of
+    `config.ssm_multipliers` (in order): two sections of `intermediate_size` channels, two sections of
+    `mamba_n_groups * mamba_d_state` channels, and a final section of `mamba_n_heads` channels.
+
+    Args:
+        config: FalconH1Config object
+
+    Returns:
+        torch.Tensor: tensor of shape `(1, 1, total_channels)` holding the multipliers
+    """
+    if config.mamba_d_ssm is not None:
+        inner_width = config.mamba_d_ssm
+    else:
+        inner_width = int(config.mamba_expand * config.hidden_size)
+    state_width = config.mamba_n_groups * config.mamba_d_state
+    section_widths = (inner_width, inner_width, state_width, state_width, config.mamba_n_heads)
+    section_multipliers = config.ssm_multipliers
+
+    mup_vector = torch.ones(1, 1, sum(section_widths))
+
+    # Walk the sections left to right, scaling each slice by its own multiplier
+    section_start = 0
+    for section_idx, width in enumerate(section_widths):
+        section_end = section_start + width
+        mup_vector[:, :, section_start:section_end] *= section_multipliers[section_idx]
+        section_start = section_end
+
+    return mup_vector
+
+
+@auto_docstring
+# Adapted from transformers.models.jamba.modeling_jamba.JambaModel
+class FalconH1Model(FalconH1PreTrainedModel):
+    def __init__(self, config: FalconH1Config):
+        """Create the embedding table, the decoder stack, the final norm and the rotary embedding module."""
+        super().__init__(config)
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+        self.layers = nn.ModuleList(
+            [FalconH1DecoderLayer(config, layer_idx=layer_idx) for layer_idx in range(config.num_hidden_layers)]
+        )
+
+        self._attn_implementation = config._attn_implementation
+        self.final_layernorm = FalconH1RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.rotary_emb = FalconH1RotaryEmbedding(config=config)
+
+        self.embedding_multiplier = config.embedding_multiplier
+        self.lm_head_multiplier = config.lm_head_multiplier
+
+        self.gradient_checkpointing = False
+
+        # The MuP vector is computed once; every mixer registers the same tensor as a non-persistent buffer
+        shared_mup_vector = compute_mup_vector(config)
+        for decoder_layer in self.layers:
+            decoder_layer.mamba.register_buffer("mup_vector", shared_mup_vector, persistent=False)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[FalconHybridMambaAttentionDynamicCache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,  # NOOP kwargs, for now
+    ) -> Union[tuple, BaseModelOutputWithPast]:
+        # Fall back to the config defaults for any flag the caller left unset
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+
+        # Exactly one of `input_ids` / `inputs_embeds` must be provided
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+        if self.gradient_checkpointing and self.training and use_cache:
+            logger.warning_once(
+                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
+            )
+            use_cache = False
+
+        # The embedding multiplier is applied only to embeddings computed here; caller-provided
+        # `inputs_embeds` are used as-is.
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids) * self.embedding_multiplier
+        hidden_states = inputs_embeds
+
+        # This method never creates a cache itself; it only warns when one was requested but not supplied
+        if use_cache and past_key_values is None:
+            logger.warning_once(
+                "FalconH1 requires an initialized `FalconHybridMambaAttentionDynamicCache` to return a cache. None was "
+                "provided, so no cache will be returned."
+            )
+
+        # Without an explicit `cache_position`, positions start at 0 and span the current input length
+        if cache_position is None:
+            cache_position = torch.arange(hidden_states.shape[1], device=hidden_states.device)
+        if position_ids is None:
+            position_ids = cache_position.unsqueeze(0)
+
+        # The attention branch and the Mamba branch of each layer receive separately prepared masks
+        causal_mask = self._update_causal_mask(
+            attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
+        )
+        mamba_mask = self._update_mamba_mask(attention_mask, cache_position)
+
+        # create position embeddings to be shared across the decoder layers
+        position_embeddings = self.rotary_emb(hidden_states, position_ids)
+
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+
+        for decoder_layer in self.layers:
+            # Record the input of each layer; the output of the last layer is appended after the final norm
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+
+            layer_outputs = decoder_layer(
+                hidden_states,
+                attention_mask=causal_mask,
+                mamba_attention_mask=mamba_mask,
+                position_ids=position_ids,
+                past_key_values=past_key_values,
+                output_attentions=output_attentions,
+                use_cache=use_cache,
+                cache_position=cache_position,
+                position_embeddings=position_embeddings,
+            )
+
+            hidden_states = layer_outputs[0]
+
+            if output_attentions:
+                if layer_outputs[1] is not None:
+                    # append attentions only of attention layers. Mamba layers return `None` as the attention weights
+                    all_self_attns += (layer_outputs[1],)
+
+        hidden_states = self.final_layernorm(hidden_states)
+
+        # add hidden states from the last decoder layer
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+
+        # Flag the cache as populated after this pass so later calls can see that a previous state exists
+        if past_key_values and not past_key_values.has_previous_state:
+            past_key_values.has_previous_state = True
+
+        next_cache = None if not use_cache else past_key_values
+
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=next_cache,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+        )
+
+    def _update_mamba_mask(self, attention_mask, cache_position):
+        """
+        Decide which mask, if any, the Mamba branch should receive.
+
+        Returns `None` (no zeroing of states needed) when
+            1. the forward pass is a cached one (`cache_position[0] > 0`), or
+            2. a mask is given and every entry equals 1 (all inputs are attended to).
+        Otherwise `attention_mask` is returned unchanged (which may itself be `None`).
+        """
+        if cache_position[0] > 0:
+            return None
+        if attention_mask is not None and torch.all(attention_mask == 1):
+            return None
+        return attention_mask
+
+    def _update_causal_mask(
+        self,
+        attention_mask: torch.Tensor,
+        input_tensor: torch.Tensor,
+        cache_position: torch.Tensor,
+        past_key_values: FalconHybridMambaAttentionDynamicCache,
+        output_attentions: bool,
+    ):
+        """
+        Prepare the mask handed to the attention branch of every decoder layer.
+
+        Depending on `config._attn_implementation` the result is one of:
+
+        - `flash_attention_2`: the given `attention_mask` unchanged if it contains a `0.0` entry, else `None`.
+        - `sdpa` without `output_attentions`: `None` when `AttentionMaskConverter._ignore_causal_mask_sdpa`
+          reports that the mask can be dropped.
+        - otherwise: the mask built by `_prepare_4d_causal_attention_mask_with_cache_position`, additionally
+          passed through `AttentionMaskConverter._unmask_unattended` for `sdpa` on cuda/xpu/npu devices.
+        """
+        if self.config._attn_implementation == "flash_attention_2":
+            if attention_mask is not None and 0.0 in attention_mask:
+                return attention_mask
+            return None
+
+        # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
+        # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
+        # to infer the attention mask.
+        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+
+        # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
+        if self.config._attn_implementation == "sdpa" and not output_attentions:
+            if AttentionMaskConverter._ignore_causal_mask_sdpa(
+                attention_mask,
+                inputs_embeds=input_tensor,
+                past_key_values_length=past_seen_tokens,
+                is_training=self.training,
+            ):
+                return None
+
+        dtype = input_tensor.dtype
+        sequence_length = input_tensor.shape[1]
+        # Width of the mask: taken from the provided mask when there is one, otherwise derived from the
+        # number of cached tokens plus the current input length (plus one).
+        target_length = (
+            attention_mask.shape[-1]
+            if isinstance(attention_mask, torch.Tensor)
+            else past_seen_tokens + sequence_length + 1
+        )
+
+        # In case the provided `attention` mask is 2D, we generate a causal mask here (4D).
+        causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
+            attention_mask,
+            sequence_length=sequence_length,
+            target_length=target_length,
+            dtype=dtype,
+            cache_position=cache_position,
+            batch_size=input_tensor.shape[0],
+        )
+
+        if (
+            self.config._attn_implementation == "sdpa"
+            and attention_mask is not None
+            and attention_mask.device.type in ["cuda", "xpu", "npu"]
+            and not output_attentions
+        ):
+            # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
+            # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
+            # Details: https://github.com/pytorch/pytorch/issues/110213
+            min_dtype = torch.finfo(dtype).min
+            causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)
+
+        return causal_mask
+
+    @staticmethod
+    def _prepare_4d_causal_attention_mask_with_cache_position(
+        attention_mask: torch.Tensor,
+        sequence_length: int,
+        target_length: int,
+        dtype: torch.dtype,
+        cache_position: torch.Tensor,
+        batch_size: int,
+        **kwargs,
+    ):
+        """
+        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
+        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, returns it unchanged.
+
+        Args:
+            attention_mask (`torch.Tensor`):
+                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
+                `(batch_size, 1, query_length, key_value_length)`.
+            sequence_length (`int`):
+                The sequence length being processed.
+            target_length (`int`):
+                The target length: when generating with static cache, the mask should be as long as the static cache,
+                to account for the 0 padding, the part of the cache that is not filled yet.
+            dtype (`torch.dtype`):
+                The dtype to use for the 4D attention mask.
+            cache_position (`torch.Tensor`):
+                Indices depicting the position of the input sequence tokens in the sequence.
+            batch_size (`int`):
+                Batch size.
+        """
+        if attention_mask is not None and attention_mask.dim() == 4:
+            # In this case we assume that the mask already comes in inverted form and requires no inversion or slicing.
+            causal_mask = attention_mask
+        else:
+            min_dtype = torch.finfo(dtype).min
+            causal_mask = torch.full(
+                (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device
+            )
+            if sequence_length != 1:
+                causal_mask = torch.triu(causal_mask, diagonal=1)
+            causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1)
+            causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
+            if attention_mask is not None:
+                causal_mask = causal_mask.clone()  # `expand` returns a view: copy to own memory for in-place edit
+                mask_length = attention_mask.shape[-1]
+                padding_attention_mask = (attention_mask[:, None, None, :] == attention_mask[:, None, :, None])[
+                    :, :, -sequence_length:, :
+                ].to(dtype)
+                padding_mask = causal_mask[:, :, :, :mask_length] + padding_attention_mask
+                padding_mask = padding_mask == 0  # causally visible pairs whose query/key mask values differ
+                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
+                    padding_mask, min_dtype
+                )
+
+        return causal_mask
+
+
+class FalconH1ForCausalLM(LlamaForCausalLM):
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[FalconHybridMambaAttentionDynamicCache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
+        **kwargs,
+    ) -> Union[tuple, CausalLMOutputWithPast]:
+        r"""
+        Example:
+
+        ```python
+        >>> from transformers import AutoTokenizer, FalconH1ForCausalLM
+
+        >>> model = FalconH1ForCausalLM.from_pretrained("...")
+        >>> tokenizer = AutoTokenizer.from_pretrained("...")
+
+        >>> prompt = "Hey, are you conscious? Can you talk to me?"
+        >>> inputs = tokenizer(prompt, return_tensors="pt")
+
+        >>> # Generate
+        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
+        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
+        ```"""
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+
+        # decoder outputs consist of (dec_features, layer_state, dec_hidden, dec_attn)
+        outputs = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            cache_position=cache_position,
+            **kwargs,
+        )
+
+        hidden_states = outputs[0]
+        # Only compute the logits selected by `logits_to_keep`, then scale them by the model's `lm_head_multiplier`
+        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+        logits = self.lm_head(hidden_states[:, slice_indices, :]) * self.model.lm_head_multiplier
+
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs)
+
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+    def prepare_inputs_for_generation(
+        self,
+        input_ids,
+        past_key_values=None,
+        attention_mask=None,
+        inputs_embeds=None,
+        cache_position=None,
+        position_ids=None,
+        use_cache=True,
+        **kwargs,
+    ):
+        # Overwritten -- has a unique cache type, `FalconHybridMambaAttentionDynamicCache`
+
+        empty_past_kv = past_key_values is None
+
+        # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens
+        # Exception 1: when passing input_embeds, input_ids may be missing entries
+        # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here
+        # Exception 3: with synced GPUs cache_position may go out of bounds, but we only want dummy token in that case.
+        #              (we can't check exception 3 while compiling)
+        if not empty_past_kv:
+            if (
+                inputs_embeds is not None  # Exception 1
+                or (is_torchdynamo_compiling() or cache_position[-1] >= input_ids.shape[1])  # Exception 3
+            ):
+                input_ids = input_ids[:, -cache_position.shape[0] :]
+            elif input_ids.shape[1] != cache_position.shape[0]:  # Default case (the "else", a no op, is Exception 2)
+                input_ids = input_ids[:, cache_position]
+        else:
+            past_key_values = FalconHybridMambaAttentionDynamicCache(
+                self.config,
+                input_ids.shape[0],
+                self.dtype,
+                devices=[
+                    self.model.layers[i].mamba.conv1d.weight.device for i in range(self.config.num_hidden_layers)
+                ],
+            )
+
+        if attention_mask is not None and position_ids is None:
+            # create position_ids on the fly for batch generation
+            position_ids = attention_mask.long().cumsum(-1) - 1
+            position_ids.masked_fill_(attention_mask == 0, 1)
+            if not empty_past_kv:
+                position_ids = position_ids[:, -input_ids.shape[1] :]
+
+        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
+        if inputs_embeds is not None and empty_past_kv:
+            model_inputs = {"inputs_embeds": inputs_embeds}
+        else:
+            model_inputs = {"input_ids": input_ids.contiguous()}  # `contiguous()` needed for compilation use cases
+
+        model_inputs.update(
+            {
+                "position_ids": position_ids,
+                "past_key_values": past_key_values,
+                "use_cache": use_cache,
+                "attention_mask": attention_mask,
+                "logits_to_keep": self.config.num_logits_to_keep,
+                "cache_position": cache_position,
+            }
+        )
+        return model_inputs
+
+
+__all__ = ["FalconH1Model", "FalconH1ForCausalLM", "FalconH1PreTrainedModel"]
diff --git a/phivenv/Lib/site-packages/transformers/models/falcon_mamba/__init__.py b/phivenv/Lib/site-packages/transformers/models/falcon_mamba/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..202147c938465dd7dfcb7e79ecbeeb93ce632dbf
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/falcon_mamba/__init__.py
@@ -0,0 +1,27 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_falcon_mamba import *
+    from .modeling_falcon_mamba import *
+else:
+    import sys
+    # Outside type checking, replace this module in `sys.modules` with a `_LazyModule` built from this file.
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/phivenv/Lib/site-packages/transformers/models/falcon_mamba/__pycache__/__init__.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/falcon_mamba/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1b6aaacb560061a121f5c0cb61545c4359384f6e
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/falcon_mamba/__pycache__/__init__.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/falcon_mamba/__pycache__/configuration_falcon_mamba.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/falcon_mamba/__pycache__/configuration_falcon_mamba.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..18e7522f700e55821da80fbe3087245bea5bc665
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/falcon_mamba/__pycache__/configuration_falcon_mamba.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/falcon_mamba/__pycache__/modeling_falcon_mamba.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/falcon_mamba/__pycache__/modeling_falcon_mamba.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fbcb76e21f45f859acd630849ec0980af65eb6db
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/falcon_mamba/__pycache__/modeling_falcon_mamba.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/falcon_mamba/__pycache__/modular_falcon_mamba.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/falcon_mamba/__pycache__/modular_falcon_mamba.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..da8774f6446da3d80cc99fd32880f21d834d2aaf
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/falcon_mamba/__pycache__/modular_falcon_mamba.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/falcon_mamba/configuration_falcon_mamba.py b/phivenv/Lib/site-packages/transformers/models/falcon_mamba/configuration_falcon_mamba.py
new file mode 100644
index 0000000000000000000000000000000000000000..7630ebd6343ac968303fc0c31f2742bb352b4f8a
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/falcon_mamba/configuration_falcon_mamba.py
@@ -0,0 +1,170 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/falcon_mamba/modular_falcon_mamba.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_falcon_mamba.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# coding=utf-8
+# Copyright 2024 Tri Dao, Albert Gu, Technological Innovation Institute and HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+
+from ...configuration_utils import PretrainedConfig
+
+
+class FalconMambaConfig(PretrainedConfig):
+    """
+    This is the configuration class to store the configuration of a [`FalconMambaModel`]. It is used to instantiate a FALCON_MAMBA
+    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+    defaults will yield a similar configuration to that of the FALCON_MAMBA
+    [tiiuae/falcon-mamba-7b](https://huggingface.co/tiiuae/falcon-mamba-7b) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 50280):
+            Vocabulary size of the FALCON_MAMBA model. Defines the number of different tokens that can be represented by the
+            `input_ids` passed when calling [`FalconMambaModel`].
+        hidden_size (`int`, *optional*, defaults to 768):
+            Dimensionality of the embeddings and hidden states.
+        state_size (`int`, *optional*, defaults to 16): Shape of the state space latents.
+        num_hidden_layers (`int`, *optional*, defaults to 32):
+            Number of hidden layers in the model.
+        layer_norm_epsilon (`float`, *optional*, defaults to 1e-05):
+            The epsilon to use in the layer normalization layers.
+        pad_token_id (`int`, *optional*, defaults to 0):
+            Padding token id.
+        bos_token_id (`int`, *optional*, defaults to 0):
+            The id of the beginning of sentence token in the vocabulary.
+        eos_token_id (`int`, *optional*, defaults to 0):
+            The id of the end of sentence token in the vocabulary.
+        expand (`int`, *optional*, defaults to 2): Expanding factor used to determine the intermediate size.
+        conv_kernel (`int`, *optional*, defaults to 4): Size of the convolution kernel.
+        use_bias (`bool`, *optional*, defaults to `False`):
+            Whether or not to use bias in ["in_proj", "out_proj"] of the mixer block.
+        use_conv_bias (`bool`, *optional*, defaults to `True`):
+            Whether or not to use bias in the convolution layer of the mixer block.
+        hidden_act (`str`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the decoder.
+        initializer_range (`float`, *optional*, defaults to 0.1):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        residual_in_fp32 (`bool`, *optional*, defaults to `True`):
+            Whether or not residuals should be in `float32`. If set to `False` residuals will keep the same `dtype` as the rest of the model.
+        time_step_rank (`Union[int,str]`, *optional*, defaults to `"auto"`):
+            Rank of the discretization projection matrix. `"auto"` means that it will default to `math.ceil(self.hidden_size / 16)`
+        time_step_scale (`float`, *optional*, defaults to 1.0):
+            Scale used to scale `dt_proj.bias`.
+        time_step_min (`float`, *optional*, defaults to 0.001):
+            Minimum `time_step` used to bound `dt_proj.bias`.
+        time_step_max (`float`, *optional*, defaults to 0.1):
+            Maximum `time_step` used to bound `dt_proj.bias`.
+        time_step_init_scheme (`str`, *optional*, defaults to `"random"`):
+            Init scheme used for `dt_proj.weight`. Should be one of `["random","uniform"]`
+        time_step_floor (`float`, *optional*, defaults to 0.0001):
+            Minimum clamping value of the `dt_proj.bias` layer initialization.
+        rescale_prenorm_residual (`bool`, *optional*, defaults to `False`):
+            Whether or not to rescale `out_proj` weights when initializing.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the cache should be used.
+        use_falcon_mambapy (`bool`, *optional*, defaults to `False`):
+            This argument corresponds to `use_mambapy` in MambaConfig.
+            Determines the fallback strategy during training if the CUDA-based official implementation of Mamba is not available. If `True`, the mamba.py implementation is used. If `False`, the naive and slower implementation is used. Consider switching to the naive version if memory is limited.
+        mixer_rms_eps (`float`, *optional*, defaults to 1e-06):
+            The RMS norm epsilon value that is used in the Mixer RMS norm for B, C and dt states.
+
+
+    Example:
+
+    ```python
+    >>> from transformers import FalconMambaConfig, FalconMambaModel
+
+    >>> # Initializing a FalconMamba configuration
+    >>> configuration = FalconMambaConfig()
+
+    >>> # Initializing a model (with random weights) from the configuration
+    >>> model = FalconMambaModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "falcon_mamba"
+
+    def __init__(
+        self,
+        vocab_size=50280,
+        hidden_size=768,
+        state_size=16,
+        num_hidden_layers=32,
+        layer_norm_epsilon=1e-5,
+        pad_token_id=0,
+        bos_token_id=0,
+        eos_token_id=0,
+        expand=2,
+        conv_kernel=4,
+        use_bias=False,
+        use_conv_bias=True,
+        hidden_act="silu",
+        initializer_range=0.1,
+        residual_in_fp32=True,
+        time_step_rank="auto",
+        time_step_scale=1.0,
+        time_step_min=0.001,
+        time_step_max=0.1,
+        time_step_init_scheme="random",
+        time_step_floor=1e-4,
+        rescale_prenorm_residual=False,
+        use_cache=True,
+        use_falcon_mambapy=False,
+        mixer_rms_eps=1e-6,
+        **kwargs,
+    ):
+        super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, pad_token_id=pad_token_id, **kwargs)
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.state_size = state_size
+        self.num_hidden_layers = num_hidden_layers
+        self.layer_norm_epsilon = layer_norm_epsilon
+        self.conv_kernel = conv_kernel
+        self.expand = expand
+        # An explicit `intermediate_size` kwarg wins; otherwise it is derived as `expand * hidden_size`
+        self.intermediate_size = (
+            int(expand * self.hidden_size)
+            if kwargs.get("intermediate_size") is None
+            else kwargs.get("intermediate_size")
+        )
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
+        self.pad_token_id = pad_token_id
+        self.use_bias = use_bias
+        self.use_conv_bias = use_conv_bias
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.time_step_rank = math.ceil(self.hidden_size / 16) if time_step_rank == "auto" else time_step_rank
+        self.time_step_scale = time_step_scale
+        self.time_step_min = time_step_min
+        self.time_step_max = time_step_max
+        self.time_step_init_scheme = time_step_init_scheme
+        self.time_step_floor = time_step_floor
+        self.rescale_prenorm_residual = rescale_prenorm_residual
+        self.residual_in_fp32 = residual_in_fp32
+        self.use_cache = use_cache
+        self.use_falcon_mambapy = use_falcon_mambapy
+        self.mixer_rms_eps = mixer_rms_eps
+
+
+__all__ = ["FalconMambaConfig"]
diff --git a/phivenv/Lib/site-packages/transformers/models/falcon_mamba/modeling_falcon_mamba.py b/phivenv/Lib/site-packages/transformers/models/falcon_mamba/modeling_falcon_mamba.py
new file mode 100644
index 0000000000000000000000000000000000000000..0a60b2b6bf65c0eb3a93296c12fbf43d178c2b80
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/falcon_mamba/modeling_falcon_mamba.py
@@ -0,0 +1,910 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/falcon_mamba/modular_falcon_mamba.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_falcon_mamba.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# coding=utf-8
+# Copyright 2024 Tri Dao, Albert Gu, Technological Innovation Institute and HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+from dataclasses import dataclass
+from typing import Any, Optional, Union
+
+import torch
+from torch import nn
+from torch.nn import CrossEntropyLoss
+
+from ...activations import ACT2FN
+from ...configuration_utils import PretrainedConfig
+from ...generation import GenerationMixin
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_utils import PreTrainedModel
+from ...utils import ModelOutput, auto_docstring, logging
+from ...utils.import_utils import (
+    is_causal_conv1d_available,
+    is_mamba_ssm_available,
+    is_mambapy_available,
+)
+from .configuration_falcon_mamba import FalconMambaConfig
+
+
+if is_mambapy_available():
+    from mambapy.pscan import pscan
+else:
+    pscan = None
+
+if is_mamba_ssm_available():
+    from mamba_ssm.ops.selective_scan_interface import selective_scan_fn
+    from mamba_ssm.ops.triton.selective_state_update import selective_state_update
+
+    from ...kernels.falcon_mamba import mamba_inner_fn
+else:
+    selective_state_update, selective_scan_fn, mamba_inner_fn = None, None, None
+
+if is_causal_conv1d_available():
+    from causal_conv1d import causal_conv1d_fn, causal_conv1d_update
+else:
+    causal_conv1d_update, causal_conv1d_fn = None, None
+
+
+logger = logging.get_logger(__name__)
+
+
+class FalconMambaCache:
+    """
+    Cache for falcon_mamba model which does not have attention mechanism and key value states.
+
+    Arguments:
+        config (`PretrainedConfig`):
+            The configuration file defining the shape-related attributes required to initialize the static cache.
+        max_batch_size (`int`):
+            The maximum batch size with which the model will be used. Note that a new instance must be instantiated if a smaller batch size is used.
+        dtype (`torch.dtype`, *optional*, defaults to `torch.float16`):
+            The default `dtype` to use when initializing the layer.
+        device (`torch.device` or `str`, *optional*):
+            The device on which the cache should be initialized. Should be the same as the layer.
+
+    Example:
+
+        ```python
+        >>> from transformers import AutoTokenizer, FalconMambaForCausalLM, FalconMambaCache
+
+        >>> model = FalconMambaForCausalLM.from_pretrained("state-spaces/falcon_mamba-130m-hf")
+        >>> tokenizer = AutoTokenizer.from_pretrained("state-spaces/falcon_mamba-130m-hf")
+
+        >>> inputs = tokenizer(text="My name is FalconMamba", return_tensors="pt")
+
+        >>> # Prepare a cache class and pass it to model's forward
+        >>> past_key_values = FalconMambaCache(config=model.config, max_batch_size=1, device=model.device, dtype=model.dtype)
+        >>> outputs = model(**inputs, past_key_values=past_key_values, use_cache=True)
+        >>> outputs.past_key_values
+        FalconMambaCache()
+        ```
+    """
+
+    is_compileable = True
+
+    # TODO (joao): add layer_device_map arg and update code in `generate` accordingly
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        max_batch_size: int,
+        dtype: torch.dtype = torch.float16,
+        device: Union[torch.device, str, None] = None,
+    ):
+        self.max_batch_size = max_batch_size
+        self._dtype = dtype
+        self.intermediate_size = config.intermediate_size
+        self.ssm_state_size = config.state_size
+        self.conv_kernel_size = config.conv_kernel
+
+        self.conv_states: list[torch.Tensor] = []
+        self.ssm_states: list[torch.Tensor] = []
+        device = torch.device(device) if device is not None else None
+        for _ in range(config.num_hidden_layers):  # one zero-initialized conv state and ssm state per layer
+            conv_state: torch.Tensor = torch.zeros(
+                self.max_batch_size,
+                self.intermediate_size,
+                self.conv_kernel_size,
+                device=device,
+                dtype=self._dtype,
+            )
+            ssm_state: torch.Tensor = torch.zeros(
+                self.max_batch_size,
+                self.intermediate_size,
+                self.ssm_state_size,
+                device=device,
+                dtype=self._dtype,
+            )
+
+            torch._dynamo.mark_static_address(conv_state)
+            torch._dynamo.mark_static_address(ssm_state)
+            self.conv_states.append(conv_state)
+            self.ssm_states.append(ssm_state)
+
+    def update_conv_state(
+        self, layer_idx: int, new_conv_state: torch.Tensor, cache_position: torch.LongTensor
+    ) -> torch.Tensor:
+        # This `if` block is only reached in multi-GPU setups when `layer_device_map` is not passed. It is used
+        # when the cache is initialized in the forward pass (e.g. FalconMamba)
+        if self.conv_states[layer_idx].device != new_conv_state.device:
+            self.conv_states[layer_idx] = self.conv_states[layer_idx].to(new_conv_state.device)
+
+        conv_state = self.conv_states[layer_idx]
+        cache_position = cache_position.clamp(0, self.conv_kernel_size - 1)
+
+        conv_state = conv_state.roll(shifts=-1, dims=-1)
+        conv_state[:, :, cache_position] = new_conv_state.to(device=conv_state.device, dtype=conv_state.dtype)
+        self.conv_states[layer_idx].zero_()  # overwrite in place rather than rebinding the list entry
+        self.conv_states[layer_idx] += conv_state
+        return self.conv_states[layer_idx]
+
+    def update_ssm_state(self, layer_idx: int, new_ssm_state: torch.Tensor):
+        self.ssm_states[layer_idx].zero_()  # overwrite in place rather than rebinding the list entry
+        self.ssm_states[layer_idx] += new_ssm_state.to(self.ssm_states[layer_idx].device)
+        return self.ssm_states[layer_idx]
+
+    def reset(self):
+        for layer_idx in range(len(self.conv_states)):
+            # In-place ops prevent breaking the static address
+            self.conv_states[layer_idx].zero_()
+            self.ssm_states[layer_idx].zero_()
+
+
+def rms_forward(hidden_states, variance_epsilon=1e-6):
+    """
+    Calculates simple RMSNorm with no learnable weights, computed in float32 and cast back to the input dtype.
+    `MambaRMSNorm` will leverage this in order to multiply the final result with the RMSNorm weight
+
+    Args:
+        hidden_states (`torch.Tensor`):
+            Hidden states to normalize over their last dimension
+        variance_epsilon (`float`, *optional*, defaults to 1e-06):
+            The eps value to add in the square root scaling factor
+    """
+    input_dtype = hidden_states.dtype
+    hidden_states = hidden_states.to(torch.float32)
+
+    variance = hidden_states.pow(2).mean(-1, keepdim=True)
+    hidden_states = hidden_states * torch.rsqrt(variance + variance_epsilon)
+    return hidden_states.to(input_dtype)
+
+
+class FalconMambaMixer(nn.Module):
+    """
+    Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`.
+    A, D are input independent (see FalconMamba paper [1] Section 3.5.2 "Interpretation of A" for why A isn't selective)
+    ∆, B, C are input-dependent (this is a key difference between FalconMamba and the linear time invariant S4,
+    and is why FalconMamba is called **selective** state spaces)
+    """
+
+    def __init__(self, config: FalconMambaConfig, layer_idx: int):
+        """Build the projections, the depthwise convolution and the SSM parameters of layer `layer_idx` from `config`."""
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.ssm_state_size = config.state_size
+        self.conv_kernel_size = config.conv_kernel
+        self.intermediate_size = config.intermediate_size
+        self.time_step_rank = int(config.time_step_rank)
+        self.layer_idx = layer_idx
+        self.use_conv_bias = config.use_conv_bias
+        # Depthwise convolution (groups == channels). The padding of `conv_kernel - 1` makes the output longer
+        # than the input; `slow_forward` crops it back with `[..., :seq_len]`.
+        self.conv1d = nn.Conv1d(
+            in_channels=self.intermediate_size,
+            out_channels=self.intermediate_size,
+            bias=config.use_conv_bias,
+            kernel_size=config.conv_kernel,
+            groups=self.intermediate_size,
+            padding=config.conv_kernel - 1,
+        )
+
+        # `activation` keeps the name (passed to the fused conv kernels), `act` is the callable used in the slow path
+        self.activation = config.hidden_act
+        self.act = ACT2FN[config.hidden_act]
+
+        self.use_falcon_mambapy = config.use_falcon_mambapy
+
+        # projection of the input hidden states
+        self.in_proj = nn.Linear(self.hidden_size, self.intermediate_size * 2, bias=config.use_bias)
+        # selective projection used to make dt, B and C input dependent
+        self.x_proj = nn.Linear(self.intermediate_size, self.time_step_rank + self.ssm_state_size * 2, bias=False)
+        # time step projection (discretization)
+        self.dt_proj = nn.Linear(self.time_step_rank, self.intermediate_size, bias=True)
+
+        # S4D real initialization. These are not discretized!
+        # The core is to load them, compute the discrete states, then write the updated state. Keeps the memory bounded
+        A = torch.arange(1, self.ssm_state_size + 1, dtype=torch.float32)[None, :]
+        A = A.expand(self.intermediate_size, -1).contiguous()
+
+        # A is stored as log(A); the forward passes recover it as -exp(A_log)
+        self.A_log = nn.Parameter(torch.log(A))
+        self.D = nn.Parameter(torch.ones(self.intermediate_size))
+        self.out_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.use_bias)
+        self.use_bias = config.use_bias
+
+        self.warn_slow_implementation()
+        # Triton expects to pass RMS weights even if they are non learnable, thus we need to create these weights here
+        self.register_buffer(
+            "b_c_rms", torch.nn.Parameter(torch.ones(self.ssm_state_size), requires_grad=False), persistent=False
+        )
+        self.register_buffer(
+            "dt_rms", torch.nn.Parameter(torch.ones(self.intermediate_size), requires_grad=False), persistent=False
+        )
+        self.rms_eps = config.mixer_rms_eps
+
+    def warn_slow_implementation(self):
+        """
+        Warn once when any of the optional fused kernels is missing, naming the fallback that will be used.
+
+        Raises:
+            ImportError: if the kernels are missing, `use_falcon_mambapy` is set and `mambapy` is not installed.
+        """
+        is_fast_path_available = all(
+            (selective_state_update, selective_scan_fn, causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn)
+        )
+        if not is_fast_path_available:
+            if self.use_falcon_mambapy:
+                if is_mambapy_available():
+                    logger.warning_once(
+                        "The fast path is not available because one of `(selective_state_update, selective_scan_fn, causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn)`"
+                        " is None. Falling back to the mamba.py backend. To install follow https://github.com/state-spaces/mamba/#installation and"
+                        " https://github.com/Dao-AILab/causal-conv1d"
+                    )
+                else:
+                    # NOTE(review): the message says `use_mambapy` while the attribute checked above is
+                    # `use_falcon_mambapy`.
+                    raise ImportError(
+                        "use_mambapy is set to True but the mambapy package is not installed. To install it follow https://github.com/alxndrTL/mamba.py."
+                    )
+            else:
+                logger.warning_once(
+                    "The fast path is not available because one of `(selective_state_update, selective_scan_fn, causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn)`"
+                    " is None. Falling back to the sequential implementation of Mamba, as use_mambapy is set to False. To install follow https://github.com/state-spaces/mamba/#installation and"
+                    " https://github.com/Dao-AILab/causal-conv1d. For the mamba.py backend, follow https://github.com/alxndrTL/mamba.py."
+                )
+
+    def cuda_kernels_forward(
+        self,
+        hidden_states: torch.Tensor,
+        cache_params: Optional[FalconMambaCache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.LongTensor] = None,
+    ):
+        """
+        Forward pass built on the fused kernels (`mamba_inner_fn`, `causal_conv1d_fn`/`causal_conv1d_update`,
+        `selective_scan_fn`/`selective_state_update`).
+
+        Args:
+            hidden_states: input to `in_proj`; assumed to be `[batch, seq_len, hidden_size]` as in `slow_forward`
+                — TODO confirm.
+            cache_params: optional cache whose conv/SSM states of `self.layer_idx` are read and updated.
+            cache_position: `cache_position[0] > 0` selects the single-step decoding kernels; it is indexed
+                whenever `cache_params` is given, so it must not be `None` in that case.
+            attention_mask: optional mask multiplied into the states before and after the convolution.
+
+        Returns:
+            The output of `out_proj` (or of `mamba_inner_fn` on the training path).
+        """
+        # 1. Gated MLP's linear projection
+        projected_states = self.in_proj(hidden_states).transpose(1, 2)
+
+        if self.training and cache_params is None:  # Doesn't support outputting the states -> used for training
+            contextualized_states = mamba_inner_fn(
+                projected_states,
+                self.conv1d.weight,
+                self.conv1d.bias if self.use_conv_bias else None,
+                self.x_proj.weight,
+                self.dt_proj.weight,
+                self.out_proj.weight,
+                self.out_proj.bias.float() if self.use_bias else None,
+                -torch.exp(self.A_log.float()),
+                None,  # input-dependent B
+                None,  # input-dependent C
+                self.D.float(),
+                delta_bias=self.dt_proj.bias.float(),
+                delta_softplus=True,
+                b_rms_weight=self.b_c_rms,
+                c_rms_weight=self.b_c_rms,
+                dt_rms_weight=self.dt_rms,
+                b_c_dt_rms_eps=self.rms_eps,
+            )
+
+        else:
+            # Step-by-step path: taken in eval mode and whenever a cache is supplied
+            hidden_states, gate = projected_states.chunk(2, dim=1)
+
+            if attention_mask is not None:
+                hidden_states = hidden_states * attention_mask.unsqueeze(1)
+
+            # 2. Convolution sequence transformation
+            conv_weights = self.conv1d.weight.view(self.conv1d.weight.size(0), self.conv1d.weight.size(2))
+            if cache_params is not None and cache_position[0] > 0:
+                # Decoding: the trailing (sequence) dim is squeezed away and the cached conv state is passed
+                # to the update kernel
+                hidden_states = causal_conv1d_update(
+                    hidden_states.squeeze(-1),
+                    cache_params.conv_states[self.layer_idx],
+                    conv_weights,
+                    self.conv1d.bias,
+                    self.activation,
+                )
+                hidden_states = hidden_states.unsqueeze(-1)
+            else:
+                if cache_params is not None:
+                    # Prefill with a cache: left-pad the input up to `conv_kernel_size` positions (a negative
+                    # pad width trims longer inputs) and store the result as the conv state
+                    conv_states = nn.functional.pad(
+                        hidden_states, (self.conv_kernel_size - hidden_states.shape[-1], 0)
+                    )
+                    cache_params.update_conv_state(self.layer_idx, conv_states, cache_position)
+                hidden_states = causal_conv1d_fn(
+                    hidden_states, conv_weights, self.conv1d.bias, activation=self.activation
+                )
+
+            if attention_mask is not None:
+                hidden_states = hidden_states * attention_mask.unsqueeze(1)
+
+            # 3. State Space Model sequence transformation
+            # 3.a. input varying initialization of time_step, B and C
+            ssm_parameters = self.x_proj(hidden_states.transpose(1, 2))
+            time_step, B, C = torch.split(
+                ssm_parameters, [self.time_step_rank, self.ssm_state_size, self.ssm_state_size], dim=-1
+            )
+
+            # weight-free RMS normalization of B, C and the time step
+            B = rms_forward(B, variance_epsilon=self.rms_eps)
+            C = rms_forward(C, variance_epsilon=self.rms_eps)
+            time_step = rms_forward(time_step, variance_epsilon=self.rms_eps)
+
+            # In case the model has been quantized, we need a hack to properly call the `nn.Linear` module
+            # at the price of a small overhead.
+            # Both branches produce the bias-free projection; the bias is handed to the scan kernels below
+            # as `time_proj_bias`.
+            if hasattr(self.config, "_pre_quantization_dtype"):
+                discrete_time_step = (self.dt_proj(time_step) - self.dt_proj.bias).transpose(1, 2)
+            else:
+                discrete_time_step = self.dt_proj.weight @ time_step.transpose(1, 2)
+
+            A = -torch.exp(self.A_log.float())
+            # 3.c perform the recurrence y ← SSM(A, B, C)(x)
+            # NOTE(review): `hasattr` cannot tell a `None` bias apart; `dt_proj` is built with `bias=True` in
+            # `__init__`, so the attribute is populated here.
+            time_proj_bias = self.dt_proj.bias.float() if hasattr(self.dt_proj, "bias") else None
+            if cache_params is not None and cache_position[0] > 0:
+                # Decoding: single-step update against the cached SSM state of this layer
+                scan_outputs = selective_state_update(
+                    cache_params.ssm_states[self.layer_idx],
+                    hidden_states[..., 0],
+                    discrete_time_step[..., 0],
+                    A,
+                    B[:, 0],
+                    C[:, 0],
+                    self.D,
+                    gate[..., 0],
+                    time_proj_bias,
+                    dt_softplus=True,
+                ).unsqueeze(-1)
+            else:
+                scan_outputs, ssm_state = selective_scan_fn(
+                    hidden_states,
+                    discrete_time_step,
+                    A,
+                    B.transpose(1, 2),
+                    C.transpose(1, 2),
+                    self.D.float(),
+                    gate,
+                    time_proj_bias,
+                    delta_softplus=True,
+                    return_last_state=True,
+                )
+                # persist the last scan state so later decoding steps can continue from it
+                if ssm_state is not None and cache_params is not None:
+                    cache_params.update_ssm_state(self.layer_idx, ssm_state)
+
+            # 4. Final linear projection
+            contextualized_states = self.out_proj(scan_outputs.transpose(1, 2))
+        return contextualized_states
+
+    # fmt: off
+    def slow_forward(self,
+        input_states,
+        cache_params: Optional[FalconMambaCache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.LongTensor] = None,
+    ):
+        """
+        Pure PyTorch forward pass; `forward` falls back to it when the fused-kernel path is not taken.
+
+        Args:
+            input_states: tensor unpacked as `(batch_size, seq_len, _)`.
+            cache_params: optional cache; when given, the conv and SSM states of `self.layer_idx` are read
+                and written back.
+            cache_position: prefill is detected by `cache_position.shape[0] == self.conv_kernel_size`.
+            attention_mask: optional mask multiplied into the states before and after the convolution.
+
+        Returns:
+            The output of `out_proj`, `[batch, seq_len, hidden_size]`.
+        """
+        batch_size, seq_len, _ = input_states.shape
+        dtype = input_states.dtype
+        # 1. Gated MLP's linear projection
+        projected_states = self.in_proj(input_states).transpose(1, 2)  # [batch, 2 * intermediate_size, seq_len]
+        hidden_states, gate = projected_states.chunk(2, dim=1)
+
+        if attention_mask is not None:
+            hidden_states = hidden_states * attention_mask.unsqueeze(1)
+
+        # 2. Convolution sequence transformation
+        if cache_params is not None:
+            # work on a copy; the cache itself is only written through `update_ssm_state` at the end
+            ssm_state = cache_params.ssm_states[self.layer_idx].clone()
+            ssm_state = ssm_state.to(hidden_states.device)
+            # use `cache_position.shape[0]` to check whether we are in prefill
+            # stage, it's equivalent to check `cache_position[0] == 0`, which
+            # breaks dynamo fullgraph constraints
+            if cache_position is not None and cache_position.shape[0] == self.conv_kernel_size:
+                conv_state = nn.functional.pad(hidden_states, (self.conv_kernel_size - hidden_states.shape[-1], 0))
+
+                cache_params.update_conv_state(self.layer_idx, conv_state, cache_position)
+                hidden_states = self.act(
+                    self.conv1d(hidden_states)[..., :seq_len]
+                )  # [batch, intermediate_size, seq_len]
+            else:
+                # Decoding: the convolution at the new position is computed as a per-channel dot product
+                # between the cached window and the kernel
+                conv_state = cache_params.update_conv_state(self.layer_idx, hidden_states, cache_position)
+                conv_state = conv_state.to(self.conv1d.weight.device)
+                hidden_states = torch.sum(conv_state * self.conv1d.weight[:, 0, :], dim=-1)
+                if self.use_conv_bias:
+                    hidden_states += self.conv1d.bias
+                hidden_states = (
+                    self.act(hidden_states).to(dtype).unsqueeze(-1)
+                )  # [batch, intermediate_size, 1] : decoding
+        else:
+            # no cache: start the recurrence from an all-zero state
+            ssm_state = torch.zeros(
+                (batch_size, self.intermediate_size, self.ssm_state_size), device=hidden_states.device, dtype=dtype
+            )
+            hidden_states = self.act(self.conv1d(hidden_states)[..., :seq_len])  # [batch, intermediate_size, seq_len]
+
+        if attention_mask is not None:
+            hidden_states = hidden_states * attention_mask.unsqueeze(1)
+
+        # 3. State Space Model sequence transformation
+        # 3.a. Selection:  [batch, seq_len, self.time_step_rank + self.ssm_state_size * 2]
+        ssm_parameters = self.x_proj(hidden_states.transpose(1, 2))
+        time_step, B, C = torch.split(
+            ssm_parameters, [self.time_step_rank, self.ssm_state_size, self.ssm_state_size], dim=-1
+        )
+
+        # weight-free RMS normalization of B, C and the time step
+        B = rms_forward(B, variance_epsilon=self.rms_eps)
+        C = rms_forward(C, variance_epsilon=self.rms_eps)
+        time_step = rms_forward(time_step, variance_epsilon=self.rms_eps)
+
+        discrete_time_step = self.dt_proj(time_step)  # [batch, seq_len, intermediate_size]
+        discrete_time_step = nn.functional.softplus(discrete_time_step).transpose(
+            1, 2
+        )  # [batch, intermediate_size, seq_len]
+
+        # 3.b. Discretization: B and C to [batch, seq_len, intermediate_size, ssm_state_size] (SRAM)
+        A = -torch.exp(self.A_log.float())  # [intermediate_size, ssm_state_size]
+        discrete_A = torch.exp(
+            A[None, :, None, :] * discrete_time_step[:, :, :, None]
+        )  # [batch, intermediate_size, seq_len, ssm_state_size]
+        discrete_B = (
+            discrete_time_step[:, :, :, None] * B[:, None, :, :].float()
+        )  # [batch, intermediate_size, seq_len, ssm_state_size]
+        deltaB_u = discrete_B * hidden_states[:, :, :, None].float()
+
+        # 3.c perform the recurrence y ← SSM(A, B, C)(x)
+        if self.use_falcon_mambapy and self.training and cache_params is None:
+            # mamba.py scan; this branch only runs without a cache, so no state is written back
+            hs = pscan(
+                discrete_A.transpose(1, 2), deltaB_u.transpose(1, 2)
+            )  # [batch, seq_len, intermediate_size, ssm_state_size]
+            scan_output = (hs @ C.unsqueeze(-1)).squeeze(3).transpose(1, 2)  # [batch, intermediate_size, seq_len]
+            scan_output = scan_output + hidden_states * self.D[None, :, None]
+            scan_output = scan_output * self.act(gate)
+        else:
+            # sequential recurrence, one position at a time
+            scan_outputs = []
+            for i in range(seq_len):
+                ssm_state = (
+                    discrete_A[:, :, i, :] * ssm_state + deltaB_u[:, :, i, :]
+                )  # [batch, intermediate_size, ssm_state]
+                scan_output = torch.matmul(
+                    ssm_state.to(dtype), C[:, i, :].unsqueeze(-1)
+                )  # [batch, intermediate_size, 1]
+                scan_outputs.append(scan_output[:, :, 0])
+            scan_output = torch.stack(scan_outputs, dim=-1)  # [batch, intermediate_size, seq_len]
+            scan_output = scan_output + (hidden_states * self.D[None, :, None])
+            scan_output = scan_output * self.act(gate)
+
+            if cache_params is not None:
+                cache_params.update_ssm_state(self.layer_idx, ssm_state)
+
+        # 4. Final linear projection
+        contextualized_states = self.out_proj(scan_output.transpose(1, 2))  # [batch, seq_len, hidden_size]
+        return contextualized_states
+    # fmt: on
+
+    def forward(
+        self,
+        hidden_states,
+        cache_params: Optional[FalconMambaCache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.LongTensor] = None,
+    ):
+        """
+        Dispatch to `cuda_kernels_forward` when every fused kernel is importable, the `x_proj` weight lives on
+        a CUDA device and dynamo is not compiling; otherwise use `slow_forward`.
+        """
+        is_fast_path_available = all(
+            (selective_state_update, selective_scan_fn, causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn)
+        )
+        if is_fast_path_available and "cuda" in self.x_proj.weight.device.type and not torch._dynamo.is_compiling():
+            return self.cuda_kernels_forward(hidden_states, cache_params, cache_position, attention_mask)
+        return self.slow_forward(hidden_states, cache_params, cache_position, attention_mask)
+
+
+class FalconMambaRMSNorm(nn.Module):
+    def __init__(self, hidden_size, eps=1e-6):
+        """
+        RMS normalization with a learnable per-feature weight; equivalent to T5LayerNorm and LlamaRMSNorm
+        """
+        super().__init__()
+        self.variance_epsilon = eps
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+
+    def forward(self, hidden_states):
+        # normalize without weights, then scale by the weight moved to the input's device
+        normalized = rms_forward(hidden_states, variance_epsilon=self.variance_epsilon)
+        scale = self.weight.to(hidden_states.device)
+        return scale * normalized
+
+    def extra_repr(self):
+        num_features = self.weight.shape[0]
+        return f"{num_features}, eps={self.variance_epsilon}"
+
+
+class FalconMambaBlock(GradientCheckpointingLayer):
+    """Pre-norm residual block: RMSNorm, then the FalconMamba mixer, added back onto the block input."""
+
+    def __init__(self, config, layer_idx):
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+        self.residual_in_fp32 = config.residual_in_fp32
+        self.norm = FalconMambaRMSNorm(config.hidden_size, eps=config.layer_norm_epsilon)
+        self.mixer = FalconMambaMixer(config, layer_idx=layer_idx)
+
+    def forward(
+        self,
+        hidden_states,
+        cache_params: Optional[FalconMambaCache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.LongTensor] = None,
+    ):
+        # the norm runs in the dtype of its own weight
+        norm_dtype = self.norm.weight.dtype
+        normed = self.norm(hidden_states.to(dtype=norm_dtype))
+
+        # the skip connection carries the un-normalized input, optionally upcast to float32
+        residual = hidden_states.to(torch.float32) if self.residual_in_fp32 else hidden_states
+
+        mixed = self.mixer(
+            normed, cache_params=cache_params, cache_position=cache_position, attention_mask=attention_mask
+        )
+        return residual + mixed
+
+
+@auto_docstring
+class FalconMambaPreTrainedModel(PreTrainedModel):
+    config: FalconMambaConfig
+    base_model_prefix = "backbone"
+    _no_split_modules = ["FalconMambaBlock", "FalconMambaMixer"]
+    supports_gradient_checkpointing = True
+    _is_stateful = True
+
+    def _init_weights(self, module):
+        """
+        Initialize the weights of `module`.
+
+        `FalconMambaMixer` modules get the SSM-specific initialization (A_log, D, dt_proj, conv1d, out_proj);
+        `nn.Linear`, `FalconMambaRMSNorm` and `nn.Embedding` modules get the generic initialization below.
+        """
+        # NOTE(review): several parameters are modified in place (`copy_`, `/=`); this assumes the caller
+        # runs `_init_weights` with gradient tracking disabled — TODO confirm.
+        std = self.config.initializer_range
+        if isinstance(module, FalconMambaMixer):
+            # S4D real initialization. These are not discretized!
+            # The core is to load them, compute the discrete states, then write the updated state. Keeps the memory bounded
+            A = torch.arange(1, module.ssm_state_size + 1, dtype=torch.float32)[None, :]
+            A = A.expand(module.intermediate_size, -1).contiguous()
+            module.A_log.copy_(torch.log(A))
+            # marker attributes set on the parameters; presumably read by optimizer setup code elsewhere — TODO confirm
+            module.A_log._no_weight_decay = True
+            module.D._no_weight_decay = True
+            module.D.data.fill_(1.0)
+
+            # Any `time_step_init_scheme` other than "constant"/"random" leaves `dt_proj.weight` untouched here
+            dt_init_std = self.config.time_step_rank**-0.5 * self.config.time_step_scale
+            if self.config.time_step_init_scheme == "constant":
+                nn.init.constant_(module.dt_proj.weight, dt_init_std)
+            elif self.config.time_step_init_scheme == "random":
+                nn.init.uniform_(module.dt_proj.weight, -dt_init_std, dt_init_std)
+
+            # Sample dt log-uniformly between `time_step_min` and `time_step_max`, floored at `time_step_floor`
+            dt = torch.exp(
+                torch.rand(self.config.intermediate_size)
+                * (math.log(self.config.time_step_max) - math.log(self.config.time_step_min))
+                + math.log(self.config.time_step_min)
+            ).clamp(min=self.config.time_step_floor)
+            # Inverse of softplus: https://github.com/pytorch/pytorch/issues/72759
+            inv_dt = dt + torch.log(-torch.expm1(-dt))
+            module.dt_proj.bias.copy_(inv_dt)
+            # `_no_reinit` is checked in the generic `nn.Linear` branch below so this bias is not zeroed there
+            module.dt_proj.bias._no_reinit = True
+
+            nn.init.kaiming_uniform_(module.conv1d.weight, a=math.sqrt(5))
+            if module.conv1d.bias is not None:
+                if not getattr(module.conv1d.bias, "_no_reinit", False):
+                    nn.init.zeros_(module.conv1d.bias)
+            nn.init.kaiming_uniform_(module.out_proj.weight, a=math.sqrt(5))
+
+            if self.config.rescale_prenorm_residual:
+                # Reinitialize selected weights subject to the OpenAI GPT-2 Paper Scheme:
+                #   > A modified initialization which accounts for the accumulation on the residual path with model depth. Scale
+                #   > the weights of residual layers at initialization by a factor of 1/√N where N is the # of residual layers.
+                #   >   -- GPT-2 :: https://openai.com/blog/better-language-models/
+                #
+                # Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py
+                # Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block
+                # Following Pytorch init, except scale by 1/sqrt(2 * n_layer)
+                # We need to reinit p since this code could be called multiple times
+                # Having just p *= scale would repeatedly scale it down
+                # NOTE(review): the code below divides by sqrt(num_hidden_layers), without the factor 2 mentioned
+                # above. The "reinit" is the `kaiming_uniform_` call on `out_proj.weight` just before this block.
+                p = module.out_proj.weight
+                p /= math.sqrt(self.config.num_hidden_layers)
+
+        # Generic initialization; parameters flagged with `_no_reinit` are skipped
+        if isinstance(module, nn.Linear):
+            if not getattr(module.weight, "_no_reinit", False):
+                nn.init.normal_(module.weight, std=std)
+            if module.bias is not None:
+                if not getattr(module.bias, "_no_reinit", False):
+                    nn.init.zeros_(module.bias)
+        elif isinstance(module, FalconMambaRMSNorm):
+            module.weight.data.fill_(1.0)
+        elif isinstance(module, nn.Embedding):
+            nn.init.normal_(module.weight, std=std)
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Class for the FALCON_MAMBA model outputs.
+    """
+)
+class FalconMambaOutput(ModelOutput):
+    r"""
+    cache_params (`FalconMambaCache`):
+        The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
+        avoid providing the old `input_ids`.
+
+        Includes both the State space model state matrices after the selective scan, and the Convolutional states
+    """
+
+    # Filled by `FalconMambaModel.forward` with the hidden states after its final norm (`norm_f`)
+    last_hidden_state: Optional[torch.FloatTensor] = None
+    # Only set by `FalconMambaModel.forward` when caching is enabled; `None` otherwise
+    cache_params: Optional[FalconMambaCache] = None
+    # Per-layer outputs plus the final normed output, collected when `output_hidden_states` is truthy
+    hidden_states: Optional[tuple[torch.FloatTensor]] = None
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Base class for causal language model (or autoregressive) outputs.
+    """
+)
+class FalconMambaCausalLMOutput(ModelOutput):
+    r"""
+    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+        Language modeling loss (for next-token prediction).
+    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
+        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+    cache_params (`FalconMambaCache`):
+        The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
+        avoid providing the old `input_ids`.
+
+        Includes both the State space model state matrices after the selective scan, and the Convolutional states
+    """
+
+    # Computed by `FalconMambaForCausalLM.forward` only when `labels` is passed
+    loss: Optional[torch.FloatTensor] = None
+    # `lm_head` output, cast to float by `FalconMambaForCausalLM.forward`
+    logits: Optional[torch.FloatTensor] = None
+    # Forwarded unchanged from the backbone output
+    cache_params: Optional[FalconMambaCache] = None
+    # Forwarded unchanged from the backbone output
+    hidden_states: Optional[tuple[torch.FloatTensor]] = None
+
+
+@auto_docstring
+class FalconMambaModel(FalconMambaPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+
+        self.embeddings = nn.Embedding(config.vocab_size, config.hidden_size)
+        # one mixer block per hidden layer, each told its own index
+        blocks = [FalconMambaBlock(config, layer_idx=idx) for idx in range(config.num_hidden_layers)]
+        self.layers = nn.ModuleList(blocks)
+
+        self.gradient_checkpointing = False
+        self.norm_f = FalconMambaRMSNorm(config.hidden_size, eps=config.layer_norm_epsilon)
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.embeddings
+
+    def set_input_embeddings(self, new_embeddings):
+        self.embeddings = new_embeddings
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.LongTensor] = None,
+        cache_params: Optional[FalconMambaCache] = None,
+        use_cache: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.LongTensor] = None,
+    ) -> Union[tuple, FalconMambaOutput]:
+        r"""
+        cache_params (`FalconMambaCache`, *optional*):
+            If passed along, the model uses the previous state in all the blocks (which will give the output for the
+            `input_ids` provided as if the model add `state_input_ids + input_ids` as context).
+        use_cache (`bool`, *optional*):
+            If set to `True`, the `cache_params` is returned and can be used to quickly generate the next logits.
+        """
+        # Resolve unset flags from the config
+        if output_hidden_states is None:
+            output_hidden_states = self.config.output_hidden_states
+        if use_cache is None:
+            # caching defaults to off while training, otherwise to the config value
+            use_cache = False if self.training else self.config.use_cache
+        if return_dict is None:
+            return_dict = self.config.use_return_dict
+
+        # Exactly one of the two input forms has to be provided
+        if (input_ids is None) == (inputs_embeds is None):
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+        if inputs_embeds is None:
+            inputs_embeds = self.embeddings(input_ids)
+
+        if self.gradient_checkpointing and self.training and use_cache:
+            use_cache = False
+
+        if not use_cache:
+            cache_params = None
+        elif cache_params is None:
+            # prefill: create a fresh cache and a cache position spanning the whole conv window
+            cache_params = FalconMambaCache(
+                self.config, inputs_embeds.size(0), device=inputs_embeds.device, dtype=inputs_embeds.dtype
+            )
+            cache_position = torch.arange(0, self.config.conv_kernel, device=inputs_embeds.device)
+        elif cache_position is None:
+            # cases when we do manual forward instead of using `model.generate` which will initiate
+            # `cache_position` and makes sure it is not None, throw error here instead of doing some
+            # hack to conjecture the current cache position
+            raise ValueError(
+                "You have to specify the `cache_position` manually when `use_cache=True` and `cache_params` is passed, "
+                "you don't have to pass a `cache_params` if you are in prefilling stage because in that case it will "
+                "be initialized for you automatically"
+            )
+
+        hidden_states = inputs_embeds
+        all_hidden_states = () if output_hidden_states else None
+        for layer in self.layers:
+            hidden_states = layer(
+                hidden_states,
+                cache_params=cache_params,
+                cache_position=cache_position,
+                attention_mask=attention_mask,
+            )
+
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+
+        hidden_states = self.norm_f(hidden_states)
+
+        # the final normed output is recorded as the last entry
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+
+        if return_dict:
+            return FalconMambaOutput(
+                last_hidden_state=hidden_states,
+                cache_params=cache_params if use_cache else None,
+                hidden_states=all_hidden_states,
+            )
+
+        # tuple form: entries that are None are dropped
+        return tuple(v for v in (hidden_states, cache_params, all_hidden_states) if v is not None)
+
+
+@auto_docstring(
+    custom_intro="""
+    The FALCON_MAMBA Model transformer with a language modeling head on top (linear layer with weights tied to the input
+    embeddings).
+    """
+)
+class FalconMambaForCausalLM(FalconMambaPreTrainedModel, GenerationMixin):
+    _tied_weights_keys = ["lm_head.weight"]
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.backbone = FalconMambaModel(config)
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.backbone.get_input_embeddings()
+
+    def set_input_embeddings(self, new_embeddings):
+        return self.backbone.set_input_embeddings(new_embeddings)
+
+    def _update_model_kwargs_for_generation(
+        self, outputs: ModelOutput, model_kwargs: dict[str, Any], num_new_tokens: int = 1, **kwargs
+    ) -> dict[str, Any]:
+        """
+        Carry the cache from `outputs` into `model_kwargs`, advance `cache_position` by `num_new_tokens`
+        and extend the attention mask by one column of ones. `model_kwargs` is modified and returned.
+        """
+        model_kwargs["cache_params"] = outputs.get("cache_params", None)
+        if (
+            model_kwargs.get("use_cache", True)
+            and "cache_position" in model_kwargs
+            and model_kwargs["cache_position"] is not None
+        ):
+            # keep only the last position (as a length-1 tensor) and move it forward
+            model_kwargs["cache_position"] = model_kwargs["cache_position"][-1:] + num_new_tokens
+
+        if "attention_mask" in model_kwargs:
+            attention_mask = model_kwargs["attention_mask"]
+            model_kwargs["attention_mask"] = torch.cat(
+                [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1
+            )
+
+        return model_kwargs
+
+    def prepare_inputs_for_generation(
+        self,
+        input_ids,
+        inputs_embeds=None,
+        use_cache=None,
+        cache_params: Optional[FalconMambaCache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.LongTensor] = None,
+        **kwargs,
+    ):
+        """
+        Build the keyword arguments for one generation step.
+
+        With caching enabled and no cache yet, a cache and a full-window `cache_position` are created. On
+        later steps (`cache_position[0] > 0`) only the last token of `input_ids` is fed and the attention
+        mask is dropped.
+        """
+        # Overwritten -- uses `cache_params` as opposed to `past_key_values`
+        model_inputs = {"input_ids": input_ids.contiguous()}
+        if use_cache and cache_params is None:
+            # we initialize the `cache_position` to full size of `conv_states` at prefill stage
+            # considering padding will be applied when input length is shorter, and truncation
+            # will be applied when it is longer, so it will be equivalent to always have it match
+            # the length of `cache_params.conv_states`, which is `config.conv_kernel`
+            cache_position = torch.arange(0, self.backbone.config.conv_kernel, device=input_ids.device)
+            if inputs_embeds is not None:
+                model_inputs = {"inputs_embeds": inputs_embeds}
+                max_batch_size = inputs_embeds.size(0)
+            else:
+                max_batch_size = input_ids.size(0)
+            cache_params = FalconMambaCache(self.backbone.config, max_batch_size, device=self.device, dtype=self.dtype)
+
+        # NOTE(review): `cache_position` is indexed without a None check; this relies on the caller supplying
+        # it whenever `use_cache` is set and a cache is already present.
+        if use_cache and cache_position[0] > 0:
+            model_inputs["input_ids"] = input_ids[:, -1].unsqueeze(-1).contiguous()
+            attention_mask = None
+
+        if not use_cache and inputs_embeds is not None:
+            model_inputs = {"inputs_embeds": inputs_embeds}
+
+        model_inputs.update(
+            {
+                "cache_params": cache_params,
+                "use_cache": use_cache,
+                "cache_position": cache_position,
+                "attention_mask": attention_mask,
+            }
+        )
+        return model_inputs
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        cache_params: Optional[FalconMambaCache] = None,
+        labels: Optional[torch.LongTensor] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        use_cache: Optional[bool] = None,
+        cache_position: Optional[torch.Tensor] = None,
+        **kwargs,  # for now we need this for generation
+    ) -> Union[tuple, FalconMambaCausalLMOutput]:
+        r"""
+        cache_params (`FalconMambaCache`, *optional*):
+            If passed along, the model uses the previous state in all the blocks (which will give the output for the
+            `input_ids` provided as if the model add `state_input_ids + input_ids` as context).
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
+            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
+            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
+        use_cache (`bool`, *optional*):
+            If set to `True`, the `cache_params` is returned and can be used to quickly generate the next logits.
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        falcon_mamba_outputs = self.backbone(
+            input_ids,
+            cache_params=cache_params,
+            inputs_embeds=inputs_embeds,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            attention_mask=attention_mask,
+        )
+        # index 0 is the last hidden state in both the tuple and the dataclass form of the backbone output
+        hidden_states = falcon_mamba_outputs[0]
+
+        # run the head in the dtype of its weight, then return the logits as float
+        logits = self.lm_head(hidden_states.to(self.lm_head.weight.dtype)).float()
+
+        loss = None
+        if labels is not None:
+            # move labels to correct device to enable model parallelism
+            labels = labels.to(logits.device)
+            # Shift so that tokens < n predict n
+            shift_logits = logits[..., :-1, :].contiguous()
+            shift_labels = labels[..., 1:].contiguous()
+            # Flatten the tokens
+            loss_fct = CrossEntropyLoss()
+            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
+
+        if not return_dict:
+            # tuple form: (loss,)? + (logits,) + the remaining backbone outputs
+            output = (logits,) + falcon_mamba_outputs[1:]
+            return ((loss,) + output) if loss is not None else output
+
+        return FalconMambaCausalLMOutput(
+            loss=loss,
+            logits=logits,
+            cache_params=falcon_mamba_outputs.cache_params,
+            hidden_states=falcon_mamba_outputs.hidden_states,
+        )
+
+
+# Names exported by `from <module> import *`
+__all__ = ["FalconMambaForCausalLM", "FalconMambaModel", "FalconMambaPreTrainedModel", "FalconMambaCache"]
diff --git a/phivenv/Lib/site-packages/transformers/models/falcon_mamba/modular_falcon_mamba.py b/phivenv/Lib/site-packages/transformers/models/falcon_mamba/modular_falcon_mamba.py
new file mode 100644
index 0000000000000000000000000000000000000000..090a147d31e2260488c7b0d3f73c7526b2348494
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/falcon_mamba/modular_falcon_mamba.py
@@ -0,0 +1,546 @@
+# coding=utf-8
+# Copyright 2024 Tri Dao, Albert Gu, Technological Innovation Institute and HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch FALCONMAMBA model."""
+
+from typing import Optional
+
+import torch
+import torch.utils.checkpoint
+from torch import nn
+
+from ...utils import auto_docstring, logging
+from ...utils.import_utils import is_causal_conv1d_available, is_mamba_ssm_available, is_mambapy_available
+from ..mamba.configuration_mamba import MambaConfig
+from ..mamba.modeling_mamba import (
+    MambaBlock,
+    MambaCache,
+    MambaCausalLMOutput,
+    MambaForCausalLM,
+    MambaMixer,
+    MambaModel,
+    MambaOutput,
+    MambaPreTrainedModel,
+    MambaRMSNorm,
+)
+
+
+logger = logging.get_logger(__name__)
+
+if is_mambapy_available():
+    from mambapy.pscan import pscan
+else:
+    pscan = None
+
+if is_mamba_ssm_available():
+    from mamba_ssm.ops.selective_scan_interface import selective_scan_fn
+    from mamba_ssm.ops.triton.selective_state_update import selective_state_update
+
+    from ...kernels.falcon_mamba import mamba_inner_fn
+else:
+    selective_state_update, selective_scan_fn, mamba_inner_fn = None, None, None
+
+if is_causal_conv1d_available():
+    from causal_conv1d import causal_conv1d_fn, causal_conv1d_update
+else:
+    causal_conv1d_update, causal_conv1d_fn = None, None
+
+
+class FalconMambaConfig(MambaConfig):
+    """
+    This is the configuration class to store the configuration of a [`FalconMambaModel`]. It is used to instantiate a FALCON_MAMBA
+    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+    defaults will yield a similar configuration to that of the FALCON_MAMBA
+    [tiiuae/falcon-mamba-7b](https://huggingface.co/tiiuae/falcon-mamba-7b) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 50280):
+            Vocabulary size of the FALCON_MAMBA model. Defines the number of different tokens that can be represented by the
+            `input_ids` passed when calling [`FalconMambaModel`].
+        hidden_size (`int`, *optional*, defaults to 768):
+            Dimensionality of the embeddings and hidden states.
+        state_size (`int`, *optional*, defaults to 16): shape of the state space latents.
+        num_hidden_layers (`int`, *optional*, defaults to 32):
+            Number of hidden layers in the model.
+        layer_norm_epsilon (`float`, *optional*, defaults to 1e-05):
+            The epsilon to use in the layer normalization layers.
+        pad_token_id (`int`, *optional*, defaults to 0):
+            Padding token id.
+        bos_token_id (`int`, *optional*, defaults to 0):
+            The id of the beginning of sentence token in the vocabulary.
+        eos_token_id (`int`, *optional*, defaults to 0):
+            The id of the end of sentence token in the vocabulary.
+        expand (`int`, *optional*, defaults to 2): Expanding factor used to determine the intermediate size.
+        conv_kernel (`int`, *optional*, defaults to 4): Size of the convolution kernel.
+        use_bias (`bool`, *optional*, defaults to `False`):
+            Whether or not to use bias in ["in_proj", "out_proj"] of the mixer block.
+        use_conv_bias (`bool`, *optional*, defaults to `True`):
+            Whether or not to use bias in the convolution layer of the mixer block.
+        hidden_act (`str`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the decoder.
+        initializer_range (`float`, *optional*, defaults to 0.1):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        residual_in_fp32 (`bool`, *optional*, defaults to `True`):
+            Whether or not residuals should be in `float32`. If set to `False` residuals will keep the same `dtype` as the rest of the model.
+        time_step_rank (`Union[int,str]`, *optional*, defaults to `"auto"`):
+            Rank of the discretization projection matrix. `"auto"` means that it will default to `math.ceil(self.hidden_size / 16)`
+        time_step_scale (`float`, *optional*, defaults to 1.0):
+            Scale used to scale `dt_proj.bias`.
+        time_step_min (`float`, *optional*, defaults to 0.001):
+            Minimum `time_step` used to bound `dt_proj.bias`.
+        time_step_max (`float`, *optional*, defaults to 0.1):
+            Maximum `time_step` used to bound `dt_proj.bias`.
+        time_step_init_scheme (`str`, *optional*, defaults to `"random"`):
+            Init scheme used for `dt_proj.weight`. Should be one of `["random","uniform"]`
+        time_step_floor (`float`, *optional*, defaults to 0.0001):
+            Minimum clamping value of the `dt_proj.bias` layer initialization.
+        rescale_prenorm_residual (`bool`, *optional*, defaults to `False`):
+            Whether or not to rescale `out_proj` weights when initializing.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the cache should be used.
+        use_falcon_mambapy (`bool`, *optional*, defaults to `False`):
+            This argument corresponds to `use_mambapy` in MambaConfig.
+            Determines the fallback strategy during training if the CUDA-based official implementation of Mamba is not available. If `True`, the mamba.py implementation is used. If `False`, the naive and slower implementation is used. Consider switching to the naive version if memory is limited.
+        mixer_rms_eps (`float`, *optional*, defaults to 1e-06):
+            The RMS norm epsilon value that is used in the Mixer RMS norm for B, C and dt states.
+
+
+    Example:
+
+    ```python
+    >>> from transformers import FalconMambaConfig, FalconMambaModel
+
+    >>> # Initializing a FalconMamba configuration
+    >>> configuration = FalconMambaConfig()
+
+    >>> # Initializing a model (with random weights) from the configuration
+    >>> model = FalconMambaModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    def __init__(
+        self,
+        vocab_size=50280,
+        hidden_size=768,
+        state_size=16,
+        num_hidden_layers=32,
+        layer_norm_epsilon=1e-5,
+        pad_token_id=0,
+        bos_token_id=0,
+        eos_token_id=0,
+        expand=2,
+        conv_kernel=4,
+        use_bias=False,
+        use_conv_bias=True,
+        hidden_act="silu",
+        initializer_range=0.1,
+        residual_in_fp32=True,
+        time_step_rank="auto",
+        time_step_scale=1.0,
+        time_step_min=0.001,
+        time_step_max=0.1,
+        time_step_init_scheme="random",
+        time_step_floor=1e-4,
+        rescale_prenorm_residual=False,
+        use_cache=True,
+        use_falcon_mambapy=False,
+        mixer_rms_eps=1e-6,
+        **kwargs,
+    ):
+        super().__init__(
+            vocab_size=vocab_size,
+            hidden_size=hidden_size,
+            state_size=state_size,
+            num_hidden_layers=num_hidden_layers,
+            layer_norm_epsilon=layer_norm_epsilon,
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            expand=expand,
+            conv_kernel=conv_kernel,
+            use_bias=use_bias,
+            use_conv_bias=use_conv_bias,
+            hidden_act=hidden_act,
+            initializer_range=initializer_range,
+            residual_in_fp32=residual_in_fp32,
+            time_step_rank=time_step_rank,
+            time_step_scale=time_step_scale,
+            time_step_min=time_step_min,
+            time_step_max=time_step_max,
+            time_step_init_scheme=time_step_init_scheme,
+            time_step_floor=time_step_floor,
+            rescale_prenorm_residual=rescale_prenorm_residual,
+            use_cache=use_cache,
+            use_falcon_mambapy=use_falcon_mambapy,  # NOTE(review): the docstring says this maps to MambaConfig's `use_mambapy`; confirm the parent accepts/forwards this keyword
+            **kwargs,
+        )
+        self.mixer_rms_eps = mixer_rms_eps
+        # Set again after the parent init, since mamba overrides the intermediate_size attribute; an explicit `intermediate_size` kwarg wins over `expand * hidden_size`
+        self.intermediate_size = (
+            int(expand * self.hidden_size)
+            if kwargs.get("intermediate_size") is None
+            else kwargs.get("intermediate_size")
+        )
+
+
+class FalconMambaCache(MambaCache):
+    pass  # no overrides: the inherited MambaCache behaviour under a FalconMamba-specific name
+
+
+def rms_forward(hidden_states, variance_epsilon=1e-6):
+    """
+    Calculates simple RMSNorm over the last dimension with no learnable weights; the math runs in float32 and the
+    result is cast back to the input dtype. `FalconMambaRMSNorm` multiplies this result by its RMSNorm weight.
+
+    Args:
+        hidden_states (`torch.Tensor`):
+            Hidden states to normalize
+        variance_epsilon (`float`, *optional*, defaults to 1e-6):
+            The eps value added to the mean of squares before the reciprocal square root
+    """
+    input_dtype = hidden_states.dtype
+    hidden_states = hidden_states.to(torch.float32)
+
+    variance = hidden_states.pow(2).mean(-1, keepdim=True)  # mean of squares; no mean-centering is applied
+    hidden_states = hidden_states * torch.rsqrt(variance + variance_epsilon)
+    return hidden_states.to(input_dtype)
+
+
+class FalconMambaMixer(MambaMixer):  # MambaMixer variant that applies weight-free RMS norm (`rms_forward`) to B, C and the time step
+    def warn_slow_implementation(self):  # warn once about the fallback backend; raise if mamba.py was requested but is not installed
+        is_fast_path_available = all(
+            (selective_state_update, selective_scan_fn, causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn)
+        )
+        if not is_fast_path_available:
+            if self.use_falcon_mambapy:  # NOTE(review): not assigned in this class; presumably set by the parent __init__ - confirm
+                if is_mambapy_available():
+                    logger.warning_once(
+                        "The fast path is not available because one of `(selective_state_update, selective_scan_fn, causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn)`"
+                        " is None. Falling back to the mamba.py backend. To install follow https://github.com/state-spaces/mamba/#installation and"
+                        " https://github.com/Dao-AILab/causal-conv1d"
+                    )
+                else:
+                    raise ImportError(
+                        "use_falcon_mambapy is set to True but the mambapy package is not installed. To install it follow https://github.com/alxndrTL/mamba.py."
+                    )
+            else:
+                logger.warning_once(
+                    "The fast path is not available because one of `(selective_state_update, selective_scan_fn, causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn)`"
+                    " is None. Falling back to the sequential implementation of Mamba, as use_falcon_mambapy is set to False. To install follow https://github.com/state-spaces/mamba/#installation and"
+                    " https://github.com/Dao-AILab/causal-conv1d. For the mamba.py backend, follow https://github.com/alxndrTL/mamba.py."
+                )
+
+    def __init__(self, config: FalconMambaConfig, layer_idx: int):
+        super().__init__(config, layer_idx)
+        # Triton expects to pass RMS weights even if they are non learnable, thus we need to create these weights here
+        self.register_buffer(
+            "b_c_rms", torch.nn.Parameter(torch.ones(self.ssm_state_size), requires_grad=False), persistent=False
+        )
+        self.register_buffer(
+            "dt_rms", torch.nn.Parameter(torch.ones(self.intermediate_size), requires_grad=False), persistent=False
+        )
+        self.rms_eps = config.mixer_rms_eps  # epsilon shared by the B, C and time-step RMS norms
+
+    def cuda_kernels_forward(
+        self,
+        hidden_states: torch.Tensor,
+        cache_params: Optional[FalconMambaCache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.LongTensor] = None,
+    ):
+        # 1. Gated MLP's linear projection
+        projected_states = self.in_proj(hidden_states).transpose(1, 2)
+
+        if self.training and cache_params is None:  # Doesn't support outputting the states -> used for training
+            contextualized_states = mamba_inner_fn(
+                projected_states,
+                self.conv1d.weight,
+                self.conv1d.bias if self.use_conv_bias else None,
+                self.x_proj.weight,
+                self.dt_proj.weight,
+                self.out_proj.weight,
+                self.out_proj.bias.float() if self.use_bias else None,
+                -torch.exp(self.A_log.float()),
+                None,  # input-dependent B
+                None,  # input-dependent C
+                self.D.float(),
+                delta_bias=self.dt_proj.bias.float(),
+                delta_softplus=True,
+                b_rms_weight=self.b_c_rms,
+                c_rms_weight=self.b_c_rms,
+                dt_rms_weight=self.dt_rms,
+                b_c_dt_rms_eps=self.rms_eps,
+            )
+
+        else:
+            hidden_states, gate = projected_states.chunk(2, dim=1)
+
+            if attention_mask is not None:
+                hidden_states = hidden_states * attention_mask.unsqueeze(1)  # presumably a 0/1 padding mask - confirm
+
+            # 2. Convolution sequence transformation
+            conv_weights = self.conv1d.weight.view(self.conv1d.weight.size(0), self.conv1d.weight.size(2))
+            if cache_params is not None and cache_position[0] > 0:  # cached call that does not start at position 0
+                hidden_states = causal_conv1d_update(
+                    hidden_states.squeeze(-1),
+                    cache_params.conv_states[self.layer_idx],
+                    conv_weights,
+                    self.conv1d.bias,
+                    self.activation,
+                )
+                hidden_states = hidden_states.unsqueeze(-1)
+            else:
+                if cache_params is not None:
+                    conv_states = nn.functional.pad(
+                        hidden_states, (self.conv_kernel_size - hidden_states.shape[-1], 0)
+                    )
+                    cache_params.update_conv_state(self.layer_idx, conv_states, cache_position)
+                hidden_states = causal_conv1d_fn(
+                    hidden_states, conv_weights, self.conv1d.bias, activation=self.activation
+                )
+
+            if attention_mask is not None:
+                hidden_states = hidden_states * attention_mask.unsqueeze(1)
+
+            # 3. State Space Model sequence transformation
+            # 3.a. input varying initialization of time_step, B and C
+            ssm_parameters = self.x_proj(hidden_states.transpose(1, 2))
+            time_step, B, C = torch.split(
+                ssm_parameters, [self.time_step_rank, self.ssm_state_size, self.ssm_state_size], dim=-1
+            )
+
+            B = rms_forward(B, variance_epsilon=self.rms_eps)
+            C = rms_forward(C, variance_epsilon=self.rms_eps)
+            time_step = rms_forward(time_step, variance_epsilon=self.rms_eps)
+
+            # In case the model has been quantized, we need a hack to properly call the `nn.Linear` module
+            # at the price of a small overhead.
+            if hasattr(self.config, "_pre_quantization_dtype"):
+                discrete_time_step = (self.dt_proj(time_step) - self.dt_proj.bias).transpose(1, 2)
+            else:
+                discrete_time_step = self.dt_proj.weight @ time_step.transpose(1, 2)
+
+            A = -torch.exp(self.A_log.float())
+            # 3.c perform the recurrence y ← SSM(A, B, C)(x)
+            time_proj_bias = self.dt_proj.bias.float() if hasattr(self.dt_proj, "bias") else None
+            if cache_params is not None and cache_position[0] > 0:
+                scan_outputs = selective_state_update(
+                    cache_params.ssm_states[self.layer_idx],
+                    hidden_states[..., 0],
+                    discrete_time_step[..., 0],
+                    A,
+                    B[:, 0],
+                    C[:, 0],
+                    self.D,
+                    gate[..., 0],
+                    time_proj_bias,
+                    dt_softplus=True,
+                ).unsqueeze(-1)
+            else:
+                scan_outputs, ssm_state = selective_scan_fn(
+                    hidden_states,
+                    discrete_time_step,
+                    A,
+                    B.transpose(1, 2),
+                    C.transpose(1, 2),
+                    self.D.float(),
+                    gate,
+                    time_proj_bias,
+                    delta_softplus=True,
+                    return_last_state=True,
+                )
+                if ssm_state is not None and cache_params is not None:
+                    cache_params.update_ssm_state(self.layer_idx, ssm_state)
+
+            # 4. Final linear projection
+            contextualized_states = self.out_proj(scan_outputs.transpose(1, 2))
+        return contextualized_states
+
+    def slow_forward(
+        self,
+        input_states,
+        cache_params: Optional[FalconMambaCache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.LongTensor] = None,
+    ):
+        batch_size, seq_len, _ = input_states.shape
+        dtype = input_states.dtype
+        # 1. Gated MLP's linear projection
+        projected_states = self.in_proj(input_states).transpose(1, 2)  # [batch, 2 * intermediate_size, seq_len]
+        hidden_states, gate = projected_states.chunk(2, dim=1)
+
+        if attention_mask is not None:
+            hidden_states = hidden_states * attention_mask.unsqueeze(1)
+
+        # 2. Convolution sequence transformation
+        if cache_params is not None:
+            ssm_state = cache_params.ssm_states[self.layer_idx].clone()
+            ssm_state = ssm_state.to(hidden_states.device)
+            # use `cache_position.shape[0]` to check whether we are in prefill
+            # stage, it's equivalent to check `cache_position[0] == 0`, which
+            # breaks dynamo fullgraph constraints
+            if cache_position is not None and cache_position.shape[0] == self.conv_kernel_size:
+                conv_state = nn.functional.pad(hidden_states, (self.conv_kernel_size - hidden_states.shape[-1], 0))
+
+                cache_params.update_conv_state(self.layer_idx, conv_state, cache_position)
+                hidden_states = self.act(
+                    self.conv1d(hidden_states)[..., :seq_len]
+                )  # [batch, intermediate_size, seq_len]
+            else:
+                conv_state = cache_params.update_conv_state(self.layer_idx, hidden_states, cache_position)
+                conv_state = conv_state.to(self.conv1d.weight.device)
+                hidden_states = torch.sum(conv_state * self.conv1d.weight[:, 0, :], dim=-1)
+                if self.use_conv_bias:
+                    hidden_states += self.conv1d.bias
+                hidden_states = (
+                    self.act(hidden_states).to(dtype).unsqueeze(-1)
+                )  # [batch, intermediate_size, 1] : decoding
+        else:
+            ssm_state = torch.zeros(
+                (batch_size, self.intermediate_size, self.ssm_state_size), device=hidden_states.device, dtype=dtype
+            )
+            hidden_states = self.act(self.conv1d(hidden_states)[..., :seq_len])  # [batch, intermediate_size, seq_len]
+
+        if attention_mask is not None:
+            hidden_states = hidden_states * attention_mask.unsqueeze(1)
+
+        # 3. State Space Model sequence transformation
+        # 3.a. Selection:  [batch, seq_len, self.time_step_rank + self.ssm_state_size * 2]
+        ssm_parameters = self.x_proj(hidden_states.transpose(1, 2))
+        time_step, B, C = torch.split(
+            ssm_parameters, [self.time_step_rank, self.ssm_state_size, self.ssm_state_size], dim=-1
+        )
+
+        B = rms_forward(B, variance_epsilon=self.rms_eps)
+        C = rms_forward(C, variance_epsilon=self.rms_eps)
+        time_step = rms_forward(time_step, variance_epsilon=self.rms_eps)
+
+        discrete_time_step = self.dt_proj(time_step)  # [batch, seq_len, intermediate_size]
+        discrete_time_step = nn.functional.softplus(discrete_time_step).transpose(
+            1, 2
+        )  # [batch, intermediate_size, seq_len]
+
+        # 3.b. Discretization: B and C to [batch, seq_len, intermediate_size, ssm_state_size] (SRAM)
+        A = -torch.exp(self.A_log.float())  # [intermediate_size, ssm_state_size]
+        discrete_A = torch.exp(
+            A[None, :, None, :] * discrete_time_step[:, :, :, None]
+        )  # [batch, intermediate_size, seq_len, ssm_state_size]
+        discrete_B = (
+            discrete_time_step[:, :, :, None] * B[:, None, :, :].float()
+        )  # [batch, intermediate_size, seq_len, ssm_state_size]
+        deltaB_u = discrete_B * hidden_states[:, :, :, None].float()
+
+        # 3.c perform the recurrence y ← SSM(A, B, C)(x)
+        if self.use_falcon_mambapy and self.training and cache_params is None:  # mamba.py parallel scan, training without cache only
+            hs = pscan(
+                discrete_A.transpose(1, 2), deltaB_u.transpose(1, 2)
+            )  # [batch, seq_len, intermediate_size, ssm_state_size]
+            scan_output = (hs @ C.unsqueeze(-1)).squeeze(3).transpose(1, 2)  # [batch, intermediate_size, seq_len]
+            scan_output = scan_output + hidden_states * self.D[None, :, None]
+            scan_output = scan_output * self.act(gate)
+        else:
+            scan_outputs = []
+            for i in range(seq_len):  # sequential recurrence, one position at a time
+                ssm_state = (
+                    discrete_A[:, :, i, :] * ssm_state + deltaB_u[:, :, i, :]
+                )  # [batch, intermediate_size, ssm_state]
+                scan_output = torch.matmul(
+                    ssm_state.to(dtype), C[:, i, :].unsqueeze(-1)
+                )  # [batch, intermediate_size, 1]
+                scan_outputs.append(scan_output[:, :, 0])
+            scan_output = torch.stack(scan_outputs, dim=-1)  # [batch, intermediate_size, seq_len]
+            scan_output = scan_output + (hidden_states * self.D[None, :, None])
+            scan_output = scan_output * self.act(gate)
+
+            if cache_params is not None:
+                cache_params.update_ssm_state(self.layer_idx, ssm_state)
+
+        # 4. Final linear projection
+        contextualized_states = self.out_proj(scan_output.transpose(1, 2))  # [batch, seq_len, hidden_size]
+        return contextualized_states
+
+    def forward(
+        self,
+        hidden_states,
+        cache_params: Optional[FalconMambaCache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.LongTensor] = None,
+    ):
+        is_fast_path_available = all(  # every optional kernel import must have succeeded (none left as None)
+            (selective_state_update, selective_scan_fn, causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn)
+        )
+        if is_fast_path_available and "cuda" in self.x_proj.weight.device.type and not torch._dynamo.is_compiling():
+            return self.cuda_kernels_forward(hidden_states, cache_params, cache_position, attention_mask)
+        return self.slow_forward(hidden_states, cache_params, cache_position, attention_mask)
+
+
+class FalconMambaRMSNorm(MambaRMSNorm):
+    def forward(self, hidden_states):  # weight (moved to the input's device) times the weight-free RMS normalization
+        return self.weight.to(hidden_states.device) * rms_forward(
+            hidden_states, variance_epsilon=self.variance_epsilon
+        )
+
+
+class FalconMambaBlock(MambaBlock):
+    pass  # no overrides here; the body is inherited from MambaBlock
+
+
+@auto_docstring
+class FalconMambaPreTrainedModel(MambaPreTrainedModel):
+    pass  # no overrides here; the body is inherited from MambaPreTrainedModel
+
+
+class FalconMambaOutput(MambaOutput):
+    pass  # no overrides here; the fields are inherited from MambaOutput
+
+
+class FalconMambaCausalLMOutput(MambaCausalLMOutput):
+    pass  # no overrides here; the fields are inherited from MambaCausalLMOutput
+
+
+class FalconMambaModel(MambaModel, FalconMambaPreTrainedModel):
+    def __init__(self, config):
+        FalconMambaPreTrainedModel.__init__(self, config)  # called directly, so MambaModel.__init__ is skipped and the submodules are built below
+
+        self.embeddings = nn.Embedding(config.vocab_size, config.hidden_size)
+        self.layers = nn.ModuleList(
+            [FalconMambaBlock(config, layer_idx=idx) for idx in range(config.num_hidden_layers)]
+        )
+
+        self.gradient_checkpointing = False
+        self.norm_f = FalconMambaRMSNorm(config.hidden_size, eps=config.layer_norm_epsilon)
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def load_hook(self, state_dict, prefix, *args):  # NOTE(review): always raises; presumably the modular-converter marker for dropping the inherited method - confirm
+        raise AttributeError("Not needed for FalconMamba")
+
+
+class FalconMambaForCausalLM(MambaForCausalLM):
+    pass  # no overrides here; the body is inherited from MambaForCausalLM
+
+
+__all__ = [  # public names exported by a star-import of this module
+    "FalconMambaForCausalLM",
+    "FalconMambaModel",
+    "FalconMambaPreTrainedModel",
+    "FalconMambaCache",
+    "FalconMambaConfig",
+]
diff --git a/phivenv/Lib/site-packages/transformers/models/fastspeech2_conformer/__init__.py b/phivenv/Lib/site-packages/transformers/models/fastspeech2_conformer/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..44d1ec7236310774ed6b1379683c144d7f93ecce
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/fastspeech2_conformer/__init__.py
@@ -0,0 +1,28 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:  # only static type checkers take this branch and see the star-imports
+    from .configuration_fastspeech2_conformer import *
+    from .modeling_fastspeech2_conformer import *
+    from .tokenization_fastspeech2_conformer import *
+else:  # at runtime, swap this module's sys.modules entry for a _LazyModule built from this file's import structure
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/phivenv/Lib/site-packages/transformers/models/fastspeech2_conformer/__pycache__/__init__.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/fastspeech2_conformer/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0b0e1eef4ab4ddbbbd2efb577ae65fe5b96aaf73
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/fastspeech2_conformer/__pycache__/__init__.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/fastspeech2_conformer/__pycache__/configuration_fastspeech2_conformer.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/fastspeech2_conformer/__pycache__/configuration_fastspeech2_conformer.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d90fc1a4583104f05283eaf27cbdae5c0a332f59
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/fastspeech2_conformer/__pycache__/configuration_fastspeech2_conformer.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/fastspeech2_conformer/__pycache__/modeling_fastspeech2_conformer.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/fastspeech2_conformer/__pycache__/modeling_fastspeech2_conformer.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..45e9f102d5419aca0cdab970e06c3ecb5333e4b6
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/fastspeech2_conformer/__pycache__/modeling_fastspeech2_conformer.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/fastspeech2_conformer/__pycache__/tokenization_fastspeech2_conformer.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/fastspeech2_conformer/__pycache__/tokenization_fastspeech2_conformer.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2c8c586db8392ade16617baffe82e8e4dfbd1b97
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/fastspeech2_conformer/__pycache__/tokenization_fastspeech2_conformer.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/fastspeech2_conformer/configuration_fastspeech2_conformer.py b/phivenv/Lib/site-packages/transformers/models/fastspeech2_conformer/configuration_fastspeech2_conformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..89d65a261c64fbabe493aa37677aaacb6f226a3b
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/fastspeech2_conformer/configuration_fastspeech2_conformer.py
@@ -0,0 +1,480 @@
+# coding=utf-8
+# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""FastSpeech2Conformer model configuration"""
+
+from typing import Optional
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class FastSpeech2ConformerConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`FastSpeech2ConformerModel`]. It is used to
+    instantiate a FastSpeech2Conformer model according to the specified arguments, defining the model architecture.
+    Instantiating a configuration with the defaults will yield a similar configuration to that of the
+    FastSpeech2Conformer [espnet/fastspeech2_conformer](https://huggingface.co/espnet/fastspeech2_conformer)
+    architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        hidden_size (`int`, *optional*, defaults to 384):
+            The dimensionality of the hidden layers.
+        vocab_size (`int`, *optional*, defaults to 78):
+            The size of the vocabulary.
+        num_mel_bins (`int`, *optional*, defaults to 80):
+            The number of mel filters used in the filter bank.
+        encoder_num_attention_heads (`int`, *optional*, defaults to 2):
+            The number of attention heads in the encoder.
+        encoder_layers (`int`, *optional*, defaults to 4):
+            The number of layers in the encoder.
+        encoder_linear_units (`int`, *optional*, defaults to 1536):
+            The number of units in the linear layer of the encoder.
+        decoder_layers (`int`, *optional*, defaults to 4):
+            The number of layers in the decoder.
+        decoder_num_attention_heads (`int`, *optional*, defaults to 2):
+            The number of attention heads in the decoder.
+        decoder_linear_units (`int`, *optional*, defaults to 1536):
+            The number of units in the linear layer of the decoder.
+        speech_decoder_postnet_layers (`int`, *optional*, defaults to 5):
+            The number of layers in the post-net of the speech decoder.
+        speech_decoder_postnet_units (`int`, *optional*, defaults to 256):
+            The number of units in the post-net layers of the speech decoder.
+        speech_decoder_postnet_kernel (`int`, *optional*, defaults to 5):
+            The kernel size in the post-net of the speech decoder.
+        positionwise_conv_kernel_size (`int`, *optional*, defaults to 3):
+            The size of the convolution kernel used in the position-wise layer.
+        encoder_normalize_before (`bool`, *optional*, defaults to `False`):
+            Specifies whether to normalize before encoder layers.
+        decoder_normalize_before (`bool`, *optional*, defaults to `False`):
+            Specifies whether to normalize before decoder layers.
+        encoder_concat_after (`bool`, *optional*, defaults to `False`):
+            Specifies whether to concatenate after encoder layers.
+        decoder_concat_after (`bool`, *optional*, defaults to `False`):
+            Specifies whether to concatenate after decoder layers.
+        reduction_factor (`int`, *optional*, defaults to 1):
+            The factor by which the speech frame rate is reduced.
+        speaking_speed (`float`, *optional*, defaults to 1.0):
+            The speed of the speech produced.
+        use_macaron_style_in_conformer (`bool`, *optional*, defaults to `True`):
+            Specifies whether to use macaron style in the conformer.
+        use_cnn_in_conformer (`bool`, *optional*, defaults to `True`):
+            Specifies whether to use convolutional neural networks in the conformer.
+        encoder_kernel_size (`int`, *optional*, defaults to 7):
+            The kernel size used in the encoder.
+        decoder_kernel_size (`int`, *optional*, defaults to 31):
+            The kernel size used in the decoder.
+        duration_predictor_layers (`int`, *optional*, defaults to 2):
+            The number of layers in the duration predictor.
+        duration_predictor_channels (`int`, *optional*, defaults to 256):
+            The number of channels in the duration predictor.
+        duration_predictor_kernel_size (`int`, *optional*, defaults to 3):
+            The kernel size used in the duration predictor.
+        energy_predictor_layers (`int`, *optional*, defaults to 2):
+            The number of layers in the energy predictor.
+        energy_predictor_channels (`int`, *optional*, defaults to 256):
+            The number of channels in the energy predictor.
+        energy_predictor_kernel_size (`int`, *optional*, defaults to 3):
+            The kernel size used in the energy predictor.
+        energy_predictor_dropout (`float`, *optional*, defaults to 0.5):
+            The dropout rate in the energy predictor.
+        energy_embed_kernel_size (`int`, *optional*, defaults to 1):
+            The kernel size used in the energy embed layer.
+        energy_embed_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout rate in the energy embed layer.
+        stop_gradient_from_energy_predictor (`bool`, *optional*, defaults to `False`):
+            Specifies whether to stop gradients from the energy predictor.
+        pitch_predictor_layers (`int`, *optional*, defaults to 5):
+            The number of layers in the pitch predictor.
+        pitch_predictor_channels (`int`, *optional*, defaults to 256):
+            The number of channels in the pitch predictor.
+        pitch_predictor_kernel_size (`int`, *optional*, defaults to 5):
+            The kernel size used in the pitch predictor.
+        pitch_predictor_dropout (`float`, *optional*, defaults to 0.5):
+            The dropout rate in the pitch predictor.
+        pitch_embed_kernel_size (`int`, *optional*, defaults to 1):
+            The kernel size used in the pitch embed layer.
+        pitch_embed_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout rate in the pitch embed layer.
+        stop_gradient_from_pitch_predictor (`bool`, *optional*, defaults to `True`):
+            Specifies whether to stop gradients from the pitch predictor.
+        encoder_dropout_rate (`float`, *optional*, defaults to 0.2):
+            The dropout rate in the encoder.
+        encoder_positional_dropout_rate (`float`, *optional*, defaults to 0.2):
+            The positional dropout rate in the encoder.
+        encoder_attention_dropout_rate (`float`, *optional*, defaults to 0.2):
+            The attention dropout rate in the encoder.
+        decoder_dropout_rate (`float`, *optional*, defaults to 0.2):
+            The dropout rate in the decoder.
+        decoder_positional_dropout_rate (`float`, *optional*, defaults to 0.2):
+            The positional dropout rate in the decoder.
+        decoder_attention_dropout_rate (`float`, *optional*, defaults to 0.2):
+            The attention dropout rate in the decoder.
+        duration_predictor_dropout_rate (`float`, *optional*, defaults to 0.2):
+            The dropout rate in the duration predictor.
+        speech_decoder_postnet_dropout (`float`, *optional*, defaults to 0.5):
+            The dropout rate in the speech decoder postnet.
+        max_source_positions (`int`, *optional*, defaults to 5000):
+            If `"relative"` position embeddings are used, defines the maximum source input positions.
+        use_masking (`bool`, *optional*, defaults to `True`):
+            Specifies whether to use masking in the model.
+        use_weighted_masking (`bool`, *optional*, defaults to `False`):
+            Specifies whether to use weighted masking in the model.
+        num_speakers (`int`, *optional*):
+            Number of speakers. If set to > 1, assume that the speaker ids will be provided as the input and use
+            speaker id embedding layer.
+        num_languages (`int`, *optional*):
+            Number of languages. If set to > 1, assume that the language ids will be provided as the input and use the
+            language id embedding layer.
+        speaker_embed_dim (`int`, *optional*):
+            Speaker embedding dimension. If set to > 0, assume that speaker_embedding will be provided as the input.
+        is_encoder_decoder (`bool`, *optional*, defaults to `True`):
+            Specifies whether the model is an encoder-decoder.
+
+    Example:
+
+    ```python
+    >>> from transformers import FastSpeech2ConformerModel, FastSpeech2ConformerConfig
+
+    >>> # Initializing a FastSpeech2Conformer style configuration
+    >>> configuration = FastSpeech2ConformerConfig()
+
+    >>> # Initializing a model from the FastSpeech2Conformer style configuration
+    >>> model = FastSpeech2ConformerModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "fastspeech2_conformer"
+    base_config_key = "model_config"
+    attribute_map = {"num_hidden_layers": "encoder_layers", "num_attention_heads": "encoder_num_attention_heads"}
+
+    def __init__(
+        self,
+        hidden_size=384,
+        vocab_size=78,
+        num_mel_bins=80,
+        encoder_num_attention_heads=2,
+        encoder_layers=4,
+        encoder_linear_units=1536,
+        decoder_layers=4,
+        decoder_num_attention_heads=2,
+        decoder_linear_units=1536,
+        speech_decoder_postnet_layers=5,
+        speech_decoder_postnet_units=256,
+        speech_decoder_postnet_kernel=5,
+        positionwise_conv_kernel_size=3,
+        encoder_normalize_before=False,
+        decoder_normalize_before=False,
+        encoder_concat_after=False,
+        decoder_concat_after=False,
+        reduction_factor=1,
+        speaking_speed=1.0,
+        use_macaron_style_in_conformer=True,
+        use_cnn_in_conformer=True,
+        encoder_kernel_size=7,
+        decoder_kernel_size=31,
+        duration_predictor_layers=2,
+        duration_predictor_channels=256,
+        duration_predictor_kernel_size=3,
+        energy_predictor_layers=2,
+        energy_predictor_channels=256,
+        energy_predictor_kernel_size=3,
+        energy_predictor_dropout=0.5,
+        energy_embed_kernel_size=1,
+        energy_embed_dropout=0.0,
+        stop_gradient_from_energy_predictor=False,
+        pitch_predictor_layers=5,
+        pitch_predictor_channels=256,
+        pitch_predictor_kernel_size=5,
+        pitch_predictor_dropout=0.5,
+        pitch_embed_kernel_size=1,
+        pitch_embed_dropout=0.0,
+        stop_gradient_from_pitch_predictor=True,
+        encoder_dropout_rate=0.2,
+        encoder_positional_dropout_rate=0.2,
+        encoder_attention_dropout_rate=0.2,
+        decoder_dropout_rate=0.2,
+        decoder_positional_dropout_rate=0.2,
+        decoder_attention_dropout_rate=0.2,
+        duration_predictor_dropout_rate=0.2,
+        speech_decoder_postnet_dropout=0.5,
+        max_source_positions=5000,
+        use_masking=True,
+        use_weighted_masking=False,
+        num_speakers=None,
+        num_languages=None,
+        speaker_embed_dim=None,
+        is_encoder_decoder=True,
+        **kwargs,
+    ):
+        if positionwise_conv_kernel_size % 2 == 0:
+            raise ValueError(
+                f"positionwise_conv_kernel_size must be odd, but got {positionwise_conv_kernel_size} instead."
+            )
+        if encoder_kernel_size % 2 == 0:
+            raise ValueError(f"encoder_kernel_size must be odd, but got {encoder_kernel_size} instead.")
+        if decoder_kernel_size % 2 == 0:
+            raise ValueError(f"decoder_kernel_size must be odd, but got {decoder_kernel_size} instead.")
+        if duration_predictor_kernel_size % 2 == 0:
+            raise ValueError(
+                f"duration_predictor_kernel_size must be odd, but got {duration_predictor_kernel_size} instead."
+            )
+        if energy_predictor_kernel_size % 2 == 0:
+            raise ValueError(
+                f"energy_predictor_kernel_size must be odd, but got {energy_predictor_kernel_size} instead."
+            )
+        if energy_embed_kernel_size % 2 == 0:
+            raise ValueError(f"energy_embed_kernel_size must be odd, but got {energy_embed_kernel_size} instead.")
+        if pitch_predictor_kernel_size % 2 == 0:
+            raise ValueError(
+                f"pitch_predictor_kernel_size must be odd, but got {pitch_predictor_kernel_size} instead."
+            )
+        if pitch_embed_kernel_size % 2 == 0:
+            raise ValueError(f"pitch_embed_kernel_size must be odd, but got {pitch_embed_kernel_size} instead.")
+        if hidden_size % encoder_num_attention_heads != 0:
+            raise ValueError("The hidden_size must be evenly divisible by encoder_num_attention_heads.")
+        if hidden_size % decoder_num_attention_heads != 0:
+            raise ValueError("The hidden_size must be evenly divisible by decoder_num_attention_heads.")
+        if use_masking and use_weighted_masking:
+            raise ValueError("Either use_masking or use_weighted_masking can be True, but not both.")
+
+        self.hidden_size = hidden_size
+        self.vocab_size = vocab_size
+        self.num_mel_bins = num_mel_bins
+        self.encoder_config = {
+            "num_attention_heads": encoder_num_attention_heads,
+            "layers": encoder_layers,
+            "kernel_size": encoder_kernel_size,
+            "attention_dropout_rate": encoder_attention_dropout_rate,
+            "dropout_rate": encoder_dropout_rate,
+            "positional_dropout_rate": encoder_positional_dropout_rate,
+            "linear_units": encoder_linear_units,
+            "normalize_before": encoder_normalize_before,
+            "concat_after": encoder_concat_after,
+        }
+        self.decoder_config = {
+            "num_attention_heads": decoder_num_attention_heads,
+            "layers": decoder_layers,
+            "kernel_size": decoder_kernel_size,
+            "attention_dropout_rate": decoder_attention_dropout_rate,
+            "dropout_rate": decoder_dropout_rate,
+            "positional_dropout_rate": decoder_positional_dropout_rate,
+            "linear_units": decoder_linear_units,
+            "normalize_before": decoder_normalize_before,
+            "concat_after": decoder_concat_after,
+        }
+        self.encoder_num_attention_heads = encoder_num_attention_heads
+        self.encoder_layers = encoder_layers
+        self.duration_predictor_channels = duration_predictor_channels
+        self.duration_predictor_kernel_size = duration_predictor_kernel_size
+        self.duration_predictor_layers = duration_predictor_layers
+        self.energy_embed_dropout = energy_embed_dropout
+        self.energy_embed_kernel_size = energy_embed_kernel_size
+        self.energy_predictor_channels = energy_predictor_channels
+        self.energy_predictor_dropout = energy_predictor_dropout
+        self.energy_predictor_kernel_size = energy_predictor_kernel_size
+        self.energy_predictor_layers = energy_predictor_layers
+        self.pitch_embed_dropout = pitch_embed_dropout
+        self.pitch_embed_kernel_size = pitch_embed_kernel_size
+        self.pitch_predictor_channels = pitch_predictor_channels
+        self.pitch_predictor_dropout = pitch_predictor_dropout
+        self.pitch_predictor_kernel_size = pitch_predictor_kernel_size
+        self.pitch_predictor_layers = pitch_predictor_layers
+        self.positionwise_conv_kernel_size = positionwise_conv_kernel_size
+        self.speech_decoder_postnet_units = speech_decoder_postnet_units
+        self.speech_decoder_postnet_dropout = speech_decoder_postnet_dropout
+        self.speech_decoder_postnet_kernel = speech_decoder_postnet_kernel
+        self.speech_decoder_postnet_layers = speech_decoder_postnet_layers
+        self.reduction_factor = reduction_factor
+        self.speaking_speed = speaking_speed
+        self.stop_gradient_from_energy_predictor = stop_gradient_from_energy_predictor
+        self.stop_gradient_from_pitch_predictor = stop_gradient_from_pitch_predictor
+        self.max_source_positions = max_source_positions
+        self.use_cnn_in_conformer = use_cnn_in_conformer
+        self.use_macaron_style_in_conformer = use_macaron_style_in_conformer
+        self.use_masking = use_masking
+        self.use_weighted_masking = use_weighted_masking
+        self.num_speakers = num_speakers
+        self.num_languages = num_languages
+        self.speaker_embed_dim = speaker_embed_dim
+        self.duration_predictor_dropout_rate = duration_predictor_dropout_rate
+        self.is_encoder_decoder = is_encoder_decoder
+
+        super().__init__(
+            is_encoder_decoder=is_encoder_decoder,
+            **kwargs,
+        )
+
+
+class FastSpeech2ConformerHifiGanConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`FastSpeech2ConformerHifiGan`]. It is used to
+    instantiate a FastSpeech2Conformer HiFi-GAN vocoder model according to the specified arguments, defining the model
+    architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the
+    FastSpeech2Conformer
+    [espnet/fastspeech2_conformer_hifigan](https://huggingface.co/espnet/fastspeech2_conformer_hifigan) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        model_in_dim (`int`, *optional*, defaults to 80):
+            The number of frequency bins in the input log-mel spectrogram.
+        upsample_initial_channel (`int`, *optional*, defaults to 512):
+            The number of input channels into the upsampling network.
+        upsample_rates (`tuple[int]` or `list[int]`, *optional*, defaults to `[8, 8, 2, 2]`):
+            A tuple of integers defining the stride of each 1D convolutional layer in the upsampling network. The
+            length of *upsample_rates* defines the number of convolutional layers and has to match the length of
+            *upsample_kernel_sizes*.
+        upsample_kernel_sizes (`tuple[int]` or `list[int]`, *optional*, defaults to `[16, 16, 4, 4]`):
+            A tuple of integers defining the kernel size of each 1D convolutional layer in the upsampling network. The
+            length of *upsample_kernel_sizes* defines the number of convolutional layers and has to match the length of
+            *upsample_rates*.
+        resblock_kernel_sizes (`tuple[int]` or `list[int]`, *optional*, defaults to `[3, 7, 11]`):
+            A tuple of integers defining the kernel sizes of the 1D convolutional layers in the multi-receptive field
+            fusion (MRF) module.
+        resblock_dilation_sizes (`tuple[tuple[int]]` or `list[list[int]]`, *optional*, defaults to `[[1, 3, 5], [1, 3, 5], [1, 3, 5]]`):
+            A nested tuple of integers defining the dilation rates of the dilated 1D convolutional layers in the
+            multi-receptive field fusion (MRF) module.
+        initializer_range (`float`, *optional*, defaults to 0.01):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        leaky_relu_slope (`float`, *optional*, defaults to 0.1):
+            The angle of the negative slope used by the leaky ReLU activation.
+        normalize_before (`bool`, *optional*, defaults to `True`):
+            Whether or not to normalize the spectrogram before vocoding using the vocoder's learned mean and variance.
+
+    Example:
+
+    ```python
+    >>> from transformers import FastSpeech2ConformerHifiGan, FastSpeech2ConformerHifiGanConfig
+
+    >>> # Initializing a FastSpeech2ConformerHifiGan configuration
+    >>> configuration = FastSpeech2ConformerHifiGanConfig()
+
+    >>> # Initializing a model (with random weights) from the configuration
+    >>> model = FastSpeech2ConformerHifiGan(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "hifigan"
+    base_config_key = "vocoder_config"
+
+    def __init__(
+        self,
+        model_in_dim=80,
+        upsample_initial_channel=512,
+        upsample_rates=[8, 8, 2, 2],
+        upsample_kernel_sizes=[16, 16, 4, 4],
+        resblock_kernel_sizes=[3, 7, 11],
+        resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]],
+        initializer_range=0.01,
+        leaky_relu_slope=0.1,
+        normalize_before=True,
+        **kwargs,
+    ):
+        self.model_in_dim = model_in_dim
+        self.upsample_initial_channel = upsample_initial_channel
+        self.upsample_rates = upsample_rates
+        self.upsample_kernel_sizes = upsample_kernel_sizes
+        self.resblock_kernel_sizes = resblock_kernel_sizes
+        self.resblock_dilation_sizes = resblock_dilation_sizes
+        self.initializer_range = initializer_range
+        self.leaky_relu_slope = leaky_relu_slope
+        self.normalize_before = normalize_before
+        super().__init__(**kwargs)
+
+
+class FastSpeech2ConformerWithHifiGanConfig(PretrainedConfig):
+    """
+    This is the configuration class to store the configuration of a [`FastSpeech2ConformerWithHifiGan`]. It is used to
+    instantiate a `FastSpeech2ConformerWithHifiGan` model according to the specified sub-models configurations,
+    defining the model architecture.
+
+    Instantiating a configuration with the defaults will yield a similar configuration to that of the
+    FastSpeech2ConformerModel [espnet/fastspeech2_conformer](https://huggingface.co/espnet/fastspeech2_conformer) and
+    FastSpeech2ConformerHifiGan
+    [espnet/fastspeech2_conformer_hifigan](https://huggingface.co/espnet/fastspeech2_conformer_hifigan) architectures.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        model_config (`typing.Dict`, *optional*):
+            Configuration of the text-to-speech model.
+        vocoder_config (`typing.Dict`, *optional*):
+            Configuration of the vocoder model.
+    model_config ([`FastSpeech2ConformerConfig`], *optional*):
+        Configuration of the text-to-speech model.
+    vocoder_config ([`FastSpeech2ConformerHifiGanConfig`], *optional*):
+        Configuration of the vocoder model.
+
+    Example:
+
+    ```python
+    >>> from transformers import (
+    ...     FastSpeech2ConformerConfig,
+    ...     FastSpeech2ConformerHifiGanConfig,
+    ...     FastSpeech2ConformerWithHifiGanConfig,
+    ...     FastSpeech2ConformerWithHifiGan,
+    ... )
+
+    >>> # Initializing FastSpeech2ConformerWithHifiGan sub-modules configurations.
+    >>> model_config = FastSpeech2ConformerConfig()
+    >>> vocoder_config = FastSpeech2ConformerHifiGanConfig()
+
+    >>> # Initializing a FastSpeech2ConformerWithHifiGan module style configuration
+    >>> configuration = FastSpeech2ConformerWithHifiGanConfig(model_config.to_dict(), vocoder_config.to_dict())
+
+    >>> # Initializing a model (with random weights)
+    >>> model = FastSpeech2ConformerWithHifiGan(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```
+    """
+
+    model_type = "fastspeech2_conformer_with_hifigan"
+    sub_configs = {"model_config": FastSpeech2ConformerConfig, "vocoder_config": FastSpeech2ConformerHifiGanConfig}
+
+    def __init__(
+        self,
+        model_config: Optional[dict] = None,
+        vocoder_config: Optional[dict] = None,
+        **kwargs,
+    ):
+        if model_config is None:
+            model_config = {}
+            logger.info("model_config is None. initializing the model with default values.")
+
+        if vocoder_config is None:
+            vocoder_config = {}
+            logger.info("vocoder_config is None. initializing the coarse model with default values.")
+
+        self.model_config = FastSpeech2ConformerConfig(**model_config)
+        self.vocoder_config = FastSpeech2ConformerHifiGanConfig(**vocoder_config)
+
+        super().__init__(**kwargs)
+
+
+__all__ = ["FastSpeech2ConformerConfig", "FastSpeech2ConformerHifiGanConfig", "FastSpeech2ConformerWithHifiGanConfig"]
diff --git a/phivenv/Lib/site-packages/transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py b/phivenv/Lib/site-packages/transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..2b038a93396de6711918c53cbfdc1797134a7da6
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py
@@ -0,0 +1,1571 @@
+# coding=utf-8
+# Copyright 2023 The Espnet authors, IMS Toucan authors, and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch FastSpeech2Conformer model."""
+
+import math
+from dataclasses import dataclass
+from typing import Optional, Union
+
+import torch
+from torch import nn
+
+from ...modeling_outputs import BaseModelOutput
+from ...modeling_utils import PreTrainedModel
+from ...utils import ModelOutput, auto_docstring, logging
+from .configuration_fastspeech2_conformer import (
+    FastSpeech2ConformerConfig,
+    FastSpeech2ConformerHifiGanConfig,
+    FastSpeech2ConformerWithHifiGanConfig,
+)
+
+
+logger = logging.get_logger(__name__)
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Output type of [`FastSpeech2ConformerModel`].
+    """
+)
+class FastSpeech2ConformerModelOutput(ModelOutput):
+    r"""
+    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+        Spectrogram generation loss.
+    duration_outputs (`torch.LongTensor` of shape `(batch_size, max_text_length + 1)`, *optional*):
+        Outputs of the duration predictor.
+    pitch_outputs (`torch.FloatTensor` of shape `(batch_size, max_text_length + 1, 1)`, *optional*):
+        Outputs of the pitch predictor.
+    energy_outputs (`torch.FloatTensor` of shape `(batch_size, max_text_length + 1, 1)`, *optional*):
+        Outputs of the energy predictor.
+    """
+
+    loss: Optional[torch.FloatTensor] = None
+    spectrogram: Optional[torch.FloatTensor] = None
+    encoder_last_hidden_state: Optional[torch.FloatTensor] = None
+    encoder_hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    encoder_attentions: Optional[tuple[torch.FloatTensor]] = None
+    decoder_hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    decoder_attentions: Optional[tuple[torch.FloatTensor]] = None
+    duration_outputs: Optional[torch.LongTensor] = None
+    pitch_outputs: Optional[torch.FloatTensor] = None
+    energy_outputs: Optional[torch.FloatTensor] = None
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Output type of [`FastSpeech2ConformerWithHifiGan`].
+    """
+)
+class FastSpeech2ConformerWithHifiGanOutput(FastSpeech2ConformerModelOutput):
+    r"""
+    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+        Spectrogram generation loss.
+    duration_outputs (`torch.LongTensor` of shape `(batch_size, max_text_length + 1)`, *optional*):
+        Outputs of the duration predictor.
+    pitch_outputs (`torch.FloatTensor` of shape `(batch_size, max_text_length + 1, 1)`, *optional*):
+        Outputs of the pitch predictor.
+    energy_outputs (`torch.FloatTensor` of shape `(batch_size, max_text_length + 1, 1)`, *optional*):
+        Outputs of the energy predictor.
+    waveform (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
+        Speech output as a result of passing the predicted mel spectrogram through the vocoder.
+    """
+
+    waveform: Optional[torch.FloatTensor] = None
+
+
+def length_regulator(encoded_embeddings, duration_labels, speaking_speed=1.0):
+    """
+    Length regulator for feed-forward Transformer.
+
+    This is the length regulator module described in `FastSpeech: Fast, Robust and Controllable Text to Speech`
+    https://huggingface.co/papers/1905.09263. The length regulator expands char or phoneme-level embedding features to
+    frame-level by repeating each feature based on the corresponding predicted durations.
+
+    Args:
+        encoded_embeddings (`torch.Tensor` of shape `(batch_size, max_text_length, embedding_dim)`):
+            Batch of sequences of char or phoneme embeddings.
+        duration_labels (`torch.LongTensor` of shape `(batch_size, time)`):
+            Batch of durations of each frame.
+        speaking_speed (`float`, *optional*, defaults to 1.0):
+            Value to control speed of speech.
+
+    Returns:
+        `torch.Tensor`:
+            Replicated input tensor based on durations (batch_size, time*, embedding_dim).
+    """
+
+    if speaking_speed <= 0:
+        raise ValueError("`speaking_speed` must be greater than 0.")
+    elif speaking_speed != 1.0:
+        duration_labels = torch.round(duration_labels.float() * speaking_speed).long()
+
+    if duration_labels.sum() == 0:
+        duration_labels[duration_labels.sum(dim=1).eq(0)] = 1
+
+    # Calculate the maximum length needed
+    max_len = torch.sum(duration_labels, dim=1).max()
+
+    # Create a padded tensor to hold the results
+    hidden_states = torch.zeros(
+        (encoded_embeddings.size(0), max_len, encoded_embeddings.size(2)),
+        dtype=torch.float,
+        device=encoded_embeddings.device,
+    )
+
+    # Loop through the batch and fill in the data
+    for i, (encoded_embedding, target_duration) in enumerate(zip(encoded_embeddings, duration_labels)):
+        repeated = torch.repeat_interleave(encoded_embedding, target_duration, dim=0)
+        hidden_states[i, : repeated.size(0)] = repeated
+
+    return hidden_states
+
+
+class FastSpeech2ConformerDurationPredictor(nn.Module):
+    """
+    Duration predictor module.
+
+    This is the duration predictor module described in the paper 'FastSpeech: Fast, Robust and Controllable Text to
+    Speech' https://huggingface.co/papers/1905.09263. The duration predictor predicts the duration of each frame in
+    log domain from the hidden embeddings of the encoder.
+
+    Note:
+        The calculation domain of the outputs of `forward` depends on the module mode: in training mode the outputs
+        are in log domain, whereas in evaluation mode they are converted to the linear domain.
+
+    """
+
+    def __init__(self, config: FastSpeech2ConformerConfig):
+        super().__init__()
+
+        self.conv_layers = nn.ModuleList()
+        self.log_domain_offset = 1.0
+
+        for layer_idx in range(config.duration_predictor_layers):
+            num_chans = config.duration_predictor_channels
+            input_channels = config.hidden_size if layer_idx == 0 else num_chans
+            layer = FastSpeech2ConformerPredictorLayer(
+                input_channels,
+                num_chans,
+                config.duration_predictor_kernel_size,
+                config.duration_predictor_dropout_rate,
+            )
+            self.conv_layers.append(layer)
+        self.linear = nn.Linear(config.duration_predictor_channels, 1)
+
+    def forward(self, encoder_hidden_states):
+        """
+        Args:
+            hidden_states (`torch.Tensor` of shape `(batch_size, max_text_length, input_dim)`):
+                Batch of input sequences.
+            padding_masks (`torch.ByteTensor` of shape `(batch_size, max_text_length)`, *optional*):
+                Batch of masks indicating padded part.
+
+        Returns:
+            `torch.Tensor`: Batch of predicted durations in log domain `(batch_size, max_text_length)`.
+
+        """
+        # (batch_size, input_dim, max_text_length)
+        hidden_states = encoder_hidden_states.transpose(1, -1)
+        for layer in self.conv_layers:
+            hidden_states = layer(hidden_states)
+
+        # NOTE: calculate in log domain, (batch_size, max_text_length)
+        hidden_states = self.linear(hidden_states.transpose(1, -1)).squeeze(-1)
+
+        if not self.training:
+            # NOTE: calculate in linear domain
+            hidden_states = torch.clamp(torch.round(hidden_states.exp() - self.log_domain_offset), min=0).long()
+
+        return hidden_states
+
+
+# Copied from transformers.models.speecht5.modeling_speecht5.SpeechT5BatchNormConvLayer
+class FastSpeech2ConformerBatchNormConvLayer(nn.Module):
+    """
+    One postnet layer: Conv1d (no bias) -> BatchNorm1d -> optional Tanh -> Dropout.
+
+    Channel widths depend on `layer_id`: the first layer reads `config.num_mel_bins` channels, the last layer
+    writes `config.num_mel_bins` channels, and every other boundary uses `config.speech_decoder_postnet_units`.
+    The Tanh activation is applied on every layer except the last one.
+    """
+
+    def __init__(self, config, layer_id=0):
+        super().__init__()
+
+        # Input width: mel bins for the first layer, postnet units afterwards.
+        if layer_id == 0:
+            in_conv_dim = config.num_mel_bins
+        else:
+            in_conv_dim = config.speech_decoder_postnet_units
+
+        # Output width: mel bins for the last layer, postnet units before that.
+        if layer_id == config.speech_decoder_postnet_layers - 1:
+            out_conv_dim = config.num_mel_bins
+        else:
+            out_conv_dim = config.speech_decoder_postnet_units
+
+        # stride=1 with padding=(kernel - 1) // 2 keeps the time length unchanged for odd kernel sizes.
+        self.conv = nn.Conv1d(
+            in_conv_dim,
+            out_conv_dim,
+            kernel_size=config.speech_decoder_postnet_kernel,
+            stride=1,
+            padding=(config.speech_decoder_postnet_kernel - 1) // 2,
+            bias=False,
+        )
+        self.batch_norm = nn.BatchNorm1d(out_conv_dim)
+
+        # The final layer has no non-linearity.
+        if layer_id < config.speech_decoder_postnet_layers - 1:
+            self.activation = nn.Tanh()
+        else:
+            self.activation = None
+
+        self.dropout = nn.Dropout(config.speech_decoder_postnet_dropout)
+
+    def forward(self, hidden_states):
+        # `hidden_states` is consumed by Conv1d/BatchNorm1d, i.e. channel-first (batch, channels, time).
+        hidden_states = self.conv(hidden_states)
+        hidden_states = self.batch_norm(hidden_states)
+        if self.activation is not None:
+            hidden_states = self.activation(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        return hidden_states
+
+
+class FastSpeech2ConformerSpeechDecoderPostnet(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.feat_out = nn.Linear(config.hidden_size, config.num_mel_bins * config.reduction_factor)
+        self.layers = nn.ModuleList(
+            [FastSpeech2ConformerBatchNormConvLayer(config, i) for i in range(config.speech_decoder_postnet_layers)]
+        )
+
+    def forward(self, hidden_states: torch.Tensor):
+        outputs_before_postnet = self.feat_out(hidden_states).view(hidden_states.size(0), -1, self.config.num_mel_bins)
+        layer_output = outputs_before_postnet.transpose(1, 2)
+        for layer in self.layers:
+            layer_output = layer(layer_output)
+        outputs_after_postnet = outputs_before_postnet + layer_output.transpose(1, 2)
+        return outputs_before_postnet, outputs_after_postnet
+
+
+class FastSpeech2ConformerPredictorLayer(nn.Module):
+    def __init__(self, input_channels, num_chans, kernel_size, dropout_rate):
+        super().__init__()
+        self.conv = nn.Conv1d(
+            input_channels,
+            num_chans,
+            kernel_size,
+            stride=1,
+            padding=(kernel_size - 1) // 2,
+        )
+        self.activation = nn.ReLU()
+        self.layer_norm = nn.LayerNorm(num_chans)
+        self.dropout = nn.Dropout(dropout_rate)
+
+    def forward(self, hidden_states):
+        hidden_states = self.conv(hidden_states)
+        hidden_states = self.activation(hidden_states)
+
+        # Perform layer norm on dimension 1
+        hidden_states = hidden_states.transpose(1, -1)
+        hidden_states = self.layer_norm(hidden_states)
+        hidden_states = hidden_states.transpose(1, -1)
+
+        hidden_states = self.dropout(hidden_states)
+
+        return hidden_states
+
+
+class FastSpeech2ConformerVariancePredictor(nn.Module):
+    def __init__(
+        self,
+        config: FastSpeech2ConformerConfig,
+        num_layers=2,
+        num_chans=384,
+        kernel_size=3,
+        dropout_rate=0.5,
+    ):
+        """
+        Initialize variance predictor module.
+
+        Args:
+            input_dim (`int`): Input dimension.
+            num_layers (`int`, *optional*, defaults to 2): Number of convolutional layers.
+            num_chans (`int`, *optional*, defaults to 384): Number of channels of convolutional layers.
+            kernel_size (`int`, *optional*, defaults to 3): Kernel size of convolutional layers.
+            dropout_rate (`float`, *optional*, defaults to 0.5): Dropout rate.
+        """
+        super().__init__()
+        self.conv_layers = nn.ModuleList()
+        for idx in range(num_layers):
+            input_channels = config.hidden_size if idx == 0 else num_chans
+            layer = FastSpeech2ConformerPredictorLayer(input_channels, num_chans, kernel_size, dropout_rate)
+            self.conv_layers.append(layer)
+        self.linear = nn.Linear(num_chans, 1)
+
+    def forward(self, encoder_hidden_states, padding_masks=None):
+        """
+        Calculate forward propagation.
+
+        Args:
+            encoder_hidden_states (`torch.Tensor` of shape `(batch_size, max_text_length, input_dim)`):
+                Batch of input sequences.
+            padding_masks (`torch.ByteTensor` of shape `(batch_size, max_text_length)`, *optional*):
+                Batch of masks indicating padded part.
+
+        Returns:
+            Tensor: Batch of predicted sequences `(batch_size, max_text_length, 1)`.
+        """
+        # (batch_size, input_dim, max_text_length)
+        hidden_states = encoder_hidden_states.transpose(1, -1)
+        for layer in self.conv_layers:
+            hidden_states = layer(hidden_states)
+
+        hidden_states = self.linear(hidden_states.transpose(1, 2))
+
+        if padding_masks is not None:
+            hidden_states = hidden_states.masked_fill(padding_masks, 0.0)
+
+        return hidden_states
+
+
+class FastSpeech2ConformerVarianceEmbedding(nn.Module):
+    def __init__(
+        self,
+        in_channels=1,
+        out_channels=384,
+        kernel_size=1,
+        padding=0,
+        dropout_rate=0.0,
+    ):
+        super().__init__()
+        self.conv = nn.Conv1d(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=kernel_size,
+            padding=padding,
+        )
+        self.dropout = nn.Dropout(dropout_rate)
+
+    def forward(self, hidden_states):
+        hidden_states = hidden_states.transpose(1, 2)
+        hidden_states = self.conv(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = hidden_states.transpose(1, 2)
+        return hidden_states
+
+
+class FastSpeech2ConformerAttention(nn.Module):
+    """
+    Multi-Head attention layer with relative position encoding. Details can be found in
+    https://github.com/espnet/espnet/pull/2816. Paper: https://huggingface.co/papers/1901.02860.
+
+    Queries, keys and values are all projected from the same `hidden_states` (self-attention). The attention
+    score is the sum of a content term (query + `pos_bias_u` against keys) and a position term (query +
+    `pos_bias_v` against the projected positional embeddings), scaled by `sqrt(dim_key)`.
+    """
+
+    def __init__(self, config: FastSpeech2ConformerConfig, module_config):
+        """Construct an FastSpeech2ConformerAttention object."""
+        super().__init__()
+        # We assume d_v always equals dim_key
+        self.num_heads = module_config["num_attention_heads"]
+        self.hidden_size = config.hidden_size
+        # `dim_key` and `head_dim` hold the same value; `dim_key` is only used for the score scaling.
+        self.dim_key = self.hidden_size // self.num_heads
+        self.head_dim = self.hidden_size // self.num_heads
+        self.linear_q = nn.Linear(self.hidden_size, self.hidden_size)
+        self.linear_k = nn.Linear(self.hidden_size, self.hidden_size)
+        self.linear_v = nn.Linear(self.hidden_size, self.hidden_size)
+        self.linear_out = nn.Linear(self.hidden_size, self.hidden_size)
+        self.dropout = nn.Dropout(p=module_config["attention_dropout_rate"])
+
+        # linear transformation for positional encoding
+        self.linear_pos = nn.Linear(self.hidden_size, self.hidden_size, bias=False)
+        # these two learnable bias are used in matrix c and matrix d
+        # as described in https://huggingface.co/papers/1901.02860 Section 3.3
+        # NOTE(review): `torch.Tensor(...)` allocates without initializing; presumably the values are set by the
+        # model's weight initialization elsewhere - confirm.
+        self.pos_bias_u = nn.Parameter(torch.Tensor(self.num_heads, self.head_dim))
+        self.pos_bias_v = nn.Parameter(torch.Tensor(self.num_heads, self.head_dim))
+
+    def shift_relative_position_tensor(self, pos_tensor):
+        """
+        Re-index scores computed against relative positions so the last axis lines up with key positions.
+
+        Args:
+            pos_tensor (torch.Tensor of shape (batch_size, head, time1, 2*time1-1)): Input tensor.
+
+        Returns:
+            `torch.Tensor`: The shifted tensor, truncated on the last axis to `pos_tensor.size(-1) // 2 + 1`
+            entries.
+        """
+        # Prepend one zero column, then reinterpret the buffer with the last two axes swapped in size; dropping
+        # the first row of that view and viewing back to the original shape shifts each row relative to the
+        # previous one.
+        zero_pad = torch.zeros((*pos_tensor.size()[:3], 1), device=pos_tensor.device, dtype=pos_tensor.dtype)
+        pos_tensor_padded = torch.cat([zero_pad, pos_tensor], dim=-1)
+
+        pos_tensor_padded = pos_tensor_padded.view(*pos_tensor.size()[:2], pos_tensor.size(3) + 1, pos_tensor.size(2))
+        # only keep the positions from 0 to time2
+        pos_tensor = pos_tensor_padded[:, :, 1:].view_as(pos_tensor)[:, :, :, : pos_tensor.size(-1) // 2 + 1]
+
+        return pos_tensor
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        pos_emb: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = False,
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        """
+        Compute 'Scaled Dot Product Attention' with rel. positional encoding.
+
+        Args:
+            hidden_states (`torch.Tensor` of shape `(batch, time2, size)`): Values of the hidden states
+            attention_mask (`torch.Tensor` of shape `(batch, 1, time2)`, *optional*): Mask tensor. Positions equal
+                to 0 are masked out. Any other shape raises a `ValueError`.
+            pos_emb (`torch.Tensor` of shape `(batch, 2*time1-1, size)`): Positional embedding tensor. Although the
+                signature defaults it to `None`, it is dereferenced unconditionally, so it must be provided.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+        Returns:
+            `tuple`: The output tensor of shape `(batch, time1, d_model)` and the attention weights, or `None` in
+            place of the weights when `output_attentions` is falsy.
+
+        Raises:
+            ValueError: If `attention_mask` is given and its size is not `(batch, 1, time2)`.
+        """
+        bsz, q_len, _ = hidden_states.size()
+        # Split the projected features into heads: (batch_size, time, head, head_dim)
+        query_states = self.linear_q(hidden_states).view(bsz, -1, self.num_heads, self.head_dim)
+        key_states = self.linear_k(hidden_states).view(bsz, -1, self.num_heads, self.head_dim)
+        value_states = self.linear_v(hidden_states).view(bsz, -1, self.num_heads, self.head_dim)
+
+        # The positional embedding carries its own batch size, which may differ from `bsz`.
+        bsz_pos = pos_emb.size(0)
+        pos_encoding = self.linear_pos(pos_emb).view(bsz_pos, -1, self.num_heads, self.head_dim)
+
+        # (batch_size, head, time1, dim_key)
+        query_with_bias_u = (query_states + self.pos_bias_u).transpose(1, 2)
+        # (batch_size, head, time1, dim_key)
+        query_with_bias_v = (query_states + self.pos_bias_v).transpose(1, 2)
+
+        # compute attention score
+        # first compute matrix a and matrix c
+        # as described in https://huggingface.co/papers/1901.02860 Section 3.3
+        # (batch_size, head, time1, time2)
+        matrix_ac = torch.matmul(query_with_bias_u, key_states.permute(0, 2, 3, 1))
+
+        # compute matrix b and matrix d
+        # (batch_size, head, time1, 2*time1-1)
+        matrix_bd = torch.matmul(query_with_bias_v, pos_encoding.permute(0, 2, 3, 1))
+        matrix_bd = self.shift_relative_position_tensor(matrix_bd)
+
+        # (batch_size, head, time1, time2)
+        scores = (matrix_ac + matrix_bd) / math.sqrt(self.dim_key)
+
+        # Forward attention
+        if attention_mask is not None:
+            expected_size = (bsz, 1, q_len)
+            if attention_mask.size() != expected_size:
+                raise ValueError(f"Attention mask should be of size {expected_size}, but is {attention_mask.size()}")
+            # Add a head axis and turn the mask into "True where masked out".
+            attention_mask = attention_mask.unsqueeze(1).eq(0)
+            min_value = float(torch.finfo(scores.dtype).min)
+            scores = scores.masked_fill(attention_mask, min_value)
+            # Zero the masked weights again after the softmax so masked positions contribute exactly nothing.
+            attn_weights = torch.softmax(scores, dim=-1).masked_fill(attention_mask, 0.0)
+        else:
+            attn_weights = torch.softmax(scores, dim=-1)
+
+        attn_weights = self.dropout(attn_weights)
+        attn_output = torch.matmul(attn_weights, value_states.transpose(1, 2))
+        # Merge the heads back into the feature axis: (batch_size, time1, hidden_size)
+        attn_output = attn_output.transpose(1, 2).contiguous().view(bsz, q_len, -1)
+
+        attn_output = self.linear_out(attn_output)
+
+        if not output_attentions:
+            attn_weights = None
+
+        return attn_output, attn_weights
+
+
+class FastSpeech2ConformerConvolutionModule(nn.Module):
+    def __init__(self, config: FastSpeech2ConformerConfig, module_config):
+        super().__init__()
+        # kernel_size should be an odd number for 'SAME' padding
+        channels = config.hidden_size
+        kernel_size = module_config["kernel_size"]
+        self.pointwise_conv1 = nn.Conv1d(channels, 2 * channels, kernel_size=1, stride=1, padding=0, bias=True)
+        self.depthwise_conv = nn.Conv1d(
+            channels, channels, kernel_size, stride=1, padding=(kernel_size - 1) // 2, groups=channels, bias=True
+        )
+        self.norm = nn.BatchNorm1d(channels)
+        self.pointwise_conv2 = nn.Conv1d(channels, channels, kernel_size=1, stride=1, padding=0, bias=True)
+
+    def forward(self, hidden_states):
+        """
+        Compute convolution module.
+
+        Args:
+            hidden_states (`torch.Tensor` of shape `(batch, time, channels)`): Input tensor.
+
+        Returns:
+            `torch.Tensor`: Output tensor of shape `(batch, time, channels)`.
+
+        """
+        # exchange the temporal dimension and the feature dimension
+        hidden_states = hidden_states.transpose(1, 2)
+
+        # GLU mechanism, (batch_size, 2*channel, dim)
+        hidden_states = self.pointwise_conv1(hidden_states)
+        # (batch_size, channel, dim)
+        hidden_states = nn.functional.glu(hidden_states, dim=1)
+
+        # 1D Depthwise Conv
+        hidden_states = self.depthwise_conv(hidden_states)
+        hidden_states = self.norm(hidden_states)
+
+        hidden_states = hidden_states * torch.sigmoid(hidden_states)
+
+        hidden_states = self.pointwise_conv2(hidden_states)
+
+        return hidden_states.transpose(1, 2)
+
+
+class FastSpeech2ConformerEncoderLayer(nn.Module):
+    def __init__(self, config: FastSpeech2ConformerConfig, module_config):
+        super().__init__()
+
+        # self-attention module definition
+        self.self_attn = FastSpeech2ConformerAttention(config, module_config)
+
+        # feed-forward module definition
+        self.feed_forward = FastSpeech2ConformerMultiLayeredConv1d(config, module_config)
+
+        self.macaron_style = config.use_macaron_style_in_conformer
+        if self.macaron_style:
+            self.feed_forward_macaron = FastSpeech2ConformerMultiLayeredConv1d(config, module_config)
+            self.ff_macaron_layer_norm = nn.LayerNorm(config.hidden_size)
+            self.ff_scale = 0.5
+        else:
+            self.ff_scale = 1.0
+
+        # convolution module definition
+        self.use_cnn_module = config.use_cnn_in_conformer
+        if self.use_cnn_module:
+            self.conv_module = FastSpeech2ConformerConvolutionModule(config, module_config)
+            self.conv_layer_norm = nn.LayerNorm(config.hidden_size)
+            self.final_layer_norm = nn.LayerNorm(config.hidden_size)
+
+        self.ff_layer_norm = nn.LayerNorm(config.hidden_size)
+
+        self.self_attn_layer_norm = nn.LayerNorm(config.hidden_size)
+
+        self.dropout = nn.Dropout(module_config["dropout_rate"])
+        self.size = config.hidden_size
+        self.normalize_before = module_config["normalize_before"]
+        self.concat_after = module_config["concat_after"]
+        if self.concat_after:
+            self.concat_linear = nn.Linear(config.hidden_size + config.hidden_size, config.hidden_size)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        pos_emb: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        output_attentions: Optional[torch.Tensor] = False,
+    ):
+        """
+        Compute encoded features.
+
+        Args:
+            hidden_states (`torch.Tensor` of shape `(batch, time, size)`): Input tensor.
+            pos_emb (`torch.Tensor` of shape `(1, time, size)`): Positional embeddings tensor.
+            attention_mask (`torch.Tensor` of shape `(batch, time)`): Attention mask tensor for the input.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+        Returns:
+            `torch.Tensor`: Output tensor of shape `(batch, time, size)`.
+
+        """
+        # whether to use macaron style
+        if self.macaron_style:
+            residual = hidden_states
+            if self.normalize_before:
+                hidden_states = self.ff_macaron_layer_norm(hidden_states)
+            hidden_states = residual + self.ff_scale * self.dropout(self.feed_forward_macaron(hidden_states))
+            if not self.normalize_before:
+                hidden_states = self.ff_macaron_layer_norm(hidden_states)
+
+        # multi-headed self-attention module
+        residual = hidden_states
+        if self.normalize_before:
+            hidden_states = self.self_attn_layer_norm(hidden_states)
+
+        attention_output, attention_scores = self.self_attn(
+            hidden_states, attention_mask=attention_mask, pos_emb=pos_emb, output_attentions=output_attentions
+        )
+
+        if self.concat_after:
+            x_concat = torch.cat((hidden_states, attention_output), dim=-1)
+            hidden_states = self.concat_linear(x_concat)
+            hidden_states = residual + hidden_states
+        else:
+            hidden_states = self.dropout(attention_output)
+            hidden_states = residual + hidden_states
+        if not self.normalize_before:
+            hidden_states = self.self_attn_layer_norm(hidden_states)
+
+        # convolution module
+        if self.use_cnn_module:
+            residual = hidden_states
+            if self.normalize_before:
+                hidden_states = self.conv_layer_norm(hidden_states)
+            hidden_states = self.conv_module(hidden_states)
+            hidden_states = self.dropout(hidden_states)
+            hidden_states = residual + hidden_states
+            if not self.normalize_before:
+                hidden_states = self.conv_layer_norm(hidden_states)
+
+        # feed forward module
+        residual = hidden_states
+        if self.normalize_before:
+            hidden_states = self.ff_layer_norm(hidden_states)
+        hidden_states = self.feed_forward(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = residual + self.ff_scale * hidden_states
+        if not self.normalize_before:
+            hidden_states = self.ff_layer_norm(hidden_states)
+
+        if self.conv_module is not None:
+            hidden_states = self.final_layer_norm(hidden_states)
+
+        outputs = (hidden_states,)
+
+        if output_attentions:
+            outputs += (attention_scores,)
+
+        return outputs
+
+
+class FastSpeech2ConformerMultiLayeredConv1d(nn.Module):
+    """
+    Multi-layered conv1d for Transformer block.
+
+    This is a module of multi-layered conv1d designed to replace positionwise feed-forward network in Transformer
+    block, which is introduced in 'FastSpeech: Fast, Robust and Controllable Text to Speech'
+    https://huggingface.co/papers/1905.09263
+    """
+
+    def __init__(self, config: FastSpeech2ConformerConfig, module_config):
+        """
+        Initialize FastSpeech2ConformerMultiLayeredConv1d module.
+
+        Args:
+            input_channels (`int`): Number of input channels.
+            hidden_channels (`int`): Number of hidden channels.
+            kernel_size (`int`): Kernel size of conv1d.
+            dropout_rate (`float`): Dropout rate.
+        """
+        super().__init__()
+        input_channels = config.hidden_size
+        hidden_channels = module_config["linear_units"]
+        kernel_size = config.positionwise_conv_kernel_size
+        self.conv1 = nn.Conv1d(input_channels, hidden_channels, kernel_size, stride=1, padding=(kernel_size - 1) // 2)
+        self.conv2 = nn.Conv1d(hidden_channels, input_channels, kernel_size, stride=1, padding=(kernel_size - 1) // 2)
+        self.dropout = nn.Dropout(module_config["dropout_rate"])
+
+    def forward(self, hidden_states):
+        """
+        Calculate forward propagation.
+
+        Args:
+            hidden_states (torch.Tensor): Batch of input tensors (batch_size, time, input_channels).
+
+        Returns:
+            torch.Tensor: Batch of output tensors (batch_size, time, hidden_channels).
+        """
+        hidden_states = hidden_states.transpose(-1, 1)
+        hidden_states = self.conv1(hidden_states)
+        hidden_states = torch.relu(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = self.conv2(hidden_states)
+        hidden_states = hidden_states.transpose(-1, 1)
+        return hidden_states
+
+
+class FastSpeech2ConformerRelPositionalEncoding(nn.Module):
+    """
+    Args:
+    Relative positional encoding module (new implementation). Details can be found in
+    https://github.com/espnet/espnet/pull/2816. See : Appendix Batch in https://huggingface.co/papers/1901.02860
+        config (`FastSpeech2ConformerConfig`):
+            FastSpeech2ConformerConfig instance.
+        module_config (`dict`):
+            Dictionary containing the encoder or decoder module configuration from the `FastSpeech2ConformerConfig`.
+    """
+
+    def __init__(self, config: FastSpeech2ConformerConfig, module_config):
+        """
+        Construct an PositionalEncoding object.
+        """
+        super().__init__()
+        self.embed_dim = config.hidden_size
+        self.input_scale = math.sqrt(self.embed_dim)
+        self.dropout = nn.Dropout(p=module_config["positional_dropout_rate"])
+        self.pos_enc = None
+        self.max_len = 5000
+        self.extend_pos_enc(torch.tensor(0.0).expand(1, self.max_len))
+
+    def extend_pos_enc(self, x):
+        """Reset the positional encodings."""
+        if self.pos_enc is not None:
+            # self.pos_enc contains both positive and negative parts
+            # the length of self.pos_enc is 2 * input_len - 1
+            if self.pos_enc.size(1) >= x.size(1) * 2 - 1:
+                if self.pos_enc.dtype != x.dtype or self.pos_enc.device != x.device:
+                    self.pos_enc = self.pos_enc.to(dtype=x.dtype, device=x.device)
+                return
+        # Suppose `i` means to the position of query vector and `j` means the
+        # position of key vector. We use position relative positions when keys
+        # are to the left (i>j) and negative relative positions otherwise (i<j).
+        pos_enc_positive = torch.zeros(x.size(1), self.embed_dim)
+        pos_enc_negative = torch.zeros(x.size(1), self.embed_dim)
+        position = torch.arange(0, x.size(1), dtype=torch.int64).float().unsqueeze(1)
+        div_term = torch.exp(
+            torch.arange(0, self.embed_dim, 2, dtype=torch.int64).float() * -(math.log(10000.0) / self.embed_dim)
+        )
+        pos_enc_positive[:, 0::2] = torch.sin(position * div_term)
+        pos_enc_positive[:, 1::2] = torch.cos(position * div_term)
+        pos_enc_negative[:, 0::2] = torch.sin(-1 * position * div_term)
+        pos_enc_negative[:, 1::2] = torch.cos(-1 * position * div_term)
+
+        # Reserve the order of positive indices and concat both positive and
+        # negative indices. This is used to support the shifting trick
+        # as in https://huggingface.co/papers/1901.02860
+        pos_enc_positive = torch.flip(pos_enc_positive, [0]).unsqueeze(0)
+        pos_enc_negative = pos_enc_negative[1:].unsqueeze(0)
+        pos_enc = torch.cat([pos_enc_positive, pos_enc_negative], dim=1)
+        self.pos_enc = pos_enc.to(device=x.device, dtype=x.dtype)
+
+    def forward(self, feature_representation):
+        """
+        Args:
+            feature_representation (`torch.Tensor` of shape (batch_size, time, `*`)):
+                Input tensor.
+
+        Returns:
+            `torch.Tensor`: Encoded tensor (batch_size, time, `*`).
+        """
+        self.extend_pos_enc(feature_representation)
+        hidden_states = feature_representation * self.input_scale
+        center_idx = self.pos_enc.size(1) // 2
+        pos_emb = self.pos_enc[:, center_idx - hidden_states.size(1) + 1 : center_idx + hidden_states.size(1)]
+        return self.dropout(hidden_states), self.dropout(pos_emb)
+
+
+class FastSpeech2ConformerEncoder(nn.Module):
+    """
+    FastSpeech2ConformerEncoder encoder module.
+
+    Args:
+        config (`FastSpeech2ConformerConfig`):
+            FastSpeech2ConformerConfig instance.
+        module_config (`dict`):
+            Dictionary containing the encoder or decoder module configuration from the `FastSpeech2ConformerConfig`.
+        use_encoder_input_layer (`bool`, *optional*, defaults to `False`):
+            Input layer type.
+    """
+
+    def __init__(
+        self,
+        config: FastSpeech2ConformerConfig,
+        module_config,
+        use_encoder_input_layer=False,
+    ):
+        super().__init__()
+
+        self.embed = None
+        if use_encoder_input_layer:
+            self.embed = nn.Embedding(
+                num_embeddings=config.vocab_size, embedding_dim=config.hidden_size, padding_idx=0
+            )
+
+        self.pos_enc = FastSpeech2ConformerRelPositionalEncoding(config, module_config)
+
+        self.conformer_layers = nn.ModuleList(
+            [FastSpeech2ConformerEncoderLayer(config, module_config) for _ in range(module_config["layers"])]
+        )
+
+    def forward(
+        self,
+        input_tensor: torch.LongTensor,
+        attention_mask: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        output_attentions: Optional[bool] = False,
+        return_dict: Optional[bool] = None,
+    ):
+        """
+        Args:
+            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
+                provide it.
+
+                Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+                [`PreTrainedTokenizer.__call__`] for details.
+
+                [What are input IDs?](../glossary#input-ids)
+            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
+
+                [What are attention masks?](../glossary#attention-mask)
+            output_hidden_states (`bool`, *optional*):
+                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+                for more detail.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            return_dict (`bool`, *optional*):
+                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+        Returns:
+            `torch.Tensor`:
+                Output tensor of shape `(batch, time, attention_dim)`.
+        """
+        feature_representation = input_tensor
+        if self.embed is not None:
+            feature_representation = self.embed(feature_representation)
+
+        hidden_states, pos_emb = self.pos_enc(feature_representation)
+
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attentions = () if output_attentions else None
+
+        for conformer_layer in self.conformer_layers:
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)
+
+            layer_outputs = conformer_layer(hidden_states, pos_emb, attention_mask, output_attentions)
+            hidden_states = layer_outputs[0]
+
+            if output_attentions:
+                all_self_attentions = all_self_attentions + (layer_outputs[1],)
+
+        # Add last layer
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+
+        if not return_dict:
+            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
+        return BaseModelOutput(
+            last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_self_attentions
+        )
+
+
+class FastSpeech2ConformerLoss(nn.Module):
+    """Training loss: spectrogram L1 loss plus duration, pitch and energy MSE losses, summed into one value."""
+
+    def __init__(self, config: FastSpeech2ConformerConfig):
+        super().__init__()
+        use_masking = config.use_masking
+        use_weighted_masking = config.use_weighted_masking
+
+        if use_masking and use_weighted_masking:
+            raise ValueError("Either use_masking or use_weighted_masking can be True, but not both.")
+
+        self.use_masking = use_masking
+        self.use_weighted_masking = use_weighted_masking
+        # define criterions; weighted masking needs un-reduced losses because forward() weights them element-wise
+        reduction = "none" if self.use_weighted_masking else "mean"
+        self.l1_criterion = nn.L1Loss(reduction=reduction)
+        self.mse_criterion = nn.MSELoss(reduction=reduction)
+        self.duration_criterion = nn.MSELoss(reduction=reduction)
+        self.log_domain_offset = 1.0  # added to the duration labels before taking their log in forward()
+
+    def forward(
+        self,
+        outputs_after_postnet,
+        outputs_before_postnet,
+        duration_outputs,
+        pitch_outputs,
+        energy_outputs,
+        spectrogram_labels,
+        duration_labels,
+        pitch_labels,
+        energy_labels,
+        duration_mask,
+        spectrogram_mask,
+    ):
+        """
+        Args:
+            outputs_after_postnet (`torch.Tensor` of shape `(batch_size, max_spectrogram_length, num_mel_bins)`):
+                Batch of outputs after postnet.
+            outputs_before_postnet (`torch.Tensor` of shape `(batch_size, max_spectrogram_length, num_mel_bins)`):
+                Batch of outputs before postnet.
+            duration_outputs (`torch.LongTensor` of shape `(batch_size, max_text_length)`):
+                Batch of outputs of duration predictor.
+            pitch_outputs (`torch.Tensor` of shape `(batch_size, max_text_length, 1)`):
+                Batch of outputs of pitch predictor.
+            energy_outputs (`torch.Tensor` of shape `(batch_size, max_text_length, 1)`):
+                Batch of outputs of energy predictor.
+            spectrogram_labels (`torch.Tensor` of shape `(batch_size, max_spectrogram_length, num_mel_bins)`):
+                Batch of target features.
+            duration_labels (`torch.LongTensor` of shape `(batch_size, max_text_length)`): Batch of durations.
+            pitch_labels (`torch.Tensor` of shape `(batch_size, max_text_length, 1)`):
+                Batch of target token-averaged pitch.
+            energy_labels (`torch.Tensor` of shape `(batch_size, max_text_length, 1)`):
+                Batch of target token-averaged energy.
+            duration_mask (`torch.LongTensor`):
+                Mask used to discern which values the duration loss should be calculated for.
+            spectrogram_mask (`torch.LongTensor`):
+                Mask used to discern which values the spectrogram loss should be calculated for.
+
+        Returns:
+            `torch.FloatTensor`: A single tensor holding the sum of the spectrogram L1 loss, the duration predictor
+            loss, the pitch predictor loss, and the energy predictor loss.
+
+        """
+        pitch_and_energy_masks = duration_mask.unsqueeze(-1)  # trailing dim to line up with the `(..., 1)` inputs
+
+        # apply mask to remove padded part
+        if self.use_masking:
+            outputs_before_postnet = outputs_before_postnet.masked_select(spectrogram_mask)
+            if outputs_after_postnet is not None:
+                outputs_after_postnet = outputs_after_postnet.masked_select(spectrogram_mask)
+            spectrogram_labels = spectrogram_labels.masked_select(spectrogram_mask)
+            duration_outputs = duration_outputs.masked_select(duration_mask)
+            duration_labels = duration_labels.masked_select(duration_mask)
+            pitch_outputs = pitch_outputs.masked_select(pitch_and_energy_masks)
+            energy_outputs = energy_outputs.masked_select(pitch_and_energy_masks)
+            pitch_labels = pitch_labels.masked_select(pitch_and_energy_masks)
+            energy_labels = energy_labels.masked_select(pitch_and_energy_masks)
+
+        # calculate loss
+        l1_loss = self.l1_criterion(outputs_before_postnet, spectrogram_labels)
+        if outputs_after_postnet is not None:
+            l1_loss = l1_loss + self.l1_criterion(outputs_after_postnet, spectrogram_labels)
+        duration_labels = torch.log(duration_labels.float() + self.log_domain_offset)  # compare durations in log domain
+        duration_loss = self.duration_criterion(duration_outputs, duration_labels)
+        pitch_loss = self.mse_criterion(pitch_outputs, pitch_labels)
+        energy_loss = self.mse_criterion(energy_outputs, energy_labels)
+
+        # make weighted mask and apply it
+        if self.use_weighted_masking:
+            spectrogram_mask = nn.functional.pad(  # grow the mask along its length axis up to the label length
+                spectrogram_mask.transpose(1, 2),
+                [0, spectrogram_labels.size(1) - spectrogram_mask.size(1), 0, 0, 0, 0],
+                value=False,
+            ).transpose(1, 2)
+            # each valid position is weighted by 1 / (valid positions in its sample), then by the normalizer below
+            out_weights = spectrogram_mask.float() / spectrogram_mask.sum(dim=1, keepdim=True).float()
+            out_weights /= spectrogram_labels.size(0) * spectrogram_labels.size(2)
+            duration_weights = duration_mask.float() / duration_mask.sum(dim=1, keepdim=True).float()
+            duration_weights /= duration_labels.size(0)
+
+            # apply weight
+            l1_loss = l1_loss.mul(out_weights).masked_select(spectrogram_mask).sum()
+            duration_loss = duration_loss.mul(duration_weights).masked_select(duration_mask).sum()
+            pitch_weights = duration_weights.unsqueeze(-1)
+            pitch_loss = pitch_loss.mul(pitch_weights).masked_select(pitch_and_energy_masks).sum()
+            energy_loss = energy_loss.mul(pitch_weights).masked_select(pitch_and_energy_masks).sum()
+
+        return l1_loss + duration_loss + pitch_loss + energy_loss
+
+
+@auto_docstring
+class FastSpeech2ConformerPreTrainedModel(PreTrainedModel):
+    config: FastSpeech2ConformerConfig
+    base_model_prefix = "fastspeech2_conformer"
+
+    main_input_name = "input_ids"
+
+    def _init_weights(self, module):
+        """Initialize the weights of `module` in place, by layer type; other module types are left untouched."""
+        if isinstance(module, nn.Linear):
+            nn.init.normal_(module.weight, std=1.0 / math.sqrt(module.weight.size(1)))  # std = 1 / sqrt(fan-in)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Conv1d):
+            nn.init.kaiming_normal_(module.weight)
+            if module.bias is not None:
+                key = math.sqrt(module.groups / (module.in_channels * module.kernel_size[0]))  # uniform bias bound
+                nn.init.uniform_(module.bias, a=-key, b=key)
+        elif isinstance(module, (nn.LayerNorm, nn.BatchNorm1d)):  # identity affine transform: weight 1, bias 0
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_()
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()  # keep the padding row at zero
+        elif isinstance(module, FastSpeech2ConformerAttention):  # its two positional bias parameters
+            nn.init.xavier_uniform_(module.pos_bias_u)
+            nn.init.xavier_uniform_(module.pos_bias_v)
+
+    def _set_gradient_checkpointing(self, module, value=False):
+        if isinstance(module, FastSpeech2ConformerEncoder):  # the flag is only set on encoder-class modules
+            module.gradient_checkpointing = value
+
+
+@auto_docstring(
+    custom_intro="""
+    FastSpeech2Conformer Model.
+    """
+)
+class FastSpeech2ConformerModel(FastSpeech2ConformerPreTrainedModel):
+    """
+    FastSpeech 2 module.
+
+    This is a module of FastSpeech 2 described in 'FastSpeech 2: Fast and High-Quality End-to-End Text to Speech'
+    https://huggingface.co/papers/2006.04558. Instead of quantized pitch and energy, we use token-averaged value introduced in
+    FastPitch: Parallel Text-to-speech with Pitch Prediction. The encoder and decoder are Conformers instead of regular
+    Transformers.
+    """
+
+    def __init__(self, config: FastSpeech2ConformerConfig):
+        super().__init__(config)
+        self.config = config
+
+        # store hyperparameters
+        self.vocab_size = config.vocab_size
+        self.num_mel_bins = config.num_mel_bins
+        self.hidden_size = config.hidden_size
+        self.reduction_factor = config.reduction_factor
+        self.stop_gradient_from_pitch_predictor = config.stop_gradient_from_pitch_predictor
+        self.stop_gradient_from_energy_predictor = config.stop_gradient_from_energy_predictor
+
+        self.multilingual_model = config.num_languages is not None and config.num_languages > 1
+        if self.multilingual_model:
+            self.language_id_embedding = torch.nn.Embedding(config.num_languages, self.hidden_size)
+
+        self.multispeaker_model = config.num_speakers is not None and config.num_speakers > 1
+        if self.multispeaker_model:
+            self.speaker_id_embedding = torch.nn.Embedding(config.num_speakers, config.hidden_size)
+
+        self.speaker_embed_dim = config.speaker_embed_dim
+        if self.speaker_embed_dim:
+            self.projection = nn.Linear(config.hidden_size + self.speaker_embed_dim, config.hidden_size)
+
+        self.encoder = FastSpeech2ConformerEncoder(config, config.encoder_config, use_encoder_input_layer=True)
+
+        self.duration_predictor = FastSpeech2ConformerDurationPredictor(config)
+
+        self.pitch_predictor = FastSpeech2ConformerVariancePredictor(
+            config,
+            num_layers=config.pitch_predictor_layers,
+            num_chans=config.pitch_predictor_channels,
+            kernel_size=config.pitch_predictor_kernel_size,
+            dropout_rate=config.pitch_predictor_dropout,
+        )
+        # continuous pitch + FastPitch style avg
+        self.pitch_embed = FastSpeech2ConformerVarianceEmbedding(
+            out_channels=self.hidden_size,
+            kernel_size=config.pitch_embed_kernel_size,
+            padding=(config.pitch_embed_kernel_size - 1) // 2,
+            dropout_rate=config.pitch_embed_dropout,
+        )
+
+        self.energy_predictor = FastSpeech2ConformerVariancePredictor(
+            config,
+            num_layers=config.energy_predictor_layers,
+            num_chans=config.energy_predictor_channels,
+            kernel_size=config.energy_predictor_kernel_size,
+            dropout_rate=config.energy_predictor_dropout,
+        )
+        # continuous energy + FastPitch style avg
+        self.energy_embed = FastSpeech2ConformerVarianceEmbedding(
+            out_channels=self.hidden_size,
+            kernel_size=config.energy_embed_kernel_size,
+            padding=(config.energy_embed_kernel_size - 1) // 2,
+            dropout_rate=config.energy_embed_dropout,
+        )
+
+        # The decoder is an encoder
+        self.decoder = FastSpeech2ConformerEncoder(config, config.decoder_config, use_encoder_input_layer=False)
+
+        self.speech_decoder_postnet = FastSpeech2ConformerSpeechDecoderPostnet(config)
+
+        self.criterion = FastSpeech2ConformerLoss(config)
+
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: torch.LongTensor,
+        attention_mask: Optional[torch.LongTensor] = None,
+        spectrogram_labels: Optional[torch.FloatTensor] = None,
+        duration_labels: Optional[torch.LongTensor] = None,
+        pitch_labels: Optional[torch.FloatTensor] = None,
+        energy_labels: Optional[torch.FloatTensor] = None,
+        speaker_ids: Optional[torch.LongTensor] = None,
+        lang_ids: Optional[torch.LongTensor] = None,
+        speaker_embedding: Optional[torch.FloatTensor] = None,
+        return_dict: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+    ) -> Union[tuple, FastSpeech2ConformerModelOutput]:
+        r"""
+        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+            Input sequence of text vectors.
+        spectrogram_labels (`torch.FloatTensor` of shape `(batch_size, max_spectrogram_length, num_mel_bins)`, *optional*, defaults to `None`):
+            Batch of padded target features.
+        duration_labels (`torch.LongTensor` of shape `(batch_size, sequence_length + 1)`, *optional*, defaults to `None`):
+            Batch of padded durations.
+        pitch_labels (`torch.FloatTensor` of shape `(batch_size, sequence_length + 1, 1)`, *optional*, defaults to `None`):
+            Batch of padded token-averaged pitch.
+        energy_labels (`torch.FloatTensor` of shape `(batch_size, sequence_length + 1, 1)`, *optional*, defaults to `None`):
+            Batch of padded token-averaged energy.
+        speaker_ids (`torch.LongTensor` of shape `(batch_size, 1)`, *optional*, defaults to `None`):
+            Speaker ids used to condition features of speech output by the model.
+        lang_ids (`torch.LongTensor` of shape `(batch_size, 1)`, *optional*, defaults to `None`):
+            Language ids used to condition features of speech output by the model.
+        speaker_embedding (`torch.FloatTensor` of shape `(batch_size, embedding_dim)`, *optional*, defaults to `None`):
+            Embedding containing conditioning signals for the features of the speech.
+
+        Example:
+
+        ```python
+        >>> from transformers import (
+        ...     FastSpeech2ConformerTokenizer,
+        ...     FastSpeech2ConformerModel,
+        ...     FastSpeech2ConformerHifiGan,
+        ... )
+
+        >>> tokenizer = FastSpeech2ConformerTokenizer.from_pretrained("espnet/fastspeech2_conformer")
+        >>> inputs = tokenizer("some text to convert to speech", return_tensors="pt")
+        >>> input_ids = inputs["input_ids"]
+
+        >>> model = FastSpeech2ConformerModel.from_pretrained("espnet/fastspeech2_conformer")
+        >>> output_dict = model(input_ids, return_dict=True)
+        >>> spectrogram = output_dict["spectrogram"]
+
+        >>> vocoder = FastSpeech2ConformerHifiGan.from_pretrained("espnet/fastspeech2_conformer_hifigan")
+        >>> waveform = vocoder(spectrogram)
+        >>> print(waveform.shape)
+        torch.Size([1, 49664])
+        ```
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+
+        if attention_mask is None:
+            attention_mask = torch.ones(input_ids.shape, device=input_ids.device)
+
+        has_missing_labels = (
+            spectrogram_labels is None or duration_labels is None or pitch_labels is None or energy_labels is None
+        )
+        if self.training and has_missing_labels:
+            raise ValueError("All labels must be provided to run in training mode.")
+
+        # forward encoder
+        text_masks = attention_mask.unsqueeze(-2)
+
+        encoder_outputs = self.encoder(
+            input_ids,
+            text_masks,
+            output_hidden_states=output_hidden_states,
+            output_attentions=output_attentions,
+            return_dict=return_dict,
+        )
+        hidden_states = encoder_outputs[0]
+
+        # Integrate with language id, speaker id, and speaker embedding
+        if self.multispeaker_model and speaker_ids is not None:
+            speaker_id_embeddings = self.speaker_id_embedding(speaker_ids.view(-1))
+            hidden_states = hidden_states + speaker_id_embeddings.unsqueeze(1)
+
+        if self.multilingual_model and lang_ids is not None:
+            language_id_embbedings = self.language_id_embedding(lang_ids.view(-1))
+            hidden_states = hidden_states + language_id_embbedings.unsqueeze(1)
+
+        if self.speaker_embed_dim is not None and speaker_embedding is not None:
+            embeddings_expanded = (
+                nn.functional.normalize(speaker_embedding).unsqueeze(1).expand(-1, hidden_states.size(1), -1)
+            )
+            hidden_states = self.projection(torch.cat([hidden_states, embeddings_expanded], dim=-1))
+
+        # forward duration predictor and variance predictors
+        duration_mask = ~attention_mask.bool()
+
+        if self.stop_gradient_from_pitch_predictor:
+            pitch_predictions = self.pitch_predictor(hidden_states.detach(), duration_mask.unsqueeze(-1))
+        else:
+            pitch_predictions = self.pitch_predictor(hidden_states, duration_mask.unsqueeze(-1))
+
+        if self.stop_gradient_from_energy_predictor:
+            energy_predictions = self.energy_predictor(hidden_states.detach(), duration_mask.unsqueeze(-1))
+        else:
+            energy_predictions = self.energy_predictor(hidden_states, duration_mask.unsqueeze(-1))
+
+        duration_predictions = self.duration_predictor(hidden_states)
+        duration_predictions = duration_predictions.masked_fill(duration_mask, 0.0)
+
+        if not self.training:
+            # use prediction in inference
+            embedded_pitch_curve = self.pitch_embed(pitch_predictions)
+            embedded_energy_curve = self.energy_embed(energy_predictions)
+            hidden_states = hidden_states + embedded_energy_curve + embedded_pitch_curve
+            hidden_states = length_regulator(hidden_states, duration_predictions, self.config.speaking_speed)
+        else:
+            # use groundtruth in training
+            embedded_pitch_curve = self.pitch_embed(pitch_labels)
+            embedded_energy_curve = self.energy_embed(energy_labels)
+            hidden_states = hidden_states + embedded_energy_curve + embedded_pitch_curve
+            hidden_states = length_regulator(hidden_states, duration_labels)
+
+        # forward decoder
+        if not self.training:
+            hidden_mask = None
+        else:
+            spectrogram_mask = (spectrogram_labels != -100).any(dim=-1)
+            spectrogram_mask = spectrogram_mask.int()
+            if self.reduction_factor > 1:
+                length_dim = spectrogram_mask.shape[1] - spectrogram_mask.shape[1] % self.reduction_factor
+                spectrogram_mask = spectrogram_mask[:, :, :length_dim]
+            hidden_mask = spectrogram_mask.unsqueeze(-2)
+
+        decoder_outputs = self.decoder(
+            hidden_states,
+            hidden_mask,
+            output_hidden_states=output_hidden_states,
+            output_attentions=output_attentions,
+            return_dict=return_dict,
+        )
+
+        outputs_before_postnet, outputs_after_postnet = self.speech_decoder_postnet(decoder_outputs[0])
+
+        loss = None
+        if self.training:
+            # calculate loss
+            loss_duration_mask = ~duration_mask
+            loss_spectrogram_mask = spectrogram_mask.unsqueeze(-1).bool()
+            loss = self.criterion(
+                outputs_after_postnet=outputs_after_postnet,
+                outputs_before_postnet=outputs_before_postnet,
+                duration_outputs=duration_predictions,
+                pitch_outputs=pitch_predictions,
+                energy_outputs=energy_predictions,
+                spectrogram_labels=spectrogram_labels,
+                duration_labels=duration_labels,
+                pitch_labels=pitch_labels,
+                energy_labels=energy_labels,
+                duration_mask=loss_duration_mask,
+                spectrogram_mask=loss_spectrogram_mask,
+            )
+
+        if not return_dict:
+            postnet_outputs = (outputs_after_postnet,)
+            audio_feature_predictions = (
+                duration_predictions,
+                pitch_predictions,
+                energy_predictions,
+            )
+            outputs = postnet_outputs + encoder_outputs + decoder_outputs[1:] + audio_feature_predictions
+            return ((loss,) + outputs) if loss is not None else outputs
+
+        return FastSpeech2ConformerModelOutput(
+            loss=loss,
+            spectrogram=outputs_after_postnet,
+            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
+            encoder_hidden_states=encoder_outputs.hidden_states,
+            encoder_attentions=encoder_outputs.attentions,
+            decoder_hidden_states=decoder_outputs.hidden_states,
+            decoder_attentions=decoder_outputs.attentions,
+            duration_outputs=duration_predictions,
+            pitch_outputs=pitch_predictions,
+            energy_outputs=energy_predictions,
+        )
+
+
+# Copied from transformers.models.speecht5.modeling_speecht5.HifiGanResidualBlock
+class HifiGanResidualBlock(nn.Module):
+    """Residual block pairing dilated convolutions (convs1) with non-dilated ones (convs2), one pair per dilation."""
+    def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5), leaky_relu_slope=0.1):
+        super().__init__()
+        self.leaky_relu_slope = leaky_relu_slope
+        self.convs1 = nn.ModuleList(
+            [
+                nn.Conv1d(
+                    channels,
+                    channels,
+                    kernel_size,
+                    stride=1,
+                    dilation=dilation[i],
+                    padding=self.get_padding(kernel_size, dilation[i]),
+                )
+                for i in range(len(dilation))
+            ]
+        )
+        self.convs2 = nn.ModuleList(
+            [
+                nn.Conv1d(
+                    channels,
+                    channels,
+                    kernel_size,
+                    stride=1,
+                    dilation=1,
+                    padding=self.get_padding(kernel_size, 1),
+                )
+                for _ in range(len(dilation))
+            ]
+        )
+
+    def get_padding(self, kernel_size, dilation=1):
+        return (kernel_size * dilation - dilation) // 2  # = dilation * (kernel_size - 1) // 2
+
+    def apply_weight_norm(self):
+        weight_norm = nn.utils.weight_norm
+        if hasattr(nn.utils.parametrizations, "weight_norm"):  # prefer the parametrization-based variant if present
+            weight_norm = nn.utils.parametrizations.weight_norm
+
+        for layer in self.convs1:
+            weight_norm(layer)
+        for layer in self.convs2:
+            weight_norm(layer)
+
+    def remove_weight_norm(self):  # NOTE(review): may not undo the parametrization-based weight_norm - verify
+        for layer in self.convs1:
+            nn.utils.remove_weight_norm(layer)
+        for layer in self.convs2:
+            nn.utils.remove_weight_norm(layer)
+
+    def forward(self, hidden_states):
+        for conv1, conv2 in zip(self.convs1, self.convs2):
+            residual = hidden_states  # input of this conv pair, added back after it
+            hidden_states = nn.functional.leaky_relu(hidden_states, self.leaky_relu_slope)
+            hidden_states = conv1(hidden_states)
+            hidden_states = nn.functional.leaky_relu(hidden_states, self.leaky_relu_slope)
+            hidden_states = conv2(hidden_states)
+            hidden_states = hidden_states + residual
+        return hidden_states
+
+
+@auto_docstring(
+    custom_intro="""
+    HiFi-GAN vocoder.
+    """
+)
+# Copied from transformers.models.speecht5.modeling_speecht5.SpeechT5HifiGan with SpeechT5->FastSpeech2Conformer
+class FastSpeech2ConformerHifiGan(PreTrainedModel):
+    config: FastSpeech2ConformerHifiGanConfig
+    main_input_name = "spectrogram"
+
+    def __init__(self, config: FastSpeech2ConformerHifiGanConfig):
+        super().__init__(config)
+        self.num_kernels = len(config.resblock_kernel_sizes)
+        self.num_upsamples = len(config.upsample_rates)
+        self.conv_pre = nn.Conv1d(
+            config.model_in_dim,
+            config.upsample_initial_channel,
+            kernel_size=7,
+            stride=1,
+            padding=3,
+        )
+
+        self.upsampler = nn.ModuleList()  # each stage halves the channel count
+        for i, (upsample_rate, kernel_size) in enumerate(zip(config.upsample_rates, config.upsample_kernel_sizes)):
+            self.upsampler.append(
+                nn.ConvTranspose1d(
+                    config.upsample_initial_channel // (2**i),
+                    config.upsample_initial_channel // (2 ** (i + 1)),
+                    kernel_size=kernel_size,
+                    stride=upsample_rate,
+                    padding=(kernel_size - upsample_rate) // 2,
+                )
+            )
+
+        self.resblocks = nn.ModuleList()  # flat list: num_kernels consecutive blocks per upsampling stage
+        for i in range(len(self.upsampler)):
+            channels = config.upsample_initial_channel // (2 ** (i + 1))
+            for kernel_size, dilation in zip(config.resblock_kernel_sizes, config.resblock_dilation_sizes):
+                self.resblocks.append(HifiGanResidualBlock(channels, kernel_size, dilation, config.leaky_relu_slope))
+
+        self.conv_post = nn.Conv1d(channels, 1, kernel_size=7, stride=1, padding=3)  # channels of the last stage
+
+        self.register_buffer("mean", torch.zeros(config.model_in_dim))
+        self.register_buffer("scale", torch.ones(config.model_in_dim))
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def _init_weights(self, module: nn.Module):
+        """Initialize the weights."""
+        if isinstance(module, (nn.Conv1d, nn.ConvTranspose1d)):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.bias is not None:
+                module.bias.data.zero_()
+
+    def apply_weight_norm(self):
+        weight_norm = nn.utils.weight_norm
+        if hasattr(nn.utils.parametrizations, "weight_norm"):  # prefer the parametrization-based variant if present
+            weight_norm = nn.utils.parametrizations.weight_norm
+
+        weight_norm(self.conv_pre)
+        for layer in self.upsampler:
+            weight_norm(layer)
+        for layer in self.resblocks:
+            layer.apply_weight_norm()
+        weight_norm(self.conv_post)
+
+    def remove_weight_norm(self):  # NOTE(review): may not undo the parametrization-based weight_norm - verify
+        nn.utils.remove_weight_norm(self.conv_pre)
+        for layer in self.upsampler:
+            nn.utils.remove_weight_norm(layer)
+        for layer in self.resblocks:
+            layer.remove_weight_norm()
+        nn.utils.remove_weight_norm(self.conv_post)
+
+    @auto_docstring(
+        custom_intro="""
+        Converts a log-mel spectrogram into a speech waveform. Passing a batch of log-mel spectrograms returns a batch
+        of speech waveforms. Passing a single, un-batched log-mel spectrogram returns a single, un-batched speech
+        waveform.
+        """
+    )
+    def forward(self, spectrogram: torch.FloatTensor) -> torch.FloatTensor:
+        r"""
+        spectrogram (`torch.FloatTensor`):
+            Tensor containing the log-mel spectrograms. Can be batched and of shape `(batch_size, sequence_length,
+            config.model_in_dim)`, or un-batched and of shape `(sequence_length, config.model_in_dim)`.
+
+        Returns:
+            `torch.FloatTensor`: Tensor containing the speech waveform. If the input spectrogram is batched, will be of
+            shape `(batch_size, num_frames,)`. If un-batched, will be of shape `(num_frames,)`.
+        """
+        if self.config.normalize_before:
+            spectrogram = (spectrogram - self.mean) / self.scale
+
+        is_batched = spectrogram.dim() == 3
+        if not is_batched:
+            spectrogram = spectrogram.unsqueeze(0)
+
+        hidden_states = spectrogram.transpose(2, 1)  # swap the last two dims so features come before time for Conv1d
+
+        hidden_states = self.conv_pre(hidden_states)
+        for i in range(self.num_upsamples):
+            hidden_states = nn.functional.leaky_relu(hidden_states, self.config.leaky_relu_slope)
+            hidden_states = self.upsampler[i](hidden_states)
+
+            res_state = self.resblocks[i * self.num_kernels](hidden_states)  # first residual block of stage i
+            for j in range(1, self.num_kernels):
+                res_state += self.resblocks[i * self.num_kernels + j](hidden_states)
+            hidden_states = res_state / self.num_kernels  # average over this stage's residual blocks
+
+        hidden_states = nn.functional.leaky_relu(hidden_states)  # no slope passed here: torch's default is used
+        hidden_states = self.conv_post(hidden_states)
+        hidden_states = torch.tanh(hidden_states)
+
+        if not is_batched:
+            # remove batch dim and collapse tensor to 1-d audio waveform
+            waveform = hidden_states.squeeze(0).transpose(1, 0).view(-1)
+        else:
+            # drop dim 1, the single output channel produced by conv_post
+            waveform = hidden_states.squeeze(1)
+
+        return waveform
+
+
+@auto_docstring(
+    custom_intro="""
+    The FastSpeech2ConformerModel with a FastSpeech2ConformerHifiGan vocoder head that performs text-to-speech (waveform).
+    """
+)
+class FastSpeech2ConformerWithHifiGan(PreTrainedModel):
+    config: FastSpeech2ConformerWithHifiGanConfig
+
+    def __init__(self, config: FastSpeech2ConformerWithHifiGanConfig):
+        super().__init__(config)
+
+        self.model = FastSpeech2ConformerModel(config.model_config)
+        self.vocoder = FastSpeech2ConformerHifiGan(config.vocoder_config)
+
+        self.config = config
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: torch.LongTensor,
+        attention_mask: Optional[torch.LongTensor] = None,
+        spectrogram_labels: Optional[torch.FloatTensor] = None,
+        duration_labels: Optional[torch.LongTensor] = None,
+        pitch_labels: Optional[torch.FloatTensor] = None,
+        energy_labels: Optional[torch.FloatTensor] = None,
+        speaker_ids: Optional[torch.LongTensor] = None,
+        lang_ids: Optional[torch.LongTensor] = None,
+        speaker_embedding: Optional[torch.FloatTensor] = None,
+        return_dict: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+    ) -> Union[tuple, FastSpeech2ConformerModelOutput]:
+        r"""
+        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+            Input sequence of text vectors.
+        spectrogram_labels (`torch.FloatTensor` of shape `(batch_size, max_spectrogram_length, num_mel_bins)`, *optional*, defaults to `None`):
+            Batch of padded target features.
+        duration_labels (`torch.LongTensor` of shape `(batch_size, sequence_length + 1)`, *optional*, defaults to `None`):
+            Batch of padded durations.
+        pitch_labels (`torch.FloatTensor` of shape `(batch_size, sequence_length + 1, 1)`, *optional*, defaults to `None`):
+            Batch of padded token-averaged pitch.
+        energy_labels (`torch.FloatTensor` of shape `(batch_size, sequence_length + 1, 1)`, *optional*, defaults to `None`):
+            Batch of padded token-averaged energy.
+        speaker_ids (`torch.LongTensor` of shape `(batch_size, 1)`, *optional*, defaults to `None`):
+            Speaker ids used to condition features of speech output by the model.
+        lang_ids (`torch.LongTensor` of shape `(batch_size, 1)`, *optional*, defaults to `None`):
+            Language ids used to condition features of speech output by the model.
+        speaker_embedding (`torch.FloatTensor` of shape `(batch_size, embedding_dim)`, *optional*, defaults to `None`):
+            Embedding containing conditioning signals for the features of the speech.
+
+        Example:
+
+        ```python
+        >>> from transformers import (
+        ...     FastSpeech2ConformerTokenizer,
+        ...     FastSpeech2ConformerWithHifiGan,
+        ... )
+
+        >>> tokenizer = FastSpeech2ConformerTokenizer.from_pretrained("espnet/fastspeech2_conformer")
+        >>> inputs = tokenizer("some text to convert to speech", return_tensors="pt")
+        >>> input_ids = inputs["input_ids"]
+
+        >>> model = FastSpeech2ConformerWithHifiGan.from_pretrained("espnet/fastspeech2_conformer_with_hifigan")
+        >>> output_dict = model(input_ids, return_dict=True)
+        >>> waveform = output_dict["waveform"]
+        >>> print(waveform.shape)
+        torch.Size([1, 49664])
+        ```
+        """
+        return_dict = return_dict if return_dict is not None else self.config.model_config.use_return_dict
+        output_attentions = (
+            output_attentions if output_attentions is not None else self.config.model_config.output_attentions
+        )
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.model_config.output_hidden_states
+        )
+
+        model_outputs = self.model(
+            input_ids,
+            attention_mask,
+            spectrogram_labels=spectrogram_labels,
+            duration_labels=duration_labels,
+            pitch_labels=pitch_labels,
+            energy_labels=energy_labels,
+            speaker_ids=speaker_ids,
+            lang_ids=lang_ids,
+            speaker_embedding=speaker_embedding,
+            return_dict=return_dict,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+        )
+
+        if not return_dict:
+            has_missing_labels = (
+                spectrogram_labels is None or duration_labels is None or pitch_labels is None or energy_labels is None
+            )
+            if has_missing_labels:
+                spectrogram = model_outputs[0]
+            else:
+                spectrogram = model_outputs[1]
+        else:
+            spectrogram = model_outputs["spectrogram"]
+        waveform = self.vocoder(spectrogram)
+
+        if not return_dict:
+            return model_outputs + (waveform,)
+
+        return FastSpeech2ConformerWithHifiGanOutput(waveform=waveform, **model_outputs)
+
+
+__all__ = [  # names exported by a star-import of this module
+    "FastSpeech2ConformerWithHifiGan",
+    "FastSpeech2ConformerHifiGan",
+    "FastSpeech2ConformerModel",
+    "FastSpeech2ConformerPreTrainedModel",
+]
diff --git a/phivenv/Lib/site-packages/transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py b/phivenv/Lib/site-packages/transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..004a1c36f59cc7942a7a132012bdccb40a4a38de
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py
@@ -0,0 +1,188 @@
+# coding=utf-8
+# Copyright 2023 The HuggingFace Team and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization classes for FastSpeech2Conformer."""
+
+import json
+import os
+from typing import Optional
+
+import regex
+
+from ...tokenization_utils import PreTrainedTokenizer
+from ...utils import logging, requires_backends
+
+
+logger = logging.get_logger(__name__)
+
+VOCAB_FILES_NAMES = {"vocab_file": "vocab.json"}
+
+
+class FastSpeech2ConformerTokenizer(PreTrainedTokenizer):
+    """
+    Construct a FastSpeech2Conformer tokenizer.
+
+    Args:
+        vocab_file (`str`):
+            Path to the vocabulary file.
+        bos_token (`str`, *optional*, defaults to `"<sos/eos>"`):
+            The beginning of sequence token. Note that for FastSpeech2, it is the same as the `eos_token`.
+        eos_token (`str`, *optional*, defaults to `"<sos/eos>"`):
+            The end of sequence token. Note that for FastSpeech2, it is the same as the `bos_token`.
+        pad_token (`str`, *optional*, defaults to `"<blank>"`):
+            The token used for padding, for example when batching sequences of different lengths.
+        unk_token (`str`, *optional*, defaults to `"<unk>"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        should_strip_spaces (`bool`, *optional*, defaults to `False`):
+            Whether or not to strip the spaces from the list of tokens.
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    model_input_names = ["input_ids", "attention_mask"]
+
+    def __init__(
+        self,
+        vocab_file,
+        bos_token="<sos/eos>",
+        eos_token="<sos/eos>",
+        pad_token="<blank>",
+        unk_token="<unk>",
+        should_strip_spaces=False,
+        **kwargs,
+    ):
+        requires_backends(self, "g2p_en")
+
+        with open(vocab_file, encoding="utf-8") as vocab_handle:
+            self.encoder = json.load(vocab_handle)
+
+        import g2p_en
+
+        self.g2p = g2p_en.G2p()
+
+        self.decoder = {v: k for k, v in self.encoder.items()}
+
+        super().__init__(
+            bos_token=bos_token,
+            eos_token=eos_token,
+            unk_token=unk_token,
+            pad_token=pad_token,
+            should_strip_spaces=should_strip_spaces,
+            **kwargs,
+        )
+
+        self.should_strip_spaces = should_strip_spaces
+
+    @property
+    def vocab_size(self):
+        return len(self.decoder)
+
+    def get_vocab(self):
+        "Returns vocab as a dict"
+        return dict(self.encoder, **self.added_tokens_encoder)
+
+    def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
+        # expand symbols
+        text = regex.sub(";", ",", text)
+        text = regex.sub(":", ",", text)
+        text = regex.sub("-", " ", text)
+        text = regex.sub("&", "and", text)
+
+        # strip unnecessary symbols
+        text = regex.sub(r"[\(\)\[\]\<\>\"]+", "", text)
+
+        # collapse each run of whitespace into a single space
+        text = regex.sub(r"\s+", " ", text)
+
+        text = text.upper()
+
+        return text, kwargs
+
+    def _tokenize(self, text):
+        """Returns a tokenized string."""
+        # phonemize
+        tokens = self.g2p(text)
+
+        if self.should_strip_spaces:
+            tokens = list(filter(lambda s: s != " ", tokens))
+
+        tokens.append(self.eos_token)
+
+        return tokens
+
+    def _convert_token_to_id(self, token):
+        """Converts a token (str) to an id using the vocab."""
+        return self.encoder.get(token, self.encoder.get(self.unk_token))
+
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) to a token (str) using the vocab."""
+        return self.decoder.get(index, self.unk_token)
+
+    # Override since phonemes cannot be converted back to strings
+    def decode(self, token_ids, **kwargs):
+        logger.warning(
+            "Phonemes cannot be reliably converted to a string due to the one-many mapping, converting to tokens instead."
+        )
+        return self.convert_ids_to_tokens(token_ids)
+
+    # Override since phonemes cannot be converted back to strings
+    def convert_tokens_to_string(self, tokens, **kwargs):
+        logger.warning(
+            "Phonemes cannot be reliably converted to a string due to the one-many mapping, returning the tokens."
+        )
+        return tokens
+
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
+        """
+        Save the vocabulary and special tokens file to a directory.
+
+        Args:
+            save_directory (`str`):
+                The directory in which to save the vocabulary.
+
+        Returns:
+            `Tuple(str)`: Paths to the files saved.
+        """
+        if not os.path.isdir(save_directory):
+            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+            return
+        vocab_file = os.path.join(
+            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+        )
+
+        with open(vocab_file, "w", encoding="utf-8") as f:
+            f.write(json.dumps(self.get_vocab(), ensure_ascii=False))
+
+        return (vocab_file,)
+
+    def __getstate__(self):
+        state = self.__dict__.copy()
+        state["g2p"] = None
+        return state
+
+    def __setstate__(self, d):
+        self.__dict__ = d
+
+        try:
+            import g2p_en
+
+            self.g2p = g2p_en.G2p()
+        except ImportError:
+            raise ImportError(
+                "You need to install g2p-en to use FastSpeech2ConformerTokenizer. "
+                "See https://pypi.org/project/g2p-en/ for installation."
+            )
+
+
+__all__ = ["FastSpeech2ConformerTokenizer"]
diff --git a/phivenv/Lib/site-packages/transformers/models/flaubert/__init__.py b/phivenv/Lib/site-packages/transformers/models/flaubert/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e981d9cbcb1e456c206b3bec252df1598e23575a
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/flaubert/__init__.py
@@ -0,0 +1,29 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_flaubert import *
+    from .modeling_flaubert import *
+    from .modeling_tf_flaubert import *
+    from .tokenization_flaubert import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/phivenv/Lib/site-packages/transformers/models/flaubert/__pycache__/__init__.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/flaubert/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b32368b0a7d7c1cb1ce279e8a8d515d1bc41e7e7
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/flaubert/__pycache__/__init__.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/flaubert/__pycache__/configuration_flaubert.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/flaubert/__pycache__/configuration_flaubert.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f8d66b4da16fb3011406c8bb1be7185f9e90f89e
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/flaubert/__pycache__/configuration_flaubert.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/flaubert/__pycache__/modeling_flaubert.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/flaubert/__pycache__/modeling_flaubert.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ed45eaa512fcf155ab3475735389c33e671a6921
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/flaubert/__pycache__/modeling_flaubert.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/flaubert/__pycache__/modeling_tf_flaubert.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/flaubert/__pycache__/modeling_tf_flaubert.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..659f4cd29d8ee479210478e83681a35889c707c0
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/flaubert/__pycache__/modeling_tf_flaubert.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/flaubert/__pycache__/tokenization_flaubert.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/flaubert/__pycache__/tokenization_flaubert.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b84eab929d9c5f018a6b50246e8e8f0857b8fba1
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/flaubert/__pycache__/tokenization_flaubert.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/flaubert/configuration_flaubert.py b/phivenv/Lib/site-packages/transformers/models/flaubert/configuration_flaubert.py
new file mode 100644
index 0000000000000000000000000000000000000000..071a74fe69b420954ff6ce7154b227c0e0d7e4dd
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/flaubert/configuration_flaubert.py
@@ -0,0 +1,235 @@
+# coding=utf-8
+# Copyright 2019-present CNRS, Facebook Inc. and the HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Flaubert configuration"""
+
+from collections import OrderedDict
+from collections.abc import Mapping
+
+from ...configuration_utils import PretrainedConfig
+from ...onnx import OnnxConfig
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class FlaubertConfig(PretrainedConfig):
+    """
+    This is the configuration class to store the configuration of a [`FlaubertModel`] or a [`TFFlaubertModel`]. It is
+    used to instantiate a FlauBERT model according to the specified arguments, defining the model architecture.
+    Instantiating a configuration with the defaults will yield a similar configuration to that of the FlauBERT
+    [flaubert/flaubert_base_uncased](https://huggingface.co/flaubert/flaubert_base_uncased) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        pre_norm (`bool`, *optional*, defaults to `False`):
+            Whether to apply the layer normalization before or after the feed forward layer following the attention in
+            each layer (Vaswani et al., Tensor2Tensor for Neural Machine Translation. 2018)
+        layerdrop (`float`, *optional*, defaults to 0.0):
+            Probability to drop layers during training (Fan et al., Reducing Transformer Depth on Demand with
+            Structured Dropout. ICLR 2020)
+        vocab_size (`int`, *optional*, defaults to 30145):
+            Vocabulary size of the FlauBERT model. Defines the number of different tokens that can be represented by
+            the `input_ids` passed when calling [`FlaubertModel`] or [`TFFlaubertModel`].
+        emb_dim (`int`, *optional*, defaults to 2048):
+            Dimensionality of the encoder layers and the pooler layer.
+        n_layers (`int`, *optional*, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        n_heads (`int`, *optional*, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        dropout (`float`, *optional*, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_dropout (`float`, *optional*, defaults to 0.1):
+            The dropout probability for the attention mechanism
+        gelu_activation (`bool`, *optional*, defaults to `True`):
+            Whether or not to use a *gelu* activation instead of *relu*.
+        sinusoidal_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether or not to use sinusoidal positional embeddings instead of absolute positional embeddings.
+        causal (`bool`, *optional*, defaults to `False`):
+            Whether or not the model should behave in a causal manner. Causal models use a triangular attention mask in
+            order to only attend to the left-side context instead of a bidirectional context.
+        asm (`bool`, *optional*, defaults to `False`):
+            Whether or not to use an adaptive log softmax projection layer instead of a linear layer for the prediction
+            layer.
+        n_langs (`int`, *optional*, defaults to 1):
+            The number of languages the model handles. Set to 1 for monolingual models.
+        use_lang_emb (`bool`, *optional*, defaults to `True`):
+            Whether to use language embeddings. Some models use additional language embeddings, see [the multilingual
+            models page](http://huggingface.co/transformers/multilingual.html#xlm-language-embeddings) for information
+            on how to use them.
+        max_position_embeddings (`int`, *optional*, defaults to 512):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            just in case (e.g., 512 or 1024 or 2048).
+        embed_init_std (`float`, *optional*, defaults to 2048^-0.5):
+            The standard deviation of the truncated_normal_initializer for initializing the embedding matrices.
+        init_std (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices except the
+            embedding matrices.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
+            The epsilon used by the layer normalization layers.
+        bos_index (`int`, *optional*, defaults to 0):
+            The index of the beginning of sentence token in the vocabulary.
+        eos_index (`int`, *optional*, defaults to 1):
+            The index of the end of sentence token in the vocabulary.
+        pad_index (`int`, *optional*, defaults to 2):
+            The index of the padding token in the vocabulary.
+        unk_index (`int`, *optional*, defaults to 3):
+            The index of the unknown token in the vocabulary.
+        mask_index (`int`, *optional*, defaults to 5):
+            The index of the masking token in the vocabulary.
+        is_encoder (`bool`, *optional*, defaults to `True`):
+            Whether or not the initialized model should be a transformer encoder or decoder as seen in Vaswani et al.
+        summary_type (`string`, *optional*, defaults to "first"):
+            Argument used when doing sequence summary. Used in the sequence classification and multiple choice models.
+
+            Has to be one of the following options:
+
+                - `"last"`: Take the last token hidden state (like XLNet).
+                - `"first"`: Take the first token hidden state (like BERT).
+                - `"mean"`: Take the mean of all tokens hidden states.
+                - `"cls_index"`: Supply a Tensor of classification token position (like GPT/GPT-2).
+                - `"attn"`: Not implemented now, use multi-head attention.
+        summary_use_proj (`bool`, *optional*, defaults to `True`):
+            Argument used when doing sequence summary. Used in the sequence classification and multiple choice models.
+
+            Whether or not to add a projection after the vector extraction.
+        summary_activation (`str`, *optional*):
+            Argument used when doing sequence summary. Used in the sequence classification and multiple choice models.
+
+            Pass `"tanh"` for a tanh activation to the output, any other value will result in no activation.
+        summary_proj_to_labels (`bool`, *optional*, defaults to `True`):
+            Used in the sequence classification and multiple choice models.
+
+            Whether the projection outputs should have `config.num_labels` or `config.hidden_size` classes.
+        summary_first_dropout (`float`, *optional*, defaults to 0.1):
+            Used in the sequence classification and multiple choice models.
+
+            The dropout ratio to be used after the projection and activation.
+        start_n_top (`int`, *optional*, defaults to 5):
+            Used in the SQuAD evaluation script.
+        end_n_top (`int`, *optional*, defaults to 5):
+            Used in the SQuAD evaluation script.
+        mask_token_id (`int`, *optional*, defaults to 0):
+            Model agnostic parameter to identify masked tokens when generating text in an MLM context.
+        lang_id (`int`, *optional*, defaults to 0):
+            The ID of the language used by the model. This parameter is used when generating text in a given language.
+    """
+
+    model_type = "flaubert"
+    attribute_map = {
+        "hidden_size": "emb_dim",
+        "num_attention_heads": "n_heads",
+        "num_hidden_layers": "n_layers",
+        "n_words": "vocab_size",  # For backward compatibility
+    }
+
+    def __init__(
+        self,
+        pre_norm=False,
+        layerdrop=0.0,
+        vocab_size=30145,
+        emb_dim=2048,
+        n_layers=12,
+        n_heads=16,
+        dropout=0.1,
+        attention_dropout=0.1,
+        gelu_activation=True,
+        sinusoidal_embeddings=False,
+        causal=False,
+        asm=False,
+        n_langs=1,
+        use_lang_emb=True,
+        max_position_embeddings=512,
+        embed_init_std=2048**-0.5,
+        layer_norm_eps=1e-12,
+        init_std=0.02,
+        bos_index=0,
+        eos_index=1,
+        pad_index=2,
+        unk_index=3,
+        mask_index=5,
+        is_encoder=True,
+        summary_type="first",
+        summary_use_proj=True,
+        summary_activation=None,
+        summary_proj_to_labels=True,
+        summary_first_dropout=0.1,
+        start_n_top=5,
+        end_n_top=5,
+        mask_token_id=0,
+        lang_id=0,
+        pad_token_id=2,
+        bos_token_id=0,
+        **kwargs,
+    ):
+        """Constructs FlaubertConfig."""
+        self.pre_norm = pre_norm
+        self.layerdrop = layerdrop
+        self.vocab_size = vocab_size
+        self.emb_dim = emb_dim
+        self.n_layers = n_layers
+        self.n_heads = n_heads
+        self.dropout = dropout
+        self.attention_dropout = attention_dropout
+        self.gelu_activation = gelu_activation
+        self.sinusoidal_embeddings = sinusoidal_embeddings
+        self.causal = causal
+        self.asm = asm
+        self.n_langs = n_langs
+        self.use_lang_emb = use_lang_emb
+        self.layer_norm_eps = layer_norm_eps
+        self.bos_index = bos_index
+        self.eos_index = eos_index
+        self.pad_index = pad_index
+        self.unk_index = unk_index
+        self.mask_index = mask_index
+        self.is_encoder = is_encoder
+        self.max_position_embeddings = max_position_embeddings
+        self.embed_init_std = embed_init_std
+        self.init_std = init_std
+        self.summary_type = summary_type
+        self.summary_use_proj = summary_use_proj
+        self.summary_activation = summary_activation
+        self.summary_proj_to_labels = summary_proj_to_labels
+        self.summary_first_dropout = summary_first_dropout
+        self.start_n_top = start_n_top
+        self.end_n_top = end_n_top
+        self.mask_token_id = mask_token_id
+        self.lang_id = lang_id
+
+        if "n_words" in kwargs:
+            self.n_words = kwargs["n_words"]
+
+        super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, **kwargs)
+
+
+class FlaubertOnnxConfig(OnnxConfig):
+    @property
+    def inputs(self) -> Mapping[str, Mapping[int, str]]:
+        if self.task == "multiple-choice":
+            dynamic_axis = {0: "batch", 1: "choice", 2: "sequence"}
+        else:
+            dynamic_axis = {0: "batch", 1: "sequence"}
+        return OrderedDict(
+            [
+                ("input_ids", dynamic_axis),
+                ("attention_mask", dynamic_axis),
+            ]
+        )
+
+
+__all__ = ["FlaubertConfig", "FlaubertOnnxConfig"]
diff --git a/phivenv/Lib/site-packages/transformers/models/flaubert/modeling_flaubert.py b/phivenv/Lib/site-packages/transformers/models/flaubert/modeling_flaubert.py
new file mode 100644
index 0000000000000000000000000000000000000000..1dadc6f5377b4007c78f44d81d7a39acf2dedbdb
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/flaubert/modeling_flaubert.py
@@ -0,0 +1,1700 @@
+# coding=utf-8
+# Copyright 2019-present CNRS, Facebook Inc. and the HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch Flaubert model, based on XLM."""
+
+import math
+from dataclasses import dataclass
+from typing import Callable, Optional, Union
+
+import numpy as np
+import torch
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+
+from ...activations import gelu, get_activation
+from ...cache_utils import DynamicCache, EncoderDecoderCache
+from ...generation import GenerationMixin
+from ...modeling_outputs import (
+    BaseModelOutput,
+    MaskedLMOutput,
+    MultipleChoiceModelOutput,
+    QuestionAnsweringModelOutput,
+    SequenceClassifierOutput,
+    TokenClassifierOutput,
+)
+from ...modeling_utils import PreTrainedModel
+from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
+from ...utils import ModelOutput, auto_docstring, logging
+from .configuration_flaubert import FlaubertConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+# Copied from transformers.models.xlm.modeling_xlm.create_sinusoidal_embeddings
+def create_sinusoidal_embeddings(n_pos, dim, out):
+    position_enc = np.array([[pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)] for pos in range(n_pos)])
+    out.requires_grad = False
+    out[:, 0::2] = torch.FloatTensor(np.sin(position_enc[:, 0::2]))
+    out[:, 1::2] = torch.FloatTensor(np.cos(position_enc[:, 1::2]))
+    out.detach_()
+
+
+# Copied from transformers.models.xlm.modeling_xlm.get_masks
+def get_masks(slen, lengths, causal, padding_mask=None):
+    """
+    Generate hidden states mask, and optionally an attention mask.
+    """
+    alen = torch.arange(slen, dtype=torch.long, device=lengths.device)
+    if padding_mask is not None:
+        mask = padding_mask
+    else:
+        assert lengths.max().item() <= slen
+        mask = alen < lengths[:, None]
+
+    # attention mask is the same as mask, or a lower-triangular attention mask (causal)
+    bs = lengths.size(0)
+    if causal:
+        attn_mask = alen[None, None, :].repeat(bs, slen, 1) <= alen[None, :, None]
+    else:
+        attn_mask = mask
+
+    # sanity check
+    assert mask.size() == (bs, slen)
+    assert causal is False or attn_mask.size() == (bs, slen, slen)
+
+    return mask, attn_mask
+
+
+# Copied from transformers.models.xlm.modeling_xlm.MultiHeadAttention
+class MultiHeadAttention(nn.Module):
+    def __init__(self, n_heads, dim, config, layer_idx: int = 0):
+        super().__init__()
+        self.layer_id = layer_idx
+        self.dim = dim
+        self.n_heads = n_heads
+        self.head_dim = dim // n_heads
+        self.dropout = config.attention_dropout
+        assert self.dim % self.n_heads == 0
+
+        self.q_lin = nn.Linear(dim, dim)
+        self.k_lin = nn.Linear(dim, dim)
+        self.v_lin = nn.Linear(dim, dim)
+        self.out_lin = nn.Linear(dim, dim)
+        self.pruned_heads = set()
+
+    def prune_heads(self, heads):
+        attention_head_size = self.dim // self.n_heads
+        if len(heads) == 0:
+            return
+        heads, index = find_pruneable_heads_and_indices(heads, self.n_heads, attention_head_size, self.pruned_heads)
+        # Prune linear layers
+        self.q_lin = prune_linear_layer(self.q_lin, index)
+        self.k_lin = prune_linear_layer(self.k_lin, index)
+        self.v_lin = prune_linear_layer(self.v_lin, index)
+        self.out_lin = prune_linear_layer(self.out_lin, index, dim=1)
+        # Update hyper params
+        self.n_heads = self.n_heads - len(heads)
+        self.dim = attention_head_size * self.n_heads
+        self.pruned_heads = self.pruned_heads.union(heads)
+
+    def forward(
+        self,
+        input,
+        mask,
+        kv=None,
+        cache=None,
+        head_mask=None,
+        output_attentions=False,
+        cache_position=None,
+    ):
+        """
+        Self-attention (if kv is None) or attention over source sentence (provided by kv).
+        """
+        # Input is (bs, qlen, dim)
+        # Mask is (bs, klen) (non-causal) or (bs, klen, klen)
+        bs, qlen, dim = input.size()
+        is_cross_attention = kv is not None
+        mask_reshape = (bs, 1, qlen, -1) if mask.dim() == 3 else (bs, 1, 1, -1)
+
+        q = self.q_lin(input).view(bs, -1, self.n_heads, self.head_dim).transpose(1, 2)
+        if cache is not None:
+            if isinstance(cache, EncoderDecoderCache):
+                is_updated = cache.is_updated.get(self.layer_id)
+                if is_cross_attention:
+                    # after the first generated id, we can subsequently re-use all key/value_states from cache
+                    curr_past_key_value = cache.cross_attention_cache
+                else:
+                    curr_past_key_value = cache.self_attention_cache
+            else:
+                curr_past_key_value = cache
+
+        current_states = kv if is_cross_attention else input
+        if is_cross_attention and cache is not None and is_updated:
+            # reuse k,v, cross_attentions
+            k = curr_past_key_value.key_cache[self.layer_id]
+            v = curr_past_key_value.value_cache[self.layer_id]
+        else:
+            k = self.k_lin(current_states)
+            v = self.v_lin(current_states)
+            k = k.view(bs, -1, self.n_heads, self.head_dim).transpose(1, 2)
+            v = v.view(bs, -1, self.n_heads, self.head_dim).transpose(1, 2)
+
+            if cache is not None:
+                # save all key/value_states to cache to be re-used for fast auto-regressive generation
+                cache_position = cache_position if not is_cross_attention else None
+                k, v = curr_past_key_value.update(k, v, self.layer_id, {"cache_position": cache_position})
+                # set flag that curr layer for cross-attn is already updated so we can re-use in subsequent calls
+                if is_cross_attention:
+                    cache.is_updated[self.layer_id] = True
+
+        q = q / math.sqrt(self.head_dim)  # (bs, n_heads, qlen, head_dim)
+        scores = torch.matmul(q, k.transpose(2, 3))  # (bs, n_heads, qlen, klen)
+        mask = (mask == 0).view(mask_reshape).expand_as(scores)  # (bs, n_heads, qlen, klen)
+        scores.masked_fill_(mask, torch.finfo(scores.dtype).min)  # (bs, n_heads, qlen, klen)
+
+        weights = nn.functional.softmax(scores.float(), dim=-1).type_as(scores)  # (bs, n_heads, qlen, klen)
+        weights = nn.functional.dropout(weights, p=self.dropout, training=self.training)  # (bs, n_heads, qlen, klen)
+
+        # Mask heads if we want to
+        if head_mask is not None:
+            weights = weights * head_mask
+
+        context = torch.matmul(weights, v)  # (bs, n_heads, qlen, head_dim)
+        context = context.transpose(1, 2).contiguous().view(bs, -1, self.n_heads * self.head_dim)
+
+        outputs = (self.out_lin(context),)
+        if output_attentions:
+            outputs = outputs + (weights,)
+        return outputs
+
+
+# Copied from transformers.models.xlm.modeling_xlm.TransformerFFN
+class TransformerFFN(nn.Module):
+    def __init__(self, in_dim, dim_hidden, out_dim, config):
+        super().__init__()
+        self.dropout = config.dropout
+        self.lin1 = nn.Linear(in_dim, dim_hidden)
+        self.lin2 = nn.Linear(dim_hidden, out_dim)
+        self.act = gelu if config.gelu_activation else nn.functional.relu
+        self.chunk_size_feed_forward = config.chunk_size_feed_forward
+        self.seq_len_dim = 1
+
+    def forward(self, input):
+        return apply_chunking_to_forward(self.ff_chunk, self.chunk_size_feed_forward, self.seq_len_dim, input)
+
+    def ff_chunk(self, input):
+        x = self.lin1(input)
+        x = self.act(x)
+        x = self.lin2(x)
+        x = nn.functional.dropout(x, p=self.dropout, training=self.training)
+        return x
+
+
+@auto_docstring(
+    custom_intro="""
+    The bare Flaubert Model transformer outputting raw hidden-states without any specific head on top.
+    """
+)
+# Copied from transformers.models.xlm.modeling_xlm.XLMPredLayer with XLM->Flaubert
+class FlaubertPredLayer(nn.Module):
+    """
+    Prediction layer (cross_entropy or adaptive_softmax).
+    """
+
+    def __init__(self, config):
+        super().__init__()
+        self.asm = config.asm
+        self.n_words = config.n_words
+        self.pad_index = config.pad_index
+        dim = config.emb_dim
+
+        if config.asm is False:
+            self.proj = nn.Linear(dim, config.n_words, bias=True)
+        else:
+            self.proj = nn.AdaptiveLogSoftmaxWithLoss(
+                in_features=dim,
+                n_classes=config.n_words,
+                cutoffs=config.asm_cutoffs,
+                div_value=config.asm_div_value,
+                head_bias=True,  # default is False
+            )
+
+    def forward(self, x, y=None):
+        """Compute the scores, and optionally the loss (when `y` is given)."""
+        outputs = ()
+        if self.asm is False:
+            scores = self.proj(x)
+            outputs = (scores,) + outputs
+            if y is not None:
+                loss = nn.functional.cross_entropy(scores.view(-1, self.n_words), y.view(-1), reduction="mean")
+                outputs = (loss,) + outputs
+        else:
+            scores = self.proj.log_prob(x)
+            outputs = (scores,) + outputs
+            if y is not None:
+                _, loss = self.proj(x, y)
+                outputs = (loss,) + outputs
+
+        return outputs
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Base class for outputs of question answering models using a [`~modeling_utils.FlaubertSQuADHead`].
+    """
+)
+# Copied from transformers.models.xlm.modeling_xlm.XLMSquadHeadOutput with XLM->Flaubert
+class FlaubertSquadHeadOutput(ModelOutput):
+    r"""
+    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned if both `start_positions` and `end_positions` are provided):
+        Classification loss: the mean of the start token and end token losses (plus half the is_impossible loss if
+        provided).
+    start_top_log_probs (`torch.FloatTensor` of shape `(batch_size, config.start_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
+        Softmax scores for the top config.start_n_top start token possibilities (beam-search).
+    start_top_index (`torch.LongTensor` of shape `(batch_size, config.start_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
+        Indices for the top config.start_n_top start token possibilities (beam-search).
+    end_top_log_probs (`torch.FloatTensor` of shape `(batch_size, config.start_n_top * config.end_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
+        Softmax scores for the top `config.start_n_top * config.end_n_top` end token possibilities
+        (beam-search).
+    end_top_index (`torch.LongTensor` of shape `(batch_size, config.start_n_top * config.end_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
+        Indices for the top `config.start_n_top * config.end_n_top` end token possibilities (beam-search).
+    cls_logits (`torch.FloatTensor` of shape `(batch_size,)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
+        Logits for the `is_impossible` label of the answers.
+    """
+
+    loss: Optional[torch.FloatTensor] = None
+    start_top_log_probs: Optional[torch.FloatTensor] = None
+    start_top_index: Optional[torch.LongTensor] = None
+    end_top_log_probs: Optional[torch.FloatTensor] = None
+    end_top_index: Optional[torch.LongTensor] = None
+    cls_logits: Optional[torch.FloatTensor] = None
+
+
+# Copied from transformers.models.xlm.modeling_xlm.XLMPoolerStartLogits with XLM->Flaubert
+class FlaubertPoolerStartLogits(nn.Module):
+    """
+    Produce one SQuAD start logit per token from the sequence hidden states.
+
+    Args:
+        config ([`FlaubertConfig`]):
+            The config used by the model; only its `hidden_size` is read here.
+    """
+
+    def __init__(self, config: FlaubertConfig):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, 1)
+
+    def forward(
+        self, hidden_states: torch.FloatTensor, p_mask: Optional[torch.FloatTensor] = None
+    ) -> torch.FloatTensor:
+        """
+        Args:
+            hidden_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`):
+                The final hidden states of the model.
+            p_mask (`torch.FloatTensor` of shape `(batch_size, seq_len)`, *optional*):
+                Mask for tokens at invalid position, such as query and special symbols (PAD, SEP, CLS). 1.0 means token
+                should be masked.
+
+        Returns:
+            `torch.FloatTensor`: The start logits for SQuAD.
+        """
+        logits = self.dense(hidden_states).squeeze(-1)
+
+        if p_mask is None:
+            return logits
+
+        # Zero the masked logits and subtract a large constant from them; float16 masks get 65500
+        # instead of 1e30.
+        penalty = 65500 if p_mask.dtype == torch.float16 else 1e30
+        return logits * (1 - p_mask) - penalty * p_mask
+
+
+# Copied from transformers.models.xlm.modeling_xlm.XLMPoolerEndLogits with XLM->Flaubert
+class FlaubertPoolerEndLogits(nn.Module):
+    """
+    Compute SQuAD end logits from sequence hidden states and a start-token representation.
+
+    Args:
+        config ([`FlaubertConfig`]):
+            The config used by the model; its `hidden_size` sizes the layers and its `layer_norm_eps` is passed to
+            the layer norm.
+    """
+
+    def __init__(self, config: FlaubertConfig):
+        super().__init__()
+        self.dense_0 = nn.Linear(config.hidden_size * 2, config.hidden_size)
+        self.activation = nn.Tanh()
+        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.dense_1 = nn.Linear(config.hidden_size, 1)
+
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        start_states: Optional[torch.FloatTensor] = None,
+        start_positions: Optional[torch.LongTensor] = None,
+        p_mask: Optional[torch.FloatTensor] = None,
+    ) -> torch.FloatTensor:
+        """
+        Args:
+            hidden_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`):
+                The final hidden states of the model.
+            start_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`, *optional*):
+                The hidden states of the first tokens for the labeled span.
+            start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+                The position of the first token for the labeled span.
+            p_mask (`torch.FloatTensor` of shape `(batch_size, seq_len)`, *optional*):
+                Mask for tokens at invalid position, such as query and special symbols (PAD, SEP, CLS). 1.0 means token
+                should be masked.
+
+        <Tip>
+
+        At least one of `start_states` and `start_positions` must be given. If both are set, `start_positions`
+        overrides `start_states`.
+
+        </Tip>
+
+        Returns:
+            `torch.FloatTensor`: The end logits for SQuAD.
+        """
+        assert start_states is not None or start_positions is not None, (
+            "One of start_states, start_positions should be not None"
+        )
+        if start_positions is not None:
+            # Take the hidden state found at each start position and repeat it along the sequence axis.
+            slen, hsz = hidden_states.shape[-2:]
+            gather_index = start_positions[:, None, None].expand(-1, -1, hsz)  # shape (bsz, 1, hsz)
+            picked = hidden_states.gather(-2, gather_index)  # shape (bsz, 1, hsz)
+            start_states = picked.expand(-1, slen, -1)  # shape (bsz, slen, hsz)
+
+        features = torch.cat([hidden_states, start_states], dim=-1)
+        projected = self.LayerNorm(self.activation(self.dense_0(features)))
+        logits = self.dense_1(projected).squeeze(-1)
+
+        if p_mask is None:
+            return logits
+
+        # Zero the masked logits and subtract a large constant from them; float16 masks get 65500
+        # instead of 1e30.
+        penalty = 65500 if p_mask.dtype == torch.float16 else 1e30
+        return logits * (1 - p_mask) - penalty * p_mask
+
+
+# Copied from transformers.models.xlm.modeling_xlm.XLMPoolerAnswerClass with XLM->Flaubert
+class FlaubertPoolerAnswerClass(nn.Module):
+    """
+    Compute the SQuAD 2.0 answer class from the classification-token and start-token hidden states.
+
+    Args:
+        config ([`FlaubertConfig`]):
+            The config used by the model; only its `hidden_size` is read here.
+    """
+
+    def __init__(self, config: FlaubertConfig):
+        super().__init__()
+        self.dense_0 = nn.Linear(config.hidden_size * 2, config.hidden_size)
+        self.activation = nn.Tanh()
+        self.dense_1 = nn.Linear(config.hidden_size, 1, bias=False)
+
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        start_states: Optional[torch.FloatTensor] = None,
+        start_positions: Optional[torch.LongTensor] = None,
+        cls_index: Optional[torch.LongTensor] = None,
+    ) -> torch.FloatTensor:
+        """
+        Args:
+            hidden_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`):
+                The final hidden states of the model.
+            start_states (`torch.FloatTensor` of shape `(batch_size, hidden_size)`, *optional*):
+                The start-token representation; it is concatenated with the CLS token state along the last dimension.
+            start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+                The position of the first token for the labeled span.
+            cls_index (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+                Position of the CLS token for each sentence in the batch. If `None`, takes the last token.
+
+        <Tip>
+
+        At least one of `start_states` and `start_positions` must be given. If both are set, `start_positions`
+        overrides `start_states`.
+
+        </Tip>
+
+        Returns:
+            `torch.FloatTensor`: The SQuAD 2.0 answer class.
+        """
+        # No dependency on end_feature so that we can obtain one single `cls_logits` for each sample.
+        hsz = hidden_states.shape[-1]
+        assert start_states is not None or start_positions is not None, (
+            "One of start_states, start_positions should be not None"
+        )
+        if start_positions is not None:
+            start_index = start_positions[:, None, None].expand(-1, -1, hsz)  # shape (bsz, 1, hsz)
+            start_states = hidden_states.gather(-2, start_index).squeeze(-2)  # shape (bsz, hsz)
+
+        # Without an explicit CLS position, the last token of the sequence is used.
+        if cls_index is None:
+            cls_token_state = hidden_states[:, -1, :]  # shape (bsz, hsz)
+        else:
+            cls_gather = cls_index[:, None, None].expand(-1, -1, hsz)  # shape (bsz, 1, hsz)
+            cls_token_state = hidden_states.gather(-2, cls_gather).squeeze(-2)  # shape (bsz, hsz)
+
+        features = torch.cat([start_states, cls_token_state], dim=-1)
+        pooled = self.activation(self.dense_0(features))
+
+        return self.dense_1(pooled).squeeze(-1)
+
+
+# Copied from transformers.models.xlm.modeling_xlm.XLMSQuADHead with XLM->Flaubert
+class FlaubertSQuADHead(nn.Module):
+    r"""
+    A SQuAD head inspired by XLNet.
+
+    Args:
+        config ([`FlaubertConfig`]):
+            The config used by the model, will be used to grab the `hidden_size` of the model and the `layer_norm_eps`
+            to use.
+    """
+
+    def __init__(self, config: FlaubertConfig):
+        super().__init__()
+        self.start_n_top = config.start_n_top
+        self.end_n_top = config.end_n_top
+
+        self.start_logits = FlaubertPoolerStartLogits(config)
+        self.end_logits = FlaubertPoolerEndLogits(config)
+        self.answer_class = FlaubertPoolerAnswerClass(config)
+
+    @auto_docstring
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        start_positions: Optional[torch.LongTensor] = None,
+        end_positions: Optional[torch.LongTensor] = None,
+        cls_index: Optional[torch.LongTensor] = None,
+        is_impossible: Optional[torch.LongTensor] = None,
+        p_mask: Optional[torch.FloatTensor] = None,
+        return_dict: bool = False,
+    ) -> Union[FlaubertSquadHeadOutput, tuple[torch.FloatTensor]]:
+        r"""
+        hidden_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`):
+            Final hidden states of the model on the sequence tokens.
+        start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Positions of the first token for the labeled span.
+        end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Positions of the last token for the labeled span.
+        cls_index (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Position of the CLS token for each sentence in the batch. If `None`, takes the last token.
+        is_impossible (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Whether the question has a possible answer in the paragraph or not.
+        p_mask (`torch.FloatTensor` of shape `(batch_size, seq_len)`, *optional*):
+            Mask for tokens at invalid position, such as query and special symbols (PAD, SEP, CLS). 1.0 means token
+            should be masked.
+        """
+        start_logits = self.start_logits(hidden_states, p_mask=p_mask)
+
+        if start_positions is not None and end_positions is not None:
+            # If we are on multi-GPU, let's remove the dimension added by batch splitting. The squeeze is
+            # out-of-place so that the tensors passed in by the caller are not modified.
+            start_positions, end_positions, cls_index, is_impossible = (
+                t.squeeze(-1) if t is not None and t.dim() > 1 else t
+                for t in (start_positions, end_positions, cls_index, is_impossible)
+            )
+
+            # during training, compute the end logits based on the ground truth of the start position
+            end_logits = self.end_logits(hidden_states, start_positions=start_positions, p_mask=p_mask)
+
+            loss_fct = CrossEntropyLoss()
+            start_loss = loss_fct(start_logits, start_positions)
+            end_loss = loss_fct(end_logits, end_positions)
+            total_loss = (start_loss + end_loss) / 2
+
+            if cls_index is not None and is_impossible is not None:
+                # Predict answerability from the representation of CLS and START
+                cls_logits = self.answer_class(hidden_states, start_positions=start_positions, cls_index=cls_index)
+                loss_fct_cls = nn.BCEWithLogitsLoss()
+                # BCEWithLogitsLoss needs floating-point targets; `is_impossible` may arrive as a LongTensor
+                cls_loss = loss_fct_cls(cls_logits, is_impossible.to(cls_logits.dtype))
+                # note(zhiliny): by default multiply the loss by 0.5 so that the scale is comparable to start_loss and end_loss
+                total_loss += cls_loss * 0.5
+
+            return FlaubertSquadHeadOutput(loss=total_loss) if return_dict else (total_loss,)
+        else:
+            # during inference, compute the end logits based on beam search
+            bsz, slen, hsz = hidden_states.size()
+            start_log_probs = nn.functional.softmax(start_logits, dim=-1)  # shape (bsz, slen)
+
+            start_top_log_probs, start_top_index = torch.topk(
+                start_log_probs, self.start_n_top, dim=-1
+            )  # shape (bsz, start_n_top)
+            start_top_index_exp = start_top_index.unsqueeze(-1).expand(-1, -1, hsz)  # shape (bsz, start_n_top, hsz)
+            start_states = torch.gather(hidden_states, -2, start_top_index_exp)  # shape (bsz, start_n_top, hsz)
+            start_states = start_states.unsqueeze(1).expand(-1, slen, -1, -1)  # shape (bsz, slen, start_n_top, hsz)
+
+            hidden_states_expanded = hidden_states.unsqueeze(2).expand_as(
+                start_states
+            )  # shape (bsz, slen, start_n_top, hsz)
+            p_mask = p_mask.unsqueeze(-1) if p_mask is not None else None
+            end_logits = self.end_logits(hidden_states_expanded, start_states=start_states, p_mask=p_mask)
+            end_log_probs = nn.functional.softmax(end_logits, dim=1)  # shape (bsz, slen, start_n_top)
+
+            end_top_log_probs, end_top_index = torch.topk(
+                end_log_probs, self.end_n_top, dim=1
+            )  # shape (bsz, end_n_top, start_n_top)
+            end_top_log_probs = end_top_log_probs.view(-1, self.start_n_top * self.end_n_top)
+            end_top_index = end_top_index.view(-1, self.start_n_top * self.end_n_top)
+
+            start_states = torch.einsum("blh,bl->bh", hidden_states, start_log_probs)
+            cls_logits = self.answer_class(hidden_states, start_states=start_states, cls_index=cls_index)
+
+            if not return_dict:
+                return (start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits)
+            return FlaubertSquadHeadOutput(
+                start_top_log_probs=start_top_log_probs,
+                start_top_index=start_top_index,
+                end_top_log_probs=end_top_log_probs,
+                end_top_index=end_top_index,
+                cls_logits=cls_logits,
+            )
+
+
+# Copied from transformers.models.xlm.modeling_xlm.XLMSequenceSummary with XLM->Flaubert
+class FlaubertSequenceSummary(nn.Module):
+    r"""
+    Reduce the hidden states of a sequence to a single summary vector.
+
+    Args:
+        config ([`FlaubertConfig`]):
+            The config used by the model. Relevant arguments in the config class of the model are (refer to the actual
+            config class of your model for the default values it uses):
+
+            - **summary_type** (`str`) -- The method to use to make this summary. Accepted values are:
+
+                - `"last"` -- Take the last token hidden state (like XLNet)
+                - `"first"` -- Take the first token hidden state (like Bert)
+                - `"mean"` -- Take the mean of all tokens hidden states
+                - `"cls_index"` -- Supply a Tensor of classification token position (GPT/GPT-2)
+                - `"attn"` -- Not implemented now, use multi-head attention
+
+            - **summary_use_proj** (`bool`) -- Add a projection after the vector extraction.
+            - **summary_proj_to_labels** (`bool`) -- If `True`, the projection outputs to `config.num_labels` classes
+              (otherwise to `config.hidden_size`).
+            - **summary_activation** (`Optional[str]`) -- Name of the activation applied after the projection; an
+              empty string or `None` will add no activation.
+            - **summary_first_dropout** (`float`) -- Optional dropout probability before the projection and activation.
+            - **summary_last_dropout** (`float`)-- Optional dropout probability after the projection and activation.
+    """
+
+    def __init__(self, config: FlaubertConfig):
+        super().__init__()
+
+        self.summary_type = getattr(config, "summary_type", "last")
+        if self.summary_type == "attn":
+            # We should use a standard multi-head attention module with absolute positional embedding for that.
+            # Cf. https://github.com/zihangdai/xlnet/blob/master/modeling.py#L253-L276
+            # We can probably just use the multi-head attention module of PyTorch >=1.1.0
+            raise NotImplementedError
+
+        # Optional projection, either to `num_labels` classes or back to `hidden_size`.
+        if getattr(config, "summary_use_proj", False):
+            to_labels = getattr(config, "summary_proj_to_labels", False) and config.num_labels > 0
+            num_classes = config.num_labels if to_labels else config.hidden_size
+            self.summary = nn.Linear(config.hidden_size, num_classes)
+        else:
+            self.summary = nn.Identity()
+
+        act_name = getattr(config, "summary_activation", None)
+        self.activation: Callable = get_activation(act_name) if act_name else nn.Identity()
+
+        # Dropout modules are only created for strictly positive probabilities.
+        first_p = getattr(config, "summary_first_dropout", 0)
+        self.first_dropout = nn.Dropout(first_p) if first_p > 0 else nn.Identity()
+
+        last_p = getattr(config, "summary_last_dropout", 0)
+        self.last_dropout = nn.Dropout(last_p) if last_p > 0 else nn.Identity()
+
+    def forward(
+        self, hidden_states: torch.FloatTensor, cls_index: Optional[torch.LongTensor] = None
+    ) -> torch.FloatTensor:
+        """
+        Compute a single vector summary of a sequence hidden states.
+
+        Args:
+            hidden_states (`torch.FloatTensor` of shape `[batch_size, seq_len, hidden_size]`):
+                The hidden states of the last layer.
+            cls_index (`torch.LongTensor` of shape `[batch_size]` or `[batch_size, ...]` where ... are optional leading dimensions of `hidden_states`, *optional*):
+                Used if `summary_type == "cls_index"`; when `None`, the last token of the sequence is taken instead.
+
+        Returns:
+            `torch.FloatTensor`: The summary of the sequence hidden states.
+        """
+        mode = self.summary_type
+        if mode == "last":
+            output = hidden_states[:, -1]
+        elif mode == "first":
+            output = hidden_states[:, 0]
+        elif mode == "mean":
+            output = hidden_states.mean(dim=1)
+        elif mode == "cls_index":
+            if cls_index is not None:
+                cls_index = cls_index.unsqueeze(-1).unsqueeze(-1)
+                cls_index = cls_index.expand((-1,) * (cls_index.dim() - 1) + (hidden_states.size(-1),))
+            else:
+                cls_index = torch.full_like(
+                    hidden_states[..., :1, :],
+                    hidden_states.shape[-2] - 1,
+                    dtype=torch.long,
+                )
+            # shape of cls_index: (bsz, XX, 1, hidden_size) where XX are optional leading dim of hidden_states
+            output = hidden_states.gather(-2, cls_index).squeeze(-2)  # shape (bsz, XX, hidden_size)
+        elif mode == "attn":
+            raise NotImplementedError
+
+        # extracted vector -> dropout -> projection -> activation -> dropout
+        dropped = self.first_dropout(output)
+        projected = self.summary(dropped)
+        activated = self.activation(projected)
+
+        return self.last_dropout(activated)
+
+
+@auto_docstring
+# Copied from transformers.models.xlm.modeling_xlm.XLMPreTrainedModel with XLM->Flaubert
+class FlaubertPreTrainedModel(PreTrainedModel):
+    config: FlaubertConfig
+    load_tf_weights = None
+    base_model_prefix = "transformer"
+
+    def __init__(self, *inputs, **kwargs):
+        super().__init__(*inputs, **kwargs)
+
+    @property
+    def dummy_inputs(self):
+        # `langs` is only populated when the config enables language embeddings for more than one language.
+        with_langs = self.config.use_lang_emb and self.config.n_langs > 1
+        return {
+            "input_ids": torch.tensor([[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]]),
+            "attention_mask": torch.tensor([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]]),
+            "langs": torch.tensor([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]]) if with_langs else None,
+        }
+
+    def _init_weights(self, module):
+        """Initialize the weights."""
+        if isinstance(module, nn.Embedding):
+            embed_std = self.config.embed_init_std if self.config is not None else None
+            if embed_std is not None:
+                nn.init.normal_(module.weight, mean=0, std=embed_std)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+        if isinstance(module, nn.Linear) and self.config is not None and self.config.init_std is not None:
+            nn.init.normal_(module.weight, mean=0, std=self.config.init_std)
+            if module.bias is not None:
+                nn.init.constant_(module.bias, 0.0)
+        if isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+        if isinstance(module, FlaubertModel) and self.config.sinusoidal_embeddings:
+            create_sinusoidal_embeddings(
+                self.config.max_position_embeddings, self.config.emb_dim, out=module.position_embeddings.weight
+            )
+
+
+@auto_docstring
+class FlaubertModel(FlaubertPreTrainedModel):
+    def __init__(self, config):  # , dico, is_encoder, with_output):
+        super().__init__(config)
+
+        # encoder / decoder, output layer
+        self.is_encoder = config.is_encoder
+        self.is_decoder = not config.is_encoder
+        if self.is_decoder:
+            raise NotImplementedError("Currently Flaubert can only be used as an encoder")
+        # self.with_output = with_output
+        self.causal = config.causal
+
+        # dictionary / languages
+        self.n_langs = config.n_langs
+        self.use_lang_emb = config.use_lang_emb
+        self.n_words = config.n_words
+        self.eos_index = config.eos_index
+        self.pad_index = config.pad_index
+        # self.dico = dico
+        # self.id2lang = config.id2lang
+        # self.lang2id = config.lang2id
+        # assert len(self.dico) == self.n_words
+        # assert len(self.id2lang) == len(self.lang2id) == self.n_langs
+
+        # model parameters
+        self.dim = config.emb_dim  # 512 by default
+        self.hidden_dim = self.dim * 4  # 2048 by default
+        self.n_heads = config.n_heads  # 8 by default
+        self.n_layers = config.n_layers
+        self.dropout = config.dropout
+        self.attention_dropout = config.attention_dropout
+        assert self.dim % self.n_heads == 0, "transformer dim must be a multiple of n_heads"
+
+        # embeddings
+        self.position_embeddings = nn.Embedding(config.max_position_embeddings, self.dim)
+        if config.n_langs > 1 and config.use_lang_emb:
+            self.lang_embeddings = nn.Embedding(self.n_langs, self.dim)
+        self.embeddings = nn.Embedding(self.n_words, self.dim, padding_idx=self.pad_index)
+        self.layer_norm_emb = nn.LayerNorm(self.dim, eps=config.layer_norm_eps)
+
+        # transformer layers
+        self.attentions = nn.ModuleList()
+        self.layer_norm1 = nn.ModuleList()
+        self.ffns = nn.ModuleList()
+        self.layer_norm2 = nn.ModuleList()
+        # Decoder-only modules are not built: the constructor raises above when `config.is_encoder` is
+        # false. The four lists above are filled in lockstep by the loop below, one entry each per
+        # layer.
+
+        for i in range(self.n_layers):
+            self.attentions.append(MultiHeadAttention(self.n_heads, self.dim, config=config, layer_idx=i))
+            self.layer_norm1.append(nn.LayerNorm(self.dim, eps=config.layer_norm_eps))
+            self.ffns.append(TransformerFFN(self.dim, self.hidden_dim, self.dim, config=config))
+            self.layer_norm2.append(nn.LayerNorm(self.dim, eps=config.layer_norm_eps))
+
+        # Re-apply head pruning recorded in the config: `config.pruned_heads` is cleared, then each entry is
+        # passed to `prune_heads`, but only for layers whose attention still has `config.n_heads` heads.
+        if hasattr(config, "pruned_heads"):
+            pruned_heads = config.pruned_heads.copy().items()
+            config.pruned_heads = {}
+            for layer, heads in pruned_heads:
+                if self.attentions[int(layer)].n_heads == config.n_heads:
+                    self.prune_heads({int(layer): list(map(int, heads))})
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+        self.layerdrop = getattr(config, "layerdrop", 0.0)
+        self.pre_norm = getattr(config, "pre_norm", False)
+        # Non-persistent (1, max_position_embeddings) buffer of consecutive positions, sliced in `forward`.
+        self.register_buffer(
+            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
+        )
+
+    # Copied from transformers.models.xlm.modeling_xlm.XLMModel.get_input_embeddings
+    def get_input_embeddings(self):
+        return self.embeddings  # token embedding table (`nn.Embedding` created in `__init__`)
+
+    # Copied from transformers.models.xlm.modeling_xlm.XLMModel.set_input_embeddings
+    def set_input_embeddings(self, new_embeddings):
+        self.embeddings = new_embeddings  # replaces the token embedding table that `forward` looks up
+
+    # Copied from transformers.models.xlm.modeling_xlm.XLMModel._prune_heads
+    def _prune_heads(self, heads_to_prune):
+        """
+        Prune attention heads. `heads_to_prune` is a dict of {layer_num: list of heads to prune in this layer}; each
+        entry is forwarded to the attention module of that layer. See base class PreTrainedModel
+        """
+        for layer_idx, head_ids in heads_to_prune.items():
+            self.attentions[layer_idx].prune_heads(head_ids)
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        langs: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        lengths: Optional[torch.LongTensor] = None,
+        cache: Optional[dict[str, torch.FloatTensor]] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.Tensor] = None,
+    ) -> Union[tuple, BaseModelOutput]:
+        r"""
+        langs (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            A parallel sequence of tokens to be used to indicate the language of each token in the input. Indices are
+            languages ids which can be obtained from the language names by using two conversion mappings provided in
+            the configuration of the model (only provided for multilingual models). More precisely, the *language name
+            to language id* mapping is in `model.config.lang2id` (which is a dictionary string to int) and the
+            *language id to language name* mapping is in `model.config.id2lang` (dictionary int to string).
+
+            See usage examples detailed in the [multilingual documentation](../multilingual).
+        lengths (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Length of each sentence that can be used to avoid performing attention on padding token indices. You can
+            also use `attention_mask` for the same result (see above), kept here for compatibility. Indices selected in
+            `[0, ..., input_ids.size(-1)]`:
+        cache (`dict[str, torch.FloatTensor]`, *optional*):
+            Dictionary strings to `torch.FloatTensor` that contains precomputed hidden-states (key and values in the
+            attention blocks) as computed by the model (see `cache` output below). Can be used to speed up sequential
+            decoding. The dictionary object will be modified in-place during the forward pass to add newly computed
+            hidden-states.
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # removed: src_enc=None, src_len=None
+        if input_ids is not None:
+            bs, slen = input_ids.size()
+        else:
+            bs, slen = inputs_embeds.size()[:-1]
+
+        device = input_ids.device if input_ids is not None else inputs_embeds.device
+
+        if cache is None:
+            cache = EncoderDecoderCache(DynamicCache(config=self.config), DynamicCache(config=self.config))
+
+        if isinstance(cache, tuple):
+            cache = EncoderDecoderCache.from_legacy_cache(cache)
+
+        if lengths is None:
+            if input_ids is not None:
+                lengths = (input_ids != self.pad_index).sum(dim=1).long()
+            else:
+                lengths = torch.tensor([slen] * bs, device=device)
+        # mask = input_ids != self.pad_index
+
+        # check inputs
+        assert lengths.size(0) == bs
+        assert lengths.max().item() <= slen
+        # input_ids = input_ids.transpose(0, 1)  # batch size as dimension 0
+        # assert (src_enc is None) == (src_len is None)
+        # if src_enc is not None:
+        #     assert self.is_decoder
+        #     assert src_enc.size(0) == bs
+
+        # generate masks
+        mask, attn_mask = get_masks(slen, lengths, self.causal, padding_mask=attention_mask)
+        # if self.is_decoder and src_enc is not None:
+        #     src_mask = torch.arange(src_len.max(), dtype=torch.long, device=lengths.device) < src_len[:, None]
+
+        # Setting the position-ids to the registered buffer in constructor, it helps
+        # when tracing the model without passing position-ids, solves
+        # issues similar to issue #5664
+        if position_ids is None:
+            if hasattr(self, "position_ids"):
+                position_ids = self.position_ids[:, :slen]
+                position_ids = position_ids.expand((bs, slen))
+            else:
+                position_ids = torch.arange(slen, dtype=torch.long, device=device)
+                position_ids = position_ids.unsqueeze(0).expand((bs, slen))
+        else:
+            assert position_ids.size() == (bs, slen)  # (slen, bs)
+            # position_ids = position_ids.transpose(0, 1)
+
+        # langs
+        if langs is not None:
+            assert langs.size() == (bs, slen)  # (slen, bs)
+            # langs = langs.transpose(0, 1)
+
+        # Prepare head mask if needed
+        head_mask = self.get_head_mask(head_mask, self.config.n_layers)
+
+        # do not recompute cached elements
+        if cache is not None and input_ids is not None:
+            _slen = slen - cache.get_seq_length()
+            input_ids = input_ids[:, -_slen:]
+            position_ids = position_ids[:, -_slen:]
+            if langs is not None:
+                langs = langs[:, -_slen:]
+            mask = mask[:, -_slen:]
+            attn_mask = attn_mask[:, -_slen:]
+
+        # embeddings
+        if inputs_embeds is None:
+            inputs_embeds = self.embeddings(input_ids)
+
+        tensor = inputs_embeds + self.position_embeddings(position_ids).expand_as(inputs_embeds)
+        if langs is not None and self.use_lang_emb and self.config.n_langs > 1:
+            tensor = tensor + self.lang_embeddings(langs)
+        if token_type_ids is not None:
+            tensor = tensor + self.embeddings(token_type_ids)
+        tensor = self.layer_norm_emb(tensor)
+        tensor = nn.functional.dropout(tensor, p=self.dropout, training=self.training)
+        tensor *= mask.unsqueeze(-1).to(tensor.dtype)
+
+        # transformer layers
+        hidden_states = () if output_hidden_states else None
+        attentions = () if output_attentions else None
+        for i in range(self.n_layers):
+            # LayerDrop
+            if self.training:
+                dropout_probability = torch.rand([])
+                if dropout_probability < self.layerdrop:
+                    continue
+
+            if output_hidden_states:
+                hidden_states = hidden_states + (tensor,)
+
+            # self attention
+            if not self.pre_norm:
+                attn_outputs = self.attentions[i](
+                    tensor,
+                    attn_mask,
+                    cache=cache,
+                    head_mask=head_mask[i],
+                    output_attentions=output_attentions,
+                    cache_position=cache_position,
+                )
+                attn = attn_outputs[0]
+                if output_attentions:
+                    attentions = attentions + (attn_outputs[1],)
+                attn = nn.functional.dropout(attn, p=self.dropout, training=self.training)
+                tensor = tensor + attn
+                tensor = self.layer_norm1[i](tensor)
+            else:
+                tensor_normalized = self.layer_norm1[i](tensor)
+                attn_outputs = self.attentions[i](tensor_normalized, attn_mask, cache=cache, head_mask=head_mask[i])
+                attn = attn_outputs[0]
+                if output_attentions:
+                    attentions = attentions + (attn_outputs[1],)
+                attn = nn.functional.dropout(attn, p=self.dropout, training=self.training)
+                tensor = tensor + attn
+
+            # FFN
+            if not self.pre_norm:
+                tensor = tensor + self.ffns[i](tensor)
+                tensor = self.layer_norm2[i](tensor)
+            else:
+                tensor_normalized = self.layer_norm2[i](tensor)
+                tensor = tensor + self.ffns[i](tensor_normalized)
+
+            tensor *= mask.unsqueeze(-1).to(tensor.dtype)
+
+        # Add last hidden state
+        if output_hidden_states:
+            hidden_states = hidden_states + (tensor,)
+
+        if not return_dict:
+            return tuple(v for v in [tensor, hidden_states, attentions] if v is not None)
+
+        return BaseModelOutput(last_hidden_state=tensor, hidden_states=hidden_states, attentions=attentions)
+
+
+@auto_docstring(
+    custom_intro="""
+    The Flaubert Model transformer with a language modeling head on top (linear layer with weights tied to the input
+    embeddings).
+    """
+)
+class FlaubertWithLMHeadModel(FlaubertPreTrainedModel, GenerationMixin):
+    # Parameter name of the prediction layer's projection weight, declared here as a tied weight.
+    _tied_weights_keys = ["pred_layer.proj.weight"]
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.transformer = FlaubertModel(config)
+        self.pred_layer = FlaubertPredLayer(config)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_output_embeddings(self):
+        """Return the projection module of the prediction layer (`self.pred_layer.proj`)."""
+        return self.pred_layer.proj
+
+    def set_output_embeddings(self, new_embeddings):
+        """Replace the projection module of the prediction layer with `new_embeddings`."""
+        self.pred_layer.proj = new_embeddings
+
+    def prepare_inputs_for_generation(self, input_ids, **kwargs):
+        """
+        Build the inputs for one generation step.
+
+        One `config.mask_token_id` token is appended to every sequence of `input_ids`. When `config.lang_id` is set,
+        a `langs` tensor with the same shape as the extended `input_ids` is filled with that id; otherwise `langs`
+        is `None`. Any extra keyword arguments are accepted but not used.
+
+        Returns a dict with the keys `"input_ids"` and `"langs"`.
+        """
+        # Overwritten -- uses a language id
+
+        mask_token_id = self.config.mask_token_id
+        lang_id = self.config.lang_id
+
+        effective_batch_size = input_ids.shape[0]
+        mask_token = torch.full((effective_batch_size, 1), mask_token_id, dtype=torch.long, device=input_ids.device)
+        input_ids = torch.cat([input_ids, mask_token], dim=1)
+        # `langs` is built after the concatenation so that it also covers the appended mask token.
+        langs = torch.full_like(input_ids, lang_id) if lang_id is not None else None
+        return {"input_ids": input_ids, "langs": langs}
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        langs: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        lengths: Optional[torch.Tensor] = None,
+        cache: Optional[dict[str, torch.Tensor]] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, MaskedLMOutput]:
+        r"""
+        langs (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            A parallel sequence of tokens to be used to indicate the language of each token in the input. Indices are
+            languages ids which can be obtained from the language names by using two conversion mappings provided in
+            the configuration of the model (only provided for multilingual models). More precisely, the *language name
+            to language id* mapping is in `model.config.lang2id` (which is a dictionary string to int) and the
+            *language id to language name* mapping is in `model.config.id2lang` (dictionary int to string).
+
+            See usage examples detailed in the [multilingual documentation](../multilingual).
+        lengths (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Length of each sentence that can be used to avoid performing attention on padding token indices. You can
+            also use `attention_mask` for the same result (see above), kept here for compatibility. Indices selected in
+            `[0, ..., input_ids.size(-1)]`.
+        cache (`dict[str, torch.FloatTensor]`, *optional*):
+            Instance of `EncoderDecoderCache` that contains precomputed KV states. Can be used to speed up sequential
+            decoding.
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
+            `labels = input_ids`. Indices are selected in `[-100, 0, ..., config.vocab_size]`. All labels set to
+            `-100` are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`.
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        transformer_outputs = self.transformer(
+            input_ids,
+            attention_mask=attention_mask,
+            langs=langs,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            lengths=lengths,
+            cache=cache,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        output = transformer_outputs[0]
+        outputs = self.pred_layer(output, labels)  # (loss, logits) or (logits,) depending on if labels are provided.
+
+        if not return_dict:
+            # Tuple form: prediction-layer outputs followed by the backbone's extra outputs.
+            return outputs + transformer_outputs[1:]
+
+        # The position of the logits in `outputs` depends on whether a loss was prepended.
+        return MaskedLMOutput(
+            loss=outputs[0] if labels is not None else None,
+            logits=outputs[0] if labels is None else outputs[1],
+            hidden_states=transformer_outputs.hidden_states,
+            attentions=transformer_outputs.attentions,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    Flaubert Model with a sequence classification/regression head on top (a linear layer on top of the pooled output)
+    e.g. for GLUE tasks.
+    """
+)
+# Copied from transformers.models.xlm.modeling_xlm.XLMForSequenceClassification with XLM_INPUTS->FLAUBERT_INPUTS,XLM->Flaubert
+class FlaubertForSequenceClassification(FlaubertPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.config = config
+        self.num_labels = config.num_labels
+
+        # Backbone first, then the summary head that turns its output into logits.
+        self.transformer = FlaubertModel(config)
+        self.sequence_summary = FlaubertSequenceSummary(config)
+
+        # Weight initialization and final processing.
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        langs: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        lengths: Optional[torch.Tensor] = None,
+        cache: Optional[dict[str, torch.Tensor]] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, SequenceClassifierOutput]:
+        r"""
+        langs (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Language ids aligned token-by-token with the input, indicating the language of each token. The ids are
+            obtained from the language names through the two mappings stored in the model configuration (only
+            provided for multilingual models): `model.config.lang2id` (language name, a string, to language id, an
+            int) and `model.config.id2lang` (language id to language name).
+
+            See usage examples detailed in the [multilingual documentation](../multilingual).
+        lengths (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Per-sentence lengths that can be used to avoid performing attention on padding token indices. Kept for
+            compatibility: *attention_mask* gives the same result (see above). Indices selected in
+            `[0, ..., input_ids.size(-1)]`.
+        cache (`dict[str, torch.FloatTensor]`, *optional*):
+            `EncoderDecoderCache` instance holding precomputed KV states; can be used to speed up sequential
+            decoding.
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Targets for the sequence classification/regression loss. Indices should be in
+            `[0, ..., config.num_labels - 1]`. With `config.num_labels == 1` a regression loss (Mean-Square loss) is
+            computed; with `config.num_labels > 1` a classification loss (Cross-Entropy) is computed.
+        """
+        if return_dict is None:
+            return_dict = self.config.use_return_dict
+
+        backbone_outputs = self.transformer(
+            input_ids,
+            attention_mask=attention_mask,
+            langs=langs,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            lengths=lengths,
+            cache=cache,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        logits = self.sequence_summary(backbone_outputs[0])
+
+        loss = None
+        if labels is not None:
+            problem_type = self.config.problem_type
+            if problem_type is None:
+                # Infer the problem type once from the label count and label dtype, then remember it on the config.
+                if self.num_labels == 1:
+                    problem_type = "regression"
+                elif self.num_labels > 1 and labels.dtype in (torch.long, torch.int):
+                    problem_type = "single_label_classification"
+                else:
+                    problem_type = "multi_label_classification"
+                self.config.problem_type = problem_type
+
+            if problem_type == "regression":
+                mse = MSELoss()
+                loss = mse(logits.squeeze(), labels.squeeze()) if self.num_labels == 1 else mse(logits, labels)
+            elif problem_type == "single_label_classification":
+                loss = CrossEntropyLoss()(logits.view(-1, self.num_labels), labels.view(-1))
+            elif problem_type == "multi_label_classification":
+                loss = BCEWithLogitsLoss()(logits, labels)
+
+        if return_dict:
+            return SequenceClassifierOutput(
+                loss=loss,
+                logits=logits,
+                hidden_states=backbone_outputs.hidden_states,
+                attentions=backbone_outputs.attentions,
+            )
+
+        # Tuple form: optional loss, then logits, then the backbone's extra outputs.
+        plain_outputs = (logits,) + backbone_outputs[1:]
+        return plain_outputs if loss is None else (loss,) + plain_outputs
+
+
+@auto_docstring
+# Copied from transformers.models.xlm.modeling_xlm.XLMForTokenClassification with XLM_INPUTS->FLAUBERT_INPUTS,XLM->Flaubert
+class FlaubertForTokenClassification(FlaubertPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+
+        # Backbone, then dropout and a per-token linear classifier applied to its output.
+        self.transformer = FlaubertModel(config)
+        self.dropout = nn.Dropout(config.dropout)
+        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
+
+        # Weight initialization and final processing.
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        langs: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        lengths: Optional[torch.Tensor] = None,
+        cache: Optional[dict[str, torch.Tensor]] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, TokenClassifierOutput]:
+        r"""
+        langs (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Language ids aligned token-by-token with the input, indicating the language of each token. The ids are
+            obtained from the language names through the two mappings stored in the model configuration (only
+            provided for multilingual models): `model.config.lang2id` (language name, a string, to language id, an
+            int) and `model.config.id2lang` (language id to language name).
+
+            See usage examples detailed in the [multilingual documentation](../multilingual).
+        lengths (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Per-sentence lengths that can be used to avoid performing attention on padding token indices. Kept for
+            compatibility: *attention_mask* gives the same result (see above). Indices selected in
+            `[0, ..., input_ids.size(-1)]`.
+        cache (`dict[str, torch.FloatTensor]`, *optional*):
+            `EncoderDecoderCache` instance holding precomputed KV states; can be used to speed up sequential
+            decoding.
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Targets for the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
+        """
+        if return_dict is None:
+            return_dict = self.config.use_return_dict
+
+        backbone_outputs = self.transformer(
+            input_ids,
+            attention_mask=attention_mask,
+            langs=langs,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            lengths=lengths,
+            cache=cache,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        # Dropout on the backbone output, then per-token classification.
+        logits = self.classifier(self.dropout(backbone_outputs[0]))
+
+        if labels is None:
+            loss = None
+        else:
+            # Flatten so every token is one cross-entropy example.
+            loss = CrossEntropyLoss()(logits.view(-1, self.num_labels), labels.view(-1))
+
+        if return_dict:
+            return TokenClassifierOutput(
+                loss=loss,
+                logits=logits,
+                hidden_states=backbone_outputs.hidden_states,
+                attentions=backbone_outputs.attentions,
+            )
+
+        # Tuple form: optional loss, then logits, then the backbone's extra outputs.
+        plain_outputs = (logits,) + backbone_outputs[1:]
+        return plain_outputs if loss is None else (loss,) + plain_outputs
+
+
+@auto_docstring(
+    custom_intro="""
+    Flaubert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
+    layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
+    """
+)
+# Copied from transformers.models.xlm.modeling_xlm.XLMForQuestionAnsweringSimple with XLM_INPUTS->FLAUBERT_INPUTS,XLM->Flaubert
+class FlaubertForQuestionAnsweringSimple(FlaubertPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+
+        # Backbone plus one linear layer whose output is split into start and end logits in `forward`.
+        self.transformer = FlaubertModel(config)
+        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
+
+        # Weight initialization and final processing.
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        langs: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        lengths: Optional[torch.Tensor] = None,
+        cache: Optional[dict[str, torch.Tensor]] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        start_positions: Optional[torch.Tensor] = None,
+        end_positions: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, QuestionAnsweringModelOutput]:
+        r"""
+        langs (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Language ids aligned token-by-token with the input, indicating the language of each token. The ids are
+            obtained from the language names through the two mappings stored in the model configuration (only
+            provided for multilingual models): `model.config.lang2id` (language name, a string, to language id, an
+            int) and `model.config.id2lang` (language id to language name).
+
+            See usage examples detailed in the [multilingual documentation](../multilingual).
+        lengths (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Per-sentence lengths that can be used to avoid performing attention on padding token indices. Kept for
+            compatibility: *attention_mask* gives the same result (see above). Indices selected in
+            `[0, ..., input_ids.size(-1)]`.
+        cache (`dict[str, torch.FloatTensor]`, *optional*):
+            `EncoderDecoderCache` instance holding precomputed KV states; can be used to speed up sequential
+            decoding.
+        """
+        if return_dict is None:
+            return_dict = self.config.use_return_dict
+
+        backbone_outputs = self.transformer(
+            input_ids,
+            attention_mask=attention_mask,
+            langs=langs,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            lengths=lengths,
+            cache=cache,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        # Split the last dimension of the projection into the start and end logits.
+        span_logits = self.qa_outputs(backbone_outputs[0])
+        start_logits, end_logits = span_logits.split(1, dim=-1)
+        start_logits = start_logits.squeeze(-1).contiguous()
+        end_logits = end_logits.squeeze(-1).contiguous()
+
+        total_loss = None
+        have_targets = start_positions is not None and end_positions is not None
+        if have_targets:
+            # If we are on multi-GPU, split add a dimension; drop it again here.
+            if start_positions.dim() > 1:
+                start_positions = start_positions.squeeze(-1)
+            if end_positions.dim() > 1:
+                end_positions = end_positions.squeeze(-1)
+
+            # Start/end positions outside the model inputs are clamped onto `ignore_idx`,
+            # which the loss is configured to ignore.
+            ignore_idx = start_logits.size(1)
+            span_loss = CrossEntropyLoss(ignore_index=ignore_idx)
+            start_loss = span_loss(start_logits, start_positions.clamp(0, ignore_idx))
+            end_loss = span_loss(end_logits, end_positions.clamp(0, ignore_idx))
+            total_loss = (start_loss + end_loss) / 2
+
+        if return_dict:
+            return QuestionAnsweringModelOutput(
+                loss=total_loss,
+                start_logits=start_logits,
+                end_logits=end_logits,
+                hidden_states=backbone_outputs.hidden_states,
+                attentions=backbone_outputs.attentions,
+            )
+
+        # Tuple form: optional loss, then both logits, then the backbone's extra outputs.
+        plain_outputs = (start_logits, end_logits) + backbone_outputs[1:]
+        return plain_outputs if total_loss is None else (total_loss,) + plain_outputs
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Base class for outputs of question answering models using a `SquadHead`.
+    """
+)
+# Copied from transformers.models.xlm.modeling_xlm.XLMForQuestionAnsweringOutput with XLM->Flaubert
+class FlaubertForQuestionAnsweringOutput(ModelOutput):
+    r"""
+    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned if both `start_positions` and `end_positions` are provided):
+        Classification loss as the sum of start token, end token (and is_impossible if provided) classification
+        losses.
+    start_top_log_probs (`torch.FloatTensor` of shape `(batch_size, config.start_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
+        Log probabilities for the top config.start_n_top start token possibilities (beam-search).
+    start_top_index (`torch.LongTensor` of shape `(batch_size, config.start_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
+        Indices for the top config.start_n_top start token possibilities (beam-search).
+    end_top_log_probs (`torch.FloatTensor` of shape `(batch_size, config.start_n_top * config.end_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
+        Log probabilities for the top `config.start_n_top * config.end_n_top` end token possibilities
+        (beam-search).
+    end_top_index (`torch.LongTensor` of shape `(batch_size, config.start_n_top * config.end_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
+        Indices for the top `config.start_n_top * config.end_n_top` end token possibilities (beam-search).
+    cls_logits (`torch.FloatTensor` of shape `(batch_size,)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
+        Log probabilities for the `is_impossible` label of the answers.
+    """
+
+    # Every field defaults to None; which ones are populated depends on whether
+    # `start_positions`/`end_positions` were supplied (see the docstring above).
+    loss: Optional[torch.FloatTensor] = None
+    start_top_log_probs: Optional[torch.FloatTensor] = None
+    start_top_index: Optional[torch.LongTensor] = None
+    end_top_log_probs: Optional[torch.FloatTensor] = None
+    end_top_index: Optional[torch.LongTensor] = None
+    cls_logits: Optional[torch.FloatTensor] = None
+    # The two fields below are filled by `FlaubertForQuestionAnswering.forward` from the
+    # `hidden_states` and `attentions` of the underlying transformer's outputs.
+    hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    attentions: Optional[tuple[torch.FloatTensor]] = None
+
+
+@auto_docstring
+# Copied from transformers.models.xlm.modeling_xlm.XLMForQuestionAnswering with XLM_INPUTS->FLAUBERT_INPUTS,XLM->Flaubert
+class FlaubertForQuestionAnswering(FlaubertPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+
+        # Backbone plus the SQuAD head that consumes its output in `forward`.
+        self.transformer = FlaubertModel(config)
+        self.qa_outputs = FlaubertSQuADHead(config)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        langs: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        lengths: Optional[torch.Tensor] = None,
+        cache: Optional[dict[str, torch.Tensor]] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        start_positions: Optional[torch.Tensor] = None,
+        end_positions: Optional[torch.Tensor] = None,
+        is_impossible: Optional[torch.Tensor] = None,
+        cls_index: Optional[torch.Tensor] = None,
+        p_mask: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, FlaubertForQuestionAnsweringOutput]:
+        r"""
+        langs (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            A parallel sequence of tokens to be used to indicate the language of each token in the input. Indices are
+            languages ids which can be obtained from the language names by using two conversion mappings provided in
+            the configuration of the model (only provided for multilingual models). More precisely, the *language name
+            to language id* mapping is in `model.config.lang2id` (which is a dictionary string to int) and the
+            *language id to language name* mapping is in `model.config.id2lang` (dictionary int to string).
+
+            See usage examples detailed in the [multilingual documentation](../multilingual).
+        lengths (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Length of each sentence that can be used to avoid performing attention on padding token indices. You can
+            also use *attention_mask* for the same result (see above), kept here for compatibility. Indices selected in
+            `[0, ..., input_ids.size(-1)]`.
+        cache (`dict[str, torch.FloatTensor]`, *optional*):
+            Instance of `EncoderDecoderCache` that contains precomputed KV states. Can be used to speed up sequential
+            decoding.
+        is_impossible (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels whether a question has an answer or no answer (SQuAD 2.0).
+        cls_index (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for position (index) of the classification token to use as input for computing plausibility of the
+            answer.
+        p_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Optional mask of tokens which can't be in answers (e.g. [CLS], [PAD], ...). 1.0 means token should be
+            masked. 0.0 means token is not masked.
+
+        Example:
+
+        ```python
+        >>> from transformers import AutoTokenizer, FlaubertForQuestionAnswering
+        >>> import torch
+
+        >>> tokenizer = AutoTokenizer.from_pretrained("flaubert/flaubert_base_cased")
+        >>> model = FlaubertForQuestionAnswering.from_pretrained("flaubert/flaubert_base_cased")
+
+        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(
+        ...     0
+        ... )  # Batch size 1
+        >>> start_positions = torch.tensor([1])
+        >>> end_positions = torch.tensor([3])
+
+        >>> outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
+        >>> loss = outputs.loss
+        ```"""
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        transformer_outputs = self.transformer(
+            input_ids,
+            attention_mask=attention_mask,
+            langs=langs,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            lengths=lengths,
+            cache=cache,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        output = transformer_outputs[0]
+
+        # The SQuAD head receives the same `return_dict` flag, so its result is a tuple
+        # exactly when the backbone's is.
+        outputs = self.qa_outputs(
+            output,
+            start_positions=start_positions,
+            end_positions=end_positions,
+            cls_index=cls_index,
+            is_impossible=is_impossible,
+            p_mask=p_mask,
+            return_dict=return_dict,
+        )
+
+        if not return_dict:
+            # Tuple form: head outputs followed by the backbone's extra outputs.
+            return outputs + transformer_outputs[1:]
+
+        return FlaubertForQuestionAnsweringOutput(
+            loss=outputs.loss,
+            start_top_log_probs=outputs.start_top_log_probs,
+            start_top_index=outputs.start_top_index,
+            end_top_log_probs=outputs.end_top_log_probs,
+            end_top_index=outputs.end_top_index,
+            cls_logits=outputs.cls_logits,
+            hidden_states=transformer_outputs.hidden_states,
+            attentions=transformer_outputs.attentions,
+        )
+
+
+@auto_docstring
+# Copied from transformers.models.xlm.modeling_xlm.XLMForMultipleChoice with XLM_INPUTS->FLAUBERT_INPUTS,XLM->Flaubert
+class FlaubertForMultipleChoice(FlaubertPreTrainedModel):
+    def __init__(self, config, *inputs, **kwargs):
+        super().__init__(config, *inputs, **kwargs)
+
+        self.transformer = FlaubertModel(config)
+        self.sequence_summary = FlaubertSequenceSummary(config)
+        self.logits_proj = nn.Linear(config.num_labels, 1)  # one scalar score per flattened (example, choice) row
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        langs: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        lengths: Optional[torch.Tensor] = None,
+        cache: Optional[dict[str, torch.Tensor]] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, MultipleChoiceModelOutput]:
+        r"""
+        input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
+            Indices of input sequence tokens in the vocabulary.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        langs (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
+            A parallel sequence of tokens to be used to indicate the language of each token in the input. Indices are
+            languages ids which can be obtained from the language names by using two conversion mappings provided in
+            the configuration of the model (only provided for multilingual models). More precisely, the *language name
+            to language id* mapping is in `model.config.lang2id` (which is a dictionary string to int) and the
+            *language id to language name* mapping is in `model.config.id2lang` (dictionary int to string).
+
+            See usage examples detailed in the [multilingual documentation](../multilingual).
+        token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
+            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
+            1]`:
+
+            - 0 corresponds to a *sentence A* token,
+            - 1 corresponds to a *sentence B* token.
+
+            [What are token type IDs?](../glossary#token-type-ids)
+        position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
+            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+            config.max_position_embeddings - 1]`.
+
+            [What are position IDs?](../glossary#position-ids)
+        lengths (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Length of each sentence that can be used to avoid performing attention on padding token indices. You can
+            also use *attention_mask* for the same result (see above), kept here for compatibility. Indices selected in
+            `[0, ..., input_ids.size(-1)]`.
+        cache (`dict[str, torch.FloatTensor]`, *optional*):
+            Dictionary of precomputed key/value states, forwarded unchanged to the underlying `FlaubertModel`. Can
+            be used to speed up sequential decoding.
+        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
+            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+            model's internal embedding lookup matrix.
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
+            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
+            `input_ids` above)
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
+        # Fold the `num_choices` axis into the leading axis so the encoder receives ordinary per-sequence inputs.
+        input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
+        attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
+        token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
+        position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
+        langs = langs.view(-1, langs.size(-1)) if langs is not None else None
+        inputs_embeds = (
+            inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
+            if inputs_embeds is not None
+            else None
+        )
+        # `lengths` is rejected for this head: log a warning and discard the value before calling the encoder.
+        if lengths is not None:
+            logger.warning(
+                "The `lengths` parameter cannot be used with the Flaubert multiple choice models. Please use the "
+                "attention mask instead."
+            )
+            lengths = None
+
+        transformer_outputs = self.transformer(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            langs=langs,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            lengths=lengths,
+            cache=cache,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        output = transformer_outputs[0]
+        logits = self.sequence_summary(output)
+        logits = self.logits_proj(logits)
+        reshaped_logits = logits.view(-1, num_choices)  # regroup the scores as one row per example
+
+        loss = None
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()
+            loss = loss_fct(reshaped_logits, labels)
+
+        if not return_dict:
+            output = (reshaped_logits,) + transformer_outputs[1:]
+            return ((loss,) + output) if loss is not None else output
+
+        return MultipleChoiceModelOutput(
+            loss=loss,
+            logits=reshaped_logits,
+            hidden_states=transformer_outputs.hidden_states,
+            attentions=transformer_outputs.attentions,
+        )
+
+
+__all__ = [
+    "FlaubertForMultipleChoice",
+    "FlaubertForQuestionAnswering",
+    "FlaubertForQuestionAnsweringSimple",
+    "FlaubertForSequenceClassification",
+    "FlaubertForTokenClassification",
+    "FlaubertModel",
+    "FlaubertWithLMHeadModel",
+    "FlaubertPreTrainedModel",
+]
diff --git a/phivenv/Lib/site-packages/transformers/models/flaubert/modeling_tf_flaubert.py b/phivenv/Lib/site-packages/transformers/models/flaubert/modeling_tf_flaubert.py
new file mode 100644
index 0000000000000000000000000000000000000000..88b7ae9f0c9ddd96e761b3739a035c55a217dec8
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/flaubert/modeling_tf_flaubert.py
@@ -0,0 +1,1343 @@
+# coding=utf-8
+# Copyright 2019-present, Facebook, Inc and the HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+TF 2.0 Flaubert model.
+"""
+
+from __future__ import annotations
+
+import itertools
+import random
+import warnings
+from dataclasses import dataclass
+
+import numpy as np
+import tensorflow as tf
+
+from ...activations_tf import get_tf_activation
+from ...modeling_tf_outputs import (
+    TFBaseModelOutput,
+    TFMultipleChoiceModelOutput,
+    TFQuestionAnsweringModelOutput,
+    TFSequenceClassifierOutput,
+    TFTokenClassifierOutput,
+)
+from ...modeling_tf_utils import (
+    TFModelInputType,
+    TFMultipleChoiceLoss,
+    TFPreTrainedModel,
+    TFQuestionAnsweringLoss,
+    TFSequenceClassificationLoss,
+    TFSequenceSummary,
+    TFSharedEmbeddings,
+    TFTokenClassificationLoss,
+    get_initializer,
+    keras,
+    keras_serializable,
+    unpack_inputs,
+)
+from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax
+from ...utils import (
+    MULTIPLE_CHOICE_DUMMY_INPUTS,
+    ModelOutput,
+    add_code_sample_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+)
+from .configuration_flaubert import FlaubertConfig
+
+
+logger = logging.get_logger(__name__)
+
+_CHECKPOINT_FOR_DOC = "flaubert/flaubert_base_cased"
+_CONFIG_FOR_DOC = "FlaubertConfig"
+
+
+FLAUBERT_START_DOCSTRING = r"""
+
+    This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.)
+
+    This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
+    as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
+    behavior.
+
+    <Tip>
+
+    TensorFlow models and layers in `transformers` accept two formats as input:
+
+    - having all inputs as keyword arguments (like PyTorch models), or
+    - having all inputs as a list, tuple or dict in the first positional argument.
+
+    The reason the second format is supported is that Keras methods prefer this format when passing inputs to models
+    and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just
+    pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second
+    format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with
+    the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first
+    positional argument:
+
+    - a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
+    - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
+    `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
+    - a dictionary with one or several input Tensors associated to the input names given in the docstring:
+    `model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
+
+    Note that when creating models and layers with
+    [subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry
+    about any of this, as you can just pass inputs like you would to any other Python function!
+
+    </Tip>
+
+    Parameters:
+        config ([`FlaubertConfig`]): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+FLAUBERT_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`Numpy array` or `tf.Tensor` of shape `(batch_size, sequence_length)`):
+            Indices of input sequence tokens in the vocabulary.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and
+            [`PreTrainedTokenizer.encode`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        attention_mask (`Numpy array` or `tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+            - `1` for tokens that are **not masked**,
+            - `0` for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
+        langs (`tf.Tensor` or `Numpy array` of shape `(batch_size, sequence_length)`, *optional*):
+            A parallel sequence of tokens to be used to indicate the language of each token in the input. Indices are
+            languages ids which can be obtained from the language names by using two conversion mappings provided in
+            the configuration of the model (only provided for multilingual models). More precisely, the *language name
+            to language id* mapping is in `model.config.lang2id` (which is a dictionary string to int) and the
+            *language id to language name* mapping is in `model.config.id2lang` (dictionary int to string).
+
+            See usage examples detailed in the [multilingual documentation](../multilingual).
+        token_type_ids (`tf.Tensor` or `Numpy array` of shape `(batch_size, sequence_length)`, *optional*):
+            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
+            1]`:
+
+            - `0` corresponds to a *sentence A* token,
+            - `1` corresponds to a *sentence B* token.
+
+            [What are token type IDs?](../glossary#token-type-ids)
+        position_ids (`tf.Tensor` or `Numpy array` of shape `(batch_size, sequence_length)`, *optional*):
+            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+            config.max_position_embeddings - 1]`.
+
+            [What are position IDs?](../glossary#position-ids)
+        lengths (`tf.Tensor` or `Numpy array` of shape `(batch_size,)`, *optional*):
+            Length of each sentence that can be used to avoid performing attention on padding token indices. You can
+            also use *attention_mask* for the same result (see above), kept here for compatibility Indices selected in
+            `[0, ..., input_ids.size(-1)]`:
+        cache (`dict[str, tf.Tensor]`, *optional*):
+            Dictionary string to `tf.FloatTensor` that contains precomputed hidden states (key and values in the
+            attention blocks) as computed by the model (see `cache` output below). Can be used to speed up sequential
+            decoding.
+
+            The dictionary object will be modified in-place during the forward pass to add newly computed
+            hidden-states.
+        head_mask (`Numpy array` or `tf.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
+
+            - `1` indicates the head is **not masked**,
+            - `0` indicates the head is **masked**.
+
+        inputs_embeds (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+            model's internal embedding lookup matrix.
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the
+            config will be used instead.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
+            used instead.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in
+            eager mode, in graph mode the value will always be set to True.
+        training (`bool`, *optional*, defaults to `False`):
+            Whether or not to use the model in training mode (some modules like dropout modules have different
+            behaviors between training and evaluation).
+"""
+
+
+def get_masks(slen, lengths, causal, padding_mask=None):
+    """
+    Generate hidden states mask, and optionally an attention mask. Returns `(mask, attn_mask)`: `mask` is
+    `padding_mask` if given (else `position < length`); `attn_mask` is `mask`, or lower-triangular if `causal`.
+    """
+    bs = shape_list(lengths)[0]
+    # Built unconditionally: the causal branch needs it even when `padding_mask` is supplied (was a NameError).
+    alen = tf.range(slen, dtype=lengths.dtype)
+    if padding_mask is not None:
+        mask = padding_mask
+    else:
+        mask = alen < tf.expand_dims(lengths, axis=1)
+
+    # attention mask is the same as mask, or triangular inferior attention (causal)
+    if causal:
+        attn_mask = tf.less_equal(
+            tf.tile(tf.reshape(alen, (1, 1, slen)), (bs, slen, 1)), tf.reshape(alen, (1, slen, 1))
+        )
+    else:
+        attn_mask = mask
+
+    # sanity check
+    tf.debugging.assert_equal(shape_list(mask), [bs, slen])
+    if causal:
+        tf.debugging.assert_equal(shape_list(attn_mask), [bs, slen, slen])
+
+    return mask, attn_mask
+
+
+class TFFlaubertPreTrainedModel(TFPreTrainedModel):
+    """
+    Abstract base class tying `FlaubertConfig` to the shared `TFPreTrainedModel` machinery (weights
+    initialization, downloading and loading of pretrained models).
+    """
+
+    config_class = FlaubertConfig
+    base_model_prefix = "transformer"
+
+    @property
+    def dummy_inputs(self):
+        """Return a small fixed batch of inputs, including `langs` when language embeddings are enabled."""
+        token_ids = tf.constant([[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]], dtype=tf.int32)
+        attn = tf.constant([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]], dtype=tf.int32)
+        dummy = {"input_ids": token_ids, "attention_mask": attn}
+
+        # Sometimes Flaubert has language embeddings so don't forget to build them as well if needed
+        needs_langs = self.config.use_lang_emb and self.config.n_langs > 1
+        if needs_langs:
+            dummy["langs"] = tf.constant([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]], dtype=tf.int32)
+
+        return dummy
+
+
+@add_start_docstrings(
+    "The bare Flaubert Model transformer outputting raw hidden-states without any specific head on top.",
+    FLAUBERT_START_DOCSTRING,
+)
+class TFFlaubertModel(TFFlaubertPreTrainedModel):
+    def __init__(self, config, *inputs, **kwargs):
+        super().__init__(config, *inputs, **kwargs)
+        self.transformer = TFFlaubertMainLayer(config, name="transformer")
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(FLAUBERT_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=TFBaseModelOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def call(
+        self,
+        input_ids: np.ndarray | tf.Tensor | None = None,
+        attention_mask: np.ndarray | tf.Tensor | None = None,
+        langs: np.ndarray | tf.Tensor | None = None,
+        token_type_ids: np.ndarray | tf.Tensor | None = None,
+        position_ids: np.ndarray | tf.Tensor | None = None,
+        lengths: np.ndarray | tf.Tensor | None = None,
+        cache: dict[str, tf.Tensor] | None = None,
+        head_mask: np.ndarray | tf.Tensor | None = None,
+        inputs_embeds: tf.Tensor | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        training: bool | None = False,
+    ) -> tuple | TFBaseModelOutput:
+        # Pure delegation: every argument is forwarded unchanged to the main layer.
+        return self.transformer(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            langs=langs,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            lengths=lengths,
+            cache=cache,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        main_layer = getattr(self, "transformer", None)
+        if main_layer is not None:
+            with tf.name_scope(main_layer.name):
+                main_layer.build(None)
+
+
+# Copied from transformers.models.xlm.modeling_tf_xlm.TFXLMMultiHeadAttention with XLM->Flaubert
+class TFFlaubertMultiHeadAttention(keras.layers.Layer):
+    NEW_ID = itertools.count()  # class-wide counter; each instance draws its `layer_id` (its `cache` key) from it
+
+    def __init__(self, n_heads, dim, config, **kwargs):
+        super().__init__(**kwargs)
+        self.layer_id = next(TFFlaubertMultiHeadAttention.NEW_ID)
+        self.dim = dim
+        self.n_heads = n_heads
+        self.output_attentions = config.output_attentions
+        assert self.dim % self.n_heads == 0
+
+        self.q_lin = keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name="q_lin")
+        self.k_lin = keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name="k_lin")
+        self.v_lin = keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name="v_lin")
+        self.out_lin = keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name="out_lin")
+        self.dropout = keras.layers.Dropout(config.attention_dropout)
+        self.pruned_heads = set()
+        self.dim = dim  # NOTE(review): redundant, `self.dim` is already assigned above
+
+    def prune_heads(self, heads):
+        raise NotImplementedError
+
+    def call(self, input, mask, kv, cache, head_mask, output_attentions, training=False):
+        """
+        Self-attention (if kv is None) or attention over source sentence (provided by kv).
+        """
+        # Input is (bs, qlen, dim)
+        # Mask is (bs, klen) (non-causal) or (bs, klen, klen)
+        bs, qlen, dim = shape_list(input)
+
+        if kv is None:
+            klen = qlen if cache is None else cache["slen"] + qlen
+        else:
+            klen = shape_list(kv)[1]
+
+        # assert dim == self.dim, f'Dimensions do not match: {dim} input vs {self.dim} configured'
+        dim_per_head = self.dim // self.n_heads
+        mask_reshape = (bs, 1, qlen, klen) if len(shape_list(mask)) == 3 else (bs, 1, 1, klen)
+
+        def shape(x):
+            """projection"""
+            return tf.transpose(tf.reshape(x, (bs, -1, self.n_heads, dim_per_head)), perm=(0, 2, 1, 3))
+
+        def unshape(x):
+            """compute context"""
+            return tf.reshape(tf.transpose(x, perm=(0, 2, 1, 3)), (bs, -1, self.n_heads * dim_per_head))
+
+        q = shape(self.q_lin(input))  # (bs, n_heads, qlen, dim_per_head)
+
+        if kv is None:
+            k = shape(self.k_lin(input))  # (bs, n_heads, qlen, dim_per_head)
+            v = shape(self.v_lin(input))  # (bs, n_heads, qlen, dim_per_head)
+        elif cache is None or self.layer_id not in cache:
+            k = v = kv
+            k = shape(self.k_lin(k))  # (bs, n_heads, klen, dim_per_head)
+            v = shape(self.v_lin(v))  # (bs, n_heads, klen, dim_per_head)
+        # With a cache: prepend the cached keys/values (self-attention) or reuse them (kv given), then store.
+        if cache is not None:
+            if self.layer_id in cache:
+                if kv is None:
+                    k_, v_ = cache[self.layer_id]
+                    k = tf.concat([k_, k], axis=2)  # (bs, n_heads, klen, dim_per_head)
+                    v = tf.concat([v_, v], axis=2)  # (bs, n_heads, klen, dim_per_head)
+                else:
+                    k, v = cache[self.layer_id]
+
+            cache[self.layer_id] = (k, v)
+
+        f_dim_per_head = tf.cast(dim_per_head, dtype=q.dtype)
+        q = tf.multiply(q, tf.math.rsqrt(f_dim_per_head))  # (bs, n_heads, qlen, dim_per_head)
+        k = tf.cast(k, dtype=q.dtype)
+        scores = tf.matmul(q, k, transpose_b=True)  # (bs, n_heads, qlen, klen)
+        mask = tf.reshape(mask, mask_reshape)  # (bs, 1, qlen, klen) or (bs, 1, 1, klen)
+        # additive masking: positions where mask == 0 get -1e30 added to their score before the softmax
+        mask = tf.cast(mask, dtype=scores.dtype)
+        scores = scores - 1e30 * (1.0 - mask)
+        weights = stable_softmax(scores, axis=-1)  # (bs, n_heads, qlen, klen)
+        weights = self.dropout(weights, training=training)  # (bs, n_heads, qlen, klen)
+
+        # Mask heads if we want to
+        if head_mask is not None:
+            weights = weights * head_mask
+
+        context = tf.matmul(weights, v)  # (bs, n_heads, qlen, dim_per_head)
+        context = unshape(context)  # (bs, qlen, dim)
+        outputs = (self.out_lin(context),)
+
+        if output_attentions:
+            outputs = outputs + (weights,)
+
+        return outputs
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "q_lin", None) is not None:
+            with tf.name_scope(self.q_lin.name):
+                self.q_lin.build([None, None, self.dim])
+        if getattr(self, "k_lin", None) is not None:
+            with tf.name_scope(self.k_lin.name):
+                self.k_lin.build([None, None, self.dim])
+        if getattr(self, "v_lin", None) is not None:
+            with tf.name_scope(self.v_lin.name):
+                self.v_lin.build([None, None, self.dim])
+        if getattr(self, "out_lin", None) is not None:
+            with tf.name_scope(self.out_lin.name):
+                self.out_lin.build([None, None, self.dim])
+
+
+# Copied from transformers.models.xlm.modeling_tf_xlm.TFXLMTransformerFFN
+class TFFlaubertTransformerFFN(keras.layers.Layer):
+    """Feed-forward block: Dense -> activation -> Dense -> dropout."""
+
+    def __init__(self, in_dim, dim_hidden, out_dim, config, **kwargs):
+        super().__init__(**kwargs)
+
+        self.in_dim = in_dim
+        self.dim_hidden = dim_hidden
+        self.lin1 = keras.layers.Dense(dim_hidden, kernel_initializer=get_initializer(config.init_std), name="lin1")
+        self.lin2 = keras.layers.Dense(out_dim, kernel_initializer=get_initializer(config.init_std), name="lin2")
+        self.act = get_tf_activation("gelu" if config.gelu_activation else "relu")
+        self.dropout = keras.layers.Dropout(config.dropout)
+
+    def call(self, input, training=False):
+        # lin1 -> activation -> lin2, then dropout driven by the `training` flag.
+        hidden = self.act(self.lin1(input))
+        projected = self.lin2(hidden)
+        return self.dropout(projected, training=training)
+
+    def build(self, input_shape=None):
+        # Build each Dense once, under its own name scope, with the feature size it receives.
+        if self.built:
+            return
+        self.built = True
+        for attr, width in (("lin1", self.in_dim), ("lin2", self.dim_hidden)):
+            dense = getattr(self, attr, None)
+            if dense is not None:
+                with tf.name_scope(dense.name):
+                    dense.build([None, None, width])
+
+
+@keras_serializable
+class TFFlaubertMainLayer(keras.layers.Layer):
+    config_class = FlaubertConfig
+
+    def __init__(self, config, **kwargs):
+        super().__init__(**kwargs)
+
+        self.config = config
+        self.n_heads = config.n_heads
+        self.n_langs = config.n_langs
+        self.dim = config.emb_dim
+        self.hidden_dim = self.dim * 4  # handed to every FFN below as its `dim_hidden`
+        self.n_words = config.n_words
+        self.pad_index = config.pad_index
+        self.causal = config.causal
+        self.n_layers = config.n_layers
+        self.use_lang_emb = config.use_lang_emb
+        self.layerdrop = getattr(config, "layerdrop", 0.0)  # optional config field; falls back to 0.0
+        self.pre_norm = getattr(config, "pre_norm", False)  # optional config field; falls back to False
+        self.output_attentions = config.output_attentions
+        self.output_hidden_states = config.output_hidden_states
+        self.return_dict = config.use_return_dict
+        self.max_position_embeddings = config.max_position_embeddings
+        self.embed_init_std = config.embed_init_std
+        self.dropout = keras.layers.Dropout(config.dropout)
+        self.embeddings = TFSharedEmbeddings(
+            self.n_words, self.dim, initializer_range=config.embed_init_std, name="embeddings"
+        )
+        self.layer_norm_emb = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm_emb")
+        self.attentions = []  # the four lists below hold one sub-layer per transformer layer (filled in the loop)
+        self.layer_norm1 = []
+        self.ffns = []
+        self.layer_norm2 = []
+
+        for i in range(self.n_layers):
+            self.attentions.append(
+                TFFlaubertMultiHeadAttention(self.n_heads, self.dim, config=config, name=f"attentions_._{i}")
+            )
+            self.layer_norm1.append(
+                keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name=f"layer_norm1_._{i}")
+            )
+            # if self.is_decoder:
+            #     self.layer_norm15.append(nn.LayerNorm(self.dim, eps=config.layer_norm_eps))
+            #     self.encoder_attn.append(MultiHeadAttention(self.n_heads, self.dim, dropout=self.attention_dropout))
+            self.ffns.append(
+                TFFlaubertTransformerFFN(self.dim, self.hidden_dim, self.dim, config=config, name=f"ffns_._{i}")
+            )
+            self.layer_norm2.append(
+                keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name=f"layer_norm2_._{i}")
+            )
+
+    def build(self, input_shape=None):
+        """Create the position (and optional language) embedding weights, then build every sub-layer, once."""
+        if self.built:  # checked first so a repeated call cannot register the embedding weights a second time
+            return
+        self.built = True
+        with tf.name_scope("position_embeddings"):
+            self.position_embeddings = self.add_weight(
+                name="embeddings",
+                shape=[self.max_position_embeddings, self.dim],
+                initializer=get_initializer(self.embed_init_std),
+            )
+        if self.n_langs > 1 and self.use_lang_emb:
+            with tf.name_scope("lang_embeddings"):
+                self.lang_embeddings = self.add_weight(
+                    name="embeddings",
+                    shape=[self.n_langs, self.dim],
+                    initializer=get_initializer(self.embed_init_std),
+                )
+
+        if getattr(self, "embeddings", None) is not None:
+            with tf.name_scope(self.embeddings.name):
+                self.embeddings.build(None)
+        if getattr(self, "layer_norm_emb", None) is not None:
+            with tf.name_scope(self.layer_norm_emb.name):
+                self.layer_norm_emb.build([None, None, self.dim])
+        for layer in self.attentions:
+            with tf.name_scope(layer.name):
+                layer.build(None)
+        for layer in self.layer_norm1:
+            with tf.name_scope(layer.name):
+                layer.build([None, None, self.dim])
+        for layer in self.ffns:
+            with tf.name_scope(layer.name):
+                layer.build(None)
+        for layer in self.layer_norm2:
+            with tf.name_scope(layer.name):
+                layer.build([None, None, self.dim])
+
+    def get_input_embeddings(self):  # returns the object stored in `self.embeddings`, not a weight tensor
+        return self.embeddings
+
+    def set_input_embeddings(self, value):  # installs `value` as the `weight` of `self.embeddings`
+        self.embeddings.weight = value
+        self.embeddings.vocab_size = shape_list(value)[0]  # first dimension of `value` becomes the vocab size
+
+    @unpack_inputs
+    def call(
+        self,
+        input_ids: np.ndarray | tf.Tensor | None = None,
+        attention_mask: np.ndarray | tf.Tensor | None = None,
+        langs: np.ndarray | tf.Tensor | None = None,
+        token_type_ids: np.ndarray | tf.Tensor | None = None,
+        position_ids: np.ndarray | tf.Tensor | None = None,
+        lengths: np.ndarray | tf.Tensor | None = None,
+        cache: dict[str, tf.Tensor] | None = None,
+        head_mask: np.ndarray | tf.Tensor | None = None,
+        inputs_embeds: tf.Tensor | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        training: bool | None = False,
+    ) -> tuple | TFBaseModelOutput:
+        # removed: src_enc=None, src_len=None
+
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+        elif input_ids is not None:
+            bs, slen = shape_list(input_ids)
+        elif inputs_embeds is not None:
+            bs, slen = shape_list(inputs_embeds)[:2]
+        else:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+        if lengths is None:
+            if input_ids is not None:
+                lengths = tf.reduce_sum(
+                    tf.cast(tf.not_equal(input_ids, self.pad_index), dtype=input_ids.dtype), axis=1
+                )
+            else:
+                lengths = tf.convert_to_tensor([slen] * bs)
+        # mask = input_ids != self.pad_index
+
+        # check inputs
+        # assert shape_list(lengths)[0] == bs
+        (
+            tf.debugging.assert_equal(shape_list(lengths)[0], bs),
+            f"Expected batch size {shape_list(lengths)[0]} and received batch size {bs} mismatched",
+        )
+        # assert lengths.max().item() <= slen
+        # input_ids = input_ids.transpose(0, 1)  # batch size as dimension 0
+        # assert (src_enc is None) == (src_len is None)
+        # if src_enc is not None:
+        #     assert self.is_decoder
+        #     assert src_enc.size(0) == bs
+
+        # generate masks
+        mask, attn_mask = get_masks(slen, lengths, self.causal, padding_mask=attention_mask)
+        # if self.is_decoder and src_enc is not None:
+        #     src_mask = torch.arange(src_len.max(), dtype=torch.long, device=lengths.device) < src_len[:, None]
+
+        # position_ids
+        if position_ids is None:
+            position_ids = tf.expand_dims(tf.range(slen), axis=0)
+            position_ids = tf.tile(position_ids, (bs, 1))
+
+        # assert shape_list(position_ids) == [bs, slen]  # (slen, bs)
+        (
+            tf.debugging.assert_equal(shape_list(position_ids), [bs, slen]),
+            f"Position id shape {shape_list(position_ids)} and input shape {[bs, slen]} mismatched",
+        )
+        # position_ids = position_ids.transpose(0, 1)
+
+        # langs
+        if langs is not None:
+            # assert shape_list(langs) == [bs, slen]  # (slen, bs)
+            (
+                tf.debugging.assert_equal(shape_list(langs), [bs, slen]),
+                f"Lang shape {shape_list(langs)} and input shape {[bs, slen]} mismatched",
+            )
+            # langs = langs.transpose(0, 1)
+
+        # Prepare head mask if needed
+        # 1.0 in head_mask indicate we keep the head
+        # attention_probs has shape bsz x n_heads x N x N
+        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
+        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x qlen x klen]
+        if head_mask is not None:
+            raise NotImplementedError
+        else:
+            head_mask = [None] * self.n_layers
+
+        # do not recompute cached elements
+        if cache is not None and input_ids is not None:
+            _slen = slen - cache["slen"]
+            input_ids = input_ids[:, -_slen:]
+            position_ids = position_ids[:, -_slen:]
+            if langs is not None:
+                langs = langs[:, -_slen:]
+            mask = mask[:, -_slen:]
+            attn_mask = attn_mask[:, -_slen:]
+
+        # embeddings
+        if inputs_embeds is None:
+            check_embeddings_within_bounds(input_ids, self.embeddings.vocab_size)
+            inputs_embeds = self.embeddings(input_ids)
+
+        tensor = inputs_embeds + tf.gather(self.position_embeddings, position_ids)
+
+        if langs is not None and self.use_lang_emb:
+            tensor = tensor + tf.gather(self.lang_embeddings, langs)
+        if token_type_ids is not None:
+            tensor = tensor + self.embeddings(token_type_ids)
+
+        tensor = self.layer_norm_emb(tensor)
+        tensor = self.dropout(tensor, training=training)
+        mask = tf.cast(mask, dtype=tensor.dtype)
+        tensor = tensor * tf.expand_dims(mask, axis=-1)
+
+        # hidden_states and attentions cannot be None in graph mode.
+        hidden_states = () if output_hidden_states else None
+        attentions = () if output_attentions else None
+
+        # transformer layers
+        for i in range(self.n_layers):
+            # LayerDrop
+            dropout_probability = random.uniform(0, 1)
+
+            if training and (dropout_probability < self.layerdrop):
+                continue
+
+            if output_hidden_states:
+                hidden_states = hidden_states + (tensor,)
+
+            # self attention
+            if not self.pre_norm:
+                attn_outputs = self.attentions[i](
+                    tensor,
+                    attn_mask,
+                    None,
+                    cache,
+                    head_mask[i],
+                    output_attentions,
+                    training=training,
+                )
+                attn = attn_outputs[0]
+
+                if output_attentions:
+                    attentions = attentions + (attn_outputs[1],)
+
+                attn = self.dropout(attn, training=training)
+                tensor = tensor + attn
+                tensor = self.layer_norm1[i](tensor)
+            else:
+                tensor_normalized = self.layer_norm1[i](tensor)
+                attn_outputs = self.attentions[i](
+                    tensor_normalized,
+                    attn_mask,
+                    None,
+                    cache,
+                    head_mask[i],
+                    output_attentions,
+                    training=training,
+                )
+                attn = attn_outputs[0]
+
+                if output_attentions:
+                    attentions = attentions + (attn_outputs[1],)
+
+                attn = self.dropout(attn, training=training)
+                tensor = tensor + attn
+
+            # encoder attention (for decoder only)
+            # if self.is_decoder and src_enc is not None:
+            #     attn = self.encoder_attn[i](tensor, src_mask, kv=src_enc, cache=cache)
+            #     attn = nn.functional.dropout(attn, p=self.dropout, training=self.training)
+            #     tensor = tensor + attn
+            #     tensor = self.layer_norm15[i](tensor)
+
+            # FFN
+            if not self.pre_norm:
+                tensor = tensor + self.ffns[i](tensor)
+                tensor = self.layer_norm2[i](tensor)
+            else:
+                tensor_normalized = self.layer_norm2[i](tensor)
+                tensor = tensor + self.ffns[i](tensor_normalized)
+
+            tensor = tensor * tf.expand_dims(mask, axis=-1)
+
+        # Add last hidden state
+        if output_hidden_states:
+            hidden_states = hidden_states + (tensor,)
+
+        # update cache length
+        if cache is not None:
+            cache["slen"] += tensor.size(1)
+
+        # move back sequence length to dimension 0
+        # tensor = tensor.transpose(0, 1)
+
+        if not return_dict:
+            return tuple(v for v in [tensor, hidden_states, attentions] if v is not None)
+
+        return TFBaseModelOutput(last_hidden_state=tensor, hidden_states=hidden_states, attentions=attentions)
+
+
+# Copied from transformers.models.xlm.modeling_tf_xlm.TFXLMPredLayer
+class TFFlaubertPredLayer(keras.layers.Layer):
+    """
+    Prediction layer: a linear projection through `input_embeddings` plus a bias; adaptive softmax is not implemented.
+    """
+
+    def __init__(self, config, input_embeddings, **kwargs):
+        super().__init__(**kwargs)
+
+        self.asm = config.asm
+        self.n_words = config.n_words  # length of the bias vector created in `build`
+        self.pad_index = config.pad_index
+
+        if config.asm is False:  # only this branch is implemented
+            self.input_embeddings = input_embeddings
+        else:
+            raise NotImplementedError
+            # self.proj = nn.AdaptiveLogSoftmaxWithLoss(
+            #     in_features=dim,
+            #     n_classes=config.n_words,
+            #     cutoffs=config.asm_cutoffs,
+            #     div_value=config.asm_div_value,
+            #     head_bias=True,  # default is False
+            # )
+
+    def build(self, input_shape):
+        # The output weights are the same as the input embeddings, but there is an output-only bias for each token.
+        self.bias = self.add_weight(shape=(self.n_words,), initializer="zeros", trainable=True, name="bias")
+
+        super().build(input_shape)
+
+    def get_output_embeddings(self):
+        return self.input_embeddings
+
+    def set_output_embeddings(self, value):
+        self.input_embeddings.weight = value
+        self.input_embeddings.vocab_size = shape_list(value)[0]  # first dimension of `value`
+
+    def get_bias(self):  # the bias is exposed as a one-entry dict keyed "bias"
+        return {"bias": self.bias}
+
+    def set_bias(self, value):  # expects the same one-entry dict shape that `get_bias` returns
+        self.bias = value["bias"]
+        self.vocab_size = shape_list(value["bias"])[0]  # NOTE(review): writes `vocab_size`; `build` reads `n_words`
+
+    def call(self, hidden_states):
+        hidden_states = self.input_embeddings(hidden_states, mode="linear")  # project with the embedding layer
+        hidden_states = hidden_states + self.bias
+
+        return hidden_states
+
+
+@dataclass
+class TFFlaubertWithLMHeadModelOutput(ModelOutput):
+    """
+    Base class for [`TFFlaubertWithLMHeadModel`] outputs.
+
+    Args:
+        logits (`tf.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
+            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
+            `(batch_size, sequence_length, hidden_size)`.
+
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`.
+
+            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
+            heads.
+    """
+
+    logits: tf.Tensor | None = None  # every field defaults to `None`
+    hidden_states: tuple[tf.Tensor] | None = None
+    attentions: tuple[tf.Tensor] | None = None
+
+
+@add_start_docstrings(
+    """
+    The Flaubert Model transformer with a language modeling head on top (linear layer with weights tied to the input
+    embeddings).
+    """,
+    FLAUBERT_START_DOCSTRING,
+)
+class TFFlaubertWithLMHeadModel(TFFlaubertPreTrainedModel):
+    def __init__(self, config, *inputs, **kwargs):
+        super().__init__(config, *inputs, **kwargs)
+        self.transformer = TFFlaubertMainLayer(config, name="transformer")
+        self.pred_layer = TFFlaubertPredLayer(config, self.transformer.embeddings, name="pred_layer_._proj")
+        # Flaubert does not have past caching features
+        self.supports_xla_generation = False
+
+    def get_lm_head(self):
+        return self.pred_layer
+
+    def get_prefix_bias_name(self):
+        warnings.warn("The method get_prefix_bias_name is deprecated. Please use `get_bias` instead.", FutureWarning)
+        return self.name + "/" + self.pred_layer.name
+
+    def prepare_inputs_for_generation(self, inputs, **kwargs):
+        mask_token_id = self.config.mask_token_id
+        lang_id = self.config.lang_id
+
+        effective_batch_size = inputs.shape[0]
+        mask_token = tf.fill((effective_batch_size, 1), 1) * mask_token_id
+        inputs = tf.concat([inputs, mask_token], axis=1)
+
+        if lang_id is not None:
+            langs = tf.ones_like(inputs) * lang_id
+        else:
+            langs = None
+        return {"input_ids": inputs, "langs": langs}
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(FLAUBERT_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=TFFlaubertWithLMHeadModelOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def call(
+        self,
+        input_ids: np.ndarray | tf.Tensor | None = None,
+        attention_mask: np.ndarray | tf.Tensor | None = None,
+        langs: np.ndarray | tf.Tensor | None = None,
+        token_type_ids: np.ndarray | tf.Tensor | None = None,
+        position_ids: np.ndarray | tf.Tensor | None = None,
+        lengths: np.ndarray | tf.Tensor | None = None,
+        cache: dict[str, tf.Tensor] | None = None,
+        head_mask: np.ndarray | tf.Tensor | None = None,
+        inputs_embeds: tf.Tensor | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        training: bool | None = False,
+    ) -> tuple | TFFlaubertWithLMHeadModelOutput:
+        transformer_outputs = self.transformer(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            langs=langs,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            lengths=lengths,
+            cache=cache,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+        output = transformer_outputs[0]
+        outputs = self.pred_layer(output)
+
+        if not return_dict:
+            return (outputs,) + transformer_outputs[1:]
+
+        return TFFlaubertWithLMHeadModelOutput(
+            logits=outputs, hidden_states=transformer_outputs.hidden_states, attentions=transformer_outputs.attentions
+        )
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "transformer", None) is not None:
+            with tf.name_scope(self.transformer.name):
+                self.transformer.build(None)
+        if getattr(self, "pred_layer", None) is not None:
+            with tf.name_scope(self.pred_layer.name):
+                self.pred_layer.build(None)
+
+
+@add_start_docstrings(
+    """
+    Flaubert Model with a sequence classification/regression head on top (a linear layer on top of the pooled output)
+    e.g. for GLUE tasks.
+    """,
+    FLAUBERT_START_DOCSTRING,
+)
+# Copied from transformers.models.xlm.modeling_tf_xlm.TFXLMForSequenceClassification with XLM_INPUTS->FLAUBERT_INPUTS,XLM->Flaubert
+class TFFlaubertForSequenceClassification(TFFlaubertPreTrainedModel, TFSequenceClassificationLoss):
+    def __init__(self, config, *inputs, **kwargs):
+        super().__init__(config, *inputs, **kwargs)
+        self.num_labels = config.num_labels
+
+        self.transformer = TFFlaubertMainLayer(config, name="transformer")
+        self.sequence_summary = TFSequenceSummary(config, initializer_range=config.init_std, name="sequence_summary")
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(FLAUBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_code_sample_docstrings(
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=TFSequenceClassifierOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def call(
+        self,
+        input_ids: TFModelInputType | None = None,
+        attention_mask: np.ndarray | tf.Tensor | None = None,
+        langs: np.ndarray | tf.Tensor | None = None,
+        token_type_ids: np.ndarray | tf.Tensor | None = None,
+        position_ids: np.ndarray | tf.Tensor | None = None,
+        lengths: np.ndarray | tf.Tensor | None = None,
+        cache: dict[str, tf.Tensor] | None = None,
+        head_mask: np.ndarray | tf.Tensor | None = None,
+        inputs_embeds: np.ndarray | tf.Tensor | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        labels: np.ndarray | tf.Tensor | None = None,
+        training: bool = False,
+    ) -> TFSequenceClassifierOutput | tuple[tf.Tensor]:
+        r"""
+        labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+        """
+        transformer_outputs = self.transformer(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            langs=langs,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            lengths=lengths,
+            cache=cache,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+        output = transformer_outputs[0]  # first transformer output, fed to the summary head
+
+        logits = self.sequence_summary(output)
+
+        loss = None if labels is None else self.hf_compute_loss(labels, logits)  # loss only when labels are given
+
+        if not return_dict:
+            output = (logits,) + transformer_outputs[1:]
+            return ((loss,) + output) if loss is not None else output  # loss is prepended only when computed
+
+        return TFSequenceClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=transformer_outputs.hidden_states,
+            attentions=transformer_outputs.attentions,
+        )
+
+    def build(self, input_shape=None):  # builds each sub-layer once, under that sub-layer's name scope
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "transformer", None) is not None:
+            with tf.name_scope(self.transformer.name):
+                self.transformer.build(None)
+        if getattr(self, "sequence_summary", None) is not None:
+            with tf.name_scope(self.sequence_summary.name):
+                self.sequence_summary.build(None)
+
+
+@add_start_docstrings(
+    """
+    Flaubert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
+    layer on top of the hidden-states output to compute `span start logits` and `span end logits`).
+    """,
+    FLAUBERT_START_DOCSTRING,
+)
+# Copied from transformers.models.xlm.modeling_tf_xlm.TFXLMForQuestionAnsweringSimple with XLM_INPUTS->FLAUBERT_INPUTS,XLM->Flaubert
+class TFFlaubertForQuestionAnsweringSimple(TFFlaubertPreTrainedModel, TFQuestionAnsweringLoss):
+    def __init__(self, config, *inputs, **kwargs):
+        super().__init__(config, *inputs, **kwargs)
+        self.transformer = TFFlaubertMainLayer(config, name="transformer")
+        self.qa_outputs = keras.layers.Dense(
+            config.num_labels, kernel_initializer=get_initializer(config.init_std), name="qa_outputs"
+        )
+        self.config = config  # kept because `build` reads `config.hidden_size`
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(FLAUBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_code_sample_docstrings(
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=TFQuestionAnsweringModelOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def call(
+        self,
+        input_ids: TFModelInputType | None = None,
+        attention_mask: np.ndarray | tf.Tensor | None = None,
+        langs: np.ndarray | tf.Tensor | None = None,
+        token_type_ids: np.ndarray | tf.Tensor | None = None,
+        position_ids: np.ndarray | tf.Tensor | None = None,
+        lengths: np.ndarray | tf.Tensor | None = None,
+        cache: dict[str, tf.Tensor] | None = None,
+        head_mask: np.ndarray | tf.Tensor | None = None,
+        inputs_embeds: np.ndarray | tf.Tensor | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        start_positions: np.ndarray | tf.Tensor | None = None,
+        end_positions: np.ndarray | tf.Tensor | None = None,
+        training: bool = False,
+    ) -> TFQuestionAnsweringModelOutput | tuple[tf.Tensor]:
+        r"""
+        start_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*):
+            Labels for position (index) of the start of the labelled span for computing the token classification loss.
+            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
+            are not taken into account for computing the loss.
+        end_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*):
+            Labels for position (index) of the end of the labelled span for computing the token classification loss.
+            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
+            are not taken into account for computing the loss.
+        """
+        transformer_outputs = self.transformer(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            langs=langs,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            lengths=lengths,
+            cache=cache,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+        sequence_output = transformer_outputs[0]  # first transformer output, fed to the QA projection
+
+        logits = self.qa_outputs(sequence_output)
+        start_logits, end_logits = tf.split(logits, 2, axis=-1)  # two equal parts along the last axis
+        start_logits = tf.squeeze(start_logits, axis=-1)  # assumes `config.num_labels == 2` (width-1 halves) - confirm
+        end_logits = tf.squeeze(end_logits, axis=-1)
+
+        loss = None  # computed only when both position labels are supplied
+        if start_positions is not None and end_positions is not None:
+            labels = {"start_position": start_positions}
+            labels["end_position"] = end_positions
+            loss = self.hf_compute_loss(labels, (start_logits, end_logits))
+
+        if not return_dict:
+            output = (start_logits, end_logits) + transformer_outputs[1:]
+            return ((loss,) + output) if loss is not None else output  # loss is prepended only when computed
+
+        return TFQuestionAnsweringModelOutput(
+            loss=loss,
+            start_logits=start_logits,
+            end_logits=end_logits,
+            hidden_states=transformer_outputs.hidden_states,
+            attentions=transformer_outputs.attentions,
+        )
+
+    def build(self, input_shape=None):  # builds each sub-layer once, under that sub-layer's name scope
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "transformer", None) is not None:
+            with tf.name_scope(self.transformer.name):
+                self.transformer.build(None)
+        if getattr(self, "qa_outputs", None) is not None:
+            with tf.name_scope(self.qa_outputs.name):
+                self.qa_outputs.build([None, None, self.config.hidden_size])
+
+
+@add_start_docstrings(
+    """
+    Flaubert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
+    Named-Entity-Recognition (NER) tasks.
+    """,
+    FLAUBERT_START_DOCSTRING,
+)
+# Copied from transformers.models.xlm.modeling_tf_xlm.TFXLMForTokenClassification with XLM_INPUTS->FLAUBERT_INPUTS,XLM->Flaubert
+class TFFlaubertForTokenClassification(TFFlaubertPreTrainedModel, TFTokenClassificationLoss):
+    def __init__(self, config, *inputs, **kwargs):
+        super().__init__(config, *inputs, **kwargs)
+        self.num_labels = config.num_labels
+
+        self.transformer = TFFlaubertMainLayer(config, name="transformer")
+        self.dropout = keras.layers.Dropout(config.dropout)  # applied to the hidden states before the classifier
+        self.classifier = keras.layers.Dense(
+            config.num_labels, kernel_initializer=get_initializer(config.init_std), name="classifier"
+        )
+        self.config = config  # kept because `build` reads `config.hidden_size`
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(FLAUBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_code_sample_docstrings(
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=TFTokenClassifierOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def call(
+        self,
+        input_ids: TFModelInputType | None = None,
+        attention_mask: np.ndarray | tf.Tensor | None = None,
+        langs: np.ndarray | tf.Tensor | None = None,
+        token_type_ids: np.ndarray | tf.Tensor | None = None,
+        position_ids: np.ndarray | tf.Tensor | None = None,
+        lengths: np.ndarray | tf.Tensor | None = None,
+        cache: dict[str, tf.Tensor] | None = None,
+        head_mask: np.ndarray | tf.Tensor | None = None,
+        inputs_embeds: np.ndarray | tf.Tensor | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        labels: np.ndarray | tf.Tensor | None = None,
+        training: bool = False,
+    ) -> TFTokenClassifierOutput | tuple[tf.Tensor]:
+        r"""
+        labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
+        """
+        transformer_outputs = self.transformer(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            langs=langs,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            lengths=lengths,
+            cache=cache,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+        sequence_output = transformer_outputs[0]  # first transformer output
+
+        sequence_output = self.dropout(sequence_output, training=training)
+        logits = self.classifier(sequence_output)
+
+        loss = None if labels is None else self.hf_compute_loss(labels, logits)  # loss only when labels are given
+
+        if not return_dict:
+            output = (logits,) + transformer_outputs[1:]
+            return ((loss,) + output) if loss is not None else output  # loss is prepended only when computed
+
+        return TFTokenClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=transformer_outputs.hidden_states,
+            attentions=transformer_outputs.attentions,
+        )
+
+    def build(self, input_shape=None):  # builds `transformer` and `classifier` once; `dropout` is not built here
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "transformer", None) is not None:
+            with tf.name_scope(self.transformer.name):
+                self.transformer.build(None)
+        if getattr(self, "classifier", None) is not None:
+            with tf.name_scope(self.classifier.name):
+                self.classifier.build([None, None, self.config.hidden_size])
+
+
+@add_start_docstrings(
+    """
+    Flaubert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
+    softmax) e.g. for RocStories/SWAG tasks.
+    """,
+    FLAUBERT_START_DOCSTRING,
+)
+# Copied from transformers.models.xlm.modeling_tf_xlm.TFXLMForMultipleChoice with XLM_INPUTS->FLAUBERT_INPUTS,XLM->Flaubert
+class TFFlaubertForMultipleChoice(TFFlaubertPreTrainedModel, TFMultipleChoiceLoss):
+    def __init__(self, config, *inputs, **kwargs):
+        super().__init__(config, *inputs, **kwargs)
+
+        self.transformer = TFFlaubertMainLayer(config, name="transformer")
+        self.sequence_summary = TFSequenceSummary(config, initializer_range=config.init_std, name="sequence_summary")
+        # NOTE(review): the sibling heads initialise from `config.init_std`; confirm `initializer_range` is set here.
+        self.logits_proj = keras.layers.Dense(
+            1, kernel_initializer=get_initializer(config.initializer_range), name="logits_proj"
+        )
+        self.config = config
+
+    @property
+    def dummy_inputs(self):
+        """
+        Dummy inputs to build the network.
+
+        Returns:
+            dict of `tf.Tensor` dummy inputs: always `input_ids`, plus `langs` when language embeddings are in use
+        """
+        # Sometimes Flaubert has language embeddings so don't forget to build them as well if needed
+        if self.config.use_lang_emb and self.config.n_langs > 1:
+            return {
+                "input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS, dtype=tf.int32),
+                "langs": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS, dtype=tf.int32),
+            }
+        else:
+            return {
+                "input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS, dtype=tf.int32),
+            }
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(
+        FLAUBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")
+    )
+    @add_code_sample_docstrings(
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=TFMultipleChoiceModelOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def call(
+        self,
+        input_ids: TFModelInputType | None = None,
+        attention_mask: np.ndarray | tf.Tensor | None = None,
+        langs: np.ndarray | tf.Tensor | None = None,
+        token_type_ids: np.ndarray | tf.Tensor | None = None,
+        position_ids: np.ndarray | tf.Tensor | None = None,
+        lengths: np.ndarray | tf.Tensor | None = None,
+        cache: dict[str, tf.Tensor] | None = None,
+        head_mask: np.ndarray | tf.Tensor | None = None,
+        inputs_embeds: np.ndarray | tf.Tensor | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        labels: np.ndarray | tf.Tensor | None = None,
+        training: bool = False,
+    ) -> TFMultipleChoiceModelOutput | tuple[tf.Tensor]:
+        if input_ids is not None:  # dimension 1 is the number of choices, dimension 2 the sequence length
+            num_choices = shape_list(input_ids)[1]
+            seq_length = shape_list(input_ids)[2]
+        else:
+            num_choices = shape_list(inputs_embeds)[1]
+            seq_length = shape_list(inputs_embeds)[2]
+
+        # Fold the choice dimension into the batch dimension so the transformer sees one sequence per row.
+        flat_input_ids = tf.reshape(input_ids, (-1, seq_length)) if input_ids is not None else None
+        flat_attention_mask = tf.reshape(attention_mask, (-1, seq_length)) if attention_mask is not None else None
+        flat_token_type_ids = tf.reshape(token_type_ids, (-1, seq_length)) if token_type_ids is not None else None
+        flat_position_ids = tf.reshape(position_ids, (-1, seq_length)) if position_ids is not None else None
+        flat_langs = tf.reshape(langs, (-1, seq_length)) if langs is not None else None
+        flat_inputs_embeds = (
+            tf.reshape(inputs_embeds, (-1, seq_length, shape_list(inputs_embeds)[3]))
+            if inputs_embeds is not None
+            else None
+        )
+
+        if lengths is not None:  # `lengths` is ignored for this head: warn, then drop it
+            logger.warning(
+                "The `lengths` parameter cannot be used with the Flaubert multiple choice models. Please use the "
+                "attention mask instead.",
+            )
+            lengths = None
+
+        transformer_outputs = self.transformer(
+            flat_input_ids,
+            flat_attention_mask,
+            flat_langs,
+            flat_token_type_ids,
+            flat_position_ids,
+            lengths,
+            cache,
+            head_mask,
+            flat_inputs_embeds,
+            output_attentions,
+            output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+        output = transformer_outputs[0]  # first transformer output, fed to the summary head
+        logits = self.sequence_summary(output)
+        logits = self.logits_proj(logits)
+        reshaped_logits = tf.reshape(logits, (-1, num_choices))  # back to one row of choice scores per example
+
+        loss = None if labels is None else self.hf_compute_loss(labels, reshaped_logits)
+
+        if not return_dict:
+            output = (reshaped_logits,) + transformer_outputs[1:]
+            return ((loss,) + output) if loss is not None else output  # loss is prepended only when computed
+
+        return TFMultipleChoiceModelOutput(
+            loss=loss,
+            logits=reshaped_logits,
+            hidden_states=transformer_outputs.hidden_states,
+            attentions=transformer_outputs.attentions,
+        )
+
+    def build(self, input_shape=None):  # builds each sub-layer once, under that sub-layer's name scope
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "transformer", None) is not None:
+            with tf.name_scope(self.transformer.name):
+                self.transformer.build(None)
+        if getattr(self, "sequence_summary", None) is not None:
+            with tf.name_scope(self.sequence_summary.name):
+                self.sequence_summary.build(None)
+        if getattr(self, "logits_proj", None) is not None:  # NOTE(review): width below is `num_labels` - confirm
+            with tf.name_scope(self.logits_proj.name):
+                self.logits_proj.build([None, None, self.config.num_labels])
+
+
+__all__ = [  # names exported by `from <module> import *`
+    "TFFlaubertForMultipleChoice",
+    "TFFlaubertForQuestionAnsweringSimple",
+    "TFFlaubertForSequenceClassification",
+    "TFFlaubertForTokenClassification",
+    "TFFlaubertModel",
+    "TFFlaubertPreTrainedModel",
+    "TFFlaubertWithLMHeadModel",
+]
diff --git a/phivenv/Lib/site-packages/transformers/models/flaubert/tokenization_flaubert.py b/phivenv/Lib/site-packages/transformers/models/flaubert/tokenization_flaubert.py
new file mode 100644
index 0000000000000000000000000000000000000000..dee653450ebacc02bcfee8142c5d08b54358c1a9
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/flaubert/tokenization_flaubert.py
@@ -0,0 +1,538 @@
+# coding=utf-8
+# Copyright 2019-present CNRS, Facebook Inc. and the HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization classes for Flaubert."""
+
+import json
+import os
+import re
+import unicodedata
+from typing import Optional
+
+from ...tokenization_utils import PreTrainedTokenizer
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+VOCAB_FILES_NAMES = {
+    "vocab_file": "vocab.json",
+    "merges_file": "merges.txt",
+}
+
+
+def convert_to_unicode(text):
+    """
+    Return `text` as `str`: `str` is passed through unchanged, `bytes` are decoded as UTF-8 with undecodable bytes
+    dropped (`errors="ignore"`). Any other type raises `TypeError`.
+    """
+
+    def ensure_text(s, encoding="utf-8", errors="strict"):
+        if isinstance(s, str):
+            return s
+        if not isinstance(s, bytes):
+            raise TypeError(f"not expecting type '{type(s)}'")
+        return s.decode(encoding, errors)
+
+    return ensure_text(text, encoding="utf-8", errors="ignore")
+
+
+# Copied from transformers.models.xlm.tokenization_xlm.get_pairs
+def get_pairs(word):
+    """
+    Return set of symbol pairs in a word. word is represented as tuple of symbols (symbols being variable-length
+    strings)
+
+    Only adjacent symbols are paired, so a word with fewer than two symbols (including the empty word) yields an
+    empty set.
+    """
+    # Pair every symbol with its successor; `zip` stops when the shifted copy runs out. Unlike reading `word[0]`
+    # up front, this cannot raise `IndexError` when `word` is empty.
+    return set(zip(word, word[1:]))
+
+
+# Copied from transformers.models.xlm.tokenization_xlm.replace_unicode_punct
+def replace_unicode_punct(text):
+    """
+    Port of https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/replace-unicode-punctuation.perl
+    """
+    # The two full stops also swallow any whitespace that follows them, hence the regexes.
+    text = re.sub(r"。\s*", ". ", text)
+    text = re.sub(r"．\s*", ". ", text)
+    # All remaining rules map a single character to ASCII, so one `str.translate` pass applies them together.
+    ascii_for = {
+        # separators and sentence punctuation
+        "，": ",",
+        "、": ",",
+        "∶": ":",
+        "：": ":",
+        "；": ";",
+        "？": "?",
+        "！": "!",
+        "…": "...",
+        # quotes and brackets
+        "”": '"',
+        "“": '"',
+        "《": '"',
+        "》": '"',
+        "」": '"',
+        "「": '"',
+        "’": "'",
+        "（": "(",
+        "）": ")",
+        "〈": "<",
+        "〉": ">",
+        "【": "[",
+        "】": "]",
+        # full-width digits
+        "０": "0", "１": "1", "２": "2", "３": "3", "４": "4",
+        "５": "5", "６": "6", "７": "7", "８": "8", "９": "9",
+        # other symbols
+        "～": "~",
+        "━": "-",
+        "％": "%",
+    }
+    return text.translate(str.maketrans(ascii_for))
+
+
+# Copied from transformers.models.xlm.tokenization_xlm.remove_non_printing_char
+def remove_non_printing_char(text):
+    """
+    Port of https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/remove-non-printing-char.perl
+
+    Drops every character whose Unicode general category starts with "C" and returns the remaining characters
+    joined back into a string, in their original order.
+    """
+    # Bind the lookup once instead of resolving the attribute for every character.
+    category_of = unicodedata.category
+    kept = [char for char in text if not category_of(char).startswith("C")]
+    return "".join(kept)
+
+
+class FlaubertTokenizer(PreTrainedTokenizer):
+    """
+    Construct a Flaubert tokenizer. Based on Byte-Pair Encoding. The tokenization process is the following:
+
+    - Moses preprocessing and tokenization.
+    - Normalizing all input text.
+    - The arguments `special_tokens` and the function `set_special_tokens`, can be used to add additional symbols (like
+      "__classify__") to a vocabulary.
+    - The argument `do_lowercase` controls lower casing (automatically set for pretrained vocabularies).
+
+    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
+    this superclass for more information regarding those methods.
+
+    Args:
+        vocab_file (`str`):
+            Vocabulary file.
+        merges_file (`str`):
+            Merges file.
+        do_lowercase (`bool`, *optional*, defaults to `False`):
+            Controls lower casing.
+        unk_token (`str`, *optional*, defaults to `"<unk>"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        bos_token (`str`, *optional*, defaults to `"<s>"`):
+            The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token.
+
+            <Tip>
+
+            When building a sequence using special tokens, this is not the token that is used for the beginning of
+            sequence. The token used is the `cls_token`.
+
+            </Tip>
+
+        sep_token (`str`, *optional*, defaults to `"</s>"`):
+            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
+            sequence classification or for a text and a question for question answering. It is also used as the last
+            token of a sequence built with special tokens.
+        pad_token (`str`, *optional*, defaults to `"<pad>"`):
+            The token used for padding, for example when batching sequences of different lengths.
+        cls_token (`str`, *optional*, defaults to `"</s>"`):
+            The classifier token which is used when doing sequence classification (classification of the whole sequence
+            instead of per-token classification). It is the first token of the sequence when built with special tokens.
+        mask_token (`str`, *optional*, defaults to `"<special1>"`):
+            The token used for masking values. This is the token used when training this model with masked language
+            modeling. This is the token which the model will try to predict.
+        additional_special_tokens (`List[str]`, *optional*, defaults to `['<special0>', '<special1>', '<special2>', '<special3>', '<special4>', '<special5>', '<special6>', '<special7>', '<special8>', '<special9>']`):
+            List of additional special tokens.
+        lang2id (`Dict[str, int]`, *optional*):
+            Dictionary mapping languages string identifiers to their IDs.
+        id2lang (`Dict[int, str]`, *optional*):
+            Dictionary mapping language IDs to their string identifiers.
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+
+    def __init__(
+        self,
+        vocab_file,
+        merges_file,
+        do_lowercase=False,
+        unk_token="<unk>",
+        bos_token="<s>",
+        sep_token="</s>",
+        pad_token="<pad>",
+        cls_token="</s>",
+        mask_token="<special1>",
+        additional_special_tokens=(
+            "<special0>",
+            "<special1>",
+            "<special2>",
+            "<special3>",
+            "<special4>",
+            "<special5>",
+            "<special6>",
+            "<special7>",
+            "<special8>",
+            "<special9>",
+        ),
+        lang2id=None,
+        id2lang=None,
+        **kwargs,
+    ):
+        do_lowercase_and_remove_accent = kwargs.pop("do_lowercase_and_remove_accent", None)
+        if do_lowercase_and_remove_accent is not None:
+            logger.warning(
+                "`do_lowercase_and_remove_accent` is passed as a keyword argument, but this won't do anything."
+                " `FlaubertTokenizer` will always set it to `False`."
+            )
+        # always `False`
+        self.do_lowercase_and_remove_accent = False
+        self.do_lowercase = do_lowercase
+        if additional_special_tokens is not None:
+            # The signature default is an immutable tuple; the base class always receives a fresh list.
+            additional_special_tokens = list(additional_special_tokens)
+
+        try:
+            import sacremoses
+        except ImportError as err:
+            raise ImportError(
+                "You need to install sacremoses to use FlaubertTokenizer. "
+                "See https://pypi.org/project/sacremoses/ for installation."
+            ) from err
+        self.sm = sacremoses
+
+        # cache of sm.MosesPunctNormalizer instance
+        self.cache_moses_punct_normalizer = {}
+        # cache of sm.MosesTokenizer instance
+        self.cache_moses_tokenizer = {}
+        self.lang_with_custom_tokenizer = {"zh", "th", "ja"}
+        self.lang2id = lang2id
+        self.id2lang = id2lang
+        if lang2id is not None and id2lang is not None:
+            assert len(lang2id) == len(id2lang)
+        self.ja_word_tokenizer = None
+        self.zh_word_tokenizer = None
+
+        with open(vocab_file, encoding="utf-8") as vocab_handle:
+            self.encoder = json.load(vocab_handle)
+        self.decoder = {v: k for k, v in self.encoder.items()}
+        with open(merges_file, encoding="utf-8") as merges_handle:
+            merges = merges_handle.read().split("\n")[:-1]
+        merges = [tuple(merge.split()[:2]) for merge in merges]
+        self.bpe_ranks = dict(zip(merges, range(len(merges))))
+        self.cache = {}
+
+        super().__init__(
+            do_lowercase=do_lowercase,
+            unk_token=unk_token,
+            bos_token=bos_token,
+            sep_token=sep_token,
+            pad_token=pad_token,
+            cls_token=cls_token,
+            mask_token=mask_token,
+            additional_special_tokens=additional_special_tokens,
+            lang2id=lang2id,
+            id2lang=id2lang,
+            **kwargs,
+        )
+
+    @property
+    # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.do_lower_case
+    def do_lower_case(self):
+        return self.do_lowercase_and_remove_accent
+
+    # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.moses_punct_norm
+    def moses_punct_norm(self, text, lang):
+        # One `MosesPunctNormalizer` per language: reuse the cached instance, create and store it on a miss.
+        try:
+            normalizer = self.cache_moses_punct_normalizer[lang]
+        except KeyError:
+            normalizer = self.cache_moses_punct_normalizer[lang] = self.sm.MosesPunctNormalizer(lang=lang)
+        return normalizer.normalize(text)
+
+    # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.moses_tokenize
+    def moses_tokenize(self, text, lang):
+        # One `MosesTokenizer` per language: reuse the cached instance, create and store it on a miss.
+        try:
+            tokenizer = self.cache_moses_tokenizer[lang]
+        except KeyError:
+            tokenizer = self.cache_moses_tokenizer[lang] = self.sm.MosesTokenizer(lang=lang)
+        return tokenizer.tokenize(text, return_str=False, escape=False)
+
+    # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.moses_pipeline
+    def moses_pipeline(self, text, lang):
+        # Three steps, applied in this order: Unicode punctuation to ASCII, Moses punctuation normalization,
+        # removal of non-printing characters.
+        ascii_punct = replace_unicode_punct(text)
+        return remove_non_printing_char(self.moses_punct_norm(ascii_punct, lang))
+
+    # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.ja_tokenize
+    def ja_tokenize(self, text):
+        if self.ja_word_tokenizer is not None:
+            return list(self.ja_word_tokenizer.getWS(text))
+        try:
+            import Mykytea
+            self.ja_word_tokenizer = Mykytea.Mykytea(
+                f"-model {os.path.expanduser('~')}/local/share/kytea/model.bin"
+            )
+        except (AttributeError, ImportError):
+            logger.error(
+                "Make sure you install KyTea (https://github.com/neubig/kytea) and it's python wrapper"
+                " (https://github.com/chezou/Mykytea-python) with the following steps"
+            )
+            logger.error("1. git clone git@github.com:neubig/kytea.git && cd kytea")
+            logger.error("2. autoreconf -i")
+            logger.error("3. ./configure --prefix=$HOME/local")
+            logger.error("4. make && make install")
+            logger.error("5. pip install kytea")
+            raise
+        return list(self.ja_word_tokenizer.getWS(text))
+
+    @property
+    # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.vocab_size
+    def vocab_size(self):
+        return len(self.encoder)
+
+    # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.get_vocab
+    def get_vocab(self):
+        return dict(self.encoder, **self.added_tokens_encoder)
+
+    # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.bpe
+    def bpe(self, token):
+        """Return the BPE segmentation of `token`: its symbols joined by spaces, the last one ending in `</w>`."""
+        # Consult the cache first, so the symbol tuple is only built on a cache miss.
+        if token in self.cache:
+            return self.cache[token]
+        word = tuple(token[:-1]) + (token[-1] + "</w>",)
+        pairs = get_pairs(word)
+
+        if not pairs:
+            return token + "</w>"
+
+        while True:
+            # Merge the adjacent pair with the lowest rank; stop once no remaining pair has a known merge.
+            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
+            if bigram not in self.bpe_ranks:
+                break
+            first, second = bigram
+            new_word = []
+            i = 0
+            while i < len(word):
+                try:
+                    j = word.index(first, i)
+                except ValueError:
+                    new_word.extend(word[i:])
+                    break
+                new_word.extend(word[i:j])
+                i = j
+
+                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
+                    new_word.append(first + second)
+                    i += 2
+                else:
+                    new_word.append(word[i])
+                    i += 1
+            word = tuple(new_word)
+            if len(word) == 1:
+                break
+            pairs = get_pairs(word)
+        word = " ".join(word)
+        if word == "\n  </w>":
+            word = "\n</w>"
+        self.cache[token] = word
+        return word
+
+    def preprocess_text(self, text):
+        # Decode first: `bytes.replace` rejects `str` arguments, so the quote rewrite below needs `str` input.
+        text = convert_to_unicode(text)
+        text = text.replace("``", '"').replace("''", '"')
+        text = unicodedata.normalize("NFC", text)
+        if self.do_lowercase:
+            text = text.lower()
+
+        return text
+
+    def _tokenize(self, text, bypass_tokenizer=False):
+        """
+        Tokenize a string with the Moses pipeline (the language is fixed to French, `"fr"`) and then apply BPE.
+
+        Details of tokenization:
+
+            - [sacremoses](https://github.com/alvations/sacremoses): port of Moses
+            - Install with `pip install sacremoses`
+
+        Args:
+            - bypass_tokenizer: Allow users to preprocess and tokenize the sentences externally (default = False)
+              (bool). If True, the text is only split on whitespace before BPE is applied.
+
+        Returns:
+            List of tokens.
+        """
+        lang = "fr"
+        if lang and self.lang2id and lang not in self.lang2id:
+            logger.error(
+                "Supplied language code not found in lang2id mapping. Please check that your language is supported by"
+                " the loaded pretrained model."
+            )
+
+        if bypass_tokenizer:
+            words = text.split()
+        else:
+            text = self.preprocess_text(text)
+            text = self.moses_pipeline(text, lang=lang)
+            words = self.moses_tokenize(text, lang=lang)
+
+        # Empty tokens are skipped; every other token contributes its space-separated BPE pieces, in order.
+        split_tokens = []
+        for word in filter(None, words):
+            split_tokens.extend(self.bpe(word).split(" "))
+
+        return split_tokens
+
+    # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer._convert_token_to_id
+    def _convert_token_to_id(self, token):
+        """Return the id of `token` from the vocab, or the vocab entry of `unk_token` when the token is missing."""
+        return self.encoder.get(token, self.encoder.get(self.unk_token))
+
+    # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer._convert_id_to_token
+    def _convert_id_to_token(self, index):
+        """Return the token (str) stored for `index` in the vocab, or `unk_token` when the index is unknown."""
+        return self.decoder.get(index, self.unk_token)
+
+    # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.convert_tokens_to_string
+    def convert_tokens_to_string(self, tokens):
+        """Concatenate `tokens`, turn every `</w>` marker into a space and strip surrounding whitespace."""
+        # No separator is inserted between tokens: the only spaces come from the `</w>` markers.
+        return "".join(tokens).replace("</w>", " ").strip()
+
+    # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.build_inputs_with_special_tokens
+    def build_inputs_with_special_tokens(
+        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
+    ) -> list[int]:
+        """
+        Build model inputs for a single sequence or a pair of sequences by concatenating them and wrapping them in
+        special tokens. With the default tokens (`bos_token="<s>"`, `sep_token="</s>"`) the result is:
+
+        - single sequence: `<s> X </s>`
+        - pair of sequences: `<s> A </s> B </s>`
+
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs to which the special tokens will be added.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
+
+        """
+        # Both ids are read once, bos first; the separator closes every sequence.
+        bos_id, sep_id = self.bos_token_id, self.sep_token_id
+        sequence = [bos_id] + token_ids_0 + [sep_id]
+        if token_ids_1 is not None:
+            sequence = sequence + token_ids_1 + [sep_id]
+        return sequence
+
+    # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.get_special_tokens_mask
+    def get_special_tokens_mask(
+        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False
+    ) -> list[int]:
+        """
+        Build the special-tokens mask for token lists that have no special tokens added yet. This method is called
+        when adding special tokens using the tokenizer `prepare_for_model` method.
+
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
+                Whether or not the token list is already formatted with special tokens for the model.
+
+        Returns:
+            `List[int]`: A list of 0s and 1s: 1 for a special token, 0 for a sequence token.
+        """
+        if already_has_special_tokens:
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+            )
+        # One special token before the first sequence and one after each sequence.
+        mask = [1, *([0] * len(token_ids_0)), 1]
+        if token_ids_1 is not None:
+            mask += [0] * len(token_ids_1) + [1]
+        return mask
+
+    # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.save_vocabulary
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
+        """
+        Write the vocabulary (JSON) and the merges (one per line, in rank order) into `save_directory` and return
+        both file paths. If `save_directory` is not a directory, an error is logged and `None` is returned.
+        """
+        if not os.path.isdir(save_directory):
+            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+            return
+        prefix = filename_prefix + "-" if filename_prefix else ""
+        vocab_file = os.path.join(save_directory, prefix + VOCAB_FILES_NAMES["vocab_file"])
+        merge_file = os.path.join(save_directory, prefix + VOCAB_FILES_NAMES["merges_file"])
+        with open(vocab_file, "w", encoding="utf-8") as f:
+            f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")
+
+        # `expected` is the rank the next merge should carry; a gap triggers the warning below.
+        expected = 0
+        with open(merge_file, "w", encoding="utf-8") as writer:
+            for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
+                if expected != token_index:
+                    logger.warning(
+                        f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
+                        " Please check that the tokenizer is not corrupted!"
+                    )
+                writer.write(" ".join(bpe_tokens) + "\n")
+                expected = token_index + 1
+
+        return vocab_file, merge_file
+
+    # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.__getstate__
+    def __getstate__(self):
+        # The state is a shallow copy of the instance dict in which the `sacremoses` module kept under `sm` is
+        # replaced by `None`; the instance itself is left untouched.
+        return {**self.__dict__, "sm": None}
+
+    # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.__setstate__
+    def __setstate__(self, d):
+        self.__dict__ = d
+
+        # `__getstate__` stores `None` under `sm`, so the module has to be imported again here.
+        try:
+            import sacremoses
+        except ImportError as err:
+            raise ImportError(
+                "You need to install sacremoses to use FlaubertTokenizer. "
+                "See https://pypi.org/project/sacremoses/ for installation."
+            ) from err
+        self.sm = sacremoses
+
+
+__all__ = ["FlaubertTokenizer"]
diff --git a/phivenv/Lib/site-packages/transformers/models/flava/__init__.py b/phivenv/Lib/site-packages/transformers/models/flava/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..292593cb4a201e35a9fd571baec639d9b940e76c
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/flava/__init__.py
@@ -0,0 +1,31 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_flava import *
+    from .feature_extraction_flava import *
+    from .image_processing_flava import *
+    from .image_processing_flava_fast import *
+    from .modeling_flava import *
+    from .processing_flava import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/phivenv/Lib/site-packages/transformers/models/flava/__pycache__/__init__.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/flava/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..26b3b9598bf1d18bda8f44d1da0b64b80f8a5b59
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/flava/__pycache__/__init__.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/flava/__pycache__/configuration_flava.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/flava/__pycache__/configuration_flava.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2b1b91880261cad88e415e9197d04d49851e6cf4
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/flava/__pycache__/configuration_flava.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/flava/__pycache__/feature_extraction_flava.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/flava/__pycache__/feature_extraction_flava.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..82913f1ed0b7ca3a5be52f29351260cf1a246ebe
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/flava/__pycache__/feature_extraction_flava.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/flava/__pycache__/image_processing_flava.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/flava/__pycache__/image_processing_flava.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0d449cffd8f9fc86215d0631c7700095a19ac251
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/flava/__pycache__/image_processing_flava.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/flava/__pycache__/image_processing_flava_fast.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/flava/__pycache__/image_processing_flava_fast.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b58598f9c92fc972b5cd30bd9b6ba744eb285098
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/flava/__pycache__/image_processing_flava_fast.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/flava/__pycache__/modeling_flava.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/flava/__pycache__/modeling_flava.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0eac8bb86e4a71bccc17a3f9c84ef1546e58535e
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/flava/__pycache__/modeling_flava.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/flava/__pycache__/processing_flava.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/flava/__pycache__/processing_flava.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..66c42be669847f9583c6bc583c56fcd1c3de2898
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/flava/__pycache__/processing_flava.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/flava/configuration_flava.py b/phivenv/Lib/site-packages/transformers/models/flava/configuration_flava.py
new file mode 100644
index 0000000000000000000000000000000000000000..c3ecf68a898271f4af5d725424843c8063066588
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/flava/configuration_flava.py
@@ -0,0 +1,701 @@
+# coding=utf-8
+# Copyright 2022 Meta Platforms authors and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""FLAVA model configurations"""
+
+from typing import Any, Optional
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)  # module logger; FlavaConfig.__init__ reports config merging through it
+
+
+class FlavaImageConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`FlavaImageModel`]. It is used to instantiate an
+    FLAVA model according to the specified arguments, defining the model architecture.
+
+    Instantiating a configuration with the defaults will yield a similar configuration to that of the FLAVA
+    [facebook/flava-full](https://huggingface.co/facebook/flava-full) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        hidden_size (`int`, *optional*, defaults to 768):
+            Dimensionality of the encoder layers and the pooler layer.
+        num_hidden_layers (`int`, *optional*, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 12):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        intermediate_size (`int`, *optional*, defaults to 3072):
+            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"selu"` and `"gelu_new"` are supported.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
+            The epsilon used by the layer normalization layers.
+        image_size (`int`, *optional*, defaults to 224):
+            The size (resolution) of each image.
+        patch_size (`int`, *optional*, defaults to 16):
+            The size (resolution) of each patch.
+        num_channels (`int`, *optional*, defaults to 3):
+            The number of input channels.
+        qkv_bias (`bool`, *optional*, defaults to `True`):
+            Whether to add a bias to the queries, keys and values.
+        mask_token (`bool`, *optional*, defaults to `True`):
+            Whether to use a mask token or not. Used in MIM (Masked Image Modeling) loss for FLAVA.
+        vocab_size (`int`, *optional*, defaults to 8192):
+            Vocabulary size of the [`FlavaImageCodebook`] used in conjunction with [`FlavaImageModel`] for MIM (Masked
+            Image Modeling) loss for FLAVA.
+
+    Example:
+
+    ```python
+    >>> from transformers import FlavaImageConfig, FlavaImageModel
+
+    >>> # Initializing a FlavaImageModel with  style configuration
+    >>> configuration = FlavaImageConfig()
+
+    >>> # Initializing a FlavaImageModel model (with random weights) from the style configuration
+    >>> model = FlavaImageModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "flava_image_model"
+    base_config_key = "image_config"
+
+    def __init__(
+        self,
+        hidden_size: int = 768,
+        num_hidden_layers: int = 12,
+        num_attention_heads: int = 12,
+        intermediate_size: int = 3072,
+        hidden_act: int = "gelu",
+        hidden_dropout_prob: float = 0.0,
+        attention_probs_dropout_prob: float = 0.0,
+        initializer_range: float = 0.02,
+        layer_norm_eps: float = 1e-12,
+        image_size: int = 224,
+        patch_size: int = 16,
+        num_channels: int = 3,
+        qkv_bias: bool = True,
+        mask_token: bool = True,
+        vocab_size: int = 8192,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.intermediate_size = intermediate_size
+        self.hidden_act = hidden_act
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.initializer_range = initializer_range
+        self.layer_norm_eps = layer_norm_eps
+        self.image_size = image_size
+        self.patch_size = patch_size
+        self.num_channels = num_channels
+        self.qkv_bias = qkv_bias
+        self.mask_token = mask_token
+        self.vocab_size = vocab_size
+
+
+class FlavaTextConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`FlavaTextModel`]. It is used to instantiate an
+    FLAVA model according to the specified arguments, defining the model architecture.
+
+    Instantiating a configuration with the defaults will yield a similar configuration to that of the FLAVA
+    [facebook/flava-full](https://huggingface.co/facebook/flava-full) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 30522):
+            Vocabulary size of the BERT model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`FlavaTextModel`].
+        type_vocab_size (`int`, *optional*, defaults to 2):
+            The vocabulary size of the `token_type_ids` passed when calling [`FlavaTextModel`]. Note that even though
+            text encoder allows `token_type_ids`'s value as 2, for text-only pretraining and fine-tuning, only 1 is
+            used similar to RoBERTa.
+        max_position_embeddings (`int`, *optional*, defaults to 512):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            just in case (e.g., 512 or 1024 or 2048). For VL, max_length passed to model is 77.
+        position_embedding_type (`str`, *optional*, defaults to `"absolute"`):
+            Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For
+            positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to
+            [Self-Attention with Relative Position Representations (Shaw et al.)](https://huggingface.co/papers/1803.02155).
+            For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models
+            with Better Relative Position Embeddings (Huang et al.)](https://huggingface.co/papers/2009.13658).
+        hidden_size (`int`, *optional*, defaults to 768):
+            Dimensionality of the encoder layers and the pooler layer.
+        num_hidden_layers (`int`, *optional*, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 12):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        intermediate_size (`int`, *optional*, defaults to 3072):
+            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"selu"` and `"gelu_new"` are supported.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout ratio for the attention probabilities.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
+            The epsilon used by the layer normalization layers.
+        image_size (`int`, *optional*, defaults to 224):
+            The size (resolution) of each image.
+        patch_size (`int`, *optional*, defaults to 16):
+            The size (resolution) of each patch.
+        num_channels (`int`, *optional*, defaults to 3):
+            The number of input channels.
+        qkv_bias (`bool`, *optional*, defaults to `True`):
+            Whether to add a bias to the queries, keys and values.
+
+    Example:
+
+    ```python
+    >>> from transformers import FlavaTextConfig, FlavaTextModel
+
+    >>> # Initializing a FlavaTextModel with  style configuration
+    >>> configuration = FlavaTextConfig()
+
+    >>> # Initializing a FlavaTextModel model (with random weights) from the style configuration
+    >>> model = FlavaTextModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "flava_text_model"
+    base_config_key = "text_config"
+
+    def __init__(
+        self,
+        vocab_size: int = 30522,
+        type_vocab_size: int = 2,
+        max_position_embeddings: int = 512,
+        position_embedding_type: str = "absolute",
+        hidden_size: int = 768,
+        num_hidden_layers: int = 12,
+        num_attention_heads: int = 12,
+        intermediate_size: int = 3072,
+        hidden_act: str = "gelu",
+        hidden_dropout_prob: float = 0.0,
+        attention_probs_dropout_prob: float = 0.0,
+        initializer_range: float = 0.02,
+        layer_norm_eps: float = 1e-12,
+        pad_token_id: int = 0,
+        qkv_bias: bool = True,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        self.vocab_size = vocab_size
+        self.type_vocab_size = type_vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.position_embedding_type = position_embedding_type
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.intermediate_size = intermediate_size
+        self.hidden_act = hidden_act
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.initializer_range = initializer_range
+        self.layer_norm_eps = layer_norm_eps
+        self.qkv_bias = qkv_bias
+        self.pad_token_id = pad_token_id
+
+
+class FlavaMultimodalConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`FlavaMultimodalModel`]. It is used to instantiate
+    an FLAVA model according to the specified arguments, defining the model architecture.
+
+    Instantiating a configuration with the defaults will yield a similar configuration to that of the FLAVA
+    [facebook/flava-full](https://huggingface.co/facebook/flava-full) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        hidden_size (`int`, *optional*, defaults to 768):
+            Dimensionality of the encoder layers and the pooler layer.
+        num_hidden_layers (`int`, *optional*, defaults to 6):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 12):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        intermediate_size (`int`, *optional*, defaults to 3072):
+            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"selu"` and `"gelu_new"` are supported.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
+            The epsilon used by the layer normalization layers.
+        qkv_bias (`bool`, *optional*, defaults to `True`):
+            Whether to add a bias to the queries, keys and values.
+        use_cls_token (`bool`, *optional*, defaults to `True`):
+            Whether to use an extra CLS token for multimodal settings. Usually needed by the FLAVA model.
+
+
+    Example:
+
+    ```python
+    >>> from transformers import FlavaMultimodalConfig, FlavaMultimodalModel
+
+    >>> # Initializing a FlavaMultimodalModel with  style configuration
+    >>> configuration = FlavaMultimodalConfig()
+
+    >>> # Initializing a FlavaMultimodalModel model (with random weights) from the style configuration
+    >>> model = FlavaMultimodalModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "flava_multimodal_model"
+    base_config_key = "multimodal_config"
+
+    def __init__(
+        self,
+        hidden_size: int = 768,
+        num_hidden_layers: int = 6,
+        num_attention_heads: int = 12,
+        intermediate_size: int = 3072,
+        hidden_act: int = "gelu",
+        hidden_dropout_prob: int = 0.0,
+        attention_probs_dropout_prob: int = 0.0,
+        initializer_range: float = 0.02,
+        layer_norm_eps: float = 1e-12,
+        qkv_bias: bool = True,
+        use_cls_token: bool = True,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.intermediate_size = intermediate_size
+        self.hidden_act = hidden_act
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.initializer_range = initializer_range
+        self.layer_norm_eps = layer_norm_eps
+        self.qkv_bias = qkv_bias
+        self.use_cls_token = use_cls_token
+
+
+class FlavaImageCodebookConfig(PretrainedConfig):
+    model_type = "flava_image_codebook"
+    base_config_key = "image_codebook_config"
+
+    r"""
+    [`FlavaImageCodebookConfig`] is the configuration class to store the configuration of a [`FlavaImageCodebook`]. It
+    is used to instantiate an FLAVA model according to the specified arguments, defining the model architecture.
+    Instantiating a configuration with the defaults will yield a similar configuration to that of the FLAVA
+    [facebook/flava-image-codebook](https://huggingface.co/facebook/flava-image-codebook) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        num_groups (`int`, *optional*, defaults to 4):
+            Number of groups to be created. This parameter as of now doesn't affect the model and is used for some
+            internal calculation and estimations.
+        input_channels (`int`, *optional*, defaults to 3):
+            Number of channels in the image to be passed.
+        num_blocks_per_group (`int`, *optional*, defaults to 2):
+            Number of conv-based blocks per group.
+        hidden_size (`int`, *optional*, defaults to 256):
+            Size of hidden dim for the blocks.
+        vocab_size (`int`, *optional*, defaults to 8192):
+            Size of the output vocabulary for the codebook.
+        freeze (`bool`, defaults to `True`):
+            Whether to freeze the weights of the model.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        kwargs (*optional*):
+            Dictionary of keyword arguments.
+
+    Example:
+
+    ```python
+    >>> from transformers import FlavaImageCodebookConfig, FlavaImageCodebook
+
+    >>> # Initializing a FlavaImageCodebook with style configuration
+    >>> configuration = FlavaImageCodebookConfig()
+
+    >>> # Initializing a FlavaImageCodebook model (with random weights) from the style configuration
+    >>> model = FlavaImageCodebook(configuration)
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```
+    """
+
+    def __init__(
+        self,
+        num_groups: int = 4,
+        input_channels: int = 3,
+        num_blocks_per_group: int = 2,
+        hidden_size: int = 256,
+        vocab_size: int = 8192,
+        freeze: int = True,
+        initializer_range: float = 0.02,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.num_groups = num_groups
+        self.input_channels = input_channels
+        self.num_blocks_per_group = num_blocks_per_group
+        self.hidden_size = hidden_size
+        self.vocab_size = vocab_size
+        self.freeze = freeze
+        self.initializer_range = initializer_range
+
+
+class FlavaConfig(PretrainedConfig):
+    r"""
+    [`FlavaConfig`] is the configuration class to store the configuration of a [`FlavaModel`]. It is used to
+    instantiate FLAVA model according to the specified arguments, defining the text model, image model, image codebook
+    and multimodal model configs. Instantiating a configuration with the defaults will yield a similar configuration to
+    that of the FLAVA [facebook/flava-full](https://huggingface.co/facebook/flava-full) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        text_config (`dict`, *optional*):
+            Dictionary of configuration options used to initialize [`FlavaTextConfig`].
+        image_config (`dict`, *optional*):
+            Dictionary of configuration options used to initialize [`FlavaImageConfig`].
+        multimodal_config (`dict`, *optional*):
+            Dictionary of configuration options used to initialize [`FlavaMultimodalConfig`].
+        hidden_size (`int`, *optional*, defaults to 768):
+            Dimensionality of the encoder layers and the pooler layer.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
+            The epsilon used by the layer normalization layers.
+        projection_dim (`int`, *optional*, defaults to 512):
+            Dimensionality of text and image projection layers.
+        logit_scale_init_value (`float`, *optional*, defaults to 2.6592):
+            The initial value of the *logit_scale* parameter. Default is used as per the original FLAVA/CLIP
+            implementation.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        ce_ignore_index (`int`, *optional*, defaults to -100):
+            Cross entropy index to ignore.
+        mim_weight (`float`, *optional*, defaults to 1.0):
+            Weight to be assigned to MIM (Masked Image Modeling) unimodal loss
+        mlm_weight (`float`, *optional*, defaults to 1.0):
+            Weight to be assigned to MLM (Masked Language Modeling) unimodal loss
+        global_contrastive_weight (`float`, *optional*, defaults to 1.0):
+            Weight to be assigned to global contrastive cross-alignment loss.
+        itm_weight (`float`, *optional*, defaults to 1.0):
+            Weight to be assigned to image-text matching multimodal loss.
+        mmm_image_weight (`float`, *optional*, defaults to 1.0):
+            Weight to be assigned to MMM loss's image part.
+        mmm_text_weight (`float`, *optional*, defaults to 1.0):
+            Weight to be assigned to MMM loss's text part.
+        global_backprop_contrastive (`bool`, *optional*, defaults to `True`):
+            Whether to use global backpropgation through all workers in contrastive loss.
+        skip_unmasked_multimodal_encoder (`bool`, *optional*, defaults to `True`):
+            Whether to skip running unmasked multimodal encoder whose outputs are not used by FLAVA losses.
+        return_loss (`bool`, *optional*, defaults to `True`):
+            Whether to return loss or not
+
+        kwargs (*optional*):
+            Dictionary of keyword arguments.
+
+    Example:
+
+    ```python
+    >>> from transformers import FlavaConfig, FlavaModel, FlavaForPreTraining
+
+    >>> # Initializing a FlavaConfig with style configuration
+    >>> configuration = FlavaConfig()
+
+    >>> # Initializing a FlavaModel and FlavaForPreTraining model (with random weights) from the style configuration
+    >>> model = FlavaModel(configuration)
+    >>> model_pre = FlavaForPreTraining(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    >>> configuration_pre = model_pre.config
+    ```
+    """
+
+    model_type = "flava"
+    sub_configs = {
+        "text_config": FlavaTextConfig,
+        "image_config": FlavaImageConfig,
+        "multimodal_config": FlavaMultimodalConfig,
+        "image_codebook_config": FlavaImageCodebookConfig,
+    }
+
+    def __init__(
+        self,
+        image_config: Optional[dict[str, Any]] = None,
+        text_config: Optional[dict[str, Any]] = None,
+        multimodal_config: Optional[dict[str, Any]] = None,
+        image_codebook_config: Optional[dict[str, Any]] = None,
+        hidden_size: int = 768,
+        layer_norm_eps: float = 1e-12,
+        projection_dim: int = 768,
+        init_codebook: bool = True,
+        logit_scale_init_value: float = 2.6592,
+        initializer_range: float = 0.02,
+        ce_ignore_index: int = -100,
+        mim_weight: float = 1.0,
+        mlm_weight: float = 1.0,
+        global_contrastive_weight: float = 1.0,
+        itm_weight: float = 1.0,
+        mmm_image_weight: float = 1.0,
+        mmm_text_weight: float = 1.0,
+        global_backprop_contrastive: bool = True,
+        skip_unmasked_multimodal_encoder: bool = True,
+        return_loss: bool = True,
+        **kwargs,
+    ):
+        # If `_config_dict` exist, we use them for the backward compatibility.
+        # We pop out these 2 attributes before calling `super().__init__` to avoid them being saved (which causes a lot
+        # of confusion!).
+        text_config_dict = kwargs.pop("text_config_dict", None)
+        image_config_dict = kwargs.pop("image_config_dict", None)
+        multimodal_config_dict = kwargs.pop("multimodal_config_dict", None)
+        image_codebook_config_dict = kwargs.pop("image_codebook_config_dict", None)
+
+        super().__init__(**kwargs)
+
+        # Instead of simply assigning `[text|vision]_config_dict` to `[text|vision]_config`, we use the values in
+        # `[text|vision]_config_dict` to update the values in `[text|vision]_config`. The values should be same in most
+        # cases, but we don't want to break anything regarding `_config_dict` that existed before commit `8827e1b2`.
+        if text_config_dict is not None:
+            if text_config is None:
+                text_config = {}
+
+            # This is the complete result when using `text_config_dict`.
+            _text_config_dict = FlavaTextConfig(**text_config_dict).to_dict()
+
+            # Give a warning if the values exist in both `_text_config_dict` and `text_config` but being different.
+            for key, value in _text_config_dict.items():
+                if key in text_config and value != text_config[key] and key not in ["transformers_version"]:
+                    # If specified in `text_config_dict`
+                    if key in text_config_dict:
+                        message = (
+                            f"`{key}` is found in both `text_config_dict` and `text_config` but with different values. "
+                            f'The value `text_config_dict["{key}"]` will be used instead.'
+                        )
+                    # If inferred from default argument values (just to be super careful)
+                    else:
+                        message = (
+                            f"`text_config_dict` is provided which will be used to initialize `FlavaTextConfig`. The "
+                            f'value `text_config["{key}"]` will be overridden.'
+                        )
+                    logger.info(message)
+
+            # Update all values in `text_config` with the ones in `_text_config_dict`.
+            text_config.update(_text_config_dict)
+
+        if image_config_dict is not None:
+            if image_config is None:
+                image_config = {}
+
+            # This is the complete result when using `image_config_dict`.
+            _image_config_dict = FlavaImageConfig(**image_config_dict).to_dict()
+            # convert keys to string instead of integer
+            if "id2label" in _image_config_dict:
+                _image_config_dict["id2label"] = {
+                    str(key): value for key, value in _image_config_dict["id2label"].items()
+                }
+
+            # Give a warning if the values exist in both `_image_config_dict` and `image_config` but being different.
+            for key, value in _image_config_dict.items():
+                if key in image_config and value != image_config[key] and key not in ["transformers_version"]:
+                    # If specified in `image_config_dict`
+                    if key in image_config_dict:
+                        message = (
+                            f"`{key}` is found in both `image_config_dict` and `image_config` but with different "
+                            f'values. The value `image_config_dict["{key}"]` will be used instead.'
+                        )
+                    # If inferred from default argument values (just to be super careful)
+                    else:
+                        message = (
+                            f"`image_config_dict` is provided which will be used to initialize `FlavaImageConfig`. "
+                            f'The value `image_config["{key}"]` will be overridden.'
+                        )
+                    logger.info(message)
+
+            # Update all values in `image_config` with the ones in `_image_config_dict`.
+            image_config.update(_image_config_dict)
+
+        if multimodal_config_dict is not None:
+            if multimodal_config is None:
+                multimodal_config = {}
+
+            # This is the complete result when using `multimodal_config_dict`.
+            _multimodal_config_dict = FlavaMultimodalConfig(**multimodal_config_dict).to_dict()
+
+            # Give a warning if the values exist in both `_multimodal_config_dict` and `multimodal_config` but being
+            # different.
+            for key, value in _multimodal_config_dict.items():
+                if (
+                    key in multimodal_config
+                    and value != multimodal_config[key]
+                    and key not in ["transformers_version"]
+                ):
+                    # If specified in `multimodal_config_dict`
+                    if key in multimodal_config_dict:
+                        message = (
+                            f"`{key}` is found in both `multimodal_config_dict` and `multimodal_config` but with "
+                            f'different values. The value `multimodal_config_dict["{key}"]` will be used instead.'
+                        )
+                    # If inferred from default argument values (just to be super careful)
+                    else:
+                        message = (
+                            f"`multimodal_config_dict` is provided which will be used to initialize "
+                            f'`FlavaMultimodalConfig`. The value `multimodal_config["{key}"]` will be overridden.'
+                        )
+                    logger.info(message)
+
+            # Update all values in `multimodal_config` with the ones in `_multimodal_config_dict`.
+            multimodal_config.update(_multimodal_config_dict)
+
+        if image_codebook_config_dict is not None:
+            if image_codebook_config is None:
+                image_codebook_config = {}
+
+            # This is the complete result when using `image_codebook_config_dict`.
+            _image_codebook_config_dict = FlavaImageCodebookConfig(**image_codebook_config_dict).to_dict()
+
+            # Give a warning if the values exist in both `_image_codebook_config_dict` and `image_codebook_config` but
+            # being different.
+            for key, value in _image_codebook_config_dict.items():
+                if (
+                    key in image_codebook_config
+                    and value != image_codebook_config[key]
+                    and key not in ["transformers_version"]
+                ):
+                    # If specified in `image_codebook_config_dict`
+                    if key in image_codebook_config_dict:
+                        message = (
+                            f"`{key}` is found in both `image_codebook_config_dict` and `image_codebook_config` but "
+                            f'with different values. The value `image_codebook_config_dict["{key}"]` will be used '
+                            "instead."
+                        )
+                    # If inferred from default argument values (just to be super careful)
+                    else:
+                        message = (
+                            f"`image_codebook_config_dict` is provided which will be used to initialize "
+                            f'`FlavaImageCodebookConfig`. The value `image_codebook_config["{key}"]` will be overridden.'
+                        )
+                    logger.info(message)
+
+            # Update all values in `image_codebook_config` with the ones in `_image_codebook_config_dict`.
+            image_codebook_config.update(_image_codebook_config_dict)
+
+        if image_config is None:
+            image_config = {}
+            logger.info("`image_config` is `None`. initializing the `FlavaImageConfig` with default values.")
+
+        if text_config is None:
+            text_config = {}
+            logger.info("`text_config` is `None`. Initializing the `FlavaTextConfig` with default values.")
+
+        if multimodal_config is None:
+            multimodal_config = {}
+            logger.info("`multimodal_config` is `None`. initializing the `FlavaMultimodalConfig` with default values.")
+
+        if image_codebook_config is None:
+            image_codebook_config = {}
+            logger.info(
+                "`image_codebook_config` is `None`. initializing the `FlavaImageCodebookConfig` with default values."
+            )
+
+        self.image_config = FlavaImageConfig(**image_config)
+        self.text_config = FlavaTextConfig(**text_config)
+        self.multimodal_config = FlavaMultimodalConfig(**multimodal_config)
+        self.image_codebook_config = FlavaImageCodebookConfig(**image_codebook_config)
+        self.projection_dim = projection_dim
+        self.init_codebook = init_codebook
+
+        self.hidden_size = hidden_size
+        self.layer_norm_eps = layer_norm_eps
+        self.initializer_range = initializer_range
+        self.logit_scale_init_value = logit_scale_init_value
+        self.initializer_factor = 1.0
+        self.ce_ignore_index = ce_ignore_index
+        self.mim_weight = mim_weight
+        self.mlm_weight = mlm_weight
+        self.global_contrastive_weight = global_contrastive_weight
+        self.itm_weight = itm_weight
+        self.mmm_image_weight = mmm_image_weight
+        self.mmm_text_weight = mmm_text_weight
+        self.global_backprop_contrastive = global_backprop_contrastive
+        self.skip_unmasked_multimodal_encoder = skip_unmasked_multimodal_encoder
+        self.return_loss = return_loss
+
+    @classmethod
+    def from_configs(
+        cls,
+        image_config: FlavaImageConfig,
+        text_config: FlavaTextConfig,
+        multimodal_config: FlavaMultimodalConfig,
+        image_codebook_config: FlavaImageCodebookConfig,
+        **kwargs,
+    ):
+        r"""
+        Instantiate a [`FlavaConfig`] (or a derived class) from flava text model configuration, flava image model
+        configuration, flava multimodal model and flava codebook model configuration.
+
+        Returns:
+            [`FlavaConfig`]: An instance of a configuration object
+        """
+
+        return cls(
+            image_config=image_config.to_dict(),
+            text_config=text_config.to_dict(),
+            multimodal_config=multimodal_config.to_dict(),
+            image_codebook_config=image_codebook_config.to_dict(),
+            **kwargs,
+        )
+
+
+__all__ = ["FlavaConfig", "FlavaImageCodebookConfig", "FlavaImageConfig", "FlavaMultimodalConfig", "FlavaTextConfig"]
diff --git a/phivenv/Lib/site-packages/transformers/models/flava/feature_extraction_flava.py b/phivenv/Lib/site-packages/transformers/models/flava/feature_extraction_flava.py
new file mode 100644
index 0000000000000000000000000000000000000000..19bcccc889f546442af71229c998880fbbb2db31
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/flava/feature_extraction_flava.py
@@ -0,0 +1,38 @@
+# coding=utf-8
+# Copyright 2022 Meta Platforms authors and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Feature extractor class for FLAVA."""
+
+import warnings
+
+from ...utils import logging
+from ...utils.import_utils import requires
+from .image_processing_flava import FlavaImageProcessor
+
+
+logger = logging.get_logger(__name__)
+
+
+@requires(backends=("vision",))
+class FlavaFeatureExtractor(FlavaImageProcessor):
+    def __init__(self, *args, **kwargs) -> None:
+        warnings.warn(
+            "The class FlavaFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please"
+            " use FlavaImageProcessor instead.",
+            FutureWarning,
+        )
+        super().__init__(*args, **kwargs)
+
+
+__all__ = ["FlavaFeatureExtractor"]
diff --git a/phivenv/Lib/site-packages/transformers/models/flava/image_processing_flava.py b/phivenv/Lib/site-packages/transformers/models/flava/image_processing_flava.py
new file mode 100644
index 0000000000000000000000000000000000000000..fc8f47d87ce95a751c8ae6dff3b65969f2ee6331
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/flava/image_processing_flava.py
@@ -0,0 +1,706 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Image processor class for Flava."""
+
+import math
+import random
+from collections.abc import Iterable
+from functools import lru_cache
+from typing import Any, Optional, Union
+
+import numpy as np
+
+from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
+from ...image_transforms import resize, to_channel_dimension_format
+from ...image_utils import (
+    OPENAI_CLIP_MEAN,
+    OPENAI_CLIP_STD,
+    ChannelDimension,
+    ImageInput,
+    PILImageResampling,
+    infer_channel_dimension_format,
+    is_scaled_image,
+    make_list_of_images,
+    to_numpy_array,
+    valid_images,
+    validate_preprocess_arguments,
+)
+from ...utils import TensorType, filter_out_non_signature_kwargs, is_vision_available, logging
+from ...utils.import_utils import requires
+
+
+if is_vision_available():
+    import PIL
+
+
+logger = logging.get_logger(__name__)
+
+
+# These values are taken from CLIP
+FLAVA_IMAGE_MEAN = OPENAI_CLIP_MEAN
+FLAVA_IMAGE_STD = OPENAI_CLIP_STD
+FLAVA_CODEBOOK_MEAN = [0.0, 0.0, 0.0]  # codebook-branch default mean (zero for every channel)
+FLAVA_CODEBOOK_STD = [1.0, 1.0, 1.0]  # codebook-branch default std (one for every channel)
+LOGIT_LAPLACE_EPS: float = 0.1  # epsilon `e` of the `(1 - 2e)x + e` mapping applied by `map_pixels`
+
+
+# Inspired from https://github.com/microsoft/unilm/blob/master/beit/masking_generator.py
+class FlavaMaskingGenerator:
+    def __init__(
+        self,
+        input_size: Union[int, tuple[int, int]] = 14,
+        total_mask_patches: int = 75,
+        mask_group_max_patches: Optional[int] = None,
+        mask_group_min_patches: int = 16,
+        mask_group_min_aspect_ratio: Optional[float] = 0.3,
+        mask_group_max_aspect_ratio: Optional[float] = None,
+    ):
+        if not isinstance(input_size, tuple):
+            input_size = (input_size,) * 2
+        self.height, self.width = input_size
+
+        self.num_patches = self.height * self.width
+        self.total_mask_patches = total_mask_patches
+
+        self.mask_group_min_patches = mask_group_min_patches
+        self.mask_group_max_patches = total_mask_patches if mask_group_max_patches is None else mask_group_max_patches
+
+        mask_group_max_aspect_ratio = mask_group_max_aspect_ratio or 1 / mask_group_min_aspect_ratio
+        self.log_aspect_ratio = (math.log(mask_group_min_aspect_ratio), math.log(mask_group_max_aspect_ratio))
+
+    def __repr__(self):
+        repr_str = "MaskingGenerator(%d, %d -> [%d ~ %d], max = %d, %.3f ~ %.3f)" % (
+            self.height,
+            self.width,
+            self.mask_group_min_patches,
+            self.mask_group_max_patches,
+            self.total_mask_patches,
+            self.log_aspect_ratio[0],
+            self.log_aspect_ratio[1],
+        )
+        return repr_str
+
+    def get_shape(self):
+        return self.height, self.width
+
+    def _mask(self, mask, max_mask_patches):
+        delta = 0
+        for _attempt in range(10):
+            target_area = random.uniform(self.mask_group_min_patches, max_mask_patches)
+            aspect_ratio = math.exp(random.uniform(*self.log_aspect_ratio))
+            height = int(round(math.sqrt(target_area * aspect_ratio)))
+            width = int(round(math.sqrt(target_area / aspect_ratio)))
+            if width < self.width and height < self.height:
+                top = random.randint(0, self.height - height)
+                left = random.randint(0, self.width - width)
+
+                num_masked = mask[top : top + height, left : left + width].sum()
+                # Overlap
+                if 0 < height * width - num_masked <= max_mask_patches:
+                    for i in range(top, top + height):
+                        for j in range(left, left + width):
+                            if mask[i, j] == 0:
+                                mask[i, j] = 1
+                                delta += 1
+
+                if delta > 0:
+                    break
+        return delta
+
+    def __call__(self):
+        mask = np.zeros(shape=self.get_shape(), dtype=int)
+        mask_count = 0
+        while mask_count < self.total_mask_patches:
+            max_mask_patches = self.total_mask_patches - mask_count
+            max_mask_patches = min(max_mask_patches, self.mask_group_max_patches)
+
+            delta = self._mask(mask, max_mask_patches)
+            if delta == 0:
+                break
+            else:
+                mask_count += delta
+
+        return mask
+
+
+@requires(backends=("vision",))
+class FlavaImageProcessor(BaseImageProcessor):
+    r"""
+    Constructs a Flava image processor.
+
+    Args:
+        do_resize (`bool`, *optional*, defaults to `True`):
+            Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by the
+            `do_resize` parameter in `preprocess`.
+        size (`dict[str, int]` *optional*, defaults to `{"height": 224, "width": 224}`):
+            Size of the image after resizing. Can be overridden by the `size` parameter in `preprocess`.
+        resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
+            Resampling filter to use if resizing the image. Can be overridden by the `resample` parameter in
+            `preprocess`.
+        do_center_crop (`bool`, *optional*, defaults to `True`):
+            Whether to center crop the images. Can be overridden by the `do_center_crop` parameter in `preprocess`.
+        crop_size (`dict[str, int]` *optional*, defaults to `{"height": 224, "width": 224}`):
+            Size of image after the center crop `(crop_size["height"], crop_size["width"])`. Can be overridden by the
+            `crop_size` parameter in `preprocess`.
+        do_rescale (`bool`, *optional*, defaults to `True`):
+            Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale`
+            parameter in `preprocess`.
+        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
+            Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in
+            `preprocess`.
+        do_normalize (`bool`, *optional*, defaults to `True`):
+            Whether to normalize the image. Can be overridden by the `do_normalize` parameter in `preprocess`.
+        image_mean (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_MEAN`):
+            Mean to use if normalizing the image. This is a float or list of floats the length of the number of
+            channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
+        image_std (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_STD`):
+            Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
+            number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
+        return_image_mask (`bool`, *optional*, defaults to `False`):
+            Whether to return the image mask. Can be overridden by the `return_image_mask` parameter in `preprocess`.
+        input_size_patches (`int`, *optional*, defaults to 14):
+            Number of patches in the image in height and width direction. 14x14 = 196 total patches. Can be overridden
+            by the `input_size_patches` parameter in `preprocess`.
+        total_mask_patches (`int`, *optional*, defaults to 75):
+            Total number of patches that should be masked. Can be overridden by the `total_mask_patches` parameter in
+            `preprocess`.
+        mask_group_min_patches (`int`, *optional*, defaults to 16):
+            Minimum number of patches targeted by a single mask group (rectangle). Can be overridden by the
+            `mask_group_min_patches` parameter in `preprocess`.
+        mask_group_max_patches (`int`, *optional*):
+            Maximum number of patches masked by a single mask group; `total_mask_patches` is used when unset. Can
+            be overridden by the `mask_group_max_patches` parameter in `preprocess`.
+        mask_group_min_aspect_ratio (`float`, *optional*, defaults to 0.3):
+            Minimum aspect ratio of the mask window. Can be overridden by the `mask_group_min_aspect_ratio` parameter
+            in `preprocess`.
+        mask_group_max_aspect_ratio (`float`, *optional*):
+            Maximum aspect ratio of the mask window. Can be overridden by the `mask_group_max_aspect_ratio` parameter
+            in `preprocess`.
+        codebook_do_resize (`bool`, *optional*, defaults to `True`):
+            Whether to resize the input for codebook to a certain `codebook_size`. Can be overridden by the
+            `codebook_do_resize` parameter in `preprocess`.
+        codebook_size (`dict[str, int]`, *optional*, defaults to `{"height": 112, "width": 112}`):
+            Resize the input for codebook to the given size. Can be overridden by the `codebook_size` parameter in
+            `preprocess`.
+        codebook_resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.LANCZOS`):
+            Resampling filter to use if resizing the codebook image. Can be overridden by the `codebook_resample`
+            parameter in `preprocess`.
+        codebook_do_center_crop (`bool`, *optional*, defaults to `True`):
+            Whether to crop the input for codebook at the center. If the input size is smaller than
+            `codebook_crop_size` along any edge, the image is padded with 0's and then center cropped. Can be
+            overridden by the `codebook_do_center_crop` parameter in `preprocess`.
+        codebook_crop_size (`dict[str, int]`, *optional*, defaults to `{"height": 112, "width": 112}`):
+            Desired output size for codebook input when applying center-cropping. Can be overridden by the
+            `codebook_crop_size` parameter in `preprocess`.
+        codebook_do_rescale (`bool`, *optional*, defaults to `True`):
+            Whether to rescale the input for codebook by the specified scale `codebook_rescale_factor`. Can be
+            overridden by the `codebook_do_rescale` parameter in `preprocess`.
+        codebook_rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
+            Defines the scale factor to use if rescaling the codebook image. Can be overridden by the
+            `codebook_rescale_factor` parameter in `preprocess`.
+        codebook_do_map_pixels (`bool`, *optional*, defaults to `True`):
+            Whether to map the pixel values of the codebook input to (1 - 2e)x + e. Can be overridden by the
+            `codebook_do_map_pixels` parameter in `preprocess`.
+        codebook_do_normalize (`bool`, *optional*, defaults to `True`):
+            Whether or not to normalize the input for codebook with `codebook_image_mean` and `codebook_image_std`. Can
+            be overridden by the `codebook_do_normalize` parameter in `preprocess`.
+        codebook_image_mean (`Optional[Union[float, Iterable[float]]]`, *optional*, defaults to `[0, 0, 0]`):
+            The sequence of means for each channel, to be used when normalizing images for codebook. Can be overridden
+            by the `codebook_image_mean` parameter in `preprocess`.
+        codebook_image_std (`Optional[Union[float, Iterable[float]]]`, *optional*, defaults to `[1.0, 1.0, 1.0]`):
+            The sequence of standard deviations for each channel, to be used when normalizing images for codebook. Can
+            be overridden by the `codebook_image_std` parameter in `preprocess`.
+    """
+
+    model_input_names = ["pixel_values"]
+
+    def __init__(
+        self,
+        do_resize: bool = True,
+        size: Optional[dict[str, int]] = None,
+        resample: PILImageResampling = PILImageResampling.BICUBIC,
+        do_center_crop: bool = True,
+        crop_size: Optional[dict[str, int]] = None,
+        do_rescale: bool = True,
+        rescale_factor: Union[int, float] = 1 / 255,
+        do_normalize: bool = True,
+        image_mean: Optional[Union[float, Iterable[float]]] = None,
+        image_std: Optional[Union[float, Iterable[float]]] = None,
+        # Mask related params
+        return_image_mask: bool = False,
+        input_size_patches: int = 14,
+        total_mask_patches: int = 75,
+        mask_group_min_patches: int = 16,
+        mask_group_max_patches: Optional[int] = None,
+        mask_group_min_aspect_ratio: float = 0.3,
+        mask_group_max_aspect_ratio: Optional[float] = None,
+        # Codebook related params
+        return_codebook_pixels: bool = False,
+        codebook_do_resize: bool = True,
+        codebook_size: Optional[bool] = None,
+        codebook_resample: int = PILImageResampling.LANCZOS,
+        codebook_do_center_crop: bool = True,
+        codebook_crop_size: Optional[int] = None,
+        codebook_do_rescale: bool = True,
+        codebook_rescale_factor: Union[int, float] = 1 / 255,
+        codebook_do_map_pixels: bool = True,
+        codebook_do_normalize: bool = True,
+        codebook_image_mean: Optional[Union[float, Iterable[float]]] = None,
+        codebook_image_std: Optional[Union[float, Iterable[float]]] = None,
+        **kwargs,
+    ) -> None:
+        super().__init__(**kwargs)
+        size = size if size is not None else {"height": 224, "width": 224}
+        size = get_size_dict(size)
+        crop_size = crop_size if crop_size is not None else {"height": 224, "width": 224}
+        crop_size = get_size_dict(crop_size, param_name="crop_size")
+
+        codebook_size = codebook_size if codebook_size is not None else {"height": 112, "width": 112}
+        codebook_size = get_size_dict(codebook_size, param_name="codebook_size")
+        codebook_crop_size = codebook_crop_size if codebook_crop_size is not None else {"height": 112, "width": 112}
+        codebook_crop_size = get_size_dict(codebook_crop_size, param_name="codebook_crop_size")
+
+        self.do_resize = do_resize
+        self.size = size
+        self.resample = resample
+        self.do_rescale = do_rescale
+        self.rescale_factor = rescale_factor
+        self.do_center_crop = do_center_crop
+        self.crop_size = crop_size
+        self.do_normalize = do_normalize
+        self.image_mean = image_mean if image_mean is not None else FLAVA_IMAGE_MEAN
+        self.image_std = image_std if image_std is not None else FLAVA_IMAGE_STD
+
+        self.return_image_mask = return_image_mask
+        self.input_size_patches = input_size_patches
+        self.total_mask_patches = total_mask_patches
+        self.mask_group_min_patches = mask_group_min_patches
+        self.mask_group_max_patches = mask_group_max_patches
+        self.mask_group_min_aspect_ratio = mask_group_min_aspect_ratio
+        self.mask_group_max_aspect_ratio = mask_group_max_aspect_ratio
+
+        self.return_codebook_pixels = return_codebook_pixels
+        self.codebook_do_resize = codebook_do_resize
+        self.codebook_size = codebook_size
+        self.codebook_resample = codebook_resample
+        self.codebook_do_center_crop = codebook_do_center_crop
+        self.codebook_crop_size = codebook_crop_size
+        self.codebook_do_rescale = codebook_do_rescale
+        self.codebook_rescale_factor = codebook_rescale_factor
+        self.codebook_do_map_pixels = codebook_do_map_pixels
+        self.codebook_do_normalize = codebook_do_normalize
+        self.codebook_image_mean = codebook_image_mean
+        self.codebook_image_mean = codebook_image_mean if codebook_image_mean is not None else FLAVA_CODEBOOK_MEAN
+        self.codebook_image_std = codebook_image_std if codebook_image_std is not None else FLAVA_CODEBOOK_STD
+
+    @classmethod
+    def from_dict(cls, image_processor_dict: dict[str, Any], **kwargs):
+        """
+        Overrides the `from_dict` method from the base class to make sure parameters are updated if image processor is
+        created using from_dict and kwargs e.g. `FlavaImageProcessor.from_pretrained(checkpoint, codebook_size=600)`
+        """
+        image_processor_dict = image_processor_dict.copy()
+        if "codebook_size" in kwargs:
+            image_processor_dict["codebook_size"] = kwargs.pop("codebook_size")
+        if "codebook_crop_size" in kwargs:
+            image_processor_dict["codebook_crop_size"] = kwargs.pop("codebook_crop_size")
+        return super().from_dict(image_processor_dict, **kwargs)
+
+    @lru_cache
+    def masking_generator(
+        self,
+        input_size_patches,
+        total_mask_patches,
+        mask_group_min_patches,
+        mask_group_max_patches,
+        mask_group_min_aspect_ratio,
+        mask_group_max_aspect_ratio,
+    ) -> FlavaMaskingGenerator:
+        return FlavaMaskingGenerator(
+            input_size=input_size_patches,
+            total_mask_patches=total_mask_patches,
+            mask_group_min_patches=mask_group_min_patches,
+            mask_group_max_patches=mask_group_max_patches,
+            mask_group_min_aspect_ratio=mask_group_min_aspect_ratio,
+            mask_group_max_aspect_ratio=mask_group_max_aspect_ratio,
+        )
+
+    # Copied from transformers.models.vit.image_processing_vit.ViTImageProcessor.resize with PILImageResampling.BILINEAR->PILImageResampling.BICUBIC
+    def resize(
+        self,
+        image: np.ndarray,
+        size: dict[str, int],
+        resample: PILImageResampling = PILImageResampling.BICUBIC,
+        data_format: Optional[Union[str, ChannelDimension]] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+        **kwargs,
+    ) -> np.ndarray:
+        """
+        Resize an image to `(size["height"], size["width"])`.
+
+        Args:
+            image (`np.ndarray`):
+                Image to resize.
+            size (`dict[str, int]`):
+                Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image.
+            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
+                `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BICUBIC`.
+            data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the output image. If unset, the channel dimension format of the input
+                image is used. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the input image. If unset, the channel dimension format is inferred
+                from the input image. Takes the same three values as `data_format`.
+
+        Returns:
+            `np.ndarray`: The resized image.
+
+        Raises:
+            ValueError: If `size` has no `height` or no `width` key once passed through `get_size_dict`.
+        """
+        size = get_size_dict(size)
+        if "height" not in size or "width" not in size:
+            raise ValueError(f"The `size` dictionary must contain the keys `height` and `width`. Got {size.keys()}")
+        output_size = (size["height"], size["width"])
+        return resize(
+            image,
+            size=output_size,
+            resample=resample,
+            data_format=data_format,
+            input_data_format=input_data_format,
+            **kwargs,  # extra keyword arguments are forwarded unchanged to the functional `resize`
+        )
+
+    def map_pixels(self, image: np.ndarray) -> np.ndarray:  # x -> (1 - 2e) * x + e with e = LOGIT_LAPLACE_EPS
+        return (1 - 2 * LOGIT_LAPLACE_EPS) * image + LOGIT_LAPLACE_EPS
+
+    def _preprocess_image(
+        self,
+        image: ImageInput,
+        do_resize: Optional[bool] = None,
+        size: Optional[dict[str, int]] = None,
+        resample: PILImageResampling = None,
+        do_center_crop: Optional[bool] = None,
+        crop_size: Optional[dict[str, int]] = None,
+        do_rescale: Optional[bool] = None,
+        rescale_factor: Optional[float] = None,
+        do_normalize: Optional[bool] = None,
+        image_mean: Optional[Union[float, list[float]]] = None,
+        image_std: Optional[Union[float, list[float]]] = None,
+        do_map_pixels: Optional[bool] = None,
+        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
+        input_data_format: Optional[ChannelDimension] = None,
+    ) -> np.ndarray:
+        """Preprocess a single image: optional resize, center crop, rescale, normalize and pixel mapping."""
+
+        validate_preprocess_arguments(
+            do_rescale=do_rescale,
+            rescale_factor=rescale_factor,
+            do_normalize=do_normalize,
+            image_mean=image_mean,
+            image_std=image_std,
+            do_center_crop=do_center_crop,
+            crop_size=crop_size,
+            do_resize=do_resize,
+            size=size,
+            resample=resample,
+        )
+
+        # All transformations expect numpy arrays.
+        image = to_numpy_array(image)
+
+        if do_rescale and is_scaled_image(image):
+            logger.warning_once(
+                "It looks like you are trying to rescale already rescaled images. If the input"
+                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
+            )
+
+        if input_data_format is None:
+            # We assume that all images have the same channel dimension format.
+            input_data_format = infer_channel_dimension_format(image)
+        # Enabled steps always run in this order: resize -> center crop -> rescale -> normalize -> map pixels.
+        if do_resize:
+            image = self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format)
+
+        if do_center_crop:
+            image = self.center_crop(image=image, size=crop_size, input_data_format=input_data_format)
+
+        if do_rescale:
+            image = self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format)
+
+        if do_normalize:
+            image = self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format)
+
+        if do_map_pixels:
+            image = self.map_pixels(image)
+        # No step above is given a `data_format`; the conversion to the requested layout happens only here.
+        if data_format is not None:
+            image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
+        return image
+
+    @filter_out_non_signature_kwargs()
+    def preprocess(
+        self,
+        images: ImageInput,
+        do_resize: Optional[bool] = None,
+        size: Optional[dict[str, int]] = None,
+        resample: PILImageResampling = None,
+        do_center_crop: Optional[bool] = None,
+        crop_size: Optional[dict[str, int]] = None,
+        do_rescale: Optional[bool] = None,
+        rescale_factor: Optional[float] = None,
+        do_normalize: Optional[bool] = None,
+        image_mean: Optional[Union[float, list[float]]] = None,
+        image_std: Optional[Union[float, list[float]]] = None,
+        # Mask related params
+        return_image_mask: Optional[bool] = None,
+        input_size_patches: Optional[int] = None,
+        total_mask_patches: Optional[int] = None,
+        mask_group_min_patches: Optional[int] = None,
+        mask_group_max_patches: Optional[int] = None,
+        mask_group_min_aspect_ratio: Optional[float] = None,
+        mask_group_max_aspect_ratio: Optional[float] = None,
+        # Codebook related params
+        return_codebook_pixels: Optional[bool] = None,
+        codebook_do_resize: Optional[bool] = None,
+        codebook_size: Optional[dict[str, int]] = None,
+        codebook_resample: Optional[int] = None,
+        codebook_do_center_crop: Optional[bool] = None,
+        codebook_crop_size: Optional[dict[str, int]] = None,
+        codebook_do_rescale: Optional[bool] = None,
+        codebook_rescale_factor: Optional[float] = None,
+        codebook_do_map_pixels: Optional[bool] = None,
+        codebook_do_normalize: Optional[bool] = None,
+        codebook_image_mean: Optional[Iterable[float]] = None,
+        codebook_image_std: Optional[Iterable[float]] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        data_format: ChannelDimension = ChannelDimension.FIRST,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    ) -> PIL.Image.Image:
+        """
+        Preprocess an image or batch of images.
+
+        Args:
+            images (`ImageInput`):
+                Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
+                passing in images with pixel values between 0 and 1, set `do_rescale=False`.
+            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
+                Whether to resize the image.
+            size (`dict[str, int]`, *optional*, defaults to `self.size`):
+                Size of the image.
+            resample (`int`, *optional*, defaults to `self.resample`):
+                Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`, Only
+                has an effect if `do_resize` is set to `True`.
+            do_center_crop (`bool`, *optional*, defaults to `self.do_center_crop`):
+                Whether to center crop the image.
+            crop_size (`dict[str, int]`, *optional*, defaults to `self.crop_size`):
+                Size of the center crop. Only has an effect if `do_center_crop` is set to `True`.
+            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
+                Whether to rescale the image values between [0 - 1].
+            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
+                Rescale factor to rescale the image by if `do_rescale` is set to `True`.
+            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
+                Whether to normalize the image.
+            image_mean (`float` or `list[float]`, *optional*, defaults to `self.image_mean`):
+                Image mean.
+            image_std (`float` or `list[float]`, *optional*, defaults to `self.image_std`):
+                Image standard deviation.
+            return_image_mask (`bool`, *optional*, defaults to `self.return_image_mask`):
+                Whether to return the image mask.
+            input_size_patches (`int`, *optional*, defaults to `self.input_size_patches`):
+                Size of the patches to extract from the image.
+            total_mask_patches (`int`, *optional*, defaults to `self.total_mask_patches`):
+                Total number of patches to extract from the image.
+            mask_group_min_patches (`int`, *optional*, defaults to `self.mask_group_min_patches`):
+                Minimum number of patches to extract from the image.
+            mask_group_max_patches (`int`, *optional*, defaults to `self.mask_group_max_patches`):
+                Maximum number of patches to extract from the image.
+            mask_group_min_aspect_ratio (`float`, *optional*, defaults to `self.mask_group_min_aspect_ratio`):
+                Minimum aspect ratio of the patches to extract from the image.
+            mask_group_max_aspect_ratio (`float`, *optional*, defaults to `self.mask_group_max_aspect_ratio`):
+                Maximum aspect ratio of the patches to extract from the image.
+            return_codebook_pixels (`bool`, *optional*, defaults to `self.return_codebook_pixels`):
+                Whether to return the codebook pixels.
+            codebook_do_resize (`bool`, *optional*, defaults to `self.codebook_do_resize`):
+                Whether to resize the codebook pixels.
+            codebook_size (`dict[str, int]`, *optional*, defaults to `self.codebook_size`):
+                Size of the codebook pixels.
+            codebook_resample (`int`, *optional*, defaults to `self.codebook_resample`):
+                Resampling filter to use if resizing the codebook pixels. This can be one of the enum
+                `PILImageResampling`, Only has an effect if `codebook_do_resize` is set to `True`.
+            codebook_do_center_crop (`bool`, *optional*, defaults to `self.codebook_do_center_crop`):
+                Whether to center crop the codebook pixels.
+            codebook_crop_size (`dict[str, int]`, *optional*, defaults to `self.codebook_crop_size`):
+                Size of the center crop of the codebook pixels. Only has an effect if `codebook_do_center_crop` is set
+                to `True`.
+            codebook_do_rescale (`bool`, *optional*, defaults to `self.codebook_do_rescale`):
+                Whether to rescale the codebook pixels values between [0 - 1].
+            codebook_rescale_factor (`float`, *optional*, defaults to `self.codebook_rescale_factor`):
+                Rescale factor to rescale the codebook pixels by if `codebook_do_rescale` is set to `True`.
+            codebook_do_map_pixels (`bool`, *optional*, defaults to `self.codebook_do_map_pixels`):
+                Whether to map the codebook pixels values.
+            codebook_do_normalize (`bool`, *optional*, defaults to `self.codebook_do_normalize`):
+                Whether to normalize the codebook pixels.
+            codebook_image_mean (`float` or `list[float]`, *optional*, defaults to `self.codebook_image_mean`):
+                Codebook pixels mean to normalize the codebook pixels by if `codebook_do_normalize` is set to `True`.
+            codebook_image_std (`float` or `list[float]`, *optional*, defaults to `self.codebook_image_std`):
+                Codebook pixels standard deviation to normalize the codebook pixels by if `codebook_do_normalize` is
+                set to `True`.
+            return_tensors (`str` or `TensorType`, *optional*):
+                The type of tensors to return. Can be one of:
+                    - Unset: Return a list of `np.ndarray`.
+                    - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
+                    - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
+                    - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
+                    - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
+            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
+                The channel dimension format for the output image. Can be one of:
+                    - `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                    - `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the input image. If unset, the channel dimension format is inferred
+                from the input image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+        """
+        do_resize = do_resize if do_resize is not None else self.do_resize
+        size = size if size is not None else self.size
+        size = get_size_dict(size)
+        resample = resample if resample is not None else self.resample
+        do_center_crop = do_center_crop if do_center_crop is not None else self.do_center_crop
+        crop_size = crop_size if crop_size is not None else self.crop_size
+        crop_size = get_size_dict(crop_size, param_name="crop_size")
+        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
+        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
+        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
+        image_mean = image_mean if image_mean is not None else self.image_mean
+        image_std = image_std if image_std is not None else self.image_std
+
+        return_image_mask = return_image_mask if return_image_mask is not None else self.return_image_mask
+        input_size_patches = input_size_patches if input_size_patches is not None else self.input_size_patches
+        total_mask_patches = total_mask_patches if total_mask_patches is not None else self.total_mask_patches
+        mask_group_min_patches = (
+            mask_group_min_patches if mask_group_min_patches is not None else self.mask_group_min_patches
+        )
+        mask_group_max_patches = (
+            mask_group_max_patches if mask_group_max_patches is not None else self.mask_group_max_patches
+        )
+        mask_group_min_aspect_ratio = (
+            mask_group_min_aspect_ratio
+            if mask_group_min_aspect_ratio is not None
+            else self.mask_group_min_aspect_ratio
+        )
+        mask_group_max_aspect_ratio = (
+            mask_group_max_aspect_ratio
+            if mask_group_max_aspect_ratio is not None
+            else self.mask_group_max_aspect_ratio
+        )
+
+        return_codebook_pixels = (
+            return_codebook_pixels if return_codebook_pixels is not None else self.return_codebook_pixels
+        )
+        codebook_do_resize = codebook_do_resize if codebook_do_resize is not None else self.codebook_do_resize
+        codebook_size = codebook_size if codebook_size is not None else self.codebook_size
+        codebook_size = get_size_dict(codebook_size, param_name="codebook_size")
+        codebook_resample = codebook_resample if codebook_resample is not None else self.codebook_resample
+        codebook_do_rescale = codebook_do_rescale if codebook_do_rescale is not None else self.codebook_do_rescale
+        codebook_rescale_factor = (
+            codebook_rescale_factor if codebook_rescale_factor is not None else self.codebook_rescale_factor
+        )
+        codebook_do_center_crop = (
+            codebook_do_center_crop if codebook_do_center_crop is not None else self.codebook_do_center_crop
+        )
+        codebook_crop_size = codebook_crop_size if codebook_crop_size is not None else self.codebook_crop_size
+        codebook_crop_size = get_size_dict(codebook_crop_size, param_name="codebook_crop_size")
+        codebook_do_map_pixels = (
+            codebook_do_map_pixels if codebook_do_map_pixels is not None else self.codebook_do_map_pixels
+        )
+        codebook_do_normalize = (
+            codebook_do_normalize if codebook_do_normalize is not None else self.codebook_do_normalize
+        )
+        codebook_image_mean = codebook_image_mean if codebook_image_mean is not None else self.codebook_image_mean
+        codebook_image_std = codebook_image_std if codebook_image_std is not None else self.codebook_image_std
+
+        images = make_list_of_images(images)
+
+        if not valid_images(images):
+            raise ValueError(
+                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
+                "torch.Tensor, tf.Tensor or jax.ndarray."
+            )
+
+        processed_images = [
+            self._preprocess_image(
+                image=img,
+                do_resize=do_resize,
+                size=size,
+                resample=resample,
+                do_center_crop=do_center_crop,
+                crop_size=crop_size,
+                do_rescale=do_rescale,
+                rescale_factor=rescale_factor,
+                do_normalize=do_normalize,
+                image_mean=image_mean,
+                image_std=image_std,
+                do_map_pixels=False,
+                data_format=data_format,
+                input_data_format=input_data_format,
+            )
+            for img in images
+        ]
+        data = {"pixel_values": processed_images}
+
+        if return_codebook_pixels:
+            codebook_images = [
+                self._preprocess_image(
+                    image=img,
+                    do_resize=codebook_do_resize,
+                    size=codebook_size,
+                    resample=codebook_resample,
+                    do_center_crop=codebook_do_center_crop,
+                    crop_size=codebook_crop_size,
+                    do_rescale=codebook_do_rescale,
+                    rescale_factor=codebook_rescale_factor,
+                    do_normalize=codebook_do_normalize,
+                    image_mean=codebook_image_mean,
+                    image_std=codebook_image_std,
+                    do_map_pixels=codebook_do_map_pixels,
+                    data_format=data_format,
+                    input_data_format=input_data_format,
+                )
+                for img in images
+            ]
+            data["codebook_pixel_values"] = codebook_images
+
+        if return_image_mask:
+            mask_generator = self.masking_generator(
+                input_size_patches=input_size_patches,
+                total_mask_patches=total_mask_patches,
+                mask_group_min_patches=mask_group_min_patches,
+                mask_group_max_patches=mask_group_max_patches,
+                mask_group_min_aspect_ratio=mask_group_min_aspect_ratio,
+                mask_group_max_aspect_ratio=mask_group_max_aspect_ratio,
+            )
+            masks = [mask_generator() for _ in images]
+            data["bool_masked_pos"] = masks
+
+        return BatchFeature(data=data, tensor_type=return_tensors)
+
+
+__all__ = ["FlavaImageProcessor"]
diff --git a/phivenv/Lib/site-packages/transformers/models/flava/image_processing_flava_fast.py b/phivenv/Lib/site-packages/transformers/models/flava/image_processing_flava_fast.py
new file mode 100644
index 0000000000000000000000000000000000000000..5dcc5326d968c9276cf8b454c1224660a230646e
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/flava/image_processing_flava_fast.py
@@ -0,0 +1,503 @@
+# coding=utf-8
+# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Fast Image processor class for Flava."""
+
+import math
+import random
+from collections.abc import Iterable
+from functools import lru_cache
+from typing import Any, Optional, Union
+
+from ...image_processing_utils_fast import (
+    BaseImageProcessorFast,
+    BatchFeature,
+    DefaultFastImageProcessorKwargs,
+    get_size_dict,
+)
+from ...image_transforms import ChannelDimension, group_images_by_shape, reorder_images
+from ...image_utils import ImageInput, PILImageResampling, SizeDict
+from ...processing_utils import Unpack
+from ...utils import (
+    TensorType,
+    auto_docstring,
+    is_torch_available,
+    is_torchvision_available,
+    is_torchvision_v2_available,
+)
+from .image_processing_flava import (
+    FLAVA_CODEBOOK_MEAN,
+    FLAVA_CODEBOOK_STD,
+    FLAVA_IMAGE_MEAN,
+    FLAVA_IMAGE_STD,
+    LOGIT_LAPLACE_EPS,
+)
+
+
+if is_torch_available():
+    import torch
+
+if is_torchvision_available():
+    from ...image_utils import pil_torch_interpolation_mapping
+
+    if is_torchvision_v2_available():
+        from torchvision.transforms.v2 import functional as F
+    else:
+        from torchvision.transforms import functional as F
+
+
+class FlavaMaskingGenerator:  # callable that draws random rectangular patch masks on a (height, width) grid
+    def __init__(
+        self,
+        input_size: Union[int, tuple[int, int]] = 14,
+        total_mask_patches: int = 75,
+        mask_group_max_patches: Optional[int] = None,
+        mask_group_min_patches: int = 16,
+        mask_group_min_aspect_ratio: Optional[float] = 0.3,
+        mask_group_max_aspect_ratio: Optional[float] = None,
+    ) -> None:
+        if not isinstance(input_size, tuple):  # any non-tuple size is used for both height and width
+            input_size = (input_size,) * 2
+        self.height, self.width = input_size
+
+        self.num_patches = self.height * self.width
+        self.total_mask_patches = total_mask_patches  # overall budget of masked patches per generated mask
+
+        self.mask_group_min_patches = mask_group_min_patches
+        self.mask_group_max_patches = total_mask_patches if mask_group_max_patches is None else mask_group_max_patches
+        # A falsy max ratio falls back to 1 / min ratio. NOTE(review): a min ratio of None raises on `1 / None`.
+        mask_group_max_aspect_ratio = mask_group_max_aspect_ratio or 1 / mask_group_min_aspect_ratio
+        self.log_aspect_ratio = (math.log(mask_group_min_aspect_ratio), math.log(mask_group_max_aspect_ratio))
+
+    def __repr__(self) -> str:  # note: the last two printed values are the *log* aspect-ratio bounds
+        repr_str = "MaskingGenerator(%d, %d -> [%d ~ %d], max = %d, %.3f ~ %.3f)" % (
+            self.height,
+            self.width,
+            self.mask_group_min_patches,
+            self.mask_group_max_patches,
+            self.total_mask_patches,
+            self.log_aspect_ratio[0],
+            self.log_aspect_ratio[1],
+        )
+        return repr_str
+
+    def get_shape(self) -> tuple[int, int]:  # (height, width) of the patch grid
+        return self.height, self.width
+
+    def _mask(self, mask, max_mask_patches):  # paint one rectangle into `mask` in place; return newly masked count
+        delta = 0
+        for _attempt in range(10):  # at most 10 random proposals
+            target_area = random.uniform(self.mask_group_min_patches, max_mask_patches)
+            aspect_ratio = math.exp(random.uniform(*self.log_aspect_ratio))  # log-uniform between the ratio bounds
+            height = int(round(math.sqrt(target_area * aspect_ratio)))
+            width = int(round(math.sqrt(target_area / aspect_ratio)))
+            if width < self.width and height < self.height:  # proposal must be strictly smaller than the grid
+                top = random.randint(0, self.height - height)
+                left = random.randint(0, self.width - width)
+
+                num_masked = mask[top : top + height, left : left + width].sum()  # cells already masked
+                # Accept only if the rectangle adds at least one new cell and at most `max_mask_patches` new cells
+                if 0 < height * width - num_masked <= max_mask_patches:
+                    zeros_pos = mask[top : top + height, left : left + width] == 0
+                    mask[top : top + height, left : left + width][zeros_pos] = 1
+                    delta += zeros_pos.sum()
+
+                if delta > 0:  # stop after the first accepted rectangle
+                    break
+        return delta  # stays 0 when no proposal was accepted
+
+    def __call__(self) -> "torch.Tensor":  # build a fresh int mask; 1 marks a masked patch
+        mask = torch.zeros(self.get_shape(), dtype=torch.int)
+        mask_count = 0
+        while mask_count < self.total_mask_patches:
+            max_mask_patches = self.total_mask_patches - mask_count  # remaining budget
+            max_mask_patches = min(max_mask_patches, self.mask_group_max_patches)  # capped by the per-group maximum
+
+            delta = self._mask(mask, max_mask_patches)
+            if delta == 0:  # nothing could be placed: stop early, possibly below the budget
+                break
+            else:
+                mask_count += delta
+
+        return mask
+
+
+class FlavaFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):  # FLAVA mask/codebook kwargs on top of defaults
+    """
+    Args:
+        return_image_mask (`bool`, *optional*, defaults to `False`):
+            Whether to return the image mask. Can be overridden by the `return_image_mask` parameter in `preprocess`.
+        input_size_patches (`int`, *optional*, defaults to 14):
+            Number of patches in the image in height and width direction. 14x14 = 196 total patches. Can be overridden
+            by the `input_size_patches` parameter in `preprocess`.
+        total_mask_patches (`int`, *optional*, defaults to 75):
+            Total number of patches that should be masked. Can be overridden by the `total_mask_patches` parameter in
+            `preprocess`.
+        mask_group_min_patches (`int`, *optional*, defaults to 16):
+            Minimum number of patches that should be masked. Can be overridden by the `mask_group_min_patches`
+            parameter in `preprocess`.
+        mask_group_max_patches (`int`, *optional*):
+            Maximum number of patches that should be masked. Can be overridden by the `mask_group_max_patches`
+            parameter in `preprocess`.
+        mask_group_min_aspect_ratio (`float`, *optional*, defaults to 0.3):
+            Minimum aspect ratio of the mask window. Can be overridden by the `mask_group_min_aspect_ratio` parameter
+            in `preprocess`.
+        mask_group_max_aspect_ratio (`float`, *optional*):
+            Maximum aspect ratio of the mask window. Can be overridden by the `mask_group_max_aspect_ratio` parameter
+            in `preprocess`.
+        return_codebook_pixels (`bool`, *optional*, defaults to `False`):
+            Whether to return the codebook pixel values.
+        codebook_do_resize (`bool`, *optional*, defaults to `True`):
+            Whether to resize the input for codebook to `codebook_size`. Can be overridden by the
+            `codebook_do_resize` parameter in `preprocess`.
+        codebook_size (`dict[str, int]`, *optional*, defaults to `{"height": 112, "width": 112}`):
+            Resize the input for codebook to the given size. Can be overridden by the `codebook_size` parameter in
+            `preprocess`.
+        codebook_resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
+            Resampling filter to use if resizing the codebook image. Can be overridden by the `codebook_resample`
+            parameter in `preprocess`.
+        codebook_do_center_crop (`bool`, *optional*, defaults to `True`):
+            Whether to crop the input for codebook at the center. If the input size is smaller than
+            `codebook_crop_size` along any edge, the image is padded with 0's and then center cropped. Can be
+            overridden by the `codebook_do_center_crop` parameter in `preprocess`.
+        codebook_crop_size (`dict[str, int]`, *optional*, defaults to `{"height": 112, "width": 112}`):
+            Desired output size for codebook input when applying center-cropping. Can be overridden by the
+            `codebook_crop_size` parameter in `preprocess`.
+        codebook_do_rescale (`bool`, *optional*, defaults to `True`):
+            Whether to rescale the input for codebook by the specified scale `codebook_rescale_factor`. Can be
+            overridden by the `codebook_do_rescale` parameter in `preprocess`.
+        codebook_rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
+            Defines the scale factor to use if rescaling the codebook image. Can be overridden by the
+            `codebook_rescale_factor` parameter in `preprocess`.
+        codebook_do_map_pixels (`bool`, *optional*, defaults to `True`):
+            Whether to map the pixel values of the codebook input to (1 - 2e)x + e. Can be overridden by the
+            `codebook_do_map_pixels` parameter in `preprocess`.
+        codebook_do_normalize (`bool`, *optional*, defaults to `True`):
+            Whether or not to normalize the input for codebook with `codebook_image_mean` and `codebook_image_std`. Can
+            be overridden by the `codebook_do_normalize` parameter in `preprocess`.
+        codebook_image_mean (`Optional[Union[float, Iterable[float]]]`, *optional*, defaults to `[0, 0, 0]`):
+            The sequence of means for each channel, to be used when normalizing images for codebook. Can be overridden
+            by the `codebook_image_mean` parameter in `preprocess`.
+        codebook_image_std (`Optional[Union[float, Iterable[float]]]`, *optional*, defaults to `[0.5, 0.5, 0.5]`):
+            The sequence of standard deviations for each channel, to be used when normalizing images for codebook. Can
+            be overridden by the `codebook_image_std` parameter in `preprocess`.
+    """
+
+    # Mask related params
+    return_image_mask: Optional[bool]
+    input_size_patches: Optional[int]
+    total_mask_patches: Optional[int]
+    mask_group_min_patches: Optional[int]
+    mask_group_max_patches: Optional[int]
+    mask_group_min_aspect_ratio: Optional[float]
+    mask_group_max_aspect_ratio: Optional[float]
+    # Codebook related params
+    return_codebook_pixels: Optional[bool]
+    codebook_do_resize: Optional[bool]
+    codebook_size: Optional[dict[str, int]]  # a size dict such as {"height": ..., "width": ...}, not a flag
+    codebook_resample: Optional[int]
+    codebook_do_center_crop: Optional[bool]
+    codebook_crop_size: Optional[dict[str, int]]  # a size dict, same shape as `codebook_size`
+    codebook_do_rescale: Optional[bool]
+    codebook_rescale_factor: Optional[Union[int, float]]
+    codebook_do_map_pixels: Optional[bool]
+    codebook_do_normalize: Optional[bool]
+    codebook_image_mean: Optional[Union[float, Iterable[float]]]
+    codebook_image_std: Optional[Union[float, Iterable[float]]]
+
+
+@auto_docstring
+class FlavaImageProcessorFast(BaseImageProcessorFast):  # class-level settings; presumably the defaults — confirm
+    resample = PILImageResampling.BICUBIC
+    image_mean = FLAVA_IMAGE_MEAN
+    image_std = FLAVA_IMAGE_STD
+    size = {"height": 224, "width": 224}
+    crop_size = {"height": 224, "width": 224}
+    do_resize = True
+    do_center_crop = True
+    do_rescale = True
+    do_normalize = True
+
+    # Mask related params
+    return_image_mask = False
+    input_size_patches = 14
+    total_mask_patches = 75
+    mask_group_min_patches = 16
+    mask_group_max_patches = None  # None: FlavaMaskingGenerator falls back to `total_mask_patches`
+    mask_group_min_aspect_ratio = 0.3
+    mask_group_max_aspect_ratio = None  # None: FlavaMaskingGenerator falls back to 1 / min aspect ratio
+    # Codebook related params
+    return_codebook_pixels = False
+    codebook_do_resize = True
+    codebook_size = {"height": 112, "width": 112}  # smaller than the 224x224 `size` used for the main image
+    # LANCZOS resample does not support torch Tensor. Use BICUBIC as closest alternative
+    codebook_resample = PILImageResampling.BICUBIC
+    codebook_do_center_crop = True
+    codebook_crop_size = {"height": 112, "width": 112}
+    codebook_do_rescale = True
+    codebook_rescale_factor = 1 / 255
+    codebook_do_map_pixels = True
+    codebook_do_normalize = True
+    codebook_image_mean = FLAVA_CODEBOOK_MEAN
+    codebook_image_std = FLAVA_CODEBOOK_STD
+    valid_kwargs = FlavaFastImageProcessorKwargs  # TypedDict naming the keyword arguments this processor accepts
+
+    def __init__(self, **kwargs: Unpack[FlavaFastImageProcessorKwargs]) -> None:  # pure pass-through to the base class
+        super().__init__(**kwargs)
+
+    @auto_docstring  # kwargs are typed with the FLAVA TypedDict so the mask/codebook options are part of the signature
+    def preprocess(self, images: ImageInput, **kwargs: Unpack[FlavaFastImageProcessorKwargs]) -> BatchFeature:
+        return super().preprocess(images, **kwargs)  # delegates unchanged to the base implementation
+
+    @classmethod
+    def from_dict(cls, image_processor_dict: dict[str, Any], **kwargs):
+        """
+        Overrides the base `from_dict` so that `codebook_size` / `codebook_crop_size` passed as kwargs replace the
+        values in the dict, e.g. when loading with `from_pretrained(checkpoint, codebook_size=600)`.
+        """
+        image_processor_dict = image_processor_dict.copy()  # shallow copy: the caller's dict is not modified
+        if "codebook_size" in kwargs:
+            image_processor_dict["codebook_size"] = kwargs.pop("codebook_size")
+        if "codebook_crop_size" in kwargs:
+            image_processor_dict["codebook_crop_size"] = kwargs.pop("codebook_crop_size")
+        return super().from_dict(image_processor_dict, **kwargs)  # remaining kwargs are left to the base class
+
+    @lru_cache  # NOTE(review): caching a method keys on `self` and keeps the instance referenced (ruff B019)
+    def masking_generator(  # build a FlavaMaskingGenerator for these settings; cached, so arguments must be hashable
+        self,
+        input_size_patches,
+        total_mask_patches,
+        mask_group_min_patches,
+        mask_group_max_patches,
+        mask_group_min_aspect_ratio,
+        mask_group_max_aspect_ratio,
+    ) -> FlavaMaskingGenerator:
+        return FlavaMaskingGenerator(
+            input_size=input_size_patches,  # the only argument whose name differs from the generator's parameter
+            total_mask_patches=total_mask_patches,
+            mask_group_min_patches=mask_group_min_patches,
+            mask_group_max_patches=mask_group_max_patches,
+            mask_group_min_aspect_ratio=mask_group_min_aspect_ratio,
+            mask_group_max_aspect_ratio=mask_group_max_aspect_ratio,
+        )
+
+    def map_pixels(self, image: "torch.Tensor") -> "torch.Tensor":  # affine map x -> (1 - 2 * eps) * x + eps
+        return (1 - 2 * LOGIT_LAPLACE_EPS) * image + LOGIT_LAPLACE_EPS
+
+    def _further_process_kwargs(
+        self,
+        size: Optional[SizeDict] = None,
+        crop_size: Optional[SizeDict] = None,
+        default_to_square: Optional[bool] = None,
+        image_mean: Optional[Union[float, list[float]]] = None,
+        image_std: Optional[Union[float, list[float]]] = None,
+        codebook_size: Optional[SizeDict] = None,
+        codebook_crop_size: Optional[SizeDict] = None,
+        codebook_image_mean: Optional[Union[float, list[float]]] = None,
+        codebook_image_std: Optional[Union[float, list[float]]] = None,
+        codebook_resample: Optional[PILImageResampling] = None,
+        data_format: Optional[ChannelDimension] = None,
+        **kwargs,
+    ) -> dict:
+        """
+        Normalize size dicts, mean/std lists and resample filters (image and codebook) and return the updated kwargs.
+        `resample` and `codebook_resample` are replaced by `interpolation` and `codebook_interpolation` entries.
+        """
+        if kwargs is None:  # defensive only: `**kwargs` is always a dict
+            kwargs = {}
+        if size is not None:
+            size = SizeDict(**get_size_dict(size=size, default_to_square=default_to_square))
+        if crop_size is not None:
+            crop_size = SizeDict(**get_size_dict(crop_size, param_name="crop_size"))
+        if isinstance(image_mean, list):  # lists are converted to tuples; other values pass through unchanged
+            image_mean = tuple(image_mean)
+        if isinstance(image_std, list):
+            image_std = tuple(image_std)
+        if data_format is None:
+            data_format = ChannelDimension.FIRST
+        if codebook_size is not None:  # same treatment as `size`, applied to the codebook branch
+            codebook_size = SizeDict(**get_size_dict(size=codebook_size, default_to_square=default_to_square))
+        if codebook_crop_size is not None:
+            codebook_crop_size = SizeDict(**get_size_dict(codebook_crop_size, param_name="codebook_crop_size"))
+        if isinstance(codebook_image_mean, list):
+            codebook_image_mean = tuple(codebook_image_mean)
+        if isinstance(codebook_image_std, list):
+            codebook_image_std = tuple(codebook_image_std)
+        # Write the normalized values back; `default_to_square` and `codebook_resample` are not passed on.
+        kwargs["size"] = size
+        kwargs["crop_size"] = crop_size
+        kwargs["image_mean"] = image_mean
+        kwargs["image_std"] = image_std
+        kwargs["codebook_size"] = codebook_size
+        kwargs["codebook_crop_size"] = codebook_crop_size
+        kwargs["codebook_image_mean"] = codebook_image_mean
+        kwargs["codebook_image_std"] = codebook_image_std
+        kwargs["data_format"] = data_format
+        kwargs["codebook_interpolation"] = (
+            pil_torch_interpolation_mapping[codebook_resample]
+            if isinstance(codebook_resample, (PILImageResampling, int))
+            else codebook_resample
+        )
+
+        # torch resize uses interpolation instead of resample
+        # Check if resample is an int before checking if it's an instance of PILImageResampling
+        # because if pillow < 9.1.0, resample is an int and PILImageResampling is a module.
+        # Checking PILImageResampling will fail with error `TypeError: isinstance() arg 2 must be a type or tuple of types`.
+        resample = kwargs.pop("resample")  # required key: raises KeyError when the caller did not supply it
+        kwargs["interpolation"] = (
+            pil_torch_interpolation_mapping[resample] if isinstance(resample, (PILImageResampling, int)) else resample
+        )
+
+        return kwargs
+
+    def _preprocess_image(  # resize -> center crop -> rescale/normalize -> optional pixel mapping, per shape group
+        self,
+        images: list["torch.Tensor"],
+        do_resize: bool,
+        size: SizeDict,
+        interpolation: Optional["F.InterpolationMode"],
+        do_center_crop: bool,
+        crop_size: SizeDict,
+        do_rescale: bool,
+        rescale_factor: float,
+        do_normalize: bool,
+        do_map_pixels: bool,
+        image_mean: Optional[Union[float, list[float]]],
+        image_std: Optional[Union[float, list[float]]],
+        disable_grouping: Optional[bool],
+        return_tensors: Optional[Union[str, TensorType]],
+    ) -> "torch.Tensor":
+        # Group images by size for batched resizing
+        grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping)
+        resized_images_grouped = {}
+        for shape, stacked_images in grouped_images.items():
+            if do_resize:
+                stacked_images = self.resize(image=stacked_images, size=size, interpolation=interpolation)
+            resized_images_grouped[shape] = stacked_images  # stored even when resizing is skipped
+        resized_images = reorder_images(resized_images_grouped, grouped_images_index)
+
+        # Group images by size for further processing
+        # Needed in case do_resize is False, or resize returns images with different sizes
+        grouped_images, grouped_images_index = group_images_by_shape(resized_images, disable_grouping=disable_grouping)
+        processed_images_grouped = {}
+        for shape, stacked_images in grouped_images.items():
+            if do_center_crop:
+                stacked_images = self.center_crop(stacked_images, crop_size)
+            # Fused rescale and normalize
+            stacked_images = self.rescale_and_normalize(
+                stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std
+            )
+            if do_map_pixels:  # applied after rescale/normalize
+                stacked_images = self.map_pixels(image=stacked_images)
+            processed_images_grouped[shape] = stacked_images
+
+        processed_images = reorder_images(processed_images_grouped, grouped_images_index)
+        processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images
+        # Note: when `return_tensors` is falsy the images are returned unstacked rather than as one tensor.
+        return processed_images
+
+    def _preprocess(  # build `pixel_values`, plus optional `codebook_pixel_values` and `bool_masked_pos`
+        self,
+        images: list["torch.Tensor"],
+        do_resize: bool,
+        size: SizeDict,
+        interpolation: Optional["F.InterpolationMode"],
+        do_center_crop: bool,
+        crop_size: SizeDict,
+        do_rescale: bool,
+        rescale_factor: float,
+        do_normalize: bool,
+        image_mean: Optional[Union[float, list[float]]],
+        image_std: Optional[Union[float, list[float]]],
+        # Mask related params
+        return_image_mask: Optional[bool],
+        input_size_patches: Optional[int],
+        total_mask_patches: Optional[int],
+        mask_group_min_patches: Optional[int],
+        mask_group_max_patches: Optional[int],
+        mask_group_min_aspect_ratio: Optional[float],
+        mask_group_max_aspect_ratio: Optional[float],
+        # Codebook related params
+        return_codebook_pixels: Optional[bool],
+        codebook_do_resize: Optional[bool],
+        codebook_size: Optional[SizeDict],
+        codebook_interpolation: Optional["F.InterpolationMode"],
+        codebook_do_center_crop: Optional[bool],
+        codebook_crop_size: Optional[SizeDict],
+        codebook_do_rescale: Optional[bool],
+        codebook_rescale_factor: Optional[float],
+        codebook_do_map_pixels: Optional[bool],
+        codebook_do_normalize: Optional[bool],
+        codebook_image_mean: Optional[Union[float, list[float]]],
+        codebook_image_std: Optional[Union[float, list[float]]],
+        disable_grouping: Optional[bool],
+        return_tensors: Optional[Union[str, TensorType]],
+        **kwargs,
+    ) -> BatchFeature:
+        processed_images = self._preprocess_image(
+            images=images,
+            do_resize=do_resize,
+            size=size,
+            interpolation=interpolation,
+            do_center_crop=do_center_crop,
+            crop_size=crop_size,
+            do_rescale=do_rescale,
+            rescale_factor=rescale_factor,
+            do_normalize=do_normalize,
+            do_map_pixels=False,  # pixel mapping is only ever applied on the codebook branch
+            image_mean=image_mean,
+            image_std=image_std,
+            disable_grouping=disable_grouping,
+            return_tensors=return_tensors,
+        )
+        data = {
+            "pixel_values": processed_images,
+        }
+
+        if return_codebook_pixels:
+            codebook_processed_images = self._preprocess_image(
+                images=images,  # same input images, processed again with the codebook settings
+                do_resize=codebook_do_resize,
+                size=codebook_size,
+                interpolation=codebook_interpolation,
+                do_center_crop=codebook_do_center_crop,
+                crop_size=codebook_crop_size,
+                do_rescale=codebook_do_rescale,
+                rescale_factor=codebook_rescale_factor,
+                do_normalize=codebook_do_normalize,
+                do_map_pixels=codebook_do_map_pixels,
+                image_mean=codebook_image_mean,
+                image_std=codebook_image_std,
+                disable_grouping=disable_grouping,
+                return_tensors=return_tensors,
+            )
+            data["codebook_pixel_values"] = codebook_processed_images
+
+        if return_image_mask:
+            mask_generator = self.masking_generator(
+                input_size_patches=input_size_patches,
+                total_mask_patches=total_mask_patches,
+                mask_group_min_patches=mask_group_min_patches,
+                mask_group_max_patches=mask_group_max_patches,
+                mask_group_min_aspect_ratio=mask_group_min_aspect_ratio,
+                mask_group_max_aspect_ratio=mask_group_max_aspect_ratio,
+            )
+            masks = [mask_generator() for _ in range(len(images))]  # one independently drawn mask per input image
+            masks = torch.stack(masks, dim=0) if return_tensors else masks
+            data["bool_masked_pos"] = masks  # NOTE(review): the generator creates masks with dtype=torch.int
+
+        return BatchFeature(data=data, tensor_type=return_tensors)
+
+
+__all__ = ["FlavaImageProcessorFast"]
diff --git a/phivenv/Lib/site-packages/transformers/models/flava/modeling_flava.py b/phivenv/Lib/site-packages/transformers/models/flava/modeling_flava.py
new file mode 100644
index 0000000000000000000000000000000000000000..cafd6e589adf885539c6c7016548ec325e799c99
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/flava/modeling_flava.py
@@ -0,0 +1,2040 @@
+# coding=utf-8
+# Copyright 2022 Meta Platforms authors and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch FLAVA model."""
+
+import collections
+import math
+from collections import OrderedDict
+from dataclasses import dataclass
+from typing import Any, Optional, Union
+
+import torch
+import torch.utils.checkpoint
+from torch import nn
+
+from ...activations import ACT2FN
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
+from ...modeling_utils import PreTrainedModel
+from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
+from ...utils import ModelOutput, auto_docstring, logging, torch_int
+from .configuration_flava import (
+    FlavaConfig,
+    FlavaImageCodebookConfig,
+    FlavaImageConfig,
+    FlavaMultimodalConfig,
+    FlavaTextConfig,
+)
+
+
+logger = logging.get_logger(__name__)  # module-level logger
+
+_CHECKPOINT_FOR_CODEBOOK_DOC = "facebook/flava-image-codebook"  # presumably the checkpoint id cited in codebook docs
+
+LOGIT_SCALE_CLAMP_MIN = 0  # lower clamp bound; use site is not visible in this section
+LOGIT_SCALE_CLAMP_MAX = 4.6052  # ~= ln(100); upper clamp bound, use site is not visible in this section
+
+FlavaPossibleConfigs = Union[FlavaTextConfig, FlavaImageConfig, FlavaMultimodalConfig]  # alias annotating `config` below
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Output from FlavaModel containing embeddings and outputs from individual encoders.
+
+    Note that `image_embeddings` and `text_embeddings` returned are similar to pooled output returned from a
+    transformer. If you want embeddings for contrastive loss or retrieval use a FLAVA model's `image_projection` and
+    `text_projection` layers on `image_embeddings` and `text_embeddings` respectively.
+    """
+)
+class FlavaModelOutput(ModelOutput):
+    r"""
+    image_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `pixel_values` are present):
+        The image embeddings which are basically the pooled output of [`FlavaImageModel`].
+    image_output (`BaseModelOutputWithPooling`, *optional*, returned when `pixel_values` are present):
+        The output of the [`FlavaImageModel`].
+    text_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `input_ids` are present):
+        The text embeddings which are basically the pooled output of [`FlavaTextModel`].
+    text_output (`BaseModelOutputWithPooling`, *optional*, returned when `input_ids` are present):
+        The output of the [`FlavaTextModel`].
+    multimodal_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `input_ids` and `pixel_values` are present and `skip_multimodal_encoder` is `None` or `False`):
+        The multimodal embeddings which are basically the pooled output of [`FlavaMultimodalModel`].
+    multimodal_output (`BaseModelOutputWithPooling`, returned when `input_ids` and `pixel_values` are present and `skip_multimodal_encoder` is `None` or `False`):
+        The output of the [`FlavaMultimodalModel`].
+    """
+
+    image_embeddings: Optional[torch.FloatTensor] = None
+    image_output: Optional[BaseModelOutputWithPooling] = None
+    text_embeddings: Optional[torch.FloatTensor] = None
+    text_output: Optional[BaseModelOutputWithPooling] = None
+    multimodal_embeddings: Optional[torch.FloatTensor] = None
+    multimodal_output: Optional[BaseModelOutputWithPooling] = None
+
+    def to_tuple(self) -> tuple[Any]:  # tuple view of the entries yielded by self.keys()
+        return tuple(
+            self[k] if k not in ["text_output", "image_output", "multimodal_output"] else getattr(self, k).to_tuple()
+            for k in self.keys()  # the three nested encoder outputs are converted with their own to_tuple()
+        )
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Class representing pretraining losses from FLAVA model
+    """
+)
+class FlavaLosses(ModelOutput):
+    r"""
+    mim (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `mim_labels` and `pixel_values` are present, `input_ids_masked` is absent and `mim_weight` > 0.):
+        Masked Image Modeling loss as used in BEiT, calculated only for unimodal image data.
+    mlm (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `mlm_labels` and `input_ids_masked` are present, `pixel_values` is absent and `mlm_weight` > 0.):
+        Masked Language Modeling loss as used in BERT, calculated only for unimodal text data.
+    itm (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `itm_labels`, `input_ids_masked`, `pixel_values` are present and `itm_weight` > 0.):
+        Image Text Matching (ITM) loss calculated for paired image-text data. Note that ITM loss is calculated on
+        masked pairs in FLAVA.
+    global_contrastive (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `input_ids` and `pixel_values` are present and `global_contrastive_weight` > 0.):
+        Contrastive loss for image-text similarity similar to CLIP but calculated globally for paired image-text
+        data. This is calculated on unmasked images and texts.
+    mmm_image (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `mim_labels`, `pixel_values` and `input_ids_masked` are present and `mmm_image_weight` > 0.):
+        Masked Multimodal Modeling loss's image component calculated on paired image-text data.
+    mmm_text (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `mlm_labels`, `pixel_values` and `input_ids_masked` are present and `mmm_text_weight` > 0.):
+        Masked Multimodal Modeling loss's text component calculated on paired image-text data.
+    """
+
+    mim: Optional[torch.FloatTensor] = None
+    mlm: Optional[torch.FloatTensor] = None
+    itm: Optional[torch.FloatTensor] = None
+    global_contrastive: Optional[torch.FloatTensor] = None
+    mmm_image: Optional[torch.FloatTensor] = None
+    mmm_text: Optional[torch.FloatTensor] = None
+
+    def all_none(self) -> bool:  # True only when no value in self.values() is set
+        all_none = True
+        for v in self.values():
+            if v is not None:
+                all_none = False
+                break  # a single non-None loss settles the answer
+        return all_none
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Output from FlavaForPreTraining containing embeddings, and outputs from individual encoders.
+
+    Note that `image_embeddings` and `text_embeddings` returned are similar to pooled output returned from a
+    transformer. If you want embeddings for contrastive loss or retrieval use a FLAVA model's `image_projection` and
+    `text_projection` layers on `image_embeddings` and `text_embeddings` respectively.
+    """
+)
+class FlavaForPreTrainingOutput(ModelOutput):
+    r"""
+    loss (`torch.FloatTensor`, *optional*, returned when `return_loss` is True):
+        Total loss calculated for this model.
+    loss_info (`FlavaLosses`):
+        Detailed info for FLAVA Pretraining losses. Check `FlavaLosses` class description for the information on
+        the keys.
+    image_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `pixel_values` are present):
+        The image embeddings which are basically the pooled output of [`FlavaImageModel`].
+    image_output (`BaseModelOutputWithPooling`, *optional*, returned when `pixel_values` are present):
+        The output of the [`FlavaImageModel`].
+    text_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `input_ids` are present):
+        The text embeddings which are basically the pooled output of [`FlavaTextModel`].
+    text_output (`BaseModelOutputWithPooling`, *optional*, returned when `input_ids` are present):
+        The output of the [`FlavaTextModel`].
+    multimodal_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `input_ids` and `pixel_values` are present and `skip_unmasked_multimodal_encoder` is `None` or `False`):
+        The multimodal embeddings which are basically the pooled output of [`FlavaMultimodalModel`].
+    multimodal_output (`BaseModelOutputWithPooling`, returned when `input_ids` and `pixel_values` are present and `skip_unmasked_multimodal_encoder` is `None` or `False`):
+        The output of the [`FlavaMultimodalModel`].
+    image_masked_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `pixel_values` are present):
+        The image embeddings which are basically the pooled output of [`FlavaImageModel`]. Uses `bool_masked_pos`
+        to create masked images.
+    image_masked_output (`BaseModelOutputWithPooling`, *optional*, returned when `pixel_values` are present):
+        The output of the [`FlavaImageModel`]. Uses `bool_masked_pos` to create masked images.
+    text_masked_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `input_ids_masked` are present):
+        The text embeddings which are basically the pooled output of [`FlavaTextModel`].
+    text_masked_output (`BaseModelOutputWithPooling`, *optional*, returned when `input_ids_masked` are present):
+        The output of the [`FlavaTextModel`].
+    multimodal_masked_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `input_ids_masked` and `pixel_values` are present):
+        The multimodal embeddings which are basically the pooled output of [`FlavaMultimodalModel`].
+    multimodal_masked_output (`BaseModelOutputWithPooling`, *optional*, returned when `input_ids_masked` and `pixel_values` are present):
+        The output of the [`FlavaMultimodalModel`].
+    mim_logits (`torch.FloatTensor` of shape `(batch_size, num_image_patches, image_vocab_size)` or of shape `(total_masked_patches, image_vocab_size)`, *optional*, returned when `pixel_values` are present and `input_ids_masked` are not):
+        The logits for MIM unimodal loss. Uses `bool_masked_pos` to get masked patches. The flattened output is
+            returned when `bool_masked_pos` has some of the patches masked.
+    mlm_logits (`torch.FloatTensor` of shape `(batch_size, text_seq_length, text_vocab_size)` or of shape `(total_masked_seq_length, text_vocab_size)`, *optional*, returned when `input_ids_masked` are present and `pixel_values` are not):
+        The logits for MLM unimodal loss. The flattened output is returned when `input_ids_masked` has some of
+            the tokens masked.
+    itm_logits (`torch.FloatTensor` of shape `(batch_size, 2)`, *optional*, returned when `input_ids_masked` and `pixel_values` are present):
+        The logits for ITM loss. Note that ITM loss is calculated on masked pairs in FLAVA.
+    contrastive_logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
+        The scaled dot product scores between `image_embeddings` and `text_embeddings` but passed through FLAVA's
+        `image_projection` and `text_projection` layers respectively. This represents the image-text similarity
+        scores. This is calculated on unmasked images and texts.
+    contrastive_logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
+        The scaled dot product scores between `text_embeddings` and `image_embeddings` but passed through FLAVA's
+        `text_projection` and `image_projection` layers respectively. This is calculated on unmasked images and
+        texts.
+    mmm_image_logits (`torch.FloatTensor` of shape `(batch_size, num_image_patches, image_vocab_size)` or of shape `(total_masked_patches, image_vocab_size)`, *optional*, returned when `pixel_values` and `input_ids_masked` are present):
+        The logits for MMM image multimodal loss. Uses `bool_masked_pos` to get masked patches. The flattened
+            output is returned when `bool_masked_pos` has some of the patches masked.
+    mmm_text_logits (`torch.FloatTensor` of shape `(batch_size, text_seq_length, text_vocab_size)` or of shape `(total_masked_seq_length, text_vocab_size)`, *optional*, returned when `pixel_values` and `input_ids_masked` are present):
+        The logits for MMM text multimodal loss. The flattened output is returned when `input_ids_masked` has
+            some of the tokens masked.
+    """
+
+    loss: Optional[torch.FloatTensor] = None
+    loss_info: FlavaLosses = None
+    image_embeddings: Optional[torch.FloatTensor] = None
+    image_output: Optional[BaseModelOutputWithPooling] = None
+    text_embeddings: Optional[torch.FloatTensor] = None
+    text_output: Optional[BaseModelOutputWithPooling] = None
+    multimodal_embeddings: Optional[torch.FloatTensor] = None
+    multimodal_output: Optional[BaseModelOutputWithPooling] = None
+    image_masked_embeddings: Optional[torch.FloatTensor] = None
+    image_masked_output: Optional[BaseModelOutputWithPooling] = None
+    text_masked_embeddings: Optional[torch.FloatTensor] = None
+    text_masked_output: Optional[BaseModelOutputWithPooling] = None
+    multimodal_masked_embeddings: Optional[torch.FloatTensor] = None
+    multimodal_masked_output: Optional[BaseModelOutputWithPooling] = None
+    mim_logits: Optional[torch.FloatTensor] = None
+    mlm_logits: Optional[torch.FloatTensor] = None
+    itm_logits: Optional[torch.FloatTensor] = None
+    contrastive_logits_per_image: Optional[torch.FloatTensor] = None
+    contrastive_logits_per_text: Optional[torch.FloatTensor] = None
+    mmm_image_logits: Optional[torch.FloatTensor] = None
+    mmm_text_logits: Optional[torch.FloatTensor] = None
+
+    def to_tuple(self) -> tuple[Any]:  # tuple view of the entries yielded by self.keys()
+        transformer_outputs = [  # nested encoder outputs, converted with their own to_tuple()
+            "text_output",
+            "image_output",
+            "multimodal_output",
+            "text_masked_output",
+            "image_masked_output",
+            "multimodal_masked_output",
+        ]
+        return tuple(self[k] if k not in transformer_outputs else getattr(self, k).to_tuple() for k in self.keys())
+
+
+# Based on timm implementation, which can be found here:
+# https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/image_transformer.py
+class FlavaImageEmbeddings(nn.Module):
+    """
+    Embed images as patch tokens with a prepended CLS token and added position embeddings; mask token is optional.
+    """
+
+    def __init__(self, config: FlavaImageConfig, use_mask_token: bool = False) -> None:
+        super().__init__()
+
+        use_mask_token = use_mask_token or config.mask_token  # either the argument or the config enables it
+        self.cls_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
+        self.mask_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size)) if use_mask_token else None
+        self.patch_embeddings = PatchEmbeddings(
+            image_size=config.image_size,
+            patch_size=config.patch_size,
+            num_channels=config.num_channels,
+            embed_dim=config.hidden_size,
+        )
+        num_patches = self.patch_embeddings.num_patches
+        self.position_embeddings = nn.Parameter(torch.zeros(1, num_patches + 1, config.hidden_size))  # +1 for CLS
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.patch_size = config.patch_size
+        self.config = config
+
+    # Copied from transformers.models.vit.modeling_vit.ViTEmbeddings.interpolate_pos_encoding
+    def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
+        """
+        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
+        images. This method is also adapted to support torch.jit tracing.
+
+        Adapted from:
+        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
+        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
+        """
+
+        num_patches = embeddings.shape[1] - 1  # token count without the CLS slot
+        num_positions = self.position_embeddings.shape[1] - 1  # stored positions without the CLS slot
+
+        # always interpolate when tracing to ensure the exported model works for dynamic input shapes
+        if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
+            return self.position_embeddings
+
+        class_pos_embed = self.position_embeddings[:, :1]
+        patch_pos_embed = self.position_embeddings[:, 1:]
+
+        dim = embeddings.shape[-1]
+
+        new_height = height // self.patch_size  # target grid size in patches
+        new_width = width // self.patch_size
+
+        sqrt_num_positions = torch_int(num_positions**0.5)  # the stored patch positions are treated as a square grid
+        patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
+        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)  # channels-first layout for interpolate
+
+        patch_pos_embed = nn.functional.interpolate(
+            patch_pos_embed,
+            size=(new_height, new_width),
+            mode="bicubic",
+            align_corners=False,
+        )
+
+        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)  # back to (1, positions, dim)
+
+        return torch.cat((class_pos_embed, patch_pos_embed), dim=1)
+
+    def forward(
+        self,
+        pixel_values: torch.Tensor,
+        bool_masked_pos: Optional[torch.BoolTensor] = None,
+        interpolate_pos_encoding: bool = False,
+    ) -> torch.Tensor:
+        batch_size, num_channels, height, width = pixel_values.shape
+        embeddings = self.patch_embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding)
+
+        batch_size, seq_len, _ = embeddings.size()
+        if bool_masked_pos is not None:
+            mask_tokens = self.mask_token.expand(batch_size, seq_len, -1)  # NOTE(review): fails if mask_token is None
+            # flatten a (B, H, W) mask to (B, H*W)
+            if bool_masked_pos.dim() == 3:
+                bool_masked_pos = bool_masked_pos.view(bool_masked_pos.size(0), -1)
+            # replace the masked visual tokens by mask_tokens
+            mask = bool_masked_pos.unsqueeze(-1).type_as(mask_tokens)
+            embeddings = embeddings * (1.0 - mask) + mask_tokens * mask
+
+        # add the [CLS] token to the embedded patch tokens
+        cls_tokens = self.cls_token.expand(batch_size, -1, -1)
+        embeddings = torch.cat((cls_tokens, embeddings), dim=1)
+
+        # add positional encoding to each token
+        if interpolate_pos_encoding:
+            embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
+        else:
+            embeddings = embeddings + self.position_embeddings
+
+        embeddings = self.dropout(embeddings)
+
+        return embeddings
+
+
+# Based on timm implementation, which can be found here:
+# https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/image_transformer.py
+class PatchEmbeddings(nn.Module):
+    """
+    Image to Patch Embedding: a strided Conv2d (kernel size == stride == patch size) projects each patch to `embed_dim`.
+    """
+
+    def __init__(
+        self,
+        image_size: int = 224,
+        patch_size: Union[int, tuple[int, int]] = 16,
+        num_channels: int = 3,
+        embed_dim: int = 768,
+    ):
+        super().__init__()
+        if not isinstance(image_size, collections.abc.Iterable):  # scalar size means a square image
+            image_size = (image_size, image_size)
+        if not isinstance(patch_size, collections.abc.Iterable):  # scalar size means a square patch
+            patch_size = (patch_size, patch_size)
+        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])  # floor division per axis
+        self.image_size = image_size
+        self.patch_size = patch_size
+        self.num_patches = num_patches
+
+        self.projection = nn.Conv2d(num_channels, embed_dim, kernel_size=patch_size, stride=patch_size)
+
+    def forward(self, pixel_values: torch.Tensor, interpolate_pos_encoding: bool = False) -> torch.Tensor:
+        batch_size, num_channels, height, width = pixel_values.shape
+        if not interpolate_pos_encoding:  # the size check is skipped when position encodings get interpolated
+            if height != self.image_size[0] or width != self.image_size[1]:
+                raise ValueError(
+                    f"Input image size ({height}*{width}) doesn't match model"
+                    f" ({self.image_size[0]}*{self.image_size[1]})."
+                )
+        x = self.projection(pixel_values).flatten(2).transpose(1, 2)  # (B, C, H', W') -> (B, H'*W', C)
+        return x
+
+
+class FlavaTextEmbeddings(nn.Module):
+    """Construct the embeddings from word, position and token_type embeddings."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
+        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
+        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
+
+        # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
+        # any TensorFlow checkpoint file
+        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        # position_ids has shape (1, max_position_embeddings); both buffers below are registered as non-persistent
+        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
+        self.register_buffer(
+            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
+        )
+        self.register_buffer(
+            "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
+        )
+
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+    ):
+        input_shape = input_ids.size()  # NOTE(review): input_ids must not be None despite its Optional default
+        seq_length = input_shape[1]
+
+        if position_ids is None:
+            position_ids = self.position_ids[:, :seq_length]  # first seq_length entries of the buffer
+
+        # When token_type_ids is not passed, fall back to the all-zeros buffer registered in the constructor;
+        # the registered buffer helps users when tracing the model without passing token_type_ids, solves
+        # issue #5664
+        if token_type_ids is None:
+            if hasattr(self, "token_type_ids"):
+                buffered_token_type_ids = self.token_type_ids[:, :seq_length]
+                buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length)
+                token_type_ids = buffered_token_type_ids_expanded
+            else:
+                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)
+
+        inputs_embeds = self.word_embeddings(input_ids)
+        token_type_embeddings = self.token_type_embeddings(token_type_ids)
+
+        embeddings = inputs_embeds + token_type_embeddings
+        if self.position_embedding_type == "absolute":  # position embeddings are added only in this mode
+            position_embeddings = self.position_embeddings(position_ids)
+            embeddings += position_embeddings
+        embeddings = self.LayerNorm(embeddings)
+        embeddings = self.dropout(embeddings)
+        return embeddings
+
+
+class FlavaSelfAttention(nn.Module):  # multi-head scaled dot-product self-attention, without the output projection
+    def __init__(self, config: FlavaPossibleConfigs) -> None:
+        super().__init__()
+        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
+            raise ValueError(  # raised only for configs that do not define `embedding_size`
+                f"The hidden size {config.hidden_size} is not a multiple of the number of attention "
+                f"heads {config.num_attention_heads}."
+            )
+
+        self.num_attention_heads = config.num_attention_heads
+        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
+        self.all_head_size = self.num_attention_heads * self.attention_head_size
+
+        self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
+        self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
+        self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
+
+        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+    ) -> Union[tuple[torch.Tensor, torch.Tensor], tuple[torch.Tensor]]:
+        batch_size, seq_length, _ = hidden_states.shape
+        query_layer = (  # projected, then split into heads: (batch, heads, seq, head_size)
+            self.query(hidden_states)
+            .view(batch_size, -1, self.num_attention_heads, self.attention_head_size)
+            .transpose(1, 2)
+        )
+        key_layer = (  # same layout as query_layer
+            self.key(hidden_states)
+            .view(batch_size, -1, self.num_attention_heads, self.attention_head_size)
+            .transpose(1, 2)
+        )
+        value_layer = (  # same layout as query_layer
+            self.value(hidden_states)
+            .view(batch_size, -1, self.num_attention_heads, self.attention_head_size)
+            .transpose(1, 2)
+        )
+
+        # Take the dot product between "query" and "key" to get the raw attention scores.
+        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
+
+        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
+        if attention_mask is not None:
+            # The mask is added to the raw scores, so it is expected to be additive and precomputed by the caller
+            attention_scores = attention_scores + attention_mask
+
+        # Normalize the attention scores to probabilities.
+        attention_probs = nn.functional.softmax(attention_scores, dim=-1)
+
+        # This is actually dropping out entire tokens to attend to, which might
+        # seem a bit unusual, but is taken from the original Transformer paper.
+        attention_probs = self.dropout(attention_probs)
+
+        # Mask heads if we want to
+        if head_mask is not None:
+            attention_probs = attention_probs * head_mask
+
+        context_layer = torch.matmul(attention_probs, value_layer)
+
+        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()  # heads moved back next to head_size
+        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
+        context_layer = context_layer.view(*new_context_layer_shape)  # merge heads: (batch, seq, all_head_size)
+
+        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
+
+        return outputs
+
+
+class FlavaSelfOutput(nn.Module):
+    """
+    The residual connection is defined in FlavaLayer (same as ViTLayer) instead of here (as is the case with other
+    models), due to the layernorm applied before each block.
+    """
+
+    def __init__(self, config: FlavaPossibleConfigs) -> None:
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.dense(hidden_states)  # `input_tensor` is accepted but not used in this method
+        hidden_states = self.dropout(hidden_states)
+
+        return hidden_states
+
+
+class FlavaAttention(nn.Module):  # FlavaSelfAttention followed by the FlavaSelfOutput projection
+    def __init__(self, config: FlavaPossibleConfigs) -> None:
+        super().__init__()
+        self.attention = FlavaSelfAttention(config)
+        self.output = FlavaSelfOutput(config)
+        self.pruned_heads = set()  # heads removed so far by prune_heads
+
+    def prune_heads(self, heads: set[int]) -> None:
+        if len(heads) == 0:  # nothing to prune
+            return
+        heads, index = find_pruneable_heads_and_indices(
+            heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads
+        )
+
+        # Prune linear layers
+        self.attention.query = prune_linear_layer(self.attention.query, index)
+        self.attention.key = prune_linear_layer(self.attention.key, index)
+        self.attention.value = prune_linear_layer(self.attention.value, index)
+        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)  # pruned along dim=1, unlike q/k/v
+
+        # Update hyper params and store pruned heads
+        self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads)
+        self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads
+        self.pruned_heads = self.pruned_heads.union(heads)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+    ) -> Union[tuple[torch.Tensor, torch.Tensor], tuple[torch.Tensor]]:
+        self_outputs = self.attention(
+            hidden_states, attention_mask=attention_mask, head_mask=head_mask, output_attentions=output_attentions
+        )
+
+        attention_output = self.output(self_outputs[0], hidden_states)  # projection + dropout of the context
+
+        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
+        return outputs
+
+
+class FlavaIntermediate(nn.Module):  # hidden_size -> intermediate_size linear layer followed by the activation
+    def __init__(self, config: FlavaPossibleConfigs) -> None:
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
+        if isinstance(config.hidden_act, str):  # a string is looked up in ACT2FN
+            self.intermediate_act_fn = ACT2FN[config.hidden_act]
+        else:  # anything else is used directly as the activation
+            self.intermediate_act_fn = config.hidden_act
+
+    # Copied from transformers.models.vit.modeling_vit.ViTIntermediate.forward
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.intermediate_act_fn(hidden_states)
+
+        return hidden_states
+
+
+class FlavaOutput(nn.Module):  # intermediate_size -> hidden_size linear layer, dropout, then residual add
+    def __init__(self, config: FlavaPossibleConfigs) -> None:
+        super().__init__()
+        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    # Copied from transformers.models.vit.modeling_vit.ViTOutput.forward
+    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+
+        hidden_states = hidden_states + input_tensor  # residual connection with the tensor passed by the caller
+
+        return hidden_states
+
+
+class FlavaLayer(GradientCheckpointingLayer):
+    """Pre-layernorm transformer block; this corresponds to the Block class in the timm implementation."""
+
+    def __init__(self, config: FlavaPossibleConfigs) -> None:
+        super().__init__()
+        self.chunk_size_feed_forward = config.chunk_size_feed_forward  # stored; not read inside this class
+        self.seq_len_dim = 1  # stored; not read inside this class
+        self.attention = FlavaAttention(config)
+        self.intermediate = FlavaIntermediate(config)
+        self.output = FlavaOutput(config)
+
+        # TODO: Check fp32 layer norm possibility
+        self.layernorm_before = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.layernorm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+    ) -> Union[tuple[torch.Tensor, torch.Tensor], tuple[torch.Tensor]]:
+        self_attention_outputs = self.attention(
+            self.layernorm_before(hidden_states),  # in ViT, layernorm is applied before self-attention
+            attention_mask=attention_mask,
+            head_mask=head_mask,
+            output_attentions=output_attentions,
+        )
+        attention_output = self_attention_outputs[0]
+        outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights
+
+        # first residual connection
+        hidden_states = attention_output + hidden_states
+
+        # in ViT, layernorm is also applied after self-attention
+        layer_output = self.layernorm_after(hidden_states)
+        layer_output = self.intermediate(layer_output)
+
+        # second residual connection is done here
+        layer_output = self.output(layer_output, hidden_states)  # FlavaOutput adds `hidden_states` back
+
+        outputs = (layer_output,) + outputs  # layer output first, then attention weights if requested
+
+        return outputs
+
+
+class FlavaEncoder(nn.Module):
+    """Applies `config.num_hidden_layers` stacked `FlavaLayer` blocks in sequence."""
+
+    def __init__(self, config: FlavaPossibleConfigs) -> None:
+        # The encoder is instantiated with image, text and multimodal configs alike, hence the
+        # `FlavaPossibleConfigs` annotation rather than the composite `FlavaConfig`.
+        super().__init__()
+        self.config = config
+        self.layer = nn.ModuleList([FlavaLayer(config) for _ in range(config.num_hidden_layers)])
+        self.gradient_checkpointing = False
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+    ) -> Union[tuple, BaseModelOutput]:
+        """
+        Run `hidden_states` through every layer.
+
+        Args:
+            hidden_states: Input to the first layer.
+            attention_mask: Forwarded unchanged to every layer.
+            head_mask: When given, indexed by layer number and the slice is forwarded to that layer.
+            output_attentions: Collect element 1 of every layer's output.
+            output_hidden_states: Collect the input of every layer plus the final output
+                (one more entry than there are layers).
+            return_dict: Return a `BaseModelOutput` instead of a plain tuple.
+
+        Returns:
+            A `BaseModelOutput`, or, when `return_dict` is false, a tuple of the final hidden state
+            followed by whichever of the collected hidden states / attentions were requested.
+        """
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attentions = () if output_attentions else None
+
+        for i, layer_module in enumerate(self.layer):
+            if output_hidden_states:
+                # Recorded *before* the layer runs, so entry i is the input of layer i.
+                all_hidden_states = all_hidden_states + (hidden_states,)
+
+            layer_head_mask = head_mask[i] if head_mask is not None else None
+
+            layer_outputs = layer_module(hidden_states, attention_mask, layer_head_mask, output_attentions)
+
+            hidden_states = layer_outputs[0]
+
+            if output_attentions:
+                all_self_attentions = all_self_attentions + (layer_outputs[1],)
+
+        if output_hidden_states:
+            # Append the output of the last layer.
+            all_hidden_states = all_hidden_states + (hidden_states,)
+
+        if not return_dict:
+            # Entries that were not requested are dropped rather than returned as None.
+            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
+        return BaseModelOutput(
+            last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_self_attentions
+        )
+
+
+class FlavaPooler(nn.Module):
+    """Summarises a sequence by passing its first token through a dense layer followed by tanh."""
+
+    def __init__(self, config: FlavaPossibleConfigs):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        self.activation = nn.Tanh()
+
+    def forward(self, hidden_states: torch.Tensor):
+        # "Pooling" is just selecting index 0 along dim 1, i.e. the first token's hidden state.
+        first_token = hidden_states[:, 0]
+        return self.activation(self.dense(first_token))
+
+
+@auto_docstring
+class FlavaPreTrainedModel(PreTrainedModel):
+    config: FlavaConfig
+    base_model_prefix = "flava"
+    supports_gradient_checkpointing = True
+
+    def _init_weights(self, module: nn.Module) -> None:
+        """
+        Initialize the weights of a single submodule.
+
+        The branch taken depends on the module type:
+
+        - `nn.Linear` / `nn.Conv2d`: normal-distributed weights with std `config.initializer_range`, zero bias.
+        - `nn.Embedding`: normal-distributed weights, with the padding row (if any) zeroed.
+        - `nn.LayerNorm`: zero bias, unit weight.
+        - `FlavaMaskedPredictionHead`: zero bias.
+        - `FlavaImageEmbeddings`: zeroed CLS token, position embeddings and (if present) mask token.
+        - `FlavaMultimodalModel`: zeroed CLS token when the model uses one.
+        - `FlavaModel`: `logit_scale` filled with `config.logit_scale_init_value`.
+
+        Any other module type is left untouched.
+        """
+        if isinstance(module, (nn.Linear, nn.Conv2d)):
+            # Slightly different from the TF version which uses truncated_normal for initialization
+            # cf https://github.com/pytorch/pytorch/pull/5617
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+        elif isinstance(module, FlavaMaskedPredictionHead):
+            module.bias.data.zero_()
+        elif isinstance(module, FlavaImageEmbeddings):
+            module.cls_token.data.zero_()
+            module.position_embeddings.data.zero_()
+            if module.mask_token is not None:
+                module.mask_token.data.zero_()
+        elif isinstance(module, FlavaMultimodalModel):
+            # `cls_token` only exists when the model was built with `use_cls_token`.
+            if module.use_cls_token:
+                module.cls_token.data.zero_()
+        elif isinstance(module, FlavaModel):
+            module.logit_scale.data.fill_(self.config.logit_scale_init_value)
+
+
+@auto_docstring
+class FlavaImageModel(FlavaPreTrainedModel):
+    config: FlavaImageConfig
+    # This override allows us to load FlavaImageModel from FlavaModel/FlavaForPreTraining checkpoints.
+    base_model_prefix = "flava.image_model"
+    main_input_name = "pixel_values"
+
+    def __init__(self, config: FlavaImageConfig, add_pooling_layer: bool = True):
+        r"""
+        add_pooling_layer (bool, *optional*, defaults to `True`):
+            Whether to add a pooling layer
+        """
+        super().__init__(config)
+
+        self.config = config
+
+        # Pixels -> embeddings -> transformer stack -> final layer norm -> optional pooler.
+        self.embeddings = FlavaImageEmbeddings(config)
+        self.encoder = FlavaEncoder(config)
+
+        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.pooler = FlavaPooler(config) if add_pooling_layer else None
+
+        self.post_init()
+
+    def get_input_embeddings(self) -> nn.Module:
+        return self.embeddings.patch_embeddings
+
+    def set_input_embeddings(self, value: nn.Module):
+        self.embeddings.patch_embeddings = value
+
+    def _prune_heads(self, heads_to_prune: dict[int, list[int]]) -> None:
+        """
+        Prune attention heads. `heads_to_prune` maps a layer index to the list of heads to remove in that
+        layer. See base class PreTrainedModel.
+        """
+        for layer_index, head_indices in heads_to_prune.items():
+            target_layer = self.encoder.layer[layer_index]
+            target_layer.attention.prune_heads(head_indices)
+
+    @auto_docstring
+    def forward(
+        self,
+        pixel_values: Optional[torch.Tensor] = None,
+        bool_masked_pos: Optional[torch.BoolTensor] = None,
+        interpolate_pos_encoding: Optional[bool] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, BaseModelOutputWithPooling]:
+        r"""
+        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, image_num_patches)`):
+            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
+        """
+        # Unset flags fall back to the values stored on the config.
+        if output_attentions is None:
+            output_attentions = self.config.output_attentions
+        if output_hidden_states is None:
+            output_hidden_states = self.config.output_hidden_states
+        if return_dict is None:
+            return_dict = self.config.use_return_dict
+
+        if pixel_values is None:
+            raise ValueError("You have to specify pixel_values")
+
+        # Prepare head mask if needed
+        # 1.0 in head_mask indicate we keep the head
+        # attention_probs has shape bsz x n_heads x N x N
+        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
+        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
+        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
+
+        patch_embeddings = self.embeddings(
+            pixel_values, bool_masked_pos=bool_masked_pos, interpolate_pos_encoding=interpolate_pos_encoding
+        )
+
+        encoder_outputs = self.encoder(
+            patch_embeddings,
+            attention_mask=attention_mask,
+            head_mask=head_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        # The final layer norm is applied to the encoder's last hidden state only.
+        sequence_output = self.layernorm(encoder_outputs[0])
+        pooled_output = None if self.pooler is None else self.pooler(sequence_output)
+
+        if return_dict:
+            return BaseModelOutputWithPooling(
+                last_hidden_state=sequence_output,
+                pooler_output=pooled_output,
+                hidden_states=encoder_outputs.hidden_states,
+                attentions=encoder_outputs.attentions,
+            )
+
+        return (sequence_output, pooled_output) + encoder_outputs[1:]
+
+
+@auto_docstring
+class FlavaTextModel(FlavaPreTrainedModel):
+    config: FlavaTextConfig
+    # This override allows us to load FlavaTextModel from FlavaModel/FlavaForPreTraining checkpoints.
+    base_model_prefix = "flava.text_model"
+
+    def __init__(self, config: FlavaTextConfig, add_pooling_layer: bool = True):
+        r"""
+        add_pooling_layer (bool, *optional*, defaults to `True`):
+            Whether to add a pooling layer
+        """
+        super().__init__(config)
+        self.config = config
+
+        # Token ids -> embeddings -> transformer stack -> final layer norm -> optional pooler.
+        self.embeddings = FlavaTextEmbeddings(config)
+        self.encoder = FlavaEncoder(config)
+
+        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.pooler = FlavaPooler(config) if add_pooling_layer else None
+
+        self.post_init()
+
+    def get_input_embeddings(self) -> nn.Module:
+        # Returns the word-embedding module (not a patch embedding), mirroring `set_input_embeddings`.
+        return self.embeddings.word_embeddings
+
+    def set_input_embeddings(self, value: nn.Module):
+        self.embeddings.word_embeddings = value
+
+    def _prune_heads(self, heads_to_prune: dict[int, list[int]]) -> None:
+        """
+        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
+        class PreTrainedModel
+        """
+        for layer, heads in heads_to_prune.items():
+            self.encoder.layer[layer].attention.prune_heads(heads)
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, BaseModelOutputWithPooling]:
+        r"""
+        input_ids (`torch.LongTensor` of shape `(batch_size, text_seq_length)`):
+            Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See
+            [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input
+            IDs?](../glossary#input-ids)
+        token_type_ids (`torch.LongTensor` of shape `(batch_size, text_seq_length)`, *optional*):
+            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
+            1]`:
+            - 0 corresponds to a *sentence A* token,
+            - 1 corresponds to a *sentence B* token.
+            [What are token type IDs?](../glossary#token-type-ids)
+        """
+        # Unset flags fall back to the values stored on the config.
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if input_ids is None:
+            raise ValueError("You have to specify input_ids")
+
+        input_shape = input_ids.size()
+
+        if attention_mask is None:
+            # No mask supplied: attend to every position.
+            attention_mask = torch.ones(input_shape, device=input_ids.device)
+
+        # Prepare head mask if needed
+        # 1.0 in head_mask indicate we keep the head
+        # attention_probs has shape bsz x n_heads x N x N
+        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
+        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
+        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
+        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(
+            attention_mask, input_shape, input_ids.device
+        )
+
+        embedding_output = self.embeddings(
+            input_ids=input_ids,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+        )
+
+        encoder_outputs = self.encoder(
+            embedding_output,
+            attention_mask=extended_attention_mask,
+            head_mask=head_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        sequence_output = encoder_outputs[0]
+        # The final layer norm is applied to the last hidden state only.
+        sequence_output = self.layernorm(sequence_output)
+        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None
+
+        if not return_dict:
+            return (sequence_output, pooled_output) + encoder_outputs[1:]
+
+        return BaseModelOutputWithPooling(
+            last_hidden_state=sequence_output,
+            pooler_output=pooled_output,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+        )
+
+
+@auto_docstring
+class FlavaMultimodalModel(FlavaPreTrainedModel):
+    config: FlavaMultimodalConfig
+    # This override allows us to load FlavaMultimodalModel from FlavaModel/FlavaForPreTraining checkpoints.
+    base_model_prefix = "flava.multimodal_model"
+    main_input_name = "hidden_states"
+
+    def __init__(self, config: FlavaMultimodalConfig, add_pooling_layer: bool = True):
+        r"""
+        add_pooling_layer (bool, *optional*, defaults to `True`):
+            Whether to add a pooling layer
+        """
+        super().__init__(config)
+        self.config = config
+        self.use_cls_token = self.config.use_cls_token
+        if self.use_cls_token:
+            # Learned token prepended to every input sequence in `forward`.
+            self.cls_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
+
+        self.encoder = FlavaEncoder(config)
+
+        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.pooler = FlavaPooler(config) if add_pooling_layer else None
+
+        self.post_init()
+
+    def _prune_heads(self, heads_to_prune: dict[int, list[int]]) -> None:
+        """
+        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
+        class PreTrainedModel
+        """
+        for layer, heads in heads_to_prune.items():
+            self.encoder.layer[layer].attention.prune_heads(heads)
+
+    @auto_docstring
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, BaseModelOutputWithPooling]:
+        r"""
+        hidden_states (`torch.FloatTensor` of shape `(batch_size, image_num_patches + text_seq_len, hidden_size)`):
+            The concatenated hidden states of unimodal encoders.
+        """
+        # Unset flags fall back to the values stored on the config.
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        batch_size, seq_length, _ = hidden_states.size()
+
+        if self.use_cls_token:
+            # Prepend the CLS token; the sequence (and any default mask built below) grows by one.
+            cls_tokens = self.cls_token.expand(batch_size, -1, -1)
+            hidden_states = torch.cat((cls_tokens, hidden_states), dim=1)
+            seq_length += 1
+
+        if attention_mask is None:
+            # No mask supplied: attend to every position, including the CLS token if present.
+            attention_mask = torch.ones((batch_size, seq_length), device=hidden_states.device)
+
+        # Prepare head mask if needed
+        # 1.0 in head_mask indicate we keep the head
+        # attention_probs has shape bsz x n_heads x N x N
+        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
+        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
+        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
+        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(
+            attention_mask, (batch_size, seq_length), hidden_states.device
+        )
+
+        encoder_outputs = self.encoder(
+            hidden_states,
+            attention_mask=extended_attention_mask,
+            head_mask=head_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        sequence_output = encoder_outputs[0]
+        # The final layer norm is applied to the last hidden state only.
+        sequence_output = self.layernorm(sequence_output)
+        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None
+
+        if not return_dict:
+            return (sequence_output, pooled_output) + encoder_outputs[1:]
+
+        return BaseModelOutputWithPooling(
+            last_hidden_state=sequence_output,
+            pooler_output=pooled_output,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+        )
+
+
+@auto_docstring
+class FlavaModel(FlavaPreTrainedModel):
+    config: FlavaConfig
+
+    def __init__(self, config: FlavaConfig):
+        super().__init__(config)
+
+        # Fail early with a clear message if any sub-config has the wrong type.
+        if not isinstance(config.text_config, FlavaTextConfig):
+            raise TypeError(
+                "config.text_config is expected to be of type FlavaTextConfig but is of type"
+                f" {type(config.text_config)}."
+            )
+
+        if not isinstance(config.image_config, FlavaImageConfig):
+            raise TypeError(
+                "config.image_config is expected to be of type FlavaImageConfig but is of type"
+                f" {type(config.image_config)}."
+            )
+
+        if not isinstance(config.multimodal_config, FlavaMultimodalConfig):
+            raise TypeError(
+                "config.multimodal_config is expected to be of type FlavaMultimodalConfig but "
+                + f"is of type {type(config.multimodal_config)}."
+            )
+
+        text_config = config.text_config
+        image_config = config.image_config
+        multimodal_config = config.multimodal_config
+
+        self.projection_dim = config.projection_dim
+        self.text_hidden_size = text_config.hidden_size
+        self.image_hidden_size = image_config.hidden_size
+        self.mm_hidden_size = multimodal_config.hidden_size
+
+        self.text_model = FlavaTextModel(text_config)
+        self.image_model = FlavaImageModel(image_config)
+        self.multimodal_model = FlavaMultimodalModel(multimodal_config)
+
+        # Projections of each unimodal encoder's output into the shared `projection_dim` space.
+        self.image_projection = nn.Linear(self.image_hidden_size, self.projection_dim)
+        self.text_projection = nn.Linear(self.text_hidden_size, self.projection_dim)
+        self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value))
+
+        # Projections of each unimodal encoder's hidden states into the multimodal encoder's width.
+        self.image_to_mm_projection = nn.Linear(self.image_hidden_size, self.mm_hidden_size)
+        self.text_to_mm_projection = nn.Linear(self.text_hidden_size, self.mm_hidden_size)
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def get_text_features(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> torch.FloatTensor:
+        r"""
+        input_ids (`torch.LongTensor` of shape `(batch_size, text_seq_length)`):
+            Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See
+            [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input
+            IDs?](../glossary#input-ids)
+        token_type_ids (`torch.LongTensor` of shape `(batch_size, text_seq_length)`, *optional*):
+            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
+            1]`:
+            - 0 corresponds to a *sentence A* token,
+            - 1 corresponds to a *sentence B* token.
+            [What are token type IDs?](../glossary#token-type-ids)
+
+        Returns:
+            text_features (`torch.FloatTensor` of shape `(batch_size, text_seq_length, projection_dim)`): The text
+            embeddings obtained by applying the projection layer to the last hidden state of [`FlavaTextModel`].
+
+        Examples:
+
+        ```python
+        >>> from transformers import AutoProcessor, FlavaModel
+
+        >>> model = FlavaModel.from_pretrained("{0}")
+        >>> processor = AutoProcessor.from_pretrained("{0}")
+
+        >>> inputs = processor(
+        ...     text=["a photo of a cat", "a photo of a dog"], max_length=77, padding="max_length", return_tensors="pt"
+        ... )
+        >>> text_features = model.get_text_features(**inputs)
+        ```
+        """
+        text_outputs = self.text_model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        # Element 0 is the layer-normed last hidden state (one vector per token), not the pooler output.
+        last_hidden_state = text_outputs[0]
+        text_features = self.text_projection(last_hidden_state)
+
+        return text_features
+
+    @auto_docstring
+    def get_image_features(
+        self,
+        pixel_values: Optional[torch.Tensor] = None,
+        bool_masked_pos: Optional[torch.BoolTensor] = None,
+        interpolate_pos_encoding: Optional[bool] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> torch.FloatTensor:
+        r"""
+        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, image_num_patches)`):
+            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
+
+        Returns:
+            image_features (`torch.FloatTensor` of shape `(batch_size, image_seq_length, projection_dim)`): The image
+            embeddings obtained by applying the projection layer to the last hidden state of [`FlavaImageModel`].
+
+        Examples:
+
+        ```python
+        >>> from PIL import Image
+        >>> import requests
+        >>> from transformers import AutoProcessor, FlavaModel
+
+        >>> model = FlavaModel.from_pretrained("{0}")
+        >>> processor = AutoProcessor.from_pretrained("{0}")
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> inputs = processor(images=image, return_tensors="pt")
+
+        >>> image_features = model.get_image_features(**inputs)
+        ```
+        """
+        image_outputs = self.image_model(
+            pixel_values=pixel_values,
+            bool_masked_pos=bool_masked_pos,
+            attention_mask=attention_mask,
+            head_mask=head_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            interpolate_pos_encoding=interpolate_pos_encoding,
+            return_dict=return_dict,
+        )
+
+        # Element 0 is the layer-normed last hidden state (one vector per position), not the pooler output.
+        last_hidden_state = image_outputs[0]
+        image_features = self.image_projection(last_hidden_state)
+
+        return image_features
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        bool_masked_pos: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        image_attention_mask: Optional[torch.Tensor] = None,
+        skip_multimodal_encoder: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: bool = True,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, FlavaModelOutput]:
+        r"""
+        input_ids (`torch.LongTensor` of shape `(batch_size, image_num_patches + text_seq_len)`):
+            Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See
+            [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input
+            IDs?](../glossary#input-ids)
+        token_type_ids (`torch.LongTensor` of shape `(batch_size, image_num_patches + text_seq_len)`, *optional*):
+            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
+            1]`:
+            - 0 corresponds to a *sentence A* token,
+            - 1 corresponds to a *sentence B* token.
+            [What are token type IDs?](../glossary#token-type-ids)
+        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, image_num_patches)`):
+            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
+        image_attention_mask (`torch.Tensor` of shape `(batch_size, image_num_patches)`, *optional*):
+            Mask to avoid performing attention on padding pixel values for image inputs. Mask values selected in `[0, 1]`:
+            - 1 for pixel values that are real (i.e., **not masked**),
+            - 0 for pixel values that are padding (i.e., **masked**).
+        skip_multimodal_encoder (*bool*, *optional*):
+            Skip any calculations for multimodal encoder. Useful if multimodal encoding is not going to be used.
+
+        Examples:
+
+        ```python
+        >>> from PIL import Image
+        >>> import requests
+        >>> from transformers import AutoProcessor, FlavaModel
+
+        >>> model = FlavaModel.from_pretrained("facebook/flava-full")
+        >>> processor = AutoProcessor.from_pretrained("facebook/flava-full")
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> inputs = processor(text=["a photo of a cat"], images=image, return_tensors="pt", padding=True)
+
+        >>> outputs = model(**inputs)
+
+        >>> image_embeddings = outputs.image_embeddings
+        >>> text_embeddings = outputs.text_embeddings
+        >>> multimodal_embeddings = outputs.multimodal_embeddings
+
+        >>> outputs.image_embeddings.shape
+        torch.Size([1, 197, 768])
+
+        >>> text_embeddings.shape
+        torch.Size([1, 7, 768])
+
+        >>> multimodal_embeddings.shape
+        torch.Size([1, 205, 768])
+        ```
+        """
+
+        return_dict = return_dict if return_dict is not None else self.config.return_dict
+        # The multimodal projections below read the unimodal encoders' per-layer hidden states.
+        if not output_hidden_states:
+            raise ValueError("FLAVA model requires hidden states to work. Please set `output_hidden_states=True`")
+        image_embeddings = None
+        image_states = None
+        image_mm_projection = None
+        image_output = None
+        if pixel_values is not None:
+            image_output = self.image_model(
+                pixel_values=pixel_values,
+                bool_masked_pos=bool_masked_pos,
+                attention_mask=image_attention_mask,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+            )
+            # Index 0 is the last hidden state, index 2 the tuple of per-layer hidden states.
+            image_embeddings, image_states = image_output[0], image_output[2]
+            # Note that these states don't use final layernorm in the transformer model
+            image_mm_projection = self.image_to_mm_projection(image_states[-1])
+
+        text_embeddings = None
+        text_states = None
+        text_mm_projection = None
+        text_output = None
+        if input_ids is not None:
+            text_output = self.text_model(
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+                position_ids=position_ids,
+                token_type_ids=token_type_ids,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+            )
+
+            # Index 0 is the last hidden state, index 2 the tuple of per-layer hidden states.
+            text_embeddings, text_states = text_output[0], text_output[2]
+            # Note that these states don't use final layernorm in the transformer model
+            text_mm_projection = self.text_to_mm_projection(text_states[-1])
+
+        multimodal_embeddings = None
+        multimodal_output = None
+        # The multimodal encoder only runs when both modalities were supplied and it was not skipped.
+        if image_mm_projection is not None and text_mm_projection is not None and not skip_multimodal_encoder:
+            if attention_mask is not None:
+                # Build an all-ones mask for the image positions (plus the CLS token the multimodal
+                # model prepends, if any) and put it in front of the caller's text mask.
+                batch_size, seq_len, _ = image_mm_projection.shape
+                if self.multimodal_model.use_cls_token:
+                    seq_len += 1
+                attention_mask_image = torch.ones(batch_size, seq_len, device=image_mm_projection.device)
+                attention_multimodal = torch.cat([attention_mask_image, attention_mask], dim=1)
+            else:
+                attention_multimodal = None
+            # Image positions come first, then text positions.
+            multimodal_input = torch.cat([image_mm_projection, text_mm_projection], dim=1)
+            multimodal_output = self.multimodal_model(
+                multimodal_input, attention_mask=attention_multimodal, return_dict=return_dict
+            )
+            multimodal_embeddings = multimodal_output[0]
+
+        if not return_dict:
+            return (
+                image_embeddings,
+                image_output,
+                text_embeddings,
+                text_output,
+                multimodal_embeddings,
+                multimodal_output,
+            )
+
+        return FlavaModelOutput(
+            image_embeddings=image_embeddings,
+            image_output=image_output,
+            text_embeddings=text_embeddings,
+            text_output=text_output,
+            multimodal_embeddings=multimodal_embeddings,
+            multimodal_output=multimodal_output,
+        )
+
+
+class FlavaImageCodebookResPath(nn.Module):
+    """
+    Residual branch of a codebook block: four ReLU + convolution pairs. The first three convolutions
+    are 3x3 with padding 1 at a quarter of the output width; the last is a 1x1 convolution that
+    expands to `out_size` channels.
+    """
+
+    def __init__(self, in_size: int, out_size: int, **kwargs):
+        super().__init__()
+        hid_size = out_size // 4
+
+        # Layers are created in execution order so parameter registration order is unchanged.
+        self.path = nn.Sequential(
+            OrderedDict(
+                [
+                    ("relu_1", nn.ReLU()),
+                    ("conv_1", nn.Conv2d(in_size, hid_size, kernel_size=3, padding=1)),
+                    ("relu_2", nn.ReLU()),
+                    ("conv_2", nn.Conv2d(hid_size, hid_size, kernel_size=3, padding=1)),
+                    ("relu_3", nn.ReLU()),
+                    ("conv_3", nn.Conv2d(hid_size, hid_size, kernel_size=3, padding=1)),
+                    ("relu_4", nn.ReLU()),
+                    ("conv_4", nn.Conv2d(hid_size, out_size, kernel_size=1, padding=0)),
+                ]
+            )
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return self.path(x)
+
+
+class FlavaImageCodebookBlock(nn.Module):
+    def __init__(self, in_size: int, out_size: int, num_layers: int, **kwargs):
+        super().__init__()
+
+        self.post_gain = 1 / (num_layers**2)
+
+        if in_size != out_size:
+            self.id_path = nn.Conv2d(in_size, out_size, kernel_size=1, padding=0)
+        else:
+            self.id_path = nn.Identity()
+
+        self.res_path = FlavaImageCodebookResPath(in_size, out_size)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return self.id_path(x) + self.post_gain * self.res_path(x)
+
+
+class FlavaImageCodebookLayerGroup(nn.Module):
+    def __init__(self, num_blocks: int, num_layers: int, in_size: int, out_size: int, use_pool: bool = True):
+        super().__init__()
+        blocks = OrderedDict()
+        for i in range(num_blocks):
+            if i == 0:
+                blocks[f"block_{i + 1}"] = FlavaImageCodebookBlock(in_size, out_size, num_layers)
+            else:
+                blocks[f"block_{i + 1}"] = FlavaImageCodebookBlock(out_size, out_size, num_layers)
+
+        if use_pool:
+            blocks["pool"] = nn.MaxPool2d(kernel_size=2)
+
+        self.group = nn.Sequential(blocks)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return self.group(x)
+
+
+# Inspired by DALLE Encoder in https://github.com/openai/DALL-E/blob/5be4b236bc3ade6943662354117a0e83752cc322/dall_e/encoder.py#L42
+@auto_docstring(
+    custom_intro="""
+    The FLAVA's image codebook model inspired from DALL-E's original encoder. Outputs raw hidden states and can be used
+    to generate image tokens for an image based on DALL-E's vocab. Used to generate labels for MIM. Use
+    `get_codebook_indices` to get image tokens for an image.
+    """
+)
+class FlavaImageCodebook(FlavaPreTrainedModel):
+    base_model_prefix = ""
+    config: FlavaImageCodebookConfig
+    main_input_name = "pixel_values"
+    supports_gradient_checkpointing = False
+
+    def __init__(
+        self,
+        config: FlavaImageCodebookConfig,
+        **kwargs: Any,
+    ):
+        super().__init__(config)
+
+        self.config = config
+        self.num_groups = config.num_groups
+        self.input_channels = config.input_channels
+        self.num_blocks_per_group = config.num_blocks_per_group
+        self.hidden_size = config.hidden_size
+        self.vocab_size = config.vocab_size
+
+        num_layers = self.num_groups * self.num_blocks_per_group
+
+        output_blocks = OrderedDict()
+        output_blocks["relu"] = nn.ReLU()
+        output_blocks["conv"] = nn.Conv2d(8 * self.hidden_size, self.vocab_size, kernel_size=1, padding=0)
+
+        blocks = OrderedDict()
+        blocks["input"] = nn.Conv2d(self.input_channels, 1 * self.hidden_size, kernel_size=7, padding=3)
+        blocks["group_1"] = FlavaImageCodebookLayerGroup(
+            self.num_blocks_per_group, num_layers, 1 * self.hidden_size, 1 * self.hidden_size
+        )
+        blocks["group_2"] = FlavaImageCodebookLayerGroup(
+            self.num_blocks_per_group, num_layers, 1 * self.hidden_size, 2 * self.hidden_size
+        )
+        blocks["group_3"] = FlavaImageCodebookLayerGroup(
+            self.num_blocks_per_group, num_layers, 2 * self.hidden_size, 4 * self.hidden_size
+        )
+        blocks["group_4"] = FlavaImageCodebookLayerGroup(
+            self.num_blocks_per_group, num_layers, 4 * self.hidden_size, 8 * self.hidden_size, use_pool=False
+        )
+        blocks["output"] = nn.Sequential(output_blocks)
+
+        self.blocks = nn.Sequential(blocks)
+
+        self.post_init()
+
+        if self.config.freeze:
+            for param in self.parameters():
+                param.requires_grad = False
+
+    def get_codebook_indices(self, pixel_values: torch.Tensor) -> torch.Tensor:
+        f"""
+        Args:
+            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
+                Pixel values. Codebook pixel values can be obtained using [`AutoImageProcessor`] by passing
+                `return_codebook_pixels=True`. See [`FlavaImageProcessor.__call__`] for details.
+
+        Examples:
+        ```python
+        >>> from PIL import Image
+        >>> import requests
+        >>> from transformers import AutoImageProcessor, FlavaImageCodebook
+
+        >>> model = FlavaImageCodebook.from_pretrained("{_CHECKPOINT_FOR_CODEBOOK_DOC}")
+        >>> image_processor = AutoImageProcessor.from_pretrained("{_CHECKPOINT_FOR_CODEBOOK_DOC}")
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> inputs = image_processor([image], return_codebook_pixels=True, return_tensors="pt")
+        >>> inputs = dict(pixel_values=inputs.codebook_pixel_values)
+
+        >>> outputs = model.get_codebook_indices(**inputs)
+        ```
+        """
+        z_logits = self.blocks(pixel_values)
+        return torch.argmax(z_logits, axis=1)
+
+    def get_codebook_probs(self, pixel_values: torch.Tensor) -> torch.Tensor:
+        z_logits = self.blocks(pixel_values)
+        return nn.Softmax(dim=1)(z_logits)
+
+    def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
+        f"""
+        Args:
+            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
+                Pixel values. Codebook pixel values can be obtained using [`AutoImageProcessor`] by passing
+                `return_codebook_pixels=True`. See [`FlavaImageProcessor.__call__`] for details.
+
+        Examples:
+
+        ```python
+        >>> from PIL import Image
+        >>> import requests
+        >>> from transformers import AutoImageProcessor, FlavaImageCodebook
+
+        >>> model = FlavaImageCodebook.from_pretrained("{_CHECKPOINT_FOR_CODEBOOK_DOC}")
+        >>> image_processor = AutoImageProcessor.from_pretrained("{_CHECKPOINT_FOR_CODEBOOK_DOC}")
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> inputs = image_processor([image], return_codebook_pixels=True, return_tensors="pt")
+        >>> inputs = dict(pixel_values=inputs.codebook_pixel_values)
+
+        >>> outputs = model(**inputs)
+        >>> print(outputs.shape)
+        (1, 196)
+        ```
+        """
+        if len(pixel_values.shape) != 4:
+            raise ValueError(f"input shape {pixel_values.shape} is not 4d")
+        if pixel_values.shape[1] != self.input_channels:
+            raise ValueError(f"input has {pixel_values.shape[1]} channels but model built for {self.input_channels}")
+        return self.blocks(pixel_values)
+
+
+class FlavaPredictionHeadTransform(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        if isinstance(config.hidden_act, str):
+            self.transform_act_fn = ACT2FN[config.hidden_act]
+        else:
+            self.transform_act_fn = config.hidden_act
+        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+
+    def forward(self, hidden_states):
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.transform_act_fn(hidden_states)
+        hidden_states = self.LayerNorm(hidden_states)
+        return hidden_states
+
+
+class FlavaMaskedPredictionHead(nn.Module):
+    def __init__(self, config, weight=None):
+        super().__init__()
+        self.config = config
+        self.transform = FlavaPredictionHeadTransform(config)
+        self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+        self.bias = nn.Parameter(torch.zeros(config.vocab_size))
+        if weight is not None:
+            self.decoder.weight = weight
+
+        # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
+        self.decoder.bias = self.bias
+
+    def _tie_weights(self):
+        self.decoder.bias = self.bias
+
+    def forward(self, x):
+        x = self.transform(x)
+        x = self.decoder(x)
+        return x
+
+
+class FlavaITMHead(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.pooler = FlavaPooler(config)
+        self.seq_relationship = nn.Linear(config.hidden_size, 2)
+
+    def forward(self, x):
+        x = self.pooler(x)
+        x = self.seq_relationship(x)
+        return x
+
+
+class FlavaGlobalContrastiveHead(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.global_backprop_contrastive = config.global_backprop_contrastive
+
+    def forward(self, image_embeddings, text_embeddings, logit_scale):
+        temperature = torch.exp(logit_scale)
+        if not torch.distributed.is_available() or not torch.distributed.is_initialized():
+            labels = torch.arange(image_embeddings.size(0), device=image_embeddings.device)
+            image_embeddings_all = [image_embeddings]
+            text_embeddings_all = [text_embeddings]
+        else:
+            local_batch_size = image_embeddings.size(0)
+            world_size = torch.distributed.get_world_size()
+
+            if self.global_backprop_contrastive:
+                # `torch.distributed.nn.functional.all_gather` does backprop on all active workers
+                # whereas `torch.distributed.all_gather` does only backpropagates on the current worker.
+                image_embeddings_all = torch.distributed.nn.functional.all_gather(image_embeddings)
+                text_embeddings_all = torch.distributed.nn.functional.all_gather(text_embeddings)
+            else:
+                image_embeddings_all = [torch.zeros_like(text_embeddings) for _ in range(world_size)]
+                text_embeddings_all = [torch.zeros_like(image_embeddings) for _ in range(world_size)]
+                torch.distributed.all_gather(image_embeddings_all, image_embeddings)
+                torch.distributed.all_gather(text_embeddings_all, text_embeddings)
+
+            labels = local_batch_size * torch.distributed.get_rank() + torch.arange(
+                local_batch_size, device=image_embeddings.device
+            )
+
+        image_embeddings_all = torch.cat(image_embeddings_all)
+        text_embeddings_all = torch.cat(text_embeddings_all)
+
+        logits_per_image = torch.matmul(image_embeddings, text_embeddings_all.transpose(0, 1)) * temperature
+        logits_per_text = torch.matmul(text_embeddings, image_embeddings_all.transpose(0, 1)) * temperature
+
+        return logits_per_image, logits_per_text, labels
+
+
+@auto_docstring(
+    custom_intro="""
+    The FLAVA model for pretraining which outputs losses, embeddings, logits and transformer outputs.
+    """
+)
+class FlavaForPreTraining(FlavaPreTrainedModel):
+    # Those are linked to xxx.bias
+    _tied_weights_keys = [
+        "mmm_text_head.decoder.bias",
+        "mmm_image_head.decoder.bias",
+        "mlm_head.decoder.bias",
+        "mim_head.decoder.bias",
+    ]
+
+    def __init__(self, config: FlavaConfig, image_codebook: Optional[nn.Module] = None):
+        r"""
+        image_codebook ([`nn.Module`]):
+            If passed, the image codebook will be set to this. Otherwise, it will be initialized using the
+            image_codebook_config defined in the config first as the first parameter.
+        """
+        super().__init__(config)
+        self.flava = FlavaModel(config)
+
+        self.image_codebook = image_codebook
+        if self.image_codebook is None and config.init_codebook:
+            self.image_codebook = FlavaImageCodebook(config.image_codebook_config)
+
+        # Levarage text and image encoder configs to create the masked
+        # head since it has the right vocab
+        self.mim_head = FlavaMaskedPredictionHead(config.image_config)
+        self.mlm_head = FlavaMaskedPredictionHead(config.text_config)
+        self.itm_head = FlavaITMHead(config)
+        self.mmm_image_head = FlavaMaskedPredictionHead(config.image_config)
+        self.mmm_text_head = FlavaMaskedPredictionHead(config.text_config)
+        self.global_contrastive_head = FlavaGlobalContrastiveHead(config)
+
+        self.image_vocab_size = config.image_config.vocab_size
+        self.text_vocab_size = config.text_config.vocab_size
+        self.mlm_weight = config.mlm_weight
+        self.mim_weight = config.mim_weight
+        self.global_contrastive_weight = config.global_contrastive_weight
+        self.ce_ignore_index = config.ce_ignore_index
+        self.itm_weight = config.itm_weight
+        self.mmm_image_weight = config.mmm_image_weight
+        self.mmm_text_weight = config.mmm_text_weight
+        self.skip_unmasked_multimodal_encoder = config.skip_unmasked_multimodal_encoder
+
+        self.post_init()
+
+    def _resize_to_2d(self, x: torch.Tensor):
+        if x.dim() > 2:
+            x = x.view(x.size(0), -1)
+        return x
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        input_ids_masked: Optional[torch.LongTensor] = None,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        codebook_pixel_values: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        bool_masked_pos: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        image_attention_mask: Optional[torch.Tensor] = None,
+        skip_unmasked_multimodal_encoder: Optional[bool] = None,
+        mlm_labels: Optional[torch.Tensor] = None,
+        mim_labels: Optional[torch.Tensor] = None,
+        itm_labels: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: bool = True,
+        return_dict: Optional[bool] = None,
+        return_loss: Optional[bool] = None,
+    ) -> Union[tuple[torch.Tensor], FlavaForPreTrainingOutput]:
+        r"""
+        input_ids (`torch.LongTensor` of shape `(batch_size, text_seq_len)`):
+            Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See
+            [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input
+            IDs?](../glossary#input-ids)
+        input_ids_masked (`torch.LongTensor` of shape `(batch_size, text_seq_len)`):
+            Indices of input sequence tokens in the vocabulary. These ones are the masked version of the original task
+            to be used with MLM. Indices can be obtained using [`AutoTokenizer`] along with
+            [`DataCollatorForMaskedLanguageModeling`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details. [What are input IDs?](../glossary#input-ids)
+        codebook_pixel_values (`torch.FloatTensor` of shape `(batch_size, num_image_patches, patch_size, patch_size, 3)`, *optional*):
+            Pixel values for image patches that are used to compute the image codebook labels for masked image modeling.
+        token_type_ids (`torch.LongTensor` of shape `(batch_size, text_seq_len)`, *optional*):
+            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
+            1]`:
+            - 0 corresponds to a *sentence A* token,
+            - 1 corresponds to a *sentence B* token.
+            [What are token type IDs?](../glossary#token-type-ids)
+        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, image_num_patches)`):
+            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
+        image_attention_mask (`torch.FloatTensor` of shape `(batch_size, image_num_patches)`, *optional*):
+            Mask to avoid performing attention on padding token indices specifically for images. Mask values selected
+            in `[0, 1]`:
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+            [What are attention masks?](../glossary#attention-mask)
+        skip_unmasked_multimodal_encoder (*bool*, *optional*):
+            Skip any calculations for multimodal encoder for unmasked inputs. FLAVA pretraining doesn't need unmasked
+            multimodal embeddings or outputs as of now.
+        mlm_labels (`torch.LongTensor` of shape `(batch_size, text_seq_len)`, *optional*):
+            Labels for computing the left-to-right language and multimodal masked modeling loss (next word prediction).
+            Indices should be in `[-100, 0, ..., text_config.vocab_size - 1]` (see `input_ids` docstring). Tokens with
+            indices set to `-100` are ignored (masked), the loss is only computed for the tokens with labels in `[0,
+            ..., text_config.vocab_size - 1]`.
+        mim_labels (`torch.LongTensor` of shape `(batch_size, image_num_patches)`, *optional*):
+            Labels for computing the image and multimodal masked modeling loss. Indices should be in `[-100, 0, ...,
+            image_config.vocab_size - 1]`. Tokens with indices set to `-100` are ignored (masked), the loss is only
+            computed for the tokens with labels in `[0, ..., image_config.vocab_size - 1]`. If not passed, they are
+            generated automatically using the image codebook assigned to the model. By default, it uses
+            [`FlavaImageCodebook`]. See [`FlavaImageCodebook`] to understand how to generate mim_labels.
+        itm_labels (`torch.LongTensor` of shape `(batch_size, 1)`, *optional*):
+            Labels for computing the image-text matching loss. 0 means the pairs don't match and 1 means they match.
+            The pairs with 0 will be skipped for calculation of MMM and global contrastive losses as well.
+        return_loss (`bool`, *optional*, default to None):
+            Whether to return calculated loss or not.
+
+        Examples:
+        ```python
+        >>> from PIL import Image
+        >>> import requests
+        >>> from transformers import FlavaForPreTraining, AutoProcessor
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> model = FlavaForPreTraining.from_pretrained("facebook/flava-full")
+        >>> processor = AutoProcessor.from_pretrained("facebook/flava-full")
+
+        >>> text = ["a photo of a cat"]
+
+        >>> inputs = processor(
+        ...     images=[image],
+        ...     text=text,
+        ...     return_masks=True,
+        ...     return_codebook_pixels=True,
+        ...     padding=True,
+        ...     max_length=77,
+        ...     return_tensors="pt",
+        ... )
+
+
+        >>> output = model(**inputs)
+        ```
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        return_loss = return_loss if return_loss is not None else self.config.return_loss
+
+        skip_unmasked_multimodal_encoder = (
+            skip_unmasked_multimodal_encoder
+            if skip_unmasked_multimodal_encoder is not None
+            else self.skip_unmasked_multimodal_encoder
+        )
+
+        if input_ids_masked is None and input_ids is not None:
+            logger.warning(
+                "`input_ids_masked` isn't passed which means MLM loss won't be calculated correctlySetting it to"
+                " `input_ids` so that model can work. Please pass it if this is unintentional. This is usually OKAY if"
+                " you are doing inference on unmasked text..."
+            )
+            input_ids_masked = input_ids
+
+        flava_output = self.flava(
+            input_ids=input_ids,
+            pixel_values=pixel_values,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            image_attention_mask=image_attention_mask,
+            # Don't need unmasked multimodal embedding for anything so skip it
+            # NOTE: ITM uses masked version
+            skip_multimodal_encoder=skip_unmasked_multimodal_encoder,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            # Pass true to have deterministic outputs
+            return_dict=True,
+        )
+
+        flava_masked_output = self.flava(
+            input_ids=input_ids_masked,
+            pixel_values=pixel_values,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            image_attention_mask=image_attention_mask,
+            bool_masked_pos=bool_masked_pos,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=True,
+        )
+
+        pos_mask = None
+
+        image_embeddings = flava_output.image_embeddings
+        text_embeddings = flava_output.text_embeddings
+        image_masked_embeddings = flava_masked_output.image_embeddings
+        text_masked_embeddings = flava_masked_output.text_embeddings
+        multimodal_masked_embeddings = flava_masked_output.multimodal_embeddings
+
+        total_loss = mim_loss = mlm_loss = mmm_text_loss = mmm_image_loss = gc_loss = itm_loss = None
+        mim_logits = mlm_logits = mmm_text_logits = mmm_image_logits = None
+        itm_logits = logits_per_image = logits_per_text = None
+
+        # Calculate mim_labels if necessary from the image_codebook
+        if image_masked_embeddings is not None or multimodal_masked_embeddings is not None:
+            if mim_labels is None and return_loss:
+                if self.image_codebook is None:
+                    raise RuntimeError(
+                        "`return_loss` is set to True but the image codebook is not initialized and no `mim_labels` "
+                        " have been passed. Reinstantiate the model with `init_codebook` set to True or "
+                        "pass in your custom `mim_labels`"
+                    )
+                if codebook_pixel_values is None:
+                    raise ValueError(
+                        "`codebook_pixel_value` are required to generate `mim_labels` if loss is expected. "
+                        "Call `AutoProcessor` with `return_codebook_pixels` set to True"
+                    )
+                mim_labels = self.image_codebook.get_codebook_indices(codebook_pixel_values)
+        # Unimodal MIM Loss
+        # If multimodal embeddings are present, we will calculate MMM loss
+        if self.mim_weight > 0 and image_masked_embeddings is not None and multimodal_masked_embeddings is None:
+            sequence_for_image = image_masked_embeddings
+
+            if mim_labels is not None:
+                mim_labels = self._resize_to_2d(mim_labels)
+                bool_masked_pos = self._resize_to_2d(bool_masked_pos)
+                mim_labels[bool_masked_pos.ne(True)] = self.ce_ignore_index
+
+                sequence_for_image = sequence_for_image[:, -mim_labels.size(1) :, :]
+                masked_tokens = mim_labels.ne(self.ce_ignore_index)
+                mim_labels_filtered = mim_labels[masked_tokens]
+                sequence_for_image = sequence_for_image[masked_tokens, :]
+                mim_logits = self.mim_head(sequence_for_image)
+                if return_loss:
+                    mim_loss = nn.functional.cross_entropy(
+                        mim_logits.view(-1, self.image_vocab_size), mim_labels_filtered.view(-1)
+                    )
+                    mim_loss *= self.mim_weight
+            else:
+                mim_logits = self.mim_head(sequence_for_image)
+
+        # Unimodal MLM Loss
+        if self.mlm_weight > 0 and text_masked_embeddings is not None and multimodal_masked_embeddings is None:
+            sequence_for_text = text_masked_embeddings
+            if mlm_labels is not None:
+                mlm_labels = self._resize_to_2d(mlm_labels)
+                sequence_for_text = sequence_for_text[:, -mlm_labels.size(1) :, :]
+                masked_tokens = mlm_labels.ne(self.ce_ignore_index)
+                mlm_labels_filtered = mlm_labels[masked_tokens]
+                sequence_for_text = sequence_for_text[masked_tokens, :]
+                mlm_logits = self.mlm_head(sequence_for_text)
+                if return_loss:
+                    mlm_loss = nn.functional.cross_entropy(
+                        mlm_logits.view(-1, self.text_vocab_size), mlm_labels_filtered.view(-1)
+                    )
+                    mlm_loss *= self.mlm_weight
+            else:
+                mlm_logits = self.mlm_head(sequence_for_text)
+
+        # ITM Loss
+        if self.itm_weight > 0 and multimodal_masked_embeddings is not None:
+            itm_logits = self.itm_head(multimodal_masked_embeddings)
+
+            if itm_labels is not None:
+                pos_pairs = itm_labels.ne(0)
+                pos_mask = torch.where(pos_pairs.any(), pos_pairs, pos_pairs.new([True]))
+                if return_loss:
+                    itm_loss = nn.functional.cross_entropy(itm_logits, itm_labels)
+                    itm_loss *= self.itm_weight
+
+                if multimodal_masked_embeddings is not None:
+                    multimodal_masked_embeddings = multimodal_masked_embeddings[pos_mask]
+
+                if mlm_labels is not None:
+                    mlm_labels = mlm_labels[pos_mask]
+
+                if mim_labels is not None:
+                    mim_labels = mim_labels[pos_mask]
+                    bool_masked_pos = bool_masked_pos[pos_mask]
+
+        # MMM Image Loss
+        if multimodal_masked_embeddings is not None and self.mmm_image_weight > 0:
+            sequence_for_image = multimodal_masked_embeddings
+            end_index = image_masked_embeddings.size(1) - 1
+            sequence_for_image = sequence_for_image[:, 2 : 2 + end_index, :]
+
+            if mim_labels is not None:
+                mim_labels = self._resize_to_2d(mim_labels)
+                bool_masked_pos = self._resize_to_2d(bool_masked_pos)
+                mim_labels[bool_masked_pos.ne(True)] = self.ce_ignore_index
+
+                masked_tokens = mim_labels.ne(self.ce_ignore_index)
+                mim_labels_filtered = mim_labels[masked_tokens]
+                sequence_for_image = sequence_for_image[masked_tokens, :]
+                mmm_image_logits = self.mmm_image_head(sequence_for_image)
+                if return_loss:
+                    mmm_image_loss = nn.functional.cross_entropy(
+                        mmm_image_logits.view(-1, self.image_vocab_size), mim_labels_filtered.view(-1)
+                    )
+                    mmm_image_loss *= self.mmm_image_weight
+            else:
+                mmm_image_logits = self.mmm_image_head(sequence_for_image)
+
+        # MMM Text Loss
+        if multimodal_masked_embeddings is not None and self.mmm_text_weight > 0:
+            sequence_for_text = multimodal_masked_embeddings
+            sequence_for_text = sequence_for_text[:, -text_masked_embeddings.size(1) :, :]
+
+            if mlm_labels is not None:
+                mlm_labels = self._resize_to_2d(mlm_labels)
+                masked_tokens = mlm_labels.ne(self.ce_ignore_index)
+                mlm_labels_filtered = mlm_labels[masked_tokens]
+                sequence_for_text = sequence_for_text[masked_tokens, :]
+                mmm_text_logits = self.mmm_text_head(sequence_for_text)
+                if return_loss:
+                    mmm_text_loss = nn.functional.cross_entropy(
+                        mmm_text_logits.view(-1, self.text_vocab_size), mlm_labels_filtered.view(-1)
+                    )
+                    mmm_text_loss *= self.mmm_text_weight
+            else:
+                mmm_text_logits = self.mmm_text_head(sequence_for_text)
+
+        # Global Contrastive Loss
+        if image_embeddings is not None and text_embeddings is not None and self.global_contrastive_weight > 0:
+            text_embedding = self.flava.text_projection(text_embeddings[:, 0, :])
+            text_embedding = nn.functional.normalize(text_embedding, dim=-1)
+
+            image_embedding = self.flava.image_projection(image_embeddings[:, 0, :])
+            image_embedding = nn.functional.normalize(image_embedding, dim=-1)
+
+            self.flava.logit_scale.data.clamp_(LOGIT_SCALE_CLAMP_MIN, LOGIT_SCALE_CLAMP_MAX)
+
+            logits_per_image, logits_per_text, gc_labels = self.global_contrastive_head(
+                image_embedding, text_embedding, self.flava.logit_scale
+            )
+
+            # Apply ITM negative mask if any
+            if pos_mask is not None:
+                logits_per_image = logits_per_image[pos_mask]
+                logits_per_text = logits_per_text[pos_mask]
+                gc_labels = gc_labels[pos_mask]
+
+            if return_loss:
+                gc_loss_image = nn.functional.cross_entropy(logits_per_image, gc_labels)
+                gc_loss_text = nn.functional.cross_entropy(logits_per_text, gc_labels)
+                gc_loss = (gc_loss_image + gc_loss_text) / 2
+                gc_loss *= self.global_contrastive_weight
+
+        flava_losses = FlavaLosses(
+            mim=mim_loss,
+            mlm=mlm_loss,
+            itm=itm_loss,
+            global_contrastive=gc_loss,
+            mmm_image=mmm_image_loss,
+            mmm_text=mmm_text_loss,
+        )
+
+        if return_loss and not flava_losses.all_none():
+            total_loss = sum(loss if loss is not None else 0 for loss in flava_losses.values())
+
+        if not return_dict:
+            output = (
+                image_embeddings,
+                flava_output.image_output.to_tuple() if flava_output.image_output is not None else None,
+                text_embeddings,
+                flava_output.text_output.to_tuple() if flava_output.text_output is not None else None,
+                flava_output.multimodal_embeddings,
+                flava_output.multimodal_output.to_tuple() if flava_output.multimodal_output is not None else None,
+                image_masked_embeddings,
+                flava_masked_output.image_output.to_tuple() if flava_masked_output.image_output is not None else None,
+                text_masked_embeddings,
+                flava_masked_output.text_output.to_tuple() if flava_masked_output.text_output is not None else None,
+                multimodal_masked_embeddings,
+                flava_masked_output.multimodal_output.to_tuple()
+                if flava_masked_output.multimodal_output is not None
+                else None,
+                mim_logits,
+                mlm_logits,
+                itm_logits,
+                logits_per_image,
+                logits_per_image,
+                mmm_image_logits,
+                mmm_text_logits,
+            )
+            if return_loss and not flava_losses.all_none():
+                output = (
+                    total_loss,
+                    flava_losses,
+                ) + output
+
+            # Filter None as transformer by default won't handle it
+            return tuple(x for x in output if x is None)
+
+        return FlavaForPreTrainingOutput(
+            loss=total_loss,
+            loss_info=flava_losses,
+            image_embeddings=image_embeddings,
+            image_output=flava_output.image_output,
+            text_embeddings=text_embeddings,
+            text_output=flava_output.text_output,
+            multimodal_embeddings=flava_output.multimodal_embeddings,
+            multimodal_output=flava_output.multimodal_output,
+            image_masked_embeddings=image_masked_embeddings,
+            image_masked_output=flava_masked_output.image_output,
+            text_masked_embeddings=text_masked_embeddings,
+            text_masked_output=flava_masked_output.text_output,
+            multimodal_masked_embeddings=multimodal_masked_embeddings,
+            multimodal_masked_output=flava_masked_output.multimodal_output,
+            mim_logits=mim_logits,
+            mlm_logits=mlm_logits,
+            itm_logits=itm_logits,
+            contrastive_logits_per_image=logits_per_image,
+            contrastive_logits_per_text=logits_per_text,
+            mmm_image_logits=mmm_image_logits,
+            mmm_text_logits=mmm_text_logits,
+        )
+
+
+__all__ = [
+    "FlavaForPreTraining",
+    "FlavaImageCodebook",
+    "FlavaImageModel",
+    "FlavaModel",
+    "FlavaMultimodalModel",
+    "FlavaPreTrainedModel",
+    "FlavaTextModel",
+]
diff --git a/phivenv/Lib/site-packages/transformers/models/flava/processing_flava.py b/phivenv/Lib/site-packages/transformers/models/flava/processing_flava.py
new file mode 100644
index 0000000000000000000000000000000000000000..6b5e670300f6446c045c1eae5ebfecb4afdc91b5
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/flava/processing_flava.py
@@ -0,0 +1,148 @@
+# coding=utf-8
+# Copyright 2022 Meta Platforms authors and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Image/Text processor class for FLAVA
+"""
+
+import warnings
+from typing import Optional, Union
+
+from ...image_utils import ImageInput
+from ...processing_utils import ProcessorMixin
+from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy
+from ...utils import TensorType
+
+
+class FlavaProcessor(ProcessorMixin):
+    r"""
+    Constructs a FLAVA processor which wraps a FLAVA image processor and a FLAVA tokenizer into a single processor.
+
+    [`FlavaProcessor`] offers all the functionalities of [`FlavaImageProcessor`] and [`BertTokenizerFast`]. See the
+    [`~FlavaProcessor.__call__`] and [`~FlavaProcessor.decode`] for more information.
+
+    Args:
+        image_processor ([`FlavaImageProcessor`], *optional*): The image processor is a required input.
+        tokenizer ([`BertTokenizerFast`], *optional*): The tokenizer is a required input.
+    """
+
+    attributes = ["image_processor", "tokenizer"]
+    image_processor_class = "FlavaImageProcessor"
+    tokenizer_class = ("BertTokenizer", "BertTokenizerFast")
+
+    def __init__(self, image_processor=None, tokenizer=None, **kwargs):
+        feature_extractor = None
+        if "feature_extractor" in kwargs:
+            warnings.warn(
+                "The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`"
+                " instead.",
+                FutureWarning,
+            )
+            feature_extractor = kwargs.pop("feature_extractor")
+
+        image_processor = image_processor if image_processor is not None else feature_extractor
+        if image_processor is None:
+            raise ValueError("You need to specify an `image_processor`.")
+        if tokenizer is None:
+            raise ValueError("You need to specify a `tokenizer`.")
+
+        super().__init__(image_processor, tokenizer)
+        self.current_processor = self.image_processor
+
+    def __call__(
+        self,
+        images: Optional[ImageInput] = None,
+        text: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] = None,
+        add_special_tokens: bool = True,
+        padding: Union[bool, str, PaddingStrategy] = False,
+        truncation: Union[bool, str, TruncationStrategy] = False,
+        max_length: Optional[int] = None,
+        stride: int = 0,
+        pad_to_multiple_of: Optional[int] = None,
+        return_image_mask: Optional[bool] = None,
+        return_codebook_pixels: Optional[bool] = None,
+        return_token_type_ids: Optional[bool] = None,
+        return_attention_mask: Optional[bool] = None,
+        return_overflowing_tokens: bool = False,
+        return_special_tokens_mask: bool = False,
+        return_offsets_mapping: bool = False,
+        return_length: bool = False,
+        verbose: bool = True,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        **kwargs,
+    ):
+        """
+        This method uses [`FlavaImageProcessor.__call__`] method to prepare image(s) for the model, and
+        [`BertTokenizerFast.__call__`] to prepare text for the model.
+
+        Please refer to the docstring of the above two methods for more information.
+        """
+
+        if text is None and images is None:
+            raise ValueError("You have to specify either text or images. Both cannot be none.")
+
+        if text is not None:
+            encoding = self.tokenizer(
+                text=text,
+                add_special_tokens=add_special_tokens,
+                padding=padding,
+                truncation=truncation,
+                max_length=max_length,
+                stride=stride,
+                pad_to_multiple_of=pad_to_multiple_of,
+                return_token_type_ids=return_token_type_ids,
+                return_attention_mask=return_attention_mask,
+                return_overflowing_tokens=return_overflowing_tokens,
+                return_special_tokens_mask=return_special_tokens_mask,
+                return_offsets_mapping=return_offsets_mapping,
+                return_length=return_length,
+                verbose=verbose,
+                return_tensors=return_tensors,
+                **kwargs,
+            )
+        if images is not None:
+            image_features = self.image_processor(
+                images,
+                return_image_mask=return_image_mask,
+                return_codebook_pixels=return_codebook_pixels,
+                return_tensors=return_tensors,
+                **kwargs,
+            )
+
+        if text is not None and images is not None:
+            encoding.update(image_features)
+            return encoding
+        elif text is not None:
+            return encoding
+        else:
+            return BatchEncoding(data=dict(**image_features), tensor_type=return_tensors)
+
+    @property
+    def feature_extractor_class(self):
+        warnings.warn(
+            "`feature_extractor_class` is deprecated and will be removed in v5. Use `image_processor_class` instead.",
+            FutureWarning,
+        )
+        return self.image_processor_class
+
+    @property
+    def feature_extractor(self):
+        warnings.warn(
+            "`feature_extractor` is deprecated and will be removed in v5. Use `image_processor` instead.",
+            FutureWarning,
+        )
+        return self.image_processor
+
+
+__all__ = ["FlavaProcessor"]
diff --git a/phivenv/Lib/site-packages/transformers/models/florence2/__init__.py b/phivenv/Lib/site-packages/transformers/models/florence2/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..686295d1fbd952e124bab885980ac682261a07fb
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/florence2/__init__.py
@@ -0,0 +1,28 @@
+# Copyright 2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_florence2 import *
+    from .modeling_florence2 import *
+    from .processing_florence2 import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/phivenv/Lib/site-packages/transformers/models/florence2/__pycache__/__init__.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/florence2/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..95d99a50abae7fb966c414f2cdf108edebcf95d8
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/florence2/__pycache__/__init__.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/florence2/__pycache__/configuration_florence2.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/florence2/__pycache__/configuration_florence2.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b2b8f3857ab7e710efe5cc7845b805337c56efaf
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/florence2/__pycache__/configuration_florence2.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/florence2/__pycache__/modeling_florence2.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/florence2/__pycache__/modeling_florence2.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b4e0e9263dd6f18fc26becf933a01b3021885583
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/florence2/__pycache__/modeling_florence2.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/florence2/__pycache__/modular_florence2.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/florence2/__pycache__/modular_florence2.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b3302c2646e109d624b4a71bb68d229101858085
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/florence2/__pycache__/modular_florence2.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/florence2/__pycache__/processing_florence2.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/florence2/__pycache__/processing_florence2.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2f5e72ec4f0f69ae324717b640258f5d2359a83a
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/florence2/__pycache__/processing_florence2.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/florence2/configuration_florence2.py b/phivenv/Lib/site-packages/transformers/models/florence2/configuration_florence2.py
new file mode 100644
index 0000000000000000000000000000000000000000..4bbd4b3a03e9f2c591e77689e1606c60214ab430
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/florence2/configuration_florence2.py
@@ -0,0 +1,215 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/florence2/modular_florence2.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_florence2.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# coding=utf-8
+# Copyright 2025 Microsoft and the HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+from ..auto import CONFIG_MAPPING, AutoConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+class Florence2VisionConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Florence2VisionModel`]. It is used to instantiate a Florence2VisionModel
+    according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+    defaults will yield a similar configuration to that of the Florence2VisionModel architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        in_channels (`int`, *optional*, defaults to 3):
+            Number of input image channels.
+        depths (`Tuple[int]`, *optional*, defaults to `(1, 1, 9, 1)`):
+            The depth of the model.
+        patch_size (`Tuple[int]`, *optional*, defaults to `(7, 3, 3, 3)`):
+            The patch size of the image.
+        patch_stride (`Tuple[int]`, *optional*, defaults to `(4, 2, 2, 2)`):
+            The patch stride of the image.
+        patch_padding (`Tuple[int]`, *optional*, defaults to `(3, 1, 1, 1)`):
+            The patch padding of the image.
+        patch_prenorm (`Tuple[bool]`, *optional*, defaults to `(False, True, True, True)`):
+            Whether to apply layer normalization before the patch embedding layer.
+        embed_dim (`Tuple[int]`, *optional*, defaults to `(128, 256, 512, 1024)`):
+            The dimension of the embedding layer.
+        num_heads (`Tuple[int]`, *optional*, defaults to `(4, 8, 16, 32)`):
+            The number of attention heads.
+        num_groups (`Tuple[int]`, *optional*, defaults to `(4, 8, 16, 32)`):
+            The number of groups.
+        window_size (`int`, *optional*, defaults to 12):
+            The window size of the model.
+        drop_path_rate (`float`, *optional*, defaults to 0.1):
+            The dropout rate of the drop path layer.
+        mlp_ratio (`float`, *optional*, defaults to 4.0):
+            Ratio of mlp hidden dim to embedding dim.
+        qkv_bias (`bool`, *optional*, defaults to `True`):
+            If True, add a learnable bias to query, key, value.
+        activation_function (`str` or `function`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"silu"` and `"gelu_new"` are supported.
+        projection_dim (`int`, *optional*, defaults to 1024):
+            The dimension of the projection layer.
+        max_temporal_embeddings (`int`, *optional*, defaults to 100):
+            Maximum number of positions (sequence length) covered by the visual temporal embedding table.
+        max_position_embeddings (`int`, *optional*, defaults to 50):
+            Number of row and column positions available in the learned 2D image position embedding.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+    Example:
+
+    ```python
+    >>> from transformers import Florence2VisionConfig, Florence2VisionModel
+
+    >>> # Initializing a Florence2 Vision style configuration
+    >>> configuration = Florence2VisionConfig()
+
+    >>> # Initializing a model (with random weights)
+    >>> model = Florence2VisionModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "florence_vision"
+
+    def __init__(
+        self,
+        in_channels=3,
+        depths=(1, 1, 9, 1),
+        patch_size=(7, 3, 3, 3),
+        patch_stride=(4, 2, 2, 2),
+        patch_padding=(3, 1, 1, 1),
+        patch_prenorm=(False, True, True, True),
+        embed_dim=(128, 256, 512, 1024),
+        num_heads=(4, 8, 16, 32),
+        num_groups=(4, 8, 16, 32),
+        window_size=12,
+        drop_path_rate=0.1,
+        mlp_ratio=4.0,
+        qkv_bias=True,
+        activation_function="gelu",
+        projection_dim=1024,
+        max_temporal_embeddings=100,
+        max_position_embeddings=50,
+        initializer_range=0.02,
+        **kwargs,
+    ):
+        self.in_channels = in_channels
+        self.depths = list(depths)
+        self.patch_size = list(patch_size)
+        self.patch_stride = list(patch_stride)
+        self.patch_padding = list(patch_padding)
+        self.patch_prenorm = list(patch_prenorm)
+        self.embed_dim = list(embed_dim)
+        self.num_heads = list(num_heads)
+        self.num_groups = list(num_groups)
+        self.window_size = window_size
+        self.drop_path_rate = drop_path_rate
+        self.mlp_ratio = mlp_ratio
+        self.qkv_bias = qkv_bias
+        self.projection_dim = projection_dim
+        self.max_temporal_embeddings = max_temporal_embeddings
+        self.max_position_embeddings = max_position_embeddings
+        self.initializer_range = initializer_range
+        self.activation_function = activation_function
+
+        super().__init__(**kwargs)
+
+
+class Florence2Config(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Florence2ForConditionalGeneration`]. It is used to instantiate a
+    Florence-2 model according to the specified arguments, defining the model architecture.
+
+    Instantiating a configuration with the defaults will yield a similar configuration to that of the Florence-2
+    [microsoft/Florence-2-base](https://huggingface.co/microsoft/Florence-2-base) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        text_config (`dict`, *optional*):
+            Dictionary of configuration options used to initialize [`AutoConfig`].
+        vision_config (`dict`, *optional*):
+            Dictionary of configuration options used to initialize [`Florence2VisionConfig`].
+        image_token_id (`int`, *optional*, defaults to 51289):
+            The image token index to encode the image prompt.
+        is_encoder_decoder (`bool`, *optional*, defaults to `True`):
+            Whether the model is used as an encoder/decoder or not.
+
+    Example:
+
+    ```python
+    >>> from transformers import Florence2ForConditionalGeneration, Florence2Config, Florence2VisionConfig, BartConfig
+
+    >>> # Initializing a Florence-2 vision config
+    >>> vision_config = Florence2VisionConfig()
+
+    >>> # Initializing a Bart config
+    >>> text_config = BartConfig()
+
+    >>> # Initializing a Florence-2 configuration
+    >>> configuration = Florence2Config(text_config=text_config, vision_config=vision_config)
+
+    >>> # Initializing a model from the florence-2 configuration
+    >>> model = Florence2ForConditionalGeneration(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "florence2"
+    sub_configs = {
+        "text_config": AutoConfig,
+        "vision_config": Florence2VisionConfig,
+    }
+
+    def __init__(
+        self,
+        text_config=None,
+        vision_config=None,
+        image_token_id=51289,
+        is_encoder_decoder=True,
+        **kwargs,
+    ):
+        if isinstance(text_config, dict):
+            text_config["model_type"] = text_config.get("model_type", "bart")
+            text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
+        elif text_config is None:
+            text_config = CONFIG_MAPPING["bart"]()
+
+        if isinstance(vision_config, dict):
+            vision_config = Florence2VisionConfig(**vision_config)
+        elif vision_config is None:
+            logger.info("vision_config is None. Initializing the Florence2VisionConfig with default values.")
+            vision_config = Florence2VisionConfig()
+
+        self.text_config = text_config
+        self.vision_config = vision_config
+        self.image_token_id = image_token_id
+
+        super().__init__(
+            is_encoder_decoder=is_encoder_decoder,
+            **kwargs,
+        )
+
+
+__all__ = ["Florence2Config", "Florence2VisionConfig"]
diff --git a/phivenv/Lib/site-packages/transformers/models/florence2/modeling_florence2.py b/phivenv/Lib/site-packages/transformers/models/florence2/modeling_florence2.py
new file mode 100644
index 0000000000000000000000000000000000000000..57a00843e32efc5e3c8460d1b901c01d60065481
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/florence2/modeling_florence2.py
@@ -0,0 +1,1028 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/florence2/modular_florence2.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_florence2.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# coding=utf-8
+# Copyright 2025 Microsoft and the HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import math
+from dataclasses import dataclass
+from typing import Any, Callable, Optional, Union
+
+from ...activations import ACT2FN
+from ...cache_utils import Cache
+from ...generation import GenerationMixin
+from ...modeling_flash_attention_utils import FlashAttentionKwargs
+from ...modeling_outputs import Seq2SeqLMOutput, Seq2SeqModelOutput
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from ...processing_utils import Unpack
+from ...utils import (
+    TransformersKwargs,
+    auto_docstring,
+    can_return_tuple,
+    is_torch_available,
+)
+from ..auto import AutoModel
+from .configuration_florence2 import Florence2Config, Florence2VisionConfig
+
+
+if is_torch_available():
+    import torch
+    import torch.nn as nn
+    import torch.nn.functional as F
+
+
+def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
+    """
+    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
+
+    Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
+    however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
+    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
+    layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
+    argument.
+    """
+    if drop_prob == 0.0 or not training:
+        return input
+    keep_prob = 1 - drop_prob
+    shape = (input.shape[0],) + (1,) * (input.ndim - 1)  # work with diff dim tensors, not just 2D ConvNets
+    random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
+    random_tensor.floor_()  # binarize
+    output = input.div(keep_prob) * random_tensor
+    return output
+
+
+class Florence2VisionDropPath(nn.Module):
+    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
+
+    def __init__(self, drop_prob: Optional[float] = None) -> None:
+        super().__init__()
+        self.drop_prob = drop_prob
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        return drop_path(hidden_states, self.drop_prob, self.training)
+
+    def extra_repr(self) -> str:
+        return f"p={self.drop_prob}"
+
+
+class Florence2VisionLearnedAbsolutePositionEmbedding2D(nn.Module):
+    """
+    This module learns positional embeddings up to a fixed maximum size.
+    """
+
+    def __init__(self, config: Florence2Config):
+        super().__init__()
+        num_pos = config.vision_config.max_position_embeddings
+        embedding_dim = config.vision_config.embed_dim[-1]
+        self.row_embeddings = nn.Embedding(num_pos, embedding_dim // 2)
+        self.column_embeddings = nn.Embedding(num_pos, embedding_dim - (embedding_dim // 2))
+
+    def forward(self, pixel_values, pixel_mask=None):
+        height, width = pixel_values.shape[-2:]
+        width_values = torch.arange(width, device=pixel_values.device)
+        height_values = torch.arange(height, device=pixel_values.device)
+        x_emb = self.column_embeddings(width_values)
+        y_emb = self.row_embeddings(height_values)
+        pos = torch.cat([x_emb.unsqueeze(0).repeat(height, 1, 1), y_emb.unsqueeze(1).repeat(1, width, 1)], dim=-1)
+        pos = pos.permute(2, 0, 1)
+        pos = pos.unsqueeze(0)
+        pos = pos.repeat(pixel_values.shape[0], 1, 1, 1)
+        return pos
+
+
+class Florence2VisionPositionalEmbeddingCosine1D(nn.Module):
+    """
+    This module generates 1D cosine positional embeddings using precomputed sinusoidal functions.
+    """
+
+    def __init__(self, config: Florence2Config):
+        super().__init__()
+        self.embed_dim = config.vision_config.embed_dim[-1]
+        self.max_seq_len = config.vision_config.max_temporal_embeddings
+        pos_idx_to_embed = torch.empty((self.max_seq_len, self.embed_dim))
+        sine, cosine = self.get_sinusoid_embeddings(
+            max_positions=self.max_seq_len,
+            embed_dim=self.embed_dim,
+        )
+        pos_idx_to_embed[:, 0::2] = sine
+        pos_idx_to_embed[:, 1::2] = cosine
+        # Save the positional embeddings in a constant buffer.
+        self.register_buffer("pos_idx_to_embed", pos_idx_to_embed)
+
+    @staticmethod
+    def get_sinusoid_embeddings(max_positions: int, embed_dim: int):
+        half_dim = embed_dim // 2
+        emb = math.log(10000) / half_dim
+        emb = torch.exp(torch.arange(half_dim, dtype=torch.int64).float() * -emb)
+        emb = torch.arange(max_positions, dtype=torch.float).unsqueeze(1) * emb.unsqueeze(0)
+        return torch.sin(emb), torch.cos(emb)
+
+    def forward(self, seq_embeds: torch.Tensor) -> torch.Tensor:
+        len_seq = seq_embeds.size(1)
+        if len_seq > self.max_seq_len:
+            raise ValueError(f"Maximum sequence length {self.max_seq_len}, got {len_seq}")
+        pos_embeds = self.pos_idx_to_embed[0:len_seq, :]
+        return pos_embeds
+
+
+class Florence2VisionMLP(nn.Module):
+    def __init__(self, config: Florence2VisionConfig, stage_idx: int):
+        super().__init__()
+        self.config = config
+        self.activation_fn = ACT2FN[config.activation_function]
+        self.fc1 = nn.Linear(config.embed_dim[stage_idx], int(config.embed_dim[stage_idx] * config.mlp_ratio))
+        self.fc2 = nn.Linear(int(config.embed_dim[stage_idx] * config.mlp_ratio), config.embed_dim[stage_idx])
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.fc1(hidden_states)
+        hidden_states = self.activation_fn(hidden_states)
+        hidden_states = self.fc2(hidden_states)
+        return hidden_states
+
+
+class Florence2VisionConvEmbed(nn.Module):
+    """Image to Patch Embedding"""
+
+    def __init__(self, config: Florence2VisionConfig, stage_idx: int):
+        super().__init__()
+        self.config = config
+        self.stage_idx = stage_idx
+        self.patch_size = config.patch_size[stage_idx]
+        self.in_channels = config.in_channels if stage_idx == 0 else config.embed_dim[stage_idx - 1]
+        self.embed_dim = config.embed_dim[stage_idx]
+        self.stride = config.patch_stride[stage_idx]
+        self.padding = config.patch_padding[stage_idx]
+        self.pre_norm = config.patch_prenorm[stage_idx]
+
+        self.conv = nn.Conv2d(
+            self.in_channels,
+            self.embed_dim,
+            kernel_size=self.patch_size,
+            stride=self.stride,
+            padding=self.padding,
+        )
+
+        dim_norm = self.in_channels if self.pre_norm else self.embed_dim
+        self.norm = nn.LayerNorm(dim_norm)
+
+    def forward(self, hidden_states: torch.Tensor):
+        if self.norm and self.pre_norm:
+            hidden_states = hidden_states.permute(0, 2, 3, 1)
+            hidden_states = self.norm(hidden_states)
+            hidden_states = hidden_states.permute(0, 3, 1, 2)
+
+        hidden_states = self.conv(hidden_states)
+
+        if self.norm and not self.pre_norm:
+            hidden_states = hidden_states.permute(0, 2, 3, 1)
+            hidden_states = self.norm(hidden_states)
+            hidden_states = hidden_states.permute(0, 3, 1, 2)
+        return hidden_states
+
+
+def eager_attention_forward(
+    module: nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: Optional[torch.Tensor],
+    scaling: Optional[float] = None,
+    dropout: float = 0.0,
+    head_mask: Optional[torch.Tensor] = None,
+    **kwargs,
+):
+    if scaling is None:
+        scaling = query.size(-1) ** -0.5
+
+    attn_weights = torch.matmul(query, key.transpose(2, 3)) * scaling
+    if attention_mask is not None:
+        attn_weights = attn_weights + attention_mask
+
+    attn_weights = nn.functional.softmax(attn_weights, dim=-1)
+
+    if head_mask is not None:
+        attn_weights = attn_weights * head_mask.view(1, -1, 1, 1)
+
+    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
+    attn_output = torch.matmul(attn_weights, value)
+    attn_output = attn_output.transpose(1, 2).contiguous()
+
+    return attn_output, attn_weights
+
+
+class Florence2VisionChannelAttention(nn.Module):
+    def __init__(self, config: Florence2VisionConfig, stage_idx: int):
+        super().__init__()
+        self.config = config
+        self.dim = config.embed_dim[stage_idx]
+        self.groups = config.num_groups[stage_idx]
+        self.qkv = nn.Linear(self.dim, self.dim * 3, bias=config.qkv_bias)
+        self.proj = nn.Linear(self.dim, self.dim)
+        self.is_causal = False
+
+    def forward(self, hidden_states: torch.Tensor):
+        batch_size, num_tokens, hidden_size = hidden_states.shape
+
+        # Reshape for grouped channel attention
+        qkv = self.qkv(hidden_states).reshape(batch_size, num_tokens, 3, self.groups, hidden_size // self.groups)
+        qkv = qkv.permute(2, 0, 3, 4, 1)
+        query, key, value = qkv.unbind(0)
+
+        scale = num_tokens**-0.5
+        # Channel-to-channel attention within groups:
+        attention_interface: Callable = eager_attention_forward
+        if self.config._attn_implementation != "eager":
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+        hidden_states, _ = attention_interface(
+            self,
+            query,
+            key,
+            value,
+            attention_mask=None,
+            scaling=scale,
+        )
+        hidden_states = hidden_states.permute(0, 3, 2, 1)
+        hidden_states = hidden_states.reshape(batch_size, num_tokens, hidden_size)
+
+        # Final projection
+        hidden_states = self.proj(hidden_states)
+        return hidden_states
+
+
+class Florence2VisionChannelBlock(nn.Module):
+    """
+    Two residual sub-blocks applied to a `(batch, channels, height, width)` feature map:
+    depthwise 3x3 conv followed by channel attention, then depthwise 3x3 conv followed by an MLP.
+
+    Args:
+        config (`Florence2VisionConfig`): Vision configuration; `embed_dim[stage_idx]` gives the channel count.
+        stage_idx (`int`): Index of the stage this block belongs to.
+        drop_path_rate (`float`): Drop-path rate; a value of `0.0` or less replaces drop-path with `nn.Identity`.
+    """
+
+    def __init__(
+        self,
+        config: Florence2VisionConfig,
+        stage_idx: int,
+        drop_path_rate: float,
+    ):
+        super().__init__()
+
+        self.config = config
+        channels = config.embed_dim[stage_idx]
+        use_drop_path = drop_path_rate > 0.0
+
+        # --- attention half ---
+        # groups == channels makes the convolution depthwise
+        self.conv1 = nn.Conv2d(channels, channels, kernel_size=3, padding=1, groups=channels)
+        self.norm1 = nn.LayerNorm(channels)
+        self.channel_attn = Florence2VisionChannelAttention(config=config, stage_idx=stage_idx)
+        self.drop_path1 = Florence2VisionDropPath(drop_path_rate) if use_drop_path else nn.Identity()
+
+        # --- feed-forward half ---
+        self.conv2 = nn.Conv2d(channels, channels, kernel_size=3, padding=1, groups=channels)
+        self.norm2 = nn.LayerNorm(channels)
+        self.ffn = Florence2VisionMLP(config=config, stage_idx=stage_idx)
+        self.drop_path2 = Florence2VisionDropPath(drop_path_rate) if use_drop_path else nn.Identity()
+
+    def forward(self, hidden_states: torch.Tensor):
+        """
+        Run both sub-blocks and return a tensor viewed with the same 4D shape as the input.
+
+        Args:
+            hidden_states (`torch.Tensor`): Feature map whose shape unpacks as `(batch, channels, height, width)`.
+        """
+        batch_size, embed_dim, height, width = hidden_states.shape
+
+        # Sub-block 1: depthwise conv (residual) -> token layout -> norm -> channel attention (residual)
+        tokens = (self.conv1(hidden_states) + hidden_states).flatten(2).transpose(1, 2)
+        attended = self.channel_attn(self.norm1(tokens))
+        tokens = tokens + self.drop_path1(attended)
+        feature_map = tokens.transpose(1, 2).view(batch_size, embed_dim, height, width)
+
+        # Sub-block 2: depthwise conv (residual) -> token layout -> norm -> MLP (residual)
+        tokens = (self.conv2(feature_map) + feature_map).flatten(2).transpose(1, 2)
+        transformed = self.ffn(self.norm2(tokens))
+        tokens = tokens + self.drop_path2(transformed)
+        feature_map = tokens.transpose(1, 2).view(batch_size, embed_dim, height, width)
+
+        return feature_map
+
+
+class Florence2VisionWindowAttention(nn.Module):
+    """
+    Multi-head self-attention computed independently inside non-overlapping square windows of a
+    channels-last feature map.
+
+    Args:
+        config (`Florence2VisionConfig`): Vision configuration providing `embed_dim`, `num_heads`,
+            `window_size` and `qkv_bias`.
+        stage_idx (`int`): Index of the stage whose `embed_dim` / `num_heads` entries are used.
+    """
+
+    def __init__(self, config: Florence2VisionConfig, stage_idx: int):
+        super().__init__()
+        self.config = config
+        self.dim = config.embed_dim[stage_idx]
+        self.window_size = config.window_size
+        self.num_heads = config.num_heads[stage_idx]
+        # Attention scores are scaled by head_dim ** -0.5
+        self.scale = (self.dim // self.num_heads) ** -0.5
+
+        self.qkv = nn.Linear(self.dim, 3 * self.dim, bias=config.qkv_bias)
+        self.proj = nn.Linear(self.dim, self.dim)
+        self.is_causal = False
+
+    def forward(self, hidden_states: torch.Tensor):
+        """
+        Pad, partition into windows, attend within each window, merge and crop.
+
+        Args:
+            hidden_states (`torch.Tensor`): Input whose shape unpacks as `(batch, height, width, embed_dim)`.
+
+        Returns:
+            `torch.Tensor`: Tensor viewed as `(batch, height * width, embed_dim)`.
+        """
+        batch_size, height, width, embed_dim = hidden_states.shape
+        window = self.window_size
+
+        # Pad only on the right/bottom so that both spatial sizes become multiples of the window size
+        extra_cols = (window - width % window) % window
+        extra_rows = (window - height % window) % window
+        hidden_states = F.pad(hidden_states, (0, 0, 0, extra_cols, 0, extra_rows))
+        _, padded_height, padded_width, _ = hidden_states.shape
+        rows = padded_height // window
+        cols = padded_width // window
+
+        # Partition into windows: (batch, rows, window, cols, window, C) -> (batch * rows * cols, window * window, C)
+        windows = hidden_states.view(batch_size, rows, window, cols, window, embed_dim)
+        windows = windows.permute(0, 1, 3, 2, 4, 5).contiguous()
+        windows = windows.view(-1, window * window, embed_dim)
+
+        # Fused QKV projection, split per head: (3, num_windows, heads, tokens, head_dim)
+        num_windows, tokens_per_window, embed_dim = windows.shape
+        fused = self.qkv(windows).reshape(
+            num_windows, tokens_per_window, 3, self.num_heads, embed_dim // self.num_heads
+        )
+        query, key, value = fused.permute(2, 0, 3, 1, 4).unbind(0)
+
+        # Pick the attention backend selected on the config; "eager" uses the local reference implementation
+        attention_interface: Callable = eager_attention_forward
+        implementation = self.config._attn_implementation
+        if implementation != "eager":
+            attention_interface = ALL_ATTENTION_FUNCTIONS[implementation]
+
+        attn_output, _ = attention_interface(
+            self,
+            query,
+            key,
+            value,
+            attention_mask=None,
+            scaling=self.scale,
+        )
+        attn_output = attn_output.view(num_windows, tokens_per_window, embed_dim)
+        attn_output = self.proj(attn_output)
+
+        # Merge the windows back into the padded spatial grid, then drop the padding
+        merged = attn_output.view(-1, rows, cols, window, window, embed_dim)
+        merged = merged.permute(0, 1, 3, 2, 4, 5).contiguous()
+        merged = merged.view(-1, padded_height, padded_width, embed_dim)
+        cropped = merged[:, :height, :width, :].contiguous()
+
+        return cropped.view(batch_size, height * width, embed_dim)
+
+
+class Florence2VisionSpatialBlock(nn.Module):
+    """
+    Two residual sub-blocks applied to a `(batch, channels, height, width)` feature map:
+    depthwise 3x3 conv followed by window attention, then depthwise 3x3 conv followed by an MLP.
+
+    Args:
+        config (`Florence2VisionConfig`): Vision configuration; `embed_dim[stage_idx]` gives the channel count.
+        stage_idx (`int`): Index of the stage this block belongs to.
+        drop_path_rate (`float`): Drop-path rate; a value of `0.0` or less replaces drop-path with `nn.Identity`.
+    """
+
+    def __init__(
+        self,
+        config: Florence2VisionConfig,
+        stage_idx: int,
+        drop_path_rate: float,
+    ):
+        super().__init__()
+
+        channels = config.embed_dim[stage_idx]
+        use_drop_path = drop_path_rate > 0.0
+
+        # --- attention half ---
+        # groups == channels makes the convolution depthwise
+        self.conv1 = nn.Conv2d(channels, channels, kernel_size=3, padding=1, groups=channels)
+        self.norm1 = nn.LayerNorm(channels)
+        self.window_attn = Florence2VisionWindowAttention(config=config, stage_idx=stage_idx)
+        self.drop_path1 = Florence2VisionDropPath(drop_path_rate) if use_drop_path else nn.Identity()
+
+        # --- feed-forward half ---
+        self.conv2 = nn.Conv2d(channels, channels, kernel_size=3, padding=1, groups=channels)
+        self.norm2 = nn.LayerNorm(channels)
+        self.ffn = Florence2VisionMLP(config=config, stage_idx=stage_idx)
+        self.drop_path2 = Florence2VisionDropPath(drop_path_rate) if use_drop_path else nn.Identity()
+
+    def forward(self, hidden_states: torch.Tensor):
+        """
+        Run both sub-blocks and return a tensor viewed with the same 4D shape as the input.
+
+        Args:
+            hidden_states (`torch.Tensor`): Feature map whose shape unpacks as `(batch, channels, height, width)`.
+        """
+        batch_size, embed_dim, height, width = hidden_states.shape
+
+        # Sub-block 1: depthwise conv (residual) -> token layout -> norm -> window attention (residual)
+        tokens = (self.conv1(hidden_states) + hidden_states).flatten(2).transpose(1, 2)
+        # The window attention expects a channels-last spatial grid
+        normed_grid = self.norm1(tokens).view(batch_size, height, width, embed_dim)
+        attended = self.window_attn(normed_grid)
+        tokens = tokens + self.drop_path1(attended)
+        feature_map = tokens.transpose(1, 2).view(batch_size, embed_dim, height, width)
+
+        # Sub-block 2: depthwise conv (residual) -> token layout -> norm -> MLP (residual)
+        tokens = (self.conv2(feature_map) + feature_map).flatten(2).transpose(1, 2)
+        transformed = self.ffn(self.norm2(tokens))
+        tokens = tokens + self.drop_path2(transformed)
+        feature_map = tokens.transpose(1, 2).view(batch_size, embed_dim, height, width)
+
+        return feature_map
+
+
+class Florence2VisionBlock(nn.Module):
+    """
+    One spatial block followed by one channel block, both built for the same stage.
+
+    Args:
+        config (`Florence2VisionConfig`): Vision configuration forwarded to both sub-blocks.
+        stage_idx (`int`): Stage index forwarded to both sub-blocks.
+        spatial_drop_path_rate (`float`): Drop-path rate handed to the spatial block.
+        channel_drop_path_rate (`float`): Drop-path rate handed to the channel block.
+    """
+
+    def __init__(
+        self,
+        config: Florence2VisionConfig,
+        stage_idx: int,
+        spatial_drop_path_rate: float,
+        channel_drop_path_rate: float,
+    ):
+        super().__init__()
+        shared_kwargs = {"config": config, "stage_idx": stage_idx}
+        self.spatial_block = Florence2VisionSpatialBlock(drop_path_rate=spatial_drop_path_rate, **shared_kwargs)
+        self.channel_block = Florence2VisionChannelBlock(drop_path_rate=channel_drop_path_rate, **shared_kwargs)
+
+    def forward(self, hidden_states: torch.Tensor):
+        """Apply the spatial block, then the channel block, and return the result."""
+        return self.channel_block(self.spatial_block(hidden_states))
+
+
+# Common base class for the Florence-2 vision modules; it only declares class-level settings.
+@auto_docstring
+class Florence2VisionPreTrainedModel(PreTrainedModel):
+    # Configuration class associated with this model family
+    config_class = Florence2VisionConfig
+    # Name of the primary forward() input
+    main_input_name = "pixel_values"
+    # NOTE(review): presumably read by `PreTrainedModel` to decide which attention backends may be
+    # selected via `config._attn_implementation` -- confirm against the base class.
+    _supports_sdpa = True
+    _supports_flash_attn = True
+    _supports_flex_attn = True
+
+    # NOTE(review): presumably advertises `torch.compile(fullgraph=True)` compatibility -- confirm.
+    _can_compile_fullgraph = True
+
+
+# Multi-stage vision backbone: every stage is a conv embedding followed by `config.depths[stage]`
+# blocks, each block being a spatial sub-block plus a channel sub-block.
+@auto_docstring
+class Florence2VisionBackbone(Florence2VisionPreTrainedModel):
+    def __init__(self, config: Florence2VisionConfig):
+        super().__init__(config)
+        self.config = config
+
+        self.embed_dim = config.embed_dim
+        self.num_heads = config.num_heads
+        self.num_groups = config.num_groups
+        self.num_stages = len(self.embed_dim)
+
+        # Every per-stage list must describe the same number of stages
+        lengths_agree = self.num_stages == len(self.num_heads) == len(self.num_groups)
+        if not lengths_agree:
+            raise ValueError(
+                f"Expected self.num_stages ({self.num_stages}) == "
+                f"len(self.num_heads) ({len(self.num_heads)}) == "
+                f"len(self.num_groups) ({len(self.num_groups)})"
+            )
+
+        # One drop-path rate per sub-block (two per block), linearly spaced from 0 to `config.drop_path_rate`
+        num_sub_blocks = sum(config.depths) * 2
+        drop_rates = torch.linspace(0, config.drop_path_rate, num_sub_blocks, device="cpu").tolist()
+
+        stage_embeddings = []
+        stage_layers = []
+        rate_cursor = 0
+        for stage_idx in range(self.num_stages):
+            stage_depth = config.depths[stage_idx]
+
+            stage_embeddings.append(Florence2VisionConvEmbed(config=config, stage_idx=stage_idx))
+
+            layers = []
+            for block_idx in range(stage_depth):
+                first_rate = rate_cursor + block_idx * 2
+                layers.append(
+                    Florence2VisionBlock(
+                        config=config,
+                        stage_idx=stage_idx,
+                        spatial_drop_path_rate=drop_rates[first_rate],
+                        channel_drop_path_rate=drop_rates[first_rate + 1],
+                    )
+                )
+            stage_layers.append(nn.ModuleList(layers))
+
+            # Two rates were consumed per block of this stage
+            rate_cursor += stage_depth * 2
+
+        self.convs = nn.ModuleList(stage_embeddings)
+        self.blocks = nn.ModuleList(stage_layers)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def forward(self, hidden_states: torch.Tensor):
+        # Stage by stage: conv embedding first, then every block of that stage in order
+        for stage_embedding, layers in zip(self.convs, self.blocks):
+            hidden_states = stage_embedding(hidden_states)
+            for layer in layers:
+                hidden_states = layer(hidden_states)
+        return hidden_states
+
+
+class Florence2MultiModalProjector(nn.Module):
+    """
+    Turns the vision feature map into a token sequence for the language model: adds position and
+    temporal embeddings, concatenates a pooled token with the per-position tokens, then applies a
+    bias-free linear projection followed by layer normalization.
+
+    Args:
+        config (`Florence2Config`): Model configuration; `vision_config.embed_dim[-1]` is the input width and
+            `vision_config.projection_dim` the output width.
+    """
+
+    def __init__(self, config: Florence2Config):
+        super().__init__()
+        vision_config = config.vision_config
+        self.vision_embedding_dim = vision_config.embed_dim[-1]
+        self.vision_projection_dim = vision_config.projection_dim
+        self.image_projection = nn.Linear(self.vision_embedding_dim, self.vision_projection_dim, bias=False)
+        self.image_proj_norm = nn.LayerNorm(self.vision_projection_dim)
+        self.image_position_embed = Florence2VisionLearnedAbsolutePositionEmbedding2D(config=config)
+        self.visual_temporal_embed = Florence2VisionPositionalEmbeddingCosine1D(config=config)
+
+    def forward(self, image_features):
+        """
+        Project vision features into the language model's input space.
+
+        Args:
+            image_features: Vision feature map; it is flattened from dim 2 onward and transposed into a
+                token-major layout (assumed `(batch, channels, height, width)` -- TODO confirm).
+
+        Returns:
+            The normalized projection of the pooled token concatenated with the per-position tokens.
+        """
+        # Add position information, then switch to a (batch, tokens, channels) layout
+        with_position = image_features + self.image_position_embed(image_features)
+        tokens = with_position.flatten(2).transpose(1, 2)
+
+        # Temporal embedding computed from the first token only, broadcast over all tokens
+        temporal = self.visual_temporal_embed(tokens[:, :1, :]).unsqueeze(1)
+        tokens = (tokens + temporal).unsqueeze(1)
+
+        # Mean over the token axis gives one pooled token; mean over the singleton axis keeps every token
+        pooled_token = tokens.mean(dim=2)
+        per_position_tokens = tokens.mean(dim=1)
+        combined = torch.cat([pooled_token, per_position_tokens], dim=1)
+
+        return self.image_proj_norm(self.image_projection(combined))
+
+
+# Output container returned by `Florence2Model.forward`: the inherited `Seq2SeqModelOutput`
+# fields plus the projected image features.
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Base class for Florence-2 base model's outputs that also contains : pre-computed hidden states that can speed up sequential
+    decoding.
+    """
+)
+class Florence2Seq2SeqModelOutput(Seq2SeqModelOutput):
+    r"""
+    image_hidden_states (`torch.FloatTensor`, *optional*):
+        A `torch.FloatTensor` of size `(batch_size, num_image_tokens, hidden_size)`.
+        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
+    """
+
+    # Defaults to None; `Florence2Model.forward` fills it only when `pixel_values` is provided
+    image_hidden_states: Optional[torch.FloatTensor] = None
+
+
+# Output container returned by `Florence2ForConditionalGeneration.forward`: the inherited
+# `Seq2SeqLMOutput` fields plus the projected image features.
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Base class for Florence-2 model's outputs that also contains : pre-computed hidden states that can speed up sequential
+    decoding.
+    """
+)
+class Florence2Seq2SeqLMOutput(Seq2SeqLMOutput):
+    r"""
+    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+        Language modeling loss (for next-token prediction).
+    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
+        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+    image_hidden_states (`torch.FloatTensor`, *optional*):
+        A `torch.FloatTensor` of size `(batch_size, num_image_tokens, hidden_size)`.
+        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
+    """
+
+    # A single tensor (not a tuple): this is the value forwarded unchanged from
+    # `Florence2Seq2SeqModelOutput.image_hidden_states`, matching the docstring above.
+    image_hidden_states: Optional[torch.FloatTensor] = None
+
+
+# Common base class for the full Florence-2 models; it only declares class-level settings.
+@auto_docstring
+class Florence2PreTrainedModel(PreTrainedModel):
+    config: Florence2Config
+    # Empty prefix: no base-model attribute name is declared for checkpoint key handling
+    base_model_prefix = ""
+    supports_gradient_checkpointing = True
+    # NOTE(review): presumably keeps the cache object from being moved between devices -- confirm.
+    _skip_keys_device_placement = "past_key_values"
+
+    # NOTE(review): presumably read by `PreTrainedModel` to decide which attention backends may be
+    # selected via `config._attn_implementation` -- confirm against the base class.
+    _supports_flash_attn = True
+    _supports_sdpa = True
+
+    _can_compile_fullgraph = True
+    _supports_flex_attn = True
+
+    _supports_attention_backend = False
+    # Configuration class associated with this model family
+    config_class = Florence2Config
+
+
+@auto_docstring(
+    custom_intro="""
+    Florence-2 is a vision model for captioning, detection, and segmentation.
+    """
+)
+class Florence2Model(Florence2PreTrainedModel):
+    _checkpoint_conversion_mapping = {}
+    # The encoder and decoder token embeddings of the language model are declared as tied weights
+    _tied_weights_keys = [
+        "language_model.encoder.embed_tokens.weight",
+        "language_model.decoder.embed_tokens.weight",
+    ]
+
+    def __init__(self, config: Florence2Config):
+        super().__init__(config)
+        self.vision_tower = Florence2VisionBackbone(config=config.vision_config)
+
+        self.multi_modal_projector = Florence2MultiModalProjector(config)
+        self.language_model = AutoModel.from_config(config.text_config)
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.language_model.get_input_embeddings()
+
+    def set_input_embeddings(self, value):
+        self.language_model.set_input_embeddings(value)
+
+    def set_decoder(self, decoder):
+        self.language_model = decoder
+
+    def get_decoder(self):
+        return self.language_model.get_decoder()
+
+    def get_image_features(self, pixel_values: torch.Tensor, **kwargs):
+        """
+        Obtains image last hidden states from the vision tower and apply multimodal projection.
+
+        Args:
+            pixel_values (`torch.FloatTensor` of shape `(batch_size, channels, height, width)`):
+               The tensors corresponding to the input images.
+            kwargs:
+               Forwarded unchanged to the vision tower.
+        Returns:
+            image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`.
+        """
+        image_features = self.vision_tower(pixel_values, **kwargs)
+        image_embeds = self.multi_modal_projector(image_features)
+        return image_embeds
+
+    def get_placeholder_mask(
+        self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor, image_features: torch.FloatTensor
+    ):
+        """
+        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
+        equal to the length of multimodal features. If the lengths are different, an error is raised.
+
+        Returns:
+            A boolean mask expanded to the shape of `inputs_embeds`, true at image-placeholder positions.
+
+        Raises:
+            ValueError: If the number of masked embedding elements differs from `image_features.numel()`.
+        """
+        if input_ids is None:
+            # No ids available: find placeholders by comparing every embedding with the image token's embedding
+            special_image_mask = inputs_embeds == self.get_input_embeddings()(
+                torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device)
+            )
+            special_image_mask = special_image_mask.all(-1)
+        else:
+            special_image_mask = input_ids == self.config.image_token_id
+
+        n_image_tokens = special_image_mask.sum()
+        special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
+        n_image_features = image_features.shape[0] * image_features.shape[1]
+        if inputs_embeds[special_image_mask].numel() != image_features.numel():
+            raise ValueError(
+                f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
+            )
+        return special_image_mask
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        decoder_input_ids: Optional[torch.LongTensor] = None,
+        decoder_attention_mask: Optional[torch.LongTensor] = None,
+        decoder_head_mask: Optional[torch.Tensor] = None,
+        cross_attn_head_mask: Optional[torch.Tensor] = None,
+        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
+        encoder_outputs: Optional[list[torch.FloatTensor]] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> Union[tuple, Florence2Seq2SeqModelOutput]:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # Stays None unless the vision tower actually runs below. Without this default the return statement
+        # raised NameError when precomputed `encoder_outputs` were passed together with `pixel_values`.
+        image_features = None
+
+        if encoder_outputs is None:
+            if (input_ids is None) ^ (inputs_embeds is not None):
+                raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+            if inputs_embeds is None:
+                inputs_embeds = self.get_input_embeddings()(input_ids)
+
+            if pixel_values is not None:
+                # Overwrite the embeddings at image-placeholder positions with the projected image features
+                image_features = self.get_image_features(pixel_values)
+                image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
+                special_image_mask = self.get_placeholder_mask(
+                    input_ids, inputs_embeds=inputs_embeds, image_features=image_features
+                )
+                inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
+
+            encoder_outputs = self.language_model.encoder(
+                attention_mask=attention_mask,
+                head_mask=head_mask,
+                inputs_embeds=inputs_embeds,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=True,
+            )
+
+        # Default to a single start token per sample, but only when the caller supplied no decoder input at
+        # all: passing `decoder_inputs_embeds` must not additionally inject `decoder_input_ids`.
+        if decoder_input_ids is None and decoder_inputs_embeds is None:
+            # `inputs_embeds` is None when precomputed `encoder_outputs` were given, so fall back to the
+            # encoder hidden states for the batch size and device.
+            reference = inputs_embeds if inputs_embeds is not None else encoder_outputs[0]
+            decoder_start_token_id = self.config.text_config.decoder_start_token_id
+            decoder_input_ids = torch.full(
+                (reference.size()[0], 1), decoder_start_token_id, dtype=torch.long, device=reference.device
+            )
+
+        decoder_outputs = self.language_model.decoder(
+            input_ids=decoder_input_ids,
+            attention_mask=decoder_attention_mask,
+            encoder_hidden_states=encoder_outputs[0],
+            encoder_attention_mask=attention_mask,
+            head_mask=decoder_head_mask,
+            cross_attn_head_mask=cross_attn_head_mask,
+            past_key_values=past_key_values,
+            inputs_embeds=decoder_inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            cache_position=cache_position,
+            return_dict=True,
+            **kwargs,
+        )
+
+        return Florence2Seq2SeqModelOutput(
+            last_hidden_state=decoder_outputs.last_hidden_state,
+            past_key_values=decoder_outputs.past_key_values,
+            decoder_hidden_states=decoder_outputs.hidden_states,
+            decoder_attentions=decoder_outputs.attentions,
+            cross_attentions=decoder_outputs.cross_attentions,
+            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
+            encoder_hidden_states=encoder_outputs.hidden_states,
+            encoder_attentions=encoder_outputs.attentions,
+            image_hidden_states=image_features,
+        )
+
+    def get_encoder(self):
+        return self.language_model.get_encoder()
+
+
+# Wraps `Florence2Model` with a bias-free linear language-modeling head and the hooks that
+# `GenerationMixin` calls during generation.
+@auto_docstring(
+    custom_intro="""
+    Florence-2 is a vision model for captioning, detection, and segmentation.
+    """
+)
+class Florence2ForConditionalGeneration(Florence2PreTrainedModel, GenerationMixin):
+    _checkpoint_conversion_mapping = {}
+    # Encoder/decoder token embeddings and the LM head are declared as tied weights
+    _tied_weights_keys = [
+        "model.language_model.encoder.embed_tokens.weight",
+        "model.language_model.decoder.embed_tokens.weight",
+        "lm_head.weight",
+    ]
+
+    def __init__(self, config: Florence2Config):
+        super().__init__(config)
+        self.model = Florence2Model(config)
+        self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)
+        self.post_init()
+
+    # The accessors below simply delegate to the wrapped `Florence2Model`.
+    def get_input_embeddings(self):
+        return self.model.get_input_embeddings()
+
+    def set_input_embeddings(self, value):
+        self.model.set_input_embeddings(value)
+
+    def get_output_embeddings(self) -> nn.Module:
+        return self.lm_head
+
+    def set_decoder(self, decoder):
+        self.model.set_decoder(decoder)
+
+    def get_decoder(self):
+        return self.model.get_decoder()
+
+    def get_image_features(self, pixel_values: torch.Tensor, **kwargs):
+        return self.model.get_image_features(pixel_values=pixel_values, **kwargs)
+
+    # Make modules available through conditional class for BC
+    @property
+    def language_model(self):
+        return self.model.language_model
+
+    @property
+    def vision_tower(self):
+        return self.model.vision_tower
+
+    @property
+    def multi_modal_projector(self):
+        return self.model.multi_modal_projector
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        decoder_input_ids: Optional[torch.LongTensor] = None,
+        decoder_attention_mask: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        decoder_head_mask: Optional[torch.Tensor] = None,
+        cross_attn_head_mask: Optional[torch.Tensor] = None,
+        encoder_outputs: Optional[list[torch.FloatTensor]] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> Union[tuple, Florence2Seq2SeqLMOutput]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+        Example:
+
+        ```python
+        >>> from PIL import Image
+        >>> import requests
+        >>> from transformers import AutoProcessor, Florence2ForConditionalGeneration
+
+        >>> model = Florence2ForConditionalGeneration.from_pretrained("microsoft/Florence-2-large")
+        >>> processor = AutoProcessor.from_pretrained("microsoft/Florence-2-large")
+
+        >>> prompt = "<CAPTION>"
+        >>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> inputs = processor(text=prompt, images=image, return_tensors="pt")
+
+        >>> # Generate
+        >>> generate_ids = model.generate(**inputs, max_length=100)
+        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "A green car parked in front of a yellow building."
+        ```"""
+        # Fall back to the config defaults for any output flag the caller left unset
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # The wrapped model is always asked for a dict-style output so fields can be read by name below
+        outputs = self.model(
+            input_ids=input_ids,
+            pixel_values=pixel_values,
+            attention_mask=attention_mask,
+            decoder_input_ids=decoder_input_ids,
+            encoder_outputs=encoder_outputs,
+            decoder_attention_mask=decoder_attention_mask,
+            head_mask=head_mask,
+            decoder_head_mask=decoder_head_mask,
+            cross_attn_head_mask=cross_attn_head_mask,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            decoder_inputs_embeds=decoder_inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=True,
+            cache_position=cache_position,
+            **kwargs,
+        )
+
+        hidden_states = outputs[0]
+        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
+        # An int keeps the last `logits_to_keep` positions (the default 0 gives slice(0, None), i.e. all of
+        # them); any other value is used directly as the index along the sequence dimension.
+        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+        logits = self.lm_head(hidden_states[:, slice_indices, :])
+
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(
+                logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **kwargs
+            )
+
+        return Florence2Seq2SeqLMOutput(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            decoder_hidden_states=outputs.decoder_hidden_states,
+            decoder_attentions=outputs.decoder_attentions,
+            cross_attentions=outputs.cross_attentions,
+            encoder_last_hidden_state=outputs.encoder_last_hidden_state,
+            encoder_hidden_states=outputs.encoder_hidden_states,
+            encoder_attentions=outputs.encoder_attentions,
+            image_hidden_states=outputs.image_hidden_states,
+        )
+
+    def prepare_inputs_for_generation(
+        self,
+        input_ids,
+        past_key_values=None,
+        inputs_embeds=None,
+        pixel_values=None,
+        attention_mask=None,
+        cache_position=None,
+        logits_to_keep=None,
+        **kwargs,
+    ):
+        # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
+
+        # `pixel_values` is deliberately not handed to the parent implementation; it is re-attached below
+        model_inputs = super().prepare_inputs_for_generation(
+            input_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            cache_position=cache_position,
+            logits_to_keep=logits_to_keep,
+            **kwargs,
+        )
+
+        # NOTE(review): `cache_position` defaults to None but is indexed unconditionally here, so this
+        # relies on the caller always supplying it -- confirm against the generation loop.
+        if cache_position[0] == 0:
+            # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore
+            # Otherwise we need pixel values to be passed to model
+            model_inputs["pixel_values"] = pixel_values
+
+        return model_inputs
+
+    def get_encoder(self):
+        return self.model.get_encoder()
+
+    def get_placeholder_mask(
+        self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor, image_features: torch.FloatTensor
+    ):
+        # Delegates to the wrapped model's implementation
+        return self.model.get_placeholder_mask(
+            input_ids=input_ids, inputs_embeds=inputs_embeds, image_features=image_features
+        )
+
+    def _prepare_encoder_decoder_kwargs_for_generation(
+        self,
+        inputs_tensor: torch.Tensor,
+        model_kwargs,
+        model_input_name: Optional[str],
+        generation_config,
+    ) -> dict[str, Any]:
+        # override to handle merging image and text embeddings before passing to language encoder
+        # Both entries are removed from `model_kwargs`; the merged embeddings are put back further down
+        inputs_embeds = model_kwargs.pop("inputs_embeds", None)
+        pixel_values = model_kwargs.pop("pixel_values", None)
+
+        if inputs_embeds is None:
+            inputs_embeds = self.get_input_embeddings()(inputs_tensor)
+
+        if pixel_values is not None:
+            # Overwrite the embeddings at image-placeholder positions with the projected image features
+            image_features = self.get_image_features(pixel_values)
+            image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
+            special_image_mask = self.get_placeholder_mask(
+                inputs_tensor, inputs_embeds=inputs_embeds, image_features=image_features
+            )
+            inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
+
+        # The parent implementation receives the merged embeddings through `model_kwargs` and None in
+        # place of the inputs tensor.
+        model_kwargs["inputs_embeds"] = inputs_embeds
+        model_kwargs = super()._prepare_encoder_decoder_kwargs_for_generation(
+            None, model_kwargs, model_input_name, generation_config
+        )
+        # Remove the embeddings again so they are not part of the kwargs returned to the caller
+        model_kwargs.pop("inputs_embeds", None)
+        return model_kwargs
+
+
+# Names exported by `from <module> import *`; these are the public model classes of this file.
+__all__ = [
+    "Florence2Model",
+    "Florence2ForConditionalGeneration",
+    "Florence2PreTrainedModel",
+    "Florence2VisionBackbone",
+    "Florence2VisionPreTrainedModel",
+]
diff --git a/phivenv/Lib/site-packages/transformers/models/florence2/modular_florence2.py b/phivenv/Lib/site-packages/transformers/models/florence2/modular_florence2.py
new file mode 100644
index 0000000000000000000000000000000000000000..417e296071de7e415cc1cc8dd551471855289824
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/florence2/modular_florence2.py
@@ -0,0 +1,1807 @@
+# coding=utf-8
+# Copyright 2025 Microsoft and the HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import math
+import re
+from dataclasses import dataclass
+from typing import Any, Callable, Optional, Union
+
+import numpy as np
+
+from ...activations import ACT2FN
+from ...cache_utils import Cache
+from ...configuration_utils import PretrainedConfig
+from ...feature_extraction_utils import BatchFeature
+from ...image_utils import ImageInput
+from ...modeling_flash_attention_utils import FlashAttentionKwargs
+from ...modeling_outputs import Seq2SeqLMOutput, Seq2SeqModelOutput
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from ...processing_utils import MultiModalData, ProcessorMixin, Unpack
+from ...tokenization_utils_base import PreTokenizedInput, TextInput
+from ...utils import (
+    TransformersKwargs,
+    auto_docstring,
+    can_return_tuple,
+    is_torch_available,
+    logging,
+)
+from ..auto import CONFIG_MAPPING, AutoConfig
+from ..bart.modeling_bart import eager_attention_forward
+from ..beit.modeling_beit import BeitDropPath
+from ..llama4.modeling_llama4 import Llama4VisionMLP
+from ..llava.modeling_llava import LlavaForConditionalGeneration, LlavaModel, LlavaPreTrainedModel
+from ..llava.processing_llava import LlavaProcessorKwargs
+
+
+if is_torch_available():
+    import torch
+    import torch.nn as nn
+    import torch.nn.functional as F
+
+
+logger = logging.get_logger(__name__)
+
+
+class Florence2VisionConfig(PretrainedConfig):
+    r"""
+    Configuration class for the Florence-2 vision model. It stores the arguments that define the vision architecture
+    and is used to instantiate a [`Florence2VisionModel`]. Instantiating a configuration with the defaults will yield
+    a similar configuration to that of the Florence2VisionModel architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    The tuple-valued arguments below are stored on the configuration as lists.
+
+    Args:
+        in_channels (`int`, *optional*, defaults to 3):
+            Number of input image channels.
+        depths (`Tuple[int]`, *optional*, defaults to `(1, 1, 9, 1)`):
+            The depth of the model.
+        patch_size (`Tuple[int]`, *optional*, defaults to `(7, 3, 3, 3)`):
+            The patch size of the image.
+        patch_stride (`Tuple[int]`, *optional*, defaults to `(4, 2, 2, 2)`):
+            The patch stride of the image.
+        patch_padding (`Tuple[int]`, *optional*, defaults to `(3, 1, 1, 1)`):
+            The patch padding of the image.
+        patch_prenorm (`Tuple[bool]`, *optional*, defaults to `(False, True, True, True)`):
+            Whether to apply layer normalization before the patch embedding layer.
+        embed_dim (`Tuple[int]`, *optional*, defaults to `(128, 256, 512, 1024)`):
+            The dimension of the embedding layer.
+        num_heads (`Tuple[int]`, *optional*, defaults to `(4, 8, 16, 32)`):
+            The number of attention heads.
+        num_groups (`Tuple[int]`, *optional*, defaults to `(4, 8, 16, 32)`):
+            The number of groups.
+        window_size (`int`, *optional*, defaults to 12):
+            The window size of the model.
+        drop_path_rate (`float`, *optional*, defaults to 0.1):
+            The dropout rate of the drop path layer.
+        mlp_ratio (`float`, *optional*, defaults to 4.0):
+            Ratio of mlp hidden dim to embedding dim.
+        qkv_bias (`bool`, *optional*, defaults to `True`):
+            If True, add a learnable bias to query, key, value.
+        activation_function (`str` or `function`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"silu"` and `"gelu_new"` are supported.
+        projection_dim (`int`, *optional*, defaults to 1024):
+            The dimension of the projection layer.
+        max_temporal_embeddings (`int`, *optional*, defaults to 100):
+            The configuration of the visual temporal embedding.
+        max_position_embeddings (`int`, *optional*, defaults to 50):
+            The configuration of the image position embedding.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+
+    Example:
+
+    ```python
+    >>> from transformers import Florence2VisionConfig, Florence2VisionModel
+
+    >>> # Initializing a Florence2 Vision style configuration
+    >>> configuration = Florence2VisionConfig()
+
+    >>> # Initializing a model (with random weights)
+    >>> model = Florence2VisionModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "florence_vision"
+
+    def __init__(
+        self,
+        in_channels=3,
+        depths=(1, 1, 9, 1),
+        patch_size=(7, 3, 3, 3),
+        patch_stride=(4, 2, 2, 2),
+        patch_padding=(3, 1, 1, 1),
+        patch_prenorm=(False, True, True, True),
+        embed_dim=(128, 256, 512, 1024),
+        num_heads=(4, 8, 16, 32),
+        num_groups=(4, 8, 16, 32),
+        window_size=12,
+        drop_path_rate=0.1,
+        mlp_ratio=4.0,
+        qkv_bias=True,
+        activation_function="gelu",
+        projection_dim=1024,
+        max_temporal_embeddings=100,
+        max_position_embeddings=50,
+        initializer_range=0.02,
+        **kwargs,
+    ):
+        self.in_channels = in_channels
+
+        # Sequence-valued settings are copied into fresh lists so the tuple defaults are never shared.
+        sequence_settings = (
+            ("depths", depths),
+            ("patch_size", patch_size),
+            ("patch_stride", patch_stride),
+            ("patch_padding", patch_padding),
+            ("patch_prenorm", patch_prenorm),
+            ("embed_dim", embed_dim),
+            ("num_heads", num_heads),
+            ("num_groups", num_groups),
+        )
+        for attribute_name, values in sequence_settings:
+            setattr(self, attribute_name, list(values))
+
+        # Scalar settings are stored as given.
+        self.window_size = window_size
+        self.drop_path_rate = drop_path_rate
+        self.mlp_ratio = mlp_ratio
+        self.qkv_bias = qkv_bias
+        self.projection_dim = projection_dim
+        self.max_temporal_embeddings = max_temporal_embeddings
+        self.max_position_embeddings = max_position_embeddings
+        self.initializer_range = initializer_range
+        self.activation_function = activation_function
+
+        super().__init__(**kwargs)
+
+
+class Florence2Config(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Florence2ForConditionalGeneration`]. It is used to instantiate an
+    Florence-2 model according to the specified arguments, defining the model architecture.
+
+    Instantiating a configuration with the defaults will yield a similar configuration to that of the Florence-2
+    [microsoft/Florence-2-base](https://huggingface.co/microsoft/Florence-2-base) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        text_config (`dict`, *optional*):
+            Dictionary of configuration options used to initialize the text configuration. Its `"model_type"` entry
+            selects the configuration class and defaults to `"bart"`. When `None`, a default Bart configuration is
+            used. Any other value is stored as given.
+        vision_config (`dict`, *optional*):
+            Dictionary of configuration options used to initialize [`Florence2VisionConfig`]. When `None`, a
+            default [`Florence2VisionConfig`] is used. Any other value is stored as given.
+        image_token_id (`int`, *optional*, defaults to 51289):
+            The image token index to encode the image prompt.
+        is_encoder_decoder (`bool`, *optional*, defaults to `True`):
+            Whether the model is used as an encoder/decoder or not.
+
+    Example:
+
+    ```python
+    >>> from transformers import BartConfig, Florence2Config, Florence2ForConditionalGeneration, Florence2VisionConfig
+
+    >>> # Initializing a Florence-2 vision config
+    >>> vision_config = Florence2VisionConfig()
+
+    >>> # Initializing a Bart config
+    >>> text_config = BartConfig()
+
+    >>> # Initializing a Florence-2 configuration
+    >>> configuration = Florence2Config(text_config=text_config, vision_config=vision_config)
+
+    >>> # Initializing a model from the florence-2 configuration
+    >>> model = Florence2ForConditionalGeneration(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "florence2"
+    sub_configs = {
+        "text_config": AutoConfig,
+        "vision_config": Florence2VisionConfig,
+    }
+
+    def __init__(
+        self,
+        text_config=None,
+        vision_config=None,
+        image_token_id=51289,
+        is_encoder_decoder=True,
+        **kwargs,
+    ):
+        if isinstance(text_config, dict):
+            # Fill in the default model type on a copy so the caller's dictionary is left untouched.
+            text_config = {**text_config, "model_type": text_config.get("model_type", "bart")}
+            text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
+        elif text_config is None:
+            text_config = CONFIG_MAPPING["bart"]()
+
+        if isinstance(vision_config, dict):
+            vision_config = Florence2VisionConfig(**vision_config)
+        elif vision_config is None:
+            logger.info("vision_config is None. Initializing the Florence2VisionConfig with default values.")
+            vision_config = Florence2VisionConfig()
+
+        self.text_config = text_config
+        self.vision_config = vision_config
+        self.image_token_id = image_token_id
+
+        super().__init__(
+            is_encoder_decoder=is_encoder_decoder,
+            **kwargs,
+        )
+
+
+class Florence2ProcessorKwargs(LlavaProcessorKwargs):
+    """Keyword arguments accepted by `Florence2Processor.__call__`; identical to `LlavaProcessorKwargs`."""
+
+
+class Florence2Processor(ProcessorMixin):
+    r"""
+    Constructs a Florence2 processor which wraps a Florence2 image processor and a Florence2 tokenizer into a single processor.
+
+    [`Florence2Processor`] offers all the functionalities of [`AutoImageProcessor`] and [`BartTokenizerFast`]. See the
+    [`~Florence2Processor.__call__`] and [`~Florence2Processor.decode`] for more information.
+
+    Args:
+        image_processor (`AutoImageProcessor`, *optional*):
+            The image processor is a required input. Its `image_seq_length` attribute gives the number of image
+            placeholder tokens inserted per prompt.
+        tokenizer (`Union[BartTokenizer, BartTokenizerFast]`, *optional*):
+            The tokenizer is a required input. It must expose `image_token` and `image_token_id`.
+        num_additional_image_tokens (`int`, *optional*, defaults to 0):
+            Number of additional tokens added to the image embeddings, such as CLS (+1). If the backbone has no CLS or other
+            extra tokens appended, no need to set this arg.
+        post_processor_config (`dict`, *optional*):
+            Task-specific parsing rules for [`Florence2PostProcessor`], e.g. regex patterns,
+            thresholds, or banned tokens.
+    """
+
+    attributes = ["image_processor", "tokenizer"]
+    image_processor_class = "AutoImageProcessor"
+    tokenizer_class = ("BartTokenizer", "BartTokenizerFast")
+
+    def __init__(
+        self,
+        image_processor=None,
+        tokenizer=None,
+        num_additional_image_tokens: int = 0,
+        post_processor_config: Optional[dict] = None,
+        **kwargs,
+    ):
+        # Task token -> name of the parsing scheme applied to the generated answer.
+        self.tasks_answer_post_processing_type = {
+            "<OCR>": "pure_text",
+            "<OCR_WITH_REGION>": "ocr",
+            "<CAPTION>": "pure_text",
+            "<DETAILED_CAPTION>": "pure_text",
+            "<MORE_DETAILED_CAPTION>": "pure_text",
+            "<OD>": "description_with_bboxes",
+            "<DENSE_REGION_CAPTION>": "description_with_bboxes",
+            "<CAPTION_TO_PHRASE_GROUNDING>": "phrase_grounding",
+            "<REFERRING_EXPRESSION_SEGMENTATION>": "polygons",
+            "<REGION_TO_SEGMENTATION>": "polygons",
+            "<OPEN_VOCABULARY_DETECTION>": "description_with_bboxes_or_polygons",
+            "<REGION_TO_CATEGORY>": "pure_text",
+            "<REGION_TO_DESCRIPTION>": "pure_text",
+            "<REGION_TO_OCR>": "pure_text",
+            "<REGION_PROPOSAL>": "bboxes",
+        }
+
+        # Task token -> full prompt, for tasks whose prompt is the task token alone.
+        self.task_prompts_without_inputs = {
+            "<OCR>": "What is the text in the image?",
+            "<OCR_WITH_REGION>": "What is the text in the image, with regions?",
+            "<CAPTION>": "What does the image describe?",
+            "<DETAILED_CAPTION>": "Describe in detail what is shown in the image.",
+            "<MORE_DETAILED_CAPTION>": "Describe with a paragraph what is shown in the image.",
+            "<OD>": "Locate the objects with category name in the image.",
+            "<DENSE_REGION_CAPTION>": "Locate the objects in the image, with their descriptions.",
+            "<REGION_PROPOSAL>": "Locate the region proposals in the image.",
+        }
+
+        # Task token -> prompt template; `{input}` receives the text that accompanies the task token.
+        self.task_prompts_with_input = {
+            "<CAPTION_TO_PHRASE_GROUNDING>": "Locate the phrases in the caption: {input}",
+            "<REFERRING_EXPRESSION_SEGMENTATION>": "Locate {input} in the image with mask",
+            "<REGION_TO_SEGMENTATION>": "What is the polygon mask of region {input}",
+            "<OPEN_VOCABULARY_DETECTION>": "Locate {input} in the image.",
+            "<REGION_TO_CATEGORY>": "What is the region {input}?",
+            "<REGION_TO_DESCRIPTION>": "What does the region {input} describe?",
+            "<REGION_TO_OCR>": "What text is in the region {input}?",
+        }
+
+        self.num_image_tokens = image_processor.image_seq_length
+        self.num_additional_image_tokens = num_additional_image_tokens
+        self.post_processor_config = post_processor_config
+        self.post_processor = Florence2PostProcessor(config=post_processor_config, tokenizer=tokenizer)
+        self.image_token = tokenizer.image_token
+        self.image_token_id = tokenizer.image_token_id
+
+        super().__init__(image_processor, tokenizer, **kwargs)
+
+    def _construct_prompts(self, text: Union[str, list[str]]) -> list[str]:
+        """
+        Construct prompts by replacing task tokens with corresponding prompt strings.
+
+        Args:
+            text (`str` or `list[str]`):
+                One prompt or a list of prompts. Prompts that contain no known task token are returned unchanged.
+
+        Returns:
+            `list[str]`: One expanded prompt per input prompt.
+
+        Raises:
+            ValueError: If a prompt contains an input-free task token together with any other content.
+        """
+        if isinstance(text, str):
+            text = [text]
+
+        prompts = []
+        for prompt in text:
+            # Check for tasks without inputs
+            for task_token, task_prompt in self.task_prompts_without_inputs.items():
+                if task_token in prompt:
+                    if prompt != task_token:
+                        raise ValueError(f"Task token {task_token} should be the only content in the prompt.")
+                    prompt = task_prompt
+                    break
+            # Check for tasks with inputs
+            for task_token, task_prompt in self.task_prompts_with_input.items():
+                if task_token in prompt:
+                    # Whatever surrounds the task token is the user input for the template.
+                    input_text = prompt.replace(task_token, "").strip()
+                    prompt = task_prompt.format(input=input_text)
+                    break
+            prompts.append(prompt)
+        return prompts
+
+    def __call__(
+        self,
+        images: Optional[ImageInput] = None,
+        text: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] = None,
+        **kwargs: Unpack[Florence2ProcessorKwargs],
+    ) -> BatchFeature:
+        """
+        Main method to prepare for the model one or several sequences(s) and image(s). This method expands task
+        tokens in `text` into their prompts and forwards the result and the text `kwargs` to the tokenizer. To
+        prepare the image(s), this method forwards the `images` and the image `kwargs` to the image processor if
+        `images` is not `None`. Please refer to the docstring of those two `__call__` methods for more information.
+
+        When pixel values are produced, every prompt is prefixed with the image token repeated
+        `self.num_image_tokens` times and wrapped in the tokenizer's BOS and EOS tokens; the tokenizer is always
+        called with `add_special_tokens=False`.
+
+        Args:
+            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`):
+                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
+                tensor. Both channels-first and channels-last formats are supported.
+            text (`str`, `list[str]`):
+                The prompt or batch of prompts to be encoded. When `None`, one empty prompt is used per image.
+            return_tensors (`str` or [`~utils.TensorType`], *optional*):
+                If set, will return tensors of a particular framework. Acceptable values are:
+                - `'tf'`: Return TensorFlow `tf.constant` objects.
+                - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                - `'np'`: Return NumPy `np.ndarray` objects.
+                - `'jax'`: Return JAX `jnp.ndarray` objects.
+
+        Returns:
+            [`BatchFeature`]: A [`BatchFeature`] with the following fields:
+
+            - **input_ids** -- List of token ids to be fed to a model.
+            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
+              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names`).
+            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
+            - **mm_token_type_ids** -- 1 at image token positions and 0 elsewhere. Returned when
+              `return_mm_token_type_ids=True`.
+
+        Raises:
+            ValueError: If both `images` and `text` are `None`, if `text` is not a string or list of strings, or
+                if `images` is a list whose length differs from the number of texts.
+        """
+        if images is None and text is None:
+            raise ValueError("You have to specify at least one of `images` or `text`.")
+
+        output_kwargs = self._merge_kwargs(
+            Florence2ProcessorKwargs,
+            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
+            **kwargs,
+        )
+
+        image_inputs = {}
+        if images is not None:
+            image_inputs = self.image_processor(images, **output_kwargs["images_kwargs"])
+
+        if text is None:
+            logger.warning_once("You are using Florence-2 without a text prefix.")
+            text = [""] * (1 if not isinstance(images, list) else len(images))
+        elif isinstance(text, str):
+            text = [text]
+
+        if not isinstance(text, list) or not all(isinstance(token, str) for token in text):
+            raise ValueError("`text` must be a string or list of strings.")
+
+        if isinstance(images, list) and len(images) != len(text):
+            raise ValueError(f"Number of images ({len(images)}) must match number of texts ({len(text)}).")
+
+        prompt_strings = self._construct_prompts(text)
+
+        # Add image tokens and special tokens if images are provided
+        if image_inputs.get("pixel_values") is not None:
+            # Replace the image token with the expanded image token sequence
+            expanded_image_prompts = []
+            for sample in prompt_strings:
+                sample = (
+                    self.image_token * self.num_image_tokens
+                    + self.tokenizer.bos_token
+                    + sample
+                    + self.tokenizer.eos_token
+                )
+                expanded_image_prompts.append(sample)
+            prompt_strings = expanded_image_prompts
+
+        # Construct and tokenize prompts
+        # These options are handled here, so they are removed before the remaining kwargs reach the tokenizer.
+        output_kwargs["text_kwargs"].pop("add_special_tokens", None)
+        return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
+        return_mm_token_type_ids = output_kwargs["text_kwargs"].pop("return_mm_token_type_ids", False)
+        text_inputs = self.tokenizer(
+            prompt_strings, **output_kwargs["text_kwargs"], add_special_tokens=False, return_tensors=None
+        )
+        self._check_special_mm_tokens(prompt_strings, text_inputs, modalities=["image"])
+
+        if return_mm_token_type_ids:
+            array_ids = np.array(text_inputs["input_ids"])
+            mm_token_type_ids = np.zeros_like(text_inputs["input_ids"])
+            mm_token_type_ids[array_ids == self.image_token_id] = 1
+            text_inputs["mm_token_type_ids"] = mm_token_type_ids.tolist()
+
+        return BatchFeature(data={**image_inputs, **text_inputs}, tensor_type=return_tensors)
+
+    def batch_decode(self, *args, **kwargs):
+        """
+        This method forwards all its arguments to BartTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
+        refer to the docstring of this method for more information.
+        """
+        return self.tokenizer.batch_decode(*args, **kwargs)
+
+    def decode(self, *args, **kwargs):
+        """
+        This method forwards all its arguments to BartTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
+        the docstring of this method for more information.
+        """
+        return self.tokenizer.decode(*args, **kwargs)
+
+    @property
+    def model_input_names(self):
+        """Tokenizer input names followed by image processor input names, with duplicates removed in order."""
+        tokenizer_input_names = self.tokenizer.model_input_names
+        image_processor_input_names = self.image_processor.model_input_names
+        return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
+
+    def _get_num_multimodal_tokens(self, image_sizes=None, **kwargs):
+        """
+        Computes the number of placeholder tokens needed for multimodal inputs with the given sizes.
+
+        Args:
+            image_sizes (`list[list[int]]`, *optional*):
+                The input sizes formatted as (height, width) per each image. Only the number of entries is used:
+                every image gets the same number of placeholder tokens.
+
+        Returns:
+            `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided
+            input modalities, along with other useful data.
+        """
+
+        vision_data = {}
+        if image_sizes is not None:
+            # `__init__` stores the image processor's `image_seq_length` as `self.num_image_tokens`; this class
+            # never assigns `self.image_seq_length`.
+            num_image_tokens = [self.num_image_tokens] * len(image_sizes)
+            num_image_patches = [1] * len(image_sizes)
+
+            vision_data.update({"num_image_tokens": num_image_tokens, "num_image_patches": num_image_patches})
+
+        return MultiModalData(**vision_data)
+
+    def post_process_image_text_to_text(self, generated_outputs, skip_special_tokens=False, **kwargs):
+        """
+        Post-processes the generated token ids to only return the text output.
+
+        Args:
+            generated_outputs (`torch.Tensor` or `np.ndarray`):
+                The output of the model. The output is expected to be a tensor of shape `(batch_size, sequence_length)`
+                containing the token ids of the generated sequences.
+            skip_special_tokens (`bool`, *optional*, defaults to `False`):
+                Whether or not to remove special tokens in the output. Argument passed to the tokenizer's `batch_decode` method.
+            **kwargs:
+                Additional arguments to be passed to the tokenizer's `batch_decode method`.
+
+        Returns:
+            `list[str]`: The decoded text output.
+        """
+        return self.batch_decode(generated_outputs, skip_special_tokens=skip_special_tokens, **kwargs)
+
+    def post_process_generation(self, text=None, sequence=None, task=None, image_size=None) -> dict[str, Any]:
+        """
+        Post-process generation outputs based on the task.
+
+        Args:
+            text (`str`, *optional*):
+                Generated text.
+            sequence (`Union[List[int], torch.Tensor]`, *optional*):
+                Generated token sequence.
+            task (`str`, *optional*):
+                The task for post-processing. Tasks that are not listed in `tasks_answer_post_processing_type`
+                are treated as plain text.
+            image_size (`Tuple[int, int]`, *optional*):
+                Image size for dequantization.
+
+        Returns:
+            `Dict[str, Any]`: Post-processed results keyed by task.
+
+        Raises:
+            ValueError: If `task` is `None` or the resolved post-processing type is not handled.
+        """
+        if task is None:
+            raise ValueError("`task` must be provided for post-processing.")
+
+        post_proc_type = self.tasks_answer_post_processing_type.get(task, "pure_text")
+        parsed = self.post_processor(
+            text=text,
+            sequence=sequence,
+            image_size=image_size,
+            parse_tasks=[post_proc_type],
+        )[post_proc_type]
+
+        if post_proc_type == "pure_text":
+            final_answer = parsed.replace("<s>", "").replace("</s>", "").strip()
+        elif post_proc_type in ["description_with_bboxes", "bboxes"]:
+            bboxes = [inst["bbox"] for inst in parsed]
+            labels = [inst["cat_name"] for inst in parsed]
+            final_answer = {"bboxes": bboxes, "labels": labels}
+            # Scores are reported only when the parsed instances carry them.
+            if parsed and "score" in parsed[0]:
+                final_answer["scores"] = [inst["score"] for inst in parsed]
+        elif post_proc_type == "ocr":
+            quad_boxes = [inst["quad_box"] for inst in parsed]
+            labels = [inst["text"] for inst in parsed]
+            final_answer = {"quad_boxes": quad_boxes, "labels": labels}
+        elif post_proc_type == "phrase_grounding":
+            # One phrase may ground to several boxes; the label is repeated once per box.
+            bboxes = []
+            labels = []
+            for inst in parsed:
+                for bbox in inst["bbox"]:
+                    bboxes.append(bbox)
+                    labels.append(inst["cat_name"])
+            final_answer = {"bboxes": bboxes, "labels": labels}
+        elif post_proc_type in ["description_with_polygons", "polygons"]:
+            polygons = [inst["polygons"] for inst in parsed]
+            labels = [inst["cat_name"] for inst in parsed]
+            final_answer = {"polygons": polygons, "labels": labels}
+        elif post_proc_type == "description_with_bboxes_or_polygons":
+            # Each instance holds either polygons or a box; the two kinds are reported separately.
+            bboxes = []
+            bboxes_labels = []
+            polygons = []
+            polygons_labels = []
+            for inst in parsed:
+                label = inst["cat_name"]
+                if "polygons" in inst:
+                    polygons.append(inst["polygons"])
+                    polygons_labels.append(label)
+                else:
+                    bboxes.append(inst["bbox"])
+                    bboxes_labels.append(label)
+            final_answer = {
+                "bboxes": bboxes,
+                "bboxes_labels": bboxes_labels,
+                "polygons": polygons,
+                "polygons_labels": polygons_labels,
+            }
+        else:
+            raise ValueError(f"Unknown post-processing type: {post_proc_type}")
+
+        return {task: final_answer}
+
+
+class Florence2PostProcessor:
+    """
+    Post-processor for Florence-2 model outputs. Parses generated text into structured results for various tasks
+    like object detection, OCR, phrase grounding, etc.
+
+    Args:
+        tokenizer (`PreTrainedTokenizer`):
+            The tokenizer used for decoding model outputs.
+    """
+
+    def __init__(self, config, tokenizer):
+        """
+        Store the tokenizer and the task parsing rules.
+
+        Args:
+            config (`dict`, *optional*):
+                Task-specific parsing rules; a falsy value is replaced by an empty dictionary.
+            tokenizer (`PreTrainedTokenizer`):
+                Tokenizer whose `all_special_tokens` are cached as a set.
+        """
+        self.tokenizer = tokenizer
+        self.parse_task_config = config if config else {}
+
+        grounding_rules = self.parse_task_config.get("phrase_grounding", {})
+        self.banned_grounding_tokens = set(grounding_rules.get("banned_grounding_tokens", []))
+
+        self.all_special_tokens = set(tokenizer.all_special_tokens)
+        # Number of quantization bins along (width, height).
+        self.quantize_bins = (1000, 1000)
+
+    def quantize(self, locations: "torch.Tensor", size: tuple[int, int]) -> "torch.Tensor":
+        """
+        Map pixel coordinates to integer bin indices.
+
+        Args:
+            locations (`torch.Tensor`):
+                Tensor whose last dimension is 4 for boxes `[xmin, ymin, xmax, ymax]` or 2 for points `[x, y]`.
+            size (`tuple[int, int]`):
+                Original image size (width, height).
+
+        Returns:
+            `torch.Tensor`: Bin indices as integers, clamped to `[0, bins - 1]` along each axis.
+
+        Raises:
+            ValueError: If the last dimension of `locations` is neither 2 nor 4.
+        """
+        bins_w, bins_h = self.quantize_bins
+        size_w, size_h = size
+        per_bin_w = size_w / bins_w
+        per_bin_h = size_h / bins_h
+
+        if locations.shape[-1] not in (2, 4):
+            raise ValueError(f"Unsupported location shape: last dim must be 2 or 4, got {locations.shape[-1]}.")
+
+        # Components alternate x, y in both layouts: even positions use the width scale, odd ones the height scale.
+        quantized = []
+        for position, component in enumerate(locations.split(1, dim=-1)):
+            if position % 2 == 0:
+                bin_extent, bin_count = per_bin_w, bins_w
+            else:
+                bin_extent, bin_count = per_bin_h, bins_h
+            quantized.append((component / bin_extent).floor().clamp(0, bin_count - 1))
+        return torch.cat(quantized, dim=-1).int()
+
+    def dequantize(self, locations: "torch.Tensor", size: tuple[int, int]) -> "torch.Tensor":
+        """
+        Map bin indices back to pixel coordinates in the original image.
+
+        Each index is shifted by 0.5 so that the centre of its bin is used, scaled by the bin extent, and the result
+        is cast to integers.
+
+        Args:
+            locations (`torch.Tensor`):
+                Quantized tensor whose last dimension is 4 for boxes or 2 for points/coordinates.
+            size (`tuple[int, int]`):
+                Original image size (width, height).
+
+        Returns:
+            `torch.Tensor`: Dequantized locations, cast to integers.
+
+        Raises:
+            ValueError: If the last dimension of `locations` is neither 2 nor 4.
+        """
+        bins_w, bins_h = self.quantize_bins
+        size_w, size_h = size
+        per_bin_w = size_w / bins_w
+        per_bin_h = size_h / bins_h
+
+        if locations.shape[-1] not in (2, 4):
+            raise ValueError(f"Unsupported location shape: last dim must be 2 or 4, got {locations.shape[-1]}.")
+
+        # Components alternate x, y in both layouts: even positions use the width scale, odd ones the height scale.
+        restored = []
+        for position, component in enumerate(locations.split(1, dim=-1)):
+            bin_extent = per_bin_w if position % 2 == 0 else per_bin_h
+            restored.append((component + 0.5) * bin_extent)
+        return torch.cat(restored, dim=-1).int()
+
+    def decode_with_spans(self, token_ids: list[int]) -> tuple[str, list[tuple[int, int]]]:
+        """
+        Decode token IDs to text and record where each token lands in that text.
+
+        Special tokens are kept verbatim; every other token is converted to its string form on its own.
+
+        Args:
+            token_ids (`list[int]`):
+                list of token IDs to decode.
+
+        Returns:
+            `tuple[str, list[tuple[int, int]]]`: Decoded text and, for each token, its `(start, end)` character
+            offsets in that text, with `end` exclusive.
+        """
+        tokens = self.tokenizer.convert_ids_to_tokens(token_ids, skip_special_tokens=False)
+
+        pieces = []
+        spans = []
+        cursor = 0  # length of the text assembled so far
+        for token in tokens:
+            is_special = token in self.all_special_tokens
+            piece = token if is_special else self.tokenizer.convert_tokens_to_string([token])
+            end = cursor + len(piece)
+            spans.append((cursor, end))
+            pieces.append(piece)
+            cursor = end
+        return "".join(pieces), spans
+
+    def parse_ocr_from_text_and_spans(
+        self, text: str, pattern: Optional[str], image_size: tuple[int, int], area_threshold: float = 0.0
+    ) -> list[dict[str, Any]]:
+        """
+        Parse OCR results with quadrilateral boxes.
+
+        Args:
+            text (`str`):
+                The generated text. `<s>`, `</s>` and `<pad>` are removed before matching.
+            pattern (`str`, *optional*):
+                Regex whose first group is the recognised text and whose remaining groups are the eight location
+                bins of the quadrilateral. When `None`, a default pattern of text followed by eight `<loc_N>`
+                tokens is used.
+            image_size (`tuple[int, int]`):
+                Image size (width, height).
+            area_threshold (`float`, *optional*, defaults to 0.0):
+                Minimum quadrilateral area, as a fraction of the image area, for a box to be kept. Filtering is
+                skipped when the value is not positive.
+
+        Returns:
+            `list[dict[str, Any]]`: list of instances with 'quad_box' and 'text'.
+        """
+        text = text.replace("<s>", "").replace("</s>", "").replace("<pad>", "")
+        if pattern is None:
+            pattern = r"(.+?)<loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)>"
+
+        matches = re.findall(pattern, text)
+        instances = []
+        width, height = image_size
+
+        for content, *quad_str in matches:
+            quad_bins = [int(i) for i in quad_str]
+            quad_box = self.dequantize(torch.tensor(quad_bins).reshape(-1, 2), size=image_size).flatten().tolist()
+
+            if area_threshold > 0:
+                x_coords = quad_box[0::2]
+                y_coords = quad_box[1::2]
+                # Apply the Shoelace formula. Each vertex is paired with the next one, and the last vertex is
+                # paired with the first so that the closing edge of the polygon is included.
+                next_x = x_coords[1:] + x_coords[:1]
+                next_y = y_coords[1:] + y_coords[:1]
+                area = 0.5 * abs(
+                    sum(x * ny - nx * y for x, y, nx, ny in zip(x_coords, y_coords, next_x, next_y))
+                )
+
+                if area < (width * height) * area_threshold:
+                    continue
+
+            instances.append({"quad_box": quad_box, "text": content.strip()})
+        return instances
+
+    def parse_phrase_grounding_from_text_and_spans(
+        self, text: str, image_size: tuple[int, int]
+    ) -> list[dict[str, Any]]:
+        """
+        Extract grounded phrases, each followed by one or more boxes, from the generated text.
+
+        Phrases listed in `self.banned_grounding_tokens` are skipped.
+
+        Args:
+            text (`str`):
+                The generated text.
+            image_size (`tuple[int, int]`):
+                Image size (width, height).
+
+        Returns:
+            `list[dict[str, Any]]`: one instance per phrase, with 'bbox' (every box of that phrase) and 'cat_name'.
+        """
+        cleaned = text.replace("<s>", "").replace("</s>", "").replace("<pad>", "")
+        text_pattern = r"^\s*(.*?)(?=<od>|</od>|<box>|</box>|<bbox>|</bbox>|<loc_)"
+        box_pattern = r"<loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)>"
+
+        instances = []
+        # Every chunk is a run of non-tag text directly followed by at least four location tokens.
+        for chunk in re.findall(r"([^<]+(?:<loc_\d+>){4,})", cleaned):
+            chunk = chunk.replace("<ground>", "", 1).replace("<obj>", "", 1)
+            phrase_match = re.search(text_pattern, chunk) if chunk else None
+            if phrase_match is None:
+                continue
+            phrase = phrase_match.group().strip()
+            if phrase in self.banned_grounding_tokens:
+                continue
+            # Location tokens are read four at a time; a trailing incomplete group is ignored.
+            bbox_bins = [[int(value) for value in quad] for quad in re.findall(box_pattern, chunk)]
+            if not bbox_bins:
+                continue
+            bboxes = self.dequantize(torch.tensor(bbox_bins), size=image_size).tolist()
+            # Non-ASCII characters are dropped from the reported phrase.
+            ascii_phrase = phrase.encode("ascii", "ignore").decode("ascii")
+            instances.append({"bbox": bboxes, "cat_name": ascii_phrase})
+        return instances
+
+    def _find_matched_token_indices(self, cur_span: tuple[int, int], token_spans: list[tuple[int, int]]) -> list[int]:
+        return [i for i, span in enumerate(token_spans) if span[1] > cur_span[0] and span[0] < cur_span[1]]
+
+    def parse_description_with_bboxes_from_text_and_spans(
+        self,
+        text: str,
+        image_size: tuple[int, int],
+        allow_empty_phrase: bool = False,
+    ) -> list[dict[str, Any]]:
+        """
+        Parse phrases followed by location tokens into one instance per bounding box.
+
+        Args:
+            text (`str`):
+                The generated text.
+            image_size (`tuple[int, int]`):
+                Image size (width, height).
+            allow_empty_phrase (`bool`, *optional*, defaults to `False`):
+                Match bare runs of location tokens; the reported phrase is then an empty string.
+
+        Returns:
+            `list[dict[str, Any]]`: list of instances with 'bbox' and 'cat_name'.
+        """
+        cleaned = text.replace("<s>", "").replace("</s>", "").replace("<pad>", "")
+
+        # A chunk is a run of at least four location tokens, preceded by non-tag text unless empty
+        # phrases are allowed.
+        if allow_empty_phrase:
+            chunk_pattern = r"(?:(?:<loc_\d+>){4,})"
+        else:
+            chunk_pattern = r"([^<]+(?:<loc_\d+>){4,})"
+        # The phrase is whatever precedes the first structural tag or location token.
+        text_pattern = r"^\s*(.*?)(?=<od>|</od>|<box>|</box>|<bbox>|</bbox>|<loc_)"
+        # Boxes are consecutive groups of four location bins.
+        box_pattern = r"<loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)>"
+
+        instances = []
+        for chunk in re.findall(chunk_pattern, cleaned):
+            chunk = chunk.replace("<ground>", "", 1).replace("<obj>", "", 1)
+            if not (chunk or allow_empty_phrase):
+                continue
+            phrase_match = re.search(text_pattern, chunk)
+            if phrase_match is None:
+                continue
+            bbox_bins = [[int(value) for value in quad] for quad in re.findall(box_pattern, chunk)]
+            if not bbox_bins:
+                continue
+
+            # Non-ASCII characters are dropped from the reported phrase.
+            phrase = phrase_match.group().strip().encode("ascii", "ignore").decode("ascii")
+            boxes = self.dequantize(torch.tensor(bbox_bins), size=image_size).tolist()
+            # One instance per box; boxes that follow the same phrase share its name.
+            instances.extend({"bbox": box, "cat_name": phrase} for box in boxes)
+
+        return instances
+
+    def parse_description_with_polygons_from_text_and_spans(
+        self,
+        text: str,
+        image_size: tuple[int, int],
+        allow_empty_phrase: bool = False,
+        polygon_sep_token: str = "<sep>",
+        polygon_start_token: str = "<poly>",
+        polygon_end_token: str = "</poly>",
+        with_box_at_start: bool = False,
+    ) -> list[dict[str, Any]]:
+        """
+        Parse descriptions with polygons.
+
+        Args:
+            text (`str`):
+                The generated text.
+            image_size (`tuple[int, int]`):
+                Image size (width, height).
+            allow_empty_phrase (`bool`, *optional*, defaults to `False`):
+                Allow phrases without text.
+            polygon_sep_token (`str`, *optional*, defaults to "<sep>"):
+                Token separating the polygons of one instance.
+            polygon_start_token (`str`, *optional*, defaults to "<poly>"):
+                Start token for polygons; it also terminates the phrase text.
+            polygon_end_token (`str`, *optional*, defaults to "</poly>"):
+                End token for polygons.
+            with_box_at_start (`bool`, *optional*, defaults to `False`):
+                Whether the first four bins of an instance are a bounding box rather than polygon points.
+
+        Returns:
+            `list[dict[str, Any]]`: list of instances with 'polygons', 'cat_name', and optional 'bbox'.
+        """
+        text = text.replace("<s>", "").replace("</s>", "").replace("<pad>", "")
+
+        sep, start, end = (re.escape(tok) for tok in (polygon_sep_token, polygon_start_token, polygon_end_token))
+        structure_tokens = rf"(?:<loc_\d+>|{sep}|{start}|{end}){{4,}}"
+        pattern = rf"(?:{structure_tokens})" if allow_empty_phrase else rf"([^<]+{structure_tokens})"
+        # The phrase ends at the first structural tag; the configured polygon start token is used here
+        # (rather than a hard-coded "<poly>") so custom tokens do not leak into the phrase.
+        phrase_pattern = rf"^\s*(.*?)(?=<od>|</od>|<box>|</box>|<bbox>|</bbox>|<loc_|{start})"
+        poly_instance_pattern = rf"{start}(.*?){end}"
+        box_pattern = rf"((?:<loc_\d+>)+)(?:{sep}|$)"
+
+        instances = []
+        for phrase_text in re.findall(pattern, text):
+            phrase_text_strip = re.sub(r"^<loc_\d+>", "", phrase_text, count=1)
+            if not phrase_text_strip and not allow_empty_phrase:
+                continue
+            match = re.search(phrase_pattern, phrase_text_strip)
+            if not match:
+                continue
+            phrase = match.group().strip()
+
+            if polygon_start_token in phrase_text and polygon_end_token in phrase_text:
+                poly_instances = [m.group(1) for m in re.finditer(poly_instance_pattern, phrase_text)]
+            else:
+                poly_instances = [phrase_text]
+
+            for poly_inst in poly_instances:
+                poly_matches = list(re.finditer(box_pattern, poly_inst))
+                if not poly_matches:
+                    continue
+                bbox = []
+                polygons = []
+                for poly_match in poly_matches:
+                    poly_bins = [int(value) for value in re.findall(r"<loc_(\d+)>", poly_match.group(1))]
+                    if with_box_at_start and not bbox:
+                        if len(poly_bins) > 4:
+                            bbox, poly_bins = poly_bins[:4], poly_bins[4:]
+                        else:
+                            bbox = [0, 0, 0, 0]
+                    # Coordinates come in (x, y) pairs: drop a dangling bin, and skip the polygon if
+                    # nothing is left (reshaping an empty tensor with -1 would raise).
+                    if len(poly_bins) % 2 == 1:
+                        poly_bins = poly_bins[:-1]
+                    if not poly_bins:
+                        continue
+                    points = torch.tensor(poly_bins).reshape(-1, 2)
+                    polygons.append(self.dequantize(points, size=image_size).flatten().tolist())
+
+                instance = {"cat_name": phrase, "polygons": polygons}
+                if bbox:
+                    instance["bbox"] = self.dequantize(torch.tensor([bbox]), size=image_size)[0].tolist()
+                instances.append(instance)
+        return instances
+
+    def __call__(self, text=None, sequence=None, image_size=None, parse_tasks=None) -> dict[str, Any]:
+        """
+        Process model output and parse into task-specific results.
+
+        Args:
+            text (`Optional[str]`, *optional*):
+                Generated text. Exactly one of `text` and `sequence` must be provided.
+            sequence (`Optional[Union[list[int], torch.Tensor]]`, *optional*):
+                Token sequence, decoded after dropping a leading BOS token. Exclusive with `text`.
+            image_size (`Optional[tuple[int, int]]`, *optional*):
+                Image size (width, height) required for dequantization.
+            parse_tasks (`Optional[Union[str, list[str]]]`, *optional*):
+                Specific tasks to parse. If None, parse all supported tasks. Unknown names raise `ValueError`.
+
+        Returns:
+            `dict[str, Any]`: Parsed results for each task, including the raw 'text'.
+        """
+        if parse_tasks is not None:
+            parse_tasks = [parse_tasks] if isinstance(parse_tasks, str) else parse_tasks
+            for task in parse_tasks:
+                if task not in self.parse_task_config:
+                    raise ValueError(f"Unsupported parse task: {task}")
+
+        if (text is None) == (sequence is None):
+            raise ValueError("Exactly one of 'text' or 'sequence' must be provided.")
+
+        if sequence is not None:
+            sequence = sequence.tolist() if isinstance(sequence, torch.Tensor) else sequence
+            if sequence and sequence[0] == self.tokenizer.bos_token_id:  # an empty sequence has no BOS to skip
+                sequence = sequence[1:]
+            text, _ = self.decode_with_spans(sequence)
+
+        parsed_dict = {"text": text}
+
+        tasks_to_parse = parse_tasks or self.parse_task_config.keys()
+        for task in tasks_to_parse:
+            config = self.parse_task_config[task]
+            pattern = config.get("PATTERN")
+
+            if task == "ocr":
+                parsed_dict["ocr"] = self.parse_ocr_from_text_and_spans(
+                    text, pattern=pattern, image_size=image_size, area_threshold=config.get("AREA_THRESHOLD", 0.0)
+                )
+            elif task == "phrase_grounding":
+                parsed_dict["phrase_grounding"] = self.parse_phrase_grounding_from_text_and_spans(
+                    text, image_size=image_size
+                )
+            elif task == "pure_text":
+                parsed_dict["pure_text"] = text
+            elif task == "description_with_bboxes":
+                parsed_dict["description_with_bboxes"] = self.parse_description_with_bboxes_from_text_and_spans(
+                    text, image_size=image_size
+                )
+            elif task == "description_with_polygons":
+                parsed_dict["description_with_polygons"] = self.parse_description_with_polygons_from_text_and_spans(
+                    text, image_size=image_size
+                )
+            elif task == "polygons":
+                parsed_dict["polygons"] = self.parse_description_with_polygons_from_text_and_spans(
+                    text, image_size=image_size, allow_empty_phrase=True
+                )
+            elif task == "bboxes":
+                parsed_dict["bboxes"] = self.parse_description_with_bboxes_from_text_and_spans(
+                    text, image_size=image_size, allow_empty_phrase=True
+                )
+            elif task == "description_with_bboxes_or_polygons":
+                if "<poly>" in text:
+                    instances = self.parse_description_with_polygons_from_text_and_spans(text, image_size=image_size)
+                else:
+                    instances = self.parse_description_with_bboxes_from_text_and_spans(text, image_size=image_size)
+                parsed_dict["description_with_bboxes_or_polygons"] = instances
+            else:
+                raise ValueError(f"task {task} is not supported")
+
+        return parsed_dict
+
+
+class Florence2VisionDropPath(BeitDropPath):
+    pass  # no overrides: behaves exactly like `BeitDropPath`, only the class name differs
+
+
+class Florence2VisionLearnedAbsolutePositionEmbedding2D(nn.Module):
+    """
+    Learned 2D absolute position embedding: one table indexed by row and one indexed by column.
+    """
+
+    def __init__(self, config: Florence2Config):
+        super().__init__()
+        max_positions = config.vision_config.max_position_embeddings
+        channels = config.vision_config.embed_dim[-1]
+        self.row_embeddings = nn.Embedding(max_positions, channels // 2)
+        self.column_embeddings = nn.Embedding(max_positions, channels - (channels // 2))
+
+    def forward(self, pixel_values, pixel_mask=None):
+        # `pixel_mask` is accepted but unused; only the shape and device of `pixel_values` are read.
+        height, width = pixel_values.shape[-2:]
+        column_embeds = self.column_embeddings(torch.arange(width, device=pixel_values.device))
+        row_embeds = self.row_embeddings(torch.arange(height, device=pixel_values.device))
+        # Tile both tables over the (height, width) grid, column channels first, then row channels.
+        grid = torch.cat(
+            [column_embeds.unsqueeze(0).repeat(height, 1, 1), row_embeds.unsqueeze(1).repeat(1, width, 1)], dim=-1
+        )
+        # (height, width, channels) -> (pixel_values.shape[0], channels, height, width)
+        return grid.permute(2, 0, 1).unsqueeze(0).repeat(pixel_values.shape[0], 1, 1, 1)
+
+
+class Florence2VisionPositionalEmbeddingCosine1D(nn.Module):
+    """
+    Fixed (non-learned) 1D sinusoidal position table, computed once at construction and stored as a buffer.
+    """
+
+    def __init__(self, config: Florence2Config):
+        super().__init__()
+        self.embed_dim = config.vision_config.embed_dim[-1]
+        self.max_seq_len = config.vision_config.max_temporal_embeddings
+        sine, cosine = self.get_sinusoid_embeddings(max_positions=self.max_seq_len, embed_dim=self.embed_dim)
+        # Interleave the two halves: even channels hold the sines, odd channels the cosines.
+        table = torch.empty((self.max_seq_len, self.embed_dim))
+        table[:, 0::2] = sine
+        table[:, 1::2] = cosine
+        # Registered as a buffer, not as a parameter: the table is constant.
+        self.register_buffer("pos_idx_to_embed", table)
+
+    @staticmethod
+    def get_sinusoid_embeddings(max_positions: int, embed_dim: int):
+        """Return `(sin, cos)` tables, each of shape `(max_positions, embed_dim // 2)`."""
+        half_dim = embed_dim // 2
+        # Frequencies decay geometrically from 1 towards 1/10000 across the half dimension.
+        log_step = math.log(10000) / half_dim
+        inv_freq = torch.exp(torch.arange(half_dim, dtype=torch.int64).float() * -log_step)
+        angles = torch.arange(max_positions, dtype=torch.float).unsqueeze(1) * inv_freq.unsqueeze(0)
+        return torch.sin(angles), torch.cos(angles)
+
+    def forward(self, seq_embeds: torch.Tensor) -> torch.Tensor:
+        """Return the first `seq_embeds.size(1)` rows of the table; only the input's length is used."""
+        len_seq = seq_embeds.size(1)
+        if len_seq > self.max_seq_len:
+            raise ValueError(f"Maximum sequence length {self.max_seq_len}, got {len_seq}")
+        return self.pos_idx_to_embed[0:len_seq, :]
+
+
+class Florence2VisionMLP(Llama4VisionMLP):  # per-stage MLP: embed_dim -> embed_dim * mlp_ratio -> embed_dim
+    def __init__(self, config: Florence2VisionConfig, stage_idx: int):
+        super().__init__(config)  # NOTE(review): presumably the parent defines fc1/fc2 too; they are set below — confirm
+        self.fc1 = nn.Linear(config.embed_dim[stage_idx], int(config.embed_dim[stage_idx] * config.mlp_ratio))
+        self.activation_fn = ACT2FN[config.activation_function]  # activation looked up by its configured name
+        self.fc2 = nn.Linear(int(config.embed_dim[stage_idx] * config.mlp_ratio), config.embed_dim[stage_idx])
+
+
+class Florence2VisionConvEmbed(nn.Module):
+    """Image to Patch Embedding: a strided convolution with a LayerNorm applied either before or after it."""
+
+    def __init__(self, config: Florence2VisionConfig, stage_idx: int):
+        super().__init__()
+        self.config = config
+        self.stage_idx = stage_idx
+        self.patch_size = config.patch_size[stage_idx]
+        # Stage 0 reads the raw input channels; every later stage reads the previous stage's embedding.
+        self.in_channels = config.embed_dim[stage_idx - 1] if stage_idx != 0 else config.in_channels
+        self.embed_dim = config.embed_dim[stage_idx]
+        self.stride = config.patch_stride[stage_idx]
+        self.padding = config.patch_padding[stage_idx]
+        self.pre_norm = config.patch_prenorm[stage_idx]
+
+        self.conv = nn.Conv2d(
+            self.in_channels, self.embed_dim, kernel_size=self.patch_size, stride=self.stride, padding=self.padding
+        )
+
+        # The norm is sized for the channels it sees: input channels before the conv, output channels after.
+        self.norm = nn.LayerNorm(self.in_channels if self.pre_norm else self.embed_dim)
+
+    def _norm_over_channels(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        """Apply `self.norm` over the channel axis of a channels-first tensor."""
+        # LayerNorm works on the last dimension, so channels are moved last and then back.
+        return self.norm(hidden_states.permute(0, 2, 3, 1)).permute(0, 3, 1, 2)
+
+    def forward(self, hidden_states: torch.Tensor):
+        """Embed a channels-first feature map; the norm runs before or after the conv per `self.pre_norm`."""
+        # `self.norm` is always created in `__init__`, so in practice only `self.pre_norm` decides.
+        if self.norm and self.pre_norm:
+            hidden_states = self._norm_over_channels(hidden_states)
+
+        hidden_states = self.conv(hidden_states)
+
+        if self.norm and not self.pre_norm:
+            hidden_states = self._norm_over_channels(hidden_states)
+        return hidden_states
+
+
+class Florence2VisionChannelAttention(nn.Module):
+    """
+    Grouped channel attention: channels attend to channels within each group, using tokens as features.
+    """
+
+    def __init__(self, config: Florence2VisionConfig, stage_idx: int):
+        super().__init__()
+        self.config = config
+        self.dim = config.embed_dim[stage_idx]
+        self.groups = config.num_groups[stage_idx]
+        self.qkv = nn.Linear(self.dim, self.dim * 3, bias=config.qkv_bias)
+        self.proj = nn.Linear(self.dim, self.dim)
+        self.is_causal = False
+
+    def forward(self, hidden_states: torch.Tensor):
+        batch_size, num_tokens, hidden_size = hidden_states.shape
+        channels_per_group = hidden_size // self.groups
+
+        # (batch, tokens, 3, groups, channels_per_group) -> (3, batch, groups, channels_per_group, tokens),
+        # i.e. the channel axis plays the role of the sequence and the token axis that of the head dim.
+        qkv = self.qkv(hidden_states).reshape(batch_size, num_tokens, 3, self.groups, channels_per_group)
+        query, key, value = qkv.permute(2, 0, 3, 4, 1).unbind(0)
+
+        # Pick the attention backend configured on the model, defaulting to the eager implementation.
+        if self.config._attn_implementation == "eager":
+            attention_interface: Callable = eager_attention_forward
+        else:
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+        # The scale uses the token count because tokens are the feature axis of this attention.
+        attn_output, _ = attention_interface(
+            self, query, key, value, attention_mask=None, scaling=num_tokens**-0.5
+        )
+        # NOTE(review): presumably the interface returns (batch, channels_per_group, groups, tokens) — confirm
+        attn_output = attn_output.permute(0, 3, 2, 1).reshape(batch_size, num_tokens, hidden_size)
+
+        return self.proj(attn_output)
+
+
+class Florence2VisionChannelBlock(nn.Module):
+    """
+    Channel-mixing block: depthwise conv + grouped channel attention, then depthwise conv + MLP.
+
+    Both halves are pre-normed and wrapped in a residual connection with optional drop path.
+
+    Args:
+        config (`Florence2VisionConfig`):
+            Vision configuration; `embed_dim[stage_idx]` sets the width of every layer in the block.
+        stage_idx (`int`):
+            Index of the stage this block belongs to.
+        drop_path_rate (`float`):
+            Drop-path rate; drop path is replaced by `nn.Identity` when it is not positive.
+    """
+
+    def __init__(
+        self,
+        config: Florence2VisionConfig,
+        stage_idx: int,
+        drop_path_rate: float,
+    ):
+        super().__init__()
+
+        self.config = config
+        dim_in = config.embed_dim[stage_idx]
+
+        # Attention half. `groups=dim_in` makes the 3x3 convolution depthwise.
+        self.conv1 = nn.Conv2d(dim_in, dim_in, kernel_size=3, padding=1, groups=dim_in)
+        self.norm1 = nn.LayerNorm(dim_in)
+        self.channel_attn = Florence2VisionChannelAttention(config=config, stage_idx=stage_idx)
+        self.drop_path1 = Florence2VisionDropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity()
+
+        # Feed-forward half, laid out like the attention half.
+        self.conv2 = nn.Conv2d(dim_in, dim_in, kernel_size=3, padding=1, groups=dim_in)
+        self.norm2 = nn.LayerNorm(dim_in)
+        self.ffn = Florence2VisionMLP(config=config, stage_idx=stage_idx)
+        self.drop_path2 = Florence2VisionDropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity()
+
+    def forward(self, hidden_states: torch.Tensor):
+        """
+        Run both halves on a `(batch, channels, height, width)` tensor and return the same layout.
+        """
+        batch_size, embed_dim, height, width = hidden_states.shape
+        # Both halves share one recipe and differ only in the modules involved.
+        halves = (
+            (self.conv1, self.norm1, self.channel_attn, self.drop_path1),
+            (self.conv2, self.norm2, self.ffn, self.drop_path2),
+        )
+
+        for conv, norm, mixer, drop_path in halves:
+            # Depthwise conv with a skip connection, on the spatial layout.
+            hidden_states = conv(hidden_states) + hidden_states
+            # Flatten the grid into tokens: (batch, height * width, channels).
+            tokens = hidden_states.flatten(2).transpose(1, 2)
+            # Pre-norm residual branch: channel attention in the first half, MLP in the second.
+            tokens = tokens + drop_path(mixer(norm(tokens)))
+            # Back to the spatial layout for the next conv / the caller.
+            hidden_states = tokens.transpose(1, 2).view(batch_size, embed_dim, height, width)
+
+        return hidden_states
+
+
+class Florence2VisionWindowAttention(nn.Module):  # multi-head self-attention inside non-overlapping square windows
+    def __init__(self, config: Florence2VisionConfig, stage_idx: int):
+        super().__init__()
+        self.config = config
+        self.dim = config.embed_dim[stage_idx]
+        self.window_size = config.window_size  # side length of a window, in grid cells
+        self.num_heads = config.num_heads[stage_idx]
+        head_dim = self.dim // self.num_heads
+        self.scale = head_dim**-0.5  # passed to the attention interface as `scaling`
+
+        self.qkv = nn.Linear(self.dim, self.dim * 3, bias=config.qkv_bias)  # fused query/key/value projection
+        self.proj = nn.Linear(self.dim, self.dim)
+        self.is_causal = False
+
+    def forward(self, hidden_states: torch.Tensor):
+        batch_size, height, width, embed_dim = hidden_states.shape  # channels-last input
+
+        # Pad the bottom and right edges so that height and width become multiples of the window size
+        pad_left = pad_top = 0
+        pad_right = (self.window_size - width % self.window_size) % self.window_size
+        pad_bottom = (self.window_size - height % self.window_size) % self.window_size
+        hidden_states = F.pad(hidden_states, (0, 0, pad_left, pad_right, pad_top, pad_bottom))  # last dim first
+        _, padded_height, padded_width, _ = hidden_states.shape
+
+        # Partition input into non-overlapping windows (for local spatial attention in DaViT)
+        hidden_states = hidden_states.view(
+            batch_size,
+            padded_height // self.window_size,
+            self.window_size,
+            padded_width // self.window_size,
+            self.window_size,
+            embed_dim,
+        )
+        windowed_hidden_states = hidden_states.permute(0, 1, 3, 2, 4, 5).contiguous()  # window indices first
+        windowed_hidden_states = windowed_hidden_states.view(-1, self.window_size * self.window_size, embed_dim)
+
+        # Generate Q, K, V for each window; every window is treated as an independent sequence
+        num_windows_per_batch, num_tokens_per_window, embed_dim = windowed_hidden_states.shape
+        qkv = self.qkv(windowed_hidden_states).reshape(
+            num_windows_per_batch, num_tokens_per_window, 3, self.num_heads, embed_dim // self.num_heads
+        )
+        qkv = qkv.permute(2, 0, 3, 1, 4)  # (3, windows, heads, tokens_per_window, head_dim)
+        query, key, value = qkv.unbind(0)
+
+        attention_interface: Callable = eager_attention_forward  # default backend
+        if self.config._attn_implementation != "eager":
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+        windowed_hidden_states, _ = attention_interface(
+            self,
+            query,
+            key,
+            value,
+            attention_mask=None,
+            scaling=self.scale,
+        )
+        # NOTE(review): assumes the interface output is laid out as (windows, tokens, heads, head_dim) — confirm
+        windowed_hidden_states = windowed_hidden_states.view(num_windows_per_batch, num_tokens_per_window, embed_dim)
+        windowed_hidden_states = self.proj(windowed_hidden_states)
+
+        # Merge windows back to original spatial layout (inverse of the partition above)
+        windowed_hidden_states = windowed_hidden_states.view(-1, self.window_size, self.window_size, embed_dim)
+        hidden_states = windowed_hidden_states.view(
+            -1,
+            padded_height // self.window_size,
+            padded_width // self.window_size,
+            self.window_size,
+            self.window_size,
+            embed_dim,
+        )
+        hidden_states = hidden_states.permute(0, 1, 3, 2, 4, 5).contiguous()
+        hidden_states = hidden_states.view(-1, padded_height, padded_width, embed_dim)
+        hidden_states = hidden_states[:, :height, :width, :].contiguous()  # drop the padding added above
+        hidden_states = hidden_states.view(batch_size, height * width, embed_dim)  # flatten the grid into tokens
+
+        return hidden_states
+
+
+class Florence2VisionSpatialBlock(nn.Module):
+    """
+    Spatial-mixing block: depthwise conv + windowed self-attention, then depthwise conv + MLP.
+
+    Both halves are pre-normed and wrapped in a residual connection with optional drop path.
+
+    Args:
+        config (`Florence2VisionConfig`):
+            Vision configuration; `embed_dim[stage_idx]` sets the width of every layer in the block.
+        stage_idx (`int`):
+            Index of the stage this block belongs to.
+        drop_path_rate (`float`):
+            Drop-path rate; drop path is replaced by `nn.Identity` when it is not positive.
+    """
+
+    def __init__(
+        self,
+        config: Florence2VisionConfig,
+        stage_idx: int,
+        drop_path_rate: float,
+    ):
+        super().__init__()
+
+        dim = config.embed_dim[stage_idx]
+
+        # Attention half. `groups=dim` makes the 3x3 convolution depthwise.
+        self.conv1 = nn.Conv2d(dim, dim, kernel_size=3, padding=1, groups=dim)
+        self.norm1 = nn.LayerNorm(dim)
+        self.window_attn = Florence2VisionWindowAttention(config=config, stage_idx=stage_idx)
+        self.drop_path1 = Florence2VisionDropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity()
+
+        # Feed-forward half, laid out like the attention half.
+        self.conv2 = nn.Conv2d(dim, dim, kernel_size=3, padding=1, groups=dim)
+        self.norm2 = nn.LayerNorm(dim)
+        self.ffn = Florence2VisionMLP(config=config, stage_idx=stage_idx)
+        self.drop_path2 = Florence2VisionDropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity()
+
+    def forward(self, hidden_states: torch.Tensor):
+        """
+        Run both halves on a `(batch, channels, height, width)` tensor and return the same layout.
+        """
+        batch_size, embed_dim, height, width = hidden_states.shape
+
+        # First half: depthwise conv with a skip connection, then window attention on the token view.
+        hidden_states = self.conv1(hidden_states) + hidden_states
+        tokens = hidden_states.flatten(2).transpose(1, 2)
+        # The window attention takes a channels-last grid, so the normed tokens are reshaped for it.
+        attn_output = self.window_attn(self.norm1(tokens).view(batch_size, height, width, embed_dim))
+        tokens = tokens + self.drop_path1(attn_output)
+        hidden_states = tokens.transpose(1, 2).view(batch_size, embed_dim, height, width)
+
+        # Second half: depthwise conv with a skip connection, then the MLP.
+        hidden_states = self.conv2(hidden_states) + hidden_states
+        tokens = hidden_states.flatten(2).transpose(1, 2)
+        tokens = tokens + self.drop_path2(self.ffn(self.norm2(tokens)))
+
+        # Back to the spatial layout expected by the caller.
+        return tokens.transpose(1, 2).view(batch_size, embed_dim, height, width)
+
+
+class Florence2VisionBlock(nn.Module):
+    """
+    One block of the vision backbone: a spatial block (window attention) followed by a channel block
+    (grouped channel attention), each with its own drop-path rate.
+    """
+
+    def __init__(
+        self,
+        config: Florence2VisionConfig,
+        stage_idx: int,
+        spatial_drop_path_rate: float,
+        channel_drop_path_rate: float,
+    ):
+        super().__init__()
+
+        shared_kwargs = {"config": config, "stage_idx": stage_idx}
+        # The spatial block is built first, then the channel block.
+        self.spatial_block = Florence2VisionSpatialBlock(drop_path_rate=spatial_drop_path_rate, **shared_kwargs)
+        self.channel_block = Florence2VisionChannelBlock(drop_path_rate=channel_drop_path_rate, **shared_kwargs)
+
+    def forward(self, hidden_states: torch.Tensor):
+        """Apply the spatial block, then the channel block."""
+        hidden_states = self.spatial_block(hidden_states)
+        return self.channel_block(hidden_states)
+
+
+@auto_docstring
+class Florence2VisionPreTrainedModel(PreTrainedModel):  # shared base of the vision modules; sets class-level options only
+    config_class = Florence2VisionConfig
+    main_input_name = "pixel_values"
+    _supports_sdpa = True  # NOTE(review): presumably attention-backend capability flags read by `PreTrainedModel`
+    _supports_flash_attn = True
+    _supports_flex_attn = True
+
+    _can_compile_fullgraph = True  # NOTE(review): presumably advertises torch.compile full-graph support — confirm
+
+
+@auto_docstring
+class Florence2VisionBackbone(Florence2VisionPreTrainedModel):
+    def __init__(self, config: Florence2VisionConfig):
+        super().__init__(config)
+        self.config = config
+
+        self.embed_dim = config.embed_dim
+        self.num_heads = config.num_heads
+        self.num_groups = config.num_groups
+        self.num_stages = len(self.embed_dim)
+
+        # Every per-stage list in the config must describe the same number of stages.
+        if not (self.num_stages == len(self.num_heads) == len(self.num_groups)):
+            raise ValueError(
+                f"Expected self.num_stages ({self.num_stages}) == "
+                f"len(self.num_heads) ({len(self.num_heads)}) == "
+                f"len(self.num_groups) ({len(self.num_groups)})"
+            )
+
+        # Drop-path rates grow linearly from 0 to `config.drop_path_rate`, one value per sub-block
+        # (spatial, then channel), handed out in network order.
+        num_sub_blocks = sum(config.depths) * 2
+        drop_path_rates = iter(torch.linspace(0, config.drop_path_rate, num_sub_blocks, device="cpu").tolist())
+
+        convs, blocks = [], []
+        for stage_idx in range(self.num_stages):
+            # Each stage starts with its own patch-embedding convolution.
+            convs.append(Florence2VisionConvEmbed(config=config, stage_idx=stage_idx))
+
+            # Keyword arguments are evaluated left to right, so spatial gets the earlier rate of each pair.
+            stage_blocks = [
+                Florence2VisionBlock(
+                    config=config,
+                    stage_idx=stage_idx,
+                    spatial_drop_path_rate=next(drop_path_rates),
+                    channel_drop_path_rate=next(drop_path_rates),
+                )
+                for _ in range(config.depths[stage_idx])
+            ]
+            blocks.append(nn.ModuleList(stage_blocks))
+
+        self.convs = nn.ModuleList(convs)
+        self.blocks = nn.ModuleList(blocks)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def forward(self, hidden_states: torch.Tensor):
+        """Run every stage in order: its patch-embedding conv followed by all of its blocks."""
+        for conv, stage_blocks in zip(self.convs, self.blocks):
+            hidden_states = conv(hidden_states)
+            for layer in stage_blocks:
+                hidden_states = layer(hidden_states)
+        return hidden_states
+
+
+class Florence2MultiModalProjector(nn.Module):
+    """
+    Adds position information to the vision features, pools them, and projects to `projection_dim`.
+    """
+
+    def __init__(self, config: Florence2Config):
+        super().__init__()
+        self.vision_embedding_dim = config.vision_config.embed_dim[-1]
+        self.vision_projection_dim = config.vision_config.projection_dim
+        self.image_projection = nn.Linear(self.vision_embedding_dim, self.vision_projection_dim, bias=False)
+        self.image_proj_norm = nn.LayerNorm(self.vision_projection_dim)
+        self.image_position_embed = Florence2VisionLearnedAbsolutePositionEmbedding2D(config=config)
+        self.visual_temporal_embed = Florence2VisionPositionalEmbeddingCosine1D(config=config)
+
+    def forward(self, image_features):
+        # Add 2D position embeddings on the grid, then flatten it: (batch, height * width, channels).
+        tokens = (image_features + self.image_position_embed(image_features)).flatten(2).transpose(1, 2)
+        # Only one temporal position is requested; its embedding is broadcast over all tokens.
+        tokens = tokens + self.visual_temporal_embed(tokens[:, :1, :]).unsqueeze(1)
+        tokens = tokens.unsqueeze(1)  # (batch, 1, height * width, channels)
+        # One pooled token (mean over the grid) followed by the per-position tokens (mean over the size-1 axis).
+        pooled = torch.cat([tokens.mean(dim=2), tokens.mean(dim=1)], dim=1)
+        return self.image_proj_norm(self.image_projection(pooled))
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Base class for Florence-2 base model's outputs that also contains : pre-computed hidden states that can speed up sequential
+    decoding.
+    """
+)
+class Florence2Seq2SeqModelOutput(Seq2SeqModelOutput):
+    r"""
+    image_hidden_states (`torch.FloatTensor`, *optional*):
+        A `torch.FloatTensor` of size `(batch_size, num_image_tokens, hidden_size)`.
+        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
+    """
+
+    # Only field added on top of the inherited `Seq2SeqModelOutput` fields; stays `None` unless the producer sets it.
+    image_hidden_states: Optional[torch.FloatTensor] = None
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Base class for Florence-2 model's outputs that also contains : pre-computed hidden states that can speed up sequential
+    decoding.
+    """
+)
+class Florence2Seq2SeqLMOutput(Seq2SeqLMOutput):
+    r"""
+    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+        Language modeling loss (for next-token prediction).
+    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
+        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+    image_hidden_states (`torch.FloatTensor`, *optional*):
+        A `torch.FloatTensor` of size `(batch_size, num_image_tokens, hidden_size)`.
+        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
+    """
+
+    # Annotated as a single tensor (not a tuple) to agree with the docstring above and with
+    # `Florence2Seq2SeqModelOutput.image_hidden_states`, whose value is forwarded into this field.
+    image_hidden_states: Optional[torch.FloatTensor] = None
+
+
+@auto_docstring
+class Florence2PreTrainedModel(LlavaPreTrainedModel):
+    # Configuration class associated with every Florence-2 model built on this base.
+    config_class = Florence2Config
+
+    # Class-level flag set to False for this model family.
+    # NOTE(review): presumably read by the parent class to decide whether attention backends may be swapped -- confirm.
+    _supports_attention_backend = False
+
+
+@auto_docstring(
+    custom_intro="""
+    Florence-2 is a vision model for captioning, detection, and segmentation.
+    """
+)
+class Florence2Model(LlavaModel):
+    # Empty mapping: this class declares no checkpoint key renames of its own.
+    _checkpoint_conversion_mapping = {}
+    # Parameter names declared as tied: the encoder and decoder token embeddings of the language model.
+    _tied_weights_keys = [
+        "language_model.encoder.embed_tokens.weight",
+        "language_model.decoder.embed_tokens.weight",
+    ]
+
+    def __init__(self, config: Florence2Config):
+        super().__init__(config)
+        # Assigned after the parent constructor so the Florence-2 backbone is what ends up in `vision_tower`.
+        self.vision_tower = Florence2VisionBackbone(config=config.vision_config)
+
+    def get_encoder(self):
+        """Return the encoder of the wrapped language model."""
+        return self.language_model.get_encoder()
+
+    def get_decoder(self):
+        """Return the decoder of the wrapped language model."""
+        return self.language_model.get_decoder()
+
+    def get_image_features(self, pixel_values: torch.Tensor, **kwargs):
+        """
+        Obtains image last hidden states from the vision tower and applies the multimodal projection.
+
+        Args:
+            pixel_values (`torch.FloatTensor` of shape `(batch_size, channels, height, width)`):
+               The tensors corresponding to the input images.
+            **kwargs:
+                Forwarded unchanged to the vision tower.
+        Returns:
+            image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`.
+        """
+        image_features = self.vision_tower(pixel_values, **kwargs)
+        image_embeds = self.multi_modal_projector(image_features)
+        return image_embeds
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        decoder_input_ids: Optional[torch.LongTensor] = None,
+        decoder_attention_mask: Optional[torch.LongTensor] = None,
+        decoder_head_mask: Optional[torch.Tensor] = None,
+        cross_attn_head_mask: Optional[torch.Tensor] = None,
+        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
+        encoder_outputs: Optional[list[torch.FloatTensor]] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> Union[tuple, Florence2Seq2SeqModelOutput]:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # Bound up front: the encoder branch below is skipped when `encoder_outputs` is supplied, and the
+        # return statement must not hit an unbound name in that case (even if `pixel_values` was passed too).
+        image_features = None
+
+        if encoder_outputs is None:
+            if (input_ids is None) ^ (inputs_embeds is not None):
+                raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+            if inputs_embeds is None:
+                inputs_embeds = self.get_input_embeddings()(input_ids)
+
+            if pixel_values is not None:
+                # Write the projected image features into the positions selected by the placeholder mask.
+                image_features = self.get_image_features(pixel_values)
+                image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
+                special_image_mask = self.get_placeholder_mask(
+                    input_ids, inputs_embeds=inputs_embeds, image_features=image_features
+                )
+                inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
+
+            encoder_outputs = self.language_model.encoder(
+                attention_mask=attention_mask,
+                head_mask=head_mask,
+                inputs_embeds=inputs_embeds,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=True,
+            )
+
+        # Index access works both for the encoder's own output object and for a plain list/tuple of tensors
+        # handed in by the caller.
+        encoder_hidden_states = encoder_outputs[0]
+
+        # Synthesize start tokens only when the decoder was given no input at all; if the caller passed
+        # `decoder_inputs_embeds`, adding ids as well would hand the decoder two competing inputs.
+        if decoder_input_ids is None and decoder_inputs_embeds is None:
+            decoder_start_token_id = self.config.text_config.decoder_start_token_id
+            # `inputs_embeds` is still None when pre-computed `encoder_outputs` were supplied without it, so
+            # fall back to the encoder hidden states for the batch size and device.
+            reference = inputs_embeds if inputs_embeds is not None else encoder_hidden_states
+            decoder_input_ids = torch.full(
+                (reference.size(0), 1), decoder_start_token_id, dtype=torch.long, device=reference.device
+            )
+
+        decoder_outputs = self.language_model.decoder(
+            input_ids=decoder_input_ids,
+            attention_mask=decoder_attention_mask,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=attention_mask,
+            head_mask=decoder_head_mask,
+            cross_attn_head_mask=cross_attn_head_mask,
+            past_key_values=past_key_values,
+            inputs_embeds=decoder_inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            cache_position=cache_position,
+            return_dict=True,
+            **kwargs,
+        )
+
+        return Florence2Seq2SeqModelOutput(
+            last_hidden_state=decoder_outputs.last_hidden_state,
+            past_key_values=decoder_outputs.past_key_values,
+            decoder_hidden_states=decoder_outputs.hidden_states,
+            decoder_attentions=decoder_outputs.attentions,
+            cross_attentions=decoder_outputs.cross_attentions,
+            encoder_last_hidden_state=encoder_hidden_states,
+            # `getattr` keeps this working when `encoder_outputs` is a bare list/tuple without these attributes.
+            encoder_hidden_states=getattr(encoder_outputs, "hidden_states", None),
+            encoder_attentions=getattr(encoder_outputs, "attentions", None),
+            image_hidden_states=image_features,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    Florence-2 is a vision model for captioning, detection, and segmentation.
+    """
+)
+class Florence2ForConditionalGeneration(LlavaForConditionalGeneration):
+    # Empty mapping: this class declares no checkpoint key renames of its own.
+    _checkpoint_conversion_mapping = {}
+    # Parameter names declared as tied: both token embeddings of the language model and the LM head.
+    _tied_weights_keys = [
+        "model.language_model.encoder.embed_tokens.weight",
+        "model.language_model.decoder.embed_tokens.weight",
+        "lm_head.weight",
+    ]
+
+    def get_encoder(self):
+        """Return the encoder exposed by the wrapped base model (`self.model`)."""
+        return self.model.get_encoder()
+
+    def get_image_features(self, pixel_values: torch.Tensor, **kwargs):
+        """Delegate to `self.model.get_image_features`, forwarding `pixel_values` and any extra keyword arguments."""
+        return self.model.get_image_features(pixel_values=pixel_values, **kwargs)
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        decoder_input_ids: Optional[torch.LongTensor] = None,
+        decoder_attention_mask: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        decoder_head_mask: Optional[torch.Tensor] = None,
+        cross_attn_head_mask: Optional[torch.Tensor] = None,
+        encoder_outputs: Optional[list[torch.FloatTensor]] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> Union[tuple, Florence2Seq2SeqLMOutput]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+        Example:
+
+        ```python
+        >>> from PIL import Image
+        >>> import requests
+        >>> from transformers import AutoProcessor, Florence2ForConditionalGeneration
+
+        >>> model = Florence2ForConditionalGeneration.from_pretrained("microsoft/Florence-2-large")
+        >>> processor = AutoProcessor.from_pretrained("microsoft/Florence-2-large")
+
+        >>> prompt = "<CAPTION>"
+        >>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> inputs = processor(text=prompt, images=image, return_tensors="pt")
+
+        >>> # Generate
+        >>> generate_ids = model.generate(**inputs, max_length=100)
+        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "A green car parked in front of a yellow building."
+        ```"""
+        # Fall back to the config defaults for any output flag the caller left unset.
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # The base model is always asked for a structured output (`return_dict=True`) because the fields are
+        # read by name below.
+        outputs = self.model(
+            input_ids=input_ids,
+            pixel_values=pixel_values,
+            attention_mask=attention_mask,
+            decoder_input_ids=decoder_input_ids,
+            encoder_outputs=encoder_outputs,
+            decoder_attention_mask=decoder_attention_mask,
+            head_mask=head_mask,
+            decoder_head_mask=decoder_head_mask,
+            cross_attn_head_mask=cross_attn_head_mask,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            decoder_inputs_embeds=decoder_inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=True,
+            cache_position=cache_position,
+            **kwargs,
+        )
+
+        hidden_states = outputs[0]
+        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
+        # An int keeps the last `logits_to_keep` positions (0 keeps all, since `slice(-0, None)` is the full
+        # range); any other value is used directly as the index along the sequence axis.
+        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+        logits = self.lm_head(hidden_states[:, slice_indices, :])
+
+        # The loss is computed only when labels are supplied; otherwise it stays None.
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(
+                logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **kwargs
+            )
+
+        return Florence2Seq2SeqLMOutput(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            decoder_hidden_states=outputs.decoder_hidden_states,
+            decoder_attentions=outputs.decoder_attentions,
+            cross_attentions=outputs.cross_attentions,
+            encoder_last_hidden_state=outputs.encoder_last_hidden_state,
+            encoder_hidden_states=outputs.encoder_hidden_states,
+            encoder_attentions=outputs.encoder_attentions,
+            image_hidden_states=outputs.image_hidden_states,
+        )
+
+    def get_placeholder_mask(
+        self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor, image_features: torch.FloatTensor
+    ):
+        """Delegate to `self.model.get_placeholder_mask` with the same three arguments."""
+        return self.model.get_placeholder_mask(
+            input_ids=input_ids, inputs_embeds=inputs_embeds, image_features=image_features
+        )
+
+    def _prepare_encoder_decoder_kwargs_for_generation(
+        self,
+        inputs_tensor: torch.Tensor,
+        model_kwargs,
+        model_input_name: Optional[str],
+        generation_config,
+    ) -> dict[str, Any]:
+        """
+        Merge image features into the text embeddings, then let the parent implementation prepare the
+        encoder/decoder kwargs from those embeddings.
+
+        `inputs_embeds` and `pixel_values` are removed from `model_kwargs` here, and `inputs_embeds` is removed
+        again from the dict returned by the parent before it is handed back.
+        """
+        # override to handle merging image and text embeddings before passing to language encoder
+        inputs_embeds = model_kwargs.pop("inputs_embeds", None)
+        pixel_values = model_kwargs.pop("pixel_values", None)
+
+        if inputs_embeds is None:
+            inputs_embeds = self.get_input_embeddings()(inputs_tensor)
+
+        if pixel_values is not None:
+            # Same merge as in the base model's forward: scatter image features into the masked positions.
+            image_features = self.get_image_features(pixel_values)
+            image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
+            special_image_mask = self.get_placeholder_mask(
+                inputs_tensor, inputs_embeds=inputs_embeds, image_features=image_features
+            )
+            inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
+
+        # The parent is called with `None` in place of the input tensor and the merged embeddings in the kwargs.
+        # NOTE(review): presumably so the encoder runs on `inputs_embeds` rather than raw ids -- confirm against
+        # the parent implementation.
+        model_kwargs["inputs_embeds"] = inputs_embeds
+        model_kwargs = super()._prepare_encoder_decoder_kwargs_for_generation(
+            None, model_kwargs, model_input_name, generation_config
+        )
+        model_kwargs.pop("inputs_embeds", None)
+        return model_kwargs
+
+
+# Names exported by this module via `from ... import *`.
+__all__ = [
+    "Florence2Config",
+    "Florence2Processor",
+    "Florence2VisionConfig",
+    "Florence2Model",
+    "Florence2ForConditionalGeneration",
+    "Florence2PreTrainedModel",
+    "Florence2VisionBackbone",
+    "Florence2VisionPreTrainedModel",
+]
diff --git a/phivenv/Lib/site-packages/transformers/models/florence2/processing_florence2.py b/phivenv/Lib/site-packages/transformers/models/florence2/processing_florence2.py
new file mode 100644
index 0000000000000000000000000000000000000000..96dd81a68ab6db4b8f58aab94383be998298bc90
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/florence2/processing_florence2.py
@@ -0,0 +1,803 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/florence2/modular_florence2.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_florence2.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# coding=utf-8
+# Copyright 2025 Microsoft and the HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import re
+from typing import Any, Optional, Union
+
+import numpy as np
+
+from ...feature_extraction_utils import BatchFeature
+from ...image_utils import ImageInput
+from ...processing_utils import MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack
+from ...tokenization_utils_base import PreTokenizedInput, TextInput
+from ...utils import is_torch_available, logging
+
+
+if is_torch_available():
+    import torch
+
+
+# Module-level logger; `Florence2Processor.__call__` uses it to warn when no text prompt is given.
+logger = logging.get_logger(__name__)
+
+
+class Florence2ProcessorKwargs(ProcessingKwargs, total=False):
+    # Default keyword arguments per modality; `Florence2Processor.__call__` passes this class to
+    # `_merge_kwargs` and reads the resulting "text_kwargs" and "images_kwargs" groups.
+    _defaults = {
+        # No padding and no multimodal token-type ids unless the caller asks for them.
+        "text_kwargs": {"padding": False, "return_mm_token_type_ids": False},
+        # No image-processor defaults are set here.
+        "images_kwargs": {},
+    }
+
+
+class Florence2Processor(ProcessorMixin):
+    r"""
+    Constructs a Florence2 processor which wraps a Florence2 image processor and a Florence2 tokenizer into a single processor.
+
+    [`Florence2Processor`] offers all the functionalities of [`AutoImageProcessor`] and [`BartTokenizerFast`]. See the
+    [`~Florence2Processor.__call__`] and [`~Florence2Processor.decode`] for more information.
+
+    Args:
+        image_processor (`AutoImageProcessor`, *optional*):
+            The image processor is a required input.
+        tokenizer (`Union[BartTokenizer, BartTokenizerFast]`, *optional*):
+            The tokenizer is a required input.
+        num_additional_image_tokens (`int`, *optional*, defaults to 0):
+            Number of additional tokens added to the image embeddings, such as CLS (+1). If the backbone has no CLS or other
+            extra tokens appended, no need to set this arg.
+        post_processor_config (`dict`, *optional*, defaults to `None`):
+            Task-specific parsing rules for [`Florence2PostProcessor`], e.g. regex patterns,
+            thresholds, or banned tokens.
+    """
+
+    attributes = ["image_processor", "tokenizer"]
+    image_processor_class = "AutoImageProcessor"
+    tokenizer_class = ("BartTokenizer", "BartTokenizerFast")
+
+    def __init__(
+        self,
+        image_processor=None,
+        tokenizer=None,
+        num_additional_image_tokens: int = 0,
+        post_processor_config: Optional[dict] = None,
+        **kwargs,
+    ):
+        """
+        Set up the task lookup tables and the post-processor, then hand the image processor and tokenizer to
+        the parent constructor.
+
+        Although both default to `None`, `image_processor` and `tokenizer` are dereferenced below
+        (`image_seq_length`, `image_token`, `image_token_id`), so they must be provided.
+        """
+        # Task token -> name of the parsing mode that `post_process_generation` requests from the post-processor.
+        self.tasks_answer_post_processing_type = {
+            "<OCR>": "pure_text",
+            "<OCR_WITH_REGION>": "ocr",
+            "<CAPTION>": "pure_text",
+            "<DETAILED_CAPTION>": "pure_text",
+            "<MORE_DETAILED_CAPTION>": "pure_text",
+            "<OD>": "description_with_bboxes",
+            "<DENSE_REGION_CAPTION>": "description_with_bboxes",
+            "<CAPTION_TO_PHRASE_GROUNDING>": "phrase_grounding",
+            "<REFERRING_EXPRESSION_SEGMENTATION>": "polygons",
+            "<REGION_TO_SEGMENTATION>": "polygons",
+            "<OPEN_VOCABULARY_DETECTION>": "description_with_bboxes_or_polygons",
+            "<REGION_TO_CATEGORY>": "pure_text",
+            "<REGION_TO_DESCRIPTION>": "pure_text",
+            "<REGION_TO_OCR>": "pure_text",
+            "<REGION_PROPOSAL>": "bboxes",
+        }
+
+        # Task tokens that must be the entire prompt; `_construct_prompts` swaps them for these fixed instructions.
+        self.task_prompts_without_inputs = {
+            "<OCR>": "What is the text in the image?",
+            "<OCR_WITH_REGION>": "What is the text in the image, with regions?",
+            "<CAPTION>": "What does the image describe?",
+            "<DETAILED_CAPTION>": "Describe in detail what is shown in the image.",
+            "<MORE_DETAILED_CAPTION>": "Describe with a paragraph what is shown in the image.",
+            "<OD>": "Locate the objects with category name in the image.",
+            "<DENSE_REGION_CAPTION>": "Locate the objects in the image, with their descriptions.",
+            "<REGION_PROPOSAL>": "Locate the region proposals in the image.",
+        }
+
+        # Task tokens that take extra text; `{input}` is filled with the rest of the prompt by `_construct_prompts`.
+        self.task_prompts_with_input = {
+            "<CAPTION_TO_PHRASE_GROUNDING>": "Locate the phrases in the caption: {input}",
+            "<REFERRING_EXPRESSION_SEGMENTATION>": "Locate {input} in the image with mask",
+            "<REGION_TO_SEGMENTATION>": "What is the polygon mask of region {input}",
+            "<OPEN_VOCABULARY_DETECTION>": "Locate {input} in the image.",
+            "<REGION_TO_CATEGORY>": "What is the region {input}?",
+            "<REGION_TO_DESCRIPTION>": "What does the region {input} describe?",
+            "<REGION_TO_OCR>": "What text is in the region {input}?",
+        }
+
+        # Number of image placeholder tokens `__call__` prepends to each prompt when images are given.
+        self.num_image_tokens = image_processor.image_seq_length
+        self.num_additional_image_tokens = num_additional_image_tokens
+        self.post_processor_config = post_processor_config
+        self.post_processor = Florence2PostProcessor(config=post_processor_config, tokenizer=tokenizer)
+        # The image placeholder token and its id are taken from the tokenizer.
+        self.image_token = tokenizer.image_token
+        self.image_token_id = tokenizer.image_token_id
+
+        super().__init__(image_processor, tokenizer, **kwargs)
+
+    def _construct_prompts(self, text: Union[str, list[str]]) -> list[str]:
+        """
+        Expand task tokens into their natural-language prompts.
+
+        A single string is treated as a one-element batch. For every prompt, the first matching no-input task
+        token (which must be the whole prompt) is replaced by its fixed instruction; then the first matching
+        with-input task token is removed and the remaining stripped text is substituted into that task's
+        template. Prompts without any task token are returned unchanged.
+
+        Raises:
+            ValueError: If a no-input task token appears together with other content.
+        """
+        batch = [text] if isinstance(text, str) else text
+
+        expanded = []
+        for prompt in batch:
+            # Tasks without inputs: first table entry whose token occurs in the prompt.
+            bare_task = next((token for token in self.task_prompts_without_inputs if token in prompt), None)
+            if bare_task is not None:
+                if prompt != bare_task:
+                    raise ValueError(f"Task token {bare_task} should be the only content in the prompt.")
+                prompt = self.task_prompts_without_inputs[bare_task]
+
+            # Tasks with inputs: whatever remains after removing the token becomes `{input}`.
+            input_task = next((token for token in self.task_prompts_with_input if token in prompt), None)
+            if input_task is not None:
+                user_input = prompt.replace(input_task, "").strip()
+                prompt = self.task_prompts_with_input[input_task].format(input=user_input)
+
+            expanded.append(prompt)
+        return expanded
+
+    def __call__(
+        self,
+        images: Optional[ImageInput] = None,
+        text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
+        **kwargs: Unpack[Florence2ProcessorKwargs],
+    ) -> BatchFeature:
+        """
+        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
+        and `kwargs` arguments to the tokenizer's `__call__` to encode the text (an empty prompt per image is used
+        when `text` is `None`). To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
+        the image processor's `__call__` if `images` is not `None`. Please refer to the docstring
+        of the above two methods for more information.
+
+        Args:
+            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`):
+                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
+                tensor. Both channels-first and channels-last formats are supported.
+            text (`str`, `list[str]`):
+                The prompt or batch of prompts to be encoded. Anything other than a string or a list of strings
+                is rejected with a `ValueError`. Task tokens are expanded by `_construct_prompts`.
+            return_tensors (`str` or [`~utils.TensorType`], *optional*):
+                If set, will return tensors of a particular framework. Acceptable values are:
+                - `'tf'`: Return TensorFlow `tf.constant` objects.
+                - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                - `'np'`: Return NumPy `np.ndarray` objects.
+                - `'jax'`: Return JAX `jnp.ndarray` objects.
+
+        Returns:
+            [`BatchFeature`]: A [`BatchFeature`] with the following fields:
+
+            - **input_ids** -- List of token ids to be fed to a model.
+            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
+              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names`).
+            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
+            - **mm_token_type_ids** -- 1 at image placeholder positions and 0 elsewhere. Returned when
+              `return_mm_token_type_ids=True`.
+
+        Raises:
+            ValueError: If both `images` and `text` are `None`, if `text` is not a string or list of strings, or
+                if `images` is a list whose length differs from the number of prompts.
+        """
+        if images is None and text is None:
+            raise ValueError("You have to specify at least one of `images` or `text`.")
+
+        # Combine call-time kwargs with the defaults declared on `Florence2ProcessorKwargs`.
+        output_kwargs = self._merge_kwargs(
+            Florence2ProcessorKwargs,
+            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
+            **kwargs,
+        )
+
+        image_inputs = {}
+        if images is not None:
+            image_inputs = self.image_processor(images, **output_kwargs["images_kwargs"])
+
+        if text is None:
+            logger.warning_once("You are using Florence-2 without a text prefix.")
+            # One empty prompt per image (a non-list `images` counts as a single image).
+            text = [""] * (1 if not isinstance(images, list) else len(images))
+        elif isinstance(text, str):
+            text = [text]
+
+        if not isinstance(text, list) or not all(isinstance(token, str) for token in text):
+            raise ValueError("`text` must be a string or list of strings.")
+
+        if isinstance(images, list) and len(images) != len(text):
+            raise ValueError(f"Number of images ({len(images)}) must match number of texts ({len(text)}).")
+
+        prompt_strings = self._construct_prompts(text)
+
+        # Add image tokens and special tokens if images are provided
+        if image_inputs.get("pixel_values") is not None:
+            # Prefix every prompt with `num_image_tokens` image tokens and wrap the text in BOS/EOS
+            expanded_image_prompts = []
+            for sample in prompt_strings:
+                sample = (
+                    self.image_token * self.num_image_tokens
+                    + self.tokenizer.bos_token
+                    + sample
+                    + self.tokenizer.eos_token
+                )
+                expanded_image_prompts.append(sample)
+            prompt_strings = expanded_image_prompts
+
+        # Construct and tokenize prompts
+        # The tokenizer is always called with `add_special_tokens=False`; a caller-supplied value is dropped.
+        output_kwargs["text_kwargs"].pop("add_special_tokens", None)
+        # Tensor conversion is deferred to `BatchFeature` below, so the tokenizer returns plain lists.
+        return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
+        return_mm_token_type_ids = output_kwargs["text_kwargs"].pop("return_mm_token_type_ids", False)
+        text_inputs = self.tokenizer(
+            prompt_strings, **output_kwargs["text_kwargs"], add_special_tokens=False, return_tensors=None
+        )
+        self._check_special_mm_tokens(prompt_strings, text_inputs, modalities=["image"])
+
+        if return_mm_token_type_ids:
+            # Mark image placeholder positions with 1, everything else with 0.
+            array_ids = np.array(text_inputs["input_ids"])
+            mm_token_type_ids = np.zeros_like(text_inputs["input_ids"])
+            mm_token_type_ids[array_ids == self.image_token_id] = 1
+            text_inputs["mm_token_type_ids"] = mm_token_type_ids.tolist()
+
+        return BatchFeature(data={**image_inputs, **text_inputs}, tensor_type=return_tensors)
+
+    def batch_decode(self, *args, **kwargs):
+        """
+        Thin pass-through to the wrapped tokenizer's `batch_decode`: every positional and keyword argument is
+        forwarded unchanged and the tokenizer's result is returned as is.
+        """
+        decoded_batch = self.tokenizer.batch_decode(*args, **kwargs)
+        return decoded_batch
+
+    def decode(self, *args, **kwargs):
+        """
+        Thin pass-through to the wrapped tokenizer's `decode`: every positional and keyword argument is
+        forwarded unchanged and the tokenizer's result is returned as is.
+        """
+        decoded = self.tokenizer.decode(*args, **kwargs)
+        return decoded
+
+    @property
+    def model_input_names(self):
+        """
+        Input names of the tokenizer followed by those of the image processor, with duplicates removed and
+        first-occurrence order preserved.
+        """
+        combined = self.tokenizer.model_input_names + self.image_processor.model_input_names
+        seen = set()
+        unique_names = []
+        for name in combined:
+            if name in seen:
+                continue
+            seen.add(name)
+            unique_names.append(name)
+        return unique_names
+
+    def _get_num_multimodal_tokens(self, image_sizes=None, **kwargs):
+        """
+        Computes the number of placeholder tokens needed for multimodal inputs with the given sizes.
+
+        Every image is reported with the same token count and a single patch; the individual sizes are only
+        used to determine how many images there are.
+
+        Args:
+            image_sizes (`list[list[int]]`, *optional*):
+                The input sizes formatted as (height, width) per each image.
+
+        Returns:
+            `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided
+            input modalities, along with other useful data. It is empty when `image_sizes` is `None`.
+        """
+
+        vision_data = {}
+        if image_sizes is not None:
+            # `__init__` stores the per-image token count as `num_image_tokens` (the value `__call__` uses when
+            # inserting placeholders) and never assigns `image_seq_length` on the processor. Honour
+            # `image_seq_length` if something else has set it, otherwise use the stored count.
+            tokens_per_image = getattr(self, "image_seq_length", None)
+            if tokens_per_image is None:
+                tokens_per_image = self.num_image_tokens
+
+            num_image_tokens = [tokens_per_image] * len(image_sizes)
+            num_image_patches = [1] * len(image_sizes)
+
+            vision_data.update({"num_image_tokens": num_image_tokens, "num_image_patches": num_image_patches})
+
+        return MultiModalData(**vision_data)
+
+    def post_process_image_text_to_text(self, generated_outputs, skip_special_tokens=False, **kwargs):
+        """
+        Decode generated token ids into text via this processor's `batch_decode`.
+
+        Args:
+            generated_outputs (`torch.Tensor` or `np.ndarray`):
+                The generated token ids, expected to have shape `(batch_size, sequence_length)`.
+            skip_special_tokens (`bool`, *optional*, defaults to `False`):
+                Whether or not to remove special tokens in the output. Passed on to `batch_decode`.
+            **kwargs:
+                Additional arguments passed on to `batch_decode`.
+
+        Returns:
+            `list[str]`: The decoded text output.
+        """
+        decoded_texts = self.batch_decode(generated_outputs, skip_special_tokens=skip_special_tokens, **kwargs)
+        return decoded_texts
+
+    def post_process_generation(self, text=None, sequence=None, task=None, image_size=None) -> dict[str, Any]:
+        """
+        Post-process generation outputs based on the task.
+
+        Args:
+            text (`str`, *optional*):
+                Generated text.
+            sequence (`Union[List[int], torch.Tensor]`, *optional*):
+                Generated token sequence.
+            task (`str`, *optional*):
+                The task for post-processing.
+            image_size (`Tuple[int, int]`, *optional*):
+                Image size for dequantization.
+
+        Returns:
+            `Dict[str, Any]`: Post-processed results keyed by task.
+        """
+        if task is None:
+            raise ValueError("`task` must be provided for post-processing.")
+
+        post_proc_type = self.tasks_answer_post_processing_type.get(task, "pure_text")
+        parsed = self.post_processor(
+            text=text,
+            sequence=sequence,
+            image_size=image_size,
+            parse_tasks=[post_proc_type],
+        )[post_proc_type]
+
+        if post_proc_type == "pure_text":
+            final_answer = parsed.replace("<s>", "").replace("</s>", "").strip()
+        elif post_proc_type in ["description_with_bboxes", "bboxes"]:
+            bboxes = [inst["bbox"] for inst in parsed]
+            labels = [inst["cat_name"] for inst in parsed]
+            final_answer = {"bboxes": bboxes, "labels": labels}
+            if parsed and "score" in parsed[0]:
+                final_answer["scores"] = [inst["score"] for inst in parsed]
+        elif post_proc_type == "ocr":
+            quad_boxes = [inst["quad_box"] for inst in parsed]
+            labels = [inst["text"] for inst in parsed]
+            final_answer = {"quad_boxes": quad_boxes, "labels": labels}
+        elif post_proc_type == "phrase_grounding":
+            bboxes = []
+            labels = []
+            for inst in parsed:
+                for bbox in inst["bbox"]:
+                    bboxes.append(bbox)
+                    labels.append(inst["cat_name"])
+            final_answer = {"bboxes": bboxes, "labels": labels}
+        elif post_proc_type in ["description_with_polygons", "polygons"]:
+            polygons = [inst["polygons"] for inst in parsed]
+            labels = [inst["cat_name"] for inst in parsed]
+            final_answer = {"polygons": polygons, "labels": labels}
+        elif post_proc_type == "description_with_bboxes_or_polygons":
+            bboxes = []
+            bboxes_labels = []
+            polygons = []
+            polygons_labels = []
+            for inst in parsed:
+                label = inst["cat_name"]
+                if "polygons" in inst:
+                    polygons.append(inst["polygons"])
+                    polygons_labels.append(label)
+                else:
+                    bboxes.append(inst["bbox"])
+                    bboxes_labels.append(label)
+            final_answer = {
+                "bboxes": bboxes,
+                "bboxes_labels": bboxes_labels,
+                "polygons": polygons,
+                "polygons_labels": polygons_labels,
+            }
+        else:
+            raise ValueError(f"Unknown post-processing type: {post_proc_type}")
+
+        return {task: final_answer}
+
+
+class Florence2PostProcessor:
+    """
+    Post-processor for Florence-2 model outputs. Parses generated text into structured results for various tasks
+    like object detection, OCR, phrase grounding, etc.
+
+    Args:
+        tokenizer (`PreTrainedTokenizer`):
+            The tokenizer used for decoding model outputs.
+    """
+
+    def __init__(self, config, tokenizer):
+        self.tokenizer = tokenizer
+        self.parse_task_config = config or {}
+        self.banned_grounding_tokens = set(
+            self.parse_task_config.get("phrase_grounding", {}).get("banned_grounding_tokens", [])
+        )
+        self.all_special_tokens = set(self.tokenizer.all_special_tokens)
+        self.quantize_bins = (1000, 1000)
+
+    def quantize(self, locations: "torch.Tensor", size: tuple[int, int]) -> "torch.Tensor":
+        """
+        Quantize locations.
+
+        Args:
+            locations (`torch.Tensor`):
+                Tensor of shape (N, 4) for boxes or (N, 2) for points/coordinates.
+            size (`tuple[int, int]`):
+                Original image size (width, height).
+
+        Returns:
+            `torch.Tensor`: Quantized locations as integers.
+        """
+        bins_w, bins_h = self.quantize_bins
+        size_w, size_h = size
+        per_bin_w = size_w / bins_w
+        per_bin_h = size_h / bins_h
+
+        if locations.shape[-1] == 4:  # Bounding boxes: [xmin, ymin, xmax, ymax]
+            xmin, ymin, xmax, ymax = locations.split(1, dim=-1)
+            q_xmin = (xmin / per_bin_w).floor().clamp(0, bins_w - 1)
+            q_ymin = (ymin / per_bin_h).floor().clamp(0, bins_h - 1)
+            q_xmax = (xmax / per_bin_w).floor().clamp(0, bins_w - 1)
+            q_ymax = (ymax / per_bin_h).floor().clamp(0, bins_h - 1)
+            return torch.cat([q_xmin, q_ymin, q_xmax, q_ymax], dim=-1).int()
+
+        elif locations.shape[-1] == 2:  # Points/coordinates: [x, y]
+            x, y = locations.split(1, dim=-1)
+            q_x = (x / per_bin_w).floor().clamp(0, bins_w - 1)
+            q_y = (y / per_bin_h).floor().clamp(0, bins_h - 1)
+            return torch.cat([q_x, q_y], dim=-1).int()
+
+        else:
+            raise ValueError(f"Unsupported location shape: last dim must be 2 or 4, got {locations.shape[-1]}.")
+
+    def dequantize(self, locations: "torch.Tensor", size: tuple[int, int]) -> "torch.Tensor":
+        """
+        Dequantize locations back to original scale.
+
+        Args:
+            locations (`torch.Tensor`):
+                Quantized tensor of shape (N, 4) for boxes or (N, 2) for points/coordinates.
+            size (`tuple[int, int]`):
+                Original image size (width, height).
+
+        Returns:
+            `torch.Tensor`: Dequantized locations as floats.
+        """
+        bins_w, bins_h = self.quantize_bins
+        size_w, size_h = size
+        per_bin_w = size_w / bins_w
+        per_bin_h = size_h / bins_h
+
+        # Add 0.5 to use the center position of the bin as the coordinate.
+        if locations.shape[-1] == 4:  # Bounding boxes
+            xmin, ymin, xmax, ymax = locations.split(1, dim=-1)
+            dq_xmin = (xmin + 0.5) * per_bin_w
+            dq_ymin = (ymin + 0.5) * per_bin_h
+            dq_xmax = (xmax + 0.5) * per_bin_w
+            dq_ymax = (ymax + 0.5) * per_bin_h
+            return torch.cat([dq_xmin, dq_ymin, dq_xmax, dq_ymax], dim=-1).int()
+
+        elif locations.shape[-1] == 2:  # Points/coordinates
+            x, y = locations.split(1, dim=-1)
+            dq_x = (x + 0.5) * per_bin_w
+            dq_y = (y + 0.5) * per_bin_h
+            return torch.cat([dq_x, dq_y], dim=-1).int()
+
+        else:
+            raise ValueError(f"Unsupported location shape: last dim must be 2 or 4, got {locations.shape[-1]}.")
+
+    def decode_with_spans(self, token_ids: list[int]) -> tuple[str, list[tuple[int, int]]]:
+        """
+        Decode token IDs to text and compute character spans.
+
+        Args:
+            token_ids (`list[int]`):
+                list of token IDs to decode.
+
+        Returns:
+            `tuple[str, list[tuple[int, int]]]`: Decoded text and list of spans (start, end) for each token.
+        """
+        filtered_tokens = self.tokenizer.convert_ids_to_tokens(token_ids, skip_special_tokens=False)
+        text = ""
+        spans = []
+        for token in filtered_tokens:
+            if token in self.all_special_tokens:
+                sub_text = token
+            else:
+                sub_text = self.tokenizer.convert_tokens_to_string([token])
+            span = (len(text), len(text) + len(sub_text))
+            text += sub_text
+            spans.append(span)
+        return text, spans
+
+    def parse_ocr_from_text_and_spans(
+        self, text: str, pattern: Optional[str], image_size: tuple[int, int], area_threshold: float = 0.0
+    ) -> list[dict[str, Any]]:
+        """
+        Parse OCR results with quadrilateral boxes.
+
+        Args:
+            text (`str`):
+                The generated text.
+            pattern (`str`):
+                Regex pattern for matching.
+            image_size (`tuple[int, int]`):
+                Image size (width, height).
+            area_threshold (`float`, *optional*, defaults to 0.0):
+                Minimum area threshold for filtering boxes.
+
+        Returns:
+            `list[dict[str, Any]]`: list of instances with 'quad_box' and 'text'.
+        """
+        text = text.replace("<s>", "").replace("</s>", "").replace("<pad>", "")
+        if pattern is None:
+            pattern = r"(.+?)<loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)>"
+
+        matches = re.findall(pattern, text)
+        instances = []
+        width, height = image_size
+
+        for content, *quad_str in matches:
+            quad_bins = [int(i) for i in quad_str]
+            quad_box = self.dequantize(torch.tensor(quad_bins).reshape(-1, 2), size=image_size).flatten().tolist()
+
+            if area_threshold > 0:
+                x_coords = quad_box[0::2]
+                y_coords = quad_box[1::2]
+                # Apply the Shoelace formula
+                area = 0.5 * abs(
+                    sum(x_coords[i] * y_coords[i + 1] - x_coords[i + 1] * y_coords[i] for i in range(4 - 1))
+                )
+
+                if area < (width * height) * area_threshold:
+                    continue
+
+            instances.append({"quad_box": quad_box, "text": content.strip()})
+        return instances
+
+    def parse_phrase_grounding_from_text_and_spans(
+        self, text: str, image_size: tuple[int, int]
+    ) -> list[dict[str, Any]]:
+        """
+        Parse phrase grounding results.
+
+        Args:
+            text (`str`):
+                The generated text.
+            image_size (`tuple[int, int]`):
+                Image size (width, height).
+
+        Returns:
+            `list[dict[str, Any]]`: list of instances with 'bbox' and 'cat_name'.
+        """
+        text = text.replace("<s>", "").replace("</s>", "").replace("<pad>", "")
+        phrase_pattern = r"([^<]+(?:<loc_\d+>){4,})"
+        phrases = re.findall(phrase_pattern, text)
+        text_pattern = r"^\s*(.*?)(?=<od>|</od>|<box>|</box>|<bbox>|</bbox>|<loc_)"
+        box_pattern = r"<loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)>"
+
+        instances = []
+        for phrase_text in phrases:
+            phrase_text = phrase_text.replace("<ground>", "", 1).replace("<obj>", "", 1)
+            if not phrase_text:
+                continue
+            match = re.search(text_pattern, phrase_text)
+            if not match:
+                continue
+            phrase = match.group().strip()
+            if phrase in self.banned_grounding_tokens:
+                continue
+            boxes_matches = list(re.finditer(box_pattern, phrase_text))
+            if not boxes_matches:
+                continue
+            bbox_bins = [[int(m.group(j)) for j in range(1, 5)] for m in boxes_matches]
+            bboxes = self.dequantize(torch.tensor(bbox_bins), size=image_size).tolist()
+            phrase = phrase.encode("ascii", "ignore").decode("ascii")
+            instances.append({"bbox": bboxes, "cat_name": phrase})
+        return instances
+
+    def _find_matched_token_indices(self, cur_span: tuple[int, int], token_spans: list[tuple[int, int]]) -> list[int]:
+        return [i for i, span in enumerate(token_spans) if not (span[1] <= cur_span[0] or span[0] >= cur_span[1])]
+
+    def parse_description_with_bboxes_from_text_and_spans(
+        self,
+        text: str,
+        image_size: tuple[int, int],
+        allow_empty_phrase: bool = False,
+    ) -> list[dict[str, Any]]:
+        """
+        Parse descriptions with bounding boxes.
+
+        Args:
+            text (`str`):
+                The generated text.
+            image_size (`tuple[int, int]`):
+                Image size (width, height).
+            allow_empty_phrase (`bool`, *optional*, defaults to `False`):
+                Allow phrases without text.
+
+        Returns:
+            `list[dict[str, Any]]`: list of instances with 'bbox', 'cat_name', and optional 'score'.
+        """
+        text = text.replace("<s>", "").replace("</s>", "").replace("<pad>", "")
+
+        if allow_empty_phrase:
+            pattern = r"(?:(?:<loc_\d+>){4,})"
+        else:
+            pattern = r"([^<]+(?:<loc_\d+>){4,})"
+        phrases = re.findall(pattern, text)
+
+        text_pattern = r"^\s*(.*?)(?=<od>|</od>|<box>|</box>|<bbox>|</bbox>|<loc_)"
+        box_pattern = r"<loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)>"
+
+        instances = []
+        for phrase_text in phrases:
+            phrase_text = phrase_text.replace("<ground>", "", 1).replace("<obj>", "", 1)
+            if not phrase_text and not allow_empty_phrase:
+                continue
+            match = re.search(text_pattern, phrase_text)
+            if not match:
+                continue
+            phrase = match.group().strip()
+            boxes_matches = list(re.finditer(box_pattern, phrase_text))
+            if not boxes_matches:
+                continue
+            bbox_bins = [[int(m.group(j)) for j in range(1, 5)] for m in boxes_matches]
+            bboxes = self.dequantize(torch.tensor(bbox_bins), size=image_size).tolist()
+
+            phrase = phrase.encode("ascii", "ignore").decode("ascii")
+            for bbox in bboxes:
+                instance = {"bbox": bbox, "cat_name": phrase}
+                instances.append(instance)
+
+        return instances
+
+    def parse_description_with_polygons_from_text_and_spans(
+        self,
+        text: str,
+        image_size: tuple[int, int],
+        allow_empty_phrase: bool = False,
+        polygon_sep_token: str = "<sep>",
+        polygon_start_token: str = "<poly>",
+        polygon_end_token: str = "</poly>",
+        with_box_at_start: bool = False,
+    ) -> list[dict[str, Any]]:
+        """
+        Parse descriptions with polygons.
+
+        Args:
+            text (`str`):
+                The generated text.
+            image_size (`tuple[int, int]`):
+                Image size (width, height).
+            allow_empty_phrase (`bool`, *optional*, defaults to `False`):
+                Allow phrases without text.
+            polygon_sep_token (`str`, *optional*, defaults to "<sep>"):
+                Token separating polygons.
+            polygon_start_token (`str`, *optional*, defaults to "<poly>"):
+                Start token for polygons.
+            polygon_end_token (`str`, *optional*, defaults to "</poly>"):
+                End token for polygons.
+            with_box_at_start (`bool`, *optional*, defaults to `False`):
+                Whether a bounding box is at the start of polygons.
+
+        Returns:
+            `list[dict[str, Any]]`: list of instances with 'polygons', 'cat_name', and optional 'bbox'.
+        """
+        text = text.replace("<s>", "").replace("</s>", "").replace("<pad>", "")
+
+        if allow_empty_phrase:
+            pattern = rf"(?:(?:<loc_\d+>|{re.escape(polygon_sep_token)}|{re.escape(polygon_start_token)}|{re.escape(polygon_end_token)}){{4,}})"
+        else:
+            pattern = rf"([^<]+(?:<loc_\d+>|{re.escape(polygon_sep_token)}|{re.escape(polygon_start_token)}|{re.escape(polygon_end_token)}){{4,}})"
+        phrases = re.findall(pattern, text)
+        phrase_pattern = r"^\s*(.*?)(?=<od>|</od>|<box>|</box>|<bbox>|</bbox>|<loc_|<poly>)"
+        poly_instance_pattern = rf"{re.escape(polygon_start_token)}(.*?){re.escape(polygon_end_token)}"
+        box_pattern = rf"((?:<loc_\d+>)+)(?:{re.escape(polygon_sep_token)}|$)"
+
+        instances = []
+        for phrase_text in phrases:
+            phrase_text_strip = re.sub(r"^<loc_\d+>", "", phrase_text, count=1)
+            if not phrase_text_strip and not allow_empty_phrase:
+                continue
+            match = re.search(phrase_pattern, phrase_text_strip)
+            if not match:
+                continue
+            phrase = match.group().strip()
+
+            if polygon_start_token in phrase_text and polygon_end_token in phrase_text:
+                poly_instances = [m.group(1) for m in re.finditer(poly_instance_pattern, phrase_text)]
+            else:
+                poly_instances = [phrase_text]
+
+            for poly_inst in poly_instances:
+                poly_matches = list(re.finditer(box_pattern, poly_inst))
+                if len(poly_matches) == 0:
+                    continue
+                bbox = []
+                polygons = []
+                for poly_match in poly_matches:
+                    poly_str = poly_match.group(1)
+                    poly_bins = [int(m.group(1)) for m in re.finditer(r"<loc_(\d+)>", poly_str)]
+                    if with_box_at_start and not bbox:
+                        if len(poly_bins) > 4:
+                            bbox = poly_bins[:4]
+                            poly_bins = poly_bins[4:]
+                        else:
+                            bbox = [0, 0, 0, 0]
+                    if len(poly_bins) % 2 == 1:
+                        poly_bins = poly_bins[:-1]
+                    poly_coords = (
+                        self.dequantize(torch.tensor(poly_bins).reshape(-1, 2), size=image_size).flatten().tolist()
+                    )
+                    polygons.append(poly_coords)
+
+                instance = {"cat_name": phrase, "polygons": polygons}
+                if bbox:
+                    instance["bbox"] = self.dequantize(torch.tensor([bbox]), size=image_size)[0].tolist()
+                instances.append(instance)
+        return instances
+
+    def __call__(self, text=None, sequence=None, image_size=None, parse_tasks=None) -> dict[str, Any]:
+        """
+        Process model output and parse into task-specific results.
+
+        Args:
+            text (`Optional[str]`, *optional*):
+                Generated text. Either this or `sequence` must be provided.
+            sequence (`Optional[Union[list[int], torch.Tensor]]`, *optional*):
+                Token sequence. Either this or `text` must be provided.
+            image_size (`Optional[tuple[int, int]]`, *optional*):
+                Image size (width, height) required for dequantization.
+            parse_tasks (`Optional[Union[str, list[str]]]`, *optional*):
+                Specific tasks to parse. If None, parse all supported tasks.
+
+        Returns:
+            `dict[str, Any]`: Parsed results for each task, including the raw 'text'.
+        """
+        if parse_tasks is not None:
+            parse_tasks = [parse_tasks] if isinstance(parse_tasks, str) else parse_tasks
+            for task in parse_tasks:
+                if task not in self.parse_task_config.keys():
+                    raise ValueError(f"Unsupported parse task: {task}")
+
+        if (text is None and sequence is None) or (text is not None and sequence is not None):
+            raise ValueError("Exactly one of 'text' or 'sequence' must be provided.")
+
+        if sequence is not None:
+            if isinstance(sequence, torch.Tensor):
+                sequence = sequence.tolist()
+            sequence = sequence[1:] if sequence[0] == self.tokenizer.bos_token_id else sequence  # Skip BOS if present
+            text, _ = self.decode_with_spans(sequence)
+
+        parsed_dict = {"text": text}
+
+        tasks_to_parse = parse_tasks or self.parse_task_config.keys()
+        for task in tasks_to_parse:
+            config = self.parse_task_config[task]
+            pattern = config.get("PATTERN")
+
+            if task == "ocr":
+                parsed_dict["ocr"] = self.parse_ocr_from_text_and_spans(
+                    text, pattern=pattern, image_size=image_size, area_threshold=config.get("AREA_THRESHOLD", 0.0)
+                )
+            elif task == "phrase_grounding":
+                parsed_dict["phrase_grounding"] = self.parse_phrase_grounding_from_text_and_spans(
+                    text, image_size=image_size
+                )
+            elif task == "pure_text":
+                parsed_dict["pure_text"] = text
+            elif task == "description_with_bboxes":
+                parsed_dict["description_with_bboxes"] = self.parse_description_with_bboxes_from_text_and_spans(
+                    text, image_size=image_size
+                )
+            elif task == "description_with_polygons":
+                parsed_dict["description_with_polygons"] = self.parse_description_with_polygons_from_text_and_spans(
+                    text, image_size=image_size
+                )
+            elif task == "polygons":
+                parsed_dict["polygons"] = self.parse_description_with_polygons_from_text_and_spans(
+                    text, image_size=image_size, allow_empty_phrase=True
+                )
+            elif task == "bboxes":
+                parsed_dict["bboxes"] = self.parse_description_with_bboxes_from_text_and_spans(
+                    text, image_size=image_size, allow_empty_phrase=True
+                )
+            elif task == "description_with_bboxes_or_polygons":
+                if "<poly>" in text:
+                    instances = self.parse_description_with_polygons_from_text_and_spans(text, image_size=image_size)
+                else:
+                    instances = self.parse_description_with_bboxes_from_text_and_spans(text, image_size=image_size)
+                parsed_dict["description_with_bboxes_or_polygons"] = instances
+            else:
+                raise ValueError("task {} is not supported".format(task))
+
+        return parsed_dict
+
+
+# Public API of this module: `Florence2Processor` is the only name listed in `__all__`.
+__all__ = ["Florence2Processor"]
diff --git a/phivenv/Lib/site-packages/transformers/models/fnet/__init__.py b/phivenv/Lib/site-packages/transformers/models/fnet/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..756d690e72c11e8075429e3666e8579f0666d074
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/fnet/__init__.py
@@ -0,0 +1,29 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+# Under static type checking the submodules are star-imported directly so their names are visible to the checker.
+if TYPE_CHECKING:
+    from .configuration_fnet import *
+    from .modeling_fnet import *
+    from .tokenization_fnet import *
+    from .tokenization_fnet_fast import *
+else:
+    import sys
+
+    # At runtime this module's entry in `sys.modules` is replaced by a `_LazyModule` built from the import
+    # structure that `define_import_structure` derives from this file's path.
+    # NOTE(review): presumably this defers the submodule imports until first attribute access; confirm in
+    # `_LazyModule`.
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/phivenv/Lib/site-packages/transformers/models/fnet/__pycache__/__init__.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/fnet/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..70e3cf19a92c94ac6e456b785473e918e6aadd32
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/fnet/__pycache__/__init__.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/fnet/__pycache__/configuration_fnet.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/fnet/__pycache__/configuration_fnet.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..38c6d7598612027f64c0755e4b5a64014fc50916
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/fnet/__pycache__/configuration_fnet.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/fnet/__pycache__/modeling_fnet.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/fnet/__pycache__/modeling_fnet.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2bd4a4250f736112f2a9410ed4ec5323d757ade6
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/fnet/__pycache__/modeling_fnet.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/fnet/__pycache__/tokenization_fnet.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/fnet/__pycache__/tokenization_fnet.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c4ad2aa15fc316aba04defbf03dacc6c1cbb0483
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/fnet/__pycache__/tokenization_fnet.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/fnet/__pycache__/tokenization_fnet_fast.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/fnet/__pycache__/tokenization_fnet_fast.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8134c2d8ce18db727db0d1f8e3a140aaeb2767f8
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/fnet/__pycache__/tokenization_fnet_fast.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/fnet/configuration_fnet.py b/phivenv/Lib/site-packages/transformers/models/fnet/configuration_fnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..24a578328565939b821cf8f9500559b2f585ea56
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/fnet/configuration_fnet.py
@@ -0,0 +1,119 @@
+# coding=utf-8
+# Copyright 2021 Google AI and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""FNet model configuration"""
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class FNetConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`FNetModel`]. It is used to instantiate an FNet
+    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+    defaults will yield a similar configuration to that of the FNet
+    [google/fnet-base](https://huggingface.co/google/fnet-base) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 32000):
+            Vocabulary size of the FNet model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`FNetModel`] or [`TFFNetModel`].
+        hidden_size (`int`, *optional*, defaults to 768):
+            Dimension of the encoder layers and the pooler layer.
+        num_hidden_layers (`int`, *optional*, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        intermediate_size (`int`, *optional*, defaults to 3072):
+            Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu_new"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"selu"` and `"gelu_new"` are supported.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        max_position_embeddings (`int`, *optional*, defaults to 512):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            just in case (e.g., 512 or 1024 or 2048).
+        type_vocab_size (`int`, *optional*, defaults to 4):
+            The vocabulary size of the `token_type_ids` passed when calling [`FNetModel`] or [`TFFNetModel`].
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
+            The epsilon used by the layer normalization layers.
+        use_tpu_fourier_optimizations (`bool`, *optional*, defaults to `False`):
+            Determines whether to use TPU optimized FFTs. If `True`, the model will favor axis-wise FFTs transforms.
+            Set to `False` for GPU/CPU hardware, in which case n-dimensional FFTs are used.
+        tpu_short_seq_length (`int`, *optional*, defaults to 512):
+            The sequence length that is expected by the model when using TPUs. This will be used to initialize the DFT
+            matrix only when *use_tpu_fourier_optimizations* is set to `True` and the input sequence is shorter than or
+            equal to 4096 tokens.
+        pad_token_id (`int`, *optional*, defaults to 3):
+            Id of the padding token. Forwarded to `PretrainedConfig.__init__`.
+        bos_token_id (`int`, *optional*, defaults to 1):
+            Id of the beginning-of-sequence token. Forwarded to `PretrainedConfig.__init__`.
+        eos_token_id (`int`, *optional*, defaults to 2):
+            Id of the end-of-sequence token. Forwarded to `PretrainedConfig.__init__`.
+        **kwargs:
+            Additional keyword arguments, forwarded unchanged to `PretrainedConfig.__init__`.
+
+    Example:
+
+    ```python
+    >>> from transformers import FNetConfig, FNetModel
+
+    >>> # Initializing a FNet fnet-base style configuration
+    >>> configuration = FNetConfig()
+
+    >>> # Initializing a model (with random weights) from the fnet-base style configuration
+    >>> model = FNetModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "fnet"
+
+    def __init__(
+        self,
+        vocab_size=32000,
+        hidden_size=768,
+        num_hidden_layers=12,
+        intermediate_size=3072,
+        hidden_act="gelu_new",
+        hidden_dropout_prob=0.1,
+        max_position_embeddings=512,
+        type_vocab_size=4,
+        initializer_range=0.02,
+        layer_norm_eps=1e-12,
+        use_tpu_fourier_optimizations=False,
+        tpu_short_seq_length=512,
+        pad_token_id=3,
+        bos_token_id=1,
+        eos_token_id=2,
+        **kwargs,
+    ):
+        # The special-token ids and any extra keyword arguments are handled by the base class.
+        super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
+
+        # The remaining arguments are stored unchanged as attributes of the same name.
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.intermediate_size = intermediate_size
+        self.hidden_act = hidden_act
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.initializer_range = initializer_range
+        self.type_vocab_size = type_vocab_size
+        self.layer_norm_eps = layer_norm_eps
+        self.use_tpu_fourier_optimizations = use_tpu_fourier_optimizations
+        self.tpu_short_seq_length = tpu_short_seq_length
+
+
+__all__ = ["FNetConfig"]
diff --git a/phivenv/Lib/site-packages/transformers/models/fnet/modeling_fnet.py b/phivenv/Lib/site-packages/transformers/models/fnet/modeling_fnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..1cb0e764b2a11326ac7a2ab4529e98ebfcfdfd7b
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/fnet/modeling_fnet.py
@@ -0,0 +1,1094 @@
+# coding=utf-8
+# Copyright 2021 Google Research and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch FNet model."""
+
+import warnings
+from dataclasses import dataclass
+from functools import partial
+from typing import Optional, Union
+
+import torch
+import torch.utils.checkpoint
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+
+from ...utils import auto_docstring, is_scipy_available
+
+
+if is_scipy_available():
+    from scipy import linalg
+
+from ...activations import ACT2FN
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import (
+    BaseModelOutput,
+    BaseModelOutputWithPooling,
+    MaskedLMOutput,
+    ModelOutput,
+    MultipleChoiceModelOutput,
+    NextSentencePredictorOutput,
+    QuestionAnsweringModelOutput,
+    SequenceClassifierOutput,
+    TokenClassifierOutput,
+)
+from ...modeling_utils import PreTrainedModel
+from ...pytorch_utils import apply_chunking_to_forward
+from ...utils import logging
+from .configuration_fnet import FNetConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+# Adapted from https://github.com/google-research/google-research/blob/master/f_net/fourier.py
+def _two_dim_matmul(x, matrix_dim_one, matrix_dim_two):
+    """
+    Contract a 3D input with one matrix per non-leading axis.
+
+    Args:
+        x: 3D tensor. Axis 0 is carried through unchanged; axis 1 is contracted with `matrix_dim_one` and axis 2
+            with `matrix_dim_two`.
+        matrix_dim_one: Matrix for axis 1; it is cropped to its top-left `x.shape[1]` x `x.shape[1]` corner first.
+        matrix_dim_two: Matrix for axis 2, used as given.
+
+    Returns:
+        `einsum("bij,jk,ni->bnk")` of the `torch.complex64` cast of `x` with the two matrices.
+    """
+    rows = x.shape[1]
+    cropped_dim_one = matrix_dim_one[:rows, :rows]
+    complex_x = x.type(torch.complex64)
+    return torch.einsum("bij,jk,ni->bnk", complex_x, matrix_dim_two, cropped_dim_one)
+
+
+# Adapted from https://github.com/google-research/google-research/blob/master/f_net/fourier.py
+def two_dim_matmul(x, matrix_dim_one, matrix_dim_two):
+    """Public entry point that forwards its three arguments unchanged to `_two_dim_matmul`."""
+    result = _two_dim_matmul(x, matrix_dim_one=matrix_dim_one, matrix_dim_two=matrix_dim_two)
+    return result
+
+
+# Adapted from https://github.com/google-research/google-research/blob/master/f_net/fourier.py
+def fftn(x):
+    """
+    Applies a 1D Fast Fourier Transform (FFT) along every axis of the input except the first one.
+
+    Args:
+        x: Input n-dimensional array. Axis 0 is left untransformed.
+
+    Returns:
+        Fourier transform of the input taken over axes `1 .. x.ndim - 1`, applied from the last axis backwards.
+    """
+    out = x
+    # Walk the axes from last to first and stop before axis 0, which is never transformed.
+    for dim in range(x.ndim - 1, 0, -1):
+        out = torch.fft.fft(out, dim=dim)
+    return out
+
+
+class FNetEmbeddings(nn.Module):
+    """Sum word, token-type and position embeddings, then layer-normalize, project and apply dropout."""
+
+    def __init__(self, config):
+        super().__init__()
+        hidden = config.hidden_size
+        self.word_embeddings = nn.Embedding(config.vocab_size, hidden, padding_idx=config.pad_token_id)
+        self.position_embeddings = nn.Embedding(config.max_position_embeddings, hidden)
+        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, hidden)
+
+        # The attribute keeps its TensorFlow-style (non snake-case) name so that TensorFlow checkpoint
+        # files can still be loaded.
+        self.LayerNorm = nn.LayerNorm(hidden, eps=config.layer_norm_eps)
+        # NOTE: This projection layer is required. The original code allows the embedding dimension to
+        # differ from the model dimension.
+        self.projection = nn.Linear(hidden, hidden)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+        # Both buffers are non-persistent, so they are not part of the state dict.
+        default_positions = torch.arange(config.max_position_embeddings).expand((1, -1))
+        self.register_buffer("position_ids", default_positions, persistent=False)
+
+        default_token_types = torch.zeros(self.position_ids.size(), dtype=torch.long)
+        self.register_buffer("token_type_ids", default_token_types, persistent=False)
+
+    def forward(self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None):
+        # The (batch, sequence) shape comes from the ids when given, otherwise from the embeddings.
+        input_shape = input_ids.size() if input_ids is not None else inputs_embeds.size()[:-1]
+        seq_length = input_shape[1]
+
+        if position_ids is None:
+            position_ids = self.position_ids[:, :seq_length]
+
+        # When no token types are given, fall back to the all-zero buffer registered in the constructor.
+        # Using the buffer helps users trace the model without passing token_type_ids (issue #5664).
+        if token_type_ids is None:
+            if hasattr(self, "token_type_ids"):
+                token_type_ids = self.token_type_ids[:, :seq_length].expand(input_shape[0], seq_length)
+            else:
+                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)
+
+        if inputs_embeds is None:
+            inputs_embeds = self.word_embeddings(input_ids)
+
+        summed = (
+            inputs_embeds + self.token_type_embeddings(token_type_ids) + self.position_embeddings(position_ids)
+        )
+        return self.dropout(self.projection(self.LayerNorm(summed)))
+
+
+class FNetBasicFourierTransform(nn.Module):
+    """
+    Mixing sub-layer that applies a Fourier transform to the hidden states and keeps only the real part.
+
+    The transform is chosen once, at construction time, from the configuration:
+
+    - `torch.fft.fftn` over dims `(1, 2)` when `config.use_tpu_fourier_optimizations` is false;
+    - a DFT-matrix multiplication (`two_dim_matmul`) when the flag is set, `config.max_position_embeddings` is at
+      most 4096 and SciPy is available;
+    - the axis-by-axis `fftn` helper in every other case.
+    """
+
+    def __init__(self, config):
+        super().__init__()
+        self._init_fourier_transform(config)
+
+    def _init_fourier_transform(self, config):
+        """Set `self.fourier_transform` (and, for the matmul variant, the DFT-matrix buffers) from `config`."""
+        if not config.use_tpu_fourier_optimizations:
+            self.fourier_transform = partial(torch.fft.fftn, dim=(1, 2))
+        elif config.max_position_embeddings <= 4096:
+            if is_scipy_available():
+                # The DFT matrices are computed once and stored as buffers.
+                self.register_buffer(
+                    "dft_mat_hidden", torch.tensor(linalg.dft(config.hidden_size), dtype=torch.complex64)
+                )
+                self.register_buffer(
+                    "dft_mat_seq", torch.tensor(linalg.dft(config.tpu_short_seq_length), dtype=torch.complex64)
+                )
+                self.fourier_transform = partial(
+                    two_dim_matmul, matrix_dim_one=self.dft_mat_seq, matrix_dim_two=self.dft_mat_hidden
+                )
+            else:
+                # Emit the warning through the module-level logger created with `logging.get_logger`,
+                # not through the logging helper module itself.
+                logger.warning(
+                    "SciPy is needed for DFT matrix calculation and is not found. Using TPU optimized fast fourier"
+                    " transform instead."
+                )
+                self.fourier_transform = fftn
+        else:
+            self.fourier_transform = fftn
+
+    def forward(self, hidden_states):
+        # NOTE: We do not use torch.vmap as it is not integrated into PyTorch stable versions.
+        # Interested users can modify the code to use vmap from the nightly versions, getting the vmap from here:
+        # https://pytorch.org/docs/master/generated/torch.vmap.html. Note that fourier transform methods will need
+        # change accordingly.
+
+        # Only the real part of the transform is kept; the result is wrapped in a 1-tuple.
+        outputs = self.fourier_transform(hidden_states).real
+        return (outputs,)
+
+
+class FNetBasicOutput(nn.Module):
+    """Residual connection followed by layer normalization."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+
+    def forward(self, hidden_states, input_tensor):
+        # Add the sub-layer output to its input, then normalize the sum.
+        residual_sum = input_tensor + hidden_states
+        return self.LayerNorm(residual_sum)
+
+
+class FNetFourierTransform(nn.Module):
+    """Fourier mixing (`self.self`) followed by the residual + layer-norm step (`self.output`)."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.self = FNetBasicFourierTransform(config)
+        self.output = FNetBasicOutput(config)
+
+    def forward(self, hidden_states):
+        # The mixing module returns a tuple; its first element is the transformed tensor.
+        mixed = self.self(hidden_states)[0]
+        normalized = self.output(mixed, hidden_states)
+        return (normalized,)
+
+
+# Copied from transformers.models.bert.modeling_bert.BertIntermediate with Bert->FNet
+class FNetIntermediate(nn.Module):
+    """Linear map from `hidden_size` to `intermediate_size` followed by the configured activation."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
+        activation = config.hidden_act
+        # A string is looked up in ACT2FN; any other value is used directly as the activation.
+        self.intermediate_act_fn = ACT2FN[activation] if isinstance(activation, str) else activation
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        projected = self.dense(hidden_states)
+        return self.intermediate_act_fn(projected)
+
+
+# Copied from transformers.models.bert.modeling_bert.BertOutput with Bert->FNet
+class FNetOutput(nn.Module):
+    """Project from `intermediate_size` back to `hidden_size`, apply dropout, add the residual and normalize."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
+        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
+        projected = self.dropout(self.dense(hidden_states))
+        # `input_tensor` is the residual branch added back before normalization.
+        return self.LayerNorm(projected + input_tensor)
+
+
+class FNetLayer(GradientCheckpointingLayer):
+    """One encoder block: Fourier mixing sub-layer followed by a feed-forward sub-layer."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.chunk_size_feed_forward = config.chunk_size_feed_forward
+        # Index of the sequence-length dimension, passed to `apply_chunking_to_forward`.
+        self.seq_len_dim = 1
+        self.fourier = FNetFourierTransform(config)
+        self.intermediate = FNetIntermediate(config)
+        self.output = FNetOutput(config)
+
+    def forward(self, hidden_states):
+        mixed = self.fourier(hidden_states)[0]
+
+        # The feed-forward part goes through the chunking helper with the configured chunk size.
+        layer_output = apply_chunking_to_forward(
+            self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, mixed
+        )
+        return (layer_output,)
+
+    def feed_forward_chunk(self, fourier_output):
+        """Run the intermediate and output sub-modules on `fourier_output`, which is also the residual input."""
+        expanded = self.intermediate(fourier_output)
+        return self.output(expanded, fourier_output)
+
+
+class FNetEncoder(nn.Module):
+    """Stack of `config.num_hidden_layers` `FNetLayer` modules applied one after another."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.layer = nn.ModuleList([FNetLayer(config) for _ in range(config.num_hidden_layers)])
+        self.gradient_checkpointing = False
+
+    def forward(self, hidden_states, output_hidden_states=False, return_dict=True):
+        # `history` collects the input of every layer plus the final output; it stays `None` when
+        # hidden states were not requested.
+        history = [] if output_hidden_states else None
+
+        for block in self.layer:
+            if history is not None:
+                history.append(hidden_states)
+            hidden_states = block(hidden_states)[0]
+
+        if history is None:
+            all_hidden_states = None
+        else:
+            history.append(hidden_states)
+            all_hidden_states = tuple(history)
+
+        if return_dict:
+            return BaseModelOutput(last_hidden_state=hidden_states, hidden_states=all_hidden_states)
+
+        # Tuple form: entries that are `None` are dropped.
+        return tuple(v for v in [hidden_states, all_hidden_states] if v is not None)
+
+
+# Copied from transformers.models.bert.modeling_bert.BertPooler with Bert->FNet
+class FNetPooler(nn.Module):
+    """Pool a sequence by passing the hidden state of its first token through a linear layer and tanh."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        self.activation = nn.Tanh()
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        # "Pooling" here simply means selecting index 0 along the second dimension (the first token).
+        first_token = hidden_states[:, 0]
+        return self.activation(self.dense(first_token))
+
+
+# Copied from transformers.models.bert.modeling_bert.BertPredictionHeadTransform with Bert->FNet
+class FNetPredictionHeadTransform(nn.Module):
+    """Dense layer, activation and layer normalization applied in front of the language-modeling decoder."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        activation = config.hidden_act
+        # A string is looked up in ACT2FN; any other value is used directly as the activation.
+        self.transform_act_fn = ACT2FN[activation] if isinstance(activation, str) else activation
+        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        activated = self.transform_act_fn(self.dense(hidden_states))
+        return self.LayerNorm(activated)
+
+
+class FNetLMPredictionHead(nn.Module):
+    """Language-modeling head: prediction-head transform followed by a linear decoder over the vocabulary."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.transform = FNetPredictionHeadTransform(config)
+
+        # The output weights are the same as the input embeddings, but there is
+        # an output-only bias for each token.
+        self.decoder = nn.Linear(config.hidden_size, config.vocab_size)
+
+        # The bias is a parameter of this module and is also installed as the decoder's bias, so both
+        # attributes refer to the same tensor.
+        self.bias = nn.Parameter(torch.zeros(config.vocab_size))
+        self.decoder.bias = self.bias
+
+    def forward(self, hidden_states):
+        return self.decoder(self.transform(hidden_states))
+
+    def _tie_weights(self) -> None:
+        if self.decoder.bias.device.type != "meta":
+            # To tie those two weights if they get disconnected (on TPU or when the bias is resized)
+            self.bias = self.decoder.bias
+            return
+        # For accelerate compatibility and to not break backward compatibility
+        self.decoder.bias = self.bias
+
+
+class FNetOnlyMLMHead(nn.Module):
+    """Wrapper exposing a single `predictions` language-modeling head."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.predictions = FNetLMPredictionHead(config)
+
+    def forward(self, sequence_output):
+        # Return the head's output for the given sequence output.
+        return self.predictions(sequence_output)
+
+
+# Copied from transformers.models.bert.modeling_bert.BertOnlyNSPHead with Bert->FNet
+class FNetOnlyNSPHead(nn.Module):
+    """Two-way linear classifier applied to the pooled output."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.seq_relationship = nn.Linear(config.hidden_size, 2)
+
+    def forward(self, pooled_output):
+        # One score per class, i.e. an output with last dimension 2.
+        return self.seq_relationship(pooled_output)
+
+
+# Copied from transformers.models.bert.modeling_bert.BertPreTrainingHeads with Bert->FNet
+class FNetPreTrainingHeads(nn.Module):
+    """Bundle of the two pre-training heads: `predictions` (language modeling) and `seq_relationship`."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.predictions = FNetLMPredictionHead(config)
+        self.seq_relationship = nn.Linear(config.hidden_size, 2)
+
+    def forward(self, sequence_output, pooled_output):
+        # The sequence output feeds the language-modeling head; the pooled output feeds the 2-way classifier.
+        token_scores = self.predictions(sequence_output)
+        pair_scores = self.seq_relationship(pooled_output)
+        return token_scores, pair_scores
+
+
+@auto_docstring
+class FNetPreTrainedModel(PreTrainedModel):
+    # Class-level settings shared by every FNet model class defined in this file.
+    config: FNetConfig
+    base_model_prefix = "fnet"
+    supports_gradient_checkpointing = True
+
+    def _init_weights(self, module):
+        """Initialize `module` in place according to its type; other module types are left untouched."""
+        if isinstance(module, nn.Linear):
+            # Slightly different from the TF version which uses truncated_normal for initialization
+            # cf https://github.com/pytorch/pytorch/pull/5617
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            # NOTE: Original code uses same initialization as weights for biases as well.
+            if module.bias is not None:
+                module.bias.data.zero_()
+            return
+
+        if isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            # The row of the padding index, when there is one, is reset to zeros.
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+            return
+
+        if isinstance(module, nn.LayerNorm):
+            # Zero bias and unit weight.
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Output type of [`FNetForPreTraining`].
+    """
+)
+class FNetForPreTrainingOutput(ModelOutput):
+    r"""
+    loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
+        Total loss as the sum of the masked language modeling loss and the next sequence prediction
+        (classification) loss.
+    prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
+        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+    seq_relationship_logits (`torch.FloatTensor` of shape `(batch_size, 2)`):
+        Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
+        before SoftMax).
+    """
+
+    # Every field defaults to `None`, so an instance can be built from any subset of them.
+    # `FNetForPreTraining.forward` only computes `loss` when both `labels` and `next_sentence_label`
+    # are passed; otherwise it stays `None`.
+    loss: Optional[torch.FloatTensor] = None
+    prediction_logits: Optional[torch.FloatTensor] = None
+    seq_relationship_logits: Optional[torch.FloatTensor] = None
+    # Not described in the docstring above; `FNetForPreTraining.forward` copies it from the
+    # `hidden_states` attribute of the base model's output.
+    hidden_states: Optional[tuple[torch.FloatTensor]] = None
+
+
+@auto_docstring
+class FNetModel(FNetPreTrainedModel):
+    """
+
+    The model can behave as an encoder, following the architecture described in [FNet: Mixing Tokens with Fourier
+    Transforms](https://huggingface.co/papers/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon.
+
+    """
+
+    def __init__(self, config, add_pooling_layer=True):
+        r"""
+        add_pooling_layer (bool, *optional*, defaults to `True`):
+            Whether to add a pooling layer
+        """
+        super().__init__(config)
+        self.config = config
+
+        self.embeddings = FNetEmbeddings(config)
+        self.encoder = FNetEncoder(config)
+
+        # Without a pooling layer, `pooler_output` of the forward pass is `None`.
+        self.pooler = FNetPooler(config) if add_pooling_layer else None
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.embeddings.word_embeddings
+
+    def set_input_embeddings(self, value):
+        self.embeddings.word_embeddings = value
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, BaseModelOutput]:
+        # Flags left as `None` fall back to the values stored on the configuration.
+        if output_hidden_states is None:
+            output_hidden_states = self.config.output_hidden_states
+        if return_dict is None:
+            return_dict = self.config.use_return_dict
+
+        # Exactly one of `input_ids` / `inputs_embeds` must be provided.
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+        if input_ids is None and inputs_embeds is None:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+        if input_ids is not None:
+            input_shape = input_ids.size()
+        else:
+            input_shape = inputs_embeds.size()[:-1]
+        batch_size, seq_length = input_shape
+
+        # With the TPU optimizations enabled, inputs of at most 4096 tokens must have exactly the
+        # configured `tpu_short_seq_length`.
+        uses_tpu_path = self.config.use_tpu_fourier_optimizations and seq_length <= 4096
+        if uses_tpu_path and self.config.tpu_short_seq_length != seq_length:
+            raise ValueError(
+                "The `tpu_short_seq_length` in FNetConfig should be set equal to the sequence length being passed to"
+                " the model when using TPU optimizations."
+            )
+
+        device = input_ids.device if input_ids is not None else inputs_embeds.device
+
+        # Missing token types default to zeros, taken from the embedding module's buffer when it exists.
+        if token_type_ids is None:
+            if hasattr(self.embeddings, "token_type_ids"):
+                buffered = self.embeddings.token_type_ids[:, :seq_length]
+                token_type_ids = buffered.expand(batch_size, seq_length)
+            else:
+                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
+
+        embedding_output = self.embeddings(
+            input_ids=input_ids,
+            position_ids=position_ids,
+            token_type_ids=token_type_ids,
+            inputs_embeds=inputs_embeds,
+        )
+        encoder_outputs = self.encoder(
+            embedding_output,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        sequence_output = encoder_outputs[0]
+
+        pooler_output = None if self.pooler is None else self.pooler(sequence_output)
+
+        if return_dict:
+            return BaseModelOutputWithPooling(
+                last_hidden_state=sequence_output,
+                pooler_output=pooler_output,
+                hidden_states=encoder_outputs.hidden_states,
+            )
+
+        # Tuple form: sequence output, pooled output, then whatever else the encoder returned.
+        return (sequence_output, pooler_output) + encoder_outputs[1:]
+
+
+@auto_docstring(
+    custom_intro="""
+    FNet Model with two heads on top as done during the pretraining: a `masked language modeling` head and a `next
+    sentence prediction (classification)` head.
+    """
+)
+class FNetForPreTraining(FNetPreTrainedModel):
+    _tied_weights_keys = ["cls.predictions.decoder.bias", "cls.predictions.decoder.weight"]
+
+    def __init__(self, config):
+        super().__init__(config)
+
+        self.fnet = FNetModel(config)
+        self.cls = FNetPreTrainingHeads(config)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_output_embeddings(self):
+        return self.cls.predictions.decoder
+
+    def set_output_embeddings(self, new_embeddings):
+        # Replace the decoder and point the head's own `bias` attribute at the new decoder's bias.
+        self.cls.predictions.decoder = new_embeddings
+        self.cls.predictions.bias = new_embeddings.bias
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        next_sentence_label: Optional[torch.Tensor] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, FNetForPreTrainingOutput]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
+            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
+            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
+        next_sentence_label (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair
+            (see `input_ids` docstring) Indices should be in `[0, 1]`:
+
+            - 0 indicates sequence B is a continuation of sequence A,
+            - 1 indicates sequence B is a random sequence.
+
+        Example:
+
+        ```python
+        >>> from transformers import AutoTokenizer, FNetForPreTraining
+        >>> import torch
+
+        >>> tokenizer = AutoTokenizer.from_pretrained("google/fnet-base")
+        >>> model = FNetForPreTraining.from_pretrained("google/fnet-base")
+        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+        >>> outputs = model(**inputs)
+        >>> prediction_logits = outputs.prediction_logits
+        >>> seq_relationship_logits = outputs.seq_relationship_logits
+        ```"""
+        if return_dict is None:
+            return_dict = self.config.use_return_dict
+
+        base_outputs = self.fnet(
+            input_ids,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            inputs_embeds=inputs_embeds,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        sequence_output, pooled_output = base_outputs[:2]
+        prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output)
+
+        # A loss is only produced when both label tensors are supplied; it is the sum of the two
+        # cross-entropy terms.
+        total_loss = None
+        if labels is not None and next_sentence_label is not None:
+            cross_entropy = CrossEntropyLoss()
+            vocab_size = self.config.vocab_size
+            masked_lm_loss = cross_entropy(prediction_scores.view(-1, vocab_size), labels.view(-1))
+            next_sentence_loss = cross_entropy(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1))
+            total_loss = masked_lm_loss + next_sentence_loss
+
+        if return_dict:
+            return FNetForPreTrainingOutput(
+                loss=total_loss,
+                prediction_logits=prediction_scores,
+                seq_relationship_logits=seq_relationship_score,
+                hidden_states=base_outputs.hidden_states,
+            )
+
+        # Tuple form: the loss is prepended only when it was computed.
+        output = (prediction_scores, seq_relationship_score) + base_outputs[2:]
+        return output if total_loss is None else (total_loss,) + output
+
+
+@auto_docstring
+class FNetForMaskedLM(FNetPreTrainedModel):
+    _tied_weights_keys = ["cls.predictions.decoder.bias", "cls.predictions.decoder.weight"]
+
+    def __init__(self, config):
+        super().__init__(config)
+
+        self.fnet = FNetModel(config)
+        self.cls = FNetOnlyMLMHead(config)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_output_embeddings(self):
+        return self.cls.predictions.decoder
+
+    def set_output_embeddings(self, new_embeddings):
+        # Replace the decoder and point the head's own `bias` attribute at the new decoder's bias.
+        self.cls.predictions.decoder = new_embeddings
+        self.cls.predictions.bias = new_embeddings.bias
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, MaskedLMOutput]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
+            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
+            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+        """
+        if return_dict is None:
+            return_dict = self.config.use_return_dict
+
+        base_outputs = self.fnet(
+            input_ids,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            inputs_embeds=inputs_embeds,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        prediction_scores = self.cls(base_outputs[0])
+
+        masked_lm_loss = None
+        if labels is not None:
+            # Created with default arguments; per the docstring, labels equal to -100 are ignored.
+            cross_entropy = CrossEntropyLoss()
+            flat_scores = prediction_scores.view(-1, self.config.vocab_size)
+            masked_lm_loss = cross_entropy(flat_scores, labels.view(-1))
+
+        if return_dict:
+            return MaskedLMOutput(
+                loss=masked_lm_loss, logits=prediction_scores, hidden_states=base_outputs.hidden_states
+            )
+
+        # Tuple form: skip the first two base-model entries and prepend the loss only when computed.
+        output = (prediction_scores,) + base_outputs[2:]
+        return output if masked_lm_loss is None else (masked_lm_loss,) + output
+
+
+@auto_docstring(
+    custom_intro="""
+    FNet Model with a `next sentence prediction (classification)` head on top.
+    """
+)
+class FNetForNextSentencePrediction(FNetPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+
+        self.fnet = FNetModel(config)
+        self.cls = FNetOnlyNSPHead(config)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        **kwargs,
+    ) -> Union[tuple, NextSentencePredictorOutput]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair
+            (see `input_ids` docstring). Indices should be in `[0, 1]`:
+
+            - 0 indicates sequence B is a continuation of sequence A,
+            - 1 indicates sequence B is a random sequence.
+
+        Example:
+
+        ```python
+        >>> from transformers import AutoTokenizer, FNetForNextSentencePrediction
+        >>> import torch
+
+        >>> tokenizer = AutoTokenizer.from_pretrained("google/fnet-base")
+        >>> model = FNetForNextSentencePrediction.from_pretrained("google/fnet-base")
+        >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
+        >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
+        >>> encoding = tokenizer(prompt, next_sentence, return_tensors="pt")
+        >>> outputs = model(**encoding, labels=torch.LongTensor([1]))
+        >>> logits = outputs.logits
+        >>> assert logits[0, 0] < logits[0, 1]  # next sentence was random
+        ```"""
+
+        # Deprecated alias: when `next_sentence_label` is passed it replaces `labels`.
+        if "next_sentence_label" in kwargs:
+            warnings.warn(
+                "The `next_sentence_label` argument is deprecated and will be removed in a future version, use"
+                " `labels` instead.",
+                FutureWarning,
+            )
+            labels = kwargs.pop("next_sentence_label")
+
+        if return_dict is None:
+            return_dict = self.config.use_return_dict
+
+        base_outputs = self.fnet(
+            input_ids,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            inputs_embeds=inputs_embeds,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        # Index 1 of the base model output is the pooled output.
+        seq_relationship_scores = self.cls(base_outputs[1])
+
+        next_sentence_loss = None
+        if labels is not None:
+            cross_entropy = CrossEntropyLoss()
+            next_sentence_loss = cross_entropy(seq_relationship_scores.view(-1, 2), labels.view(-1))
+
+        if return_dict:
+            return NextSentencePredictorOutput(
+                loss=next_sentence_loss,
+                logits=seq_relationship_scores,
+                hidden_states=base_outputs.hidden_states,
+            )
+
+        # Tuple form: skip the first two base-model entries and prepend the loss only when computed.
+        output = (seq_relationship_scores,) + base_outputs[2:]
+        return output if next_sentence_loss is None else (next_sentence_loss,) + output
+
+
+@auto_docstring(
+    custom_intro="""
+    FNet Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
+    output) e.g. for GLUE tasks.
+    """
+)
+class FNetForSequenceClassification(FNetPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+        self.fnet = FNetModel(config)
+
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
+
+        # Weight initialization and any remaining setup are delegated to `post_init`.
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, SequenceClassifierOutput]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+        """
+        if return_dict is None:
+            return_dict = self.config.use_return_dict
+
+        encoder_outputs = self.fnet(
+            input_ids,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            inputs_embeds=inputs_embeds,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        # The pooled output (index 1 of the encoder outputs) goes through dropout, then the linear head.
+        logits = self.classifier(self.dropout(encoder_outputs[1]))
+
+        loss = None
+        if labels is not None:
+            # An unset `config.problem_type` is inferred from `num_labels` and the label dtype, then stored.
+            if self.config.problem_type is None:
+                if self.num_labels == 1:
+                    self.config.problem_type = "regression"
+                elif self.num_labels > 1 and labels.dtype in (torch.long, torch.int):
+                    self.config.problem_type = "single_label_classification"
+                else:
+                    self.config.problem_type = "multi_label_classification"
+
+            problem_type = self.config.problem_type
+            if problem_type == "regression":
+                if self.num_labels == 1:
+                    loss = MSELoss()(logits.squeeze(), labels.squeeze())
+                else:
+                    loss = MSELoss()(logits, labels)
+            elif problem_type == "single_label_classification":
+                loss = CrossEntropyLoss()(logits.view(-1, self.num_labels), labels.view(-1))
+            elif problem_type == "multi_label_classification":
+                loss = BCEWithLogitsLoss()(logits, labels)
+
+        if return_dict:
+            return SequenceClassifierOutput(loss=loss, logits=logits, hidden_states=encoder_outputs.hidden_states)
+
+        # Tuple form: optional loss, the logits, then every encoder output from index 2 onwards.
+        output = (logits,) + encoder_outputs[2:]
+        return output if loss is None else (loss,) + output
+
+
+@auto_docstring
+class FNetForMultipleChoice(FNetPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+
+        self.fnet = FNetModel(config)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.classifier = nn.Linear(config.hidden_size, 1)
+
+        # Weight initialization and any remaining setup are delegated to `post_init`.
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, MultipleChoiceModelOutput]:
+        r"""
+        input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
+            Indices of input sequence tokens in the vocabulary.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
+            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
+            1]`:
+
+            - 0 corresponds to a *sentence A* token,
+            - 1 corresponds to a *sentence B* token.
+
+            [What are token type IDs?](../glossary#token-type-ids)
+        position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
+            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+            config.max_position_embeddings - 1]`.
+
+            [What are position IDs?](../glossary#position-ids)
+        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
+            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+            is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
+            model's internal embedding lookup matrix.
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
+            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
+            `input_ids` above)
+        """
+        if return_dict is None:
+            return_dict = self.config.use_return_dict
+        # The number of choices is dim 1 of whichever input tensor was supplied.
+        num_choices = (input_ids if input_ids is not None else inputs_embeds).shape[1]
+
+        # Collapse every leading dim into one so each choice is encoded as its own sequence.
+        if input_ids is not None:
+            input_ids = input_ids.view(-1, input_ids.size(-1))
+        if token_type_ids is not None:
+            token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1))
+        if position_ids is not None:
+            position_ids = position_ids.view(-1, position_ids.size(-1))
+        if inputs_embeds is not None:
+            inputs_embeds = inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
+
+        encoder_outputs = self.fnet(
+            input_ids,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            inputs_embeds=inputs_embeds,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        # The head yields one score per flattened row; regroup the scores to `(-1, num_choices)`.
+        choice_scores = self.classifier(self.dropout(encoder_outputs[1]))
+        reshaped_logits = choice_scores.view(-1, num_choices)
+
+        loss = None if labels is None else CrossEntropyLoss()(reshaped_logits, labels)
+
+        if return_dict:
+            return MultipleChoiceModelOutput(
+                loss=loss, logits=reshaped_logits, hidden_states=encoder_outputs.hidden_states
+            )
+
+        output = (reshaped_logits,) + encoder_outputs[2:]
+        return output if loss is None else (loss,) + output
+
+
+@auto_docstring
+class FNetForTokenClassification(FNetPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+
+        self.fnet = FNetModel(config)
+
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
+
+        # Weight initialization and any remaining setup are delegated to `post_init`.
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, TokenClassifierOutput]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
+        """
+        if return_dict is None:
+            return_dict = self.config.use_return_dict
+
+        encoder_outputs = self.fnet(
+            input_ids,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            inputs_embeds=inputs_embeds,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        # The sequence output (index 0 of the encoder outputs) goes through dropout, then the linear head.
+        token_states = self.dropout(encoder_outputs[0])
+        logits = self.classifier(token_states)
+
+        loss = None
+        if labels is not None:
+            # Flatten logits to `(-1, num_labels)` and labels to `(-1,)` before the cross-entropy.
+            flat_logits = logits.view(-1, self.num_labels)
+            loss = CrossEntropyLoss()(flat_logits, labels.view(-1))
+
+        if return_dict:
+            return TokenClassifierOutput(loss=loss, logits=logits, hidden_states=encoder_outputs.hidden_states)
+
+        output = (logits,) + encoder_outputs[2:]
+        return output if loss is None else (loss,) + output
+
+
+@auto_docstring
+class FNetForQuestionAnswering(FNetPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+
+        self.num_labels = config.num_labels
+
+        self.fnet = FNetModel(config)
+        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
+
+        # Weight initialization and any remaining setup are delegated to `post_init`.
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        start_positions: Optional[torch.Tensor] = None,
+        end_positions: Optional[torch.Tensor] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, QuestionAnsweringModelOutput]:
+        if return_dict is None:
+            return_dict = self.config.use_return_dict
+
+        encoded = self.fnet(
+            input_ids,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            inputs_embeds=inputs_embeds,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        # Split the head output along the last dim into start and end logits, then drop that dim.
+        span_logits = self.qa_outputs(encoded[0])
+        start_logits, end_logits = span_logits.split(1, dim=-1)
+        start_logits = start_logits.squeeze(-1).contiguous()
+        end_logits = end_logits.squeeze(-1).contiguous()
+
+        total_loss = None
+        if start_positions is not None and end_positions is not None:
+            # Targets may arrive with an extra trailing dimension (e.g. on multi-GPU); remove it.
+            if start_positions.dim() > 1:
+                start_positions = start_positions.squeeze(-1)
+            if end_positions.dim() > 1:
+                end_positions = end_positions.squeeze(-1)
+
+            # Positions past the input are clamped onto `ignored_index`, which the loss then skips.
+            ignored_index = start_logits.size(1)
+            start_positions = start_positions.clamp(0, ignored_index)
+            end_positions = end_positions.clamp(0, ignored_index)
+            span_loss = CrossEntropyLoss(ignore_index=ignored_index)
+            start_loss = span_loss(start_logits, start_positions)
+            end_loss = span_loss(end_logits, end_positions)
+            total_loss = (start_loss + end_loss) / 2
+
+        if not return_dict:
+            output = (start_logits, end_logits) + encoded[2:]
+            return output if total_loss is None else (total_loss,) + output
+
+        return QuestionAnsweringModelOutput(
+            loss=total_loss, start_logits=start_logits, end_logits=end_logits, hidden_states=encoded.hidden_states
+        )
+
+
+__all__ = [
+    "FNetForMaskedLM",
+    "FNetForMultipleChoice",
+    "FNetForNextSentencePrediction",
+    "FNetForPreTraining",
+    "FNetForQuestionAnswering",
+    "FNetForSequenceClassification",
+    "FNetForTokenClassification",
+    "FNetLayer",
+    "FNetModel",
+    "FNetPreTrainedModel",
+]
diff --git a/phivenv/Lib/site-packages/transformers/models/fnet/tokenization_fnet.py b/phivenv/Lib/site-packages/transformers/models/fnet/tokenization_fnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..72aa202612e0bd451c413d071e84443c8c384e6e
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/fnet/tokenization_fnet.py
@@ -0,0 +1,314 @@
+# coding=utf-8
+# Copyright 2021 Google Research, Google AI, Google Brain and the HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization classes for FNet model."""
+
+import os
+import unicodedata
+from shutil import copyfile
+from typing import Any, Optional
+
+import sentencepiece as spm
+
+from ...tokenization_utils import AddedToken, PreTrainedTokenizer
+from ...utils import logging
+from ...utils.import_utils import requires
+
+
+logger = logging.get_logger(__name__)
+VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"}
+
+
+SPIECE_UNDERLINE = "▁"
+
+
+@requires(backends=("sentencepiece",))
+class FNetTokenizer(PreTrainedTokenizer):
+    """
+    Construct an FNet tokenizer. Adapted from [`AlbertTokenizer`]. Based on
+    [SentencePiece](https://github.com/google/sentencepiece). This tokenizer inherits from [`PreTrainedTokenizer`]
+    which contains most of the main methods. Users should refer to this superclass for more information regarding those
+    methods.
+
+    Args:
+        vocab_file (`str`):
+            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
+            contains the vocabulary necessary to instantiate a tokenizer.
+        do_lower_case (`bool`, *optional*, defaults to `False`):
+            Whether or not to lowercase the input when tokenizing.
+        remove_space (`bool`, *optional*, defaults to `True`):
+            Whether or not to strip the text when tokenizing (removing excess spaces before and after the string).
+        keep_accents (`bool`, *optional*, defaults to `True`):
+            Whether or not to keep accents when tokenizing.
+        unk_token (`str`, *optional*, defaults to `"<unk>"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        sep_token (`str`, *optional*, defaults to `"[SEP]"`):
+            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
+            sequence classification or for a text and a question for question answering. It is also used as the last
+            token of a sequence built with special tokens.
+        pad_token (`str`, *optional*, defaults to `"<pad>"`):
+            The token used for padding, for example when batching sequences of different lengths.
+        cls_token (`str`, *optional*, defaults to `"[CLS]"`):
+            The classifier token which is used when doing sequence classification (classification of the whole sequence
+            instead of per-token classification). It is the first token of the sequence when built with special tokens.
+        mask_token (`str`, *optional*, defaults to `"[MASK]"`):
+            The token used for masking values. This is the token used when training this model with masked language
+            modeling. This is the token which the model will try to predict.
+        sp_model_kwargs (`dict`, *optional*):
+            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
+            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
+            to set:
+
+            - `enable_sampling`: Enable subword regularization.
+            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
+
+              - `nbest_size = {0,1}`: No sampling is performed.
+              - `nbest_size > 1`: samples from the nbest_size results.
+              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
+                using forward-filtering-and-backward-sampling algorithm.
+            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
+              BPE-dropout.
+
+    Attributes:
+        sp_model (`SentencePieceProcessor`):
+            The *SentencePiece* processor that is used for every conversion (string, tokens and IDs).
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    model_input_names = ["input_ids", "token_type_ids"]
+
+    def __init__(
+        self,
+        vocab_file,
+        do_lower_case=False,
+        remove_space=True,
+        keep_accents=True,
+        unk_token="<unk>",
+        sep_token="[SEP]",
+        pad_token="<pad>",
+        cls_token="[CLS]",
+        mask_token="[MASK]",
+        sp_model_kwargs: Optional[dict[str, Any]] = None,
+        **kwargs,
+    ) -> None:
+        # `lstrip=True` lets the mask token take the space before it, so it matches like a normal word in raw text.
+        if isinstance(mask_token, str):
+            mask_token = AddedToken(mask_token, lstrip=True, special=True)
+        if isinstance(cls_token, str):
+            cls_token = AddedToken(cls_token, special=True)
+        if isinstance(sep_token, str):
+            sep_token = AddedToken(sep_token, special=True)
+
+        self.sp_model_kwargs = sp_model_kwargs if sp_model_kwargs is not None else {}
+        self.do_lower_case = do_lower_case
+        self.remove_space = remove_space
+        self.keep_accents = keep_accents
+        self.vocab_file = vocab_file
+
+        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+        self.sp_model.Load(vocab_file)
+        super().__init__(
+            do_lower_case=do_lower_case,
+            remove_space=remove_space,
+            keep_accents=keep_accents,
+            unk_token=unk_token,
+            sep_token=sep_token,
+            pad_token=pad_token,
+            cls_token=cls_token,
+            mask_token=mask_token,
+            sp_model_kwargs=self.sp_model_kwargs,
+            **kwargs,
+        )
+
+    @property
+    def vocab_size(self):
+        """Number of pieces in the loaded SentencePiece model (added tokens are not counted)."""
+        return len(self.sp_model)
+
+    def get_vocab(self):
+        """Return the token -> id mapping: every SentencePiece id, updated with the added tokens."""
+        vocab = {self.convert_ids_to_tokens(idx): idx for idx in range(self.vocab_size)}
+        vocab.update(self.added_tokens_encoder)
+        return vocab
+
+    def __getstate__(self):
+        # The SentencePiece processor is left out of the pickled state; `__setstate__` reloads it.
+        return {**self.__dict__, "sp_model": None}
+
+    def __setstate__(self, d):
+        """Restore the pickled attributes and reload the SentencePiece model from `self.vocab_file`."""
+        self.__dict__ = d
+        # Backward compatibility: states saved without `sp_model_kwargs` get an empty dict.
+        if not hasattr(self, "sp_model_kwargs"):
+            self.sp_model_kwargs = {}
+
+        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+        self.sp_model.Load(self.vocab_file)
+
+    def preprocess_text(self, inputs):
+        """Normalize raw text: optional whitespace squeeze, quote folding, accent stripping, lowercasing."""
+        text = " ".join(inputs.strip().split()) if self.remove_space else inputs
+        # Pairs of backticks or single quotes are mapped to a plain double quote.
+        text = text.replace("``", '"').replace("''", '"')
+
+        if not self.keep_accents:
+            # Decompose characters with NFKD, then drop the combining marks.
+            decomposed = unicodedata.normalize("NFKD", text)
+            text = "".join(ch for ch in decomposed if not unicodedata.combining(ch))
+        return text.lower() if self.do_lower_case else text
+
+    def _tokenize(self, text: str) -> list[str]:
+        """Split `text` into SentencePiece pieces, re-splitting pieces that end in a digit plus a comma."""
+        tokens = []
+        for piece in self.sp_model.encode(self.preprocess_text(text), out_type=str):
+            ends_with_digit_comma = len(piece) > 1 and piece[-1] == "," and piece[-2].isdigit()
+            if not ends_with_digit_comma:
+                tokens.append(piece)
+                continue
+            # Re-encode the part before the comma (underline removed) and emit the comma on its own.
+            sub_pieces = self.sp_model.EncodeAsPieces(piece[:-1].replace(SPIECE_UNDERLINE, ""))
+            if piece[0] != SPIECE_UNDERLINE and sub_pieces[0][0] == SPIECE_UNDERLINE:
+                # The re-encoding begins with an underline the original piece did not have: strip it.
+                if len(sub_pieces[0]) == 1:
+                    sub_pieces = sub_pieces[1:]
+                else:
+                    sub_pieces[0] = sub_pieces[0][1:]
+            sub_pieces.append(piece[-1])
+            tokens.extend(sub_pieces)
+        return tokens
+
+    def _convert_token_to_id(self, token):
+        """Look up the id of a token (str) in the SentencePiece model."""
+        return self.sp_model.PieceToId(token)
+
+    def _convert_id_to_token(self, index):
+        """Look up the token (str) for an index (integer) in the SentencePiece model."""
+        return self.sp_model.IdToPiece(index)
+
+    # Copied from transformers.models.albert.tokenization_albert.AlbertTokenizer.convert_tokens_to_string
+    def convert_tokens_to_string(self, tokens):
+        """Converts a sequence of tokens (string) in a single string."""
+        current_sub_tokens = []
+        out_string = ""
+        prev_is_special = False
+        for token in tokens:
+            # make sure that special tokens are not decoded using sentencepiece model
+            if token in self.all_special_tokens:
+                if not prev_is_special:
+                    out_string += " "
+                out_string += self.sp_model.decode(current_sub_tokens) + token
+                prev_is_special = True
+                current_sub_tokens = []
+            else:
+                current_sub_tokens.append(token)
+                prev_is_special = False
+        out_string += self.sp_model.decode(current_sub_tokens)
+        return out_string.strip()
+
+    def _decode(
+        self,
+        token_ids: list[int],
+        skip_special_tokens: bool = False,
+        clean_up_tokenization_spaces: Optional[bool] = None,
+        spaces_between_special_tokens: bool = False,
+        **kwargs,
+    ) -> str:
+        """Decode `token_ids` with the base implementation, then post-process the space after `<unk>`."""
+        decoded = super()._decode(
+            token_ids=token_ids,
+            skip_special_tokens=skip_special_tokens,
+            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+            spaces_between_special_tokens=spaces_between_special_tokens,
+            **kwargs,
+        )
+        # Mimic the Rust tokenizer, which puts no space after <unk>.
+        if spaces_between_special_tokens:
+            return decoded
+        return decoded.replace("<unk> ", "<unk>")
+
+    def build_inputs_with_special_tokens(
+        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
+    ) -> list[int]:
+        """
+        Wrap one sequence, or a pair of sequences, with the special tokens used for sequence classification tasks.
+        The resulting FNet input has the following format:
+
+        - single sequence: `[CLS] X [SEP]`
+        - pair of sequences: `[CLS] A [SEP] B [SEP]`
+
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs to which the special tokens will be added.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
+        """
+        sep_ids, cls_ids = [self.sep_token_id], [self.cls_token_id]
+        with_specials = cls_ids + token_ids_0 + sep_ids
+        if token_ids_1 is not None:
+            with_specials = with_specials + token_ids_1 + sep_ids
+        return with_specials
+
+    def get_special_tokens_mask(
+        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False
+    ) -> list[int]:
+        """
+        Build the special-tokens mask for a token list that has no special tokens added yet. This method is called
+        when adding special tokens using the tokenizer `prepare_for_model` method.
+
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
+                Whether or not the token list is already formatted with special tokens for the model.
+
+        Returns:
+            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+        """
+        if already_has_special_tokens:
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+            )
+
+        # Same layout as `build_inputs_with_special_tokens`: 1 marks each [CLS]/[SEP] slot.
+        mask = [1] + [0] * len(token_ids_0) + [1]
+        if token_ids_1 is not None:
+            mask += [0] * len(token_ids_1) + [1]
+        return mask
+
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
+        """Save the SentencePiece vocabulary file to `save_directory` and return its path as a 1-tuple."""
+        if not os.path.isdir(save_directory):
+            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+            return
+        prefix = filename_prefix + "-" if filename_prefix else ""
+        out_vocab_file = os.path.join(save_directory, prefix + VOCAB_FILES_NAMES["vocab_file"])
+
+        source_exists = os.path.isfile(self.vocab_file)
+        if not source_exists:
+            # No file on disk to copy: write out the serialized in-memory model instead.
+            with open(out_vocab_file, "wb") as fi:
+                fi.write(self.sp_model.serialized_model_proto())
+        elif os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
+            copyfile(self.vocab_file, out_vocab_file)
+
+        return (out_vocab_file,)
+
+
+__all__ = ["FNetTokenizer"]
diff --git a/phivenv/Lib/site-packages/transformers/models/fnet/tokenization_fnet_fast.py b/phivenv/Lib/site-packages/transformers/models/fnet/tokenization_fnet_fast.py
new file mode 100644
index 0000000000000000000000000000000000000000..4aab7997650f42bc0a7c185c5fa876a85c157a41
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/fnet/tokenization_fnet_fast.py
@@ -0,0 +1,155 @@
+# coding=utf-8
+# Copyright 2021 Google AI, Google Brain and the HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization classes for FNet model."""
+
+import os
+from shutil import copyfile
+from typing import Optional
+
+from ...tokenization_utils import AddedToken
+from ...tokenization_utils_fast import PreTrainedTokenizerFast
+from ...utils import is_sentencepiece_available, logging
+
+
+if is_sentencepiece_available():
+    from .tokenization_fnet import FNetTokenizer
+else:
+    FNetTokenizer = None
+
+logger = logging.get_logger(__name__)
+VOCAB_FILES_NAMES = {"vocab_file": "spiece.model", "tokenizer_file": "tokenizer.json"}
+
+
+SPIECE_UNDERLINE = "▁"
+
+
+class FNetTokenizerFast(PreTrainedTokenizerFast):
+    """
+    Construct a "fast" FNetTokenizer (backed by HuggingFace's *tokenizers* library). Adapted from
+    [`AlbertTokenizerFast`]. Based on
+    [Unigram](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=unigram#models). This
+    tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should refer to
+    this superclass for more information regarding those methods.
+
+    Args:
+        vocab_file (`str`):
+            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
+            contains the vocabulary necessary to instantiate a tokenizer.
+        do_lower_case (`bool`, *optional*, defaults to `False`):
+            Whether or not to lowercase the input when tokenizing.
+        remove_space (`bool`, *optional*, defaults to `True`):
+            Whether or not to strip the text when tokenizing (removing excess spaces before and after the string).
+        keep_accents (`bool`, *optional*, defaults to `True`):
+            Whether or not to keep accents when tokenizing.
+        unk_token (`str`, *optional*, defaults to `"<unk>"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        sep_token (`str`, *optional*, defaults to `"[SEP]"`):
+            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
+            sequence classification or for a text and a question for question answering. It is also used as the last
+            token of a sequence built with special tokens.
+        pad_token (`str`, *optional*, defaults to `"<pad>"`):
+            The token used for padding, for example when batching sequences of different lengths.
+        cls_token (`str`, *optional*, defaults to `"[CLS]"`):
+            The classifier token which is used when doing sequence classification (classification of the whole sequence
+            instead of per-token classification). It is the first token of the sequence when built with special tokens.
+        mask_token (`str`, *optional*, defaults to `"[MASK]"`):
+            The token used for masking values. This is the token used when training this model with masked language
+            modeling. This is the token which the model will try to predict.
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    model_input_names = ["input_ids", "token_type_ids"]
+    slow_tokenizer_class = FNetTokenizer
+
+    def __init__(
+        self,
+        vocab_file=None,
+        tokenizer_file=None,
+        do_lower_case=False,
+        remove_space=True,
+        keep_accents=True,
+        unk_token="<unk>",
+        sep_token="[SEP]",
+        pad_token="<pad>",
+        cls_token="[CLS]",
+        mask_token="[MASK]",
+        **kwargs,
+    ):
+        # `lstrip=True` lets the mask token take the space before it, so it matches like a normal word in raw text.
+        mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
+        cls_token = AddedToken(cls_token, lstrip=False, rstrip=False) if isinstance(cls_token, str) else cls_token
+        sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token
+
+        super().__init__(
+            vocab_file,
+            tokenizer_file=tokenizer_file,
+            do_lower_case=do_lower_case,
+            remove_space=remove_space,
+            keep_accents=keep_accents,
+            unk_token=unk_token,
+            sep_token=sep_token,
+            pad_token=pad_token,
+            cls_token=cls_token,
+            mask_token=mask_token,
+            **kwargs,
+        )
+
+        self.do_lower_case = do_lower_case
+        self.remove_space = remove_space
+        self.keep_accents = keep_accents
+        self.vocab_file = vocab_file
+
+    def build_inputs_with_special_tokens(
+        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
+    ) -> list[int]:
+        """
+        Wrap one sequence, or a pair of sequences, with the special tokens used for sequence classification tasks.
+        The resulting FNet input has the following format:
+
+        - single sequence: `[CLS] X [SEP]`
+        - pair of sequences: `[CLS] A [SEP] B [SEP]`
+
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs to which the special tokens will be added.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
+        """
+        sep = [self.sep_token_id]
+        cls = [self.cls_token_id]
+        if token_ids_1 is None:
+            return cls + token_ids_0 + sep
+        return cls + token_ids_0 + sep + token_ids_1 + sep
+
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
+        """Copy the SentencePiece file into `save_directory`; raises `ValueError` if `vocab_file` is unset."""
+        if not os.path.isdir(save_directory):
+            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+            return
+        if not self.vocab_file:
+            raise ValueError("No `vocab_file` is set, so there is no SentencePiece vocabulary to save.")
+        prefix = filename_prefix + "-" if filename_prefix else ""
+        out_vocab_file = os.path.join(save_directory, prefix + VOCAB_FILES_NAMES["vocab_file"])
+        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
+            copyfile(self.vocab_file, out_vocab_file)
+
+        return (out_vocab_file,)
+
+
+__all__ = ["FNetTokenizerFast"]
diff --git a/phivenv/Lib/site-packages/transformers/models/focalnet/__init__.py b/phivenv/Lib/site-packages/transformers/models/focalnet/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..5dec8135f3b3030b20691e761483e5994ba441f0
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/focalnet/__init__.py
@@ -0,0 +1,27 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:  # only type checkers take this branch; it never executes at runtime
+    from .configuration_focalnet import *
+    from .modeling_focalnet import *
+else:  # at runtime, replace this module in sys.modules with a _LazyModule built from this file's path
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/phivenv/Lib/site-packages/transformers/models/focalnet/__pycache__/__init__.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/focalnet/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0a4fd337f666974dbcc89370f6a50515c6110650
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/focalnet/__pycache__/__init__.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/focalnet/__pycache__/configuration_focalnet.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/focalnet/__pycache__/configuration_focalnet.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..efaa5674abe714b3c6a555dd48425a86b62b5dbc
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/focalnet/__pycache__/configuration_focalnet.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/focalnet/__pycache__/modeling_focalnet.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/focalnet/__pycache__/modeling_focalnet.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9589e8a88c21d4d5f9ab27e0c750d3e816a0024f
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/focalnet/__pycache__/modeling_focalnet.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/focalnet/configuration_focalnet.py b/phivenv/Lib/site-packages/transformers/models/focalnet/configuration_focalnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..e412e3824e163379abbca82e31f5af01cb18c7fc
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/focalnet/configuration_focalnet.py
@@ -0,0 +1,164 @@
+# coding=utf-8
+# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""FocalNet model configuration"""
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+from ...utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices
+
+
+logger = logging.get_logger(__name__)
+
+
+class FocalNetConfig(BackboneConfigMixin, PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`FocalNetModel`]. It is used to instantiate a
+    FocalNet model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the FocalNet
+    [microsoft/focalnet-tiny](https://huggingface.co/microsoft/focalnet-tiny) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        image_size (`int`, *optional*, defaults to 224):
+            The size (resolution) of each image.
+        patch_size (`int`, *optional*, defaults to 4):
+            The size (resolution) of each patch in the embeddings layer.
+        num_channels (`int`, *optional*, defaults to 3):
+            The number of input channels.
+        embed_dim (`int`, *optional*, defaults to 96):
+            Dimensionality of patch embedding.
+        use_conv_embed (`bool`, *optional*, defaults to `False`):
+            Whether to use convolutional embedding. The authors noted that using convolutional embedding usually
+            improve the performance, but it's not used by default.
+        hidden_sizes (`list[int]`, *optional*, defaults to `[192, 384, 768, 768]`):
+            Dimensionality (hidden size) at each stage.
+        depths (`list(int)`, *optional*, defaults to `[2, 2, 6, 2]`):
+            Depth (number of layers) of each stage in the encoder.
+        focal_levels (`list(int)`, *optional*, defaults to `[2, 2, 2, 2]`):
+            Number of focal levels in each layer of the respective stages in the encoder.
+        focal_windows (`list(int)`, *optional*, defaults to `[3, 3, 3, 3]`):
+            Focal window size in each layer of the respective stages in the encoder.
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder. If string, `"gelu"`, `"relu"`,
+            `"selu"` and `"gelu_new"` are supported.
+        mlp_ratio (`float`, *optional*, defaults to 4.0):
+            Ratio of MLP hidden dimensionality to embedding dimensionality.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
+            The dropout probability for all fully connected layers in the embeddings and encoder.
+        drop_path_rate (`float`, *optional*, defaults to 0.1):
+            Stochastic depth rate.
+        use_layerscale (`bool`, *optional*, defaults to `False`):
+            Whether to use layer scale in the encoder.
+        layerscale_value (`float`, *optional*, defaults to 0.0001):
+            The initial value of the layer scale.
+        use_post_layernorm (`bool`, *optional*, defaults to `False`):
+            Whether to use post layer normalization in the encoder.
+        use_post_layernorm_in_modulation (`bool`, *optional*, defaults to `False`):
+            Whether to use post layer normalization in the modulation layer.
+        normalize_modulator (`bool`, *optional*, defaults to `False`):
+            Whether to normalize the modulator.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the layer normalization layers.
+        encoder_stride (`int`, *optional*, defaults to 32):
+            Factor to increase the spatial resolution by in the decoder head for masked image modeling.
+        out_features (`list[str]`, *optional*):
+            If used as backbone, list of features to output. Can be any of `"stem"`, `"stage1"`, `"stage2"`, etc.
+            (depending on how many stages the model has). If unset and `out_indices` is set, will default to the
+            corresponding stages. If unset and `out_indices` is unset, will default to the last stage. Must be in the
+            same order as defined in the `stage_names` attribute.
+        out_indices (`list[int]`, *optional*):
+            If used as backbone, list of indices of features to output. Can be any of 0, 1, 2, etc. (depending on how
+            many stages the model has). If unset and `out_features` is set, will default to the corresponding stages.
+            If unset and `out_features` is unset, will default to the last stage. Must be in the
+            same order as defined in the `stage_names` attribute.
+
+    Example:
+
+    ```python
+    >>> from transformers import FocalNetConfig, FocalNetModel
+
+    >>> # Initializing a FocalNet microsoft/focalnet-tiny style configuration
+    >>> configuration = FocalNetConfig()
+
+    >>> # Initializing a model (with random weights) from the microsoft/focalnet-tiny style configuration
+    >>> model = FocalNetModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "focalnet"
+
+    def __init__(
+        self,
+        image_size=224,
+        patch_size=4,
+        num_channels=3,
+        embed_dim=96,
+        use_conv_embed=False,
+        hidden_sizes=[192, 384, 768, 768],
+        depths=[2, 2, 6, 2],
+        focal_levels=[2, 2, 2, 2],
+        focal_windows=[3, 3, 3, 3],
+        hidden_act="gelu",
+        mlp_ratio=4.0,
+        hidden_dropout_prob=0.0,
+        drop_path_rate=0.1,
+        use_layerscale=False,
+        layerscale_value=1e-4,
+        use_post_layernorm=False,
+        use_post_layernorm_in_modulation=False,
+        normalize_modulator=False,
+        initializer_range=0.02,
+        layer_norm_eps=1e-5,
+        encoder_stride=32,
+        out_features=None,
+        out_indices=None,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        self.image_size = image_size
+        self.patch_size = patch_size
+        self.num_channels = num_channels
+        self.embed_dim = embed_dim
+        self.use_conv_embed = use_conv_embed
+        self.hidden_sizes = hidden_sizes
+        self.depths = depths
+        self.focal_levels = focal_levels
+        self.focal_windows = focal_windows
+        self.hidden_act = hidden_act
+        self.mlp_ratio = mlp_ratio
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.drop_path_rate = drop_path_rate
+        self.use_layerscale = use_layerscale
+        self.layerscale_value = layerscale_value
+        self.use_post_layernorm = use_post_layernorm
+        self.use_post_layernorm_in_modulation = use_post_layernorm_in_modulation
+        self.normalize_modulator = normalize_modulator
+        self.initializer_range = initializer_range
+        self.layer_norm_eps = layer_norm_eps
+        self.encoder_stride = encoder_stride
+        self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(self.depths) + 1)]
+        self._out_features, self._out_indices = get_aligned_output_features_output_indices(
+            out_features=out_features, out_indices=out_indices, stage_names=self.stage_names
+        )
+
+
+__all__ = ["FocalNetConfig"]
diff --git a/phivenv/Lib/site-packages/transformers/models/focalnet/modeling_focalnet.py b/phivenv/Lib/site-packages/transformers/models/focalnet/modeling_focalnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..99cba945cfe084bb76b96a2a0851cc3a9c85c8e5
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/focalnet/modeling_focalnet.py
@@ -0,0 +1,958 @@
+# coding=utf-8
+# Copyright 2023 Microsoft Research and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch FocalNet model."""
+
+import collections.abc
+import math
+from dataclasses import dataclass
+from typing import Optional, Union
+
+import torch
+import torch.utils.checkpoint
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+
+from ...activations import ACT2FN
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import BackboneOutput
+from ...modeling_utils import PreTrainedModel
+from ...utils import ModelOutput, auto_docstring, logging
+from ...utils.backbone_utils import BackboneMixin
+from .configuration_focalnet import FocalNetConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    FocalNet encoder's outputs, with potential hidden states.
+    """
+)
+class FocalNetEncoderOutput(ModelOutput):
+    r"""
+    reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
+        shape `(batch_size, hidden_size, height, width)`.
+
+        Hidden-states of the model at the output of each stage plus the initial embedding outputs reshaped to
+        include the spatial dimensions.
+    """
+
+    last_hidden_state: Optional[torch.FloatTensor] = None
+    hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    reshaped_hidden_states: Optional[tuple[torch.FloatTensor]] = None
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    FocalNet model's outputs that also contains a pooling of the last hidden states.
+    """
+)
+class FocalNetModelOutput(ModelOutput):
+    r"""
+    pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`, *optional*, returned when `add_pooling_layer=True` is passed):
+        Average pooling of the last layer hidden-state.
+    reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
+        shape `(batch_size, hidden_size, height, width)`.
+
+        Hidden-states of the model at the output of each stage plus the initial embedding outputs reshaped to
+        include the spatial dimensions.
+    """
+
+    last_hidden_state: Optional[torch.FloatTensor] = None
+    pooler_output: Optional[torch.FloatTensor] = None
+    hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    reshaped_hidden_states: Optional[tuple[torch.FloatTensor]] = None
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    FocalNet masked image model outputs.
+    """
+)
+class FocalNetMaskedImageModelingOutput(ModelOutput):
+    r"""
+    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `bool_masked_pos` is provided):
+        Masked image modeling (MIM) loss.
+    reconstruction (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
+        Reconstructed pixel values.
+    reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
+        shape `(batch_size, hidden_size, height, width)`.
+
+        Hidden-states of the model at the output of each stage plus the initial embedding outputs reshaped to
+        include the spatial dimensions.
+    """
+
+    loss: Optional[torch.FloatTensor] = None
+    reconstruction: Optional[torch.FloatTensor] = None
+    hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    reshaped_hidden_states: Optional[tuple[torch.FloatTensor]] = None
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    FocalNet outputs for image classification.
+    """
+)
+class FocalNetImageClassifierOutput(ModelOutput):
+    r"""
+    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+        Classification (or regression if config.num_labels==1) loss.
+    logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
+        Classification (or regression if config.num_labels==1) scores (before SoftMax).
+    reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
+        shape `(batch_size, hidden_size, height, width)`.
+
+        Hidden-states of the model at the output of each stage plus the initial embedding outputs reshaped to
+        include the spatial dimensions.
+    """
+
+    loss: Optional[torch.FloatTensor] = None
+    logits: Optional[torch.FloatTensor] = None
+    hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    reshaped_hidden_states: Optional[tuple[torch.FloatTensor]] = None
+
+
+class FocalNetEmbeddings(nn.Module):
+    """
+    Construct the patch embeddings and layernorm. Optionally, also the mask token.
+    """
+
+    def __init__(self, config, use_mask_token=False):
+        super().__init__()
+
+        self.patch_embeddings = FocalNetPatchEmbeddings(
+            config=config,
+            image_size=config.image_size,
+            patch_size=config.patch_size,
+            num_channels=config.num_channels,
+            embed_dim=config.embed_dim,
+            use_conv_embed=config.use_conv_embed,
+            is_stem=True,
+        )
+        self.patch_grid = self.patch_embeddings.grid_size
+        self.mask_token = nn.Parameter(torch.zeros(1, 1, config.embed_dim)) if use_mask_token else None
+
+        self.norm = nn.LayerNorm(config.embed_dim, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(
+        self, pixel_values: Optional[torch.FloatTensor], bool_masked_pos: Optional[torch.BoolTensor] = None
+    ) -> tuple[torch.Tensor]:
+        embeddings, output_dimensions = self.patch_embeddings(pixel_values)  # output_dimensions is (height, width)
+        embeddings = self.norm(embeddings)
+        batch_size, seq_len, _ = embeddings.size()
+
+        if bool_masked_pos is not None:  # NOTE(review): needs use_mask_token=True, else self.mask_token is None
+            mask_tokens = self.mask_token.expand(batch_size, seq_len, -1)
+            # blend: positions where the mask is 1 take the mask token, all others keep their patch embedding
+            mask = bool_masked_pos.unsqueeze(-1).type_as(mask_tokens)
+            embeddings = embeddings * (1.0 - mask) + mask_tokens * mask
+
+        embeddings = self.dropout(embeddings)
+        return embeddings, output_dimensions
+
+
+class FocalNetPatchEmbeddings(nn.Module):  # Conv2d patch projection with an optional LayerNorm on the tokens
+    def __init__(
+        self,
+        config,
+        image_size,
+        patch_size,
+        num_channels,
+        embed_dim,
+        add_norm=False,
+        use_conv_embed=False,
+        is_stem=False,
+    ):
+        super().__init__()
+        image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
+        patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
+        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
+        self.image_size = image_size
+        self.patch_size = patch_size
+        self.num_channels = num_channels
+        self.num_patches = num_patches
+        self.grid_size = (image_size[0] // patch_size[0], image_size[1] // patch_size[1])
+
+        if use_conv_embed:
+            # if we choose to use conv embedding, then we treat the stem and non-stem differently
+            if is_stem:
+                kernel_size = 7
+                padding = 2
+                stride = 4  # NOTE(review): hard-coded, not derived from patch_size (which still sets grid_size)
+            else:
+                kernel_size = 3
+                padding = 1
+                stride = 2  # NOTE(review): hard-coded as well — confirm callers always pass patch_size=2 here
+            self.projection = nn.Conv2d(
+                num_channels, embed_dim, kernel_size=kernel_size, stride=stride, padding=padding
+            )
+        else:
+            self.projection = nn.Conv2d(num_channels, embed_dim, kernel_size=patch_size, stride=patch_size)
+
+        if add_norm:
+            self.norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
+        else:
+            self.norm = None
+
+    def maybe_pad(self, pixel_values, height, width):
+        if width % self.patch_size[1] != 0:
+            pad_values = (0, self.patch_size[1] - width % self.patch_size[1])  # (left, right) of the last dim
+            pixel_values = nn.functional.pad(pixel_values, pad_values)
+        if height % self.patch_size[0] != 0:
+            pad_values = (0, 0, 0, self.patch_size[0] - height % self.patch_size[0])  # (left, right, top, bottom)
+            pixel_values = nn.functional.pad(pixel_values, pad_values)
+        return pixel_values
+
+    def forward(self, pixel_values: Optional[torch.FloatTensor]) -> tuple[torch.Tensor, tuple[int]]:
+        _, num_channels, height, width = pixel_values.shape
+        if num_channels != self.num_channels:
+            raise ValueError(
+                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
+            )
+        # pad the input to be divisible by self.patch_size, if needed
+        pixel_values = self.maybe_pad(pixel_values, height, width)
+        embeddings = self.projection(pixel_values)
+        _, _, height, width = embeddings.shape  # spatial size after the projection, not of the input image
+        output_dimensions = (height, width)
+        embeddings = embeddings.flatten(2).transpose(1, 2)  # (B, C, H, W) -> (B, H*W, C)
+
+        if self.norm is not None:
+            embeddings = self.norm(embeddings)
+
+        return embeddings, output_dimensions
+
+
+# Copied from transformers.models.beit.modeling_beit.drop_path
+def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
+    """
+    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
+
+    Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
+    however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
+    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
+    layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
+    argument.
+    """
+    if drop_prob == 0.0 or not training:
+        return input
+    keep_prob = 1 - drop_prob
+    shape = (input.shape[0],) + (1,) * (input.ndim - 1)  # work with diff dim tensors, not just 2D ConvNets
+    random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
+    random_tensor.floor_()  # binarize: 1 with probability keep_prob, otherwise 0 (one draw per sample)
+    output = input.div(keep_prob) * random_tensor  # kept samples are scaled by 1 / keep_prob, dropped ones zeroed
+    return output
+
+
+# Copied from transformers.models.beit.modeling_beit.BeitDropPath with Beit->FocalNet
+class FocalNetDropPath(nn.Module):
+    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
+
+    def __init__(self, drop_prob: Optional[float] = None) -> None:
+        super().__init__()
+        self.drop_prob = drop_prob  # NOTE(review): the None default would fail in drop_path (1 - None) when training
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        return drop_path(hidden_states, self.drop_prob, self.training)  # identity in eval mode or when drop_prob == 0.0
+
+    def extra_repr(self) -> str:
+        return f"p={self.drop_prob}"
+
+
+class FocalNetModulation(nn.Module):
+    def __init__(self, config, index, dim, focal_factor=2, bias=True, projection_dropout=0.0):
+        super().__init__()
+
+        self.dim = dim
+        self.focal_window = config.focal_windows[index]
+        self.focal_level = config.focal_levels[index]
+        self.focal_factor = focal_factor
+        self.use_post_layernorm_in_modulation = config.use_post_layernorm_in_modulation
+        self.normalize_modulator = config.normalize_modulator
+
+        self.projection_in = nn.Linear(dim, 2 * dim + (self.focal_level + 1), bias=bias)
+        self.projection_context = nn.Conv2d(dim, dim, kernel_size=1, stride=1, bias=bias)
+
+        self.activation = nn.GELU()
+        self.projection_out = nn.Linear(dim, dim)
+        self.projection_dropout = nn.Dropout(projection_dropout)
+        self.focal_layers = nn.ModuleList()
+
+        self.kernel_sizes = []
+        for k in range(self.focal_level):
+            kernel_size = self.focal_factor * k + self.focal_window  # grows by focal_factor with every level
+            self.focal_layers.append(
+                nn.Sequential(
+                    nn.Conv2d(
+                        dim, dim, kernel_size=kernel_size, stride=1, groups=dim, padding=kernel_size // 2, bias=False
+                    ),
+                    nn.GELU(),
+                )
+            )
+            self.kernel_sizes.append(kernel_size)
+        if self.use_post_layernorm_in_modulation:
+            self.layernorm = nn.LayerNorm(dim, eps=config.layer_norm_eps)
+
+    def forward(self, hidden_state):
+        """
+        Args:
+            hidden_state:
+                Input features with shape of (batch_size, height, width, num_channels)
+        """
+        num_channels = hidden_state.shape[-1]
+
+        # pre linear projection, then split channels into query, context and focal_level + 1 gate maps
+        x = self.projection_in(hidden_state).permute(0, 3, 1, 2).contiguous()
+        q, ctx, gates = torch.split(x, (num_channels, num_channels, self.focal_level + 1), 1)
+
+        # context aggregation: each focal layer consumes the previous level's output, weighted by its own gate
+        ctx_all = 0
+        for level in range(self.focal_level):
+            ctx = self.focal_layers[level](ctx)
+            ctx_all = ctx_all + ctx * gates[:, level : level + 1]
+        ctx_global = self.activation(ctx.mean(2, keepdim=True).mean(3, keepdim=True))  # spatial mean of last level
+        ctx_all = ctx_all + ctx_global * gates[:, self.focal_level :]
+
+        # normalize context
+        if self.normalize_modulator:
+            ctx_all = ctx_all / (self.focal_level + 1)
+
+        # focal modulation: the query is multiplied element-wise by the projected context
+        modulator = self.projection_context(ctx_all)
+        x_out = q * modulator
+        x_out = x_out.permute(0, 2, 3, 1).contiguous()  # back to channels-last for LayerNorm / Linear
+        if self.use_post_layernorm_in_modulation:
+            x_out = self.layernorm(x_out)
+
+        # post linear projection
+        x_out = self.projection_out(x_out)
+        x_out = self.projection_dropout(x_out)
+        return x_out
+
+
+class FocalNetMlp(nn.Module):  # two-layer feed-forward block: Linear -> activation -> dropout -> Linear -> dropout
+    def __init__(self, config, in_features, hidden_features=None, out_features=None, drop=0.0):
+        super().__init__()
+        out_features = out_features or in_features  # falsy values fall back to in_features
+        hidden_features = hidden_features or in_features
+        self.fc1 = nn.Linear(in_features, hidden_features)
+        self.activation = ACT2FN[config.hidden_act]
+        self.fc2 = nn.Linear(hidden_features, out_features)
+        self.drop = nn.Dropout(drop)
+
+    def forward(self, hidden_state):
+        hidden_state = self.fc1(hidden_state)
+        hidden_state = self.activation(hidden_state)
+        hidden_state = self.drop(hidden_state)
+        hidden_state = self.fc2(hidden_state)
+        hidden_state = self.drop(hidden_state)  # the same Dropout module is reused after the second projection
+        return hidden_state
+
+
+class FocalNetLayer(nn.Module):
+    r"""Focal Modulation Network layer (block).
+
+    Args:
+        config (`FocalNetConfig`):
+            Model config.
+        index (`int`):
+            Layer index.
+        dim (`int`):
+            Number of input channels.
+        input_resolution (`tuple[int]`):
+            Input resolution.
+        drop_path (`float`, *optional*, defaults to 0.0):
+            Stochastic depth rate.
+    """
+
+    def __init__(self, config, index, dim, input_resolution, drop_path=0.0):
+        super().__init__()
+
+        self.config = config
+
+        # layer-specific attributes
+        self.dim = dim
+        self.input_resolution = input_resolution
+
+        # general attributes
+        self.drop = config.hidden_dropout_prob
+        self.use_post_layernorm = config.use_post_layernorm
+
+        self.norm1 = nn.LayerNorm(dim, eps=config.layer_norm_eps)
+        self.modulation = FocalNetModulation(
+            config=config,
+            index=index,
+            dim=dim,
+            projection_dropout=self.drop,
+        )
+
+        self.drop_path = FocalNetDropPath(drop_path) if drop_path > 0.0 else nn.Identity()
+        self.norm2 = nn.LayerNorm(dim, eps=config.layer_norm_eps)
+        mlp_hidden_dim = int(dim * config.mlp_ratio)
+        self.mlp = FocalNetMlp(config=config, in_features=dim, hidden_features=mlp_hidden_dim, drop=self.drop)
+
+        self.gamma_1 = 1.0  # plain float scales unless config.use_layerscale replaces them with parameters below
+        self.gamma_2 = 1.0
+        if config.use_layerscale:
+            self.gamma_1 = nn.Parameter(config.layerscale_value * torch.ones(dim), requires_grad=True)
+            self.gamma_2 = nn.Parameter(config.layerscale_value * torch.ones(dim), requires_grad=True)
+
+    def forward(self, hidden_state, input_dimensions):
+        height, width = input_dimensions
+        batch_size, _, num_channels = hidden_state.shape
+        shortcut = hidden_state
+
+        # Focal Modulation (norm1 runs before it by default, after it when use_post_layernorm is set)
+        hidden_state = hidden_state if self.use_post_layernorm else self.norm1(hidden_state)
+        hidden_state = hidden_state.view(batch_size, height, width, num_channels)  # (B, H*W, C) -> (B, H, W, C)
+        hidden_state = self.modulation(hidden_state).view(batch_size, height * width, num_channels)
+        hidden_state = hidden_state if not self.use_post_layernorm else self.norm1(hidden_state)
+
+        # Residual connections: shortcut + modulation branch, then + MLP branch (norm2 is placed the same way)
+        hidden_state = shortcut + self.drop_path(self.gamma_1 * hidden_state)
+        hidden_state = hidden_state + self.drop_path(
+            self.gamma_2
+            * (self.norm2(self.mlp(hidden_state)) if self.use_post_layernorm else self.mlp(self.norm2(hidden_state)))
+        )
+
+        return hidden_state
+
+
+class FocalNetStage(GradientCheckpointingLayer):  # a run of FocalNetLayer blocks plus an optional downsampler
+    def __init__(self, config, index, input_resolution):
+        super().__init__()
+
+        self.config = config
+        self.num_stages = len(config.depths)
+
+        embed_dim = [config.embed_dim * (2**i) for i in range(self.num_stages)]  # channel width doubles per stage
+        dim = embed_dim[index]
+        out_dim = embed_dim[index + 1] if (index < self.num_stages - 1) else None
+        downsample = FocalNetPatchEmbeddings if (index < self.num_stages - 1) else None  # last stage keeps its size
+
+        # stochastic depth decay rule
+        dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths), device="cpu")]
+        drop_path = dpr[sum(config.depths[:index]) : sum(config.depths[: index + 1])]  # this stage's slice of rates
+
+        self.layers = nn.ModuleList(
+            [
+                FocalNetLayer(
+                    config=config,
+                    index=index,
+                    dim=dim,
+                    input_resolution=input_resolution,
+                    drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path,
+                )
+                for i in range(config.depths[index])
+            ]
+        )
+
+        if downsample is not None:
+            self.downsample = downsample(
+                config=config,
+                image_size=input_resolution,
+                patch_size=2,
+                num_channels=dim,
+                embed_dim=out_dim,
+                add_norm=True,
+                use_conv_embed=config.use_conv_embed,
+                is_stem=False,
+            )
+        else:
+            self.downsample = None
+
+        self.pointing = False  # NOTE(review): never read inside this class — confirm whether it is still needed
+
+    def forward(self, hidden_states: torch.Tensor, input_dimensions: tuple[int, int]) -> tuple[torch.Tensor]:
+        height, width = input_dimensions
+        for layer_module in self.layers:
+            hidden_states = layer_module(hidden_states, input_dimensions)
+
+        hidden_states_before_downsampling = hidden_states
+        if self.downsample is not None:
+            height, width = input_dimensions  # redundant: same values were unpacked at the top of forward
+            hidden_states = hidden_states.transpose(1, 2).reshape(  # (B, H*W, C) -> (B, C, H, W) for the conv
+                hidden_states_before_downsampling.shape[0], -1, height, width
+            )
+            hidden_states, output_dimensions = self.downsample(hidden_states)
+
+        else:
+            output_dimensions = (height, width, height, width)  # NOTE(review): 4-tuple here, 2-tuple from downsample
+
+        stage_outputs = (hidden_states, hidden_states_before_downsampling, output_dimensions)
+
+        return stage_outputs
+
+
+class FocalNetEncoder(nn.Module):
+    def __init__(self, config, grid_size):
+        super().__init__()
+        self.num_stages = len(config.depths)
+        self.config = config
+        # Stage i is built for the patch grid divided by 2**i along each spatial dimension.
+        self.stages = nn.ModuleList(
+            [
+                FocalNetStage(
+                    config=config,
+                    index=i_layer,
+                    input_resolution=(grid_size[0] // (2**i_layer), grid_size[1] // (2**i_layer)),
+                )
+                for i_layer in range(self.num_stages)
+            ]
+        )
+        # NOTE(review): this flag is set here but forward() below never reads it.
+        self.gradient_checkpointing = False
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        input_dimensions: tuple[int, int],
+        output_hidden_states: Optional[bool] = False,
+        output_hidden_states_before_downsampling: Optional[bool] = False,
+        return_dict: Optional[bool] = True,
+    ) -> Union[tuple, FocalNetEncoderOutput]:
+        all_hidden_states = () if output_hidden_states else None
+        all_reshaped_hidden_states = () if output_hidden_states else None
+        # When hidden states are requested, the incoming embeddings are recorded as the first entry.
+        if output_hidden_states:
+            batch_size, _, hidden_size = hidden_states.shape
+            # rearrange b (h w) c -> b c h w
+            reshaped_hidden_state = hidden_states.view(batch_size, *input_dimensions, hidden_size)
+            reshaped_hidden_state = reshaped_hidden_state.permute(0, 3, 1, 2)
+            all_hidden_states += (hidden_states,)
+            all_reshaped_hidden_states += (reshaped_hidden_state,)
+        # Run the stages in order; each stage reports the resolution the next one has to work with.
+        for i, stage_module in enumerate(self.stages):
+            stage_outputs = stage_module(hidden_states, input_dimensions)
+            # stage_outputs is (hidden_states, hidden_states_before_downsampling, output_dimensions).
+            hidden_states = stage_outputs[0]
+            hidden_states_before_downsampling = stage_outputs[1]
+            output_dimensions = stage_outputs[2]
+            # The last two entries are the resolution after this stage, i.e. the next stage's input size.
+            input_dimensions = (output_dimensions[-2], output_dimensions[-1])
+
+            if output_hidden_states and output_hidden_states_before_downsampling:
+                batch_size, _, hidden_size = hidden_states_before_downsampling.shape
+                # rearrange b (h w) c -> b c h w
+                # here we use the original (not downsampled) height and width
+                reshaped_hidden_state = hidden_states_before_downsampling.view(
+                    batch_size, *(output_dimensions[0], output_dimensions[1]), hidden_size
+                )
+                reshaped_hidden_state = reshaped_hidden_state.permute(0, 3, 1, 2)
+                all_hidden_states += (hidden_states_before_downsampling,)
+                all_reshaped_hidden_states += (reshaped_hidden_state,)
+            elif output_hidden_states and not output_hidden_states_before_downsampling:
+                batch_size, _, hidden_size = hidden_states.shape
+                # rearrange b (h w) c -> b c h w
+                reshaped_hidden_state = hidden_states.view(batch_size, *input_dimensions, hidden_size)
+                reshaped_hidden_state = reshaped_hidden_state.permute(0, 3, 1, 2)
+                all_hidden_states += (hidden_states,)
+                all_reshaped_hidden_states += (reshaped_hidden_state,)
+        # NOTE(review): the tuple form omits all_reshaped_hidden_states, unlike FocalNetEncoderOutput below.
+        if not return_dict:
+            return tuple(v for v in [hidden_states, all_hidden_states] if v is not None)
+
+        return FocalNetEncoderOutput(
+            last_hidden_state=hidden_states,
+            hidden_states=all_hidden_states,
+            reshaped_hidden_states=all_reshaped_hidden_states,
+        )
+
+
+@auto_docstring
+class FocalNetPreTrainedModel(PreTrainedModel):
+    config: FocalNetConfig
+    base_model_prefix = "focalnet"
+    main_input_name = "pixel_values"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["FocalNetStage"]
+
+    def _init_weights(self, module):
+        """Initialize the weights of `module` in place, dispatching on the module type."""
+        if isinstance(module, (nn.Linear, nn.Conv2d)):
+            # Slightly different from the TF version which uses truncated_normal for initialization
+            # cf https://github.com/pytorch/pytorch/pull/5617
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.LayerNorm):  # start as an identity affine transform
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+        elif isinstance(module, FocalNetEmbeddings):  # the mask token, when present, starts at zero
+            if module.mask_token is not None:
+                module.mask_token.data.zero_()
+        elif isinstance(module, FocalNetLayer):  # layer-scale gammas are only set when layerscale is enabled
+            if self.config.use_layerscale:
+                module.gamma_1.data.fill_(self.config.layerscale_value)
+                module.gamma_2.data.fill_(self.config.layerscale_value)
+
+
+@auto_docstring
+class FocalNetModel(FocalNetPreTrainedModel):
+    def __init__(self, config, add_pooling_layer=True, use_mask_token=False):
+        r"""
+        add_pooling_layer (`bool`, *optional*, defaults to `True`):
+            Whether to add a pooling layer.
+        use_mask_token (`bool`, *optional*, defaults to `False`):
+            Whether to use a mask token for masked image modeling.
+        """
+        super().__init__(config)
+        self.config = config
+        self.num_stages = len(config.depths)
+        self.num_features = int(config.embed_dim * 2 ** (self.num_stages - 1))
+        # num_features is config.embed_dim doubled once for every stage after the first.
+        self.embeddings = FocalNetEmbeddings(config, use_mask_token=use_mask_token)
+        self.encoder = FocalNetEncoder(config, self.embeddings.patch_grid)
+
+        self.layernorm = nn.LayerNorm(self.num_features, eps=config.layer_norm_eps)
+        self.pooler = nn.AdaptiveAvgPool1d(1) if add_pooling_layer else None
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.embeddings.patch_embeddings
+
+    @auto_docstring
+    def forward(
+        self,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        bool_masked_pos: Optional[torch.BoolTensor] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, FocalNetModelOutput]:
+        r"""
+        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`):
+            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
+        """
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if pixel_values is None:
+            raise ValueError("You have to specify pixel_values")
+        # The embeddings also return input_dimensions, which the encoder uses to reshape tokens into a grid.
+        embedding_output, input_dimensions = self.embeddings(pixel_values, bool_masked_pos=bool_masked_pos)
+
+        encoder_outputs = self.encoder(
+            embedding_output,
+            input_dimensions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        # Final-stage tokens, normalized once more before the optional pooling.
+        sequence_output = encoder_outputs[0]
+        sequence_output = self.layernorm(sequence_output)
+        # Pooling averages over the token axis and flattens the result to one vector per sample.
+        pooled_output = None
+        if self.pooler is not None:
+            pooled_output = self.pooler(sequence_output.transpose(1, 2))
+            pooled_output = torch.flatten(pooled_output, 1)
+
+        if not return_dict:
+            output = (sequence_output, pooled_output) + encoder_outputs[1:]
+
+            return output
+
+        return FocalNetModelOutput(
+            last_hidden_state=sequence_output,
+            pooler_output=pooled_output,
+            hidden_states=encoder_outputs.hidden_states,
+            reshaped_hidden_states=encoder_outputs.reshaped_hidden_states,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    FocalNet Model with a decoder on top for masked image modeling.
+
+    This follows the same implementation as in [SimMIM](https://huggingface.co/papers/2111.09886).
+
+    <Tip>
+
+    Note that we provide a script to pre-train this model on custom data in our [examples
+    directory](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-pretraining).
+
+    </Tip>
+    """
+)
+class FocalNetForMaskedImageModeling(FocalNetPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+
+        self.focalnet = FocalNetModel(config, add_pooling_layer=False, use_mask_token=True)
+        # Decoder: 1x1 conv to encoder_stride**2 * num_channels channels, then PixelShuffle(encoder_stride).
+        self.num_stages = len(config.depths)
+        num_features = int(config.embed_dim * 2 ** (self.num_stages - 1))
+        self.decoder = nn.Sequential(
+            nn.Conv2d(
+                in_channels=num_features, out_channels=config.encoder_stride**2 * config.num_channels, kernel_size=1
+            ),
+            nn.PixelShuffle(config.encoder_stride),
+        )
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        bool_masked_pos: Optional[torch.BoolTensor] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, FocalNetMaskedImageModelingOutput]:
+        r"""
+        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`):
+            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
+
+        Examples:
+        ```python
+        >>> from transformers import AutoImageProcessor, FocalNetConfig, FocalNetForMaskedImageModeling
+        >>> import torch
+        >>> from PIL import Image
+        >>> import requests
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> image_processor = AutoImageProcessor.from_pretrained("microsoft/focalnet-base-simmim-window6-192")
+        >>> config = FocalNetConfig()
+        >>> model = FocalNetForMaskedImageModeling(config)
+
+        >>> num_patches = (model.config.image_size // model.config.patch_size) ** 2
+        >>> pixel_values = image_processor(images=image, return_tensors="pt").pixel_values
+        >>> # create random boolean mask of shape (batch_size, num_patches)
+        >>> bool_masked_pos = torch.randint(low=0, high=2, size=(1, num_patches)).bool()
+
+        >>> outputs = model(pixel_values, bool_masked_pos=bool_masked_pos)
+        >>> loss, reconstructed_pixel_values = outputs.loss, outputs.reconstruction
+        >>> list(reconstructed_pixel_values.shape)
+        [1, 3, 192, 192]
+        ```"""
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.focalnet(
+            pixel_values,
+            bool_masked_pos=bool_masked_pos,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        sequence_output = outputs[0]
+        # Reshape to (batch_size, num_channels, height, width)
+        sequence_output = sequence_output.transpose(1, 2)
+        batch_size, num_channels, sequence_length = sequence_output.shape
+        height = width = math.floor(sequence_length**0.5)  # assumes a square token grid
+        sequence_output = sequence_output.reshape(batch_size, num_channels, height, width)
+
+        # Reconstruct pixel values
+        reconstructed_pixel_values = self.decoder(sequence_output)
+        # L1 loss averaged over masked pixels and channels; the 1e-5 keeps the division finite for an empty mask.
+        masked_im_loss = None
+        if bool_masked_pos is not None:
+            size = self.config.image_size // self.config.patch_size
+            bool_masked_pos = bool_masked_pos.reshape(-1, size, size)
+            mask = (  # repeat every patch-level flag patch_size times along both spatial axes
+                bool_masked_pos.repeat_interleave(self.config.patch_size, 1)
+                .repeat_interleave(self.config.patch_size, 2)
+                .unsqueeze(1)
+                .contiguous()
+            )
+            reconstruction_loss = nn.functional.l1_loss(pixel_values, reconstructed_pixel_values, reduction="none")
+            masked_im_loss = (reconstruction_loss * mask).sum() / (mask.sum() + 1e-5) / self.config.num_channels
+        # Tuple form: optional loss, the reconstruction, then everything after index 1 of the base output.
+        if not return_dict:
+            output = (reconstructed_pixel_values,) + outputs[2:]
+            return ((masked_im_loss,) + output) if masked_im_loss is not None else output
+
+        return FocalNetMaskedImageModelingOutput(
+            loss=masked_im_loss,
+            reconstruction=reconstructed_pixel_values,
+            hidden_states=outputs.hidden_states,
+            reshaped_hidden_states=outputs.reshaped_hidden_states,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    FocalNet Model with an image classification head on top (a linear layer on top of the pooled output) e.g. for
+    ImageNet.
+    """
+)
+class FocalNetForImageClassification(FocalNetPreTrainedModel):
+    # Copied from transformers.models.swin.modeling_swin.SwinForImageClassification.__init__ with Swin->FocalNet, swin->focalnet
+    def __init__(self, config):
+        super().__init__(config)
+
+        self.num_labels = config.num_labels
+        self.focalnet = FocalNetModel(config)
+
+        # Classifier head
+        self.classifier = (
+            nn.Linear(self.focalnet.num_features, config.num_labels) if config.num_labels > 0 else nn.Identity()
+        )
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, FocalNetImageClassifierOutput]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss). If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.focalnet(
+            pixel_values,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        # Index 1 of the base-model output is the pooled representation fed to the classifier.
+        pooled_output = outputs[1]
+
+        logits = self.classifier(pooled_output)
+        # With labels: infer problem_type once (it is stored on the config), then apply the matching loss.
+        loss = None
+        if labels is not None:
+            if self.config.problem_type is None:
+                if self.num_labels == 1:
+                    self.config.problem_type = "regression"
+                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
+                    self.config.problem_type = "single_label_classification"
+                else:
+                    self.config.problem_type = "multi_label_classification"
+
+            if self.config.problem_type == "regression":
+                loss_fct = MSELoss()
+                if self.num_labels == 1:
+                    loss = loss_fct(logits.squeeze(), labels.squeeze())
+                else:
+                    loss = loss_fct(logits, labels)
+            elif self.config.problem_type == "single_label_classification":
+                loss_fct = CrossEntropyLoss()
+                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+            elif self.config.problem_type == "multi_label_classification":
+                loss_fct = BCEWithLogitsLoss()
+                loss = loss_fct(logits, labels)
+        # Tuple form: optional loss, the logits, then everything after index 1 of the base output.
+        if not return_dict:
+            output = (logits,) + outputs[2:]
+            return ((loss,) + output) if loss is not None else output
+
+        return FocalNetImageClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            reshaped_hidden_states=outputs.reshaped_hidden_states,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    FocalNet backbone, to be used with frameworks like X-Decoder.
+    """
+)
+class FocalNetBackbone(FocalNetPreTrainedModel, BackboneMixin):
+    has_attentions = False
+
+    def __init__(self, config: FocalNetConfig):
+        super().__init__(config)
+        super()._init_backbone(config)
+        # Feature widths: the embedding width followed by the entries of config.hidden_sizes.
+        self.num_features = [config.embed_dim] + config.hidden_sizes
+        self.focalnet = FocalNetModel(config)
+
+        # initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        pixel_values: torch.Tensor,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> BackboneOutput:
+        r"""
+        Examples:
+
+        ```python
+        >>> from transformers import AutoImageProcessor, AutoBackbone
+        >>> import torch
+        >>> from PIL import Image
+        >>> import requests
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> processor = AutoImageProcessor.from_pretrained("microsoft/focalnet-tiny-lrf")
+        >>> model = AutoBackbone.from_pretrained("microsoft/focalnet-tiny-lrf")
+
+        >>> inputs = processor(image, return_tensors="pt")
+        >>> outputs = model(**inputs)
+        ```"""
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        # Hidden states are always requested internally because the feature maps are picked from them.
+        outputs = self.focalnet(pixel_values, output_hidden_states=True, return_dict=True)
+
+        hidden_states = outputs.reshaped_hidden_states
+        # Keep the reshaped hidden state at the position of every stage name listed in self.out_features.
+        feature_maps = ()
+        for idx, stage in enumerate(self.stage_names):
+            if stage in self.out_features:
+                feature_maps += (hidden_states[idx],)
+        # The caller's output_hidden_states only decides whether the hidden states are returned as well.
+        if not return_dict:
+            output = (feature_maps,)
+            if output_hidden_states:
+                output += (outputs.hidden_states,)
+            return output
+
+        return BackboneOutput(
+            feature_maps=feature_maps,
+            hidden_states=outputs.hidden_states if output_hidden_states else None,
+            attentions=None,
+        )
+
+
+__all__ = [
+    "FocalNetForImageClassification",
+    "FocalNetForMaskedImageModeling",
+    "FocalNetBackbone",
+    "FocalNetModel",
+    "FocalNetPreTrainedModel",
+]
diff --git a/phivenv/Lib/site-packages/transformers/models/fsmt/__init__.py b/phivenv/Lib/site-packages/transformers/models/fsmt/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..f8f31762d681dbf3541d38c39fafdf5fa6b864d1
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/fsmt/__init__.py
@@ -0,0 +1,28 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:  # only true for static type checkers, which then see every submodule name
+    from .configuration_fsmt import *
+    from .modeling_fsmt import *
+    from .tokenization_fsmt import *
+else:
+    import sys
+    # Replace this package's entry in sys.modules with a _LazyModule built from this file's import structure.
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/phivenv/Lib/site-packages/transformers/models/fsmt/__pycache__/__init__.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/fsmt/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c1fb5359331eddf7caf976704b249aa41ec3540f
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/fsmt/__pycache__/__init__.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/fsmt/__pycache__/configuration_fsmt.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/fsmt/__pycache__/configuration_fsmt.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..63721bae79a14819d7636c94ff7cd35e174cf078
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/fsmt/__pycache__/configuration_fsmt.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/fsmt/__pycache__/modeling_fsmt.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/fsmt/__pycache__/modeling_fsmt.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..222480456b3bcaabd85aab09278d3469ce0a1724
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/fsmt/__pycache__/modeling_fsmt.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/fsmt/__pycache__/tokenization_fsmt.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/fsmt/__pycache__/tokenization_fsmt.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..eee697aa0c6f44023d4ea72a661d8d8e7f74fc26
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/fsmt/__pycache__/tokenization_fsmt.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/fsmt/configuration_fsmt.py b/phivenv/Lib/site-packages/transformers/models/fsmt/configuration_fsmt.py
new file mode 100644
index 0000000000000000000000000000000000000000..7aec2662293f8f1d6d17fa90e73367769ce95461
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/fsmt/configuration_fsmt.py
@@ -0,0 +1,225 @@
+# coding=utf-8
+# Copyright 2019-present, Facebook, Inc and the HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""FSMT configuration"""
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class DecoderConfig(PretrainedConfig):
+    r"""
+    Configuration class for FSMT's decoder-specific settings. Note: this is a private helper class.
+    """
+
+    model_type = "fsmt_decoder"
+
+    def __init__(self, vocab_size=0, bos_token_id=0, is_encoder_decoder=True, **kwargs):
+        super().__init__(**kwargs)  # runs first, so the assignments below overwrite anything it set
+        self.vocab_size = vocab_size
+        self.bos_token_id = bos_token_id
+        self.is_encoder_decoder = is_encoder_decoder
+
+
+class FSMTConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`FSMTModel`]. It is used to instantiate a FSMT
+    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+    defaults will yield a similar configuration to that of the FSMT
+    [facebook/wmt19-en-ru](https://huggingface.co/facebook/wmt19-en-ru) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        langs (`list[str]`):
+            A list with source language and target_language (e.g., ['en', 'ru']).
+        src_vocab_size (`int`):
+            Vocabulary size of the encoder. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed to the forward method in the encoder.
+        tgt_vocab_size (`int`):
+            Vocabulary size of the decoder. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed to the forward method in the decoder.
+        d_model (`int`, *optional*, defaults to 1024):
+            Dimensionality of the layers and the pooler layer.
+        encoder_layers (`int`, *optional*, defaults to 12):
+            Number of encoder layers.
+        decoder_layers (`int`, *optional*, defaults to 12):
+            Number of decoder layers.
+        encoder_attention_heads (`int`, *optional*, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        decoder_attention_heads (`int`, *optional*, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer decoder.
+        decoder_ffn_dim (`int`, *optional*, defaults to 4096):
+            Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
+        encoder_ffn_dim (`int`, *optional*, defaults to 4096):
+            Dimensionality of the "intermediate" (often named feed-forward) layer in encoder.
+        activation_function (`str` or `Callable`, *optional*, defaults to `"relu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"silu"` and `"gelu_new"` are supported.
+        dropout (`float`, *optional*, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        activation_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for activations inside the fully connected layer.
+        max_position_embeddings (`int`, *optional*, defaults to 1024):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            just in case (e.g., 512 or 1024 or 2048).
+        init_std (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        scale_embedding (`bool`, *optional*, defaults to `True`):
+            Scale embeddings by a factor of sqrt(d_model).
+        bos_token_id (`int`, *optional*, defaults to 0):
+            Beginning of stream token id.
+        pad_token_id (`int`, *optional*, defaults to 1):
+            Padding token id.
+        eos_token_id (`int`, *optional*, defaults to 2):
+            End of stream token id.
+        decoder_start_token_id (`int`, *optional*, defaults to 2):
+            This model starts decoding with `eos_token_id`.
+        encoder_layerdrop (`float`, *optional*, defaults to 0.0):
+            Google "layerdrop arxiv", as it's not explainable in one line.
+        decoder_layerdrop (`float`, *optional*, defaults to 0.0):
+            Google "layerdrop arxiv", as it's not explainable in one line.
+        is_encoder_decoder (`bool`, *optional*, defaults to `True`):
+            Whether this is an encoder/decoder model.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether to tie input and output embeddings.
+        num_beams (`int`, *optional*, defaults to 5):
+            Number of beams for beam search that will be used by default in the `generate` method of the model. 1 means
+            no beam search.
+        length_penalty (`float`, *optional*, defaults to 1):
+            Exponential penalty to the length that is used with beam-based generation. It is applied as an exponent to
+            the sequence length, which in turn is used to divide the score of the sequence. Since the score is the log
+            likelihood of the sequence (i.e. negative), `length_penalty` > 0.0 promotes longer sequences, while
+            `length_penalty` < 0.0 encourages shorter sequences.
+        early_stopping (`bool`, *optional*, defaults to `False`):
+            Flag that will be used by default in the `generate` method of the model. Whether to stop the beam search
+            when at least `num_beams` sentences are finished per batch or not.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models).
+        forced_eos_token_id (`int`, *optional*, defaults to 2):
+            The id of the token to force as the last generated token when `max_length` is reached. Usually set to
+            `eos_token_id`.
+
+    Examples:
+
+    ```python
+    >>> from transformers import FSMTConfig, FSMTModel
+
+    >>> # Initializing a FSMT facebook/wmt19-en-ru style configuration
+    >>> config = FSMTConfig()
+
+    >>> # Initializing a model (with random weights) from the configuration
+    >>> model = FSMTModel(config)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "fsmt"
+    attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}
+    sub_configs = {"decoder": DecoderConfig}
+
+    # update the defaults from config file
+    def __init__(
+        self,
+        langs=["en", "de"],
+        src_vocab_size=42024,
+        tgt_vocab_size=42024,
+        activation_function="relu",
+        d_model=1024,
+        max_length=200,
+        max_position_embeddings=1024,
+        encoder_ffn_dim=4096,
+        encoder_layers=12,
+        encoder_attention_heads=16,
+        encoder_layerdrop=0.0,
+        decoder_ffn_dim=4096,
+        decoder_layers=12,
+        decoder_attention_heads=16,
+        decoder_layerdrop=0.0,
+        attention_dropout=0.0,
+        dropout=0.1,
+        activation_dropout=0.0,
+        init_std=0.02,
+        decoder_start_token_id=2,
+        is_encoder_decoder=True,
+        scale_embedding=True,
+        tie_word_embeddings=False,
+        num_beams=5,
+        length_penalty=1.0,
+        early_stopping=False,
+        use_cache=True,
+        pad_token_id=1,
+        bos_token_id=0,
+        eos_token_id=2,
+        forced_eos_token_id=2,
+        **common_kwargs,
+    ):
+        self.langs = list(langs) if langs is not None else None  # copy: never alias the shared default list
+        self.src_vocab_size = src_vocab_size
+        self.tgt_vocab_size = tgt_vocab_size
+        self.d_model = d_model  # encoder_embed_dim and decoder_embed_dim
+
+        self.encoder_ffn_dim = encoder_ffn_dim
+        self.encoder_layers = self.num_hidden_layers = encoder_layers
+        self.encoder_attention_heads = encoder_attention_heads
+        self.encoder_layerdrop = encoder_layerdrop
+        self.decoder_layerdrop = decoder_layerdrop
+        self.decoder_ffn_dim = decoder_ffn_dim
+        self.decoder_layers = decoder_layers
+        self.decoder_attention_heads = decoder_attention_heads
+        self.max_position_embeddings = max_position_embeddings
+        self.init_std = init_std  # Normal(0, this parameter)
+        self.activation_function = activation_function
+        # NOTE(review): num_hidden_layers below is taken from encoder_layers, not decoder_layers; confirm intent.
+        self.decoder = DecoderConfig(
+            vocab_size=tgt_vocab_size,
+            bos_token_id=eos_token_id,
+            is_encoder_decoder=is_encoder_decoder,
+            num_hidden_layers=encoder_layers,
+        )
+        if "decoder" in common_kwargs:  # a passed-in "decoder" entry is dropped, not forwarded to the base class
+            del common_kwargs["decoder"]
+
+        self.scale_embedding = scale_embedding  # scale factor will be sqrt(d_model) if True
+
+        # 3 Types of Dropout
+        self.attention_dropout = attention_dropout
+        self.activation_dropout = activation_dropout
+        self.dropout = dropout
+
+        self.use_cache = use_cache
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            decoder_start_token_id=decoder_start_token_id,
+            is_encoder_decoder=is_encoder_decoder,
+            tie_word_embeddings=tie_word_embeddings,
+            forced_eos_token_id=forced_eos_token_id,
+            max_length=max_length,
+            num_beams=num_beams,
+            length_penalty=length_penalty,
+            early_stopping=early_stopping,
+            **common_kwargs,
+        )
+
+
+__all__ = ["FSMTConfig"]
diff --git a/phivenv/Lib/site-packages/transformers/models/fsmt/modeling_fsmt.py b/phivenv/Lib/site-packages/transformers/models/fsmt/modeling_fsmt.py
new file mode 100644
index 0000000000000000000000000000000000000000..bb7e1e15be338038fba799c0b03217d51c5c86a6
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/fsmt/modeling_fsmt.py
@@ -0,0 +1,1257 @@
+# coding=utf-8
+# Copyright 2020 The Facebook AI Research Team Authors and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Original implementation: https://github.com/pytorch/fairseq/tree/master/examples/wmt19
+# Authors:
+# - @alexeib Alexei Baevski
+# - @edunov Sergey Edunov
+# - @michaelauli Michael Auli
+# - @myleott Myle Ott
+# - @nng555 Nathan Ng
+# - David Grangier
+# - Kyra Yee
+#
+# Paper: Facebook FAIR's WMT19 News Translation Task Submission https://huggingface.co/papers/1907.06616
+#
+"""PyTorch Fairseq model, ported from https://github.com/pytorch/fairseq/tree/master/examples/wmt19"""
+
+import math
+from typing import Any, Optional, Union
+
+import torch
+from torch import Tensor, nn
+from torch.nn import CrossEntropyLoss, LayerNorm
+
+from ...activations import ACT2FN
+from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache
+from ...generation import GenerationMixin
+from ...integrations.deepspeed import is_deepspeed_zero3_enabled
+from ...modeling_outputs import (
+    BaseModelOutput,
+    BaseModelOutputWithPastAndCrossAttentions,
+    Seq2SeqLMOutput,
+    Seq2SeqModelOutput,
+)
+from ...modeling_utils import PreTrainedModel
+from ...utils import auto_docstring, logging
+from .configuration_fsmt import FSMTConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+# See all FSMT models at https://huggingface.co/models?filter=fsmt
+
+# Porting notes:
+# this one is modeled after BartModel*
+#
+# Currently only translation (fairseq also has weights for LM)
+#
+# fairseq provides weights for ru-en, en-ru and de-en, en-de pairs. All have been ported.
+# - ru-en, en-ru use asymmetric vocab
+# - de-en, en-de use a merged single vocab (but the code works as if they are separate)
+#
+# Differences with Bart:
+# - not using bos token
+# - 2 separate vocabs (src and target)
+# - embed weights aren't tied
+# - uses a model Ensemble (but that part isn't ported/implemented yet) - so we
+#   aren't getting as good of a BLEU score
+# - uses a projection layer at the end of the decoder
+# - doesn't use final_logits_bias
+# - beam search: stops as soon as num_beams == len(hypos) (whereas transformers
+#   is not satisfied there and will continue searching until the next cycles
+#   aren't promising something better), comparing BLEU scores - the transformers
+#   algorithm is slightly superior, therefore using the latter. But if you want
+#   to match fairseq outputs, you need to pass ``early_stopping=True`` to ``generate()``.
+#
+# SinusoidalPositionalEmbedding is slightly different from Bart's - generates
+# different embeddings. This implementation is copied verbatim from fairseq with
+# some small changes to make it work here.
+#
+# Other changes:
+#  - doesn't support use_cache as Bart's version does
+#
+#
+# FSMTConfig changes with BartConfig
+#
+#    Differences with BART:
+#    - src/tgt vocabs aren't shared
+#    - token embeddings aren't shared
+#    - needs a language pair
+#    - scale_embedding is True
+#
+#    some unused args were removed too
+#
+#
+# TODO:
+# - port model ensemble (fs uses 4 model checkpoints)
+# - solve beam search discrepancies
+# docstyle-ignore
+
+"""
+
+Here is how to compare BLEU scores against fairseq implementation:
+(don't forget to install sacrebleu: `pip install sacrebleu`)
+
+# en-ru
+
+export PAIR=en-ru
+export DATA_DIR=data/$PAIR
+export SAVE_DIR=data/$PAIR
+export BS=8
+export NUM_BEAMS=50
+mkdir -p $DATA_DIR
+sacrebleu -t wmt19 -l $PAIR --echo src > $DATA_DIR/val.source
+sacrebleu -t wmt19 -l $PAIR --echo ref > $DATA_DIR/val.target
+echo $PAIR
+PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval.py facebook/wmt19-$PAIR $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --num_beams $NUM_BEAMS
+
+# (fairseq BLEU: 36.4 http://matrix.statmt.org/matrix/output/1914?score_id=37605)
+
+
+# ru-en
+
+export PAIR=ru-en
+export DATA_DIR=data/$PAIR
+export SAVE_DIR=data/$PAIR
+export BS=8
+export NUM_BEAMS=50
+mkdir -p $DATA_DIR
+sacrebleu -t wmt19 -l $PAIR --echo src > $DATA_DIR/val.source
+sacrebleu -t wmt19 -l $PAIR --echo ref > $DATA_DIR/val.target
+PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval.py facebook/wmt19-$PAIR $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --num_beams $NUM_BEAMS
+
+
+# (fairseq BLEU: 41.3 http://matrix.statmt.org/matrix/output/1907?run_id=6937)
+
+
+# de-en
+
+export PAIR=de-en
+export DATA_DIR=data/$PAIR
+export SAVE_DIR=data/$PAIR
+export BS=8
+export NUM_BEAMS=50
+mkdir -p $DATA_DIR
+sacrebleu -t wmt19 -l $PAIR --echo src > $DATA_DIR/val.source
+sacrebleu -t wmt19 -l $PAIR --echo ref > $DATA_DIR/val.target
+echo $PAIR
+PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval.py facebook/wmt19-$PAIR $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --num_beams $NUM_BEAMS
+
+# (fairseq BLEU: 42.3 http://matrix.statmt.org/matrix/output/1902?run_id=6750)
+
+
+
+# en-de
+
+export PAIR=en-de
+export DATA_DIR=data/$PAIR
+export SAVE_DIR=data/$PAIR
+export BS=8
+mkdir -p $DATA_DIR
+sacrebleu -t wmt19 -l $PAIR --echo src > $DATA_DIR/val.source
+sacrebleu -t wmt19 -l $PAIR --echo ref > $DATA_DIR/val.target
+echo $PAIR
+PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval.py facebook/wmt19-$PAIR $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --num_beams $NUM_BEAMS
+
+# (fairseq BLEU: 43.1 http://matrix.statmt.org/matrix/output/1909?run_id=6862)
+
+"""
+
+
+def invert_mask(attention_mask):
+    """Return a bool tensor that is True where the 2-D `attention_mask` is 0/False and False elsewhere."""
+    assert attention_mask.dim() == 2
+    return attention_mask == 0
+
+
+def triu_onnx(x, diagonal=0):
+    """Keep entries of `x` with column >= row + `diagonal` and zero the rest (mask is sized by `x.shape[0]`)."""
+    size = x.shape[0]
+    cols = torch.arange(size, device=x.device)
+    rows = cols.unsqueeze(-1)
+    if diagonal:
+        rows = rows + diagonal
+    keep = cols.expand(size, size) >= rows
+    return x.masked_fill(keep == 0, 0)
+
+
+def _prepare_fsmt_decoder_inputs(
+    config,
+    input_ids,
+    decoder_input_ids=None,
+    decoder_padding_mask=None,
+    causal_mask_dtype=torch.float32,
+):
+    """
+    Fill in whatever decoder inputs the caller left out, mimicking fairseq defaults (not called during generation):
+    `decoder_input_ids` falls back to `input_ids` shifted right, the padding mask is derived from pad tokens (a
+    user-supplied mask is inverted instead), and a `(tgt_len, tgt_len)` causal mask is always built.
+    """
+    pad_id = config.pad_token_id
+    if decoder_input_ids is None:
+        decoder_input_ids = shift_tokens_right(input_ids, pad_id)
+    _, tgt_len = decoder_input_ids.size()
+    if decoder_padding_mask is not None:
+        decoder_padding_mask = invert_mask(decoder_padding_mask)
+    else:
+        decoder_padding_mask = make_padding_mask(decoder_input_ids, pad_id)
+    # Strictly-upper triangle keeps what `fill_with_neg_inf` wrote; everything else is zeroed by `triu_onnx`.
+    blank = torch.zeros(tgt_len, tgt_len, dtype=causal_mask_dtype)
+    causal_mask = triu_onnx(fill_with_neg_inf(blank), 1).to(device=decoder_input_ids.device)
+    return decoder_input_ids, decoder_padding_mask, causal_mask
+
+
+@auto_docstring
+class PretrainedFSMTModel(PreTrainedModel):
+    config: FSMTConfig
+    base_model_prefix = "model"
+
+    def _init_weights(self, module):
+        """Initialize `module`: N(0, `config.init_std`) weights, zeroed biases/pad rows, regenerated sinusoids."""
+        init_std = self.config.init_std
+        if isinstance(module, nn.Linear):
+            module.weight.data.normal_(mean=0.0, std=init_std)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, SinusoidalPositionalEmbedding):
+            frozen = nn.Parameter(module.get_embedding(*module.weight.shape, module.padding_idx), requires_grad=False)
+            frozen.detach_()
+            module.weight = frozen
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=init_std)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+
+    @property
+    def dummy_inputs(self):
+        """Two fixed 5-token rows (the second ends with the pad token) plus their non-pad attention mask."""
+        pad_id = self.config.pad_token_id
+        token_ids = torch.tensor([[0, 6, 10, 4, 2], [0, 8, 12, 2, pad_id]], device=self.device)
+        return {
+            "attention_mask": token_ids.ne(pad_id),
+            "input_ids": token_ids,
+        }
+
+
+def _make_linear_from_emb(emb):
+    """Build a bias-free `nn.Linear` mapping emb_size -> vocab_size whose weight shares `emb.weight`'s data."""
+    lin_layer = nn.Linear(emb.weight.shape[1], emb.weight.shape[0], bias=False)  # (in, out) matches the weight
+    lin_layer.weight.data = emb.weight.data
+    return lin_layer
+
+
+# Helper Functions, mostly for making masks
+def _check_shapes(shape_1, shape2):
+    if shape_1 != shape2:  # plain inequality test; on mismatch the error message names both shapes
+        raise AssertionError(f"shape mismatch: {shape_1} != {shape2}")
+
+
+def shift_tokens_right(input_ids, pad_token_id):
+    """Shift input ids one token to the right, and wrap the last non pad token (usually <eos>).
+
+    `-100` entries count as padding. The caller's tensor is left untouched; a new tensor is returned.
+    """
+    # replace possible -100 values in labels by `pad_token_id` (out of place, so `input_ids` is not mutated)
+    input_ids = input_ids.masked_fill(input_ids == -100, pad_token_id)
+    index_of_eos = (input_ids.ne(pad_token_id).sum(dim=1) - 1).unsqueeze(-1)
+    # column 0 receives each row's last non-pad token; the remaining columns are the input shifted by one
+    first_column = input_ids.gather(1, index_of_eos)
+    return torch.cat([first_column, input_ids[:, :-1]], dim=1)
+
+
+def make_padding_mask(input_ids, padding_idx=1):
+    """Mark pad positions with True; yields None when `input_ids` holds no pad token."""
+    pad_positions = input_ids == padding_idx
+    if pad_positions.any():
+        return pad_positions
+    return None
+
+
+# Helper Modules
+
+
+class EncoderLayer(nn.Module):
+    def __init__(self, config: FSMTConfig):
+        super().__init__()
+        self.embed_dim = config.d_model
+        self.self_attn = Attention(self.embed_dim, config.encoder_attention_heads, dropout=config.attention_dropout)
+        self.self_attn_layer_norm = LayerNorm(self.embed_dim)
+        self.dropout = config.dropout
+        self.activation_fn = ACT2FN[config.activation_function]
+        self.activation_dropout = config.activation_dropout
+        self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim)
+        self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim)
+        self.final_layer_norm = LayerNorm(self.embed_dim)
+
+    def forward(self, x, encoder_padding_mask, layer_head_mask, output_attentions=False):
+        """
+        Args:
+            x (`torch.Tensor`): input to the layer of shape *(seq_len, batch, embed_dim)*
+            encoder_padding_mask (`torch.ByteTensor`): binary mask of shape *(batch, src_len)* where padding
+                elements are indicated by `1`; it is forwarded to self-attention as `key_padding_mask`.
+            layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
+                *(config.encoder_attention_heads,)*.
+            output_attentions (`bool`): forwarded unchanged to the self-attention module.
+
+        Returns:
+            `(x, attn_weights)`: the encoded output of shape *(seq_len, batch, embed_dim)* together with
+            whatever attention weights the self-attention module returned.
+        """
+        residual = x
+        x, attn_weights = self.self_attn(
+            query=x,
+            key=x,
+            key_padding_mask=encoder_padding_mask,
+            layer_head_mask=layer_head_mask,
+            output_attentions=output_attentions,
+        )
+        x = nn.functional.dropout(x, p=self.dropout, training=self.training)
+        x = residual + x
+        x = self.self_attn_layer_norm(x)
+        # Feed-forward sub-block: fc1 -> activation -> dropout -> fc2 -> dropout -> residual add -> LayerNorm
+        residual = x
+        x = self.activation_fn(self.fc1(x))
+        x = nn.functional.dropout(x, p=self.activation_dropout, training=self.training)
+        x = self.fc2(x)
+        x = nn.functional.dropout(x, p=self.dropout, training=self.training)
+        x = residual + x
+        x = self.final_layer_norm(x)
+        return x, attn_weights
+
+
+class FSMTEncoder(nn.Module):
+    """
+    Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a [`EncoderLayer`].
+
+    Args:
+        config: FSMTConfig
+    """
+
+    def __init__(self, config: FSMTConfig, embed_tokens):
+        super().__init__()
+        self.dropout = config.dropout
+        self.layerdrop = config.encoder_layerdrop
+        self.padding_idx = embed_tokens.padding_idx
+        self.embed_tokens = embed_tokens
+        embed_dim = embed_tokens.embedding_dim
+        self.embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0
+        self.embed_positions = SinusoidalPositionalEmbedding(
+            config.max_position_embeddings + self.padding_idx + 1, embed_dim, self.padding_idx
+        )
+        self.layers = nn.ModuleList([EncoderLayer(config) for _ in range(config.encoder_layers)])  # type: list[EncoderLayer]
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+    ):
+        """
+        Args:
+            input_ids (`torch.LongTensor`): tokens in the source language of shape
+                *(batch, src_len)*
+            attention_mask (`torch.LongTensor`): 2-D mask, 0 at padding positions (inverted here via `invert_mask`)
+            inputs_embeds (`torch.FloatTensor`):
+                embedding vectors of shape *(batch, src_len, embed_dim)*
+            head_mask (`torch.Tensor` of shape `(num_layers, num_heads)`, *optional*):
+                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
+
+                - 1 indicates the head is **not masked**,
+                - 0 indicates the head is **masked**.
+
+        Returns:
+            BaseModelOutput or Tuple comprised of:
+
+                - **x** (`torch.Tensor`): the last encoder layer's output of shape *(batch, src_len, embed_dim)*
+                - **encoder_states** (`Tuple(torch.FloatTensor)`): all intermediate hidden states of shape *(batch,
+                  src_len, embed_dim)*. Only populated if *output_hidden_states* is True.
+                - **all_attentions** (`Tuple(torch.FloatTensor)`): Attention weights for each layer, only populated
+                  if *output_attentions* is True. A layer skipped by LayerDrop contributes a `None` entry.
+        """
+        # check attention mask and invert
+        if attention_mask is not None:
+            attention_mask = invert_mask(attention_mask)
+
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+        elif input_ids is not None:
+            inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale
+            embed_pos = self.embed_positions(input_ids)
+        elif inputs_embeds is not None:
+            inputs_embeds = inputs_embeds * self.embed_scale
+
+            # We assume zeros hidden states correspond to padding tokens
+            # and create `position_ids` where inputs_embeds[:, :, 0] == 0
+            position_ids = inputs_embeds[:, :, 0].masked_fill(
+                inputs_embeds[:, :, 0].eq(0), self.embed_positions.padding_idx
+            )
+
+            embed_pos = self.embed_positions(position_ids)
+        else:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+        x = inputs_embeds + embed_pos
+        x = nn.functional.dropout(x, p=self.dropout, training=self.training)
+
+        # B x T x C -> T x B x C
+        x = x.transpose(0, 1)
+
+        encoder_states = () if output_hidden_states else None
+        all_attentions = () if output_attentions else None
+        # check if head_mask has a correct number of layers specified if desired
+        if head_mask is not None:
+            assert head_mask.size()[0] == (len(self.layers)), (
+                f"The head_mask should be specified for {len(self.layers)} layers, but it is for {head_mask.size()[0]}."
+            )
+        for idx, encoder_layer in enumerate(self.layers):
+            if output_hidden_states:
+                x = x.transpose(0, 1)  # T x B x C -> B x T x C
+                encoder_states += (x,)
+                x = x.transpose(0, 1)  # B x T x C -> T x B x C
+            # add LayerDrop (see https://huggingface.co/papers/1909.11556 for description)
+            dropout_probability = torch.rand([])  # drawn on every iteration, in eval mode too
+            if self.training and (dropout_probability < self.layerdrop):  # skip the layer
+                attn = None
+            else:
+                x, attn = encoder_layer(
+                    x,
+                    attention_mask,
+                    layer_head_mask=(head_mask[idx] if head_mask is not None else None),
+                    output_attentions=output_attentions,
+                )
+
+            if output_attentions:
+                all_attentions = all_attentions + (attn,)
+
+        # T x B x C -> B x T x C
+        x = x.transpose(0, 1)
+
+        if output_hidden_states:
+            encoder_states += (x,)
+
+        if not return_dict:
+            return tuple(v for v in [x, encoder_states, all_attentions] if v is not None)
+        return BaseModelOutput(last_hidden_state=x, hidden_states=encoder_states, attentions=all_attentions)
+
+
+class DecoderLayer(nn.Module):
+    def __init__(self, config: FSMTConfig, layer_idx=None):
+        super().__init__()
+        self.embed_dim = config.d_model
+        # Both attention modules receive the same `layer_idx`; only cross-attention sets `encoder_decoder_attention`.
+        self.self_attn = Attention(
+            embed_dim=self.embed_dim,
+            num_heads=config.decoder_attention_heads,
+            dropout=config.attention_dropout,
+            layer_idx=layer_idx,
+        )
+        self.dropout = config.dropout
+        self.activation_fn = ACT2FN[config.activation_function]
+        self.activation_dropout = config.activation_dropout
+
+        self.self_attn_layer_norm = LayerNorm(self.embed_dim)
+        self.encoder_attn = Attention(
+            self.embed_dim,
+            config.decoder_attention_heads,
+            dropout=config.attention_dropout,
+            encoder_decoder_attention=True,
+            layer_idx=layer_idx,
+        )
+        self.encoder_attn_layer_norm = LayerNorm(self.embed_dim)
+        self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim)
+        self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim)
+        self.final_layer_norm = LayerNorm(self.embed_dim)
+
+    def forward(
+        self,
+        x,
+        encoder_hidden_states,
+        encoder_attn_mask=None,
+        layer_state=None,
+        causal_mask=None,
+        layer_head_mask=None,
+        cross_attn_layer_head_mask=None,
+        decoder_padding_mask=None,
+        output_attentions=False,
+        cache_position=None,
+    ):
+        residual = x
+        # Each of the three sub-blocks below ends with dropout -> residual add -> LayerNorm.
+        # Self Attention
+        x, self_attn_weights = self.self_attn(
+            query=x,
+            key=x,
+            layer_state=layer_state,  # adds keys to layer state
+            key_padding_mask=decoder_padding_mask,
+            attn_mask=causal_mask,
+            layer_head_mask=layer_head_mask,
+            output_attentions=output_attentions,
+            cache_position=cache_position,
+        )
+        x = nn.functional.dropout(x, p=self.dropout, training=self.training)
+        x = residual + x
+        x = self.self_attn_layer_norm(x)
+
+        # Cross attention
+        residual = x
+        assert self.encoder_attn.cache_key != self.self_attn.cache_key  # "encoder_decoder" vs "self"
+        x, cross_attn_weights = self.encoder_attn(
+            query=x,
+            key=encoder_hidden_states,
+            key_padding_mask=encoder_attn_mask,
+            layer_state=layer_state,  # mutates layer state
+            layer_head_mask=cross_attn_layer_head_mask,
+            output_attentions=output_attentions,
+            cache_position=cache_position,
+        )
+        x = nn.functional.dropout(x, p=self.dropout, training=self.training)
+        x = residual + x
+        x = self.encoder_attn_layer_norm(x)
+
+        # Fully Connected
+        residual = x
+        x = self.activation_fn(self.fc1(x))
+        x = nn.functional.dropout(x, p=self.activation_dropout, training=self.training)
+        x = self.fc2(x)
+        x = nn.functional.dropout(x, p=self.dropout, training=self.training)
+        x = residual + x
+        x = self.final_layer_norm(x)
+        return (
+            x,
+            self_attn_weights,
+            cross_attn_weights,
+        )
+
+
+class FSMTDecoder(nn.Module):
+    """
+    Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`DecoderLayer`]
+
+    Args:
+        config: FSMTConfig
+        embed_tokens (nn.Embedding): output embedding
+    """
+
+    def __init__(self, config: FSMTConfig, embed_tokens: nn.Embedding):
+        super().__init__()
+        self.config = config  # forward() reads it when it has to create the default generation cache
+        self.dropout = config.dropout
+        self.layerdrop = config.decoder_layerdrop
+        self.padding_idx = embed_tokens.padding_idx
+        self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0
+        self.embed_tokens = embed_tokens
+        embed_dim = embed_tokens.embedding_dim
+        self.embed_positions = SinusoidalPositionalEmbedding(
+            config.max_position_embeddings + self.padding_idx + 1, embed_dim, self.padding_idx
+        )
+        self.layers = nn.ModuleList([DecoderLayer(config, layer_idx=i) for i in range(config.decoder_layers)])  # type: list[DecoderLayer]
+
+        if is_deepspeed_zero3_enabled():
+            import deepspeed
+            with deepspeed.zero.GatheredParameters(self.embed_tokens.weight, modifier_rank=None):
+                embed_tokens_weight_shape = self.embed_tokens.weight.shape
+        else:
+            embed_tokens_weight_shape = self.embed_tokens.weight.shape
+        self.output_projection = nn.Linear(embed_tokens_weight_shape[1], embed_tokens_weight_shape[0], bias=False)
+        self.output_projection.weight = self.embed_tokens.weight  # projection shares the embedding matrix
+
+    def _tie_weights(self):
+        self.embed_tokens.weight = self.output_projection.weight
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        encoder_hidden_states: torch.Tensor,
+        encoder_padding_mask: torch.Tensor,
+        decoder_padding_mask: torch.Tensor,
+        decoder_causal_mask: torch.Tensor,
+        head_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        cross_attn_head_mask: Optional[torch.Tensor] = None,
+        past_key_values: Optional[list[torch.FloatTensor]] = None,
+        use_cache: Optional[bool] = False,
+        output_attentions: Optional[bool] = False,
+        output_hidden_states: Optional[bool] = False,
+        return_dict: Optional[bool] = True,
+        cache_position: Optional[torch.Tensor] = None,
+    ):
+        """
+        Includes several features from "Jointly Learning to Align and Translate with Transformer Models" (Garg et al.,
+        EMNLP 2019).
+
+        Args:
+            input_ids (`torch.LongTensor` of shape `(batch, tgt_len)`):
+                previous decoder outputs for teacher forcing
+            encoder_hidden_states: output from the encoder, used for
+                encoder-side attention
+            encoder_padding_mask: for ignoring pad tokens
+            past_key_values (`EncoderDecoderCache`, legacy tuple or None): cache; created here if `use_cache` and None
+            head_mask (`torch.Tensor` of shape `(num_layers, num_heads)`, *optional*):
+                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
+
+                - 1 indicates the head is **not masked**,
+                - 0 indicates the head is **masked**.
+
+            cross_attn_head_mask (`torch.Tensor` of shape `(num_layers, num_heads)`, *optional*):
+                Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:
+
+                - 1 indicates the head is **not masked**,
+                - 0 indicates the head is **masked**.
+
+        Returns:
+            BaseModelOutputWithPastAndCrossAttentions or tuple:
+
+                - the `output_projection` logits of shape *(batch, tgt_len, vocab_size)*
+                - the cache
+                - hidden states
+                - self-attentions and cross-attentions
+        """
+        # check attention mask and invert
+        if encoder_padding_mask is not None:
+            encoder_padding_mask = invert_mask(encoder_padding_mask)
+
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
+        elif input_ids is not None:
+            # embed positions
+            positions = self.embed_positions(input_ids)
+            if use_cache:
+                input_ids = input_ids[:, -1:]
+                positions = positions[:, -1:]  # happens after we embed them
+            x = self.embed_tokens(input_ids) * self.embed_scale
+        elif inputs_embeds is not None:
+            # We assume zeros hidden states correspond to padding tokens
+            # and create `position_ids` where inputs_embeds[:, :, 0] == 0
+            position_ids = inputs_embeds[:, :, 0].masked_fill(
+                inputs_embeds[:, :, 0].eq(0), self.embed_positions.padding_idx
+            )
+            positions = self.embed_positions(position_ids)
+            x = inputs_embeds * self.embed_scale
+        else:
+            raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
+
+        # initialize `past_key_values`
+        if use_cache and past_key_values is None:
+            past_key_values = EncoderDecoderCache(DynamicCache(config=self.config), DynamicCache(config=self.config))
+        if use_cache and isinstance(past_key_values, tuple):
+            logger.warning_once(
+                "Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. "
+                "You should pass an instance of `EncoderDecoderCache` instead, e.g. "
+                "`past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`."
+            )
+            past_key_values = EncoderDecoderCache.from_legacy_cache(past_key_values)
+
+        x += positions
+        x = nn.functional.dropout(x, p=self.dropout, training=self.training)
+
+        # Convert to FSMT output format: (BS, seq_len, model_dim) -> (seq_len, BS, model_dim)
+        x = x.transpose(0, 1)
+        encoder_hidden_states = encoder_hidden_states.transpose(0, 1)
+
+        # decoder layers
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+        all_cross_attns = () if output_attentions else None
+
+        # check if head_mask has a correct number of layers specified if desired
+        for attn_mask, mask_name in zip([head_mask, cross_attn_head_mask], ["head_mask", "cross_attn_head_mask"]):
+            if attn_mask is not None:
+                assert attn_mask.size()[0] == (len(self.layers)), (
+                    f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for"
+                    f" {attn_mask.size()[0]}."
+                )
+        for idx, decoder_layer in enumerate(self.layers):
+            # add LayerDrop (see https://huggingface.co/papers/1909.11556 for description)
+            if output_hidden_states:
+                x = x.transpose(0, 1)
+                all_hidden_states += (x,)
+                x = x.transpose(0, 1)
+            if self.training:
+                dropout_probability = torch.rand([])
+                if dropout_probability < self.layerdrop:
+                    continue
+
+            x, layer_self_attn, layer_cross_attn = decoder_layer(
+                x,
+                encoder_hidden_states,
+                encoder_attn_mask=encoder_padding_mask,
+                decoder_padding_mask=decoder_padding_mask,
+                layer_state=past_key_values,
+                causal_mask=decoder_causal_mask,
+                layer_head_mask=(head_mask[idx] if head_mask is not None else None),
+                cross_attn_layer_head_mask=(cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None),
+                output_attentions=output_attentions,
+                cache_position=cache_position,
+            )
+
+            if output_attentions:
+                all_self_attns += (layer_self_attn,)
+                all_cross_attns += (layer_cross_attn,)
+
+        # add hidden states from the last decoder layer
+        if output_hidden_states:
+            x = x.transpose(0, 1)
+            all_hidden_states += (x,)
+            x = x.transpose(0, 1)
+
+        # Convert to standard output format: (seq_len, BS, model_dim) -> (BS, seq_len, model_dim)
+        x = x.transpose(0, 1)
+        encoder_hidden_states = encoder_hidden_states.transpose(0, 1)
+
+        x = self.output_projection(x)
+
+        if not return_dict:
+            return tuple(
+                v for v in [x, past_key_values, all_hidden_states, all_self_attns, all_cross_attns] if v is not None
+            )
+        return BaseModelOutputWithPastAndCrossAttentions(
+            last_hidden_state=x,
+            past_key_values=past_key_values,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+            cross_attentions=all_cross_attns,
+        )
+
+
+def _reorder_buffer(attn_cache, new_order):
+    for name in list(attn_cache):  # rewrites the dict in place; None entries are left untouched
+        if (buffer := attn_cache[name]) is not None:
+            attn_cache[name] = buffer.index_select(0, new_order)
+    return attn_cache
+
+
+class Attention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper, used as self- or encoder-decoder attention."""
+
+    def __init__(
+        self,
+        embed_dim,
+        num_heads,
+        dropout=0.0,
+        bias=True,
+        encoder_decoder_attention=False,  # otherwise self_attention
+        layer_idx=None,
+    ):
+        super().__init__()
+        self.embed_dim = embed_dim
+        self.num_heads = num_heads
+        self.dropout = dropout
+        self.head_dim = embed_dim // num_heads  # width of a single head
+        assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"
+        self.scaling = self.head_dim**-0.5  # 1/sqrt(head_dim); applied to the projected queries in forward
+        self.layer_idx = layer_idx  # index used to address this layer inside the cache objects in forward
+
+        self.encoder_decoder_attention = encoder_decoder_attention
+        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.cache_key = "encoder_decoder" if self.encoder_decoder_attention else "self"  # not read in this class
+
+    def forward(
+        self,
+        query,
+        key: Optional[Tensor],
+        key_padding_mask: Optional[Tensor] = None,
+        layer_state: Optional[Cache] = None,
+        attn_mask: Optional[Tensor] = None,
+        layer_head_mask: Optional[Tensor] = None,
+        output_attentions: Optional[bool] = False,
+        cache_position: Optional[torch.Tensor] = None,
+    ) -> tuple[Tensor, Optional[Tensor]]:
+        """Input shape: Time(SeqLen) x Batch x Channel. Returns `(attn_output, attn_weights)`; weights are `None` unless `output_attentions`."""
+        tgt_len, bsz, embed_dim = query.size()
+        assert embed_dim == self.embed_dim
+        assert list(query.size()) == [tgt_len, bsz, embed_dim]  # always holds given the unpacking two lines up
+
+        if layer_state is not None:
+            if isinstance(layer_state, EncoderDecoderCache):
+                is_updated = layer_state.is_updated.get(self.layer_idx)
+                if self.encoder_decoder_attention:
+                    # after the first generated id, we can subsequently re-use all key/value_states from cache
+                    curr_past_key_value = layer_state.cross_attention_cache
+                else:
+                    curr_past_key_value = layer_state.self_attention_cache
+            else:
+                curr_past_key_value = layer_state  # NOTE(review): is_updated stays unbound here; cross-attention would fail below
+
+        # NOTE: FSMT has format (seq_len, BS, model_dim) for inputs
+        current_states = key if self.encoder_decoder_attention else query
+        if self.encoder_decoder_attention and layer_state is not None and is_updated:
+            # reuse k,v, cross_attentions
+            key_states = curr_past_key_value.layers[self.layer_idx].keys
+            value_states = curr_past_key_value.layers[self.layer_idx].values
+        else:
+            key_states = self.k_proj(current_states)
+            value_states = self.v_proj(current_states)
+            key_states = key_states.view(-1, bsz, self.num_heads, self.head_dim).permute(1, 2, 0, 3)  # -> (bsz, heads, len, head_dim)
+            value_states = value_states.view(-1, bsz, self.num_heads, self.head_dim).permute(1, 2, 0, 3)
+
+            if layer_state is not None:
+                # save all key/value_states to cache to be re-used for fast auto-regressive generation
+                cache_position = cache_position if not self.encoder_decoder_attention else None  # dropped for cross-attention
+                key_states, value_states = curr_past_key_value.update(
+                    key_states, value_states, self.layer_idx, {"cache_position": cache_position}
+                )
+                # set flag that curr layer for cross-attn is already updated so we can re-use in subsequent calls
+                if self.encoder_decoder_attention:
+                    layer_state.is_updated[self.layer_idx] = True
+
+        query_states = self.q_proj(query) * self.scaling
+
+        # Reshape back to 3D tensors for `bmm`
+        query_states = query_states.view(-1, bsz * self.num_heads, self.head_dim).transpose(0, 1)  # -> (bsz * heads, tgt_len, head_dim)
+        key_states = key_states.reshape(bsz * self.num_heads, -1, self.head_dim)
+        value_states = value_states.reshape(bsz * self.num_heads, -1, self.head_dim)
+
+        assert key_states is not None
+        src_len = key_states.size(1)
+        attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))
+        assert attn_weights.size() == (bsz * self.num_heads, tgt_len, src_len)
+
+        if attn_mask is not None:
+            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attn_mask  # mask is added to the scores
+            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
+
+        # This is part of a workaround to get around fork/join parallelism not supporting Optional types.
+        if key_padding_mask is not None and key_padding_mask.dim() == 0:
+            key_padding_mask = None
+        assert key_padding_mask is None or key_padding_mask.size()[:2] == (
+            bsz,
+            src_len,
+        )
+
+        if key_padding_mask is not None:  # don't attend to padding symbols
+            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
+            reshaped = key_padding_mask.unsqueeze(1).unsqueeze(2)
+            attn_weights = attn_weights.masked_fill(reshaped, torch.finfo(attn_weights.dtype).min)  # dtype minimum, not -inf
+            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
+
+        attn_weights = nn.functional.softmax(attn_weights, dim=-1)
+
+        if layer_head_mask is not None:
+            assert layer_head_mask.size() == (self.num_heads,), (
+                f"Head mask for a single layer should be of size {(self.num_heads,)}, but is {layer_head_mask.size()}"
+            )
+            attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len)  # per-head scaling, after softmax
+            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
+
+        if output_attentions:
+            # make sure that attn_weights are included in graph
+            attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
+            attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
+        else:
+            attn_weights_reshaped = None
+
+        attn_probs = nn.functional.dropout(
+            attn_weights,
+            p=self.dropout,
+            training=self.training,
+        )
+
+        assert value_states is not None
+        attn_output = torch.bmm(attn_probs, value_states)
+        assert attn_output.size() == (bsz * self.num_heads, tgt_len, self.head_dim)
+        attn_output = attn_output.transpose(0, 1).contiguous().view(tgt_len, bsz, embed_dim)  # heads merged back: (tgt_len, bsz, embed_dim)
+        attn_output = self.out_proj(attn_output)
+
+        return attn_output, attn_weights_reshaped
+
+
+def fill_with_neg_inf(t):
+    """FP16-compatible fill: a tensor shaped like `t` holding the lowest finite value of `t.dtype` (not a true -inf)."""
+    return t.float().fill_(torch.finfo(t.dtype).min).to(dtype=t.dtype)
+
+
+# Public API
+def _get_shape(t):
+    return t.shape if hasattr(t, "shape") else None  # None for objects that expose no `shape`
+
+
+@auto_docstring
+class FSMTModel(PretrainedFSMTModel):
+    _tied_weights_keys = ["decoder.embed_tokens.weight", "decoder.output_projection.weight"]
+
+    def __init__(self, config: FSMTConfig):
+        super().__init__(config)
+
+        padding_idx = config.pad_token_id
+        encoder_embed_tokens = nn.Embedding(config.src_vocab_size, config.d_model, padding_idx)  # source vocabulary
+        decoder_embed_tokens = nn.Embedding(config.tgt_vocab_size, config.d_model, padding_idx)  # target vocabulary
+
+        self.encoder = FSMTEncoder(config, encoder_embed_tokens)
+        self.decoder = FSMTDecoder(config, decoder_embed_tokens)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_encoder(self):
+        return self.encoder
+
+    def _tie_weights(self):
+        if self.config.tie_word_embeddings:
+            self._tie_or_clone_weights(self.decoder.embed_tokens, self.get_input_embeddings())  # input embeddings = encoder.embed_tokens
+            self._tie_or_clone_weights(self.decoder.output_projection, self.get_input_embeddings())
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: torch.LongTensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        decoder_input_ids: Optional[torch.LongTensor] = None,
+        decoder_attention_mask: Optional[torch.BoolTensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        decoder_head_mask: Optional[torch.Tensor] = None,
+        cross_attn_head_mask: Optional[torch.Tensor] = None,
+        encoder_outputs: Optional[tuple[torch.FloatTensor]] = None,
+        past_key_values: Optional[tuple[torch.FloatTensor]] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.Tensor] = None,
+    ) -> Union[tuple[torch.Tensor], Seq2SeqModelOutput]:
+        r"""
+        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
+            Indices of decoder input sequence tokens in the vocabulary.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are decoder input IDs?](../glossary#decoder-input-ids)
+
+            FSMT uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
+            is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).
+        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
+            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
+            be used by default.
+        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in `[0,
+            1]`:
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+        """
+        if decoder_input_ids is None:
+            use_cache = False  # with caching off, the `not use_cache` branch below can prepare the decoder inputs
+
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # make masks if user doesn't supply
+        if not use_cache and input_ids is not None:
+            decoder_input_ids, decoder_padding_mask, causal_mask = _prepare_fsmt_decoder_inputs(
+                self.config,
+                input_ids,
+                decoder_input_ids=decoder_input_ids,
+                decoder_padding_mask=decoder_attention_mask,
+                causal_mask_dtype=self.decoder.embed_tokens.weight.dtype,
+            )
+        else:
+            decoder_padding_mask, causal_mask = None, None  # no masks are built on this path
+
+        if decoder_input_ids is None and decoder_inputs_embeds is None:
+            raise ValueError("Make sure that `decoder_input_ids` or `decoder_inputs_embeds` are passed.")
+
+        if encoder_outputs is None:
+            encoder_outputs = self.encoder(
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+                inputs_embeds=inputs_embeds,
+                head_mask=head_mask,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+            )
+        # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True
+        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
+            encoder_outputs = BaseModelOutput(
+                last_hidden_state=encoder_outputs[0],
+                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
+                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
+            )
+
+        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
+        decoder_outputs = self.decoder(
+            decoder_input_ids,
+            encoder_outputs[0],
+            attention_mask,
+            decoder_padding_mask,
+            decoder_causal_mask=causal_mask,
+            inputs_embeds=decoder_inputs_embeds,
+            head_mask=decoder_head_mask,
+            cross_attn_head_mask=cross_attn_head_mask,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            cache_position=cache_position,
+        )
+
+        if not return_dict:
+            return decoder_outputs + encoder_outputs  # decoder outputs first, encoder outputs appended
+
+        return Seq2SeqModelOutput(
+            last_hidden_state=decoder_outputs.last_hidden_state,
+            past_key_values=decoder_outputs.past_key_values,
+            decoder_hidden_states=decoder_outputs.hidden_states,
+            decoder_attentions=decoder_outputs.attentions,
+            cross_attentions=decoder_outputs.cross_attentions,
+            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
+            encoder_hidden_states=encoder_outputs.hidden_states,
+            encoder_attentions=encoder_outputs.attentions,
+        )
+
+    def get_input_embeddings(self):
+        return self.encoder.embed_tokens
+
+    def set_input_embeddings(self, value):
+        self.encoder.embed_tokens = value
+
+    def get_output_embeddings(self):
+        return self.decoder.embed_tokens  # NOTE(review): the decoder input embedding, not decoder.output_projection - confirm
+
+    def set_output_embeddings(self, value):
+        self.decoder.embed_tokens = value
+
+
+@auto_docstring(
+    custom_intro="""
+    The FSMT Model with a language modeling head. Can be used for summarization.
+    """
+)
+class FSMTForConditionalGeneration(PretrainedFSMTModel, GenerationMixin):
+    base_model_prefix = "model"
+    _tied_weights_keys = ["decoder.embed_tokens.weight", "decoder.output_projection.weight"]  # NOTE(review): no "model." prefix - confirm
+
+    def __init__(self, config: FSMTConfig):
+        super().__init__(config)
+        base_model = FSMTModel(config)
+        self.model = base_model
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        decoder_input_ids: Optional[torch.LongTensor] = None,
+        decoder_attention_mask: Optional[torch.BoolTensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        decoder_head_mask: Optional[torch.Tensor] = None,
+        cross_attn_head_mask: Optional[torch.Tensor] = None,
+        encoder_outputs: Optional[tuple[torch.FloatTensor]] = None,
+        past_key_values: Optional[tuple[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        decoder_inputs_embeds: Optional[torch.Tensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.Tensor] = None,
+    ) -> Union[tuple[torch.Tensor], Seq2SeqLMOutput]:
+        r"""
+        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
+            Indices of decoder input sequence tokens in the vocabulary.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are decoder input IDs?](../glossary#decoder-input-ids)
+
+            FSMT uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
+            is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).
+        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
+            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
+            be used by default.
+        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in `[0,
+            1]`:
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+        Example Translation:
+
+        ```python
+        >>> from transformers import AutoTokenizer, FSMTForConditionalGeneration
+
+        >>> mname = "facebook/wmt19-ru-en"
+        >>> model = FSMTForConditionalGeneration.from_pretrained(mname)
+        >>> tokenizer = AutoTokenizer.from_pretrained(mname)
+
+        >>> src_text = "Машинное обучение - это здорово, не так ли?"
+        >>> input_ids = tokenizer(src_text, return_tensors="pt").input_ids
+        >>> outputs = model.generate(input_ids, num_beams=5, num_return_sequences=3)
+        >>> tokenizer.decode(outputs[0], skip_special_tokens=True)
+        "Machine learning is great, isn't it?"
+        ```
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if labels is not None:
+            use_cache = False  # caching is switched off whenever a loss is requested
+
+        outputs = self.model(
+            input_ids,
+            inputs_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            decoder_input_ids=decoder_input_ids,
+            decoder_inputs_embeds=decoder_inputs_embeds,
+            encoder_outputs=encoder_outputs,
+            decoder_attention_mask=decoder_attention_mask,
+            head_mask=head_mask,
+            decoder_head_mask=decoder_head_mask,
+            cross_attn_head_mask=cross_attn_head_mask,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            cache_position=cache_position,
+        )
+        lm_logits = outputs[0]  # the base model's first output is used as the logits; no extra head is applied here
+
+        masked_lm_loss = None
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()  # default ignore_index (-100); pad labels are not masked otherwise, see TODO
+            # TODO(SS): do we need to ignore pad tokens in labels?
+            masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.tgt_vocab_size), labels.view(-1))
+
+        if not return_dict:
+            output = (lm_logits,) + outputs[1:]
+            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output  # loss goes first when present
+
+        return Seq2SeqLMOutput(
+            loss=masked_lm_loss,
+            logits=lm_logits,
+            past_key_values=outputs.past_key_values,
+            decoder_hidden_states=outputs.decoder_hidden_states,
+            decoder_attentions=outputs.decoder_attentions,
+            cross_attentions=outputs.cross_attentions,
+            encoder_last_hidden_state=outputs.encoder_last_hidden_state,
+            encoder_hidden_states=outputs.encoder_hidden_states,
+            encoder_attentions=outputs.encoder_attentions,
+        )
+
+    def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
+        return shift_tokens_right(labels, self.config.pad_token_id)
+
+    def get_encoder(self):
+        return self.model.encoder
+
+    def get_decoder(self):
+        return self.model.decoder
+
+    def get_output_embeddings(self):
+        return self.model.decoder.embed_tokens  # NOTE(review): decoder input embedding, not output_projection - confirm
+
+    def set_output_embeddings(self, value):
+        self.model.decoder.embed_tokens = value
+
+
+class SinusoidalPositionalEmbedding(nn.Embedding):
+    """
+    Embedding table of fixed sinusoidal position encodings, usable for sequences of any length.
+
+    The weight is deterministic rather than trained and can be huge, so it is not something we want to save.
+
+    Padding symbols are ignored.
+
+    `forward` regenerates a longer table automatically whenever more positions are needed.
+    """
+
+    def __init__(self, num_positions, embedding_dim, padding_idx):
+        super().__init__(num_positions, embedding_dim, padding_idx)
+
+    def make_weight(self, num_positions, embedding_dim, padding_idx):
+        # the fresh table adopts the dtype and device of the parameter it replaces
+        table = self.get_embedding(num_positions, embedding_dim, padding_idx)
+        self.weight = nn.Parameter(table.to(dtype=self.weight.dtype, device=self.weight.device))
+        # nothing here is learned, so the parameter is detached and frozen
+        self.weight.detach_()
+        self.weight.requires_grad = False
+
+    @staticmethod
+    def get_embedding(num_embeddings, embedding_dim, padding_idx):
+        """
+        Compute the `(num_embeddings, embedding_dim)` table of sinusoidal embeddings.
+
+        Same layout as tensor2tensor (all sines, then all cosines), which differs slightly from the description
+        in Section 3.5 of "Attention Is All You Need". The `padding_idx` row, when given, is zeroed.
+        """
+        half_dim = embedding_dim // 2
+        log_step = math.log(10000) / (half_dim - 1)
+        inv_freq = torch.exp(torch.arange(half_dim, dtype=torch.int64).float() * -log_step)
+        angles = torch.arange(num_embeddings, dtype=torch.int64).float().unsqueeze(1) * inv_freq.unsqueeze(0)
+        table = torch.cat([torch.sin(angles), torch.cos(angles)], dim=1).view(num_embeddings, -1)
+        if embedding_dim % 2 == 1:
+            # odd widths get one trailing column of zeros
+            table = torch.cat([table, torch.zeros(num_embeddings, 1)], dim=1)
+        if padding_idx is not None:
+            table[padding_idx, :] = 0
+        return table
+
+    @staticmethod
+    def make_positions(tensor, padding_idx: int):
+        """
+        Map every non-padding symbol to its position number; padding symbols map to `padding_idx`.
+
+        Numbering starts at padding_idx+1 and does not count padding symbols.
+        """
+        # The casts below are deliberate: they keep this working under both ONNX export
+        # and XLA. XLA prefers ints, cumsum yields longs by default, and ONNX does not
+        # know how to handle the dtype kwarg of cumsum.
+        not_pad = tensor.ne(padding_idx).int()
+        running = torch.cumsum(not_pad, dim=1).type_as(not_pad)
+        return (running * not_pad).long() + padding_idx
+
+    def forward(
+        self,
+        input,
+        incremental_state: Optional[Any] = None,
+        timestep: Optional[Tensor] = None,
+    ):
+        """Look up position embeddings for `input` of size [bsz x seqlen]; the other two arguments are unused."""
+        _, seq_len = input.shape[:2]
+        needed = self.padding_idx + 1 + seq_len
+        if needed > self.weight.size(0):
+            # the table is too short for this input: regenerate a longer one
+            self.make_weight(needed, self.embedding_dim, self.padding_idx)
+        positions = self.make_positions(input, self.padding_idx)
+        return super().forward(positions)
+
+
+__all__ = ["FSMTForConditionalGeneration", "FSMTModel", "PretrainedFSMTModel"]  # public names of this module
diff --git a/phivenv/Lib/site-packages/transformers/models/fsmt/tokenization_fsmt.py b/phivenv/Lib/site-packages/transformers/models/fsmt/tokenization_fsmt.py
new file mode 100644
index 0000000000000000000000000000000000000000..5a4446d8e90b4c0466d1c7c09c4dc8b153dc8a33
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/fsmt/tokenization_fsmt.py
@@ -0,0 +1,488 @@
+# coding=utf-8
+# Copyright 2019 The Open AI Team Authors and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization classes for FSMT."""
+
+import json
+import os
+import re
+import unicodedata
+from typing import Optional
+
+from ...tokenization_utils import PreTrainedTokenizer
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+VOCAB_FILES_NAMES = {  # file name for each file-path argument of FSMTTokenizer.__init__
+    "src_vocab_file": "vocab-src.json",  # source vocabulary, read with json.load
+    "tgt_vocab_file": "vocab-tgt.json",  # target vocabulary, read with json.load
+    "merges_file": "merges.txt",  # BPE merges, one whitespace-separated pair per line
+}
+
+
+def get_pairs(word):
+    """
+    Return set of symbol pairs in a word. word is represented as tuple of symbols (symbols being variable-length
+    strings)
+
+    A pair is two adjacent symbols `(left, right)`. Words with fewer than two symbols have no pairs, so an empty
+    set is returned; that includes the empty word, which previously raised `IndexError`.
+    """
+    # zip() stops at the shorter iterable, so short words need no special-casing
+    adjacent = zip(word, word[1:])
+    return set(adjacent)
+
+
+def replace_unicode_punct(text):
+    """
+    Port of https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/replace-unicode-punctuation.perl
+
+    Rewrites full-width / CJK punctuation and full-width digits as ASCII and returns the new string. No rule
+    produces a character that another rule consumes, so the order of the rules does not affect the result.
+    """
+    # the ideographic and the full-width full stop also absorb any whitespace that follows them
+    text = re.sub(r"。\s*", ". ", text)
+    text = re.sub(r"．\s*", ". ", text)
+    ascii_for = (
+        # commas
+        ("，", ","), ("、", ","),
+        # double quotes and double angle quotes
+        ("”", '"'), ("“", '"'), ("《", '"'), ("》", '"'),
+        # corner brackets, also rendered as double quotes
+        ("」", '"'), ("「", '"'),
+        # colons
+        ("∶", ":"), ("：", ":"),
+        # sentence punctuation
+        ("？", "?"), ("！", "!"), ("；", ";"),
+        # parentheses
+        ("）", ")"), ("（", "("),
+        # full-width digits 0-4
+        ("０", "0"), ("１", "1"), ("２", "2"), ("３", "3"), ("４", "4"),
+        # full-width digits 5-9
+        ("５", "5"), ("６", "6"), ("７", "7"), ("８", "8"), ("９", "9"),
+        # tilde and apostrophe
+        ("～", "~"), ("’", "'"),
+        # ellipsis and heavy dash
+        ("…", "..."), ("━", "-"),
+        # angle brackets
+        ("〈", "<"), ("〉", ">"),
+        # lenticular brackets
+        ("【", "["), ("】", "]"),
+        # percent sign
+        ("％", "%"),
+    )
+    for fancy, plain in ascii_for:
+        text = text.replace(fancy, plain)
+    return text
+
+
+def remove_non_printing_char(text):
+    """
+    Port of https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/remove-non-printing-char.perl
+
+    Returns `text` without the characters whose Unicode general category starts with "C" (the "Other" group,
+    e.g. control and format characters). Every remaining character is kept, in its original order.
+    """
+    # unicodedata.category() yields a two-letter code; only its major class matters here
+    kept = (char for char in text if not unicodedata.category(char).startswith("C"))
+    return "".join(kept)
+
+
+
+# Porting notes:
+# this one is modeled after XLMTokenizer
+#
+# added:
+# - src_vocab_file,
+# - tgt_vocab_file,
+# - langs,
+
+
+class FSMTTokenizer(PreTrainedTokenizer):
+    """
+    Construct a FAIRSEQ Transformer tokenizer. Based on Byte-Pair Encoding. The tokenization process is the following:
+
+    - Moses preprocessing and tokenization.
+    - Normalizing all inputs text.
+    - The arguments `special_tokens` and the function `set_special_tokens`, can be used to add additional symbols (like
+      "__classify__") to a vocabulary.
+    - The argument `langs` defines a pair of languages.
+
+    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
+    this superclass for more information regarding those methods.
+
+    Args:
+        langs (`List[str]`, *optional*):
+            A list of two languages to translate from and to, for instance `["en", "ru"]`.
+        src_vocab_file (`str`, *optional*):
+            File containing the vocabulary for the source language.
+        tgt_vocab_file (`str`, *optional*):
+            File containing the vocabulary for the target language.
+        merges_file (`str`, *optional*):
+            File containing the merges.
+        do_lower_case (`bool`, *optional*, defaults to `False`):
+            Whether or not to lowercase the input when tokenizing.
+        unk_token (`str`, *optional*, defaults to `"<unk>"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        bos_token (`str`, *optional*, defaults to `"<s>"`):
+            The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token.
+
+            <Tip>
+
+            When building a sequence using special tokens, this is not the token that is used for the beginning of
+            sequence. The token used is the `cls_token`.
+
+            </Tip>
+
+        sep_token (`str`, *optional*, defaults to `"</s>"`):
+            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
+            sequence classification or for a text and a question for question answering. It is also used as the last
+            token of a sequence built with special tokens.
+        pad_token (`str`, *optional*, defaults to `"<pad>"`):
+            The token used for padding, for example when batching sequences of different lengths.
+
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    model_input_names = ["input_ids", "attention_mask"]
+
+    def __init__(
+        self,
+        langs=None,
+        src_vocab_file=None,
+        tgt_vocab_file=None,
+        merges_file=None,
+        do_lower_case=False,
+        unk_token="<unk>",
+        bos_token="<s>",
+        sep_token="</s>",
+        pad_token="<pad>",
+        **kwargs,
+    ):
+        try:
+            import sacremoses
+        except ImportError as err:
+            raise ImportError(
+                "You need to install sacremoses to use FSMTTokenizer. "
+                "See https://pypi.org/project/sacremoses/ for installation."
+            ) from err
+        # keep the module on the instance; the moses_* helpers build their objects from it on first use
+        self.sm = sacremoses
+
+        self.src_vocab_file = src_vocab_file
+        self.tgt_vocab_file = tgt_vocab_file
+        self.merges_file = merges_file
+        self.do_lower_case = do_lower_case
+
+        # per-language cache of sm.MosesPunctNormalizer instances
+        self.cache_moses_punct_normalizer = {}
+        # per-language caches of sm.MosesTokenizer / sm.MosesDetokenizer instances
+        self.cache_moses_tokenizer = {}
+        self.cache_moses_detokenizer = {}
+
+        if langs and len(langs) == 2:
+            self.src_lang, self.tgt_lang = langs
+        else:
+            raise ValueError(
+                f"arg `langs` needs to be a list of 2 langs, e.g. ['en', 'ru'], but got {langs}. "
+                "Usually that means that the tokenizer can't find a `langs` entry for the given "
+                "model path."
+            )
+        # encoder: source token -> id; decoder: target id -> token (the target vocabulary inverted)
+        with open(src_vocab_file, encoding="utf-8") as src_vocab_handle:
+            self.encoder = json.load(src_vocab_handle)
+        with open(tgt_vocab_file, encoding="utf-8") as tgt_vocab_handle:
+            tgt_vocab = json.load(tgt_vocab_handle)
+            self.decoder = {v: k for k, v in tgt_vocab.items()}
+        with open(merges_file, encoding="utf-8") as merges_handle:
+            merges = merges_handle.read().split("\n")[:-1]
+        merges = [tuple(merge.split()[:2]) for merge in merges]
+        self.bpe_ranks = dict(zip(merges, range(len(merges))))
+        self.cache = {}
+        super().__init__(
+            langs=langs,
+            src_vocab_file=src_vocab_file,
+            tgt_vocab_file=tgt_vocab_file,
+            merges_file=merges_file,
+            do_lower_case=do_lower_case,
+            unk_token=unk_token,
+            bos_token=bos_token,
+            sep_token=sep_token,
+            pad_token=pad_token,
+            **kwargs,
+        )
+
+    # hack override
+    def get_vocab(self) -> dict[str, int]:
+        return self.get_src_vocab()  # only the source-side vocabulary (plus added tokens) is exposed here
+
+    # hack override
+    @property
+    def vocab_size(self) -> int:
+        return self.src_vocab_size  # size of the source vocabulary; see tgt_vocab_size for the target side
+
+    def moses_punct_norm(self, text, lang):
+        normalizers = self.cache_moses_punct_normalizer  # one MosesPunctNormalizer per language, built on first use
+        if lang not in normalizers:
+            normalizers[lang] = self.sm.MosesPunctNormalizer(lang=lang)
+        return normalizers[lang].normalize(text)
+
+    def moses_tokenize(self, text, lang):
+        tokenizers = self.cache_moses_tokenizer  # one MosesTokenizer per language, built on first use
+        if lang not in tokenizers:
+            tokenizers[lang] = self.sm.MosesTokenizer(lang=lang)
+        return tokenizers[lang].tokenize(
+            text, aggressive_dash_splits=True, return_str=False, escape=True
+        )
+
+    def moses_detokenize(self, tokens, lang):
+        detokenizers = self.cache_moses_detokenizer  # one MosesDetokenizer per language, built on first use
+        if lang not in detokenizers:
+            detokenizers[lang] = self.sm.MosesDetokenizer(lang=lang)
+        return detokenizers[lang].detokenize(tokens)
+
+    def moses_pipeline(self, text, lang):
+        """Clean `text`: replace unicode punctuation, normalize punctuation for `lang`, drop non-printing chars."""
+        replaced = replace_unicode_punct(text)
+        normalized = self.moses_punct_norm(replaced, lang)
+        return remove_non_printing_char(normalized)
+
+    @property
+    def src_vocab_size(self):
+        return len(self.encoder)  # number of entries in the source (token -> id) vocabulary
+
+    @property
+    def tgt_vocab_size(self):
+        return len(self.decoder)  # number of entries in the target (id -> token) vocabulary
+
+    def get_src_vocab(self):
+        return dict(self.encoder, **self.added_tokens_encoder)  # added tokens override same-named base entries
+
+    def get_tgt_vocab(self):
+        return {**self.decoder, **self.added_tokens_decoder}  # keys are ids: `dict(a, **b)` rejects non-str keys
+
+    def bpe(self, token):
+        """Apply the ranked BPE merges to `token`; returns its space-joined sub-word symbols (memoized)."""
+        if token in self.cache:  # cache hit: skip rebuilding the symbol tuple
+            return self.cache[token]
+        word = tuple(token[:-1]) + (token[-1] + "</w>",)  # one symbol per char; the last carries "</w>"
+        pairs = get_pairs(word)
+        if not pairs:
+            return token + "</w>"
+
+        while True:
+            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))  # best-ranked pair
+            if bigram not in self.bpe_ranks:  # no known merge applies to the remaining pairs
+                break
+            first, second = bigram
+            new_word = []
+            i = 0
+            while i < len(word):
+                try:
+                    j = word.index(first, i)
+                except ValueError:
+                    new_word.extend(word[i:])  # `first` does not occur again: copy the tail as is
+                    break
+                else:
+                    new_word.extend(word[i:j])
+                    i = j
+
+                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
+                    new_word.append(first + second)  # merge the adjacent (first, second) symbols
+                    i += 2
+                else:
+                    new_word.append(word[i])
+                    i += 1
+            new_word = tuple(new_word)
+            word = new_word
+            if len(word) == 1:  # fully merged into a single symbol
+                break
+            else:
+                pairs = get_pairs(word)
+        word = " ".join(word)
+        if word == "\n  </w>":
+            word = "\n</w>"
+        self.cache[token] = word
+        return word
+
+    def _tokenize(self, text, lang="en", bypass_tokenizer=False):
+        """
+        Tokenize a string given language code using Moses.
+
+        Details of tokenization:
+
+            - [sacremoses](https://github.com/alvations/sacremoses): port of Moses
+            - Install with `pip install sacremoses`
+
+        Args:
+            - lang: ISO language code (default = 'en') (string). Languages should belong to the model's supported
+              languages. However, we don't enforce it.
+            - bypass_tokenizer: Allow users to preprocess and tokenize the sentences externally (default = False)
+              (bool). If True, we only apply BPE.
+
+        Returns:
+            List of tokens.
+        """
+        # `lang` is ignored: tokenization_utils.py currently doesn't pass it explicitly, so it would always be
+        # "en". The tokenizer's configured source language is used instead, and no check is made that the two
+        # agree.
+        lang = self.src_lang
+
+        if self.do_lower_case:
+            text = text.lower()
+
+        if not bypass_tokenizer:
+            cleaned = self.moses_pipeline(text, lang=lang)
+            words = self.moses_tokenize(cleaned, lang=lang)
+        else:
+            words = text.split()
+
+        # BPE-encode every non-empty word and flatten the resulting sub-word pieces
+        pieces = []
+        for word in words:
+            if word:
+                pieces.extend(self.bpe(word).split(" "))
+        return pieces
+
+    def _convert_token_to_id(self, token):
+        """Converts a token (str) to an id using the source vocab; unknown tokens get the `unk_token` id."""
+        return self.encoder.get(token, self.encoder.get(self.unk_token))
+
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) to a token (str) using the target vocab; unknown ids give `unk_token`."""
+        return self.decoder.get(index, self.unk_token)
+
+    def convert_tokens_to_string(self, tokens):
+        """Converts a sequence of BPE tokens (strings) into a single detokenized string."""
+
+        # undo BPE: drop spaces inside tokens and turn every "</w>" end-of-word marker into a space
+        unmarked = (piece.replace(" ", "").replace("</w>", " ") for piece in tokens)
+        words = "".join(unmarked).split()
+
+        # rebuild the sentence with the Moses detokenizer for the target language
+        return self.moses_detokenize(words, self.tgt_lang)
+
+    def build_inputs_with_special_tokens(
+        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
+    ) -> list[int]:
+        """
+        Build model inputs from a sequence or a pair of sequences by appending the separator token after each
+        segment. No BOS token is prepended, so a FAIRSEQ Transformer sequence has the following format:
+
+        - single sequence: `X </s>`
+        - pair of sequences: `A </s> B </s>`
+
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs to which the special tokens will be added.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
+        """
+        eos = [self.sep_token_id]
+        # no bos used in fairseq
+        with_special = token_ids_0 + eos
+        if token_ids_1 is not None:
+            with_special = with_special + token_ids_1 + eos
+        return with_special
+
+    def get_special_tokens_mask(
+        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False
+    ) -> list[int]:
+        """
+        Build the special-tokens mask for one sequence or a pair of sequences. This method is called when adding
+        special tokens using the tokenizer `prepare_for_model` method.
+
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
+                Whether or not the token list is already formatted with special tokens for the model.
+
+        Returns:
+            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+        """
+        if already_has_special_tokens:
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+            )
+        # no bos used in fairseq: each segment is followed by exactly one special token
+        mask = [0] * len(token_ids_0) + [1]
+        if token_ids_1 is not None:
+            mask += [0] * len(token_ids_1) + [1]
+        return mask
+
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
+        """Write the source vocab, target vocab and BPE merges into `save_directory`; return the three paths."""
+        if not os.path.isdir(save_directory):
+            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+            return  # NOTE: `None`, not a tuple, is returned when the directory does not exist
+
+        # an optional "<prefix>-" is prepended to each of the standard file names
+        prefix = filename_prefix + "-" if filename_prefix else ""
+        src_vocab_file = os.path.join(save_directory, prefix + VOCAB_FILES_NAMES["src_vocab_file"])
+        tgt_vocab_file = os.path.join(save_directory, prefix + VOCAB_FILES_NAMES["tgt_vocab_file"])
+        merges_file = os.path.join(save_directory, prefix + VOCAB_FILES_NAMES["merges_file"])
+
+        # source vocab: token -> id, exactly as held in `self.encoder`
+        with open(src_vocab_file, "w", encoding="utf-8") as f:
+            f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")
+
+        # target vocab: `self.decoder` is id -> token, so it is inverted back to token -> id
+        with open(tgt_vocab_file, "w", encoding="utf-8") as f:
+            tgt_vocab = {v: k for k, v in self.decoder.items()}
+            f.write(json.dumps(tgt_vocab, indent=2, sort_keys=True, ensure_ascii=False) + "\n")
+
+        # merges: one space-joined pair per line, in increasing rank order
+        expected_rank = 0
+        with open(merges_file, "w", encoding="utf-8") as writer:
+            for bpe_tokens, rank in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
+                if rank != expected_rank:
+                    logger.warning(
+                        f"Saving vocabulary to {merges_file}: BPE merge indices are not consecutive."
+                        " Please check that the tokenizer is not corrupted!"
+                    )
+                writer.write(" ".join(bpe_tokens) + "\n")
+                # the next rank is expected to directly follow the one just written
+                expected_rank = rank + 1
+
+        return src_vocab_file, tgt_vocab_file, merges_file
+
+    def __getstate__(self):
+        """Return a shallow copy of the instance dict with the `sacremoses` module handle (`sm`) cleared."""
+        state = dict(self.__dict__, sm=None)
+        return state
+
+    def __setstate__(self, d):
+        """Restore the instance dict from `d` and re-import `sacremoses` into `self.sm`."""
+        self.__dict__ = d
+        try:
+            import sacremoses
+        except ImportError as err:
+            raise ImportError(
+                "You need to install sacremoses to use FSMTTokenizer. "
+                "See https://pypi.org/project/sacremoses/ for installation."
+            ) from err
+
+        self.sm = sacremoses
+
+
+__all__ = ["FSMTTokenizer"]
diff --git a/phivenv/Lib/site-packages/transformers/models/funnel/__init__.py b/phivenv/Lib/site-packages/transformers/models/funnel/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e4e0587ce32f5e59562102b302a113f387c60130
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/funnel/__init__.py
@@ -0,0 +1,31 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:  # this branch only exposes the star-imported names to static type checkers
+    from .configuration_funnel import *
+    from .convert_funnel_original_tf_checkpoint_to_pytorch import *
+    from .modeling_funnel import *
+    from .modeling_tf_funnel import *
+    from .tokenization_funnel import *
+    from .tokenization_funnel_fast import *
+else:  # at runtime this package's entry in `sys.modules` is replaced by a `_LazyModule`
+    import sys
+
+    _file = globals()["__file__"]  # path of this file, passed to `define_import_structure`
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/phivenv/Lib/site-packages/transformers/models/funnel/__pycache__/__init__.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/funnel/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8302baa046f03e4d6eb4e342ef7f11a3d5d26aec
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/funnel/__pycache__/__init__.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/funnel/__pycache__/configuration_funnel.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/funnel/__pycache__/configuration_funnel.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a9655f731aa686b4ae854041cb985527fd35b6d1
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/funnel/__pycache__/configuration_funnel.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/funnel/__pycache__/modeling_funnel.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/funnel/__pycache__/modeling_funnel.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9e5c1d30fa048582eabc8bb5691b8bb040e6d0d0
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/funnel/__pycache__/modeling_funnel.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/funnel/__pycache__/modeling_tf_funnel.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/funnel/__pycache__/modeling_tf_funnel.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f79ebcfa5ce3cd5e4118dcbb4be6a3ae90aafa89
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/funnel/__pycache__/modeling_tf_funnel.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/funnel/__pycache__/tokenization_funnel.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/funnel/__pycache__/tokenization_funnel.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2daf570dd6715ab90bd1d89f8f3a3407fe14e055
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/funnel/__pycache__/tokenization_funnel.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/funnel/__pycache__/tokenization_funnel_fast.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/funnel/__pycache__/tokenization_funnel_fast.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4be6a05e161021a55180ce95d3f1c414be926df6
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/funnel/__pycache__/tokenization_funnel_fast.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/funnel/configuration_funnel.py b/phivenv/Lib/site-packages/transformers/models/funnel/configuration_funnel.py
new file mode 100644
index 0000000000000000000000000000000000000000..212a976f2781935811c191dfcd7e0076e59025e8
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/funnel/configuration_funnel.py
@@ -0,0 +1,166 @@
+# coding=utf-8
+# Copyright 2020, Hugging Face
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Funnel Transformer model configuration"""
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class FunnelConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`FunnelModel`] or a [`TFFunnelModel`]. It is used to
+    instantiate a Funnel Transformer model according to the specified arguments, defining the model architecture.
+    Instantiating a configuration with the defaults will yield a similar configuration to that of the Funnel
+    Transformer [funnel-transformer/small](https://huggingface.co/funnel-transformer/small) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 30522):
+            Vocabulary size of the Funnel transformer. Defines the number of different tokens that can be represented
+            by the `input_ids` passed when calling [`FunnelModel`] or [`TFFunnelModel`].
+        block_sizes (`list[int]`, *optional*, defaults to `[4, 4, 4]`):
+            The sizes of the blocks used in the model.
+        block_repeats (`list[int]`, *optional*):
+            If passed along, each layer of each block is repeated the number of times indicated.
+        num_decoder_layers (`int`, *optional*, defaults to 2):
+            The number of layers in the decoder (when not using the base model).
+        d_model (`int`, *optional*, defaults to 768):
+            Dimensionality of the model's hidden states.
+        n_head (`int`, *optional*, defaults to 12):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        d_head (`int`, *optional*, defaults to 64):
+            Dimensionality of the model's heads.
+        d_inner (`int`, *optional*, defaults to 3072):
+            Inner dimension in the feed-forward blocks.
+        hidden_act (`str` or `callable`, *optional*, defaults to `"gelu_new"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"silu"` and `"gelu_new"` are supported.
+        hidden_dropout (`float`, *optional*, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_dropout (`float`, *optional*, defaults to 0.1):
+            The dropout probability for the attention probabilities.
+        activation_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout probability used between the two layers of the feed-forward blocks.
+        initializer_range (`float`, *optional*, defaults to 0.1):
+            The upper bound of the *uniform initializer* for initializing all weight matrices in attention layers.
+        initializer_std (`float`, *optional*):
+            The standard deviation of the *normal initializer* for initializing the embedding matrix and the weight of
+            linear layers. Will default to 1 for the embedding matrix and the value given by Xavier initialization for
+            linear layers.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-09):
+            The epsilon used by the layer normalization layers.
+        pooling_type (`str`, *optional*, defaults to `"mean"`):
+            Possible values are `"mean"` or `"max"`. The way pooling is performed at the beginning of each block.
+        attention_type (`str`, *optional*, defaults to `"relative_shift"`):
+            Possible values are `"relative_shift"` or `"factorized"`. The former is faster on CPU/GPU while the latter
+            is faster on TPU.
+        separate_cls (`bool`, *optional*, defaults to `True`):
+            Whether or not to separate the cls token when applying pooling.
+        truncate_seq (`bool`, *optional*, defaults to `True`):
+            When using `separate_cls`, whether or not to truncate the last token when pooling, to avoid getting a
+            sequence length that is not a multiple of 2.
+        pool_q_only (`bool`, *optional*, defaults to `True`):
+            Whether or not to apply the pooling only to the query or to query, key and values for the attention layers.
+    """
+
+    model_type = "funnel"
+    attribute_map = {
+        "hidden_size": "d_model",
+        "num_attention_heads": "n_head",
+    }
+
+    def __init__(
+        self,
+        vocab_size=30522,
+        block_sizes=[4, 4, 4],
+        block_repeats=None,
+        num_decoder_layers=2,
+        d_model=768,
+        n_head=12,
+        d_head=64,
+        d_inner=3072,
+        hidden_act="gelu_new",
+        hidden_dropout=0.1,
+        attention_dropout=0.1,
+        activation_dropout=0.0,
+        initializer_range=0.1,
+        initializer_std=None,
+        layer_norm_eps=1e-9,
+        pooling_type="mean",
+        attention_type="relative_shift",
+        separate_cls=True,
+        truncate_seq=True,
+        pool_q_only=True,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.block_sizes = list(block_sizes)  # copy: never alias the shared default list (or the caller's list)
+        self.block_repeats = [1] * len(block_sizes) if block_repeats is None else block_repeats
+        assert len(block_sizes) == len(self.block_repeats), (
+            "`block_sizes` and `block_repeats` should have the same length."
+        )
+        self.num_decoder_layers = num_decoder_layers
+        self.d_model = d_model
+        self.n_head = n_head
+        self.d_head = d_head
+        self.d_inner = d_inner
+        self.hidden_act = hidden_act
+        self.hidden_dropout = hidden_dropout
+        self.attention_dropout = attention_dropout
+        self.activation_dropout = activation_dropout
+        self.initializer_range = initializer_range
+        self.initializer_std = initializer_std
+        self.layer_norm_eps = layer_norm_eps
+        assert pooling_type in [
+            "mean",
+            "max",
+        ], f"Got {pooling_type} for `pooling_type` but only 'mean' and 'max' are supported."
+        self.pooling_type = pooling_type
+        assert attention_type in [
+            "relative_shift",
+            "factorized",
+        ], f"Got {attention_type} for `attention_type` but only 'relative_shift' and 'factorized' are supported."
+        self.attention_type = attention_type
+        self.separate_cls = separate_cls
+        self.truncate_seq = truncate_seq
+        self.pool_q_only = pool_q_only
+
+        super().__init__(**kwargs)
+
+    @property
+    def num_hidden_layers(self):
+        return sum(self.block_sizes)  # derived: total number of layers over all blocks
+
+    @num_hidden_layers.setter
+    def num_hidden_layers(self, value):
+        raise NotImplementedError(
+            "This model does not support the setting of `num_hidden_layers`. Please set `block_sizes`."
+        )
+
+    @property
+    def num_blocks(self):
+        return len(self.block_sizes)  # derived: one entry of `block_sizes` per block
+
+    @num_blocks.setter
+    def num_blocks(self, value):
+        raise NotImplementedError("This model does not support the setting of `num_blocks`. Please set `block_sizes`.")
+
+
+__all__ = ["FunnelConfig"]
diff --git a/phivenv/Lib/site-packages/transformers/models/funnel/modeling_funnel.py b/phivenv/Lib/site-packages/transformers/models/funnel/modeling_funnel.py
new file mode 100644
index 0000000000000000000000000000000000000000..4370344cccfb19c710ed05b01e72f6880b54afdd
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/funnel/modeling_funnel.py
@@ -0,0 +1,1452 @@
+# coding=utf-8
+# Copyright 2020-present Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch Funnel Transformer model."""
+
+import os
+from dataclasses import dataclass
+from typing import Optional, Union
+
+import numpy as np
+import torch
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+
+from ...activations import ACT2FN
+from ...modeling_outputs import (
+    BaseModelOutput,
+    MaskedLMOutput,
+    MultipleChoiceModelOutput,
+    QuestionAnsweringModelOutput,
+    SequenceClassifierOutput,
+    TokenClassifierOutput,
+)
+from ...modeling_utils import PreTrainedModel
+from ...utils import ModelOutput, auto_docstring, logging
+from .configuration_funnel import FunnelConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+INF = 1e6
+
+
+def load_tf_weights_in_funnel(model, config, tf_checkpoint_path):
+    """Load tf checkpoints in a pytorch model."""
+    try:
+        import re
+
+        import numpy as np
+        import tensorflow as tf
+    except ImportError:
+        logger.error(
+            "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
+            "https://www.tensorflow.org/install/ for installation instructions."
+        )
+        raise
+    tf_path = os.path.abspath(tf_checkpoint_path)
+    logger.info(f"Converting TensorFlow checkpoint from {tf_path}")
+    # Load weights from TF model
+    init_vars = tf.train.list_variables(tf_path)
+    names = []
+    arrays = []
+    for name, shape in init_vars:
+        logger.info(f"Loading TF weight {name} with shape {shape}")
+        array = tf.train.load_variable(tf_path, name)
+        names.append(name)
+        arrays.append(array)
+
+    _layer_map = {  # TF variable-scope component -> attribute name on the PyTorch modules
+        "k": "k_head",
+        "q": "q_head",
+        "v": "v_head",
+        "o": "post_proj",
+        "layer_1": "linear_1",
+        "layer_2": "linear_2",
+        "rel_attn": "attention",
+        "ff": "ffn",
+        "kernel": "weight",
+        "gamma": "weight",
+        "beta": "bias",
+        "lookup_table": "weight",
+        "word_embedding": "word_embeddings",
+        "input": "embeddings",
+    }
+
+    for name, array in zip(names, arrays):
+        name = name.split("/")
+        # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v
+        # which are not required for using pretrained model
+        if any(
+            n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"]
+            for n in name
+        ):
+            logger.info(f"Skipping {'/'.join(name)}")
+            continue
+        if name[0] == "generator":
+            continue
+        pointer = model
+        skipped = False
+        for m_name in name[1:]:  # walk the scope path (first component dropped) down the module tree
+            if not isinstance(pointer, FunnelPositionwiseFFN) and re.fullmatch(r"layer_\d+", m_name):
+                layer_index = int(re.search(r"layer_(\d+)", m_name).groups()[0])
+                if layer_index < config.num_hidden_layers:
+                    # flat encoder layer index -> (block index, index inside that block)
+                    block_idx = 0
+                    while layer_index >= config.block_sizes[block_idx]:
+                        layer_index -= config.block_sizes[block_idx]
+                        block_idx += 1
+                    pointer = pointer.blocks[block_idx][layer_index]
+                else:
+                    layer_index -= config.num_hidden_layers
+                    pointer = pointer.layers[layer_index]
+            elif m_name == "r" and isinstance(pointer, FunnelRelMultiheadAttention):
+                pointer = pointer.r_kernel
+                break
+            elif m_name in _layer_map:
+                pointer = getattr(pointer, _layer_map[m_name])
+            else:
+                try:
+                    pointer = getattr(pointer, m_name)
+                except AttributeError:
+                    logger.info(f"Skipping {'/'.join(name)} {array.shape}")
+                    skipped = True
+                    break
+        if not skipped:
+            if len(pointer.shape) != len(array.shape):
+                array = array.reshape(pointer.shape)
+            if m_name == "kernel":
+                array = np.transpose(array)
+            pointer.data = torch.from_numpy(array)
+
+    return model
+
+
+class FunnelEmbeddings(nn.Module):  # word embeddings -> LayerNorm -> dropout
+    def __init__(self, config: FunnelConfig) -> None:
+        super().__init__()
+        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
+        self.layer_norm = nn.LayerNorm(config.d_model, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout)
+
+    def forward(
+        self, input_ids: Optional[torch.Tensor] = None, inputs_embeds: Optional[torch.Tensor] = None
+    ) -> torch.Tensor:
+        """Look up `input_ids` (unless `inputs_embeds` is given), then apply layer norm and dropout."""
+        if inputs_embeds is None:
+            inputs_embeds = self.word_embeddings(input_ids)
+        normed = self.layer_norm(inputs_embeds)
+        return self.dropout(normed)
+
+
+class FunnelAttentionStructure(nn.Module):
+    """
+    Contains helpers for `FunnelRelMultiheadAttention `.
+    """
+
+    cls_token_type_id: int = 2
+
+    def __init__(self, config: FunnelConfig) -> None:
+        super().__init__()
+        self.config = config
+        self.sin_dropout = nn.Dropout(config.hidden_dropout)  # applied to the sine position encodings
+        self.cos_dropout = nn.Dropout(config.hidden_dropout)  # applied to the cosine position encodings
+        # Track where we are at in terms of pooling from the original input, e.g., by how much the sequence length was
+        # divided.
+        self.pooling_mult = None  # set to 1 by `init_attention_inputs`
+
+    def init_attention_inputs(
+        self,
+        inputs_embeds: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+    ) -> tuple[torch.Tensor]:
+        """Build the attention inputs tuple: (position embeds, token-type matrix, attention mask, cls mask)."""
+        # inputs_embeds has shape batch_size x seq_len x d_model
+        # attention_mask and token_type_ids have shape batch_size x seq_len
+        self.pooling_mult = 1
+        self.seq_len = seq_len = inputs_embeds.size(1)
+        position_embeds = self.get_position_embeds(seq_len, inputs_embeds.dtype, inputs_embeds.device)
+        token_type_mat = None
+        if token_type_ids is not None:
+            token_type_mat = self.token_type_ids_to_mat(token_type_ids)
+        cls_mask = None
+        if self.config.separate_cls:
+            cls_mask = nn.functional.pad(inputs_embeds.new_ones([seq_len - 1, seq_len - 1]), (1, 0, 1, 0))
+        return (position_embeds, token_type_mat, attention_mask, cls_mask)
+
+    def token_type_ids_to_mat(self, token_type_ids: torch.Tensor) -> torch.Tensor:
+        """Return a boolean matrix that is true where two positions share a token type or either one is <cls>."""
+        same_type = token_type_ids.unsqueeze(2) == token_type_ids.unsqueeze(1)
+        # <cls> positions are treated as being in the same segment as every other position
+        is_cls = token_type_ids == self.cls_token_type_id
+        involves_cls = is_cls.unsqueeze(2) | is_cls.unsqueeze(1)
+        return involves_cls | same_type
+
+    def get_position_embeds(
+        self, seq_len: int, dtype: torch.dtype, device: torch.device
+    ) -> Union[tuple[torch.Tensor], list[list[torch.Tensor]]]:
+        """
+        Create and cache inputs related to relative position encoding. Those are very different depending on whether we
+        are using the factorized or the relative shift attention:
+
+        For the factorized attention, it returns the matrices (phi, pi, psi, omega) used in the paper, appendix A.2.2,
+        final formula.
+
+        For the relative shift attention, it returns all possible vectors R used in the paper, appendix A.2.1, final
+        formula.
+
+        Paper link: https://huggingface.co/papers/2006.03236
+        """
+        d_model = self.config.d_model
+        if self.config.attention_type == "factorized":
+            # Notations from the paper, appendix A.2.2, final formula.
+            # We need to create and return the matrices phi, psi, pi and omega.
+            pos_seq = torch.arange(0, seq_len, 1.0, dtype=torch.int64, device=device).to(dtype)
+            freq_seq = torch.arange(0, d_model // 2, 1.0, dtype=torch.int64, device=device).to(dtype)
+            inv_freq = 1 / (10000 ** (freq_seq / (d_model // 2)))
+            sinusoid = pos_seq[:, None] * inv_freq[None]
+            sin_embed = torch.sin(sinusoid)
+            sin_embed_d = self.sin_dropout(sin_embed)
+            cos_embed = torch.cos(sinusoid)
+            cos_embed_d = self.cos_dropout(cos_embed)
+            # This is different from the formula on the paper...
+            phi = torch.cat([sin_embed_d, sin_embed_d], dim=-1)
+            psi = torch.cat([cos_embed, sin_embed], dim=-1)
+            pi = torch.cat([cos_embed_d, cos_embed_d], dim=-1)
+            omega = torch.cat([-sin_embed, cos_embed], dim=-1)
+            return (phi, pi, psi, omega)
+        else:
+            # Notations from the paper, appendix A.2.1, final formula.
+            # We need to create and return all the possible vectors R for all blocks and shifts.
+            freq_seq = torch.arange(0, d_model // 2, 1.0, dtype=torch.int64, device=device).to(dtype)
+            inv_freq = 1 / (10000 ** (freq_seq / (d_model // 2)))
+            # Maximum relative positions for the first input
+            rel_pos_id = torch.arange(-seq_len * 2, seq_len * 2, 1.0, dtype=torch.int64, device=device).to(dtype)
+            zero_offset = seq_len * 2  # row of `pos_embed` that corresponds to relative position 0
+            sinusoid = rel_pos_id[:, None] * inv_freq[None]
+            sin_embed = self.sin_dropout(torch.sin(sinusoid))
+            cos_embed = self.cos_dropout(torch.cos(sinusoid))
+            pos_embed = torch.cat([sin_embed, cos_embed], dim=-1)
+
+            pos = torch.arange(0, seq_len, dtype=torch.int64, device=device).to(dtype)
+            pooled_pos = pos
+            position_embeds_list = []
+            for block_index in range(0, self.config.num_blocks):
+                # For each block with block_index > 0, we need two types position embeddings:
+                #   - Attention(pooled-q, unpooled-kv)
+                #   - Attention(pooled-q, pooled-kv)
+                # For block_index = 0 we only need the second one and leave the first one as None.
+
+                # First type
+                if block_index == 0:
+                    position_embeds_pooling = None
+                else:
+                    pooled_pos = self.stride_pool_pos(pos, block_index)
+
+                    # construct rel_pos_id
+                    stride = 2 ** (block_index - 1)
+                    rel_pos = self.relative_pos(pos, stride, pooled_pos, shift=2)
+                    rel_pos = rel_pos[:, None] + zero_offset
+                    rel_pos = rel_pos.expand(rel_pos.size(0), d_model)
+                    position_embeds_pooling = torch.gather(pos_embed, 0, rel_pos)
+
+                # Second type
+                pos = pooled_pos
+                stride = 2**block_index
+                rel_pos = self.relative_pos(pos, stride)
+
+                rel_pos = rel_pos[:, None] + zero_offset
+                rel_pos = rel_pos.expand(rel_pos.size(0), d_model)
+                position_embeds_no_pooling = torch.gather(pos_embed, 0, rel_pos)
+
+                position_embeds_list.append([position_embeds_no_pooling, position_embeds_pooling])
+            return position_embeds_list
+
+    def stride_pool_pos(self, pos_id: torch.Tensor, block_index: int):
+        """
+        Keep every second entry of `pos_id`, handling the cls slot on its own when `config.separate_cls=True`.
+        """
+        if not self.config.separate_cls:
+            return pos_id[::2]
+        # With a separate <cls>, the <cls> is treated as the first token of the
+        # block preceding the 1st real block. The 1st real block always has
+        # position 1, so that preceding block sits at `1 - 2 ** block_index`.
+        cls_position = pos_id.new_tensor([1 - 2**block_index])
+        # Skip the cls slot, and also the last position when `config.truncate_seq` is set.
+        last = -1 if self.config.truncate_seq else None
+        remainder = pos_id[1:last]
+        return torch.cat([cls_position, remainder[::2]], 0)
+
+    def relative_pos(self, pos: torch.Tensor, stride: int, pooled_pos=None, shift: int = 1) -> torch.Tensor:
+        """
+        Return the descending range of relative distances between `pooled_pos` (defaults to `pos`) and `pos`.
+        """
+        anchor = pos if pooled_pos is None else pooled_pos
+        first_anchor = anchor[0]
+
+        # Upper end: offset between the two first positions plus `shift * len(anchor)` steps of size `stride`.
+        upper = (first_anchor - pos[0]) + shift * len(anchor) * stride
+        # Lower end: first anchor position measured against the last entry of `pos`.
+        lower = first_anchor - pos[-1]
+
+        return torch.arange(upper, lower - 1, -stride, dtype=torch.long, device=pos.device)
+
+    def stride_pool(
+        self,
+        tensor: Union[torch.Tensor, tuple[torch.Tensor], list[torch.Tensor]],
+        axis: Union[int, tuple[int], list[int]],
+    ) -> torch.Tensor:
+        """
+        Keep every second entry of `tensor` along `axis` (an int, or several axes handled one after the other).
+        """
+        if tensor is None:
+            return None
+
+        # Do the stride pool recursively if axis is a list or a tuple of ints.
+        if isinstance(axis, (list, tuple)):
+            for ax in axis:
+                tensor = self.stride_pool(tensor, ax)
+            return tensor
+
+        # Do the stride pool recursively if tensor is a list or tuple of tensors; the container type is kept.
+        if isinstance(tensor, (tuple, list)):
+            return type(tensor)(self.stride_pool(x, axis) for x in tensor)
+
+        # Deal with negative axis, so that `axis` counts the leading dimensions that are taken whole.
+        axis %= tensor.ndim
+        # Indices are built as tuples: indexing a tensor with a list of slices is deprecated in recent PyTorch.
+        axis_slice = (
+            slice(None, -1, 2) if self.config.separate_cls and self.config.truncate_seq else slice(None, None, 2)
+        )
+        enc_slice = (slice(None),) * axis + (axis_slice,)
+        if self.config.separate_cls:  # the first entry along `axis` is duplicated in front before striding
+            cls_slice = (slice(None),) * axis + (slice(None, 1),)
+            tensor = torch.cat([tensor[cls_slice], tensor], dim=axis)
+        return tensor[enc_slice]
+
+    def pool_tensor(
+        self, tensor: Union[torch.Tensor, tuple[torch.Tensor], list[torch.Tensor]], mode: str = "mean", stride: int = 2
+    ) -> torch.Tensor:
+        """Apply 1D pooling ("mean", "max" or "min") to a tensor of size [B x T (x H)]; `None` is returned as is."""
+        if tensor is None:
+            return None
+
+        # Do the pool recursively on each element if tensor is a list or tuple of tensors.
+        if isinstance(tensor, (tuple, list)):
+            return type(tensor)(self.pool_tensor(x, mode=mode, stride=stride) for x in tensor)
+
+        if self.config.separate_cls:  # repeat the first entry of dim 1 in front (dropping the last if truncating)
+            suffix = tensor[:, :-1] if self.config.truncate_seq else tensor
+            tensor = torch.cat([tensor[:, :1], suffix], dim=1)
+
+        ndim = tensor.ndim  # 2D/3D inputs get singleton axes for the 2D pooling ops, removed again at the end
+        if ndim == 2:
+            tensor = tensor[:, None, :, None]
+        elif ndim == 3:
+            tensor = tensor[:, None, :, :]
+        # Stride is applied on the second-to-last dimension.
+        stride = (stride, 1)
+
+        if mode == "mean":
+            tensor = nn.functional.avg_pool2d(tensor, stride, stride=stride, ceil_mode=True)
+        elif mode == "max":
+            tensor = nn.functional.max_pool2d(tensor, stride, stride=stride, ceil_mode=True)
+        elif mode == "min":  # min pooling expressed as negated max pooling
+            tensor = -nn.functional.max_pool2d(-tensor, stride, stride=stride, ceil_mode=True)
+        else:
+            raise NotImplementedError("The supported modes are 'mean', 'max' and 'min'.")
+
+        if ndim == 2:
+            return tensor[:, 0, :, 0]
+        elif ndim == 3:
+            return tensor[:, 0]
+        return tensor
+
+    def pre_attention_pooling(
+        self, output, attention_inputs: tuple[torch.Tensor]
+    ) -> tuple[torch.Tensor, tuple[torch.Tensor]]:
+        """Pool `output` and parts of `attention_inputs` before the attention layer; returns both, re-packed."""
+        position_embeds, token_type_mat, attention_mask, cls_mask = attention_inputs
+        if self.config.pool_q_only:  # `attention_mask` is left as is in this branch
+            if self.config.attention_type == "factorized":  # pool the first two entries, keep the rest
+                position_embeds = self.stride_pool(position_embeds[:2], 0) + position_embeds[2:]
+            token_type_mat = self.stride_pool(token_type_mat, 1)
+            cls_mask = self.stride_pool(cls_mask, 0)
+            output = self.pool_tensor(output, mode=self.config.pooling_type)
+        else:
+            self.pooling_mult *= 2  # side effect: the pooling factor stored on `self` is doubled
+            if self.config.attention_type == "factorized":
+                position_embeds = self.stride_pool(position_embeds, 0)
+            token_type_mat = self.stride_pool(token_type_mat, [1, 2])
+            cls_mask = self.stride_pool(cls_mask, [1, 2])
+            attention_mask = self.pool_tensor(attention_mask, mode="min")
+            output = self.pool_tensor(output, mode=self.config.pooling_type)
+        attention_inputs = (position_embeds, token_type_mat, attention_mask, cls_mask)
+        return output, attention_inputs
+
+    def post_attention_pooling(self, attention_inputs: tuple[torch.Tensor]) -> tuple[torch.Tensor]:
+        """Pool the remaining parts of `attention_inputs` after the attention layer (only when `pool_q_only`)."""
+        position_embeds, token_type_mat, attention_mask, cls_mask = attention_inputs
+        if self.config.pool_q_only:  # otherwise the inputs are returned re-packed but unchanged
+            self.pooling_mult *= 2  # side effect: the pooling factor stored on `self` is doubled
+            if self.config.attention_type == "factorized":  # keep the first two entries, pool the rest
+                position_embeds = position_embeds[:2] + self.stride_pool(position_embeds[2:], 0)
+            token_type_mat = self.stride_pool(token_type_mat, 2)
+            cls_mask = self.stride_pool(cls_mask, 1)
+            attention_mask = self.pool_tensor(attention_mask, mode="min")
+        attention_inputs = (position_embeds, token_type_mat, attention_mask, cls_mask)
+        return attention_inputs
+
+
+def _relative_shift_gather(positional_attn: torch.Tensor, context_len: int, shift: int) -> torch.Tensor:
+    """Realign relative-position scores of shape [batch, head, seq_len, max_rel_len] to `context_len` columns."""
+    bsz, heads, q_len, rel_len = positional_attn.shape
+    # rel_len = 2 * context_len + shift - 1 is the number of possible relative positions i - j.
+
+    # The reshape/slice sequence below is the same as doing the following gather, which might be clearer code
+    # but is less efficient:
+    # idxs = context_len + torch.arange(0, context_len).unsqueeze(0) - torch.arange(0, seq_len).unsqueeze(1)
+    # return positional_attn.gather(3, idxs.expand([batch_size, n_head, context_len, context_len]))
+
+    # Reinterpret the two trailing axes as [rel_len, q_len] and drop the first `shift` rows ...
+    shifted = positional_attn.reshape(bsz, heads, rel_len, q_len)[:, :, shift:, :]
+    # ... then read the remainder as [q_len, rel_len - shift] and keep the first `context_len` columns.
+    return shifted.reshape(bsz, heads, q_len, rel_len - shift)[..., :context_len]
+
+
+class FunnelRelMultiheadAttention(nn.Module):  # relative multi-head attention with residual + layer norm
+    def __init__(self, config: FunnelConfig, block_index: int) -> None:
+        super().__init__()
+        self.config = config
+        self.block_index = block_index
+        d_model, n_head, d_head = config.d_model, config.n_head, config.d_head
+
+        self.hidden_dropout = nn.Dropout(config.hidden_dropout)
+        self.attention_dropout = nn.Dropout(config.attention_dropout)
+
+        self.q_head = nn.Linear(d_model, n_head * d_head, bias=False)
+        self.k_head = nn.Linear(d_model, n_head * d_head)
+        self.v_head = nn.Linear(d_model, n_head * d_head)
+
+        self.r_w_bias = nn.Parameter(torch.zeros([n_head, d_head]))
+        self.r_r_bias = nn.Parameter(torch.zeros([n_head, d_head]))
+        self.r_kernel = nn.Parameter(torch.zeros([d_model, n_head, d_head]))
+        self.r_s_bias = nn.Parameter(torch.zeros([n_head, d_head]))
+        self.seg_embed = nn.Parameter(torch.zeros([2, n_head, d_head]))
+
+        self.post_proj = nn.Linear(n_head * d_head, d_model)
+        self.layer_norm = nn.LayerNorm(d_model, eps=config.layer_norm_eps)
+        self.scale = 1.0 / (d_head**0.5)  # 1 / sqrt(d_head)
+
+    def relative_positional_attention(self, position_embeds, q_head, context_len, cls_mask=None):
+        """Relative attention score for the positional encodings, multiplied by `cls_mask` when one is given."""
+        # q_head has shape batch_size x seq_len x n_head x d_head
+        if self.config.attention_type == "factorized":
+            # Notations from the paper, appendix A.2.2, final formula (https://huggingface.co/papers/2006.03236)
+            # phi and pi have shape seq_len x d_model, psi and omega have shape context_len x d_model
+            phi, pi, psi, omega = position_embeds
+            # Shape n_head x d_head
+            u = self.r_r_bias * self.scale
+            # Shape d_model x n_head x d_head
+            w_r = self.r_kernel
+
+            # Shape batch_size x seq_len x n_head x d_model
+            q_r_attention = torch.einsum("binh,dnh->bind", q_head + u, w_r)
+            q_r_attention_1 = q_r_attention * phi[:, None]
+            q_r_attention_2 = q_r_attention * pi[:, None]
+
+            # Shape batch_size x n_head x seq_len x context_len
+            positional_attn = torch.einsum("bind,jd->bnij", q_r_attention_1, psi) + torch.einsum(
+                "bind,jd->bnij", q_r_attention_2, omega
+            )
+        else:
+            shift = 2 if q_head.shape[1] != context_len else 1
+            # Notations from the paper, appendix A.2.1, final formula (https://huggingface.co/papers/2006.03236)
+            # Grab the proper positional encoding, shape max_rel_len x d_model
+            r = position_embeds[self.block_index][shift - 1]
+            # Shape n_head x d_head
+            v = self.r_r_bias * self.scale
+            # Shape d_model x n_head x d_head
+            w_r = self.r_kernel
+
+            # Shape max_rel_len x n_head x d_head
+            r_head = torch.einsum("td,dnh->tnh", r, w_r)
+            # Shape batch_size x n_head x seq_len x max_rel_len
+            positional_attn = torch.einsum("binh,tnh->bnit", q_head + v, r_head)
+            # Shape batch_size x n_head x seq_len x context_len
+            positional_attn = _relative_shift_gather(positional_attn, context_len, shift)
+
+        if cls_mask is not None:
+            positional_attn *= cls_mask
+        return positional_attn
+
+    def relative_token_type_attention(self, token_type_mat, q_head, cls_mask=None):
+        """Relative attention score for the token_type_ids"""
+        if token_type_mat is None:
+            return 0  # contributes nothing when summed with the other scores in `forward`
+        batch_size, seq_len, context_len = token_type_mat.shape
+        # q_head has shape batch_size x seq_len x n_head x d_head
+        # Shape n_head x d_head
+        r_s_bias = self.r_s_bias * self.scale
+
+        # Shape batch_size x n_head x seq_len x 2
+        token_type_bias = torch.einsum("bind,snd->bnis", q_head + r_s_bias, self.seg_embed)
+        # Shape batch_size x n_head x seq_len x context_len
+        token_type_mat = token_type_mat[:, None].expand([batch_size, q_head.shape[2], seq_len, context_len])
+        # Shapes batch_size x n_head x seq_len x 1 (the last dim is split into its two entries)
+        diff_token_type, same_token_type = torch.split(token_type_bias, 1, dim=-1)
+        # Shape batch_size x n_head x seq_len x context_len
+        token_type_attn = torch.where(
+            token_type_mat, same_token_type.expand(token_type_mat.shape), diff_token_type.expand(token_type_mat.shape)
+        )
+
+        if cls_mask is not None:
+            token_type_attn *= cls_mask
+        return token_type_attn
+
+    def forward(
+        self,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        attention_inputs: tuple[torch.Tensor],
+        output_attentions: bool = False,
+    ) -> tuple[torch.Tensor, ...]:
+        # query has shape batch_size x seq_len x d_model
+        # key and value have shapes batch_size x context_len x d_model
+        position_embeds, token_type_mat, attention_mask, cls_mask = attention_inputs
+
+        batch_size, seq_len, _ = query.shape
+        context_len = key.shape[1]
+        n_head, d_head = self.config.n_head, self.config.d_head
+
+        # Shape batch_size x seq_len x n_head x d_head
+        q_head = self.q_head(query).view(batch_size, seq_len, n_head, d_head)
+        # Shapes batch_size x context_len x n_head x d_head
+        k_head = self.k_head(key).view(batch_size, context_len, n_head, d_head)
+        v_head = self.v_head(value).view(batch_size, context_len, n_head, d_head)
+
+        q_head = q_head * self.scale
+        # Shape n_head x d_head
+        r_w_bias = self.r_w_bias * self.scale
+        # Shapes batch_size x n_head x seq_len x context_len
+        content_score = torch.einsum("bind,bjnd->bnij", q_head + r_w_bias, k_head)
+        positional_attn = self.relative_positional_attention(position_embeds, q_head, context_len, cls_mask)
+        token_type_attn = self.relative_token_type_attention(token_type_mat, q_head, cls_mask)
+
+        # merge attention scores
+        attn_score = content_score + positional_attn + token_type_attn
+
+        # precision safe in case of mixed precision training: scores are masked in float32,
+        dtype = attn_score.dtype
+        attn_score = attn_score.float()
+        # perform masking: positions where the mask is 0 get INF subtracted from their score
+        if attention_mask is not None:
+            attn_score = attn_score - INF * (1 - attention_mask[:, None, None].float())
+        # attention probability, computed by the softmax in the original dtype
+        attn_prob = torch.softmax(attn_score, dim=-1, dtype=dtype)
+        attn_prob = self.attention_dropout(attn_prob)
+
+        # attention output, shape batch_size x seq_len x n_head x d_head
+        attn_vec = torch.einsum("bnij,bjnd->bind", attn_prob, v_head)
+
+        # Shape batch_size x seq_len x d_model
+        attn_out = self.post_proj(attn_vec.reshape(batch_size, seq_len, n_head * d_head))
+        attn_out = self.hidden_dropout(attn_out)
+
+        output = self.layer_norm(query + attn_out)
+        return (output, attn_prob) if output_attentions else (output,)
+
+
+class FunnelPositionwiseFFN(nn.Module):
+    """Position-wise feed-forward sublayer: two linear maps, dropout, a residual connection and layer norm."""
+
+    def __init__(self, config: FunnelConfig) -> None:
+        super().__init__()
+        self.linear_1 = nn.Linear(config.d_model, config.d_inner)
+        self.activation_function = ACT2FN[config.hidden_act]
+        self.activation_dropout = nn.Dropout(config.activation_dropout)
+        self.linear_2 = nn.Linear(config.d_inner, config.d_model)
+        self.dropout = nn.Dropout(config.hidden_dropout)
+        self.layer_norm = nn.LayerNorm(config.d_model, eps=config.layer_norm_eps)
+
+    def forward(self, hidden: torch.Tensor) -> torch.Tensor:
+        expanded = self.activation_dropout(self.activation_function(self.linear_1(hidden)))
+        projected = self.dropout(self.linear_2(expanded))
+        # Residual connection around the feed-forward path, normalized afterwards.
+        return self.layer_norm(hidden + projected)
+
+
+class FunnelLayer(nn.Module):
+    """One attention sublayer followed by one position-wise feed-forward sublayer."""
+
+    def __init__(self, config: FunnelConfig, block_index: int) -> None:
+        super().__init__()
+        self.attention = FunnelRelMultiheadAttention(config, block_index)
+        self.ffn = FunnelPositionwiseFFN(config)
+
+    def forward(
+        self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, attention_inputs,
+        output_attentions: bool = False,
+    ) -> tuple:
+        attention_outputs = self.attention(query, key, value, attention_inputs, output_attentions=output_attentions)
+        hidden = self.ffn(attention_outputs[0])
+        if not output_attentions:
+            return (hidden,)
+        return (hidden, attention_outputs[1])
+
+
+class FunnelEncoder(nn.Module):
+    def __init__(self, config: FunnelConfig) -> None:
+        super().__init__()
+        self.config = config
+        self.attention_structure = FunnelAttentionStructure(config)
+        self.blocks = nn.ModuleList(
+            [
+                nn.ModuleList([FunnelLayer(config, block_index) for _ in range(block_size)])
+                for block_index, block_size in enumerate(config.block_sizes)
+            ]
+        )
+
+    def forward(
+        self,
+        inputs_embeds: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+    ) -> Union[tuple, BaseModelOutput]:
+        """Run every block on `inputs_embeds`; blocks after the first pool the sequence first while it is long enough."""
+        # Pooling is not implemented on long tensors, so the mask is converted; a missing mask stays `None`.
+        if attention_mask is not None:
+            attention_mask = attention_mask.type_as(inputs_embeds)
+        attention_inputs = self.attention_structure.init_attention_inputs(
+            inputs_embeds, attention_mask=attention_mask, token_type_ids=token_type_ids
+        )
+        hidden = inputs_embeds
+
+        all_hidden_states = (inputs_embeds,) if output_hidden_states else None
+        all_attentions = () if output_attentions else None
+
+        for block_index, block in enumerate(self.blocks):
+            pooling_flag = hidden.size(1) > (2 if self.config.separate_cls else 1)
+            pooling_flag = pooling_flag and block_index > 0
+            if pooling_flag:
+                pooled_hidden, attention_inputs = self.attention_structure.pre_attention_pooling(
+                    hidden, attention_inputs
+                )
+            for layer_index, layer in enumerate(block):
+                for repeat_index in range(self.config.block_repeats[block_index]):
+                    do_pooling = (repeat_index == 0) and (layer_index == 0) and pooling_flag
+                    if do_pooling:
+                        query = pooled_hidden
+                        key = value = hidden if self.config.pool_q_only else pooled_hidden
+                    else:
+                        query = key = value = hidden
+                    layer_output = layer(query, key, value, attention_inputs, output_attentions=output_attentions)
+                    hidden = layer_output[0]
+                    if do_pooling:
+                        attention_inputs = self.attention_structure.post_attention_pooling(attention_inputs)
+
+                    if output_attentions:
+                        all_attentions = all_attentions + layer_output[1:]
+                    if output_hidden_states:
+                        all_hidden_states = all_hidden_states + (hidden,)
+
+        if not return_dict:
+            return tuple(v for v in [hidden, all_hidden_states, all_attentions] if v is not None)
+        return BaseModelOutput(last_hidden_state=hidden, hidden_states=all_hidden_states, attentions=all_attentions)
+
+
+def upsample(
+    x: torch.Tensor, stride: int, target_len: int, separate_cls: bool = True, truncate_seq: bool = False
+) -> torch.Tensor:
+    """
+    Repeat every token of `x` `stride` times along dim 1 and cut the result to `target_len` (no-op for stride 1).
+    """
+    if stride == 1:
+        return x
+
+    if not separate_cls:
+        return torch.repeat_interleave(x, repeats=stride, dim=1)[:, :target_len]
+
+    # The first token is set aside so that it is never repeated.
+    first_token, rest = x[:, :1], x[:, 1:]
+    repeated = torch.repeat_interleave(rest, repeats=stride, dim=1)
+    if truncate_seq:
+        # Append `stride - 1` zero entries at the end of the second-to-last dimension before cutting.
+        repeated = nn.functional.pad(repeated, (0, 0, 0, stride - 1, 0, 0))
+    # `target_len - 1` repeated tokens, with the first token put back in front.
+    return torch.cat([first_token, repeated[:, : target_len - 1]], dim=1)
+
+
+class FunnelDecoder(nn.Module):
+    """Upsamples the final encoder output, adds the first block's output and runs the decoder layers."""
+
+    def __init__(self, config: FunnelConfig) -> None:
+        super().__init__()
+        self.config = config
+        self.attention_structure = FunnelAttentionStructure(config)
+        self.layers = nn.ModuleList([FunnelLayer(config, 0) for _ in range(config.num_decoder_layers)])
+
+    def forward(
+        self,
+        final_hidden: torch.Tensor,
+        first_block_hidden: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+    ) -> Union[tuple, BaseModelOutput]:
+        cfg = self.config
+        upsampled = upsample(
+            final_hidden,
+            stride=2 ** (len(cfg.block_sizes) - 1),
+            target_len=first_block_hidden.shape[1],
+            separate_cls=cfg.separate_cls,
+            truncate_seq=cfg.truncate_seq,
+        )
+        hidden = upsampled + first_block_hidden
+        attention_inputs = self.attention_structure.init_attention_inputs(
+            hidden, attention_mask=attention_mask, token_type_ids=token_type_ids
+        )
+
+        # The collected hidden states start with the decoder input; `None` marks an output that was not requested.
+        hidden_trace = (hidden,) if output_hidden_states else None
+        attention_trace = () if output_attentions else None
+        for layer in self.layers:
+            # The same tensor serves as query, key and value.
+            layer_output = layer(hidden, hidden, hidden, attention_inputs, output_attentions=output_attentions)
+            hidden = layer_output[0]
+            if output_attentions:
+                attention_trace += layer_output[1:]
+            if output_hidden_states:
+                hidden_trace += (hidden,)
+
+        if return_dict:
+            return BaseModelOutput(last_hidden_state=hidden, hidden_states=hidden_trace, attentions=attention_trace)
+        return tuple(v for v in [hidden, hidden_trace, attention_trace] if v is not None)
+
+
+class FunnelDiscriminatorPredictions(nn.Module):
+    """Discriminator head: a dense layer, the configured activation, then a dense layer with a single output."""
+
+    def __init__(self, config: FunnelConfig) -> None:
+        super().__init__()
+        self.config = config
+        self.dense = nn.Linear(config.d_model, config.d_model)
+        self.dense_prediction = nn.Linear(config.d_model, 1)
+
+    def forward(self, discriminator_hidden_states: torch.Tensor) -> torch.Tensor:
+        projected = self.dense(discriminator_hidden_states)
+        activated = ACT2FN[self.config.hidden_act](projected)
+        # The trailing size-1 output dimension is squeezed away.
+        return self.dense_prediction(activated).squeeze(-1)
+
+
+@auto_docstring
+class FunnelPreTrainedModel(PreTrainedModel):
+    config: FunnelConfig
+    load_tf_weights = load_tf_weights_in_funnel
+    base_model_prefix = "funnel"
+
+    def _init_weights(self, module):
+        """Initialize the parameters of `module` in place, choosing the scheme from its class name."""
+        classname = module.__class__.__name__
+        if "Linear" in classname:
+            if getattr(module, "weight", None) is not None:
+                std = self.config.initializer_std
+                if std is None:
+                    # No std configured: derive it from the two dimensions of the weight matrix.
+                    fan_out, fan_in = module.weight.shape
+                    std = np.sqrt(1.0 / float(fan_in + fan_out))
+                nn.init.normal_(module.weight, std=std)
+            if getattr(module, "bias", None) is not None:
+                nn.init.constant_(module.bias, 0.0)
+        elif classname == "FunnelRelMultiheadAttention":
+            # Every relative-attention parameter is drawn uniformly, with `initializer_range` as upper bound.
+            for name in ("r_w_bias", "r_r_bias", "r_kernel", "r_s_bias", "seg_embed"):
+                nn.init.uniform_(getattr(module, name), b=self.config.initializer_range)
+        elif classname == "FunnelEmbeddings":
+            std = 1.0 if self.config.initializer_std is None else self.config.initializer_std
+            embedding = module.word_embeddings
+            nn.init.normal_(embedding.weight, std=std)
+            if embedding.padding_idx is not None:
+                embedding.weight.data[embedding.padding_idx].zero_()
+
+
+class FunnelClassificationHead(nn.Module):
+    def __init__(self, config: FunnelConfig, n_labels: int) -> None:
+        super().__init__()
+        self.linear_hidden = nn.Linear(config.d_model, config.d_model)
+        self.dropout = nn.Dropout(config.hidden_dropout)
+        self.linear_out = nn.Linear(config.d_model, n_labels)
+
+    def forward(self, hidden: torch.Tensor) -> torch.Tensor:
+        """Map `hidden` to `n_labels` scores: linear -> tanh -> dropout -> linear."""
+        squashed = torch.tanh(self.linear_hidden(hidden))
+        # Dropout sits between the tanh and the output projection.
+        return self.linear_out(self.dropout(squashed))
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Output type of [`FunnelForPreTraining`].
+    """
+)
+class FunnelForPreTrainingOutput(ModelOutput):
+    r"""
+    loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
+        Total loss of the ELECTRA-style objective.
+    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
+        Prediction scores of the head (scores for each token before SoftMax).
+    """
+
+    loss: Optional[torch.FloatTensor] = None
+    logits: Optional[torch.FloatTensor] = None
+    # NOTE(review): the two fields below are not described in the docstring above; presumably `auto_docstring` adds them — confirm
+    hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    attentions: Optional[tuple[torch.FloatTensor]] = None
+
+
+@auto_docstring(
+    custom_intro="""
+    The base Funnel Transformer Model transformer outputting raw hidden-states without upsampling head (also called
+    decoder) or any task-specific head on top.
+    """
+)
+class FunnelBaseModel(FunnelPreTrainedModel):
+    def __init__(self, config: FunnelConfig) -> None:
+        super().__init__(config)
+
+        self.embeddings = FunnelEmbeddings(config)
+        self.encoder = FunnelEncoder(config)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self) -> nn.Embedding:
+        return self.embeddings.word_embeddings
+
+    def set_input_embeddings(self, new_embeddings: nn.Embedding) -> None:
+        self.embeddings.word_embeddings = new_embeddings
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,  # not read anywhere in this method
+        head_mask: Optional[torch.Tensor] = None,  # not read anywhere in this method (see the TODO below)
+        inputs_embeds: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, BaseModelOutput]:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+        elif input_ids is not None:
+            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
+            input_shape = input_ids.size()
+        elif inputs_embeds is not None:
+            input_shape = inputs_embeds.size()[:-1]  # all dimensions but the last one
+        else:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+        device = input_ids.device if input_ids is not None else inputs_embeds.device
+
+        if attention_mask is None:
+            attention_mask = torch.ones(input_shape, device=device)  # default: all ones
+        if token_type_ids is None:
+            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)  # default: all zeros
+
+        # TODO: deal with head_mask
+        inputs_embeds = self.embeddings(input_ids, inputs_embeds=inputs_embeds)
+
+        encoder_outputs = self.encoder(
+            inputs_embeds,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        return encoder_outputs  # passed through exactly as the encoder produced it
+
+
+@auto_docstring
+class FunnelModel(FunnelPreTrainedModel):
+    # Full Funnel stack: embeddings -> encoder -> decoder. `forward` exposes the decoder's first output as
+    # `last_hidden_state`.
+    def __init__(self, config: FunnelConfig) -> None:
+        super().__init__(config)
+        self.config = config
+        self.embeddings = FunnelEmbeddings(config)
+        self.encoder = FunnelEncoder(config)
+        self.decoder = FunnelDecoder(config)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self) -> nn.Embedding:
+        # The token embedding table lives on the embeddings sub-module.
+        return self.embeddings.word_embeddings
+
+    def set_input_embeddings(self, new_embeddings: nn.Embedding) -> None:
+        # Swap the token embedding table in place.
+        self.embeddings.word_embeddings = new_embeddings
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, BaseModelOutput]:
+        # Per-call flags fall back to the values stored on the config when left as None.
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # Exactly one of `input_ids` / `inputs_embeds` must be supplied; both or neither raises ValueError.
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+        elif input_ids is not None:
+            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
+            input_shape = input_ids.size()
+        elif inputs_embeds is not None:
+            # Drop the trailing embedding dimension to get the per-token shape.
+            input_shape = inputs_embeds.size()[:-1]
+        else:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+        device = input_ids.device if input_ids is not None else inputs_embeds.device
+
+        # Defaults: attend to every position, and put every token in segment 0.
+        if attention_mask is None:
+            attention_mask = torch.ones(input_shape, device=device)
+        if token_type_ids is None:
+            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
+
+        # TODO: deal with head_mask
+        inputs_embeds = self.embeddings(input_ids, inputs_embeds=inputs_embeds)
+
+        # Hidden states are always requested from the encoder (regardless of the caller's flag) because the
+        # decoder call below indexes into them for `first_block_hidden`.
+        encoder_outputs = self.encoder(
+            inputs_embeds,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            output_attentions=output_attentions,
+            output_hidden_states=True,
+            return_dict=return_dict,
+        )
+
+        # `encoder_outputs[1]` is the encoder's hidden-state collection; the entry at index `block_sizes[0]` is
+        # handed to the decoder together with the encoder's final output.
+        decoder_outputs = self.decoder(
+            final_hidden=encoder_outputs[0],
+            first_block_hidden=encoder_outputs[1][self.config.block_sizes[0]],
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        if not return_dict:
+            # Tuple output: `idx` walks the optional decoder entries, which are only present when requested.
+            idx = 0
+            outputs = (decoder_outputs[0],)
+            if output_hidden_states:
+                idx += 1
+                # Encoder hidden states followed by decoder hidden states, as one concatenated tuple.
+                outputs = outputs + (encoder_outputs[1] + decoder_outputs[idx],)
+            if output_attentions:
+                idx += 1
+                # The encoder always returns hidden states here, so its attentions sit at index 2.
+                outputs = outputs + (encoder_outputs[2] + decoder_outputs[idx],)
+            return outputs
+
+        # Structured output: encoder and decoder collections are concatenated, or None when not requested.
+        return BaseModelOutput(
+            last_hidden_state=decoder_outputs[0],
+            hidden_states=(encoder_outputs.hidden_states + decoder_outputs.hidden_states)
+            if output_hidden_states
+            else None,
+            attentions=(encoder_outputs.attentions + decoder_outputs.attentions) if output_attentions else None,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    Funnel Transformer model with a binary classification head on top as used during pretraining for identifying
+    generated tokens.
+    """
+)
+class FunnelForPreTraining(FunnelPreTrainedModel):
+    def __init__(self, config: FunnelConfig) -> None:
+        super().__init__(config)
+
+        # Backbone plus the discriminator head applied to its first output.
+        self.funnel = FunnelModel(config)
+        self.discriminator_predictions = FunnelDiscriminatorPredictions(config)
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, FunnelForPreTrainingOutput]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the ELECTRA-style loss. Input should be a sequence of tokens (see `input_ids`
+            docstring) Indices should be in `[0, 1]`:
+
+            - 0 indicates the token is an original token,
+            - 1 indicates the token was replaced.
+
+        Examples:
+
+        ```python
+        >>> from transformers import AutoTokenizer, FunnelForPreTraining
+        >>> import torch
+
+        >>> tokenizer = AutoTokenizer.from_pretrained("funnel-transformer/small")
+        >>> model = FunnelForPreTraining.from_pretrained("funnel-transformer/small")
+
+        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+        >>> logits = model(**inputs).logits
+        ```"""
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        discriminator_hidden_states = self.funnel(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        # Index 0 is the backbone's `last_hidden_state` in both tuple and structured outputs.
+        discriminator_sequence_output = discriminator_hidden_states[0]
+
+        logits = self.discriminator_predictions(discriminator_sequence_output)
+
+        loss = None
+        if labels is not None:
+            loss_fct = nn.BCEWithLogitsLoss()
+            if attention_mask is not None:
+                # Only positions whose attention mask equals 1 contribute to the loss.
+                active_loss = attention_mask.view(-1, discriminator_sequence_output.shape[1]) == 1
+                active_logits = logits.view(-1, discriminator_sequence_output.shape[1])[active_loss]
+                active_labels = labels[active_loss]
+                loss = loss_fct(active_logits, active_labels.float())
+            else:
+                # No mask supplied: every position contributes.
+                loss = loss_fct(logits.view(-1, discriminator_sequence_output.shape[1]), labels.float())
+
+        if not return_dict:
+            # Tuple output: (loss?, logits, *remaining backbone outputs).
+            output = (logits,) + discriminator_hidden_states[1:]
+            return ((loss,) + output) if loss is not None else output
+
+        return FunnelForPreTrainingOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=discriminator_hidden_states.hidden_states,
+            attentions=discriminator_hidden_states.attentions,
+        )
+
+
+@auto_docstring
+class FunnelForMaskedLM(FunnelPreTrainedModel):
+    # Funnel backbone with a linear head projecting each position to vocabulary logits.
+    _tied_weights_keys = ["lm_head.weight"]
+
+    def __init__(self, config: FunnelConfig) -> None:
+        super().__init__(config)
+
+        self.funnel = FunnelModel(config)
+        # Projects `d_model`-sized hidden states to `vocab_size` logits.
+        self.lm_head = nn.Linear(config.d_model, config.vocab_size)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_output_embeddings(self) -> nn.Linear:
+        # The output projection is the LM head itself.
+        return self.lm_head
+
+    def set_output_embeddings(self, new_embeddings: nn.Embedding) -> None:
+        # NOTE(review): annotated as `nn.Embedding` although `lm_head` is created as `nn.Linear` above.
+        self.lm_head = new_embeddings
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, MaskedLMOutput]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
+            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
+            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.funnel(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        last_hidden_state = outputs[0]
+        prediction_logits = self.lm_head(last_hidden_state)
+
+        masked_lm_loss = None
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()  # -100 index = padding token
+            # Flatten to (tokens, vocab) vs (tokens,) so the loss is computed per token.
+            masked_lm_loss = loss_fct(prediction_logits.view(-1, self.config.vocab_size), labels.view(-1))
+
+        if not return_dict:
+            # Tuple output: (loss?, logits, *remaining backbone outputs).
+            output = (prediction_logits,) + outputs[1:]
+            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
+
+        return MaskedLMOutput(
+            loss=masked_lm_loss,
+            logits=prediction_logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    Funnel Transformer Model with a sequence classification/regression head on top (two linear layer on top of the
+    first timestep of the last hidden state) e.g. for GLUE tasks.
+    """
+)
+class FunnelForSequenceClassification(FunnelPreTrainedModel):
+    def __init__(self, config: FunnelConfig) -> None:
+        super().__init__(config)
+        self.num_labels = config.num_labels
+        self.config = config
+
+        # Uses `FunnelBaseModel` (not `FunnelModel`) as the backbone.
+        self.funnel = FunnelBaseModel(config)
+        self.classifier = FunnelClassificationHead(config, config.num_labels)
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, SequenceClassifierOutput]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.funnel(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        last_hidden_state = outputs[0]
+        # The hidden state at the first sequence position serves as the pooled representation.
+        pooled_output = last_hidden_state[:, 0]
+        logits = self.classifier(pooled_output)
+
+        loss = None
+        if labels is not None:
+            # When unset, the problem type is inferred from `num_labels` and the label dtype, and the result is
+            # written back to `self.config` (so it persists across later calls).
+            if self.config.problem_type is None:
+                if self.num_labels == 1:
+                    self.config.problem_type = "regression"
+                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
+                    self.config.problem_type = "single_label_classification"
+                else:
+                    self.config.problem_type = "multi_label_classification"
+
+            if self.config.problem_type == "regression":
+                loss_fct = MSELoss()
+                if self.num_labels == 1:
+                    # Squeeze both sides so a single-target regression compares matching shapes.
+                    loss = loss_fct(logits.squeeze(), labels.squeeze())
+                else:
+                    loss = loss_fct(logits, labels)
+            elif self.config.problem_type == "single_label_classification":
+                loss_fct = CrossEntropyLoss()
+                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+            elif self.config.problem_type == "multi_label_classification":
+                loss_fct = BCEWithLogitsLoss()
+                loss = loss_fct(logits, labels)
+
+        if not return_dict:
+            # Tuple output: (loss?, logits, *remaining backbone outputs).
+            output = (logits,) + outputs[1:]
+            return ((loss,) + output) if loss is not None else output
+
+        return SequenceClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+@auto_docstring
+class FunnelForMultipleChoice(FunnelPreTrainedModel):
+    # Scores each choice with a single-output classification head, then groups the scores per example.
+    def __init__(self, config: FunnelConfig) -> None:
+        super().__init__(config)
+
+        self.funnel = FunnelBaseModel(config)
+        # One logit per (example, choice) pair.
+        self.classifier = FunnelClassificationHead(config, 1)
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, MultipleChoiceModelOutput]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
+            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
+            `input_ids` above)
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        # The number of choices is read from dimension 1 of whichever input was provided.
+        num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
+
+        # Fold the choices dimension into the batch dimension so the backbone sees ordinary sequences.
+        input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
+        attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
+        token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
+        inputs_embeds = (
+            inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
+            if inputs_embeds is not None
+            else None
+        )
+
+        outputs = self.funnel(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        last_hidden_state = outputs[0]
+        # The hidden state at the first sequence position serves as the pooled representation.
+        pooled_output = last_hidden_state[:, 0]
+        logits = self.classifier(pooled_output)
+        # Regroup the per-choice scores into rows of `num_choices`.
+        reshaped_logits = logits.view(-1, num_choices)
+
+        loss = None
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()
+            loss = loss_fct(reshaped_logits, labels)
+
+        if not return_dict:
+            # Tuple output: (loss?, logits, *remaining backbone outputs).
+            output = (reshaped_logits,) + outputs[1:]
+            return ((loss,) + output) if loss is not None else output
+
+        return MultipleChoiceModelOutput(
+            loss=loss,
+            logits=reshaped_logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+@auto_docstring
+class FunnelForTokenClassification(FunnelPreTrainedModel):
+    # Funnel backbone followed by dropout and a per-token linear classifier.
+    def __init__(self, config: FunnelConfig) -> None:
+        super().__init__(config)
+        self.num_labels = config.num_labels
+
+        self.funnel = FunnelModel(config)
+        self.dropout = nn.Dropout(config.hidden_dropout)
+        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, TokenClassifierOutput]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.funnel(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        last_hidden_state = outputs[0]
+        last_hidden_state = self.dropout(last_hidden_state)
+        logits = self.classifier(last_hidden_state)
+
+        loss = None
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()
+            # Flatten to (tokens, num_labels) vs (tokens,) so the loss is computed per token.
+            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+
+        if not return_dict:
+            # Tuple output: (loss?, logits, *remaining backbone outputs).
+            output = (logits,) + outputs[1:]
+            return ((loss,) + output) if loss is not None else output
+
+        return TokenClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+@auto_docstring
+class FunnelForQuestionAnswering(FunnelPreTrainedModel):
+    # Funnel backbone with a linear head producing span start/end logits per token.
+    def __init__(self, config: FunnelConfig) -> None:
+        super().__init__(config)
+        self.num_labels = config.num_labels
+
+        self.funnel = FunnelModel(config)
+        # `forward` unpacks this head's output into exactly two slices (start, end), so `num_labels` must be 2.
+        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        start_positions: Optional[torch.Tensor] = None,
+        end_positions: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, QuestionAnsweringModelOutput]:
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.funnel(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        last_hidden_state = outputs[0]
+
+        logits = self.qa_outputs(last_hidden_state)
+        # Split the last dimension into width-1 slices and drop that dimension from each.
+        start_logits, end_logits = logits.split(1, dim=-1)
+        start_logits = start_logits.squeeze(-1).contiguous()
+        end_logits = end_logits.squeeze(-1).contiguous()
+
+        total_loss = None
+        # The loss is only computed when both start and end targets are given.
+        if start_positions is not None and end_positions is not None:
+            # On multi-GPU the targets may arrive with an extra trailing dimension; remove it.
+            if len(start_positions.size()) > 1:
+                start_positions = start_positions.squeeze(-1)
+            if len(end_positions.size()) > 1:
+                end_positions = end_positions.squeeze(-1)
+            # sometimes the start/end positions are outside our model inputs, we ignore these terms
+            # Out-of-range targets are clamped to `ignored_index` (the sequence length), which the loss skips.
+            ignored_index = start_logits.size(1)
+            start_positions = start_positions.clamp(0, ignored_index)
+            end_positions = end_positions.clamp(0, ignored_index)
+
+            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
+            start_loss = loss_fct(start_logits, start_positions)
+            end_loss = loss_fct(end_logits, end_positions)
+            # Average of the start and end losses.
+            total_loss = (start_loss + end_loss) / 2
+
+        if not return_dict:
+            # Tuple output: (loss?, start_logits, end_logits, *remaining backbone outputs).
+            output = (start_logits, end_logits) + outputs[1:]
+            return ((total_loss,) + output) if total_loss is not None else output
+
+        return QuestionAnsweringModelOutput(
+            loss=total_loss,
+            start_logits=start_logits,
+            end_logits=end_logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+# Names exported by `from <this module> import *`.
+__all__ = [
+    "FunnelBaseModel",
+    "FunnelForMaskedLM",
+    "FunnelForMultipleChoice",
+    "FunnelForPreTraining",
+    "FunnelForQuestionAnswering",
+    "FunnelForSequenceClassification",
+    "FunnelForTokenClassification",
+    "FunnelModel",
+    "FunnelPreTrainedModel",
+    "load_tf_weights_in_funnel",
+]
diff --git a/phivenv/Lib/site-packages/transformers/models/funnel/modeling_tf_funnel.py b/phivenv/Lib/site-packages/transformers/models/funnel/modeling_tf_funnel.py
new file mode 100644
index 0000000000000000000000000000000000000000..3d57fa99eaa14a38214da8200b0af768aef9ddaf
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/funnel/modeling_tf_funnel.py
@@ -0,0 +1,1883 @@
+# coding=utf-8
+# Copyright 2020-present Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""TF 2.0 Funnel model."""
+
+from __future__ import annotations
+
+import warnings
+from dataclasses import dataclass
+
+import numpy as np
+import tensorflow as tf
+
+from ...activations_tf import get_tf_activation
+from ...modeling_tf_outputs import (
+    TFBaseModelOutput,
+    TFMaskedLMOutput,
+    TFMultipleChoiceModelOutput,
+    TFQuestionAnsweringModelOutput,
+    TFSequenceClassifierOutput,
+    TFTokenClassifierOutput,
+)
+from ...modeling_tf_utils import (
+    TFMaskedLanguageModelingLoss,
+    TFModelInputType,
+    TFMultipleChoiceLoss,
+    TFPreTrainedModel,
+    TFQuestionAnsweringLoss,
+    TFSequenceClassificationLoss,
+    TFTokenClassificationLoss,
+    get_initializer,
+    keras,
+    keras_serializable,
+    unpack_inputs,
+)
+from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax
+from ...utils import (
+    ModelOutput,
+    add_code_sample_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+    replace_return_docstrings,
+)
+from .configuration_funnel import FunnelConfig
+
+
+# Module-level logger obtained from the library's logging wrapper.
+logger = logging.get_logger(__name__)
+
+# Config class name referenced from generated documentation.
+_CONFIG_FOR_DOC = "FunnelConfig"
+
+
+# NOTE(review): presumably a large finite stand-in for infinity (e.g. for masking); its use is outside this view.
+INF = 1e6
+
+
+class TFFunnelEmbeddings(keras.layers.Layer):
+    """Construct the embeddings from word, position and token_type embeddings."""
+
+    def __init__(self, config, **kwargs):
+        super().__init__(**kwargs)
+
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.initializer_std = 1.0 if config.initializer_std is None else config.initializer_std
+
+        self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm")
+        self.dropout = keras.layers.Dropout(rate=config.hidden_dropout)
+
+    def build(self, input_shape=None):
+        with tf.name_scope("word_embeddings"):
+            self.weight = self.add_weight(
+                name="weight",
+                shape=[self.config.vocab_size, self.hidden_size],
+                initializer=get_initializer(initializer_range=self.initializer_std),
+            )
+
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "LayerNorm", None) is not None:
+            with tf.name_scope(self.LayerNorm.name):
+                self.LayerNorm.build([None, None, self.config.d_model])
+
+    def call(self, input_ids=None, inputs_embeds=None, training=False):
+        """
+        Applies embedding based on inputs tensor.
+
+        Returns:
+            final_embeddings (`tf.Tensor`): output embedding tensor.
+        """
+        assert not (input_ids is None and inputs_embeds is None)
+        assert not (input_ids is not None and inputs_embeds is not None)
+
+        if input_ids is not None:
+            check_embeddings_within_bounds(input_ids, self.config.vocab_size)
+            inputs_embeds = tf.gather(self.weight, input_ids)
+
+        final_embeddings = self.LayerNorm(inputs=inputs_embeds)
+        final_embeddings = self.dropout(inputs=final_embeddings, training=training)
+
+        return final_embeddings
+
+
+class TFFunnelAttentionStructure:
+    """
+    Contains helpers for `TFFunnelRelMultiheadAttention`.
+    """
+
+    cls_token_type_id: int = 2
+
+    def __init__(self, config):
+        # Copy the attention/pooling settings this helper reads from the config.
+        self.d_model = config.d_model
+        self.attention_type = config.attention_type
+        self.num_blocks = config.num_blocks
+        self.separate_cls = config.separate_cls
+        self.truncate_seq = config.truncate_seq
+        self.pool_q_only = config.pool_q_only
+        self.pooling_type = config.pooling_type
+
+        # Separate dropout layers for the sine and cosine parts of the position encodings.
+        self.sin_dropout = keras.layers.Dropout(config.hidden_dropout)
+        self.cos_dropout = keras.layers.Dropout(config.hidden_dropout)
+        # Track where we are at in terms of pooling from the original input, e.g., by how much the sequence length was
+        # divided.
+        # Stays None until `init_attention_inputs` resets it to 1.
+        self.pooling_mult = None
+
+    def init_attention_inputs(self, inputs_embeds, attention_mask=None, token_type_ids=None, training=False):
+        """
+        Returns the attention inputs associated to the inputs of the model.
+
+        Also resets `self.pooling_mult` to 1 and records the sequence length in `self.seq_len`.
+
+        Returns:
+            A tuple `(position_embeds, token_type_mat, attention_mask, cls_mask)`. `token_type_mat` is `None` when
+            no `token_type_ids` are given, `cls_mask` is `None` unless `self.separate_cls` is set, and
+            `attention_mask` is passed through unchanged.
+        """
+        # inputs_embeds has shape batch_size x seq_len x d_model
+        # attention_mask and token_type_ids have shape batch_size x seq_len
+        self.pooling_mult = 1
+        self.seq_len = seq_len = shape_list(inputs_embeds)[1]
+        position_embeds = self.get_position_embeds(seq_len, training=training)
+        token_type_mat = self.token_type_ids_to_mat(token_type_ids) if token_type_ids is not None else None
+        # A (seq_len - 1) x (seq_len - 1) block of ones, padded with one leading row and one leading column.
+        cls_mask = (
+            tf.pad(tf.ones([seq_len - 1, seq_len - 1], dtype=inputs_embeds.dtype), [[1, 0], [1, 0]])
+            if self.separate_cls
+            else None
+        )
+        return (position_embeds, token_type_mat, attention_mask, cls_mask)
+
+    def token_type_ids_to_mat(self, token_type_ids):
+        """
+        Convert `token_type_ids` to `token_type_mat`.
+
+        The result is a boolean tensor with one extra trailing axis: entry `[..., i, j]` is True when positions `i`
+        and `j` share a token type, or when either of them carries `cls_token_type_id`.
+        """
+        # Pairwise comparison via broadcasting of the ids against themselves.
+        token_type_mat = tf.equal(tf.expand_dims(token_type_ids, -1), tf.expand_dims(token_type_ids, -2))
+        # Treat <cls> as in the same segment as both A & B
+        cls_ids = tf.equal(token_type_ids, tf.constant([self.cls_token_type_id], dtype=token_type_ids.dtype))
+        cls_mat = tf.logical_or(tf.expand_dims(cls_ids, -1), tf.expand_dims(cls_ids, -2))
+        return tf.logical_or(cls_mat, token_type_mat)
+
+    def get_position_embeds(self, seq_len, training=False):
+        """
+        Create and cache inputs related to relative position encoding. Those are very different depending on whether we
+        are using the factorized or the relative shift attention:
+
+        For the factorized attention, it returns the matrices (phi, pi, psi, omega) used in the paper, appendix A.2.2,
+        final formula.
+
+        For the relative shift attention, it returns all possible vectors R used in the paper, appendix A.2.1, final
+        formula.
+
+        Paper link: https://huggingface.co/papers/2006.03236
+
+        Returns:
+            For `attention_type == "factorized"`, the tuple `(phi, pi, psi, omega)`. Otherwise a list with one
+            `[position_embeds_no_pooling, position_embeds_pooling]` pair per block.
+        """
+        if self.attention_type == "factorized":
+            # Notations from the paper, appendix A.2.2, final formula.
+            # We need to create and return the matrices phi, psi, pi and omega.
+            pos_seq = tf.range(0, seq_len, 1.0)
+            freq_seq = tf.range(0, self.d_model // 2, 1.0)
+            inv_freq = 1 / (10000 ** (freq_seq / (self.d_model // 2)))
+            # Outer product of positions and inverse frequencies.
+            sinusoid = tf.einsum("i,d->id", pos_seq, inv_freq)
+
+            sin_embed = tf.sin(sinusoid)
+            sin_embed_d = self.sin_dropout(sin_embed, training=training)
+            cos_embed = tf.cos(sinusoid)
+            cos_embed_d = self.cos_dropout(cos_embed, training=training)
+            # This is different from the formula on the paper...
+            # phi and pi use the dropped-out embeddings; psi and omega use the raw ones.
+            phi = tf.concat([sin_embed_d, sin_embed_d], axis=-1)
+            psi = tf.concat([cos_embed, sin_embed], axis=-1)
+            pi = tf.concat([cos_embed_d, cos_embed_d], axis=-1)
+            omega = tf.concat([-sin_embed, cos_embed], axis=-1)
+            return (phi, pi, psi, omega)
+        else:
+            # Notations from the paper, appendix A.2.1, final formula.
+            # We need to create and return all the possible vectors R for all blocks and shifts.
+            freq_seq = tf.range(0, self.d_model // 2, 1.0)
+            inv_freq = 1 / (10000 ** (freq_seq / (self.d_model // 2)))
+            # Maximum relative positions for the first input
+            rel_pos_id = tf.range(-seq_len * 2, seq_len * 2, 1.0)
+            # Offset added to a relative position to turn it into a row index of `pos_embed`.
+            zero_offset = seq_len * tf.constant(2)
+            sinusoid = tf.einsum("i,d->id", rel_pos_id, inv_freq)
+            sin_embed = self.sin_dropout(tf.sin(sinusoid), training=training)
+            cos_embed = self.cos_dropout(tf.cos(sinusoid), training=training)
+            pos_embed = tf.concat([sin_embed, cos_embed], axis=-1)
+
+            pos = tf.range(0, seq_len)
+            pooled_pos = pos
+            position_embeds_list = []
+            for block_index in range(0, self.num_blocks):
+                # For each block with block_index > 0, we need two types position embeddings:
+                #   - Attention(pooled-q, unpooled-kv)
+                #   - Attention(pooled-q, pooled-kv)
+                # For block_index = 0 we only need the second one and leave the first one as None.
+
+                # First type
+                # Placeholder tensor used for block 0, where no pooled-q/unpooled-kv embedding is computed.
+                position_embeds_pooling = tf.fill([1], value=-1.0)
+
+                if block_index != 0:
+                    pooled_pos = self.stride_pool_pos(pos, block_index)
+
+                    # construct rel_pos_id
+                    stride = 2 ** (block_index - 1)
+                    rel_pos = self.relative_pos(pos, stride, pooled_pos, shift=2)
+                    # rel_pos = tf.expand_dims(rel_pos,1) + zero_offset
+                    # rel_pos = tf.broadcast_to(rel_pos, (rel_pos.shape[0], self.d_model))
+                    rel_pos = tf.cast(rel_pos, dtype=zero_offset.dtype)
+                    rel_pos = rel_pos + zero_offset
+                    position_embeds_pooling = tf.gather(pos_embed, rel_pos, axis=0)
+
+                # Second type
+                pos = pooled_pos
+                stride = 2**block_index
+                rel_pos = self.relative_pos(pos, stride)
+
+                # rel_pos = tf.expand_dims(rel_pos,1) + zero_offset
+                # rel_pos = tf.broadcast_to(rel_pos, (rel_pos.shape[0], self.d_model))
+                rel_pos = tf.cast(rel_pos, dtype=zero_offset.dtype)
+                rel_pos = rel_pos + zero_offset
+                # Check the gather indices stay below the number of rows in `pos_embed`.
+                tf.debugging.assert_less(rel_pos, tf.shape(pos_embed)[0])
+                position_embeds_no_pooling = tf.gather(pos_embed, rel_pos, axis=0)
+
+                position_embeds_list.append([position_embeds_no_pooling, position_embeds_pooling])
+            return position_embeds_list
+
+    def stride_pool_pos(self, pos_id, block_index):
+        """
+        Pool `pos_id` while keeping the cls token separate (if `self.separate_cls=True`).
+
+        Without `separate_cls`, every second position is kept. With it, the first position is replaced by a
+        synthetic cls position and every second one of the remaining positions is kept (the last position is dropped
+        first when `self.truncate_seq` is set).
+        """
+        if self.separate_cls:
+            # Under separate <cls>, we treat the <cls> as the first token in
+            # the previous block of the 1st real block. Since the 1st real
+            # block always has position 1, the position of the previous block
+            # will be at `1 - 2 ** block_index`.
+            cls_pos = tf.constant([-(2**block_index) + 1], dtype=pos_id.dtype)
+            pooled_pos_id = pos_id[1:-1] if self.truncate_seq else pos_id[1:]
+            return tf.concat([cls_pos, pooled_pos_id[::2]], 0)
+        else:
+            return pos_id[::2]
+
+    def relative_pos(self, pos, stride, pooled_pos=None, shift=1):
+        """
+        Return the descending range (step `-stride`) of relative distances between `pooled_pos` and `pos`.
+        """
+        # Without an explicit pooled sequence, distances are taken within `pos` itself.
+        query_pos = pos if pooled_pos is None else pooled_pos
+        first_query = query_pos[0]
+
+        # Largest distance: offset of the two first entries plus `shift * len(query_pos)` strides.
+        upper = (first_query - pos[0]) + (shift * shape_list(query_pos)[0]) * stride
+        # Smallest distance: first query position against the last entry of `pos`.
+        lower = first_query - pos[-1]
+        return tf.range(upper, lower - 1, -stride)
+
+    def stride_pool(self, tensor, axis):
+        """
+        Keep every second entry of `tensor` along `axis` (an int, or a list/tuple of ints applied in order).
+        """
+        if tensor is None:
+            return None
+
+        # Several axes: pool along each of them in turn.
+        if isinstance(axis, (list, tuple)):
+            pooled = tensor
+            for single_axis in axis:
+                pooled = self.stride_pool(pooled, single_axis)
+            return pooled
+
+        # A list/tuple of tensors: pool every member and rebuild the same container type.
+        if isinstance(tensor, (tuple, list)):
+            return type(tensor)(self.stride_pool(member, axis) for member in tensor)
+
+        # Map a negative axis onto its non-negative equivalent.
+        axis = axis % len(shape_list(tensor))
+        leading = [slice(None)] * axis
+        if self.separate_cls:
+            # Duplicate the first entry so that it survives the stride-2 selection below.
+            tensor = tf.concat([tensor[leading + [slice(None, 1)]], tensor], axis)
+        stop = -1 if self.separate_cls and self.truncate_seq else None
+        return tensor[leading + [slice(None, stop, 2)]]
+
+    def pool_tensor(self, tensor, mode="mean", stride=2):
+        """Apply 1D pooling (mode 'mean', 'max' or 'min') to a tensor of size [B x T (x H)]; `None` passes through."""
+        if tensor is None:
+            return None
+
+        # Pool each member when given a list or tuple of tensors (recursing on the member, not the container).
+        if isinstance(tensor, (tuple, list)):
+            return type(tensor)(self.pool_tensor(x, mode=mode, stride=stride) for x in tensor)
+
+        if self.separate_cls:  # duplicate the first position ahead of the (optionally truncated) sequence
+            suffix = tensor[:, :-1] if self.truncate_seq else tensor
+            tensor = tf.concat([tensor[:, :1], suffix], axis=1)
+
+        ndim = len(shape_list(tensor))
+        if ndim == 2:  # the pooling ops below use the "NWC" layout, so add a trailing channel axis
+            tensor = tensor[:, :, None]
+
+        if mode == "mean":
+            tensor = tf.nn.avg_pool1d(tensor, stride, strides=stride, data_format="NWC", padding="SAME")
+        elif mode == "max":
+            tensor = tf.nn.max_pool1d(tensor, stride, strides=stride, data_format="NWC", padding="SAME")
+        elif mode == "min":  # min-pooling written as the negated max-pooling of the negated input
+            tensor = -tf.nn.max_pool1d(-tensor, stride, strides=stride, data_format="NWC", padding="SAME")
+        else:
+            raise NotImplementedError("The supported modes are 'mean', 'max' and 'min'.")
+
+        return tf.squeeze(tensor, 2) if ndim == 2 else tensor
+
+    def pre_attention_pooling(self, output, attention_inputs):
+        """Pool `output` and the proper parts of `attention_inputs` ahead of the attention layer; returns both."""
+        position_embeds, token_type_mat, attention_mask, cls_mask = attention_inputs
+        factorized = self.attention_type == "factorized"
+        if self.pool_q_only:
+            # Only part of the inputs shrinks here; `post_attention_pooling` handles the remaining parts.
+            if factorized:
+                position_embeds = self.stride_pool(position_embeds[:2], 0) + position_embeds[2:]
+            token_type_mat = self.stride_pool(token_type_mat, 1)
+            cls_mask = self.stride_pool(cls_mask, 0)
+        else:
+            self.pooling_mult *= 2
+            if factorized:
+                position_embeds = self.stride_pool(position_embeds, 0)
+            token_type_mat = self.stride_pool(token_type_mat, [1, 2])
+            cls_mask = self.stride_pool(cls_mask, [1, 2])
+            attention_mask = self.pool_tensor(attention_mask, mode="min")
+        pooled_output = self.pool_tensor(output, mode=self.pooling_type)
+        return pooled_output, (position_embeds, token_type_mat, attention_mask, cls_mask)
+
+    def post_attention_pooling(self, attention_inputs):
+        """Pool the remaining parts of `attention_inputs` after the attention layer (acts only if `pool_q_only`)."""
+        position_embeds, token_type_mat, attention_mask, cls_mask = attention_inputs
+        if not self.pool_q_only:
+            # Nothing left to pool: hand the four parts back as a tuple.
+            return (position_embeds, token_type_mat, attention_mask, cls_mask)
+        self.pooling_mult *= 2
+        if self.attention_type == "factorized":
+            position_embeds = position_embeds[:2] + self.stride_pool(position_embeds[2:], 0)
+        pooled_token_types = self.stride_pool(token_type_mat, 2)
+        pooled_cls_mask = self.stride_pool(cls_mask, 1)
+        return (position_embeds, pooled_token_types, self.pool_tensor(attention_mask, mode="min"), pooled_cls_mask)
+
+
+def _relative_shift_gather(positional_attn, context_len, shift):  # realign relative scores via a reshape/slice trick
+    batch_size, n_head, seq_len, max_rel_len = shape_list(positional_attn)
+    # max_rel_len = 2 * context_len + shift - 1 is the number of possible relative positions i-j
+
+    # What follows is equivalent to the gather below (PyTorch notation), which may read more clearly but is less efficient.
+    # idxs = context_len + torch.arange(0, context_len).unsqueeze(0) - torch.arange(0, seq_len).unsqueeze(1)
+    # # matrix of context_len + i-j
+    # return positional_attn.gather(3, idxs.expand([batch_size, n_head, context_len, context_len]))
+
+    positional_attn = tf.reshape(positional_attn, [batch_size, n_head, max_rel_len, seq_len])
+    positional_attn = positional_attn[:, :, shift:, :]  # drop the first `shift` rows of the reshaped view
+    positional_attn = tf.reshape(positional_attn, [batch_size, n_head, seq_len, max_rel_len - shift])
+    positional_attn = positional_attn[..., :context_len]  # keep `context_len` entries along the last axis
+    return positional_attn
+
+
+class TFFunnelRelMultiheadAttention(keras.layers.Layer):  # multi-head attention with relative position/token-type terms
+    def __init__(self, config, block_index, **kwargs):
+        super().__init__(**kwargs)
+        self.attention_type = config.attention_type
+        self.n_head = n_head = config.n_head
+        self.d_head = d_head = config.d_head
+        self.d_model = d_model = config.d_model
+        self.initializer_range = config.initializer_range
+        self.block_index = block_index
+
+        self.hidden_dropout = keras.layers.Dropout(config.hidden_dropout)
+        self.attention_dropout = keras.layers.Dropout(config.attention_dropout)
+
+        initializer = get_initializer(config.initializer_range)
+
+        self.q_head = keras.layers.Dense(
+            n_head * d_head, use_bias=False, kernel_initializer=initializer, name="q_head"
+        )
+        self.k_head = keras.layers.Dense(n_head * d_head, kernel_initializer=initializer, name="k_head")
+        self.v_head = keras.layers.Dense(n_head * d_head, kernel_initializer=initializer, name="v_head")
+
+        self.post_proj = keras.layers.Dense(d_model, kernel_initializer=initializer, name="post_proj")
+        self.layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm")
+        self.scale = 1.0 / (d_head**0.5)
+
+    def build(self, input_shape=None):
+        if self.built:  # guard first, so a repeated call cannot register the weights below a second time
+            return
+        self.built = True
+        n_head, d_head, d_model = self.n_head, self.d_head, self.d_model
+        initializer = get_initializer(self.initializer_range)
+
+        self.r_w_bias = self.add_weight(
+            shape=(n_head, d_head), initializer=initializer, trainable=True, name="r_w_bias"
+        )
+        self.r_r_bias = self.add_weight(
+            shape=(n_head, d_head), initializer=initializer, trainable=True, name="r_r_bias"
+        )
+        self.r_kernel = self.add_weight(
+            shape=(d_model, n_head, d_head), initializer=initializer, trainable=True, name="r_kernel"
+        )
+        self.r_s_bias = self.add_weight(
+            shape=(n_head, d_head), initializer=initializer, trainable=True, name="r_s_bias"
+        )
+        self.seg_embed = self.add_weight(
+            shape=(2, n_head, d_head), initializer=initializer, trainable=True, name="seg_embed"
+        )
+
+        if getattr(self, "q_head", None) is not None:
+            with tf.name_scope(self.q_head.name):
+                self.q_head.build([None, None, d_model])
+        if getattr(self, "k_head", None) is not None:
+            with tf.name_scope(self.k_head.name):
+                self.k_head.build([None, None, d_model])
+        if getattr(self, "v_head", None) is not None:
+            with tf.name_scope(self.v_head.name):
+                self.v_head.build([None, None, d_model])
+        if getattr(self, "post_proj", None) is not None:
+            with tf.name_scope(self.post_proj.name):
+                self.post_proj.build([None, None, n_head * d_head])
+        if getattr(self, "layer_norm", None) is not None:
+            with tf.name_scope(self.layer_norm.name):
+                self.layer_norm.build([None, None, d_model])
+
+    def relative_positional_attention(self, position_embeds, q_head, context_len, cls_mask=None):
+        """Relative attention score for the positional encodings (multiplied by `cls_mask` when one is given)."""
+        # q_head has shape batch_size x seq_len x n_head x d_head
+        if self.attention_type == "factorized":
+            # Notations from the paper, appendix A.2.2, final formula (https://huggingface.co/papers/2006.03236)
+            # phi and pi have shape seq_len x d_model, psi and omega have shape context_len x d_model
+            phi, pi, psi, omega = position_embeds
+            # Shape n_head x d_head
+            u = self.r_r_bias * self.scale
+            # Shape d_model x n_head x d_head
+            w_r = self.r_kernel
+
+            # Shape batch_size x seq_len x n_head x d_model
+            q_r_attention = tf.einsum("binh,dnh->bind", q_head + u, w_r)
+            q_r_attention_1 = q_r_attention * phi[:, None]
+            q_r_attention_2 = q_r_attention * pi[:, None]
+
+            # Shape batch_size x n_head x seq_len x context_len
+            positional_attn = tf.einsum("bind,jd->bnij", q_r_attention_1, psi) + tf.einsum(
+                "bind,jd->bnij", q_r_attention_2, omega
+            )
+        else:
+            # Notations from the paper, appendix A.2.1, final formula (https://huggingface.co/papers/2006.03236)
+            # Grab the proper positional encoding, shape max_rel_len x d_model
+            if shape_list(q_head)[1] != context_len:
+                shift = 2
+                r = position_embeds[self.block_index][1]
+            else:
+                shift = 1
+                r = position_embeds[self.block_index][0]
+            # Shape n_head x d_head
+            v = self.r_r_bias * self.scale
+            # Shape d_model x n_head x d_head
+            w_r = self.r_kernel
+
+            # Shape max_rel_len x n_head x d_head
+            r_head = tf.einsum("td,dnh->tnh", r, w_r)
+            # Shape batch_size x n_head x seq_len x max_rel_len
+            positional_attn = tf.einsum("binh,tnh->bnit", q_head + v, r_head)
+            # Shape batch_size x n_head x seq_len x context_len
+            positional_attn = _relative_shift_gather(positional_attn, context_len, shift)
+
+        if cls_mask is not None:
+            positional_attn *= cls_mask
+        return positional_attn
+
+    def relative_token_type_attention(self, token_type_mat, q_head, cls_mask=None):
+        """Relative attention score for the token_type_ids; returns the scalar 0 when `token_type_mat` is None."""
+        if token_type_mat is None:
+            return 0
+        batch_size, seq_len, context_len = shape_list(token_type_mat)
+        # q_head has shape batch_size x seq_len x n_head x d_head
+        # Shape n_head x d_head
+        r_s_bias = self.r_s_bias * self.scale
+
+        # Shape batch_size x n_head x seq_len x 2
+        token_type_bias = tf.einsum("bind,snd->bnis", q_head + r_s_bias, self.seg_embed)
+        # Shape batch_size x n_head x seq_len x context_len
+        token_type_mat = tf.tile(token_type_mat[:, None], [1, shape_list(q_head)[2], 1, 1])
+        # token_type_mat = tf.broadcast_to(token_type_mat[:, None], new_shape)
+        # Two tensors of shape batch_size x n_head x seq_len x 1 (the last axis of size 2 is split in halves)
+        diff_token_type, same_token_type = tf.split(token_type_bias, 2, axis=-1)
+        # Shape batch_size x n_head x seq_len x context_len
+        token_type_attn = tf.where(
+            token_type_mat,
+            tf.tile(same_token_type, [1, 1, 1, context_len]),
+            tf.tile(diff_token_type, [1, 1, 1, context_len]),
+        )
+
+        if cls_mask is not None:
+            token_type_attn *= cls_mask
+        return token_type_attn
+
+    def call(self, query, key, value, attention_inputs, output_attentions=False, training=False):
+        # query has shape batch_size x seq_len x d_model
+        # key and value have shapes batch_size x context_len x d_model
+        position_embeds, token_type_mat, attention_mask, cls_mask = attention_inputs
+
+        batch_size, seq_len, _ = shape_list(query)
+        context_len = shape_list(key)[1]
+        n_head, d_head = self.n_head, self.d_head
+
+        # Shape batch_size x seq_len x n_head x d_head
+        q_head = tf.reshape(self.q_head(query), [batch_size, seq_len, n_head, d_head])
+        # Shapes batch_size x context_len x n_head x d_head
+        k_head = tf.reshape(self.k_head(key), [batch_size, context_len, n_head, d_head])
+        v_head = tf.reshape(self.v_head(value), [batch_size, context_len, n_head, d_head])
+
+        q_head = q_head * self.scale
+        # Shape n_head x d_head
+        r_w_bias = self.r_w_bias * self.scale
+        # Shapes batch_size x n_head x seq_len x context_len
+        content_score = tf.einsum("bind,bjnd->bnij", q_head + r_w_bias, k_head)
+        positional_attn = self.relative_positional_attention(position_embeds, q_head, context_len, cls_mask)
+        token_type_attn = self.relative_token_type_attention(token_type_mat, q_head, cls_mask)
+
+        # merge attention scores
+        attn_score = content_score + positional_attn + token_type_attn
+
+        # perform masking: positions whose mask value is 0 get INF subtracted from their score
+        if attention_mask is not None:
+            attention_mask = tf.cast(attention_mask, dtype=attn_score.dtype)
+            attn_score = attn_score - (INF * (1 - attention_mask[:, None, None]))
+
+        # attention probability
+        attn_prob = stable_softmax(attn_score, axis=-1)
+        attn_prob = self.attention_dropout(attn_prob, training=training)
+
+        # attention output, shape batch_size x seq_len x n_head x d_head
+        attn_vec = tf.einsum("bnij,bjnd->bind", attn_prob, v_head)
+
+        # Shape batch_size x seq_len x d_model
+        attn_out = self.post_proj(tf.reshape(attn_vec, [batch_size, seq_len, n_head * d_head]))
+        attn_out = self.hidden_dropout(attn_out, training=training)
+
+        output = self.layer_norm(query + attn_out)  # residual connection around the attention, then layer norm
+        return (output, attn_prob) if output_attentions else (output,)
+
+
+class TFFunnelPositionwiseFFN(keras.layers.Layer):  # two dense layers with a residual connection and layer norm
+    def __init__(self, config, **kwargs):
+        super().__init__(**kwargs)
+        kernel_init = get_initializer(config.initializer_range)
+        self.linear_1 = keras.layers.Dense(config.d_inner, kernel_initializer=kernel_init, name="linear_1")
+        self.activation_function = get_tf_activation(config.hidden_act)
+        self.activation_dropout = keras.layers.Dropout(config.activation_dropout)
+        self.linear_2 = keras.layers.Dense(config.d_model, kernel_initializer=kernel_init, name="linear_2")
+        self.dropout = keras.layers.Dropout(config.hidden_dropout)
+        self.layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm")
+        self.config = config
+
+    def call(self, hidden, training=False):
+        """Apply `linear_1`, the activation and `linear_2` (with dropouts), then add `hidden` and normalize."""
+        expanded = self.activation_function(self.linear_1(hidden))
+        expanded = self.activation_dropout(expanded, training=training)
+        projected = self.linear_2(expanded)
+        projected = self.dropout(projected, training=training)
+        return self.layer_norm(hidden + projected)
+
+    def build(self, input_shape=None):
+        """Build every sub-layer once, under its own name scope; pairs below are (attribute, last input dim)."""
+        if self.built:
+            return
+        self.built = True
+        cfg = self.config
+        specs = (("linear_1", cfg.d_model), ("linear_2", cfg.d_inner), ("layer_norm", cfg.d_model))
+        for attr_name, width in specs:
+            sublayer = getattr(self, attr_name, None)
+            if sublayer is None:
+                continue
+            with tf.name_scope(sublayer.name):
+                sublayer.build([None, None, width])
+
+
+class TFFunnelLayer(keras.layers.Layer):  # one attention sub-layer followed by one feed-forward sub-layer
+    def __init__(self, config, block_index, **kwargs):
+        super().__init__(**kwargs)
+        self.attention = TFFunnelRelMultiheadAttention(config, block_index, name="attention")
+        self.ffn = TFFunnelPositionwiseFFN(config, name="ffn")
+
+    def call(self, query, key, value, attention_inputs, output_attentions=False, training=False):
+        """Run attention, then the FFN; returns `(output,)` or `(output, attention_probs)`."""
+        attention_kwargs = {"output_attentions": output_attentions, "training": training}
+        attention_outputs = self.attention(query, key, value, attention_inputs, **attention_kwargs)
+        layer_output = self.ffn(attention_outputs[0], training=training)
+        return (layer_output, attention_outputs[1]) if output_attentions else (layer_output,)
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        # Each sub-layer is built under its own name scope.
+        for sublayer in (getattr(self, "attention", None), getattr(self, "ffn", None)):
+            if sublayer is None:
+                continue
+            with tf.name_scope(sublayer.name):
+                sublayer.build(None)
+
+
+class TFFunnelEncoder(keras.layers.Layer):  # stack of layer blocks; pooling may happen on entering blocks after the first
+    def __init__(self, config, **kwargs):
+        super().__init__(**kwargs)
+        self.separate_cls = config.separate_cls
+        self.pool_q_only = config.pool_q_only
+        self.block_repeats = config.block_repeats
+        self.attention_structure = TFFunnelAttentionStructure(config)
+        self.blocks = [  # blocks[b][i] is the i-th layer of block b; names encode both indices
+            [TFFunnelLayer(config, block_index, name=f"blocks_._{block_index}_._{i}") for i in range(block_size)]
+            for block_index, block_size in enumerate(config.block_sizes)
+        ]
+
+    def call(
+        self,
+        inputs_embeds,
+        attention_mask=None,
+        token_type_ids=None,
+        output_attentions=False,
+        output_hidden_states=False,
+        return_dict=True,
+        training=False,
+    ):
+        # NOTE(review): the mask cast below is disabled, so the mask is forwarded as received — confirm that
+        # `init_attention_inputs` handles the dtype. Disabled line: attention_mask = tf.cast(attention_mask, inputs_embeds.dtype)
+        attention_inputs = self.attention_structure.init_attention_inputs(
+            inputs_embeds,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            training=training,
+        )
+        hidden = inputs_embeds
+
+        all_hidden_states = (inputs_embeds,) if output_hidden_states else None
+        all_attentions = () if output_attentions else None
+
+        for block_index, block in enumerate(self.blocks):
+            pooling_flag = shape_list(hidden)[1] > (2 if self.separate_cls else 1)  # sequence still long enough to pool
+            pooling_flag = pooling_flag and block_index > 0  # the first block is never pooled
+            pooled_hidden = tf.zeros(shape_list(hidden))  # placeholder, replaced just below when pooling applies
+
+            if pooling_flag:
+                pooled_hidden, attention_inputs = self.attention_structure.pre_attention_pooling(
+                    hidden, attention_inputs
+                )
+
+            for layer_index, layer in enumerate(block):
+                for repeat_index in range(self.block_repeats[block_index]):
+                    do_pooling = (repeat_index == 0) and (layer_index == 0) and pooling_flag  # first layer call only
+                    if do_pooling:
+                        query = pooled_hidden
+                        key = value = hidden if self.pool_q_only else pooled_hidden
+                    else:
+                        query = key = value = hidden
+                    layer_output = layer(
+                        query, key, value, attention_inputs, output_attentions=output_attentions, training=training
+                    )
+                    hidden = layer_output[0]
+                    if do_pooling:
+                        attention_inputs = self.attention_structure.post_attention_pooling(attention_inputs)
+
+                    if output_attentions:
+                        all_attentions = all_attentions + layer_output[1:]
+                    if output_hidden_states:
+                        all_hidden_states = all_hidden_states + (hidden,)
+
+        if not return_dict:  # plain tuple holding only the entries that were requested
+            return tuple(v for v in [hidden, all_hidden_states, all_attentions] if v is not None)
+        return TFBaseModelOutput(last_hidden_state=hidden, hidden_states=all_hidden_states, attentions=all_attentions)
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        for block in self.blocks:
+            for layer in block:
+                with tf.name_scope(layer.name):  # build each layer under its own name scope
+                    layer.build(None)
+
+
+def upsample(x, stride, target_len, separate_cls=True, truncate_seq=False):
+    """
+    Stretch `x` along axis 1 to `target_len` by repeating tokens `stride` times (first token kept single if `separate_cls`).
+    """
+    if stride == 1:
+        return x
+    if not separate_cls:
+        return tf.repeat(x, repeats=stride, axis=1)[:, :target_len]
+
+    # The first token is set aside and re-attached once, unrepeated.
+    first_token, remainder = x[:, :1], x[:, 1:]
+    repeated = tf.repeat(remainder, repeats=stride, axis=1)
+    if truncate_seq:
+        # Append `stride - 1` zero positions at the end of the sequence axis.
+        repeated = tf.pad(repeated, [[0, 0], [0, stride - 1], [0, 0]])
+    # One slot of `target_len` is reserved for the first token.
+    repeated = repeated[:, : target_len - 1]
+    return tf.concat([first_token, repeated], axis=1)
+
+
+class TFFunnelDecoder(keras.layers.Layer):  # upsamples the final hidden state and refines it with full-length layers
+    def __init__(self, config, **kwargs):
+        super().__init__(**kwargs)
+        self.separate_cls = config.separate_cls
+        self.truncate_seq = config.truncate_seq
+        self.stride = 2 ** (len(config.block_sizes) - 1)  # one factor of 2 per block after the first
+        self.attention_structure = TFFunnelAttentionStructure(config)
+        self.layers = [TFFunnelLayer(config, 0, name=f"layers_._{i}") for i in range(config.num_decoder_layers)]
+
+    def call(
+        self,
+        final_hidden,
+        first_block_hidden,
+        attention_mask=None,
+        token_type_ids=None,
+        output_attentions=False,
+        output_hidden_states=False,
+        return_dict=True,
+        training=False,
+    ):
+        upsampled_hidden = upsample(  # bring `final_hidden` back to the length of `first_block_hidden`
+            final_hidden,
+            stride=self.stride,
+            target_len=shape_list(first_block_hidden)[1],
+            separate_cls=self.separate_cls,
+            truncate_seq=self.truncate_seq,
+        )
+
+        hidden = upsampled_hidden + first_block_hidden  # element-wise sum of the two inputs
+        all_hidden_states = (hidden,) if output_hidden_states else None
+        all_attentions = () if output_attentions else None
+
+        attention_inputs = self.attention_structure.init_attention_inputs(
+            hidden,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            training=training,
+        )
+
+        for layer in self.layers:
+            layer_output = layer(  # plain self-attention: query, key and value are all `hidden`
+                hidden, hidden, hidden, attention_inputs, output_attentions=output_attentions, training=training
+            )
+            hidden = layer_output[0]
+
+            if output_attentions:
+                all_attentions = all_attentions + layer_output[1:]
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden,)
+
+        if not return_dict:  # plain tuple holding only the entries that were requested
+            return tuple(v for v in [hidden, all_hidden_states, all_attentions] if v is not None)
+        return TFBaseModelOutput(last_hidden_state=hidden, hidden_states=all_hidden_states, attentions=all_attentions)
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "layers", None) is not None:
+            for layer in self.layers:
+                with tf.name_scope(layer.name):  # build each layer under its own name scope
+                    layer.build(None)
+
+
+@keras_serializable
+class TFFunnelBaseLayer(keras.layers.Layer):
+    """Base model without decoder: embeddings followed by the encoder, whose outputs are returned as they are."""
+
+    config_class = FunnelConfig
+
+    def __init__(self, config, **kwargs):
+        super().__init__(**kwargs)
+
+        self.config = config
+        self.output_attentions = config.output_attentions
+        self.output_hidden_states = config.output_hidden_states
+        self.return_dict = config.use_return_dict
+
+        self.embeddings = TFFunnelEmbeddings(config, name="embeddings")
+        self.encoder = TFFunnelEncoder(config, name="encoder")
+
+    def get_input_embeddings(self):
+        return self.embeddings
+
+    def set_input_embeddings(self, value):
+        self.embeddings.weight = value
+        self.embeddings.vocab_size = shape_list(value)[0]  # keep the recorded vocabulary size in sync with the new weight
+
+    def _prune_heads(self, heads_to_prune):
+        raise NotImplementedError  # Not implemented yet in the library for TF 2.0 models
+
+    @unpack_inputs
+    def call(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        token_type_ids=None,
+        inputs_embeds=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+        training=False,
+    ):
+        if input_ids is not None and inputs_embeds is not None:  # exactly one of the two inputs must be provided
+            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+        elif input_ids is not None:
+            input_shape = shape_list(input_ids)
+        elif inputs_embeds is not None:
+            input_shape = shape_list(inputs_embeds)[:-1]  # drop the trailing embedding dimension
+        else:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+        if attention_mask is None:
+            attention_mask = tf.fill(input_shape, 1)  # default: mask of ones over the whole input
+
+        if token_type_ids is None:
+            token_type_ids = tf.fill(input_shape, 0)  # default: every position gets token type 0
+
+        if inputs_embeds is None:
+            inputs_embeds = self.embeddings(input_ids, training=training)
+
+        encoder_outputs = self.encoder(
+            inputs_embeds,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+
+        return encoder_outputs
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "embeddings", None) is not None:
+            with tf.name_scope(self.embeddings.name):
+                self.embeddings.build(None)
+        if getattr(self, "encoder", None) is not None:
+            with tf.name_scope(self.encoder.name):
+                self.encoder.build(None)
+
+
+@keras_serializable
+class TFFunnelMainLayer(keras.layers.Layer):
+    """Base model with decoder: embeddings, encoder, then a decoder fed with the encoder's final and early states."""
+
+    config_class = FunnelConfig
+
+    def __init__(self, config, **kwargs):
+        super().__init__(**kwargs)
+
+        self.config = config
+        self.block_sizes = config.block_sizes
+        self.output_attentions = config.output_attentions
+        self.output_hidden_states = config.output_hidden_states
+        self.return_dict = config.use_return_dict
+
+        self.embeddings = TFFunnelEmbeddings(config, name="embeddings")
+        self.encoder = TFFunnelEncoder(config, name="encoder")
+        self.decoder = TFFunnelDecoder(config, name="decoder")
+
+    def get_input_embeddings(self):
+        return self.embeddings
+
+    def set_input_embeddings(self, value):
+        self.embeddings.weight = value
+        self.embeddings.vocab_size = shape_list(value)[0]  # keep the recorded vocabulary size in sync with the new weight
+
+    def _prune_heads(self, heads_to_prune):
+        raise NotImplementedError  # Not implemented yet in the library for TF 2.0 models
+
+    @unpack_inputs
+    def call(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        token_type_ids=None,
+        inputs_embeds=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+        training=False,
+    ):
+        if input_ids is not None and inputs_embeds is not None:  # exactly one of the two inputs must be provided
+            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+        elif input_ids is not None:
+            input_shape = shape_list(input_ids)
+        elif inputs_embeds is not None:
+            input_shape = shape_list(inputs_embeds)[:-1]  # drop the trailing embedding dimension
+        else:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+        if attention_mask is None:
+            attention_mask = tf.fill(input_shape, 1)  # default: mask of ones over the whole input
+
+        if token_type_ids is None:
+            token_type_ids = tf.fill(input_shape, 0)  # default: every position gets token type 0
+
+        if inputs_embeds is None:
+            inputs_embeds = self.embeddings(input_ids, training=training)
+
+        encoder_outputs = self.encoder(
+            inputs_embeds,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            output_attentions=output_attentions,
+            output_hidden_states=True,  # always requested: the decoder input below is read from the hidden states
+            return_dict=return_dict,
+            training=training,
+        )
+
+        decoder_outputs = self.decoder(
+            final_hidden=encoder_outputs[0],
+            first_block_hidden=encoder_outputs[1][self.block_sizes[0]],  # state after `block_sizes[0]` layer calls
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+
+        if not return_dict:
+            idx = 0  # position of the next optional entry inside the decoder's output tuple
+            outputs = (decoder_outputs[0],)
+            if output_hidden_states:
+                idx += 1
+                outputs = outputs + (encoder_outputs[1] + decoder_outputs[idx],)  # encoder states, then decoder states
+            if output_attentions:
+                idx += 1
+                outputs = outputs + (encoder_outputs[2] + decoder_outputs[idx],)  # encoder attentions, then decoder's
+            return outputs
+
+        return TFBaseModelOutput(
+            last_hidden_state=decoder_outputs[0],
+            hidden_states=(encoder_outputs.hidden_states + decoder_outputs.hidden_states)
+            if output_hidden_states
+            else None,
+            attentions=(encoder_outputs.attentions + decoder_outputs.attentions) if output_attentions else None,
+        )
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "embeddings", None) is not None:
+            with tf.name_scope(self.embeddings.name):
+                self.embeddings.build(None)
+        if getattr(self, "encoder", None) is not None:
+            with tf.name_scope(self.encoder.name):
+                self.encoder.build(None)
+        if getattr(self, "decoder", None) is not None:
+            with tf.name_scope(self.decoder.name):
+                self.decoder.build(None)
+
+
+class TFFunnelDiscriminatorPredictions(keras.layers.Layer):
+    """Prediction module for the discriminator: dense + activation, then a 1-unit dense whose last axis is dropped."""
+
+    def __init__(self, config, **kwargs):
+        super().__init__(**kwargs)
+        initializer = get_initializer(config.initializer_range)
+        self.dense = keras.layers.Dense(config.d_model, kernel_initializer=initializer, name="dense")
+        self.activation_function = get_tf_activation(config.hidden_act)
+        self.dense_prediction = keras.layers.Dense(1, kernel_initializer=initializer, name="dense_prediction")
+        self.config = config
+
+    def call(self, discriminator_hidden_states):
+        hidden_states = self.dense(discriminator_hidden_states)
+        hidden_states = self.activation_function(hidden_states)
+        # Drop only the trailing size-1 axis; a bare squeeze would also remove a batch or sequence axis of length 1.
+        return tf.squeeze(self.dense_prediction(hidden_states), axis=-1)
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "dense", None) is not None:
+            with tf.name_scope(self.dense.name):
+                self.dense.build([None, None, self.config.d_model])
+        if getattr(self, "dense_prediction", None) is not None:
+            with tf.name_scope(self.dense_prediction.name):
+                self.dense_prediction.build([None, None, self.config.d_model])
+
+
+class TFFunnelMaskedLMHead(keras.layers.Layer):
+    """Masked-LM head: scores hidden states against the shared input-embedding matrix and adds a bias."""
+
+    def __init__(self, config, input_embeddings, **kwargs):
+        super().__init__(**kwargs)
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.input_embeddings = input_embeddings
+
+    def build(self, input_shape):
+        self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias")
+        super().build(input_shape)
+
+    def get_output_embeddings(self):
+        return self.input_embeddings
+
+    def set_output_embeddings(self, value):
+        embeddings = self.input_embeddings
+        embeddings.weight = value
+        embeddings.vocab_size = shape_list(value)[0]
+
+    def get_bias(self):
+        return {"bias": self.bias}
+
+    def set_bias(self, value):
+        self.bias = value["bias"]
+        self.config.vocab_size = shape_list(value["bias"])[0]
+
+    def call(self, hidden_states, training=False):
+        seq_len = shape_list(tensor=hidden_states)[1]
+        flat_states = tf.reshape(tensor=hidden_states, shape=[-1, self.hidden_size])
+        flat_scores = tf.matmul(a=flat_states, b=self.input_embeddings.weight, transpose_b=True)
+        scores = tf.reshape(tensor=flat_scores, shape=[-1, seq_len, self.config.vocab_size])
+        return tf.nn.bias_add(value=scores, bias=self.bias)
+
+
+class TFFunnelClassificationHead(keras.layers.Layer):
+    """Classification head: dense -> tanh -> dropout -> dense projection to `n_labels` outputs."""
+
+    def __init__(self, config, n_labels, **kwargs):
+        super().__init__(**kwargs)
+        kernel_init = get_initializer(config.initializer_range)
+        self.linear_hidden = keras.layers.Dense(config.d_model, kernel_initializer=kernel_init, name="linear_hidden")
+        self.dropout = keras.layers.Dropout(config.hidden_dropout)
+        self.linear_out = keras.layers.Dense(n_labels, kernel_initializer=kernel_init, name="linear_out")
+        self.config = config
+
+    def call(self, hidden, training=False):
+        activated = keras.activations.tanh(self.linear_hidden(hidden))
+        return self.linear_out(self.dropout(activated, training=training))
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        # Both dense layers are built for inputs whose last dimension is `config.d_model`.
+        for attr_name in ("linear_hidden", "linear_out"):
+            sublayer = getattr(self, attr_name, None)
+            if sublayer is not None:
+                with tf.name_scope(sublayer.name):
+                    sublayer.build([None, None, self.config.d_model])
+
+
+class TFFunnelPreTrainedModel(TFPreTrainedModel):
+    """
+    Common base of the Funnel TF models: pins `FunnelConfig` as config class and `"funnel"` as base-model prefix, and
+    overrides the dummy inputs. Everything else is inherited from `TFPreTrainedModel`.
+    """
+
+    config_class = FunnelConfig
+    base_model_prefix = "funnel"
+
+    @property
+    def dummy_inputs(self):
+        # Funnel misbehaves with very small inputs, so the override supplies a slightly larger (1, 3) batch
+        return {"input_ids": tf.ones(shape=(1, 3), dtype=tf.int32)}
+
+
+@dataclass
+class TFFunnelForPreTrainingOutput(ModelOutput):
+    """
+    Output type of [`TFFunnelForPreTraining`].
+
+    Args:
+        logits (`tf.Tensor` of shape `(batch_size, sequence_length)`):
+            Prediction scores of the head (scores for each token before SoftMax).
+        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
+            `(batch_size, sequence_length, hidden_size)`.
+
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`.
+
+            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
+            heads.
+    """
+
+    logits: tf.Tensor | None = None
+    hidden_states: tuple[tf.Tensor] | None = None
+    attentions: tuple[tf.Tensor] | None = None
+
+
+FUNNEL_START_DOCSTRING = r"""
+
+    The Funnel Transformer model was proposed in [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient
+    Language Processing](https://huggingface.co/papers/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.
+
+    This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.)
+
+    This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
+    as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matters related to general usage and
+    behavior.
+
+    <Tip>
+
+    TensorFlow models and layers in `transformers` accept two formats as input:
+
+    - having all inputs as keyword arguments (like PyTorch models), or
+    - having all inputs as a list, tuple or dict in the first positional argument.
+
+    The reason the second format is supported is that Keras methods prefer this format when passing inputs to models
+    and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just
+    pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second
+    format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with
+    the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first
+    positional argument:
+
+    - a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
+    - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
+    `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
+    - a dictionary with one or several input Tensors associated to the input names given in the docstring:
+    `model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
+
+    Note that when creating models and layers with
+    [subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry
+    about any of this, as you can just pass inputs like you would to any other Python function!
+
+    </Tip>
+
+    Parameters:
+        config ([`FunnelConfig`]): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+FUNNEL_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`Numpy array` or `tf.Tensor` of shape `({0})`):
+            Indices of input sequence tokens in the vocabulary.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and
+            [`PreTrainedTokenizer.encode`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        attention_mask (`Numpy array` or `tf.Tensor` of shape `({0})`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
+        token_type_ids (`Numpy array` or `tf.Tensor` of shape `({0})`, *optional*):
+            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
+            1]`:
+
+            - 0 corresponds to a *sentence A* token,
+            - 1 corresponds to a *sentence B* token.
+
+            [What are token type IDs?](../glossary#token-type-ids)
+        inputs_embeds (`tf.Tensor` of shape `({0}, hidden_size)`, *optional*):
+            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+            model's internal embedding lookup matrix.
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the
+            config will be used instead.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
+            used instead.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in
+            eager mode, in graph mode the value will always be set to True.
+        training (`bool`, *optional*, defaults to `False`):
+            Whether or not to use the model in training mode (some modules like dropout modules have different
+            behaviors between training and evaluation).
+"""
+
+
+@add_start_docstrings(
+    """
+    The base Funnel Transformer Model transformer outputting raw hidden-states without upsampling head (also called
+    decoder) or any task-specific head on top.
+    """,
+    FUNNEL_START_DOCSTRING,
+)
+class TFFunnelBaseModel(TFFunnelPreTrainedModel):
+    def __init__(self, config: FunnelConfig, *inputs, **kwargs) -> None:
+        super().__init__(config, *inputs, **kwargs)
+        self.funnel = TFFunnelBaseLayer(config, name="funnel")
+
+    @add_start_docstrings_to_model_forward(FUNNEL_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_code_sample_docstrings(
+        checkpoint="funnel-transformer/small-base",
+        output_type=TFBaseModelOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    @unpack_inputs
+    def call(
+        self,
+        input_ids: TFModelInputType | None = None,
+        attention_mask: np.ndarray | tf.Tensor | None = None,
+        token_type_ids: np.ndarray | tf.Tensor | None = None,
+        inputs_embeds: np.ndarray | tf.Tensor | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        training: bool = False,
+    ) -> tuple[tf.Tensor] | TFBaseModelOutput:
+        base_outputs = self.funnel(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+        return base_outputs
+
+    def serving_output(self, output):
+        """Repackage `output`; hidden states/attentions are not tensor-converted since their dimensions all differ."""
+        return TFBaseModelOutput(
+            last_hidden_state=output.last_hidden_state,
+            hidden_states=output.hidden_states,
+            attentions=output.attentions,
+        )
+
+    def build(self, input_shape=None):
+        """Build the wrapped `funnel` layer once, under its own name scope (`input_shape` is unused)."""
+        if not self.built:
+            self.built = True
+            if getattr(self, "funnel", None) is not None:
+                with tf.name_scope(self.funnel.name):
+                    self.funnel.build(None)
+
+
+@add_start_docstrings(
+    "The bare Funnel Transformer Model transformer outputting raw hidden-states without any specific head on top.",
+    FUNNEL_START_DOCSTRING,
+)
+class TFFunnelModel(TFFunnelPreTrainedModel):
+    def __init__(self, config: FunnelConfig, *inputs, **kwargs) -> None:
+        super().__init__(config, *inputs, **kwargs)
+        self.funnel = TFFunnelMainLayer(config, name="funnel")
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(FUNNEL_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_code_sample_docstrings(
+        checkpoint="funnel-transformer/small",
+        output_type=TFBaseModelOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def call(
+        self,
+        input_ids: TFModelInputType | None = None,
+        attention_mask: np.ndarray | tf.Tensor | None = None,
+        token_type_ids: np.ndarray | tf.Tensor | None = None,
+        inputs_embeds: np.ndarray | tf.Tensor | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        training: bool = False,
+    ) -> tuple[tf.Tensor] | TFBaseModelOutput:
+        model_outputs = self.funnel(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+        return model_outputs
+
+    def serving_output(self, output):
+        """Repackage `output`; hidden states/attentions are not tensor-converted since their dimensions all differ."""
+        return TFBaseModelOutput(
+            last_hidden_state=output.last_hidden_state,
+            hidden_states=output.hidden_states,
+            attentions=output.attentions,
+        )
+
+    def build(self, input_shape=None):
+        """Build the wrapped `funnel` layer once, under its own name scope (`input_shape` is unused)."""
+        if not self.built:
+            self.built = True
+            if getattr(self, "funnel", None) is not None:
+                with tf.name_scope(self.funnel.name):
+                    self.funnel.build(None)
+
+
+@add_start_docstrings(
+    """
+    Funnel model with a binary classification head on top as used during pretraining for identifying generated tokens.
+    """,
+    FUNNEL_START_DOCSTRING,
+)
+class TFFunnelForPreTraining(TFFunnelPreTrainedModel):
+    def __init__(self, config: FunnelConfig, **kwargs) -> None:
+        super().__init__(config, **kwargs)
+
+        self.funnel = TFFunnelMainLayer(config, name="funnel")
+        self.discriminator_predictions = TFFunnelDiscriminatorPredictions(config, name="discriminator_predictions")
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(FUNNEL_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @replace_return_docstrings(output_type=TFFunnelForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
+    def call(
+        self,
+        input_ids: TFModelInputType | None = None,
+        attention_mask: np.ndarray | tf.Tensor | None = None,
+        token_type_ids: np.ndarray | tf.Tensor | None = None,
+        inputs_embeds: np.ndarray | tf.Tensor | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        training: bool = False,
+        **kwargs,
+    ) -> tuple[tf.Tensor] | TFFunnelForPreTrainingOutput:
+        r"""
+        Returns:
+
+        Examples:
+
+        ```python
+        >>> from transformers import AutoTokenizer, TFFunnelForPreTraining
+
+        >>> tokenizer = AutoTokenizer.from_pretrained("funnel-transformer/small")
+        >>> model = TFFunnelForPreTraining.from_pretrained("funnel-transformer/small")
+
+        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="tf")
+        >>> logits = model(inputs).logits
+        ```"""
+        # `**kwargs` is accepted by the signature but is not forwarded to the main layer.
+        discriminator_hidden_states = self.funnel(
+            input_ids,
+            attention_mask,
+            token_type_ids,
+            inputs_embeds,
+            output_attentions,
+            output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+        # Element 0 is the sequence output that the discriminator head scores.
+        discriminator_sequence_output = discriminator_hidden_states[0]
+        logits = self.discriminator_predictions(discriminator_sequence_output)
+
+        if not return_dict:
+            # Tuple form: the logits replace the sequence output, everything after it is passed through.
+            return (logits,) + discriminator_hidden_states[1:]
+
+        return TFFunnelForPreTrainingOutput(
+            logits=logits,
+            hidden_states=discriminator_hidden_states.hidden_states,
+            attentions=discriminator_hidden_states.attentions,
+        )
+
+    def serving_output(self, output):
+        # hidden_states and attentions not converted to Tensor with tf.convert_to_tensor as they are all of
+        # different dimensions
+        return TFFunnelForPreTrainingOutput(
+            logits=output.logits, hidden_states=output.hidden_states, attentions=output.attentions
+        )
+
+    def build(self, input_shape=None):
+        """Build the main layer and the discriminator head once, each under its own name scope."""
+        if self.built:
+            return
+        self.built = True
+        # `input_shape` is not used: each sub-layer is built with `None`.
+        if getattr(self, "funnel", None) is not None:
+            with tf.name_scope(self.funnel.name):
+                self.funnel.build(None)
+        if getattr(self, "discriminator_predictions", None) is not None:
+            with tf.name_scope(self.discriminator_predictions.name):
+                self.discriminator_predictions.build(None)
+
+
+@add_start_docstrings("""Funnel Model with a `language modeling` head on top.""", FUNNEL_START_DOCSTRING)
+class TFFunnelForMaskedLM(TFFunnelPreTrainedModel, TFMaskedLanguageModelingLoss):
+    def __init__(self, config: FunnelConfig, *inputs, **kwargs) -> None:
+        super().__init__(config, *inputs, **kwargs)
+
+        self.funnel = TFFunnelMainLayer(config, name="funnel")
+        self.lm_head = TFFunnelMaskedLMHead(config, self.funnel.embeddings, name="lm_head")
+
+    def get_lm_head(self) -> TFFunnelMaskedLMHead:
+        return self.lm_head
+
+    def get_prefix_bias_name(self) -> str:
+        warnings.warn("The method get_prefix_bias_name is deprecated. Please use `get_bias` instead.", FutureWarning)
+        return self.name + "/" + self.lm_head.name
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(FUNNEL_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_code_sample_docstrings(
+        checkpoint="funnel-transformer/small",
+        output_type=TFMaskedLMOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def call(
+        self,
+        input_ids: TFModelInputType | None = None,
+        attention_mask: np.ndarray | tf.Tensor | None = None,
+        token_type_ids: np.ndarray | tf.Tensor | None = None,
+        inputs_embeds: np.ndarray | tf.Tensor | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        labels: np.ndarray | tf.Tensor | None = None,
+        training: bool = False,
+    ) -> tuple[tf.Tensor] | TFMaskedLMOutput:
+        r"""
+        labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
+            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
+            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
+        """
+        outputs = self.funnel(
+            input_ids,
+            attention_mask,
+            token_type_ids,
+            inputs_embeds,
+            output_attentions,
+            output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+        # Element 0 of the main-layer output is the sequence output fed to the LM head.
+        prediction_scores = self.lm_head(outputs[0], training=training)
+
+        loss = None
+        if labels is not None:
+            loss = self.hf_compute_loss(labels, prediction_scores)
+
+        if not return_dict:
+            output = (prediction_scores,) + outputs[1:]
+            return output if loss is None else (loss,) + output
+
+        return TFMaskedLMOutput(
+            loss=loss,
+            logits=prediction_scores,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+    def serving_output(self, output: TFMaskedLMOutput) -> TFMaskedLMOutput:
+        """Repackage `output`; hidden states/attentions are not tensor-converted since their dimensions all differ."""
+        return TFMaskedLMOutput(logits=output.logits, hidden_states=output.hidden_states, attentions=output.attentions)
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        for attr_name in ("funnel", "lm_head"):
+            sublayer = getattr(self, attr_name, None)
+            if sublayer is not None:
+                with tf.name_scope(sublayer.name):
+                    sublayer.build(None)
+
+
+@add_start_docstrings(
+    """
+    Funnel Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
+    output) e.g. for GLUE tasks.
+    """,
+    FUNNEL_START_DOCSTRING,
+)
+class TFFunnelForSequenceClassification(TFFunnelPreTrainedModel, TFSequenceClassificationLoss):
+    def __init__(self, config: FunnelConfig, *inputs, **kwargs) -> None:
+        super().__init__(config, *inputs, **kwargs)
+        self.num_labels = config.num_labels
+
+        self.funnel = TFFunnelBaseLayer(config, name="funnel")
+        self.classifier = TFFunnelClassificationHead(config, config.num_labels, name="classifier")
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(FUNNEL_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_code_sample_docstrings(
+        checkpoint="funnel-transformer/small-base",
+        output_type=TFSequenceClassifierOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def call(
+        self,
+        input_ids: TFModelInputType | None = None,
+        attention_mask: np.ndarray | tf.Tensor | None = None,
+        token_type_ids: np.ndarray | tf.Tensor | None = None,
+        inputs_embeds: np.ndarray | tf.Tensor | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        labels: np.ndarray | tf.Tensor | None = None,
+        training: bool = False,
+    ) -> tuple[tf.Tensor] | TFSequenceClassifierOutput:
+        r"""
+        labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+        """
+        outputs = self.funnel(
+            input_ids,
+            attention_mask,
+            token_type_ids,
+            inputs_embeds,
+            output_attentions,
+            output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+        # The last hidden state at sequence position 0 serves as the pooled representation.
+        pooled_output = outputs[0][:, 0]
+        logits = self.classifier(pooled_output, training=training)
+
+        loss = None
+        if labels is not None:
+            loss = self.hf_compute_loss(labels, logits)
+
+        if not return_dict:
+            output = (logits,) + outputs[1:]
+            return output if loss is None else (loss,) + output
+
+        return TFSequenceClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+    def serving_output(self, output: TFSequenceClassifierOutput) -> TFSequenceClassifierOutput:
+        """Repackage `output`; hidden states/attentions are not tensor-converted since their dimensions all differ."""
+        return TFSequenceClassifierOutput(
+            logits=output.logits, hidden_states=output.hidden_states, attentions=output.attentions
+        )
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        for attr_name in ("funnel", "classifier"):
+            sublayer = getattr(self, attr_name, None)
+            if sublayer is not None:
+                with tf.name_scope(sublayer.name):
+                    sublayer.build(None)
+
+
+@add_start_docstrings(
+    """
+    Funnel Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
+    softmax) e.g. for RocStories/SWAG tasks.
+    """,
+    FUNNEL_START_DOCSTRING,
+)
+class TFFunnelForMultipleChoice(TFFunnelPreTrainedModel, TFMultipleChoiceLoss):
+    def __init__(self, config: FunnelConfig, *inputs, **kwargs) -> None:
+        super().__init__(config, *inputs, **kwargs)
+
+        self.funnel = TFFunnelBaseLayer(config, name="funnel")
+        self.classifier = TFFunnelClassificationHead(config, 1, name="classifier")
+
+    @property
+    def dummy_inputs(self):
+        return {"input_ids": tf.ones((3, 3, 4), dtype=tf.int32)}
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(FUNNEL_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
+    @add_code_sample_docstrings(
+        checkpoint="funnel-transformer/small-base",
+        output_type=TFMultipleChoiceModelOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def call(
+        self,
+        input_ids: TFModelInputType | None = None,
+        attention_mask: np.ndarray | tf.Tensor | None = None,
+        token_type_ids: np.ndarray | tf.Tensor | None = None,
+        inputs_embeds: np.ndarray | tf.Tensor | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        labels: np.ndarray | tf.Tensor | None = None,
+        training: bool = False,
+    ) -> tuple[tf.Tensor] | TFMultipleChoiceModelOutput:
+        r"""
+        labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
+            Labels for the multiple choice classification loss. Indices should be in `[0, ..., num_choices - 1]`,
+            where `num_choices` is the size of the second dimension of the input tensors. (See `input_ids` above)
+        """
+        # Read the choice and sequence dimensions from whichever input representation was provided.
+        reference = input_ids if input_ids is not None else inputs_embeds
+        input_shape = shape_list(reference)
+        num_choices, seq_length = input_shape[1], input_shape[2]
+
+        # Fold the choices axis into the leading axis so the base layer receives one sequence per row.
+        flat_input_ids = tf.reshape(input_ids, (-1, seq_length)) if input_ids is not None else None
+        flat_attention_mask = tf.reshape(attention_mask, (-1, seq_length)) if attention_mask is not None else None
+        flat_token_type_ids = tf.reshape(token_type_ids, (-1, seq_length)) if token_type_ids is not None else None
+        flat_inputs_embeds = (
+            tf.reshape(inputs_embeds, (-1, seq_length, shape_list(inputs_embeds)[3]))
+            if inputs_embeds is not None
+            else None
+        )
+
+        outputs = self.funnel(
+            flat_input_ids,
+            attention_mask=flat_attention_mask,
+            token_type_ids=flat_token_type_ids,
+            inputs_embeds=flat_inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+
+        # The last hidden state at sequence position 0 serves as the pooled representation.
+        pooled_output = outputs[0][:, 0]
+        logits = self.classifier(pooled_output, training=training)
+        # Restore the choices axis: one row of `num_choices` scores per example.
+        reshaped_logits = tf.reshape(logits, (-1, num_choices))
+
+        loss = None if labels is None else self.hf_compute_loss(labels, reshaped_logits)
+
+        if not return_dict:
+            output = (reshaped_logits,) + outputs[1:]
+            return ((loss,) + output) if loss is not None else output
+
+        return TFMultipleChoiceModelOutput(
+            loss=loss,
+            logits=reshaped_logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+    def serving_output(self, output: TFMultipleChoiceModelOutput) -> TFMultipleChoiceModelOutput:
+        # hidden_states and attentions not converted to Tensor with tf.convert_to_tensor as they are all of
+        # different dimensions
+        return TFMultipleChoiceModelOutput(
+            logits=output.logits, hidden_states=output.hidden_states, attentions=output.attentions
+        )
+
+    def build(self, input_shape=None):
+        """Build the base layer and the classification head once, each under its own name scope."""
+        if self.built:
+            return
+        self.built = True
+        for attr_name in ("funnel", "classifier"):
+            sublayer = getattr(self, attr_name, None)
+            if sublayer is not None:
+                with tf.name_scope(sublayer.name):
+                    sublayer.build(None)
+
+
+@add_start_docstrings(
+    """
+    Funnel Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
+    Named-Entity-Recognition (NER) tasks.
+    """,
+    FUNNEL_START_DOCSTRING,
+)
+class TFFunnelForTokenClassification(TFFunnelPreTrainedModel, TFTokenClassificationLoss):
+    def __init__(self, config: FunnelConfig, *inputs, **kwargs) -> None:
+        super().__init__(config, *inputs, **kwargs)
+        self.num_labels = config.num_labels
+
+        self.funnel = TFFunnelMainLayer(config, name="funnel")
+        self.dropout = keras.layers.Dropout(config.hidden_dropout)
+        self.classifier = keras.layers.Dense(
+            config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
+        )
+        self.config = config
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(FUNNEL_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_code_sample_docstrings(
+        checkpoint="funnel-transformer/small",
+        output_type=TFTokenClassifierOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def call(
+        self,
+        input_ids: TFModelInputType | None = None,
+        attention_mask: np.ndarray | tf.Tensor | None = None,
+        token_type_ids: np.ndarray | tf.Tensor | None = None,
+        inputs_embeds: np.ndarray | tf.Tensor | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        labels: np.ndarray | tf.Tensor | None = None,
+        training: bool = False,
+    ) -> tuple[tf.Tensor] | TFTokenClassifierOutput:
+        r"""
+        labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
+        """
+        outputs = self.funnel(
+            input_ids,
+            attention_mask,
+            token_type_ids,
+            inputs_embeds,
+            output_attentions,
+            output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+        sequence_output = outputs[0]
+
+        sequence_output = self.dropout(sequence_output, training=training)
+        logits = self.classifier(sequence_output)
+
+        loss = None if labels is None else self.hf_compute_loss(labels, logits)
+
+        if not return_dict:
+            output = (logits,) + outputs[1:]
+            return ((loss,) + output) if loss is not None else output
+
+        return TFTokenClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+    def serving_output(self, output: TFTokenClassifierOutput) -> TFTokenClassifierOutput:
+        # hidden_states and attentions not converted to Tensor with tf.convert_to_tensor as they are all of
+        # different dimensions
+        return TFTokenClassifierOutput(
+            logits=output.logits, hidden_states=output.hidden_states, attentions=output.attentions
+        )
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "funnel", None) is not None:
+            with tf.name_scope(self.funnel.name):
+                self.funnel.build(None)
+        if getattr(self, "classifier", None) is not None:
+            with tf.name_scope(self.classifier.name):
+                self.classifier.build([None, None, self.config.hidden_size])
+
+
+@add_start_docstrings(
+    """
+    Funnel Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
+    layer on top of the hidden-states output to compute `span start logits` and `span end logits`).
+    """,
+    FUNNEL_START_DOCSTRING,
+)
+class TFFunnelForQuestionAnswering(TFFunnelPreTrainedModel, TFQuestionAnsweringLoss):
+    def __init__(self, config: FunnelConfig, *inputs, **kwargs) -> None:
+        super().__init__(config, *inputs, **kwargs)
+        self.num_labels = config.num_labels
+
+        self.funnel = TFFunnelMainLayer(config, name="funnel")
+        self.qa_outputs = keras.layers.Dense(
+            config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
+        )
+        self.config = config
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(FUNNEL_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_code_sample_docstrings(
+        checkpoint="funnel-transformer/small",
+        output_type=TFQuestionAnsweringModelOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def call(
+        self,
+        input_ids: TFModelInputType | None = None,
+        attention_mask: np.ndarray | tf.Tensor | None = None,
+        token_type_ids: np.ndarray | tf.Tensor | None = None,
+        inputs_embeds: np.ndarray | tf.Tensor | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        start_positions: np.ndarray | tf.Tensor | None = None,
+        end_positions: np.ndarray | tf.Tensor | None = None,
+        training: bool = False,
+    ) -> tuple[tf.Tensor] | TFQuestionAnsweringModelOutput:
+        r"""
+        start_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*):
+            Labels for position (index) of the start of the labelled span for computing the token classification loss.
+            Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
+            are not taken into account for computing the loss.
+        end_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*):
+            Labels for position (index) of the end of the labelled span for computing the token classification loss.
+            Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
+            are not taken into account for computing the loss.
+        """
+
+        outputs = self.funnel(
+            input_ids,
+            attention_mask,
+            token_type_ids,
+            inputs_embeds,
+            output_attentions,
+            output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+        sequence_output = outputs[0]
+
+        logits = self.qa_outputs(sequence_output)
+        start_logits, end_logits = tf.split(logits, 2, axis=-1)
+        start_logits = tf.squeeze(start_logits, axis=-1)
+        end_logits = tf.squeeze(end_logits, axis=-1)
+
+        loss = None
+        if start_positions is not None and end_positions is not None:
+            labels = {"start_position": start_positions, "end_position": end_positions}
+            loss = self.hf_compute_loss(labels, (start_logits, end_logits))
+
+        if not return_dict:
+            output = (start_logits, end_logits) + outputs[1:]
+            return ((loss,) + output) if loss is not None else output
+
+        return TFQuestionAnsweringModelOutput(
+            loss=loss,
+            start_logits=start_logits,
+            end_logits=end_logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+    def serving_output(self, output: TFQuestionAnsweringModelOutput) -> TFQuestionAnsweringModelOutput:
+        # hidden_states and attentions not converted to Tensor with tf.convert_to_tensor as they are all of
+        # different dimensions
+        return TFQuestionAnsweringModelOutput(
+            start_logits=output.start_logits,
+            end_logits=output.end_logits,
+            hidden_states=output.hidden_states,
+            attentions=output.attentions,
+        )
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "funnel", None) is not None:
+            with tf.name_scope(self.funnel.name):
+                self.funnel.build(None)
+        if getattr(self, "qa_outputs", None) is not None:
+            with tf.name_scope(self.qa_outputs.name):
+                self.qa_outputs.build([None, None, self.config.hidden_size])
+
+
+__all__ = [
+    "TFFunnelBaseModel",
+    "TFFunnelForMaskedLM",
+    "TFFunnelForMultipleChoice",
+    "TFFunnelForPreTraining",
+    "TFFunnelForQuestionAnswering",
+    "TFFunnelForSequenceClassification",
+    "TFFunnelForTokenClassification",
+    "TFFunnelModel",
+    "TFFunnelPreTrainedModel",
+]
diff --git a/phivenv/Lib/site-packages/transformers/models/funnel/tokenization_funnel.py b/phivenv/Lib/site-packages/transformers/models/funnel/tokenization_funnel.py
new file mode 100644
index 0000000000000000000000000000000000000000..e5d44e5e59064315ca330b6d9d7d0ffd04c59b12
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/funnel/tokenization_funnel.py
@@ -0,0 +1,542 @@
+# coding=utf-8
+# Copyright 2020 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization class for Funnel Transformer."""
+
+import collections
+import os
+import unicodedata
+from typing import Optional
+
+from ...tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}
+
+_model_names = [
+    "small",
+    "small-base",
+    "medium",
+    "medium-base",
+    "intermediate",
+    "intermediate-base",
+    "large",
+    "large-base",
+    "xlarge",
+    "xlarge-base",
+]
+
+
+# Copied from transformers.models.bert.tokenization_bert.load_vocab
+def load_vocab(vocab_file):
+    """Loads a vocabulary file into a dictionary."""
+    vocab = collections.OrderedDict()
+    with open(vocab_file, "r", encoding="utf-8") as reader:
+        tokens = reader.readlines()
+    for index, token in enumerate(tokens):
+        token = token.rstrip("\n")
+        vocab[token] = index
+    return vocab
+
+
+# Copied from transformers.models.bert.tokenization_bert.whitespace_tokenize
+def whitespace_tokenize(text):
+    """Runs basic whitespace cleaning and splitting on a piece of text."""
+    text = text.strip()
+    if not text:
+        return []
+    tokens = text.split()
+    return tokens
+
+
+class FunnelTokenizer(PreTrainedTokenizer):
+    r"""
+    Construct a Funnel Transformer tokenizer. Based on WordPiece.
+
+    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
+    this superclass for more information regarding those methods.
+
+    Args:
+        vocab_file (`str`):
+            File containing the vocabulary.
+        do_lower_case (`bool`, *optional*, defaults to `True`):
+            Whether or not to lowercase the input when tokenizing.
+        do_basic_tokenize (`bool`, *optional*, defaults to `True`):
+            Whether or not to do basic tokenization before WordPiece.
+        never_split (`Iterable`, *optional*):
+            Collection of tokens which will never be split during tokenization. Only has an effect when
+            `do_basic_tokenize=True`
+        unk_token (`str`, *optional*, defaults to `"<unk>"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        sep_token (`str`, *optional*, defaults to `"<sep>"`):
+            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
+            sequence classification or for a text and a question for question answering. It is also used as the last
+            token of a sequence built with special tokens.
+        pad_token (`str`, *optional*, defaults to `"<pad>"`):
+            The token used for padding, for example when batching sequences of different lengths.
+        cls_token (`str`, *optional*, defaults to `"<cls>"`):
+            The classifier token which is used when doing sequence classification (classification of the whole sequence
+            instead of per-token classification). It is the first token of the sequence when built with special tokens.
+        mask_token (`str`, *optional*, defaults to `"<mask>"`):
+            The token used for masking values. This is the token used when training this model with masked language
+            modeling. This is the token which the model will try to predict.
+        bos_token (`str`, *optional*, defaults to `"<s>"`):
+            The beginning of sentence token.
+        eos_token (`str`, *optional*, defaults to `"</s>"`):
+            The end of sentence token.
+        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
+            Whether or not to tokenize Chinese characters.
+
+            This should likely be deactivated for Japanese (see this
+            [issue](https://github.com/huggingface/transformers/issues/328)).
+        strip_accents (`bool`, *optional*):
+            Whether or not to strip all accents. If this option is not specified, then it will be determined by the
+            value for `do_lower_case` (as in the original BERT).
+        clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):
+            Whether or not to clean up spaces after decoding; cleanup consists of removing potential artifacts like
+            extra spaces.
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    cls_token_type_id: int = 2
+
+    def __init__(
+        self,
+        vocab_file,
+        do_lower_case=True,
+        do_basic_tokenize=True,
+        never_split=None,
+        unk_token="<unk>",
+        sep_token="<sep>",
+        pad_token="<pad>",
+        cls_token="<cls>",
+        mask_token="<mask>",
+        bos_token="<s>",
+        eos_token="</s>",
+        tokenize_chinese_chars=True,
+        strip_accents=None,
+        clean_up_tokenization_spaces=True,
+        **kwargs,
+    ):
+        if not os.path.isfile(vocab_file):
+            raise ValueError(
+                f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
+                " model use `tokenizer = FunnelTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
+            )
+        self.vocab = load_vocab(vocab_file)
+        self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
+        self.do_basic_tokenize = do_basic_tokenize
+        if do_basic_tokenize:
+            self.basic_tokenizer = BasicTokenizer(
+                do_lower_case=do_lower_case,
+                never_split=never_split,
+                tokenize_chinese_chars=tokenize_chinese_chars,
+                strip_accents=strip_accents,
+            )
+        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token))
+
+        super().__init__(
+            do_lower_case=do_lower_case,
+            do_basic_tokenize=do_basic_tokenize,
+            never_split=never_split,
+            unk_token=unk_token,
+            sep_token=sep_token,
+            pad_token=pad_token,
+            cls_token=cls_token,
+            mask_token=mask_token,
+            bos_token=bos_token,
+            eos_token=eos_token,
+            tokenize_chinese_chars=tokenize_chinese_chars,
+            strip_accents=strip_accents,
+            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+            **kwargs,
+        )
+
+    @property
+    # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.do_lower_case
+    def do_lower_case(self):
+        return self.basic_tokenizer.do_lower_case
+
+    @property
+    # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.vocab_size
+    def vocab_size(self):
+        return len(self.vocab)
+
+    # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.get_vocab
+    def get_vocab(self):
+        return dict(self.vocab, **self.added_tokens_encoder)
+
+    # Copied from transformers.models.bert.tokenization_bert.BertTokenizer._tokenize
+    def _tokenize(self, text, split_special_tokens=False):
+        split_tokens = []
+        if self.do_basic_tokenize:
+            for token in self.basic_tokenizer.tokenize(
+                text, never_split=self.all_special_tokens if not split_special_tokens else None
+            ):
+                # If the token is part of the never_split set
+                if token in self.basic_tokenizer.never_split:
+                    split_tokens.append(token)
+                else:
+                    split_tokens += self.wordpiece_tokenizer.tokenize(token)
+        else:
+            split_tokens = self.wordpiece_tokenizer.tokenize(text)
+        return split_tokens
+
+    # Copied from transformers.models.bert.tokenization_bert.BertTokenizer._convert_token_to_id
+    def _convert_token_to_id(self, token):
+        """Converts a token (str) to an id using the vocab."""
+        return self.vocab.get(token, self.vocab.get(self.unk_token))
+
+    # Copied from transformers.models.bert.tokenization_bert.BertTokenizer._convert_id_to_token
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) to a token (str) using the vocab."""
+        return self.ids_to_tokens.get(index, self.unk_token)
+
+    # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.convert_tokens_to_string
+    def convert_tokens_to_string(self, tokens):
+        """Converts a sequence of tokens (strings) into a single string."""
+        out_string = " ".join(tokens).replace(" ##", "").strip()
+        return out_string
+
+    # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.build_inputs_with_special_tokens
+    def build_inputs_with_special_tokens(
+        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
+    ) -> list[int]:
+        """
+        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
+        adding special tokens. A BERT sequence has the following format:
+
+        - single sequence: `[CLS] X [SEP]`
+        - pair of sequences: `[CLS] A [SEP] B [SEP]`
+
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs to which the special tokens will be added.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
+        """
+        if token_ids_1 is None:
+            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
+        cls = [self.cls_token_id]
+        sep = [self.sep_token_id]
+        return cls + token_ids_0 + sep + token_ids_1 + sep
+
+    # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.get_special_tokens_mask
+    def get_special_tokens_mask(
+        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False
+    ) -> list[int]:
+        """
+        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
+        special tokens using the tokenizer `prepare_for_model` method.
+
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
+                Whether or not the token list is already formatted with special tokens for the model.
+
+        Returns:
+            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+        """
+
+        if already_has_special_tokens:
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+            )
+
+        if token_ids_1 is not None:
+            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
+        return [1] + ([0] * len(token_ids_0)) + [1]
+
+    def create_token_type_ids_from_sequences(
+        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
+    ) -> list[int]:
+        """
+        Create a mask from the two sequences passed to be used in a sequence-pair classification task. A Funnel
+        Transformer sequence pair mask has the following format:
+
+        ```
+        2 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
+        | first sequence    | second sequence |
+        ```
+
+        If `token_ids_1` is `None`, this method only returns the first portion of the mask (a leading 2, then 0s).
+
+        Args:
+            token_ids_0 (`list[int]`):
+                List of IDs.
+            token_ids_1 (`list[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            `list[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
+        """
+        sep = [self.sep_token_id]
+        cls = [self.cls_token_id]
+        if token_ids_1 is None:
+            return len(cls) * [self.cls_token_type_id] + len(token_ids_0 + sep) * [0]
+        return len(cls) * [self.cls_token_type_id] + len(token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
+
+    # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.save_vocabulary
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
+        index = 0
+        if os.path.isdir(save_directory):
+            vocab_file = os.path.join(
+                save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+            )
+        else:
+            vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory
+        with open(vocab_file, "w", encoding="utf-8") as writer:
+            for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
+                if index != token_index:
+                    logger.warning(
+                        f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive."
+                        " Please check that the vocabulary is not corrupted!"
+                    )
+                    index = token_index
+                writer.write(token + "\n")
+                index += 1
+        return (vocab_file,)
+
+
+# Copied from transformers.models.bert.tokenization_bert.BasicTokenizer
+class BasicTokenizer:
+    """
+    Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.).
+
+    Args:
+        do_lower_case (`bool`, *optional*, defaults to `True`):
+            Whether or not to lowercase the input when tokenizing.
+        never_split (`Iterable`, *optional*):
+            Collection of tokens which will never be split during tokenization. Only has an effect when
+            `do_basic_tokenize=True`
+        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
+            Whether or not to tokenize Chinese characters.
+
+            This should likely be deactivated for Japanese (see this
+            [issue](https://github.com/huggingface/transformers/issues/328)).
+        strip_accents (`bool`, *optional*):
+            Whether or not to strip all accents. If this option is not specified, then it will be determined by the
+            value for `do_lower_case` (as in the original BERT).
+        do_split_on_punc (`bool`, *optional*, defaults to `True`):
+            In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
+            the full context of the words, such as contractions.
+    """
+
+    def __init__(
+        self,
+        do_lower_case=True,
+        never_split=None,
+        tokenize_chinese_chars=True,
+        strip_accents=None,
+        do_split_on_punc=True,
+    ):
+        if never_split is None:
+            never_split = []
+        self.do_lower_case = do_lower_case
+        self.never_split = set(never_split)
+        self.tokenize_chinese_chars = tokenize_chinese_chars
+        self.strip_accents = strip_accents
+        self.do_split_on_punc = do_split_on_punc
+
+    def tokenize(self, text, never_split=None):
+        """
+        Basic Tokenization of a piece of text. For sub-word tokenization, see WordpieceTokenizer.
+
+        Args:
+            never_split (`List[str]`, *optional*)
+                Kept for backward compatibility purposes. Now implemented directly at the base class level (see
+                [`PreTrainedTokenizer.tokenize`]). List of tokens not to split.
+        """
+        # union() returns a new set by concatenating the two sets.
+        never_split = self.never_split.union(set(never_split)) if never_split else self.never_split
+        text = self._clean_text(text)
+
+        # This was added on November 1st, 2018 for the multilingual and Chinese
+        # models. This is also applied to the English models now, but it doesn't
+        # matter since the English models were not trained on any Chinese data
+        # and generally don't have any Chinese data in them (there are Chinese
+        # characters in the vocabulary because Wikipedia does have some Chinese
+        # words in the English Wikipedia.).
+        if self.tokenize_chinese_chars:
+            text = self._tokenize_chinese_chars(text)
+        # prevents treating the same character with different unicode codepoints as different characters
+        unicode_normalized_text = unicodedata.normalize("NFC", text)
+        orig_tokens = whitespace_tokenize(unicode_normalized_text)
+        split_tokens = []
+        for token in orig_tokens:
+            if token not in never_split:
+                if self.do_lower_case:
+                    token = token.lower()
+                    if self.strip_accents is not False:
+                        token = self._run_strip_accents(token)
+                elif self.strip_accents:
+                    token = self._run_strip_accents(token)
+            split_tokens.extend(self._run_split_on_punc(token, never_split))
+
+        output_tokens = whitespace_tokenize(" ".join(split_tokens))
+        return output_tokens
+
+    def _run_strip_accents(self, text):
+        """Strips accents from a piece of text."""
+        text = unicodedata.normalize("NFD", text)
+        output = []
+        for char in text:
+            cat = unicodedata.category(char)
+            if cat == "Mn":
+                continue
+            output.append(char)
+        return "".join(output)
+
+    def _run_split_on_punc(self, text, never_split=None):
+        """Splits punctuation on a piece of text."""
+        if not self.do_split_on_punc or (never_split is not None and text in never_split):
+            return [text]
+        chars = list(text)
+        i = 0
+        start_new_word = True
+        output = []
+        while i < len(chars):
+            char = chars[i]
+            if _is_punctuation(char):
+                output.append([char])
+                start_new_word = True
+            else:
+                if start_new_word:
+                    output.append([])
+                start_new_word = False
+                output[-1].append(char)
+            i += 1
+
+        return ["".join(x) for x in output]
+
+    def _tokenize_chinese_chars(self, text):
+        """Adds whitespace around any CJK character."""
+        output = []
+        for char in text:
+            cp = ord(char)
+            if self._is_chinese_char(cp):
+                output.append(" ")
+                output.append(char)
+                output.append(" ")
+            else:
+                output.append(char)
+        return "".join(output)
+
+    def _is_chinese_char(self, cp):
+        """Checks whether CP is the codepoint of a CJK character."""
+        # This defines a "chinese character" as anything in the CJK Unicode block:
+        #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
+        #
+        # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
+        # despite its name. The modern Korean Hangul alphabet is a different block,
+        # as is Japanese Hiragana and Katakana. Those alphabets are used to write
+        # space-separated words, so they are not treated specially and handled
+        # like all of the other languages.
+        if (
+            (cp >= 0x4E00 and cp <= 0x9FFF)
+            or (cp >= 0x3400 and cp <= 0x4DBF)
+            or (cp >= 0x20000 and cp <= 0x2A6DF)
+            or (cp >= 0x2A700 and cp <= 0x2B73F)
+            or (cp >= 0x2B740 and cp <= 0x2B81F)
+            or (cp >= 0x2B820 and cp <= 0x2CEAF)
+            or (cp >= 0xF900 and cp <= 0xFAFF)
+            or (cp >= 0x2F800 and cp <= 0x2FA1F)
+        ):
+            return True
+
+        return False
+
+    def _clean_text(self, text):
+        """Performs invalid character removal and whitespace cleanup on text."""
+        output = []
+        for char in text:
+            cp = ord(char)
+            if cp == 0 or cp == 0xFFFD or _is_control(char):
+                continue
+            if _is_whitespace(char):
+                output.append(" ")
+            else:
+                output.append(char)
+        return "".join(output)
+
+
+# Copied from transformers.models.bert.tokenization_bert.WordpieceTokenizer
+class WordpieceTokenizer:
+    """Runs WordPiece tokenization."""
+
+    def __init__(self, vocab, unk_token, max_input_chars_per_word=100):
+        self.vocab = vocab
+        self.unk_token = unk_token
+        self.max_input_chars_per_word = max_input_chars_per_word
+
+    def tokenize(self, text):
+        """
+        Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform
+        tokenization using the given vocabulary.
+
+        For example, `input = "unaffable"` will return as output `["un", "##aff", "##able"]`.
+
+        Args:
+            text: A single token or whitespace separated tokens. This should have
+                already been passed through *BasicTokenizer*.
+
+        Returns:
+            A list of wordpiece tokens.
+        """
+
+        output_tokens = []
+        for token in whitespace_tokenize(text):
+            chars = list(token)
+            if len(chars) > self.max_input_chars_per_word:
+                output_tokens.append(self.unk_token)
+                continue
+
+            is_bad = False
+            start = 0
+            sub_tokens = []
+            while start < len(chars):
+                end = len(chars)
+                cur_substr = None
+                while start < end:
+                    substr = "".join(chars[start:end])
+                    if start > 0:
+                        substr = "##" + substr
+                    if substr in self.vocab:
+                        cur_substr = substr
+                        break
+                    end -= 1
+                if cur_substr is None:
+                    is_bad = True
+                    break
+                sub_tokens.append(cur_substr)
+                start = end
+
+            if is_bad:
+                output_tokens.append(self.unk_token)
+            else:
+                output_tokens.extend(sub_tokens)
+        return output_tokens
+
+
+__all__ = ["FunnelTokenizer"]
diff --git a/phivenv/Lib/site-packages/transformers/models/funnel/tokenization_funnel_fast.py b/phivenv/Lib/site-packages/transformers/models/funnel/tokenization_funnel_fast.py
new file mode 100644
index 0000000000000000000000000000000000000000..eeeb6f7bf6cb0640ee04bb01737331ba4be1233b
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/funnel/tokenization_funnel_fast.py
@@ -0,0 +1,203 @@
+# coding=utf-8
+# Copyright 2020 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization class for Funnel Transformer."""
+
+import json
+from typing import Optional
+
+from tokenizers import normalizers
+
+from ...tokenization_utils_fast import PreTrainedTokenizerFast
+from ...utils import logging
+from .tokenization_funnel import FunnelTokenizer
+
+
+logger = logging.get_logger(__name__)
+
+VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"}
+
+_model_names = [
+    "small",
+    "small-base",
+    "medium",
+    "medium-base",
+    "intermediate",
+    "intermediate-base",
+    "large",
+    "large-base",
+    "xlarge",
+    "xlarge-base",
+]
+
+
+class FunnelTokenizerFast(PreTrainedTokenizerFast):
+    r"""
+    Construct a "fast" Funnel Transformer tokenizer (backed by HuggingFace's *tokenizers* library). Based on WordPiece.
+
+    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
+    refer to this superclass for more information regarding those methods.
+
+    Args:
+        vocab_file (`str`):
+            File containing the vocabulary.
+        do_lower_case (`bool`, *optional*, defaults to `True`):
+            Whether or not to lowercase the input when tokenizing.
+        unk_token (`str`, *optional*, defaults to `"<unk>"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        sep_token (`str`, *optional*, defaults to `"<sep>"`):
+            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
+            sequence classification or for a text and a question for question answering. It is also used as the last
+            token of a sequence built with special tokens.
+        pad_token (`str`, *optional*, defaults to `"<pad>"`):
+            The token used for padding, for example when batching sequences of different lengths.
+        cls_token (`str`, *optional*, defaults to `"<cls>"`):
+            The classifier token which is used when doing sequence classification (classification of the whole sequence
+            instead of per-token classification). It is the first token of the sequence when built with special tokens.
+        mask_token (`str`, *optional*, defaults to `"<mask>"`):
+            The token used for masking values. This is the token used when training this model with masked language
+            modeling. This is the token which the model will try to predict.
+        clean_text (`bool`, *optional*, defaults to `True`):
+            Whether or not to clean the text before tokenization by removing any control characters and replacing all
+            whitespaces by the classic one.
+        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
+            Whether or not to tokenize Chinese characters. This should likely be deactivated for Japanese (see [this
+            issue](https://github.com/huggingface/transformers/issues/328)).
+        bos_token (`str`, `optional`, defaults to `"<s>"`):
+            The beginning of sentence token.
+        eos_token (`str`, `optional`, defaults to `"</s>"`):
+            The end of sentence token.
+        strip_accents (`bool`, *optional*):
+            Whether or not to strip all accents. If this option is not specified, then it will be determined by the
+            value for `lowercase` (as in the original BERT).
+        wordpieces_prefix (`str`, *optional*, defaults to `"##"`):
+            The prefix for subwords.
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    slow_tokenizer_class = FunnelTokenizer
+    cls_token_type_id: int = 2
+
+    def __init__(
+        self,
+        vocab_file=None,
+        tokenizer_file=None,
+        do_lower_case=True,
+        unk_token="<unk>",
+        sep_token="<sep>",
+        pad_token="<pad>",
+        cls_token="<cls>",
+        mask_token="<mask>",
+        bos_token="<s>",
+        eos_token="</s>",
+        clean_text=True,
+        tokenize_chinese_chars=True,
+        strip_accents=None,
+        wordpieces_prefix="##",
+        **kwargs,
+    ):
+        super().__init__(
+            vocab_file,
+            tokenizer_file=tokenizer_file,
+            do_lower_case=do_lower_case,
+            unk_token=unk_token,
+            sep_token=sep_token,
+            pad_token=pad_token,
+            cls_token=cls_token,
+            mask_token=mask_token,
+            bos_token=bos_token,
+            eos_token=eos_token,
+            clean_text=clean_text,
+            tokenize_chinese_chars=tokenize_chinese_chars,
+            strip_accents=strip_accents,
+            wordpieces_prefix=wordpieces_prefix,
+            **kwargs,
+        )
+
+        normalizer_state = json.loads(self.backend_tokenizer.normalizer.__getstate__())
+        if (
+            normalizer_state.get("lowercase", do_lower_case) != do_lower_case
+            or normalizer_state.get("strip_accents", strip_accents) != strip_accents
+            or normalizer_state.get("handle_chinese_chars", tokenize_chinese_chars) != tokenize_chinese_chars
+        ):
+            normalizer_class = getattr(normalizers, normalizer_state.pop("type"))
+            normalizer_state["lowercase"] = do_lower_case
+            normalizer_state["strip_accents"] = strip_accents
+            normalizer_state["handle_chinese_chars"] = tokenize_chinese_chars
+            self.backend_tokenizer.normalizer = normalizer_class(**normalizer_state)
+
+        self.do_lower_case = do_lower_case
+
+    # Copied from transformers.models.bert.tokenization_bert_fast.BertTokenizerFast.build_inputs_with_special_tokens with BERT->Funnel
+    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
+        """
+        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
+        adding special tokens. A Funnel sequence has the following format:
+
+        - single sequence: `[CLS] X [SEP]`
+        - pair of sequences: `[CLS] A [SEP] B [SEP]`
+
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs to which the special tokens will be added.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
+        """
+        output = [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
+
+        if token_ids_1 is not None:
+            output += token_ids_1 + [self.sep_token_id]
+
+        return output
+
+    def create_token_type_ids_from_sequences(
+        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
+    ) -> list[int]:
+        """
+        Create a mask from the two sequences passed to be used in a sequence-pair classification task. A Funnel
+        Transformer sequence pair mask has the following format:
+
+        ```
+        2 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
+        | first sequence    | second sequence |
+        ```
+
+        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
+
+        Args:
+            token_ids_0 (`list[int]`):
+                List of IDs.
+            token_ids_1 (`list[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            `list[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
+        """
+        sep = [self.sep_token_id]
+        cls = [self.cls_token_id]
+        if token_ids_1 is None:
+            return len(cls) * [self.cls_token_type_id] + len(token_ids_0 + sep) * [0]
+        return len(cls) * [self.cls_token_type_id] + len(token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
+
+    # Copied from transformers.models.bert.tokenization_bert_fast.BertTokenizerFast.save_vocabulary
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
+        files = self._tokenizer.model.save(save_directory, name=filename_prefix)
+        return tuple(files)
+
+
+__all__ = ["FunnelTokenizerFast"]
diff --git a/phivenv/Lib/site-packages/transformers/models/fuyu/__init__.py b/phivenv/Lib/site-packages/transformers/models/fuyu/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..c2a7d252010e00ec7e3192520ac401b200dc1da9
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/fuyu/__init__.py
@@ -0,0 +1,29 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    # Static type checkers only (`TYPE_CHECKING` is False at runtime): import the
+    # submodules' public names eagerly so they resolve for analysis tools.
+    from .configuration_fuyu import *
+    from .image_processing_fuyu import *
+    from .modeling_fuyu import *
+    from .processing_fuyu import *
+else:
+    import sys
+
+    # At runtime, replace this module's entry in `sys.modules` with a `_LazyModule` built from
+    # the import structure that `define_import_structure` derives from this file's path.
+    # NOTE(review): presumably this defers the submodule imports until first attribute access — confirm in `_LazyModule`.
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/phivenv/Lib/site-packages/transformers/models/fuyu/__pycache__/__init__.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/fuyu/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cc57ff14269679ce6a810a74b85bc27ae5387974
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/fuyu/__pycache__/__init__.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/fuyu/__pycache__/configuration_fuyu.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/fuyu/__pycache__/configuration_fuyu.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c581973b3e799d303cd3e0a89854fd7aee80af3c
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/fuyu/__pycache__/configuration_fuyu.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/fuyu/__pycache__/image_processing_fuyu.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/fuyu/__pycache__/image_processing_fuyu.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c91551b6b780a2a1d1b1e721323ad81941879063
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/fuyu/__pycache__/image_processing_fuyu.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/fuyu/__pycache__/modeling_fuyu.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/fuyu/__pycache__/modeling_fuyu.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..268b69967808482e7be53e8bb795c97ac1ea4685
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/fuyu/__pycache__/modeling_fuyu.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/fuyu/__pycache__/processing_fuyu.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/fuyu/__pycache__/processing_fuyu.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..800d1ec2939934d40241aad4a21d39bd61a9c435
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/fuyu/__pycache__/processing_fuyu.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/fuyu/configuration_fuyu.py b/phivenv/Lib/site-packages/transformers/models/fuyu/configuration_fuyu.py
new file mode 100644
index 0000000000000000000000000000000000000000..40da84e2e780821f26765333a2cee51030e0bea4
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/fuyu/configuration_fuyu.py
@@ -0,0 +1,215 @@
+# coding=utf-8
+# Copyright 2023 Adept AI and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Fuyu model configuration"""
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+from ..auto import CONFIG_MAPPING, AutoConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+class FuyuConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`FuyuForCausalLM`]. It is used to instantiate a
+    Fuyu model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the
+    [adept/fuyu-8b](https://huggingface.co/adept/fuyu-8b).
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 262144):
+            Vocabulary size of the Fuyu model. Defines the number of different tokens that can be represented by the
+            `input_ids` passed when calling [`FuyuForCausalLM`]
+        hidden_size (`int`, *optional*, defaults to 4096):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 16384):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 36):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 64):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        hidden_act (`str` or `function`, *optional*, defaults to `"relu2"`):
+            The non-linear activation function (function or string) in the decoder.
+        max_position_embeddings (`int`, *optional*, defaults to 16384):
+            The maximum sequence length that this model might ever be used with.
+        image_size (`int`, *optional*, defaults to 300):
+            The input image size.
+        patch_size (`int`, *optional*, defaults to 30):
+            The input vision transformer encoding patch size.
+        num_channels (`int`, *optional*, defaults to 3):
+            The input image number of channels.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether to tie input and output embeddings.
+        rope_theta (`float`, *optional*, defaults to 25000.0):
+            The base period of the RoPE embeddings.
+        rope_scaling (`Dict`, *optional*):
+            Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
+            strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is
+            `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
+            `max_position_embeddings` to the expected new maximum. See the following thread for more information on how
+            these scaling strategies behave:
+            https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an
+            experimental feature, subject to breaking API changes in future versions.
+        qk_layernorm (`bool`, *optional*, defaults to `True`):
+            Whether or not to normalize the Queries and Keys after projecting the hidden states
+        hidden_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio after applying the MLP to the hidden states.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio after computing the attention scores.
+        partial_rotary_factor (`float`, *optional*, defaults to 0.5):
+            Percentage of the query and keys which will have rotary embedding.
+
+        pad_token_id (`int`, *optional*):
+            The id of the *padding* token.
+        bos_token_id (`int`, *optional*, defaults to 1):
+            The id of the *beginning-of-sequence* token.
+        eos_token_id (`Union[int, list[int]]`, *optional*, defaults to 2):
+            The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens.
+        image_token_id (`int`, *optional*, defaults to 71011):
+            The id of the image placeholder token.
+        text_config (`dict`, *optional*):
+            Dictionary of configuration options used to initialize the text (language model) sub-configuration. Its
+            `"model_type"` entry selects the configuration class from `CONFIG_MAPPING` and defaults to
+            `"persimmon"`. When omitted, the dictionary is assembled from the text-related arguments above.
+
+    ```python
+    >>> from transformers import FuyuConfig
+
+    >>> # Initializing a Fuyu fuyu-8b style configuration
+    >>> configuration = FuyuConfig()
+    ```"""
+
+    model_type = "fuyu"
+    sub_configs = {"text_config": AutoConfig}
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        vocab_size=262144,
+        hidden_size=4096,
+        intermediate_size=16384,
+        num_hidden_layers=36,
+        num_attention_heads=64,
+        hidden_act="relu2",
+        max_position_embeddings=16384,
+        image_size=300,
+        patch_size=30,
+        num_channels=3,
+        initializer_range=0.02,
+        layer_norm_eps=1e-5,
+        use_cache=True,
+        tie_word_embeddings=False,
+        rope_theta=25000.0,
+        rope_scaling=None,
+        qk_layernorm=True,
+        hidden_dropout=0.0,
+        attention_dropout=0.0,
+        partial_rotary_factor=0.5,
+        pad_token_id=None,
+        bos_token_id=1,
+        eos_token_id=2,
+        image_token_id=71011,
+        text_config=None,
+        **kwargs,
+    ):
+        # No explicit text config: assemble one from the flat text-model arguments.
+        if text_config is None:
+            text_config = {
+                "vocab_size": vocab_size,
+                "max_position_embeddings": max_position_embeddings,
+                "hidden_size": hidden_size,
+                "intermediate_size": intermediate_size,
+                "num_hidden_layers": num_hidden_layers,
+                "num_attention_heads": num_attention_heads,
+                "hidden_act": hidden_act,
+                "initializer_range": initializer_range,
+                "layer_norm_eps": layer_norm_eps,
+                "use_cache": use_cache,
+                "rope_theta": rope_theta,
+                "rope_scaling": rope_scaling,
+                "qk_layernorm": qk_layernorm,
+                "hidden_dropout": hidden_dropout,
+                "attention_dropout": attention_dropout,
+                "partial_rotary_factor": partial_rotary_factor,
+                "pad_token_id": pad_token_id,
+                "bos_token_id": bos_token_id,
+                "eos_token_id": eos_token_id,
+                "tie_word_embeddings": tie_word_embeddings,
+            }
+            logger.info("text_config is None. initializing the text model with default values.")
+        # `model_type` picks the config class from CONFIG_MAPPING; "persimmon" is the fallback.
+        # NOTE(review): `text_config` is used with `.get` and `**`, so a mapping is expected here.
+        text_model_type = text_config.get("model_type", "persimmon")
+        self.text_config = CONFIG_MAPPING[text_model_type](**text_config)
+
+        # The flat arguments are also stored on this config, regardless of whether `text_config` was supplied.
+        self._vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.image_size = image_size
+        self.patch_size = patch_size
+        self.num_channels = num_channels
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.layer_norm_eps = layer_norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self.qk_layernorm = qk_layernorm
+        self.hidden_dropout = hidden_dropout
+        self.attention_dropout = attention_dropout
+        self.partial_rotary_factor = partial_rotary_factor
+        self.image_token_id = image_token_id
+        # Reject a malformed `rope_scaling` before the base-class constructor runs.
+        self._rope_scaling_validation()
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+    def _rope_scaling_validation(self):
+        """
+        Validate the `rope_scaling` configuration.
+
+        `None` is accepted as "no scaling". Otherwise the value must be a dictionary with exactly two entries:
+        `type`, one of `"linear"` or `"dynamic"`, and `factor`, a `float` strictly greater than 1.
+
+        Raises:
+            ValueError: If `rope_scaling` is not a two-entry dictionary, or if `type` or `factor` is missing or
+                invalid.
+        """
+        if self.rope_scaling is None:
+            return
+
+        if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
+            raise ValueError(
+                f"`rope_scaling` must be a dictionary with two fields, `type` and `factor`, got {self.rope_scaling}"
+            )
+        rope_scaling_type = self.rope_scaling.get("type", None)
+        rope_scaling_factor = self.rope_scaling.get("factor", None)
+        if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]:
+            raise ValueError(
+                f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
+            )
+        # The `isinstance(..., float)` check means an integer factor such as `2` is rejected as well.
+        if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0:
+            raise ValueError(f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}")
+
+
+__all__ = ["FuyuConfig"]
diff --git a/phivenv/Lib/site-packages/transformers/models/fuyu/image_processing_fuyu.py b/phivenv/Lib/site-packages/transformers/models/fuyu/image_processing_fuyu.py
new file mode 100644
index 0000000000000000000000000000000000000000..bab6e3b47b6d7c489aa2d5853bc8043f1b8580fd
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/fuyu/image_processing_fuyu.py
@@ -0,0 +1,724 @@
+# coding=utf-8
+# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Image processor class for Fuyu."""
+
+import math
+from typing import Optional, Union
+
+import numpy as np
+
+from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
+from ...image_transforms import (
+    pad,
+    resize,
+    to_channel_dimension_format,
+)
+from ...image_utils import (
+    ChannelDimension,
+    ImageInput,
+    PILImageResampling,
+    get_image_size,
+    infer_channel_dimension_format,
+    is_scaled_image,
+    is_valid_image,
+    make_list_of_images,
+    to_numpy_array,
+    validate_preprocess_arguments,
+)
+from ...utils import (
+    TensorType,
+    filter_out_non_signature_kwargs,
+    is_torch_available,
+    is_torch_device,
+    is_torch_dtype,
+    logging,
+    requires_backends,
+)
+
+
+if is_torch_available():
+    import torch
+
+
+logger = logging.get_logger(__name__)
+
+
+def make_list_of_list_of_images(
+    images: Union[list[list[ImageInput]], list[ImageInput], ImageInput],
+) -> list[list[ImageInput]]:
+    if is_valid_image(images):
+        return [[images]]
+
+    if isinstance(images, list) and all(isinstance(image, list) for image in images):
+        return images
+
+    if isinstance(images, list):
+        return [make_list_of_images(image) for image in images]
+
+    raise ValueError("images must be a list of list of images or a list of images or an image.")
+
+
+class FuyuBatchFeature(BatchFeature):
+    """
+    BatchFeature class for Fuyu image processor and processor.
+
+    The outputs dictionary from the processors contains a mix of tensors and lists of tensors.
+    """
+
+    def convert_to_tensors(self, tensor_type: Optional[Union[str, TensorType]] = None):
+        """
+        Convert the inner content to tensors.
+
+        Args:
+            tensor_type (`str` or [`~utils.TensorType`], *optional*):
+                The type of tensors to use. If `str`, should be one of the values of the enum [`~utils.TensorType`]. If
+                `None`, no modification is done.
+        """
+        if tensor_type is None:
+            return self
+
+        is_tensor, as_tensor = self._get_is_as_tensor_fns(tensor_type=tensor_type)
+
+        def _convert_tensor(elem):
+            if is_tensor(elem):
+                return elem
+            return as_tensor(elem)
+
+        def _safe_convert_tensor(elem):
+            try:
+                return _convert_tensor(elem)
+            except:  # noqa E722
+                if key == "overflowing_values":
+                    raise ValueError("Unable to create tensor returning overflowing values of different lengths. ")
+                raise ValueError(
+                    "Unable to create tensor, you should probably activate padding "
+                    "with 'padding=True' to have batched tensors with the same length."
+                )
+
+        # Do the tensor conversion in batch
+        for key, value in self.items():
+            if isinstance(value, list) and isinstance(value[0], list):
+                # list[list[Any]] -> list[list[Tensor]]
+                self[key] = [[_safe_convert_tensor(elem) for elem in elems] for elems in value]
+            elif isinstance(value, list):
+                # list[Any] -> list[Tensor]
+                self[key] = [_safe_convert_tensor(elem) for elem in value]
+            else:
+                # Any -> Tensor
+                self[key] = _safe_convert_tensor(value)
+        return self
+
+    def to(self, *args, **kwargs) -> "BatchFeature":
+        """
+        Send all values to device by calling `v.to(*args, **kwargs)` (PyTorch only). This should support casting in
+        different `dtypes` and sending the `BatchFeature` to a different `device`.
+
+        Args:
+            args (`Tuple`):
+                Will be passed to the `to(...)` function of the tensors.
+            kwargs (`Dict`, *optional*):
+                Will be passed to the `to(...)` function of the tensors.
+
+        Returns:
+            [`BatchFeature`]: The same instance after modification.
+        """
+        requires_backends(self, ["torch"])
+        import torch  # noqa
+
+        new_data = {}
+        device = kwargs.get("device")
+        # Check if the args are a device or a dtype
+        if device is None and len(args) > 0:
+            # device should be always the first argument
+            arg = args[0]
+            if is_torch_dtype(arg):
+                # The first argument is a dtype
+                pass
+            elif isinstance(arg, str) or is_torch_device(arg) or isinstance(arg, int):
+                device = arg
+            else:
+                # it's something else
+                raise ValueError(f"Attempting to cast a BatchFeature to type {str(arg)}. This is not supported.")
+
+        def _to(elem):
+            # check if v is a floating point
+            if torch.is_floating_point(elem):
+                # cast and send to device
+                return elem.to(*args, **kwargs)
+            if device is not None:
+                return elem.to(device=device)
+
+            return elem
+
+        # We cast only floating point tensors to avoid issues with tokenizers casting `LongTensor` to `FloatTensor`
+        for k, v in self.items():
+            if isinstance(v, list) and isinstance(v[0], list):
+                # Data structure is a list of lists
+                new_v = []
+                for elems in v:
+                    new_v.append([_to(elem) for elem in elems])
+                new_data[k] = new_v
+            elif isinstance(v, list):
+                # Data structure is a list
+                new_data[k] = [_to(elem) for elem in v]
+            else:
+                new_data[k] = _to(v)
+        self.data = new_data
+        return self
+
+
+class FuyuImageProcessor(BaseImageProcessor):
+    """
+    This class should handle the image processing part before the main FuyuForCausalLM. In particular, it should
+    handle:
+
+    - Processing Images:
+        Taking a batch of images as input. If the images are variable-sized, it resizes them based on the desired patch
+        dimensions. The image output is always img_h, img_w of (1080, 1920)
+
+        Then, it patches up these images using the patchify_image function.
+
+    - Creating Image Input IDs:
+        For each patch, a placeholder ID is given to identify where these patches belong in a token sequence. For
+        variable-sized images, each line of patches is terminated with a newline ID.
+
+    - Image Patch Indices:
+        For each image patch, the code maintains an index where these patches should be inserted in a token stream.
+
+
+    Args:
+        do_resize (`bool`, *optional*, defaults to `True`):
+            Whether to resize the image to `size`.
+        size (`dict[str, int]`, *optional*, defaults to `{"height": 1080, "width": 1920}`):
+            Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image.
+        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`):
+            `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BILINEAR`.
+        do_pad (`bool`, *optional*, defaults to `True`):
+            Whether to pad the image to `size`.
+        padding_value (`float`, *optional*, defaults to 1.0):
+            The value to pad the image with.
+        padding_mode (`str`, *optional*, defaults to `"constant"`):
+            The padding mode to use when padding the image.
+        do_normalize (`bool`, *optional*, defaults to `True`):
+            Whether to normalize the image.
+        image_mean (`float`, *optional*, defaults to 0.5):
+            The mean to use when normalizing the image.
+        image_std (`float`, *optional*, defaults to 0.5):
+            The standard deviation to use when normalizing the image.
+        do_rescale (`bool`, *optional*, defaults to `True`):
+            Whether to rescale the image.
+        rescale_factor (`float`, *optional*, defaults to `1 / 255`):
+            The factor to use when rescaling the image.
+        patch_size (`dict[str, int]`, *optional*, defaults to `{"height": 30, "width": 30}`):
+            Dictionary in the format `{"height": int, "width": int}` specifying the size of the patches.
+    """
+
+    model_input_names = [
+        "images",
+        "image_input_ids",
+        "image_patches",
+        "image_patch_indices_per_batch",
+        "image_patch_indices_per_subsequence",
+    ]
+
+    def __init__(
+        self,
+        do_resize: bool = True,
+        size: Optional[dict[str, int]] = None,
+        resample: PILImageResampling = PILImageResampling.BILINEAR,
+        do_pad: bool = True,
+        padding_value: float = 1.0,
+        padding_mode: str = "constant",
+        do_normalize: bool = True,
+        image_mean: Union[float, list[float]] = 0.5,
+        image_std: Union[float, list[float]] = 0.5,
+        do_rescale: bool = True,
+        rescale_factor: float = 1 / 255,
+        patch_size: Optional[dict[str, int]] = None,
+        **kwargs,
+    ):
+        """
+        Record the default preprocessing options on the instance.
+
+        Every argument is stored under an attribute of the same name. `size` and `patch_size` fall back to
+        `{"height": 1080, "width": 1920}` and `{"height": 30, "width": 30}` respectively when they are `None`.
+        Any extra keyword arguments are forwarded to the parent class constructor.
+        """
+        super().__init__(**kwargs)
+
+        # The dict defaults are built here rather than in the signature so each instance gets its own dict.
+        if size is None:
+            size = {"height": 1080, "width": 1920}
+        if patch_size is None:
+            patch_size = {"height": 30, "width": 30}
+
+        # Resizing options.
+        self.do_resize = do_resize
+        self.size = size
+        self.resample = resample
+        # Padding options.
+        self.do_pad = do_pad
+        self.padding_value = padding_value
+        self.padding_mode = padding_mode
+        # Normalization options.
+        self.do_normalize = do_normalize
+        self.image_mean = image_mean
+        self.image_std = image_std
+        # Rescaling options.
+        self.do_rescale = do_rescale
+        self.rescale_factor = rescale_factor
+        # Patch geometry.
+        self.patch_size = patch_size
+
+    def resize(
+        self,
+        image: np.ndarray,
+        size: dict[str, int],
+        resample: PILImageResampling = PILImageResampling.BILINEAR,
+        data_format: Optional[Union[str, ChannelDimension]] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+        **kwargs,
+    ) -> np.ndarray:
+        """
+        Shrink an image so that it fits within `(size["height"], size["width"])`, keeping its aspect ratio.
+
+        An image that already fits inside the target size is never upscaled: its pixels are returned untouched
+        (only converted to `data_format` when one is requested).
+
+        Args:
+            image (`np.ndarray`):
+                Image to resize.
+            size (`dict[str, int]`):
+                Dictionary in the format `{"height": int, "width": int}` specifying the maximum size of the output
+                image.
+            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
+                `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BILINEAR`.
+            data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the output image. If unset, the channel dimension format of the input
+                image is used. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the input image. If unset, the channel dimension format is inferred
+                from the input image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+
+        Returns:
+            `np.ndarray`: The resized image.
+        """
+        image_height, image_width = get_image_size(image, input_data_format)
+        target_height, target_width = size["height"], size["width"]
+
+        if image_width <= target_width and image_height <= target_height:
+            # Nothing to shrink, but an explicitly requested output layout must still be honored so that the
+            # result does not depend on whether the image happened to need resizing.
+            if data_format is not None:
+                return to_channel_dimension_format(image, data_format, input_data_format)
+            return image
+
+        # Use the tighter of the two ratios so that both dimensions end up within the target.
+        height_scale_factor = target_height / image_height
+        width_scale_factor = target_width / image_width
+        optimal_scale_factor = min(height_scale_factor, width_scale_factor)
+
+        new_height = int(image_height * optimal_scale_factor)
+        new_width = int(image_width * optimal_scale_factor)
+
+        scaled_image = resize(
+            image=image,
+            size=(new_height, new_width),
+            resample=resample,
+            data_format=data_format,
+            input_data_format=input_data_format,
+            **kwargs,
+        )
+        return scaled_image
+
+    def pad_image(
+        self,
+        image: np.ndarray,
+        size: dict[str, int],
+        mode: str = "constant",
+        constant_values: float = 1.0,
+        data_format: Optional[Union[str, ChannelDimension]] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    ) -> np.ndarray:
+        """
+        Grow an image to `(size["height"], size["width"])` by padding its bottom and right edges.
+
+        Args:
+            image (`np.ndarray`):
+                Image to pad.
+            size (`dict[str, int]`):
+                Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image.
+            mode (`str`, *optional*, defaults to `"constant"`):
+                Padding mode, forwarded unchanged to `pad`.
+            constant_values (`float`, *optional*, defaults to 1.0):
+                Fill value, forwarded unchanged to `pad`.
+            data_format (`ChannelDimension` or `str`, *optional*):
+                The data format of the output image. If unset, the same format as the input image is used.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format of the input image. If not provided, it will be inferred.
+
+        Returns:
+            `np.ndarray`: The padded image.
+        """
+        current_height, current_width = get_image_size(image, input_data_format)
+
+        # The existing content stays anchored at the top-left corner: nothing is added above or to the left.
+        rows_to_add = size["height"] - current_height
+        cols_to_add = size["width"] - current_width
+
+        return pad(
+            image,
+            padding=((0, rows_to_add), (0, cols_to_add)),
+            mode=mode,
+            constant_values=constant_values,
+            data_format=data_format,
+            input_data_format=input_data_format,
+        )
+
+    @filter_out_non_signature_kwargs()
+    def preprocess(
+        self,
+        images,
+        do_resize: Optional[bool] = None,
+        size: Optional[dict[str, int]] = None,
+        resample: Optional[PILImageResampling] = None,
+        do_pad: Optional[bool] = None,
+        padding_value: Optional[float] = None,
+        padding_mode: Optional[str] = None,
+        do_normalize: Optional[bool] = None,
+        image_mean: Optional[float] = None,
+        image_std: Optional[float] = None,
+        do_rescale: Optional[bool] = None,
+        rescale_factor: Optional[float] = None,
+        patch_size: Optional[dict[str, int]] = None,
+        data_format: Optional[Union[str, ChannelDimension]] = ChannelDimension.FIRST,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+        return_tensors: Optional[TensorType] = None,
+    ):
+        """
+
+        Utility function to preprocess the images and extract necessary information about original formats.
+
+        The steps run in this order, each one only when enabled: resize, pad, rescale, normalize, and finally
+        conversion to `data_format`.
+
+        Args:
+            images (`ImageInput`):
+                Images to preprocess. Expects a single image, a list of images or a list of lists of images. Pixel
+                values range from 0 to 255, or between 0 and 1 if `do_rescale` is `False`.
+            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
+                Whether to resize the image to `size`.
+            size (`dict[str, int]`, *optional*, defaults to `self.size`):
+                Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image.
+            resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
+                `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BILINEAR`.
+            do_pad (`bool`, *optional*, defaults to `self.do_pad`):
+                Whether to pad the image to `size`.
+            padding_value (`float`, *optional*, defaults to `self.padding_value`):
+                The value to pad the image with.
+            padding_mode (`str`, *optional*, defaults to `self.padding_mode`):
+                The padding mode to use when padding the image.
+            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
+                Whether to normalize the image.
+            image_mean (`float`, *optional*, defaults to `self.image_mean`):
+                The mean to use when normalizing the image.
+            image_std (`float`, *optional*, defaults to `self.image_std`):
+                The standard deviation to use when normalizing the image.
+            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
+                Whether to rescale the image.
+            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
+                The factor to use when rescaling the image.
+            patch_size (`dict[str, int]`, *optional*, defaults to `self.patch_size`):
+                Dictionary in the format `{"height": int, "width": int}` specifying the size of the patches.
+                Currently not used by this method.
+            return_tensors (`str` or `TensorType`, *optional*):
+                The type of tensors to return. Can be one of:
+                - Unset: Return a list of `np.ndarray`.
+                - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
+                - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
+                - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
+                - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
+            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
+                The channel dimension format of the output image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the input image. If unset, the channel dimension format is inferred
+                from the input image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+
+        Returns:
+            `FuyuBatchFeature`: A batch feature with the keys `images`, `image_unpadded_heights`,
+            `image_unpadded_widths` and `image_scale_factors`.
+
+        Raises:
+            `ValueError`: If any sample contains more than one image.
+        """
+
+        # Per-call arguments take precedence over the defaults stored on the instance.
+        do_resize = do_resize if do_resize is not None else self.do_resize
+        size = size if size is not None else self.size
+        resample = resample if resample is not None else self.resample
+        do_pad = do_pad if do_pad is not None else self.do_pad
+        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
+        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
+        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
+        image_mean = image_mean if image_mean is not None else self.image_mean
+        image_std = image_std if image_std is not None else self.image_std
+        padding_value = padding_value if padding_value is not None else self.padding_value
+        padding_mode = padding_mode if padding_mode is not None else self.padding_mode
+
+        if isinstance(images, list) and any(isinstance(elem, list) and len(elem) >= 2 for elem in images):
+            raise ValueError("Multiple images for a single sample are not yet supported.")
+
+        batch_images = make_list_of_list_of_images(images)
+
+        validate_preprocess_arguments(
+            do_rescale=do_rescale,
+            rescale_factor=rescale_factor,
+            do_normalize=do_normalize,
+            image_mean=image_mean,
+            image_std=image_std,
+            do_pad=do_pad,
+            size_divisibility=size,  # There is no pad divisibility in this processor, but pad requires the size arg.
+            do_resize=do_resize,
+            size=size,
+            resample=resample,
+        )
+        # All transformations expect numpy arrays.
+        batch_images = [[to_numpy_array(image) for image in images] for images in batch_images]
+
+        if do_rescale and is_scaled_image(batch_images[0][0]):
+            logger.warning_once(
+                "It looks like you are trying to rescale already rescaled images. If the input"
+                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
+            )
+
+        if input_data_format is None:
+            # We assume that all images have the same channel dimension format.
+            input_data_format = infer_channel_dimension_format(batch_images[0][0])
+
+        original_image_sizes = [get_image_size(images[0], channel_dim=input_data_format) for images in batch_images]
+        size = get_size_dict(size)  # for BC
+
+        if do_resize:
+            # Forward the resolved filter; otherwise `self.resize` would always use its own default.
+            batch_images = [
+                [
+                    self.resize(image, size=size, resample=resample, input_data_format=input_data_format)
+                    for image in images
+                ]
+                for images in batch_images
+            ]
+
+        # Sizes after resizing but before padding; returned to the caller alongside the images.
+        image_sizes = [get_image_size(images[0], channel_dim=input_data_format) for images in batch_images]
+        image_unpadded_heights = [[image_size[0]] for image_size in image_sizes]
+        image_unpadded_widths = [[image_size[1]] for image_size in image_sizes]
+
+        # scale_h is the same as scale_w
+        image_scale_factors = [
+            [resized_size[0] / original_size[0]]
+            for original_size, resized_size in zip(original_image_sizes, image_sizes)
+        ]
+
+        if do_pad:
+            batch_images = [
+                [
+                    self.pad_image(
+                        image,
+                        size=size,
+                        mode=padding_mode,
+                        constant_values=padding_value,
+                        input_data_format=input_data_format,
+                    )
+                    for image in images
+                ]
+                for images in batch_images
+            ]
+
+        if do_rescale:
+            batch_images = [
+                [self.rescale(image, scale=rescale_factor, input_data_format=input_data_format) for image in images]
+                for images in batch_images
+            ]
+
+        if do_normalize:
+            batch_images = [
+                [
+                    self.normalize(image, mean=image_mean, std=image_std, input_data_format=input_data_format)
+                    for image in images
+                ]
+                for images in batch_images
+            ]
+
+        if data_format is not None:
+            batch_images = [
+                [to_channel_dimension_format(image, data_format, input_data_format) for image in images]
+                for images in batch_images
+            ]
+
+        data = {
+            "images": batch_images,
+            "image_unpadded_heights": image_unpadded_heights,
+            "image_unpadded_widths": image_unpadded_widths,
+            "image_scale_factors": image_scale_factors,
+        }
+        return FuyuBatchFeature(data=data, tensor_type=return_tensors)
+
+    def get_num_patches(self, image_height: int, image_width: int, patch_size: Optional[dict[str, int]] = None) -> int:
+        """
+        Calculate number of patches required to encode an image.
+
+        Args:
+            image_height (`int`):
+                Height of the image.
+            image_width (`int`):
+                Width of the image.
+            patch_size (`dict[str, int]`, *optional*, defaults to `self.patch_size`):
+                Dictionary in the format `{"height": int, "width": int}` specifying the size of the patches.
+
+        Returns:
+            `int`: The number of patches along the height multiplied by the number of patches along the width.
+
+        Raises:
+            `ValueError`: If `image_height` or `image_width` is not a multiple of the corresponding patch dimension.
+        """
+        patch_size = patch_size if patch_size is not None else self.patch_size
+        # Read the resolved value, not `self.patch_size`, so an explicit `patch_size` argument takes effect.
+        patch_height, patch_width = patch_size["height"], patch_size["width"]
+
+        if image_height % patch_height != 0:
+            raise ValueError(f"{image_height=} must be divisible by {patch_height}")
+        if image_width % patch_width != 0:
+            raise ValueError(f"{image_width=} must be divisible by {patch_width}")
+
+        num_patches_per_dim_h = image_height // patch_height
+        num_patches_per_dim_w = image_width // patch_width
+        num_patches = num_patches_per_dim_h * num_patches_per_dim_w
+        return num_patches
+
+    def patchify_image(self, image: "torch.Tensor", patch_size: Optional[dict[str, int]] = None) -> "torch.Tensor":
+        """
+        Cut a batch of images into non-overlapping patches and flatten each patch into a vector.
+
+        Args:
+            image (`torch.Tensor`):
+                Image to convert. Shape: [batch, channels, height, width]
+            patch_size (`dict[str, int]`, *optional*, defaults to `self.patch_size`):
+                Dictionary in the format `{"height": int, "width": int}` specifying the size of the patches.
+
+        Returns:
+            `torch.Tensor`: Patches of shape [batch, num_patches, channels * patch_height * patch_width].
+        """
+        requires_backends(self, ["torch"])
+        if patch_size is None:
+            patch_size = self.patch_size
+        tile_height = patch_size["height"]
+        tile_width = patch_size["width"]
+
+        # TODO refer to https://github.com/ArthurZucker/transformers/blob/0f0a3fe5ca5697ee58faeb5b53f049af720b5e98/src/transformers/models/vit_mae/modeling_vit_mae.py#L871
+        # torch implementation is faster but does not handle non-squares
+
+        num_images, num_channels, _, _ = image.shape
+        # Slide a window with stride equal to its size along the height, then along the width.
+        tiles = image.unfold(2, tile_height, tile_height).unfold(3, tile_width, tile_width)
+        # `view` needs contiguous memory after the two unfolds.
+        tiles = tiles.contiguous().view(num_images, num_channels, -1, tile_height, tile_width)
+        # Move channels last so that each flattened patch is laid out as (height, width, channel).
+        tiles = tiles.permute(0, 2, 3, 4, 1)
+        return tiles.reshape(num_images, -1, num_channels * tile_height * tile_width)
+
+    def preprocess_with_tokenizer_info(
+        self,
+        image_input: "torch.Tensor",
+        image_present: "torch.Tensor",
+        image_unpadded_h: "torch.Tensor",
+        image_unpadded_w: "torch.Tensor",
+        image_placeholder_id: int,
+        image_newline_id: int,
+        variable_sized: bool,
+        patch_size: Optional[dict[str, int]] = None,
+    ) -> FuyuBatchFeature:
+        """Process images for model input. In particular, variable-sized images are handled here.
+
+        Args:
+            image_input (`torch.Tensor` of shape [batch_size, subsequence_size, num_channels, height, width]):
+                Tensor of images padded to model input size.
+            image_present (`torch.Tensor` of shape [batch_size, subsequence_size, num_images]):
+                Tensor of 1s and 0s indicating whether an image is present.
+            image_unpadded_h (`torch.Tensor` of shape [batch_size, subsequence_size]):
+                Tensor of unpadded image heights.
+            image_unpadded_w (`torch.Tensor` of shape [batch_size, subsequence_size]):
+                Tensor of unpadded image widths.
+            image_placeholder_id (int):
+                The id of the image placeholder token. Comes from an associated tokenizer.
+            image_newline_id (int):
+                The id of the image newline token. Comes from an associated tokenizer.
+            variable_sized (bool):
+                Whether to process images as variable-sized.
+            patch_size (`dict[str, int]`, *optional*, defaults to `self.patch_size`):
+                Size of the patches.
+
+        Returns:
+            `FuyuBatchFeature`: A batch feature with the keys `images`, `image_input_ids`, `image_patches`,
+            `image_patch_indices_per_batch` and `image_patch_indices_per_subsequence`.
+        """
+        requires_backends(self, ["torch"])
+
+        patch_size = patch_size if patch_size is not None else self.patch_size
+        patch_height, patch_width = patch_size["height"], patch_size["width"]
+
+        # Only images that are present.
+        images: list[list[torch.Tensor]] = []
+        batch_image_patches: list[list[torch.Tensor]] = []
+        # Image input ids for every subsequence, including ones with no image present.
+        batch_image_input_ids: list[list[torch.Tensor]] = []
+        for batch_index in range(image_input.shape[0]):
+            image_input_ids = []
+            image_patches = []
+            for subseq_index in range(image_input.shape[1]):
+                if image_present[batch_index, subseq_index]:
+                    image = image_input[batch_index, subseq_index]
+                    image_height, image_width = image.shape[1], image.shape[2]
+                    if variable_sized:
+                        # Crop the padded image down to the smallest whole number of patches that still covers the
+                        # unpadded content.
+                        # The min() is required here due to floating point issues:
+                        # math.ceil(torch.tensor(300).cuda() / 30) == 11
+                        new_h = min(
+                            image_height,
+                            math.ceil(image_unpadded_h[batch_index, subseq_index] / patch_height) * patch_height,
+                        )
+                        new_w = min(
+                            image_width,
+                            math.ceil(image_unpadded_w[batch_index, subseq_index] / patch_width) * patch_width,
+                        )
+                        image = image[:, :new_h, :new_w]
+                        image_height, image_width = new_h, new_w
+
+                    # Forward the resolved patch size so that counting, patchifying, cropping and the newline
+                    # layout below all use the same patch geometry.
+                    num_patches = self.get_num_patches(
+                        image_height=image_height, image_width=image_width, patch_size=patch_size
+                    )
+                    tensor_of_image_ids = torch.full(
+                        [num_patches], image_placeholder_id, dtype=torch.int32, device=image_input.device
+                    )
+                    patches = self.patchify_image(image=image.unsqueeze(0), patch_size=patch_size).squeeze(0)
+                    assert num_patches == patches.shape[0]
+
+                    if variable_sized:
+                        # Now terminate each line with |NEWLINE|.
+                        tensor_of_image_ids = tensor_of_image_ids.reshape(-1, image_width // patch_width)
+                        newline_ids = torch.full(
+                            [tensor_of_image_ids.shape[0], 1],
+                            image_newline_id,
+                            dtype=torch.int32,
+                            device=image_input.device,
+                        )
+                        tensor_of_image_ids = torch.cat([tensor_of_image_ids, newline_ids], dim=1)
+                        tensor_of_image_ids = tensor_of_image_ids.reshape(-1)
+
+                    images.append([image])
+                    image_input_ids.append(tensor_of_image_ids)
+                    image_patches.append(patches)
+                else:
+                    # Keep one (empty) entry per subsequence so positions stay aligned with `image_input`.
+                    image_input_ids.append(torch.tensor([], dtype=torch.int32, device=image_input.device))
+
+            batch_image_input_ids.append(image_input_ids)
+            batch_image_patches.append(image_patches)
+
+        # Create image_patch_input_indices, where non-negative values correspond to image patches to be inserted in
+        # the stream.
+        image_patch_indices_per_batch: list[list[torch.Tensor]] = []
+        image_patch_indices_per_subsequence: list[list[torch.Tensor]] = []
+
+        for sample_image_input_ids in batch_image_input_ids:
+            # Running count of the patches seen so far in this sample; makes the per-batch indices cumulative.
+            index_offset = 0
+            per_batch_indices = []
+            per_subsequence_indices = []
+            for subseq_image_input_ids in sample_image_input_ids:
+                # Indices of image patches.
+                patches_mask = subseq_image_input_ids == image_placeholder_id
+                num_patches = torch.count_nonzero(patches_mask)
+                indices = torch.arange(num_patches, dtype=torch.int64, device=subseq_image_input_ids.device).type_as(
+                    subseq_image_input_ids
+                )
+
+                # Place those indices in the image input ids token stream, with -1 representing non-index tokens.
+                indices_in_stream_per_batch = torch.full_like(subseq_image_input_ids, -1)
+                indices_in_stream_per_subsequence = torch.full_like(subseq_image_input_ids, -1)
+                patches_inds = torch.nonzero(patches_mask, as_tuple=True)[0]
+
+                indices_in_stream_per_batch[patches_inds] = indices + index_offset
+                indices_in_stream_per_subsequence[patches_inds] = indices
+
+                per_batch_indices.append(indices_in_stream_per_batch)
+                per_subsequence_indices.append(indices_in_stream_per_subsequence)
+                index_offset += num_patches
+
+            image_patch_indices_per_batch.append(per_batch_indices)
+            image_patch_indices_per_subsequence.append(per_subsequence_indices)
+
+        return FuyuBatchFeature(
+            data={
+                "images": images,
+                "image_input_ids": batch_image_input_ids,
+                "image_patches": batch_image_patches,
+                "image_patch_indices_per_batch": image_patch_indices_per_batch,
+                "image_patch_indices_per_subsequence": image_patch_indices_per_subsequence,
+            }
+        )
+
+
+__all__ = ["FuyuImageProcessor"]
diff --git a/phivenv/Lib/site-packages/transformers/models/fuyu/modeling_fuyu.py b/phivenv/Lib/site-packages/transformers/models/fuyu/modeling_fuyu.py
new file mode 100644
index 0000000000000000000000000000000000000000..25e13813f349e878f7033290c36ff3cef9daa45a
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/fuyu/modeling_fuyu.py
@@ -0,0 +1,406 @@
+# coding=utf-8
+# Copyright 2023 HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch Fuyu model."""
+
+from typing import Optional, Union
+
+import torch
+import torch.utils.checkpoint
+from torch import nn
+
+from ...cache_utils import Cache
+from ...generation import GenerationMixin
+from ...modeling_outputs import CausalLMOutputWithPast
+from ...modeling_utils import PreTrainedModel
+from ...models.auto.modeling_auto import AutoModel
+from ...utils import auto_docstring, can_return_tuple, logging
+from .configuration_fuyu import FuyuConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+@auto_docstring
+class FuyuPreTrainedModel(PreTrainedModel):
+    config: FuyuConfig
+    base_model_prefix = "fuyu"
+    supports_gradient_checkpointing = True
+    _supports_attention_backend = True
+    _supports_flash_attn = True
+    _supports_sdpa = True
+    _supports_flex_attn = True
+    _no_split_modules = []
+    _skip_keys_device_placement = "past_key_values"
+
+    def _init_weights(self, module):
+        # Linear/Embedding weights ~ N(0, initializer_range); Linear biases and the Embedding padding row are zeroed.
+        std = self.config.initializer_range
+        if isinstance(module, (nn.Linear, nn.Embedding)):
+            module.weight.data.normal_(mean=0.0, std=std)
+        if isinstance(module, nn.Linear):
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding) and module.padding_idx is not None:
+            module.weight.data[module.padding_idx].zero_()
+
+
+@auto_docstring(
+    custom_intro="""
+    The Fuyu model which consists of a vision backbone and a language model, without a language modeling head.
+    """
+)
+class FuyuModel(FuyuPreTrainedModel):
+    _checkpoint_conversion_mapping = {"language_model.model": "language_model"}
+
+    def __init__(self, config: FuyuConfig):
+        super().__init__(config)
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.text_config.vocab_size
+        self.language_model = AutoModel.from_config(config.text_config)
+        self.vision_embed_tokens = nn.Linear(
+            config.patch_size * config.patch_size * config.num_channels, config.hidden_size
+        )
+
+        self.gradient_checkpointing = False
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.language_model.get_input_embeddings()
+
+    def set_input_embeddings(self, value):
+        self.language_model.set_input_embeddings(value)
+
+    def set_decoder(self, decoder):
+        self.language_model = decoder
+
+    def get_decoder(self):
+        return self.language_model
+
+    def gather_continuous_embeddings(
+        self,
+        word_embeddings: torch.Tensor,
+        continuous_embeddings: list[torch.Tensor],
+        image_patch_input_indices: torch.Tensor,
+    ) -> torch.Tensor:
+        """Returns a copy of word_embeddings in which the positions flagged by image_patch_input_indices are replaced
+        with rows of continuous_embeddings. Different batch elements can have different numbers of continuous
+        embeddings.
+
+        Args:
+            word_embeddings (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+                Tensor of word embeddings. It is cloned; the input tensor is not modified.
+            continuous_embeddings (`list[torch.FloatTensor]`, each of shape `(num_image_embeddings, hidden_size)`):
+                The length of the list is the batch size. For each batch element, num_image_embeddings must be at
+                least the number of non-negative indices in image_patch_input_indices; a `ValueError` is raised
+                otherwise (fewer indices than embeddings is accepted, e.g. if images got truncated).
+            image_patch_input_indices (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+                Per position: the row of continuous_embeddings to place there, or a negative value to keep the word.
+        """
+        if not (word_embeddings.shape[0] == len(continuous_embeddings)):
+            raise ValueError(
+                f"Batch sizes must match! Got {len(continuous_embeddings)=} and {word_embeddings.shape[0]=}"
+            )
+
+        output_embeddings = word_embeddings.clone()
+        for batch_idx in range(word_embeddings.shape[0]):
+            # First, find the positions of all the non-negative values in image_patch_input_indices, those are the
+            # positions in word_embeddings that we want to replace with content from continuous_embeddings.
+            dst_indices = torch.nonzero(image_patch_input_indices[batch_idx] >= 0, as_tuple=True)[0]
+            # Next look up those indices in image_patch_input_indices to find the indices in continuous_embeddings that we
+            # want to use to replace the values in word_embeddings.
+            src_indices = image_patch_input_indices[batch_idx][dst_indices]
+            # Check if we have more indices than embeddings. Note that we could have fewer indices if images got truncated.
+            if src_indices.shape[0] > continuous_embeddings[batch_idx].shape[0]:
+                raise ValueError(
+                    f"Number of continuous embeddings {continuous_embeddings[batch_idx].shape=} does not match "
+                    f"number of continuous token ids {src_indices.shape=} in batch element {batch_idx}."
+                )
+            output_embeddings[batch_idx, dst_indices] = continuous_embeddings[batch_idx][src_indices].to(
+                output_embeddings.device
+            )
+        return output_embeddings
+
+    def get_image_features(self, pixel_values: torch.FloatTensor, **kwargs):
+        """
+        Projects image patches into continuous embeddings; returns a list with one tensor per entry of `pixel_values`.
+
+        Args:
+            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_total_patches, patch_size x patch_size x num_channels)`):
+                The flattened image patches (`forward` passes `image_patches` here); `**kwargs` is not read.
+        """
+        patch_embeddings = [
+            self.vision_embed_tokens(patch.to(self.vision_embed_tokens.weight.dtype)).squeeze(0)
+            for patch in pixel_values
+        ]
+        return patch_embeddings
+
+    def get_placeholder_mask(
+        self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor, image_features: torch.FloatTensor
+    ):
+        """
+        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
+        equal to the length of multimodal features. If the lengths are different, an error is raised.
+        """
+        if input_ids is None:
+            special_image_mask = inputs_embeds == self.get_input_embeddings()(
+                torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device)
+            )
+            special_image_mask = special_image_mask.all(-1)
+        else:
+            special_image_mask = input_ids == self.config.image_token_id
+
+        n_image_tokens = special_image_mask.sum()
+        special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
+        n_image_features = image_features.shape[0] * image_features.shape[1]
+        if inputs_embeds[special_image_mask].numel() != image_features.numel():
+            raise ValueError(
+                f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
+            )
+        return special_image_mask
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        image_patches: torch.Tensor = None,  # [batch_size, num_total_patches, patch_size_ x patch_size x num_channels ]
+        image_patches_indices: torch.Tensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        **kwargs,
+    ) -> Union[tuple, CausalLMOutputWithPast]:
+        r"""
+        image_patches (`torch.FloatTensor` of shape `(batch_size, num_total_patches, patch_size_ x patch_size x num_channels)`, *optional*):
+            Image patches to be used as continuous embeddings. The patches are flattened and then projected to the
+            hidden size of the model.
+        image_patches_indices (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Tensor of indices of the image patches in the input_ids tensor. Not read by this method.
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+        elif input_ids is not None:
+            batch_size, seq_length = input_ids.shape
+        elif inputs_embeds is not None:
+            batch_size, seq_length, _ = inputs_embeds.shape
+        else:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+        if position_ids is None:
+            device = input_ids.device if input_ids is not None else inputs_embeds.device
+            past_key_values_length = past_key_values.get_seq_length() if past_key_values is not None else 0
+            position_ids = torch.arange(
+                past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device
+            )
+            position_ids = position_ids.unsqueeze(0)
+
+        if inputs_embeds is None:
+            inputs_embeds = self.language_model.get_input_embeddings()(input_ids)
+        # Overwrite the embeddings at the image placeholder positions with the projected image patches.
+        if image_patches is not None:
+            patch_embeddings = self.get_image_features(image_patches)
+            patch_embeddings = torch.cat(patch_embeddings, dim=0).to(inputs_embeds.device, inputs_embeds.dtype)
+            special_image_mask = self.get_placeholder_mask(
+                input_ids, inputs_embeds=inputs_embeds, image_features=patch_embeddings
+            )
+            inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, patch_embeddings)
+
+        outputs = self.language_model(
+            inputs_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            use_cache=use_cache,
+            return_dict=return_dict,
+            **kwargs,
+        )
+
+        return outputs
+
+
+@auto_docstring(
+    custom_intro="""
+    Fuyu Model with a language modeling head on top for causal language model conditioned on image patches and text.
+    """
+)
+class FuyuForCausalLM(FuyuPreTrainedModel, GenerationMixin):
+    _checkpoint_conversion_mapping = {
+        "^language_model.model": "model.language_model",
+        "^vision_embed_tokens": "model.vision_embed_tokens",
+        "^language_model.lm_head": "lm_head",
+    }
+    _tied_weights_keys = ["lm_head.weight"]
+
+    def __init__(self, config: FuyuConfig):
+        super().__init__(config)
+        self.model = FuyuModel(config)
+        self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.model.get_input_embeddings()
+
+    def set_input_embeddings(self, value):
+        self.model.set_input_embeddings(value)
+
+    def set_decoder(self, decoder):
+        self.model.set_decoder(decoder)
+
+    def get_decoder(self):
+        return self.model.get_decoder()
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        image_patches: torch.Tensor = None,  # [batch_size, num_total_patches, patch_size_ x patch_size x num_channels ]
+        image_patches_indices: torch.Tensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        labels: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        logits_to_keep: Optional[int] = 0,
+        **kwargs,
+    ) -> Union[tuple, CausalLMOutputWithPast]:
+        r"""
+        image_patches (`torch.FloatTensor` of shape `(batch_size, num_total_patches, patch_size_ x patch_size x num_channels)`, *optional*):
+            Image patches to be used as continuous embeddings. The patches are flattened and then projected to the
+            hidden size of the model.
+        image_patches_indices (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Tensor of indices of the image patches in the input_ids tensor.
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+            config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.
+
+        Examples:
+
+        ```python
+        >>> from transformers import FuyuProcessor, FuyuForCausalLM
+        >>> from PIL import Image
+        >>> import requests
+
+        >>> processor = FuyuProcessor.from_pretrained("adept/fuyu-8b")
+        >>> model = FuyuForCausalLM.from_pretrained("adept/fuyu-8b")
+
+        >>> url = "https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/bus.png"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+        >>> prompt = "Generate a coco-style caption.\n"
+
+        >>> inputs = processor(images=image, text=prompt, return_tensors="pt")
+        >>> outputs = model(**inputs)
+
+        >>> generated_ids = model.generate(**inputs, max_new_tokens=7)
+        >>> generation_text = processor.batch_decode(generated_ids[:, -7:], skip_special_tokens=True)
+        >>> print(generation_text[0])
+        A blue bus parked on the side of a road.
+        ```"""
+
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.model(
+            input_ids=input_ids,
+            image_patches=image_patches,
+            image_patches_indices=image_patches_indices,
+            inputs_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            use_cache=use_cache,
+            return_dict=True,
+            # don't pass kwargs because Persimmon-backbone doesn't accept FA2 kwargs yet, TODO: raushan
+        )
+
+        hidden_states = outputs[0]
+        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
+        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+        logits = self.lm_head(hidden_states[:, slice_indices, :])
+
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(
+                logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **kwargs
+            )
+
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+    def prepare_inputs_for_generation(
+        self,
+        input_ids,
+        past_key_values=None,
+        attention_mask=None,
+        inputs_embeds=None,
+        image_patches=None,
+        image_patches_indices=None,
+        cache_position=None,
+        **kwargs,
+    ):
+        # Overwritten -- image inputs are only forwarded to the model when cache_position starts at 0
+
+        model_inputs = super().prepare_inputs_for_generation(
+            input_ids,
+            past_key_values=past_key_values,
+            attention_mask=attention_mask,
+            inputs_embeds=inputs_embeds,
+            image_patches=image_patches,
+            image_patches_indices=image_patches_indices,
+            cache_position=cache_position,
+            **kwargs,
+        )
+
+        if cache_position[0] != 0:
+            # cache_position no longer starts at 0 (decoding stage): set both image inputs to `None`
+            model_inputs["image_patches_indices"] = None
+            model_inputs["image_patches"] = None
+
+        return model_inputs
+
+
+__all__ = ["FuyuForCausalLM", "FuyuPreTrainedModel", "FuyuModel"]
diff --git a/phivenv/Lib/site-packages/transformers/models/fuyu/processing_fuyu.py b/phivenv/Lib/site-packages/transformers/models/fuyu/processing_fuyu.py
new file mode 100644
index 0000000000000000000000000000000000000000..8c75879e8b6910bb7db331e7705578c730d41941
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/fuyu/processing_fuyu.py
@@ -0,0 +1,793 @@
+# coding=utf-8
+# Copyright 2023 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Image/Text processor class for Fuyu
+"""
+
+import re
+from typing import Optional, Union
+
+import numpy as np
+
+from ...image_utils import ImageInput
+from ...processing_utils import (
+    MultiModalData,
+    ProcessingKwargs,
+    ProcessorMixin,
+    Unpack,
+)
+from ...tokenization_utils_base import PreTokenizedInput, TextInput
+from ...utils import is_torch_available, logging, requires_backends
+from ...utils.import_utils import requires
+
+
+if is_torch_available():
+    from .image_processing_fuyu import FuyuBatchFeature
+
+
+logger = logging.get_logger(__name__)
+
+
+if is_torch_available():
+    import torch
+
+
+TEXT_REPR_BBOX_OPEN = "<box>"
+TEXT_REPR_BBOX_CLOSE = "</box>"
+TEXT_REPR_POINT_OPEN = "<point>"
+TEXT_REPR_POINT_CLOSE = "</point>"
+
+TOKEN_BBOX_OPEN_STRING = "<0x00>"  # <bbox>
+TOKEN_BBOX_CLOSE_STRING = "<0x01>"  # </bbox>
+TOKEN_POINT_OPEN_STRING = "<0x02>"  # <point>
+TOKEN_POINT_CLOSE_STRING = "<0x03>"  # </point>
+BEGINNING_OF_ANSWER_STRING = "<0x04>"  # <boa>
+
+
+class FuyuProcessorKwargs(ProcessingKwargs, total=False):
+    _defaults = {
+        "text_kwargs": {
+            "add_special_tokens": True,
+            "padding": False,
+            "stride": 0,
+            "return_attention_mask": True,
+            "return_overflowing_tokens": False,
+            "return_special_tokens_mask": False,
+            "return_offsets_mapping": False,
+            "return_token_type_ids": False,
+            "return_length": False,
+            "verbose": True,
+            "return_mm_token_type_ids": False,
+        },
+        "images_kwargs": {},
+    }
+
+
+def full_unpacked_stream_to_tensor(
+    all_bi_tokens_to_place: list[int],
+    full_unpacked_stream: list["torch.Tensor"],
+    fill_value: int,
+    batch_size: int,
+    new_seq_len: int,
+    offset: int,
+) -> "torch.Tensor":
+    """Pads an unpacked stream of tokens (i.e. a list of tensors, one for each item in the batch) into a single tensor
+    of shape batch_size x new_seq_len: row bi holds full_unpacked_stream[bi][offset : offset + n], then fill_value,
+    where n = all_bi_tokens_to_place[bi]."""
+    # Both per-item lists must have exactly one entry per batch element.
+    assert len(all_bi_tokens_to_place) == batch_size
+    assert len(full_unpacked_stream) == batch_size
+
+    # Start from a tensor filled with fill_value; dtype and device are taken from the first batch entry.
+    new_padded_tensor = torch.full(
+        [batch_size, new_seq_len],
+        fill_value=fill_value,
+        dtype=full_unpacked_stream[0].dtype,
+        device=full_unpacked_stream[0].device,
+    )
+
+    # Copy tokens_to_place tokens of each entry (starting at `offset`) into the front of its row.
+    for bi in range(batch_size):
+        tokens_to_place = all_bi_tokens_to_place[bi]
+        new_padded_tensor[bi, :tokens_to_place] = full_unpacked_stream[bi][offset : tokens_to_place + offset]
+
+    return new_padded_tensor
+
+
+def construct_full_unpacked_stream(
+    num_real_text_tokens: Union[list[list[int]], "torch.Tensor"],
+    input_stream: "torch.Tensor",
+    image_tokens: list[list["torch.Tensor"]],
+    batch_size: int,
+    num_sub_sequences: int,
+) -> list["torch.Tensor"]:
+    """Takes an input_stream tensor of shape B x S x ?. For each batch item, prepends the image tokens of its first
+    subsequence to the first subsequence of input_stream and keeps the image tokens plus the real text tokens.
+    Only subsequence 0 is used and `num_sub_sequences` is not read. Returns a list of tensors, one per batch item."""
+
+    all_bi_stream = []
+
+    for batch_index in range(batch_size):
+        all_si_stream = []
+
+        # First, construct the full token stream (image tokens followed by text tokens) for the first subsequence
+        # and append to lists. We use lists rather than tensors because each subsequence is variable-sized.
+        # TODO Remove this logic in a subsequent release since subsequences are not supported.
+        image_adjustment = image_tokens[batch_index][0]
+        subsequence_stream = torch.cat([image_adjustment, input_stream[batch_index, 0]], dim=0)
+        num_real_tokens = image_adjustment.shape[0] + num_real_text_tokens[batch_index][0]
+        all_si_stream.append(subsequence_stream[:num_real_tokens])
+        all_bi_stream.append(torch.cat(all_si_stream, dim=0))
+
+    return all_bi_stream
+
+
+def _replace_string_repr_with_token_tags(prompt: str) -> str:
+    prompt = prompt.replace(TEXT_REPR_POINT_OPEN, TOKEN_POINT_OPEN_STRING)
+    prompt = prompt.replace(TEXT_REPR_POINT_CLOSE, TOKEN_POINT_CLOSE_STRING)
+    prompt = prompt.replace(TEXT_REPR_BBOX_OPEN, TOKEN_BBOX_OPEN_STRING)
+    prompt = prompt.replace(TEXT_REPR_BBOX_CLOSE, TOKEN_BBOX_CLOSE_STRING)
+    return prompt
+
+
+def _segment_prompt_into_text_token_conversions(prompt: str) -> list:
+    """
+    Splits a prompt at the bbox/point token tags into a list of `(text, is_location)` tuples; the tags are dropped.
+    """
+    # `is_location` is True only for a segment that directly follows a bbox-open or point-open token tag
+    prompt_text_list: list = []
+    regex_pattern = re.compile(
+        f"({TOKEN_BBOX_OPEN_STRING}|{TOKEN_BBOX_CLOSE_STRING}|{TOKEN_POINT_OPEN_STRING}|{TOKEN_POINT_CLOSE_STRING})"
+    )
+    # The capture group makes split() return the tags too, so each segment can inspect its predecessor
+    prompt_split = regex_pattern.split(prompt)
+    for i, elem in enumerate(prompt_split):
+        if len(elem) == 0 or elem in [
+            TOKEN_BBOX_OPEN_STRING,
+            TOKEN_BBOX_CLOSE_STRING,
+            TOKEN_POINT_OPEN_STRING,
+            TOKEN_POINT_CLOSE_STRING,
+        ]:
+            continue
+        prompt_text_list.append(
+            (elem, i > 1 and prompt_split[i - 1] in [TOKEN_BBOX_OPEN_STRING, TOKEN_POINT_OPEN_STRING])
+        )
+    return prompt_text_list
+
+
+def _transform_coordinates_and_tokenize(prompt: str, scale_factor: float, tokenizer) -> list[int]:
+    """
+    This function transforms the prompt in the following fashion:
+    - <box> <point> and </box> </point> to their respective token mappings
+    - extract the coordinates from the tag
+    - transform the coordinates into the transformed image space
+    - return the prompt tokens with the transformed coordinates and new tags
+
+    Bounding boxes and points MUST be in the following format: <box>y1, x1, y2, x2</box> <point>x, y</point> The spaces
+    and punctuation added above are NOT optional.
+    """
+    # Each segment handled below is a (text, is_location) tuple
+
+    # Plan: tokenize plain text normally -> text found inside a point or box tag goes through _transform_within_tags
+    # After the point or box close tag, continue tokenizing normally
+    # First, we replace the point and box tags with their respective tokens
+    prompt = _replace_string_repr_with_token_tags(prompt)
+    # Split the prompt at the token tags into (text, is_location) segments
+    # (the tags themselves are dropped by the split helper)
+    prompt_text_list = _segment_prompt_into_text_token_conversions(prompt)
+    transformed_prompt_tokens: list[int] = []
+    for elem in prompt_text_list:
+        if elem[1]:
+            # This is a location, we need to scale and tokenize it
+            within_tag_tokenized = _transform_within_tags(elem[0], scale_factor, tokenizer)
+            # _transform_within_tags already wraps its tokens in the open and close tag tokens
+            transformed_prompt_tokens.extend(within_tag_tokenized)
+        else:
+            transformed_prompt_tokens.extend(tokenizer(elem[0], add_special_tokens=False).input_ids)
+    return transformed_prompt_tokens
+
+
+def _transform_within_tags(text: str, scale_factor: float, tokenizer) -> list[int]:
+    """
+    Given the text inside a tag, e.g. the `1, 2, 3, 4` of <box>1, 2, 3, 4</box> or the `1, 2` of <point>1, 2</point>,
+    scales the coordinates and returns their token ids wrapped in the tag's open/close ids. ValueError if not 2 or 4.
+    """
+    # Convert the text into a list of strings.
+    num_int_strs = text.split(",")
+    if len(num_int_strs) == 2:
+        # Two values mean a point: use the point open/close tag tokens. Anything else uses the box tag tokens.
+        token_space_open_string = tokenizer.vocab[TOKEN_POINT_OPEN_STRING]
+        token_space_close_string = tokenizer.vocab[TOKEN_POINT_CLOSE_STRING]
+    else:
+        token_space_open_string = tokenizer.vocab[TOKEN_BBOX_OPEN_STRING]
+        token_space_close_string = tokenizer.vocab[TOKEN_BBOX_CLOSE_STRING]
+
+    # Strip surrounding whitespace and parse each value as a float
+    num_ints = [float(num.strip()) for num in num_int_strs]
+    # scale to transformed image size
+    if len(num_ints) == 2:
+        num_ints_translated = scale_point_to_transformed_image(x=num_ints[0], y=num_ints[1], scale_factor=scale_factor)
+    elif len(num_ints) == 4:
+        num_ints_translated = scale_bbox_to_transformed_image(
+            top=num_ints[0],
+            left=num_ints[1],
+            bottom=num_ints[2],
+            right=num_ints[3],
+            scale_factor=scale_factor,
+        )
+    else:
+        raise ValueError(f"Invalid number of ints: {len(num_ints)}")
+    # Look up the vocabulary id of each scaled coordinate, rendered as a string
+    tokens = [tokenizer.vocab[str(num)] for num in num_ints_translated]
+    return [token_space_open_string] + tokens + [token_space_close_string]
+
+
+def _tokenize_prompts_with_image_and_batch(
+    tokenizer,
+    prompts: list[list[str]],
+    scale_factors: Optional[list[list["torch.Tensor"]]],
+    max_tokens_to_generate: int,
+    max_position_embeddings: int,
+    add_BOS: bool,  # True: prepend "<s>"; False: prepend "|ENDOFTEXT|"
+    add_beginning_of_answer_token: bool,
+) -> tuple["torch.Tensor", "torch.Tensor"]:
+    """
+    Tokenizes the prompts (scaling box/point coordinates when `scale_factors` is given), prepends `<s>` (or
+    `|ENDOFTEXT|` when `add_BOS` is False) to every subsequence and optionally appends the beginning-of-answer token
+    to the last subsequence, then pads every subsequence with `|ENDOFTEXT|` to
+    min(max prompt length + max_tokens_to_generate, max_position_embeddings). Returns (token tensor, length tensor).
+    """
+
+    # When scale factors are given, transform the coordinates while tokenizing
+    if scale_factors is not None:
+        transformed_prompt_tokens = []
+        for prompt_seq, scale_factor_seq in zip(prompts, scale_factors):
+            transformed_prompt_tokens.append(
+                [
+                    _transform_coordinates_and_tokenize(prompt, scale_factor.item(), tokenizer)
+                    for prompt, scale_factor in zip(prompt_seq, scale_factor_seq)
+                ]
+            )
+    else:
+        transformed_prompt_tokens = [[tokenizer.tokenize(prompt) for prompt in prompt_seq] for prompt_seq in prompts]
+
+    prompts_tokens = transformed_prompt_tokens
+
+    if add_BOS:
+        bos_token = tokenizer.vocab["<s>"]
+    else:
+        bos_token = tokenizer.vocab["|ENDOFTEXT|"]
+    prompts_tokens = [[[bos_token] + x for x in prompt_seq] for prompt_seq in prompts_tokens]
+    if add_beginning_of_answer_token:
+        beginning_of_answer = tokenizer.vocab[BEGINNING_OF_ANSWER_STRING]
+        # Only add the beginning-of-answer token to the last subsequence since that is what will be completed
+        for token_seq in prompts_tokens:
+            token_seq[-1].append(beginning_of_answer)
+
+    # Now we have a list of list of tokens which each list has a different
+    # size. We want to extend this list to:
+    #   - incorporate the tokens that need to be generated
+    #   - make all the sequences equal length.
+    # Get the prompts length (measured before padding; this is what the returned length tensor holds).
+
+    prompts_length = [[len(x) for x in prompts_tokens_seq] for prompts_tokens_seq in prompts_tokens]
+    # Get the max prompts length.
+    max_prompt_len: int = np.max(prompts_length)
+    # Number of tokens in the each sample of the batch.
+    samples_length = min(max_prompt_len + max_tokens_to_generate, max_position_embeddings)
+    if max_prompt_len + max_tokens_to_generate > max_position_embeddings:
+        logger.warning(
+            f"Max subsequence prompt length of {max_prompt_len} + max tokens to generate {max_tokens_to_generate} "
+            f"exceeds context length of {max_position_embeddings}. Will generate as many tokens as possible."
+        )
+    # Now update the list of list to be of the same size: samples_length.
+    for prompt_tokens_seq, prompts_length_seq in zip(prompts_tokens, prompts_length):
+        for prompt_tokens, prompt_length in zip(prompt_tokens_seq, prompts_length_seq):
+            if len(prompt_tokens) > samples_length:
+                raise ValueError("Length of subsequence prompt exceeds sequence length.")
+            padding_size = samples_length - prompt_length
+            prompt_tokens.extend([tokenizer.vocab["|ENDOFTEXT|"]] * padding_size)
+
+    # Now we are in a structured format, we can convert to tensors.
+    prompts_tokens_tensor = torch.tensor(prompts_tokens, dtype=torch.int64)
+    prompts_length_tensor = torch.tensor(prompts_length, dtype=torch.int64)
+
+    return prompts_tokens_tensor, prompts_length_tensor
+
+
+# Simplified assuming self.crop_top = self.padding_top = 0
+def original_to_transformed_h_coords(original_coords, scale_h):
+    return np.round(original_coords * scale_h).astype(np.int32)
+
+
+# Simplified assuming self.crop_left = self.padding_left = 0
+def original_to_transformed_w_coords(original_coords, scale_w):
+    return np.round(original_coords * scale_w).astype(np.int32)
+
+
+def scale_point_to_transformed_image(x: float, y: float, scale_factor: float) -> list[int]:
+    x_scaled = original_to_transformed_w_coords(np.array([x / 2]), scale_factor)[0]
+    y_scaled = original_to_transformed_h_coords(np.array([y / 2]), scale_factor)[0]
+    return [x_scaled, y_scaled]
+
+
+def scale_bbox_to_transformed_image(
+    top: float, left: float, bottom: float, right: float, scale_factor: float
+) -> list[int]:
+    top_scaled = original_to_transformed_w_coords(np.array([top / 2]), scale_factor)[0]
+    left_scaled = original_to_transformed_h_coords(np.array([left / 2]), scale_factor)[0]
+    bottom_scaled = original_to_transformed_w_coords(np.array([bottom / 2]), scale_factor)[0]
+    right_scaled = original_to_transformed_h_coords(np.array([right / 2]), scale_factor)[0]
+    return [top_scaled, left_scaled, bottom_scaled, right_scaled]
+
+
+@requires(backends=("vision",))
+class FuyuProcessor(ProcessorMixin):
+    r"""
+    Constructs a Fuyu processor which wraps a Fuyu image processor and a Llama tokenizer into a single processor.
+
+    [`FuyuProcessor`] offers all the functionalities of [`FuyuImageProcessor`] and [`LlamaTokenizerFast`]. See the
+    [`~FuyuProcessor.__call__`] and [`~FuyuProcessor.decode`] for more information.
+
+    Args:
+        image_processor ([`FuyuImageProcessor`]):
+            The image processor is a required input.
+        tokenizer ([`LlamaTokenizerFast`]):
+            The tokenizer is a required input.
+    """
+
+    attributes = ["image_processor", "tokenizer"]
+    image_processor_class = "FuyuImageProcessor"
+    tokenizer_class = "AutoTokenizer"
+
+    def __init__(self, image_processor, tokenizer, **kwargs):
+        super().__init__(image_processor=image_processor, tokenizer=tokenizer)
+        self.image_processor = image_processor
+        self.tokenizer = tokenizer
+        self.max_tokens_to_generate = 10
+        self.max_position_embeddings = 16384  # TODO Can't derive this from model files: where to set it?
+        self.pad_token_id = 0
+        self.dummy_image_index = -1
+        self.image_token_id = tokenizer.encode("|SPEAKER|", add_special_tokens=False)[1]
+        self.image_newline_id = tokenizer.encode("|NEWLINE|", add_special_tokens=False)[1]
+
+    def _left_pad_inputs_with_attention_mask(self, model_inputs: list[dict], return_attention_mask: bool):
+        max_length_input_ids = max(entry["input_ids"].shape[1] for entry in model_inputs)
+        max_length_image_patch_indices = max(entry["image_patches_indices"].shape[1] for entry in model_inputs)
+
+        batched_inputs = {"input_ids": [], "image_patches": [], "image_patches_indices": [], "attention_mask": []}
+
+        for entry in model_inputs:
+            for key, tensor in entry.items():
+                if key == "input_ids":
+                    num_padding_tokens = max_length_input_ids - tensor.shape[1]
+                    padded_input_ids = torch.cat(
+                        [
+                            torch.full((tensor.shape[0], num_padding_tokens), self.pad_token_id, dtype=torch.long),
+                            tensor,
+                        ],
+                        dim=1,
+                    )
+                    batched_inputs[key].append(padded_input_ids)
+
+                    attention_mask = torch.cat(
+                        [torch.zeros(tensor.shape[0], num_padding_tokens, dtype=torch.long), torch.ones_like(tensor)],
+                        dim=1,
+                    )
+                    batched_inputs["attention_mask"].append(attention_mask)
+
+                elif key == "image_patches":
+                    # For image_patches, we don't pad but just append them to the list.
+                    batched_inputs[key].append(tensor)
+
+                else:  # for image_patches_indices
+                    num_padding_indices = max_length_image_patch_indices - tensor.shape[1]
+                    padded_indices = torch.cat(
+                        [
+                            torch.full(
+                                (tensor.shape[0], num_padding_indices), self.dummy_image_index, dtype=torch.long
+                            ),
+                            tensor,
+                        ],
+                        dim=1,
+                    )
+                    batched_inputs[key].append(padded_indices)
+        batched_keys = ["input_ids", "image_patches_indices"]
+        if return_attention_mask:
+            batched_keys.append("attention_mask")
+        for key in batched_keys:
+            batched_inputs[key] = torch.cat(batched_inputs[key], dim=0)
+
+        # Cast images to tensor as well, if only one image passed and no padding needed
+        # NOTE: vLLM expects all processor outputs to be a tensor
+        if len(batched_inputs["image_patches"]) == 1:
+            batched_inputs["image_patches"] = torch.cat(batched_inputs["image_patches"], dim=0)
+
+        return batched_inputs
+
+    def get_sample_encoding(
+        self,
+        prompts,
+        scale_factors,
+        image_unpadded_heights,
+        image_unpadded_widths,
+        image_placeholder_id,
+        image_newline_id,
+        tensor_batch_images,
+    ):
+        """
+        Build the model inputs for a single sample (one image with its prompt).
+
+        Relies on `self.subsequence_length`, which `__call__` sets before invoking this method.
+
+        Args:
+            prompts: Prompts forwarded to `_tokenize_prompts_with_image_and_batch`.
+            scale_factors: Scale factors forwarded to `_tokenize_prompts_with_image_and_batch`.
+            image_unpadded_heights: Forwarded to the image processor as `image_unpadded_h`.
+            image_unpadded_widths: Forwarded to the image processor as `image_unpadded_w`.
+            image_placeholder_id: Token id the image processor uses for image placeholder positions.
+            image_newline_id: Token id the image processor uses for image newlines.
+            tensor_batch_images: Image tensor forwarded to the image processor as `image_input`.
+
+        Returns:
+            `dict` with the keys `input_ids`, `image_patches` and `image_patches_indices`.
+        """
+        # Exactly one image is flagged as present for this sample.
+        image_present = torch.ones(1, 1, 1)
+        model_image_input = self.image_processor.preprocess_with_tokenizer_info(
+            image_input=tensor_batch_images,
+            image_present=image_present,
+            image_unpadded_h=image_unpadded_heights,
+            image_unpadded_w=image_unpadded_widths,
+            image_placeholder_id=image_placeholder_id,
+            image_newline_id=image_newline_id,
+            variable_sized=True,
+        )
+        # FIXME max_tokens_to_generate is embedded into this processor's call.
+        prompt_tokens, prompts_length = _tokenize_prompts_with_image_and_batch(
+            tokenizer=self.tokenizer,
+            prompts=prompts,
+            scale_factors=scale_factors,
+            max_tokens_to_generate=self.max_tokens_to_generate,
+            max_position_embeddings=self.max_position_embeddings,
+            add_BOS=True,
+            add_beginning_of_answer_token=True,
+        )
+        # Combine the image token ids with the text token ids (presumably image first - TODO confirm
+        # against `construct_full_unpacked_stream`).
+        image_padded_unpacked_tokens = construct_full_unpacked_stream(
+            num_real_text_tokens=prompts_length,
+            input_stream=prompt_tokens,
+            image_tokens=model_image_input["image_input_ids"],
+            batch_size=1,
+            num_sub_sequences=self.subsequence_length,
+        )
+        # Construct inputs for image patch indices.
+        # Text positions are filled with -1 so that only image positions carry a patch index.
+        unpacked_image_patch_indices_per_batch = construct_full_unpacked_stream(
+            num_real_text_tokens=prompts_length,
+            input_stream=torch.full_like(prompt_tokens, -1),
+            image_tokens=model_image_input["image_patch_indices_per_batch"],
+            batch_size=1,
+            num_sub_sequences=self.subsequence_length,
+        )
+        max_prompt_length = max(x.shape[-1] for x in image_padded_unpacked_tokens)
+        # Cap the target length at the configured context size.
+        max_seq_len_batch = min(max_prompt_length + self.max_tokens_to_generate, self.max_position_embeddings)
+        tokens_to_place = min(max_seq_len_batch, max(0, image_padded_unpacked_tokens[0].shape[0]))
+
+        # Use same packing logic for the image patch indices.
+        image_patch_input_indices = full_unpacked_stream_to_tensor(
+            all_bi_tokens_to_place=[tokens_to_place],
+            full_unpacked_stream=unpacked_image_patch_indices_per_batch,
+            fill_value=-1,
+            batch_size=1,
+            new_seq_len=max_seq_len_batch,
+            offset=0,
+        )
+        # Keep the first entry of every item in `image_patches` and stack them into one tensor.
+        image_patches_tensor = torch.stack([img[0] for img in model_image_input["image_patches"]])
+        batch_encoding = {
+            "input_ids": image_padded_unpacked_tokens[0].unsqueeze(0),
+            "image_patches": image_patches_tensor,
+            "image_patches_indices": image_patch_input_indices,
+        }
+        return batch_encoding
+
+    def __call__(
+        self,
+        images: ImageInput = None,
+        text: Optional[Union[str, list[str], TextInput, PreTokenizedInput]] = None,
+        audio=None,
+        videos=None,
+        **kwargs: Unpack[FuyuProcessorKwargs],
+    ) -> "FuyuBatchFeature":
+        """
+        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
+        and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to
+        encode the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
+        FuyuImageProcessor's [`~FuyuImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
+        of the above two methods for more information.
+
+        Args:
+            images (`PIL.Image.Image`, `list[PIL.Image.Image]`):
+                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
+                tensor. Both channels-first and channels-last formats are supported.
+            text (`str`, `list[str]`):
+                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
+                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
+                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
+
+        Returns:
+            [`FuyuBatchEncoding`]: A [`FuyuBatchEncoding`] with the following fields:
+
+            - **input_ids** -- Tensor of token ids to be fed to a model. Returned when `text` is not `None`.
+            - **image_patches** -- List of Tensor of image patches. Returned when `images` is not `None`.
+            - **image_patches_indices** -- Tensor of indices where patch embeddings have to be inserted by the model.
+            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model when
+              `return_attention_mask=True`.
+        """
+        requires_backends(self, ["torch"])
+
+        # --- Check input validity ---
+        if text is None and images is None:
+            raise ValueError("You have to specify either text or images. Both cannot be None.")
+
+        output_kwargs = self._merge_kwargs(
+            FuyuProcessorKwargs,
+            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
+            **kwargs,
+        )
+        return_mm_token_type_ids = output_kwargs["text_kwargs"].pop("return_mm_token_type_ids", False)
+
+        if not output_kwargs["text_kwargs"].setdefault("return_attention_mask", True):
+            raise ValueError("`return_attention_mask=False` is not supported for this model.")
+
+        if text is not None and images is None:
+            logger.warning("You are processing a text with no associated image. Make sure it is intended.")
+            self.current_processor = self.tokenizer
+            text_encoding = self.tokenizer(text, **output_kwargs["text_kwargs"])
+            return text_encoding
+
+        if text is None and images is not None:
+            logger.warning("You are processing an image with no associated text. Make sure it is intended.")
+            prompts = [[""]]
+        if text is not None and images is not None:
+            if isinstance(text, str):
+                prompts = [[text]]
+            elif isinstance(text, list):
+                prompts = [[text_seq] for text_seq in text]
+
+        # --- Preprocess images using self.image_processor ---
+
+        # FIXME - We hard code "pt" here because the rest of the processing assumes torch tensors
+        output_kwargs["images_kwargs"]["return_tensors"] = "pt"
+        image_encoding = self.image_processor.preprocess(images, **output_kwargs["images_kwargs"])
+        batch_images = image_encoding["images"]
+        image_unpadded_heights = image_encoding["image_unpadded_heights"]
+        image_unpadded_widths = image_encoding["image_unpadded_widths"]
+        scale_factors = image_encoding["image_scale_factors"]
+        self.subsequence_length = 1  # Each batch contains only one sequence.
+        self.batch_size = len(batch_images)
+
+        # --- Use self.tokenizer to get the ids of special tokens to insert into image ids ---
+
+        tensor_batch_images = torch.stack([img[0] for img in batch_images]).unsqueeze(1)
+
+        # --- Use self.image_processor again to obtain the full token ids and batch inputs ---
+        all_encodings = []
+
+        for prompt, scale_factor, image_unpadded_height, image_unpadded_width, tensor_batch_image in zip(
+            prompts, scale_factors, image_unpadded_heights, image_unpadded_widths, tensor_batch_images
+        ):
+            sample_encoding = self.get_sample_encoding(
+                prompts=[prompt],
+                scale_factors=[scale_factor],
+                image_unpadded_heights=torch.tensor([image_unpadded_height]),
+                image_unpadded_widths=torch.tensor([image_unpadded_width]),
+                image_placeholder_id=self.image_token_id,
+                image_newline_id=self.image_newline_id,
+                tensor_batch_images=tensor_batch_image.unsqueeze(0),
+            )
+            all_encodings.append(sample_encoding)
+
+        batch_encoding = self._left_pad_inputs_with_attention_mask(
+            model_inputs=all_encodings, return_attention_mask=True
+        )
+        if return_mm_token_type_ids:
+            input_ids = batch_encoding["input_ids"]
+            mm_token_type_ids = torch.zeros_like(input_ids)
+            mm_token_type_ids[input_ids == self.image_token_id] = 1
+            mm_token_type_ids[input_ids == self.image_newline_id] = 1
+            batch_encoding["mm_token_type_ids"] = mm_token_type_ids
+
+        return FuyuBatchFeature(data=batch_encoding)
+
+    def _get_num_multimodal_tokens(self, image_sizes=None, **kwargs):
+        """
+        Computes the number of placeholder tokens needed for multimodal inputs with the given sizes.
+
+        Args:
+            image_sizes (`list[list[int]]`, *optional*):
+                The input sizes formatted as (height, width) per each image.
+
+        Returns:
+            `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided
+            input modalities, along with other useful data.
+        """
+
+        vision_data = {}
+        if image_sizes is not None:
+            size = kwargs.get("size") or self.image_processor.size
+            padded_height, padded_width = size["height"], size["width"]
+
+            num_image_tokens = []
+            num_image_patches = [1] * len(image_sizes)
+            for image_size in image_sizes:
+                height_scale_factor = padded_height / image_size[0]
+                width_scale_factor = padded_width / image_size[1]
+                optimal_scale_factor = min(height_scale_factor, width_scale_factor)
+
+                image_unpadded_h = min(int(image_size[0] * optimal_scale_factor), image_size[0])
+                image_unpadded_w = min(int(image_size[0] * optimal_scale_factor), image_size[0])
+
+                # We can use torch here because Fuyu processor has hard dependency on torch. NOTE: Fuyu can't do multi-image
+                # thus the below (1, 1, 1) is hardcoded. Same as when calling the processor
+                model_image_input = self.image_processor.preprocess_with_tokenizer_info(
+                    image_input=torch.zeros(1, 1, 3, padded_height, padded_width),
+                    image_present=torch.ones(1, 1, 1),
+                    image_unpadded_h=torch.tensor([[image_unpadded_h]]),
+                    image_unpadded_w=torch.tensor([[image_unpadded_w]]),
+                    image_placeholder_id=0,  # dummy ids, we can be sure `id=0` is never out-of-range
+                    image_newline_id=0,
+                    variable_sized=True,
+                )
+                num_image_tokens.append(model_image_input["image_input_ids"][0][0].shape[-1])
+            vision_data.update({"num_image_tokens": num_image_tokens, "num_image_patches": num_image_patches})
+        return MultiModalData(**vision_data)
+
+    def post_process_box_coordinates(self, outputs, target_sizes=None):
+        """
+        Transforms raw coordinates detected by [`FuyuForCausalLM`] to the original images' coordinate space.
+        Coordinates will be returned in "box" format, with the following pattern:
+            `<box>top, left, bottom, right</box>`
+
+        Point coordinates are not supported yet.
+
+        Args:
+            outputs ([`GenerateOutput`]):
+                Raw outputs from `generate`.
+            target_sizes (`torch.Tensor`, *optional*):
+                Tensor of shape (batch_size, 2) where each entry is the (height, width) of the corresponding image in
+                the batch. If set, found coordinates in the output sequence are rescaled to the target sizes. If left
+                to None, coordinates will not be rescaled.
+
+        Returns:
+            `GenerateOutput`: Same output type returned by `generate`, with output token ids replaced with
+                boxed and possible rescaled coordinates.
+        """
+
+        def scale_factor_to_fit(original_size, target_size=None):
+            height, width = original_size
+            if target_size is None:
+                max_height = self.image_processor.size["height"]
+                max_width = self.image_processor.size["width"]
+            else:
+                max_height, max_width = target_size
+            if width <= max_width and height <= max_height:
+                return 1.0
+            return min(max_height / height, max_width / width)
+
+        def find_delimiters_pair(tokens, start_token, end_token):
+            start_id = self.tokenizer.convert_tokens_to_ids(start_token)
+            end_id = self.tokenizer.convert_tokens_to_ids(end_token)
+
+            starting_positions = (tokens == start_id).nonzero(as_tuple=True)[0]
+            ending_positions = (tokens == end_id).nonzero(as_tuple=True)[0]
+
+            if torch.any(starting_positions) and torch.any(ending_positions):
+                return (starting_positions[0], ending_positions[0])
+            return (None, None)
+
+        def tokens_to_boxes(tokens, original_size):
+            while (pair := find_delimiters_pair(tokens, TOKEN_BBOX_OPEN_STRING, TOKEN_BBOX_CLOSE_STRING)) != (
+                None,
+                None,
+            ):
+                start, end = pair
+                if end != start + 5:
+                    continue
+
+                # Retrieve transformed coordinates from tokens
+                coords = self.tokenizer.convert_ids_to_tokens(tokens[start + 1 : end])
+
+                # Scale back to original image size and multiply by 2
+                scale = scale_factor_to_fit(original_size)
+                top, left, bottom, right = [2 * int(float(c) / scale) for c in coords]
+
+                # Replace the IDs so they get detokenized right
+                replacement = f" {TEXT_REPR_BBOX_OPEN}{top}, {left}, {bottom}, {right}{TEXT_REPR_BBOX_CLOSE}"
+                replacement = self.tokenizer.tokenize(replacement)[1:]
+                replacement = self.tokenizer.convert_tokens_to_ids(replacement)
+                replacement = torch.tensor(replacement).to(tokens)
+
+                tokens = torch.cat([tokens[:start], replacement, tokens[end + 1 :]], 0)
+            return tokens
+
+        def tokens_to_points(tokens, original_size):
+            while (pair := find_delimiters_pair(tokens, TOKEN_POINT_OPEN_STRING, TOKEN_POINT_CLOSE_STRING)) != (
+                None,
+                None,
+            ):
+                start, end = pair
+                if end != start + 3:
+                    continue
+
+                # Retrieve transformed coordinates from tokens
+                coords = self.tokenizer.convert_ids_to_tokens(tokens[start + 1 : end])
+
+                # Scale back to original image size and multiply by 2
+                scale = scale_factor_to_fit(original_size)
+                x, y = [2 * int(float(c) / scale) for c in coords]
+
+                # Replace the IDs so they get detokenized right
+                replacement = f" {TEXT_REPR_POINT_OPEN}{x}, {y}{TEXT_REPR_POINT_CLOSE}"
+                replacement = self.tokenizer.tokenize(replacement)[1:]
+                replacement = self.tokenizer.convert_tokens_to_ids(replacement)
+                replacement = torch.tensor(replacement).to(tokens)
+
+                tokens = torch.cat([tokens[:start], replacement, tokens[end + 1 :]], 0)
+            return tokens
+
+        if target_sizes is None:
+            target_sizes = ((self.image_processor.size["height"], self.image_processor.size["width"]),) * len(outputs)
+        elif target_sizes.shape[1] != 2:
+            raise ValueError("Each element of target_sizes must contain the size (h, w) of each image of the batch")
+
+        if len(outputs) != len(target_sizes):
+            raise ValueError("Make sure that you pass in as many target sizes as output sequences")
+
+        results = []
+        for seq, size in zip(outputs, target_sizes):
+            seq = tokens_to_boxes(seq, size)
+            seq = tokens_to_points(seq, size)
+            results.append(seq)
+
+        return results
+
+    def post_process_image_text_to_text(self, generated_outputs, skip_special_tokens=True, **kwargs):
+        """
+        Post-processes the output of `FuyuForConditionalGeneration` to only return the text output.
+
+        Args:
+            generated_outputs (`torch.Tensor` or `np.ndarray`):
+                The output of the model. The output is expected to be a tensor of shape `(batch_size, sequence_length)`
+                containing the token ids of the generated sequences.
+            skip_special_tokens (`bool`, *optional*, defaults to `True`):
+                Whether or not to remove special tokens in the output. Argument passed to the tokenizer's `batch_decode` method.
+            **kwargs:
+                Additional arguments to be passed to the tokenizer's `batch_decode method`.
+
+        Returns:
+            `list[str]`: The decoded text output.
+        """
+        beginning_of_answer = self.tokenizer.convert_tokens_to_ids(BEGINNING_OF_ANSWER_STRING)
+        # get boa index for each outputted sequence tensor
+        # start all generated sequences from the beginning of the answer token, pad to have consistent length
+        unpadded_output_sequences = [
+            seq[(seq == beginning_of_answer).nonzero(as_tuple=True)[0] + 1 :] for seq in generated_outputs
+        ]
+        max_len = max(len(seq) for seq in unpadded_output_sequences)
+        # convert to torch and pad sequences
+        padded_output_sequences = torch.full((len(unpadded_output_sequences), max_len), self.pad_token_id)
+        for i, seq in enumerate(unpadded_output_sequences):
+            padded_output_sequences[i, : len(seq)] = torch.tensor(seq)
+
+        return self.batch_decode(padded_output_sequences, skip_special_tokens=skip_special_tokens, **kwargs)
+
+    @property
+    def model_input_names(self):
+        tokenizer_input_names = self.tokenizer.model_input_names
+        image_processor_input_names = self.image_processor.model_input_names
+
+        # Make a copy of list when removing otherwise `self.image_processor.model_input_names` is also modified
+        extra_image_inputs = [
+            "image_input_ids",
+            "image_patch_indices_per_subsequence",
+            "images",
+            "image_patch_indices_per_batch",
+        ]
+        image_processor_input_names = [name for name in image_processor_input_names if name not in extra_image_inputs]
+        return list(tokenizer_input_names + image_processor_input_names + ["image_patches_indices"])
+
+
+__all__ = ["FuyuProcessor"]
diff --git a/phivenv/Lib/site-packages/transformers/models/gemma/__init__.py b/phivenv/Lib/site-packages/transformers/models/gemma/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..65fb1ca5edef4398753377796fc2609d0f17e1e8
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/gemma/__init__.py
@@ -0,0 +1,30 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:  # static type checkers get the real symbols through the star imports below
+    from .configuration_gemma import *
+    from .modeling_flax_gemma import *
+    from .modeling_gemma import *
+    from .tokenization_gemma import *
+    from .tokenization_gemma_fast import *
+else:  # at runtime the module object is swapped for a `_LazyModule` instead
+    import sys
+
+    _file = globals()["__file__"]  # path of this file, used to derive the import structure
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/phivenv/Lib/site-packages/transformers/models/gemma/__pycache__/__init__.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/gemma/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..49c01405636be23b4b8418c6e4914fe77b6c3c4c
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/gemma/__pycache__/__init__.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/gemma/__pycache__/configuration_gemma.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/gemma/__pycache__/configuration_gemma.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c49805ac823227cf7ce22e6a4f6962d6e5800b38
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/gemma/__pycache__/configuration_gemma.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/gemma/__pycache__/modeling_flax_gemma.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/gemma/__pycache__/modeling_flax_gemma.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..96fe61d8ca017ff614c07ad45003f602caa5caf3
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/gemma/__pycache__/modeling_flax_gemma.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/gemma/__pycache__/modeling_gemma.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/gemma/__pycache__/modeling_gemma.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3fb5a7cfce5c604a95ec9946a59ced4b13f565fe
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/gemma/__pycache__/modeling_gemma.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/gemma/__pycache__/modular_gemma.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/gemma/__pycache__/modular_gemma.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3aa174c5cb9c62677aa225231b4d49b1b470a0da
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/gemma/__pycache__/modular_gemma.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/gemma/__pycache__/tokenization_gemma.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/gemma/__pycache__/tokenization_gemma.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fc06bd03253a8c16049b5b8237f923f7be5b202b
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/gemma/__pycache__/tokenization_gemma.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/gemma/__pycache__/tokenization_gemma_fast.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/gemma/__pycache__/tokenization_gemma_fast.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a3270df320b749c2b87039e6f0c0cdc954b0b56e
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/gemma/__pycache__/tokenization_gemma_fast.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/gemma/configuration_gemma.py b/phivenv/Lib/site-packages/transformers/models/gemma/configuration_gemma.py
new file mode 100644
index 0000000000000000000000000000000000000000..363af5c3ffc4ccfe856763d332fe6fbce5886bae
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/gemma/configuration_gemma.py
@@ -0,0 +1,160 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/gemma/modular_gemma.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_gemma.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# coding=utf-8
+# Copyright 2024 Google Inc. HuggingFace Inc. team. All rights reserved.
+#
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from ...configuration_utils import PretrainedConfig
+
+
+class GemmaConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`GemmaModel`]. It is used to instantiate a Gemma
+    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+    defaults will yield a similar configuration to that of the Gemma-7B.
+    e.g. [google/gemma-7b](https://huggingface.co/google/gemma-7b)
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+    Args:
+        vocab_size (`int`, *optional*, defaults to 256000):
+            Vocabulary size of the Gemma model. Defines the number of different tokens that can be represented by the
+            `input_ids` passed when calling [`GemmaModel`]
+        hidden_size (`int`, *optional*, defaults to 3072):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 24576):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 28):
+            Number of hidden layers in the Transformer decoder.
+        num_attention_heads (`int`, *optional*, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer decoder.
+        num_key_value_heads (`int`, *optional*, defaults to 16):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details, check out [this
+            paper](https://huggingface.co/papers/2305.13245). Note that the constructor default is the literal 16; it
+            is not derived from `num_attention_heads`.
+        head_dim (`int`, *optional*, defaults to 256):
+            The attention head dimension.
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
+            The legacy activation function. It is overwritten by the `hidden_activation`.
+        hidden_activation (`str` or `function`, *optional*):
+            The non-linear activation function (function or string) in the decoder. Will default to `"gelu_pytorch_tanh"`
+            if not specified. `"gelu_pytorch_tanh"` uses an approximation of the `"gelu"` activation function.
+        max_position_embeddings (`int`, *optional*, defaults to 8192):
+            The maximum sequence length that this model might ever be used with.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        pad_token_id (`int`, *optional*, defaults to 0):
+            Padding token id.
+        eos_token_id (`int`, *optional*, defaults to 1):
+            End of stream token id.
+        bos_token_id (`int`, *optional*, defaults to 2):
+            Beginning of stream token id.
+        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
+            Whether to tie weight embeddings
+        rope_theta (`float`, *optional*, defaults to 10000.0):
+            The base period of the RoPE embeddings.
+        attention_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use a bias in the query, key, value and output projection layers during self-attention.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+    ```python
+    >>> from transformers import GemmaModel, GemmaConfig
+    >>> # Initializing a Gemma gemma-7b style configuration
+    >>> configuration = GemmaConfig()
+    >>> # Initializing a model from the gemma-7b style configuration
+    >>> model = GemmaModel(configuration)
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "gemma"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    base_model_tp_plan = {  # parameter-name pattern -> "colwise" / "rowwise"; consumed outside this file
+        "layers.*.self_attn.q_proj": "colwise",
+        "layers.*.self_attn.k_proj": "colwise",
+        "layers.*.self_attn.v_proj": "colwise",
+        "layers.*.self_attn.o_proj": "rowwise",
+        "layers.*.mlp.gate_proj": "colwise",
+        "layers.*.mlp.up_proj": "colwise",
+        "layers.*.mlp.down_proj": "rowwise",
+    }
+    base_model_pp_plan = {  # submodule name -> (input names, output names); consumed outside this file
+        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
+        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
+        "norm": (["hidden_states"], ["hidden_states"]),
+    }
+
+    def __init__(
+        self,
+        vocab_size=256000,
+        hidden_size=3072,
+        intermediate_size=24576,
+        num_hidden_layers=28,
+        num_attention_heads=16,
+        num_key_value_heads=16,
+        head_dim=256,
+        hidden_act="gelu_pytorch_tanh",
+        hidden_activation=None,
+        max_position_embeddings=8192,
+        initializer_range=0.02,
+        rms_norm_eps=1e-6,
+        use_cache=True,
+        pad_token_id=0,
+        eos_token_id=1,
+        bos_token_id=2,
+        tie_word_embeddings=True,
+        rope_theta=10000.0,
+        attention_bias=False,
+        attention_dropout=0.0,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.head_dim = head_dim  # stored as given; not computed from hidden_size / num_attention_heads
+        self.num_key_value_heads = num_key_value_heads  # stored as given; no fallback to num_attention_heads
+        self.hidden_act = hidden_act  # legacy field, kept alongside `hidden_activation`
+        self.hidden_activation = hidden_activation  # stays None when not passed; no default is resolved here
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+
+        super().__init__(  # special-token ids, weight tying and any remaining kwargs are handled by the base class
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+
+__all__ = ["GemmaConfig"]  # the only name exported by a star-import of this module
diff --git a/phivenv/Lib/site-packages/transformers/models/gemma/modeling_flax_gemma.py b/phivenv/Lib/site-packages/transformers/models/gemma/modeling_flax_gemma.py
new file mode 100644
index 0000000000000000000000000000000000000000..0addcd7dde7a01f3bd11316596e41e903db59438
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/gemma/modeling_flax_gemma.py
@@ -0,0 +1,777 @@
+# coding=utf-8
+# Copyright 2024 Google Inc., and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Flax Gemma model."""
+
+from typing import Optional
+
+import flax.linen as nn
+import jax
+import jax.numpy as jnp
+import numpy as np
+from flax.core.frozen_dict import FrozenDict, freeze, unfreeze
+from flax.linen import combine_masks, make_causal_mask
+from flax.linen.attention import dot_product_attention_weights
+from flax.traverse_util import flatten_dict, unflatten_dict
+from jax import lax
+
+from ...modeling_flax_outputs import FlaxBaseModelOutput, FlaxCausalLMOutput
+from ...modeling_flax_utils import ACT2FN, FlaxPreTrainedModel, append_call_sample_docstring
+from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging
+from .configuration_gemma import GemmaConfig
+
+
+logger = logging.get_logger(__name__)
+# Doc-related constants; presumably consumed by the docstring helpers imported above (usage not visible here).
+_CONFIG_FOR_DOC = "GemmaConfig"
+_CHECKPOINT_FOR_DOC = "google/gemma-2b"
+_REAL_CHECKPOINT_FOR_DOC = "openlm-research/open_llama_3b_v2"  # NOTE(review): an OpenLLaMA checkpoint, not a Gemma one — confirm intended
+
+GEMMA_START_DOCSTRING = r"""
+
+    This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.)
+
+    This model is also a Flax Linen
+    [flax.nn.Module](https://flax.readthedocs.io/en/latest/_autosummary/flax.nn.module.html) subclass. Use it as a
+    regular Flax Module and refer to the Flax documentation for all matters related to general usage and behavior.
+
+    Finally, this model supports inherent JAX features such as:
+
+    - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
+    - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
+    - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
+    - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)
+
+    Parameters:
+        config ([`GemmaConfig`]): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the model weights.
+        dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`):
+            The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16`, or
+            `jax.numpy.bfloat16`.
+
+            This can be used to enable mixed-precision training or half-precision inference on GPUs or TPUs. If
+            specified all the computation will be performed with the given `dtype`.
+
+            **Note that this only specifies the dtype of the computation and does not influence the dtype of model
+            parameters.**
+
+            If you wish to change the dtype of the model parameters, see [`~FlaxPreTrainedModel.to_fp16`] and
+            [`~FlaxPreTrainedModel.to_bf16`].
+"""
+
+GEMMA_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`numpy.ndarray` of shape `(batch_size, input_ids_length)`):
+            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
+            it.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        attention_mask (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
+            `past_key_values`).
+
+            If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
+            and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more
+            information on the default strategy.
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+        position_ids (`numpy.ndarray` of shape `(batch_size, input_ids_length)`, *optional*):
+            Indices of positions of each input sequence token in the position embeddings. Selected in the range `[0,
+            config.max_position_embeddings - 1]`.
+
+            [What are position IDs?](../glossary#position-ids)
+        past_key_values (`dict[str, np.ndarray]`, *optional*, returned by `init_cache` or when passing previous `past_key_values`):
+            Dictionary of pre-computed hidden-states (key and values in the attention blocks) that can be used for fast
+            auto-regressive decoding. Pre-computed key and value hidden-states are of shape *[batch_size, max_length]*.
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+def create_sinusoidal_positions(num_pos, dim):
+    """Build the rotary lookup table: for each of `num_pos` positions, all sin values followed by all cos values."""
+    inv_freq = 1.0 / (10000 ** (np.arange(0, dim, 2)[: (dim // 2)] / dim))
+    freqs = np.einsum("i , j -> i j", np.arange(num_pos), inv_freq).astype("float32")
+    emb = np.concatenate((freqs, freqs), axis=-1)
+    # Return the whole last axis: it holds features, so cutting it at `num_pos` corrupted the table for short tables.
+    return jnp.array(np.concatenate((np.sin(emb)[:, None, :], np.cos(emb)[:, None, :]), axis=-1))
+
+
+# Copied from transformers.models.llama.modeling_flax_llama.rotate_half
+def rotate_half(tensor):
+    """Swap the two halves of the last axis, negating the half that moves to the front."""
+    midpoint = tensor.shape[-1] // 2
+    front_half = tensor[..., :midpoint]
+    back_half = tensor[..., midpoint:]
+    return jnp.concatenate((-back_half, front_half), axis=-1)
+
+
+# Copied from transformers.models.llama.modeling_flax_llama.apply_rotary_pos_emb
+def apply_rotary_pos_emb(tensor, sin_pos, cos_pos):  # rotary update: tensor * cos + rotate_half(tensor) * sin
+    return (rotate_half(tensor) * sin_pos) + (tensor * cos_pos)
+
+
+class FlaxGemmaRMSNorm(nn.Module):
+    """Root-mean-square normalisation over the last axis, scaled by `1 + weight` (weight starts at ones)."""
+
+    config: GemmaConfig
+    dtype: jnp.dtype = jnp.float32
+
+    def setup(self):
+        self.epsilon = self.config.rms_norm_eps
+        self.weight = self.param("weight", lambda _, shape: jnp.ones(shape), self.config.hidden_size)
+
+    def __call__(self, hidden_states):
+        # The mean of squares is computed on a float32 copy, whatever the input dtype is.
+        mean_square = jnp.power(jnp.asarray(hidden_states, dtype=jnp.float32), 2).mean(-1, keepdims=True)
+        # use `jax.numpy.sqrt` as `jax.lax.rsqrt` does not match `torch.rsqrt`
+        normalized = hidden_states / jnp.sqrt(mean_square + self.epsilon)
+        return (1 + self.weight) * jnp.asarray(normalized, dtype=self.dtype)
+
+
+# Copied from transformers.models.llama.modeling_flax_llama.FlaxLlamaRotaryEmbedding with Llama->Gemma
+class FlaxGemmaRotaryEmbedding(nn.Module):
+    """Applies rotary position embeddings to key and query using a sin/cos table built in `setup`."""
+
+    config: GemmaConfig
+    dtype: jnp.dtype = jnp.float32
+
+    # Ignore copy
+    def setup(self):
+        # One table row per position up to `max_position_embeddings`, sized from `head_dim`.
+        self.sincos = create_sinusoidal_positions(self.config.max_position_embeddings, self.config.head_dim)
+
+    def __call__(self, key, query, position_ids):
+        # Gather the rows for the requested positions, then split them in two along the last axis (sin, cos).
+        sin_pos, cos_pos = jnp.split(self.sincos[position_ids], 2, axis=-1)
+
+        rotated_key = apply_rotary_pos_emb(key, sin_pos, cos_pos)
+        rotated_query = apply_rotary_pos_emb(query, sin_pos, cos_pos)
+
+        # Both results are cast to the module dtype; note the (key, query) return order.
+        return jnp.asarray(rotated_key, dtype=self.dtype), jnp.asarray(rotated_query, dtype=self.dtype)
+
+
+class FlaxGemmaAttention(nn.Module):  # self-attention with rotary embeddings, repeated key/value heads and an optional cache
+    config: GemmaConfig
+    dtype: jnp.dtype = jnp.float32
+    causal: bool = True  # NOTE(review): not read anywhere in this class; the causal mask is always combined in
+    is_cross_attention: bool = False  # NOTE(review): not read anywhere in this class
+
+    def setup(self):
+        config = self.config
+        self.embed_dim = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = config.head_dim
+        self.attention_softmax_in_fp32 = self.dtype is not jnp.float32  # identity test against jnp.float32 itself
+
+        self.num_key_value_heads = config.num_key_value_heads
+        self.num_key_value_groups = self.num_heads // self.num_key_value_heads  # repeat factor applied to key/value heads
+
+        kernel = jax.nn.initializers.normal(self.config.initializer_range)
+        self.q_proj = nn.Dense(
+            self.num_heads * self.head_dim, use_bias=config.attention_bias, dtype=self.dtype, kernel_init=kernel
+        )
+        self.k_proj = nn.Dense(
+            self.num_key_value_heads * self.head_dim,
+            use_bias=config.attention_bias,
+            dtype=self.dtype,
+            kernel_init=kernel,
+        )
+        self.v_proj = nn.Dense(
+            self.num_key_value_heads * self.head_dim,
+            use_bias=config.attention_bias,
+            dtype=self.dtype,
+            kernel_init=kernel,
+        )
+        self.o_proj = nn.Dense(self.embed_dim, use_bias=config.attention_bias, dtype=self.dtype, kernel_init=kernel)
+        # Boolean causal mask built once for `max_position_embeddings` positions and sliced per call.
+        self.causal_mask = make_causal_mask(jnp.ones((1, config.max_position_embeddings), dtype="bool"), dtype="bool")
+        self.rotary_emb = FlaxGemmaRotaryEmbedding(config, dtype=self.dtype)
+
+    def _split_heads(self, hidden_states, num_heads):  # keeps the first two axes, reshapes the rest to (num_heads, head_dim)
+        return hidden_states.reshape(hidden_states.shape[:2] + (num_heads, self.head_dim))
+
+    def _merge_heads(self, hidden_states):  # keeps the first two axes, flattens the rest to num_heads * head_dim
+        return hidden_states.reshape(hidden_states.shape[:2] + (self.num_heads * self.head_dim,))
+
+    @nn.compact
+    # Copied from transformers.models.gpt_neo.modeling_flax_gpt_neo.FlaxGPTNeoSelfAttention._concatenate_to_cache
+    def _concatenate_to_cache(self, key, value, query, attention_mask):
+        """
+        This function takes projected key, value states from a single input token and concatenates the states to cached
+        states from previous steps. This function is slightly adapted from the official Flax repository:
+        https://github.com/google/flax/blob/491ce18759622506588784b4fca0e4bf05f8c8cd/flax/linen/attention.py#L252
+        """
+        # detect if we're initializing by absence of existing cache data.
+        is_initialized = self.has_variable("cache", "cached_key")
+        cached_key = self.variable("cache", "cached_key", jnp.zeros, key.shape, key.dtype)
+        cached_value = self.variable("cache", "cached_value", jnp.zeros, value.shape, value.dtype)
+        cache_index = self.variable("cache", "cache_index", lambda: jnp.array(0, dtype=jnp.int32))
+
+        if is_initialized:  # on the initialising pass the inputs are returned unchanged
+            *batch_dims, max_length, num_heads, depth_per_head = cached_key.value.shape
+            # update key, value caches with our new 1d spatial slices
+            cur_index = cache_index.value
+            indices = (0,) * len(batch_dims) + (cur_index, 0, 0)
+            key = lax.dynamic_update_slice(cached_key.value, key, indices)
+            value = lax.dynamic_update_slice(cached_value.value, value, indices)
+            cached_key.value = key
+            cached_value.value = value
+            num_updated_cache_vectors = query.shape[1]
+            cache_index.value = cache_index.value + num_updated_cache_vectors
+            # causal mask for cached decoder self-attention: our single query position should only attend to those key positions that have already been generated and cached, not the remaining zero elements.
+            pad_mask = jnp.broadcast_to(
+                jnp.arange(max_length) < cur_index + num_updated_cache_vectors,
+                tuple(batch_dims) + (1, num_updated_cache_vectors, max_length),
+            )
+            attention_mask = combine_masks(pad_mask, attention_mask)
+        return key, value, attention_mask
+
+    def __call__(
+        self,
+        hidden_states,
+        attention_mask,
+        position_ids,
+        deterministic: bool = True,
+        init_cache: bool = False,
+        output_attentions: bool = False,
+    ):
+        query = self.q_proj(hidden_states)
+        key = self.k_proj(hidden_states)
+        value = self.v_proj(hidden_states)
+
+        query = self._split_heads(query, self.num_heads)
+        key = self._split_heads(key, self.num_key_value_heads)
+        value = self._split_heads(value, self.num_key_value_heads)
+
+        key, query = self.rotary_emb(key, query, position_ids)  # the rotary module returns (key, query), in that order
+
+        query_length, key_length = query.shape[1], key.shape[1]
+        # With an existing cache, slice the causal mask from the current cache index across the full cached length.
+        if self.has_variable("cache", "cached_key"):
+            mask_shift = self.variables["cache"]["cache_index"]
+            max_decoder_length = self.variables["cache"]["cached_key"].shape[1]
+            causal_mask = lax.dynamic_slice(
+                self.causal_mask, (0, 0, mask_shift, 0), (1, 1, query_length, max_decoder_length)
+            )
+        else:
+            causal_mask = self.causal_mask[:, :, :query_length, :key_length]
+
+        batch_size = hidden_states.shape[0]
+        causal_mask = jnp.broadcast_to(causal_mask, (batch_size,) + causal_mask.shape[1:])
+        # Insert two axes before the last one so the mask broadcasts to the causal mask, then combine both.
+        attention_mask = jnp.broadcast_to(jnp.expand_dims(attention_mask, axis=(-3, -2)), causal_mask.shape)
+        attention_mask = combine_masks(attention_mask, causal_mask)
+
+        dropout_rng = None
+        if not deterministic and self.config.attention_dropout > 0.0:
+            dropout_rng = self.make_rng("dropout")
+
+        # During fast autoregressive decoding, we feed one position at a time,
+        # and cache the keys and values step by step.
+        if self.has_variable("cache", "cached_key") or init_cache:
+            key, value, attention_mask = self._concatenate_to_cache(key, value, query, attention_mask)
+
+        # transform boolean mask into float mask
+        attention_bias = lax.select(  # 0.0 where the mask is > 0, the most negative finite value of `self.dtype` elsewhere
+            attention_mask > 0,
+            jnp.full(attention_mask.shape, 0.0).astype(self.dtype),
+            jnp.full(attention_mask.shape, jnp.finfo(self.dtype).min).astype(self.dtype),
+        )
+        # Repeat every key/value head along the head axis (axis 2) by the group factor computed in `setup`.
+        key = jnp.repeat(key, repeats=self.num_key_value_groups, axis=2)
+        value = jnp.repeat(value, repeats=self.num_key_value_groups, axis=2)
+
+        # usual dot product attention
+        attention_dtype = jnp.float32 if self.attention_softmax_in_fp32 else self.dtype
+        attn_weights = dot_product_attention_weights(
+            query,
+            key,
+            bias=attention_bias,
+            dropout_rng=dropout_rng,
+            dropout_rate=self.config.attention_dropout,
+            deterministic=deterministic,
+            dtype=attention_dtype,
+        )
+
+        if self.attention_softmax_in_fp32:  # weights were computed in float32; cast back to the module dtype
+            attn_weights = attn_weights.astype(self.dtype)
+
+        attn_output = jnp.einsum("...hqk,...khd->...qhd", attn_weights, value)
+        attn_output = self._merge_heads(attn_output)
+        attn_output = self.o_proj(attn_output)
+
+        outputs = (attn_output, attn_weights) if output_attentions else (attn_output,)  # weights only on request
+        return outputs
+
+
+class FlaxGemmaMLP(nn.Module):
+    """Gated feed-forward block: `down_proj(up_proj(x) * act(gate_proj(x)))`, all projections bias-free."""
+
+    config: GemmaConfig
+    dtype: jnp.dtype = jnp.float32
+
+    def setup(self):
+        embed_dim = self.config.hidden_size
+        inner_dim = 4 * embed_dim if self.config.intermediate_size is None else self.config.intermediate_size
+        kernel_init = jax.nn.initializers.normal(self.config.initializer_range)
+
+        # An unset `hidden_activation` falls back to `gelu_pytorch_tanh`, reported through `warning_once`.
+        hidden_activation = self.config.hidden_activation
+        if hidden_activation is None:
+            logger.warning_once(
+                "Gemma's activation function should be approximate GeLU and not exact GeLU. "
+                "Changing the activation function to `gelu_pytorch_tanh`."
+                f"if you want to use the legacy `{self.config.hidden_act}`, "
+                f"edit the `model.config` to set `hidden_activation={self.config.hidden_act}` "
+                "  instead of `hidden_act`. See https://github.com/huggingface/transformers/pull/29402 for more details."
+            )
+            hidden_activation = "gelu_pytorch_tanh"
+        self.act = ACT2FN[hidden_activation]
+
+        self.gate_proj = nn.Dense(inner_dim, use_bias=False, dtype=self.dtype, kernel_init=kernel_init)
+        self.down_proj = nn.Dense(embed_dim, use_bias=False, dtype=self.dtype, kernel_init=kernel_init)
+        self.up_proj = nn.Dense(inner_dim, use_bias=False, dtype=self.dtype, kernel_init=kernel_init)
+
+    def __call__(self, hidden_states):
+        gate_states = self.act(self.gate_proj(hidden_states))
+        # Element-wise gating of the up projection, then projection back down to `hidden_size`.
+        return self.down_proj(self.up_proj(hidden_states) * gate_states)
+
+
+# Copied from transformers.models.llama.modeling_flax_llama.FlaxLlamaDecoderLayer with Llama->Gemma
+class FlaxGemmaDecoderLayer(nn.Module):
+    """Pre-norm decoder layer: self-attention then MLP, each wrapped in a residual connection."""
+
+    config: GemmaConfig
+    dtype: jnp.dtype = jnp.float32
+
+    def setup(self):
+        self.input_layernorm = FlaxGemmaRMSNorm(self.config, dtype=self.dtype)
+        self.self_attn = FlaxGemmaAttention(self.config, dtype=self.dtype)
+        self.post_attention_layernorm = FlaxGemmaRMSNorm(self.config, dtype=self.dtype)
+        self.mlp = FlaxGemmaMLP(self.config, dtype=self.dtype)
+
+    def __call__(
+        self,
+        hidden_states,
+        attention_mask=None,
+        position_ids=None,
+        deterministic: bool = True,
+        init_cache: bool = False,
+        output_attentions: bool = False,
+    ):
+        """Return `(hidden_states,)` followed by any extra outputs the attention module produced."""
+        # Attention sub-block: normalise, attend, then add the result back onto the untouched input.
+        normed_input = self.input_layernorm(hidden_states)
+        attn_outputs = self.self_attn(
+            normed_input,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            deterministic=deterministic,
+            init_cache=init_cache,
+            output_attentions=output_attentions,
+        )
+        after_attention = hidden_states + attn_outputs[0]
+
+        # Feed-forward sub-block with the same normalise / transform / add pattern.
+        normed_attention = self.post_attention_layernorm(after_attention)
+        mlp_output = self.mlp(normed_attention)
+        layer_output = after_attention + mlp_output
+
+        return (layer_output,) + attn_outputs[1:]
+
+
+# Copied from transformers.models.gpt_neo.modeling_flax_gpt_neo.FlaxGPTNeoPreTrainedModel with GPTNeo->Gemma, GPT_NEO->GEMMA, transformer->model
+class FlaxGemmaPreTrainedModel(FlaxPreTrainedModel):
+    """
+    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+    models.
+    """
+
+    config_class = GemmaConfig
+    base_model_prefix = "model"
+    module_class: nn.Module = None  # must be set to a module class before `__init__` runs, since `__init__` calls it
+
+    def __init__(
+        self,
+        config: GemmaConfig,
+        input_shape: tuple = (1, 1),
+        seed: int = 0,
+        dtype: jnp.dtype = jnp.float32,
+        _do_init: bool = True,
+        **kwargs,
+    ):
+        module = self.module_class(config=config, dtype=dtype, **kwargs)  # extra kwargs go to the module constructor
+        super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init)
+
+    def init_weights(self, rng: jax.random.PRNGKey, input_shape: tuple, params: FrozenDict = None) -> FrozenDict:
+        # init input tensors
+        input_ids = jnp.zeros(input_shape, dtype="i4")
+        attention_mask = jnp.ones_like(input_ids)
+        position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_shape)
+        params_rng, dropout_rng = jax.random.split(rng)
+        rngs = {"params": params_rng, "dropout": dropout_rng}
+        # Run a module init on the dummy inputs to obtain a complete set of freshly initialised parameters.
+        random_params = self.module.init(rngs, input_ids, attention_mask, position_ids, return_dict=False)["params"]
+
+        if params is not None:
+            random_params = flatten_dict(unfreeze(random_params))
+            params = flatten_dict(unfreeze(params))
+            for missing_key in self._missing_keys:  # only keys recorded as missing are taken from the fresh init
+                params[missing_key] = random_params[missing_key]
+            self._missing_keys = set()
+            return freeze(unflatten_dict(params))
+        else:
+            return random_params
+
+    def init_cache(self, batch_size, max_length):
+        r"""
+        Args:
+            batch_size (`int`):
+                batch_size used for fast auto-regressive decoding. Defines the batch size of the initialized cache.
+            max_length (`int`):
+                maximum possible length for auto-regressive decoding. Defines the sequence length of the initialized
+                cache.
+        """
+        # init input variables to retrieve cache
+        input_ids = jnp.ones((batch_size, max_length))
+        attention_mask = jnp.ones_like(input_ids)
+        position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_ids.shape)
+        # A fixed PRNG key is used here; only the "cache" collection of the result is returned.
+        init_variables = self.module.init(
+            jax.random.PRNGKey(0), input_ids, attention_mask, position_ids, return_dict=False, init_cache=True
+        )
+        return unfreeze(init_variables["cache"])
+
+    @add_start_docstrings_to_model_forward(GEMMA_INPUTS_DOCSTRING)
+    def __call__(
+        self,
+        input_ids,
+        attention_mask=None,
+        position_ids=None,
+        params: Optional[dict] = None,
+        past_key_values: Optional[dict] = None,
+        dropout_rng: jax.random.PRNGKey = None,
+        train: bool = False,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ):
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.return_dict
+
+        batch_size, sequence_length = input_ids.shape
+
+        if position_ids is None:
+            if past_key_values is not None:
+                raise ValueError("Make sure to provide `position_ids` when passing `past_key_values`.")
+            # Without a cache, positions default to 0..sequence_length-1 for every row of the batch.
+            position_ids = jnp.broadcast_to(jnp.arange(sequence_length)[None, :], (batch_size, sequence_length))
+
+        if attention_mask is None:
+            attention_mask = jnp.ones((batch_size, sequence_length))
+
+        # Handle any PRNG if needed
+        rngs = {}
+        if dropout_rng is not None:
+            rngs["dropout"] = dropout_rng
+
+        inputs = {"params": params or self.params}
+
+        # If `past_key_values` is passed, the cache is already initialized, so it is supplied as the "cache" collection and marked mutable so that the FlaxGemmaAttention module can update it.
+        if past_key_values:  # NOTE(review): truthiness here but `is not None` below — an empty dict takes inconsistent branches
+            inputs["cache"] = past_key_values
+            mutable = ["cache"]
+        else:
+            mutable = False
+
+        outputs = self.module.apply(
+            inputs,
+            jnp.array(input_ids, dtype="i4"),
+            jnp.array(attention_mask, dtype="i4"),
+            jnp.array(position_ids, dtype="i4"),
+            not train,  # presumably the module's `deterministic` argument — module signature not visible here
+            False,  # presumably the module's `init_cache` argument — module signature not visible here
+            output_attentions,
+            output_hidden_states,
+            return_dict,
+            rngs=rngs,
+            mutable=mutable,
+        )
+
+        # add updated cache to model output
+        if past_key_values is not None and return_dict:
+            outputs, past_key_values = outputs  # unpacked as (model outputs, mutated collections)
+            outputs["past_key_values"] = unfreeze(past_key_values["cache"])
+            return outputs
+        elif past_key_values is not None and not return_dict:
+            outputs, past_key_values = outputs
+            outputs = outputs[:1] + (unfreeze(past_key_values["cache"]),) + outputs[1:]  # cache goes in second position
+
+        return outputs
+
+
+# Copied from transformers.models.llama.modeling_flax_llama.FlaxLlamaLayerCollection with Llama->Gemma
+class FlaxGemmaLayerCollection(nn.Module):
+    config: GemmaConfig
+    dtype: jnp.dtype = jnp.float32
+
+    def setup(self):
+        self.blocks = [
+            FlaxGemmaDecoderLayer(self.config, dtype=self.dtype, name=str(i))
+            for i in range(self.config.num_hidden_layers)
+        ]
+
+    def __call__(
+        self,
+        hidden_states,
+        attention_mask=None,
+        position_ids=None,
+        deterministic: bool = True,
+        init_cache: bool = False,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = False,
+    ):
+        all_attentions = () if output_attentions else None
+        all_hidden_states = () if output_hidden_states else None
+
+        for block in self.blocks:
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+            layer_outputs = block(
+                hidden_states,
+                attention_mask=attention_mask,
+                position_ids=position_ids,
+                deterministic=deterministic,
+                init_cache=init_cache,
+                output_attentions=output_attentions,
+            )
+            hidden_states = layer_outputs[0]
+
+            if output_attentions:
+                all_attentions += (layer_outputs[1],)
+
+        # this contains possible `None` values - `FlaxGemmaModule` will filter them out
+        outputs = (hidden_states, all_hidden_states, all_attentions)
+
+        return outputs
+
+
+# Copied from transformers.models.llama.modeling_flax_llama.FlaxLlamaModule with Llama->Gemma
+class FlaxGemmaModule(nn.Module):
+    config: GemmaConfig
+    dtype: jnp.dtype = jnp.float32
+
+    def setup(self):
+        self.hidden_size = self.config.hidden_size
+        embedding_init = jax.nn.initializers.normal(stddev=self.config.initializer_range)
+        self.embed_tokens = nn.Embed(
+            self.config.vocab_size,
+            self.hidden_size,
+            embedding_init=embedding_init,
+            dtype=self.dtype,
+        )
+        self.layers = FlaxGemmaLayerCollection(self.config, dtype=self.dtype)
+        self.norm = FlaxGemmaRMSNorm(self.config, dtype=self.dtype)
+
+    # Ignore copy
+    def __call__(
+        self,
+        input_ids,
+        attention_mask=None,
+        position_ids=None,
+        deterministic=True,
+        init_cache: bool = False,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+    ):
+        input_embeds = self.embed_tokens(input_ids.astype("i4"))
+
+        input_embeds = input_embeds * (self.config.hidden_size**0.5)
+
+        outputs = self.layers(
+            input_embeds,
+            position_ids=position_ids,
+            attention_mask=attention_mask,
+            deterministic=deterministic,
+            init_cache=init_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        hidden_states = outputs[0]
+        hidden_states = self.norm(hidden_states)
+
+        if output_hidden_states:
+            all_hidden_states = outputs[1] + (hidden_states,)
+            outputs = (hidden_states, all_hidden_states) + outputs[2:]
+        else:
+            outputs = (hidden_states,) + outputs[1:]
+
+        if not return_dict:
+            return tuple(v for v in outputs if v is not None)
+
+        return FlaxBaseModelOutput(
+            last_hidden_state=hidden_states,
+            hidden_states=outputs[1],
+            attentions=outputs[-1],
+        )
+
+
+@add_start_docstrings(
+    "The bare Gemma Model transformer outputting raw hidden-states without any specific head on top.",
+    GEMMA_START_DOCSTRING,
+)
+# Copied from transformers.models.llama.modeling_flax_llama.FlaxLlamaModel with Llama->Gemma
+class FlaxGemmaModel(FlaxGemmaPreTrainedModel):
+    # The Flax module wrapped by this model: the headless `FlaxGemmaModule`.
+    module_class = FlaxGemmaModule
+
+
+# Presumably appends a usage sample to the `FlaxGemmaModel` call docstring, built from the checkpoint/config
+# names below and documenting `FlaxBaseModelOutput` as the output type; verify against the helper.
+append_call_sample_docstring(
+    FlaxGemmaModel,
+    _CHECKPOINT_FOR_DOC,
+    FlaxBaseModelOutput,
+    _CONFIG_FOR_DOC,
+    real_checkpoint=_REAL_CHECKPOINT_FOR_DOC,
+)
+
+
+# Copied from transformers.models.llama.modeling_flax_llama.FlaxLlamaForCausalLMModule with Llama->Gemma
+class FlaxGemmaForCausalLMModule(nn.Module):
+    config: GemmaConfig
+    dtype: jnp.dtype = jnp.float32
+
+    def setup(self):
+        self.model = FlaxGemmaModule(self.config, dtype=self.dtype)
+        self.lm_head = nn.Dense(
+            self.config.vocab_size,
+            use_bias=False,
+            dtype=self.dtype,
+            kernel_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
+        )
+
+    # Ignore copy
+    def __call__(
+        self,
+        input_ids,
+        attention_mask=None,
+        position_ids=None,
+        deterministic: bool = True,
+        init_cache: bool = False,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+    ):
+        outputs = self.model(
+            input_ids,
+            position_ids=position_ids,
+            attention_mask=attention_mask,
+            deterministic=deterministic,
+            init_cache=init_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        hidden_states = outputs[0]
+        if self.config.tie_word_embeddings:
+            shared_kernel = self.model.variables["params"]["embed_tokens"]["embedding"].T
+            lm_logits = self.lm_head.apply({"params": {"kernel": shared_kernel}}, hidden_states)
+        else:
+            lm_logits = self.lm_head(hidden_states)
+
+        if not return_dict:
+            return (lm_logits,) + outputs[1:]
+
+        return FlaxCausalLMOutput(logits=lm_logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions)
+
+
+@add_start_docstrings(
+    """
+    The Gemma Model transformer with a language modeling head (linear layer) on top.
+    """,
+    GEMMA_START_DOCSTRING,
+)
+# Copied from transformers.models.gptj.modeling_flax_gptj.FlaxGPTJForCausalLM with GPTJ->Gemma
+class FlaxGemmaForCausalLM(FlaxGemmaPreTrainedModel):
+    module_class = FlaxGemmaForCausalLMModule
+
+    def prepare_inputs_for_generation(self, input_ids, max_length, attention_mask: Optional[jax.Array] = None):
+        # initializing the cache
+        batch_size, seq_length = input_ids.shape
+
+        past_key_values = self.init_cache(batch_size, max_length)
+        # Note that usually one would have to put 0's in the attention_mask for x > input_ids.shape[-1] and x < cache_length.
+        # But since Gemma uses a causal mask, those positions are masked anyways.
+        # Thus we can create a single static attention_mask here, which is more efficient for compilation
+        extended_attention_mask = jnp.ones((batch_size, max_length), dtype="i4")
+        if attention_mask is not None:
+            position_ids = attention_mask.cumsum(axis=-1) - 1
+            extended_attention_mask = lax.dynamic_update_slice(extended_attention_mask, attention_mask, (0, 0))
+        else:
+            position_ids = jnp.broadcast_to(jnp.arange(seq_length, dtype="i4")[None, :], (batch_size, seq_length))
+
+        return {
+            "past_key_values": past_key_values,
+            "attention_mask": extended_attention_mask,
+            "position_ids": position_ids,
+        }
+
+    def update_inputs_for_generation(self, model_outputs, model_kwargs):
+        model_kwargs["past_key_values"] = model_outputs.past_key_values
+        model_kwargs["position_ids"] = model_kwargs["position_ids"][:, -1:] + 1
+        return model_kwargs
+
+
+# Presumably appends a usage sample to the `FlaxGemmaForCausalLM` call docstring, documenting
+# `FlaxCausalLMOutput` as the output type; verify against the helper.
+append_call_sample_docstring(
+    FlaxGemmaForCausalLM,
+    _CHECKPOINT_FOR_DOC,
+    FlaxCausalLMOutput,
+    _CONFIG_FOR_DOC,
+    real_checkpoint=_REAL_CHECKPOINT_FOR_DOC,
+)
+
+
+# Names exported by `from ... import *` for this module.
+__all__ = ["FlaxGemmaForCausalLM", "FlaxGemmaModel", "FlaxGemmaPreTrainedModel"]
diff --git a/phivenv/Lib/site-packages/transformers/models/gemma/modeling_gemma.py b/phivenv/Lib/site-packages/transformers/models/gemma/modeling_gemma.py
new file mode 100644
index 0000000000000000000000000000000000000000..5f72f27d9382aee69bc12d4c7b5b82e93d616dcd
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/gemma/modeling_gemma.py
@@ -0,0 +1,504 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/gemma/modular_gemma.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_gemma.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# coding=utf-8
+# Copyright 2024 Google Inc. HuggingFace Inc. team. All rights reserved.
+#
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Callable, Optional, Union
+
+import torch
+from torch import nn
+
+from ...activations import ACT2FN
+from ...cache_utils import Cache, DynamicCache
+from ...generation import GenerationMixin
+from ...masking_utils import create_causal_mask
+from ...modeling_layers import (
+    GenericForSequenceClassification,
+    GenericForTokenClassification,
+    GradientCheckpointingLayer,
+)
+from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
+from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from ...processing_utils import Unpack
+from ...utils import TransformersKwargs, auto_docstring, can_return_tuple
+from ...utils.deprecation import deprecate_kwarg
+from ...utils.generic import check_model_inputs
+from .configuration_gemma import GemmaConfig
+
+
+class GemmaRMSNorm(nn.Module):
+    def __init__(self, dim: int, eps: float = 1e-6):
+        super().__init__()
+        self.eps = eps
+        self.weight = nn.Parameter(torch.zeros(dim))
+
+    def _norm(self, x):
+        return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
+
+    def forward(self, x):
+        output = self._norm(x.float())
+        # Llama does x.to(float16) * w whilst Gemma is (x * w).to(float16)
+        # See https://github.com/huggingface/transformers/pull/29402
+        output = output * (1.0 + self.weight.float())
+        return output.type_as(x)
+
+    def extra_repr(self):
+        return f"{tuple(self.weight.shape)}, eps={self.eps}"
+
+
+class GemmaMLP(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.intermediate_size = config.intermediate_size
+        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
+        self.act_fn = ACT2FN[config.hidden_act]
+
+    def forward(self, x):
+        down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
+        return down_proj
+
+
+class GemmaRotaryEmbedding(nn.Module):
+    """Computes the cosine/sine tables of the rotary position embedding for given position ids."""
+
+    inv_freq: torch.Tensor  # fix linting for `register_buffer`
+
+    def __init__(self, config: GemmaConfig, device=None):
+        super().__init__()
+        # BC: "rope_type" was originally "type"
+        if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict):
+            self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
+        else:
+            self.rope_type = "default"
+        self.max_seq_len_cached = config.max_position_embeddings
+        self.original_max_seq_len = config.max_position_embeddings
+
+        self.config = config
+        # Pick the initialisation function registered for this RoPE type.
+        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
+
+        # The init function returns the inverse frequencies and a scaling factor applied to cos/sin in `forward`.
+        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
+        # Registered as a non-persistent buffer, i.e. not saved in the module's `state_dict`.
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+        # Second reference to the initial frequencies - presumably read by `dynamic_rope_update`; verify there.
+        self.original_inv_freq = self.inv_freq
+
+    @torch.no_grad()
+    @dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
+    def forward(self, x, position_ids):
+        """Return `(cos, sin)` for `position_ids`, both cast to the dtype of `x`.
+
+        `x` is only read for its device and dtype.
+        """
+        # Assuming `inv_freq` is 1-D with F entries and `position_ids` is (batch, seq):
+        # inv_freq_expanded is (batch, F, 1) and position_ids_expanded is (batch, 1, seq).
+        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
+        position_ids_expanded = position_ids[:, None, :].float()
+
+        # "mps" (and any non-string device type) falls back to "cpu" for the autocast context.
+        device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
+        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+            # Outer product of frequencies and positions, transposed to (batch, seq, F).
+            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
+            # Duplicate along the last dimension, giving (batch, seq, 2 * F).
+            emb = torch.cat((freqs, freqs), dim=-1)
+            cos = emb.cos() * self.attention_scaling
+            sin = emb.sin() * self.attention_scaling
+
+        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
+
+
+def rotate_half(x):
+    """Rotates half the hidden dims of the input."""
+    x1 = x[..., : x.shape[-1] // 2]
+    x2 = x[..., x.shape[-1] // 2 :]
+    return torch.cat((-x2, x1), dim=-1)
+
+
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
+    """Applies Rotary Position Embedding to the query and key tensors.
+
+    Args:
+        q (`torch.Tensor`): The query tensor.
+        k (`torch.Tensor`): The key tensor.
+        cos (`torch.Tensor`): The cosine part of the rotary embedding.
+        sin (`torch.Tensor`): The sine part of the rotary embedding.
+        position_ids (`torch.Tensor`, *optional*):
+            Deprecated and unused.
+        unsqueeze_dim (`int`, *optional*, defaults to 1):
+            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
+            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
+            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
+            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
+            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
+            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
+    Returns:
+        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
+    """
+    cos = cos.unsqueeze(unsqueeze_dim)
+    sin = sin.unsqueeze(unsqueeze_dim)
+    q_embed = (q * cos) + (rotate_half(q) * sin)
+    k_embed = (k * cos) + (rotate_half(k) * sin)
+    return q_embed, k_embed
+
+
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """
+    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
+    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
+    """
+    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+    if n_rep == 1:
+        return hidden_states
+    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
+    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
+
+
+def eager_attention_forward(
+    module: nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: Optional[torch.Tensor],
+    scaling: float,
+    dropout: float = 0.0,
+    **kwargs: Unpack[TransformersKwargs],
+):
+    key_states = repeat_kv(key, module.num_key_value_groups)
+    value_states = repeat_kv(value, module.num_key_value_groups)
+
+    attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
+    if attention_mask is not None:
+        causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
+        attn_weights = attn_weights + causal_mask
+
+    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
+    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
+    attn_output = torch.matmul(attn_weights, value_states)
+    attn_output = attn_output.transpose(1, 2).contiguous()
+
+    return attn_output, attn_weights
+
+
+class GemmaAttention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+    def __init__(self, config: GemmaConfig, layer_idx: int):
+        super().__init__()
+        self.config = config
+        # Index of the owning decoder layer; passed to the cache's `update` call in `forward`.
+        self.layer_idx = layer_idx
+        # `config.head_dim` when the config defines it, otherwise hidden_size // num_attention_heads.
+        self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
+        # Number of query heads per key/value head.
+        self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
+        self.scaling = self.head_dim**-0.5
+        self.attention_dropout = config.attention_dropout
+        self.is_causal = True
+
+        # Queries use `num_attention_heads` heads; keys and values use `num_key_value_heads` heads.
+        self.q_proj = nn.Linear(
+            config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.k_proj = nn.Linear(
+            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.v_proj = nn.Linear(
+            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.o_proj = nn.Linear(
+            config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
+        )
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
+        attention_mask: Optional[torch.Tensor],
+        past_key_values: Optional[Cache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        # Returns `(attn_output, attn_weights)`; `attn_output` has the leading dims of `hidden_states` and has
+        # been passed through `o_proj`. `position_embeddings` is the `(cos, sin)` pair of the rotary embedding.
+        input_shape = hidden_states.shape[:-1]
+        # Split the projected features into (..., heads, head_dim); the head count is inferred via -1.
+        hidden_shape = (*input_shape, -1, self.head_dim)
+
+        # After the transpose the head axis is dim 1 (assumes `hidden_states` is (batch, seq, hidden)).
+        query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+        key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+        value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+
+        cos, sin = position_embeddings
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+
+        if past_key_values is not None:
+            # sin and cos are specific to RoPE models; cache_position needed for the static cache
+            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+            # The cache returns the key/value states that attention is computed against.
+            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        # Use the eager implementation unless the config selects another registered attention function.
+        attention_interface: Callable = eager_attention_forward
+        if self.config._attn_implementation != "eager":
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            # Attention dropout is only applied in training mode.
+            dropout=0.0 if not self.training else self.attention_dropout,
+            scaling=self.scaling,
+            **kwargs,
+        )
+
+        # Merge the head dimensions back into a single feature dimension before the output projection.
+        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
+        attn_output = self.o_proj(attn_output)
+        return attn_output, attn_weights
+
+
+class GemmaDecoderLayer(GradientCheckpointingLayer):
+    def __init__(self, config: GemmaConfig, layer_idx: int):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+
+        self.self_attn = GemmaAttention(config=config, layer_idx=layer_idx)
+
+        self.mlp = GemmaMLP(config)
+        self.input_layernorm = GemmaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = GemmaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        use_cache: Optional[bool] = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> torch.Tensor:
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+        # Self Attention
+        hidden_states, _ = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            position_embeddings=position_embeddings,
+            **kwargs,
+        )
+        hidden_states = residual + hidden_states
+
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states
+        return hidden_states
+
+
+@auto_docstring
+class GemmaPreTrainedModel(PreTrainedModel):
+    # Common base class of the Gemma models in this file; it only declares class-level settings.
+    config: GemmaConfig
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["GemmaDecoderLayer"]
+    _skip_keys_device_placement = ["past_key_values"]
+    # Attention implementations declared as supported.
+    _supports_flash_attn = True
+    _supports_sdpa = True
+    _supports_flex_attn = True
+
+    _can_compile_fullgraph = True
+    _supports_attention_backend = True
+    # Output name -> module class. Presumably the modules whose outputs are collected under that name when
+    # the output is requested - verify against `check_model_inputs`.
+    _can_record_outputs = {
+        "hidden_states": GemmaDecoderLayer,
+        "attentions": GemmaAttention,
+    }
+
+
+@auto_docstring
+class GemmaModel(GemmaPreTrainedModel):
+    # Gemma decoder stack without a head: embeddings, `num_hidden_layers` decoder layers and a final RMS norm.
+    def __init__(self, config: GemmaConfig):
+        super().__init__(config)
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+        self.layers = nn.ModuleList(
+            [GemmaDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+        )
+        self.norm = GemmaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        # One rotary embedding module shared by every decoder layer.
+        self.rotary_emb = GemmaRotaryEmbedding(config=config)
+        self.gradient_checkpointing = False
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @check_model_inputs
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> BaseModelOutputWithPast:
+        # The XOR is true when both inputs are missing or both are given; exactly one must be provided.
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+
+        # Create a fresh cache when caching is requested but none was passed in.
+        if use_cache and past_key_values is None:
+            past_key_values = DynamicCache(config=self.config)
+
+        # Default cache positions continue from the number of tokens already held by the cache.
+        if cache_position is None:
+            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+            cache_position = torch.arange(
+                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
+            )
+
+        # Default position ids are the cache positions with a leading dimension of size 1.
+        if position_ids is None:
+            position_ids = cache_position.unsqueeze(0)
+
+        causal_mask = create_causal_mask(
+            config=self.config,
+            input_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            cache_position=cache_position,
+            past_key_values=past_key_values,
+            position_ids=position_ids,
+        )
+
+        # the (still unscaled) input embeddings are the initial hidden states
+        hidden_states = inputs_embeds
+
+        # create position embeddings to be shared across the decoder layers
+        position_embeddings = self.rotary_emb(hidden_states, position_ids)
+
+        # normalized: scale the embeddings by sqrt(hidden_size), with the factor cast to the activation dtype
+        # Gemma downcasts the below to float16, causing sqrt(3072)=55.4256 to become 55.5
+        # See https://github.com/huggingface/transformers/pull/29402
+        normalizer = torch.tensor(self.config.hidden_size**0.5, dtype=hidden_states.dtype)
+        hidden_states = hidden_states * normalizer
+
+        # Only the first `config.num_hidden_layers` layers are run.
+        for decoder_layer in self.layers[: self.config.num_hidden_layers]:
+            hidden_states = decoder_layer(
+                hidden_states,
+                attention_mask=causal_mask,
+                position_ids=position_ids,
+                past_key_values=past_key_values,
+                use_cache=use_cache,
+                cache_position=cache_position,
+                position_embeddings=position_embeddings,
+                **kwargs,
+            )
+        hidden_states = self.norm(hidden_states)
+        # `hidden_states` / `attentions` are not set here; presumably filled in by `check_model_inputs` - verify.
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values if use_cache else None,
+        )
+
+
+@auto_docstring
+class GemmaForCausalLM(GemmaPreTrainedModel, GenerationMixin):
+    # `GemmaModel` followed by a bias-free linear projection onto the vocabulary.
+    _tied_weights_keys = ["lm_head.weight"]
+    _tp_plan = {"lm_head": "colwise_rep"}
+    _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.model = GemmaModel(config)
+        self.vocab_size = config.vocab_size
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> CausalLMOutputWithPast:
+        r"""
+        Example:
+
+        ```python
+        >>> from transformers import AutoTokenizer, GemmaForCausalLM
+
+        >>> model = GemmaForCausalLM.from_pretrained("google/gemma-7b")
+        >>> tokenizer = AutoTokenizer.from_pretrained("google/gemma-7b")
+
+        >>> prompt = "What is your favorite condiment?"
+        >>> inputs = tokenizer(prompt, return_tensors="pt")
+
+        >>> # Generate
+        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
+        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "What is your favorite condiment?"
+        ```"""
+        # Run the base model; everything except `labels` and `logits_to_keep` is forwarded unchanged.
+        outputs: BaseModelOutputWithPast = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            **kwargs,
+        )
+
+        hidden_states = outputs.last_hidden_state
+        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
+        # An int keeps the last `logits_to_keep` positions (0 gives slice(0, None), i.e. all positions);
+        # any other value is used directly as the index along dim 1.
+        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+        logits = self.lm_head(hidden_states[:, slice_indices, :])
+
+        # The loss is only computed when labels are provided.
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs)
+
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+# Gemma sequence classifier: adds nothing of its own, all behaviour comes from the two base classes.
+class GemmaForSequenceClassification(GenericForSequenceClassification, GemmaPreTrainedModel):
+    pass
+
+
+# Gemma token classifier: adds nothing of its own, all behaviour comes from the two base classes.
+class GemmaForTokenClassification(GenericForTokenClassification, GemmaPreTrainedModel):
+    pass
+
+
+# Names exported by `from ... import *` for this module.
+__all__ = [
+    "GemmaModel",
+    "GemmaForCausalLM",
+    "GemmaForSequenceClassification",
+    "GemmaForTokenClassification",
+    "GemmaPreTrainedModel",
+]
diff --git a/phivenv/Lib/site-packages/transformers/models/gemma/modular_gemma.py b/phivenv/Lib/site-packages/transformers/models/gemma/modular_gemma.py
new file mode 100644
index 0000000000000000000000000000000000000000..281fcd54fb7d526f1772ea81a425a154612dd824
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/gemma/modular_gemma.py
@@ -0,0 +1,476 @@
+# coding=utf-8
+# Copyright 2024 Google Inc. HuggingFace Inc. team. All rights reserved.
+#
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING, Any, Optional
+
+import sentencepiece as spm
+import torch
+from torch import nn
+
+from ...cache_utils import Cache, DynamicCache
+from ...configuration_utils import PretrainedConfig
+from ...masking_utils import create_causal_mask
+from ...modeling_outputs import BaseModelOutputWithPast
+from ...processing_utils import Unpack
+from ...tokenization_utils import AddedToken, PreTrainedTokenizer
+from ...utils import TransformersKwargs, logging
+from ..llama.modeling_llama import (
+    LlamaForCausalLM,
+    LlamaForSequenceClassification,
+    LlamaForTokenClassification,
+    LlamaMLP,
+    LlamaModel,
+)
+from ..llama.tokenization_llama import LlamaTokenizer
+
+
+if TYPE_CHECKING:
+    from ...tokenization_utils_base import TextInput
+
+VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model"}
+
+SPIECE_UNDERLINE = "▁"  # marker that `GemmaTokenizer._decode` replaces with a plain space
+
+
+logger = logging.get_logger(__name__)
+
+
+class GemmaConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`GemmaModel`]. It is used to instantiate a Gemma
+    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+    defaults will yield a similar configuration to that of the Gemma-7B.
+    e.g. [google/gemma-7b](https://huggingface.co/google/gemma-7b)
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+    Args:
+        vocab_size (`int`, *optional*, defaults to 256000):
+            Vocabulary size of the Gemma model. Defines the number of different tokens that can be represented by the
+            `input_ids` passed when calling [`GemmaModel`]
+        hidden_size (`int`, *optional*, defaults to 3072):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 24576):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 28):
+            Number of hidden layers in the Transformer decoder.
+        num_attention_heads (`int`, *optional*, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer decoder.
+        num_key_value_heads (`int`, *optional*, defaults to 16):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details, check out [this
+            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
+            `num_attention_heads`.
+        head_dim (`int`, *optional*, defaults to 256):
+            The attention head dimension.
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
+            The legacy activation function. It is overwritten by the `hidden_activation`.
+        hidden_activation (`str` or `function`, *optional*):
+            The non-linear activation function (function or string) in the decoder. Will default to `"gelu_pytorch_tanh"`
+            if not specified. `"gelu_pytorch_tanh"` uses an approximation of the `"gelu"` activation function.
+        max_position_embeddings (`int`, *optional*, defaults to 8192):
+            The maximum sequence length that this model might ever be used with.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        pad_token_id (`int`, *optional*, defaults to 0):
+            Padding token id.
+        eos_token_id (`int`, *optional*, defaults to 1):
+            End of stream token id.
+        bos_token_id (`int`, *optional*, defaults to 2):
+            Beginning of stream token id.
+        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
+            Whether to tie weight embeddings
+        rope_theta (`float`, *optional*, defaults to 10000.0):
+            The base period of the RoPE embeddings.
+        attention_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use a bias in the query, key, value and output projection layers during self-attention.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+    ```python
+    >>> from transformers import GemmaModel, GemmaConfig
+    >>> # Initializing a Gemma gemma-7b style configuration
+    >>> configuration = GemmaConfig()
+    >>> # Initializing a model from the gemma-7b style configuration
+    >>> model = GemmaModel(configuration)
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "gemma"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    base_model_tp_plan = {  # NOTE(review): presumably the tensor-parallel split per submodule pattern — confirm
+        "layers.*.self_attn.q_proj": "colwise",
+        "layers.*.self_attn.k_proj": "colwise",
+        "layers.*.self_attn.v_proj": "colwise",
+        "layers.*.self_attn.o_proj": "rowwise",
+        "layers.*.mlp.gate_proj": "colwise",
+        "layers.*.mlp.up_proj": "colwise",
+        "layers.*.mlp.down_proj": "rowwise",
+    }
+    base_model_pp_plan = {  # NOTE(review): looks like submodule -> (input names, output names) — confirm
+        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
+        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
+        "norm": (["hidden_states"], ["hidden_states"]),
+    }
+
+    def __init__(
+        self,
+        vocab_size=256000,
+        hidden_size=3072,
+        intermediate_size=24576,
+        num_hidden_layers=28,
+        num_attention_heads=16,
+        num_key_value_heads=16,
+        head_dim=256,
+        hidden_act="gelu_pytorch_tanh",
+        hidden_activation=None,
+        max_position_embeddings=8192,
+        initializer_range=0.02,
+        rms_norm_eps=1e-6,
+        use_cache=True,
+        pad_token_id=0,
+        eos_token_id=1,
+        bos_token_id=2,
+        tie_word_embeddings=True,
+        rope_theta=10000.0,
+        attention_bias=False,
+        attention_dropout=0.0,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.head_dim = head_dim
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.hidden_activation = hidden_activation  # stored as given; `None` is not resolved here
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+
+        super().__init__(  # special-token ids, weight tying and extra kwargs are forwarded to the base class
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+
+class GemmaTokenizer(LlamaTokenizer, PreTrainedTokenizer):
+    """
+    Construct a Gemma tokenizer. Based on byte-level Byte-Pair-Encoding. The padding token defaults to `"<pad>"`
+    (see `pad_token` below); the SentencePiece model is loaded from `vocab_file` at construction time.
+
+    Args:
+        vocab_file (`str`):
+            Path to the vocabulary file.
+        unk_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<unk>"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        bos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<bos>"`):
+            The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token.
+        eos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<eos>"`):
+            The end of sequence token.
+        pad_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<pad>"`):
+            A special token used to make arrays of tokens the same size for batching purpose. Will then be ignored by
+            attention mechanisms or loss computation.
+        sp_model_kwargs (`dict[str, Any]`, *optional*):
+            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
+            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
+            to set:
+
+            - `enable_sampling`: Enable subword regularization.
+            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
+
+              - `nbest_size = {0,1}`: No sampling is performed.
+              - `nbest_size > 1`: samples from the nbest_size results.
+              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
+                using forward-filtering-and-backward-sampling algorithm.
+
+            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
+              BPE-dropout.
+
+        add_bos_token (`bool`, *optional*, defaults to `True`):
+            Whether or not to add a `bos_token` at the start of sequences.
+        add_eos_token (`bool`, *optional*, defaults to `False`):
+            Whether or not to add an `eos_token` at the end of sequences.
+        clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
+            Whether or not to cleanup spaces after decoding, cleanup consists in removing potential artifacts like
+            extra spaces.
+        use_default_system_prompt (`bool`, *optional*, defaults to `False`):
+            Whether or not the default system prompt for Gemma should be used.
+        spaces_between_special_tokens (`bool`, *optional*, defaults to `False`):
+            Whether or not to add spaces between special tokens.
+    """
+
+    def __init__(
+        self,
+        vocab_file,
+        unk_token="<unk>",
+        bos_token="<bos>",
+        eos_token="<eos>",
+        pad_token="<pad>",
+        sp_model_kwargs: Optional[dict[str, Any]] = None,
+        add_bos_token=True,
+        add_eos_token=False,
+        clean_up_tokenization_spaces=False,
+        use_default_system_prompt=False,
+        spaces_between_special_tokens=False,
+        **kwargs,
+    ):
+        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
+        bos_token = AddedToken(bos_token, normalized=False, special=True) if isinstance(bos_token, str) else bos_token
+        eos_token = AddedToken(eos_token, normalized=False, special=True) if isinstance(eos_token, str) else eos_token
+        unk_token = AddedToken(unk_token, normalized=False, special=True) if isinstance(unk_token, str) else unk_token
+        pad_token = AddedToken(pad_token, normalized=False, special=True) if isinstance(pad_token, str) else pad_token
+
+        self.vocab_file = vocab_file
+        self.add_bos_token = add_bos_token
+        self.add_eos_token = add_eos_token
+        self.use_default_system_prompt = use_default_system_prompt
+        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+        self.sp_model.Load(vocab_file)  # the model is loaded before the base-class initializer runs
+
+        PreTrainedTokenizer.__init__(  # explicit base call: this method does not run `LlamaTokenizer.__init__`
+            self,
+            bos_token=bos_token,
+            eos_token=eos_token,
+            unk_token=unk_token,
+            pad_token=pad_token,
+            add_bos_token=add_bos_token,
+            add_eos_token=add_eos_token,
+            sp_model_kwargs=sp_model_kwargs,
+            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+            use_default_system_prompt=use_default_system_prompt,
+            spaces_between_special_tokens=spaces_between_special_tokens,
+            **kwargs,
+        )
+
+    def get_spm_processor(self):
+        raise AttributeError("Not needed for Gemma")  # NOTE(review): presumably disables an inherited method — confirm
+
+    def unk_token_length(self):
+        raise AttributeError("Not needed for Gemma")  # NOTE(review): presumably disables an inherited method — confirm
+
+    def tokenize(self, text: "TextInput", **kwargs) -> list[str]:
+        """
+        Args:
+            text: TextInput
+        Simply calls PreTrainedTokenizer's method
+        """
+        return PreTrainedTokenizer.tokenize(self, text, **kwargs)  # explicit base call, not `super()`
+
+    def _tokenize(self, text, **kwargs):
+        """
+        Args:
+            text: TextInput
+        Returns a tokenized string. The Gemma tokenizer never adds a prefix space.
+        """
+        return self.sp_model.encode(text, out_type=str)  # `kwargs` is accepted but not used
+
+    def _decode(
+        self,
+        token_ids: list[int],
+        skip_special_tokens: bool = False,
+        spaces_between_special_tokens: bool = False,
+        **kwargs,
+    ) -> str:
+        sub_texts = []  # decoded pieces, in output order
+        current_sub_text = []  # pending run of ids that are not added tokens
+        for ids in token_ids:
+            if skip_special_tokens and ids in self.all_special_ids:
+                continue
+            if ids in self._added_tokens_decoder:  # added tokens are emitted verbatim, not via SentencePiece
+                if current_sub_text:
+                    sub_texts.append(self.sp_model.decode(current_sub_text))  # flush the pending run first
+                sub_texts.append(self._added_tokens_decoder[ids].content)
+                current_sub_text = []
+            else:
+                current_sub_text.append(ids)
+        if current_sub_text:
+            sub_texts.append(self.sp_model.decode(current_sub_text))  # flush whatever is left at the end
+
+        if spaces_between_special_tokens:
+            sub_texts = " ".join(sub_texts)  # the space goes between every piece, not only around added tokens
+        else:
+            sub_texts = "".join(sub_texts)
+
+        return sub_texts.replace(SPIECE_UNDERLINE, " ")  # turn remaining "▁" markers into plain spaces
+
+    def convert_tokens_to_string(self, tokens):
+        """Converts a sequence of tokens (string) in a single string."""
+        current_sub_tokens = []  # pending run of tokens that are not added tokens
+        out_string = ""
+        for token in tokens:
+            # make sure that special tokens are not decoded using sentencepiece model
+            if token in self._added_tokens_encoder:
+                out_string += self.sp_model.decode(current_sub_tokens) + token
+                current_sub_tokens = []
+            else:
+                current_sub_tokens.append(token)
+        out_string += self.sp_model.decode(current_sub_tokens)  # decode the trailing run (possibly empty)
+        return out_string
+
+
+class GemmaRMSNorm(nn.Module):
+    def __init__(self, dim: int, eps: float = 1e-6):
+        super().__init__()
+        self.weight = nn.Parameter(torch.zeros(dim))  # zero-initialised: forward scales by (1 + weight)
+        self.eps = eps
+
+    def _norm(self, x):
+        return x * torch.rsqrt(torch.mean(x.pow(2), dim=-1, keepdim=True) + self.eps)
+
+    def forward(self, x):
+        # Llama does x.to(float16) * w whilst Gemma is (x * w).to(float16)
+        # See https://github.com/huggingface/transformers/pull/29402
+        scale = 1.0 + self.weight.float()
+        normed = self._norm(x.float())  # normalise in float32, cast back only at the very end
+        return (normed * scale).type_as(x)
+
+    def extra_repr(self):
+        return f"{tuple(self.weight.shape)}, eps={self.eps}"
+
+
+class GemmaMLP(LlamaMLP):
+    def __init__(self, config):
+        super().__init__(config)  # NOTE(review): presumably sets `hidden_size` / `intermediate_size` read below — confirm
+        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)  # replaces the parent's layer
+        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)  # bias-free, like the others
+        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)  # projects back to hidden_size
+
+
+class GemmaModel(LlamaModel):
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> BaseModelOutputWithPast:
+        if (input_ids is None) ^ (inputs_embeds is not None):  # XOR: rejects both "neither" and "both" given
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+
+        if use_cache and past_key_values is None:
+            past_key_values = DynamicCache(config=self.config)  # start a fresh cache when caching is requested
+
+        if cache_position is None:  # default: continue right after the tokens already held in the cache
+            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+            cache_position = torch.arange(
+                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
+            )
+
+        if position_ids is None:
+            position_ids = cache_position.unsqueeze(0)  # add a leading dimension of size 1
+
+        causal_mask = create_causal_mask(
+            config=self.config,
+            input_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            cache_position=cache_position,
+            past_key_values=past_key_values,
+            position_ids=position_ids,
+        )
+
+        # the token embeddings are the initial hidden states (they are scaled further below)
+        hidden_states = inputs_embeds
+
+        # create position embeddings to be shared across the decoder layers
+        position_embeddings = self.rotary_emb(hidden_states, position_ids)
+
+        # scale the embeddings by sqrt(hidden_size); the factor is created in the hidden-state dtype
+        # Gemma downcasts the below to float16, causing sqrt(3072)=55.4256 to become 55.5
+        # See https://github.com/huggingface/transformers/pull/29402
+        normalizer = torch.tensor(self.config.hidden_size**0.5, dtype=hidden_states.dtype)
+        hidden_states = hidden_states * normalizer
+
+        for decoder_layer in self.layers[: self.config.num_hidden_layers]:
+            hidden_states = decoder_layer(
+                hidden_states,
+                attention_mask=causal_mask,
+                position_ids=position_ids,
+                past_key_values=past_key_values,
+                use_cache=use_cache,
+                cache_position=cache_position,
+                position_embeddings=position_embeddings,
+                **kwargs,
+            )
+        hidden_states = self.norm(hidden_states)  # final norm applied after the last decoder layer
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values if use_cache else None,  # the cache is only returned when requested
+        )
+
+
+class GemmaForCausalLM(LlamaForCausalLM):
+    def forward(**super_kwargs):  # NOTE(review): no `self`; presumably a template for code generation — confirm
+        r"""
+        Example:
+
+        ```python
+        >>> from transformers import AutoTokenizer, GemmaForCausalLM
+
+        >>> model = GemmaForCausalLM.from_pretrained("google/gemma-7b")
+        >>> tokenizer = AutoTokenizer.from_pretrained("google/gemma-7b")
+
+        >>> prompt = "What is your favorite condiment?"
+        >>> inputs = tokenizer(prompt, return_tensors="pt")
+
+        >>> # Generate
+        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
+        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "What is your favorite condiment?"
+        ```"""
+        return super().forward(**super_kwargs)  # only the docstring differs from the parent's forward
+
+
+class GemmaForSequenceClassification(LlamaForSequenceClassification):
+    pass  # no overrides: a Gemma-named subclass of the Llama implementation
+
+
+class GemmaForTokenClassification(LlamaForTokenClassification):
+    pass  # no overrides: a Gemma-named subclass of the Llama implementation
+
+
+__all__ = [  # names exported by a star-import of this module
+    "GemmaConfig",
+    "GemmaTokenizer",
+    "GemmaModel",
+    "GemmaForCausalLM",
+    "GemmaForSequenceClassification",
+    "GemmaForTokenClassification",
+    "GemmaPreTrainedModel",  # noqa: F822
+]
diff --git a/phivenv/Lib/site-packages/transformers/models/gemma/tokenization_gemma.py b/phivenv/Lib/site-packages/transformers/models/gemma/tokenization_gemma.py
new file mode 100644
index 0000000000000000000000000000000000000000..3320968c2915dee8e4584470848d5480fb7f40c6
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/gemma/tokenization_gemma.py
@@ -0,0 +1,335 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/gemma/modular_gemma.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_gemma.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# coding=utf-8
+# Copyright 2024 Google Inc. HuggingFace Inc. team. All rights reserved.
+#
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+from shutil import copyfile
+from typing import TYPE_CHECKING, Any, Optional
+
+import sentencepiece as spm
+
+from ...tokenization_utils import AddedToken, PreTrainedTokenizer
+from ...utils import logging
+from ...utils.import_utils import requires
+
+
+if TYPE_CHECKING:
+    from ...tokenization_utils_base import TextInput
+
+logger = logging.get_logger(__name__)
+
+VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model"}  # file name written by `GemmaTokenizer.save_vocabulary`
+
+SPIECE_UNDERLINE = "▁"  # marker that `GemmaTokenizer._decode` replaces with a plain space
+
+
+@requires(backends=("sentencepiece",))
+class GemmaTokenizer(PreTrainedTokenizer):
+    """
+    Construct a Gemma tokenizer. Based on byte-level Byte-Pair-Encoding. The padding token defaults to `"<pad>"`
+    (see `pad_token` below); the SentencePiece model is loaded from `vocab_file` at construction time.
+
+    Args:
+        vocab_file (`str`):
+            Path to the vocabulary file.
+        unk_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<unk>"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        bos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<bos>"`):
+            The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token.
+        eos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<eos>"`):
+            The end of sequence token.
+        pad_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<pad>"`):
+            A special token used to make arrays of tokens the same size for batching purpose. Will then be ignored by
+            attention mechanisms or loss computation.
+        sp_model_kwargs (`dict[str, Any]`, *optional*):
+            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
+            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
+            to set:
+
+            - `enable_sampling`: Enable subword regularization.
+            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
+
+              - `nbest_size = {0,1}`: No sampling is performed.
+              - `nbest_size > 1`: samples from the nbest_size results.
+              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
+                using forward-filtering-and-backward-sampling algorithm.
+
+            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
+              BPE-dropout.
+
+        add_bos_token (`bool`, *optional*, defaults to `True`):
+            Whether or not to add a `bos_token` at the start of sequences.
+        add_eos_token (`bool`, *optional*, defaults to `False`):
+            Whether or not to add an `eos_token` at the end of sequences.
+        clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
+            Whether or not to cleanup spaces after decoding, cleanup consists in removing potential artifacts like
+            extra spaces.
+        use_default_system_prompt (`bool`, *optional*, defaults to `False`):
+            Whether or not the default system prompt for Gemma should be used.
+        spaces_between_special_tokens (`bool`, *optional*, defaults to `False`):
+            Whether or not to add spaces between special tokens.
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    model_input_names = ["input_ids", "attention_mask"]
+
+    def __init__(
+        self,
+        vocab_file,
+        unk_token="<unk>",
+        bos_token="<bos>",
+        eos_token="<eos>",
+        pad_token="<pad>",
+        sp_model_kwargs: Optional[dict[str, Any]] = None,
+        add_bos_token=True,
+        add_eos_token=False,
+        clean_up_tokenization_spaces=False,
+        use_default_system_prompt=False,
+        spaces_between_special_tokens=False,
+        **kwargs,
+    ):
+        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
+        bos_token = AddedToken(bos_token, normalized=False, special=True) if isinstance(bos_token, str) else bos_token
+        eos_token = AddedToken(eos_token, normalized=False, special=True) if isinstance(eos_token, str) else eos_token
+        unk_token = AddedToken(unk_token, normalized=False, special=True) if isinstance(unk_token, str) else unk_token
+        pad_token = AddedToken(pad_token, normalized=False, special=True) if isinstance(pad_token, str) else pad_token
+
+        self.vocab_file = vocab_file
+        self.add_bos_token = add_bos_token
+        self.add_eos_token = add_eos_token
+        self.use_default_system_prompt = use_default_system_prompt
+        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+        self.sp_model.Load(vocab_file)  # the model is loaded before the base-class initializer runs
+
+        super().__init__(
+            bos_token=bos_token,
+            eos_token=eos_token,
+            unk_token=unk_token,
+            pad_token=pad_token,
+            add_bos_token=add_bos_token,
+            add_eos_token=add_eos_token,
+            sp_model_kwargs=sp_model_kwargs,
+            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+            use_default_system_prompt=use_default_system_prompt,
+            spaces_between_special_tokens=spaces_between_special_tokens,
+            **kwargs,
+        )
+
+    def __getstate__(self):
+        state = self.__dict__.copy()
+        state["sp_model"] = None  # the processor itself is dropped from the state ...
+        state["sp_model_proto"] = self.sp_model.serialized_model_proto()  # ... and replaced by its serialized proto
+        return state
+
+    def __setstate__(self, d):
+        self.__dict__.update(d)
+        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+        self.sp_model.LoadFromSerializedProto(self.sp_model_proto)  # rebuild from the proto saved by __getstate__
+
+    @property
+    def vocab_size(self):
+        """Returns vocab size"""
+        return self.sp_model.get_piece_size()
+
+    def get_vocab(self):
+        """Returns vocab as a dict"""
+        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
+        vocab.update(self.added_tokens_encoder)  # added tokens are layered on top of the SentencePiece pieces
+        return vocab
+
+    def tokenize(self, text: "TextInput", **kwargs) -> list[str]:
+        """
+        Args:
+            text: TextInput
+        Simply calls PreTrainedTokenizer's method
+        """
+        return super().tokenize(text, **kwargs)
+
+    def _tokenize(self, text, **kwargs):
+        """
+        Args:
+            text: TextInput
+        Returns a tokenized string. The Gemma tokenizer never adds a prefix space.
+        """
+        return self.sp_model.encode(text, out_type=str)  # `kwargs` is accepted but not used
+
+    def _convert_token_to_id(self, token):
+        """Converts a token (str) in an id using the vocab."""
+        return self.sp_model.piece_to_id(token)
+
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) in a token (str) using the vocab."""
+        token = self.sp_model.IdToPiece(index)
+        return token
+
+    def convert_tokens_to_string(self, tokens):
+        """Converts a sequence of tokens (string) in a single string."""
+        current_sub_tokens = []  # pending run of tokens that are not added tokens
+        out_string = ""
+        for token in tokens:
+            # make sure that special tokens are not decoded using sentencepiece model
+            if token in self._added_tokens_encoder:
+                out_string += self.sp_model.decode(current_sub_tokens) + token
+                current_sub_tokens = []
+            else:
+                current_sub_tokens.append(token)
+        out_string += self.sp_model.decode(current_sub_tokens)  # decode the trailing run (possibly empty)
+        return out_string
+
+    def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> tuple[str]:
+        """
+        Save the SentencePiece vocabulary file to a directory.
+
+        Args:
+            save_directory (`str`): The existing directory in which to save the vocabulary.
+            filename_prefix (`str`, *optional*): Prefix put, followed by `-`, in front of the saved file name.
+
+        Returns:
+            `tuple(str)`: Path of the saved file; `None` (after logging an error) if `save_directory` is no directory.
+        """
+        if not os.path.isdir(save_directory):
+            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+            return  # NOTE: implicit `None`, despite the `tuple[str]` return annotation
+        out_vocab_file = os.path.join(
+            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+        )
+
+        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
+            copyfile(self.vocab_file, out_vocab_file)  # source file still exists and is a different path: copy it
+        elif not os.path.isfile(self.vocab_file):  # source file is gone: dump the in-memory model instead
+            with open(out_vocab_file, "wb") as fi:
+                content_spiece_model = self.sp_model.serialized_model_proto()
+                fi.write(content_spiece_model)
+
+        return (out_vocab_file,)
+
+    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
+        bos_token_id = [self.bos_token_id] if self.add_bos_token else []
+        eos_token_id = [self.eos_token_id] if self.add_eos_token else []
+
+        output = bos_token_id + token_ids_0 + eos_token_id
+
+        if token_ids_1 is not None:
+            output = output + bos_token_id + token_ids_1 + eos_token_id  # the second sequence gets its own BOS/EOS
+
+        return output
+
+    def get_special_tokens_mask(
+        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False
+    ) -> list[int]:
+        """
+        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
+        special tokens using the tokenizer `prepare_for_model` method.
+
+        Args:
+            token_ids_0 (`list[int]`):
+                List of IDs.
+            token_ids_1 (`list[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
+                Whether or not the token list is already formatted with special tokens for the model.
+
+        Returns:
+            `list[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+        """
+        if already_has_special_tokens:
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+            )
+
+        bos_token_id = [1] if self.add_bos_token else []  # mask entry (1 = special token), not a token id
+        eos_token_id = [1] if self.add_eos_token else []  # mask entry (1 = special token), not a token id
+
+        if token_ids_1 is None:
+            return bos_token_id + ([0] * len(token_ids_0)) + eos_token_id
+        return (
+            bos_token_id
+            + ([0] * len(token_ids_0))
+            + eos_token_id
+            + bos_token_id
+            + ([0] * len(token_ids_1))
+            + eos_token_id
+        )
+
+    def create_token_type_ids_from_sequences(
+        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
+    ) -> list[int]:
+        """
+        Creates a mask from the two sequences passed to be used in a sequence-pair classification task. The mask for a
+        sequence pair (each sequence counted together with its BOS/EOS tokens, when enabled) has the following format:
+
+        ```
+        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
+        | first sequence    | second sequence |
+        ```
+
+        If `token_ids_1` is `None`, only the first portion of the mask (0s) is returned.
+
+        Args:
+            token_ids_0 (`list[int]`):
+                List of ids.
+            token_ids_1 (`list[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            `list[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
+        """
+        bos_token_id = [self.bos_token_id] if self.add_bos_token else []
+        eos_token_id = [self.eos_token_id] if self.add_eos_token else []
+
+        output = [0] * len(bos_token_id + token_ids_0 + eos_token_id)  # only the combined length is used
+
+        if token_ids_1 is not None:
+            output += [1] * len(bos_token_id + token_ids_1 + eos_token_id)
+
+        return output
+
+    def _decode(
+        self,
+        token_ids: list[int],
+        skip_special_tokens: bool = False,
+        spaces_between_special_tokens: bool = False,
+        **kwargs,
+    ) -> str:
+        sub_texts = []  # decoded pieces, in output order
+        current_sub_text = []  # pending run of ids that are not added tokens
+        for ids in token_ids:
+            if skip_special_tokens and ids in self.all_special_ids:
+                continue
+            if ids in self._added_tokens_decoder:  # added tokens are emitted verbatim, not via SentencePiece
+                if current_sub_text:
+                    sub_texts.append(self.sp_model.decode(current_sub_text))  # flush the pending run first
+                sub_texts.append(self._added_tokens_decoder[ids].content)
+                current_sub_text = []
+            else:
+                current_sub_text.append(ids)
+        if current_sub_text:
+            sub_texts.append(self.sp_model.decode(current_sub_text))  # flush whatever is left at the end
+
+        if spaces_between_special_tokens:
+            sub_texts = " ".join(sub_texts)  # the space goes between every piece, not only around added tokens
+        else:
+            sub_texts = "".join(sub_texts)
+
+        return sub_texts.replace(SPIECE_UNDERLINE, " ")  # turn remaining "▁" markers into plain spaces
+
+
+__all__ = ["GemmaTokenizer"]  # the only name exported by a star-import of this module
diff --git a/phivenv/Lib/site-packages/transformers/models/gemma/tokenization_gemma_fast.py b/phivenv/Lib/site-packages/transformers/models/gemma/tokenization_gemma_fast.py
new file mode 100644
index 0000000000000000000000000000000000000000..9fc6e3d3593b9ff5403973af5a08d86820a0c2d3
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/gemma/tokenization_gemma_fast.py
@@ -0,0 +1,195 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+from shutil import copyfile
+from typing import Optional
+
+from tokenizers import processors
+
+from ...tokenization_utils_fast import PreTrainedTokenizerFast
+from ...utils import is_sentencepiece_available, logging
+
+
+if is_sentencepiece_available():  # the slow tokenizer is imported only when sentencepiece is available
+    from .tokenization_gemma import GemmaTokenizer
+else:
+    GemmaTokenizer = None  # keeps the name defined for `slow_tokenizer_class` in the class below
+
+logger = logging.get_logger(__name__)
+VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model", "tokenizer_file": "tokenizer.json"}
+
+
+class GemmaTokenizerFast(PreTrainedTokenizerFast):
+    """
+    Construct a Gemma tokenizer fast. Based on byte-level Byte-Pair-Encoding.
+
+    This uses notably ByteFallback and no prefix space. Normalization is applied to replace `" "` with `"▁"`.
+
+    ```python
+    >>> from transformers import GemmaTokenizerFast
+
+    >>> tokenizer = GemmaTokenizerFast.from_pretrained("hf-internal-testing/dummy-gemma")
+    >>> tokenizer.encode("Hello this is a test")
+    [2, 4521, 736, 603, 476, 2121]
+    ```
+
+    If you want to change the `bos_token` or the `eos_token`, make sure to specify them when initializing the model, or
+    call `tokenizer.update_post_processor()` to make sure that the post-processing is correctly done (otherwise the
+    values of the first token and final token of an encoded sequence will not be correct). For more details, check out
+    the [post-processors](https://huggingface.co/docs/tokenizers/api/post-processors) documentation.
+
+
+    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
+    refer to this superclass for more information regarding those methods.
+
+    Args:
+        vocab_file (`str`, *optional*):
+            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a .model extension) that
+            contains the vocabulary necessary to instantiate a tokenizer.
+        tokenizer_file (`str`, *optional*):
+            [tokenizers](https://github.com/huggingface/tokenizers) file (generally has a .json extension) that
+            contains everything needed to load the tokenizer.
+        clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
+            Whether or not to cleanup spaces after decoding, cleanup consists in removing potential artifacts like
+            extra spaces.
+        unk_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<unk>"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        bos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<bos>"`):
+            The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token.
+        eos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<eos>"`):
+            The end of sequence token.
+        pad_token (`str`, *optional*, defaults to `"<pad>"`):
+            The padding token.
+        add_bos_token (`bool`, *optional*, defaults to `True`):
+            Whether or not to add a `bos_token` at the start of sequences.
+        add_eos_token (`bool`, *optional*, defaults to `False`):
+            Whether or not to add an `eos_token` at the end of sequences.
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    slow_tokenizer_class = GemmaTokenizer
+    padding_side = "left"
+    model_input_names = ["input_ids", "attention_mask"]
+
+    def __init__(
+        self,
+        vocab_file=None,
+        tokenizer_file=None,
+        clean_up_tokenization_spaces=False,
+        unk_token="<unk>",
+        bos_token="<bos>",
+        eos_token="<eos>",
+        pad_token="<pad>",
+        add_bos_token=True,
+        add_eos_token=False,
+        **kwargs,
+    ):
+        super().__init__(
+            vocab_file=vocab_file,
+            tokenizer_file=tokenizer_file,
+            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+            unk_token=unk_token,
+            bos_token=bos_token,
+            eos_token=eos_token,
+            pad_token=pad_token,
+            add_bos_token=add_bos_token,
+            add_eos_token=add_eos_token,
+            **kwargs,
+        )
+        self._add_bos_token = add_bos_token  # backing fields of the add_bos_token / add_eos_token properties
+        self._add_eos_token = add_eos_token
+        self.update_post_processor()  # build the template once both flags are set
+        self.vocab_file = vocab_file  # read back by save_vocabulary
+
+    # Copied from transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast.update_post_processor
+    def update_post_processor(self):
+        """
+        Updates the underlying post processor with the current `bos_token` and `eos_token`.
+        """
+        bos = self.bos_token
+        bos_token_id = self.bos_token_id
+        if bos is None and self.add_bos_token:
+            raise ValueError("add_bos_token = True but bos_token = None")
+
+        eos = self.eos_token
+        eos_token_id = self.eos_token_id
+        if eos is None and self.add_eos_token:
+            raise ValueError("add_eos_token = True but eos_token = None")
+
+        single = f"{(bos + ':0 ') if self.add_bos_token else ''}$A:0{(' ' + eos + ':0') if self.add_eos_token else ''}"
+        pair = f"{single}{(' ' + bos + ':1') if self.add_bos_token else ''} $B:1{(' ' + eos + ':1') if self.add_eos_token else ''}"
+
+        special_tokens = []  # (token, id) pairs for the special tokens named in the two templates
+        if self.add_bos_token:
+            special_tokens.append((bos, bos_token_id))
+        if self.add_eos_token:
+            special_tokens.append((eos, eos_token_id))
+        self._tokenizer.post_processor = processors.TemplateProcessing(
+            single=single, pair=pair, special_tokens=special_tokens
+        )
+
+    @property
+    def add_eos_token(self):
+        return self._add_eos_token
+
+    @property
+    def add_bos_token(self):
+        return self._add_bos_token
+
+    @add_eos_token.setter
+    def add_eos_token(self, value):
+        self._add_eos_token = value
+        self.update_post_processor()  # keep the post processor in sync with the new flag
+
+    @add_bos_token.setter
+    def add_bos_token(self, value):
+        self._add_bos_token = value
+        self.update_post_processor()  # keep the post processor in sync with the new flag
+
+    # Copied from transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast.save_vocabulary
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
+        if not self.can_save_slow_tokenizer:
+            raise ValueError(
+                "Your fast tokenizer does not have the necessary information to save the vocabulary for a slow "
+                "tokenizer."
+            )
+
+        if not os.path.isdir(save_directory):
+            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+            return  # NOTE(review): returns None here although the annotation promises tuple[str]
+        out_vocab_file = os.path.join(
+            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+        )
+
+        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):  # never copy a file onto itself
+            copyfile(self.vocab_file, out_vocab_file)
+
+        return (out_vocab_file,)
+
+    # Copied from transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast.build_inputs_with_special_tokens
+    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
+        bos_token_id = [self.bos_token_id] if self.add_bos_token else []
+        eos_token_id = [self.eos_token_id] if self.add_eos_token else []
+
+        output = bos_token_id + token_ids_0 + eos_token_id
+
+        if token_ids_1 is not None:  # the second sequence gets its own bos/eos wrapping
+            output = output + bos_token_id + token_ids_1 + eos_token_id
+
+        return output
+
+
+__all__ = ["GemmaTokenizerFast"]  # the only name exported by a star import of this module
diff --git a/phivenv/Lib/site-packages/transformers/models/gemma2/__init__.py b/phivenv/Lib/site-packages/transformers/models/gemma2/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..18905bac42cc6b19f21e069355504e46d070d814
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/gemma2/__init__.py
@@ -0,0 +1,27 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:  # these star imports are only evaluated by static type checkers
+    from .configuration_gemma2 import *
+    from .modeling_gemma2 import *
+else:  # at runtime, swap this module in sys.modules for a _LazyModule built from this file's import structure
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/phivenv/Lib/site-packages/transformers/models/gemma2/__pycache__/__init__.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/gemma2/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9206e672bd1baae6e698fdb4805ad046367c7f67
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/gemma2/__pycache__/__init__.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/gemma2/__pycache__/configuration_gemma2.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/gemma2/__pycache__/configuration_gemma2.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..92a5fbbbeb61c1622989a4bb00ec99b9f69e57df
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/gemma2/__pycache__/configuration_gemma2.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/gemma2/__pycache__/modeling_gemma2.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/gemma2/__pycache__/modeling_gemma2.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..350604035ca9fe1071232c41cef20454283e4576
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/gemma2/__pycache__/modeling_gemma2.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/gemma2/__pycache__/modular_gemma2.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/gemma2/__pycache__/modular_gemma2.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8ddf036ff7b1ca001811c18f7ecd75f2061d3e59
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/gemma2/__pycache__/modular_gemma2.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/gemma2/configuration_gemma2.py b/phivenv/Lib/site-packages/transformers/models/gemma2/configuration_gemma2.py
new file mode 100644
index 0000000000000000000000000000000000000000..95ca6df873c228cdb2ebdb154285aaa2ac49e8f8
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/gemma2/configuration_gemma2.py
@@ -0,0 +1,182 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/gemma2/modular_gemma2.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_gemma2.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# coding=utf-8
+# Copyright 2024 Google Inc. HuggingFace Inc. team. All rights reserved.
+#
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from ...configuration_utils import PretrainedConfig, layer_type_validation
+
+
+class Gemma2Config(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Gemma2Model`]. It is used to instantiate a Gemma2
+    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+    defaults will yield a similar configuration to that of the Gemma2-7B.
+    e.g. [google/gemma2-7b](https://huggingface.co/google/gemma2-7b)
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+    Args:
+        vocab_size (`int`, *optional*, defaults to 256000):
+            Vocabulary size of the Gemma2 model. Defines the number of different tokens that can be represented by the
+            `input_ids` passed when calling [`Gemma2Model`]
+        hidden_size (`int`, *optional*, defaults to 2304):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 9216):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 26):
+            Number of hidden layers in the Transformer decoder.
+        num_attention_heads (`int`, *optional*, defaults to 8):
+            Number of attention heads for each attention layer in the Transformer decoder.
+        num_key_value_heads (`int`, *optional*, defaults to 4):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details, check out [this
+            paper](https://huggingface.co/papers/2305.13245). The value is stored as given; this class applies no
+            fallback to `num_attention_heads`.
+        head_dim (`int`, *optional*, defaults to 256):
+            The attention head dimension.
+        hidden_activation (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
+            The non-linear activation function (function or string) in the decoder. Will default to `"gelu_pytorch_tanh"`
+            if not specified. `"gelu_pytorch_tanh"` uses an approximation of the `"gelu"` activation function.
+        max_position_embeddings (`int`, *optional*, defaults to 8192):
+            The maximum sequence length that this model might ever be used with.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        pad_token_id (`int`, *optional*, defaults to 0):
+            Padding token id.
+        eos_token_id (`int`, *optional*, defaults to 1):
+            End of stream token id.
+        bos_token_id (`int`, *optional*, defaults to 2):
+            Beginning of stream token id.
+        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
+            Whether to tie weight embeddings.
+        rope_theta (`float`, *optional*, defaults to 10000.0):
+            The base period of the RoPE embeddings.
+        attention_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use a bias in the query, key, value and output projection layers during self-attention.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        query_pre_attn_scalar (`float`, *optional*, defaults to 256):
+            Scaling factor used on the attention scores.
+        sliding_window (`int`, *optional*, defaults to 4096):
+            In Gemma2, every other layer uses sliding window attention. This is the size of the sliding window.
+        layer_types (`list`, *optional*):
+            Attention pattern for each layer. When omitted, layers alternate, starting with `"sliding_attention"`.
+        final_logit_softcapping (`float`, *optional*, defaults to 30.0):
+            Scaling factor when applying tanh softcapping on the logits.
+        attn_logit_softcapping (`float`, *optional*, defaults to 50.0):
+            Scaling factor when applying tanh softcapping on the attention scores.
+
+    ```python
+    >>> from transformers import Gemma2Model, Gemma2Config
+    >>> # Initializing a Gemma2 gemma2-7b style configuration
+    >>> configuration = Gemma2Config()
+    >>> # Initializing a model from the gemma2-7b style configuration
+    >>> model = Gemma2Model(configuration)
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "gemma2"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    base_model_tp_plan = {
+        "layers.*.self_attn.q_proj": "colwise",
+        "layers.*.self_attn.k_proj": "colwise",
+        "layers.*.self_attn.v_proj": "colwise",
+        "layers.*.self_attn.o_proj": "rowwise",
+        "layers.*.mlp.gate_proj": "colwise",
+        "layers.*.mlp.up_proj": "colwise",
+        "layers.*.mlp.down_proj": "rowwise",
+    }
+    base_model_pp_plan = {
+        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
+        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
+        "norm": (["hidden_states"], ["hidden_states"]),
+    }
+
+    def __init__(
+        self,
+        vocab_size=256000,
+        hidden_size=2304,
+        intermediate_size=9216,
+        num_hidden_layers=26,
+        num_attention_heads=8,
+        num_key_value_heads=4,
+        head_dim=256,
+        hidden_activation="gelu_pytorch_tanh",
+        max_position_embeddings=8192,
+        initializer_range=0.02,
+        rms_norm_eps=1e-6,
+        use_cache=True,
+        pad_token_id=0,
+        eos_token_id=1,
+        bos_token_id=2,
+        tie_word_embeddings=True,
+        rope_theta=10000.0,
+        attention_bias=False,
+        attention_dropout=0.0,
+        query_pre_attn_scalar=256,
+        sliding_window=4096,
+        layer_types=None,
+        final_logit_softcapping=30.0,
+        attn_logit_softcapping=50.0,
+        **kwargs,
+    ):
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.head_dim = head_dim
+        self.num_key_value_heads = num_key_value_heads
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+        self.hidden_activation = hidden_activation
+        self.query_pre_attn_scalar = query_pre_attn_scalar
+        self.sliding_window = sliding_window
+        self.final_logit_softcapping = final_logit_softcapping
+        self.attn_logit_softcapping = attn_logit_softcapping
+        self.layer_types = layer_types
+
+        if self.layer_types is None:  # default: even-indexed layers slide, odd-indexed layers use full attention
+            self.layer_types = [
+                "sliding_attention" if bool((i + 1) % 2) else "full_attention" for i in range(self.num_hidden_layers)
+            ]
+        layer_type_validation(self.layer_types)
+
+
+__all__ = ["Gemma2Config"]  # the only name exported by a star import of this module
diff --git a/phivenv/Lib/site-packages/transformers/models/gemma2/modeling_gemma2.py b/phivenv/Lib/site-packages/transformers/models/gemma2/modeling_gemma2.py
new file mode 100644
index 0000000000000000000000000000000000000000..d4427a08e23ebc1e60484bf2f6860743a7c366df
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/gemma2/modeling_gemma2.py
@@ -0,0 +1,596 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/gemma2/modular_gemma2.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_gemma2.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# coding=utf-8
+# Copyright 2024 Google Inc. HuggingFace Inc. team. All rights reserved.
+#
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Callable, Optional, Union
+
+import torch
+import torch.nn as nn
+
+from ...activations import ACT2FN
+from ...cache_utils import Cache, DynamicCache
+from ...generation import GenerationMixin
+from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask
+from ...modeling_flash_attention_utils import FlashAttentionKwargs
+from ...modeling_layers import (
+    GenericForSequenceClassification,
+    GenericForTokenClassification,
+    GradientCheckpointingLayer,
+)
+from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
+from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from ...processing_utils import Unpack
+from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging
+from ...utils.deprecation import deprecate_kwarg
+from ...utils.generic import check_model_inputs
+from .configuration_gemma2 import Gemma2Config
+
+
+logger = logging.get_logger(__name__)  # module-level logger, named after this module
+
+
+class Gemma2RMSNorm(nn.Module):
+    """RMS normalization computed in float32 and scaled by `1 + weight` (the weight starts at zero)."""
+
+    def __init__(self, dim: int, eps: float = 1e-6):
+        super().__init__()
+        self.eps = eps
+        self.weight = nn.Parameter(torch.zeros(dim))
+
+    def _norm(self, x):
+        mean_square = x.pow(2).mean(-1, keepdim=True)
+        return x * torch.rsqrt(mean_square + self.eps)
+
+    def forward(self, x):
+        gain = 1.0 + self.weight.float()  # multiply in float32 first, cast to x's dtype last (transformers PR 29402)
+        return (self._norm(x.float()) * gain).type_as(x)
+
+    def extra_repr(self):
+        return f"{tuple(self.weight.shape)}, eps={self.eps}"
+
+
+class Gemma2MLP(nn.Module):
+    """Gated feed-forward block: `down_proj(act_fn(gate_proj(x)) * up_proj(x))`, all projections without bias."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.hidden_size, self.intermediate_size = config.hidden_size, config.intermediate_size
+        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
+        self.act_fn = ACT2FN[config.hidden_activation]
+
+    def forward(self, x):
+        return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
+
+
+def rotate_half(x):
+    """Return `cat((-x2, x1))`, where `x1` and `x2` are the first and second half of the last dimension of `x`."""
+    half = x.shape[-1] // 2
+    first, second = x[..., :half], x[..., half:]
+    return torch.cat((-second, first), dim=-1)
+
+
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
+    """Rotate the query and key tensors with the given rotary position embedding.
+
+    Each tensor `t` becomes `t * cos + rotate_half(t) * sin`, with `cos` and `sin` first given one extra axis at
+    `unsqueeze_dim` so that they broadcast against `q` and `k`.
+
+    Args:
+        q (`torch.Tensor`): The query tensor.
+        k (`torch.Tensor`): The key tensor.
+        cos (`torch.Tensor`): The cosine part of the rotary embedding.
+        sin (`torch.Tensor`): The sine part of the rotary embedding.
+        position_ids (`torch.Tensor`, *optional*):
+            Deprecated and unused.
+        unsqueeze_dim (`int`, *optional*, defaults to 1):
+            Axis inserted into `cos` and `sin`. With `cos`/`sin` of shape [batch_size, seq_len, head_dim], use 1
+            when `q` and `k` are [batch_size, heads, seq_len, head_dim] and 2 when they are
+            [batch_size, seq_len, heads, head_dim].
+
+    Returns:
+        `tuple(torch.Tensor)`: the rotated query and key tensors, in that order.
+    """
+    cos = cos.unsqueeze(unsqueeze_dim)
+    sin = sin.unsqueeze(unsqueeze_dim)
+    rotated_q, rotated_k = rotate_half(q), rotate_half(k)
+    return (q * cos) + (rotated_q * sin), (k * cos) + (rotated_k * sin)
+
+
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """
+    Repeat each key/value head `n_rep` times, like torch.repeat_interleave(x, dim=1, repeats=n_rep): (batch,
+    num_key_value_heads, seqlen, head_dim) becomes (batch, num_key_value_heads * n_rep, seqlen, head_dim).
+    """
+    batch, kv_heads, seq_len, dim = hidden_states.shape
+    if n_rep == 1:
+        return hidden_states
+    expanded = hidden_states.unsqueeze(2).expand(batch, kv_heads, n_rep, seq_len, dim)
+    return expanded.reshape(batch, kv_heads * n_rep, seq_len, dim)
+
+
+def eager_attention_forward(
+    module: nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: Optional[torch.Tensor],
+    dropout: float = 0.0,
+    scaling: Optional[float] = None,
+    softcap: Optional[float] = None,
+    **kwargs,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    """Attention via explicit matmuls: scale, optional tanh soft-cap, additive mask, float32 softmax, dropout.
+
+    Keys and values are first repeated `module.num_key_value_groups` times with `repeat_kv`; `scaling` falls back
+    to `module.head_dim**-0.5` when it is None, and extra `**kwargs` are ignored.
+
+    Returns `(attn_output, attn_weights)`, with `attn_output` transposed on axes 1 and 2 and made contiguous.
+    """
+    scale = module.head_dim**-0.5 if scaling is None else scaling
+    keys = repeat_kv(key, module.num_key_value_groups)
+    values = repeat_kv(value, module.num_key_value_groups)
+    scores = torch.matmul(query, keys.transpose(2, 3)) * scale
+    if softcap is not None:
+        scores = torch.tanh(scores / softcap) * softcap
+    if attention_mask is not None:
+        # Whatever the mask length, keep only the first `keys.shape[-2]` positions of its last axis.
+        scores = scores + attention_mask[:, :, :, : keys.shape[-2]]
+
+    # The softmax runs in float32 and is cast back to the query dtype.
+    probs = nn.functional.softmax(scores, dim=-1, dtype=torch.float32).to(query.dtype)
+    probs = nn.functional.dropout(probs, p=dropout, training=module.training)
+    attn_output = torch.matmul(probs, values).transpose(1, 2).contiguous()
+    return attn_output, probs
+
+
+class Gemma2Attention(nn.Module):
+    """Multi-headed attention ('Attention Is All You Need') with rotary embeddings and logit soft-capping."""
+
+    def __init__(self, config: Gemma2Config, layer_idx: int):
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx  # index handed to the cache in `forward`
+        self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
+        self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads  # query heads per kv head
+        self.scaling = config.query_pre_attn_scalar**-0.5  # passed as `scaling`, so head_dim**-0.5 is not used
+        self.attention_dropout = self.config.attention_dropout
+        self.is_causal = True
+
+        self.q_proj = nn.Linear(
+            config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.k_proj = nn.Linear(
+            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.v_proj = nn.Linear(
+            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.o_proj = nn.Linear(
+            config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
+        )
+        self.attn_logit_softcapping = self.config.attn_logit_softcapping
+        self.sliding_window = config.sliding_window if config.layer_types[layer_idx] == "sliding_attention" else None
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
+        attention_mask: Optional[torch.Tensor],
+        past_key_values: Optional[Cache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        input_shape = hidden_states.shape[:-1]  # presumably (batch, seq_len) - confirm against callers
+        hidden_shape = (*input_shape, -1, self.head_dim)  # -1 lets `view` infer the number of heads
+
+        query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+        key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+        value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+
+        cos, sin = position_embeddings
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+
+        if past_key_values is not None:
+            # sin and cos are specific to RoPE models; cache_position needed for the static cache
+            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        attention_interface: Callable = eager_attention_forward  # default; replaced below for other implementations
+        if self.config._attn_implementation != "eager":
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            dropout=self.attention_dropout if self.training else 0.0,  # dropout only while training
+            scaling=self.scaling,
+            sliding_window=self.sliding_window,
+            softcap=self.attn_logit_softcapping,
+            **kwargs,
+        )
+
+        attn_output = attn_output.reshape(*input_shape, -1).contiguous()  # merge heads back into one last axis
+        attn_output = self.o_proj(attn_output)
+        return attn_output, attn_weights
+
+
+class Gemma2DecoderLayer(GradientCheckpointingLayer):  # attention + MLP, each normed before and after
+    def __init__(self, config: Gemma2Config, layer_idx: int):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.config = config
+        self.attention_type = config.layer_types[layer_idx]  # this layer's entry in config.layer_types
+        self.self_attn = Gemma2Attention(config=config, layer_idx=layer_idx)
+        self.mlp = Gemma2MLP(config)
+        self.input_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        # The feed-forward sub-block gets its own pre/post norms, mirroring the attention sub-block.
+        self.pre_feedforward_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_feedforward_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        output_attentions: Optional[bool] = False,
+        use_cache: Optional[bool] = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
+    ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
+        residual = hidden_states  # kept for the skip connection around attention
+
+        hidden_states = self.input_layernorm(hidden_states)
+
+        # Self Attention
+        hidden_states, self_attn_weights = self.self_attn(
+            hidden_states=hidden_states,
+            position_embeddings=position_embeddings,
+            attention_mask=attention_mask,
+            position_ids=position_ids,  # not a named parameter of Gemma2Attention.forward; lands in its **kwargs
+            past_key_values=past_key_values,
+            output_attentions=output_attentions,  # likewise absorbed by **kwargs
+            use_cache=use_cache,  # likewise absorbed by **kwargs
+            cache_position=cache_position,
+            **kwargs,
+        )
+        hidden_states = self.post_attention_layernorm(hidden_states)  # norm is applied before the residual add
+        hidden_states = residual + hidden_states
+
+        residual = hidden_states  # kept for the skip connection around the MLP
+        hidden_states = self.pre_feedforward_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = self.post_feedforward_layernorm(hidden_states)
+        hidden_states = residual + hidden_states
+
+        outputs = (hidden_states,)  # always a tuple; element 0 is the layer output
+
+        if output_attentions:
+            outputs += (self_attn_weights,)  # attention weights appended only on request
+
+        return outputs
+
+
+class Gemma2RotaryEmbedding(nn.Module):  # produces the (cos, sin) pair consumed as position_embeddings
+    inv_freq: torch.Tensor  # fix linting for `register_buffer`
+
+    def __init__(self, config: Gemma2Config, device=None):
+        super().__init__()
+        # BC: "rope_type" was originally "type"
+        if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict):
+            self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
+        else:
+            self.rope_type = "default"  # no rope_scaling dict on the config
+        self.max_seq_len_cached = config.max_position_embeddings
+        self.original_max_seq_len = config.max_position_embeddings
+
+        self.config = config
+        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]  # init function selected by rope type
+
+        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
+        self.register_buffer("inv_freq", inv_freq, persistent=False)  # registered as a non-persistent buffer
+        self.original_inv_freq = self.inv_freq  # presumably read by dynamic_rope_update - TODO confirm
+
+    @torch.no_grad()
+    @dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
+    def forward(self, x, position_ids):
+        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
+        position_ids_expanded = position_ids[:, None, :].float()  # a middle axis is inserted for the matmul
+        # x is only used for its device and dtype; autocast has no "mps" path here, so "cpu" is used instead.
+        device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
+        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
+            emb = torch.cat((freqs, freqs), dim=-1)  # frequencies duplicated along the last dim
+            cos = emb.cos() * self.attention_scaling
+            sin = emb.sin() * self.attention_scaling
+
+        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)  # cast back to the dtype of x
+
+
+@auto_docstring
+class Gemma2PreTrainedModel(PreTrainedModel):  # shared base: class-level settings only, no methods defined here
+    config: Gemma2Config
+    base_model_prefix = "model"  # matches the `self.model` attribute set in Gemma2ForCausalLM
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["Gemma2DecoderLayer"]
+    _skip_keys_device_placement = ["past_key_values"]
+    _supports_flash_attn = True
+    _supports_sdpa = True
+    _supports_flex_attn = True
+
+    _can_compile_fullgraph = True
+    _supports_attention_backend = True
+    _can_record_outputs = {  # output name -> module class; presumably read by check_model_inputs - TODO confirm
+        "hidden_states": Gemma2DecoderLayer,
+        "attentions": Gemma2Attention,
+    }
+
+
+@auto_docstring
+class Gemma2Model(Gemma2PreTrainedModel):  # embeddings -> decoder layers -> final norm, without an LM head
+    def __init__(self, config: Gemma2Config):
+        super().__init__(config)
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+        self.layers = nn.ModuleList(
+            [Gemma2DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+        )
+        self.norm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.rotary_emb = Gemma2RotaryEmbedding(config=config)  # one instance shared by all layers
+        self.gradient_checkpointing = False
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @check_model_inputs
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> BaseModelOutputWithPast:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+
+        if (input_ids is None) ^ (inputs_embeds is not None):  # true when both or neither are supplied
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+        if self.gradient_checkpointing and self.training and use_cache:
+            logger.warning_once(
+                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
+            )
+            use_cache = False
+
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+
+        if use_cache and past_key_values is None and not self.training:  # a cache is created only outside training
+            past_key_values = DynamicCache(config=self.config)
+
+        if cache_position is None:
+            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+            cache_position = torch.arange(  # positions continue from the number of tokens already cached
+                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
+            )
+
+        if position_ids is None:
+            position_ids = cache_position.unsqueeze(0)  # a leading axis of size 1 is added
+
+        # It may already have been prepared by e.g. `generate`
+        if not isinstance(causal_mask_mapping := attention_mask, dict):  # a dict is used as the mapping as-is
+            # Prepare mask arguments
+            mask_kwargs = {
+                "config": self.config,
+                "input_embeds": inputs_embeds,
+                "attention_mask": attention_mask,
+                "cache_position": cache_position,
+                "past_key_values": past_key_values,
+                "position_ids": position_ids,
+            }
+            # Create the masks
+            causal_mask_mapping = {  # keys match the values stored in each layer's `attention_type`
+                "full_attention": create_causal_mask(**mask_kwargs),
+                "sliding_attention": create_sliding_window_causal_mask(**mask_kwargs),
+            }
+
+        # start from the token embeddings (no position information is added on this line)
+        hidden_states = inputs_embeds
+
+        # create position embeddings to be shared across the decoder layers
+        position_embeddings = self.rotary_emb(hidden_states, position_ids)
+
+        # Scale the embeddings by sqrt(hidden_size); the scalar is created in hidden_states.dtype.
+        # Gemma2 downcasts the below to float16, causing sqrt(3072)=55.4256 to become 55.5
+        # See https://github.com/huggingface/transformers/pull/29402
+        normalizer = torch.tensor(self.config.hidden_size**0.5, dtype=hidden_states.dtype)
+        hidden_states = hidden_states * normalizer
+
+        # decoder layers
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+
+        for decoder_layer in self.layers[: self.config.num_hidden_layers]:
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)  # recorded before the layer runs
+
+            layer_outputs = decoder_layer(
+                hidden_states,
+                position_embeddings=position_embeddings,
+                attention_mask=causal_mask_mapping[decoder_layer.attention_type],  # mask chosen per layer type
+                position_ids=position_ids,
+                past_key_values=past_key_values,
+                output_attentions=output_attentions,
+                use_cache=use_cache,
+                cache_position=cache_position,
+                **kwargs,
+            )
+
+            hidden_states = layer_outputs[0]  # the layer returns a tuple; element 0 is the hidden states
+
+            if output_attentions:
+                all_self_attns += (layer_outputs[1],)
+
+        hidden_states = self.norm(hidden_states)  # final norm after the last layer
+
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)  # the normed final state is appended as the last entry
+
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+        )
+
+
+@auto_docstring
+class Gemma2ForCausalLM(Gemma2PreTrainedModel, GenerationMixin):  # Gemma2Model plus a bias-free linear LM head
+    _tied_weights_keys = ["lm_head.weight"]
+    _tp_plan = {"lm_head": "colwise_rep"}
+    _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.model = Gemma2Model(config)
+        self.vocab_size = config.vocab_size
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
+        **kwargs,
+    ) -> CausalLMOutputWithPast:
+        r"""
+        Example:
+
+        ```python
+        >>> from transformers import AutoTokenizer, Gemma2ForCausalLM
+
+        >>> model = Gemma2ForCausalLM.from_pretrained("google/gemma-2-9b")
+        >>> tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b")
+
+        >>> prompt = "What is your favorite condiment?"
+        >>> inputs = tokenizer(prompt, return_tensors="pt")
+
+        >>> # Generate
+        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
+        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "What is your favorite condiment?"
+        ```"""
+
+        if self.training and self.config._attn_implementation != "eager":  # advisory only; training still proceeds
+            logger.warning_once(
+                "It is strongly recommended to train Gemma2 models with the `eager` attention implementation "
+                f"instead of `{self.config._attn_implementation}`. Use `eager` with `AutoModelForCausalLM.from_pretrained('<path-to-checkpoint>', attn_implementation='eager')`."
+            )
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        # run the base model; the result is read below through its named fields
+        outputs: BaseModelOutputWithPast = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            cache_position=cache_position,
+            **kwargs,
+        )
+
+        hidden_states = outputs.last_hidden_state
+        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
+        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+        logits = self.lm_head(hidden_states[:, slice_indices, :])  # int 0 gives slice(0, None): every position
+        if self.config.final_logit_softcapping is not None:  # soft cap: cap * tanh(logits / cap)
+            logits = logits / self.config.final_logit_softcapping
+            logits = torch.tanh(logits)
+            logits = logits * self.config.final_logit_softcapping
+
+        loss = None
+        if labels is not None:  # loss is computed only when labels are supplied
+            loss = self.loss_function(logits, labels, self.vocab_size, **kwargs)
+
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+class Gemma2ForSequenceClassification(GenericForSequenceClassification, Gemma2PreTrainedModel):
+    pass  # nothing overridden; all behaviour comes from the two base classes
+
+
+class Gemma2ForTokenClassification(GenericForTokenClassification, Gemma2PreTrainedModel):
+    pass  # nothing overridden; all behaviour comes from the two base classes
+
+
+__all__ = [  # names exported by `from ... import *`
+    "Gemma2ForCausalLM",
+    "Gemma2Model",
+    "Gemma2PreTrainedModel",
+    "Gemma2ForSequenceClassification",
+    "Gemma2ForTokenClassification",
+]
diff --git a/phivenv/Lib/site-packages/transformers/models/gemma2/modular_gemma2.py b/phivenv/Lib/site-packages/transformers/models/gemma2/modular_gemma2.py
new file mode 100644
index 0000000000000000000000000000000000000000..7f101ff1ec0a3e95d2bf57a261bbeac2ab1bdbe7
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/gemma2/modular_gemma2.py
@@ -0,0 +1,583 @@
+# coding=utf-8
+# Copyright 2024 Google Inc. HuggingFace Inc. team. All rights reserved.
+#
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Callable, Optional, Union
+
+import torch
+import torch.nn as nn
+import torch.utils.checkpoint
+
+from ...activations import ACT2FN
+from ...cache_utils import Cache, DynamicCache
+from ...configuration_utils import PretrainedConfig, layer_type_validation
+from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask
+from ...modeling_flash_attention_utils import FlashAttentionKwargs
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
+from ...processing_utils import Unpack
+from ...utils import TransformersKwargs, logging
+from ...utils.deprecation import deprecate_kwarg
+from ..gemma.modeling_gemma import (
+    GemmaAttention,
+    GemmaForCausalLM,
+    GemmaForSequenceClassification,
+    GemmaForTokenClassification,
+    GemmaMLP,
+    GemmaModel,
+    GemmaRMSNorm,
+    apply_rotary_pos_emb,
+    repeat_kv,
+)
+
+
+logger = logging.get_logger(__name__)
+
+
+class Gemma2Config(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Gemma2Model`]. It is used to instantiate a Gemma2
+    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+    defaults will yield a similar configuration to that of the Gemma2-7B.
+    e.g. [google/gemma2-7b](https://huggingface.co/google/gemma2-7b)
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+    Args:
+        vocab_size (`int`, *optional*, defaults to 256000):
+            Vocabulary size of the Gemma2 model. Defines the number of different tokens that can be represented by the
+            `input_ids` passed when calling [`Gemma2Model`].
+        hidden_size (`int`, *optional*, defaults to 2304):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 9216):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 26):
+            Number of hidden layers in the Transformer decoder.
+        num_attention_heads (`int`, *optional*, defaults to 8):
+            Number of attention heads for each attention layer in the Transformer decoder.
+        num_key_value_heads (`int`, *optional*, defaults to 4):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details, check out [this
+            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
+            `num_attention_heads`.
+        head_dim (`int`, *optional*, defaults to 256):
+            The attention head dimension.
+        hidden_activation (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
+            The non-linear activation function (function or string) in the decoder. Will default to `"gelu_pytorch_tanh"`
+            if not specified. `"gelu_pytorch_tanh"` uses an approximation of the `"gelu"` activation function.
+        max_position_embeddings (`int`, *optional*, defaults to 8192):
+            The maximum sequence length that this model might ever be used with.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        pad_token_id (`int`, *optional*, defaults to 0):
+            Padding token id.
+        eos_token_id (`int`, *optional*, defaults to 1):
+            End of stream token id.
+        bos_token_id (`int`, *optional*, defaults to 2):
+            Beginning of stream token id.
+        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
+            Whether to tie weight embeddings.
+        rope_theta (`float`, *optional*, defaults to 10000.0):
+            The base period of the RoPE embeddings.
+        attention_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use a bias in the query, key, value and output projection layers during self-attention.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        query_pre_attn_scalar (`float`, *optional*, defaults to 256):
+            Scaling factor used on the attention scores.
+        sliding_window (`int`, *optional*, defaults to 4096):
+            In Gemma2, every other layer uses sliding window attention. This is the size of the sliding window.
+        layer_types (`list`, *optional*):
+            Attention pattern for each layer. If unset, layers alternate, starting with `"sliding_attention"`.
+        final_logit_softcapping (`float`, *optional*, defaults to 30.0):
+            Scaling factor when applying tanh softcapping on the logits.
+        attn_logit_softcapping (`float`, *optional*, defaults to 50.0):
+            Scaling factor when applying tanh softcapping on the attention scores.
+
+    ```python
+    >>> from transformers import Gemma2Model, Gemma2Config
+    >>> # Initializing a Gemma2 gemma2-7b style configuration
+    >>> configuration = Gemma2Config()
+    >>> # Initializing a model from the gemma2-7b style configuration
+    >>> model = Gemma2Model(configuration)
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "gemma2"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    base_model_tp_plan = {  # module-name pattern -> sharding style
+        "layers.*.self_attn.q_proj": "colwise",
+        "layers.*.self_attn.k_proj": "colwise",
+        "layers.*.self_attn.v_proj": "colwise",
+        "layers.*.self_attn.o_proj": "rowwise",
+        "layers.*.mlp.gate_proj": "colwise",
+        "layers.*.mlp.up_proj": "colwise",
+        "layers.*.mlp.down_proj": "rowwise",
+    }
+    base_model_pp_plan = {  # submodule name -> (input names, output names)
+        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
+        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
+        "norm": (["hidden_states"], ["hidden_states"]),
+    }
+
+    def __init__(
+        self,
+        vocab_size=256000,
+        hidden_size=2304,
+        intermediate_size=9216,
+        num_hidden_layers=26,
+        num_attention_heads=8,
+        num_key_value_heads=4,
+        head_dim=256,
+        hidden_activation="gelu_pytorch_tanh",
+        max_position_embeddings=8192,
+        initializer_range=0.02,
+        rms_norm_eps=1e-6,
+        use_cache=True,
+        pad_token_id=0,
+        eos_token_id=1,
+        bos_token_id=2,
+        tie_word_embeddings=True,
+        rope_theta=10000.0,
+        attention_bias=False,
+        attention_dropout=0.0,
+        query_pre_attn_scalar=256,
+        sliding_window=4096,
+        layer_types=None,
+        final_logit_softcapping=30.0,
+        attn_logit_softcapping=50.0,
+        **kwargs,
+    ):
+        super().__init__(  # token ids and weight tying are handled by the base config
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.head_dim = head_dim
+        self.num_key_value_heads = num_key_value_heads
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+        self.hidden_activation = hidden_activation
+        self.query_pre_attn_scalar = query_pre_attn_scalar
+        self.sliding_window = sliding_window
+        self.final_logit_softcapping = final_logit_softcapping
+        self.attn_logit_softcapping = attn_logit_softcapping
+        self.layer_types = layer_types
+
+        if self.layer_types is None:  # default: even indices (0, 2, ...) sliding, odd indices full attention
+            self.layer_types = [
+                "sliding_attention" if bool((i + 1) % 2) else "full_attention" for i in range(self.num_hidden_layers)
+            ]
+        layer_type_validation(self.layer_types)  # runs for user-supplied and default lists alike
+
+
+class Gemma2RMSNorm(GemmaRMSNorm):
+    pass  # nothing overridden; a Gemma2-named alias of GemmaRMSNorm
+
+
+class Gemma2MLP(GemmaMLP):
+    def __init__(self, config):
+        super().__init__(config)
+        self.act_fn = ACT2FN[config.hidden_activation]  # set after the parent init; looked up by config name
+
+
+def eager_attention_forward(  # matmul/softmax attention with optional tanh soft-capping of the scores
+    module: nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: Optional[torch.Tensor],
+    dropout: float = 0.0,
+    scaling: Optional[float] = None,
+    softcap: Optional[float] = None,
+    **kwargs,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    if scaling is None:
+        scaling = module.head_dim**-0.5  # fallback: 1 / sqrt(head_dim)
+
+    key_states = repeat_kv(key, module.num_key_value_groups)  # presumably expands kv heads - TODO confirm
+    value_states = repeat_kv(value, module.num_key_value_groups)
+
+    attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling  # scaled query-key scores
+
+    if softcap is not None:  # soft cap: softcap * tanh(scores / softcap), applied before the mask is added
+        attn_weights = attn_weights / softcap
+        attn_weights = torch.tanh(attn_weights)
+        attn_weights = attn_weights * softcap
+    if attention_mask is not None:  # no matter the length, we just slice it
+        causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]  # last dim trimmed to the key length
+        attn_weights = attn_weights + causal_mask  # the mask is additive
+
+    # upcast attention to fp32
+    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
+    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
+    attn_output = torch.matmul(attn_weights, value_states)
+    attn_output = attn_output.transpose(1, 2).contiguous()  # swap dims 1 and 2 back before returning
+    return attn_output, attn_weights
+
+
+class Gemma2Attention(GemmaAttention):  # GemmaAttention plus score soft-capping and an optional sliding window
+    def __init__(self, config: Gemma2Config, layer_idx: int):
+        super().__init__(config, layer_idx)
+        self.attn_logit_softcapping = self.config.attn_logit_softcapping
+        self.attention_dropout = self.config.attention_dropout
+        self.is_causal = True
+        self.scaling = config.query_pre_attn_scalar**-0.5  # replaces the eager fallback of head_dim**-0.5
+        self.sliding_window = config.sliding_window if config.layer_types[layer_idx] == "sliding_attention" else None
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
+        attention_mask: Optional[torch.Tensor],
+        past_key_values: Optional[Cache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
+        input_shape = hidden_states.shape[:-1]  # every dim except the trailing feature dim
+        hidden_shape = (*input_shape, -1, self.head_dim)  # -1 lets view() infer the number of heads
+
+        query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)  # split heads, swap dims 1/2
+        key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+        value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+
+        cos, sin = position_embeddings  # the tuple is unpacked as (cos, sin)
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)  # values are untouched
+
+        if past_key_values is not None:
+            # sin and cos are specific to RoPE models; cache_position needed for the static cache
+            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        attention_interface: Callable = eager_attention_forward  # default backend
+        if self.config._attn_implementation != "eager":
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]  # looked up by name
+
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            dropout=self.attention_dropout if self.training else 0.0,  # dropout only while training
+            scaling=self.scaling,
+            sliding_window=self.sliding_window,
+            softcap=self.attn_logit_softcapping,
+            **kwargs,
+        )
+
+        attn_output = attn_output.reshape(*input_shape, -1).contiguous()  # merge heads back into one feature dim
+        attn_output = self.o_proj(attn_output)
+        return attn_output, attn_weights  # NOTE(review): two values returned; the annotation lists three
+
+
+class Gemma2DecoderLayer(GradientCheckpointingLayer):  # attention + MLP, each normed before and after
+    def __init__(self, config: Gemma2Config, layer_idx: int):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.config = config
+        self.attention_type = config.layer_types[layer_idx]  # this layer's entry in config.layer_types
+        self.self_attn = Gemma2Attention(config=config, layer_idx=layer_idx)
+        self.mlp = Gemma2MLP(config)
+        self.input_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        # The feed-forward sub-block gets its own pre/post norms, mirroring the attention sub-block.
+        self.pre_feedforward_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_feedforward_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        output_attentions: Optional[bool] = False,
+        use_cache: Optional[bool] = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
+    ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
+        residual = hidden_states  # kept for the skip connection around attention
+
+        hidden_states = self.input_layernorm(hidden_states)
+
+        # Self Attention
+        hidden_states, self_attn_weights = self.self_attn(
+            hidden_states=hidden_states,
+            position_embeddings=position_embeddings,
+            attention_mask=attention_mask,
+            position_ids=position_ids,  # not a named parameter of Gemma2Attention.forward; lands in its **kwargs
+            past_key_values=past_key_values,
+            output_attentions=output_attentions,  # likewise absorbed by **kwargs
+            use_cache=use_cache,  # likewise absorbed by **kwargs
+            cache_position=cache_position,
+            **kwargs,
+        )
+        hidden_states = self.post_attention_layernorm(hidden_states)  # norm is applied before the residual add
+        hidden_states = residual + hidden_states
+
+        residual = hidden_states  # kept for the skip connection around the MLP
+        hidden_states = self.pre_feedforward_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = self.post_feedforward_layernorm(hidden_states)
+        hidden_states = residual + hidden_states
+
+        outputs = (hidden_states,)  # always a tuple; element 0 is the layer output
+
+        if output_attentions:
+            outputs += (self_attn_weights,)  # attention weights appended only on request
+
+        return outputs
+
+
+class Gemma2Model(GemmaModel):
+    def __init__(self, config: Gemma2Config):
+        super().__init__(config)
+        self.layers = nn.ModuleList(
+            [Gemma2DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+        )
+
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> BaseModelOutputWithPast:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+        if self.gradient_checkpointing and self.training and use_cache:
+            logger.warning_once(
+                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
+            )
+            use_cache = False
+
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+
+        if use_cache and past_key_values is None and not self.training:
+            past_key_values = DynamicCache(config=self.config)
+
+        if cache_position is None:
+            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+            cache_position = torch.arange(
+                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
+            )
+
+        if position_ids is None:
+            position_ids = cache_position.unsqueeze(0)
+
+        # It may already have been prepared by e.g. `generate`
+        if not isinstance(causal_mask_mapping := attention_mask, dict):
+            # Prepare mask arguments
+            mask_kwargs = {
+                "config": self.config,
+                "input_embeds": inputs_embeds,
+                "attention_mask": attention_mask,
+                "cache_position": cache_position,
+                "past_key_values": past_key_values,
+                "position_ids": position_ids,
+            }
+            # Create the masks
+            causal_mask_mapping = {
+                "full_attention": create_causal_mask(**mask_kwargs),
+                "sliding_attention": create_sliding_window_causal_mask(**mask_kwargs),
+            }
+
+        # embed positions
+        hidden_states = inputs_embeds
+
+        # create position embeddings to be shared across the decoder layers
+        position_embeddings = self.rotary_emb(hidden_states, position_ids)
+
+        # normalized
+        # Gemma2 downcasts the below to float16, causing sqrt(3072)=55.4256 to become 55.5
+        # See https://github.com/huggingface/transformers/pull/29402
+        normalizer = torch.tensor(self.config.hidden_size**0.5, dtype=hidden_states.dtype)
+        hidden_states = hidden_states * normalizer
+
+        # decoder layers
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+
+        for decoder_layer in self.layers[: self.config.num_hidden_layers]:
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+
+            layer_outputs = decoder_layer(
+                hidden_states,
+                position_embeddings=position_embeddings,
+                attention_mask=causal_mask_mapping[decoder_layer.attention_type],
+                position_ids=position_ids,
+                past_key_values=past_key_values,
+                output_attentions=output_attentions,
+                use_cache=use_cache,
+                cache_position=cache_position,
+                **kwargs,
+            )
+
+            hidden_states = layer_outputs[0]
+
+            if output_attentions:
+                all_self_attns += (layer_outputs[1],)
+
+        hidden_states = self.norm(hidden_states)
+
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+        )
+
+
+class Gemma2ForCausalLM(GemmaForCausalLM):
+    def __init__(self, config):
+        super().__init__(config)
+        self.model = Gemma2Model(config)
+        self.post_init()
+
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
+        **kwargs,
+    ) -> CausalLMOutputWithPast:
+        r"""
+        Example:
+
+        ```python
+        >>> from transformers import AutoTokenizer, Gemma2ForCausalLM
+
+        >>> model = Gemma2ForCausalLM.from_pretrained("google/gemma-2-9b")
+        >>> tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b")
+
+        >>> prompt = "What is your favorite condiment?"
+        >>> inputs = tokenizer(prompt, return_tensors="pt")
+
+        >>> # Generate
+        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
+        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "What is your favorite condiment?"
+        ```"""
+
+        if self.training and self.config._attn_implementation != "eager":
+            logger.warning_once(
+                "It is strongly recommended to train Gemma2 models with the `eager` attention implementation "
+                f"instead of `{self.config._attn_implementation}`. Use `eager` with `AutoModelForCausalLM.from_pretrained('<path-to-checkpoint>', attn_implementation='eager')`."
+            )
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        # the base model returns a BaseModelOutputWithPast (last_hidden_state, past_key_values, hidden_states, attentions)
+        outputs: BaseModelOutputWithPast = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            cache_position=cache_position,
+            **kwargs,
+        )
+
+        hidden_states = outputs.last_hidden_state
+        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
+        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+        logits = self.lm_head(hidden_states[:, slice_indices, :])
+        if self.config.final_logit_softcapping is not None:
+            logits = logits / self.config.final_logit_softcapping
+            logits = torch.tanh(logits)
+            logits = logits * self.config.final_logit_softcapping
+
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(logits, labels, self.vocab_size, **kwargs)
+
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+class Gemma2ForSequenceClassification(GemmaForSequenceClassification):
+    pass
+
+
+class Gemma2ForTokenClassification(GemmaForTokenClassification):
+    pass
+
+
+__all__ = [
+    "Gemma2Config",
+    "Gemma2ForCausalLM",
+    "Gemma2Model",
+    "Gemma2PreTrainedModel",  # noqa: F822
+    "Gemma2ForSequenceClassification",
+    "Gemma2ForTokenClassification",
+]
diff --git a/phivenv/Lib/site-packages/transformers/models/gemma3/__init__.py b/phivenv/Lib/site-packages/transformers/models/gemma3/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..37ec82f91037a6988291c99971b27964b33e5193
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/gemma3/__init__.py
@@ -0,0 +1,30 @@
+# Copyright 2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_gemma3 import *
+    from .image_processing_gemma3 import *
+    from .image_processing_gemma3_fast import *
+    from .modeling_gemma3 import *
+    from .processing_gemma3 import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/phivenv/Lib/site-packages/transformers/models/gemma3/__pycache__/__init__.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/gemma3/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a82e8ab2434c1a128b97e2d1b2b88764f62cef17
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/gemma3/__pycache__/__init__.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/gemma3/__pycache__/configuration_gemma3.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/gemma3/__pycache__/configuration_gemma3.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5d440029510bb72597f0880c6e737bab08443feb
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/gemma3/__pycache__/configuration_gemma3.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/gemma3/__pycache__/image_processing_gemma3.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/gemma3/__pycache__/image_processing_gemma3.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5bff781f73af6b3db9e27d5bfc422090f609d586
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/gemma3/__pycache__/image_processing_gemma3.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/gemma3/__pycache__/image_processing_gemma3_fast.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/gemma3/__pycache__/image_processing_gemma3_fast.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b4eda4bb859de15c2f8ad399c400214ee7399eca
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/gemma3/__pycache__/image_processing_gemma3_fast.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/gemma3/__pycache__/modeling_gemma3.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/gemma3/__pycache__/modeling_gemma3.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..abd792bd0479c614d8271705ff30457079a694f9
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/gemma3/__pycache__/modeling_gemma3.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/gemma3/__pycache__/modular_gemma3.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/gemma3/__pycache__/modular_gemma3.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7d59e14b9a3361a17680ec16a471ab5f1bc9c773
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/gemma3/__pycache__/modular_gemma3.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/gemma3/__pycache__/processing_gemma3.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/gemma3/__pycache__/processing_gemma3.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a51df6459964b8a413045acf5c8019e5d5d8e367
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/gemma3/__pycache__/processing_gemma3.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/gemma3/configuration_gemma3.py b/phivenv/Lib/site-packages/transformers/models/gemma3/configuration_gemma3.py
new file mode 100644
index 0000000000000000000000000000000000000000..c0184c1993d3dc5ee63f30296905f70af3e020ba
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/gemma3/configuration_gemma3.py
@@ -0,0 +1,347 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/gemma3/modular_gemma3.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_gemma3.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# coding=utf-8
+# Copyright 2025 Google Inc. HuggingFace Inc. team. All rights reserved.
+#
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import warnings
+from typing import Any, Optional, Union
+
+from ...configuration_utils import PretrainedConfig, layer_type_validation
+from ...modeling_rope_utils import rope_config_validation
+from ...utils import logging
+from ..siglip import SiglipVisionConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+class Gemma3TextConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Gemma3TextModel`]. It is used to instantiate a Gemma3Text
+    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+    defaults will yield a similar configuration to that of the Gemma3Text-7B.
+    e.g. [google/gemma3_text-7b](https://huggingface.co/google/gemma3_text-7b)
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+    Args:
+        vocab_size (`int`, *optional*, defaults to 262208):
+            Vocabulary size of the Gemma3Text model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`Gemma3TextModel`]
+        hidden_size (`int`, *optional*, defaults to 2304):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 9216):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 26):
+            Number of hidden layers in the Transformer decoder.
+        num_attention_heads (`int`, *optional*, defaults to 8):
+            Number of attention heads for each attention layer in the Transformer decoder.
+        num_key_value_heads (`int`, *optional*, defaults to 4):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details, check out [this
+            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
+            `num_attention_heads`.
+        head_dim (`int`, *optional*, defaults to 256):
+            The attention head dimension.
+        hidden_activation (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
+            The non-linear activation function (function or string) in the decoder. Will default to `"gelu_pytorch_tanh"`
+            if not specified. `"gelu_pytorch_tanh"` uses an approximation of the `"gelu"` activation function.
+        max_position_embeddings (`int`, *optional*, defaults to 131072):
+            The maximum sequence length that this model might ever be used with.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        pad_token_id (`int`, *optional*, defaults to 0):
+            Padding token id.
+        eos_token_id (`int`, *optional*, defaults to 1):
+            End of stream token id.
+        bos_token_id (`int`, *optional*, defaults to 2):
+            Beginning of stream token id.
+        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
+            Whether to tie weight embeddings
+        rope_theta (`float`, *optional*, defaults to 1000000.0):
+            The base period of the RoPE embeddings.
+        attention_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use a bias in the query, key, value and output projection layers during self-attention.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        query_pre_attn_scalar (`float`, *optional*, defaults to 256):
+            Scaling factor used on the attention scores
+        sliding_window (`int`, *optional*, defaults to 4096):
+            In Gemma3Text, the layers marked `"sliding_attention"` in `layer_types` use sliding window attention. This is the size of the sliding window.
+        layer_types (`list`, *optional*):
+            Attention pattern for each layer.
+        final_logit_softcapping (`float`, *optional*):
+            Scaling factor when applying tanh softcapping on the logits.
+        attn_logit_softcapping (`float`, *optional*):
+            Scaling factor when applying tanh softcapping on the attention scores.
+        rope_scaling (`Dict`, *optional*):
+            Dictionary containing the scaling configuration for the RoPE embeddings used in global attention. NOTE: if you apply new rope type
+            and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
+            accordingly.
+            Expected contents:
+                `rope_type` (`str`):
+                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
+                    'llama3'], with 'default' being the original RoPE implementation.
+                `factor` (`float`, *optional*):
+                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
+                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
+                    original maximum pre-trained length.
+                `original_max_position_embeddings` (`int`, *optional*):
+                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
+                    pretraining.
+                `attention_factor` (`float`, *optional*):
+                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
+                    computation. If unspecified, it defaults to value recommended by the implementation, using the
+                    `factor` field to infer the suggested value.
+                `beta_fast` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 32.
+                `beta_slow` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 1.
+                `short_factor` (`list[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `long_factor` (`list[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to long contexts (>
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `low_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
+                `high_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
+        rope_local_base_freq (`float`, *optional*, defaults to 10000.0):
+            The base period of the RoPE embeddings for local attention.
+
+    ```python
+    >>> from transformers import Gemma3TextModel, Gemma3TextConfig
+    >>> # Initializing a Gemma3Text gemma3_text-7b style configuration
+    >>> configuration = Gemma3TextConfig()
+    >>> # Initializing a model from the gemma3_text-7b style configuration
+    >>> model = Gemma3TextModel(configuration)
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```
+    """
+
+    model_type = "gemma3_text"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    base_model_tp_plan = {
+        "layers.*.self_attn.q_proj": "colwise",
+        "layers.*.self_attn.k_proj": "colwise",
+        "layers.*.self_attn.v_proj": "colwise",
+        "layers.*.self_attn.o_proj": "rowwise",
+        "layers.*.mlp.gate_proj": "colwise",
+        "layers.*.mlp.up_proj": "colwise",
+        "layers.*.mlp.down_proj": "rowwise",
+    }
+    base_model_pp_plan = {
+        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
+        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
+        "norm": (["hidden_states"], ["hidden_states"]),
+    }
+
+    def __init__(
+        self,
+        vocab_size=262_208,
+        hidden_size=2304,
+        intermediate_size=9216,
+        num_hidden_layers=26,
+        num_attention_heads=8,
+        num_key_value_heads=4,
+        head_dim=256,
+        hidden_activation="gelu_pytorch_tanh",
+        max_position_embeddings=131_072,
+        initializer_range=0.02,
+        rms_norm_eps=1e-6,
+        use_cache=True,
+        pad_token_id=0,
+        eos_token_id=1,
+        bos_token_id=2,
+        tie_word_embeddings=True,
+        rope_theta=1_000_000.0,
+        attention_bias=False,
+        attention_dropout=0.0,
+        query_pre_attn_scalar=256,
+        sliding_window=4096,
+        layer_types=None,
+        final_logit_softcapping=None,
+        attn_logit_softcapping=None,
+        rope_scaling=None,
+        rope_local_base_freq=10_000.0,
+        **kwargs,
+    ):
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.head_dim = head_dim
+        self.num_key_value_heads = num_key_value_heads
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+        self.hidden_activation = hidden_activation
+        self.query_pre_attn_scalar = query_pre_attn_scalar
+        self.sliding_window = sliding_window
+        self.final_logit_softcapping = final_logit_softcapping
+        self.attn_logit_softcapping = attn_logit_softcapping
+        self.layer_types = layer_types
+
+        self.rope_local_base_freq = rope_local_base_freq
+        self.rope_scaling = rope_scaling
+        rope_config_validation(self)
+
+        # BC -> the pattern used to be a simple int, and it's still present in configs on the Hub
+        self._sliding_window_pattern = kwargs.get("sliding_window_pattern", 6)
+
+        if self.layer_types is None:
+            self.layer_types = [
+                "sliding_attention" if bool((i + 1) % self._sliding_window_pattern) else "full_attention"
+                for i in range(self.num_hidden_layers)
+            ]
+        layer_type_validation(self.layer_types)
+
+    @property
+    def sliding_window_pattern(self):
+        warnings.warn(
+            "The `sliding_window_pattern` attribute is deprecated and will be removed in v4.55.0.",
+            FutureWarning,
+        )
+        return self._sliding_window_pattern
+
+    @sliding_window_pattern.setter
+    def sliding_window_pattern(self, value):
+        self._sliding_window_pattern = value
+
+
+class Gemma3Config(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Gemma3ForConditionalGeneration`]. It is used to instantiate a
+    Gemma3ForConditionalGeneration according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the PaliGemma-2B.
+
+    e.g. [google/gemma-3-4b](https://huggingface.co/google/gemma-3-4b)
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        text_config (`Union[Gemma3TextConfig, dict]`, *optional*):
+            The config object of the text backbone.
+        vision_config (`Union[SiglipVisionConfig, dict]`, *optional*):
+            Custom vision config or dict.
+        mm_tokens_per_image (`int`, *optional*, defaults to 256):
+            The number of tokens per image embedding.
+        boi_token_index (`int`, *optional*, defaults to 255999):
+            The begin-of-image token index to wrap the image prompt.
+        eoi_token_index (`int`, *optional*, defaults to 256000):
+            The end-of-image token index to wrap the image prompt.
+        image_token_index (`int`, *optional*, defaults to 262144):
+            The image token index to encode the image prompt.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+
+
+    Example:
+
+    ```python
+    >>> from transformers import Gemma3ForConditionalGeneration, Gemma3Config, SiglipVisionConfig, Gemma3TextConfig
+
+    >>> # Initializing a Siglip-like vision config
+    >>> vision_config = SiglipVisionConfig()
+
+    >>> # Initializing a Gemma3 Text config
+    >>> text_config = Gemma3TextConfig()
+
+    >>> # Initializing a Gemma3 gemma-3-4b style configuration
+    >>> configuration = Gemma3Config(text_config, vision_config)
+
+    >>> # Initializing a model from the gemma-3-4b style configuration
+    >>> model = Gemma3ForConditionalGeneration(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "gemma3"
+    attribute_map = {
+        "image_token_id": "image_token_index",
+        "boi_token_id": "boi_token_index",
+        "eoi_token_id": "eoi_token_index",
+    }
+    sub_configs = {
+        "text_config": Gemma3TextConfig,
+        "vision_config": SiglipVisionConfig,
+    }
+
+    def __init__(
+        self,
+        text_config: Optional[Union[Gemma3TextConfig, dict[str, Any]]] = None,
+        vision_config: Optional[Union[SiglipVisionConfig, dict[str, Any]]] = None,
+        mm_tokens_per_image: int = 256,
+        boi_token_index: int = 255_999,
+        eoi_token_index: int = 256_000,
+        image_token_index: int = 262_144,
+        initializer_range: float = 0.02,
+        **kwargs,
+    ):
+        if text_config is None:
+            text_config = Gemma3TextConfig()
+            logger.info("text_config is None, using default Gemma3TextConfig text config.")
+        elif isinstance(text_config, dict):
+            text_config = Gemma3TextConfig(**text_config)
+
+        if isinstance(vision_config, dict):
+            vision_config = SiglipVisionConfig(**vision_config)
+        elif vision_config is None:
+            vision_config = SiglipVisionConfig()
+            logger.info("vision_config is None, using default SiglipVisionConfig vision config.")
+
+        self.text_config = text_config
+        self.vision_config = vision_config
+        self.mm_tokens_per_image = mm_tokens_per_image
+        self.boi_token_index = boi_token_index
+        self.eoi_token_index = eoi_token_index
+        self.image_token_index = image_token_index
+        self.initializer_range = initializer_range
+
+        super().__init__(**kwargs)
+
+
+__all__ = ["Gemma3Config", "Gemma3TextConfig"]
diff --git a/phivenv/Lib/site-packages/transformers/models/gemma3/image_processing_gemma3.py b/phivenv/Lib/site-packages/transformers/models/gemma3/image_processing_gemma3.py
new file mode 100644
index 0000000000000000000000000000000000000000..f7bd414dbb9181c131962e9f51facc56df83b448
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/gemma3/image_processing_gemma3.py
@@ -0,0 +1,409 @@
+# coding=utf-8
+# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Image processor class for Gemma3."""
+
+import itertools
+import math
+from typing import Optional, Union
+
+import numpy as np
+
+from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
+from ...image_transforms import (
+    convert_to_rgb,
+    resize,
+    to_channel_dimension_format,
+)
+from ...image_utils import (
+    IMAGENET_STANDARD_MEAN,
+    IMAGENET_STANDARD_STD,
+    ChannelDimension,
+    ImageInput,
+    PILImageResampling,
+    get_image_size,
+    infer_channel_dimension_format,
+    is_scaled_image,
+    make_flat_list_of_images,
+    to_numpy_array,
+    valid_images,
+    validate_preprocess_arguments,
+)
+from ...utils import TensorType, filter_out_non_signature_kwargs, is_vision_available, logging
+
+
+logger = logging.get_logger(__name__)
+
+
+if is_vision_available():
+    import PIL
+
+
+class Gemma3ImageProcessor(BaseImageProcessor):
+    r"""
+    Constructs a Gemma3 image processor.
+
+    Args:
+        do_resize (`bool`, *optional*, defaults to `True`):
+            Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by
+            `do_resize` in the `preprocess` method.
+        size (`dict[str, int]` *optional*, defaults to `{"height": 224, "width": 224}`):
+            Size of the image after resizing. Can be overridden by `size` in the `preprocess` method.
+        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`):
+            Resampling filter to use if resizing the image. Can be overridden by `resample` in the `preprocess` method.
+        do_rescale (`bool`, *optional*, defaults to `True`):
+            Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in
+            the `preprocess` method.
+        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
+            Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess`
+            method.
+        do_normalize (`bool`, *optional*, defaults to `True`):
+            Whether to normalize the image by the specified mean and standard deviation. Can be overridden by
+            `do_normalize` in the `preprocess` method.
+        image_mean (`float` or `list[float]`, *optional*, defaults to `[0.5, 0.5, 0.5]`):
+            Mean to use if normalizing the image. This is a float or list of floats the length of the number of
+            channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
+        image_std (`float` or `list[float]`, *optional*, defaults to `[0.5, 0.5, 0.5]`):
+            Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
+            number of channels in the image.
+            Can be overridden by the `image_std` parameter in the `preprocess` method.
+        do_convert_rgb (`bool`, *optional*, defaults to `True`):
+            Whether to convert the image to RGB.
+        do_pan_and_scan (`bool`, *optional*):
+            Whether to apply `pan_and_scan` to images.
+        pan_and_scan_min_crop_size (`int`, *optional*):
+            Minimum size of each crop in pan and scan.
+        pan_and_scan_max_num_crops (`int`, *optional*):
+            Maximum number of crops per image in pan and scan.
+        pan_and_scan_min_ratio_to_activate (`float`, *optional*):
+            Minimum aspect ratio to activate pan and scan.
+    """
+
+    model_input_names = ["pixel_values", "num_crops"]
+
+    def __init__(
+        self,
+        do_resize: bool = True,
+        size: Optional[dict[str, int]] = None,
+        resample: PILImageResampling = PILImageResampling.BILINEAR,
+        do_rescale: bool = True,
+        rescale_factor: Union[int, float] = 1 / 255,
+        do_normalize: bool = True,
+        image_mean: Optional[Union[float, list[float]]] = None,
+        image_std: Optional[Union[float, list[float]]] = None,
+        do_convert_rgb: Optional[bool] = True,
+        do_pan_and_scan: Optional[bool] = None,
+        pan_and_scan_min_crop_size: Optional[int] = None,
+        pan_and_scan_max_num_crops: Optional[int] = None,
+        pan_and_scan_min_ratio_to_activate: Optional[float] = None,
+        **kwargs,
+    ) -> None:
+        super().__init__(**kwargs)
+        size = size if size is not None else {"height": 224, "width": 224}
+        size = get_size_dict(size, default_to_square=True)
+        image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN
+        image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD
+
+        self.do_resize = do_resize
+        self.size = size
+        self.resample = resample
+        self.do_rescale = do_rescale
+        self.rescale_factor = rescale_factor
+        self.do_normalize = do_normalize
+        self.image_mean = image_mean
+        self.image_std = image_std
+        self.do_convert_rgb = do_convert_rgb
+        self.do_pan_and_scan = do_pan_and_scan
+        self.pan_and_scan_min_crop_size = pan_and_scan_min_crop_size
+        self.pan_and_scan_max_num_crops = pan_and_scan_max_num_crops
+        self.pan_and_scan_min_ratio_to_activate = pan_and_scan_min_ratio_to_activate
+
+    def pan_and_scan(
+        self,
+        image: np.ndarray,
+        pan_and_scan_min_crop_size: int,
+        pan_and_scan_max_num_crops: int,
+        pan_and_scan_min_ratio_to_activate: float,
+        data_format: Optional[Union[str, ChannelDimension]] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    ):
+        """
+        Pan and Scan an image, by cropping it into smaller images when the aspect ratio exceeds the
+        minimum allowed ratio.
+
+        Args:
+            image (`np.ndarray`):
+                Image to crop.
+            pan_and_scan_min_crop_size (`int`):
+                Minimum size of each crop in pan and scan.
+            pan_and_scan_max_num_crops (`int`):
+                Maximum number of crops per image in pan and scan.
+            pan_and_scan_min_ratio_to_activate (`float`):
+                Minimum aspect ratio to activate pan and scan.
+            data_format (`str` or `ChannelDimension`, *optional*):
+                The channel dimension format of the image. If not provided, it will be the same as the input image.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format of the input image. If not provided, it will be inferred.
+        """
+        height, width = get_image_size(image)
+
+        # Square or landscape image.
+        if width >= height:
+            # Only apply PaS if the image is sufficiently exaggerated
+            if width / height < pan_and_scan_min_ratio_to_activate:
+                return []
+
+            # Select ideal number of crops close to the image aspect ratio and such that crop_size > min_crop_size.
+            num_crops_w = int(math.floor(width / height + 0.5))  # Half round up rounding.
+            num_crops_w = min(int(math.floor(width / pan_and_scan_min_crop_size)), num_crops_w)
+
+            # Make sure the number of crops is in range [2, pan_and_scan_max_num_crops].
+            num_crops_w = max(2, num_crops_w)
+            num_crops_w = min(pan_and_scan_max_num_crops, num_crops_w)
+            num_crops_h = 1
+
+        # Portrait image.
+        else:
+            # Only apply PaS if the image is sufficiently exaggerated
+            if height / width < pan_and_scan_min_ratio_to_activate:
+                return []
+
+            # Select ideal number of crops close to the image aspect ratio and such that crop_size > min_crop_size.
+            num_crops_h = int(math.floor(height / width + 0.5))
+            num_crops_h = min(int(math.floor(height / pan_and_scan_min_crop_size)), num_crops_h)
+
+            # Make sure the number of crops is in range [2, pan_and_scan_max_num_crops].
+            num_crops_h = max(2, num_crops_h)
+            num_crops_h = min(pan_and_scan_max_num_crops, num_crops_h)
+            num_crops_w = 1
+
+        crop_size_w = int(math.ceil(width / num_crops_w))
+        crop_size_h = int(math.ceil(height / num_crops_h))
+
+        # Don't apply PaS if crop size is too small.
+        if min(crop_size_w, crop_size_h) < pan_and_scan_min_crop_size:
+            return []
+
+        crop_positions_w = [crop_size_w * i for i in range(num_crops_w)]
+        crop_positions_h = [crop_size_h * i for i in range(num_crops_h)]
+
+        if input_data_format == ChannelDimension.LAST:
+            image_crops = [
+                image[pos_h : pos_h + crop_size_h, pos_w : pos_w + crop_size_w]
+                for pos_h, pos_w in itertools.product(crop_positions_h, crop_positions_w)
+            ]
+        else:
+            image_crops = [
+                image[:, pos_h : pos_h + crop_size_h, pos_w : pos_w + crop_size_w]
+                for pos_h, pos_w in itertools.product(crop_positions_h, crop_positions_w)
+            ]
+
+        return image_crops
+
+    def _process_images_for_pan_and_scan(
+        self,
+        images: list[np.ndarray],
+        do_pan_and_scan: bool,
+        pan_and_scan_min_crop_size: int,
+        pan_and_scan_max_num_crops: int,
+        pan_and_scan_min_ratio_to_activate: float,
+        data_format: Optional[Union[str, ChannelDimension]] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    ):
+        pas_images_list = []
+        num_crops = []
+        for image in images:
+            pas_images = self.pan_and_scan(
+                image=image,
+                pan_and_scan_min_crop_size=pan_and_scan_min_crop_size,
+                pan_and_scan_max_num_crops=pan_and_scan_max_num_crops,
+                pan_and_scan_min_ratio_to_activate=pan_and_scan_min_ratio_to_activate,
+                data_format=data_format,
+                input_data_format=input_data_format,
+            )
+            pas_images_list.extend([image] + pas_images)
+            num_crops.append(len(pas_images))
+        return pas_images_list, num_crops
+
+    @filter_out_non_signature_kwargs()
+    def preprocess(
+        self,
+        images: ImageInput,
+        do_resize: Optional[bool] = None,
+        size: Optional[dict[str, int]] = None,
+        resample: PILImageResampling = None,
+        do_rescale: Optional[bool] = None,
+        rescale_factor: Optional[float] = None,
+        do_normalize: Optional[bool] = None,
+        image_mean: Optional[Union[float, list[float]]] = None,
+        image_std: Optional[Union[float, list[float]]] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+        do_convert_rgb: Optional[bool] = None,
+        do_pan_and_scan: Optional[bool] = None,
+        pan_and_scan_min_crop_size: Optional[int] = None,
+        pan_and_scan_max_num_crops: Optional[int] = None,
+        pan_and_scan_min_ratio_to_activate: Optional[float] = None,
+    ) -> PIL.Image.Image:
+        """
+        Preprocess an image or batch of images.
+
+        Args:
+            images (`ImageInput`):
+                Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
+                passing in images with pixel values between 0 and 1, set `do_rescale=False`.
+            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
+                Whether to resize the image.
+            size (`dict[str, int]`, *optional*, defaults to `self.size`):
+                Size of the image after resizing.
+            resample (`int`, *optional*, defaults to `self.resample`):
+                Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only
+                has an effect if `do_resize` is set to `True`.
+            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
+                Whether to rescale the image.
+            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
+                Rescale factor to rescale the image by if `do_rescale` is set to `True`.
+            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
+                Whether to normalize the image.
+            image_mean (`float` or `list[float]`, *optional*, defaults to `self.image_mean`):
+                Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.
+            image_std (`float` or `list[float]`, *optional*, defaults to `self.image_std`):
+                Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to
+                `True`.
+            return_tensors (`str` or `TensorType`, *optional*):
+                The type of tensors to return. Can be one of:
+                - Unset: Return a list of `np.ndarray`.
+                - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
+                - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
+                - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
+                - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
+            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
+                The channel dimension format for the output image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - Unset: Use the channel dimension format of the input image.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the input image. If unset, the channel dimension format is inferred
+                from the input image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
+                Whether to convert the image to RGB.
+            do_pan_and_scan (`bool`, *optional*, defaults to `self.do_pan_and_scan`):
+                Whether to apply `pan_and_scan` to images.
+            pan_and_scan_min_crop_size (`int`, *optional*, defaults to `self.pan_and_scan_min_crop_size`):
+                Minimum size of each crop in pan and scan.
+            pan_and_scan_max_num_crops (`int`, *optional*, defaults to `self.pan_and_scan_max_num_crops`):
+                Maximum number of crops per image in pan and scan.
+            pan_and_scan_min_ratio_to_activate (`float`, *optional*, defaults to `self.pan_and_scan_min_ratio_to_activate`):
+                Minimum aspect ratio to activate pan and scan.
+        """
+        do_resize = do_resize if do_resize is not None else self.do_resize
+        size = size if size is not None else self.size
+        size = get_size_dict(size, param_name="size", default_to_square=False)
+        resample = resample if resample is not None else self.resample
+        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
+        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
+        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
+        image_mean = image_mean if image_mean is not None else self.image_mean
+        image_std = image_std if image_std is not None else self.image_std
+        do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
+        do_pan_and_scan = do_pan_and_scan if do_pan_and_scan is not None else self.do_pan_and_scan
+        pan_and_scan_min_crop_size = (
+            pan_and_scan_min_crop_size if pan_and_scan_min_crop_size is not None else self.pan_and_scan_min_crop_size
+        )
+        pan_and_scan_max_num_crops = (
+            pan_and_scan_max_num_crops if pan_and_scan_max_num_crops is not None else self.pan_and_scan_max_num_crops
+        )
+        pan_and_scan_min_ratio_to_activate = (
+            pan_and_scan_min_ratio_to_activate
+            if pan_and_scan_min_ratio_to_activate is not None
+            else self.pan_and_scan_min_ratio_to_activate
+        )
+
+        images = self.fetch_images(images)
+        images = make_flat_list_of_images(images)
+
+        if not valid_images(images):
+            raise ValueError(
+                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
+                "torch.Tensor, tf.Tensor or jax.ndarray."
+            )
+
+        validate_preprocess_arguments(
+            do_rescale=do_rescale,
+            rescale_factor=rescale_factor,
+            do_normalize=do_normalize,
+            image_mean=image_mean,
+            image_std=image_std,
+            do_resize=do_resize,
+            size=size,
+            resample=resample,
+        )
+        if do_convert_rgb:
+            images = [convert_to_rgb(image) for image in images]
+
+        # All transformations expect numpy arrays.
+        images = [to_numpy_array(image) for image in images]
+
+        if do_rescale and is_scaled_image(images[0]):
+            logger.warning_once(
+                "It looks like you are trying to rescale already rescaled images. If the input"
+                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
+            )
+
+        if input_data_format is None:
+            # We assume that all images have the same channel dimension format.
+            input_data_format = infer_channel_dimension_format(images[0])
+
+        if do_pan_and_scan:
+            images, num_crops = self._process_images_for_pan_and_scan(
+                images=images,
+                do_pan_and_scan=do_pan_and_scan,
+                pan_and_scan_min_crop_size=pan_and_scan_min_crop_size,
+                pan_and_scan_max_num_crops=pan_and_scan_max_num_crops,
+                pan_and_scan_min_ratio_to_activate=pan_and_scan_min_ratio_to_activate,
+                data_format=data_format,
+                input_data_format=input_data_format,
+            )
+
+        else:
+            num_crops = [0 for _ in images]
+
+        processed_images = []
+        for image in images:
+            if do_resize:
+                height, width = size["height"], size["width"]
+                image = resize(
+                    image=image, size=(height, width), resample=resample, input_data_format=input_data_format
+                )
+
+            if do_rescale:
+                image = self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format)
+
+            if do_normalize:
+                image = self.normalize(
+                    image=image, mean=image_mean, std=image_std, input_data_format=input_data_format
+                )
+
+            image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
+            processed_images.append(image)
+
+        data = {"pixel_values": processed_images, "num_crops": num_crops}
+        return BatchFeature(data=data, tensor_type=return_tensors)
+
+
+__all__ = ["Gemma3ImageProcessor"]
diff --git a/phivenv/Lib/site-packages/transformers/models/gemma3/image_processing_gemma3_fast.py b/phivenv/Lib/site-packages/transformers/models/gemma3/image_processing_gemma3_fast.py
new file mode 100644
index 0000000000000000000000000000000000000000..6ce7b508b270b3a4bdca1bddfb4cfa70f7f0eb36
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/gemma3/image_processing_gemma3_fast.py
@@ -0,0 +1,272 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Fast Image processor class for Gemma3."""
+
+import itertools
+import math
+from typing import Optional, Union
+
+from ...image_processing_utils_fast import (
+    BaseImageProcessorFast,
+    BatchFeature,
+    DefaultFastImageProcessorKwargs,
+    group_images_by_shape,
+    reorder_images,
+)
+from ...image_utils import IMAGENET_STANDARD_MEAN, IMAGENET_STANDARD_STD, ImageInput, SizeDict
+from ...processing_utils import Unpack
+from ...utils import (
+    TensorType,
+    auto_docstring,
+    is_torch_available,
+    is_torchvision_available,
+    is_torchvision_v2_available,
+    is_vision_available,
+    logging,
+)
+
+
+if is_vision_available():
+    from ...image_utils import PILImageResampling
+
+if is_torch_available():
+    import torch
+
+if is_torchvision_available():
+    if is_torchvision_v2_available():
+        from torchvision.transforms.v2 import functional as F
+    else:
+        from torchvision.transforms import functional as F
+
+logger = logging.get_logger(__name__)
+
+
+class Gemma3FastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
+    """
+    do_pan_and_scan (`bool`, *optional*):
+        Whether to apply `pan_and_scan` to images.
+    pan_and_scan_min_crop_size (`int`, *optional*):
+        Minimum size of each crop in pan and scan.
+    pan_and_scan_max_num_crops (`int`, *optional*):
+        Maximum number of crops per image in pan and scan.
+    pan_and_scan_min_ratio_to_activate (`float`, *optional*):
+        Minimum aspect ratio to activate pan and scan.
+    """
+
+    do_pan_and_scan: Optional[bool]
+    pan_and_scan_min_crop_size: Optional[int]
+    pan_and_scan_max_num_crops: Optional[int]
+    pan_and_scan_min_ratio_to_activate: Optional[float]
+
+
+@auto_docstring
+class Gemma3ImageProcessorFast(BaseImageProcessorFast):
+    resample = PILImageResampling.BILINEAR
+    image_mean = IMAGENET_STANDARD_MEAN
+    image_std = IMAGENET_STANDARD_STD
+    size = {"height": 224, "width": 224}
+    default_to_square = True
+    do_convert_rgb = True
+    do_resize = True
+    do_rescale = True
+    do_normalize = True
+    do_pan_and_scan = None
+    pan_and_scan_min_crop_size = None
+    pan_and_scan_max_num_crops = None
+    pan_and_scan_min_ratio_to_activate = None
+    valid_kwargs = Gemma3FastImageProcessorKwargs
+
+    def __init__(self, **kwargs: Unpack[Gemma3FastImageProcessorKwargs]):
+        super().__init__(**kwargs)
+
+    def pan_and_scan_batched(
+        self,
+        images: "torch.Tensor",
+        pan_and_scan_min_crop_size: int,
+        pan_and_scan_max_num_crops: int,
+        pan_and_scan_min_ratio_to_activate: float,
+    ):
+        """
+        Pan and Scan an image, by cropping into smaller images when the aspect ratio exceeds
+        minimum allowed ratio.
+
+        Args:
+            images (`torch.Tensor`):
+                Images to crop; height and width are taken from the last two dimensions.
+            pan_and_scan_min_crop_size (`int`):
+                Minimum size of each crop in pan and scan.
+            pan_and_scan_max_num_crops (`int`):
+                Maximum number of crops per image in pan and scan.
+            pan_and_scan_min_ratio_to_activate (`float`):
+                Minimum aspect ratio to activate pan and scan.
+        """
+        height, width = images.shape[-2:]
+
+        # Square or landscape image.
+        if width >= height:
+            # Only apply PaS if the image is sufficiently exaggerated
+            if width / height < pan_and_scan_min_ratio_to_activate:
+                return []
+
+            # Select ideal number of crops close to the image aspect ratio and such that crop_size > min_crop_size.
+            num_crops_w = int(math.floor(width / height + 0.5))  # Half round up rounding.
+            num_crops_w = min(int(math.floor(width / pan_and_scan_min_crop_size)), num_crops_w)
+
+            # Make sure the number of crops is in range [2, pan_and_scan_max_num_crops].
+            num_crops_w = max(2, num_crops_w)
+            num_crops_w = min(pan_and_scan_max_num_crops, num_crops_w)
+            num_crops_h = 1
+
+        # Portrait image.
+        else:
+            # Only apply PaS if the image is sufficiently exaggerated
+            if height / width < pan_and_scan_min_ratio_to_activate:
+                return []
+
+            # Select ideal number of crops close to the image aspect ratio and such that crop_size > min_crop_size.
+            num_crops_h = int(math.floor(height / width + 0.5))
+            num_crops_h = min(int(math.floor(height / pan_and_scan_min_crop_size)), num_crops_h)
+
+            # Make sure the number of crops is in range [2, pan_and_scan_max_num_crops].
+            num_crops_h = max(2, num_crops_h)
+            num_crops_h = min(pan_and_scan_max_num_crops, num_crops_h)
+            num_crops_w = 1
+
+        crop_size_w = int(math.ceil(width / num_crops_w))
+        crop_size_h = int(math.ceil(height / num_crops_h))
+
+        # Don't apply PaS if crop size is too small.
+        if min(crop_size_w, crop_size_h) < pan_and_scan_min_crop_size:
+            return []
+
+        crop_positions_w = [crop_size_w * i for i in range(num_crops_w)]
+        crop_positions_h = [crop_size_h * i for i in range(num_crops_h)]
+
+        return [
+            images[..., pos_h : pos_h + crop_size_h, pos_w : pos_w + crop_size_w]
+            for pos_h, pos_w in itertools.product(crop_positions_h, crop_positions_w)
+        ]
+
+    def _process_images_for_pan_and_scan(
+        self,
+        images: list["torch.Tensor"],
+        do_pan_and_scan: bool,
+        pan_and_scan_min_crop_size: int,
+        pan_and_scan_max_num_crops: int,
+        pan_and_scan_min_ratio_to_activate: float,
+    ):
+        pas_images = self.pan_and_scan_batched(
+            images=images,
+            pan_and_scan_min_crop_size=pan_and_scan_min_crop_size,
+            pan_and_scan_max_num_crops=pan_and_scan_max_num_crops,
+            pan_and_scan_min_ratio_to_activate=pan_and_scan_min_ratio_to_activate,
+        )
+        num_crops = [len(pas_images) for _ in images]
+        return pas_images, num_crops
+
+    @auto_docstring
+    def preprocess(
+        self,
+        images: ImageInput,
+        **kwargs: Unpack[Gemma3FastImageProcessorKwargs],
+    ) -> BatchFeature:
+        return super().preprocess(images, **kwargs)
+
+    def _preprocess(
+        self,
+        images: list[list["torch.Tensor"]],
+        do_resize: bool,
+        size: SizeDict,
+        do_pan_and_scan: Optional[bool],
+        pan_and_scan_min_crop_size: Optional[int],
+        pan_and_scan_max_num_crops: Optional[int],
+        pan_and_scan_min_ratio_to_activate: Optional[float],
+        interpolation: Optional["F.InterpolationMode"],
+        do_center_crop: bool,
+        crop_size: SizeDict,
+        do_rescale: bool,
+        rescale_factor: float,
+        do_normalize: bool,
+        image_mean: Optional[Union[float, list[float]]],
+        image_std: Optional[Union[float, list[float]]],
+        disable_grouping: Optional[bool],
+        return_tensors: Optional[Union[str, TensorType]],
+    ) -> BatchFeature:
+        # Group images by size for batched processing
+        processed_images_grouped = {}
+        num_crops_grouped = {}
+        grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping)
+        for shape_images, stacked_images in grouped_images.items():
+            if do_pan_and_scan:
+                pas_images, num_crops = self._process_images_for_pan_and_scan(
+                    images=stacked_images,
+                    do_pan_and_scan=do_pan_and_scan,
+                    pan_and_scan_min_crop_size=pan_and_scan_min_crop_size,
+                    pan_and_scan_max_num_crops=pan_and_scan_max_num_crops,
+                    pan_and_scan_min_ratio_to_activate=pan_and_scan_min_ratio_to_activate,
+                )
+                # Add the thumbnails to the image patches
+                stacked_images = [stacked_images] + pas_images
+                # Group images by size for batched resizing (this will typically group thumbnails together and cropped patches together)
+                processed_image_patches_grouped = {}
+                grouped_image_patches, grouped_image_patches_index = group_images_by_shape(
+                    stacked_images, disable_grouping=disable_grouping
+                )
+                for shape, stacked_image_patches in grouped_image_patches.items():
+                    stacked_image_patches = self.resize(
+                        image=stacked_image_patches,
+                        size=size,
+                        interpolation=interpolation,
+                    )
+                    processed_image_patches_grouped[shape] = stacked_image_patches
+                processed_image_patches = reorder_images(processed_image_patches_grouped, grouped_image_patches_index)
+                # Transpose to have the thumbnails with their corresponding patches
+                stacked_images = torch.stack(processed_image_patches, dim=0).transpose(0, 1).contiguous()
+            else:
+                num_crops = [0 for _ in stacked_images]
+
+                if do_resize:
+                    stacked_images = self.resize(
+                        image=stacked_images,
+                        size=size,
+                        interpolation=interpolation,
+                    )
+            num_crops_grouped[shape_images] = num_crops
+            processed_images_grouped[shape_images] = stacked_images
+        resized_images = reorder_images(processed_images_grouped, grouped_images_index)
+        # If pan and scan is enabled, we need to flatten the list of images
+        if do_pan_and_scan:
+            resized_images = [image for images_list in resized_images for image in images_list]
+        num_crops = reorder_images(num_crops_grouped, grouped_images_index)
+
+        # Group images by size for further processing
+        # Needed in case do_resize is False, or resize returns images with different sizes
+        grouped_images, grouped_images_index = group_images_by_shape(resized_images, disable_grouping=disable_grouping)
+        processed_images_grouped = {}
+        for shape, stacked_images in grouped_images.items():
+            # Fused rescale and normalize
+            stacked_images = self.rescale_and_normalize(
+                stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std
+            )
+            processed_images_grouped[shape] = stacked_images
+
+        processed_images = reorder_images(processed_images_grouped, grouped_images_index)
+        processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images
+        return BatchFeature(
+            data={"pixel_values": processed_images, "num_crops": num_crops}, tensor_type=return_tensors
+        )
+
+
+__all__ = ["Gemma3ImageProcessorFast"]
diff --git a/phivenv/Lib/site-packages/transformers/models/gemma3/modeling_gemma3.py b/phivenv/Lib/site-packages/transformers/models/gemma3/modeling_gemma3.py
new file mode 100644
index 0000000000000000000000000000000000000000..83a1283b56e2670936d5306f6277764a76b498e0
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/gemma3/modeling_gemma3.py
@@ -0,0 +1,1308 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/gemma3/modular_gemma3.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_gemma3.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# coding=utf-8
+# Copyright 2025 Google Inc. HuggingFace Inc. team. All rights reserved.
+#
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import copy
+from collections.abc import Callable
+from dataclasses import dataclass
+from typing import Optional, Union
+
+import torch
+import torch.nn as nn
+
+from ...activations import ACT2FN
+from ...cache_utils import Cache, DynamicCache
+from ...configuration_utils import PretrainedConfig
+from ...generation import GenerationMixin
+from ...masking_utils import create_causal_mask, create_masks_for_generate, create_sliding_window_causal_mask
+from ...modeling_flash_attention_utils import FlashAttentionKwargs
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast
+from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from ...processing_utils import Unpack
+from ...utils import ModelOutput, TransformersKwargs, auto_docstring, can_return_tuple, logging
+from ...utils.deprecation import deprecate_kwarg
+from ...utils.generic import check_model_inputs
+from ..auto import AutoModel
+from .configuration_gemma3 import Gemma3Config, Gemma3TextConfig
+
+
+# Module-level logger, obtained through the library's own `logging` helper (imported from `...utils`).
+logger = logging.get_logger(__name__)
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Base class for Gemma3 outputs, with hidden states and attentions.
+    """
+)
+class Gemma3ModelOutputWithPast(BaseModelOutputWithPast):
+    r"""
+    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
+        `(batch_size, num_heads, sequence_length, embed_size_per_head)`.
+
+        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
+        `past_key_values` input) to speed up sequential decoding.
+    image_hidden_states (`torch.FloatTensor`, *optional*):
+        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
+        Hidden states produced by the vision encoder after projecting its last hidden state.
+    """
+
+    # Only field declared here; the remaining fields are inherited from `BaseModelOutputWithPast`.
+    image_hidden_states: Optional[torch.FloatTensor] = None
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Base class for Gemma3 causal language model (or autoregressive) outputs.
+    """
+)
+class Gemma3CausalLMOutputWithPast(ModelOutput):
+    r"""
+    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+        Language modeling loss (for next-token prediction).
+    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.text_config.vocab_size)`):
+        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
+        `(batch_size, num_heads, sequence_length, embed_size_per_head)`.
+
+        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
+        `past_key_values` input) to speed up sequential decoding.
+    image_hidden_states (`torch.FloatTensor`, *optional*):
+        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
+        Hidden states produced by the vision encoder after projecting its last hidden state.
+    """
+
+    # Every field is optional and defaults to `None`.
+    loss: Optional[torch.FloatTensor] = None
+    logits: Optional[torch.FloatTensor] = None
+    # Annotated to accept either a list of tensors or a `Cache` object.
+    past_key_values: Optional[Union[list[torch.FloatTensor], Cache]] = None
+    hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    attentions: Optional[tuple[torch.FloatTensor]] = None
+    image_hidden_states: Optional[torch.FloatTensor] = None
+
+
+class Gemma3TextScaledWordEmbedding(nn.Embedding):
+    """
+    This module overrides nn.Embeddings' forward by multiplying with embeddings scale.
+    """
+
+    def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: int, embed_scale: float = 1.0):
+        super().__init__(num_embeddings, embedding_dim, padding_idx)
+        self.register_buffer("embed_scale", torch.tensor(embed_scale), persistent=False)
+
+    def forward(self, input_ids: torch.Tensor):
+        return super().forward(input_ids) * self.embed_scale.to(self.weight.dtype)
+
+
+class Gemma3MLP(nn.Module):
+    def __init__(self, config: Gemma3TextConfig):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.intermediate_size = config.intermediate_size
+        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
+        self.act_fn = ACT2FN[config.hidden_activation]
+
+    def forward(self, x):
+        down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
+        return down_proj
+
+
+class Gemma3RMSNorm(nn.Module):
+    def __init__(self, dim: int, eps: float = 1e-6):
+        super().__init__()
+        self.eps = eps
+        self.weight = nn.Parameter(torch.zeros(dim))
+
+    def _norm(self, x):
+        return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
+
+    def forward(self, x):
+        output = self._norm(x.float())
+        # Llama does x.to(float16) * w whilst Gemma3 is (x * w).to(float16)
+        # See https://github.com/huggingface/transformers/pull/29402
+        output = output * (1.0 + self.weight.float())
+        return output.type_as(x)
+
+    def extra_repr(self):
+        return f"{tuple(self.weight.shape)}, eps={self.eps}"
+
+
+class Gemma3RotaryEmbedding(nn.Module):
+    inv_freq: torch.Tensor  # fix linting for `register_buffer`
+
+    def __init__(self, config: Gemma3TextConfig, device=None):
+        super().__init__()
+        # BC: "rope_type" was originally "type"
+        if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict):
+            self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
+        else:
+            self.rope_type = "default"
+        self.max_seq_len_cached = config.max_position_embeddings
+        self.original_max_seq_len = config.max_position_embeddings
+
+        self.config = config
+        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
+
+        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+        self.original_inv_freq = self.inv_freq
+
+    @torch.no_grad()
+    @dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
+    def forward(self, x, position_ids):
+        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
+        position_ids_expanded = position_ids[:, None, :].float()
+
+        device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
+        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
+            emb = torch.cat((freqs, freqs), dim=-1)
+            cos = emb.cos() * self.attention_scaling
+            sin = emb.sin() * self.attention_scaling
+
+        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
+
+
+def rotate_half(x):
+    """Rotates half the hidden dims of the input."""
+    x1 = x[..., : x.shape[-1] // 2]
+    x2 = x[..., x.shape[-1] // 2 :]
+    return torch.cat((-x2, x1), dim=-1)
+
+
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
+    """Applies Rotary Position Embedding to the query and key tensors.
+
+    Args:
+        q (`torch.Tensor`): The query tensor.
+        k (`torch.Tensor`): The key tensor.
+        cos (`torch.Tensor`): The cosine part of the rotary embedding.
+        sin (`torch.Tensor`): The sine part of the rotary embedding.
+        position_ids (`torch.Tensor`, *optional*):
+            Deprecated and unused.
+        unsqueeze_dim (`int`, *optional*, defaults to 1):
+            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
+            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
+            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
+            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
+            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
+            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
+    Returns:
+        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
+    """
+    cos = cos.unsqueeze(unsqueeze_dim)
+    sin = sin.unsqueeze(unsqueeze_dim)
+    q_embed = (q * cos) + (rotate_half(q) * sin)
+    k_embed = (k * cos) + (rotate_half(k) * sin)
+    return q_embed, k_embed
+
+
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """
+    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
+    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
+    """
+    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+    if n_rep == 1:
+        return hidden_states
+    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
+    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
+
+
+def eager_attention_forward(
+    module: nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: Optional[torch.Tensor],
+    dropout: float = 0.0,
+    scaling: Optional[float] = None,
+    softcap: Optional[float] = None,
+    **kwargs,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    if scaling is None:
+        scaling = module.head_dim**-0.5
+
+    key_states = repeat_kv(key, module.num_key_value_groups)
+    value_states = repeat_kv(value, module.num_key_value_groups)
+
+    attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
+
+    if softcap is not None:
+        attn_weights = attn_weights / softcap
+        attn_weights = torch.tanh(attn_weights)
+        attn_weights = attn_weights * softcap
+    if attention_mask is not None:  # no matter the length, we just slice it
+        causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
+        attn_weights = attn_weights + causal_mask
+
+    # upcast attention to fp32
+    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
+    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
+    attn_output = torch.matmul(attn_weights, value_states)
+    attn_output = attn_output.transpose(1, 2).contiguous()
+    return attn_output, attn_weights
+
+
+class Gemma3Attention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+    def __init__(self, config: Gemma3TextConfig, layer_idx: int):
+        super().__init__()
+        self.is_sliding = config.layer_types[layer_idx] == "sliding_attention"
+        self.config = config
+        self.layer_idx = layer_idx
+        self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
+        self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
+        self.scaling = config.query_pre_attn_scalar**-0.5
+        self.attention_dropout = self.config.attention_dropout
+        self.is_causal = True
+
+        self.q_proj = nn.Linear(
+            config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.k_proj = nn.Linear(
+            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.v_proj = nn.Linear(
+            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.o_proj = nn.Linear(
+            config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
+        )
+        self.attn_logit_softcapping = self.config.attn_logit_softcapping
+        self.sliding_window = config.sliding_window if self.is_sliding else None
+
+        self.q_norm = Gemma3RMSNorm(dim=config.head_dim, eps=config.rms_norm_eps)
+        self.k_norm = Gemma3RMSNorm(dim=config.head_dim, eps=config.rms_norm_eps)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: torch.Tensor,
+        attention_mask: Optional[torch.Tensor],
+        past_key_values: Optional[Cache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
+        input_shape = hidden_states.shape[:-1]
+        hidden_shape = (*input_shape, -1, self.head_dim)
+
+        query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+        key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+        value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+
+        query_states = self.q_norm(query_states)
+        key_states = self.k_norm(key_states)
+
+        cos, sin = position_embeddings
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+
+        if past_key_values is not None:
+            # sin and cos are specific to RoPE models; cache_position needed for the static cache
+            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        attention_interface: Callable = eager_attention_forward
+        if self.config._attn_implementation != "eager":
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            dropout=self.attention_dropout if self.training else 0.0,
+            scaling=self.scaling,
+            sliding_window=self.sliding_window,
+            **kwargs,
+        )
+
+        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
+        attn_output = self.o_proj(attn_output)
+        return attn_output, attn_weights
+
+
+class Gemma3DecoderLayer(GradientCheckpointingLayer):
+    def __init__(self, config: Gemma3TextConfig, layer_idx: int):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.layer_idx = layer_idx
+        self.attention_type = config.layer_types[layer_idx]
+        self.self_attn = Gemma3Attention(config=config, layer_idx=layer_idx)
+        self.mlp = Gemma3MLP(config)
+        self.input_layernorm = Gemma3RMSNorm(self.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = Gemma3RMSNorm(self.hidden_size, eps=config.rms_norm_eps)
+        self.pre_feedforward_layernorm = Gemma3RMSNorm(self.hidden_size, eps=config.rms_norm_eps)
+        self.post_feedforward_layernorm = Gemma3RMSNorm(self.hidden_size, eps=config.rms_norm_eps)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings_global: torch.Tensor,
+        position_embeddings_local: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        output_attentions: Optional[bool] = False,
+        use_cache: Optional[bool] = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
+    ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
+        residual = hidden_states
+
+        hidden_states = self.input_layernorm(hidden_states)
+
+        # apply global RoPE to non-sliding layer only
+        if self.self_attn.is_sliding:
+            position_embeddings = position_embeddings_local
+        else:
+            position_embeddings = position_embeddings_global
+
+        hidden_states, self_attn_weights = self.self_attn(
+            hidden_states=hidden_states,
+            position_embeddings=position_embeddings,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            output_attentions=output_attentions,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            **kwargs,
+        )
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = residual + hidden_states
+
+        residual = hidden_states
+        hidden_states = self.pre_feedforward_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = self.post_feedforward_layernorm(hidden_states)
+        hidden_states = residual + hidden_states
+
+        outputs = (hidden_states,)
+
+        if output_attentions:
+            outputs += (self_attn_weights,)
+
+        return outputs
+
+
+@auto_docstring
+class Gemma3PreTrainedModel(PreTrainedModel):
+    config: Gemma3Config
+    base_model_prefix = ""
+    supports_gradient_checkpointing = True
+    _no_split_modules = [
+        "Gemma3DecoderLayer",
+        "SiglipVisionEmbeddings",
+        "SiglipEncoderLayer",
+        "SiglipMultiheadAttentionPoolingHead",
+    ]
+    _skip_keys_device_placement = ["past_key_values"]
+    _supports_flash_attn = True
+    _supports_sdpa = True
+    _supports_flex_attn = True
+
+    _can_compile_fullgraph = True
+    _supports_attention_backend = True
+    _can_record_outputs = {
+        "hidden_states": Gemma3DecoderLayer,
+        "attentions": Gemma3Attention,
+    }
+
+    def _init_weights(self, module):
+        super()._init_weights(module)
+        if isinstance(module, Gemma3MultiModalProjector):
+            module.mm_input_projection_weight.data.zero_()
+
+
+@auto_docstring
+class Gemma3TextModel(Gemma3PreTrainedModel):
+    # Text-only decoder stack: scaled token embedding, `config.num_hidden_layers` decoder layers, a final
+    # RMS norm, and two rotary embedding modules (one built from the config as given, one "local").
+    config: Gemma3TextConfig
+
+    def __init__(self, config: Gemma3TextConfig):
+        super().__init__(config)
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+
+        # Gemma3 downcasts the below to bfloat16, causing sqrt(3072)=55.4256 to become 55.5. See https://github.com/huggingface/transformers/pull/29402
+        self.embed_tokens = Gemma3TextScaledWordEmbedding(
+            config.vocab_size, config.hidden_size, self.padding_idx, embed_scale=self.config.hidden_size**0.5
+        )
+        self.layers = nn.ModuleList(
+            [Gemma3DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+        )
+        self.norm = Gemma3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.rotary_emb = Gemma3RotaryEmbedding(config=config)
+        self.gradient_checkpointing = False
+
+        # TODO: raushan fix this after RoPE refactor. For now we hack it by reassigning thetas
+        # when we want to create a local RoPE layer. Config defaults should hold values for global RoPE
+        # The deep copy keeps the overrides below from leaking into the caller's config object.
+        config = copy.deepcopy(config)
+        config.rope_theta = config.rope_local_base_freq
+        config.rope_scaling = {"rope_type": "default"}
+        self.rotary_emb_local = Gemma3RotaryEmbedding(config=config)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @check_model_inputs
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> BaseModelOutputWithPast:
+        # Flags left as `None` fall back to the values stored on the config.
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+
+        # The XOR is true when both inputs are given or when neither is given.
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+        if self.gradient_checkpointing and self.training and use_cache:
+            logger.warning_once(
+                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
+            )
+            use_cache = False
+
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+
+        # A fresh cache is only created when caching is requested, none was passed, and we are not training.
+        if use_cache and past_key_values is None and not self.training:
+            past_key_values = DynamicCache(config=self.config)
+
+        # Default cache positions continue from the number of tokens already held by the cache (0 without one).
+        if cache_position is None:
+            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+            cache_position = torch.arange(
+                past_seen_tokens,
+                past_seen_tokens + inputs_embeds.shape[1],
+                device=inputs_embeds.device,
+            )
+
+        if position_ids is None:
+            position_ids = cache_position.unsqueeze(0)
+
+        # It may already have been prepared by e.g. `generate`
+        # When `attention_mask` is already a dict it is used directly as the per-layer-type mask mapping.
+        if not isinstance(causal_mask_mapping := attention_mask, dict):
+            # Prepare mask arguments
+            mask_kwargs = {
+                "config": self.config,
+                "input_embeds": inputs_embeds,
+                "attention_mask": attention_mask,
+                "cache_position": cache_position,
+                "past_key_values": past_key_values,
+                "position_ids": position_ids,
+            }
+            # Create the masks
+            # Keys match the `attention_type` attribute read from each decoder layer in the loop below.
+            causal_mask_mapping = {
+                "full_attention": create_causal_mask(**mask_kwargs),
+                "sliding_attention": create_sliding_window_causal_mask(**mask_kwargs),
+            }
+
+        # embed positions
+        hidden_states = inputs_embeds
+
+        # create position embeddings to be shared across the decoder layers
+        position_embeddings_global = self.rotary_emb(hidden_states, position_ids)
+        position_embeddings_local = self.rotary_emb_local(hidden_states, position_ids)
+
+        # decoder layers
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+
+        for decoder_layer in self.layers[: self.config.num_hidden_layers]:
+            # Hidden states are recorded before each layer runs; the final normed state is appended after the loop.
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+
+            layer_outputs = decoder_layer(
+                hidden_states,
+                position_embeddings_global=position_embeddings_global,
+                position_embeddings_local=position_embeddings_local,
+                attention_mask=causal_mask_mapping[decoder_layer.attention_type],
+                position_ids=position_ids,
+                past_key_values=past_key_values,
+                output_attentions=output_attentions,
+                use_cache=use_cache,
+                cache_position=cache_position,
+                **kwargs,
+            )
+
+            hidden_states = layer_outputs[0]
+
+            if output_attentions:
+                all_self_attns += (layer_outputs[1],)
+
+        hidden_states = self.norm(hidden_states)
+
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+        )
+
+
+@auto_docstring
+class Gemma3ForCausalLM(Gemma3PreTrainedModel, GenerationMixin):
+    # Wraps `Gemma3TextModel` with a bias-free linear head projecting hidden states to vocabulary logits.
+    _tied_weights_keys = ["lm_head.weight"]
+    _tp_plan = {"lm_head": "colwise_rep"}
+    _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
+    config: Gemma3TextConfig
+    base_model_prefix = "language_model"
+
+    def __init__(self, config: Gemma3TextConfig):
+        super().__init__(config)
+        self.model = Gemma3TextModel(config)
+        self.vocab_size = config.vocab_size
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
+        **kwargs,
+    ) -> CausalLMOutputWithPast:
+        r"""
+        Example:
+
+        ```python
+        >>> from transformers import AutoTokenizer, Gemma3ForCausalLM
+
+        >>> model = Gemma3ForCausalLM.from_pretrained("google/gemma-2-9b")
+        >>> tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b")
+
+        >>> prompt = "What is your favorite condiment?"
+        >>> inputs = tokenizer(prompt, return_tensors="pt")
+
+        >>> # Generate
+        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
+        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "What is your favorite condiment?"
+        ```"""
+
+        # Warn once when training with any attention implementation other than "eager".
+        if self.training and self.config._attn_implementation != "eager":
+            logger.warning_once(
+                "It is strongly recommended to train Gemma3 models with the `eager` attention implementation "
+                f"instead of `{self.config._attn_implementation}`. Use `eager` with `AutoModelForCausalLM.from_pretrained('<path-to-checkpoint>', attn_implementation='eager')`."
+            )
+        # Flags left as `None` fall back to the values stored on the config.
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        # Run the text decoder; the result is read below through its named attributes.
+        outputs: BaseModelOutputWithPast = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            cache_position=cache_position,
+            **kwargs,
+        )
+
+        hidden_states = outputs.last_hidden_state
+        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
+        # An int keeps the last `logits_to_keep` positions; 0 keeps all of them because slice(-0, None) is
+        # slice(0, None). A tensor is used directly as the index along the sequence dimension.
+        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+        logits = self.lm_head(hidden_states[:, slice_indices, :])
+        # Optional soft-capping: logits become cap * tanh(logits / cap).
+        if self.config.final_logit_softcapping is not None:
+            logits = logits / self.config.final_logit_softcapping
+            logits = torch.tanh(logits)
+            logits = logits * self.config.final_logit_softcapping
+
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(logits, labels, self.vocab_size, **kwargs)
+
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+class Gemma3MultiModalProjector(nn.Module):
+    def __init__(self, config: Gemma3Config):
+        super().__init__()
+        # Projection matrix used by `torch.matmul` in `forward`: (vision hidden size, text hidden size).
+        self.mm_input_projection_weight = nn.Parameter(
+            torch.zeros(config.vision_config.hidden_size, config.text_config.hidden_size)
+        )
+        # `Gemma3RMSNorm` sized to the vision hidden size; applied to the pooled features before projecting.
+        self.mm_soft_emb_norm = Gemma3RMSNorm(
+            config.vision_config.hidden_size, eps=config.vision_config.layer_norm_eps
+        )
+        # Side of the patch grid, side of the pooled grid, and the pooling window mapping one onto the other.
+        self.patches_per_image = int(config.vision_config.image_size // config.vision_config.patch_size)
+        self.tokens_per_side = int(config.mm_tokens_per_image**0.5)
+        self.kernel_size = self.patches_per_image // self.tokens_per_side
+        self.avg_pool = nn.AvgPool2d(kernel_size=self.kernel_size, stride=self.kernel_size)
+
+    def forward(self, vision_outputs: torch.Tensor):
+        batch_size, _, seq_length = vision_outputs.shape
+        # NOTE: `seq_length` is the size of the LAST axis of `vision_outputs`; after the transpose it is axis 1.
+        reshaped_vision_outputs = vision_outputs.transpose(1, 2)
+        reshaped_vision_outputs = reshaped_vision_outputs.reshape(
+            batch_size, seq_length, self.patches_per_image, self.patches_per_image
+        )
+        reshaped_vision_outputs = reshaped_vision_outputs.contiguous()
+        # Average-pool the square patch grid, then flatten it back into a token axis and restore the axis order.
+        pooled_vision_outputs = self.avg_pool(reshaped_vision_outputs)
+        pooled_vision_outputs = pooled_vision_outputs.flatten(2)
+        pooled_vision_outputs = pooled_vision_outputs.transpose(1, 2)
+        # Normalize, project with the learned matrix, and return in the dtype of the incoming vision outputs.
+        normed_vision_outputs = self.mm_soft_emb_norm(pooled_vision_outputs)
+
+        projected_vision_outputs = torch.matmul(normed_vision_outputs, self.mm_input_projection_weight)
+        return projected_vision_outputs.type_as(vision_outputs)
+
+
+def token_type_ids_mask_function(
+    token_type_ids: Optional[torch.Tensor],
+    image_group_ids: Optional[torch.Tensor],
+    tokens_per_image: int,
+) -> Optional[Callable]:
+    """
+    Build the `inner_mask(batch_idx, head_idx, q_idx, kv_idx)` callable that is True when the query and key/value
+    tokens are both image tokens (type 1) with the same image group id. Returns None if `token_type_ids` is None.
+    """
+    # Do not return an additional mask in this case
+    if token_type_ids is None:
+        return None
+    # NOTE(review): `tokens_per_image` is accepted but never read below; `image_group_ids` is indexed unguarded.
+    def inner_mask(batch_idx: int, head_idx: int, q_idx: int, kv_idx: int) -> bool:
+        # If it's 1 for both query and key/value, we are in an image block
+        # NOTE: static cache shape goes beyond input seq length, while token_type_ids.shape[1] == input seq length
+        # Since vmap doesn't support `if statement` we workaround it with `torch.where`
+        safe_idx = torch.where(kv_idx < token_type_ids.shape[1], kv_idx, 0)
+        token_type_ids_at_kv_idx = token_type_ids[batch_idx, safe_idx]
+        token_type_ids_at_kv_idx = torch.where(kv_idx < token_type_ids.shape[1], token_type_ids_at_kv_idx, 0)
+        # Same clamped lookup for the image group id; out-of-range kv positions are assigned -1
+        image_group_ids_at_kv_idx = image_group_ids[batch_idx, safe_idx]
+        image_group_ids_at_kv_idx = torch.where(kv_idx < image_group_ids.shape[1], image_group_ids_at_kv_idx, -1)
+        # Both ends must be image tokens AND carry the same group id
+        is_image_block = (token_type_ids[batch_idx, q_idx] == 1) & (token_type_ids_at_kv_idx == 1)
+        same_image_block = image_group_ids[batch_idx, q_idx] == image_group_ids_at_kv_idx
+
+        # This is bidirectional attention whenever we are dealing with image tokens
+        return is_image_block & same_image_block
+
+    return inner_mask
+
+
+@auto_docstring(
+    custom_intro="""
+    The Base Gemma3 model which consists of a vision backbone and a language model without a language modeling head.
+    """
+)
+class Gemma3Model(Gemma3PreTrainedModel):
+    _checkpoint_conversion_mapping = {"language_model.model": "language_model"}
+    # we are filtering the logits/labels so we shouldn't divide the loss based on num_items_in_batch
+    accepts_loss_kwargs = False
+
+    def __init__(self, config: Gemma3Config):
+        super().__init__(config)
+        self.vision_tower = AutoModel.from_config(config=config.vision_config)
+        self.multi_modal_projector = Gemma3MultiModalProjector(config)
+        self.vocab_size = config.text_config.vocab_size
+
+        language_model = AutoModel.from_config(config=config.text_config)
+        self.language_model = language_model
+
+        self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.language_model.get_input_embeddings()
+
+    def set_input_embeddings(self, value):
+        self.language_model.set_input_embeddings(value)
+
+    def set_decoder(self, decoder):
+        self.language_model = decoder
+
+    def get_decoder(self):
+        return self.language_model
+
+    def get_image_features(self, pixel_values: torch.Tensor) -> torch.Tensor:
+        """
+        Projects the last hidden state from the vision model into language model space.
+
+        Args:
+            pixel_values (`torch.FloatTensor` of shape `(batch_size, channels, height, width)`)
+               The tensors corresponding to the input images.
+        Returns:
+            image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`.
+        """
+        vision_outputs = self.vision_tower(pixel_values=pixel_values).last_hidden_state
+        image_features = self.multi_modal_projector(vision_outputs)
+        return image_features
+
+    def get_placeholder_mask(
+        self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor, image_features: torch.FloatTensor
+    ):
+        """
+        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
+        equal to the length of multimodal features. If the lengths are different, an error is raised.
+        """
+        if input_ids is None:
+            special_image_mask = inputs_embeds == self.get_input_embeddings()(
+                torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device)
+            )
+            special_image_mask = special_image_mask.all(-1)
+        else:
+            special_image_mask = input_ids == self.config.image_token_id
+        # Count placeholders first, then broadcast the per-token mask over the embedding axis of `inputs_embeds`
+        n_image_tokens = special_image_mask.sum()
+        special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
+        n_image_features = image_features.shape[0] * image_features.shape[1]
+        if inputs_embeds[special_image_mask].numel() != image_features.numel():
+            raise ValueError(
+                f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
+            )
+        return special_image_mask
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        pixel_values: torch.FloatTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Union[list[torch.FloatTensor], Cache]] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        **lm_kwargs,
+    ) -> Union[tuple, Gemma3ModelOutputWithPast]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+            config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.
+
+        Example:
+
+        ```python
+        >>> from PIL import Image
+        >>> import requests
+        >>> from transformers import AutoProcessor, Gemma3ForConditionalGeneration
+
+        >>> model = Gemma3ForConditionalGeneration.from_pretrained("google/gemma-3-4b-it")
+        >>> processor = AutoProcessor.from_pretrained("google/gemma-3-4b-it")
+
+        >>> prompt = "Where is the cat standing?"
+        >>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> inputs = processor(images=image, text=prompt,  return_tensors="pt")
+
+        >>> # Generate
+        >>> generate_ids = model.generate(**inputs,)
+        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "Where is the cat standing?\nsnow"
+        ```"""
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+        # NOTE(review): `labels` is not read anywhere in this method body.
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # Replace the image token id with 0 if the image token is OOV, to avoid index errors in the embedding lookup
+        if input_ids is not None and self.config.image_token_id >= self.vocab_size:
+            special_image_mask = input_ids == self.config.image_token_id
+            llm_input_ids = input_ids.clone()
+            llm_input_ids[special_image_mask] = 0
+        else:
+            llm_input_ids = input_ids
+
+        if inputs_embeds is None:
+            inputs_embeds = self.get_input_embeddings()(llm_input_ids)
+
+        if cache_position is None:
+            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+            cache_position = torch.arange(
+                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
+            )
+
+        # Merge text and images
+        if pixel_values is not None:
+            image_features = self.get_image_features(pixel_values)
+            image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
+            special_image_mask = self.get_placeholder_mask(
+                input_ids, inputs_embeds=inputs_embeds, image_features=image_features
+            )
+            inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
+
+        # It may already have been prepared by e.g. `generate`
+        if not isinstance(causal_mask_mapping := attention_mask, dict):
+            # Prepare mask arguments
+            mask_kwargs = {
+                "config": self.config.get_text_config(),
+                "input_embeds": inputs_embeds,
+                "attention_mask": attention_mask,
+                "cache_position": cache_position,
+                "past_key_values": past_key_values,
+                "position_ids": position_ids,
+            }
+            if token_type_ids is not None and inputs_embeds.shape[1] != 1:
+                # We need to pass an additional mask function to account for token type ids, and it needs to be an `or`
+
+                # First find where a new image block starts: 1 if image and previous not image
+                # The images cannot attend to future images, but can attend to all prev images and to itself bidirectionally
+                is_image = (token_type_ids == 1).to(cache_position.device)
+                new_image_start = is_image & ~nn.functional.pad(is_image, (1, 0), value=0)[:, :-1]
+                image_group_ids = torch.cumsum(new_image_start.int(), dim=1) - 1
+                image_group_ids = torch.where(
+                    is_image, image_group_ids, torch.full_like(token_type_ids, -1, device=is_image.device)
+                )
+                mask_kwargs["or_mask_function"] = token_type_ids_mask_function(
+                    token_type_ids.to(cache_position.device), image_group_ids, self.config.mm_tokens_per_image
+                )
+
+            # Create the masks
+            causal_mask_mapping = {
+                "full_attention": create_causal_mask(**mask_kwargs),
+                "sliding_attention": create_sliding_window_causal_mask(**mask_kwargs),
+            }
+
+        outputs = self.language_model(
+            attention_mask=causal_mask_mapping,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=True,
+            cache_position=cache_position,
+            **lm_kwargs,
+        )
+
+        return Gemma3ModelOutputWithPast(
+            last_hidden_state=outputs.last_hidden_state,
+            past_key_values=outputs.past_key_values if use_cache else None,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            image_hidden_states=image_features if pixel_values is not None else None,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    The Gemma3 model which consists of a vision backbone and a language model with a language modeling head.
+    """
+)
+class Gemma3ForConditionalGeneration(Gemma3PreTrainedModel, GenerationMixin):
+    _checkpoint_conversion_mapping = {
+        "^language_model.model": "model.language_model",
+        "^vision_tower": "model.vision_tower",
+        "^multi_modal_projector": "model.multi_modal_projector",
+        "^language_model.lm_head": "lm_head",
+    }
+    _tied_weights_keys = ["lm_head.weight"]
+
+    def __init__(self, config: Gemma3Config):
+        super().__init__(config)
+        self.model = Gemma3Model(config)
+        self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.model.get_input_embeddings()
+
+    def set_input_embeddings(self, value):
+        self.model.set_input_embeddings(value)
+
+    def set_decoder(self, decoder):
+        self.model.set_decoder(decoder)
+
+    def get_decoder(self):
+        return self.model.get_decoder()
+
+    def get_image_features(self, pixel_values):
+        return self.model.get_image_features(pixel_values)
+
+    # Make modules available through conditional class for BC
+    @property
+    def language_model(self):
+        return self.model.language_model
+
+    @property
+    def vision_tower(self):
+        return self.model.vision_tower
+
+    @property
+    def multi_modal_projector(self):
+        return self.model.multi_modal_projector
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        pixel_values: torch.FloatTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Union[list[torch.FloatTensor], Cache]] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
+        **lm_kwargs,
+    ) -> Union[tuple, Gemma3CausalLMOutputWithPast]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+            config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.
+
+        Example:
+
+        ```python
+        >>> from PIL import Image
+        >>> import requests
+        >>> from transformers import AutoProcessor, Gemma3ForConditionalGeneration
+
+        >>> model = Gemma3ForConditionalGeneration.from_pretrained("google/gemma-3-4b-it")
+        >>> processor = AutoProcessor.from_pretrained("google/gemma-3-4b-it")
+
+        >>> messages = [
+        ...     {
+        ...         "role": "system",
+        ...         "content": [
+        ...             {"type": "text", "text": "You are a helpful assistant."}
+        ...         ]
+        ...     },
+        ...     {
+        ...         "role": "user", "content": [
+        ...             {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"},
+        ...             {"type": "text", "text": "Where is the cat standing?"},
+        ...         ]
+        ...     },
+        ... ]
+
+        >>> inputs = processor.apply_chat_template(
+        ...     messages,
+        ...     tokenize=True,
+        ...     return_dict=True,
+        ...     return_tensors="pt",
+        ...     add_generation_prompt=True
+        ... )
+        >>> # Generate
+        >>> generate_ids = model.generate(**inputs)
+        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "user\nYou are a helpful assistant.\n\n\n\n\n\nWhere is the cat standing?\nmodel\nBased on the image, the cat is standing in a snowy area, likely outdoors. It appears to"
+        ```
+        """
+
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.model(
+            input_ids=input_ids,
+            pixel_values=pixel_values,
+            token_type_ids=token_type_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            labels=labels,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            cache_position=cache_position,
+            **lm_kwargs,
+        )
+
+        hidden_states = outputs[0]
+        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
+        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+        logits = self.lm_head(hidden_states[:, slice_indices, :])
+
+        loss = None
+        if labels is not None:
+            # Upcast to float if we need to compute the loss to avoid potential precision issues
+            logits = logits.float()
+            shift_logits = logits[..., :-1, :]
+            shift_labels = labels[..., 1:]
+            if attention_mask is not None:
+                # we use the input attention mask to shift the logits and labels, because it is 2D.
+                # we also crop attn mask in case it is longer, which happens in PrefixTuning with peft
+                shift_attention_mask = attention_mask[:, -shift_logits.shape[1] :].to(logits.device)
+                shift_logits = shift_logits[shift_attention_mask.to(logits.device) != 0].contiguous()
+                shift_labels = shift_labels[shift_attention_mask.to(shift_labels.device) != 0].contiguous()
+            else:
+                shift_logits = shift_logits.contiguous()
+                shift_labels = shift_labels.contiguous()
+            # Flatten the tokens
+            loss_fct = nn.CrossEntropyLoss()
+
+            flat_logits = shift_logits.view(-1, self.config.text_config.vocab_size)
+            flat_labels = shift_labels.view(-1).to(shift_logits.device)
+            loss = loss_fct(flat_logits, flat_labels)
+
+        if not return_dict:
+            output = (logits,) + outputs[1:]
+            return (loss,) + output if loss is not None else output
+
+        return Gemma3CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            image_hidden_states=outputs.image_hidden_states,
+        )
+
+    def prepare_inputs_for_generation(
+        self,
+        input_ids,
+        past_key_values=None,
+        inputs_embeds=None,
+        cache_position=None,
+        position_ids=None,
+        pixel_values=None,
+        attention_mask=None,
+        token_type_ids=None,
+        use_cache=True,
+        logits_to_keep=None,
+        labels=None,
+        **kwargs,
+    ):
+        # Overwritten -- custom `position_ids` and `pixel_values` handling
+        model_inputs = super().prepare_inputs_for_generation(
+            input_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            cache_position=cache_position,
+            use_cache=use_cache,
+            logits_to_keep=logits_to_keep,
+            token_type_ids=token_type_ids,
+            **kwargs,
+        )
+
+        # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore
+        # Otherwise we need pixel values to be passed to model. NOTE: use_cache=False needs pixel_values always
+        if cache_position[0] == 0:
+            model_inputs["pixel_values"] = pixel_values
+
+        return model_inputs
+
+    @staticmethod
+    def create_masks_for_generate(
+        config: PretrainedConfig,
+        input_embeds: torch.Tensor,
+        attention_mask: Optional[torch.Tensor],
+        cache_position: torch.Tensor,
+        past_key_values: Optional[Cache],
+        position_ids: Optional[torch.Tensor],
+        token_type_ids: Optional[torch.Tensor] = None,
+        **kwargs,
+    ) -> dict:
+        # Prepare mask arguments
+        mask_kwargs = {
+            "config": config.get_text_config(),
+            "input_embeds": input_embeds,
+            "attention_mask": attention_mask,
+            "cache_position": cache_position,
+            "past_key_values": past_key_values,
+            "position_ids": position_ids,
+        }
+        # Add the token type ids mask for generate as well
+        if token_type_ids is not None and input_embeds.shape[1] != 1:
+            # We need to pass an additional mask function to account for token type ids, and it needs to be an `or`
+            # Every tensor below lives on `cache_position.device`, so `torch.where` never mixes devices
+            # First find where a new image block starts: 1 if image and previous not image
+            # The images cannot attend to future images, but can attend to all prev images and to itself bidirectionally
+            is_image = (token_type_ids == 1).to(cache_position.device)
+            new_image_start = is_image & ~nn.functional.pad(is_image, (1, 0), value=0)[:, :-1]
+            image_group_ids = torch.cumsum(new_image_start.int(), dim=1) - 1
+            image_group_ids = torch.where(is_image, image_group_ids, torch.full_like(image_group_ids, -1))
+            mask_kwargs["or_mask_function"] = token_type_ids_mask_function(
+                token_type_ids.to(cache_position.device), image_group_ids, config.mm_tokens_per_image
+            )
+
+        return create_masks_for_generate(**mask_kwargs)
+
+
+class Gemma3ForSequenceClassification(Gemma3PreTrainedModel):
+    _checkpoint_conversion_mapping = {
+        "^language_model.model": "model.language_model",
+        "^vision_tower": "model.vision_tower",
+        "^multi_modal_projector": "model.multi_modal_projector",
+    }
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+        self.model = Gemma3Model(config)
+        self.score = nn.Linear(config.text_config.hidden_size, self.num_labels, bias=False)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.model.get_input_embeddings()
+
+    def set_input_embeddings(self, value):
+        self.model.set_input_embeddings(value)
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> SequenceClassifierOutputWithPast:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+        """
+
+        transformer_outputs = self.model(
+            input_ids,
+            attention_mask=attention_mask,
+            pixel_values=pixel_values,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            token_type_ids=token_type_ids,
+            use_cache=use_cache,
+            **kwargs,
+        )
+        hidden_states = transformer_outputs.last_hidden_state
+        logits = self.score(hidden_states)
+
+        if input_ids is not None:
+            batch_size = input_ids.shape[0]
+        else:
+            batch_size = inputs_embeds.shape[0]
+        # Choose which position's logits represent each sequence; the pad id is read from `text_config`
+        if self.config.text_config.pad_token_id is None and batch_size != 1:
+            raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
+        if self.config.text_config.pad_token_id is None:
+            last_non_pad_token = -1
+        elif input_ids is not None:
+            # To handle both left- and right- padding, we take the rightmost token that is not equal to pad_token_id
+            non_pad_mask = (input_ids != self.config.text_config.pad_token_id).to(logits.device, torch.int32)
+            token_indices = torch.arange(input_ids.shape[-1], device=logits.device, dtype=torch.int32)
+            last_non_pad_token = (token_indices * non_pad_mask).argmax(-1)
+        else:
+            last_non_pad_token = -1
+            logger.warning_once(
+                f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
+                "unexpected if using padding tokens in conjunction with `inputs_embeds.`"
+            )
+        # Gather one row of logits per batch element at the selected position
+        pooled_logits = logits[torch.arange(batch_size, device=logits.device), last_non_pad_token]
+
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(logits=logits, labels=labels, pooled_logits=pooled_logits, config=self.config)
+
+        return SequenceClassifierOutputWithPast(
+            loss=loss,
+            logits=pooled_logits,
+            past_key_values=transformer_outputs.past_key_values,
+            hidden_states=transformer_outputs.hidden_states,
+            attentions=transformer_outputs.attentions,
+        )
+
+
+__all__ = [  # names this module lists as its public exports
+    "Gemma3PreTrainedModel",
+    "Gemma3TextModel",
+    "Gemma3ForCausalLM",
+    "Gemma3ForConditionalGeneration",
+    "Gemma3Model",
+    "Gemma3ForSequenceClassification",
+]
diff --git a/phivenv/Lib/site-packages/transformers/models/gemma3/modular_gemma3.py b/phivenv/Lib/site-packages/transformers/models/gemma3/modular_gemma3.py
new file mode 100644
index 0000000000000000000000000000000000000000..6e06671ea0bbd983714191e5cb97b01ac41a7212
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/gemma3/modular_gemma3.py
@@ -0,0 +1,1178 @@
+# coding=utf-8
+# Copyright 2025 Google Inc. HuggingFace Inc. team. All rights reserved.
+#
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import copy
+import warnings
+from collections.abc import Callable
+from typing import Any, Optional, Union
+
+import torch
+import torch.nn as nn
+import torch.utils.checkpoint
+
+from ...cache_utils import Cache, DynamicCache
+from ...configuration_utils import PretrainedConfig, layer_type_validation
+from ...masking_utils import create_causal_mask, create_masks_for_generate, create_sliding_window_causal_mask
+from ...modeling_flash_attention_utils import FlashAttentionKwargs
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import BaseModelOutputWithPast, SequenceClassifierOutputWithPast
+from ...modeling_rope_utils import rope_config_validation
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from ...processing_utils import Unpack
+from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging
+from ...utils.deprecation import deprecate_kwarg
+from ..gemma2.configuration_gemma2 import Gemma2Config
+from ..gemma2.modeling_gemma2 import (
+    Gemma2Attention,
+    Gemma2ForCausalLM,
+    Gemma2MLP,
+    Gemma2Model,
+    Gemma2PreTrainedModel,
+    Gemma2RMSNorm,
+    Gemma2RotaryEmbedding,
+    apply_rotary_pos_emb,
+    eager_attention_forward,
+)
+from ..paligemma.modeling_paligemma import (
+    PaligemmaCausalLMOutputWithPast,
+    PaliGemmaForConditionalGeneration,
+    PaliGemmaModel,
+    PaligemmaModelOutputWithPast,
+)
+from ..siglip import SiglipVisionConfig
+
+
+logger = logging.get_logger(__name__)  # module-level logger used by `Gemma3Config` and `Gemma3TextModel` below
+
+
+class Gemma3TextConfig(Gemma2Config, PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Gemma3TextModel`]. It is used to instantiate a Gemma3Text
+    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+    defaults will yield a similar configuration to that of the Gemma3Text-7B.
+    e.g. [google/gemma3_text-7b](https://huggingface.co/google/gemma3_text-7b)
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+    Args:
+        vocab_size (`int`, *optional*, defaults to 262208):
+            Vocabulary size of the Gemma3Text model. Defines the number of different tokens that can be represented by the
+            `input_ids` passed when calling [`Gemma3TextModel`]
+        hidden_size (`int`, *optional*, defaults to 2304):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 9216):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 26):
+            Number of hidden layers in the Transformer decoder.
+        num_attention_heads (`int`, *optional*, defaults to 8):
+            Number of attention heads for each attention layer in the Transformer decoder.
+        num_key_value_heads (`int`, *optional*, defaults to 4):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details, check out [this
+            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
+            `num_attention_heads`.
+        head_dim (`int`, *optional*, defaults to 256):
+            The attention head dimension.
+        hidden_activation (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
+            The non-linear activation function (function or string) in the decoder. Will default to `"gelu_pytorch_tanh"`
+            if not specified. `"gelu_pytorch_tanh"` uses an approximation of the `"gelu"` activation function.
+        max_position_embeddings (`int`, *optional*, defaults to 131072):
+            The maximum sequence length that this model might ever be used with.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        pad_token_id (`int`, *optional*, defaults to 0):
+            Padding token id.
+        eos_token_id (`int`, *optional*, defaults to 1):
+            End of stream token id.
+        bos_token_id (`int`, *optional*, defaults to 2):
+            Beginning of stream token id.
+        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
+            Whether to tie weight embeddings
+        rope_theta (`float`, *optional*, defaults to 1000000.0):
+            The base period of the RoPE embeddings.
+        attention_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use a bias in the query, key, value and output projection layers during self-attention.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        query_pre_attn_scalar (`float`, *optional*, defaults to 256):
+            Scaling factor used on the attention scores
+        sliding_window (`int`, *optional*, defaults to 4096):
+            Size of the sliding window used by the layers whose `layer_types` entry is `"sliding_attention"`.
+        layer_types (`list`, *optional*):
+            Attention pattern for each layer. Defaults to `"full_attention"` for every 6th layer and `"sliding_attention"` otherwise.
+        final_logit_softcapping (`float`, *optional*):
+            Scaling factor when applying tanh softcapping on the logits.
+        attn_logit_softcapping (`float`, *optional*):
+            Scaling factor when applying tanh softcapping on the attention scores.
+        rope_scaling (`Dict`, *optional*):
+            Dictionary containing the scaling configuration for the RoPE embeddings used in global attention. NOTE: if you apply new rope type
+            and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
+            accordingly.
+            Expected contents:
+                `rope_type` (`str`):
+                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
+                    'llama3'], with 'default' being the original RoPE implementation.
+                `factor` (`float`, *optional*):
+                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
+                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
+                    original maximum pre-trained length.
+                `original_max_position_embeddings` (`int`, *optional*):
+                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
+                    pretraining.
+                `attention_factor` (`float`, *optional*):
+                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
+                    computation. If unspecified, it defaults to value recommended by the implementation, using the
+                    `factor` field to infer the suggested value.
+                `beta_fast` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 32.
+                `beta_slow` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 1.
+                `short_factor` (`list[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `long_factor` (`list[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to long contexts (>
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `low_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
+                `high_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
+        rope_local_base_freq (`float`, *optional*, defaults to 10000.0):
+            The base period of the RoPE embeddings for local attention.
+
+    ```python
+    >>> from transformers import Gemma3TextModel, Gemma3TextConfig
+    >>> # Initializing a Gemma3Text gemma3_text-7b style configuration
+    >>> configuration = Gemma3TextConfig()
+    >>> # Initializing a model from the gemma3_text-7b style configuration
+    >>> model = Gemma3TextModel(configuration)
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```
+    """
+
+    model_type = "gemma3_text"
+
+    def __init__(
+        self,
+        vocab_size=262_208,
+        hidden_size=2304,
+        intermediate_size=9216,
+        num_hidden_layers=26,
+        num_attention_heads=8,
+        num_key_value_heads=4,
+        head_dim=256,
+        hidden_activation="gelu_pytorch_tanh",
+        max_position_embeddings=131_072,
+        initializer_range=0.02,
+        rms_norm_eps=1e-6,
+        use_cache=True,
+        pad_token_id=0,
+        eos_token_id=1,
+        bos_token_id=2,
+        tie_word_embeddings=True,
+        rope_theta=1_000_000.0,
+        attention_bias=False,
+        attention_dropout=0.0,
+        query_pre_attn_scalar=256,
+        sliding_window=4096,
+        layer_types=None,
+        final_logit_softcapping=None,
+        attn_logit_softcapping=None,
+        rope_scaling=None,
+        rope_local_base_freq=10_000.0,
+        **kwargs,
+    ):
+        # Unbound call to the base initialiser (skips `Gemma2Config.__init__`), so `self` must be passed explicitly.
+        PretrainedConfig.__init__(
+            self,
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.head_dim = head_dim
+        self.num_key_value_heads = num_key_value_heads
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+        self.hidden_activation = hidden_activation
+        self.query_pre_attn_scalar = query_pre_attn_scalar
+        self.sliding_window = sliding_window
+        self.final_logit_softcapping = final_logit_softcapping
+        self.attn_logit_softcapping = attn_logit_softcapping
+        self.layer_types = layer_types
+        self.rope_local_base_freq = rope_local_base_freq
+        self.rope_scaling = rope_scaling
+        rope_config_validation(self)
+
+        # BC -> the pattern used to be a simple int, and it's still present in configs on the Hub
+        self._sliding_window_pattern = kwargs.get("sliding_window_pattern", 6)
+        if self.layer_types is None:
+            self.layer_types = [
+                "sliding_attention" if bool((i + 1) % self._sliding_window_pattern) else "full_attention"
+                for i in range(self.num_hidden_layers)
+            ]
+        layer_type_validation(self.layer_types)
+
+    @property
+    def sliding_window_pattern(self):
+        warnings.warn(
+            "The `sliding_window_pattern` attribute is deprecated and will be removed in v4.55.0.",
+            FutureWarning,
+        )
+        return self._sliding_window_pattern
+
+    @sliding_window_pattern.setter
+    def sliding_window_pattern(self, value):
+        self._sliding_window_pattern = value
+
+
+class Gemma3Config(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Gemma3ForConditionalGeneration`]. It is used to instantiate a
+    Gemma3ForConditionalGeneration according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the PaliGemma-2B.
+
+    e.g. [google/gemma-3-4b](https://huggingface.co/google/gemma-3-4b)
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        text_config (`Union[Gemma3TextConfig, dict]`, *optional*):
+            The config object of the text backbone.
+        vision_config (`Union[AutoConfig, dict]`,  *optional*):
+            Custom vision config or dict.
+        mm_tokens_per_image (`int`, *optional*, defaults to 256):
+            The number of tokens per image embedding.
+        boi_token_index (`int`, *optional*, defaults to 255999):
+            The begin-of-image token index to wrap the image prompt.
+        eoi_token_index (`int`, *optional*, defaults to 256000):
+            The end-of-image token index to wrap the image prompt.
+        image_token_index (`int`, *optional*, defaults to 262144):
+            The image token index to encode the image prompt.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+
+
+    Example:
+
+    ```python
+    >>> from transformers import Gemma3ForConditionalGeneration, Gemma3Config, SiglipVisionConfig, Gemma3TextConfig
+
+    >>> # Initializing a Siglip-like vision config
+    >>> vision_config = SiglipVisionConfig()
+
+    >>> # Initializing a Gemma3 Text config
+    >>> text_config = Gemma3TextConfig()
+
+    >>> # Initializing a Gemma3 gemma-3-4b style configuration
+    >>> configuration = Gemma3Config(text_config=text_config, vision_config=vision_config)
+
+    >>> # Initializing a model from the gemma-3-4b style configuration
+    >>> model = Gemma3ForConditionalGeneration(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "gemma3"
+    attribute_map = {
+        "image_token_id": "image_token_index",
+        "boi_token_id": "boi_token_index",
+        "eoi_token_id": "eoi_token_index",
+    }
+    sub_configs = {
+        "text_config": Gemma3TextConfig,
+        "vision_config": SiglipVisionConfig,
+    }
+
+    def __init__(
+        self,
+        text_config: Optional[Union[Gemma3TextConfig, dict[str, Any]]] = None,
+        vision_config: Optional[Union[SiglipVisionConfig, dict[str, Any]]] = None,
+        mm_tokens_per_image: int = 256,
+        boi_token_index: int = 255_999,
+        eoi_token_index: int = 256_000,
+        image_token_index: int = 262_144,
+        initializer_range: float = 0.02,
+        **kwargs,
+    ):
+        # Sub-configs: a dict is expanded into the config class, `None` falls back to that class's defaults.
+        if isinstance(text_config, dict):
+            text_config = Gemma3TextConfig(**text_config)
+        elif text_config is None:
+            text_config = Gemma3TextConfig()
+            logger.info("text_config is None, using default Gemma3TextConfig text config.")
+        if vision_config is None:
+            vision_config = SiglipVisionConfig()
+            logger.info("vision_config is None, using default SiglipVisionConfig vision config.")
+        elif isinstance(vision_config, dict):
+            vision_config = SiglipVisionConfig(**vision_config)
+
+        self.text_config = text_config
+        self.vision_config = vision_config
+        self.mm_tokens_per_image = mm_tokens_per_image
+        self.boi_token_index = boi_token_index
+        self.eoi_token_index = eoi_token_index
+        self.image_token_index = image_token_index
+        self.initializer_range = initializer_range
+
+        super().__init__(**kwargs)
+
+
+class Gemma3ModelOutputWithPast(PaligemmaModelOutputWithPast):
+    """Gemma3-named subclass of `PaligemmaModelOutputWithPast`; adds no fields or behaviour of its own."""
+
+
+class Gemma3CausalLMOutputWithPast(PaligemmaCausalLMOutputWithPast):
+    """Gemma3-named subclass of `PaligemmaCausalLMOutputWithPast`; adds no fields or behaviour of its own."""
+
+
+class Gemma3TextScaledWordEmbedding(nn.Embedding):
+    """`nn.Embedding` whose looked-up vectors are multiplied by a fixed `embed_scale`."""
+
+    def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: int, embed_scale: float = 1.0):
+        super().__init__(num_embeddings, embedding_dim, padding_idx)
+        # Registered as a non-persistent buffer: it is kept on the module but not saved in the state dict.
+        self.register_buffer("embed_scale", torch.tensor(embed_scale), persistent=False)
+
+    def forward(self, input_ids: torch.Tensor):
+        scale = self.embed_scale.to(self.weight.dtype)  # match the embedding dtype before multiplying
+        return super().forward(input_ids) * scale
+
+
+class Gemma3MLP(Gemma2MLP):  # `Gemma2MLP` reused as-is; only the config annotation is narrowed
+    def __init__(self, config: Gemma3TextConfig):
+        super().__init__(config)
+
+
+class Gemma3RMSNorm(Gemma2RMSNorm):  # `Gemma2RMSNorm` reused as-is; arguments are passed straight through
+    def __init__(self, dim: int, eps: float = 1e-6):
+        super().__init__(dim=dim, eps=eps)
+
+
+class Gemma3RotaryEmbedding(Gemma2RotaryEmbedding):  # `Gemma2RotaryEmbedding` driven by a Gemma3 text config
+    def __init__(self, config: Gemma3TextConfig, device=None):
+        super().__init__(config, device=device)  # forward `device` instead of silently dropping it
+
+
+# Unusual inheritance order: otherwise the sliding window gets defined first and can't access `is_sliding`
+class Gemma3Attention(Gemma2Attention):
+    def __init__(self, config: Gemma3TextConfig, layer_idx: int):
+        # Assigned before the parent constructor runs (see the note above the class).
+        self.is_sliding = config.layer_types[layer_idx] == "sliding_attention"
+        super().__init__(config, layer_idx)
+        self.sliding_window = config.sliding_window if self.is_sliding else None
+        # RMS norms of width `head_dim`, applied to queries and keys in `forward` before RoPE.
+        self.q_norm = Gemma3RMSNorm(dim=config.head_dim, eps=config.rms_norm_eps)
+        self.k_norm = Gemma3RMSNorm(dim=config.head_dim, eps=config.rms_norm_eps)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: torch.Tensor,
+        attention_mask: Optional[torch.Tensor],
+        past_key_values: Optional[Cache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        """Attention with q/k RMS norm applied ahead of RoPE; returns `(attn_output, attn_weights)`."""
+        lead_dims = hidden_states.shape[:-1]
+        heads_view = (*lead_dims, -1, self.head_dim)
+
+        # Project, split the last axis into (heads, head_dim), then swap axes 1 and 2.
+        query_states, key_states, value_states = (
+            proj(hidden_states).view(heads_view).transpose(1, 2)
+            for proj in (self.q_proj, self.k_proj, self.v_proj)
+        )
+        query_states, key_states = self.q_norm(query_states), self.k_norm(key_states)
+
+        cos, sin = position_embeddings
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+
+        if past_key_values is not None:
+            # sin and cos are specific to RoPE models; cache_position needed for the static cache
+            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        # Any implementation other than "eager" is looked up in `ALL_ATTENTION_FUNCTIONS`.
+        impl = self.config._attn_implementation
+        attention_interface: Callable = eager_attention_forward if impl == "eager" else ALL_ATTENTION_FUNCTIONS[impl]
+
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            dropout=self.attention_dropout if self.training else 0.0,
+            scaling=self.scaling,
+            sliding_window=self.sliding_window,
+            **kwargs,
+        )
+
+        attn_output = self.o_proj(attn_output.reshape(*lead_dims, -1).contiguous())
+        return attn_output, attn_weights
+
+
+class Gemma3DecoderLayer(GradientCheckpointingLayer):
+    def __init__(self, config: Gemma3TextConfig, layer_idx: int):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.layer_idx = layer_idx
+        self.attention_type = config.layer_types[layer_idx]
+        self.self_attn = Gemma3Attention(config=config, layer_idx=layer_idx)
+        self.mlp = Gemma3MLP(config)
+        self.input_layernorm = Gemma3RMSNorm(self.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = Gemma3RMSNorm(self.hidden_size, eps=config.rms_norm_eps)
+        self.pre_feedforward_layernorm = Gemma3RMSNorm(self.hidden_size, eps=config.rms_norm_eps)
+        self.post_feedforward_layernorm = Gemma3RMSNorm(self.hidden_size, eps=config.rms_norm_eps)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings_global: torch.Tensor,
+        position_embeddings_local: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        output_attentions: Optional[bool] = False,
+        use_cache: Optional[bool] = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
+    ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
+        """
+        Run one decoder block: the attention sub-block followed by the feed-forward sub-block.
+
+        Both sub-blocks normalise their input and their output before the residual addition.
+        `position_ids`, `output_attentions`, `use_cache` and any extra `kwargs` are handed to the
+        attention module unchanged.
+
+        Returns:
+            `(hidden_states,)`, or `(hidden_states, attention_weights)` when `output_attentions` is truthy.
+        """
+        # Sliding-window layers use the local position embeddings, all other layers the global ones.
+        position_embeddings = position_embeddings_local if self.self_attn.is_sliding else position_embeddings_global
+
+        # Attention sub-block: norm -> attention -> norm -> residual add.
+        attn_input = self.input_layernorm(hidden_states)
+        attn_output, self_attn_weights = self.self_attn(
+            hidden_states=attn_input,
+            position_embeddings=position_embeddings,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            output_attentions=output_attentions,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            **kwargs,
+        )
+        hidden_states = hidden_states + self.post_attention_layernorm(attn_output)
+
+        # Feed-forward sub-block: norm -> MLP -> norm -> residual add.
+        mlp_output = self.mlp(self.pre_feedforward_layernorm(hidden_states))
+        hidden_states = hidden_states + self.post_feedforward_layernorm(mlp_output)
+
+        outputs = (hidden_states,)
+        if output_attentions:
+            outputs += (self_attn_weights,)
+        return outputs
+
+
+GEMMA3_START_DOCSTRING = None  # NOTE(review): set to None and not referenced in the visible part of this file
+
+
+class Gemma3PreTrainedModel(Gemma2PreTrainedModel):
+    base_model_prefix = ""
+    _no_split_modules = [
+        "Gemma3DecoderLayer",
+        "SiglipVisionEmbeddings",
+        "SiglipEncoderLayer",
+        "SiglipMultiheadAttentionPoolingHead",
+    ]
+
+    def _init_weights(self, module):
+        PreTrainedModel._init_weights(self, module)  # explicit base call; skips any `Gemma2PreTrainedModel` override
+        if isinstance(module, Gemma3MultiModalProjector):
+            module.mm_input_projection_weight.data.zero_()  # the projector matrix is reset to all zeros
+
+
+class Gemma3TextModel(Gemma2Model):  # `Gemma2Model` with scaled word embeddings and an extra local RoPE module
+    config: Gemma3TextConfig
+
+    def __init__(self, config: Gemma3TextConfig):
+        super().__init__(config)
+
+        # Gemma3 downcasts the below to bfloat16, causing sqrt(3072)=55.4256 to become 55.5. See https://github.com/huggingface/transformers/pull/29402
+        self.embed_tokens = Gemma3TextScaledWordEmbedding(
+            config.vocab_size, config.hidden_size, self.padding_idx, embed_scale=self.config.hidden_size**0.5
+        )
+
+        # TODO: raushan fix this after RoPE refactor. For now we hack it by reassigning thetas
+        # when we want to create a local RoPE layer. Config defaults should hold values for global RoPE
+        config = copy.deepcopy(config)  # mutate a private copy only; the caller's config object is left untouched
+        config.rope_theta = config.rope_local_base_freq
+        config.rope_scaling = {"rope_type": "default"}
+        self.rotary_emb_local = Gemma3RotaryEmbedding(config=config)
+
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> BaseModelOutputWithPast:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+        if self.gradient_checkpointing and self.training and use_cache:
+            logger.warning_once(
+                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
+            )
+            use_cache = False
+
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+        # Outside of training, create a cache on the fly when caching was requested but none was passed in.
+        if use_cache and past_key_values is None and not self.training:
+            past_key_values = DynamicCache(config=self.config)
+        # Default cache positions continue from the number of tokens the cache already holds (0 without a cache).
+        if cache_position is None:
+            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+            cache_position = torch.arange(
+                past_seen_tokens,
+                past_seen_tokens + inputs_embeds.shape[1],
+                device=inputs_embeds.device,
+            )
+
+        if position_ids is None:
+            position_ids = cache_position.unsqueeze(0)
+
+        # `attention_mask` may already be a dict of masks keyed by layer type, prepared by e.g. `generate`
+        if not isinstance(causal_mask_mapping := attention_mask, dict):
+            # Prepare mask arguments
+            mask_kwargs = {
+                "config": self.config,
+                "input_embeds": inputs_embeds,
+                "attention_mask": attention_mask,
+                "cache_position": cache_position,
+                "past_key_values": past_key_values,
+                "position_ids": position_ids,
+            }
+            # Create the masks
+            causal_mask_mapping = {
+                "full_attention": create_causal_mask(**mask_kwargs),
+                "sliding_attention": create_sliding_window_causal_mask(**mask_kwargs),
+            }
+
+        # the input embeddings are the initial hidden states
+        hidden_states = inputs_embeds
+
+        # create position embeddings (global and local RoPE) to be shared across the decoder layers
+        position_embeddings_global = self.rotary_emb(hidden_states, position_ids)
+        position_embeddings_local = self.rotary_emb_local(hidden_states, position_ids)
+
+        # decoder layers; hidden states / attention weights are only collected when requested
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+
+        for decoder_layer in self.layers[: self.config.num_hidden_layers]:
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+
+            layer_outputs = decoder_layer(
+                hidden_states,
+                position_embeddings_global=position_embeddings_global,
+                position_embeddings_local=position_embeddings_local,
+                attention_mask=causal_mask_mapping[decoder_layer.attention_type],  # mask for this layer's type
+                position_ids=position_ids,
+                past_key_values=past_key_values,
+                output_attentions=output_attentions,
+                use_cache=use_cache,
+                cache_position=cache_position,
+                **kwargs,
+            )
+
+            hidden_states = layer_outputs[0]
+
+            if output_attentions:
+                all_self_attns += (layer_outputs[1],)
+
+        hidden_states = self.norm(hidden_states)
+
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)  # the final, normalised hidden states are recorded as well
+
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+        )
+
+
+class Gemma3ForCausalLM(Gemma2ForCausalLM):
+    config: Gemma3TextConfig
+    base_model_prefix = "language_model"
+
+    def __init__(self, config: Gemma3TextConfig):
+        super().__init__(config)
+        self.model = Gemma3TextModel(config)  # reassigns `model` after the parent constructor has run
+
+
+class Gemma3MultiModalProjector(nn.Module):
+    """
+    Average-pools the grid of vision patch features, normalises the result and projects it
+    to the text hidden size.
+    """
+
+    def __init__(self, config: Gemma3Config):
+        super().__init__()
+        vision_cfg = config.vision_config
+
+        self.mm_input_projection_weight = nn.Parameter(
+            torch.zeros(vision_cfg.hidden_size, config.text_config.hidden_size)
+        )
+        self.mm_soft_emb_norm = Gemma3RMSNorm(vision_cfg.hidden_size, eps=vision_cfg.layer_norm_eps)
+
+        # Non-overlapping pooling windows: the stride equals the kernel size.
+        self.patches_per_image = int(vision_cfg.image_size // vision_cfg.patch_size)
+        self.tokens_per_side = int(config.mm_tokens_per_image**0.5)
+        self.kernel_size = self.patches_per_image // self.tokens_per_side
+        self.avg_pool = nn.AvgPool2d(kernel_size=self.kernel_size, stride=self.kernel_size)
+
+    def forward(self, vision_outputs: torch.Tensor):
+        """Pool, normalise and project `vision_outputs`; the result is cast back to the input's dtype."""
+        # The last axis is the feature axis; the middle one must hold `patches_per_image**2` entries for the reshape.
+        batch_size, _, hidden_dim = vision_outputs.shape
+        side = self.patches_per_image
+
+        # Lay the patches out as a square grid with the feature axis second, then pool in 2D.
+        patch_grid = vision_outputs.transpose(1, 2).reshape(batch_size, hidden_dim, side, side).contiguous()
+        pooled = self.avg_pool(patch_grid).flatten(2).transpose(1, 2)
+
+        normed = self.mm_soft_emb_norm(pooled)
+        projected = torch.matmul(normed, self.mm_input_projection_weight)
+        return projected.type_as(vision_outputs)
+
+
+def token_type_ids_mask_function(
+    token_type_ids: Optional[torch.Tensor],
+    image_group_ids: Optional[torch.Tensor],
+    tokens_per_image: int,
+) -> Optional[Callable]:
+    """
+    Build a mask function that is true when the query position and the key/value position are both image
+    tokens (`token_type_ids == 1`) and carry the same `image_group_ids` value.
+
+    Returns `None` when `token_type_ids` is `None`. `tokens_per_image` is accepted but not read by this
+    function.
+    """
+    # Do not return an additional mask in this case
+    if token_type_ids is None:
+        return None
+
+    def inner_mask(batch_idx: int, head_idx: int, q_idx: int, kv_idx: int) -> bool:
+        # If it's 1 for both query and key/value, we are in an image block
+        # NOTE: static cache shape goes beyond input seq length, while token_type_ids.shape[1] == input seq length
+        # Since vmap doesn't support `if statement` we workaround it with `torch.where`
+        # Out-of-range kv indices are redirected to index 0 for the lookup ...
+        safe_idx = torch.where(kv_idx < token_type_ids.shape[1], kv_idx, 0)
+        token_type_ids_at_kv_idx = token_type_ids[batch_idx, safe_idx]
+        # ... and the looked-up value is then overwritten with 0 (non-image) for those positions.
+        token_type_ids_at_kv_idx = torch.where(kv_idx < token_type_ids.shape[1], token_type_ids_at_kv_idx, 0)
+
+        image_group_ids_at_kv_idx = image_group_ids[batch_idx, safe_idx]
+        # Out-of-range positions get group id -1.
+        image_group_ids_at_kv_idx = torch.where(kv_idx < image_group_ids.shape[1], image_group_ids_at_kv_idx, -1)
+
+        # `head_idx` is not used: the result is the same for every head.
+        is_image_block = (token_type_ids[batch_idx, q_idx] == 1) & (token_type_ids_at_kv_idx == 1)
+        same_image_block = image_group_ids[batch_idx, q_idx] == image_group_ids_at_kv_idx
+
+        # This is bidirectional attention whenever we are dealing with image tokens
+        return is_image_block & same_image_block
+
+    return inner_mask
+
+
+class Gemma3Model(PaliGemmaModel):
+    # we are filtering the logits/labels so we shouldn't divide the loss based on num_items_in_batch
+    accepts_loss_kwargs = False
+
+    def get_image_features(self, pixel_values: torch.Tensor) -> torch.Tensor:
+        """
+        Projects the last hidden state from the vision model into language model space.
+
+        Args:
+            pixel_values (`torch.FloatTensor` of shape `(batch_size, channels, height, width)`):
+               The tensors corresponding to the input images.
+        Returns:
+            image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`.
+        """
+        vision_outputs = self.vision_tower(pixel_values=pixel_values).last_hidden_state
+        image_features = self.multi_modal_projector(vision_outputs)
+        return image_features
+
+    # Deliberately disabled: any call raises instead of falling back to an inherited implementation.
+    def _update_causal_mask(self, **super_kwargs):
+        raise AttributeError("We don't want to inherit it")
+
+    # NOTE: `labels` and the resolved `return_dict` are not read anywhere in this method body.
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        pixel_values: torch.FloatTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Union[list[torch.FloatTensor], Cache]] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        **lm_kwargs,
+    ) -> Union[tuple, Gemma3ModelOutputWithPast]:
+        # Exactly one of `input_ids` / `inputs_embeds` must be provided.
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+        # Fall back to the config defaults for any output flag left unset.
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # Replace image id with PAD if the image token is OOV, to avoid index-errors
+        if input_ids is not None and self.config.image_token_id >= self.vocab_size:
+            special_image_mask = input_ids == self.config.image_token_id
+            # Work on a copy so the caller's `input_ids` are left untouched.
+            llm_input_ids = input_ids.clone()
+            llm_input_ids[special_image_mask] = 0
+        else:
+            llm_input_ids = input_ids
+
+        if inputs_embeds is None:
+            inputs_embeds = self.get_input_embeddings()(llm_input_ids)
+
+        # Default cache positions: continue right after the tokens already held in the cache.
+        if cache_position is None:
+            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+            cache_position = torch.arange(
+                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
+            )
+
+        # Merge text and images
+        if pixel_values is not None:
+            image_features = self.get_image_features(pixel_values)
+            image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
+            special_image_mask = self.get_placeholder_mask(
+                input_ids, inputs_embeds=inputs_embeds, image_features=image_features
+            )
+            # Write the projected image features into the masked embedding positions.
+            inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
+
+        # It may already have been prepared by e.g. `generate`
+        # (a dict `attention_mask` is passed through unchanged as the mask mapping)
+        if not isinstance(causal_mask_mapping := attention_mask, dict):
+            # Prepare mask arguments
+            mask_kwargs = {
+                "config": self.config.get_text_config(),
+                "input_embeds": inputs_embeds,
+                "attention_mask": attention_mask,
+                "cache_position": cache_position,
+                "past_key_values": past_key_values,
+                "position_ids": position_ids,
+            }
+            # The image-aware mask is only added when more than one position is processed at once.
+            if token_type_ids is not None and inputs_embeds.shape[1] != 1:
+                # We need to pass an additional mask function to account for token type ids, and it needs to be an `or`
+
+                # First find where a new image block starts: 1 if image and previous not image
+                # The images cannot attend to future images, but can attend to all prev images and to itself bidirectionally
+                is_image = (token_type_ids == 1).to(cache_position.device)
+                new_image_start = is_image & ~nn.functional.pad(is_image, (1, 0), value=0)[:, :-1]
+                # Running count of block starts gives a 0-based group id per position ...
+                image_group_ids = torch.cumsum(new_image_start.int(), dim=1) - 1
+                # ... and non-image positions are assigned group id -1.
+                image_group_ids = torch.where(
+                    is_image, image_group_ids, torch.full_like(token_type_ids, -1, device=is_image.device)
+                )
+                mask_kwargs["or_mask_function"] = token_type_ids_mask_function(
+                    token_type_ids.to(cache_position.device), image_group_ids, self.config.mm_tokens_per_image
+                )
+
+            # Create the masks
+            causal_mask_mapping = {
+                "full_attention": create_causal_mask(**mask_kwargs),
+                "sliding_attention": create_sliding_window_causal_mask(**mask_kwargs),
+            }
+
+        # The language model always receives embeddings (never ids) and is always asked for a dict output.
+        outputs = self.language_model(
+            attention_mask=causal_mask_mapping,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=True,
+            cache_position=cache_position,
+            **lm_kwargs,
+        )
+
+        # The cache is only returned when `use_cache` is truthy; image features only when images were given.
+        return Gemma3ModelOutputWithPast(
+            last_hidden_state=outputs.last_hidden_state,
+            past_key_values=outputs.past_key_values if use_cache else None,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            image_hidden_states=image_features if pixel_values is not None else None,
+        )
+
+
+class Gemma3ForConditionalGeneration(PaliGemmaForConditionalGeneration):
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        pixel_values: torch.FloatTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Union[list[torch.FloatTensor], Cache]] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
+        **lm_kwargs,
+    ) -> Union[tuple, Gemma3CausalLMOutputWithPast]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+            config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.
+
+        Example:
+
+        ```python
+        >>> from PIL import Image
+        >>> import requests
+        >>> from transformers import AutoProcessor, Gemma3ForConditionalGeneration
+
+        >>> model = Gemma3ForConditionalGeneration.from_pretrained("google/gemma-3-4b-it")
+        >>> processor = AutoProcessor.from_pretrained("google/gemma-3-4b-it")
+
+        >>> messages = [
+        ...     {
+        ...         "role": "system",
+        ...         "content": [
+        ...             {"type": "text", "text": "You are a helpful assistant."}
+        ...         ]
+        ...     },
+        ...     {
+        ...         "role": "user", "content": [
+        ...             {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"},
+        ...             {"type": "text", "text": "Where is the cat standing?"},
+        ...         ]
+        ...     },
+        ... ]
+
+        >>> inputs = processor.apply_chat_template(
+        ...     messages,
+        ...     tokenize=True,
+        ...     return_dict=True,
+        ...     return_tensors="pt",
+        ...     add_generation_prompt=True
+        ... )
+        >>> # Generate
+        >>> generate_ids = model.generate(**inputs)
+        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "user\nYou are a helpful assistant.\n\n\n\n\n\nWhere is the cat standing?\nmodel\nBased on the image, the cat is standing in a snowy area, likely outdoors. It appears to"
+        ```
+        """
+
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.model(
+            input_ids=input_ids,
+            pixel_values=pixel_values,
+            token_type_ids=token_type_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            labels=labels,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            cache_position=cache_position,
+            **lm_kwargs,
+        )
+
+        hidden_states = outputs[0]
+        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
+        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+        logits = self.lm_head(hidden_states[:, slice_indices, :])
+
+        loss = None
+        if labels is not None:
+            # Upcast to float if we need to compute the loss to avoid potential precision issues
+            logits = logits.float()
+            shift_logits = logits[..., :-1, :]
+            shift_labels = labels[..., 1:]
+            if attention_mask is not None:
+                # we use the input attention mask to shift the logits and labels, because it is 2D.
+                # we also crop attn mask in case it is longer, which happens in PrefixTuning with peft
+                shift_attention_mask = attention_mask[:, -shift_logits.shape[1] :].to(logits.device)
+                shift_logits = shift_logits[shift_attention_mask.to(logits.device) != 0].contiguous()
+                shift_labels = shift_labels[shift_attention_mask.to(shift_labels.device) != 0].contiguous()
+            else:
+                shift_logits = shift_logits.contiguous()
+                shift_labels = shift_labels.contiguous()
+            # Flatten the tokens
+            loss_fct = nn.CrossEntropyLoss()
+
+            flat_logits = shift_logits.view(-1, self.config.text_config.vocab_size)
+            flat_labels = shift_labels.view(-1).to(shift_logits.device)
+            loss = loss_fct(flat_logits, flat_labels)
+
+        if not return_dict:
+            output = (logits,) + outputs[1:]
+            return (loss,) + output if loss is not None else output
+
+        return Gemma3CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            image_hidden_states=outputs.image_hidden_states,
+        )
+
+    def prepare_inputs_for_generation(
+        self,
+        input_ids,
+        past_key_values=None,
+        inputs_embeds=None,
+        cache_position=None,
+        position_ids=None,
+        pixel_values=None,
+        attention_mask=None,
+        token_type_ids=None,
+        use_cache=True,
+        logits_to_keep=None,
+        labels=None,
+        **kwargs,
+    ):
+        # Overwritten -- custom `position_ids` and `pixel_values` handling
+        model_inputs = super().prepare_inputs_for_generation(
+            input_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            cache_position=cache_position,
+            use_cache=use_cache,
+            logits_to_keep=logits_to_keep,
+            token_type_ids=token_type_ids,
+            **kwargs,
+        )
+
+        # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore
+        # Otherwise we need pixel values to be passed to model. NOTE: use_cache=False needs pixel_values always
+        if cache_position[0] == 0:
+            model_inputs["pixel_values"] = pixel_values
+
+        return model_inputs
+
+    def _prepare_4d_causal_attention_mask_with_cache_position(self, **super_kwargs):
+        raise AttributeError("We don't want to inherit it")
+
+    @staticmethod
+    def create_masks_for_generate(
+        config: PretrainedConfig,
+        input_embeds: torch.Tensor,
+        attention_mask: Optional[torch.Tensor],
+        cache_position: torch.Tensor,
+        past_key_values: Optional[Cache],
+        position_ids: Optional[torch.Tensor],
+        token_type_ids: Optional[torch.Tensor] = None,
+        **kwargs,
+    ) -> dict:
+        # Prepare mask arguments
+        mask_kwargs = {
+            "config": config.get_text_config(),
+            "input_embeds": input_embeds,
+            "attention_mask": attention_mask,
+            "cache_position": cache_position,
+            "past_key_values": past_key_values,
+            "position_ids": position_ids,
+        }
+        # Add the token type ids mask for generate as well
+        if token_type_ids is not None and input_embeds.shape[1] != 1:
+            # We need to pass an additional mask function to account for token type ids, and it needs to be an `or`
+
+            # First find where a new image block starts: 1 if image and previous not image
+            # The images cannot attend to future images, but can attend to all prev images and to itself bidirectionally
+            is_image = (token_type_ids == 1).to(cache_position.device)
+            new_image_start = is_image & ~nn.functional.pad(is_image, (1, 0), value=0)[:, :-1]
+            image_group_ids = torch.cumsum(new_image_start.int(), dim=1) - 1
+            image_group_ids = torch.where(is_image, image_group_ids, torch.full_like(token_type_ids, -1))
+            mask_kwargs["or_mask_function"] = token_type_ids_mask_function(
+                token_type_ids.to(cache_position.device), image_group_ids, config.mm_tokens_per_image
+            )
+
+        return create_masks_for_generate(**mask_kwargs)
+
+
+class Gemma3ForSequenceClassification(Gemma3PreTrainedModel):
+    _checkpoint_conversion_mapping = {
+        "^language_model.model": "model.language_model",
+        "^vision_tower": "model.vision_tower",
+        "^multi_modal_projector": "model.multi_modal_projector",
+    }
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+        self.model = Gemma3Model(config)
+        self.score = nn.Linear(config.text_config.hidden_size, self.num_labels, bias=False)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.model.get_input_embeddings()
+
+    def set_input_embeddings(self, value):
+        self.model.set_input_embeddings(value)
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> SequenceClassifierOutputWithPast:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+        """
+
+        transformer_outputs = self.model(
+            input_ids,
+            attention_mask=attention_mask,
+            pixel_values=pixel_values,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            token_type_ids=token_type_ids,
+            use_cache=use_cache,
+            **kwargs,
+        )
+        hidden_states = transformer_outputs.last_hidden_state
+        logits = self.score(hidden_states)
+
+        if input_ids is not None:
+            batch_size = input_ids.shape[0]
+        else:
+            batch_size = inputs_embeds.shape[0]
+
+        if self.config.text_config.pad_token_id is None and batch_size != 1:
+            raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
+        if self.config.text_config.pad_token_id is None:
+            last_non_pad_token = -1
+        elif input_ids is not None:
+            # To handle both left- and right- padding, we take the rightmost token that is not equal to pad_token_id
+            non_pad_mask = (input_ids != self.config.text_config.pad_token_id).to(logits.device, torch.int32)
+            token_indices = torch.arange(input_ids.shape[-1], device=logits.device, dtype=torch.int32)
+            last_non_pad_token = (token_indices * non_pad_mask).argmax(-1)
+        else:
+            last_non_pad_token = -1
+            logger.warning_once(
+                f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
+                "unexpected if using padding tokens in conjunction with `inputs_embeds.`"
+            )
+
+        pooled_logits = logits[torch.arange(batch_size, device=logits.device), last_non_pad_token]
+
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(logits=logits, labels=labels, pooled_logits=pooled_logits, config=self.config)
+
+        return SequenceClassifierOutputWithPast(
+            loss=loss,
+            logits=pooled_logits,
+            past_key_values=transformer_outputs.past_key_values,
+            hidden_states=transformer_outputs.hidden_states,
+            attentions=transformer_outputs.attentions,
+        )
+
+
+# Names exported by this module.
+# NOTE(review): `Gemma3PreTrainedModel` carries `noqa: F822`, which suggests it is not defined in this file
+# itself -- confirm where it comes from before editing this list.
+__all__ = [
+    "Gemma3Config",
+    "Gemma3TextConfig",
+    "Gemma3PreTrainedModel",  # noqa: F822
+    "Gemma3TextModel",
+    "Gemma3ForCausalLM",
+    "Gemma3ForConditionalGeneration",
+    "Gemma3Model",
+    "Gemma3ForSequenceClassification",
+]
diff --git a/phivenv/Lib/site-packages/transformers/models/gemma3/processing_gemma3.py b/phivenv/Lib/site-packages/transformers/models/gemma3/processing_gemma3.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c27053e1a6f1779690df6da0efd49315f214438
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/gemma3/processing_gemma3.py
@@ -0,0 +1,187 @@
+# coding=utf-8
+# Copyright 2025 Google Inc. HuggingFace Inc. team. All rights reserved.
+#
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import re
+from typing import Optional, Union
+
+import numpy as np
+
+from ...feature_extraction_utils import BatchFeature
+from ...image_utils import ImageInput, make_nested_list_of_images
+from ...processing_utils import ImagesKwargs, MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack
+from ...tokenization_utils_base import PreTokenizedInput, TextInput
+from ...utils import to_py_obj
+
+
+class Gemma3ImagesKwargs(ImagesKwargs):
+    # Image keyword arguments added on top of `ImagesKwargs`: the pan-and-scan options and the RGB
+    # conversion flag. Their defaults are declared in `Gemma3ProcessorKwargs._defaults`.
+    do_pan_and_scan: Optional[bool]
+    pan_and_scan_min_crop_size: Optional[int]
+    pan_and_scan_max_num_crops: Optional[int]
+    pan_and_scan_min_ratio_to_activate: Optional[float]
+    do_convert_rgb: Optional[bool]
+
+
+class Gemma3ProcessorKwargs(ProcessingKwargs, total=False):
+    # Keyword-argument schema handed to `_merge_kwargs` by `Gemma3Processor.__call__`.
+    images_kwargs: Gemma3ImagesKwargs
+    # Default values, grouped by the component they are forwarded to.
+    _defaults = {
+        "text_kwargs": {
+            "padding": False,
+            # Makes `Gemma3Processor.__call__` add `token_type_ids` marking image-token positions.
+            "return_mm_token_type_ids": True,
+        },
+        "images_kwargs": {
+            "do_convert_rgb": True,
+            "do_pan_and_scan": False,
+            "pan_and_scan_min_crop_size": 256,
+            "pan_and_scan_max_num_crops": 4,
+            "pan_and_scan_min_ratio_to_activate": 1.2,
+        },
+    }
+
+
+class Gemma3Processor(ProcessorMixin):
+    attributes = ["image_processor", "tokenizer"]
+    image_processor_class = "AutoImageProcessor"
+    tokenizer_class = "AutoTokenizer"
+
+    def __init__(
+        self,
+        image_processor,
+        tokenizer,
+        chat_template=None,
+        image_seq_length: int = 256,
+        **kwargs,
+    ):
+        self.image_seq_length = image_seq_length
+        self.image_token_id = tokenizer.image_token_id
+        self.boi_token = tokenizer.boi_token
+        self.image_token = tokenizer.image_token
+        image_tokens_expanded = "".join([tokenizer.image_token] * image_seq_length)
+        self.full_image_sequence = f"\n\n{tokenizer.boi_token}{image_tokens_expanded}{tokenizer.eoi_token}\n\n"
+
+        super().__init__(
+            image_processor=image_processor,
+            tokenizer=tokenizer,
+            chat_template=chat_template,
+            **kwargs,
+        )
+
+    def __call__(
+        self,
+        images: ImageInput = None,
+        text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
+        videos=None,
+        audio=None,
+        **kwargs: Unpack[Gemma3ProcessorKwargs],
+    ) -> BatchFeature:
+        if text is None and images is None:
+            raise ValueError("Provide at least one of `text` or `images`.")
+
+        output_kwargs = self._merge_kwargs(
+            Gemma3ProcessorKwargs,
+            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
+            **kwargs,
+        )
+
+        if isinstance(text, str):
+            text = [text]
+        elif not isinstance(text, list) and not isinstance(text[0], str):
+            raise TypeError("Invalid input text. Please provide a string, or a list of strings")
+
+        image_inputs = {}
+        if images is not None:
+            images = self.image_processor.fetch_images(images)
+            batched_images = make_nested_list_of_images(images)
+            image_inputs = self.image_processor(images, **output_kwargs["images_kwargs"])
+
+            # Create empty text to be replaced with placeholders
+            if not text:
+                text = [" ".join([self.boi_token] * len(images)) for images in batched_images]
+
+            if len(batched_images) != len(text):
+                raise ValueError(
+                    f"Received inconsistently sized batches of images ({len(batched_images)}) and text ({len(text)})."
+                )
+
+            # Replace image tokens by the full expanded sequence
+            num_crops = to_py_obj(image_inputs.pop("num_crops"))
+            batch_num_crops = [[num_crops.pop(0) for _ in range(len(images))] for images in batched_images]
+            for batch_idx, (prompt, images, num_crops) in enumerate(zip(text, batched_images, batch_num_crops)):
+                image_indexes = [m.start() for m in re.finditer(self.boi_token, prompt)]
+
+                if len(images) != len(image_indexes):
+                    raise ValueError(
+                        f"Prompt contained {len(image_indexes)} image tokens but received {len(images)} images."
+                    )
+
+                # Insert additional image tokens for Pan-and-Scan crops
+                for num, idx in reversed(list(zip(num_crops, image_indexes))):
+                    if num:
+                        formatted_image_text = (
+                            f"Here is the original image {self.boi_token} and here are some crops to help you see better "
+                            + " ".join([self.boi_token] * num)
+                        )
+                        prompt = prompt[:idx] + formatted_image_text + prompt[idx + len(self.boi_token) :]
+                        text[batch_idx] = prompt
+
+            # Expand placeholder image tokens to the full image token sequence
+            text = [prompt.replace(self.boi_token, self.full_image_sequence) for prompt in text]
+
+        return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
+        return_mm_token_type_ids = output_kwargs["text_kwargs"].pop("return_mm_token_type_ids", False)
+        text_inputs = self.tokenizer(text=text, **output_kwargs["text_kwargs"])
+        self._check_special_mm_tokens(text, text_inputs, modalities=["image"])
+
+        # Add token type ids manually, as tokenizer can't do arbitrary position token types
+        if return_mm_token_type_ids:
+            array_ids = np.array(text_inputs["input_ids"])
+            mm_token_type_ids = np.zeros_like(array_ids)
+            mm_token_type_ids[array_ids == self.image_token_id] = 1
+            text_inputs["token_type_ids"] = mm_token_type_ids.tolist()
+
+        return BatchFeature(data={**text_inputs, **image_inputs}, tensor_type=return_tensors)
+
+    def _get_num_multimodal_tokens(self, image_sizes=None, **kwargs):
+        """
+        Computes the number of placeholder tokens needed for multimodal inputs with the given sizes.
+
+        Args:
+            image_sizes (`list[list[int]]`, *optional*):
+                The input sizes formatted as (height, width) per each image.
+
+        Returns:
+            `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided
+            input modalities, along with other useful data.
+        """
+
+        vision_data = {}
+        if image_sizes is not None:
+            # NOTE: no image cropping supported yet
+            num_image_tokens = [self.image_seq_length] * len(image_sizes)
+            num_image_patches = [1] * len(image_sizes)
+
+            vision_data.update({"num_image_tokens": num_image_tokens, "num_image_patches": num_image_patches})
+
+        return MultiModalData(**vision_data)
+
+    @property
+    def model_input_names(self):
+        tokenizer_input_names = self.tokenizer.model_input_names + ["token_type_ids"]
+        image_processor_input_names = self.image_processor.model_input_names
+
+        image_processor_input_names = [name for name in image_processor_input_names if name != "num_crops"]
+        return list(tokenizer_input_names + image_processor_input_names)
+
+
+__all__ = ["Gemma3Processor"]
diff --git a/phivenv/Lib/site-packages/transformers/models/gemma3n/__init__.py b/phivenv/Lib/site-packages/transformers/models/gemma3n/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..229e91827036d0830593ea9294e232cffefbac7b
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/gemma3n/__init__.py
@@ -0,0 +1,29 @@
+# Copyright 2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    # Under static type checking, import every submodule's names directly.
+    from .configuration_gemma3n import *
+    from .feature_extraction_gemma3n import *
+    from .modeling_gemma3n import *
+    from .processing_gemma3n import *
+else:
+    import sys
+
+    # At runtime, replace this package's entry in `sys.modules` with a `_LazyModule` built from the
+    # import structure that `define_import_structure` derives from this file's path.
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/phivenv/Lib/site-packages/transformers/models/gemma3n/__pycache__/__init__.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/gemma3n/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..779063db345e7a83b909a23bf252cc50809b48c3
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/gemma3n/__pycache__/__init__.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/gemma3n/__pycache__/configuration_gemma3n.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/gemma3n/__pycache__/configuration_gemma3n.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..08c89d9762fe74ffd577c0f5c8d593925d44fa3b
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/gemma3n/__pycache__/configuration_gemma3n.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/gemma3n/__pycache__/feature_extraction_gemma3n.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/gemma3n/__pycache__/feature_extraction_gemma3n.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5f43a9ce1743a3d32de679b1cb33496de9ca75b7
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/gemma3n/__pycache__/feature_extraction_gemma3n.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/gemma3n/__pycache__/modeling_gemma3n.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/gemma3n/__pycache__/modeling_gemma3n.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7204c9f009c591b53add6da1067075796bfd28ff
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/gemma3n/__pycache__/modeling_gemma3n.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/gemma3n/__pycache__/modular_gemma3n.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/gemma3n/__pycache__/modular_gemma3n.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0b3110dffbe6f27f383a7d0c0d6a65256ee7f2da
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/gemma3n/__pycache__/modular_gemma3n.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/gemma3n/__pycache__/processing_gemma3n.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/gemma3n/__pycache__/processing_gemma3n.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..faaa80210713a945c014ddd137bb28ab2b2f56fe
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/gemma3n/__pycache__/processing_gemma3n.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/gemma3n/configuration_gemma3n.py b/phivenv/Lib/site-packages/transformers/models/gemma3n/configuration_gemma3n.py
new file mode 100644
index 0000000000000000000000000000000000000000..1cb2e1f7c7b29e8eee7bfb28162474afad7f39fe
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/gemma3n/configuration_gemma3n.py
@@ -0,0 +1,684 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/gemma3n/modular_gemma3n.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_gemma3n.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# coding=utf-8
+# Copyright 2025 Google Inc. HuggingFace Inc. team. All rights reserved.
+#
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from collections.abc import Sequence
+from typing import Any, Optional, Union
+
+from ...configuration_utils import PretrainedConfig, layer_type_validation
+from ...modeling_rope_utils import rope_config_validation
+from ...utils import is_timm_available, logging, requires_backends
+
+
+if is_timm_available():
+    from timm.data import ImageNetInfo, infer_imagenet_subset
+
+
+logger = logging.get_logger(__name__)
+
+
+class Gemma3nTextConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Gemma3nTextModel`]. It is used to instantiate a
+    Gemma3nTextModel model according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of the Gemma 3n E4B, e.g.
+    [google/gemma-3n-E4B](https://huggingface.co/google/gemma-3n-E4B).
+
+    Configuration objects that inherit from [`Gemma3nTextConfig`] and can be used to control the model outputs. Read
+    the documentation from [`Gemma3nTextConfig`] for more information.
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 262400):
+            Vocabulary size of the Gemma3nText model. Defines the number of different tokens that can be represented by
+            the `inputs_ids` passed when calling [`Gemma3nTextModel`]
+        vocab_size_per_layer_input (`int`, *optional*, defaults to 262144):
+            Vocabulary size of the per-layer text embeddings that augment the standard embeddings.
+        hidden_size (`int`, *optional*, defaults to 2048):
+            Dimension of the hidden representations.
+        hidden_size_per_layer_input (`int`, *optional*, defaults to 256):
+            Dimension of the hidden representations for per-layer embeddings.
+        intermediate_size (`int` or `Sequence[int]`, *optional*, defaults to 16384):
+            Dimension of the MLP representations. MatFormer configurations may wish to provide a sequence of integers
+            to account for variable intermediate_size values across layers. In such cases,
+            `len(intermediate_size) == num_hidden_layers`.
+        num_hidden_layers (`int`, *optional*, defaults to 35):
+            Number of hidden layers in the Transformer decoder.
+        num_attention_heads (`int`, *optional*, defaults to 8):
+            Number of attention heads for each attention layer in the Transformer decoder.
+        num_key_value_heads (`int`, *optional*, defaults to 2):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details checkout this
+            [paper](https://huggingface.co/papers/2305.13245). If not specified, will default to `num_attention_heads`.
+        head_dim (`int`, *optional*, defaults to 256):
+            The attention head dimension.
+        hidden_activation (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
+            The non-linear activation function (function or string) in the decoder. Will default to
+            `"gelu_pytorch_tanh"` if not specified. `"gelu_pytorch_tanh"` uses an approximation of the `"gelu"`
+            activation function.
+        max_position_embeddings (`int`, *optional*, defaults to 32768):
+            The maximum sequence length that this model might ever be used with.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        pad_token_id (`int`, *optional*, defaults to 0):
+            Padding token id.
+        eos_token_id (`int`, *optional*, defaults to 1):
+            End of stream token id.
+        bos_token_id (`int`, *optional*, defaults to 2):
+            Beginning of stream token id.
+        rope_theta (`float`, *optional*, defaults to 1000000.0):
+            The base period of the RoPE embeddings.
+        rope_scaling (`Dict`, *optional*):
+            Dictionary containing the scaling configuration for the RoPE embeddings used in global attention.
+            NOTE: if you apply new rope type and you expect the model to work on longer `max_position_embeddings`, we
+            recommend you to update this value accordingly.
+            Expected contents:
+                `rope_type` (`str`):
+                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
+                    'llama3'], with 'default' being the original RoPE implementation.
+                `factor` (`float`, *optional*):
+                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
+                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
+                    original maximum pre-trained length.
+                `original_max_position_embeddings` (`int`, *optional*):
+                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
+                    pretraining.
+                `attention_factor` (`float`, *optional*):
+                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
+                    computation. If unspecified, it defaults to value recommended by the implementation, using the
+                    `factor` field to infer the suggested value.
+                `beta_fast` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 32.
+                `beta_slow` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 1.
+                `short_factor` (`List[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `long_factor` (`List[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to long contexts (<
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `low_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
+                `high_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
+        rope_local_base_freq (float, *optional*, defaults to 10000.0):
+            The base period of the RoPE embeddings for local attention.
+        attention_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use a bias in the query, key, value and output projection layers during self-attention.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        sliding_window (`int`, *optional*, defaults to 512):
+            This is the size of the sliding window used by local attention layers.
+        layer_types (`Sequence[str]`, *optional*):
+            A sequence of strings defining the attention type for that layer as either "sliding_attention" or
+            "full_attention". If not provided, `layer_types` will be inferred from `num_hidden_layers` using a pattern
+            of four "sliding_attention" layers followed by one "full_attention". The last layer in the model should always
+            be a "full_attention" layer.
+        final_logit_softcapping (`float`, *optional*, defaults to 30.0):
+            Scaling factor when applying tanh softcapping on the logits.
+        altup_active_idx (`int`, *optional*, defaults to 0):
+            The index of the prediction from which AltUp will compute additional predictions or correct.
+        altup_coef_clip (`float`, *optional*, defaults to 120.0):
+            The maximum amplitude of an AltUp prediction or correction coefficient weight.
+        altup_correct_scale (`bool`, *optional*, defaults to `True`):
+            If True, apply the `AltUp.correct_output_scale` to the corrected prediction at `altup_active_idx`.
+        altup_num_inputs (`int`, *optional*, defaults to 4):
+            The number of predictions that AltUp should make given the input sequence.
+        num_kv_shared_layers (`int`, *optional*, defaults to 15):
+            The number of layers that share KV cache values. During the forward pass, the last `num_kv_shared_layers`
+            layers in the model "share" the KV values in that each local and global layer in this range uses the KV
+            cache values computed for the last local or global layer, respectively, before entering this range. The
+            value should be a multiple of the attention pattern size (see `layer_types` parameter).
+        laurel_rank (int, *optional*, defaults to 64):
+            The intermediate size for the linear projections in the Learned Augmented Residual Layer.
+        activation_sparsity_pattern (Sequence[float], *optional*):
+            The sparsity factor used to extract the top-k activations for a given layer. The provided Sequence must
+            explicitly provide a sparsity value for each layer in the model. By default, the first 10 layers are
+            sparse with a sparsity factor of 0.95 and the rest are dense.
+
+    ```python
+    >>> from transformers import Gemma3nTextModel, Gemma3nTextConfig
+
+    >>> # Initializing a Gemma3nText gemma3n_text-E4B style configuration
+    >>> configuration = Gemma3nTextConfig()
+
+    >>> # Initializing a model from the gemma3n_text-E4B style configuration
+    >>> model = Gemma3nTextModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```
+    """
+
+    model_type = "gemma3n_text"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    base_model_tp_plan = {
+        "layers.*.self_attn.q_proj": "colwise",
+        "layers.*.self_attn.k_proj": "colwise",
+        "layers.*.self_attn.v_proj": "colwise",
+        "layers.*.self_attn.o_proj": "rowwise",
+        "layers.*.mlp.gate_proj": "colwise",
+        "layers.*.mlp.up_proj": "colwise",
+        "layers.*.mlp.down_proj": "rowwise",
+    }
+    base_model_pp_plan = {
+        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
+        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
+        "norm": (["hidden_states"], ["hidden_states"]),
+    }
+
+    def __init__(
+        self,
+        vocab_size: int = 262_400,
+        vocab_size_per_layer_input: int = 262_144,
+        hidden_size: int = 2048,
+        hidden_size_per_layer_input: int = 256,
+        intermediate_size: Union[int, Sequence[int]] = 16_384,
+        num_hidden_layers: int = 35,
+        num_attention_heads: int = 8,
+        num_key_value_heads: int = 2,
+        head_dim: int = 256,
+        hidden_activation: str = "gelu_pytorch_tanh",
+        max_position_embeddings: int = 32_768,
+        initializer_range: float = 0.02,
+        rms_norm_eps: float = 1e-6,
+        use_cache: bool = True,
+        pad_token_id: int = 0,
+        eos_token_id: int = 1,
+        bos_token_id: int = 2,
+        rope_theta: float = 1_000_000.0,
+        rope_scaling: Optional[dict[str, Any]] = None,
+        rope_local_base_freq: float = 10_000.0,
+        attention_bias: bool = False,
+        attention_dropout: float = 0.0,
+        sliding_window: int = 512,
+        layer_types: Optional[Sequence[str]] = None,
+        final_logit_softcapping: float = 30.0,
+        altup_active_idx: int = 0,
+        altup_coef_clip: float = 120.0,
+        altup_correct_scale: bool = True,
+        altup_num_inputs: int = 4,
+        num_kv_shared_layers: int = 15,
+        laurel_rank: int = 64,
+        activation_sparsity_pattern: Optional[Union[float, Sequence[float]]] = None,
+        **kwargs,
+    ):
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            **kwargs,
+        )
+
+        if isinstance(intermediate_size, Sequence) and (intsize_len := len(intermediate_size)) != num_hidden_layers:
+            raise ValueError(
+                "intermediate_size must have an explicit intermediate size for every layer or one for all layers. "
+                f"Expected {num_hidden_layers} values but got {intsize_len}."
+            )
+        elif not isinstance(intermediate_size, Sequence):
+            intermediate_size = [intermediate_size] * num_hidden_layers
+
+        self.vocab_size = vocab_size
+        self.vocab_size_per_layer_input = vocab_size_per_layer_input
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.head_dim = head_dim
+        self.num_key_value_heads = num_key_value_heads
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+        self.hidden_activation = hidden_activation
+        self.sliding_window = sliding_window
+        self.final_logit_softcapping = final_logit_softcapping
+        self.layer_types = layer_types
+
+        self.rope_local_base_freq = rope_local_base_freq
+        self.rope_scaling = rope_scaling
+        rope_config_validation(self)
+
+        if layer_types is None:
+            self.layer_types = [
+                "full_attention" if (i + 1) % 5 == 0 else "sliding_attention" for i in range(self.num_hidden_layers)
+            ]
+        else:
+            self.layer_types = layer_types
+
+        layer_type_validation(self.layer_types)
+
+        self.hidden_size_per_layer_input = hidden_size_per_layer_input
+        self.num_kv_shared_layers = num_kv_shared_layers
+
+        self.altup_active_idx = altup_active_idx
+        self.altup_coef_clip = altup_coef_clip
+        self.altup_correct_scale = altup_correct_scale
+        self.altup_num_inputs = altup_num_inputs
+
+        self.laurel_rank = laurel_rank
+
+        if activation_sparsity_pattern is None:
+            num_sparse_layers = 10 if num_hidden_layers > 10 else 0
+            activation_sparsity_pattern = (0.95,) * num_sparse_layers + (0.0,) * (
+                num_hidden_layers - num_sparse_layers
+            )
+
+        if (len_asp := len(activation_sparsity_pattern)) != num_hidden_layers:
+            raise ValueError(
+                "activation_sparsity_pattern must have an explicit activation sparsity value for every layer. "
+                f"Expected {num_hidden_layers} values but got {len_asp}."
+            )
+        self.activation_sparsity_pattern = activation_sparsity_pattern
+
+
+class Gemma3nAudioConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Gemma3nAudioEncoder`]. It is used to instantiate
+    a `Gemma3nAudioEncoder` model according to the specified arguments, defining the model architecture. Instantiating
+    a configuration with the defaults will yield a similar configuration to that of the Gemma 3n E4B, e.g.,
+    [google/gemma-3n-E4B](https://huggingface.co/google/gemma-3n-E4B).
+
+    Configuration objects that inherit from [`Gemma3nAudioConfig`] and can be used to control the model outputs. Read
+    the documentation from [`Gemma3nAudioConfig`] for more information.
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 128):
+            Vocabulary size of the additional hard-token embeddings for audio model. These augment the embeddings
+            included in the `Gemma3nTextModel` to provide, e.g., the end of audio and audio soft token placeholder
+            tokens when converting `input_ids` to embeddings in the `Gemma3nForConditionalGeneration` model.
+        vocab_offset (`int`, *optional*, defaults to 262272):
+            Offset between the tokenizer vocab index for the token ids embedded by `Gemma3nMultimodalEmbedder` and the
+            0-indexed `Gemma3nMultimodalEmbedder.embedding` table.
+        input_feat_size (`int`, *optional*, defaults to 128):
+            The number of channels in each mel-spectrogram frame.
+        hidden_size (`int`, *optional*, defaults to 1536):
+            Dimension of the hidden representations.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
+            The epsilon used by the rms normalization layers.
+        gradient_clipping (`float`, *optional*, defaults to 10000000000.0):
+            Clipping value used to stabilize extremely large gradient values.
+        conf_attention_chunk_size (`int`, *optional*, defaults to 12):
+            The sub-sequence size for local attention processing inside the Conformer ("conf") section of the
+            Universal Speech Model.
+        conf_attention_context_left (`int`, *optional*, defaults to 13):
+            The left context size of the local attention inside the Conformer ("conf") section of the
+            Universal Speech Model.
+        conf_attention_context_right (`int`, *optional*, defaults to 0):
+            The right context size of the local attention inside the Conformer ("conf") section of the
+            Universal Speech Model.
+        conf_attention_logit_cap (`float`, *optional*, defaults to 50.0):
+            Logit cap applied during local attention inside the Conformer ("conf") section of the
+            Universal Speech Model.
+        conf_num_attention_heads (`int`, *optional*, defaults to 8):
+            The number of attention heads in local attention inside the Conformer ("conf") section of the
+            Universal Speech Model.
+        conf_num_hidden_layers (`int`, *optional*, defaults to 12):
+            The number of layers that use local attention inside the Conformer ("conf") section of the
+            Universal Speech Model.
+        conf_conv_kernel_size (`int`, *optional*, defaults to 5):
+            Convolution kernel size for the conformer block inside the Conformer ("conf") section of the
+            Universal Speech Model.
+        conf_reduction_factor (`int`, *optional*, defaults to 4):
+            Reduction factor used in the conformer block inside the Conformer ("conf") section of the
+            Universal Speech Model.
+        conf_residual_weight (`float`, *optional*, defaults to 0.5):
+            Residual connection weight inside the Conformer ("conf") section of the
+            Universal Speech Model.
+        sscp_conv_channel_size (`tuple(int, int)`, *optional*, defaults to `(128, 32)`):
+            The channel sizes for the first and second convolutional layers in the Sub-sample Convolution Projection
+            ("sscp") section of the Universal Speech Model.
+        sscp_conv_group_norm_eps (`float`, *optional*, defaults to 0.001):
+            Epsilon used in group normalization in the subsample convolution projection in the Sub-sample Convolution
+            Projection ("sscp") section of the Universal Speech Model.
+        sscp_conv_kernel_size (`tuple(tuple(int, int), tuple(int, int))`, *optional*, defaults to `((3, 3), (3, 3))`):
+            Kernel sizes of the two convolutional layers in the subsample convolution projection in the Sub-sample
+            Convolution Projection ("sscp") section of the Universal Speech Model. The kernel sizes are specified as a
+            tuple of height and width for each layer, where the height corresponds to the time dimension and the width
+            corresponds to the frequency dimension.
+        sscp_conv_stride_size (`tuple(tuple(int, int), tuple(int, int))`, *optional*, defaults to `((2, 2), (2, 2))`):
+            Stride sizes of the two convolutional layers in the subsample convolution projection in the Sub-sample
+            Convolution Projection ("sscp") section of the Universal Speech Model. The stride sizes are specified as a
+            tuple of height and width for each layer, where the height corresponds to the time dimension and the width
+            corresponds to the frequency dimension.
+
+    Example:
+
+    ```python
+    >>> from transformers import Gemma3nAudioConfig, Gemma3nAudioEncoder
+
+    >>> # Initializing a Gemma3nAudioEncoder gemma3n_audio-E4B-style configuration
+    >>> configuration = Gemma3nAudioConfig()
+
+    >>> # Initializing a model from the gemma3n_audio-E4B style configuration
+    >>> model = Gemma3nAudioEncoder(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```
+    """
+
+    model_type = "gemma3n_audio"
+
+    def __init__(
+        self,
+        vocab_size: int = 128,
+        vocab_offset: int = 262_144 + 128,  # text vocab size + vision vocab size
+        input_feat_size: int = 128,
+        hidden_size: int = 1536,
+        rms_norm_eps: float = 1e-6,
+        gradient_clipping: float = 10_000_000_000.0,
+        conf_attention_chunk_size: int = 12,
+        conf_attention_context_left: int = 13,
+        conf_attention_context_right: int = 0,
+        conf_attention_logit_cap: float = 50.0,
+        conf_num_attention_heads: int = 8,
+        conf_num_hidden_layers: int = 12,
+        conf_conv_kernel_size: int = 5,
+        conf_reduction_factor: int = 4,
+        conf_residual_weight: float = 0.5,
+        sscp_conv_channel_size: tuple[int, int] = (128, 32),
+        sscp_conv_group_norm_eps: float = 1e-3,
+        sscp_conv_kernel_size: tuple[tuple[int, int], tuple[int, int]] = (
+            (3, 3),
+            (3, 3),
+        ),
+        sscp_conv_stride_size: tuple[tuple[int, int], tuple[int, int]] = (
+            (2, 2),
+            (2, 2),
+        ),
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.input_feat_size = input_feat_size
+        self.hidden_size = hidden_size
+        self.rms_norm_eps = rms_norm_eps
+        self.vocab_size = vocab_size
+        self.vocab_offset = vocab_offset
+        self.gradient_clipping = gradient_clipping
+        self.conf_attention_chunk_size = conf_attention_chunk_size
+        self.conf_attention_context_left = conf_attention_context_left
+        self.conf_attention_context_right = conf_attention_context_right
+        self.conf_attention_logit_cap = conf_attention_logit_cap
+        self.conf_num_attention_heads = conf_num_attention_heads
+        self.conf_num_hidden_layers = conf_num_hidden_layers
+        self.conf_conv_kernel_size = conf_conv_kernel_size
+        self.conf_reduction_factor = conf_reduction_factor
+        self.conf_residual_weight = conf_residual_weight
+        self.sscp_conv_channel_size = sscp_conv_channel_size
+        self.sscp_conv_group_norm_eps = sscp_conv_group_norm_eps
+        self.sscp_conv_kernel_size = sscp_conv_kernel_size
+        self.sscp_conv_stride_size = sscp_conv_stride_size
+
+
+class Gemma3nVisionConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration for a timm backbone [`TimmWrapper`]. It is used to
+    instantiate a timm model according to the specified arguments, defining the model architecture.
+    Instantiating a configuration with the defaults will yield a similar configuration to that of the Gemma 3n E4B
+    vision tower, e.g. [google/gemma-3n-E4B](https://huggingface.co/google/gemma-3n-E4B).
+
+    Configuration objects inherit from [`Gemma3nVisionConfig`] and can be used to control the model outputs. Read the
+    documentation from [`Gemma3nVisionConfig`] for more information.
+
+    Config loads imagenet label descriptions and stores them in `id2label` attribute, `label2id` attribute for default
+    imagenet models is set to `None` due to duplicates in the label descriptions.
+
+    Args:
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        do_pooling (`bool`, *optional*, defaults to `False`):
+            Whether to do pooling for the last_hidden_state in `TimmWrapper` or not.
+        architecture (`str`, *optional*, defaults to `"mobilenetv5_300m_enc"`):
+            Determines vision architecture for TimmWrapper.
+        hidden_size (`int`, *optional*, defaults to 2048):
+            Dimension of the hidden representations.
+        vocab_size (`int`, *optional*, defaults to 128):
+            Vocabulary size of the additional hard-token embeddings for vision model.
+        vocab_offset (`int`, *optional*, defaults to 262144):
+            Offset between the tokenizer vocab index for the token ids embedded by `Gemma3nMultimodalEmbedder` and the
+            0-indexed `Gemma3nMultimodalEmbedder.embedding` table.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
+            The epsilon used by the rms normalization layers.
+
+    Example:
+    ```python
+    >>> from transformers import Gemma3nVisionConfig, TimmWrapper
+
+    >>> # Initializing a TimmWrapper gemma3n_vision-E4B-style configuration
+    >>> configuration = Gemma3nVisionConfig()
+
+    >>> # Initializing a gemma3n_vision-E4B-style TimmWrapper from the configuration
+    >>> model = TimmWrapper(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```
+    """
+
+    model_type = "gemma3n_vision"
+
+    def __init__(
+        self,
+        initializer_range: float = 0.02,
+        do_pooling: bool = False,
+        architecture: str = "mobilenetv5_300m_enc",
+        hidden_size: int = 2048,
+        vocab_size: int = 128,
+        vocab_offset: int = 262_144,
+        rms_norm_eps: float = 1e-06,
+        model_args: Optional[dict] = None,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.initializer_range = initializer_range
+        self.do_pooling = do_pooling
+        self.model_args = model_args  # named "model_args" for BC with timm
+        self.architecture = architecture
+        self.hidden_size = hidden_size
+        self.vocab_size = vocab_size
+        self.vocab_offset = vocab_offset
+        self.rms_norm_eps = rms_norm_eps
+
+    @classmethod
+    def from_dict(cls, config_dict: dict[str, Any], **kwargs):
+        label_names = config_dict.get("label_names")
+        is_custom_model = "num_labels" in kwargs or "id2label" in kwargs
+
+        # if no labels added to config, use imagenet labeller in timm
+        if label_names is None and not is_custom_model:
+            requires_backends(cls, ["timm"])
+            imagenet_subset = infer_imagenet_subset(config_dict)
+            if imagenet_subset:
+                dataset_info = ImageNetInfo(imagenet_subset)
+                synsets = dataset_info.label_names()
+                label_descriptions = dataset_info.label_descriptions(as_dict=True)
+                label_names = [label_descriptions[synset] for synset in synsets]
+
+        if label_names is not None and not is_custom_model:
+            kwargs["id2label"] = dict(enumerate(label_names))
+
+            # if all label names are unique, create label2id mapping as well
+            if len(set(label_names)) == len(label_names):
+                kwargs["label2id"] = {name: i for i, name in enumerate(label_names)}
+            else:
+                kwargs["label2id"] = None
+
+        # timm config stores the `num_classes` attribute in both the root of config and in the "pretrained_cfg" dict.
+        # We are removing these attributes in order to have the native `transformers` num_labels attribute in config
+        # and to avoid duplicate attributes
+        num_labels_in_kwargs = kwargs.pop("num_labels", None)
+        num_labels_in_dict = config_dict.pop("num_classes", None)
+
+        # passed num_labels has priority over num_classes in config_dict
+        kwargs["num_labels"] = num_labels_in_kwargs or num_labels_in_dict
+
+        # pop num_classes from "pretrained_cfg",
+        # it is not necessary to have it, only root one is used in timm
+        if "pretrained_cfg" in config_dict and "num_classes" in config_dict["pretrained_cfg"]:
+            config_dict["pretrained_cfg"].pop("num_classes", None)
+
+        return super().from_dict(config_dict, **kwargs)
+
+    def to_dict(self) -> dict[str, Any]:
+        output = super().to_dict()
+        output["num_classes"] = self.num_labels
+        output["label_names"] = list(self.id2label.values())
+        output.pop("id2label", None)
+        output.pop("label2id", None)
+        return output
+
+
+class Gemma3nConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Gemma3nForConditionalGeneration`]. It is used to
+    instantiate a Gemma3nForConditionalGeneration according to the specified arguments, defining the model
+    architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
+    Gemma3n-E4B.
+
+    e.g. [google/gemma-3n-E4B](https://huggingface.co/google/gemma-3n-E4B)
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        text_config (`Union[Gemma3nTextConfig, dict]`, *optional*):
+            The config object of the text backbone. A dict is expanded into a `Gemma3nTextConfig`; when omitted, a
+            default `Gemma3nTextConfig` is used.
+        vision_config (`Union[Gemma3nVisionConfig, dict]`,  *optional*):
+            Custom vision config or dict. A dict is expanded into a `Gemma3nVisionConfig`; when omitted, a default
+            `Gemma3nVisionConfig` is used.
+        audio_config (`Union[Gemma3nAudioConfig, dict]`,  *optional*):
+            Custom audio config or dict. A dict is expanded into a `Gemma3nAudioConfig`; when omitted, a default
+            `Gemma3nAudioConfig` is used.
+        audio_soft_tokens_per_image (`int`, *optional*, defaults to 188):
+            The number of soft tokens per audio clip.
+        vision_soft_tokens_per_image (`int`, *optional*, defaults to 256):
+            The number of soft tokens per image.
+        boi_token_id (`int`, *optional*, defaults to 255999):
+            The begin-of-image token index to wrap the image prompt.
+        eoi_token_id (`int`, *optional*, defaults to 262144):
+            The end-of-image token index to wrap the image prompt.
+        image_token_id (`int`, *optional*, defaults to 262145):
+            The image token index to encode the image prompt.
+        boa_token_id (`int`, *optional*, defaults to 256000):
+            The begin-of-audio token index to wrap the audio prompt.
+        eoa_token_id (`int`, *optional*, defaults to 262272):
+            The end-of-audio token index to wrap the audio prompt.
+        audio_token_id (`int`, *optional*, defaults to 262273):
+            The audio token index to encode the audio prompt.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+
+
+    Example:
+
+    ```python
+    >>> from transformers import (
+    ...     Gemma3nAudioConfig,
+    ...     Gemma3nConfig,
+    ...     Gemma3nForConditionalGeneration,
+    ...     Gemma3nTextConfig,
+    ...     Gemma3nVisionConfig,
+    ... )
+
+    >>> # Initializing a MobileNet vision config, which is loaded from TIMM
+    >>> vision_config = Gemma3nVisionConfig()
+
+    >>> # Initializing a Gemma3n Audio config
+    >>> audio_config = Gemma3nAudioConfig()
+
+    >>> # Initializing a Gemma3n Text config
+    >>> text_config = Gemma3nTextConfig()
+
+    >>> # Initializing a Gemma3n-E4B style configuration
+    >>> configuration = Gemma3nConfig(text_config, vision_config, audio_config)
+
+    >>> # Initializing a model from the Gemma3n-E4B style configuration
+    >>> model = Gemma3nForConditionalGeneration(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "gemma3n"
+    sub_configs = {
+        "text_config": Gemma3nTextConfig,
+        "vision_config": Gemma3nVisionConfig,
+        "audio_config": Gemma3nAudioConfig,
+    }
+
+    def __init__(
+        self,
+        text_config: Optional[Union[Gemma3nTextConfig, dict[str, Any]]] = None,
+        vision_config: Optional[Union[Gemma3nVisionConfig, dict[str, Any]]] = None,
+        audio_config: Optional[Union[Gemma3nAudioConfig, dict[str, Any]]] = None,
+        audio_soft_tokens_per_image: int = 188,
+        vision_soft_tokens_per_image: int = 256,
+        boi_token_id: int = 255_999,
+        eoi_token_id: int = 262_144,
+        image_token_id: int = 262_145,
+        boa_token_id: int = 256_000,
+        eoa_token_id: int = 262_272,
+        audio_token_id: int = 262_273,
+        initializer_range: float = 0.02,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        # Each sub-config may arrive as a ready config object (kept as is), a plain dict (expanded into the
+        # matching config class) or None (replaced by that class's defaults, with an info-level log message).
+        if isinstance(text_config, dict):
+            text_config = Gemma3nTextConfig(**text_config)
+        elif text_config is None:
+            text_config = Gemma3nTextConfig()
+            logger.info("text_config is None. Using default Gemma3nTextConfig.")
+
+        if isinstance(vision_config, dict):
+            vision_config = Gemma3nVisionConfig(**vision_config)
+        elif vision_config is None:
+            vision_config = Gemma3nVisionConfig()
+            logger.info("vision_config is None. Using default Gemma3nVisionConfig.")
+
+        if isinstance(audio_config, dict):
+            audio_config = Gemma3nAudioConfig(**audio_config)
+        elif audio_config is None:
+            audio_config = Gemma3nAudioConfig()
+            logger.info("audio_config is None. Using default Gemma3nAudioConfig.")
+
+        self.text_config = text_config
+        self.vision_config = vision_config
+        self.audio_config = audio_config
+
+        self.audio_soft_tokens_per_image = audio_soft_tokens_per_image
+        self.vision_soft_tokens_per_image = vision_soft_tokens_per_image
+        self.boi_token_id = boi_token_id
+        self.eoi_token_id = eoi_token_id
+        self.image_token_id = image_token_id
+        self.boa_token_id = boa_token_id
+        self.eoa_token_id = eoa_token_id
+        self.audio_token_id = audio_token_id
+        self.initializer_range = initializer_range
+
+
+__all__ = ["Gemma3nAudioConfig", "Gemma3nConfig", "Gemma3nTextConfig", "Gemma3nVisionConfig"]
diff --git a/phivenv/Lib/site-packages/transformers/models/gemma3n/feature_extraction_gemma3n.py b/phivenv/Lib/site-packages/transformers/models/gemma3n/feature_extraction_gemma3n.py
new file mode 100644
index 0000000000000000000000000000000000000000..4acab69cb1bb4d2144d49bac347542a5159d1a61
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/gemma3n/feature_extraction_gemma3n.py
@@ -0,0 +1,338 @@
+# coding=utf-8
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+from collections.abc import Sequence
+from typing import Optional, Union
+
+import numpy as np
+
+from ...feature_extraction_sequence_utils import SequenceFeatureExtractor
+from ...feature_extraction_utils import BatchFeature
+from ...utils import PaddingStrategy, TensorType, logging
+
+
+logger = logging.get_logger(__name__)
+
+
+def create_fb_matrix(
+    n_freqs: int,
+    f_min: float,
+    f_max: float,
+    n_mels: int,
+    sample_rate: int,
+    fft_length: int,
+    norm: Optional[str] = None,
+) -> np.ndarray:
+    r"""Build a triangular mel filterbank mapping linear-frequency bins to mel bands (NumPy version).
+
+    Args:
+        n_freqs (int): Number of linear-frequency bins (rows of the result)
+        f_min (float): Lowest filterbank edge (Hz)
+        f_max (float): Highest filterbank edge (Hz)
+        n_mels (int): Number of mel filters (columns of the result)
+        sample_rate (int): Sample rate of the audio waveform
+        fft_length (int): FFT length; bin ``i`` sits at ``i * sample_rate / fft_length`` Hz
+        norm (Optional[str]): If 'slaney', scale every filter by ``2 / (width of its band in Hz)``
+          (area normalization). (Default: ``None``)
+
+    Returns:
+        np.ndarray: Filterbank of size (``n_freqs``, ``n_mels``). Each column is one triangular
+        filter, so for a matrix A of size (..., ``n_freqs``) the mel-band values are
+        ``A @ create_fb_matrix(A.shape[-1], ...)``.
+
+    Raises:
+        ValueError: If ``norm`` is neither ``None`` nor ``'slaney'``.
+    """
+
+    if not (norm is None or norm == "slaney"):
+        raise ValueError("norm must be one of None or 'slaney'")
+
+    def _hz_to_mel(hz: float) -> float:
+        # hertz to mel(f) is 2595. * log10(1. + (f / 700.))
+        return 2595.0 * math.log10(1.0 + (hz / 700.0))
+
+    # Frequency (Hz) of every linear bin.
+    bin_hz = np.arange(n_freqs, dtype=np.float32) * (sample_rate / fft_length)
+
+    # n_mels + 2 band edges, equally spaced on the mel scale, converted back to Hz with
+    # mel to hertz(mel) = 700. * (10**(mel / 2595.) - 1.)
+    edges_mel = np.linspace(_hz_to_mel(f_min), _hz_to_mel(f_max), n_mels + 2)
+    edges_hz = 700.0 * (10 ** (edges_mel / 2595.0) - 1.0)
+
+    # Widths between neighbouring edges, shape (n_mels + 1,).
+    widths = np.diff(edges_hz)
+    # Signed distance from every bin to every edge, shape (n_freqs, n_mels + 2).
+    offsets = edges_hz[np.newaxis, :] - bin_hz[:, np.newaxis]
+
+    # The two sides of each triangle, both of shape (n_freqs, n_mels); a filter is the lower of
+    # its two sides, clamped at zero outside the band.
+    left_side = (-1.0 * offsets[:, :-2]) / widths[:-1]
+    right_side = offsets[:, 2:] / widths[1:]
+    floor = np.zeros(1, dtype=np.float32)
+    fb = np.maximum(floor, np.minimum(left_side, right_side))
+
+    if norm == "slaney":
+        # Slaney-style mel is scaled to be approx constant energy per channel
+        band_scale = 2.0 / (edges_hz[2:] - edges_hz[:-2])
+        fb = fb * band_scale[np.newaxis, :]
+
+    return fb
+
+
+def _unfold(array: np.ndarray, dimension: int, size: int, step: int) -> np.ndarray:
+    """A basic NumPy equivalent of PyTorch's unfold for 2D arrays along the last dim.
+
+    Args:
+        array (`np.ndarray`): 2D input of shape `(batch, time)`.
+        dimension (`int`): Dimension to unfold; only the last one (`-1` or `1`) is supported.
+        size (`int`): Number of samples in each window.
+        step (`int`): Number of samples between the starts of consecutive windows; must be positive.
+
+    Returns:
+        `np.ndarray`: Array of shape `(batch, num_frames, size)` with
+        `num_frames = (time - size) // step + 1`. When at least one window fits, this is a read-only strided
+        view that shares memory with `array`; otherwise it is a freshly allocated array with zero frames.
+
+    Raises:
+        ValueError: If `array` is not 2D, `dimension` is not the last dimension, or `step` is not positive.
+    """
+    if array.ndim != 2:
+        raise ValueError("This unfold implementation currently supports 2D arrays (batch, time).")
+    if dimension != -1 and dimension != array.ndim - 1:
+        raise ValueError("This unfold implementation only supports unfolding the last dimension.")
+    if step <= 0:
+        # step == 0 would divide by zero below; a negative step would build a negative-stride view that can
+        # address memory outside of `array`.
+        raise ValueError("step must be a positive integer.")
+
+    batch_size, original_length = array.shape
+    num_frames = (original_length - size) // step + 1
+
+    if num_frames <= 0:
+        # The input is shorter than a single window.
+        return np.zeros((batch_size, 0, size), dtype=array.dtype)
+
+    output_shape = (batch_size, num_frames, size)
+    output_strides = (array.strides[0], array.strides[1] * step, array.strides[1])
+
+    # Windows overlap in memory whenever step < size, so writing through the view would silently modify
+    # several frames at once; expose it read-only, as the NumPy documentation advises for `as_strided`.
+    return np.lib.stride_tricks.as_strided(array, shape=output_shape, strides=output_strides, writeable=False)
+
+
+class Gemma3nAudioFeatureExtractor(SequenceFeatureExtractor):
+    """An audio feature extractor for Universal Speech Models https://huggingface.co/papers/2303.01037.
+
+    Turns raw waveforms into log-mel spectrograms together with a frame-level mask.
+
+    Args:
+        feature_size (`int`, *optional*, defaults to 128):
+            The feature dimension of the extracted features (the number of mel filters).
+        sampling_rate (`int`, *optional*, defaults to 16000):
+            The sampling rate at which the audio files should be digitalized expressed in hertz (Hz).
+        padding_value (`float`, *optional*, defaults to 0.0):
+            Padding value used to pad the audio. Should correspond to silences.
+        return_attention_mask (`bool`, *optional*, defaults to `True`):
+            Whether to return the attention mask for the generated MEL spectrograms.
+        frame_length_ms (`float`, *optional*, defaults to 32.0):
+            The length of a frame in milliseconds.
+        hop_length_ms (`float`, *optional*, defaults to 10.0):
+            The hop between the starts of two consecutive frames, in milliseconds.
+        min_frequency (`float`, *optional*, defaults to 125.0):
+            The minimum frequency (in Hz) for the Mel filterbank.
+        max_frequency (`float`, *optional*, defaults to 7600.0):
+            The maximum frequency (in Hz) for the Mel filterbank.
+        preemphasis (`float`, *optional*, defaults to 0.97):
+            The preemphasis coefficient. Preemphasis is skipped when the value is not greater than 0.
+        preemphasis_htk_flavor (`bool`, *optional*, defaults to `True`):
+            Whether to use HTK-style preemphasis, in which the first sample of every frame is scaled by
+            `1 - preemphasis` instead of being differenced with the sample that precedes the frame.
+        fft_overdrive (`bool`, *optional*, defaults to `True`):
+            Whether to use FFT overdrive, i.e. to double the FFT length (which is otherwise the smallest power of
+            two that is not smaller than the frame length in samples).
+        dither (`float`, *optional*, defaults to 0.0):
+            Adds dithering. In other words, adds a small Gaussian noise to each frame.
+            E.g. use 0.0001 to add dithering with a normal distribution centered
+            around 0.0 with standard deviation 0.0001 (assuming [-1,+1] range of raw_speech).
+            The value 0.0 means no dithering.
+            Dithering has similar effect as `spectrogram(mel_floor=...)`. It reduces
+            the high log_mel_fbank values for signals with hard-zero sections,
+            when VAD cutoff is present in the signal.
+        input_scale_factor (`float`, *optional*, defaults to 1.0):
+            Scaling factor applied to the input waveform.
+        mel_floor (`float`, *optional*, defaults to 1e-05):
+            Minimum value for Mel spectrograms to avoid log(0).
+        per_bin_mean (`Optional[Sequence[float]]`, *optional*):
+            Mean values for per-bin normalization (`feature_size` values, subtracted from the log-mel features).
+        per_bin_stddev (`Optional[Sequence[float]]`, *optional*):
+            Standard deviation values for per-bin normalization (`feature_size` values, dividing the log-mel
+            features).
+    """
+
+    model_input_names = ["input_features", "input_features_mask"]
+
+    def __init__(
+        self,
+        feature_size: int = 128,
+        sampling_rate: int = 16_000,
+        padding_value: float = 0.0,
+        return_attention_mask: bool = True,
+        frame_length_ms: float = 32.0,
+        hop_length_ms: float = 10.0,
+        min_frequency: float = 125.0,
+        max_frequency: float = 7600.0,
+        preemphasis: float = 0.97,
+        preemphasis_htk_flavor: bool = True,
+        fft_overdrive: bool = True,
+        dither: float = 0.0,
+        input_scale_factor: float = 1.0,
+        mel_floor: float = 1e-5,
+        per_bin_mean: Optional[Sequence[float]] = None,
+        per_bin_stddev: Optional[Sequence[float]] = None,
+        **kwargs,
+    ):
+        super().__init__(
+            feature_size=feature_size,
+            sampling_rate=sampling_rate,
+            padding_value=padding_value,
+            return_attention_mask=return_attention_mask,
+            **kwargs,
+        )
+
+        self.min_frequency = min_frequency
+        self.max_frequency = max_frequency
+        self.preemphasis = preemphasis
+        self.preemphasis_htk_flavor = preemphasis_htk_flavor
+        self.fft_overdrive = fft_overdrive
+        self.dither = dither
+        self.input_scale_factor = input_scale_factor
+        # Frame and hop lengths converted from milliseconds to a whole number of samples.
+        self.frame_length = int(round(sampling_rate * frame_length_ms / 1000.0))
+        self.hop_length = int(round(sampling_rate * hop_length_ms / 1000.0))
+        self.mel_floor = np.array(mel_floor, dtype=np.float64)
+
+        # Smallest power of two that holds a frame, doubled when `fft_overdrive` is set.
+        fft_length = 2 ** math.ceil(math.log2(self.frame_length))
+        if self.fft_overdrive:
+            fft_length *= 2
+        self.fft_length = fft_length
+
+        # Hann window with `frame_length` points (the cosine period equals `frame_length` samples).
+        hann_arange = np.arange(self.frame_length, dtype=np.float32)
+        window = 0.5 * (1 - np.cos(2 * np.pi * hann_arange / self.frame_length))
+        self.window = window.astype(np.float32)
+
+        # Unnormalized mel filterbank of shape (fft_length // 2 + 1, feature_size).
+        self.mel_filters = create_fb_matrix(
+            n_freqs=self.fft_length // 2 + 1,
+            f_min=min_frequency,
+            f_max=max_frequency,
+            n_mels=feature_size,
+            sample_rate=self.sampling_rate,
+            norm=None,
+            fft_length=fft_length,
+        )
+
+        # Normalization statistics are reshaped to (1, 1, feature_size) so that they broadcast over the
+        # (batch, frames, feature_size) log-mel array.
+        if per_bin_mean is not None:
+            self.per_bin_mean = np.array(per_bin_mean).reshape(1, 1, feature_size)
+        else:
+            self.per_bin_mean = None
+
+        if per_bin_stddev is not None:
+            self.per_bin_stddev = np.array(per_bin_stddev).reshape(1, 1, feature_size)
+        else:
+            self.per_bin_stddev = None
+
+    def _extract_spectrogram(self, waveform: np.ndarray, attention_mask: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
+        """Compute the log-mel spectrogram of a single waveform and the matching frame-level mask.
+
+        The steps are, in order: optional dithering, optional input scaling, framing into windows of
+        `frame_length + 1` samples taken every `hop_length` samples, optional preemphasis (which consumes the extra
+        sample), Hann windowing, magnitude of the real FFT, projection onto the mel filterbank, natural logarithm
+        floored at `mel_floor`, and optional per-bin mean / standard deviation normalization.
+
+        Args:
+            waveform (`np.ndarray`):
+                One waveform, either 1D `(num_samples,)` or 2D with a leading dimension of size 1 (the result is
+                squeezed along axis 0).
+            attention_mask (`np.ndarray`):
+                Sample-level mask; presumably 1D with one entry per sample — TODO confirm against the caller.
+
+        Returns:
+            `tuple[np.ndarray, np.ndarray]`: The spectrogram of shape `(num_frames, feature_size)` and a boolean
+            mask holding at most `num_frames` entries.
+        """
+        if waveform.ndim == 1:  # If single waveform, add batch dimension
+            waveform = np.expand_dims(waveform, axis=0)
+
+        if self.dither > 0.0:
+            waveform = waveform + self.dither * np.random.randn(*waveform.shape).astype(waveform.dtype)
+
+        if self.input_scale_factor != 1.0:
+            waveform = waveform * self.input_scale_factor
+
+        # One extra sample per frame so that preemphasis can look one sample back.
+        frame_size_for_unfold = self.frame_length + 1
+
+        # NumPy equivalent of unfold for [B, NumFrames, frame_size_for_unfold]
+        frames_to_process = _unfold(waveform, dimension=-1, size=frame_size_for_unfold, step=self.hop_length)
+
+        # Every branch below yields frames of exactly `frame_length` samples.
+        if self.preemphasis > 0.0:
+            if self.preemphasis_htk_flavor:
+                first_in_frame = frames_to_process[..., :1] * (1.0 - self.preemphasis)
+                rest_in_frame = frames_to_process[..., 1:-1] - self.preemphasis * frames_to_process[..., :-2]
+                frames = np.concatenate([first_in_frame, rest_in_frame], axis=-1)
+            else:
+                frames = frames_to_process[..., 1:] - self.preemphasis * frames_to_process[..., :-1]
+        else:
+            frames = frames_to_process[..., :-1]
+
+        frames = frames * self.window  # Broadcasting window
+        stft = np.fft.rfft(frames, n=self.fft_length, axis=-1)
+
+        magnitude_spec = np.abs(stft)
+
+        mel_spec = np.matmul(magnitude_spec, self.mel_filters)
+        log_mel_spec = np.log(np.maximum(mel_spec, self.mel_floor))
+
+        if self.per_bin_mean is not None:
+            log_mel_spec = log_mel_spec - self.per_bin_mean  # Broadcasting
+
+        if self.per_bin_stddev is not None:
+            log_mel_spec = log_mel_spec / self.per_bin_stddev  # Broadcasting
+
+        mel_spectrogram = log_mel_spec.squeeze(0)
+        # Keep one mask entry per hop.
+        mask = attention_mask[:: self.hop_length].astype(bool)
+        # The strided mask has ceil(num_samples / hop_length) entries, whereas framing only yields
+        # (num_samples - frame_length - 1) // hop_length + 1 frames because a frame needs `frame_length + 1`
+        # samples. The mask is therefore a few entries longer than the spectrogram (3 were observed by the original
+        # authors) and is truncated to the number of frames.
+        return mel_spectrogram, mask[: mel_spectrogram.shape[0]]
+
+    def __call__(
+        self,
+        raw_speech: Union[np.ndarray, list[float], list[np.ndarray], list[list[float]]],
+        padding: Union[bool, str, PaddingStrategy] = "longest",
+        max_length: Optional[int] = 480_000,
+        truncation: bool = True,
+        pad_to_multiple_of: Optional[int] = 128,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        return_attention_mask: Optional[bool] = True,
+        **kwargs,
+    ) -> BatchFeature:
+        """Creates a batch of MEL spectrograms from the provided raw speech.
+
+        This implementation uses a different algorithm for windowing and preemphasis compared to the built-in
+        `transformers.audio_utils.spectrogram()` function that _will_ result in different outputs. Consider this
+        carefully when selecting an audio feature extractor, especially with pre-trained models.
+
+        Args:
+            raw_speech:
+                The audio for which MEL spectrograms are created.
+            padding (`Union[bool, str, PaddingStrategy]`, *optional*, defaults to `"longest"`):
+                The padding strategy to use for batches of audio with different lengths.
+            max_length (`int`, *optional*, defaults to 480000):
+                If provided, defines the maximum length of the audio to allow. Audio longer than this will be
+                truncated if `truncation=True`.
+            truncation (`bool`, *optional*, defaults to `True`):
+                Whether or not to truncate audio above `max_length`.
+            pad_to_multiple_of (`int`, *optional*, defaults to 128):
+                When padding, pad to a multiple of this value. The default value is defined for optimal TPU support.
+            return_tensors (`Union[str, TensorType]`, *optional*, defaults to `None`):
+                The type of tensors to return (e.g., NumPy, Torch, JAX, TensorFlow).
+            return_attention_mask (`bool`, *optional*, defaults to `True`):
+                Whether to return the attention mask for the generated MEL spectrograms.
+
+        Returns:
+            `BatchFeature`: Holds `input_features` (one float32 spectrogram per audio) and `input_features_mask`
+            (one frame-level mask per audio), converted according to `return_tensors`.
+        """
+
+        is_batched_numpy = isinstance(raw_speech, np.ndarray) and len(raw_speech.shape) > 1
+        # NOTE(review): `raw_speech[0]` is evaluated for any sequence, so an empty list raises IndexError here.
+        is_batched_sequence = isinstance(raw_speech, Sequence) and isinstance(raw_speech[0], (np.ndarray, Sequence))
+        is_batched = is_batched_numpy or is_batched_sequence
+
+        if is_batched:
+            # Each item is wrapped and transposed; a 1D item of length T becomes an array of shape (T, 1).
+            raw_speech = [np.asarray([rs]).T for rs in raw_speech]
+        elif not is_batched and not isinstance(raw_speech, np.ndarray):
+            raw_speech = np.asarray(raw_speech)
+
+        # NOTE(review): unlike the batched branch, the single input is wrapped without a transpose, so a 1D input
+        # of length T becomes shape (1, T) rather than (T, 1) — confirm that `self.pad` and the `speech.T` below
+        # handle this layout as intended.
+        if not is_batched:  # always return a batch
+            raw_speech = [np.asarray([raw_speech])]
+
+        batched_speech = self.pad(
+            BatchFeature({"input_features": raw_speech}),
+            padding=padding,
+            max_length=max_length,
+            truncation=truncation,
+            pad_to_multiple_of=pad_to_multiple_of,
+            return_attention_mask=return_attention_mask,
+        )
+
+        # Spectrograms are extracted one padded item at a time.
+        prepared_speech = []
+        prepared_speech_mask = []
+        for speech, mask in zip(batched_speech.input_features, batched_speech.attention_mask):
+            speech, mask = self._extract_spectrogram(speech.T, mask)
+            prepared_speech.append(speech.astype(np.float32))
+            prepared_speech_mask.append(mask)
+
+        return BatchFeature(
+            {"input_features": prepared_speech, "input_features_mask": prepared_speech_mask},
+            tensor_type=return_tensors,
+        )
+
+
+__all__ = ["Gemma3nAudioFeatureExtractor"]
diff --git a/phivenv/Lib/site-packages/transformers/models/gemma3n/modeling_gemma3n.py b/phivenv/Lib/site-packages/transformers/models/gemma3n/modeling_gemma3n.py
new file mode 100644
index 0000000000000000000000000000000000000000..737d288f82b5dcc71456d872e5f621fe634995c9
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/gemma3n/modeling_gemma3n.py
@@ -0,0 +1,2401 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/gemma3n/modular_gemma3n.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_gemma3n.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# coding=utf-8
+# Copyright 2025 Google Inc. HuggingFace Inc. team. All rights reserved.
+#
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import copy
+import math
+from collections.abc import Callable, Sequence
+from dataclasses import dataclass
+from typing import Optional, Union
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from ...activations import ACT2FN
+from ...cache_utils import Cache, DynamicCache
+from ...generation import GenerationMixin
+from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask
+from ...modeling_flash_attention_utils import FlashAttentionKwargs
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
+from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from ...processing_utils import Unpack
+from ...utils import ModelOutput, TransformersKwargs, auto_docstring, can_return_tuple, logging
+from ...utils.deprecation import deprecate_kwarg
+from ..auto import AutoModel
+from .configuration_gemma3n import Gemma3nAudioConfig, Gemma3nConfig, Gemma3nTextConfig, Gemma3nVisionConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Base class for Gemma3n outputs, with hidden states and attentions.
+    """
+)
+class Gemma3nModelOutputWithPast(BaseModelOutputWithPast):
+    r"""
+    past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
+        `(batch_size, num_heads, sequence_length, embed_size_per_head)`
+
+        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
+        `past_key_values` input) to speed up sequential decoding.
+    image_hidden_states (`torch.FloatTensor`, *optional*):
+        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
+        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
+    audio_hidden_states (`torch.FloatTensor`, *optional*):
+        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
+        audio_hidden_states of the model produced by the audio encoder and after projecting the last hidden state.
+    """
+
+    # Fields added on top of `BaseModelOutputWithPast`; both default to None.
+    image_hidden_states: Optional[torch.FloatTensor] = None
+
+    # NOTE(review): the documented size of `audio_hidden_states` mentions `num_images`, which looks copied from the
+    # image entry — confirm the actual audio layout before relying on it.
+    audio_hidden_states: Optional[torch.FloatTensor] = None
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Base class for Gemma3n causal language model (or autoregressive) outputs.
+    """
+)
+class Gemma3nCausalLMOutputWithPast(ModelOutput):
+    r"""
+    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+        Language modeling loss (for next-token prediction).
+    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.text_config.vocab_size)`):
+        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+    past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
+        `(batch_size, num_heads, sequence_length, embed_size_per_head)`
+
+        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
+        `past_key_values` input) to speed up sequential decoding.
+    image_hidden_states (`torch.FloatTensor`, *optional*):
+        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
+        image_hidden_states of the model produced by the vision encoder after projecting last hidden state.
+    audio_hidden_states (`torch.FloatTensor`, *optional*):
+        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
+        audio_hidden_states of the model produced by the audio encoder and after projecting the last hidden state.
+    """
+
+    # Every field is optional and defaults to None.
+    loss: Optional[torch.FloatTensor] = None
+    logits: Optional[torch.FloatTensor] = None
+    past_key_values: Optional[Union[list[torch.FloatTensor], Cache]] = None
+    hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    attentions: Optional[tuple[torch.FloatTensor]] = None
+    image_hidden_states: Optional[torch.FloatTensor] = None
+
+    # NOTE(review): the documented size of `audio_hidden_states` mentions `num_images`, which looks copied from the
+    # image entry — confirm the actual audio layout before relying on it.
+    audio_hidden_states: Optional[torch.FloatTensor] = None
+
+
+class Gemma3nRMSNorm(nn.Module):
+    """Root-mean-square normalization over the last dimension, computed in float32.
+
+    Args:
+        dim (`int`):
+            Size of the last dimension; length of the learnable scale.
+        eps (`float`, *optional*, defaults to 1e-6):
+            Value added to the mean of squares before taking the square root.
+        with_scale (`bool`, *optional*, defaults to `True`):
+            If `True`, `weight` is a learnable parameter initialised to ones. Otherwise `weight` is a
+            non-persistent scalar buffer equal to 1.0.
+    """
+
+    def __init__(self, dim: int, eps: float = 1e-6, with_scale: bool = True):
+        super().__init__()
+        self.eps = eps
+        self.with_scale = with_scale
+
+        if not self.with_scale:
+            self.register_buffer("weight", torch.tensor(1.0), persistent=False)
+        else:
+            self.weight = nn.Parameter(torch.ones(dim))
+
+    def _norm(self, x):
+        """Divide `x` by the root of its mean square (plus `eps`) along the last dimension."""
+        mean_square = x.pow(2).mean(-1, keepdim=True)
+        return x / torch.sqrt(mean_square + self.eps)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Normalize and scale `x` in float32, then cast the result back to the dtype of `x`."""
+        # Llama does x.to(float16) * w whilst Gemma2 is (x * w).to(float16)
+        # See https://github.com/huggingface/transformers/pull/29402
+        normalized = self._norm(x.float())
+        scaled = normalized * self.weight.float()
+        return scaled.type_as(x)
+
+    def extra_repr(self):
+        weight_shape = tuple(self.weight.shape)
+        return f"{weight_shape}, eps={self.eps}"
+
+
+# ==== Audio Encoder ====
+
+
+class Gemma3nAudioRelativePositionEmbedding(nn.Module):
+    def __init__(self, config: Gemma3nAudioConfig):
+        super().__init__()
+        self.config = config
+
+        self.num_heads = self.config.conf_num_attention_heads
+        self.channels = self.config.hidden_size
+        self.head_dim = self.channels // self.num_heads
+        self.max_backward = max(0, self.config.conf_attention_context_left - 1)
+        self.max_forward = self.config.conf_attention_context_right
+
+        self.pos_proj = nn.Linear(self.channels, self.num_heads * self.head_dim, bias=False)
+
+        min_timescale = 1.0
+        max_timescale = 1.0e4
+        num_timescales = self.channels // 2
+        log_timescale_increment = math.log(float(max_timescale) / float(min_timescale)) / max(num_timescales - 1, 1)
+        inv_timescales = min_timescale * torch.exp(torch.arange(num_timescales) * -log_timescale_increment)
+        self.register_buffer(
+            "inv_timescales",
+            inv_timescales.float().unsqueeze(0).unsqueeze(0),
+            persistent=False,
+        )
+
+    def _get_timing_signal_1d_pos(self, position: torch.Tensor, dtype: torch.dtype) -> torch.Tensor:
+        position = position.float().unsqueeze(-1)
+        scaled_time = position * self.inv_timescales.to(device=position.device, dtype=torch.float32)
+        timing_signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=-1)
+        return timing_signal.type(dtype)
+
+    def _relative_shift(
+        self,
+        term_bd_before_shift: torch.Tensor,
+        batch_size: int,
+        num_heads: int,
+        num_query_blocks: int,
+        query_block_size: int,
+        key_context_size: int,
+        max_span_plus_1: int,
+    ) -> torch.Tensor:
+        """Performs the relative shift.
+
+        Args:
+          term_bd_before_shift: Tensor of shape [B, N, U, W, F_span]. batch_size
+            (B), num_heads (N), num_query_blocks (U), query_block_size (W),
+            key_context_size (C = W+L+R), max_span_plus_1 (F_span = L+R+1).
+
+        Returns:
+          Tensor of shape [B, N, U, W, C].
+        """
+        # term_bd_before_shift shape: [B, N, U, W, F_span]
+        # Target shape after shift:  [B, N, U, W, C]
+
+        # Padding amount for the last dimension (F_span) to become (C + 1)
+        # C = key_context_size
+        # F_span = max_span_plus_1
+        pad_amount_last_dim = (key_context_size + 1) - max_span_plus_1
+
+        # PyTorch F.pad expects (pad_left, pad_right, pad_top, pad_bottom ...)
+        # We only pad the last dimension on the right.
+        padding_tuple = (0, pad_amount_last_dim)
+
+        term_bd_padded = nn.functional.pad(term_bd_before_shift, padding_tuple)
+        # Shape after pad: [B, N, U, W, C+1]
+
+        # Reshape for slicing (emulating JAX's behavior)
+        # [B, N, U, W * (C+1)]
+        term_bd_reshaped = term_bd_padded.reshape(
+            (
+                batch_size,
+                num_heads,
+                num_query_blocks,
+                query_block_size * (key_context_size + 1),
+            )
+        )
+
+        # Slice to effective [B, N, U, W * C]
+        term_bd_sliced = term_bd_reshaped[:, :, :, : query_block_size * key_context_size]
+
+        # Reshape back to [B, N, U, W, C]
+        term_bd_shifted = term_bd_sliced.reshape(
+            (
+                batch_size,
+                num_heads,
+                num_query_blocks,
+                query_block_size,
+                key_context_size,
+            )
+        )
+        return term_bd_shifted
+
+    def forward(self, queries: torch.Tensor, keys: torch.Tensor) -> torch.Tensor:
+        """Builds attention logits as a content term plus a relative-position term.
+
+        Shape letters: B batch, U query blocks, W query block size, C key context
+        size, N heads, H head dim, F number of relative offsets
+        (`max_backward + max_forward + 1`).
+
+        Args:
+            queries: tensor unpacked as [B, U, W, N, H].
+            keys: tensor unpacked as [B, U, C, N, H].
+
+        Returns:
+            `term_ac + term_bd`: the query/key product of shape [B, N, U, W, C] plus
+            the query/position product after `_relative_shift` has mapped its last
+            axis from F to C.
+
+        Note:
+            The position table is rebuilt on every call from `self.max_backward`,
+            `self.max_forward` and `self.pos_proj`; nothing is cached here.
+        """
+        bsz, n_blocks, block_len, n_heads, dim_per_head = queries.shape
+        _, _, context_len, _, _ = keys.shape
+
+        # Relative offsets in descending order: max_backward, max_backward - 1, ..., -max_forward.
+        offsets = torch.arange(
+            self.max_backward,
+            -self.max_forward - 1,
+            -1,
+            device=queries.device,
+        ).unsqueeze(0)  # [1, F]
+        num_offsets = offsets.shape[1]
+
+        # Sinusoidal signal for every offset, projected and then split per head.
+        timing_signal = self._get_timing_signal_1d_pos(
+            offsets,
+            dtype=queries.dtype,
+        )
+        position_emb = self.pos_proj(timing_signal)  # [1, F, N*H]
+        position_emb = position_emb.reshape(1, num_offsets, self.num_heads, self.head_dim)
+        position_emb = position_emb.squeeze(0)  # [F, N, H]
+
+        # Put heads ahead of the block axes once; both products below reuse this view.
+        q_heads_first = queries.permute(0, 3, 1, 2, 4)  # [B, N, U, W, H]
+
+        # Content term (term_ac): each query against every key of its block context.
+        k_transposed = keys.permute(0, 3, 1, 4, 2)  # [B, N, U, H, C]
+        term_ac = torch.matmul(q_heads_first, k_transposed)  # [B, N, U, W, C]
+
+        # Position term (term_bd), the matmul form of
+        #   torch.einsum('buwnh,fnh->bnuwf', queries, position_emb)
+        # The (U, W) axes are flattened so that one matmul against [N, H, F]
+        # broadcasts over the batch axis.
+        q_flat = q_heads_first.reshape(
+            bsz, n_heads, n_blocks * block_len, dim_per_head
+        )
+        pos_transposed = position_emb.permute(1, 2, 0)  # [N, H, F]
+        term_bd = torch.matmul(q_flat, pos_transposed)  # [B, N, U*W, F]
+        term_bd = term_bd.reshape(
+            bsz,
+            n_heads,
+            n_blocks,
+            block_len,
+            num_offsets,
+        )  # [B, N, U, W, F]
+
+        # Convert the offset axis (F) into key-context positions (C).
+        term_bd = self._relative_shift(
+            term_bd,
+            bsz,
+            n_heads,
+            n_blocks,
+            block_len,
+            context_len,
+            num_offsets,
+        )  # [B, N, U, W, C]
+
+        return term_ac + term_bd
+
+
+class Gemma3nAudioAttention(nn.Module):
+    """Block-local multi-head attention with relative-position logits and a tanh soft cap.
+
+    Queries are split into blocks of `chunk_size` frames; each block is scored against a
+    context of `chunk_size + max_past_horizon + max_future_horizon` key/value frames.
+    """
+
+    def __init__(self, config: Gemma3nAudioConfig):
+        super().__init__()
+        self.config = config
+
+        self.num_heads = self.config.conf_num_attention_heads
+        self.hidden_size = self.config.hidden_size
+        self.head_dim = self.hidden_size // self.num_heads
+
+        self.chunk_size = self.config.conf_attention_chunk_size
+        self.max_future_horizon = self.config.conf_attention_context_right
+        self.max_past_horizon = max(0, self.config.conf_attention_context_left - 1)
+        self.attention_logits_soft_cap = self.config.conf_attention_logit_cap
+        self.context_size = self.chunk_size + self.max_past_horizon + self.max_future_horizon
+
+        self.relative_position_embedding = Gemma3nAudioRelativePositionEmbedding(config)
+        self.per_dim_scale = nn.Parameter(torch.zeros((self.head_dim,)))
+
+        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
+        self.k_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
+        self.v_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
+
+        # softplus(0) = ln 2; dividing by it makes softplus(per_dim_scale) contribute a factor of
+        # 1 (up to rounding) while `per_dim_scale` is still at its zero initialization.
+        q_scale = self.head_dim**-0.5
+        r_softplus_0 = 1.0 / torch.nn.functional.softplus(torch.tensor(0.0))
+        self.register_buffer("q_scale", (q_scale * r_softplus_0).clone().detach(), persistent=False)
+
+        # [W, C] mask, True at [w, c] iff w <= c <= w + max_past_horizon + max_future_horizon.
+        lower_causal_mask = torch.tril(
+            torch.ones((self.context_size, self.chunk_size), dtype=torch.bool),
+            diagonal=0,
+        ).T
+        upper_causal_mask = torch.tril(
+            torch.ones((self.chunk_size, self.context_size), dtype=torch.bool),
+            diagonal=self.max_past_horizon + self.max_future_horizon,
+        )
+        local_causal_valid_mask = torch.ones((self.chunk_size, self.context_size), dtype=torch.bool)
+        local_causal_valid_mask = local_causal_valid_mask * lower_causal_mask * upper_causal_mask
+        self.register_buffer("local_causal_valid_mask", local_causal_valid_mask, persistent=False)
+
+        self.register_buffer(
+            "softcap",
+            torch.tensor(self.attention_logits_soft_cap).float(),
+            persistent=False,
+        )
+
+    def _pad_dim1(self, x: torch.Tensor, pad_left: int, pad_right: int) -> torch.Tensor:
+        """Zero-pads `x` along dim 1 with `pad_left` leading and `pad_right` trailing entries."""
+        batch, _, *tail_shape = x.shape
+        left = x.new_zeros((batch, pad_left, *tail_shape))
+        right = x.new_zeros((batch, pad_right, *tail_shape))
+        x = torch.cat([left, x, right], dim=1)
+        return x
+
+    def _convert_to_block(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        """Turns a sequence to non overlapping blocks.
+
+        Args:
+            hidden_states: a tensor of [batch, time, ...].
+
+        Returns:
+            A tensor of [batch, num_blocks, block_size, ...], with necessary
+            paddings,
+            where output[:, i, ...] are x[:, i*block_size:(i+1)*block_size, ...].
+        """
+        shape = hidden_states.shape
+        b, t = shape[:2]
+        num_blocks = (t + self.chunk_size - 1) // self.chunk_size
+
+        if (padding_len := num_blocks * self.chunk_size - t) > 0:
+            hidden_states = self._pad_dim1(hidden_states, 0, padding_len)
+
+        permute_dims = (b, num_blocks, self.chunk_size) + shape[2:]
+        hidden_states = hidden_states.reshape(permute_dims).contiguous()
+        return hidden_states
+
+    def _extract_block_context(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        """Extracts temporal context for every block.
+
+        Args:
+            hidden_states: a tensor of [batch, time, ...].
+
+        Returns:
+            A tensor of [batch, num_blocks, context_size, ...], with necessary
+            paddings,
+            where context_size = block_size + left_context + right_context,
+            and output[:, i, ...] are x[:, start-left_context:end+right_context,
+            ...],
+            start = i * block_size, end = (i + 1) * block_size.
+        """
+        pad_left = self.max_past_horizon
+        # Zero-pad dim 1 so every block has a full window: `max_past_horizon` frames on the
+        # left and `max_future_horizon + chunk_size - 1` on the right (noted by the original
+        # authors as matching JAX `signal.frame` with pad_mode='valid').
+        pad_right = self.max_future_horizon + self.chunk_size - 1
+        hidden_states = self._pad_dim1(hidden_states, pad_left, pad_right)
+
+        frame_len = self.context_size
+        frame_step = self.chunk_size
+
+        # Slide a window of `frame_len` entries with stride `frame_step` along dim 1;
+        # `unfold` appends the window axis as the last dimension.
+        x_unfolded = hidden_states.unfold(dimension=1, size=frame_len, step=frame_step)
+
+        # A [B, T_padded] input yields [B, num_blocks, frame_len]. With trailing dims, e.g.
+        # [B, T_padded, N, H], the result is [B, num_blocks, N, H, frame_len]; the window axis is
+        # moved to dim 2 to give the [B, U, C, N, H] layout the relative position embedding takes.
+        if hidden_states.ndim > 2 and x_unfolded.ndim > 3:  # Check if inner dimensions (like N, H) exist
+            # Move the window axis (C) from last place to just after the block axis (U).
+            x_unfolded = torch.movedim(x_unfolded, source=-1, destination=2)
+
+        return x_unfolded.contiguous()
+
+    def forward(self, hidden_states: torch.Tensor, mask: torch.BoolTensor) -> torch.Tensor:
+        """Runs block-local attention over `hidden_states`.
+
+        `mask` is [B, T] and True at padded frames; returns [B, T, num_heads, head_dim].
+        """
+        # sl.Dense uses jax.numpy.einsum("...a,abcd->...bcd") and jax.numpy.select()
+        qkv_shape = (*hidden_states.shape[:-1], self.num_heads, self.head_dim)
+        query_states = self.q_proj(hidden_states).reshape(qkv_shape).contiguous()
+        key_states = self.k_proj(hidden_states).reshape(qkv_shape).contiguous()
+        value_states = self.v_proj(hidden_states).reshape(qkv_shape).contiguous()
+
+        per_dim_scale_sp = torch.nn.functional.softplus(self.per_dim_scale)
+
+        broadcast_shape = (1, 1, 1, self.head_dim)
+        per_dim_scale_sp_broadcast = per_dim_scale_sp.view(broadcast_shape)
+        query_states = query_states * self.q_scale * per_dim_scale_sp_broadcast
+
+        batch_size, q_time = query_states.shape[:2]
+
+        query_blocks = self._convert_to_block(query_states)
+        key_blocks = self._extract_block_context(key_states)
+        value_blocks = self._extract_block_context(value_states)
+        num_query_blocks = query_blocks.shape[1]
+
+        # 1. Create a mask indicating originally valid positions.
+        original_valid_mask = ~mask  # True for valid, False for padded
+
+        # 2. Extract blocks from this validity mask.
+        extracted_valid_mask_blocks = self._extract_block_context(original_valid_mask)
+
+        # NOTE(review): `_extract_block_context` as written returns [B, U, C] for a [B, T] mask,
+        # so this 4-D branch looks like a leftover from an earlier sub-framing variant — confirm
+        # before removing.
+        if (
+            extracted_valid_mask_blocks.ndim == 4
+            and extracted_valid_mask_blocks.shape[2] * extracted_valid_mask_blocks.shape[3] == self.context_size
+        ):
+            extracted_valid_mask_blocks = extracted_valid_mask_blocks.reshape(
+                batch_size, num_query_blocks, self.context_size
+            )
+        # Reject any other layout before it is broadcast against the logits.
+        if extracted_valid_mask_blocks.shape != (
+            batch_size,
+            num_query_blocks,
+            self.context_size,
+        ):
+            raise ValueError(
+                "Shape of extracted_valid_mask_blocks"
+                f" {extracted_valid_mask_blocks.shape} is not ({batch_size},"
+                f" {num_query_blocks}, {self.context_size}) after potential reshape."
+            )
+
+        # 3. Expand dimensions for broadcasting with logits and causal mask.
+        # Target shape for broadcasting with logits [B,N,U,W,C]
+        # extracted_valid_mask_blocks to [B, 1, U, 1, C]
+        condition_from_input_validity = extracted_valid_mask_blocks.unsqueeze(1).unsqueeze(-2)
+
+        # self.local_causal_valid_mask is [W, C], True where allowed by local window.
+        # Expand to [1, 1, 1, W, C]
+        condition_from_causality = self.local_causal_valid_mask.unsqueeze(0).unsqueeze(0).unsqueeze(0)
+
+        # 4. Combine the two conditions.
+        # final_condition will be True where a key is *both* originally valid *and* causally accessible.
+        # Broadcasts to [B, 1, U, W, C]
+        final_condition_for_where = torch.logical_and(
+            condition_from_input_validity,
+            condition_from_causality.to(condition_from_input_validity.device),  # Ensure same device
+        )
+
+        # Embed queries and keys
+        logits = self.relative_position_embedding(query_blocks, key_blocks)
+
+        # Apply attention logit softcap
+        # Ensure softcap is on the same device as logits
+        softcap_val = self.softcap.to(logits.device)
+        logits = logits / softcap_val
+        logits = torch.tanh(logits)
+        logits = logits * softcap_val
+
+        # Apply the combined mask.
+        # final_condition_for_where will broadcast with logits [B,N,U,W,C]
+        logits = torch.where(final_condition_for_where, logits, torch.finfo(logits.dtype).min)
+        probabilities = torch.nn.functional.softmax(logits, dim=-1, dtype=torch.float32).to(dtype=value_blocks.dtype)
+
+        # context_vectors is adapted from jax.numpy.einsum("BNuwc,BucNH->BuwNH", ...)
+        b_dim, n_dim, u_dim, w_dim, c_dim = probabilities.shape
+        h_dim = value_blocks.shape[-1]
+        prob_bun = probabilities.permute(0, 2, 1, 3, 4).reshape(-1, w_dim, c_dim)
+        v_bun = value_blocks.permute(0, 1, 3, 2, 4).reshape(-1, c_dim, h_dim)
+        result_bmm = torch.bmm(prob_bun, v_bun)
+        context_vectors = result_bmm.reshape(b_dim, u_dim, n_dim, w_dim, h_dim).permute(0, 1, 3, 2, 4)
+        context_vectors = context_vectors.reshape(
+            (
+                batch_size,
+                num_query_blocks * self.chunk_size,
+                self.num_heads,
+                self.head_dim,
+            )
+        )
+        context_vectors = context_vectors[:, :q_time]
+
+        return context_vectors
+
+
+class Gemma3nAudioCumulativeGroupNorm(nn.Module):
+    """Applies Group Normalization cumulatively over the time dimension.
+
+    For an input of shape `[B, T, *feature_dims, C]`, the statistics used at time
+    step `t` are accumulated over time steps `0..t` (dim 1) and over every
+    feature/channel element of those steps, independently per batch entry.
+
+    The variance is a running one: the squared deviations added at step `s` are
+    measured against the cumulative mean at step `s`, and are not revisited when
+    later steps change the mean.
+
+    This module takes no validity mask: every time step contributes to the
+    statistics and no output position is zeroed out.
+
+    Statistics are computed in float32 and the result is cast back to the input
+    dtype.
+
+    A learned scale (there is no bias) is applied per-channel (last dimension).
+    The original implementation notes describe the behavior as similar to JAX's
+    `GroupNormalization` with `num_groups=1` and `cumulative=True`.
+    """
+
+    def __init__(
+        self,
+        num_channels: int,  # Number of channels (size of the last dimension)
+        feature_dims: Sequence[int],  # Sizes of non-channel feature dimensions, e.g., (H, W) for input [B,T,H,W,C]
+        eps: float = 1e-3,
+    ):
+        super().__init__()
+        self.num_channels = num_channels
+        self.feature_dims = tuple(feature_dims)
+        self.eps = eps
+
+        # Scale parameter depends only on the channel dimension
+        self.weight = nn.Parameter(torch.ones(num_channels))
+
+        # Axes for normalization: all dimensions except Batch (0) and Time (1).
+        # For input [B, T, *feature_dims, C], these are dims from 2 onwards.
+        self.reduction_axes = tuple(range(2, 2 + len(self.feature_dims) + 1))
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        """Applies cumulative group norm.
+
+        Args:
+          hidden_states: Input tensor, shape [B, T, *feature_dims, C].
+
+        Returns:
+          Normalized tensor with the same shape and dtype as `hidden_states`.
+
+        Raises:
+          ValueError: If `hidden_states.shape[2:]` is not `feature_dims + (C,)`.
+        """
+        expected_input_suffix = self.feature_dims + (self.num_channels,)
+        if hidden_states.shape[2:] != expected_input_suffix:
+            raise ValueError(
+                f"Input tensor shape suffix {hidden_states.shape[2:]} does not match expected"
+                f" suffix (feature_dims + num_channels) {expected_input_suffix}"
+            )
+
+        input_dtype = hidden_states.dtype
+        # Calculations are performed in float32 for numerical stability.
+        calc_dtype = torch.float32
+        x_calc = hidden_states.to(calc_dtype)
+
+        # Cumulative Statistics Calculation
+        # 1. Sum of values over reduction axes at each time step.
+        sum_values_at_t = torch.sum(x_calc, dim=self.reduction_axes, keepdim=True)
+        # 2. Cumulative sum of values over time.
+        cum_sum_values = torch.cumsum(sum_values_at_t, dim=1)
+
+        # 3. Cumulative count of elements over time. Every time step holds the same
+        #    number of elements, so the count after step t is (t + 1) * elements_per_step;
+        #    it is built directly as a [1, T, 1, ..., 1] tensor that broadcasts over the
+        #    batch, instead of reducing a full-size tensor of ones.
+        num_steps = x_calc.shape[1]
+        elements_per_step = x_calc.shape[2:].numel()
+        steps_so_far = torch.arange(1, num_steps + 1, device=x_calc.device, dtype=calc_dtype)
+        count_view_shape = [1, num_steps] + [1] * (x_calc.dim() - 2)
+        cum_count_elements = (steps_so_far * elements_per_step).view(count_view_shape)
+        # Avoid division by zero when a feature dimension is empty.
+        safe_cum_count_elements = torch.clamp(cum_count_elements, min=1.0)
+
+        # 4. Cumulative mean.
+        cum_mean = cum_sum_values / safe_cum_count_elements
+
+        # 5. Sum of squared differences from the cumulative mean at each time step.
+        squared_diff_from_mean = (x_calc - cum_mean).pow(2)
+        sum_sq_diff_at_t = torch.sum(squared_diff_from_mean, dim=self.reduction_axes, keepdim=True)
+
+        # 6. Cumulative sum of squared differences over time.
+        cum_sum_sq_diff = torch.cumsum(sum_sq_diff_at_t, dim=1)
+
+        # 7. Cumulative variance.
+        cum_variance = cum_sum_sq_diff / safe_cum_count_elements
+
+        # Normalize the input using the calculated cumulative statistics:
+        # (x - E[x]) / sqrt(Var[x] + eps)
+        normalized_x = (x_calc - cum_mean) * torch.rsqrt(cum_variance + self.eps)
+
+        # Apply the per-channel scale (last dimension); there is no bias term.
+        scale = self.weight.to(calc_dtype)
+        # Reshape for broadcasting: [C] -> [1, ..., 1, C]
+        scale_view_shape = [1] * (hidden_states.dim() - 1) + [self.num_channels]
+        normalized_x = normalized_x * scale.view(scale_view_shape)
+
+        # Cast back so callers receive the dtype they passed in.
+        return normalized_x.to(input_dtype)
+
+
+class Gemma3nAudioSSCPConvBlock(nn.Module):
+    """One convolution stage of the SubSampleConvProjection front end.
+
+    A stage zero-pads its input by hand, runs a bias-free 2D convolution over
+    (time, frequency), normalizes the result with CumulativeGroupNorm and ends
+    with a ReLU.
+    """
+
+    def __init__(
+        self,
+        config: Gemma3nAudioConfig,
+        idx: int,
+        input_freq_dim: int,  # Changed from input_spatial_dim
+        manual_padding: tuple[int, int, int, int] = (0, 0, 0, 0),
+    ):
+        """Builds stage `idx` from the `sscp_conv_*` entries of `config`.
+
+        Args:
+            config: source of `sscp_conv_channel_size`, `sscp_conv_kernel_size`,
+                `sscp_conv_stride_size` and `sscp_conv_group_norm_eps`.
+            idx: position of this stage; stage 0 takes a single input channel,
+                later stages take the previous stage's output channels.
+            input_freq_dim: unpadded size of the frequency (last) axis of the input.
+            manual_padding: (freq_left, freq_right, time_top, time_bottom) amounts
+                handed to `F.pad` in `forward`.
+        """
+        super().__init__()
+        self.config = config
+        self.manual_padding = manual_padding
+
+        channel_sizes = self.config.sscp_conv_channel_size
+        c_in = 1 if idx == 0 else channel_sizes[idx - 1]
+        c_out = channel_sizes[idx]
+        k_time, k_freq = self.config.sscp_conv_kernel_size[idx]
+        s_time, s_freq = self.config.sscp_conv_stride_size[idx]
+
+        # The kernel's first axis runs over time, its second over frequency.
+        # Conv2d itself pads nothing; `forward` applies `manual_padding` first.
+        self.conv = nn.Conv2d(
+            c_in,
+            c_out,
+            kernel_size=(k_time, k_freq),
+            stride=(s_time, s_freq),
+            padding=(0, 0),
+            bias=False,
+        )
+
+        # Frequency size after the convolution, from the padded input width.
+        freq_left, freq_right = self.manual_padding[0], self.manual_padding[1]
+        padded_freq = input_freq_dim + freq_left + freq_right
+        self.norm = Gemma3nAudioCumulativeGroupNorm(
+            num_channels=c_out,
+            feature_dims=((padded_freq - k_freq) // s_freq + 1,),
+            eps=self.config.sscp_conv_group_norm_eps,
+        )
+
+        self.activation = nn.ReLU()
+
+    def forward(self, audio_encodings: torch.Tensor) -> torch.Tensor:
+        """Maps [B, C_in, T_in, F_in] to [B, C_out, T_out, F_out]."""
+        padded = F.pad(audio_encodings, self.manual_padding, mode="constant", value=0.0)
+        conv_out = self.conv(padded.to(self.conv.weight.dtype))
+        # The norm expects channels last: [B, T_out, F_out, C_out].
+        channels_last = conv_out.permute(0, 2, 3, 1).contiguous()
+        normed = self.norm(channels_last)
+        # Restore the channels-first layout before the activation.
+        return self.activation(normed.permute(0, 3, 1, 2).contiguous())
+
+
+class Gemma3nAudioSubSampleConvProjection(nn.Module):
+    """Two `Gemma3nAudioSSCPConvBlock` stages followed by a linear projection.
+
+    The constructor derives each stage's manual padding and resulting frequency
+    size so the final linear layer can be sized as channels * frequency bins.
+
+    The stage count is hard-coded to two; the last entry of
+    `sscp_conv_channel_size` sizes the projection input.
+    """
+
+    def __init__(self, config: Gemma3nAudioConfig):
+        super().__init__()
+        self.config = config
+
+        freq_dim = config.input_feat_size  # frequency size entering the next stage
+        stage_paddings = []
+        stage_freq_dims = []  # frequency size leaving each stage
+
+        for stage in range(2):  # Assuming 2 conv layers as per sscp_conv_... arrays
+            kernel_t, kernel_f = config.sscp_conv_kernel_size[stage]
+            _stride_t, stride_f = config.sscp_conv_stride_size[stage]
+
+            # Time axis: pad only at the end, by kernel - 1 (the original notes call
+            # this JAX 'reverse_causal' padding).
+            # Frequency axis: fixed (1, 1); the original notes derive it from a JAX
+            # configuration with F_in=10, K_w=3, S_w=2 and warn that other
+            # kernel/stride/input sizes may need re-evaluation.
+            # Tuple order is (freq_left, freq_right, time_top, time_bottom).
+            padding = (1, 1, 0, kernel_t - 1)
+            stage_paddings.append(padding)
+
+            # Output frequency size for the padded input, assuming dilation 1.
+            freq_dim = (freq_dim + padding[0] + padding[1] - kernel_f) // stride_f + 1
+            stage_freq_dims.append(freq_dim)
+
+        # Stage 0 sees the raw feature size; stage 1 sees stage 0's output size.
+        self.conv_0 = Gemma3nAudioSSCPConvBlock(
+            config=config,
+            idx=0,
+            input_freq_dim=config.input_feat_size,
+            manual_padding=stage_paddings[0],
+        )
+        self.conv_1 = Gemma3nAudioSSCPConvBlock(
+            config=config,
+            idx=1,
+            input_freq_dim=stage_freq_dims[0],
+            manual_padding=stage_paddings[1],
+        )
+
+        # The projection consumes the flattened (frequency, channel) axes.
+        self.input_proj_in_features = config.sscp_conv_channel_size[-1] * stage_freq_dims[-1]
+        self.input_proj_linear = nn.Linear(self.input_proj_in_features, self.config.hidden_size, bias=False)
+
+    def forward(self, audio_encodings: torch.Tensor) -> torch.Tensor:
+        """Projects [B, T, F_in] features to [B, T_out, hidden_size].
+
+        Args:
+            audio_encodings: input of shape [B, T, F_in].
+
+        Returns:
+            The linear projection of the conv output after its frequency and
+            channel axes have been flattened together.
+        """
+        # Insert a singleton channel axis: [B, 1, T, F_in].
+        x = self.conv_0(audio_encodings.unsqueeze(1))
+        x = self.conv_1(x)
+        b, c_out, t_out, f_out = x.shape  # [B, C_out, T_out, F_out]
+        # Channels last, then merge (F_out, C_out) into one feature axis.
+        channels_last = x.permute(0, 2, 3, 1).contiguous()
+        flat = channels_last.view(b, t_out, f_out * c_out)
+        return self.input_proj_linear(flat)
+
+
+class Gemma3nAudioConformerAttention(nn.Module):
+    """Pre-norm attention sub-layer with value clamping and a residual connection."""
+
+    def __init__(self, config: Gemma3nAudioConfig):
+        super().__init__()
+        self.config = config
+        hidden_size = self.config.hidden_size
+        self.post_in_features = hidden_size
+        clip_value = torch.tensor(self.config.gradient_clipping)
+        self.register_buffer("gradient_clipping", clip_value, persistent=False)
+        self.pre_attn_norm = Gemma3nRMSNorm(hidden_size)
+        self.attn = Gemma3nAudioAttention(config)
+        self.post = nn.Linear(self.post_in_features, hidden_size, bias=False)
+        self.post_norm = Gemma3nRMSNorm(hidden_size)
+
+    def forward(self, audio_encodings: torch.Tensor, audio_mel_mask: torch.BoolTensor) -> torch.Tensor:
+        """Returns `audio_encodings` plus the normalized, projected attention output."""
+        skip = audio_encodings
+        limit = self.gradient_clipping
+        normed = self.pre_attn_norm(torch.clamp(audio_encodings, -limit, limit))
+        # The attention module returns [B, T, NumHeads, HeadDim]; merge the head axes.
+        attn_out = self.attn(normed, audio_mel_mask)
+        b, t, num_heads, head_dim = attn_out.shape
+        projected = self.post(attn_out.reshape(b, t, num_heads * head_dim))
+        projected = torch.clamp(projected, -limit, limit)
+        return skip + self.post_norm(projected)
+
+
+class Gemma3nAudioConformerFeedForward(nn.Module):
+    """Feed-forward sub-layer: clamp, norm, 4x SiLU MLP, clamp, norm, scaled residual."""
+
+    def __init__(self, config: Gemma3nAudioConfig):
+        super().__init__()
+        self.config = config
+        hidden_size = self.config.hidden_size
+
+        self.register_buffer("gradient_clipping", torch.tensor(self.config.gradient_clipping), persistent=False)
+
+        self.pre_layer_norm = Gemma3nRMSNorm(hidden_size)
+        self.ffw_layer_1 = nn.Linear(hidden_size, hidden_size * 4, bias=False)
+        self.ffw_layer_2 = nn.Linear(hidden_size * 4, hidden_size, bias=False)
+        self.post_layer_norm = Gemma3nRMSNorm(hidden_size)
+        # Kept as a plain tensor attribute; it is neither a parameter nor a registered buffer.
+        self.post_layer_scale = torch.tensor(self.config.conf_residual_weight)
+
+    def forward(self, audio_encodings: torch.Tensor) -> torch.Tensor:
+        limit = self.gradient_clipping
+        hidden = self.pre_layer_norm(torch.clamp(audio_encodings, -limit, limit))
+        hidden = self.ffw_layer_2(nn.functional.silu(self.ffw_layer_1(hidden)))
+        hidden = self.post_layer_norm(torch.clamp(hidden, -limit, limit))
+        return audio_encodings + (hidden * self.post_layer_scale)
+
+
+class Gemma3nAudioConformerLightConv1d(nn.Module):
+    """Light convolution sub-layer: norm, GLU-gated linear, causal depthwise conv, norm, SiLU, linear."""
+
+    def __init__(self, config: Gemma3nAudioConfig):
+        super().__init__()
+        self.config = config
+        hidden_size = self.config.hidden_size
+        kernel_size = self.config.conf_conv_kernel_size
+        norm_eps = self.config.rms_norm_eps
+
+        self.pre_layer_norm = Gemma3nRMSNorm(hidden_size, eps=norm_eps)
+        # Doubled width: GLU in `forward` halves it back to hidden_size.
+        self.linear_start = nn.Linear(hidden_size, hidden_size * 2, bias=False)
+        # groups == channels makes the convolution depthwise; padding is applied
+        # manually in `forward` so that it is left-only (causal).
+        self.depthwise_conv1d = nn.Conv1d(
+            hidden_size,
+            hidden_size,
+            kernel_size,
+            stride=1,
+            padding=0,
+            groups=hidden_size,
+            bias=False,
+        )
+        self.register_buffer("gradient_clipping", torch.tensor(self.config.gradient_clipping), persistent=False)
+        self.conv_norm = Gemma3nRMSNorm(hidden_size, eps=norm_eps)
+        self.linear_end = nn.Linear(hidden_size, hidden_size, bias=False)
+
+        self.causal_padding = kernel_size - 1
+
+    def forward(self, audio_encodings: torch.Tensor) -> torch.Tensor:
+        """Returns the input plus the convolution branch output."""
+        hidden = self.linear_start(self.pre_layer_norm(audio_encodings))
+        gated = torch.nn.functional.glu(hidden, dim=-1)
+        # Conv1d wants [B, D, T]; pad only on the left of the time axis.
+        conv_in = F.pad(gated.permute(0, 2, 1), (self.causal_padding, 0))
+        conv_out = self.depthwise_conv1d(conv_in)
+        conv_out = conv_out.permute(0, 2, 1)  # back to [B, T_out, D]
+        conv_out = torch.clamp(conv_out, -self.gradient_clipping, self.gradient_clipping)
+        branch = self.linear_end(nn.functional.silu(self.conv_norm(conv_out)))
+        return branch + audio_encodings
+
+
+class Gemma3nAudioConformerBlock(nn.Module):
+    """Conformer layer: feed-forward, attention, light conv, feed-forward, clamp, final norm."""
+
+    def __init__(self, config: Gemma3nAudioConfig):
+        super().__init__()
+        self.config = config
+
+        self.ffw_layer_start = Gemma3nAudioConformerFeedForward(config)
+        self.attention = Gemma3nAudioConformerAttention(config)
+        self.lconv1d = Gemma3nAudioConformerLightConv1d(config)
+        self.ffw_layer_end = Gemma3nAudioConformerFeedForward(config)
+        self.register_buffer("gradient_clipping", torch.tensor(config.gradient_clipping), persistent=False)
+        self.norm = Gemma3nRMSNorm(config.hidden_size)
+
+    def forward(self, audio_encodings: torch.Tensor, audio_mel_mask: torch.BoolTensor) -> torch.Tensor:
+        """Runs the sub-layers in order; `audio_mel_mask` is True at invalid frames."""
+        hidden = self.ffw_layer_start(audio_encodings)
+        hidden = self.attention(hidden, audio_mel_mask)
+        # Zero the invalid frames before the convolution mixes neighbouring time steps.
+        keep = (~audio_mel_mask).unsqueeze(-1).to(hidden.dtype)
+        hidden = self.lconv1d(hidden * keep)
+        hidden = self.ffw_layer_end(hidden)
+
+        limit = self.gradient_clipping
+        return self.norm(torch.clamp(hidden, -limit, limit))
+
+
+class Gemma3nAudioEncoder(PreTrainedModel):
+    """
+    An audio encoder based on the [Universal Speech Model](https://huggingface.co/papers/2303.01037) architecture.
+    """
+
+    config: Gemma3nAudioConfig
+
+    main_input_name = "audio_mel"
+
+    def __init__(self, config: Gemma3nAudioConfig):
+        super().__init__(config)
+        self.config = config
+
+        self.subsample_conv_projection = Gemma3nAudioSubSampleConvProjection(config)
+        self.conformer = nn.ModuleList(
+            [Gemma3nAudioConformerBlock(config) for _ in range(config.conf_num_hidden_layers)]
+        )
+
+    def forward(
+        self, audio_mel: torch.Tensor, audio_mel_mask: torch.BoolTensor
+    ) -> tuple[torch.Tensor, torch.BoolTensor]:
+        """Encodes a batch of MELs.
+
+        Args:
+            audio_mel: a torch.Tensor of shape [batch, num_frames, num_channels,
+              mel_bins].
+
+        Returns:
+            audio_encodings: a torch.Tensor of shape
+                `[batch_size, self.config.audio_soft_tokens_per_image,
+                self.config.audio_config.hidden_size]`
+            audio_mel_mask: a torch.BoolTensor of shape [batch, num_frames].
+        """
+        audio_encodings = self.subsample_conv_projection(audio_mel)  # audio_encodings: [B, T_sub, D]
+
+        # Subsample the input audio_mel_mask to match the time dimension of audio_encodings (T_sub)
+        t_sub = audio_encodings.shape[1]
+
+        time_stride_product = 1
+        for stride_pair_idx in range(len(self.config.sscp_conv_stride_size)):
+            time_stride_product *= self.config.sscp_conv_stride_size[stride_pair_idx][0]
+
+        # Create indices for gathering from the original mask.
+        # These indices map to original time steps corresponding to the start of each
+        # receptive field in the subsampled output.
+        indices = torch.arange(t_sub, device=audio_mel_mask.device) * time_stride_product
+        indices = torch.clamp(indices, max=audio_mel_mask.shape[1] - 1)  # Ensure indices are valid
+
+        # Expand indices for batch compatibility if B > 1 and indices is 1D.
+        if audio_mel_mask.ndim > 1 and indices.ndim == 1:
+            indices = indices.unsqueeze(0).expand(audio_mel_mask.shape[0], -1)  # [B, T_sub]
+        elif (
+            audio_mel_mask.ndim == indices.ndim
+            and audio_mel_mask.shape[0] == 1
+            and indices.shape[0] != 1
+            and t_sub == indices.shape[0]
+        ):
+            # Handle case where B=1 but indices became [T_sub] instead of [1, T_sub]
+            indices = indices.unsqueeze(0)
+
+        current_mask = torch.gather(audio_mel_mask, 1, indices)  # [B, T_sub]
+
+        for block in self.conformer:
+            audio_encodings = block(audio_encodings, current_mask)  # Pass the processed mask
+
+        if self.config.conf_reduction_factor > 1:
+            audio_encodings = audio_encodings[:, :: self.config.conf_reduction_factor]
+            # Reduce the mask as well
+            current_mask = current_mask[:, :: self.config.conf_reduction_factor]
+
+        audio_encodings = audio_encodings.masked_fill(current_mask.unsqueeze(-1), 0.0)
+        return audio_encodings, current_mask
+
+
+class Gemma3nTextScaledWordEmbedding(nn.Embedding):
+    """
+    This module overrides nn.Embeddings' forward by multiplying with embeddings scale.
+    """
+
+    def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: int, embed_scale: float = 1.0):
+        super().__init__(num_embeddings, embedding_dim, padding_idx)
+        self.register_buffer("embed_scale", torch.tensor(embed_scale), persistent=False)
+
+    def forward(self, input_ids: torch.Tensor):
+        return super().forward(input_ids) * self.embed_scale.to(self.weight.dtype)
+
+
+class Gemma3nTextLaurelBlock(nn.Module):
+    """Learned Augmented Residual Layer"""
+
+    def __init__(self, config: Gemma3nTextConfig):
+        super().__init__()
+        self.config = config
+
+        self.linear_left = nn.Linear(self.config.hidden_size, self.config.laurel_rank, bias=False)
+        self.linear_right = nn.Linear(self.config.laurel_rank, self.config.hidden_size, bias=False)
+        self.post_laurel_norm = Gemma3nRMSNorm(self.config.hidden_size, eps=self.config.rms_norm_eps)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        laurel_hidden_states: torch.Tensor = self.linear_left(hidden_states)
+        laurel_hidden_states: torch.Tensor = self.linear_right(laurel_hidden_states)
+        normed_laurel_hidden_states = self.post_laurel_norm(laurel_hidden_states)
+        return hidden_states + normed_laurel_hidden_states
+
+
+class Gemma3nTextMLP(nn.Module):
+    def __init__(self, config: Gemma3nTextConfig, layer_idx: int = 0):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.intermediate_size = config.intermediate_size[layer_idx]
+        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
+        self.act_fn = ACT2FN[config.hidden_activation]
+        self.activation_sparsity = config.activation_sparsity_pattern[layer_idx]
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        gate_proj = self.gate_proj(hidden_states)
+        if self.activation_sparsity > 0.0:
+            gate_proj = self._gaussian_topk(gate_proj)
+        activations = self.act_fn(gate_proj)
+        up_proj = self.up_proj(hidden_states)
+        down_proj = self.down_proj(activations * up_proj)
+        return down_proj
+
+    def _gaussian_topk(self, inputs: torch.Tensor) -> torch.Tensor:
+        target_sparsity_tensor = torch.tensor(self.activation_sparsity, dtype=torch.float32, device=inputs.device)
+        # normal_dist and std_multiplier are adapted from jax.scipy.stats.norm.ppf().
+        #
+        # References:
+        #   *   https://docs.jax.dev/en/latest/_autosummary/jax.scipy.stats.norm.ppf.html
+        #   *   https://pytorch.org/docs/stable/distributions.html#torch.distributions.normal.Normal
+        #   *   https://pytorch.org/docs/stable/distributions.html#torch.distributions.transformed_distribution.TransformedDistribution.icdf
+        normal_dist = torch.distributions.normal.Normal(0, 1)
+        std_multiplier: torch.Tensor = normal_dist.icdf(target_sparsity_tensor)
+        std_multiplier = std_multiplier.type(inputs.dtype)
+        inputs_mean = torch.mean(inputs, dim=-1, keepdim=True)
+        inputs_std = torch.std(inputs, dim=-1, keepdim=True, unbiased=False)
+        cutoff_x = inputs_mean + inputs_std * std_multiplier
+        return nn.functional.relu(inputs - cutoff_x)
+
+
+class Gemma3nTextAltUp(nn.Module):
+    """Alternating Updates (AltUp)
+
+    The AltUp module wraps transformer layers. The `predict` step modifies the
+    input to the transformer layer, and the `correct` step propagates the output
+    of the transformer layer to the sparsely updated dimensions.
+
+    See more in the research paper:
+
+    https://proceedings.neurips.cc/paper_files/paper/2023/file/f2059277ac6ce66e7e5543001afa8bb5-Paper-Conference.pdf
+    """
+
+    def __init__(self, config: Gemma3nTextConfig):
+        super().__init__()
+        self.config = config
+        self.correct_output_scale = nn.Parameter(torch.zeros(self.config.hidden_size))
+        self.correction_coefs = nn.Linear(self.config.altup_num_inputs, self.config.altup_num_inputs, bias=False)
+        self.prediction_coefs = nn.Linear(self.config.altup_num_inputs, self.config.altup_num_inputs**2, bias=False)
+        self.modality_router = nn.Linear(self.config.hidden_size, self.config.altup_num_inputs, bias=False)
+        self.router_norm = Gemma3nRMSNorm(self.config.hidden_size, eps=self.config.rms_norm_eps)
+        self.register_buffer("router_input_scale", torch.tensor(self.config.hidden_size**-1.0), persistent=False)
+
+    def compute_router_modalities(self, x: torch.Tensor) -> torch.Tensor:
+        router_inputs = self.router_norm(x) * self.router_input_scale
+        routed = self.modality_router(router_inputs)
+        return torch.tanh(routed.float()).type_as(x)
+
+    def predict(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        """Predicts the output of a layer using a trainable map.
+
+        Args:
+            hidden_states: A 4D tensor of shape `[num_altup_inputs, batch_size, num_tokens, hidden_size]` derived by
+                stacking the input embeddings and preprocessing the last `num_altup_inputs - 1` matrices.
+
+        Returns:
+            A 4D tensor of shape `[num_altup_inputs, batch_size, num_tokens, hidden_size]` containing the predictions.
+        """
+        modalities = self.compute_router_modalities(hidden_states[self.config.altup_active_idx])
+
+        if self.training and self.config.altup_coef_clip is not None:
+            self.prediction_coefs.weight.data.clamp_(-self.config.altup_coef_clip, self.config.altup_coef_clip)
+
+        # Project and then transpose all 2D matrices contained so that mulmat gives the correct result
+        all_coefs: torch.Tensor = (
+            self.prediction_coefs(modalities)
+            .reshape(*modalities.shape[:-1], self.config.altup_num_inputs, self.config.altup_num_inputs)
+            .permute(0, 1, 3, 2)
+        )
+
+        # permute hidden_states to [batch_size, num_tokens, hidden_size, altup_num_inputs]
+        predictions = torch.matmul(hidden_states.permute(1, 2, 3, 0), all_coefs)
+        predictions = predictions.permute(3, 0, 1, 2)  # undo the permute
+        predictions += hidden_states  # add the original input
+        return predictions.contiguous().type_as(hidden_states)
+
+    def correct(self, predictions: torch.Tensor, activated: torch.Tensor) -> torch.Tensor:
+        """Corrects the predictions relative to the
+
+        Args:
+            predictions: A 4D tensor of shape `[num_altup_inputs, batch_size, num_tokens, hidden_size]` derived by
+                stacking the input embeddings and preprocessing the last `num_altup_inputs - 1` matrices.
+            activated: A 3D tensor of shape `[batch_size, num_tokens, hidden_size]` containing the activated inputs.
+
+        Returns:
+            A 4D tensor of shape `[num_altup_inputs, batch_size, num_tokens, hidden_size]` correcting the original
+                predictions relative to the activated input embeddings.
+        """
+        modalities = self.compute_router_modalities(activated)
+        innovation = activated - predictions[self.config.altup_active_idx]  # (batch, num_tokens, hidden_size)
+        innovation = innovation.repeat(self.config.altup_num_inputs, 1, 1, 1)  # Repeat on dim0 to match predictions
+
+        if self.config.altup_coef_clip is not None:
+            self.correction_coefs.weight.data.clamp_(-self.config.altup_coef_clip, self.config.altup_coef_clip)
+
+        # all_coefs adapted from jax.numpy.einsum("...p,pi->...i", ...)
+        # Permute to (altup_num_inputs, batch_size, num_tokens) as the last dim is a scalar applied to each altup input
+        # and expand on dim1 for broadcastability
+        all_coefs: torch.Tensor = self.correction_coefs(modalities) + 1.0
+        all_coefs = all_coefs.permute(2, 0, 1).unsqueeze(-1)
+
+        corrected = torch.mul(innovation, all_coefs)
+        corrected += predictions  # add the original input
+        return corrected.contiguous().type_as(activated)
+
+    def forward(self, corrected: torch.Tensor) -> torch.Tensor:
+        """
+        This is only defined as the `forward` so that accelerate hooks can move correctly `correct_output_scale`
+        (which is a nn.Parameter, not a Module) between devices when offloading. It is otherwise only used in
+        `scale_corrected_output`
+        """
+        return (corrected.type_as(self.correct_output_scale) * self.correct_output_scale).type_as(corrected)
+
+    def scale_corrected_output(self, corrected: torch.Tensor) -> torch.Tensor:
+        """Scales the provided 3D tensor of shape [batch_size, num_tokens, hidden_size]."""
+        return self.forward(corrected)
+
+
+class Gemma3nTextRotaryEmbedding(nn.Module):
+    inv_freq: torch.Tensor  # fix linting for `register_buffer`
+
+    def __init__(self, config: Gemma3nTextConfig, device=None):
+        super().__init__()
+        # BC: "rope_type" was originally "type"
+        if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict):
+            self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
+        else:
+            self.rope_type = "default"
+        self.max_seq_len_cached = config.max_position_embeddings
+        self.original_max_seq_len = config.max_position_embeddings
+
+        self.config = config
+        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
+
+        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+        self.original_inv_freq = self.inv_freq
+
+    @torch.no_grad()
+    @dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
+    def forward(self, x, position_ids):
+        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
+        position_ids_expanded = position_ids[:, None, :].float()
+
+        device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
+        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
+            emb = torch.cat((freqs, freqs), dim=-1)
+            cos = emb.cos() * self.attention_scaling
+            sin = emb.sin() * self.attention_scaling
+
+        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
+
+
+def rotate_half(x):
+    """Rotates half the hidden dims of the input."""
+    x1 = x[..., : x.shape[-1] // 2]
+    x2 = x[..., x.shape[-1] // 2 :]
+    return torch.cat((-x2, x1), dim=-1)
+
+
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """
+    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
+    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
+    """
+    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+    if n_rep == 1:
+        return hidden_states
+    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
+    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
+
+
+def eager_attention_forward(
+    module: nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: Optional[torch.Tensor],
+    dropout: float = 0.0,
+    scaling: Optional[float] = None,
+    softcap: Optional[float] = None,
+    **kwargs,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    if scaling is None:
+        scaling = module.head_dim**-0.5
+
+    key_states = repeat_kv(key, module.num_key_value_groups)
+    value_states = repeat_kv(value, module.num_key_value_groups)
+
+    attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
+
+    if softcap is not None:
+        attn_weights = attn_weights / softcap
+        attn_weights = torch.tanh(attn_weights)
+        attn_weights = attn_weights * softcap
+    if attention_mask is not None:  # no matter the length, we just slice it
+        causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
+        attn_weights = attn_weights + causal_mask
+
+    # upcast attention to fp32
+    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
+    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
+    attn_output = torch.matmul(attn_weights, value_states)
+    attn_output = attn_output.transpose(1, 2).contiguous()
+    return attn_output, attn_weights
+
+
+def apply_rotary_pos_emb(
+    x: torch.Tensor,
+    cos: torch.Tensor,
+    sin: torch.Tensor,
+    position_ids: Optional[torch.Tensor] = None,
+    unsqueeze_dim: int = 1,
+):
+    """Applies Rotary Position Embedding to the query and key tensors.
+
+    Args:
+        x (`torch.Tensor`): The tensor to embed.
+        cos (`torch.Tensor`): The cosine part of the rotary embedding.
+        sin (`torch.Tensor`): The sine part of the rotary embedding.
+        position_ids (`torch.Tensor`, *optional*):
+            Deprecated and unused.
+        unsqueeze_dim (`int`, *optional*, defaults to 1):
+            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
+            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
+            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
+            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
+            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
+            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
+    Returns:
+        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
+    """
+    cos = cos.unsqueeze(unsqueeze_dim)
+    sin = sin.unsqueeze(unsqueeze_dim)
+    return (x * cos) + (rotate_half(x) * sin)
+
+
+class Gemma3nTextAttention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+    def __init__(self, config: Gemma3nTextConfig, layer_idx: int):
+        super().__init__()
+        self.is_sliding = config.layer_types[layer_idx] == "sliding_attention"
+        self.config = config
+        self.layer_idx = layer_idx
+        self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
+        self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
+        self.attention_dropout = self.config.attention_dropout
+        self.is_causal = True
+
+        self.q_proj = nn.Linear(
+            config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.k_proj = nn.Linear(
+            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.v_proj = nn.Linear(
+            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.o_proj = nn.Linear(
+            config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
+        )
+        self.sliding_window = config.sliding_window if self.is_sliding else None
+
+        self.q_norm = Gemma3nRMSNorm(dim=config.head_dim, eps=config.rms_norm_eps)
+        self.k_norm = Gemma3nRMSNorm(dim=config.head_dim, eps=config.rms_norm_eps)
+        self.v_norm = Gemma3nRMSNorm(dim=config.head_dim, eps=config.rms_norm_eps, with_scale=False)
+
+        first_kv_shared_layer_idx = self.config.num_hidden_layers - self.config.num_kv_shared_layers
+        self.is_kv_shared_layer = layer_idx >= first_kv_shared_layer_idx > 0
+        prev_layers = config.layer_types[:first_kv_shared_layer_idx]
+        if self.is_kv_shared_layer:
+            # For shared layers, find the last non-shared layer of the same type before sharing starts
+            self.kv_shared_layer_index = len(prev_layers) - 1 - prev_layers[::-1].index(config.layer_types[layer_idx])
+            self.store_full_length_kv = False
+        else:
+            self.kv_shared_layer_index = None
+            # For non-shared layers, store full-length kv if this is the last non-shared layer of its type
+            self.store_full_length_kv = layer_idx == len(prev_layers) - 1 - prev_layers[::-1].index(
+                config.layer_types[layer_idx]
+            )
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: torch.Tensor,
+        attention_mask: Optional[torch.Tensor],
+        past_key_values: Optional[Cache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
+        input_shape = hidden_states.shape[:-1]
+        hidden_shape = (*input_shape, -1, self.config.head_dim)
+
+        cos, sin = position_embeddings
+
+        query_states = self.q_proj(hidden_states).view(hidden_shape)
+        query_states = self.q_norm(query_states)
+        query_states = apply_rotary_pos_emb(query_states, cos, sin, unsqueeze_dim=2)
+        query_states = query_states.transpose(1, 2)
+
+        # For layers with shared KV (from kv sharing point onwards), we reuse the same keys/values states as the last non-sharing layer
+        if self.is_kv_shared_layer and past_key_values is not None:
+            key_states, value_states = past_key_values.shared_layers[self.kv_shared_layer_index]
+            # Device of past layer may be different from current one
+            key_states = key_states.to(query_states.device)
+            value_states = value_states.to(query_states.device)
+        else:
+            key_states = self.k_proj(hidden_states).view(hidden_shape)
+            key_states = self.k_norm(key_states)
+            key_states = apply_rotary_pos_emb(key_states, cos, sin, unsqueeze_dim=2)
+            key_states = key_states.transpose(1, 2)
+
+            value_states = self.v_proj(hidden_states).view(hidden_shape)
+            value_states = self.v_norm(value_states)
+            value_states = value_states.transpose(1, 2)
+
+        if past_key_values is not None:
+            # sin and cos are specific to RoPE models; cache_position needed for the static cache
+            cache_kwargs = {
+                "sin": sin,
+                "cos": cos,
+                "cache_position": cache_position,
+                "sliding_window": self.sliding_window,
+            }
+            if not self.is_kv_shared_layer:
+                key_states, value_states = past_key_values.update(
+                    key_states, value_states, self.layer_idx, cache_kwargs
+                )
+            if self.store_full_length_kv:
+                if not hasattr(past_key_values, "shared_layers"):
+                    past_key_values.shared_layers = {}
+                past_key_values.shared_layers[self.layer_idx] = key_states, value_states
+
+        attention_interface: Callable = eager_attention_forward
+        if self.config._attn_implementation != "eager":
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            dropout=self.attention_dropout if self.training else 0.0,
+            scaling=1.0,
+            sliding_window=self.sliding_window,
+            **kwargs,
+        )
+
+        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
+        attn_output = self.o_proj(attn_output)
+        return attn_output, attn_weights
+
+
+class Gemma3nTextDecoderLayer(GradientCheckpointingLayer):
+    def __init__(self, config: Gemma3nTextConfig, layer_idx: int):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.layer_idx = layer_idx
+        self.attention_type = config.layer_types[layer_idx]
+        self.self_attn = Gemma3nTextAttention(config, layer_idx)
+        self.mlp = Gemma3nTextMLP(config, layer_idx=layer_idx)
+        self.input_layernorm = Gemma3nRMSNorm(self.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = Gemma3nRMSNorm(self.hidden_size, eps=config.rms_norm_eps)
+        self.pre_feedforward_layernorm = Gemma3nRMSNorm(self.hidden_size, eps=config.rms_norm_eps)
+        self.post_feedforward_layernorm = Gemma3nRMSNorm(self.hidden_size, eps=config.rms_norm_eps)
+
+        self.hidden_size_per_layer_input = config.hidden_size_per_layer_input
+        self.act_fn = ACT2FN[config.hidden_activation]
+
+        self.altup = Gemma3nTextAltUp(config)
+        self.laurel = Gemma3nTextLaurelBlock(config)
+        self.per_layer_input_gate = nn.Linear(self.hidden_size, self.hidden_size_per_layer_input, bias=False)
+        self.per_layer_projection = nn.Linear(self.hidden_size_per_layer_input, self.hidden_size, bias=False)
+        self.post_per_layer_input_norm = Gemma3nRMSNorm(self.hidden_size, eps=config.rms_norm_eps)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings_global: torch.Tensor,
+        position_embeddings_local: torch.Tensor,
+        per_layer_input: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        output_attentions: Optional[bool] = False,
+        use_cache: Optional[bool] = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
+    ) -> tuple[torch.Tensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
+        predictions = self.altup.predict(hidden_states)
+        active_prediction = predictions[self.config.altup_active_idx]
+
+        active_prediction_normed = self.input_layernorm(active_prediction)
+        laurel_output = self.laurel(active_prediction_normed)
+
+        # apply global RoPE to non-sliding layer only
+        if self.self_attn.is_sliding:
+            position_embeddings = position_embeddings_local
+        else:
+            position_embeddings = position_embeddings_global
+
+        attn, self_attn_weights = self.self_attn(
+            hidden_states=active_prediction_normed,
+            position_embeddings=position_embeddings,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            output_attentions=output_attentions,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            **kwargs,
+        )
+        attn = self.post_attention_layernorm(attn)
+
+        attn_gated = active_prediction + attn
+        attn_laurel = (attn_gated + laurel_output) / math.sqrt(2)
+
+        attn_norm = self.pre_feedforward_layernorm(attn_laurel)
+        attn_ffw = self.mlp(attn_norm)
+        attn_ffw_norm = self.post_feedforward_layernorm(attn_ffw)
+        attn_ffw_laurel_gated = attn_laurel + attn_ffw_norm
+        corrected_predictions = self.altup.correct(predictions, attn_ffw_laurel_gated)
+
+        first_prediction = corrected_predictions[self.config.altup_active_idx].clone()
+        if self.config.altup_correct_scale:
+            first_prediction = self.altup.scale_corrected_output(first_prediction)
+
+        # per_layer_input_gate adapted from jax.numpy.einsum("btd,dp->btp", ...)
+        first_prediction = self.per_layer_input_gate(first_prediction)
+        first_prediction = self.act_fn(first_prediction)
+        first_prediction = torch.multiply(first_prediction, per_layer_input)
+
+        # per_layer_projection adapted from jax.numpy.einsum("btp,pd->btd", ...)
+        first_prediction = self.per_layer_projection(first_prediction)
+        first_prediction = self.post_per_layer_input_norm(first_prediction)
+        corrected_predictions[1:] += first_prediction
+
+        outputs = (corrected_predictions,)
+
+        if output_attentions:
+            outputs += (self_attn_weights,)
+
+        return outputs
+
+
+@auto_docstring
+class Gemma3nPreTrainedModel(PreTrainedModel):
+    config: Gemma3nConfig
+    base_model_prefix = ""
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["Gemma3nTextDecoderLayer"]
+    _skip_keys_device_placement = ["past_key_values"]
+    _supports_flash_attn = True
+    _supports_sdpa = True
+    _supports_flex_attn = True
+
+    _can_compile_fullgraph = True
+    _supports_attention_backend = True
+    _can_record_outputs = {
+        "hidden_states": Gemma3nTextDecoderLayer,
+        "attentions": Gemma3nTextAttention,
+    }
+
+    def _init_weights(self, module):
+        super()._init_weights(module)
+        if isinstance(module, Gemma3nAudioCumulativeGroupNorm):
+            module.weight.data.fill_(1.0)
+        elif isinstance(module, Gemma3nAudioAttention):
+            module.per_dim_scale.data.zero_()
+        elif isinstance(module, Gemma3nTextAltUp):
+            module.correct_output_scale.data.zero_()
+
+
+@auto_docstring(custom_intro="The base Gemma 3n language model without a language modeling head.")
+class Gemma3nTextModel(Gemma3nPreTrainedModel):
+    config: Gemma3nTextConfig
+
+    def __init__(self, config: Gemma3nTextConfig):  # builds embeddings, decoder layers, RoPE modules, AltUp and per-layer projections
+        super().__init__(config)
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+
+        # Gemma3n downcasts the below to bfloat16, causing sqrt(3072)=55.4256 to become 55.5. See https://github.com/huggingface/transformers/pull/29402
+        self.embed_tokens = Gemma3nTextScaledWordEmbedding(
+            config.vocab_size, config.hidden_size, self.padding_idx, embed_scale=self.config.hidden_size**0.5
+        )
+        self.layers = nn.ModuleList(
+            [Gemma3nTextDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+        )
+
+        self.norm = Gemma3nRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.rotary_emb = Gemma3nTextRotaryEmbedding(config=config)
+        self.gradient_checkpointing = False
+
+        # TODO (raushan): Fix this after RoPE refactor. For now we hack it by
+        # reassigning thetas when we want to create a local RoPE layer. Config
+        # defaults should hold values for global RoPE.
+        config = copy.deepcopy(config)  # private copy: the two overrides below must not reach the caller's config
+        config.rope_theta = config.rope_local_base_freq
+        config.rope_scaling = {"rope_type": "default"}
+        self.rotary_emb_local = Gemma3nTextRotaryEmbedding(config=config)
+        # NOTE: from here on `config` is the deep copy; the fields read below are not touched by the RoPE overrides.
+        self.hidden_size = config.hidden_size
+        self.hidden_size_per_layer_input = config.hidden_size_per_layer_input
+
+        self.embed_tokens_per_layer = Gemma3nTextScaledWordEmbedding(
+            config.vocab_size_per_layer_input,
+            config.num_hidden_layers * config.hidden_size_per_layer_input,
+            self.padding_idx,
+            embed_scale=config.hidden_size_per_layer_input**0.5,
+        )
+        # Maps a hidden state to `num_hidden_layers * hidden_size_per_layer_input` values (used in `project_per_layer_inputs`).
+        self.per_layer_model_projection = nn.Linear(
+            self.hidden_size,
+            config.num_hidden_layers * config.hidden_size_per_layer_input,
+            bias=False,
+        )
+
+        self.per_layer_projection_norm = Gemma3nRMSNorm(config.hidden_size_per_layer_input, eps=config.rms_norm_eps)
+        # One projection per additional AltUp stream: `range(1, altup_num_inputs)` leaves stream 0 without one.
+        self.altup_projections = nn.ModuleList(
+            [nn.Linear(self.hidden_size, self.hidden_size, bias=False) for _ in range(1, self.config.altup_num_inputs)]
+        )
+
+        self.altup_unembed_projections = nn.ModuleList(
+            [nn.Linear(self.hidden_size, self.hidden_size, bias=False) for _ in range(1, self.config.altup_num_inputs)]
+        )
+        # Constant scalars registered as non-persistent buffers: hidden_size**-0.5 and 1/sqrt(2).
+        self.register_buffer("per_layer_projection_scale", torch.tensor(self.hidden_size**-0.5), persistent=False)
+        self.register_buffer("per_layer_input_scale", torch.rsqrt(torch.tensor(2.0)), persistent=False)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        per_layer_inputs: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> BaseModelOutputWithPast:
+        r"""
+        per_layer_inputs (torch.Tensor, *optional*, defaults to None):
+            Pre-computed per-layer embeddings. If None, they are derived from input_ids if provided.
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+        if self.gradient_checkpointing and self.training and use_cache:
+            logger.warning_once(
+                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
+            )
+            use_cache = False
+        # With token ids, both embeddings are looked up from them; a caller-supplied `per_layer_inputs` is overwritten.
+        if input_ids is not None:
+            inputs_embeds = self.embed_tokens(input_ids)
+            per_layer_inputs = self.get_per_layer_inputs(input_ids)
+        # Combine the projection of `inputs_embeds` with `per_layer_inputs` (projection only when those are None).
+        per_layer_inputs = self.project_per_layer_inputs(inputs_embeds, per_layer_inputs)
+
+        if use_cache and past_key_values is None and not self.training:
+            past_key_values = DynamicCache(config=self.config)
+
+        if cache_position is None:
+            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+            cache_position = torch.arange(
+                past_seen_tokens,
+                past_seen_tokens + inputs_embeds.shape[1],
+                device=inputs_embeds.device,
+            )
+
+        if position_ids is None:
+            position_ids = cache_position.unsqueeze(0)
+
+        # `attention_mask` may already be a dict of masks keyed by attention type (e.g. prepared by `generate`); reuse it then
+        if not isinstance(causal_mask_mapping := attention_mask, dict):
+            # Prepare mask arguments
+            mask_kwargs = {
+                "config": self.config,
+                "input_embeds": inputs_embeds,
+                "attention_mask": attention_mask,
+                "cache_position": cache_position,
+                "past_key_values": past_key_values,
+                "position_ids": position_ids,
+            }
+            # Create the masks
+            causal_mask_mapping = {
+                "full_attention": create_causal_mask(**mask_kwargs),
+                "sliding_attention": create_sliding_window_causal_mask(**mask_kwargs),
+            }
+
+        # Stream 0 of the AltUp stack is the input embedding itself (no projection applied)
+        hidden_states_0 = inputs_embeds
+
+        # Compute rotary position embeddings once, with the global and with the local rotary module
+        position_embeddings_global = self.rotary_emb(hidden_states_0, position_ids)
+        position_embeddings_local = self.rotary_emb_local(hidden_states_0, position_ids)
+
+        # Expand to `altup_num_inputs` streams; each projected stream is rescaled to the RMS magnitude of stream 0
+        target_magnitude = torch.mean(hidden_states_0**2, dim=-1, keepdim=True) ** 0.5
+        epsilon_tensor = torch.tensor(1e-5)  # floor for the mean square so the division below never sees ~0
+
+        temp_hidden_states = [hidden_states_0]
+        for i in range(1, self.config.altup_num_inputs):
+            # altup_proj adapted from jax.numpy.einsum("btp,pd->btd", ...)
+            altup_proj = self.altup_projections[i - 1](hidden_states_0)
+            current_hidden_state = altup_proj.to(dtype=hidden_states_0.dtype, device=target_magnitude.device)
+            new_magnitude = torch.mean(current_hidden_state**2, dim=-1, keepdim=True)
+            new_magnitude = torch.sqrt(torch.maximum(new_magnitude, epsilon_tensor.to(target_magnitude.device)))
+            current_hidden_state = current_hidden_state * target_magnitude / new_magnitude
+            temp_hidden_states.append(current_hidden_state)
+
+        hidden_states = torch.stack(temp_hidden_states, dim=0)  # [num_altup_inputs, batch, seq_len, hidden_size]
+
+        # decoder layers
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+
+        for decoder_layer in self.layers[: self.config.num_hidden_layers]:
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+
+            causal_mask = causal_mask_mapping[decoder_layer.attention_type]
+            per_layer_input = per_layer_inputs[:, :, decoder_layer.layer_idx, :]
+
+            layer_outputs = decoder_layer(
+                hidden_states,
+                position_embeddings_global,
+                position_embeddings_local,
+                per_layer_input,
+                attention_mask=causal_mask,
+                position_ids=position_ids,
+                past_key_values=past_key_values,
+                output_attentions=output_attentions,
+                use_cache=use_cache,
+                cache_position=cache_position,
+                **kwargs,
+            )
+
+            hidden_states = layer_outputs[0]
+
+            if output_attentions:
+                all_self_attns += (layer_outputs[1],)
+
+        # add hidden states from the last decoder layer (but before reprojecting to stay consistent with layer output)
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+
+        # Collapse the AltUp streams into one: un-project streams 1.., match stream 0's RMS magnitude, then average
+        target_magnitude = torch.mean(hidden_states[0] ** 2, dim=-1, keepdim=True) ** 0.5
+        temp_hidden_states = [hidden_states[0]]
+        for i in range(1, self.config.altup_num_inputs):
+            # altup_unembed_projections adapted from jax.numpy.einsum("btp,pd->btd", ...)
+            altup_unemb_proj: torch.Tensor = self.altup_unembed_projections[i - 1](hidden_states[i])
+            current_hidden_state = altup_unemb_proj.to(dtype=hidden_states_0.dtype, device=target_magnitude.device)
+            new_magnitude = torch.mean(current_hidden_state**2, dim=-1, keepdim=True)
+            new_magnitude = torch.sqrt(torch.maximum(new_magnitude, epsilon_tensor.to(target_magnitude.device)))
+            current_hidden_state = current_hidden_state * target_magnitude / new_magnitude
+            temp_hidden_states.append(current_hidden_state)
+
+        hidden_states = torch.stack(temp_hidden_states)
+        hidden_states = torch.mean(hidden_states, dim=0)
+        hidden_states = self.norm(hidden_states)
+
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+        )
+
+    def get_per_layer_inputs(self, input_ids: torch.LongTensor) -> torch.Tensor:
+        """Look up per-layer embeddings, shaped `(*input_ids.shape, num_hidden_layers, hidden_size_per_layer_input)`."""
+        # The lookup returns one flat vector per token; the reshape splits it into one slice per decoder layer.
+        flat_embeds = self.embed_tokens_per_layer(input_ids)
+        layered_shape = (*input_ids.shape, self.config.num_hidden_layers, self.hidden_size_per_layer_input)
+        return flat_embeds.reshape(layered_shape)
+
+    def project_per_layer_inputs(
+        self,
+        inputs_embeds: torch.Tensor,
+        per_layer_inputs: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        """Project `inputs_embeds` into per-layer space and, when given, mix in `per_layer_inputs`."""
+        embeds_dtype = inputs_embeds.dtype
+        projected: torch.Tensor = self.per_layer_model_projection(inputs_embeds)
+        # Rescale in place with the `per_layer_projection_scale` buffer.
+        projected *= self.per_layer_projection_scale.to(dtype=embeds_dtype, device=projected.device)
+
+        # Unflatten the last axis into (num_hidden_layers, hidden_size_per_layer_input), then normalise.
+        layered_shape = (*inputs_embeds.shape[:-1], self.config.num_hidden_layers, self.hidden_size_per_layer_input)
+        projected = self.per_layer_projection_norm(projected.reshape(layered_shape))
+
+        # Without per-layer embeddings the normalised projection is the whole result.
+        if per_layer_inputs is None:
+            return projected
+
+        if per_layer_inputs.shape != projected.shape:
+            # per-layer inputs are sometimes padded with zeros, slice the relevant embeddings.
+            layer_count = self.config.num_hidden_layers
+            per_layer_inputs = per_layer_inputs[..., :layer_count, :]
+
+        mix_scale = self.per_layer_input_scale.to(dtype=embeds_dtype, device=projected.device)
+        return (projected + per_layer_inputs) * mix_scale
+
+
+@auto_docstring(custom_intro="The base Gemma 3n language model with a language modeling head.")
+class Gemma3nForCausalLM(Gemma3nPreTrainedModel, GenerationMixin):
+    _tied_weights_keys = ["lm_head.weight"]
+    _tp_plan = {"lm_head": "colwise_rep"}
+    _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
+    config: Gemma3nTextConfig
+    base_model_prefix = "model"
+    _checkpoint_conversion_mapping = {"model.language_model": "model"}
+
+    def __init__(self, config: Gemma3nTextConfig):  # wraps the base text model and adds the output projection
+        super().__init__(config)
+        self.model = Gemma3nTextModel(config)
+        self.vocab_size = config.vocab_size
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)  # "lm_head.weight" is in `_tied_weights_keys`
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
+        **kwargs,
+    ) -> CausalLMOutputWithPast:  # NOTE(review): the example below loads "google/gemma-2-9b", a Gemma 2 id — confirm the intended Gemma 3n checkpoint
+        r"""
+        Example:
+
+        ```python
+        >>> from transformers import AutoTokenizer, Gemma3nForCausalLM
+
+        >>> model = Gemma3nForCausalLM.from_pretrained("google/gemma-2-9b")
+        >>> tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b")
+
+        >>> prompt = "What is your favorite condiment?"
+        >>> inputs = tokenizer(prompt, return_tensors="pt")
+
+        >>> # Generate
+        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
+        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "What is your favorite condiment?"
+        ```"""
+
+        if self.training and self.config._attn_implementation != "eager":
+            logger.warning_once(
+                "It is strongly recommended to train Gemma3n models with the `eager` attention implementation "
+                f"instead of `{self.config._attn_implementation}`. Use `eager` with `AutoModelForCausalLM.from_pretrained('<path-to-checkpoint>', attn_implementation='eager')`."
+            )
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        # Run the base text model; its output exposes last_hidden_state, past_key_values, hidden_states and attentions
+        outputs: BaseModelOutputWithPast = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            cache_position=cache_position,
+            **kwargs,
+        )
+
+        hidden_states = outputs.last_hidden_state
+        # Only compute necessary logits: an int keeps the last `logits_to_keep` positions (0 keeps all), a tensor is used as the index
+        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+        logits = self.lm_head(hidden_states[:, slice_indices, :])
+        if self.config.final_logit_softcapping is not None:  # soft cap: cap * tanh(logits / cap)
+            logits = logits / self.config.final_logit_softcapping
+            logits = torch.tanh(logits)
+            logits = logits * self.config.final_logit_softcapping
+
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(logits, labels, self.vocab_size, **kwargs)
+
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+class Gemma3nMultimodalEmbedder(nn.Module):
+    """Embeds token ids or soft tokens for multimodal content into language model space."""
+
+    def __init__(
+        self,
+        multimodal_config: Union[Gemma3nAudioConfig, Gemma3nVisionConfig],
+        text_config: Gemma3nTextConfig,
+    ):
+        super().__init__()
+        # Sizes, eps and the vocabulary offset come from the modality config; only the output width comes from the text config.
+        self.multimodal_hidden_size = multimodal_config.hidden_size
+        self.eps = multimodal_config.rms_norm_eps
+        self.vocab_offset = multimodal_config.vocab_offset
+        self.vocab_size = multimodal_config.vocab_size
+        self.text_hidden_size = text_config.hidden_size
+        # Token ids go through `embedding` + `hard_embedding_norm`, soft tokens through `soft_embedding_norm`; both share the projection.
+        self.embedding = nn.Embedding(self.vocab_size, self.multimodal_hidden_size)
+        self.hard_embedding_norm = Gemma3nRMSNorm(self.multimodal_hidden_size, eps=self.eps)
+        self.soft_embedding_norm = Gemma3nRMSNorm(self.multimodal_hidden_size, eps=self.eps)
+        self.embedding_projection = nn.Linear(self.multimodal_hidden_size, self.text_hidden_size, bias=False)
+        self.embedding_post_projection_norm = Gemma3nRMSNorm(self.text_hidden_size, eps=self.eps, with_scale=False)
+
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        """Map multimodal token ids or soft tokens into the language model's embedding space.
+
+        Args:
+            input_ids: Token ids to embed; `vocab_offset` is subtracted before the table lookup, so values
+                should be in the range `[vocab_offset, vocab_offset + vocab_size)`.
+            inputs_embeds: Soft tokens (already-computed embeddings) to normalise and project instead of ids.
+
+        Returns:
+            The projected and normalised embeddings; their last dimension is the text hidden size.
+
+        Raises:
+            ValueError: If both or neither of `input_ids` and `inputs_embeds` are given.
+        """
+        has_ids, has_embeds = input_ids is not None, inputs_embeds is not None
+        if has_ids == has_embeds:
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+        if has_ids:
+            normed = self.hard_embedding_norm(self.embedding(input_ids - self.vocab_offset))
+        else:
+            normed = self.soft_embedding_norm(inputs_embeds)
+        return self.embedding_post_projection_norm(self.embedding_projection(normed))
+
+
+@auto_docstring(
+    custom_intro="""
+    The base Gemma 3n model comprising a vision backbone, an audio backbone, and a language model without a
+    language modeling head.
+    """
+)
+class Gemma3nModel(Gemma3nPreTrainedModel):
+    _checkpoint_conversion_mapping = {}
+    # we are filtering the logits/labels so we shouldn't divide the loss based on num_items_in_batch
+    accepts_loss_kwargs = False
+
+    def __init__(self, config: Gemma3nConfig):
+        super().__init__(config)
+        self.vision_tower = AutoModel.from_config(config=config.vision_config)
+        self.vocab_size = config.text_config.vocab_size
+        # The vision tower, language model and audio tower are each built from their sub-config via `AutoModel.from_config`.
+        language_model = AutoModel.from_config(config=config.text_config)
+        self.language_model = language_model
+        # -1 is stored when the config defines no pad token id.
+        self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1
+        self.vocab_size_per_layer_input = config.text_config.vocab_size_per_layer_input
+        self.audio_tower = AutoModel.from_config(config.audio_config)
+        self.embed_vision = Gemma3nMultimodalEmbedder(config.vision_config, config.text_config)
+        self.embed_audio = Gemma3nMultimodalEmbedder(config.audio_config, config.text_config)
+        self.post_init()
+
+    def get_input_embeddings(self):  # delegates to the wrapped language model
+        return self.language_model.get_input_embeddings()
+
+    def set_input_embeddings(self, value):  # forwards `value` to the wrapped language model's setter
+        self.language_model.set_input_embeddings(value)
+
+    def set_decoder(self, decoder):  # replaces the language model; the vision/audio modules are left as they are
+        self.language_model = decoder
+
+    def get_decoder(self):  # the decoder of this model is its language model
+        return self.language_model
+
+    def get_image_features(self, pixel_values: torch.Tensor) -> torch.Tensor:
+        """
+        Encodes images with the vision tower and embeds the resulting soft tokens into language model space.
+
+        Args:
+            pixel_values (`torch.FloatTensor` of shape `(batch_size, channels, height, width)`):
+               The tensors corresponding to the input images.
+
+        Returns:
+            image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`.
+        """
+        vision_config = self.config.vision_config
+        tower_output = self.vision_tower(pixel_values=pixel_values, do_pooling=False, return_dict=True)
+        # Convert from (batch, channels, height, width) to (batch, height * width, channels) where:
+        # height == width and height * width == Gemma3nConfig.vision_soft_tokens_per_image.
+        flat_shape = (
+            tower_output.last_hidden_state.shape[0],
+            vision_config.hidden_size,
+            self.config.vision_soft_tokens_per_image,
+        )
+        soft_tokens = tower_output.last_hidden_state.reshape(flat_shape).permute(0, 2, 1)
+        # Scale in place by sqrt(hidden_size), then normalize and embed the soft tokens into language model space.
+        soft_tokens *= vision_config.hidden_size**0.5
+        return self.embed_vision(inputs_embeds=soft_tokens)
+
+    def get_placeholder_mask(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        image_features: Optional[torch.FloatTensor] = None,
+        audio_features: Optional[torch.FloatTensor] = None,
+    ):
+        """
+        Returns `(image_mask, audio_mask)` placeholder masks derived from `input_ids` or `inputs_embeds`, each expanded to
+        the shape of `inputs_embeds`; raises `ValueError` if a mask and its feature tensor disagree in element count.
+        """
+        if input_ids is None:
+            special_image_mask = inputs_embeds == self.get_input_embeddings()(
+                torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device)
+            )
+            special_image_mask = special_image_mask.all(-1)
+            special_audio_mask = (
+                inputs_embeds
+                == self.get_input_embeddings()(
+                    torch.tensor(self.config.audio_token_id, dtype=torch.long, device=inputs_embeds.device)
+                )
+            ).all(-1)
+        else:
+            special_image_mask = input_ids == self.config.image_token_id
+            special_audio_mask = input_ids == self.config.audio_token_id
+        # Token counts are taken before each mask is broadcast over the embedding dimension.
+        n_image_tokens = special_image_mask.sum()
+        special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
+        if image_features is not None and inputs_embeds[special_image_mask].numel() != image_features.numel():
+            raise ValueError(
+                f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {image_features.shape[0] * image_features.shape[1]}"
+            )
+
+        n_audio_tokens = special_audio_mask.sum()
+        special_audio_mask = special_audio_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
+        if audio_features is not None and inputs_embeds[special_audio_mask].numel() != audio_features.numel():
+            raise ValueError(
+                f"Audio features and audio tokens do not match: tokens: {n_audio_tokens}, features {audio_features.shape[0] * audio_features.shape[1]}"
+            )
+
+        return special_image_mask, special_audio_mask
+
+    @can_return_tuple
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,  # text inputs
+        pixel_values: Optional[torch.FloatTensor] = None,  # vision inputs
+        input_features: Optional[torch.FloatTensor] = None,  # audio inputs
+        attention_mask: Optional[torch.Tensor] = None,
+        input_features_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Union[list[torch.FloatTensor], Cache]] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        **lm_kwargs,
+    ) -> Gemma3nModelOutputWithPast:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+            config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.
+
+        Example:
+
+        ```python
+        >>> from PIL import Image
+        >>> import requests
+        >>> from transformers import AutoProcessor, Gemma3nForConditionalGeneration
+
+        >>> model = Gemma3nForConditionalGeneration.from_pretrained("google/gemma3n2-3b-mix-224")
+        >>> processor = AutoProcessor.from_pretrained("google/gemma3n2-3b-mix-224")
+
+        >>> prompt = "Where is the cat standing?"
+        >>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> inputs = processor(images=image, text=prompt,  return_tensors="pt")
+
+        >>> # Generate
+        >>> generate_ids = model.generate(**inputs,)
+        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "Where is the cat standing?\nsnow"
+        ```
+        """
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+
+        if input_ids is not None:
+            inputs_embeds = self.get_input_embeddings()(input_ids)
+
+            # Prepare per-layer inputs from input_ids; ids outside [0, vocab_size_per_layer_input) are looked up as token 0
+            per_layer_inputs_mask = torch.logical_and(input_ids >= 0, input_ids < self.vocab_size_per_layer_input)
+            per_layer_inputs_tokens = torch.where(per_layer_inputs_mask, input_ids, torch.zeros_like(input_ids))
+            per_layer_inputs = self.language_model.get_per_layer_inputs(per_layer_inputs_tokens)
+
+            # Handle vision tokens (>= embed_vision.vocab_offset and < embed_audio.vocab_offset)
+            vision_mask = torch.logical_and(
+                input_ids >= self.embed_vision.vocab_offset, input_ids < self.embed_audio.vocab_offset
+            )
+            dummy_vision_token_id = self.embed_vision.vocab_offset + self.embed_vision.vocab_size - 1
+            vision_input_ids = torch.where(vision_mask, input_ids, dummy_vision_token_id).to(inputs_embeds.device)
+            vision_embeds = self.embed_vision(input_ids=vision_input_ids)
+            expanded_vision_mask = vision_mask.unsqueeze(-1).expand_as(inputs_embeds)
+            inputs_embeds = torch.where(expanded_vision_mask, vision_embeds, inputs_embeds)
+
+            # Handle audio tokens (>= embed_audio.vocab_offset)
+            audio_mask = input_ids >= self.embed_audio.vocab_offset
+            dummy_audio_token_id = self.embed_audio.vocab_offset + self.embed_audio.vocab_size - 1
+            audio_input_ids = torch.where(audio_mask, input_ids, dummy_audio_token_id).to(inputs_embeds.device)
+            audio_embeds = self.embed_audio(input_ids=audio_input_ids)
+            expanded_audio_mask = audio_mask.unsqueeze(-1).expand_as(inputs_embeds)
+            inputs_embeds = torch.where(expanded_audio_mask, audio_embeds, inputs_embeds)
+        else:
+            per_layer_inputs = None
+
+        # Merge text and images
+        if pixel_values is not None:
+            image_features = self.get_image_features(pixel_values)
+            image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
+            special_image_mask, _ = self.get_placeholder_mask(
+                input_ids, inputs_embeds=inputs_embeds, image_features=image_features
+            )
+            inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
+
+        # Merge text and audio (skipped unless both the features and their mask are provided)
+        if input_features is not None and input_features_mask is not None:
+            audio_features, audio_mask = self.get_audio_features(input_features, ~input_features_mask)
+
+            # The Gemma3nProcessor expects all audio will be 30s in length and inserts 188 audio soft tokens into the
+            # text to account for this. However, the audio preprocessing and encoder do not guarantee they will
+            # produce 188 soft tokens; they will produce at most that many tokens, but they may produce fewer tokens
+            # depending on the length of the longest audio input in the batch. When we encounter this situation, we pad
+            # the audio feature out to 188 soft tokens with the embedding of the last token in the embed_audio vocab.
+            audio_padding_toks = torch.tensor([[self.vocab_size - 1]], dtype=torch.long, device=audio_features.device)
+            audio_padding_embs = self.embed_audio(input_ids=audio_padding_toks)
+            audio_features = torch.where(audio_mask.unsqueeze(-1), audio_padding_embs, audio_features)
+
+            audio_batch_size, audio_seq_len, audio_embed_dim = audio_features.shape
+            extra_padding_tokens = self.config.audio_soft_tokens_per_image - audio_seq_len
+            extra_padding_features = audio_padding_embs.expand(audio_batch_size, extra_padding_tokens, audio_embed_dim)
+
+            audio_features = torch.cat((audio_features, extra_padding_features), dim=1)
+            audio_features = audio_features.to(inputs_embeds.device, inputs_embeds.dtype)
+            _, special_audio_mask = self.get_placeholder_mask(
+                input_ids, inputs_embeds=inputs_embeds, audio_features=audio_features
+            )
+            inputs_embeds = inputs_embeds.masked_scatter(special_audio_mask, audio_features)
+
+        outputs = self.language_model(
+            input_ids=None,
+            per_layer_inputs=per_layer_inputs,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=True,
+            cache_position=cache_position,
+            **lm_kwargs,
+        )
+
+        return Gemma3nModelOutputWithPast(
+            last_hidden_state=outputs.last_hidden_state,
+            past_key_values=outputs.past_key_values if use_cache else None,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            image_hidden_states=image_features if pixel_values is not None else None,
+            audio_hidden_states=audio_features if input_features is not None and input_features_mask is not None else None,
+        )
+
+    def get_audio_features(
+        self, input_features: torch.Tensor, input_features_mask: torch.Tensor
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """
+        Runs the audio tower and embeds its output into language model space.
+
+        Args:
+            input_features (`torch.FloatTensor` of shape `(num_images, seq_length, num_features)`):
+               The tensors corresponding to the input audio.
+            input_features_mask (`torch.Tensor` of shape `(num_images, seq_length)`):
+               The mask for the input audio; it is passed to the audio tower unchanged.
+
+        Returns:
+            `(audio_features, audio_mask)`: the embedded audio features and the mask returned by the audio tower.
+        """
+        encoder_states, encoder_mask = self.audio_tower(input_features, input_features_mask)
+        return self.embed_audio(inputs_embeds=encoder_states), encoder_mask
+
+
+@auto_docstring(
+    custom_intro="""
+    The base Gemma 3n model comprising a vision backbone, an audio backbone, a language model, and a language modeling
+    head.
+    """
+)
+class Gemma3nForConditionalGeneration(Gemma3nPreTrainedModel, GenerationMixin):
+    _checkpoint_conversion_mapping = {}
+    _tied_weights_keys = ["lm_head.weight"]
+    base_model_prefix = "model"
+
+    def __init__(self, config: Gemma3nConfig):
+        super().__init__(config)
+        self.model = Gemma3nModel(config)
+        self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.model.get_input_embeddings()
+
+    def set_input_embeddings(self, value):
+        self.model.set_input_embeddings(value)
+
+    def set_decoder(self, decoder):
+        self.model.set_decoder(decoder)
+
+    def get_decoder(self):
+        return self.model.get_decoder()
+
+    def get_image_features(self, pixel_values):
+        return self.model.get_image_features(pixel_values)
+
+    # Make modules available through conditional class for BC
+    @property
+    def language_model(self):
+        return self.model.language_model
+
+    @property
+    def vision_tower(self):
+        return self.model.vision_tower
+
+    @property
+    def multi_modal_projector(self):
+        raise AttributeError("Use embed_vision instead of multi_modal_projector.")
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,  # text inputs
+        pixel_values: Optional[torch.FloatTensor] = None,  # vision inputs
+        input_features: Optional[torch.FloatTensor] = None,  # audio inputs
+        attention_mask: Optional[torch.Tensor] = None,
+        input_features_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Union[list[torch.FloatTensor], Cache]] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
+        **lm_kwargs,
+    ) -> Gemma3nCausalLMOutputWithPast:
+        r"""
+        input_features_mask (torch.Tensor, *optional*, defaults to None):
+            The attention mask for the input audio.
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+            config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are
+            ignored (masked), the loss is only computed for the tokens with labels in
+            `[0, ..., config.text_config.vocab_size]`.
+
+        Example:
+
+        ```python
+        >>> from PIL import Image
+        >>> import requests
+        >>> from transformers import AutoProcessor, Gemma3ForConditionalGeneration
+
+        >>> model = Gemma3ForConditionalGeneration.from_pretrained("google/gemma-3-4b-it")
+        >>> processor = AutoProcessor.from_pretrained("google/gemma-3-4b-it")
+
+        >>> messages = [
+        ...     {
+        ...         "role": "system",
+        ...         "content": [
+        ...             {"type": "text", "text": "You are a helpful assistant."}
+        ...         ]
+        ...     },
+        ...     {
+        ...         "role": "user", "content": [
+        ...             {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"},
+        ...             {"type": "text", "text": "Where is the cat standing?"},
+        ...         ]
+        ...     },
+        ... ]
+
+        >>> inputs = processor.apply_chat_template(
+        ...     messages,
+        ...     tokenizer=True,
+        ...     return_dict=True,
+        ...     return_tensors="pt",
+        ...     add_generation_prompt=True
+        ... )
+        >>> # Generate
+        >>> generate_ids = model.generate(**inputs)
+        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "user\nYou are a helpful assistant.\n\n\n\n\n\nWhere is the cat standing?\nmodel\nBased on the image, the cat is standing in a snowy area, likely outdoors. It appears to"
+        ```
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+
+        outputs = self.model(
+            input_ids=input_ids,
+            pixel_values=pixel_values,
+            input_features=input_features,
+            attention_mask=attention_mask,
+            input_features_mask=input_features_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            token_type_ids=token_type_ids,
+            cache_position=cache_position,
+            inputs_embeds=inputs_embeds,
+            labels=labels,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=True,
+            **lm_kwargs,
+        )
+
+        hidden_states = outputs.last_hidden_state
+        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
+        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+        logits = self.lm_head(hidden_states[:, slice_indices, :])
+        if (final_logit_softcapping := self.config.get_text_config().final_logit_softcapping) is not None:
+            logits = logits / final_logit_softcapping
+            logits = torch.tanh(logits)
+            logits = logits * final_logit_softcapping
+
+        loss = None
+        if labels is not None:
+            # Upcast to float if we need to compute the loss to avoid potential precision issues
+            logits = logits.float()
+            shift_logits = logits[..., :-1, :]
+            shift_labels = labels[..., 1:]
+            if attention_mask is not None:
+                # we use the input attention mask to shift the logits and labels, because it is 2D.
+                # we also crop attn mask in case it is longer, which happens in PrefixTuning with peft
+                shift_attention_mask = attention_mask[:, -shift_logits.shape[1] :].to(logits.device)
+                shift_logits = shift_logits[shift_attention_mask.to(logits.device) != 0].contiguous()
+                shift_labels = shift_labels[shift_attention_mask.to(shift_labels.device) != 0].contiguous()
+            else:
+                shift_logits = shift_logits.contiguous()
+                shift_labels = shift_labels.contiguous()
+            # Flatten the tokens
+            loss_fct = nn.CrossEntropyLoss()
+
+            flat_logits = shift_logits.view(-1, self.config.text_config.vocab_size)
+            flat_labels = shift_labels.view(-1).to(shift_logits.device)
+            loss = loss_fct(flat_logits, flat_labels)
+
+        return Gemma3nCausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            image_hidden_states=outputs.image_hidden_states,
+            audio_hidden_states=outputs.audio_hidden_states,
+        )
+
+    def prepare_inputs_for_generation(
+        self,
+        input_ids,
+        past_key_values=None,
+        inputs_embeds=None,
+        cache_position=None,
+        position_ids=None,
+        pixel_values=None,
+        input_features=None,
+        attention_mask=None,
+        input_features_mask=None,
+        token_type_ids=None,
+        use_cache=True,
+        logits_to_keep=None,
+        labels=None,
+        **kwargs,
+    ):
+        # Overwritten -- custom `position_ids` and `pixel_values` handling
+        model_inputs = super().prepare_inputs_for_generation(
+            input_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            cache_position=cache_position,
+            use_cache=use_cache,
+            logits_to_keep=logits_to_keep,
+            token_type_ids=token_type_ids,
+            **kwargs,
+        )
+
+        # If we're in cached decoding stage, multimodal inputs should be None because input ids do not contain special
+        # tokens anymore. Otherwise multimodal inputs should be passed to model.
+        # NOTE: use_cache=False always needs pixel_values, input_features, and input_features_mask
+        if cache_position[0] == 0:
+            model_inputs["pixel_values"] = pixel_values
+            model_inputs["input_features"] = input_features
+            model_inputs["input_features_mask"] = input_features_mask
+
+        return model_inputs
+
+    @property
+    def audio_tower(self):
+        return self.model.audio_tower
+
+
+# Names exported by `from <this module> import *`.
+__all__ = [
+    "Gemma3nAudioEncoder",
+    "Gemma3nForCausalLM",
+    "Gemma3nForConditionalGeneration",
+    "Gemma3nModel",
+    "Gemma3nPreTrainedModel",
+    "Gemma3nTextModel",
+]
diff --git a/phivenv/Lib/site-packages/transformers/models/gemma3n/modular_gemma3n.py b/phivenv/Lib/site-packages/transformers/models/gemma3n/modular_gemma3n.py
new file mode 100644
index 0000000000000000000000000000000000000000..eec712fc1f56d36d37ab5d6c34e6db2ad70ba3c8
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/gemma3n/modular_gemma3n.py
@@ -0,0 +1,2686 @@
+# coding=utf-8
+# Copyright 2025 Google Inc. HuggingFace Inc. team. All rights reserved.
+#
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import copy
+import math
+from collections.abc import Callable, Sequence
+from typing import Any, Optional, Union
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from ...activations import ACT2FN
+from ...cache_utils import Cache, DynamicCache
+from ...configuration_utils import PretrainedConfig, layer_type_validation
+from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask
+from ...modeling_flash_attention_utils import FlashAttentionKwargs
+from ...modeling_outputs import BaseModelOutputWithPast
+from ...modeling_rope_utils import rope_config_validation
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from ...processing_utils import Unpack
+from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging
+from ...utils.deprecation import deprecate_kwarg
+from ..auto import AutoModel
+from ..gemma2.configuration_gemma2 import Gemma2Config
+from ..gemma2.modeling_gemma2 import (
+    Gemma2MLP,
+    Gemma2PreTrainedModel,
+    Gemma2RotaryEmbedding,
+    eager_attention_forward,
+    rotate_half,
+)
+from ..gemma3.modeling_gemma3 import (
+    Gemma3Attention,
+    Gemma3DecoderLayer,
+    Gemma3ForCausalLM,
+    Gemma3RMSNorm,
+    Gemma3TextModel,
+    Gemma3TextScaledWordEmbedding,
+)
+from ..paligemma.modeling_paligemma import (
+    PaliGemmaCausalLMOutputWithPast,
+    PaliGemmaForConditionalGeneration,
+    PaliGemmaModel,
+    PaligemmaModelOutputWithPast,
+)
+from ..timm_wrapper.configuration_timm_wrapper import TimmWrapperConfig
+
+
+# Module-level logger, keyed by this module's import name.
+logger = logging.get_logger(__name__)
+
+
+class Gemma3nTextConfig(Gemma2Config, PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Gemma3nTextModel`]. It is used to instantiate an
+    Gemma3nTextModel model according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of the Gemma 3n E4B, e.g.
+    [google/gemma-3n-E4B](https://huggingface.co/google/gemma-3n-E4B).
+
+    Configuration objects that inherit from [`Gemma3nTextConfig`] and can be used to control the model outputs. Read
+    the documentation from [`Gemma3nTextConfig`] for more information.
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 262400):
+            Vocabulary size of the Gemma3nText model. Defines the number of different tokens that can be represented by
+            the `inputs_ids` passed when calling [`Gemma3nTextModel`]
+        vocab_size_per_layer_input (`int`, *optional*, defaults to 262144):
+            Vocabulary size of the per-layer text embeddings that augment the standard embeddings.
+        hidden_size (`int`, *optional*, defaults to 2048):
+            Dimension of the hidden representations.
+        hidden_size_per_layer_input (`int`, *optional*, defaults to 256):
+            Dimension of the hidden representations for per-layer emebeddings.
+        intermediate_size (`int` or `Sequence[int]`, *optional*, defaults to 16384):
+            Dimension of the MLP representations. MatFormer configurations may wish to provide a sequence of integers
+            to account for vairable intermediate_size values across layers. In such cases,
+            `len(intermediate_size) == num_hidden_layers`.
+        num_hidden_layers (`int`, *optional*, defaults to 35):
+            Number of hidden layers in the Transformer decoder.
+        num_attention_heads (`int`, *optional*, defaults to 8):
+            Number of attention heads for each attention layer in the Transformer decoder.
+        num_key_value_heads (`int`, *optional*, defaults to 2):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details checkout this
+            [paper](https://huggingface.co/papers/2305.13245). If not specified, will default to `num_attention_heads`.
+        head_dim (`int`, *optional*, defaults to 256):
+            The attention head dimension.
+        hidden_activation (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
+            The non-linear activation function (function or string) in the decoder. Will default to
+            `"gelu_pytorch_tanh"` if not specified. `"gelu_pytorch_tanh"` uses an approximation of the `"gelu"`
+            activation function.
+        max_position_embeddings (`int`, *optional*, defaults to 32768):
+            The maximum sequence length that this model might ever be used with.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        pad_token_id (`int`, *optional*, defaults to 0):
+            Padding token id.
+        eos_token_id (`int`, *optional*, defaults to 1):
+            End of stream token id.
+        bos_token_id (`int`, *optional*, defaults to 2):
+            Beginning of stream token id.
+        rope_theta (`float`, *optional*, defaults to 1000000.0):
+            The base period of the RoPE embeddings.
+        rope_scaling (`Dict`, *optional*):
+            Dictionary containing the scaling configuration for the RoPE embeddings used in gloabl attention.
+            NOTE: if you apply new rope type and you expect the model to work on longer `max_position_embeddings`, we
+            recommend you to update this value accordingly.
+            Expected contents:
+                `rope_type` (`str`):
+                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
+                    'llama3'], with 'default' being the original RoPE implementation.
+                `factor` (`float`, *optional*):
+                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
+                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
+                    original maximum pre-trained length.
+                `original_max_position_embeddings` (`int`, *optional*):
+                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
+                    pretraining.
+                `attention_factor` (`float`, *optional*):
+                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
+                    computation. If unspecified, it defaults to value recommended by the implementation, using the
+                    `factor` field to infer the suggested value.
+                `beta_fast` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 32.
+                `beta_slow` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 1.
+                `short_factor` (`List[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `long_factor` (`List[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to long contexts (<
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `low_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
+                `high_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
+        rope_local_base_freq (float, *optional*, defaults to 10000.0):
+            The base period of the RoPE embeddings for local attention.
+        attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
+            Whether to use a bias in the query, key, value and output projection layers during self-attention.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        sliding_window (`int`, *optional*, defaults to 512):
+            This is the size of the sliding window used by local attention layers.
+        layer_types (`Optional`, *optional*):
+            A sequence of strings defining the attention type for that layer as either "sliding_attention" or
+            "full_attention". If not provided, `layer_types` will de inferred from `num_hidden_layers` using a pattern
+            of four "sliding_attention" layers followed one "full_attention". The last layer in the model should always
+            be a "full_attention" layer.
+        final_logit_softcapping (`float`, *optional*, defaults to 30.0):
+            Scaling factor when applying tanh softcapping on the logits.
+        altup_active_idx (`int`, *optional*, defaults to 0):
+            The index of the prediction from which AltUp will compute additional predictions or correct
+        altup_coef_clip (`float`, *optional*, defaults to 120.0):
+            The maximum amplitude of an AltUp prediction or correction coeficient weight.
+        altup_correct_scale (`bool`, *optional*, defaults to `True`):
+            If True, apply the `AltUp.correct_output_scale` to the corrected prediction at `altup_active_idx`.
+        altup_num_inputs (`int`, *optional*, defaults to 4):
+            The number of predictions that AltUp should be make given the input sequence.
+        num_kv_shared_layers (`int`, *optional*, defaults to 15):
+            The number of layer that share KV cache values. During the forward pass, the last `num_kv_shared_layers`
+            layers in the model "share" the KV values in that each local and global layer in this range uses the KV
+            cache values computed for the last local or global layer, respectively, before entering this range. The
+            value should be a multiple of the attention pattern size (see `layer_types` parameter).
+        laurel_rank (int, *optional*, defaults to 64):
+            The intermediate size for the linear projections in the Learned Augmented Residual Layer.
+        activation_sparsity_pattern (Sequence[float], *optional*):
+            The sparsity factor used to extract the top-k activations for a given layer. The provided Sequence must
+            explicitly provide a sparsity value for each layer in the model. By default, the first 10 layers are
+            sparse with a sparsity factor of 0.95 and the rest are dense.
+
+    ```python
+    >>> from transformers import Gemma3nTextModel, Gemma3nTextConfig
+
+    >>> # Initializing a Gemma3nText gemma3n_text-E4B style configuration
+    >>> configuration = Gemma3nTextConfig()
+
+    >>> # Initializing a model from the gemma3n_text-E4B style configuration
+    >>> model = Gemma3nTextModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```
+    """
+
+    model_type = "gemma3n_text"
+
+    def __init__(
+        self,
+        vocab_size: int = 262_400,
+        vocab_size_per_layer_input: int = 262_144,
+        hidden_size: int = 2048,
+        hidden_size_per_layer_input: int = 256,
+        intermediate_size: Union[int, Sequence[int]] = 16_384,
+        num_hidden_layers: int = 35,
+        num_attention_heads: int = 8,
+        num_key_value_heads: int = 2,
+        head_dim: int = 256,
+        hidden_activation: str = "gelu_pytorch_tanh",
+        max_position_embeddings: int = 32_768,
+        initializer_range: float = 0.02,
+        rms_norm_eps: float = 1e-6,
+        use_cache: bool = True,
+        pad_token_id: int = 0,
+        eos_token_id: int = 1,
+        bos_token_id: int = 2,
+        rope_theta: float = 1_000_000.0,
+        rope_scaling: Optional[dict[str, Any]] = None,
+        rope_local_base_freq: float = 10_000.0,
+        attention_bias: bool = False,
+        attention_dropout: float = 0.0,
+        sliding_window: int = 512,
+        layer_types: Optional[Sequence[str]] = None,
+        final_logit_softcapping: float = 30.0,
+        altup_active_idx: int = 0,
+        altup_coef_clip: float = 120.0,
+        altup_correct_scale: bool = True,
+        altup_num_inputs: int = 4,
+        num_kv_shared_layers: int = 15,
+        laurel_rank: int = 64,
+        activation_sparsity_pattern: Optional[Union[float, Sequence[float]]] = None,
+        **kwargs,
+    ):
+        PretrainedConfig.__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            **kwargs,
+        )
+
+        if isinstance(intermediate_size, Sequence) and (intsize_len := len(intermediate_size)) != num_hidden_layers:
+            raise ValueError(
+                "intermediate_size must have an explicit intermediate size for every layer or one for all layers. "
+                f"Expected {num_hidden_layers} values but got {intsize_len}."
+            )
+        elif not isinstance(intermediate_size, Sequence):
+            intermediate_size = [intermediate_size] * num_hidden_layers
+
+        self.vocab_size = vocab_size
+        self.vocab_size_per_layer_input = vocab_size_per_layer_input
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.head_dim = head_dim
+        self.num_key_value_heads = num_key_value_heads
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+        self.hidden_activation = hidden_activation
+        self.sliding_window = sliding_window
+        self.final_logit_softcapping = final_logit_softcapping
+        self.layer_types = layer_types
+
+        self.rope_local_base_freq = rope_local_base_freq
+        self.rope_scaling = rope_scaling
+        rope_config_validation(self)
+
+        if layer_types is None:
+            self.layer_types = [
+                "full_attention" if (i + 1) % 5 == 0 else "sliding_attention" for i in range(self.num_hidden_layers)
+            ]
+        else:
+            self.layer_types = layer_types
+
+        layer_type_validation(self.layer_types)
+
+        self.hidden_size_per_layer_input = hidden_size_per_layer_input
+        self.num_kv_shared_layers = num_kv_shared_layers
+
+        self.altup_active_idx = altup_active_idx
+        self.altup_coef_clip = altup_coef_clip
+        self.altup_correct_scale = altup_correct_scale
+        self.altup_num_inputs = altup_num_inputs
+
+        self.laurel_rank = laurel_rank
+
+        if activation_sparsity_pattern is None:
+            num_sparse_layers = 10 if num_hidden_layers > 10 else 0
+            activation_sparsity_pattern = (0.95,) * num_sparse_layers + (0.0,) * (
+                num_hidden_layers - num_sparse_layers
+            )
+
+        if (len_asp := len(activation_sparsity_pattern)) != num_hidden_layers:
+            raise ValueError(
+                "activation_sparsity_pattern must have an explicit activation sparsity value for every layer."
+                f"Expected {num_hidden_layers} values but got {len_asp}."
+            )
+        self.activation_sparsity_pattern = activation_sparsity_pattern
+
+
+class Gemma3nAudioConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Gemma3nAudioEncoder`]. It is used to instantiate
+    an `Gemma3nAudioEncoder` model according to the specified arguments, defining the model architecture. Instantiating
+    a configuration with the defaults will yield a similar configuration to that of the Gemma 3n E4B, e.g.,
+    [google/gemma-3n-E4B](https://huggingface.co/google/gemma-3n-E4B).
+
+    Configuration objects that inherit from [`Gemma3nAudioConfig`] and can be used to control the model outputs. Read
+    the documentation from [`Gemma3nAudioConfig`] for more information.
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 128):
+            Vocabulary size of the additional hard-token embeddings for audio model. These augment the embeddings
+            included in the `Gemma3nTextModel` to provide, e.g., the end of audio and audio soft token placeholder
+            tokens when converting `input_ids` to embeddings in the `Gemma3nForConditionalGeneration` model.
+        vocab_offset (`int`, *optional*, defaults to 262272):
+            Offset between the tokenizer vocab index for the token ids embedded by `Gemma3nMultimodalEmbedder` and the
+            0-indexed `Gemma3nMultimodalEmbedder.embedding` table.
+        input_feat_size (`int`, *optional*, defaults to 128):
+            The number of channels in each mel-spectrogram frame.
+        hidden_size (`int`, *optional*, defaults to 1536):
+            Dimension of the hidden representations.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
+            The epsilon used by the rms normalization layers.
+        gradient_clipping (`float`, *optional*, defaults to 10000000000.0):
+            Clipping value used to stablize extremely large gradient values.
+        conf_attention_chunk_size (`int`, *optional*, defaults to 12):
+            The sub-sequence size for local attention processing inside the Conformer ("conf") section of the
+            Universal Speech Model.
+        conf_attention_context_left (`int`, *optional*, defaults to 13):
+            The left context size of the local attention inside the Conformer ("conf") section of the
+            Universal Speech Model.
+        conf_attention_context_right (`int`, *optional*, defaults to 0):
+            The right context size of the local attention inside the Conformer ("conf") section of the
+            Universal Speech Model.
+        conf_attention_logit_cap (`float`, *optional*, defaults to 50.0):
+            Logit cap applied during local attention inside the Conformer ("conf") section of the
+            Universal Speech Model.
+        conf_num_attention_heads (`int`, *optional*, defaults to 8):
+            The number of attention heads in local attention inside the Conformer ("conf") section of the
+            Universal Speech Model.
+        conf_num_hidden_layers (`int`, *optional*, defaults to 12):
+            The number of layers that use local attention inside the Conformer ("conf") section of the
+            Universal Speech Model.
+        conf_conv_kernel_size (`int`, *optional*, defaults to 5):
+            Convolution kernel size for the conformer block inside the Conformer ("conf") section of the
+            Universal Speech Model.
+        conf_reduction_factor (`int`, *optional*, defaults to 4):
+            Reduction factor used in the conformer block inside the Conformer ("conf") section of the
+            Universal Speech Model.
+        conf_residual_weight (`float`, *optional*, defaults to 0.5):
+            Residual connection weight inside the Conformer ("conf") section of the
+            Universal Speech Model.
+        sscp_conv_channel_size (`tuple(int, int)`, *optional*, defaults to `(128, 32)`):
+            The channel sizes for the first and second convolutional layers in the Sub-sample Convolution Projection
+            ("sscp") section of the Universal Speech Model.
+        sscp_conv_group_norm_eps (`float`, *optional*, defaults to 0.001):
+            Epsilon used in group normalization in the subsample convolution projection in the Sub-sample Convolution
+            Projection ("sscp") section of the Universal Speech Model.
+        sscp_conv_kernel_size (`tuple(tuple(int, int), tuple(int, int))`, *optional*, defaults to `((3, 3), (3, 3))`):
+            Kernel sizes of the two convolutional layers in the subsample convolution projection  in the Sub-sample
+            Convolution Projection ("sscp") section of the Universal Speech Model. The kernel sizes are specified as a
+            tuple of height and width for each layer, where the height corresponds to the time dimension and the width
+            corresponds to the frequency dimension.
+        sscp_conv_stride_size (`tuple(tuple(int, int), tuple(int, int))`, *optional*, defaults to `((2, 2), (2, 2))`):
+            Stride sizes of the two convolutional layers in the subsample convolution projection in the Sub-sample
+            Convolution Projection ("sscp") section of the Universal Speech Model. The stride sizes are specified as a
+            tuple of height and width for each layer, where the height corresponds to the time dimension and the width
+            corresponds to the frequency dimension.
+
+    Example:
+
+    ```python
+    >>> from transformers import Gemma3nAudioConfig, Gemma3nAudioEncoder
+
+    >>> # Initializing a Gemma3nAudioEncoder gemma3n_audio-E4B-style configuration
+    >>> configuration = Gemma3nAudioConfig()
+
+    >>> # Initializing a model from the gemma3n_audio-E4B style configuration
+    >>> model = Gemma3nAudioEncoder(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```
+    """
+
+    model_type = "gemma3n_audio"
+
+    def __init__(
+        self,
+        vocab_size: int = 128,
+        vocab_offset: int = 262_144 + 128,  # text vocab size + vision vocab size
+        input_feat_size: int = 128,
+        hidden_size: int = 1536,
+        rms_norm_eps: float = 1e-6,
+        gradient_clipping: float = 10_000_000_000.0,
+        conf_attention_chunk_size: int = 12,
+        conf_attention_context_left: int = 13,
+        conf_attention_context_right: int = 0,
+        conf_attention_logit_cap: float = 50.0,
+        conf_num_attention_heads: int = 8,
+        conf_num_hidden_layers: int = 12,
+        conf_conv_kernel_size: int = 5,
+        conf_reduction_factor: int = 4,
+        conf_residual_weight: float = 0.5,
+        sscp_conv_channel_size: tuple[int, int] = (128, 32),
+        sscp_conv_group_norm_eps: float = 1e-3,
+        sscp_conv_kernel_size: tuple[tuple[int, int], tuple[int, int]] = (
+            (3, 3),
+            (3, 3),
+        ),
+        sscp_conv_stride_size: tuple[tuple[int, int], tuple[int, int]] = (
+            (2, 2),
+            (2, 2),
+        ),
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.input_feat_size = input_feat_size
+        self.hidden_size = hidden_size
+        self.rms_norm_eps = rms_norm_eps
+        self.vocab_size = vocab_size
+        self.vocab_offset = vocab_offset
+        self.gradient_clipping = gradient_clipping
+        self.conf_attention_chunk_size = conf_attention_chunk_size
+        self.conf_attention_context_left = conf_attention_context_left
+        self.conf_attention_context_right = conf_attention_context_right
+        self.conf_attention_logit_cap = conf_attention_logit_cap
+        self.conf_num_attention_heads = conf_num_attention_heads
+        self.conf_num_hidden_layers = conf_num_hidden_layers
+        self.conf_conv_kernel_size = conf_conv_kernel_size
+        self.conf_reduction_factor = conf_reduction_factor
+        self.conf_residual_weight = conf_residual_weight
+        self.sscp_conv_channel_size = sscp_conv_channel_size
+        self.sscp_conv_group_norm_eps = sscp_conv_group_norm_eps
+        self.sscp_conv_kernel_size = sscp_conv_kernel_size
+        self.sscp_conv_stride_size = sscp_conv_stride_size
+
+
+class Gemma3nVisionConfig(TimmWrapperConfig):
+    r"""
+    This is the configuration class to store the configuration for a timm backbone [`TimmWrapper`]. It is used to
+    instantiate a timm model according to the specified arguments, defining the model architecture.
+    Instantiating a configuration with the defaults will yield a similar configuration to that of the Gemma 3n E4B
+    vision tower, e.g. [google/gemma-3n-E4B](https://huggingface.co/google/gemma-3n-E4B).
+
+    Configuration objects inherit from [`TimmWrapperConfig`] and can be used to control the model outputs. Read the
+    documentation from [`TimmWrapperConfig`] for more information.
+
+    Config loads imagenet label descriptions and stores them in `id2label` attribute, `label2id` attribute for default
+    imagenet models is set to `None` due to occlusions in the label descriptions.
+
+    Args:
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        do_pooling (`bool`, *optional*, defaults to `False`):
+            Whether to do pooling for the last_hidden_state in `TimmWrapper` or not.
+        architecture (`str`, *optional*, defaults to `"mobilenetv5_300m_enc"`):
+            Determines vision architecture for TimmWrapper.
+        hidden_size (`int`, *optional*, defaults to 2048):
+            Dimension of the hidden representations.
+        vocab_size (`int`, *optional*, defaults to 128):
+            Vocabulary size of the additional hard-token embeddings for vision model.
+        vocab_offset (`int`, *optional*, defaults to 262144):
+            Offset between the tokenizer vocab index for the token ids embedded by `Gemma3nMultimodalEmbedder` and the
+            0-indexed `Gemma3nMultimodalEmbedder.embedding` table.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
+            The epsilon used by the rms normalization layers.
+
+    Example:
+    ```python
+    >>> from transformers import Gemma3nVisionConfig, TimmWrapper
+
+    >>> # Initializing a TimmWrapper gemma3n_vision-E4B-style configuration
+    >>> configuration = Gemma3nVisionConfig()
+
+    >>> # Initializing a gemma3n_vision-E4B-style TimmWrapper from the configuration
+    >>> model = TimmWrapper(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```
+    """
+
+    model_type = "gemma3n_vision"
+
+    def __init__(
+        self,
+        initializer_range: float = 0.02,
+        do_pooling: bool = False,
+        architecture: str = "mobilenetv5_300m_enc",
+        hidden_size: int = 2048,
+        vocab_size: int = 128,
+        vocab_offset: int = 262_144,
+        rms_norm_eps: float = 1e-06,
+        model_args: Optional[dict] = None,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.architecture = architecture
+        self.initializer_range = initializer_range
+        self.do_pooling = do_pooling
+        self.hidden_size = hidden_size
+        self.vocab_size = vocab_size
+        self.vocab_offset = vocab_offset
+        self.rms_norm_eps = rms_norm_eps
+
+
+class Gemma3nConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Gemma3nForConditionalGeneration`]. It is used to
+    instantiate a Gemma3nForConditionalGeneration according to the specified arguments, defining the model
+    architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
+    Gemma3n-E4B.
+
+    e.g. [google/gemma-3n-E4B](https://huggingface.co/google/gemma-3n-E4B)
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        text_config (`Union[Gemma3nTextConfig, dict]`, *optional*):
+            The config object of the text backbone.
+        vision_config (`Union[AutoConfig, dict]`,  *optional*):
+            Custom vision config or dict.
+        audio_config (`Union[AutoConfig, dict]`,  *optional*):
+            Custom audio config or dict.
+        audio_soft_tokens_per_image (`int`, *optional*, defaults to 188):
+            The number of soft tokens per audio clip.
+        vision_soft_tokens_per_image (`int`, *optional*, defaults to 256):
+            The number of soft tokens per image.
+        boi_token_id (`int`, *optional*, defaults to 255999):
+            The begin-of-image token index to wrap the image prompt.
+        eoi_token_id (`int`, *optional*, defaults to 262144):
+            The end-of-image token index to wrap the image prompt.
+        image_token_id (`int`, *optional*, defaults to 262145):
+            The image token index to encode the image prompt.
+        boa_token_id (`int`, *optional*, defaults to 256000):
+            The begin-of-audio token index to wrap the audio prompt.
+        eoa_token_id (`int`, *optional*, defaults to 262272):
+            The end-of-audio token index to wrap the audio prompt.
+        audio_token_id (`int`, *optional*, defaults to 262273):
+            The audio token index to encode the audio prompt.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+
+
+    Example:
+
+    ```python
+    >>> from transformers import Gemma3nForConditionalGeneration, Gemma3nConfig, Gemma3nTextConfig, Gemma3nVisionConfig, Gemma3nAudioConfig
+
+    >>> # Initializing a MobileNet vision config, which is loaded from TIMM
+    >>> vision_config = Gemma3nVisionConfig()
+
+    >>> # Initializing a Gemma3n Audio config
+    >>> audio_config = Gemma3nAudioConfig()
+
+    >>> # Initializing a Gemma3n Text config
+    >>> text_config = Gemma3nTextConfig()
+
+    >>> # Initializing a Gemma3n-E4B style configuration
+    >>> configuration = Gemma3nConfig(text_config, vision_config, audio_config)
+
+    >>> # Initializing a model from the Gemma3n-E4B style configuration
+    >>> model = Gemma3nForConditionalGeneration(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "gemma3n"
+    sub_configs = {
+        "text_config": Gemma3nTextConfig,
+        "vision_config": Gemma3nVisionConfig,
+        "audio_config": Gemma3nAudioConfig,
+    }
+
+    def __init__(
+        self,
+        text_config: Optional[Union[Gemma3nTextConfig, dict[str, Any]]] = None,
+        vision_config: Optional[Union[Gemma3nVisionConfig, dict[str, Any]]] = None,
+        audio_config: Optional[Union[Gemma3nAudioConfig, dict[str, Any]]] = None,
+        audio_soft_tokens_per_image: int = 188,
+        vision_soft_tokens_per_image: int = 256,
+        boi_token_id: int = 255_999,
+        eoi_token_id: int = 262_144,
+        image_token_id: int = 262_145,
+        boa_token_id: int = 256_000,
+        eoa_token_id: int = 262_272,
+        audio_token_id: int = 262_273,
+        initializer_range: float = 0.02,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        if isinstance(text_config, dict):
+            text_config = Gemma3nTextConfig(**text_config)
+        elif text_config is None:
+            text_config = Gemma3nTextConfig()
+            logger.info("text_config is None. Using default Gemma3nTextConfig.")
+
+        if isinstance(vision_config, dict):
+            vision_config = Gemma3nVisionConfig(**vision_config)
+        elif vision_config is None:
+            vision_config = Gemma3nVisionConfig()
+            logger.info("vision_config is None. Using default Gemma3nVisionConfig.")
+
+        if isinstance(audio_config, dict):
+            audio_config = Gemma3nAudioConfig(**audio_config)
+        elif audio_config is None:
+            audio_config = Gemma3nAudioConfig()
+            logger.info("audio_config is None. Using default Gemma3nAudioConfig.")
+
+        self.text_config = text_config
+        self.vision_config = vision_config
+        self.audio_config = audio_config
+
+        self.audio_soft_tokens_per_image = audio_soft_tokens_per_image
+        self.vision_soft_tokens_per_image = vision_soft_tokens_per_image
+        self.boi_token_id = boi_token_id
+        self.eoi_token_id = eoi_token_id
+        self.image_token_id = image_token_id
+        self.boa_token_id = boa_token_id
+        self.eoa_token_id = eoa_token_id
+        self.audio_token_id = audio_token_id
+        self.initializer_range = initializer_range
+
+
+class Gemma3nModelOutputWithPast(PaligemmaModelOutputWithPast):
+    r"""
+    past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
+        `(batch_size, num_heads, sequence_length, embed_size_per_head)`.
+
+        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
+        `past_key_values` input) to speed up sequential decoding.
+    image_hidden_states (`torch.FloatTensor`, *optional*):
+        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
+        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
+    audio_hidden_states (`torch.FloatTensor`, *optional*):
+        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
+        audio_hidden_states of the model produced by the audio encoder and after projecting the last hidden state.
+    """
+
+    audio_hidden_states: Optional[torch.FloatTensor] = None
+
+
+class Gemma3nCausalLMOutputWithPast(PaliGemmaCausalLMOutputWithPast):
+    r"""
+    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+        Language modeling loss (for next-token prediction).
+    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.text_config.vocab_size)`):
+        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+    past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
+        `(batch_size, num_heads, sequence_length, embed_size_per_head)`.
+
+        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
+        `past_key_values` input) to speed up sequential decoding.
+    image_hidden_states (`torch.FloatTensor`, *optional*):
+        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
+        image_hidden_states of the model produced by the vision encoder after projecting last hidden state.
+    audio_hidden_states (`torch.FloatTensor`, *optional*):
+        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
+        audio_hidden_states of the model produced by the audio encoder and after projecting the last hidden state.
+    """
+
+    audio_hidden_states: Optional[torch.FloatTensor] = None
+
+
+class Gemma3nRMSNorm(Gemma3RMSNorm):
+    def __init__(self, dim: int, eps: float = 1e-6, with_scale: bool = True):
+        super().__init__(dim, eps=eps)
+        del self.weight
+        self.with_scale = with_scale
+
+        if self.with_scale:
+            self.weight = nn.Parameter(torch.ones(dim))
+        else:
+            self.register_buffer("weight", torch.tensor(1.0), persistent=False)
+
+    def _norm(self, x):
+        return x / torch.sqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        # Llama does x.to(float16) * w whilst Gemma2 is (x * w).to(float16)
+        # See https://github.com/huggingface/transformers/pull/29402
+        output = self._norm(x.float()) * self.weight.float()
+        return output.type_as(x)
+
+
+# ==== Audio Encoder ====
+
+
+class Gemma3nAudioRelativePositionEmbedding(nn.Module):
+    def __init__(self, config: Gemma3nAudioConfig):
+        super().__init__()
+        self.config = config
+
+        self.num_heads = self.config.conf_num_attention_heads
+        self.channels = self.config.hidden_size
+        self.head_dim = self.channels // self.num_heads
+        self.max_backward = max(0, self.config.conf_attention_context_left - 1)
+        self.max_forward = self.config.conf_attention_context_right
+
+        self.pos_proj = nn.Linear(self.channels, self.num_heads * self.head_dim, bias=False)
+
+        min_timescale = 1.0
+        max_timescale = 1.0e4
+        num_timescales = self.channels // 2
+        log_timescale_increment = math.log(float(max_timescale) / float(min_timescale)) / max(num_timescales - 1, 1)
+        inv_timescales = min_timescale * torch.exp(torch.arange(num_timescales) * -log_timescale_increment)
+        self.register_buffer(
+            "inv_timescales",
+            inv_timescales.float().unsqueeze(0).unsqueeze(0),
+            persistent=False,
+        )
+
+    def _get_timing_signal_1d_pos(self, position: torch.Tensor, dtype: torch.dtype) -> torch.Tensor:
+        position = position.float().unsqueeze(-1)
+        scaled_time = position * self.inv_timescales.to(device=position.device, dtype=torch.float32)
+        timing_signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=-1)
+        return timing_signal.type(dtype)
+
+    def _relative_shift(
+        self,
+        term_bd_before_shift: torch.Tensor,
+        batch_size: int,
+        num_heads: int,
+        num_query_blocks: int,
+        query_block_size: int,
+        key_context_size: int,
+        max_span_plus_1: int,
+    ) -> torch.Tensor:
+        """Performs the relative shift.
+
+        Args:
+          term_bd_before_shift: Tensor of shape [B, N, U, W, F_span]. batch_size
+            (B), num_heads (N), num_query_blocks (U), query_block_size (W),
+            key_context_size (C = W+L+R), max_span_plus_1 (F_span = L+R+1).
+
+        Returns:
+          Tensor of shape [B, N, U, W, C].
+        """
+        # term_bd_before_shift shape: [B, N, U, W, F_span]
+        # Target shape after shift:  [B, N, U, W, C]
+
+        # Padding amount for the last dimension (F_span) to become (C + 1)
+        # C = key_context_size
+        # F_span = max_span_plus_1
+        pad_amount_last_dim = (key_context_size + 1) - max_span_plus_1
+
+        # PyTorch F.pad expects (pad_left, pad_right, pad_top, pad_bottom ...)
+        # We only pad the last dimension on the right.
+        padding_tuple = (0, pad_amount_last_dim)
+
+        term_bd_padded = nn.functional.pad(term_bd_before_shift, padding_tuple)
+        # Shape after pad: [B, N, U, W, C+1]
+
+        # Reshape for slicing (emulating JAX's behavior)
+        # [B, N, U, W * (C+1)]
+        term_bd_reshaped = term_bd_padded.reshape(
+            (
+                batch_size,
+                num_heads,
+                num_query_blocks,
+                query_block_size * (key_context_size + 1),
+            )
+        )
+
+        # Slice to effective [B, N, U, W * C]
+        term_bd_sliced = term_bd_reshaped[:, :, :, : query_block_size * key_context_size]
+
+        # Reshape back to [B, N, U, W, C]
+        term_bd_shifted = term_bd_sliced.reshape(
+            (
+                batch_size,
+                num_heads,
+                num_query_blocks,
+                query_block_size,
+                key_context_size,
+            )
+        )
+        return term_bd_shifted
+
+    def forward(self, queries: torch.Tensor, keys: torch.Tensor) -> torch.Tensor:
+        # queries: [B, U, W, N, H] (batch, num_query_blocks, query_block_size, num_heads, head_dim)
+        # keys:    [B, U, C, N, H] (batch, num_query_blocks, key_context_size, num_heads, head_dim)
+        # C = W + L + R (key_context_size)
+        # F_span = L + R + 1 (max_span + 1)
+
+        batch_size, num_query_blocks, query_block_size, num_heads, head_dim = queries.shape
+        _, _, key_context_size, _, _ = keys.shape
+
+        # Relative positions for sinusoidal embeddings: [L, L-1, ..., -R]
+        # Length is L+R+1, with L = self.max_backward and R = self.max_forward
+        pos_indices = torch.arange(self.max_backward, -self.max_forward - 1, -1, device=queries.device).unsqueeze(
+            0
+        )  # Shape [1, F_span]
+
+        max_span_plus_1 = pos_indices.shape[1]  # F_span
+
+        sin_emb_timing_signal = self._get_timing_signal_1d_pos(
+            pos_indices, dtype=queries.dtype
+        )  # Shape [1, F_span, self.channels]
+
+        # Project sinusoidal embeddings: [1, F_span, self.channels] -> [1, F_span, N*H]
+        projected_sin_emb = self.pos_proj(sin_emb_timing_signal)
+        # Reshape to [1, F_span, N, H] then squeeze to [F_span, N, H]
+        sin_emb = projected_sin_emb.reshape(1, max_span_plus_1, self.num_heads, self.head_dim).squeeze(
+            0
+        )  # Shape [F, N, H]
+
+        # term_ac: Query-Key content interaction
+        # queries: [B, U, W, N, H] -> permute to [B, N, U, W, H] for matmul
+        # keys:    [B, U, C, N, H] -> permute to [B, N, U, H, C] for matmul
+        queries_p = queries.permute(0, 3, 1, 2, 4)  # [B, N, U, W, H]
+        keys_p_t = keys.permute(0, 3, 1, 4, 2)  # [B, N, U, H, C]
+        term_ac = torch.matmul(queries_p, keys_p_t)  # [B, N, U, W, C]
+
+        # term_bd: Query-Position interaction
+        # Original einsum: term_bd_unshifed = torch.einsum('buwnh,fnh->bnuwf', queries, sin_emb)
+        # queries shape: [B, U, W, N, H]
+        # sin_emb shape: [F, N, H]
+        # Target output shape: [B, N, U, W, F]
+
+        # Permute queries to [B, N, U, W, H] for easier broadcasting with sin_emb
+        q_permuted = queries.permute(0, 3, 1, 2, 4)
+
+        # Permute sin_emb to [N, H, F] to prepare for matmul
+        # sin_emb original is [F, N, H]
+        s_permuted = sin_emb.permute(1, 2, 0)  # Shape: [N, H, F]
+
+        # Reshape queries for matmul: [B, N, U*W, H]
+        q_reshaped = q_permuted.reshape(batch_size, num_heads, num_query_blocks * query_block_size, head_dim)
+
+        # Perform matmul: [B, N, U*W, H] @ [N, H, F]
+        # s_permuted ([N, H, F]) will be broadcast to [B, N, H, F]
+        # Result: [B, N, U*W, F]
+        term_bd_unshifed_matmul = torch.matmul(q_reshaped, s_permuted)
+
+        # Reshape to target [B, N, U, W, F]
+        term_bd_unshifed = term_bd_unshifed_matmul.reshape(
+            batch_size,
+            num_heads,
+            num_query_blocks,
+            query_block_size,
+            max_span_plus_1,
+        )
+
+        # Apply relative shift to term_bd_unshifed
+        term_bd_shifted = self._relative_shift(
+            term_bd_unshifed,
+            batch_size,
+            num_heads,
+            num_query_blocks,
+            query_block_size,
+            key_context_size,
+            max_span_plus_1,
+        )  # Shape [B, N, U, W, C]
+
+        return term_ac + term_bd_shifted
+
+
+class Gemma3nAudioAttention(nn.Module):
+    def __init__(self, config: Gemma3nAudioConfig):
+        super().__init__()
+        self.config = config
+
+        self.num_heads = self.config.conf_num_attention_heads
+        self.hidden_size = self.config.hidden_size
+        self.head_dim = self.hidden_size // self.num_heads
+
+        self.chunk_size = self.config.conf_attention_chunk_size
+        self.max_future_horizon = self.config.conf_attention_context_right
+        self.max_past_horizon = max(0, self.config.conf_attention_context_left - 1)
+        self.attention_logits_soft_cap = self.config.conf_attention_logit_cap
+        self.context_size = self.chunk_size + self.max_past_horizon + self.max_future_horizon
+
+        self.relative_position_embedding = Gemma3nAudioRelativePositionEmbedding(config)
+        self.per_dim_scale = nn.Parameter(torch.zeros((self.head_dim,)))
+
+        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
+        self.k_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
+        self.v_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
+
+        q_scale = self.head_dim**-0.5
+        r_softplus_0 = 1.0 / torch.nn.functional.softplus(torch.tensor(0.0))
+        self.register_buffer("q_scale", (q_scale * r_softplus_0).clone().detach(), persistent=False)
+
+        lower_causal_mask = torch.tril(
+            torch.ones((self.context_size, self.chunk_size), dtype=torch.bool),
+            diagonal=0,
+        ).T
+        upper_causal_mask = torch.tril(
+            torch.ones((self.chunk_size, self.context_size), dtype=torch.bool),
+            diagonal=self.max_past_horizon + self.max_future_horizon,
+        )
+        local_causal_valid_mask = torch.ones((self.chunk_size, self.context_size), dtype=torch.bool)
+        local_causal_valid_mask = local_causal_valid_mask * lower_causal_mask * upper_causal_mask
+        self.register_buffer("local_causal_valid_mask", local_causal_valid_mask, persistent=False)
+
+        self.register_buffer(
+            "softcap",
+            torch.tensor(self.attention_logits_soft_cap).float(),
+            persistent=False,
+        )
+
+    def _pad_dim1(self, x: torch.Tensor, pad_left: int, pad_right: int) -> torch.Tensor:
+        batch, _, *tail_shape = x.shape
+        left = x.new_zeros((batch, pad_left, *tail_shape))
+        right = x.new_zeros((batch, pad_right, *tail_shape))
+        x = torch.cat([left, x, right], dim=1)
+        return x
+
+    def _convert_to_block(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        """Turns a sequence to non overlapping blocks.
+
+        Args:
+            hidden_states: a tensor of [batch, time, ...].
+
+        Returns:
+            A tensor of [batch, num_blocks, block_size, ...], with necessary
+            paddings,
+            where output[:, i, ...] are x[:, i*block_size:(i+1)*block_size, ...].
+        """
+        shape = hidden_states.shape
+        b, t = shape[:2]
+        num_blocks = (t + self.chunk_size - 1) // self.chunk_size
+
+        if (padding_len := num_blocks * self.chunk_size - t) > 0:
+            hidden_states = self._pad_dim1(hidden_states, 0, padding_len)
+
+        permute_dims = (b, num_blocks, self.chunk_size) + shape[2:]
+        hidden_states = hidden_states.reshape(permute_dims).contiguous()
+        return hidden_states
+
+    def _extract_block_context(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        """Extracts temporal context for every block.
+
+        Args:
+            hidden_states: a tensor of [batch, time, ...].
+
+        Returns:
+            A tensor of [batch, num_blocks, context_size, ...], with necessary
+            paddings,
+            where context_size = block_size + left_context + right_context,
+            and output[:, i, ...] are x[:, start-left_context:end+right_context,
+            ...],
+            start = i * block_size, end = (i + 1) * block_size.
+        """
+        pad_left = self.max_past_horizon
+        # The JAX equivalent padding for signal.frame with pad_mode='valid' is
+        # (left_context, right_context + block_size - 1) on the time dimension.
+        # _pad_dim1(x, pad_left, pad_right) always takes both pad amounts explicitly:
+        # it concatenates zero-filled blocks of those lengths before and after x
+        # along dim 1, which is the time dimension both for [B, T] inputs
+        # and for [B, T, N, H] inputs.
+        # The current pad_right calculation matches the JAX effective padding.
+        pad_right = self.max_future_horizon + self.chunk_size - 1
+        hidden_states = self._pad_dim1(hidden_states, pad_left, pad_right)
+
+        frame_len = self.context_size
+        frame_step = self.chunk_size
+
+        # Directly use unfold without the subframe_factor logic
+        # x.unfold(dimension, size, step)
+        # dimension=1 (time dimension, assuming x is [B, T_padded, ...])
+        # size=frame_len (context_size)
+        # step=frame_step (chunk_size)
+        x_unfolded = hidden_states.unfold(dimension=1, size=frame_len, step=frame_step)
+
+        # If x was [B, T_padded], x_unfolded is [B, num_blocks, frame_len]
+        # If x was [B, T_padded, N, H], x_unfolded is [B, num_blocks, N, H, frame_len]
+        # We want to match JAX's typical output for such operations which might be
+        # [B, num_blocks, frame_len, N, H] if N, H are present.
+        # The relative_position_embedding expects keys as [B, U, C, N, H].
+        # If x_unfolded is [B, U, N, H, C(frame_len)], we need to move C.
+        if hidden_states.ndim > 2 and x_unfolded.ndim > 3:  # Check if inner dimensions (like N, H) exist
+            # Current shape after unfold for [B, T_pad, N, H] is [B, U, N, H, C]
+            # Target shape for keys in RPE: [B, U, C, N, H]
+            x_unfolded = torch.movedim(x_unfolded, source=-1, destination=2)
+
+        return x_unfolded.contiguous()
+
+    def forward(self, hidden_states: torch.Tensor, mask: torch.BoolTensor) -> torch.Tensor:
+        # sl.Dense uses jax.numpy.einsum("...a,abcd->...bcd") and jax.numpy.select()
+        qkv_shape = (*hidden_states.shape[:-1], self.num_heads, self.head_dim)
+        query_states = self.q_proj(hidden_states).reshape(qkv_shape).contiguous()
+        key_states = self.k_proj(hidden_states).reshape(qkv_shape).contiguous()
+        value_states = self.v_proj(hidden_states).reshape(qkv_shape).contiguous()
+
+        per_dim_scale_sp = torch.nn.functional.softplus(self.per_dim_scale)
+
+        broadcast_shape = (1, 1, 1, self.head_dim)
+        per_dim_scale_sp_broadcast = per_dim_scale_sp.view(broadcast_shape)
+        query_states = query_states * self.q_scale * per_dim_scale_sp_broadcast
+
+        batch_size, q_time = query_states.shape[:2]
+
+        query_blocks = self._convert_to_block(query_states)
+        key_blocks = self._extract_block_context(key_states)
+        value_blocks = self._extract_block_context(value_states)
+        num_query_blocks = query_blocks.shape[1]
+
+        # 1. Create a mask indicating originally valid positions.
+        original_valid_mask = ~mask  # True for valid, False for padded
+
+        # 2. Extract blocks from this validity mask.
+        extracted_valid_mask_blocks = self._extract_block_context(original_valid_mask)
+
+        # _extract_block_context does not apply any subframe_factor, so a [B, T] input mask
+        # already comes back as [B, U, C]. This branch only handles a 4-D result whose last
+        # two dims multiply to C (a [B, U, C/SF, SF] layout) by flattening it to [B, U, C].
+        # batch_size and num_query_blocks come from query_blocks; self.context_size is C.
+        if (
+            extracted_valid_mask_blocks.ndim == 4
+            and extracted_valid_mask_blocks.shape[2] * extracted_valid_mask_blocks.shape[3] == self.context_size
+        ):
+            extracted_valid_mask_blocks = extracted_valid_mask_blocks.reshape(
+                batch_size, num_query_blocks, self.context_size
+            )
+        # After potential reshape, ensure it's [B, U, C] if it was from a [B,T] mask.
+        # This assertion might be too strict if _extract_block_context handles higher-rank inputs differently,
+        # but for the mask case, this should hold.
+        if extracted_valid_mask_blocks.shape != (
+            batch_size,
+            num_query_blocks,
+            self.context_size,
+        ):
+            raise ValueError(
+                "Shape of extracted_valid_mask_blocks"
+                f" {extracted_valid_mask_blocks.shape} is not ({batch_size},"
+                f" {num_query_blocks}, {self.context_size}) after potential reshape."
+            )
+
+        # 3. Expand dimensions for broadcasting with logits and causal mask.
+        # Target shape for broadcasting with logits [B,N,U,W,C]
+        # extracted_valid_mask_blocks to [B, 1, U, 1, C]
+        condition_from_input_validity = extracted_valid_mask_blocks.unsqueeze(1).unsqueeze(-2)
+
+        # self.local_causal_valid_mask is [W, C], True where allowed by local window.
+        # Expand to [1, 1, 1, W, C]
+        condition_from_causality = self.local_causal_valid_mask.unsqueeze(0).unsqueeze(0).unsqueeze(0)
+
+        # 4. Combine the two conditions.
+        # final_condition will be True where a key is *both* originally valid *and* causally accessible.
+        # Broadcasts to [B, 1, U, W, C]
+        final_condition_for_where = torch.logical_and(
+            condition_from_input_validity,
+            condition_from_causality.to(condition_from_input_validity.device),  # Ensure same device
+        )
+
+        # Embed queries and keys
+        logits = self.relative_position_embedding(query_blocks, key_blocks)
+
+        # Apply attention logit softcap
+        # Ensure softcap is on the same device as logits
+        softcap_val = self.softcap.to(logits.device)
+        logits = logits / softcap_val
+        logits = torch.tanh(logits)
+        logits = logits * softcap_val
+
+        # Apply the combined mask.
+        # final_condition_for_where will broadcast with logits [B,N,U,W,C]
+        logits = torch.where(final_condition_for_where, logits, torch.finfo(logits.dtype).min)
+        probabilities = torch.nn.functional.softmax(logits, dim=-1, dtype=torch.float32).to(dtype=value_blocks.dtype)
+
+        # context_vectors is adapted from jax.numpy.einsum("BNuwc,BucNH->BuwNH", ...)
+        b_dim, n_dim, u_dim, w_dim, c_dim = probabilities.shape
+        h_dim = value_blocks.shape[-1]
+        prob_bun = probabilities.permute(0, 2, 1, 3, 4).reshape(-1, w_dim, c_dim)
+        v_bun = value_blocks.permute(0, 1, 3, 2, 4).reshape(-1, c_dim, h_dim)
+        result_bmm = torch.bmm(prob_bun, v_bun)
+        context_vectors = result_bmm.reshape(b_dim, u_dim, n_dim, w_dim, h_dim).permute(0, 1, 3, 2, 4)
+        context_vectors = context_vectors.reshape(
+            (
+                batch_size,
+                num_query_blocks * self.chunk_size,
+                self.num_heads,
+                self.head_dim,
+            )
+        )
+        context_vectors = context_vectors[:, :q_time]
+
+        return context_vectors
+
+
+class Gemma3nAudioCumulativeGroupNorm(nn.Module):
+    """Applies Group Normalization cumulatively over the time dimension.
+
+    This layer normalizes the input by calculating the mean and variance
+    cumulatively over the time dimension (dim 1). The statistics are computed
+    over all feature dimensions (specified by `feature_dims` and `num_channels`).
+
+    `forward` takes no mask argument: it builds an internal all-ones
+    `mask_calc`, so every time step contributes to the statistics and no
+    output value is zeroed out.
+
+    A learned scale (`weight`) is applied per-channel (last dimension); this
+    layer defines no bias. This behavior is similar to JAX's
+    `GroupNormalization` with `num_groups=1` and `cumulative=True`.
+    """
+
+    def __init__(
+        self,
+        num_channels: int,  # Number of channels (size of the last dimension)
+        feature_dims: Sequence[int],  # Sizes of non-channel feature dimensions, e.g., (H, W) for input [B,T,H,W,C]
+        eps: float = 1e-3,
+    ):
+        super().__init__()
+        self.num_channels = num_channels
+        self.feature_dims = tuple(feature_dims)
+        self.eps = eps
+
+        # Scale parameter depends only on the channel dimension
+        self.weight = nn.Parameter(torch.ones(num_channels))
+
+        # Axes for normalization: all dimensions except Batch (0) and Time (1).
+        # For input [B, T, *feature_dims, C], these are dims from 2 onwards.
+        self.reduction_axes = tuple(range(2, 2 + len(self.feature_dims) + 1))
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        """Applies cumulative group norm over the time dimension.
+
+        Args:
+          hidden_states: Input tensor, shape [B, T, *feature_dims, C].
+
+        Returns:
+          Normalized tensor with the same shape and dtype as `hidden_states`.
+
+        Raises:
+          ValueError: If `hidden_states.shape[2:]` differs from
+            `feature_dims + (num_channels,)`.
+        """
+        expected_input_suffix = self.feature_dims + (self.num_channels,)
+        if hidden_states.shape[2:] != expected_input_suffix:
+            raise ValueError(
+                f"Input tensor shape suffix {hidden_states.shape[2:]} does not match expected"
+                f" suffix (feature_dims + num_channels) {expected_input_suffix}"
+            )
+
+        input_dtype = hidden_states.dtype
+        # Calculations are performed in float32 for numerical stability.
+        calc_dtype = torch.float32
+        x_calc = hidden_states.to(calc_dtype)
+
+        # Prepare the mask (`mask_calc`).
+        # This forward takes no mask argument, so all elements are treated as
+        # valid: mask_calc is all ones with the same shape as x_calc. It feeds
+        # the per-step element counts below and the final multiply.
+        mask_calc = torch.ones_like(x_calc, dtype=calc_dtype)
+
+        # Cumulative Statistics Calculation
+        # 1. Sum of values over reduction axes at each time step.
+        sum_values_at_t = torch.sum(x_calc, dim=self.reduction_axes, keepdim=True)
+        # 2. Cumulative sum of values over time.
+        cum_sum_values = torch.cumsum(sum_values_at_t, dim=1)
+
+        # 3. Count of elements in the normalization group at each time step.
+        #    (A "group" here consists of all features at a given Batch, Time).
+        elements_in_group_at_t = torch.sum(mask_calc, dim=self.reduction_axes, keepdim=True)
+        # 4. Cumulative count of elements over time.
+        cum_count_elements = torch.cumsum(elements_in_group_at_t, dim=1)
+        # Avoid division by zero should the cumulative count ever be below 1.
+        safe_cum_count_elements = torch.clamp(cum_count_elements, min=1.0)
+
+        # 5. Cumulative mean.
+        cum_mean = cum_sum_values / safe_cum_count_elements
+
+        # 6. Sum of squared differences from the cumulative mean.
+        #    The difference at step t is taken against the cumulative mean up to t.
+        #    No mask factor is applied here (mask_calc is all ones anyway).
+        squared_diff_from_mean = (x_calc - cum_mean).pow(2)
+        sum_sq_diff_at_t = torch.sum(squared_diff_from_mean, dim=self.reduction_axes, keepdim=True)
+
+        # 7. Cumulative sum of squared differences over time.
+        cum_sum_sq_diff = torch.cumsum(sum_sq_diff_at_t, dim=1)
+
+        # 8. Cumulative variance.
+        cum_variance = cum_sum_sq_diff / safe_cum_count_elements
+
+        # Normalize the input using the calculated cumulative statistics:
+        # (x - E[x]) / sqrt(Var[x] + eps)
+        normalized_x = (x_calc - cum_mean) * torch.rsqrt(cum_variance + self.eps)
+
+        # Apply the learned scale; this layer defines no bias term.
+        # The scale is applied per-channel (last dimension).
+        scale = self.weight.to(calc_dtype)
+        # Reshape for broadcasting: [C] -> [1, ..., 1, C]
+        scale_view_shape = [1] * (hidden_states.dim() - 1) + [self.num_channels]
+        normalized_x = normalized_x * scale.view(scale_view_shape)
+
+        # Multiply by mask_calc, which would zero masked time steps if it held zeros.
+        # With the all-ones mask built above this leaves normalized_x unchanged.
+        final_output = normalized_x * mask_calc
+
+        return final_output.to(input_dtype)
+
+
+class Gemma3nAudioSSCPConvBlock(nn.Module):
+    """A single convolution block for the SubSampleConvProjection.
+
+    This block consists of a 2D convolution, followed by CumulativeGroupNorm,
+    and a ReLU activation. It handles manual padding for the convolution.
+    """
+
+    def __init__(
+        self,
+        config: Gemma3nAudioConfig,
+        idx: int,
+        input_freq_dim: int,  # Changed from input_spatial_dim
+        manual_padding: tuple[int, int, int, int] = (0, 0, 0, 0),
+    ):
+        super().__init__()
+        self.config = config
+        self.manual_padding = manual_padding
+
+        # in_channels is 1 for the first block, or C_out from previous block's conv
+        in_channels = 1 if idx == 0 else self.config.sscp_conv_channel_size[idx - 1]
+        out_channels = self.config.sscp_conv_channel_size[idx]
+        kernel_h, kernel_w = self.config.sscp_conv_kernel_size[idx]
+        stride_h, stride_w = self.config.sscp_conv_stride_size[idx]
+
+        self.conv = nn.Conv2d(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=(
+                kernel_h,
+                kernel_w,
+            ),  # Kernel (kH, kW) operates on (Time, Freq_dim)
+            stride=(stride_h, stride_w),
+            padding=(0, 0),  # Manual padding is used
+            bias=False,
+        )
+
+        # Calculate output frequency dimension (f_out_conv) after this convolution.
+        # input_freq_dim is the unpadded width (feature dimension).
+        # self.manual_padding is (pad_F_left, pad_F_right, pad_T_top, pad_T_bottom)
+        f_in_padded = input_freq_dim + self.manual_padding[0] + self.manual_padding[1]
+        f_out_conv = (f_in_padded - kernel_w) // stride_w + 1
+
+        self.norm = Gemma3nAudioCumulativeGroupNorm(
+            num_channels=out_channels,  # Channels of the conv output
+            feature_dims=(f_out_conv,),  # The frequency dimension size after conv
+            eps=self.config.sscp_conv_group_norm_eps,
+        )
+
+        self.activation = nn.ReLU()
+
+    def forward(self, audio_encodings: torch.Tensor) -> torch.Tensor:
+        # Input audio_encodings is [B, C_in, T_in, F_in] (e.g., C_in=1)
+        # manual_padding is (pad_F_left, pad_F_right, pad_T_top, pad_T_bottom)
+        # F.pad applies to last two dims: F_in then T_in
+        audio_encodings_padded = F.pad(audio_encodings, self.manual_padding, mode="constant", value=0.0).to(
+            self.conv.weight.dtype
+        )
+        # Expected padded shape for F_in, k_w=3, pad_F=(1,1) -> F_padded = F_in+2
+        # Expected padded shape for T_in, k_h=3, pad_T=(0,2) -> T_padded = T_in+2
+        audio_encodings_conv = self.conv(audio_encodings_padded)
+        # Expected conv output shape: [B, C_out, T_out, F_out]
+        # Input to norm is [B, T_out, F_out, C_out]
+        x_for_norm = audio_encodings_conv.permute(0, 2, 3, 1).contiguous()
+        x_normed = self.norm(x_for_norm)
+        # Output of norm is [B, T_out, F_out, C_out], permute back to [B, C_out, T_out, F_out]
+        audio_encodings_normed = x_normed.permute(0, 3, 1, 2).contiguous()
+        return self.activation(audio_encodings_normed)
+
+
+class Gemma3nAudioSubSampleConvProjection(nn.Module):
+    def __init__(self, config: Gemma3nAudioConfig):
+        super().__init__()
+        self.config = config
+
+        current_f_for_block_input = config.input_feat_size  # Start with original feature dim
+        calculated_block_padding = []
+        calculated_f_out_dims = []  # Tracking frequency dimension output sizes
+
+        for i in range(2):  # Assuming 2 conv layers as per sscp_conv_... arrays
+            kernel_h, kernel_w = config.sscp_conv_kernel_size[i]
+            stride_h, stride_w = config.sscp_conv_stride_size[i]
+
+            # Padding for Time (Height for Conv2d) - REVERSE_CAUSAL like
+            # JAX 'reverse_causal' padding is (0, kernel_size - 1)
+            pad_t_top = 0
+            pad_t_bottom = kernel_h - 1
+
+            # Frequency Padding (Width for Conv2d)
+            # Based on JAX effective padding (1,1) for F_in=10, K_w=3, S_w=2
+            # and the successful test configuration.
+            # If kernel/stride/input_freq for frequency changes, this might need re-evaluation
+            # to match generic JAX 'SAME' behavior if it differs.
+            pad_f_left = 1
+            pad_f_right = 1
+
+            manual_padding_tuple = (
+                pad_f_left,
+                pad_f_right,
+                pad_t_top,
+                pad_t_bottom,
+            )
+            calculated_block_padding.append(manual_padding_tuple)
+
+            # Calculate output frequency dimension after this convolution
+            # This uses the actual padding applied and kernel/stride.
+            f_in_padded = current_f_for_block_input + pad_f_left + pad_f_right
+            f_out_after_conv = (f_in_padded - kernel_w) // stride_w + 1  # Assuming dilation_w = 1
+            calculated_f_out_dims.append(f_out_after_conv)
+            current_f_for_block_input = f_out_after_conv
+
+        self.conv_0 = Gemma3nAudioSSCPConvBlock(
+            idx=0,
+            input_freq_dim=config.input_feat_size,  # Pass original feature dim
+            config=config,
+            manual_padding=calculated_block_padding[0],
+        )
+        self.conv_1 = Gemma3nAudioSSCPConvBlock(
+            idx=1,
+            input_freq_dim=calculated_f_out_dims[0],  # Output freq dim from conv_0
+            config=config,
+            manual_padding=calculated_block_padding[1],
+        )
+        final_c_out = config.sscp_conv_channel_size[-1]
+        final_f_out = calculated_f_out_dims[-1]  # Final frequency dimension
+        self.input_proj_in_features = final_c_out * final_f_out
+        self.input_proj_linear = nn.Linear(self.input_proj_in_features, self.config.hidden_size, bias=False)
+
+    def forward(self, audio_encodings: torch.Tensor) -> torch.Tensor:
+        # audio_encodings is [B, T, F_in]
+        # Reshape to [B, 1, T, F_in] (Batch, Channels=1, Height=Time, Width=F_in)
+        audio_encodings_reshaped = audio_encodings.unsqueeze(1)
+        x = self.conv_0(audio_encodings_reshaped)
+        x = self.conv_1(x)
+        # x from conv_1 is [B, C_out_1, T_out_1, F_out_1]
+        b, c_out, t_out, f_out = x.shape
+        # Permute to [B, T_out_1, F_out_1, C_out_1] then flatten F_out_1 and C_out_1
+        x_permuted = x.permute(0, 2, 3, 1).contiguous()
+        output_flattened = x_permuted.view(b, t_out, f_out * c_out)
+        output = self.input_proj_linear(output_flattened)
+        return output
+
+
+class Gemma3nAudioConformerAttention(nn.Module):
+    def __init__(self, config: Gemma3nAudioConfig):
+        super().__init__()
+        self.config = config
+        self.post_in_features = self.config.hidden_size
+        self.register_buffer("gradient_clipping", torch.tensor(self.config.gradient_clipping), persistent=False)
+        self.pre_attn_norm = Gemma3nRMSNorm(self.config.hidden_size)
+        self.attn = Gemma3nAudioAttention(config)
+        self.post = nn.Linear(self.post_in_features, self.config.hidden_size, bias=False)
+        self.post_norm = Gemma3nRMSNorm(self.config.hidden_size)
+
+    def forward(self, audio_encodings: torch.Tensor, audio_mel_mask: torch.BoolTensor) -> torch.Tensor:
+        audio_encodings_input_to_attn = audio_encodings
+        audio_encodings = torch.clamp(audio_encodings, -self.gradient_clipping, self.gradient_clipping)
+        audio_encodings_norm = self.pre_attn_norm(audio_encodings)
+        # Output of self.attn is [B, T, NumHeads, HeadDim]
+        audio_encodings_attn_out = self.attn(audio_encodings_norm, audio_mel_mask)
+
+        # Reshape from [B, T, NumHeads, HeadDim] to [B, T, NumHeads * HeadDim]
+        # NumHeads * HeadDim = hidden_size
+        b, t, num_heads, head_dim = audio_encodings_attn_out.shape
+        audio_encodings_reshaped = audio_encodings_attn_out.reshape(b, t, num_heads * head_dim)
+
+        audio_encodings = self.post(audio_encodings_reshaped)
+        audio_encodings = torch.clamp(audio_encodings, -self.gradient_clipping, self.gradient_clipping)
+        return audio_encodings_input_to_attn + self.post_norm(audio_encodings)
+
+
+class Gemma3nAudioConformerFeedForward(nn.Module):
+    def __init__(self, config: Gemma3nAudioConfig):
+        super().__init__()
+        self.config = config
+
+        self.register_buffer("gradient_clipping", torch.tensor(self.config.gradient_clipping), persistent=False)
+
+        self.pre_layer_norm = Gemma3nRMSNorm(self.config.hidden_size)
+        self.ffw_layer_1 = nn.Linear(self.config.hidden_size, self.config.hidden_size * 4, bias=False)
+        self.ffw_layer_2 = nn.Linear(self.config.hidden_size * 4, self.config.hidden_size, bias=False)
+        self.post_layer_norm = Gemma3nRMSNorm(self.config.hidden_size)
+        self.post_layer_scale = torch.tensor(self.config.conf_residual_weight)
+
+    def forward(self, audio_encodings: torch.Tensor) -> torch.Tensor:
+        residual = audio_encodings
+        audio_encodings = torch.clamp(audio_encodings, -self.gradient_clipping, self.gradient_clipping)
+        audio_encodings = self.pre_layer_norm(audio_encodings)
+        audio_encodings: torch.Tensor = self.ffw_layer_1(audio_encodings)
+        audio_encodings = nn.functional.silu(audio_encodings)
+        audio_encodings: torch.Tensor = self.ffw_layer_2(audio_encodings)
+        audio_encodings = torch.clamp(audio_encodings, -self.gradient_clipping, self.gradient_clipping)
+        audio_encodings = self.post_layer_norm(audio_encodings)
+        return residual + (audio_encodings * self.post_layer_scale)
+
+
+class Gemma3nAudioConformerLightConv1d(nn.Module):
+    def __init__(self, config: Gemma3nAudioConfig):
+        super().__init__()
+        self.config = config
+
+        self.pre_layer_norm = Gemma3nRMSNorm(self.config.hidden_size, eps=self.config.rms_norm_eps)
+        self.linear_start = nn.Linear(self.config.hidden_size, self.config.hidden_size * 2, bias=False)
+        self.depthwise_conv1d = nn.Conv1d(
+            in_channels=self.config.hidden_size,
+            out_channels=self.config.hidden_size,
+            kernel_size=self.config.conf_conv_kernel_size,
+            stride=1,
+            padding=0,  # Manual causal padding
+            groups=self.config.hidden_size,  # Depthwise
+            bias=False,
+        )
+        self.register_buffer("gradient_clipping", torch.tensor(self.config.gradient_clipping), persistent=False)
+        self.conv_norm = Gemma3nRMSNorm(self.config.hidden_size, eps=self.config.rms_norm_eps)
+        self.linear_end = nn.Linear(self.config.hidden_size, self.config.hidden_size, bias=False)
+
+        self.causal_padding = self.config.conf_conv_kernel_size - 1
+
+    def forward(self, audio_encodings: torch.Tensor) -> torch.Tensor:
+        audio_encodings_residual = audio_encodings  # Save for residual connection
+
+        audio_encodings = self.pre_layer_norm(audio_encodings)
+        audio_encodings = self.linear_start(audio_encodings)
+        audio_encodings = torch.nn.functional.glu(audio_encodings, dim=-1)
+        # Permute for Conv1d: [B, T, D] -> [B, D, T]
+        audio_encodings_permuted = audio_encodings.permute(0, 2, 1)
+        # Apply manual causal padding
+        audio_encodings_permuted_padded = F.pad(audio_encodings_permuted, (self.causal_padding, 0))
+        audio_encodings = self.depthwise_conv1d(audio_encodings_permuted_padded)
+        # Permute back: [B, D, T_out] -> [B, T_out, D]
+        audio_encodings = audio_encodings.permute(0, 2, 1)
+        audio_encodings = torch.clamp(audio_encodings, -self.gradient_clipping, self.gradient_clipping)
+        audio_encodings = self.conv_norm(audio_encodings)
+        audio_encodings = nn.functional.silu(audio_encodings)
+        audio_encodings = self.linear_end(audio_encodings)
+        output = audio_encodings + audio_encodings_residual
+        return output
+
+
+class Gemma3nAudioConformerBlock(nn.Module):
+    def __init__(self, config: Gemma3nAudioConfig):
+        super().__init__()
+        self.config = config
+
+        self.ffw_layer_start = Gemma3nAudioConformerFeedForward(self.config)
+        self.attention = Gemma3nAudioConformerAttention(self.config)
+        self.lconv1d = Gemma3nAudioConformerLightConv1d(self.config)
+        self.ffw_layer_end = Gemma3nAudioConformerFeedForward(self.config)
+        self.register_buffer("gradient_clipping", torch.tensor(self.config.gradient_clipping), persistent=False)
+        self.norm = Gemma3nRMSNorm(self.config.hidden_size)
+
+    def forward(self, audio_encodings: torch.Tensor, audio_mel_mask: torch.BoolTensor) -> torch.Tensor:
+        audio_encodings = self.ffw_layer_start(audio_encodings)
+        audio_encodings = self.attention(audio_encodings, audio_mel_mask)
+        validity_mask_for_lconv = ~audio_mel_mask  # True for valid
+        audio_encodings_for_lconv_input = audio_encodings * validity_mask_for_lconv.unsqueeze(-1).to(
+            audio_encodings.dtype
+        )
+        audio_encodings = self.lconv1d(audio_encodings_for_lconv_input)
+
+        audio_encodings = self.ffw_layer_end(audio_encodings)
+        audio_encodings = torch.clamp(audio_encodings, -self.gradient_clipping, self.gradient_clipping)
+        output = self.norm(audio_encodings)
+        return output
+
+
+class Gemma3nAudioEncoder(PreTrainedModel):
+    """
+    An audio encoder based on the [Universal Speech Model](https://huggingface.co/papers/2303.01037) architecture.
+    """
+
+    config: Gemma3nAudioConfig
+
+    main_input_name = "audio_mel"
+
+    def __init__(self, config: Gemma3nAudioConfig):
+        super().__init__(config)
+        self.config = config
+
+        self.subsample_conv_projection = Gemma3nAudioSubSampleConvProjection(config)
+        self.conformer = nn.ModuleList(
+            [Gemma3nAudioConformerBlock(config) for _ in range(config.conf_num_hidden_layers)]
+        )
+
+    def forward(
+        self, audio_mel: torch.Tensor, audio_mel_mask: torch.BoolTensor
+    ) -> tuple[torch.Tensor, torch.BoolTensor]:
+        """Encodes a batch of MELs.
+
+        Args:
+            audio_mel: a torch.Tensor of shape [batch, num_frames, num_channels,
+              mel_bins].
+
+        Returns:
+            audio_encodings: a torch.Tensor of shape
+                `[batch_size, self.config.audio_soft_tokens_per_image,
+                self.config.audio_config.hidden_size]`
+            audio_mel_mask: a torch.BoolTensor of shape [batch, num_frames].
+        """
+        audio_encodings = self.subsample_conv_projection(audio_mel)  # audio_encodings: [B, T_sub, D]
+
+        # Subsample the input audio_mel_mask to match the time dimension of audio_encodings (T_sub)
+        t_sub = audio_encodings.shape[1]
+
+        time_stride_product = 1
+        for stride_pair_idx in range(len(self.config.sscp_conv_stride_size)):
+            time_stride_product *= self.config.sscp_conv_stride_size[stride_pair_idx][0]
+
+        # Create indices for gathering from the original mask.
+        # These indices map to original time steps corresponding to the start of each
+        # receptive field in the subsampled output.
+        indices = torch.arange(t_sub, device=audio_mel_mask.device) * time_stride_product
+        indices = torch.clamp(indices, max=audio_mel_mask.shape[1] - 1)  # Ensure indices are valid
+
+        # Expand indices for batch compatibility if B > 1 and indices is 1D.
+        if audio_mel_mask.ndim > 1 and indices.ndim == 1:
+            indices = indices.unsqueeze(0).expand(audio_mel_mask.shape[0], -1)  # [B, T_sub]
+        elif (
+            audio_mel_mask.ndim == indices.ndim
+            and audio_mel_mask.shape[0] == 1
+            and indices.shape[0] != 1
+            and t_sub == indices.shape[0]
+        ):
+            # Handle case where B=1 but indices became [T_sub] instead of [1, T_sub]
+            indices = indices.unsqueeze(0)
+
+        current_mask = torch.gather(audio_mel_mask, 1, indices)  # [B, T_sub]
+
+        for block in self.conformer:
+            audio_encodings = block(audio_encodings, current_mask)  # Pass the processed mask
+
+        if self.config.conf_reduction_factor > 1:
+            audio_encodings = audio_encodings[:, :: self.config.conf_reduction_factor]
+            # Reduce the mask as well
+            current_mask = current_mask[:, :: self.config.conf_reduction_factor]
+
+        audio_encodings = audio_encodings.masked_fill(current_mask.unsqueeze(-1), 0.0)
+        return audio_encodings, current_mask
+
+
+# ==== Language Model ====
+
+
+class Gemma3nTextScaledWordEmbedding(Gemma3TextScaledWordEmbedding):
+    # Gemma3n-named subclass of Gemma3TextScaledWordEmbedding; it adds no behaviour of its own.
+    pass
+
+
+class Gemma3nTextLaurelBlock(nn.Module):
+    """Learned Augmented Residual Layer"""
+
+    def __init__(self, config: Gemma3nTextConfig):
+        super().__init__()
+        self.config = config
+
+        self.linear_left = nn.Linear(self.config.hidden_size, self.config.laurel_rank, bias=False)
+        self.linear_right = nn.Linear(self.config.laurel_rank, self.config.hidden_size, bias=False)
+        self.post_laurel_norm = Gemma3nRMSNorm(self.config.hidden_size, eps=self.config.rms_norm_eps)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        laurel_hidden_states: torch.Tensor = self.linear_left(hidden_states)
+        laurel_hidden_states: torch.Tensor = self.linear_right(laurel_hidden_states)
+        normed_laurel_hidden_states = self.post_laurel_norm(laurel_hidden_states)
+        return hidden_states + normed_laurel_hidden_states
+
+
+class Gemma3nTextMLP(Gemma2MLP):
+    def __init__(self, config: Gemma3nTextConfig, layer_idx: int = 0):
+        super().__init__(config)
+        self.intermediate_size = config.intermediate_size[layer_idx]
+        self.activation_sparsity = config.activation_sparsity_pattern[layer_idx]
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        gate_proj = self.gate_proj(hidden_states)
+        if self.activation_sparsity > 0.0:
+            gate_proj = self._gaussian_topk(gate_proj)
+        activations = self.act_fn(gate_proj)
+        up_proj = self.up_proj(hidden_states)
+        down_proj = self.down_proj(activations * up_proj)
+        return down_proj
+
+    def _gaussian_topk(self, inputs: torch.Tensor) -> torch.Tensor:
+        target_sparsity_tensor = torch.tensor(self.activation_sparsity, dtype=torch.float32, device=inputs.device)
+        # normal_dist and std_multiplier are adapted from jax.scipy.stats.norm.ppf().
+        #
+        # References:
+        #   *   https://docs.jax.dev/en/latest/_autosummary/jax.scipy.stats.norm.ppf.html
+        #   *   https://pytorch.org/docs/stable/distributions.html#torch.distributions.normal.Normal
+        #   *   https://pytorch.org/docs/stable/distributions.html#torch.distributions.transformed_distribution.TransformedDistribution.icdf
+        normal_dist = torch.distributions.normal.Normal(0, 1)
+        std_multiplier: torch.Tensor = normal_dist.icdf(target_sparsity_tensor)
+        std_multiplier = std_multiplier.type(inputs.dtype)
+        inputs_mean = torch.mean(inputs, dim=-1, keepdim=True)
+        inputs_std = torch.std(inputs, dim=-1, keepdim=True, unbiased=False)
+        cutoff_x = inputs_mean + inputs_std * std_multiplier
+        return nn.functional.relu(inputs - cutoff_x)
+
+
+class Gemma3nTextAltUp(nn.Module):
+    """Alternating Updates (AltUp)
+
+    The AltUp module wraps transformer layers. The `predict` step modifies the
+    input to the transformer layer, and the `correct` step propagates the output
+    of the transformer layer to the sparsely updated dimensions.
+
+    See more in the research paper:
+
+    https://proceedings.neurips.cc/paper_files/paper/2023/file/f2059277ac6ce66e7e5543001afa8bb5-Paper-Conference.pdf
+    """
+
+    def __init__(self, config: Gemma3nTextConfig):
+        super().__init__()
+        self.config = config
+        self.correct_output_scale = nn.Parameter(torch.zeros(self.config.hidden_size))
+        self.correction_coefs = nn.Linear(self.config.altup_num_inputs, self.config.altup_num_inputs, bias=False)
+        self.prediction_coefs = nn.Linear(self.config.altup_num_inputs, self.config.altup_num_inputs**2, bias=False)
+        self.modality_router = nn.Linear(self.config.hidden_size, self.config.altup_num_inputs, bias=False)
+        self.router_norm = Gemma3nRMSNorm(self.config.hidden_size, eps=self.config.rms_norm_eps)
+        # Constant 1 / hidden_size, applied to the normed router input.
+        self.register_buffer("router_input_scale", torch.tensor(self.config.hidden_size**-1.0), persistent=False)
+
+    def compute_router_modalities(self, x: torch.Tensor) -> torch.Tensor:
+        """Maps `x` to `altup_num_inputs` router values in (-1, 1) via norm, scale, linear and tanh."""
+        router_inputs = self.router_norm(x) * self.router_input_scale
+        routed = self.modality_router(router_inputs)
+        # tanh is evaluated in float32, then cast back to the dtype of `x`.
+        return torch.tanh(routed.float()).type_as(x)
+
+    def predict(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        """Predicts the output of a layer using a trainable map.
+
+        Args:
+            hidden_states: A 4D tensor of shape `[num_altup_inputs, batch_size, num_tokens, hidden_size]` derived by
+                stacking the input embeddings and preprocessing the last `num_altup_inputs - 1` matrices.
+
+        Returns:
+            A 4D tensor of shape `[num_altup_inputs, batch_size, num_tokens, hidden_size]` containing the predictions.
+        """
+        modalities = self.compute_router_modalities(hidden_states[self.config.altup_active_idx])
+
+        # In-place clamp of the weights; here it only runs in training mode.
+        if self.training and self.config.altup_coef_clip is not None:
+            self.prediction_coefs.weight.data.clamp_(-self.config.altup_coef_clip, self.config.altup_coef_clip)
+
+        # Project and then transpose all 2D matrices contained so that mulmat gives the correct result
+        all_coefs: torch.Tensor = (
+            self.prediction_coefs(modalities)
+            .reshape(*modalities.shape[:-1], self.config.altup_num_inputs, self.config.altup_num_inputs)
+            .permute(0, 1, 3, 2)
+        )
+
+        # permute hidden_states to [batch_size, num_tokens, hidden_size, altup_num_inputs]
+        predictions = torch.matmul(hidden_states.permute(1, 2, 3, 0), all_coefs)
+        predictions = predictions.permute(3, 0, 1, 2)  # undo the permute
+        predictions += hidden_states  # add the original input
+        return predictions.contiguous().type_as(hidden_states)
+
+    def correct(self, predictions: torch.Tensor, activated: torch.Tensor) -> torch.Tensor:
+        """Corrects the predictions relative to the activated inputs.
+
+        Args:
+            predictions: A 4D tensor of shape `[num_altup_inputs, batch_size, num_tokens, hidden_size]` derived by
+                stacking the input embeddings and preprocessing the last `num_altup_inputs - 1` matrices.
+            activated: A 3D tensor of shape `[batch_size, num_tokens, hidden_size]` containing the activated inputs.
+
+        Returns:
+            A 4D tensor of shape `[num_altup_inputs, batch_size, num_tokens, hidden_size]` correcting the original
+                predictions relative to the activated input embeddings.
+        """
+        modalities = self.compute_router_modalities(activated)
+        innovation = activated - predictions[self.config.altup_active_idx]  # (batch, num_tokens, hidden_size)
+        innovation = innovation.repeat(self.config.altup_num_inputs, 1, 1, 1)  # Repeat on dim0 to match predictions
+
+        # NOTE(review): unlike `predict`, this in-place clamp is not gated on `self.training`,
+        # so it also runs at inference time. Confirm this is intended.
+        if self.config.altup_coef_clip is not None:
+            self.correction_coefs.weight.data.clamp_(-self.config.altup_coef_clip, self.config.altup_coef_clip)
+
+        # all_coefs adapted from jax.numpy.einsum("...p,pi->...i", ...)
+        # Permute to (altup_num_inputs, batch_size, num_tokens) as the last dim is a scalar applied to each altup input
+        # and append a trailing singleton dim so it broadcasts over hidden_size
+        all_coefs: torch.Tensor = self.correction_coefs(modalities) + 1.0
+        all_coefs = all_coefs.permute(2, 0, 1).unsqueeze(-1)
+
+        corrected = torch.mul(innovation, all_coefs)
+        corrected += predictions  # add the original input
+        return corrected.contiguous().type_as(activated)
+
+    def forward(self, corrected: torch.Tensor) -> torch.Tensor:
+        """
+        This is only defined as the `forward` so that accelerate hooks can move correctly `correct_output_scale`
+        (which is a nn.Parameter, not a Module) between devices when offloading. It is otherwise only used in
+        `scale_corrected_output`
+        """
+        return (corrected.type_as(self.correct_output_scale) * self.correct_output_scale).type_as(corrected)
+
+    def scale_corrected_output(self, corrected: torch.Tensor) -> torch.Tensor:
+        """Scales the provided 3D tensor of shape [batch_size, num_tokens, hidden_size]."""
+        return self.forward(corrected)
+
+
+class Gemma3nTextRotaryEmbedding(Gemma2RotaryEmbedding):
+    # Gemma3n-named subclass of Gemma2RotaryEmbedding; it adds no behaviour of its own.
+    pass
+
+
+def apply_rotary_pos_emb(
+    x: torch.Tensor,
+    cos: torch.Tensor,
+    sin: torch.Tensor,
+    position_ids: Optional[torch.Tensor] = None,
+    unsqueeze_dim: int = 1,
+):
+    """Applies Rotary Position Embedding to a single tensor.
+
+    Unlike variants that take a query and a key together, this function rotates one tensor per call.
+
+    Args:
+        x (`torch.Tensor`): The tensor to embed.
+        cos (`torch.Tensor`): The cosine part of the rotary embedding.
+        sin (`torch.Tensor`): The sine part of the rotary embedding.
+        position_ids (`torch.Tensor`, *optional*):
+            Deprecated and unused.
+        unsqueeze_dim (`int`, *optional*, defaults to 1):
+            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze `cos` and `sin` so that
+            they can be properly broadcasted to the dimensions of `x`. For example, if `cos` and `sin` have the
+            shape [batch_size, seq_len, head_dim] and `x` has the shape [batch_size, heads, seq_len, head_dim],
+            then setting unsqueeze_dim=1 makes `cos` and `sin` broadcastable to the shape of `x`. Similarly, if `x`
+            has the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
+    Returns:
+        `torch.Tensor`: `x` rotated using the Rotary Position Embedding, i.e. `x * cos + rotate_half(x) * sin`.
+    """
+    cos = cos.unsqueeze(unsqueeze_dim)
+    sin = sin.unsqueeze(unsqueeze_dim)
+    return (x * cos) + (rotate_half(x) * sin)
+
+
+class Gemma3nTextAttention(Gemma3Attention):  # parent attention plus a value norm and cross-layer KV sharing
+    def __init__(self, config: Gemma3nTextConfig, layer_idx: int):
+        super().__init__(config, layer_idx)
+        del self.attn_logit_softcapping  # not referenced anywhere in this class
+        del self.scaling  # forward() passes scaling=1.0 explicitly instead
+        self.v_norm = Gemma3nRMSNorm(dim=config.head_dim, eps=config.rms_norm_eps, with_scale=False)
+
+        first_kv_shared_layer_idx = self.config.num_hidden_layers - self.config.num_kv_shared_layers
+        self.is_kv_shared_layer = layer_idx >= first_kv_shared_layer_idx > 0  # chained: the index must also be > 0
+        prev_layers = config.layer_types[:first_kv_shared_layer_idx]
+        if self.is_kv_shared_layer:
+            # For shared layers, find the last non-shared layer of the same type before sharing starts
+            self.kv_shared_layer_index = len(prev_layers) - 1 - prev_layers[::-1].index(config.layer_types[layer_idx])
+            self.store_full_length_kv = False
+        else:
+            self.kv_shared_layer_index = None
+            # For non-shared layers, store full-length kv if this is the last non-shared layer of its type
+            self.store_full_length_kv = layer_idx == len(prev_layers) - 1 - prev_layers[::-1].index(
+                config.layer_types[layer_idx]
+            )
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
+        attention_mask: Optional[torch.Tensor],
+        past_key_values: Optional[Cache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        input_shape = hidden_states.shape[:-1]
+        hidden_shape = (*input_shape, -1, self.config.head_dim)
+
+        cos, sin = position_embeddings
+
+        query_states = self.q_proj(hidden_states).view(hidden_shape)
+        query_states = self.q_norm(query_states)
+        query_states = apply_rotary_pos_emb(query_states, cos, sin, unsqueeze_dim=2)
+        query_states = query_states.transpose(1, 2)
+
+        # Layers from the KV-sharing point onwards reuse the key/value states stored by the last non-sharing layer
+        if self.is_kv_shared_layer and past_key_values is not None:
+            key_states, value_states = past_key_values.shared_layers[self.kv_shared_layer_index]
+            # Device of past layer may be different from current one
+            key_states = key_states.to(query_states.device)
+            value_states = value_states.to(query_states.device)
+        else:  # non-shared layer, or no cache available: project K/V from this layer's own hidden states
+            key_states = self.k_proj(hidden_states).view(hidden_shape)
+            key_states = self.k_norm(key_states)
+            key_states = apply_rotary_pos_emb(key_states, cos, sin, unsqueeze_dim=2)
+            key_states = key_states.transpose(1, 2)
+
+            value_states = self.v_proj(hidden_states).view(hidden_shape)
+            value_states = self.v_norm(value_states)
+            value_states = value_states.transpose(1, 2)
+
+        if past_key_values is not None:
+            # sin and cos are specific to RoPE models; cache_position needed for the static cache
+            cache_kwargs = {
+                "sin": sin,
+                "cos": cos,
+                "cache_position": cache_position,
+                "sliding_window": self.sliding_window,
+            }
+            if not self.is_kv_shared_layer:  # KV-shared layers never write a cache entry of their own
+                key_states, value_states = past_key_values.update(
+                    key_states, value_states, self.layer_idx, cache_kwargs
+                )
+            if self.store_full_length_kv:  # publish this layer's K/V for the later KV-shared layers to read
+                if not hasattr(past_key_values, "shared_layers"):
+                    past_key_values.shared_layers = {}  # dict attached lazily to the cache object
+                past_key_values.shared_layers[self.layer_idx] = key_states, value_states
+
+        attention_interface: Callable = eager_attention_forward
+        if self.config._attn_implementation != "eager":
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            dropout=self.attention_dropout if self.training else 0.0,
+            scaling=1.0,
+            sliding_window=self.sliding_window,
+            **kwargs,
+        )
+
+        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
+        attn_output = self.o_proj(attn_output)
+        return attn_output, attn_weights
+
+
+class Gemma3nTextDecoderLayer(Gemma3DecoderLayer):  # wraps attention + laurel + MLP in an AltUp predict/correct step
+    def __init__(self, config: Gemma3nTextConfig, layer_idx: int):
+        super().__init__(config, layer_idx)
+        self.mlp = Gemma3nTextMLP(config, layer_idx=layer_idx)  # re-assigned after the parent constructor ran
+
+        self.hidden_size_per_layer_input = config.hidden_size_per_layer_input
+        self.act_fn = ACT2FN[config.hidden_activation]
+
+        self.altup = Gemma3nTextAltUp(config)
+        self.laurel = Gemma3nTextLaurelBlock(config)
+        self.self_attn = Gemma3nTextAttention(config, layer_idx)  # re-assigned after the parent constructor ran
+        self.per_layer_input_gate = nn.Linear(self.hidden_size, self.hidden_size_per_layer_input, bias=False)
+        self.per_layer_projection = nn.Linear(self.hidden_size_per_layer_input, self.hidden_size, bias=False)
+        self.post_per_layer_input_norm = Gemma3nRMSNorm(self.hidden_size, eps=config.rms_norm_eps)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings_global: tuple[torch.Tensor, torch.Tensor],
+        position_embeddings_local: tuple[torch.Tensor, torch.Tensor],
+        per_layer_input: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        output_attentions: Optional[bool] = False,
+        use_cache: Optional[bool] = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
+    ) -> tuple[torch.Tensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
+        predictions = self.altup.predict(hidden_states)
+        active_prediction = predictions[self.config.altup_active_idx]  # only this entry goes through attention/MLP
+
+        active_prediction_normed = self.input_layernorm(active_prediction)
+        laurel_output = self.laurel(active_prediction_normed)
+
+        # sliding-window layers use the local RoPE embeddings, all other layers the global ones
+        if self.self_attn.is_sliding:
+            position_embeddings = position_embeddings_local
+        else:
+            position_embeddings = position_embeddings_global
+
+        attn, self_attn_weights = self.self_attn(
+            hidden_states=active_prediction_normed,
+            position_embeddings=position_embeddings,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            output_attentions=output_attentions,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            **kwargs,
+        )
+        attn = self.post_attention_layernorm(attn)
+
+        attn_gated = active_prediction + attn  # residual connection around attention
+        attn_laurel = (attn_gated + laurel_output) / math.sqrt(2)
+
+        attn_norm = self.pre_feedforward_layernorm(attn_laurel)
+        attn_ffw = self.mlp(attn_norm)
+        attn_ffw_norm = self.post_feedforward_layernorm(attn_ffw)
+        attn_ffw_laurel_gated = attn_laurel + attn_ffw_norm  # residual connection around the MLP
+        corrected_predictions = self.altup.correct(predictions, attn_ffw_laurel_gated)
+
+        first_prediction = corrected_predictions[self.config.altup_active_idx].clone()
+        if self.config.altup_correct_scale:
+            first_prediction = self.altup.scale_corrected_output(first_prediction)
+
+        # per_layer_input_gate adapted from jax.numpy.einsum("btd,dp->btp", ...)
+        first_prediction = self.per_layer_input_gate(first_prediction)
+        first_prediction = self.act_fn(first_prediction)
+        first_prediction = torch.multiply(first_prediction, per_layer_input)
+
+        # per_layer_projection adapted from jax.numpy.einsum("btp,pd->btd", ...)
+        first_prediction = self.per_layer_projection(first_prediction)
+        first_prediction = self.post_per_layer_input_norm(first_prediction)
+        corrected_predictions[1:] += first_prediction  # in-place; entry 0 is left untouched
+
+        outputs = (corrected_predictions,)
+
+        if output_attentions:
+            outputs += (self_attn_weights,)
+
+        return outputs
+
+
+class Gemma3nPreTrainedModel(Gemma2PreTrainedModel):
+    config: Gemma3nConfig
+    base_model_prefix = ""
+    _no_split_modules = ["Gemma3nTextDecoderLayer"]
+
+    def _init_weights(self, module):
+        PreTrainedModel._init_weights(self, module)  # invoked on PreTrainedModel explicitly rather than via super()
+        if isinstance(module, Gemma3nAudioCumulativeGroupNorm):  # module-specific values applied after the base init
+            module.weight.data.fill_(1.0)
+        elif isinstance(module, Gemma3nAudioAttention):
+            module.per_dim_scale.data.zero_()
+        elif isinstance(module, Gemma3nTextAltUp):
+            module.correct_output_scale.data.zero_()
+
+
+@auto_docstring(custom_intro="The base Gemma 3n language model without a language modeling head.")
+class Gemma3nTextModel(Gemma3TextModel):
+    config: Gemma3nTextConfig
+
+    def __init__(self, config: Gemma3nTextConfig):
+        super().__init__(config)
+
+        self.hidden_size = config.hidden_size
+        self.hidden_size_per_layer_input = config.hidden_size_per_layer_input
+
+        self.embed_tokens_per_layer = Gemma3nTextScaledWordEmbedding(
+            config.vocab_size_per_layer_input,
+            config.num_hidden_layers * config.hidden_size_per_layer_input,
+            self.padding_idx,
+            embed_scale=config.hidden_size_per_layer_input**0.5,
+        )
+
+        self.per_layer_model_projection = nn.Linear(
+            self.hidden_size,
+            config.num_hidden_layers * config.hidden_size_per_layer_input,
+            bias=False,
+        )
+
+        self.per_layer_projection_norm = Gemma3nRMSNorm(config.hidden_size_per_layer_input, eps=config.rms_norm_eps)
+        self.layers = nn.ModuleList(
+            [Gemma3nTextDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+        )
+
+        self.norm = Gemma3nRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+        self.altup_projections = nn.ModuleList(
+            [nn.Linear(self.hidden_size, self.hidden_size, bias=False) for _ in range(1, self.config.altup_num_inputs)]
+        )
+
+        self.altup_unembed_projections = nn.ModuleList(
+            [nn.Linear(self.hidden_size, self.hidden_size, bias=False) for _ in range(1, self.config.altup_num_inputs)]
+        )
+
+        self.register_buffer("per_layer_projection_scale", torch.tensor(self.hidden_size**-0.5), persistent=False)
+        self.register_buffer("per_layer_input_scale", torch.rsqrt(torch.tensor(2.0)), persistent=False)
+        self.rotary_emb = Gemma3nTextRotaryEmbedding(config=config)
+
+        # TODO (raushan): Fix this after RoPE refactor. For now we hack it by
+        # reassigning thetas when we want to create a local RoPE layer. Config
+        # defaults should hold values for global RoPE.
+        config = copy.deepcopy(config)  # copy, so the overrides below do not leak into the caller's config
+        config.rope_theta = config.rope_local_base_freq
+        config.rope_scaling = {"rope_type": "default"}
+        self.rotary_emb_local = Gemma3nTextRotaryEmbedding(config=config)
+
+    def get_per_layer_inputs(self, input_ids: torch.LongTensor) -> torch.Tensor:
+        return self.embed_tokens_per_layer(input_ids).reshape(  # split the wide embedding into one slice per layer
+            *input_ids.shape,
+            self.config.num_hidden_layers,
+            self.hidden_size_per_layer_input,
+        )
+
+    def project_per_layer_inputs(
+        self,
+        inputs_embeds: torch.Tensor,
+        per_layer_inputs: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        per_layer_projection: torch.Tensor = self.per_layer_model_projection(inputs_embeds)
+        per_layer_projection *= self.per_layer_projection_scale.to(
+            dtype=inputs_embeds.dtype, device=per_layer_projection.device
+        )
+        per_layer_projection = per_layer_projection.reshape(
+            *inputs_embeds.shape[:-1],
+            self.config.num_hidden_layers,
+            self.hidden_size_per_layer_input,
+        )
+        per_layer_projection = self.per_layer_projection_norm(per_layer_projection)
+
+        if per_layer_inputs is None:  # nothing to mix in: return the normalized projection alone
+            return per_layer_projection
+
+        if per_layer_projection.shape != per_layer_inputs.shape:
+            # per-layer inputs are sometimes padded with zeros, slice the relevant embeddings.
+            per_layer_inputs = per_layer_inputs[..., : self.config.num_hidden_layers, :]
+
+        return (per_layer_projection + per_layer_inputs) * self.per_layer_input_scale.to(
+            dtype=inputs_embeds.dtype, device=per_layer_projection.device
+        )
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        per_layer_inputs: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> BaseModelOutputWithPast:
+        r"""
+        per_layer_inputs (torch.Tensor, *optional*, defaults to None):
+            Pre-computed per-layer embeddings. Ignored and recomputed from `input_ids` whenever `input_ids` is given.
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+
+        if (input_ids is None) ^ (inputs_embeds is not None):  # XOR: rejects "both given" and "neither given"
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+        if self.gradient_checkpointing and self.training and use_cache:
+            logger.warning_once(
+                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
+            )
+            use_cache = False
+
+        if input_ids is not None:
+            inputs_embeds = self.embed_tokens(input_ids)
+            per_layer_inputs = self.get_per_layer_inputs(input_ids)  # overrides any caller-provided per_layer_inputs
+
+        per_layer_inputs = self.project_per_layer_inputs(inputs_embeds, per_layer_inputs)
+
+        if use_cache and past_key_values is None and not self.training:
+            past_key_values = DynamicCache(config=self.config)
+
+        if cache_position is None:
+            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+            cache_position = torch.arange(
+                past_seen_tokens,
+                past_seen_tokens + inputs_embeds.shape[1],
+                device=inputs_embeds.device,
+            )
+
+        if position_ids is None:
+            position_ids = cache_position.unsqueeze(0)
+
+        # It may already have been prepared by e.g. `generate`
+        if not isinstance(causal_mask_mapping := attention_mask, dict):
+            # Prepare mask arguments
+            mask_kwargs = {
+                "config": self.config,
+                "input_embeds": inputs_embeds,
+                "attention_mask": attention_mask,
+                "cache_position": cache_position,
+                "past_key_values": past_key_values,
+                "position_ids": position_ids,
+            }
+            # Create the masks
+            causal_mask_mapping = {
+                "full_attention": create_causal_mask(**mask_kwargs),
+                "sliding_attention": create_sliding_window_causal_mask(**mask_kwargs),
+            }
+
+        # keep a handle on the input embeddings; every AltUp entry below is derived from them
+        hidden_states_0 = inputs_embeds
+
+        # Initialize RoPE embeddings
+        position_embeddings_global = self.rotary_emb(hidden_states_0, position_ids)
+        position_embeddings_local = self.rotary_emb_local(hidden_states_0, position_ids)
+
+        # Expand hidden_states into `altup_num_inputs` entries, each rescaled to the RMS magnitude of the embeddings
+        target_magnitude = torch.mean(hidden_states_0**2, dim=-1, keepdim=True) ** 0.5
+        epsilon_tensor = torch.tensor(1e-5)
+
+        temp_hidden_states = [hidden_states_0]
+        for i in range(1, self.config.altup_num_inputs):
+            # altup_proj adapted from jax.numpy.einsum("btp,pd->btd", ...)
+            altup_proj = self.altup_projections[i - 1](hidden_states_0)
+            current_hidden_state = altup_proj.to(dtype=hidden_states_0.dtype, device=target_magnitude.device)
+            new_magnitude = torch.mean(current_hidden_state**2, dim=-1, keepdim=True)
+            new_magnitude = torch.sqrt(torch.maximum(new_magnitude, epsilon_tensor.to(target_magnitude.device)))
+            current_hidden_state = current_hidden_state * target_magnitude / new_magnitude
+            temp_hidden_states.append(current_hidden_state)
+
+        hidden_states = torch.stack(temp_hidden_states, dim=0)  # [num_altup_inputs, batch, seq_len, hidden_size]
+
+        # decoder layers
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+
+        for decoder_layer in self.layers[: self.config.num_hidden_layers]:
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+
+            causal_mask = causal_mask_mapping[decoder_layer.attention_type]
+            per_layer_input = per_layer_inputs[:, :, decoder_layer.layer_idx, :]
+
+            layer_outputs = decoder_layer(
+                hidden_states,
+                position_embeddings_global,
+                position_embeddings_local,
+                per_layer_input,
+                attention_mask=causal_mask,
+                position_ids=position_ids,
+                past_key_values=past_key_values,
+                output_attentions=output_attentions,
+                use_cache=use_cache,
+                cache_position=cache_position,
+                **kwargs,
+            )
+
+            hidden_states = layer_outputs[0]
+
+            if output_attentions:
+                all_self_attns += (layer_outputs[1],)
+
+        # add hidden states from the last decoder layer (but before reprojecting to stay consistent with layer output)
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+
+        # Collapse the AltUp entries back into a single hidden state: magnitude-match to entry 0, then average
+        target_magnitude = torch.mean(hidden_states[0] ** 2, dim=-1, keepdim=True) ** 0.5
+        temp_hidden_states = [hidden_states[0]]
+        for i in range(1, self.config.altup_num_inputs):
+            # altup_unembed_projections adapted from jax.numpy.einsum("btp,pd->btd", ...)
+            altup_unemb_proj: torch.Tensor = self.altup_unembed_projections[i - 1](hidden_states[i])
+            current_hidden_state = altup_unemb_proj.to(dtype=hidden_states_0.dtype, device=target_magnitude.device)
+            new_magnitude = torch.mean(current_hidden_state**2, dim=-1, keepdim=True)
+            new_magnitude = torch.sqrt(torch.maximum(new_magnitude, epsilon_tensor.to(target_magnitude.device)))
+            current_hidden_state = current_hidden_state * target_magnitude / new_magnitude
+            temp_hidden_states.append(current_hidden_state)
+
+        hidden_states = torch.stack(temp_hidden_states)
+        hidden_states = torch.mean(hidden_states, dim=0)
+        hidden_states = self.norm(hidden_states)
+
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+        )
+
+
+@auto_docstring(custom_intro="The base Gemma 3n language model with a language modeling head.")
+class Gemma3nForCausalLM(Gemma3ForCausalLM):
+    _checkpoint_conversion_mapping = {"model.language_model": "model"}  # NOTE(review): presumably a key-prefix rename on load; confirm
+    base_model_prefix = "model"
+
+
+class Gemma3nMultimodalEmbedder(nn.Module):
+    """Embeds token ids or soft tokens for multimodal content into language model space."""
+
+    def __init__(
+        self,
+        multimodal_config: Union[Gemma3nAudioConfig, Gemma3nVisionConfig],
+        text_config: Gemma3nTextConfig,
+    ):
+        super().__init__()
+
+        self.multimodal_hidden_size = multimodal_config.hidden_size
+        self.eps = multimodal_config.rms_norm_eps
+        self.vocab_offset = multimodal_config.vocab_offset
+        self.vocab_size = multimodal_config.vocab_size
+        self.text_hidden_size = text_config.hidden_size
+
+        self.embedding = nn.Embedding(self.vocab_size, self.multimodal_hidden_size)
+        self.hard_embedding_norm = Gemma3nRMSNorm(self.multimodal_hidden_size, eps=self.eps)  # for looked-up token ids
+        self.soft_embedding_norm = Gemma3nRMSNorm(self.multimodal_hidden_size, eps=self.eps)  # for `inputs_embeds`
+        self.embedding_projection = nn.Linear(self.multimodal_hidden_size, self.text_hidden_size, bias=False)
+        self.embedding_post_projection_norm = Gemma3nRMSNorm(self.text_hidden_size, eps=self.eps, with_scale=False)
+
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        """Embeds token ids or soft tokens for multimodal content into language model space.
+
+        Args:
+            input_ids: A torch.LongTensor containing the token ids to embed. Values should be in the range
+                `[vocab_offset, vocab_offset + vocab_size)`; `vocab_offset` is subtracted before the lookup.
+            inputs_embeds: A torch.Tensor containing the soft tokens to embed. Exactly one of the two is required.
+
+        Returns:
+            A torch.Tensor of embeddings with shape `[batch_size, seq_len, text_hidden_size]`.
+        """
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+        if inputs_embeds is not None:
+            emb_norm = self.soft_embedding_norm(inputs_embeds)
+        else:
+            hard_emb = self.embedding(input_ids - self.vocab_offset)
+            emb_norm = self.hard_embedding_norm(hard_emb)
+
+        emb_norm_proj = self.embedding_projection(emb_norm)
+        return self.embedding_post_projection_norm(emb_norm_proj)
+
+
+@auto_docstring(
+    custom_intro="""
+    The base Gemma 3n model comprising a vision backbone, an audio backbone, and a language model without a
+    language modeling head.
+    """
+)
+class Gemma3nModel(PaliGemmaModel):
+    _checkpoint_conversion_mapping = {}
+
+    def __init__(self, config: Gemma3nConfig):
+        super().__init__(config)
+        del self.multi_modal_projector  # Replaced by `embed_vision` (a Gemma3nMultimodalEmbedder) below
+        self.vocab_size_per_layer_input = config.text_config.vocab_size_per_layer_input
+        self.audio_tower = AutoModel.from_config(config.audio_config)
+        self.embed_vision = Gemma3nMultimodalEmbedder(config.vision_config, config.text_config)
+        self.embed_audio = Gemma3nMultimodalEmbedder(config.audio_config, config.text_config)
+
+    def get_image_features(self, pixel_values: torch.Tensor) -> torch.Tensor:
+        """
+        Projects the last hidden state from the vision model into language model space.
+
+        Args:
+            pixel_values (`torch.FloatTensor` of shape `(batch_size, channels, height, width)`):
+               The tensors corresponding to the input images.
+
+        Returns:
+            image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`.
+        """
+        vision_outputs = self.vision_tower(
+            pixel_values=pixel_values, do_pooling=False, return_dict=True
+        ).last_hidden_state
+        # Convert from (batch, channels, height, width) to (batch, height * width, channels) where:
+        # height == width and height * width == Gemma3nConfig.vision_soft_tokens_per_image.
+        vision_outputs = vision_outputs.reshape(
+            vision_outputs.shape[0],
+            self.config.vision_config.hidden_size,
+            self.config.vision_soft_tokens_per_image,
+        ).permute(0, 2, 1)
+        # Scale by sqrt(vision hidden size) in place, then embed the soft tokens into language model space.
+        vision_outputs *= self.config.vision_config.hidden_size**0.5
+        return self.embed_vision(inputs_embeds=vision_outputs)
+
+    def get_placeholder_mask(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        image_features: Optional[torch.FloatTensor] = None,
+        audio_features: Optional[torch.FloatTensor] = None,
+    ):
+        """
+        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
+        equal to the length of multimodal features. If the lengths are different, an error is raised.
+        """
+        if input_ids is None:
+            special_image_mask = inputs_embeds == self.get_input_embeddings()(
+                torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device)
+            )
+            special_image_mask = special_image_mask.all(-1)
+            special_audio_mask = (
+                inputs_embeds
+                == self.get_input_embeddings()(
+                    torch.tensor(self.config.audio_token_id, dtype=torch.long, device=inputs_embeds.device)
+                )
+            ).all(-1)
+        else:
+            special_image_mask = input_ids == self.config.image_token_id
+            special_audio_mask = input_ids == self.config.audio_token_id
+
+        n_image_tokens = special_image_mask.sum()
+        special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
+        if image_features is not None and inputs_embeds[special_image_mask].numel() != image_features.numel():
+            raise ValueError(
+                f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {image_features.shape[0] * image_features.shape[1]}"
+            )
+
+        n_audio_tokens = special_audio_mask.sum()
+        special_audio_mask = special_audio_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
+        if audio_features is not None and inputs_embeds[special_audio_mask].numel() != audio_features.numel():
+            raise ValueError(
+                f"Audio features and image tokens do not match: tokens: {n_audio_tokens}, features {audio_features.shape[0] * audio_features.shape[1]}"
+            )
+
+        return special_image_mask, special_audio_mask
+
+    @can_return_tuple
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,  # text inputs
+        pixel_values: Optional[torch.FloatTensor] = None,  # vision inputs
+        input_features: Optional[torch.FloatTensor] = None,  # audio inputs
+        attention_mask: Optional[torch.Tensor] = None,
+        input_features_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Union[list[torch.FloatTensor], Cache]] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        **lm_kwargs,
+    ) -> Gemma3nCausalLMOutputWithPast:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+            config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.
+
+        Example:
+
+        ```python
+        >>> from PIL import Image
+        >>> import requests
+        >>> from transformers import AutoProcessor, Gemma3nForConditionalGeneration
+
+        >>> model = Gemma3nForConditionalGeneration.from_pretrained("google/gemma3n2-3b-mix-224")
+        >>> processor = AutoProcessor.from_pretrained("google/gemma3n2-3b-mix-224")
+
+        >>> prompt = "Where is the cat standing?"
+        >>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> inputs = processor(images=image, text=prompt,  return_tensors="pt")
+
+        >>> # Generate
+        >>> generate_ids = model.generate(**inputs,)
+        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "Where is the cat standing?\nsnow"
+        ```
+        """
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+
+        if input_ids is not None:
+            inputs_embeds = self.get_input_embeddings()(input_ids)
+
+            # Prepare per-layer inputs from inputs_ids
+            per_layer_inputs_mask = torch.logical_and(input_ids >= 0, input_ids < self.vocab_size_per_layer_input)
+            per_layer_inputs_tokens = torch.where(per_layer_inputs_mask, input_ids, torch.zeros_like(input_ids))
+            per_layer_inputs = self.language_model.get_per_layer_inputs(per_layer_inputs_tokens)
+
+            # Handle vision tokens (>= embed_vision.vocab_offset and < embed_audio.vocab_offset)
+            vision_mask = torch.logical_and(
+                input_ids >= self.embed_vision.vocab_offset, input_ids < self.embed_audio.vocab_offset
+            )
+            dummy_vision_token_id = self.embed_vision.vocab_offset + self.embed_vision.vocab_size - 1
+            vision_input_ids = torch.where(vision_mask, input_ids, dummy_vision_token_id).to(inputs_embeds.device)
+            vision_embeds = self.embed_vision(input_ids=vision_input_ids)
+            expanded_vision_mask = vision_mask.unsqueeze(-1).expand_as(inputs_embeds)
+            inputs_embeds = torch.where(expanded_vision_mask, vision_embeds, inputs_embeds)
+
+            # Handle audio tokens (>= embed_audio.vocab_offset)
+            audio_mask = input_ids >= self.embed_audio.vocab_offset
+            dummy_audio_token_id = self.embed_audio.vocab_offset + self.embed_audio.vocab_size - 1
+            audio_input_ids = torch.where(audio_mask, input_ids, dummy_audio_token_id).to(inputs_embeds.device)
+            audio_embeds = self.embed_audio(input_ids=audio_input_ids)
+            expanded_audio_mask = audio_mask.unsqueeze(-1).expand_as(inputs_embeds)
+            inputs_embeds = torch.where(expanded_audio_mask, audio_embeds, inputs_embeds)
+        else:
+            per_layer_inputs = None
+
+        # Merge text and images
+        if pixel_values is not None:
+            image_features = self.get_image_features(pixel_values)
+            image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
+            special_image_mask, _ = self.get_placeholder_mask(
+                input_ids, inputs_embeds=inputs_embeds, image_features=image_features
+            )
+            inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
+
+        # Merge text and audio
+        if input_features is not None and input_features_mask is not None:
+            audio_features, audio_mask = self.get_audio_features(input_features, ~input_features_mask)
+
+            # The Gemma3nProcessor expects all audio will be 30s in length and inserts 188 audio soft tokens into the
+            # text to account for this. However, the audio preprocessing and encoder do not guarantee they will
+            # produce 188 soft tokens; they will produce at most that many tokens, but they may produce fewer tokens
+            # depending on the length of the longest audio input in the batch. When we encounter this situation, we pad
+            # the audio feature out to 188 soft tokens with the embedding of the last token in the embed_audio vocab.
+            audio_padding_toks = torch.tensor([[self.vocab_size - 1]], dtype=torch.long, device=audio_features.device)
+            audio_padding_embs = self.embed_audio(input_ids=audio_padding_toks)
+            audio_features = torch.where(audio_mask.unsqueeze(-1), audio_padding_embs, audio_features)
+
+            audio_batch_size, audio_seq_len, audio_embed_dim = audio_features.shape
+            extra_padding_tokens = self.config.audio_soft_tokens_per_image - audio_seq_len
+            extra_padding_features = audio_padding_embs.expand(audio_batch_size, extra_padding_tokens, audio_embed_dim)
+
+            audio_features = torch.cat((audio_features, extra_padding_features), dim=1)
+            audio_features = audio_features.to(inputs_embeds.device, inputs_embeds.dtype)
+            _, special_audio_mask = self.get_placeholder_mask(
+                input_ids, inputs_embeds=inputs_embeds, audio_features=audio_features
+            )
+            inputs_embeds = inputs_embeds.masked_scatter(special_audio_mask, audio_features)
+
+        outputs = self.language_model(
+            input_ids=None,
+            per_layer_inputs=per_layer_inputs,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=True,
+            cache_position=cache_position,
+            **lm_kwargs,
+        )
+
+        return Gemma3nModelOutputWithPast(
+            last_hidden_state=outputs.last_hidden_state,
+            past_key_values=outputs.past_key_values if use_cache else None,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            image_hidden_states=image_features if pixel_values is not None else None,
+            audio_hidden_states=audio_features if input_features is not None else None,
+        )
+
+    def get_audio_features(
+        self, input_features: torch.Tensor, input_features_mask: torch.Tensor
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """
+        Encode audio with the audio tower and project the result with `embed_audio`.
+
+        Args:
+            input_features (`torch.Tensor`): Audio input, passed unchanged to `self.audio_tower`.
+            input_features_mask (`torch.Tensor`): Mask for `input_features`, passed unchanged to `self.audio_tower`.
+
+        Returns:
+            `tuple[torch.Tensor, torch.Tensor]`: the tower output after `self.embed_audio(inputs_embeds=...)`,
+            followed by the mask returned by the tower.
+        """
+        encoded, tower_mask = self.audio_tower(input_features, input_features_mask)
+        projected = self.embed_audio(inputs_embeds=encoded)
+        return projected, tower_mask
+
+    def _update_causal_mask(self, **super_kwargs):  # stub that always raises; its arguments are ignored
+        raise AttributeError("We don't want to inherit it")
+
+
+@auto_docstring(
+    custom_intro="""
+    The base Gemma 3n model comprising a vision backbone, an audio backbone, a language model, and a language modeling
+    head.
+    """
+)
+class Gemma3nForConditionalGeneration(PaliGemmaForConditionalGeneration):
+    _checkpoint_conversion_mapping = {}  # empty: no checkpoint key renames are declared on this class
+    base_model_prefix = "model"
+
+    @property
+    def audio_tower(self):  # read-only shortcut to the wrapped model's audio tower
+        return self.model.audio_tower
+
+    @property
+    def multi_modal_projector(self):  # deliberately unavailable; accessing it always raises
+        raise AttributeError("Use embed_vision instead of multi_modal_projector.")
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,  # text inputs
+        pixel_values: Optional[torch.FloatTensor] = None,  # vision inputs
+        input_features: Optional[torch.FloatTensor] = None,  # audio inputs
+        attention_mask: Optional[torch.Tensor] = None,
+        input_features_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Union[list[torch.FloatTensor], Cache]] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
+        **lm_kwargs,
+    ) -> Gemma3nCausalLMOutputWithPast:
+        r"""
+        input_features_mask (torch.Tensor, *optional*, defaults to None):
+            The attention mask for the input audio.
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+            config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are
+            ignored (masked), the loss is only computed for the tokens with labels in
+            `[0, ..., config.text_config.vocab_size]`.
+
+        Example:
+
+        ```python
+        >>> from PIL import Image
+        >>> import requests
+        >>> from transformers import AutoProcessor, Gemma3ForConditionalGeneration
+
+        >>> model = Gemma3ForConditionalGeneration.from_pretrained("google/gemma-3-4b-it")
+        >>> processor = AutoProcessor.from_pretrained("google/gemma-3-4b-it")
+
+        >>> messages = [
+        ...     {
+        ...         "role": "system",
+        ...         "content": [
+        ...             {"type": "text", "text": "You are a helpful assistant."}
+        ...         ]
+        ...     },
+        ...     {
+        ...         "role": "user", "content": [
+        ...             {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"},
+        ...             {"type": "text", "text": "Where is the cat standing?"},
+        ...         ]
+        ...     },
+        ... ]
+
+        >>> inputs = processor.apply_chat_template(
+        ...     messages,
+        ...     tokenizer=True,
+        ...     return_dict=True,
+        ...     return_tensors="pt",
+        ...     add_generation_prompt=True
+        ... )
+        >>> # Generate
+        >>> generate_ids = model.generate(**inputs)
+        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "user\nYou are a helpful assistant.\n\n\n\n\n\nWhere is the cat standing?\nmodel\nBased on the image, the cat is standing in a snowy area, likely outdoors. It appears to"
+        ```
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+
+        outputs = self.model(
+            input_ids=input_ids,
+            pixel_values=pixel_values,
+            input_features=input_features,
+            attention_mask=attention_mask,
+            input_features_mask=input_features_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            token_type_ids=token_type_ids,
+            cache_position=cache_position,
+            inputs_embeds=inputs_embeds,
+            labels=labels,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=True,
+            **lm_kwargs,
+        )
+
+        hidden_states = outputs.last_hidden_state
+        # Only compute the logits selected by `logits_to_keep` (0 keeps all); the float upcast is left to the loss path
+        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+        logits = self.lm_head(hidden_states[:, slice_indices, :])
+        if (final_logit_softcapping := self.config.get_text_config().final_logit_softcapping) is not None:
+            logits = logits / final_logit_softcapping
+            logits = torch.tanh(logits)
+            logits = logits * final_logit_softcapping
+
+        loss = None
+        if labels is not None:
+            # Upcast to float if we need to compute the loss to avoid potential precision issues
+            logits = logits.float()
+            shift_logits = logits[..., :-1, :]  # drop the last position: it has no next-token label
+            shift_labels = labels[..., 1:]  # drop the first label: targets are shifted left by one
+            if attention_mask is not None:
+                # we use the input attention mask to shift the logits and labels, because it is 2D.
+                # we also crop attn mask in case it is longer, which happens in PrefixTuning with peft
+                shift_attention_mask = attention_mask[:, -shift_logits.shape[1] :].to(logits.device)
+                shift_logits = shift_logits[shift_attention_mask.to(logits.device) != 0].contiguous()
+                shift_labels = shift_labels[shift_attention_mask.to(shift_labels.device) != 0].contiguous()
+            else:
+                shift_logits = shift_logits.contiguous()
+                shift_labels = shift_labels.contiguous()
+            # Flatten the tokens
+            loss_fct = nn.CrossEntropyLoss()
+
+            flat_logits = shift_logits.view(-1, self.config.text_config.vocab_size)
+            flat_labels = shift_labels.view(-1).to(shift_logits.device)
+            loss = loss_fct(flat_logits, flat_labels)
+
+        return Gemma3nCausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            image_hidden_states=outputs.image_hidden_states,
+            audio_hidden_states=outputs.audio_hidden_states,
+        )
+
+    def prepare_inputs_for_generation(
+        self,
+        input_ids,
+        past_key_values=None,
+        inputs_embeds=None,
+        cache_position=None,
+        position_ids=None,
+        pixel_values=None,
+        input_features=None,
+        attention_mask=None,
+        input_features_mask=None,
+        token_type_ids=None,
+        use_cache=True,
+        logits_to_keep=None,
+        labels=None,
+        **kwargs,
+    ):
+        # Overwritten -- custom `position_ids` and `pixel_values` handling
+        model_inputs = super().prepare_inputs_for_generation(
+            input_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            cache_position=cache_position,
+            use_cache=use_cache,
+            logits_to_keep=logits_to_keep,
+            token_type_ids=token_type_ids,
+            **kwargs,
+        )
+
+        # If we're in cached decoding stage, multimodal inputs should be None because input ids do not contain special
+        # tokens anymore. Otherwise multimodal inputs should be passed to model.
+        # NOTE: use_cache=False always needs pixel_values, input_features, and input_features_mask
+        if cache_position[0] == 0:  # i.e. this call starts at cache position 0
+            model_inputs["pixel_values"] = pixel_values
+            model_inputs["input_features"] = input_features
+            model_inputs["input_features_mask"] = input_features_mask
+
+        return model_inputs
+
+    def _prepare_4d_causal_attention_mask_with_cache_position(self, **super_kwargs):
+        raise AttributeError("Do not inherit _prepare_4d_causal_attention_mask_with_cache_position from PaliGemma")
+
+
+__all__ = [
+    "Gemma3nAudioConfig",
+    "Gemma3nAudioEncoder",
+    "Gemma3nConfig",
+    "Gemma3nForCausalLM",
+    "Gemma3nForConditionalGeneration",
+    "Gemma3nModel",
+    "Gemma3nPreTrainedModel",  # noqa: F822
+    "Gemma3nTextConfig",
+    "Gemma3nTextModel",
+    "Gemma3nVisionConfig",
+]
diff --git a/phivenv/Lib/site-packages/transformers/models/gemma3n/processing_gemma3n.py b/phivenv/Lib/site-packages/transformers/models/gemma3n/processing_gemma3n.py
new file mode 100644
index 0000000000000000000000000000000000000000..89d2880cd5c33655106de497a65d0e44c9a54bbe
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/gemma3n/processing_gemma3n.py
@@ -0,0 +1,165 @@
+# coding=utf-8
+# Copyright 2025 Google Inc. HuggingFace Inc. team. All rights reserved.
+#
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Optional, Union
+
+import numpy as np
+
+from ...feature_extraction_utils import BatchFeature
+from ...image_utils import ImageInput, make_nested_list_of_images
+from ...processing_utils import AudioKwargs, ImagesKwargs, ProcessingKwargs, ProcessorMixin, Unpack
+from ...tokenization_utils_base import PreTokenizedInput, TextInput
+
+
+class Gemma3nImagesKwargs(ImagesKwargs):  # image kwargs schema used by Gemma3nProcessorKwargs
+    do_convert_rgb: Optional[bool]  # presumably toggles RGB conversion in the image processor -- TODO confirm
+
+
+class Gemma3nProcessorKwargs(ProcessingKwargs, total=False):  # schema passed to `_merge_kwargs` by the processor
+    audio_kwargs: AudioKwargs
+    images_kwargs: Gemma3nImagesKwargs
+    _defaults = {
+        "text_kwargs": {
+            "padding": False,  # default tokenizer padding; presumably overridable per call -- TODO confirm
+        },
+    }
+
+
+class Gemma3nProcessor(ProcessorMixin):
+    """
+    A processor for Gemma 3n, wrapping the full capabilities of a feature extractor, image processor, and tokenizer
+    into a single processor.
+
+    Args:
+        feature_extractor (`Gemma3nAudioFeatureExtractor`):
+            Feature extractor that converts raw audio waveforms into MEL spectrograms for the audio encoder. This
+            should return a `BatchFeature` with `input_features` and `input_features_mask` features.
+        image_processor (`SiglipImageProcessorFast`):
+            Image processor that prepares batches of images for the vision encoder. This should return a `BatchFeature`
+            with a `pixel_values` feature.
+        tokenizer (`GemmaTokenizerFast`):
+            The text tokenizer for the model.
+        chat_template (`string`, *optional*):
+            A Jinja template for generating text prompts from a set of messages.
+        audio_seq_length (int, *optional*, defaults to 188):
+            The number of audio soft tokens that will be added to the text prompt for each audio placeholder.
+        image_seq_length (int, *optional*, defaults to 256):
+            The number of image soft tokens that will be added to the text prompt for each image placeholder.
+    """
+
+    attributes = ["feature_extractor", "image_processor", "tokenizer"]
+    feature_extractor_class = "AutoFeatureExtractor"
+    image_processor_class = "AutoImageProcessor"
+    tokenizer_class = "AutoTokenizer"
+
+    def __init__(
+        self,
+        feature_extractor,
+        image_processor,
+        tokenizer,
+        chat_template=None,
+        audio_seq_length: int = 188,
+        image_seq_length: int = 256,
+        **kwargs,
+    ):
+        self.audio_seq_length = audio_seq_length
+        self.audio_token_id = tokenizer.audio_token_id
+        self.boa_token = tokenizer.boa_token
+        self.audio_token = tokenizer.audio_token
+        audio_tokens_expanded = "".join([tokenizer.audio_token] * audio_seq_length)
+        self.full_audio_sequence = f"\n\n{tokenizer.boa_token}{audio_tokens_expanded}{tokenizer.eoa_token}\n\n"
+
+        self.image_seq_length = image_seq_length
+        self.image_token_id = tokenizer.image_token_id
+        self.boi_token = tokenizer.boi_token
+        self.image_token = tokenizer.image_token
+        image_tokens_expanded = "".join([tokenizer.image_token] * image_seq_length)
+        self.full_image_sequence = f"\n\n{tokenizer.boi_token}{image_tokens_expanded}{tokenizer.eoi_token}\n\n"
+
+        super().__init__(
+            feature_extractor=feature_extractor,
+            image_processor=image_processor,
+            tokenizer=tokenizer,
+            chat_template=chat_template,
+            **kwargs,
+        )
+
+    def __call__(
+        self,
+        images: ImageInput = None,
+        text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
+        audio: Optional[Union[np.ndarray, list[float], list[np.ndarray], list[list[float]]]] = None,
+        videos=None,
+        **kwargs: Unpack[Gemma3nProcessorKwargs],
+    ) -> BatchFeature:
+        """Expand media placeholders in `text`, tokenize it, and return every modality in one `BatchFeature`."""
+        if text is None and images is None and audio is None:
+            raise ValueError("Provide at least one of `text`, `images`, or `audio`.")
+
+        output_kwargs = self._merge_kwargs(
+            Gemma3nProcessorKwargs,
+            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
+            **kwargs,
+        )
+
+        # `text` may be None for audio-/image-only calls; placeholder prompts are synthesized further down.
+        if isinstance(text, str):
+            text = [text]
+        elif text is not None and not isinstance(text, list) and not isinstance(text[0], str):
+            raise ValueError("Invalid input text. Please provide a string, or a list of strings")
+
+        audio_inputs = {}
+        if audio is not None:
+            audio_inputs = self.feature_extractor(audio, **output_kwargs["audio_kwargs"])
+
+            if not text:
+                text = [self.audio_token for _ in audio]
+
+            # Expand placeholder audio tokens to the full audio token sequence
+            text = [prompt.replace(self.audio_token, self.full_audio_sequence) for prompt in text]
+
+        image_inputs = {}
+        if images is not None:
+            images = self.image_processor.fetch_images(images)
+            batched_images = make_nested_list_of_images(images)
+            image_inputs = self.image_processor(batched_images, **output_kwargs["images_kwargs"])
+
+            # Create empty text to be replaced with placeholders
+            if not text:
+                text = [" ".join([self.image_token] * len(images)) for images in batched_images]
+
+            if len(batched_images) != len(text):
+                raise ValueError(
+                    f"Received inconsistently sized batches of images ({len(batched_images)}) and text ({len(text)})."
+                )
+
+            # Expand placeholder image tokens to the full image token sequence
+            text = [prompt.replace(self.image_token, self.full_image_sequence) for prompt in text]
+
+        return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
+        text_inputs = self.tokenizer(text=text, **output_kwargs["text_kwargs"], return_tensors="np")
+        self._check_special_mm_tokens(text, text_inputs, modalities=["image"])
+
+        # Add token type ids manually, as tokenizer can't do arbitrary position token types
+        array_ids = text_inputs["input_ids"]
+        token_type_ids = np.zeros_like(array_ids)
+        token_type_ids[array_ids == self.image_token_id] = 1
+        token_type_ids[array_ids == self.audio_token_id] = 3
+        text_inputs = {k: v.tolist() for k, v in text_inputs.items()}  # in case user requested list inputs
+        text_inputs["token_type_ids"] = token_type_ids.tolist()
+        return BatchFeature(data={**text_inputs, **image_inputs, **audio_inputs}, tensor_type=return_tensors)
+
+
+__all__ = ["Gemma3nProcessor"]
diff --git a/phivenv/Lib/site-packages/transformers/models/git/__init__.py b/phivenv/Lib/site-packages/transformers/models/git/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..06e3e86927ab7901f1302a87882c4f841f35865d
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/git/__init__.py
@@ -0,0 +1,28 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_git import *
+    from .modeling_git import *
+    from .processing_git import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/phivenv/Lib/site-packages/transformers/models/git/__pycache__/__init__.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/git/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..724574707943883ea5e118698f50ea195549bef0
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/git/__pycache__/__init__.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/git/__pycache__/configuration_git.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/git/__pycache__/configuration_git.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3e0d331cac9291808c3518cd94e2d811a4c53db6
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/git/__pycache__/configuration_git.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/git/__pycache__/modeling_git.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/git/__pycache__/modeling_git.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6048c98c100ad2fef13e81b7efd9a407969852de
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/git/__pycache__/modeling_git.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/git/__pycache__/processing_git.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/git/__pycache__/processing_git.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..375c4360f4e22dad7d401aaafd42fd0600c6538b
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/git/__pycache__/processing_git.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/git/configuration_git.py b/phivenv/Lib/site-packages/transformers/models/git/configuration_git.py
new file mode 100644
index 0000000000000000000000000000000000000000..86c85854ff98bed91cc926db1daf48fcdb1ffb0e
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/git/configuration_git.py
@@ -0,0 +1,222 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class GitVisionConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`GitVisionModel`]. It is used to instantiate a GIT
+    vision encoder according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the vision encoder of the GIT
+    [microsoft/git-base](https://huggingface.co/microsoft/git-base) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        hidden_size (`int`, *optional*, defaults to 768):
+            Dimensionality of the encoder layers and the pooler layer.
+        intermediate_size (`int`, *optional*, defaults to 3072):
+            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+        num_hidden_layers (`int`, *optional*, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 12):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        num_channels (`int`, *optional*, defaults to 3):
+            Number of channels; stored on the config as `num_channels` (presumably the input image channels).
+        image_size (`int`, *optional*, defaults to 224):
+            The size (resolution) of each image.
+        patch_size (`int`, *optional*, defaults to 16):
+            The size (resolution) of each patch.
+        hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"selu"`, `"gelu_new"` and `"quick_gelu"` are supported.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-5):
+            The epsilon used by the layer normalization layers.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+
+    Example:
+
+    ```python
+    >>> from transformers import GitVisionConfig, GitVisionModel
+
+    >>> # Initializing a GitVisionConfig with microsoft/git-base style configuration
+    >>> configuration = GitVisionConfig()
+    >>> # Initializing a GitVisionModel (with random weights) from the microsoft/git-base style configuration
+    >>> model = GitVisionModel(configuration)
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "git_vision_model"
+    base_config_key = "vision_config"
+
+    def __init__(
+        self,
+        hidden_size=768,
+        intermediate_size=3072,
+        num_hidden_layers=12,
+        num_attention_heads=12,
+        num_channels=3,
+        image_size=224,
+        patch_size=16,
+        hidden_act="quick_gelu",
+        layer_norm_eps=1e-5,
+        attention_dropout=0.0,
+        initializer_range=0.02,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.num_channels = num_channels
+        self.patch_size = patch_size
+        self.image_size = image_size
+        self.initializer_range = initializer_range
+        self.attention_dropout = attention_dropout
+        self.layer_norm_eps = layer_norm_eps
+        self.hidden_act = hidden_act
+
+
+class GitConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`GitModel`]. It is used to instantiate a GIT model
+    according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+    defaults will yield a similar configuration to that of the GIT
+    [microsoft/git-base](https://huggingface.co/microsoft/git-base) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        vision_config (`dict`, *optional*):
+            Dictionary of configuration options used to initialize [`GitVisionConfig`].
+        vocab_size (`int`, *optional*, defaults to 30522):
+            Vocabulary size of the GIT model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`GitModel`].
+        hidden_size (`int`, *optional*, defaults to 768):
+            Dimensionality of the encoder layers and the pooler layer.
+        num_hidden_layers (`int`, *optional*, defaults to 6):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 12):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        intermediate_size (`int`, *optional*, defaults to 3072):
+            Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
+        hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"silu"` and `"gelu_new"` are supported.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout ratio for the attention probabilities.
+        max_position_embeddings (`int`, *optional*, defaults to 1024):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            just in case (e.g., 512 or 1024 or 2048).
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
+            The epsilon used by the layer normalization layers.
+        position_embedding_type (`str`, *optional*, defaults to `"absolute"`):
+            Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For
+            positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to
+            [Self-Attention with Relative Position Representations (Shaw et al.)](https://huggingface.co/papers/1803.02155).
+            For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models
+            with Better Relative Position Embeddings (Huang et al.)](https://huggingface.co/papers/2009.13658).
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models).
+        num_image_with_embedding (`int`, *optional*):
+            The number of temporal embeddings to add, in case the model is used for video captioning/VQA.
+
+    Examples:
+
+    ```python
+    >>> from transformers import GitConfig, GitModel
+
+    >>> # Initializing a GIT microsoft/git-base style configuration
+    >>> configuration = GitConfig()
+
+    >>> # Initializing a model (with random weights) from the microsoft/git-base style configuration
+    >>> model = GitModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "git"
+    sub_configs = {"vision_config": GitVisionConfig}
+
+    def __init__(
+        self,
+        vision_config=None,
+        vocab_size=30522,
+        hidden_size=768,
+        num_hidden_layers=6,
+        num_attention_heads=12,
+        intermediate_size=3072,
+        hidden_act="gelu",
+        hidden_dropout_prob=0.1,
+        attention_probs_dropout_prob=0.1,
+        max_position_embeddings=1024,
+        initializer_range=0.02,
+        layer_norm_eps=1e-12,
+        pad_token_id=0,
+        position_embedding_type="absolute",
+        use_cache=True,
+        tie_word_embeddings=False,
+        bos_token_id=101,
+        eos_token_id=102,
+        num_image_with_embedding=None,
+        **kwargs,
+    ):
+        super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, pad_token_id=pad_token_id, **kwargs)
+
+        if vision_config is None:
+            vision_config = {}
+            logger.info("vision_config is None. initializing the GitVisionConfig with default values.")
+
+        self.vision_config = GitVisionConfig(**vision_config)
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.hidden_act = hidden_act
+        self.intermediate_size = intermediate_size
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.max_position_embeddings = max_position_embeddings
+        self.initializer_range = initializer_range
+        self.layer_norm_eps = layer_norm_eps
+        self.position_embedding_type = position_embedding_type
+        self.use_cache = use_cache
+        self.tie_word_embeddings = tie_word_embeddings
+        self.num_image_with_embedding = num_image_with_embedding
+
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
+
+
+__all__ = ["GitConfig", "GitVisionConfig"]
diff --git a/phivenv/Lib/site-packages/transformers/models/git/modeling_git.py b/phivenv/Lib/site-packages/transformers/models/git/modeling_git.py
new file mode 100644
index 0000000000000000000000000000000000000000..2b69cf07a0467adbdbd016aa14a094dc0ed046a1
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/git/modeling_git.py
@@ -0,0 +1,1464 @@
+# coding=utf-8
+# Copyright 2022 Microsoft Research and The HuggingFace Inc. team.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch GIT model."""
+
+import math
+from dataclasses import dataclass
+from typing import Callable, Optional, Union
+
+import torch
+import torch.utils.checkpoint
+from torch import nn
+
+from ...activations import ACT2FN
+from ...cache_utils import Cache, DynamicCache
+from ...generation import GenerationMixin
+from ...modeling_attn_mask_utils import _prepare_4d_attention_mask
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import (
+    BaseModelOutput,
+    BaseModelOutputWithPast,
+    BaseModelOutputWithPooling,
+    CausalLMOutputWithPast,
+)
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
+from ...utils import (
+    ModelOutput,
+    auto_docstring,
+    can_return_tuple,
+    logging,
+    torch_int,
+)
+from ...utils.deprecation import deprecate_kwarg
+from .configuration_git import GitConfig, GitVisionConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states.
+    """
+)
+# Copied from transformers.models.clip.modeling_clip.CLIPVisionModelOutput with CLIP->Git
+class GitVisionModelOutput(ModelOutput):
+    r"""
+    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
+        The image embeddings obtained by applying the projection layer to the pooler_output.
+    """
+
+    image_embeds: Optional[torch.FloatTensor] = None  # every field is optional and defaults to None
+    last_hidden_state: Optional[torch.FloatTensor] = None  # NOTE(review): not described in the docstring above
+    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None  # a tuple of tensors when populated
+    attentions: Optional[tuple[torch.FloatTensor, ...]] = None  # a tuple of tensors when populated
+
+
+class GitEmbeddings(nn.Module):
+    """Construct the embeddings from word and position embeddings, then apply LayerNorm and dropout."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
+        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
+
+        # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
+        # any TensorFlow checkpoint file
+        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        # position_ids has shape (1, max_position_embeddings); it is a non-persistent buffer (not in the state dict)
+        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
+        self.register_buffer(
+            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
+        )
+
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        past_key_values_length: int = 0,
+    ) -> torch.Tensor:
+        if input_ids is not None:  # `input_ids` decides the shape when both inputs are given
+            input_shape = input_ids.size()
+        else:
+            input_shape = inputs_embeds.size()[:-1]  # drop the trailing hidden dimension
+
+        seq_length = input_shape[1]
+
+        if position_ids is None:  # default positions start right after the `past_key_values_length` offset
+            position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length]
+
+        if inputs_embeds is None:
+            embeddings = self.word_embeddings(input_ids)
+        else:
+            embeddings = inputs_embeds  # caller-owned tensor: must not be modified in place below
+
+        if self.position_embedding_type == "absolute":  # other types add no position term at this stage
+            position_embeddings = self.position_embeddings(position_ids)
+            embeddings = embeddings + position_embeddings  # out-of-place add keeps `inputs_embeds` untouched
+        embeddings = self.LayerNorm(embeddings)
+        embeddings = self.dropout(embeddings)
+        return embeddings
+
+
+class GitSelfAttention(nn.Module):  # eager self-attention; caches only the non-image part of keys/values
+    def __init__(self, config, position_embedding_type=None, layer_idx=None):
+        super().__init__()
+        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
+            raise ValueError(
+                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
+                f"heads ({config.num_attention_heads})"
+            )
+        self.layer_idx = layer_idx  # passed to `past_key_values.update` in forward()
+        if layer_idx is None:
+            logger.warning_once(
+                f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
+                "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
+                "when creating this class."
+            )
+
+        self.num_attention_heads = config.num_attention_heads
+        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
+        self.all_head_size = self.num_attention_heads * self.attention_head_size
+        self.image_patch_tokens = int((config.vision_config.image_size // config.vision_config.patch_size) ** 2 + 1)  # floor division, same patch grid as GitVisionEmbeddings; +1 for its class embedding
+        if config.num_image_with_embedding is not None:  # several frames: one block of image tokens per frame
+            self.image_patch_tokens *= config.num_image_with_embedding
+
+        self.query = nn.Linear(config.hidden_size, self.all_head_size)
+        self.key = nn.Linear(config.hidden_size, self.all_head_size)
+        self.value = nn.Linear(config.hidden_size, self.all_head_size)
+
+        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
+        self.position_embedding_type = position_embedding_type or getattr(  # explicit argument wins over the config
+            config, "position_embedding_type", "absolute"
+        )
+        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
+            self.max_position_embeddings = config.max_position_embeddings
+            self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        output_attentions: Optional[bool] = False,
+        pixel_values_present: Optional[bool] = False,
+    ) -> tuple[torch.Tensor]:
+        batch_size, seq_length, _ = hidden_states.shape
+        query_layer = (  # (batch, heads, seq, head_size) after the view + transpose
+            self.query(hidden_states)
+            .view(batch_size, -1, self.num_attention_heads, self.attention_head_size)
+            .transpose(1, 2)
+        )
+
+        cutoff = self.image_patch_tokens if pixel_values_present else 0  # number of leading positions treated as image tokens
+        key_layer = (
+            self.key(hidden_states)
+            .view(batch_size, -1, self.num_attention_heads, self.attention_head_size)
+            .transpose(1, 2)
+        )
+        value_layer = (
+            self.value(hidden_states)
+            .view(batch_size, -1, self.num_attention_heads, self.attention_head_size)
+            .transpose(1, 2)
+        )
+        if past_key_values is not None:
+            # NOTE: like in other caches, we store the text component. In GIT it means we discard the image component.
+            key_layer_past, value_layer_past = past_key_values.update(
+                key_layer[:, :, cutoff:, :], value_layer[:, :, cutoff:, :], self.layer_idx
+            )
+            key_layer = torch.cat([key_layer[:, :, :cutoff, :], key_layer_past], dim=2)  # image part + cached part
+            value_layer = torch.cat([value_layer[:, :, :cutoff, :], value_layer_past], dim=2)
+
+        # Take the dot product between "query" and "key" to get the raw attention scores.
+        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
+
+        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
+            query_length, key_length = query_layer.shape[2], key_layer.shape[2]
+            if past_key_values is not None:  # with a cache, a single query position (the last key index) is used
+                position_ids_l = torch.tensor(key_length - 1, dtype=torch.long, device=hidden_states.device).view(
+                    -1, 1
+                )
+            else:
+                position_ids_l = torch.arange(query_length, dtype=torch.long, device=hidden_states.device).view(-1, 1)
+            position_ids_r = torch.arange(key_length, dtype=torch.long, device=hidden_states.device).view(1, -1)
+            distance = position_ids_l - position_ids_r
+
+            positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1)  # shift distances to non-negative indices
+            positional_embedding = positional_embedding.to(dtype=query_layer.dtype)  # fp16 compatibility
+
+            if self.position_embedding_type == "relative_key":
+                relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
+                attention_scores = attention_scores + relative_position_scores
+            elif self.position_embedding_type == "relative_key_query":
+                relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
+                relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding)
+                attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key
+
+        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
+        if attention_mask is not None:
+            # Apply the attention mask (precomputed for all layers in GitModel forward() function)
+            attention_scores = attention_scores + attention_mask
+
+        # Normalize the attention scores to probabilities.
+        attention_probs = nn.functional.softmax(attention_scores, dim=-1)
+
+        # This is actually dropping out entire tokens to attend to, which might
+        # seem a bit unusual, but is taken from the original Transformer paper.
+        attention_probs = self.dropout(attention_probs)
+
+        # Mask heads if we want to
+        if head_mask is not None:
+            attention_probs = attention_probs * head_mask
+
+        context_layer = torch.matmul(attention_probs, value_layer)
+
+        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()  # back to (batch, seq, heads, head_size)
+        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)  # merge the head dimensions
+        context_layer = context_layer.view(new_context_layer_shape)
+
+        return context_layer, attention_probs  # probabilities are returned regardless of `output_attentions`
+
+
+# Copied from transformers.models.bert.modeling_bert.BertSelfOutput
+class GitSelfOutput(nn.Module):  # post-attention projection with a residual connection
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = self.LayerNorm(hidden_states + input_tensor)  # residual add, then normalize
+        return hidden_states
+
+
+GIT_SELF_ATTENTION_CLASSES = {  # looked up by `config._attn_implementation` in GitAttention
+    "eager": GitSelfAttention,  # the only implementation registered here
+}
+
+
+class GitAttention(nn.Module):  # self-attention followed by its output projection block
+    def __init__(self, config, position_embedding_type=None, layer_idx=None):
+        super().__init__()
+        self.self = GIT_SELF_ATTENTION_CLASSES[config._attn_implementation](  # KeyError if the implementation is not registered
+            config, position_embedding_type=position_embedding_type, layer_idx=layer_idx
+        )
+        self.output = GitSelfOutput(config)
+        self.pruned_heads = set()  # heads already removed by prune_heads()
+
+    # Copied from transformers.models.bert.modeling_bert.BertAttention.prune_heads
+    def prune_heads(self, heads):
+        if len(heads) == 0:  # nothing to prune
+            return
+        heads, index = find_pruneable_heads_and_indices(
+            heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
+        )
+
+        # Prune linear layers
+        self.self.query = prune_linear_layer(self.self.query, index)
+        self.self.key = prune_linear_layer(self.self.key, index)
+        self.self.value = prune_linear_layer(self.self.value, index)
+        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
+
+        # Update hyper params and store pruned heads
+        self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
+        self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
+        self.pruned_heads = self.pruned_heads.union(heads)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        output_attentions: Optional[bool] = False,
+        pixel_values_present: Optional[bool] = False,
+    ) -> tuple[torch.Tensor]:
+        attn_output, self_attn_weights = self.self(  # positional order matches GitSelfAttention.forward
+            hidden_states,
+            attention_mask,
+            head_mask,
+            past_key_values,
+            output_attentions,
+            pixel_values_present,
+        )
+        attention_output = self.output(attn_output, hidden_states)  # `hidden_states` is the residual input
+        return attention_output, self_attn_weights
+
+
+# Copied from transformers.models.bert.modeling_bert.BertIntermediate
+class GitIntermediate(nn.Module):  # hidden_size -> intermediate_size projection plus activation
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
+        if isinstance(config.hidden_act, str):  # a string is resolved through ACT2FN
+            self.intermediate_act_fn = ACT2FN[config.hidden_act]
+        else:  # anything else is used directly as the activation
+            self.intermediate_act_fn = config.hidden_act
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.intermediate_act_fn(hidden_states)
+        return hidden_states
+
+
+# Copied from transformers.models.bert.modeling_bert.BertOutput
+class GitOutput(nn.Module):  # intermediate_size -> hidden_size projection with a residual connection
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
+        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = self.LayerNorm(hidden_states + input_tensor)  # residual add, then normalize
+        return hidden_states
+
+
+class GitLayer(GradientCheckpointingLayer):  # one block: attention, then a chunkable feed-forward
+    def __init__(self, config, layer_idx=None):
+        super().__init__()
+        self.chunk_size_feed_forward = config.chunk_size_feed_forward
+        self.seq_len_dim = 1  # dimension handed to apply_chunking_to_forward for the feed-forward
+        self.attention = GitAttention(config, layer_idx=layer_idx)
+        self.intermediate = GitIntermediate(config)
+        self.output = GitOutput(config)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        output_attentions: Optional[bool] = False,
+        pixel_values_present: Optional[bool] = False,
+    ) -> tuple[torch.Tensor]:
+        # self-attention; `past_key_values` is forwarded as-is to the attention module
+        attention_output, self_attention_weights = self.attention(
+            hidden_states,
+            attention_mask,
+            head_mask,
+            output_attentions=output_attentions,
+            past_key_values=past_key_values,
+            pixel_values_present=pixel_values_present,
+        )
+
+        layer_output = apply_chunking_to_forward(  # runs feed_forward_chunk on `attention_output`
+            self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
+        )
+        return layer_output, self_attention_weights
+
+    def feed_forward_chunk(self, attention_output):
+        intermediate_output = self.intermediate(attention_output)
+        layer_output = self.output(intermediate_output, attention_output)  # `attention_output` is the residual input
+        return layer_output
+
+
+class GitEncoder(nn.Module):  # stack of `config.num_hidden_layers` GitLayer modules
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.layer = nn.ModuleList([GitLayer(config, i) for i in range(config.num_hidden_layers)])  # i is the layer_idx
+        self.gradient_checkpointing = False
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        past_key_values: Optional[Union[Cache, tuple[tuple[torch.FloatTensor]]]] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = False,
+        output_hidden_states: Optional[bool] = False,
+        pixel_values_present: Optional[bool] = False,
+        return_dict: Optional[bool] = True,
+    ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPast]:
+        if self.gradient_checkpointing and self.training:
+            if use_cache:
+                logger.warning_once(
+                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+                )
+                use_cache = False
+
+        # TODO (joao): remove this exception in v4.56 -- it exists for users that try to pass a legacy cache
+        if not isinstance(past_key_values, (type(None), Cache)):  # legacy tuple caches are rejected despite the annotation
+            raise ValueError("The `past_key_values` should be either a `Cache` object or `None`.")
+
+        if use_cache and past_key_values is None:  # caching requested but no cache supplied: create one
+            past_key_values = DynamicCache(config=self.config)
+
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attentions = () if output_attentions else None
+        for i, layer_module in enumerate(self.layer):
+            if output_hidden_states:  # record the input of each layer
+                all_hidden_states = all_hidden_states + (hidden_states,)
+
+            layer_head_mask = head_mask[i] if head_mask is not None else None
+
+            layer_outputs = layer_module(  # positional order matches GitLayer.forward
+                hidden_states,
+                attention_mask,
+                layer_head_mask,
+                past_key_values,
+                output_attentions,
+                pixel_values_present,
+            )
+
+            hidden_states = layer_outputs[0]
+            if output_attentions:
+                all_self_attentions = all_self_attentions + (layer_outputs[1],)
+
+        if output_hidden_states:  # also record the output of the last layer
+            all_hidden_states = all_hidden_states + (hidden_states,)
+
+        if not return_dict:  # tuple output leaves out every entry that is None
+            return tuple(
+                v
+                for v in [
+                    hidden_states,
+                    past_key_values,
+                    all_hidden_states,
+                    all_self_attentions,
+                ]
+                if v is not None
+            )
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attentions,
+        )
+
+
+@auto_docstring
+class GitPreTrainedModel(PreTrainedModel):
+    config: GitConfig
+    base_model_prefix = "git"
+    supports_gradient_checkpointing = True
+
+    def _init_weights(self, module):
+        """Initialize the weights of `module` in place, based on its type and `config.initializer_range`."""
+        if isinstance(module, GitVisionEmbeddings):  # separate `if`: the chain below is still evaluated afterwards
+            nn.init.normal_(module.class_embedding, mean=0.0, std=self.config.initializer_range)
+            nn.init.normal_(module.patch_embedding.weight, std=self.config.initializer_range)
+            nn.init.normal_(module.position_embedding.weight, std=self.config.initializer_range)
+        if isinstance(module, nn.Linear):
+            # Slightly different from the TF version which uses truncated_normal for initialization
+            # cf https://github.com/pytorch/pytorch/pull/5617
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.padding_idx is not None:  # the padding row is reset to zeros
+                module.weight.data[module.padding_idx].zero_()
+        elif isinstance(module, nn.LayerNorm):  # bias 0 and weight 1
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+
+
+# Copied from transformers.models.clip.modeling_clip.CLIPVisionEmbeddings with CLIP->Git
+class GitVisionEmbeddings(nn.Module):  # patch + class embeddings with learned position embeddings
+    def __init__(self, config: GitVisionConfig):
+        super().__init__()
+        self.config = config
+        self.embed_dim = config.hidden_size
+        self.image_size = config.image_size
+        self.patch_size = config.patch_size
+
+        self.class_embedding = nn.Parameter(torch.randn(self.embed_dim))  # one learned vector, prepended in forward()
+
+        self.patch_embedding = nn.Conv2d(  # kernel == stride == patch_size
+            in_channels=config.num_channels,
+            out_channels=self.embed_dim,
+            kernel_size=self.patch_size,
+            stride=self.patch_size,
+            bias=False,
+        )
+
+        self.num_patches = (self.image_size // self.patch_size) ** 2
+        self.num_positions = self.num_patches + 1  # +1 for the class embedding
+        self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
+        self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)
+
+    def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
+        """
+        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
+        images. This method is also adapted to support torch.jit tracing.
+
+        Adapted from:
+        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
+        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
+        """
+
+        num_patches = embeddings.shape[1] - 1  # exclude the class position
+        position_embedding = self.position_embedding.weight.unsqueeze(0)
+        num_positions = position_embedding.shape[1] - 1  # exclude the class position
+
+        # always interpolate when tracing to ensure the exported model works for dynamic input shapes
+        if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
+            return self.position_embedding(self.position_ids)
+
+        class_pos_embed = position_embedding[:, :1]  # kept as-is and concatenated back at the end
+        patch_pos_embed = position_embedding[:, 1:]
+
+        dim = embeddings.shape[-1]
+
+        new_height = height // self.patch_size
+        new_width = width // self.patch_size
+
+        sqrt_num_positions = torch_int(num_positions**0.5)  # side length of the stored (square) position grid
+        patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
+        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)  # channels first for nn.functional.interpolate
+
+        patch_pos_embed = nn.functional.interpolate(
+            patch_pos_embed,
+            size=(new_height, new_width),
+            mode="bicubic",
+            align_corners=False,
+        )
+
+        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)  # back to (1, positions, dim)
+
+        return torch.cat((class_pos_embed, patch_pos_embed), dim=1)
+
+    def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding=False) -> torch.Tensor:
+        batch_size, _, height, width = pixel_values.shape
+        if not interpolate_pos_encoding and (height != self.image_size or width != self.image_size):
+            raise ValueError(
+                f"Input image size ({height}*{width}) doesn't match model ({self.image_size}*{self.image_size})."
+            )
+        target_dtype = self.patch_embedding.weight.dtype  # cast the input to the conv weight dtype
+        patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype))  # shape = [*, width, grid, grid]
+        patch_embeds = patch_embeds.flatten(2).transpose(1, 2)  # merge the two grid dims, then move them before width
+
+        class_embeds = self.class_embedding.expand(batch_size, 1, -1)
+        embeddings = torch.cat([class_embeds, patch_embeds], dim=1)  # class embedding goes first
+        if interpolate_pos_encoding:
+            embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
+        else:
+            embeddings = embeddings + self.position_embedding(self.position_ids)
+        return embeddings
+
+
+class GitVisionMLP(nn.Module):  # two-layer feed-forward: fc1 -> activation -> fc2
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.activation_fn = ACT2FN[config.hidden_act]  # `hidden_act` is used as an ACT2FN key here
+        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
+        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.fc1(hidden_states)
+        hidden_states = self.activation_fn(hidden_states)
+        hidden_states = self.fc2(hidden_states)  # project back to hidden_size
+        return hidden_states
+
+
+# Copied from transformers.models.siglip.modeling_siglip.eager_attention_forward
+def eager_attention_forward(  # softmax(q @ k^T * scaling + mask) @ v; returns (attn_output, attn_weights)
+    module: nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: Optional[torch.Tensor],
+    scaling: float,
+    dropout: float = 0.0,
+    **kwargs,
+):
+    attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling
+    if attention_mask is not None:  # the mask is added to the scores
+        attn_weights = attn_weights + attention_mask
+
+    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)  # softmax in float32, then cast back
+    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)  # `module` is only read for its training flag
+
+    attn_output = torch.matmul(attn_weights, value)
+    attn_output = attn_output.transpose(1, 2).contiguous()  # swap dims 1 and 2 before returning
+
+    return attn_output, attn_weights
+
+
+class GitVisionAttention(nn.Module):
+    """
+    Multi-head self-attention for the GIT vision encoder.
+
+    The `(batch, seq_len, hidden_size)` input is projected to queries, keys and values, split into
+    `config.num_attention_heads` heads, run through the attention function selected by
+    `config._attn_implementation`, and projected back with `out_proj`.
+    """
+
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.embed_dim = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim, remainder = divmod(self.embed_dim, self.num_heads)
+        if remainder != 0:
+            raise ValueError(
+                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
+                f" {self.num_heads})."
+            )
+        self.scale = self.head_dim**-0.5
+        self.dropout = config.attention_dropout
+        self.is_causal = False
+
+        # Square projections, registered in the order k, v, q, out.
+        for proj_name in ("k_proj", "v_proj", "q_proj", "out_proj"):
+            setattr(self, proj_name, nn.Linear(self.embed_dim, self.embed_dim))
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        causal_attention_mask: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = False,
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        """
+        Run self-attention over `hidden_states` of shape `(batch, seq_len, hidden_size)`.
+
+        Args:
+            hidden_states: Input to project into queries, keys and values.
+            attention_mask: Optional mask handed to the attention function.
+            causal_attention_mask: Optional second mask. Unless the implementation is `"flash_attention_2"`, it is
+                added to `attention_mask` (or used alone when `attention_mask` is `None`).
+            output_attentions: When falsy, the returned attention weights are replaced by `None`.
+
+        Returns:
+            `(attn_output, attn_weights)` where `attn_output` has the same shape as `hidden_states`.
+        """
+        bsz, tgt_len, hidden_size = hidden_states.shape
+        per_head_shape = (bsz, tgt_len, self.num_heads, self.head_dim)
+
+        # (batch, seq_len, hidden) -> (batch, num_heads, seq_len, head_dim)
+        queries = self.q_proj(hidden_states).view(per_head_shape).transpose(1, 2)
+        keys = self.k_proj(hidden_states).view(per_head_shape).transpose(1, 2)
+        values = self.v_proj(hidden_states).view(per_head_shape).transpose(1, 2)
+
+        attn_implementation = self.config._attn_implementation
+        if attn_implementation == "flash_attention_2":
+            # The masks are not merged on this path; causality is recorded on the module instead.
+            # NOTE(review): presumably the flash-attention kernel reads `module.is_causal` - confirm before
+            # turning this into a local variable.
+            self.is_causal = causal_attention_mask is not None
+        elif causal_attention_mask is not None:
+            # Both masks are combined by addition; a lone causal mask is used as is.
+            if attention_mask is None:
+                attention_mask = causal_attention_mask
+            else:
+                attention_mask = attention_mask + causal_attention_mask
+
+        # Eager attention is the default; it is also kept when SDPA is configured but weights are requested.
+        attention_interface = eager_attention_forward
+        if attn_implementation == "sdpa" and output_attentions:
+            logger.warning_once(
+                "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
+                'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
+            )
+        elif attn_implementation != "eager":
+            attention_interface = ALL_ATTENTION_FUNCTIONS[attn_implementation]
+
+        attn_output, attn_weights = attention_interface(
+            self,
+            queries,
+            keys,
+            values,
+            attention_mask,
+            is_causal=self.is_causal,
+            scaling=self.scale,
+            dropout=self.dropout if self.training else 0.0,
+        )
+
+        # Merge the heads back into the hidden dimension before the output projection.
+        merged = attn_output.reshape(bsz, tgt_len, hidden_size).contiguous()
+        projected = self.out_proj(merged)
+        return projected, (attn_weights if output_attentions else None)
+
+
+# Copied from transformers.models.altclip.modeling_altclip.AltCLIPEncoderLayer with AltCLIP->GitVision
+class GitVisionEncoderLayer(GradientCheckpointingLayer):
+    """
+    One pre-norm transformer block: self-attention followed by an MLP, each applied to a layer-normalised input and
+    added back onto that input through a residual connection.
+    """
+
+    def __init__(self, config: GitVisionConfig):
+        super().__init__()
+        hidden_size = config.hidden_size
+        norm_eps = config.layer_norm_eps
+
+        self.embed_dim = hidden_size
+        self.self_attn = GitVisionAttention(config)
+        self.layer_norm1 = nn.LayerNorm(hidden_size, eps=norm_eps)
+        self.mlp = GitVisionMLP(config)
+        self.layer_norm2 = nn.LayerNorm(hidden_size, eps=norm_eps)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: torch.Tensor,
+        causal_attention_mask: torch.Tensor,
+        output_attentions: Optional[bool] = False,
+    ) -> tuple[torch.FloatTensor]:
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+            attention_mask (`torch.FloatTensor`): attention mask of size
+                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+            causal_attention_mask (`torch.FloatTensor`): second mask, passed to the attention module together
+                with `attention_mask`.
+            output_attentions (`bool`, *optional*):
+                Whether or not to append the attention weights of this layer to the returned tuple.
+
+        Returns:
+            `(hidden_states,)`, or `(hidden_states, attn_weights)` when `output_attentions` is truthy.
+        """
+        # Attention sub-block: normalise, attend, add the residual.
+        attn_out, attn_weights = self.self_attn(
+            hidden_states=self.layer_norm1(hidden_states),
+            attention_mask=attention_mask,
+            causal_attention_mask=causal_attention_mask,
+            output_attentions=output_attentions,
+        )
+        hidden_states = hidden_states + attn_out
+
+        # Feed-forward sub-block: normalise, transform, add the residual.
+        hidden_states = hidden_states + self.mlp(self.layer_norm2(hidden_states))
+
+        if output_attentions:
+            return (hidden_states, attn_weights)
+        return (hidden_states,)
+
+
+# Copied from transformers.models.altclip.modeling_altclip.AltCLIPEncoder with AltCLIP->GitVision, CLIPConfig
+class GitVisionEncoder(nn.Module):
+    """
+    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
+    [`GitVisionEncoderLayer`].
+
+    Args:
+        config: GitVisionConfig
+    """
+
+    def __init__(self, config: GitVisionConfig):
+        super().__init__()
+        self.config = config
+        self.layers = nn.ModuleList([GitVisionEncoderLayer(config) for _ in range(config.num_hidden_layers)])
+        self.gradient_checkpointing = False
+
+    @can_return_tuple
+    def forward(
+        self,
+        inputs_embeds,
+        attention_mask: Optional[torch.Tensor] = None,
+        causal_attention_mask: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, BaseModelOutput]:
+        r"""
+        Args:
+            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+                Embedded input sequence that is fed to the first layer.
+            attention_mask (`torch.Tensor`, *optional*):
+                Mask forwarded unchanged to every layer. On the eager attention path it is added to the attention
+                scores, so masked positions are expected to hold large negative values (the layer documents the
+                shape `(batch, 1, tgt_len, src_len)`).
+            causal_attention_mask (`torch.Tensor`, *optional*):
+                Second mask forwarded unchanged to every layer and combined with `attention_mask` inside the
+                attention module.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail. Defaults to `config.output_attentions`.
+            output_hidden_states (`bool`, *optional*):
+                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+                for more detail. Defaults to `config.output_hidden_states`.
+            return_dict (`bool`, *optional*):
+                Accepted for interface compatibility. This method always builds a [`BaseModelOutput`]; any
+                conversion to a plain tuple is presumably done by the `can_return_tuple` decorator (to be
+                confirmed against its implementation).
+
+        Returns:
+            [`BaseModelOutput`] holding the last hidden state, the hidden states entering each layer plus the final
+            one (or `None`), and the per-layer attention weights (or `None`).
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+
+        encoder_states = () if output_hidden_states else None
+        all_attentions = () if output_attentions else None
+
+        hidden_states = inputs_embeds
+        for encoder_layer in self.layers:
+            if output_hidden_states:
+                # Record the input of each layer; the final output is appended after the loop.
+                encoder_states = encoder_states + (hidden_states,)
+            layer_outputs = encoder_layer(
+                hidden_states,
+                attention_mask,
+                causal_attention_mask,
+                output_attentions=output_attentions,
+            )
+
+            hidden_states = layer_outputs[0]
+
+            if output_attentions:
+                all_attentions = all_attentions + (layer_outputs[1],)
+
+        if output_hidden_states:
+            encoder_states = encoder_states + (hidden_states,)
+
+        return BaseModelOutput(
+            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
+        )
+
+
+class GitVisionTransformer(nn.Module):
+    """
+    Vision backbone: patch/position embeddings, a pre-encoder LayerNorm, the transformer encoder and a final
+    LayerNorm applied to the encoder's last hidden state.
+    """
+
+    # Copied from transformers.models.altclip.modeling_altclip.AltCLIPVisionTransformer.__init__ with AltCLIPEncoder->GitVisionEncoder, AltCLIP->Git
+    def __init__(self, config: GitVisionConfig):
+        super().__init__()
+        self.config = config
+        embed_dim = config.hidden_size
+
+        self.embeddings = GitVisionEmbeddings(config)
+        self.pre_layrnorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
+        self.encoder = GitVisionEncoder(config)
+        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
+
+    @auto_docstring
+    def forward(
+        self,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        interpolate_pos_encoding: Optional[bool] = False,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, BaseModelOutput]:
+        # Flags left as `None` fall back to the values stored on the config.
+        if output_attentions is None:
+            output_attentions = self.config.output_attentions
+        if output_hidden_states is None:
+            output_hidden_states = self.config.output_hidden_states
+        if return_dict is None:
+            return_dict = self.config.use_return_dict
+
+        if pixel_values is None:
+            raise ValueError("You have to specify pixel_values")
+
+        # Embed the pixels, then normalise before the encoder stack.
+        embedded = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding)
+        embedded = self.pre_layrnorm(embedded)
+
+        encoder_outputs = self.encoder(
+            inputs_embeds=embedded,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        # Only the final hidden state goes through the post-encoder LayerNorm.
+        normed_last_state = self.post_layernorm(encoder_outputs[0])
+
+        if return_dict:
+            return BaseModelOutput(
+                last_hidden_state=normed_last_state,
+                hidden_states=encoder_outputs.hidden_states,
+                attentions=encoder_outputs.attentions,
+            )
+
+        return (normed_last_state,) + encoder_outputs[1:]
+
+
+@auto_docstring(
+    custom_intro="""
+    The vision model from CLIP, used in GIT, without any head or projection on top.
+    """
+)
+class GitVisionModel(GitPreTrainedModel):
+    config: GitVisionConfig
+    main_input_name = "pixel_values"
+
+    # Copied from transformers.models.clip.modeling_clip.CLIPVisionModel.__init__ with CLIP->Git
+    def __init__(self, config: GitVisionConfig):
+        super().__init__(config)
+        self.vision_model = GitVisionTransformer(config)
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self) -> nn.Module:
+        """Return the patch-embedding module of the wrapped vision transformer."""
+        patch_embedding = self.vision_model.embeddings.patch_embedding
+        return patch_embedding
+
+    @auto_docstring
+    def forward(
+        self,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        interpolate_pos_encoding: bool = False,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, BaseModelOutput]:
+        r"""
+        Examples:
+
+        ```python
+        >>> from PIL import Image
+        >>> import requests
+        >>> from transformers import AutoProcessor, GitVisionModel
+
+        >>> processor = AutoProcessor.from_pretrained("microsoft/git-base")
+        >>> model = GitVisionModel.from_pretrained("microsoft/git-base")
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> inputs = processor(images=image, return_tensors="pt")
+
+        >>> outputs = model(**inputs)
+        >>> last_hidden_state = outputs.last_hidden_state
+        ```"""
+        # An unset `return_dict` falls back to the config; the other flags are resolved by the wrapped model.
+        if return_dict is None:
+            return_dict = self.config.use_return_dict
+
+        vision_outputs = self.vision_model(
+            pixel_values=pixel_values,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            interpolate_pos_encoding=interpolate_pos_encoding,
+            return_dict=return_dict,
+        )
+        return vision_outputs
+
+
+class GitProjection(nn.Module):
+    """Maps vision features from the vision hidden size to `config.hidden_size` with a Linear + LayerNorm pair."""
+
+    def __init__(self, config: GitConfig):
+        super().__init__()
+        self.config = config
+        vision_config = config.vision_config
+        linear = nn.Linear(vision_config.hidden_size, config.hidden_size)
+        layer_norm = nn.LayerNorm(config.hidden_size, eps=vision_config.layer_norm_eps)
+        self.visual_projection = nn.Sequential(linear, layer_norm)
+
+    def forward(self, embeddings: torch.Tensor) -> torch.Tensor:
+        """Apply the linear projection followed by layer normalisation to `embeddings`."""
+        projected = self.visual_projection(embeddings)
+        return projected
+
+
+@auto_docstring(
+    custom_intro="""
+    The bare GIT Model transformer consisting of a CLIP image encoder and text decoder outputting raw hidden-states
+    """
+)
+class GitModel(GitPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.config = config
+
+        self.embeddings = GitEmbeddings(config)
+        self.image_encoder = GitVisionModel(config.vision_config)
+        self.encoder = GitEncoder(config)
+
+        self.visual_projection = GitProjection(config)
+
+        # One zero-initialised embedding per frame, added to that frame's visual features in `forward`.
+        # The attribute only exists when `config.num_image_with_embedding` is set.
+        if config.num_image_with_embedding is not None:
+            self.img_temperal_embedding = nn.ParameterList(
+                nn.Parameter(torch.zeros(1, 1, config.vision_config.hidden_size))
+                for _ in range(config.num_image_with_embedding)
+            )
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        """Return the word-embedding module of the text embeddings."""
+        return self.embeddings.word_embeddings
+
+    def set_input_embeddings(self, value):
+        """Replace the word-embedding module of the text embeddings with `value`."""
+        self.embeddings.word_embeddings = value
+
+    def _prune_heads(self, heads_to_prune):
+        """
+        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
+        class PreTrainedModel
+        """
+        for layer, heads in heads_to_prune.items():
+            self.encoder.layer[layer].attention.prune_heads(heads)
+
+    def _generate_future_mask(self, size: int, dtype: torch.dtype, device: torch.device) -> torch.Tensor:
+        """Return a `(size, size)` additive mask: `-inf` strictly above the diagonal, `0` elsewhere."""
+        # Default mask is for forward direction. Flip for backward direction.
+        mask = torch.triu(torch.ones(size, size, device=device, dtype=dtype), diagonal=1)
+        mask = mask.masked_fill(mask == 1, float("-inf"))
+        return mask
+
+    def create_attention_mask(self, tgt, memory, tgt_mask, past_key_values_length, memory_key_padding_mask=None):
+        """
+        Build the additive attention mask over the concatenated `[memory, tgt]` sequence.
+
+        The mask is assembled from four blocks (rows are queries, columns are keys):
+
+        - memory -> memory: zeros (visible),
+        - memory -> tgt (including cached positions): `-inf` (hidden),
+        - tgt -> memory: zeros (visible),
+        - tgt -> tgt: `tgt_mask`, or all zeros when `past_key_values_length > 0`.
+
+        Args:
+            tgt: Tensor whose second axis is the number of target positions; also supplies dtype and device.
+            memory: Tensor whose first two axes are batch size and number of memory positions.
+            tgt_mask: Square additive mask for the target positions.
+            past_key_values_length: Number of cached positions, added to the key axis of the target blocks.
+            memory_key_padding_mask: Optional boolean tensor of shape `(batch, num_memory)`; `True` marks padding.
+                Defaults to all `False`.
+
+        Returns:
+            Tensor of shape `(batch, 1, num_memory + num_tgt, num_memory + past_key_values_length + num_tgt)`.
+
+        Raises:
+            ValueError: If `memory_key_padding_mask` is not a boolean tensor.
+        """
+        num_tgt = tgt.shape[1]
+        num_memory = memory.shape[1]
+        device = tgt.device
+        dtype = tgt.dtype
+        top_left = torch.zeros((num_memory, num_memory), device=device, dtype=dtype)
+        top_right = torch.full(
+            (num_memory, num_tgt + past_key_values_length),
+            float("-inf"),
+            device=tgt.device,
+            dtype=dtype,
+        )
+        bottom_left = torch.zeros(
+            (num_tgt, num_memory),
+            dtype=dtype,
+            device=tgt_mask.device,
+        )
+
+        # With cached positions the target block is replaced by zeros, i.e. every key column is visible.
+        if past_key_values_length > 0:
+            tgt_mask = torch.zeros(
+                (tgt_mask.shape[0], tgt_mask.shape[0] + past_key_values_length),
+                dtype=dtype,
+                device=tgt_mask.device,
+            )
+
+        left = torch.cat((top_left, bottom_left), dim=0)
+        right = torch.cat((top_right, tgt_mask.to(dtype)), dim=0)
+
+        # Add a leading batch axis of size 1.
+        full_attention_mask = torch.cat((left, right), dim=1)[None, :]
+
+        if memory_key_padding_mask is None:
+            memory_key_padding_mask = torch.full((memory.shape[0], memory.shape[1]), fill_value=False, device=device)
+        # if it is False, it means valid. That is, it is not a padding
+        if memory_key_padding_mask.dtype != torch.bool:
+            raise ValueError("Memory key padding mask must be a boolean tensor.")
+        zero_negative_infinity = torch.zeros_like(memory_key_padding_mask, dtype=tgt.dtype)
+        zero_negative_infinity[memory_key_padding_mask] = float("-inf")
+        full_attention_mask = full_attention_mask.expand(
+            (memory_key_padding_mask.shape[0], num_memory + num_tgt, num_memory + past_key_values_length + num_tgt)
+        )
+        # `expand` returns a view; clone before the in-place write below.
+        full_attention_mask = full_attention_mask.clone()
+        origin_left = full_attention_mask[:, :, :num_memory]
+        update = zero_negative_infinity[:, None, :]
+        # Hide padded memory columns for every query row.
+        full_attention_mask[:, :, :num_memory] = origin_left + update
+
+        # add axis for multi-head
+        full_attention_mask = full_attention_mask[:, None, :, :]
+
+        return full_attention_mask
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        pixel_values: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        past_key_values: Optional[Union[Cache, list[torch.FloatTensor]]] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        interpolate_pos_encoding: bool = False,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPast]:
+        r"""
+        Examples:
+
+        ```python
+        >>> from transformers import AutoProcessor, AutoModel
+        >>> import requests
+        >>> from PIL import Image
+
+        >>> processor = AutoProcessor.from_pretrained("microsoft/git-base")
+        >>> model = AutoModel.from_pretrained("microsoft/git-base")
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> text = "this is an image of two cats"
+
+        >>> inputs = processor(images=image, text=text, return_tensors="pt")
+
+        >>> outputs = model(**inputs)
+        >>> last_hidden_state = outputs.last_hidden_state
+        ```"""
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+        elif input_ids is not None:
+            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
+            input_shape = input_ids.size()
+        elif inputs_embeds is not None:
+            input_shape = inputs_embeds.size()[:-1]
+        else:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+        seq_length = input_shape[1]
+
+        # Number of positions already held in the cache (0 when no cache is passed).
+        past_key_values_length = past_key_values.get_seq_length() if past_key_values is not None else 0
+
+        # Prepare head mask if needed
+        # 1.0 in head_mask indicate we keep the head
+        # attention_probs has shape bsz x n_heads x N x N
+        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
+        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
+        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
+
+        projected_visual_features = None
+        if pixel_values is not None:
+            if pixel_values.ndim == 4:
+                # here we assume pixel_values is of shape (batch_size, num_channels, height, width)
+                visual_features = self.image_encoder(
+                    pixel_values, interpolate_pos_encoding=interpolate_pos_encoding
+                ).last_hidden_state
+
+            elif pixel_values.ndim == 5:
+                # here we assume pixel_values is of shape (batch_size, num_frames, num_channels, height, width)
+                visual_features = []
+                for frame_idx in range(pixel_values.shape[1]):
+                    visual_features_frame = self.image_encoder(
+                        pixel_values[:, frame_idx, :, :], interpolate_pos_encoding=interpolate_pos_encoding
+                    ).last_hidden_state
+                    # Tag each frame with its own learned embedding (requires `config.num_image_with_embedding`
+                    # to cover every frame index).
+                    visual_features_frame += self.img_temperal_embedding[frame_idx]
+                    visual_features.append(visual_features_frame)
+
+                # finally, concatenate all features along sequence dimension
+                visual_features = torch.cat(visual_features, dim=1)
+
+            else:
+                raise ValueError("pixel_values must be of rank 4 or 5")
+
+            projected_visual_features = self.visual_projection(visual_features)
+
+        embedding_output = self.embeddings(
+            input_ids=input_ids,
+            position_ids=position_ids,
+            inputs_embeds=inputs_embeds,
+            past_key_values_length=past_key_values_length,
+        )
+
+        # Without pixel values, use an empty (zero-length) visual prefix so the code below stays uniform.
+        if projected_visual_features is None:
+            projected_visual_features = torch.zeros(
+                (embedding_output.shape[0], 0, embedding_output.shape[2]),
+                dtype=embedding_output.dtype,
+                device=embedding_output.device,
+            )
+
+        # Repeat visual features to match embedding batch size.
+        projected_visual_features = projected_visual_features.repeat(
+            embedding_output.size(0) // projected_visual_features.size(0), 1, 1
+        )
+
+        # concatenate patch token and text token embeddings
+        hidden_states = torch.cat((projected_visual_features, embedding_output), dim=1)
+
+        # By default, an additive causal mask is created
+        # for masking the future (one direction).
+        tgt_mask = self._generate_future_mask(seq_length, embedding_output.dtype, embedding_output.device)
+
+        # Create an attention mask of shape (batch_size, 1, tgt_seq_len, src_seq_len)
+        combined_attention_mask = self.create_attention_mask(
+            tgt=embedding_output,
+            memory=projected_visual_features,
+            tgt_mask=tgt_mask,
+            past_key_values_length=past_key_values_length,
+        )
+
+        if attention_mask is not None:
+            # if the user provides an attention mask, we add it to the default one
+            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+            expanded_attn_mask = _prepare_4d_attention_mask(
+                attention_mask, embedding_output.dtype, tgt_len=input_shape[-1]
+            ).to(embedding_output.device)
+            # NOTE(review): the user-provided mask is merged only when nothing is cached. With a non-empty cache
+            # it is currently not applied to `combined_attention_mask`; confirm whether that is intended.
+            if past_key_values_length <= 0:
+                combined_attention_mask[:, :, -input_shape[1] :, -input_shape[1] :] += expanded_attn_mask
+
+        encoder_outputs = self.encoder(
+            hidden_states,
+            attention_mask=combined_attention_mask,
+            head_mask=head_mask,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            pixel_values_present=pixel_values is not None,
+        )
+        sequence_output = encoder_outputs[0]
+
+        if not return_dict:
+            return (sequence_output,) + encoder_outputs[1:]
+
+        return BaseModelOutputWithPast(
+            last_hidden_state=sequence_output,
+            past_key_values=encoder_outputs.past_key_values,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    GIT Model with a `language modeling` head on top for autoregressive language modeling.
+    """
+)
+class GitForCausalLM(GitPreTrainedModel, GenerationMixin):
+    _tied_weights_keys = ["output.weight"]
+
+    def __init__(self, config):
+        super().__init__(config)
+
+        self.git = GitModel(config)
+        self.output = nn.Linear(config.hidden_size, config.vocab_size)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_output_embeddings(self):
+        return self.output
+
+    def set_output_embeddings(self, new_embeddings):
+        self.output = new_embeddings
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        pixel_values: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        past_key_values: Optional[Union[Cache, list[torch.Tensor]]] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        interpolate_pos_encoding: bool = False,
+        return_dict: Optional[bool] = None,
+        **kwargs,
+    ) -> Union[tuple[torch.Tensor], CausalLMOutputWithPast]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
+            `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
+            ignored (masked), the loss is only computed for the tokens with labels n `[0, ..., config.vocab_size]`
+
+        Examples:
+
+        Image captioning example:
+
+        ```python
+        >>> from transformers import AutoProcessor, AutoModelForCausalLM
+        >>> import requests
+        >>> from PIL import Image
+
+        >>> processor = AutoProcessor.from_pretrained("microsoft/git-base-coco")
+        >>> model = AutoModelForCausalLM.from_pretrained("microsoft/git-base-coco")
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> pixel_values = processor(images=image, return_tensors="pt").pixel_values
+
+        >>> generated_ids = model.generate(pixel_values=pixel_values, max_length=50)
+        >>> generated_caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+        >>> print(generated_caption)
+        two cats sleeping on a pink blanket next to remotes.
+        ```
+
+        Visual question answering (VQA) example:
+
+        ```python
+        >>> from transformers import AutoProcessor, AutoModelForCausalLM
+        >>> from huggingface_hub import hf_hub_download
+        >>> from PIL import Image
+
+        >>> processor = AutoProcessor.from_pretrained("microsoft/git-base-textvqa")
+        >>> model = AutoModelForCausalLM.from_pretrained("microsoft/git-base-textvqa")
+
+        >>> file_path = hf_hub_download(repo_id="nielsr/textvqa-sample", filename="bus.png", repo_type="dataset")
+        >>> image = Image.open(file_path).convert("RGB")
+
+        >>> pixel_values = processor(images=image, return_tensors="pt").pixel_values
+
+        >>> question = "what does the front of the bus say at the top?"
+
+        >>> input_ids = processor(text=question, add_special_tokens=False).input_ids
+        >>> input_ids = [processor.tokenizer.cls_token_id] + input_ids
+        >>> input_ids = torch.tensor(input_ids).unsqueeze(0)
+
+        >>> generated_ids = model.generate(pixel_values=pixel_values, input_ids=input_ids, max_length=50)
+        >>> print(processor.batch_decode(generated_ids, skip_special_tokens=True))
+        ['what does the front of the bus say at the top? special']
+        ```
+
+        Video captioning example:
+
+        ```python
+        >>> import av
+        >>> import numpy as np
+        >>> from PIL import Image
+        >>> from huggingface_hub import hf_hub_download
+        >>> from transformers import AutoProcessor, AutoModelForCausalLM
+
+        >>> processor = AutoProcessor.from_pretrained("microsoft/git-base-vatex")
+        >>> model = AutoModelForCausalLM.from_pretrained("microsoft/git-base-vatex")
+
+        >>> # set seed for reproducibility
+        >>> np.random.seed(45)
+
+
+        >>> def read_video_pyav(container, indices):
+        ...     '''
+        ...     Decode the video with PyAV decoder.
+        ...     Args:
+        ...         container (`av.container.input.InputContainer`): PyAV container.
+        ...         indices (`list[int]`): List of frame indices to decode.
+        ...     Returns:
+        ...         result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
+        ...     '''
+        ...     frames = []
+        ...     container.seek(0)
+        ...     start_index = indices[0]
+        ...     end_index = indices[-1]
+        ...     for i, frame in enumerate(container.decode(video=0)):
+        ...         if i > end_index:
+        ...             break
+        ...         if i >= start_index and i in indices:
+        ...             frames.append(frame)
+        ...     return np.stack([x.to_ndarray(format="rgb24") for x in frames])
+
+
+        >>> def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
+        ...     '''
+        ...     Sample a given number of frame indices from the video.
+        ...     Args:
+        ...         clip_len (`int`): Total number of frames to sample.
+        ...         frame_sample_rate (`int`): Sample every n-th frame.
+        ...         seg_len (`int`): Maximum allowed index of sample's last frame.
+        ...     Returns:
+        ...         indices (`list[int]`): List of sampled frame indices
+        ...     '''
+        ...     converted_len = int(clip_len * frame_sample_rate)
+        ...     end_idx = np.random.randint(converted_len, seg_len)
+        ...     start_idx = end_idx - converted_len
+        ...     indices = np.linspace(start_idx, end_idx, num=clip_len)
+        ...     indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)
+        ...     return indices
+
+
+        >>> # load video
+        >>> file_path = hf_hub_download(
+        ...     repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
+        ... )
+        >>> container = av.open(file_path)
+
+        >>> # sample frames
+        >>> num_frames = model.config.num_image_with_embedding
+        >>> indices = sample_frame_indices(
+        ...     clip_len=num_frames, frame_sample_rate=4, seg_len=container.streams.video[0].frames
+        ... )
+        >>> frames = read_video_pyav(container, indices)
+
+        >>> pixel_values = processor(images=list(frames), return_tensors="pt").pixel_values
+
+        >>> generated_ids = model.generate(pixel_values=pixel_values, max_length=50)
+
+        >>> print("Generated caption:", processor.batch_decode(generated_ids, skip_special_tokens=True))
+        Generated caption: ['a woman is sitting at a table and she is talking about the food she is holding.']
+        ```
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        if labels is not None:
+            use_cache = False
+
+        outputs = self.git(
+            input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            pixel_values=pixel_values,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            interpolate_pos_encoding=interpolate_pos_encoding,
+            return_dict=return_dict,
+        )
+
+        sequence_output = outputs[0]
+        logits = self.output(sequence_output)
+
+        loss = None
+        if labels is not None:
+            # we are doing next-token prediction; shift prediction scores and input ids by one
+            num_image_tokens = self.git.encoder.layer[0].attention.self.image_patch_tokens
+            shifted_logits = logits[:, num_image_tokens:-1, :].contiguous()
+            labels = labels[:, 1:].contiguous()
+            loss = self.loss_function(
+                shifted_logits.view(-1, self.config.vocab_size),
+                labels.view(-1),
+                vocab_size=self.config.vocab_size,
+                **kwargs,
+            )
+
+        if not return_dict:
+            output = (logits,) + outputs[1:]
+            return ((loss,) + output) if loss is not None else output
+
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+    def prepare_inputs_for_generation(
+        self, input_ids, past_key_values=None, attention_mask=None, use_cache=None, **kwargs
+    ):
+        # Overwritten -- `git` has special cache handling and doesn't support generating from `inputs_embeds` atm
+
+        # cut decoder_input_ids if past_key_values is used
+        if past_key_values is not None:
+            past_length = past_key_values.get_seq_length()
+
+            # Some generation methods already pass only the last input ID
+            if input_ids.shape[1] > past_length:
+                remove_prefix_length = past_length
+            else:
+                # Default to old behavior: keep only final ID
+                remove_prefix_length = input_ids.shape[1] - 1
+
+            input_ids = input_ids[:, remove_prefix_length:]
+
+        # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly
+        input_shape = input_ids.shape
+        if attention_mask is None:
+            attention_mask = input_ids.new_ones(input_shape)
+
+        return {
+            "input_ids": input_ids,
+            "attention_mask": attention_mask,
+            "pixel_values": kwargs.get("pixel_values"),
+            "past_key_values": past_key_values,
+            "use_cache": use_cache,
+        }
+
+
+__all__ = ["GitForCausalLM", "GitModel", "GitPreTrainedModel", "GitVisionModel"]
diff --git a/phivenv/Lib/site-packages/transformers/models/git/processing_git.py b/phivenv/Lib/site-packages/transformers/models/git/processing_git.py
new file mode 100644
index 0000000000000000000000000000000000000000..dff60bce1b8460496130dd1c83cc771ba1e231b7
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/git/processing_git.py
@@ -0,0 +1,118 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Image/Text processor class for GIT
+"""
+
+from typing import Optional, Union
+
+from ...feature_extraction_utils import BatchFeature
+from ...image_utils import ImageInput
+from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
+from ...tokenization_utils_base import PreTokenizedInput, TextInput
+from ...utils import logging
+
+
+class GitProcessorKwargs(ProcessingKwargs, total=False):
+    _defaults = {}
+
+
+logger = logging.get_logger(__name__)
+
+
+class GitProcessor(ProcessorMixin):
+    r"""
+    Constructs a GIT processor which wraps a CLIP image processor and a BERT tokenizer into a single processor.
+
+    [`GitProcessor`] offers all the functionalities of [`CLIPImageProcessor`] and [`BertTokenizerFast`]. See the
+    [`~GitProcessor.__call__`] and [`~GitProcessor.decode`] for more information.
+
+    Args:
+        image_processor ([`AutoImageProcessor`]):
+            The image processor is a required input.
+        tokenizer ([`AutoTokenizer`]):
+            The tokenizer is a required input.
+    """
+
+    attributes = ["image_processor", "tokenizer"]
+    image_processor_class = "AutoImageProcessor"
+    tokenizer_class = "AutoTokenizer"
+
+    def __init__(self, image_processor, tokenizer):
+        super().__init__(image_processor, tokenizer)
+        self.current_processor = self.image_processor
+
+    def __call__(
+        self,
+        images: Optional[ImageInput] = None,
+        text: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] = None,
+        audio=None,
+        videos=None,
+        **kwargs: Unpack[GitProcessorKwargs],
+    ) -> BatchFeature:
+        """
+        Main method to prepare for the model one or several sequence(s) and image(s). This method forwards the `text`
+        and `kwargs` arguments to BertTokenizerFast's [`~BertTokenizerFast.__call__`] if `text` is not `None` to encode
+        the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
+        CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
+        of the above two methods for more information.
+
+        Args:
+            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`):
+                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
+                tensor. Both channels-first and channels-last formats are supported.
+            text (`TextInput`, `PreTokenizedInput`, `list[TextInput]`, `list[PreTokenizedInput]`, *optional*):
+                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
+                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
+                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
+
+            return_tensors (`str` or [`~utils.TensorType`], *optional*):
+                If set, will return tensors of a particular framework. Acceptable values are:
+
+                - `'tf'`: Return TensorFlow `tf.constant` objects.
+                - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                - `'np'`: Return NumPy `np.ndarray` objects.
+                - `'jax'`: Return JAX `jnp.ndarray` objects.
+
+        Returns:
+            [`BatchFeature`]: A [`BatchFeature`] with the following fields:
+
+            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
+            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
+              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
+              `None`).
+            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
+        """
+        if text is None and images is None:
+            raise ValueError("You have to specify either text or images. Both cannot be none.")
+
+        output_kwargs = self._merge_kwargs(
+            GitProcessorKwargs,
+            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
+            **kwargs,
+        )
+
+        data = {}
+        if text is not None:
+            text_features = self.tokenizer(text, **output_kwargs["text_kwargs"])
+            data.update(text_features)
+        if images is not None:
+            image_features = self.image_processor(images, **output_kwargs["images_kwargs"])
+            data.update(image_features)
+
+        return BatchFeature(data=data, tensor_type=output_kwargs["common_kwargs"].get("return_tensors"))
+
+
+__all__ = ["GitProcessor"]
diff --git a/phivenv/Lib/site-packages/transformers/models/glm/__init__.py b/phivenv/Lib/site-packages/transformers/models/glm/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..0636c800beea6b02d16d13098bd7b13f11baf468
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/glm/__init__.py
@@ -0,0 +1,27 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_glm import *
+    from .modeling_glm import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/phivenv/Lib/site-packages/transformers/models/glm/__pycache__/__init__.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/glm/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..96668316a30990fc279f8799c04034d0d3cbd2ec
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/glm/__pycache__/__init__.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/glm/__pycache__/configuration_glm.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/glm/__pycache__/configuration_glm.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..27fb5a3a809416a6bb46c73520215e285dc0e3ff
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/glm/__pycache__/configuration_glm.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/glm/__pycache__/modeling_glm.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/glm/__pycache__/modeling_glm.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c3218a22503b5c02e117df8d271d6998e0de928e
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/glm/__pycache__/modeling_glm.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/glm/__pycache__/modular_glm.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/glm/__pycache__/modular_glm.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..edc20b9aca630c5f40fc0ceb8f0f2b82e6378d47
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/glm/__pycache__/modular_glm.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/glm/configuration_glm.py b/phivenv/Lib/site-packages/transformers/models/glm/configuration_glm.py
new file mode 100644
index 0000000000000000000000000000000000000000..4d61dc6fa1505184735710ac899fb7836aeac1a9
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/glm/configuration_glm.py
@@ -0,0 +1,152 @@
+# coding=utf-8
+# Copyright 2024 The GLM & ZhipuAI team and HuggingFace Inc. team. All rights reserved.
+#
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ...configuration_utils import PretrainedConfig
+
+
+class GlmConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`GlmModel`]. It is used to instantiate a Glm
+    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+    defaults will yield a similar configuration to that of the Glm-4-9b-chat.
+    e.g. [THUDM/glm-4-9b-chat](https://huggingface.co/THUDM/glm-4-9b-chat)
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+    Args:
+        vocab_size (`int`, *optional*, defaults to 151552):
+            Vocabulary size of the Glm model. Defines the number of different tokens that can be represented by the
+            `input_ids` passed when calling [`GlmModel`].
+        hidden_size (`int`, *optional*, defaults to 4096):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 13696):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 40):
+            Number of hidden layers in the Transformer decoder.
+        num_attention_heads (`int`, *optional*, defaults to 32):
+            Number of attention heads for each attention layer in the Transformer decoder.
+        num_key_value_heads (`int`, *optional*, defaults to 2):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details, check out [this
+            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
+            2 (the value is stored as given; there is no fallback to `num_attention_heads`).
+        partial_rotary_factor (`float`, *optional*, defaults to 0.5): The factor of the partial rotary position.
+        head_dim (`int`, *optional*, defaults to 128):
+            The attention head dimension.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) used in the MLP.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        max_position_embeddings (`int`, *optional*, defaults to 131072):
+            The maximum sequence length that this model might ever be used with.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1.5625e-07):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether to tie weight embeddings
+        rope_theta (`float`, *optional*, defaults to 10000.0):
+            The base period of the RoPE embeddings.
+        pad_token_id (`int`, *optional*, defaults to 151329):
+            Padding token id.
+        eos_token_id (`int` | `list`, *optional*, defaults to `[151329, 151336, 151338]`):
+            End of stream token id.
+        bos_token_id (`int`, *optional*):
+            Beginning of stream token id.
+        attention_bias (`bool`, *optional*, defaults to `True`):
+            Whether to use a bias in the query, key and value projection layers during self-attention.
+    ```python
+    >>> from transformers import GlmModel, GlmConfig
+    >>> # Initializing a Glm glm-4-9b-chat style configuration
+    >>> configuration = GlmConfig()
+    >>> # Initializing a model from the glm-4-9b-chat style configuration
+    >>> model = GlmModel(configuration)
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "glm"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    base_model_tp_plan = {
+        "layers.*.self_attn.q_proj": "colwise",
+        "layers.*.self_attn.k_proj": "colwise",
+        "layers.*.self_attn.v_proj": "colwise",
+        "layers.*.self_attn.o_proj": "rowwise",
+        "layers.*.mlp.gate_up_proj": "colwise_rep",  # we need to replicate here due to the `chunk` operation
+        "layers.*.mlp.down_proj": "rowwise_rep",  # we need to replicate here due to the `chunk` operation
+    }
+    base_model_pp_plan = {
+        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
+        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
+        "norm": (["hidden_states"], ["hidden_states"]),
+    }
+
+    def __init__(
+        self,
+        vocab_size=151552,
+        hidden_size=4096,
+        intermediate_size=13696,
+        num_hidden_layers=40,
+        num_attention_heads=32,
+        num_key_value_heads=2,
+        partial_rotary_factor=0.5,
+        head_dim=128,
+        hidden_act="silu",
+        attention_dropout=0.0,
+        max_position_embeddings=131072,
+        initializer_range=0.02,
+        rms_norm_eps=0.00000015625,
+        use_cache=True,
+        tie_word_embeddings=False,
+        rope_theta=10000.0,
+        pad_token_id=151329,
+        eos_token_id=[151329, 151336, 151338],
+        bos_token_id=None,
+        attention_bias=True,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.partial_rotary_factor = partial_rotary_factor
+        self.head_dim = head_dim
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+
+__all__ = ["GlmConfig"]
diff --git a/phivenv/Lib/site-packages/transformers/models/glm/modeling_glm.py b/phivenv/Lib/site-packages/transformers/models/glm/modeling_glm.py
new file mode 100644
index 0000000000000000000000000000000000000000..59c9f39da5270f9ca7cac1cd9c42e605105a743a
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/glm/modeling_glm.py
@@ -0,0 +1,512 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/glm/modular_glm.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_glm.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# coding=utf-8
+# Copyright 2024 The GLM & ZhipuAI team and HuggingFace Inc. team. All rights reserved.
+#
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Callable, Optional, Union
+
+import torch
+import torch.nn as nn
+
+from ...activations import ACT2FN
+from ...cache_utils import Cache, DynamicCache
+from ...generation import GenerationMixin
+from ...integrations import use_kernel_forward_from_hub
+from ...masking_utils import create_causal_mask
+from ...modeling_layers import (
+    GenericForSequenceClassification,
+    GenericForTokenClassification,
+    GradientCheckpointingLayer,
+)
+from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
+from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from ...processing_utils import Unpack
+from ...utils import TransformersKwargs, auto_docstring, can_return_tuple
+from ...utils.deprecation import deprecate_kwarg
+from ...utils.generic import check_model_inputs
+from .configuration_glm import GlmConfig
+
+
+class GlmMLP(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+
+        self.config = config
+        self.gate_up_proj = nn.Linear(config.hidden_size, 2 * config.intermediate_size, bias=False)
+        self.down_proj = nn.Linear(config.intermediate_size, config.hidden_size, bias=False)
+        self.activation_fn = ACT2FN[config.hidden_act]
+
+    def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor:
+        up_states = self.gate_up_proj(hidden_states)
+
+        gate, up_states = up_states.chunk(2, dim=-1)
+        up_states = up_states * self.activation_fn(gate)
+
+        return self.down_proj(up_states)
+
+
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """
+    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
+    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
+    """
+    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+    if n_rep == 1:
+        return hidden_states
+    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
+    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
+
+
+def eager_attention_forward(
+    module: nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: Optional[torch.Tensor],
+    scaling: float,
+    dropout: float = 0.0,
+    **kwargs: Unpack[TransformersKwargs],
+):
+    key_states = repeat_kv(key, module.num_key_value_groups)
+    value_states = repeat_kv(value, module.num_key_value_groups)
+
+    attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
+    if attention_mask is not None:
+        causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
+        attn_weights = attn_weights + causal_mask
+
+    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
+    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
+    attn_output = torch.matmul(attn_weights, value_states)
+    attn_output = attn_output.transpose(1, 2).contiguous()
+
+    return attn_output, attn_weights
+
+
+def rotate_half(x):
+    """Rotates half the hidden dims of the input."""
+    x1 = x[..., 0::2]
+    x2 = x[..., 1::2]
+    return torch.stack((-x2, x1), dim=-1).flatten(-2)
+
+
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
+    """Applies Rotary Position Embedding to the query and key tensors.
+
+    Args:
+        q (`torch.Tensor`): The query tensor.
+        k (`torch.Tensor`): The key tensor.
+        cos (`torch.Tensor`): The cosine part of the rotary embedding.
+        sin (`torch.Tensor`): The sine part of the rotary embedding.
+        position_ids (`torch.Tensor`, *optional*):
+            Deprecated and unused.
+        unsqueeze_dim (`int`, *optional*, defaults to 1):
+            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
+            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
+            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
+            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
+            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
+            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
+    Returns:
+        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
+    """
+    cos = cos.unsqueeze(unsqueeze_dim)
+    sin = sin.unsqueeze(unsqueeze_dim)
+
+    # Interleave them instead of usual shape
+    cos = cos[..., : cos.shape[-1] // 2].repeat_interleave(2, dim=-1)
+    sin = sin[..., : sin.shape[-1] // 2].repeat_interleave(2, dim=-1)
+
+    # Keep half or full tensor for later concatenation
+    rotary_dim = cos.shape[-1]
+    q_rot, q_pass = q[..., :rotary_dim], q[..., rotary_dim:]
+    k_rot, k_pass = k[..., :rotary_dim], k[..., rotary_dim:]
+
+    # Apply rotary embeddings on the first half or full tensor
+    q_embed = (q_rot * cos) + (rotate_half(q_rot) * sin)
+    k_embed = (k_rot * cos) + (rotate_half(k_rot) * sin)
+
+    # Concatenate back to full shape
+    q_embed = torch.cat([q_embed, q_pass], dim=-1)
+    k_embed = torch.cat([k_embed, k_pass], dim=-1)
+    return q_embed, k_embed
+
+
+class GlmAttention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+    def __init__(self, config: GlmConfig, layer_idx: Optional[int] = None):
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+        self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
+        self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
+        self.scaling = self.head_dim**-0.5
+        self.attention_dropout = config.attention_dropout
+        self.is_causal = True
+
+        self.q_proj = nn.Linear(
+            config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.k_proj = nn.Linear(
+            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.v_proj = nn.Linear(
+            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.o_proj = nn.Linear(config.num_attention_heads * self.head_dim, config.hidden_size, bias=False)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
+        attention_mask: Optional[torch.Tensor],
+        past_key_values: Optional[Cache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        input_shape = hidden_states.shape[:-1]
+        hidden_shape = (*input_shape, -1, self.head_dim)
+
+        query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+        key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+        value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+
+        cos, sin = position_embeddings
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+
+        if past_key_values is not None:
+            # sin and cos are specific to RoPE models; cache_position needed for the static cache
+            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        attention_interface: Callable = eager_attention_forward
+        if self.config._attn_implementation != "eager":
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            dropout=0.0 if not self.training else self.attention_dropout,
+            scaling=self.scaling,
+            **kwargs,
+        )
+
+        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
+        attn_output = self.o_proj(attn_output)
+        return attn_output, attn_weights
+
+
+@use_kernel_forward_from_hub("RMSNorm")
+class GlmRMSNorm(nn.Module):
+    def __init__(self, hidden_size, eps=1e-6):
+        """
+        GlmRMSNorm is equivalent to T5LayerNorm
+        """
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+
+    def forward(self, hidden_states):
+        input_dtype = hidden_states.dtype
+        hidden_states = hidden_states.to(torch.float32)
+        variance = hidden_states.pow(2).mean(-1, keepdim=True)
+        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+        return self.weight * hidden_states.to(input_dtype)
+
+    def extra_repr(self):
+        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
+
+
+class GlmRotaryEmbedding(nn.Module):
+    inv_freq: torch.Tensor  # fix linting for `register_buffer`
+
+    def __init__(self, config: GlmConfig, device=None):
+        super().__init__()
+        # BC: "rope_type" was originally "type"
+        if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict):
+            self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
+        else:
+            self.rope_type = "default"
+        self.max_seq_len_cached = config.max_position_embeddings
+        self.original_max_seq_len = config.max_position_embeddings
+
+        self.config = config
+        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
+
+        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+        self.original_inv_freq = self.inv_freq
+
+    @torch.no_grad()
+    @dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
+    def forward(self, x, position_ids):
+        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
+        position_ids_expanded = position_ids[:, None, :].float()
+
+        device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
+        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
+            emb = torch.cat((freqs, freqs), dim=-1)
+            cos = emb.cos() * self.attention_scaling
+            sin = emb.sin() * self.attention_scaling
+
+        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
+
+
+class GlmDecoderLayer(GradientCheckpointingLayer):
+    def __init__(self, config: GlmConfig, layer_idx: int):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+
+        self.self_attn = GlmAttention(config=config, layer_idx=layer_idx)
+
+        self.mlp = GlmMLP(config)
+        self.input_layernorm = GlmRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = GlmRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        use_cache: Optional[bool] = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> torch.Tensor:
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+        # Self Attention
+        hidden_states, _ = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            position_embeddings=position_embeddings,
+            **kwargs,
+        )
+        hidden_states = residual + hidden_states
+
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states
+        return hidden_states
+
+
+@auto_docstring
+class GlmPreTrainedModel(PreTrainedModel):
+    config: GlmConfig
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["GlmDecoderLayer"]
+    _skip_keys_device_placement = ["past_key_values"]
+    _supports_flash_attn = True
+    _supports_sdpa = True
+    _supports_flex_attn = True
+
+    _can_compile_fullgraph = True
+    _supports_attention_backend = True
+    _can_record_outputs = {
+        "hidden_states": GlmDecoderLayer,
+        "attentions": GlmAttention,
+    }
+
+
+@auto_docstring
+class GlmModel(GlmPreTrainedModel):
+    def __init__(self, config: GlmConfig):
+        super().__init__(config)
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+        self.layers = nn.ModuleList(
+            [GlmDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+        )
+        self.norm = GlmRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.rotary_emb = GlmRotaryEmbedding(config=config)
+        self.gradient_checkpointing = False
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @check_model_inputs
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> BaseModelOutputWithPast:
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+        if inputs_embeds is None:
+            inputs_embeds: torch.Tensor = self.embed_tokens(input_ids)
+
+        if use_cache and past_key_values is None:
+            past_key_values = DynamicCache(config=self.config)
+
+        if cache_position is None:
+            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+            cache_position: torch.Tensor = torch.arange(
+                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
+            )
+
+        if position_ids is None:
+            position_ids = cache_position.unsqueeze(0)
+
+        causal_mask = create_causal_mask(
+            config=self.config,
+            input_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            cache_position=cache_position,
+            past_key_values=past_key_values,
+            position_ids=position_ids,
+        )
+
+        hidden_states = inputs_embeds
+        position_embeddings = self.rotary_emb(hidden_states, position_ids)
+
+        for decoder_layer in self.layers[: self.config.num_hidden_layers]:
+            hidden_states = decoder_layer(
+                hidden_states,
+                attention_mask=causal_mask,
+                position_ids=position_ids,
+                past_key_values=past_key_values,
+                cache_position=cache_position,
+                position_embeddings=position_embeddings,
+                **kwargs,
+            )
+
+        hidden_states = self.norm(hidden_states)
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values,
+        )
+
+
+@auto_docstring
+class GlmForCausalLM(GlmPreTrainedModel, GenerationMixin):
+    _tied_weights_keys = ["lm_head.weight"]
+    _tp_plan = {"lm_head": "colwise_rep"}
+    _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.model = GlmModel(config)
+        self.vocab_size = config.vocab_size
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> CausalLMOutputWithPast:
+        r"""
+        Example:
+
+        ```python
+        >>> from transformers import AutoTokenizer, GlmForCausalLM
+
+        >>> model = GlmForCausalLM.from_pretrained("THUDM/glm-4-9b")
+        >>> tokenizer = AutoTokenizer.from_pretrained("THUDM/glm-4-9b")
+
+        >>> prompt = "Hey, are you conscious? Can you talk to me?"
+        >>> inputs = tokenizer(prompt, return_tensors="pt")
+
+        >>> # Generate
+        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
+        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
+        ```"""
+        outputs: BaseModelOutputWithPast = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            **kwargs,
+        )
+
+        hidden_states = outputs.last_hidden_state
+        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
+        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+        logits = self.lm_head(hidden_states[:, slice_indices, :])
+
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs)
+
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+class GlmForSequenceClassification(GenericForSequenceClassification, GlmPreTrainedModel):
+    pass
+
+
+class GlmForTokenClassification(GenericForTokenClassification, GlmPreTrainedModel):
+    pass
+
+
+__all__ = [
+    "GlmPreTrainedModel",
+    "GlmModel",
+    "GlmForCausalLM",
+    "GlmForSequenceClassification",
+    "GlmForTokenClassification",
+]
diff --git a/phivenv/Lib/site-packages/transformers/models/glm/modular_glm.py b/phivenv/Lib/site-packages/transformers/models/glm/modular_glm.py
new file mode 100644
index 0000000000000000000000000000000000000000..ec07be10fb6a554ba2460e73054f891eaf59725d
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/glm/modular_glm.py
@@ -0,0 +1,115 @@
+# coding=utf-8
+# Copyright 2024 The GLM & ZhipuAI team and HuggingFace Inc. team. All rights reserved.
+#
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Optional
+
+import torch
+import torch.nn as nn
+import torch.utils.checkpoint
+
+from ...utils import logging
+from ..llama.modeling_llama import (
+    LlamaAttention,
+    LlamaForCausalLM,
+    LlamaForSequenceClassification,
+    LlamaForTokenClassification,
+)
+from ..phi3.modeling_phi3 import Phi3MLP
+from .configuration_glm import GlmConfig
+
+
+logger = logging.get_logger(__name__)
+
+_CHECKPOINT_FOR_DOC = "THUDM/glm-4-9b"
+
+
+class GlmMLP(Phi3MLP):
+    pass
+
+
+def rotate_half(x):
+    """Rotates each interleaved (even, odd) pair of the last dim: (a, b) -> (-b, a)."""
+    x1 = x[..., 0::2]
+    x2 = x[..., 1::2]
+    return torch.stack((-x2, x1), dim=-1).flatten(-2)
+
+
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
+    """Applies Rotary Position Embedding to the query and key tensors.
+
+    Args:
+        q (`torch.Tensor`): The query tensor.
+        k (`torch.Tensor`): The key tensor.
+        cos (`torch.Tensor`): The cosine part of the rotary embedding.
+        sin (`torch.Tensor`): The sine part of the rotary embedding.
+        position_ids (`torch.Tensor`, *optional*):
+            Deprecated and unused.
+        unsqueeze_dim (`int`, *optional*, defaults to 1):
+            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
+            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
+            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
+            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
+            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
+            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
+    Returns:
+        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
+    """
+    cos = cos.unsqueeze(unsqueeze_dim)
+    sin = sin.unsqueeze(unsqueeze_dim)
+
+    # Interleave them instead of usual shape
+    cos = cos[..., : cos.shape[-1] // 2].repeat_interleave(2, dim=-1)
+    sin = sin[..., : sin.shape[-1] // 2].repeat_interleave(2, dim=-1)
+
+    # Keep half or full tensor for later concatenation
+    rotary_dim = cos.shape[-1]
+    q_rot, q_pass = q[..., :rotary_dim], q[..., rotary_dim:]
+    k_rot, k_pass = k[..., :rotary_dim], k[..., rotary_dim:]
+
+    # Apply rotary embeddings on the first half or full tensor
+    q_embed = (q_rot * cos) + (rotate_half(q_rot) * sin)
+    k_embed = (k_rot * cos) + (rotate_half(k_rot) * sin)
+
+    # Concatenate back to full shape
+    q_embed = torch.cat([q_embed, q_pass], dim=-1)
+    k_embed = torch.cat([k_embed, k_pass], dim=-1)
+    return q_embed, k_embed
+
+
+class GlmAttention(LlamaAttention):
+    def __init__(self, config: GlmConfig, layer_idx: Optional[int] = None):
+        super().__init__(config, layer_idx)
+        self.o_proj = nn.Linear(config.num_attention_heads * self.head_dim, config.hidden_size, bias=False)
+
+
+class GlmForCausalLM(LlamaForCausalLM):
+    pass
+
+
+class GlmForSequenceClassification(LlamaForSequenceClassification):
+    pass
+
+
+class GlmForTokenClassification(LlamaForTokenClassification):
+    pass
+
+
+__all__ = [
+    "GlmPreTrainedModel",  # noqa: F822
+    "GlmModel",  # noqa: F822
+    "GlmForCausalLM",
+    "GlmForSequenceClassification",
+    "GlmForTokenClassification",
+]
diff --git a/phivenv/Lib/site-packages/transformers/models/glm4/__init__.py b/phivenv/Lib/site-packages/transformers/models/glm4/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..6e92a8a2b9c9761d39526ccc0c12c26604fa2a49
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/glm4/__init__.py
@@ -0,0 +1,27 @@
+# Copyright 2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_glm4 import *
+    from .modeling_glm4 import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/phivenv/Lib/site-packages/transformers/models/glm4/__pycache__/__init__.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/glm4/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cca063f8a4e477b697ac7ff739c78c61fbab0da2
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/glm4/__pycache__/__init__.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/glm4/__pycache__/configuration_glm4.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/glm4/__pycache__/configuration_glm4.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..70ac86521c7026e2c3ec3a1768d73e50e1de65a1
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/glm4/__pycache__/configuration_glm4.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/glm4/__pycache__/modeling_glm4.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/glm4/__pycache__/modeling_glm4.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..44cc7cf9a218c1ddc453805e204b89754e4a8eaa
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/glm4/__pycache__/modeling_glm4.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/glm4/__pycache__/modular_glm4.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/glm4/__pycache__/modular_glm4.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2f363f7d2afb800f81974cee4fd2fda588cb046a
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/glm4/__pycache__/modular_glm4.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/glm4/configuration_glm4.py b/phivenv/Lib/site-packages/transformers/models/glm4/configuration_glm4.py
new file mode 100644
index 0000000000000000000000000000000000000000..46dc929826e4c37be0717600ade76d3f03be53b7
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/glm4/configuration_glm4.py
@@ -0,0 +1,152 @@
+# coding=utf-8
+# Copyright 2025 The GLM4 & ZhipuAI team and HuggingFace Inc. team. All rights reserved.
+#
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ...configuration_utils import PretrainedConfig
+
+
+class Glm4Config(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Glm4Model`]. It is used to instantiate a Glm4
+    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+    defaults will yield a similar configuration to that of the Glm4-4-9b-chat.
+    e.g. [THUDM/GLM-4-9B-0414](https://huggingface.co/THUDM/GLM-4-9B-0414)
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+    Args:
+        vocab_size (`int`, *optional*, defaults to 151552):
+            Vocabulary size of the Glm4 model. Defines the number of different tokens that can be represented by the
+            `input_ids` passed when calling [`Glm4Model`].
+        hidden_size (`int`, *optional*, defaults to 4096):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 13696):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 40):
+            Number of hidden layers in the Transformer decoder.
+        num_attention_heads (`int`, *optional*, defaults to 32):
+            Number of attention heads for each attention layer in the Transformer decoder.
+        num_key_value_heads (`int`, *optional*, defaults to 2):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details, check out [this
+            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
+            `num_attention_heads`.
+        partial_rotary_factor (`float`, *optional*, defaults to 0.5): The factor of the partial rotary position.
+        head_dim (`int`, *optional*, defaults to 128):
+            The attention head dimension.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the decoder.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        max_position_embeddings (`int`, *optional*, defaults to 131072):
+            The maximum sequence length that this model might ever be used with.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1.5625e-07):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether to tie the input and output word embeddings.
+        rope_theta (`float`, *optional*, defaults to 10000.0):
+            The base period of the RoPE embeddings.
+        pad_token_id (`int`, *optional*, defaults to 151329):
+            Padding token id.
+        eos_token_id (`int` | `list`, *optional*, defaults to `[151329, 151336, 151338]`):
+            End of stream token id.
+        bos_token_id (`int`, *optional*):
+            Beginning of stream token id.
+        attention_bias (`bool`, *optional*, defaults to `True`):
+            Whether to use a bias in the query, key, value and output projection layers during self-attention.
+    ```python
+    >>> from transformers import Glm4Model, Glm4Config
+    >>> # Initializing a Glm4 glm4-4-9b-chat style configuration
+    >>> configuration = Glm4Config()
+    >>> # Initializing a model from the glm4-4-9b-chat style configuration
+    >>> model = Glm4Model(configuration)
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "glm4"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    base_model_tp_plan = {
+        "layers.*.self_attn.q_proj": "colwise",
+        "layers.*.self_attn.k_proj": "colwise",
+        "layers.*.self_attn.v_proj": "colwise",
+        "layers.*.self_attn.o_proj": "rowwise",
+        "layers.*.mlp.gate_up_proj": "colwise_rep",  # we need to replicate here due to the `chunk` operation
+        "layers.*.mlp.down_proj": "rowwise_rep",  # we need to replicate here due to the `chunk` operation
+    }
+    base_model_pp_plan = {
+        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
+        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
+        "norm": (["hidden_states"], ["hidden_states"]),
+    }
+
+    def __init__(
+        self,
+        vocab_size=151552,
+        hidden_size=4096,
+        intermediate_size=13696,
+        num_hidden_layers=40,
+        num_attention_heads=32,
+        num_key_value_heads=2,
+        partial_rotary_factor=0.5,
+        head_dim=128,
+        hidden_act="silu",
+        attention_dropout=0.0,
+        max_position_embeddings=131072,
+        initializer_range=0.02,
+        rms_norm_eps=0.00000015625,
+        use_cache=True,
+        tie_word_embeddings=False,
+        rope_theta=10000.0,
+        pad_token_id=151329,
+        eos_token_id=[151329, 151336, 151338],
+        bos_token_id=None,
+        attention_bias=True,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.partial_rotary_factor = partial_rotary_factor
+        self.head_dim = head_dim
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+
+__all__ = ["Glm4Config"]
diff --git a/phivenv/Lib/site-packages/transformers/models/glm4/modeling_glm4.py b/phivenv/Lib/site-packages/transformers/models/glm4/modeling_glm4.py
new file mode 100644
index 0000000000000000000000000000000000000000..dafab297f5667610284121a2bfe818bfcc12b3df
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/glm4/modeling_glm4.py
@@ -0,0 +1,521 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/glm4/modular_glm4.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_glm4.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# coding=utf-8
+# Copyright 2025 The GLM4 & ZhipuAI team and HuggingFace Inc. team. All rights reserved.
+#
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Callable, Optional, Union
+
+import torch
+import torch.nn as nn
+
+from ...activations import ACT2FN
+from ...cache_utils import Cache, DynamicCache
+from ...generation import GenerationMixin
+from ...integrations import use_kernel_forward_from_hub
+from ...masking_utils import create_causal_mask
+from ...modeling_flash_attention_utils import FlashAttentionKwargs
+from ...modeling_layers import (
+    GenericForSequenceClassification,
+    GenericForTokenClassification,
+    GradientCheckpointingLayer,
+)
+from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
+from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from ...processing_utils import Unpack
+from ...utils import TransformersKwargs, auto_docstring, can_return_tuple
+from ...utils.deprecation import deprecate_kwarg
+from ...utils.generic import check_model_inputs
+from .configuration_glm4 import Glm4Config
+
+
+class Glm4MLP(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        # The gate and up projections share one linear layer, hence the doubled output width.
+        self.config = config
+        self.gate_up_proj = nn.Linear(config.hidden_size, 2 * config.intermediate_size, bias=False)
+        self.down_proj = nn.Linear(config.intermediate_size, config.hidden_size, bias=False)
+        self.activation_fn = ACT2FN[config.hidden_act]
+
+    def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor:
+        # Split the fused projection in two along the last axis; the activated first half gates the second.
+        fused = self.gate_up_proj(hidden_states)
+        gate, up = torch.chunk(fused, 2, dim=-1)
+        gated = up * self.activation_fn(gate)
+
+        return self.down_proj(gated)
+
+
+class Glm4DecoderLayer(GradientCheckpointingLayer):
+    def __init__(self, config: Glm4Config, layer_idx: int):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.self_attn = Glm4Attention(config=config, layer_idx=layer_idx)
+        # Each sub-block (attention, MLP) is normalized on its input and again on its output.
+        self.mlp = Glm4MLP(config)
+        self.input_layernorm = Glm4RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = Glm4RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_self_attn_layernorm = Glm4RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_mlp_layernorm = Glm4RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        use_cache: Optional[bool] = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,  # unpacked by self_attn; Optional for BC
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> torch.Tensor:
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+        # Self-attention; the second returned item (the attention weights) is discarded here.
+        hidden_states, _ = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            position_embeddings=position_embeddings,
+            **kwargs,
+        )
+        # Normalize the attention output before adding it back onto the residual stream.
+        hidden_states = self.post_self_attn_layernorm(hidden_states)
+        hidden_states = residual + hidden_states
+        # MLP sub-block, same pattern: norm -> MLP -> norm -> residual add. Only the hidden states are returned.
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = self.post_mlp_layernorm(hidden_states)
+        hidden_states = residual + hidden_states
+        return hidden_states
+
+
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """
+    Repeat every key/value head `n_rep` times, like torch.repeat_interleave(x, dim=1, repeats=n_rep):
+    (batch, num_key_value_heads, seqlen, head_dim) -> (batch, num_key_value_heads * n_rep, seqlen, head_dim).
+    """
+    bsz, kv_heads, seq_len, dim = hidden_states.shape
+    if n_rep != 1:
+        tiled = hidden_states.unsqueeze(2).expand(bsz, kv_heads, n_rep, seq_len, dim)
+        hidden_states = tiled.reshape(bsz, kv_heads * n_rep, seq_len, dim)
+    return hidden_states
+
+
+def eager_attention_forward(
+    module: nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: Optional[torch.Tensor],
+    scaling: float,
+    dropout: float = 0.0,
+    **kwargs: Unpack[TransformersKwargs],
+):
+    """Reference (non-fused) attention: softmax(Q @ K^T * scaling + mask) @ V, returned with the weights."""
+    # Repeat the key/value heads `module.num_key_value_groups` times before the matmuls.
+    n_groups = module.num_key_value_groups
+    keys, values = repeat_kv(key, n_groups), repeat_kv(value, n_groups)
+    scores = torch.matmul(query, keys.transpose(2, 3)) * scaling
+    if attention_mask is not None:
+        # Crop the mask's last axis to the number of key positions actually present.
+        scores = scores + attention_mask[:, :, :, : keys.shape[-2]]
+
+    probs = nn.functional.softmax(scores, dim=-1, dtype=torch.float32)  # softmax is evaluated in float32
+    attn_weights = nn.functional.dropout(probs.to(query.dtype), p=dropout, training=module.training)
+    attn_output = torch.matmul(attn_weights, values).transpose(1, 2).contiguous()
+
+    return attn_output, attn_weights
+
+
+def rotate_half(x):
+    """Rotates interleaved (even, odd) pairs of the last dim: (x0, x1, x2, x3, ...) -> (-x1, x0, -x3, x2, ...)."""
+    evens, odds = x[..., 0::2], x[..., 1::2]
+    rotated_pairs = torch.stack((-odds, evens), dim=-1)
+    return rotated_pairs.flatten(-2)
+
+
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
+    """Apply rotary position embeddings to the leading channels of the query and key tensors.
+
+    Channels of `q` and `k` beyond the rotary width are passed through unrotated.
+
+    Args:
+        q (`torch.Tensor`): The query tensor.
+        k (`torch.Tensor`): The key tensor.
+        cos (`torch.Tensor`): The cosine table of the rotary embedding.
+        sin (`torch.Tensor`): The sine table of the rotary embedding.
+        position_ids (`torch.Tensor`, *optional*):
+            Deprecated and unused.
+        unsqueeze_dim (`int`, *optional*, defaults to 1):
+            Axis at which a singleton dimension is inserted into `cos` and `sin` so that they broadcast against
+            `q` and `k`. Use `unsqueeze_dim=1` when `q` and `k` are laid out as
+            [batch_size, heads, seq_len, head_dim] and `cos`/`sin` as [batch_size, seq_len, head_dim]; use
+            `unsqueeze_dim=2` when `q` and `k` are laid out as [batch_size, seq_len, heads, head_dim].
+    Returns:
+        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
+    """
+    # Insert the broadcast axis into both tables.
+    cos, sin = cos.unsqueeze(unsqueeze_dim), sin.unsqueeze(unsqueeze_dim)
+
+    # Keep the first half of each table and repeat every entry twice in place: (c0, c1, ...) ->
+    # (c0, c0, c1, c1, ...), which lines up with the interleaved pairs rotated by `rotate_half`.
+    cos = cos[..., : cos.shape[-1] // 2].repeat_interleave(2, dim=-1)
+    sin = sin[..., : sin.shape[-1] // 2].repeat_interleave(2, dim=-1)
+
+    # Only the leading `width` channels are rotated; the remainder is carried along untouched.
+    width = cos.shape[-1]
+    q_head, q_tail = q[..., :width], q[..., width:]
+    k_head, k_tail = k[..., :width], k[..., width:]
+
+    q_rotated = q_head * cos + rotate_half(q_head) * sin
+    k_rotated = k_head * cos + rotate_half(k_head) * sin
+
+    # Re-attach the untouched channels so the outputs have the same shape as `q` and `k`.
+    q_embed = torch.cat([q_rotated, q_tail], dim=-1)
+    k_embed = torch.cat([k_rotated, k_tail], dim=-1)
+    return q_embed, k_embed
+
+
+class Glm4Attention(nn.Module):
+    """Multi-headed self-attention with rotary position embeddings and separately sized key/value heads."""
+
+    def __init__(self, config: Glm4Config, layer_idx: Optional[int] = None):
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+        self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
+        self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
+        self.scaling = self.head_dim**-0.5
+        self.attention_dropout = config.attention_dropout
+        self.is_causal = True
+        # Queries use num_attention_heads heads and keys/values use num_key_value_heads; o_proj never has a bias.
+        self.q_proj = nn.Linear(
+            config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.k_proj = nn.Linear(
+            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.v_proj = nn.Linear(
+            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.o_proj = nn.Linear(config.num_attention_heads * self.head_dim, config.hidden_size, bias=False)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
+        attention_mask: Optional[torch.Tensor],
+        past_key_values: Optional[Cache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        input_shape = hidden_states.shape[:-1]
+        hidden_shape = (*input_shape, -1, self.head_dim)
+        # Split the projected features into heads of size head_dim, then swap axes 1 and 2.
+        query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+        key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+        value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+
+        cos, sin = position_embeddings
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+
+        if past_key_values is not None:
+            # sin and cos are specific to RoPE models; cache_position needed for the static cache
+            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
+        # Use the eager implementation unless the config names another registered attention backend.
+        attention_interface: Callable = eager_attention_forward
+        if self.config._attn_implementation != "eager":
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            dropout=0.0 if not self.training else self.attention_dropout,
+            scaling=self.scaling,
+            **kwargs,
+        )
+        # Flatten the trailing axes back into one feature axis before the output projection.
+        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
+        attn_output = self.o_proj(attn_output)
+        return attn_output, attn_weights
+
+
+@use_kernel_forward_from_hub("RMSNorm")
+class Glm4RMSNorm(nn.Module):
+    def __init__(self, hidden_size, eps=1e-6):
+        """
+        Root-mean-square norm with a learned per-channel scale and no bias (equivalent to T5LayerNorm).
+        """
+        super().__init__()
+        self.variance_epsilon = eps
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+
+    def forward(self, hidden_states):
+        # Normalize in float32, then cast back to the incoming dtype before applying the learned scale.
+        orig_dtype = hidden_states.dtype
+        x = hidden_states.to(torch.float32)
+        inv_rms = torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.variance_epsilon)
+        return self.weight * (x * inv_rms).to(orig_dtype)
+
+    def extra_repr(self):
+        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
+
+
+class Glm4RotaryEmbedding(nn.Module):
+    inv_freq: torch.Tensor  # fix linting for `register_buffer`
+
+    def __init__(self, config: Glm4Config, device=None):
+        super().__init__()
+        # BC: "rope_type" was originally "type"
+        if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict):
+            self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
+        else:
+            self.rope_type = "default"
+        self.max_seq_len_cached = config.max_position_embeddings
+        self.original_max_seq_len = config.max_position_embeddings
+
+        self.config = config
+        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
+        # rope_init_fn returns the inverse frequencies and a scaling factor that forward applies to cos and sin.
+        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+        self.original_inv_freq = self.inv_freq
+
+    @torch.no_grad()
+    @dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
+    def forward(self, x, position_ids):
+        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
+        position_ids_expanded = position_ids[:, None, :].float()
+        # Autocast is disabled below; "mps" and non-string device types use the "cpu" autocast context instead.
+        device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
+        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
+            emb = torch.cat((freqs, freqs), dim=-1)  # two copies of the frequencies, side by side
+            cos = emb.cos() * self.attention_scaling
+            sin = emb.sin() * self.attention_scaling
+        # Both tables are returned in the dtype of `x`.
+        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
+
+
+@auto_docstring
+class Glm4PreTrainedModel(PreTrainedModel):
+    config: Glm4Config
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["Glm4DecoderLayer"]
+    _skip_keys_device_placement = ["past_key_values"]
+    _supports_flash_attn = True
+    _supports_sdpa = True
+    _supports_flex_attn = True
+    # NOTE(review): these class-level flags are presumably read by the PreTrainedModel machinery - confirm there.
+    _can_compile_fullgraph = True
+    _supports_attention_backend = True
+    _can_record_outputs = {
+        "hidden_states": Glm4DecoderLayer,
+        "attentions": Glm4Attention,
+    }
+
+
+@auto_docstring
+class Glm4Model(Glm4PreTrainedModel):
+    def __init__(self, config: Glm4Config):
+        super().__init__(config)
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+        self.layers = nn.ModuleList(
+            [Glm4DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+        )
+        self.norm = Glm4RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.rotary_emb = Glm4RotaryEmbedding(config=config)
+        self.gradient_checkpointing = False
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @check_model_inputs
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> BaseModelOutputWithPast:
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+        if inputs_embeds is None:
+            inputs_embeds: torch.Tensor = self.embed_tokens(input_ids)
+        # Create a fresh cache only when caching is requested and the caller did not supply one.
+        if use_cache and past_key_values is None:
+            past_key_values = DynamicCache(config=self.config)
+        # Without explicit cache positions, number the new tokens starting after those already in the cache.
+        if cache_position is None:
+            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+            cache_position: torch.Tensor = torch.arange(
+                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
+            )
+        # Position ids default to the cache positions with a leading axis of size 1.
+        if position_ids is None:
+            position_ids = cache_position.unsqueeze(0)
+
+        causal_mask = create_causal_mask(
+            config=self.config,
+            input_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            cache_position=cache_position,
+            past_key_values=past_key_values,
+            position_ids=position_ids,
+        )
+        # The rotary cos/sin tables are computed once here and passed to every decoder layer.
+        hidden_states = inputs_embeds
+        position_embeddings = self.rotary_emb(hidden_states, position_ids)
+        # NOTE(review): `use_cache` is not forwarded to the layers; they receive only `past_key_values`.
+        for decoder_layer in self.layers[: self.config.num_hidden_layers]:
+            hidden_states = decoder_layer(
+                hidden_states,
+                attention_mask=causal_mask,
+                position_ids=position_ids,
+                past_key_values=past_key_values,
+                cache_position=cache_position,
+                position_embeddings=position_embeddings,
+                **kwargs,
+            )
+
+        hidden_states = self.norm(hidden_states)
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values,
+        )
+
+
+@auto_docstring
+class Glm4ForCausalLM(Glm4PreTrainedModel, GenerationMixin):
+    _tied_weights_keys = ["lm_head.weight"]
+    _tp_plan = {"lm_head": "colwise_rep"}
+    _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.model = Glm4Model(config)
+        self.vocab_size = config.vocab_size
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> Union[tuple, CausalLMOutputWithPast]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+        Example:
+
+        ```python
+        >>> from transformers import AutoTokenizer, Glm4ForCausalLM
+
+        >>> model = Glm4ForCausalLM.from_pretrained("THUDM/GLM-4-9B-0414")
+        >>> tokenizer = AutoTokenizer.from_pretrained("THUDM/GLM-4-9B-0414")
+
+        >>> prompt = "Hey, are you conscious? Can you talk to me?"
+        >>> inputs = tokenizer(prompt, return_tensors="pt")
+
+        >>> # Generate
+        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
+        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
+        ```"""
+        outputs: BaseModelOutputWithPast = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            **kwargs,
+        )
+        # `logits_to_keep`: an int keeps the last N positions (0 keeps all of them); a tensor is used as the index.
+        hidden_states = outputs.last_hidden_state
+        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
+        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+        logits = self.lm_head(hidden_states[:, slice_indices, :])
+        # The loss is computed only when labels are given; extra kwargs are forwarded to the loss function.
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs)
+
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+class Glm4ForSequenceClassification(GenericForSequenceClassification, Glm4PreTrainedModel):
+    pass  # no overrides: everything comes from the two base classes
+
+
+class Glm4ForTokenClassification(GenericForTokenClassification, Glm4PreTrainedModel):
+    pass  # no overrides: everything comes from the two base classes
+
+
+__all__ = [
+    "Glm4PreTrainedModel",
+    "Glm4Model",
+    "Glm4ForCausalLM",
+    "Glm4ForSequenceClassification",
+    "Glm4ForTokenClassification",
+]
diff --git a/phivenv/Lib/site-packages/transformers/models/glm4/modular_glm4.py b/phivenv/Lib/site-packages/transformers/models/glm4/modular_glm4.py
new file mode 100644
index 0000000000000000000000000000000000000000..6bbc9b601f591d539e1bb529456a39337589f417
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/glm4/modular_glm4.py
@@ -0,0 +1,139 @@
+# coding=utf-8
+# Copyright 2025 The GLM4 & ZhipuAI team and HuggingFace Inc. team. All rights reserved.
+#
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Optional, Union
+
+import torch
+
+from ...cache_utils import Cache
+from ...modeling_flash_attention_utils import FlashAttentionKwargs
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import CausalLMOutputWithPast
+from ...processing_utils import Unpack
+from ...utils import TransformersKwargs, logging
+from ...utils.deprecation import deprecate_kwarg
+from ..glm.modeling_glm import GlmAttention, GlmForCausalLM, GlmForSequenceClassification, GlmForTokenClassification
+from ..phi3.modeling_phi3 import Phi3MLP
+from .configuration_glm4 import Glm4Config
+from .modeling_glm4 import Glm4RMSNorm
+
+
+logger = logging.get_logger(__name__)
+
+_CHECKPOINT_FOR_DOC = "THUDM/GLM-4-9B-0414"
+
+
+class Glm4MLP(Phi3MLP):
+    pass  # no overrides: Phi3MLP is reused unchanged under the GLM4 name
+
+
+class Glm4DecoderLayer(GradientCheckpointingLayer):
+    def __init__(self, config: Glm4Config, layer_idx: int):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.self_attn = Glm4Attention(config=config, layer_idx=layer_idx)
+        # Each sub-block (attention, MLP) is normalized on its input and again on its output.
+        self.mlp = Glm4MLP(config)
+        self.input_layernorm = Glm4RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = Glm4RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_self_attn_layernorm = Glm4RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_mlp_layernorm = Glm4RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        use_cache: Optional[bool] = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,  # forwarded to self_attn; Optional for BC
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> torch.Tensor:
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+        # Self-attention; the second returned item is discarded here.
+        hidden_states, _ = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            position_embeddings=position_embeddings,
+            **kwargs,
+        )
+        # Normalize the attention output before adding it back onto the residual stream.
+        hidden_states = self.post_self_attn_layernorm(hidden_states)
+        hidden_states = residual + hidden_states
+        # MLP sub-block, same pattern: norm -> MLP -> norm -> residual add. Only the hidden states are returned.
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = self.post_mlp_layernorm(hidden_states)
+        hidden_states = residual + hidden_states
+        return hidden_states
+
+
+class Glm4Attention(GlmAttention):
+    pass  # no overrides: GlmAttention is reused unchanged under the GLM4 name
+
+
+class Glm4ForCausalLM(GlmForCausalLM):
+    def forward(
+        self,
+        **super_kwargs: Unpack[TransformersKwargs],
+    ) -> Union[tuple, CausalLMOutputWithPast]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+        Example:
+
+        ```python
+        >>> from transformers import AutoTokenizer, Glm4ForCausalLM
+
+        >>> model = Glm4ForCausalLM.from_pretrained("THUDM/GLM-4-9B-0414")
+        >>> tokenizer = AutoTokenizer.from_pretrained("THUDM/GLM-4-9B-0414")
+
+        >>> prompt = "Hey, are you conscious? Can you talk to me?"
+        >>> inputs = tokenizer(prompt, return_tensors="pt")
+
+        >>> # Generate
+        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
+        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
+        ```"""
+        return super().forward(**super_kwargs)  # pure delegation; this override only supplies the docstring
+
+
+class Glm4ForSequenceClassification(GlmForSequenceClassification):
+    pass  # no overrides: the GLM implementation is reused under the GLM4 name
+
+
+class Glm4ForTokenClassification(GlmForTokenClassification):
+    pass  # no overrides: the GLM implementation is reused under the GLM4 name
+
+
+__all__ = [
+    "Glm4PreTrainedModel",  # noqa: F822
+    "Glm4Model",  # noqa: F822
+    "Glm4ForCausalLM",
+    "Glm4ForSequenceClassification",
+    "Glm4ForTokenClassification",
+]
diff --git a/phivenv/Lib/site-packages/transformers/models/glm4_moe/__init__.py b/phivenv/Lib/site-packages/transformers/models/glm4_moe/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..7fcd5dc9cce65ccc83944e483c5b75a3bbe16cd3
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/glm4_moe/__init__.py
@@ -0,0 +1,27 @@
+# Copyright 2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_glm4_moe import *
+    from .modeling_glm4_moe import *
+else:
+    import sys
+    # At runtime, replace this module in sys.modules with a _LazyModule built from this file's import structure.
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/phivenv/Lib/site-packages/transformers/models/glm4_moe/__pycache__/__init__.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/glm4_moe/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6dba921263c56d9cc1620a5f61610417f9ecb3b9
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/glm4_moe/__pycache__/__init__.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/glm4_moe/__pycache__/configuration_glm4_moe.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/glm4_moe/__pycache__/configuration_glm4_moe.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3b89cdb6762798b3df84c45bed9ae15bdc6cea63
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/glm4_moe/__pycache__/configuration_glm4_moe.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/glm4_moe/__pycache__/modeling_glm4_moe.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/glm4_moe/__pycache__/modeling_glm4_moe.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..27e9c47c56e86c522c29f861ffb8572fe42e651f
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/glm4_moe/__pycache__/modeling_glm4_moe.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/glm4_moe/__pycache__/modular_glm4_moe.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/glm4_moe/__pycache__/modular_glm4_moe.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8f69ed82b966857cf37ccab2da56472c6d240fa3
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/glm4_moe/__pycache__/modular_glm4_moe.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/glm4_moe/configuration_glm4_moe.py b/phivenv/Lib/site-packages/transformers/models/glm4_moe/configuration_glm4_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..b9937a8ba4943804573e88fe85bfe7414ce11cbc
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/glm4_moe/configuration_glm4_moe.py
@@ -0,0 +1,242 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/glm4_moe/modular_glm4_moe.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_glm4_moe.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# coding=utf-8
+# Copyright 2025 The ZhipuAI Inc. team and HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ...configuration_utils import PretrainedConfig
+from ...modeling_rope_utils import rope_config_validation
+
+
+class Glm4MoeConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Glm4MoeModel`]. It is used to instantiate a
+    Glm4Moe model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of [THUDM/GLM-4-100B-A10B](https://huggingface.co/THUDM/GLM-4-100B-A10B).
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 151552):
+            Vocabulary size of the Glm4Moe model. Defines the number of different tokens that can be represented by the
+            `input_ids` passed when calling [`Glm4MoeModel`].
+        hidden_size (`int`, *optional*, defaults to 4096):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 10944):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 46):
+            Number of hidden layers in the Transformer decoder.
+        num_attention_heads (`int`, *optional*, defaults to 96):
+            Number of attention heads for each attention layer in the Transformer decoder.
+        partial_rotary_factor (`float`, *optional*, defaults to 0.5):
+            The factor of the partial rotary position.
+        num_key_value_heads (`int`, *optional*, defaults to 8):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details, check out [this
+            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to `8`.
+
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the decoder.
+        max_position_embeddings (`int`, *optional*, defaults to 131072):
+            The maximum sequence length that this model might ever be used with.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether the model's input and output word embeddings should be tied.
+        rope_theta (`float`, *optional*, defaults to 10000.0):
+            The base period of the RoPE embeddings.
+        rope_scaling (`Dict`, *optional*):
+            Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
+            and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
+            accordingly.
+            Expected contents:
+                `rope_type` (`str`):
+                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
+                    'llama3'], with 'default' being the original RoPE implementation.
+                `factor` (`float`, *optional*):
+                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
+                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
+                    original maximum pre-trained length.
+                `original_max_position_embeddings` (`int`, *optional*):
+                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
+                    pretraining.
+                `attention_factor` (`float`, *optional*):
+                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
+                    computation. If unspecified, it defaults to value recommended by the implementation, using the
+                    `factor` field to infer the suggested value.
+                `beta_fast` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 32.
+                `beta_slow` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 1.
+                `short_factor` (`list[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `long_factor` (`list[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to long contexts (>
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `low_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
+                `high_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
+        attention_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use a bias in the query, key, value and output projection layers during self-attention.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        moe_intermediate_size (`int`, *optional*, defaults to 1408):
+            Intermediate size of the routed expert.
+        num_experts_per_tok (`int`, *optional*, defaults to 8):
+            Number of routed experts selected per token.
+        n_shared_experts (`int`, *optional*, defaults to 1):
+            Number of shared experts.
+        n_routed_experts (`int`, *optional*, defaults to 128):
+            Number of routed experts.
+        routed_scaling_factor (`float`, *optional*, defaults to 1.0):
+            Scaling factor for routed experts.
+        n_group (`int`, *optional*, defaults to 1):
+            Number of groups for routed experts.
+        topk_group (`int`, *optional*, defaults to 1):
+            Number of selected groups for each token (ensuring the selected experts are only within `topk_group` groups).
+        first_k_dense_replace (`int`, *optional*, defaults to 1):
+            Number of dense layers in shallow layers(embed->dense->dense->...->dense->moe->moe...->lm_head).
+                                                            \--k dense layers--/
+        norm_topk_prob (`bool`, *optional*, defaults to `True`):
+            Whether to normalize the topk probabilities.
+        use_qk_norm (`bool`, *optional*, defaults to `False`):
+            Whether to use query-key normalization in the attention.
+    ```python
+    >>> from transformers import Glm4MoeModel, Glm4MoeConfig
+
+    >>> # Initializing a Glm4Moe style configuration
+    >>> configuration = Glm4MoeConfig()
+
+    >>> # Initializing a model from the GLM-4-MOE-100B-A10B style configuration
+    >>> model = Glm4MoeModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "glm4_moe"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    # Default tensor parallel plan for base model `Glm4Moe`
+    base_model_tp_plan = {
+        "layers.*.self_attn.q_proj": "colwise",
+        "layers.*.self_attn.k_proj": "colwise",
+        "layers.*.self_attn.v_proj": "colwise",
+        "layers.*.self_attn.o_proj": "rowwise",
+        "layers.*.mlp.experts.*.gate_proj": "colwise",
+        "layers.*.mlp.experts.*.up_proj": "colwise",
+        "layers.*.mlp.experts.*.down_proj": "rowwise",
+        "layers.*.mlp.gate_proj": "colwise",
+        "layers.*.mlp.up_proj": "colwise",
+        "layers.*.mlp.down_proj": "rowwise",
+    }
+    base_model_pp_plan = {
+        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
+        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
+        "norm": (["hidden_states"], ["hidden_states"]),
+    }
+
+    def __init__(
+        self,
+        vocab_size=151552,
+        hidden_size=4096,
+        intermediate_size=10944,
+        num_hidden_layers=46,
+        num_attention_heads=96,
+        partial_rotary_factor=0.5,
+        num_key_value_heads=8,
+        hidden_act="silu",
+        max_position_embeddings=131072,
+        initializer_range=0.02,
+        rms_norm_eps=1e-5,
+        use_cache=True,
+        tie_word_embeddings=False,
+        rope_theta=10000.0,
+        rope_scaling=None,
+        attention_bias=False,
+        attention_dropout=0.0,
+        moe_intermediate_size=1408,
+        num_experts_per_tok=8,
+        n_shared_experts=1,
+        n_routed_experts=128,
+        routed_scaling_factor=1.0,
+        n_group=1,
+        topk_group=1,
+        first_k_dense_replace=1,
+        norm_topk_prob=True,
+        use_qk_norm=False,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.partial_rotary_factor = partial_rotary_factor
+
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+        # Validate the correctness of rotary position embeddings parameters
+        # BC: if there is a 'type' field, move it to 'rope_type'.
+        if self.rope_scaling is not None and "type" in self.rope_scaling:
+            self.rope_scaling["rope_type"] = self.rope_scaling["type"]
+        rope_config_validation(self)
+
+        # MoE arguments
+        self.moe_intermediate_size = moe_intermediate_size
+        self.num_experts_per_tok = num_experts_per_tok
+        self.n_group = n_group
+        self.topk_group = topk_group
+        self.n_shared_experts = n_shared_experts
+        self.n_routed_experts = n_routed_experts
+        self.routed_scaling_factor = routed_scaling_factor
+        self.first_k_dense_replace = first_k_dense_replace
+        self.norm_topk_prob = norm_topk_prob
+        self.use_qk_norm = use_qk_norm
+
+        super().__init__(
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+
+__all__ = ["Glm4MoeConfig"]
diff --git a/phivenv/Lib/site-packages/transformers/models/glm4_moe/modeling_glm4_moe.py b/phivenv/Lib/site-packages/transformers/models/glm4_moe/modeling_glm4_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..cb695ffbe638c18dcc8295e3974980d21ee8d243
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/glm4_moe/modeling_glm4_moe.py
@@ -0,0 +1,616 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/glm4_moe/modular_glm4_moe.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_glm4_moe.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# coding=utf-8
+# Copyright 2025 The ZhipuAI Inc. team and HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Callable, Optional, Union
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from ...activations import ACT2FN
+from ...cache_utils import Cache, DynamicCache
+from ...generation import GenerationMixin
+from ...integrations import use_kernel_forward_from_hub
+from ...masking_utils import create_causal_mask
+from ...modeling_flash_attention_utils import FlashAttentionKwargs
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
+from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from ...processing_utils import Unpack
+from ...utils import TransformersKwargs, auto_docstring, can_return_tuple
+from ...utils.deprecation import deprecate_kwarg
+from ...utils.generic import check_model_inputs
+from .configuration_glm4_moe import Glm4MoeConfig
+
+
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """
+    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
+    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
+    """
+    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+    if n_rep == 1:
+        return hidden_states
+    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
+    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
+
+
+def eager_attention_forward(
+    module: nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: Optional[torch.Tensor],
+    scaling: float,
+    dropout: float = 0.0,
+    **kwargs: Unpack[TransformersKwargs],
+):
+    key_states = repeat_kv(key, module.num_key_value_groups)
+    value_states = repeat_kv(value, module.num_key_value_groups)
+
+    attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
+    if attention_mask is not None:
+        causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
+        attn_weights = attn_weights + causal_mask
+
+    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
+    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
+    attn_output = torch.matmul(attn_weights, value_states)
+    attn_output = attn_output.transpose(1, 2).contiguous()
+
+    return attn_output, attn_weights
+
+
+def rotate_half(x):
+    """Rotates half the hidden dims of the input."""
+    x1 = x[..., : x.shape[-1] // 2]
+    x2 = x[..., x.shape[-1] // 2 :]
+    return torch.cat((-x2, x1), dim=-1)
+
+
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
+    """Applies Rotary Position Embedding to the query and key tensors.
+
+    Args:
+        q (`torch.Tensor`): The query tensor.
+        k (`torch.Tensor`): The key tensor.
+        cos (`torch.Tensor`): The cosine part of the rotary embedding.
+        sin (`torch.Tensor`): The sine part of the rotary embedding.
+        position_ids (`torch.Tensor`, *optional*):
+            Deprecated and unused.
+        unsqueeze_dim (`int`, *optional*, defaults to 1):
+            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
+            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
+            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
+            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
+            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
+            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
+    Returns:
+        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
+    """
+    cos = cos.unsqueeze(unsqueeze_dim)
+    sin = sin.unsqueeze(unsqueeze_dim)
+
+    # Keep half or full tensor for later concatenation
+    rotary_dim = cos.shape[-1]
+    q_rot, q_pass = q[..., :rotary_dim], q[..., rotary_dim:]
+    k_rot, k_pass = k[..., :rotary_dim], k[..., rotary_dim:]
+
+    # Apply rotary embeddings on the first half or full tensor
+    q_embed = (q_rot * cos) + (rotate_half(q_rot) * sin)
+    k_embed = (k_rot * cos) + (rotate_half(k_rot) * sin)
+
+    # Concatenate back to full shape
+    q_embed = torch.cat([q_embed, q_pass], dim=-1)
+    k_embed = torch.cat([k_embed, k_pass], dim=-1)
+    return q_embed, k_embed
+
+
+class Glm4MoeAttention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+    def __init__(self, config: Glm4MoeConfig, layer_idx: Optional[int] = None):
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+        self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
+        self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
+        self.scaling = self.head_dim**-0.5
+        self.rope_scaling = config.rope_scaling
+        self.attention_dropout = config.attention_dropout
+        self.is_causal = True
+
+        self.q_proj = nn.Linear(
+            config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.k_proj = nn.Linear(
+            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.v_proj = nn.Linear(
+            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.o_proj = nn.Linear(config.num_attention_heads * self.head_dim, config.hidden_size, bias=False)
+        self.use_qk_norm = config.use_qk_norm
+        if self.use_qk_norm:
+            self.q_norm = Glm4MoeRMSNorm(self.head_dim, eps=config.rms_norm_eps)
+            self.k_norm = Glm4MoeRMSNorm(self.head_dim, eps=config.rms_norm_eps)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
+        attention_mask: Optional[torch.Tensor],
+        past_key_values: Optional[Cache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        input_shape = hidden_states.shape[:-1]
+        hidden_shape = (*input_shape, -1, self.head_dim)
+
+        query_states = self.q_proj(hidden_states).view(hidden_shape)
+        key_states = self.k_proj(hidden_states).view(hidden_shape)
+        value_states = self.v_proj(hidden_states).view(hidden_shape)
+
+        if self.use_qk_norm:  # main diff from Llama
+            query_states = self.q_norm(query_states)
+            key_states = self.k_norm(key_states)
+
+        query_states = query_states.transpose(1, 2)
+        key_states = key_states.transpose(1, 2)
+        value_states = value_states.transpose(1, 2)
+
+        cos, sin = position_embeddings
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+
+        if past_key_values is not None:
+            # sin and cos are specific to RoPE models; cache_position needed for the static cache
+            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        attention_interface: Callable = eager_attention_forward
+        if self.config._attn_implementation != "eager":
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            dropout=0.0 if not self.training else self.attention_dropout,
+            scaling=self.scaling,
+            **kwargs,
+        )
+
+        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
+        attn_output = self.o_proj(attn_output)
+        return attn_output, attn_weights
+
+
+class Glm4MoeMLP(nn.Module):
+    def __init__(self, config, hidden_size=None, intermediate_size=None):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size if hidden_size is None else hidden_size
+        self.intermediate_size = config.intermediate_size if intermediate_size is None else intermediate_size
+
+        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
+        self.act_fn = ACT2FN[config.hidden_act]
+
+    def forward(self, x):
+        down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
+        return down_proj
+
+
+class Glm4MoeTopkRouter(nn.Module):
+    def __init__(self, config: Glm4MoeConfig):
+        super().__init__()
+        self.config = config
+        self.top_k = config.num_experts_per_tok
+        self.n_routed_experts = config.n_routed_experts
+        self.routed_scaling_factor = config.routed_scaling_factor
+        self.n_group = config.n_group
+        self.topk_group = config.topk_group
+        self.norm_topk_prob = config.norm_topk_prob
+
+        self.weight = nn.Parameter(torch.empty((self.n_routed_experts, config.hidden_size)))
+        self.register_buffer("e_score_correction_bias", torch.zeros((self.n_routed_experts), dtype=torch.float32))
+
+    @torch.no_grad()
+    def get_topk_indices(self, scores):
+        scores_for_choice = scores.view(-1, self.n_routed_experts) + self.e_score_correction_bias.unsqueeze(0)
+        group_scores = (
+            scores_for_choice.view(-1, self.n_group, self.n_routed_experts // self.n_group)
+            .topk(2, dim=-1)[0]
+            .sum(dim=-1)
+        )
+        group_idx = torch.topk(group_scores, k=self.topk_group, dim=-1, sorted=False)[1]
+        group_mask = torch.zeros_like(group_scores)
+        group_mask.scatter_(1, group_idx, 1)
+        score_mask = (
+            group_mask.unsqueeze(-1)
+            .expand(-1, self.n_group, self.n_routed_experts // self.n_group)
+            .reshape(-1, self.n_routed_experts)
+        )
+        scores_for_choice = scores_for_choice.masked_fill(~score_mask.bool(), 0.0)
+        topk_indices = torch.topk(scores_for_choice, k=self.top_k, dim=-1, sorted=False)[1]
+        return topk_indices
+
+    def forward(self, hidden_states):
+        hidden_states = hidden_states.view(-1, self.config.hidden_size)
+        router_logits = F.linear(hidden_states.type(torch.float32), self.weight.type(torch.float32))
+        scores = router_logits.sigmoid()
+        topk_indices = self.get_topk_indices(scores)
+        topk_weights = scores.gather(1, topk_indices)
+        if self.norm_topk_prob:
+            denominator = topk_weights.sum(dim=-1, keepdim=True) + 1e-20
+            topk_weights /= denominator
+        topk_weights = topk_weights * self.routed_scaling_factor
+        return topk_indices, topk_weights
+
+
+@use_kernel_forward_from_hub("RMSNorm")
+class Glm4MoeRMSNorm(nn.Module):
+    def __init__(self, hidden_size, eps=1e-6):
+        """
+        Glm4MoeRMSNorm is equivalent to T5LayerNorm
+        """
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+
+    def forward(self, hidden_states):
+        input_dtype = hidden_states.dtype
+        hidden_states = hidden_states.to(torch.float32)
+        variance = hidden_states.pow(2).mean(-1, keepdim=True)
+        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+        return self.weight * hidden_states.to(input_dtype)
+
+    def extra_repr(self):
+        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
+
+
+class Glm4MoeMoE(nn.Module):
+    """
+    A mixture-of-experts module with routed experts plus shared experts.
+    """
+
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.experts = nn.ModuleList(
+            [
+                Glm4MoeMLP(config, intermediate_size=config.moe_intermediate_size)
+                for _ in range(config.n_routed_experts)
+            ]
+        )
+        self.gate = Glm4MoeTopkRouter(config)
+        self.shared_experts = Glm4MoeMLP(
+            config=config, intermediate_size=config.moe_intermediate_size * config.n_shared_experts
+        )
+
+    def moe(self, hidden_states: torch.Tensor, topk_indices: torch.Tensor, topk_weights: torch.Tensor):
+        r"""
+        CALL FOR CONTRIBUTION! I don't have time to optimise this right now, but expert weights need to be fused
+        to not have to do a loop here (deepseek has 256 experts soooo yeah).
+        """
+        final_hidden_states = torch.zeros_like(hidden_states, dtype=topk_weights.dtype)
+        expert_mask = torch.nn.functional.one_hot(topk_indices, num_classes=len(self.experts))
+        expert_mask = expert_mask.permute(2, 0, 1)
+
+        for expert_idx in range(len(self.experts)):
+            expert = self.experts[expert_idx]
+            mask = expert_mask[expert_idx]
+            token_indices, weight_indices = torch.where(mask)
+
+            if token_indices.numel() > 0:
+                expert_weights = topk_weights[token_indices, weight_indices]
+                expert_input = hidden_states[token_indices]
+                expert_output = expert(expert_input)
+                weighted_output = expert_output * expert_weights.unsqueeze(-1)
+                final_hidden_states.index_add_(0, token_indices, weighted_output)
+
+        # in original deepseek, the outputs of the experts are gathered once we leave this module
+        # thus the moe module is itself an IsolatedParallel module
+        # and all experts are "local", meaning we shard but we don't gather
+        return final_hidden_states.type(hidden_states.dtype)
+
+    def forward(self, hidden_states):
+        residuals = hidden_states
+        orig_shape = hidden_states.shape
+        topk_indices, topk_weights = self.gate(hidden_states)
+        hidden_states = hidden_states.view(-1, hidden_states.shape[-1])
+        hidden_states = self.moe(hidden_states, topk_indices, topk_weights).view(*orig_shape)
+        hidden_states = hidden_states + self.shared_experts(residuals)
+        return hidden_states
+
+
+class Glm4MoeDecoderLayer(GradientCheckpointingLayer):  # one block: self-attention, then dense MLP or MoE
+    def __init__(self, config: Glm4MoeConfig, layer_idx: int):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+
+        self.self_attn = Glm4MoeAttention(config=config, layer_idx=layer_idx)
+
+        if layer_idx >= config.first_k_dense_replace:  # layers before this index keep the dense MLP
+            self.mlp = Glm4MoeMoE(config)
+        else:
+            self.mlp = Glm4MoeMLP(config)
+
+        self.input_layernorm = Glm4MoeRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = Glm4MoeRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        use_cache: Optional[bool] = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> torch.Tensor:
+        residual = hidden_states  # kept for the skip connection around attention
+        hidden_states = self.input_layernorm(hidden_states)  # normalize before attention
+        # Self Attention (the second value returned by `self_attn` is discarded)
+        hidden_states, _ = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            position_embeddings=position_embeddings,
+            **kwargs,
+        )
+        hidden_states = residual + hidden_states
+
+        # Feed-forward: `self.mlp` is the dense MLP or the MoE block chosen in `__init__`
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)  # normalize before the feed-forward
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states
+        return hidden_states
+
+
+@auto_docstring
+class Glm4MoePreTrainedModel(PreTrainedModel):  # shared class-level settings and weight init for Glm4Moe models
+    config: Glm4MoeConfig
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["Glm4MoeDecoderLayer"]
+    _skip_keys_device_placement = ["past_key_values"]
+    _supports_flash_attn = True
+    _supports_sdpa = True
+    _supports_flex_attn = True
+    _can_compile_fullgraph = False
+    _supports_attention_backend = True
+    _can_record_outputs = {  # maps an output name to a module class
+        "hidden_states": Glm4MoeDecoderLayer,
+        "attentions": Glm4MoeAttention,
+    }
+
+    def _init_weights(self, module):
+        super()._init_weights(module)  # base-class initialization runs first
+        if isinstance(module, Glm4MoeTopkRouter):  # router weights: normal(mean=0, std=initializer_range)
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+
+
+class Glm4MoeRotaryEmbedding(nn.Module):  # produces the (cos, sin) rotary tables for given position ids
+    inv_freq: torch.Tensor  # fix linting for `register_buffer`
+
+    def __init__(self, config: Glm4MoeConfig, device=None):
+        super().__init__()
+        # BC: "rope_type" was originally "type", so the legacy key is used as a fallback
+        if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict):
+            self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
+        else:
+            self.rope_type = "default"  # config has no `rope_scaling` dict
+        self.max_seq_len_cached = config.max_position_embeddings
+        self.original_max_seq_len = config.max_position_embeddings
+
+        self.config = config
+        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]  # init function is looked up by rope type
+
+        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
+        self.register_buffer("inv_freq", inv_freq, persistent=False)  # non-persistent: not part of state_dict
+        self.original_inv_freq = self.inv_freq  # second reference to the initial frequencies
+
+    @torch.no_grad()
+    @dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
+    def forward(self, x, position_ids):
+        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
+        position_ids_expanded = position_ids[:, None, :].float()
+
+        device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
+        with torch.autocast(device_type=device_type, enabled=False):  # Force float32 ("mps" is mapped to "cpu" above)
+            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
+            emb = torch.cat((freqs, freqs), dim=-1)  # frequencies duplicated along the last dim
+            cos = emb.cos() * self.attention_scaling
+            sin = emb.sin() * self.attention_scaling
+
+        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)  # `x` only supplies the device and output dtype
+
+
+@auto_docstring
+class Glm4MoeModel(Glm4MoePreTrainedModel):  # token embeddings + decoder layer stack + final norm
+    _keys_to_ignore_on_load_unexpected = [r"model\.layers\.92.*", r"model\.layers\.46.*"]
+
+    def __init__(self, config: Glm4MoeConfig):
+        super().__init__(config)
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+        self.layers = nn.ModuleList(
+            [Glm4MoeDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+        )
+        self.norm = Glm4MoeRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.rotary_emb = Glm4MoeRotaryEmbedding(config=config)
+        self.gradient_checkpointing = False
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @check_model_inputs
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> BaseModelOutputWithPast:
+        if (input_ids is None) ^ (inputs_embeds is not None):  # true when both or neither are given
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+        if inputs_embeds is None:
+            inputs_embeds: torch.Tensor = self.embed_tokens(input_ids)
+
+        if use_cache and past_key_values is None:  # create a cache only when requested and none was passed
+            past_key_values = DynamicCache(config=self.config)
+
+        if cache_position is None:  # default: positions continue after the tokens already in the cache
+            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+            cache_position: torch.Tensor = torch.arange(
+                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
+            )
+
+        if position_ids is None:
+            position_ids = cache_position.unsqueeze(0)  # adds a leading dim of size 1
+
+        causal_mask = create_causal_mask(
+            config=self.config,
+            input_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            cache_position=cache_position,
+            past_key_values=past_key_values,
+            position_ids=position_ids,
+        )
+
+        hidden_states = inputs_embeds
+        position_embeddings = self.rotary_emb(hidden_states, position_ids)  # computed once, passed to every layer
+
+        for decoder_layer in self.layers[: self.config.num_hidden_layers]:
+            hidden_states = decoder_layer(
+                hidden_states,
+                attention_mask=causal_mask,
+                position_ids=position_ids,
+                past_key_values=past_key_values,
+                cache_position=cache_position,
+                position_embeddings=position_embeddings,
+                **kwargs,
+            )
+
+        hidden_states = self.norm(hidden_states)  # final norm after the last decoder layer
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values,
+        )
+
+
+@auto_docstring
+class Glm4MoeForCausalLM(Glm4MoePreTrainedModel, GenerationMixin):  # Glm4MoeModel plus a linear LM head
+    _tied_weights_keys = ["lm_head.weight"]
+    _tp_plan = {"lm_head": "colwise_rep"}
+    _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.model = Glm4MoeModel(config)
+        self.vocab_size = config.vocab_size
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)  # hidden states -> vocab logits
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> CausalLMOutputWithPast:  # NOTE(review): example checkpoint id below looks like a placeholder - confirm
+        r"""
+        Example:
+
+        ```python
+        >>> from transformers import AutoTokenizer, Glm4MoeForCausalLM
+
+        >>> model = Glm4MoeForCausalLM.from_pretrained("meta-glm4_moe/Glm4Moe-2-7b-hf")
+        >>> tokenizer = AutoTokenizer.from_pretrained("meta-glm4_moe/Glm4Moe-2-7b-hf")
+
+        >>> prompt = "Hey, are you conscious? Can you talk to me?"
+        >>> inputs = tokenizer(prompt, return_tensors="pt")
+
+        >>> # Generate
+        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
+        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
+        ```"""
+        outputs: BaseModelOutputWithPast = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            **kwargs,
+        )
+
+        hidden_states = outputs.last_hidden_state
+        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
+        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+        logits = self.lm_head(hidden_states[:, slice_indices, :])  # int 0 -> slice(0, None): all positions kept
+
+        loss = None
+        if labels is not None:  # loss is only computed when labels are provided
+            loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs)
+
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+__all__ = ["Glm4MoePreTrainedModel", "Glm4MoeModel", "Glm4MoeForCausalLM"]  # public names of this module
diff --git a/phivenv/Lib/site-packages/transformers/models/glm4_moe/modular_glm4_moe.py b/phivenv/Lib/site-packages/transformers/models/glm4_moe/modular_glm4_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..bc07483c7f22f6dfea4615f520839fd384be4a91
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/glm4_moe/modular_glm4_moe.py
@@ -0,0 +1,330 @@
+# coding=utf-8
+# Copyright 2025 The ZhipuAI Inc. team and HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch GLM-4-MOE model."""
+
+from typing import Optional
+
+import torch
+import torch.utils.checkpoint
+from torch import nn
+
+from ...configuration_utils import PretrainedConfig
+from ...modeling_rope_utils import rope_config_validation
+from ...utils import logging
+from ..cohere.modeling_cohere import CohereAttention
+from ..deepseek_v3.modeling_deepseek_v3 import (
+    DeepseekV3DecoderLayer,
+    DeepseekV3ForCausalLM,
+    DeepseekV3MLP,
+    DeepseekV3Model,
+    DeepseekV3PreTrainedModel,
+    DeepseekV3RMSNorm,
+    DeepseekV3TopkRouter,
+)
+from ..gpt_neox.modeling_gpt_neox import apply_rotary_pos_emb  # noqa
+
+
+logger = logging.get_logger(__name__)
+
+
+class Glm4MoeConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Glm4MoeModel`]. It is used to instantiate a
+    Glm4Moe model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of [THUDM/GLM-4-100B-A10B](https://huggingface.co/THUDM/GLM-4-100B-A10B).
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 151552):
+            Vocabulary size of the Glm4Moe model. Defines the number of different tokens that can be represented by the
+            `input_ids` passed when calling [`Glm4MoeModel`]
+        hidden_size (`int`, *optional*, defaults to 4096):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 10944):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 46):
+            Number of hidden layers in the Transformer decoder.
+        num_attention_heads (`int`, *optional*, defaults to 96):
+            Number of attention heads for each attention layer in the Transformer decoder.
+        partial_rotary_factor (`float`, *optional*, defaults to 0.5):
+            The factor of the partial rotary position.
+        num_key_value_heads (`int`, *optional*, defaults to 8):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details, check out [this
+            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to `8`.
+
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the decoder.
+        max_position_embeddings (`int`, *optional*, defaults to 131072):
+            The maximum sequence length that this model might ever be used with.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether the model's input and output word embeddings should be tied.
+        rope_theta (`float`, *optional*, defaults to 10000.0):
+            The base period of the RoPE embeddings.
+        rope_scaling (`Dict`, *optional*):
+            Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
+            and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
+            accordingly.
+            Expected contents:
+                `rope_type` (`str`):
+                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
+                    'llama3'], with 'default' being the original RoPE implementation.
+                `factor` (`float`, *optional*):
+                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
+                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
+                    original maximum pre-trained length.
+                `original_max_position_embeddings` (`int`, *optional*):
+                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
+                    pretraining.
+                `attention_factor` (`float`, *optional*):
+                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
+                    computation. If unspecified, it defaults to value recommended by the implementation, using the
+                    `factor` field to infer the suggested value.
+                `beta_fast` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 32.
+                `beta_slow` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 1.
+                `short_factor` (`list[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `long_factor` (`list[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to long contexts (>
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `low_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
+                `high_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
+        attention_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use a bias in the query, key and value projection layers (the output projection has no bias).
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        moe_intermediate_size (`int`, *optional*, defaults to 1408):
+            Intermediate size of the routed expert.
+        num_experts_per_tok (`int`, *optional*, defaults to 8):
+            Number of experts per token.
+        n_shared_experts (`int`, *optional*, defaults to 1):
+            Number of shared experts.
+        n_routed_experts (`int`, *optional*, defaults to 128):
+            Number of routed experts.
+        routed_scaling_factor (`float`, *optional*, defaults to 1.0):
+            Scaling factor for routed experts.
+        n_group (`int`, *optional*, defaults to 1):
+            Number of groups for routed experts.
+        topk_group (`int`, *optional*, defaults to 1):
+            Number of groups selected for each token (the experts selected for a token all come from these groups).
+        first_k_dense_replace (`int`, *optional*, defaults to 1):
+            Number of dense layers in shallow layers(embed->dense->dense->...->dense->moe->moe...->lm_head).
+                                                            \--k dense layers--/
+        norm_topk_prob (`bool`, *optional*, defaults to `True`):
+            Whether to normalize the topk probabilities.
+        use_qk_norm (`bool`, *optional*, defaults to `False`):
+            Whether to use query-key normalization in the attention.
+    ```python
+    >>> from transformers import Glm4MoeModel, Glm4MoeConfig
+
+    >>> # Initializing a Glm4Moe style configuration
+    >>> configuration = Glm4MoeConfig()
+
+    >>> # Initializing a model from the GLM-4-MOE-100B-A10B style configuration
+    >>> model = Glm4MoeModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "glm4_moe"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    # Default tensor parallel plan for base model `Glm4Moe`
+    base_model_tp_plan = {
+        "layers.*.self_attn.q_proj": "colwise",
+        "layers.*.self_attn.k_proj": "colwise",
+        "layers.*.self_attn.v_proj": "colwise",
+        "layers.*.self_attn.o_proj": "rowwise",
+        "layers.*.mlp.experts.*.gate_proj": "colwise",
+        "layers.*.mlp.experts.*.up_proj": "colwise",
+        "layers.*.mlp.experts.*.down_proj": "rowwise",
+        "layers.*.mlp.gate_proj": "colwise",
+        "layers.*.mlp.up_proj": "colwise",
+        "layers.*.mlp.down_proj": "rowwise",
+    }
+    base_model_pp_plan = {
+        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
+        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
+        "norm": (["hidden_states"], ["hidden_states"]),
+    }
+
+    def __init__(
+        self,
+        vocab_size=151552,
+        hidden_size=4096,
+        intermediate_size=10944,
+        num_hidden_layers=46,
+        num_attention_heads=96,
+        partial_rotary_factor=0.5,
+        num_key_value_heads=8,
+        hidden_act="silu",
+        max_position_embeddings=131072,
+        initializer_range=0.02,
+        rms_norm_eps=1e-5,
+        use_cache=True,
+        tie_word_embeddings=False,
+        rope_theta=10000.0,
+        rope_scaling=None,
+        attention_bias=False,
+        attention_dropout=0.0,
+        moe_intermediate_size=1408,
+        num_experts_per_tok=8,
+        n_shared_experts=1,
+        n_routed_experts=128,
+        routed_scaling_factor=1.0,
+        n_group=1,
+        topk_group=1,
+        first_k_dense_replace=1,
+        norm_topk_prob=True,
+        use_qk_norm=False,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.partial_rotary_factor = partial_rotary_factor
+
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+        # BC: a legacy 'type' key is copied to 'rope_type' (the passed-in dict itself is modified, 'type' is kept),
+        # then the rotary position embedding parameters are validated.
+        if self.rope_scaling is not None and "type" in self.rope_scaling:
+            self.rope_scaling["rope_type"] = self.rope_scaling["type"]
+        rope_config_validation(self)
+
+        # MoE arguments
+        self.moe_intermediate_size = moe_intermediate_size
+        self.num_experts_per_tok = num_experts_per_tok
+        self.n_group = n_group
+        self.topk_group = topk_group
+        self.n_shared_experts = n_shared_experts
+        self.n_routed_experts = n_routed_experts
+        self.routed_scaling_factor = routed_scaling_factor
+        self.first_k_dense_replace = first_k_dense_replace
+        self.norm_topk_prob = norm_topk_prob
+        self.use_qk_norm = use_qk_norm
+
+        super().__init__(
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+
+class Glm4MoeAttention(CohereAttention):  # only `__init__` is overridden; the rest comes from CohereAttention
+    def __init__(self, config: Glm4MoeConfig, layer_idx: Optional[int] = None):
+        nn.Module.__init__(self)  # calls nn.Module directly, so CohereAttention.__init__ does not run
+        self.config = config
+        self.layer_idx = layer_idx
+        self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
+        self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
+        self.scaling = self.head_dim**-0.5  # 1 / sqrt(head_dim)
+        self.rope_scaling = config.rope_scaling
+        self.attention_dropout = config.attention_dropout
+        self.is_causal = True
+
+        self.q_proj = nn.Linear(
+            config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.k_proj = nn.Linear(
+            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.v_proj = nn.Linear(
+            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
+        )  # q/k/v biases follow config.attention_bias; o_proj below is always bias-free
+        self.o_proj = nn.Linear(config.num_attention_heads * self.head_dim, config.hidden_size, bias=False)
+        self.use_qk_norm = config.use_qk_norm
+        if self.use_qk_norm:  # q_norm / k_norm only exist when enabled; both are sized to head_dim
+            self.q_norm = Glm4MoeRMSNorm(self.head_dim, eps=config.rms_norm_eps)
+            self.k_norm = Glm4MoeRMSNorm(self.head_dim, eps=config.rms_norm_eps)
+
+
+class Glm4MoeMLP(DeepseekV3MLP):  # inherits DeepseekV3MLP unchanged
+    pass
+
+
+class Glm4MoeTopkRouter(DeepseekV3TopkRouter):  # only `__init__` is overridden here
+    def __init__(self, config: Glm4MoeConfig):
+        nn.Module.__init__(self)  # calls nn.Module directly, so DeepseekV3TopkRouter.__init__ does not run
+        self.config = config
+        self.top_k = config.num_experts_per_tok
+        self.n_routed_experts = config.n_routed_experts
+        self.routed_scaling_factor = config.routed_scaling_factor
+        self.n_group = config.n_group
+        self.topk_group = config.topk_group
+        self.norm_topk_prob = config.norm_topk_prob
+
+        self.weight = nn.Parameter(torch.empty((self.n_routed_experts, config.hidden_size)))  # uninitialized here
+        self.register_buffer("e_score_correction_bias", torch.zeros((self.n_routed_experts), dtype=torch.float32))
+
+
+class Glm4MoeRMSNorm(DeepseekV3RMSNorm):  # inherits DeepseekV3RMSNorm unchanged
+    pass
+
+
+class Glm4MoeDecoderLayer(DeepseekV3DecoderLayer):  # inherits DeepseekV3DecoderLayer unchanged
+    pass
+
+
+class Glm4MoePreTrainedModel(DeepseekV3PreTrainedModel):  # sets only the class flag below
+    _can_compile_fullgraph = False
+
+
+class Glm4MoeModel(DeepseekV3Model):  # sets only the ignore-on-load key patterns below
+    _keys_to_ignore_on_load_unexpected = [r"model\.layers\.92.*", r"model\.layers\.46.*"]
+
+
+class Glm4MoeForCausalLM(DeepseekV3ForCausalLM):  # inherits DeepseekV3ForCausalLM unchanged
+    pass
+
+
+__all__ = [  # public names of this module
+    "Glm4MoeConfig",
+    "Glm4MoePreTrainedModel",
+    "Glm4MoeModel",
+    "Glm4MoeForCausalLM",
+]
diff --git a/phivenv/Lib/site-packages/transformers/models/glm4v/__init__.py b/phivenv/Lib/site-packages/transformers/models/glm4v/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..4216c137fbe200d424bf9283b0da2f6e60c2817c
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/glm4v/__init__.py
@@ -0,0 +1,28 @@
+# Copyright 2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:  # static type checkers see the real star-imports
+    from .configuration_glm4v import *
+    from .modeling_glm4v import *
+    from .processing_glm4v import *
+else:  # at runtime the module object is swapped for a `_LazyModule` built from this file
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/phivenv/Lib/site-packages/transformers/models/glm4v/__pycache__/__init__.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/glm4v/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5c50fcadacaff3b9d53a2b98395f0f0f6d357163
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/glm4v/__pycache__/__init__.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/glm4v/__pycache__/configuration_glm4v.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/glm4v/__pycache__/configuration_glm4v.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ed1ab879513640c91016bca0469e0f85897f249d
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/glm4v/__pycache__/configuration_glm4v.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/glm4v/__pycache__/image_processing_glm4v.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/glm4v/__pycache__/image_processing_glm4v.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cf899106d9eedecf80a253b70839cd9d33090307
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/glm4v/__pycache__/image_processing_glm4v.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/glm4v/__pycache__/image_processing_glm4v_fast.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/glm4v/__pycache__/image_processing_glm4v_fast.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2bf1a34c2db01ba69faeb0ccc08cb681619b36e8
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/glm4v/__pycache__/image_processing_glm4v_fast.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/glm4v/__pycache__/modeling_glm4v.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/glm4v/__pycache__/modeling_glm4v.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..07206ff9001a21bd1fa61ec416e5b1c23a215761
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/glm4v/__pycache__/modeling_glm4v.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/glm4v/__pycache__/modular_glm4v.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/glm4v/__pycache__/modular_glm4v.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2a7eae8d8ab1bb6e2f2fd26e88b1fb203b01b2ba
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/glm4v/__pycache__/modular_glm4v.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/glm4v/__pycache__/processing_glm4v.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/glm4v/__pycache__/processing_glm4v.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3fb6cd9ca9db3ad8685eef933eaedfc982818d78
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/glm4v/__pycache__/processing_glm4v.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/glm4v/__pycache__/video_processing_glm4v.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/glm4v/__pycache__/video_processing_glm4v.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..603002b2cdedd61b7e6053db12dc0ab30dc5992d
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/glm4v/__pycache__/video_processing_glm4v.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/glm4v/configuration_glm4v.py b/phivenv/Lib/site-packages/transformers/models/glm4v/configuration_glm4v.py
new file mode 100644
index 0000000000000000000000000000000000000000..e311cd246c8e4e2c18f1032d1b24e91302dd47fe
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/glm4v/configuration_glm4v.py
@@ -0,0 +1,353 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/glm4v/modular_glm4v.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_glm4v.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# coding=utf-8
+# Copyright 2025 The ZhipuAI Inc. team and HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from ...configuration_utils import PretrainedConfig
+from ...modeling_rope_utils import rope_config_validation
+
+
+class Glm4vVisionConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Glm4vVisionModel`]. It is used to instantiate a Glm4vVisionModel
+    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield
+    a similar configuration to that of
+    GLM-4.1V-9B-Thinking [THUDM/GLM-4.1V-9B-Thinking](https://huggingface.co/THUDM/GLM-4.1V-9B-Thinking).
+
+    Args:
+        hidden_size (`int`, *optional*, defaults to 1536):
+            Dimensionality of the encoder layers and the pooler layer.
+        depth (`int`, *optional*, defaults to 24):
+            Number of layers (depth) in the model.
+        attention_bias (`bool`, *optional*, defaults to `False`):
+            Whether to add a bias to the queries, keys and values.
+        intermediate_size (`int`, *optional*, defaults to 13696):
+            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"selu"` and `"gelu_new"` are supported.
+        num_heads (`int`, *optional*, defaults to 12):
+            Number of heads stored on the config; presumably the attention heads of each vision layer.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            Dropout probability for attention weights.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        image_size (`int` or `list[int]`, *optional*, defaults to 336):
+            The size (resolution) of each image.
+        patch_size (`int`, *optional*, defaults to `14`):
+            The size (resolution) of each patch.
+        in_channels (`int`, *optional*, defaults to 3):
+            The number of input channels.
+        out_hidden_size (`int`, *optional*, defaults to 4096):
+            The output hidden size of the vision model.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the rms normalization layers.
+        spatial_merge_size (`int`, *optional*, defaults to 2):
+            The size used for merging spatial dimensions.
+        temporal_patch_size (`int`, *optional*, defaults to 2):
+            The size used for patches along the temporal dimension.
+        kwargs (*optional*):
+            Extra keyword arguments, forwarded unchanged to `PretrainedConfig.__init__`.
+    Example:
+
+    ```python
+    >>> from transformers import Glm4vVisionConfig, Glm4vVisionModel
+
+    >>> # Initializing a Glm4vVisionConfig GLM-4.1V-9B style configuration
+    >>> configuration = Glm4vVisionConfig()
+
+    >>> # Initializing a model (with random weights) from the GLM-4.1V-9B configuration
+    >>> model = Glm4vVisionModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "glm4v"
+    base_config_key = "vision_config"
+
+    def __init__(
+        self,
+        depth=24,
+        hidden_size=1536,
+        hidden_act="silu",
+        attention_bias=False,
+        attention_dropout=0.0,
+        num_heads=12,
+        in_channels=3,
+        image_size=336,
+        patch_size=14,
+        rms_norm_eps=1e-05,
+        spatial_merge_size=2,
+        temporal_patch_size=2,
+        out_hidden_size=4096,
+        intermediate_size=13696,
+        initializer_range=0.02,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        self.depth = depth
+        self.hidden_size = hidden_size
+        self.hidden_act = hidden_act
+        self.num_heads = num_heads
+        self.in_channels = in_channels
+        self.image_size = image_size
+        self.patch_size = patch_size
+        self.spatial_merge_size = spatial_merge_size
+        self.temporal_patch_size = temporal_patch_size
+        self.out_hidden_size = out_hidden_size
+        self.intermediate_size = intermediate_size
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+
+
+class Glm4vTextConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Glm4vModel`]. It is used to instantiate a
+    GLM-4.1V model according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of
+    GLM-4.1V-9B-Thinking [THUDM/GLM-4.1V-9B-Thinking](https://huggingface.co/THUDM/GLM-4.1V-9B-Thinking).
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 151552):
+            Vocabulary size of the Glm4v model. Defines the number of different tokens that can be represented by the
+            `input_ids` passed when calling [`Glm4vModel`]
+        hidden_size (`int`, *optional*, defaults to 4096):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 13696):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 40):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 32):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        num_key_value_heads (`int`, *optional*, defaults to 2):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details checkout [this
+            paper](https://huggingface.co/papers/2305.13245). If `None` is passed, it falls back to `num_attention_heads`.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the decoder.
+        max_position_embeddings (`int`, *optional*, defaults to 32768):
+            The maximum sequence length that this model might ever be used with.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether the model's input and output word embeddings should be tied.
+        rope_theta (`float`, *optional*, defaults to 10000.0):
+            The base period of the RoPE embeddings.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        rope_scaling (`Dict`, *optional*):
+            Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
+            and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
+            accordingly.
+            Expected contents:
+                `rope_type` (`str`):
+                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
+                    'llama3'], with 'default' being the original RoPE implementation.
+                `factor` (`float`, *optional*):
+                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
+                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
+                    original maximum pre-trained length.
+                `original_max_position_embeddings` (`int`, *optional*):
+                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
+                    pretraining.
+                `attention_factor` (`float`, *optional*):
+                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
+                    computation. If unspecified, it defaults to value recommended by the implementation, using the
+                    `factor` field to infer the suggested value.
+        image_token_id (`int`, *optional*):
+            Token index used as placeholder for image embeddings.
+        video_token_id (`int`, *optional*):
+            Token index used as placeholder for video embeddings.
+
+    ```python
+    >>> from transformers import Glm4vTextModel, Glm4vConfig
+
+    >>> # Initializing a GLM-4.1V style configuration
+    >>> configuration = Glm4vConfig()
+
+    >>> # Initializing a model from the GLM-4.1V style configuration
+    >>> model = Glm4vTextModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "glm4v_text"
+    base_config_key = "text_config"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    # Default tensor parallel plan for base model `Glm4v`
+    base_model_tp_plan = {
+        "layers.*.self_attn.q_proj": "colwise",
+        "layers.*.self_attn.k_proj": "colwise",
+        "layers.*.self_attn.v_proj": "colwise",
+        "layers.*.self_attn.o_proj": "rowwise",
+        "layers.*.mlp.gate_up_proj": "colwise_rep",  # we need to replicate here due to the `chunk` operation
+        "layers.*.mlp.down_proj": "rowwise_rep",  # we need to replicate here due to the `chunk` operation
+    }
+    base_model_pp_plan = {
+        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
+        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
+        "norm": (["hidden_states"], ["hidden_states"]),
+    }
+
+    def __init__(
+        self,
+        vocab_size=151552,
+        hidden_size=4096,
+        intermediate_size=13696,
+        num_hidden_layers=40,
+        num_attention_heads=32,
+        num_key_value_heads=2,
+        hidden_act="silu",
+        max_position_embeddings=32768,
+        initializer_range=0.02,
+        rms_norm_eps=1e-05,
+        use_cache=True,
+        tie_word_embeddings=False,
+        rope_theta=10000.0,
+        attention_dropout=0.0,
+        rope_scaling=None,
+        image_token_id=None,
+        video_token_id=None,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.attention_dropout = attention_dropout
+        self.rope_scaling = rope_scaling
+
+        # Validate the correctness of rotary position embeddings parameters
+        # BC: if there is a 'type' field, copy it to 'rope_type' ('type' is kept; this writes into the caller's dict).
+        if self.rope_scaling is not None and "type" in self.rope_scaling:
+            self.rope_scaling["rope_type"] = self.rope_scaling["type"]
+        rope_config_validation(self, ignore_keys={"mrope_section"})
+        self.image_token_id = image_token_id
+        self.video_token_id = video_token_id
+
+        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)  # runs last, after the fields above are set
+
+
+class Glm4vConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Glm4vModel`]. It is used to instantiate a
+    GLM-4.1V model according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of
+    GLM-4.1V-9B-Thinking [THUDM/GLM-4.1V-9B-Thinking](https://huggingface.co/THUDM/GLM-4.1V-9B-Thinking).
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        text_config (`Union[PreTrainedConfig, dict]`, *optional*, defaults to `Glm4vTextConfig`):
+            The config object or dictionary of the text backbone. When `None`, one is built from `**kwargs`.
+        vision_config (`Union[PreTrainedConfig, dict]`,  *optional*, defaults to `Glm4vVisionConfig`):
+            The config object or dictionary of the vision backbone. When `None`, the default config is used.
+        image_token_id (`int`, *optional*, defaults to 151343):
+            The image token index to encode the image prompt.
+        video_token_id (`int`, *optional*, defaults to 151344):
+            The video token index to encode the video prompt.
+        image_start_token_id (`int`, *optional*, defaults to 151339):
+            The image start token index to encode the start of image.
+        image_end_token_id (`int`, *optional*, defaults to 151340):
+            The image end token index to encode the end of image.
+        video_start_token_id (`int`, *optional*, defaults to 151341):
+            The video start token index to encode the start of video.
+        video_end_token_id (`int`, *optional*, defaults to 151342):
+            The video end token index to encode the end of video.
+
+    ```python
+    >>> from transformers import Glm4vForConditionalGeneration, Glm4vConfig
+
+    >>> # Initializing a GLM-4.1V style configuration
+    >>> configuration = Glm4vConfig()
+
+    >>> # Initializing a model from the GLM-4.1V style configuration
+    >>> model = Glm4vForConditionalGeneration(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "glm4v"
+    sub_configs = {"vision_config": Glm4vVisionConfig, "text_config": Glm4vTextConfig}
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        text_config=None,
+        vision_config=None,
+        image_token_id=151343,
+        video_token_id=151344,
+        image_start_token_id=151339,
+        image_end_token_id=151340,
+        video_start_token_id=151341,
+        video_end_token_id=151342,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        # dict -> build the sub-config; None -> defaults; a ready-made config object is kept as-is (was dropped before).
+        if isinstance(vision_config, dict):
+            vision_config = self.sub_configs["vision_config"](**vision_config)
+        elif vision_config is None:
+            vision_config = self.sub_configs["vision_config"]()
+        if isinstance(text_config, dict):
+            text_config = self.sub_configs["text_config"](**text_config)
+        elif text_config is None:
+            # For BC use all kwargs to init `TextConfig`
+            text_config = self.sub_configs["text_config"](**kwargs)
+        self.vision_config, self.text_config = vision_config, text_config
+
+        self.image_token_id = image_token_id
+        self.video_token_id = video_token_id
+        self.video_start_token_id = video_start_token_id
+        self.video_end_token_id = video_end_token_id
+        self.image_start_token_id = image_start_token_id
+        self.image_end_token_id = image_end_token_id
+
+
+__all__ = ["Glm4vConfig", "Glm4vTextConfig"]  # NOTE(review): Glm4vVisionConfig is defined above but not exported - confirm intended
diff --git a/phivenv/Lib/site-packages/transformers/models/glm4v/image_processing_glm4v.py b/phivenv/Lib/site-packages/transformers/models/glm4v/image_processing_glm4v.py
new file mode 100644
index 0000000000000000000000000000000000000000..d669701cd9e24688a28ee825f8462fe57af685f3
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/glm4v/image_processing_glm4v.py
@@ -0,0 +1,472 @@
+# coding=utf-8
+# Copyright 2025 The ZhipuAI Inc. team and HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Image processor class for GLM-4.1V."""
+
+import math
+from typing import Optional, Union
+
+import numpy as np
+
+from ...image_processing_utils import BaseImageProcessor, BatchFeature
+from ...image_transforms import (
+    convert_to_rgb,
+    resize,
+    to_channel_dimension_format,
+)
+from ...image_utils import (
+    OPENAI_CLIP_MEAN,
+    OPENAI_CLIP_STD,
+    ChannelDimension,
+    ImageInput,
+    PILImageResampling,
+    get_image_size,
+    infer_channel_dimension_format,
+    is_scaled_image,
+    make_flat_list_of_images,
+    make_list_of_images,
+    to_numpy_array,
+    valid_images,
+    validate_preprocess_arguments,
+)
+from ...utils import TensorType, logging
+from ...video_utils import VideoInput
+
+
+logger = logging.get_logger(__name__)
+
+
+def smart_resize(  # returns (h_bar, w_bar): the target height and width, each a multiple of `factor`
+    num_frames: int,
+    height: int,
+    width: int,
+    temporal_factor: int = 2,
+    factor: int = 28,
+    min_pixels: int = 112 * 112,
+    max_pixels: int = 14 * 14 * 2 * 2 * 2 * 6144,
+):
+    if num_frames < temporal_factor:  # equal values pass, despite the "larger than" wording of the message
+        raise ValueError(f"t:{num_frames} must be larger than temporal_factor:{temporal_factor}")
+    if height < factor or width < factor:  # likewise, a side exactly equal to `factor` is accepted
+        raise ValueError(f"height:{height} or width:{width} must be larger than factor:{factor}")
+    elif max(height, width) / min(height, width) > 200:  # reject extreme aspect ratios
+        raise ValueError(
+            f"absolute aspect ratio must be smaller than 200, got {max(height, width) / min(height, width)}"
+        )
+    h_bar = round(height / factor) * factor  # snap each side to the nearest multiple of `factor`
+    w_bar = round(width / factor) * factor
+    t_bar = round(num_frames / temporal_factor) * temporal_factor  # only used in the volume checks below
+
+    if t_bar * h_bar * w_bar > max_pixels:  # frames x height x width is over budget: shrink both sides
+        beta = math.sqrt((num_frames * height * width) / max_pixels)
+        h_bar = max(factor, math.floor(height / beta / factor) * factor)  # round down, but never below `factor`
+        w_bar = max(factor, math.floor(width / beta / factor) * factor)
+    elif t_bar * h_bar * w_bar < min_pixels:  # under budget: enlarge both sides, rounding up
+        beta = math.sqrt(min_pixels / (num_frames * height * width))
+        h_bar = math.ceil(height * beta / factor) * factor
+        w_bar = math.ceil(width * beta / factor) * factor
+
+    return h_bar, w_bar
+
+
+class Glm4vImageProcessor(BaseImageProcessor):
+    r"""
+    Constructs a GLM-4V image processor that dynamically resizes images based on the original images.
+
+    Args:
+        do_resize (`bool`, *optional*, defaults to `True`):
+            Whether to resize the image's (height, width) dimensions.
+        size (`Dict[str, int]` *optional*, defaults to `{"shortest_edge": 112 * 112, "longest_edge": 28 * 28 * 15000}`):
+            Size of the image's `(height, width)` dimensions after resizing. Can be overridden by the `size` parameter
+            in the `preprocess` method. Available options are:
+                - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`.
+                    Do NOT keep the aspect ratio.
+                - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting
+                    the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge
+                    less or equal to `longest_edge`.
+                - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the
+                    aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to
+                    `max_width`.
+        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
+            Resampling filter to use when resizing the image.
+        do_rescale (`bool`, *optional*, defaults to `True`):
+            Whether to rescale the image by the specified scale `rescale_factor`.
+        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
+            Scale factor to use if rescaling the image.
+        do_normalize (`bool`, *optional*, defaults to `True`):
+            Whether to normalize the image.
+        image_mean (`float` or `List[float]`, *optional*, defaults to `[0.48145466, 0.4578275, 0.40821073]`):
+            Mean to use if normalizing the image. This is a float or list of floats for each channel in the image.
+        image_std (`float` or `List[float]`, *optional*, defaults to `[0.26862954, 0.26130258, 0.27577711]`):
+            Standard deviation to use if normalizing the image. This is a float or list of floats for each channel in the image.
+        do_convert_rgb (`bool`, *optional*, defaults to `True`):
+            Whether to convert the image to RGB.
+        patch_size (`int`, *optional*, defaults to 14):
+            The spatial patch size of the vision encoder.
+        temporal_patch_size (`int`, *optional*, defaults to 2):
+            The temporal patch size of the vision encoder.
+        merge_size (`int`, *optional*, defaults to 2):
+            The merge size of the vision encoder to llm encoder.
+    """
+
+    model_input_names = ["pixel_values", "image_grid_thw"]
+
+    def __init__(  # store the preprocessing defaults on the instance; `size` is validated or defaulted first
+        self,
+        do_resize: bool = True,
+        size: Optional[dict[str, int]] = None,
+        resample: PILImageResampling = PILImageResampling.BICUBIC,
+        do_rescale: bool = True,
+        rescale_factor: Union[int, float] = 1 / 255,
+        do_normalize: bool = True,
+        image_mean: Optional[Union[float, list[float]]] = None,
+        image_std: Optional[Union[float, list[float]]] = None,
+        do_convert_rgb: bool = True,
+        patch_size: int = 14,
+        temporal_patch_size: int = 2,
+        merge_size: int = 2,
+        **kwargs,
+    ) -> None:
+        super().__init__(**kwargs)
+        if size is not None and ("shortest_edge" not in size or "longest_edge" not in size):  # both keys are mandatory
+            raise ValueError("size must contain 'shortest_edge' and 'longest_edge' keys.")
+        elif size is None:
+            size = {"shortest_edge": 112 * 112, "longest_edge": 28 * 28 * 15000}  # `_preprocess` passes these on as min/max pixels
+        self.size = size
+
+        self.do_resize = do_resize
+        self.resample = resample
+        self.do_rescale = do_rescale
+        self.rescale_factor = rescale_factor
+        self.do_normalize = do_normalize
+        self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN  # fall back to the CLIP constants
+        self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
+
+        self.patch_size = patch_size
+        self.temporal_patch_size = temporal_patch_size
+        self.merge_size = merge_size
+        self.do_convert_rgb = do_convert_rgb
+
+    def _preprocess(
+        self,
+        images: Union[ImageInput, VideoInput],
+        do_resize: Optional[bool] = None,
+        size: Optional[dict[str, int]] = None,
+        resample: PILImageResampling = None,
+        do_rescale: Optional[bool] = None,
+        rescale_factor: Optional[float] = None,
+        do_normalize: Optional[bool] = None,
+        image_mean: Optional[Union[float, list[float]]] = None,
+        image_std: Optional[Union[float, list[float]]] = None,
+        patch_size: Optional[int] = None,
+        temporal_patch_size: Optional[int] = None,
+        merge_size: Optional[int] = None,
+        do_convert_rgb: Optional[bool] = None,
+        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    ):
+        """
+        Preprocess an image or batch of images. Copy of the `preprocess` method from `CLIPImageProcessor`.
+
+        Args:
+            images (`ImageInput`):
+                Image or batch of images to preprocess. Expects pixel values ranging from 0 to 255. If pixel values range from 0 to 1, set `do_rescale=False`.
+            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
+                Whether to resize the image.
+            size (`Dict[str, int]`, *optional*, defaults to `self.size`):
+                Size of the image after resizing. `shortest_edge` and `longest_edge` keys must be present.
+            resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
+                Resampling filter to use if resizing the image. This can be one of the `PILImageResampling` enums.
+            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
+                Whether to rescale the image.
+            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
+                Scale factor to use if rescaling the image.
+            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
+                Whether to normalize the image.
+            image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
+                Mean to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image.
+            image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
+                Standard deviation to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image.
+            patch_size (`int`, *optional*, defaults to `self.patch_size`):
+                The spatial patch size of the vision encoder.
+            temporal_patch_size (`int`, *optional*, defaults to `self.temporal_patch_size`):
+                The temporal patch size of the vision encoder.
+            merge_size (`int`, *optional*, defaults to `self.merge_size`):
+                The merge size of the vision encoder to llm encoder.
+            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
+                Whether to convert the image to RGB.
+            data_format (`ChannelDimension`, *optional*, defaults to `ChannelDimension.FIRST`):
+                The channel dimension format for the output image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - Unset: Use the channel dimension format of the input image.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the input image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+        Returns:
+            `(flatten_patches, (grid_t, grid_h, grid_w))`: the 2D array of flattened patches and the patch-grid dimensions.
+        """
+        images = make_list_of_images(images)
+
+        if do_convert_rgb:
+            images = [convert_to_rgb(image) for image in images]
+
+        # All transformations expect numpy arrays.
+        images = [to_numpy_array(image) for image in images]
+
+        if do_rescale and is_scaled_image(images[0]):  # only the first image is inspected
+            logger.warning_once(
+                "It looks like you are trying to rescale already rescaled images. If the input"
+                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
+            )
+        if input_data_format is None:
+            # We assume that all images have the same channel dimension format.
+            input_data_format = infer_channel_dimension_format(images[0])
+
+        height, width = get_image_size(images[0], channel_dim=input_data_format)  # first image's size is reused for all
+        resized_height, resized_width = height, width  # values used for the grid when do_resize is falsy
+        processed_images = []
+        for image in images:
+            if do_resize:
+                resized_height, resized_width = smart_resize(
+                    num_frames=temporal_patch_size,  # the frame count passed here is temporal_patch_size, not len(images)
+                    height=height,
+                    width=width,
+                    temporal_factor=temporal_patch_size,
+                    factor=patch_size * merge_size,
+                    min_pixels=size["shortest_edge"],  # the two `size` entries act as pixel budgets here
+                    max_pixels=size["longest_edge"],
+                )
+                image = resize(
+                    image, size=(resized_height, resized_width), resample=resample, input_data_format=input_data_format
+                )
+
+            if do_rescale:
+                image = self.rescale(image, scale=rescale_factor, input_data_format=input_data_format)
+
+            if do_normalize:
+                image = self.normalize(
+                    image=image, mean=image_mean, std=image_std, input_data_format=input_data_format
+                )
+
+            image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
+            processed_images.append(image)
+
+        patches = np.array(processed_images)  # stack the frames along a new leading axis
+        if data_format == ChannelDimension.LAST:
+            patches = patches.transpose(0, 3, 1, 2)  # move channels to axis 1, as the reshape below expects
+        if patches.shape[0] % temporal_patch_size != 0:  # pad with copies of the last frame up to a multiple
+            repeats = np.repeat(
+                patches[-1][np.newaxis], temporal_patch_size - (patches.shape[0] % temporal_patch_size), axis=0
+            )
+            patches = np.concatenate([patches, repeats], axis=0)
+        channel = patches.shape[1]
+        grid_t = patches.shape[0] // temporal_patch_size
+        grid_h, grid_w = resized_height // patch_size, resized_width // patch_size
+        patches = patches.reshape(  # split the frame axis and both spatial axes into grid / merge / patch factors
+            grid_t,
+            temporal_patch_size,
+            channel,
+            grid_h // merge_size,
+            merge_size,
+            patch_size,
+            grid_w // merge_size,
+            merge_size,
+            patch_size,
+        )
+        patches = patches.transpose(0, 3, 6, 4, 7, 2, 1, 5, 8)  # grid axes first, then channel/temporal/patch axes
+        flatten_patches = patches.reshape(  # one row per (t, h, w) grid cell
+            grid_t * grid_h * grid_w, channel * temporal_patch_size * patch_size * patch_size
+        )
+
+        return flatten_patches, (grid_t, grid_h, grid_w)
+
+    def preprocess(
+        self,
+        images: ImageInput,
+        videos: VideoInput = None,
+        do_resize: Optional[bool] = None,
+        size: Optional[dict[str, int]] = None,
+        resample: PILImageResampling = None,
+        do_rescale: Optional[bool] = None,
+        rescale_factor: Optional[float] = None,
+        do_normalize: Optional[bool] = None,
+        image_mean: Optional[Union[float, list[float]]] = None,
+        image_std: Optional[Union[float, list[float]]] = None,
+        patch_size: Optional[int] = None,
+        temporal_patch_size: Optional[int] = None,
+        merge_size: Optional[int] = None,
+        do_convert_rgb: Optional[bool] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    ):
+        """
+        Preprocess one image or a batch of images into flattened patches and their `(t, h, w)` patch grids.
+
+        Args:
+            images (`ImageInput`):
+                Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
+                passing in images with pixel values between 0 and 1, set `do_rescale=False`.
+            videos (`VideoInput`):
+                Accepted in the signature but not read anywhere in this method.
+            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
+                Whether to resize the image.
+            size (`Dict[str, int]`, *optional*):
+                Dictionary that must contain the keys `"shortest_edge"` and `"longest_edge"`. When unset, the literal
+                default `{"shortest_edge": 112 * 112, "longest_edge": 28 * 28 * 15000}` is used (`self.size` is not
+                consulted here). Both values are forwarded to `_preprocess`, which hands them to `smart_resize` as
+                `min_pixels` and `max_pixels` respectively.
+            resample (`int`, *optional*, defaults to `self.resample`):
+                Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only
+                has an effect if `do_resize` is set to `True`.
+            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
+                Whether to rescale the image.
+            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
+                Rescale factor to rescale the image by if `do_rescale` is set to `True`.
+            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
+                Whether to normalize the image.
+            image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
+                Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.
+            image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
+                Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to
+                `True`.
+            patch_size (`int`, *optional*, defaults to `self.patch_size`):
+                The spatial patch size of the vision encoder.
+            temporal_patch_size (`int`, *optional*, defaults to `self.temporal_patch_size`):
+                The temporal patch size of the vision encoder.
+            merge_size (`int`, *optional*, defaults to `self.merge_size`):
+                The merge size of the vision encoder to llm encoder.
+            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
+                Whether to convert the image to RGB.
+            return_tensors (`str` or `TensorType`, *optional*):
+                The type of tensors to return. Can be one of:
+                - Unset: Return a list of `np.ndarray`.
+                - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
+                - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
+                - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
+                - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
+            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
+                The channel dimension format for the output image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - Unset: Use the channel dimension format of the input image.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the input image. If unset, the channel dimension format is inferred
+                from the input image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+
+        Returns:
+            `BatchFeature`: When `images` is not `None`, holds `"pixel_values"` (the flattened patches of all images
+            concatenated along the first axis) and `"image_grid_thw"` (one `(grid_t, grid_h, grid_w)` entry per
+            image). When `images` is `None`, the returned feature carries no data.
+
+        Raises:
+            ValueError: If `size` is given without both `"shortest_edge"` and `"longest_edge"`, or if `images`
+                contains an unsupported image type.
+        """
+        # Validate a caller-supplied `size`; otherwise fall back to the literal default budget.
+        if size is not None and ("shortest_edge" not in size or "longest_edge" not in size):
+            raise ValueError("size must contain 'shortest_edge' and 'longest_edge' keys.")
+        elif size is None:
+            size = {"shortest_edge": 112 * 112, "longest_edge": 28 * 28 * 15000}
+
+        # Every other per-call argument left as `None` falls back to the attribute of the same name on `self`.
+        do_resize = do_resize if do_resize is not None else self.do_resize
+
+        resample = resample if resample is not None else self.resample
+        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
+        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
+        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
+        image_mean = image_mean if image_mean is not None else self.image_mean
+        image_std = image_std if image_std is not None else self.image_std
+        patch_size = patch_size if patch_size is not None else self.patch_size
+        temporal_patch_size = temporal_patch_size if temporal_patch_size is not None else self.temporal_patch_size
+        merge_size = merge_size if merge_size is not None else self.merge_size
+        do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
+
+        if images is not None:
+            images = self.fetch_images(images)
+            # Flatten so the loop below sees one image per iteration.
+            images = make_flat_list_of_images(images)
+
+        if images is not None and not valid_images(images):
+            raise ValueError(
+                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
+                "torch.Tensor, tf.Tensor or jax.ndarray."
+            )
+
+        validate_preprocess_arguments(
+            rescale_factor=rescale_factor,
+            do_normalize=do_normalize,
+            image_mean=image_mean,
+            image_std=image_std,
+            do_resize=do_resize,
+            size=size,
+            resample=resample,
+        )
+
+        data = {}
+        if images is not None:
+            pixel_values, vision_grid_thws = [], []
+            # Each image goes through `_preprocess` on its own, so every image gets its own patch grid.
+            for image in images:
+                patches, image_grid_thw = self._preprocess(
+                    image,
+                    do_resize=do_resize,
+                    size=size,
+                    resample=resample,
+                    do_rescale=do_rescale,
+                    rescale_factor=rescale_factor,
+                    do_normalize=do_normalize,
+                    image_mean=image_mean,
+                    image_std=image_std,
+                    patch_size=patch_size,
+                    temporal_patch_size=temporal_patch_size,
+                    merge_size=merge_size,
+                    data_format=data_format,
+                    do_convert_rgb=do_convert_rgb,
+                    input_data_format=input_data_format,
+                )
+                # `extend` (not `append`): the patch rows of all images end up in one flat sequence.
+                pixel_values.extend(patches)
+                vision_grid_thws.append(image_grid_thw)
+            pixel_values = np.array(pixel_values)
+            vision_grid_thws = np.array(vision_grid_thws)
+            data.update({"pixel_values": pixel_values, "image_grid_thw": vision_grid_thws})
+
+        return BatchFeature(data=data, tensor_type=return_tensors)
+
+    def get_number_of_image_patches(self, height: int, width: int, images_kwargs=None):
+        """
+        A utility that returns number of image patches for a given image size.
+
+        The image size is first passed through `smart_resize` (using `self.temporal_patch_size` for both the frame
+        count and the temporal factor); the patch count is then taken from the resized height and width.
+
+        Args:
+            height (`int`):
+                Height of the input image.
+            width (`int`):
+                Width of the input image.
+            images_kwargs (`dict`, *optional*)
+                Any kwargs to override defaults of the image processor. The keys read here are `"patch_size"`,
+                `"merge_size"` and `"size"`. `None` (the default) means no overrides.
+        Returns:
+            `int`: Number of image patches per image, i.e. `(resized_height // patch_size) * (resized_width //
+            patch_size)`.
+        """
+        # The signature defaults to `None`; without this guard the `.get` calls below raise `AttributeError`.
+        if images_kwargs is None:
+            images_kwargs = {}
+
+        patch_size = images_kwargs.get("patch_size", self.patch_size)
+        merge_size = images_kwargs.get("merge_size", self.merge_size)
+        size = images_kwargs.get("size", {"shortest_edge": 112 * 112, "longest_edge": 28 * 28 * 15000})
+
+        factor = patch_size * merge_size
+        resized_height, resized_width = smart_resize(
+            num_frames=self.temporal_patch_size,
+            height=height,
+            width=width,
+            factor=factor,
+            min_pixels=size["shortest_edge"],
+            max_pixels=size["longest_edge"],
+            temporal_factor=self.temporal_patch_size,
+        )
+        grid_h, grid_w = resized_height // patch_size, resized_width // patch_size
+        return grid_h * grid_w
+
+
+__all__ = ["Glm4vImageProcessor"]
diff --git a/phivenv/Lib/site-packages/transformers/models/glm4v/image_processing_glm4v_fast.py b/phivenv/Lib/site-packages/transformers/models/glm4v/image_processing_glm4v_fast.py
new file mode 100644
index 0000000000000000000000000000000000000000..d93bc5370219d318b58fc51f1a815f072c0b4a69
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/glm4v/image_processing_glm4v_fast.py
@@ -0,0 +1,205 @@
+# coding=utf-8
+# Copyright 2025 The ZhipuAI Inc. team and HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Fast Image processor class for GLM-4.1V."""
+
+from typing import Optional, Union
+
+from ...image_processing_utils import (
+    BatchFeature,
+)
+from ...image_processing_utils_fast import (
+    BaseImageProcessorFast,
+    DefaultFastImageProcessorKwargs,
+)
+from ...image_utils import (
+    OPENAI_CLIP_MEAN,
+    OPENAI_CLIP_STD,
+    ImageInput,
+    PILImageResampling,
+    SizeDict,
+)
+from ...processing_utils import Unpack
+from ...utils import (
+    TensorType,
+    auto_docstring,
+    is_torch_available,
+    is_torchvision_available,
+    is_torchvision_v2_available,
+    logging,
+)
+from .image_processing_glm4v import smart_resize
+
+
+if is_torch_available():
+    import torch
+
+if is_torchvision_available():
+    if is_torchvision_v2_available():
+        from torchvision.transforms.v2 import functional as F
+    else:
+        from torchvision.transforms import functional as F
+
+logger = logging.get_logger(__name__)
+
+
+class Glm4vFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
+    """
+    patch_size (`int`, *optional*, defaults to 14):
+        The spatial patch size of the vision encoder.
+    temporal_patch_size (`int`, *optional*, defaults to 2):
+        The temporal patch size of the vision encoder.
+    merge_size (`int`, *optional*, defaults to 2):
+        The merge size of the vision encoder to llm encoder.
+    """
+
+    # GLM-4V specific additions on top of the inherited kwargs. `Glm4vImageProcessorFast` sets this class as its
+    # `valid_kwargs`, uses it to type its `**kwargs`, and carries the documented defaults (14, 2, 2) as class
+    # attributes.
+    patch_size: Optional[int]
+    temporal_patch_size: Optional[int]
+    merge_size: Optional[int]
+
+
+@auto_docstring
+class Glm4vImageProcessorFast(BaseImageProcessorFast):
+    # Class-level defaults for the processor. `size` is a pixel budget rather than an edge length: `_preprocess`
+    # passes `shortest_edge` / `longest_edge` to `smart_resize` as `min_pixels` / `max_pixels`.
+    do_resize = True
+    resample = PILImageResampling.BICUBIC
+    size = {"shortest_edge": 112 * 112, "longest_edge": 28 * 28 * 15000}
+    do_rescale = True
+    do_normalize = True
+    image_mean = OPENAI_CLIP_MEAN
+    image_std = OPENAI_CLIP_STD
+    do_convert_rgb = True
+    patch_size = 14
+    temporal_patch_size = 2
+    merge_size = 2
+    valid_kwargs = Glm4vFastImageProcessorKwargs
+    model_input_names = ["pixel_values", "image_grid_thw"]
+
+    def __init__(self, **kwargs: Unpack[Glm4vFastImageProcessorKwargs]):
+        super().__init__(**kwargs)
+        # Checked after the base initializer so that the value actually stored on the instance is what is validated.
+        if self.size is not None and (
+            self.size.get("shortest_edge", None) is None or self.size.get("longest_edge", None) is None
+        ):
+            raise ValueError("size must contain 'shortest_edge' and 'longest_edge' keys.")
+
+    def _further_process_kwargs(
+        self,
+        size: Optional[SizeDict] = None,
+        **kwargs,
+    ) -> dict:
+        """
+        Update kwargs that need further processing before being validated
+        Can be overridden by subclasses to customize the processing of kwargs.
+
+        This override only adds a check that a provided `size` carries both `shortest_edge` and `longest_edge`, then
+        defers to the parent implementation.
+
+        Raises:
+            ValueError: If `size` is given without both keys.
+        """
+        if size is not None and ("shortest_edge" not in size or "longest_edge" not in size):
+            raise ValueError("size must contain 'shortest_edge' and 'longest_edge' keys.")
+
+        return super()._further_process_kwargs(size=size, **kwargs)
+
+    def _preprocess(
+        self,
+        images: list["torch.Tensor"],
+        do_resize: bool,
+        size: SizeDict,
+        interpolation: Optional["F.InterpolationMode"],
+        do_rescale: bool,
+        rescale_factor: float,
+        do_normalize: bool,
+        image_mean: Optional[Union[float, list[float]]],
+        image_std: Optional[Union[float, list[float]]],
+        patch_size: int,
+        temporal_patch_size: int,
+        merge_size: int,
+        disable_grouping: Optional[bool],
+        return_tensors: Optional[Union[str, TensorType]],
+        **kwargs,
+    ) -> BatchFeature:
+        """
+        Resize, rescale/normalize and patchify a list of image tensors.
+
+        When `do_resize` is set, every image is resized to one shared target: the largest height and the largest
+        width that `smart_resize` yields over the batch. When it is not set, no target is computed and each image's
+        patch grid is derived from the image's own height and width, mirroring the slow processor. In that case the
+        image size must already be divisible by `patch_size * merge_size` for the patch reshape to succeed.
+
+        `disable_grouping` and any extra `kwargs` are accepted but not read here.
+
+        Returns:
+            `BatchFeature`: `pixel_values` (the per-image flattened patches stacked along a new first axis) and
+            `image_grid_thw` (one `[grid_t, grid_h, grid_w]` row per image).
+        """
+
+        processed_images = []
+        processed_grids = []
+
+        target_height = target_width = None
+        if do_resize:
+            # A single target for the whole batch keeps the per-image patch tensors stackable below.
+            all_target_sizes = [
+                smart_resize(
+                    num_frames=temporal_patch_size,
+                    height=image.shape[-2],
+                    width=image.shape[-1],
+                    temporal_factor=temporal_patch_size,
+                    factor=patch_size * merge_size,
+                    min_pixels=size.shortest_edge,
+                    max_pixels=size.longest_edge,
+                )
+                for image in images
+            ]
+            target_height = max(height for height, _ in all_target_sizes)
+            target_width = max(width for _, width in all_target_sizes)
+
+        for image in images:
+            if do_resize:
+                image = self.resize(
+                    image,
+                    size=SizeDict(height=target_height, width=target_width),
+                    interpolation=interpolation,
+                )
+                image_height, image_width = target_height, target_width
+            else:
+                # The image is left untouched, so the grid has to follow its real size rather than a resize target.
+                image_height, image_width = image.shape[-2:]
+
+            image = self.rescale_and_normalize(
+                image.unsqueeze(0), do_rescale, rescale_factor, do_normalize, image_mean, image_std
+            ).squeeze(0)
+
+            # Treat the still image as a one-frame clip and pad it by repeating the last frame until the frame count
+            # is a multiple of `temporal_patch_size`.
+            patches = image.unsqueeze(0)
+            if patches.shape[0] % temporal_patch_size != 0:
+                repeats = patches[-1:].repeat(temporal_patch_size - (patches.shape[0] % temporal_patch_size), 1, 1, 1)
+                patches = torch.cat([patches, repeats], dim=0)
+            channel = patches.shape[1]
+            grid_t = patches.shape[0] // temporal_patch_size
+            grid_h, grid_w = image_height // patch_size, image_width // patch_size
+            # Split time into temporal patches and each spatial axis into (merge block, position in block, pixel).
+            patches = patches.view(
+                grid_t,
+                temporal_patch_size,
+                channel,
+                grid_h // merge_size,
+                merge_size,
+                patch_size,
+                grid_w // merge_size,
+                merge_size,
+                patch_size,
+            )
+            # Bring the grid indices to the front and the per-patch content (channel, time, pixels) to the back.
+            patches = patches.permute(0, 3, 6, 4, 7, 2, 1, 5, 8)
+            flatten_patches = patches.reshape(
+                grid_t * grid_h * grid_w,
+                channel * temporal_patch_size * patch_size * patch_size,
+            )
+            processed_images.append(flatten_patches)
+            processed_grids.append([grid_t, grid_h, grid_w])
+
+        pixel_values = torch.stack(processed_images, dim=0)
+        image_grid_thw = torch.tensor(processed_grids)
+
+        return BatchFeature(
+            data={"pixel_values": pixel_values, "image_grid_thw": image_grid_thw}, tensor_type=return_tensors
+        )
+
+    # Thin pass-through to the parent `preprocess`; it narrows the `**kwargs` typing to this model's kwargs class.
+    @auto_docstring
+    def preprocess(
+        self,
+        images: ImageInput,
+        **kwargs: Unpack[Glm4vFastImageProcessorKwargs],
+    ) -> BatchFeature:
+        return super().preprocess(images, **kwargs)
+
+
+__all__ = ["Glm4vImageProcessorFast"]
diff --git a/phivenv/Lib/site-packages/transformers/models/glm4v/modeling_glm4v.py b/phivenv/Lib/site-packages/transformers/models/glm4v/modeling_glm4v.py
new file mode 100644
index 0000000000000000000000000000000000000000..0f2d574193fc674390e147aadcbed31f3b836730
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/glm4v/modeling_glm4v.py
@@ -0,0 +1,1652 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/glm4v/modular_glm4v.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_glm4v.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# coding=utf-8
+# Copyright 2025 The ZhipuAI Inc. team and HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import itertools
+from dataclasses import dataclass
+from typing import Any, Callable, Optional, Union
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.nn import LayerNorm
+
+from ...activations import ACT2FN
+from ...cache_utils import Cache, DynamicCache
+from ...generation import GenerationMixin
+from ...integrations import use_kernel_forward_from_hub
+from ...masking_utils import create_causal_mask
+from ...modeling_flash_attention_utils import FlashAttentionKwargs
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import BaseModelOutputWithPast, ModelOutput
+from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from ...processing_utils import Unpack
+from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, is_torchdynamo_compiling, logging
+from ...utils.generic import check_model_inputs
+from .configuration_glm4v import Glm4vConfig, Glm4vTextConfig, Glm4vVisionConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+@use_kernel_forward_from_hub("RMSNorm")
+class Glm4vRMSNorm(nn.Module):
+    """RMS normalization over the last dimension followed by a learned per-feature scale."""
+
+    def __init__(self, hidden_size, eps=1e-6):
+        """
+        Args:
+            hidden_size: Number of entries in the scale `weight`, which is initialized to ones.
+            eps: Constant added to the mean square before taking the reciprocal square root.
+        """
+        super().__init__()
+        self.variance_epsilon = eps
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+
+    def forward(self, hidden_states):
+        """Normalize in float32, cast back to the input dtype, then multiply by `weight`."""
+        orig_dtype = hidden_states.dtype
+        upcast = hidden_states.to(torch.float32)
+        mean_square = upcast.pow(2).mean(-1, keepdim=True)
+        normalized = upcast * torch.rsqrt(mean_square + self.variance_epsilon)
+        return self.weight * normalized.to(orig_dtype)
+
+    def extra_repr(self):
+        """Return the weight shape and epsilon for the module's printed representation."""
+        weight_shape = tuple(self.weight.shape)
+        return f"{weight_shape}, eps={self.variance_epsilon}"
+
+
+class Glm4VisionMlp(nn.Module):
+    """Gated feed-forward block: `down_proj(act_fn(gate_proj(x)) * up_proj(x))`."""
+
+    def __init__(self, config, bias: bool = False):
+        """
+        Args:
+            config: Provides `hidden_size`, `out_hidden_size` (used as the inner width) and `hidden_act`.
+            bias: Whether the three linear layers carry a bias term.
+        """
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.intermediate_size = config.out_hidden_size
+        outer_dim, inner_dim = self.hidden_size, self.intermediate_size
+        self.gate_proj = nn.Linear(outer_dim, inner_dim, bias=bias)
+        self.up_proj = nn.Linear(outer_dim, inner_dim, bias=bias)
+        self.down_proj = nn.Linear(inner_dim, outer_dim, bias=bias)
+        self.act_fn = ACT2FN[config.hidden_act]
+
+    def forward(self, hidden_state):
+        """Apply the gated projection and map back to `hidden_size`."""
+        gate = self.act_fn(self.gate_proj(hidden_state))
+        value = self.up_proj(hidden_state)
+        return self.down_proj(gate * value)
+
+
+class Glm4vVisionPatchEmbed(nn.Module):
+    """Embed flattened patches with a 3D convolution whose kernel and stride both equal one patch."""
+
+    def __init__(self, config: Glm4vVisionConfig) -> None:
+        super().__init__()
+        self.patch_size = config.patch_size
+        self.temporal_patch_size = config.temporal_patch_size
+        self.in_channels = config.in_channels
+        self.embed_dim = config.hidden_size
+
+        # Kernel == stride, so each (temporal, height, width) patch is reduced to exactly one output position.
+        patch_extent = (self.temporal_patch_size, self.patch_size, self.patch_size)
+        self.proj = nn.Conv3d(self.in_channels, self.embed_dim, kernel_size=patch_extent, stride=patch_extent)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        """Reshape the input into per-patch volumes, project them, and return one `embed_dim` row per patch."""
+        weight_dtype = self.proj.weight.dtype
+        patch_shape = (-1, self.in_channels, self.temporal_patch_size, self.patch_size, self.patch_size)
+        patch_volumes = hidden_states.view(*patch_shape)
+        projected = self.proj(patch_volumes.to(dtype=weight_dtype))
+        return projected.view(-1, self.embed_dim)
+
+
+class Glm4vVisionRotaryEmbedding(nn.Module):
+    """Produce rotary angle tables: the outer product of positions `0..seqlen-1` with fixed inverse frequencies."""
+
+    inv_freq: torch.Tensor  # fix linting for `register_buffer`
+
+    def __init__(self, dim: int, theta: float = 10000.0) -> None:
+        super().__init__()
+        # One inverse frequency per even index in `[0, dim)`; kept out of the state dict (`persistent=False`).
+        exponents = torch.arange(0, dim, 2, dtype=torch.float) / dim
+        self.register_buffer("inv_freq", 1.0 / (theta**exponents), persistent=False)
+
+    def forward(self, seqlen: int) -> torch.Tensor:
+        """Return a `(seqlen, len(inv_freq))` tensor of position-times-frequency angles."""
+        positions = torch.arange(seqlen, device=self.inv_freq.device, dtype=self.inv_freq.dtype)
+        return torch.outer(positions, self.inv_freq)
+
+
+class Glm4vVisionPatchMerger(nn.Module):
+    """Linear projection, LayerNorm and GELU, followed by a gated feed-forward block."""
+
+    def __init__(self, dim: int, context_dim: int, hidden_act: str, bias: bool = False) -> None:
+        """
+        Args:
+            dim: Input and output feature size.
+            context_dim: Inner width of the gated feed-forward block.
+            hidden_act: Key into `ACT2FN` selecting the gate activation.
+            bias: Whether the linear layers carry a bias term.
+        """
+        super().__init__()
+        self.proj = nn.Linear(dim, dim, bias=bias)
+        self.post_projection_norm = LayerNorm(dim)
+        self.gate_proj = nn.Linear(dim, context_dim, bias=bias)
+        self.up_proj = nn.Linear(dim, context_dim, bias=bias)
+        self.down_proj = nn.Linear(context_dim, dim, bias=bias)
+        self.act1 = nn.GELU()
+        self.act_fn = ACT2FN[hidden_act]
+
+    def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
+        """Project, normalize and activate the input, then run it through the gated feed-forward block."""
+        projected = self.proj(hidden_state)
+        activated = self.act1(self.post_projection_norm(projected))
+        gate = self.act_fn(self.gate_proj(activated))
+        value = self.up_proj(activated)
+        return self.down_proj(gate * value)
+
+
+class Glm4vVisionEmbeddings(nn.Module):
+    """
+    Learned position embeddings stored as a square table and resampled per patch.
+
+    The table has `(image_size // patch_size) ** 2` rows. `forward` views it as a 2D grid and bicubically samples it
+    at each patch's normalized `(w, h)` location, so images with other grid sizes can reuse the same table.
+    """
+
+    def __init__(self, config: Glm4vVisionConfig):
+        super().__init__()
+        self.config = config
+        self.embed_dim = config.hidden_size
+        self.image_size = config.image_size
+        self.patch_size = config.patch_size
+
+        # The table is square by construction, which `forward` relies on when it takes the integer square root.
+        self.num_patches = (self.image_size // self.patch_size) ** 2
+        self.num_positions = self.num_patches
+        self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
+        # Non-persistent buffer; it is not read by `forward` in this class.
+        self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)
+
+    def forward(self, embeddings, lengths, image_shapes, h_coords, w_coords) -> torch.Tensor:
+        """
+        Forward pass with integrated position encoding adaptation using 2D interpolation.
+
+        Args:
+            embeddings: Input embeddings tensor
+            lengths (torch.Tensor): Sequence lengths for each image in the batch.
+            image_shapes (torch.Tensor): Tensor of shape [batch_size, 3] representing the image shapes (t, h, w).
+            h_coords (torch.Tensor): Tensor of shape [total_seq] representing the h coordinate for each patch.
+            w_coords (torch.Tensor): Tensor of shape [total_seq] representing the w coordinate for each patch.
+
+        Returns:
+            torch.Tensor: Embeddings with adapted position encoding added.
+        """
+        # Get position embedding parameters
+        pos_embed_weight = self.position_embedding.weight
+        hidden_size = pos_embed_weight.shape[1]
+        total_seq = h_coords.shape[0]
+        device = pos_embed_weight.device
+
+        # Move coordinates to correct device
+        h_coords, w_coords = h_coords.to(device), w_coords.to(device)
+
+        # Handle empty sequence case
+        if total_seq == 0:
+            adapted_pos_embed = torch.empty(0, hidden_size, device=device, dtype=pos_embed_weight.dtype)
+        else:
+            # Convert inputs to tensors if needed
+            if isinstance(lengths, list):
+                lengths = torch.tensor(lengths, device=device, dtype=torch.long)
+            if not isinstance(image_shapes, torch.Tensor):
+                image_shapes = torch.tensor(image_shapes, device=device, dtype=torch.long)
+
+            # Prepare 2D position embedding
+            # (num_positions, hidden) -> (1, hidden, side, side), in float32 for the interpolation below.
+            orig_size_sq = pos_embed_weight.shape[0]
+            orig_size = int(orig_size_sq**0.5)
+            pos_embed_2d = (
+                pos_embed_weight.view(orig_size, orig_size, hidden_size)
+                .permute(2, 0, 1)
+                .unsqueeze(0)
+                .to(device=device, dtype=torch.float32)
+            )
+
+            # Calculate target dimensions for each patch
+            # Image i's grid height/width is repeated `lengths[i]` times so every patch carries its own image's size.
+            target_h = torch.cat([image_shapes[i, 1].repeat(lengths[i]) for i in range(len(lengths))]).to(
+                device=device, dtype=torch.float32
+            )
+            target_w = torch.cat([image_shapes[i, 2].repeat(lengths[i]) for i in range(len(lengths))]).to(
+                device=device, dtype=torch.float32
+            )
+
+            # Normalize coordinates to [-1, 1] range for grid_sample
+            # The +0.5 addresses cell centers, matching `align_corners=False` in the sampling call.
+            h_coords = h_coords.to(device=device, dtype=torch.float32)
+            w_coords = w_coords.to(device=device, dtype=torch.float32)
+            norm_w = ((w_coords + 0.5) / target_w) * 2 - 1
+            norm_h = ((h_coords + 0.5) / target_h) * 2 - 1
+
+            # Create sampling grid
+            # grid_sample reads the last axis as (x, y), hence width first; resulting shape is (1, total_seq, 1, 2).
+            grid = torch.stack((norm_w, norm_h), dim=-1).unsqueeze(0).unsqueeze(2)
+
+            # Perform bicubic interpolation
+            interpolated_embed_fp32 = F.grid_sample(
+                pos_embed_2d, grid, mode="bicubic", align_corners=False, padding_mode="border"
+            )
+
+            # Reshape and convert back to original dtype
+            # (1, hidden, total_seq, 1) -> (total_seq, hidden), then onto the device of `embeddings`.
+            adapted_pos_embed_fp32 = interpolated_embed_fp32.squeeze(0).squeeze(-1).permute(1, 0)
+            adapted_pos_embed = adapted_pos_embed_fp32.to(pos_embed_weight.dtype).to(embeddings.device)
+
+        # Add adapted position encoding to embeddings
+        embeddings = embeddings + adapted_pos_embed
+        return embeddings
+
+
+def rotate_half(x):
+    """Split the last dimension in two halves `(a, b)` and return `(-b, a)` concatenated along that dimension."""
+    midpoint = x.shape[-1] // 2
+    front, back = x[..., :midpoint], x[..., midpoint:]
+    return torch.cat((-back, front), dim=-1)
+
+
+def apply_rotary_pos_emb_vision(
+    q: torch.Tensor, k: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor
+) -> tuple[torch.Tensor, torch.Tensor]:
+    """
+    Apply rotary position embeddings to `q` and `k`.
+
+    The rotation is computed in float32 and each result is cast back to its input's dtype. `cos` and `sin` get an
+    extra axis inserted before their last dimension so they broadcast against `q` and `k`.
+    """
+    q_dtype, k_dtype = q.dtype, k.dtype
+    q_fp32, k_fp32 = q.float(), k.float()
+    cos_fp32 = cos.unsqueeze(-2).float()
+    sin_fp32 = sin.unsqueeze(-2).float()
+    q_rotated = (q_fp32 * cos_fp32) + (rotate_half(q_fp32) * sin_fp32)
+    k_rotated = (k_fp32 * cos_fp32) + (rotate_half(k_fp32) * sin_fp32)
+    return q_rotated.to(q_dtype), k_rotated.to(k_dtype)
+
+
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """
+    Repeat every key/value head `n_rep` times along the head axis, keeping the copies of one head adjacent.
+
+    Equivalent to `torch.repeat_interleave(hidden_states, repeats=n_rep, dim=1)`: the input goes from
+    (batch, num_key_value_heads, seqlen, head_dim) to (batch, num_key_value_heads * n_rep, seqlen, head_dim).
+    With `n_rep == 1` the input tensor itself is returned.
+    """
+    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+    if n_rep == 1:
+        return hidden_states
+    # Insert a repeat axis right after the head axis, broadcast it, then fold it into the head axis.
+    expanded = hidden_states.unsqueeze(2).expand(batch, num_key_value_heads, n_rep, slen, head_dim)
+    return expanded.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
+
+
+def eager_attention_forward(
+    module: nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: Optional[torch.Tensor],
+    scaling: float,
+    dropout: float = 0.0,
+    **kwargs: Unpack[TransformersKwargs],
+):
+    """
+    Plain matmul/softmax attention.
+
+    Keys and values are first expanded with `repeat_kv` by `module.num_key_value_groups`. An optional additive
+    `attention_mask` is truncated on its last axis to the key length before being added to the scaled scores.
+    Softmax runs in float32 and is cast back to the query dtype; dropout is active only when `module.training`.
+
+    Returns:
+        A tuple `(attn_output, attn_weights)`, where `attn_output` has axes 1 and 2 swapped relative to the
+        matmul result and is made contiguous.
+    """
+    n_groups = module.num_key_value_groups
+    expanded_keys = repeat_kv(key, n_groups)
+    expanded_values = repeat_kv(value, n_groups)
+
+    scores = torch.matmul(query, expanded_keys.transpose(2, 3)) * scaling
+    if attention_mask is not None:
+        scores = scores + attention_mask[:, :, :, : expanded_keys.shape[-2]]
+
+    probs = nn.functional.softmax(scores, dim=-1, dtype=torch.float32).to(query.dtype)
+    probs = nn.functional.dropout(probs, p=dropout, training=module.training)
+    context = torch.matmul(probs, expanded_values).transpose(1, 2).contiguous()
+
+    return context, probs
+
+
+class Glm4vVisionAttention(nn.Module):
+    """Self-attention over a packed sequence of vision tokens.
+
+    One fused `qkv` projection yields queries, keys and values. Rotary embeddings are applied to the queries
+    and keys, and attention is evaluated separately for every segment delimited by `cu_seqlens`.
+    """
+
+    def __init__(self, config: Glm4vVisionConfig) -> None:
+        super().__init__()
+        self.config = config
+        self.dim = config.hidden_size
+        self.num_heads = config.num_heads
+        self.head_dim = self.dim // self.num_heads
+        self.scaling = self.head_dim**-0.5
+        self.attention_dropout = config.attention_dropout
+        self.is_causal = False
+        self.num_key_value_groups = 1  # needed for eager attention
+        self.qkv = nn.Linear(config.hidden_size, config.hidden_size * 3, bias=config.attention_bias)
+        self.proj = nn.Linear(config.hidden_size, config.hidden_size, bias=False)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        rotary_pos_emb: Optional[torch.Tensor] = None,
+        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
+        **kwargs,
+    ) -> torch.Tensor:
+        """Attend within each segment of the packed sequence and project the result.
+
+        Args:
+            hidden_states: Packed tokens; dim 0 is the total sequence length.
+            cu_seqlens: Cumulative segment boundaries; consecutive differences give the segment lengths.
+            rotary_pos_emb: Legacy input, only read when `position_embeddings` is `None`.
+            position_embeddings: `(cos, sin)` pair used for the rotary embedding.
+            **kwargs: Forwarded unchanged to the attention backend.
+
+        Returns:
+            Tensor reshaped to `(seq_length, -1)` and passed through `self.proj`.
+        """
+        seq_length = hidden_states.shape[0]
+        # Reshape the fused projection to (seq, 3, heads, -1) and move the q/k/v axis to the front.
+        fused = self.qkv(hidden_states).reshape(seq_length, 3, self.num_heads, -1)
+        query_states, key_states, value_states = fused.permute(1, 0, 2, 3).unbind(0)
+
+        if position_embeddings is not None:
+            cos, sin = position_embeddings
+        else:
+            logger.warning_once(
+                "The attention layers in this model are transitioning from computing the RoPE embeddings internally "
+                "through `rotary_pos_emb` (2D tensor of RoPE theta values), to using externally computed "
+                "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.54 `rotary_pos_emb` will be "
+                "removed and `position_embeddings` will be mandatory."
+            )
+            emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1)
+            cos, sin = emb.cos(), emb.sin()
+        query_states, key_states = apply_rotary_pos_emb_vision(query_states, key_states, cos, sin)
+
+        # Swap the first two dims and add a leading singleton dim, so the sequence ends up on dim 2.
+        query_states, key_states, value_states = [
+            states.transpose(0, 1).unsqueeze(0) for states in (query_states, key_states, value_states)
+        ]
+
+        impl = self.config._attn_implementation
+        attention_interface: Callable = eager_attention_forward if impl == "eager" else ALL_ATTENTION_FUNCTIONS[impl]
+        dropout_p = self.attention_dropout if self.training else 0.0
+
+        if impl == "flash_attention_2":
+            # Flash Attention 2: Use cu_seqlens for variable length attention
+            max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max()
+            attn_output, _ = attention_interface(
+                self,
+                query_states,
+                key_states,
+                value_states,
+                attention_mask=None,
+                scaling=self.scaling,
+                dropout=dropout_p,
+                cu_seq_lens_q=cu_seqlens,
+                cu_seq_lens_k=cu_seqlens,
+                max_length_q=max_seqlen,
+                max_length_k=max_seqlen,
+                is_causal=False,
+                **kwargs,
+            )
+        else:
+            # Other implementations: Process each chunk separately
+            segment_lengths = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist()
+            q_chunks, k_chunks, v_chunks = [
+                torch.split(states, segment_lengths, dim=2) for states in (query_states, key_states, value_states)
+            ]
+
+            per_segment = []
+            for q, k, v in zip(q_chunks, k_chunks, v_chunks):
+                segment_result = attention_interface(
+                    self,
+                    q,
+                    k,
+                    v,
+                    attention_mask=None,
+                    scaling=self.scaling,
+                    dropout=dropout_p,
+                    is_causal=False,
+                    **kwargs,
+                )
+                # Keep only the attention output; the second element (weights) is discarded.
+                per_segment.append(segment_result[0])
+            attn_output = torch.cat(per_segment, dim=1)
+
+        attn_output = attn_output.reshape(seq_length, -1).contiguous()
+        return self.proj(attn_output)
+
+
+class Glm4vVisionBlock(GradientCheckpointingLayer):
+    """Vision transformer layer: normalised attention and normalised MLP, each added back as a residual."""
+
+    def __init__(self, config) -> None:
+        super().__init__()
+        hidden_size, eps = config.hidden_size, config.rms_norm_eps
+        self.norm1 = Glm4vRMSNorm(hidden_size, eps=eps)
+        self.norm2 = Glm4vRMSNorm(hidden_size, eps=eps)
+        self.attn = Glm4vVisionAttention(config)
+        self.mlp = Glm4VisionMlp(config, bias=False)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        rotary_pos_emb: Optional[torch.Tensor] = None,
+        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
+        **kwargs,
+    ) -> torch.Tensor:
+        """Run the attention and MLP sub-layers; all keyword arguments are forwarded to `self.attn`."""
+        attn_out = self.attn(
+            self.norm1(hidden_states),
+            cu_seqlens=cu_seqlens,
+            rotary_pos_emb=rotary_pos_emb,
+            position_embeddings=position_embeddings,
+            **kwargs,
+        )
+        hidden_states = hidden_states + attn_out
+
+        mlp_out = self.mlp(self.norm2(hidden_states))
+        return hidden_states + mlp_out
+
+
+class Glm4vTextRotaryEmbedding(nn.Module):
+    """Builds the cos / sin rotary tables of the text decoder from three-axis position ids."""
+
+    inv_freq: torch.Tensor  # fix linting for `register_buffer`
+
+    def __init__(self, config: Glm4vTextConfig, device=None):
+        super().__init__()
+        self.config = config
+
+        # BC: "rope_type" was originally "type"
+        rope_scaling = getattr(config, "rope_scaling", None)
+        if rope_scaling is None:
+            self.rope_type = "default"
+        else:
+            self.rope_type = rope_scaling.get("rope_type", rope_scaling.get("type"))
+
+        self.max_seq_len_cached = config.max_position_embeddings
+        self.original_max_seq_len = config.max_position_embeddings
+
+        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
+        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
+        # Non-persistent buffer; a second reference to the initial value is kept in `original_inv_freq`.
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+        self.original_inv_freq = self.inv_freq
+
+    @torch.no_grad()
+    @dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
+    def forward(self, x, position_ids):
+        """Return `(cos, sin)` cast to `x.dtype`; `x` is only consulted for its device type and dtype."""
+        # In contrast to other models, Glm4vText has different position ids for the grids
+        # So we expand the inv_freq to shape (3, ...)
+        batch_size = position_ids.shape[1]
+        inv_freq_expanded = self.inv_freq[None, None, :, None].float().expand(3, batch_size, -1, 1)
+        position_ids_expanded = position_ids[:, :, None, :].float()  # shape (3, bs, 1, positions)
+
+        raw_device_type = x.device.type
+        if isinstance(raw_device_type, str) and raw_device_type != "mps":
+            device_type = raw_device_type
+        else:
+            device_type = "cpu"
+
+        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(2, 3)
+            emb = torch.cat((freqs, freqs), dim=-1)
+            scale = self.attention_scaling
+            cos, sin = emb.cos() * scale, emb.sin() * scale
+
+        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
+
+
+def rotate_half_llm(x):
+    """Rotate adjacent channel pairs of the last dimension: every `(a, b)` becomes `(-b, a)`.
+
+    Even-indexed and odd-indexed channels are paired, so the layout is interleaved rather than split into a
+    first and a second half. The output has the same shape as `x`.
+    """
+    even, odd = x[..., ::2], x[..., 1::2]
+    rotated_pairs = torch.stack([-odd, even], dim=-1)
+    return rotated_pairs.flatten(start_dim=-2)
+
+
+def apply_multimodal_rotary_pos_emb(q, k, cos, sin, mrope_section, unsqueeze_dim=1):
+    """Rotate the query and key tensors with multimodal (sectioned) rotary position embeddings
+    (https://qwenlm.github.io/blog/qwen2-vl/).
+
+    Explanation:
+        Multimodal 3D rotary position embedding extends the usual 1D rotary embedding. For vision tokens the
+        temporal, height and width positions each drive their own chunk of the channel dimension; for text
+        tokens the three position indices are identical, so the result matches a plain 1D rotary embedding.
+
+        Concretely, the last dim of `cos` / `sin` is split into chunks of widths `mrope_section * 2`, and
+        chunk `i` is read from index `i % 3` of the leading dim. The first half of the merged table is then
+        repeated element-wise (interleaved layout) and applied to the leading `rotary_dim` channels of `q`
+        and `k`; any remaining channels are passed through untouched.
+
+    Args:
+        q (`torch.Tensor`): The query tensor.
+        k (`torch.Tensor`): The key tensor.
+        cos (`torch.Tensor`): The cosine part of the rotary embedding, indexed by position axis on dim 0.
+        sin (`torch.Tensor`): The sine part of the rotary embedding, indexed by position axis on dim 0.
+        mrope_section(`List(int)`):
+            Channel widths assigned to the temporal, height and width sections.
+        unsqueeze_dim (`int`, *optional*, defaults to 1):
+            Dimension inserted into the merged `cos` / `sin` so that they broadcast against `q` and `k`.
+            Use 1 when `q` and `k` are `[batch_size, heads, seq_len, head_dim]` and 2 when they are
+            `[batch_size, seq_len, heads, head_dim]`.
+    Returns:
+        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
+    """
+    chunk_widths = mrope_section * 2
+
+    def _merge_sections(table):
+        # Chunk `i` of the channel dim comes from position axis `i % 3`.
+        chunks = table.split(chunk_widths, dim=-1)
+        picked = [chunk[idx % 3] for idx, chunk in enumerate(chunks)]
+        merged = torch.cat(picked, dim=-1).unsqueeze(unsqueeze_dim)
+        # Interleave them instead of usual shape
+        half_width = merged.shape[-1] // 2
+        return merged[..., :half_width].repeat_interleave(2, dim=-1)
+
+    cos = _merge_sections(cos)
+    sin = _merge_sections(sin)
+
+    # Channels beyond `rotary_dim` are not rotated and get re-attached unchanged below.
+    rotary_dim = cos.shape[-1]
+    q_rot, q_pass = q[..., :rotary_dim], q[..., rotary_dim:]
+    k_rot, k_pass = k[..., :rotary_dim], k[..., rotary_dim:]
+
+    q_rotated = (q_rot * cos) + (rotate_half_llm(q_rot) * sin)
+    k_rotated = (k_rot * cos) + (rotate_half_llm(k_rot) * sin)
+
+    q_embed = torch.cat([q_rotated, q_pass], dim=-1)
+    k_embed = torch.cat([k_rotated, k_pass], dim=-1)
+    return q_embed, k_embed
+
+
+class Glm4vTextAttention(nn.Module):
+    """
+    Multi-head self-attention of the Glm4v text decoder.
+
+    Queries use `config.num_attention_heads` heads while keys and values use `config.num_key_value_heads`
+    heads. Rotary position information is injected with `apply_multimodal_rotary_pos_emb`, driven by
+    `config.rope_scaling["mrope_section"]`.
+    """
+
+    def __init__(self, config: Glm4vTextConfig, layer_idx: Optional[int] = None):
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+
+        self.hidden_size = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = self.hidden_size // self.num_heads
+        self.num_key_value_heads = config.num_key_value_heads
+        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
+        self.is_causal = True
+        self.attention_dropout = config.attention_dropout
+        self.rope_scaling = config.rope_scaling
+        self.scaling = self.head_dim**-0.5
+
+        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=True)
+        self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True)
+        self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True)
+        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        """
+        Args:
+            hidden_states: Input of shape `(batch, q_len, hidden_size)`.
+            position_embeddings: `(cos, sin)` pair consumed by `apply_multimodal_rotary_pos_emb`.
+            attention_mask: Mask handed to the attention backend as-is.
+            position_ids: Accepted for signature compatibility; not read in this method.
+            past_key_values: When given, its `update` method receives the new key/value states for
+                `self.layer_idx` and returns the states that are actually attended over.
+            cache_position: Forwarded to the cache update together with `sin` and `cos`.
+            **kwargs: Forwarded unchanged to the attention backend.
+
+        Returns:
+            `(attn_output, attn_weights)`: the output of `self.o_proj` and whatever the selected attention
+            backend returns as weights (may be `None` for some backends - confirm against the backend).
+        """
+        bsz, q_len, _ = hidden_states.size()
+
+        query_states = self.q_proj(hidden_states)
+        key_states = self.k_proj(hidden_states)
+        value_states = self.v_proj(hidden_states)
+
+        # (batch, q_len, heads * head_dim) -> (batch, heads, q_len, head_dim)
+        query_states = query_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
+        key_states = key_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
+
+        cos, sin = position_embeddings
+        query_states, key_states = apply_multimodal_rotary_pos_emb(  # diff with Llama
+            query_states, key_states, cos, sin, self.rope_scaling["mrope_section"]
+        )
+
+        if past_key_values is not None:
+            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}  # Specific to RoPE models
+            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        attention_interface: Callable = eager_attention_forward
+        if self.config._attn_implementation != "eager":
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            dropout=0.0 if not self.training else self.attention_dropout,
+            scaling=self.scaling,
+            **kwargs,
+        )
+
+        attn_output = attn_output.reshape(bsz, q_len, -1).contiguous()
+        attn_output = self.o_proj(attn_output)
+        return attn_output, attn_weights
+
+
+class Glm4vTextMLP(nn.Module):
+    """Gated feed-forward block whose gate and up projections share one fused linear layer."""
+
+    def __init__(self, config):
+        super().__init__()
+
+        self.config = config
+        hidden_size, inner_size = config.hidden_size, config.intermediate_size
+        self.gate_up_proj = nn.Linear(hidden_size, 2 * inner_size, bias=False)
+        self.down_proj = nn.Linear(inner_size, hidden_size, bias=False)
+        self.activation_fn = ACT2FN[config.hidden_act]
+
+    def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor:
+        """Project up, gate with the activation, and project back down."""
+        # The first half of the fused projection is the gate, the second half the "up" branch.
+        gate, up = self.gate_up_proj(hidden_states).chunk(2, dim=-1)
+        gated = up * self.activation_fn(gate)
+        return self.down_proj(gated)
+
+
+class Glm4vTextDecoderLayer(GradientCheckpointingLayer):
+    """One text decoder layer: self-attention and MLP, each wrapped by a norm before and after it.
+
+    Both sub-layers normalise their input, normalise their output again, and only then add the residual.
+    """
+
+    def __init__(self, config: Glm4vTextConfig, layer_idx: int):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.self_attn = Glm4vTextAttention(config, layer_idx)
+        self.mlp = Glm4vTextMLP(config)
+        self.input_layernorm = Glm4vRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = Glm4vRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_self_attn_layernorm = Glm4vRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_mlp_layernorm = Glm4vRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        output_attentions: Optional[bool] = False,
+        use_cache: Optional[bool] = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
+    ) -> torch.Tensor:
+        """
+        Args:
+            hidden_states: Input to the layer.
+            position_embeddings: `(cos, sin)` pair forwarded to the attention module.
+            attention_mask: Mask forwarded to the attention module.
+            position_ids: Forwarded to the attention module.
+            past_key_values: Cache object forwarded to the attention module, which calls `update` on it.
+            output_attentions: Forwarded to the attention module; the attention weights it returns are
+                discarded here.
+            use_cache: Forwarded to the attention module.
+            cache_position: Forwarded to the attention module.
+            **kwargs: Forwarded to the attention module.
+
+        Returns:
+            `torch.Tensor`: the updated hidden states (a single tensor, not a tuple).
+        """
+        residual = hidden_states
+
+        hidden_states = self.input_layernorm(hidden_states)
+
+        # Self Attention
+        hidden_states, _ = self.self_attn(
+            hidden_states=hidden_states,
+            position_embeddings=position_embeddings,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            output_attentions=output_attentions,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            **kwargs,
+        )
+
+        # The attention output is normalised once more before the residual is added.
+        hidden_states = self.post_self_attn_layernorm(hidden_states)
+        hidden_states = residual + hidden_states
+
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = self.post_mlp_layernorm(hidden_states)
+        hidden_states = residual + hidden_states
+
+        return hidden_states
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Base class for Glm4v model outputs, with hidden states and attentions.
+    """
+)
+class Glm4vModelOutputWithPast(ModelOutput):
+    r"""
+    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+        Cache object holding the pre-computed key and value states of the self-attention blocks.
+
+        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
+        `past_key_values` input) to speed up sequential decoding.
+    rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*):
+        The rope index difference between sequence length and multimodal rope.
+    """
+
+    # Every field defaults to `None`, so each one is annotated as optional.
+    last_hidden_state: Optional[torch.FloatTensor] = None
+    past_key_values: Optional[Cache] = None
+    hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    attentions: Optional[tuple[torch.FloatTensor]] = None
+    rope_deltas: Optional[torch.LongTensor] = None
+
+
+@auto_docstring
+class Glm4vPreTrainedModel(PreTrainedModel):
+    # Common base class of the Glm4v models in this file. It defines no methods here, only class-level
+    # attributes. How each attribute is consumed is decided by `PreTrainedModel`, which is not visible in
+    # this file, so the notes below are hedged.
+    config: Glm4vConfig
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    # Names of the layer classes defined in this file (the text decoder layer and the vision block);
+    # presumably these must not be split across devices - TODO confirm.
+    _no_split_modules = ["Glm4vTextDecoderLayer", "Glm4vVisionBlock"]
+    _skip_keys_device_placement = "past_key_values"
+    # Capability flags for attention backends and compilation (meaning assumed from the names).
+    _supports_flash_attn = True
+    _supports_sdpa = True
+
+    _can_compile_fullgraph = True
+    _supports_attention_backend = True
+    # Maps an output name to the module class associated with it; presumably used to collect per-layer
+    # hidden states and attention weights - confirm against `check_model_inputs`.
+    _can_record_outputs = {
+        "hidden_states": Glm4vTextDecoderLayer,
+        "attentions": Glm4vTextAttention,
+    }
+
+
+class Glm4vVisionModel(Glm4vPreTrainedModel):
+    # Vision tower. `forward` runs: patch embedding -> norm -> position embeddings -> `config.depth`
+    # attention blocks -> norm -> strided-conv downsampling -> `self.merger`.
+    config: Glm4vVisionConfig
+    _no_split_modules = ["Glm4vVisionBlock"]
+
+    def __init__(self, config) -> None:
+        super().__init__(config)
+        self.spatial_merge_size = config.spatial_merge_size
+        self.patch_size = config.patch_size
+
+        self.embeddings = Glm4vVisionEmbeddings(config)
+        self.patch_embed = Glm4vVisionPatchEmbed(config)
+
+        # The rotary module is constructed with half of the per-head width.
+        head_dim = config.hidden_size // config.num_heads
+        self.rotary_pos_emb = Glm4vVisionRotaryEmbedding(head_dim // 2)
+
+        self.blocks = nn.ModuleList([Glm4vVisionBlock(config) for _ in range(config.depth)])
+        self.merger = Glm4vVisionPatchMerger(
+            dim=config.out_hidden_size, context_dim=config.intermediate_size, hidden_act=config.hidden_act
+        )
+
+        self.post_conv_layernorm = Glm4vRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        # Kernel size equals stride, so every `spatial_merge_size` x `spatial_merge_size` window is reduced
+        # to a single output position.
+        self.downsample = nn.Conv2d(
+            in_channels=config.hidden_size,
+            out_channels=config.out_hidden_size,
+            kernel_size=config.spatial_merge_size,
+            stride=config.spatial_merge_size,
+        )
+        self.post_layernorm = Glm4vRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+        self.gradient_checkpointing = False
+        self.post_init()
+
+    def rot_pos_emb(self, grid_thw):
+        """Build rotary embeddings and the `(h, w)` position-id pairs for every patch of every grid.
+
+        Args:
+            grid_thw: One `(t, h, w)` row per image or video.
+
+        Returns:
+            `(rotary_pos_emb, pos_ids)`: `pos_ids` has one `(h_index, w_index)` row per patch, and
+            `rotary_pos_emb` holds the rotary-table rows selected by those indices, flattened from dim 1.
+        """
+        pos_ids = []
+        for t, h, w in grid_thw:
+            # Row index of every patch, regrouped so that the patches of one
+            # `spatial_merge_size` x `spatial_merge_size` window end up adjacent after flattening.
+            hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w)
+            hpos_ids = hpos_ids.reshape(
+                h // self.spatial_merge_size,
+                self.spatial_merge_size,
+                w // self.spatial_merge_size,
+                self.spatial_merge_size,
+            )
+            hpos_ids = hpos_ids.permute(0, 2, 1, 3)
+            hpos_ids = hpos_ids.flatten()
+
+            # Column index of every patch, regrouped in exactly the same way.
+            wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1)
+            wpos_ids = wpos_ids.reshape(
+                h // self.spatial_merge_size,
+                self.spatial_merge_size,
+                w // self.spatial_merge_size,
+                self.spatial_merge_size,
+            )
+            wpos_ids = wpos_ids.permute(0, 2, 1, 3)
+            wpos_ids = wpos_ids.flatten()
+            # The same (h, w) pairs are repeated for each of the `t` temporal steps.
+            pos_ids.append(torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1))
+        pos_ids = torch.cat(pos_ids, dim=0)
+        # The table is sized by the largest height/width over all grids, then indexed per patch.
+        max_grid_size = grid_thw[:, 1:].max()
+        rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size)
+        rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1)
+        return rotary_pos_emb, pos_ids
+
+    def forward(self, hidden_states: torch.Tensor, grid_thw: torch.Tensor) -> torch.Tensor:
+        """
+        Encode patches with the vision tower.
+
+        Args:
+            hidden_states (`torch.Tensor` of shape `(seq_len, hidden_size)`):
+                Input handed to `self.patch_embed` (presumably the flattened image/video patches rather than
+                hidden states - TODO confirm the expected layout against the processor).
+            grid_thw (`torch.Tensor` of shape `(num_images_or_videos, 3)`):
+                The temporal, height and width of feature shape of each image in LLM.
+
+        Returns:
+            `torch.Tensor`: the output of `self.merger`, computed from the spatially downsampled block outputs.
+        """
+        hidden_states = self.patch_embed(hidden_states)
+        hidden_states = self.post_conv_layernorm(hidden_states)
+
+        # `image_type_ids` holds the (h, w) position-id pairs returned by `rot_pos_emb`.
+        rotary_pos_emb, image_type_ids = self.rot_pos_emb(grid_thw)
+        emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1)
+        position_embeddings = (emb.cos(), emb.sin())
+
+        # Each of the `t` temporal steps of a grid contributes one segment of `h * w` tokens; the
+        # cumulative sum (with a leading 0 added below) gives the segment boundaries.
+        cu_seqlens = torch.repeat_interleave(grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]).cumsum(
+            dim=0,
+            # Select dtype based on the following factors:
+            #  - FA2 requires that cu_seqlens_q must have dtype int32
+            #  - torch.onnx.export requires that cu_seqlens_q must have same dtype as grid_thw
+            # See https://github.com/huggingface/transformers/pull/34852 for more information
+            dtype=grid_thw.dtype if torch.jit.is_tracing() else torch.int32,
+        )
+        cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0)
+        seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist()
+        # Columns 0 and 1 of the position ids are the h and w indices respectively.
+        hidden_states = self.embeddings(hidden_states, seqlens, grid_thw, image_type_ids[:, 0], image_type_ids[:, 1])
+
+        for blk in self.blocks:
+            hidden_states = blk(
+                hidden_states,
+                cu_seqlens=cu_seqlens,
+                position_embeddings=position_embeddings,
+            )
+
+        hidden_states = self.post_layernorm(hidden_states)
+
+        # Group tokens into merge windows and move channels to dim 1 so the strided conv can reduce
+        # each window to one vector of size `out_hidden_size`.
+        hidden_states = hidden_states.view(
+            -1, self.spatial_merge_size, self.spatial_merge_size, hidden_states.shape[-1]
+        )
+        hidden_states = hidden_states.permute(0, 3, 1, 2)
+        hidden_states = self.downsample(hidden_states).view(-1, self.config.out_hidden_size)
+
+        hidden_states = self.merger(hidden_states)
+        return hidden_states
+
+
+@auto_docstring
+class Glm4vTextModel(Glm4vPreTrainedModel):
+    config: Glm4vTextConfig
+
+    def __init__(self, config: Glm4vTextConfig):
+        super().__init__(config)
+        self.vocab_size = config.vocab_size
+        self.padding_idx = config.pad_token_id
+
+        # Token embeddings, the decoder stack, the final norm and the shared rotary-embedding module.
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+        decoder_layers = [Glm4vTextDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+        self.layers = nn.ModuleList(decoder_layers)
+        self.norm = Glm4vRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.rotary_emb = Glm4vTextRotaryEmbedding(config=config)
+
+        self.gradient_checkpointing = False
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    @check_model_inputs
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[list[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> Union[tuple, BaseModelOutputWithPast]:
+        # Reject the call when both input forms are given, or when neither is.
+        if (input_ids is None) == (inputs_embeds is None):
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+        # torch.jit.trace() doesn't support cache objects in the output
+        needs_new_cache = use_cache and past_key_values is None and not torch.jit.is_tracing()
+        if needs_new_cache:
+            past_key_values = DynamicCache(config=self.config)
+
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+
+        if cache_position is None:
+            # Continue counting from whatever the cache already holds.
+            start = 0 if past_key_values is None else past_key_values.get_seq_length()
+            cache_position = torch.arange(start, start + inputs_embeds.shape[1], device=inputs_embeds.device)
+
+        # the hard coded `3` is for temporal, height and width.
+        if position_ids is None:
+            position_ids = cache_position.view(1, 1, -1).expand(3, inputs_embeds.shape[0], -1)
+        elif position_ids.dim() == 2:
+            position_ids = position_ids[None, ...].expand(3, position_ids.shape[0], -1)
+
+        causal_mask = create_causal_mask(
+            config=self.config,
+            input_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            cache_position=cache_position,
+            past_key_values=past_key_values,
+            position_ids=position_ids,
+        )
+
+        hidden_states = inputs_embeds
+
+        # create position embeddings to be shared across the decoder layers
+        position_embeddings = self.rotary_emb(hidden_states, position_ids)
+
+        # Each decoder layer returns the updated hidden states directly.
+        for decoder_layer in self.layers:
+            hidden_states = decoder_layer(
+                hidden_states,
+                position_embeddings=position_embeddings,
+                attention_mask=causal_mask,
+                position_ids=position_ids,
+                past_key_values=past_key_values,
+                cache_position=cache_position,
+                **kwargs,
+            )
+
+        normed_states = self.norm(hidden_states)
+        return BaseModelOutputWithPast(
+            last_hidden_state=normed_states,
+            past_key_values=past_key_values,
+        )
+
+
+@auto_docstring
+class Glm4vModel(Glm4vPreTrainedModel):
+    base_model_prefix = ""
+    _checkpoint_conversion_mapping = {}
+    # Reference: fix gemma3 grad acc #37208
+    accepts_loss_kwargs = False
+    config: Glm4vConfig
+    _no_split_modules = ["Glm4vTextDecoderLayer", "Glm4vVisionBlock"]
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.visual = Glm4vVisionModel._from_config(config.vision_config)
+        self.language_model = Glm4vTextModel._from_config(config.text_config)
+        self.rope_deltas = None  # cache rope_deltas here
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.language_model.get_input_embeddings()
+
+    def set_input_embeddings(self, value):
+        self.language_model.set_input_embeddings(value)
+
+    def set_decoder(self, decoder):
+        self.language_model = decoder
+
+    def get_decoder(self):
+        return self.language_model
+
+    def get_rope_index(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        image_grid_thw: Optional[torch.LongTensor] = None,
+        video_grid_thw: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """
+        Calculate the 3D rope index based on image and video's temporal, height and width in LLM.
+
+        Explanation:
+            Each embedding sequence contains vision embedding and text embedding or just contains text embedding.
+
+            For pure text embedding sequence, the rotary position embedding has no difference with modern LLMs.
+            Examples:
+                input_ids: [T T T T T], here T is for text.
+                temporal position_ids: [0, 1, 2, 3, 4]
+                height position_ids: [0, 1, 2, 3, 4]
+                width position_ids: [0, 1, 2, 3, 4]
+
+            For vision and text embedding sequence, we calculate 3D rotary position embedding for vision part
+            and 1D rotary position embedding for text part.
+            Examples:
+                Temporal (Time): 3 patches, representing different segments of the video in time.
+                Height: 2 patches, dividing each frame vertically.
+                Width: 2 patches, dividing each frame horizontally.
+                We also have some important parameters:
+                fps (Frames Per Second): The video's frame rate, set to 1. This means one frame is processed each second.
+                tokens_per_second: This is a crucial parameter. It dictates how many "time-steps" or "temporal tokens" are conceptually packed into a one-second interval of the video. In this case, we have 25 tokens per second. So each second of the video will be represented with 25 separate time points. It essentially defines the temporal granularity.
+                temporal_patch_size: The number of frames that compose one temporal patch. Here, it's 2 frames.
+                interval: The step size for the temporal position IDs, calculated as tokens_per_second * temporal_patch_size / fps. In this case, 25 * 2 / 1 = 50. This means that each temporal patch will be have a difference of 50 in the temporal position IDs.
+                input_ids: [V V V V V V V V V V V V T T T T T], here V is for vision.
+                vision temporal position_ids: [0, 0, 0, 0, 50, 50, 50, 50, 100, 100, 100, 100]
+                vision height position_ids: [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1]
+                vision width position_ids: [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
+                text temporal position_ids: [101, 102, 103, 104, 105]
+                text height position_ids: [101, 102, 103, 104, 105]
+                text width position_ids: [101, 102, 103, 104, 105]
+                Here we calculate the text start position_ids as the max vision position_ids plus 1.
+
+        Args:
+            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
+                it.
+            image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
+                The temporal, height and width of feature shape of each image in LLM.
+            video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
+                The temporal, height and width of feature shape of each video in LLM.
+            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
+
+        Returns:
+            position_ids (`torch.LongTensor` of shape `(3, batch_size, sequence_length)`)
+            mrope_position_deltas (`torch.Tensor` of shape `(batch_size)`)
+        """
+
+        spatial_merge_size = self.config.vision_config.spatial_merge_size
+        image_token_id = self.config.image_token_id
+        video_start_token_id = self.config.video_start_token_id
+        video_end_token_id = self.config.video_end_token_id
+
+        mrope_position_deltas = []
+        if input_ids is not None and (image_grid_thw is not None or video_grid_thw is not None):
+            total_input_ids = input_ids
+            if attention_mask is None:
+                attention_mask = torch.ones_like(total_input_ids)
+            position_ids = torch.ones(
+                3,
+                input_ids.shape[0],
+                input_ids.shape[1],
+                dtype=input_ids.dtype,
+                device=input_ids.device,
+            )
+            image_index, video_index = 0, 0
+            video_group_index = 0
+            attention_mask = attention_mask.to(total_input_ids.device)
+            for i, input_ids in enumerate(total_input_ids):
+                input_ids = input_ids[attention_mask[i] == 1]
+                input_tokens = input_ids.tolist()
+
+                input_token_type = []
+                video_check_flg = False
+                for token in input_tokens:
+                    if token == video_start_token_id:
+                        video_check_flg = True
+                    elif token == video_end_token_id:
+                        video_check_flg = False
+
+                    if token == image_token_id and not video_check_flg:
+                        input_token_type.append("image")
+                    elif token == image_token_id and video_check_flg:
+                        input_token_type.append("video")
+                    else:
+                        input_token_type.append("text")
+
+                input_type_group = []
+                for key, group in itertools.groupby(enumerate(input_token_type), lambda x: x[1]):
+                    group = list(group)
+                    start_index = group[0][0]
+                    end_index = group[-1][0] + 1
+                    input_type_group.append((key, start_index, end_index))
+
+                llm_pos_ids_list = []
+                video_frame_num = 1
+                for modality_type, start_idx, end_idx in input_type_group:
+                    st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
+
+                    if modality_type == "image":
+                        t, h, w = (
+                            image_grid_thw[image_index][0],
+                            image_grid_thw[image_index][1],
+                            image_grid_thw[image_index][2],
+                        )
+                        llm_grid_t, llm_grid_h, llm_grid_w = (
+                            t.item(),
+                            h.item() // spatial_merge_size,
+                            w.item() // spatial_merge_size,
+                        )
+
+                        t_index = torch.arange(llm_grid_t).view(-1, 1).expand(-1, llm_grid_h * llm_grid_w).flatten()
+                        h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand(llm_grid_t, -1, llm_grid_w).flatten()
+                        w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand(llm_grid_t, llm_grid_h, -1).flatten()
+                        llm_pos_ids_list.append(torch.stack([t_index, h_index, w_index]) + st_idx)
+
+                        image_index += 1
+                        video_frame_num = 1
+
+                    elif modality_type == "video":
+                        t, h, w = (
+                            video_frame_num,
+                            video_grid_thw[video_index][1],
+                            video_grid_thw[video_index][2],
+                        )
+
+                        llm_grid_t, llm_grid_h, llm_grid_w = (
+                            t,
+                            h.item() // spatial_merge_size,
+                            w.item() // spatial_merge_size,
+                        )
+
+                        for t_idx in range(llm_grid_t):
+                            t_index = torch.tensor(t_idx).view(-1, 1).expand(-1, llm_grid_h * llm_grid_w).flatten()
+
+                            h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand(1, -1, llm_grid_w).flatten()
+                            w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand(1, llm_grid_h, -1).flatten()
+                            llm_pos_ids_list.append(torch.stack([t_index, h_index, w_index]) + st_idx)
+
+                        video_group_index += 1
+
+                        if video_group_index >= video_grid_thw[video_index][0]:
+                            video_index += 1
+                            video_group_index = 0
+
+                        video_frame_num += 1
+
+                    else:
+                        text_len = end_idx - start_idx
+                        llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx)
+
+                        video_frame_num = 1
+
+                llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
+                position_ids[..., i, attention_mask[i] == 1] = llm_positions.to(position_ids.device)
+                mrope_position_deltas.append(llm_positions.max() + 1 - len(total_input_ids[i]))
+            mrope_position_deltas = torch.tensor(mrope_position_deltas, device=input_ids.device).unsqueeze(1)
+            return position_ids, mrope_position_deltas
+        else:
+            if attention_mask is not None:
+                position_ids = attention_mask.long().cumsum(-1) - 1
+                position_ids.masked_fill_(attention_mask == 0, 1)
+                position_ids = position_ids.unsqueeze(0).expand(3, -1, -1).to(attention_mask.device)
+                max_position_ids = position_ids.max(0, keepdim=False)[0].max(-1, keepdim=True)[0]
+                mrope_position_deltas = max_position_ids + 1 - attention_mask.shape[-1]
+            else:
+                position_ids = (
+                    torch.arange(input_ids.shape[1], device=input_ids.device)
+                    .view(1, 1, -1)
+                    .expand(3, input_ids.shape[0], -1)
+                )
+                mrope_position_deltas = torch.zeros(
+                    [input_ids.shape[0], 1],
+                    device=input_ids.device,
+                    dtype=input_ids.dtype,
+                )
+
+            return position_ids, mrope_position_deltas
+
+    def get_video_features(
+        self, pixel_values_videos: torch.FloatTensor, video_grid_thw: Optional[torch.LongTensor] = None
+    ):
+        """
+        Run the vision tower on video patches and return one embedding tensor per video.
+
+        Args:
+            pixel_values_videos (`torch.FloatTensor`):
+                The tensors corresponding to the input videos; cast to `self.visual.dtype` first.
+            video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
+                The temporal, height and width of feature shape of each video in LLM.
+        Returns:
+            Tuple of tensors produced by `torch.split`, one chunk per row of `video_grid_thw`.
+        """
+        video_patches = pixel_values_videos.type(self.visual.dtype)
+        # Each video row (t, h, w) becomes t single-frame rows (1, h, w) for the vision tower.
+        per_frame_grids = [
+            torch.tensor([1, rows.item(), cols.item()]).unsqueeze(0).repeat(frames, 1)
+            for frames, rows, cols in video_grid_thw
+        ]
+        frame_features = self.visual(video_patches, grid_thw=torch.cat(per_frame_grids, dim=0))
+        tokens_per_video = (video_grid_thw.prod(-1) // self.visual.spatial_merge_size**2).tolist()
+        return torch.split(frame_features, tokens_per_video)
+
+    def get_image_features(self, pixel_values: torch.FloatTensor, image_grid_thw: Optional[torch.LongTensor] = None):
+        """
+        Run the vision tower on image patches and return one embedding tensor per image.
+
+        Args:
+            pixel_values (`torch.FloatTensor`):
+                The tensors corresponding to the input images; cast to `self.visual.dtype` first.
+            image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
+                The temporal, height and width of feature shape of each image in LLM.
+        Returns:
+            Tuple of tensors produced by `torch.split`, one chunk per row of `image_grid_thw`.
+        """
+        vision_tower = self.visual
+        patch_features = vision_tower(pixel_values.type(vision_tower.dtype), grid_thw=image_grid_thw)
+        return torch.split(patch_features, (image_grid_thw.prod(-1) // vision_tower.spatial_merge_size**2).tolist())
+
+    def get_placeholder_mask(
+        self,
+        input_ids: torch.LongTensor,
+        inputs_embeds: torch.FloatTensor,
+        image_features: torch.FloatTensor = None,
+        video_features: torch.FloatTensor = None,
+    ):
+        """
+        Obtain the image and video placeholder masks from `input_ids` (or from `inputs_embeds` when `input_ids`
+        is None), expanded to the shape of `inputs_embeds`. Raises a `ValueError` when the number of masked
+        embedding elements differs from the number of elements in the supplied features.
+        """
+        device = inputs_embeds.device
+        if input_ids is not None:
+            # GLM-4.1V and GLM-4.5V special_video_mask is special_image_mask
+            image_positions = input_ids == self.config.image_token_id
+            video_positions = input_ids == self.config.image_token_id
+        else:
+            # Without token ids, a position is a placeholder when its whole embedding equals the token's embedding.
+            embed = self.get_input_embeddings()
+            image_ref = embed(torch.tensor(self.config.image_token_id, dtype=torch.long, device=device))
+            video_ref = embed(torch.tensor(self.config.video_token_id, dtype=torch.long, device=device))
+            image_positions = (inputs_embeds == image_ref).all(-1)
+            video_positions = (inputs_embeds == video_ref).all(-1)
+
+        n_image_tokens = image_positions.sum()
+        special_image_mask = image_positions.unsqueeze(-1).expand_as(inputs_embeds).to(device)
+        if image_features is not None and inputs_embeds[special_image_mask].numel() != image_features.numel():
+            raise ValueError(
+                f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {image_features.shape[0]}"
+            )
+
+        n_video_tokens = video_positions.sum()
+        special_video_mask = video_positions.unsqueeze(-1).expand_as(inputs_embeds).to(device)
+        if video_features is not None and inputs_embeds[special_video_mask].numel() != video_features.numel():
+            raise ValueError(
+                f"Videos features and video tokens do not match: tokens: {n_video_tokens}, features {video_features.shape[0]}"
+            )
+
+        return special_image_mask, special_video_mask
+
+    @auto_docstring
+    @can_return_tuple
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[list[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        pixel_values: Optional[torch.Tensor] = None,
+        pixel_values_videos: Optional[torch.FloatTensor] = None,
+        image_grid_thw: Optional[torch.LongTensor] = None,
+        video_grid_thw: Optional[torch.LongTensor] = None,
+        rope_deltas: Optional[torch.LongTensor] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> Union[tuple, Glm4vModelOutputWithPast]:
+        r"""
+        image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
+            The temporal, height and width of feature shape of each image in LLM.
+        video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
+            The temporal, height and width of feature shape of each video in LLM.
+        rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*):
+            The rope index difference between sequence length and multimodal rope. Not read by this method.
+        """
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+        if inputs_embeds is None:
+            inputs_embeds = self.get_input_embeddings()(input_ids)
+
+        if pixel_values is not None:
+            image_embeds = self.get_image_features(pixel_values, image_grid_thw)
+            image_embeds = torch.cat(image_embeds, dim=0).to(inputs_embeds.device, inputs_embeds.dtype)
+            image_mask, _ = self.get_placeholder_mask(input_ids, inputs_embeds, image_features=image_embeds)
+            inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_embeds)  # fill image placeholders
+
+        if pixel_values_videos is not None:
+            video_embeds = self.get_video_features(pixel_values_videos, video_grid_thw)
+            video_embeds = torch.cat(video_embeds, dim=0).to(inputs_embeds.device, inputs_embeds.dtype)
+            _, video_mask = self.get_placeholder_mask(input_ids, inputs_embeds, video_features=video_embeds)
+            inputs_embeds = inputs_embeds.masked_scatter(video_mask, video_embeds)  # fill video placeholders
+
+        if position_ids is None:
+            attention_mask_tensor = (
+                attention_mask if not isinstance(attention_mask, dict) else attention_mask["full_attention"]
+            )
+            if attention_mask_tensor is not None and attention_mask_tensor.ndim == 4:
+                attention_mask_tensor = torch.diagonal(attention_mask_tensor[:, 0], dim1=1, dim2=2)
+                # Only apply conversion for floating point tensors (inverted masks)
+                if attention_mask_tensor.dtype.is_floating_point:
+                    attention_mask_tensor = attention_mask_tensor / torch.finfo(attention_mask_tensor.dtype).min
+                    attention_mask_tensor = (1.0 - attention_mask_tensor).int()
+
+            # Calculate RoPE index once per generation in the pre-fill stage only.
+            # When compiling, we can't check tensor values thus we check only input length
+            # It is safe to assume that `length!=1` means we're in pre-fill because compiled
+            # models currently cannot do assisted decoding
+            prefill_compiled_stage = is_torchdynamo_compiling() and (
+                (input_ids is not None and input_ids.shape[1] != 1)
+                or (inputs_embeds is not None and inputs_embeds.shape[1] != 1)
+            )
+            prefill_noncompiled_stage = not is_torchdynamo_compiling() and (
+                (cache_position is not None and cache_position[0] == 0)
+                or (past_key_values is None or past_key_values.get_seq_length() == 0)
+            )
+            if (prefill_compiled_stage or prefill_noncompiled_stage) or self.rope_deltas is None:
+                position_ids, rope_deltas = self.get_rope_index(
+                    input_ids,
+                    image_grid_thw,
+                    video_grid_thw,
+                    attention_mask=attention_mask_tensor,
+                )
+                self.rope_deltas = rope_deltas
+            # Otherwise reuse the rope deltas cached on `self.rope_deltas` to build the position ids
+            else:
+                batch_size, seq_length, _ = inputs_embeds.shape
+                delta = (
+                    (cache_position[0] + self.rope_deltas).to(inputs_embeds.device)
+                    if cache_position is not None
+                    else 0
+                )
+                position_ids = torch.arange(seq_length, device=inputs_embeds.device)
+                position_ids = position_ids.view(1, -1).expand(batch_size, -1)
+                if cache_position is not None:  # otherwise `delta` is an int `0`
+                    delta = delta.repeat_interleave(batch_size // delta.shape[0], dim=0)
+                position_ids = position_ids.add(delta)
+                position_ids = position_ids.unsqueeze(0).expand(3, -1, -1)
+
+        outputs = self.language_model(
+            input_ids=None,  # token ids are not forwarded; `inputs_embeds` carries the input
+            position_ids=position_ids,
+            attention_mask=attention_mask,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            cache_position=cache_position,
+            **kwargs,
+        )
+
+        return Glm4vModelOutputWithPast(
+            last_hidden_state=outputs.last_hidden_state,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            rope_deltas=self.rope_deltas,
+        )
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Base class for Glm4v causal language model (or autoregressive) outputs.
+    """
+)
+class Glm4vCausalLMOutputWithPast(ModelOutput):
+    r"""
+    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+        Language modeling loss (for next-token prediction).
+    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
+        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
+        `(batch_size, num_heads, sequence_length, embed_size_per_head)`.
+
+        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
+        `past_key_values` input) to speed up sequential decoding.
+    rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*):
+        The rope index difference between sequence length and multimodal rope.
+    """
+
+    loss: Optional[torch.FloatTensor] = None
+    logits: Optional[torch.FloatTensor] = None
+    past_key_values: Optional[list[torch.FloatTensor]] = None
+    hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    attentions: Optional[tuple[torch.FloatTensor]] = None
+    rope_deltas: Optional[torch.LongTensor] = None
+
+
+class Glm4vForConditionalGeneration(Glm4vPreTrainedModel, GenerationMixin):
+    _checkpoint_conversion_mapping = {}
+    _tied_weights_keys = ["lm_head.weight"]
+    # Reference: fix gemma3 grad acc #37208
+    accepts_loss_kwargs = False
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.model = Glm4vModel(config)  # multimodal backbone that `forward` and the accessors delegate to
+        self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)
+
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.model.get_input_embeddings()
+
+    def set_input_embeddings(self, value):
+        self.model.set_input_embeddings(value)
+
+    def set_decoder(self, decoder):
+        self.model.set_decoder(decoder)
+
+    def get_decoder(self):
+        return self.model.get_decoder()
+
+    def get_video_features(
+        self, pixel_values_videos: torch.FloatTensor, video_grid_thw: Optional[torch.LongTensor] = None
+    ):
+        return self.model.get_video_features(pixel_values_videos, video_grid_thw)
+
+    def get_image_features(self, pixel_values: torch.FloatTensor, image_grid_thw: Optional[torch.LongTensor] = None):
+        return self.model.get_image_features(pixel_values, image_grid_thw)
+
+    # Make modules available through conditional class for BC
+    @property
+    def language_model(self):
+        return self.model.language_model
+
+    @property
+    def visual(self):
+        return self.model.visual
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[list[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        pixel_values: Optional[torch.Tensor] = None,
+        pixel_values_videos: Optional[torch.FloatTensor] = None,
+        image_grid_thw: Optional[torch.LongTensor] = None,
+        video_grid_thw: Optional[torch.LongTensor] = None,
+        rope_deltas: Optional[torch.LongTensor] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> Union[tuple, Glm4vCausalLMOutputWithPast]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+        image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
+            The temporal, height and width of feature shape of each image in LLM.
+        video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
+            The temporal, height and width of feature shape of each video in LLM.
+        rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*):
+            The rope index difference between sequence length and multimodal rope. Not forwarded to `self.model`.
+
+        Example:
+
+        ```python
+        >>> from PIL import Image
+        >>> import requests
+        >>> from transformers import AutoProcessor, Glm4vForConditionalGeneration
+
+        >>> model = Glm4vForConditionalGeneration.from_pretrained("THUDM/GLM-4.1V-9B-Thinking")
+        >>> processor = AutoProcessor.from_pretrained("THUDM/GLM-4.1V-9B-Thinking")
+
+        >>> messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image"},
+                    {"type": "text", "text": "What is shown in this image?"},
+                ],
+            },
+        ]
+        >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+        >>> inputs = processor(text=[text], images=[image], return_tensors="pt")
+
+        >>> # Generate
+        >>> generate_ids = model.generate(**inputs, max_new_tokens=30)
+        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "The image shows a street scene with a red stop sign in the foreground. In the background, there is a large red gate with Chinese characters ..."
+        ```"""
+        outputs = self.model(
+            input_ids=input_ids,
+            pixel_values=pixel_values,
+            pixel_values_videos=pixel_values_videos,
+            image_grid_thw=image_grid_thw,
+            video_grid_thw=video_grid_thw,
+            position_ids=position_ids,
+            attention_mask=attention_mask,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            cache_position=cache_position,
+            **kwargs,
+        )
+
+        hidden_states = outputs[0]
+
+        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
+        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+        logits = self.lm_head(hidden_states[:, slice_indices, :])
+
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size)
+
+        return Glm4vCausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            rope_deltas=outputs.rope_deltas,
+        )
+
+    def prepare_inputs_for_generation(
+        self,
+        input_ids,
+        past_key_values=None,
+        attention_mask=None,
+        inputs_embeds=None,
+        cache_position=None,
+        position_ids=None,
+        use_cache=True,
+        pixel_values=None,
+        pixel_values_videos=None,
+        image_grid_thw=None,
+        video_grid_thw=None,
+        **kwargs,
+    ):
+        # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
+
+        model_inputs = super().prepare_inputs_for_generation(
+            input_ids,
+            past_key_values=past_key_values,
+            attention_mask=attention_mask,
+            inputs_embeds=inputs_embeds,
+            cache_position=cache_position,
+            position_ids=position_ids,
+            pixel_values=pixel_values,
+            pixel_values_videos=pixel_values_videos,
+            image_grid_thw=image_grid_thw,
+            video_grid_thw=video_grid_thw,
+            use_cache=use_cache,
+            **kwargs,
+        )
+
+        # GLM-4.1V position_ids are prepared with rope_deltas in forward
+        model_inputs["position_ids"] = None
+        # Visual inputs are dropped once decoding has moved past position 0; a missing `cache_position` keeps them.
+        if cache_position is not None and cache_position[0] != 0:
+            model_inputs["pixel_values"] = None
+            model_inputs["pixel_values_videos"] = None
+
+        return model_inputs
+
+    def _get_image_nums_and_video_nums(
+        self,
+        input_ids: Optional[torch.LongTensor],
+        inputs_embeds: Optional[torch.Tensor] = None,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """
+        Get the number of images and videos for each sample to calculate the separation length of the sample tensor.
+        These parameters are not passed through the processor to avoid unpredictable impacts from interface modifications.
+
+        Args:
+            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+                Indices of input sequence tokens in the vocabulary. Only used when `inputs_embeds` is None.
+
+        Returns:
+            image_nums (`torch.LongTensor` of shape `(batch_size,)`): image-start tokens outside video spans, per sample.
+            video_nums (`torch.LongTensor` of shape `(batch_size,)`): video-start tokens, per sample.
+        """
+
+        if inputs_embeds is not None:
+            is_image = (
+                inputs_embeds
+                == self.get_input_embeddings()(
+                    torch.tensor(self.config.image_start_token_id, dtype=torch.long, device=inputs_embeds.device)
+                )
+            )[..., 0]  # only the first embedding component is compared
+            is_video_start = (
+                inputs_embeds
+                == self.get_input_embeddings()(
+                    torch.tensor(self.config.video_start_token_id, dtype=torch.long, device=inputs_embeds.device)
+                )
+            )[..., 0]
+            is_video_end = (
+                inputs_embeds
+                == self.get_input_embeddings()(
+                    torch.tensor(self.config.video_end_token_id, dtype=torch.long, device=inputs_embeds.device)
+                )
+            )[..., 0]
+        else:
+            is_image = input_ids == self.config.image_start_token_id
+            is_video_start = input_ids == self.config.video_start_token_id
+            is_video_end = input_ids == self.config.video_end_token_id
+
+        # Cumulative sum to track if we're inside a video span
+        # We'll assume well-formed video tags (i.e. matching starts and ends)
+        video_level = torch.cumsum(is_video_start.int() - is_video_end.int(), dim=1)
+        inside_video = video_level > 0  # shape (batch_size, seq_length)
+
+        # Mask out image tokens that are inside video spans
+        standalone_images = is_image & (~inside_video)
+
+        # Count per batch
+        image_counts = standalone_images.sum(dim=1)
+        video_counts = is_video_start.sum(dim=1)
+
+        return image_counts, video_counts
+
+    def _expand_inputs_for_generation(
+        self,
+        expand_size: int = 1,
+        is_encoder_decoder: bool = False,
+        input_ids: Optional[torch.LongTensor] = None,
+        **model_kwargs,
+    ) -> tuple[torch.LongTensor, dict[str, Any]]:
+        # Overwritten -- Support for expanding tensors without a batch size dimension
+        # e.g., pixel_values, image_grid_thw, pixel_values_videos, video_grid_thw, second_per_grid_ts
+        # pixel_values.shape[0] is sum(seqlen_images for samples)
+        # image_grid_thw.shape[0] is sum(num_images for samples)
+
+        if expand_size == 1:
+            return input_ids, model_kwargs
+
+        visual_keys = ["pixel_values", "image_grid_thw", "pixel_values_videos", "video_grid_thw", "second_per_grid_ts"]
+
+        def _expand_dict_for_generation_visual(dict_to_expand):
+            image_grid_thw = model_kwargs.get("image_grid_thw", None)
+            video_grid_thw = model_kwargs.get("video_grid_thw", None)
+            image_nums, video_nums = self._get_image_nums_and_video_nums(
+                input_ids, inputs_embeds=model_kwargs.get("inputs_embeds", None)
+            )
+
+            def _repeat_interleave_samples(x, lengths, repeat_times):
+                samples = torch.split(x, lengths)
+                repeat_args = [repeat_times] + [1] * (x.dim() - 1)
+                result = torch.cat([sample.repeat(*repeat_args) for sample in samples], dim=0)
+                return result
+
+            for key in dict_to_expand:
+                if key == "pixel_values":
+                    # split images into samples
+                    samples = torch.split(image_grid_thw, list(image_nums))
+                    # compute the sequence length of images for each sample
+                    lengths = [torch.prod(sample, dim=1).sum() for sample in samples]
+                    dict_to_expand[key] = _repeat_interleave_samples(
+                        dict_to_expand[key], lengths=lengths, repeat_times=expand_size
+                    )
+                elif key == "image_grid_thw":
+                    # get the num of images for each sample
+                    lengths = list(image_nums)
+                    dict_to_expand[key] = _repeat_interleave_samples(
+                        dict_to_expand[key], lengths=lengths, repeat_times=expand_size
+                    )
+                elif key == "pixel_values_videos":
+                    samples = torch.split(video_grid_thw, list(video_nums))
+                    lengths = [torch.prod(sample, dim=1).sum() for sample in samples]
+                    dict_to_expand[key] = _repeat_interleave_samples(
+                        dict_to_expand[key], lengths=lengths, repeat_times=expand_size
+                    )
+                elif key == "video_grid_thw":
+                    lengths = list(video_nums)
+                    dict_to_expand[key] = _repeat_interleave_samples(
+                        dict_to_expand[key], lengths=lengths, repeat_times=expand_size
+                    )
+                elif key == "second_per_grid_ts":
+                    dict_to_expand[key] = _repeat_interleave_samples(
+                        dict_to_expand[key], lengths=list(video_nums), repeat_times=expand_size
+                    )
+            return dict_to_expand
+
+        def _expand_dict_for_generation(dict_to_expand):
+            for key in dict_to_expand:
+                if (
+                    key != "cache_position"
+                    and dict_to_expand[key] is not None
+                    and isinstance(dict_to_expand[key], torch.Tensor)
+                    and key not in visual_keys
+                ):
+                    dict_to_expand[key] = dict_to_expand[key].repeat_interleave(expand_size, dim=0)
+            return dict_to_expand
+
+        model_kwargs = _expand_dict_for_generation_visual(model_kwargs)
+
+        if input_ids is not None:
+            input_ids = input_ids.repeat_interleave(expand_size, dim=0)
+
+        model_kwargs = _expand_dict_for_generation(model_kwargs)
+
+        if is_encoder_decoder:
+            if model_kwargs.get("encoder_outputs") is None:
+                raise ValueError("If `is_encoder_decoder` is True, make sure that `encoder_outputs` is defined.")
+            model_kwargs["encoder_outputs"] = _expand_dict_for_generation(model_kwargs["encoder_outputs"])
+
+        return input_ids, model_kwargs
+
+
+__all__ = ["Glm4vForConditionalGeneration", "Glm4vModel", "Glm4vPreTrainedModel", "Glm4vTextModel"]
diff --git a/phivenv/Lib/site-packages/transformers/models/glm4v/modular_glm4v.py b/phivenv/Lib/site-packages/transformers/models/glm4v/modular_glm4v.py
new file mode 100644
index 0000000000000000000000000000000000000000..f1bdc171320441e821272d6622355c1907bf020d
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/glm4v/modular_glm4v.py
@@ -0,0 +1,1682 @@
+# coding=utf-8
+# Copyright 2025 The ZhipuAI Inc. team and HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import itertools
+from typing import Callable, Optional, Union
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.nn import LayerNorm
+
+from ...activations import ACT2FN
+from ...cache_utils import Cache, DynamicCache
+from ...configuration_utils import PretrainedConfig
+from ...feature_extraction_utils import BatchFeature
+from ...image_utils import ImageInput
+from ...masking_utils import create_causal_mask
+from ...modeling_flash_attention_utils import FlashAttentionKwargs
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import BaseModelOutputWithPast
+from ...modeling_rope_utils import rope_config_validation
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
+from ...processing_utils import ImagesKwargs, Unpack
+from ...tokenization_utils_base import PreTokenizedInput, TextInput
+from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, is_torchdynamo_compiling, logging
+from ...utils.generic import check_model_inputs
+from ...video_utils import VideoInput
+from ..glm4.modeling_glm4 import Glm4MLP, Glm4RMSNorm, eager_attention_forward
+from ..qwen2_5_vl.configuration_qwen2_5_vl import Qwen2_5_VLConfig
+from ..qwen2_5_vl.modeling_qwen2_5_vl import (
+    Qwen2_5_VisionPatchEmbed,
+    Qwen2_5_VisionRotaryEmbedding,
+    Qwen2_5_VLCausalLMOutputWithPast,
+    Qwen2_5_VLForConditionalGeneration,
+    Qwen2_5_VLMLP,
+    Qwen2_5_VLModel,
+    Qwen2_5_VLModelOutputWithPast,
+    Qwen2_5_VLPreTrainedModel,
+    Qwen2_5_VLRotaryEmbedding,
+    Qwen2_5_VLTextModel,
+    Qwen2_5_VLVisionAttention,
+    Qwen2_5_VLVisionBlock,
+)
+from ..qwen2_5_vl.processing_qwen2_5_vl import Qwen2_5_VLVideosProcessorKwargs
+from ..qwen2_vl.processing_qwen2_vl import (
+    Qwen2_VLProcessor,
+    Qwen2_VLProcessorKwargs,
+)
+
+
+logger = logging.get_logger(__name__)
+
+
+class Glm4vVisionConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Glm4vVisionModel`]. It is used to instantiate a Glm4vVisionModel
+    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield
+    a similar configuration to that of
+    GLM-4.1V-9B-Thinking [THUDM/GLM-4.1V-9B-Thinking](https://huggingface.co/THUDM/GLM-4.1V-9B-Thinking).
+
+    Args:
+        depth (`int`, *optional*, defaults to 24):
+            Number of layers (depth) in the model.
+        hidden_size (`int`, *optional*, defaults to 1536):
+            Dimensionality of the encoder layers and the pooler layer.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"selu"` and `"gelu_new"` are supported.
+        attention_bias (`bool`, *optional*, defaults to `False`):
+            Whether to add a bias to the queries, keys and values.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            Dropout probability for attention weights.
+        num_heads (`int`, *optional*, defaults to 12):
+            Number of attention heads for each attention layer in the vision encoder.
+        in_channels (`int`, *optional*, defaults to 3):
+            The number of input channels.
+        image_size (`int`, *optional*, defaults to 336):
+            The size (resolution) of each image. Used as a single integer side length when sizing the
+            position-embedding table of the vision embeddings.
+        patch_size (`int`, *optional*, defaults to 14):
+            The size (resolution) of each patch.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the rms normalization layers.
+        spatial_merge_size (`int`, *optional*, defaults to 2):
+            The size used for merging spatial dimensions.
+        temporal_patch_size (`int`, *optional*, defaults to 2):
+            The size used for patches along the temporal dimension.
+        out_hidden_size (`int`, *optional*, defaults to 4096):
+            The output hidden size of the vision model.
+        intermediate_size (`int`, *optional*, defaults to 13696):
+            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+
+    Example:
+
+    ```python
+    >>> from transformers import Glm4vVisionConfig, Glm4vVisionModel
+
+    >>> # Initializing a Glm4vVisionConfig GLM-4.1V-9B style configuration
+    >>> configuration = Glm4vVisionConfig()
+
+    >>> # Initializing a model (with random weights) from the GLM-4.1V-9B configuration
+    >>> model = Glm4vVisionModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "glm4v"
+    base_config_key = "vision_config"
+
+    def __init__(
+        self,
+        depth=24,
+        hidden_size=1536,
+        hidden_act="silu",
+        attention_bias=False,
+        attention_dropout=0.0,
+        num_heads=12,
+        in_channels=3,
+        image_size=336,
+        patch_size=14,
+        rms_norm_eps=1e-05,
+        spatial_merge_size=2,
+        temporal_patch_size=2,
+        out_hidden_size=4096,
+        intermediate_size=13696,
+        initializer_range=0.02,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        self.depth = depth
+        self.hidden_size = hidden_size
+        self.hidden_act = hidden_act
+        self.num_heads = num_heads
+        self.in_channels = in_channels
+        self.image_size = image_size
+        self.patch_size = patch_size
+        self.spatial_merge_size = spatial_merge_size
+        self.temporal_patch_size = temporal_patch_size
+        self.out_hidden_size = out_hidden_size
+        self.intermediate_size = intermediate_size
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+
+
+class Glm4vTextConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Glm4vModel`]. It is used to instantiate a
+    GLM-4.1V model according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of
+    GLM-4.1V-9B-Thinking [THUDM/GLM-4.1V-9B-Thinking](https://huggingface.co/THUDM/GLM-4.1V-9B-Thinking).
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 151552):
+            Vocabulary size of the Glm4v model. Defines the number of different tokens that can be represented by the
+            `input_ids` passed when calling [`Glm4vModel`]
+        hidden_size (`int`, *optional*, defaults to 4096):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 13696):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 40):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 32):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        num_key_value_heads (`int`, *optional*, defaults to 2):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details checkout [this
+            paper](https://huggingface.co/papers/2305.13245). If `None`, it falls back to `num_attention_heads`.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the decoder.
+        max_position_embeddings (`int`, *optional*, defaults to 32768):
+            The maximum sequence length that this model might ever be used with.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether the model's input and output word embeddings should be tied.
+        rope_theta (`float`, *optional*, defaults to 10000.0):
+            The base period of the RoPE embeddings.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        rope_scaling (`Dict`, *optional*):
+            Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
+            and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
+            accordingly.
+            Expected contents:
+                `rope_type` (`str`):
+                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
+                    'llama3'], with 'default' being the original RoPE implementation.
+                `factor` (`float`, *optional*):
+                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
+                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
+                    original maximum pre-trained length.
+                `original_max_position_embeddings` (`int`, *optional*):
+                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
+                    pretraining.
+                `attention_factor` (`float`, *optional*):
+                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
+                    computation. If unspecified, it defaults to value recommended by the implementation, using the
+                    `factor` field to infer the suggested value.
+        image_token_id (`int`, *optional*):
+            Token index used as placeholder for image embeddings.
+        video_token_id (`int`, *optional*):
+            Token index used as placeholder for video embeddings.
+
+    ```python
+    >>> from transformers import Glm4vTextModel, Glm4vTextConfig
+
+    >>> # Initializing a GLM-4.1V style configuration
+    >>> configuration = Glm4vTextConfig()
+
+    >>> # Initializing a model from the GLM-4.1V style configuration
+    >>> model = Glm4vTextModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "glm4v_text"
+    base_config_key = "text_config"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    # Default tensor parallel plan for base model `Glm4v`
+    base_model_tp_plan = {
+        "layers.*.self_attn.q_proj": "colwise",
+        "layers.*.self_attn.k_proj": "colwise",
+        "layers.*.self_attn.v_proj": "colwise",
+        "layers.*.self_attn.o_proj": "rowwise",
+        "layers.*.mlp.gate_up_proj": "colwise_rep",  # we need to replicate here due to the `chunk` operation
+        "layers.*.mlp.down_proj": "rowwise_rep",  # we need to replicate here due to the `chunk` operation
+    }
+    base_model_pp_plan = {
+        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
+        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
+        "norm": (["hidden_states"], ["hidden_states"]),
+    }
+
+    def __init__(
+        self,
+        vocab_size=151552,
+        hidden_size=4096,
+        intermediate_size=13696,
+        num_hidden_layers=40,
+        num_attention_heads=32,
+        num_key_value_heads=2,
+        hidden_act="silu",
+        max_position_embeddings=32768,
+        initializer_range=0.02,
+        rms_norm_eps=1e-05,
+        use_cache=True,
+        tie_word_embeddings=False,
+        rope_theta=10000.0,
+        attention_dropout=0.0,
+        rope_scaling=None,
+        image_token_id=None,
+        video_token_id=None,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+
+        # for backward compatibility: `None` means one key/value head per attention head
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.attention_dropout = attention_dropout
+        self.rope_scaling = rope_scaling
+
+        # Validate the correctness of rotary position embeddings parameters
+        # BC: a legacy 'type' key is copied to 'rope_type' (the caller's dict is updated in place).
+        if self.rope_scaling is not None and "type" in self.rope_scaling:
+            self.rope_scaling["rope_type"] = self.rope_scaling["type"]
+        rope_config_validation(self, ignore_keys={"mrope_section"})
+        self.image_token_id = image_token_id
+        self.video_token_id = video_token_id
+
+        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
+
+
+class Glm4vConfig(Qwen2_5_VLConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Glm4vModel`]. It is used to instantiate a
+    GLM-4.1V model according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of
+    GLM-4.1V-9B-Thinking [THUDM/GLM-4.1V-9B-Thinking](https://huggingface.co/THUDM/GLM-4.1V-9B-Thinking).
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        text_config (`Union[PreTrainedConfig, dict]`, *optional*, defaults to `Glm4vTextConfig`):
+            The config object or dictionary of the text backbone.
+        vision_config (`Union[PreTrainedConfig, dict]`,  *optional*, defaults to `Glm4vVisionConfig`):
+            The config object or dictionary of the vision backbone.
+        image_token_id (`int`, *optional*, defaults to 151343):
+            The image token index to encode the image prompt.
+        video_token_id (`int`, *optional*, defaults to 151344):
+            The video token index to encode the video prompt.
+        image_start_token_id (`int`, *optional*, defaults to 151339):
+            The image start token index to encode the start of image.
+        image_end_token_id (`int`, *optional*, defaults to 151340):
+            The image end token index to encode the end of image.
+        video_start_token_id (`int`, *optional*, defaults to 151341):
+            The video start token index to encode the start of video.
+        video_end_token_id (`int`, *optional*, defaults to 151342):
+            The video end token index to encode the end of video.
+
+    ```python
+    >>> from transformers import Glm4vForConditionalGeneration, Glm4vConfig
+
+    >>> # Initializing a GLM-4.1V style configuration
+    >>> configuration = Glm4vConfig()
+
+    >>> # Initializing a model from the GLM-4.1V style configuration
+    >>> model = Glm4vForConditionalGeneration(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    def __init__(
+        self,
+        text_config=None,
+        vision_config=None,
+        image_token_id=151343,
+        video_token_id=151344,
+        image_start_token_id=151339,
+        image_end_token_id=151340,
+        video_start_token_id=151341,
+        video_end_token_id=151342,
+        **kwargs,
+    ):
+        # NOTE(review): no argument is forwarded to the parent here — confirm the modular converter expands this call
+        super().__init__()
+        self.video_start_token_id = video_start_token_id
+        self.video_end_token_id = video_end_token_id
+        self.image_start_token_id = image_start_token_id
+        self.image_end_token_id = image_end_token_id
+
+
+# Subclass of Glm4RMSNorm with no overrides; will be used for both Text and Vision modalities
+class Glm4vRMSNorm(Glm4RMSNorm):
+    pass
+
+
+class Glm4VisionMlp(Qwen2_5_VLMLP):
+    def __init__(self, config, bias: bool = False):
+        super().__init__(config, bias)
+        self.intermediate_size = config.out_hidden_size  # assigned after the parent __init__ has run — TODO confirm
+
+
+class Glm4vVisionPatchEmbed(Qwen2_5_VisionPatchEmbed):
+    def __init__(self, config: Glm4vVisionConfig) -> None:
+        nn.Module.__init__(self)  # bypasses Qwen2_5_VisionPatchEmbed.__init__; all fields come from `config`
+        self.patch_size = config.patch_size
+        self.temporal_patch_size = config.temporal_patch_size
+        self.in_channels = config.in_channels
+        self.embed_dim = config.hidden_size
+        # Patches do not overlap: the Conv3d stride equals its (temporal, height, width) kernel size.
+        kernel_size = [self.temporal_patch_size, self.patch_size, self.patch_size]
+        self.proj = nn.Conv3d(self.in_channels, self.embed_dim, kernel_size=kernel_size, stride=kernel_size)
+
+
+class Glm4vVisionRotaryEmbedding(Qwen2_5_VisionRotaryEmbedding):
+    pass  # no overrides: everything is inherited from Qwen2_5_VisionRotaryEmbedding
+
+
+class Glm4vVisionPatchMerger(nn.Module):
+    def __init__(self, dim: int, context_dim: int, hidden_act: str, bias: bool = False) -> None:
+        super().__init__()  # submodules below keep their original creation order
+        self.proj = nn.Linear(dim, dim, bias=bias)
+        self.post_projection_norm = LayerNorm(dim)
+        self.gate_proj = nn.Linear(dim, context_dim, bias=bias)
+        self.up_proj = nn.Linear(dim, context_dim, bias=bias)
+        self.down_proj = nn.Linear(context_dim, dim, bias=bias)
+        self.act1 = nn.GELU()
+        self.act_fn = ACT2FN[hidden_act]
+
+    def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
+        normed = self.act1(self.post_projection_norm(self.proj(hidden_state)))  # project -> LayerNorm -> GELU
+        gated = self.act_fn(self.gate_proj(normed)) * self.up_proj(normed)  # gated MLP in `context_dim`
+        return self.down_proj(gated)  # back to `dim`
+
+
+class Glm4vVisionEmbeddings(nn.Module):
+    def __init__(self, config: Glm4vVisionConfig):
+        super().__init__()
+        self.config = config
+        self.embed_dim = config.hidden_size
+        self.image_size = config.image_size
+        self.patch_size = config.patch_size
+
+        self.num_patches = (self.image_size // self.patch_size) ** 2  # square grid; needs an int image_size
+        self.num_positions = self.num_patches
+        self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
+        self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)
+
+    def forward(self, embeddings, lengths, image_shapes, h_coords, w_coords) -> torch.Tensor:
+        """
+        Add position embeddings, bicubically interpolated from the learned 2D table, to every patch embedding.
+
+        Args:
+            embeddings: Input embeddings tensor; the interpolated position embeddings are added to it.
+            lengths (torch.Tensor or list): Sequence lengths for each image in the batch (a list is converted).
+            image_shapes (torch.Tensor): Tensor of shape [batch_size, 3] representing the image shapes (t, h, w).
+            h_coords (torch.Tensor): Tensor of shape [total_seq] representing the h coordinate for each patch.
+            w_coords (torch.Tensor): Tensor of shape [total_seq] representing the w coordinate for each patch.
+
+        Returns:
+            torch.Tensor: Embeddings with adapted position encoding added.
+        """
+        # Get position embedding parameters
+        pos_embed_weight = self.position_embedding.weight
+        hidden_size = pos_embed_weight.shape[1]
+        total_seq = h_coords.shape[0]
+        device = pos_embed_weight.device
+
+        # Move coordinates to the device that holds the position-embedding table
+        h_coords, w_coords = h_coords.to(device), w_coords.to(device)
+
+        # Handle empty sequence case (nothing to interpolate)
+        if total_seq == 0:
+            adapted_pos_embed = torch.empty(0, hidden_size, device=device, dtype=pos_embed_weight.dtype)
+        else:
+            # Convert inputs to tensors if needed
+            if isinstance(lengths, list):
+                lengths = torch.tensor(lengths, device=device, dtype=torch.long)
+            if not isinstance(image_shapes, torch.Tensor):
+                image_shapes = torch.tensor(image_shapes, device=device, dtype=torch.long)
+
+            # Reshape the learned table to (1, hidden_size, side, side); assumes a square number of positions
+            orig_size_sq = pos_embed_weight.shape[0]
+            orig_size = int(orig_size_sq**0.5)
+            pos_embed_2d = (
+                pos_embed_weight.view(orig_size, orig_size, hidden_size)
+                .permute(2, 0, 1)
+                .unsqueeze(0)
+                .to(device=device, dtype=torch.float32)
+            )
+
+            # Repeat each image's height / width once per patch of that image (`lengths[i]` times)
+            target_h = torch.cat([image_shapes[i, 1].repeat(lengths[i]) for i in range(len(lengths))]).to(
+                device=device, dtype=torch.float32
+            )
+            target_w = torch.cat([image_shapes[i, 2].repeat(lengths[i]) for i in range(len(lengths))]).to(
+                device=device, dtype=torch.float32
+            )
+
+            # Normalize coordinates to [-1, 1] range for grid_sample (+0.5 samples patch centres)
+            h_coords = h_coords.to(device=device, dtype=torch.float32)
+            w_coords = w_coords.to(device=device, dtype=torch.float32)
+            norm_w = ((w_coords + 0.5) / target_w) * 2 - 1
+            norm_h = ((h_coords + 0.5) / target_h) * 2 - 1
+
+            # Create sampling grid of shape (1, total_seq, 1, 2); grid_sample reads the last dim as (x, y) = (w, h)
+            grid = torch.stack((norm_w, norm_h), dim=-1).unsqueeze(0).unsqueeze(2)
+
+            # Perform bicubic interpolation in float32
+            interpolated_embed_fp32 = F.grid_sample(
+                pos_embed_2d, grid, mode="bicubic", align_corners=False, padding_mode="border"
+            )
+
+            # (1, hidden_size, total_seq, 1) -> (total_seq, hidden_size), then cast back to the original dtype
+            adapted_pos_embed_fp32 = interpolated_embed_fp32.squeeze(0).squeeze(-1).permute(1, 0)
+            adapted_pos_embed = adapted_pos_embed_fp32.to(pos_embed_weight.dtype).to(embeddings.device)
+
+        # Add adapted position encoding to embeddings
+        embeddings = embeddings + adapted_pos_embed
+        return embeddings
+
+
+class Glm4vVisionAttention(Qwen2_5_VLVisionAttention):
+    def __init__(self, config: Glm4vVisionConfig) -> None:
+        super().__init__(config)
+        self.attention_dropout = config.attention_dropout
+        self.qkv = nn.Linear(config.hidden_size, config.hidden_size * 3, bias=config.attention_bias)  # q, k, v fused
+        self.proj = nn.Linear(config.hidden_size, config.hidden_size, bias=False)  # bias-free even if attention_bias
+
+
+class Glm4vVisionBlock(Qwen2_5_VLVisionBlock):
+    def __init__(self, config) -> None:
+        super().__init__(config)  # NOTE(review): the parent presumably creates these attributes first; verify
+        self.norm1 = Glm4vRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.norm2 = Glm4vRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.attn = Glm4vVisionAttention(config)
+        self.mlp = Glm4VisionMlp(config, bias=False)  # the block's MLP is built without bias terms
+
+
+class Glm4vTextRotaryEmbedding(Qwen2_5_VLRotaryEmbedding):
+    pass  # no overrides: everything is inherited from Qwen2_5_VLRotaryEmbedding
+
+
+def rotate_half_llm(x):
+    """Rotates interleaved pairs of the last dim: (x0, x1, x2, x3, ...) -> (-x1, x0, -x3, x2, ...)."""
+    evens, odds = x[..., 0::2], x[..., 1::2]
+    rotated_pairs = torch.stack((-odds, evens), dim=-1)
+    return rotated_pairs.flatten(-2)
+
+
+def apply_multimodal_rotary_pos_emb(q, k, cos, sin, mrope_section, unsqueeze_dim=1):
+    """Rotate `q` and `k` with multimodal (temporal / height / width) rotary position embeddings.
+
+    Background (https://qwenlm.github.io/blog/qwen2-vl/):
+        Multimodal 3D RoPE extends the usual 1D RoPE. Vision tokens (images / videos) get separate rotary
+        positions along the temporal, height and width axes, and the channel dimension is split into chunks
+        that each follow one of those axes. For text tokens the three position indices are identical, so the
+        result is the same as ordinary 1D RoPE.
+
+    How it is computed here:
+        1. `mrope_section * 2` gives the chunk sizes used to split the last dimension of `cos` / `sin`.
+        2. Chunk `i` is read from index `i % 3` of the leading dimension, and the chunks are concatenated.
+        3. The first half of that result is kept and every value is repeated twice (`repeat_interleave`), which
+           matches the interleaved pairing used by `rotate_half_llm`.
+        4. Only the first `cos.shape[-1]` channels of `q` / `k` are rotated; any remaining channels are
+           appended unchanged.
+
+    Args:
+        q (`torch.Tensor`): The query tensor.
+        k (`torch.Tensor`): The key tensor.
+        cos (`torch.Tensor`): The cosine part of the rotary embedding.
+        sin (`torch.Tensor`): The sine part of the rotary embedding.
+        mrope_section(`List(int)`):
+            Channel sizes assigned to the temporal, height and width axes in the rope calculation.
+        unsqueeze_dim (`int`, *optional*, defaults to 1):
+            Dimension inserted into the mixed `cos` / `sin` so they broadcast against `q` and `k`. Use 1 when
+            `q` / `k` are laid out as [batch_size, heads, seq_len, head_dim] and 2 when they are laid out as
+            [batch_size, seq_len, heads, head_dim].
+
+    Returns:
+        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
+    """
+    doubled_sections = mrope_section * 2
+
+    def _mix_axes(table):
+        # Chunk i of the last dim comes from index i % 3 of the leading dim of `table`
+        chunks = table.split(doubled_sections, dim=-1)
+        mixed = torch.cat([chunk[idx % 3] for idx, chunk in enumerate(chunks)], dim=-1).unsqueeze(unsqueeze_dim)
+        # Interleave them instead of usual shape
+        return mixed[..., : mixed.shape[-1] // 2].repeat_interleave(2, dim=-1)
+
+    cos, sin = _mix_axes(cos), _mix_axes(sin)
+
+    # Keep half or full tensor for later concatenation
+    rotary_dim = cos.shape[-1]
+
+    # q and k go through the same rotate-then-concatenate steps
+    rotated = []
+    for states in (q, k):
+        rot_part, pass_part = states[..., :rotary_dim], states[..., rotary_dim:]
+        rot_part = (rot_part * cos) + (rotate_half_llm(rot_part) * sin)
+        rotated.append(torch.cat([rot_part, pass_part], dim=-1))
+
+    q_embed, k_embed = rotated
+    return q_embed, k_embed
+
+
+class Glm4vTextAttention(nn.Module):
+    """
+    Multi-headed self-attention for the GLM-4V text decoder. Rotary embeddings are applied per
+    `mrope_section` through `apply_multimodal_rotary_pos_emb`.
+    """
+
+    def __init__(self, config: Glm4vTextConfig, layer_idx: Optional[int] = None):
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+
+        self.hidden_size = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = self.hidden_size // self.num_heads
+        self.num_key_value_heads = config.num_key_value_heads
+        self.num_key_value_groups = self.num_heads // self.num_key_value_heads  # query heads per key/value head
+        self.is_causal = True
+        self.attention_dropout = config.attention_dropout
+        self.rope_scaling = config.rope_scaling
+        self.scaling = self.head_dim**-0.5
+
+        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=True)
+        self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True)
+        self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True)
+        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,  # accepted but not read in this method
+        past_key_values: Optional[Cache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
+        bsz, q_len, _ = hidden_states.size()
+
+        query_states = self.q_proj(hidden_states)
+        key_states = self.k_proj(hidden_states)
+        value_states = self.v_proj(hidden_states)
+
+        query_states = query_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
+        key_states = key_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
+
+        cos, sin = position_embeddings
+        query_states, key_states = apply_multimodal_rotary_pos_emb(  # diff with Llama
+            query_states, key_states, cos, sin, self.rope_scaling["mrope_section"]
+        )
+
+        if past_key_values is not None:
+            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}  # Specific to RoPE models
+            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        attention_interface: Callable = eager_attention_forward
+        if self.config._attn_implementation != "eager":
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            dropout=0.0 if not self.training else self.attention_dropout,
+            scaling=self.scaling,
+            **kwargs,
+        )
+
+        attn_output = attn_output.reshape(bsz, q_len, -1).contiguous()
+        attn_output = self.o_proj(attn_output)
+        return attn_output, attn_weights  # NOTE: two values are returned although the annotation lists three
+
+
+class Glm4vTextMLP(Glm4MLP):
+    pass  # no overrides: everything is inherited from Glm4MLP
+
+
+class Glm4vTextDecoderLayer(GradientCheckpointingLayer):
+    def __init__(self, config: Glm4vTextConfig, layer_idx: int):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.self_attn = Glm4vTextAttention(config, layer_idx)
+        self.mlp = Glm4vTextMLP(config)
+        self.input_layernorm = Glm4vRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = Glm4vRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_self_attn_layernorm = Glm4vRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_mlp_layernorm = Glm4vRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[tuple[torch.Tensor]] = None,
+        output_attentions: Optional[bool] = False,
+        use_cache: Optional[bool] = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
+    ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
+        residual = hidden_states
+
+        hidden_states = self.input_layernorm(hidden_states)
+
+        # Self Attention
+        hidden_states, _ = self.self_attn(
+            hidden_states=hidden_states,
+            position_embeddings=position_embeddings,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            output_attentions=output_attentions,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            **kwargs,
+        )
+
+        hidden_states = self.post_self_attn_layernorm(hidden_states)
+        hidden_states = residual + hidden_states
+
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = self.post_mlp_layernorm(hidden_states)
+        hidden_states = residual + hidden_states
+
+        return hidden_states
+
+
+# Output container returned by `Glm4vModel.forward`; identical to `Qwen2_5_VLModelOutputWithPast`.
+class Glm4vModelOutputWithPast(Qwen2_5_VLModelOutputWithPast):
+    pass
+
+
+# Common base class of the GLM-4V models; only overrides two class-level tables of the Qwen2.5-VL base.
+class Glm4vPreTrainedModel(Qwen2_5_VLPreTrainedModel):
+    # Names of module classes that, presumably, must not be split across devices when the model is
+    # sharded -- the exact semantics are defined by the base class (TODO confirm).
+    _no_split_modules = ["Glm4vTextDecoderLayer", "Glm4vVisionBlock"]
+    # Maps an output field name to the module class whose outputs feed it. Presumably consumed by the
+    # output-recording machinery of the base class / `check_model_inputs` (TODO confirm).
+    _can_record_outputs = {
+        "hidden_states": Glm4vTextDecoderLayer,
+        "attentions": Glm4vTextAttention,
+    }
+
+
+# Vision tower of GLM-4V. The processing pipeline itself is implemented in `forward`.
+class Glm4vVisionModel(Glm4vPreTrainedModel):
+    config: Glm4vVisionConfig
+    _no_split_modules = ["Glm4vVisionBlock"]
+
+    def __init__(self, config) -> None:
+        """Create all sub-modules of the vision tower from `config`."""
+        super().__init__(config)
+        self.spatial_merge_size = config.spatial_merge_size
+        self.patch_size = config.patch_size
+
+        self.embeddings = Glm4vVisionEmbeddings(config)
+        self.patch_embed = Glm4vVisionPatchEmbed(config)
+
+        # The rotary table is built with half of the per-head dimension; `forward` concatenates the
+        # gathered table with itself before taking cos/sin.
+        head_dim = config.hidden_size // config.num_heads
+        self.rotary_pos_emb = Glm4vVisionRotaryEmbedding(head_dim // 2)
+
+        self.blocks = nn.ModuleList([Glm4vVisionBlock(config) for _ in range(config.depth)])
+        self.merger = Glm4vVisionPatchMerger(
+            dim=config.out_hidden_size, context_dim=config.intermediate_size, hidden_act=config.hidden_act
+        )
+
+        self.post_conv_layernorm = Glm4vRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        # Kernel size == stride == spatial_merge_size, so each merge window is reduced to one output position.
+        self.downsample = nn.Conv2d(
+            in_channels=config.hidden_size,
+            out_channels=config.out_hidden_size,
+            kernel_size=config.spatial_merge_size,
+            stride=config.spatial_merge_size,
+        )
+        self.post_layernorm = Glm4vRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+        self.gradient_checkpointing = False
+        self.post_init()
+
+    def rot_pos_emb(self, grid_thw):
+        pos_ids = []
+        for t, h, w in grid_thw:
+            hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w)
+            hpos_ids = hpos_ids.reshape(
+                h // self.spatial_merge_size,
+                self.spatial_merge_size,
+                w // self.spatial_merge_size,
+                self.spatial_merge_size,
+            )
+            hpos_ids = hpos_ids.permute(0, 2, 1, 3)
+            hpos_ids = hpos_ids.flatten()
+
+            wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1)
+            wpos_ids = wpos_ids.reshape(
+                h // self.spatial_merge_size,
+                self.spatial_merge_size,
+                w // self.spatial_merge_size,
+                self.spatial_merge_size,
+            )
+            wpos_ids = wpos_ids.permute(0, 2, 1, 3)
+            wpos_ids = wpos_ids.flatten()
+            pos_ids.append(torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1))
+        pos_ids = torch.cat(pos_ids, dim=0)
+        max_grid_size = grid_thw[:, 1:].max()
+        rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size)
+        rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1)
+        return rotary_pos_emb, pos_ids
+
+    def forward(self, hidden_states: torch.Tensor, grid_thw: torch.Tensor) -> torch.Tensor:
+        """
+        Args:
+            hidden_states (`torch.Tensor` of shape `(seq_len, hidden_size)`):
+                The final hidden states of the model.
+            grid_thw (`torch.Tensor` of shape `(num_images_or_videos, 3)`):
+                The temporal, height and width of feature shape of each image in LLM.
+
+        Returns:
+            `torch.Tensor`: hidden_states.
+        """
+        hidden_states = self.patch_embed(hidden_states)
+        hidden_states = self.post_conv_layernorm(hidden_states)
+
+        rotary_pos_emb, image_type_ids = self.rot_pos_emb(grid_thw)
+        emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1)
+        position_embeddings = (emb.cos(), emb.sin())
+
+        cu_seqlens = torch.repeat_interleave(grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]).cumsum(
+            dim=0,
+            # Select dtype based on the following factors:
+            #  - FA2 requires that cu_seqlens_q must have dtype int32
+            #  - torch.onnx.export requires that cu_seqlens_q must have same dtype as grid_thw
+            # See https://github.com/huggingface/transformers/pull/34852 for more information
+            dtype=grid_thw.dtype if torch.jit.is_tracing() else torch.int32,
+        )
+        cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0)
+        seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist()
+        hidden_states = self.embeddings(hidden_states, seqlens, grid_thw, image_type_ids[:, 0], image_type_ids[:, 1])
+
+        for blk in self.blocks:
+            hidden_states = blk(
+                hidden_states,
+                cu_seqlens=cu_seqlens,
+                position_embeddings=position_embeddings,
+            )
+
+        hidden_states = self.post_layernorm(hidden_states)
+
+        hidden_states = hidden_states.view(
+            -1, self.spatial_merge_size, self.spatial_merge_size, hidden_states.shape[-1]
+        )
+        hidden_states = hidden_states.permute(0, 3, 1, 2)
+        hidden_states = self.downsample(hidden_states).view(-1, self.config.out_hidden_size)
+
+        hidden_states = self.merger(hidden_states)
+        return hidden_states
+
+
+# Text decoder stack of GLM-4V, built on top of the Qwen2.5-VL text model.
+class Glm4vTextModel(Qwen2_5_VLTextModel):
+    def __init__(self, config: Glm4vTextConfig):
+        super().__init__(config)
+        # Install the GLM-4V decoder layers, final norm and rotary embedding (these assignments
+        # override any attributes of the same name set by the parent constructor).
+        self.layers = nn.ModuleList(
+            [Glm4vTextDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+        )
+        self.norm = Glm4vRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.rotary_emb = Glm4vTextRotaryEmbedding(config=config)
+        # Remove two attributes that are presumably set by `Qwen2_5_VLTextModel.__init__` (TODO confirm);
+        # the `forward` of this class does not read either of them.
+        del self._attn_implementation
+        del self.has_sliding_layers
+
+    @auto_docstring
+    @check_model_inputs
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[list[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> Union[tuple, BaseModelOutputWithPast]:
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+        # torch.jit.trace() doesn't support cache objects in the output
+        if use_cache and past_key_values is None and not torch.jit.is_tracing():
+            past_key_values = DynamicCache(config=self.config)
+
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+
+        if cache_position is None:
+            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+            cache_position = torch.arange(
+                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
+            )
+
+        # the hard coded `3` is for temporal, height and width.
+        if position_ids is None:
+            position_ids = cache_position.view(1, 1, -1).expand(3, inputs_embeds.shape[0], -1)
+        elif position_ids.dim() == 2:
+            position_ids = position_ids[None, ...].expand(3, position_ids.shape[0], -1)
+
+        causal_mask = create_causal_mask(
+            config=self.config,
+            input_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            cache_position=cache_position,
+            past_key_values=past_key_values,
+            position_ids=position_ids,
+        )
+
+        hidden_states = inputs_embeds
+
+        # create position embeddings to be shared across the decoder layers
+        position_embeddings = self.rotary_emb(hidden_states, position_ids)
+
+        for decoder_layer in self.layers:
+            layer_outputs = decoder_layer(
+                hidden_states,
+                position_embeddings=position_embeddings,
+                attention_mask=causal_mask,
+                position_ids=position_ids,
+                past_key_values=past_key_values,
+                cache_position=cache_position,
+                **kwargs,
+            )
+            hidden_states = layer_outputs
+
+        hidden_states = self.norm(hidden_states)
+
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values,
+        )
+
+
+# Multimodal GLM-4V backbone (vision tower + language model) without a language-modeling head.
+class Glm4vModel(Qwen2_5_VLModel):
+    # Empty table: replaces whatever checkpoint key mapping the parent class declares.
+    _checkpoint_conversion_mapping = {}
+    _no_split_modules = ["Glm4vTextDecoderLayer", "Glm4vVisionBlock"]
+
+    def __init__(self, config):
+        super().__init__(config)
+        # Install the GLM-4V vision tower (overrides any `visual` attribute set by the parent constructor).
+        self.visual = Glm4vVisionModel._from_config(config.vision_config)
+
+    def get_rope_index(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        image_grid_thw: Optional[torch.LongTensor] = None,
+        video_grid_thw: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """
+        Calculate the 3D rope index based on image and video's temporal, height and width in LLM.
+
+        Explanation:
+            Each embedding sequence contains vision embedding and text embedding or just contains text embedding.
+
+            For pure text embedding sequence, the rotary position embedding has no difference with modern LLMs.
+            Examples:
+                input_ids: [T T T T T], here T is for text.
+                temporal position_ids: [0, 1, 2, 3, 4]
+                height position_ids: [0, 1, 2, 3, 4]
+                width position_ids: [0, 1, 2, 3, 4]
+
+            For vision and text embedding sequence, we calculate 3D rotary position embedding for vision part
+            and 1D rotary position embedding for text part. Every run of same-modality tokens starts at the
+            largest position id used so far plus 1.
+            Examples:
+                One image whose `image_grid_thw` row is (1, 4, 4) with `spatial_merge_size = 2`, i.e. an LLM
+                grid of 1 x 2 x 2 placeholder tokens, surrounded by text:
+                input_ids: [T T V V V V T T T], here V is for vision.
+                temporal position_ids: [0, 1, 2, 2, 2, 2, 4, 5, 6]
+                height position_ids:   [0, 1, 2, 2, 3, 3, 4, 5, 6]
+                width position_ids:    [0, 1, 2, 3, 2, 3, 4, 5, 6]
+                mrope_position_delta:  max position + 1 - sequence length = 6 + 1 - 9 = -2
+
+            Video placeholders are `image_token_id` tokens located between a `video_start_token_id` and a
+            `video_end_token_id` token. Every contiguous run of such tokens is laid out on the height/width
+            grid of the current `video_grid_thw` row; after `video_grid_thw[video_index][0]` runs the next
+            row of `video_grid_thw` is used.
+
+        Args:
+            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
+                it.
+            image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
+                The temporal, height and width of feature shape of each image in LLM.
+            video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
+                The temporal, height and width of feature shape of each video in LLM.
+            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
+
+        Returns:
+            position_ids (`torch.LongTensor` of shape `(3, batch_size, sequence_length)`)
+            mrope_position_deltas (`torch.Tensor` of shape `(batch_size, 1)`)
+        """
+
+        spatial_merge_size = self.config.vision_config.spatial_merge_size
+        image_token_id = self.config.image_token_id
+        video_start_token_id = self.config.video_start_token_id
+        video_end_token_id = self.config.video_end_token_id
+
+        mrope_position_deltas = []
+        if input_ids is not None and (image_grid_thw is not None or video_grid_thw is not None):
+            total_input_ids = input_ids
+            if attention_mask is None:
+                attention_mask = torch.ones_like(total_input_ids)
+            # Masked-out (padding) positions keep this initial value of 1.
+            position_ids = torch.ones(
+                3,
+                input_ids.shape[0],
+                input_ids.shape[1],
+                dtype=input_ids.dtype,
+                device=input_ids.device,
+            )
+            # These counters are NOT reset per batch row: the rows of `image_grid_thw` / `video_grid_thw`
+            # are consumed in order across the whole batch.
+            image_index, video_index = 0, 0
+            video_group_index = 0
+            attention_mask = attention_mask.to(total_input_ids.device)
+            # NOTE: the loop variable deliberately reuses (shadows) the `input_ids` parameter name.
+            for i, input_ids in enumerate(total_input_ids):
+                # Drop the positions that are masked out.
+                input_ids = input_ids[attention_mask[i] == 1]
+                input_tokens = input_ids.tolist()
+
+                # Classify every token as "image", "video" or "text". An image placeholder counts as
+                # "video" while we are between a video start token and a video end token.
+                input_token_type = []
+                video_check_flg = False
+                for token in input_tokens:
+                    if token == video_start_token_id:
+                        video_check_flg = True
+                    elif token == video_end_token_id:
+                        video_check_flg = False
+
+                    if token == image_token_id and not video_check_flg:
+                        input_token_type.append("image")
+                    elif token == image_token_id and video_check_flg:
+                        input_token_type.append("video")
+                    else:
+                        input_token_type.append("text")
+
+                # Collapse consecutive tokens of the same type into (type, start, end) runs, `end` exclusive.
+                input_type_group = []
+                for key, group in itertools.groupby(enumerate(input_token_type), lambda x: x[1]):
+                    group = list(group)
+                    start_index = group[0][0]
+                    end_index = group[-1][0] + 1
+                    input_type_group.append((key, start_index, end_index))
+
+                llm_pos_ids_list = []
+                video_frame_num = 1
+                for modality_type, start_idx, end_idx in input_type_group:
+                    # Each run starts right after the largest position id of the previous run.
+                    st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
+
+                    if modality_type == "image":
+                        t, h, w = (
+                            image_grid_thw[image_index][0],
+                            image_grid_thw[image_index][1],
+                            image_grid_thw[image_index][2],
+                        )
+                        # Height and width shrink by the spatial merge factor; the temporal size does not.
+                        llm_grid_t, llm_grid_h, llm_grid_w = (
+                            t.item(),
+                            h.item() // spatial_merge_size,
+                            w.item() // spatial_merge_size,
+                        )
+
+                        t_index = torch.arange(llm_grid_t).view(-1, 1).expand(-1, llm_grid_h * llm_grid_w).flatten()
+                        h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand(llm_grid_t, -1, llm_grid_w).flatten()
+                        w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand(llm_grid_t, llm_grid_h, -1).flatten()
+                        llm_pos_ids_list.append(torch.stack([t_index, h_index, w_index]) + st_idx)
+
+                        image_index += 1
+                        video_frame_num = 1
+
+                    elif modality_type == "video":
+                        # The temporal extent of this run is `video_frame_num`, not the `t` entry of
+                        # `video_grid_thw`; only height and width are read from the grid row.
+                        # NOTE(review): `groupby` never yields two adjacent "video" runs and every
+                        # "text"/"image" run resets `video_frame_num` to 1, so it looks like this is
+                        # always 1 here -- confirm.
+                        t, h, w = (
+                            video_frame_num,
+                            video_grid_thw[video_index][1],
+                            video_grid_thw[video_index][2],
+                        )
+
+                        llm_grid_t, llm_grid_h, llm_grid_w = (
+                            t,
+                            h.item() // spatial_merge_size,
+                            w.item() // spatial_merge_size,
+                        )
+
+                        for t_idx in range(llm_grid_t):
+                            t_index = torch.tensor(t_idx).view(-1, 1).expand(-1, llm_grid_h * llm_grid_w).flatten()
+
+                            h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand(1, -1, llm_grid_w).flatten()
+                            w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand(1, llm_grid_h, -1).flatten()
+                            llm_pos_ids_list.append(torch.stack([t_index, h_index, w_index]) + st_idx)
+
+                        video_group_index += 1
+
+                        # Move on to the next video once as many runs as its temporal size were consumed.
+                        if video_group_index >= video_grid_thw[video_index][0]:
+                            video_index += 1
+                            video_group_index = 0
+
+                        video_frame_num += 1
+
+                    else:
+                        # Text: the same consecutive position on all three axes.
+                        text_len = end_idx - start_idx
+                        llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx)
+
+                        video_frame_num = 1
+
+                llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
+                # Write the computed positions back into the unmasked slots of row `i`.
+                position_ids[..., i, attention_mask[i] == 1] = llm_positions.to(position_ids.device)
+                # Offset between the next free rope position and the (padded) row length.
+                mrope_position_deltas.append(llm_positions.max() + 1 - len(total_input_ids[i]))
+            mrope_position_deltas = torch.tensor(mrope_position_deltas, device=input_ids.device).unsqueeze(1)
+            return position_ids, mrope_position_deltas
+        else:
+            # No token ids or no vision grids: plain 1D positions replicated on the three axes.
+            if attention_mask is not None:
+                position_ids = attention_mask.long().cumsum(-1) - 1
+                position_ids.masked_fill_(attention_mask == 0, 1)
+                position_ids = position_ids.unsqueeze(0).expand(3, -1, -1).to(attention_mask.device)
+                max_position_ids = position_ids.max(0, keepdim=False)[0].max(-1, keepdim=True)[0]
+                mrope_position_deltas = max_position_ids + 1 - attention_mask.shape[-1]
+            else:
+                # NOTE(review): this branch reads `input_ids.shape`, so it requires `input_ids` to be
+                # provided when no attention mask is given.
+                position_ids = (
+                    torch.arange(input_ids.shape[1], device=input_ids.device)
+                    .view(1, 1, -1)
+                    .expand(3, input_ids.shape[0], -1)
+                )
+                mrope_position_deltas = torch.zeros(
+                    [input_ids.shape[0], 1],
+                    device=input_ids.device,
+                    dtype=input_ids.dtype,
+                )
+
+            return position_ids, mrope_position_deltas
+
+    def get_video_features(
+        self, pixel_values_videos: torch.FloatTensor, video_grid_thw: Optional[torch.LongTensor] = None
+    ):
+        """
+        Encodes videos into continuous embeddings that can be forwarded to the language model.
+
+        Args:
+            pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
+                The tensors corresponding to the input videos.
+            video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
+                The temporal, height and width of feature shape of each video in LLM.
+        """
+        pixel_values_videos = pixel_values_videos.type(self.visual.dtype)
+        # reshape video_grid_thw -> [b, 3] -> [1, h, w] * frames
+        temp_frames_hw = []
+        for t, h, w in video_grid_thw:
+            repeated_row = torch.tensor([1, h.item(), w.item()]).unsqueeze(0).repeat(t, 1)
+            temp_frames_hw.append(repeated_row)
+        flattened_video_grid_thw = torch.cat(temp_frames_hw, dim=0)
+        video_embeds = self.visual(pixel_values_videos, grid_thw=flattened_video_grid_thw)
+        split_sizes = (video_grid_thw.prod(-1) // self.visual.spatial_merge_size**2).tolist()
+        video_embeds = torch.split(video_embeds, split_sizes)
+        return video_embeds
+
+    def get_placeholder_mask(
+        self,
+        input_ids: torch.LongTensor,
+        inputs_embeds: torch.FloatTensor,
+        image_features: torch.FloatTensor = None,
+        video_features: torch.FloatTensor = None,
+    ):
+        """
+        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
+        equal to the length of multimodal features. If the lengths are different, an error is raised.
+        """
+        if input_ids is None:
+            special_image_mask = inputs_embeds == self.get_input_embeddings()(
+                torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device)
+            )
+            special_image_mask = special_image_mask.all(-1)
+            special_video_mask = inputs_embeds == self.get_input_embeddings()(
+                torch.tensor(self.config.video_token_id, dtype=torch.long, device=inputs_embeds.device)
+            )
+            special_video_mask = special_video_mask.all(-1)
+        else:
+            # GLM-4.1V and GLM-4.5V special_video_mask is special_image_mask
+            special_image_mask = input_ids == self.config.image_token_id
+            special_video_mask = input_ids == self.config.image_token_id
+
+        n_image_tokens = special_image_mask.sum()
+        special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
+        if image_features is not None and inputs_embeds[special_image_mask].numel() != image_features.numel():
+            raise ValueError(
+                f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {image_features.shape[0]}"
+            )
+
+        n_video_tokens = special_video_mask.sum()
+        special_video_mask = special_video_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
+        if video_features is not None and inputs_embeds[special_video_mask].numel() != video_features.numel():
+            raise ValueError(
+                f"Videos features and video tokens do not match: tokens: {n_video_tokens}, features {video_features.shape[0]}"
+            )
+
+        return special_image_mask, special_video_mask
+
+    @auto_docstring
+    @can_return_tuple
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[list[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        pixel_values: Optional[torch.Tensor] = None,
+        pixel_values_videos: Optional[torch.FloatTensor] = None,
+        image_grid_thw: Optional[torch.LongTensor] = None,
+        video_grid_thw: Optional[torch.LongTensor] = None,
+        rope_deltas: Optional[torch.LongTensor] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> Union[tuple, Glm4vModelOutputWithPast]:
+        r"""
+        image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
+            The temporal, height and width of feature shape of each image in LLM.
+        video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
+            The temporal, height and width of feature shape of each video in LLM.
+        rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*):
+            The rope index difference between sequence length and multimodal rope.
+        """
+        # NOTE: the value passed as `rope_deltas` is never read in this method; the deltas cached on
+        # `self.rope_deltas` are used instead.
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+        if inputs_embeds is None:
+            inputs_embeds = self.get_input_embeddings()(input_ids)
+
+        # Overwrite the image placeholder embeddings with the encoded image features.
+        if pixel_values is not None:
+            image_embeds = self.get_image_features(pixel_values, image_grid_thw)
+            image_embeds = torch.cat(image_embeds, dim=0).to(inputs_embeds.device, inputs_embeds.dtype)
+            image_mask, _ = self.get_placeholder_mask(input_ids, inputs_embeds, image_features=image_embeds)
+            inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_embeds)
+
+        # Same for the video placeholders.
+        if pixel_values_videos is not None:
+            video_embeds = self.get_video_features(pixel_values_videos, video_grid_thw)
+            video_embeds = torch.cat(video_embeds, dim=0).to(inputs_embeds.device, inputs_embeds.dtype)
+            _, video_mask = self.get_placeholder_mask(input_ids, inputs_embeds, video_features=video_embeds)
+            inputs_embeds = inputs_embeds.masked_scatter(video_mask, video_embeds)
+
+        if position_ids is None:
+            # The mask may arrive as a dict of masks; only the "full_attention" entry is used here.
+            attention_mask_tensor = (
+                attention_mask if not isinstance(attention_mask, dict) else attention_mask["full_attention"]
+            )
+            # Reduce a 4D mask to a 2D (batch, seq) mask by taking the diagonal of its first head.
+            if attention_mask_tensor is not None and attention_mask_tensor.ndim == 4:
+                attention_mask_tensor = torch.diagonal(attention_mask_tensor[:, 0], dim1=1, dim2=2)
+                # Only apply conversion for floating point tensors (inverted masks)
+                if attention_mask_tensor.dtype.is_floating_point:
+                    attention_mask_tensor = attention_mask_tensor / torch.finfo(attention_mask_tensor.dtype).min
+                    attention_mask_tensor = (1.0 - attention_mask_tensor).int()
+
+            # Calculate RoPE index once per generation in the pre-fill stage only.
+            # When compiling, we can't check tensor values thus we check only input length
+            # It is safe to assume that `length!=1` means we're in pre-fill because compiled
+            # models currently cannot do assisted decoding
+            prefill_compiled_stage = is_torchdynamo_compiling() and (
+                (input_ids is not None and input_ids.shape[1] != 1)
+                or (inputs_embeds is not None and inputs_embeds.shape[1] != 1)
+            )
+            prefill_noncompiled_stage = not is_torchdynamo_compiling() and (
+                (cache_position is not None and cache_position[0] == 0)
+                or (past_key_values is None or past_key_values.get_seq_length() == 0)
+            )
+            # `self.rope_deltas` is presumably initialised by the parent constructor (TODO confirm).
+            if (prefill_compiled_stage or prefill_noncompiled_stage) or self.rope_deltas is None:
+                position_ids, rope_deltas = self.get_rope_index(
+                    input_ids,
+                    image_grid_thw,
+                    video_grid_thw,
+                    attention_mask=attention_mask_tensor,
+                )
+                # Cache the deltas so that later decoding steps can derive their positions from them.
+                self.rope_deltas = rope_deltas
+            # then use the prev pre-calculated rope-deltas to get the correct position ids
+            else:
+                batch_size, seq_length, _ = inputs_embeds.shape
+                delta = (
+                    (cache_position[0] + self.rope_deltas).to(inputs_embeds.device)
+                    if cache_position is not None
+                    else 0
+                )
+                position_ids = torch.arange(seq_length, device=inputs_embeds.device)
+                position_ids = position_ids.view(1, -1).expand(batch_size, -1)
+                if cache_position is not None:  # otherwise `deltas` is an int `0`
+                    # Repeat the cached deltas when the current batch is a multiple of the cached one.
+                    delta = delta.repeat_interleave(batch_size // delta.shape[0], dim=0)
+                position_ids = position_ids.add(delta)
+                # The same position is used for the temporal, height and width axes.
+                position_ids = position_ids.unsqueeze(0).expand(3, -1, -1)
+
+        # The language model always receives embeddings, never token ids.
+        outputs = self.language_model(
+            input_ids=None,
+            position_ids=position_ids,
+            attention_mask=attention_mask,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            cache_position=cache_position,
+            **kwargs,
+        )
+
+        return Glm4vModelOutputWithPast(
+            last_hidden_state=outputs.last_hidden_state,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            rope_deltas=self.rope_deltas,
+        )
+
+
+# Output container named in the return annotation of `Glm4vForConditionalGeneration.forward`; identical
+# to `Qwen2_5_VLCausalLMOutputWithPast`.
+class Glm4vCausalLMOutputWithPast(Qwen2_5_VLCausalLMOutputWithPast):
+    pass
+
+
+class Glm4vForConditionalGeneration(Qwen2_5_VLForConditionalGeneration):
+    _checkpoint_conversion_mapping = {}
+
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[list[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        pixel_values: Optional[torch.Tensor] = None,
+        pixel_values_videos: Optional[torch.FloatTensor] = None,
+        image_grid_thw: Optional[torch.LongTensor] = None,
+        video_grid_thw: Optional[torch.LongTensor] = None,
+        rope_deltas: Optional[torch.LongTensor] = None,  # NOTE(review): not forwarded to `self.model` below
+        cache_position: Optional[torch.LongTensor] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> Union[tuple, Glm4vCausalLMOutputWithPast]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+        image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
+            The temporal, height and width of feature shape of each image in LLM.
+        video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
+            The temporal, height and width of feature shape of each video in LLM.
+        rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*):
+            The rope index difference between sequence length and multimodal rope.
+
+        Example:
+
+        ```python
+        >>> from PIL import Image
+        >>> import requests
+        >>> from transformers import AutoProcessor, Glm4vForConditionalGeneration
+
+        >>> model = Glm4vForConditionalGeneration.from_pretrained("THUDM/GLM-4.1V-9B-Thinking")
+        >>> processor = AutoProcessor.from_pretrained("THUDM/GLM-4.1V-9B-Thinking")
+
+        >>> messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image"},
+                    {"type": "text", "text": "What is shown in this image?"},
+                ],
+            },
+        ]
+        >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+        >>> inputs = processor(text=[text], images=[image], return_tensors="pt")
+
+        >>> # Generate
+        >>> generate_ids = model.generate(**inputs, max_new_tokens=30)
+        >>> processor.tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "The image shows a street scene with a red stop sign in the foreground. In the background, there is a large red gate with Chinese characters ..."
+        ```"""
+        outputs = self.model(
+            input_ids=input_ids,
+            pixel_values=pixel_values,
+            pixel_values_videos=pixel_values_videos,
+            image_grid_thw=image_grid_thw,
+            video_grid_thw=video_grid_thw,
+            position_ids=position_ids,
+            attention_mask=attention_mask,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            cache_position=cache_position,
+            **kwargs,
+        )
+
+        hidden_states = outputs[0]  # first output field; read back as `last_hidden_state` by the base model
+
+        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
+        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep  # 0 -> all
+        logits = self.lm_head(hidden_states[:, slice_indices, :])
+
+        loss = None
+        if labels is not None:  # the loss is only computed when labels are supplied
+            loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size)
+
+        return Glm4vCausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            rope_deltas=outputs.rope_deltas,
+        )
+
+    def prepare_inputs_for_generation(
+        self,
+        input_ids,
+        past_key_values=None,
+        attention_mask=None,
+        inputs_embeds=None,
+        cache_position=None,
+        position_ids=None,
+        use_cache=True,
+        pixel_values=None,
+        pixel_values_videos=None,
+        image_grid_thw=None,
+        video_grid_thw=None,
+        **kwargs,
+    ):
+        # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
+        # Builds the parent's model inputs, then clears `position_ids` and, after the first step, the pixel inputs.
+        model_inputs = super().prepare_inputs_for_generation(
+            input_ids,
+            past_key_values=past_key_values,
+            attention_mask=attention_mask,
+            inputs_embeds=inputs_embeds,
+            cache_position=cache_position,
+            position_ids=position_ids,
+            pixel_values=pixel_values,
+            pixel_values_videos=pixel_values_videos,
+            image_grid_thw=image_grid_thw,
+            video_grid_thw=video_grid_thw,
+            use_cache=use_cache,
+            **kwargs,
+        )
+
+        # GLM-4.1V position_ids are prepared with rope_deltas in forward
+        model_inputs["position_ids"] = None
+
+        if cache_position[0] != 0:  # NOTE(review): presumably a decoding step after prefill; requires a tensor
+            model_inputs["pixel_values"] = None
+            model_inputs["pixel_values_videos"] = None
+
+        return model_inputs
+
+    def _get_image_nums_and_video_nums(
+        self,
+        input_ids: Optional[torch.LongTensor],
+        inputs_embeds: Optional[torch.Tensor] = None,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """
+        Get the number of standalone images and of videos for each sample (image starts inside a video span are not counted as images).
+        These parameters are not passed through the processor to avoid unpredictable impacts from interface modifications.
+        Args:
+            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+                Indices of input sequence tokens in the vocabulary. Only read when `inputs_embeds` is `None`.
+            inputs_embeds (`torch.Tensor`, *optional*):
+                When given, marker tokens are located by comparing against the embeddings of the marker token ids.
+        Returns:
+            image_nums (`torch.LongTensor` of shape `(batch_size,)`): standalone image-start tokens per sample.
+            video_nums (`torch.LongTensor` of shape `(batch_size,)`): video-start tokens per sample.
+        """
+
+        if inputs_embeds is not None:
+            is_image = (
+                inputs_embeds
+                == self.get_input_embeddings()(
+                    torch.tensor(self.config.image_start_token_id, dtype=torch.long, device=inputs_embeds.device)
+                )
+            )[..., 0]  # NOTE(review): only index 0 of the last (embedding) dimension is compared
+            is_video_start = (
+                inputs_embeds
+                == self.get_input_embeddings()(
+                    torch.tensor(self.config.video_start_token_id, dtype=torch.long, device=inputs_embeds.device)
+                )
+            )[..., 0]
+            is_video_end = (
+                inputs_embeds
+                == self.get_input_embeddings()(
+                    torch.tensor(self.config.video_end_token_id, dtype=torch.long, device=inputs_embeds.device)
+                )
+            )[..., 0]
+        else:
+            is_image = input_ids == self.config.image_start_token_id
+            is_video_start = input_ids == self.config.video_start_token_id
+            is_video_end = input_ids == self.config.video_end_token_id
+
+        # Cumulative sum to track if we're inside a video span
+        # We'll assume well-formed video tags (i.e. matching starts and ends)
+        video_level = torch.cumsum(is_video_start.int() - is_video_end.int(), dim=1)
+        inside_video = video_level > 0  # shape (batch_size, seq_length)
+
+        # Mask out image tokens that are inside video spans
+        standalone_images = is_image & (~inside_video)
+
+        # Count per batch
+        image_counts = standalone_images.sum(dim=1)
+        video_counts = is_video_start.sum(dim=1)
+
+        return image_counts, video_counts
+
+
+class Glm4vVideosProcessorKwargs(Qwen2_5_VLVideosProcessorKwargs):
+    pass
+
+
+class Glm4vImagesKwargs(ImagesKwargs):
+    patch_size: Optional[int]
+    temporal_patch_size: Optional[int]
+    merge_size: Optional[int]
+
+
+class Glm4vProcessorKwargs(Qwen2_VLProcessorKwargs):
+    images_kwargs: Glm4vImagesKwargs
+    videos_kwargs: Glm4vVideosProcessorKwargs
+    _defaults = {
+        "text_kwargs": {
+            "padding": False,
+            "return_mm_token_type_ids": False,
+        },
+        "videos_kwargs": {"return_metadata": True},
+    }
+
+
+class Glm4vProcessor(Qwen2_VLProcessor):
+    r"""
+    Constructs a GLM-4V processor which wraps a GLM-4V image processor and a GLM-4 tokenizer into a single processor.
+    [`~Glm4vProcessor.__call__`] and [`~Glm4vProcessor.decode`] for more information.
+    Args:
+        image_processor ([`Glm4vProcessor`], *optional*):
+            The image processor is a required input.
+        tokenizer ([`PreTrainedTokenizerFast`], *optional*):
+            The tokenizer is a required input.
+        video_processor ([`Glm4vVideoProcessor`], *optional*):
+            The video processor is a required input.
+        chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
+            in a chat into a tokenizable string.
+    """
+
+    tokenizer_class = ("PreTrainedTokenizer", "PreTrainedTokenizerFast")
+
+    def __init__(self, image_processor=None, tokenizer=None, video_processor=None, chat_template=None, **kwargs):
+        super().__init__(image_processor, tokenizer, video_processor, chat_template=chat_template)
+        self.image_token = "<|image|>" if not hasattr(tokenizer, "image_token") else tokenizer.image_token
+        self.video_token = "<|video|>" if not hasattr(tokenizer, "video_token") else tokenizer.video_token
+
+    def __call__(
+        self,
+        images: ImageInput = None,
+        text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
+        videos: VideoInput = None,
+        **kwargs: Unpack[Glm4vProcessorKwargs],
+    ) -> BatchFeature:
+        """
+        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
+        and `kwargs` arguments to PreTrainedTokenizerFast's [`~PreTrainedTokenizerFast.__call__`] if `text` is not `None` to encode
+        the text.
+
+        Args:
+            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
+                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
+                tensor. Both channels-first and channels-last formats are supported.
+            text (`str`, `List[str]`, `List[List[str]]`):
+                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
+                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
+                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
+            videos (`np.ndarray`, `torch.Tensor`, `List[np.ndarray]`, `List[torch.Tensor]`):
+                The video or batch of videos to be prepared. Each video can be a 4D NumPy array or PyTorch
+                tensor, or a nested list of 3D frames. Both channels-first and channels-last formats are supported.
+            return_tensors (`str` or [`~utils.TensorType`], *optional*):
+                If set, will return tensors of a particular framework. Acceptable values are:
+                - `'tf'`: Return TensorFlow `tf.constant` objects.
+                - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                - `'np'`: Return NumPy `np.ndarray` objects.
+                - `'jax'`: Return JAX `jnp.ndarray` objects.
+
+        Returns:
+            [`BatchFeature`]: A [`BatchFeature`] with the following fields:
+
+            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
+            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
+              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
+              `None`).
+            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
+            - **pixel_values_videos** -- Pixel values of videos to be fed to a model. Returned when `videos` is not `None`.
+            - **image_grid_thw** -- List of image 3D grid in LLM. Returned when `images` is not `None`.
+            - **video_grid_thw** -- List of video 3D grid in LLM. Returned when `videos` is not `None`.
+        """
+        output_kwargs = self._merge_kwargs(
+            Glm4vProcessorKwargs,
+            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
+            **kwargs,
+        )
+        if images is not None:
+            image_inputs = self.image_processor(images=images, **output_kwargs["images_kwargs"])
+            image_grid_thw = image_inputs["image_grid_thw"]
+        else:
+            image_inputs = {}
+            image_grid_thw = None
+
+        if videos is not None:
+            videos_inputs = self.video_processor(videos=videos, **output_kwargs["videos_kwargs"])
+            # If user has not requested video metadata, pop it
+            if "return_metadata" not in kwargs:
+                video_metadata = videos_inputs.pop("video_metadata")
+            else:
+                video_metadata = videos_inputs["video_metadata"]
+            video_grid_thw = videos_inputs["video_grid_thw"]
+        else:
+            videos_inputs = {}
+            video_grid_thw = None
+
+        if not isinstance(text, list):
+            text = [text]
+
+        text = text.copy()  # below lines change text in-place
+        if image_grid_thw is not None:
+            merge_length = self.image_processor.merge_size**2
+            index = 0
+            for i in range(len(text)):
+                while self.image_token in text[i]:
+                    num_image_tokens = image_grid_thw[index].prod() // merge_length
+                    text[i] = text[i].replace(self.image_token, "<|placeholder|>" * num_image_tokens, 1)
+                    index += 1
+                text[i] = text[i].replace("<|placeholder|>", self.image_token)
+
+        if video_grid_thw is not None:
+            merge_length = self.video_processor.merge_size**2
+            video_index = 0
+            for i in range(len(text)):
+                while self.video_token in text[i]:
+                    num_frames = video_grid_thw[video_index][0]
+                    video_structure = ""
+
+                    metadata = video_metadata[video_index]  # indexed per video, like `video_grid_thw`, not per text
+                    if metadata.fps is None:
+                        logger.warning_once(
+                            "GLM-4V requires frame timestamps to construct prompts, but the `fps` of the input video could not be inferred. "
+                            "Probably `video_metadata` was missing from inputs and you passed pre-sampled frames. "
+                            "Defaulting to `fps=24`. Please provide `video_metadata` for more accurate results."
+                        )
+                    metadata.fps = 24 if metadata.fps is None else metadata.fps
+                    timestamps = metadata.timestamps[::2]  # mrope
+
+                    # Use exactly `num_frames` timestamps: truncate the list, or pad it below by repeating
+                    # the last timestamp (0 when there is none).
+                    unique_timestamps = list(timestamps)
+
+                    selected_timestamps = unique_timestamps[:num_frames]
+                    while len(selected_timestamps) < num_frames:
+                        selected_timestamps.append(selected_timestamps[-1] if selected_timestamps else 0)
+
+                    for frame_idx in range(num_frames):
+                        timestamp_sec = selected_timestamps[frame_idx]
+                        frame_structure = f"<|begin_of_image|>{self.image_token}<|end_of_image|>{int(timestamp_sec)}"
+                        video_structure += frame_structure
+
+                    text[i] = text[i].replace(self.video_token, video_structure, 1)
+                    num_image_tokens = (
+                        video_grid_thw[video_index].prod() // merge_length // video_grid_thw[video_index][0]
+                    )
+                    for frame_idx in range(num_frames):
+                        if self.image_token in text[i]:
+                            text[i] = text[i].replace(self.image_token, "<|placeholder|>" * num_image_tokens, 1)
+
+                    video_index += 1
+
+                text[i] = text[i].replace("<|placeholder|>", self.image_token)
+        return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
+        return_mm_token_type_ids = output_kwargs["text_kwargs"].pop("return_mm_token_type_ids", False)
+        text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
+        self._check_special_mm_tokens(text, text_inputs, modalities=["image", "video"])
+
+        if return_mm_token_type_ids:
+            array_ids = np.array(text_inputs["input_ids"])
+            mm_token_type_ids = np.zeros_like(text_inputs["input_ids"])
+            mm_token_type_ids[array_ids == self.image_token_id] = 1
+            text_inputs["mm_token_type_ids"] = mm_token_type_ids.tolist()
+        return BatchFeature(data={**text_inputs, **image_inputs, **videos_inputs}, tensor_type=return_tensors)
+
+
+__all__ = [
+    "Glm4vConfig",
+    "Glm4vTextConfig",
+    "Glm4vForConditionalGeneration",
+    "Glm4vModel",
+    "Glm4vPreTrainedModel",
+    "Glm4vProcessor",
+    "Glm4vTextModel",
+]
diff --git a/phivenv/Lib/site-packages/transformers/models/glm4v/processing_glm4v.py b/phivenv/Lib/site-packages/transformers/models/glm4v/processing_glm4v.py
new file mode 100644
index 0000000000000000000000000000000000000000..3cec2c8972587cc41dee3cd947e59f2613497900
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/glm4v/processing_glm4v.py
@@ -0,0 +1,294 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/glm4v/modular_glm4v.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_glm4v.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# coding=utf-8
+# Copyright 2025 The ZhipuAI Inc. team and HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Optional, Union
+
+import numpy as np
+
+from ...feature_extraction_utils import BatchFeature
+from ...image_utils import ImageInput
+from ...processing_utils import ImagesKwargs, MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack, VideosKwargs
+from ...tokenization_utils_base import PreTokenizedInput, TextInput
+from ...utils import logging
+from ...video_utils import VideoInput
+
+
+logger = logging.get_logger(__name__)
+
+
+class Glm4vVideosProcessorKwargs(VideosKwargs, total=False):
+    fps: Union[list[float], float]
+
+
+class Glm4vImagesKwargs(ImagesKwargs):
+    patch_size: Optional[int]
+    temporal_patch_size: Optional[int]
+    merge_size: Optional[int]
+
+
+class Glm4vProcessorKwargs(ProcessingKwargs, total=False):
+    images_kwargs: Glm4vImagesKwargs
+    _defaults = {
+        "text_kwargs": {
+            "padding": False,
+            "return_mm_token_type_ids": False,
+        },
+        "videos_kwargs": {"return_metadata": True},
+    }
+    videos_kwargs: Glm4vVideosProcessorKwargs
+
+
+class Glm4vProcessor(ProcessorMixin):
+    r"""
+    Constructs a GLM-4V processor which wraps a GLM-4V image processor and a GLM-4 tokenizer into a single processor.
+    [`~Glm4vProcessor.__call__`] and [`~Glm4vProcessor.decode`] for more information.
+    Args:
+        image_processor ([`Glm4vProcessor`], *optional*):
+            The image processor is a required input.
+        tokenizer ([`PreTrainedTokenizerFast`], *optional*):
+            The tokenizer is a required input.
+        video_processor ([`Glm4vVideoProcessor`], *optional*):
+            The video processor is a required input.
+        chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
+            in a chat into a tokenizable string.
+    """
+
+    attributes = ["image_processor", "tokenizer", "video_processor"]
+    image_processor_class = "AutoImageProcessor"
+    video_processor_class = "AutoVideoProcessor"
+
+    tokenizer_class = ("PreTrainedTokenizer", "PreTrainedTokenizerFast")
+
+    def __init__(self, image_processor=None, tokenizer=None, video_processor=None, chat_template=None, **kwargs):
+        super().__init__(image_processor, tokenizer, video_processor, chat_template=chat_template)  # `kwargs` unused
+        self.image_token = "<|image|>" if not hasattr(tokenizer, "image_token") else tokenizer.image_token
+        self.video_token = "<|video|>" if not hasattr(tokenizer, "video_token") else tokenizer.video_token
+        self.image_token_id = (
+            tokenizer.image_token_id
+            if getattr(tokenizer, "image_token_id", None)  # truthiness test: a missing/None/0 id takes the fallback
+            else tokenizer.convert_tokens_to_ids(self.image_token)
+        )
+        self.video_token_id = (
+            tokenizer.video_token_id
+            if getattr(tokenizer, "video_token_id", None)  # same fallback rule as for the image token id
+            else tokenizer.convert_tokens_to_ids(self.video_token)
+        )
+
+    def __call__(
+        self,
+        images: ImageInput = None,
+        text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
+        videos: VideoInput = None,
+        **kwargs: Unpack[Glm4vProcessorKwargs],
+    ) -> BatchFeature:
+        """
+        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
+        and `kwargs` arguments to PreTrainedTokenizerFast's [`~PreTrainedTokenizerFast.__call__`] if `text` is not `None` to encode
+        the text.
+
+        Args:
+            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
+                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
+                tensor. Both channels-first and channels-last formats are supported.
+            text (`str`, `List[str]`, `List[List[str]]`):
+                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
+                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
+                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
+            videos (`np.ndarray`, `torch.Tensor`, `List[np.ndarray]`, `List[torch.Tensor]`):
+                The video or batch of videos to be prepared. Each video can be a 4D NumPy array or PyTorch
+                tensor, or a nested list of 3D frames. Both channels-first and channels-last formats are supported.
+            return_tensors (`str` or [`~utils.TensorType`], *optional*):
+                If set, will return tensors of a particular framework. Acceptable values are:
+                - `'tf'`: Return TensorFlow `tf.constant` objects.
+                - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                - `'np'`: Return NumPy `np.ndarray` objects.
+                - `'jax'`: Return JAX `jnp.ndarray` objects.
+
+        Returns:
+            [`BatchFeature`]: A [`BatchFeature`] with the following fields:
+
+            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
+            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
+              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
+              `None`).
+            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
+            - **pixel_values_videos** -- Pixel values of videos to be fed to a model. Returned when `videos` is not `None`.
+            - **image_grid_thw** -- List of image 3D grid in LLM. Returned when `images` is not `None`.
+            - **video_grid_thw** -- List of video 3D grid in LLM. Returned when `videos` is not `None`.
+        """
+        output_kwargs = self._merge_kwargs(
+            Glm4vProcessorKwargs,
+            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
+            **kwargs,
+        )
+        if images is not None:
+            image_inputs = self.image_processor(images=images, **output_kwargs["images_kwargs"])
+            image_grid_thw = image_inputs["image_grid_thw"]
+        else:
+            image_inputs = {}
+            image_grid_thw = None
+
+        if videos is not None:
+            videos_inputs = self.video_processor(videos=videos, **output_kwargs["videos_kwargs"])
+            # If user has not requested video metadata, pop it
+            if "return_metadata" not in kwargs:
+                video_metadata = videos_inputs.pop("video_metadata")
+            else:
+                video_metadata = videos_inputs["video_metadata"]
+            video_grid_thw = videos_inputs["video_grid_thw"]
+        else:
+            videos_inputs = {}
+            video_grid_thw = None
+
+        if not isinstance(text, list):
+            text = [text]
+
+        text = text.copy()  # below lines change text in-place
+        if image_grid_thw is not None:
+            merge_length = self.image_processor.merge_size**2
+            index = 0
+            for i in range(len(text)):
+                while self.image_token in text[i]:
+                    num_image_tokens = image_grid_thw[index].prod() // merge_length
+                    text[i] = text[i].replace(self.image_token, "<|placeholder|>" * num_image_tokens, 1)
+                    index += 1
+                text[i] = text[i].replace("<|placeholder|>", self.image_token)
+
+        if video_grid_thw is not None:
+            merge_length = self.video_processor.merge_size**2
+            video_index = 0
+            for i in range(len(text)):
+                while self.video_token in text[i]:
+                    num_frames = video_grid_thw[video_index][0]
+                    video_structure = ""
+
+                    metadata = video_metadata[video_index]  # indexed per video, like `video_grid_thw`, not per text
+                    if metadata.fps is None:
+                        logger.warning_once(
+                            "GLM-4V requires frame timestamps to construct prompts, but the `fps` of the input video could not be inferred. "
+                            "Probably `video_metadata` was missing from inputs and you passed pre-sampled frames. "
+                            "Defaulting to `fps=24`. Please provide `video_metadata` for more accurate results."
+                        )
+                    metadata.fps = 24 if metadata.fps is None else metadata.fps
+                    timestamps = metadata.timestamps[::2]  # mrope
+
+                    # Use exactly `num_frames` timestamps: truncate the list, or pad it below by repeating
+                    # the last timestamp (0 when there is none).
+                    unique_timestamps = list(timestamps)
+
+                    selected_timestamps = unique_timestamps[:num_frames]
+                    while len(selected_timestamps) < num_frames:
+                        selected_timestamps.append(selected_timestamps[-1] if selected_timestamps else 0)
+
+                    for frame_idx in range(num_frames):
+                        timestamp_sec = selected_timestamps[frame_idx]
+                        frame_structure = f"<|begin_of_image|>{self.image_token}<|end_of_image|>{int(timestamp_sec)}"
+                        video_structure += frame_structure
+
+                    text[i] = text[i].replace(self.video_token, video_structure, 1)
+                    num_image_tokens = (
+                        video_grid_thw[video_index].prod() // merge_length // video_grid_thw[video_index][0]
+                    )
+                    for frame_idx in range(num_frames):
+                        if self.image_token in text[i]:
+                            text[i] = text[i].replace(self.image_token, "<|placeholder|>" * num_image_tokens, 1)
+
+                    video_index += 1
+
+                text[i] = text[i].replace("<|placeholder|>", self.image_token)
+        return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
+        return_mm_token_type_ids = output_kwargs["text_kwargs"].pop("return_mm_token_type_ids", False)
+        text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
+        self._check_special_mm_tokens(text, text_inputs, modalities=["image", "video"])
+
+        if return_mm_token_type_ids:
+            array_ids = np.array(text_inputs["input_ids"])
+            mm_token_type_ids = np.zeros_like(text_inputs["input_ids"])
+            mm_token_type_ids[array_ids == self.image_token_id] = 1
+            text_inputs["mm_token_type_ids"] = mm_token_type_ids.tolist()
+        return BatchFeature(data={**text_inputs, **image_inputs, **videos_inputs}, tensor_type=return_tensors)
+
+    def _get_num_multimodal_tokens(self, image_sizes=None, video_sizes=None, **kwargs):
+        """
+        Computes the number of placeholder tokens needed for multimodal inputs with the given sizes.
+        Args:
+            image_sizes (`list[list[int]]`, *optional*):
+                The input sizes formatted as (height, width) per each image.
+            video_sizes (`list[list[int]]`, *optional*):
+                The input sizes formatted as (num_frames, height, width) per each video.
+        Returns:
+            `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided
+            input modalities, along with other useful data.
+        """
+        # Defaults are merged into fresh dicts so `kwargs` never mutate `Glm4vProcessorKwargs._defaults`.
+        vision_data = {}
+        if image_sizes is not None:
+            images_kwargs = {**Glm4vProcessorKwargs._defaults.get("images_kwargs", {}), **kwargs}
+            merge_size = images_kwargs.get("merge_size", None) or self.image_processor.merge_size
+
+            num_image_patches = [
+                self.image_processor.get_number_of_image_patches(*image_size, images_kwargs)
+                for image_size in image_sizes
+            ]
+            num_image_tokens = [(num_patches // merge_size**2) for num_patches in num_image_patches]
+            vision_data.update({"num_image_tokens": num_image_tokens, "num_image_patches": num_image_patches})
+
+        if video_sizes is not None:
+            # Resolved here as well: the image branch may not have run, which would leave `merge_size` unbound.
+            videos_kwargs = {**Glm4vProcessorKwargs._defaults.get("videos_kwargs", {}), **kwargs}
+            merge_size = videos_kwargs.get("merge_size", None) or self.video_processor.merge_size
+            num_video_patches = [
+                self.video_processor.get_number_of_video_patches(*video_size, videos_kwargs)
+                for video_size in video_sizes
+            ]
+            num_video_tokens = [(num_patches // merge_size**2) for num_patches in num_video_patches]
+            vision_data["num_video_tokens"] = num_video_tokens
+
+        return MultiModalData(**vision_data)
+
+    def post_process_image_text_to_text(
+        self, generated_outputs, skip_special_tokens=True, clean_up_tokenization_spaces=False, **kwargs
+    ):
+        """
+        Post-process the output of the model to decode the text.
+
+        Args:
+            generated_outputs (`torch.Tensor` or `np.ndarray`):
+                The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)`
+                or `(sequence_length,)`.
+            skip_special_tokens (`bool`, *optional*, defaults to `True`):
+                Whether or not to remove special tokens in the output. Argument passed to the tokenizer's `batch_decode` method.
+            clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
+                Whether or not to clean up the tokenization spaces. Argument passed to the tokenizer's `batch_decode` method.
+            **kwargs:
+                Additional arguments to be passed to the tokenizer's `batch_decode method`.
+
+        Returns:
+            `list[str]`: The decoded text.
+        """
+        return self.tokenizer.batch_decode(
+            generated_outputs,
+            skip_special_tokens=skip_special_tokens,
+            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+            **kwargs,
+        )
+
+
+__all__ = ["Glm4vProcessor"]
diff --git a/phivenv/Lib/site-packages/transformers/models/glm4v/video_processing_glm4v.py b/phivenv/Lib/site-packages/transformers/models/glm4v/video_processing_glm4v.py
new file mode 100644
index 0000000000000000000000000000000000000000..a327ac200507dda3d086252f651b0b57e3c7c289
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/glm4v/video_processing_glm4v.py
@@ -0,0 +1,271 @@
+# coding=utf-8
+# Copyright 2025 The ZhipuAI Inc. team and HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""video processor class for GLM-4.1V."""
+
+import math
+from typing import Optional, Union
+
+import numpy as np
+
+from ...image_processing_utils import (
+    BatchFeature,
+)
+from ...image_utils import (
+    OPENAI_CLIP_MEAN,
+    OPENAI_CLIP_STD,
+    ChannelDimension,
+    SizeDict,
+    get_image_size,
+)
+from ...processing_utils import Unpack, VideosKwargs
+from ...utils import (
+    TensorType,
+    add_start_docstrings,
+    is_torch_available,
+    is_vision_available,
+)
+from .image_processing_glm4v import smart_resize
+
+
+if is_torch_available():
+    import torch
+
+from ...utils.import_utils import requires
+from ...video_processing_utils import (
+    BASE_VIDEO_PROCESSOR_DOCSTRING,
+    BaseVideoProcessor,
+)
+from ...video_utils import VideoMetadata, group_videos_by_shape, reorder_videos
+
+
+if is_vision_available():
+    from ...image_utils import PILImageResampling
+
+
+class Glm4vVideoProcessorInitKwargs(VideosKwargs):  # keyword arguments declared for `Glm4vVideoProcessor`
+    max_image_size: Optional[dict[str, int]] = None  # `Optional` because the declared default is `None`
+    patch_size: Optional[int] = None
+    temporal_patch_size: Optional[int] = None
+    merge_size: Optional[int] = None
+    image_mean: Optional[list[float]] = None
+    image_std: Optional[list[float]] = None
+
+
+@add_start_docstrings(
+    "Constructs a fast GLM-4V video processor that dynamically resizes videos based on the original videos.",
+    BASE_VIDEO_PROCESSOR_DOCSTRING,
+    """
+        patch_size (`int`, *optional*, defaults to 14):
+            The spatial patch size of the vision encoder.
+        temporal_patch_size (`int`, *optional*, defaults to 2):
+            The temporal patch size of the vision encoder.
+        merge_size (`int`, *optional*, defaults to 2):
+            The merge size of the vision encoder to llm encoder.
+    """,
+)
+@requires(backends=("torchvision",))
+class Glm4vVideoProcessor(BaseVideoProcessor):
+    resample = PILImageResampling.BICUBIC
+    size = {"shortest_edge": 112 * 112, "longest_edge": 28 * 28 * 2 * 30000}  # min/max pixel budgets for smart_resize
+    max_image_size = {"longest_edge": 28 * 28 * 2 * 30000}
+    image_mean = OPENAI_CLIP_MEAN
+    image_std = OPENAI_CLIP_STD
+    do_resize = True
+    do_rescale = True
+    do_normalize = True
+    do_convert_rgb = True
+    do_sample_frames = True
+    patch_size = 14
+    temporal_patch_size = 2
+    max_duration = 300  # above this duration `sample_frames` spreads its samples evenly over the whole video
+    merge_size = 2
+    valid_kwargs = Glm4vVideoProcessorInitKwargs
+    num_frames = 16
+    fps = 2  # sampling rate used by `sample_frames` when the caller passes no `fps`
+
+    model_input_names = ["pixel_values_videos", "video_grid_thw"]
+
+    def __init__(self, **kwargs: Unpack[Glm4vVideoProcessorInitKwargs]):
+        super().__init__(**kwargs)
+        if self.size is not None and (
+            self.size.get("shortest_edge", None) is None or self.size.get("longest_edge", None) is None
+        ):
+            raise ValueError("size must contain 'shortest_edge' and 'longest_edge' keys.")
+
+    def _further_process_kwargs(
+        self,
+        size: Optional[SizeDict] = None,
+        **kwargs,
+    ) -> dict:
+        """
+        Update kwargs that need further processing before being validated
+        Can be overridden by subclasses to customize the processing of kwargs.
+        """
+        if size is not None and ("shortest_edge" not in size or "longest_edge" not in size):
+            raise ValueError("size must contain 'shortest_edge' and 'longest_edge' keys.")
+
+        return super()._further_process_kwargs(size=size, **kwargs)
+
+    def sample_frames(
+        self,
+        metadata: VideoMetadata,
+        fps: Optional[Union[int, float]] = None,
+        **kwargs,
+    ):
+        """
+        Choose which frames of a video to keep, targeting `fps` sampled frames per second.
+
+        Args:
+            metadata (`VideoMetadata`):
+                Metadata of the video containing information about total duration, fps and total number of frames.
+            fps (`int` or `float`, *optional*):
+                Target frames to sample per second. Defaults to `self.fps`.
+        Returns:
+            np.ndarray:
+                Indices to sample video frames.
+        """
+        if metadata is None or getattr(metadata, "fps", None) is None:
+            raise ValueError(
+                "Asked to sample frames per second but no video metadata was provided which is required when sampling in GLM4V. "
+                "Please pass in `VideoMetadata` object or set `do_sample_frames=False`"
+            )
+
+        total_frames = metadata.total_num_frames
+        requested_fps = fps if fps is not None else self.fps
+
+        max_frame_idx = total_frames - 1
+        duration = metadata.duration or round(max_frame_idx / metadata.fps) + 1
+
+        if duration <= self.max_duration:
+            n = int(math.floor(duration * requested_fps))
+            frame_indices = [min(max_frame_idx, int(math.ceil(i * metadata.fps / requested_fps))) for i in range(n)]
+        else:
+            num_samples = int(self.max_duration * requested_fps)
+            if num_samples >= total_frames:
+                frame_indices = list(range(total_frames))
+            else:
+                target_seconds = np.linspace(0, duration, num_samples, endpoint=True)
+                frame_indices = [min(max_frame_idx, int(math.ceil(t * metadata.fps))) for t in target_seconds]
+
+        # Drop repeated indices while keeping first-seen order (dicts preserve insertion order).
+        uniq = list(dict.fromkeys(frame_indices))
+
+        # Keep the number of sampled frames even by repeating the last index.
+        if len(uniq) & 1:
+            uniq.append(uniq[-1])
+
+        return np.array(uniq)
+
+    def _preprocess(
+        self,
+        videos: list[torch.Tensor],
+        do_convert_rgb: bool = True,
+        do_resize: bool = True,
+        size: Optional[SizeDict] = None,
+        interpolation: PILImageResampling = PILImageResampling.BICUBIC,
+        do_rescale: bool = True,
+        rescale_factor: float = 1 / 255.0,
+        do_normalize: bool = True,
+        image_mean: Optional[Union[float, list[float]]] = None,
+        image_std: Optional[Union[float, list[float]]] = None,
+        patch_size: Optional[int] = None,
+        temporal_patch_size: Optional[int] = None,
+        merge_size: Optional[int] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        **kwargs,
+    ):
+        """Resize, rescale/normalize and patchify `videos` into `pixel_values_videos` and `video_grid_thw`."""
+        grouped_videos, grouped_videos_index = group_videos_by_shape(videos)
+        resized_videos_grouped = {}
+
+        for shape, stacked_videos in grouped_videos.items():
+            B, T, C, H, W = stacked_videos.shape
+            if do_resize:
+                resized_height, resized_width = smart_resize(
+                    num_frames=T,
+                    height=H,
+                    width=W,
+                    temporal_factor=temporal_patch_size,
+                    factor=patch_size * merge_size,
+                    min_pixels=size.shortest_edge,
+                    max_pixels=size.longest_edge,
+                )
+                stacked_videos = stacked_videos.view(B * T, C, H, W)
+                stacked_videos = self.resize(
+                    stacked_videos,
+                    size=SizeDict(height=resized_height, width=resized_width),
+                    interpolation=interpolation,
+                )
+                stacked_videos = stacked_videos.view(B, T, C, resized_height, resized_width)
+            resized_videos_grouped[shape] = stacked_videos
+        resized_videos = reorder_videos(resized_videos_grouped, grouped_videos_index)
+
+        # Group videos by size for further processing
+        # Needed in case do_resize is False, or resize returns videos with different sizes
+        grouped_videos, grouped_videos_index = group_videos_by_shape(resized_videos)
+        processed_videos_grouped = {}
+        processed_grids = {}
+        for shape, stacked_videos in grouped_videos.items():
+            resized_height, resized_width = get_image_size(stacked_videos[0], channel_dim=ChannelDimension.FIRST)
+
+            # Fused rescale and normalize
+            patches = self.rescale_and_normalize(
+                stacked_videos, do_rescale, rescale_factor, do_normalize, image_mean, image_std
+            )
+
+            # Pad with last-frame copies so `num_frames` becomes a multiple of `temporal_patch_size` (any size, not just 2)
+            if patches.shape[1] % temporal_patch_size != 0:
+                pad_frames = temporal_patch_size - patches.shape[1] % temporal_patch_size
+                repeats = patches[:, -1:].repeat(1, pad_frames, 1, 1, 1)
+                patches = torch.cat([patches, repeats], dim=1)
+            batch_size, grid_t, channel = patches.shape[:3]
+            grid_t = grid_t // temporal_patch_size
+            grid_h, grid_w = resized_height // patch_size, resized_width // patch_size
+
+            patches = patches.view(
+                batch_size,
+                grid_t,
+                temporal_patch_size,
+                channel,
+                grid_h // merge_size,
+                merge_size,
+                patch_size,
+                grid_w // merge_size,
+                merge_size,
+                patch_size,
+            )
+            patches = patches.permute(0, 1, 4, 7, 5, 8, 3, 2, 6, 9)
+            flatten_patches = patches.reshape(
+                batch_size,
+                grid_t * grid_h * grid_w,
+                channel * temporal_patch_size * patch_size * patch_size,
+            )
+
+            processed_videos_grouped[shape] = flatten_patches
+            processed_grids[shape] = [[grid_t, grid_h, grid_w]] * batch_size
+
+        processed_videos = reorder_videos(processed_videos_grouped, grouped_videos_index)
+        processed_grids = reorder_videos(processed_grids, grouped_videos_index)
+        pixel_values_videos = torch.cat(processed_videos, dim=0)
+        video_grid_thw = torch.tensor(processed_grids)
+        data = {
+            "pixel_values_videos": pixel_values_videos,
+            "video_grid_thw": video_grid_thw,
+        }
+
+        return BatchFeature(data=data, tensor_type=return_tensors)
+
+
+__all__ = ["Glm4vVideoProcessor"]
diff --git a/phivenv/Lib/site-packages/transformers/models/glm4v_moe/__init__.py b/phivenv/Lib/site-packages/transformers/models/glm4v_moe/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..f99578a4be721ecdc5bcbd157fe75f8f16384086
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/glm4v_moe/__init__.py
@@ -0,0 +1,27 @@
+# Copyright 2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:  # static type checkers see the real star-imports
+    from .configuration_glm4v_moe import *
+    from .modeling_glm4v_moe import *
+else:  # at runtime this module's entry in `sys.modules` is replaced by a `_LazyModule`
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/phivenv/Lib/site-packages/transformers/models/glm4v_moe/__pycache__/__init__.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/glm4v_moe/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..88582c4cd9a6739b6c77f67eb05dcff13e0eb87e
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/glm4v_moe/__pycache__/__init__.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/glm4v_moe/__pycache__/configuration_glm4v_moe.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/glm4v_moe/__pycache__/configuration_glm4v_moe.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f6d1e30046356cbe63f783ea8d1d562036b07595
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/glm4v_moe/__pycache__/configuration_glm4v_moe.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/glm4v_moe/__pycache__/modeling_glm4v_moe.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/glm4v_moe/__pycache__/modeling_glm4v_moe.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..52d9efec70e028e225885de585e86d95e4024979
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/glm4v_moe/__pycache__/modeling_glm4v_moe.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/glm4v_moe/__pycache__/modular_glm4v_moe.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/glm4v_moe/__pycache__/modular_glm4v_moe.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bd993889e2f0b9963c3471051bb3fd67bfee14a9
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/glm4v_moe/__pycache__/modular_glm4v_moe.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/glm4v_moe/configuration_glm4v_moe.py b/phivenv/Lib/site-packages/transformers/models/glm4v_moe/configuration_glm4v_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..52004b560da764a293c70b8021de1d167d8e9cc8
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/glm4v_moe/configuration_glm4v_moe.py
@@ -0,0 +1,385 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/glm4v_moe/modular_glm4v_moe.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_glm4v_moe.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# coding=utf-8
+# Copyright 2025 The ZhipuAI Inc. team and HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from ...configuration_utils import PretrainedConfig
+from ...modeling_rope_utils import rope_config_validation
+
+
+class Glm4vMoeVisionConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Glm4vMoeVisionModel`]. It is used to instantiate a
+    Glm4vMoeVisionModel according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield
+    a similar configuration to that of
+    GLM-4.1V-9B-Thinking [THUDM/GLM-4.1V-9B-Thinking](https://huggingface.co/THUDM/GLM-4.1V-9B-Thinking).
+
+    Args:
+        hidden_size (`int`, *optional*, defaults to 1536):
+            Dimensionality of the encoder layers and the pooler layer.
+        depth (`int`, *optional*, defaults to 24):
+            Number of layers (depth) in the model.
+        attention_bias (`bool`, *optional*, defaults to `False`):
+            Whether to add a bias to the queries, keys and values.
+        intermediate_size (`int`, *optional*, defaults to 13696):
+            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"selu"` and `"gelu_new"` are supported.
+        num_heads (`int`, *optional*, defaults to 12):
+            Number of attention heads; stored on the config as `num_heads`.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            Dropout probability for attention weights.
+        hidden_dropout_prob, projection_dropout (`float`, *optional*):
+            Not explicit `__init__` arguments; if given they are forwarded to `PretrainedConfig` through `**kwargs`.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        image_size (`int` or `list[int]`, *optional*, defaults to 336):
+            The size (resolution) of each image.
+        patch_size (`int`, *optional*, defaults to `14`):
+            The size (resolution) of each patch.
+        in_channels (`int`, *optional*, defaults to 3):
+            The number of input channels.
+        out_hidden_size (`int`, *optional*, defaults to 4096):
+            The output hidden size of the vision model.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the rms normalization layers.
+        spatial_merge_size (`int`, *optional*, defaults to 2):
+            The size used for merging spatial dimensions.
+        temporal_patch_size (`int`, *optional*, defaults to 2):
+            The size used for patches along the temporal dimension.
+    Example:
+
+    ```python
+    >>> from transformers import Glm4vMoeVisionConfig, Glm4vMoeVisionModel
+
+    >>> # Initializing a Glm4vMoeVisionConfig GLM-4.1V-9B style configuration
+    >>> configuration = Glm4vMoeVisionConfig()
+
+    >>> # Initializing a model (with random weights) from the GLM-4.1V-9B configuration
+    >>> model = Glm4vMoeVisionModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "glm4v_moe"
+    base_config_key = "vision_config"
+
+    def __init__(
+        self,
+        depth=24,
+        hidden_size=1536,
+        hidden_act="silu",
+        attention_bias=False,
+        attention_dropout=0.0,
+        num_heads=12,
+        in_channels=3,
+        image_size=336,
+        patch_size=14,
+        rms_norm_eps=1e-05,
+        spatial_merge_size=2,
+        temporal_patch_size=2,
+        out_hidden_size=4096,
+        intermediate_size=13696,
+        initializer_range=0.02,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        self.depth = depth
+        self.hidden_size = hidden_size
+        self.hidden_act = hidden_act
+        self.num_heads = num_heads
+        self.in_channels = in_channels
+        self.image_size = image_size
+        self.patch_size = patch_size
+        self.spatial_merge_size = spatial_merge_size
+        self.temporal_patch_size = temporal_patch_size
+        self.out_hidden_size = out_hidden_size
+        self.intermediate_size = intermediate_size
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+
+
+class Glm4vMoeTextConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Glm4vMoeModel`]. It is used to instantiate a
+    GLM-4.5V model according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of
+    GLM-4.5V [zai-org/GLM-4.5V](https://huggingface.co/zai-org/GLM-4.5V).
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 151424):
+            Vocabulary size of the Glm4vMoe model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`Glm4vMoeModel`]
+        hidden_size (`int`, *optional*, defaults to 4096):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 10944):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 46):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 96):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        partial_rotary_factor (`float`, *optional*, defaults to 0.5): The factor of the partial rotary position.
+        num_key_value_heads (`int`, *optional*, defaults to 8):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details checkout [this
+            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to `8`.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the decoder.
+        max_position_embeddings (`int`, *optional*, defaults to 65536):
+            The maximum sequence length that this model might ever be used with.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether the model's input and output word embeddings should be tied.
+        rope_theta (`float`, *optional*, defaults to 10000.0):
+            The base period of the RoPE embeddings.
+        rope_scaling (`Dict`, *optional*):
+            Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
+            and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
+            accordingly.
+            Expected contents:
+                `rope_type` (`str`):
+                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
+                    'llama3'], with 'default' being the original RoPE implementation.
+                `factor` (`float`, *optional*):
+                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
+                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
+                    original maximum pre-trained length.
+                `original_max_position_embeddings` (`int`, *optional*):
+                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
+                    pretraining.
+                `attention_factor` (`float`, *optional*):
+                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
+                    computation. If unspecified, it defaults to value recommended by the implementation, using the
+                    `factor` field to infer the suggested value.
+        attention_bias (`bool`, *optional*, defaults to `True`):
+            Whether to use a bias in the query, key, value and output projection layers during self-attention.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        moe_intermediate_size (`int`, *optional*, defaults to 1408):
+            Intermediate size of the routed expert.
+        num_experts_per_tok (`int`, *optional*, defaults to 8):
+            number of experts per token.
+        n_shared_experts (`int`, *optional*, defaults to 1):
+            Number of shared experts.
+        n_routed_experts (`int`, *optional*, defaults to 128):
+            Number of routed experts.
+        routed_scaling_factor (`float`, *optional*, defaults to 1.0):
+            Scaling factor for routed experts.
+        n_group (`int`, *optional*, defaults to 1):
+            Number of groups for routed experts.
+        topk_group (`int`, *optional*, defaults to 1):
+            Number of selected groups for each token(for each token, ensuring the selected experts is only within `topk_group` groups).
+        first_k_dense_replace (`int`, *optional*, defaults to 1):
+            Number of dense layers in shallow layers(embed->dense->dense->...->dense->moe->moe...->lm_head).
+                                                                    \--k dense layers--/
+        norm_topk_prob (`bool`, *optional*, defaults to `True`):
+            Whether to normalize the topk probabilities.
+
+    ```python
+    >>> from transformers import Glm4vMoeTextModel, Glm4vMoeTextConfig
+
+    >>> # Initializing a GLM-4.5V style text configuration
+    >>> configuration = Glm4vMoeTextConfig()
+
+    >>> # Initializing a model from the GLM-4.5V style configuration
+    >>> model = Glm4vMoeTextModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "Glm4vMoe_text"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    # Default tensor parallel plan for base model `Glm4vMoe`
+    base_model_tp_plan = {
+        "layers.*.self_attn.q_proj": "colwise",
+        "layers.*.self_attn.k_proj": "colwise",
+        "layers.*.self_attn.v_proj": "colwise",
+        "layers.*.self_attn.o_proj": "rowwise",
+        "layers.*.mlp.gate_up_proj": "colwise_rep",  # we need to replicate here due to the `chunk` operation
+        "layers.*.mlp.down_proj": "rowwise_rep",  # we need to replicate here due to the `chunk` operation
+    }
+    base_model_pp_plan = {
+        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
+        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
+        "norm": (["hidden_states"], ["hidden_states"]),
+    }
+    base_config_key = "text_config"
+
+    def __init__(
+        self,
+        vocab_size=151424,
+        hidden_size=4096,
+        intermediate_size=10944,
+        num_hidden_layers=46,
+        num_attention_heads=96,
+        partial_rotary_factor=0.5,
+        num_key_value_heads=8,
+        hidden_act="silu",
+        max_position_embeddings=65536,
+        initializer_range=0.02,
+        rms_norm_eps=1e-5,
+        use_cache=True,
+        tie_word_embeddings=False,
+        rope_theta=10000.0,
+        rope_scaling=None,
+        attention_bias=True,
+        attention_dropout=0.0,
+        moe_intermediate_size=1408,
+        num_experts_per_tok=8,
+        n_shared_experts=1,
+        n_routed_experts=128,
+        routed_scaling_factor=1.0,
+        n_group=1,
+        topk_group=1,
+        first_k_dense_replace=1,
+        norm_topk_prob=True,
+        **kwargs,
+    ):
+        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.partial_rotary_factor = partial_rotary_factor
+
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+        # Validate the correctness of rotary position embeddings parameters
+        # BC: if there is a 'type' field, copy it to 'rope_type' (the 'type' key itself is left in place).
+        if self.rope_scaling is not None and "type" in self.rope_scaling:
+            self.rope_scaling["rope_type"] = self.rope_scaling["type"]
+        rope_config_validation(self, ignore_keys={"mrope_section"})
+
+        # MoE arguments
+        self.moe_intermediate_size = moe_intermediate_size
+        self.num_experts_per_tok = num_experts_per_tok
+        self.n_group = n_group
+        self.topk_group = topk_group
+        self.n_shared_experts = n_shared_experts
+        self.n_routed_experts = n_routed_experts
+        self.routed_scaling_factor = routed_scaling_factor
+        self.first_k_dense_replace = first_k_dense_replace
+        self.norm_topk_prob = norm_topk_prob
+
+
+class Glm4vMoeConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Glm4vMoeModel`]. It is used to instantiate a
+    GLM-4.5V model according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of
+    GLM-4.5V [zai-org/GLM-4.5V](https://huggingface.co/zai-org/GLM-4.5V).
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        text_config (`Union[PretrainedConfig, dict]`, *optional*, defaults to `Glm4vMoeTextConfig`):
+            The config object or dictionary of the text backbone.
+        vision_config (`Union[PretrainedConfig, dict]`,  *optional*, defaults to `Glm4vMoeVisionConfig`):
+            The config object or dictionary of the vision backbone.
+        image_token_id (`int`, *optional*, defaults to 151363):
+            The image token index to encode the image prompt.
+        video_token_id (`int`, *optional*, defaults to 151364):
+            The video token index to encode the video prompt.
+        image_start_token_id (`int`, *optional*, defaults to 151339):
+            The image start token index to encode the start of image.
+        image_end_token_id (`int`, *optional*, defaults to 151340):
+            The image end token index to encode the end of image.
+        video_start_token_id (`int`, *optional*, defaults to 151341):
+            The video start token index to encode the start of video.
+        video_end_token_id (`int`, *optional*, defaults to 151342):
+            The video end token index to encode the end of video.
+
+    ```python
+    >>> from transformers import Glm4vMoeForConditionalGeneration, Glm4vMoeConfig
+
+    >>> # Initializing a GLM-4.5V style configuration
+    >>> configuration = Glm4vMoeConfig()
+
+    >>> # Initializing a model from the GLM-4.5V style configuration
+    >>> model = Glm4vMoeForConditionalGeneration(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "glm4v_moe"
+    sub_configs = {"vision_config": Glm4vMoeVisionConfig, "text_config": Glm4vMoeTextConfig}
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        text_config=None,
+        vision_config=None,
+        image_token_id=151363,
+        video_token_id=151364,
+        image_start_token_id=151339,
+        image_end_token_id=151340,
+        video_start_token_id=151341,
+        video_end_token_id=151342,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        # Each sub-config may be a dict, `None` (library defaults) or an already-built config object, stored as given.
+        if isinstance(vision_config, dict):
+            vision_config = self.sub_configs["vision_config"](**vision_config)
+        self.vision_config = self.sub_configs["vision_config"]() if vision_config is None else vision_config
+
+        if isinstance(text_config, dict):
+            text_config = self.sub_configs["text_config"](**text_config)
+        # For BC use all kwargs to init `TextConfig`
+        self.text_config = self.sub_configs["text_config"](**kwargs) if text_config is None else text_config
+
+        # Token ids for the image/video placeholders and for their start/end markers.
+        self.image_token_id = image_token_id
+        self.video_token_id = video_token_id
+        self.video_start_token_id = video_start_token_id
+        self.video_end_token_id = video_end_token_id
+        self.image_start_token_id = image_start_token_id
+        self.image_end_token_id = image_end_token_id
+
+
+__all__ = ["Glm4vMoeConfig", "Glm4vMoeTextConfig"]
diff --git a/phivenv/Lib/site-packages/transformers/models/glm4v_moe/modeling_glm4v_moe.py b/phivenv/Lib/site-packages/transformers/models/glm4v_moe/modeling_glm4v_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..1511ca34833b97d8977fff97a97d54815195673e
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/glm4v_moe/modeling_glm4v_moe.py
@@ -0,0 +1,1768 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/glm4v_moe/modular_glm4v_moe.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_glm4v_moe.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# coding=utf-8
+# Copyright 2025 The ZhipuAI Inc. team and HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import itertools
+from dataclasses import dataclass
+from typing import Any, Callable, Optional, Union
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.nn import LayerNorm
+
+from ...activations import ACT2FN
+from ...cache_utils import Cache, DynamicCache
+from ...generation import GenerationMixin
+from ...integrations import use_kernel_forward_from_hub
+from ...masking_utils import create_causal_mask
+from ...modeling_flash_attention_utils import FlashAttentionKwargs
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import BaseModelOutputWithPast, ModelOutput
+from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from ...processing_utils import Unpack
+from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, is_torchdynamo_compiling, logging
+from ...utils.deprecation import deprecate_kwarg
+from ...utils.generic import check_model_inputs
+from .configuration_glm4v_moe import Glm4vMoeConfig, Glm4vMoeTextConfig, Glm4vMoeVisionConfig
+
+
+logger = logging.get_logger(__name__)  # module-level logger; used for the one-time warning in the vision attention
+
+
+@use_kernel_forward_from_hub("RMSNorm")
+class Glm4vMoeRMSNorm(nn.Module):
+    def __init__(self, hidden_size, eps=1e-6):
+        """
+        Glm4vMoeRMSNorm is equivalent to T5LayerNorm
+        """
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+
+    def forward(self, hidden_states):
+        input_dtype = hidden_states.dtype
+        hidden_states = hidden_states.to(torch.float32)
+        variance = hidden_states.pow(2).mean(-1, keepdim=True)
+        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+        return self.weight * hidden_states.to(input_dtype)
+
+    def extra_repr(self):
+        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
+
+
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """
+    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
+    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
+    """
+    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+    if n_rep == 1:
+        return hidden_states
+    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
+    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
+
+
+def eager_attention_forward(
+    module: nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: Optional[torch.Tensor],
+    scaling: float,
+    dropout: float = 0.0,
+    **kwargs: Unpack[TransformersKwargs],
+):
+    key_states = repeat_kv(key, module.num_key_value_groups)
+    value_states = repeat_kv(value, module.num_key_value_groups)
+
+    attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
+    if attention_mask is not None:
+        causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
+        attn_weights = attn_weights + causal_mask
+
+    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
+    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
+    attn_output = torch.matmul(attn_weights, value_states)
+    attn_output = attn_output.transpose(1, 2).contiguous()
+
+    return attn_output, attn_weights
+
+
+def rotate_half(x):
+    """Rotates half the hidden dims of the input."""
+    x1 = x[..., : x.shape[-1] // 2]
+    x2 = x[..., x.shape[-1] // 2 :]
+    return torch.cat((-x2, x1), dim=-1)
+
+
+def apply_multimodal_rotary_pos_emb(q, k, cos, sin, mrope_section, unsqueeze_dim=1):
+    """Applies Rotary Position Embedding with Multimodal Sections to the query and key tensors (https://qwenlm.github.io/blog/qwen2-vl/).
+
+    Explanation:
+        Multimodal 3D rotary position embedding is an extension to 1D rotary position embedding. The input embedding
+        sequence contains vision (images / videos) embedding and text embedding or just contains text embedding. For
+        vision embedding part, we apply rotary position embedding on temporal, height and width dimension separately.
+        Here we split the channel dimension to 3 chunks for the temporal, height and width rotary position embedding.
+        For text embedding part, we just apply 1D rotary position embedding. The three rotary position index (temporal,
+        height and width) of text embedding is always the same, so the text embedding rotary position embedding has no
+        difference with modern LLMs.
+
+    Args:
+        q (`torch.Tensor`): The query tensor.
+        k (`torch.Tensor`): The key tensor.
+        cos (`torch.Tensor`): The cosine part of the rotary embedding.
+        sin (`torch.Tensor`): The sine part of the rotary embedding.
+        mrope_section(`List(int)`):
+            Multimodal rope section is for channel dimension of temporal, height and width in rope calculation.
+        unsqueeze_dim (`int`, *optional*, defaults to 1):
+            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
+            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
+            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
+            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
+            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
+            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
+    Returns:
+        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
+    """
+    mrope_section = mrope_section * 2
+    cos = torch.cat([m[i % 3] for i, m in enumerate(cos.split(mrope_section, dim=-1))], dim=-1).unsqueeze(
+        unsqueeze_dim
+    )
+    sin = torch.cat([m[i % 3] for i, m in enumerate(sin.split(mrope_section, dim=-1))], dim=-1).unsqueeze(
+        unsqueeze_dim
+    )
+
+    # Keep half or full tensor for later concatenation
+    rotary_dim = cos.shape[-1]
+    q_rot, q_pass = q[..., :rotary_dim], q[..., rotary_dim:]
+    k_rot, k_pass = k[..., :rotary_dim], k[..., rotary_dim:]
+
+    # Apply rotary embeddings on the first half or full tensor
+    q_embed = (q_rot * cos) + (rotate_half(q_rot) * sin)
+    k_embed = (k_rot * cos) + (rotate_half(k_rot) * sin)
+
+    # Concatenate back to full shape
+    q_embed = torch.cat([q_embed, q_pass], dim=-1)
+    k_embed = torch.cat([k_embed, k_pass], dim=-1)
+
+    return q_embed, k_embed
+
+
+class Glm4vMoeTextAttention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+    def __init__(self, config: Glm4vMoeTextConfig, layer_idx: Optional[int] = None):
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+        self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
+        self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
+        self.scaling = self.head_dim**-0.5
+        self.attention_dropout = config.attention_dropout
+        self.is_causal = True
+
+        self.q_proj = nn.Linear(
+            config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.k_proj = nn.Linear(
+            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.v_proj = nn.Linear(
+            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.o_proj = nn.Linear(config.num_attention_heads * self.head_dim, config.hidden_size, bias=False)
+        self.rope_scaling = config.rope_scaling
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
+        attention_mask: Optional[torch.Tensor],
+        past_key_values: Optional[Cache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
+        input_shape = hidden_states.shape[:-1]
+        hidden_shape = (*input_shape, -1, self.head_dim)
+
+        query_states = self.q_proj(hidden_states).view(hidden_shape)
+        key_states = self.k_proj(hidden_states).view(hidden_shape)
+        value_states = self.v_proj(hidden_states).view(hidden_shape)
+
+        query_states = query_states.transpose(1, 2)
+        key_states = key_states.transpose(1, 2)
+        value_states = value_states.transpose(1, 2)
+
+        cos, sin = position_embeddings
+        query_states, key_states = apply_multimodal_rotary_pos_emb(  # diff with Llama
+            query_states, key_states, cos, sin, self.rope_scaling["mrope_section"]
+        )
+
+        if past_key_values is not None:
+            # sin and cos are specific to RoPE models; position_ids needed for the static cache
+            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        attention_interface: Callable = eager_attention_forward
+        if self.config._attn_implementation != "eager":
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            dropout=0.0 if not self.training else self.attention_dropout,
+            scaling=self.scaling,
+            **kwargs,
+        )
+
+        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
+        attn_output = self.o_proj(attn_output)
+        return attn_output, attn_weights
+
+
+class Glm4vMoeTextTopkRouter(nn.Module):
+    def __init__(self, config: Glm4vMoeTextConfig):
+        super().__init__()
+        self.config = config
+        self.top_k = config.num_experts_per_tok
+        self.n_routed_experts = config.n_routed_experts
+        self.routed_scaling_factor = config.routed_scaling_factor
+        self.n_group = config.n_group
+        self.topk_group = config.topk_group
+        self.norm_topk_prob = config.norm_topk_prob
+
+        self.weight = nn.Parameter(torch.empty((self.n_routed_experts, config.hidden_size)))
+        self.register_buffer("e_score_correction_bias", torch.zeros((self.n_routed_experts), dtype=torch.float32))
+
+    @torch.no_grad()
+    def get_topk_indices(self, scores):
+        scores_for_choice = scores.view(-1, self.n_routed_experts) + self.e_score_correction_bias.unsqueeze(0)
+        group_scores = (
+            scores_for_choice.view(-1, self.n_group, self.n_routed_experts // self.n_group)
+            .topk(2, dim=-1)[0]
+            .sum(dim=-1)
+        )
+        group_idx = torch.topk(group_scores, k=self.topk_group, dim=-1, sorted=False)[1]
+        group_mask = torch.zeros_like(group_scores)
+        group_mask.scatter_(1, group_idx, 1)
+        score_mask = (
+            group_mask.unsqueeze(-1)
+            .expand(-1, self.n_group, self.n_routed_experts // self.n_group)
+            .reshape(-1, self.n_routed_experts)
+        )
+        scores_for_choice = scores_for_choice.masked_fill(~score_mask.bool(), 0.0)
+        topk_indices = torch.topk(scores_for_choice, k=self.top_k, dim=-1, sorted=False)[1]
+        return topk_indices
+
+    def forward(self, hidden_states):
+        hidden_states = hidden_states.view(-1, self.config.hidden_size)
+        router_logits = F.linear(hidden_states.type(torch.float32), self.weight.type(torch.float32))
+        scores = router_logits.sigmoid()
+        topk_indices = self.get_topk_indices(scores)
+        topk_weights = scores.gather(1, topk_indices)
+        if self.norm_topk_prob:
+            denominator = topk_weights.sum(dim=-1, keepdim=True) + 1e-20
+            topk_weights /= denominator
+        topk_weights = topk_weights * self.routed_scaling_factor
+        return topk_indices, topk_weights
+
+
+class Glm4vMoeTextMoE(nn.Module):
+    """
+    A mixed expert module containing shared experts.
+    """
+
+    def __init__(self, config: Glm4vMoeTextConfig):
+        super().__init__()
+        self.config = config
+        self.experts = nn.ModuleList(
+            [
+                Glm4vMoeTextMLP(config, intermediate_size=config.moe_intermediate_size)
+                for _ in range(config.n_routed_experts)
+            ]
+        )
+        self.gate = Glm4vMoeTextTopkRouter(config)
+        self.shared_experts = Glm4vMoeTextMLP(
+            config=config, intermediate_size=config.moe_intermediate_size * config.n_shared_experts
+        )
+
+    def moe(self, hidden_states: torch.Tensor, topk_indices: torch.Tensor, topk_weights: torch.Tensor):
+        r"""
+        CALL FOR CONTRIBUTION! I don't have time to optimise this right now, but expert weights need to be fused
+        to not have to do a loop here (deepseek has 256 experts soooo yeah).
+        """
+        final_hidden_states = torch.zeros_like(hidden_states, dtype=topk_weights.dtype)
+        expert_mask = torch.nn.functional.one_hot(topk_indices, num_classes=len(self.experts))
+        expert_mask = expert_mask.permute(2, 0, 1)
+
+        for expert_idx in range(len(self.experts)):
+            expert = self.experts[expert_idx]
+            mask = expert_mask[expert_idx]
+            token_indices, weight_indices = torch.where(mask)
+
+            if token_indices.numel() > 0:
+                expert_weights = topk_weights[token_indices, weight_indices]
+                expert_input = hidden_states[token_indices]
+                expert_output = expert(expert_input)
+                weighted_output = expert_output * expert_weights.unsqueeze(-1)
+                final_hidden_states.index_add_(0, token_indices, weighted_output)
+
+        # in original deepseek, the output of the experts are gathered once we leave this module
+        # thus the moe module is itelsf an IsolatedParallel module
+        # and all expert are "local" meaning we shard but we don't gather
+        return final_hidden_states.type(hidden_states.dtype)
+
+    def forward(self, hidden_states):
+        residuals = hidden_states
+        orig_shape = hidden_states.shape
+        topk_indices, topk_weights = self.gate(hidden_states)
+        hidden_states = hidden_states.view(-1, hidden_states.shape[-1])
+        hidden_states = self.moe(hidden_states, topk_indices, topk_weights).view(*orig_shape)
+        hidden_states = hidden_states + self.shared_experts(residuals)
+        return hidden_states
+
+
+class Glm4vMoeTextMLP(nn.Module):
+    def __init__(self, config, hidden_size=None, intermediate_size=None):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size if hidden_size is None else hidden_size
+        self.intermediate_size = config.intermediate_size if intermediate_size is None else intermediate_size
+
+        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
+        self.act_fn = ACT2FN[config.hidden_act]
+
+    def forward(self, x):
+        down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
+        return down_proj
+
+
+@use_kernel_forward_from_hub("RMSNorm")
+class Glm4vMoeTextRMSNorm(nn.Module):
+    def __init__(self, hidden_size, eps=1e-6):
+        """
+        Glm4vMoeTextRMSNorm is equivalent to T5LayerNorm
+        """
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+
+    def forward(self, hidden_states):
+        input_dtype = hidden_states.dtype
+        hidden_states = hidden_states.to(torch.float32)
+        variance = hidden_states.pow(2).mean(-1, keepdim=True)
+        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+        return self.weight * hidden_states.to(input_dtype)
+
+    def extra_repr(self):
+        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
+
+
+class Glm4vMoeTextDecoderLayer(GradientCheckpointingLayer):
+    def __init__(self, config: Glm4vMoeTextConfig, layer_idx: int):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+
+        self.self_attn = Glm4vMoeTextAttention(config=config, layer_idx=layer_idx)
+
+        if layer_idx >= config.first_k_dense_replace:
+            self.mlp = Glm4vMoeTextMoE(config)
+        else:
+            self.mlp = Glm4vMoeTextMLP(config)
+
+        self.input_layernorm = Glm4vMoeTextRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = Glm4vMoeTextRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        use_cache: Optional[bool] = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> torch.Tensor:
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+        # Self Attention
+        hidden_states, _ = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            position_embeddings=position_embeddings,
+            **kwargs,
+        )
+        hidden_states = residual + hidden_states
+
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states
+        return hidden_states
+
+
+@auto_docstring
+class Glm4vMoePreTrainedModel(PreTrainedModel):
+    config: Glm4vMoeConfig
+    base_model_prefix = ""
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["Glm4vMoeTextDecoderLayer", "Glm4vMoeVisionBlock"]  # class names of the text / vision layers
+    _skip_keys_device_placement = "past_key_values"
+    _supports_flash_attn = True
+    _supports_sdpa = True
+    _supports_flex_attn = True
+    _can_compile_fullgraph = False
+    _supports_attention_backend = True
+
+    _can_record_outputs = {  # NOTE(review): presumably maps an output name to the module class it is captured from
+        "hidden_states": Glm4vMoeTextDecoderLayer,
+        "attentions": Glm4vMoeTextAttention,
+    }
+
+    def _init_weights(self, module):
+        super()._init_weights(module)  # parent-class initialisation runs first
+        if isinstance(module, Glm4vMoeTextTopkRouter):  # the router weight is allocated with torch.empty
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)  # in-place normal init
+
+
+class Glm4vMoeisionMlp(nn.Module):
+    def __init__(self, config, bias: bool = False):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.intermediate_size = config.out_hidden_size
+        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=bias)
+        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=bias)
+        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=bias)
+        self.act_fn = ACT2FN[config.hidden_act]
+
+    def forward(self, hidden_state):
+        return self.down_proj(self.act_fn(self.gate_proj(hidden_state)) * self.up_proj(hidden_state))
+
+
+class Glm4vMoeVisionPatchEmbed(nn.Module):
+    def __init__(self, config: Glm4vMoeVisionConfig) -> None:
+        super().__init__()
+        self.patch_size = config.patch_size
+        self.temporal_patch_size = config.temporal_patch_size
+        self.in_channels = config.in_channels
+        self.embed_dim = config.hidden_size
+
+        kernel_size = [self.temporal_patch_size, self.patch_size, self.patch_size]
+        self.proj = nn.Conv3d(self.in_channels, self.embed_dim, kernel_size=kernel_size, stride=kernel_size)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        target_dtype = self.proj.weight.dtype
+        hidden_states = hidden_states.view(
+            -1, self.in_channels, self.temporal_patch_size, self.patch_size, self.patch_size
+        )
+        hidden_states = self.proj(hidden_states.to(dtype=target_dtype)).view(-1, self.embed_dim)
+        return hidden_states
+
+
+class Glm4vMoeVisionRotaryEmbedding(nn.Module):
+    inv_freq: torch.Tensor  # fix linting for `register_buffer`
+
+    def __init__(self, dim: int, theta: float = 10000.0) -> None:
+        super().__init__()
+        inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+
+    def forward(self, seqlen: int) -> torch.Tensor:
+        seq = torch.arange(seqlen, device=self.inv_freq.device, dtype=self.inv_freq.dtype)
+        freqs = torch.outer(seq, self.inv_freq)
+        return freqs
+
+
+class Glm4vMoeVisionPatchMerger(nn.Module):
+    def __init__(self, dim: int, context_dim: int, hidden_act: str, bias: bool = False) -> None:
+        super().__init__()
+        self.proj = nn.Linear(dim, dim, bias=bias)
+        self.post_projection_norm = LayerNorm(dim)
+        self.gate_proj = nn.Linear(dim, context_dim, bias=bias)
+        self.up_proj = nn.Linear(dim, context_dim, bias=bias)
+        self.down_proj = nn.Linear(context_dim, dim, bias=bias)
+        self.act1 = nn.GELU()
+        self.act_fn = ACT2FN[hidden_act]
+
+    def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
+        hidden_state = self.proj(hidden_state)
+        hidden_state = self.act1(self.post_projection_norm(hidden_state))
+        return self.down_proj(self.act_fn(self.gate_proj(hidden_state)) * self.up_proj(hidden_state))
+
+
+class Glm4vMoeVisionEmbeddings(nn.Module):
+    def __init__(self, config: Glm4vMoeVisionConfig):
+        super().__init__()
+        self.config = config
+        self.embed_dim = config.hidden_size
+        self.image_size = config.image_size
+        self.patch_size = config.patch_size
+
+        self.num_patches = (self.image_size // self.patch_size) ** 2
+        self.num_positions = self.num_patches
+        self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
+        self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)
+
+    def forward(self, embeddings, lengths, image_shapes, h_coords, w_coords) -> torch.Tensor:
+        """
+        Forward pass with integrated position encoding adaptation using 2D interpolation.
+
+        Args:
+            embeddings: Input embeddings tensor
+            lengths (torch.Tensor): Sequence lengths for each image in the batch.
+            image_shapes (torch.Tensor): Tensor of shape [batch_size, 3] representing the image shapes (t, h, w).
+            h_coords (torch.Tensor): Tensor of shape [total_seq] representing the h coordinate for each patch.
+            w_coords (torch.Tensor): Tensor of shape [total_seq] representing the w coordinate for each patch.
+
+        Returns:
+            torch.Tensor: Embeddings with adapted position encoding added.
+        """
+        # Get position embedding parameters
+        pos_embed_weight = self.position_embedding.weight
+        hidden_size = pos_embed_weight.shape[1]
+        total_seq = h_coords.shape[0]
+        device = pos_embed_weight.device
+
+        # Move coordinates to correct device
+        h_coords, w_coords = h_coords.to(device), w_coords.to(device)
+
+        # Handle empty sequence case
+        if total_seq == 0:
+            adapted_pos_embed = torch.empty(0, hidden_size, device=device, dtype=pos_embed_weight.dtype)
+        else:
+            # Convert inputs to tensors if needed
+            if isinstance(lengths, list):
+                lengths = torch.tensor(lengths, device=device, dtype=torch.long)
+            if not isinstance(image_shapes, torch.Tensor):
+                image_shapes = torch.tensor(image_shapes, device=device, dtype=torch.long)
+
+            # Prepare 2D position embedding
+            orig_size_sq = pos_embed_weight.shape[0]
+            orig_size = int(orig_size_sq**0.5)
+            pos_embed_2d = (
+                pos_embed_weight.view(orig_size, orig_size, hidden_size)
+                .permute(2, 0, 1)
+                .unsqueeze(0)
+                .to(device=device, dtype=torch.float32)
+            )
+
+            # Calculate target dimensions for each patch
+            target_h = torch.cat([image_shapes[i, 1].repeat(lengths[i]) for i in range(len(lengths))]).to(
+                device=device, dtype=torch.float32
+            )
+            target_w = torch.cat([image_shapes[i, 2].repeat(lengths[i]) for i in range(len(lengths))]).to(
+                device=device, dtype=torch.float32
+            )
+
+            # Normalize coordinates to [-1, 1] range for grid_sample
+            h_coords = h_coords.to(device=device, dtype=torch.float32)
+            w_coords = w_coords.to(device=device, dtype=torch.float32)
+            norm_w = ((w_coords + 0.5) / target_w) * 2 - 1
+            norm_h = ((h_coords + 0.5) / target_h) * 2 - 1
+
+            # Create sampling grid
+            grid = torch.stack((norm_w, norm_h), dim=-1).unsqueeze(0).unsqueeze(2)
+
+            # Perform bicubic interpolation
+            interpolated_embed_fp32 = F.grid_sample(
+                pos_embed_2d, grid, mode="bicubic", align_corners=False, padding_mode="border"
+            )
+
+            # Reshape and convert back to original dtype
+            adapted_pos_embed_fp32 = interpolated_embed_fp32.squeeze(0).squeeze(-1).permute(1, 0)
+            adapted_pos_embed = adapted_pos_embed_fp32.to(pos_embed_weight.dtype).to(embeddings.device)
+
+        # Add adapted position encoding to embeddings
+        embeddings = embeddings + adapted_pos_embed
+        return embeddings
+
+
+def apply_rotary_pos_emb_vision(
+    q: torch.Tensor, k: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor
+) -> tuple[torch.Tensor, torch.Tensor]:
+    orig_q_dtype = q.dtype
+    orig_k_dtype = k.dtype
+    q, k = q.float(), k.float()
+    cos, sin = cos.unsqueeze(-2).float(), sin.unsqueeze(-2).float()
+    q_embed = (q * cos) + (rotate_half(q) * sin)
+    k_embed = (k * cos) + (rotate_half(k) * sin)
+    q_embed = q_embed.to(orig_q_dtype)
+    k_embed = k_embed.to(orig_k_dtype)
+    return q_embed, k_embed
+
+
+class Glm4vMoeVisionAttention(nn.Module):
+    def __init__(self, config: Glm4vMoeVisionConfig) -> None:
+        super().__init__()
+        self.dim = config.hidden_size
+        self.num_heads = config.num_heads
+        self.head_dim = self.dim // self.num_heads
+        self.num_key_value_groups = 1  # needed for eager attention
+        self.qkv = nn.Linear(config.hidden_size, config.hidden_size * 3, bias=config.attention_bias)
+        self.proj = nn.Linear(config.hidden_size, config.hidden_size, bias=False)
+        self.scaling = self.head_dim**-0.5
+        self.config = config
+        self.attention_dropout = config.attention_dropout
+        self.is_causal = False
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        rotary_pos_emb: Optional[torch.Tensor] = None,
+        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
+        **kwargs,
+    ) -> torch.Tensor:
+        seq_length = hidden_states.shape[0]
+        query_states, key_states, value_states = (
+            self.qkv(hidden_states).reshape(seq_length, 3, self.num_heads, -1).permute(1, 0, 2, 3).unbind(0)
+        )
+        if position_embeddings is None:
+            logger.warning_once(
+                "The attention layers in this model are transitioning from computing the RoPE embeddings internally "
+                "through `rotary_pos_emb` (2D tensor of RoPE theta values), to using externally computed "
+                "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.54 `rotary_pos_emb` will be "
+                "removed and `position_embeddings` will be mandatory."
+            )
+            emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1)
+            cos = emb.cos()
+            sin = emb.sin()
+        else:
+            cos, sin = position_embeddings
+        query_states, key_states = apply_rotary_pos_emb_vision(query_states, key_states, cos, sin)
+
+        query_states = query_states.transpose(0, 1).unsqueeze(0)
+        key_states = key_states.transpose(0, 1).unsqueeze(0)
+        value_states = value_states.transpose(0, 1).unsqueeze(0)
+
+        attention_interface: Callable = eager_attention_forward
+        if self.config._attn_implementation != "eager":
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+        if self.config._attn_implementation == "flash_attention_2":
+            # Flash Attention 2: Use cu_seqlens for variable length attention
+            max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max()
+            attn_output, _ = attention_interface(
+                self,
+                query_states,
+                key_states,
+                value_states,
+                attention_mask=None,
+                scaling=self.scaling,
+                dropout=0.0 if not self.training else self.attention_dropout,
+                cu_seq_lens_q=cu_seqlens,
+                cu_seq_lens_k=cu_seqlens,
+                max_length_q=max_seqlen,
+                max_length_k=max_seqlen,
+                is_causal=False,
+                **kwargs,
+            )
+        else:
+            # Other implementations: Process each chunk separately
+            lengths = cu_seqlens[1:] - cu_seqlens[:-1]
+            splits = [
+                torch.split(tensor, lengths.tolist(), dim=2) for tensor in (query_states, key_states, value_states)
+            ]
+
+            attn_outputs = [
+                attention_interface(
+                    self,
+                    q,
+                    k,
+                    v,
+                    attention_mask=None,
+                    scaling=self.scaling,
+                    dropout=0.0 if not self.training else self.attention_dropout,
+                    is_causal=False,
+                    **kwargs,
+                )[0]
+                for q, k, v in zip(*splits)
+            ]
+            attn_output = torch.cat(attn_outputs, dim=1)
+
+        attn_output = attn_output.reshape(seq_length, -1).contiguous()
+        attn_output = self.proj(attn_output)
+        return attn_output
+
+
+class Glm4vMoeVisionBlock(GradientCheckpointingLayer):
+    def __init__(self, config) -> None:
+        super().__init__()
+        self.norm1 = Glm4vMoeRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.norm2 = Glm4vMoeRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.attn = Glm4vMoeVisionAttention(config)
+        self.mlp = Glm4vMoeisionMlp(config, bias=False)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        rotary_pos_emb: Optional[torch.Tensor] = None,
+        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
+        **kwargs,
+    ) -> torch.Tensor:
+        hidden_states = hidden_states + self.attn(
+            self.norm1(hidden_states),
+            cu_seqlens=cu_seqlens,
+            rotary_pos_emb=rotary_pos_emb,
+            position_embeddings=position_embeddings,
+            **kwargs,
+        )
+        hidden_states = hidden_states + self.mlp(self.norm2(hidden_states))
+        return hidden_states
+
+
+class Glm4vMoeTextRotaryEmbedding(nn.Module):
+    inv_freq: torch.Tensor  # fix linting for `register_buffer`
+
+    def __init__(self, config: Glm4vMoeTextConfig, device=None):
+        super().__init__()
+        # BC: "rope_type" was originally "type"
+        if hasattr(config, "rope_scaling") and config.rope_scaling is not None:
+            self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
+        else:
+            self.rope_type = "default"
+        self.max_seq_len_cached = config.max_position_embeddings
+        self.original_max_seq_len = config.max_position_embeddings
+
+        self.config = config
+        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
+
+        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+        self.original_inv_freq = self.inv_freq
+
+    @torch.no_grad()
+    @dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
+    def forward(self, x, position_ids):
+        # In contrast to other models, Glm4vMoeText has different position ids for the grids
+        # So we expand the inv_freq to shape (3, ...)
+        inv_freq_expanded = self.inv_freq[None, None, :, None].float().expand(3, position_ids.shape[1], -1, 1)
+        position_ids_expanded = position_ids[:, :, None, :].float()  # shape (3, bs, 1, positions)
+
+        device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
+        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(2, 3)
+            emb = torch.cat((freqs, freqs), dim=-1)
+            cos = emb.cos() * self.attention_scaling
+            sin = emb.sin() * self.attention_scaling
+
+        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Base class for Llava outputs, with hidden states and attentions.
+    """
+)
+class Glm4vMoeModelOutputWithPast(ModelOutput):
+    r"""
+    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
+        `(batch_size, num_heads, sequence_length, embed_size_per_head)`.
+
+        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
+        `past_key_values` input) to speed up sequential decoding.
+    rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*):
+        The rope index difference between sequence length and multimodal rope.
+    """
+
+    last_hidden_state: torch.FloatTensor = None
+    past_key_values: Optional[list[torch.FloatTensor]] = None
+    hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    attentions: Optional[tuple[torch.FloatTensor]] = None
+    rope_deltas: Optional[torch.LongTensor] = None
+
+
+class Glm4vMoeVisionModel(Glm4vMoePreTrainedModel):
+    config: Glm4vMoeVisionConfig
+    _no_split_modules = ["Glm4vMoeVisionBlock"]
+
+    def __init__(self, config) -> None:
+        super().__init__(config)
+        self.spatial_merge_size = config.spatial_merge_size
+        self.patch_size = config.patch_size
+
+        self.embeddings = Glm4vMoeVisionEmbeddings(config)
+        self.patch_embed = Glm4vMoeVisionPatchEmbed(config)
+
+        head_dim = config.hidden_size // config.num_heads
+        self.rotary_pos_emb = Glm4vMoeVisionRotaryEmbedding(head_dim // 2)
+
+        self.blocks = nn.ModuleList([Glm4vMoeVisionBlock(config) for _ in range(config.depth)])
+        self.merger = Glm4vMoeVisionPatchMerger(
+            dim=config.out_hidden_size, context_dim=config.intermediate_size, hidden_act=config.hidden_act
+        )
+
+        self.post_conv_layernorm = Glm4vMoeRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.downsample = nn.Conv2d(
+            in_channels=config.hidden_size,
+            out_channels=config.out_hidden_size,
+            kernel_size=config.spatial_merge_size,
+            stride=config.spatial_merge_size,
+        )
+        self.post_layernorm = Glm4vMoeRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+        self.gradient_checkpointing = False
+        self.post_init()
+
+    def rot_pos_emb(self, grid_thw):
+        pos_ids = []
+        for t, h, w in grid_thw:
+            hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w)
+            hpos_ids = hpos_ids.reshape(
+                h // self.spatial_merge_size,
+                self.spatial_merge_size,
+                w // self.spatial_merge_size,
+                self.spatial_merge_size,
+            )
+            hpos_ids = hpos_ids.permute(0, 2, 1, 3)
+            hpos_ids = hpos_ids.flatten()
+
+            wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1)
+            wpos_ids = wpos_ids.reshape(
+                h // self.spatial_merge_size,
+                self.spatial_merge_size,
+                w // self.spatial_merge_size,
+                self.spatial_merge_size,
+            )
+            wpos_ids = wpos_ids.permute(0, 2, 1, 3)
+            wpos_ids = wpos_ids.flatten()
+            pos_ids.append(torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1))
+        pos_ids = torch.cat(pos_ids, dim=0)
+        max_grid_size = grid_thw[:, 1:].max()
+        rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size)
+        rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1)
+        return rotary_pos_emb, pos_ids
+
+    def forward(self, hidden_states: torch.Tensor, grid_thw: torch.Tensor) -> torch.Tensor:
+        """
+        Args:
+            hidden_states (`torch.Tensor` of shape `(seq_len, hidden_size)`):
+                The input patch features; they are passed through `patch_embed` before the vision blocks.
+            grid_thw (`torch.Tensor` of shape `(num_images_or_videos, 3)`):
+                The temporal, height and width of feature shape of each image in LLM.
+
+        Returns:
+            `torch.Tensor`: hidden_states.
+        """
+        hidden_states = self.patch_embed(hidden_states)
+        hidden_states = self.post_conv_layernorm(hidden_states)
+
+        rotary_pos_emb, image_type_ids = self.rot_pos_emb(grid_thw)
+        emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1)
+        position_embeddings = (emb.cos(), emb.sin())
+
+        cu_seqlens = torch.repeat_interleave(grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]).cumsum(
+            dim=0,
+            # Select dtype based on the following factors:
+            #  - FA2 requires that cu_seqlens_q must have dtype int32
+            #  - torch.onnx.export requires that cu_seqlens_q must have same dtype as grid_thw
+            # See https://github.com/huggingface/transformers/pull/34852 for more information
+            dtype=grid_thw.dtype if torch.jit.is_tracing() else torch.int32,
+        )
+        cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0)
+        seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist()
+        hidden_states = self.embeddings(hidden_states, seqlens, grid_thw, image_type_ids[:, 0], image_type_ids[:, 1])
+
+        for blk in self.blocks:
+            hidden_states = blk(
+                hidden_states,
+                cu_seqlens=cu_seqlens,
+                position_embeddings=position_embeddings,
+            )
+
+        hidden_states = self.post_layernorm(hidden_states)
+
+        hidden_states = hidden_states.view(
+            -1, self.spatial_merge_size, self.spatial_merge_size, hidden_states.shape[-1]
+        )
+        hidden_states = hidden_states.permute(0, 3, 1, 2)
+        hidden_states = self.downsample(hidden_states).view(-1, self.config.out_hidden_size)
+
+        hidden_states = self.merger(hidden_states)
+        return hidden_states
+
+
+@auto_docstring
+class Glm4vMoeTextModel(Glm4vMoePreTrainedModel):
+    config: Glm4vMoeTextConfig
+
+    def __init__(self, config: Glm4vMoeTextConfig):
+        super().__init__(config)
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+        self.layers = nn.ModuleList(
+            [Glm4vMoeTextDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+        )
+        self.norm = Glm4vMoeRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.rotary_emb = Glm4vMoeTextRotaryEmbedding(config=config)
+
+        self.gradient_checkpointing = False
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    @check_model_inputs
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[list[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> Union[tuple, BaseModelOutputWithPast]:
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+        # torch.jit.trace() doesn't support cache objects in the output
+        if use_cache and past_key_values is None and not torch.jit.is_tracing():
+            past_key_values = DynamicCache(config=self.config)
+
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+
+        if cache_position is None:
+            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+            cache_position = torch.arange(
+                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
+            )
+
+        # the hard coded `3` is for temporal, height and width.
+        if position_ids is None:
+            position_ids = cache_position.view(1, 1, -1).expand(3, inputs_embeds.shape[0], -1)
+        elif position_ids.dim() == 2:
+            position_ids = position_ids[None, ...].expand(3, position_ids.shape[0], -1)
+
+        causal_mask = create_causal_mask(
+            config=self.config,
+            input_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            cache_position=cache_position,
+            past_key_values=past_key_values,
+            position_ids=position_ids,
+        )
+
+        hidden_states = inputs_embeds
+
+        # create position embeddings to be shared across the decoder layers
+        position_embeddings = self.rotary_emb(hidden_states, position_ids)
+
+        for decoder_layer in self.layers:
+            layer_outputs = decoder_layer(
+                hidden_states,
+                position_embeddings=position_embeddings,
+                attention_mask=causal_mask,
+                position_ids=position_ids,
+                past_key_values=past_key_values,
+                cache_position=cache_position,
+                **kwargs,
+            )
+            hidden_states = layer_outputs
+
+        hidden_states = self.norm(hidden_states)
+
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values,
+        )
+
+
+@auto_docstring
+class Glm4vMoeModel(Glm4vMoePreTrainedModel):
+    base_model_prefix = ""
+    _checkpoint_conversion_mapping = {}
+    # Reference: fix gemma3 grad acc #37208
+    accepts_loss_kwargs = False
+    config: Glm4vMoeConfig
+    _no_split_modules = ["Glm4vMoeTextDecoderLayer", "Glm4vMoeVisionBlock"]
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.visual = Glm4vMoeVisionModel._from_config(config.vision_config)
+        self.language_model = Glm4vMoeTextModel._from_config(config.text_config)
+        self.rope_deltas = None  # cache rope_deltas here
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.language_model.get_input_embeddings()
+
+    def set_input_embeddings(self, value):
+        self.language_model.set_input_embeddings(value)
+
+    def set_decoder(self, decoder):
+        self.language_model = decoder
+
+    def get_decoder(self):
+        return self.language_model
+
+    def get_rope_index(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        image_grid_thw: Optional[torch.LongTensor] = None,
+        video_grid_thw: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """
+        Calculate the 3D rope index based on image and video's temporal, height and width in LLM.
+
+        Explanation:
+            Each embedding sequence contains vision embedding and text embedding or just contains text embedding.
+
+            For pure text embedding sequence, the rotary position embedding has no difference with modern LLMs.
+            Examples:
+                input_ids: [T T T T T], here T is for text.
+                temporal position_ids: [0, 1, 2, 3, 4]
+                height position_ids: [0, 1, 2, 3, 4]
+                width position_ids: [0, 1, 2, 3, 4]
+
+            For vision and text embedding sequence, we calculate 3D rotary position embedding for vision part
+            and 1D rotary position embedding for text part.
+            Examples:
+                Temporal (Time): 3 patches, representing different segments of the video in time.
+                Height: 2 patches, dividing each frame vertically.
+                Width: 2 patches, dividing each frame horizontally.
+                We also have some important parameters:
+                fps (Frames Per Second): The video's frame rate, set to 1. This means one frame is processed each second.
+                tokens_per_second: This is a crucial parameter. It dictates how many "time-steps" or "temporal tokens" are conceptually packed into a one-second interval of the video. In this case, we have 25 tokens per second. So each second of the video will be represented with 25 separate time points. It essentially defines the temporal granularity.
+                temporal_patch_size: The number of frames that compose one temporal patch. Here, it's 2 frames.
+                interval: The step size for the temporal position IDs, calculated as tokens_per_second * temporal_patch_size / fps. In this case, 25 * 2 / 1 = 50. This means that each temporal patch will have a difference of 50 in the temporal position IDs.
+                input_ids: [V V V V V V V V V V V V T T T T T], here V is for vision.
+                vision temporal position_ids: [0, 0, 0, 0, 50, 50, 50, 50, 100, 100, 100, 100]
+                vision height position_ids: [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1]
+                vision width position_ids: [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
+                text temporal position_ids: [101, 102, 103, 104, 105]
+                text height position_ids: [101, 102, 103, 104, 105]
+                text width position_ids: [101, 102, 103, 104, 105]
+                Here we calculate the text start position_ids as the max vision position_ids plus 1.
+
+        Args:
+            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
+                it.
+            image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
+                The temporal, height and width of feature shape of each image in LLM.
+            video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
+                The temporal, height and width of feature shape of each video in LLM.
+            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
+
+        Returns:
+            position_ids (`torch.LongTensor` of shape `(3, batch_size, sequence_length)`)
+            mrope_position_deltas (`torch.Tensor` of shape `(batch_size)`)
+        """
+
+        spatial_merge_size = self.config.vision_config.spatial_merge_size
+        image_token_id = self.config.image_token_id
+        video_start_token_id = self.config.video_start_token_id
+        video_end_token_id = self.config.video_end_token_id
+
+        mrope_position_deltas = []
+        if input_ids is not None and (image_grid_thw is not None or video_grid_thw is not None):
+            total_input_ids = input_ids
+            if attention_mask is None:
+                attention_mask = torch.ones_like(total_input_ids)
+            position_ids = torch.ones(
+                3,
+                input_ids.shape[0],
+                input_ids.shape[1],
+                dtype=input_ids.dtype,
+                device=input_ids.device,
+            )
+            image_index, video_index = 0, 0
+            video_group_index = 0
+            attention_mask = attention_mask.to(total_input_ids.device)
+            for i, input_ids in enumerate(total_input_ids):
+                input_ids = input_ids[attention_mask[i] == 1]
+                input_tokens = input_ids.tolist()
+
+                input_token_type = []
+                video_check_flg = False
+                for token in input_tokens:
+                    if token == video_start_token_id:
+                        video_check_flg = True
+                    elif token == video_end_token_id:
+                        video_check_flg = False
+
+                    if token == image_token_id and not video_check_flg:
+                        input_token_type.append("image")
+                    elif token == image_token_id and video_check_flg:
+                        input_token_type.append("video")
+                    else:
+                        input_token_type.append("text")
+
+                input_type_group = []
+                for key, group in itertools.groupby(enumerate(input_token_type), lambda x: x[1]):
+                    group = list(group)
+                    start_index = group[0][0]
+                    end_index = group[-1][0] + 1
+                    input_type_group.append((key, start_index, end_index))
+
+                llm_pos_ids_list = []
+                video_frame_num = 1
+                for modality_type, start_idx, end_idx in input_type_group:
+                    st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
+
+                    if modality_type == "image":
+                        t, h, w = (
+                            image_grid_thw[image_index][0],
+                            image_grid_thw[image_index][1],
+                            image_grid_thw[image_index][2],
+                        )
+                        llm_grid_t, llm_grid_h, llm_grid_w = (
+                            t.item(),
+                            h.item() // spatial_merge_size,
+                            w.item() // spatial_merge_size,
+                        )
+
+                        t_index = torch.arange(llm_grid_t).view(-1, 1).expand(-1, llm_grid_h * llm_grid_w).flatten()
+                        h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand(llm_grid_t, -1, llm_grid_w).flatten()
+                        w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand(llm_grid_t, llm_grid_h, -1).flatten()
+                        llm_pos_ids_list.append(torch.stack([t_index, h_index, w_index]) + st_idx)
+
+                        image_index += 1
+                        video_frame_num = 1
+
+                    elif modality_type == "video":
+                        t, h, w = (
+                            video_frame_num,
+                            video_grid_thw[video_index][1],
+                            video_grid_thw[video_index][2],
+                        )
+
+                        llm_grid_t, llm_grid_h, llm_grid_w = (
+                            t,
+                            h.item() // spatial_merge_size,
+                            w.item() // spatial_merge_size,
+                        )
+
+                        for t_idx in range(llm_grid_t):
+                            t_index = torch.tensor(t_idx).view(-1, 1).expand(-1, llm_grid_h * llm_grid_w).flatten()
+
+                            h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand(1, -1, llm_grid_w).flatten()
+                            w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand(1, llm_grid_h, -1).flatten()
+                            llm_pos_ids_list.append(torch.stack([t_index, h_index, w_index]) + st_idx)
+
+                        video_group_index += 1
+
+                        if video_group_index >= video_grid_thw[video_index][0]:
+                            video_index += 1
+                            video_group_index = 0
+
+                        video_frame_num += 1
+
+                    else:
+                        text_len = end_idx - start_idx
+                        llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx)
+
+                        video_frame_num = 1
+
+                llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
+                position_ids[..., i, attention_mask[i] == 1] = llm_positions.to(position_ids.device)
+                mrope_position_deltas.append(llm_positions.max() + 1 - len(total_input_ids[i]))
+            mrope_position_deltas = torch.tensor(mrope_position_deltas, device=input_ids.device).unsqueeze(1)
+            return position_ids, mrope_position_deltas
+        else:
+            if attention_mask is not None:
+                position_ids = attention_mask.long().cumsum(-1) - 1
+                position_ids.masked_fill_(attention_mask == 0, 1)
+                position_ids = position_ids.unsqueeze(0).expand(3, -1, -1).to(attention_mask.device)
+                max_position_ids = position_ids.max(0, keepdim=False)[0].max(-1, keepdim=True)[0]
+                mrope_position_deltas = max_position_ids + 1 - attention_mask.shape[-1]
+            else:
+                position_ids = (
+                    torch.arange(input_ids.shape[1], device=input_ids.device)
+                    .view(1, 1, -1)
+                    .expand(3, input_ids.shape[0], -1)
+                )
+                mrope_position_deltas = torch.zeros(
+                    [input_ids.shape[0], 1],
+                    device=input_ids.device,
+                    dtype=input_ids.dtype,
+                )
+
+            return position_ids, mrope_position_deltas
+
+    def get_video_features(
+        self, pixel_values_videos: torch.FloatTensor, video_grid_thw: Optional[torch.LongTensor] = None
+    ):
+        """
+        Encodes videos into continuous embeddings that can be forwarded to the language model.
+
+        Args:
+            pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
+                The tensors corresponding to the input videos.
+            video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
+                The temporal, height and width of feature shape of each video in LLM.
+        """
+        pixel_values_videos = pixel_values_videos.type(self.visual.dtype)
+        # Expand each (t, h, w) row of video_grid_thw into t per-frame rows of (1, h, w).
+        temp_frames_hw = []
+        for t, h, w in video_grid_thw:
+            repeated_row = torch.tensor([1, h.item(), w.item()]).unsqueeze(0).repeat(t, 1)
+            temp_frames_hw.append(repeated_row)
+        flattened_video_grid_thw = torch.cat(temp_frames_hw, dim=0)
+        video_embeds = self.visual(pixel_values_videos, grid_thw=flattened_video_grid_thw)
+        split_sizes = (video_grid_thw.prod(-1) // self.visual.spatial_merge_size**2).tolist()
+        video_embeds = torch.split(video_embeds, split_sizes)
+        return video_embeds
+
+    def get_image_features(self, pixel_values: torch.FloatTensor, image_grid_thw: Optional[torch.LongTensor] = None):
+        """
+        Encodes images into continuous embeddings that can be forwarded to the language model.
+
+        Args:
+            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
+                The tensors corresponding to the input images.
+            image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
+                The temporal, height and width of feature shape of each image in LLM.
+        """
+        pixel_values = pixel_values.type(self.visual.dtype)
+        image_embeds = self.visual(pixel_values, grid_thw=image_grid_thw)
+        split_sizes = (image_grid_thw.prod(-1) // self.visual.spatial_merge_size**2).tolist()
+        image_embeds = torch.split(image_embeds, split_sizes)
+        return image_embeds
+
+    def get_placeholder_mask(
+        self,
+        input_ids: torch.LongTensor,
+        inputs_embeds: torch.FloatTensor,
+        image_features: torch.FloatTensor = None,
+        video_features: torch.FloatTensor = None,
+    ):
+        """
+        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
+        equal to the length of multimodal features. If the lengths are different, an error is raised.
+        """
+        if input_ids is None:
+            special_image_mask = inputs_embeds == self.get_input_embeddings()(
+                torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device)
+            )
+            special_image_mask = special_image_mask.all(-1)
+            special_video_mask = inputs_embeds == self.get_input_embeddings()(
+                torch.tensor(self.config.video_token_id, dtype=torch.long, device=inputs_embeds.device)
+            )
+            special_video_mask = special_video_mask.all(-1)
+        else:
+            # GLM-4.1V and GLM-4.5V: video placeholders reuse the image token, so both masks are identical here.
+            special_image_mask = input_ids == self.config.image_token_id
+            special_video_mask = input_ids == self.config.image_token_id
+
+        n_image_tokens = special_image_mask.sum()
+        special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
+        if image_features is not None and inputs_embeds[special_image_mask].numel() != image_features.numel():
+            raise ValueError(
+                f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {image_features.shape[0]}"
+            )
+
+        n_video_tokens = special_video_mask.sum()
+        special_video_mask = special_video_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
+        if video_features is not None and inputs_embeds[special_video_mask].numel() != video_features.numel():
+            raise ValueError(
+                f"Videos features and video tokens do not match: tokens: {n_video_tokens}, features {video_features.shape[0]}"
+            )
+
+        return special_image_mask, special_video_mask
+
+    @auto_docstring
+    @can_return_tuple
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[list[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        pixel_values: Optional[torch.Tensor] = None,
+        pixel_values_videos: Optional[torch.FloatTensor] = None,
+        image_grid_thw: Optional[torch.LongTensor] = None,
+        video_grid_thw: Optional[torch.LongTensor] = None,
+        rope_deltas: Optional[torch.LongTensor] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> Union[tuple, Glm4vMoeModelOutputWithPast]:
+        r"""
+        image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
+            The temporal, height and width of feature shape of each image in LLM.
+        video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
+            The temporal, height and width of feature shape of each video in LLM.
+        rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*):
+            The rope index difference between sequence length and multimodal rope.
+        """
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+        if inputs_embeds is None:
+            inputs_embeds = self.get_input_embeddings()(input_ids)
+
+        if pixel_values is not None:
+            image_embeds = self.get_image_features(pixel_values, image_grid_thw)
+            image_embeds = torch.cat(image_embeds, dim=0).to(inputs_embeds.device, inputs_embeds.dtype)
+            image_mask, _ = self.get_placeholder_mask(input_ids, inputs_embeds, image_features=image_embeds)
+            inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_embeds)
+
+        if pixel_values_videos is not None:
+            video_embeds = self.get_video_features(pixel_values_videos, video_grid_thw)
+            video_embeds = torch.cat(video_embeds, dim=0).to(inputs_embeds.device, inputs_embeds.dtype)
+            _, video_mask = self.get_placeholder_mask(input_ids, inputs_embeds, video_features=video_embeds)
+            inputs_embeds = inputs_embeds.masked_scatter(video_mask, video_embeds)
+
+        if position_ids is None:
+            attention_mask_tensor = (
+                attention_mask if not isinstance(attention_mask, dict) else attention_mask["full_attention"]
+            )
+            if attention_mask_tensor is not None and attention_mask_tensor.ndim == 4:
+                attention_mask_tensor = torch.diagonal(attention_mask_tensor[:, 0], dim1=1, dim2=2)
+                # Only apply conversion for floating point tensors (inverted masks)
+                if attention_mask_tensor.dtype.is_floating_point:
+                    attention_mask_tensor = attention_mask_tensor / torch.finfo(attention_mask_tensor.dtype).min
+                    attention_mask_tensor = (1.0 - attention_mask_tensor).int()
+
+            # Calculate RoPE index once per generation in the pre-fill stage only.
+            # When compiling, we can't check tensor values thus we check only input length
+            # It is safe to assume that `length!=1` means we're in pre-fill because compiled
+            # models currently cannot do assisted decoding
+            prefill_compiled_stage = is_torchdynamo_compiling() and (
+                (input_ids is not None and input_ids.shape[1] != 1)
+                or (inputs_embeds is not None and inputs_embeds.shape[1] != 1)
+            )
+            prefill_noncompiled_stage = not is_torchdynamo_compiling() and (
+                (cache_position is not None and cache_position[0] == 0)
+                or (past_key_values is None or past_key_values.get_seq_length() == 0)
+            )
+            if (prefill_compiled_stage or prefill_noncompiled_stage) or self.rope_deltas is None:
+                position_ids, rope_deltas = self.get_rope_index(
+                    input_ids,
+                    image_grid_thw,
+                    video_grid_thw,
+                    attention_mask=attention_mask_tensor,
+                )
+                self.rope_deltas = rope_deltas
+            # then use the prev pre-calculated rope-deltas to get the correct position ids
+            else:
+                batch_size, seq_length, _ = inputs_embeds.shape
+                delta = (
+                    (cache_position[0] + self.rope_deltas).to(inputs_embeds.device)
+                    if cache_position is not None
+                    else 0
+                )
+                position_ids = torch.arange(seq_length, device=inputs_embeds.device)
+                position_ids = position_ids.view(1, -1).expand(batch_size, -1)
+                if cache_position is not None:  # otherwise `deltas` is an int `0`
+                    delta = delta.repeat_interleave(batch_size // delta.shape[0], dim=0)
+                position_ids = position_ids.add(delta)
+                position_ids = position_ids.unsqueeze(0).expand(3, -1, -1)
+
+        outputs = self.language_model(
+            input_ids=None,
+            position_ids=position_ids,
+            attention_mask=attention_mask,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            cache_position=cache_position,
+            **kwargs,
+        )
+
+        return Glm4vMoeModelOutputWithPast(
+            last_hidden_state=outputs.last_hidden_state,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            rope_deltas=self.rope_deltas,
+        )
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Base class for Glm4vMoe causal language model (or autoregressive) outputs.
+    """
+)
+class Glm4vMoeCausalLMOutputWithPast(ModelOutput):
+    r"""
+    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+        Language modeling loss (for next-token prediction).
+    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
+        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
+        `(batch_size, num_heads, sequence_length, embed_size_per_head)`.
+
+        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
+        `past_key_values` input) to speed up sequential decoding.
+    rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*):
+        The rope index difference between sequence length and multimodal rope.
+    """
+
+    loss: Optional[torch.FloatTensor] = None
+    logits: Optional[torch.FloatTensor] = None
+    # NOTE(review): the docstring says `Cache` while this annotation says list -- confirm which one is intended
+    past_key_values: Optional[list[torch.FloatTensor]] = None
+    hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    attentions: Optional[tuple[torch.FloatTensor]] = None
+    rope_deltas: Optional[torch.LongTensor] = None
+
+
+class Glm4vMoeForConditionalGeneration(Glm4vMoePreTrainedModel, GenerationMixin):
+    _checkpoint_conversion_mapping = {}
+    _tied_weights_keys = ["lm_head.weight"]
+    # Reference: fix gemma3 grad acc #37208
+    accepts_loss_kwargs = False
+
+    def __init__(self, config) -> None:
+        super().__init__(config)
+        self.model = Glm4vMoeModel(config)  # base model; its `language_model` / `visual` are re-exposed below
+        self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)
+
+        self.post_init()
+
+    def get_input_embeddings(self):  # delegates to the wrapped `self.model`
+        return self.model.get_input_embeddings()
+
+    def set_input_embeddings(self, value) -> None:  # delegates to the wrapped `self.model`
+        self.model.set_input_embeddings(value)
+
+    def set_decoder(self, decoder) -> None:  # delegates to the wrapped `self.model`
+        self.model.set_decoder(decoder)
+
+    def get_decoder(self):  # delegates to the wrapped `self.model`
+        return self.model.get_decoder()
+
+    def get_video_features(
+        self, pixel_values_videos: torch.FloatTensor, video_grid_thw: Optional[torch.LongTensor] = None
+    ):  # thin pass-through to `self.model.get_video_features`
+        return self.model.get_video_features(pixel_values_videos, video_grid_thw)
+
+    def get_image_features(self, pixel_values: torch.FloatTensor, image_grid_thw: Optional[torch.LongTensor] = None):
+        return self.model.get_image_features(pixel_values, image_grid_thw)  # thin pass-through to `self.model`
+
+    # Make modules available through conditional class for BC
+    @property
+    def language_model(self):  # read-only alias for `self.model.language_model`
+        return self.model.language_model
+
+    @property
+    def visual(self):  # read-only alias for `self.model.visual`
+        return self.model.visual
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[list[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        pixel_values: Optional[torch.Tensor] = None,
+        pixel_values_videos: Optional[torch.FloatTensor] = None,
+        image_grid_thw: Optional[torch.LongTensor] = None,
+        video_grid_thw: Optional[torch.LongTensor] = None,
+        rope_deltas: Optional[torch.LongTensor] = None,  # NOTE(review): accepted but not forwarded to `self.model`
+        cache_position: Optional[torch.LongTensor] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> Union[tuple, Glm4vMoeCausalLMOutputWithPast]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+        image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
+            The temporal, height and width of feature shape of each image in LLM.
+        video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
+            The temporal, height and width of feature shape of each video in LLM.
+        rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*):
+            The rope index difference between sequence length and multimodal rope.
+
+        Example:
+
+        ```python
+        >>> from PIL import Image
+        >>> import requests
+        >>> from transformers import AutoProcessor, Glm4vMoeForConditionalGeneration
+
+        >>> model = Glm4vMoeForConditionalGeneration.from_pretrained("zai-org/GLM-4.5V")
+        >>> processor = AutoProcessor.from_pretrained("zai-org/GLM-4.5V")
+
+        >>> messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image"},
+                    {"type": "text", "text": "What is shown in this image?"},
+                ],
+            },
+        ]
+        >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+        >>> inputs = processor(text=[text], images=[image], return_tensors="pt")
+
+        >>> # Generate
+        >>> generate_ids = model.generate(**inputs, max_new_tokens=30)
+        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "The image shows a street scene with a red stop sign in the foreground. In the background, there is a large red gate with Chinese characters ..."
+        ```"""
+        outputs = self.model(
+            input_ids=input_ids,
+            pixel_values=pixel_values,
+            pixel_values_videos=pixel_values_videos,
+            image_grid_thw=image_grid_thw,
+            video_grid_thw=video_grid_thw,
+            position_ids=position_ids,
+            attention_mask=attention_mask,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            cache_position=cache_position,
+            **kwargs,
+        )
+
+        hidden_states = outputs[0]
+
+        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
+        # An int `logits_to_keep` keeps the last N positions (0 -> `slice(0, None)`, i.e. all); a tensor is used as an index
+        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+        logits = self.lm_head(hidden_states[:, slice_indices, :])
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size)
+
+        return Glm4vMoeCausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            rope_deltas=outputs.rope_deltas,
+        )
+
+    def prepare_inputs_for_generation(
+        self,
+        input_ids,
+        past_key_values=None,
+        attention_mask=None,
+        inputs_embeds=None,
+        cache_position=None,
+        position_ids=None,
+        use_cache=True,
+        pixel_values=None,
+        pixel_values_videos=None,
+        image_grid_thw=None,
+        video_grid_thw=None,
+        **kwargs,
+    ):
+        # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
+
+        model_inputs = super().prepare_inputs_for_generation(
+            input_ids,
+            past_key_values=past_key_values,
+            attention_mask=attention_mask,
+            inputs_embeds=inputs_embeds,
+            cache_position=cache_position,
+            position_ids=position_ids,
+            pixel_values=pixel_values,
+            pixel_values_videos=pixel_values_videos,
+            image_grid_thw=image_grid_thw,
+            video_grid_thw=video_grid_thw,
+            use_cache=use_cache,
+            **kwargs,
+        )
+
+        # GLM-4.1V position_ids are prepareed with rope_deltas in forward
+        model_inputs["position_ids"] = None
+
+        if cache_position[0] != 0:
+            model_inputs["pixel_values"] = None
+            model_inputs["pixel_values_videos"] = None
+
+        return model_inputs
+
+    def _get_image_nums_and_video_nums(
+        self,
+        input_ids: Optional[torch.LongTensor],
+        inputs_embeds: Optional[torch.Tensor] = None,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """
+        Get the number of images and videos for each sample to calculate the separation length of the sample tensor.
+        These parameters are not passed through the processor to avoid unpredictable impacts from interface modifications.
+
+        Args:
+            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+                Indices of input sequence tokens in the vocabulary. Only read when `inputs_embeds` is `None`.
+            inputs_embeds (`torch.Tensor`, *optional*): when given, marker positions are found by comparing it
+                with the embedding of each marker token id; only index 0 of the last dim of that comparison is kept.
+        Returns:
+            `(image_counts, video_counts)`: per-sample counts, each the sum of a boolean mask over `dim=1`.
+        """
+
+        if inputs_embeds is not None:
+            is_image = (
+                inputs_embeds
+                == self.get_input_embeddings()(
+                    torch.tensor(self.config.image_start_token_id, dtype=torch.long, device=inputs_embeds.device)
+                )
+            )[..., 0]
+            is_video_start = (
+                inputs_embeds
+                == self.get_input_embeddings()(
+                    torch.tensor(self.config.video_start_token_id, dtype=torch.long, device=inputs_embeds.device)
+                )
+            )[..., 0]
+            is_video_end = (
+                inputs_embeds
+                == self.get_input_embeddings()(
+                    torch.tensor(self.config.video_end_token_id, dtype=torch.long, device=inputs_embeds.device)
+                )
+            )[..., 0]
+        else:
+            is_image = input_ids == self.config.image_start_token_id
+            is_video_start = input_ids == self.config.video_start_token_id
+            is_video_end = input_ids == self.config.video_end_token_id
+
+        # Cumulative sum to track if we're inside a video span
+        # We'll assume well-formed video tags (i.e. matching starts and ends)
+        video_level = torch.cumsum(is_video_start.int() - is_video_end.int(), dim=1)
+        inside_video = video_level > 0  # shape (batch_size, seq_length)
+
+        # Mask out image tokens that are inside video spans
+        standalone_images = is_image & (~inside_video)
+
+        # Count per batch
+        image_counts = standalone_images.sum(dim=1)
+        video_counts = is_video_start.sum(dim=1)
+
+        return image_counts, video_counts
+
+    def _expand_inputs_for_generation(
+        self,
+        expand_size: int = 1,
+        is_encoder_decoder: bool = False,
+        input_ids: Optional[torch.LongTensor] = None,
+        **model_kwargs,
+    ) -> tuple[torch.LongTensor, dict[str, Any]]:
+        # Overwritten -- Support for expanding tensors without a batch size dimension
+        # e.g., pixel_values, image_grid_thw, pixel_values_videos, video_grid_thw, second_per_grid_ts
+        # pixel_values.shape[0] is sum(seqlen_images for samples)
+        # image_grid_thw.shape[0] is sum(num_images for samples)
+
+        if expand_size == 1:
+            return input_ids, model_kwargs
+
+        visual_keys = ["pixel_values", "image_grid_thw", "pixel_values_videos", "video_grid_thw", "second_per_grid_ts"]
+
+        def _expand_dict_for_generation_visual(dict_to_expand):
+            image_grid_thw = model_kwargs.get("image_grid_thw", None)
+            video_grid_thw = model_kwargs.get("video_grid_thw", None)
+            image_nums, video_nums = self._get_image_nums_and_video_nums(
+                input_ids, inputs_embeds=model_kwargs.get("inputs_embeds", None)
+            )
+
+            def _repeat_interleave_samples(x, lengths, repeat_times):
+                samples = torch.split(x, lengths)  # one chunk per sample along dim 0
+                repeat_args = [repeat_times] + [1] * (x.dim() - 1)  # repeat along dim 0 only
+                result = torch.cat([sample.repeat(*repeat_args) for sample in samples], dim=0)
+                return result
+
+            for key in dict_to_expand:
+                if key == "pixel_values":
+                    # split images into samples
+                    samples = torch.split(image_grid_thw, list(image_nums))
+                    # compute the sequence length of images for each sample
+                    lengths = [torch.prod(sample, dim=1).sum() for sample in samples]
+                    dict_to_expand[key] = _repeat_interleave_samples(
+                        dict_to_expand[key], lengths=lengths, repeat_times=expand_size
+                    )
+                elif key == "image_grid_thw":
+                    # get the num of images for each sample
+                    lengths = list(image_nums)
+                    dict_to_expand[key] = _repeat_interleave_samples(
+                        dict_to_expand[key], lengths=lengths, repeat_times=expand_size
+                    )
+                elif key == "pixel_values_videos":
+                    samples = torch.split(video_grid_thw, list(video_nums))  # split video grids into samples
+                    lengths = [torch.prod(sample, dim=1).sum() for sample in samples]  # t*h*w summed per sample
+                    dict_to_expand[key] = _repeat_interleave_samples(
+                        dict_to_expand[key], lengths=lengths, repeat_times=expand_size
+                    )
+                elif key == "video_grid_thw":
+                    lengths = list(video_nums)  # number of videos for each sample
+                    dict_to_expand[key] = _repeat_interleave_samples(
+                        dict_to_expand[key], lengths=lengths, repeat_times=expand_size
+                    )
+                elif key == "second_per_grid_ts":
+                    dict_to_expand[key] = _repeat_interleave_samples(
+                        dict_to_expand[key], lengths=list(video_nums), repeat_times=expand_size
+                    )
+            return dict_to_expand
+
+        def _expand_dict_for_generation(dict_to_expand):
+            for key in dict_to_expand:
+                if (
+                    key != "cache_position"
+                    and dict_to_expand[key] is not None
+                    and isinstance(dict_to_expand[key], torch.Tensor)
+                    and key not in visual_keys
+                ):
+                    dict_to_expand[key] = dict_to_expand[key].repeat_interleave(expand_size, dim=0)
+            return dict_to_expand
+
+        model_kwargs = _expand_dict_for_generation_visual(model_kwargs)  # uses the not-yet-expanded `input_ids`
+
+        if input_ids is not None:
+            input_ids = input_ids.repeat_interleave(expand_size, dim=0)
+
+        model_kwargs = _expand_dict_for_generation(model_kwargs)
+
+        if is_encoder_decoder:
+            if model_kwargs.get("encoder_outputs") is None:
+                raise ValueError("If `is_encoder_decoder` is True, make sure that `encoder_outputs` is defined.")
+            model_kwargs["encoder_outputs"] = _expand_dict_for_generation(model_kwargs["encoder_outputs"])
+
+        return input_ids, model_kwargs
+
+
+__all__ = ["Glm4vMoeForConditionalGeneration", "Glm4vMoeModel", "Glm4vMoePreTrainedModel", "Glm4vMoeTextModel"]
diff --git a/phivenv/Lib/site-packages/transformers/models/glm4v_moe/modular_glm4v_moe.py b/phivenv/Lib/site-packages/transformers/models/glm4v_moe/modular_glm4v_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..0dfe28ff19da878a689afb0ab6621e8cfb35f340
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/glm4v_moe/modular_glm4v_moe.py
@@ -0,0 +1,459 @@
+# coding=utf-8
+# Copyright 2025 The ZhipuAI Inc. team and HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Callable, Optional
+
+import torch
+import torch.nn as nn
+
+from ...cache_utils import Cache
+from ...configuration_utils import PretrainedConfig
+from ...modeling_flash_attention_utils import FlashAttentionKwargs
+from ...modeling_rope_utils import rope_config_validation
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
+from ...processing_utils import Unpack
+from ...utils import logging
+from ..glm4.modeling_glm4 import Glm4Attention
+from ..glm4_moe.configuration_glm4_moe import Glm4MoeConfig
+from ..glm4_moe.modeling_glm4_moe import (
+    Glm4MoeDecoderLayer,
+    Glm4MoeMLP,
+    Glm4MoeMoE,
+    Glm4MoePreTrainedModel,
+    Glm4MoeRMSNorm,
+    Glm4MoeTopkRouter,
+    eager_attention_forward,
+)
+from ..glm4v.configuration_glm4v import Glm4vConfig, Glm4vVisionConfig
+from ..glm4v.modeling_glm4v import (
+    Glm4vForConditionalGeneration,
+    rotate_half,
+)
+
+
+logger = logging.get_logger(__name__)
+
+
+class Glm4vMoeVisionConfig(Glm4vVisionConfig):
+    pass  # no overrides: everything is inherited from `Glm4vVisionConfig`
+
+
+class Glm4vMoeTextConfig(Glm4MoeConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Glm4vMoeModel`]. It is used to instantiate a
+    GLM-4.5V model according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of
+    GLM-4.5V [zai-org/GLM-4.5V](https://huggingface.co/zai-org/GLM-4.5V).
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 151424):
+            Vocabulary size of the Glm4vMoe model. Defines the number of different tokens that can be represented by the
+            `input_ids` passed when calling [`Glm4vMoeModel`]
+        hidden_size (`int`, *optional*, defaults to 4096):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 10944):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 46):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 96):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        partial_rotary_factor (`float`, *optional*, defaults to 0.5): The factor of the partial rotary position.
+        num_key_value_heads (`int`, *optional*, defaults to 8):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details checkout [this
+            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to `8`.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the decoder.
+        max_position_embeddings (`int`, *optional*, defaults to 65536):
+            The maximum sequence length that this model might ever be used with.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether the model's input and output word embeddings should be tied.
+        rope_theta (`float`, *optional*, defaults to 10000.0):
+            The base period of the RoPE embeddings.
+        rope_scaling (`Dict`, *optional*):
+            Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
+            and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
+            accordingly.
+            Expected contents:
+                `rope_type` (`str`):
+                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
+                    'llama3'], with 'default' being the original RoPE implementation.
+                `factor` (`float`, *optional*):
+                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
+                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
+                    original maximum pre-trained length.
+                `original_max_position_embeddings` (`int`, *optional*):
+                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
+                    pretraining.
+                `attention_factor` (`float`, *optional*):
+                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
+                    computation. If unspecified, it defaults to value recommended by the implementation, using the
+                    `factor` field to infer the suggested value.
+        attention_bias (`bool`, *optional*, defaults to `True`):
+            Whether to use a bias in the query, key, value and output projection layers during self-attention.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        moe_intermediate_size (`int`, *optional*, defaults to 1408):
+            Intermediate size of the routed expert.
+        num_experts_per_tok (`int`, *optional*, defaults to 8):
+            Number of experts per token.
+        n_shared_experts (`int`, *optional*, defaults to 1):
+            Number of shared experts.
+        n_routed_experts (`int`, *optional*, defaults to 128):
+            Number of routed experts.
+        routed_scaling_factor (`float`, *optional*, defaults to 1.0):
+            Scaling factor for routed experts.
+        n_group (`int`, *optional*, defaults to 1):
+            Number of groups for routed experts.
+        topk_group (`int`, *optional*, defaults to 1):
+            Number of selected groups for each token(for each token, ensuring the selected experts is only within `topk_group` groups).
+        first_k_dense_replace (`int`, *optional*, defaults to 1):
+            Number of dense layers in shallow layers(embed->dense->dense->...->dense->moe->moe...->lm_head).
+                                                                    \--k dense layers--/
+        norm_topk_prob (`bool`, *optional*, defaults to `True`):
+            Whether to normalize the topk probabilities.
+
+    ```python
+    >>> from transformers import Glm4vMoeTextModel, Glm4vMoeConfig
+
+    >>> # Initializing a GLM-4.5V style configuration
+    >>> configuration = Glm4vMoeConfig()
+
+    >>> # Initializing a model from the GLM-4.5V style configuration
+    >>> model = Glm4vMoeTextModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "Glm4vMoe_text"
+    base_config_key = "text_config"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    # Default tensor parallel plan for base model `Glm4vMoe`
+    base_model_tp_plan = {
+        "layers.*.self_attn.q_proj": "colwise",
+        "layers.*.self_attn.k_proj": "colwise",
+        "layers.*.self_attn.v_proj": "colwise",
+        "layers.*.self_attn.o_proj": "rowwise",
+        "layers.*.mlp.gate_up_proj": "colwise_rep",  # we need to replicate here due to the `chunk` operation
+        "layers.*.mlp.down_proj": "rowwise_rep",  # we need to replicate here due to the `chunk` operation
+    }
+    base_model_pp_plan = {
+        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
+        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
+        "norm": (["hidden_states"], ["hidden_states"]),
+    }
+
+    def __init__(
+        self,
+        vocab_size=151424,
+        hidden_size=4096,
+        intermediate_size=10944,
+        num_hidden_layers=46,
+        num_attention_heads=96,
+        partial_rotary_factor=0.5,
+        num_key_value_heads=8,
+        hidden_act="silu",
+        max_position_embeddings=65536,
+        initializer_range=0.02,
+        rms_norm_eps=1e-5,
+        use_cache=True,
+        tie_word_embeddings=False,
+        rope_theta=10000.0,
+        rope_scaling=None,
+        attention_bias=True,
+        attention_dropout=0.0,
+        moe_intermediate_size=1408,
+        num_experts_per_tok=8,
+        n_shared_experts=1,
+        n_routed_experts=128,
+        routed_scaling_factor=1.0,
+        n_group=1,
+        topk_group=1,
+        first_k_dense_replace=1,
+        norm_topk_prob=True,
+        **kwargs,
+    ):
+        # NOTE(review): calls `PretrainedConfig.__init__` directly, not `super().__init__` -- presumably intentional
+        PretrainedConfig.__init__(self, tie_word_embeddings=tie_word_embeddings, **kwargs)
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.partial_rotary_factor = partial_rotary_factor
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+        # Validate the correctness of rotary position embeddings parameters
+        # BC: if there is a 'type' field, move it to 'rope_type'.
+        if self.rope_scaling is not None and "type" in self.rope_scaling:
+            self.rope_scaling["rope_type"] = self.rope_scaling["type"]  # writes into the caller's dict object
+        rope_config_validation(self, ignore_keys={"mrope_section"})
+
+        # MoE arguments
+        self.moe_intermediate_size = moe_intermediate_size
+        self.num_experts_per_tok = num_experts_per_tok
+        self.n_group = n_group
+        self.topk_group = topk_group
+        self.n_shared_experts = n_shared_experts
+        self.n_routed_experts = n_routed_experts
+        self.routed_scaling_factor = routed_scaling_factor
+        self.first_k_dense_replace = first_k_dense_replace
+        self.norm_topk_prob = norm_topk_prob
+
+
+class Glm4vMoeConfig(Glm4vConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Glm4vMoeModel`]. It is used to instantiate a
+    GLM-4.5V model according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of
+    GLM-4.5V [zai-org/GLM-4.5V](https://huggingface.co/zai-org/GLM-4.5V).
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        text_config (`Union[PretrainedConfig, dict]`, *optional*, defaults to `Glm4vMoeTextConfig`):
+            The config object or dictionary of the text backbone.
+        vision_config (`Union[PretrainedConfig, dict]`, *optional*, defaults to `Glm4vMoeVisionConfig`):
+            The config object or dictionary of the vision backbone.
+        image_token_id (`int`, *optional*, defaults to 151363):
+            The image token index to encode the image prompt.
+        video_token_id (`int`, *optional*, defaults to 151364):
+            The video token index to encode the video prompt.
+        image_start_token_id (`int`, *optional*, defaults to 151339):
+            The image start token index to encode the start of image.
+        image_end_token_id (`int`, *optional*, defaults to 151340):
+            The image end token index to encode the end of image.
+        video_start_token_id (`int`, *optional*, defaults to 151341):
+            The video start token index to encode the start of video.
+        video_end_token_id (`int`, *optional*, defaults to 151342):
+            The video end token index to encode the end of video.
+
+    ```python
+    >>> from transformers import Glm4vMoeForConditionalGeneration, Glm4vMoeConfig
+
+    >>> # Initializing a GLM-4.5V style configuration
+    >>> configuration = Glm4vMoeConfig()
+
+    >>> # Initializing a model from the GLM-4.5V style configuration
+    >>> model = Glm4vMoeForConditionalGeneration(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    def __init__(
+        self,
+        text_config=None,
+        vision_config=None,
+        image_token_id=151363,
+        video_token_id=151364,
+        image_start_token_id=151339,
+        image_end_token_id=151340,
+        video_start_token_id=151341,
+        video_end_token_id=151342,
+        **kwargs,
+    ):
+        super().__init__()  # NOTE(review): none of the arguments above (nor **kwargs) are forwarded here; presumably this modular-style definition is expanded by a code generator - confirm
+
+
+class Glm4vMoeRMSNorm(Glm4MoeRMSNorm):  # renamed subclass of Glm4MoeRMSNorm
+    pass  # no overrides: everything is inherited unchanged from Glm4MoeRMSNorm
+
+
+def apply_multimodal_rotary_pos_emb(q, k, cos, sin, mrope_section, unsqueeze_dim=1):
+    """Applies Rotary Position Embedding with Multimodal Sections to the query and key tensors (https://qwenlm.github.io/blog/qwen2-vl/).
+
+    Explanation:
+        Multimodal 3D rotary position embedding is an extension to 1D rotary position embedding. The input embedding
+        sequence contains vision (images / videos) embedding and text embedding or just contains text embedding. For
+        vision embedding part, we apply rotary position embedding on temporal, height and width dimension separately.
+        Here we split the channel dimension to 3 chunks for the temporal, height and width rotary position embedding.
+        For text embedding part, we just apply 1D rotary position embedding. The three rotary position index (temporal,
+        height and width) of text embedding is always the same, so the text embedding rotary position embedding has no
+        difference with modern LLMs.
+
+    Args:
+        q (`torch.Tensor`): The query tensor.
+        k (`torch.Tensor`): The key tensor.
+        cos (`torch.Tensor`): The cosine part of the rotary embedding.
+        sin (`torch.Tensor`): The sine part of the rotary embedding.
+        mrope_section (`list[int]`):
+            Multimodal rope section is for channel dimension of temporal, height and width in rope calculation.
+        unsqueeze_dim (`int`, *optional*, defaults to 1):
+            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
+            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
+            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
+            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
+            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
+            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
+    Returns:
+        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
+    """
+    mrope_section = mrope_section * 2  # for a list (as documented) this repeats the section sizes; it does not double each size
+    cos = torch.cat([m[i % 3] for i, m in enumerate(cos.split(mrope_section, dim=-1))], dim=-1).unsqueeze(
+        unsqueeze_dim
+    )  # chunk i of the last dim is read from index i % 3 along dim 0 of cos, then the chunks are re-joined
+    sin = torch.cat([m[i % 3] for i, m in enumerate(sin.split(mrope_section, dim=-1))], dim=-1).unsqueeze(
+        unsqueeze_dim
+    )  # same chunk selection as for cos
+
+    # Keep half or full tensor for later concatenation
+    rotary_dim = cos.shape[-1]
+    q_rot, q_pass = q[..., :rotary_dim], q[..., rotary_dim:]  # channels past rotary_dim are carried through unrotated
+    k_rot, k_pass = k[..., :rotary_dim], k[..., rotary_dim:]
+
+    # Apply rotary embeddings on the first half or full tensor
+    q_embed = (q_rot * cos) + (rotate_half(q_rot) * sin)  # rotate_half is defined outside this block
+    k_embed = (k_rot * cos) + (rotate_half(k_rot) * sin)
+
+    # Concatenate back to full shape
+    q_embed = torch.cat([q_embed, q_pass], dim=-1)
+    k_embed = torch.cat([k_embed, k_pass], dim=-1)
+
+    return q_embed, k_embed
+
+
+class Glm4vMoeTextAttention(Glm4Attention):  # Glm4Attention with the multimodal rotary embedding applied in forward()
+    def __init__(self, config: Glm4vMoeTextConfig, layer_idx: Optional[int] = None):
+        super().__init__(config, layer_idx)
+        self.rope_scaling = config.rope_scaling  # kept on the module; forward() reads its "mrope_section" entry
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],  # unpacked below as (cos, sin)
+        attention_mask: Optional[torch.Tensor],
+        past_key_values: Optional[Cache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[FlashAttentionKwargs],  # forwarded untouched to the attention implementation
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:  # NOTE(review): annotation lists three items but only (attn_output, attn_weights) is returned
+        input_shape = hidden_states.shape[:-1]  # every dim except the last (feature) dim
+        hidden_shape = (*input_shape, -1, self.head_dim)  # -1 lets view() infer the head count of each projection
+
+        query_states = self.q_proj(hidden_states).view(hidden_shape)
+        key_states = self.k_proj(hidden_states).view(hidden_shape)
+        value_states = self.v_proj(hidden_states).view(hidden_shape)
+
+        query_states = query_states.transpose(1, 2)  # swap dims 1 and 2; presumably (batch, seq, heads, dim) -> (batch, heads, seq, dim) - confirm
+        key_states = key_states.transpose(1, 2)
+        value_states = value_states.transpose(1, 2)
+
+        cos, sin = position_embeddings
+        query_states, key_states = apply_multimodal_rotary_pos_emb(  # diff with Llama
+            query_states, key_states, cos, sin, self.rope_scaling["mrope_section"]
+        )  # requires config.rope_scaling to be a mapping that contains "mrope_section"
+
+        if past_key_values is not None:
+            # sin and cos are specific to RoPE models; cache_position needed for the static cache
+            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        attention_interface: Callable = eager_attention_forward  # default; replaced below when another implementation is configured
+        if self.config._attn_implementation != "eager":
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            dropout=0.0 if not self.training else self.attention_dropout,  # dropout is only applied while training
+            scaling=self.scaling,
+            **kwargs,
+        )
+
+        attn_output = attn_output.reshape(*input_shape, -1).contiguous()  # merge the trailing dims back into one feature dim
+        attn_output = self.o_proj(attn_output)
+        return attn_output, attn_weights
+
+
+class Glm4vMoeTextTopkRouter(Glm4MoeTopkRouter, nn.Module):  # router subclass; nn.Module is listed explicitly as a second base
+    def __init__(self, config: Glm4vMoeTextConfig):
+        super().__init__(config)  # construction is delegated entirely to the parent initialiser
+
+
+class Glm4vMoeTextMoE(Glm4MoeMoE):  # re-creates experts, gate and shared experts with the Glm4vMoeText* classes
+    def __init__(self, config: Glm4vMoeTextConfig):
+        super().__init__(config)
+        self.config = config
+        self.experts = nn.ModuleList(  # one MLP per routed expert (config.n_routed_experts of them)
+            [
+                Glm4vMoeTextMLP(config, intermediate_size=config.moe_intermediate_size)
+                for _ in range(config.n_routed_experts)
+            ]
+        )
+        self.gate = Glm4vMoeTextTopkRouter(config)
+        self.shared_experts = Glm4vMoeTextMLP(  # a single MLP whose width is moe_intermediate_size * n_shared_experts
+            config=config, intermediate_size=config.moe_intermediate_size * config.n_shared_experts
+        )
+
+
+class Glm4vMoeTextMLP(Glm4MoeMLP):  # renamed subclass of Glm4MoeMLP; instantiated by Glm4vMoeTextMoE
+    pass  # no overrides: everything is inherited unchanged from Glm4MoeMLP
+
+
+class Glm4vMoeTextDecoderLayer(Glm4MoeDecoderLayer):  # decoder layer typed against Glm4vMoeTextConfig
+    def __init__(self, config: Glm4vMoeTextConfig, layer_idx: int):
+        super().__init__(config, layer_idx)  # construction is delegated entirely to Glm4MoeDecoderLayer
+
+
+class Glm4vMoePreTrainedModel(Glm4MoePreTrainedModel):  # overrides class-level settings only; no methods are redefined
+    config: Glm4vMoeConfig
+    base_model_prefix = ""  # empty prefix
+    _no_split_modules = ["Glm4vMoeTextDecoderLayer", "Glm4vMoeVisionBlock"]  # class names given as strings; Glm4vMoeVisionBlock is not defined in this part of the file
+    _skip_keys_device_placement = "past_key_values"
+
+    _can_record_outputs = {  # output name -> module class; presumably read by the base class's output-recording hooks - confirm
+        "hidden_states": Glm4vMoeTextDecoderLayer,
+        "attentions": Glm4vMoeTextAttention,
+    }
+
+
+class Glm4vMoeForConditionalGeneration(Glm4vForConditionalGeneration):  # renamed subclass of Glm4vForConditionalGeneration
+    pass  # no overrides: everything is inherited unchanged from Glm4vForConditionalGeneration
+
+
+__all__ = [  # public names exported by this module
+    "Glm4vMoeConfig",
+    "Glm4vMoeTextConfig",
+    "Glm4vMoeForConditionalGeneration",
+    "Glm4vMoeModel",  # noqa: F822
+    "Glm4vMoePreTrainedModel",
+    "Glm4vMoeTextModel",  # noqa: F822
+]  # the two F822-suppressed names have no definition in the visible part of this file
diff --git a/phivenv/Lib/site-packages/transformers/models/glpn/__init__.py b/phivenv/Lib/site-packages/transformers/models/glpn/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..2a5b38675c34780fd7554db92ea870121f31fd76
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/glpn/__init__.py
@@ -0,0 +1,29 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:  # only static analysers take this branch; typing.TYPE_CHECKING is False at runtime
+    from .configuration_glpn import *
+    from .feature_extraction_glpn import *
+    from .image_processing_glpn import *
+    from .modeling_glpn import *
+else:  # at runtime the package module is replaced by a _LazyModule built from this file's import structure
+    import sys
+
+    _file = globals()["__file__"]  # path of this __init__.py, passed to both helpers below
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/phivenv/Lib/site-packages/transformers/models/glpn/__pycache__/__init__.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/glpn/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7a657dbe396ea8ceabc2d2cf8eed6d51602ce2d4
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/glpn/__pycache__/__init__.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/glpn/__pycache__/configuration_glpn.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/glpn/__pycache__/configuration_glpn.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..be3bd3d05f5bf48eb13afdbf83d5b7a97d2045d0
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/glpn/__pycache__/configuration_glpn.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/glpn/__pycache__/feature_extraction_glpn.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/glpn/__pycache__/feature_extraction_glpn.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0b39e61efb57d392004d5c725bd444bb0f663641
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/glpn/__pycache__/feature_extraction_glpn.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/glpn/__pycache__/image_processing_glpn.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/glpn/__pycache__/image_processing_glpn.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b48d96523dd8e50743cf528c68e09955cbaf24cb
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/glpn/__pycache__/image_processing_glpn.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/glpn/__pycache__/modeling_glpn.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/glpn/__pycache__/modeling_glpn.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8c78de3084943a89ab06f89adbf69aeaee22cb84
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/glpn/__pycache__/modeling_glpn.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/glpn/configuration_glpn.py b/phivenv/Lib/site-packages/transformers/models/glpn/configuration_glpn.py
new file mode 100644
index 0000000000000000000000000000000000000000..6fb35bb0b08cdedde15e3afc05ef2d8e3f2dbb5b
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/glpn/configuration_glpn.py
@@ -0,0 +1,135 @@
+# coding=utf-8
+# Copyright 2022 KAIST and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""GLPN model configuration"""
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class GLPNConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`GLPNModel`]. It is used to instantiate an GLPN
+    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+    defaults will yield a similar configuration to that of the GLPN
+    [vinvino02/glpn-kitti](https://huggingface.co/vinvino02/glpn-kitti) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        num_channels (`int`, *optional*, defaults to 3):
+            The number of input channels.
+        num_encoder_blocks (`int`, *optional*, defaults to 4):
+            The number of encoder blocks (i.e. stages in the Mix Transformer encoder).
+        depths (`list[int]`, *optional*, defaults to `[2, 2, 2, 2]`):
+            The number of layers in each encoder block.
+        sr_ratios (`list[int]`, *optional*, defaults to `[8, 4, 2, 1]`):
+            Sequence reduction ratios in each encoder block.
+        hidden_sizes (`list[int]`, *optional*, defaults to `[32, 64, 160, 256]`):
+            Dimension of each of the encoder blocks.
+        patch_sizes (`list[int]`, *optional*, defaults to `[7, 3, 3, 3]`):
+            Patch size before each encoder block.
+        strides (`list[int]`, *optional*, defaults to `[4, 2, 2, 2]`):
+            Stride before each encoder block.
+        num_attention_heads (`list[int]`, *optional*, defaults to `[1, 2, 5, 8]`):
+            Number of attention heads for each attention layer in each block of the Transformer encoder.
+        mlp_ratios (`list[int]`, *optional*, defaults to `[4, 4, 4, 4]`):
+            Ratio of the size of the hidden layer compared to the size of the input layer of the Mix FFNs in the
+            encoder blocks.
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"selu"` and `"gelu_new"` are supported.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        drop_path_rate (`float`, *optional*, defaults to 0.1):
+            The dropout probability for stochastic depth, used in the blocks of the Transformer encoder.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-06):
+            The epsilon used by the layer normalization layers.
+        decoder_hidden_size (`int`, *optional*, defaults to 64):
+            The dimension of the decoder.
+        max_depth (`int`, *optional*, defaults to 10):
+            The maximum depth of the decoder.
+        head_in_index (`int`, *optional*, defaults to -1):
+            The index of the features to use in the head.
+
+    Example:
+
+    ```python
+    >>> from transformers import GLPNModel, GLPNConfig
+
+    >>> # Initializing a GLPN vinvino02/glpn-kitti style configuration
+    >>> configuration = GLPNConfig()
+
+    >>> # Initializing a model from the vinvino02/glpn-kitti style configuration
+    >>> model = GLPNModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "glpn"
+
+    def __init__(
+        self,
+        num_channels=3,
+        num_encoder_blocks=4,
+        depths=[2, 2, 2, 2],
+        sr_ratios=[8, 4, 2, 1],
+        hidden_sizes=[32, 64, 160, 256],
+        patch_sizes=[7, 3, 3, 3],
+        strides=[4, 2, 2, 2],
+        num_attention_heads=[1, 2, 5, 8],
+        mlp_ratios=[4, 4, 4, 4],
+        hidden_act="gelu",
+        hidden_dropout_prob=0.0,
+        attention_probs_dropout_prob=0.0,
+        initializer_range=0.02,
+        drop_path_rate=0.1,
+        layer_norm_eps=1e-6,
+        decoder_hidden_size=64,
+        max_depth=10,
+        head_in_index=-1,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        self.num_channels = num_channels
+        self.num_encoder_blocks = num_encoder_blocks
+        self.depths = depths
+        self.sr_ratios = sr_ratios
+        self.hidden_sizes = hidden_sizes
+        self.patch_sizes = patch_sizes
+        self.strides = strides
+        self.mlp_ratios = mlp_ratios
+        self.num_attention_heads = num_attention_heads
+        self.hidden_act = hidden_act
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.initializer_range = initializer_range
+        self.drop_path_rate = drop_path_rate
+        self.layer_norm_eps = layer_norm_eps
+        self.decoder_hidden_size = decoder_hidden_size
+        self.max_depth = max_depth
+        self.head_in_index = head_in_index
+
+
+__all__ = ["GLPNConfig"]
diff --git a/phivenv/Lib/site-packages/transformers/models/glpn/feature_extraction_glpn.py b/phivenv/Lib/site-packages/transformers/models/glpn/feature_extraction_glpn.py
new file mode 100644
index 0000000000000000000000000000000000000000..327fee4a11fd308f980845a971a9fc8335decaf1
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/glpn/feature_extraction_glpn.py
@@ -0,0 +1,38 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Feature extractor class for GLPN."""
+
+import warnings
+
+from ...utils import logging
+from ...utils.import_utils import requires
+from .image_processing_glpn import GLPNImageProcessor
+
+
+logger = logging.get_logger(__name__)
+
+
+@requires(backends=("vision",))
+class GLPNFeatureExtractor(GLPNImageProcessor):  # deprecated alias: adds only a warning on top of GLPNImageProcessor
+    def __init__(self, *args, **kwargs) -> None:
+        warnings.warn(  # emitted on every instantiation, before the parent initialiser runs
+            "The class GLPNFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please"
+            " use GLPNImageProcessor instead.",
+            FutureWarning,
+        )
+        super().__init__(*args, **kwargs)  # all arguments are passed through unchanged
+
+
+__all__ = ["GLPNFeatureExtractor"]
diff --git a/phivenv/Lib/site-packages/transformers/models/glpn/image_processing_glpn.py b/phivenv/Lib/site-packages/transformers/models/glpn/image_processing_glpn.py
new file mode 100644
index 0000000000000000000000000000000000000000..958230a307f3a03be5509d8d6522dd4e97b06c4d
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/glpn/image_processing_glpn.py
@@ -0,0 +1,276 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Image processor class for GLPN."""
+
+from typing import TYPE_CHECKING, Optional, Union
+
+from ...utils.import_utils import requires
+
+
+if TYPE_CHECKING:
+    from ...modeling_outputs import DepthEstimatorOutput
+
+import numpy as np
+import PIL.Image
+
+from ...image_processing_utils import BaseImageProcessor, BatchFeature
+from ...image_transforms import resize, to_channel_dimension_format
+from ...image_utils import (
+    ChannelDimension,
+    PILImageResampling,
+    get_image_size,
+    infer_channel_dimension_format,
+    is_scaled_image,
+    is_torch_available,
+    make_list_of_images,
+    to_numpy_array,
+    valid_images,
+    validate_preprocess_arguments,
+)
+from ...utils import TensorType, filter_out_non_signature_kwargs, logging, requires_backends
+
+
+if is_torch_available():
+    import torch
+
+
+logger = logging.get_logger(__name__)
+
+
+@requires(backends=("vision",))
+class GLPNImageProcessor(BaseImageProcessor):
+    r"""
+    Constructs a GLPN image processor.
+
+    Args:
+        do_resize (`bool`, *optional*, defaults to `True`):
+            Whether to resize the image's (height, width) dimensions, rounding them down to the closest multiple of
+            `size_divisor`. Can be overridden by `do_resize` in `preprocess`.
+        size_divisor (`int`, *optional*, defaults to 32):
+            When `do_resize` is `True`, images are resized so their height and width are rounded down to the closest
+            multiple of `size_divisor`. Can be overridden by `size_divisor` in `preprocess`.
+        resample (`PIL.Image` resampling filter, *optional*, defaults to `Resampling.BILINEAR`):
+            Resampling filter to use if resizing the image. Can be overridden by `resample` in `preprocess`.
+        do_rescale (`bool`, *optional*, defaults to `True`):
+            Whether or not to apply the scaling factor (to make pixel values floats between 0. and 1.). Can be
+            overridden by `do_rescale` in `preprocess`.
+    """
+
+    model_input_names = ["pixel_values"]
+
+    def __init__(
+        self,
+        do_resize: bool = True,
+        size_divisor: int = 32,
+        resample=PILImageResampling.BILINEAR,
+        do_rescale: bool = True,
+        **kwargs,
+    ) -> None:
+        self.do_resize = do_resize
+        self.do_rescale = do_rescale
+        self.size_divisor = size_divisor
+        self.resample = resample
+        super().__init__(**kwargs)  # remaining keyword arguments go to BaseImageProcessor
+
+    def resize(
+        self,
+        image: np.ndarray,
+        size_divisor: int,
+        resample: PILImageResampling = PILImageResampling.BILINEAR,
+        data_format: Optional[ChannelDimension] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+        **kwargs,
+    ) -> np.ndarray:
+        """
+        Resize the image, rounding the (height, width) dimensions down to the closest multiple of size_divisor.
+
+        If the image is of dimension (3, 260, 170) and size_divisor is 32, the image will be resized to (3, 256, 160).
+
+        Args:
+            image (`np.ndarray`):
+                The image to resize.
+            size_divisor (`int`):
+                The image is resized so its height and width are rounded down to the closest multiple of
+                `size_divisor`.
+            resample:
+                `PIL.Image` resampling filter to use when resizing the image e.g. `PILImageResampling.BILINEAR`.
+            data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the output image. If `None`, the channel dimension format of the input
+                image is used. Can be one of:
+                - `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format of the input image. If not set, the channel dimension format is inferred
+                from the input image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+
+        Returns:
+            `np.ndarray`: The resized image.
+        """
+        height, width = get_image_size(image, channel_dim=input_data_format)
+        # Rounds the height and width down to the closest multiple of size_divisor
+        new_h = height // size_divisor * size_divisor  # a side shorter than size_divisor rounds down to 0
+        new_w = width // size_divisor * size_divisor
+        image = resize(  # module-level resize from image_transforms, not this method
+            image,
+            (new_h, new_w),
+            resample=resample,
+            data_format=data_format,
+            input_data_format=input_data_format,
+            **kwargs,
+        )
+        return image
+
+    @filter_out_non_signature_kwargs()
+    def preprocess(
+        self,
+        images: Union["PIL.Image.Image", TensorType, list["PIL.Image.Image"], list[TensorType]],
+        do_resize: Optional[bool] = None,
+        size_divisor: Optional[int] = None,
+        resample=None,
+        do_rescale: Optional[bool] = None,
+        return_tensors: Optional[Union[TensorType, str]] = None,
+        data_format: ChannelDimension = ChannelDimension.FIRST,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    ) -> BatchFeature:
+        """
+        Preprocess the given images.
+
+        Args:
+            images (`PIL.Image.Image` or `TensorType` or `list[np.ndarray]` or `list[TensorType]`):
+                Images to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
+                passing in images with pixel values between 0 and 1, set `do_rescale=False`.
+            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
+                Whether to resize the input such that the (height, width) dimensions are a multiple of `size_divisor`.
+            size_divisor (`int`, *optional*, defaults to `self.size_divisor`):
+                When `do_resize` is `True`, images are resized so their height and width are rounded down to the
+                closest multiple of `size_divisor`.
+            resample (`PIL.Image` resampling filter, *optional*, defaults to `self.resample`):
+                `PIL.Image` resampling filter to use if resizing the image e.g. `PILImageResampling.BILINEAR`. Only has
+                an effect if `do_resize` is set to `True`.
+            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
+                Whether or not to apply the scaling factor (to make pixel values floats between 0. and 1.).
+            return_tensors (`str` or `TensorType`, *optional*):
+                The type of tensors to return. Can be one of:
+                    - `None`: Return a list of `np.ndarray`.
+                    - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
+                    - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
+                    - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
+                    - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
+            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
+                The channel dimension format for the output image. Can be one of:
+                    - `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                    - `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the input image. If unset, the channel dimension format is inferred
+                from the input image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+        """
+        do_resize = do_resize if do_resize is not None else self.do_resize  # per-call value wins; None falls back to the instance setting
+        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
+        size_divisor = size_divisor if size_divisor is not None else self.size_divisor
+        resample = resample if resample is not None else self.resample
+
+        images = make_list_of_images(images)
+
+        if not valid_images(images):
+            raise ValueError(
+                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
+                "torch.Tensor, tf.Tensor or jax.ndarray."
+            )
+
+        # Here, the rescale() method uses a constant rescale_factor. It does not need to be validated
+        # with a rescale_factor.
+        validate_preprocess_arguments(
+            do_resize=do_resize,
+            size=size_divisor,  # Here, size_divisor is used as a parameter for optimal resizing instead of size.
+            resample=resample,
+        )
+
+        # All transformations expect numpy arrays.
+        images = [to_numpy_array(img) for img in images]
+
+        if do_rescale and is_scaled_image(images[0]):  # only the first image is inspected
+            logger.warning_once(
+                "It looks like you are trying to rescale already rescaled images. If the input"
+                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
+            )
+
+        if input_data_format is None:
+            # We assume that all images have the same channel dimension format.
+            input_data_format = infer_channel_dimension_format(images[0])
+
+        if do_resize:
+            images = [
+                self.resize(image, size_divisor=size_divisor, resample=resample, input_data_format=input_data_format)
+                for image in images
+            ]
+
+        if do_rescale:
+            images = [self.rescale(image, scale=1 / 255, input_data_format=input_data_format) for image in images]  # fixed 1/255 factor
+
+        images = [  # convert to the requested output layout last, after resize and rescale
+            to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images
+        ]
+
+        data = {"pixel_values": images}
+        return BatchFeature(data=data, tensor_type=return_tensors)
+
+    def post_process_depth_estimation(
+        self,
+        outputs: "DepthEstimatorOutput",
+        target_sizes: Optional[Union[TensorType, list[tuple[int, int]], None]] = None,
+    ) -> list[dict[str, TensorType]]:
+        """
+        Converts the raw output of [`DepthEstimatorOutput`] into final depth predictions and depth PIL images.
+        Only supports PyTorch.
+
+        Args:
+            outputs ([`DepthEstimatorOutput`]):
+                Raw outputs of the model.
+            target_sizes (`TensorType` or `list[tuple[int, int]]`, *optional*):
+                Tensor of shape `(batch_size, 2)` or list of tuples (`tuple[int, int]`) containing the target size
+                (height, width) of each image in the batch. If left to None, predictions will not be resized.
+
+        Returns:
+            `list[dict[str, TensorType]]`: A list of dictionaries of tensors representing the processed depth
+            predictions.
+        """
+        requires_backends(self, "torch")
+
+        predicted_depth = outputs.predicted_depth
+
+        if (target_sizes is not None) and (len(predicted_depth) != len(target_sizes)):
+            raise ValueError(
+                "Make sure that you pass in as many target sizes as the batch dimension of the predicted depth"
+            )
+
+        results = []
+        target_sizes = [None] * len(predicted_depth) if target_sizes is None else target_sizes  # one None per prediction keeps zip() below uniform
+        for depth, target_size in zip(predicted_depth, target_sizes):
+            if target_size is not None:
+                depth = depth[None, None, ...]  # add two leading singleton dims before interpolate
+                depth = torch.nn.functional.interpolate(depth, size=target_size, mode="bicubic", align_corners=False)
+                depth = depth.squeeze()  # NOTE(review): squeeze() without a dim drops every size-1 dim, so a target height or width of 1 also disappears
+
+            results.append({"predicted_depth": depth})
+
+        return results
+
+
+__all__ = ["GLPNImageProcessor"]
diff --git a/phivenv/Lib/site-packages/transformers/models/glpn/modeling_glpn.py b/phivenv/Lib/site-packages/transformers/models/glpn/modeling_glpn.py
new file mode 100644
index 0000000000000000000000000000000000000000..65e7b9b2654dbbcc338ff7edee7c752ae2a18815
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/glpn/modeling_glpn.py
@@ -0,0 +1,725 @@
+# coding=utf-8
+# Copyright 2022 KAIST and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch GLPN model."""
+
+import math
+from typing import Optional, Union
+
+import torch
+import torch.utils.checkpoint
+from torch import nn
+
+from ...activations import ACT2FN
+from ...modeling_outputs import BaseModelOutput, DepthEstimatorOutput
+from ...modeling_utils import PreTrainedModel
+from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
+from ...utils import auto_docstring, logging
+from .configuration_glpn import GLPNConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+# Copied from transformers.models.beit.modeling_beit.drop_path
+def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
+    """
+    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
+
+    Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
+    however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
+    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
+    layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
+    argument.
+    """
+    if drop_prob == 0.0 or not training:
+        return input
+    keep_prob = 1 - drop_prob
+    shape = (input.shape[0],) + (1,) * (input.ndim - 1)  # work with diff dim tensors, not just 2D ConvNets
+    random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
+    random_tensor.floor_()  # binarize
+    output = input.div(keep_prob) * random_tensor
+    return output
+
+
+# Copied from transformers.models.segformer.modeling_segformer.SegformerDropPath
+class GLPNDropPath(nn.Module):
+    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
+
+    def __init__(self, drop_prob: Optional[float] = None) -> None:
+        super().__init__()
+        self.drop_prob = drop_prob
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        return drop_path(hidden_states, self.drop_prob, self.training)
+
+    def extra_repr(self) -> str:
+        return f"p={self.drop_prob}"
+
+
+# Copied from transformers.models.segformer.modeling_segformer.SegformerOverlapPatchEmbeddings
+class GLPNOverlapPatchEmbeddings(nn.Module):
+    """Construct the overlapping patch embeddings."""
+
+    def __init__(self, patch_size, stride, num_channels, hidden_size):
+        super().__init__()
+        self.proj = nn.Conv2d(
+            num_channels,
+            hidden_size,
+            kernel_size=patch_size,
+            stride=stride,
+            padding=patch_size // 2,
+        )
+
+        self.layer_norm = nn.LayerNorm(hidden_size)
+
+    def forward(self, pixel_values):
+        embeddings = self.proj(pixel_values)
+        _, _, height, width = embeddings.shape
+        # (batch_size, num_channels, height, width) -> (batch_size, num_channels, height*width) -> (batch_size, height*width, num_channels)
+        # this can be fed to a Transformer layer
+        embeddings = embeddings.flatten(2).transpose(1, 2)
+        embeddings = self.layer_norm(embeddings)
+        return embeddings, height, width
+
+
+# Copied from transformers.models.segformer.modeling_segformer.SegformerEfficientSelfAttention
+class GLPNEfficientSelfAttention(nn.Module):
+    """SegFormer's efficient self-attention mechanism. Employs the sequence reduction process introduced in the [PvT
+    paper](https://huggingface.co/papers/2102.12122)."""
+
+    def __init__(self, config, hidden_size, num_attention_heads, sequence_reduction_ratio):
+        super().__init__()
+        self.hidden_size = hidden_size
+        self.num_attention_heads = num_attention_heads
+
+        if self.hidden_size % self.num_attention_heads != 0:
+            raise ValueError(
+                f"The hidden size ({self.hidden_size}) is not a multiple of the number of attention "
+                f"heads ({self.num_attention_heads})"
+            )
+
+        self.attention_head_size = int(self.hidden_size / self.num_attention_heads)
+        self.all_head_size = self.num_attention_heads * self.attention_head_size
+
+        self.query = nn.Linear(self.hidden_size, self.all_head_size)
+        self.key = nn.Linear(self.hidden_size, self.all_head_size)
+        self.value = nn.Linear(self.hidden_size, self.all_head_size)
+
+        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
+
+        self.sr_ratio = sequence_reduction_ratio
+        if sequence_reduction_ratio > 1:
+            self.sr = nn.Conv2d(
+                hidden_size, hidden_size, kernel_size=sequence_reduction_ratio, stride=sequence_reduction_ratio
+            )
+            self.layer_norm = nn.LayerNorm(hidden_size)
+
+    def forward(
+        self,
+        hidden_states,
+        height,
+        width,
+        output_attentions=False,
+    ):
+        batch_size, seq_length, _ = hidden_states.shape
+        query_layer = (
+            self.query(hidden_states)
+            .view(batch_size, -1, self.num_attention_heads, self.attention_head_size)
+            .transpose(1, 2)
+        )
+
+        if self.sr_ratio > 1:
+            batch_size, seq_len, num_channels = hidden_states.shape
+            # Reshape to (batch_size, num_channels, height, width)
+            hidden_states = hidden_states.permute(0, 2, 1).reshape(batch_size, num_channels, height, width)
+            # Apply sequence reduction
+            hidden_states = self.sr(hidden_states)
+            # Reshape back to (batch_size, seq_len, num_channels)
+            hidden_states = hidden_states.reshape(batch_size, num_channels, -1).permute(0, 2, 1)
+            hidden_states = self.layer_norm(hidden_states)
+
+        key_layer = (
+            self.key(hidden_states)
+            .view(batch_size, -1, self.num_attention_heads, self.attention_head_size)
+            .transpose(1, 2)
+        )
+        value_layer = (
+            self.value(hidden_states)
+            .view(batch_size, -1, self.num_attention_heads, self.attention_head_size)
+            .transpose(1, 2)
+        )
+
+        # Take the dot product between "query" and "key" to get the raw attention scores.
+        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
+
+        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
+
+        # Normalize the attention scores to probabilities.
+        attention_probs = nn.functional.softmax(attention_scores, dim=-1)
+
+        # This is actually dropping out entire tokens to attend to, which might
+        # seem a bit unusual, but is taken from the original Transformer paper.
+        attention_probs = self.dropout(attention_probs)
+
+        context_layer = torch.matmul(attention_probs, value_layer)
+
+        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
+        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
+        context_layer = context_layer.view(new_context_layer_shape)
+
+        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
+
+        return outputs
+
+
+# Copied from transformers.models.segformer.modeling_segformer.SegformerSelfOutput
+class GLPNSelfOutput(nn.Module):
+    def __init__(self, config, hidden_size):
+        super().__init__()
+        self.dense = nn.Linear(hidden_size, hidden_size)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self, hidden_states, input_tensor):
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        return hidden_states
+
+
+# Copied from transformers.models.segformer.modeling_segformer.SegformerAttention with Segformer->GLPN
+class GLPNAttention(nn.Module):
+    def __init__(self, config, hidden_size, num_attention_heads, sequence_reduction_ratio):
+        super().__init__()
+        self.self = GLPNEfficientSelfAttention(
+            config=config,
+            hidden_size=hidden_size,
+            num_attention_heads=num_attention_heads,
+            sequence_reduction_ratio=sequence_reduction_ratio,
+        )
+        self.output = GLPNSelfOutput(config, hidden_size=hidden_size)
+        self.pruned_heads = set()
+
+    def prune_heads(self, heads):
+        if len(heads) == 0:
+            return
+        heads, index = find_pruneable_heads_and_indices(
+            heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
+        )
+
+        # Prune linear layers
+        self.self.query = prune_linear_layer(self.self.query, index)
+        self.self.key = prune_linear_layer(self.self.key, index)
+        self.self.value = prune_linear_layer(self.self.value, index)
+        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
+
+        # Update hyper params and store pruned heads
+        self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
+        self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
+        self.pruned_heads = self.pruned_heads.union(heads)
+
+    def forward(self, hidden_states, height, width, output_attentions=False):
+        self_outputs = self.self(hidden_states, height, width, output_attentions)
+
+        attention_output = self.output(self_outputs[0], hidden_states)
+        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
+        return outputs
+
+
+# Copied from transformers.models.segformer.modeling_segformer.SegformerDWConv
+class GLPNDWConv(nn.Module):
+    def __init__(self, dim=768):
+        super().__init__()
+        self.dwconv = nn.Conv2d(dim, dim, 3, 1, 1, bias=True, groups=dim)
+
+    def forward(self, hidden_states, height, width):
+        batch_size, seq_len, num_channels = hidden_states.shape
+        hidden_states = hidden_states.transpose(1, 2).view(batch_size, num_channels, height, width)
+        hidden_states = self.dwconv(hidden_states)
+        hidden_states = hidden_states.flatten(2).transpose(1, 2)
+
+        return hidden_states
+
+
+# Copied from transformers.models.segformer.modeling_segformer.SegformerMixFFN with Segformer->GLPN
+class GLPNMixFFN(nn.Module):
+    def __init__(self, config, in_features, hidden_features=None, out_features=None):
+        super().__init__()
+        out_features = out_features or in_features
+        self.dense1 = nn.Linear(in_features, hidden_features)
+        self.dwconv = GLPNDWConv(hidden_features)
+        if isinstance(config.hidden_act, str):
+            self.intermediate_act_fn = ACT2FN[config.hidden_act]
+        else:
+            self.intermediate_act_fn = config.hidden_act
+        self.dense2 = nn.Linear(hidden_features, out_features)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self, hidden_states, height, width):
+        hidden_states = self.dense1(hidden_states)
+        hidden_states = self.dwconv(hidden_states, height, width)
+        hidden_states = self.intermediate_act_fn(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = self.dense2(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        return hidden_states
+
+
+# Copied from transformers.models.segformer.modeling_segformer.SegformerLayer with Segformer->GLPN
+class GLPNLayer(nn.Module):
+    """This corresponds to the Block class in the original implementation."""
+
+    def __init__(self, config, hidden_size, num_attention_heads, drop_path, sequence_reduction_ratio, mlp_ratio):
+        super().__init__()
+        self.layer_norm_1 = nn.LayerNorm(hidden_size)
+        self.attention = GLPNAttention(
+            config,
+            hidden_size=hidden_size,
+            num_attention_heads=num_attention_heads,
+            sequence_reduction_ratio=sequence_reduction_ratio,
+        )
+        self.drop_path = GLPNDropPath(drop_path) if drop_path > 0.0 else nn.Identity()
+        self.layer_norm_2 = nn.LayerNorm(hidden_size)
+        mlp_hidden_size = int(hidden_size * mlp_ratio)
+        self.mlp = GLPNMixFFN(config, in_features=hidden_size, hidden_features=mlp_hidden_size)
+
+    def forward(self, hidden_states, height, width, output_attentions=False):
+        self_attention_outputs = self.attention(
+            self.layer_norm_1(hidden_states),  # in GLPN, layernorm is applied before self-attention
+            height,
+            width,
+            output_attentions=output_attentions,
+        )
+
+        attention_output = self_attention_outputs[0]
+        outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights
+
+        # first residual connection (with stochastic depth)
+        attention_output = self.drop_path(attention_output)
+        hidden_states = attention_output + hidden_states
+
+        mlp_output = self.mlp(self.layer_norm_2(hidden_states), height, width)
+
+        # second residual connection (with stochastic depth)
+        mlp_output = self.drop_path(mlp_output)
+        layer_output = mlp_output + hidden_states
+
+        outputs = (layer_output,) + outputs
+
+        return outputs
+
+
+class GLPNEncoder(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+
+        # stochastic depth decay rule
+        dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths), device="cpu")]
+
+        # patch embeddings
+        embeddings = []
+        for i in range(config.num_encoder_blocks):
+            embeddings.append(
+                GLPNOverlapPatchEmbeddings(
+                    patch_size=config.patch_sizes[i],
+                    stride=config.strides[i],
+                    num_channels=config.num_channels if i == 0 else config.hidden_sizes[i - 1],
+                    hidden_size=config.hidden_sizes[i],
+                )
+            )
+        self.patch_embeddings = nn.ModuleList(embeddings)
+
+        # Transformer blocks
+        blocks = []
+        cur = 0
+        for i in range(config.num_encoder_blocks):
+            # each block consists of layers
+            layers = []
+            if i != 0:
+                cur += config.depths[i - 1]
+            for j in range(config.depths[i]):
+                layers.append(
+                    GLPNLayer(
+                        config,
+                        hidden_size=config.hidden_sizes[i],
+                        num_attention_heads=config.num_attention_heads[i],
+                        drop_path=dpr[cur + j],
+                        sequence_reduction_ratio=config.sr_ratios[i],
+                        mlp_ratio=config.mlp_ratios[i],
+                    )
+                )
+            blocks.append(nn.ModuleList(layers))
+
+        self.block = nn.ModuleList(blocks)
+
+        # Layer norms
+        self.layer_norm = nn.ModuleList(
+            [nn.LayerNorm(config.hidden_sizes[i]) for i in range(config.num_encoder_blocks)]
+        )
+
+    def forward(
+        self,
+        pixel_values,
+        output_attentions=False,
+        output_hidden_states=False,
+        return_dict=True,
+    ):
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attentions = () if output_attentions else None
+
+        batch_size = pixel_values.shape[0]
+
+        hidden_states = pixel_values
+        for idx, x in enumerate(zip(self.patch_embeddings, self.block, self.layer_norm)):
+            embedding_layer, block_layer, norm_layer = x
+            # first, obtain patch embeddings
+            hidden_states, height, width = embedding_layer(hidden_states)
+            # second, send embeddings through blocks
+            for i, blk in enumerate(block_layer):
+                layer_outputs = blk(hidden_states, height, width, output_attentions)
+                hidden_states = layer_outputs[0]
+                if output_attentions:
+                    all_self_attentions = all_self_attentions + (layer_outputs[1],)
+            # third, apply layer norm
+            hidden_states = norm_layer(hidden_states)
+            # fourth, optionally reshape back to (batch_size, num_channels, height, width)
+            hidden_states = hidden_states.reshape(batch_size, height, width, -1).permute(0, 3, 1, 2).contiguous()
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)
+
+        if not return_dict:
+            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
+        return BaseModelOutput(
+            last_hidden_state=hidden_states,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attentions,
+        )
+
+
+@auto_docstring
+class GLPNPreTrainedModel(PreTrainedModel):
+    config: GLPNConfig
+    base_model_prefix = "glpn"
+    main_input_name = "pixel_values"
+    _no_split_modules = []
+
+    # Copied from transformers.models.segformer.modeling_segformer.SegformerPreTrainedModel._init_weights
+    def _init_weights(self, module):
+        """Initialize the weights"""
+        if isinstance(module, (nn.Linear, nn.Conv2d)):
+            # Slightly different from the TF version which uses truncated_normal for initialization
+            # cf https://github.com/pytorch/pytorch/pull/5617
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+        elif isinstance(module, (nn.LayerNorm, nn.BatchNorm2d)):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+
+
+@auto_docstring
+class GLPNModel(GLPNPreTrainedModel):
+    # Copied from transformers.models.segformer.modeling_segformer.SegformerModel.__init__ with Segformer->GLPN
+    def __init__(self, config):
+        super().__init__(config)
+        self.config = config
+
+        # hierarchical Transformer encoder
+        self.encoder = GLPNEncoder(config)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def _prune_heads(self, heads_to_prune):
+        """
+        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
+        class PreTrainedModel
+        """
+        for layer, heads in heads_to_prune.items():
+            self.encoder.layer[layer].attention.prune_heads(heads)
+
+    @auto_docstring
+    # Copied from transformers.models.segformer.modeling_segformer.SegformerModel.forward
+    def forward(
+        self,
+        pixel_values: torch.FloatTensor,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, BaseModelOutput]:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        encoder_outputs = self.encoder(
+            pixel_values,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        sequence_output = encoder_outputs[0]
+
+        if not return_dict:
+            return (sequence_output,) + encoder_outputs[1:]
+
+        return BaseModelOutput(
+            last_hidden_state=sequence_output,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+        )
+
+
+class GLPNSelectiveFeatureFusion(nn.Module):
+    """
+    Selective Feature Fusion module, as explained in the [paper](https://huggingface.co/papers/2201.07436) (section 3.4). This
+    module adaptively selects and integrates local and global features by attaining an attention map for each feature.
+    """
+
+    def __init__(self, in_channel=64):
+        super().__init__()
+
+        self.convolutional_layer1 = nn.Sequential(
+            nn.Conv2d(in_channels=int(in_channel * 2), out_channels=in_channel, kernel_size=3, stride=1, padding=1),
+            nn.BatchNorm2d(in_channel),
+            nn.ReLU(),
+        )
+
+        self.convolutional_layer2 = nn.Sequential(
+            nn.Conv2d(in_channels=in_channel, out_channels=int(in_channel / 2), kernel_size=3, stride=1, padding=1),
+            nn.BatchNorm2d(int(in_channel / 2)),
+            nn.ReLU(),
+        )
+
+        self.convolutional_layer3 = nn.Conv2d(
+            in_channels=int(in_channel / 2), out_channels=2, kernel_size=3, stride=1, padding=1
+        )
+
+        self.sigmoid = nn.Sigmoid()
+
+    def forward(self, local_features, global_features):
+        # concatenate features along the channel dimension
+        features = torch.cat((local_features, global_features), dim=1)
+        # pass through convolutional layers
+        features = self.convolutional_layer1(features)
+        features = self.convolutional_layer2(features)
+        features = self.convolutional_layer3(features)
+        # apply sigmoid to get two-channel attention map
+        attn = self.sigmoid(features)
+        # construct hybrid features by adding element-wise
+        hybrid_features = local_features * attn[:, 0, :, :].unsqueeze(1) + global_features * attn[
+            :, 1, :, :
+        ].unsqueeze(1)
+
+        return hybrid_features
+
+
+class GLPNDecoderStage(nn.Module):
+    def __init__(self, in_channels, out_channels):
+        super().__init__()
+        should_skip = in_channels == out_channels
+        self.convolution = nn.Conv2d(in_channels, out_channels, kernel_size=1) if not should_skip else nn.Identity()
+        self.fusion = GLPNSelectiveFeatureFusion(out_channels)
+        self.upsample = nn.Upsample(scale_factor=2, mode="bilinear", align_corners=False)
+
+    def forward(self, hidden_state, residual=None):
+        hidden_state = self.convolution(hidden_state)
+        if residual is not None:
+            hidden_state = self.fusion(hidden_state, residual)
+        hidden_state = self.upsample(hidden_state)
+
+        return hidden_state
+
+        hidden_state = self.upsample(hidden_state)
+        return hidden_state
+
+
+class GLPNDecoder(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        # we use features from end -> start
+        reserved_hidden_sizes = config.hidden_sizes[::-1]
+        out_channels = config.decoder_hidden_size
+
+        self.stages = nn.ModuleList(
+            [GLPNDecoderStage(hidden_size, out_channels) for hidden_size in reserved_hidden_sizes]
+        )
+        # don't fuse in first stage
+        self.stages[0].fusion = None
+
+        self.final_upsample = nn.Upsample(scale_factor=2, mode="bilinear", align_corners=False)
+
+    def forward(self, hidden_states: list[torch.Tensor]) -> list[torch.Tensor]:
+        stage_hidden_states = []
+        stage_hidden_state = None
+        for hidden_state, stage in zip(hidden_states[::-1], self.stages):
+            stage_hidden_state = stage(hidden_state, stage_hidden_state)
+            stage_hidden_states.append(stage_hidden_state)
+
+        stage_hidden_states[-1] = self.final_upsample(stage_hidden_state)
+
+        return stage_hidden_states
+
+
+class SiLogLoss(nn.Module):
+    r"""
+    Implements the Scale-invariant log scale loss [Eigen et al., 2014](https://huggingface.co/papers/1406.2283).
+
+    $$L=\frac{1}{n} \sum_{i} d_{i}^{2}-\frac{1}{2 n^{2}}\left(\sum_{i} d_{i}^{2}\right)$$ where $d_{i}=\log y_{i}-\log
+    y_{i}^{*}$.
+
+    """
+
+    def __init__(self, lambd=0.5):
+        super().__init__()
+        self.lambd = lambd
+
+    def forward(self, pred, target):
+        valid_mask = (target > 0).detach()
+        diff_log = torch.log(target[valid_mask]) - torch.log(pred[valid_mask])
+        loss = torch.sqrt(torch.pow(diff_log, 2).mean() - self.lambd * torch.pow(diff_log.mean(), 2))
+
+        return loss
+
+
+class GLPNDepthEstimationHead(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+
+        self.config = config
+
+        channels = config.decoder_hidden_size
+        self.head = nn.Sequential(
+            nn.Conv2d(channels, channels, kernel_size=3, stride=1, padding=1),
+            nn.ReLU(inplace=False),
+            nn.Conv2d(channels, 1, kernel_size=3, stride=1, padding=1),
+        )
+
+    def forward(self, hidden_states: list[torch.Tensor]) -> torch.Tensor:
+        # use last features of the decoder
+        hidden_states = hidden_states[self.config.head_in_index]
+
+        hidden_states = self.head(hidden_states)
+
+        predicted_depth = torch.sigmoid(hidden_states) * self.config.max_depth
+        predicted_depth = predicted_depth.squeeze(dim=1)
+
+        return predicted_depth
+
+
+@auto_docstring(
+    custom_intro="""
+    GLPN Model transformer with a lightweight depth estimation head on top e.g. for KITTI, NYUv2.
+    """
+)
+class GLPNForDepthEstimation(GLPNPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+
+        self.glpn = GLPNModel(config)
+        self.decoder = GLPNDecoder(config)
+        self.head = GLPNDepthEstimationHead(config)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        pixel_values: torch.FloatTensor,
+        labels: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple[torch.Tensor], DepthEstimatorOutput]:
+        r"""
+        labels (`torch.FloatTensor` of shape `(batch_size, height, width)`, *optional*):
+            Ground truth depth estimation maps for computing the loss.
+
+        Examples:
+
+        ```python
+        >>> from transformers import AutoImageProcessor, GLPNForDepthEstimation
+        >>> import torch
+        >>> import numpy as np
+        >>> from PIL import Image
+        >>> import requests
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> image_processor = AutoImageProcessor.from_pretrained("vinvino02/glpn-kitti")
+        >>> model = GLPNForDepthEstimation.from_pretrained("vinvino02/glpn-kitti")
+
+        >>> # prepare image for the model
+        >>> inputs = image_processor(images=image, return_tensors="pt")
+
+        >>> with torch.no_grad():
+        ...     outputs = model(**inputs)
+
+        >>> # interpolate to original size
+        >>> post_processed_output = image_processor.post_process_depth_estimation(
+        ...     outputs,
+        ...     target_sizes=[(image.height, image.width)],
+        ... )
+
+        >>> # visualize the prediction
+        >>> predicted_depth = post_processed_output[0]["predicted_depth"]
+        >>> depth = predicted_depth * 255 / predicted_depth.max()
+        >>> depth = depth.detach().cpu().numpy()
+        >>> depth = Image.fromarray(depth.astype("uint8"))
+        ```"""
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+
+        outputs = self.glpn(
+            pixel_values,
+            output_attentions=output_attentions,
+            output_hidden_states=True,  # we need the intermediate hidden states
+            return_dict=return_dict,
+        )
+
+        hidden_states = outputs.hidden_states if return_dict else outputs[1]
+
+        out = self.decoder(hidden_states)
+        predicted_depth = self.head(out)
+
+        loss = None
+        if labels is not None:
+            loss_fct = SiLogLoss()
+            loss = loss_fct(predicted_depth, labels)
+
+        if not return_dict:
+            if output_hidden_states:
+                output = (predicted_depth,) + outputs[1:]
+            else:
+                output = (predicted_depth,) + outputs[2:]
+            return ((loss,) + output) if loss is not None else output
+
+        return DepthEstimatorOutput(
+            loss=loss,
+            predicted_depth=predicted_depth,
+            hidden_states=outputs.hidden_states if output_hidden_states else None,
+            attentions=outputs.attentions,
+        )
+
+
+__all__ = ["GLPNForDepthEstimation", "GLPNLayer", "GLPNModel", "GLPNPreTrainedModel"]
diff --git a/phivenv/Lib/site-packages/transformers/models/got_ocr2/__init__.py b/phivenv/Lib/site-packages/transformers/models/got_ocr2/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..00b6ccc53fc0efb0fc88c2f95586276cd40010fe
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/got_ocr2/__init__.py
@@ -0,0 +1,32 @@
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+# Under static type checking the submodules are star-imported so their names are visible to the checker.
+if TYPE_CHECKING:
+    from .configuration_got_ocr2 import *
+    from .image_processing_got_ocr2 import *
+    from .image_processing_got_ocr2_fast import *
+    from .modeling_got_ocr2 import *
+    from .processing_got_ocr2 import *
+
+
+# At runtime this package's entry in `sys.modules` is replaced by a `_LazyModule` built from the import
+# structure that `define_import_structure` derives from this file's path.
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/phivenv/Lib/site-packages/transformers/models/got_ocr2/__pycache__/__init__.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/got_ocr2/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..17dbf1e0162364e4998a910ac2bc07ce949e9c9a
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/got_ocr2/__pycache__/__init__.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/got_ocr2/__pycache__/configuration_got_ocr2.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/got_ocr2/__pycache__/configuration_got_ocr2.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c5ceb19597312327c4eeda24f5db704f66ad584f
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/got_ocr2/__pycache__/configuration_got_ocr2.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/got_ocr2/__pycache__/image_processing_got_ocr2.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/got_ocr2/__pycache__/image_processing_got_ocr2.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2d6ad35fca212ddc446617b886e51d58cbd52b53
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/got_ocr2/__pycache__/image_processing_got_ocr2.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/got_ocr2/__pycache__/image_processing_got_ocr2_fast.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/got_ocr2/__pycache__/image_processing_got_ocr2_fast.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..67d77533075888a7949e57b4f620bbc0164bc2f8
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/got_ocr2/__pycache__/image_processing_got_ocr2_fast.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/got_ocr2/__pycache__/modeling_got_ocr2.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/got_ocr2/__pycache__/modeling_got_ocr2.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2e36e51031c926d6c3887c9ecab243c690474b67
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/got_ocr2/__pycache__/modeling_got_ocr2.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/got_ocr2/__pycache__/modular_got_ocr2.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/got_ocr2/__pycache__/modular_got_ocr2.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ca2d4cf8cace5ddcc13e63ab75df139b1bd5ffde
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/got_ocr2/__pycache__/modular_got_ocr2.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/got_ocr2/__pycache__/processing_got_ocr2.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/got_ocr2/__pycache__/processing_got_ocr2.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2804b1edb0ef97875ebd032abd5190caef239186
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/got_ocr2/__pycache__/processing_got_ocr2.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/got_ocr2/configuration_got_ocr2.py b/phivenv/Lib/site-packages/transformers/models/got_ocr2/configuration_got_ocr2.py
new file mode 100644
index 0000000000000000000000000000000000000000..eb039f958950c3fbc3cd4401b425348181b13a00
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/got_ocr2/configuration_got_ocr2.py
@@ -0,0 +1,211 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/got_ocr2/modular_got_ocr2.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_got_ocr2.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from ...configuration_utils import PretrainedConfig
+from ..auto import CONFIG_MAPPING, AutoConfig
+
+
+class GotOcr2VisionConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`GotOcr2VisionModel`]. It is used to instantiate a GOT_OCR2
+    vision encoder according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the SAM ViT-h
+    [facebook/sam-vit-huge](https://huggingface.co/facebook/sam-vit-huge) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        hidden_size (`int`, *optional*, defaults to 768):
+            Dimensionality of the encoder layers and the pooler layer.
+        output_channels (`int`, *optional*, defaults to 256):
+            Dimensionality of the output channels in the Patch Encoder.
+        num_hidden_layers (`int`, *optional*, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 12):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        num_channels (`int`, *optional*, defaults to 3):
+            Number of channels in the input image.
+        image_size (`int`, *optional*, defaults to 1024):
+            Expected resolution. Target size of the resized input image.
+        patch_size (`int`, *optional*, defaults to 16):
+            Size of the patches to be extracted from the input image.
+        hidden_act (`str`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string).
+        layer_norm_eps (`float`, *optional*, defaults to 1e-06):
+            The epsilon used by the layer normalization layers.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        initializer_range (`float`, *optional*, defaults to 1e-10):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        qkv_bias (`bool`, *optional*, defaults to `True`):
+            Whether to add a bias to query, key, value projections.
+        use_abs_pos (`bool`, *optional*, defaults to `True`):
+            Whether to use absolute position embedding.
+        use_rel_pos (`bool`, *optional*, defaults to `True`):
+            Whether to use relative position embedding.
+        window_size (`int`, *optional*, defaults to 14):
+            Window size for relative position.
+        global_attn_indexes (`list[int]`, *optional*, defaults to `[2, 5, 8, 11]`):
+            The indexes of the global attention layers. `None` selects `[2, 5, 8, 11]`, built fresh per instance.
+        mlp_dim (`int`, *optional*, defaults to 3072):
+            The dimensionality of the MLP layer in the Transformer encoder.
+    """
+
+    base_config_key = "vision_config"
+
+    def __init__(
+        self,
+        hidden_size=768,
+        output_channels=256,
+        num_hidden_layers=12,
+        num_attention_heads=12,
+        num_channels=3,
+        image_size=1024,
+        patch_size=16,
+        hidden_act="gelu",
+        layer_norm_eps=1e-06,
+        attention_dropout=0.0,
+        initializer_range=1e-10,
+        qkv_bias=True,
+        use_abs_pos=True,
+        use_rel_pos=True,
+        window_size=14,
+        global_attn_indexes=None,
+        mlp_dim=3072,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.hidden_size = hidden_size
+        self.output_channels = output_channels
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.num_channels = num_channels
+        self.image_size = image_size
+        self.patch_size = patch_size
+        self.hidden_act = hidden_act
+        self.layer_norm_eps = layer_norm_eps
+        self.attention_dropout = attention_dropout
+        self.initializer_range = initializer_range
+        self.qkv_bias = qkv_bias
+        self.use_abs_pos = use_abs_pos
+        self.use_rel_pos = use_rel_pos
+        self.window_size = window_size
+        # A `None` default avoids sharing one mutable list object between every default-constructed config.
+        self.global_attn_indexes = [2, 5, 8, 11] if global_attn_indexes is None else global_attn_indexes
+        self.mlp_dim = mlp_dim
+
+
+class GotOcr2Config(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`GotOcr2ForConditionalGeneration`]. It is used to instantiate a
+    GotOcr2 model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of GOT-OCR-2.0.
+
+    e.g. [stepfun-ai/GOT-OCR-2.0-hf](https://huggingface.co/stepfun-ai/GOT-OCR-2.0-hf)
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        vision_config (`Union[GotOcr2VisionConfig, dict]`, *optional*, defaults to `GotOcr2VisionConfig`):
+            The config object or dictionary of the vision backbone. Any other type raises a `TypeError`.
+        text_config (`Union[AutoConfig, dict]`, *optional*, defaults to a `"qwen2"` config):
+            The config object or dictionary of the text backbone. A dict without `model_type` is read as `"qwen2"`.
+        image_token_index (`int`, *optional*, defaults to 151859):
+            The image token index to encode the image prompt.
+        image_seq_length (`int`, *optional*, defaults to 576):
+            Sequence length of one image embedding.
+        pad_token_id (`int`, *optional*, defaults to -1):
+            Padding token id.
+
+    ```python
+    >>> from transformers import GotOcr2ForConditionalGeneration, GotOcr2Config
+
+    >>> # Initializing a GotOcr2 style configuration
+    >>> configuration = GotOcr2Config()
+
+    >>> # Initializing a model from the GotOcr2 style configuration
+    >>> model = GotOcr2ForConditionalGeneration(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "got_ocr2"
+    attribute_map = {
+        "image_token_id": "image_token_index",
+    }
+    sub_configs = {"text_config": AutoConfig, "vision_config": GotOcr2VisionConfig}
+
+    def __init__(
+        self,
+        vision_config=None,
+        text_config=None,
+        image_token_index=151859,
+        image_seq_length=576,
+        pad_token_id=-1,
+        **kwargs,
+    ):
+        self.image_token_index = image_token_index
+        self.image_seq_length = image_seq_length
+        self.pad_token_id = pad_token_id
+
+        if vision_config is None:
+            vision_config = GotOcr2VisionConfig()
+        elif isinstance(vision_config, dict):
+            vision_config = GotOcr2VisionConfig(**vision_config)
+        elif not isinstance(vision_config, GotOcr2VisionConfig):
+            raise TypeError(f"`vision_config` must be a dict or `GotOcr2VisionConfig`, got {type(vision_config)}.")
+        self.vision_config = vision_config
+
+        if isinstance(text_config, dict):
+            text_config = {"model_type": "qwen2", **text_config}  # copy so the caller's dict is not mutated
+            text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
+        elif text_config is None:
+            text_config = CONFIG_MAPPING["qwen2"](
+                vocab_size=151860,
+                hidden_size=1024,
+                intermediate_size=2816,
+                num_hidden_layers=24,
+                num_attention_heads=16,
+                num_key_value_heads=16,
+                hidden_act="silu",
+                max_position_embeddings=32768,
+                initializer_range=0.02,
+                rms_norm_eps=1e-6,
+                use_cache=True,
+                tie_word_embeddings=True,
+                rope_theta=1000000.0,
+                rope_scaling=None,
+                use_sliding_window=False,
+                sliding_window=4096,
+                max_window_layers=21,
+                attention_dropout=0.0,
+            )
+
+        self.text_config = text_config
+
+        super().__init__(**kwargs)
+
+
+__all__ = ["GotOcr2VisionConfig", "GotOcr2Config"]
diff --git a/phivenv/Lib/site-packages/transformers/models/got_ocr2/image_processing_got_ocr2.py b/phivenv/Lib/site-packages/transformers/models/got_ocr2/image_processing_got_ocr2.py
new file mode 100644
index 0000000000000000000000000000000000000000..a1a48fa6cf7b9951edcc2023a1a9af6bdc23de28
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/got_ocr2/image_processing_got_ocr2.py
@@ -0,0 +1,526 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Image processor class for Got-OCR-2."""
+
+from functools import lru_cache
+from typing import Optional, Union
+
+import numpy as np
+
+from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
+from ...image_transforms import (
+    convert_to_rgb,
+    resize,
+    to_channel_dimension_format,
+)
+from ...image_utils import (
+    OPENAI_CLIP_MEAN,
+    OPENAI_CLIP_STD,
+    ChannelDimension,
+    ImageInput,
+    PILImageResampling,
+    infer_channel_dimension_format,
+    is_scaled_image,
+    make_flat_list_of_images,
+    to_numpy_array,
+    valid_images,
+    validate_preprocess_arguments,
+)
+from ...utils import TensorType, filter_out_non_signature_kwargs, is_vision_available, logging
+
+
+if is_vision_available():
+    import PIL
+
+
+logger = logging.get_logger(__name__)
+
+
+# Similar to image_processing_mllama.get_all_supported_aspect_ratios
+@lru_cache(maxsize=10)
+def get_all_supported_aspect_ratios(min_image_tiles: int, max_image_tiles: int) -> list[tuple[int, int]]:
+    """
+    Enumerate every tile grid whose tile count lies in `[min_image_tiles, max_image_tiles]`.
+
+    Each grid is a `(width, height)` pair counted in tiles. Grids are ordered by ascending tile count
+    (`width * height`); grids with the same tile count keep ascending-`width` order because the sort is
+    stable. The result is memoized by `lru_cache`, so repeated calls hand back the same list object.
+
+    Args:
+        min_image_tiles (`int`):
+            Smallest number of tiles a grid may contain.
+        max_image_tiles (`int`):
+            Largest number of tiles a grid may contain.
+
+    Returns:
+        `list[tuple[int, int]]`: The admissible `(width, height)` grids, sorted by tile count.
+
+    Example:
+        >>> get_all_supported_aspect_ratios(1, 4)
+        [(1, 1), (1, 2), (2, 1), (1, 3), (3, 1), (1, 4), (2, 2), (4, 1)]
+    """
+    side_lengths = range(1, max_image_tiles + 1)
+    grids = [
+        (columns, rows)
+        for columns in side_lengths
+        for rows in side_lengths
+        if min_image_tiles <= columns * rows <= max_image_tiles
+    ]
+    # `list.sort` is stable, so grids with equal tile counts stay in generation order.
+    grids.sort(key=lambda grid: grid[0] * grid[1])
+
+    return grids
+
+
+@lru_cache(maxsize=100)
+def get_optimal_tiled_canvas(
+    original_image_size: tuple[int, int],
+    target_tile_size: tuple[int, int],
+    min_image_tiles: int,
+    max_image_tiles: int,
+) -> tuple[int, int]:
+    """
+    Pick the tile grid whose width/height ratio is closest to that of the original image.
+
+    `original_image_size` and `target_tile_size` are both unpacked as `(height, width)`; the returned grid is
+    `(width, height)` counted in tiles. When a later candidate ties with the current best on ratio distance, it
+    replaces the best only if the image area exceeds half of the area that candidate's tiles would cover.
+    """
+    candidates = get_all_supported_aspect_ratios(min_image_tiles, max_image_tiles)
+    image_height, image_width = original_image_size
+    tile_height, tile_width = target_tile_size
+    image_ratio = image_width / image_height
+    image_area = image_width * image_height
+
+    # Candidates arrive sorted by tile count, so ties are visited from fewer tiles to more tiles.
+    chosen_grid = (1, 1)
+    smallest_gap = float("inf")
+    for candidate in candidates:
+        columns, rows = candidate
+        gap = abs(image_ratio - columns / rows)
+        if gap < smallest_gap:
+            smallest_gap, chosen_grid = gap, candidate
+            continue
+        if gap != smallest_gap:
+            continue
+        # Tie on ratio distance: accept the candidate unless its tiles would cover more than
+        # twice the original image area.
+        if image_area > 0.5 * tile_height * tile_width * columns * rows:
+            chosen_grid = candidate
+
+    return chosen_grid
+
+
+class GotOcr2ImageProcessor(BaseImageProcessor):
+    r"""
+    Constructs a GOT_OCR2 image processor.
+
+    Args:
+        do_resize (`bool`, *optional*, defaults to `True`):
+            Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by the
+            `do_resize` parameter in the `preprocess` method.
+        size (`dict`, *optional*, defaults to `{"height": 384, "width": 384}`):
+            Size of the output image after resizing. Can be overridden by the `size` parameter in the `preprocess`
+            method.
+        crop_to_patches (`bool`, *optional*, defaults to `False`):
+            Whether to crop the image to patches. Can be overridden by the `crop_to_patches` parameter in the
+            `preprocess` method.
+        min_patches (`int`, *optional*, defaults to 1):
+            The minimum number of patches to be extracted from the image. Only has an effect if `crop_to_patches` is
+            set to `True`. Can be overridden by the `min_patches` parameter in the `preprocess` method.
+        max_patches (`int`, *optional*, defaults to 12):
+            The maximum number of patches to be extracted from the image. Only has an effect if `crop_to_patches` is
+            set to `True`. Can be overridden by the `max_patches` parameter in the `preprocess` method.
+        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
+            Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. Can be
+            overridden by the `resample` parameter in the `preprocess` method.
+        do_rescale (`bool`, *optional*, defaults to `True`):
+            Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the
+            `do_rescale` parameter in the `preprocess` method.
+        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
+            Scale factor to use if rescaling the image. Only has an effect if `do_rescale` is set to `True`. Can be
+            overridden by the `rescale_factor` parameter in the `preprocess` method.
+        do_normalize (`bool`, *optional*, defaults to `True`):
+            Whether to normalize the image with `image_mean` and `image_std`. Can be overridden by the
+            `do_normalize` parameter in the `preprocess` method.
+        image_mean (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_MEAN`):
+            Mean to use if normalizing the image. This is a float or list of floats the length of the number of
+            channels in the image. Falls back to `OPENAI_CLIP_MEAN` when left as `None`. Can be overridden by the
+            `image_mean` parameter in the `preprocess` method.
+        image_std (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_STD`):
+            Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
+            number of channels in the image. Falls back to `OPENAI_CLIP_STD` when left as `None`. Can be overridden
+            by the `image_std` parameter in the `preprocess` method.
+        do_convert_rgb (`bool`, *optional*, defaults to `True`):
+            Whether to convert the image to RGB.
+    """
+
+    model_input_names = ["pixel_values"]
+
+    def __init__(
+        self,
+        do_resize: bool = True,
+        size: Optional[dict[str, int]] = None,
+        crop_to_patches: bool = False,
+        min_patches: int = 1,
+        max_patches: int = 12,
+        resample: PILImageResampling = PILImageResampling.BICUBIC,
+        do_rescale: bool = True,
+        rescale_factor: Union[int, float] = 1 / 255,
+        do_normalize: bool = True,
+        image_mean: Optional[Union[float, list[float]]] = None,
+        image_std: Optional[Union[float, list[float]]] = None,
+        do_convert_rgb: bool = True,
+        **kwargs,
+    ) -> None:
+        """Store the preprocessing defaults; each of them can be overridden per call in `preprocess`."""
+        super().__init__(**kwargs)
+        if size is None:
+            size = {"height": 384, "width": 384}
+        size = get_size_dict(size, default_to_square=True)
+        self.do_resize = do_resize
+        self.size = size
+        self.crop_to_patches = crop_to_patches
+        self.min_patches, self.max_patches = min_patches, max_patches
+        self.resample = resample
+        self.do_rescale, self.rescale_factor = do_rescale, rescale_factor
+        self.do_normalize = do_normalize
+        # Normalization statistics left as `None` fall back to the OpenAI CLIP mean / std.
+        self.image_mean = OPENAI_CLIP_MEAN if image_mean is None else image_mean
+        self.image_std = OPENAI_CLIP_STD if image_std is None else image_std
+        self.do_convert_rgb = do_convert_rgb
+
+    def resize(
+        self,
+        image: np.ndarray,
+        size: dict[str, int],
+        resample: PILImageResampling = PILImageResampling.BICUBIC,
+        data_format: Optional[Union[str, ChannelDimension]] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+        **kwargs,
+    ) -> np.ndarray:
+        """
+        Resize `image` to exactly `(size["height"], size["width"])`.
+
+        Args:
+            image (`np.ndarray`):
+                The image to resize.
+            size (`dict[str, int]`):
+                Output size as `{"height": int, "width": int}`; a `ValueError` is raised if either key is missing.
+            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
+                Resampling filter applied while resizing, e.g. `PILImageResampling.BICUBIC`.
+            data_format (`ChannelDimension` or `str`, *optional*):
+                Channel layout of the returned image. When unset, the layout of the input image is kept. One of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                Channel layout of the input image. When unset, it is inferred from the input image. One of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+            **kwargs:
+                Extra keyword arguments forwarded unchanged to the functional `resize` transform.
+
+        Returns:
+            `np.ndarray`: The resized image.
+        """
+        size = get_size_dict(size)
+        if not ("height" in size and "width" in size):
+            raise ValueError(f"The `size` dictionary must contain the keys `height` and `width`. Got {size.keys()}")
+        target_height, target_width = size["height"], size["width"]
+        return resize(
+            image,
+            size=(target_height, target_width),
+            resample=resample,
+            data_format=data_format,
+            input_data_format=input_data_format,
+            **kwargs,
+        )
+
+    @filter_out_non_signature_kwargs()
+    def preprocess(
+        self,
+        images: ImageInput,
+        do_resize: Optional[bool] = None,
+        size: Optional[dict[str, int]] = None,
+        crop_to_patches: Optional[bool] = None,
+        min_patches: Optional[int] = None,
+        max_patches: Optional[int] = None,
+        resample: Optional[PILImageResampling] = None,
+        do_rescale: Optional[bool] = None,
+        rescale_factor: Optional[float] = None,
+        do_normalize: Optional[bool] = None,
+        image_mean: Optional[Union[float, list[float]]] = None,
+        image_std: Optional[Union[float, list[float]]] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        do_convert_rgb: Optional[bool] = None,
+        data_format: ChannelDimension = ChannelDimension.FIRST,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    ) -> BatchFeature:
+        """
+        Preprocess an image or batch of images.
+
+        Args:
+            images (`ImageInput`):
+                Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
+                passing in images with pixel values between 0 and 1, set `do_rescale=False`.
+            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
+                Whether to resize the image.
+            size (`dict[str, int]`, *optional*, defaults to `self.size`):
+                Target `{"height": int, "width": int}` of each output image; also the patch size when cropping.
+            crop_to_patches (`bool`, *optional*, defaults to `self.crop_to_patches`):
+                Whether to crop the image to patches.
+            min_patches (`int`, *optional*, defaults to `self.min_patches`):
+                The minimum number of patches to be extracted from the image. Only has an effect if `crop_to_patches` is
+                set to `True`.
+            max_patches (`int`, *optional*, defaults to `self.max_patches`):
+                The maximum number of patches to be extracted from the image. Only has an effect if `crop_to_patches` is
+                set to `True`.
+            resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
+                Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`.
+            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
+                Whether to rescale the image values between [0 - 1].
+            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
+                Rescale factor to rescale the image by if `do_rescale` is set to `True`.
+            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
+                Whether to normalize the image.
+            image_mean (`float` or `list[float]`, *optional*, defaults to `self.image_mean`):
+                Image mean to normalize the image by if `do_normalize` is set to `True`.
+            image_std (`float` or `list[float]`, *optional*, defaults to `self.image_std`):
+                Image standard deviation to normalize the image by if `do_normalize` is set to `True`.
+            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
+                Whether to convert the image to RGB.
+            return_tensors (`str` or `TensorType`, *optional*):
+                The type of tensors to return. Can be one of:
+                    - Unset: Return a list of `np.ndarray`.
+                    - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
+                    - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
+                    - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
+                    - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
+            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
+                The channel dimension format for the output image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - Unset: Use the channel dimension format of the input image.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the input image. If unset, the channel dimension format is inferred
+                from the input image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+
+        Returns:
+            [`BatchFeature`]: `pixel_values` (processed images / patches) and `num_patches` (patch count per image).
+        """
+        do_resize = do_resize if do_resize is not None else self.do_resize
+        crop_to_patches = crop_to_patches if crop_to_patches is not None else self.crop_to_patches
+        min_patches = min_patches if min_patches is not None else self.min_patches
+        max_patches = max_patches if max_patches is not None else self.max_patches
+        resample = resample if resample is not None else self.resample
+        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
+        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
+        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
+        image_mean = image_mean if image_mean is not None else self.image_mean
+        image_std = image_std if image_std is not None else self.image_std
+        do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
+
+        size = size if size is not None else self.size
+        size = get_size_dict(size, default_to_square=False)
+
+        images = self.fetch_images(images)
+        images = make_flat_list_of_images(images)
+
+        if not valid_images(images):
+            raise ValueError(
+                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
+                "torch.Tensor, tf.Tensor or jax.ndarray."
+            )
+
+        validate_preprocess_arguments(
+            do_rescale=do_rescale,
+            rescale_factor=rescale_factor,
+            do_normalize=do_normalize,
+            image_mean=image_mean,
+            image_std=image_std,
+            do_resize=do_resize,
+            size=size,
+            resample=resample,
+        )
+        # PIL RGBA images are converted to RGB
+        if do_convert_rgb:
+            images = [convert_to_rgb(image) for image in images]
+
+        # All transformations expect numpy arrays.
+        images = [to_numpy_array(image) for image in images]
+
+        if do_rescale and is_scaled_image(images[0]):
+            logger.warning_once(
+                "It looks like you are trying to rescale already rescaled images. If the input"
+                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
+            )
+
+        if input_data_format is None:
+            # We assume that all images have the same channel dimension format.
+            input_data_format = infer_channel_dimension_format(images[0])
+
+        if crop_to_patches and max_patches > 1:
+            images = [
+                self.crop_image_to_patches(
+                    image,
+                    min_patches=min_patches,
+                    max_patches=max_patches,
+                    patch_size=size,
+                    data_format=input_data_format,
+                )
+                for image in images
+            ]
+            num_patches = np.array([len(image) for image in images])
+            images = [image for images_list in images for image in images_list]
+        else:
+            num_patches = np.array([1] * len(images))
+
+        for i, image in enumerate(images):
+            if do_resize:
+                images[i] = self.resize(image, size=size, resample=resample, input_data_format=input_data_format)
+
+            if do_rescale:
+                images[i] = self.rescale(image=images[i], scale=rescale_factor, input_data_format=input_data_format)
+
+            if do_normalize:
+                images[i] = self.normalize(
+                    image=images[i],
+                    mean=image_mean,
+                    std=image_std,
+                    input_data_format=input_data_format,
+                )
+
+            images[i] = to_channel_dimension_format(images[i], data_format, input_channel_dim=input_data_format)
+
+        encoded_outputs = BatchFeature(
+            data={"pixel_values": images, "num_patches": num_patches}, tensor_type=return_tensors
+        )
+
+        return encoded_outputs
+
+    def crop_image_to_patches(
+        self,
+        images: np.ndarray,
+        min_patches: int,
+        max_patches: int,
+        use_thumbnail: bool = True,
+        patch_size: Optional[Union[tuple, int, dict]] = None,
+        data_format: ChannelDimension = None,
+    ):
+        """
+        Crop the image to patches and return a list of cropped images.
+        The number of patches and their grid arrangement are determined by the original image size,
+        the target patch size and the minimum and maximum number of patches.
+        The aspect ratio of the patches grid is chosen to be the closest to the original image aspect ratio.
+
+        Args:
+            images (`np.ndarray`):
+                The image to be cropped.
+            min_patches (`int`):
+                The minimum number of patches to be extracted from the image.
+            max_patches (`int`):
+                The maximum number of patches to be extracted from the image.
+            use_thumbnail (`bool`, *optional*, defaults to `True`):
+                Whether to add a thumbnail image to the list of cropped patches.
+            patch_size (`int`, `tuple[int, int]`, `dict`, *optional*):
+                The size of the output patches.
+            data_format (`ChannelDimension`, *optional*):
+                The format of the image data. If `None`, the format is inferred from the input image.
+
+        Returns:
+            list[`PIL.Image.Image`] or list[np.ndarray]: The list of cropped images.
+        """
+        if data_format is None:
+            data_format = infer_channel_dimension_format(images)
+        images = to_channel_dimension_format(images, ChannelDimension.FIRST, data_format)
+        patch_size_height, patch_size_width = patch_size["height"], patch_size["width"]
+        original_height, original_width = images.shape[-2:]
+        # find the closest aspect ratio to the target
+        num_columns, num_rows = get_optimal_tiled_canvas(
+            (original_height, original_width), (patch_size_height, patch_size_width), min_patches, max_patches
+        )
+
+        # calculate the target width and height
+        target_width = patch_size_width * num_columns
+        target_height = patch_size_height * num_rows
+        num_blocks = num_columns * num_rows
+
+        # resize the image so that each patch is of patch_size
+        resized_image = self.resize(
+            images,
+            {"height": target_height, "width": target_width},
+            data_format=ChannelDimension.FIRST,
+            input_data_format=ChannelDimension.FIRST,
+        )
+        # split the image into patches
+        processed_images = []
+        for i in range(num_blocks):
+            column = i % num_columns
+            row = i // num_columns
+            box = (
+                column * patch_size_width,
+                row * patch_size_height,
+                (column + 1) * patch_size_width,
+                (row + 1) * patch_size_height,
+            )
+            # split the image
+            patch_image = resized_image[..., box[1] : box[3], box[0] : box[2]]
+            patch_image = to_channel_dimension_format(patch_image, data_format, ChannelDimension.FIRST)
+            processed_images.append(patch_image)
+
+        if use_thumbnail and len(processed_images) != 1:
+            thumbnail_img = self.resize(
+                images, patch_size, data_format=data_format, input_data_format=ChannelDimension.FIRST
+            )
+            processed_images.append(thumbnail_img)
+
+        return processed_images
+
+    def get_number_of_image_patches(self, height: int, width: int, images_kwargs=None):
+        """
+        A utility that returns number patches for a given image size.
+
+        Args:
+            height (`int`):
+                Height of the input image.
+            width (`int`):
+                Width of the input image.
+            images_kwargs (`dict`, *optional*)
+                Any kwargs to override defaults of the image processor.
+        Returns:
+            `int`: Number of patches per image.
+        """
+        min_patches = images_kwargs.get("min_patches", self.min_patches)
+        max_patches = images_kwargs.get("max_patches", self.max_patches)
+        patch_size = images_kwargs.get("patch_size", self.size)
+        crop_to_patches = images_kwargs.get("crop_to_patches", self.crop_to_patches)
+
+        num_patches = 1
+        if crop_to_patches and max_patches > 1:
+            num_columns, num_rows = get_optimal_tiled_canvas(
+                (height, width), (patch_size["height"], patch_size["width"]), min_patches, max_patches
+            )
+            if num_columns * num_rows > 1:
+                num_patches += num_columns * num_rows
+
+        return num_patches
+
+
+__all__ = ["GotOcr2ImageProcessor"]
diff --git a/phivenv/Lib/site-packages/transformers/models/got_ocr2/image_processing_got_ocr2_fast.py b/phivenv/Lib/site-packages/transformers/models/got_ocr2/image_processing_got_ocr2_fast.py
new file mode 100644
index 0000000000000000000000000000000000000000..6652e018263cdc6dae9f81b190dd56771bbdbcd3
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/got_ocr2/image_processing_got_ocr2_fast.py
@@ -0,0 +1,256 @@
+# coding=utf-8
+# Copyright 2025 HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Fast Image processor class for Got-OCR-2."""
+
+from typing import Optional, Union
+
+from ...image_processing_utils import BatchFeature
+from ...image_processing_utils_fast import (
+    BaseImageProcessorFast,
+    DefaultFastImageProcessorKwargs,
+    group_images_by_shape,
+    reorder_images,
+)
+from ...image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD, ImageInput, PILImageResampling, SizeDict
+from ...processing_utils import Unpack
+from ...utils import (
+    TensorType,
+    auto_docstring,
+    is_torch_available,
+    is_torchvision_available,
+    is_torchvision_v2_available,
+)
+from .image_processing_got_ocr2 import get_optimal_tiled_canvas
+
+
+if is_torch_available():
+    import torch
+
+if is_torchvision_available():
+    if is_torchvision_v2_available():
+        from torchvision.transforms.v2 import functional as F
+    else:
+        from torchvision.transforms import functional as F
+
+
+class GotOcr2FastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
+    """
+    crop_to_patches (`bool`, *optional*, defaults to `False`):
+        Whether to crop the image to patches. Can be overridden by the `crop_to_patches` parameter in the
+        `preprocess` method.
+    min_patches (`int`, *optional*, defaults to 1):
+        The minimum number of patches to be extracted from the image. Only has an effect if `crop_to_patches` is
+        set to `True`. Can be overridden by the `min_patches` parameter in the `preprocess` method.
+    max_patches (`int`, *optional*, defaults to 12):
+        The maximum number of patches to be extracted from the image. Only has an effect if `crop_to_patches` is
+        set to `True`. Can be overridden by the `max_patches` parameter in the `preprocess` method.
+    """
+
+    crop_to_patches: Optional[bool]  # class-level default on `GotOcr2ImageProcessorFast` is `False`
+    min_patches: Optional[int]  # class-level default on `GotOcr2ImageProcessorFast` is `1`
+    max_patches: Optional[int]  # class-level default on `GotOcr2ImageProcessorFast` is `12`
+
+
+@auto_docstring
+class GotOcr2ImageProcessorFast(BaseImageProcessorFast):
+    resample = PILImageResampling.BICUBIC
+    image_mean = OPENAI_CLIP_MEAN
+    image_std = OPENAI_CLIP_STD
+    size = {"height": 384, "width": 384}
+    do_resize = True
+    do_rescale = True
+    do_normalize = True
+    do_convert_rgb = True
+    crop_to_patches = False
+    min_patches = 1
+    max_patches = 12
+    valid_kwargs = GotOcr2FastImageProcessorKwargs
+
+    def __init__(self, **kwargs: Unpack[GotOcr2FastImageProcessorKwargs]):
+        super().__init__(**kwargs)
+
+    @auto_docstring
+    def preprocess(self, images: ImageInput, **kwargs: Unpack[GotOcr2FastImageProcessorKwargs]) -> BatchFeature:
+        return super().preprocess(images, **kwargs)
+
+    def crop_image_to_patches(
+        self,
+        images: "torch.Tensor",
+        min_patches: int,
+        max_patches: int,
+        use_thumbnail: bool = True,
+        patch_size: Optional[Union[tuple, int, dict]] = None,
+        interpolation: Optional["F.InterpolationMode"] = None,
+    ):
+        """
+        Crop the images to patches and return a list of cropped images.
+        The number of patches and their grid arrangement are determined by the original image size,
+        the target patch size and the minimum and maximum number of patches.
+        The aspect ratio of the patches grid is chosen to be the closest to the original image aspect ratio.
+
+        Args:
+            images (`torch.Tensor`):
+                The images to be cropped.
+            min_patches (`int`):
+                The minimum number of patches to be extracted from the image.
+            max_patches (`int`):
+                The maximum number of patches to be extracted from the image.
+            use_thumbnail (`bool`, *optional*, defaults to `True`):
+                Whether to add a thumbnail image to the list of cropped patches.
+            patch_size (`int`, `tuple[int, int]`, `dict`, *optional*):
+                The size of the output patches.
+                The format of the image data. If `None`, the format is inferred from the input image.
+
+        Returns:
+            list[`PIL.Image.Image`] or list[np.ndarray]: The list of cropped images.
+        """
+        patch_size_height, patch_size_width = patch_size.height, patch_size.width
+        original_height, original_width = images.shape[-2:]
+        # find the closest aspect ratio to the target
+        num_columns, num_rows = get_optimal_tiled_canvas(
+            (original_height, original_width), (patch_size_height, patch_size_width), min_patches, max_patches
+        )
+
+        # calculate the target width and height
+        target_width = patch_size_width * num_columns
+        target_height = patch_size_height * num_rows
+        num_blocks = num_columns * num_rows
+
+        # resize the image so that each patch is of patch_size
+        resized_image = self.resize(
+            images, SizeDict(height=target_height, width=target_width), interpolation=interpolation
+        )
+        # split the image into patches
+        processed_images = []
+        for i in range(num_blocks):
+            column = i % num_columns
+            row = i // num_columns
+            box = (
+                column * patch_size_width,
+                row * patch_size_height,
+                (column + 1) * patch_size_width,
+                (row + 1) * patch_size_height,
+            )
+            # split the image
+            patch_image = resized_image[..., box[1] : box[3], box[0] : box[2]]
+            processed_images.append(patch_image)
+
+        if use_thumbnail and len(processed_images) != 1:
+            thumbnail_img = self.resize(images, patch_size, interpolation=interpolation)
+            processed_images.append(thumbnail_img)
+
+        processed_images = torch.stack(processed_images, dim=0).transpose(0, 1).contiguous()
+
+        return processed_images
+
+    def _preprocess(
+        self,
+        images: list["torch.Tensor"],
+        do_resize: bool,
+        size: SizeDict,
+        crop_to_patches: bool,
+        min_patches: int,
+        max_patches: int,
+        interpolation: Optional["F.InterpolationMode"],
+        do_center_crop: bool,
+        crop_size: SizeDict,
+        do_rescale: bool,
+        rescale_factor: float,
+        do_normalize: bool,
+        image_mean: Optional[Union[float, list[float]]],
+        image_std: Optional[Union[float, list[float]]],
+        disable_grouping: Optional[bool],
+        return_tensors: Optional[Union[str, TensorType]],
+    ) -> BatchFeature:
+        if crop_to_patches:
+            grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping)
+            processed_images_grouped = {}
+            num_patches = {}
+            for shape, stacked_images in grouped_images.items():
+                stacked_images = self.crop_image_to_patches(
+                    stacked_images,
+                    min_patches,
+                    max_patches,
+                    patch_size=size,
+                    interpolation=interpolation,
+                )
+                processed_images_grouped[shape] = stacked_images
+                num_patches[shape] = [stacked_images.shape[1]] * stacked_images.shape[0]
+            images = reorder_images(processed_images_grouped, grouped_images_index)
+            images = [image for images_list in images for image in images_list]
+            num_patches = reorder_images(num_patches, grouped_images_index)
+        else:
+            num_patches = [1] * len(images)
+
+        # Group images by size for batched resizing
+        grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping)
+        resized_images_grouped = {}
+        for shape, stacked_images in grouped_images.items():
+            if do_resize:
+                stacked_images = self.resize(image=stacked_images, size=size, interpolation=interpolation)
+            resized_images_grouped[shape] = stacked_images
+        resized_images = reorder_images(resized_images_grouped, grouped_images_index)
+
+        # Group images by size for further processing
+        # Needed in case do_resize is False, or resize returns images with different sizes
+        grouped_images, grouped_images_index = group_images_by_shape(resized_images, disable_grouping=disable_grouping)
+        processed_images_grouped = {}
+        for shape, stacked_images in grouped_images.items():
+            if do_center_crop:
+                stacked_images = self.center_crop(stacked_images, crop_size)
+            # Fused rescale and normalize
+            stacked_images = self.rescale_and_normalize(
+                stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std
+            )
+            processed_images_grouped[shape] = stacked_images
+
+        processed_images = reorder_images(processed_images_grouped, grouped_images_index)
+        processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images
+
+        return BatchFeature(
+            data={"pixel_values": processed_images, "num_patches": num_patches}, tensor_type=return_tensors
+        )
+
+    def get_number_of_image_patches(self, height: int, width: int, images_kwargs=None):
+        """
+        A utility that returns number patches for a given image size.
+
+        Args:
+            height (`int`):
+                Height of the input image.
+            width (`int`):
+                Width of the input image.
+            images_kwargs (`dict`, *optional*)
+                Any kwargs to override defaults of the image processor.
+        Returns:
+            `int`: Number of patches per image.
+        """
+        min_patches = images_kwargs.get("min_patches", self.min_patches)
+        max_patches = images_kwargs.get("max_patches", self.max_patches)
+        patch_size = images_kwargs.get("patch_size", self.size)
+        crop_to_patches = images_kwargs.get("crop_to_patches", self.crop_to_patches)
+
+        num_patches = 1
+        if crop_to_patches and max_patches > 1:
+            num_columns, num_rows = get_optimal_tiled_canvas(
+                (height, width), (patch_size["height"], patch_size["width"]), min_patches, max_patches
+            )
+            if num_columns * num_rows > 1:
+                num_patches += num_columns * num_rows
+
+        return num_patches
+
+
+__all__ = ["GotOcr2ImageProcessorFast"]
diff --git a/phivenv/Lib/site-packages/transformers/models/got_ocr2/modeling_got_ocr2.py b/phivenv/Lib/site-packages/transformers/models/got_ocr2/modeling_got_ocr2.py
new file mode 100644
index 0000000000000000000000000000000000000000..c7da9129910fde906128a55bae7c87873fce94c6
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/got_ocr2/modeling_got_ocr2.py
@@ -0,0 +1,843 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/got_ocr2/modular_got_ocr2.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_got_ocr2.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import collections
+from dataclasses import dataclass
+from typing import Optional, Union
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from transformers.utils.generic import check_model_inputs
+
+from ...activations import ACT2FN
+from ...cache_utils import Cache
+from ...generation import GenerationMixin
+from ...modeling_flash_attention_utils import FlashAttentionKwargs
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import BaseModelOutputWithPast, ModelOutput
+from ...modeling_utils import PreTrainedModel
+from ...processing_utils import Unpack
+from ...utils import TransformersKwargs, auto_docstring, can_return_tuple
+from ..auto import AutoModel
+from .configuration_got_ocr2 import GotOcr2Config, GotOcr2VisionConfig
+
+
+class GotOcr2MLPBlock(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.lin1 = nn.Linear(config.hidden_size, config.mlp_dim)
+        self.lin2 = nn.Linear(config.mlp_dim, config.hidden_size)
+        self.act = ACT2FN[config.hidden_act]
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.lin1(hidden_states)
+        hidden_states = self.act(hidden_states)
+        hidden_states = self.lin2(hidden_states)
+        return hidden_states
+
+
+class GotOcr2VisionAttention(nn.Module):
+    """Multi-head Attention block with relative position embeddings."""
+
+    def __init__(self, config, window_size):
+        super().__init__()
+        input_size = (
+            (config.image_size // config.patch_size, config.image_size // config.patch_size)
+            if window_size == 0
+            else (window_size, window_size)
+        )
+
+        self.num_attention_heads = config.num_attention_heads
+        head_dim = config.hidden_size // config.num_attention_heads
+        self.scale = head_dim**-0.5
+        self.dropout = config.attention_dropout
+
+        self.qkv = nn.Linear(config.hidden_size, config.hidden_size * 3, bias=config.qkv_bias)
+        self.proj = nn.Linear(config.hidden_size, config.hidden_size)
+
+        self.use_rel_pos = config.use_rel_pos
+        if self.use_rel_pos:
+            if input_size is None:  # NOTE(review): `input_size` is always a tuple above, so this cannot trigger
+                raise ValueError("Input size must be provided if using relative positional encoding.")
+
+            # initialize relative positional embeddings
+            self.rel_pos_h = nn.Parameter(torch.zeros(2 * input_size[0] - 1, head_dim))
+            self.rel_pos_w = nn.Parameter(torch.zeros(2 * input_size[1] - 1, head_dim))
+
+    def get_rel_pos(self, q_size: int, k_size: int, rel_pos: torch.Tensor) -> torch.Tensor:
+        """
+        Get relative positional embeddings according to the relative positions of
+            query and key sizes.
+
+        Args:
+            q_size (int):
+                size of the query.
+            k_size (int):
+                size of key k.
+            rel_pos (`torch.Tensor`):
+                relative position embeddings (L, channel).
+
+        Returns:
+            Extracted positional embeddings according to relative positions.
+        """
+        max_rel_dist = int(2 * max(q_size, k_size) - 1)
+        # Linearly interpolate the table along its first axis to `max_rel_dist` entries.
+        rel_pos_resized = F.interpolate(
+            rel_pos.reshape(1, rel_pos.shape[0], -1).permute(0, 2, 1),
+            size=max_rel_dist,
+            mode="linear",
+        )
+        rel_pos_resized = rel_pos_resized.reshape(-1, max_rel_dist).permute(1, 0)
+
+        # Scale the coords with short length if shapes for q and k are different.
+        q_coords = torch.arange(q_size)[:, None] * max(k_size / q_size, 1.0)
+        k_coords = torch.arange(k_size)[None, :] * max(q_size / k_size, 1.0)
+        relative_coords = (q_coords - k_coords) + (k_size - 1) * max(q_size / k_size, 1.0)
+
+        return rel_pos_resized[relative_coords.long()]  # one embedding row per (query, key) index pair
+
+    def get_decomposed_rel_pos(
+        self,
+        query: torch.Tensor,
+        rel_pos_h: torch.Tensor,
+        rel_pos_w: torch.Tensor,
+        q_size: tuple[int, int],
+        k_size: tuple[int, int],
+    ) -> torch.Tensor:
+        """
+        Calculate decomposed Relative Positional Embeddings from :paper:`mvitv2`.
+        https://github.com/facebookresearch/mvit/blob/19786631e330df9f3622e5402b4a419a263a2c80/mvit/models/attention.py
+
+        Args:
+            query (`torch.Tensor`):
+                query q in the attention layer with shape (batch_size, query_height * query_width, channel).
+            rel_pos_h (`torch.Tensor`):
+                relative position embeddings (Lh, channel) for height axis.
+            rel_pos_w (`torch.Tensor`):
+                relative position embeddings (Lw, channel) for width axis.
+            q_size (tuple):
+                spatial sequence size of query q with (query_height, query_width).
+            k_size (tuple):
+                spatial sequence size of key k with (key_height, key_width).
+
+        Returns:
+            decomposed_rel_pos (`torch.Tensor`):
+                decomposed relative position embeddings.
+        """
+        query_height, query_width = q_size
+        key_height, key_width = k_size
+        relative_position_height = self.get_rel_pos(query_height, key_height, rel_pos_h)
+        relative_position_width = self.get_rel_pos(query_width, key_width, rel_pos_w)
+
+        batch_size, _, dim = query.shape
+        reshaped_query = query.reshape(batch_size, query_height, query_width, dim)
+        rel_h = torch.einsum("bhwc,hkc->bhwk", reshaped_query, relative_position_height)
+        rel_w = torch.einsum("bhwc,wkc->bhwk", reshaped_query, relative_position_width)
+
+        # broadcast-add the height term over key columns and the width term over key rows
+        decomposed_rel_pos = rel_h[:, :, :, :, None] + rel_w[:, :, :, None, :]
+
+        return decomposed_rel_pos
+
+    def forward(self, hidden_states: torch.Tensor, output_attentions=None) -> tuple[torch.Tensor, torch.Tensor]:
+        batch_size, height, width, _ = hidden_states.shape
+        # qkv with shape (3, batch_size, nHead, height * width, channel)
+        qkv = (
+            self.qkv(hidden_states)
+            .reshape(batch_size, height * width, 3, self.num_attention_heads, -1)
+            .permute(2, 0, 3, 1, 4)
+        )
+        # q, k, v with shape (batch_size * nHead, height * width, channel)
+        query, key, value = qkv.reshape(3, batch_size * self.num_attention_heads, height * width, -1).unbind(0)
+
+        attn_weights = (query * self.scale) @ key.transpose(-2, -1)  # scaled dot-product scores
+
+        if self.use_rel_pos:
+            decomposed_rel_pos = self.get_decomposed_rel_pos(
+                query, self.rel_pos_h, self.rel_pos_w, (height, width), (height, width)
+            )
+            decomposed_rel_pos = decomposed_rel_pos.reshape_as(attn_weights)
+            attn_weights = attn_weights + decomposed_rel_pos
+
+        attn_weights = torch.nn.functional.softmax(attn_weights, dtype=torch.float32, dim=-1).to(query.dtype)
+
+        attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
+
+        attn_output = (attn_probs @ value).reshape(batch_size, self.num_attention_heads, height, width, -1)
+        attn_output = attn_output.permute(0, 2, 3, 1, 4).reshape(batch_size, height, width, -1)
+
+        attn_output = self.proj(attn_output)
+        return attn_output, attn_weights  # the returned weights are the softmax output, before dropout
+
+
+class GotOcr2VisionLayer(GradientCheckpointingLayer):
+    def __init__(self, config, window_size):
+        super().__init__()
+        self.layer_norm1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.attn = GotOcr2VisionAttention(config, window_size)
+        self.layer_norm2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.mlp = GotOcr2MLPBlock(config)
+        self.window_size = window_size
+
+    def window_partition(self, hidden_states: torch.Tensor, window_size: int) -> tuple[torch.Tensor, tuple[int, int]]:
+        """
+        Partition into non-overlapping windows with padding if needed.
+        Args:
+            hidden_states (tensor): input tokens with [batch_size, height, width, channel].
+            window_size (int): window size.
+
+        Returns:
+            windows: windows after partition with [batch_size * num_windows, window_size, window_size, channel].
+            (pad_height, pad_width): height and width after padding, i.e. the sizes that were partitioned
+        """
+        batch_size, height, width, channel = hidden_states.shape
+
+        pad_h = (window_size - height % window_size) % window_size
+        pad_w = (window_size - width % window_size) % window_size
+        hidden_states = F.pad(hidden_states, (0, 0, 0, pad_w, 0, pad_h))  # pads the end of width and height only
+        pad_height, pad_width = height + pad_h, width + pad_w
+
+        hidden_states = hidden_states.reshape(
+            batch_size, pad_height // window_size, window_size, pad_width // window_size, window_size, channel
+        )
+        windows = hidden_states.permute(0, 1, 3, 2, 4, 5).contiguous().reshape(-1, window_size, window_size, channel)
+        return windows, (pad_height, pad_width)
+
+    def window_unpartition(
+        self, windows: torch.Tensor, window_size: int, padding_shape: tuple[int, int], original_shape: tuple[int, int]
+    ) -> torch.Tensor:
+        """
+        Window unpartition into original sequences and removing padding.
+        Args:
+            windows (tensor):
+                input tokens with [batch_size * num_windows, window_size, window_size, channel].
+            window_size (int):
+                window size.
+            padding_shape (Tuple):
+                padded height and width (pad_height, pad_width).
+            original_shape (Tuple): original height and width (height, width) before padding.
+
+        Returns:
+            hidden_states: unpartitioned sequences with [batch_size, height, width, channel].
+        """
+        pad_height, pad_width = padding_shape
+        height, width = original_shape
+        batch_size = windows.shape[0] // (pad_height * pad_width // window_size // window_size)
+        hidden_states = windows.reshape(
+            batch_size, pad_height // window_size, pad_width // window_size, window_size, window_size, -1
+        )
+        hidden_states = (
+            hidden_states.permute(0, 1, 3, 2, 4, 5).contiguous().reshape(batch_size, pad_height, pad_width, -1)
+        )
+
+        hidden_states = hidden_states[:, :height, :width, :].contiguous()  # drop the padded rows and columns
+        return hidden_states
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        residual = hidden_states
+        hidden_states = self.layer_norm1(hidden_states)
+        # Window partition
+        if self.window_size > 0:
+            height, width = hidden_states.shape[1], hidden_states.shape[2]
+            hidden_states, padding_shape = self.window_partition(hidden_states, self.window_size)
+
+        hidden_states, attn_weights = self.attn(  # `attn_weights` is not used further in this method
+            hidden_states=hidden_states,
+        )
+        # Reverse window partition
+        if self.window_size > 0:
+            hidden_states = self.window_unpartition(hidden_states, self.window_size, padding_shape, (height, width))
+
+        hidden_states = residual + hidden_states
+        layernorm_output = self.layer_norm2(hidden_states)
+        hidden_states = hidden_states + self.mlp(layernorm_output)
+        return hidden_states
+
+
+@auto_docstring
+class GotOcr2PreTrainedModel(PreTrainedModel):
+    config: GotOcr2Config
+    base_model_prefix = ""
+    supports_gradient_checkpointing = True
+    _skip_keys_device_placement = "past_key_values"
+    _supports_flash_attn = False
+    _supports_sdpa = False
+
+    _can_compile_fullgraph = True
+    _supports_flex_attn = False
+    _supports_attention_backend = True
+
+    def _init_weights(self, module):
+        super()._init_weights(module)  # parent initialization runs first; the zeroing below is applied on top
+        if isinstance(module, GotOcr2VisionAttention):
+            if module.use_rel_pos:  # `rel_pos_h`/`rel_pos_w` are only created when `use_rel_pos` is set
+                module.rel_pos_h.data.zero_()
+                module.rel_pos_w.data.zero_()
+        elif isinstance(module, GotOcr2VisionEncoder):
+            if module.pos_embed is not None:
+                module.pos_embed.data.zero_()
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Base class for got_ocr2 vision model's outputs that also contains image embeddings obtained by applying the projection
+    layer to the pooler_output.
+    """
+)
+class GotOcr2VisionEncoderOutput(ModelOutput):
+    r"""
+    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
+        The image embeddings obtained by applying the projection layer to the pooler_output.
+    """
+
+    image_embeds: Optional[torch.FloatTensor] = None
+    last_hidden_state: Optional[torch.FloatTensor] = None  # this and the fields below are not described above
+    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
+    attentions: Optional[tuple[torch.FloatTensor, ...]] = None
+
+
+class GotOcr2PatchEmbeddings(nn.Module):
+    """
+    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
+    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
+    Transformer.
+    """
+
+    def __init__(self, config):
+        super().__init__()
+        image_size, patch_size = config.image_size, config.patch_size
+        num_channels, hidden_size = config.num_channels, config.hidden_size
+        image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
+        patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
+        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
+        self.image_size = image_size
+        self.patch_size = patch_size
+        self.num_channels = num_channels
+        self.num_patches = num_patches
+
+        self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)
+
+    def forward(self, pixel_values):
+        batch_size, num_channels, height, width = pixel_values.shape
+        if num_channels != self.num_channels:
+            raise ValueError(
+                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
+            )
+        if height != self.image_size[0] or width != self.image_size[1]:
+            raise ValueError(
+                f"Input image size ({height}*{width}) doesn't match model ({self.image_size[0]}*{self.image_size[1]})."
+            )
+        embeddings = self.projection(pixel_values).permute(0, 2, 3, 1)
+        return embeddings
+
+
+class GotOcr2LayerNorm(nn.LayerNorm):
+    r"""LayerNorm that supports two data formats: channels_last (default) or channels_first.
+    `data_format` is the ordering of the input dimensions: channels_last means inputs of shape (batch_size, height,
+    width, channels), while channels_first means inputs of shape (batch_size, channels, height, width).
+    """
+
+    def __init__(self, normalized_shape, *, eps: float = 1e-6, data_format: str = "channels_last", **kwargs):
+        super().__init__(normalized_shape, eps=eps, **kwargs)
+        if data_format not in ["channels_last", "channels_first"]:
+            raise NotImplementedError(f"Unsupported data format: {data_format}")
+        self.data_format = data_format
+
+    def forward(self, features: torch.Tensor) -> torch.Tensor:
+        """
+        Args:
+            features: Tensor of shape (batch_size, channels, height, width) OR (batch_size, height, width, channels)
+        """
+        if self.data_format == "channels_first":
+            features = features.permute(0, 2, 3, 1)  # channels-first -> channels-last
+            features = super().forward(features)  # nn.LayerNorm normalizes over the trailing dimension(s)
+            features = features.permute(0, 3, 1, 2)  # restore the channels-first layout
+        else:
+            features = super().forward(features)
+        return features
+
+
+class GotOcr2VisionNeck(nn.Module):
+    def __init__(self, config: GotOcr2VisionConfig):
+        super().__init__()
+        self.config = config
+        # 1x1 conv: hidden_size -> output_channels; the 3x3 conv with padding=1 keeps the spatial size unchanged.
+        self.conv1 = nn.Conv2d(config.hidden_size, config.output_channels, kernel_size=1, bias=False)
+        self.layer_norm1 = GotOcr2LayerNorm(config.output_channels, data_format="channels_first")
+        self.conv2 = nn.Conv2d(config.output_channels, config.output_channels, kernel_size=3, padding=1, bias=False)
+        self.layer_norm2 = GotOcr2LayerNorm(config.output_channels, data_format="channels_first")
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states = hidden_states.permute(0, 3, 1, 2)  # channels-last -> channels-first for nn.Conv2d
+        hidden_states = self.conv1(hidden_states)
+        hidden_states = self.layer_norm1(hidden_states)
+
+        hidden_states = self.conv2(hidden_states)
+        hidden_states = self.layer_norm2(hidden_states)
+        return hidden_states  # still channels-first: (batch_size, output_channels, height, width)
+
+
+class GotOcr2VisionEncoder(GotOcr2PreTrainedModel):
+    _can_record_outputs = {"hidden_states": GotOcr2VisionLayer, "attentions": GotOcr2VisionAttention}
+
+    def __init__(self, config: GotOcr2VisionConfig):
+        super().__init__(config)
+        self.config = config
+        self.image_size = config.image_size
+        self.patch_embed = GotOcr2PatchEmbeddings(config)
+
+        self.pos_embed = None  # stays None when `config.use_abs_pos` is false; `forward` then skips the addition
+        if config.use_abs_pos:
+            # Absolute position embedding, one vector per patch-grid cell (`//` needs int image_size/patch_size).
+            self.pos_embed = nn.Parameter(
+                torch.zeros(
+                    1,
+                    config.image_size // config.patch_size,
+                    config.image_size // config.patch_size,
+                    config.hidden_size,
+                )
+            )
+        # Layers whose index is in `global_attn_indexes` get window_size=0; the others use `config.window_size`.
+        self.layers = nn.ModuleList()
+        for i in range(config.num_hidden_layers):
+            layer = GotOcr2VisionLayer(
+                config,
+                window_size=config.window_size if i not in config.global_attn_indexes else 0,
+            )
+            self.layers.append(layer)
+
+        self.neck = GotOcr2VisionNeck(config)
+
+        self.gradient_checkpointing = False
+
+    def get_input_embeddings(self):
+        return self.patch_embed
+
+    @check_model_inputs
+    def forward(
+        self, pixel_values: Optional[torch.FloatTensor] = None, **kwargs: Unpack[TransformersKwargs]
+    ) -> GotOcr2VisionEncoderOutput:
+        if pixel_values is None:
+            raise ValueError("You have to specify pixel_values")
+        # Pipeline: patch embedding -> optional absolute position embedding -> encoder layers -> conv neck.
+        hidden_states = self.patch_embed(pixel_values)
+        if self.pos_embed is not None:
+            hidden_states = hidden_states + self.pos_embed
+        for layer_module in self.layers:
+            hidden_states = layer_module(hidden_states)
+        hidden_states = self.neck(hidden_states)
+        return GotOcr2VisionEncoderOutput(
+            last_hidden_state=hidden_states,
+        )
+
+
+class GotOcr2MultiModalProjector(nn.Module):
+    def __init__(self, config: GotOcr2Config):
+        super().__init__()
+        vision_output_channels = config.vision_config.output_channels
+        language_hidden_size = config.text_config.hidden_size
+        self.conv_upsampler1 = nn.Conv2d(  # stride 2 shrinks height/width (despite the name); channels double
+            vision_output_channels, vision_output_channels * 2, kernel_size=3, stride=2, padding=1, bias=False
+        )
+        self.conv_upsampler2 = nn.Conv2d(  # stride 2 again; output channels equal the language model width
+            vision_output_channels * 2, language_hidden_size, kernel_size=3, stride=2, padding=1, bias=False
+        )
+        self.multimodal_projector = nn.Linear(language_hidden_size, language_hidden_size)
+
+    def forward(self, vision_embeddings: torch.Tensor) -> torch.Tensor:
+        hidden_state = self.conv_upsampler1(vision_embeddings)
+        hidden_state = self.conv_upsampler2(hidden_state)
+        hidden_state = hidden_state.flatten(2).permute(0, 2, 1)  # (B, C, H, W) -> (B, H * W, C)
+        hidden_state = self.multimodal_projector(hidden_state)
+        return hidden_state
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Base class for GotOcr2 causal language model (or autoregressive) outputs.
+    """
+)
+class GotOcr2CausalLMOutputWithPast(ModelOutput):
+    r"""
+    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+        Language modeling loss (for next-token prediction).
+    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
+        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
+        `(batch_size, num_heads, sequence_length, embed_size_per_head)`.
+
+        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
+        `past_key_values` input) to speed up sequential decoding.
+    image_hidden_states (`torch.FloatTensor`, *optional*):
+        A `torch.FloatTensor` of size `(num_images, image_length, hidden_size)`: the vision encoder's last hidden
+        state after the multimodal projector. It is `None` when no `pixel_values` were passed to the model.
+    """
+
+    loss: Optional[torch.FloatTensor] = None
+    logits: Optional[torch.FloatTensor] = None
+    past_key_values: Optional[list[torch.FloatTensor]] = None
+    hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    attentions: Optional[tuple[torch.FloatTensor]] = None
+    image_hidden_states: Optional[torch.FloatTensor] = None
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Base class for GotOcr2 outputs, with hidden states and attentions.
+    """
+)
+class GotOcr2ModelOutputWithPast(BaseModelOutputWithPast):
+    r"""
+    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
+        `(batch_size, num_heads, sequence_length, embed_size_per_head)`.
+
+        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
+        `past_key_values` input) to speed up sequential decoding.
+    image_hidden_states (`torch.FloatTensor`, *optional*):
+        A `torch.FloatTensor` of size `(num_images, image_length, hidden_size)`: the vision encoder's last hidden
+        state after the multimodal projector. It is `None` when no `pixel_values` were passed to the model.
+    """
+
+    image_hidden_states: Optional[torch.FloatTensor] = None
+
+
+@auto_docstring(
+    custom_intro="""
+    The GotOcr2 model which consists of a vision backbone and a language model, without a language modeling head.
+    """
+)
+class GotOcr2Model(GotOcr2PreTrainedModel):
+    _checkpoint_conversion_mapping = {"language_model.model": "language_model"}
+
+    def __init__(self, config: GotOcr2Config):
+        super().__init__(config)
+        self.vision_tower = GotOcr2VisionEncoder(config.vision_config)
+
+        self.multi_modal_projector = GotOcr2MultiModalProjector(config)
+        self.language_model = AutoModel.from_config(config.text_config)
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.language_model.get_input_embeddings()
+
+    def set_input_embeddings(self, value):
+        self.language_model.set_input_embeddings(value)
+
+    def set_decoder(self, decoder):
+        self.language_model = decoder
+
+    def get_decoder(self):
+        return self.language_model
+
+    def get_image_features(
+        self,
+        pixel_values: torch.FloatTensor,
+    ) -> torch.Tensor:
+        """
+        Obtains image last hidden states from the vision tower and applies the multimodal projection.
+
+        Args:
+            pixel_values (`torch.FloatTensor` of shape `(batch_size, channels, height, width)`)
+        Returns:
+            image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`.
+        """
+        image_outputs = self.vision_tower(pixel_values).last_hidden_state
+        return self.multi_modal_projector(image_outputs)
+
+    def get_placeholder_mask(
+        self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor, image_features: torch.FloatTensor
+    ):
+        """
+        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
+        equal to the length of multimodal features. If the lengths are different, an error is raised.
+        """
+        if input_ids is None:
+            special_image_mask = inputs_embeds == self.get_input_embeddings()(
+                torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device)
+            )
+            special_image_mask = special_image_mask.all(-1)  # match only when every embedding dimension is equal
+        else:
+            special_image_mask = input_ids == self.config.image_token_id
+        # The per-token mask is expanded over the embedding dimension so it can select entries of `inputs_embeds`.
+        n_image_tokens = special_image_mask.sum()
+        special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
+        n_image_features = image_features.shape[0] * image_features.shape[1]
+        if inputs_embeds[special_image_mask].numel() != image_features.numel():
+            raise ValueError(
+                f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
+            )
+        return special_image_mask
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> Union[tuple, GotOcr2ModelOutputWithPast]:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        # NOTE(review): `return_dict` is not read again below; the language model is called with return_dict=True.
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+        if inputs_embeds is None:
+            inputs_embeds = self.get_input_embeddings()(input_ids)
+        # Replace the embeddings at image-placeholder positions with the projected image features.
+        if pixel_values is not None:
+            image_features = self.get_image_features(pixel_values=pixel_values.to(inputs_embeds.dtype))
+            image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
+            special_image_mask = self.get_placeholder_mask(
+                input_ids, inputs_embeds=inputs_embeds, image_features=image_features
+            )
+            inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
+
+        outputs = self.language_model(
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=True,
+            cache_position=cache_position,
+            **kwargs,
+        )
+
+        return GotOcr2ModelOutputWithPast(
+            last_hidden_state=outputs.last_hidden_state,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            image_hidden_states=image_features if pixel_values is not None else None,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    The GOT_OCR2 model which consists of a vision backbone and a language model.
+    """
+)
+class GotOcr2ForConditionalGeneration(GotOcr2PreTrainedModel, GenerationMixin):
+    _checkpoint_conversion_mapping = {
+        "^language_model.model": "model.language_model",
+        "^vision_tower": "model.vision_tower",
+        "^multi_modal_projector": "model.multi_modal_projector",
+        "^language_model.lm_head": "lm_head",
+    }
+    _tied_weights_keys = ["lm_head.weight"]
+
+    def __init__(self, config: GotOcr2Config):
+        super().__init__(config)
+        self.model = GotOcr2Model(config)
+        self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.model.get_input_embeddings()
+
+    def set_input_embeddings(self, value):
+        self.model.set_input_embeddings(value)
+
+    def get_output_embeddings(self) -> nn.Module:
+        return self.lm_head
+
+    def set_decoder(self, decoder):
+        self.model.set_decoder(decoder)
+
+    def get_decoder(self):
+        return self.model.get_decoder()
+
+    def get_image_features(
+        self,
+        pixel_values: torch.FloatTensor,
+        vision_feature_layer: Optional[Union[int, list[int]]] = None,
+        vision_feature_select_strategy: Optional[str] = None,
+        **kwargs,
+    ):
+        return self.model.get_image_features(
+            pixel_values=pixel_values,
+            vision_feature_layer=vision_feature_layer,
+            vision_feature_select_strategy=vision_feature_select_strategy,
+            **kwargs,
+        )
+
+    # Make modules available through conditional class for BC
+    @property
+    def language_model(self):
+        return self.model.language_model
+
+    @property
+    def vision_tower(self):
+        return self.model.vision_tower
+
+    @property
+    def multi_modal_projector(self):
+        return self.model.multi_modal_projector
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        pixel_values: torch.FloatTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> Union[tuple, GotOcr2CausalLMOutputWithPast]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+        Example:
+
+        ```python
+        >>> from PIL import Image
+        >>> import requests
+        >>> from transformers import AutoProcessor, GotOcr2ForConditionalGeneration, TextStreamer
+
+        >>> model = GotOcr2ForConditionalGeneration.from_pretrained("stepfun-ai/GOT-OCR-2.0-hf").to("cuda")
+        >>> processor = AutoProcessor.from_pretrained("stepfun-ai/GOT-OCR-2.0-hf")
+
+        >>> url = "https://huggingface.co/datasets/hf-internal-testing/fixtures_got_ocr/resolve/main/multi_box.png"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> inputs = processor(image, return_tensors="pt", color="green").to("cuda")
+
+        >>> # Generate
+        >>> streamer = TextStreamer(processor.tokenizer, skip_prompt=True, skip_special_tokens=True)
+        >>> generate_ids = model.generate(
+        ...     **inputs,
+        ...     do_sample=False,
+        ...     tokenizer = processor.tokenizer,
+        ...     stop_strings='<|im_end|>',
+        ...     streamer=streamer,
+        ...     max_new_tokens=4096,
+        ... )
+        "You should keep in mind what features from the module should be used, especially
+        when you're planning to sell a template."
+        ```"""
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.model(
+            input_ids=input_ids,
+            pixel_values=pixel_values,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=True,
+            cache_position=cache_position,
+            logits_to_keep=logits_to_keep,
+            **kwargs,
+        )
+
+        hidden_states = outputs[0]
+        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
+        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+        logits = self.lm_head(hidden_states[:, slice_indices, :])
+
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(
+                logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **kwargs
+            )
+
+        return GotOcr2CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            image_hidden_states=outputs.image_hidden_states,
+        )
+
+    def prepare_inputs_for_generation(
+        self,
+        input_ids,
+        past_key_values=None,
+        inputs_embeds=None,
+        pixel_values=None,
+        attention_mask=None,
+        cache_position=None,
+        logits_to_keep=None,
+        **kwargs,
+    ):
+        # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
+
+        model_inputs = super().prepare_inputs_for_generation(
+            input_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            cache_position=cache_position,
+            logits_to_keep=logits_to_keep,
+            **kwargs,
+        )
+
+        if cache_position[0] == 0:
+            # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore
+            # Otherwise we need pixel values to be passed to model
+            model_inputs["pixel_values"] = pixel_values
+
+        return model_inputs
+
+
+__all__ = ["GotOcr2PreTrainedModel", "GotOcr2Model", "GotOcr2ForConditionalGeneration"]
diff --git a/phivenv/Lib/site-packages/transformers/models/got_ocr2/modular_got_ocr2.py b/phivenv/Lib/site-packages/transformers/models/got_ocr2/modular_got_ocr2.py
new file mode 100644
index 0000000000000000000000000000000000000000..e51e4b12c79878179fd7eaacf7a8734153e51d8d
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/got_ocr2/modular_got_ocr2.py
@@ -0,0 +1,484 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from typing import Optional, Union
+
+import torch
+import torch.nn as nn
+
+from ...cache_utils import Cache
+from ...configuration_utils import PretrainedConfig
+from ...modeling_flash_attention_utils import FlashAttentionKwargs
+from ...modeling_utils import PreTrainedModel
+from ...processing_utils import Unpack
+from ...utils import auto_docstring, can_return_tuple, logging
+from ..auto import CONFIG_MAPPING, AutoConfig
+from ..llava.modeling_llava import (
+    LlavaCausalLMOutputWithPast,
+    LlavaForConditionalGeneration,
+    LlavaModel,
+    LlavaModelOutputWithPast,
+    LlavaPreTrainedModel,
+    TransformersKwargs,
+)
+from ..sam.modeling_sam import (
+    SamMLPBlock,
+    SamPreTrainedModel,
+    SamVisionAttention,
+    SamVisionEncoder,
+    SamVisionLayer,
+)
+
+
+logger = logging.get_logger(__name__)
+
+
+class GotOcr2VisionConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`GotOcr2VisionModel`]. It is used to instantiate a GOT_OCR2
+    vision encoder according to the specified arguments, defining the model architecture. Instantiating a configuration
+    defaults will yield a similar configuration to that of the SAM ViT-h
+    [facebook/sam-vit-huge](https://huggingface.co/facebook/sam-vit-huge) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        hidden_size (`int`, *optional*, defaults to 768):
+            Dimensionality of the encoder layers and the pooler layer.
+        output_channels (`int`, *optional*, defaults to 256):
+            Dimensionality of the output channels in the Patch Encoder.
+        num_hidden_layers (`int`, *optional*, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 12):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        num_channels (`int`, *optional*, defaults to 3):
+            Number of channels in the input image.
+        image_size (`int`, *optional*, defaults to 1024):
+            Expected resolution. Target size of the resized input image.
+        patch_size (`int`, *optional*, defaults to 16):
+            Size of the patches to be extracted from the input image.
+        hidden_act (`str`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string)
+        layer_norm_eps (`float`, *optional*, defaults to 1e-06):
+            The epsilon used by the layer normalization layers.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        initializer_range (`float`, *optional*, defaults to 1e-10):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        qkv_bias (`bool`, *optional*, defaults to `True`):
+            Whether to add a bias to query, key, value projections.
+        use_abs_pos (`bool`, *optional*, defaults to `True`):
+            Whether to use absolute position embedding.
+        use_rel_pos (`bool`, *optional*, defaults to `True`):
+            Whether to use relative position embedding.
+        window_size (`int`, *optional*, defaults to 14):
+            Window size for relative position.
+        global_attn_indexes (`list[int]`, *optional*, defaults to `[2, 5, 8, 11]`):
+            The indexes of the global attention layers.
+        mlp_dim (`int`, *optional*, defaults to 3072):
+            The dimensionality of the MLP layer in the Transformer encoder.
+    """
+
+    base_config_key = "vision_config"
+
+    def __init__(
+        self,
+        hidden_size=768,
+        output_channels=256,
+        num_hidden_layers=12,
+        num_attention_heads=12,
+        num_channels=3,
+        image_size=1024,
+        patch_size=16,
+        hidden_act="gelu",
+        layer_norm_eps=1e-06,
+        attention_dropout=0.0,
+        initializer_range=1e-10,
+        qkv_bias=True,
+        use_abs_pos=True,
+        use_rel_pos=True,
+        window_size=14,
+        global_attn_indexes=[2, 5, 8, 11],
+        mlp_dim=3072,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        self.hidden_size = hidden_size
+        self.output_channels = output_channels
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.num_channels = num_channels
+        self.image_size = image_size
+        self.patch_size = patch_size
+        self.hidden_act = hidden_act
+        self.layer_norm_eps = layer_norm_eps
+        self.attention_dropout = attention_dropout
+        self.initializer_range = initializer_range
+        self.qkv_bias = qkv_bias
+        self.use_abs_pos = use_abs_pos
+        self.use_rel_pos = use_rel_pos
+        self.window_size = window_size
+        self.global_attn_indexes = global_attn_indexes
+        self.mlp_dim = mlp_dim
+
+
+class GotOcr2Config(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`GotOcr2ForConditionalGeneration`]. It is used to instantiate a
+    GotOcr2 model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of GOT-OCR-2.0.
+
+    e.g [stepfun-ai/GOT-OCR-2.0-hf](https://huggingface.co/stepfun-ai/GOT-OCR-2.0-hf)
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vision_config (`Union[AutoConfig, dict]`,  *optional*, defaults to `GotOcr2VisionConfig`):
+            The config object or dictionary of the vision backbone.
+        text_config (`Union[AutoConfig, dict]`, *optional*, defaults to a `qwen2` config):
+            The config object or dictionary of the text backbone.
+        image_token_index (`int`, *optional*, defaults to 151859):
+            The image token index to encode the image prompt.
+        image_seq_length (`int`, *optional*, defaults to 576):
+            Sequence length of one image embedding.
+        pad_token_id (`int`, *optional*, defaults to -1):
+            Padding token id.
+
+    ```python
+    >>> from transformers import GotOcr2ForConditionalGeneration, GotOcr2Config
+
+    >>> # Initializing a GotOcr2 style configuration
+    >>> configuration = GotOcr2Config()
+
+    >>> # Initializing a model from the GotOcr2 style configuration
+    >>> model = GotOcr2ForConditionalGeneration(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "got_ocr2"
+    attribute_map = {
+        "image_token_id": "image_token_index",
+    }
+    sub_configs = {"text_config": AutoConfig, "vision_config": GotOcr2VisionConfig}
+
+    def __init__(
+        self,
+        vision_config=None,
+        text_config=None,
+        image_token_index=151859,
+        image_seq_length=576,
+        pad_token_id=-1,
+        **kwargs,
+    ):
+        self.image_token_index = image_token_index
+        self.image_seq_length = image_seq_length
+        self.pad_token_id = pad_token_id
+
+        if vision_config is None:
+            self.vision_config = GotOcr2VisionConfig()
+        elif isinstance(vision_config, dict):
+            self.vision_config = GotOcr2VisionConfig(**vision_config)
+        elif isinstance(vision_config, GotOcr2VisionConfig):
+            self.vision_config = vision_config
+        # NOTE(review): any other `vision_config` type falls through and leaves `self.vision_config` unset.
+        if isinstance(text_config, dict):
+            text_config["model_type"] = text_config.get("model_type", "qwen2")  # writes into the caller's dict
+            text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
+        elif text_config is None:
+            text_config = CONFIG_MAPPING["qwen2"](
+                vocab_size=151860,
+                hidden_size=1024,
+                intermediate_size=2816,
+                num_hidden_layers=24,
+                num_attention_heads=16,
+                num_key_value_heads=16,
+                hidden_act="silu",
+                max_position_embeddings=32768,
+                initializer_range=0.02,
+                rms_norm_eps=1e-6,
+                use_cache=True,
+                tie_word_embeddings=True,
+                rope_theta=1000000.0,
+                rope_scaling=None,
+                use_sliding_window=False,
+                sliding_window=4096,
+                max_window_layers=21,
+                attention_dropout=0.0,
+            )
+
+        self.text_config = text_config  # a non-dict, non-None `text_config` is stored as given
+
+        super().__init__(**kwargs)
+
+
+class GotOcr2MLPBlock(SamMLPBlock):
+    pass  # no overrides: identical to `SamMLPBlock`, exposed under a GotOcr2 name
+
+
+class GotOcr2VisionAttention(SamVisionAttention):
+    pass  # no overrides: identical to `SamVisionAttention`, exposed under a GotOcr2 name
+
+
+class GotOcr2VisionLayer(SamVisionLayer):
+    def __init__(self, config, window_size):
+        super().__init__(config, window_size)  # parent constructor runs first; attributes below are (re)assigned
+        self.layer_norm1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.attn = GotOcr2VisionAttention(config, window_size)  # GotOcr2-named attention subclass
+        self.layer_norm2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.mlp = GotOcr2MLPBlock(config)  # GotOcr2-named MLP subclass
+        self.window_size = window_size
+
+
+class GotOcr2PreTrainedModel(SamPreTrainedModel):
+    pass  # NOTE(review): `GotOcr2PreTrainedModel` is defined again later in this module (Llava-based)
+
+
+class GotOcr2VisionEncoder(SamVisionEncoder, GotOcr2PreTrainedModel):
+    pass  # no overrides; the second base is the `GotOcr2PreTrainedModel` bound at this point in the module
+
+
+class GotOcr2MultiModalProjector(nn.Module):
+    def __init__(self, config: GotOcr2Config):
+        super().__init__()
+        vision_output_channels = config.vision_config.output_channels
+        language_hidden_size = config.text_config.hidden_size
+        self.conv_upsampler1 = nn.Conv2d(  # stride 2 shrinks height/width (despite the name); channels double
+            vision_output_channels, vision_output_channels * 2, kernel_size=3, stride=2, padding=1, bias=False
+        )
+        self.conv_upsampler2 = nn.Conv2d(  # stride 2 again; output channels equal the language model width
+            vision_output_channels * 2, language_hidden_size, kernel_size=3, stride=2, padding=1, bias=False
+        )
+        self.multimodal_projector = nn.Linear(language_hidden_size, language_hidden_size)
+
+    def forward(self, vision_embeddings: torch.Tensor) -> torch.Tensor:
+        hidden_state = self.conv_upsampler1(vision_embeddings)
+        hidden_state = self.conv_upsampler2(hidden_state)
+        hidden_state = hidden_state.flatten(2).permute(0, 2, 1)  # (B, C, H, W) -> (B, H * W, C)
+        hidden_state = self.multimodal_projector(hidden_state)
+        return hidden_state
+
+
+class GotOcr2CausalLMOutputWithPast(LlavaCausalLMOutputWithPast):
+    pass  # no overrides: same fields as LlavaCausalLMOutputWithPast under a GotOcr2 name
+
+
+class GotOcr2ModelOutputWithPast(LlavaModelOutputWithPast):
+    pass  # no overrides: same fields as LlavaModelOutputWithPast under a GotOcr2 name
+
+
+class GotOcr2PreTrainedModel(LlavaPreTrainedModel):
+    _supports_flash_attn = False
+    _supports_sdpa = False
+    _supports_flex_attn = False
+
+    def _init_weights(self, module):
+        PreTrainedModel._init_weights(self, module)  # generic initialisation runs first
+        zeroed = ()  # positional parameters that are reset to zero afterwards
+        if isinstance(module, GotOcr2VisionAttention):
+            zeroed = (module.rel_pos_h, module.rel_pos_w) if module.use_rel_pos else ()
+        elif isinstance(module, GotOcr2VisionEncoder):
+            zeroed = () if module.pos_embed is None else (module.pos_embed,)
+        for parameter in zeroed:
+            parameter.data.zero_()
+
+
+class GotOcr2Model(LlavaModel):
+    def __init__(self, config: GotOcr2Config):
+        super().__init__(config)
+        self.vision_tower = GotOcr2VisionEncoder(config.vision_config)
+
+    def get_image_features(
+        self,
+        pixel_values: torch.FloatTensor,
+    ):
+        """
+        Run the vision tower on `pixel_values` and pass its last hidden state through the multimodal projector.
+
+        Args:
+            pixel_values (`torch.FloatTensor` of shape `(batch_size, channels, height, width)`)
+        Returns:
+            image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`.
+        """
+        vision_outputs = self.vision_tower(pixel_values)
+        return self.multi_modal_projector(vision_outputs.last_hidden_state)
+
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        pixel_values: torch.FloatTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> Union[tuple, GotOcr2ModelOutputWithPast]:
+        output_attentions = self.config.output_attentions if output_attentions is None else output_attentions
+        if output_hidden_states is None:
+            output_hidden_states = self.config.output_hidden_states
+        return_dict = self.config.use_return_dict if return_dict is None else return_dict
+
+        # Exactly one of `input_ids` / `inputs_embeds` has to be supplied.
+        if (input_ids is None) == (inputs_embeds is None):
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+        if inputs_embeds is None:
+            inputs_embeds = self.get_input_embeddings()(input_ids)
+
+        image_features = None
+        if pixel_values is not None:
+            # Encode the image in the dtype of the text embeddings, then move the result onto their device.
+            image_features = self.get_image_features(pixel_values=pixel_values.to(inputs_embeds.dtype))
+            image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
+            # Write the image features into the embedding positions selected by the placeholder mask.
+            mask = self.get_placeholder_mask(input_ids, inputs_embeds=inputs_embeds, image_features=image_features)
+            inputs_embeds = inputs_embeds.masked_scatter(mask, image_features)
+
+        outputs = self.language_model(
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=True,
+            cache_position=cache_position,
+            **kwargs,
+        )
+
+        return GotOcr2ModelOutputWithPast(
+            last_hidden_state=outputs.last_hidden_state,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            image_hidden_states=image_features,
+        )
+
+
+class GotOcr2ForConditionalGeneration(LlavaForConditionalGeneration):
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        pixel_values: torch.FloatTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> Union[tuple, GotOcr2CausalLMOutputWithPast]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+        Example:
+
+        ```python
+        >>> from PIL import Image
+        >>> import requests
+        >>> from transformers import AutoProcessor, GotOcr2ForConditionalGeneration, TextStreamer
+
+        >>> model = GotOcr2ForConditionalGeneration.from_pretrained("stepfun-ai/GOT-OCR-2.0-hf").to("cuda")
+        >>> processor = AutoProcessor.from_pretrained("stepfun-ai/GOT-OCR-2.0-hf")
+
+        >>> url = "https://huggingface.co/datasets/hf-internal-testing/fixtures_got_ocr/resolve/main/multi_box.png"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> inputs = processor(image, return_tensors="pt", color="green").to("cuda")
+
+        >>> # Generate
+        >>> streamer = TextStreamer(processor.tokenizer, skip_prompt=True, skip_special_tokens=True)
+        >>> generate_ids = model.generate(
+        ...     **inputs,
+        ...     do_sample=False,
+        ...     tokenizer = processor.tokenizer,
+        ...     stop_strings='<|im_end|>',
+        ...     streamer=streamer,
+        ...     max_new_tokens=4096,
+        ... )
+        "You should keep in mind what features from the module should be used, especially
+        when you're planning to sell a template."
+        ```"""
+        output_attentions = self.config.output_attentions if output_attentions is None else output_attentions
+        if output_hidden_states is None:
+            output_hidden_states = self.config.output_hidden_states
+        return_dict = self.config.use_return_dict if return_dict is None else return_dict
+
+        outputs = self.model(
+            input_ids=input_ids,
+            pixel_values=pixel_values,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=True,
+            cache_position=cache_position,
+            logits_to_keep=logits_to_keep,
+            **kwargs,
+        )
+
+        # An int keeps that many trailing positions (0 keeps them all); anything else is used as the index itself.
+        if isinstance(logits_to_keep, int):
+            kept_positions = slice(-logits_to_keep, None)
+        else:
+            kept_positions = logits_to_keep
+        logits = self.lm_head(outputs[0][:, kept_positions, :])
+
+        loss = None
+        if labels is not None:
+            vocab_size = self.config.text_config.vocab_size
+            loss = self.loss_function(logits=logits, labels=labels, vocab_size=vocab_size, **kwargs)
+
+        return GotOcr2CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            image_hidden_states=outputs.image_hidden_states,
+        )
+
+
+__all__ = [  # names exported by a star-import of this module
+    "GotOcr2VisionConfig",
+    "GotOcr2Config",
+    "GotOcr2PreTrainedModel",
+    "GotOcr2Model",
+    "GotOcr2ForConditionalGeneration",
+]
diff --git a/phivenv/Lib/site-packages/transformers/models/got_ocr2/processing_got_ocr2.py b/phivenv/Lib/site-packages/transformers/models/got_ocr2/processing_got_ocr2.py
new file mode 100644
index 0000000000000000000000000000000000000000..22a08ed8c78f8192ef26faaaa5724d1f02882daa
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/got_ocr2/processing_got_ocr2.py
@@ -0,0 +1,261 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from typing import Optional, Union
+
+import numpy as np
+
+from transformers.processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin, TextKwargs, Unpack
+from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
+
+from ...image_processing_utils import BatchFeature
+from ...image_utils import ImageInput
+from ...utils import is_vision_available, logging
+
+
+if is_vision_available():
+    from ...image_utils import load_images
+
+logger = logging.get_logger(__name__)
+
+
+class GotOcr2TextKwargs(TextKwargs, total=False):
+    format: Optional[bool]  # when truthy, the default OCR query gets the " with format" suffix
+
+
+class GotOcr2ImagesKwargs(ImagesKwargs, total=False):
+    box: Optional[Union[list, tuple[float, float], tuple[float, float, float, float]]]  # put into the query
+    color: Optional[str]  # put into the default query as "[color] "
+    num_image_tokens: Optional[int]  # number of `<imgpad>` tokens written into the prompt per image patch
+    multi_page: Optional[bool]  # images are handled as groups of pages; adds " across multi pages"
+    crop_to_patches: Optional[bool]  # left in the kwargs given to the image processor; also alters the query
+    min_patches: Optional[int]  # left in the kwargs given to the image processor
+    max_patches: Optional[int]  # left in the kwargs given to the image processor
+
+
+class GotOcr2ProcessorKwargs(ProcessingKwargs, total=False):
+    text_kwargs: GotOcr2TextKwargs
+    images_kwargs: GotOcr2ImagesKwargs
+    _defaults = {  # NOTE(review): presumably merged in by `_merge_kwargs` for keys the caller omits
+        "text_kwargs": {
+            "padding": False,
+            "format": False,
+        },
+        "images_kwargs": {  # `box` / `color` have no entry here: `__call__` pops them with its own fallbacks
+            "num_image_tokens": 256,
+            "multi_page": False,
+            "crop_to_patches": False,
+            "min_patches": 1,
+            "max_patches": 12,
+        },
+    }
+
+
+def preprocess_box_annotation(box: Union[list, tuple], image_size: tuple[int, int]) -> list:
+    """
+    Rescale a pixel-space box [x1, y1, x2, y2] to integers on a 0-1000 scale (1000 = full width / height).
+
+    `image_size` is `(width, height)`. A new list is returned and `box` itself is never modified, so tuples are
+    accepted and a box object shared between several images is not rescaled more than once.
+    """
+    width, height = image_size
+    if len(box) != 4:
+        raise ValueError("Box must be a list or tuple of lists in the form [x1, y1, x2, y2].")
+
+    # x coordinates (even positions) are divided by the width, y coordinates (odd positions) by the height
+    scales = (width, height, width, height)
+    return [int(coordinate / scale * 1000) for coordinate, scale in zip(box, scales)]
+
+
+class GotOcr2Processor(ProcessorMixin):
+    r"""
+    Constructs a GotOcr2 processor which wraps a [`GotOcr2ImageProcessor`] and
+    [`PreTrainedTokenizerFast`] tokenizer into a single processor that inherits both the image processor and
+    tokenizer functionalities. See the [`~GotOcr2Processor.__call__`] and [`~GotOcr2Processor.decode`] for more information.
+    Args:
+        image_processor ([`GotOcr2ImageProcessor`], *optional*):
+            The image processor is a required input.
+        tokenizer ([`PreTrainedTokenizer`, `PreTrainedTokenizerFast`], *optional*):
+            The tokenizer is a required input.
+        chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
+            in a chat into a tokenizable string.
+    """
+
+    attributes = ["image_processor", "tokenizer"]
+    image_processor_class = "AutoImageProcessor"
+    tokenizer_class = "PreTrainedTokenizerFast"
+
+    def __init__(self, image_processor=None, tokenizer=None, chat_template=None, **kwargs):
+        super().__init__(image_processor, tokenizer, chat_template=chat_template)
+
+        self.message_start_token = "<|im_start|>"
+        self.message_end_token = "<|im_end|>"
+        self.img_start_token = "<img>"
+        self.img_end_token = "</img>"
+        self.img_pad_token = "<imgpad>"
+        self.image_token = "<imgpad>"  # keep the above for BC, but we need to call it `image_token`
+        self.image_token_id = tokenizer.convert_tokens_to_ids(self.image_token)
+        self.system_query = "system\nYou should follow the instructions carefully and explain your answers in detail."
+
+    def _make_list_of_inputs(self, images, text, box, color, multi_page):
+        """
+        Normalise the call arguments to lists: `images` becomes a list of images (a list of page groups when
+        `multi_page` is set), a single `text` string becomes a one-element list, and a single `box` / `color`
+        is repeated once per image (or page group).
+        """
+        if not isinstance(images, (list, tuple)):
+            images = [images]
+            if multi_page:
+                logger.warning("Multi-page inference is enabled but only one image is passed.")
+                images = [images]
+        elif isinstance(images[0], (list, tuple)) and not multi_page:
+            raise ValueError("Nested images are only supported with `multi_page` set to `True`.")
+        elif not isinstance(images[0], (list, tuple)) and multi_page:
+            images = [images]
+
+        if isinstance(text, str):
+            text = [text]
+
+        if not isinstance(box[0], (list, tuple)):
+            # Use the same box for all images
+            box = [box for _ in range(len(images))]
+        if not isinstance(color, (list, tuple)):
+            color = [color for _ in range(len(images))]
+
+        return images, text, box, color
+
+    def __call__(
+        self,
+        images: Optional[ImageInput] = None,
+        text: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] = None,
+        audio=None,
+        videos=None,
+        **kwargs: Unpack[GotOcr2ProcessorKwargs],
+    ) -> BatchFeature:
+        """
+        Main method to prepare for the model one or several sequence(s) and image(s). This method forwards the `text`
+        and `kwargs` arguments to PreTrainedTokenizerFast's [`~PreTrainedTokenizerFast.__call__`] to encode the text if `text`
+        is not `None`, otherwise encode default OCR queries which depends on the `format`, `box`, `color`, `multi_page` and
+        `crop_to_patches` arguments. To prepare the vision inputs, this method forwards the `images` and `kwargs` arguments to
+        GotOcr2ImageProcessor's [`~GotOcr2ImageProcessor.__call__`] if `images` is not `None`.
+
+        Args:
+            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`):
+                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
+                tensor. Both channels-first and channels-last formats are supported.
+            text (`str`, `list[str]`, `list[list[str]]`):
+                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
+                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
+                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
+            format (`bool`, *optional*):
+                If set, will add the format token to the query, and the model will return the OCR result with formatting.
+            box (`list[float]`, `list[tuple[float, float]]`, `list[tuple[float, float, float, float]]`, *optional*):
+                The box annotation to be added to the query. If a list of floats or a tuple of floats is provided, it
+                will be interpreted as [x1, y1, x2, y2]. If a list of tuples is provided, each tuple should be in the
+                form (x1, y1, x2, y2).
+            color (`str`, *optional*):
+                The color annotation to be added to the query. The model will return the OCR result within the box with
+                the specified color.
+            multi_page (`bool`, *optional*):
+                If set, will enable multi-page inference. The model will return the OCR result across multiple pages.
+            crop_to_patches (`bool`, *optional*):
+                If set, will crop the image to patches. The model will return the OCR result upon the patch reference.
+            min_patches (`int`, *optional*):
+                The minimum number of patches to be cropped from the image. Only used when `crop_to_patches` is set to
+                `True`.
+            max_patches (`int`, *optional*):
+                The maximum number of patches to be cropped from the image. Only used when `crop_to_patches` is set to
+                `True`.
+
+            return_tensors (`str` or [`~utils.TensorType`], *optional*):
+                If set, will return tensors of a particular framework. Acceptable values are:
+                - `'tf'`: Return TensorFlow `tf.constant` objects.
+                - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                - `'np'`: Return NumPy `np.ndarray` objects.
+                - `'jax'`: Return JAX `jnp.ndarray` objects.
+
+        Returns:
+            [`BatchFeature`]: A [`BatchFeature`] with the following fields:
+
+            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
+            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
+              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
+              `None`).
+            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
+        """
+
+        output_kwargs = self._merge_kwargs(
+            GotOcr2ProcessorKwargs,
+            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
+            **kwargs,
+        )
+        format_output = output_kwargs["text_kwargs"].pop("format")
+        num_image_tokens = output_kwargs["images_kwargs"].pop("num_image_tokens")
+        box = output_kwargs["images_kwargs"].pop("box", [None])
+        color = output_kwargs["images_kwargs"].pop("color", None)
+        multi_page = output_kwargs["images_kwargs"].pop("multi_page")
+
+        crop_to_patches = output_kwargs["images_kwargs"].get("crop_to_patches")
+        images, text, box, color = self._make_list_of_inputs(images, text, box, color, multi_page)
+        if multi_page:
+            # save the number of pages per batch
+            num_pages_per_batch = [len(image_group) for image_group in images]
+            # flatten the list of images
+            images = [image for image_group in images for image in image_group]
+        else:
+            num_pages_per_batch = [1 for _ in range(len(images))]
+        # Load images as we need to know the image size
+        images = load_images(images)
+        image_sizes = [image.size for image in images]
+        image_inputs = self.image_processor(images=images, **output_kwargs["images_kwargs"])
+        num_patches_array = image_inputs.pop("num_patches")
+        if text is None:
+            text = []
+            patch_indices = np.cumsum(num_pages_per_batch)
+            for index, (num_pages, box_single, color_single) in enumerate(zip(num_pages_per_batch, box, color)):
+                current_patch_index = patch_indices[index - 1] if index > 0 else 0
+                # sum the `num_patches` entries belonging to the pages of this group
+                num_patches = sum(num_patches_array[current_patch_index : current_patch_index + num_pages])
+                if box_single[0] is not None:
+                    # Scale a copy (a box shared by several images must not be rescaled twice) against the first
+                    # page of this group: `image_sizes` is flat, so `index` is the wrong offset once pages > 1.
+                    box_single = preprocess_box_annotation(list(box_single), image_sizes[current_patch_index])
+                query = (
+                    f"{f'[{color_single}] ' if color_single is not None else ''}"
+                    f"{str(box_single) if box_single[0] is not None else ''} "
+                    "OCR"
+                    f"{' with format' if format_output else ''}"
+                    f"{' across multi pages' if multi_page else ''}"
+                    f"{' upon the patch reference' if crop_to_patches else ''}"
+                    ": "
+                )
+                image_placeholder = self.img_pad_token * num_image_tokens * num_patches
+                prompt = (
+                    f"{self.message_start_token}{self.system_query}{self.message_end_token}"
+                    f"{self.message_start_token}user\n"
+                    f"{self.img_start_token}{image_placeholder}{self.img_end_token}\n"
+                    f"{query}{self.message_end_token}{self.message_start_token}assistant\n"
+                )
+                text.append(prompt)
+
+        return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
+        text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
+        self._check_special_mm_tokens(text, text_inputs, modalities=["image"])
+
+        return BatchFeature(data={**text_inputs, **image_inputs}, tensor_type=return_tensors)
+
+
+__all__ = ["GotOcr2Processor"]  # the only name exported by a star-import of this module
diff --git a/phivenv/Lib/site-packages/transformers/models/gpt2/__init__.py b/phivenv/Lib/site-packages/transformers/models/gpt2/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..f01899e668e3a86548db3f59c7f42d70746385ab
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/gpt2/__init__.py
@@ -0,0 +1,32 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:  # False at runtime: these star-imports are only seen by static type checkers
+    from .configuration_gpt2 import *
+    from .modeling_flax_gpt2 import *
+    from .modeling_gpt2 import *
+    from .modeling_tf_gpt2 import *
+    from .tokenization_gpt2 import *
+    from .tokenization_gpt2_fast import *
+    from .tokenization_gpt2_tf import *
+else:  # at runtime this package's `sys.modules` entry is replaced by a `_LazyModule` (last line)
+    import sys
+
+    _file = globals()["__file__"]  # path of this `__init__.py`
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/phivenv/Lib/site-packages/transformers/models/gpt2/__pycache__/__init__.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/gpt2/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2c0570990677bbfd5e9a49f6e63c85687c170b2d
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/gpt2/__pycache__/__init__.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/gpt2/__pycache__/configuration_gpt2.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/gpt2/__pycache__/configuration_gpt2.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0729aa59af131166c30412e7630beda2f9bd2480
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/gpt2/__pycache__/configuration_gpt2.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/gpt2/__pycache__/modeling_flax_gpt2.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/gpt2/__pycache__/modeling_flax_gpt2.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..20628772626c8edf47cf35a8f9693eb62e5ecce2
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/gpt2/__pycache__/modeling_flax_gpt2.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/gpt2/__pycache__/modeling_gpt2.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/gpt2/__pycache__/modeling_gpt2.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..55ff771e61f5584c7d1bb17740ef97134dd9e761
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/gpt2/__pycache__/modeling_gpt2.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/gpt2/__pycache__/modeling_tf_gpt2.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/gpt2/__pycache__/modeling_tf_gpt2.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..60bff23b9d0763450c1fca244b9d8ba7aaca6444
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/gpt2/__pycache__/modeling_tf_gpt2.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/gpt2/__pycache__/tokenization_gpt2.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/gpt2/__pycache__/tokenization_gpt2.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ada16ebfcc0216a59928e216526724892ee19779
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/gpt2/__pycache__/tokenization_gpt2.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/gpt2/__pycache__/tokenization_gpt2_fast.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/gpt2/__pycache__/tokenization_gpt2_fast.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d79f93134989277c9459d403b495e346339acd05
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/gpt2/__pycache__/tokenization_gpt2_fast.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/gpt2/__pycache__/tokenization_gpt2_tf.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/gpt2/__pycache__/tokenization_gpt2_tf.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1a7d3a831c84942b2c5b3397705484b2489cf5e7
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/gpt2/__pycache__/tokenization_gpt2_tf.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/gpt2/configuration_gpt2.py b/phivenv/Lib/site-packages/transformers/models/gpt2/configuration_gpt2.py
new file mode 100644
index 0000000000000000000000000000000000000000..db5151a2ba15635a7943744799b0689fc96790d3
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/gpt2/configuration_gpt2.py
@@ -0,0 +1,274 @@
+# coding=utf-8
+# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""OpenAI GPT-2 configuration"""
+
+from collections import OrderedDict
+from collections.abc import Mapping
+from typing import Any, Optional
+
+from ... import PreTrainedTokenizer, TensorType, is_torch_available
+from ...configuration_utils import PretrainedConfig
+from ...onnx import OnnxConfigWithPast, PatchingSpec
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)  # module-level logger, named after this module
+
+
+class GPT2Config(PretrainedConfig):
+    """
+    This is the configuration class to store the configuration of a [`GPT2Model`] or a [`TFGPT2Model`]. It is used to
+    instantiate a GPT-2 model according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of the GPT-2
+    [openai-community/gpt2](https://huggingface.co/openai-community/gpt2) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 50257):
+            Vocabulary size of the GPT-2 model. Defines the number of different tokens that can be represented by the
+            `input_ids` passed when calling [`GPT2Model`] or [`TFGPT2Model`].
+        n_positions (`int`, *optional*, defaults to 1024):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            just in case (e.g., 512 or 1024 or 2048).
+        n_embd (`int`, *optional*, defaults to 768):
+            Dimensionality of the embeddings and hidden states.
+        n_layer (`int`, *optional*, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        n_head (`int`, *optional*, defaults to 12):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        n_inner (`int`, *optional*):
+            Dimensionality of the inner feed-forward layers. `None` will set it to 4 times n_embd
+        activation_function (`str`, *optional*, defaults to `"gelu_new"`):
+            Activation function, to be selected in the list `["relu", "silu", "gelu", "tanh", "gelu_new"]`.
+        resid_pdrop (`float`, *optional*, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        embd_pdrop (`float`, *optional*, defaults to 0.1):
+            The dropout ratio for the embeddings.
+        attn_pdrop (`float`, *optional*, defaults to 0.1):
+            The dropout ratio for the attention.
+        layer_norm_epsilon (`float`, *optional*, defaults to 1e-05):
+            The epsilon to use in the layer normalization layers.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        summary_type (`string`, *optional*, defaults to `"cls_index"`):
+            Argument used when doing sequence summary, used in the models [`GPT2DoubleHeadsModel`] and
+            [`TFGPT2DoubleHeadsModel`].
+
+            Has to be one of the following options:
+
+                - `"last"`: Take the last token hidden state (like XLNet).
+                - `"first"`: Take the first token hidden state (like BERT).
+                - `"mean"`: Take the mean of all tokens hidden states.
+                - `"cls_index"`: Supply a Tensor of classification token position (like GPT/GPT-2).
+                - `"attn"`: Not implemented now, use multi-head attention.
+        summary_use_proj (`bool`, *optional*, defaults to `True`):
+            Argument used when doing sequence summary, used in the models [`GPT2DoubleHeadsModel`] and
+            [`TFGPT2DoubleHeadsModel`].
+
+            Whether or not to add a projection after the vector extraction.
+        summary_activation (`str`, *optional*):
+            Argument used when doing sequence summary. Used for the multiple choice head in
+            [`GPT2DoubleHeadsModel`].
+
+            Pass `"tanh"` for a tanh activation to the output, any other value will result in no activation.
+        summary_proj_to_labels (`bool`, *optional*, defaults to `True`):
+            Argument used when doing sequence summary, used in the models [`GPT2DoubleHeadsModel`] and
+            [`TFGPT2DoubleHeadsModel`].
+
+            Whether the projection outputs should have `config.num_labels` or `config.hidden_size` classes.
+        summary_first_dropout (`float`, *optional*, defaults to 0.1):
+            Argument used when doing sequence summary, used in the models [`GPT2DoubleHeadsModel`] and
+            [`TFGPT2DoubleHeadsModel`].
+
+            The dropout ratio to be used after the projection and activation.
+        scale_attn_weights (`bool`, *optional*, defaults to `True`):
+            Scale attention weights by dividing by sqrt(hidden_size).
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models).
+        bos_token_id (`int`, *optional*, defaults to 50256):
+            Id of the beginning of sentence token in the vocabulary.
+        eos_token_id (`int`, *optional*, defaults to 50256):
+            Id of the end of sentence token in the vocabulary.
+        scale_attn_by_inverse_layer_idx (`bool`, *optional*, defaults to `False`):
+            Whether to additionally scale attention weights by `1 / layer_idx + 1`.
+        reorder_and_upcast_attn (`bool`, *optional*, defaults to `False`):
+            Whether to scale keys (K) prior to computing attention (dot-product) and upcast attention
+            dot-product/softmax to float() when training with mixed precision.
+
+    Example:
+
+    ```python
+    >>> from transformers import GPT2Config, GPT2Model
+
+    >>> # Initializing a GPT2 configuration
+    >>> configuration = GPT2Config()
+
+    >>> # Initializing a model (with random weights) from the configuration
+    >>> model = GPT2Model(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "gpt2"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    attribute_map = {  # generic attribute name -> GPT-2 attribute name (presumably resolved by `PretrainedConfig`)
+        "hidden_size": "n_embd",
+        "max_position_embeddings": "n_positions",
+        "num_attention_heads": "n_head",
+        "num_hidden_layers": "n_layer",
+    }
+
+    def __init__(
+        self,
+        vocab_size=50257,
+        n_positions=1024,
+        n_embd=768,
+        n_layer=12,
+        n_head=12,
+        n_inner=None,
+        activation_function="gelu_new",
+        resid_pdrop=0.1,
+        embd_pdrop=0.1,
+        attn_pdrop=0.1,
+        layer_norm_epsilon=1e-5,
+        initializer_range=0.02,
+        summary_type="cls_index",
+        summary_use_proj=True,
+        summary_activation=None,
+        summary_proj_to_labels=True,
+        summary_first_dropout=0.1,
+        scale_attn_weights=True,
+        use_cache=True,
+        bos_token_id=50256,
+        eos_token_id=50256,
+        scale_attn_by_inverse_layer_idx=False,
+        reorder_and_upcast_attn=False,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.n_positions = n_positions
+        self.n_embd = n_embd
+        self.n_layer = n_layer
+        self.n_head = n_head
+        self.n_inner = n_inner
+        self.activation_function = activation_function
+        self.resid_pdrop = resid_pdrop
+        self.embd_pdrop = embd_pdrop
+        self.attn_pdrop = attn_pdrop
+        self.layer_norm_epsilon = layer_norm_epsilon
+        self.initializer_range = initializer_range
+        self.summary_type = summary_type
+        self.summary_use_proj = summary_use_proj
+        self.summary_activation = summary_activation
+        self.summary_first_dropout = summary_first_dropout
+        self.summary_proj_to_labels = summary_proj_to_labels
+        self.scale_attn_weights = scale_attn_weights
+        self.use_cache = use_cache
+        self.scale_attn_by_inverse_layer_idx = scale_attn_by_inverse_layer_idx
+        self.reorder_and_upcast_attn = reorder_and_upcast_attn
+
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
+        # The special-token ids and all remaining keyword arguments are also handed to the base-class initializer.
+        super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
+
+
+class GPT2OnnxConfig(OnnxConfigWithPast):
+    """ONNX export configuration for GPT-2, with optional past key/value inputs."""
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        task: str = "default",
+        patching_specs: Optional[list[PatchingSpec]] = None,
+        use_past: bool = False,
+    ):
+        super().__init__(config, task=task, patching_specs=patching_specs, use_past=use_past)
+        # TODO: how to do that better?
+        if not getattr(self._config, "pad_token_id", None):
+            self._config.pad_token_id = 0
+
+    @property
+    def inputs(self) -> Mapping[str, Mapping[int, str]]:
+        """Axis names of each model input, keyed by input name and then by axis index."""
+        axes = OrderedDict({"input_ids": {0: "batch", 1: "sequence"}})
+        if not self.use_past:
+            axes["attention_mask"] = {0: "batch", 1: "sequence"}
+            return axes
+        self.fill_with_past_key_values_(axes, direction="inputs")
+        axes["attention_mask"] = {0: "batch", 1: "past_sequence + sequence"}
+        return axes
+
+    @property
+    def num_layers(self) -> int:
+        """Number of layers, read from the config's `n_layer`."""
+        return self._config.n_layer
+
+    @property
+    def num_attention_heads(self) -> int:
+        """Number of attention heads, read from the config's `n_head`."""
+        return self._config.n_head
+
+    def generate_dummy_inputs(
+        self,
+        tokenizer: PreTrainedTokenizer,
+        batch_size: int = -1,
+        seq_length: int = -1,
+        is_pair: bool = False,
+        framework: Optional[TensorType] = None,
+    ) -> Mapping[str, Any]:
+        """Build dummy inputs in forward() order: `input_ids`, `past_key_values` (with `use_past`), `attention_mask`.
+
+        Raises:
+            ValueError: if `use_past` is set but PyTorch is not available.
+        """
+        # Method lookup starts after `OnnxConfigWithPast` in the MRO, so that class's own implementation is not used.
+        base_inputs = super(OnnxConfigWithPast, self).generate_dummy_inputs(
+            tokenizer, batch_size=batch_size, seq_length=seq_length, is_pair=is_pair, framework=framework
+        )
+        # The inputs must be ordered the way they appear in forward().
+        ordered_inputs = OrderedDict({"input_ids": base_inputs["input_ids"]})
+        if not self.use_past:
+            ordered_inputs["attention_mask"] = base_inputs["attention_mask"]
+            return ordered_inputs
+        if not is_torch_available():
+            raise ValueError("Cannot generate dummy past_keys inputs without PyTorch installed.")
+        import torch
+
+        batch, seqlen = base_inputs["input_ids"].shape
+        # Not using the same length for past_key_values
+        past_length = seqlen + 2
+        head_dim = self._config.hidden_size // self.num_attention_heads
+        past_shape = (batch, self.num_attention_heads, past_length, head_dim)
+        ordered_inputs["past_key_values"] = [
+            (torch.zeros(past_shape), torch.zeros(past_shape)) for _ in range(self.num_layers)
+        ]
+        # Extend the mask with ones covering the dummy past positions, keeping the mask's dtype.
+        mask = base_inputs["attention_mask"]
+        ordered_inputs["attention_mask"] = torch.cat([mask, torch.ones(batch, past_length, dtype=mask.dtype)], dim=1)
+        return ordered_inputs
+
+    @property
+    def default_onnx_opset(self) -> int:
+        """Default ONNX opset version used for this configuration (13)."""
+        return 13
+
+
+__all__ = ["GPT2Config", "GPT2OnnxConfig"]  # names exported by a star-import of this module
diff --git a/phivenv/Lib/site-packages/transformers/models/gpt2/modeling_flax_gpt2.py b/phivenv/Lib/site-packages/transformers/models/gpt2/modeling_flax_gpt2.py
new file mode 100644
index 0000000000000000000000000000000000000000..8e419217c5a3642ee27f6f3df87e1c27c0d5ac79
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/gpt2/modeling_flax_gpt2.py
@@ -0,0 +1,782 @@
+# coding=utf-8
+# Copyright 2021 The Google Flax Team Authors and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Any, Optional
+
+import flax.linen as nn
+import jax
+import jax.numpy as jnp
+from flax.core.frozen_dict import FrozenDict, freeze, unfreeze
+from flax.linen import combine_masks, make_causal_mask
+from flax.linen.attention import dot_product_attention_weights
+from flax.traverse_util import flatten_dict, unflatten_dict
+from jax import lax
+
+from ...modeling_flax_outputs import (
+    FlaxBaseModelOutputWithPastAndCrossAttentions,
+    FlaxCausalLMOutputWithCrossAttentions,
+)
+from ...modeling_flax_utils import ACT2FN, FlaxPreTrainedModel, append_call_sample_docstring
+from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging
+from .configuration_gpt2 import GPT2Config
+
+
+logger = logging.get_logger(__name__)  # module-level logger, named after this module
+
+_CHECKPOINT_FOR_DOC = "openai-community/gpt2"  # NOTE(review): presumably consumed by the docstring helpers; confirm
+_CONFIG_FOR_DOC = "GPT2Config"  # NOTE(review): same assumption as above; verify against the rest of the module
+
+
+GPT2_START_DOCSTRING = r"""
+
+    This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.)
+
+    This model is also a Flax Linen
+    [flax.nn.Module](https://flax.readthedocs.io/en/latest/_autosummary/flax.nn.module.html) subclass. Use it as a
+    regular Flax Module and refer to the Flax documentation for all matter related to general usage and behavior.
+
+    Finally, this model supports inherent JAX features such as:
+
+    - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
+    - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
+    - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
+    - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)
+
+    Parameters:
+        config ([`GPT2Config`]): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the model weights.
+        dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`):
+            The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on GPUs) and
+            `jax.numpy.bfloat16` (on TPUs).
+
+            This can be used to enable mixed-precision training or half-precision inference on GPUs or TPUs. If
+            specified all the computation will be performed with the given `dtype`.
+
+            **Note that this only specifies the dtype of the computation and does not influence the dtype of model
+            parameters.**
+
+            If you wish to change the dtype of the model parameters, see [`~FlaxPreTrainedModel.to_fp16`] and
+            [`~FlaxPreTrainedModel.to_bf16`].
+"""
+
+GPT2_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`numpy.ndarray` of shape `(batch_size, input_ids_length)`):
+            `input_ids_length` = `sequence_length`. Indices of input sequence tokens in the vocabulary.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        attention_mask (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
+        position_ids (`numpy.ndarray` of shape `(batch_size, input_ids_length)`, *optional*):
+            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+            config.max_position_embeddings - 1]`.
+        past_key_values (`dict[str, np.ndarray]`, *optional*, returned by `init_cache` or when passing previous `past_key_values`):
+            Dictionary of pre-computed hidden-states (key and values in the attention blocks) that can be used for fast
+            auto-regressive decoding. Pre-computed key and value hidden-states are of shape *[batch_size, max_length]*.
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+class FlaxConv1D(nn.Module):
+    features: int  # size of the output's last axis
+    use_bias: bool = True  # add a learned bias after the matrix product
+    dtype: Any = jnp.float32  # inputs, kernel and bias are cast to this dtype
+    precision: Any = None  # forwarded to `lax.dot_general`
+
+    @nn.compact
+    def __call__(self, inputs):
+        """Contract the last axis of `inputs` with a transposed `(features, in_dim)` kernel, then add the bias."""
+        x = jnp.asarray(inputs, self.dtype)
+        weight = self.param("kernel", jax.nn.initializers.normal(stddev=0.02), (self.features, x.shape[-1]))
+        weight = jnp.asarray(weight.transpose(), self.dtype)
+        out = lax.dot_general(x, weight, (((x.ndim - 1,), (0,)), ((), ())), precision=self.precision)
+        if not self.use_bias:
+            return out
+        bias = self.param("bias", jax.nn.initializers.zeros, (self.features,))
+        return out + jnp.asarray(bias, self.dtype)
+
+
+class FlaxGPT2Attention(nn.Module):  # GPT-2 attention layer, usable for self-attention or cross-attention
+    config: GPT2Config
+    dtype: jnp.dtype = jnp.float32
+    causal: bool = True  # build and apply a causal mask (see `setup` / `__call__`)
+    is_cross_attention: bool = False  # adds a separate query projection `q_attn` (see `setup`)
+
+    def setup(self):
+        config = self.config
+        self.embed_dim = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = self.embed_dim // self.num_heads
+
+        if self.is_cross_attention:
+            self.c_attn = FlaxConv1D(2 * self.embed_dim, dtype=self.dtype)  # fused key/value projection
+            self.q_attn = FlaxConv1D(self.embed_dim, dtype=self.dtype)  # query projection
+        else:
+            self.c_attn = FlaxConv1D(3 * self.embed_dim, dtype=self.dtype)  # fused query/key/value projection
+        self.c_proj = FlaxConv1D(self.embed_dim, dtype=self.dtype)  # output projection
+
+        self.resid_dropout = nn.Dropout(rate=config.resid_pdrop)
+
+        if self.causal:  # precompute the causal mask for `max_position_embeddings` positions
+            self.causal_mask = make_causal_mask(
+                jnp.ones((1, config.max_position_embeddings), dtype="bool"), dtype="bool"
+            )
+
+    def _split_heads(self, hidden_states):  # keep the first two dims, reshape the rest to (num_heads, head_dim)
+        return hidden_states.reshape(hidden_states.shape[:2] + (self.num_heads, self.head_dim))
+
+    def _merge_heads(self, hidden_states):  # keep the first two dims, reshape the rest to (embed_dim,)
+        return hidden_states.reshape(hidden_states.shape[:2] + (self.embed_dim,))
+
+    @nn.compact
+    def _concatenate_to_cache(self, key, value, query, attention_mask):
+        """
+        This function takes projected key, value states from a single input token and concatenates the states to cached
+        states from previous steps. This function is slightly adapted from the official Flax repository:
+        https://github.com/google/flax/blob/491ce18759622506588784b4fca0e4bf05f8c8cd/flax/linen/attention.py#L252
+        """
+        # detect if we're initializing by absence of existing cache data.
+        is_initialized = self.has_variable("cache", "cached_key")
+        cached_key = self.variable("cache", "cached_key", jnp.zeros, key.shape, key.dtype)
+        cached_value = self.variable("cache", "cached_value", jnp.zeros, value.shape, value.dtype)
+        cache_index = self.variable("cache", "cache_index", lambda: jnp.array(0, dtype=jnp.int32))
+
+        if is_initialized:  # the cache existed before this call: write the new states into it
+            *batch_dims, max_length, num_heads, depth_per_head = cached_key.value.shape
+            # update key, value caches with our new 1d spatial slices
+            cur_index = cache_index.value
+            indices = (0,) * len(batch_dims) + (cur_index, 0, 0)
+            key = lax.dynamic_update_slice(cached_key.value, key, indices)
+            value = lax.dynamic_update_slice(cached_value.value, value, indices)
+            cached_key.value = key
+            cached_value.value = value
+            num_updated_cache_vectors = query.shape[1]
+            cache_index.value = cache_index.value + num_updated_cache_vectors  # advance by the positions just written
+            # causal mask for cached decoder self-attention: our single query position should only attend to those key positions that have already been generated and cached, not the remaining zero elements.
+            pad_mask = jnp.broadcast_to(
+                jnp.arange(max_length) < cur_index + num_updated_cache_vectors,
+                tuple(batch_dims) + (1, num_updated_cache_vectors, max_length),
+            )
+            attention_mask = combine_masks(pad_mask, attention_mask)
+        return key, value, attention_mask
+
+    def __call__(
+        self,
+        hidden_states,
+        key_value_states: Optional[jnp.ndarray] = None,
+        attention_mask=None,
+        deterministic: bool = True,
+        init_cache: bool = False,
+        output_attentions: bool = False,
+    ):
+        # if key_value_states are provided this layer is used as a cross-attention layer
+        # for the decoder
+        is_cross_attention = key_value_states is not None
+        batch_size = hidden_states.shape[0]
+
+        if not is_cross_attention:
+            qkv_out = self.c_attn(hidden_states)
+            query, key, value = jnp.split(qkv_out, 3, axis=2)  # one fused projection, split into three equal parts
+        else:
+            q_out = self.q_attn(hidden_states)
+            (query,) = jnp.split(q_out, 1, axis=2)
+            kv_out = self.c_attn(key_value_states)  # keys and values come from `key_value_states`
+            key, value = jnp.split(kv_out, 2, axis=2)
+
+        query = self._split_heads(query)
+        key = self._split_heads(key)
+        value = self._split_heads(value)
+
+        query_length, key_length = query.shape[1], key.shape[1]
+
+        if self.causal:
+            if self.has_variable("cache", "cached_key"):
+                mask_shift = self.variables["cache"]["cache_index"]  # positions already written to the cache
+                max_decoder_length = self.variables["cache"]["cached_key"].shape[1]
+                causal_mask = lax.dynamic_slice(
+                    self.causal_mask, (0, 0, mask_shift, 0), (1, 1, query_length, max_decoder_length)
+                )
+            else:
+                causal_mask = self.causal_mask[:, :, :query_length, :key_length]
+            causal_mask = jnp.broadcast_to(causal_mask, (batch_size,) + causal_mask.shape[1:])
+
+        # combine masks if needed
+        if attention_mask is not None and self.causal:
+            attention_mask = jnp.broadcast_to(jnp.expand_dims(attention_mask, axis=(-3, -2)), causal_mask.shape)
+            attention_mask = combine_masks(attention_mask, causal_mask)
+        elif self.causal:
+            attention_mask = causal_mask
+        elif attention_mask is not None:
+            attention_mask = jnp.expand_dims(attention_mask, axis=(-3, -2))
+
+        dropout_rng = None
+        if not deterministic and self.config.attn_pdrop > 0.0:  # an rng is only drawn when dropout can be active
+            dropout_rng = self.make_rng("dropout")
+
+        # During fast autoregressive decoding, we feed one position at a time,
+        # and cache the keys and values step by step.
+        if self.causal and (self.has_variable("cache", "cached_key") or init_cache):
+            key, value, attention_mask = self._concatenate_to_cache(key, value, query, attention_mask)
+
+        # transform boolean mask into an additive float bias: 0.0 where allowed, finfo(dtype).min where masked
+        if attention_mask is not None:
+            attention_bias = lax.select(
+                attention_mask > 0,
+                jnp.full(attention_mask.shape, 0.0).astype(self.dtype),
+                jnp.full(attention_mask.shape, jnp.finfo(self.dtype).min).astype(self.dtype),
+            )
+        else:
+            attention_bias = None
+
+        # usual dot product attention
+        attn_weights = dot_product_attention_weights(
+            query,
+            key,
+            bias=attention_bias,
+            dropout_rng=dropout_rng,
+            dropout_rate=self.config.attn_pdrop,
+            deterministic=deterministic,
+            dtype=self.dtype,
+            precision=None,
+        )
+
+        attn_output = jnp.einsum("...hqk,...khd->...qhd", attn_weights, value)  # weight the values per head
+        attn_output = self._merge_heads(attn_output)
+        attn_output = self.c_proj(attn_output)
+        attn_output = self.resid_dropout(attn_output, deterministic=deterministic)
+
+        outputs = (attn_output, attn_weights) if output_attentions else (attn_output,)  # weights only on request
+        return outputs
+
+
+class FlaxGPT2MLP(nn.Module):
+    config: GPT2Config
+    intermediate_size: int
+    dtype: jnp.dtype = jnp.float32
+
+    def setup(self):
+        cfg = self.config  # project to `intermediate_size`, apply the activation, project back to the hidden size
+        self.c_fc = FlaxConv1D(self.intermediate_size, dtype=self.dtype)
+        self.c_proj = FlaxConv1D(cfg.hidden_size, dtype=self.dtype)
+        self.act = ACT2FN[cfg.activation_function]
+        self.dropout = nn.Dropout(rate=cfg.resid_pdrop)
+
+    def __call__(self, hidden_states, deterministic: bool = True):
+        """Feed-forward block: `c_fc` -> activation -> `c_proj` -> dropout (which receives `deterministic`)."""
+        expanded = self.c_fc(hidden_states)
+        activated = self.act(expanded)
+        projected = self.c_proj(activated)
+        return self.dropout(projected, deterministic=deterministic)
+
+
+class FlaxGPT2Block(nn.Module):
+    config: GPT2Config
+    dtype: jnp.dtype = jnp.float32
+
+    def setup(self):
+        cfg = self.config
+        norm_kwargs = {"epsilon": cfg.layer_norm_epsilon, "dtype": self.dtype}
+        # MLP width: `n_inner` when set, otherwise four times the hidden size.
+        inner_dim = 4 * cfg.hidden_size if cfg.n_inner is None else cfg.n_inner
+
+        self.ln_1 = nn.LayerNorm(**norm_kwargs)
+        self.attn = FlaxGPT2Attention(cfg, dtype=self.dtype)
+        self.ln_2 = nn.LayerNorm(**norm_kwargs)
+        if cfg.add_cross_attention:
+            # Non-causal cross-attention and its layer norm are only built when the config enables them.
+            self.crossattention = FlaxGPT2Attention(
+                config=cfg, dtype=self.dtype, causal=False, is_cross_attention=True
+            )
+            self.ln_cross_attn = nn.LayerNorm(**norm_kwargs)
+        self.mlp = FlaxGPT2MLP(cfg, inner_dim, dtype=self.dtype)
+
+    def __call__(
+        self,
+        hidden_states,
+        attention_mask=None,
+        encoder_hidden_states: Optional[jnp.ndarray] = None,
+        encoder_attention_mask: Optional[jnp.ndarray] = None,
+        deterministic: bool = True,
+        init_cache: bool = False,
+        output_attentions: bool = False,
+    ):
+        """Apply self-attention, optional cross-attention and the MLP, each on a layer-normed input plus a residual.
+
+        Args:
+            hidden_states: input to the block.
+            attention_mask: forwarded to the self-attention layer.
+            encoder_hidden_states: when not `None`, cross-attention runs with these as `key_value_states`.
+            encoder_attention_mask: forwarded to the cross-attention layer as its `attention_mask`.
+            deterministic: forwarded to the attention layers and the MLP.
+            init_cache: forwarded to the self-attention layer.
+            output_attentions: forwarded to both attention layers.
+
+        Returns:
+            Tuple of the new hidden states followed by whatever extra outputs the attention layers returned.
+
+        Raises:
+            ValueError: if `encoder_hidden_states` is given but the block has no `crossattention` layer.
+        """
+        self_attn_outputs = self.attn(
+            self.ln_1(hidden_states),
+            attention_mask=attention_mask,
+            deterministic=deterministic,
+            init_cache=init_cache,
+            output_attentions=output_attentions,
+        )
+        extra_outputs = self_attn_outputs[1:]  # attention weights, if the layer returned any
+        hidden_states = self_attn_outputs[0] + hidden_states  # residual connection
+
+        if encoder_hidden_states is not None:
+            if not hasattr(self, "crossattention"):
+                raise ValueError(
+                    f"If `encoder_hidden_states` are passed, {self} has to be instantiated with "
+                    "cross-attention layers by setting `config.add_cross_attention=True`"
+                )
+            cross_attn_outputs = self.crossattention(
+                self.ln_cross_attn(hidden_states),
+                key_value_states=encoder_hidden_states,
+                attention_mask=encoder_attention_mask,
+                deterministic=deterministic,
+                output_attentions=output_attentions,
+            )
+            hidden_states = hidden_states + cross_attn_outputs[0]  # residual connection
+            extra_outputs = extra_outputs + cross_attn_outputs[1:]
+
+        mlp_output = self.mlp(self.ln_2(hidden_states), deterministic=deterministic)
+        return (hidden_states + mlp_output,) + extra_outputs
+
+
+class FlaxGPT2PreTrainedModel(FlaxPreTrainedModel):
+    """
+    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+    models.
+    """
+
+    config_class = GPT2Config
+    base_model_prefix = "transformer"
+    module_class: nn.Module = None
+
+    def __init__(
+        self,
+        config: GPT2Config,
+        input_shape: tuple = (1, 1),
+        seed: int = 0,
+        dtype: jnp.dtype = jnp.float32,
+        _do_init: bool = True,
+        **kwargs,
+    ):
+        module = self.module_class(config=config, dtype=dtype, **kwargs)
+        super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init)
+
+    def init_weights(self, rng: jax.random.PRNGKey, input_shape: tuple, params: FrozenDict = None) -> FrozenDict:
+        """
+        Build the parameter tree for this model.
+
+        The module is initialized on dummy inputs of shape `input_shape`. When `params` is `None` the freshly
+        initialized parameters are returned as-is; otherwise only the entries listed in `self._missing_keys` are
+        copied from the fresh initialization into `params`, `self._missing_keys` is reset, and the merged tree is
+        returned frozen.
+        """
+        # dummy inputs used only to trace the module during initialization
+        input_ids = jnp.zeros(input_shape, dtype="i4")
+        attention_mask = jnp.ones_like(input_ids)
+        position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_shape)
+        params_rng, dropout_rng = jax.random.split(rng)
+        rngs = {"params": params_rng, "dropout": dropout_rng}
+
+        # positional arguments shared by both configurations
+        init_args = (input_ids, attention_mask, position_ids)
+        if self.config.add_cross_attention:
+            # cross-attention also needs encoder inputs; the decoder mask doubles as the dummy encoder mask
+            encoder_hidden_states = jnp.zeros(input_shape + (self.config.n_embd,))
+            init_args += (encoder_hidden_states, attention_mask)
+        random_params = self.module.init(rngs, *init_args, return_dict=False)["params"]
+
+        if params is None:
+            return random_params
+
+        # merge on flattened trees so nested entries can be addressed by a single key
+        flat_random = flatten_dict(unfreeze(random_params))
+        flat_params = flatten_dict(unfreeze(params))
+        for key in self._missing_keys:
+            flat_params[key] = flat_random[key]
+        self._missing_keys = set()
+        return freeze(unflatten_dict(flat_params))
+
+    def init_cache(self, batch_size, max_length):
+        r"""
+        Initialize the module with `init_cache=True` on dummy inputs and return its `"cache"` collection, unfrozen.
+        Args:
+            batch_size (`int`):
+                batch_size used for fast auto-regressive decoding. Defines the batch size of the initialized cache.
+            max_length (`int`):
+                maximum possible length for auto-regressive decoding. Defines the sequence length of the cache.
+        """
+        # init input variables to retrieve cache (only the "cache" collection of the result is returned)
+        input_ids = jnp.ones((batch_size, max_length))
+        attention_mask = jnp.ones_like(input_ids)
+        position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_ids.shape)
+
+        init_variables = self.module.init(
+            jax.random.PRNGKey(0), input_ids, attention_mask, position_ids, return_dict=False, init_cache=True
+        )
+        return unfreeze(init_variables["cache"])
+
+    @add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING)
+    def __call__(
+        self,
+        input_ids,
+        attention_mask=None,
+        position_ids=None,
+        encoder_hidden_states: Optional[jnp.ndarray] = None,
+        encoder_attention_mask: Optional[jnp.ndarray] = None,
+        params: Optional[dict] = None,
+        past_key_values: Optional[dict] = None,
+        dropout_rng: jax.random.PRNGKey = None,
+        train: bool = False,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ):  # fill in defaults, run the Flax module, and attach the updated cache when `past_key_values` was given
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.return_dict
+
+        if encoder_hidden_states is not None and encoder_attention_mask is None:
+            batch_size, sequence_length = encoder_hidden_states.shape[:2]
+            encoder_attention_mask = jnp.ones((batch_size, sequence_length))  # default: all-ones encoder mask
+
+        batch_size, sequence_length = input_ids.shape  # rebinds both names to the decoder input's shape
+
+        if position_ids is None:
+            if past_key_values is not None:
+                raise ValueError("Make sure to provide `position_ids` when passing `past_key_values`.")
+
+            position_ids = jnp.broadcast_to(jnp.arange(sequence_length)[None, :], (batch_size, sequence_length))
+
+        if attention_mask is None:
+            attention_mask = jnp.ones((batch_size, sequence_length))  # default: all-ones mask
+
+        # Handle any PRNG if needed
+        rngs = {}
+        if dropout_rng is not None:
+            rngs["dropout"] = dropout_rng
+
+        inputs = {"params": params or self.params}  # fall back to the model's own parameters
+
+        # When `past_key_values` is passed the cache already exists; mark the "cache" collection as mutable so the module can update it.
+        if past_key_values:  # NOTE(review): truthiness here, but `is not None` below - an empty dict would diverge
+            inputs["cache"] = past_key_values
+            mutable = ["cache"]
+        else:
+            mutable = False
+
+        outputs = self.module.apply(
+            inputs,
+            jnp.array(input_ids, dtype="i4"),
+            jnp.array(attention_mask, dtype="i4"),
+            jnp.array(position_ids, dtype="i4"),
+            encoder_hidden_states,
+            encoder_attention_mask,
+            not train,  # positional `deterministic` in the modules defined in this file
+            False,  # positional `init_cache` in the modules defined in this file
+            output_attentions,
+            output_hidden_states,
+            return_dict,
+            rngs=rngs,
+            mutable=mutable,
+        )
+
+        # add updated cache to model output
+        if past_key_values is not None and return_dict:
+            outputs, past_key_values = outputs  # with a mutable collection, apply's result is unpacked as a pair
+            outputs["past_key_values"] = unfreeze(past_key_values["cache"])
+            return outputs
+        elif past_key_values is not None and not return_dict:
+            outputs, past_key_values = outputs
+            outputs = outputs[:1] + (unfreeze(past_key_values["cache"]),) + outputs[1:]  # cache goes second
+
+        return outputs
+
+
+class FlaxGPT2BlockCollection(nn.Module):  # the stack of FlaxGPT2Block layers, applied one after another
+    config: GPT2Config
+    dtype: jnp.dtype = jnp.float32
+
+    def setup(self):
+        self.blocks = [  # one block per hidden layer, each named by its index
+            FlaxGPT2Block(self.config, name=str(i), dtype=self.dtype) for i in range(self.config.num_hidden_layers)
+        ]
+
+    def __call__(
+        self,
+        hidden_states,
+        attention_mask=None,
+        encoder_hidden_states: Optional[jnp.ndarray] = None,
+        encoder_attention_mask: Optional[jnp.ndarray] = None,
+        deterministic: bool = True,
+        init_cache: bool = False,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,  # accepted but not read anywhere in this method
+    ):
+        # accumulators stay `None` when the corresponding output was not requested
+        all_attentions = () if output_attentions else None
+        all_hidden_states = () if output_hidden_states else None
+        all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None
+
+        for block in self.blocks:
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)  # recorded before the block runs (its input)
+
+            layer_outputs = block(
+                hidden_states,
+                attention_mask,
+                encoder_hidden_states=encoder_hidden_states,
+                encoder_attention_mask=encoder_attention_mask,
+                deterministic=deterministic,
+                init_cache=init_cache,
+                output_attentions=output_attentions,
+            )
+            hidden_states = layer_outputs[0]
+
+            if output_attentions:
+                all_attentions += (layer_outputs[1],)
+
+                if encoder_hidden_states is not None:
+                    all_cross_attentions += (layer_outputs[2],)
+
+        # this contains possible `None` values - `FlaxGPT2Module` will filter them out
+        outputs = (hidden_states, all_hidden_states, all_attentions, all_cross_attentions)
+
+        return outputs  # always a 4-tuple, in the order shown above
+
+
+class FlaxGPT2Module(nn.Module):  # embeddings -> dropout -> block stack -> final layer norm
+    config: GPT2Config
+    dtype: jnp.dtype = jnp.float32
+
+    def setup(self):
+        self.embed_dim = self.config.hidden_size
+
+        self.wte = nn.Embed(  # embedding table indexed by `input_ids`
+            self.config.vocab_size,
+            self.embed_dim,
+            embedding_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
+            dtype=self.dtype,
+        )
+        self.wpe = nn.Embed(  # embedding table indexed by `position_ids`
+            self.config.max_position_embeddings,
+            self.embed_dim,
+            embedding_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
+            dtype=self.dtype,
+        )
+        self.dropout = nn.Dropout(rate=self.config.embd_pdrop)
+        self.h = FlaxGPT2BlockCollection(self.config, dtype=self.dtype)
+        self.ln_f = nn.LayerNorm(epsilon=self.config.layer_norm_epsilon, dtype=self.dtype)
+
+    def __call__(
+        self,
+        input_ids,
+        attention_mask,
+        position_ids,
+        encoder_hidden_states: Optional[jnp.ndarray] = None,
+        encoder_attention_mask: Optional[jnp.ndarray] = None,
+        deterministic=True,
+        init_cache: bool = False,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+    ):
+        input_embeds = self.wte(input_ids.astype("i4"))
+        position_embeds = self.wpe(position_ids.astype("i4"))
+
+        hidden_states = input_embeds + position_embeds
+        hidden_states = self.dropout(hidden_states, deterministic=deterministic)
+
+        outputs = self.h(  # 4-tuple: (hidden_states, all_hidden_states, all_attentions, all_cross_attentions)
+            hidden_states,
+            attention_mask,
+            encoder_hidden_states,
+            encoder_attention_mask,
+            deterministic=deterministic,
+            init_cache=init_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        hidden_states = outputs[0]
+        hidden_states = self.ln_f(hidden_states)
+
+        if output_hidden_states:
+            all_hidden_states = outputs[1] + (hidden_states,)  # append the final, layer-normed state
+            outputs = (hidden_states, all_hidden_states) + outputs[2:]
+        else:
+            outputs = (hidden_states,) + outputs[1:]
+
+        if not return_dict:
+            return tuple(v for v in outputs if v is not None)  # drop the outputs that were not requested
+
+        return FlaxBaseModelOutputWithPastAndCrossAttentions(
+            last_hidden_state=hidden_states,
+            hidden_states=outputs[1],
+            attentions=outputs[2],
+            cross_attentions=outputs[3],
+        )
+
+
+@add_start_docstrings(
+    "The bare GPT2 Model transformer outputting raw hidden-states without any specific head on top.",
+    GPT2_START_DOCSTRING,
+)
+class FlaxGPT2Model(FlaxGPT2PreTrainedModel):
+    module_class = FlaxGPT2Module  # the Flax module that FlaxGPT2PreTrainedModel.__init__ instantiates
+
+
+append_call_sample_docstring(
+    FlaxGPT2Model,
+    _CHECKPOINT_FOR_DOC,
+    FlaxBaseModelOutputWithPastAndCrossAttentions,
+    _CONFIG_FOR_DOC,
+)
+
+
+class FlaxGPT2LMHeadModule(nn.Module):  # FlaxGPT2Module followed by a bias-free dense head over the vocabulary
+    config: GPT2Config
+    dtype: jnp.dtype = jnp.float32
+
+    def setup(self):
+        self.transformer = FlaxGPT2Module(self.config, dtype=self.dtype)
+        self.lm_head = nn.Dense(
+            self.config.vocab_size,
+            use_bias=False,
+            dtype=self.dtype,
+            kernel_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
+        )
+
+    def __call__(
+        self,
+        input_ids,
+        attention_mask,
+        position_ids,
+        encoder_hidden_states: Optional[jnp.ndarray] = None,
+        encoder_attention_mask: Optional[jnp.ndarray] = None,
+        deterministic: bool = True,
+        init_cache: bool = False,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+    ):
+        outputs = self.transformer(
+            input_ids,
+            attention_mask,
+            position_ids,
+            encoder_hidden_states,
+            encoder_attention_mask,
+            deterministic=deterministic,
+            init_cache=init_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        hidden_states = outputs[0]
+
+        if self.config.tie_word_embeddings:
+            # tied weights: apply the head with the transposed `wte` embedding matrix as its kernel
+            shared_kernel = self.transformer.variables["params"]["wte"]["embedding"].T
+            lm_logits = self.lm_head.apply({"params": {"kernel": shared_kernel}}, hidden_states)
+        else:
+            lm_logits = self.lm_head(hidden_states)
+
+        if not return_dict:
+            return (lm_logits,) + outputs[1:]  # logits replace the hidden state in first position
+
+        return FlaxCausalLMOutputWithCrossAttentions(
+            logits=lm_logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            cross_attentions=outputs.cross_attentions,
+        )
+
+
+@add_start_docstrings(
+    """
+    The GPT2 Model transformer with a language modeling head on top (linear layer with weights tied to the input
+    embeddings).
+    """,
+    GPT2_START_DOCSTRING,
+)
+class FlaxGPT2LMHeadModel(FlaxGPT2PreTrainedModel):
+    module_class = FlaxGPT2LMHeadModule
+
+    def prepare_inputs_for_generation(self, input_ids, max_length, attention_mask: Optional[jax.Array] = None):
+        """Build the cache, a `max_length`-wide attention mask and the position ids for the first decoding step."""
+        batch_size, seq_length = input_ids.shape
+        cache = self.init_cache(batch_size, max_length)
+
+        # A single static all-ones mask of width `max_length` is enough: GPT2's causal mask already hides the
+        # positions beyond the current input, and a fixed shape is more efficient for compilation.
+        full_mask = jnp.ones((batch_size, max_length), dtype="i4")
+        if attention_mask is None:
+            position_ids = jnp.broadcast_to(jnp.arange(seq_length, dtype="i4")[None, :], (batch_size, seq_length))
+        else:
+            # derive positions from the mask, then copy the given mask into the start of the static one
+            position_ids = attention_mask.cumsum(axis=-1) - 1
+            full_mask = lax.dynamic_update_slice(full_mask, attention_mask.astype("i4"), (0, 0))
+
+        return {
+            "past_key_values": cache,
+            "attention_mask": full_mask,
+            "position_ids": position_ids,
+        }
+
+    def update_inputs_for_generation(self, model_outputs, model_kwargs):
+        """Carry the updated cache forward and advance `position_ids` to the next single position."""
+        model_kwargs["past_key_values"] = model_outputs.past_key_values
+        last_position = model_kwargs["position_ids"][:, -1:]
+        model_kwargs["position_ids"] = last_position + 1
+        return model_kwargs
+
+
+append_call_sample_docstring(
+    FlaxGPT2LMHeadModel,
+    _CHECKPOINT_FOR_DOC,
+    FlaxCausalLMOutputWithCrossAttentions,
+    _CONFIG_FOR_DOC,
+)
+
+
+__all__ = ["FlaxGPT2LMHeadModel", "FlaxGPT2Model", "FlaxGPT2PreTrainedModel"]
diff --git a/phivenv/Lib/site-packages/transformers/models/gpt2/modeling_gpt2.py b/phivenv/Lib/site-packages/transformers/models/gpt2/modeling_gpt2.py
new file mode 100644
index 0000000000000000000000000000000000000000..375479f19780a9cef59eb6f0cb9e764c4ded1814
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/gpt2/modeling_gpt2.py
@@ -0,0 +1,1640 @@
+# coding=utf-8
+# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch OpenAI GPT-2 model."""
+
+import math
+import os
+import warnings
+from dataclasses import dataclass
+from typing import Callable, Optional, Union
+
+import torch
+import torch.utils.checkpoint
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+
+from ...activations import ACT2FN, get_activation
+from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache
+from ...generation import GenerationMixin
+from ...masking_utils import create_causal_mask
+from ...modeling_attn_mask_utils import _prepare_4d_attention_mask_for_sdpa
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import (
+    BaseModelOutputWithPastAndCrossAttentions,
+    CausalLMOutputWithCrossAttentions,
+    QuestionAnsweringModelOutput,
+    SequenceClassifierOutputWithPast,
+    TokenClassifierOutput,
+)
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from ...pytorch_utils import Conv1D, find_pruneable_heads_and_indices, prune_conv1d_layer
+from ...utils import (
+    ModelOutput,
+    add_start_docstrings,
+    auto_docstring,
+    logging,
+)
+from ...utils.deprecation import deprecate_kwarg
+from ...utils.model_parallel_utils import assert_device_map, get_device_map
+from .configuration_gpt2 import GPT2Config
+
+
+logger = logging.get_logger(__name__)
+
+
+def load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path):
+    """Copy the variables of a TensorFlow GPT-2 checkpoint into `model` and return it.
+
+    Re-raises the `ImportError` (after logging) when TensorFlow is missing and raises `ValueError` on a shape
+    mismatch. `config` is accepted but not used by this function.
+    """
+    try:
+        import re
+
+        import tensorflow as tf
+    except ImportError:
+        logger.error(
+            "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
+            "https://www.tensorflow.org/install/ for installation instructions."
+        )
+        raise
+
+    tf_path = os.path.abspath(gpt2_checkpoint_path)
+    logger.info(f"Converting TensorFlow checkpoint from {tf_path}")
+    # First pass: read every variable from the checkpoint (squeezed), keeping checkpoint order
+    loaded = []
+    for name, shape in tf.train.list_variables(tf_path):
+        logger.info(f"Loading TF weight {name} with shape {shape}")
+        loaded.append((name, tf.train.load_variable(tf_path, name).squeeze()))
+
+    # Second pass: walk the module tree following each "/"-separated variable name
+    for name, array in loaded:
+        name = name[6:].split("/")  # skip "model/"
+        pointer = model
+        for m_name in name:
+            scope_names = re.split(r"(\d+)", m_name) if re.fullmatch(r"[A-Za-z]+\d+", m_name) else [m_name]
+            head = scope_names[0]
+            if head in ("w", "g"):
+                pointer = getattr(pointer, "weight")
+            elif head == "b":
+                pointer = getattr(pointer, "bias")
+            elif head in ("wpe", "wte"):
+                pointer = getattr(getattr(pointer, head), "weight")
+            else:
+                pointer = getattr(pointer, head)
+            # a numeric suffix (e.g. "h0") indexes into the object that was just resolved
+            if len(scope_names) >= 2:
+                pointer = pointer[int(scope_names[1])]
+        # refuse to copy when shapes disagree; both shapes are also appended to the exception's args
+        try:
+            if pointer.shape != array.shape:
+                raise ValueError(f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched")
+        except ValueError as e:
+            e.args += (pointer.shape, array.shape)
+            raise
+        logger.info(f"Initialize PyTorch weight {name}")
+        # swap the resolved tensor's data for the checkpoint array
+        pointer.data = torch.from_numpy(array)
+    return model
+
+
+def eager_attention_forward(module, query, key, value, attention_mask, head_mask=None, **kwargs):
+    attn_weights = torch.matmul(query, key.transpose(-1, -2))  # raw scores; `**kwargs` is accepted but unused
+
+    if module.scale_attn_weights:
+        attn_weights = attn_weights / torch.full(
+            [], value.size(-1) ** 0.5, dtype=attn_weights.dtype, device=attn_weights.device
+        )  # divide by the square root of value's last dimension
+
+    # Layer-wise attention scaling
+    if module.scale_attn_by_inverse_layer_idx:
+        attn_weights = attn_weights / float(module.layer_idx + 1)
+
+    if not module.is_cross_attention:
+        # if only "normal" attention layer implements causal mask
+        query_length, key_length = query.size(-2), key.size(-2)
+        causal_mask = module.bias[:, :, key_length - query_length : key_length, :key_length]  # last query rows
+        mask_value = torch.finfo(attn_weights.dtype).min
+        # Need to be a tensor, otherwise we get error: `RuntimeError: expected scalar type float but found double`.
+        # Need to be on the same device, otherwise `RuntimeError: ..., x and y to be on the same device`
+        mask_value = torch.full([], mask_value, dtype=attn_weights.dtype, device=attn_weights.device)
+        attn_weights = torch.where(causal_mask, attn_weights.to(attn_weights.dtype), mask_value)  # `.to` is a no-op
+
+    if attention_mask is not None:
+        # Apply the attention mask (added to the scores, trimmed to the key length)
+        causal_mask = attention_mask[:, :, :, : key.shape[-2]]
+        attn_weights = attn_weights + causal_mask
+
+    attn_weights = nn.functional.softmax(attn_weights, dim=-1)
+
+    # Downcast (if necessary) back to V's dtype (if in mixed-precision) -- No-Op otherwise
+    attn_weights = attn_weights.type(value.dtype)
+    attn_weights = module.attn_dropout(attn_weights)
+
+    # Mask heads if we want to
+    if head_mask is not None:
+        attn_weights = attn_weights * head_mask
+
+    attn_output = torch.matmul(attn_weights, value)
+    attn_output = attn_output.transpose(1, 2)  # presumably (batch, seq, heads, head_dim) afterwards - confirm
+
+    return attn_output, attn_weights
+
+
+class GPT2Attention(nn.Module):
+    def __init__(self, config, is_cross_attention=False, layer_idx=None):
+        super().__init__()
+        self.config = config
+        max_positions = config.max_position_embeddings
+        self.register_buffer(  # boolean lower-triangular mask, kept out of the state dict (persistent=False)
+            "bias",
+            torch.tril(torch.ones((max_positions, max_positions), dtype=torch.bool)).view(
+                1, 1, max_positions, max_positions
+            ),
+            persistent=False,
+        )
+        self.register_buffer("masked_bias", torch.tensor(-1e4), persistent=False)
+
+        self.embed_dim = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = self.embed_dim // self.num_heads
+        self.split_size = self.embed_dim  # chunk size used when splitting the fused `c_attn` output in forward
+        if self.head_dim * self.num_heads != self.embed_dim:
+            raise ValueError(
+                f"`embed_dim` must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
+                f" {self.num_heads})."
+            )
+
+        self.scale_attn_weights = config.scale_attn_weights
+        self.is_cross_attention = is_cross_attention
+
+        # Layer-wise attention scaling, reordering, and upcasting
+        self.scale_attn_by_inverse_layer_idx = config.scale_attn_by_inverse_layer_idx
+        self.layer_idx = layer_idx
+        self.reorder_and_upcast_attn = config.reorder_and_upcast_attn
+
+        if self.is_cross_attention:
+            self.c_attn = Conv1D(2 * self.embed_dim, self.embed_dim)  # forward splits this into key and value
+            self.q_attn = Conv1D(self.embed_dim, self.embed_dim)  # separate query projection
+        else:
+            self.c_attn = Conv1D(3 * self.embed_dim, self.embed_dim)  # forward splits this into query, key, value
+        self.c_proj = Conv1D(self.embed_dim, self.embed_dim)  # output projection
+
+        self.attn_dropout = nn.Dropout(config.attn_pdrop)
+        self.resid_dropout = nn.Dropout(config.resid_pdrop)
+        self.is_causal = True
+
+        self.pruned_heads = set()  # updated by `prune_heads`
+
+    def prune_heads(self, heads):  # prune the given heads from `c_attn`/`c_proj` and update the bookkeeping
+        if len(heads) == 0:
+            return
+        heads, index = find_pruneable_heads_and_indices(heads, self.num_heads, self.head_dim, self.pruned_heads)
+        index_attn = torch.cat([index, index + self.split_size, index + (2 * self.split_size)])
+        # NOTE(review): three offsets match the 3-way self-attention `c_attn`; cross-attention builds a 2-way one
+        # Prune conv1d layers
+        self.c_attn = prune_conv1d_layer(self.c_attn, index_attn, dim=1)
+        self.c_proj = prune_conv1d_layer(self.c_proj, index, dim=0)
+
+        # Update hyper params
+        self.split_size = (self.split_size // self.num_heads) * (self.num_heads - len(heads))
+        self.num_heads = self.num_heads - len(heads)
+        self.pruned_heads = self.pruned_heads.union(heads)
+
+    def _upcast_and_reordered_attn(self, query, key, value, attention_mask=None, head_mask=None):
+        # Use `torch.baddbmm` (a bit more efficient w/ alpha param for scaling -- from Megatron-LM)
+        bsz, num_heads, q_seq_len, dk = query.size()
+        _, _, k_seq_len, _ = key.size()
+
+        # Preallocate attn_weights for `baddbmm` (always float32, regardless of the input dtype)
+        attn_weights = torch.empty(bsz * num_heads, q_seq_len, k_seq_len, dtype=torch.float32, device=query.device)
+
+        # Compute Scale Factor
+        scale_factor = 1.0
+        if self.scale_attn_weights:
+            scale_factor /= float(value.size(-1)) ** 0.5
+
+        if self.scale_attn_by_inverse_layer_idx:
+            scale_factor /= float(self.layer_idx + 1)
+
+        # Upcast (turn off autocast) and reorder (Scale K by 1 / root(dk))
+        with torch.autocast(query.device.type, enabled=False):
+            q, k = query.reshape(-1, q_seq_len, dk), key.transpose(-1, -2).reshape(-1, dk, k_seq_len)
+            attn_weights = torch.baddbmm(attn_weights, q.float(), k.float(), beta=0, alpha=scale_factor)
+            attn_weights = attn_weights.reshape(bsz, num_heads, q_seq_len, k_seq_len)  # back to 4 dims
+
+        if not self.is_cross_attention:
+            # if only "normal" attention layer implements causal mask
+            query_length, key_length = query.size(-2), key.size(-2)
+            causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length]
+            mask_value = torch.finfo(attn_weights.dtype).min
+            # Need to be a tensor, otherwise we get error: `RuntimeError: expected scalar type float but found double`.
+            # Need to be on the same device, otherwise `RuntimeError: ..., x and y to be on the same device`
+            mask_value = torch.tensor(mask_value, dtype=attn_weights.dtype, device=attn_weights.device)
+            attn_weights = torch.where(causal_mask, attn_weights, mask_value)
+
+        if attention_mask is not None:
+            # Apply the attention mask (added to the scores)
+            attn_weights = attn_weights + attention_mask
+
+        attn_weights = nn.functional.softmax(attn_weights, dim=-1)
+
+        # Downcast (if necessary) back to V's dtype (if in mixed-precision) -- No-Op if otherwise
+        if attn_weights.dtype != torch.float32:
+            raise RuntimeError("Error with upcasting, attn_weights does not have dtype torch.float32")
+        attn_weights = attn_weights.type(value.dtype)
+        attn_weights = self.attn_dropout(attn_weights)
+
+        # Mask heads if we want to
+        if head_mask is not None:
+            attn_weights = attn_weights * head_mask
+
+        attn_output = torch.matmul(attn_weights, value)
+        attn_output = attn_output.transpose(1, 2)
+
+        return attn_output, attn_weights  # same pair as `eager_attention_forward` returns
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: Optional[tuple[torch.FloatTensor]],
+        past_key_values: Optional[Cache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = False,
+        **kwargs,
+    ) -> tuple[Union[torch.Tensor, tuple[torch.Tensor]], ...]:
+        is_cross_attention = encoder_hidden_states is not None  # passing encoder states selects cross-attention
+        if past_key_values is not None:
+            if isinstance(past_key_values, EncoderDecoderCache):
+                is_updated = past_key_values.is_updated.get(self.layer_idx)
+                if is_cross_attention:
+                    # after the first generated id, we can subsequently re-use all key/value_layer from cache
+                    curr_past_key_value = past_key_values.cross_attention_cache
+                else:
+                    curr_past_key_value = past_key_values.self_attention_cache
+            else:
+                curr_past_key_value = past_key_values  # NOTE(review): `is_updated` is left unbound on this path
+
+        if is_cross_attention:
+            if not hasattr(self, "q_attn"):
+                raise ValueError(
+                    "If class is used as cross attention, the weights `q_attn` have to be defined. "
+                    "Please make sure to instantiate class with `GPT2Attention(..., is_cross_attention=True)`."
+                )
+            query_states = self.q_attn(hidden_states)
+            attention_mask = encoder_attention_mask  # the encoder mask replaces the decoder mask from here on
+
+            # Try to get key/value states from cache if possible
+            if past_key_values is not None and is_updated:  # NOTE(review): reads `is_updated` (see note above)
+                key_states = curr_past_key_value.layers[self.layer_idx].keys
+                value_states = curr_past_key_value.layers[self.layer_idx].values
+            else:
+                key_states, value_states = self.c_attn(encoder_hidden_states).split(self.split_size, dim=2)
+                shape_kv = (*key_states.shape[:-1], -1, self.head_dim)
+                key_states = key_states.view(shape_kv).transpose(1, 2)
+                value_states = value_states.view(shape_kv).transpose(1, 2)
+        else:
+            query_states, key_states, value_states = self.c_attn(hidden_states).split(self.split_size, dim=2)
+            shape_kv = (*key_states.shape[:-1], -1, self.head_dim)  # split the last dim into (heads, head_dim)
+            key_states = key_states.view(shape_kv).transpose(1, 2)
+            value_states = value_states.view(shape_kv).transpose(1, 2)
+
+        shape_q = (*query_states.shape[:-1], -1, self.head_dim)
+        query_states = query_states.view(shape_q).transpose(1, 2)
+
+        if (past_key_values is not None and not is_cross_attention) or (
+            past_key_values is not None and is_cross_attention and not is_updated
+        ):
+            # save all key/value_layer to cache to be re-used for fast auto-regressive generation
+            cache_position = cache_position if not is_cross_attention else None
+            key_states, value_states = curr_past_key_value.update(
+                key_states, value_states, self.layer_idx, {"cache_position": cache_position}
+            )
+            # set flag that curr layer for cross-attn is already updated so we can re-use in subsequent calls
+            if is_cross_attention:
+                past_key_values.is_updated[self.layer_idx] = True
+
+        is_causal = attention_mask is None and query_states.shape[-2] > 1 and not is_cross_attention
+
+        using_eager = self.config._attn_implementation == "eager"
+        attention_interface: Callable = eager_attention_forward
+        if self.config._attn_implementation != "eager":
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+        if using_eager and self.reorder_and_upcast_attn:  # the upcast variant is only used on the eager path
+            attn_output, attn_weights = self._upcast_and_reordered_attn(
+                query_states, key_states, value_states, attention_mask, head_mask
+            )
+        else:
+            attn_output, attn_weights = attention_interface(
+                self,
+                query_states,
+                key_states,
+                value_states,
+                attention_mask,
+                head_mask=head_mask,
+                dropout=self.attn_dropout.p if self.training else 0.0,
+                is_causal=is_causal,
+                **kwargs,
+            )
+
+        attn_output = attn_output.reshape(*attn_output.shape[:-2], -1).contiguous()  # merge the last two dims
+        attn_output = self.c_proj(attn_output)
+        attn_output = self.resid_dropout(attn_output)
+
+        return attn_output, attn_weights  # `output_attentions` is accepted but not read in this method
+
+
+class GPT2MLP(nn.Module):
+    """Feed-forward sub-layer: `c_fc` projection, activation, `c_proj` projection, then dropout."""
+
+    def __init__(self, intermediate_size, config):
+        super().__init__()
+        hidden = config.hidden_size
+        # presumably hidden -> intermediate -> hidden; confirm against Conv1D's argument order
+        self.c_fc = Conv1D(intermediate_size, hidden)
+        self.c_proj = Conv1D(hidden, intermediate_size)
+        self.act = ACT2FN[config.activation_function]
+        self.dropout = nn.Dropout(config.resid_pdrop)
+
+    def forward(self, hidden_states: Optional[tuple[torch.FloatTensor]]) -> torch.FloatTensor:
+        projected = self.c_proj(self.act(self.c_fc(hidden_states)))
+        return self.dropout(projected)
+
+
+class GPT2Block(GradientCheckpointingLayer):
+    """One GPT-2 layer: LayerNorm + self-attention, optional LayerNorm + cross-attention, LayerNorm + MLP.
+
+    Every sub-layer output is added back onto its (un-normalized) input. The cross-attention modules are only
+    created when `config.add_cross_attention` is truthy.
+    """
+
+    def __init__(self, config, layer_idx=None):
+        super().__init__()
+        width = config.hidden_size
+        # The feed-forward width falls back to 4x the model width when `config.n_inner` is None.
+        if config.n_inner is not None:
+            ffn_width = config.n_inner
+        else:
+            ffn_width = 4 * width
+        norm_eps = config.layer_norm_epsilon
+
+        # Registration order (ln_1, attn, ln_2, [crossattention, ln_cross_attn], mlp) is kept stable because it
+        # fixes parameter ordering.
+        self.ln_1 = nn.LayerNorm(width, eps=norm_eps)
+        self.attn = GPT2Attention(config=config, layer_idx=layer_idx)
+        self.ln_2 = nn.LayerNorm(width, eps=norm_eps)
+
+        if config.add_cross_attention:
+            self.crossattention = GPT2Attention(config=config, is_cross_attention=True, layer_idx=layer_idx)
+            self.ln_cross_attn = nn.LayerNorm(width, eps=norm_eps)
+
+        self.mlp = GPT2MLP(ffn_width, config)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: Optional[tuple[torch.FloatTensor]],
+        past_key_values: Optional[Cache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = False,
+        output_attentions: Optional[bool] = False,
+        **kwargs,
+    ) -> Union[tuple[torch.Tensor], Optional[tuple[torch.Tensor, tuple[torch.FloatTensor, ...]]]]:
+        """Run the layer and return a tuple whose first element is the new hidden states.
+
+        When `output_attentions` is truthy the self-attention weights are appended, followed by the
+        cross-attention weights if `encoder_hidden_states` was given.
+
+        Raises:
+            ValueError: if `encoder_hidden_states` is passed but the block has no cross-attention modules.
+        """
+        # Self-attention sub-layer; extra `kwargs` are forwarded to the attention module only here.
+        self_attn_out, self_attn_weights = self.attn(
+            self.ln_1(hidden_states),
+            past_key_values=past_key_values,
+            cache_position=cache_position,
+            attention_mask=attention_mask,
+            head_mask=head_mask,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            **kwargs,
+        )
+        hidden_states = self_attn_out + hidden_states
+
+        has_encoder_states = encoder_hidden_states is not None
+        if has_encoder_states:
+            # Cross-attention sub-layer, only valid when the block was built with cross-attention modules.
+            if not hasattr(self, "crossattention"):
+                raise ValueError(
+                    f"If `encoder_hidden_states` are passed, {self} has to be instantiated with "
+                    "cross-attention layers by setting `config.add_cross_attention=True`"
+                )
+            cross_attn_out, cross_attn_weights = self.crossattention(
+                self.ln_cross_attn(hidden_states),
+                past_key_values=past_key_values,
+                attention_mask=attention_mask,
+                head_mask=head_mask,
+                encoder_hidden_states=encoder_hidden_states,
+                encoder_attention_mask=encoder_attention_mask,
+                output_attentions=output_attentions,
+            )
+            hidden_states = hidden_states + cross_attn_out
+
+        # Feed-forward sub-layer.
+        hidden_states = hidden_states + self.mlp(self.ln_2(hidden_states))
+
+        if not output_attentions:
+            return (hidden_states,)
+        if has_encoder_states:
+            return (hidden_states, self_attn_weights, cross_attn_weights)
+        return (hidden_states, self_attn_weights)
+
+
+# Adapted from transformers.models.xlm.modeling_xlm.XLMSequenceSummary with XLM->GPT2
+# (differs by raising an explicit error for an unrecognized `summary_type` in `forward`).
+class GPT2SequenceSummary(nn.Module):
+    r"""
+    Compute a single vector summary of a sequence hidden states.
+
+    Args:
+        config ([`GPT2Config`]):
+            The config used by the model. Relevant arguments in the config class of the model are (refer to the actual
+            config class of your model for the default values it uses):
+
+            - **summary_type** (`str`) -- The method to use to make this summary. Accepted values are:
+
+                - `"last"` -- Take the last token hidden state (like XLNet)
+                - `"first"` -- Take the first token hidden state (like Bert)
+                - `"mean"` -- Take the mean of all tokens hidden states
+                - `"cls_index"` -- Supply a Tensor of classification token position (GPT/GPT-2)
+                - `"attn"` -- Not implemented now, use multi-head attention
+
+            - **summary_use_proj** (`bool`) -- Add a projection after the vector extraction.
+            - **summary_proj_to_labels** (`bool`) -- If `True`, the projection outputs to `config.num_labels` classes
+              (otherwise to `config.hidden_size`).
+            - **summary_activation** (`Optional[str]`) -- Name of the activation applied to the output (looked up
+              with `get_activation`); `None` or an empty string adds no activation.
+            - **summary_first_dropout** (`float`) -- Optional dropout probability before the projection and activation.
+            - **summary_last_dropout** (`float`)-- Optional dropout probability after the projection and activation.
+    """
+
+    def __init__(self, config: GPT2Config):
+        super().__init__()
+
+        # Missing `summary_type` defaults to "last".
+        self.summary_type = getattr(config, "summary_type", "last")
+        if self.summary_type == "attn":
+            # We should use a standard multi-head attention module with absolute positional embedding for that.
+            # Cf. https://github.com/zihangdai/xlnet/blob/master/modeling.py#L253-L276
+            # We can probably just use the multi-head attention module of PyTorch >=1.1.0
+            raise NotImplementedError
+
+        # Optional projection: to `num_labels` when requested (and positive), otherwise hidden_size -> hidden_size.
+        self.summary = nn.Identity()
+        if hasattr(config, "summary_use_proj") and config.summary_use_proj:
+            if hasattr(config, "summary_proj_to_labels") and config.summary_proj_to_labels and config.num_labels > 0:
+                num_classes = config.num_labels
+            else:
+                num_classes = config.hidden_size
+            self.summary = nn.Linear(config.hidden_size, num_classes)
+
+        activation_string = getattr(config, "summary_activation", None)
+        self.activation: Callable = get_activation(activation_string) if activation_string else nn.Identity()
+
+        # Dropout layers are only materialized for strictly positive probabilities.
+        self.first_dropout = nn.Identity()
+        if hasattr(config, "summary_first_dropout") and config.summary_first_dropout > 0:
+            self.first_dropout = nn.Dropout(config.summary_first_dropout)
+
+        self.last_dropout = nn.Identity()
+        if hasattr(config, "summary_last_dropout") and config.summary_last_dropout > 0:
+            self.last_dropout = nn.Dropout(config.summary_last_dropout)
+
+    def forward(
+        self, hidden_states: torch.FloatTensor, cls_index: Optional[torch.LongTensor] = None
+    ) -> torch.FloatTensor:
+        """
+        Compute a single vector summary of a sequence hidden states.
+
+        Args:
+            hidden_states (`torch.FloatTensor` of shape `[batch_size, seq_len, hidden_size]`):
+                The hidden states of the last layer.
+            cls_index (`torch.LongTensor` of shape `[batch_size]` or `[batch_size, ...]` where ... are optional leading dimensions of `hidden_states`, *optional*):
+                Used if `summary_type == "cls_index"`: position of the classification token in each sequence. When
+                omitted, the last token of the sequence is used as classification token.
+
+        Returns:
+            `torch.FloatTensor`: The summary of the sequence hidden states.
+
+        Raises:
+            NotImplementedError: if `summary_type` is `"attn"`.
+            ValueError: if `summary_type` is not one of the supported values.
+        """
+        if self.summary_type == "last":
+            output = hidden_states[:, -1]
+        elif self.summary_type == "first":
+            output = hidden_states[:, 0]
+        elif self.summary_type == "mean":
+            output = hidden_states.mean(dim=1)
+        elif self.summary_type == "cls_index":
+            if cls_index is None:
+                # Default to the last position along the sequence axis for every entry.
+                cls_index = torch.full_like(
+                    hidden_states[..., :1, :],
+                    hidden_states.shape[-2] - 1,
+                    dtype=torch.long,
+                )
+            else:
+                cls_index = cls_index.unsqueeze(-1).unsqueeze(-1)
+                cls_index = cls_index.expand((-1,) * (cls_index.dim() - 1) + (hidden_states.size(-1),))
+            # shape of cls_index: (bsz, XX, 1, hidden_size) where XX are optional leading dim of hidden_states
+            output = hidden_states.gather(-2, cls_index).squeeze(-2)  # shape (bsz, XX, hidden_size)
+        elif self.summary_type == "attn":
+            raise NotImplementedError
+        else:
+            # Without this branch an unknown value would surface as an UnboundLocalError on `output` below.
+            raise ValueError(
+                f"Unsupported `summary_type`: {self.summary_type!r}. "
+                "Expected one of 'last', 'first', 'mean' or 'cls_index'."
+            )
+
+        output = self.first_dropout(output)
+        output = self.summary(output)
+        output = self.activation(output)
+        output = self.last_dropout(output)
+
+        return output
+
+
+@auto_docstring
+class GPT2PreTrainedModel(PreTrainedModel):
+    # Common base of the GPT-2 model classes: class-level flags read by the `PreTrainedModel` machinery plus the
+    # GPT-2 weight initialization scheme.
+    config: GPT2Config
+    load_tf_weights = load_tf_weights_in_gpt2
+    base_model_prefix = "transformer"
+    is_parallelizable = True
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["GPT2Block"]
+    _skip_keys_device_placement = "past_key_values"
+    _supports_flash_attn = True
+    _supports_sdpa = True
+    _supports_attention_backend = True
+
+    _can_compile_fullgraph = True
+
+    def __init__(self, *inputs, **kwargs):
+        # Plain pass-through to the parent constructor.
+        super().__init__(*inputs, **kwargs)
+
+    def _init_weights(self, module):
+        """Initialize the weights."""
+        if isinstance(module, (nn.Linear, Conv1D)):
+            # Plain normal init here; the TF version uses truncated_normal instead
+            # (cf https://github.com/pytorch/pytorch/pull/5617).
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            bias = module.bias
+            if bias is not None:
+                bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            pad_idx = module.padding_idx
+            if pad_idx is not None:
+                module.weight.data[pad_idx].zero_()
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+
+        # GPT-2 paper scheme for residual projections:
+        #   > A modified initialization which accounts for the accumulation on the residual path with model depth. Scale
+        #   > the weights of residual layers at initialization by a factor of 1/√N where N is the # of residual layers.
+        #   >   -- GPT-2 :: https://openai.com/blog/better-language-models/
+        #
+        # Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py
+        for param_name, param in module.named_parameters():
+            if param_name != "c_proj.weight":
+                continue
+            # Two residual additions per transformer block, hence the factor 2 * n_layer.
+            param.data.normal_(mean=0.0, std=(self.config.initializer_range / math.sqrt(2 * self.config.n_layer)))
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Base class for outputs of models with a language modeling head and a multiple-choice classification head.
+    """
+)
+class GPT2DoubleHeadsModelOutput(ModelOutput):
+    r"""
+    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+        Language modeling loss.
+    mc_loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `mc_labels` is provided):
+        Multiple choice classification loss.
+    logits (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, config.vocab_size)`):
+        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+    mc_logits (`torch.FloatTensor` of shape `(batch_size, num_choices)`):
+        Prediction scores of the multiple choice classification head (scores for each choice before SoftMax).
+    past_key_values (`tuple[tuple[torch.Tensor]]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+        Tuple of length `config.n_layer`, containing tuples of tensors of shape `(batch_size, num_heads,
+        sequence_length, embed_size_per_head)`.
+
+        Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see
+        `past_key_values` input) to speed up sequential decoding.
+    """
+
+    # Every field defaults to None so an output can be built from whichever values were computed.
+    loss: Optional[torch.FloatTensor] = None
+    mc_loss: Optional[torch.FloatTensor] = None
+    logits: Optional[torch.FloatTensor] = None
+    mc_logits: Optional[torch.FloatTensor] = None
+    past_key_values: Optional[tuple[tuple[torch.FloatTensor]]] = None
+    hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    attentions: Optional[tuple[torch.FloatTensor]] = None
+
+
+# Docstring text attached to the `parallelize` methods below through `@add_start_docstrings`.
+# It is a runtime string: edits here change the documentation users see for those methods.
+PARALLELIZE_DOCSTRING = r"""
+    This is an experimental feature and is a subject to change at a moment's notice.
+
+    Uses a device map to distribute attention modules of the model across several devices. If no device map is given,
+    it will evenly distribute blocks across all devices.
+
+    Args:
+        device_map (`dict[int, list]`, *optional*):
+            A dictionary that maps attention modules to devices. Note that the embedding module and LMHead are always
+            automatically mapped to the first device (for esoteric reasons). That means that the first device should
+            have fewer attention modules mapped to it than other devices. For reference, the gpt2 models have the
+            following number of attention modules:
+
+                - openai-community/gpt2: 12
+                - openai-community/gpt2-medium: 24
+                - openai-community/gpt2-large: 36
+                - openai-community/gpt2-xl: 48
+
+    Example:
+
+    ```python
+    # Here is an example of a device map on a machine with 4 GPUs using gpt2-xl, which has a total of 48 attention modules:
+    model = GPT2LMHeadModel.from_pretrained("openai-community/gpt2-xl")
+    device_map = {
+        0: [0, 1, 2, 3, 4, 5, 6, 7, 8],
+        1: [9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21],
+        2: [22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34],
+        3: [35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47],
+    }
+    model.parallelize(device_map)
+    ```
+"""
+# Docstring text attached to the `deparallelize` methods below through `@add_start_docstrings`.
+DEPARALLELIZE_DOCSTRING = r"""
+    Moves the model to cpu from a model parallel state.
+
+    Example:
+
+    ```python
+    # On a 4 GPU machine with openai-community/gpt2-large:
+    model = GPT2LMHeadModel.from_pretrained("openai-community/gpt2-large")
+    device_map = {
+        0: [0, 1, 2, 3, 4, 5, 6, 7],
+        1: [8, 9, 10, 11, 12, 13, 14, 15],
+        2: [16, 17, 18, 19, 20, 21, 22, 23],
+        3: [24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35],
+    }
+    model.parallelize(device_map)  # Splits the model across several devices
+    model.deparallelize()  # Put the model back on cpu and cleans memory by calling torch.cuda.empty_cache()
+    ```
+"""
+
+
+@auto_docstring
+class GPT2Model(GPT2PreTrainedModel):
+    # Bare GPT-2 stack: token + position embeddings, `config.num_hidden_layers` GPT2Block layers and a final
+    # LayerNorm. (Comments are used instead of a class docstring because `@auto_docstring` builds the docstring.)
+    _supports_param_buffer_assignment = False
+
+    def __init__(self, config):
+        super().__init__(config)
+
+        self.embed_dim = config.hidden_size
+
+        # wte: token embeddings, wpe: learned absolute position embeddings.
+        self.wte = nn.Embedding(config.vocab_size, self.embed_dim)
+        self.wpe = nn.Embedding(config.max_position_embeddings, self.embed_dim)
+
+        self.drop = nn.Dropout(config.embd_pdrop)
+        self.h = nn.ModuleList([GPT2Block(config, layer_idx=i) for i in range(config.num_hidden_layers)])
+        self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)
+
+        # Model parallel
+        self.model_parallel = False
+        self.device_map = None
+        self.gradient_checkpointing = False
+        self._attn_implementation = config._attn_implementation
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @add_start_docstrings(PARALLELIZE_DOCSTRING)
+    def parallelize(self, device_map=None):
+        # Check validity of device_map
+        warnings.warn(
+            "`GPT2Model.parallelize` is deprecated and will be removed in v5 of Transformers, you should load your"
+            " model with `device_map='balanced'` in the call to `from_pretrained`. You can also provide your own"
+            " `device_map` but it needs to be a dictionary module_name to device, so for instance {'h.0': 0, 'h.1': 1,"
+            " ...}",
+            FutureWarning,
+        )
+        # Without an explicit map, spread the blocks over every visible CUDA device.
+        self.device_map = (
+            get_device_map(len(self.h), range(torch.cuda.device_count())) if device_map is None else device_map
+        )
+        assert_device_map(self.device_map, len(self.h))
+        self.model_parallel = True
+        self.first_device = "cpu" if "cpu" in self.device_map else "cuda:" + str(min(self.device_map.keys()))
+        self.last_device = "cuda:" + str(max(self.device_map.keys()))
+        # Embeddings live on the first device.
+        self.wte = self.wte.to(self.first_device)
+        self.wpe = self.wpe.to(self.first_device)
+        # Load onto devices
+        for k, v in self.device_map.items():
+            for block in v:
+                cuda_device = "cuda:" + str(k)
+                self.h[block] = self.h[block].to(cuda_device)
+        # ln_f to last
+        self.ln_f = self.ln_f.to(self.last_device)
+
+    @add_start_docstrings(DEPARALLELIZE_DOCSTRING)
+    def deparallelize(self):
+        warnings.warn(
+            "Like `parallelize`, `deparallelize` is deprecated and will be removed in v5 of Transformers.",
+            FutureWarning,
+        )
+        # Reset the parallel state and move every sub-module back to CPU.
+        self.model_parallel = False
+        self.device_map = None
+        self.first_device = "cpu"
+        self.last_device = "cpu"
+        self.wte = self.wte.to("cpu")
+        self.wpe = self.wpe.to("cpu")
+        for index in range(len(self.h)):
+            self.h[index] = self.h[index].to("cpu")
+        self.ln_f = self.ln_f.to("cpu")
+        torch.cuda.empty_cache()
+
+    def get_input_embeddings(self):
+        # The token embedding table is the model's input embedding.
+        return self.wte
+
+    def set_input_embeddings(self, new_embeddings):
+        self.wte = new_embeddings
+
+    def _prune_heads(self, heads_to_prune):
+        """
+        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
+        """
+        for layer, heads in heads_to_prune.items():
+            self.h[layer].attn.prune_heads(heads)
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Union[tuple[tuple[torch.Tensor]], Cache]] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        **kwargs,
+    ) -> Union[tuple, BaseModelOutputWithPastAndCrossAttentions]:
+        r"""
+        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
+            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
+            `past_key_values.get_seq_length()` (`sequence_length` of input past key value states). Indices of input
+            sequence tokens in the vocabulary.
+
+            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
+            `input_ids`.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        """
+        # Per-call flags fall back to the config defaults when left as None.
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # Exactly one of `input_ids` / `inputs_embeds` must be given.
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+        elif input_ids is not None:
+            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
+            input_shape = input_ids.size()
+            # Collapse any leading dimensions into the batch dimension.
+            input_ids = input_ids.view(-1, input_shape[-1])
+            batch_size = input_ids.shape[0]
+        elif inputs_embeds is not None:
+            input_shape = inputs_embeds.size()[:-1]
+            batch_size = inputs_embeds.shape[0]
+        else:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+        device = input_ids.device if input_ids is not None else inputs_embeds.device
+
+        if token_type_ids is not None:
+            token_type_ids = token_type_ids.view(-1, input_shape[-1])
+
+        if self.gradient_checkpointing and self.training:
+            if use_cache:
+                logger.warning_once(
+                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+                )
+                use_cache = False
+
+        # based on pattern from src/transformers/models/whisper/modeling_whisper.py::WhisperDecoder
+        if use_cache:
+            if past_key_values is None:
+                past_key_values = DynamicCache(config=self.config)
+            elif isinstance(past_key_values, tuple):
+                logger.warning_once(
+                    "Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.53.0. "
+                    "You should pass an instance of `Cache` instead, e.g. "
+                    "`past_key_values=DynamicCache.from_legacy_cache(past_key_values)`."
+                )
+                past_key_values = DynamicCache.from_legacy_cache(past_key_values)
+
+            # With cross-attention the cache is wrapped together with a second, fresh DynamicCache.
+            if self.config.add_cross_attention and not isinstance(past_key_values, EncoderDecoderCache):
+                past_key_values = EncoderDecoderCache(past_key_values, DynamicCache(config=self.config))
+
+        if inputs_embeds is None:
+            inputs_embeds = self.wte(input_ids)
+
+        # Default positions continue right after the tokens already held in the cache.
+        if cache_position is None:
+            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+            cache_position = torch.arange(
+                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
+            )
+        if position_ids is None:
+            position_ids = cache_position.unsqueeze(0)
+
+        position_embeds = self.wpe(position_ids)
+        hidden_states = inputs_embeds + position_embeds.to(inputs_embeds.device)
+
+        # Attention mask.
+        # ._update_causal_mask() and ._prepare_4d_causal_attention_mask_with_cache_position() copied from LlamaModel
+        if attention_mask is not None and attention_mask.ndim < 4:
+            attention_mask = attention_mask.view(batch_size, -1)
+
+        causal_mask = create_causal_mask(
+            config=self.config,
+            input_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            cache_position=cache_position,
+            past_key_values=past_key_values,
+            position_ids=position_ids,
+        )
+
+        # If a 2D or 3D attention mask is provided for the cross-attention
+        # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
+        _use_sdpa = self._attn_implementation == "sdpa" and output_attentions is False and head_mask is None
+        if self.config.add_cross_attention and encoder_hidden_states is not None:
+            encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
+            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
+            if encoder_attention_mask is None:
+                encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
+            if _use_sdpa:
+                encoder_attention_mask = _prepare_4d_attention_mask_for_sdpa(
+                    mask=encoder_attention_mask, dtype=inputs_embeds.dtype, tgt_len=input_shape[-1]
+                )
+            elif self._attn_implementation != "flash_attention_2":
+                encoder_attention_mask = self.invert_attention_mask(encoder_attention_mask)
+        else:
+            encoder_attention_mask = None
+
+        # Prepare head mask if needed
+        # 1.0 in head_mask indicate we keep the head
+        # attention_probs has shape bsz x n_heads x N x N
+        # head_mask has shape n_layer x batch x n_heads x N x N
+        head_mask = self.get_head_mask(head_mask, self.config.n_layer)
+
+        # Token-type embeddings reuse the token embedding table.
+        if token_type_ids is not None:
+            token_type_embeds = self.wte(token_type_ids)
+            hidden_states = hidden_states + token_type_embeds
+
+        hidden_states = self.drop(hidden_states)
+
+        # Shape used to restore any leading input dimensions after the final LayerNorm.
+        output_shape = (-1,) + input_shape[1:] + (hidden_states.size(-1),)
+
+        all_self_attentions = () if output_attentions else None
+        all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
+        all_hidden_states = () if output_hidden_states else None
+        for i, block in enumerate(self.h):
+            # Model parallel
+            if self.model_parallel:
+                torch.cuda.set_device(hidden_states.device)
+                if isinstance(head_mask, torch.Tensor):
+                    head_mask = head_mask.to(hidden_states.device)
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)
+
+            outputs = block(
+                hidden_states,
+                past_key_values if not (self.gradient_checkpointing and self.training) else None,
+                cache_position,
+                causal_mask,
+                head_mask[i],
+                encoder_hidden_states,  # as a positional argument for gradient checkpointing
+                encoder_attention_mask=encoder_attention_mask,
+                use_cache=use_cache,
+                output_attentions=output_attentions,
+                **kwargs,
+            )
+
+            hidden_states = outputs[0]
+
+            if output_attentions:
+                all_self_attentions = all_self_attentions + (outputs[1],)
+                # The block only appends cross-attention weights when it received `encoder_hidden_states`, so a
+                # cross-attention model called without encoder states returns a 2-tuple; guard the index to avoid
+                # an IndexError in that case.
+                if self.config.add_cross_attention and len(outputs) > 2:
+                    all_cross_attentions = all_cross_attentions + (outputs[2],)
+
+            # Model Parallel: If it's the last layer for that device, put things on the next device
+            if self.model_parallel:
+                for k, v in self.device_map.items():
+                    if i == v[-1] and "cuda:" + str(k) != self.last_device:
+                        hidden_states = hidden_states.to("cuda:" + str(k + 1))
+
+        hidden_states = self.ln_f(hidden_states)
+
+        hidden_states = hidden_states.view(output_shape)
+        # Add last hidden state
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+
+        past_key_values = past_key_values if use_cache else None
+        # Tuple output drops every entry that is None, so positions depend on which outputs were requested.
+        if not return_dict:
+            return tuple(
+                v
+                for v in [hidden_states, past_key_values, all_hidden_states, all_self_attentions, all_cross_attentions]
+                if v is not None
+            )
+
+        return BaseModelOutputWithPastAndCrossAttentions(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attentions,
+            cross_attentions=all_cross_attentions,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    The GPT2 Model transformer with a language modeling head on top (linear layer with weights tied to the input
+    embeddings).
+    """
+)
+class GPT2LMHeadModel(GPT2PreTrainedModel, GenerationMixin):
+    # `lm_head.weight` is declared as a tied weight key (see the class intro: tied to the input embeddings).
+    _tied_weights_keys = ["lm_head.weight"]
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.transformer = GPT2Model(config)
+        # Bias-free projection from the hidden size (`config.n_embd`) to vocabulary logits.
+        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
+
+        # Model parallel
+        self.model_parallel = False
+        self.device_map = None
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @add_start_docstrings(PARALLELIZE_DOCSTRING)
+    def parallelize(self, device_map=None):
+        warnings.warn(
+            "`GPT2LMHeadModel.parallelize` is deprecated and will be removed in v5 of Transformers, you should load"
+            " your model with `device_map='balanced'` in the call to `from_pretrained`. You can also provide your own"
+            " `device_map` but it needs to be a dictionary module_name to device, so for instance {'transformer.h.0':"
+            " 0, 'transformer.h.1': 1, ...}",
+            FutureWarning,
+        )
+        # Without an explicit map, spread the transformer blocks over every visible CUDA device.
+        self.device_map = (
+            get_device_map(len(self.transformer.h), range(torch.cuda.device_count()))
+            if device_map is None
+            else device_map
+        )
+        assert_device_map(self.device_map, len(self.transformer.h))
+        self.transformer.parallelize(self.device_map)
+        # The LM head is placed on the transformer's first device.
+        self.lm_head = self.lm_head.to(self.transformer.first_device)
+        self.model_parallel = True
+
+    @add_start_docstrings(DEPARALLELIZE_DOCSTRING)
+    def deparallelize(self):
+        warnings.warn(
+            "Like `parallelize`, `deparallelize` is deprecated and will be removed in v5 of Transformers.",
+            FutureWarning,
+        )
+        # Move everything back to CPU and release cached CUDA memory.
+        self.transformer.deparallelize()
+        self.transformer = self.transformer.to("cpu")
+        self.lm_head = self.lm_head.to("cpu")
+        self.model_parallel = False
+        torch.cuda.empty_cache()
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[tuple[tuple[torch.Tensor]]] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
+        **kwargs,
+    ) -> Union[tuple, CausalLMOutputWithCrossAttentions]:
+        r"""
+        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
+            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
+            `past_key_values.get_seq_length()` (`sequence_length` of input past key value states). Indices of input
+            sequence tokens in the vocabulary.
+
+            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
+            `input_ids`.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        labels (`torch.LongTensor` of shape `(batch_size, input_ids_length)`, *optional*):
+            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
+            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
+            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # NOTE: `**kwargs` is not forwarded to the transformer here; it is only passed to the loss function below.
+        transformer_outputs = self.transformer(
+            input_ids,
+            past_key_values=past_key_values,
+            attention_mask=attention_mask,
+            cache_position=cache_position,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_attention_mask,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        # Index 0 is the last hidden state for both the tuple and the ModelOutput return forms.
+        hidden_states = transformer_outputs[0]
+
+        # Set device for model parallelism
+        if self.model_parallel:
+            torch.cuda.set_device(self.transformer.first_device)
+            hidden_states = hidden_states.to(self.lm_head.weight.device)
+
+        # An int keeps the last `logits_to_keep` positions (0 gives slice(0, None), i.e. every position);
+        # any other value is used directly as the index along the sequence dimension.
+        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+        logits = self.lm_head(hidden_states[:, slice_indices, :])
+
+        loss = None
+        if labels is not None:
+            # Flatten the tokens
+            loss = self.loss_function(
+                logits,
+                labels,
+                vocab_size=self.config.vocab_size,
+                **kwargs,
+            )
+
+        # Tuple form: (loss?, logits, *remaining transformer outputs).
+        if not return_dict:
+            output = (logits,) + transformer_outputs[1:]
+            return ((loss,) + output) if loss is not None else output
+
+        return CausalLMOutputWithCrossAttentions(
+            loss=loss,
+            logits=logits,
+            past_key_values=transformer_outputs.past_key_values,
+            hidden_states=transformer_outputs.hidden_states,
+            attentions=transformer_outputs.attentions,
+            cross_attentions=transformer_outputs.cross_attentions,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+        The GPT2 Model transformer with a language modeling and a multiple-choice classification head on top e.g. for
+    RocStories/SWAG tasks. The two heads are two linear layers. The language modeling head has its weights tied to the
+    input embeddings, the classification head takes as input the input of a specified classification token index in the
+    input sequence).
+    """
+)
+class GPT2DoubleHeadsModel(GPT2PreTrainedModel, GenerationMixin):
+    _tied_weights_keys = ["lm_head.weight"]
+
+    def __init__(self, config):
+        super().__init__(config)
+        config.num_labels = 1
+        self.transformer = GPT2Model(config)
+        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
+        self.multiple_choice_head = GPT2SequenceSummary(config)
+
+        # Model parallel
+        self.model_parallel = False
+        self.device_map = None
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @add_start_docstrings(PARALLELIZE_DOCSTRING)
+    def parallelize(self, device_map=None):
+        warnings.warn(
+            "`GPT2DoubleHeadsModel.parallelize` is deprecated and will be removed in v5 of Transformers, you should"
+            " load your model with `device_map='balanced'` in the call to `from_pretrained`. You can also provide your"
+            " own `device_map` but it needs to be a dictionary module_name to device, so for instance"
+            " {'transformer.h.0': 0, 'transformer.h.1': 1, ...}",
+            FutureWarning,
+        )
+        self.device_map = (
+            get_device_map(len(self.transformer.h), range(torch.cuda.device_count()))
+            if device_map is None
+            else device_map
+        )
+        assert_device_map(self.device_map, len(self.transformer.h))
+        self.transformer.parallelize(self.device_map)
+        self.lm_head = self.lm_head.to(self.transformer.first_device)
+        self.multiple_choice_head = self.multiple_choice_head.to(self.transformer.first_device)
+        self.model_parallel = True
+
+    @add_start_docstrings(DEPARALLELIZE_DOCSTRING)
+    def deparallelize(self):
+        warnings.warn(
+            "Like `parallelize`, `deparallelize` is deprecated and will be removed in v5 of Transformers.",
+            FutureWarning,
+        )
+        self.transformer.deparallelize()
+        self.transformer = self.transformer.to("cpu")
+        self.lm_head = self.lm_head.to("cpu")
+        self.multiple_choice_head = self.multiple_choice_head.to("cpu")
+        self.model_parallel = False
+        torch.cuda.empty_cache()
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[tuple[tuple[torch.Tensor]]] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        mc_token_ids: Optional[torch.LongTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        mc_labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        **kwargs,
+    ) -> Union[tuple, GPT2DoubleHeadsModelOutput]:
+        r"""
+        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
+            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
+            `past_key_values.get_seq_length()` (`sequence_length` of input past key value states). Indices of input
+            sequence tokens in the vocabulary.
+
+            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
+            `input_ids`.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        mc_token_ids (`torch.LongTensor` of shape `(batch_size, num_choices)`, *optional*, defaults to index of the last token of the input):
+            Index of the classification token in each input sequence. Selected in the range `[0, input_ids.size(-1) -
+            1]`.
+        labels (`torch.LongTensor` of shape `(batch_size, input_ids_length)`, *optional*):
+            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
+            `labels = input_ids`. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`. All labels set to
+            `-100` are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size - 1]`
+        mc_labels (`torch.LongTensor` of shape `(batch_size)`, *optional*):
+            Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., num_choices - 1]`
+            where *num_choices* is the size of the second dimension of the input tensors. (see *input_ids* above)
+
+        Example:
+
+        ```python
+        >>> import torch
+        >>> from transformers import AutoTokenizer, GPT2DoubleHeadsModel
+
+        >>> tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
+        >>> model = GPT2DoubleHeadsModel.from_pretrained("openai-community/gpt2")
+
+        >>> # Add a [CLS] to the vocabulary (we should train it also!)
+        >>> num_added_tokens = tokenizer.add_special_tokens({"cls_token": "[CLS]"})
+        >>> # Update the model embeddings with the new vocabulary size
+        >>> embedding_layer = model.resize_token_embeddings(len(tokenizer))
+
+        >>> choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
+        >>> encoded_choices = [tokenizer.encode(s) for s in choices]
+        >>> cls_token_location = [tokens.index(tokenizer.cls_token_id) for tokens in encoded_choices]
+
+        >>> input_ids = torch.tensor(encoded_choices).unsqueeze(0)  # Batch size: 1, number of choices: 2
+        >>> mc_token_ids = torch.tensor([cls_token_location])  # Batch size: 1
+
+        >>> outputs = model(input_ids, mc_token_ids=mc_token_ids)
+        >>> lm_logits = outputs.logits
+        >>> mc_logits = outputs.mc_logits
+        ```"""
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        transformer_outputs = self.transformer(
+            input_ids,
+            past_key_values=past_key_values,
+            cache_position=cache_position,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        hidden_states = transformer_outputs[0]
+
+        # Set device for model parallelism
+        if self.model_parallel:
+            torch.cuda.set_device(self.transformer.first_device)
+            hidden_states = hidden_states.to(self.lm_head.weight.device)
+
+        lm_logits = self.lm_head(hidden_states)
+        mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids).squeeze(-1)
+
+        mc_loss = None
+        if mc_labels is not None:
+            loss_fct = CrossEntropyLoss()
+            mc_loss = loss_fct(mc_logits.view(-1, mc_logits.size(-1)), mc_labels.view(-1))
+        lm_loss = None
+        if labels is not None:
+            labels = labels.to(lm_logits.device)
+            shift_logits = lm_logits[..., :-1, :].contiguous()
+            shift_labels = labels[..., 1:].contiguous()
+            loss_fct = CrossEntropyLoss()
+            lm_loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
+
+        if not return_dict:
+            output = (lm_logits, mc_logits) + transformer_outputs[1:]
+            if mc_loss is not None:
+                output = (mc_loss,) + output
+            return ((lm_loss,) + output) if lm_loss is not None else output
+
+        return GPT2DoubleHeadsModelOutput(
+            loss=lm_loss,
+            mc_loss=mc_loss,
+            logits=lm_logits,
+            mc_logits=mc_logits,
+            past_key_values=transformer_outputs.past_key_values,
+            hidden_states=transformer_outputs.hidden_states,
+            attentions=transformer_outputs.attentions,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    The GPT2 Model transformer with a sequence classification head on top (linear layer).
+
+    [`GPT2ForSequenceClassification`] uses the last token in order to do the classification, as other causal models
+    (e.g. GPT-1) do.
+
+    Since it does classification on the last token, it requires to know the position of the last token. If a
+    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
+    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
+    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
+    each row of the batch).
+    """
+)
+class GPT2ForSequenceClassification(GPT2PreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+        self.transformer = GPT2Model(config)
+        self.score = nn.Linear(config.n_embd, self.num_labels, bias=False)
+
+        # Model parallel
+        self.model_parallel = False
+        self.device_map = None
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[tuple[tuple[torch.Tensor]]] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, SequenceClassifierOutputWithPast]:
+        r"""
+        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
+            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
+            `past_key_values.get_seq_length()` (`sequence_length` of input past key value states). Indices of input
+            sequence tokens in the vocabulary.
+
+            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
+            `input_ids`.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss). If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        transformer_outputs = self.transformer(
+            input_ids,
+            past_key_values=past_key_values,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        hidden_states = transformer_outputs[0]
+        logits = self.score(hidden_states)
+
+        if input_ids is not None:
+            batch_size, sequence_length = input_ids.shape[:2]
+        else:
+            batch_size, sequence_length = inputs_embeds.shape[:2]
+
+        if self.config.pad_token_id is None and batch_size != 1:
+            raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
+        if self.config.pad_token_id is None:
+            last_non_pad_token = -1
+        elif input_ids is not None:
+            # To handle both left- and right- padding, we take the rightmost token that is not equal to pad_token_id
+            non_pad_mask = (input_ids != self.config.pad_token_id).to(logits.device, torch.int32)
+            token_indices = torch.arange(input_ids.shape[-1], device=logits.device, dtype=torch.int32)
+            last_non_pad_token = (token_indices * non_pad_mask).argmax(-1)
+        else:
+            last_non_pad_token = -1
+            logger.warning_once(
+                f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
+                "unexpected if using padding tokens in conjunction with `inputs_embeds.`"
+            )
+
+        pooled_logits = logits[torch.arange(batch_size, device=logits.device), last_non_pad_token]
+
+        loss = None
+        if labels is not None:
+            if self.config.problem_type is None:
+                if self.num_labels == 1:
+                    self.config.problem_type = "regression"
+                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
+                    self.config.problem_type = "single_label_classification"
+                else:
+                    self.config.problem_type = "multi_label_classification"
+
+            if self.config.problem_type == "regression":
+                loss_fct = MSELoss()
+                if self.num_labels == 1:
+                    loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
+                else:
+                    loss = loss_fct(pooled_logits, labels)
+            elif self.config.problem_type == "single_label_classification":
+                loss_fct = CrossEntropyLoss()
+                loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
+            elif self.config.problem_type == "multi_label_classification":
+                loss_fct = BCEWithLogitsLoss()
+                loss = loss_fct(pooled_logits, labels)
+        if not return_dict:
+            output = (pooled_logits,) + transformer_outputs[1:]
+            return ((loss,) + output) if loss is not None else output
+
+        return SequenceClassifierOutputWithPast(
+            loss=loss,
+            logits=pooled_logits,
+            past_key_values=transformer_outputs.past_key_values,
+            hidden_states=transformer_outputs.hidden_states,
+            attentions=transformer_outputs.attentions,
+        )
+
+
+@auto_docstring
+class GPT2ForTokenClassification(GPT2PreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+
+        self.transformer = GPT2Model(config)
+        if hasattr(config, "classifier_dropout") and config.classifier_dropout is not None:
+            classifier_dropout = config.classifier_dropout
+        elif hasattr(config, "hidden_dropout") and config.hidden_dropout is not None:
+            classifier_dropout = config.hidden_dropout
+        else:
+            classifier_dropout = 0.1
+        self.dropout = nn.Dropout(classifier_dropout)
+        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
+
+        # Model parallel
+        self.model_parallel = False
+        self.device_map = None
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[tuple[tuple[torch.Tensor]]] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, TokenClassifierOutput]:
+        r"""
+        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
+            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
+            `past_key_values.get_seq_length()` (`sequence_length` of input past key value states). Indices of input
+            sequence tokens in the vocabulary.
+
+            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
+            `input_ids`.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the token classification loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. The loss is always a Cross-Entropy loss: logits and labels are flattened over
+            all token positions before it is computed (no regression loss is used by this head).
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        transformer_outputs = self.transformer(
+            input_ids,
+            past_key_values=past_key_values,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        hidden_states = transformer_outputs[0]
+        hidden_states = self.dropout(hidden_states)
+        logits = self.classifier(hidden_states)
+
+        loss = None
+        if labels is not None:
+            labels = labels.to(logits.device)
+            loss_fct = CrossEntropyLoss()
+            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+
+        if not return_dict:
+            output = (logits,) + transformer_outputs[2:]
+            return ((loss,) + output) if loss is not None else output
+
+        return TokenClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=transformer_outputs.hidden_states,
+            attentions=transformer_outputs.attentions,
+        )
+
+
+@auto_docstring
+class GPT2ForQuestionAnswering(GPT2PreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+        self.transformer = GPT2Model(config)
+        self.qa_outputs = nn.Linear(config.hidden_size, 2)
+
+        # Model parallel
+        self.model_parallel = False
+        self.device_map = None
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        start_positions: Optional[torch.LongTensor] = None,
+        end_positions: Optional[torch.LongTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, QuestionAnsweringModelOutput]:
+        r"""
+        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
+            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
+            `past_key_values.get_seq_length()` (`sequence_length` of input past key value states). Indices of input
+            sequence tokens in the vocabulary.
+
+            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
+            `input_ids`.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.transformer(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        sequence_output = outputs[0]
+
+        logits = self.qa_outputs(sequence_output)
+        start_logits, end_logits = logits.split(1, dim=-1)
+        start_logits = start_logits.squeeze(-1).contiguous()
+        end_logits = end_logits.squeeze(-1).contiguous()
+
+        total_loss = None
+        if start_positions is not None and end_positions is not None:
+            # On multi-GPU the batch split can add a trailing dimension; squeeze it away
+            if len(start_positions.size()) > 1:
+                start_positions = start_positions.squeeze(-1).to(start_logits.device)
+            if len(end_positions.size()) > 1:
+                end_positions = end_positions.squeeze(-1).to(end_logits.device)
+            # sometimes the start/end positions are outside our model inputs, we ignore these terms
+            ignored_index = start_logits.size(1)
+            start_positions = start_positions.clamp(0, ignored_index)
+            end_positions = end_positions.clamp(0, ignored_index)
+
+            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
+            start_loss = loss_fct(start_logits, start_positions)
+            end_loss = loss_fct(end_logits, end_positions)
+            total_loss = (start_loss + end_loss) / 2
+
+        if not return_dict:
+            output = (start_logits, end_logits) + outputs[2:]
+            return ((total_loss,) + output) if total_loss is not None else output
+
+        return QuestionAnsweringModelOutput(
+            loss=total_loss,
+            start_logits=start_logits,
+            end_logits=end_logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+__all__ = [
+    "GPT2DoubleHeadsModel",
+    "GPT2ForQuestionAnswering",
+    "GPT2ForSequenceClassification",
+    "GPT2ForTokenClassification",
+    "GPT2LMHeadModel",
+    "GPT2Model",
+    "GPT2PreTrainedModel",
+    "load_tf_weights_in_gpt2",
+]
diff --git a/phivenv/Lib/site-packages/transformers/models/gpt2/modeling_tf_gpt2.py b/phivenv/Lib/site-packages/transformers/models/gpt2/modeling_tf_gpt2.py
new file mode 100644
index 0000000000000000000000000000000000000000..42e23fc290151f09d47a30efca1cb7f4e4a3d669
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/gpt2/modeling_tf_gpt2.py
@@ -0,0 +1,1238 @@
+# coding=utf-8
+# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""TF 2.0 OpenAI GPT-2 model."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+
+import numpy as np
+import tensorflow as tf
+
+from ...activations_tf import get_tf_activation
+from ...modeling_tf_outputs import (
+    TFBaseModelOutputWithPastAndCrossAttentions,
+    TFCausalLMOutputWithCrossAttentions,
+    TFSequenceClassifierOutputWithPast,
+)
+from ...modeling_tf_utils import (
+    TFCausalLanguageModelingLoss,
+    TFConv1D,
+    TFModelInputType,
+    TFPreTrainedModel,
+    TFSequenceClassificationLoss,
+    TFSequenceSummary,
+    get_initializer,
+    keras,
+    keras_serializable,
+    unpack_inputs,
+)
+from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax
+from ...utils import (
+    ModelOutput,
+    add_code_sample_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+    replace_return_docstrings,
+)
+from .configuration_gpt2 import GPT2Config
+
+
+logger = logging.get_logger(__name__)
+
+_CHECKPOINT_FOR_DOC = "openai-community/gpt2"
+_CONFIG_FOR_DOC = "GPT2Config"
+
+
+class TFAttention(keras.layers.Layer):
+    def __init__(self, nx, config, scale=False, is_cross_attention=False, **kwargs):
+        super().__init__(**kwargs)
+
+        n_state = nx  # in Attention: n_state=768 (nx=n_embd)
+        # [switch nx => n_state from Block to Attention to keep identical to TF implementation]
+        assert n_state % config.n_head == 0
+        self.n_head = config.n_head
+        self.split_size = n_state
+        self.scale = scale
+        self.output_attentions = config.output_attentions
+
+        self.is_cross_attention = is_cross_attention
+
+        if self.is_cross_attention:
+            self.c_attn = TFConv1D(n_state * 2, nx, initializer_range=config.initializer_range, name="c_attn")
+            self.q_attn = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name="q_attn")
+        else:
+            self.c_attn = TFConv1D(n_state * 3, nx, initializer_range=config.initializer_range, name="c_attn")
+
+        self.c_proj = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name="c_proj")
+        self.attn_dropout = keras.layers.Dropout(config.attn_pdrop)
+        self.resid_dropout = keras.layers.Dropout(config.resid_pdrop)
+        self.pruned_heads = set()
+        self.embed_dim = n_state
+
+    def prune_heads(self, heads):
+        pass
+
+    @staticmethod
+    def causal_attention_mask(nd, ns, dtype):
+        """
+        1's in the lower triangle, counting from the lower right corner. Same as tf.matrix_band_part(tf.ones([nd, ns]),
+        -1, ns-nd), but doesn't produce garbage on TPUs.
+        """
+        i = tf.range(nd)[:, None]
+        j = tf.range(ns)
+        m = i >= j - ns + nd
+        return tf.cast(m, dtype)
+
+    def _attn(self, q, k, v, attention_mask, head_mask, output_attentions, training=False):
+        # q, k, v have shape [batch, heads, sequence, features]
+        w = tf.matmul(q, k, transpose_b=True)
+        if self.scale:
+            dk = tf.cast(shape_list(k)[-1], dtype=w.dtype)  # scale attention_scores
+            w = w / tf.math.sqrt(dk)
+
+        if not self.is_cross_attention:
+            # only the "normal" (non-cross) attention layer applies the causal mask
+
+            # w has shape [batch, heads, dst_sequence, src_sequence], where information flows from src to dst.
+            _, _, nd, ns = shape_list(w)
+            b = self.causal_attention_mask(nd, ns, dtype=w.dtype)
+            b = tf.reshape(b, [1, 1, nd, ns])
+            w = w * b - 1e4 * (1 - b)
+
+        if attention_mask is not None:
+            # Apply the attention mask
+            attention_mask = tf.cast(attention_mask, dtype=w.dtype)
+            w = w + attention_mask
+
+        w = stable_softmax(w, axis=-1)
+        w = self.attn_dropout(w, training=training)
+
+        # Mask heads if we want to
+        if head_mask is not None:
+            w = w * head_mask
+
+        outputs = [tf.matmul(w, v)]
+        if output_attentions:
+            outputs.append(w)
+        return outputs
+
+    def merge_heads(self, x):
+        x = tf.transpose(x, [0, 2, 1, 3])
+        x_shape = shape_list(x)
+        new_x_shape = x_shape[:-2] + [x_shape[-2] * x_shape[-1]]
+        return tf.reshape(x, new_x_shape)
+
+    def split_heads(self, x):
+        x_shape = shape_list(x)
+        new_x_shape = x_shape[:-1] + [self.n_head, x_shape[-1] // self.n_head]
+        x = tf.reshape(x, new_x_shape)
+        return tf.transpose(x, (0, 2, 1, 3))  # (batch, head, seq_length, head_features)
+
+    def call(
+        self,
+        x,
+        layer_past,
+        attention_mask,
+        head_mask,
+        encoder_hidden_states,
+        encoder_attention_mask,
+        use_cache,
+        output_attentions,
+        training=False,
+    ):
+        if encoder_hidden_states is not None:
+            if not hasattr(self, "q_attn"):
+                raise ValueError(
+                    "If class is used as cross attention, the weights `q_attn` have to be defined. "
+                    "Please make sure to instantiate class with `GPT2Attention(..., is_cross_attention=True)`."
+                )
+
+            query = self.q_attn(x)
+            kv_out = self.c_attn(encoder_hidden_states)
+            key, value = tf.split(kv_out, 2, axis=2)
+            attention_mask = encoder_attention_mask
+        else:
+            x = self.c_attn(x)
+            query, key, value = tf.split(x, 3, axis=2)
+
+        query = self.split_heads(query)
+        key = self.split_heads(key)
+        value = self.split_heads(value)
+        if layer_past is not None:
+            past_key, past_value = tf.unstack(layer_past, axis=0, num=2)
+            key = tf.concat([past_key, key], axis=-2)
+            value = tf.concat([past_value, value], axis=-2)
+
+        # to cope with keras serialization
+        if use_cache:
+            present = tf.stack([key, value], axis=0)
+        else:
+            present = (None,)
+
+        attn_outputs = self._attn(query, key, value, attention_mask, head_mask, output_attentions, training=training)
+        a = attn_outputs[0]
+
+        a = self.merge_heads(a)
+        a = self.c_proj(a)
+        a = self.resid_dropout(a, training=training)
+
+        outputs = [a, present] + attn_outputs[1:]
+        return outputs  # a, present, (attentions)
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if self.is_cross_attention:
+            c_attn_shape = 2 * self.embed_dim
+        else:
+            c_attn_shape = 3 * self.embed_dim
+        if getattr(self, "c_proj", None) is not None:
+            with tf.name_scope(self.c_proj.name):
+                self.c_proj.build([None, None, self.embed_dim])
+        if getattr(self, "c_attn", None) is not None:
+            with tf.name_scope(self.c_attn.name):
+                self.c_attn.build([None, None, c_attn_shape])
+        if getattr(self, "q_attn", None) is not None:
+            with tf.name_scope(self.q_attn.name):
+                self.q_attn.build([None, None, self.embed_dim])
+
+
+class TFMLP(keras.layers.Layer):
+    def __init__(self, n_state, config, **kwargs):
+        super().__init__(**kwargs)
+        nx = config.n_embd
+        self.c_fc = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name="c_fc")
+        self.c_proj = TFConv1D(nx, n_state, initializer_range=config.initializer_range, name="c_proj")
+        self.act = get_tf_activation(config.activation_function)
+        self.dropout = keras.layers.Dropout(config.resid_pdrop)
+        self.intermediate_size = n_state
+        self.embed_dim = nx
+
+    def call(self, x, training=False):
+        h = self.act(self.c_fc(x))
+        h2 = self.c_proj(h)
+        h2 = self.dropout(h2, training=training)
+        return h2
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "c_fc", None) is not None:
+            with tf.name_scope(self.c_fc.name):
+                self.c_fc.build([None, None, self.intermediate_size])
+        if getattr(self, "c_proj", None) is not None:
+            with tf.name_scope(self.c_proj.name):
+                self.c_proj.build([None, None, self.embed_dim])
+
+
+class TFBlock(keras.layers.Layer):
+    def __init__(self, config, scale=False, **kwargs):
+        super().__init__(**kwargs)
+        nx = config.n_embd
+        inner_dim = config.n_inner if config.n_inner is not None else 4 * nx
+        self.ln_1 = keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_1")
+        self.attn = TFAttention(nx, config, scale, name="attn")
+        self.ln_2 = keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_2")
+
+        if config.add_cross_attention:
+            self.crossattention = TFAttention(nx, config, scale, name="crossattention", is_cross_attention=True)
+            self.ln_cross_attn = keras.layers.LayerNormalization(
+                epsilon=config.layer_norm_epsilon, name="ln_cross_attn"
+            )
+
+        self.mlp = TFMLP(inner_dim, config, name="mlp")
+        self.hidden_size = config.hidden_size
+
+    def call(
+        self,
+        x,
+        layer_past,
+        attention_mask,
+        head_mask,
+        encoder_hidden_states,
+        encoder_attention_mask,
+        use_cache,
+        output_attentions,
+        training=False,
+    ):
+        a = self.ln_1(x)
+        output_attn = self.attn(
+            a,
+            layer_past=layer_past,
+            attention_mask=attention_mask,
+            head_mask=head_mask,
+            encoder_hidden_states=None,
+            encoder_attention_mask=None,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            training=training,
+        )
+        a = output_attn[0]  # output_attn: a, present, (attentions)
+        outputs = output_attn[1:]
+        x = x + a
+
+        # Cross-Attention Block
+        if encoder_hidden_states is not None:
+            # add one self-attention block for cross-attention
+            if not hasattr(self, "crossattention"):
+                raise ValueError(
+                    f"If `encoder_hidden_states` are passed, {self} has to be instantiated with "
+                    "cross-attention layers by setting `config.add_cross_attention=True`"
+                )
+
+            ca = self.ln_cross_attn(x)
+            output_cross_attn = self.crossattention(
+                ca,
+                layer_past=None,
+                attention_mask=attention_mask,
+                head_mask=head_mask,
+                encoder_hidden_states=encoder_hidden_states,
+                encoder_attention_mask=encoder_attention_mask,
+                use_cache=False,
+                output_attentions=output_attentions,
+                training=training,
+            )
+            ca = output_cross_attn[0]  # output_attn: a, present, (cross_attentions)
+            x = x + ca
+            outputs = outputs + output_cross_attn[2:]  # add cross attentions if we output attention weights
+
+        m = self.ln_2(x)
+        m = self.mlp(m, training=training)
+        x = x + m
+
+        outputs = [x] + outputs
+        return outputs  # x, present, (attentions, cross_attentions)
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "ln_1", None) is not None:
+            with tf.name_scope(self.ln_1.name):
+                self.ln_1.build([None, None, self.hidden_size])
+        if getattr(self, "attn", None) is not None:
+            with tf.name_scope(self.attn.name):
+                self.attn.build(None)
+        if getattr(self, "ln_2", None) is not None:
+            with tf.name_scope(self.ln_2.name):
+                self.ln_2.build([None, None, self.hidden_size])
+        if getattr(self, "mlp", None) is not None:
+            with tf.name_scope(self.mlp.name):
+                self.mlp.build(None)
+        if getattr(self, "crossattention", None) is not None:
+            with tf.name_scope(self.crossattention.name):
+                self.crossattention.build(None)
+        if getattr(self, "ln_cross_attn", None) is not None:
+            with tf.name_scope(self.ln_cross_attn.name):
+                self.ln_cross_attn.build([None, None, self.hidden_size])
+
+
+@keras_serializable
+class TFGPT2MainLayer(keras.layers.Layer):
+    config_class = GPT2Config
+
+    def __init__(self, config, *inputs, **kwargs):
+        super().__init__(*inputs, **kwargs)
+
+        self.config = config
+        self.output_attentions = config.output_attentions
+        self.output_hidden_states = config.output_hidden_states
+        self.use_cache = config.use_cache
+        self.return_dict = config.use_return_dict
+
+        self.num_hidden_layers = config.n_layer
+        self.n_embd = config.n_embd
+        self.n_positions = config.n_positions
+        self.initializer_range = config.initializer_range
+
+        self.wte = keras.layers.Embedding(
+            input_dim=config.vocab_size,
+            output_dim=config.hidden_size,
+            embeddings_initializer=get_initializer(config.initializer_range),
+            name="wte",
+        )
+        self.wpe = keras.layers.Embedding(
+            input_dim=config.n_positions,
+            output_dim=config.n_embd,
+            embeddings_initializer=get_initializer(config.initializer_range),
+            name="wpe",
+        )
+        self.drop = keras.layers.Dropout(config.embd_pdrop)
+        self.h = [TFBlock(config, scale=True, name=f"h_._{i}") for i in range(config.n_layer)]
+        self.ln_f = keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_f")
+        self.embed_dim = config.hidden_size
+
+    def get_input_embeddings(self):
+        return self.wte
+
+    def set_input_embeddings(self, new_embeddings):
+        self.wte = new_embeddings
+
+    def _prune_heads(self, heads_to_prune):
+        """
+        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
+        """
+        raise NotImplementedError
+
+    @unpack_inputs
+    def call(
+        self,
+        input_ids: TFModelInputType | None = None,
+        past_key_values: tuple[tuple[np.ndarray | tf.Tensor]] | None = None,
+        attention_mask: np.ndarray | tf.Tensor | None = None,
+        token_type_ids: np.ndarray | tf.Tensor | None = None,
+        position_ids: np.ndarray | tf.Tensor | None = None,
+        head_mask: np.ndarray | tf.Tensor | None = None,
+        inputs_embeds: np.ndarray | tf.Tensor | None = None,
+        encoder_hidden_states: np.ndarray | tf.Tensor | None = None,
+        encoder_attention_mask: np.ndarray | tf.Tensor | None = None,
+        use_cache: bool | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        training: bool | None = False,
+    ) -> TFBaseModelOutputWithPastAndCrossAttentions | tuple[tf.Tensor]:
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+        elif input_ids is not None:
+            input_shape = shape_list(input_ids)
+            input_ids = tf.reshape(input_ids, [-1, input_shape[-1]])
+        elif inputs_embeds is not None:
+            input_shape = shape_list(inputs_embeds)[:-1]
+        else:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+        if past_key_values is None:
+            past_length = 0
+            past_key_values = [None] * len(self.h)
+        else:
+            past_length = shape_list(past_key_values[0][0])[-2]
+
+        if position_ids is None:
+            position_ids = tf.expand_dims(tf.range(past_length, input_shape[-1] + past_length), axis=0)
+
+        if attention_mask is not None:
+            # We create a 3D attention mask from a 2D tensor mask.
+            # Sizes are [batch_size, 1, 1, to_seq_length]
+            # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
+            # this attention mask is more simple than the triangular masking of causal attention
+            # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
+            attention_mask_shape = shape_list(attention_mask)
+            attention_mask = tf.reshape(attention_mask, (attention_mask_shape[0], 1, 1, attention_mask_shape[1]))
+
+            # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
+            # masked positions, this operation will create a tensor which is 0.0 for
+            # positions we want to attend and -10000.0 for masked positions.
+            # Since we are adding it to the raw scores before the softmax, this is
+            # effectively the same as removing these entirely.
+            one_cst = tf.constant(1.0)
+            attention_mask = tf.cast(attention_mask, dtype=one_cst.dtype)
+            attention_mask = tf.multiply(tf.subtract(one_cst, attention_mask), tf.constant(-10000.0))
+
+        # Copied from `modeling_tf_t5.py` with -1e9 -> -10000
+        if self.config.add_cross_attention and encoder_attention_mask is not None:
+            # If a 2D ou 3D attention mask is provided for the cross-attention
+            # we need to make broadcastable to [batch_size, num_heads, mask_seq_length, mask_seq_length]
+            # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
+            encoder_attention_mask = tf.cast(encoder_attention_mask, dtype=encoder_hidden_states.dtype)
+            num_dims_encoder_attention_mask = len(shape_list(encoder_attention_mask))
+            if num_dims_encoder_attention_mask == 3:
+                encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :]
+            if num_dims_encoder_attention_mask == 2:
+                encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :]
+
+            # T5 has a mask that can compare sequence ids, we can simulate this here with this transposition
+            # Cf. https://github.com/tensorflow/mesh/blob/8d2465e9bc93129b913b5ccc6a59aa97abd96ec6/mesh_tensorflow/transformer/transformer_layers.py#L270
+            # encoder_extended_attention_mask = tf.math.equal(encoder_extended_attention_mask,
+            #                                         tf.transpose(encoder_extended_attention_mask, perm=(-1, -2)))
+
+            encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -10000.0
+        else:
+            encoder_extended_attention_mask = None
+
+        encoder_attention_mask = encoder_extended_attention_mask
+
+        # Prepare head mask if needed
+        # 1.0 in head_mask indicate we keep the head
+        # attention_probs has shape bsz x n_heads x N x N
+        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
+        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
+        if head_mask is not None:
+            raise NotImplementedError
+        else:
+            head_mask = [None] * self.num_hidden_layers
+            # head_mask = tf.constant([0] * self.num_hidden_layers)
+
+        position_ids = tf.reshape(position_ids, [-1, shape_list(position_ids)[-1]])
+
+        if inputs_embeds is None:
+            check_embeddings_within_bounds(input_ids, self.config.vocab_size)
+            inputs_embeds = self.wte(input_ids)
+
+        position_embeds = self.wpe(position_ids)
+
+        if token_type_ids is not None:
+            token_type_ids = tf.reshape(token_type_ids, [-1, shape_list(token_type_ids)[-1]])
+            token_type_embeds = self.wte(token_type_ids)
+        else:
+            token_type_embeds = tf.constant(0.0)
+
+        position_embeds = tf.cast(position_embeds, dtype=inputs_embeds.dtype)
+        token_type_embeds = tf.cast(token_type_embeds, dtype=inputs_embeds.dtype)
+        hidden_states = inputs_embeds + position_embeds + token_type_embeds
+        hidden_states = self.drop(hidden_states, training=training)
+
+        output_shape = input_shape + [shape_list(hidden_states)[-1]]
+
+        presents = () if use_cache else None
+        all_attentions = () if output_attentions else None
+        all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
+        all_hidden_states = () if output_hidden_states else None
+        for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)):
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (tf.reshape(hidden_states, output_shape),)
+
+            outputs = block(
+                hidden_states,
+                layer_past,
+                attention_mask,
+                head_mask[i],
+                encoder_hidden_states,
+                encoder_attention_mask,
+                use_cache,
+                output_attentions,
+                training=training,
+            )
+
+            hidden_states, present = outputs[:2]
+            if use_cache:
+                presents = presents + (present,)
+
+            if output_attentions:
+                all_attentions = all_attentions + (outputs[2],)
+                if self.config.add_cross_attention and encoder_hidden_states is not None:
+                    all_cross_attentions = all_cross_attentions + (outputs[3],)
+
+        hidden_states = self.ln_f(hidden_states)
+
+        hidden_states = tf.reshape(hidden_states, output_shape)
+        # Add last hidden state
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+
+        if output_attentions:
+            # let the number of heads free (-1) so we can extract attention even after head pruning
+            attention_output_shape = input_shape[:-1] + [-1] + shape_list(all_attentions[0])[-2:]
+            all_attentions = tuple(tf.reshape(t, attention_output_shape) for t in all_attentions)
+
+        if not return_dict:
+            return tuple(
+                v
+                for v in [hidden_states, presents, all_hidden_states, all_attentions, all_cross_attentions]
+                if v is not None
+            )
+
+        return TFBaseModelOutputWithPastAndCrossAttentions(
+            last_hidden_state=hidden_states,
+            past_key_values=presents,
+            hidden_states=all_hidden_states,
+            attentions=all_attentions,
+            cross_attentions=all_cross_attentions,
+        )
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "wte", None) is not None:
+            with tf.name_scope(self.wte.name):
+                self.wte.build(None)
+        if getattr(self, "wpe", None) is not None:
+            with tf.name_scope(self.wpe.name):
+                self.wpe.build(None)
+        if getattr(self, "ln_f", None) is not None:
+            with tf.name_scope(self.ln_f.name):
+                self.ln_f.build([None, None, self.embed_dim])
+        if getattr(self, "h", None) is not None:
+            for layer in self.h:
+                with tf.name_scope(layer.name):
+                    layer.build(None)
+
+
+class TFGPT2PreTrainedModel(TFPreTrainedModel):
+    """
+    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+    models.
+    """
+
+    config_class = GPT2Config
+    base_model_prefix = "transformer"
+    # names with a '.' represents the authorized unexpected/missing layers when a TF model is loaded from a PT model
+    _keys_to_ignore_on_load_unexpected = [r"h.\d+.attn.bias", r"h.\d+.crossattention.bias"]
+
+    @property
+    def input_signature(self):
+        # Although GPT-2 supports token_type_ids in theory, in practice they are rarely used, and the implementation
+        # means that passing token_type_ids=0 yields different outputs from token_type_ids=None.
+        # Therefore, we remove the token_type_ids argument by default, even though it would usually be included.
+        return {
+            "input_ids": tf.TensorSpec((None, None), tf.int32, name="input_ids"),
+            "attention_mask": tf.TensorSpec((None, None), tf.int32, name="attention_mask"),
+        }
+
+
+@dataclass
+class TFGPT2DoubleHeadsModelOutput(ModelOutput):
+    """
+    Output container for a GPT-2 model with a language modeling head and a multiple choice classification head.
+
+    Args:
+        logits (`tf.Tensor` of shape `(batch_size, num_choices, sequence_length, config.vocab_size)`):
+            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+        mc_logits (`tf.Tensor` of shape `(batch_size, num_choices)`):
+            Prediction scores of the multiple choice classification head (scores for each choice before SoftMax).
+        past_key_values (`list[tf.Tensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+            List of `tf.Tensor` of length `config.n_layer`, with each tensor of shape `(2, batch_size, num_heads,
+            sequence_length, embed_size_per_head)`.
+
+            Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see
+            `past_key_values` input) to speed up sequential decoding.
+        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
+            `(batch_size, sequence_length, hidden_size)`.
+
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`.
+
+            Attention weights after the attention softmax, used to compute the weighted average in the
+            self-attention heads.
+    """
+
+    logits: tf.Tensor | None = None
+    mc_logits: tf.Tensor | None = None
+    past_key_values: list[tf.Tensor] | None = None
+    hidden_states: tuple[tf.Tensor] | None = None
+    attentions: tuple[tf.Tensor] | None = None
+
+
+# Class-level documentation text shared by the GPT-2 TF model classes; it is passed to `add_start_docstrings`
+# on each model class. The string is consumed at runtime, so its content must not be edited casually.
+GPT2_START_DOCSTRING = r"""
+
+    This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.)
+
+    This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
+    as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
+    behavior.
+
+    <Tip>
+
+    TensorFlow models and layers in `transformers` accept two formats as input:
+
+    - having all inputs as keyword arguments (like PyTorch models), or
+    - having all inputs as a list, tuple or dict in the first positional argument.
+
+    The reason the second format is supported is that Keras methods prefer this format when passing inputs to models
+    and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just
+    pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second
+    format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with
+    the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first
+    positional argument:
+
+    - a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
+    - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
+    `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
+    - a dictionary with one or several input Tensors associated to the input names given in the docstring:
+    `model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
+
+    Note that when creating models and layers with
+    [subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry
+    about any of this, as you can just pass inputs like you would to any other Python function!
+
+    </Tip>
+
+    Parameters:
+        config ([`GPT2Config`]): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+# Documentation for the arguments shared by the models' `call` methods; it is passed to
+# `add_start_docstrings_to_model_forward`. The string is consumed at runtime, so it is left untouched here.
+GPT2_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`Numpy array` or `tf.Tensor` of shape `(batch_size, input_ids_length)`):
+            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values[0].shape[-2]`
+            (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary.
+
+            If `past_key_values` is used, only input IDs that do not have their past calculated should be passed as
+            `input_ids`.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and
+            [`PreTrainedTokenizer.encode`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        past_key_values (`list[tf.Tensor]` of length `config.n_layers`):
+            Contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model (see
+            `past_key_values` output below). Can be used to speed up sequential decoding. The token ids which have
+            their past given to this model should not be passed as input ids as they have already been computed.
+        attention_mask (`tf.Tensor` or `Numpy array` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            If `past_key_values` is used, `attention_mask` needs to contain the masking strategy that was used for
+            `past_key_values`. In other words, the `attention_mask` always has to have the length:
+            `len(past_key_values) + len(input_ids)`
+
+            [What are attention masks?](../glossary#attention-mask)
+        token_type_ids (`tf.Tensor` or `Numpy array` of shape `(batch_size, input_ids_length)`, *optional*):
+            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
+            1]`:
+
+            - 0 corresponds to a *sentence A* token,
+            - 1 corresponds to a *sentence B* token.
+
+            [What are token type IDs?](../glossary#token-type-ids)
+        position_ids (`tf.Tensor` or `Numpy array` of shape `(batch_size, input_ids_length)`, *optional*):
+            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+            config.max_position_embeddings - 1]`.
+
+            [What are position IDs?](../glossary#position-ids)
+        head_mask (`Numpy array` or `tf.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+
+        inputs_embeds (`tf.Tensor` of shape `(batch_size, input_ids_length, hidden_size)`, *optional*):
+            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+            model's internal embedding lookup matrix.
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the
+            config will be used instead.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
+            used instead.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in
+            eager mode, in graph mode the value will always be set to True.
+        training (`bool`, *optional*, defaults to `False`):
+            Whether or not to use the model in training mode (some modules like dropout modules have different
+            behaviors between training and evaluation).
+"""
+
+
+@add_start_docstrings(
+    "The bare GPT2 Model transformer outputting raw hidden-states without any specific head on top.",
+    GPT2_START_DOCSTRING,
+)
+class TFGPT2Model(TFGPT2PreTrainedModel):
+    def __init__(self, config, *inputs, **kwargs):
+        super().__init__(config, *inputs, **kwargs)
+        self.transformer = TFGPT2MainLayer(config, name="transformer")
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=TFBaseModelOutputWithPastAndCrossAttentions,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def call(
+        self,
+        input_ids: TFModelInputType | None = None,
+        past_key_values: tuple[tuple[np.ndarray | tf.Tensor]] | None = None,
+        attention_mask: np.ndarray | tf.Tensor | None = None,
+        token_type_ids: np.ndarray | tf.Tensor | None = None,
+        position_ids: np.ndarray | tf.Tensor | None = None,
+        head_mask: np.ndarray | tf.Tensor | None = None,
+        inputs_embeds: np.ndarray | tf.Tensor | None = None,
+        encoder_hidden_states: np.ndarray | tf.Tensor | None = None,
+        encoder_attention_mask: np.ndarray | tf.Tensor | None = None,
+        use_cache: bool | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        training: bool | None = False,
+    ) -> TFBaseModelOutputWithPastAndCrossAttentions | tuple[tf.Tensor]:
+        r"""
+        encoder_hidden_states  (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
+            the model is configured as a decoder.
+        encoder_attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
+            the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+        past_key_values (`tuple[tuple[tf.Tensor]]` of length `config.n_layers`)
+            contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
+            If `past` are used, the user can optionally input only the last `decoder_input_ids` (those that don't have
+            their past key value states given to this model) of shape `(batch_size, 1)` instead of all
+            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+            `past`). Set to `False` during training, `True` during generation
+        """
+
+        outputs = self.transformer(
+            input_ids=input_ids,
+            past_key_values=past_key_values,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_attention_mask,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+
+        return outputs
+
+    def build(self, input_shape=None):
+        if not self.built:  # idempotent: the sub-layer is only ever built on the first call
+            self.built = True
+            backbone = getattr(self, "transformer", None)
+            if backbone is not None:
+                with tf.name_scope(backbone.name):
+                    backbone.build(None)
+
+
+@add_start_docstrings(
+    """
+    The GPT2 Model transformer with a language modeling head on top (linear layer with weights tied to the input
+    embeddings).
+    """,
+    GPT2_START_DOCSTRING,
+)
+class TFGPT2LMHeadModel(TFGPT2PreTrainedModel, TFCausalLanguageModelingLoss):
+    def __init__(self, config, *inputs, **kwargs):
+        super().__init__(config, *inputs, **kwargs)
+        self.transformer = TFGPT2MainLayer(config, name="transformer")
+
+    def get_output_embeddings(self):
+        return self.get_input_embeddings()
+
+    def set_output_embeddings(self, value):
+        self.set_input_embeddings(value)
+
+    def prepare_inputs_for_generation(self, inputs, past_key_values=None, use_cache=None, **kwargs):
+        token_type_ids = kwargs.get("token_type_ids")
+        # when a cache is supplied, only the last token of `inputs` (and of `token_type_ids`) is fed to the model
+        if past_key_values:
+            inputs = tf.expand_dims(inputs[:, -1], -1)
+            if token_type_ids is not None:
+                token_type_ids = tf.expand_dims(token_type_ids[:, -1], -1)
+
+        position_ids = kwargs.get("position_ids")
+        attention_mask = kwargs.get("attention_mask")
+
+        if attention_mask is not None and position_ids is None:
+            position_ids = tf.math.cumsum(attention_mask, axis=-1, exclusive=True)  # pad-aware positions
+            if past_key_values:
+                position_ids = tf.expand_dims(position_ids[:, -1], -1)
+
+        return {
+            "input_ids": inputs,
+            "attention_mask": attention_mask,
+            "position_ids": position_ids,
+            "past_key_values": past_key_values,
+            "use_cache": use_cache,
+            "token_type_ids": token_type_ids,
+        }
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=TFCausalLMOutputWithCrossAttentions,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def call(
+        self,
+        input_ids: TFModelInputType | None = None,
+        past_key_values: tuple[tuple[np.ndarray | tf.Tensor]] | None = None,
+        attention_mask: np.ndarray | tf.Tensor | None = None,
+        token_type_ids: np.ndarray | tf.Tensor | None = None,
+        position_ids: np.ndarray | tf.Tensor | None = None,
+        head_mask: np.ndarray | tf.Tensor | None = None,
+        inputs_embeds: np.ndarray | tf.Tensor | None = None,
+        encoder_hidden_states: np.ndarray | tf.Tensor | None = None,
+        encoder_attention_mask: np.ndarray | tf.Tensor | None = None,
+        use_cache: bool | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        labels: np.ndarray | tf.Tensor | None = None,
+        training: bool | None = False,
+    ) -> TFCausalLMOutputWithCrossAttentions | tuple[tf.Tensor]:
+        r"""
+        encoder_hidden_states  (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
+            the model is configured as a decoder.
+        encoder_attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
+            the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+        past_key_values (`tuple[tuple[tf.Tensor]]` of length `config.n_layers`):
+            Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
+            If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
+            have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
+            `input_ids` of shape `(batch_size, sequence_length)`.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+            `past_key_values`). Set to `False` during training, `True` during generation.
+        labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the cross entropy classification loss. Indices should be in `[0, ...,
+            config.vocab_size - 1]`.
+        """
+
+        transformer_outputs = self.transformer(
+            input_ids=input_ids,
+            past_key_values=past_key_values,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_attention_mask,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+        hidden_states = transformer_outputs[0]
+        logits = tf.matmul(hidden_states, self.transformer.wte.weights, transpose_b=True)  # tied to input embeddings
+
+        loss = None
+        if labels is not None:
+            # next-token objective: the logits at position t are scored against the label at position t + 1
+            shifted_logits = logits[:, :-1]
+            labels = labels[:, 1:]
+            loss = self.hf_compute_loss(labels, shifted_logits)
+
+        if not return_dict:
+            output = (logits,) + transformer_outputs[1:]
+            return ((loss,) + output) if loss is not None else output
+
+        return TFCausalLMOutputWithCrossAttentions(
+            loss=loss,
+            logits=logits,
+            past_key_values=transformer_outputs.past_key_values,
+            hidden_states=transformer_outputs.hidden_states,
+            attentions=transformer_outputs.attentions,
+            cross_attentions=transformer_outputs.cross_attentions,
+        )
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "transformer", None) is not None:
+            with tf.name_scope(self.transformer.name):
+                self.transformer.build(None)
+
+
+@add_start_docstrings(
+    """
+    The GPT2 Model transformer with a language modeling and a multiple-choice classification head on top e.g. for
+    RocStories/SWAG tasks. The two heads are two linear layers. The language modeling head has its weights tied to the
+    input embeddings, the classification head takes as input the input of a specified classification token index in the
+    input sequence).
+    """,
+    GPT2_START_DOCSTRING,
+)
+class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel):
+    def __init__(self, config, *inputs, **kwargs):
+        super().__init__(config, *inputs, **kwargs)
+        config.num_labels = 1  # NOTE(review): mutates the caller's config object in place
+        self.transformer = TFGPT2MainLayer(config, name="transformer")
+        self.multiple_choice_head = TFSequenceSummary(
+            config, initializer_range=config.initializer_range, name="multiple_choice_head"
+        )
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=TFGPT2DoubleHeadsModelOutput, config_class=_CONFIG_FOR_DOC)
+    def call(
+        self,
+        input_ids: TFModelInputType | None = None,
+        past_key_values: tuple[tuple[np.ndarray | tf.Tensor]] | None = None,
+        attention_mask: np.ndarray | tf.Tensor | None = None,
+        token_type_ids: np.ndarray | tf.Tensor | None = None,
+        position_ids: np.ndarray | tf.Tensor | None = None,
+        head_mask: np.ndarray | tf.Tensor | None = None,
+        inputs_embeds: np.ndarray | tf.Tensor | None = None,
+        mc_token_ids: np.ndarray | tf.Tensor | None = None,
+        use_cache: bool | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        training: bool | None = False,
+    ) -> TFGPT2DoubleHeadsModelOutput | tuple[tf.Tensor]:
+        r"""
+        mc_token_ids (`tf.Tensor` or `Numpy array` of shape `(batch_size, num_choices)`, *optional*, defaults to the index of the last token of the input):
+            Index of the classification token in each input sequence. Selected in the range `[0, input_ids.size(-1) -
+            1]`.
+
+        Return:
+
+        Examples:
+
+        ```python
+        >>> import tensorflow as tf
+        >>> from transformers import AutoTokenizer, TFGPT2DoubleHeadsModel
+
+        >>> tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
+        >>> model = TFGPT2DoubleHeadsModel.from_pretrained("openai-community/gpt2")
+
+        >>> # Add a [CLS] to the vocabulary (we should train it also!)
+        >>> num_added_tokens = tokenizer.add_special_tokens({"cls_token": "[CLS]"})
+
+        >>> embedding_layer = model.resize_token_embeddings(
+        ...     len(tokenizer)
+        ... )  # Update the model embeddings with the new vocabulary size
+
+        >>> choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
+        >>> encoded_choices = [tokenizer.encode(s) for s in choices]
+        >>> cls_token_location = [tokens.index(tokenizer.cls_token_id) for tokens in encoded_choices]
+
+        >>> input_ids = tf.constant(encoded_choices)[None, :]  # Batch size: 1, number of choices: 2
+        >>> mc_token_ids = tf.constant([cls_token_location])  # Batch size: 1
+
+        >>> outputs = model(input_ids, mc_token_ids=mc_token_ids)
+        >>> lm_prediction_scores, mc_prediction_scores = outputs[:2]
+        ```"""
+
+        if input_ids is not None:
+            input_shapes = shape_list(input_ids)
+        else:
+            input_shapes = shape_list(inputs_embeds)[:-1]
+
+        seq_length = input_shapes[-1]  # leading dims are merged into one below, then restored on the output
+        flat_input_ids = tf.reshape(input_ids, (-1, seq_length)) if input_ids is not None else None
+        flat_attention_mask = tf.reshape(attention_mask, (-1, seq_length)) if attention_mask is not None else None
+        flat_token_type_ids = tf.reshape(token_type_ids, (-1, seq_length)) if token_type_ids is not None else None
+        flat_position_ids = tf.reshape(position_ids, (-1, seq_length)) if position_ids is not None else None
+        transformer_outputs = self.transformer(
+            input_ids=flat_input_ids,
+            past_key_values=past_key_values,
+            attention_mask=flat_attention_mask,
+            token_type_ids=flat_token_type_ids,
+            position_ids=flat_position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,  # NOTE(review): passed through unflattened, unlike the ids/masks - confirm
+            encoder_hidden_states=None,
+            encoder_attention_mask=None,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+        hidden_states = transformer_outputs[0]
+        hidden_states = tf.reshape(hidden_states, input_shapes + shape_list(hidden_states)[-1:])
+        if return_dict and output_hidden_states:
+            # We do this to match the slightly odd PT behaviour - the final hidden state is reshaped to rank 4 when the
+            # input is rank 3, but all other hidden states remain at rank-3 (with the first 2 dims merged)
+            all_hidden_states = transformer_outputs.hidden_states[:-1] + (hidden_states,)
+        else:
+            all_hidden_states = None
+        lm_logits = tf.matmul(hidden_states, self.transformer.wte.weights, transpose_b=True)
+        mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids, training=training)
+        mc_logits = tf.squeeze(mc_logits, axis=-1)  # drops the last axis (`num_labels` is forced to 1 in `__init__`)
+
+        if not return_dict:
+            return (lm_logits, mc_logits) + transformer_outputs[1:]
+
+        return TFGPT2DoubleHeadsModelOutput(
+            logits=lm_logits,
+            mc_logits=mc_logits,
+            past_key_values=transformer_outputs.past_key_values,
+            hidden_states=all_hidden_states,
+            attentions=transformer_outputs.attentions,
+        )
+
+    @property
+    def input_signature(self):
+        return {
+            "input_ids": tf.TensorSpec((None, None, None), tf.int32, name="input_ids"),
+            "attention_mask": tf.TensorSpec((None, None, None), tf.int32, name="attention_mask"),
+            "mc_token_ids": tf.TensorSpec((None, None), tf.int32, name="mc_token_ids"),
+        }
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "transformer", None) is not None:
+            with tf.name_scope(self.transformer.name):
+                self.transformer.build(None)
+        if getattr(self, "multiple_choice_head", None) is not None:
+            with tf.name_scope(self.multiple_choice_head.name):
+                self.multiple_choice_head.build(None)
+
+
+@add_start_docstrings(
+    """
+    The GPT2 Model transformer with a sequence classification head on top (linear layer).
+
+    [`TFGPT2ForSequenceClassification`] uses the last token in order to do the classification, as other causal models
+    (e.g. GPT-1) do.
+
+    Since it does classification on the last token, it requires to know the position of the last token. If a
+    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
+    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
+    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
+    each row of the batch).
+    """,
+    GPT2_START_DOCSTRING,
+)
+class TFGPT2ForSequenceClassification(TFGPT2PreTrainedModel, TFSequenceClassificationLoss):
+    def __init__(self, config, *inputs, **kwargs):
+        super().__init__(config, *inputs, **kwargs)
+        self.num_labels = config.num_labels
+        self.score = keras.layers.Dense(
+            config.num_labels,
+            kernel_initializer=get_initializer(config.initializer_range),
+            name="score",
+            use_bias=False,
+        )
+        self.transformer = TFGPT2MainLayer(config, name="transformer")
+        self.config = config
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(
+        checkpoint="microsoft/DialogRPT-updown",
+        output_type=TFSequenceClassifierOutputWithPast,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def call(
+        self,
+        input_ids: TFModelInputType | None = None,
+        past_key_values: tuple[tuple[np.ndarray | tf.Tensor]] | None = None,
+        attention_mask: np.ndarray | tf.Tensor | None = None,
+        token_type_ids: np.ndarray | tf.Tensor | None = None,
+        position_ids: np.ndarray | tf.Tensor | None = None,
+        head_mask: np.ndarray | tf.Tensor | None = None,
+        inputs_embeds: np.ndarray | tf.Tensor | None = None,
+        use_cache: bool | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        labels: np.ndarray | tf.Tensor | None = None,
+        training: bool | None = False,
+    ) -> TFSequenceClassifierOutputWithPast | tuple[tf.Tensor]:
+        r"""
+        labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the sequence classification loss, one per sequence. Indices should be in `[0, ...,
+            config.num_labels - 1]`.
+        """
+        transformer_outputs = self.transformer(
+            input_ids=input_ids,
+            past_key_values=past_key_values,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+        hidden_states = transformer_outputs[0]
+        logits = self.score(hidden_states)
+        logits_shape = shape_list(logits)
+        batch_size = logits_shape[0]
+
+        if self.config.pad_token_id is None:  # no padding info: classify on the final position
+            last_non_pad_token = tf.fill((batch_size,), value=logits_shape[1] - 1)
+        else:
+            if input_ids is not None:
+                token_indices = tf.range(shape_list(input_ids)[-1])
+                non_pad_mask = tf.cast(input_ids != self.config.pad_token_id, token_indices.dtype)
+                last_non_pad_token = tf.reduce_max(token_indices * non_pad_mask, axis=-1)  # right-most non-pad index
+            else:
+                last_non_pad_token = tf.fill((batch_size,), value=logits_shape[1] - 1)
+                logger.warning_once(
+                    f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
+                    "unexpected if using padding tokens in conjunction with `inputs_embeds.`"
+                )
+        loss = None
+
+        pooled_logits = tf.gather(logits, last_non_pad_token, batch_dims=1, axis=1)  # one logit row per sequence
+
+        if labels is not None:
+            if self.config.pad_token_id is None and logits_shape[0] != 1:
+                raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
+
+            loss = self.hf_compute_loss(tf.reshape(labels, [-1]), tf.reshape(pooled_logits, [-1, self.num_labels]))
+
+        if not return_dict:
+            output = (pooled_logits,) + transformer_outputs[1:]
+            return ((loss,) + output) if loss is not None else output
+
+        return TFSequenceClassifierOutputWithPast(
+            loss=loss,
+            logits=pooled_logits,
+            past_key_values=transformer_outputs.past_key_values,
+            hidden_states=transformer_outputs.hidden_states,
+            attentions=transformer_outputs.attentions,
+        )
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "score", None) is not None:
+            with tf.name_scope(self.score.name):
+                self.score.build([None, None, self.config.n_embd])
+        if getattr(self, "transformer", None) is not None:
+            with tf.name_scope(self.transformer.name):
+                self.transformer.build(None)
+
+
+__all__ = [  # names exported by `from <this module> import *`
+    "TFGPT2DoubleHeadsModel",
+    "TFGPT2ForSequenceClassification",
+    "TFGPT2LMHeadModel",
+    "TFGPT2MainLayer",
+    "TFGPT2Model",
+    "TFGPT2PreTrainedModel",
+]
diff --git a/phivenv/Lib/site-packages/transformers/models/gpt2/tokenization_gpt2.py b/phivenv/Lib/site-packages/transformers/models/gpt2/tokenization_gpt2.py
new file mode 100644
index 0000000000000000000000000000000000000000..608164ef2d83ab15bf7f99d33f9c6eb56ed1fcff
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/gpt2/tokenization_gpt2.py
@@ -0,0 +1,334 @@
+# coding=utf-8
+# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization classes for OpenAI GPT."""
+
+import json
+import os
+from functools import lru_cache
+from typing import Optional
+
+import regex as re
+
+from ...tokenization_utils import AddedToken, PreTrainedTokenizer
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+VOCAB_FILES_NAMES = {  # on-disk file names; `GPT2Tokenizer.save_vocabulary` writes these two files
+    "vocab_file": "vocab.json",
+    "merges_file": "merges.txt",
+}
+
+
+@lru_cache
+def bytes_to_unicode():
+    """
+    Build the reversible byte -> unicode-character table used by byte-level BPE.
+
+    Bytes that are already printable, non-whitespace characters map to themselves; every other byte is shifted to
+    an unused code point starting at 256, so no byte ever maps to a whitespace/control character the bpe code
+    barfs on. Working on such a table avoids UNKs without spending a large share of a (say) 32K bpe vocab on raw
+    unicode characters.
+
+    Returns a `dict` mapping each of the 256 byte values to a single-character string.
+    """
+    printable = [
+        *range(ord("!"), ord("~") + 1), *range(ord("¡"), ord("¬") + 1), *range(ord("®"), ord("ÿ") + 1)
+    ]
+    byte_values = list(printable)
+    code_points = list(printable)
+    # Remaining bytes are appended in ascending order and remapped to 256, 257, ...
+    for byte in range(256):
+        if byte not in printable:
+            code_points.append(256 + len(byte_values) - len(printable))
+            byte_values.append(byte)
+    return dict(zip(byte_values, map(chr, code_points)))
+
+
+def get_pairs(word):
+    """
+    Return the set of adjacent symbol pairs in a word.
+
+    Args:
+        word: Sequence of symbols (a tuple of variable-length strings, or a plain string of characters).
+
+    Returns:
+        `set` of `(left, right)` tuples, one per distinct adjacent pair. Words with fewer than two symbols,
+        including empty ones (which used to raise `IndexError`), yield an empty set.
+    """
+    return set(zip(word, word[1:]))
+
+
+class GPT2Tokenizer(PreTrainedTokenizer):
+    """
+    Construct a GPT-2 tokenizer. Based on byte-level Byte-Pair-Encoding.
+
+    This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
+    be encoded differently whether it is at the beginning of the sentence (without space) or not:
+
+    ```python
+    >>> from transformers import GPT2Tokenizer
+
+    >>> tokenizer = GPT2Tokenizer.from_pretrained("openai-community/gpt2")
+    >>> tokenizer("Hello world")["input_ids"]
+    [15496, 995]
+
+    >>> tokenizer(" Hello world")["input_ids"]
+    [18435, 995]
+    ```
+
+    You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer or when you
+    call it on some text, but since the model was not pretrained this way, it might yield a decrease in performance.
+
+    <Tip>
+
+    When used with `is_split_into_words=True`, this tokenizer will add a space before each word (even the first one).
+
+    </Tip>
+
+    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
+    this superclass for more information regarding those methods.
+
+    Args:
+        vocab_file (`str`):
+            Path to the vocabulary file.
+        merges_file (`str`):
+            Path to the merges file.
+        errors (`str`, *optional*, defaults to `"replace"`):
+            Paradigm to follow when decoding bytes to UTF-8. See
+            [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
+        unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        bos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
+            The beginning of sequence token.
+        eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
+            The end of sequence token.
+        pad_token (`str`, *optional*):
+            The token used for padding, for example when batching sequences of different lengths.
+        add_prefix_space (`bool`, *optional*, defaults to `False`):
+            Whether or not to add an initial space to the input. This allows treating the leading word just as any
+            other word. (The GPT2 tokenizer detects the beginning of words by the preceding space.)
+        add_bos_token (`bool`, *optional*, defaults to `False`):
+            Whether or not to add an initial beginning of sentence token to the input. When enabled, the BOS token id
+            is prepended to every sequence (see `build_inputs_with_special_tokens`).
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    model_input_names = ["input_ids", "attention_mask"]
+
+    def __init__(
+        self,
+        vocab_file,
+        merges_file,
+        errors="replace",
+        unk_token="<|endoftext|>",
+        bos_token="<|endoftext|>",
+        eos_token="<|endoftext|>",
+        pad_token=None,
+        add_prefix_space=False,
+        add_bos_token=False,
+        **kwargs,
+    ):
+        bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
+        eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
+        unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
+        pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
+
+        self.add_bos_token = add_bos_token
+
+        with open(vocab_file, encoding="utf-8") as vocab_handle:
+            self.encoder = json.load(vocab_handle)
+        self.decoder = {v: k for k, v in self.encoder.items()}
+        self.errors = errors  # how to handle errors in decoding
+        self.byte_encoder = bytes_to_unicode()
+        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
+        with open(merges_file, encoding="utf-8") as merges_handle:
+            bpe_merges = merges_handle.read().split("\n")[1:-1]  # skips first line + final entry
+        bpe_merges = [tuple(merge.split()) for merge in bpe_merges]
+        self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
+        self.cache = {}  # memo for `bpe`: token -> merged, space-joined symbols
+        self.add_prefix_space = add_prefix_space
+
+        # Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
+        self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
+
+        super().__init__(
+            errors=errors,
+            unk_token=unk_token,
+            bos_token=bos_token,
+            eos_token=eos_token,
+            pad_token=pad_token,
+            add_prefix_space=add_prefix_space,
+            add_bos_token=add_bos_token,
+            **kwargs,
+        )
+
+    @property
+    def vocab_size(self):
+        return len(self.encoder)
+
+    def get_vocab(self):
+        return dict(self.encoder, **self.added_tokens_encoder)
+
+    def bpe(self, token):
+        if token in self.cache:
+            return self.cache[token]
+        word = tuple(token)
+        pairs = get_pairs(word)
+
+        if not pairs:  # fewer than two symbols: nothing to merge (result is not cached)
+            return token
+
+        while True:
+            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))  # lowest-ranked pair
+            if bigram not in self.bpe_ranks:  # no known merge left
+                break
+            first, second = bigram
+            new_word = []
+            i = 0
+            while i < len(word):
+                try:
+                    j = word.index(first, i)
+                except ValueError:
+                    new_word.extend(word[i:])
+                    break
+                else:
+                    new_word.extend(word[i:j])
+                    i = j
+
+                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
+                    new_word.append(first + second)
+                    i += 2
+                else:
+                    new_word.append(word[i])
+                    i += 1
+            new_word = tuple(new_word)
+            word = new_word
+            if len(word) == 1:
+                break
+            else:
+                pairs = get_pairs(word)
+        word = " ".join(word)
+        self.cache[token] = word
+        return word
+
+    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
+        if self.add_bos_token:
+            bos_token_ids = [self.bos_token_id]
+        else:
+            bos_token_ids = []
+
+        output = bos_token_ids + token_ids_0
+
+        if token_ids_1 is None:
+            return output
+
+        return output + bos_token_ids + token_ids_1
+
+    def get_special_tokens_mask(
+        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False
+    ) -> list[int]:
+        """
+        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
+        special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods.
+
+        Args:
+            token_ids_0 (`list[int]`):
+                List of IDs.
+            token_ids_1 (`list[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
+                Whether or not the token list is already formatted with special tokens for the model.
+
+        Returns:
+            `list[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+        """
+        if already_has_special_tokens:
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+            )
+
+        if not self.add_bos_token:
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=False
+            )
+
+        if token_ids_1 is None:
+            return [1] + ([0] * len(token_ids_0))
+        return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1))
+
+    def _tokenize(self, text):
+        """Tokenize a string into byte-level BPE tokens."""
+        bpe_tokens = []
+        for token in re.findall(self.pat, text):
+            token = "".join(
+                self.byte_encoder[b] for b in token.encode("utf-8")
+            )  # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case)
+            bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" "))
+        return bpe_tokens
+
+    def _convert_token_to_id(self, token):
+        """Converts a token (str) into an id using the vocab, falling back to the id of `unk_token`."""
+        return self.encoder.get(token, self.encoder.get(self.unk_token))
+
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) into a token (str) using the vocab; returns `None` for unknown ids."""
+        return self.decoder.get(index)
+
+    def convert_tokens_to_string(self, tokens):
+        """Converts a sequence of tokens (string) into a single string."""
+        text = "".join(tokens)
+        text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors)
+        return text
+
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
+        if not os.path.isdir(save_directory):
+            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+            return  # NOTE: yields `None` rather than the annotated tuple
+        vocab_file = os.path.join(
+            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+        )
+        merge_file = os.path.join(
+            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
+        )
+
+        with open(vocab_file, "w", encoding="utf-8") as f:
+            f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")
+
+        index = 0
+        with open(merge_file, "w", encoding="utf-8") as writer:
+            writer.write("#version: 0.2\n")
+            for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
+                if index != token_index:  # gap in the merge ranks: warn, then resynchronize
+                    logger.warning(
+                        f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
+                        " Please check that the tokenizer is not corrupted!"
+                    )
+                    index = token_index
+                writer.write(" ".join(bpe_tokens) + "\n")
+                index += 1
+
+        return vocab_file, merge_file
+
+    def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
+        add_prefix_space = kwargs.pop("add_prefix_space", self.add_prefix_space)  # per-call override wins
+        if is_split_into_words or add_prefix_space:
+            text = " " + text
+        return (text, kwargs)
+
+
+__all__ = ["GPT2Tokenizer"]  # the helper functions above are intentionally not exported
diff --git a/phivenv/Lib/site-packages/transformers/models/gpt2/tokenization_gpt2_fast.py b/phivenv/Lib/site-packages/transformers/models/gpt2/tokenization_gpt2_fast.py
new file mode 100644
index 0000000000000000000000000000000000000000..f81c155e864476cf49c24f91a0235c939f42d3e0
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/gpt2/tokenization_gpt2_fast.py
@@ -0,0 +1,133 @@
+# coding=utf-8
+# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization classes for OpenAI GPT."""
+
+from typing import Optional
+
+from ...tokenization_utils_base import BatchEncoding
+from ...tokenization_utils_fast import PreTrainedTokenizerFast
+from ...utils import logging
+from .tokenization_gpt2 import GPT2Tokenizer
+
+
+logger = logging.get_logger(__name__)
+
+VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"}
+
+
+class GPT2TokenizerFast(PreTrainedTokenizerFast):
+    """
+    Construct a "fast" GPT-2 tokenizer (backed by HuggingFace's *tokenizers* library). Based on byte-level
+    Byte-Pair-Encoding.
+
+    This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
+    be encoded differently whether it is at the beginning of the sentence (without space) or not:
+
+    ```python
+    >>> from transformers import GPT2TokenizerFast
+
+    >>> tokenizer = GPT2TokenizerFast.from_pretrained("openai-community/gpt2")
+    >>> tokenizer("Hello world")["input_ids"]
+    [15496, 995]
+
+    >>> tokenizer(" Hello world")["input_ids"]
+    [18435, 995]
+    ```
+
+    You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer, but since
+    the model was not pretrained this way, it might yield a decrease in performance.
+
+    <Tip>
+
+    When used with `is_split_into_words=True`, this tokenizer needs to be instantiated with `add_prefix_space=True`.
+
+    </Tip>
+
+    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
+    refer to this superclass for more information regarding those methods.
+
+    Args:
+        vocab_file (`str`, *optional*):
+            Path to the vocabulary file.
+        merges_file (`str`, *optional*):
+            Path to the merges file.
+        tokenizer_file (`str`, *optional*):
+            Path to [tokenizers](https://github.com/huggingface/tokenizers) file (generally has a .json extension) that
+            contains everything needed to load the tokenizer.
+        unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        bos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
+            The beginning of sequence token.
+        eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
+            The end of sequence token.
+        add_prefix_space (`bool`, *optional*, defaults to `False`):
+            Whether or not to add an initial space to the input. This allows the leading word to be treated just like
+            any other word (the GPT-2 tokenizer detects the beginning of a word by its preceding space).
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    model_input_names = ["input_ids", "attention_mask"]
+    slow_tokenizer_class = GPT2Tokenizer
+
+    def __init__(
+        self,
+        vocab_file=None,
+        merges_file=None,
+        tokenizer_file=None,
+        unk_token="<|endoftext|>",
+        bos_token="<|endoftext|>",
+        eos_token="<|endoftext|>",
+        add_prefix_space=False,
+        **kwargs,
+    ):
+        super().__init__(
+            vocab_file=vocab_file,
+            merges_file=merges_file,
+            tokenizer_file=tokenizer_file,
+            unk_token=unk_token,
+            bos_token=bos_token,
+            eos_token=eos_token,
+            add_prefix_space=add_prefix_space,
+            **kwargs,
+        )
+
+        self.add_bos_token = kwargs.pop("add_bos_token", False)
+
+    def _batch_encode_plus(self, *args, **kwargs) -> BatchEncoding:
+        is_split_into_words = kwargs.get("is_split_into_words", False)
+        assert self.add_prefix_space or not is_split_into_words, (
+            f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True "
+            "to use it with pretokenized inputs."
+        )
+
+        return super()._batch_encode_plus(*args, **kwargs)
+
+    def _encode_plus(self, *args, **kwargs) -> BatchEncoding:
+        is_split_into_words = kwargs.get("is_split_into_words", False)
+
+        assert self.add_prefix_space or not is_split_into_words, (
+            f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True "
+            "to use it with pretokenized inputs."
+        )
+
+        return super()._encode_plus(*args, **kwargs)
+
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
+        files = self._tokenizer.model.save(save_directory, name=filename_prefix)
+        return tuple(files)
+
+
+__all__ = ["GPT2TokenizerFast"]
diff --git a/phivenv/Lib/site-packages/transformers/models/gpt2/tokenization_gpt2_tf.py b/phivenv/Lib/site-packages/transformers/models/gpt2/tokenization_gpt2_tf.py
new file mode 100644
index 0000000000000000000000000000000000000000..145a45da0db6d36f75f5cec6091027e36541184e
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/gpt2/tokenization_gpt2_tf.py
@@ -0,0 +1,119 @@
+import os
+from typing import Optional, Union
+
+import tensorflow as tf
+from tensorflow_text import pad_model_inputs
+
+from ...modeling_tf_utils import keras
+from ...utils.import_utils import is_keras_nlp_available, requires
+from .tokenization_gpt2 import GPT2Tokenizer
+
+
+if is_keras_nlp_available():
+    from keras_nlp.tokenizers import BytePairTokenizer
+
+
+@requires(backends=("keras_nlp",))
+class TFGPT2Tokenizer(keras.layers.Layer):
+    """
+    This is an in-graph tokenizer for GPT2. It should be initialized similarly to other tokenizers, using the
+    `from_pretrained()` method. It can also be initialized with the `from_tokenizer()` method, which imports settings
+    from an existing standard tokenizer object.
+
+    In-graph tokenizers, unlike other Hugging Face tokenizers, are actually Keras layers and are designed to be run
+    when the model is called, rather than during preprocessing. As a result, they have somewhat more limited options
+    than standard tokenizer classes. They are most useful when you want to create an end-to-end model that goes
+    straight from `tf.string` inputs to outputs.
+
+    Args:
+        vocab (dict[str, int]): Vocabulary dict for Byte Pair Tokenizer
+        merges (list[str]): Merges list for Byte Pair Tokenizer
+    """
+
+    def __init__(
+        self,
+        vocab: dict[str, int],
+        merges: list[str],
+        max_length: Optional[int] = None,
+        pad_token_id: Optional[int] = None,
+    ):
+        super().__init__()
+        self.pad_token_id = pad_token_id
+        self.max_length = max_length
+        self.vocab = vocab
+        self.merges = merges
+
+        self.tf_tokenizer = BytePairTokenizer(vocab, merges, sequence_length=max_length)
+
+    @classmethod
+    def from_tokenizer(cls, tokenizer: GPT2Tokenizer, *args, **kwargs):
+        """Creates TFGPT2Tokenizer from GPT2Tokenizer
+
+        Args:
+            tokenizer (GPT2Tokenizer)
+
+        Examples:
+
+        ```python
+        from transformers import AutoTokenizer, TFGPT2Tokenizer
+
+        tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
+        tf_tokenizer = TFGPT2Tokenizer.from_tokenizer(tokenizer)
+        ```
+        """
+        merges = [" ".join(m) for m in tokenizer.bpe_ranks]
+        vocab = tokenizer.get_vocab()
+        return cls(vocab, merges, *args, **kwargs)
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], *init_inputs, **kwargs):
+        """Creates TFGPT2Tokenizer from pretrained GPT2Tokenizer
+
+        Args:
+            pretrained_model_name_or_path (Union[str, os.PathLike]): Path to pretrained model
+
+        Examples:
+
+        ```python
+        from transformers import TFGPT2Tokenizer
+
+        tf_tokenizer = TFGPT2Tokenizer.from_pretrained("openai-community/gpt2")
+        ```
+        """
+        tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model_name_or_path, *init_inputs, **kwargs)
+        return cls.from_tokenizer(tokenizer, *init_inputs, **kwargs)
+
+    @classmethod
+    def from_config(cls, config):
+        """Creates TFGPT2Tokenizer from configurations
+
+        Args:
+            config (Dict): Dictionary with keys such as stated in `get_config`.
+        """
+        return cls(**config)
+
+    def get_config(self):
+        return {
+            "vocab": self.vocab,
+            "merges": self.merges,
+            "max_length": self.max_length,
+            "pad_token_id": self.pad_token_id,
+        }
+
+    def call(self, x, max_length: Optional[int] = None):
+        input_ids = self.tf_tokenizer(x)
+        attention_mask = tf.ones_like(input_ids)
+
+        if self.pad_token_id is not None:
+            # pad the tokens up to max length
+            max_length = max_length if max_length is not None else self.max_length
+
+            if max_length is not None:
+                input_ids, attention_mask = pad_model_inputs(
+                    input_ids, max_seq_length=max_length, pad_value=self.pad_token_id
+                )
+
+        return {"attention_mask": attention_mask, "input_ids": input_ids}
+
+
+__all__ = ["TFGPT2Tokenizer"]
diff --git a/phivenv/Lib/site-packages/transformers/models/gpt_bigcode/__init__.py b/phivenv/Lib/site-packages/transformers/models/gpt_bigcode/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..92e985d92734550a5b0635941294669386d35749
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/gpt_bigcode/__init__.py
@@ -0,0 +1,27 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_gpt_bigcode import *
+    from .modeling_gpt_bigcode import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/phivenv/Lib/site-packages/transformers/models/gpt_bigcode/__pycache__/__init__.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/gpt_bigcode/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3ffed576e3b1be3e6d277f70672c346240e10aae
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/gpt_bigcode/__pycache__/__init__.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/gpt_bigcode/__pycache__/configuration_gpt_bigcode.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/gpt_bigcode/__pycache__/configuration_gpt_bigcode.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7ebb29a764722dc36ff18cb43960c56b0aba16b5
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/gpt_bigcode/__pycache__/configuration_gpt_bigcode.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/gpt_bigcode/__pycache__/modeling_gpt_bigcode.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/gpt_bigcode/__pycache__/modeling_gpt_bigcode.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b442d5551b763abdee5f1f109c427a88c570108d
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/gpt_bigcode/__pycache__/modeling_gpt_bigcode.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/gpt_bigcode/configuration_gpt_bigcode.py b/phivenv/Lib/site-packages/transformers/models/gpt_bigcode/configuration_gpt_bigcode.py
new file mode 100644
index 0000000000000000000000000000000000000000..127a0eed4732c15ef565a306a1a25f86b4e51ce4
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/gpt_bigcode/configuration_gpt_bigcode.py
@@ -0,0 +1,145 @@
+# coding=utf-8
+# Copyright 2023 The BigCode team and HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""GPTBigCode configuration"""
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class GPTBigCodeConfig(PretrainedConfig):
+    """
+    This is the configuration class to store the configuration of a [`GPTBigCodeModel`]. It is used to instantiate a
+    GPTBigCode model according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of the GPTBigCode
+    [gpt_bigcode](https://huggingface.co/gpt_bigcode) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 50257):
+            Vocabulary size of the GPTBigCode model. Defines the number of different tokens that can be represented by
+            the `input_ids` passed when calling [`GPTBigCodeModel`].
+        n_positions (`int`, *optional*, defaults to 1024):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            just in case (e.g., 512 or 1024 or 2048).
+        n_embd (`int`, *optional*, defaults to 768):
+            Dimensionality of the embeddings and hidden states.
+        n_layer (`int`, *optional*, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        n_head (`int`, *optional*, defaults to 12):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        n_inner (`int`, *optional*, defaults to None):
+            Dimensionality of the inner feed-forward layers. `None` will set it to 4 times n_embd
+        activation_function (`str`, *optional*, defaults to `"gelu_pytorch_tanh"`):
+            Activation function, to be selected in the list `["relu", "silu", "gelu", "tanh", "gelu_new",
+            "gelu_pytorch_tanh"]`.
+        resid_pdrop (`float`, *optional*, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        embd_pdrop (`float`, *optional*, defaults to 0.1):
+            The dropout ratio for the embeddings.
+        attn_pdrop (`float`, *optional*, defaults to 0.1):
+            The dropout ratio for the attention.
+        layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
+            The epsilon to use in the layer normalization layers.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        scale_attn_weights (`bool`, *optional*, defaults to `True`):
+            Scale attention weights by dividing by sqrt(head_dim), where head_dim is `n_embd // n_head`.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models).
+        attention_softmax_in_fp32 (`bool`, *optional*, defaults to `True`):
+            Whether to call the fused softmax in float32.
+        scale_attention_softmax_in_fp32 (`bool`, *optional*, defaults to `True`):
+            Whether to scale the attention softmax in float32.
+        multi_query (`bool`, *optional*, defaults to `True`):
+            Whether to use Multi-Query Attention (`True`) or Multi-Head Attention (`False`).
+    Example:
+
+    ```python
+    >>> from transformers import GPTBigCodeConfig, GPTBigCodeModel
+
+    >>> # Initializing a GPTBigCode configuration
+    >>> configuration = GPTBigCodeConfig()
+
+    >>> # Initializing a model (with random weights) from the configuration
+    >>> model = GPTBigCodeModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "gpt_bigcode"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    attribute_map = {
+        "hidden_size": "n_embd",
+        "max_position_embeddings": "n_positions",
+        "num_attention_heads": "n_head",
+        "num_hidden_layers": "n_layer",
+    }
+
+    def __init__(
+        self,
+        vocab_size=50257,
+        n_positions=1024,
+        n_embd=768,
+        n_layer=12,
+        n_head=12,
+        n_inner=None,
+        activation_function="gelu_pytorch_tanh",
+        resid_pdrop=0.1,
+        embd_pdrop=0.1,
+        attn_pdrop=0.1,
+        layer_norm_epsilon=1e-5,
+        initializer_range=0.02,
+        scale_attn_weights=True,
+        use_cache=True,
+        bos_token_id=50256,
+        eos_token_id=50256,
+        attention_softmax_in_fp32=True,
+        scale_attention_softmax_in_fp32=True,
+        multi_query=True,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.n_positions = n_positions
+        self.n_embd = n_embd
+        self.n_layer = n_layer
+        self.n_head = n_head
+        self.n_inner = n_inner
+        self.activation_function = activation_function
+        self.resid_pdrop = resid_pdrop
+        self.embd_pdrop = embd_pdrop
+        self.attn_pdrop = attn_pdrop
+        self.layer_norm_epsilon = layer_norm_epsilon
+        self.initializer_range = initializer_range
+        self.scale_attn_weights = scale_attn_weights
+        self.use_cache = use_cache
+        self.attention_softmax_in_fp32 = attention_softmax_in_fp32
+        self.scale_attention_softmax_in_fp32 = scale_attention_softmax_in_fp32
+        self.multi_query = multi_query
+        self.num_key_value_heads = 1 if multi_query else n_head
+
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
+
+        super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
+
+
+__all__ = ["GPTBigCodeConfig"]
diff --git a/phivenv/Lib/site-packages/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py b/phivenv/Lib/site-packages/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py
new file mode 100644
index 0000000000000000000000000000000000000000..3f901354a8636f1d48ad42e6e0192f0cc51b74c9
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py
@@ -0,0 +1,935 @@
+# coding=utf-8
+# Copyright 2023 The Bigcode team and HuggingFace Inc. team.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch GPTBigCode model."""
+
+import math
+from typing import Callable, Optional, Union
+
+import torch
+import torch.utils.checkpoint
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+
+from ...activations import ACT2FN
+from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache
+from ...generation import GenerationMixin
+from ...masking_utils import create_causal_mask
+from ...modeling_flash_attention_utils import is_flash_attn_available
+from ...modeling_outputs import (
+    BaseModelOutputWithPastAndCrossAttentions,
+    CausalLMOutputWithCrossAttentions,
+    SequenceClassifierOutputWithPast,
+    TokenClassifierOutput,
+)
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from ...utils import (
+    auto_docstring,
+    can_return_tuple,
+    logging,
+)
+from .configuration_gpt_bigcode import GPTBigCodeConfig
+
+
+if is_flash_attn_available():
+    pass
+
+
+logger = logging.get_logger(__name__)
+
+
+# Fused kernels
+# Use separate functions for each case because conditionals prevent kernel fusion.
+# TODO: Could have better fused kernels depending on scaling, dropout and head mask.
+#  Is it doable without writing 32 functions?
+@torch.jit.script
+def upcast_masked_softmax(
+    x: torch.Tensor, mask: torch.Tensor, mask_value: torch.Tensor, scale: float, softmax_dtype: torch.dtype
+):
+    input_dtype = x.dtype
+    x = x.to(softmax_dtype) * scale
+    x = torch.where(mask, x, mask_value)
+    x = torch.nn.functional.softmax(x, dim=-1).to(input_dtype)
+    return x
+
+
+@torch.jit.script
+def upcast_softmax(x: torch.Tensor, scale: float, softmax_dtype: torch.dtype):
+    input_dtype = x.dtype
+    x = x.to(softmax_dtype) * scale
+    x = torch.nn.functional.softmax(x, dim=-1).to(input_dtype)
+    return x
+
+
+@torch.jit.script
+def masked_softmax(x: torch.Tensor, mask: torch.Tensor, mask_value: torch.Tensor):
+    x = torch.where(mask, x, mask_value)
+    x = torch.nn.functional.softmax(x, dim=-1)
+    return x
+
+
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """
+    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
+    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
+    """
+    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+    if n_rep == 1:
+        return hidden_states
+    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
+    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
+
+
+def eager_attention_forward(
+    module: nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: Optional[torch.Tensor],
+    scaling: float,
+    dropout: float = 0.0,
+    head_mask: Optional[torch.Tensor] = None,
+    **kwargs,
+):
+    key_states = repeat_kv(key, module.num_key_value_groups)
+    value_states = repeat_kv(value, module.num_key_value_groups)
+
+    attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
+    if attention_mask is not None:
+        causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
+        attn_weights = attn_weights + causal_mask
+
+    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
+    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
+
+    if head_mask is not None:
+        attn_weights = attn_weights * head_mask.view(1, -1, 1, 1)
+
+    attn_output = torch.matmul(attn_weights, value_states)
+    attn_output = attn_output.transpose(1, 2).contiguous()
+
+    return attn_output, attn_weights
+
+
+class GPTBigCodeAttention(nn.Module):
+    def __init__(self, config, is_cross_attention=False, layer_idx=None):
+        super().__init__()
+        self.config = config
+
+        self.mask_value = None
+        self.multi_query = config.multi_query
+        self.embed_dim = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = self.embed_dim // self.num_heads
+        self.kv_heads = 1 if self.multi_query else self.num_heads
+        self.kv_dim = self.kv_heads * self.head_dim
+        self.num_key_value_groups = self.num_heads // self.kv_heads
+        self.split_size = self.embed_dim
+        self.is_causal = True
+
+        if self.head_dim * self.num_heads != self.embed_dim:
+            raise ValueError(
+                f"`embed_dim` must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
+                f" {self.num_heads})."
+            )
+
+        self.scale_attn_weights = config.scale_attn_weights
+        self.scaling = self.head_dim**-0.5 if config.scale_attn_weights else 1.0
+        self.is_cross_attention = is_cross_attention
+
+        self.layer_idx = layer_idx
+        self.attention_softmax_in_fp32 = config.attention_softmax_in_fp32
+        self.scale_attention_softmax_in_fp32 = (
+            config.scale_attention_softmax_in_fp32 and config.attention_softmax_in_fp32
+        )
+        self.attn_pdrop = config.attn_pdrop
+
+        if self.is_cross_attention:
+            if self.multi_query:
+                raise NotImplementedError("Multi-Query Attention not supported for cross_attention")
+
+            self.c_attn = nn.Linear(self.embed_dim, 2 * self.embed_dim)
+            self.q_attn = nn.Linear(self.embed_dim, self.embed_dim)
+        else:
+            self.c_attn = nn.Linear(self.embed_dim, self.embed_dim + 2 * self.kv_dim)
+
+        self.c_proj = nn.Linear(self.embed_dim, self.embed_dim)
+
+        self.attn_dropout = config.attn_pdrop
+        self.resid_dropout = nn.Dropout(config.resid_pdrop)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        layer_past: Optional[Cache] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        use_cache: Optional[bool] = False,
+        output_attentions: Optional[bool] = False,
+        cache_position: Optional[torch.Tensor] = None,
+        **kwargs,
+    ) -> Union[
+        tuple[torch.Tensor, Optional[torch.Tensor]],
+        tuple[torch.Tensor, Optional[torch.Tensor], tuple[torch.Tensor, ...]],
+    ]:
+        input_shape = hidden_states.shape[:-1]
+
+        if layer_past is not None:
+            if isinstance(layer_past, EncoderDecoderCache):
+                is_updated = layer_past.is_updated.get(self.layer_idx)
+                if self.is_cross_attention:
+                    # after the first generated id, we can subsequently re-use all key/value_states from cache
+                    curr_past_key_value = layer_past.cross_attention_cache
+                else:
+                    curr_past_key_value = layer_past.self_attention_cache
+            else:
+                curr_past_key_value = layer_past
+
+        if self.is_cross_attention:
+            if not hasattr(self, "q_attn") or not self.is_cross_attention:
+                raise ValueError(
+                    "If class is used as cross attention, the weights `q_attn` have to be defined. "
+                    "Please make sure to instantiate class with `GPTBigCodeAttention(..., is_cross_attention=True)`."
+                )
+            if layer_past is not None and is_updated:
+                # reuse k,v, cross_attentions
+                key = curr_past_key_value.layers[self.layer_idx].keys
+                value = curr_past_key_value.layers[self.layer_idx].values
+            else:
+                query = self.q_attn(hidden_states).view(*input_shape, -1, self.head_dim).transpose(1, 2)
+                key, value = self.c_attn(encoder_hidden_states).split((self.head_dim, self.head_dim), dim=-1)
+        else:
+            if self.multi_query:
+                query, key, value = (
+                    self.c_attn(hidden_states).unsqueeze(1).split((self.embed_dim, self.kv_dim, self.kv_dim), dim=3)
+                )
+                query = query.view(*input_shape, -1, self.head_dim).transpose(1, 2)
+            else:
+                query, key, value = (
+                    self.c_attn(hidden_states)
+                    .view(*hidden_states.shape[:2], self.num_heads, 3 * self.head_dim)
+                    .transpose(1, 2)
+                    .split(3 * [self.head_dim], dim=3)
+                )
+
+        if layer_past is not None:
+            # save all key/value_states to cache to be re-used for fast auto-regressive generation
+            cache_position = cache_position if not self.is_cross_attention else None
+            key, value = curr_past_key_value.update(key, value, self.layer_idx, {"cache_position": cache_position})
+            # set flag that curr layer for cross-attn is already updated so we can re-use in subsequent calls
+            if self.is_cross_attention:
+                layer_past.is_updated[self.layer_idx] = True
+
+        attention_interface: Callable = eager_attention_forward
+        if self.config._attn_implementation != "eager":
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+        attn_output, attn_weights = attention_interface(
+            self,
+            query,
+            key,
+            value,
+            attention_mask,
+            dropout=0.0 if not self.training else self.attn_dropout,
+            scaling=self.scaling,
+            head_mask=head_mask,
+            **kwargs,
+        )
+
+        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
+        attn_output = self.c_proj(attn_output)
+        attn_output = self.resid_dropout(attn_output)
+        return attn_output, attn_weights
+
+
+class GPTBigCodeMLP(nn.Module):
+    def __init__(self, intermediate_size, config):
+        super().__init__()
+        embed_dim = config.hidden_size
+        self.c_fc = nn.Linear(embed_dim, intermediate_size)
+        self.c_proj = nn.Linear(intermediate_size, embed_dim)
+        self.act = ACT2FN[config.activation_function]
+        self.dropout = nn.Dropout(config.resid_pdrop)
+
+    # Copied from transformers.models.gpt2.modeling_gpt2.GPT2MLP.forward
+    def forward(self, hidden_states: Optional[tuple[torch.FloatTensor]]) -> torch.FloatTensor:
+        hidden_states = self.c_fc(hidden_states)
+        hidden_states = self.act(hidden_states)
+        hidden_states = self.c_proj(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        return hidden_states
+
+
+class GPTBigCodeBlock(nn.Module):
+    """One transformer layer: self-attention, optional cross-attention, then an MLP.
+
+    Each sub-layer is preceded by its own LayerNorm and wrapped in a residual connection.
+
+    Args:
+        config: Model configuration. Reads `hidden_size`, `n_inner`, `layer_norm_epsilon`, `add_cross_attention`
+            and `multi_query`.
+        layer_idx: Index forwarded to the attention modules.
+
+    Raises:
+        NotImplementedError: If `config.add_cross_attention` and `config.multi_query` are both set.
+    """
+
+    def __init__(self, config, layer_idx=None):
+        super().__init__()
+        hidden_size = config.hidden_size
+        # The MLP width falls back to 4x the hidden size when `n_inner` is not configured.
+        self.inner_dim = config.n_inner if config.n_inner is not None else 4 * hidden_size
+
+        self.ln_1 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
+
+        self.attn = GPTBigCodeAttention(config, layer_idx=layer_idx)
+
+        self.ln_2 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
+
+        # The cross-attention modules only exist when requested; `forward` checks for them with `hasattr`.
+        if config.add_cross_attention:
+            if config.multi_query:
+                raise NotImplementedError("Cross-attention not implemented for MQA")
+
+            self.crossattention = GPTBigCodeAttention(config, is_cross_attention=True, layer_idx=layer_idx)
+
+            self.ln_cross_attn = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
+
+        self.mlp = GPTBigCodeMLP(self.inner_dim, config)
+
+    def forward(
+        self,
+        hidden_states: Optional[tuple[torch.Tensor]],
+        layer_past: Optional[Cache] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        use_cache: Optional[bool] = False,
+        output_attentions: Optional[bool] = False,
+        cache_position: Optional[torch.Tensor] = None,
+        **kwargs,
+    ) -> Union[
+        tuple[torch.Tensor], tuple[torch.Tensor, torch.Tensor], tuple[torch.Tensor, torch.Tensor, torch.Tensor]
+    ]:
+        """Run the layer on `hidden_states`.
+
+        Args:
+            hidden_states: Input to the layer.
+            layer_past: Cache object forwarded to the self-attention call only.
+            attention_mask: Mask forwarded to both the self-attention and the cross-attention call.
+            head_mask: Head mask forwarded to both attention calls.
+            encoder_hidden_states: When not `None`, the cross-attention sub-layer is executed on them.
+            encoder_attention_mask: Mask forwarded to the cross-attention call.
+            use_cache: Forwarded to the self-attention call only.
+            output_attentions: Forwarded to both attention calls.
+            cache_position: Forwarded to both attention calls.
+            **kwargs: Extra keyword arguments forwarded unchanged to both attention calls.
+
+        Returns:
+            A tuple whose first item is the layer output, followed by every extra item returned by the
+            self-attention call and then by the cross-attention call (when it runs).
+
+        Raises:
+            ValueError: If `encoder_hidden_states` is given but the block has no `crossattention` module.
+        """
+        # Self-attention sub-layer: LayerNorm -> attention -> residual add.
+        residual = hidden_states
+        hidden_states = self.ln_1(hidden_states)
+        attn_outputs = self.attn(
+            hidden_states,
+            layer_past=layer_past,
+            attention_mask=attention_mask,
+            head_mask=head_mask,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            cache_position=cache_position,
+            **kwargs,
+        )
+        attn_output = attn_outputs[0]  # first item is the attention output; the rest is passed through below
+        outputs = attn_outputs[1:]
+        # residual connection
+        hidden_states = attn_output + residual
+
+        if encoder_hidden_states is not None:
+            # Cross-attention sub-layer over the encoder states: LayerNorm -> attention -> residual add.
+            if not hasattr(self, "crossattention"):
+                raise ValueError(
+                    f"If `encoder_hidden_states` are passed, {self} has to be instantiated with "
+                    "cross-attention layers by setting `config.add_cross_attention=True`"
+                )
+            residual = hidden_states
+            hidden_states = self.ln_cross_attn(hidden_states)
+            cross_attn_outputs = self.crossattention(
+                hidden_states,
+                attention_mask=attention_mask,
+                head_mask=head_mask,
+                encoder_hidden_states=encoder_hidden_states,
+                encoder_attention_mask=encoder_attention_mask,
+                output_attentions=output_attentions,
+                cache_position=cache_position,
+                **kwargs,
+            )
+            attn_output = cross_attn_outputs[0]
+            # residual connection
+            hidden_states = residual + attn_output
+            outputs = outputs + cross_attn_outputs[1:]  # add cross attentions if we output attention weights
+
+        # Feed-forward sub-layer: LayerNorm -> MLP -> residual add.
+        residual = hidden_states
+        hidden_states = self.ln_2(hidden_states)
+        feed_forward_hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + feed_forward_hidden_states
+        return (hidden_states,) + outputs
+
+
+# Shared base class of the GPTBigCode models: declares class-level settings (presumably consumed by
+# `PreTrainedModel` -- confirm against the base class) and the weight-initialisation scheme.
+@auto_docstring
+class GPTBigCodePreTrainedModel(PreTrainedModel):
+    config: GPTBigCodeConfig
+    base_model_prefix = "transformer"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["GPTBigCodeBlock"]
+    _skip_keys_device_placement = "past_key_values"
+    _supports_flash_attn = True
+    _supports_sdpa = True
+
+    def __init__(self, *inputs, **kwargs):
+        # Pure pass-through to the parent constructor.
+        super().__init__(*inputs, **kwargs)
+
+    def _init_weights(self, module):
+        """Initialize the weights of a single `module`.
+
+        The branches are tested in order, so a `GPTBigCodeMLP` / `GPTBigCodeAttention` module is handled by the
+        first branch and only its `c_proj` weight is re-drawn there.
+
+        Args:
+            module: The module to initialise; modules matching none of the branches are left unchanged.
+        """
+        if isinstance(module, (GPTBigCodeMLP, GPTBigCodeAttention)):
+            # Reinitialize selected weights subject to the OpenAI GPT-2 Paper Scheme:
+            #   > A modified initialization which accounts for the accumulation on the residual path with model depth. Scale
+            #   > the weights of residual layers at initialization by a factor of 1/√N where N is the # of residual layers.
+            #   >   -- GPT-2 :: https://openai.com/blog/better-language-models/
+            #
+            # Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py
+            # Here the std is initializer_range / sqrt(2 * n_layer); the bias of `c_proj` is not touched.
+            module.c_proj.weight.data.normal_(
+                mean=0.0, std=(self.config.initializer_range / math.sqrt(2 * self.config.n_layer))
+            )
+            # NOTE(review): presumably this flag keeps the generic nn.Linear branch from re-drawing `c_proj`
+            # later -- confirm against the caller of `_init_weights`.
+            module.c_proj._is_hf_initialized = True
+        elif isinstance(module, nn.Linear):
+            # Slightly different from the TF version which uses truncated_normal for initialization
+            # cf https://github.com/pytorch/pytorch/pull/5617
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            # Zero the row of the padding index, when one is set.
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+        elif isinstance(module, nn.LayerNorm):
+            # LayerNorm starts with zero bias and unit weight.
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+
+
+# Decoder stack of GPTBigCode: token and position embeddings, `config.num_hidden_layers` GPTBigCodeBlock layers
+# and a final LayerNorm.
+@auto_docstring
+class GPTBigCodeModel(GPTBigCodePreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.multi_query = config.multi_query
+        self.embed_dim = config.hidden_size
+
+        # Token embeddings and learned position embeddings.
+        self.wte = nn.Embedding(config.vocab_size, self.embed_dim)
+        self.wpe = nn.Embedding(config.max_position_embeddings, self.embed_dim)
+
+        self.drop = nn.Dropout(config.embd_pdrop)
+        self.h = nn.ModuleList([GPTBigCodeBlock(config, layer_idx=i) for i in range(config.num_hidden_layers)])
+        self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)
+
+        # Lower-triangular boolean matrix registered as a non-persistent buffer.
+        max_positions = config.max_position_embeddings
+        self.register_buffer(
+            "bias", torch.tril(torch.ones((max_positions, max_positions), dtype=torch.bool)), persistent=False
+        )
+
+        self.gradient_checkpointing = False
+
+        # Cache which attention implementation was selected in the config.
+        self._use_sdpa = config._attn_implementation == "sdpa"
+        self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        # The token embedding table is the input embedding of this model.
+        return self.wte
+
+    def set_input_embeddings(self, new_embeddings):
+        # Replace the token embedding table.
+        self.wte = new_embeddings
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        past_key_values: Optional[list[torch.Tensor]] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.Tensor] = None,
+        **kwargs,
+    ) -> Union[tuple, BaseModelOutputWithPastAndCrossAttentions]:
+        r"""
+        input_ids (`torch.Tensor` of shape `(batch_size, input_ids_length)`):
+            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
+            `past_key_values.get_seq_length()` (`sequence_length` of input past key value states). Indices of input
+            sequence tokens in the vocabulary.
+
+            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
+            `input_ids`.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        """
+        # Resolve unset flags from the configuration defaults.
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        # NOTE(review): `return_dict` is resolved here but not read again in this method; presumably
+        # `@can_return_tuple` takes care of the tuple conversion -- confirm.
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # Exactly one of `input_ids` / `inputs_embeds` must be given (both or neither raises).
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+        elif input_ids is not None:
+            input_shape = input_ids.size()
+            input_ids = input_ids.view(-1, input_shape[-1])
+            batch_size = input_ids.shape[0]
+        elif inputs_embeds is not None:
+            input_shape = inputs_embeds.size()[:-1]
+            batch_size = inputs_embeds.shape[0]
+        else:
+            # Defensive guard: not reachable once the exclusive-or check above has passed.
+            raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+        if batch_size <= 0:
+            raise ValueError("batch_size has to be defined and > 0")
+
+        # Create a fresh cache when caching is requested, and upgrade legacy tuple caches.
+        if use_cache and past_key_values is None:
+            past_key_values = EncoderDecoderCache(DynamicCache(config=self.config), DynamicCache(config=self.config))
+        if use_cache and isinstance(past_key_values, tuple):
+            logger.warning_once(
+                "Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. "
+                "You should pass an instance of `EncoderDecoderCache` instead, e.g. "
+                "`past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`."
+            )
+            past_key_values = EncoderDecoderCache.from_legacy_cache(past_key_values)
+
+        if inputs_embeds is None:
+            inputs_embeds = self.wte(input_ids)
+
+        # Default positions continue from the number of tokens already held by the cache.
+        if cache_position is None:
+            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+            cache_position = torch.arange(
+                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
+            )
+
+        if position_ids is None:
+            position_ids = cache_position.unsqueeze(0)
+
+        causal_mask = create_causal_mask(
+            config=self.config,
+            input_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            cache_position=cache_position,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+        )
+
+        if self._use_flash_attention_2:
+            # Keep the encoder mask only when it actually masks something (contains a 0).
+            encoder_attention_mask = (
+                encoder_attention_mask.bool()
+                if (encoder_attention_mask is not None and 0 in encoder_attention_mask)
+                else None
+            )
+        else:
+            # If a 2D or 3D attention mask is provided for the cross-attention
+            # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
+            if (
+                self.config.add_cross_attention
+                and encoder_hidden_states is not None
+                and encoder_attention_mask is not None
+            ):
+                if encoder_attention_mask.dim() == 2:
+                    # `Tensor.unsqueeze` is out-of-place: the result has to be re-bound, otherwise a 2D mask
+                    # stays 2D and the assertion below always fails.
+                    encoder_attention_mask = encoder_attention_mask.unsqueeze(1)
+                assert encoder_attention_mask.dim() == 3
+                encoder_attention_mask = encoder_attention_mask.bool().unsqueeze(2 if self.multi_query else 1)
+            else:
+                encoder_attention_mask = None
+
+        # Prepare head mask if needed
+        # 1.0 in head_mask indicate we keep the head
+        # attention_probs has shape bsz x n_heads x N x N
+        # head_mask has shape n_layer x batch x n_heads x N x N
+        head_mask = self.get_head_mask(head_mask, self.config.n_layer)
+
+        position_embeds = self.wpe(position_ids)
+        hidden_states = inputs_embeds + position_embeds.to(inputs_embeds.device)
+
+        # Token type ids are embedded with the token embedding table `wte` and added to the hidden states.
+        if token_type_ids is not None:
+            token_type_ids = token_type_ids.view(-1, input_shape[-1])
+            token_type_embeds = self.wte(token_type_ids)
+            hidden_states = hidden_states + token_type_embeds
+
+        hidden_states = self.drop(hidden_states)
+        output_shape = input_shape + (hidden_states.size(-1),)
+
+        # Accumulators stay `None` when the corresponding output was not requested.
+        all_self_attentions = () if output_attentions else None
+        all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
+        all_hidden_states = () if output_hidden_states else None
+        for i, block in enumerate(self.h):
+            # Record the input of every layer; the final output is appended after the loop.
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)
+
+            outputs = block(
+                hidden_states,
+                past_key_values,
+                causal_mask,
+                head_mask[i],
+                encoder_hidden_states,  # as a positional argument for gradient checkpointing
+                encoder_attention_mask=encoder_attention_mask,
+                use_cache=use_cache,
+                output_attentions=output_attentions,
+                cache_position=cache_position,
+                **kwargs,
+            )
+
+            hidden_states = outputs[0]
+            if output_attentions:
+                all_self_attentions = all_self_attentions + (outputs[1],)
+                if self.config.add_cross_attention:
+                    all_cross_attentions = all_cross_attentions + (outputs[2],)
+
+        hidden_states = self.ln_f(hidden_states)
+
+        hidden_states = hidden_states.view(output_shape)
+        # Add last hidden state
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+
+        return BaseModelOutputWithPastAndCrossAttentions(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attentions,
+            cross_attentions=all_cross_attentions,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    The GPT_BIGCODE Model transformer with a language modeling head on top (linear layer with weights tied to the input
+    embeddings).
+    """
+)
+class GPTBigCodeForCausalLM(GPTBigCodePreTrainedModel, GenerationMixin):
+    _tied_weights_keys = ["lm_head.weight"]
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.transformer = GPTBigCodeModel(config)
+        # Bias-free projection from the hidden width to vocabulary-sized logits.
+        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        past_key_values: Optional[tuple[tuple[torch.Tensor]]] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.Tensor] = None,
+        **kwargs,
+    ) -> Union[tuple, CausalLMOutputWithCrossAttentions]:
+        r"""
+        input_ids (`torch.Tensor` of shape `(batch_size, input_ids_length)`):
+            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
+            `past_key_values.get_seq_length()` (`sequence_length` of input past key value states). Indices of input
+            sequence tokens in the vocabulary.
+
+            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
+            `input_ids`.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        labels (`torch.Tensor` of shape `(batch_size, input_ids_length)`, *optional*):
+            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
+            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
+            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
+        """
+        # Fall back to the configured default when the caller did not pick an output format.
+        if return_dict is None:
+            return_dict = self.config.use_return_dict
+
+        # The backbone receives every model argument; `**kwargs` is only handed to the loss function below.
+        decoder_outputs = self.transformer(
+            input_ids,
+            past_key_values=past_key_values,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_attention_mask,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            cache_position=cache_position,
+        )
+        logits = self.lm_head(decoder_outputs[0])
+
+        # A loss is produced only when labels are supplied.
+        loss = (
+            None
+            if labels is None
+            else self.loss_function(
+                logits,
+                labels,
+                vocab_size=self.config.vocab_size,
+                **kwargs,
+            )
+        )
+
+        if return_dict:
+            return CausalLMOutputWithCrossAttentions(
+                loss=loss,
+                logits=logits,
+                past_key_values=decoder_outputs.past_key_values,
+                hidden_states=decoder_outputs.hidden_states,
+                attentions=decoder_outputs.attentions,
+                cross_attentions=decoder_outputs.cross_attentions,
+            )
+
+        # Tuple output: logits followed by the remaining backbone outputs, with the loss prepended when present.
+        tuple_output = (logits,) + decoder_outputs[1:]
+        return tuple_output if loss is None else (loss,) + tuple_output
+
+
+@auto_docstring(
+    custom_intro="""
+    The GPTBigCode Model transformer with a sequence classification head on top (linear layer).
+
+    [`GPTBigCodeForSequenceClassification`] uses the last token in order to do the classification, as other causal
+    models (e.g. GPT-1) do.
+
+    Since it does classification on the last token, it requires to know the position of the last token. If a
+    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
+    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
+    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
+    each row of the batch).
+    """
+)
+class GPTBigCodeForSequenceClassification(GPTBigCodePreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+        self.transformer = GPTBigCodeModel(config)
+        # Bias-free scoring head applied to every position; one position per row is selected in `forward`.
+        self.score = nn.Linear(config.n_embd, self.num_labels, bias=False)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        past_key_values: Optional[tuple[tuple[torch.Tensor]]] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        **kwargs,
+    ) -> Union[tuple, SequenceClassifierOutputWithPast]:
+        r"""
+        input_ids (`torch.Tensor` of shape `(batch_size, input_ids_length)`):
+            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
+            `past_key_values.get_seq_length()` (`sequence_length` of input past key value states). Indices of input
+            sequence tokens in the vocabulary.
+
+            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
+            `input_ids`.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        labels (`torch.Tensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # Unlike the loss-free arguments, `**kwargs` is forwarded to the backbone here.
+        transformer_outputs = self.transformer(
+            input_ids,
+            past_key_values=past_key_values,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            **kwargs,
+        )
+        hidden_states = transformer_outputs[0]
+        logits = self.score(hidden_states)
+
+        if input_ids is not None:
+            batch_size, sequence_length = input_ids.shape[:2]
+        else:
+            batch_size, sequence_length = inputs_embeds.shape[:2]
+
+        # Without a pad token the last real token of each row cannot be located, so only batch size 1 is accepted.
+        if self.config.pad_token_id is None and batch_size != 1:
+            raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
+        if self.config.pad_token_id is None:
+            # Index -1 selects the last position of the row.
+            last_non_pad_token = -1
+        elif input_ids is not None:
+            # To handle both left- and right- padding, we take the rightmost token that is not equal to pad_token_id
+            # (position index times the 0/1 non-pad mask, then argmax over the sequence axis).
+            non_pad_mask = (input_ids != self.config.pad_token_id).to(logits.device, torch.int32)
+            token_indices = torch.arange(input_ids.shape[-1], device=logits.device, dtype=torch.int32)
+            last_non_pad_token = (token_indices * non_pad_mask).argmax(-1)
+        else:
+            # Padding cannot be detected from embeddings: fall back to the last position and warn once.
+            last_non_pad_token = -1
+            logger.warning_once(
+                f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
+                "unexpected if using padding tokens in conjunction with `inputs_embeds.`"
+            )
+
+        # Pick one position per batch row.
+        pooled_logits = logits[torch.arange(batch_size, device=logits.device), last_non_pad_token]
+
+        loss = None
+        if labels is not None:
+            labels = labels.to(logits.device)
+
+            # Infer the problem type on first use; the choice is written back to the config object.
+            if self.config.problem_type is None:
+                if self.num_labels == 1:
+                    self.config.problem_type = "regression"
+                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
+                    self.config.problem_type = "single_label_classification"
+                else:
+                    self.config.problem_type = "multi_label_classification"
+
+            if self.config.problem_type == "regression":
+                loss_fct = MSELoss()
+                if self.num_labels == 1:
+                    loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
+                else:
+                    loss = loss_fct(pooled_logits, labels)
+            elif self.config.problem_type == "single_label_classification":
+                loss_fct = CrossEntropyLoss()
+                loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
+            elif self.config.problem_type == "multi_label_classification":
+                loss_fct = BCEWithLogitsLoss()
+                loss = loss_fct(pooled_logits, labels)
+        # Tuple output: pooled logits followed by the remaining backbone outputs, loss prepended when present.
+        if not return_dict:
+            output = (pooled_logits,) + transformer_outputs[1:]
+            return ((loss,) + output) if loss is not None else output
+
+        return SequenceClassifierOutputWithPast(
+            loss=loss,
+            logits=pooled_logits,
+            past_key_values=transformer_outputs.past_key_values,
+            hidden_states=transformer_outputs.hidden_states,
+            attentions=transformer_outputs.attentions,
+        )
+
+
+@auto_docstring
+class GPTBigCodeForTokenClassification(GPTBigCodePreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+
+        self.transformer = GPTBigCodeModel(config)
+        # Dropout rate precedence: `classifier_dropout`, then `hidden_dropout`, then a fixed 0.1.
+        dropout_rate = getattr(config, "classifier_dropout", None)
+        if dropout_rate is None:
+            dropout_rate = getattr(config, "hidden_dropout", None)
+        if dropout_rate is None:
+            dropout_rate = 0.1
+        self.dropout = nn.Dropout(dropout_rate)
+        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        past_key_values: Optional[tuple[tuple[torch.Tensor]]] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, TokenClassifierOutput]:
+        r"""
+        input_ids (`torch.Tensor` of shape `(batch_size, input_ids_length)`):
+            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
+            `past_key_values.get_seq_length()` (`sequence_length` of input past key value states). Indices of input
+            sequence tokens in the vocabulary.
+
+            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
+            `input_ids`.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        labels (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+        """
+        # Fall back to the configured default when the caller did not pick an output format.
+        if return_dict is None:
+            return_dict = self.config.use_return_dict
+
+        backbone_outputs = self.transformer(
+            input_ids,
+            past_key_values=past_key_values,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        # Per-token logits: dropout on the backbone output, then the linear classifier.
+        token_logits = self.classifier(self.dropout(backbone_outputs[0]))
+
+        loss = None
+        if labels is not None:
+            # Cross-entropy over the flattened (token, label) pairs.
+            criterion = CrossEntropyLoss()
+            flat_logits = token_logits.view(-1, self.num_labels)
+            flat_labels = labels.view(-1).to(token_logits.device)
+            loss = criterion(flat_logits, flat_labels)
+
+        if return_dict:
+            return TokenClassifierOutput(
+                loss=loss,
+                logits=token_logits,
+                hidden_states=backbone_outputs.hidden_states,
+                attentions=backbone_outputs.attentions,
+            )
+
+        # Tuple output: logits followed by the backbone outputs from index 2 onwards, loss prepended when present.
+        tuple_output = (token_logits,) + backbone_outputs[2:]
+        return tuple_output if loss is None else (loss,) + tuple_output
+
+
+# Names exported by this module.
+__all__ = [
+    "GPTBigCodeForSequenceClassification",
+    "GPTBigCodeForTokenClassification",
+    "GPTBigCodeForCausalLM",
+    "GPTBigCodeModel",
+    "GPTBigCodePreTrainedModel",
+]
diff --git a/phivenv/Lib/site-packages/transformers/models/gpt_neo/__init__.py b/phivenv/Lib/site-packages/transformers/models/gpt_neo/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..578577f22882cdc5eea08928e274a18725cf4615
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/gpt_neo/__init__.py
@@ -0,0 +1,28 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+# Static type checkers see the real submodule contents through the star imports; at runtime the package entry
+# in `sys.modules` is replaced by a `_LazyModule` built from this file's import structure instead.
+if TYPE_CHECKING:
+    from .configuration_gpt_neo import *
+    from .modeling_flax_gpt_neo import *
+    from .modeling_gpt_neo import *
+else:
+    import sys
+
+    # NOTE(review): presumably `_LazyModule` defers importing the submodules until an attribute is first
+    # accessed -- confirm against its implementation.
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/phivenv/Lib/site-packages/transformers/models/gpt_neo/__pycache__/__init__.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/gpt_neo/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..640ccdc78f02582464fd179cdb2c39f3446c9a27
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/gpt_neo/__pycache__/__init__.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/gpt_neo/__pycache__/configuration_gpt_neo.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/gpt_neo/__pycache__/configuration_gpt_neo.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..dbffb7ae864a72c2232684d8f5ec513fd585fcbf
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/gpt_neo/__pycache__/configuration_gpt_neo.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/gpt_neo/__pycache__/modeling_flax_gpt_neo.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/gpt_neo/__pycache__/modeling_flax_gpt_neo.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..36f7b7a5b4922bfa02ddd1dff9acd15607562c76
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/gpt_neo/__pycache__/modeling_flax_gpt_neo.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/gpt_neo/__pycache__/modeling_gpt_neo.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/gpt_neo/__pycache__/modeling_gpt_neo.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..072507dca5472e6c313b6322323d18fc6ac33ed4
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/gpt_neo/__pycache__/modeling_gpt_neo.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/gpt_neo/configuration_gpt_neo.py b/phivenv/Lib/site-packages/transformers/models/gpt_neo/configuration_gpt_neo.py
new file mode 100644
index 0000000000000000000000000000000000000000..875a170277d2048dcadda9cd8f57205a11742797
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/gpt_neo/configuration_gpt_neo.py
@@ -0,0 +1,273 @@
+# coding=utf-8
+# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""GPT Neo model configuration"""
+
+from collections import OrderedDict
+from collections.abc import Mapping
+from typing import Any, Optional
+
+from ... import PreTrainedTokenizer, TensorType, is_torch_available
+from ...configuration_utils import PretrainedConfig
+from ...onnx import OnnxConfigWithPast
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class GPTNeoConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`GPTNeoModel`]. It is used to instantiate a GPT
+    Neo model according to the specified arguments, defining the model architecture. Instantiating a configuration with
+    the defaults will yield a similar configuration to that of the GPTNeo
+    [EleutherAI/gpt-neo-1.3B](https://huggingface.co/EleutherAI/gpt-neo-1.3B) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 50257):
+            Vocabulary size of the GPT Neo model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`GPTNeoModel`].
+        max_position_embeddings (`int`, *optional*, defaults to 2048):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            just in case (e.g., 512 or 1024 or 2048).
+        hidden_size (`int`, *optional*, defaults to 2048):
+            Dimensionality of the encoder layers and the pooler layer.
+        num_layers (`int`, *optional*, defaults to 24):
+            Number of hidden layers in the Transformer encoder.
+        attention_types (`List`, *optional*, defaults to `[[['global', 'local'], 12]]`):
+            The type of attention for each layer in a `List` of the following format `[[["attention_type"],
+            num_layers]]` e.g. for a 24 layer model `[[["global"], 24]]` or `[[["global", "local"], 12]]` Choose the
+            value of `attention_type` from `["global", "local"]`. A fresh default list is built for every instance.
+        num_heads (`int`, *optional*, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        intermediate_size (`int`, *optional*, defaults to `None`):
+            Dimensionality of the feed-forward layer; when `None`, the Flax block falls back to `4 * hidden_size`.
+        window_size (`int`, *optional*, defaults to 256):
+            The size of the sliding window for local attention.
+        activation_function (`str` or `function`, *optional*, defaults to `"gelu_new"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"selu"` and `"gelu_new"` are supported.
+        resid_dropout (`float`, *optional*, defaults to 0.0):
+            Residual dropout used in the attention pattern.
+        embed_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        classifier_dropout (`float`, *optional*, defaults to 0.1):
+            Argument used when doing token classification, used in the model [`GPTNeoForTokenClassification`]. The
+            dropout ratio for the hidden layer.
+        layer_norm_epsilon (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the layer normalization layers.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        bos_token_id (`int`, *optional*, defaults to 50256):
+            The id of the beginning of sentence token in the vocabulary.
+        eos_token_id (`int`, *optional*, defaults to 50256):
+            The id of the end of sentence token in the vocabulary.
+
+    Example:
+
+    ```python
+    >>> from transformers import GPTNeoConfig, GPTNeoModel
+
+    >>> # Initializing a GPTNeo EleutherAI/gpt-neo-1.3B style configuration
+    >>> configuration = GPTNeoConfig()
+
+    >>> # Initializing a model (with random weights) from the EleutherAI/gpt-neo-1.3B style configuration
+    >>> model = GPTNeoModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "gpt_neo"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    attribute_map = {"num_attention_heads": "num_heads", "num_hidden_layers": "num_layers"}
+
+    def __init__(
+        self,
+        vocab_size=50257,
+        max_position_embeddings=2048,
+        hidden_size=2048,
+        num_layers=24,
+        attention_types=None,
+        num_heads=16,
+        intermediate_size=None,
+        window_size=256,
+        activation_function="gelu_new",
+        resid_dropout=0.0,
+        embed_dropout=0.0,
+        attention_dropout=0.0,
+        classifier_dropout=0.1,
+        layer_norm_epsilon=1e-5,
+        initializer_range=0.02,
+        use_cache=True,
+        bos_token_id=50256,
+        eos_token_id=50256,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.num_layers = num_layers
+        self.num_heads = num_heads
+        self.intermediate_size = intermediate_size
+        self.window_size = window_size
+        self.activation_function = activation_function
+        self.resid_dropout = resid_dropout
+        self.embed_dropout = embed_dropout
+        self.attention_dropout = attention_dropout
+        self.classifier_dropout = classifier_dropout
+        self.layer_norm_epsilon = layer_norm_epsilon
+        self.initializer_range = initializer_range
+        self.use_cache = use_cache
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
+
+        if attention_types is None:  # fresh list per instance instead of one shared mutable default
+            attention_types = [[["global", "local"], 12]]
+        self.attention_types = attention_types
+        self.attention_layers = self.expand_attention_types_params(attention_types)
+
+        if len(self.attention_layers) != self.num_layers:
+            raise ValueError(
+                "Configuration for the attention layers is incorrect. "
+                "It is required that `len(config.attention_layers)` == `config.num_layers` "
+                f"but is `len(config.attention_layers) = {len(self.attention_layers)}`, "
+                f"`config.num_layers = {self.num_layers}`. "
+                "`config.attention_layers` is prepared using `config.attention_types`. "
+                "Please verify the value of `config.attention_types` argument."
+            )
+
+        super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
+
+    @staticmethod
+    def expand_attention_types_params(attention_types):
+        """Expand `[[type_names, repeat], ...]` into a flat list holding one attention type name per layer."""
+        attentions = []
+        for item in attention_types:
+            attentions.extend(list(item[0]) * item[1])
+        return attentions
+
+
+def custom_unfold(input, dimension, size, step):
+    """Custom torch.Tensor.unfold implementation to enable the export to ONNX.
+
+    Gathers every window of `size` elements along `dimension` (stride `step`) into a new trailing axis."""
+    import torch
+
+    shape = input.size()
+    rank = len(shape)
+    dimension = dimension + rank if dimension < 0 else dimension  # keeps the `dimension + 1` pop below valid
+    sizedim = shape[dimension]
+    low_indices = torch.arange(0, sizedim, step)
+    # as_tensor: torch.div needs a Tensor operand, and outside tracing the sizes here are plain Python ints.
+    min_length = torch.div(torch.as_tensor(sizedim - size), step, rounding_mode="floor") + 1
+    indices = torch.arange(size) + low_indices[:min_length][:, None]  # row i = positions of window i
+    s = [slice(None)] * rank
+    s[dimension] = indices
+    # Index with a tuple (list indices are deprecated), then move the window contents from `dimension + 1` last.
+    perm = list(range(0, rank + 1))
+    perm.append(perm.pop(dimension + 1))
+    return input[tuple(s)].permute(perm)
+
+
+def custom_get_block_length_and_num_blocks(seq_length, window_size):
+    """
+    ONNX-exportable stand-in for GPTNeoAttentionMixin._get_block_length_and_num_blocks: it relies on tensor ops only,
+    whereas the original implementation uses Python variables and control flow.
+    """
+    import torch
+
+    # Among 1..window_size-1, keep the values that divide `seq_length` exactly; the largest is the block length.
+    block_candidates = torch.arange(1, window_size)
+    divides_exactly = torch.remainder(seq_length, block_candidates) == 0
+    block_length = torch.max(block_candidates[divides_exactly])
+    num_blocks = torch.div(seq_length, block_length, rounding_mode="floor")
+    return block_length, num_blocks
+
+
+class GPTNeoOnnxConfig(OnnxConfigWithPast):
+    """ONNX export configuration for GPT Neo, with optional `past_key_values` inputs."""
+
+    @property
+    def inputs(self) -> Mapping[str, Mapping[int, str]]:
+        """Map each input name to the names of its dynamic axes."""
+        axes = OrderedDict({"input_ids": {0: "batch", 1: "sequence"}})
+        if not self.use_past:
+            axes["attention_mask"] = {0: "batch", 1: "sequence"}
+            return axes
+
+        self.fill_with_past_key_values_(axes, direction="inputs")
+        axes["attention_mask"] = {0: "batch", 1: "past_sequence + sequence"}
+        return axes
+
+    @property
+    def num_attention_heads(self) -> int:
+        """Number of attention heads, read from the wrapped config's `num_heads`."""
+        return self._config.num_heads
+
+    def generate_dummy_inputs(
+        self,
+        tokenizer: PreTrainedTokenizer,
+        batch_size: int = -1,
+        seq_length: int = -1,
+        is_pair: bool = False,
+        framework: Optional[TensorType] = None,
+    ) -> Mapping[str, Any]:
+        """Build dummy inputs ordered as `input_ids`, optional `past_key_values`, then `attention_mask`."""
+        # Two-argument super(): the lookup starts after OnnxConfigWithPast in the MRO.
+        base_inputs = super(OnnxConfigWithPast, self).generate_dummy_inputs(
+            tokenizer, batch_size=batch_size, seq_length=seq_length, is_pair=is_pair, framework=framework
+        )
+
+        ordered_inputs = OrderedDict({"input_ids": base_inputs["input_ids"]})
+        if not self.use_past:
+            ordered_inputs["attention_mask"] = base_inputs["attention_mask"]
+            return ordered_inputs
+
+        if not is_torch_available():
+            raise ValueError("Cannot generate dummy past_keys inputs without PyTorch installed.")
+        import torch
+
+        batch, seqlen = base_inputs["input_ids"].shape
+        # The dummy past is given a different length than the current sequence (two positions longer).
+        past_key_values_length = seqlen + 2
+        head_dim = self._config.hidden_size // self.num_attention_heads
+        past_shape = (batch, self.num_attention_heads, past_key_values_length, head_dim)
+        ordered_inputs["past_key_values"] = [
+            (torch.zeros(past_shape), torch.zeros(past_shape)) for _ in range(self.num_layers)
+        ]
+
+        # Extend the mask with ones (same dtype) so that it also covers the cached positions.
+        mask = base_inputs["attention_mask"]
+        past_mask = torch.ones(batch, past_key_values_length, dtype=mask.dtype)
+        ordered_inputs["attention_mask"] = torch.cat([mask, past_mask], dim=1)
+
+        return ordered_inputs
+
+    @property
+    def default_onnx_opset(self) -> int:
+        """Default ONNX opset version for the export."""
+        return 13
+
+
+__all__ = ["GPTNeoConfig", "GPTNeoOnnxConfig"]
diff --git a/phivenv/Lib/site-packages/transformers/models/gpt_neo/modeling_flax_gpt_neo.py b/phivenv/Lib/site-packages/transformers/models/gpt_neo/modeling_flax_gpt_neo.py
new file mode 100644
index 0000000000000000000000000000000000000000..a6cdc50b359b0415fd165b05311eeb9bc07c7526
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/gpt_neo/modeling_flax_gpt_neo.py
@@ -0,0 +1,687 @@
+# coding=utf-8
+# Copyright 2021 The Eleuther AI and The Google Flax Team Authors and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from functools import partial
+from typing import Optional
+
+import flax.linen as nn
+import jax
+import jax.numpy as jnp
+from flax.core.frozen_dict import FrozenDict, freeze, unfreeze
+from flax.linen import combine_masks, make_causal_mask
+from flax.linen.attention import dot_product_attention_weights
+from flax.traverse_util import flatten_dict, unflatten_dict
+from jax import lax
+
+from ...modeling_flax_outputs import FlaxBaseModelOutput, FlaxCausalLMOutput
+from ...modeling_flax_utils import ACT2FN, FlaxPreTrainedModel, append_call_sample_docstring
+from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging
+from .configuration_gpt_neo import GPTNeoConfig
+
+
+logger = logging.get_logger(__name__)
+
+_CONFIG_FOR_DOC = "GPTNeoConfig"
+_CHECKPOINT_FOR_DOC = "EleutherAI/gpt-neo-1.3B"
+
+
+GPT_NEO_START_DOCSTRING = r"""
+
+    This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.)
+
+    This model is also a Flax Linen
+    [flax.nn.Module](https://flax.readthedocs.io/en/latest/_autosummary/flax.nn.module.html) subclass. Use it as a
+    regular Flax Module and refer to the Flax documentation for all matter related to general usage and behavior.
+
+    Finally, this model supports inherent JAX features such as:
+
+    - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
+    - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
+    - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
+    - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)
+
+    Parameters:
+        config ([`GPTNeoConfig`]): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the model weights.
+        dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`):
+            The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on GPUs) and
+            `jax.numpy.bfloat16` (on TPUs).
+
+            This can be used to enable mixed-precision training or half-precision inference on GPUs or TPUs. If
+            specified all the computation will be performed with the given `dtype`.
+
+            **Note that this only specifies the dtype of the computation and does not influence the dtype of model
+            parameters.**
+
+            If you wish to change the dtype of the model parameters, see [`~FlaxPreTrainedModel.to_fp16`] and
+            [`~FlaxPreTrainedModel.to_bf16`].
+"""
+
+GPT_NEO_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`numpy.ndarray` of shape `(batch_size, input_ids_length)`):
+            `input_ids_length` = `sequence_length`. Indices of input sequence tokens in the vocabulary.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        attention_mask (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
+        position_ids (`numpy.ndarray` of shape `(batch_size, input_ids_length)`, *optional*):
+            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+            config.max_position_embeddings - 1]`.
+        past_key_values (`dict[str, np.ndarray]`, *optional*, returned by `init_cache` or when passing previous `past_key_values`):
+            Dictionary of pre-computed hidden-states (key and values in the attention blocks) that can be used for fast
+            auto-regressive decoding. Pre-computed key and value hidden-states are of shape *[batch_size, max_length]*.
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+class FlaxGPTNeoSelfAttention(nn.Module):  # causal self-attention; sliding-window when attention_type == "local"
+    config: GPTNeoConfig
+    attention_type: str  # compared against "local" in setup(); any other value keeps the plain causal mask
+    dtype: jnp.dtype = jnp.float32  # dtype handed to the Dense layers and used for the attention bias
+
+    def setup(self):  # creates the projections, dropouts and the (optionally windowed) causal mask
+        config = self.config
+        self.embed_dim = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = self.embed_dim // self.num_heads
+        if self.head_dim * self.num_heads != self.embed_dim:  # hidden size must split evenly across heads
+            raise ValueError(
+                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and "
+                f"`num_heads`: {self.num_heads})."
+            )
+
+        self.attn_dropout = nn.Dropout(config.attention_dropout)
+        self.resid_dropout = nn.Dropout(config.resid_dropout)
+
+        dense = partial(  # factory for embed_dim-wide Dense layers sharing dtype and kernel initializer
+            nn.Dense,
+            self.embed_dim,
+            dtype=self.dtype,
+            kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
+        )
+
+        self.q_proj, self.k_proj, self.v_proj = dense(use_bias=False), dense(use_bias=False), dense(use_bias=False)
+        self.out_proj = dense()  # q/k/v above are bias-free; out_proj keeps nn.Dense's default bias setting
+
+        self.causal_mask = make_causal_mask(jnp.ones((1, config.max_position_embeddings), dtype="bool"), dtype="bool")
+        if self.attention_type == "local":  # XOR below clears entries window_size or more diagonals under the main one
+            self.causal_mask = self.causal_mask ^ jnp.tril(self.causal_mask, -config.window_size)
+
+    def _split_heads(self, hidden_states):  # keeps the first two axes, reshapes the rest to (num_heads, head_dim)
+        return hidden_states.reshape(hidden_states.shape[:2] + (self.num_heads, self.head_dim))
+
+    def _merge_heads(self, hidden_states):  # inverse of _split_heads: first two axes kept, rest becomes embed_dim
+        return hidden_states.reshape(hidden_states.shape[:2] + (self.embed_dim,))
+
+    @nn.compact
+    def _concatenate_to_cache(self, key, value, query, attention_mask):
+        """
+        This function takes projected key, value states from a single input token and concatenates the states to cached
+        states from previous steps. This function is slightly adapted from the official Flax repository:
+        https://github.com/google/flax/blob/491ce18759622506588784b4fca0e4bf05f8c8cd/flax/linen/attention.py#L252
+        """
+        # detect if we're initializing by absence of existing cache data.
+        is_initialized = self.has_variable("cache", "cached_key")
+        cached_key = self.variable("cache", "cached_key", jnp.zeros, key.shape, key.dtype)
+        cached_value = self.variable("cache", "cached_value", jnp.zeros, value.shape, value.dtype)
+        cache_index = self.variable("cache", "cache_index", lambda: jnp.array(0, dtype=jnp.int32))
+
+        if is_initialized:  # on the initializing call the inputs are returned unchanged
+            *batch_dims, max_length, num_heads, depth_per_head = cached_key.value.shape
+            # update key, value caches with our new 1d spatial slices
+            cur_index = cache_index.value
+            indices = (0,) * len(batch_dims) + (cur_index, 0, 0)
+            key = lax.dynamic_update_slice(cached_key.value, key, indices)
+            value = lax.dynamic_update_slice(cached_value.value, value, indices)
+            cached_key.value = key
+            cached_value.value = value
+            num_updated_cache_vectors = query.shape[1]  # number of query positions handled in this call
+            cache_index.value = cache_index.value + num_updated_cache_vectors
+            # causal mask for cached decoder self-attention: our single query position should only attend to those key positions that have already been generated and cached, not the remaining zero elements.
+            pad_mask = jnp.broadcast_to(
+                jnp.arange(max_length) < cur_index + num_updated_cache_vectors,
+                tuple(batch_dims) + (1, num_updated_cache_vectors, max_length),
+            )
+            attention_mask = combine_masks(pad_mask, attention_mask)
+        return key, value, attention_mask
+
+    def __call__(
+        self,
+        hidden_states,
+        attention_mask=None,
+        deterministic: bool = True,
+        init_cache: bool = False,
+        output_attentions: bool = False,
+    ):
+        # NOTE(review): multiplying by sqrt(head_dim) presumably cancels the attention helper's own scaling - confirm.
+        query = self.q_proj(hidden_states) * jnp.sqrt(self.head_dim).astype(self.dtype)
+        key = self.k_proj(hidden_states)
+        value = self.v_proj(hidden_states)
+
+        query = self._split_heads(query)
+        key = self._split_heads(key)
+        value = self._split_heads(value)
+
+        query_length, key_length = query.shape[1], key.shape[1]
+
+        if self.has_variable("cache", "cached_key"):  # cached decoding: mask rows start at cache_index
+            mask_shift = self.variables["cache"]["cache_index"]
+            max_decoder_length = self.variables["cache"]["cached_key"].shape[1]
+            causal_mask = lax.dynamic_slice(
+                self.causal_mask, (0, 0, mask_shift, 0), (1, 1, query_length, max_decoder_length)
+            )
+        else:
+            causal_mask = self.causal_mask[:, :, :query_length, :key_length]
+
+        batch_size = hidden_states.shape[0]
+        causal_mask = jnp.broadcast_to(causal_mask, (batch_size,) + causal_mask.shape[1:])
+
+        attention_mask = jnp.broadcast_to(jnp.expand_dims(attention_mask, axis=(-3, -2)), causal_mask.shape)
+        attention_mask = combine_masks(attention_mask, causal_mask)
+
+        dropout_rng = None
+        if not deterministic and self.config.attention_dropout > 0.0:  # an RNG is only drawn when dropout is active
+            dropout_rng = self.make_rng("dropout")
+
+        # During fast autoregressive decoding, we feed one position at a time and cache the keys/values step by step.
+        if self.has_variable("cache", "cached_key") or init_cache:
+            key, value, attention_mask = self._concatenate_to_cache(key, value, query, attention_mask)
+
+        # turn the mask into an additive bias: 0.0 where the mask is set, the dtype's most negative value elsewhere
+        attention_bias = lax.select(
+            attention_mask > 0,
+            jnp.full(attention_mask.shape, 0.0).astype(self.dtype),
+            jnp.full(attention_mask.shape, jnp.finfo(self.dtype).min).astype(self.dtype),
+        )
+
+        # usual dot product attention
+        attn_weights = dot_product_attention_weights(
+            query,
+            key,
+            bias=attention_bias,
+            dropout_rng=dropout_rng,
+            dropout_rate=self.config.attention_dropout,
+            deterministic=deterministic,
+            dtype=self.dtype,
+            precision=None,
+        )
+
+        attn_output = jnp.einsum("...hqk,...khd->...qhd", attn_weights, value)  # weighted sum of values per head
+        attn_output = self._merge_heads(attn_output)
+        attn_output = self.out_proj(attn_output)
+        attn_output = self.resid_dropout(attn_output, deterministic=deterministic)
+
+        outputs = (attn_output, attn_weights) if output_attentions else (attn_output,)
+        return outputs
+
+
+class FlaxGPTNeoAttention(nn.Module):
+    """Per-layer wrapper that builds the self-attention module with the attention type configured for `layer_id`."""
+
+    config: GPTNeoConfig
+    layer_id: int = 0
+    dtype: jnp.dtype = jnp.float32
+
+    def setup(self):
+        # `config.attention_layers` is indexed by layer position to pick this layer's attention type.
+        self.attention = FlaxGPTNeoSelfAttention(
+            self.config, self.config.attention_layers[self.layer_id], dtype=self.dtype
+        )
+
+    def __call__(
+        self,
+        hidden_states,
+        attention_mask=None,
+        deterministic: bool = True,
+        init_cache: bool = False,
+        output_attentions: bool = False,
+    ):
+        """Forward every argument unchanged to the wrapped self-attention module and return its outputs."""
+        flags = {"deterministic": deterministic, "init_cache": init_cache, "output_attentions": output_attentions}
+        return self.attention(hidden_states, attention_mask=attention_mask, **flags)
+
+
+class FlaxGPTNeoMLP(nn.Module):
+    """Feed-forward sub-block: Dense -> activation -> Dense -> dropout."""
+
+    config: GPTNeoConfig
+    intermediate_size: int
+    dtype: jnp.dtype = jnp.float32
+
+    def setup(self):
+        cfg = self.config
+        init_fn = jax.nn.initializers.normal(cfg.initializer_range)
+        self.c_fc = nn.Dense(self.intermediate_size, dtype=self.dtype, kernel_init=init_fn)
+        self.c_proj = nn.Dense(cfg.hidden_size, dtype=self.dtype, kernel_init=init_fn)
+        self.act = ACT2FN[cfg.activation_function]
+        self.dropout = nn.Dropout(rate=cfg.resid_dropout)
+
+    def __call__(self, hidden_states, deterministic: bool = True):
+        """Project up to `intermediate_size`, activate, project back to `hidden_size`, then apply dropout."""
+        projected = self.c_proj(self.act(self.c_fc(hidden_states)))
+        return self.dropout(projected, deterministic=deterministic)
+
+
+class FlaxGPTNeoBlock(nn.Module):
+    """One transformer block: LayerNorm + attention and LayerNorm + MLP, each followed by a residual addition."""
+
+    config: GPTNeoConfig
+    layer_id: int = 0
+    dtype: jnp.dtype = jnp.float32
+
+    def setup(self):
+        cfg = self.config
+        # Without an explicit `intermediate_size`, the MLP is four times as wide as the hidden state.
+        if cfg.intermediate_size is not None:
+            mlp_width = cfg.intermediate_size
+        else:
+            mlp_width = 4 * cfg.hidden_size
+
+        self.ln_1 = nn.LayerNorm(epsilon=cfg.layer_norm_epsilon, dtype=self.dtype)
+        self.attn = FlaxGPTNeoAttention(cfg, layer_id=self.layer_id, dtype=self.dtype)
+        self.ln_2 = nn.LayerNorm(epsilon=cfg.layer_norm_epsilon, dtype=self.dtype)
+        self.mlp = FlaxGPTNeoMLP(cfg, mlp_width, dtype=self.dtype)
+
+    def __call__(
+        self,
+        hidden_states,
+        attention_mask=None,
+        deterministic: bool = True,
+        init_cache: bool = False,
+        output_attentions: bool = False,
+    ):
+        """Return `(hidden_states,)` followed by whatever extra outputs the attention module produced."""
+        attn_outputs = self.attn(
+            self.ln_1(hidden_states),
+            attention_mask=attention_mask,
+            deterministic=deterministic,
+            init_cache=init_cache,
+            output_attentions=output_attentions,
+        )
+        # First residual connection: attention output plus the block input.
+        hidden_states = attn_outputs[0] + hidden_states
+        # Second residual connection: the MLP sees the normalised state and its output is added back.
+        mlp_output = self.mlp(self.ln_2(hidden_states), deterministic=deterministic)
+        hidden_states = hidden_states + mlp_output
+
+        return (hidden_states,) + attn_outputs[1:]
+
+
+class FlaxGPTNeoPreTrainedModel(FlaxPreTrainedModel):
+    """
+    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+    models.
+    """
+
+    config_class = GPTNeoConfig
+    base_model_prefix = "transformer"
+    module_class: nn.Module = None  # instantiated in __init__; presumably overridden by concrete subclasses
+
+    def __init__(
+        self,
+        config: GPTNeoConfig,
+        input_shape: tuple = (1, 1),
+        seed: int = 0,
+        dtype: jnp.dtype = jnp.float32,
+        _do_init: bool = True,
+        **kwargs,
+    ):
+        module = self.module_class(config=config, dtype=dtype, **kwargs)  # extra kwargs go to the module constructor
+        super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init)
+
+    def init_weights(self, rng: jax.random.PRNGKey, input_shape: tuple, params: FrozenDict = None) -> FrozenDict:
+        # dummy inputs: zero token ids, an all-ones mask and positions 0..len-1 broadcast to input_shape
+        input_ids = jnp.zeros(input_shape, dtype="i4")
+        attention_mask = jnp.ones_like(input_ids)
+        position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_shape)
+        params_rng, dropout_rng = jax.random.split(rng)
+        rngs = {"params": params_rng, "dropout": dropout_rng}
+
+        random_params = self.module.init(rngs, input_ids, attention_mask, position_ids, return_dict=False)["params"]
+
+        if params is not None:  # only the keys listed in self._missing_keys are filled from the random init
+            random_params = flatten_dict(unfreeze(random_params))
+            params = flatten_dict(unfreeze(params))
+            for missing_key in self._missing_keys:
+                params[missing_key] = random_params[missing_key]
+            self._missing_keys = set()  # reset once the missing entries have been filled
+            return freeze(unflatten_dict(params))
+        else:
+            return random_params
+
+    def init_cache(self, batch_size, max_length):
+        r"""
+        Args:
+            batch_size (`int`):
+                batch_size used for fast auto-regressive decoding. Defines the batch size of the initialized cache.
+            max_length (`int`):
+                maximum possible length for auto-regressive decoding. Defines the sequence length of the initialized
+                cache.
+        """
+        # all-ones dummy inputs of shape (batch_size, max_length), passed to module.init with init_cache=True
+        input_ids = jnp.ones((batch_size, max_length))
+        attention_mask = jnp.ones_like(input_ids)
+        position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_ids.shape)
+
+        init_variables = self.module.init(
+            jax.random.PRNGKey(0), input_ids, attention_mask, position_ids, return_dict=False, init_cache=True
+        )
+        return unfreeze(init_variables["cache"])  # only the "cache" collection is returned
+
+    @add_start_docstrings_to_model_forward(GPT_NEO_INPUTS_DOCSTRING)
+    def __call__(
+        self,
+        input_ids,
+        attention_mask=None,
+        position_ids=None,
+        params: Optional[dict] = None,
+        past_key_values: Optional[dict] = None,
+        dropout_rng: jax.random.PRNGKey = None,
+        train: bool = False,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ):
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.return_dict
+
+        batch_size, sequence_length = input_ids.shape
+
+        if position_ids is None:
+            if past_key_values is not None:  # with a cache, the caller must supply positions explicitly
+                raise ValueError("Make sure to provide `position_ids` when passing `past_key_values`.")
+
+            position_ids = jnp.broadcast_to(jnp.arange(sequence_length)[None, :], (batch_size, sequence_length))
+
+        if attention_mask is None:  # default: an all-ones mask
+            attention_mask = jnp.ones((batch_size, sequence_length))
+
+        # Handle any PRNG if needed
+        rngs = {}
+        if dropout_rng is not None:
+            rngs["dropout"] = dropout_rng
+
+        inputs = {"params": params or self.params}  # falls back to the model's own params
+
+        # A supplied cache is already initialised; mark it mutable so the attention modules can update it.
+        if past_key_values:
+            inputs["cache"] = past_key_values
+            mutable = ["cache"]
+        else:
+            mutable = False
+
+        outputs = self.module.apply(
+            inputs,
+            jnp.array(input_ids, dtype="i4"),
+            jnp.array(attention_mask, dtype="i4"),
+            jnp.array(position_ids, dtype="i4"),
+            not train,  # presumably the module's `deterministic` argument (module signature not shown here)
+            False,  # presumably `init_cache`; an existing cache is supplied through inputs["cache"] instead
+            output_attentions,
+            output_hidden_states,
+            return_dict,
+            rngs=rngs,
+            mutable=mutable,
+        )
+
+        # add updated cache to model output
+        if past_key_values is not None and return_dict:  # NOTE(review): `is not None` here, truthiness above
+            outputs, past_key_values = outputs
+            outputs["past_key_values"] = unfreeze(past_key_values["cache"])
+            return outputs
+        elif past_key_values is not None and not return_dict:
+            outputs, past_key_values = outputs
+            outputs = outputs[:1] + (unfreeze(past_key_values["cache"]),) + outputs[1:]
+
+        return outputs
+
+
+class FlaxGPTNeoBlockCollection(nn.Module):
+    """Runs the hidden states through every `FlaxGPTNeoBlock`, in layer order."""
+    config: GPTNeoConfig
+    dtype: jnp.dtype = jnp.float32
+
+    def setup(self):
+        self.blocks = [
+            FlaxGPTNeoBlock(self.config, layer_id=layer_idx, name=str(layer_idx), dtype=self.dtype)
+            for layer_idx in range(self.config.num_hidden_layers)
+        ]
+
+    def __call__(
+        self,
+        hidden_states,
+        attention_mask=None,
+        deterministic: bool = True,
+        init_cache: bool = False,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+    ):
+        # per-layer inputs / attention weights are only gathered when requested, otherwise they stay `None`
+        hidden_state_trace = () if output_hidden_states else None
+        attention_trace = () if output_attentions else None
+
+        for layer in self.blocks:
+            if output_hidden_states:
+                hidden_state_trace = hidden_state_trace + (hidden_states,)
+
+            layer_result = layer(
+                hidden_states,
+                attention_mask,
+                deterministic=deterministic,
+                init_cache=init_cache,
+                output_attentions=output_attentions,
+            )
+            hidden_states = layer_result[0]
+
+            if output_attentions:
+                attention_trace = attention_trace + (layer_result[1],)
+
+        # entries may be `None` here - `FlaxGPTNeoModule` will filter them out
+        return (hidden_states, hidden_state_trace, attention_trace)
+
+
+class FlaxGPTNeoModule(nn.Module):
+    """Token + position embeddings, the stack of GPT-Neo blocks, and a final LayerNorm."""
+    config: GPTNeoConfig
+    dtype: jnp.dtype = jnp.float32
+
+    def setup(self):
+        self.embed_dim = self.config.hidden_size
+        embedding_init = jax.nn.initializers.normal(stddev=self.config.initializer_range)
+        self.wte = nn.Embed(
+            self.config.vocab_size,
+            self.embed_dim,
+            embedding_init=embedding_init,
+        )
+        self.wpe = nn.Embed(
+            self.config.max_position_embeddings,
+            self.embed_dim,
+            embedding_init=embedding_init,
+        )
+        self.dropout = nn.Dropout(rate=self.config.embed_dropout)
+        self.h = FlaxGPTNeoBlockCollection(self.config, dtype=self.dtype)
+        self.ln_f = nn.LayerNorm(epsilon=self.config.layer_norm_epsilon, dtype=self.dtype)
+
+    def __call__(
+        self,
+        input_ids,
+        attention_mask,
+        position_ids,
+        deterministic=True,
+        init_cache: bool = False,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+    ):
+        input_embeds = self.wte(input_ids.astype("i4"))
+        position_embeds = self.wpe(position_ids.astype("i4"))
+
+        hidden_states = input_embeds + position_embeds
+        hidden_states = self.dropout(hidden_states, deterministic=deterministic)
+
+        outputs = self.h(
+            hidden_states,
+            attention_mask,
+            deterministic=deterministic,
+            init_cache=init_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        # `outputs[0]` is the hidden state after the last block. Apply the final LayerNorm
+        # once; the result is what gets exposed as `last_hidden_state`.
+        hidden_states = outputs[0]
+        hidden_states = self.ln_f(hidden_states)
+
+        if output_hidden_states:
+            all_hidden_states = outputs[1] + (hidden_states,)
+            outputs = (hidden_states, all_hidden_states) + outputs[2:]
+        else:
+            outputs = (hidden_states,) + outputs[1:]
+
+        if not return_dict:
+            return tuple(v for v in outputs if v is not None)
+
+        return FlaxBaseModelOutput(
+            last_hidden_state=hidden_states,
+            hidden_states=outputs[1],
+            attentions=outputs[-1],
+        )
+
+
+@add_start_docstrings(
+    "The bare GPTNeo Model transformer outputting raw hidden-states without any specific head on top.",
+    GPT_NEO_START_DOCSTRING,
+)
+class FlaxGPTNeoModel(FlaxGPTNeoPreTrainedModel):
+    module_class = FlaxGPTNeoModule
+
+
+append_call_sample_docstring(FlaxGPTNeoModel, _CHECKPOINT_FOR_DOC, FlaxBaseModelOutput, _CONFIG_FOR_DOC)
+
+
+class FlaxGPTNeoForCausalLMModule(nn.Module):
+    config: GPTNeoConfig
+    dtype: jnp.dtype = jnp.float32
+
+    def setup(self):
+        self.transformer = FlaxGPTNeoModule(self.config, dtype=self.dtype)
+        self.lm_head = nn.Dense(
+            self.config.vocab_size,
+            use_bias=False,
+            dtype=self.dtype,
+            kernel_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
+        )
+
+    def __call__(
+        self,
+        input_ids,
+        attention_mask,
+        position_ids,
+        deterministic: bool = True,
+        init_cache: bool = False,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+    ):
+        outputs = self.transformer(
+            input_ids,
+            attention_mask,
+            position_ids,
+            deterministic=deterministic,
+            init_cache=init_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        hidden_states = outputs[0]
+        # with tied embeddings, the transposed `wte` embedding matrix is used as the LM head kernel
+        if self.config.tie_word_embeddings:
+            shared_kernel = self.transformer.variables["params"]["wte"]["embedding"].T
+            lm_logits = self.lm_head.apply({"params": {"kernel": shared_kernel}}, hidden_states)
+        else:
+            lm_logits = self.lm_head(hidden_states)
+
+        if not return_dict:
+            return (lm_logits,) + outputs[1:]
+
+        return FlaxCausalLMOutput(logits=lm_logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions)
+
+
+@add_start_docstrings(
+    """
+    The GPTNeo Model transformer with a language modeling head on top (linear layer with weights tied to the input
+    embeddings).
+    """,
+    GPT_NEO_START_DOCSTRING,
+)
+class FlaxGPTNeoForCausalLM(FlaxGPTNeoPreTrainedModel):
+    module_class = FlaxGPTNeoForCausalLMModule
+
+    def prepare_inputs_for_generation(self, input_ids, max_length, attention_mask: Optional[jax.Array] = None):
+        # initialize a cache sized for the full generation length (`max_length`)
+        batch_size, seq_length = input_ids.shape
+
+        past_key_values = self.init_cache(batch_size, max_length)
+        # Note that usually one would have to put 0's in the attention_mask for x > input_ids.shape[-1] and x < cache_length.
+        # But since GPTNeo uses a causal mask, those positions are masked anyway.
+        # Thus we can create a single static attention_mask here, which is more efficient for compilation.
+        extended_attention_mask = jnp.ones((batch_size, max_length), dtype="i4")
+        if attention_mask is not None:
+            position_ids = attention_mask.cumsum(axis=-1) - 1  # running count of attended tokens, minus one
+            extended_attention_mask = lax.dynamic_update_slice(extended_attention_mask, attention_mask, (0, 0))
+        else:
+            position_ids = jnp.broadcast_to(jnp.arange(seq_length, dtype="i4")[None, :], (batch_size, seq_length))
+
+        return {
+            "past_key_values": past_key_values,
+            "attention_mask": extended_attention_mask,
+            "position_ids": position_ids,
+        }
+
+    def update_inputs_for_generation(self, model_outputs, model_kwargs):
+        model_kwargs["past_key_values"] = model_outputs.past_key_values
+        model_kwargs["position_ids"] = model_kwargs["position_ids"][:, -1:] + 1  # keep only the last position, advanced by one
+        return model_kwargs
+
+
+append_call_sample_docstring(FlaxGPTNeoForCausalLM, _CHECKPOINT_FOR_DOC, FlaxCausalLMOutput, _CONFIG_FOR_DOC)
+
+
+__all__ = ["FlaxGPTNeoForCausalLM", "FlaxGPTNeoModel", "FlaxGPTNeoPreTrainedModel"]
diff --git a/phivenv/Lib/site-packages/transformers/models/gpt_neo/modeling_gpt_neo.py b/phivenv/Lib/site-packages/transformers/models/gpt_neo/modeling_gpt_neo.py
new file mode 100644
index 0000000000000000000000000000000000000000..9622620ca37635f2d646f730da5a3adda8bd0f9f
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/gpt_neo/modeling_gpt_neo.py
@@ -0,0 +1,1197 @@
+# coding=utf-8
+# Copyright 2021 The Eleuther AI and HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch GPT Neo model."""
+
+import os
+from typing import Optional, Union
+
+import torch
+import torch.utils.checkpoint
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+
+from ...activations import ACT2FN
+from ...cache_utils import Cache, DynamicCache
+from ...generation import GenerationMixin
+from ...modeling_attn_mask_utils import AttentionMaskConverter, _prepare_4d_causal_attention_mask
+from ...modeling_flash_attention_utils import flash_attn_supports_top_left_mask, is_flash_attn_available
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import (
+    BaseModelOutputWithPast,
+    BaseModelOutputWithPastAndCrossAttentions,
+    CausalLMOutputWithCrossAttentions,
+    CausalLMOutputWithPast,
+    QuestionAnsweringModelOutput,
+    SequenceClassifierOutputWithPast,
+    TokenClassifierOutput,
+)
+from ...modeling_utils import PreTrainedModel
+from ...utils import (
+    auto_docstring,
+    is_torch_flex_attn_available,
+    logging,
+)
+from .configuration_gpt_neo import GPTNeoConfig
+
+
+if is_torch_flex_attn_available():
+    from torch.nn.attention.flex_attention import BlockMask
+
+    from ...integrations.flex_attention import make_flex_block_causal_mask
+
+
+if is_flash_attn_available():
+    from ...modeling_flash_attention_utils import _flash_attention_forward
+
+
+# This makes `_prepare_4d_causal_attention_mask` a leaf function in the FX graph.
+# It means that the function will not be traced through and simply appear as a node in the graph.
+_prepare_4d_causal_attention_mask = torch.fx.wrap(_prepare_4d_causal_attention_mask)
+
+
+logger = logging.get_logger(__name__)
+
+
+def load_tf_weights_in_gpt_neo(model, config, gpt_neo_checkpoint_path):
+    """Load the TensorFlow checkpoint at `gpt_neo_checkpoint_path` into `model` (modified in place) and return `model`."""
+    try:
+        import re
+
+        import tensorflow as tf
+    except ImportError:
+        logger.error(
+            "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
+            "https://www.tensorflow.org/install/ for installation instructions."
+        )
+        raise
+    tf_path = os.path.abspath(gpt_neo_checkpoint_path)
+    logger.info(f"Converting TensorFlow checkpoint from {tf_path}")
+    # Read every variable whose name contains neither "global_step" nor "adam", renaming TF scopes to module paths
+    init_vars = tf.train.list_variables(tf_path)
+    names = []
+    arrays = []
+    for name, shape in init_vars:
+        if "global_step" not in name and "adam" not in name:
+            array = tf.train.load_variable(tf_path, name)
+            array = tf.dtypes.cast(array.squeeze(), tf.float32).numpy()
+            name = name.replace("attn/q", "attn/attention/q_proj/w")
+            name = name.replace("attn/k", "attn/attention/k_proj/w")
+            name = name.replace("attn/v", "attn/attention/v_proj/w")
+            name = name.replace("attn/o", "attn/attention/out_proj/w")
+            name = name.replace("norm_1", "ln_1")
+            name = name.replace("norm_2", "ln_2")
+            name = name.replace("attn/compute_output_bias/o_b", "attn/attention/out_proj/b")
+            name = name.replace("conv1d_main/c_fc/kernel", "c_fc/w")
+            name = name.replace("conv1d_main/c_fc/bias", "c_fc/b")
+            name = name.replace("conv1d_main/c_proj/kernel", "c_proj/w")
+            name = name.replace("conv1d_main/c_proj/bias", "c_proj/b")
+
+            names.append(name)
+            arrays.append(array)
+
+    for name, array in zip(names, arrays):
+        name = name[5:]  # skip "gpt2/"
+        name = name.split("/")
+        pointer = model.transformer
+        for m_name in name:
+            if re.fullmatch(r"[A-Za-z]+\d+", m_name):
+                scope_names = re.split(r"(\d+)", m_name)
+            else:
+                scope_names = [m_name]
+            if scope_names[0] == "w" or scope_names[0] == "g":
+                pointer = getattr(pointer, "weight")
+            elif scope_names[0] == "b":
+                pointer = getattr(pointer, "bias")
+            elif scope_names[0] == "wpe" or scope_names[0] == "wte":
+                pointer = getattr(pointer, scope_names[0])
+                pointer = getattr(pointer, "weight")
+            else:
+                pointer = getattr(pointer, scope_names[0])
+            if len(scope_names) >= 2:
+                num = int(scope_names[1])
+                pointer = pointer[num]
+
+        if name[-1] == "w" and name[-2] in ["out_proj", "k_proj", "q_proj", "v_proj", "c_proj", "c_fc"]:
+            array = array.transpose()
+
+        if name == ["wte"]:
+            # if vocab is padded, then trim off the padding embeddings
+            array = array[: config.vocab_size]
+
+        if pointer.shape != array.shape:
+            raise ValueError(f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched {name}")
+
+        logger.info(f"Initialize PyTorch weight {name}")  # progress goes through the module logger, like the message above
+        pointer.data = torch.from_numpy(array)
+
+    # tie the output projection to the word embeddings: the new Linear shares `wte.weight`
+    embs = model.transformer.wte.weight
+    lin = nn.Linear(embs.size()[1], embs.size()[0], bias=False)
+    lin.weight = embs
+    model.set_output_embeddings(lin)
+    return model
+
+
+class GPTNeoSelfAttention(nn.Module):
+    def __init__(self, config, attention_type, layer_id=None):
+        super().__init__()
+        self.config = config
+
+        max_positions = config.max_position_embeddings
+        bias = torch.tril(torch.ones((max_positions, max_positions), dtype=bool)).view(
+            1, 1, max_positions, max_positions
+        )
+
+        # local causal self attention is a sliding window where each token can only attend to the previous
+        # window_size tokens. This is implemented by updating the causal mask such that for each token
+        # all other tokens are masked except the previous window_size tokens.
+        if attention_type == "local":
+            bias = torch.bitwise_xor(bias, torch.tril(bias, -config.window_size))
+
+        self.register_buffer("bias", bias, persistent=False)
+        self.register_buffer("masked_bias", torch.tensor(-1e9), persistent=False)  # not read anywhere in this class
+
+        self.attn_dropout = nn.Dropout(float(config.attention_dropout))
+        self.resid_dropout = nn.Dropout(float(config.resid_dropout))
+        self.is_causal = True
+        self.layer_id = layer_id
+
+        self.embed_dim = config.hidden_size
+        self.num_heads = config.num_heads
+        self.head_dim = self.embed_dim // self.num_heads
+        if self.head_dim * self.num_heads != self.embed_dim:
+            raise ValueError(
+                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
+                f" {self.num_heads})."
+            )
+
+        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=False)
+        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=False)
+        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=False)
+        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=True)
+
+    def _split_heads(self, tensor, num_heads, attn_head_size):
+        """
+        Splits hidden_size dim into attn_head_size and num_heads
+        """
+        new_shape = tensor.size()[:-1] + (num_heads, attn_head_size)
+        tensor = tensor.view(new_shape)
+        return tensor.permute(0, 2, 1, 3)  # (batch, head, seq_length, head_features)
+
+    def _merge_heads(self, tensor, num_heads, attn_head_size):
+        """
+        Merges attn_head_size dim and num_attn_heads dim into hidden_size
+        """
+        tensor = tensor.permute(0, 2, 1, 3).contiguous()
+        new_shape = tensor.size()[:-2] + (num_heads * attn_head_size,)
+        return tensor.view(new_shape)
+
+    def _attn(self, query, key, value, attention_mask=None, head_mask=None):
+        # Keep the attention weights computation in fp32 to avoid overflow issues
+        query = query.to(torch.float32)
+        key = key.to(torch.float32)
+
+        attn_weights = torch.matmul(query, key.transpose(-1, -2))  # raw dot products; no scaling is applied here
+
+        # Apply the precomputed causal mask from `self.bias` (for local layers it is also restricted to the window)
+        query_length, key_length = query.size(-2), key.size(-2)
+        causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length]
+        mask_value = torch.finfo(attn_weights.dtype).min
+        # Need to be a tensor, otherwise we get error: `RuntimeError: expected scalar type float but found double`.
+        # Need to be on the same device, otherwise `RuntimeError: ..., x and y to be on the same device`
+        mask_value = torch.tensor(mask_value, dtype=attn_weights.dtype, device=attn_weights.device)
+        attn_weights = torch.where(causal_mask, attn_weights, mask_value)
+
+        if attention_mask is not None:  # added to the scores; sliced to the key length whatever its width
+            causal_mask = attention_mask[:, :, :, : key.shape[-2]]
+            attn_weights = attn_weights + causal_mask
+
+        attn_weights = nn.functional.softmax(attn_weights, dim=-1)
+        attn_weights = attn_weights.to(value.dtype)
+        attn_weights = self.attn_dropout(attn_weights)
+
+        # Mask heads if we want to
+        if head_mask is not None:
+            attn_weights = attn_weights * head_mask
+
+        attn_output = torch.matmul(attn_weights, value)
+
+        return attn_output, attn_weights
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask=None,
+        layer_past=None,
+        head_mask=None,
+        use_cache=False,
+        output_attentions=False,
+        cache_position=None,
+    ):
+        query = self.q_proj(hidden_states)
+        key = self.k_proj(hidden_states)
+        value = self.v_proj(hidden_states)
+
+        query = self._split_heads(query, self.num_heads, self.head_dim)
+        key = self._split_heads(key, self.num_heads, self.head_dim)
+        value = self._split_heads(value, self.num_heads, self.head_dim)
+
+        if layer_past is not None:
+            cache_kwargs = {"cache_position": cache_position}
+            key, value = layer_past.update(key, value, self.layer_id, cache_kwargs)
+
+        attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask)
+
+        attn_output = self._merge_heads(attn_output, self.num_heads, self.head_dim)
+        attn_output = self.out_proj(attn_output)
+        attn_output = self.resid_dropout(attn_output)
+
+        return attn_output, attn_weights
+
+
+class GPTNeoFlashAttention2(GPTNeoSelfAttention):
+    """
+    GPTNeo flash attention module. This module inherits from `GPTNeoSelfAttention` as the weights of the module stays
+    untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
+    flash attention and deal with padding tokens in case the input contains any of them.
+    """
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
+        # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignment, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
+        # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
+        self._flash_attn_uses_top_left_mask = flash_attn_supports_top_left_mask()
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask=None,
+        layer_past=None,
+        head_mask=None,
+        use_cache=False,
+        output_attentions=False,
+        cache_position=None,
+    ):
+        bsz, _, _ = hidden_states.size()
+
+        query = self.q_proj(hidden_states)
+        key = self.k_proj(hidden_states)
+        value = self.v_proj(hidden_states)
+
+        query = self._split_heads(query, self.num_heads, self.head_dim)
+        key = self._split_heads(key, self.num_heads, self.head_dim)
+        value = self._split_heads(value, self.num_heads, self.head_dim)
+
+        if layer_past is not None:
+            cache_kwargs = {"cache_position": cache_position}
+            key, value = layer_past.update(key, value, self.layer_id, cache_kwargs)
+
+        query_length = query.shape[2]
+        tgt_len = key.shape[2]
+
+        # Flash attention requires the input to have the shape
+        # batch_size x seq_length x num_heads x head_dim
+        query = query.transpose(1, 2).view(bsz, query_length, self.num_heads, self.head_dim)
+        key = key.transpose(1, 2).view(bsz, tgt_len, self.num_heads, self.head_dim)
+        value = value.transpose(1, 2).view(bsz, tgt_len, self.num_heads, self.head_dim)
+
+        attn_dropout = self.config.attention_dropout if self.training else 0.0
+        # NOTE(review): `key` is (bsz, tgt_len, num_heads, head_dim) after the `.view` above, so `key.shape[-2]` below is num_heads, not tgt_len - confirm the intended bound
+        if attention_mask is not None:  # no matter the length, we just slice it
+            attention_mask = attention_mask[:, :, :, : key.shape[-2]]
+
+        # In PEFT, the layer norms are usually cast to float32 for training stability, so the
+        # input hidden states get silently cast to float32 as well. Hence, we need to cast them
+        # back to the correct dtype just to be sure everything works as expected. This might slow
+        # down training & inference, so it is recommended not to cast the LayerNorms to fp32.
+        # (LlamaRMSNorm handles it correctly)
+
+        device_type = query.device.type if query.device.type != "mps" else "cpu"
+        if query.dtype == torch.float32:
+            if torch.is_autocast_enabled():
+                target_dtype = (
+                    torch.get_autocast_dtype(device_type)
+                    if hasattr(torch, "get_autocast_dtype")
+                    else torch.get_autocast_gpu_dtype()
+                )
+            # Handle the case where the model is quantized
+            elif hasattr(self.config, "_pre_quantization_dtype"):
+                target_dtype = self.config._pre_quantization_dtype
+            else:
+                target_dtype = self.q_proj.weight.dtype
+
+            logger.warning_once(
+                f"The input hidden states seems to be silently casted in float32, this might be related to"
+                f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
+                f" {target_dtype}."
+            )
+
+            query = query.to(target_dtype)
+            key = key.to(target_dtype)
+            value = value.to(target_dtype)
+
+        attn_output = _flash_attention_forward(
+            query,
+            key,
+            value,
+            attention_mask,
+            query_length,
+            dropout=attn_dropout,
+            softmax_scale=1.0,
+            is_causal=self.is_causal,
+            use_top_left_mask=self._flash_attn_uses_top_left_mask,
+        )
+
+        attn_weights_reshaped = attn_output.reshape(bsz, query_length, self.num_heads * self.head_dim)  # despite the name: the reshaped attention output
+        attn_output = self.out_proj(attn_weights_reshaped)
+        attn_output = self.resid_dropout(attn_output)
+
+        return attn_output, attn_weights_reshaped
+
+
+GPT_NEO_ATTENTION_CLASSES = {
+    "eager": GPTNeoSelfAttention,
+    "flash_attention_2": GPTNeoFlashAttention2,
+}
+
+
+class GPTNeoAttention(nn.Module):
+    def __init__(self, config, layer_id=0):
+        super().__init__()
+        self.layer_id = layer_id
+        self.attention_layers = config.attention_layers
+        self.attention_type = self.attention_layers[layer_id]
+        # the implementation class is picked by `config._attn_implementation`; the layer type is passed on to it
+        if self.attention_type in ["global", "local"]:
+            self.attention = GPT_NEO_ATTENTION_CLASSES[config._attn_implementation](
+                config, self.attention_type, layer_id
+            )
+        else:
+            raise NotImplementedError(
+                "Only attn layer types 'global' and 'local' exist, but got `config.attention_layers`: "
+                f"{config.attention_layers}. Select attn layer types from ['global', 'local'] only."
+            )
+
+    def forward(
+        self,
+        hidden_states,
+        layer_past=None,
+        attention_mask=None,
+        head_mask=None,
+        use_cache=False,
+        output_attentions=False,
+        cache_position=None,
+    ):
+        return self.attention(  # pure delegation: every argument is forwarded by keyword
+            hidden_states,
+            attention_mask=attention_mask,
+            layer_past=layer_past,
+            head_mask=head_mask,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            cache_position=cache_position,
+        )
+
+
+class GPTNeoMLP(nn.Module):
+    def __init__(self, intermediate_size, config):  # `intermediate_size` is the width of the hidden projection
+        super().__init__()
+        embed_dim = config.hidden_size
+        self.c_fc = nn.Linear(embed_dim, intermediate_size)
+        self.c_proj = nn.Linear(intermediate_size, embed_dim)
+        self.act = ACT2FN[config.activation_function]
+        self.dropout = nn.Dropout(float(config.resid_dropout))
+
+    def forward(self, hidden_states):
+        hidden_states = self.c_fc(hidden_states)  # hidden_size -> intermediate_size
+        hidden_states = self.act(hidden_states)
+        hidden_states = self.c_proj(hidden_states)  # intermediate_size -> hidden_size
+        hidden_states = self.dropout(hidden_states)
+        return hidden_states
+
+
+class GPTNeoBlock(GradientCheckpointingLayer):
+    def __init__(self, config, layer_id=None):
+        super().__init__()
+        hidden_size = config.hidden_size
+        inner_dim = config.intermediate_size if config.intermediate_size is not None else 4 * hidden_size
+        self.ln_1 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
+        self.attn = GPTNeoAttention(config, layer_id)
+        self.ln_2 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
+        self.mlp = GPTNeoMLP(inner_dim, config)
+
+    def forward(
+        self,
+        hidden_states,
+        layer_past=None,
+        attention_mask=None,
+        head_mask=None,
+        use_cache=False,
+        output_attentions=False,
+        cache_position=None,
+    ):
+        residual = hidden_states
+        hidden_states = self.ln_1(hidden_states)
+        attn_output, attn_weights = self.attn(
+            hidden_states,
+            layer_past=layer_past,
+            attention_mask=attention_mask,
+            head_mask=head_mask,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            cache_position=cache_position,
+        )
+
+        # residual connection around attention (`ln_1` was applied to the attention input only)
+        hidden_states = attn_output + residual
+
+        residual = hidden_states
+        hidden_states = self.ln_2(hidden_states)
+        feed_forward_hidden_states = self.mlp(hidden_states)
+        # residual connection around the MLP (`ln_2` was applied to the MLP input only)
+        hidden_states = residual + feed_forward_hidden_states
+
+        return hidden_states, attn_weights
+
+
+@auto_docstring
+class GPTNeoPreTrainedModel(PreTrainedModel):
+    config: GPTNeoConfig
+    load_tf_weights = load_tf_weights_in_gpt_neo
+    base_model_prefix = "transformer"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["GPTNeoBlock"]
+    _skip_keys_device_placement = "past_key_values"
+    _supports_flash_attn = True
+    _can_compile_fullgraph = False  # TODO: needs a hybrid cache
+
+    def __init__(self, *inputs, **kwargs):
+        super().__init__(*inputs, **kwargs)  # forwards everything unchanged to the parent constructor
+
+    def _init_weights(self, module):
+        """Initialize `module` in place: normal(0, initializer_range) weights for Linear/Embedding, zero biases and padding row, LayerNorm weight 1 / bias 0."""
+        if isinstance(module, (nn.Linear,)):
+            # Slightly different from the TF version which uses truncated_normal for initialization
+            # cf https://github.com/pytorch/pytorch/pull/5617
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+
+@auto_docstring
+class GPTNeoModel(GPTNeoPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+
+        self.embed_dim = config.hidden_size
+        self.wte = nn.Embedding(config.vocab_size, self.embed_dim)
+        self.wpe = nn.Embedding(config.max_position_embeddings, self.embed_dim)
+        self.drop = nn.Dropout(float(config.embed_dropout))
+        self.h = nn.ModuleList([GPTNeoBlock(config, layer_id=i) for i in range(config.num_layers)])
+        self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)
+
+        self.gradient_checkpointing = False
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.wte
+
+    def set_input_embeddings(self, new_embeddings):
+        self.wte = new_embeddings
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        past_key_values: Optional[Union[Cache, tuple[torch.FloatTensor]]] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+    ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]:
+        r"""
+        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
+            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
+            `past_key_values.get_seq_length()` (`sequence_length` of input past key value states). Indices of input
+            sequence tokens in the vocabulary.
+
+            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
+            `input_ids`.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+        if self.gradient_checkpointing and self.training:
+            if use_cache:
+                logger.warning_once(
+                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+                )
+                use_cache = False
+
+        if inputs_embeds is None:
+            inputs_embeds = self.wte(input_ids)
+
+        # TODO (joao): remove this exception in v4.56 -- it exists for users that try to pass a legacy cache
+        if not isinstance(past_key_values, (type(None), Cache)):
+            raise ValueError("The `past_key_values` should be either a `Cache` object or `None`.")
+
+        if use_cache and past_key_values is None:
+            past_key_values = DynamicCache(config=self.config)
+
+        seq_length = inputs_embeds.shape[1]
+        if cache_position is None:
+            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+            cache_position = torch.arange(past_seen_tokens, past_seen_tokens + seq_length, device=inputs_embeds.device)
+
+        if position_ids is None:
+            position_ids = cache_position.unsqueeze(0)
+
+        causal_mask = self._update_causal_mask(
+            attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
+        )
+
+        # Prepare head mask if needed
+        # 1.0 in head_mask indicates we keep the head
+        # attention_probs has shape bsz x num_heads x N x N
+        # head_mask has shape n_layer x batch x num_heads x N x N
+        head_mask = self.get_head_mask(head_mask, self.config.num_layers)
+        position_embeds = self.wpe(position_ids)
+        hidden_states = inputs_embeds + position_embeds
+
+        if token_type_ids is not None:
+            token_type_ids = token_type_ids.view(-1, seq_length)
+            token_type_embeds = self.wte(token_type_ids)
+            hidden_states = hidden_states + token_type_embeds
+
+        hidden_states = self.drop(hidden_states)
+        output_shape = (-1, seq_length, hidden_states.size(-1))
+
+        all_self_attentions = () if output_attentions else None
+        all_hidden_states = () if output_hidden_states else None
+        for i, block in enumerate(self.h):
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)
+
+            outputs = block(
+                hidden_states,
+                layer_past=past_key_values,
+                attention_mask=causal_mask,
+                head_mask=head_mask[i],
+                use_cache=use_cache,
+                output_attentions=output_attentions,
+                cache_position=cache_position,
+            )
+
+            hidden_states = outputs[0]
+            if output_attentions:
+                all_self_attentions = all_self_attentions + (outputs[1],)
+
+        hidden_states = self.ln_f(hidden_states)
+
+        hidden_states = hidden_states.view(output_shape)
+        # Add last hidden state
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+
+        if not return_dict:
+            return tuple(
+                v for v in [hidden_states, past_key_values, all_hidden_states, all_self_attentions] if v is not None
+            )
+
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attentions,
+        )
+
+    # Copied from transformers.models.gptj.modeling_gptj.GPTJModel._update_causal_mask
+    def _update_causal_mask(
+        self,
+        attention_mask: Union[torch.Tensor, "BlockMask"],
+        input_tensor: torch.Tensor,
+        cache_position: torch.Tensor,
+        past_key_values: Cache,
+        output_attentions: bool = False,
+    ):
+        if self.config._attn_implementation == "flash_attention_2":
+            if attention_mask is not None and (attention_mask == 0.0).any():
+                return attention_mask
+            return None
+        if self.config._attn_implementation == "flex_attention":
+            if isinstance(attention_mask, torch.Tensor):
+                attention_mask = make_flex_block_causal_mask(attention_mask)
+            return attention_mask
+
+        # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
+        # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
+        # to infer the attention mask.
+        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+        using_compilable_cache = past_key_values.is_compileable if past_key_values is not None else False
+
+        # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
+        if self.config._attn_implementation == "sdpa" and not using_compilable_cache and not output_attentions:
+            if AttentionMaskConverter._ignore_causal_mask_sdpa(
+                attention_mask,
+                inputs_embeds=input_tensor,
+                past_key_values_length=past_seen_tokens,
+                is_training=self.training,
+            ):
+                return None
+
+        dtype = input_tensor.dtype
+        sequence_length = input_tensor.shape[1]
+        if using_compilable_cache:
+            target_length = past_key_values.get_max_cache_shape()
+        else:
+            target_length = (
+                attention_mask.shape[-1]
+                if isinstance(attention_mask, torch.Tensor)
+                else past_seen_tokens + sequence_length + 1
+            )
+
+        # In case the provided `attention` mask is 2D, we generate a causal mask here (4D).
+        causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
+            attention_mask,
+            sequence_length=sequence_length,
+            target_length=target_length,
+            dtype=dtype,
+            cache_position=cache_position,
+            batch_size=input_tensor.shape[0],
+        )
+
+        if (
+            self.config._attn_implementation == "sdpa"
+            and attention_mask is not None
+            and attention_mask.device.type in ["cuda", "xpu", "npu"]
+            and not output_attentions
+        ):
+            # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
+            # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
+            # Details: https://github.com/pytorch/pytorch/issues/110213
+            min_dtype = torch.finfo(dtype).min
+            causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)
+
+        return causal_mask
+
+    @staticmethod
+    # Copied from transformers.models.gptj.modeling_gptj.GPTJModel._prepare_4d_causal_attention_mask_with_cache_position
+    def _prepare_4d_causal_attention_mask_with_cache_position(
+        attention_mask: torch.Tensor,
+        sequence_length: int,
+        target_length: int,
+        dtype: torch.dtype,
+        cache_position: torch.Tensor,
+        batch_size: int,
+        **kwargs,
+    ):
+        """
+        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
+        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.
+
+        Args:
+            attention_mask (`torch.Tensor`):
+                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
+                `(batch_size, 1, query_length, key_value_length)`.
+            sequence_length (`int`):
+                The sequence length being processed.
+            target_length (`int`):
+                The target length: when generating with static cache, the mask should be as long as the static cache,
+                to account for the 0 padding, the part of the cache that is not filled yet.
+            dtype (`torch.dtype`):
+                The dtype to use for the 4D attention mask.
+            cache_position (`torch.Tensor`):
+                Indices depicting the position of the input sequence tokens in the sequence.
+            batch_size (`int`):
+                Batch size.
+        """
+        if attention_mask is not None and attention_mask.dim() == 4:
+            # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
+            causal_mask = attention_mask
+        else:
+            min_dtype = torch.finfo(dtype).min
+            causal_mask = torch.full(
+                (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device
+            )
+            if sequence_length != 1:
+                causal_mask = torch.triu(causal_mask, diagonal=1)
+            causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1)
+            causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
+            if attention_mask is not None:
+                causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
+                mask_length = attention_mask.shape[-1]
+                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
+                    causal_mask.device
+                )
+                padding_mask = padding_mask == 0
+                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
+                    padding_mask, min_dtype
+                )
+
+        return causal_mask
+
+
+@auto_docstring(
+    custom_intro="""
+    The GPT Neo Model transformer with a language modeling head on top (linear layer with weights tied to the input
+    embeddings).
+    """
+)
+class GPTNeoForCausalLM(GPTNeoPreTrainedModel, GenerationMixin):
+    _tied_weights_keys = ["lm_head.weight"]
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.transformer = GPTNeoModel(config)
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        past_key_values: Optional[Union[Cache, tuple[torch.FloatTensor]]] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
+    ) -> Union[tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]:
+        r"""
+        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
+            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
+            `past_key_values.get_seq_length()` (`sequence_length` of input past key value states). Indices of input
+            sequence tokens in the vocabulary.
+
+            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
+            `input_ids`.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        labels (`torch.LongTensor` of shape `(batch_size, input_ids_length)`, *optional*):
+            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
+            `labels = input_ids`. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`. All labels set to
+            `-100` are ignored (masked); the loss is only computed for labels in `[0, ..., config.vocab_size - 1]`.
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        transformer_outputs = self.transformer(
+            input_ids,
+            past_key_values=past_key_values,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            cache_position=cache_position,
+        )
+        hidden_states = transformer_outputs[0]
+
+        lm_logits = self.lm_head(hidden_states)
+
+        loss = None
+        if labels is not None:
+            # move labels to correct device to enable model parallelism
+            labels = labels.to(lm_logits.device)
+            # Compute loss in fp32 to match with mesh-tf version
+            # https://github.com/EleutherAI/gpt-neo/blob/89ce74164da2fb16179106f54e2269b5da8db333/models/gpt2/gpt2.py#L179
+            lm_logits = lm_logits.to(torch.float32)
+
+            # Flatten the tokens
+            loss = self.loss_function(
+                lm_logits,
+                labels,
+                vocab_size=self.config.vocab_size,
+                **kwargs,
+            )
+
+            lm_logits = lm_logits.to(hidden_states.dtype)
+            loss = loss.to(hidden_states.dtype)
+
+        if not return_dict:
+            output = (lm_logits,) + transformer_outputs[1:]
+            return ((loss,) + output) if loss is not None else output
+
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=lm_logits,
+            past_key_values=transformer_outputs.past_key_values,
+            hidden_states=transformer_outputs.hidden_states,
+            attentions=transformer_outputs.attentions,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    The GPTNeo Model transformer with a sequence classification head on top (linear layer).
+
+    [`GPTNeoForSequenceClassification`] uses the last token in order to do the classification, as other causal models
+    (e.g. GPT-1) do.
+
+    Since it does classification on the last token, it requires to know the position of the last token. If a
+    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
+    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
+    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
+    each row of the batch).
+    """
+)
+class GPTNeoForSequenceClassification(GPTNeoPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+        self.transformer = GPTNeoModel(config)
+        self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        past_key_values: Optional[Union[Cache, tuple[torch.FloatTensor]]] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple[torch.Tensor], SequenceClassifierOutputWithPast]:
+        r"""
+        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
+            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
+            `past_key_values.get_seq_length()` (`sequence_length` of input past key value states). Indices of input
+            sequence tokens in the vocabulary.
+
+            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
+            `input_ids`.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        transformer_outputs = self.transformer(
+            input_ids,
+            past_key_values=past_key_values,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        hidden_states = transformer_outputs[0]
+        logits = self.score(hidden_states)
+
+        if input_ids is not None:
+            batch_size, sequence_length = input_ids.shape[:2]
+        else:
+            batch_size, sequence_length = inputs_embeds.shape[:2]
+
+        if self.config.pad_token_id is None and batch_size != 1:
+            raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
+        if self.config.pad_token_id is None:
+            last_non_pad_token = -1
+        elif input_ids is not None:
+            # To handle both left- and right- padding, we take the rightmost token that is not equal to pad_token_id
+            non_pad_mask = (input_ids != self.config.pad_token_id).to(logits.device, torch.int32)
+            token_indices = torch.arange(input_ids.shape[-1], device=logits.device, dtype=torch.int32)
+            last_non_pad_token = (token_indices * non_pad_mask).argmax(-1)
+        else:
+            last_non_pad_token = -1
+            logger.warning_once(
+                f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
+                "unexpected if using padding tokens in conjunction with `inputs_embeds.`"
+            )
+
+        pooled_logits = logits[torch.arange(batch_size, device=logits.device), last_non_pad_token]
+
+        loss = None
+        if labels is not None:
+            if self.config.problem_type is None:
+                if self.num_labels == 1:
+                    self.config.problem_type = "regression"
+                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
+                    self.config.problem_type = "single_label_classification"
+                else:
+                    self.config.problem_type = "multi_label_classification"
+
+            if self.config.problem_type == "regression":
+                loss_fct = MSELoss()
+                if self.num_labels == 1:
+                    loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
+                else:
+                    loss = loss_fct(pooled_logits, labels)
+            elif self.config.problem_type == "single_label_classification":
+                loss_fct = CrossEntropyLoss()
+                loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
+            elif self.config.problem_type == "multi_label_classification":
+                loss_fct = BCEWithLogitsLoss()
+                loss = loss_fct(pooled_logits, labels)
+        if not return_dict:
+            output = (pooled_logits,) + transformer_outputs[1:]
+            return ((loss,) + output) if loss is not None else output
+
+        return SequenceClassifierOutputWithPast(
+            loss=loss,
+            logits=pooled_logits,
+            past_key_values=transformer_outputs.past_key_values,
+            hidden_states=transformer_outputs.hidden_states,
+            attentions=transformer_outputs.attentions,
+        )
+
+
+@auto_docstring
+class GPTNeoForTokenClassification(GPTNeoPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+
+        self.transformer = GPTNeoModel(config)
+        self.dropout = nn.Dropout(config.classifier_dropout)
+        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Union[Cache, tuple[tuple[torch.Tensor]]]] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, TokenClassifierOutput]:
+        r"""
+        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
+            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
+            `past_key_values.get_seq_length()` (`sequence_length` of input past key value states). Indices of input
+            sequence tokens in the vocabulary.
+
+            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
+            `input_ids`.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the token classification loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. A Cross-Entropy loss is computed over the flattened per-token logits,
+            whatever the value of `config.num_labels`.
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        transformer_outputs = self.transformer(
+            input_ids,
+            past_key_values=past_key_values,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        hidden_states = transformer_outputs[0]
+        hidden_states = self.dropout(hidden_states)
+        logits = self.classifier(hidden_states)
+
+        loss = None
+        if labels is not None:
+            labels = labels.to(logits.device)
+            loss_fct = CrossEntropyLoss()
+            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+
+        if not return_dict:
+            output = (logits,) + transformer_outputs[2:]
+            return ((loss,) + output) if loss is not None else output
+
+        return TokenClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=transformer_outputs.hidden_states,
+            attentions=transformer_outputs.attentions,
+        )
+
+
+@auto_docstring
+class GPTNeoForQuestionAnswering(GPTNeoPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+        self.transformer = GPTNeoModel(config)
+        self.qa_outputs = nn.Linear(config.hidden_size, 2)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        start_positions: Optional[torch.LongTensor] = None,
+        end_positions: Optional[torch.LongTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, QuestionAnsweringModelOutput]:
+        r"""
+        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
+            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
+            `past_key_values.get_seq_length()` (`sequence_length` of input past key value states). Indices of input
+            sequence tokens in the vocabulary.
+
+            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
+            `input_ids`.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.transformer(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        sequence_output = outputs[0]
+
+        logits = self.qa_outputs(sequence_output)
+        start_logits, end_logits = logits.split(1, dim=-1)
+        start_logits = start_logits.squeeze(-1).contiguous()
+        end_logits = end_logits.squeeze(-1).contiguous()
+
+        total_loss = None
+        if start_positions is not None and end_positions is not None:
+            # If we are on multi-GPU, the split adds a dimension; squeeze it back out
+            if len(start_positions.size()) > 1:
+                start_positions = start_positions.squeeze(-1)
+            if len(end_positions.size()) > 1:
+                end_positions = end_positions.squeeze(-1)
+            # sometimes the start/end positions are outside our model inputs, we ignore these terms
+            ignored_index = start_logits.size(1)
+            start_positions = start_positions.clamp(0, ignored_index)
+            end_positions = end_positions.clamp(0, ignored_index)
+
+            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
+            start_loss = loss_fct(start_logits, start_positions)
+            end_loss = loss_fct(end_logits, end_positions)
+            total_loss = (start_loss + end_loss) / 2
+
+        if not return_dict:
+            output = (start_logits, end_logits) + outputs[2:]
+            return ((total_loss,) + output) if total_loss is not None else output
+
+        return QuestionAnsweringModelOutput(
+            loss=total_loss,
+            start_logits=start_logits,
+            end_logits=end_logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+__all__ = [
+    "GPTNeoForCausalLM",
+    "GPTNeoForQuestionAnswering",
+    "GPTNeoForSequenceClassification",
+    "GPTNeoForTokenClassification",
+    "GPTNeoModel",
+    "GPTNeoPreTrainedModel",
+    "load_tf_weights_in_gpt_neo",
+]
diff --git a/phivenv/Lib/site-packages/transformers/models/gpt_neox/__init__.py b/phivenv/Lib/site-packages/transformers/models/gpt_neox/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..fdf2639019963511e7fed587f636aad5edf96ee9
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/gpt_neox/__init__.py
@@ -0,0 +1,28 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_gpt_neox import *
+    from .modeling_gpt_neox import *
+    from .tokenization_gpt_neox_fast import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/phivenv/Lib/site-packages/transformers/models/gpt_neox/__pycache__/__init__.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/gpt_neox/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8eee72c04c41e95deecf9fd5842a8eae10d00932
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/gpt_neox/__pycache__/__init__.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/gpt_neox/__pycache__/configuration_gpt_neox.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/gpt_neox/__pycache__/configuration_gpt_neox.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..152c9a38b80d7614b232016fa0c7e21639f862f2
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/gpt_neox/__pycache__/configuration_gpt_neox.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/gpt_neox/__pycache__/modeling_gpt_neox.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/gpt_neox/__pycache__/modeling_gpt_neox.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..87dfd0f9ee5594aea8ed829b56b4c571f9096346
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/gpt_neox/__pycache__/modeling_gpt_neox.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/gpt_neox/__pycache__/modular_gpt_neox.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/gpt_neox/__pycache__/modular_gpt_neox.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7ca9db7f92d9f4477e839d5bfe149c534a391bd4
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/gpt_neox/__pycache__/modular_gpt_neox.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/gpt_neox/__pycache__/tokenization_gpt_neox_fast.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/gpt_neox/__pycache__/tokenization_gpt_neox_fast.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ecfe68c2cb2f45b6d61de232c0ad8281c4911991
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/gpt_neox/__pycache__/tokenization_gpt_neox_fast.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/gpt_neox/configuration_gpt_neox.py b/phivenv/Lib/site-packages/transformers/models/gpt_neox/configuration_gpt_neox.py
new file mode 100644
index 0000000000000000000000000000000000000000..80323914db35c9dbad865e54b8f8e82ab6a6ace4
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/gpt_neox/configuration_gpt_neox.py
@@ -0,0 +1,206 @@
+# coding=utf-8
+# Copyright 2022 EleutherAI and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""GPTNeoX model configuration"""
+
+from ...configuration_utils import PretrainedConfig
+from ...modeling_rope_utils import rope_config_validation
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class GPTNeoXConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`GPTNeoXModel`]. It is used to instantiate a
+    GPTNeoX model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the GPTNeoX
+    [EleutherAI/gpt-neox-20b](https://huggingface.co/EleutherAI/gpt-neox-20b) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 50432):
+            Vocabulary size of the GPTNeoX model. Defines the number of different tokens that can be represented by the
+            `input_ids` passed when calling [`GPTNeoXModel`].
+        hidden_size (`int`, *optional*, defaults to 6144):
+            Dimension of the encoder layers and the pooler layer.
+        num_hidden_layers (`int`, *optional*, defaults to 44):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 64):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        intermediate_size (`int`, *optional*, defaults to 24576):
+            Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"selu"` and `"gelu_new"` are supported.
+        rotary_pct (`float`, *optional*, defaults to 0.25):
+            Fraction of each attention head's dimensions to which rotary embeddings are applied.
+        rotary_emb_base (`int`, *optional*, defaults to 10000):
+            Base used to compute the rotary embedding frequencies.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio probability of the attention score.
+        hidden_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio of (1) the word embeddings, (2) the post-attention hidden states, and (3) the post-mlp
+            hidden states.
+        classifier_dropout (`float`, *optional*, defaults to 0.1):
+            Argument used when doing token classification, used in the model [`GPTNeoXForTokenClassification`].
+
+            The dropout ratio for the hidden layer.
+        max_position_embeddings (`int`, *optional*, defaults to 2048):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            just in case (e.g., 512 or 1024 or 2048).
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-5):
+            The epsilon used by the layer normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        use_parallel_residual (`bool`, *optional*, defaults to `True`):
+            Whether to use a "parallel" formulation in each Transformer layer, which can provide a slight training
+            speedup at large scales (e.g. 20B).
+        rope_scaling (`Dict`, *optional*):
+            Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
+            and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
+            accordingly.
+            Expected contents:
+                `rope_type` (`str`):
+                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
+                    'llama3'], with 'default' being the original RoPE implementation.
+                `factor` (`float`, *optional*):
+                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
+                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
+                    original maximum pre-trained length.
+                `original_max_position_embeddings` (`int`, *optional*):
+                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
+                    pretraining.
+                `attention_factor` (`float`, *optional*):
+                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
+                    computation. If unspecified, it defaults to value recommended by the implementation, using the
+                    `factor` field to infer the suggested value.
+                `beta_fast` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 32.
+                `beta_slow` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 1.
+                `short_factor` (`list[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `long_factor` (`list[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to long contexts (>
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `low_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
+                `high_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
+        attention_bias (`bool`, *optional*, defaults to `True`):
+            Whether to use a bias in the query, key, value and output projection layers during self-attention.
+
+        Example:
+
+    ```python
+    >>> from transformers import GPTNeoXConfig, GPTNeoXModel
+
+    >>> # Initializing a GPTNeoX gpt-neox-20b style configuration
+    >>> configuration = GPTNeoXConfig()
+
+    >>> # Initializing a model (with random weights) from the gpt-neox-20b style configuration
+    >>> model = GPTNeoXModel(configuration)  # doctest: +SKIP
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config  # doctest: +SKIP
+    ```"""
+
+    model_type = "gpt_neox"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    base_model_tp_plan = {
+        "layers.*.attention.query_key_value": "colwise",
+        "layers.*.attention.dense": "rowwise",
+        "layers.*.mlp.dense_h_to_4h": "colwise",
+        "layers.*.mlp.dense_4h_to_h": "rowwise",
+    }
+    base_model_pp_plan = {
+        "embed_in": (["input_ids"], ["inputs_embeds"]),
+        "emb_dropout": (["inputs_embeds"], ["hidden_states"]),
+        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
+        "final_layer_norm": (["hidden_states"], ["hidden_states"]),
+    }
+
+    def __init__(
+        self,
+        vocab_size=50432,
+        hidden_size=6144,
+        num_hidden_layers=44,
+        num_attention_heads=64,
+        intermediate_size=24576,
+        hidden_act="gelu",
+        rotary_pct=0.25,
+        rotary_emb_base=10000,
+        attention_dropout=0.0,
+        hidden_dropout=0.0,
+        classifier_dropout=0.1,
+        max_position_embeddings=2048,
+        initializer_range=0.02,
+        layer_norm_eps=1e-5,
+        use_cache=True,
+        bos_token_id=0,
+        eos_token_id=2,
+        tie_word_embeddings=False,
+        use_parallel_residual=True,
+        rope_scaling=None,
+        attention_bias=True,
+        **kwargs,
+    ):
+        super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.intermediate_size = intermediate_size
+        self.hidden_act = hidden_act
+        self.rotary_pct = rotary_pct
+        self.partial_rotary_factor = rotary_pct
+        self.rotary_emb_base = rotary_emb_base
+        self.rope_theta = rotary_emb_base
+        self.attention_dropout = attention_dropout
+        self.hidden_dropout = hidden_dropout
+        self.classifier_dropout = classifier_dropout
+        self.initializer_range = initializer_range
+        self.layer_norm_eps = layer_norm_eps
+        self.use_cache = use_cache
+        self.tie_word_embeddings = tie_word_embeddings
+        self.use_parallel_residual = use_parallel_residual
+        self.rope_scaling = rope_scaling
+        self.attention_bias = attention_bias
+        # Validate the correctness of rotary position embeddings parameters
+        # BC: if there is a 'type' field, move it to 'rope_type'.
+        if self.rope_scaling is not None and "type" in self.rope_scaling:
+            self.rope_scaling["rope_type"] = self.rope_scaling["type"]
+        rope_config_validation(self)
+
+        if self.hidden_size % self.num_attention_heads != 0:
+            raise ValueError(
+                "The hidden size is not divisible by the number of attention heads! Make sure to update them!"
+            )
+
+
+__all__ = ["GPTNeoXConfig"]
diff --git a/phivenv/Lib/site-packages/transformers/models/gpt_neox/modeling_gpt_neox.py b/phivenv/Lib/site-packages/transformers/models/gpt_neox/modeling_gpt_neox.py
new file mode 100644
index 0000000000000000000000000000000000000000..4125946e120663239c80a24e694ab1277a08b146
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/gpt_neox/modeling_gpt_neox.py
@@ -0,0 +1,832 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/gpt_neox/modular_gpt_neox.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_gpt_neox.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+from typing import Callable, Optional, Union
+
+import torch
+from torch import nn
+
+from ...activations import ACT2FN
+from ...cache_utils import Cache, DynamicCache
+from ...generation import GenerationMixin
+from ...integrations import use_kernel_forward_from_hub
+from ...masking_utils import create_causal_mask
+from ...modeling_flash_attention_utils import FlashAttentionKwargs
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import (
+    BaseModelOutputWithPast,
+    CausalLMOutputWithPast,
+    QuestionAnsweringModelOutput,
+    SequenceClassifierOutputWithPast,
+    TokenClassifierOutput,
+)
+from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from ...processing_utils import Unpack
+from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging
+from ...utils.deprecation import deprecate_kwarg
+from ...utils.generic import check_model_inputs
+from .configuration_gpt_neox import GPTNeoXConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+class GPTNeoXMLP(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.dense_h_to_4h = nn.Linear(config.hidden_size, config.intermediate_size)
+        self.dense_4h_to_h = nn.Linear(config.intermediate_size, config.hidden_size)
+        self.act = ACT2FN[config.hidden_act]
+
+    def forward(self, hidden_states):
+        hidden_states = self.dense_h_to_4h(hidden_states)
+        hidden_states = self.act(hidden_states)
+        hidden_states = self.dense_4h_to_h(hidden_states)
+        return hidden_states
+
+
+def rotate_half(x):
+    """Rotates half the hidden dims of the input."""
+    x1 = x[..., : x.shape[-1] // 2]
+    x2 = x[..., x.shape[-1] // 2 :]
+    return torch.cat((-x2, x1), dim=-1)
+
+
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
+    """Applies Rotary Position Embedding to the query and key tensors.
+
+    Args:
+        q (`torch.Tensor`): The query tensor.
+        k (`torch.Tensor`): The key tensor.
+        cos (`torch.Tensor`): The cosine part of the rotary embedding.
+        sin (`torch.Tensor`): The sine part of the rotary embedding.
+        position_ids (`torch.Tensor`, *optional*):
+            Deprecated and unused.
+        unsqueeze_dim (`int`, *optional*, defaults to 1):
+            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
+            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
+            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
+            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
+            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
+            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
+    Returns:
+        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
+    """
+    cos = cos.unsqueeze(unsqueeze_dim)
+    sin = sin.unsqueeze(unsqueeze_dim)
+
+    # Keep half or full tensor for later concatenation
+    rotary_dim = cos.shape[-1]
+    q_rot, q_pass = q[..., :rotary_dim], q[..., rotary_dim:]
+    k_rot, k_pass = k[..., :rotary_dim], k[..., rotary_dim:]
+
+    # Apply rotary embeddings on the first half or full tensor
+    q_embed = (q_rot * cos) + (rotate_half(q_rot) * sin)
+    k_embed = (k_rot * cos) + (rotate_half(k_rot) * sin)
+
+    # Concatenate back to full shape
+    q_embed = torch.cat([q_embed, q_pass], dim=-1)
+    k_embed = torch.cat([k_embed, k_pass], dim=-1)
+    return q_embed, k_embed
+
+
+def eager_attention_forward(
+    module: nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: torch.Tensor,
+    scaling: float,
+    dropout: float = 0.0,
+    head_mask: Optional[torch.Tensor] = None,
+    **kwargs,
+):
+    attn_weights = torch.matmul(query, key.transpose(2, 3)) * scaling
+
+    if attention_mask is not None:  # no matter the length, we just slice it
+        causal_mask = attention_mask[:, :, :, : key.shape[-2]]
+        attn_weights = attn_weights + causal_mask
+
+    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
+
+    # Mask heads if we want to
+    if head_mask is not None:
+        attn_weights = attn_weights * head_mask
+
+    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
+    attn_output = torch.matmul(attn_weights, value)
+
+    # Reshape outputs
+    attn_output = attn_output.transpose(1, 2).contiguous()
+
+    return attn_output, attn_weights
+
+
+class GPTNeoXAttention(nn.Module):
+    def __init__(self, config, layer_idx=None):
+        super().__init__()
+        self.config = config
+        self.head_size = config.hidden_size // config.num_attention_heads
+        self.attention_dropout = config.attention_dropout
+        self.rotary_ndims = int(self.head_size * config.rotary_pct)
+        self.scaling = self.head_size**-0.5
+        self.is_causal = True
+        self.layer_idx = layer_idx
+
+        self.query_key_value = nn.Linear(config.hidden_size, 3 * config.hidden_size, bias=config.attention_bias)
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size, bias=config.attention_bias)
+
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        attention_mask: torch.FloatTensor,
+        head_mask: Optional[torch.FloatTensor] = None,
+        layer_past: Optional[Cache] = None,
+        output_attentions: Optional[bool] = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ):
+        input_shape = hidden_states.shape[:-1]
+        hidden_shape = (*input_shape, -1, 3 * self.head_size)
+
+        qkv = self.query_key_value(hidden_states).view(hidden_shape).transpose(1, 2)
+        query_states, key_states, value_states = qkv.chunk(3, dim=-1)
+
+        cos, sin = position_embeddings
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+
+        # Update the cache with the new key/value states
+        if layer_past is not None:
+            cache_kwargs = {
+                "sin": sin,
+                "cos": cos,
+                "partial_rotation_size": self.rotary_ndims,
+                "cache_position": cache_position,
+            }
+            key_states, value_states = layer_past.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        attention_interface: Callable = eager_attention_forward
+        if self.config._attn_implementation != "eager":
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+        # Compute attention
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            scaling=self.scaling,
+            dropout=0.0 if not self.training else self.attention_dropout,
+            head_mask=head_mask,
+            **kwargs,
+        )
+
+        # Reshape outputs and final projection
+        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
+        attn_output = self.dense(attn_output)
+
+        return attn_output, attn_weights
+
+
+class GPTNeoXLayer(GradientCheckpointingLayer):
+    def __init__(self, config, layer_idx):
+        super().__init__()
+        self.use_parallel_residual = config.use_parallel_residual
+        self.input_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.post_attention_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.post_attention_dropout = nn.Dropout(config.hidden_dropout)
+        self.post_mlp_dropout = nn.Dropout(config.hidden_dropout)
+        self.attention = GPTNeoXAttention(config, layer_idx)
+        self.mlp = GPTNeoXMLP(config)
+
+    def forward(
+        self,
+        hidden_states: Optional[torch.FloatTensor],
+        attention_mask: Optional[torch.FloatTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = False,
+        layer_past: Optional[Cache] = None,
+        output_attentions: Optional[bool] = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ):
+        attn_output, attn_weights = self.attention(
+            self.input_layernorm(hidden_states),
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            layer_past=layer_past,
+            head_mask=head_mask,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            cache_position=cache_position,
+            position_embeddings=position_embeddings,
+            **kwargs,
+        )
+        attn_output = self.post_attention_dropout(attn_output)
+
+        if self.use_parallel_residual:
+            # pseudocode:
+            # x = x + attn(ln1(x)) + mlp(ln2(x))
+            mlp_output = self.mlp(self.post_attention_layernorm(hidden_states))
+            mlp_output = self.post_mlp_dropout(mlp_output)
+            hidden_states = mlp_output + attn_output + hidden_states
+        else:
+            # pseudocode:
+            # x = x + attn(ln1(x))
+            # x = x + mlp(ln2(x))
+            attn_output = attn_output + hidden_states
+            mlp_output = self.mlp(self.post_attention_layernorm(attn_output))
+            mlp_output = self.post_mlp_dropout(mlp_output)
+            hidden_states = mlp_output + attn_output
+
+        outputs = (hidden_states,)
+        if output_attentions:
+            outputs += (attn_weights,)
+
+        return outputs
+
+
+class GPTNeoXRotaryEmbedding(nn.Module):
+    inv_freq: torch.Tensor  # fix linting for `register_buffer`
+
+    def __init__(self, config: GPTNeoXConfig, device=None):
+        super().__init__()
+        # BC: "rope_type" was originally "type"
+        if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict):
+            self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
+        else:
+            self.rope_type = "default"
+        self.max_seq_len_cached = config.max_position_embeddings
+        self.original_max_seq_len = config.max_position_embeddings
+
+        self.config = config
+        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
+
+        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+        self.original_inv_freq = self.inv_freq
+
+    @torch.no_grad()
+    @dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
+    def forward(self, x, position_ids):
+        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
+        position_ids_expanded = position_ids[:, None, :].float()
+
+        device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
+        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
+            emb = torch.cat((freqs, freqs), dim=-1)
+            cos = emb.cos() * self.attention_scaling
+            sin = emb.sin() * self.attention_scaling
+
+        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
+
+
+@use_kernel_forward_from_hub("RMSNorm")
+class GPTNeoXRMSNorm(nn.Module):
+    def __init__(self, hidden_size, eps=1e-6):
+        """
+        GPTNeoXRMSNorm is equivalent to T5LayerNorm
+        """
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+
+    def forward(self, hidden_states):
+        input_dtype = hidden_states.dtype
+        hidden_states = hidden_states.to(torch.float32)
+        variance = hidden_states.pow(2).mean(-1, keepdim=True)
+        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+        return self.weight * hidden_states.to(input_dtype)
+
+    def extra_repr(self):
+        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
+
+
+class GPTNeoXDecoderLayer(GradientCheckpointingLayer):
+    def __init__(self, config: GPTNeoXConfig, layer_idx: int):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+
+        self.self_attn = GPTNeoXAttention(config=config, layer_idx=layer_idx)
+
+        self.mlp = GPTNeoXMLP(config)
+        self.input_layernorm = GPTNeoXRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = GPTNeoXRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        use_cache: Optional[bool] = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> torch.Tensor:
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+        # Self Attention
+        hidden_states, _ = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            position_embeddings=position_embeddings,
+            **kwargs,
+        )
+        hidden_states = residual + hidden_states
+
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states
+        return hidden_states
+
+
+@auto_docstring
+class GPTNeoXPreTrainedModel(PreTrainedModel):
+    config: GPTNeoXConfig
+    base_model_prefix = "gpt_neox"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["GPTNeoXLayer"]
+    _skip_keys_device_placement = ["past_key_values"]
+    _supports_flash_attn = True
+    _supports_sdpa = True
+    _supports_flex_attn = True
+
+    _can_compile_fullgraph = True
+    _supports_attention_backend = True
+    _can_record_outputs = {
+        "hidden_states": GPTNeoXDecoderLayer,
+        "attentions": GPTNeoXAttention,
+    }
+    _keys_to_ignore_on_load_unexpected = [r"attention.bias", r"attention.masked_bias"]
+
+
+@auto_docstring
+class GPTNeoXModel(GPTNeoXPreTrainedModel):  # Backbone without a task head: embedding -> GPTNeoXLayer stack -> final LayerNorm.
+    def __init__(self, config):
+        super().__init__(config)
+        self.config = config
+
+        self.embed_in = nn.Embedding(config.vocab_size, config.hidden_size)
+        self.emb_dropout = nn.Dropout(config.hidden_dropout)
+        self.layers = nn.ModuleList([GPTNeoXLayer(config, i) for i in range(config.num_hidden_layers)])  # `i` is handed to each layer as its layer index
+        self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.rotary_emb = GPTNeoXRotaryEmbedding(config=config)  # called once per forward; its output is shared by all layers
+        self.gradient_checkpointing = False
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @check_model_inputs
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> BaseModelOutputWithPast:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions  # explicit argument wins over the config default
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+
+        if (input_ids is None) ^ (inputs_embeds is not None):  # XOR is true when both or neither of the two inputs were given
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+        if self.gradient_checkpointing and self.training:
+            if use_cache:
+                logger.warning_once(
+                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+                )
+                use_cache = False
+
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_in(input_ids)
+
+        # TODO (joao): remove this exception in v4.56 -- it exists for users that try to pass a legacy cache
+        if not isinstance(past_key_values, (type(None), Cache)):
+            raise ValueError("The `past_key_values` should be either a `Cache` object or `None`.")
+
+        if use_cache and past_key_values is None:  # create a fresh cache only when caching is on and none was supplied
+            past_key_values = DynamicCache(config=self.config)
+
+        if cache_position is None:  # default: positions continue after whatever the cache already holds
+            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+            cache_position = torch.arange(
+                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
+            )
+
+        if position_ids is None:
+            position_ids = cache_position.unsqueeze(0)  # add a leading dim of size 1 in front of the 1-D arange
+
+        causal_mask = create_causal_mask(
+            config=self.config,
+            input_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            cache_position=cache_position,
+            past_key_values=past_key_values,
+            position_ids=position_ids,
+        )
+
+        # Prepare head mask if needed
+        # 1.0 in head_mask indicate we keep the head
+        # attention_probs has shape bsz x n_heads x N x N
+        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
+        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
+        converted_head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
+        # Flex Attention converts it to a separate mask
+        if head_mask is not None:
+            converted_head_mask = ~converted_head_mask.bool() * torch.finfo(inputs_embeds.dtype).min  # `~` binds before `*`: 0 where the mask was non-zero, finfo.min where it was zero
+            converted_head_mask = converted_head_mask.to(dtype=self.dtype, device=self.device)
+        head_mask = converted_head_mask  # NOTE(review): `eager_attention_forward` multiplies post-softmax weights by `head_mask`; confirm this 0/finfo.min encoding is what that path expects
+
+        hidden_states = self.emb_dropout(inputs_embeds)
+
+        # create position embeddings to be shared across the decoder layers
+        position_embeddings = self.rotary_emb(hidden_states, position_ids)
+
+        all_attentions = () if output_attentions else None
+        all_hidden_states = () if output_hidden_states else None
+        for i, layer in enumerate(self.layers):
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)  # record the input to layer i (the first entry is the embedding output)
+
+            outputs = layer(
+                hidden_states,
+                attention_mask=causal_mask,
+                position_ids=position_ids,
+                head_mask=head_mask[i],  # per-layer entry of the converted mask
+                layer_past=past_key_values,  # the same cache object is passed to every layer
+                use_cache=use_cache,
+                output_attentions=output_attentions,
+                cache_position=cache_position,
+                position_embeddings=position_embeddings,
+                **kwargs,
+            )
+            hidden_states = outputs[0]
+
+            if output_attentions:
+                all_attentions = all_attentions + (outputs[1],)
+
+        hidden_states = self.final_layer_norm(hidden_states)
+        # Add last hidden state
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values,
+            hidden_states=all_hidden_states,
+            attentions=all_attentions,
+        )
+
+    def get_input_embeddings(self):  # accessor for the token embedding module
+        return self.embed_in
+
+    def set_input_embeddings(self, value):  # replaces the token embedding module with `value`
+        self.embed_in = value
+
+
+@auto_docstring(
+    custom_intro="""
+    GPTNeoX Model with a `language modeling` head on top for CLM fine-tuning.
+    """
+)
+class GPTNeoXForCausalLM(GPTNeoXPreTrainedModel, GenerationMixin):  # GPTNeoXModel backbone + bias-free linear projection to vocabulary logits.
+    _tied_weights_keys = ["embed_out.weight"]
+    _tp_plan = {"embed_out": "colwise_rep"}
+    _pp_plan = {"embed_out": (["hidden_states"], ["logits"])}
+
+    def __init__(self, config):
+        super().__init__(config)
+
+        self.gpt_neox = GPTNeoXModel(config)
+        self.embed_out = nn.Linear(config.hidden_size, config.vocab_size, bias=False)  # LM head: hidden_size -> vocab_size
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_output_embeddings(self):  # accessor for the LM head
+        return self.embed_out
+
+    def set_output_embeddings(self, new_embeddings):  # replaces the LM head with `new_embeddings`
+        self.embed_out = new_embeddings
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        past_key_values: Optional[Union[Cache, tuple[tuple[torch.FloatTensor]]]] = None,  # NOTE(review): the backbone's forward body rejects anything but `Cache`/`None` -- confirm the tuple hint is still valid
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> Union[tuple, CausalLMOutputWithPast]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
+            `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
+            ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+        Example:
+
+        ```python
+        >>> from transformers import AutoTokenizer, GPTNeoXForCausalLM, GPTNeoXConfig
+        >>> import torch
+
+        >>> tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b")
+        >>> config = GPTNeoXConfig.from_pretrained("EleutherAI/gpt-neox-20b")
+        >>> config.is_decoder = True
+        >>> model = GPTNeoXForCausalLM.from_pretrained("EleutherAI/gpt-neox-20b", config=config)
+
+        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+        >>> outputs = model(**inputs)
+
+        >>> prediction_logits = outputs.logits
+        ```"""
+
+        outputs: BaseModelOutputWithPast = self.gpt_neox(
+            input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            cache_position=cache_position,
+            **kwargs,
+        )
+
+        hidden_states = outputs.last_hidden_state
+        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
+        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep  # int k -> last k positions; k == 0 -> slice(0, None), i.e. all positions; a tensor is used as the index directly
+        logits = self.embed_out(hidden_states[:, slice_indices, :])
+
+        loss = None
+        if labels is not None:  # loss is only computed when labels are supplied
+            loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs)
+
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    The GPTNeoX Model transformer with a sequence classification head on top (linear layer).
+
+    [`GPTNeoXForSequenceClassification`] uses the last token in order to do the classification, as other causal models
+    (e.g. GPT-1) do.
+
+    Since it does classification on the last token, it requires to know the position of the last token. If a
+    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
+    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
+    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
+    each row of the batch).
+    """
+)
+class GPTNeoXForSequenceClassification(GPTNeoXPreTrainedModel):  # GPTNeoXModel backbone + bias-free linear scorer, pooled at one position per sequence.
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+        self.gpt_neox = GPTNeoXModel(config)
+        self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)  # applied at every position; pooling happens in forward
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        past_key_values: Optional[Union[Cache, tuple[tuple[torch.FloatTensor]]]] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+    ) -> SequenceClassifierOutputWithPast:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+        """
+
+        outputs: BaseModelOutputWithPast = self.gpt_neox(
+            input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+        )
+        hidden_states = outputs.last_hidden_state
+        logits = self.score(hidden_states)  # per-position scores; one position per row is selected below
+
+        batch_size = logits.shape[0]
+        if self.config.pad_token_id is None and batch_size != 1:  # without a pad id, only a single-row batch is accepted
+            raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
+        if self.config.pad_token_id is None:
+            last_non_pad_token = -1  # no pad id: use the final position
+        elif input_ids is not None:
+            # To handle both left- and right- padding, we take the rightmost token that is not equal to pad_token_id
+            non_pad_mask = (input_ids != self.config.pad_token_id).to(logits.device, torch.int32)
+            token_indices = torch.arange(input_ids.shape[-1], device=logits.device, dtype=torch.int32)
+            last_non_pad_token = (token_indices * non_pad_mask).argmax(-1)  # pad positions contribute 0, so argmax yields the largest non-pad index per row
+        else:
+            last_non_pad_token = -1  # only `inputs_embeds` given: padding cannot be detected, fall back to the final position
+            logger.warning_once(
+                f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
+                "unexpected if using padding tokens in conjunction with `inputs_embeds.`"
+            )
+
+        pooled_logits = logits[torch.arange(batch_size, device=logits.device), last_non_pad_token]  # pick the selected position from each batch row
+
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(logits=logits, labels=labels, pooled_logits=pooled_logits, config=self.config)
+
+        return SequenceClassifierOutputWithPast(
+            loss=loss,
+            logits=pooled_logits,  # the pooled scores, not the per-position ones, are returned as `logits`
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+class GPTNeoXForTokenClassification(GPTNeoXPreTrainedModel):  # GPTNeoXModel backbone + dropout + linear classifier applied at every position.
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+
+        self.gpt_neox = GPTNeoXModel(config)
+        self.dropout = nn.Dropout(config.classifier_dropout)
+        self.classifier = nn.Linear(config.hidden_size, config.num_labels)  # hidden_size -> num_labels
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Union[Cache, tuple[tuple[torch.Tensor]]]] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,  # accepted but not used anywhere in this method
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+    ) -> TokenClassifierOutput:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the token classification loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+        """
+
+        outputs: BaseModelOutputWithPast = self.gpt_neox(
+            input_ids,
+            past_key_values=past_key_values,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+        )
+
+        hidden_states = outputs.last_hidden_state
+        hidden_states = self.dropout(hidden_states)
+        logits = self.classifier(hidden_states)  # one score vector per position
+
+        loss = None
+        if labels is not None:  # loss is only computed when labels are supplied
+            loss = self.loss_function(logits, labels, self.config)
+
+        return TokenClassifierOutput(  # note: the cache from `outputs` is not part of this output type's arguments here
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+@auto_docstring
+class GPTNeoXForQuestionAnswering(GPTNeoXPreTrainedModel):  # GPTNeoXModel backbone + linear head producing start/end span logits per position.
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels  # stored but not read in this class; the head width below is fixed at 2
+        self.gpt_neox = GPTNeoXModel(config)
+        self.qa_outputs = nn.Linear(config.hidden_size, 2)  # two outputs per position: start and end
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,  # accepted but not used anywhere in this method
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        start_positions: Optional[torch.LongTensor] = None,
+        end_positions: Optional[torch.LongTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+    ) -> QuestionAnsweringModelOutput:
+        outputs: BaseModelOutputWithPast = self.gpt_neox(
+            input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+        )
+
+        sequence_output = outputs.last_hidden_state
+
+        logits = self.qa_outputs(sequence_output)
+        start_logits, end_logits = logits.split(1, dim=-1)  # split the size-2 last dim into two size-1 tensors
+        start_logits = start_logits.squeeze(-1).contiguous()  # drop the trailing size-1 dim
+        end_logits = end_logits.squeeze(-1).contiguous()
+
+        loss = None
+        if start_positions is not None and end_positions is not None:  # both targets are required for a loss
+            loss = self.loss_function(start_logits, end_logits, start_positions, end_positions)
+
+        return QuestionAnsweringModelOutput(
+            loss=loss,
+            start_logits=start_logits,
+            end_logits=end_logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+__all__ = [  # names exported by `from <this module> import *`
+    "GPTNeoXForCausalLM",
+    "GPTNeoXForQuestionAnswering",
+    "GPTNeoXForSequenceClassification",
+    "GPTNeoXForTokenClassification",
+    "GPTNeoXLayer",
+    "GPTNeoXModel",
+    "GPTNeoXPreTrainedModel",
+]
diff --git a/phivenv/Lib/site-packages/transformers/models/gpt_neox/modular_gpt_neox.py b/phivenv/Lib/site-packages/transformers/models/gpt_neox/modular_gpt_neox.py
new file mode 100644
index 0000000000000000000000000000000000000000..08e8f182f9d019271e6d5d932ad750ead6e029a1
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/gpt_neox/modular_gpt_neox.py
@@ -0,0 +1,704 @@
+from typing import Callable, Optional, Union
+
+import torch
+import torch.utils.checkpoint
+from torch import nn
+
+from ...activations import ACT2FN
+from ...cache_utils import Cache, DynamicCache
+from ...generation import GenerationMixin
+from ...masking_utils import create_causal_mask
+from ...modeling_flash_attention_utils import FlashAttentionKwargs
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import (
+    BaseModelOutputWithPast,
+    CausalLMOutputWithPast,
+    QuestionAnsweringModelOutput,
+    SequenceClassifierOutputWithPast,
+    TokenClassifierOutput,
+)
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from ...processing_utils import Unpack
+from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging
+from ..llama.modeling_llama import LlamaModel, LlamaPreTrainedModel, LlamaRotaryEmbedding, rotate_half
+
+
+logger = logging.get_logger(__name__)  # module-level logger obtained from the package's `logging` utility
+
+
+class GPTNeoXMLP(nn.Module):  # Feed-forward block: linear up-projection -> activation -> linear down-projection.
+    def __init__(self, config):
+        super().__init__()
+        self.dense_h_to_4h = nn.Linear(config.hidden_size, config.intermediate_size)  # hidden_size -> intermediate_size
+        self.dense_4h_to_h = nn.Linear(config.intermediate_size, config.hidden_size)  # intermediate_size -> hidden_size
+        self.act = ACT2FN[config.hidden_act]  # activation looked up by its config name
+
+    def forward(self, hidden_states):  # output has the same last-dim size (hidden_size) as the input
+        hidden_states = self.dense_h_to_4h(hidden_states)
+        hidden_states = self.act(hidden_states)
+        hidden_states = self.dense_4h_to_h(hidden_states)
+        return hidden_states
+
+
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
+    """Applies Rotary Position Embedding to the query and key tensors.
+
+    Args:
+        q (`torch.Tensor`): The query tensor.
+        k (`torch.Tensor`): The key tensor.
+        cos (`torch.Tensor`): The cosine part of the rotary embedding.
+        sin (`torch.Tensor`): The sine part of the rotary embedding.
+        position_ids (`torch.Tensor`, *optional*):
+            Deprecated and unused.
+        unsqueeze_dim (`int`, *optional*, defaults to 1):
+            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
+            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
+            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
+            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
+            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
+            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
+    Returns:
+        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
+    """
+    cos = cos.unsqueeze(unsqueeze_dim)
+    sin = sin.unsqueeze(unsqueeze_dim)
+
+    # Split q/k on the last dim: the first `rotary_dim` channels are rotated, the remainder passes through unchanged
+    rotary_dim = cos.shape[-1]  # rotation width is taken from cos, not from q/k
+    q_rot, q_pass = q[..., :rotary_dim], q[..., rotary_dim:]
+    k_rot, k_pass = k[..., :rotary_dim], k[..., rotary_dim:]
+
+    # Rotate only the `*_rot` slices (this is the whole tensor when rotary_dim equals the last-dim size of q/k)
+    q_embed = (q_rot * cos) + (rotate_half(q_rot) * sin)
+    k_embed = (k_rot * cos) + (rotate_half(k_rot) * sin)
+
+    # Re-attach the untouched `*_pass` slices so the outputs have the same last-dim size as the inputs
+    q_embed = torch.cat([q_embed, q_pass], dim=-1)
+    k_embed = torch.cat([k_embed, k_pass], dim=-1)
+    return q_embed, k_embed
+
+
+def eager_attention_forward(  # Plain matmul/softmax attention; returns (attn_output, attn_weights).
+    module: nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: torch.Tensor,
+    scaling: float,
+    dropout: float = 0.0,
+    head_mask: Optional[torch.Tensor] = None,
+    **kwargs,
+):
+    attn_weights = torch.matmul(query, key.transpose(2, 3)) * scaling  # assumes q/k are (batch, heads, seq, head_dim) -- TODO confirm
+
+    if attention_mask is not None:  # no matter the length, we just slice it
+        causal_mask = attention_mask[:, :, :, : key.shape[-2]]  # trim the mask's last dim to the key length
+        attn_weights = attn_weights + causal_mask  # the mask is applied additively, before softmax
+
+    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)  # softmax in float32, then cast back to the query dtype
+
+    # Mask heads if we want to
+    if head_mask is not None:
+        attn_weights = attn_weights * head_mask  # multiplicative, applied after softmax
+
+    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)  # `module` is only read for its training flag
+    attn_output = torch.matmul(attn_weights, value)
+
+    # Reshape outputs
+    attn_output = attn_output.transpose(1, 2).contiguous()  # swap dims 1 and 2 so the caller can flatten the trailing dims
+
+    return attn_output, attn_weights
+
+
+class GPTNeoXAttention(nn.Module):  # Self-attention with a fused QKV projection, rotary embeddings and a pluggable attention backend.
+    def __init__(self, config, layer_idx=None):
+        super().__init__()
+        self.config = config
+        self.head_size = config.hidden_size // config.num_attention_heads
+        self.attention_dropout = config.attention_dropout
+        self.rotary_ndims = int(self.head_size * config.rotary_pct)  # in this class only forwarded to the cache as "partial_rotation_size"
+        self.scaling = self.head_size**-0.5  # 1 / sqrt(head_size)
+        self.is_causal = True
+        self.layer_idx = layer_idx  # index handed to the cache's `update` call
+
+        self.query_key_value = nn.Linear(config.hidden_size, 3 * config.hidden_size, bias=config.attention_bias)  # single projection producing q, k and v
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size, bias=config.attention_bias)  # output projection
+
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        attention_mask: torch.FloatTensor,
+        head_mask: Optional[torch.FloatTensor] = None,
+        layer_past: Optional[Cache] = None,
+        output_attentions: Optional[bool] = False,  # not read in this method; weights are always returned
+        cache_position: Optional[torch.LongTensor] = None,
+        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ):
+        input_shape = hidden_states.shape[:-1]  # every dim except the last
+        hidden_shape = (*input_shape, -1, 3 * self.head_size)  # -1 is inferred; per head the last dim holds q, k and v side by side
+
+        qkv = self.query_key_value(hidden_states).view(hidden_shape).transpose(1, 2)  # assumes hidden_states is (batch, seq, hidden), giving (batch, heads, seq, 3 * head_size) -- TODO confirm
+        query_states, key_states, value_states = qkv.chunk(3, dim=-1)  # three equal slices of the last dim
+
+        cos, sin = position_embeddings  # unpacking fails if this is None, despite the Optional default
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+
+        # Cache QKV values
+        if layer_past is not None:
+            cache_kwargs = {
+                "sin": sin,
+                "cos": cos,
+                "partial_rotation_size": self.rotary_ndims,
+                "cache_position": cache_position,
+            }
+            key_states, value_states = layer_past.update(key_states, value_states, self.layer_idx, cache_kwargs)  # keys are stored after rotation; the returned k/v replace the local ones
+
+        attention_interface: Callable = eager_attention_forward
+        if self.config._attn_implementation != "eager":  # any other name is looked up in the registry
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+        # Compute attention
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            scaling=self.scaling,
+            dropout=0.0 if not self.training else self.attention_dropout,  # dropout only while training
+            head_mask=head_mask,
+            **kwargs,
+        )
+
+        # Reshape outputs and final projection
+        attn_output = attn_output.reshape(*input_shape, -1).contiguous()  # flatten the trailing dims back into one
+        attn_output = self.dense(attn_output)
+
+        return attn_output, attn_weights
+
+
+class GPTNeoXLayer(GradientCheckpointingLayer):  # One transformer block: attention + MLP, combined with parallel or sequential residuals.
+    def __init__(self, config, layer_idx):
+        super().__init__()
+        self.use_parallel_residual = config.use_parallel_residual  # selects which residual wiring forward uses
+        self.input_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)  # applied before attention
+        self.post_attention_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)  # applied before the MLP
+        self.post_attention_dropout = nn.Dropout(config.hidden_dropout)
+        self.post_mlp_dropout = nn.Dropout(config.hidden_dropout)
+        self.attention = GPTNeoXAttention(config, layer_idx)
+        self.mlp = GPTNeoXMLP(config)
+
+    def forward(
+        self,
+        hidden_states: Optional[torch.FloatTensor],
+        attention_mask: Optional[torch.FloatTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = False,
+        layer_past: Optional[Cache] = None,
+        output_attentions: Optional[bool] = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ):
+        attn_output, attn_weights = self.attention(
+            self.input_layernorm(hidden_states),
+            attention_mask=attention_mask,
+            position_ids=position_ids,  # not a named parameter of GPTNeoXAttention.forward; lands in its **kwargs
+            layer_past=layer_past,
+            head_mask=head_mask,
+            use_cache=use_cache,  # likewise lands in the attention module's **kwargs
+            output_attentions=output_attentions,
+            cache_position=cache_position,
+            position_embeddings=position_embeddings,
+            **kwargs,
+        )
+        attn_output = self.post_attention_dropout(attn_output)
+
+        if self.use_parallel_residual:
+            # pseudocode:
+            # x = x + attn(ln1(x)) + mlp(ln2(x))
+            mlp_output = self.mlp(self.post_attention_layernorm(hidden_states))  # the MLP reads the layer input, not the attention result
+            mlp_output = self.post_mlp_dropout(mlp_output)
+            hidden_states = mlp_output + attn_output + hidden_states
+        else:
+            # pseudocode:
+            # x = x + attn(ln1(x))
+            # x = x + mlp(ln2(x))
+            attn_output = attn_output + hidden_states  # first residual; `attn_output` now holds the updated stream
+            mlp_output = self.mlp(self.post_attention_layernorm(attn_output))
+            mlp_output = self.post_mlp_dropout(mlp_output)
+            hidden_states = mlp_output + attn_output
+
+        outputs = (hidden_states,)  # always a tuple; attention weights are appended only on request
+        if output_attentions:
+            outputs += (attn_weights,)
+
+        return outputs
+
+
+class GPTNeoXRotaryEmbedding(LlamaRotaryEmbedding):  # Llama's rotary embedding under a GPT-NeoX name; nothing is overridden.
+    pass
+
+
+class GPTNeoXPreTrainedModel(LlamaPreTrainedModel):  # Inherits from the Llama base and overrides only the three attributes below.
+    base_model_prefix = "gpt_neox"
+    _no_split_modules = ["GPTNeoXLayer"]
+    _keys_to_ignore_on_load_unexpected = [r"attention.bias", r"attention.masked_bias"]  # presumably legacy checkpoint buffers -- TODO confirm
+
+
+GPT_NEOX_START_DOCSTRING = None  # Intentionally None: placeholder that will be picked up by modular
+GPT_NEOX_INPUTS_DOCSTRING = None  # Intentionally None: placeholder that will be picked up by modular
+
+
+class GPTNeoXModel(LlamaModel):
+    def __init__(self, config):  # Builds the GPT-NeoX backbone modules.
+        PreTrainedModel.__init__(self, config)  # calls PreTrainedModel's initializer directly rather than super().__init__ -- presumably to skip LlamaModel's own module setup; TODO confirm
+        self.config = config
+
+        self.embed_in = nn.Embedding(config.vocab_size, config.hidden_size)
+        self.emb_dropout = nn.Dropout(config.hidden_dropout)
+        self.layers = nn.ModuleList([GPTNeoXLayer(config, i) for i in range(config.num_hidden_layers)])  # `i` is handed to each layer as its layer index
+        self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.rotary_emb = GPTNeoXRotaryEmbedding(config=config)
+        self.gradient_checkpointing = False
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):  # accessor for the token embedding module
+        return self.embed_in
+
+    def set_input_embeddings(self, value):  # replaces the token embedding module with `value`
+        self.embed_in = value
+
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> BaseModelOutputWithPast:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache  # None -> config value, as above
+
+        if (input_ids is None) ^ (inputs_embeds is not None):  # true when both or neither are given
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+        if self.gradient_checkpointing and self.training:
+            if use_cache:
+                logger.warning_once(
+                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+                )
+                use_cache = False  # caching is switched off for this call only
+
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_in(input_ids)  # embed token ids when no embeddings were passed in
+
+        # TODO (joao): remove this exception in v4.56 -- it exists for users that try to pass a legacy cache
+        if not isinstance(past_key_values, (type(None), Cache)):
+            raise ValueError("The `past_key_values` should be either a `Cache` object or `None`.")
+
+        if use_cache and past_key_values is None:
+            past_key_values = DynamicCache(config=self.config)  # created on demand when none was supplied
+
+        if cache_position is None:  # default: continue after the tokens already in the cache (0 without a cache)
+            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+            cache_position = torch.arange(
+                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
+            )
+
+        if position_ids is None:
+            position_ids = cache_position.unsqueeze(0)  # same values with a leading dim of size 1
+
+        causal_mask = create_causal_mask(
+            config=self.config,
+            input_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            cache_position=cache_position,
+            past_key_values=past_key_values,
+            position_ids=position_ids,
+        )
+
+        # Prepare head mask if needed
+        # 1.0 in head_mask indicates we keep the head
+        # attention_probs has shape bsz x n_heads x N x N
+        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
+        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
+        converted_head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
+        # Becomes 0 where a head is kept and dtype-min where it is masked; Flex Attention converts it to a separate mask
+        if head_mask is not None:
+            converted_head_mask = ~converted_head_mask.bool() * torch.finfo(inputs_embeds.dtype).min
+            converted_head_mask = converted_head_mask.to(dtype=self.dtype, device=self.device)
+        head_mask = converted_head_mask
+
+        hidden_states = self.emb_dropout(inputs_embeds)
+
+        # create position embeddings to be shared across the decoder layers
+        position_embeddings = self.rotary_emb(hidden_states, position_ids)
+
+        all_attentions = () if output_attentions else None
+        all_hidden_states = () if output_hidden_states else None
+        for i, layer in enumerate(self.layers):
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)  # state entering layer i
+
+            outputs = layer(
+                hidden_states,
+                attention_mask=causal_mask,
+                position_ids=position_ids,
+                head_mask=head_mask[i],  # entry for layer i
+                layer_past=past_key_values,
+                use_cache=use_cache,
+                output_attentions=output_attentions,
+                cache_position=cache_position,
+                position_embeddings=position_embeddings,
+                **kwargs,
+            )
+            hidden_states = outputs[0]  # becomes the input of the next layer
+
+            if output_attentions:
+                all_attentions = all_attentions + (outputs[1],)  # second element of the layer output
+
+        hidden_states = self.final_layer_norm(hidden_states)
+        # Add the final, layer-normed hidden state
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values,
+            hidden_states=all_hidden_states,
+            attentions=all_attentions,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    GPTNeoX Model with a `language modeling` head on top for CLM fine-tuning.
+    """
+)
+class GPTNeoXForCausalLM(GPTNeoXPreTrainedModel, GenerationMixin):
+    _tied_weights_keys = ["embed_out.weight"]
+    _tp_plan = {"embed_out": "colwise_rep"}
+    _pp_plan = {"embed_out": (["hidden_states"], ["logits"])}
+
+    def __init__(self, config):
+        super().__init__(config)
+
+        self.gpt_neox = GPTNeoXModel(config)
+        self.embed_out = nn.Linear(config.hidden_size, config.vocab_size, bias=False)  # hidden state -> vocab logits
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_output_embeddings(self):
+        return self.embed_out
+
+    def set_output_embeddings(self, new_embeddings):
+        self.embed_out = new_embeddings
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        past_key_values: Optional[Union[Cache, tuple[tuple[torch.FloatTensor]]]] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> Union[tuple, CausalLMOutputWithPast]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
+            `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring). Tokens with indices set to `-100` are
+            ignored (masked); the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+        Example:
+
+        ```python
+        >>> from transformers import AutoTokenizer, GPTNeoXForCausalLM, GPTNeoXConfig
+        >>> import torch
+
+        >>> tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b")
+        >>> config = GPTNeoXConfig.from_pretrained("EleutherAI/gpt-neox-20b")
+        >>> config.is_decoder = True
+        >>> model = GPTNeoXForCausalLM.from_pretrained("EleutherAI/gpt-neox-20b", config=config)
+
+        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+        >>> outputs = model(**inputs)
+
+        >>> prediction_logits = outputs.logits
+        ```"""
+
+        outputs: BaseModelOutputWithPast = self.gpt_neox(
+            input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            cache_position=cache_position,
+            **kwargs,
+        )
+
+        hidden_states = outputs.last_hidden_state
+        # Only compute the needed logits: an int keeps the last `logits_to_keep` positions (0 keeps all), a tensor indexes dim 1
+        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+        logits = self.embed_out(hidden_states[:, slice_indices, :])
+
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs)
+
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    The GPTNeoX Model transformer with a sequence classification head on top (linear layer).
+
+    [`GPTNeoXForSequenceClassification`] uses the last token in order to do the classification, as other causal models
+    (e.g. GPT-1) do.
+
+    Since it does classification on the last token, it requires to know the position of the last token. If a
+    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
+    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
+    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
+    each row of the batch).
+    """
+)
+class GPTNeoXForSequenceClassification(GPTNeoXPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+        self.gpt_neox = GPTNeoXModel(config)
+        self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        past_key_values: Optional[Union[Cache, tuple[tuple[torch.FloatTensor]]]] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+    ) -> SequenceClassifierOutputWithPast:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+        """
+
+        outputs: BaseModelOutputWithPast = self.gpt_neox(
+            input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+        )
+        hidden_states = outputs.last_hidden_state
+        logits = self.score(hidden_states)  # scores for every position; one position per row is selected below
+
+        batch_size = logits.shape[0]
+        if self.config.pad_token_id is None and batch_size != 1:
+            raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
+        if self.config.pad_token_id is None:
+            last_non_pad_token = -1  # no pad token configured: use the last position
+        elif input_ids is not None:
+            # To handle both left- and right- padding, we take the rightmost token that is not equal to pad_token_id
+            non_pad_mask = (input_ids != self.config.pad_token_id).to(logits.device, torch.int32)
+            token_indices = torch.arange(input_ids.shape[-1], device=logits.device, dtype=torch.int32)
+            last_non_pad_token = (token_indices * non_pad_mask).argmax(-1)  # largest index holding a non-pad token
+        else:
+            last_non_pad_token = -1  # only embeddings were given, so padding cannot be located: use the last position
+            logger.warning_once(
+                f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
+                "unexpected if using padding tokens in conjunction with `inputs_embeds.`"
+            )
+
+        pooled_logits = logits[torch.arange(batch_size, device=logits.device), last_non_pad_token]
+
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(logits=logits, labels=labels, pooled_logits=pooled_logits, config=self.config)
+
+        return SequenceClassifierOutputWithPast(
+            loss=loss,
+            logits=pooled_logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+class GPTNeoXForTokenClassification(GPTNeoXPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+
+        self.gpt_neox = GPTNeoXModel(config)
+        self.dropout = nn.Dropout(config.classifier_dropout)
+        self.classifier = nn.Linear(config.hidden_size, config.num_labels)  # applied to every position in `forward`
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Union[Cache, tuple[tuple[torch.Tensor]]]] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,  # accepted but never read in this method
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+    ) -> TokenClassifierOutput:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the token classification loss, one label per token. Indices should be in `[0, ...,
+            config.num_labels - 1]`. When given, the loss is obtained from `self.loss_function(logits, labels,
+            self.config)`; otherwise the returned `loss` is `None`.
+        """
+
+        outputs: BaseModelOutputWithPast = self.gpt_neox(
+            input_ids,
+            past_key_values=past_key_values,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+        )
+
+        hidden_states = outputs.last_hidden_state
+        hidden_states = self.dropout(hidden_states)
+        logits = self.classifier(hidden_states)
+
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(logits, labels, self.config)
+
+        return TokenClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+@auto_docstring
+class GPTNeoXForQuestionAnswering(GPTNeoXPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels  # stored only; the head below is fixed at 2 outputs
+        self.gpt_neox = GPTNeoXModel(config)
+        self.qa_outputs = nn.Linear(config.hidden_size, 2)  # split in `forward` into start and end logits
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,  # accepted but never read in this method
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        start_positions: Optional[torch.LongTensor] = None,
+        end_positions: Optional[torch.LongTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+    ) -> QuestionAnsweringModelOutput:
+        outputs: BaseModelOutputWithPast = self.gpt_neox(
+            input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+        )
+
+        sequence_output = outputs.last_hidden_state
+
+        logits = self.qa_outputs(sequence_output)
+        start_logits, end_logits = logits.split(1, dim=-1)  # the last dim has size 2 (see `qa_outputs`)
+        start_logits = start_logits.squeeze(-1).contiguous()
+        end_logits = end_logits.squeeze(-1).contiguous()
+
+        loss = None
+        if start_positions is not None and end_positions is not None:  # the loss needs both targets
+            loss = self.loss_function(start_logits, end_logits, start_positions, end_positions)
+
+        return QuestionAnsweringModelOutput(
+            loss=loss,
+            start_logits=start_logits,
+            end_logits=end_logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+__all__ = [  # names a star-import of this module brings in
+    "GPTNeoXForCausalLM",
+    "GPTNeoXForQuestionAnswering",
+    "GPTNeoXForSequenceClassification",
+    "GPTNeoXForTokenClassification",
+    "GPTNeoXLayer",
+    "GPTNeoXModel",
+    "GPTNeoXPreTrainedModel",
+]
diff --git a/phivenv/Lib/site-packages/transformers/models/gpt_neox/tokenization_gpt_neox_fast.py b/phivenv/Lib/site-packages/transformers/models/gpt_neox/tokenization_gpt_neox_fast.py
new file mode 100644
index 0000000000000000000000000000000000000000..a3b190a60eb1202a4b7dc7c82692edf22d72000b
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/gpt_neox/tokenization_gpt_neox_fast.py
@@ -0,0 +1,224 @@
+# coding=utf-8
+# Copyright 2022 EleutherAI and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization classes for GPTNeoX."""
+
+from typing import Optional
+
+from tokenizers import processors
+
+from ...tokenization_utils_fast import PreTrainedTokenizerFast
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+# File name for each tokenizer resource; the keys match `__init__` argument names of `GPTNeoXTokenizerFast`.
+VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"}
+
+
+class GPTNeoXTokenizerFast(PreTrainedTokenizerFast):
+    """
+    Construct a "fast" GPT-NeoX-20B tokenizer (backed by HuggingFace's *tokenizers* library). Based on byte-level
+    Byte-Pair-Encoding.
+
+    This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
+    be encoded differently whether it is at the beginning of the sentence (without space) or not:
+
+    ```python
+    >>> from transformers import GPTNeoXTokenizerFast
+
+    >>> tokenizer = GPTNeoXTokenizerFast.from_pretrained("openai-community/gpt2")
+    >>> tokenizer("Hello world")["input_ids"]
+    [15496, 995]
+
+    >>> tokenizer(" Hello world")["input_ids"]
+    [18435, 995]
+    ```
+
+    You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer, but since
+    the model was not pretrained this way, it might yield a decrease in performance.
+
+    <Tip>
+
+    When used with `is_split_into_words=True`, this tokenizer needs to be instantiated with `add_prefix_space=True`.
+
+    </Tip>
+
+    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
+    refer to this superclass for more information regarding those methods.
+
+    Args:
+        vocab_file (`str`, *optional*):
+            Path to the vocabulary file.
+        merges_file (`str`, *optional*):
+            Path to the merges file.
+        errors (`str`, *optional*, defaults to `"replace"`):
+            Paradigm to follow when decoding bytes to UTF-8. See
+            [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
+        unk_token (`str`, *optional*, defaults to `<|endoftext|>`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        bos_token (`str`, *optional*, defaults to `<|endoftext|>`):
+            The beginning of sequence token.
+        eos_token (`str`, *optional*, defaults to `<|endoftext|>`):
+            The end of sequence token.
+        pad_token (`str`, *optional*):
+            Token for padding a sequence.
+        add_prefix_space (`bool`, *optional*, defaults to `False`):
+            Whether or not to add an initial space to the input. This allows to treat the leading word just as any
+            other word. (The GPTNeoX tokenizer detects the beginning of words by the preceding space).
+        add_bos_token (`bool`, *optional*, defaults to `False`):
+            Whether or not to add a `bos_token` at the start of sequences.
+        add_eos_token (`bool`, *optional*, defaults to `False`):
+            Whether or not to add an `eos_token` at the end of sequences.
+        trim_offsets (`bool`, *optional*, defaults to `True`):
+            Whether or not the post-processing step should trim offsets to avoid including whitespaces.
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    model_input_names = ["input_ids", "attention_mask"]
+
+    def __init__(
+        self,
+        vocab_file=None,
+        merges_file=None,
+        tokenizer_file=None,
+        unk_token="<|endoftext|>",
+        bos_token="<|endoftext|>",
+        eos_token="<|endoftext|>",
+        pad_token=None,
+        add_bos_token=False,
+        add_eos_token=False,
+        add_prefix_space=False,
+        **kwargs,
+    ):
+        super().__init__(
+            vocab_file=vocab_file,
+            merges_file=merges_file,
+            tokenizer_file=tokenizer_file,
+            unk_token=unk_token,
+            bos_token=bos_token,
+            eos_token=eos_token,
+            pad_token=pad_token,
+            add_bos_token=add_bos_token,
+            add_eos_token=add_eos_token,
+            add_prefix_space=add_prefix_space,
+            **kwargs,
+        )
+
+        self._add_bos_token = add_bos_token  # backing fields for the `add_bos_token` / `add_eos_token` properties
+        self._add_eos_token = add_eos_token
+        self.update_post_processor()  # build the post-processor template from the two flags
+
+    @property
+    def add_eos_token(self):
+        return self._add_eos_token
+
+    @property
+    def add_bos_token(self):
+        return self._add_bos_token
+
+    @add_eos_token.setter
+    def add_eos_token(self, value):
+        self._add_eos_token = value
+        self.update_post_processor()  # rebuild the template so the new flag takes effect
+
+    @add_bos_token.setter
+    def add_bos_token(self, value):
+        self._add_bos_token = value
+        self.update_post_processor()  # rebuild the template so the new flag takes effect
+
+    # Copied from transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast.update_post_processor
+    def update_post_processor(self):
+        """
+        Updates the underlying post processor with the current `bos_token` and `eos_token`.
+        """
+        bos = self.bos_token
+        bos_token_id = self.bos_token_id
+        if bos is None and self.add_bos_token:
+            raise ValueError("add_bos_token = True but bos_token = None")
+
+        eos = self.eos_token
+        eos_token_id = self.eos_token_id
+        if eos is None and self.add_eos_token:
+            raise ValueError("add_eos_token = True but eos_token = None")
+
+        single = f"{(bos + ':0 ') if self.add_bos_token else ''}$A:0{(' ' + eos + ':0') if self.add_eos_token else ''}"
+        pair = f"{single}{(' ' + bos + ':1') if self.add_bos_token else ''} $B:1{(' ' + eos + ':1') if self.add_eos_token else ''}"
+
+        special_tokens = []
+        if self.add_bos_token:
+            special_tokens.append((bos, bos_token_id))
+        if self.add_eos_token:
+            special_tokens.append((eos, eos_token_id))
+        self._tokenizer.post_processor = processors.TemplateProcessing(
+            single=single, pair=pair, special_tokens=special_tokens
+        )
+
+    # Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer.get_special_tokens_mask
+    def get_special_tokens_mask(
+        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False
+    ) -> list[int]:
+        """
+        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
+        special tokens using the tokenizer `prepare_for_model` method.
+
+        Args:
+            token_ids_0 (`list[int]`):
+                List of IDs.
+            token_ids_1 (`list[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
+                Whether or not the token list is already formatted with special tokens for the model.
+
+        Returns:
+            `list[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+        """
+        if already_has_special_tokens:
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+            )
+
+        bos_token_id = [1] if self.add_bos_token else []
+        eos_token_id = [1] if self.add_eos_token else []
+
+        if token_ids_1 is None:
+            return bos_token_id + ([0] * len(token_ids_0)) + eos_token_id
+        return (
+            bos_token_id
+            + ([0] * len(token_ids_0))
+            + eos_token_id
+            + bos_token_id
+            + ([0] * len(token_ids_1))
+            + eos_token_id
+        )
+
+    # Copied from transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast.build_inputs_with_special_tokens
+    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
+        bos_token_id = [self.bos_token_id] if self.add_bos_token else []
+        eos_token_id = [self.eos_token_id] if self.add_eos_token else []
+
+        output = bos_token_id + token_ids_0 + eos_token_id
+
+        if token_ids_1 is not None:
+            output = output + bos_token_id + token_ids_1 + eos_token_id
+
+        return output
+
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
+        files = self._tokenizer.model.save(save_directory, name=filename_prefix)  # delegate to the backend model
+        return tuple(files)
+
+
+__all__ = ["GPTNeoXTokenizerFast"]  # the only name a star-import of this module brings in
diff --git a/phivenv/Lib/site-packages/transformers/models/gpt_neox_japanese/__init__.py b/phivenv/Lib/site-packages/transformers/models/gpt_neox_japanese/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..94ba39d69ad638c706f6ac8491e2dea80e269929
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/gpt_neox_japanese/__init__.py
@@ -0,0 +1,28 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:  # static type checkers resolve the real star-imports below
+    from .configuration_gpt_neox_japanese import *
+    from .modeling_gpt_neox_japanese import *
+    from .tokenization_gpt_neox_japanese import *
+else:  # otherwise this module's `sys.modules` entry is replaced by a `_LazyModule`
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/phivenv/Lib/site-packages/transformers/models/gpt_neox_japanese/__pycache__/__init__.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/gpt_neox_japanese/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..60daae758dcd2e0293b77fd0756adf62661f694f
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/gpt_neox_japanese/__pycache__/__init__.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/gpt_neox_japanese/__pycache__/configuration_gpt_neox_japanese.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/gpt_neox_japanese/__pycache__/configuration_gpt_neox_japanese.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ad1cca4a29195c0ee2a287285d7998575656d771
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/gpt_neox_japanese/__pycache__/configuration_gpt_neox_japanese.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/gpt_neox_japanese/__pycache__/modeling_gpt_neox_japanese.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/gpt_neox_japanese/__pycache__/modeling_gpt_neox_japanese.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a7cc86ac138ba89b62068d164201028a7a47617d
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/gpt_neox_japanese/__pycache__/modeling_gpt_neox_japanese.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/gpt_neox_japanese/__pycache__/tokenization_gpt_neox_japanese.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/gpt_neox_japanese/__pycache__/tokenization_gpt_neox_japanese.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5db7d7af61115f0ff3db4b0d16ec4fb28e7faefe
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/gpt_neox_japanese/__pycache__/tokenization_gpt_neox_japanese.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py b/phivenv/Lib/site-packages/transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py
new file mode 100644
index 0000000000000000000000000000000000000000..320157334539b1d7c418c8cf97c8b57dc38629f7
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py
@@ -0,0 +1,167 @@
+# coding=utf-8
+# Copyright 2022 ABEJA, Inc. and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""GPTNeoX Japanese model configuration"""
+
+from ...configuration_utils import PretrainedConfig
+from ...modeling_rope_utils import rope_config_validation
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class GPTNeoXJapaneseConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`GPTNeoXModelJapanese`]. It is used to instantiate
+    a GPTNeoX model according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of the GPTNeoXJapanese
+    [abeja/gpt-neox-japanese-2.7b](https://huggingface.co/abeja/gpt-neox-japanese-2.7b) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information. Default configs is set as 2.7B model
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 32000):
+            Vocabulary size of the GPTNeoXJapanese model. Defines the number of different tokens that can be
+            represented by the `inputs_ids` passed when calling [`GPTNeoXJapanese`].
+        hidden_size (`int`, *optional*, defaults to 2560):
+            Dimension of the encoder layers and the pooler layer.
+        num_hidden_layers (`int`, *optional*, defaults to 32):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 32):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        intermediate_multiple_size (`int`, *optional*, defaults to 4):
+            Dimension of the "intermediate" layer in the Transformer encoder is calculated by hidden_size *
+            intermediate_multiple_size.
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler.
+        rotary_pct (`float`, *optional*, defaults to 1.00):
+            percentage of hidden dimensions to allocate to rotary embeddings
+        rotary_emb_base (`int`, *optional*, defaults to 10000)
+            base for computing rotary embeddings frequency
+        max_position_embeddings (`int`, *optional*, defaults to 2048):
+            The maximum sequence length that this model might ever be used with.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-5):
+            The epsilon used by the layer normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        rope_scaling (`Dict`, *optional*):
+            Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
+            and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
+            accordingly.
+            Expected contents:
+                `rope_type` (`str`):
+                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
+                    'llama3'], with 'default' being the original RoPE implementation.
+                `factor` (`float`, *optional*):
+                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
+                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
+                    original maximum pre-trained length.
+                `original_max_position_embeddings` (`int`, *optional*):
+                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
+                    pretraining.
+                `attention_factor` (`float`, *optional*):
+                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
+                    computation. If unspecified, it defaults to value recommended by the implementation, using the
+                    `factor` field to infer the suggested value.
+                `beta_fast` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 32.
+                `beta_slow` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 1.
+                `short_factor` (`list[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `long_factor` (`list[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to long contexts (<
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `low_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
+                `high_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
+        attention_dropout (`float`, *optional*, defaults to 0.1):
+            The dropout ratio for the attention.
+        hidden_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the hidden layer.
+        Example:
+
+    ```python
+    >>> from transformers import GPTNeoXJapaneseConfig, GPTNeoXJapaneseModel
+
+    >>> # Initializing a GPTNeoXJapanese gpt-neox-japanese-2.7b style configuration
+    >>> configuration = GPTNeoXJapaneseConfig()
+
+    >>> # Initializing a model (with random weights) from the gpt-neox-japanese-2.7b style configuration
+    >>> model = GPTNeoXJapaneseModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "gpt_neox_japanese"
+
+    def __init__(
+        self,
+        vocab_size=32000,
+        hidden_size=2560,
+        num_hidden_layers=32,
+        num_attention_heads=32,
+        intermediate_multiple_size=4,
+        hidden_act="gelu",
+        rotary_pct=1.00,
+        rotary_emb_base=10000,
+        max_position_embeddings=2048,
+        initializer_range=0.02,
+        layer_norm_eps=1e-5,
+        use_cache=True,
+        bos_token_id=31996,
+        eos_token_id=31999,
+        rope_scaling=None,
+        attention_dropout=0.1,
+        hidden_dropout=0.0,
+        **kwargs,
+    ):
+        super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.intermediate_multiple_size = intermediate_multiple_size
+        self.hidden_act = hidden_act
+        self.rotary_pct = rotary_pct
+        self.partial_rotary_factor = rotary_pct
+        self.rotary_emb_base = rotary_emb_base
+        self.rope_theta = rotary_emb_base
+        self.initializer_range = initializer_range
+        self.layer_norm_eps = layer_norm_eps
+        self.use_cache = use_cache
+        self.rope_scaling = rope_scaling
+        self.attention_dropout = attention_dropout
+        self.hidden_dropout = hidden_dropout
+        # Validate the correctness of rotary position embeddings parameters
+        # BC: if there is a 'type' field, move it to 'rope_type'.
+        if self.rope_scaling is not None and "type" in self.rope_scaling:
+            self.rope_scaling["rope_type"] = self.rope_scaling["type"]
+        rope_config_validation(self)
+
+
+__all__ = ["GPTNeoXJapaneseConfig"]
diff --git a/phivenv/Lib/site-packages/transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py b/phivenv/Lib/site-packages/transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py
new file mode 100644
index 0000000000000000000000000000000000000000..108a68ef70d1d0dfe4de0bdcd95d8807d4f6f27d
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py
@@ -0,0 +1,760 @@
+# coding=utf-8
+# Copyright 2022 ABEJA, Inc. and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch GPTNeoX model."""
+
+import math
+from typing import Optional, Union
+
+import torch
+import torch.utils.checkpoint
+from torch import Tensor, nn
+
+from ...activations import ACT2FN
+from ...cache_utils import Cache, DynamicCache
+from ...generation import GenerationMixin
+from ...modeling_attn_mask_utils import AttentionMaskConverter
+from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
+from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
+from ...modeling_utils import PreTrainedModel
+from ...utils import auto_docstring, is_torch_flex_attn_available, logging
+from .configuration_gpt_neox_japanese import GPTNeoXJapaneseConfig
+
+
+if is_torch_flex_attn_available():
+    from torch.nn.attention.flex_attention import BlockMask
+
+    from ...integrations.flex_attention import make_flex_block_causal_mask
+
+
+logger = logging.get_logger(__name__)
+
+
+@auto_docstring
+class GPTNeoXJapanesePreTrainedModel(PreTrainedModel):
+    config: GPTNeoXJapaneseConfig
+    base_model_prefix = "gpt_neox_japanese"
+    _no_split_modules = ["GPTNeoXJapaneseLayer"]
+    _skip_keys_device_placement = "past_key_values"
+
+    _can_compile_fullgraph = True
+
+    def _init_weights(self, module):
+        """Initialize the weights"""
+        if isinstance(module, nn.Linear):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+        elif isinstance(module, GPTNeoXJapaneseAttention):
+            if module.dense_bias is not None:
+                module.dense_bias.data.zero_()
+
+
+class GPTNeoXJapaneseAttention(nn.Module):
+    def __init__(self, config, use_bias=False, layer_idx=None):
+        super().__init__()
+        self.num_attention_heads = config.num_attention_heads
+        self.hidden_size = config.hidden_size
+        self.head_size = self.hidden_size // self.num_attention_heads
+        if layer_idx is None:
+            logger.warning_once(
+                f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
+                "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
+                "when creating this class."
+            )
+
+        self.layer_idx = layer_idx
+        self.rotary_ndims = int(self.head_size * config.rotary_pct)
+        self.rope_theta = config.rotary_emb_base
+        self.rotary_emb = GPTNeoXJapaneseRotaryEmbedding(config=config)
+        self.attention_dropout = nn.Dropout(config.attention_dropout)
+        self.norm_factor = math.sqrt(self.head_size)
+
+        self.query_key_value = nn.Linear(config.hidden_size, 3 * config.hidden_size, bias=False)
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size, bias=False)
+        # Activate bias if the last layer
+        self.use_bias = use_bias
+        self.dense_bias = nn.Parameter(torch.zeros(config.hidden_size)) if use_bias else None
+
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        attention_mask: torch.FloatTensor,
+        position_ids: torch.LongTensor,
+        head_mask: Optional[torch.FloatTensor] = None,
+        layer_past: Optional[Cache] = None,
+        use_cache: Optional[bool] = False,
+        output_attentions: Optional[bool] = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
+    ):
+        # Compute QKV
+        # Attention heads [batch, seq_len, hidden_size]
+        #   --> [batch, seq_len, (np * 3 * head_size)]
+        qkv = self.query_key_value(hidden_states)
+
+        # [batch, seq_len, (num_heads * 3 * head_size)]
+        #   --> [batch, seq_len, num_heads, 3 * head_size]
+        new_qkv_shape = qkv.size()[:-1] + (self.num_attention_heads, 3 * self.head_size)
+        qkv = qkv.view(*new_qkv_shape)
+
+        # [batch, seq_len, num_attention_heads, 3 * head_size] --> 3 [batch, num_attention_heads, seq_len, head_size]
+        query = qkv[..., : self.head_size].permute(0, 2, 1, 3)
+        key = qkv[..., self.head_size : 2 * self.head_size].permute(0, 2, 1, 3)
+        value = qkv[..., 2 * self.head_size :].permute(0, 2, 1, 3)
+
+        # Compute rotary embeddings on rotary_ndims
+        query_rot = query[..., : self.rotary_ndims]
+        query_pass = query[..., self.rotary_ndims :]
+        key_rot = key[..., : self.rotary_ndims]
+        key_pass = key[..., self.rotary_ndims :]
+
+        cos, sin = position_embeddings
+        query, key = apply_rotary_pos_emb(query_rot, key_rot, cos, sin)
+        query = torch.cat((query, query_pass), dim=-1).contiguous()
+        key = torch.cat((key, key_pass), dim=-1).contiguous()
+
+        # Cache QKV values
+        if layer_past is not None:
+            cache_kwargs = {
+                "sin": sin,
+                "cos": cos,
+                "partial_rotation_size": self.rotary_ndims,
+                "cache_position": cache_position,
+            }
+            key, value = layer_past.update(key, value, self.layer_idx, cache_kwargs)
+
+        # Compute attention
+        attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask)
+
+        # Reshape outputs
+        attn_output = self._merge_heads(attn_output, self.num_attention_heads, self.head_size)
+        attn_output = self.dense(attn_output)
+
+        return attn_output, attn_weights, self.dense_bias
+
+    @classmethod
+    def _split_heads(cls, tensor, num_attention_heads, attn_head_size):
+        """
+        Splits hidden dim into attn_head_size and num_attention_heads
+        """
+        # tensor: [bs, seq_len, hidden_size]
+        new_shape = tensor.size()[:-1] + (num_attention_heads, attn_head_size)
+        # -> [bs, seq_len, num_attention_heads, attn_head_size]
+        tensor = tensor.view(new_shape)
+        # -> [bs, num_attention_heads, seq_len, attn_head_size]
+        tensor = tensor.permute(0, 2, 1, 3)
+        return tensor
+
+    @classmethod
+    def _merge_heads(cls, tensor, num_attention_heads, attn_head_size):
+        """
+        Merges attn_head_size dim and num_attn_heads dim into hidden dim
+        """
+        # tensor [bs, num_attention_heads, seq_len, attn_head_size]
+        tensor = tensor.permute(0, 2, 1, 3).contiguous()
+        # -> [bs, seq_len, num_attention_heads, attn_head_size]
+        tensor = tensor.view(tensor.size(0), tensor.size(1), num_attention_heads * attn_head_size)
+        # -> [bs, seq_len, hidden_size]
+        return tensor
+
+    def _attn(self, query, key, value, attention_mask=None, head_mask=None):
+        # q, k, v: [bs, num_attention_heads, seq_len, attn_head_size]
+        # compute causal mask from causal mask buffer
+        batch_size, num_attention_heads, query_length, attn_head_size = query.size()
+        key_length = key.size(-2)
+
+        query = query.view(batch_size * num_attention_heads, query_length, attn_head_size)
+        key = key.view(batch_size * num_attention_heads, key_length, attn_head_size)
+
+        # [batch_size * num_heads, q_length, kv_length]
+        attn_scores = torch.zeros(
+            batch_size * num_attention_heads,
+            query_length,
+            key_length,
+            dtype=query.dtype,
+            device=key.device,
+        )
+        attention_scores = torch.baddbmm(
+            attn_scores,
+            query,
+            key.transpose(1, 2),
+            beta=1.0,
+            alpha=1.0 / self.norm_factor,
+        )
+
+        attention_scores = attention_scores.view(batch_size, num_attention_heads, query_length, -1)
+        if attention_mask is not None:  # no matter the length, we just slice it
+            causal_mask = attention_mask[:, :, :, : key.shape[-2]]
+            attention_scores = attention_scores + causal_mask
+
+        attn_weights = nn.functional.softmax(attention_scores, dim=-1)
+        attn_weights = self.attention_dropout(attn_weights)
+        attn_weights = attn_weights.to(value.dtype)
+
+        # Mask heads if we want to
+        if head_mask is not None:
+            attn_weights = attn_weights * head_mask
+
+        attn_output = torch.matmul(attn_weights, value)
+        return attn_output, attn_weights
+
+
+# Copied from transformers.models.gpt_neox.modeling_gpt_neox.GPTNeoXRotaryEmbedding with GPTNeoX->GPTNeoXJapanese
+class GPTNeoXJapaneseRotaryEmbedding(nn.Module):
+    inv_freq: torch.Tensor  # fix linting for `register_buffer`
+
+    def __init__(self, config: GPTNeoXJapaneseConfig, device=None):
+        super().__init__()
+        # BC: "rope_type" was originally "type"
+        if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict):
+            self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
+        else:
+            self.rope_type = "default"
+        self.max_seq_len_cached = config.max_position_embeddings
+        self.original_max_seq_len = config.max_position_embeddings
+
+        self.config = config
+        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
+
+        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+        self.original_inv_freq = self.inv_freq
+
+    @torch.no_grad()
+    @dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
+    def forward(self, x, position_ids):
+        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
+        position_ids_expanded = position_ids[:, None, :].float()
+
+        device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
+        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
+            emb = torch.cat((freqs, freqs), dim=-1)
+            cos = emb.cos() * self.attention_scaling
+            sin = emb.sin() * self.attention_scaling
+
+        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
+
+
+def rotate_half(x):
+    """Rotates half the hidden dims of the input."""
+    x1 = x[..., : x.shape[-1] // 2]
+    x2 = x[..., x.shape[-1] // 2 :]
+    return torch.cat((-x2, x1), dim=-1)
+
+
+# Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
+    """Applies Rotary Position Embedding to the query and key tensors.
+
+    Args:
+        q (`torch.Tensor`): The query tensor.
+        k (`torch.Tensor`): The key tensor.
+        cos (`torch.Tensor`): The cosine part of the rotary embedding.
+        sin (`torch.Tensor`): The sine part of the rotary embedding.
+        position_ids (`torch.Tensor`, *optional*):
+            Deprecated and unused.
+        unsqueeze_dim (`int`, *optional*, defaults to 1):
+            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
+            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
+            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
+            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
+            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
+            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
+    Returns:
+        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
+    """
+    cos = cos.unsqueeze(unsqueeze_dim)
+    sin = sin.unsqueeze(unsqueeze_dim)
+    q_embed = (q * cos) + (rotate_half(q) * sin)
+    k_embed = (k * cos) + (rotate_half(k) * sin)
+    return q_embed, k_embed
+
+
+def bias_dropout_add(x: Tensor, bias: Tensor, residual: Optional[Tensor], prob: float, training: bool) -> Tensor:
+    """add bias to x, apply dropout and residual connection
+
+    Args:
+        x (Tensor): main path of output
+        bias (Tensor): None or attn_bias of the last attention layer
+        residual (Optional[Tensor]): residual value
+        prob (float): dropout probability
+        training (bool): whether in training mode or not
+
+    Returns:
+        Tensor: dropout(x + bias) + residual
+    """
+    if bias is not None:
+        x = x + bias
+    out = torch.nn.functional.dropout(x, p=prob, training=training)
+    if residual is not None:
+        out = residual + out
+    return out
+
+
+class GPTNeoXJapaneseMLP(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        intermediate_size = int(config.hidden_size * config.intermediate_multiple_size)
+        self.dense_h_to_4h = nn.Linear(config.hidden_size, intermediate_size, bias=False)
+        # Project back to h.
+        self.dense_4h_to_h = nn.Linear(intermediate_size, config.hidden_size, bias=False)
+        self.act = ACT2FN[config.hidden_act]
+
+    def forward(self, hidden_states):
+        intermediate = self.dense_h_to_4h(hidden_states)
+        intermediate = self.act(intermediate)
+        output = self.dense_4h_to_h(intermediate)
+        return output
+
+
+class GPTNeoXJapaneseLayer(nn.Module):
+    def __init__(self, config, layer_number):
+        super().__init__()
+        self.layer_number = layer_number
+        self.input_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.post_attention_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        # activate bias only last layer
+        self.attention = GPTNeoXJapaneseAttention(
+            config=config, use_bias=layer_number == config.num_hidden_layers - 1, layer_idx=layer_number
+        )
+        self.mlp = GPTNeoXJapaneseMLP(config)
+        self.hidden_dropout = config.hidden_dropout
+
+    def forward(
+        self,
+        hidden_states: Optional[torch.FloatTensor],
+        attention_mask: Optional[torch.FloatTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = False,
+        layer_past: Optional[Cache] = None,
+        output_attentions: Optional[bool] = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
+    ):
+        residual = hidden_states
+        ln_out = self.input_layernorm(hidden_states)
+        attn_output, attn_weights, attn_bias = self.attention(
+            ln_out,
+            attention_mask=attention_mask,
+            layer_past=layer_past,
+            head_mask=head_mask,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            position_ids=position_ids,
+            cache_position=cache_position,
+            position_embeddings=position_embeddings,
+        )
+
+        # attn_output = (atten_output + bias) + residual
+        attn_output = bias_dropout_add(
+            attn_output,
+            bias=attn_bias.expand_as(residual) if attn_bias is not None else attn_bias,
+            residual=residual,
+            prob=self.hidden_dropout,
+            training=self.training,
+        )
+        mlp_output = self.mlp(self.post_attention_layernorm(attn_output))
+
+        # attn_output = (mlp_output + mlp_bias) + atten_output
+        attn_output = bias_dropout_add(
+            mlp_output, bias=None, residual=attn_output, prob=self.hidden_dropout, training=self.training
+        )
+
+        return attn_output, attn_weights
+
+
+@auto_docstring
+class GPTNeoXJapaneseModel(GPTNeoXJapanesePreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.config = config
+
+        self.embed_in = nn.Embedding(config.vocab_size, config.hidden_size)
+        self.layers = nn.ModuleList(
+            [GPTNeoXJapaneseLayer(config=config, layer_number=i) for i in range(config.num_hidden_layers)]
+        )
+        self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.rotary_emb = GPTNeoXJapaneseRotaryEmbedding(config=config)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.embed_in
+
+    def set_input_embeddings(self, value):
+        self.embed_in = value
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        past_key_values: Optional[Union[Cache, tuple[tuple[torch.FloatTensor]]]] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+    ) -> Union[tuple, BaseModelOutputWithPast]:
+        r"""
+        Example:
+
+        ```python
+        >>> from transformers import AutoTokenizer, GPTNeoXJapaneseModel
+        >>> import torch
+
+        >>> tokenizer = AutoTokenizer.from_pretrained("abeja/gpt-neox-japanese-2.7b")
+        >>> model = GPTNeoXJapaneseModel.from_pretrained("abeja/gpt-neox-japanese-2.7b")
+
+        >>> inputs = tokenizer("日本語のGPT-neoxがHugging Faceで使えます😀", return_tensors="pt")
+        >>> outputs = model(**inputs)
+
+        >>> last_hidden_states = outputs.last_hidden_state
+        ```
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_in(input_ids)
+
+        # TODO (joao): remove this exception in v4.56 -- it exists for users that try to pass a legacy cache
+        if not isinstance(past_key_values, (type(None), Cache)):
+            raise ValueError("The `past_key_values` should be either a `Cache` object or `None`.")
+
+        if use_cache and past_key_values is None:
+            past_key_values = DynamicCache(config=self.config)
+
+        seq_length = inputs_embeds.shape[1]
+        if cache_position is None:
+            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+            cache_position = torch.arange(past_seen_tokens, past_seen_tokens + seq_length, device=inputs_embeds.device)
+
+        if position_ids is None:
+            position_ids = cache_position.unsqueeze(0)
+
+        causal_mask = self._update_causal_mask(
+            attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
+        )
+
+        # Prepare head mask if needed
+        # 1.0 in head_mask indicate we keep the head
+        # attention_probs has shape bsz x n_heads x N x N
+        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
+        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
+        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
+        hidden_states = inputs_embeds
+
+        # create position embeddings to be shared across the decoder layers
+        position_embeddings = self.rotary_emb(hidden_states, position_ids)
+
+        all_attentions = () if output_attentions else None
+        all_hidden_states = () if output_hidden_states else None
+        for i, layer in enumerate(self.layers):
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)
+
+            outputs = layer(
+                hidden_states,
+                attention_mask=causal_mask,
+                position_ids=position_ids,
+                head_mask=head_mask[i],
+                layer_past=past_key_values,
+                use_cache=use_cache,
+                output_attentions=output_attentions,
+                cache_position=cache_position,
+                position_embeddings=position_embeddings,
+            )
+            hidden_states = outputs[0]
+            if output_attentions:
+                all_attentions = all_attentions + (outputs[1],)
+
+        hidden_states = self.final_layer_norm(hidden_states)
+        # Add last hidden state
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+
+        if not return_dict:
+            return tuple(
+                v for v in [hidden_states, past_key_values, all_hidden_states, all_attentions] if v is not None
+            )
+
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values,
+            hidden_states=all_hidden_states,
+            attentions=all_attentions,
+        )
+
+    # Copied from transformers.models.gptj.modeling_gptj.GPTJModel._update_causal_mask
+    def _update_causal_mask(
+        self,
+        attention_mask: Union[torch.Tensor, "BlockMask"],
+        input_tensor: torch.Tensor,
+        cache_position: torch.Tensor,
+        past_key_values: Cache,
+        output_attentions: bool = False,
+    ):
+        """
+        Select or build the attention mask passed to the decoder layers, depending on the configured attention
+        implementation (`self.config._attn_implementation`).
+
+        Args:
+            attention_mask (`torch.Tensor` or `BlockMask`, may be `None`):
+                Mask supplied by the caller.
+            input_tensor (`torch.Tensor`):
+                Tensor whose dtype, `shape[0]` (batch size) and `shape[1]` (sequence length) drive the mask creation.
+            cache_position (`torch.Tensor`):
+                Positions of the current tokens, forwarded to the 4D mask builder.
+            past_key_values (`Cache`, may be `None`):
+                Cache queried for the number of tokens already seen and for its maximum shape.
+            output_attentions (`bool`, defaults to `False`):
+                When `True`, the SDPA-specific shortcuts below are skipped.
+
+        Returns:
+            `None`, the caller's mask unchanged, a flex-attention block mask, or a 4D causal mask, depending on the
+            branch taken.
+        """
+        # Flash Attention 2: only hand the mask through when it actually masks something (contains a 0).
+        if self.config._attn_implementation == "flash_attention_2":
+            if attention_mask is not None and (attention_mask == 0.0).any():
+                return attention_mask
+            return None
+        # Flex attention: tensor masks are converted to a block mask; anything else is returned untouched.
+        if self.config._attn_implementation == "flex_attention":
+            if isinstance(attention_mask, torch.Tensor):
+                attention_mask = make_flex_block_causal_mask(attention_mask)
+            return attention_mask
+
+        # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
+        # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
+        # to infer the attention mask.
+        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+        using_compilable_cache = past_key_values.is_compileable if past_key_values is not None else False
+
+        # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
+        if self.config._attn_implementation == "sdpa" and not using_compilable_cache and not output_attentions:
+            if AttentionMaskConverter._ignore_causal_mask_sdpa(
+                attention_mask,
+                inputs_embeds=input_tensor,
+                past_key_values_length=past_seen_tokens,
+                is_training=self.training,
+            ):
+                return None
+
+        dtype = input_tensor.dtype
+        sequence_length = input_tensor.shape[1]
+        # Key/value length of the mask: the cache's maximum shape for compileable caches, otherwise the width of the
+        # provided tensor mask, otherwise the number of seen tokens plus the current tokens plus one.
+        if using_compilable_cache:
+            target_length = past_key_values.get_max_cache_shape()
+        else:
+            target_length = (
+                attention_mask.shape[-1]
+                if isinstance(attention_mask, torch.Tensor)
+                else past_seen_tokens + sequence_length + 1
+            )
+
+        # In case the provided `attention` mask is 2D, we generate a causal mask here (4D).
+        causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
+            attention_mask,
+            sequence_length=sequence_length,
+            target_length=target_length,
+            dtype=dtype,
+            cache_position=cache_position,
+            batch_size=input_tensor.shape[0],
+        )
+
+        if (
+            self.config._attn_implementation == "sdpa"
+            and attention_mask is not None
+            and attention_mask.device.type in ["cuda", "xpu", "npu"]
+            and not output_attentions
+        ):
+            # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
+            # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
+            # Details: https://github.com/pytorch/pytorch/issues/110213
+            min_dtype = torch.finfo(dtype).min
+            causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)
+
+        return causal_mask
+
+    @staticmethod
+    # Copied from transformers.models.gptj.modeling_gptj.GPTJModel._prepare_4d_causal_attention_mask_with_cache_position
+    def _prepare_4d_causal_attention_mask_with_cache_position(
+        attention_mask: Optional[torch.Tensor],
+        sequence_length: int,
+        target_length: int,
+        dtype: torch.dtype,
+        cache_position: torch.Tensor,
+        batch_size: int,
+        **kwargs,
+    ):
+        """
+        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
+        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.
+
+        Args:
+            attention_mask (`torch.Tensor`, *optional*):
+                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
+                `(batch_size, 1, query_length, key_value_length)`. When `None`, only the causal structure is applied.
+            sequence_length (`int`):
+                The sequence length being processed.
+            target_length (`int`):
+                The target length: when generating with static cache, the mask should be as long as the static cache,
+                to account for the 0 padding, the part of the cache that is not filled yet.
+            dtype (`torch.dtype`):
+                The dtype to use for the 4D attention mask.
+            cache_position (`torch.Tensor`):
+                Indices depicting the position of the input sequence tokens in the sequence.
+            batch_size (`int`):
+                Batch size.
+            **kwargs:
+                Accepted and ignored by this implementation.
+
+        Returns:
+            `torch.Tensor`: the 4D mask. Positions that must not be attended hold `torch.finfo(dtype).min`, the
+            others hold 0. A 4D `attention_mask` is returned as-is.
+        """
+        if attention_mask is not None and attention_mask.dim() == 4:
+            # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
+            causal_mask = attention_mask
+        else:
+            min_dtype = torch.finfo(dtype).min
+            # Start fully masked, then carve out the allowed positions.
+            causal_mask = torch.full(
+                (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device
+            )
+            if sequence_length != 1:
+                causal_mask = torch.triu(causal_mask, diagonal=1)
+            # Keep `min_dtype` only where the key index lies strictly after the query's cache position; the boolean
+            # factor zeroes every other entry.
+            causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1)
+            causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
+            if attention_mask is not None:
+                causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
+                mask_length = attention_mask.shape[-1]
+                # The sum is 0 exactly where the causal mask allows attention (0) and the 2D mask marks the key as
+                # padding (0); those positions are masked out as well.
+                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
+                    causal_mask.device
+                )
+                padding_mask = padding_mask == 0
+                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
+                    padding_mask, min_dtype
+                )
+
+        return causal_mask
+
+
+@auto_docstring(
+    custom_intro="""
+    GPTNeoXJapanese Model with a `language modeling` head on top for Classifier Model fine-tuning.
+    """
+)
+class GPTNeoXJapaneseForCausalLM(GPTNeoXJapanesePreTrainedModel, GenerationMixin):
+    _tied_weights_keys = ["embed_out.weight"]
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.config = config
+
+        self.gpt_neox_japanese = GPTNeoXJapaneseModel(config)
+        self.embed_out = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_output_embeddings(self):
+        return self.embed_out
+
+    def set_output_embeddings(self, new_embeddings):
+        self.embed_out = new_embeddings
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        past_key_values: Optional[Union[Cache, tuple[tuple[torch.FloatTensor]]]] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
+    ) -> Union[tuple, CausalLMOutputWithPast]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
+            `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
+            ignored (masked), the loss is only computed for the tokens with labels n `[0, ..., config.vocab_size]`.
+
+        Example:
+
+        ```python
+        >>> from transformers import AutoTokenizer, GPTNeoXJapaneseForCausalLM, GPTNeoXJapaneseConfig
+        >>> import torch
+
+        >>> tokenizer = AutoTokenizer.from_pretrained("abeja/gpt-neox-japanese-2.7b")
+        >>> config = GPTNeoXJapaneseConfig.from_pretrained("abeja/gpt-neox-japanese-2.7b")
+        >>> config.is_decoder = True
+        >>> model = GPTNeoXJapaneseForCausalLM.from_pretrained("abeja/gpt-neox-japanese-2.7b", config=config)
+
+        >>> inputs = tokenizer("日本語のGPT-neoxがHugging Faceで使えます😀", return_tensors="pt")
+        >>> outputs = model(**inputs)
+
+        >>> prediction_logits = outputs.logits
+        ```
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.gpt_neox_japanese(
+            input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            cache_position=cache_position,
+        )
+
+        hidden_states = outputs[0]
+        lm_logits = self.embed_out(hidden_states)
+
+        lm_loss = None
+        if labels is not None:
+            # move labels to correct device to enable model parallelism
+            labels = labels.to(lm_logits.device)
+
+            lm_loss = self.loss_function(
+                lm_logits,
+                labels,
+                vocab_size=self.config.vocab_size,
+                **kwargs,
+            )
+
+        if not return_dict:
+            output = (lm_logits,) + outputs[1:]
+            return ((lm_loss,) + output) if lm_loss is not None else output
+
+        return CausalLMOutputWithPast(
+            loss=lm_loss,
+            logits=lm_logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+# Names exported by `from ... import *` for this module.
+__all__ = [
+    "GPTNeoXJapaneseForCausalLM",
+    "GPTNeoXJapaneseLayer",
+    "GPTNeoXJapaneseModel",
+    "GPTNeoXJapanesePreTrainedModel",
+]
diff --git a/phivenv/Lib/site-packages/transformers/models/gpt_neox_japanese/tokenization_gpt_neox_japanese.py b/phivenv/Lib/site-packages/transformers/models/gpt_neox_japanese/tokenization_gpt_neox_japanese.py
new file mode 100644
index 0000000000000000000000000000000000000000..891f77ece304f8749fcd42d33661fa04bab6ebaa
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/gpt_neox_japanese/tokenization_gpt_neox_japanese.py
@@ -0,0 +1,369 @@
+# coding=utf-8
+# Copyright 2022 ABEJA, Inc. and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization classes for GPTNeoXJapanese."""
+
+import collections
+import json
+import os
+import re
+import sys
+from typing import Optional
+
+import numpy as np
+
+from ...tokenization_utils_fast import PreTrainedTokenizer
+from ...utils import logging
+
+
+# Module-level logger, used by `save_vocabulary` to warn about non-consecutive vocabulary indices.
+logger = logging.get_logger(__name__)
+
+# Default file names of the two resources the tokenizer loads and saves: the vocabulary and the emoji mapping.
+VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "emoji_file": "emoji.json"}
+
+
+def load_vocab_and_emoji(vocab_file, emoji_file):
+    """Loads a vocabulary file and emoji file into a dictionary."""
+    with open(emoji_file, "r", encoding="utf-8") as f:
+        emoji = json.loads(f.read())
+
+    vocab = collections.OrderedDict()
+    raw_vocab = collections.OrderedDict()
+    ids_to_tokens = collections.OrderedDict()
+    with open(vocab_file, "r", encoding="utf-8") as f:
+        token = f.readlines()
+    token = [[t.rstrip("\n")] if (t == "," or "," not in t) else t.rstrip("\n").split(",") for t in token]
+    for idx, b in enumerate(token):
+        ids_to_tokens[idx] = b
+        raw_vocab[",".join(b)] = idx
+        for wd in b:
+            vocab[wd] = idx
+
+    return vocab, raw_vocab, ids_to_tokens, emoji
+
+
+class GPTNeoXJapaneseTokenizer(PreTrainedTokenizer):
+    """
+    This tokenizer inherits from [`PreTrainedTokenizer`] and is based on Japanese special Sub-Word-Encoding that is
+    used in this repository (https://github.com/tanreinama/Japanese-BPEEncoder_V2). Check the repository for details.
+    Japanese has a relatively large vocabulary and there is no separation between words. Furthermore, the language is a
+    combination of hiragana, katakana, and kanji, and variants such as "1" and "①" are often used. In order to cope
+    with these, this tokenizer has the following features
+    - Subword-by-subword segmentation, which is intermediate between byte strings and morphological analysis.
+    - BPEs are created for each Kanji, Hiragana, and Katakana character, and there are no BPEs that cross character
+        types, such as Kanji + Hiragana or Hiragana + Katakana.
+    - All-byte encoding that does not require <unk>.
+    - Independent of UTF codes such as 2-byte and 3-byte characters
+    - Conversion of heterographs to the same token_id
+    - Emoji and Emoticon are grouped into 12 types as special tags.
+
+    Example:
+
+    ```python
+    >>> from transformers import GPTNeoXJapaneseTokenizer
+
+    >>> tokenizer = GPTNeoXJapaneseTokenizer.from_pretrained("abeja/gpt-neox-japanese-2.7b")
+    >>> # You can confirm both 慶応 and 慶應 are encoded to 17749
+    >>> tokenizer("吾輩は猫である🐯。実は慶応(慶應)大学出身")["input_ids"]
+    [30014, 26883, 26638, 27228, 25, 26650, 31732, 31679, 27809, 26638, 17749, 31592, 17749, 31593, 321, 1281]
+
+    >>> # Both 慶応 and 慶應 are decoded to 慶応
+    >>> tokenizer.decode(tokenizer("吾輩は猫である🐯。実は慶応(慶應)大学出身")["input_ids"])
+    '吾輩は猫である🐯。実は慶応(慶応)大学出身'
+    ```
+
+    Args:
+        vocab_file (`str`):
+            File containing the vocabulary.
+        emoji_file (`str`):
+            File containing the emoji.
+        unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        pad_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
+            The token used for padding
+        bos_token (`str`, *optional*, defaults to `"<|startoftext|>"`):
+            The beginning of sequence token.
+        eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
+            The end of sequence token.
+        do_clean_text (`bool`, *optional*, defaults to `False`):
+            Whether or not to clean text for URL, EMAIL, TEL, Japanese DATE and Japanese PRICE.
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    model_input_names = ["input_ids", "attention_mask"]
+
+    def __init__(
+        self,
+        vocab_file,
+        emoji_file,
+        unk_token="<|endoftext|>",
+        pad_token="<|endoftext|>",
+        bos_token="<|startoftext|>",
+        eos_token="<|endoftext|>",
+        do_clean_text=False,
+        **kwargs,
+    ):
+        if not os.path.isfile(vocab_file):
+            raise ValueError(
+                f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
+                " model use `tokenizer = GPTNeoXJapaneseokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
+            )
+        if not os.path.isfile(emoji_file):
+            raise ValueError(
+                f"Can't find a emoji file at path '{emoji_file}'. To load the emoji information from a Google"
+                " pretrained model use `tokenizer = GPTNeoXJapaneseokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
+            )
+        self.do_clean_text = do_clean_text
+        self.vocab, self.raw_vocab, self.ids_to_tokens, self.emoji = load_vocab_and_emoji(vocab_file, emoji_file)
+        self.subword_tokenizer = SubWordJapaneseTokenizer(
+            vocab=self.vocab, ids_to_tokens=self.ids_to_tokens, emoji=self.emoji
+        )
+        super().__init__(
+            unk_token=unk_token,
+            pad_token=pad_token,
+            bos_token=bos_token,
+            eos_token=eos_token,
+            do_clean_text=do_clean_text,
+            **kwargs,
+        )
+
+    @property
+    def vocab_size(self):
+        # self.vocab contains support for character fluctuation unique to Japanese, and has a large number of vocab
+        return len(self.raw_vocab)
+
+    def get_vocab(self):
+        return dict(self.raw_vocab, **self.added_tokens_encoder)
+
+    def _tokenize(self, text):
+        return self.subword_tokenizer.tokenize(text, clean=self.do_clean_text)
+
+    def _convert_token_to_id(self, token):
+        """Converts a token (str) in an id using the vocab."""
+        return self.vocab.get(token, self.vocab.get(self.unk_token))
+
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) in a token (str) using the vocab."""
+        return self.subword_tokenizer.convert_id_to_token(index)
+
+    def convert_tokens_to_string(self, tokens):
+        """Converts a sequence of tokens (string) in a single string."""
+        out_string = "".join(tokens).strip()
+        return out_string
+
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
+        index = 0
+        if os.path.isdir(save_directory):
+            vocab_file = os.path.join(
+                save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+            )
+            emoji_file = os.path.join(
+                save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["emoji_file"]
+            )
+        else:
+            vocab_file = (
+                (filename_prefix + "-" if filename_prefix else "") + save_directory + VOCAB_FILES_NAMES["vocab_file"]
+            )
+            emoji_file = (
+                (filename_prefix + "-" if filename_prefix else "") + save_directory + VOCAB_FILES_NAMES["emoji_file"]
+            )
+        with open(vocab_file, "w", encoding="utf-8") as writer:
+            for token_index, token in self.ids_to_tokens.items():
+                if index != token_index:
+                    logger.warning(
+                        f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive."
+                        " Please check that the vocabulary is not corrupted!"
+                    )
+                    index = token_index
+                writer.write(",".join(token) + "\n")
+                index += 1
+        with open(emoji_file, "w", encoding="utf-8") as writer:
+            json.dump(self.emoji, writer)
+        return vocab_file, emoji_file
+
+
+class SubWordJapaneseTokenizer:
+    """
+    https://github.com/tanreinama/Japanese-BPEEncoder_V2 This tokenizer class is under MIT License according to the
+    original repository.
+
+    MIT License
+
+    Copyright (c) 2020 tanreinama
+
+    Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
+    documentation files (the "Software"), to deal in the Software without restriction, including without limitation the
+    rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
+    permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+    The above copyright notice and this permission notice shall be included in all copies or substantial portions of
+    the Software.
+
+    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO
+    THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+    TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+    SOFTWARE.
+    """
+
+    def __init__(self, vocab, ids_to_tokens, emoji):
+        self.vocab = vocab  # same as swe
+        self.ids_to_tokens = ids_to_tokens  # same as bpe
+        self.emoji = emoji
+        self.maxlen = np.max([len(w) for w in self.vocab])
+        self.content_repatter1 = re.compile(r"(https?|ftp)(:\/\/[-_\.!~*\'()a-zA-Z0-9;\/?:\@&=\+$,%#]+)")
+        self.content_repatter2 = re.compile(r"[A-Za-z0-9\._+]*@[\-_0-9A-Za-z]+(\.[A-Za-z]+)*")
+        self.content_repatter3 = re.compile(r"[\(]{0,1}[0-9]{2,4}[\)\-\(]{0,1}[0-9]{2,4}[\)\-]{0,1}[0-9]{3,4}")
+        self.content_repatter4 = re.compile(
+            r"([12]\d{3}[/\-年])*(0?[1-9]|1[0-2])[/\-月]((0?[1-9]|[12][0-9]|3[01])日?)*(\d{1,2}|:|\d{1,2}時|\d{1,2}分|\(日\)|\(月\)|\(火\)|\(水\)|\(木\)|\(金\)|\(土\)|㈰|㈪|㈫|㈬|㈭|㈮|㈯)*"
+        )
+        self.content_repatter5 = re.compile(
+            r"(明治|大正|昭和|平成|令和|㍾|㍽|㍼|㍻|\u32ff)\d{1,2}年(0?[1-9]|1[0-2])月(0?[1-9]|[12][0-9]|3[01])日(\d{1,2}|:|\d{1,2}時|\d{1,2}分|\(日\)|\(月\)|\(火\)|\(水\)|\(木\)|\(金\)|\(土\)|㈰|㈪|㈫|㈬|㈭|㈮|㈯)*"
+        )
+        # The original version of this regex displays catastrophic backtracking behaviour. We avoid this using
+        # possessive quantifiers in Py >= 3.11. In versions below this, we avoid the vulnerability using a slightly
+        # different regex that should generally have the same behaviour in most non-pathological cases.
+        if sys.version_info >= (3, 11):
+            self.content_repatter6 = re.compile(
+                r"(?:\d,\d{3}|[\d億])*+"
+                r"(?:\d,\d{3}|[\d万])*+"
+                r"(?:\d,\d{3}|[\d千])*+"
+                r"(?:千円|万円|千万円|円|千ドル|万ドル|千万ドル|ドル|千ユーロ|万ユーロ|千万ユーロ|ユーロ)+"
+                r"(?:\(税込\)|\(税抜\)|\+tax)*"
+            )
+        else:
+            self.content_repatter6 = re.compile(
+                r"(?:\d,\d{3}|[\d億万千])*"
+                r"(?:千円|万円|千万円|円|千ドル|万ドル|千万ドル|ドル|千ユーロ|万ユーロ|千万ユーロ|ユーロ)+"
+                r"(?:\(税込\)|\(税抜\)|\+tax)*"
+            )
+        keisen = "─━│┃┄┅┆┇┈┉┊┋┌┍┎┏┐┑┒┓└┕┖┗┘┙┚┛├┝┞┟┠┡┢┣┤┥┦┧┨┩┪┫┬┭┮┯┰┱┲┳┴┵┶┷┸┹┺┻┼┽┾┿╀╁╂╃╄╅╆╇╈╉╊╋╌╍╎╏═║╒╓╔╕╖╗╘╙╚╛╜╝╞╟╠╡╢╣╤╥╦╧╨╩╪╫╬╭╮╯╰╱╲╳╴╵╶╷╸╹╺╻╼╽╾╿"
+        blocks = "▀▁▂▃▄▅▆▇█▉▊▋▌▍▎▏▐░▒▓▔▕▖▗▘▙▚▛▜▝▞▟"
+        self.content_trans1 = str.maketrans(dict.fromkeys(keisen + blocks, "<BLOCK>"))
+
+    def __len__(self):
+        return len(self.ids_to_tokens)
+
+    def clean_text(self, content):
+        content = self.content_repatter1.sub("<URL>", content)
+        content = self.content_repatter2.sub("<EMAIL>", content)
+        content = self.content_repatter3.sub("<TEL>", content)
+        content = self.content_repatter4.sub("<DATE>", content)
+        content = self.content_repatter5.sub("<DATE>", content)
+        content = self.content_repatter6.sub("<PRICE>", content)
+        content = content.translate(self.content_trans1)
+        while "<BLOCK><BLOCK>" in content:
+            content = content.replace("<BLOCK><BLOCK>", "<BLOCK>")
+        return content
+
+    def tokenize(self, text, clean=False):
+        text = text.replace(" ", "<SP>")
+        text = text.replace("　", "<SP>")
+        text = text.replace("\r\n", "<BR>")
+        text = text.replace("\n", "<BR>")
+        text = text.replace("\r", "<BR>")
+        text = text.replace("\t", "<TAB>")
+        text = text.replace("—", "ー")
+        text = text.replace("−", "ー")
+        for k, v in self.emoji["emoji"].items():
+            if k in text:
+                text = text.replace(k, v)
+        if clean:
+            text = self.clean_text(text)
+
+        def check_simbol(x):
+            e = x.encode()
+            if len(x) == 1 and len(e) == 2:
+                c = (int(e[0]) << 8) + int(e[1])
+                if (
+                    (c >= 0xC2A1 and c <= 0xC2BF)
+                    or (c >= 0xC780 and c <= 0xC783)
+                    or (c >= 0xCAB9 and c <= 0xCBBF)
+                    or (c >= 0xCC80 and c <= 0xCDA2)
+                ):
+                    return True
+            return False
+
+        def checku2e(x):
+            e = x.encode()
+            if len(x) == 1 and len(e) == 3:
+                c = (int(e[0]) << 16) + (int(e[1]) << 8) + int(e[2])
+                if c >= 0xE28080 and c <= 0xE2B07F:
+                    return True
+            return False
+
+        pos = 0
+        result = []
+        while pos < len(text):
+            end = min(len(text), pos + self.maxlen + 1) if text[pos] == "<" else pos + 3
+            candidates = []  # (token_id, token, pos)
+            for e in range(end, pos, -1):
+                wd = text[pos:e]
+                if wd in self.vocab:
+                    if wd[0] == "<" and len(wd) > 2:
+                        candidates = [(self.vocab[wd], wd, e)]
+                        break
+                    else:
+                        candidates.append((self.vocab[wd], wd, e))
+            if len(candidates) > 0:
+                # the smallest token_id is adopted
+                _, wd, e = sorted(candidates, key=lambda x: x[0])[0]
+                result.append(wd)
+                pos = e
+            else:
+                end = pos + 1
+                wd = text[pos:end]
+                if check_simbol(wd):
+                    result.append("<KIGOU>")
+                elif checku2e(wd):
+                    result.append("<U2000U2BFF>")
+                else:
+                    for i in wd.encode("utf-8"):
+                        result.append("<|byte%d|>" % i)
+                pos = end
+        return result
+
+    def convert_id_to_token(self, index, breakline="\n"):
+        words = []
+        byte_tokens = []
+        word = self.ids_to_tokens[index][0]
+        if word[:6] == "<|byte" and word[-2:] == "|>":
+            byte_tokens.append(int(word[6:-2]))
+        else:
+            if len(byte_tokens) > 0:
+                words.append(bytearray(byte_tokens).decode("utf-8", errors="replace"))
+                byte_tokens = []
+            if word[:7] == "<|emoji" and word[-2:] == "|>":
+                words.append(self.emoji["emoji_inv"][word])
+            elif word == "<SP>":
+                words.append(" ")
+            elif word == "<BR>":
+                words.append(breakline)
+            elif word == "<TAB>":
+                words.append("\t")
+            elif word == "<BLOCK>":
+                words.append("▀")
+            elif word == "<KIGOU>":
+                words.append("ǀ")
+            elif word == "<U2000U2BFF>":
+                words.append("‖")
+            else:
+                words.append(word)
+        if len(byte_tokens) > 0:
+            words.append(bytearray(byte_tokens).decode("utf-8", errors="replace"))
+        text = "".join(words)
+        return text
+
+
+# Only the high-level tokenizer is exported; `SubWordJapaneseTokenizer` is not part of `__all__`.
+__all__ = ["GPTNeoXJapaneseTokenizer"]
diff --git a/phivenv/Lib/site-packages/transformers/models/gpt_oss/__init__.py b/phivenv/Lib/site-packages/transformers/models/gpt_oss/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..19e12e75ef8f46a09244a1a0541bec8097ba3a94
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/gpt_oss/__init__.py
@@ -0,0 +1,27 @@
+# Copyright 2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+# Static type checkers see the real star-imports; at runtime the package entry in `sys.modules` is replaced by a
+# `_LazyModule` built from this file's import structure (presumably deferring the submodule imports — confirm).
+if TYPE_CHECKING:
+    from .configuration_gpt_oss import *
+    from .modeling_gpt_oss import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/phivenv/Lib/site-packages/transformers/models/gpt_oss/__pycache__/__init__.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/gpt_oss/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d63e742bcc60e77d1107ef6e946802a950a311e2
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/gpt_oss/__pycache__/__init__.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/gpt_oss/__pycache__/configuration_gpt_oss.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/gpt_oss/__pycache__/configuration_gpt_oss.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ebe5487650f6a5299cf603daceab51dfd3d1ddd1
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/gpt_oss/__pycache__/configuration_gpt_oss.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/gpt_oss/__pycache__/modeling_gpt_oss.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/gpt_oss/__pycache__/modeling_gpt_oss.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bca9ff0da6eb29fad17abc4f49ded01016632254
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/gpt_oss/__pycache__/modeling_gpt_oss.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/gpt_oss/__pycache__/modular_gpt_oss.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/gpt_oss/__pycache__/modular_gpt_oss.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4fc302feb345616ce82ac20521babfcb6b55b2f8
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/gpt_oss/__pycache__/modular_gpt_oss.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/gpt_oss/configuration_gpt_oss.py b/phivenv/Lib/site-packages/transformers/models/gpt_oss/configuration_gpt_oss.py
new file mode 100644
index 0000000000000000000000000000000000000000..003d38c900a78e42086dab6422a7cc00bddb3739
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/gpt_oss/configuration_gpt_oss.py
@@ -0,0 +1,126 @@
+# coding=utf-8
+# Copyright 2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""openai model configuration"""
+
+from ...configuration_utils import PretrainedConfig, layer_type_validation
+from ...modeling_rope_utils import rope_config_validation
+
+
+class GptOssConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a GPT-OSS model. It holds the hyperparameters
+    accepted by `__init__` below (layer/expert counts, attention sizes, RoPE scaling and router options).
+
+    """
+
+    model_type = "gpt_oss"
+    base_model_pp_plan = {
+        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
+        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
+        "norm": (["hidden_states"], ["hidden_states"]),
+    }
+    base_model_tp_plan = {
+        "layers.*.self_attn.q_proj": "colwise",
+        "layers.*.self_attn.k_proj": "colwise",
+        "layers.*.self_attn.v_proj": "colwise",
+        "layers.*.self_attn.o_proj": "rowwise",
+        "layers.*.self_attn.sinks": "local_rowwise",
+        "layers.*.mlp.experts": "gather",
+        "layers.*.mlp.router": "ep_router",
+        "layers.*.mlp.experts.gate_up_proj": "grouped_gemm",
+        "layers.*.mlp.experts.gate_up_proj_bias": "grouped_gemm",
+        "layers.*.mlp.experts.down_proj": "grouped_gemm",
+        "layers.*.mlp.experts.down_proj_bias": "grouped_gemm",
+    }
+
+    def __init__(
+        self,
+        num_hidden_layers: int = 36,
+        num_local_experts: int = 128,
+        vocab_size: int = 201088,
+        hidden_size: int = 2880,
+        intermediate_size: int = 2880,
+        head_dim: int = 64,
+        num_attention_heads: int = 64,
+        num_key_value_heads: int = 8,
+        sliding_window: int = 128,
+        rope_theta: float = 150000.0,
+        tie_word_embeddings=False,
+        hidden_act: str = "silu",
+        initializer_range: float = 0.02,
+        max_position_embeddings=131072,
+        rms_norm_eps: float = 1e-5,
+        rope_scaling={
+            "rope_type": "yarn",
+            "factor": 32.0,
+            "beta_fast": 32.0,
+            "beta_slow": 1.0,
+            "truncate": False,
+            "original_max_position_embeddings": 4096,
+        },
+        attention_dropout: float = 0.0,
+        num_experts_per_tok=4,
+        router_aux_loss_coef: float = 0.9,
+        output_router_logits=False,
+        use_cache=True,
+        layer_types=None,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.num_local_experts = num_local_experts
+        self.sliding_window = sliding_window
+        self.num_experts_per_tok = num_experts_per_tok
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self.attention_dropout = attention_dropout
+        self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads
+        self.layer_types = layer_types
+        if self.layer_types is None:
+            self.layer_types = [
+                "sliding_attention" if bool((i + 1) % 2) else "full_attention" for i in range(self.num_hidden_layers)
+            ]
+        layer_type_validation(self.layer_types)
+
+        self.attention_bias = True
+        self.max_position_embeddings = max_position_embeddings
+        self.router_aux_loss_coef = router_aux_loss_coef
+        self.output_router_logits = output_router_logits
+        self.use_cache = use_cache
+
+        # Validate the correctness of rotary position embeddings parameters
+        # BC: if there is a 'type' field, copy it to 'rope_type'.
+        if self.rope_scaling is not None and "type" in self.rope_scaling:
+            self.rope_scaling["rope_type"] = self.rope_scaling["type"]
+        rope_config_validation(self)
+
+        super().__init__(
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+
+__all__ = ["GptOssConfig"]
diff --git a/phivenv/Lib/site-packages/transformers/models/gpt_oss/modeling_gpt_oss.py b/phivenv/Lib/site-packages/transformers/models/gpt_oss/modeling_gpt_oss.py
new file mode 100644
index 0000000000000000000000000000000000000000..7cf4d42ea58fac967d3f1f8ec2dfab2109c294de
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/gpt_oss/modeling_gpt_oss.py
@@ -0,0 +1,720 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/gpt_oss/modular_gpt_oss.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_gpt_oss.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# coding=utf-8
+# Copyright 2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Callable, Optional, Union
+
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+from ...cache_utils import Cache, DynamicCache
+from ...generation import GenerationMixin
+from ...integrations.hub_kernels import use_kernel_forward_from_hub
+from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask
+from ...modeling_layers import (
+    GenericForSequenceClassification,
+    GenericForTokenClassification,
+    GradientCheckpointingLayer,
+)
+from ...modeling_outputs import MoeCausalLMOutputWithPast, MoeModelOutputWithPast
+from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from ...processing_utils import Unpack
+from ...utils import TransformersKwargs, auto_docstring, can_return_tuple
+from ...utils.deprecation import deprecate_kwarg
+from ...utils.generic import OutputRecorder, check_model_inputs
+from .configuration_gpt_oss import GptOssConfig
+
+
+@use_kernel_forward_from_hub("RMSNorm")
+class GptOssRMSNorm(nn.Module):
+    def __init__(self, hidden_size, eps=1e-6):
+        """
+        GptOssRMSNorm is equivalent to T5LayerNorm
+        """
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+
+    def forward(self, hidden_states):
+        input_dtype = hidden_states.dtype
+        hidden_states = hidden_states.to(torch.float32)
+        variance = hidden_states.pow(2).mean(-1, keepdim=True)
+        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+        return (self.weight * hidden_states).to(input_dtype)  # main diff with Llama
+
+    def extra_repr(self):
+        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
+
+
+class GptOssExperts(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.intermediate_size = config.intermediate_size
+        self.num_experts = config.num_local_experts
+        self.hidden_size = config.hidden_size
+        self.expert_dim = self.intermediate_size
+        self.gate_up_proj = nn.Parameter(torch.empty(self.num_experts, self.hidden_size, 2 * self.expert_dim))
+        self.gate_up_proj_bias = nn.Parameter(torch.empty(self.num_experts, 2 * self.expert_dim))
+        self.down_proj = nn.Parameter(torch.empty((self.num_experts, self.expert_dim, self.hidden_size)))
+        self.down_proj_bias = nn.Parameter(torch.empty(self.num_experts, self.hidden_size))
+        self.alpha = 1.702
+        self.limit = 7.0
+
+    def forward(self, hidden_states: torch.Tensor, router_indices=None, routing_weights=None) -> torch.Tensor:
+        """
+        When training (or running on CPU) it is more efficient to loop over the experts that were hit and compute the
+        output for each of them, as otherwise the memory would explode.
+
+        Otherwise (inference off-CPU) we sacrifice some memory and compute all experts at once by repeating the inputs.
+
+        Args:
+            hidden_states (torch.Tensor): (batch_size, seq_len, hidden_size)
+            router_indices (torch.Tensor): (batch_size * token_num, top_k)
+            routing_weights (torch.Tensor): (batch_size * token_num, num_experts)
+        Returns:
+            torch.Tensor
+        """
+        batch_size = hidden_states.shape[0]
+        hidden_states = hidden_states.reshape(-1, self.hidden_size)  # (num_tokens, hidden_size)
+        num_experts = routing_weights.shape[1]
+        if hidden_states.device.type == "cpu" or self.training:
+            next_states = torch.zeros_like(hidden_states, dtype=hidden_states.dtype, device=hidden_states.device)
+            with torch.no_grad():
+                expert_mask = torch.nn.functional.one_hot(router_indices, num_classes=num_experts)
+                expert_mask = expert_mask.permute(2, 1, 0)
+                # we sum on the top_k and on the sequence length to get which experts
+                # are hit this time around
+                expert_hit = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero()
+            for expert_idx in expert_hit[:]:
+                # expert_idx only has 1 element, so we can use a scalar for fast indexing
+                expert_idx = expert_idx[0]
+                with torch.no_grad():
+                    _, token_idx = torch.where(expert_mask[expert_idx])
+                current_state = hidden_states[token_idx]
+                gate_up = current_state @ self.gate_up_proj[expert_idx] + self.gate_up_proj_bias[expert_idx]
+                gate, up = gate_up[..., ::2], gate_up[..., 1::2]
+                gate = gate.clamp(min=None, max=self.limit)
+                up = up.clamp(min=-self.limit, max=self.limit)
+                glu = gate * torch.sigmoid(gate * self.alpha)
+                gated_output = (up + 1) * glu
+                out = gated_output @ self.down_proj[expert_idx] + self.down_proj_bias[expert_idx]
+                weighted_output = out * routing_weights[token_idx, expert_idx, None]
+                next_states.index_add_(0, token_idx, weighted_output.to(hidden_states.dtype))
+            next_states = next_states.view(batch_size, -1, self.hidden_size)
+        else:
+            hidden_states = hidden_states.repeat(num_experts, 1)
+            hidden_states = hidden_states.view(num_experts, -1, self.hidden_size)
+            gate_up = torch.bmm(hidden_states, self.gate_up_proj) + self.gate_up_proj_bias[..., None, :]
+            gate, up = gate_up[..., ::2], gate_up[..., 1::2]
+            gate = gate.clamp(min=None, max=self.limit)
+            up = up.clamp(min=-self.limit, max=self.limit)
+            glu = gate * torch.sigmoid(gate * self.alpha)
+            next_states = torch.bmm(((up + 1) * glu), self.down_proj)
+            next_states = next_states + self.down_proj_bias[..., None, :]
+            next_states = next_states.view(num_experts, batch_size, -1, self.hidden_size)
+            next_states = next_states * routing_weights.transpose(0, 1).view(num_experts, batch_size, -1)[..., None]
+            next_states = next_states.sum(dim=0)
+        return next_states
+
+
+class GptOssTopKRouter(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.top_k = config.num_experts_per_tok
+        self.num_experts = config.num_local_experts
+        self.hidden_dim = config.hidden_size
+        self.weight = nn.Parameter(torch.empty(self.num_experts, self.hidden_dim))
+        self.bias = nn.Parameter(torch.empty(self.num_experts))
+
+    def forward(self, hidden_states):
+        hidden_states = hidden_states.reshape(-1, self.hidden_dim)
+        router_logits = F.linear(hidden_states, self.weight, self.bias)  # (seq_len, num_experts)
+        router_top_value, router_indices = torch.topk(router_logits, self.top_k, dim=-1)  # (seq_len, top_k)
+        router_top_value = torch.nn.functional.softmax(router_top_value, dim=1, dtype=router_top_value.dtype)
+        router_scores = torch.zeros_like(router_logits).scatter_(1, router_indices, router_top_value)
+        return router_scores, router_indices
+
+
+@use_kernel_forward_from_hub("MegaBlocksMoeMLP")
+class GptOssMLP(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.router = GptOssTopKRouter(config)
+        self.experts = GptOssExperts(config)
+
+    def forward(self, hidden_states):
+        router_scores, router_indices = self.router(hidden_states)  # scores: (seq_len, num_experts)
+        routed_out = self.experts(hidden_states, router_indices=router_indices, routing_weights=router_scores)
+        return routed_out, router_scores
+
+
+class GptOssRotaryEmbedding(nn.Module):
+    inv_freq: torch.Tensor  # fix linting for `register_buffer`
+
+    def __init__(self, config: GptOssConfig, device=None):
+        super().__init__()
+        # BC: "rope_type" was originally "type"
+        if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict):
+            self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
+        else:
+            self.rope_type = "default"
+        self.max_seq_len_cached = config.max_position_embeddings
+        self.original_max_seq_len = config.max_position_embeddings
+
+        self.config = config
+        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
+
+        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+        self.original_inv_freq = self.inv_freq
+
+    @torch.no_grad()
+    @dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
+    def forward(self, x, position_ids):
+        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
+        position_ids_expanded = position_ids[:, None, :].float()
+
+        device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
+        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
+            emb = freqs
+            cos = emb.cos() * self.attention_scaling
+            sin = emb.sin() * self.attention_scaling
+
+        return cos.to(x.dtype), sin.to(x.dtype)
+
+
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """
+    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
+    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
+    """
+    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+    if n_rep == 1:
+        return hidden_states
+    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
+    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
+
+
+def _apply_rotary_emb(
+    x: torch.Tensor,
+    cos: torch.Tensor,
+    sin: torch.Tensor,
+) -> torch.Tensor:
+    first_half, second_half = torch.chunk(x, 2, dim=-1)
+    first_ = first_half * cos - second_half * sin
+    second_ = second_half * cos + first_half * sin
+    return torch.cat((first_, second_), dim=-1)
+
+
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
+    cos = cos.unsqueeze(unsqueeze_dim)
+    sin = sin.unsqueeze(unsqueeze_dim)
+    q_embed = _apply_rotary_emb(q, cos, sin)
+    k_embed = _apply_rotary_emb(k, cos, sin)
+    return q_embed, k_embed
+
+
+def eager_attention_forward(
+    module: nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: Optional[torch.Tensor],
+    scaling: float,
+    dropout: float = 0.0,
+    **kwargs,
+):
+    key_states = repeat_kv(key, module.num_key_value_groups)
+    value_states = repeat_kv(value, module.num_key_value_groups)
+    attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
+    if attention_mask is not None:
+        causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
+        attn_weights = attn_weights + causal_mask
+
+    sinks = module.sinks.reshape(1, -1, 1, 1).expand(query.shape[0], -1, query.shape[-2], -1)
+    combined_logits = torch.cat([attn_weights, sinks], dim=-1)
+
+    # This was not in the original implementation and slightly affects results; it prevents overflow in BF16/FP16
+    # when training with bsz>1: we subtract the per-row max from the logits before the softmax.
+
+    combined_logits = combined_logits - combined_logits.max(dim=-1, keepdim=True).values
+    probs = F.softmax(combined_logits, dim=-1, dtype=combined_logits.dtype)
+    scores = probs[..., :-1]  # we drop the sink here
+    attn_weights = nn.functional.dropout(scores, p=dropout, training=module.training)
+    attn_output = torch.matmul(attn_weights, value_states)
+    attn_output = attn_output.transpose(1, 2).contiguous()
+    return attn_output, attn_weights
+
+
+class GptOssAttention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+    def __init__(self, config: GptOssConfig, layer_idx: int):
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+        self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
+        self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
+        self.scaling = self.head_dim**-0.5
+        self.attention_dropout = config.attention_dropout
+        self.is_causal = True
+        self.q_proj = nn.Linear(
+            config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.k_proj = nn.Linear(
+            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.v_proj = nn.Linear(
+            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.o_proj = nn.Linear(
+            config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
+        )
+        self.sliding_window = config.sliding_window if config.layer_types[layer_idx] == "sliding_attention" else None
+        self.sinks = nn.Parameter(torch.empty(config.num_attention_heads))
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
+        attention_mask: Optional[torch.Tensor],
+        past_key_values: Optional[Cache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        input_shape = hidden_states.shape[:-1]
+        hidden_shape = (*input_shape, -1, self.head_dim)
+
+        query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+        key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+        value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+
+        cos, sin = position_embeddings
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+
+        if past_key_values is not None:
+            cache_kwargs = {"cache_position": cache_position}
+            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        attention_interface: Callable = eager_attention_forward
+        if self.config._attn_implementation != "eager":
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            dropout=0.0 if not self.training else self.attention_dropout,
+            scaling=self.scaling,
+            sliding_window=self.sliding_window,
+            s_aux=self.sinks,  # diff with Llama
+            **kwargs,
+        )
+
+        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
+        attn_output = self.o_proj(attn_output)
+        return attn_output, attn_weights
+
+
+class GptOssDecoderLayer(GradientCheckpointingLayer):
+    def __init__(self, config: GptOssConfig, layer_idx: int):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.self_attn = GptOssAttention(config=config, layer_idx=layer_idx)
+        self.mlp = GptOssMLP(config)
+        self.input_layernorm = GptOssRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = GptOssRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.attention_type = config.layer_types[layer_idx]
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        use_cache: Optional[bool] = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> torch.Tensor:
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+        # Self Attention
+        hidden_states, _ = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            position_embeddings=position_embeddings,
+            **kwargs,
+        )
+        hidden_states = residual + hidden_states
+
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states, _ = self.mlp(hidden_states)  # diff with llama: router scores
+        hidden_states = residual + hidden_states
+        return hidden_states
+
+
+@auto_docstring
+class GptOssPreTrainedModel(PreTrainedModel):
+    config: GptOssConfig
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["GptOssDecoderLayer"]
+    _skip_keys_device_placement = ["past_key_values"]
+    _supports_flash_attn = True
+    _supports_sdpa = False
+    _supports_flex_attn = True
+
+    _can_compile_fullgraph = True
+    _supports_attention_backend = True
+    _can_record_outputs = {
+        "router_logits": OutputRecorder(GptOssTopKRouter, index=0),
+        "hidden_states": GptOssDecoderLayer,
+        "attentions": GptOssAttention,
+    }
+    _keep_in_fp32_modules = ["post_attention_layernorm", "input_layernorm", "norm"]
+    _supports_flash_attention = False
+    _supports_flex_attention = False
+
+    def _init_weights(self, module):
+        std = self.config.initializer_range
+        if isinstance(module, nn.Linear):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Parameter):
+            module.data.normal_(mean=0.0, std=std)
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+        elif isinstance(module, GptOssRMSNorm):
+            module.weight.data.fill_(1.0)
+        elif isinstance(module, GptOssExperts):
+            module.gate_up_proj.data.normal_(mean=0.0, std=std)
+            module.gate_up_proj_bias.data.zero_()
+            module.down_proj.data.normal_(mean=0.0, std=std)
+            module.down_proj_bias.data.zero_()
+        elif isinstance(module, GptOssAttention):
+            module.sinks.data.normal_(mean=0.0, std=std)
+        elif isinstance(module, GptOssTopKRouter):
+            module.weight.data.normal_(mean=0.0, std=std)
+            module.bias.data.normal_(mean=0.0, std=std)
+
+
+@auto_docstring
+class GptOssModel(GptOssPreTrainedModel):
+    _no_split_modules = ["GptOssDecoderLayer"]
+
+    def __init__(self, config: GptOssConfig):
+        super().__init__(config)
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+        self.layers = nn.ModuleList(
+            [GptOssDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+        )
+        self.norm = GptOssRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.rotary_emb = GptOssRotaryEmbedding(config=config)
+        self.gradient_checkpointing = False
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @check_model_inputs
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[list[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> MoeModelOutputWithPast:
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+        if use_cache and past_key_values is None:
+            past_key_values = DynamicCache(config=self.config)
+
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+
+        if cache_position is None:
+            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+            cache_position = torch.arange(
+                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
+            )
+        if position_ids is None:
+            position_ids = cache_position.unsqueeze(0)
+
+        # It may already have been prepared by e.g. `generate`
+        if not isinstance(causal_mask_mapping := attention_mask, dict):
+            mask_kwargs = {
+                "config": self.config,
+                "input_embeds": inputs_embeds,
+                "attention_mask": attention_mask,
+                "cache_position": cache_position,
+                "past_key_values": past_key_values,
+            }
+            causal_mask_mapping = {
+                "full_attention": create_causal_mask(**mask_kwargs),
+                "sliding_attention": create_sliding_window_causal_mask(**mask_kwargs),
+            }
+
+        hidden_states = inputs_embeds
+        position_embeddings = self.rotary_emb(hidden_states, position_ids)
+
+        for decoder_layer in self.layers:
+            hidden_states = decoder_layer(
+                hidden_states,
+                attention_mask=causal_mask_mapping[decoder_layer.attention_type],
+                position_ids=position_ids,
+                past_key_values=past_key_values,
+                use_cache=use_cache,
+                cache_position=cache_position,
+                position_embeddings=position_embeddings,
+                **kwargs,
+            )
+        hidden_states = self.norm(hidden_states)
+        return MoeModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values,
+        )
+
+
+def load_balancing_loss_func(
+    gate_logits: Union[torch.Tensor, tuple[torch.Tensor], None],
+    num_experts: Optional[int] = None,
+    top_k=2,
+    attention_mask: Optional[torch.Tensor] = None,
+) -> Union[torch.Tensor, int]:
+    r"""
+    Computes auxiliary load balancing loss as in Switch Transformer - implemented in PyTorch.
+
+    See Switch Transformer (https://huggingface.co/papers/2101.03961) for more details. This function implements the loss
+    function presented in equations (4) - (6) of the paper. It aims at penalizing cases where the routing between
+    experts is too unbalanced.
+
+    Args:
+        gate_logits:
+            Logits from the `gate`, should be a tuple of model.config.num_hidden_layers tensors of
+            shape [batch_size X sequence_length, num_experts].
+        num_experts:
+            Number of experts
+        top_k:
+            The number of experts to route per-token, can be also interpreted as the `top-k` routing
+            parameter.
+        attention_mask (`torch.Tensor`, *optional*):
+            The attention_mask used in forward function
+            shape [batch_size X sequence_length] if not None.
+
+    Returns:
+        The auxiliary loss.
+    """
+    if gate_logits is None or not isinstance(gate_logits, tuple):
+        return 0
+
+    if isinstance(gate_logits, tuple):
+        compute_device = gate_logits[0].device
+        concatenated_gate_logits = torch.cat([layer_gate.to(compute_device) for layer_gate in gate_logits], dim=0)
+
+    routing_weights = torch.nn.functional.softmax(concatenated_gate_logits, dim=-1)
+
+    _, selected_experts = torch.topk(routing_weights, top_k, dim=-1)
+
+    expert_mask = torch.nn.functional.one_hot(selected_experts, num_experts)
+
+    if attention_mask is None:
+        # Compute the percentage of tokens routed to each expert
+        tokens_per_expert = torch.mean(expert_mask.float(), dim=0)
+
+        # Compute the average probability of routing to these experts
+        router_prob_per_expert = torch.mean(routing_weights, dim=0)
+    else:
+        batch_size, sequence_length = attention_mask.shape
+        num_hidden_layers = concatenated_gate_logits.shape[0] // (batch_size * sequence_length)
+
+        # Compute the mask that masks all padding tokens as 0 with the same shape of expert_mask
+        expert_attention_mask = (
+            attention_mask[None, :, :, None, None]
+            .expand((num_hidden_layers, batch_size, sequence_length, top_k, num_experts))
+            .reshape(-1, top_k, num_experts)
+            .to(compute_device)
+        )
+
+        # Compute the percentage of tokens routed to each experts
+        tokens_per_expert = torch.sum(expert_mask.float() * expert_attention_mask, dim=0) / torch.sum(
+            expert_attention_mask, dim=0
+        )
+
+        # Compute the mask that masks all padding tokens as 0 with the same shape of tokens_per_expert
+        router_per_expert_attention_mask = (
+            attention_mask[None, :, :, None]
+            .expand((num_hidden_layers, batch_size, sequence_length, num_experts))
+            .reshape(-1, num_experts)
+            .to(compute_device)
+        )
+
+        # Compute the average probability of routing to these experts
+        router_prob_per_expert = torch.sum(routing_weights * router_per_expert_attention_mask, dim=0) / torch.sum(
+            router_per_expert_attention_mask, dim=0
+        )
+
+    overall_loss = torch.sum(tokens_per_expert * router_prob_per_expert.unsqueeze(0))
+    return overall_loss * num_experts
+
+
+@auto_docstring
+class GptOssForCausalLM(GptOssPreTrainedModel, GenerationMixin):
+    _tied_weights_keys = ["lm_head.weight"]
+    _tp_plan = {"lm_head": "colwise_rep"}
+    _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.model = GptOssModel(config)
+        self.vocab_size = config.vocab_size
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+        self.router_aux_loss_coef = config.router_aux_loss_coef  # scales the router aux loss added to the LM loss
+        self.num_experts = config.num_local_experts
+        self.num_experts_per_tok = config.num_experts_per_tok
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_router_logits: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> MoeCausalLMOutputWithPast:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+        Example:
+
+        ```python
+        >>> from transformers import AutoTokenizer, GptOssForCausalLM
+
+        >>> model = GptOssForCausalLM.from_pretrained("mistralai/GptOss-8x7B-v0.1")
+        >>> tokenizer = AutoTokenizer.from_pretrained("mistralai/GptOss-8x7B-v0.1")
+
+        >>> prompt = "Hey, are you conscious? Can you talk to me?"
+        >>> inputs = tokenizer(prompt, return_tensors="pt")
+
+        >>> # Generate
+        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
+        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
+        ```"""
+
+        output_router_logits = (
+            output_router_logits if output_router_logits is not None else self.config.output_router_logits
+        )
+
+        # Run the base model; the fields read below are last_hidden_state, past_key_values and router_logits
+        outputs: MoeModelOutputWithPast = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_router_logits=output_router_logits,
+            cache_position=cache_position,
+            **kwargs,
+        )
+
+        hidden_states = outputs.last_hidden_state
+        # Project only the kept positions: int -> last `logits_to_keep` tokens (0 keeps all); tensor -> index on dim 1
+        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+        logits = self.lm_head(hidden_states[:, slice_indices, :])
+
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(logits, labels, self.vocab_size, **kwargs)
+
+        aux_loss = None  # NOTE(review): load_balancing_loss_func may return int 0 (no `.to()`) - confirm
+        if output_router_logits:
+            aux_loss = load_balancing_loss_func(
+                outputs.router_logits,
+                self.num_experts,
+                self.num_experts_per_tok,
+                attention_mask,
+            )
+            if labels is not None:
+                loss += self.router_aux_loss_coef * aux_loss.to(loss.device)  # make sure both are on the same device
+
+        return MoeCausalLMOutputWithPast(
+            loss=loss,
+            aux_loss=aux_loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            router_logits=outputs.router_logits,
+        )
+
+
+class GptOssForSequenceClassification(GenericForSequenceClassification, GptOssPreTrainedModel):
+    pass  # no overrides: everything comes from the two base classes
+
+
+class GptOssForTokenClassification(GenericForTokenClassification, GptOssPreTrainedModel):
+    pass  # no overrides: everything comes from the two base classes
+
+
+__all__ = [
+    "GptOssForCausalLM",
+    "GptOssForSequenceClassification",
+    "GptOssForTokenClassification",
+    "GptOssModel",
+    "GptOssPreTrainedModel",
+]
diff --git a/phivenv/Lib/site-packages/transformers/models/gpt_oss/modular_gpt_oss.py b/phivenv/Lib/site-packages/transformers/models/gpt_oss/modular_gpt_oss.py
new file mode 100644
index 0000000000000000000000000000000000000000..9203860cc5e04e5469212ab9b8f5993be48f5b95
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/gpt_oss/modular_gpt_oss.py
@@ -0,0 +1,467 @@
+# coding=utf-8
+# Copyright 2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Callable, Optional
+
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+from ...cache_utils import Cache, DynamicCache
+from ...integrations.hub_kernels import use_kernel_forward_from_hub
+from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask
+from ...modeling_outputs import (
+    MoeModelOutputWithPast,
+)
+from ...modeling_rope_utils import dynamic_rope_update
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
+from ...processing_utils import Unpack
+from ...utils import (
+    TransformersKwargs,
+    auto_docstring,
+    logging,
+)
+from ...utils.deprecation import deprecate_kwarg
+from ...utils.generic import OutputRecorder, check_model_inputs
+from ..llama.modeling_llama import (
+    LlamaDecoderLayer,
+    LlamaPreTrainedModel,
+    LlamaRMSNorm,
+    LlamaRotaryEmbedding,
+    repeat_kv,
+)
+from ..mixtral.modeling_mixtral import (
+    MixtralForCausalLM,
+    MixtralForSequenceClassification,
+    MixtralForTokenClassification,
+    MixtralModel,
+)
+from ..qwen2.modeling_qwen2 import Qwen2Attention
+from .configuration_gpt_oss import GptOssConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+class GptOssRMSNorm(LlamaRMSNorm):
+    def forward(self, hidden_states):
+        input_dtype = hidden_states.dtype  # remembered so the result can be cast back at the end
+        hidden_states = hidden_states.to(torch.float32)  # compute the statistics in float32
+        variance = hidden_states.pow(2).mean(-1, keepdim=True)  # mean of squares over the last dim
+        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+        return (self.weight * hidden_states).to(input_dtype)  # main diff with Llama: scale first, then cast back
+
+
+class GptOssExperts(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.intermediate_size = config.intermediate_size
+        self.num_experts = config.num_local_experts
+        self.hidden_size = config.hidden_size
+        self.expert_dim = self.intermediate_size
+        self.gate_up_proj = nn.Parameter(torch.empty(self.num_experts, self.hidden_size, 2 * self.expert_dim))
+        self.gate_up_proj_bias = nn.Parameter(torch.empty(self.num_experts, 2 * self.expert_dim))
+        self.down_proj = nn.Parameter(torch.empty((self.num_experts, self.expert_dim, self.hidden_size)))
+        self.down_proj_bias = nn.Parameter(torch.empty(self.num_experts, self.hidden_size))
+        self.alpha = 1.702  # scale inside the sigmoid of the gate: gate * sigmoid(alpha * gate)
+        self.limit = 7.0  # clamp bound: upper bound for `gate`, symmetric bound for `up`
+
+    def forward(self, hidden_states: torch.Tensor, router_indices=None, routing_weights=None) -> torch.Tensor:
+        """
+        Apply the routed experts to every token and return the routing-weighted sum of their outputs.
+
+        When training (or on CPU) only the experts that were actually selected are looped over, as otherwise the
+        memory would explode. Otherwise the inputs are repeated and all experts are evaluated at once with `bmm`.
+
+        Args:
+            hidden_states (torch.Tensor): (batch_size, seq_len, hidden_size)
+            router_indices (torch.Tensor): (batch_size * seq_len, top_k) indices of the selected experts
+            routing_weights (torch.Tensor): (batch_size * seq_len, num_experts) per-expert weight of each token
+        Returns:
+            torch.Tensor: (batch_size, seq_len, hidden_size)
+        """
+        batch_size = hidden_states.shape[0]
+        hidden_states = hidden_states.reshape(-1, self.hidden_size)  # (num_tokens, hidden_size)
+        num_experts = routing_weights.shape[1]
+        if hidden_states.device.type == "cpu" or self.training:
+            next_states = torch.zeros_like(hidden_states, dtype=hidden_states.dtype, device=hidden_states.device)
+            with torch.no_grad():
+                expert_mask = torch.nn.functional.one_hot(router_indices, num_classes=num_experts)
+                expert_mask = expert_mask.permute(2, 1, 0)  # -> (num_experts, top_k, num_tokens)
+                # we sum on the top_k and on the sequence length to get which experts
+                # are hit this time around
+                expert_hit = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero()
+            for expert_idx in expert_hit[:]:
+                # expert_idx has a single element; unwrap it so it can be used as a scalar index
+                expert_idx = expert_idx[0]
+                with torch.no_grad():
+                    _, token_idx = torch.where(expert_mask[expert_idx])  # tokens routed to this expert
+                current_state = hidden_states[token_idx]
+                gate_up = current_state @ self.gate_up_proj[expert_idx] + self.gate_up_proj_bias[expert_idx]
+                gate, up = gate_up[..., ::2], gate_up[..., 1::2]  # even columns -> gate, odd columns -> up
+                gate = gate.clamp(min=None, max=self.limit)
+                up = up.clamp(min=-self.limit, max=self.limit)
+                glu = gate * torch.sigmoid(gate * self.alpha)
+                gated_output = (up + 1) * glu
+                out = gated_output @ self.down_proj[expert_idx] + self.down_proj_bias[expert_idx]
+                weighted_output = out * routing_weights[token_idx, expert_idx, None]
+                next_states.index_add_(0, token_idx, weighted_output.to(hidden_states.dtype))  # accumulate per token
+            next_states = next_states.view(batch_size, -1, self.hidden_size)
+        else:
+            hidden_states = hidden_states.repeat(num_experts, 1)  # one copy of all tokens per expert
+            hidden_states = hidden_states.view(num_experts, -1, self.hidden_size)
+            gate_up = torch.bmm(hidden_states, self.gate_up_proj) + self.gate_up_proj_bias[..., None, :]
+            gate, up = gate_up[..., ::2], gate_up[..., 1::2]  # even columns -> gate, odd columns -> up
+            gate = gate.clamp(min=None, max=self.limit)
+            up = up.clamp(min=-self.limit, max=self.limit)
+            glu = gate * torch.sigmoid(gate * self.alpha)
+            next_states = torch.bmm(((up + 1) * glu), self.down_proj)
+            next_states = next_states + self.down_proj_bias[..., None, :]
+            next_states = next_states.view(num_experts, batch_size, -1, self.hidden_size)
+            next_states = next_states * routing_weights.transpose(0, 1).view(num_experts, batch_size, -1)[..., None]
+            next_states = next_states.sum(dim=0)  # weighted sum over the expert axis
+        return next_states
+
+
+class GptOssTopKRouter(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.top_k = config.num_experts_per_tok
+        self.num_experts = config.num_local_experts
+        self.hidden_dim = config.hidden_size
+        self.weight = nn.Parameter(torch.empty(self.num_experts, self.hidden_dim))
+        self.bias = nn.Parameter(torch.empty(self.num_experts))
+
+    def forward(self, hidden_states):
+        hidden_states = hidden_states.reshape(-1, self.hidden_dim)  # flatten to (num_tokens, hidden_dim)
+        router_logits = F.linear(hidden_states, self.weight, self.bias)  # (num_tokens, num_experts)
+        router_top_value, router_indices = torch.topk(router_logits, self.top_k, dim=-1)  # (num_tokens, top_k)
+        router_top_value = torch.nn.functional.softmax(router_top_value, dim=1, dtype=router_top_value.dtype)
+        router_scores = torch.zeros_like(router_logits).scatter_(1, router_indices, router_top_value)
+        return router_scores, router_indices  # dense scores (zero outside the top-k) and the top-k expert ids
+
+
+@use_kernel_forward_from_hub("MegaBlocksMoeMLP")
+class GptOssMLP(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.router = GptOssTopKRouter(config)
+        self.experts = GptOssExperts(config)
+
+    def forward(self, hidden_states):
+        router_scores, router_indices = self.router(hidden_states)  # (num_tokens, num_experts), (num_tokens, top_k)
+        routed_out = self.experts(hidden_states, router_indices=router_indices, routing_weights=router_scores)
+        return routed_out, router_scores
+
+
+class GptOssRotaryEmbedding(LlamaRotaryEmbedding):
+    @torch.no_grad()
+    @dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
+    def forward(self, x, position_ids):
+        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
+        position_ids_expanded = position_ids[:, None, :].float()
+
+        device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
+        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
+            emb = freqs  # used as-is; `_apply_rotary_emb` applies cos/sin to each half of the last dim
+            cos = emb.cos() * self.attention_scaling
+            sin = emb.sin() * self.attention_scaling
+
+        return cos.to(x.dtype), sin.to(x.dtype)  # cast back to the dtype of `x`
+
+
+def _apply_rotary_emb(
+    x: torch.Tensor,
+    cos: torch.Tensor,
+    sin: torch.Tensor,
+) -> torch.Tensor:
+    first_half, second_half = torch.chunk(x, 2, dim=-1)  # split the last dim into two contiguous halves
+    first_ = first_half * cos - second_half * sin  # 2-D rotation of each (first, second) pair
+    second_ = second_half * cos + first_half * sin
+    return torch.cat((first_, second_), dim=-1)  # same layout as the input: [rotated first | rotated second]
+
+
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):  # `position_ids` is unused here
+    cos = cos.unsqueeze(unsqueeze_dim)  # add an axis at `unsqueeze_dim` before multiplying with q and k
+    sin = sin.unsqueeze(unsqueeze_dim)
+    q_embed = _apply_rotary_emb(q, cos, sin)
+    k_embed = _apply_rotary_emb(k, cos, sin)
+    return q_embed, k_embed
+
+
+def eager_attention_forward(
+    module: nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: Optional[torch.Tensor],
+    scaling: float,
+    dropout: float = 0.0,
+    **kwargs,
+):
+    key_states = repeat_kv(key, module.num_key_value_groups)
+    value_states = repeat_kv(value, module.num_key_value_groups)
+    attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
+    if attention_mask is not None:
+        causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]  # crop the mask to the key length
+        attn_weights = attn_weights + causal_mask
+
+    sinks = module.sinks.reshape(1, -1, 1, 1).expand(query.shape[0], -1, query.shape[-2], -1)
+    combined_logits = torch.cat([attn_weights, sinks], dim=-1)  # extra last column holds the sink logit
+
+    # This was not in the original implementation and slightly affects results: subtracting the row-wise max before
+    # the softmax prevents overflow in BF16/FP16 when training with bsz>1.
+
+    combined_logits = combined_logits - combined_logits.max(dim=-1, keepdim=True).values
+    probs = F.softmax(combined_logits, dim=-1, dtype=combined_logits.dtype)
+    scores = probs[..., :-1]  # we drop the sink here
+    attn_weights = nn.functional.dropout(scores, p=dropout, training=module.training)
+    attn_output = torch.matmul(attn_weights, value_states)
+    attn_output = attn_output.transpose(1, 2).contiguous()
+    return attn_output, attn_weights
+
+
+class GptOssAttention(Qwen2Attention):
+    def __init__(self, config: GptOssConfig, layer_idx: int):
+        super().__init__(config, layer_idx)
+        self.q_proj = nn.Linear(
+            config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.k_proj = nn.Linear(
+            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.v_proj = nn.Linear(
+            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.o_proj = nn.Linear(
+            config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
+        )
+        self.sinks = nn.Parameter(torch.empty(config.num_attention_heads))  # one learned sink value per head
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
+        attention_mask: Optional[torch.Tensor],
+        past_key_values: Optional[Cache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        input_shape = hidden_states.shape[:-1]
+        hidden_shape = (*input_shape, -1, self.head_dim)  # head count is inferred from the projection width
+
+        query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+        key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+        value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+
+        cos, sin = position_embeddings
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+
+        if past_key_values is not None:
+            cache_kwargs = {"cache_position": cache_position}
+            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        attention_interface: Callable = eager_attention_forward  # default; replaced below for other backends
+        if self.config._attn_implementation != "eager":
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            dropout=0.0 if not self.training else self.attention_dropout,
+            scaling=self.scaling,
+            sliding_window=self.sliding_window,
+            s_aux=self.sinks,  # diff with Llama
+            **kwargs,
+        )
+
+        attn_output = attn_output.reshape(*input_shape, -1).contiguous()  # merge heads back into one feature dim
+        attn_output = self.o_proj(attn_output)
+        return attn_output, attn_weights
+
+
+class GptOssDecoderLayer(LlamaDecoderLayer):
+    def __init__(self, config: GptOssConfig, layer_idx: int):
+        super().__init__(config, layer_idx)
+        self.hidden_size = config.hidden_size
+        self.self_attn = GptOssAttention(config=config, layer_idx=layer_idx)
+        self.mlp = GptOssMLP(config)
+        self.input_layernorm = GptOssRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = GptOssRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.attention_type = config.layer_types[layer_idx]  # key used by GptOssModel to pick this layer's mask
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        use_cache: Optional[bool] = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> torch.Tensor:
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+        # Self Attention (pre-norm, the attention weights are discarded)
+        hidden_states, _ = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            position_embeddings=position_embeddings,
+            **kwargs,
+        )
+        hidden_states = residual + hidden_states
+
+        # Fully Connected (pre-norm mixture-of-experts MLP)
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states, _ = self.mlp(hidden_states)  # diff with llama: the MLP also returns router scores (unused here)
+        hidden_states = residual + hidden_states
+        return hidden_states
+
+
+class GptOssPreTrainedModel(LlamaPreTrainedModel):
+    _keep_in_fp32_modules = ["post_attention_layernorm", "input_layernorm", "norm"]
+    _supports_sdpa = False
+    _supports_flash_attention = False
+    _supports_flex_attention = False
+    _can_record_outputs = {
+        "router_logits": OutputRecorder(GptOssTopKRouter, index=0),
+        "hidden_states": GptOssDecoderLayer,
+        "attentions": GptOssAttention,
+    }
+
+    def _init_weights(self, module):
+        std = self.config.initializer_range  # std of every normal initialisation below
+        if isinstance(module, nn.Linear):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Parameter):  # NOTE(review): presumably never hit when called on modules - confirm
+            module.data.normal_(mean=0.0, std=std)
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()  # keep the padding embedding at zero
+        elif isinstance(module, GptOssRMSNorm):
+            module.weight.data.fill_(1.0)
+        elif isinstance(module, GptOssExperts):  # raw nn.Parameter tensors: normal weights, zero biases
+            module.gate_up_proj.data.normal_(mean=0.0, std=std)
+            module.gate_up_proj_bias.data.zero_()
+            module.down_proj.data.normal_(mean=0.0, std=std)
+            module.down_proj_bias.data.zero_()
+        elif isinstance(module, GptOssAttention):
+            module.sinks.data.normal_(mean=0.0, std=std)
+        elif isinstance(module, GptOssTopKRouter):  # unlike nn.Linear above, the router bias is also normal-initialised
+            module.weight.data.normal_(mean=0.0, std=std)
+            module.bias.data.normal_(mean=0.0, std=std)
+
+class GptOssModel(MixtralModel):
+    _no_split_modules = ["GptOssDecoderLayer"]
+
+    @check_model_inputs
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[list[torch.FloatTensor]] = None,  # NOTE(review): used as a Cache below
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> MoeModelOutputWithPast:
+        if (input_ids is None) ^ (inputs_embeds is not None):  # true when both or neither are given
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+        if use_cache and past_key_values is None:
+            past_key_values = DynamicCache(config=self.config)
+
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+
+        if cache_position is None:
+            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+            cache_position = torch.arange(
+                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
+            )
+        if position_ids is None:
+            position_ids = cache_position.unsqueeze(0)
+
+        # It may already have been prepared by e.g. `generate`
+        if not isinstance(causal_mask_mapping := attention_mask, dict):
+            mask_kwargs = {
+                "config": self.config,
+                "input_embeds": inputs_embeds,
+                "attention_mask": attention_mask,
+                "cache_position": cache_position,
+                "past_key_values": past_key_values,
+            }
+            causal_mask_mapping = {  # one mask per attention type, looked up per layer below
+                "full_attention": create_causal_mask(**mask_kwargs),
+                "sliding_attention": create_sliding_window_causal_mask(**mask_kwargs),
+            }
+
+        hidden_states = inputs_embeds
+        position_embeddings = self.rotary_emb(hidden_states, position_ids)  # computed once, shared by all layers
+
+        for decoder_layer in self.layers:
+            hidden_states = decoder_layer(
+                hidden_states,
+                attention_mask=causal_mask_mapping[decoder_layer.attention_type],
+                position_ids=position_ids,
+                past_key_values=past_key_values,
+                use_cache=use_cache,
+                cache_position=cache_position,
+                position_embeddings=position_embeddings,
+                **kwargs,
+            )
+        hidden_states = self.norm(hidden_states)
+        return MoeModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values,
+        )
+
+
+class GptOssForCausalLM(MixtralForCausalLM):
+    pass  # no overrides: inherits everything from MixtralForCausalLM
+
+
+class GptOssForSequenceClassification(MixtralForSequenceClassification):
+    pass  # no overrides: inherits everything from MixtralForSequenceClassification
+
+
+class GptOssForTokenClassification(MixtralForTokenClassification):
+    pass  # no overrides: inherits everything from MixtralForTokenClassification
+
+
+__all__ = [
+    "GptOssForCausalLM",
+    "GptOssForSequenceClassification",
+    "GptOssForTokenClassification",
+    "GptOssModel",
+    "GptOssPreTrainedModel",
+]
diff --git a/phivenv/Lib/site-packages/transformers/models/gpt_sw3/__init__.py b/phivenv/Lib/site-packages/transformers/models/gpt_sw3/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e477eb1d2cc2ff0e34d214ed99ad3f80afe8ab0a
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/gpt_sw3/__init__.py
@@ -0,0 +1,26 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .tokenization_gpt_sw3 import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/phivenv/Lib/site-packages/transformers/models/gpt_sw3/__pycache__/__init__.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/gpt_sw3/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8c2c733c8adf899dd4c2727ba22c7d8d64c254f2
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/gpt_sw3/__pycache__/__init__.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/gpt_sw3/__pycache__/tokenization_gpt_sw3.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/gpt_sw3/__pycache__/tokenization_gpt_sw3.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c1979853af9b84cba242851495219cc333bd7144
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/gpt_sw3/__pycache__/tokenization_gpt_sw3.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/gpt_sw3/tokenization_gpt_sw3.py b/phivenv/Lib/site-packages/transformers/models/gpt_sw3/tokenization_gpt_sw3.py
new file mode 100644
index 0000000000000000000000000000000000000000..3019acfd5bcc360502ea798b22df8fff71dd2b81
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/gpt_sw3/tokenization_gpt_sw3.py
@@ -0,0 +1,301 @@
+"""The tokenizer used by the GPT-SW3 models."""
+
+import os
+import re
+import unicodedata
+from shutil import copyfile
+from typing import Any, Optional, Union
+
+import sentencepiece as spm
+
+from ...tokenization_utils import PreTrainedTokenizer
+from ...utils import is_torch_available, logging
+from ...utils.import_utils import requires
+
+
+if is_torch_available():
+    import torch
+
+
+logger = logging.get_logger(__name__)
+VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"}
+
+
+@requires(backends=("sentencepiece",))
+class GPTSw3Tokenizer(PreTrainedTokenizer):
+    """
+    Construct a GPTSw3 tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).
+
+    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
+    this superclass for more information regarding those methods.
+
+    Example usage:
+    ```python
+    >>> from transformers import GPTSw3Tokenizer
+
+    >>> tokenizer = GPTSw3Tokenizer.from_pretrained("AI-Sweden-Models/gpt-sw3-126m")
+    >>> tokenizer("Svenska är kul!")["input_ids"]
+    [1814, 377, 3617, 63504]
+    ```
+
+    Args:
+        vocab_file (`str`):
+            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
+            contains the vocabulary necessary to instantiate a tokenizer.
+        do_lower_case (`bool`, *optional*, defaults to `False`):
+            Whether or not to lowercase the input when tokenizing.
+        remove_space (`bool`, *optional*, defaults to `False`):
+            Whether or not to strip the text when tokenizing (removing excess spaces before and after the string).
+        keep_accents (`bool`, *optional*, defaults to `False`):
+            Whether or not to keep accents when tokenizing.
+        pad_token (`str`, *optional*):
+            The token used for padding, for example when batching sequences of different lengths. If not provided, will
+            default to '<pad>' or '<unk>' depending on model size.
+        unk_token (`str`, *optional*):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead. If not provided, will default to '<unk>'.
+        eos_token (`str`, *optional*):
+            The end of sequence token seen during pretraining. If not provided, will default to '<|endoftext|>'
+        bos_token (`str`, *optional*):
+            The beginning of sequence token that can be used for downstream task, was not seen during pretraining. If
+            not provided, will default to '<s>' or '<|endoftext|>', depending on model size.
+        sp_model_kwargs (`dict`, *optional*):
+            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
+            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
+            to set:
+
+            - `enable_sampling`: Enable subword regularization.
+            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
+
+              - `nbest_size = {0,1}`: No sampling is performed.
+              - `nbest_size > 1`: samples from the nbest_size results.
+              - `nbest_size < 0`: assumes that nbest_size is infinite and samples from all hypotheses (lattice)
+                using the forward-filtering-and-backward-sampling algorithm.
+
+            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
+              BPE-dropout.
+
+    Attributes:
+        sp_model (`SentencePieceProcessor`):
+            The *SentencePiece* processor that is used for every conversion (string, tokens and IDs).
+        whitespaces (`set`):
+            The whitespaces that are replaced in the whitespace normalization in preprocessing.
+        non_printing_characters_re (`Pattern`):
+            The compiled regular expression to remove non-printing characters in preprocessing.
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    model_input_names = ["input_ids", "attention_mask"]
+
+    def __init__(
+        self,
+        vocab_file,
+        do_lower_case=False,
+        remove_space=False,
+        keep_accents=False,
+        pad_token=None,
+        unk_token=None,
+        eos_token=None,
+        bos_token=None,
+        sp_model_kwargs: Optional[dict[str, Any]] = None,
+        **kwargs,
+    ) -> None:
+        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
+
+        name_or_path = kwargs.get("name_or_path")
+        if name_or_path is None:
+            logger.warning(
+                "name_or_path not provided, will work for all GPTSw3 models except gpt-sw3-7b,"
+                " you are testing the model, this can safely be ignored"
+            )
+            name_or_path = "None"
+
+        # Default definitions for our 2 tokenizer versions, with None-checks to enable proper testing
+        eos_token = "<|endoftext|>" if eos_token is None else eos_token
+        unk_token = "<unk>" if unk_token is None else unk_token
+        if "gpt-sw3-7b" in name_or_path:
+            pad_token = unk_token if pad_token is None else pad_token
+            bos_token = eos_token if bos_token is None else bos_token
+        else:
+            pad_token = "<pad>" if pad_token is None else pad_token
+            bos_token = "<s>" if bos_token is None else bos_token
+
+        self.do_lower_case = do_lower_case
+        self.remove_space = remove_space
+        self.keep_accents = keep_accents
+        self.vocab_file = vocab_file
+
+        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+        self.sp_model.Load(vocab_file)
+
+        # Used for whitespace normalization in input texts
+        # fmt : off
+        self.whitespaces = {" ", " ", " ", " ", " ", "　", " ", " ", " ", " ", "￼", ""}
+        # fmt : on
+
+        # Regular expression to remove non-printing characters (e.g. some unicode control chars) in preprocessing
+        self.non_printing_characters_re = re.compile(
+            f"[{''.join(map(chr, list(range(0, 9)) + list(range(11, 32)) + list(range(127, 160)) + [160, 173, 8203]))}]"
+        )
+
+        super().__init__(
+            do_lower_case=do_lower_case,
+            remove_space=remove_space,
+            keep_accents=keep_accents,
+            bos_token=bos_token,
+            eos_token=eos_token,
+            unk_token=unk_token,
+            pad_token=pad_token,
+            sp_model_kwargs=self.sp_model_kwargs,
+            **kwargs,
+        )
+
+    # Copied from transformers.models.albert.tokenization_albert.AlbertTokenizer.__getstate__
+    def __getstate__(self):
+        state = self.__dict__.copy()
+        state["sp_model"] = None
+        return state
+
+    # Copied from transformers.models.albert.tokenization_albert.AlbertTokenizer.__setstate__
+    def __setstate__(self, d):
+        self.__dict__ = d
+
+        # for backward compatibility
+        if not hasattr(self, "sp_model_kwargs"):
+            self.sp_model_kwargs = {}
+
+        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+        self.sp_model.Load(self.vocab_file)
+
+    @property
+    # Copied from transformers.models.albert.tokenization_albert.AlbertTokenizer.vocab_size
+    def vocab_size(self) -> int:
+        return len(self.sp_model)
+
+    def preprocess_text(self, text: str) -> str:
+        """
+        Returns the preprocessed text. This procedure is identical to what was used when training the tokenizer.
+        """
+
+        # Remove non-printing characters
+        text = self.non_printing_characters_re.sub("", text)
+
+        # Normalize whitespaces
+        text = "".join([char if char not in self.whitespaces else " " for char in text])
+
+        # NFC Unicode normalization
+        text = unicodedata.normalize("NFC", text)
+        return text
+
+    def _tokenize(self, text: str, **kwargs) -> list[str]:
+        text = self.preprocess_text(text)
+        return self.sp_model.encode(text, out_type=str)
+
+    def _convert_token_to_id(self, token: str) -> int:
+        """Converts a token (str) to an id (int) using the vocab."""
+        return self.sp_model.PieceToId(token)
+
+    def _convert_id_to_token(self, index: int) -> str:
+        """Converts an index (int) to a token (str) using the vocab."""
+        return self.sp_model.IdToPiece(index)
+
+    @staticmethod
+    def clean_up_tokenization(out_string: str) -> str:
+        """Returns the input string, this function is overridden to remove the default clean up."""
+        return out_string
+
+    def convert_tokens_to_string(self, tokens: list[str]) -> str:
+        """Converts a sequence of tokens (strings) to a single string. Special tokens remain intact."""
+        current_sub_tokens = []
+        out_string = ""
+        prev_is_special = False
+        for token in tokens:
+            # make sure that special tokens are not decoded using sentencepiece model
+            if token in self.all_special_tokens:
+                # TODO: Check if this is needed, as it ensures that decode(encode(doc)) != doc by adding extra whitespace in the decoded document
+                if not prev_is_special:
+                    out_string += " "
+
+                out_string += self.sp_model.decode(current_sub_tokens) + token
+                prev_is_special = True
+                current_sub_tokens = []
+            else:
+                current_sub_tokens.append(token)
+                prev_is_special = False
+        out_string += self.sp_model.decode(current_sub_tokens)
+
+        return out_string
+
+    # Copied from transformers.models.albert.tokenization_albert.AlbertTokenizer.get_vocab
+    def get_vocab(self) -> dict[str, int]:
+        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
+        vocab.update(self.added_tokens_encoder)
+        return vocab
+
+    # Copied from transformers.models.albert.tokenization_albert.AlbertTokenizer.save_vocabulary
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
+        if not os.path.isdir(save_directory):
+            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+            return
+        out_vocab_file = os.path.join(
+            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+        )
+
+        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
+            copyfile(self.vocab_file, out_vocab_file)
+        elif not os.path.isfile(self.vocab_file):
+            with open(out_vocab_file, "wb") as fi:
+                content_spiece_model = self.sp_model.serialized_model_proto()
+                fi.write(content_spiece_model)
+
+        return (out_vocab_file,)
+
+    def encode_fast(
+        self, text: Union[str, list[str]], return_tensors: Union[str, bool] = False
+    ) -> Union[list[int], list[list[int]], "torch.Tensor"]:
+        """
+        Encodes a text or batch of texts to token ids using preprocessing and the raw SP tokenizer. This has reduced
+        functionality but is often much faster.
+
+        Does NOT handle special tokens correctly, these can manually be added as ids afterwards.
+
+        Does NOT support padding, these can manually be added as ids afterwards.
+
+        Use default HuggingFace tokenization methods for full functionality.
+
+        Args:
+            text (`str` or `list[str]`): One or several text(s) to convert to token ids.
+            return_tensors (`str` or `bool`): Returns PyTorch tensors if set to True or "pt"
+
+        Returns:
+            `list[int]`, `list[list[int]]`, or `torch.Tensor`: The encoded text(s) as token ids.
+        """
+
+        if isinstance(text, str):
+            text = self.preprocess_text(text)
+            token_ids = self.sp_model.encode(text)
+        else:
+            text = [self.preprocess_text(t) for t in text]
+            token_ids = self.sp_model.encode(text)
+
+        if return_tensors is True or return_tensors == "pt":
+            token_ids = torch.tensor(token_ids)
+
+        return token_ids
+
+    def decode_fast(self, token_ids: Union[int, list[int]]) -> str:
+        """
+        Decodes a token id or a list of token ids to text using the raw SP tokenizer. This has reduced
+        functionality but is often much faster.
+
+        Args:
+            token_ids (`int` or `list[int]`): Encoded token or text as token id(s).
+
+        Returns:
+            `str`: Decoded text
+        """
+
+        return self.sp_model.decode(token_ids)
+
+
+__all__ = ["GPTSw3Tokenizer"]
diff --git a/phivenv/Lib/site-packages/transformers/models/gptj/__init__.py b/phivenv/Lib/site-packages/transformers/models/gptj/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..84d99fda2e6996d80c8dd32cf1247e61fb83230b
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/gptj/__init__.py
@@ -0,0 +1,29 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_gptj import *
+    from .modeling_flax_gptj import *
+    from .modeling_gptj import *
+    from .modeling_tf_gptj import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/phivenv/Lib/site-packages/transformers/models/gptj/__pycache__/__init__.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/gptj/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d307e96535b8daad5df1b03356e5586f059207f0
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/gptj/__pycache__/__init__.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/gptj/__pycache__/configuration_gptj.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/gptj/__pycache__/configuration_gptj.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c923acd0d388168fcd10c392fb874beac2a68af8
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/gptj/__pycache__/configuration_gptj.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/gptj/__pycache__/modeling_flax_gptj.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/gptj/__pycache__/modeling_flax_gptj.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d86a8fddc054f2272c786ce547ea31d001d0b6e1
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/gptj/__pycache__/modeling_flax_gptj.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/gptj/__pycache__/modeling_gptj.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/gptj/__pycache__/modeling_gptj.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..36cb31e8a6738ef7e6500ad780676d3b6f7866b0
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/gptj/__pycache__/modeling_gptj.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/gptj/__pycache__/modeling_tf_gptj.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/gptj/__pycache__/modeling_tf_gptj.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7939a809a6d837a04e3c04542ef339df86d69afc
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/gptj/__pycache__/modeling_tf_gptj.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/gptj/configuration_gptj.py b/phivenv/Lib/site-packages/transformers/models/gptj/configuration_gptj.py
new file mode 100644
index 0000000000000000000000000000000000000000..68c6909968808cba2a5ab20a739b61f8bb2e6a88
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/gptj/configuration_gptj.py
@@ -0,0 +1,220 @@
+# coding=utf-8
+# Copyright 2021 The EleutherAI and HuggingFace Teams. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""GPT-J model configuration"""
+
+from collections import OrderedDict
+from collections.abc import Mapping
+from typing import Any, Optional
+
+from ... import PreTrainedTokenizer, TensorType, is_torch_available
+from ...configuration_utils import PretrainedConfig
+from ...onnx import OnnxConfigWithPast, PatchingSpec
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class GPTJConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`GPTJModel`]. It is used to instantiate a GPT-J
+    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+    defaults will yield a similar configuration to that of the GPT-J
+    [EleutherAI/gpt-j-6B](https://huggingface.co/EleutherAI/gpt-j-6B) architecture. Configuration objects inherit from
+    [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`]
+    for more information.
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 50400):
+            Vocabulary size of the GPT-J model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`GPTJModel`].
+        n_positions (`int`, *optional*, defaults to 2048):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            just in case (e.g., 512 or 1024 or 2048).
+        n_embd (`int`, *optional*, defaults to 4096):
+            Dimensionality of the embeddings and hidden states.
+        n_layer (`int`, *optional*, defaults to 28):
+            Number of hidden layers in the Transformer encoder.
+        n_head (`int`, *optional*, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        rotary_dim (`int`, *optional*, defaults to 64):
+            Number of dimensions in the embedding that Rotary Position Embedding is applied to.
+        n_inner (`int`, *optional*, defaults to None):
+            Dimensionality of the inner feed-forward layers. `None` will set it to 4 times n_embd
+        activation_function (`str`, *optional*, defaults to `"gelu_new"`):
+            Activation function, to be selected in the list `["relu", "silu", "gelu", "tanh", "gelu_new"]`.
+        resid_pdrop (`float`, *optional*, defaults to 0.0):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        embd_pdrop (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the embeddings.
+        attn_pdrop (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention.
+        layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
+            The epsilon to use in the layer normalization layers.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models).
+
+    Example:
+
+    ```python
+    >>> from transformers import GPTJModel, GPTJConfig
+
+    >>> # Initializing a GPT-J 6B configuration
+    >>> configuration = GPTJConfig()
+
+    >>> # Initializing a model from the configuration
+    >>> model = GPTJModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "gptj"
+    attribute_map = {
+        "max_position_embeddings": "n_positions",
+        "hidden_size": "n_embd",
+        "num_attention_heads": "n_head",
+        "num_hidden_layers": "n_layer",
+    }
+
+    def __init__(
+        self,
+        vocab_size=50400,
+        n_positions=2048,
+        n_embd=4096,
+        n_layer=28,
+        n_head=16,
+        rotary_dim=64,
+        n_inner=None,
+        activation_function="gelu_new",
+        resid_pdrop=0.0,
+        embd_pdrop=0.0,
+        attn_pdrop=0.0,
+        layer_norm_epsilon=1e-5,
+        initializer_range=0.02,
+        use_cache=True,
+        bos_token_id=50256,
+        eos_token_id=50256,
+        tie_word_embeddings=False,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.n_positions = n_positions
+        self.n_embd = n_embd
+        self.n_layer = n_layer
+        self.n_head = n_head
+        self.n_inner = n_inner
+        self.rotary_dim = rotary_dim
+        self.activation_function = activation_function
+        self.resid_pdrop = resid_pdrop
+        self.embd_pdrop = embd_pdrop
+        self.attn_pdrop = attn_pdrop
+        self.layer_norm_epsilon = layer_norm_epsilon
+        self.initializer_range = initializer_range
+        self.use_cache = use_cache
+
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
+
+        super().__init__(
+            bos_token_id=bos_token_id, eos_token_id=eos_token_id, tie_word_embeddings=tie_word_embeddings, **kwargs
+        )
+
+
+# Copied from transformers.models.gpt2.configuration_gpt2.GPT2OnnxConfig
+class GPTJOnnxConfig(OnnxConfigWithPast):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        task: str = "default",
+        patching_specs: Optional[list[PatchingSpec]] = None,
+        use_past: bool = False,
+    ):
+        super().__init__(config, task=task, patching_specs=patching_specs, use_past=use_past)
+        if not getattr(self._config, "pad_token_id", None):
+            # TODO: how to do that better?
+            self._config.pad_token_id = 0
+
+    @property
+    def inputs(self) -> Mapping[str, Mapping[int, str]]:
+        common_inputs = OrderedDict({"input_ids": {0: "batch", 1: "sequence"}})
+        if self.use_past:
+            self.fill_with_past_key_values_(common_inputs, direction="inputs")
+            common_inputs["attention_mask"] = {0: "batch", 1: "past_sequence + sequence"}
+        else:
+            common_inputs["attention_mask"] = {0: "batch", 1: "sequence"}
+
+        return common_inputs
+
+    @property
+    def num_layers(self) -> int:
+        return self._config.n_layer
+
+    @property
+    def num_attention_heads(self) -> int:
+        return self._config.n_head
+
+    def generate_dummy_inputs(
+        self,
+        tokenizer: PreTrainedTokenizer,
+        batch_size: int = -1,
+        seq_length: int = -1,
+        is_pair: bool = False,
+        framework: Optional[TensorType] = None,
+    ) -> Mapping[str, Any]:
+        common_inputs = super(OnnxConfigWithPast, self).generate_dummy_inputs(
+            tokenizer, batch_size=batch_size, seq_length=seq_length, is_pair=is_pair, framework=framework
+        )
+
+        # We need to order the inputs in the way they appear in forward()
+        ordered_inputs = OrderedDict({"input_ids": common_inputs["input_ids"]})
+
+        # Need to add the past_keys
+        if self.use_past:
+            if not is_torch_available():
+                raise ValueError("Cannot generate dummy past_keys inputs without PyTorch installed.")
+            else:
+                import torch
+
+                batch, seqlen = common_inputs["input_ids"].shape
+                # Not using the same length for past_key_values
+                past_key_values_length = seqlen + 2
+                past_shape = (
+                    batch,
+                    self.num_attention_heads,
+                    past_key_values_length,
+                    self._config.hidden_size // self.num_attention_heads,
+                )
+                ordered_inputs["past_key_values"] = [
+                    (torch.zeros(past_shape), torch.zeros(past_shape)) for _ in range(self.num_layers)
+                ]
+
+        ordered_inputs["attention_mask"] = common_inputs["attention_mask"]
+        if self.use_past:
+            mask_dtype = ordered_inputs["attention_mask"].dtype
+            ordered_inputs["attention_mask"] = torch.cat(
+                [ordered_inputs["attention_mask"], torch.ones(batch, past_key_values_length, dtype=mask_dtype)], dim=1
+            )
+
+        return ordered_inputs
+
+    @property
+    def default_onnx_opset(self) -> int:
+        return 13
+
+
+__all__ = ["GPTJConfig", "GPTJOnnxConfig"]
diff --git a/phivenv/Lib/site-packages/transformers/models/gptj/modeling_flax_gptj.py b/phivenv/Lib/site-packages/transformers/models/gptj/modeling_flax_gptj.py
new file mode 100644
index 0000000000000000000000000000000000000000..12ea7a4fffb4f547fd98ab8d1e9ce3de7a33f9b6
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/gptj/modeling_flax_gptj.py
@@ -0,0 +1,721 @@
+# coding=utf-8
+# Copyright 2021 The EleutherAI and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from functools import partial
+from typing import Optional
+
+import flax.linen as nn
+import jax
+import jax.numpy as jnp
+import numpy as np
+from flax.core.frozen_dict import FrozenDict, freeze, unfreeze
+from flax.linen import combine_masks, make_causal_mask
+from flax.linen.attention import dot_product_attention_weights
+from flax.traverse_util import flatten_dict, unflatten_dict
+from jax import lax
+
+from ...modeling_flax_outputs import FlaxBaseModelOutput, FlaxCausalLMOutput
+from ...modeling_flax_utils import ACT2FN, FlaxPreTrainedModel, append_call_sample_docstring
+from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging
+from .configuration_gptj import GPTJConfig
+
+
+logger = logging.get_logger(__name__)
+
+_CHECKPOINT_FOR_DOC = "gptj"
+_CONFIG_FOR_DOC = "GPTJConfig"
+
+
+GPTJ_START_DOCSTRING = r"""
+
+    This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.)
+
+    This model is also a Flax Linen
+    [flax.nn.Module](https://flax.readthedocs.io/en/latest/_autosummary/flax.nn.module.html) subclass. Use it as a
+    regular Flax Module and refer to the Flax documentation for all matter related to general usage and behavior.
+
+    Finally, this model supports inherent JAX features such as:
+
+    - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
+    - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
+    - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
+    - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)
+
+    Parameters:
+        config ([`GPTJConfig`]): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the model weights.
+        dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`):
+            The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on GPUs) and
+            `jax.numpy.bfloat16` (on TPUs).
+
+            This can be used to enable mixed-precision training or half-precision inference on GPUs or TPUs. If
+            specified all the computation will be performed with the given `dtype`.
+
+            **Note that this only specifies the dtype of the computation and does not influence the dtype of model
+            parameters.**
+
+            If you wish to change the dtype of the model parameters, see [`~FlaxPreTrainedModel.to_fp16`] and
+            [`~FlaxPreTrainedModel.to_bf16`].
+"""
+
+GPTJ_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`numpy.ndarray` of shape `(batch_size, input_ids_length)`):
+            `input_ids_length` = `sequence_length`. Indices of input sequence tokens in the vocabulary.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        attention_mask (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
+        position_ids (`numpy.ndarray` of shape `(batch_size, input_ids_length)`, *optional*):
+            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+            config.max_position_embeddings - 1]`.
+        past_key_values (`dict[str, np.ndarray]`, *optional*, returned by `init_cache` or when passing previous `past_key_values`):
+            Dictionary of pre-computed hidden-states (key and values in the attention blocks) that can be used for fast
+            auto-regressive decoding. Pre-computed key and value hidden-states are of shape *[batch_size, max_length]*.
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+def create_sinusoidal_positions(num_pos, dim):  # builds a (num_pos, dim) table: sin values in the leading columns, cos values in the rest
+    inv_freq = 1.0 / (10000 ** (np.arange(0, dim, 2) / dim))  # one inverse frequency per even index in [0, dim)
+    sinusoid_inp = np.einsum("i , j -> i j", np.arange(num_pos), inv_freq).astype("float32")  # outer product: position x inverse frequency
+    sin, cos = np.sin(sinusoid_inp), np.cos(sinusoid_inp)
+
+    sentinel = dim // 2 + dim % 2  # ceil(dim / 2): first column of the cos part
+    out = np.zeros((num_pos, dim))
+    out[:, 0:sentinel] = sin
+    out[:, sentinel:] = cos
+
+    return jnp.array(out)  # NumPy table converted to a JAX array
+
+
+def rotate_every_two(tensor):  # maps each last-axis pair (x0, x1) to (-x1, x0); the indexing below assumes a 4-D input
+    rotate_half_tensor = jnp.stack((-tensor[:, :, :, 1::2], tensor[:, :, :, ::2]), axis=-1)  # pair negated odd entries with even entries
+    rotate_half_tensor = rotate_half_tensor.reshape(rotate_half_tensor.shape[:-2] + (-1,))  # merge the last two axes, interleaving the pairs
+    return rotate_half_tensor
+
+
+def apply_rotary_pos_emb(tensor, sincos):  # sincos is a (sin, cos) pair, unpacked on the next line
+    sin_pos, cos_pos = sincos
+    sin_pos = sin_pos[:, :, None, :].repeat(2, 3)  # add an axis at index 2, then repeat every entry twice along axis 3
+    cos_pos = cos_pos[:, :, None, :].repeat(2, 3)  # same expansion for the cos part
+    return (tensor * cos_pos) + (rotate_every_two(tensor) * sin_pos)
+
+
+class FlaxGPTJAttention(nn.Module):  # self-attention with rotary position embeddings and an optional key/value cache
+    config: GPTJConfig
+    dtype: jnp.dtype = jnp.float32
+    causal: bool = True  # NOTE(review): not read anywhere inside this class
+    is_cross_attention: bool = False  # NOTE(review): not read anywhere inside this class
+
+    def setup(self):
+        config = self.config
+        self.embed_dim = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = self.embed_dim // self.num_heads
+
+        self.rotary_dim = config.rotary_dim
+
+        dense = partial(  # factory for the four bias-free embed_dim-sized projections below
+            nn.Dense,
+            self.embed_dim,
+            use_bias=False,
+            dtype=self.dtype,
+            kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
+        )
+
+        self.q_proj, self.k_proj, self.v_proj = dense(), dense(), dense()
+        self.out_proj = dense()
+
+        self.resid_dropout = nn.Dropout(rate=config.resid_pdrop)
+
+        self.causal_mask = make_causal_mask(jnp.ones((1, config.max_position_embeddings), dtype="bool"), dtype="bool")  # built once for the maximum length, sliced per call
+
+        pos_embd_dim = self.rotary_dim or self.embed_dim  # falls back to embed_dim when rotary_dim is None (or 0)
+        self.embed_positions = create_sinusoidal_positions(config.max_position_embeddings, pos_embd_dim)
+
+    def _split_heads(self, hidden_states):  # (..., embed_dim) -> (..., num_heads, head_dim), keeping the first two axes
+        return hidden_states.reshape(hidden_states.shape[:2] + (self.num_heads, self.head_dim))
+
+    def _merge_heads(self, hidden_states):  # inverse of _split_heads: collapse everything after the first two axes into embed_dim
+        return hidden_states.reshape(hidden_states.shape[:2] + (self.embed_dim,))
+
+    @nn.compact
+    def _concatenate_to_cache(self, key, value, query, attention_mask):
+        """
+        This function takes projected key, value states from a single input token and concatenates the states to cached
+        states from previous steps. This function is slightly adapted from the official Flax repository:
+        https://github.com/google/flax/blob/491ce18759622506588784b4fca0e4bf05f8c8cd/flax/linen/attention.py#L252
+        """
+        # detect if we're initializing by absence of existing cache data.
+        is_initialized = self.has_variable("cache", "cached_key")
+        cached_key = self.variable("cache", "cached_key", jnp.zeros, key.shape, key.dtype)  # zero-filled with the shape of the incoming key
+        cached_value = self.variable("cache", "cached_value", jnp.zeros, value.shape, value.dtype)
+        cache_index = self.variable("cache", "cache_index", lambda: jnp.array(0, dtype=jnp.int32))  # write offset into the cache, starts at 0
+
+        if is_initialized:
+            *batch_dims, max_length, num_heads, depth_per_head = cached_key.value.shape
+            # update key, value caches with our new 1d spatial slices
+            cur_index = cache_index.value
+            indices = (0,) * len(batch_dims) + (cur_index, 0, 0)  # write at offset cur_index along the length axis
+            key = lax.dynamic_update_slice(cached_key.value, key, indices)
+            value = lax.dynamic_update_slice(cached_value.value, value, indices)
+            cached_key.value = key
+            cached_value.value = value
+            num_updated_cache_vectors = query.shape[1]
+            cache_index.value = cache_index.value + num_updated_cache_vectors
+            # causal mask for cached decoder self-attention: our single query position should only attend to those key
+            # positions that have already been generated and cached, not the remaining zero elements.
+            pad_mask = jnp.broadcast_to(
+                jnp.arange(max_length) < cur_index + num_updated_cache_vectors,
+                tuple(batch_dims) + (1, num_updated_cache_vectors, max_length),
+            )
+            attention_mask = combine_masks(pad_mask, attention_mask)
+        return key, value, attention_mask  # when the cache was just created, the inputs are returned unchanged
+
+    def __call__(
+        self,
+        hidden_states,
+        attention_mask,
+        position_ids,
+        deterministic: bool = True,
+        init_cache: bool = False,
+        output_attentions: bool = False,
+    ):
+        query = self.q_proj(hidden_states)
+        key = self.k_proj(hidden_states)
+        value = self.v_proj(hidden_states)
+
+        query = self._split_heads(query)
+        key = self._split_heads(key)
+        value = self._split_heads(value)
+
+        sincos = jnp.take(self.embed_positions, position_ids, axis=0)  # look up the precomputed table row for every position id
+        sincos = jnp.split(sincos, 2, axis=-1)  # two halves along the last axis: [sin part, cos part]
+        if self.rotary_dim is not None:  # rotate only the first rotary_dim features of each head; the rest pass through
+            k_rot = key[:, :, :, : self.rotary_dim]
+            k_pass = key[:, :, :, self.rotary_dim :]
+
+            q_rot = query[:, :, :, : self.rotary_dim]
+            q_pass = query[:, :, :, self.rotary_dim :]
+
+            k_rot = apply_rotary_pos_emb(k_rot, sincos)
+            q_rot = apply_rotary_pos_emb(q_rot, sincos)
+
+            key = jnp.concatenate([k_rot, k_pass], axis=-1)
+            query = jnp.concatenate([q_rot, q_pass], axis=-1)
+        else:  # no rotary_dim configured: rotate the full feature axis
+            key = apply_rotary_pos_emb(key, sincos)
+            query = apply_rotary_pos_emb(query, sincos)
+
+        query_length, key_length = query.shape[1], key.shape[1]
+
+        if self.has_variable("cache", "cached_key"):  # cached decoding: slice the causal mask starting at the current cache index
+            mask_shift = self.variables["cache"]["cache_index"]
+            max_decoder_length = self.variables["cache"]["cached_key"].shape[1]
+            causal_mask = lax.dynamic_slice(
+                self.causal_mask, (0, 0, mask_shift, 0), (1, 1, query_length, max_decoder_length)
+            )
+        else:
+            causal_mask = self.causal_mask[:, :, :query_length, :key_length]
+
+        batch_size = hidden_states.shape[0]
+        causal_mask = jnp.broadcast_to(causal_mask, (batch_size,) + causal_mask.shape[1:])
+
+        attention_mask = jnp.broadcast_to(jnp.expand_dims(attention_mask, axis=(-3, -2)), causal_mask.shape)  # add two axes so the mask broadcasts to the causal mask's shape
+        attention_mask = combine_masks(attention_mask, causal_mask)
+
+        dropout_rng = None
+        if not deterministic and self.config.attn_pdrop > 0.0:  # an RNG is only requested when attention dropout is active
+            dropout_rng = self.make_rng("dropout")
+
+        # During fast autoregressive decoding, we feed one position at a time,
+        # and cache the keys and values step by step.
+        if self.has_variable("cache", "cached_key") or init_cache:
+            key, value, attention_mask = self._concatenate_to_cache(key, value, query, attention_mask)
+
+        # transform boolean mask into float mask
+        attention_bias = lax.select(  # 0 where attention is allowed, the most negative finite value of dtype where masked
+            attention_mask > 0,
+            jnp.full(attention_mask.shape, 0.0).astype(self.dtype),
+            jnp.full(attention_mask.shape, jnp.finfo(self.dtype).min).astype(self.dtype),
+        )
+
+        # usual dot product attention
+        attn_weights = dot_product_attention_weights(
+            query,
+            key,
+            bias=attention_bias,
+            dropout_rng=dropout_rng,
+            dropout_rate=self.config.attn_pdrop,
+            deterministic=deterministic,
+            dtype=self.dtype,
+            precision=None,
+        )
+
+        attn_output = jnp.einsum("...hqk,...khd->...qhd", attn_weights, value)  # weighted sum of the values for each head and query position
+        attn_output = self._merge_heads(attn_output)
+        attn_output = self.out_proj(attn_output)
+        attn_output = self.resid_dropout(attn_output, deterministic=deterministic)
+
+        outputs = (attn_output, attn_weights) if output_attentions else (attn_output,)  # always a tuple; weights appended on request
+        return outputs
+
+
+class FlaxGPTJMLP(nn.Module):  # feed-forward sub-layer: Dense -> activation -> Dense -> dropout
+    config: GPTJConfig
+    intermediate_size: int  # output width of fc_in
+    dtype: jnp.dtype = jnp.float32
+
+    def setup(self):
+        embed_dim = self.config.hidden_size
+        kernel_init = jax.nn.initializers.normal(self.config.initializer_range)
+
+        self.fc_in = nn.Dense(self.intermediate_size, dtype=self.dtype, kernel_init=kernel_init)
+        self.fc_out = nn.Dense(embed_dim, dtype=self.dtype, kernel_init=kernel_init)  # projects back to hidden_size
+
+        self.act = ACT2FN[self.config.activation_function]  # activation looked up by its configured name
+        self.dropout = nn.Dropout(rate=self.config.resid_pdrop)
+
+    def __call__(self, hidden_states, deterministic: bool = True):
+        hidden_states = self.fc_in(hidden_states)
+        hidden_states = self.act(hidden_states)
+        hidden_states = self.fc_out(hidden_states)
+        hidden_states = self.dropout(hidden_states, deterministic=deterministic)
+        return hidden_states
+
+
+class FlaxGPTJBlock(nn.Module):  # one layer: LayerNorm, then attention and MLP applied to the same normalized input
+    config: GPTJConfig
+    dtype: jnp.dtype = jnp.float32
+
+    def setup(self):
+        hidden_size = self.config.hidden_size
+        inner_dim = self.config.n_inner if self.config.n_inner is not None else 4 * hidden_size  # MLP width defaults to 4x hidden_size
+
+        self.ln_1 = nn.LayerNorm(epsilon=self.config.layer_norm_epsilon, dtype=self.dtype)
+        self.attn = FlaxGPTJAttention(self.config, dtype=self.dtype)
+
+        self.mlp = FlaxGPTJMLP(self.config, inner_dim, dtype=self.dtype)
+
+    def __call__(
+        self,
+        hidden_states,
+        attention_mask=None,
+        position_ids=None,
+        deterministic: bool = True,
+        init_cache: bool = False,
+        output_attentions: bool = False,
+    ):
+        residual = hidden_states  # pre-normalization input, added back at the end
+        hidden_states = self.ln_1(hidden_states)
+        attn_outputs = self.attn(
+            hidden_states,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            deterministic=deterministic,
+            init_cache=init_cache,
+            output_attentions=output_attentions,
+        )
+        attn_output = attn_outputs[0]
+
+        feed_forward_hidden_states = self.mlp(hidden_states, deterministic=deterministic)  # fed the ln_1 output, not the attention output
+        # residual connection
+        hidden_states = attn_output + feed_forward_hidden_states + residual
+
+        return (hidden_states,) + attn_outputs[1:]  # attention weights follow when the attention layer returned them
+
+
+class FlaxGPTJPreTrainedModel(FlaxPreTrainedModel):
+    """
+    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+    models.
+    """
+
+    config_class = GPTJConfig
+    base_model_prefix = "transformer"
+    module_class: nn.Module = None  # overridden by subclasses; instantiated in __init__
+
+    def __init__(
+        self,
+        config: GPTJConfig,
+        input_shape: tuple = (1, 1),
+        seed: int = 0,
+        dtype: jnp.dtype = jnp.float32,
+        _do_init: bool = True,
+        **kwargs,
+    ):
+        module = self.module_class(config=config, dtype=dtype, **kwargs)  # extra kwargs are forwarded to the Flax module
+        super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init)
+
+    def init_weights(self, rng: jax.random.PRNGKey, input_shape: tuple, params: FrozenDict = None) -> FrozenDict:
+        # init input tensors
+        input_ids = jnp.zeros(input_shape, dtype="i4")
+        attention_mask = jnp.ones_like(input_ids)
+        position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_shape)
+        params_rng, dropout_rng = jax.random.split(rng)  # independent RNG streams for parameters and dropout
+        rngs = {"params": params_rng, "dropout": dropout_rng}
+
+        if self.config.add_cross_attention:  # NOTE(review): the module __call__ signatures in this file take (input_ids, attention_mask, position_ids, deterministic, init_cache, ...), so the two extra positionals below would bind to deterministic/init_cache; confirm this branch is reachable
+            encoder_hidden_states = jnp.zeros(input_shape + (self.config.n_embd,))
+            encoder_attention_mask = attention_mask
+            module_init_outputs = self.module.init(
+                rngs,
+                input_ids,
+                attention_mask,
+                position_ids,
+                encoder_hidden_states,
+                encoder_attention_mask,
+                return_dict=False,
+            )
+        else:
+            module_init_outputs = self.module.init(rngs, input_ids, attention_mask, position_ids, return_dict=False)
+
+        random_params = module_init_outputs["params"]
+
+        if params is not None:  # keep the supplied params, filling only the keys recorded in self._missing_keys
+            random_params = flatten_dict(unfreeze(random_params))
+            params = flatten_dict(unfreeze(params))
+            for missing_key in self._missing_keys:
+                params[missing_key] = random_params[missing_key]
+            self._missing_keys = set()  # reset once the missing entries have been filled
+            return freeze(unflatten_dict(params))
+        else:
+            return random_params
+
+    def init_cache(self, batch_size, max_length):
+        r"""
+        Args:
+            batch_size (`int`):
+                batch_size used for fast auto-regressive decoding. Defines the batch size of the initialized cache.
+            max_length (`int`):
+                maximum possible length for auto-regressive decoding. Defines the sequence length of the initialized
+                cache.
+        """
+        # init input variables to retrieve cache
+        input_ids = jnp.ones((batch_size, max_length))  # dummy input; no dtype is given here, the module casts ids to "i4" itself
+        attention_mask = jnp.ones_like(input_ids)
+        position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_ids.shape)
+
+        init_variables = self.module.init(
+            jax.random.PRNGKey(0), input_ids, attention_mask, position_ids, return_dict=False, init_cache=True
+        )
+        return init_variables["cache"]  # only the "cache" collection is returned
+
+    @add_start_docstrings_to_model_forward(GPTJ_INPUTS_DOCSTRING)
+    def __call__(
+        self,
+        input_ids,
+        attention_mask=None,
+        position_ids=None,
+        params: Optional[dict] = None,
+        past_key_values: Optional[dict] = None,
+        dropout_rng: jax.random.PRNGKey = None,
+        train: bool = False,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ):
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions  # None means "use the config default"
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.return_dict
+
+        batch_size, sequence_length = input_ids.shape
+
+        if position_ids is None:
+            if past_key_values is not None:  # positions cannot be derived from the input alone when a cache is supplied
+                raise ValueError("Make sure to provide `position_ids` when passing `past_key_values`.")
+
+            position_ids = jnp.broadcast_to(jnp.arange(sequence_length)[None, :], (batch_size, sequence_length))
+
+        if attention_mask is None:
+            attention_mask = jnp.ones((batch_size, sequence_length))  # default: attend to every input position
+
+        # Handle any PRNG if needed
+        rngs = {}
+        if dropout_rng is not None:
+            rngs["dropout"] = dropout_rng
+
+        inputs = {"params": params or self.params}  # explicit params take precedence over the stored ones
+
+        # If past_key_values is passed, the cache is already initialized: supply it as the "cache" collection and mark it mutable so FlaxGPTJAttention can update it (init_cache is passed as False below).
+        if past_key_values:  # NOTE(review): truthiness here, but "is not None" in the post-processing below; an empty dict would take different branches, confirm intended
+            inputs["cache"] = past_key_values
+            mutable = ["cache"]
+        else:
+            mutable = False
+
+        outputs = self.module.apply(
+            inputs,
+            jnp.array(input_ids, dtype="i4"),
+            jnp.array(attention_mask, dtype="i4"),
+            jnp.array(position_ids, dtype="i4"),
+            not train,  # positional `deterministic` of the modules defined in this file
+            False,  # positional `init_cache`
+            output_attentions,
+            output_hidden_states,
+            return_dict,
+            rngs=rngs,
+            mutable=mutable,
+        )
+
+        # add updated cache to model output
+        if past_key_values is not None and return_dict:
+            outputs, past_key_values = outputs  # with a mutable collection, apply returns (outputs, mutated variables)
+            outputs["past_key_values"] = unfreeze(past_key_values["cache"])
+            return outputs
+        elif past_key_values is not None and not return_dict:
+            outputs, past_key_values = outputs
+            outputs = outputs[:1] + (unfreeze(past_key_values["cache"]),) + outputs[1:]  # cache inserted as the second tuple element
+
+        return outputs
+
+
+class FlaxGPTJBlockCollection(nn.Module):  # runs config.num_hidden_layers FlaxGPTJBlock layers in sequence
+    config: GPTJConfig
+    dtype: jnp.dtype = jnp.float32
+
+    def setup(self):
+        self.blocks = [  # each block is named by its index: "0", "1", ...
+            FlaxGPTJBlock(self.config, name=str(i), dtype=self.dtype) for i in range(self.config.num_hidden_layers)
+        ]
+
+    def __call__(
+        self,
+        hidden_states,
+        attention_mask=None,
+        position_ids=None,
+        deterministic: bool = True,
+        init_cache: bool = False,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,  # accepted but not read in this method; a tuple is always returned
+    ):
+        all_attentions = () if output_attentions else None
+        all_hidden_states = () if output_hidden_states else None
+
+        for block in self.blocks:
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)  # records the input of each block, not its output
+
+            layer_outputs = block(
+                hidden_states,
+                attention_mask,
+                position_ids=position_ids,
+                deterministic=deterministic,
+                init_cache=init_cache,
+                output_attentions=output_attentions,
+            )
+            hidden_states = layer_outputs[0]
+
+            if output_attentions:
+                all_attentions += (layer_outputs[1],)
+
+        # this contains possible `None` values - `FlaxGPTJModule` will filter them out
+        outputs = (hidden_states, all_hidden_states, all_attentions)
+
+        return outputs
+
+
+class FlaxGPTJModule(nn.Module):  # token embedding -> dropout -> block stack -> final LayerNorm
+    config: GPTJConfig
+    dtype: jnp.dtype = jnp.float32
+
+    def setup(self):
+        self.embed_dim = self.config.hidden_size
+
+        self.wte = nn.Embed(  # token embedding table of shape (vocab_size, hidden_size)
+            self.config.vocab_size,
+            self.config.hidden_size,
+            embedding_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
+        )
+        self.dropout = nn.Dropout(rate=self.config.embd_pdrop)
+        self.h = FlaxGPTJBlockCollection(self.config, dtype=self.dtype)
+        self.ln_f = nn.LayerNorm(epsilon=self.config.layer_norm_epsilon, dtype=self.dtype)
+
+    def __call__(
+        self,
+        input_ids,
+        attention_mask,
+        position_ids,
+        deterministic=True,
+        init_cache: bool = False,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+    ):
+        input_embeds = self.wte(input_ids.astype("i4"))  # ids are cast to int32 before the lookup
+
+        hidden_states = self.dropout(input_embeds, deterministic=deterministic)
+
+        outputs = self.h(
+            hidden_states,
+            attention_mask,
+            position_ids=position_ids,
+            deterministic=deterministic,
+            init_cache=init_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        hidden_states = outputs[0]
+        hidden_states = self.ln_f(hidden_states)
+
+        if output_hidden_states:
+            all_hidden_states = outputs[1] + (hidden_states,)  # append the final normalized state to the per-block inputs
+            outputs = (hidden_states, all_hidden_states) + outputs[2:]
+        else:
+            outputs = (hidden_states,) + outputs[1:]
+
+        if not return_dict:
+            return tuple(v for v in outputs if v is not None)  # drop the None placeholders left by the block collection
+
+        return FlaxBaseModelOutput(
+            last_hidden_state=hidden_states,
+            hidden_states=outputs[1],
+            attentions=outputs[-1],
+        )
+
+
+@add_start_docstrings(
+    "The bare GPTJ Model transformer outputting raw hidden-states without any specific head on top.",
+    GPTJ_START_DOCSTRING,
+)
+class FlaxGPTJModel(FlaxGPTJPreTrainedModel):
+    module_class = FlaxGPTJModule
+
+
+append_call_sample_docstring(
+    FlaxGPTJModel,
+    _CHECKPOINT_FOR_DOC,
+    FlaxCausalLMOutput,
+    _CONFIG_FOR_DOC,
+)
+
+
+class FlaxGPTJForCausalLMModule(nn.Module):  # FlaxGPTJModule followed by a Dense projection to vocabulary logits
+    config: GPTJConfig
+    dtype: jnp.dtype = jnp.float32
+
+    def setup(self):
+        self.transformer = FlaxGPTJModule(self.config, dtype=self.dtype)
+        self.lm_head = nn.Dense(  # projects hidden states to vocab_size logits
+            self.config.vocab_size,
+            dtype=self.dtype,
+            kernel_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
+        )
+
+    def __call__(
+        self,
+        input_ids,
+        attention_mask,
+        position_ids,
+        deterministic: bool = True,
+        init_cache: bool = False,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+    ):
+        outputs = self.transformer(
+            input_ids,
+            attention_mask,
+            position_ids,
+            deterministic=deterministic,
+            init_cache=init_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        hidden_states = outputs[0]
+
+        if self.config.tie_word_embeddings:  # NOTE(review): lm_head is built without use_bias=False, yet only "kernel" is supplied below; confirm this branch works when enabled
+            shared_kernel = self.transformer.variables["params"]["wte"]["embedding"].T  # transposed token-embedding matrix reused as the kernel
+            lm_logits = self.lm_head.apply({"params": {"kernel": shared_kernel}}, hidden_states)
+        else:
+            lm_logits = self.lm_head(hidden_states)
+
+        if not return_dict:
+            return (lm_logits,) + outputs[1:]  # logits replace the hidden states as the first tuple element
+
+        return FlaxCausalLMOutput(logits=lm_logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions)
+
+
+@add_start_docstrings(
+    """
+    The GPTJ Model transformer with a language modeling head on top.
+    """,
+    GPTJ_START_DOCSTRING,
+)
+class FlaxGPTJForCausalLM(FlaxGPTJPreTrainedModel):
+    module_class = FlaxGPTJForCausalLMModule  # Flax module instantiated by FlaxGPTJPreTrainedModel.__init__
+
+    def prepare_inputs_for_generation(self, input_ids, max_length, attention_mask: Optional[jax.Array] = None):
+        # initializing the cache
+        batch_size, seq_length = input_ids.shape
+
+        past_key_values = self.init_cache(batch_size, max_length)  # cache sized for max_length positions
+        # Note that usually one would have to put 0's in the attention_mask for x > input_ids.shape[-1] and x < cache_length.
+        # But since GPTJ uses a causal mask, those positions are masked anyways.
+        # Thus we can create a single static attention_mask here, which is more efficient for compilation
+        extended_attention_mask = jnp.ones((batch_size, max_length), dtype="i4")
+        if attention_mask is not None:
+            position_ids = attention_mask.cumsum(axis=-1) - 1  # running count of mask entries minus one gives each position's id
+            extended_attention_mask = lax.dynamic_update_slice(extended_attention_mask, attention_mask, (0, 0))  # copy the given mask into the leading columns
+        else:
+            position_ids = jnp.broadcast_to(jnp.arange(seq_length, dtype="i4")[None, :], (batch_size, seq_length))
+
+        return {
+            "past_key_values": past_key_values,
+            "attention_mask": extended_attention_mask,
+            "position_ids": position_ids,
+        }
+
+    def update_inputs_for_generation(self, model_outputs, model_kwargs):
+        model_kwargs["past_key_values"] = model_outputs.past_key_values  # carry the updated cache into the next step
+        model_kwargs["position_ids"] = model_kwargs["position_ids"][:, -1:] + 1  # next position id = last position id + 1
+        return model_kwargs
+
+
+append_call_sample_docstring(
+    FlaxGPTJForCausalLM,
+    _CHECKPOINT_FOR_DOC,
+    FlaxCausalLMOutput,  # matches the output class returned by FlaxGPTJForCausalLMModule
+    _CONFIG_FOR_DOC,
+)
+
+
+__all__ = ["FlaxGPTJForCausalLM", "FlaxGPTJModel", "FlaxGPTJPreTrainedModel"]
diff --git a/phivenv/Lib/site-packages/transformers/models/gptj/modeling_gptj.py b/phivenv/Lib/site-packages/transformers/models/gptj/modeling_gptj.py
new file mode 100644
index 0000000000000000000000000000000000000000..73666daed0ad45619df301f190720b3545f4e4d8
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/gptj/modeling_gptj.py
@@ -0,0 +1,1238 @@
+# coding=utf-8
+# Copyright 2021 The EleutherAI and HuggingFace Teams. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch GPT-J model."""
+
+import warnings
+from typing import Optional, Union
+
+import torch
+import torch.fx
+import torch.utils.checkpoint
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+
+from ...activations import ACT2FN
+from ...cache_utils import Cache, DynamicCache
+from ...generation import GenerationMixin
+from ...modeling_attn_mask_utils import AttentionMaskConverter
+from ...modeling_flash_attention_utils import flash_attn_supports_top_left_mask, is_flash_attn_available
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import (
+    BaseModelOutputWithPast,
+    CausalLMOutputWithPast,
+    QuestionAnsweringModelOutput,
+    SequenceClassifierOutputWithPast,
+)
+from ...modeling_utils import PreTrainedModel
+from ...utils import (
+    add_start_docstrings,
+    auto_docstring,
+    is_torch_flex_attn_available,
+    is_torch_fx_proxy,
+    logging,
+)
+from ...utils.model_parallel_utils import assert_device_map, get_device_map
+from .configuration_gptj import GPTJConfig
+
+
+if is_torch_flex_attn_available():
+    from torch.nn.attention.flex_attention import BlockMask
+
+    from ...integrations.flex_attention import make_flex_block_causal_mask
+
+
+if is_flash_attn_available():
+    from ...modeling_flash_attention_utils import _flash_attention_forward
+
+
+logger = logging.get_logger(__name__)
+
+
+def create_sinusoidal_positions(num_pos: int, dim: int) -> torch.Tensor:
+    inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2, dtype=torch.int64) / dim))  # one frequency per channel pair
+    angles = (torch.arange(num_pos, dtype=torch.int64).float()[:, None] * inv_freq[None, :]).float()
+    return torch.cat((angles.sin(), angles.cos()), dim=1)  # sines first, then cosines, along dim 1
+
+
+@torch.fx.wrap
+def get_embed_positions(embed_positions, position_ids):
+    return embed_positions.to(device=position_ids.device).repeat(position_ids.size(0), 1, 1)  # one copy per id row
+
+
+def rotate_every_two(x: torch.Tensor) -> torch.Tensor:
+    """Map each consecutive pair `(a, b)` along the last axis of a 4-D tensor to `(-b, a)`."""
+    even, odd = x[:, :, :, ::2], x[:, :, :, 1::2]
+    pairs = torch.stack((-odd, even), dim=-1)
+    return pairs.flatten(-2)  # in einsum notation: rearrange(pairs, '... d j -> ... (d j)')
+
+
+def apply_rotary_pos_emb(tensor: torch.Tensor, sin: torch.Tensor, cos: torch.Tensor) -> torch.Tensor:
+    # Insert a broadcast axis at dim 2 and repeat each entry twice along dim 3, so one value covers a channel pair.
+    sin_wide, cos_wide = (t[:, :, None, :].repeat_interleave(2, dim=3) for t in (sin, cos))
+    return (tensor * cos_wide) + (rotate_every_two(tensor) * sin_wide)
+
+
+class GPTJAttention(nn.Module):
+    """Eager GPT-J self-attention with rotary position embeddings applied to the query and key projections."""
+
+    def __init__(self, config, layer_idx=None):
+        super().__init__()
+        self.config = config
+        max_positions = config.max_position_embeddings
+        self.attn_dropout = nn.Dropout(config.attn_pdrop)
+        self.resid_dropout = nn.Dropout(config.resid_pdrop)
+        self.is_causal = True
+        self.layer_idx = layer_idx
+        if layer_idx is None:
+            logger.warning_once(
+                f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
+                "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
+                "when creating this class."
+            )
+
+        self.embed_dim = config.hidden_size
+        self.num_attention_heads = config.num_attention_heads
+        self.head_dim = self.embed_dim // self.num_attention_heads
+        if self.head_dim * self.num_attention_heads != self.embed_dim:
+            raise ValueError(
+                f"embed_dim must be divisible by num_attention_heads (got `embed_dim`: {self.embed_dim} and"
+                f" `num_attention_heads`: {self.num_attention_heads})."
+            )
+        self.scale_attn = torch.sqrt(torch.tensor(self.head_dim, dtype=torch.float32)).to(torch.get_default_dtype())
+        # `scale_attn` (sqrt of head_dim) divides QK^T in `_attn`; the four projections below carry no bias.
+        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=False)
+        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=False)
+        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=False)
+        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=False)
+        self.rotary_dim = config.rotary_dim
+        pos_embd_dim = self.rotary_dim or self.embed_dim  # falls back to embed_dim when rotary_dim is None or 0
+        self.embed_positions = create_sinusoidal_positions(max_positions, pos_embd_dim)
+
+    def _split_heads(self, tensor, num_attention_heads, attn_head_size, rotary):
+        """
+        Splits hidden dim into (num_attention_heads, attn_head_size); heads are permuted forward unless `rotary`
+        """
+        new_shape = tensor.size()[:-1] + (num_attention_heads, attn_head_size)
+        tensor = tensor.view(new_shape)
+        if rotary:
+            return tensor  # layout left as (..., seq, head, head_features) for the rotary step
+        if len(tensor.shape) == 5:
+            return tensor.permute(0, 1, 3, 2, 4)  # (batch, blocks, head, block_length, head_features)
+        elif len(tensor.shape) == 4:
+            return tensor.permute(0, 2, 1, 3)  # (batch, head, seq_length, head_features)
+        else:
+            raise ValueError(f"Input tensor rank should be one of [4, 5], but is: {len(tensor.shape)}")
+
+    def _merge_heads(self, tensor, num_attention_heads, attn_head_size):
+        """
+        Merges attn_head_size dim and num_attn_heads dim into hidden dim (inverse of the permuting `_split_heads`)
+        """
+        if len(tensor.shape) == 5:
+            tensor = tensor.permute(0, 1, 3, 2, 4).contiguous()
+        elif len(tensor.shape) == 4:
+            tensor = tensor.permute(0, 2, 1, 3).contiguous()
+        else:
+            raise ValueError(f"Input tensor rank should be one of [4, 5], but is: {len(tensor.shape)}")
+        new_shape = tensor.size()[:-2] + (num_attention_heads * attn_head_size,)
+        return tensor.view(new_shape)
+
+    def _attn(
+        self,
+        query,
+        key,
+        value,
+        attention_mask=None,
+        head_mask=None,
+    ):
+        """Return `(attn_output, attn_weights)` for softmax(query @ key^T / scale_attn + mask) applied to `value`."""
+        # Keep the attention weights computation in fp32 to avoid overflow issues
+        query = query.to(torch.float32)
+        key = key.to(torch.float32)
+
+        attn_weights = torch.matmul(query, key.transpose(-1, -2))
+        attn_weights = attn_weights / self.scale_attn
+        # The mask is additive: it is summed into the scores before the softmax, trimmed to the key length.
+        if attention_mask is not None:  # no matter the length, we just slice it
+            causal_mask = attention_mask[:, :, :, : key.shape[-2]]
+            attn_weights = attn_weights + causal_mask
+
+        attn_weights = nn.functional.softmax(attn_weights, dim=-1)
+        attn_weights = attn_weights.to(value.dtype)
+        attn_weights = self.attn_dropout(attn_weights)
+
+        # Mask heads if we want to
+        if head_mask is not None:
+            attn_weights = attn_weights * head_mask
+
+        attn_output = torch.matmul(attn_weights, value)
+        return attn_output, attn_weights
+
+    def _get_embed_positions(self, position_ids):
+        embed_positions = self.embed_positions
+        if embed_positions.device != position_ids.device:
+            embed_positions = embed_positions.to(position_ids.device)
+            self.embed_positions = embed_positions  # keep the moved copy so later calls skip the transfer
+        return embed_positions.repeat(position_ids.shape[0], 1, 1)  # one copy per row of position_ids
+
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        layer_past: Optional[Cache] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = False,
+        output_attentions: Optional[bool] = False,
+        cache_position: Optional[torch.LongTensor] = None,
+    ) -> Union[
+        tuple[torch.Tensor, tuple[torch.Tensor]],
+        Optional[tuple[torch.Tensor, tuple[torch.Tensor], tuple[torch.Tensor, ...]]],
+    ]:
+        """Return `(attn_output, attn_weights)`; `use_cache` and `output_attentions` are accepted but not read here."""
+        query = self.q_proj(hidden_states)
+        key = self.k_proj(hidden_states)
+        value = self.v_proj(hidden_states)
+        query = self._split_heads(query, self.num_attention_heads, self.head_dim, True)
+        key = self._split_heads(key, self.num_attention_heads, self.head_dim, True)
+        value = self._split_heads(value, self.num_attention_heads, self.head_dim, False)
+        # For 3-D `hidden_states`, query/key are still (batch, seq, heads, head_dim); value is already heads-first.
+        if is_torch_fx_proxy(position_ids) or torch.jit.is_tracing():
+            # The logic to conditionally copy to GPU could not be traced, so we do this
+            # every time in the torch.fx case
+            embed_positions = get_embed_positions(self.embed_positions, position_ids)
+        else:
+            embed_positions = self._get_embed_positions(position_ids)
+
+        repeated_position_ids = position_ids.unsqueeze(-1).repeat(1, 1, embed_positions.shape[-1])
+        sincos = torch.gather(embed_positions, 1, repeated_position_ids).to(key.dtype)
+        sin, cos = torch.split(sincos, sincos.shape[-1] // 2, dim=-1)
+        # With `rotary_dim` set, only the first `rotary_dim` channels are rotated; the rest pass through unchanged.
+        if self.rotary_dim is not None:
+            k_rot = key[:, :, :, : self.rotary_dim]
+            k_pass = key[:, :, :, self.rotary_dim :]
+
+            q_rot = query[:, :, :, : self.rotary_dim]
+            q_pass = query[:, :, :, self.rotary_dim :]
+
+            k_rot = apply_rotary_pos_emb(k_rot, sin, cos)
+            q_rot = apply_rotary_pos_emb(q_rot, sin, cos)
+
+            key = torch.cat([k_rot, k_pass], dim=-1)
+            query = torch.cat([q_rot, q_pass], dim=-1)
+        else:
+            key = apply_rotary_pos_emb(key, sin, cos)
+            query = apply_rotary_pos_emb(query, sin, cos)
+        # Move heads ahead of the sequence axis so `key`/`query` match the layout of `value`.
+        key = key.permute(0, 2, 1, 3)
+        query = query.permute(0, 2, 1, 3)
+        # When a cache is supplied, attention below uses the key/value tensors returned by `layer_past.update`.
+        if layer_past is not None:
+            cache_kwargs = {
+                "sin": sin,
+                "cos": cos,
+                "partial_rotation_size": self.rotary_dim,
+                "cache_position": cache_position,
+            }
+            key, value = layer_past.update(key, value, self.layer_idx, cache_kwargs)
+
+        # compute self-attention: V x Softmax(QK^T)
+        attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask)
+
+        attn_output = self._merge_heads(attn_output, self.num_attention_heads, self.head_dim)
+        attn_output = self.out_proj(attn_output)
+        attn_output = self.resid_dropout(attn_output)
+
+        return attn_output, attn_weights
+
+
+class GPTJFlashAttention2(GPTJAttention):
+    """
+    GPTJ flash attention module. This module inherits from `GPTJAttention`, so the weights of the module stay
+    untouched. The only required change is in the forward pass, which needs to correctly call the public API of
+    flash attention and deal with padding tokens in case the input contains any of them.
+    """
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
+        # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignment, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
+        # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
+        self._flash_attn_uses_top_left_mask = flash_attn_supports_top_left_mask()
+
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        layer_past: Optional[Cache] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = False,
+        output_attentions: Optional[bool] = False,
+        cache_position: Optional[torch.LongTensor] = None,
+    ) -> Union[
+        tuple[torch.Tensor, tuple[torch.Tensor]],
+        Optional[tuple[torch.Tensor, tuple[torch.Tensor], tuple[torch.Tensor, ...]]],
+    ]:
+        query = self.q_proj(hidden_states)
+        key = self.k_proj(hidden_states)
+        value = self.v_proj(hidden_states)
+        # `head_mask`, `use_cache` and `output_attentions` are accepted but not read anywhere in this method.
+        query = self._split_heads(query, self.num_attention_heads, self.head_dim, True)
+        key = self._split_heads(key, self.num_attention_heads, self.head_dim, True)
+        value = self._split_heads(value, self.num_attention_heads, self.head_dim, False)
+
+        if is_torch_fx_proxy(position_ids) or torch.jit.is_tracing():
+            # The logic to conditionally copy to GPU could not be traced, so we do this
+            # every time in the torch.fx case
+            embed_positions = get_embed_positions(self.embed_positions, position_ids)
+        else:
+            embed_positions = self._get_embed_positions(position_ids)
+
+        repeated_position_ids = position_ids.unsqueeze(-1).repeat(1, 1, embed_positions.shape[-1])
+        sincos = torch.gather(embed_positions, 1, repeated_position_ids).to(key.dtype)
+        sin, cos = torch.split(sincos, sincos.shape[-1] // 2, dim=-1)
+
+        if self.rotary_dim is not None:
+            k_rot = key[:, :, :, : self.rotary_dim]
+            k_pass = key[:, :, :, self.rotary_dim :]
+
+            q_rot = query[:, :, :, : self.rotary_dim]
+            q_pass = query[:, :, :, self.rotary_dim :]
+
+            k_rot = apply_rotary_pos_emb(k_rot, sin, cos)
+            q_rot = apply_rotary_pos_emb(q_rot, sin, cos)
+
+            key = torch.cat([k_rot, k_pass], dim=-1)
+            query = torch.cat([q_rot, q_pass], dim=-1)
+        else:
+            key = apply_rotary_pos_emb(key, sin, cos)
+            query = apply_rotary_pos_emb(query, sin, cos)
+
+        # transpose to have the desired shape
+        # before transpose: batch_size x seq_length x num_attention_heads x head_dim
+        # after transpose: batch_size x num_attention_heads x seq_length x head_dim
+        key = key.permute(0, 2, 1, 3)
+        query = query.permute(0, 2, 1, 3)
+        # value: batch_size x num_attention_heads x seq_length x head_dim
+
+        if layer_past is not None:
+            cache_kwargs = {
+                "sin": sin,
+                "cos": cos,
+                "partial_rotation_size": self.rotary_dim,
+                "cache_position": cache_position,
+            }
+            key, value = layer_past.update(key, value, self.layer_idx, cache_kwargs)
+
+        # Flash attention requires the input to have the shape
+        # batch_size x seq_length x num_attention_heads x head_dim
+        # so key, query and value are all permuted back from the heads-first layout used for the
+        # cache update above to the sequence-first layout, and made contiguous.
+        key = key.permute(0, 2, 1, 3).contiguous()
+        query = query.permute(0, 2, 1, 3).contiguous()
+        value = value.permute(0, 2, 1, 3).contiguous()
+
+        # In PEFT, the layer norms are usually cast to float32 for training stability reasons,
+        # so the input hidden states get silently cast to float32. Hence, we need to
+        # cast them back to the correct dtype just to be sure everything works as expected.
+        # This might slow down training & inference, so it is recommended not to cast the LayerNorms
+        # to fp32. (LlamaRMSNorm handles it correctly)
+
+        input_dtype = query.dtype
+        device_type = query.device.type if query.device.type != "mps" else "cpu"
+        if input_dtype == torch.float32:
+            if torch.is_autocast_enabled():
+                target_dtype = (
+                    torch.get_autocast_dtype(device_type)
+                    if hasattr(torch, "get_autocast_dtype")
+                    else torch.get_autocast_gpu_dtype()
+                )
+            # Handle the case where the model is quantized
+            elif hasattr(self.config, "_pre_quantization_dtype"):
+                target_dtype = self.config._pre_quantization_dtype
+            else:
+                target_dtype = self.q_proj.weight.dtype
+
+            logger.warning_once(
+                f"The input hidden states seems to be silently casted in float32, this might be related to"
+                f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
+                f" {target_dtype}."
+            )
+
+            query = query.to(target_dtype)
+            key = key.to(target_dtype)
+            value = value.to(target_dtype)
+
+        attention_dropout = self.config.attn_pdrop if self.training else 0.0  # attn_pdrop in gptj
+
+        query_length = query.shape[1]
+
+        # Compute attention. The result is named `attn_weights` but is used below as the attention output.
+        attn_weights = _flash_attention_forward(
+            query,
+            key,
+            value,
+            attention_mask,
+            query_length,
+            dropout=attention_dropout,
+            is_causal=self.is_causal,
+            use_top_left_mask=self._flash_attn_uses_top_left_mask,
+        )
+
+        # Reshape outputs: merge the last two axes into one before the output projection
+        attn_output = attn_weights.reshape(
+            attn_weights.shape[0], attn_weights.shape[1], attn_weights.shape[2] * attn_weights.shape[3]
+        )
+        attn_output = self.out_proj(attn_output)
+        attn_output = self.resid_dropout(attn_output)
+        return attn_output, attn_weights  # second item is the pre-projection attention output, not probabilities
+
+
+GPTJ_ATTENTION_CLASSES = {  # indexed by GPTJBlock with `config._attn_implementation`
+    "eager": GPTJAttention,
+    "flash_attention_2": GPTJFlashAttention2,
+}
+
+
+class GPTJMLP(nn.Module):
+    """Feed-forward sub-layer: linear in, activation, linear out, dropout."""
+
+    def __init__(self, intermediate_size, config):  # in MLP: intermediate_size= 4 * embed_dim
+        super().__init__()
+        model_width = config.n_embd
+        self.fc_in = nn.Linear(model_width, intermediate_size)
+        self.fc_out = nn.Linear(intermediate_size, model_width)
+        # The activation is looked up by name in the ACT2FN table.
+        self.act = ACT2FN[config.activation_function]
+        self.dropout = nn.Dropout(config.resid_pdrop)
+
+    def forward(self, hidden_states: Optional[torch.FloatTensor]) -> torch.FloatTensor:
+        """Apply `fc_in`, the activation, `fc_out` and dropout, in that order."""
+        widened = self.act(self.fc_in(hidden_states))
+        narrowed = self.fc_out(widened)
+        return self.dropout(narrowed)
+
+
+class GPTJBlock(GradientCheckpointingLayer):
+    """One GPT-J layer: a single LayerNorm output feeds both attention and the MLP, summed with the residual."""
+
+    def __init__(self, config, layer_idx=None):
+        super().__init__()
+        mlp_width = 4 * config.n_embd if config.n_inner is None else config.n_inner
+        self.ln_1 = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
+        self.attn = GPTJ_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx)
+        self.mlp = GPTJMLP(mlp_width, config)
+
+    def forward(
+        self,
+        hidden_states: Optional[torch.FloatTensor],
+        layer_past: Optional[Cache] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = False,
+        output_attentions: Optional[bool] = False,
+        cache_position: Optional[torch.LongTensor] = None,
+    ) -> Union[tuple[torch.Tensor], Optional[tuple[torch.Tensor, tuple[torch.FloatTensor, ...]]]]:
+        # Attention and the MLP both read the same normalized input; the raw input is the residual.
+        normed = self.ln_1(hidden_states)
+        attn_outputs, attn_weights = self.attn(
+            hidden_states=normed,
+            layer_past=layer_past,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            cache_position=cache_position,
+        )
+        mlp_outputs = self.mlp(normed)
+        return attn_outputs + mlp_outputs + hidden_states, attn_weights
+
+
+@auto_docstring
+class GPTJPreTrainedModel(PreTrainedModel):
+    config: GPTJConfig
+    base_model_prefix = "transformer"
+    is_parallelizable = True
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["GPTJBlock"]
+    _skip_keys_device_placement = "past_key_values"
+    _supports_flash_attn = True
+    _can_compile_fullgraph = True
+    _supports_param_buffer_assignment = False
+
+    def __init__(self, *inputs, **kwargs):
+        super().__init__(*inputs, **kwargs)
+
+    def _init_weights(self, module):
+        """Draw Linear/Embedding weights from N(0, initializer_range); zero biases; reset LayerNorm to identity."""
+        if isinstance(module, nn.Linear):
+            # Plain normal init here; Mesh Transformer JAX uses truncated_normal for initialization instead
+            # cf https://github.com/pytorch/pytorch/pull/5617
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if (bias := module.bias) is not None:
+                bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if (pad_row := module.padding_idx) is not None:
+                module.weight.data[pad_row].zero_()  # the padding row is zeroed after the normal draw
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+
+
+PARALLELIZE_DOCSTRING = r"""
+    This is an experimental feature and is a subject to change at a moment's notice. Uses a device map to distribute
+    attention modules of the model across several devices. If no device map is given, it will evenly distribute blocks
+    across all devices.
+
+    Args:
+        device_map (`dict[int, list]`, *optional*):
+            A dictionary that maps attention modules to devices. Note that the embedding module and LMHead are always
+            automatically mapped to the first device (for esoteric reasons). That means that the first device should
+            have fewer attention modules mapped to it than other devices. For reference, the GPT-J models have the
+            following number of attention modules:
+
+                - gpt-j-6B: 28
+
+    Example:
+
+    ```python
+    # Here is an example of a device map on a machine with 4 GPUs using gpt-j-6B, which has a total of 28 attention modules:
+    model = GPTJForCausalLM.from_pretrained("EleutherAI/gpt-j-6B")
+    device_map = {
+        0: [0, 1, 2, 3, 4, 5, 6],
+        1: [7, 8, 9, 10, 11, 12, 13],
+        2: [14, 15, 16, 17, 18, 19, 20],
+        3: [21, 22, 23, 24, 25, 26, 27],
+    }
+    model.parallelize(device_map)
+    ```
+"""  # runtime text: passed to `add_start_docstrings` on `GPTJModel.parallelize`
+
+DEPARALLELIZE_DOCSTRING = r"""
+    Moves the model to CPU from a model parallel state.
+
+    Example:
+
+    ```python
+    # On a 4 GPU machine with gpt-j-6B:
+    model = GPTJForCausalLM.from_pretrained("EleutherAI/gpt-j-6B")
+    device_map = {
+        0: [0, 1, 2, 3, 4, 5, 6],
+        1: [7, 8, 9, 10, 11, 12, 13],
+        2: [14, 15, 16, 17, 18, 19, 20],
+        3: [21, 22, 23, 24, 25, 26, 27],
+    }
+    model.parallelize(device_map)  # Splits the model across several devices
+    model.deparallelize()  # Put the model back on cpu and cleans memory by calling torch.cuda.empty_cache()
+    ```
+"""  # runtime text: passed to `add_start_docstrings` on `GPTJModel.deparallelize`
+
+
+@auto_docstring
+class GPTJModel(GPTJPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        # Token embedding, embedding dropout, `config.n_layer` GPTJBlock layers and the final LayerNorm.
+        self.embed_dim = config.n_embd
+        self.vocab_size = config.vocab_size
+        self.wte = nn.Embedding(config.vocab_size, self.embed_dim)
+        self.drop = nn.Dropout(config.embd_pdrop)
+        self.h = nn.ModuleList([GPTJBlock(config, layer_idx=i) for i in range(config.n_layer)])
+        self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)
+
+        # Model parallel (off by default; `parallelize` / `deparallelize` toggle these)
+        self.model_parallel = False
+        self.device_map = None
+        self.gradient_checkpointing = False
+
+        # Initialize weights and apply final processing
+        self.post_init()
+        # Assigned after post_init(): True when the flash_attention_2 implementation is configured.
+        self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
+
+    @add_start_docstrings(PARALLELIZE_DOCSTRING)
+    def parallelize(self, device_map=None):
+        warnings.warn(
+            "`GPTJModel.parallelize` is deprecated and will be removed in v5 of Transformers, you should load your"
+            " model with `device_map='balanced'` in the call to `from_pretrained`. You can also provide your own"
+            " `device_map` but it needs to be a dictionary module_name to device, so for instance {'h.0': 0, 'h.1': 1,"
+            " ...}",
+            FutureWarning,
+        )
+        # Without an explicit map, build one over all visible CUDA devices; either way, validate it.
+        if device_map is None:
+            device_map = get_device_map(len(self.h), range(torch.cuda.device_count()))
+        self.device_map = device_map
+        assert_device_map(self.device_map, len(self.h))
+        self.model_parallel = True
+        device_ids = self.device_map.keys()
+        self.first_device = "cpu" if "cpu" in self.device_map else f"cuda:{min(device_ids)}"
+        self.last_device = f"cuda:{max(device_ids)}"
+        # Embeddings go to the first device, each block to its mapped device, the final norm to the last.
+        self.wte = self.wte.to(self.first_device)
+        for device_id, block_indices in self.device_map.items():
+            target = f"cuda:{device_id}"
+            for block_index in block_indices:
+                self.h[block_index] = self.h[block_index].to(target)
+        self.ln_f = self.ln_f.to(self.last_device)
+
+    @add_start_docstrings(DEPARALLELIZE_DOCSTRING)
+    def deparallelize(self):
+        warnings.warn(
+            "Like `parallelize`, `deparallelize` is deprecated and will be removed in v5 of Transformers.",
+            FutureWarning,
+        )
+        # Reset the model-parallel bookkeeping.
+        self.model_parallel, self.device_map = False, None
+        self.first_device = self.last_device = "cpu"
+        # Move the embeddings, every block and the final norm back to the CPU.
+        self.wte = self.wte.to("cpu")
+        for position, block in enumerate(self.h):
+            self.h[position] = block.to("cpu")
+        self.ln_f = self.ln_f.to("cpu")
+        torch.cuda.empty_cache()
+
+    def get_input_embeddings(self):
+        return self.wte  # the nn.Embedding created in __init__ (or its replacement)
+
+    def set_input_embeddings(self, new_embeddings):
+        self.wte = new_embeddings  # replaces the module that `forward` uses to embed `input_ids`
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Union[Cache, tuple[tuple[torch.Tensor]]]] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+    ) -> Union[tuple, BaseModelOutputWithPast]:
+        r"""
+        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_dim)`, *optional*):
+            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+            is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
+            model's internal embedding lookup matrix.
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        # The XOR is true when both or neither of `input_ids` / `inputs_embeds` were supplied.
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+        if self.gradient_checkpointing and self.training:
+            if use_cache:
+                logger.warning_once(
+                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+                )
+                use_cache = False
+
+        if inputs_embeds is None:
+            inputs_embeds = self.wte(input_ids)
+
+        # TODO (joao): remove this exception in v4.56 -- it exists for users that try to pass a legacy cache
+        if not isinstance(past_key_values, (type(None), Cache)):
+            raise ValueError("The `past_key_values` should be either a `Cache` object or `None`.")
+
+        if use_cache and past_key_values is None:
+            past_key_values = DynamicCache(config=self.config)
+        # Without an explicit `cache_position`, positions continue from the number of tokens already cached.
+        seq_length = inputs_embeds.shape[1]
+        if cache_position is None:
+            past_key_values_length = past_key_values.get_seq_length() if past_key_values is not None else 0
+            cache_position = torch.arange(
+                past_key_values_length, past_key_values_length + seq_length, device=inputs_embeds.device
+            )
+
+        if position_ids is None:
+            position_ids = cache_position.unsqueeze(0)  # adds a leading axis of size 1
+
+        causal_mask = self._update_causal_mask(
+            attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
+        )
+
+        # Prepare head mask if needed
+        # 1.0 in head_mask indicate we keep the head
+        # attention_probs has shape bsz x num_attention_heads x N x N
+        # head_mask has shape n_layer x batch x num_attention_heads x N x N
+        head_mask = self.get_head_mask(head_mask, self.config.n_layer)
+        hidden_states = inputs_embeds
+        # Token type ids, when given, are embedded with the same `wte` table and added to the inputs.
+        if token_type_ids is not None:
+            token_type_ids = token_type_ids.view(-1, seq_length)
+            token_type_embeds = self.wte(token_type_ids)
+            hidden_states = hidden_states + token_type_embeds
+
+        hidden_states = self.drop(hidden_states)
+        output_shape = (-1, seq_length, hidden_states.size(-1))
+
+        all_self_attentions = () if output_attentions else None
+        all_hidden_states = () if output_hidden_states else None
+        for i, block in enumerate(self.h):
+            # Model parallel
+            if self.model_parallel:
+                torch.cuda.set_device(hidden_states.device)
+
+                # Ensure layer_past is on same device as hidden_states (might not be correct)
+                if past_key_values is not None:
+                    for layer in past_key_values.layers:
+                        layer.keys = layer.keys.to(hidden_states.device)
+                        layer.values = layer.values.to(hidden_states.device)
+
+                # Ensure that attention_mask is always on the same device as hidden_states
+                if causal_mask is not None:
+                    causal_mask = causal_mask.to(hidden_states.device)
+                if isinstance(head_mask, torch.Tensor):
+                    head_mask = head_mask.to(hidden_states.device)
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)
+            # The whole cache object goes to every block; `head_mask[i]` selects this layer's mask.
+            outputs = block(
+                hidden_states,
+                layer_past=past_key_values,
+                attention_mask=causal_mask,
+                position_ids=position_ids,
+                head_mask=head_mask[i],
+                use_cache=use_cache,
+                output_attentions=output_attentions,
+                cache_position=cache_position,
+            )
+
+            hidden_states = outputs[0]
+            if output_attentions:
+                all_self_attentions = all_self_attentions + (outputs[1],)
+
+            # Model Parallel: If it's the last layer for that device, put things on the next device
+            if self.model_parallel:
+                for k, v in self.device_map.items():
+                    if i == v[-1] and "cuda:" + str(k) != self.last_device:
+                        hidden_states = hidden_states.to("cuda:" + str(k + 1))  # assumes device k + 1 exists
+
+        hidden_states = self.ln_f(hidden_states)
+
+        hidden_states = hidden_states.view(output_shape)
+        # Add last hidden state
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+        # The tuple form drops every entry that is None.
+        if not return_dict:
+            return tuple(
+                v for v in [hidden_states, past_key_values, all_hidden_states, all_self_attentions] if v is not None
+            )
+
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attentions,
+        )
+
+    def _update_causal_mask(
+        self,
+        attention_mask: Optional[Union[torch.Tensor, "BlockMask"]],
+        input_tensor: torch.Tensor,
+        cache_position: torch.Tensor,
+        past_key_values: Optional[Cache],
+        output_attentions: bool = False,
+    ):  # returns the mask handed to the attention layers, or None on the paths that need no explicit mask
+        if self.config._attn_implementation == "flash_attention_2":
+            if attention_mask is not None and (attention_mask == 0.0).any():  # keep it only if it masks something
+                return attention_mask
+            return None
+        if self.config._attn_implementation == "flex_attention":
+            if isinstance(attention_mask, torch.Tensor):  # tensor masks are converted; anything else passes through
+                attention_mask = make_flex_block_causal_mask(attention_mask)
+            return attention_mask
+
+        # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
+        # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
+        # to infer the attention mask.
+        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+        using_compilable_cache = past_key_values.is_compileable if past_key_values is not None else False
+
+        # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
+        if self.config._attn_implementation == "sdpa" and not using_compilable_cache and not output_attentions:
+            if AttentionMaskConverter._ignore_causal_mask_sdpa(
+                attention_mask,
+                inputs_embeds=input_tensor,
+                past_key_values_length=past_seen_tokens,
+                is_training=self.training,
+            ):
+                return None
+
+        dtype = input_tensor.dtype
+        sequence_length = input_tensor.shape[1]
+        if using_compilable_cache:
+            target_length = past_key_values.get_max_cache_shape()
+        else:
+            target_length = (
+                attention_mask.shape[-1]
+                if isinstance(attention_mask, torch.Tensor)
+                else past_seen_tokens + sequence_length + 1
+            )
+
+        # In case the provided `attention_mask` is 2D (or None), we generate a 4D causal mask here.
+        causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
+            attention_mask,
+            sequence_length=sequence_length,
+            target_length=target_length,
+            dtype=dtype,
+            cache_position=cache_position,
+            batch_size=input_tensor.shape[0],
+        )
+
+        if (
+            self.config._attn_implementation == "sdpa"
+            and attention_mask is not None
+            and attention_mask.device.type in ["cuda", "xpu", "npu"]
+            and not output_attentions
+        ):
+            # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
+            # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
+            # Details: https://github.com/pytorch/pytorch/issues/110213
+            min_dtype = torch.finfo(dtype).min
+            causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)
+
+        return causal_mask
+
+    @staticmethod
+    def _prepare_4d_causal_attention_mask_with_cache_position(
+        attention_mask: Optional[torch.Tensor],
+        sequence_length: int,
+        target_length: int,
+        dtype: torch.dtype,
+        cache_position: torch.Tensor,
+        batch_size: int,
+        **kwargs,
+    ):
+        """
+        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
+        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, returns it unchanged.
+
+        Args:
+            attention_mask (`torch.Tensor` or `None`):
+                A 2D attention mask of shape `(batch_size, key_value_length)`, a 4D attention mask of shape
+                `(batch_size, 1, query_length, key_value_length)`, or `None` to build the causal part only.
+            sequence_length (`int`):
+                The sequence length being processed.
+            target_length (`int`):
+                The target length: when generating with static cache, the mask should be as long as the static cache,
+                to account for the 0 padding, the part of the cache that is not filled yet.
+            dtype (`torch.dtype`):
+                The dtype to use for the 4D attention mask.
+            cache_position (`torch.Tensor`):
+                Indices depicting the position of the input sequence tokens in the sequence.
+            batch_size (`int`):
+                Batch size the generated mask is expanded to.
+        """
+        if attention_mask is not None and attention_mask.dim() == 4:
+            # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
+            causal_mask = attention_mask
+        else:
+            min_dtype = torch.finfo(dtype).min
+            causal_mask = torch.full(
+                (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device
+            )
+            if sequence_length != 1:
+                causal_mask = torch.triu(causal_mask, diagonal=1)
+            causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1)
+            causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
+            if attention_mask is not None:
+                causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
+                mask_length = attention_mask.shape[-1]
+                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
+                    causal_mask.device
+                )
+                padding_mask = padding_mask == 0  # True where the key is causally visible but padded out
+                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
+                    padding_mask, min_dtype
+                )
+
+        return causal_mask
+
+
+@auto_docstring(
+    custom_intro="""
+    The GPT-J Model transformer with a language modeling head on top.
+    """
+)
+class GPTJForCausalLM(GPTJPreTrainedModel, GenerationMixin):
+    _tied_weights_keys = ["lm_head.weight"]
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.transformer = GPTJModel(config)
+        self.lm_head = nn.Linear(config.n_embd, config.vocab_size)
+
+        # Model parallel
+        self.model_parallel = False
+        self.device_map = None
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @add_start_docstrings(PARALLELIZE_DOCSTRING)
+    def parallelize(self, device_map=None):
+        warnings.warn(
+            "`GPTJForCausalLM.parallelize` is deprecated and will be removed in v5 of Transformers, you should load"
+            " your model with `device_map='balanced'` in the call to `from_pretrained`. You can also provide your own"
+            " `device_map` but it needs to be a dictionary module_name to device, so for instance {'transformer.h.0':"
+            " 0, 'transformer.h.1': 1, ...}",
+            FutureWarning,
+        )
+        self.device_map = (
+            get_device_map(len(self.transformer.h), range(torch.cuda.device_count()))
+            if device_map is None
+            else device_map
+        )
+        assert_device_map(self.device_map, len(self.transformer.h))
+        self.transformer.parallelize(self.device_map)
+        self.lm_head = self.lm_head.to(self.transformer.first_device)
+        self.model_parallel = True
+
+    @add_start_docstrings(DEPARALLELIZE_DOCSTRING)
+    def deparallelize(self):
+        warnings.warn(
+            "Like `parallelize`, `deparallelize` is deprecated and will be removed in v5 of Transformers.",
+            FutureWarning,
+        )
+        self.transformer.deparallelize()
+        self.transformer = self.transformer.to("cpu")
+        self.lm_head = self.lm_head.to("cpu")
+        self.model_parallel = False
+        torch.cuda.empty_cache()
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Union[Cache, tuple[tuple[torch.Tensor]]]] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
+    ) -> Union[tuple, CausalLMOutputWithPast]:
+        r"""
+        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_dim)`, *optional*):
+            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+            is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
+            model's internal embedding lookup matrix.
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
+            `labels = input_ids`. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`. All labels set to
+            `-100` are ignored (masked); the loss is only computed for labels in `[0, ..., config.vocab_size - 1]`.
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        transformer_outputs = self.transformer(
+            input_ids,
+            past_key_values=past_key_values,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            cache_position=cache_position,
+        )
+        hidden_states = transformer_outputs[0]
+
+        # Set device for model parallelism
+        if self.model_parallel:
+            torch.cuda.set_device(self.transformer.first_device)
+            hidden_states = hidden_states.to(self.lm_head.weight.device)
+
+        # make sure sampling in fp16 works correctly and
+        # compute loss in fp32 to match with mesh-tf version
+        # https://github.com/EleutherAI/gpt-neo/blob/89ce74164da2fb16179106f54e2269b5da8db333/models/gpt2/gpt2.py#L179
+        lm_logits = self.lm_head(hidden_states).to(torch.float32)
+
+        loss = None
+        if labels is not None:
+            # move labels to correct device to enable model parallelism
+            labels = labels.to(lm_logits.device)
+            # The loss is computed by `self.loss_function`, which receives the unflattened logits and labels
+            loss = self.loss_function(
+                lm_logits,
+                labels,
+                vocab_size=self.config.vocab_size,
+                **kwargs,
+            )
+
+            loss = loss.to(hidden_states.dtype)
+
+        if not return_dict:
+            output = (lm_logits,) + transformer_outputs[1:]
+            return ((loss,) + output) if loss is not None else output
+
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=lm_logits,
+            past_key_values=transformer_outputs.past_key_values,
+            hidden_states=transformer_outputs.hidden_states,
+            attentions=transformer_outputs.attentions,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    The GPT-J Model transformer with a sequence classification head on top (linear layer).
+
+    [`GPTJForSequenceClassification`] uses the last token in order to do the classification, as other causal models
+    (e.g. GPT, GPT-2, GPT-Neo) do.
+
+    Since it does classification on the last token, it requires to know the position of the last token. If a
+    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
+    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
+    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
+    each row of the batch).
+    """
+)
+class GPTJForSequenceClassification(GPTJPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+        self.transformer = GPTJModel(config)
+        self.score = nn.Linear(config.n_embd, self.num_labels, bias=False)
+
+        # Model parallel
+        self.model_parallel = False
+        self.device_map = None
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Union[Cache, tuple[tuple[torch.Tensor]]]] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, SequenceClassifierOutputWithPast]:
+        r"""
+        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_dim)`, *optional*):
+            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+            is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
+            model's internal embedding lookup matrix.
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        transformer_outputs = self.transformer(
+            input_ids,
+            past_key_values=past_key_values,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        hidden_states = transformer_outputs[0]
+        logits = self.score(hidden_states)  # per-position scores; one position per row is selected below
+
+        if input_ids is not None:
+            batch_size = input_ids.shape[0]
+        else:
+            batch_size = inputs_embeds.shape[0]
+
+        if self.config.pad_token_id is None and batch_size != 1:
+            raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
+        if self.config.pad_token_id is None:
+            last_non_pad_token = -1  # no pad token configured: use the final position
+        elif input_ids is not None:
+            # To handle both left- and right- padding, we take the rightmost token that is not equal to pad_token_id
+            non_pad_mask = (input_ids != self.config.pad_token_id).to(logits.device, torch.int32)
+            token_indices = torch.arange(input_ids.shape[-1], device=logits.device, dtype=torch.int32)
+            last_non_pad_token = (token_indices * non_pad_mask).argmax(-1)  # rightmost non-pad index per row
+        else:
+            last_non_pad_token = -1  # padding cannot be detected from embeddings: use the final position
+            logger.warning_once(
+                f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
+                "unexpected if using padding tokens in conjunction with `inputs_embeds.`"
+            )
+
+        pooled_logits = logits[torch.arange(batch_size, device=logits.device), last_non_pad_token]
+
+        loss = None
+        if labels is not None:
+            labels = labels.to(pooled_logits.device)
+            if self.config.problem_type is None:  # infer the problem type once and store it on the config
+                if self.num_labels == 1:
+                    self.config.problem_type = "regression"
+                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
+                    self.config.problem_type = "single_label_classification"
+                else:
+                    self.config.problem_type = "multi_label_classification"
+
+            if self.config.problem_type == "regression":
+                loss_fct = MSELoss()
+                if self.num_labels == 1:
+                    loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
+                else:
+                    loss = loss_fct(pooled_logits, labels)
+            elif self.config.problem_type == "single_label_classification":
+                loss_fct = CrossEntropyLoss()
+                loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
+            elif self.config.problem_type == "multi_label_classification":
+                loss_fct = BCEWithLogitsLoss()
+                loss = loss_fct(pooled_logits, labels)
+        if not return_dict:
+            output = (pooled_logits,) + transformer_outputs[1:]
+            return ((loss,) + output) if loss is not None else output
+
+        return SequenceClassifierOutputWithPast(
+            loss=loss,
+            logits=pooled_logits,
+            past_key_values=transformer_outputs.past_key_values,
+            hidden_states=transformer_outputs.hidden_states,
+            attentions=transformer_outputs.attentions,
+        )
+
+
+@auto_docstring
+class GPTJForQuestionAnswering(GPTJPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+        self.transformer = GPTJModel(config)
+        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
+
+        # Model parallel
+        self.model_parallel = False
+        self.device_map = None
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        start_positions: Optional[torch.LongTensor] = None,
+        end_positions: Optional[torch.LongTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, QuestionAnsweringModelOutput]:
+        r"""
+        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_dim)`, *optional*):
+            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+            is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
+            model's internal embedding lookup matrix.
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.transformer(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        sequence_output = outputs[0]
+
+        logits = self.qa_outputs(sequence_output)
+        start_logits, end_logits = logits.split(1, dim=-1)
+        start_logits = start_logits.squeeze(-1).contiguous()
+        end_logits = end_logits.squeeze(-1).contiguous()
+
+        total_loss = None
+        if start_positions is not None and end_positions is not None:
+            # If we are on multi-GPU, split add a dimension
+            if len(start_positions.size()) > 1:
+                start_positions = start_positions.squeeze(-1).to(start_logits.device)
+            if len(end_positions.size()) > 1:
+                end_positions = end_positions.squeeze(-1).to(end_logits.device)
+            # sometimes the start/end positions are outside our model inputs, we ignore these terms
+            ignored_index = start_logits.size(1)
+            start_positions = start_positions.clamp(0, ignored_index)
+            end_positions = end_positions.clamp(0, ignored_index)
+
+            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
+            start_loss = loss_fct(start_logits, start_positions)
+            end_loss = loss_fct(end_logits, end_positions)
+            total_loss = (start_loss + end_loss) / 2
+
+        if not return_dict:
+            output = (start_logits, end_logits) + outputs[2:]
+            return ((total_loss,) + output) if total_loss is not None else output
+
+        return QuestionAnsweringModelOutput(
+            loss=total_loss,
+            start_logits=start_logits,
+            end_logits=end_logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+__all__ = [  # names exported by a star-import of this module
+    "GPTJForCausalLM",
+    "GPTJForQuestionAnswering",
+    "GPTJForSequenceClassification",
+    "GPTJModel",
+    "GPTJPreTrainedModel",
+]
diff --git a/phivenv/Lib/site-packages/transformers/models/gptj/modeling_tf_gptj.py b/phivenv/Lib/site-packages/transformers/models/gptj/modeling_tf_gptj.py
new file mode 100644
index 0000000000000000000000000000000000000000..0ec32258223cea695705b1d07840919ef84c9984
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/gptj/modeling_tf_gptj.py
@@ -0,0 +1,1094 @@
+# coding=utf-8
+# Copyright 2022 The EleutherAI and HuggingFace Teams. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""TF 2.0 GPT-J model."""
+
+from __future__ import annotations
+
+import numpy as np
+import tensorflow as tf
+
+from ...activations_tf import get_tf_activation
+from ...file_utils import (
+    add_code_sample_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+)
+from ...modeling_tf_outputs import (
+    TFBaseModelOutputWithPast,
+    TFCausalLMOutputWithPast,
+    TFQuestionAnsweringModelOutput,
+    TFSequenceClassifierOutputWithPast,
+)
+from ...modeling_tf_utils import (
+    TFCausalLanguageModelingLoss,
+    TFModelInputType,
+    TFPreTrainedModel,
+    TFQuestionAnsweringLoss,
+    TFSequenceClassificationLoss,
+    TFSharedEmbeddings,
+    get_initializer,
+    keras,
+    keras_serializable,
+    unpack_inputs,
+)
+from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax
+from ...utils import logging
+from .configuration_gptj import GPTJConfig
+
+
+logger = logging.get_logger(__name__)  # module-level logger from the package's logging utilities
+
+_CHECKPOINT_FOR_DOC = "EleutherAI/gpt-j-6B"  # NOTE(review): presumably used by docstring helpers; not visible here
+_CONFIG_FOR_DOC = "GPTJConfig"  # NOTE(review): same assumption — confirm where it is consumed
+
+
+def create_sinusoidal_positions(num_pos: int, dim: int) -> tf.Tensor:
+    inv_freq = tf.cast(1.0 / (10000 ** (tf.range(0, dim, 2) / dim)), tf.float32)
+    sinusoid_inp = tf.cast(tf.einsum("i , j -> i j", tf.range(num_pos, dtype=tf.float32), inv_freq), tf.float32)
+    sin, cos = tf.sin(sinusoid_inp), tf.cos(sinusoid_inp)
+    out = tf.concat((sin, cos), axis=1)
+    return out
+
+
+def rotate_every_two(x: tf.Tensor) -> tf.Tensor:
+    rotate_half_tensor = tf.stack((-x[:, :, :, 1::2], x[:, :, :, ::2]), axis=-1)
+    new_shape = shape_list(rotate_half_tensor)[:-2] + [tf.math.reduce_prod(shape_list(rotate_half_tensor)[-2:])]
+    rotate_half_tensor = tf.reshape(rotate_half_tensor, new_shape)
+    return rotate_half_tensor
+
+
+def apply_rotary_pos_emb(tensor: tf.Tensor, sincos: tf.Tensor) -> tf.Tensor:
+    sin_pos, cos_pos = sincos
+    sin_pos = tf.repeat(sin_pos[:, :, None, :], 2, 3)
+    cos_pos = tf.repeat(cos_pos[:, :, None, :], 2, 3)
+    return (tensor * cos_pos) + (rotate_every_two(tensor) * sin_pos)
+
+
+class TFGPTJAttention(keras.layers.Layer):
+    def __init__(self, config: GPTJConfig, **kwargs):  # projections, dropouts, causal mask, position table
+        super().__init__(**kwargs)
+
+        self.embed_dim = config.hidden_size
+        self.num_attention_heads = config.num_attention_heads
+        self.head_dim = self.embed_dim // self.num_attention_heads
+        if self.head_dim * self.num_attention_heads != self.embed_dim:
+            raise ValueError(
+                f"embed_dim must be divisible by num_attention_heads (got `embed_dim`: {self.embed_dim} and"
+                f" `num_attention_heads`: {self.num_attention_heads})."
+            )
+        self.scale_attn = self.head_dim**0.5  # divisor applied to the attention scores in `_attn`
+        self.rotary_dim = config.rotary_dim
+
+        self.attn_dropout = keras.layers.Dropout(config.attn_pdrop)
+        self.resid_dropout = keras.layers.Dropout(config.resid_pdrop)
+
+        self.q_proj = keras.layers.Dense(
+            self.embed_dim,
+            use_bias=False,
+            kernel_initializer=get_initializer(config.initializer_range),
+            name="q_proj",
+        )
+        self.k_proj = keras.layers.Dense(
+            self.embed_dim,
+            use_bias=False,
+            kernel_initializer=get_initializer(config.initializer_range),
+            name="k_proj",
+        )
+        self.v_proj = keras.layers.Dense(
+            self.embed_dim,
+            use_bias=False,
+            kernel_initializer=get_initializer(config.initializer_range),
+            name="v_proj",
+        )
+        self.out_proj = keras.layers.Dense(
+            self.embed_dim,
+            use_bias=False,
+            kernel_initializer=get_initializer(config.initializer_range),
+            name="out_proj",
+        )
+
+        self.max_positions = config.max_position_embeddings
+        self.lower_triangle_mask = tf.reshape(  # int8 lower-triangular ones of shape (1, 1, N, N), N = max_positions
+            tf.cast(tf.experimental.numpy.tril(tf.ones((self.max_positions, self.max_positions))), tf.int8),
+            (1, 1, self.max_positions, self.max_positions),
+        )
+        pos_embd_dim = self.rotary_dim or self.embed_dim  # falls back to embed_dim when rotary_dim is None or 0
+        self.embed_positions = create_sinusoidal_positions(self.max_positions, pos_embd_dim)
+
+    def get_causal_mask(self, key_length, query_length) -> tf.Tensor:  # bool (1, 1, query_length, key_length) slice
+        return tf.cast(self.lower_triangle_mask[:, :, key_length - query_length : key_length, :key_length], tf.bool)
+
+    @staticmethod
+    def get_masked_bias(dtype: tf.DType) -> tf.Tensor:
+        return tf.cast(tf.constant(-1e9), dtype)  # large negative score that `_attn` writes into masked positions
+
+    def _split_heads(self, hidden_states: tf.Tensor, rotary: bool) -> tf.Tensor:
+        """
+        Split the hidden dim into `(num_attention_heads, head_dim)`; with `rotary=True` the head axis is not moved
+        """
+        new_shape = shape_list(hidden_states)[:-1] + [self.num_attention_heads, self.head_dim]
+        hidden_states = tf.reshape(hidden_states, new_shape)
+        if rotary:
+            return hidden_states
+        if len(shape_list(hidden_states)) == 4:
+            return tf.transpose(hidden_states, (0, 2, 1, 3))  # (batch, head, seq_length, head_features)
+        if len(shape_list(hidden_states)) == 5:
+            return tf.transpose(hidden_states, (0, 1, 3, 2, 4))  # (batch, blocks, head, block_length, head_features)
+        raise ValueError(f"Input tensor rank should be one of [4, 5], but is: {len(shape_list(hidden_states))}")
+
+    def _merge_heads(self, hidden_states: tf.Tensor) -> tf.Tensor:
+        """
+        Swap the head axis back next to the feature axis, then merge both into a single hidden dim
+        """
+        if len(shape_list(hidden_states)) == 4:
+            hidden_states = tf.transpose(hidden_states, (0, 2, 1, 3))
+        elif len(shape_list(hidden_states)) == 5:
+            hidden_states = tf.transpose(hidden_states, (0, 1, 3, 2, 4))
+        else:
+            raise ValueError(f"Input tensor rank should be one of [4, 5], but is: {len(shape_list(hidden_states))}")
+        new_shape = shape_list(hidden_states)[:-2] + [self.num_attention_heads * self.head_dim]
+        return tf.reshape(hidden_states, new_shape)
+
+    def _attn(
+        self,
+        query: tf.Tensor,
+        key: tf.Tensor,
+        value: tf.Tensor,
+        attention_mask: tf.Tensor | None = None,
+        head_mask: tf.Tensor | None = None,
+    ) -> tuple[tf.Tensor, tf.Tensor]:  # (attention output, attention weights after dropout and head masking)
+        # slice the causal mask for these query/key lengths out of the precomputed buffer
+        query_length, key_length = shape_list(query)[-2], shape_list(key)[-2]
+        causal_mask = self.get_causal_mask(key_length, query_length)
+
+        # Keep the attention weights computation in fp32 to avoid overflow issues
+        query = tf.cast(query, tf.float32)
+        key = tf.cast(key, tf.float32)
+
+        attn_weights = tf.matmul(query, key, transpose_b=True)
+        attn_weights = tf.where(causal_mask, attn_weights, self.get_masked_bias(attn_weights.dtype))
+
+        attn_weights = attn_weights / self.scale_attn  # scaling is applied after the causal fill
+
+        if attention_mask is not None:
+            # Apply the attention mask (it is added to the scores, so it is expected in additive form)
+            attn_weights = attn_weights + attention_mask
+
+        attn_weights = stable_softmax(attn_weights, axis=-1)
+        attn_weights = tf.cast(attn_weights, value.dtype)
+        attn_weights = self.attn_dropout(attn_weights)
+
+        # Mask heads if we want to
+        if head_mask is not None:
+            attn_weights = attn_weights * head_mask
+
+        attn_output = tf.matmul(attn_weights, value)
+
+        return attn_output, attn_weights
+
+    def call(
+        self,
+        hidden_states: tf.Tensor,
+        layer_past: tuple[tf.Tensor, tf.Tensor] | None = None,
+        attention_mask: tf.Tensor | None = None,
+        position_ids: tf.Tensor | None = None,
+        head_mask: tf.Tensor | None = None,
+        use_cache: bool = False,
+        output_attentions: bool = False,
+    ):
+        """
+        Self-attention over `hidden_states` with rotary position embeddings and an optional key/value cache.
+
+        `position_ids` is handed straight to `tf.gather` on `self.embed_positions`, so callers are expected to
+        supply it. `layer_past` is a `(past_key, past_value)` pair prepended to the new keys/values on axis -2.
+
+        Returns `(attn_output, present)`, plus `attn_weights` when `output_attentions` is truthy; `present` is
+        `(key, value)` only when `use_cache is True`, otherwise `None`.
+        """
+        # The boolean flag is forwarded to `_split_heads` unchanged: True for query/key, False for value.
+        query = self._split_heads(self.q_proj(hidden_states), True)
+        key = self._split_heads(self.k_proj(hidden_states), True)
+        value = self._split_heads(self.v_proj(hidden_states), False)
+
+        # Gather the position-table rows for `position_ids` and split them in two along the last axis.
+        sincos = tf.gather(self.embed_positions, position_ids, axis=0)
+        sincos = tf.split(tf.cast(sincos, hidden_states.dtype), 2, axis=-1)
+
+        rotary_dim = self.rotary_dim
+        if rotary_dim is None:
+            key = apply_rotary_pos_emb(key, sincos)
+            query = apply_rotary_pos_emb(query, sincos)
+        else:
+            # Rotate only the first `rotary_dim` features; the remaining ones are concatenated back as-is.
+            k_rot = apply_rotary_pos_emb(key[:, :, :, :rotary_dim], sincos)
+            q_rot = apply_rotary_pos_emb(query[:, :, :, :rotary_dim], sincos)
+            key = tf.concat((k_rot, key[:, :, :, rotary_dim:]), axis=-1)
+            query = tf.concat((q_rot, query[:, :, :, rotary_dim:]), axis=-1)
+
+        perm = (0, 2, 1, 3)
+        key = tf.transpose(key, perm)
+        query = tf.transpose(query, perm)
+
+        if layer_past is not None:
+            cached_key = layer_past[0]
+            cached_value = layer_past[1]
+            key = tf.concat((cached_key, key), axis=-2)
+            value = tf.concat((cached_value, value), axis=-2)
+
+        # Note the identity test: only a literal `True` makes this layer return a cache entry.
+        present = (key, value) if use_cache is True else None
+
+        # compute self-attention: V x Softmax(QK^T)
+        attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask)
+
+        attn_output = self._merge_heads(attn_output)
+        attn_output = self.resid_dropout(self.out_proj(attn_output))
+
+        # a, present, (attentions)
+        if output_attentions:
+            return attn_output, present, attn_weights
+        return attn_output, present
+
+    def build(self, input_shape=None):
+        """
+        Build the q/k/v/out projection layers, each inside its own name scope, with `self.embed_dim` as the
+        last input dimension. `input_shape` is not used, and repeated calls return immediately.
+        """
+        if self.built:
+            return
+        self.built = True
+
+        # A projection attribute that is missing or None is skipped.
+        for attr in ("q_proj", "k_proj", "v_proj", "out_proj"):
+            proj = getattr(self, attr, None)
+            if proj is None:
+                continue
+            with tf.name_scope(proj.name):
+                proj.build([None, None, self.embed_dim])
+
+
+class TFGPTJMLP(keras.layers.Layer):
+    """Feed-forward sub-layer: `fc_in` -> activation -> `fc_out` -> dropout."""
+
+    def __init__(self, intermediate_size: int, config: GPTJConfig, **kwargs):
+        super().__init__(**kwargs)
+        self.embed_dim = config.n_embd
+        self.intermediate_size = intermediate_size
+        self.fc_in = keras.layers.Dense(
+            intermediate_size,
+            kernel_initializer=get_initializer(config.initializer_range),
+            name="fc_in",
+        )
+        self.fc_out = keras.layers.Dense(
+            config.n_embd,
+            kernel_initializer=get_initializer(config.initializer_range),
+            name="fc_out",
+        )
+        self.act = get_tf_activation(config.activation_function)
+        self.dropout = keras.layers.Dropout(config.embd_pdrop)
+
+    def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
+        hidden_states = self.act(self.fc_in(hidden_states))
+        return self.dropout(self.fc_out(hidden_states))
+
+    def build(self, input_shape=None):
+        # Each Dense layer is built under its own name scope; `input_shape` is not used.
+        if self.built:
+            return
+        self.built = True
+        for attr, in_dim in (("fc_in", self.embed_dim), ("fc_out", self.intermediate_size)):
+            dense = getattr(self, attr, None)
+            if dense is not None:
+                with tf.name_scope(dense.name):
+                    dense.build([None, None, in_dim])
+
+
+class TFGPTJBlock(keras.layers.Layer):
+    """One GPT-J layer: attention and MLP both read the `ln_1` output and are added to the residual."""
+
+    def __init__(self, config: GPTJConfig, **kwargs):
+        super().__init__(**kwargs)
+        # Fall back to 4 * n_embd when the config leaves `n_inner` unset.
+        if config.n_inner is None:
+            inner_dim = 4 * config.n_embd
+        else:
+            inner_dim = config.n_inner
+        self.ln_1 = keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_1")
+        self.attn = TFGPTJAttention(config, name="attn")
+        self.mlp = TFGPTJMLP(inner_dim, config, name="mlp")
+        self.config = config
+
+    def call(
+        self,
+        hidden_states: tf.Tensor,
+        layer_past: tf.Tensor | None = None,
+        attention_mask: tf.Tensor | None = None,
+        position_ids: tf.Tensor | None = None,
+        head_mask: tf.Tensor | None = None,
+        use_cache: bool = False,
+        output_attentions: bool = False,
+    ):
+        """Apply the layer and return `(hidden_states, [present], [attentions])`."""
+        residual = hidden_states
+        normed = self.ln_1(hidden_states)
+        attn_outputs = self.attn(
+            hidden_states=normed,
+            layer_past=layer_past,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+        )  # attn_output, present, (attentions)
+        attn_output, extras = attn_outputs[0], attn_outputs[1:]
+
+        # The MLP consumes the same normalized input as attention, not the attention output.
+        hidden_states = attn_output + self.mlp(normed) + residual
+        if not use_cache:
+            extras = extras[1:]  # drop the cache slot
+        return (hidden_states,) + extras
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        sub_layers = (("ln_1", [None, None, self.config.n_embd]), ("attn", None), ("mlp", None))
+        for attr, shape in sub_layers:
+            layer = getattr(self, attr, None)
+            if layer is not None:
+                with tf.name_scope(layer.name):
+                    layer.build(shape)
+
+
+@keras_serializable
+class TFGPTJMainLayer(keras.layers.Layer):
+    """GPT-J body: token embedding and dropout, `config.n_layer` `TFGPTJBlock`s, then a final layer norm."""
+
+    config_class = GPTJConfig
+
+    def __init__(self, config: GPTJConfig, *inputs, **kwargs):
+        super().__init__(*inputs, **kwargs)
+
+        self.config = config
+        self.output_attentions = config.output_attentions
+        self.output_hidden_states = config.output_hidden_states
+        self.use_cache = config.use_cache
+        self.return_dict = config.use_return_dict
+
+        self.num_hidden_layers = config.n_layer
+        self.n_embd = config.n_embd
+        self.n_positions = config.n_positions
+        self.initializer_range = config.initializer_range
+
+        self.wte = TFSharedEmbeddings(
+            config.vocab_size, config.hidden_size, initializer_range=config.initializer_range, name="wte"
+        )
+        self.drop = keras.layers.Dropout(config.embd_pdrop)
+        self.h = [TFGPTJBlock(config, name=f"h_._{i}") for i in range(config.n_layer)]
+        self.ln_f = keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_f")
+        self.embed_dim = config.n_embd
+
+    def get_input_embeddings(self):
+        return self.wte
+
+    def set_input_embeddings(self, value: tf.Tensor):
+        self.wte.weight = value
+        self.wte.vocab_size = shape_list(value)[0]
+
+    def _prune_heads(self, heads_to_prune):
+        """
+        Not implemented: always raises. heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
+        """
+        raise NotImplementedError
+
+    @unpack_inputs
+    def call(
+        self,
+        input_ids=None,
+        past_key_values=None,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+        use_cache=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+        training=False,
+    ) -> TFBaseModelOutputWithPast | tuple[tf.Tensor]:
+        """
+        Embed the inputs, run them through every block and apply the final layer norm. Exactly one of
+        `input_ids` / `inputs_embeds` must be given, and passing a `head_mask` raises `NotImplementedError`.
+        """
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+        elif input_ids is not None:
+            input_shape = shape_list(input_ids)
+            input_ids = tf.reshape(input_ids, [-1, input_shape[-1]])
+        elif inputs_embeds is not None:
+            input_shape = shape_list(inputs_embeds)[:-1]
+        else:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+        if past_key_values is None:
+            past_length = 0
+            past_key_values = [None] * len(self.h)
+        else:
+            past_length = shape_list(past_key_values[0][0])[-2]
+
+        if position_ids is None:
+            position_ids = tf.expand_dims(tf.range(past_length, input_shape[-1] + past_length), axis=0)
+
+        if attention_mask is not None:
+            # Reshape the 2D mask [batch_size, to_seq_length] to 4D [batch_size, 1, 1, to_seq_length] so that
+            # it broadcasts against scores of shape [batch_size, num_heads, from_seq_length, to_seq_length].
+            # The triangular (causal) masking is not handled here; it is applied inside the attention layer.
+            attention_mask_shape = shape_list(attention_mask)
+            attention_mask = tf.reshape(attention_mask, (attention_mask_shape[0], 1, 1, attention_mask_shape[1]))
+
+            # attention_mask is 1.0 for positions we want to attend and 0.0 for masked positions; the
+            # operation below turns that into 0.0 and -10000.0 respectively. Since the result is added to
+            # the raw scores before the softmax, this is effectively the same as removing the masked
+            # positions entirely.
+            one_cst = tf.constant(1.0)
+            attention_mask = tf.cast(attention_mask, dtype=one_cst.dtype)
+            attention_mask = tf.multiply(tf.subtract(one_cst, attention_mask), tf.constant(-10000.0))
+
+        # Head masking is not supported by this layer: any non-None `head_mask` raises NotImplementedError.
+        # Otherwise a list holding one `None` per hidden layer is built so that the block loop below can
+        # index it uniformly.
+        if head_mask is not None:
+            raise NotImplementedError
+        else:
+            head_mask = [None] * self.num_hidden_layers
+
+        position_ids = tf.reshape(position_ids, [-1, shape_list(position_ids)[-1]])
+
+        if inputs_embeds is None:
+            check_embeddings_within_bounds(input_ids, self.wte.vocab_size)
+            inputs_embeds = self.wte(input_ids, mode="embedding")
+
+        if token_type_ids is not None:
+            token_type_ids = tf.reshape(token_type_ids, [-1, shape_list(token_type_ids)[-1]])
+            token_type_embeds = self.wte(token_type_ids, mode="embedding")
+        else:
+            token_type_embeds = tf.constant(0.0)
+
+        token_type_embeds = tf.cast(token_type_embeds, dtype=inputs_embeds.dtype)
+        hidden_states = inputs_embeds + token_type_embeds
+        hidden_states = self.drop(hidden_states, training=training)
+
+        output_shape = input_shape + [shape_list(hidden_states)[-1]]
+
+        presents = () if use_cache else None
+        all_attentions = () if output_attentions else None
+        all_hidden_states = () if output_hidden_states else None
+        for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)):
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (tf.reshape(hidden_states, output_shape),)
+
+            outputs = block(
+                hidden_states=hidden_states,
+                layer_past=layer_past,
+                attention_mask=attention_mask,
+                position_ids=position_ids,
+                head_mask=head_mask[i],
+                use_cache=use_cache,
+                output_attentions=output_attentions,
+                training=training,
+            )
+
+            hidden_states = outputs[0]
+            if use_cache:
+                presents = presents + (outputs[1],)
+
+            if output_attentions:
+                all_attentions = all_attentions + (outputs[2 if use_cache else 1],)
+
+        hidden_states = self.ln_f(hidden_states)
+
+        hidden_states = tf.reshape(hidden_states, output_shape)
+        # Add last hidden state
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+
+        if output_attentions:
+            # let the number of heads free (-1) so we can extract attention even after head pruning
+            attention_output_shape = input_shape[:-1] + [-1] + shape_list(all_attentions[0])[-2:]
+            all_attentions = tuple(tf.reshape(t, attention_output_shape) for t in all_attentions)
+
+        if not return_dict:
+            return tuple(v for v in [hidden_states, presents, all_hidden_states, all_attentions] if v is not None)
+
+        return TFBaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=presents,
+            hidden_states=all_hidden_states,
+            attentions=all_attentions,
+        )
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "wte", None) is not None:
+            with tf.name_scope(self.wte.name):
+                self.wte.build(None)
+        if getattr(self, "ln_f", None) is not None:
+            with tf.name_scope(self.ln_f.name):
+                self.ln_f.build([None, None, self.embed_dim])
+        if getattr(self, "h", None) is not None:
+            for layer in self.h:
+                with tf.name_scope(layer.name):
+                    layer.build(None)
+
+
+class TFGPTJPreTrainedModel(TFPreTrainedModel):
+    """
+    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+    models. In this file it only pins the class-level attributes below; the rest is inherited from `TFPreTrainedModel`.
+    """
+
+    config_class = GPTJConfig
+    base_model_prefix = "transformer"
+    # Regex patterns ('.' is unescaped) for the authorized unexpected layers when a TF model is loaded from a PT model
+    _keys_to_ignore_on_load_unexpected = [r"h.\d+.attn.bias"]
+
+
+GPTJ_START_DOCSTRING = r"""
+
+    This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.)
+
+    This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
+    as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
+    behavior.
+
+    <Tip>
+
+    TensorFlow models and layers in `transformers` accept two formats as input:
+
+    - having all inputs as keyword arguments (like PyTorch models), or
+    - having all inputs as a list, tuple or dict in the first positional argument.
+
+    The reason the second format is supported is that Keras methods prefer this format when passing inputs to models
+    and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just
+    pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second
+    format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with
+    the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first
+    positional argument:
+
+    - a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
+    - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
+    `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
+    - a dictionary with one or several input Tensors associated to the input names given in the docstring:
+    `model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
+
+    Note that when creating models and layers with
+    [subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry
+    about any of this, as you can just pass inputs like you would to any other Python function!
+
+    </Tip>
+
+    Parameters:
+        config ([`GPTJConfig`]): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the model weights.
+"""  # shared class-level docstring, passed to @add_start_docstrings on the model classes below
+
+GPTJ_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`Numpy array` or `tf.Tensor` of shape `(batch_size, input_ids_length)`):
+            `input_ids_length` = `sequence_length` if `past` is `None` else `past[0].shape[-2]` (`sequence_length` of
+            input past key value states). Indices of input sequence tokens in the vocabulary.
+
+            If `past` is used, only input IDs that do not have their past calculated should be passed as `input_ids`.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and
+            [`PreTrainedTokenizer.encode`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        past_key_values (`list[tf.Tensor]` of length `config.n_layers`):
+            Contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model (see
+            `past` output below). Can be used to speed up sequential decoding. The token ids which have their past
+            given to this model should not be passed as input ids as they have already been computed.
+        attention_mask (`tf.Tensor` or `Numpy array` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
+        token_type_ids (`tf.Tensor` or `Numpy array` of shape `(batch_size, input_ids_length)`, *optional*):
+            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
+            1]`:
+
+            - 0 corresponds to a *sentence A* token,
+            - 1 corresponds to a *sentence B* token.
+
+            [What are token type IDs?](../glossary#token-type-ids)
+        position_ids (`tf.Tensor` or `Numpy array` of shape `(batch_size, input_ids_length)`, *optional*):
+            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+            config.max_position_embeddings - 1]`.
+
+            [What are position IDs?](../glossary#position-ids)
+        head_mask (`Numpy array` or `tf.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+
+        inputs_embeds (`tf.Tensor` of shape `(batch_size, input_ids_length, hidden_size)`, *optional*):
+            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+            model's internal embedding lookup matrix.
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the
+            config will be used instead.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
+            used instead.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. This argument can be used
+            in eager mode, in graph mode the value will always be set to True.
+        training (`bool`, *optional*, defaults to `False`):
+            Whether or not to use the model in training mode (some modules like dropout modules have different
+            behaviors between training and evaluation).
+"""  # per-argument docs, passed to @add_start_docstrings_to_model_forward on each model's `call`
+
+
+@add_start_docstrings(
+    "The bare GPT-J Model transformer outputting raw hidden-states without any specific head on top.",
+    GPTJ_START_DOCSTRING,
+)
+class TFGPTJModel(TFGPTJPreTrainedModel):
+    def __init__(self, config, *inputs, **kwargs):
+        super().__init__(config, *inputs, **kwargs)
+        self.transformer = TFGPTJMainLayer(config, name="transformer")
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(GPTJ_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=TFBaseModelOutputWithPast,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def call(
+        self,
+        input_ids: TFModelInputType | None = None,
+        past_key_values: tuple[tuple[np.ndarray | tf.Tensor]] | None = None,
+        attention_mask: np.ndarray | tf.Tensor | None = None,
+        token_type_ids: np.ndarray | tf.Tensor | None = None,
+        position_ids: np.ndarray | tf.Tensor | None = None,
+        head_mask: np.ndarray | tf.Tensor | None = None,
+        inputs_embeds: np.ndarray | tf.Tensor | None = None,
+        use_cache: bool | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        training: bool | None = False,
+    ) -> TFBaseModelOutputWithPast | tuple[tf.Tensor]:
+        r"""
+        use_cache (`bool`, *optional*, defaults to `True`):
+            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+            `past`). Set to `False` during training, `True` during generation
+        """
+        # Pure pass-through: every argument is forwarded to the main layer and its result returned as is.
+        return self.transformer(
+            input_ids=input_ids,
+            past_key_values=past_key_values,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+
+    def build(self, input_shape=None):
+        # Builds the main layer under its own name scope; `input_shape` is not used.
+        if self.built:
+            return
+        self.built = True
+        main_layer = getattr(self, "transformer", None)
+        if main_layer is not None:
+            with tf.name_scope(main_layer.name):
+                main_layer.build(None)
+
+
+@add_start_docstrings(
+    """
+    The GPT-J Model transformer with a language modeling head on top.
+    """,
+    GPTJ_START_DOCSTRING,
+)
+class TFGPTJForCausalLM(TFGPTJPreTrainedModel, TFCausalLanguageModelingLoss):
+    def __init__(self, config, *inputs, **kwargs):
+        super().__init__(config, *inputs, **kwargs)
+        self.transformer = TFGPTJMainLayer(config, name="transformer")
+        self.lm_head = keras.layers.Dense(
+            config.vocab_size, kernel_initializer=get_initializer(config.initializer_range), name="lm_head"
+        )
+        self.config = config
+
+    def prepare_inputs_for_generation(self, inputs, past_key_values=None, use_cache=None, **kwargs):
+        """
+        Build the model-input dict. With a cache, `inputs` and `token_type_ids` are cut to their last position.
+        """
+        has_past = bool(past_key_values)
+        token_type_ids = kwargs.get("token_type_ids")
+        position_ids = kwargs.get("position_ids")
+        attention_mask = kwargs.get("attention_mask")
+
+        if has_past:
+            # only last token for inputs_ids if past is defined in kwargs
+            inputs = tf.expand_dims(inputs[:, -1], -1)
+            if token_type_ids is not None:
+                token_type_ids = tf.expand_dims(token_type_ids[:, -1], -1)
+
+        if position_ids is None and attention_mask is not None:
+            # Derive positions from the mask with an exclusive cumulative sum along the last axis.
+            position_ids = tf.math.cumsum(attention_mask, axis=-1, exclusive=True)
+            if has_past:
+                position_ids = tf.expand_dims(position_ids[:, -1], -1)
+
+        return {
+            "input_ids": inputs,
+            "attention_mask": attention_mask,
+            "position_ids": position_ids,
+            "past_key_values": past_key_values,
+            "use_cache": use_cache,
+            "token_type_ids": token_type_ids,
+        }
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(GPTJ_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_code_sample_docstrings(
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=TFCausalLMOutputWithPast,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def call(
+        self,
+        input_ids: TFModelInputType | None = None,
+        past_key_values: tuple[tuple[np.ndarray | tf.Tensor]] | None = None,
+        attention_mask: np.ndarray | tf.Tensor | None = None,
+        token_type_ids: np.ndarray | tf.Tensor | None = None,
+        position_ids: np.ndarray | tf.Tensor | None = None,
+        head_mask: np.ndarray | tf.Tensor | None = None,
+        inputs_embeds: np.ndarray | tf.Tensor | None = None,
+        labels: np.ndarray | tf.Tensor | None = None,
+        use_cache: bool | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        training: bool | None = False,
+    ) -> TFCausalLMOutputWithPast | tuple[tf.Tensor]:
+        r"""
+        labels (`np.ndarray` or `tf.Tensor` of shape `(batch_size, input_ids_length)`, *optional*):
+            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
+            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
+            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
+        """
+
+        transformer_outputs = self.transformer(
+            input_ids=input_ids,
+            past_key_values=past_key_values,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+        lm_logits = self.lm_head(transformer_outputs[0])
+
+        loss = None
+        if labels is not None:
+            # shift labels to the left and cut last logit token
+            loss = self.hf_compute_loss(labels[:, 1:], lm_logits[:, :-1])
+
+        if return_dict:
+            return TFCausalLMOutputWithPast(
+                loss=loss,
+                logits=lm_logits,
+                past_key_values=transformer_outputs.past_key_values,
+                hidden_states=transformer_outputs.hidden_states,
+                attentions=transformer_outputs.attentions,
+            )
+        output = (lm_logits,) + transformer_outputs[1:]
+        return output if loss is None else (loss,) + output
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        for attr, shape in (("transformer", None), ("lm_head", [None, None, self.config.n_embd])):
+            layer = getattr(self, attr, None)
+            if layer is not None:
+                with tf.name_scope(layer.name):
+                    layer.build(shape)
+
+
+@add_start_docstrings(
+    """
+    The GPT-J Model transformer with a sequence classification head on top (linear layer).
+
+    [`GPTJForSequenceClassification`] uses the last token in order to do the classification, as other causal models
+    (e.g. GPT, GPT-2, GPT-Neo) do.
+
+    Since it does classification on the last token, it requires to know the position of the last token. If a
+    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
+    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
+    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
+    each row of the batch).
+    """,
+    GPTJ_START_DOCSTRING,
+)
+class TFGPTJForSequenceClassification(TFGPTJPreTrainedModel, TFSequenceClassificationLoss):
+    _keys_to_ignore_on_load_missing = [r"h.\d+.attn.masked_bias", r"h.\d+.attn.bias", r"lm_head.weight"]
+
+    def __init__(self, config, *inputs, **kwargs):
+        super().__init__(config, *inputs, **kwargs)
+        self.num_labels = config.num_labels
+        self.transformer = TFGPTJMainLayer(config, name="transformer")
+        self.score = keras.layers.Dense(
+            self.num_labels,
+            use_bias=False,
+            kernel_initializer=get_initializer(config.initializer_range),
+            name="score",
+        )
+        self.config = config
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(GPTJ_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_code_sample_docstrings(
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=TFSequenceClassifierOutputWithPast,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def call(
+        self,
+        input_ids: TFModelInputType | None = None,
+        past_key_values: tuple[tuple[np.ndarray | tf.Tensor]] | None = None,
+        attention_mask: np.ndarray | tf.Tensor | None = None,
+        token_type_ids: np.ndarray | tf.Tensor | None = None,
+        position_ids: np.ndarray | tf.Tensor | None = None,
+        head_mask: np.ndarray | tf.Tensor | None = None,
+        inputs_embeds: np.ndarray | tf.Tensor | None = None,
+        labels: np.ndarray | tf.Tensor | None = None,
+        use_cache: bool | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        training: bool | None = False,
+    ) -> TFSequenceClassifierOutputWithPast | tuple[tf.Tensor]:
+        r"""
+        labels (`np.ndarray` or `tf.Tensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+        """
+        # Fail fast when token ids are given; `input_ids` is None for embeds-only calls, which are checked below.
+        no_pad_token = self.config.pad_token_id is None
+        if labels is not None and no_pad_token and input_ids is not None and input_ids.shape[0] != 1:
+            raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
+        transformer_outputs = self.transformer(
+            input_ids=input_ids,
+            past_key_values=past_key_values,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+        hidden_states = transformer_outputs[0]
+        logits = self.score(hidden_states)
+        logits_shape = shape_list(logits)
+        batch_size = logits_shape[0]
+        # Pool at each row's last non-padding position. Without a pad token, or without token ids to inspect,
+        # padding cannot be located, so the final position of every row is used instead.
+        if not no_pad_token and input_ids is not None:
+            token_indices = tf.range(shape_list(input_ids)[-1])
+            non_pad_mask = tf.cast(input_ids != self.config.pad_token_id, token_indices.dtype)
+            last_non_pad_token = tf.reduce_max(token_indices * non_pad_mask, axis=-1)
+        else:
+            last_non_pad_token = tf.fill((batch_size,), value=logits_shape[1] - 1)
+            if not no_pad_token:
+                logger.warning_once(
+                    f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
+                    "unexpected if using padding tokens in conjunction with `inputs_embeds.`"
+                )
+        loss = None
+
+        pooled_logits = tf.gather(logits, last_non_pad_token, batch_dims=1, axis=1)
+
+        if labels is not None:
+            if no_pad_token and logits_shape[0] != 1:
+                raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
+
+            loss = self.hf_compute_loss(tf.reshape(labels, [-1]), tf.reshape(pooled_logits, [-1, self.num_labels]))
+
+        if not return_dict:
+            output = (pooled_logits,) + transformer_outputs[1:]
+            return ((loss,) + output) if loss is not None else output
+
+        return TFSequenceClassifierOutputWithPast(
+            loss=loss,
+            logits=pooled_logits,
+            past_key_values=transformer_outputs.past_key_values,
+            hidden_states=transformer_outputs.hidden_states,
+            attentions=transformer_outputs.attentions,
+        )
+
+    def build(self, input_shape=None):
+        """Build the backbone, then the `score` head, each inside its own name scope (runs only once)."""
+        if self.built:
+            return
+        self.built = True
+        for attr, shape in (("transformer", None), ("score", [None, None, self.config.n_embd])):
+            sublayer = getattr(self, attr, None)
+            if sublayer is not None:
+                with tf.name_scope(sublayer.name):
+                    sublayer.build(shape)
+
+
+@add_start_docstrings(
+    """
+    The GPT-J Model transformer with a span classification head on top for extractive question-answering tasks like
+    SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
+    """,
+    GPTJ_START_DOCSTRING,
+)
+class TFGPTJForQuestionAnswering(TFGPTJPreTrainedModel, TFQuestionAnsweringLoss):
+    _keys_to_ignore_on_load_missing = [r"h.\d+.attn.masked_bias", r"h.\d+.attn.bias", r"lm_head.weight"]
+
+    def __init__(self, config, *inputs, **kwargs):
+        super().__init__(config, *inputs, **kwargs)
+        self.num_labels = config.num_labels  # width of the QA head output; `call` splits it into two halves
+        self.transformer = TFGPTJMainLayer(config, name="transformer")  # backbone; `call` reads its first output
+        self.qa_outputs = keras.layers.Dense(
+            self.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
+        )
+        self.config = config  # read later by `build` (hidden_size)
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(GPTJ_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_code_sample_docstrings(
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=TFQuestionAnsweringModelOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def call(
+        self,
+        input_ids: TFModelInputType | None = None,
+        past_key_values: tuple[tuple[np.ndarray | tf.Tensor]] | None = None,
+        attention_mask: np.ndarray | tf.Tensor | None = None,
+        token_type_ids: np.ndarray | tf.Tensor | None = None,
+        position_ids: np.ndarray | tf.Tensor | None = None,
+        head_mask: np.ndarray | tf.Tensor | None = None,
+        inputs_embeds: np.ndarray | tf.Tensor | None = None,
+        start_positions: np.ndarray | tf.Tensor | None = None,
+        end_positions: np.ndarray | tf.Tensor | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        training: bool | None = False,
+    ) -> TFQuestionAnsweringModelOutput | tuple[tf.Tensor]:
+        r"""
+        start_positions (`np.ndarray` or `tf.Tensor` of shape `(batch_size,)`, *optional*):
+            Labels for position (index) of the start of the labelled span for computing the token classification loss.
+            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
+            are not taken into account for computing the loss.
+        end_positions (`np.ndarray` or `tf.Tensor` of shape `(batch_size,)`, *optional*):
+            Labels for position (index) of the end of the labelled span for computing the token classification loss.
+            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
+            are not taken into account for computing the loss.
+        """
+        # NOTE: unlike the other arguments, no `use_cache` value is forwarded to the backbone here.
+        transformer_outputs = self.transformer(
+            input_ids=input_ids,
+            past_key_values=past_key_values,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+        sequence_output = transformer_outputs[0]
+        # Per-position logits are split into two equal parts on the last axis; the squeezes assume `num_labels == 2`.
+        logits = self.qa_outputs(sequence_output)
+        start_logits, end_logits = tf.split(logits, 2, axis=-1)
+        start_logits = tf.squeeze(start_logits, axis=-1)
+        end_logits = tf.squeeze(end_logits, axis=-1)
+
+        loss = None
+        if start_positions is not None and end_positions is not None:  # loss needs both span ends
+            labels = {"start_position": start_positions}
+            labels["end_position"] = end_positions
+            loss = self.hf_compute_loss(labels, (start_logits, end_logits))
+        # Tuple output drops backbone item 1 (presumably the key/value cache - verify against TFGPTJMainLayer).
+        if not return_dict:
+            output = (start_logits, end_logits) + transformer_outputs[2:]
+            return ((loss,) + output) if loss is not None else output
+
+        return TFQuestionAnsweringModelOutput(
+            loss=loss,
+            start_logits=start_logits,
+            end_logits=end_logits,
+            hidden_states=transformer_outputs.hidden_states,
+            attentions=transformer_outputs.attentions,
+        )
+
+    def build(self, input_shape=None):
+        if self.built:  # already built: nothing to do
+            return
+        self.built = True
+        if getattr(self, "transformer", None) is not None:
+            with tf.name_scope(self.transformer.name):
+                self.transformer.build(None)
+        if getattr(self, "qa_outputs", None) is not None:
+            with tf.name_scope(self.qa_outputs.name):
+                self.qa_outputs.build([None, None, self.config.hidden_size])  # sibling head uses `n_embd` here
+
+
+__all__ = [  # names exported by `from <module> import *`
+    "TFGPTJForCausalLM",
+    "TFGPTJForQuestionAnswering",
+    "TFGPTJForSequenceClassification",
+    "TFGPTJModel",
+    "TFGPTJPreTrainedModel",
+]
diff --git a/phivenv/Lib/site-packages/transformers/models/granite/__init__.py b/phivenv/Lib/site-packages/transformers/models/granite/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..08b74d14ca871a7752b61fb71f08b7a7886d80f7
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/granite/__init__.py
@@ -0,0 +1,27 @@
+# Copyright 2024 EleutherAI and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:  # branch for static type checkers only: expose the real symbols to them
+    from .configuration_granite import *
+    from .modeling_granite import *
+else:
+    import sys
+    # At runtime, replace this package's `sys.modules` entry with a `_LazyModule` built from this file's path.
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/phivenv/Lib/site-packages/transformers/models/granite/__pycache__/__init__.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/granite/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f42ba64837a1adcb94ea7caaf804c81ba8a0b343
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/granite/__pycache__/__init__.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/granite/__pycache__/configuration_granite.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/granite/__pycache__/configuration_granite.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1c40ab971238c155f7a81224923b189a76be714f
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/granite/__pycache__/configuration_granite.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/granite/__pycache__/modeling_granite.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/granite/__pycache__/modeling_granite.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a8a65dd67c1e00c8e5beb19680d96c4d92ebc220
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/granite/__pycache__/modeling_granite.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/granite/__pycache__/modular_granite.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/granite/__pycache__/modular_granite.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..74866ad8527652f76333f9d0444c345af18a649d
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/granite/__pycache__/modular_granite.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/granite/configuration_granite.py b/phivenv/Lib/site-packages/transformers/models/granite/configuration_granite.py
new file mode 100644
index 0000000000000000000000000000000000000000..61d3ba9e7bb2775e537608d277e5973ec42a8cf9
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/granite/configuration_granite.py
@@ -0,0 +1,197 @@
+# coding=utf-8
+# Copyright 2024 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Granite model configuration"""
+
+from ...configuration_utils import PretrainedConfig
+from ...modeling_rope_utils import rope_config_validation
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class GraniteConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`GraniteModel`]. It is used to instantiate a Granite
+    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+    defaults will yield a similar configuration to that of the Granite-3B.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 32000):
+            Vocabulary size of the Granite model. Defines the number of different tokens that can be represented by the
+            `input_ids` passed when calling [`GraniteModel`].
+        hidden_size (`int`, *optional*, defaults to 4096):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 11008):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 32):
+            Number of hidden layers in the Transformer decoder.
+        num_attention_heads (`int`, *optional*, defaults to 32):
+            Number of attention heads for each attention layer in the Transformer decoder.
+        num_key_value_heads (`int`, *optional*):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details, check out [this
+            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
+            `num_attention_heads`.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the decoder.
+        max_position_embeddings (`int`, *optional*, defaults to 2048):
+            The maximum sequence length that this model might ever be used with.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        pad_token_id (`int`, *optional*):
+            Padding token id.
+        bos_token_id (`int`, *optional*, defaults to 1):
+            Beginning of stream token id.
+        eos_token_id (`int`, *optional*, defaults to 2):
+            End of stream token id.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether to tie weight embeddings
+        rope_theta (`float`, *optional*, defaults to 10000.0):
+            The base period of the RoPE embeddings.
+        rope_scaling (`Dict`, *optional*):
+            Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
+            strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is
+            `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
+            `max_position_embeddings` to the expected new maximum. See the following thread for more information on how
+            these scaling strategies behave:
+            https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an
+            experimental feature, subject to breaking API changes in future versions.
+        attention_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use a bias in the query, key, value and output projection layers during self-attention.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        mlp_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use a bias in up_proj, down_proj and gate_proj layers in the MLP layers.
+        embedding_multiplier (`float`, *optional*, defaults to 1.0): embedding multiplier
+        logits_scaling (`float`, *optional*, defaults to 1.0): divisor for output logits
+        residual_multiplier (`float`, *optional*, defaults to 1.0): residual multiplier
+        attention_multiplier (`float`, *optional*, defaults to 1.0): scaling factor applied to the attention scores
+
+    ```python
+    >>> from transformers import GraniteModel, GraniteConfig
+
+    >>> # Initializing a Granite granite-3b style configuration
+    >>> configuration = GraniteConfig()
+
+    >>> # Initializing a model from the granite-3b style configuration
+    >>> model = GraniteModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "granite"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    # Default tensor parallel plan for base model `GraniteModel`
+    base_model_tp_plan = {
+        "layers.*.self_attn.q_proj": "colwise",
+        "layers.*.self_attn.k_proj": "colwise",
+        "layers.*.self_attn.v_proj": "colwise",
+        "layers.*.self_attn.o_proj": "rowwise",
+        "layers.*.mlp.gate_proj": "colwise",
+        "layers.*.mlp.up_proj": "colwise",
+        "layers.*.mlp.down_proj": "rowwise",
+    }
+    base_model_pp_plan = {  # module -> (input names, output names); presumably the pipeline-parallel plan
+        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
+        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
+        "norm": (["hidden_states"], ["hidden_states"]),
+    }
+
+    def __init__(
+        self,
+        vocab_size=32000,
+        hidden_size=4096,
+        intermediate_size=11008,
+        num_hidden_layers=32,
+        num_attention_heads=32,
+        num_key_value_heads=None,
+        hidden_act="silu",
+        max_position_embeddings=2048,
+        initializer_range=0.02,
+        rms_norm_eps=1e-6,
+        use_cache=True,
+        pad_token_id=None,
+        bos_token_id=1,
+        eos_token_id=2,
+        tie_word_embeddings=False,
+        rope_theta=10000.0,
+        rope_scaling=None,
+        attention_bias=False,
+        attention_dropout=0.0,
+        mlp_bias=False,
+        embedding_multiplier=1.0,
+        logits_scaling=1.0,
+        residual_multiplier=1.0,
+        attention_multiplier=1.0,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+
+        # for backward compatibility: an unset value means one key/value head per attention head (plain MHA)
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+        self.mlp_bias = mlp_bias
+        # Granite-specific multipliers/divisors documented at the end of the Args section.
+        self.embedding_multiplier = embedding_multiplier
+        self.logits_scaling = logits_scaling
+        self.residual_multiplier = residual_multiplier
+        self.attention_multiplier = attention_multiplier
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+        # Runs after the base-class init; presumably validates the RoPE settings stored above - TODO confirm.
+        rope_config_validation(self)
+
+
+__all__ = ["GraniteConfig"]  # names exported by `from <module> import *`
diff --git a/phivenv/Lib/site-packages/transformers/models/granite/modeling_granite.py b/phivenv/Lib/site-packages/transformers/models/granite/modeling_granite.py
new file mode 100644
index 0000000000000000000000000000000000000000..dc458b619def11d7b90cb8bdcacb87183f49b768
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/granite/modeling_granite.py
@@ -0,0 +1,565 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/granite/modular_granite.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_granite.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# coding=utf-8
+# Copyright 2024 IBM and the HuggingFace Inc. team. All rights reserved.
+#
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Callable, Optional, Union
+
+import torch
+from torch import nn
+
+from ...activations import ACT2FN
+from ...cache_utils import Cache, DynamicCache
+from ...generation import GenerationMixin
+from ...integrations import use_kernel_forward_from_hub
+from ...masking_utils import create_causal_mask
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
+from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from ...processing_utils import Unpack
+from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging
+from ...utils.deprecation import deprecate_kwarg
+from ...utils.generic import check_model_inputs
+from .configuration_granite import GraniteConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+def rotate_half(x):
+    """Split the last dim into a front and a back part and return them concatenated as (-back, front)."""
+    half = x.shape[-1] // 2
+    front, back = x[..., :half], x[..., half:]
+    return torch.cat((-back, front), dim=-1)
+
+
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
+    """Rotate the query and key tensors with Rotary Position Embedding (RoPE).
+
+    Each tensor `t` of (`q`, `k`) is mapped to `t * cos + rotate_half(t) * sin`, after `cos` and `sin` have
+    received one extra axis at `unsqueeze_dim`.
+
+    Args:
+        q (`torch.Tensor`): Query tensor.
+        k (`torch.Tensor`): Key tensor.
+        cos (`torch.Tensor`): Cosine part of the rotary embedding.
+        sin (`torch.Tensor`): Sine part of the rotary embedding.
+        position_ids (`torch.Tensor`, *optional*):
+            Deprecated and never read by this function.
+        unsqueeze_dim (`int`, *optional*, defaults to 1):
+            Axis inserted into `cos` and `sin` so they broadcast against `q` and `k`. With `cos`/`sin` shaped
+            [batch_size, seq_len, head_dim], use 1 when `q`/`k` are [batch_size, heads, seq_len, head_dim] and 2
+            when they are [batch_size, seq_len, heads, head_dim].
+
+    Returns:
+        `tuple(torch.Tensor)`: the rotated query and key tensors, in that order.
+    """
+    # Insert the broadcast axis once, then apply the same rotation formula to both tensors.
+    cos, sin = cos.unsqueeze(unsqueeze_dim), sin.unsqueeze(unsqueeze_dim)
+    q_rot, k_rot = (t * cos + rotate_half(t) * sin for t in (q, k))
+    return q_rot, k_rot
+
+
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """
+    Repeat every key/value head `n_rep` times along dim 1, like torch.repeat_interleave(x, dim=1, repeats=n_rep):
+    (batch, num_key_value_heads, seqlen, head_dim) -> (batch, num_key_value_heads * n_rep, seqlen, head_dim)
+    """
+    bsz, kv_heads, seq_len, dim = hidden_states.shape
+    if n_rep == 1:
+        return hidden_states
+    expanded = hidden_states.unsqueeze(2).expand(bsz, kv_heads, n_rep, seq_len, dim)
+    return expanded.reshape(bsz, kv_heads * n_rep, seq_len, dim)
+
+
+def eager_attention_forward(
+    module: nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: Optional[torch.Tensor],
+    scaling: float,
+    dropout: float = 0.0,
+    **kwargs: Unpack[TransformersKwargs],
+):
+    """Reference attention: softmax(query @ key^T * scaling + mask) @ value, with dropout on the weights.
+
+    `module` supplies `num_key_value_groups` (how often each key/value head is repeated) and `training`.
+    Returns `(attn_output, attn_weights)`; the output has dims 1 and 2 swapped and is made contiguous.
+    """
+    n_groups = module.num_key_value_groups
+    keys, values = repeat_kv(key, n_groups), repeat_kv(value, n_groups)
+    scores = torch.matmul(query, keys.transpose(2, 3)) * scaling
+    if attention_mask is not None:
+        # Trim the additive mask's last axis to the number of key positions actually present.
+        scores = scores + attention_mask[:, :, :, : keys.shape[-2]]
+    probs = nn.functional.softmax(scores, dim=-1, dtype=torch.float32).to(query.dtype)
+    probs = nn.functional.dropout(probs, p=dropout, training=module.training)
+    return torch.matmul(probs, values).transpose(1, 2).contiguous(), probs
+
+
+class GraniteAttention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper, with rotary embeddings and grouped KV heads"""
+
+    def __init__(self, config: GraniteConfig, layer_idx: Optional[int] = None):
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx  # handed to the cache in `forward` to address this layer's entries
+        self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
+        self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
+        self.scaling = config.attention_multiplier  # passed to the attention function as `scaling`
+        self.attention_dropout = config.attention_dropout
+        self.is_causal = True
+        # q uses all attention heads; k and v use the (possibly fewer) key/value heads; o maps back to hidden_size.
+        self.q_proj = nn.Linear(
+            config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.k_proj = nn.Linear(
+            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.v_proj = nn.Linear(
+            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.o_proj = nn.Linear(
+            config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
+        )
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
+        attention_mask: Optional[torch.Tensor],
+        past_key_values: Optional[Cache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        input_shape = hidden_states.shape[:-1]
+        hidden_shape = (*input_shape, -1, self.head_dim)
+        # Split the projection outputs into heads; the -1 lets q and k/v carry different head counts.
+        query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+        key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+        value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+        # Rotate queries and keys with the precomputed (cos, sin) pair; values are left untouched.
+        cos, sin = position_embeddings
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+
+        if past_key_values is not None:
+            # sin and cos are specific to RoPE models; cache_position needed for the static cache
+            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
+        # Use the eager reference unless the config selects another registered attention implementation.
+        attention_interface: Callable = eager_attention_forward
+        if self.config._attn_implementation != "eager":
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            dropout=0.0 if not self.training else self.attention_dropout,  # dropout only while training
+            scaling=self.scaling,
+            **kwargs,
+        )
+        # Merge the per-head axes back into one feature axis before the output projection.
+        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
+        attn_output = self.o_proj(attn_output)
+        return attn_output, attn_weights
+
+
+@use_kernel_forward_from_hub("RMSNorm")
+class GraniteRMSNorm(nn.Module):
+    def __init__(self, hidden_size, eps=1e-6):
+        """
+        Root-mean-square layer norm with a learnable per-feature gain; equivalent to T5LayerNorm
+        """
+        super().__init__()
+        self.variance_epsilon = eps
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+
+    def forward(self, hidden_states):
+        orig_dtype = hidden_states.dtype
+        upcast = hidden_states.to(torch.float32)  # statistics are computed in float32
+        mean_square = upcast.pow(2).mean(-1, keepdim=True)
+        normed = upcast * torch.rsqrt(mean_square + self.variance_epsilon)
+        return self.weight * normed.to(orig_dtype)
+
+    def extra_repr(self):
+        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
+
+
+class GraniteMLP(nn.Module):  # gated feed-forward: down_proj(act_fn(gate_proj(x)) * up_proj(x))
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        hidden, inner, bias = config.hidden_size, config.intermediate_size, config.mlp_bias
+        self.hidden_size, self.intermediate_size = hidden, inner
+        self.gate_proj = nn.Linear(hidden, inner, bias=bias)
+        self.up_proj = nn.Linear(hidden, inner, bias=bias)
+        self.down_proj = nn.Linear(inner, hidden, bias=bias)
+        self.act_fn = ACT2FN[config.hidden_act]  # activation looked up by its configured name
+
+    def forward(self, x):
+        gated = self.act_fn(self.gate_proj(x)) * self.up_proj(x)
+        return self.down_proj(gated)
+
+
+class GraniteDecoderLayer(GradientCheckpointingLayer):  # pre-norm block: attention, then MLP, each scaled before its residual add
+    def __init__(self, config: GraniteConfig, layer_idx: int):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.self_attn = GraniteAttention(config=config, layer_idx=layer_idx)
+
+        self.mlp = GraniteMLP(config)
+        self.input_layernorm = GraniteRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = GraniteRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.residual_multiplier = config.residual_multiplier  # applied to both sub-layer outputs in forward
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        output_attentions: Optional[bool] = False,
+        use_cache: Optional[bool] = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
+        **kwargs,
+    ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+            attention_mask (`torch.FloatTensor`, *optional*):
+                attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
+                query_sequence_length, key_sequence_length)` if default attention is used.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            use_cache (`bool`, *optional*):
+                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
+                (see `past_key_values`).
+            past_key_values (`Cache`, *optional*): cached past key and value projection states
+            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
+                Indices depicting the position of the input sequence tokens in the sequence
+            position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
+                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
+                with `head_dim` being the embedding dimension of each attention head.
+            kwargs (`dict`, *optional*):
+                Extra keyword arguments forwarded unchanged to `self_attn`; used for FSDP and other methods that
+                inject code into the model
+        """
+        residual = hidden_states
+
+        hidden_states = self.input_layernorm(hidden_states)
+
+        # Self Attention
+        hidden_states, self_attn_weights = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            output_attentions=output_attentions,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            position_embeddings=position_embeddings,
+            **kwargs,
+        )
+        hidden_states = residual + hidden_states * self.residual_multiplier  # scaled attention branch + residual
+
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states * self.residual_multiplier  # main diff with Llama
+
+        outputs = (hidden_states,)  # element 0 is always the layer output
+
+        if output_attentions:
+            outputs += (self_attn_weights,)  # element 1 exists only when attentions were requested
+
+        return outputs
+
+
+@auto_docstring
+class GranitePreTrainedModel(PreTrainedModel):  # class-level settings shared by GraniteModel and GraniteForCausalLM
+    config: GraniteConfig
+    base_model_prefix = "model"  # same name as the `model` attribute set in GraniteForCausalLM.__init__
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["GraniteDecoderLayer"]  # NOTE(review): presumably keeps each decoder layer on one device — confirm
+    _skip_keys_device_placement = ["past_key_values"]
+    _supports_flash_attn = True
+    _supports_sdpa = True
+    _supports_flex_attn = True
+
+    _can_compile_fullgraph = True
+    _supports_attention_backend = True
+    _can_record_outputs = {  # output name -> module class; NOTE(review): presumably read by `check_model_inputs` — confirm
+        "hidden_states": GraniteDecoderLayer,
+        "attentions": GraniteAttention,
+    }
+
+
+class GraniteRotaryEmbedding(nn.Module):  # builds the (cos, sin) pair that GraniteModel.forward passes to every decoder layer
+    inv_freq: torch.Tensor  # fix linting for `register_buffer`
+
+    def __init__(self, config: GraniteConfig, device=None):
+        super().__init__()
+        # BC: "rope_type" was originally "type"
+        if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict):
+            self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
+        else:
+            self.rope_type = "default"  # config has no `rope_scaling` dict
+        self.max_seq_len_cached = config.max_position_embeddings
+        self.original_max_seq_len = config.max_position_embeddings
+
+        self.config = config
+        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
+
+        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
+        self.register_buffer("inv_freq", inv_freq, persistent=False)  # non-persistent: not saved in the state_dict
+        self.original_inv_freq = self.inv_freq  # NOTE(review): presumably kept for `dynamic_rope_update` — confirm
+
+    @torch.no_grad()
+    @dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
+    def forward(self, x, position_ids):  # returns (cos, sin), both cast to x.dtype; x supplies only device and dtype
+        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
+        position_ids_expanded = position_ids[:, None, :].float()  # assumes 2-D position_ids -> (batch, 1, seq); TODO confirm
+
+        device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
+        with torch.autocast(device_type=device_type, enabled=False):  # Force float32 ("mps" is mapped to "cpu" above)
+            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
+            emb = torch.cat((freqs, freqs), dim=-1)  # duplicate along the last dim, doubling its size
+            cos = emb.cos() * self.attention_scaling
+            sin = emb.sin() * self.attention_scaling
+
+        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
+
+
+@auto_docstring
+class GraniteModel(GranitePreTrainedModel):  # embeddings -> stack of GraniteDecoderLayer -> final RMSNorm
+    def __init__(self, config: GraniteConfig):
+        super().__init__(config)
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+        self.layers = nn.ModuleList(
+            [GraniteDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+        )
+        self.norm = GraniteRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.rotary_emb = GraniteRotaryEmbedding(config=config)
+        self.gradient_checkpointing = False
+        self.embedding_multiplier = config.embedding_multiplier  # scales the embeddings in forward
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @check_model_inputs
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> BaseModelOutputWithPast:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+
+        if (input_ids is None) ^ (inputs_embeds is not None):  # true when both or neither are given
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+        if self.gradient_checkpointing and self.training and use_cache:
+            logger.warning_once(
+                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
+            )
+            use_cache = False
+
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+
+        inputs_embeds = inputs_embeds * self.embedding_multiplier  # main diff with Llama; also hits caller-supplied embeds
+
+        if use_cache and past_key_values is None:  # caching requested but no cache supplied: create one
+            past_key_values = DynamicCache(config=self.config)
+
+        if cache_position is None:  # default: positions continue after the tokens already in the cache
+            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+            cache_position = torch.arange(
+                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
+            )
+
+        if position_ids is None:  # default: cache_position with a leading dimension of size 1
+            position_ids = cache_position.unsqueeze(0)
+
+        causal_mask = create_causal_mask(
+            config=self.config,
+            input_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            cache_position=cache_position,
+            past_key_values=past_key_values,
+            position_ids=position_ids,
+        )
+
+        hidden_states = inputs_embeds
+
+        # create position embeddings to be shared across the decoder layers
+        position_embeddings = self.rotary_emb(hidden_states, position_ids)
+
+        # decoder layers
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+
+        for decoder_layer in self.layers[: self.config.num_hidden_layers]:
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)  # the input to this layer (scaled embeddings for layer 0)
+
+            layer_outputs = decoder_layer(
+                hidden_states,
+                attention_mask=causal_mask,
+                position_ids=position_ids,
+                past_key_values=past_key_values,
+                output_attentions=output_attentions,
+                use_cache=use_cache,
+                cache_position=cache_position,
+                position_embeddings=position_embeddings,
+                **kwargs,
+            )
+
+            hidden_states = layer_outputs[0]
+
+            if output_attentions:
+                all_self_attns += (layer_outputs[1],)  # the layer appends its attention weights as element 1
+
+        hidden_states = self.norm(hidden_states)
+
+        # add hidden states from the last decoder layer
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)  # appended after self.norm has been applied
+
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values if use_cache else None,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+        )
+
+
+@auto_docstring
+class GraniteForCausalLM(GranitePreTrainedModel, GenerationMixin):  # GraniteModel plus a bias-free LM head
+    _tied_weights_keys = ["lm_head.weight"]
+    _tp_plan = {"lm_head": "colwise_rep"}
+    _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.model = GraniteModel(config)
+        self.vocab_size = config.vocab_size
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Union[Cache, list[torch.FloatTensor]]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> CausalLMOutputWithPast:
+        r"""
+        Example:
+
+        ```python
+        >>> from transformers import AutoTokenizer, GraniteForCausalLM
+
+        >>> model = GraniteForCausalLM.from_pretrained("meta-granite/Granite-2-7b-hf")
+        >>> tokenizer = AutoTokenizer.from_pretrained("meta-granite/Granite-2-7b-hf")
+
+        >>> prompt = "Hey, are you conscious? Can you talk to me?"
+        >>> inputs = tokenizer(prompt, return_tensors="pt")
+
+        >>> # Generate
+        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
+        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
+        ```"""
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+
+        # run the base model; only last_hidden_state, past_key_values, hidden_states and attentions are read below
+        outputs: BaseModelOutputWithPast = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            cache_position=cache_position,
+            **kwargs,
+        )
+
+        hidden_states = outputs.last_hidden_state
+        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
+        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+        logits = self.lm_head(hidden_states[:, slice_indices, :])  # int k keeps the last k positions (0 keeps all)
+        logits = logits / self.config.logits_scaling  # main diff with Llama
+
+        loss = None
+        if labels is not None:  # loss is computed on the already-scaled logits
+            loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs)
+
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+__all__ = ["GraniteForCausalLM", "GraniteModel", "GranitePreTrainedModel"]  # names exported by a star-import of this module
diff --git a/phivenv/Lib/site-packages/transformers/models/granite/modular_granite.py b/phivenv/Lib/site-packages/transformers/models/granite/modular_granite.py
new file mode 100644
index 0000000000000000000000000000000000000000..a0141bd91245cb2c3681eee258b1270390ee9b69
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/granite/modular_granite.py
@@ -0,0 +1,286 @@
+# coding=utf-8
+# Copyright 2024 IBM and the HuggingFace Inc. team. All rights reserved.
+#
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Optional, Union
+
+import torch
+import torch.utils.checkpoint
+from torch import nn
+
+from ...cache_utils import Cache, DynamicCache
+from ...masking_utils import create_causal_mask
+from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
+from ...processing_utils import Unpack
+from ...utils import TransformersKwargs, logging
+from ...utils.deprecation import deprecate_kwarg
+from ..llama.modeling_llama import (
+    LlamaAttention,
+    LlamaDecoderLayer,
+    LlamaForCausalLM,
+    LlamaModel,
+    LlamaPreTrainedModel,
+)
+from .configuration_granite import GraniteConfig
+
+
+logger = logging.get_logger(__name__)  # module-level logger; GraniteModel.forward uses it for `warning_once`
+
+
+class GraniteAttention(LlamaAttention):
+    """LlamaAttention whose `scaling` attribute is replaced by `config.attention_multiplier`."""
+
+    def __init__(self, config: GraniteConfig, layer_idx: Optional[int] = None):
+        super().__init__(config, layer_idx)
+        self.scaling = config.attention_multiplier  # set after the parent constructor so this value wins
+
+
+class GraniteDecoderLayer(LlamaDecoderLayer):  # Llama decoder layer with both residual branches scaled by residual_multiplier
+    def __init__(self, config: GraniteConfig, layer_idx: int):
+        super().__init__(config, layer_idx)
+        self.residual_multiplier = config.residual_multiplier
+        self.self_attn = GraniteAttention(config=config, layer_idx=layer_idx)  # replaces the attention built by the parent
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        output_attentions: Optional[bool] = False,
+        use_cache: Optional[bool] = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
+        **kwargs,
+    ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+            attention_mask (`torch.FloatTensor`, *optional*):
+                attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
+                query_sequence_length, key_sequence_length)` if default attention is used.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            use_cache (`bool`, *optional*):
+                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
+                (see `past_key_values`).
+            past_key_values (`Cache`, *optional*): cached past key and value projection states
+            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
+                Indices depicting the position of the input sequence tokens in the sequence
+            position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
+                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
+                with `head_dim` being the embedding dimension of each attention head.
+            kwargs (`dict`, *optional*):
+                Extra keyword arguments forwarded unchanged to `self_attn`; used for FSDP and other methods that
+                inject code into the model
+        """
+        residual = hidden_states
+
+        hidden_states = self.input_layernorm(hidden_states)
+
+        # Self Attention
+        hidden_states, self_attn_weights = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            output_attentions=output_attentions,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            position_embeddings=position_embeddings,
+            **kwargs,
+        )
+        hidden_states = residual + hidden_states * self.residual_multiplier  # scaled attention branch + residual
+
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states * self.residual_multiplier  # main diff with Llama
+
+        outputs = (hidden_states,)  # element 0 is always the layer output
+
+        if output_attentions:
+            outputs += (self_attn_weights,)  # element 1 exists only when attentions were requested
+
+        return outputs
+
+
+class GranitePreTrainedModel(LlamaPreTrainedModel):
+    pass  # no overrides here: everything is inherited from LlamaPreTrainedModel
+
+
+class GraniteModel(LlamaModel):  # LlamaModel with scaled input embeddings and Granite decoder layers
+    def __init__(self, config: GraniteConfig):
+        super().__init__(config)
+        self.embedding_multiplier = config.embedding_multiplier  # scales the embeddings in forward
+        self.layers = nn.ModuleList(  # reassigns `layers` after the parent constructor has run
+            [GraniteDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+        )
+
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> BaseModelOutputWithPast:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+
+        if (input_ids is None) ^ (inputs_embeds is not None):  # true when both or neither are given
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+        if self.gradient_checkpointing and self.training and use_cache:
+            logger.warning_once(
+                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
+            )
+            use_cache = False
+
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+
+        inputs_embeds = inputs_embeds * self.embedding_multiplier  # main diff with Llama; also hits caller-supplied embeds
+
+        if use_cache and past_key_values is None:  # caching requested but no cache supplied: create one
+            past_key_values = DynamicCache(config=self.config)
+
+        if cache_position is None:  # default: positions continue after the tokens already in the cache
+            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+            cache_position = torch.arange(
+                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
+            )
+
+        if position_ids is None:  # default: cache_position with a leading dimension of size 1
+            position_ids = cache_position.unsqueeze(0)
+
+        causal_mask = create_causal_mask(
+            config=self.config,
+            input_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            cache_position=cache_position,
+            past_key_values=past_key_values,
+            position_ids=position_ids,
+        )
+
+        hidden_states = inputs_embeds
+
+        # create position embeddings to be shared across the decoder layers
+        position_embeddings = self.rotary_emb(hidden_states, position_ids)
+
+        # decoder layers
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+
+        for decoder_layer in self.layers[: self.config.num_hidden_layers]:
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)  # the input to this layer (scaled embeddings for layer 0)
+
+            layer_outputs = decoder_layer(
+                hidden_states,
+                attention_mask=causal_mask,
+                position_ids=position_ids,
+                past_key_values=past_key_values,
+                output_attentions=output_attentions,
+                use_cache=use_cache,
+                cache_position=cache_position,
+                position_embeddings=position_embeddings,
+                **kwargs,
+            )
+
+            hidden_states = layer_outputs[0]
+
+            if output_attentions:
+                all_self_attns += (layer_outputs[1],)  # the layer appends its attention weights as element 1
+
+        hidden_states = self.norm(hidden_states)
+
+        # add hidden states from the last decoder layer
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)  # appended after self.norm has been applied
+
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values if use_cache else None,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+        )
+
+
+class GraniteForCausalLM(LlamaForCausalLM):  # overrides forward to divide the logits by config.logits_scaling
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Union[Cache, list[torch.FloatTensor]]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> CausalLMOutputWithPast:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+
+        # run the base model; only last_hidden_state, past_key_values, hidden_states and attentions are read below
+        outputs: BaseModelOutputWithPast = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            cache_position=cache_position,
+            **kwargs,
+        )
+
+        hidden_states = outputs.last_hidden_state
+        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
+        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+        logits = self.lm_head(hidden_states[:, slice_indices, :])  # int k keeps the last k positions (0 keeps all)
+        logits = logits / self.config.logits_scaling  # main diff with Llama
+
+        loss = None
+        if labels is not None:  # loss is computed on the already-scaled logits
+            loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs)
+
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+__all__ = ["GraniteForCausalLM", "GraniteModel", "GranitePreTrainedModel"]  # names exported by a star-import of this module
diff --git a/phivenv/Lib/site-packages/transformers/models/granite_speech/__init__.py b/phivenv/Lib/site-packages/transformers/models/granite_speech/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..d6122581855250911b0bf951a42e22d9c354240a
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/granite_speech/__init__.py
@@ -0,0 +1,29 @@
+# Copyright 2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:  # static type checkers take this branch and see the submodules' names directly
+    from .configuration_granite_speech import *
+    from .feature_extraction_granite_speech import *
+    from .modeling_granite_speech import *
+    from .processing_granite_speech import *
+else:  # at runtime, replace this module's `sys.modules` entry with a `_LazyModule` built from this file
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/phivenv/Lib/site-packages/transformers/models/granite_speech/__pycache__/__init__.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/granite_speech/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8f39551c2cb66efae44f840a095dba354740068d
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/granite_speech/__pycache__/__init__.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/granite_speech/__pycache__/configuration_granite_speech.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/granite_speech/__pycache__/configuration_granite_speech.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..af0c4db26a1e248bb69353cef056e69b88cdd7c6
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/granite_speech/__pycache__/configuration_granite_speech.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/granite_speech/__pycache__/feature_extraction_granite_speech.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/granite_speech/__pycache__/feature_extraction_granite_speech.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b9ff5a99ec3cb0157f076818e6186694b2a5e7f2
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/granite_speech/__pycache__/feature_extraction_granite_speech.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/granite_speech/__pycache__/modeling_granite_speech.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/granite_speech/__pycache__/modeling_granite_speech.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..eb688ab6faaae75ff314eee6696d813d1fb9e371
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/granite_speech/__pycache__/modeling_granite_speech.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/granite_speech/__pycache__/processing_granite_speech.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/granite_speech/__pycache__/processing_granite_speech.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7735563cf927ffff48ed895edcdb5d8bff91e402
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/granite_speech/__pycache__/processing_granite_speech.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/granite_speech/configuration_granite_speech.py b/phivenv/Lib/site-packages/transformers/models/granite_speech/configuration_granite_speech.py
new file mode 100644
index 0000000000000000000000000000000000000000..fede07b7b7e820e78f44538313a85d39afc811d7
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/granite_speech/configuration_granite_speech.py
@@ -0,0 +1,198 @@
+# coding=utf-8
+# Copyright 2025 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Config class for Granite Speech."""
+
+from ...configuration_utils import PretrainedConfig
+from ..auto import CONFIG_MAPPING, AutoConfig
+
+
+class GraniteSpeechEncoderConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`GraniteSpeechCTCEncoder`]. It is used to instantiate
+    a Granite Speech audio encoder according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of the audio encoder of the Granite Speech
+    architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        input_dim (`int`, *optional*, defaults to 160):
+            Dimension of the input features consumed by the encoder's input linear projection.
+        num_layers (`int`, *optional*, defaults to 10):
+            Number of encoder blocks.
+        hidden_dim (`int`, *optional*, defaults to 1024):
+            The size of the intermediate layers in the conformer encoder.
+        feedforward_mult (`int`, *optional*, defaults to 4):
+            Multiplier for the up/down projections in the encoder's feedforward layers;
+            The projections will have intermediate dim of size `hidden_dim * feedforward_mult`.
+        num_heads (`int`, *optional*, defaults to 8):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        dim_head (`int`, *optional*, defaults to 128):
+            Dimension of attention heads for each attention layer in the Transformer encoder.
+        output_dim (`int`, *optional*, defaults to 42):
+            Output dimension of the mid-encoder projection; its softmax is projected back to
+            `hidden_dim` and added to the hidden states after the middle encoder block.
+        context_size (`int`, *optional*, defaults to 200):
+            Block size used by conformer attention; must be positive and not exceed `max_pos_emb`.
+        max_pos_emb (`int`, *optional*, defaults to 512):
+            Max pos embeds to be used in attention (Shaw's relative positional encoding).
+        dropout (`float`, *optional*, defaults to 0.1):
+            The dropout probability for fully connected layers in the encoder.
+        conv_kernel_size (`int`, *optional*, defaults to 15):
+            Kernel size to be used for 1D convolution in each conformer block.
+        conv_expansion_factor (`int`, *optional*, defaults to 2):
+            Multiplier applied to `hidden_dim` to get the inner dimension of the conformer convolutions.
+
+    Example:
+
+    ```python
+    >>> from transformers import GraniteSpeechEncoderConfig, GraniteSpeechCTCEncoder
+
+    >>> # Initializing a GraniteSpeechEncoderConfig
+    >>> configuration = GraniteSpeechEncoderConfig()
+
+    >>> # Initializing a GraniteSpeechCTCEncoder (with random weights)
+    >>> model = GraniteSpeechCTCEncoder(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "granite_speech_encoder"
+
+    def __init__(
+        self,
+        input_dim=160,
+        num_layers=10,
+        hidden_dim=1024,
+        feedforward_mult=4,
+        num_heads=8,
+        dim_head=128,
+        output_dim=42,
+        context_size=200,
+        max_pos_emb=512,
+        dropout=0.1,
+        conv_kernel_size=15,
+        conv_expansion_factor=2,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)  # remaining kwargs are handled by PretrainedConfig
+        self.input_dim = input_dim
+        self.num_layers = num_layers
+        self.hidden_dim = hidden_dim
+        self.feedforward_mult = feedforward_mult
+        self.num_heads = num_heads
+        self.dim_head = dim_head
+        self.output_dim = output_dim
+        self.context_size = context_size
+        self.dropout = dropout
+        self.conv_kernel_size = conv_kernel_size
+        self.conv_expansion_factor = conv_expansion_factor
+        self.max_pos_emb = max_pos_emb
+
+
+class GraniteSpeechConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`GraniteSpeechForConditionalGeneration`]. It is used to instantiate a
+    Granite Speech model according to the specified arguments, defining the model architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        text_config (`Union[AutoConfig, dict]`, *optional*, defaults to `GraniteConfig`):
+            The config object or dictionary of the text backbone.
+        encoder_config (`Union[GraniteSpeechEncoderConfig, dict]`, *optional*):
+            The config object or dictionary of the Granite Speech CTC Encoder.
+        projector_config (`Union[AutoConfig, dict]`, *optional*, defaults to `Blip2QFormerConfig`):
+            The config object or dictionary of the audio projector.
+        audio_token_index (`int`, *optional*, defaults to 49155):
+            The audio token index to encode the audio prompt.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        has_lora_adapter (`bool`, *optional*, defaults to `True`):
+            Indicates whether or not the model has a lora adapter that should only
+            be activated when processing audio inputs.
+        downsample_rate (`int`, *optional*, defaults to 5):
+            Downsample rate for the audio feature projector.
+        window_size (`int`, *optional*, defaults to 15):
+            Window size for the audio feature projector.
+
+    Example:
+
+    ```python
+    >>> from transformers import GraniteSpeechConfig, GraniteSpeechForConditionalGeneration
+
+    >>> # Initializing a GraniteSpeechConfig
+    >>> configuration = GraniteSpeechConfig()
+
+    >>> # Initializing a GraniteSpeechForConditionalGeneration (with random weights)
+    >>> model = GraniteSpeechForConditionalGeneration(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "granite_speech"
+    attribute_map = {
+        "audio_token_id": "audio_token_index",
+    }
+    sub_configs = {
+        "text_config": AutoConfig,
+        "encoder_config": GraniteSpeechEncoderConfig,
+        "projector_config": AutoConfig,
+    }
+
+    def __init__(
+        self,
+        text_config=None,
+        encoder_config=None,
+        projector_config=None,
+        audio_token_index=49155,
+        initializer_range=0.02,
+        has_lora_adapter=True,
+        downsample_rate=5,
+        window_size=15,
+        **kwargs,
+    ):
+        if isinstance(text_config, dict):
+            text_config = {"model_type": "granite", **text_config}  # copy so the caller's dict is not mutated
+            text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
+        elif text_config is None:
+            text_config = CONFIG_MAPPING["granite"]()
+
+        if isinstance(projector_config, dict):
+            projector_config = {"model_type": "blip_2_qformer", **projector_config}  # copy, same reason as above
+            projector_config = CONFIG_MAPPING[projector_config["model_type"]](**projector_config)
+        elif projector_config is None:
+            projector_config = CONFIG_MAPPING["blip_2_qformer"]()
+
+        if not isinstance(encoder_config, GraniteSpeechEncoderConfig):
+            encoder_config = {} if encoder_config is None else encoder_config  # None -> encoder defaults
+            encoder_config = GraniteSpeechEncoderConfig(**encoder_config)
+
+        self.text_config = text_config
+        self.encoder_config = encoder_config
+        self.projector_config = projector_config
+        self.audio_token_index = audio_token_index
+        self.initializer_range = initializer_range
+        self.has_lora_adapter = has_lora_adapter
+        self.downsample_rate = downsample_rate
+        self.window_size = window_size
+        super().__init__(**kwargs)
+
+
+__all__ = ["GraniteSpeechEncoderConfig", "GraniteSpeechConfig"]  # names exported by `from ... import *`
diff --git a/phivenv/Lib/site-packages/transformers/models/granite_speech/feature_extraction_granite_speech.py b/phivenv/Lib/site-packages/transformers/models/granite_speech/feature_extraction_granite_speech.py
new file mode 100644
index 0000000000000000000000000000000000000000..7528fc7ea5bd9efa6ae322d7fd2e40b567855359
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/granite_speech/feature_extraction_granite_speech.py
@@ -0,0 +1,186 @@
+# coding=utf-8
+# Copyright 2025 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Feature extractor class for Granite Speech."""
+
+import math
+from collections.abc import Sequence
+from typing import Optional
+
+import numpy as np
+
+from ...feature_extraction_utils import BatchFeature, FeatureExtractionMixin
+from ...tokenization_utils_base import AudioInput
+from ...utils import is_torch_available, is_torchaudio_available, logging
+from ...utils.import_utils import requires_backends
+
+
+logger = logging.get_logger(__name__)
+
+if is_torch_available():
+    import torch
+
+if is_torchaudio_available():
+    import torchaudio
+
+
+class GraniteSpeechFeatureExtractor(FeatureExtractionMixin):  # raw audio -> stacked log-mel features, embed sizes, mask
+    model_input_names = ["input_features"]
+
+    def __init__(
+        self,
+        sampling_rate: int = 16000,
+        n_fft: int = 512,
+        win_length: int = 400,
+        hop_length: int = 160,
+        n_mels: int = 80,
+        projector_window_size: int = 15,
+        projector_downsample_rate: int = 5,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.sampling_rate = sampling_rate
+        self.melspec_kwargs = {  # forwarded verbatim to torchaudio's MelSpectrogram below
+            "sample_rate": sampling_rate,
+            "n_fft": n_fft,
+            "win_length": win_length,
+            "hop_length": hop_length,
+            "n_mels": n_mels,
+        }
+        requires_backends(self, ["torchaudio"])
+        self.mel_filters = torchaudio.transforms.MelSpectrogram(**self.melspec_kwargs)
+        self.projector_window_size = projector_window_size
+        self.projector_downsample_rate = projector_downsample_rate
+
+    def __call__(
+        self,
+        audios: AudioInput,
+        device: Optional[str] = "cpu",
+    ) -> BatchFeature:  # keys: "input_features", "audio_embed_sizes", "input_features_mask"
+        requires_backends(self, ["torchaudio"])
+
+        speech_inputs = {}
+        batched_audio, audio_lengths = self._get_audios_and_audio_lengths(audios)  # batch tensor + per-sample lengths
+        speech_inputs["input_features"] = self._extract_mel_spectrograms(
+            batched_audio,
+            device=device,
+        )
+        audio_embed_sizes = self._get_num_audio_features(audio_lengths)  # projector output length per sample
+        speech_inputs["audio_embed_sizes"] = audio_embed_sizes
+        # TODO (@alex-jw-brooks): Currently input_features_mask is not
+        # a great name, because input_features and input_features_mask
+        # have different shapes (before/after the projector).
+        #
+        # We should align this with other multimodal models, e.g., llava
+        # and qwen2audio and refactor this to ensure input_feature_mask
+        # has the same dimensionality as input_features, or compute it in
+        # the model based on the audio embedding sizes (since we do not
+        # have an attention mask for the audio features to infer padding from).
+        speech_inputs["input_features_mask"] = torch.arange(max(audio_embed_sizes)).view(1, -1) < torch.tensor(
+            audio_embed_sizes
+        ).view(-1, 1)  # mask[i, j] is True iff j < audio_embed_sizes[i]
+        return BatchFeature(data=speech_inputs)
+
+    def _extract_mel_spectrograms(self, audio: "torch.Tensor", device="cpu"):
+        """
+        Compute the Mel features to be passed to the conformer encoder.
+        """
+        requires_backends(self, ["torchaudio"])
+        if device is not None:
+            melspec = self.mel_filters.to(device)
+            audio = audio.to(device)
+        else:
+            melspec = self.mel_filters  # device=None: run wherever the transform and audio already live
+
+        bsz = audio.shape[0]
+        with torch.no_grad():
+            # Compute mel features
+            mel = melspec(audio.float())
+            logmel = mel.transpose(-1, -2).clip_(min=1e-10).log10_()  # floor at 1e-10 before taking log10
+            mx = logmel.amax(dim=(-2, -1), keepdim=True)  # max over the last two dims
+            logmel = torch.maximum(logmel, mx - 8.0).div_(4).add_(1)  # clamp to >= max - 8, then x / 4 + 1
+            # remove last frame if odd
+            if logmel.shape[1] % 2 == 1:
+                logmel = logmel[:, :-1]
+
+            # stacking and skipping by 2
+            audio = logmel.reshape(bsz, -1, 2 * logmel.shape[-1])  # pairs of adjacent frames are concatenated
+
+        return audio
+
+    def _get_num_audio_features(self, audio_lengths: Sequence[int]) -> Sequence[int]:
+        """
+        Gets the (variable length) number of features (i.e., projector output) for the sequences
+        being considered.
+
+        Args:
+            audio_lengths (`Sequence[int]`):
+                Sequence of one or more raw audio lengths.
+        """
+        hop_length = self.melspec_kwargs["hop_length"]
+        effective_window_size = self.projector_window_size // self.projector_downsample_rate  # outputs per window
+
+        projector_lengths = []
+        for raw_length in audio_lengths:
+            # mel sequence length computation
+            mel_length = raw_length // hop_length + 1
+            # encoder frame takes two mel features
+            encoder_length = mel_length // 2
+            nblocks = math.ceil(encoder_length / self.projector_window_size)  # partial windows count as a full one
+            # projector output length
+            projector_length = nblocks * effective_window_size
+            projector_lengths.append(projector_length)
+
+        return projector_lengths
+
+    def _get_audios_and_audio_lengths(self, audios: AudioInput) -> tuple["torch.Tensor", Sequence[int]]:
+        """
+        Coerces audio inputs to torch tensors and extracts audio lengths prior to stacking.
+
+        Args:
+            audios (`AudioInput`):
+                Audio sequence, numpy array, or torch tensor.
+        """
+        requires_backends(self, ["torch"])
+
+        # Coerce to PyTorch tensors if we have numpy arrays, since
+        # currently we have a dependency on torch/torchaudio anyway
+        if isinstance(audios, np.ndarray):
+            audios = torch.from_numpy(audios)
+        elif isinstance(audios, Sequence) and isinstance(audios[0], np.ndarray):  # only the first item is inspected
+            audios = [torch.from_numpy(arr) for arr in audios]
+
+        if isinstance(audios, torch.Tensor):
+            if audios.ndim == 1:
+                audios = audios.unsqueeze(0)  # single waveform -> batch of one
+            if not torch.is_floating_point(audios):
+                raise ValueError("Invalid audio provided. Audio should be a floating point between 0 and 1")
+
+            if audios.shape[0] > 1:
+                logger.warning("Audio samples are already collated; assuming they all have the same length")
+            lengths = [audios.shape[-1]] * audios.shape[0]  # every row is given the full last-dim length
+            return audios, lengths
+
+        elif isinstance(audios, Sequence) and isinstance(audios[0], torch.Tensor):
+            if not torch.is_floating_point(audios[0]):  # dtype is checked on the first item only
+                raise ValueError("Invalid audio provided. Audio should be a floating point between 0 and 1")
+            lengths = [audio.shape[-1] for audio in audios]
+            audios = [audio.squeeze(0) for audio in audios]  # drop a leading singleton dim, if any, before padding
+            audios = torch.nn.utils.rnn.pad_sequence(audios, batch_first=True, padding_value=0.0)
+            return audios, lengths
+
+        raise TypeError("Invalid audio provided. Audio should be a one or more torch tensors or numpy arrays")
+
+
+__all__ = ["GraniteSpeechFeatureExtractor"]  # names exported by `from ... import *`
diff --git a/phivenv/Lib/site-packages/transformers/models/granite_speech/modeling_granite_speech.py b/phivenv/Lib/site-packages/transformers/models/granite_speech/modeling_granite_speech.py
new file mode 100644
index 0000000000000000000000000000000000000000..c8567916751e6718f3f6066f9b599f106045daf5
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/granite_speech/modeling_granite_speech.py
@@ -0,0 +1,578 @@
+# coding=utf-8
+# Copyright 2025 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+from dataclasses import dataclass
+from typing import Optional, Union
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from ...cache_utils import Cache
+from ...generation import GenerationMixin
+from ...modeling_outputs import ModelOutput
+from ...modeling_utils import PreTrainedModel
+from ...utils import auto_docstring, is_peft_available, logging
+from ..auto import AutoModel, AutoModelForCausalLM
+from .configuration_granite_speech import GraniteSpeechConfig, GraniteSpeechEncoderConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Base class for Granite Speech causal language model (or autoregressive) outputs.
+    """
+)
+class GraniteSpeechCausalLMOutputWithPast(ModelOutput):
+    r"""
+    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+        Language modeling loss (for next-token prediction).
+    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
+        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
+        `(batch_size, num_heads, sequence_length, embed_size_per_head)`)
+
+        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
+        `past_key_values` input) to speed up sequential decoding.
+    """
+
+    loss: Optional[torch.FloatTensor] = None
+    logits: torch.FloatTensor = None
+    past_key_values: Optional[list[torch.FloatTensor]] = None
+    hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    attentions: Optional[tuple[torch.FloatTensor]] = None
+
+
+### Projector
+class GraniteSpeechEncoderProjector(nn.Module):  # windowed query-based projector: encoder states -> text hidden size
+    def __init__(self, config: GraniteSpeechConfig):
+        super().__init__()
+        self.hidden_size = config.projector_config.hidden_size
+        self.downsample_rate = config.downsample_rate
+        self.window_size = config.window_size
+        self.num_queries = config.window_size // config.downsample_rate  # query vectors emitted per window
+
+        self.query = nn.Parameter(torch.zeros(1, self.num_queries, config.projector_config.hidden_size))
+        self.query.data.normal_(mean=0.0, std=1.0)
+
+        # By default, this will be a blip_2_qformer config
+        self.qformer = AutoModel.from_config(config.projector_config)
+        self.linear = nn.Linear(config.projector_config.hidden_size, config.text_config.hidden_size)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        batch_size, seq_len, dim = hidden_states.size()
+        nblocks = math.ceil(seq_len / self.window_size)
+        pad = nblocks * self.window_size - seq_len  # frames needed to complete the last window
+        hidden_states = nn.functional.pad(hidden_states, (0, 0, 0, pad), "constant", 0)  # zero-pad the sequence dim
+        hidden_states = hidden_states.view(batch_size * nblocks, self.window_size, dim)  # one row per window
+
+        query_output = self.qformer(
+            query_embeds=self.query,
+            encoder_hidden_states=hidden_states,
+            encoder_attention_mask=None,
+            return_dict=True,
+        )
+        query_proj = self.linear(  # use num_queries so the shape is right even if window_size % downsample_rate != 0
+            query_output.last_hidden_state.view(batch_size, nblocks * self.num_queries, -1)
+        )
+        return query_proj
+
+
+### Encoder - conformer is adapted from: https://github.com/lucidrains/conformer.git
+class GraniteSpeechConformerFeedForward(nn.Module):
+    """Feedforward module for conformer encoder blocks: LayerNorm -> up-proj -> SiLU -> down-proj, with dropout."""
+
+    def __init__(self, config: GraniteSpeechEncoderConfig):
+        super().__init__()
+        self.pre_norm = nn.LayerNorm(config.hidden_dim)
+        self.up_proj = nn.Linear(config.hidden_dim, config.hidden_dim * config.feedforward_mult)
+        self.silu = nn.SiLU()
+        self.dropout = nn.Dropout(config.dropout)  # one module, applied twice in forward
+        self.down_proj = nn.Linear(config.hidden_dim * config.feedforward_mult, config.hidden_dim)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.pre_norm(hidden_states)
+        hidden_states = self.up_proj(hidden_states)
+        hidden_states = self.dropout(self.silu(hidden_states))
+        hidden_states = self.down_proj(hidden_states)  # back to hidden_dim
+        hidden_states = self.dropout(hidden_states)
+        return hidden_states  # no residual here; the caller adds it
+
+
+class GraniteSpeechConformerAttention(nn.Module):
+    """Attention for conformer blocks using Shaw's relative positional embeddings.
+    See the following [paper](https://huggingface.co/papers/1803.02155) for more details.
+    """
+
+    def __init__(self, config: GraniteSpeechEncoderConfig):
+        super().__init__()
+
+        inner_dim = config.dim_head * config.num_heads
+        self.max_pos_emb = config.max_pos_emb
+        self.context_size = config.context_size
+        self.num_heads = config.num_heads
+        self.dim_head = config.dim_head
+        self.scale = self.dim_head**-0.5  # 1 / sqrt(dim_head)
+        self.pre_norm = nn.LayerNorm(config.hidden_dim)
+        self.to_q = nn.Linear(config.hidden_dim, inner_dim, bias=False)
+        self.to_kv = nn.Linear(config.hidden_dim, inner_dim * 2, bias=False)  # keys and values in one projection
+        self.to_out = nn.Linear(inner_dim, config.hidden_dim)
+        self.rel_pos_emb = nn.Embedding(2 * self.max_pos_emb + 1, self.dim_head)  # indexed by `attention_dists`
+        self.dropout = nn.Dropout(config.dropout)
+
+        if self.context_size <= 0 or self.context_size > self.max_pos_emb:  # rejects 0 as well as negatives
+            raise ValueError("Context size is either less than 0 or exceeds the max_pos_emb")
+
+    def forward(self, hidden_states: torch.Tensor, attention_dists: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.pre_norm(hidden_states)
+        bsz, num_features, _ = hidden_states.shape
+
+        num_blocks = math.ceil(num_features / self.context_size)  # attention is computed per block of context_size
+        remainder = num_features % self.context_size
+        if remainder > 0:
+            # right padding to reach block size
+            hidden_states = torch.nn.functional.pad(hidden_states, (0, 0, 0, self.context_size - remainder))
+
+        query_states = self.to_q(hidden_states)
+        key_states, value_states = self.to_kv(hidden_states).chunk(2, dim=-1)
+
+        query_states = query_states.reshape(bsz, num_blocks, self.context_size, self.num_heads, -1).transpose(2, 3)
+        key_states = key_states.reshape(bsz, num_blocks, self.context_size, self.num_heads, -1).transpose(2, 3)
+        value_states = value_states.reshape(bsz, num_blocks, self.context_size, self.num_heads, -1).transpose(2, 3)
+
+        # shaw's relative positional embedding
+        rel_pos_emb = self.rel_pos_emb(attention_dists)
+        # alternative computation of `pos_attn` - for readability
+        # rel_pos_emb_expanded = rel_pos_emb.view([1, 1, 1] + list(rel_pos_emb.shape))
+        # pos_attn = torch.sum(query_states.unsqueeze(-2) * rel_pos_emb_expanded, dim=-1) * self.scale
+        # einsum implementation of pos_attn - gives x30 speedup over the alternative
+        # TODO (@avihu111) find a fast alternative to einsum
+        pos_attn = torch.einsum("b m h c d, c r d -> b m h c r", query_states, rel_pos_emb) * self.scale
+
+        if remainder > 0:
+            # masked attention in the extended block
+            mask = torch.ones(self.context_size, self.context_size, dtype=bool, device=hidden_states.device)
+            mask[:remainder, :remainder] = 0  # only real-frame x real-frame pairs stay unmasked
+            mask_value = -torch.finfo(pos_attn.dtype).max  # most negative finite value of the dtype
+            pos_attn[:, -1, :].masked_fill_(mask, mask_value)  # in place, last block only, all heads
+
+        with torch.nn.attention.sdpa_kernel(torch.nn.attention.SDPBackend.MATH):  # restrict SDPA to the math backend
+            out = F.scaled_dot_product_attention(
+                query_states, key_states, value_states, attn_mask=pos_attn, scale=self.scale
+            )
+        out = out.transpose(2, 3).reshape(bsz, hidden_states.shape[1], -1)  # merge blocks and heads back together
+        out = self.to_out(out[:, :num_features, :])  # drop the right-padded frames before projecting
+        return self.dropout(out)
+
+
+class GraniteSpeechConformerDepthWiseConv1d(nn.Module):
+    """Wrapper for padded 1D depthwise convolution (`groups=chan_in`)."""
+
+    def __init__(self, chan_in: int, chan_out: int, kernel_size: int):
+        super().__init__()
+        # Padding for the 1D conv is symmetric or close (i.e., offset by one).
+        pad = kernel_size // 2
+        pad_offset = (kernel_size + 1) % 2  # 1 for even kernel sizes, 0 for odd ones
+        self.padding = (pad, pad - pad_offset)  # (left, right); totals kernel_size - 1
+
+        self.conv = nn.Conv1d(chan_in, chan_out, kernel_size, groups=chan_in, bias=False)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states = F.pad(hidden_states, self.padding)  # pads the last dim so the conv keeps its length
+        return self.conv(hidden_states)
+
+
+class GraniteSpeechConformerConvModule(nn.Module):
+    """Conformer conv module consisting of several 1D/depthwise 1D convolutional layers."""
+
+    def __init__(self, config: GraniteSpeechEncoderConfig):
+        super().__init__()
+        inner_dim = config.hidden_dim * config.conv_expansion_factor
+
+        self.norm = nn.LayerNorm(config.hidden_dim)
+        self.up_conv = nn.Conv1d(config.hidden_dim, inner_dim * 2, 1)  # kernel size 1; 2x channels for the GLU
+        self.glu = nn.GLU(dim=1)  # gates over the channel dim, halving it back to inner_dim
+        self.depth_conv = GraniteSpeechConformerDepthWiseConv1d(
+            inner_dim,
+            inner_dim,
+            kernel_size=config.conv_kernel_size,
+        )
+        self.silu = nn.SiLU()
+        self.batch_norm = nn.BatchNorm1d(inner_dim)
+        self.down_conv = nn.Conv1d(inner_dim, config.hidden_dim, 1)  # kernel size 1; back to hidden_dim
+        self.dropout = nn.Dropout(config.dropout)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.norm(hidden_states)
+        hidden_states = self.up_conv(hidden_states.permute(0, 2, 1))  # swap dims 1 and 2 for Conv1d
+        hidden_states = self.glu(hidden_states)
+        hidden_states = self.depth_conv(hidden_states)
+        hidden_states = self.silu(self.batch_norm(hidden_states))  # batch norm first, then SiLU
+        hidden_states = self.down_conv(hidden_states).permute(0, 2, 1)  # restore the input dim order
+        hidden_states = self.dropout(hidden_states)
+        return hidden_states  # no residual here; the caller adds it
+
+
+class GraniteSpeechConformerBlock(nn.Module):
+    """Conformer block: half-step feedforward, attention, conv module, half-step feedforward, then LayerNorm."""
+
+    def __init__(self, config: GraniteSpeechEncoderConfig):
+        super().__init__()
+        self.ff1 = GraniteSpeechConformerFeedForward(config)
+        self.attn = GraniteSpeechConformerAttention(config)
+        self.conv = GraniteSpeechConformerConvModule(config)
+        self.ff2 = GraniteSpeechConformerFeedForward(config)
+        self.post_norm = nn.LayerNorm(config.hidden_dim)
+
+    def forward(self, hidden_states: torch.Tensor, attention_dists: torch.Tensor) -> torch.Tensor:
+        hidden_states = 0.5 * self.ff1(hidden_states) + hidden_states  # feedforward output is halved, then residual
+        hidden_states = self.attn(hidden_states, attention_dists=attention_dists) + hidden_states
+        hidden_states = self.conv(hidden_states) + hidden_states
+        hidden_states = 0.5 * self.ff2(hidden_states) + hidden_states  # second half-weighted feedforward
+        hidden_states = self.post_norm(hidden_states)
+        return hidden_states
+
+
+class GraniteSpeechCTCEncoder(nn.Module):  # input projection + stack of conformer blocks with a mid-stack feedback
+    def __init__(self, config: GraniteSpeechEncoderConfig):
+        super().__init__()
+        self.config = config
+
+        # Precompute clamped relative positional encoding distances
+        seq = torch.arange(config.context_size)
+        relpos_dist = seq.view(-1, 1) - seq.view(1, -1)  # entry [i, j] is i - j
+        attention_dists = torch.clamp(relpos_dist, -config.context_size, config.context_size) + config.max_pos_emb
+        self.register_buffer("attention_dists", attention_dists, persistent=False)  # not saved in the state dict
+        self.input_linear = nn.Linear(config.input_dim, config.hidden_dim, bias=True)
+        self.layers = nn.ModuleList([GraniteSpeechConformerBlock(config) for _ in range(config.num_layers)])
+
+        self.out = nn.Linear(config.hidden_dim, config.output_dim, bias=True)
+        self.out_mid = nn.Linear(config.output_dim, config.hidden_dim, bias=True)  # maps `out` back to hidden_dim
+        self.num_layers = config.num_layers
+
+    def forward(self, hidden_states: torch.Tensor):
+        hidden_states = self.input_linear(hidden_states)
+        for idx, layer in enumerate(self.layers, start=1):  # 1-based so the check below fires after the middle block
+            hidden_states = layer(hidden_states, attention_dists=self.attention_dists)
+
+            if idx == self.num_layers // 2:
+                hidden_states_mid = hidden_states.clone()
+                hidden_states_mid = self.out(hidden_states_mid)
+                hidden_states += self.out_mid(nn.Softmax(dim=-1)(hidden_states_mid))  # in-place residual add
+        return hidden_states
+
+
+@auto_docstring
+class GraniteSpeechPreTrainedModel(PreTrainedModel):
+    config: GraniteSpeechConfig
+
+    _supports_flash_attn = False  # `blip_2_qformer` dependency does not allow for this
+    _supports_sdpa = True
+
+    def _init_weights(self, module: nn.Module):
+        """Initialize the weights of `module` according to its type, using `config.initializer_range` as std."""
+        std = self.config.initializer_range
+
+        if isinstance(module, (nn.Linear, nn.Conv1d)):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()  # keep the padding row at zero
+        elif isinstance(module, (nn.LayerNorm, nn.BatchNorm1d)):
+            module.weight.data.fill_(1.0)  # normalisation layers start with scale 1, shift 0
+            module.bias.data.zero_()
+        elif isinstance(module, GraniteSpeechEncoderProjector):
+            module.query.data.normal_()  # called with no arguments, unlike the `std`-scaled branches above
+
+
+@auto_docstring(
+    custom_intro="""
+    The Granite Speech model, which consists of an audio encoder, projector, and language model.
+    """
+)
+class GraniteSpeechForConditionalGeneration(GraniteSpeechPreTrainedModel, GenerationMixin):
+    def __init__(self, config: GraniteSpeechConfig):
+        super().__init__(config)
+        # NOTE: It doesn't matter when we initialize from config, but we should be careful
+        # to make sure this does not pick up the adapter_config if in the future we use
+        # from_pretrained or something similar, since that should be set by the composite
+        # model; don't need to consider it twice
+        self.language_model = AutoModelForCausalLM.from_config(config.text_config)
+
+        if self.language_model._tied_weights_keys is not None:
+            self._tied_weights_keys = [f"language_model.{k}" for k in self.language_model._tied_weights_keys]
+
+        self.encoder = GraniteSpeechCTCEncoder(config.encoder_config)
+        self.projector = GraniteSpeechEncoderProjector(config)
+
+        if config.has_lora_adapter and not is_peft_available():
+            logger.warning(
+                "Config indicates that a lora adapter should be present, but "
+                "peft is not installed; this will cause the model to perform "
+                "incorrectly when audio inputs are provided. Please install "
+                "peft and reload the model!"
+            )
+
+        self.post_init()
+
+    def set_input_embeddings(self, value):
+        self.language_model.set_input_embeddings(value)
+
+    def set_output_embeddings(self, new_embeddings):
+        self.language_model.set_output_embeddings(new_embeddings)
+
+    def get_input_embeddings(self):
+        return self.language_model.get_input_embeddings()
+
+    def get_output_embeddings(self):
+        return self.language_model.get_output_embeddings()
+
+    def get_audio_features(self, input_features: torch.Tensor) -> torch.Tensor:
+        """Get the audio features to be merged into the multimodal embeddings (encoder, then projector)."""
+        encoder_embeds = self.encoder(input_features)
+        projected_embeds = self.projector(encoder_embeds)
+        return projected_embeds
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        input_features: Optional[torch.FloatTensor] = None,
+        input_features_mask: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
+        **lm_kwargs,
+    ) -> Union[tuple[torch.Tensor], GraniteSpeechCausalLMOutputWithPast]:
+        r"""
+        input_features_mask (`torch.Tensor`, *optional*):
+            Mask to be applied to audio features prior to scattering into the language embeddings.
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+        """
+        # TODO (@alex-jw-brooks) add an example to this docstring once models are released
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+        if input_features is not None and inputs_embeds is not None:
+            raise ValueError(
+                "You cannot specify both input_features and inputs_embeds at the same time, and must specify either one"
+            )
+
+        if inputs_embeds is None:
+            # Get the base embeddings; set all audio tokens to 0 index
+            # to avoid out of vocabulary issues with the LLM embedding.
+            # Audio features will be masked into is_audio_idx indices later.
+            is_audio_idx = input_ids == self.config.audio_token_id
+            llm_input_ids = input_ids.clone()
+            llm_input_ids[is_audio_idx] = 0
+            inputs_embeds = self.get_input_embeddings()(llm_input_ids)
+
+        if input_features is not None:
+            if input_features.dtype != self.dtype:
+                input_features = input_features.to(self.dtype)
+            # Get the audio features from the encoder / projector
+            audio_embeds = self.get_audio_features(input_features)
+
+            # Merge the audio features into the LLM embeddings
+            inputs_embeds = self.get_merged_audio_embeddings(
+                input_ids=input_ids,
+                audio_features=audio_embeds,
+                input_features_mask=input_features_mask,
+            )
+
+        outputs = self.language_model(
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            cache_position=cache_position,
+            logits_to_keep=logits_to_keep,
+            **lm_kwargs,
+        )
+        logits = outputs[0]
+
+        loss = None
+        if labels is not None:
+            # Shift so that tokens < n predict n
+            if attention_mask is not None:
+                # we use the input attention mask to shift the logits and labels, because it is 2D.
+                # we also crop attn mask in case it is longer, which happens in PrefixTuning with peft
+                shift_attention_mask = attention_mask[:, -(logits.shape[1] - 1) :].to(logits.device)
+                shift_logits = logits[..., :-1, :][shift_attention_mask.to(logits.device) != 0].contiguous()
+                shift_labels = labels[..., 1:][shift_attention_mask.to(labels.device) != 0].contiguous()
+            else:
+                shift_logits = logits[..., :-1, :].contiguous()
+                shift_labels = labels[..., 1:].contiguous()
+            # Flatten the tokens
+            loss_fct = nn.CrossEntropyLoss()
+            loss = loss_fct(
+                shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1).to(shift_logits.device)
+            )
+
+        if not return_dict:
+            output = (logits,) + outputs[1:]
+            return (loss,) + output if loss is not None else output
+
+        return GraniteSpeechCausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+    def prepare_inputs_for_generation(
+        self,
+        input_ids,
+        past_key_values=None,
+        inputs_embeds=None,
+        input_features=None,
+        attention_mask=None,
+        cache_position=None,
+        logits_to_keep=None,
+        **kwargs,
+    ):
+        # Overwritten -- in specific circumstances we don't want to forward audio inputs to the model
+
+        model_inputs = self.language_model.prepare_inputs_for_generation(
+            input_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            cache_position=cache_position,
+            logits_to_keep=logits_to_keep,
+            **kwargs,
+        )
+
+        # If we're in cached decoding stage, input_features should be None because
+        # input ids do not contain special audio token anymore. Otherwise we need
+        # input feature values to be passed to the model
+        if cache_position[0] == 0:
+            model_inputs["input_features"] = input_features
+        return model_inputs
+
+    def get_merged_audio_embeddings(
+        self, input_ids: torch.Tensor, audio_features: torch.Tensor, input_features_mask: Optional[torch.Tensor] = None
+    ) -> torch.Tensor:
+        """
+        Build the multimodal input embeddings: embed the text tokens (audio token ids are
+        mapped to index 0 for the lookup), then overwrite every position that held
+        `config.audio_token_id` with the given audio features via `masked_scatter`.
+
+        Args:
+            input_ids (`torch.Tensor`):
+                Input IDs containing one or more audio tokens.
+            audio_features (`torch.Tensor`):
+                Audio features to be masked into the language embeddings to form multimodal embeddings.
+            input_features_mask (`torch.Tensor`, *optional*, defaults to `None`):
+                Mask to be applied to audio features prior to scattering into the language embeddings.
+        """
+        is_audio_index = input_ids == self.config.audio_token_id
+        llm_input_ids = torch.where(is_audio_index, 0, input_ids)
+        inputs_embeds = self.language_model.get_input_embeddings()(llm_input_ids)  # [bsz, # features, hidden size]
+
+        # Mask the audio features into the text embeddings
+        special_audio_mask = is_audio_index.unsqueeze(-1)
+        audio_features = audio_features.to(inputs_embeds.device, inputs_embeds.dtype)
+        if input_features_mask is not None:
+            # NOTE(review): this only raises when *every* row mismatches; confirm `torch.any` was not intended
+            if torch.all(is_audio_index.int().sum(dim=1) != input_features_mask.int().sum(dim=1)).item():
+                raise ValueError("Number of audio tokens does not match number of audio features")
+            audio_features = audio_features[input_features_mask]
+
+        inputs_embeds = inputs_embeds.masked_scatter(
+            special_audio_mask,
+            audio_features,
+        )
+        return inputs_embeds
+
+    def generate(self, *args, **kwargs) -> torch.LongTensor:
+        """
+        Generate with the lora adapter toggled per call: when a peft config is loaded, adapters are
+        enabled if `input_features` (audio) were passed and disabled for inputs without them.
+        """
+        input_features = kwargs.pop("input_features", None)
+        if is_peft_available() and self._hf_peft_config_loaded:
+            if input_features is not None:
+                self.enable_adapters()
+            else:
+                self.disable_adapters()
+        return super().generate(*args, input_features=input_features, **kwargs)
+
+    def save_pretrained(self, save_directory, *args, **kwargs):
+        """Save the adapter first if one is loaded, then save the base model into the same directory."""
+        if is_peft_available() and self._hf_peft_config_loaded:
+            adapter_name = self._get_adapter_name()
+            self.peft_config[adapter_name].base_model_name_or_path = save_directory
+            super().save_pretrained(save_directory, *args, **kwargs)
+        prev_val = self._hf_peft_config_loaded
+        self._hf_peft_config_loaded = False  # cleared only for the base-model save below
+        try:
+            super().save_pretrained(save_directory, *args, **kwargs)
+        finally:  # restore the flag even when saving fails
+            self._hf_peft_config_loaded = prev_val
+
+    @staticmethod
+    def _fix_state_dict_key_on_save(key) -> tuple[str, bool]:
+        # save the model with the original weights format
+        return key.replace(".base_layer", ""), False
+
+    def _fix_state_dict_keys_on_save(self, state_dict):
+        if is_peft_available() and self._hf_peft_config_loaded:
+            # state dict is only adapter, should keep the same
+            return state_dict
+        # rename back the base model state dict
+        return {
+            self._fix_state_dict_key_on_save(key)[0]: value for key, value in state_dict.items() if ".lora_" not in key
+        }
+
+    def _get_adapter_name(self):
+        return list(self.peft_config.keys())[0]  # first key of `peft_config`
+
+
+__all__ = [  # public names exported by this module
+    "GraniteSpeechCTCEncoder",
+    "GraniteSpeechForConditionalGeneration",
+    "GraniteSpeechPreTrainedModel",
+]
diff --git a/phivenv/Lib/site-packages/transformers/models/granite_speech/processing_granite_speech.py b/phivenv/Lib/site-packages/transformers/models/granite_speech/processing_granite_speech.py
new file mode 100644
index 0000000000000000000000000000000000000000..84515d173c471198b987081198aeeed9415252c9
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/granite_speech/processing_granite_speech.py
@@ -0,0 +1,104 @@
+# coding=utf-8
+# Copyright 2025 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Processor class for Granite Speech."""
+
+from typing import Union
+
+from ...feature_extraction_utils import BatchFeature
+from ...processing_utils import ProcessorMixin
+from ...tokenization_utils import PreTokenizedInput, TextInput
+from ...utils import is_torch_available, logging
+from ...utils.import_utils import requires_backends
+
+
+if is_torch_available():
+    import torch
+
+logger = logging.get_logger(__name__)
+
+
+class GraniteSpeechProcessor(ProcessorMixin):
+    attributes = ["audio_processor", "tokenizer"]
+    audio_processor_class = "GraniteSpeechFeatureExtractor"
+    tokenizer_class = "AutoTokenizer"
+
+    def __init__(
+        self,
+        audio_processor,
+        tokenizer,
+        audio_token="<|audio|>",
+        chat_template=None,
+    ):
+        self.audio_token = getattr(tokenizer, "audio_token", audio_token)  # the tokenizer's own token wins
+        super().__init__(audio_processor, tokenizer, chat_template=chat_template)
+
+    def __call__(
+        self,
+        text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]],
+        audio: Union["torch.Tensor", list["torch.Tensor"], None] = None,
+        device: str = "cpu",
+        images=None,
+        videos=None,
+        **kwargs,
+    ) -> BatchFeature:
+        """Tokenize `text`; with `audio`, repeat the i-th audio token `audio_embed_sizes[i]` times first."""
+        requires_backends(self, ["torch"])
+        text = self._get_validated_text(text)
+        prompt_strings = text
+
+        if audio is not None:
+            # NOTE - we intentionally avoid throwing for potentially misaligned
+            # text / audio inputs here because some inference engines will
+            # trigger the conditions due to the way they call multimodal
+            # processors, e.g., vLLM.
+            audio_inputs = self.audio_processor(audio, device=device)
+
+            # TODO (@alex-jw-brooks); we should add a util to get_num_audio_tokens
+            # from feature lengths and call it here, rather than returning it
+            # from the feature extractor.
+            audio_embed_sizes = audio_inputs.pop("audio_embed_sizes")
+
+            # Expand the audio placeholders to match the feature dims; this
+            # is similar to how many VLMs handle image tokens, e.g., llava next
+            prompt_strings = []
+            num_replaced = 0
+            for sample in text:
+                while self.audio_token in sample:
+                    sample = sample.replace(
+                        self.audio_token,
+                        "<placeholder>" * audio_embed_sizes[num_replaced],
+                        1,
+                    )
+                    num_replaced += 1
+                prompt_strings.append(sample)
+
+            prompt_strings = [sample.replace("<placeholder>", self.audio_token) for sample in prompt_strings]
+        else:
+            audio_inputs = {}
+
+        kwargs.setdefault("padding", True)  # pad by default unless the caller chose otherwise
+        text_inputs = self.tokenizer(prompt_strings, **kwargs)
+        return BatchFeature(data={**text_inputs, **audio_inputs})
+
+    def _get_validated_text(self, text: Union[str, list]) -> list[str]:
+        """Wrap a string in a list or pass a list through (only item 0 is checked); else raise `TypeError`."""
+        if isinstance(text, str):
+            return [text]
+        elif isinstance(text, list) and text and isinstance(text[0], str):  # an empty list is rejected too
+            return text
+        raise TypeError("Invalid text provided! Text should be a string or list of strings.")
+
+
+__all__ = ["GraniteSpeechProcessor"]  # public names exported by this module
diff --git a/phivenv/Lib/site-packages/transformers/models/granitemoe/__init__.py b/phivenv/Lib/site-packages/transformers/models/granitemoe/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..c85333f70b5ee330c9798da1ad624c44c65fc8d4
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/granitemoe/__init__.py
@@ -0,0 +1,27 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:  # static type checkers get the real star-imports
+    from .configuration_granitemoe import *
+    from .modeling_granitemoe import *
+else:  # at runtime this module's entry in `sys.modules` is swapped for a `_LazyModule`
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/phivenv/Lib/site-packages/transformers/models/granitemoe/__pycache__/__init__.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/granitemoe/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..12a43dbbfaea5b2155b783de38afcdaffe7223d4
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/granitemoe/__pycache__/__init__.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/granitemoe/__pycache__/configuration_granitemoe.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/granitemoe/__pycache__/configuration_granitemoe.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3d2932c040c0c790a08b04e9565fce4cb12f3f40
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/granitemoe/__pycache__/configuration_granitemoe.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/granitemoe/__pycache__/modeling_granitemoe.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/granitemoe/__pycache__/modeling_granitemoe.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..613a97ccfab085963ec6560b87081a6877b5b51a
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/granitemoe/__pycache__/modeling_granitemoe.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/granitemoe/configuration_granitemoe.py b/phivenv/Lib/site-packages/transformers/models/granitemoe/configuration_granitemoe.py
new file mode 100644
index 0000000000000000000000000000000000000000..0fb8dbe16f7d322c1a6823dffebd3bf8a9d568b0
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/granitemoe/configuration_granitemoe.py
@@ -0,0 +1,196 @@
+# coding=utf-8
+# Copyright 2024 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""GraniteMoe model configuration"""
+
+from ...configuration_utils import PretrainedConfig
+from ...modeling_rope_utils import rope_config_validation
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class GraniteMoeConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`GraniteMoeModel`]. It is used to instantiate a
+    GraniteMoe model according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of the GraniteMoe-3B.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 32000):
+            Vocabulary size of the GraniteMoe model. Defines the number of different tokens that can be represented by the
+            `input_ids` passed when calling [`GraniteMoeModel`]
+        hidden_size (`int`, *optional*, defaults to 4096):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 11008):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 32):
+            Number of hidden layers in the Transformer decoder.
+        num_attention_heads (`int`, *optional*, defaults to 32):
+            Number of attention heads for each attention layer in the Transformer decoder.
+        num_key_value_heads (`int`, *optional*):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details, check out [this
+            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
+            `num_attention_heads`.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the decoder.
+        max_position_embeddings (`int`, *optional*, defaults to 2048):
+            The maximum sequence length that this model might ever be used with.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        pad_token_id (`int`, *optional*):
+            Padding token id.
+        bos_token_id (`int`, *optional*, defaults to 1):
+            Beginning of stream token id.
+        eos_token_id (`int`, *optional*, defaults to 2):
+            End of stream token id.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether to tie weight embeddings
+        rope_theta (`float`, *optional*, defaults to 10000.0):
+            The base period of the RoPE embeddings.
+        rope_scaling (`Dict`, *optional*):
+            Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
+            strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is
+            `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
+            `max_position_embeddings` to the expected new maximum. See the following thread for more information on how
+            these scaling strategies behave:
+            https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an
+            experimental feature, subject to breaking API changes in future versions.
+        attention_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use a bias in the query, key, value and output projection layers during self-attention.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        embedding_multiplier (`float`, *optional*, defaults to 1.0): embedding multiplier
+        logits_scaling (`float`, *optional*, defaults to 1.0): divisor for output logits
+        residual_multiplier (`float`, *optional*, defaults to 1.0): residual multiplier
+        attention_multiplier (`float`, *optional*, defaults to 1.0): attention multiplier
+        num_local_experts (`int`, *optional*, defaults to 8): total number of experts
+        num_experts_per_tok (`int`, *optional*, defaults to 2): number of experts per token
+        output_router_logits (`bool`, *optional*, defaults to `False`):
+            Whether or not the router logits should be returned by the model. Enabling this will also
+            allow the model to output the auxiliary loss.
+        router_aux_loss_coef (`float`, *optional*, defaults to 0.001): router auxiliary loss coefficient
+
+    ```python
+    >>> from transformers import GraniteMoeModel, GraniteMoeConfig
+
+    >>> # Initializing a GraniteMoe granitemoe-3b style configuration
+    >>> configuration = GraniteMoeConfig()
+
+    >>> # Initializing a model from the granitemoe-3b style configuration
+    >>> model = GraniteMoeModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "granitemoe"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        vocab_size=32000,
+        hidden_size=4096,
+        intermediate_size=11008,
+        num_hidden_layers=32,
+        num_attention_heads=32,
+        num_key_value_heads=None,
+        hidden_act="silu",
+        max_position_embeddings=2048,
+        initializer_range=0.02,
+        rms_norm_eps=1e-6,
+        use_cache=True,
+        pad_token_id=None,
+        bos_token_id=1,
+        eos_token_id=2,
+        tie_word_embeddings=False,
+        rope_theta=10000.0,
+        rope_scaling=None,
+        attention_bias=False,
+        attention_dropout=0.0,
+        embedding_multiplier=1.0,
+        logits_scaling=1.0,
+        residual_multiplier=1.0,
+        attention_multiplier=1.0,
+        num_local_experts=8,
+        num_experts_per_tok=2,
+        output_router_logits=False,
+        router_aux_loss_coef=0.001,
+        **kwargs,
+    ):
+        # Core model dimensions and general settings
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+
+        # Attention; an unset `num_key_value_heads` falls back to `num_attention_heads` (backward compatibility)
+        self.num_attention_heads = num_attention_heads
+        self.num_key_value_heads = num_attention_heads if num_key_value_heads is None else num_key_value_heads
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+
+        # RoPE settings; this model has rope embedding type, hardcoded for BC
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self.position_embedding_type = "rope"
+
+        # Multipliers and scaling factors
+        self.embedding_multiplier = embedding_multiplier
+        self.logits_scaling = logits_scaling
+        self.residual_multiplier = residual_multiplier
+        self.attention_multiplier = attention_multiplier
+
+        # Expert and router settings
+        self.num_local_experts = num_local_experts
+        self.num_experts_per_tok = num_experts_per_tok
+        self.output_router_logits = output_router_logits
+        self.router_aux_loss_coef = router_aux_loss_coef
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+        rope_config_validation(self)
+
+
+__all__ = ["GraniteMoeConfig"]  # public names exported by this module
diff --git a/phivenv/Lib/site-packages/transformers/models/granitemoe/modeling_granitemoe.py b/phivenv/Lib/site-packages/transformers/models/granitemoe/modeling_granitemoe.py
new file mode 100644
index 0000000000000000000000000000000000000000..7f864395ccb67a005e17c1670bd66bd97697d76d
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/granitemoe/modeling_granitemoe.py
@@ -0,0 +1,1003 @@
+# coding=utf-8
+# Copyright 2024 IBM and the HuggingFace Inc. team. All rights reserved.
+#
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Callable, Optional, Union
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from ...activations import ACT2FN
+from ...cache_utils import Cache, DynamicCache
+from ...generation import GenerationMixin
+from ...modeling_attn_mask_utils import AttentionMaskConverter
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import BaseModelOutputWithPast, MoeCausalLMOutputWithPast, MoeModelOutputWithPast
+from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from ...utils import auto_docstring, is_torch_flex_attn_available, logging
+from ...utils.deprecation import deprecate_kwarg
+from .configuration_granitemoe import GraniteMoeConfig
+
+
+if is_torch_flex_attn_available():
+    from torch.nn.attention.flex_attention import BlockMask
+
+    from ...integrations.flex_attention import make_flex_block_causal_mask
+
+
+logger = logging.get_logger(__name__)
+
+
+# Copied from transformers.models.qwen2_moe.modeling_qwen2_moe.load_balancing_loss_func
+def load_balancing_loss_func(
+    gate_logits: Union[torch.Tensor, tuple[torch.Tensor], None],
+    num_experts: Optional[int] = None,
+    top_k=2,
+    attention_mask: Optional[torch.Tensor] = None,
+) -> Union[torch.Tensor, int]:
+    r"""
+    Computes auxiliary load balancing loss as in Switch Transformer - implemented in Pytorch.
+
+    See Switch Transformer (https://huggingface.co/papers/2101.03961) for more details. This function implements the loss
+    function presented in equations (4) - (6) of the paper. It aims at penalizing cases where the routing between
+    experts is too unbalanced.
+
+    Args:
+        gate_logits:
+            Logits from the `gate`, should be a tuple of model.config.num_hidden_layers tensors of
+            shape [batch_size X sequence_length, num_experts]. Anything that is not a tuple yields `0`.
+        num_experts:
+            Number of experts. When `None`, it is taken from the last dimension of the gate logits.
+        top_k:
+            The number of experts to route per-token, can be also interpreted as the `top-k` routing
+            parameter.
+        attention_mask (`torch.Tensor`, *optional*):
+            The attention_mask used in forward function
+            shape [batch_size X sequence_length] if not None.
+
+    Returns:
+        The auxiliary loss, or the integer `0` when `gate_logits` is `None` or not a tuple.
+    """
+    if gate_logits is None or not isinstance(gate_logits, tuple):
+        return 0
+
+    compute_device = gate_logits[0].device
+    concatenated_gate_logits = torch.cat([layer_gate.to(compute_device) for layer_gate in gate_logits], dim=0)
+    num_local_experts = concatenated_gate_logits.shape[1]
+    if num_experts is None:
+        # `one_hot` and the final scaling both need a concrete expert count
+        num_experts = num_local_experts
+    routing_weights = torch.nn.functional.softmax(concatenated_gate_logits, dim=-1)
+    _, selected_experts = torch.topk(routing_weights, top_k, dim=-1)
+    expert_mask = torch.nn.functional.one_hot(selected_experts, num_experts)
+
+    if attention_mask is None:
+        # Compute the percentage of tokens routed to each experts
+        tokens_per_expert = torch.mean(expert_mask.float(), dim=0)
+
+        # Compute the average probability of routing to these experts
+        router_prob_per_expert = torch.mean(routing_weights, dim=0)
+    else:
+        batch_size, sequence_length = attention_mask.shape
+        num_hidden_layers = concatenated_gate_logits.shape[0] // (batch_size * sequence_length)
+
+        # Compute the mask that masks all padding tokens as 0 with the same shape of expert_mask
+        expert_attention_mask = (
+            attention_mask[None, :, :, None, None]
+            .expand((num_hidden_layers, batch_size, sequence_length, top_k, num_experts))
+            .reshape(-1, top_k, num_experts)
+            .to(compute_device)
+        )
+
+        # Compute the percentage of tokens routed to each experts
+        tokens_per_expert = torch.sum(expert_mask.float() * expert_attention_mask, dim=0) / torch.sum(
+            expert_attention_mask, dim=0
+        )
+
+        # Compute the mask that masks all padding tokens as 0 with the same shape of tokens_per_expert
+        router_per_expert_attention_mask = (
+            attention_mask[None, :, :, None]
+            .expand((num_hidden_layers, batch_size, sequence_length, routing_weights.shape[1]))
+            .reshape(-1, routing_weights.shape[1])
+            .to(compute_device)
+        )
+
+        # Compute the average probability of routing to these experts
+        router_prob_per_expert = torch.sum(routing_weights * router_per_expert_attention_mask, dim=0) / torch.sum(
+            router_per_expert_attention_mask, dim=0
+        )
+
+    device_index = routing_weights.device.index if routing_weights.device.index is not None else 0
+    # offset by device only when the gate logits cover a shard of the experts; an unsharded gate starts at column 0
+    rank = 0 if num_experts == num_local_experts else num_local_experts * int(device_index)
+    local_tokens_per_expert = tokens_per_expert[:, rank : rank + num_local_experts]
+    overall_loss = torch.sum(local_tokens_per_expert * router_prob_per_expert.unsqueeze(0))
+    return overall_loss * num_experts
+
+
+# Copied from transformers.models.granite.modeling_granite.GraniteRMSNorm with Granite->GraniteMoe
+class GraniteMoeRMSNorm(nn.Module):
+    def __init__(self, hidden_size, eps=1e-6):
+        """
+        Root-mean-square layer norm with a learned per-channel scale (equivalent to T5LayerNorm).
+        """
+        super().__init__()
+        self.variance_epsilon = eps
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+
+    def forward(self, hidden_states):
+        # statistics are computed in float32; the result is cast back before the learned scale is applied
+        upcast = hidden_states.to(torch.float32)
+        mean_square = upcast.pow(2).mean(-1, keepdim=True)
+        normalized = upcast * torch.rsqrt(mean_square + self.variance_epsilon)
+        return self.weight * normalized.to(hidden_states.dtype)
+
+    def extra_repr(self):
+        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
+
+
+# Copied from transformers.models.granite.modeling_granite.GraniteRotaryEmbedding with Granite->GraniteMoe
+class GraniteMoeRotaryEmbedding(nn.Module):
+    inv_freq: torch.Tensor  # fix linting for `register_buffer`
+
+    def __init__(self, config: GraniteMoeConfig, device=None):
+        super().__init__()
+        self.config = config
+        self.max_seq_len_cached = config.max_position_embeddings
+        self.original_max_seq_len = config.max_position_embeddings
+
+        # BC: the RoPE variant used to be stored under "type" rather than "rope_type"
+        rope_scaling = getattr(config, "rope_scaling", None)
+        if isinstance(rope_scaling, dict):
+            self.rope_type = rope_scaling.get("rope_type", rope_scaling.get("type"))
+        else:
+            self.rope_type = "default"
+        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
+
+        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+        self.original_inv_freq = self.inv_freq
+
+    @torch.no_grad()
+    @dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
+    def forward(self, x, position_ids):
+        # frequencies are broadcast to (batch, n_freq, 1); positions (assumed 2-D) become (batch, 1, seq)
+        batch = position_ids.shape[0]
+        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(batch, -1, 1).to(x.device)
+        position_ids_expanded = position_ids[:, None, :].float()
+        force_cpu = not isinstance(x.device.type, str) or x.device.type == "mps"
+        with torch.autocast(device_type="cpu" if force_cpu else x.device.type, enabled=False):  # Force float32
+            angles = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
+            emb = torch.cat((angles, angles), dim=-1)
+            cos, sin = emb.cos() * self.attention_scaling, emb.sin() * self.attention_scaling
+        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
+
+
+# Copied from transformers.models.granite.modeling_granite.rotate_half with Granite->GraniteMoe
+def rotate_half(x):
+    """Rotates half the hidden dims of the input."""
+    half = x.shape[-1] // 2
+    front, back = x[..., :half], x[..., half:]
+    return torch.cat((-back, front), dim=-1)
+
+
+# Copied from transformers.models.granite.modeling_granite.apply_rotary_pos_emb with Granite->GraniteMoe
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
+    """Rotate the query and key tensors with the given rotary position embedding.
+
+    Args:
+        q (`torch.Tensor`): The query tensor.
+        k (`torch.Tensor`): The key tensor.
+        cos (`torch.Tensor`): The cosine part of the rotary embedding.
+        sin (`torch.Tensor`): The sine part of the rotary embedding.
+        position_ids (`torch.Tensor`, *optional*):
+            Deprecated and unused.
+        unsqueeze_dim (`int`, *optional*, defaults to 1):
+            Dimension at which a singleton axis is inserted into `cos` and `sin` so they broadcast against `q`
+            and `k`. With `cos`/`sin` of shape [batch_size, seq_len, head_dim], use 1 when `q` and `k` are laid
+            out as [batch_size, heads, seq_len, head_dim] and 2 when they are laid out as
+            [batch_size, seq_len, heads, head_dim].
+    Returns:
+        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
+    """
+    cos_b = cos.unsqueeze(unsqueeze_dim)
+    sin_b = sin.unsqueeze(unsqueeze_dim)
+
+    def _rotate(t):
+        return (t * cos_b) + (rotate_half(t) * sin_b)
+
+    return _rotate(q), _rotate(k)
+
+
+# Copied from transformers.models.jetmoe.modeling_jetmoe.JetMoeParallelExperts with JetMoe->GraniteMoe
+class GraniteMoeParallelExperts(nn.Module):
+    def __init__(self, num_experts: int, input_size: int, output_size: int) -> None:
+        """
+        Hold the linear weights of all experts in one parameter.
+
+        The weights are stored as [num_experts, output_size, input_size], a layout shared with MoE libraries such as
+        [Megablock](https://github.com/databricks/megablocks) and
+        [ScatterMoE](https://github.com/shawntan/scattermoe), as well as the
+        [MoE kernel](https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/fused_moe/fused_moe.py)
+        used in vllm. The parameter is allocated with `torch.empty`, i.e. uninitialized, here.
+
+        Args:
+            num_experts (int):
+                Number of experts.
+            input_size (int):
+                Size of the input.
+            output_size (int):
+                Size of the output.
+        """
+        super().__init__()
+        self.num_experts = num_experts
+        self.input_size = input_size
+        self.output_size = output_size
+        self.weight = nn.Parameter(torch.empty(num_experts, output_size, input_size))
+
+    def forward(self, inputs, expert_size):
+        """
+        Apply each expert's weight to its own contiguous slice of `inputs`.
+
+        Args:
+            inputs (Tensor):
+                Rows grouped by expert: the first `expert_size[0]` rows go to expert 0, and so on.
+            expert_size:
+                Per-expert row counts, used as the split sizes along dim 0.
+
+        Returns:
+            Tensor: The per-expert outputs concatenated along dim 0, in expert order.
+        """
+        per_expert_inputs = inputs.split(expert_size, dim=0)
+        per_expert_outputs = [
+            F.linear(per_expert_inputs[idx], self.weight[idx]) for idx in range(self.num_experts)
+        ]
+        return torch.cat(per_expert_outputs, dim=0)
+
+
+# Copied from transformers.models.jetmoe.modeling_jetmoe.JetMoeTopKGating with JetMoe->GraniteMoe
+class GraniteMoeTopKGating(nn.Module):
+    def __init__(self, input_size: int, num_experts: int, top_k: int):
+        """
+        Linear router that picks the `top_k` highest-scoring experts for every token.
+
+        Args:
+            input_size (`int`):
+                Size of the input.
+            num_experts (`int`):
+                Number of experts.
+            top_k (`int`):
+                Number of top experts to select.
+        """
+        super().__init__()
+        self.top_k = top_k
+        self.input_size = input_size
+        self.num_experts = num_experts
+        self.layer = nn.Linear(input_size, num_experts, bias=False)
+
+    def forward(self, hidden_states):
+        """
+        Route tokens to experts.
+
+        Returns `(index_sorted_experts, batch_index, batch_gates, expert_size, logits)`: the permutation that groups
+        the flattened (token, slot) assignments by expert, the source token of each grouped entry, its gate weight,
+        the per-expert entry counts as a Python list, and the float32 router logits.
+        """
+        # router scores, softmax-normalized over the selected experts only
+        logits = self.layer(hidden_states).float()  # [num_tokens, num_experts]
+        top_k_logits, top_k_indices = logits.topk(self.top_k, dim=1)  # [num_tokens, top_k]
+        top_k_gates = torch.softmax(top_k_logits, dim=1).type_as(hidden_states)  # [num_tokens, top_k]
+
+        # count the entries assigned to each expert; the `.tolist()` below is what makes torch.compile fail
+        # (`torch._dynamo.exc.Unsupported` fake tensor exception and `DataDependentOutputException`)
+        assignment = torch.zeros(
+            [top_k_gates.size(0), self.num_experts], dtype=top_k_gates.dtype, device=top_k_gates.device
+        ).scatter(1, top_k_indices, 1)  # [num_tokens, num_experts]
+        expert_size = assignment.long().sum(0).tolist()  # length num_experts
+
+        # group the flattened (token, slot) pairs by expert and recover each pair's token index
+        _, index_sorted_experts = top_k_indices.flatten().sort(0)  # [num_tokens * top_k]
+        batch_index = index_sorted_experts.div(self.top_k, rounding_mode="trunc")  # [num_tokens * top_k]
+
+        # gate weights reordered to match the grouped entries
+        batch_gates = top_k_gates.flatten()[index_sorted_experts]  # [num_tokens * top_k]
+        return index_sorted_experts, batch_index, batch_gates, expert_size, logits
+
+
+class GraniteMoeMoE(nn.Module):
+    """
+    Sparsely gated mixture-of-experts feed-forward layer with a top-k router.
+
+    Args:
+        config:
+            Configuration object with model hyperparameters.
+    """
+
+    def __init__(self, config: GraniteMoeConfig):
+        super().__init__()
+
+        self.input_size = config.hidden_size
+        self.hidden_size = config.intermediate_size
+        self.activation = ACT2FN[config.hidden_act]
+        num_experts = config.num_local_experts
+        # the input projection is twice as wide: its output is later split into an activated half and a plain half
+        self.input_linear = GraniteMoeParallelExperts(num_experts, self.input_size, self.hidden_size * 2)
+        self.output_linear = GraniteMoeParallelExperts(num_experts, self.hidden_size, self.input_size)
+        self.router = GraniteMoeTopKGating(
+            input_size=self.input_size, num_experts=num_experts, top_k=config.num_experts_per_tok
+        )
+
+    def forward(self, layer_input):
+        """
+        Forward pass of the mixture of experts layer.
+
+        Args:
+            layer_input (Tensor):
+                Input tensor of shape [batch_size, seq_len, emb_size].
+
+        Returns:
+            Tensor:
+                Output tensor of shape [batch_size, seq_len, config.hidden_size].
+            Tensor:
+                Router logits.
+        """
+        bsz, length, emb_size = layer_input.size()
+        flat_input = layer_input.reshape(-1, emb_size)
+        _, batch_index, batch_gates, expert_size, router_logits = self.router(flat_input)
+
+        # one row per (token, expert) assignment, in the grouped order produced by the router
+        projected = self.input_linear(flat_input[batch_index], expert_size)
+        halves = projected.chunk(2, dim=-1)
+        gated = self.activation(halves[0]) * halves[1]
+        expert_outputs = self.output_linear(gated, expert_size)
+        expert_outputs = expert_outputs * batch_gates[:, None]
+
+        # accumulate the gate-weighted expert outputs back onto their source tokens
+        combined = torch.zeros(
+            (bsz * length, self.input_size), dtype=expert_outputs.dtype, device=expert_outputs.device
+        ).index_add(0, batch_index, expert_outputs)
+        combined = combined.view(bsz, length, self.input_size)
+        return combined, router_logits
+
+
+# Copied from transformers.models.granite.modeling_granite.repeat_kv with Granite->GraniteMoe
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """
+    Duplicate every key/value head `n_rep` times, like torch.repeat_interleave(x, dim=1, repeats=n_rep): (batch,
+    num_key_value_heads, seqlen, head_dim) becomes (batch, num_key_value_heads * n_rep, seqlen, head_dim).
+    """
+    batch, kv_heads, seq_len, head_dim = hidden_states.shape
+    if n_rep == 1:
+        return hidden_states
+    tiled = hidden_states.unsqueeze(2).expand(batch, kv_heads, n_rep, seq_len, head_dim)
+    return tiled.reshape(batch, kv_heads * n_rep, seq_len, head_dim)
+
+
+# copied from transformers.models.granite.modeling_granite.GraniteAttention with Granite->GraniteMoe
+# no longer copied after attention refactors
+class GraniteMoeAttention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+    def __init__(self, config: GraniteMoeConfig, layer_idx: Optional[int] = None):
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+        if layer_idx is None:
+            logger.warning_once(
+                f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
+                "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
+                "when creating this class."
+            )
+
+        self.attention_dropout = config.attention_dropout
+        self.hidden_size = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = self.hidden_size // self.num_heads
+        self.num_key_value_heads = config.num_key_value_heads
+        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
+        self.is_causal = True
+        # handed to the attention function as its `scaling` argument
+        self.scaling = config.attention_multiplier
+
+        if (self.head_dim * self.num_heads) != self.hidden_size:
+            raise ValueError(
+                f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
+                f" and `num_heads`: {self.num_heads})."
+            )
+
+        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias)
+        self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
+        self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
+        self.o_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=config.attention_bias)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        use_cache: bool = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,  # None or rope embeddings
+        **kwargs,
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        """
+        Project the hidden states to queries/keys/values, apply RoPE when `position_embeddings` is given, update the
+        cache when one is passed, and run the attention function selected by `config._attn_implementation`.
+        Returns `(attn_output, attn_weights)`; `position_ids` and `use_cache` are accepted but not read here.
+        """
+        bsz, q_len, _ = hidden_states.size()
+        kv_shape = (bsz, q_len, self.num_key_value_heads, self.head_dim)
+        # split the projected channels into heads -> (bsz, heads, q_len, head_dim)
+        query_states = self.q_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = self.k_proj(hidden_states).view(kv_shape).transpose(1, 2)
+        value_states = self.v_proj(hidden_states).view(kv_shape).transpose(1, 2)
+
+        cos, sin = position_embeddings if position_embeddings is not None else (None, None)
+        if position_embeddings is not None:
+            query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+
+        if past_key_values is not None:
+            # sin and cos are specific to RoPE models; cache_position needed for the static cache
+            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        attention_interface: Callable = eager_attention_forward
+        if self.config._attn_implementation != "eager":
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            dropout=0.0 if not self.training else self.attention_dropout,
+            scaling=self.scaling,
+            **kwargs,
+        )
+
+        attn_output = self.o_proj(attn_output.view(bsz, q_len, -1))
+        return attn_output, attn_weights
+
+
+def eager_attention_forward(
+    module: nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: Optional[torch.Tensor],
+    scaling: float,
+    dropout: float = 0.0,
+    **kwargs,
+):
+    # expand grouped key/value heads so they line up one-to-one with the query heads
+    n_rep = module.num_key_value_groups
+    key_states, value_states = repeat_kv(key, n_rep), repeat_kv(value, n_rep)
+
+    scores = torch.matmul(query, key_states.transpose(2, 3)) * scaling
+    if attention_mask is not None:
+        # additive mask, cropped along its last axis to the key length
+        scores = scores + attention_mask[:, :, :, : key_states.shape[-2]]
+
+    # softmax is computed in fp32 and cast back to the query dtype
+    attn_weights = nn.functional.softmax(scores, dim=-1, dtype=torch.float32).to(query.dtype)
+    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
+    attn_output = torch.matmul(attn_weights, value_states).transpose(1, 2).contiguous()
+
+    return attn_output, attn_weights
+
+
+class GraniteMoeDecoderLayer(GradientCheckpointingLayer):
+    def __init__(self, config: GraniteMoeConfig, layer_idx: int):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.residual_multiplier = config.residual_multiplier
+
+        self.self_attn = GraniteMoeAttention(config=config, layer_idx=layer_idx)
+        # NOTE(review): the MoE block only exists when experts are configured, yet `forward` always calls it
+        if config.num_local_experts > 0:
+            self.block_sparse_moe = GraniteMoeMoE(config)
+        self.input_layernorm = GraniteMoeRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = GraniteMoeRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        output_attentions: Optional[bool] = False,
+        use_cache: Optional[bool] = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        output_router_logits: Optional[bool] = False,
+        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
+        **kwargs,
+    ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
+        """
+        Pre-norm attention block followed by a pre-norm mixture-of-experts block, each added back to its input
+        after scaling by `residual_multiplier`.
+
+        Args:
+            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+            attention_mask (`torch.FloatTensor`, *optional*):
+                attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
+                query_sequence_length, key_sequence_length)` if default attention is used.
+            position_ids (`torch.LongTensor`, *optional*): forwarded to the attention module.
+            past_key_values (`Cache`, *optional*): cache object forwarded to the attention module
+            output_attentions (`bool`, *optional*):
+                Whether or not to append the attention weights to the returned tuple.
+            use_cache (`bool`, *optional*): forwarded to the attention module.
+            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
+                Indices depicting the position of the input sequence tokens in the sequence
+            output_router_logits (`bool`, *optional*):
+                Whether or not to append this layer's router logits to the returned tuple. They are useful for
+                computing the router loss, and should not be returned during inference.
+            position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
+                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
+                with `head_dim` being the embedding dimension of each attention head.
+            kwargs (`dict`, *optional*):
+                Arbitrary kwargs forwarded to the attention module, used for FSDP and other methods that inject
+                code into the model
+
+        Returns:
+            `tuple`: `(hidden_states,)`, followed by the attention weights when `output_attentions` is set and by
+            the router logits when `output_router_logits` is set.
+        """
+        # Self Attention: normalize first, keep the un-normalized input as the residual
+        attn_input = self.input_layernorm(hidden_states)
+        attn_output, self_attn_weights = self.self_attn(
+            hidden_states=attn_input,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            output_attentions=output_attentions,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            position_embeddings=position_embeddings,
+            **kwargs,
+        )
+        hidden_states = hidden_states + attn_output * self.residual_multiplier
+
+        # Mixture of experts: same pre-norm / scaled-residual pattern
+        moe_input = self.post_attention_layernorm(hidden_states)
+        moe_output, router_logits = self.block_sparse_moe(moe_input)
+        hidden_states = hidden_states + moe_output * self.residual_multiplier
+
+        # optional extras are appended in a fixed order: attention weights first, router logits last
+        outputs = (hidden_states,)
+
+        if output_attentions:
+            outputs = outputs + (self_attn_weights,)
+
+        if output_router_logits:
+            outputs = outputs + (router_logits,)
+
+        return outputs
+
+
+@auto_docstring
+class GraniteMoePreTrainedModel(PreTrainedModel):
+    config: GraniteMoeConfig
+    base_model_prefix = "model"
+    _no_split_modules = ["GraniteMoeDecoderLayer"]
+    _skip_keys_device_placement = ["past_key_values"]
+    supports_gradient_checkpointing = True
+    _supports_flash_attn = True
+    _supports_sdpa = True
+    _can_compile_fullgraph = False  # MoE models don't work with torch.compile (`torch.where(condition)` not supported)
+
+    def _init_weights(self, module):
+        super()._init_weights(module)  # base initialization runs first
+        if not isinstance(module, GraniteMoeParallelExperts):
+            return
+        module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+
+
+@auto_docstring
+class GraniteMoeModel(GraniteMoePreTrainedModel):
+    def __init__(self, config: GraniteMoeConfig):
+        super().__init__(config)
+        # plain hyperparameters mirrored from the config
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+        self.hidden_size = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = self.hidden_size // self.num_heads
+        self.max_position_embeddings = config.max_position_embeddings
+        self.rope_theta = config.rope_theta
+        self.embedding_multiplier = config.embedding_multiplier
+        self.position_embedding_type = config.position_embedding_type
+        self.gradient_checkpointing = False
+
+        # submodules: token embeddings, decoder stack, final norm, rotary embedding (only for "rope")
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+        decoder_layers = [GraniteMoeDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+        self.layers = nn.ModuleList(decoder_layers)
+        self.norm = GraniteMoeRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        use_rope = self.position_embedding_type == "rope"
+        self.rotary_emb = GraniteMoeRotaryEmbedding(config) if use_rope else None
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Union[Cache, list[torch.FloatTensor]]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        output_router_logits: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
+    ) -> Union[tuple, MoeModelOutputWithPast]:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        output_router_logits = (
+            output_router_logits if output_router_logits is not None else self.config.output_router_logits
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+        if self.gradient_checkpointing and self.training and use_cache:
+            logger.warning_once(
+                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
+            )
+            use_cache = False
+
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+        inputs_embeds = inputs_embeds * self.embedding_multiplier
+
+        # TODO (joao): remove this exception in v4.56 -- it exists for users that try to pass a legacy cache
+        if not isinstance(past_key_values, (type(None), Cache)):
+            raise ValueError("The `past_key_values` should be either a `Cache` object or `None`.")
+
+        if use_cache and past_key_values is None:
+            past_key_values = DynamicCache(config=self.config)
+        # default positions continue from the number of tokens the cache already holds
+        if cache_position is None:
+            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+            cache_position = torch.arange(
+                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
+            )
+        if position_ids is None:
+            position_ids = cache_position.unsqueeze(0)
+
+        causal_mask = self._update_causal_mask(
+            attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
+        )
+
+        # embed positions
+        hidden_states = inputs_embeds
+        position_embeddings = None
+        # create position embeddings to be shared across the decoder layers
+        if self.rotary_emb is not None:
+            position_embeddings = self.rotary_emb(hidden_states, position_ids)
+
+        # decoder layers
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+        all_router_logits = () if output_router_logits else None
+
+        for decoder_layer in self.layers:
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+
+            layer_outputs = decoder_layer(
+                hidden_states,
+                attention_mask=causal_mask,
+                position_ids=position_ids,
+                past_key_values=past_key_values,
+                output_attentions=output_attentions,
+                use_cache=use_cache,
+                cache_position=cache_position,
+                output_router_logits=output_router_logits,
+                position_embeddings=position_embeddings,
+            )
+
+            hidden_states = layer_outputs[0]
+
+            if output_attentions:
+                all_self_attns += (layer_outputs[1],)
+
+            if output_router_logits:
+                all_router_logits += (layer_outputs[-1],)
+
+        hidden_states = self.norm(hidden_states)
+        # add hidden states from the last decoder layer
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+
+        if not return_dict:
+            # same field order as the `MoeModelOutputWithPast` built below, so router logits come last when requested
+            candidates = [hidden_states, past_key_values, all_hidden_states, all_self_attns, all_router_logits]
+            return tuple(v for v in candidates if v is not None)
+        return MoeModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+            router_logits=all_router_logits,
+        )
+
+    # Copied from transformers.models.gptj.modeling_gptj.GPTJModel._update_causal_mask
+    def _update_causal_mask(
+        self,
+        attention_mask: Union[torch.Tensor, "BlockMask"],
+        input_tensor: torch.Tensor,
+        cache_position: torch.Tensor,
+        past_key_values: Cache,
+        output_attentions: bool = False,
+    ):  # Builds the mask handed to the decoder layers; its form depends on `config._attn_implementation`.
+        if self.config._attn_implementation == "flash_attention_2":
+            if attention_mask is not None and (attention_mask == 0.0).any():  # mask has at least one 0.0 entry
+                return attention_mask
+            return None
+        if self.config._attn_implementation == "flex_attention":
+            if isinstance(attention_mask, torch.Tensor):  # anything else (incl. None) is returned as received
+                attention_mask = make_flex_block_causal_mask(attention_mask)
+            return attention_mask
+
+        # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
+        # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
+        # to infer the attention mask.
+        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+        using_compilable_cache = past_key_values.is_compileable if past_key_values is not None else False
+
+        # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
+        if self.config._attn_implementation == "sdpa" and not using_compilable_cache and not output_attentions:
+            if AttentionMaskConverter._ignore_causal_mask_sdpa(
+                attention_mask,
+                inputs_embeds=input_tensor,
+                past_key_values_length=past_seen_tokens,
+                is_training=self.training,
+            ):
+                return None
+
+        dtype = input_tensor.dtype
+        sequence_length = input_tensor.shape[1]
+        if using_compilable_cache:  # the mask then spans the cache's maximum shape
+            target_length = past_key_values.get_max_cache_shape()
+        else:
+            target_length = (
+                attention_mask.shape[-1]
+                if isinstance(attention_mask, torch.Tensor)
+                else past_seen_tokens + sequence_length + 1
+            )
+
+        # In case the provided `attention_mask` is 2D, we generate a causal mask here (4D).
+        causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
+            attention_mask,
+            sequence_length=sequence_length,
+            target_length=target_length,
+            dtype=dtype,
+            cache_position=cache_position,
+            batch_size=input_tensor.shape[0],
+        )
+
+        if (
+            self.config._attn_implementation == "sdpa"
+            and attention_mask is not None
+            and attention_mask.device.type in ["cuda", "xpu", "npu"]
+            and not output_attentions
+        ):
+            # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
+            # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
+            # Details: https://github.com/pytorch/pytorch/issues/110213
+            min_dtype = torch.finfo(dtype).min
+            causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)
+
+        return causal_mask
+
+    @staticmethod
+    # Copied from transformers.models.gptj.modeling_gptj.GPTJModel._prepare_4d_causal_attention_mask_with_cache_position
+    def _prepare_4d_causal_attention_mask_with_cache_position(
+        attention_mask: torch.Tensor,
+        sequence_length: int,
+        target_length: int,
+        dtype: torch.dtype,
+        cache_position: torch.Tensor,
+        batch_size: int,
+        **kwargs,  # accepted but never read in this body
+    ):
+        """
+        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
+        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.
+
+        Args:
+            attention_mask (`torch.Tensor`):
+                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
+                `(batch_size, 1, query_length, key_value_length)`. May also be `None` (purely causal mask).
+            sequence_length (`int`):
+                The sequence length being processed.
+            target_length (`int`):
+                The target length: when generating with static cache, the mask should be as long as the static cache,
+                to account for the 0 padding, the part of the cache that is not filled yet.
+            dtype (`torch.dtype`):
+                The dtype to use for the 4D attention mask.
+            cache_position (`torch.Tensor`):
+                Indices depicting the position of the input sequence tokens in the sequence.
+            batch_size (`int`):
+                Batch size; the generated mask is expanded to this size along its first dimension.
+        """
+        if attention_mask is not None and attention_mask.dim() == 4:
+            # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
+            causal_mask = attention_mask
+        else:
+            min_dtype = torch.finfo(dtype).min  # most negative finite value of `dtype`, used as the "masked" fill
+            causal_mask = torch.full(
+                (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device
+            )
+            if sequence_length != 1:
+                causal_mask = torch.triu(causal_mask, diagonal=1)  # zero the main diagonal and everything below it
+            causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1)  # keep the fill only where key index > query position
+            causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
+            if attention_mask is not None:
+                causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
+                mask_length = attention_mask.shape[-1]
+                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
+                    causal_mask.device
+                )
+                padding_mask = padding_mask == 0  # True where the causal entry and the 2D mask entry sum to 0
+                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
+                    padding_mask, min_dtype
+                )
+
+        return causal_mask
+
+
+class GraniteMoeForCausalLM(GraniteMoePreTrainedModel, GenerationMixin):  # `GraniteMoeModel` backbone + bias-free linear LM head
+    _tied_weights_keys = ["lm_head.weight"]
+
+    def __init__(self, config: GraniteMoeConfig):
+        super().__init__(config)
+        self.model = GraniteMoeModel(config)
+        self.vocab_size = config.vocab_size
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+        self.router_aux_loss_coef = config.router_aux_loss_coef  # weight of the aux loss when added to `loss`
+        self.num_experts = config.num_local_experts
+        self.num_experts_per_tok = config.num_experts_per_tok
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Union[Cache, list[torch.FloatTensor]]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        output_router_logits: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
+        **kwargs,
+    ) -> Union[tuple, MoeCausalLMOutputWithPast]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+        Example:
+
+        ```python
+        >>> from transformers import AutoTokenizer, GraniteMoeForCausalLM
+
+        >>> model = GraniteMoeForCausalLM.from_pretrained("ibm/PowerMoE-3b")
+        >>> tokenizer = AutoTokenizer.from_pretrained("ibm/PowerMoE-3b")
+
+        >>> prompt = "Hey, are you conscious? Can you talk to me?"
+        >>> inputs = tokenizer(prompt, return_tensors="pt")
+
+        >>> # Generate
+        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
+        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
+        ```"""
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_router_logits = (
+            output_router_logits if output_router_logits is not None else self.config.output_router_logits
+        )
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # decoder outputs consist of (dec_features, layer_state, dec_hidden, dec_attn)
+        outputs = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            output_router_logits=output_router_logits,
+            return_dict=return_dict,
+            cache_position=cache_position,
+            **kwargs,
+        )
+
+        # Only compute necessary logits (an int `logits_to_keep` of 0 gives `slice(0, None)`, i.e. every position)
+        hidden_states = outputs[0]
+        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+        logits = self.lm_head(hidden_states[:, slice_indices, :])
+        logits = logits / self.config.logits_scaling  # `logits_scaling` acts as a divisor
+
+        loss = None
+        if labels is not None:
+            # Upcast to float if we need to compute the loss to avoid potential precision issues
+            logits = logits.float()
+            # `self.loss_function` gets the (possibly sliced) logits and the labels; extra `kwargs` are forwarded
+            loss = self.loss_function(
+                logits,
+                labels,
+                vocab_size=self.config.vocab_size,
+                **kwargs,
+            )
+
+        aux_loss = None
+        if output_router_logits:
+            aux_loss = load_balancing_loss_func(
+                outputs.router_logits if return_dict else outputs[-1],  # NOTE(review): confirm the tuple output really ends with router logits
+                self.num_experts,
+                self.num_experts_per_tok,
+                attention_mask,
+            )
+            if labels is not None:
+                loss += self.router_aux_loss_coef * aux_loss.to(loss.device)  # make sure to reside in the same device
+
+        if not return_dict:  # tuple layout: (loss?, aux_loss?, logits, *outputs[1:])
+            output = (logits,) + outputs[1:]
+            if output_router_logits:
+                output = (aux_loss,) + output
+            return (loss,) + output if loss is not None else output
+
+        return MoeCausalLMOutputWithPast(
+            loss=loss,
+            aux_loss=aux_loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            router_logits=outputs.router_logits,
+        )
+
+
+__all__ = ["GraniteMoeForCausalLM", "GraniteMoeModel", "GraniteMoePreTrainedModel"]
diff --git a/phivenv/Lib/site-packages/transformers/models/granitemoehybrid/__init__.py b/phivenv/Lib/site-packages/transformers/models/granitemoehybrid/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..7b4db962865ee962ab366b4f2e683002e120b2b8
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/granitemoehybrid/__init__.py
@@ -0,0 +1,29 @@
+# coding=utf-8
+# Copyright 2025 IBM and the HuggingFace Inc. team. All rights reserved.
+#
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:  # only static analysis follows these star-imports
+    from .configuration_granitemoehybrid import *
+    from .modeling_granitemoehybrid import *
+else:  # at runtime this package's `sys.modules` entry is replaced by a `_LazyModule`
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/phivenv/Lib/site-packages/transformers/models/granitemoehybrid/__pycache__/__init__.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/granitemoehybrid/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0c87c9c5c28340eb450bfb14a75183189c68bf53
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/granitemoehybrid/__pycache__/__init__.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/granitemoehybrid/__pycache__/configuration_granitemoehybrid.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/granitemoehybrid/__pycache__/configuration_granitemoehybrid.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e0d06b2f9d6bd7086993ddf4da5d0bd4c6a4ad0e
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/granitemoehybrid/__pycache__/configuration_granitemoehybrid.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/granitemoehybrid/__pycache__/modeling_granitemoehybrid.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/granitemoehybrid/__pycache__/modeling_granitemoehybrid.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4129b0ceed884d035d57442985d949065ee3c569
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/granitemoehybrid/__pycache__/modeling_granitemoehybrid.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/granitemoehybrid/__pycache__/modular_granitemoehybrid.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/granitemoehybrid/__pycache__/modular_granitemoehybrid.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0ea9319341cb6e940c3c0cfc81911d73abcefb1f
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/granitemoehybrid/__pycache__/modular_granitemoehybrid.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/granitemoehybrid/configuration_granitemoehybrid.py b/phivenv/Lib/site-packages/transformers/models/granitemoehybrid/configuration_granitemoehybrid.py
new file mode 100644
index 0000000000000000000000000000000000000000..9e779591046ea786a899103e4ce3dcd7dc41d424
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/granitemoehybrid/configuration_granitemoehybrid.py
@@ -0,0 +1,256 @@
+# coding=utf-8
+# Copyright 2025 IBM and the HuggingFace Inc. team. All rights reserved.
+#
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""GraniteMoeHybrid model configuration"""
+
+from ...configuration_utils import PretrainedConfig
+from ...modeling_rope_utils import rope_config_validation
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class GraniteMoeHybridConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`GraniteMoeHybridModel`]. It is used to
+    instantiate a GraniteMoeHybrid model according to the specified arguments, defining the model architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 32000):
+            Vocabulary size of the GraniteMoeHybrid model. Defines the number of different tokens that
+            can be represented by the `inputs_ids` passed when calling [`GraniteMoeHybridModel`]
+        hidden_size (`int`, *optional*, defaults to 4096):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 11008):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 32):
+            Number of hidden layers in the Transformer decoder.
+        num_attention_heads (`int`, *optional*, defaults to 32):
+            Number of attention heads for each attention layer in the Transformer decoder.
+        num_key_value_heads (`int`, *optional*):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details, check out [this
+            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
+            `num_attention_heads`.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the decoder.
+        max_position_embeddings (`int`, *optional*, defaults to 2048):
+            The maximum sequence length that this model might ever be used with.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models).
+            Only relevant if `config.is_decoder=True`.
+        pad_token_id (`int`, *optional*):
+            Padding token id.
+        bos_token_id (`int`, *optional*, defaults to 1):
+            Beginning of stream token id.
+        eos_token_id (`int`, *optional*, defaults to 2):
+            End of stream token id.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether to tie weight embeddings
+        rope_theta (`float`, *optional*, defaults to 10000.0):
+            The base period of the RoPE embeddings.
+        rope_scaling (`Dict`, *optional*):
+            Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
+            strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is
+            `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
+            `max_position_embeddings` to the expected new maximum. See the following thread for more information on how
+            these scaling strategies behave:
+            https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an
+            experimental feature, subject to breaking API changes in future versions.
+        attention_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use a bias in the query, key, value and output projection layers during self-attention.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        embedding_multiplier (`float`, *optional*, defaults to 1.0): embedding multiplier.
+        logits_scaling (`float`, *optional*, defaults to 1.0): divisor for output logits.
+        residual_multiplier (`float`, *optional*, defaults to 1.0): residual multiplier.
+        attention_multiplier (`float`, *optional*, defaults to 1.0): attention multiplier.
+        num_local_experts (`int`, *optional*, defaults to 8): total number of experts.
+        num_experts_per_tok (`int`, *optional*, defaults to 2): number of experts per token.
+        output_router_logits (`bool`, *optional*, defaults to `False`):
+            Whether or not the router logits should be returned by the model. Enabling this will also
+            allow the model to output the auxiliary loss.
+        router_aux_loss_coef (`float`, *optional*, defaults to 0.001): router auxiliary loss coefficient
+        shared_intermediate_size (`int`, *optional*, defaults to 1024): intermediate size for shared experts.
+        position_embedding_type (`str`, *optional*): Positional embedding
+            type to be used; defaults to None. Allowed options: `[None, "rope"]`
+        layer_types (`List`, *optional*): list of strings to be used as layer types.
+            Allowed choices: "mamba", "attention".
+        mamba_n_heads (`int`, *optional*, defaults to 128):
+            The number of mamba heads used.
+        mamba_n_groups (`int`, *optional*, defaults to 1):
+            The number of the mamba groups used.
+        mamba_d_state (`int`, *optional*, defaults to 256):
+            The dimension of the mamba latent state space.
+        mamba_d_head (`int` or `str`, *optional*, defaults to `"auto"`):
+            Head embedding dimension size; `"auto"` resolves to `mamba_expand * hidden_size // mamba_n_heads`.
+        mamba_d_conv (`int`, *optional*, defaults to 4):
+            The size of the mamba convolution kernel.
+        mamba_expand (`int`, *optional*, defaults to 2):
+            Expanding factor (relative to hidden_size) used to determine the mamba intermediate size.
+        mamba_chunk_size (`int`, *optional*, defaults to 256):
+            The chunks in which to break the sequence when doing prefill/training.
+        mamba_conv_bias (`bool`, *optional*, defaults to `True`):
+            Flag indicating whether or not to use bias in the convolution layer of the mamba mixer block.
+        mamba_proj_bias (`bool`, *optional*, defaults to `False`):
+            Flag indicating whether or not to use bias in the input and output projections (["in_proj", "out_proj"])
+            of the mamba mixer block.
+    ```python
+    >>> from transformers import GraniteMoeHybridModel, GraniteMoeHybridConfig
+
+    >>> # Initializing a GraniteMoeHybrid config and a model from that config
+    >>> configuration = GraniteMoeHybridConfig()
+    >>> model = GraniteMoeHybridModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "granitemoehybrid"
+    attribute_map = {
+        "layers_block_type": "layer_types",
+    }
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        vocab_size=32000,
+        hidden_size=4096,
+        intermediate_size=11008,
+        num_hidden_layers=32,
+        num_attention_heads=32,
+        num_key_value_heads=None,
+        hidden_act="silu",
+        max_position_embeddings=2048,
+        initializer_range=0.02,
+        rms_norm_eps=1e-6,
+        use_cache=True,
+        pad_token_id=None,
+        bos_token_id=1,
+        eos_token_id=2,
+        tie_word_embeddings=False,
+        rope_theta=10000.0,
+        rope_scaling=None,
+        attention_bias=False,
+        attention_dropout=0.0,
+        embedding_multiplier=1.0,
+        logits_scaling=1.0,
+        residual_multiplier=1.0,
+        attention_multiplier=1.0,
+        num_local_experts=8,
+        num_experts_per_tok=2,
+        output_router_logits=False,
+        router_aux_loss_coef=0.001,
+        shared_intermediate_size=1024,
+        position_embedding_type=None,
+        layer_types=None,
+        mamba_n_heads=128,
+        mamba_n_groups=1,
+        mamba_d_state=256,
+        mamba_d_head="auto",
+        mamba_d_conv=4,
+        mamba_expand=2,
+        mamba_chunk_size=256,
+        mamba_conv_bias=True,
+        mamba_proj_bias=False,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+
+        # for backward compatibility: an unset `num_key_value_heads` falls back to `num_attention_heads`
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self.attention_bias = attention_bias
+        self.embedding_multiplier = embedding_multiplier
+        self.logits_scaling = logits_scaling
+        self.residual_multiplier = residual_multiplier
+        self.attention_multiplier = attention_multiplier
+        self.attention_dropout = attention_dropout
+        self.num_local_experts = num_local_experts
+        self.num_experts_per_tok = num_experts_per_tok
+        self.output_router_logits = output_router_logits
+        self.router_aux_loss_coef = router_aux_loss_coef
+        self.shared_intermediate_size = shared_intermediate_size
+        self.position_embedding_type = position_embedding_type
+
+        mamba_intermediate = mamba_expand * hidden_size  # reference size for the Mamba head checks below
+
+        if layer_types is not None and any(layer_type not in ["mamba", "attention"] for layer_type in layer_types):
+            raise ValueError("layer_types must be a list strings in  [`mamba` `attention`]")
+
+        if mamba_intermediate % mamba_n_heads != 0:
+            raise ValueError("mamba_n_heads must divide mamba_expand * hidden_size")
+
+        # for the mamba_v2, mamba_d_head * mamba_n_heads must equal mamba_expand * hidden_size
+        if mamba_d_head == "auto":
+            mamba_d_head = mamba_intermediate // mamba_n_heads
+
+        if mamba_d_head * mamba_n_heads != mamba_intermediate:  # checked against mamba_expand * hidden_size
+            raise ValueError("The dimensions for the Mamba head state do not match the model intermediate_size")
+
+        self.mamba_n_heads = mamba_n_heads
+        self.mamba_d_head = mamba_d_head
+        self.mamba_n_groups = mamba_n_groups
+        self.mamba_d_state = mamba_d_state
+        self.mamba_d_conv = mamba_d_conv
+        self.mamba_chunk_size = mamba_chunk_size
+        self.mamba_conv_bias = mamba_conv_bias
+        self.mamba_proj_bias = mamba_proj_bias
+        self.mamba_expand = mamba_expand
+        self.layer_types = layer_types
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+        if self.position_embedding_type == "rope":  # RoPE settings are validated only when RoPE is selected
+            rope_config_validation(self)
+
+    # overwrite the function to use in `HybridMambaAttentionDynamicCache`; NOTE(review): `attribute_map` also aliases this name
+    @property
+    def layers_block_type(self):
+        return self.layer_types if self.layer_types else ["mamba"] * self.num_hidden_layers
+
+
+__all__ = ["GraniteMoeHybridConfig"]
diff --git a/phivenv/Lib/site-packages/transformers/models/granitemoehybrid/modeling_granitemoehybrid.py b/phivenv/Lib/site-packages/transformers/models/granitemoehybrid/modeling_granitemoehybrid.py
new file mode 100644
index 0000000000000000000000000000000000000000..fbf2baedf809aa5f55e44b03dc027aad00e7a9f7
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/granitemoehybrid/modeling_granitemoehybrid.py
@@ -0,0 +1,1837 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/granitemoehybrid/modular_granitemoehybrid.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_granitemoehybrid.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# coding=utf-8
+# Copyright 2025 IBM and the HuggingFace Inc. team. All rights reserved.
+#
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Any, Callable, Optional, TypedDict, Union
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from transformers.activations import ACT2FN
+
+from ...cache_utils import Cache
+from ...generation import GenerationMixin
+from ...modeling_attn_mask_utils import AttentionMaskConverter
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import BaseModelOutputWithPast, MoeCausalLMOutputWithPast, MoeModelOutputWithPast
+from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from ...processing_utils import Unpack
+from ...utils import auto_docstring, can_return_tuple, is_torch_flex_attn_available, logging
+from ...utils.deprecation import deprecate_kwarg
+from ...utils.import_utils import is_causal_conv1d_available, is_mamba_2_ssm_available
+from .configuration_granitemoehybrid import GraniteMoeHybridConfig
+
+
+if is_mamba_2_ssm_available():
+    from mamba_ssm.ops.triton.selective_state_update import selective_state_update
+    from mamba_ssm.ops.triton.ssd_combined import mamba_chunk_scan_combined, mamba_split_conv1d_scan_combined
+else:
+    selective_state_update = None
+
+if is_causal_conv1d_available():
+    from causal_conv1d import causal_conv1d_fn, causal_conv1d_update
+else:
+    causal_conv1d_update, causal_conv1d_fn = None, None
+
+
+if is_torch_flex_attn_available():
+    from torch.nn.attention.flex_attention import BlockMask
+
+    from ...integrations.flex_attention import make_flex_block_causal_mask
+
+
+logger = logging.get_logger(__name__)
+
+
+def rotate_half(x):
+    """Rotates half the hidden dims of the input."""
+    x1 = x[..., : x.shape[-1] // 2]
+    x2 = x[..., x.shape[-1] // 2 :]
+    return torch.cat((-x2, x1), dim=-1)
+
+
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
+    """Applies Rotary Position Embedding to the query and key tensors.
+
+    Args:
+        q (`torch.Tensor`): The query tensor.
+        k (`torch.Tensor`): The key tensor.
+        cos (`torch.Tensor`): The cosine part of the rotary embedding.
+        sin (`torch.Tensor`): The sine part of the rotary embedding.
+        position_ids (`torch.Tensor`, *optional*):
+            Deprecated and unused.
+        unsqueeze_dim (`int`, *optional*, defaults to 1):
+            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
+            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
+            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
+            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
+            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
+            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
+    Returns:
+        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
+    """
+    cos = cos.unsqueeze(unsqueeze_dim)
+    sin = sin.unsqueeze(unsqueeze_dim)
+    q_embed = (q * cos) + (rotate_half(q) * sin)
+    k_embed = (k * cos) + (rotate_half(k) * sin)
+    return q_embed, k_embed
+
+
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """
+    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
+    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
+    """
+    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+    if n_rep == 1:
+        return hidden_states
+    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
+    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
+
+
+def eager_attention_forward(
+    module: nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: Optional[torch.Tensor],
+    scaling: float,
+    dropout: float = 0.0,
+    **kwargs,
+):
+    key_states = repeat_kv(key, module.num_key_value_groups)
+    value_states = repeat_kv(value, module.num_key_value_groups)
+
+    attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
+    if attention_mask is not None:
+        causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
+        attn_weights = attn_weights + causal_mask
+
+    # upcast attention to fp32
+    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
+    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
+    attn_output = torch.matmul(attn_weights, value_states)
+    attn_output = attn_output.transpose(1, 2).contiguous()
+
+    return attn_output, attn_weights
+
+
+# copied from transformers.models.granite.modeling_granite.GraniteAttention with Granite->GraniteMoeHybrid
+# no longer copied after attention refactors
+class GraniteMoeHybridAttention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+    def __init__(self, config: GraniteMoeHybridConfig, layer_idx: int):
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+        if layer_idx is None:
+            logger.warning_once(
+                f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
+                "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
+                "when creating this class."
+            )
+
+        self.attention_dropout = config.attention_dropout
+        self.hidden_size = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = self.hidden_size // self.num_heads
+        self.num_key_value_heads = config.num_key_value_heads
+        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
+        self.is_causal = True
+
+        self.scaling = config.attention_multiplier
+
+        if (self.head_dim * self.num_heads) != self.hidden_size:
+            raise ValueError(
+                f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
+                f" and `num_heads`: {self.num_heads})."
+            )
+
+        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias)
+        self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
+        self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
+        self.o_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=config.attention_bias)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        use_cache: bool = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,  # None or rope embeddings
+        **kwargs,
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
+        bsz, q_len, _ = hidden_states.size()
+
+        query_states = self.q_proj(hidden_states)
+        key_states = self.k_proj(hidden_states)
+        value_states = self.v_proj(hidden_states)
+
+        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+
+        cos, sin = position_embeddings if position_embeddings is not None else (None, None)
+        if position_embeddings is not None:
+            query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+
+        if past_key_values is not None:
+            # sin and cos are specific to RoPE models; cache_position needed for the static cache
+            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        attention_interface: Callable = eager_attention_forward
+        if self.config._attn_implementation != "eager":
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            dropout=0.0 if not self.training else self.attention_dropout,
+            scaling=self.scaling,
+            **kwargs,
+        )
+
+        attn_output = attn_output.view(bsz, q_len, -1)
+        attn_output = self.o_proj(attn_output)
+
+        return attn_output, attn_weights
+
+
+class HybridMambaAttentionDynamicCache:
+    """
+    A dynamic cache that can handle both the attention cache (which has a seq_len dimension) and the mamba cache
+    (which has a constant shape regardless of seq_len).
+
+    This cache has two sets of lists of tensors: `key_cache` and `value_cache` for attention cache and `conv_states`
+    and `ssm_states` for mamba cache. Each of these lists has `num_layers` tensors. The expected shape for each tensor
+    For attention layers, `key_cache` and `value_cache` have a shape of `(batch_size, num_heads, seq_len, head_dim)`,
+    while `conv_states` and `ssm_states` have a shape of `(batch_size, 0)` (empty tensors).
+    For mamba layers, `key_cache` and `value_cache` have a shape of `(batch_size, 0)` (empty tensors),
+    while `conv_states` represents the convolution state and has a shape of `(batch_size, d_inner, d_conv)`,
+    and `ssm_states` represents the ssm state and has a shape of `(batch_size, d_inner, d_state)`.
+    """
+
+    is_compileable = False
+
+    def __init__(self, config: GraniteMoeHybridConfig, batch_size, dtype=torch.float16, device=None):
+        self.layers_block_type = config.layers_block_type
+        self.has_previous_state = False  # only used by mamba
+        conv_kernel_size = config.mamba_d_conv
+        ssm_state_size = config.mamba_d_state
+
+        self.conv_states = []
+        self.ssm_states = []
+        self.transformer_layers = []
+        for i in range(config.num_hidden_layers):
+            if self.layers_block_type[i] == "mamba":
+                self.conv_states += [
+                    torch.zeros(
+                        batch_size,
+                        (config.mamba_expand * config.hidden_size + 2 * config.mamba_n_groups * ssm_state_size),
+                        conv_kernel_size,
+                        device=device,
+                        dtype=dtype,
+                    )
+                ]
+                self.ssm_states += [
+                    torch.zeros(
+                        batch_size,
+                        config.mamba_n_heads,
+                        config.mamba_d_head,
+                        ssm_state_size,
+                        device=device,
+                        dtype=dtype,
+                    )
+                ]
+            else:
+                self.conv_states += [torch.tensor([[]] * batch_size, device=device)]
+                self.ssm_states += [torch.tensor([[]] * batch_size, device=device)]
+                self.transformer_layers.append(i)
+
+        self.key_cache = [torch.tensor([[]] * batch_size, device=device) for _ in range(config.num_hidden_layers)]
+        self.value_cache = [torch.tensor([[]] * batch_size, device=device) for _ in range(config.num_hidden_layers)]
+
+    def update(
+        self,
+        key_states: torch.Tensor,
+        value_states: torch.Tensor,
+        layer_idx: int,
+        cache_kwargs: Optional[dict[str, Any]] = None,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        # Update the cache
+        if self.key_cache[layer_idx].shape[-1] == 0:
+            self.key_cache[layer_idx] = key_states
+            self.value_cache[layer_idx] = value_states
+        else:
+            self.key_cache[layer_idx] = torch.cat([self.key_cache[layer_idx], key_states], dim=2)
+            self.value_cache[layer_idx] = torch.cat([self.value_cache[layer_idx], value_states], dim=2)
+
+        return self.key_cache[layer_idx], self.value_cache[layer_idx]
+
+    def reorder_cache(self, beam_idx: torch.LongTensor):
+        """Reorders the cache for beam search, given the selected beam indices."""
+        for layer_idx in range(len(self.key_cache)):
+            device = self.key_cache[layer_idx].device
+            self.key_cache[layer_idx] = self.key_cache[layer_idx].index_select(0, beam_idx.to(device))
+            device = self.value_cache[layer_idx].device
+            self.value_cache[layer_idx] = self.value_cache[layer_idx].index_select(0, beam_idx.to(device))
+
+            device = self.conv_states[layer_idx].device
+            self.conv_states[layer_idx] = self.conv_states[layer_idx].index_select(0, beam_idx.to(device))
+            device = self.ssm_states[layer_idx].device
+            self.ssm_states[layer_idx] = self.ssm_states[layer_idx].index_select(0, beam_idx.to(device))
+
+    def get_seq_length(self, layer_idx: Optional[int] = 0) -> int:
+        """Returns the sequence length of the cached states. A layer index can be optionally passed."""
+        # take any layer that contains cache and not empty tensor
+        layer_idx = self.transformer_layers[0] if layer_idx not in self.transformer_layers else layer_idx
+        if len(self.key_cache) <= layer_idx:
+            return 0
+        return self.key_cache[layer_idx].shape[-2]
+
+
+# Helper methods for segment sum computation
+
+
+def pad_tensor_by_size(input_tensor: torch.Tensor, pad_size: int):
+    """
+    Padding x tensor with `pad_size` on the seq_len dim (dim=1)
+
+    Assumes that we only have tensors of either size 4 or 3
+    """
+    pad_shape = (0, 0, 0, 0, 0, pad_size, 0, 0) if len(input_tensor.shape) == 4 else (0, 0, 0, pad_size, 0, 0)
+
+    return torch.nn.functional.pad(input_tensor, pad_shape, mode="constant", value=0)
+
+
+def reshape_into_chunks(input_tensor, pad_size, chunk_size):
+    """
+    Padding input_tensor with `pad_size` on the seq_len dim (dim=1) and
+    simultaneously splitting it into chunk sequences.
+
+    Assumes that we only have tensors of either size 4 or 3
+    """
+    # [bsz, seq_len, ...] -> [bsz, seq_len multiple of chunk_size, ...]
+    input_tensor = pad_tensor_by_size(input_tensor, pad_size)
+
+    if len(input_tensor.shape) == 3:
+        # [bsz, seq_len multiple of chunk_size, num_heads] -> [bsz, -1, chunk_size, num_heads]
+        return input_tensor.reshape(input_tensor.shape[0], -1, chunk_size, input_tensor.shape[2])
+    else:
+        # [bsz, seq_len multiple of chunk_size, num_heads, head_dim or state_size] -> [bsz, -1, chunk_size, num_heads, head_dim or state_size]
+        return input_tensor.reshape(
+            input_tensor.shape[0], -1, chunk_size, input_tensor.shape[2], input_tensor.shape[3]
+        )
+
+
+def segment_sum(input_tensor):
+    """
+    More stable segment sum calculation. Uses cumulative sums and masking instead of direct subtractions.
+    """
+    chunk_size = input_tensor.size(-1)
+    # 1. expand input tensor to have an additional dimension and repeat along that dimension
+    # [..., chunk_size] -> [..., chunk_size, chunk_size]
+    input_tensor = input_tensor[..., None].expand(*input_tensor.size(), chunk_size)
+    # 2. create a lower triangular mask with the diagonal set to 0 to 0 out elements above diag
+    mask = torch.tril(torch.ones(chunk_size, chunk_size, device=input_tensor.device, dtype=torch.bool), diagonal=-1)
+    input_tensor = input_tensor.masked_fill(~mask, 0)
+    # 3. compute actual cumsum
+    tensor_segsum = torch.cumsum(input_tensor, dim=-2)
+
+    # 4. apply mask to keep only the lower triangular part of the cumulative sum result (incl diagonal this time)
+    mask = torch.tril(torch.ones(chunk_size, chunk_size, device=input_tensor.device, dtype=torch.bool), diagonal=0)
+    tensor_segsum = tensor_segsum.masked_fill(~mask, -torch.inf)
+    return tensor_segsum
+
+
+is_fast_path_available = all((selective_state_update, causal_conv1d_fn, causal_conv1d_update))  # no kernel is None
+
+
+def apply_mask_to_padding_states(hidden_states, attention_mask):
+    """
+    Tunes out the hidden states for padding tokens, see https://github.com/state-spaces/mamba/issues/66
+    """
+    if attention_mask is not None and attention_mask.shape[1] > 1 and attention_mask.shape[0] > 1:
+        dtype = hidden_states.dtype
+        hidden_states = (hidden_states * attention_mask[:, :, None]).to(dtype)
+
+    return hidden_states
+
+
+# Adapted from transformers.models.mamba2.modeling_mamba2.Mamba2Mixer
+class GraniteMoeHybridMambaLayer(nn.Module):
+    """
+    Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`.
+    A, D are input independent (see Mamba paper [1] Section 3.5.2 "Interpretation of A" for why A isn't selective)
+    ∆, B, C are input-dependent (this is a key difference between Mamba and the linear time invariant S4,
+    and is why Mamba is called **selective** state spaces)
+
+    There are a few differences between this and Mamba2Mixer:
+    - The variable use_precomputed_states is slightly different due to the hybrid cache structure
+    - There's a few non-obvious bugs fixed with batching in the slow path that exist in main
+    - Some extra variables that our layer doesn't need have been removed
+    - We ported most of the refactors in https://github.com/huggingface/transformers/pull/35154, which is (as of Dec 18, 2024) unmerged
+    """
+
+    def __init__(self, config: GraniteMoeHybridConfig, layer_idx: int):
+        super().__init__()
+        self.num_heads = config.mamba_n_heads
+        self.hidden_size = config.hidden_size
+        self.ssm_state_size = config.mamba_d_state
+        self.conv_kernel_size = config.mamba_d_conv
+        self.intermediate_size = int(config.mamba_expand * self.hidden_size)
+        self.layer_idx = layer_idx
+        self.use_conv_bias = config.mamba_conv_bias
+        self.activation = config.hidden_act
+        self.act = ACT2FN[config.hidden_act]
+        self.use_bias = config.mamba_proj_bias
+
+        self.layer_norm_epsilon = config.rms_norm_eps
+
+        self.n_groups = config.mamba_n_groups
+        self.head_dim = config.mamba_d_head
+        self.chunk_size = config.mamba_chunk_size
+
+        # FIXME:
+        self.time_step_limit = (0.0, float("inf"))
+        self.time_step_min = 0.001
+        self.time_step_max = 0.1
+
+        self.conv_dim = self.intermediate_size + 2 * self.n_groups * self.ssm_state_size
+        self.conv1d = nn.Conv1d(
+            in_channels=self.conv_dim,
+            out_channels=self.conv_dim,
+            bias=config.mamba_conv_bias,
+            kernel_size=self.conv_kernel_size,
+            groups=self.conv_dim,
+            padding=self.conv_kernel_size - 1,
+        )
+
+        # projection of the input hidden states
+        projection_size = self.intermediate_size + self.conv_dim + self.num_heads
+        self.in_proj = nn.Linear(
+            self.hidden_size,
+            projection_size,
+            bias=self.use_bias,
+        )
+        # selective projection used to make dt, B and C input dependent
+
+        # time step projection (discretization)
+        # instantiate once and copy inv_dt in init_weights of PretrainedModel
+        self.dt_bias = nn.Parameter(torch.ones(self.num_heads))
+
+        # S4D real initialization. These are not discretized!
+        # The core is to load them, compute the discrete states, then write the updated state. Keeps the memory bounded
+        A = torch.arange(1, self.num_heads + 1)
+        self.A_log = nn.Parameter(torch.log(A))
+        self.A_log._no_weight_decay = True
+        self.norm = GraniteMoeHybridRMSNormGated(self.intermediate_size, eps=self.layer_norm_epsilon)
+        self.D = nn.Parameter(torch.ones(self.num_heads))
+        self.D._no_weight_decay = True
+
+        self.out_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=self.use_bias)
+
+        if not is_fast_path_available:
+            logger.warning_once(
+                "The fast path is not available because on of `(selective_state_update, causal_conv1d_fn, causal_conv1d_update)`"
+                " is None. Falling back to the naive implementation. To install follow https://github.com/state-spaces/mamba/#installation and"
+                " https://github.com/Dao-AILab/causal-conv1d"
+            )
+        else:
+            logger.warning_once("The fast path for GraniteMoeHybrid will be used when running the model on a GPU")
+
+    def cuda_kernels_forward(
+        self,
+        hidden_states: torch.Tensor,
+        cache_params: Optional[HybridMambaAttentionDynamicCache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        seq_idx: Optional[torch.IntTensor] = None,
+    ):
+        """Forward pass via the `mamba_ssm` / `causal_conv1d` kernels (cached one-token step or full sequence)."""
+        hidden_states = apply_mask_to_padding_states(hidden_states, attention_mask)
+        projected_states = self.in_proj(hidden_states)  # 1. Gated MLP's linear projection
+
+        # Set up dimensions for reshapes later
+        batch_size, seq_len, _ = hidden_states.shape
+        groups_time_state_size = self.n_groups * self.ssm_state_size  # width of B (and of C)
+
+        use_precomputed_states = (
+            cache_params is not None
+            and cache_params.has_previous_state
+            and seq_len == 1  # only a single new token is being processed
+            and cache_params.conv_states[self.layer_idx].shape[0]
+            == cache_params.ssm_states[self.layer_idx].shape[0]
+            == batch_size
+            and cache_position is not None
+            and cache_position[0] > 0
+        )
+
+        # Single-token step that reuses the conv / ssm states stored in the cache
+        if use_precomputed_states:
+            gate, hidden_states_B_C, dt = projected_states.squeeze(1).split(
+                [self.intermediate_size, self.conv_dim, self.num_heads], dim=-1
+            )
+
+            # 2. Convolution sequence transformation
+            hidden_states_B_C = causal_conv1d_update(
+                hidden_states_B_C,
+                cache_params.conv_states[self.layer_idx],
+                self.conv1d.weight.squeeze(1),
+                self.conv1d.bias,
+                self.activation,
+            )
+
+            hidden_states, B, C = torch.split(
+                hidden_states_B_C,
+                [self.intermediate_size, groups_time_state_size, groups_time_state_size],
+                dim=-1,
+            )
+
+            # 3. SSM transformation
+            A = -torch.exp(self.A_log.float())  # (nheads,)
+            A = A[:, None, ...][:, :, None].expand(-1, self.head_dim, self.ssm_state_size).to(dtype=torch.float32)
+            dt = dt[:, :, None].expand(-1, -1, self.head_dim)
+            dt_bias = self.dt_bias[:, None, ...].expand(-1, self.head_dim)
+            D = self.D[:, None, ...].expand(-1, self.head_dim)
+            B = B.view(batch_size, self.n_groups, B.shape[1] // self.n_groups)
+            C = C.view(batch_size, self.n_groups, C.shape[1] // self.n_groups)
+            hidden_states_reshaped = hidden_states.view(batch_size, self.num_heads, self.head_dim)
+            hidden_states = selective_state_update(
+                cache_params.ssm_states[self.layer_idx],
+                hidden_states_reshaped,
+                dt,
+                A,
+                B,
+                C,
+                D,
+                z=None,
+                dt_bias=dt_bias,
+                dt_softplus=True,
+            )
+            hidden_states = hidden_states.view(batch_size, self.num_heads * self.head_dim)
+            hidden_states = self.norm(hidden_states, gate)
+
+            # 4. Final linear projection
+            out = self.out_proj(hidden_states)[:, None, ...]  # re-insert the seq dim squeezed above
+        # Fused calculations or step by step if no initialized cache is found
+        else:
+            A = -torch.exp(self.A_log.float())  # (num_heads) or (intermediate_size, state_size)
+            dt_limit_kwargs = {} if self.time_step_limit == (0.0, float("inf")) else {"dt_limit": self.time_step_limit}
+
+            # 2-4. Fused kernel for conv1d, SSM, and the final projection
+            if self.training and cache_params is None:  # no cache to fill: a single fused call
+                out = mamba_split_conv1d_scan_combined(
+                    projected_states,
+                    self.conv1d.weight.squeeze(1),
+                    self.conv1d.bias,
+                    self.dt_bias,
+                    A,
+                    D=self.D,
+                    chunk_size=self.chunk_size,
+                    seq_idx=seq_idx,
+                    activation=self.activation,
+                    rmsnorm_weight=self.norm.weight,
+                    rmsnorm_eps=self.norm.variance_epsilon,
+                    outproj_weight=self.out_proj.weight,
+                    outproj_bias=self.out_proj.bias,
+                    headdim=self.head_dim,
+                    ngroups=self.n_groups,
+                    norm_before_gate=False,
+                    return_final_states=False,
+                    **dt_limit_kwargs,
+                )
+
+            else:
+                gate, hidden_states_B_C, dt = projected_states.split(
+                    [self.intermediate_size, self.conv_dim, self.num_heads], dim=-1
+                )
+
+                # 2. Convolution sequence transformation
+                # Init cache (conv state = the last `conv_kernel_size` inputs, zero-padded on the left if shorter)
+                if cache_params is not None:
+                    # storing the states
+                    # If we just take xBC[:, :, -self.d_conv :], it will error if seqlen < self.d_conv
+                    # Instead F.pad will pad with zeros if seqlen < self.d_conv, and truncate otherwise.
+                    hidden_states_B_C_transposed = hidden_states_B_C.transpose(1, 2)
+                    conv_states = nn.functional.pad(
+                        hidden_states_B_C_transposed,
+                        (self.conv_kernel_size - hidden_states_B_C_transposed.shape[-1], 0),
+                    )
+                    cache_params.conv_states[self.layer_idx].copy_(conv_states)
+
+                if self.activation not in ["silu", "swish"]:  # other activations go through nn.Conv1d + self.act
+                    hidden_states_B_C = self.act(
+                        self.conv1d(hidden_states_B_C.transpose(1, 2))[..., :seq_len].transpose(1, 2)
+                    )
+                else:
+                    hidden_states_B_C = causal_conv1d_fn(
+                        x=hidden_states_B_C.transpose(1, 2),
+                        weight=self.conv1d.weight.squeeze(1),
+                        bias=self.conv1d.bias,
+                        activation=self.activation,
+                        seq_idx=seq_idx,
+                    ).transpose(1, 2)
+
+                hidden_states_B_C = apply_mask_to_padding_states(hidden_states_B_C, attention_mask)
+                hidden_states, B, C = torch.split(
+                    hidden_states_B_C,
+                    [self.intermediate_size, groups_time_state_size, groups_time_state_size],
+                    dim=-1,
+                )
+
+                # 3. SSM transformation
+                scan_output, ssm_state = mamba_chunk_scan_combined(
+                    hidden_states.view(batch_size, seq_len, -1, self.head_dim),
+                    dt,
+                    A,
+                    B.view(batch_size, seq_len, self.n_groups, -1),
+                    C.view(batch_size, seq_len, self.n_groups, -1),
+                    chunk_size=self.chunk_size,
+                    D=self.D,
+                    z=None,
+                    seq_idx=seq_idx,
+                    return_final_states=True,
+                    dt_bias=self.dt_bias,
+                    dt_softplus=True,
+                    **dt_limit_kwargs,
+                )
+
+                # Init cache with the final ssm state returned by the scan
+                if ssm_state is not None and cache_params is not None:
+                    cache_params.ssm_states[self.layer_idx].copy_(ssm_state)
+
+                scan_output = scan_output.view(batch_size, seq_len, -1)
+                # Multiply "gate" branch and apply extra normalization layer
+                scan_output = self.norm(scan_output, gate)
+
+                # 4. Final linear projection
+                out = self.out_proj(scan_output)
+        return out
+
+    # fmt: off
+    def torch_forward(
+        self,
+        input_states,
+        cache_params: Optional[HybridMambaAttentionDynamicCache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+    ):
+        """
+        Run the mixer with plain PyTorch ops: input projection, convolution, SSM update, gated normalization and
+        output projection.
+
+        Two code paths exist. When a populated cache is supplied and exactly one new token is processed
+        (`use_precomputed_states`), the cached convolution window and SSM state are updated in place by a single
+        recurrent step. Otherwise the whole sequence is processed with the chunked ("naive SSD") formulation and,
+        if a cache is supplied, the cache entries for this layer are initialized from the result.
+
+        Args:
+            input_states:
+                3-D tensor unpacked as `(batch_size, seq_len, _)`.
+            cache_params (`HybridMambaAttentionDynamicCache`, *optional*):
+                Cache whose `conv_states[self.layer_idx]` and `ssm_states[self.layer_idx]` are read and written.
+            cache_position (`torch.LongTensor`, *optional*):
+                Only `cache_position[0] > 0` is inspected, as part of the single-step decode check.
+            attention_mask (`torch.Tensor`, *optional*):
+                Forwarded to `apply_mask_to_padding_states` (defined elsewhere; presumably zeroes padded positions).
+
+        Returns:
+            The result of `self.out_proj`, cast from the input dtype of `input_states`.
+        """
+        batch_size, seq_len, _ = input_states.shape
+        dtype = input_states.dtype
+
+        # 1. Gated MLP's linear projection
+        input_states = apply_mask_to_padding_states(input_states, attention_mask)
+        projected_states = self.in_proj(input_states)
+        gate, hidden_states_B_C, dt = projected_states.split(
+                [self.intermediate_size, self.conv_dim, self.num_heads], dim=-1
+        )
+
+        # Single-step decoding is only used when a cache with previous state exists, exactly one token is being
+        # processed, both cached tensors have the current batch size, and we are past the first position.
+        use_precomputed_states = (
+            cache_params is not None
+            and cache_params.has_previous_state
+            and seq_len == 1
+            and cache_params.conv_states[self.layer_idx].shape[0]
+            == cache_params.ssm_states[self.layer_idx].shape[0]
+            == batch_size
+            and cache_position is not None
+            and cache_position[0] > 0
+        )
+
+        # 2. Convolution sequence transformation
+        if use_precomputed_states:
+            # Slide the cached window one step along its last axis and write the new token into the last slot
+            cache_params.conv_states[self.layer_idx] = cache_params.conv_states[self.layer_idx].roll(shifts=-1, dims=-1)
+            cache_params.conv_states[self.layer_idx][:, :, -1] = hidden_states_B_C[:, 0, :].to(cache_params.conv_states[self.layer_idx].device)
+
+            # We need to guarantee that anything regarding the cache is on the same device
+            conv_states = cache_params.conv_states[self.layer_idx].to(device=self.conv1d.weight.device)
+
+            # Convolution output for the newest position only: multiply the cached window elementwise by the
+            # kernel (dim 1 of the weight squeezed away) and sum over the kernel axis
+            hidden_states_B_C = torch.sum(
+                conv_states * self.conv1d.weight.squeeze(1), dim=-1
+            )
+            if self.use_conv_bias:
+                hidden_states_B_C = hidden_states_B_C + self.conv1d.bias
+            hidden_states_B_C = self.act(hidden_states_B_C)
+        else:
+            # Init cache
+            if cache_params is not None:
+                hidden_states_B_C_transposed = hidden_states_B_C.transpose(1, 2)
+                # Pad on the left of the time axis up to `conv_kernel_size` (a negative amount crops instead)
+                conv_states = nn.functional.pad(
+                    hidden_states_B_C_transposed, (self.conv_kernel_size - hidden_states_B_C_transposed.shape[-1], 0)
+                )
+                cache_params.conv_states[self.layer_idx].copy_(conv_states)
+
+            # The convolution output is trimmed back to `seq_len` positions along the time axis
+            hidden_states_B_C = self.act(self.conv1d(hidden_states_B_C.transpose(1, 2))[..., :seq_len].transpose(1, 2))
+
+        hidden_states_B_C = apply_mask_to_padding_states(hidden_states_B_C, attention_mask)
+        hidden_states, B, C = torch.split(
+            hidden_states_B_C,
+            [self.intermediate_size, self.n_groups * self.ssm_state_size, self.n_groups * self.ssm_state_size],
+            dim=-1
+        )
+
+        # 3. SSM transformation
+        A = -torch.exp(self.A_log.float())                            # [num_heads]
+        if use_precomputed_states:
+            # We need to guarantee that anything regarding the cache is on the same device
+            cache_device = cache_params.ssm_states[self.layer_idx].device
+
+            # Note: there is no need to pad parameter matrices here, as there is just one new token
+            # for batched generation
+            dt = dt[:, 0, :][:, None, ...]
+            dt = dt.transpose(1, 2).expand(batch_size, dt.shape[-1], self.head_dim)
+            # [num_heads] -> [num_heads, head_dim]
+            dt_bias = self.dt_bias[..., None].expand(self.dt_bias.shape[0], self.head_dim)
+
+            dt = torch.nn.functional.softplus(dt + dt_bias.to(dt.dtype))
+            dt = torch.clamp(dt, self.time_step_limit[0], self.time_step_limit[1])
+            A = A[..., None, None].expand(self.num_heads, self.head_dim, self.ssm_state_size).to(dtype=torch.float32)
+            # [bsz, num_heads, head_dim, state_size]
+            dA = (torch.exp(dt[..., None] * A)).to(device=cache_device)
+
+            # Discretize B
+            # [bsz, n_groups * state_size] -> [bsz, n_groups, 1, state_size] ->
+            # -> [bsz, n_groups, group to head repetition factor, state_size] -> [bsz, num_heads, state_size]
+            B = B.reshape(batch_size, self.n_groups, -1)[..., None, :]
+            B = B.expand(batch_size, self.n_groups, self.num_heads // self.n_groups, B.shape[-1]).contiguous()
+            B = B.reshape(batch_size, -1, B.shape[-1])
+            # [bsz, num_heads, head_dim, state_size]
+            dB = dt[..., None] * B[..., None, :]
+
+            # Discretize x into dB
+            # [bsz, intermediate_size] -> [bsz, num_heads, head_dim]
+            hidden_states = hidden_states.reshape(batch_size, -1, self.head_dim)
+            dBx = (dB * hidden_states[..., None]).to(device=cache_device)
+
+            # State calculation: one recurrent step, written back into the cache in place
+            cache_params.ssm_states[self.layer_idx].copy_(
+                cache_params.ssm_states[self.layer_idx] * dA + dBx
+            )
+
+            # Subsequent output
+            # [bsz, n_groups * state_size] -> [bsz, num_heads, state_size]
+            C = C.reshape(batch_size, self.n_groups, -1)[..., None, :]
+            C = C.expand(batch_size, self.n_groups, self.num_heads // self.n_groups, C.shape[-1]).contiguous()
+            C = C.reshape(batch_size, -1, C.shape[-1])
+            # [bsz, num_heads, head_dim]
+
+            ssm_states = cache_params.ssm_states[self.layer_idx].to(device=C.device, dtype=C.dtype)  # Shape: [b, h, d, n]
+            # Reshape ssm_states to merge the first two dimensions
+            ssm_states_reshaped = ssm_states.view(batch_size * self.num_heads, self.head_dim, self.ssm_state_size)  # Shape: [b*h, d, n]
+            C_reshaped = C.view(batch_size * self.num_heads, self.ssm_state_size, 1)  # Shape: [b*h, n, 1]
+            y = torch.bmm(ssm_states_reshaped, C_reshaped)
+            y = y.view(batch_size, self.num_heads, self.head_dim)
+
+            # D skip connection
+            # [num_heads] -> [num_heads, head_dim]
+            D = self.D[..., None].expand(self.D.shape[0], self.head_dim)
+            y = (y + hidden_states * D).to(y.dtype)
+
+            # [bsz, num_heads, head_dim] -> [bsz, 1, intermediate_size]
+            y = y.reshape(batch_size, -1)[:, None, ...]
+        else:
+            # begin ssd naive implementation without einsums
+            dt = nn.functional.softplus(dt + self.dt_bias)
+            dt = torch.clamp(dt, self.time_step_limit[0], self.time_step_limit[1])
+            hidden_states = hidden_states.reshape(batch_size, seq_len, -1, self.head_dim).float()
+            B = B.reshape(batch_size, seq_len, -1, self.ssm_state_size).float()
+            C = C.reshape(batch_size, seq_len, -1, self.ssm_state_size).float()
+            B = B.repeat_interleave(self.num_heads // self.n_groups, dim=2, output_size=self.num_heads)
+            C = C.repeat_interleave(self.num_heads // self.n_groups, dim=2, output_size=self.num_heads)
+            # Number of positions needed to make the sequence length a multiple of `chunk_size`
+            pad_size = (self.chunk_size - seq_len % self.chunk_size) % self.chunk_size
+
+            # The skip term is computed from the un-discretized inputs, before they are scaled by dt below
+            D_residual = self.D[..., None] * pad_tensor_by_size(hidden_states, pad_size)
+
+            # Discretize x and A
+            hidden_states = hidden_states * dt[..., None]
+            A = A.to(hidden_states.dtype) * dt
+
+            # Rearrange into blocks/chunks
+            hidden_states, A, B, C = [reshape_into_chunks(t, pad_size, self.chunk_size) for t in (hidden_states, A, B, C)]
+
+            # [bsz, -1, chunk_size, num_heads] -> [bsz, num_heads, -1, chunk_size]
+            A = A.permute(0, 3, 1, 2)
+            A_cumsum = torch.cumsum(A, dim=-1)
+
+            # 1. Compute the output for each intra-chunk (diagonal blocks)
+            # This is the analog of a causal mask
+            L = torch.exp(segment_sum(A))
+
+            # Contraction of C and B to get G (attention-weights like)
+            G_intermediate = C[:, :, :, None, :, :] * B[:, :, None, :, :, :]  # shape: (b, c, l, s, h, n)
+            G = G_intermediate.sum(dim=-1)  # shape: (b, c, l, s, h)
+
+            # Compute M, equivalent to applying attention mask to weights
+            M_intermediate = G[..., None] * L.permute(0, 2, 3, 4, 1)[..., None]
+            M = M_intermediate.sum(dim=-1)
+
+            # Compute Y_diag (apply to values)
+            Y_diag = (M[..., None] * hidden_states[:, :, None]).sum(dim=3)
+
+            # 2. Compute the state for each intra-chunk
+            # (right term of low-rank factorization of off-diagonal blocks; B terms)
+            decay_states = torch.exp(A_cumsum[:, :, :, -1:] - A_cumsum)
+            B_decay = B * decay_states.permute(0, -2, -1, 1)[..., None]
+            states = (B_decay[..., None, :] * hidden_states[..., None]).sum(dim=2)
+
+            # 3. Compute the inter-chunk SSM recurrence; produces correct SSM states at chunk boundaries
+            # (middle term of factorization of off-diag blocks; A terms)
+            # NOTE(review): this code sits in the `else` of `if use_precomputed_states`, so the flag is always
+            # False here and the first branch below is unreachable; the initial state is always zeros.
+            if use_precomputed_states:
+                previous_states = cache_params.ssm_states[self.layer_idx][:, None, ...].to(device=states.device)
+            else:
+                previous_states = torch.zeros_like(states[:, :1])
+            states = torch.cat([previous_states, states], dim=1)
+            decay_chunk = torch.exp(segment_sum(nn.functional.pad(A_cumsum[:, :, :, -1], (1, 0))))
+            decay_chunk = decay_chunk.transpose(1, 3)
+            new_states = (decay_chunk[..., None, None] * states[:, :, None, ...]).sum(dim=1)
+            # All but the last entry feed the per-chunk outputs; the last entry is the final SSM state
+            states, ssm_state = new_states[:, :-1], new_states[:, -1]
+
+            # 4. Compute state -> output conversion per chunk
+            # (left term of low-rank factorization of off-diagonal blocks; C terms)
+            state_decay_out = torch.exp(A_cumsum)
+            C_times_states = (C[..., None, :] * states[:, :, None, ...])
+            state_decay_out_permuted = state_decay_out.permute(0, 2, 3, 1)
+            Y_off = (C_times_states.sum(-1) * state_decay_out_permuted[..., None])
+
+            # Add output of intra-chunk and inter-chunk terms (diagonal and off-diagonal blocks)
+            y = Y_diag + Y_off
+            # [bsz, -1, self.chunk_size, num_heads, head_dim] -> [bsz, (padded) seq_len, num_heads, head_dim]
+            y = y.reshape(batch_size, -1, self.num_heads, self.head_dim)
+
+            y = y + D_residual
+            # Cutting off padded chunks
+            if pad_size > 0:
+                y = y[:, :seq_len, :, :]
+            y = y.reshape(batch_size, seq_len, -1)
+
+            # Init cache
+            # NOTE(review): `ssm_state` is always a tensor at this point, so only the `cache_params` check matters.
+            if ssm_state is not None and cache_params is not None:
+                cache_params.ssm_states[self.layer_idx].copy_(ssm_state)
+
+        # Gated normalization: `gate` comes from the input projection split in step 1
+        scan_output = self.norm(y, gate)
+
+        # end ssd naive
+
+        # 4. Final linear projection
+        contextualized_states = self.out_proj(scan_output.to(dtype))  # [batch, seq_len, hidden_size]
+        return contextualized_states
+    # fmt: on
+
+    def forward(
+        self,
+        hidden_states,
+        cache_params: Optional[HybridMambaAttentionDynamicCache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        seq_idx: Optional[torch.IntTensor] = None,
+        **kwargs,
+    ):
+        """
+        Dispatch to the fused-kernel implementation when possible, otherwise to the pure PyTorch one.
+
+        Args:
+            hidden_states: Input activations of the mixer.
+            cache_params (`HybridMambaAttentionDynamicCache`, *optional*): Cache forwarded to the implementation.
+            cache_position (`torch.LongTensor`, *optional*): Forwarded to the implementation.
+            attention_mask (`torch.Tensor`, *optional*): Mask indexed as `[batch, seq]`; used to zero inputs on
+                the PyTorch path when it has more than one row and more than one column.
+            seq_idx (`torch.IntTensor`, *optional*): Only supported by the fused-kernel path.
+            **kwargs: Accepted and ignored.
+
+        Raises:
+            NotImplementedError: If `seq_idx` is given but the fused-kernel path is not taken.
+        """
+        take_kernel_path = is_fast_path_available and "cuda" in self.in_proj.weight.device.type
+        if take_kernel_path:
+            return self.cuda_kernels_forward(hidden_states, cache_params, cache_position, attention_mask, seq_idx)
+
+        # Everything below is the fallback path, which cannot honor `seq_idx`.
+        if seq_idx is not None:
+            raise NotImplementedError(
+                "`seq_idx` support requires fast path support. Please install `mamba_ssm` and `causal_conv1d`"
+            )
+
+        input_dtype = hidden_states.dtype
+        mask_is_batched = (
+            attention_mask is not None and attention_mask.shape[1] > 1 and attention_mask.shape[0] > 1
+        )
+        if mask_is_batched:
+            # tune out hidden states for pad tokens, see https://github.com/state-spaces/mamba/issues/66
+            masked_states = hidden_states * attention_mask[:, :, None]
+            hidden_states = masked_states.to(input_dtype)
+
+        return self.torch_forward(hidden_states, cache_params, cache_position, attention_mask)
+
+
+class GraniteMoeHybridRMSNormGated(torch.nn.Module):
+    """RMS normalization over the last dimension with an optional SiLU gate applied before normalizing."""
+
+    def __init__(self, hidden_size, eps=1e-6):
+        """
+        Args:
+            hidden_size: Size of the last dimension; the learned scale has this many entries.
+            eps: Constant added to the mean square before the reciprocal square root.
+        """
+        super().__init__()
+        self.variance_epsilon = eps
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+
+    def forward(self, hidden_states, gate=None):
+        """
+        Normalize `hidden_states`, first multiplying by `silu(gate)` when a gate is given.
+
+        The arithmetic runs in float32; the normalized values are cast back to the input dtype before being
+        scaled by `self.weight`.
+        """
+        original_dtype = hidden_states.dtype
+        activations = hidden_states.to(torch.float32)
+        if gate is not None:
+            gate_values = nn.functional.silu(gate.to(torch.float32))
+            activations = activations * gate_values
+
+        mean_square = activations.pow(2).mean(-1, keepdim=True)
+        normalized = activations * torch.rsqrt(mean_square + self.variance_epsilon)
+        return self.weight * normalized.to(original_dtype)
+
+
+class GraniteMoeHybridMLP(nn.Module):
+    """
+    Gated feed-forward block used for the shared experts.
+
+    A single bias-free projection produces two halves; the activation of the first half multiplies the second
+    half, and the product is projected back to the input width.
+
+    Args:
+        config:
+            Configuration object with model hyperparameters.
+    """
+
+    def __init__(self, config: GraniteMoeHybridConfig):
+        super().__init__()
+
+        self.input_size = config.hidden_size
+        self.hidden_size = config.shared_intermediate_size
+        self.activation = ACT2FN[config.hidden_act]
+        # The up projection is twice as wide because it carries both halves of the gated product.
+        self.input_linear = nn.Linear(self.input_size, 2 * self.hidden_size, bias=False)
+        self.output_linear = nn.Linear(self.hidden_size, self.input_size, bias=False)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        """Apply the gated feed-forward transform along the last dimension."""
+        activated_half, linear_half = self.input_linear(hidden_states).chunk(2, dim=-1)
+        gated = self.activation(activated_half) * linear_half
+        return self.output_linear(gated)
+
+
+class GraniteFlashAttentionKwargs(TypedDict, total=False):
+    """
+    Keyword arguments for advanced Flash Attention, causal-conv1d, and mamba_ssm kernel usage.
+    Use cases include padding-free training and fewer `torch.compile` graph breaks.
+
+    All keys are optional (`total=False`).
+
+    Attributes:
+        cu_seq_lens_q (`torch.LongTensor`):
+            Cumulative sequence lengths for the query states.
+        cu_seq_lens_k (`torch.LongTensor`):
+            Cumulative sequence lengths for the key states.
+        max_length_q (`int`):
+            Maximum sequence length for the query states.
+        max_length_k (`int`):
+            Maximum sequence length for the key states.
+        seq_idx (`torch.IntTensor`):
+            Index of each packed sequence.
+    """
+
+    cu_seq_lens_q: torch.LongTensor
+    cu_seq_lens_k: torch.LongTensor
+    max_length_q: int
+    max_length_k: int
+    seq_idx: torch.IntTensor
+
+
+class GraniteMoeHybridRMSNorm(nn.Module):
+    """Root-mean-square layer normalization with a learned per-feature scale."""
+
+    def __init__(self, hidden_size, eps=1e-6):
+        """
+        GraniteMoeHybridRMSNorm is equivalent to T5LayerNorm
+
+        Args:
+            hidden_size: Number of features in the last dimension.
+            eps: Constant added to the mean square before the reciprocal square root.
+        """
+        super().__init__()
+        self.variance_epsilon = eps
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+
+    def forward(self, hidden_states):
+        """Normalize over the last dimension in float32, cast back to the input dtype, then scale."""
+        original_dtype = hidden_states.dtype
+        as_float = hidden_states.to(torch.float32)
+        mean_square = as_float.pow(2).mean(-1, keepdim=True)
+        normalized = as_float * torch.rsqrt(mean_square + self.variance_epsilon)
+        return self.weight * normalized.to(original_dtype)
+
+    def extra_repr(self):
+        """Describe the weight shape and epsilon for the module's printed representation."""
+        weight_shape = tuple(self.weight.shape)
+        return f"{weight_shape}, eps={self.variance_epsilon}"
+
+
+class GraniteMoeHybridParallelExperts(nn.Module):
+    """Bank of bias-free linear experts whose weights live in one stacked parameter."""
+
+    def __init__(self, num_experts: int, input_size: int, output_size: int) -> None:
+        """
+        Initialize the GraniteMoeHybridParallelExperts module.
+        The experts weights are stored in [num_experts, output_size, input_size] format. Such that it's compatible with
+        many MoE libraries, such as [Megablock](https://github.com/databricks/megablocks) and
+        [ScatterMoE](https://github.com/shawntan/scattermoe), as well as the
+        [MoE kernel](https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/fused_moe/fused_moe.py)
+        used in vllm.
+
+        The weight is allocated uninitialized (`torch.empty`); it must be filled in before use.
+
+        Args:
+            num_experts (int):
+                Number of experts.
+            input_size (int):
+                Size of the input.
+            output_size (int):
+                Size of the output.
+        """
+        super().__init__()
+        self.num_experts = num_experts
+        self.input_size = input_size
+        self.output_size = output_size
+        self.weight = nn.Parameter(torch.empty(num_experts, output_size, input_size))
+
+    def forward(self, inputs, expert_size):
+        """
+        Apply each expert to its own contiguous slice of `inputs`.
+
+        Args:
+            inputs (Tensor):
+                Rows already grouped by expert, in expert order.
+            expert_size:
+                Split sizes along dim 0, one per expert.
+
+        Returns:
+            Tensor: The per-expert outputs concatenated along dim 0, in the same row order as `inputs`.
+        """
+        per_expert_inputs = inputs.split(expert_size, dim=0)
+        per_expert_outputs = [
+            F.linear(per_expert_inputs[expert_id], self.weight[expert_id]) for expert_id in range(self.num_experts)
+        ]
+        return torch.cat(per_expert_outputs, dim=0)
+
+
+class GraniteMoeHybridTopKGating(nn.Module):
+    """Linear router that picks the `top_k` highest-scoring experts for every token."""
+
+    def __init__(self, input_size: int, num_experts: int, top_k: int):
+        """
+        Initialize the top-k gating mechanism.
+        Args:
+            input_size (`int`):
+                Size of the input.
+            num_experts (`int`):
+                Number of experts.
+            top_k (`int`):
+                Number of top experts to select.
+        """
+        super().__init__()
+
+        self.input_size = input_size
+        self.num_experts = num_experts
+        self.top_k = top_k
+
+        self.layer = nn.Linear(input_size, num_experts, bias=False)
+
+    def forward(self, hidden_states):
+        """
+        Route a 2-D batch of tokens.
+
+        Returns:
+            A 5-tuple `(index_sorted_experts, batch_index, batch_gates, expert_size, logits)`:
+            the permutation that sorts the flattened (token, slot) assignments by expert id, the token row of
+            each sorted assignment, the gate value of each sorted assignment, a Python list with the number of
+            assignments per expert, and the float32 router logits.
+        """
+        # Routing decision: scores in float32, softmax taken over the selected experts only.
+        logits = self.layer(hidden_states).float()  # [num_tokens, num_experts]
+        selected_logits, selected_experts = torch.topk(logits, self.top_k, dim=1)  # [num_tokens, top_k]
+        gate_values = torch.softmax(selected_logits, dim=1).type_as(hidden_states)  # [num_tokens, top_k]
+
+        # One-hot style assignment matrix, summed per column to count the load of each expert.
+        assignment = torch.zeros(
+            [gate_values.size(0), self.num_experts], dtype=gate_values.dtype, device=gate_values.device
+        ).scatter(1, selected_experts, 1)  # [num_tokens, num_experts]
+        # (This cause torch.compile to fail with `torch._dynamo.exc.Unsupported: Backend compiler failed with a fake tensor exception at`)
+        # (and `DataDependentOutputException`)
+        expert_size = assignment.long().sum(0).tolist()
+
+        # Group the flattened assignments by expert; integer division recovers the originating token row.
+        _, index_sorted_experts = selected_experts.flatten().sort(0)  # [num_tokens * top_k]
+        batch_index = torch.div(index_sorted_experts, self.top_k, rounding_mode="trunc")  # [num_tokens * top_k]
+
+        # Gate values reordered to match the grouped assignments.
+        batch_gates = gate_values.flatten()[index_sorted_experts]  # [num_tokens * top_k]
+
+        return index_sorted_experts, batch_index, batch_gates, expert_size, logits
+
+
+class GraniteMoeHybridMoE(nn.Module):
+    """
+    A Sparsely gated mixture of experts layer with 1-layer Feed-Forward networks as experts.
+
+    Tokens are routed by `GraniteMoeHybridTopKGating`, processed by gated feed-forward experts held in two
+    `GraniteMoeHybridParallelExperts` banks, weighted by their gate values and summed back per token.
+
+    Args:
+        config:
+            Configuration object with model hyperparameters.
+    """
+
+    def __init__(self, config: GraniteMoeHybridConfig):
+        super().__init__()
+
+        self.input_size = config.hidden_size
+        self.hidden_size = config.intermediate_size
+        self.activation = ACT2FN[config.hidden_act]
+
+        expert_count = config.num_local_experts
+        # The up projection is twice as wide because it carries both halves of the gated product.
+        self.input_linear = GraniteMoeHybridParallelExperts(expert_count, self.input_size, self.hidden_size * 2)
+        self.output_linear = GraniteMoeHybridParallelExperts(expert_count, self.hidden_size, self.input_size)
+
+        self.router = GraniteMoeHybridTopKGating(
+            input_size=self.input_size,
+            num_experts=expert_count,
+            top_k=config.num_experts_per_tok,
+        )
+
+    def forward(self, layer_input):
+        """
+        Forward pass of the mixture of experts layer.
+
+        Args:
+            layer_input (Tensor):
+                Input tensor of rank 3, unpacked as `(bsz, length, emb_size)`.
+
+        Returns:
+            Tensor:
+                Output tensor viewed as `(bsz, length, self.input_size)`.
+            Tensor:
+                Router logits.
+        """
+        bsz, length, emb_size = layer_input.size()
+        flat_tokens = layer_input.reshape(-1, emb_size)
+        _, batch_index, batch_gates, expert_size, router_logits = self.router(flat_tokens)
+
+        # Gather one row per (token, expert) assignment, already grouped by expert.
+        routed_tokens = flat_tokens[batch_index]
+        activated_half, linear_half = self.input_linear(routed_tokens, expert_size).chunk(2, dim=-1)
+        gated = self.activation(activated_half) * linear_half
+        weighted_outputs = self.output_linear(gated, expert_size) * batch_gates[:, None]
+
+        # Scatter-add the weighted expert outputs back onto their originating token rows.
+        accumulator = torch.zeros(
+            (bsz * length, self.input_size), dtype=weighted_outputs.dtype, device=weighted_outputs.device
+        )
+        combined = accumulator.index_add(0, batch_index, weighted_outputs)
+        return combined.view(bsz, length, self.input_size), router_logits
+
+
+class GraniteMoeHybridDecoderLayer(GradientCheckpointingLayer):
+    """
+    One decoder block: a sequence mixer (Mamba or self-attention, chosen per layer) followed by a feed-forward
+    stage made of a shared MLP plus, when experts are configured, a sparse mixture of experts.
+
+    Both sub-blocks are pre-normalized and added back to the residual stream scaled by `residual_multiplier`.
+    """
+
+    def __init__(self, config: GraniteMoeHybridConfig, layer_idx: int):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        # Either attention or mamba will be initialized, depending on the layer type.
+        self.self_attn = None
+
+        # Accept 0 experts: skip MoE if num_local_experts == 0.
+        # The flag is computed once and used both to build the MoE block and to decide whether to call it, so
+        # construction and forward can never disagree (a config without the attribute counts as "no experts").
+        self.has_experts = getattr(config, "num_local_experts", 0) > 0
+        if self.has_experts:
+            self.block_sparse_moe = GraniteMoeHybridMoE(config)
+        self.input_layernorm = GraniteMoeHybridRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = GraniteMoeHybridRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+        self.residual_multiplier = config.residual_multiplier
+        self.shared_mlp = GraniteMoeHybridMLP(config)
+        self.mamba = None
+
+        if config.layers_block_type[layer_idx] == "mamba":
+            self.mamba = GraniteMoeHybridMambaLayer(config, layer_idx)
+        else:
+            self.self_attn = GraniteMoeHybridAttention(config, layer_idx)
+        self.layer_type = config.layers_block_type[layer_idx]
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        past_key_values: Optional[Cache] = None,
+        output_attentions: Optional[bool] = False,
+        use_cache: Optional[bool] = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        output_router_logits: Optional[bool] = False,
+        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
+        **kwargs: Unpack[GraniteFlashAttentionKwargs],
+    ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+            attention_mask (`torch.FloatTensor`, *optional*):
+                attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
+                query_sequence_length, key_sequence_length)` if default attention is used.
+            past_key_values (`Cache`, *optional*): cache object; passed to the attention layer as
+                `past_key_values` and to the Mamba layer as `cache_params`.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            use_cache (`bool`, *optional*):
+                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
+                (see `past_key_values`). Only forwarded to the attention layer.
+            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
+                Indices depicting the position of the input sequence tokens in the sequence
+            output_router_logits (`bool`, *optional*):
+                Whether or not to return the logits of all the routers. They are useful for computing the router loss, and
+                should not be returned during inference.
+            position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
+                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
+                with `head_dim` being the embedding dimension of each attention head.
+            kwargs (`dict`, *optional*):
+                Arbitrary kwargs. Can be used to provide `GraniteFlashAttentionKwargs` for
+                padding-free training and/or improve torch.compile performance.
+
+        Returns:
+            A tuple whose first element is the updated hidden states. The attention weights are appended when
+            `output_attentions` is truthy (`None` for Mamba layers), and the router logits are appended when
+            `output_router_logits` is truthy (`None` when the layer has no experts).
+        """
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+
+        if self.mamba is not None:
+            hidden_states = self.mamba(
+                hidden_states=hidden_states,
+                cache_position=cache_position,
+                cache_params=past_key_values,
+                attention_mask=attention_mask,
+                **kwargs,
+            )
+            # No attention weights for state space layers
+            self_attn_weights = None
+        else:
+            hidden_states, self_attn_weights = self.self_attn(
+                hidden_states=hidden_states,
+                attention_mask=attention_mask,
+                past_key_values=past_key_values,
+                output_attentions=output_attentions,
+                use_cache=use_cache,
+                cache_position=cache_position,
+                position_embeddings=position_embeddings,
+                **kwargs,
+            )
+
+        hidden_states = residual + hidden_states * self.residual_multiplier
+
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+
+        if self.has_experts:
+            # The shared MLP always runs; the routed experts are added on top of it.
+            moe_hidden_states, router_logits = self.block_sparse_moe(hidden_states)
+            hidden_states = moe_hidden_states + self.shared_mlp(hidden_states)
+        else:
+            hidden_states = self.shared_mlp(hidden_states)
+            router_logits = None
+
+        hidden_states = residual + hidden_states * self.residual_multiplier
+
+        outputs = (hidden_states,)
+
+        if output_attentions:
+            outputs += (self_attn_weights,)
+
+        if output_router_logits:
+            outputs += (router_logits,)
+
+        return outputs
+
+
+@auto_docstring
+class GraniteMoeHybridPreTrainedModel(PreTrainedModel):
+    config: GraniteMoeHybridConfig
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["GraniteMoeHybridDecoderLayer"]
+    _skip_keys_device_placement = ["past_key_values"]
+    _supports_flash_attn = True
+    _supports_sdpa = True
+
+    _can_compile_fullgraph = False  # MoE models don't work with torch.compile (`torch.where(condition)` not supported)
+    _is_stateful = True
+
+    def _init_weights(self, module):
+        """
+        Initialize `module` in place, extending the parent initialization with the model-specific layers.
+
+        - `GraniteMoeHybridParallelExperts`: normal weights with std `config.initializer_range`.
+        - `GraniteMoeHybridMambaLayer`: `dt_bias` and `D` set to 1, `A_log` set to `log(1..num_heads)`.
+        - `GraniteMoeHybridRMSNormGated`: weight set to 1.
+        """
+        super()._init_weights(module)
+        if isinstance(module, GraniteMoeHybridParallelExperts):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+        if isinstance(module, GraniteMoeHybridMambaLayer):
+            module.dt_bias.data.fill_(1.0)
+            # Copy in place instead of rebinding `.data`: rebinding would swap in a freshly created tensor and
+            # could silently change the parameter's dtype or device, unlike the in-place fills around it.
+            module.A_log.data.copy_(torch.log(torch.arange(1, module.num_heads + 1)))
+            module.D.data.fill_(1.0)
+        elif isinstance(module, GraniteMoeHybridRMSNormGated):
+            module.weight.data.fill_(1.0)
+
+
+class GraniteMoeHybridRotaryEmbedding(nn.Module):
+    """Produces the cosine and sine tables used for rotary position embeddings."""
+
+    inv_freq: torch.Tensor  # fix linting for `register_buffer`
+
+    def __init__(self, config: GraniteMoeHybridConfig, device=None):
+        """
+        Args:
+            config: Model configuration; `rope_scaling` (when it is a dict) selects the RoPE variant.
+            device: Forwarded to the RoPE init function.
+        """
+        super().__init__()
+        scaling = getattr(config, "rope_scaling", None)
+        if isinstance(scaling, dict):
+            # BC: "rope_type" was originally "type"
+            self.rope_type = scaling.get("rope_type", scaling.get("type"))
+        else:
+            self.rope_type = "default"
+        self.max_seq_len_cached = config.max_position_embeddings
+        self.original_max_seq_len = config.max_position_embeddings
+
+        self.config = config
+        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
+
+        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
+        # Non-persistent: the buffer is recomputed from the config instead of being stored in checkpoints.
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+        self.original_inv_freq = self.inv_freq
+
+    @torch.no_grad()
+    @dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
+    def forward(self, x, position_ids):
+        """
+        Compute the cos/sin tables for `position_ids`.
+
+        `x` is only used for its device and dtype. Returns `(cos, sin)` cast to `x.dtype`.
+        """
+        batch = position_ids.shape[0]
+        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(batch, -1, 1).to(x.device)
+        position_ids_expanded = position_ids[:, None, :].float()
+
+        # Autocast is keyed by device type; "mps" (and non-string types) fall back to "cpu".
+        if isinstance(x.device.type, str) and x.device.type != "mps":
+            device_type = x.device.type
+        else:
+            device_type = "cpu"
+
+        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+            freqs = torch.matmul(inv_freq_expanded.float(), position_ids_expanded.float()).transpose(1, 2)
+            emb = torch.cat((freqs, freqs), dim=-1)
+            cos = emb.cos() * self.attention_scaling
+            sin = emb.sin() * self.attention_scaling
+
+        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
+
+
+@auto_docstring
+class GraniteMoeHybridModel(GraniteMoeHybridPreTrainedModel):
+    def __init__(self, config: GraniteMoeHybridConfig):
+        """
+        Build the token embedding, the stack of decoder layers, the final norm and, when
+        `config.position_embedding_type == "rope"`, the rotary embedding.
+        """
+        super().__init__(config)
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+        decoder_layers = [
+            GraniteMoeHybridDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)
+        ]
+        self.layers = nn.ModuleList(decoder_layers)
+        self.norm = GraniteMoeHybridRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.gradient_checkpointing = False
+
+        # Plain hyperparameters copied from the config.
+        self.embedding_multiplier = config.embedding_multiplier
+        self.hidden_size = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = self.hidden_size // self.num_heads
+        self.max_position_embeddings = config.max_position_embeddings
+        self.rope_theta = config.rope_theta
+
+        self.position_embedding_type = config.position_embedding_type
+        if self.position_embedding_type == "rope":
+            self.rotary_emb = GraniteMoeHybridRotaryEmbedding(config)
+        else:
+            self.rotary_emb = None
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Union[Cache, list[torch.FloatTensor]]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        output_router_logits: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[GraniteFlashAttentionKwargs],
+    ) -> Union[tuple, BaseModelOutputWithPast]:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+        if self.gradient_checkpointing and self.training and use_cache:
+            logger.warning_once(
+                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
+            )
+            use_cache = False
+
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+
+        inputs_embeds = inputs_embeds * self.embedding_multiplier
+
+        ## overwritten because `HybridMambaAttentionDynamicCache` is needed
+        if use_cache and past_key_values is None:
+            logger.warning_once(
+                "GraniteMoeHybrid requires an initialized `HybridMambaAttentionDynamicCache` to return a cache. "
+                "Because one was not provided, no cache will be returned."
+            )
+
+        if cache_position is None:
+            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+            cache_position = torch.arange(
+                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
+            )
+        if position_ids is None:
+            position_ids = cache_position.unsqueeze(0)
+
+        causal_mask = self._update_causal_mask(
+            attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
+        )
+        mamba_mask = self._update_mamba_mask(attention_mask, cache_position)
+
+        # embed positions
+        hidden_states = inputs_embeds
+
+        position_embeddings = None
+        # create position embeddings to be shared across the decoder layers
+        if self.rotary_emb is not None:
+            position_embeddings = self.rotary_emb(hidden_states, position_ids)
+
+        # decoder layers
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+        all_router_logits = () if output_router_logits else None
+
+        for decoder_layer in self.layers:
+            # Depending on the layer type we opt for 2D base attention mask (Mamba) or 4D causal mask (Attention)
+            layer_mask = mamba_mask if decoder_layer.layer_type == "mamba" else causal_mask
+
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+
+            layer_outputs = decoder_layer(
+                hidden_states,
+                attention_mask=layer_mask,
+                past_key_values=past_key_values,
+                output_attentions=output_attentions,
+                use_cache=use_cache,
+                cache_position=cache_position,
+                output_router_logits=output_router_logits,
+                position_embeddings=position_embeddings,
+                **kwargs,
+            )
+
+            hidden_states = layer_outputs[0]
+
+            if output_attentions:
+                if layer_outputs[1] is not None:
+                    # append attentions only of attention layers. Mamba layers return `None` as the attention weights
+                    all_self_attns += (layer_outputs[1],)
+
+            if output_router_logits:
+                if layer_outputs[-1] is not None:
+                    # append router logits only of expert layers. Regular MLP layers return `None` as the router logits
+                    all_router_logits += (layer_outputs[-1],)
+
+        hidden_states = self.norm(hidden_states)
+
+        # add hidden states from the last decoder layer
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+
+        if past_key_values and not past_key_values.has_previous_state:
+            past_key_values.has_previous_state = True
+
+        return MoeModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+            router_logits=all_router_logits,
+        )
+
+    def _update_causal_mask(
+        self,
+        attention_mask: Union[torch.Tensor, "BlockMask"],
+        input_tensor: torch.Tensor,
+        cache_position: torch.Tensor,
+        past_key_values: Cache,
+        output_attentions: bool = False,
+    ):
+        """
+        Select or build the attention mask handed to the attention layers for the configured attention backend.
+
+        Args:
+            attention_mask: Mask supplied by the caller (may be `None`).
+            input_tensor: The input embeddings; only `dtype` and `shape[0]` / `shape[1]` are read here.
+            cache_position: Positions of the current tokens, forwarded to the 4D mask builder.
+            past_key_values: Cache object (may be `None`); queried for its current length and compilability.
+            output_attentions: Whether attention weights were requested; disables the SDPA shortcuts below.
+
+        Returns:
+            - `"flash_attention_2"`: `attention_mask` unchanged if it contains at least one 0, otherwise `None`.
+            - `"flex_attention"`: `attention_mask`, converted with `make_flex_block_causal_mask` when it is a tensor.
+            - otherwise: `None` when the SDPA `is_causal` shortcut applies, else the mask produced by
+              `_prepare_4d_causal_attention_mask_with_cache_position` (optionally post-processed for SDPA).
+        """
+        if self.config._attn_implementation == "flash_attention_2":
+            # Only forward the mask when it actually masks something (contains a 0).
+            if attention_mask is not None and (attention_mask == 0.0).any():
+                return attention_mask
+            return None
+        if self.config._attn_implementation == "flex_attention":
+            # Tensor masks are converted to a block mask; anything else is passed through untouched.
+            if isinstance(attention_mask, torch.Tensor):
+                attention_mask = make_flex_block_causal_mask(attention_mask)
+            return attention_mask
+
+        # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
+        # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
+        # to infer the attention mask.
+        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+        using_compilable_cache = past_key_values.is_compileable if past_key_values is not None else False
+
+        # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
+        if self.config._attn_implementation == "sdpa" and not using_compilable_cache and not output_attentions:
+            if AttentionMaskConverter._ignore_causal_mask_sdpa(
+                attention_mask,
+                inputs_embeds=input_tensor,
+                past_key_values_length=past_seen_tokens,
+                is_training=self.training,
+            ):
+                return None
+
+        dtype = input_tensor.dtype
+        sequence_length = input_tensor.shape[1]
+        # Key/value length of the mask: the full cache size for compilable caches, otherwise the width of the
+        # provided tensor mask, falling back to "tokens seen so far + current tokens + 1".
+        if using_compilable_cache:
+            target_length = past_key_values.get_max_cache_shape()
+        else:
+            target_length = (
+                attention_mask.shape[-1]
+                if isinstance(attention_mask, torch.Tensor)
+                else past_seen_tokens + sequence_length + 1
+            )
+
+        # In case the provided `attention` mask is 2D, we generate a causal mask here (4D).
+        causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
+            attention_mask,
+            sequence_length=sequence_length,
+            target_length=target_length,
+            dtype=dtype,
+            cache_position=cache_position,
+            batch_size=input_tensor.shape[0],
+        )
+
+        if (
+            self.config._attn_implementation == "sdpa"
+            and attention_mask is not None
+            and attention_mask.device.type in ["cuda", "xpu", "npu"]
+            and not output_attentions
+        ):
+            # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
+            # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
+            # Details: https://github.com/pytorch/pytorch/issues/110213
+            min_dtype = torch.finfo(dtype).min
+            causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)
+
+        return causal_mask
+
+    @staticmethod
+    def _prepare_4d_causal_attention_mask_with_cache_position(
+        attention_mask: torch.Tensor,
+        sequence_length: int,
+        target_length: int,
+        dtype: torch.dtype,
+        cache_position: torch.Tensor,
+        batch_size: int,
+        **kwargs,
+    ):
+        """
+        Build an additive causal mask of shape `(batch_size, 1, query_length, key_value_length)`.
+
+        A 4D `attention_mask` is returned untouched (it is assumed to be already inverted and correctly sized).
+        Otherwise a causal mask is created from `cache_position` and, if a 2D `attention_mask` of shape
+        `(batch_size, key_value_length)` is given, padded key positions are additionally masked out.
+
+        Args:
+            attention_mask (`torch.Tensor`):
+                `None`, a 2D mask of shape `(batch_size, key_value_length)`, or a ready-made 4D mask of shape
+                `(batch_size, 1, query_length, key_value_length)`.
+            sequence_length (`int`):
+                Number of query tokens being processed.
+            target_length (`int`):
+                Number of key/value positions the mask must cover; with a static cache this is the cache size, so
+                the not-yet-filled part of the cache is covered as well.
+            dtype (`torch.dtype`):
+                Dtype of the generated mask; masked entries hold `torch.finfo(dtype).min`.
+            cache_position (`torch.Tensor`):
+                Indices depicting the position of the input sequence tokens in the sequence.
+            batch_size (`int`):
+                Batch size.
+        """
+        # A 4D mask is taken as-is: no inversion, no slicing.
+        if attention_mask is not None and attention_mask.dim() == 4:
+            return attention_mask
+
+        device = cache_position.device
+        min_dtype = torch.finfo(dtype).min
+
+        mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device)
+        if sequence_length != 1:
+            mask = torch.triu(mask, diagonal=1)
+        # Keep the "masked" value only for key positions strictly after each query's cache position.
+        key_positions = torch.arange(target_length, device=device)
+        mask *= key_positions > cache_position.reshape(-1, 1)
+        mask = mask[None, None, :, :].expand(batch_size, 1, -1, -1)
+
+        if attention_mask is None:
+            return mask
+
+        mask = mask.clone()  # the expanded view shares memory; copy before editing in place
+        mask_length = attention_mask.shape[-1]
+        covered = mask[:, :, :, :mask_length]
+        # A sum of exactly 0 means: causally visible (0 in the mask) AND padding (0 in the attention mask).
+        is_padding = (covered + attention_mask[:, None, None, :].to(mask.device)) == 0
+        mask[:, :, :, :mask_length] = covered.masked_fill(is_padding, min_dtype)
+
+        return mask
+
+    def _update_mamba_mask(self, attention_mask, cache_position):
+        """
+        Return the 2D mask to give to the Mamba layers, or `None` when no state zeroing is needed.
+
+        The mask is dropped when
+            1. this is a cached forward (the first cache position is greater than 0), or
+            2. every input position is attended to (the mask contains only ones).
+        """
+        # Cached forward: checked first so the mask is never inspected in that case.
+        if cache_position[0] > 0:
+            return None
+        # Nothing is padded, so there is nothing to zero out.
+        if attention_mask is not None and torch.all(attention_mask == 1):
+            return None
+        return attention_mask
+
+
+def load_balancing_loss_func(
+    gate_logits: Union[torch.Tensor, tuple[torch.Tensor], None],
+    num_experts: Optional[int] = None,
+    top_k=2,
+    attention_mask: Optional[torch.Tensor] = None,
+) -> Union[torch.Tensor, int]:
+    r"""
+    Computes auxiliary load balancing loss as in Switch Transformer - implemented in Pytorch.
+
+    See Switch Transformer (https://huggingface.co/papers/2101.03961) for more details. This function implements the loss
+    function presented in equations (4) - (6) of the paper. It aims at penalizing cases where the routing between
+    experts is too unbalanced.
+
+    Args:
+        gate_logits:
+            Logits from the `gate`, should be a tuple of model.config.num_hidden_layers tensors of
+            shape [batch_size X sequence_length, num_experts].
+        num_experts:
+            Number of experts. When `None`, it is inferred from the last dimension of the gate logits.
+        top_k:
+            The number of experts to route per-token, can be also interpreted as the `top-k` routing
+            parameter.
+        attention_mask (`torch.Tensor`, *optional*):
+            The attention_mask used in forward function
+            shape [batch_size X sequence_length] if not None.
+
+    Returns:
+        The auxiliary loss as a scalar tensor, or the int `0` when `gate_logits` is `None`, is not a tuple, or is
+        an empty tuple (e.g. no layer produced router logits).
+    """
+    # Nothing to balance: no logits, not a per-layer tuple, or no layer contributed router logits.
+    if not isinstance(gate_logits, tuple) or len(gate_logits) == 0:
+        return 0
+
+    # Layers may live on different devices; gather everything on the device of the first layer.
+    compute_device = gate_logits[0].device
+    concatenated_gate_logits = torch.cat([layer_gate.to(compute_device) for layer_gate in gate_logits], dim=0)
+
+    routing_weights = torch.nn.functional.softmax(concatenated_gate_logits, dim=-1)
+    num_routed_experts = routing_weights.shape[-1]
+    if num_experts is None:
+        num_experts = num_routed_experts
+
+    _, selected_experts = torch.topk(routing_weights, top_k, dim=-1)
+
+    expert_mask = torch.nn.functional.one_hot(selected_experts, num_experts)
+
+    if attention_mask is None:
+        # Compute the percentage of tokens routed to each experts
+        tokens_per_expert = torch.mean(expert_mask.float(), dim=0)
+
+        # Compute the average probability of routing to these experts
+        router_prob_per_expert = torch.mean(routing_weights, dim=0)
+    else:
+        batch_size, sequence_length = attention_mask.shape
+        num_hidden_layers = concatenated_gate_logits.shape[0] // (batch_size * sequence_length)
+
+        # Compute the mask that masks all padding tokens as 0 with the same shape of expert_mask
+        expert_attention_mask = (
+            attention_mask[None, :, :, None, None]
+            .expand((num_hidden_layers, batch_size, sequence_length, top_k, num_experts))
+            .reshape(-1, top_k, num_experts)
+            .to(compute_device)
+        )
+
+        # Compute the percentage of tokens routed to each experts
+        tokens_per_expert = torch.sum(expert_mask.float() * expert_attention_mask, dim=0) / torch.sum(
+            expert_attention_mask, dim=0
+        )
+
+        # Compute the mask that masks all padding tokens as 0 with the same shape of tokens_per_expert
+        router_per_expert_attention_mask = (
+            attention_mask[None, :, :, None]
+            .expand((num_hidden_layers, batch_size, sequence_length, num_routed_experts))
+            .reshape(-1, num_routed_experts)
+            .to(compute_device)
+        )
+
+        # Compute the average probability of routing to these experts
+        router_prob_per_expert = torch.sum(routing_weights * router_per_expert_attention_mask, dim=0) / torch.sum(
+            router_per_expert_attention_mask, dim=0
+        )
+
+    # The selected expert indices are always < num_routed_experts, so the routed columns are the leading ones
+    # regardless of which device the logits live on. (Offsetting the slice by the CUDA device index, as was done
+    # previously, yields an empty slice on any device other than index 0 and breaks the broadcast below.)
+    overall_loss = torch.sum(tokens_per_expert[:, :num_routed_experts] * router_prob_per_expert.unsqueeze(0))
+    return overall_loss * num_experts
+
+
+class GraniteMoeHybridForCausalLM(GraniteMoeHybridPreTrainedModel, GenerationMixin):
+    """
+    `GraniteMoeHybridModel` with a bias-free linear language-modeling head on top.
+
+    When router logits are requested, the forward pass also computes the auxiliary load-balancing loss via
+    `load_balancing_loss_func` and, if labels are given, adds it (scaled by `router_aux_loss_coef`) to the loss.
+    """
+
+    _tied_weights_keys = ["lm_head.weight"]
+
+    def __init__(self, config: GraniteMoeHybridConfig):
+        super().__init__(config)
+        self.model = GraniteMoeHybridModel(config)
+        self.vocab_size = config.vocab_size
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+        # Settings used for the auxiliary load-balancing loss
+        self.router_aux_loss_coef = config.router_aux_loss_coef
+        self.num_experts = config.num_local_experts
+        self.num_experts_per_tok = config.num_experts_per_tok
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Union[Cache, list[torch.FloatTensor]]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        output_router_logits: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
+        **kwargs,
+    ) -> Union[tuple, MoeCausalLMOutputWithPast]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+        Example:
+
+        ```python
+        >>> from transformers import AutoTokenizer, GraniteMoeHybridForCausalLM
+
+        >>> model = GraniteMoeHybridForCausalLM.from_pretrained("ibm/PowerMoE-3b")
+        >>> tokenizer = AutoTokenizer.from_pretrained("ibm/PowerMoE-3b")
+
+        >>> prompt = "Hey, are you conscious? Can you talk to me?"
+        >>> inputs = tokenizer(prompt, return_tensors="pt")
+
+        >>> # Generate
+        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
+        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
+        ```"""
+        # Per-call flags fall back to the values stored on the config.
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_router_logits = (
+            output_router_logits if output_router_logits is not None else self.config.output_router_logits
+        )
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
+        outputs = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            output_router_logits=output_router_logits,
+            return_dict=return_dict,
+            cache_position=cache_position,
+            **kwargs,
+        )
+
+        # Only compute necessary logits
+        hidden_states = outputs[0]
+        # An int keeps the last `logits_to_keep` positions (0 -> `slice(0, None)`, i.e. all of them); a tensor is
+        # used directly as an index along the sequence dimension.
+        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+        logits = self.lm_head(hidden_states[:, slice_indices, :])
+        logits = logits / self.config.logits_scaling
+
+        loss = None
+        if labels is not None:
+            # Upcast to float if we need to compute the loss to avoid potential precision issues
+            logits = logits.float()
+            # Flatten the tokens
+            loss = self.loss_function(
+                logits,
+                labels,
+                vocab_size=self.config.vocab_size,
+                **kwargs,
+            )
+
+        aux_loss = None
+        if output_router_logits:
+            aux_loss = load_balancing_loss_func(
+                outputs.router_logits if return_dict else outputs[-1],
+                self.num_experts,
+                self.num_experts_per_tok,
+                attention_mask,
+            )
+            if labels is not None:
+                # `load_balancing_loss_func` is allowed to return the plain int `0`, which has no `.to()`;
+                # only a tensor needs to be moved onto the device of `loss`.
+                aux_term = aux_loss.to(loss.device) if isinstance(aux_loss, torch.Tensor) else aux_loss
+                loss += self.router_aux_loss_coef * aux_term
+
+        if not return_dict:
+            # Tuple layout: (loss?, aux_loss?, logits, *remaining model outputs)
+            output = (logits,) + outputs[1:]
+            if output_router_logits:
+                output = (aux_loss,) + output
+            return (loss,) + output if loss is not None else output
+
+        return MoeCausalLMOutputWithPast(
+            loss=loss,
+            aux_loss=aux_loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            router_logits=outputs.router_logits,
+        )
+
+    def prepare_inputs_for_generation(
+        self,
+        input_ids,
+        past_key_values=None,
+        attention_mask=None,
+        inputs_embeds=None,
+        cache_position=None,
+        position_ids=None,
+        use_cache=True,
+        **kwargs,
+    ):
+        """
+        Assemble the keyword arguments for one generation step.
+
+        On the first step (no cache passed) with `use_cache=True`, a `HybridMambaAttentionDynamicCache` is created.
+        On later steps `input_ids` is cut down to the tokens that still have to be processed. Returns a dict with
+        either `inputs_embeds` (first step only) or `input_ids`, plus `position_ids`, `past_key_values`,
+        `use_cache`, `attention_mask` and `cache_position`.
+        """
+        # Overwritten -- has a unique cache type, `HybridMambaAttentionDynamicCache`
+
+        empty_past_kv = past_key_values is None
+
+        # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens
+        # Exception 1: when passing input_embeds, input_ids may be missing entries
+        # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here
+        # Exception 3: with synced GPUs cache_position may go out of bounds, but we only want dummy token in that case.
+        #              (we can't check exception 3 while compiling)
+        if not empty_past_kv:
+            if (
+                inputs_embeds is not None  # Exception 1
+                or cache_position[-1] >= input_ids.shape[1]  # Exception 3
+            ):
+                input_ids = input_ids[:, -cache_position.shape[0] :]
+            elif input_ids.shape[1] != cache_position.shape[0]:  # Default case (the "else", a no op, is Exception 2)
+                input_ids = input_ids[:, cache_position]
+        elif use_cache:
+            past_key_values = HybridMambaAttentionDynamicCache(
+                self.config, input_ids.shape[0], self.dtype, device=self.device
+            )
+
+        if attention_mask is not None and position_ids is None:
+            # create position_ids on the fly for batch generation
+            position_ids = attention_mask.long().cumsum(-1) - 1
+            # padded positions get the placeholder position 1
+            position_ids.masked_fill_(attention_mask == 0, 1)
+            if not empty_past_kv:
+                position_ids = position_ids[:, -input_ids.shape[1] :]
+
+        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
+        if inputs_embeds is not None and empty_past_kv:
+            model_inputs = {"inputs_embeds": inputs_embeds}
+        else:
+            model_inputs = {"input_ids": input_ids.contiguous()}  # `contiguous()` needed for compilation use cases
+
+        model_inputs.update(
+            {
+                "position_ids": position_ids,
+                "past_key_values": past_key_values,
+                "use_cache": use_cache,
+                "attention_mask": attention_mask,
+                "cache_position": cache_position,
+            }
+        )
+        return model_inputs
+
+
+# Names exported by `from <this module> import *`.
+__all__ = ["GraniteMoeHybridForCausalLM", "GraniteMoeHybridModel", "GraniteMoeHybridPreTrainedModel"]
diff --git a/phivenv/Lib/site-packages/transformers/models/granitemoehybrid/modular_granitemoehybrid.py b/phivenv/Lib/site-packages/transformers/models/granitemoehybrid/modular_granitemoehybrid.py
new file mode 100644
index 0000000000000000000000000000000000000000..25151b6936b6c02a8e11b79d064f3d1b960695be
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/granitemoehybrid/modular_granitemoehybrid.py
@@ -0,0 +1,389 @@
+# coding=utf-8
+# Copyright 2025 IBM and the HuggingFace Inc. team. All rights reserved.
+#
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Optional, Union
+
+import torch
+from torch import nn
+
+from ...cache_utils import Cache
+from ...modeling_outputs import BaseModelOutputWithPast, MoeModelOutputWithPast
+from ...processing_utils import Unpack
+from ...utils import auto_docstring, can_return_tuple, logging
+from ...utils.deprecation import deprecate_kwarg
+from ..bamba.configuration_bamba import BambaConfig
+from ..bamba.modeling_bamba import BambaMixer, BambaRMSNormGated, HybridMambaAttentionDynamicCache
+from ..granitemoeshared.modeling_granitemoeshared import (
+    GraniteFlashAttentionKwargs,
+    GraniteMoeSharedAttention,
+    GraniteMoeSharedDecoderLayer,
+    GraniteMoeSharedForCausalLM,
+    GraniteMoeSharedMLP,
+    GraniteMoeSharedModel,
+    GraniteMoeSharedPreTrainedModel,
+)
+from .configuration_granitemoehybrid import GraniteMoeHybridConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+class GraniteMoeHybridAttention(GraniteMoeSharedAttention):
+    """Attention block of GraniteMoeHybrid: `GraniteMoeSharedAttention` re-declared under this model's name."""
+
+    def __init__(self, config: GraniteMoeHybridConfig, layer_idx: int):
+        # No behavior of its own; construction is delegated entirely to the parent class.
+        super().__init__(config, layer_idx)
+
+
+class GraniteMoeHybridMambaLayer(BambaMixer):
+    """Mamba mixer of GraniteMoeHybrid, built on top of `BambaMixer`."""
+
+    def __init__(self, config: GraniteMoeHybridConfig, layer_idx: int):
+        # NOTE(review): the GraniteMoeHybrid config is passed positionally into `BambaConfig(...)` instead of being
+        # forwarded as-is. Presumably this is meaningful to the modular code generator -- confirm before changing.
+        super().__init__(BambaConfig(config), layer_idx)
+
+
+class GraniteMoeHybridRMSNormGated(BambaRMSNormGated):
+    """Gated RMS norm of GraniteMoeHybrid: `BambaRMSNormGated` re-declared under this model's name."""
+
+    def __init__(self, hidden_size, eps=1e-6):
+        # Both arguments are forwarded unchanged to the parent class.
+        super().__init__(hidden_size, eps)
+
+
+class GraniteMoeHybridMLP(GraniteMoeSharedMLP):
+    """MLP of GraniteMoeHybrid: `GraniteMoeSharedMLP` re-declared under this model's name."""
+
+    def __init__(self, config: GraniteMoeHybridConfig):
+        # No behavior of its own; construction is delegated entirely to the parent class.
+        super().__init__(config)
+
+
+class GraniteMoeHybridDecoderLayer(GraniteMoeSharedDecoderLayer):
+    """
+    One decoder layer of GraniteMoeHybrid.
+
+    The token-mixing part is either a Mamba layer or an attention layer, chosen per layer through
+    `config.layers_block_type[layer_idx]`. The feed-forward part is the shared MLP, combined with the sparse MoE
+    block when the config declares at least one local expert.
+    """
+
+    def __init__(self, config: GraniteMoeHybridConfig, layer_idx: int):
+        super().__init__(config, layer_idx)
+        self.shared_mlp = GraniteMoeHybridMLP(config)
+        # Either attention or mamba will be initialized, depending on the layer type.
+        self.self_attn = None
+        self.mamba = None
+
+        if config.layers_block_type[layer_idx] == "mamba":
+            self.mamba = GraniteMoeHybridMambaLayer(config, layer_idx)
+        else:
+            self.self_attn = GraniteMoeHybridAttention(config, layer_idx)
+        # Kept on the layer so callers can tell which kind of mask this layer expects.
+        self.layer_type = config.layers_block_type[layer_idx]
+
+        # Accept 0 experts: skip MoE if num_local_experts == 0
+        self.has_experts = getattr(config, "num_local_experts", 0) > 0
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        past_key_values: Optional[Cache] = None,
+        output_attentions: Optional[bool] = False,
+        use_cache: Optional[bool] = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        output_router_logits: Optional[bool] = False,
+        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
+        **kwargs: Unpack[GraniteFlashAttentionKwargs],
+    ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+            attention_mask (`torch.FloatTensor`, *optional*):
+                attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
+                query_sequence_length, key_sequence_length)` if default attention is used.
+            past_key_values (`Cache`, *optional*):
+                cache object; passed to the attention layer as `past_key_values` or to the Mamba layer as
+                `cache_params`.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            use_cache (`bool`, *optional*):
+                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
+                (see `past_key_values`).
+            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
+                Indices depicting the position of the input sequence tokens in the sequence
+            output_router_logits (`bool`, *optional*):
+                Whether or not to return the logits of all the routers. They are useful for computing the router loss, and
+                should not be returned during inference.
+            position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
+                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
+                with `head_dim` being the embedding dimension of each attention head.
+            kwargs (`dict`, *optional*):
+                Arbitrary kwargs. Can be used to provide `GraniteFlashAttentionKwargs` for
+                padding-free training and/or improve torch.compile performance.
+
+        Returns:
+            A tuple `(hidden_states,)`, followed by the attention weights if `output_attentions` is set (`None` for
+            Mamba layers) and by the router logits if `output_router_logits` is set (`None` for layers without
+            experts).
+        """
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+
+        # Token mixing: Mamba or self-attention, depending on what was built in `__init__`.
+        if self.mamba is not None:
+            hidden_states = self.mamba(
+                hidden_states=hidden_states,
+                cache_position=cache_position,
+                cache_params=past_key_values,
+                attention_mask=attention_mask,
+                **kwargs,
+            )
+            # No attention weights for state space layers
+            self_attn_weights = None
+        else:
+            hidden_states, self_attn_weights = self.self_attn(
+                hidden_states=hidden_states,
+                attention_mask=attention_mask,
+                past_key_values=past_key_values,
+                output_attentions=output_attentions,
+                use_cache=use_cache,
+                cache_position=cache_position,
+                position_embeddings=position_embeddings,
+                **kwargs,
+            )
+
+        # Residual connection with the branch output scaled by `residual_multiplier`.
+        hidden_states = residual + hidden_states * self.residual_multiplier
+
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+
+        if self.has_experts:
+            # Sparse MoE output and shared MLP output are summed; both read the same normalized input.
+            moe_hidden_states, router_logits = self.block_sparse_moe(hidden_states)
+            hidden_states = moe_hidden_states + self.shared_mlp(hidden_states)
+        else:
+            hidden_states = self.shared_mlp(hidden_states)
+            router_logits = None
+
+        hidden_states = residual + hidden_states * self.residual_multiplier
+
+        outputs = (hidden_states,)
+
+        if output_attentions:
+            outputs += (self_attn_weights,)
+
+        if output_router_logits:
+            outputs += (router_logits,)
+
+        return outputs
+
+
+class GraniteMoeHybridPreTrainedModel(GraniteMoeSharedPreTrainedModel):
+    """Base class of the GraniteMoeHybrid models; adds weight initialization for the Mamba-specific modules."""
+
+    config: GraniteMoeHybridConfig
+    _no_split_modules = ["GraniteMoeHybridDecoderLayer"]
+    _is_stateful = True
+
+    def _init_weights(self, module):
+        """
+        Initialize `module` with the parent scheme, then apply the Mamba-specific defaults.
+
+        For a `GraniteMoeHybridMambaLayer`: `dt_bias` and `D` are filled with ones and `A_log` is set to
+        `log(1), ..., log(num_heads)`. For a `GraniteMoeHybridRMSNormGated`: `weight` is filled with ones.
+        """
+        super()._init_weights(module)
+        if isinstance(module, GraniteMoeHybridMambaLayer):
+            module.dt_bias.data.fill_(1.0)
+            # Build the values on the parameter's own device and cast to its dtype. Assigning a freshly created
+            # default-device float32 tensor to `.data` would silently move the parameter and change its dtype.
+            a_log_init = torch.log(torch.arange(1, module.num_heads + 1, device=module.A_log.device))
+            module.A_log.data = a_log_init.to(module.A_log.dtype)
+            module.D.data.fill_(1.0)
+        elif isinstance(module, GraniteMoeHybridRMSNormGated):
+            module.weight.data.fill_(1.0)
+
+
+class GraniteMoeHybridModel(GraniteMoeSharedModel):
+    """
+    `GraniteMoeSharedModel` subclass whose `self.layers` holds one `GraniteMoeHybridDecoderLayer` per
+    `config.num_hidden_layers`.
+
+    In `forward`, a layer whose `layer_type` is `"mamba"` is given the mask returned by `_update_mamba_mask`;
+    every other layer is given the mask returned by `_update_causal_mask`.
+    """
+
+    def __init__(self, config: GraniteMoeHybridConfig):
+        super().__init__(config)
+        # (Re)assign `self.layers` after the parent constructor has run, so the stack consists of hybrid layers.
+        self.layers = nn.ModuleList(
+            [GraniteMoeHybridDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+        )
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Union[Cache, list[torch.FloatTensor]]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        output_router_logits: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[GraniteFlashAttentionKwargs],
+    ) -> Union[tuple, BaseModelOutputWithPast]:
+        # NOTE(review): the annotation names `BaseModelOutputWithPast`, but the value built at the end of this
+        # method is a `MoeModelOutputWithPast`.
+
+        # Fall back to the config defaults for every flag the caller left as `None`.
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        # NOTE(review): `return_dict` is resolved here but not read again in this method; presumably
+        # `@can_return_tuple` takes care of tuple conversion -- TODO confirm.
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # The XOR is true when both or neither of `input_ids` / `inputs_embeds` were supplied.
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+        if self.gradient_checkpointing and self.training and use_cache:
+            logger.warning_once(
+                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
+            )
+            use_cache = False
+
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+
+        # Scaling is applied to caller-provided embeddings as well as to looked-up ones.
+        inputs_embeds = inputs_embeds * self.embedding_multiplier
+
+        ## overwritten because `HybridMambaAttentionDynamicCache` is needed
+        # Only a warning is emitted here: no cache object is created, so `past_key_values` stays `None`.
+        if use_cache and past_key_values is None:
+            logger.warning_once(
+                "GraniteMoeHybrid requires an initialized `HybridMambaAttentionDynamicCache` to return a cache. "
+                "Because one was not provided, no cache will be returned."
+            )
+
+        # Default `cache_position` continues from the number of tokens already held by the cache (0 without one).
+        if cache_position is None:
+            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+            cache_position = torch.arange(
+                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
+            )
+        if position_ids is None:
+            position_ids = cache_position.unsqueeze(0)
+
+        # Two masks are prepared up front; the loop below selects one per layer.
+        causal_mask = self._update_causal_mask(
+            attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
+        )
+        mamba_mask = self._update_mamba_mask(attention_mask, cache_position)
+
+        # the decoder stack starts from the (scaled) input embeddings
+        hidden_states = inputs_embeds
+
+        position_embeddings = None
+        # create position embeddings to be shared across the decoder layers
+        if self.rotary_emb is not None:
+            position_embeddings = self.rotary_emb(hidden_states, position_ids)
+
+        # decoder layers
+        # Each accumulator is a tuple only when the matching output was requested, otherwise `None`.
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+        all_router_logits = () if output_router_logits else None
+
+        for decoder_layer in self.layers:
+            # Depending on the layer type we opt for 2D base attention mask (Mamba) or 4D causal mask (Attention)
+            layer_mask = mamba_mask if decoder_layer.layer_type == "mamba" else causal_mask
+
+            # The hidden state is recorded *before* the layer runs, so entry i is the input of layer i.
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+
+            layer_outputs = decoder_layer(
+                hidden_states,
+                attention_mask=layer_mask,
+                past_key_values=past_key_values,
+                output_attentions=output_attentions,
+                use_cache=use_cache,
+                cache_position=cache_position,
+                output_router_logits=output_router_logits,
+                position_embeddings=position_embeddings,
+                **kwargs,
+            )
+
+            hidden_states = layer_outputs[0]
+
+            if output_attentions:
+                if layer_outputs[1] is not None:
+                    # append attentions only of attention layers. Mamba layers return `None` as the attention weights
+                    all_self_attns += (layer_outputs[1],)
+
+            if output_router_logits:
+                if layer_outputs[-1] is not None:
+                    # append router logits only of expert layers. Regular MLP layers return `None` as the router logits
+                    all_router_logits += (layer_outputs[-1],)
+
+        hidden_states = self.norm(hidden_states)
+
+        # add hidden states from the last decoder layer
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+
+        # Flag the cache as populated once a forward pass has gone through it.
+        if past_key_values and not past_key_values.has_previous_state:
+            past_key_values.has_previous_state = True
+
+        return MoeModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+            router_logits=all_router_logits,
+        )
+
+    def _update_mamba_mask(self, attention_mask, cache_position):
+        """
+        Return the mask to pass to Mamba layers, or `None` when no state zeroing is needed.
+
+        No need for zeroing states when
+            1. Cached forward (`cache_position[0] > 0`)
+            2. Attending to all inputs (`attention_mask` consists only of ones)
+
+        In every other case `attention_mask` is returned unchanged (which is `None` if none was given).
+        """
+        mamba_mask = attention_mask
+        if cache_position[0] > 0 or (attention_mask is not None and torch.all(attention_mask == 1)):
+            mamba_mask = None
+        return mamba_mask
+
+
+class GraniteMoeHybridForCausalLM(GraniteMoeSharedForCausalLM):
+    """
+    `GraniteMoeSharedForCausalLM` subclass that uses `GraniteMoeHybridModel` as `self.model` and customises
+    `prepare_inputs_for_generation` for the `HybridMambaAttentionDynamicCache` cache type.
+    """
+
+    _tied_weights_keys = ["lm_head.weight"]
+
+    def __init__(self, config: GraniteMoeHybridConfig):
+        super().__init__(config)
+        # Assign the hybrid backbone to `self.model` after the parent constructor has run.
+        self.model = GraniteMoeHybridModel(config)
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def prepare_inputs_for_generation(
+        self,
+        input_ids,
+        past_key_values=None,
+        attention_mask=None,
+        inputs_embeds=None,
+        cache_position=None,
+        position_ids=None,
+        use_cache=True,
+        **kwargs,
+    ):
+        """
+        Build the keyword arguments for one forward call during generation.
+
+        When `past_key_values` is given, `input_ids` is trimmed with the help of `cache_position`; when it is
+        `None` and `use_cache` is true, a new `HybridMambaAttentionDynamicCache` is created. `position_ids` are
+        derived from `attention_mask` if they were not supplied.
+
+        Returns:
+            `dict`: holds either `inputs_embeds` (only when they were passed and no cache was given) or
+            `input_ids`, plus `position_ids`, `past_key_values`, `use_cache`, `attention_mask` and
+            `cache_position`. Extra `**kwargs` are accepted but not forwarded.
+        """
+        # Overwritten -- has a unique cache type, `HybridMambaAttentionDynamicCache`
+
+        # Remember whether the caller passed a cache; `past_key_values` itself may be replaced below.
+        empty_past_kv = past_key_values is None
+
+        # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens
+        # Exception 1: when passing input_embeds, input_ids may be missing entries
+        # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here
+        # Exception 3: with synced GPUs cache_position may go out of bounds, but we only want dummy token in that case.
+        #              (we can't check exception 3 while compiling)
+        if not empty_past_kv:
+            if (
+                inputs_embeds is not None  # Exception 1
+                or cache_position[-1] >= input_ids.shape[1]  # Exception 3
+            ):
+                # keep the trailing `len(cache_position)` tokens
+                input_ids = input_ids[:, -cache_position.shape[0] :]
+            elif input_ids.shape[1] != cache_position.shape[0]:  # Default case (the "else", a no op, is Exception 2)
+                input_ids = input_ids[:, cache_position]
+        elif use_cache:
+            # No cache was passed: create one sized for the current batch (`input_ids.shape[0]`).
+            past_key_values = HybridMambaAttentionDynamicCache(
+                self.config, input_ids.shape[0], self.dtype, device=self.device
+            )
+
+        if attention_mask is not None and position_ids is None:
+            # create position_ids on the fly for batch generation
+            # running count of attended tokens minus one gives 0-based positions; masked slots are set to 1
+            position_ids = attention_mask.long().cumsum(-1) - 1
+            position_ids.masked_fill_(attention_mask == 0, 1)
+            if not empty_past_kv:
+                # with a caller-provided cache only the positions of the remaining `input_ids` are kept
+                position_ids = position_ids[:, -input_ids.shape[1] :]
+
+        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
+        if inputs_embeds is not None and empty_past_kv:
+            model_inputs = {"inputs_embeds": inputs_embeds}
+        else:
+            model_inputs = {"input_ids": input_ids.contiguous()}  # `contiguous()` needed for compilation use cases
+
+        model_inputs.update(
+            {
+                "position_ids": position_ids,
+                "past_key_values": past_key_values,
+                "use_cache": use_cache,
+                "attention_mask": attention_mask,
+                "cache_position": cache_position,
+            }
+        )
+        return model_inputs
+
+
+__all__ = ["GraniteMoeHybridForCausalLM", "GraniteMoeHybridModel", "GraniteMoeHybridPreTrainedModel"]
diff --git a/phivenv/Lib/site-packages/transformers/models/granitemoeshared/__init__.py b/phivenv/Lib/site-packages/transformers/models/granitemoeshared/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..33d80cdd3425f95de5d40c82a4f52132be971f1f
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/granitemoeshared/__init__.py
@@ -0,0 +1,27 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    # `TYPE_CHECKING` is False at runtime, so these star imports are only seen by static type checkers.
+    from .configuration_granitemoeshared import *
+    from .modeling_granitemoeshared import *
+else:
+    import sys
+
+    # At runtime, replace this package's entry in `sys.modules` with a `_LazyModule` built from the import
+    # structure that `define_import_structure` derives from this file's path.
+    # NOTE(review): presumably this defers importing the submodules until an attribute is accessed -- confirm
+    # against `_LazyModule`.
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/phivenv/Lib/site-packages/transformers/models/granitemoeshared/__pycache__/__init__.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/granitemoeshared/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3e4a8ed405cd842a9b38022fc88dc5b27f47a431
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/granitemoeshared/__pycache__/__init__.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/granitemoeshared/__pycache__/configuration_granitemoeshared.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/granitemoeshared/__pycache__/configuration_granitemoeshared.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..30ea61b72fa9252ec43bd41a295c2eb979d7c6fa
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/granitemoeshared/__pycache__/configuration_granitemoeshared.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/granitemoeshared/__pycache__/modeling_granitemoeshared.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/granitemoeshared/__pycache__/modeling_granitemoeshared.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bc2edc65f8022d747399b7366cdd0b8b19880420
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/granitemoeshared/__pycache__/modeling_granitemoeshared.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/granitemoeshared/__pycache__/modular_granitemoeshared.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/granitemoeshared/__pycache__/modular_granitemoeshared.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ac20a626ce8311b3a660e4016fac9d3f199d59cf
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/granitemoeshared/__pycache__/modular_granitemoeshared.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/granitemoeshared/configuration_granitemoeshared.py b/phivenv/Lib/site-packages/transformers/models/granitemoeshared/configuration_granitemoeshared.py
new file mode 100644
index 0000000000000000000000000000000000000000..cd1c4a5ca6991bff87729670eac05a41e7879181
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/granitemoeshared/configuration_granitemoeshared.py
@@ -0,0 +1,200 @@
+# coding=utf-8
+# Copyright 2024 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""GraniteMoeShared model configuration"""
+
+from ...configuration_utils import PretrainedConfig
+from ...modeling_rope_utils import rope_config_validation
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class GraniteMoeSharedConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`GraniteMoeSharedModel`]. It is used to instantiate a GraniteMoeShared
+    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+    defaults will yield a similar configuration to that of the [ibm-research/moe-7b-1b-active-shared-experts](https://huggingface.co/ibm-research/moe-7b-1b-active-shared-experts).
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 32000):
+            Vocabulary size of the GraniteMoeShared model. Defines the number of different tokens that can be represented by the
+            `input_ids` passed when calling [`GraniteMoeSharedModel`]
+        hidden_size (`int`, *optional*, defaults to 4096):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 11008):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 32):
+            Number of hidden layers in the Transformer decoder.
+        num_attention_heads (`int`, *optional*, defaults to 32):
+            Number of attention heads for each attention layer in the Transformer decoder.
+        num_key_value_heads (`int`, *optional*):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details, check out [this
+            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
+            `num_attention_heads`.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the decoder.
+        max_position_embeddings (`int`, *optional*, defaults to 2048):
+            The maximum sequence length that this model might ever be used with.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        pad_token_id (`int`, *optional*):
+            Padding token id.
+        bos_token_id (`int`, *optional*, defaults to 1):
+            Beginning of stream token id.
+        eos_token_id (`int`, *optional*, defaults to 2):
+            End of stream token id.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether to tie weight embeddings
+        rope_theta (`float`, *optional*, defaults to 10000.0):
+            The base period of the RoPE embeddings.
+        rope_scaling (`Dict`, *optional*):
+            Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
+            strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is
+            `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
+            `max_position_embeddings` to the expected new maximum. See the following thread for more information on how
+            these scaling strategies behave:
+            https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an
+            experimental feature, subject to breaking API changes in future versions.
+        attention_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use a bias in the query, key, value and output projection layers during self-attention.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        embedding_multiplier (`float`, *optional*, defaults to 1.0): embedding multiplier
+        logits_scaling (`float`, *optional*, defaults to 1.0): divisor for output logits
+        residual_multiplier (`float`, *optional*, defaults to 1.0): residual multiplier
+        attention_multiplier (`float`, *optional*, defaults to 1.0): attention multiplier
+        num_local_experts (`int`, *optional*, defaults to 8): total number of experts
+        num_experts_per_tok (`int`, *optional*, defaults to 2): number of experts per token
+        output_router_logits (`bool`, *optional*, defaults to `False`):
+            Whether or not the router logits should be returned by the model. Enabling this will also
+            allow the model to output the auxiliary loss.
+        router_aux_loss_coef (`float`, *optional*, defaults to 0.001): router auxiliary loss coefficient
+        shared_intermediate_size (`int`, *optional*, defaults to 0): intermediate size for shared experts. 0 implies
+            no shared experts.
+
+    ```python
+    >>> from transformers import GraniteMoeSharedModel, GraniteMoeSharedConfig
+
+    >>> # Initializing a GraniteMoeShared configuration with the default values
+    >>> configuration = GraniteMoeSharedConfig()
+
+    >>> # Initializing a model from that configuration
+    >>> model = GraniteMoeSharedModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "granitemoeshared"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        vocab_size=32000,
+        hidden_size=4096,
+        intermediate_size=11008,
+        num_hidden_layers=32,
+        num_attention_heads=32,
+        num_key_value_heads=None,
+        hidden_act="silu",
+        max_position_embeddings=2048,
+        initializer_range=0.02,
+        rms_norm_eps=1e-6,
+        use_cache=True,
+        pad_token_id=None,
+        bos_token_id=1,
+        eos_token_id=2,
+        tie_word_embeddings=False,
+        rope_theta=10000.0,
+        rope_scaling=None,
+        attention_bias=False,
+        attention_dropout=0.0,
+        embedding_multiplier=1.0,
+        logits_scaling=1.0,
+        residual_multiplier=1.0,
+        attention_multiplier=1.0,
+        num_local_experts=8,
+        num_experts_per_tok=2,
+        output_router_logits=False,
+        router_aux_loss_coef=0.001,
+        shared_intermediate_size=0,
+        **kwargs,
+    ):
+        # Model-specific attributes are stored on the instance first; the token ids, `tie_word_embeddings`
+        # and any remaining `**kwargs` are handed to `PretrainedConfig.__init__` further down.
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+
+        # for backward compatibility
+        # (an unspecified `num_key_value_heads` falls back to `num_attention_heads`)
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        # this model has rope embedding type, hardcoded for BC
+        self.position_embedding_type = "rope"
+
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+
+        # scalar multipliers / divisors described in the class docstring
+        self.embedding_multiplier = embedding_multiplier
+        self.logits_scaling = logits_scaling
+        self.residual_multiplier = residual_multiplier
+        self.attention_multiplier = attention_multiplier
+
+        # mixture-of-experts settings
+        self.num_local_experts = num_local_experts
+        self.num_experts_per_tok = num_experts_per_tok
+        self.output_router_logits = output_router_logits
+        self.router_aux_loss_coef = router_aux_loss_coef
+        self.shared_intermediate_size = shared_intermediate_size
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+        # Runs last, once both the attributes above and the base-class initialisation are in place.
+        rope_config_validation(self)
+
+
+__all__ = ["GraniteMoeSharedConfig"]
diff --git a/phivenv/Lib/site-packages/transformers/models/granitemoeshared/modeling_granitemoeshared.py b/phivenv/Lib/site-packages/transformers/models/granitemoeshared/modeling_granitemoeshared.py
new file mode 100644
index 0000000000000000000000000000000000000000..9bbb7b768a101ab2aabab40e9ee3ccb1321a15d9
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/granitemoeshared/modeling_granitemoeshared.py
@@ -0,0 +1,1063 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/granitemoeshared/modular_granitemoeshared.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_granitemoeshared.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# coding=utf-8
+# Copyright 2024 IBM and the HuggingFace Inc. team. All rights reserved.
+#
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Callable, Optional, TypedDict, Union
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from ...activations import ACT2FN
+from ...cache_utils import Cache, DynamicCache
+from ...generation import GenerationMixin
+from ...modeling_attn_mask_utils import AttentionMaskConverter
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import BaseModelOutputWithPast, MoeCausalLMOutputWithPast, MoeModelOutputWithPast
+from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from ...processing_utils import Unpack
+from ...utils import auto_docstring, is_torch_flex_attn_available, logging
+from ...utils.deprecation import deprecate_kwarg
+from .configuration_granitemoeshared import GraniteMoeSharedConfig
+
+
+if is_torch_flex_attn_available():
+    from torch.nn.attention.flex_attention import BlockMask
+
+    from ...integrations.flex_attention import make_flex_block_causal_mask
+
+
+logger = logging.get_logger(__name__)
+
+
+class GraniteFlashAttentionKwargs(TypedDict, total=False):
+    """
+    Keyword arguments for advanced Flash Attention, causal-conv1d, and mamba_ssm kernel usage.
+    Use cases include padding-free training and fewer `torch.compile` graph breaks.
+
+    The class is declared with `total=False`, so every key is optional.
+
+    Attributes:
+        cu_seq_lens_q (`torch.LongTensor`):
+            Cumulative sequence lengths for the query state.
+        cu_seq_lens_k (`torch.LongTensor`):
+            Cumulative sequence lengths for the key state.
+        max_length_q (`int`):
+            Maximum sequence length for query state.
+        max_length_k (`int`):
+            Maximum sequence length for key state.
+        seq_idx (`torch.IntTensor`):
+            Index of each packed sequence.
+    """
+
+    cu_seq_lens_q: torch.LongTensor
+    cu_seq_lens_k: torch.LongTensor
+    max_length_q: int
+    max_length_k: int
+    seq_idx: torch.IntTensor
+
+
+class GraniteMoeSharedMLP(nn.Module):
+    """
+    MLP layer for shared experts
+
+    Args:
+        config:
+            Configuration object with model hyperparameters.
+    """
+
+    def __init__(self, config: GraniteMoeSharedConfig):
+        super().__init__()
+
+        self.input_size = config.hidden_size
+        self.hidden_size = config.shared_intermediate_size
+        self.activation = ACT2FN[config.hidden_act]
+        self.input_linear = nn.Linear(self.input_size, self.hidden_size * 2, bias=False)
+        self.output_linear = nn.Linear(self.hidden_size, self.input_size, bias=False)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.input_linear(hidden_states)
+        chunked_hidden_states = hidden_states.chunk(2, dim=-1)
+        hidden_states = self.activation(chunked_hidden_states[0]) * chunked_hidden_states[1]
+        hidden_states = self.output_linear(hidden_states)
+        return hidden_states
+
+
+class GraniteMoeSharedRMSNorm(nn.Module):
+    def __init__(self, hidden_size, eps=1e-6):
+        """
+        GraniteMoeSharedRMSNorm is equivalent to T5LayerNorm
+        """
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+
+    def forward(self, hidden_states):
+        input_dtype = hidden_states.dtype
+        hidden_states = hidden_states.to(torch.float32)
+        variance = hidden_states.pow(2).mean(-1, keepdim=True)
+        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+        return self.weight * hidden_states.to(input_dtype)
+
+    def extra_repr(self):
+        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
+
+
+class GraniteMoeSharedParallelExperts(nn.Module):
+    def __init__(self, num_experts: int, input_size: int, output_size: int) -> None:
+        """
+        Initialize the GraniteMoeSharedParallelExperts module.
+        The experts weights are stored in [num_experts, output_size, input_size] format. Such that it's compatible with
+        many MoE libraries, such as [Megablock](https://github.com/databricks/megablocks) and
+        [ScatterMoE](https://github.com/shawntan/scattermoe), as well as the
+        [MoE kernel](https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/fused_moe/fused_moe.py)
+        used in vllm.
+
+        Args:
+            num_experts (int):
+                Number of experts.
+            input_size (int):
+                Size of the input.
+            output_size (int):
+                Size of the output.
+        """
+        super().__init__()
+        self.weight = nn.Parameter(torch.empty(num_experts, output_size, input_size))
+        self.num_experts = num_experts
+        self.input_size = input_size
+        self.output_size = output_size
+
+    def forward(self, inputs, expert_size):
+        """
+        Forward pass of the GraniteMoeSharedParallelExperts module.
+
+        Args:
+            inputs (Tensor):
+                Input tensor.
+            expert_size:
+                Expert size information.
+
+        Returns:
+            Tensor: Output tensor.
+        """
+        input_list = inputs.split(expert_size, dim=0)
+        output_list = []
+        for i in range(self.num_experts):
+            output_list.append(F.linear(input_list[i], self.weight[i]))
+        results = torch.cat(output_list, dim=0)
+        return results
+
+
+class GraniteMoeSharedTopKGating(nn.Module):
+    def __init__(self, input_size: int, num_experts: int, top_k: int):
+        """
+        Initialize the top-k gating mechanism.
+
+        Args:
+            input_size (`int`):
+                Size of the input.
+            num_experts (`int`):
+                Number of experts.
+            top_k (`int`):
+                Number of top experts to select.
+        """
+        super().__init__()
+
+        self.num_experts = num_experts
+        self.input_size = input_size
+        self.top_k = top_k
+
+        # bias-free linear router: one logit per expert
+        self.layer = nn.Linear(input_size, num_experts, bias=False)
+
+    def forward(self, hidden_states):
+        """
+        Select the `top_k` highest-scoring experts for every token and group the assignments by expert.
+
+        Args:
+            hidden_states (`torch.Tensor`):
+                Token representations; the `topk` and `softmax` calls below operate on `dim=1`, so a 2D
+                `[num_tokens, input_size]` tensor is expected.
+
+        Returns:
+            `tuple`: `(index_sorted_experts, batch_index, batch_gates, expert_size, logits)` where
+            `index_sorted_experts` orders the flattened (token, slot) assignments by expert id, `batch_index` is
+            the token row of each sorted assignment, `batch_gates` is the matching softmax gate value,
+            `expert_size` is a Python list with the number of assignments per expert, and `logits` are the float
+            router logits for all experts.
+        """
+        # compute the top_k routing decision
+        logits = self.layer(hidden_states).float()  # [batch_size x seq_len, num_experts]
+        top_k_logits, top_k_indices = logits.topk(self.top_k, dim=1)  # [num_tokens, top_k]
+        # softmax is taken over the selected logits only, then cast back to the input dtype
+        top_k_gates = torch.softmax(top_k_logits, dim=1).type_as(hidden_states)  # [num_tokens, top_k]
+
+        # compute number of input given to each expert
+        zeros = torch.zeros(
+            [top_k_gates.size(0), self.num_experts], dtype=top_k_gates.dtype, device=top_k_gates.device
+        )  # [num_tokens, num_experts]
+        # mark the selected experts of every token with 1, then count the marks per expert column
+        gates = zeros.scatter(1, top_k_indices, 1)  # [num_tokens, num_experts]
+        expert_size = gates.long().sum(0)  # [num_experts,]
+        # (This causes torch.compile to fail with `torch._dynamo.exc.Unsupported: Backend compiler failed with a fake tensor exception at`)
+        # (and `DataDependentOutputException`)
+        expert_size = expert_size.tolist()
+
+        # sort and group input tokens according to expert assignment
+        top_k_experts = top_k_indices.flatten()  # [num_tokens * top_k]
+        _, index_sorted_experts = top_k_experts.sort(0)  # [num_tokens * top_k]
+        # a flattened position divided by `top_k` (truncating) is the row of the token it came from
+        batch_index = index_sorted_experts.div(self.top_k, rounding_mode="trunc")  # [num_tokens * top_k]
+
+        # gather the gate values for grouped input tokens
+        top_k_gates = top_k_gates.flatten()  # [num_tokens * top_k]
+        batch_gates = top_k_gates[index_sorted_experts]  # [num_tokens * top_k]
+
+        return index_sorted_experts, batch_index, batch_gates, expert_size, logits
+
+
+class GraniteMoeSharedMoE(nn.Module):
+    """
+    A Sparsely gated mixture of experts layer with 1-layer Feed-Forward networks as experts.
+
+    The layer owns a `GraniteMoeSharedTopKGating` router and two `GraniteMoeSharedParallelExperts` stacks: an
+    input projection to twice `config.intermediate_size` (split into a gate half and a value half) and an output
+    projection back to `config.hidden_size`.
+
+    Args:
+        config:
+            Configuration object with model hyperparameters.
+    """
+
+    def __init__(self, config: GraniteMoeSharedConfig):
+        super().__init__()
+
+        self.input_size = config.hidden_size
+        self.hidden_size = config.intermediate_size
+        self.activation = ACT2FN[config.hidden_act]
+        # the input projection is twice as wide so that `forward` can split it into two halves
+        self.input_linear = GraniteMoeSharedParallelExperts(
+            config.num_local_experts, self.input_size, self.hidden_size * 2
+        )
+        self.output_linear = GraniteMoeSharedParallelExperts(
+            config.num_local_experts, self.hidden_size, self.input_size
+        )
+
+        self.router = GraniteMoeSharedTopKGating(
+            input_size=self.input_size,
+            num_experts=config.num_local_experts,
+            top_k=config.num_experts_per_tok,
+        )
+
+    def forward(self, layer_input):
+        """
+        Forward pass of the mixture of experts layer.
+
+        Args:
+            layer_input (Tensor):
+                Input tensor with three dimensions, unpacked below as `(bsz, length, emb_size)`.
+
+        Returns:
+            Tensor:
+                Output tensor of shape `(bsz, length, input_size)`.
+            Tensor:
+                Router logits.
+        """
+        bsz, length, emb_size = layer_input.size()
+        # flatten batch and sequence so that every row is one token
+        layer_input = layer_input.reshape(-1, emb_size)
+        # the first router output (the sort permutation itself) is not needed here
+        _, batch_index, batch_gates, expert_size, router_logits = self.router(layer_input)
+
+        # gather one input row per (token, expert) assignment, in the order produced by the router
+        expert_inputs = layer_input[batch_index]
+        hidden_states = self.input_linear(expert_inputs, expert_size)
+        chunked_hidden_states = hidden_states.chunk(2, dim=-1)
+        # activated first half gates the second half
+        hidden_states = self.activation(chunked_hidden_states[0]) * chunked_hidden_states[1]
+        expert_outputs = self.output_linear(hidden_states, expert_size)
+
+        # weight every expert output by its gate value
+        expert_outputs = expert_outputs * batch_gates[:, None]
+
+        # sum the weighted outputs of all assignments back into their token rows
+        zeros = torch.zeros((bsz * length, self.input_size), dtype=expert_outputs.dtype, device=expert_outputs.device)
+        layer_output = zeros.index_add(0, batch_index, expert_outputs)
+        layer_output = layer_output.view(bsz, length, self.input_size)
+        return layer_output, router_logits
+
+
+def rotate_half(x):
+    """Swap the two halves of the last dimension, negating the half that moves to the front."""
+    midpoint = x.shape[-1] // 2
+    front, back = x[..., :midpoint], x[..., midpoint:]
+    return torch.cat((-back, front), dim=-1)
+
+
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
+    """Rotate the query and key tensors with Rotary Position Embedding.
+
+    Args:
+        q (`torch.Tensor`): The query tensor.
+        k (`torch.Tensor`): The key tensor.
+        cos (`torch.Tensor`): The cosine part of the rotary embedding.
+        sin (`torch.Tensor`): The sine part of the rotary embedding.
+        position_ids (`torch.Tensor`, *optional*):
+            Deprecated and unused.
+        unsqueeze_dim (`int`, *optional*, defaults to 1):
+            Dimension inserted into `cos` and `sin` so that they broadcast against `q` and `k`. With `cos`/`sin`
+            of shape [batch_size, seq_len, head_dim], use `unsqueeze_dim=1` when `q` and `k` are laid out as
+            [batch_size, heads, seq_len, head_dim], and `unsqueeze_dim=2` when they are laid out as
+            [batch_size, seq_len, heads, head_dim].
+    Returns:
+        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
+    """
+    cos, sin = cos.unsqueeze(unsqueeze_dim), sin.unsqueeze(unsqueeze_dim)
+
+    def _rotate(states):
+        # Same formula for queries and keys: x * cos + rotate_half(x) * sin.
+        return (states * cos) + (rotate_half(states) * sin)
+
+    return _rotate(q), _rotate(k)
+
+
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """
+    Repeat every key/value head `n_rep` times, equivalent to torch.repeat_interleave(x, dim=1, repeats=n_rep).
+    The hidden states go from (batch, num_key_value_heads, seqlen, head_dim) to
+    (batch, num_attention_heads, seqlen, head_dim). With `n_rep == 1` the input is returned unchanged.
+    """
+    # Unpacking first keeps the 4D requirement in force even on the `n_rep == 1` shortcut.
+    batch, kv_heads, seq_len, head_dim = hidden_states.shape
+    if n_rep == 1:
+        return hidden_states
+    expanded = hidden_states.unsqueeze(2).expand(batch, kv_heads, n_rep, seq_len, head_dim)
+    return expanded.reshape(batch, kv_heads * n_rep, seq_len, head_dim)
+
+
+def eager_attention_forward(
+    module: nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: Optional[torch.Tensor],
+    scaling: float,
+    dropout: float = 0.0,
+    **kwargs,
+):
+    """
+    Plain matmul/softmax attention.
+
+    Key/value heads are repeated `module.num_key_value_groups` times, the scaled scores are optionally offset
+    by `attention_mask` (sliced to the key length), softmax runs in float32 and is cast back to the query
+    dtype, and dropout follows `module.training`. Extra `kwargs` are accepted and ignored.
+
+    Returns:
+        The attention output transposed to put the sequence dimension before the head dimension (contiguous),
+        and the attention probabilities.
+    """
+    group_count = module.num_key_value_groups
+    keys = repeat_kv(key, group_count)
+    values = repeat_kv(value, group_count)
+
+    scores = torch.matmul(query, keys.transpose(2, 3)) * scaling
+    if attention_mask is not None:
+        scores = scores + attention_mask[:, :, :, : keys.shape[-2]]
+
+    # upcast attention to fp32
+    probs = nn.functional.softmax(scores, dim=-1, dtype=torch.float32).to(query.dtype)
+    probs = nn.functional.dropout(probs, p=dropout, training=module.training)
+    context = torch.matmul(probs, values).transpose(1, 2).contiguous()
+
+    return context, probs
+
+
+# copied from transformers.models.granite.modeling_granite.GraniteAttention with Granite->GraniteMoeShared
+# no longer copied after attention refactors
+class GraniteMoeSharedAttention(nn.Module):
+    """Multi-headed self-attention with grouped key/value heads, optional rotary embeddings and KV caching."""
+
+    def __init__(self, config: GraniteMoeSharedConfig, layer_idx: Optional[int] = None):
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+        if layer_idx is None:
+            logger.warning_once(
+                f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
+                "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
+                "when creating this class."
+            )
+
+        self.attention_dropout = config.attention_dropout
+        self.hidden_size = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = self.hidden_size // self.num_heads
+        self.num_key_value_heads = config.num_key_value_heads
+        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
+        self.is_causal = True
+
+        # The score scaling comes straight from the config rather than being derived from head_dim.
+        self.scaling = config.attention_multiplier
+
+        if (self.head_dim * self.num_heads) != self.hidden_size:
+            raise ValueError(
+                f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
+                f" and `num_heads`: {self.num_heads})."
+            )
+
+        query_width = self.num_heads * self.head_dim
+        kv_width = self.num_key_value_heads * self.head_dim
+        use_bias = config.attention_bias
+        self.q_proj = nn.Linear(self.hidden_size, query_width, bias=use_bias)
+        self.k_proj = nn.Linear(self.hidden_size, kv_width, bias=use_bias)
+        self.v_proj = nn.Linear(self.hidden_size, kv_width, bias=use_bias)
+        self.o_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=use_bias)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        use_cache: bool = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,  # None or rope embeddings
+        **kwargs,
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
+        """
+        Project the input to queries/keys/values, optionally rotate and cache them, and run attention.
+
+        `position_ids` and `use_cache` are accepted but not read here. Returns the output projection of the
+        attention result together with whatever attention weights the selected attention function returns.
+        """
+        bsz, q_len, _ = hidden_states.size()
+
+        query_states = self.q_proj(hidden_states)
+        key_states = self.k_proj(hidden_states)
+        value_states = self.v_proj(hidden_states)
+
+        def _split_heads(states, head_count):
+            # (bsz, q_len, head_count * head_dim) -> (bsz, head_count, q_len, head_dim)
+            return states.view(bsz, q_len, head_count, self.head_dim).transpose(1, 2)
+
+        query_states = _split_heads(query_states, self.num_heads)
+        key_states = _split_heads(key_states, self.num_key_value_heads)
+        value_states = _split_heads(value_states, self.num_key_value_heads)
+
+        if position_embeddings is None:
+            cos = sin = None
+        else:
+            cos, sin = position_embeddings
+            query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+
+        if past_key_values is not None:
+            # sin and cos are specific to RoPE models; cache_position needed for the static cache
+            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        attn_implementation = self.config._attn_implementation
+        attention_interface: Callable = (
+            eager_attention_forward
+            if attn_implementation == "eager"
+            else ALL_ATTENTION_FUNCTIONS[attn_implementation]
+        )
+
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            dropout=self.attention_dropout if self.training else 0.0,
+            scaling=self.scaling,
+            **kwargs,
+        )
+
+        # Merge the heads back into a single feature dimension before the output projection.
+        merged = attn_output.view(bsz, q_len, -1)
+        return self.o_proj(merged), attn_weights
+
+
+class GraniteMoeSharedDecoderLayer(GradientCheckpointingLayer):
+    """
+    One decoder block: pre-norm self-attention followed by a pre-norm feed-forward stage, each added back to
+    the residual stream scaled by `config.residual_multiplier`.
+
+    The feed-forward stage is the sum of the routed mixture of experts (built only when
+    `config.num_local_experts > 0`) and the shared MLP (built only when `config.shared_intermediate_size != 0`).
+    """
+
+    def __init__(self, config: GraniteMoeSharedConfig, layer_idx: int):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+
+        self.self_attn = GraniteMoeSharedAttention(config=config, layer_idx=layer_idx)
+        # Remember whether the routed experts exist so `forward` never touches a missing attribute.
+        self.has_experts = config.num_local_experts > 0
+        if self.has_experts:
+            self.block_sparse_moe = GraniteMoeSharedMoE(config)
+        self.input_layernorm = GraniteMoeSharedRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = GraniteMoeSharedRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+        self.residual_multiplier = config.residual_multiplier
+        self.shared_mlp = None if config.shared_intermediate_size == 0 else GraniteMoeSharedMLP(config)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        output_attentions: Optional[bool] = False,
+        use_cache: Optional[bool] = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        output_router_logits: Optional[bool] = False,
+        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
+        **kwargs: Unpack[GraniteFlashAttentionKwargs],
+    ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+            attention_mask (`torch.FloatTensor`, *optional*):
+                attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
+                query_sequence_length, key_sequence_length)` if default attention is used.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            use_cache (`bool`, *optional*):
+                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
+                (see `past_key_values`).
+            past_key_values (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
+            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
+                Indices depicting the position of the input sequence tokens in the sequence
+            output_router_logits (`bool`, *optional*):
+                Whether or not to return the logits of all the routers. They are useful for computing the router loss, and
+                should not be returned during inference.
+            position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
+                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
+                with `head_dim` being the embedding dimension of each attention head.
+            kwargs (`dict`, *optional*):
+                Arbitrary kwargs. Can be used to provide `GraniteFlashAttentionKwargs` for
+                padding-free training and/or improve torch.compile performance.
+
+        Returns:
+            A tuple starting with the layer output, followed by the attention weights when `output_attentions`
+            is set, followed by the router logits when `output_router_logits` is set (`None` for a layer that
+            has no routed experts).
+
+        Raises:
+            ValueError: if the layer was built with neither routed experts nor a shared MLP.
+        """
+        residual = hidden_states
+
+        hidden_states = self.input_layernorm(hidden_states)
+
+        # Self Attention
+        hidden_states, self_attn_weights = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            output_attentions=output_attentions,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            position_embeddings=position_embeddings,
+            **kwargs,
+        )
+
+        hidden_states = residual + hidden_states * self.residual_multiplier
+
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+
+        if self.has_experts:
+            moe_hidden_states, router_logits = self.block_sparse_moe(hidden_states)
+            if self.shared_mlp is None:
+                hidden_states = moe_hidden_states
+            else:
+                hidden_states = moe_hidden_states + self.shared_mlp(hidden_states)
+            del moe_hidden_states
+        elif self.shared_mlp is not None:
+            # No routed experts were configured: the shared MLP is the whole feed-forward stage.
+            router_logits = None
+            hidden_states = self.shared_mlp(hidden_states)
+        else:
+            raise ValueError(
+                "GraniteMoeSharedDecoderLayer has no feed-forward sub-layer: set `num_local_experts > 0` and/or "
+                "a non-zero `shared_intermediate_size` in the config."
+            )
+
+        hidden_states = residual + hidden_states * self.residual_multiplier
+
+        outputs = (hidden_states,)
+
+        if output_attentions:
+            outputs += (self_attn_weights,)
+
+        if output_router_logits:
+            outputs += (router_logits,)
+
+        return outputs
+
+
+@auto_docstring
+class GraniteMoeSharedPreTrainedModel(PreTrainedModel):
+    # Shared base class: declares loading/placement capabilities and the expert-specific weight init.
+    config: GraniteMoeSharedConfig
+    base_model_prefix = "model"
+
+    # Attention backends advertised by this model family.
+    _supports_flash_attn = True
+    _supports_sdpa = True
+
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["GraniteMoeSharedDecoderLayer"]
+    _skip_keys_device_placement = ["past_key_values"]
+
+    _can_compile_fullgraph = False  # MoE models don't work with torch.compile (`torch.where(condition)` not supported)
+
+    def _init_weights(self, module):
+        # Run the generic initialisation first, then draw the stacked expert weights from a normal distribution.
+        super()._init_weights(module)
+        if not isinstance(module, GraniteMoeSharedParallelExperts):
+            return
+        module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+
+
+class GraniteMoeSharedRotaryEmbedding(nn.Module):
+    """Produces the cosine/sine tables of the rotary position embedding for a batch of position ids."""
+
+    inv_freq: torch.Tensor  # fix linting for `register_buffer`
+
+    def __init__(self, config: GraniteMoeSharedConfig, device=None):
+        super().__init__()
+        # BC: "rope_type" was originally "type"
+        rope_scaling = getattr(config, "rope_scaling", None)
+        if isinstance(rope_scaling, dict):
+            self.rope_type = rope_scaling.get("rope_type", rope_scaling.get("type"))
+        else:
+            self.rope_type = "default"
+
+        max_positions = config.max_position_embeddings
+        self.max_seq_len_cached = max_positions
+        self.original_max_seq_len = max_positions
+
+        self.config = config
+        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
+
+        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
+        # Non-persistent: the buffer is not written to the state dict.
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+        self.original_inv_freq = self.inv_freq
+
+    @torch.no_grad()
+    @dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
+    def forward(self, x, position_ids):
+        """Return `(cos, sin)` for `position_ids`, computed in float32 and cast to the dtype of `x`."""
+        batch_size = position_ids.shape[0]
+        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(batch_size, -1, 1).to(x.device)
+        position_ids_expanded = position_ids[:, None, :].float()
+
+        device_type = x.device.type
+        if not isinstance(device_type, str) or device_type == "mps":
+            device_type = "cpu"
+
+        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+            angles = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
+            # The angle table is duplicated so it spans both halves handled by the rotation.
+            table = torch.cat((angles, angles), dim=-1)
+            cos = table.cos() * self.attention_scaling
+            sin = table.sin() * self.attention_scaling
+
+        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
+
+
+@auto_docstring
+class GraniteMoeSharedModel(GraniteMoeSharedPreTrainedModel):
+    # Decoder stack: token embedding (scaled by `embedding_multiplier`), `num_hidden_layers` decoder layers and
+    # a final RMS norm. Rotary embeddings are only created when `position_embedding_type == "rope"`.
+    def __init__(self, config: GraniteMoeSharedConfig):
+        super().__init__(config)
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+        self.layers = nn.ModuleList(
+            [GraniteMoeSharedDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+        )
+        self.norm = GraniteMoeSharedRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.gradient_checkpointing = False
+
+        self.embedding_multiplier = config.embedding_multiplier
+        self.hidden_size = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = self.hidden_size // self.num_heads
+        self.max_position_embeddings = config.max_position_embeddings
+        self.rope_theta = config.rope_theta
+
+        self.position_embedding_type = config.position_embedding_type
+        self.rotary_emb = GraniteMoeSharedRotaryEmbedding(config) if self.position_embedding_type == "rope" else None
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Union[Cache, list[torch.FloatTensor]]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        output_router_logits: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
+    ) -> Union[tuple, MoeModelOutputWithPast]:
+        # Unset flags fall back to the config defaults (`output_router_logits` is used as passed).
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+        if self.gradient_checkpointing and self.training and use_cache:
+            logger.warning_once(
+                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
+            )
+            use_cache = False
+
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+
+        # The multiplier is applied to caller-provided embeddings as well as to looked-up ones.
+        inputs_embeds = inputs_embeds * self.embedding_multiplier
+
+        # TODO (joao): remove this exception in v4.56 -- it exists for users that try to pass a legacy cache
+        if not isinstance(past_key_values, (type(None), Cache)):
+            raise ValueError("The `past_key_values` should be either a `Cache` object or `None`.")
+
+        if use_cache and past_key_values is None:
+            past_key_values = DynamicCache(config=self.config)
+
+        if cache_position is None:
+            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+            cache_position = torch.arange(
+                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
+            )
+        if position_ids is None:
+            position_ids = cache_position.unsqueeze(0)
+
+        causal_mask = self._update_causal_mask(
+            attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
+        )
+
+        # embed positions
+        hidden_states = inputs_embeds
+
+        position_embeddings = None
+        # create position embeddings to be shared across the decoder layers
+        if self.rotary_emb is not None:
+            position_embeddings = self.rotary_emb(hidden_states, position_ids)
+
+        # decoder layers
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+        all_router_logits = () if output_router_logits else None
+
+        for decoder_layer in self.layers:
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+
+            layer_outputs = decoder_layer(
+                hidden_states,
+                attention_mask=causal_mask,
+                position_ids=position_ids,
+                past_key_values=past_key_values,
+                output_attentions=output_attentions,
+                use_cache=use_cache,
+                cache_position=cache_position,
+                output_router_logits=output_router_logits,
+                position_embeddings=position_embeddings,
+            )
+
+            hidden_states = layer_outputs[0]
+
+            if output_attentions:
+                all_self_attns += (layer_outputs[1],)
+
+            if output_router_logits:
+                # Router logits are always the last element of the layer output tuple.
+                all_router_logits += (layer_outputs[-1],)
+
+        hidden_states = self.norm(hidden_states)
+
+        # add hidden states from the last decoder layer
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+
+        if not return_dict:
+            # Same field order as `MoeModelOutputWithPast`; the router logits come last so that requesting
+            # them with `return_dict=False` does not silently drop them.
+            return tuple(
+                v
+                for v in [hidden_states, past_key_values, all_hidden_states, all_self_attns, all_router_logits]
+                if v is not None
+            )
+        return MoeModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+            router_logits=all_router_logits,
+        )
+
+    def _update_causal_mask(
+        self,
+        attention_mask: Union[torch.Tensor, "BlockMask"],
+        input_tensor: torch.Tensor,
+        cache_position: torch.Tensor,
+        past_key_values: Cache,
+        output_attentions: bool = False,
+    ):
+        """
+        Build the mask handed to the decoder layers for the configured attention implementation.
+
+        Returns `None` when no explicit mask is needed, the (possibly converted) input mask for
+        flash-attention-2 and flex-attention, and otherwise a 4D additive causal mask.
+        """
+        if self.config._attn_implementation == "flash_attention_2":
+            # Only forward the mask when it actually masks something out.
+            if attention_mask is not None and (attention_mask == 0.0).any():
+                return attention_mask
+            return None
+        if self.config._attn_implementation == "flex_attention":
+            if isinstance(attention_mask, torch.Tensor):
+                attention_mask = make_flex_block_causal_mask(attention_mask)
+            return attention_mask
+
+        # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
+        # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
+        # to infer the attention mask.
+        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+        using_compilable_cache = past_key_values.is_compileable if past_key_values is not None else False
+
+        # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
+        if self.config._attn_implementation == "sdpa" and not using_compilable_cache and not output_attentions:
+            if AttentionMaskConverter._ignore_causal_mask_sdpa(
+                attention_mask,
+                inputs_embeds=input_tensor,
+                past_key_values_length=past_seen_tokens,
+                is_training=self.training,
+            ):
+                return None
+
+        dtype = input_tensor.dtype
+        sequence_length = input_tensor.shape[1]
+        if using_compilable_cache:
+            target_length = past_key_values.get_max_cache_shape()
+        else:
+            target_length = (
+                attention_mask.shape[-1]
+                if isinstance(attention_mask, torch.Tensor)
+                else past_seen_tokens + sequence_length + 1
+            )
+
+        # In case the provided `attention` mask is 2D, we generate a causal mask here (4D).
+        causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
+            attention_mask,
+            sequence_length=sequence_length,
+            target_length=target_length,
+            dtype=dtype,
+            cache_position=cache_position,
+            batch_size=input_tensor.shape[0],
+        )
+
+        if (
+            self.config._attn_implementation == "sdpa"
+            and attention_mask is not None
+            and attention_mask.device.type in ["cuda", "xpu", "npu"]
+            and not output_attentions
+        ):
+            # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
+            # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
+            # Details: https://github.com/pytorch/pytorch/issues/110213
+            min_dtype = torch.finfo(dtype).min
+            causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)
+
+        return causal_mask
+
+    @staticmethod
+    def _prepare_4d_causal_attention_mask_with_cache_position(
+        attention_mask: torch.Tensor,
+        sequence_length: int,
+        target_length: int,
+        dtype: torch.dtype,
+        cache_position: torch.Tensor,
+        batch_size: int,
+        **kwargs,
+    ):
+        """
+        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
+        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.
+
+        Args:
+            attention_mask (`torch.Tensor`):
+                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
+                `(batch_size, 1, query_length, key_value_length)`. May be `None`, in which case a purely causal
+                mask is built.
+            sequence_length (`int`):
+                The sequence length being processed.
+            target_length (`int`):
+                The target length: when generating with static cache, the mask should be as long as the static cache,
+                to account for the 0 padding, the part of the cache that is not filled yet.
+            dtype (`torch.dtype`):
+                The dtype to use for the 4D attention mask.
+            cache_position (`torch.Tensor`):
+                Indices depicting the position of the input sequence tokens in the sequence.
+            batch_size (`int`):
+                Batch size.
+
+        Returns:
+            `torch.Tensor`: additive mask holding `torch.finfo(dtype).min` at blocked positions and 0 elsewhere
+            (a 4D input mask is returned untouched).
+        """
+        if attention_mask is not None and attention_mask.dim() == 4:
+            # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
+            causal_mask = attention_mask
+        else:
+            min_dtype = torch.finfo(dtype).min
+            causal_mask = torch.full(
+                (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device
+            )
+            if sequence_length != 1:
+                causal_mask = torch.triu(causal_mask, diagonal=1)
+            # Keep the blocking value only for key slots that lie after each query's cache position.
+            causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1)
+            causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
+            if attention_mask is not None:
+                causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
+                mask_length = attention_mask.shape[-1]
+                # A sum of exactly 0 means "causally visible" (0) AND "padding" (0 in the 2D mask).
+                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
+                    causal_mask.device
+                )
+                padding_mask = padding_mask == 0
+                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
+                    padding_mask, min_dtype
+                )
+
+        return causal_mask
+
+
+def load_balancing_loss_func(
+    gate_logits: Union[torch.Tensor, tuple[torch.Tensor], None],
+    num_experts: Optional[int] = None,
+    top_k=2,
+    attention_mask: Optional[torch.Tensor] = None,
+) -> Union[torch.Tensor, int]:
+    r"""
+    Computes auxiliary load balancing loss as in Switch Transformer - implemented in Pytorch.
+
+    See Switch Transformer (https://huggingface.co/papers/2101.03961) for more details. This function implements the loss
+    function presented in equations (4) - (6) of the paper. It aims at penalizing cases where the routing between
+    experts is too unbalanced.
+
+    Args:
+        gate_logits:
+            Logits from the `gate`, should be a tuple of model.config.num_hidden_layers tensors of
+            shape [batch_size X sequence_length, num_experts]. Anything that is not a non-empty tuple
+            (including `None`) yields a loss of `0`.
+        num_experts:
+            Number of experts. When `None`, it is taken from the last dimension of the gate logits.
+        top_k:
+            The number of experts to route per-token, can be also interpreted as the `top-k` routing
+            parameter.
+        attention_mask (`torch.Tensor`, *optional*):
+            The attention_mask used in forward function
+            shape [batch_size X sequence_length] if not None.
+
+    Returns:
+        The auxiliary loss.
+    """
+    if not isinstance(gate_logits, tuple) or len(gate_logits) == 0:
+        return 0
+
+    # Gather the per-layer logits on the device of the first layer and stack them along the token axis.
+    compute_device = gate_logits[0].device
+    concatenated_gate_logits = torch.cat([layer_gate.to(compute_device) for layer_gate in gate_logits], dim=0)
+
+    if num_experts is None:
+        # `one_hot` and the final scaling both need a concrete expert count.
+        num_experts = concatenated_gate_logits.shape[-1]
+
+    routing_weights = torch.nn.functional.softmax(concatenated_gate_logits, dim=-1)
+
+    _, selected_experts = torch.topk(routing_weights, top_k, dim=-1)
+
+    expert_mask = torch.nn.functional.one_hot(selected_experts, num_experts)
+
+    if attention_mask is None:
+        # Compute the percentage of tokens routed to each experts
+        tokens_per_expert = torch.mean(expert_mask.float(), dim=0)
+
+        # Compute the average probability of routing to these experts
+        router_prob_per_expert = torch.mean(routing_weights, dim=0)
+    else:
+        batch_size, sequence_length = attention_mask.shape
+        num_hidden_layers = concatenated_gate_logits.shape[0] // (batch_size * sequence_length)
+
+        # Compute the mask that masks all padding tokens as 0 with the same shape of expert_mask
+        expert_attention_mask = (
+            attention_mask[None, :, :, None, None]
+            .expand((num_hidden_layers, batch_size, sequence_length, top_k, num_experts))
+            .reshape(-1, top_k, num_experts)
+            .to(compute_device)
+        )
+
+        # Compute the percentage of tokens routed to each experts
+        tokens_per_expert = torch.sum(expert_mask.float() * expert_attention_mask, dim=0) / torch.sum(
+            expert_attention_mask, dim=0
+        )
+
+        # Compute the mask that masks all padding tokens as 0 with the same shape of tokens_per_expert
+        router_per_expert_attention_mask = (
+            attention_mask[None, :, :, None]
+            .expand((num_hidden_layers, batch_size, sequence_length, routing_weights.shape[1]))
+            .reshape(-1, routing_weights.shape[1])
+            .to(compute_device)
+        )
+
+        # Compute the average probability of routing to these experts
+        router_prob_per_expert = torch.sum(routing_weights * router_per_expert_attention_mask, dim=0) / torch.sum(
+            router_per_expert_attention_mask, dim=0
+        )
+
+    logits_width = routing_weights.shape[1]
+    if tokens_per_expert.shape[-1] != logits_width:
+        # The logits cover fewer experts than `num_experts`: keep the original behaviour of picking the
+        # window of experts selected by the device index.
+        device_index = routing_weights.device.index if routing_weights.device.index is not None else 0
+        rank = logits_width * int(device_index)
+        tokens_per_expert = tokens_per_expert[:, rank : rank + logits_width]
+    # When the widths already match, no window is needed; slicing by a non-zero device index there would
+    # produce an empty tensor that cannot be broadcast against the router probabilities.
+
+    overall_loss = torch.sum(tokens_per_expert * router_prob_per_expert.unsqueeze(0))
+    return overall_loss * num_experts
+
+
+class GraniteMoeSharedForCausalLM(GraniteMoeSharedPreTrainedModel, GenerationMixin):  # LM head on GraniteMoeSharedModel
+    _tied_weights_keys = ["lm_head.weight"]
+
+    def __init__(self, config: GraniteMoeSharedConfig):
+        super().__init__(config)
+        self.model = GraniteMoeSharedModel(config)
+        self.vocab_size = config.vocab_size
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+        self.router_aux_loss_coef = config.router_aux_loss_coef  # weight of aux_loss when it is added to loss
+        self.num_experts = config.num_local_experts
+        self.num_experts_per_tok = config.num_experts_per_tok
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Union[Cache, list[torch.FloatTensor]]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        output_router_logits: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
+        **kwargs,
+    ) -> Union[tuple, MoeCausalLMOutputWithPast]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+        Example:
+
+        ```python
+        >>> from transformers import AutoTokenizer, GraniteMoeSharedForCausalLM
+
+        >>> model = GraniteMoeSharedForCausalLM.from_pretrained("ibm/PowerMoE-3b")
+        >>> tokenizer = AutoTokenizer.from_pretrained("ibm/PowerMoE-3b")
+
+        >>> prompt = "Hey, are you conscious? Can you talk to me?"
+        >>> inputs = tokenizer(prompt, return_tensors="pt")
+
+        >>> # Generate
+        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
+        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
+        ```"""
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_router_logits = (
+            output_router_logits if output_router_logits is not None else self.config.output_router_logits
+        )
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # Run the base model; `outputs[0]` is used below as the hidden states fed to `lm_head`
+        outputs = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            output_router_logits=output_router_logits,
+            return_dict=return_dict,
+            cache_position=cache_position,
+            **kwargs,
+        )
+
+        # Only compute necessary logits: int -> last N positions (0 keeps all); tensor -> explicit indices
+        hidden_states = outputs[0]
+        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+        logits = self.lm_head(hidden_states[:, slice_indices, :])
+        logits = logits / self.config.logits_scaling
+
+        loss = None
+        if labels is not None:
+            # Upcast to float if we need to compute the loss to avoid potential precision issues
+            logits = logits.float()
+            # Delegate to the configured loss function; extra kwargs are forwarded to it
+            loss = self.loss_function(
+                logits,
+                labels,
+                vocab_size=self.config.vocab_size,
+                **kwargs,
+            )
+
+        aux_loss = None
+        if output_router_logits:
+            aux_loss = load_balancing_loss_func(
+                outputs.router_logits if return_dict else outputs[-1],  # tuple outputs: taken from the last element
+                self.num_experts,
+                self.num_experts_per_tok,
+                attention_mask,
+            )
+            if labels is not None:
+                loss += self.router_aux_loss_coef * aux_loss.to(loss.device)  # move aux loss to the loss's device first
+
+        if not return_dict:  # tuple output order: (loss if set, aux_loss if requested, logits, *outputs[1:])
+            output = (logits,) + outputs[1:]
+            if output_router_logits:
+                output = (aux_loss,) + output
+            return (loss,) + output if loss is not None else output
+
+        return MoeCausalLMOutputWithPast(
+            loss=loss,
+            aux_loss=aux_loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            router_logits=outputs.router_logits,
+        )
+
+
+__all__ = ["GraniteMoeSharedForCausalLM", "GraniteMoeSharedModel", "GraniteMoeSharedPreTrainedModel"]
diff --git a/phivenv/Lib/site-packages/transformers/models/granitemoeshared/modular_granitemoeshared.py b/phivenv/Lib/site-packages/transformers/models/granitemoeshared/modular_granitemoeshared.py
new file mode 100644
index 0000000000000000000000000000000000000000..4170deca2e1d1f623aad4aa0265b9fbe68256e2f
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/granitemoeshared/modular_granitemoeshared.py
@@ -0,0 +1,200 @@
+# coding=utf-8
+# Copyright 2024 IBM and the HuggingFace Inc. team. All rights reserved.
+#
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Optional, TypedDict
+
+import torch
+from torch import nn
+
+from ...activations import ACT2FN
+from ...cache_utils import Cache
+from ...processing_utils import Unpack
+from ...utils import logging
+from ...utils.deprecation import deprecate_kwarg
+from ..granitemoe.modeling_granitemoe import (
+    GraniteMoeDecoderLayer,
+    GraniteMoeForCausalLM,
+    GraniteMoeModel,
+    GraniteMoePreTrainedModel,
+)
+from .configuration_granitemoeshared import GraniteMoeSharedConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+class GraniteFlashAttentionKwargs(TypedDict, total=False):
+    """
+    Keyword arguments for advanced Flash Attention, causal-conv1d, and mamba_ssm kernel usage.
+    Use cases include padding-free training and fewer `torch.compile` graph breaks. All keys are optional.
+
+    Attributes:
+        cu_seq_lens_q (`torch.LongTensor`):
+            Cumulative sequence length for query state.
+        cu_seq_lens_k (`torch.LongTensor`):
+            Cumulative sequence length for key state.
+        max_length_q (`int`):
+            Maximum sequence length for query state.
+        max_length_k (`int`):
+            Maximum sequence length for key state.
+        seq_idx (`torch.IntTensor`):
+            Index of each packed sequence.
+    """
+
+    cu_seq_lens_q: torch.LongTensor
+    cu_seq_lens_k: torch.LongTensor
+    max_length_q: int
+    max_length_k: int
+    seq_idx: torch.IntTensor
+
+
+class GraniteMoeSharedMLP(nn.Module):
+    """
+    MLP layer for shared experts: the input projection is split in half, gated as `act(a) * b`, then projected back.
+
+    Args:
+        config:
+            Configuration object; `hidden_size`, `shared_intermediate_size` and `hidden_act` are read from it.
+    """
+
+    def __init__(self, config: GraniteMoeSharedConfig):
+        super().__init__()
+
+        self.input_size = config.hidden_size
+        self.hidden_size = config.shared_intermediate_size
+        self.activation = ACT2FN[config.hidden_act]
+        self.input_linear = nn.Linear(self.input_size, self.hidden_size * 2, bias=False)  # 2x wide: chunked in forward
+        self.output_linear = nn.Linear(self.hidden_size, self.input_size, bias=False)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.input_linear(hidden_states)
+        chunked_hidden_states = hidden_states.chunk(2, dim=-1)  # two halves along the last dimension
+        hidden_states = self.activation(chunked_hidden_states[0]) * chunked_hidden_states[1]  # activated half gates the other
+        hidden_states = self.output_linear(hidden_states)  # back to `input_size` features
+        return hidden_states
+
+
+class GraniteMoeSharedDecoderLayer(GraniteMoeDecoderLayer):
+    def __init__(self, config: GraniteMoeSharedConfig, layer_idx: int):
+        super().__init__(config, layer_idx)
+        self.shared_mlp = None if config.shared_intermediate_size == 0 else GraniteMoeSharedMLP(config)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        output_attentions: Optional[bool] = False,
+        use_cache: Optional[bool] = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        output_router_logits: Optional[bool] = False,
+        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
+        **kwargs: Unpack[GraniteFlashAttentionKwargs],
+    ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+            attention_mask (`torch.FloatTensor`, *optional*):
+                attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
+                query_sequence_length, key_sequence_length)` if default attention is used.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            use_cache (`bool`, *optional*):
+                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
+                (see `past_key_values`).
+            past_key_values (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
+            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
+                Indices depicting the position of the input sequence tokens in the sequence
+            output_router_logits (`bool`, *optional*):
+                Whether or not to return the logits of all the routers. They are useful for computing the router loss, and
+                should not be returned during inference.
+            position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
+                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
+                with `head_dim` being the embedding dimension of each attention head.
+            kwargs (`dict`, *optional*):
+                Arbitrary kwargs. Can be used to provide `GraniteFlashAttentionKwargs` for
+                padding-free training and/or improve torch.compile performance.
+        """
+        residual = hidden_states
+
+        hidden_states = self.input_layernorm(hidden_states)
+
+        # Self Attention
+        hidden_states, self_attn_weights = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            output_attentions=output_attentions,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            position_embeddings=position_embeddings,
+            **kwargs,
+        )
+
+        hidden_states = residual + hidden_states * self.residual_multiplier  # scaled residual connection
+
+        # Fully Connected: sparse MoE, plus the shared MLP (if any) applied to the same normalized input
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        moe_hidden_states, router_logits = self.block_sparse_moe(hidden_states)
+
+        if self.shared_mlp is None:  # shared MLP disabled (config.shared_intermediate_size == 0)
+            hidden_states = moe_hidden_states
+        else:
+            hidden_states = moe_hidden_states + self.shared_mlp(hidden_states)
+
+        del moe_hidden_states  # drop the extra local name; `hidden_states` carries the result from here on
+
+        hidden_states = residual + hidden_states * self.residual_multiplier  # scaled residual connection
+
+        outputs = (hidden_states,)  # optional extras are appended in order: attention weights, router logits
+
+        if output_attentions:
+            outputs += (self_attn_weights,)
+
+        if output_router_logits:
+            outputs += (router_logits,)
+
+        return outputs
+
+
+class GraniteMoeSharedPreTrainedModel(GraniteMoePreTrainedModel):  # GraniteMoe base bound to GraniteMoeSharedConfig
+    config: GraniteMoeSharedConfig
+    _no_split_modules = ["GraniteMoeSharedDecoderLayer"]  # NOTE(review): presumably read by the base class; confirm
+
+
+class GraniteMoeSharedModel(GraniteMoeModel):  # GraniteMoeModel whose layers are GraniteMoeSharedDecoderLayer
+    def __init__(self, config: GraniteMoeSharedConfig):
+        super().__init__(config)  # `self.layers` is (re)assigned right below
+        self.layers = nn.ModuleList(
+            [GraniteMoeSharedDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+        )
+
+
+class GraniteMoeSharedForCausalLM(GraniteMoeForCausalLM):  # GraniteMoeForCausalLM with a GraniteMoeSharedModel
+    _tied_weights_keys = ["lm_head.weight"]
+
+    def __init__(self, config: GraniteMoeSharedConfig):
+        super().__init__(config)
+        self.model = GraniteMoeSharedModel(config)  # assigned after the parent initializer has run
+        # Initialize weights and apply final processing
+        self.post_init()
+
+
+__all__ = ["GraniteMoeSharedForCausalLM", "GraniteMoeSharedModel", "GraniteMoeSharedPreTrainedModel"]
diff --git a/phivenv/Lib/site-packages/transformers/models/grounding_dino/__init__.py b/phivenv/Lib/site-packages/transformers/models/grounding_dino/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..68f678c9cf9c9b239a79e1ded06958c43ecbaebe
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/grounding_dino/__init__.py
@@ -0,0 +1,30 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:  # static type checkers only: expose every submodule's public names directly
+    from .configuration_grounding_dino import *
+    from .image_processing_grounding_dino import *
+    from .image_processing_grounding_dino_fast import *
+    from .modeling_grounding_dino import *
+    from .processing_grounding_dino import *
+else:  # at runtime: replace this module's `sys.modules` entry with a `_LazyModule`
+    import sys
+
+    _file = globals()["__file__"]  # path of this file, used to derive the import structure
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/phivenv/Lib/site-packages/transformers/models/grounding_dino/__pycache__/__init__.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/grounding_dino/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e4636332df21e15ff1433f6de85fb14fd8251a0f
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/grounding_dino/__pycache__/__init__.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/grounding_dino/__pycache__/configuration_grounding_dino.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/grounding_dino/__pycache__/configuration_grounding_dino.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6baf42f4d4c02d405f892cf355f61863daa6a51f
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/grounding_dino/__pycache__/configuration_grounding_dino.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/grounding_dino/__pycache__/image_processing_grounding_dino.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/grounding_dino/__pycache__/image_processing_grounding_dino.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..53f8411765a69097a284a01a533b94430b88f9bd
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/grounding_dino/__pycache__/image_processing_grounding_dino.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/grounding_dino/__pycache__/image_processing_grounding_dino_fast.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/grounding_dino/__pycache__/image_processing_grounding_dino_fast.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..141f10f699fbe1b2d6e6109910575cd9ca762e29
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/grounding_dino/__pycache__/image_processing_grounding_dino_fast.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/grounding_dino/__pycache__/modeling_grounding_dino.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/grounding_dino/__pycache__/modeling_grounding_dino.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..83261ee7f0f01175fb4fff1be21bbc0a6d581f66
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/grounding_dino/__pycache__/modeling_grounding_dino.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/grounding_dino/__pycache__/modular_grounding_dino.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/grounding_dino/__pycache__/modular_grounding_dino.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2ef467eeb45404c13d9fd0980e2c71729fb09a11
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/grounding_dino/__pycache__/modular_grounding_dino.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/grounding_dino/__pycache__/processing_grounding_dino.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/grounding_dino/__pycache__/processing_grounding_dino.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..93996823d64bd98ef236bb6e2c609adb03a47313
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/grounding_dino/__pycache__/processing_grounding_dino.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/grounding_dino/configuration_grounding_dino.py b/phivenv/Lib/site-packages/transformers/models/grounding_dino/configuration_grounding_dino.py
new file mode 100644
index 0000000000000000000000000000000000000000..838a897f70afcc099df80de3d9058173a36a0b3e
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/grounding_dino/configuration_grounding_dino.py
@@ -0,0 +1,309 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Grounding DINO model configuration"""
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+from ...utils.backbone_utils import verify_backbone_config_arguments
+from ..auto import CONFIG_MAPPING
+
+
+logger = logging.get_logger(__name__)
+
+
+class GroundingDinoConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`GroundingDinoModel`]. It is used to instantiate a
+    Grounding DINO model according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of the Grounding DINO
+    [IDEA-Research/grounding-dino-tiny](https://huggingface.co/IDEA-Research/grounding-dino-tiny) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        backbone_config (`PretrainedConfig` or `dict`, *optional*, defaults to `ResNetConfig()`):
+            The configuration of the backbone model.
+        backbone (`str`, *optional*):
+            Name of backbone to use when `backbone_config` is `None`. If `use_pretrained_backbone` is `True`, this
+            will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone`
+            is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights.
+        use_pretrained_backbone (`bool`, *optional*, defaults to `False`):
+            Whether to use pretrained weights for the backbone.
+        use_timm_backbone (`bool`, *optional*, defaults to `False`):
+            Whether to load `backbone` from the timm library. If `False`, the backbone is loaded from the transformers
+            library.
+        backbone_kwargs (`dict`, *optional*):
+            Keyword arguments to be passed to AutoBackbone when loading from a checkpoint
+            e.g. `{'out_indices': (0, 1, 2, 3)}`. Cannot be specified if `backbone_config` is set.
+        text_config (`Union[AutoConfig, dict]`, *optional*, defaults to `BertConfig`):
+            The config object or dictionary of the text backbone.
+        num_queries (`int`, *optional*, defaults to 900):
+            Number of object queries, i.e. detection slots. This is the maximal number of objects
+            [`GroundingDinoModel`] can detect in a single image.
+        encoder_layers (`int`, *optional*, defaults to 6):
+            Number of encoder layers.
+        encoder_ffn_dim (`int`, *optional*, defaults to 2048):
+            Dimension of the "intermediate" (often named feed-forward) layer in decoder.
+        encoder_attention_heads (`int`, *optional*, defaults to 8):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        decoder_layers (`int`, *optional*, defaults to 6):
+            Number of decoder layers.
+        decoder_ffn_dim (`int`, *optional*, defaults to 2048):
+            Dimension of the "intermediate" (often named feed-forward) layer in decoder.
+        decoder_attention_heads (`int`, *optional*, defaults to 8):
+            Number of attention heads for each attention layer in the Transformer decoder.
+        is_encoder_decoder (`bool`, *optional*, defaults to `True`):
+            Whether the model is used as an encoder/decoder or not.
+        activation_function (`str` or `function`, *optional*, defaults to `"relu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"silu"` and `"gelu_new"` are supported.
+        d_model (`int`, *optional*, defaults to 256):
+            Dimension of the layers.
+        dropout (`float`, *optional*, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        activation_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for activations inside the fully connected layer.
+        auxiliary_loss (`bool`, *optional*, defaults to `False`):
+            Whether auxiliary decoding losses (loss at each decoder layer) are to be used.
+        position_embedding_type (`str`, *optional*, defaults to `"sine"`):
+            Type of position embeddings to be used on top of the image features. One of `"sine"` or `"learned"`.
+        num_feature_levels (`int`, *optional*, defaults to 4):
+            The number of input feature levels.
+        encoder_n_points (`int`, *optional*, defaults to 4):
+            The number of sampled keys in each feature level for each attention head in the encoder.
+        decoder_n_points (`int`, *optional*, defaults to 4):
+            The number of sampled keys in each feature level for each attention head in the decoder.
+        two_stage (`bool`, *optional*, defaults to `True`):
+            Whether to apply a two-stage deformable DETR, where the region proposals are also generated by a variant of
+            Grounding DINO, which are further fed into the decoder for iterative bounding box refinement.
+        class_cost (`float`, *optional*, defaults to 1.0):
+            Relative weight of the classification error in the Hungarian matching cost.
+        bbox_cost (`float`, *optional*, defaults to 5.0):
+            Relative weight of the L1 error of the bounding box coordinates in the Hungarian matching cost.
+        giou_cost (`float`, *optional*, defaults to 2.0):
+            Relative weight of the generalized IoU loss of the bounding box in the Hungarian matching cost.
+        bbox_loss_coefficient (`float`, *optional*, defaults to 5.0):
+            Relative weight of the L1 bounding box loss in the object detection loss.
+        giou_loss_coefficient (`float`, *optional*, defaults to 2.0):
+            Relative weight of the generalized IoU loss in the object detection loss.
+        focal_alpha (`float`, *optional*, defaults to 0.25):
+            Alpha parameter in the focal loss.
+        disable_custom_kernels (`bool`, *optional*, defaults to `False`):
+            Disable the use of custom CUDA and CPU kernels. This option is necessary for the ONNX export, as custom
+            kernels are not supported by PyTorch ONNX export.
+        max_text_len (`int`, *optional*, defaults to 256):
+            The maximum length of the text input.
+        text_enhancer_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the text enhancer.
+        fusion_droppath (`float`, *optional*, defaults to 0.1):
+            The droppath ratio for the fusion module.
+        fusion_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the fusion module.
+        embedding_init_target (`bool`, *optional*, defaults to `True`):
+            Whether to initialize the target with Embedding weights.
+        query_dim (`int`, *optional*, defaults to 4):
+            The dimension of the query vector.
+        decoder_bbox_embed_share (`bool`, *optional*, defaults to `True`):
+            Whether to share the bbox regression head for all decoder layers.
+        two_stage_bbox_embed_share (`bool`, *optional*, defaults to `False`):
+            Whether to share the bbox embedding between the two-stage bbox generator and the region proposal
+            generation.
+        positional_embedding_temperature (`float`, *optional*, defaults to 20):
+            The temperature for Sine Positional Embedding that is used together with vision backbone.
+        init_std (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the layer normalization layers.
+
+    Examples:
+
+    ```python
+    >>> from transformers import GroundingDinoConfig, GroundingDinoModel
+
+    >>> # Initializing a Grounding DINO IDEA-Research/grounding-dino-tiny style configuration
+    >>> configuration = GroundingDinoConfig()
+
+    >>> # Initializing a model (with random weights) from the IDEA-Research/grounding-dino-tiny style configuration
+    >>> model = GroundingDinoModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "grounding-dino"
+    attribute_map = {
+        "hidden_size": "d_model",
+        "num_attention_heads": "encoder_attention_heads",
+    }
+
+    def __init__(
+        self,
+        backbone_config=None,
+        backbone=None,
+        use_pretrained_backbone=False,
+        use_timm_backbone=False,
+        backbone_kwargs=None,
+        text_config=None,
+        num_queries=900,
+        encoder_layers=6,
+        encoder_ffn_dim=2048,
+        encoder_attention_heads=8,
+        decoder_layers=6,
+        decoder_ffn_dim=2048,
+        decoder_attention_heads=8,
+        is_encoder_decoder=True,
+        activation_function="relu",
+        d_model=256,
+        dropout=0.1,
+        attention_dropout=0.0,
+        activation_dropout=0.0,
+        auxiliary_loss=False,
+        position_embedding_type="sine",
+        num_feature_levels=4,
+        encoder_n_points=4,
+        decoder_n_points=4,
+        two_stage=True,
+        class_cost=1.0,
+        bbox_cost=5.0,
+        giou_cost=2.0,
+        bbox_loss_coefficient=5.0,
+        giou_loss_coefficient=2.0,
+        focal_alpha=0.25,
+        disable_custom_kernels=False,
+        # other parameters
+        max_text_len=256,
+        text_enhancer_dropout=0.0,
+        fusion_droppath=0.1,
+        fusion_dropout=0.0,
+        embedding_init_target=True,
+        query_dim=4,
+        decoder_bbox_embed_share=True,
+        two_stage_bbox_embed_share=False,
+        positional_embedding_temperature=20,
+        init_std=0.02,
+        layer_norm_eps=1e-5,
+        **kwargs,
+    ):
+        if backbone_config is None and backbone is None:
+            logger.info("`backbone_config` is `None`. Initializing the config with the default `Swin` backbone.")
+            backbone_config = CONFIG_MAPPING["swin"](
+                window_size=7,
+                image_size=224,
+                embed_dim=96,
+                depths=[2, 2, 6, 2],
+                num_heads=[3, 6, 12, 24],
+                out_indices=[2, 3, 4],
+            )
+        elif isinstance(backbone_config, dict):
+            backbone_model_type = backbone_config.pop("model_type")
+            config_class = CONFIG_MAPPING[backbone_model_type]
+            backbone_config = config_class.from_dict(backbone_config)
+
+        verify_backbone_config_arguments(
+            use_timm_backbone=use_timm_backbone,
+            use_pretrained_backbone=use_pretrained_backbone,
+            backbone=backbone,
+            backbone_config=backbone_config,
+            backbone_kwargs=backbone_kwargs,
+        )
+
+        if text_config is None:
+            text_config = {}
+            logger.info("text_config is None. Initializing the text config with default values (`BertConfig`).")
+
+        self.backbone_config = backbone_config
+        self.backbone = backbone
+        self.use_pretrained_backbone = use_pretrained_backbone
+        self.use_timm_backbone = use_timm_backbone
+        self.backbone_kwargs = backbone_kwargs
+        self.num_queries = num_queries
+        self.d_model = d_model
+        self.encoder_ffn_dim = encoder_ffn_dim
+        self.encoder_layers = encoder_layers
+        self.encoder_attention_heads = encoder_attention_heads
+        self.decoder_ffn_dim = decoder_ffn_dim
+        self.decoder_layers = decoder_layers
+        self.decoder_attention_heads = decoder_attention_heads
+        self.dropout = dropout
+        self.attention_dropout = attention_dropout
+        self.activation_dropout = activation_dropout
+        self.activation_function = activation_function
+        self.auxiliary_loss = auxiliary_loss
+        self.position_embedding_type = position_embedding_type
+        # deformable attributes
+        self.num_feature_levels = num_feature_levels
+        self.encoder_n_points = encoder_n_points
+        self.decoder_n_points = decoder_n_points
+        self.two_stage = two_stage
+        # Hungarian matcher
+        self.class_cost = class_cost
+        self.bbox_cost = bbox_cost
+        self.giou_cost = giou_cost
+        # Loss coefficients
+        self.bbox_loss_coefficient = bbox_loss_coefficient
+        self.giou_loss_coefficient = giou_loss_coefficient
+        self.focal_alpha = focal_alpha
+        self.disable_custom_kernels = disable_custom_kernels
+        # Text backbone
+        if isinstance(text_config, dict):
+            text_config["model_type"] = text_config.get("model_type", "bert")
+            text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
+        elif text_config is None:
+            text_config = CONFIG_MAPPING["bert"]()
+
+        self.text_config = text_config
+        self.max_text_len = max_text_len
+
+        # Text Enhancer
+        self.text_enhancer_dropout = text_enhancer_dropout
+        # Fusion
+        self.fusion_droppath = fusion_droppath
+        self.fusion_dropout = fusion_dropout
+        # Others
+        self.embedding_init_target = embedding_init_target
+        self.query_dim = query_dim
+        self.decoder_bbox_embed_share = decoder_bbox_embed_share
+        self.two_stage_bbox_embed_share = two_stage_bbox_embed_share
+        if two_stage_bbox_embed_share and not decoder_bbox_embed_share:
+            raise ValueError("If two_stage_bbox_embed_share is True, decoder_bbox_embed_share must be True.")
+        self.positional_embedding_temperature = positional_embedding_temperature
+        self.init_std = init_std
+        self.layer_norm_eps = layer_norm_eps
+        super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs)
+
+    @property
+    def num_attention_heads(self) -> int:
+        """Number of attention heads, read from `encoder_attention_heads`."""
+        heads = self.encoder_attention_heads
+        return heads
+
+    @property
+    def hidden_size(self) -> int:
+        """Hidden size of the model, read from `d_model`."""
+        width = self.d_model
+        return width
+
+    @property
+    def sub_configs(self):
+        """
+        Map each nested config attribute that currently holds a `PretrainedConfig` to its concrete class.
+
+        `backbone_config` and `text_config` are looked up with a `None` fallback, so an attribute that is missing
+        or holds anything other than a `PretrainedConfig` is simply left out of the returned dictionary.
+        """
+        candidates = (
+            ("backbone_config", getattr(self, "backbone_config", None)),
+            ("text_config", getattr(self, "text_config", None)),
+        )
+        return {name: type(config) for name, config in candidates if isinstance(config, PretrainedConfig)}
+
+
+# Names exported by `from <module> import *`: only the configuration class.
+__all__ = ["GroundingDinoConfig"]
diff --git a/phivenv/Lib/site-packages/transformers/models/grounding_dino/image_processing_grounding_dino.py b/phivenv/Lib/site-packages/transformers/models/grounding_dino/image_processing_grounding_dino.py
new file mode 100644
index 0000000000000000000000000000000000000000..4f16a5387a05f3d5e1cc5b147d5d957a23661713
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/grounding_dino/image_processing_grounding_dino.py
@@ -0,0 +1,1621 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Image processor class for Grounding DINO."""
+
+import io
+import pathlib
+from collections import defaultdict
+from collections.abc import Iterable
+from typing import TYPE_CHECKING, Any, Callable, Optional, Union
+
+import numpy as np
+
+from ...feature_extraction_utils import BatchFeature
+from ...image_processing_utils import BaseImageProcessor, get_size_dict
+from ...image_transforms import (
+    PaddingMode,
+    center_to_corners_format,
+    corners_to_center_format,
+    id_to_rgb,
+    pad,
+    rescale,
+    resize,
+    rgb_to_id,
+    to_channel_dimension_format,
+)
+from ...image_utils import (
+    IMAGENET_DEFAULT_MEAN,
+    IMAGENET_DEFAULT_STD,
+    ChannelDimension,
+    ImageInput,
+    PILImageResampling,
+    get_image_size,
+    infer_channel_dimension_format,
+    is_scaled_image,
+    make_list_of_images,
+    to_numpy_array,
+    valid_images,
+    validate_annotations,
+    validate_kwargs,
+    validate_preprocess_arguments,
+)
+from ...utils import (
+    ExplicitEnum,
+    TensorType,
+    is_flax_available,
+    is_jax_tensor,
+    is_scipy_available,
+    is_tf_available,
+    is_tf_tensor,
+    is_torch_available,
+    is_torch_tensor,
+    is_vision_available,
+    logging,
+)
+
+
+if is_torch_available():
+    import torch
+    from torch import nn
+
+
+if is_vision_available():
+    import PIL
+
+if is_scipy_available():
+    import scipy.special
+    import scipy.stats
+
+if TYPE_CHECKING:
+    from .modeling_grounding_dino import GroundingDinoObjectDetectionOutput
+
+
+# Module-level logger, named after this module.
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+# Type alias for one annotation dictionary: string keys mapping to an int, a str or a list of dicts.
+AnnotationType = dict[str, Union[int, str, list[dict]]]
+
+
+class AnnotationFormat(ExplicitEnum):
+    """String identifiers of the annotation formats named in this module."""
+
+    # COCO object-detection style annotations.
+    COCO_DETECTION = "coco_detection"
+    # COCO panoptic-segmentation style annotations.
+    COCO_PANOPTIC = "coco_panoptic"
+
+
+# Tuple of the `AnnotationFormat` members that count as supported.
+SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC)
+
+
+# Copied from transformers.models.detr.image_processing_detr.get_size_with_aspect_ratio
+def get_size_with_aspect_ratio(image_size, size, max_size=None) -> tuple[int, int]:
+    """
+    Compute the `(height, width)` to resize to so that the shorter edge becomes `size` while the aspect ratio is
+    kept, optionally shrinking the target so that the longer edge does not exceed `max_size`.
+
+    Args:
+        image_size (`tuple[int, int]`):
+            The input image size as `(height, width)`.
+        size (`int`):
+            The desired length of the shorter edge.
+        max_size (`int`, *optional*):
+            The maximum allowed length of the longer edge.
+
+    Returns:
+        `tuple[int, int]`: The output size as `(height, width)`.
+    """
+    height, width = image_size
+    # Un-rounded short-edge target; only set when `max_size` forces a smaller size than requested.
+    exact_size = None
+    if max_size is not None:
+        short_edge = float(min((height, width)))
+        long_edge = float(max((height, width)))
+        if long_edge / short_edge * size > max_size:
+            exact_size = max_size * short_edge / long_edge
+            size = int(round(exact_size))
+
+    # The shorter edge already has the requested length: keep the dimensions as they are.
+    if (height <= width and height == size) or (width <= height and width == size):
+        return (height, width)
+
+    # Derive the long edge from the un-rounded target when there is one, so the rounding of `size` is not amplified.
+    scale_base = size if exact_size is None else exact_size
+    if width < height:
+        return (int(scale_base * height / width), size)
+    return (size, int(scale_base * width / height))
+
+
+# Copied from transformers.models.detr.image_processing_detr.get_resize_output_image_size
+def get_resize_output_image_size(
+    input_image: np.ndarray,
+    size: Union[int, tuple[int, int], list[int]],
+    max_size: Optional[int] = None,
+    input_data_format: Optional[Union[str, ChannelDimension]] = None,
+) -> tuple[int, int]:
+    """
+    Determine the size an image should be resized to.
+
+    A `size` given as a tuple or list is returned unchanged. An integer `size` is treated as the target for the
+    shorter edge and the result is computed by `get_size_with_aspect_ratio`, which keeps the aspect ratio.
+
+    Args:
+        input_image (`np.ndarray`):
+            The image to resize.
+        size (`int` or `tuple[int, int]` or `list[int]`):
+            The desired output size.
+        max_size (`int`, *optional*):
+            The maximum allowed output size.
+        input_data_format (`ChannelDimension` or `str`, *optional*):
+            The channel dimension format of the input image. If not provided, it will be inferred from the input image.
+    """
+    # The image size is queried before the type check, matching the evaluation order callers already rely on.
+    current_size = get_image_size(input_image, input_data_format)
+    if not isinstance(size, (list, tuple)):
+        return get_size_with_aspect_ratio(current_size, size, max_size)
+    return size
+
+
+# Copied from transformers.models.detr.image_processing_detr.get_image_size_for_max_height_width
+def get_image_size_for_max_height_width(
+    input_image: np.ndarray,
+    max_height: int,
+    max_width: int,
+    input_data_format: Optional[Union[str, ChannelDimension]] = None,
+) -> tuple[int, int]:
+    """
+    Scale the image size, keeping the aspect ratio, by the largest factor for which the result still fits inside
+    `max_height` x `max_width`. The factor may be larger than 1, so small images are scaled up until one edge
+    reaches its limit.
+
+    For example:
+        - input_size: (100, 200), max_height: 50, max_width: 50 -> output_size: (25, 50)
+        - input_size: (100, 200), max_height: 200, max_width: 500 -> output_size: (200, 400)
+
+    Args:
+        input_image (`np.ndarray`):
+            The image to resize.
+        max_height (`int`):
+            The maximum allowed height.
+        max_width (`int`):
+            The maximum allowed width.
+        input_data_format (`ChannelDimension` or `str`, *optional*):
+            The channel dimension format of the input image. If not provided, it will be inferred from the input image.
+
+    Returns:
+        `tuple[int, int]`: The new `(height, width)`, each truncated to an integer.
+    """
+    height, width = get_image_size(input_image, input_data_format)
+    # The smaller of the two per-axis factors is the one that keeps both edges within their limits.
+    scale = min(max_height / height, max_width / width)
+    return int(height * scale), int(width * scale)
+
+
+# Copied from transformers.models.detr.image_processing_detr.get_numpy_to_framework_fn
+def get_numpy_to_framework_fn(arr) -> Callable:
+    """
+    Pick the constructor that builds an array/tensor of the same framework as `arr`.
+
+    Args:
+        arr (`np.ndarray`): The array to convert.
+
+    Returns:
+        `Callable`: `np.array`, `tf.convert_to_tensor`, `torch.tensor` or `jnp.array`, depending on the type of
+        `arr`.
+
+    Raises:
+        `ValueError`: If `arr` belongs to none of the frameworks checked.
+    """
+    # Frameworks are imported lazily so that only the one actually matching `arr` has to be importable.
+    if isinstance(arr, np.ndarray):
+        converter = np.array
+    elif is_tf_available() and is_tf_tensor(arr):
+        import tensorflow as tf
+
+        converter = tf.convert_to_tensor
+    elif is_torch_available() and is_torch_tensor(arr):
+        import torch
+
+        converter = torch.tensor
+    elif is_flax_available() and is_jax_tensor(arr):
+        import jax.numpy as jnp
+
+        converter = jnp.array
+    else:
+        raise ValueError(f"Cannot convert arrays of type {type(arr)}")
+    return converter
+
+
+# Copied from transformers.models.detr.image_processing_detr.safe_squeeze
+def safe_squeeze(arr: np.ndarray, axis: Optional[int] = None) -> np.ndarray:
+    """
+    Squeeze `arr` without failing when the requested axis cannot be removed.
+
+    With `axis=None` every length-1 dimension is dropped. With an explicit `axis`, the array is returned untouched
+    if squeezing that axis raises a `ValueError`.
+    """
+    if axis is not None:
+        try:
+            squeezed = arr.squeeze(axis=axis)
+        except ValueError:
+            squeezed = arr
+        return squeezed
+
+    return arr.squeeze()
+
+
+# Copied from transformers.models.detr.image_processing_detr.normalize_annotation
+def normalize_annotation(annotation: dict, image_size: tuple[int, int]) -> dict:
+    """
+    Return a copy of `annotation` whose `"boxes"` entry is converted with `corners_to_center_format` and divided
+    by the image width/height. Every other entry is carried over unchanged.
+
+    Args:
+        annotation (`dict`):
+            The annotation dictionary.
+        image_size (`tuple[int, int]`):
+            The image size as `(height, width)`.
+    """
+    height, width = image_size
+    normalized = {}
+    for name, item in annotation.items():
+        if name != "boxes":
+            normalized[name] = item
+            continue
+        center_boxes = corners_to_center_format(item)
+        # x-like columns are divided by the width, y-like columns by the height.
+        center_boxes /= np.asarray([width, height, width, height], dtype=np.float32)
+        normalized[name] = center_boxes
+    return normalized
+
+
+# Copied from transformers.models.detr.image_processing_detr.max_across_indices
+def max_across_indices(values: Iterable[Any]) -> list[Any]:
+    """
+    Compute the element-wise maximum over an iterable of sequences: entry `i` of the result is the largest `i`-th
+    element found in any of the sequences.
+    """
+    return list(map(max, zip(*values)))
+
+
+# Copied from transformers.models.detr.image_processing_detr.get_max_height_width
+def get_max_height_width(
+    images: list[np.ndarray], input_data_format: Optional[Union[str, ChannelDimension]] = None
+) -> list[int]:
+    """
+    Find the largest height and the largest width over a batch of images.
+
+    Args:
+        images (`list[np.ndarray]`):
+            The images of the batch.
+        input_data_format (`ChannelDimension` or `str`, *optional*):
+            The channel dimension format of the images. If not provided, it is inferred from the first image.
+
+    Returns:
+        The pair `(max_height, max_width)`.
+
+    Raises:
+        `ValueError`: If the channel dimension format is neither channels-first nor channels-last.
+    """
+    if input_data_format is None:
+        input_data_format = infer_channel_dimension_format(images[0])
+
+    if input_data_format == ChannelDimension.FIRST:
+        _, tallest, widest = max_across_indices([img.shape for img in images])
+        return (tallest, widest)
+    if input_data_format == ChannelDimension.LAST:
+        tallest, widest, _ = max_across_indices([img.shape for img in images])
+        return (tallest, widest)
+    raise ValueError(f"Invalid channel dimension format: {input_data_format}")
+
+
+# Copied from transformers.models.detr.image_processing_detr.make_pixel_mask
+def make_pixel_mask(
+    image: np.ndarray, output_size: tuple[int, int], input_data_format: Optional[Union[str, ChannelDimension]] = None
+) -> np.ndarray:
+    """
+    Build an `int64` mask of shape `output_size` that is 1 over the area covered by the image (anchored at the
+    top-left corner) and 0 over the padding.
+
+    Args:
+        image (`np.ndarray`):
+            Image to make the pixel mask for.
+        output_size (`tuple[int, int]`):
+            Output size of the mask.
+        input_data_format (`ChannelDimension` or `str`, *optional*):
+            The channel dimension format of the image, forwarded to `get_image_size`.
+    """
+    valid_height, valid_width = get_image_size(image, channel_dim=input_data_format)
+    pixel_mask = np.zeros(output_size, dtype=np.int64)
+    pixel_mask[:valid_height, :valid_width] = 1
+    return pixel_mask
+
+
+# Copied from transformers.models.detr.image_processing_detr.convert_coco_poly_to_mask
+def convert_coco_poly_to_mask(segmentations, height: int, width: int) -> np.ndarray:
+    """
+    Rasterize COCO polygon annotations into one mask per object.
+
+    Args:
+        segmentations (`list[list[float]]`):
+            List of polygons, each polygon represented by a list of x-y coordinates.
+        height (`int`):
+            Height of the mask.
+        width (`int`):
+            Width of the mask.
+
+    Returns:
+        `np.ndarray`: The stacked masks with the object index as first axis, or an empty `uint8` array of shape
+        `(0, height, width)` when `segmentations` is empty.
+
+    Raises:
+        `ImportError`: If `pycocotools` is not installed.
+    """
+    try:
+        from pycocotools import mask as coco_mask
+    except ImportError:
+        raise ImportError("Pycocotools is not installed in your environment.")
+
+    def _rasterize(polygons):
+        # Decode the object's polygons and merge them into a single 2-D mask.
+        rles = coco_mask.frPyObjects(polygons, height, width)
+        decoded = coco_mask.decode(rles)
+        if decoded.ndim < 3:
+            decoded = decoded[..., None]
+        return np.any(np.asarray(decoded, dtype=np.uint8), axis=2)
+
+    per_object = [_rasterize(polygons) for polygons in segmentations]
+    if not per_object:
+        return np.zeros((0, height, width), dtype=np.uint8)
+    return np.stack(per_object, axis=0)
+
+
+# Copied from transformers.models.detr.image_processing_detr.prepare_coco_detection_annotation with DETR->GroundingDino
+def prepare_coco_detection_annotation(
+    image,
+    target,
+    return_segmentation_masks: bool = False,
+    input_data_format: Optional[Union[ChannelDimension, str]] = None,
+):
+    """
+    Convert the target in COCO format into the format expected by GroundingDino.
+
+    Args:
+        image:
+            The image the annotations belong to; only its height and width are read.
+        target:
+            Dictionary with an `"image_id"` entry and an `"annotations"` list. Each annotation is read for
+            `"category_id"`, `"area"` and `"bbox"`, and optionally `"iscrowd"`, `"keypoints"` and `"segmentation"`.
+        return_segmentation_masks (`bool`, *optional*, defaults to `False`):
+            Whether to also convert the `"segmentation"` polygons into a `"masks"` entry.
+        input_data_format (`ChannelDimension` or `str`, *optional*):
+            The channel dimension format of `image`, forwarded to `get_image_size`.
+
+    Returns:
+        `dict`: A new dictionary with the keys `"image_id"`, `"class_labels"`, `"boxes"`, `"area"`, `"iscrowd"` and
+        `"orig_size"`, plus `"keypoints"` and/or `"masks"` when applicable.
+    """
+    image_height, image_width = get_image_size(image, channel_dim=input_data_format)
+
+    image_id = target["image_id"]
+    image_id = np.asarray([image_id], dtype=np.int64)
+
+    # Get all COCO annotations for the given image.
+    annotations = target["annotations"]
+    # Crowd annotations are dropped: only entries without an "iscrowd" key or with `iscrowd == 0` are kept.
+    annotations = [obj for obj in annotations if "iscrowd" not in obj or obj["iscrowd"] == 0]
+
+    classes = [obj["category_id"] for obj in annotations]
+    classes = np.asarray(classes, dtype=np.int64)
+
+    # for conversion to coco api
+    area = np.asarray([obj["area"] for obj in annotations], dtype=np.float32)
+    iscrowd = np.asarray([obj.get("iscrowd", 0) for obj in annotations], dtype=np.int64)
+
+    boxes = [obj["bbox"] for obj in annotations]
+    # guard against no boxes via resizing
+    boxes = np.asarray(boxes, dtype=np.float32).reshape(-1, 4)
+    # Add the first two columns to the last two, turning (x, y, width, height) into corner coordinates.
+    boxes[:, 2:] += boxes[:, :2]
+    # Clip x coordinates to [0, image_width] and y coordinates to [0, image_height].
+    boxes[:, 0::2] = boxes[:, 0::2].clip(min=0, max=image_width)
+    boxes[:, 1::2] = boxes[:, 1::2].clip(min=0, max=image_height)
+
+    # Keep only boxes that still have a strictly positive width and height after clipping.
+    keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0])
+
+    new_target = {}
+    new_target["image_id"] = image_id
+    new_target["class_labels"] = classes[keep]
+    new_target["boxes"] = boxes[keep]
+    new_target["area"] = area[keep]
+    new_target["iscrowd"] = iscrowd[keep]
+    new_target["orig_size"] = np.asarray([int(image_height), int(image_width)], dtype=np.int64)
+
+    # Keypoints are only handled when the first remaining annotation carries them.
+    if annotations and "keypoints" in annotations[0]:
+        keypoints = [obj["keypoints"] for obj in annotations]
+        # Converting the filtered keypoints list to a numpy array
+        keypoints = np.asarray(keypoints, dtype=np.float32)
+        # Apply the keep mask here to filter the relevant annotations
+        keypoints = keypoints[keep]
+        num_keypoints = keypoints.shape[0]
+        # Flatten to rows of 3 values; an empty selection is left in its original shape.
+        keypoints = keypoints.reshape((-1, 3)) if num_keypoints else keypoints
+        new_target["keypoints"] = keypoints
+
+    if return_segmentation_masks:
+        segmentation_masks = [obj["segmentation"] for obj in annotations]
+        masks = convert_coco_poly_to_mask(segmentation_masks, image_height, image_width)
+        new_target["masks"] = masks[keep]
+
+    return new_target
+
+
+# Copied from transformers.models.detr.image_processing_detr.masks_to_boxes
+def masks_to_boxes(masks: np.ndarray) -> np.ndarray:
+    """
+    Compute the bounding boxes around the provided panoptic segmentation masks.
+
+    Args:
+        masks: masks in format `[number_masks, height, width]`
+
+    Returns:
+        boxes: bounding boxes in format `[number_masks, 4]` in xyxy format
+    """
+    if masks.size == 0:
+        return np.zeros((0, 4))
+
+    height, width = masks.shape[-2:]
+    rows = np.arange(0, height, dtype=np.float32)
+    cols = np.arange(0, width, dtype=np.float32)
+    # see https://github.com/pytorch/pytorch/issues/50276
+    row_grid, col_grid = np.meshgrid(rows, cols, indexing="ij")
+    background = ~(np.array(masks, dtype=bool))
+
+    def _extent(grid):
+        # Smallest and largest coordinate of `grid` covered by each mask.
+        weighted = masks * np.expand_dims(grid, axis=0)
+        upper = weighted.reshape(weighted.shape[0], -1).max(-1)
+        # Background pixels get a huge value so that they never win the minimum.
+        lower = np.ma.array(weighted, mask=background).filled(fill_value=1e8)
+        lower = lower.reshape(lower.shape[0], -1).min(-1)
+        return lower, upper
+
+    x_min, x_max = _extent(col_grid)
+    y_min, y_max = _extent(row_grid)
+
+    return np.stack([x_min, y_min, x_max, y_max], 1)
+
+
+# Copied from transformers.models.detr.image_processing_detr.prepare_coco_panoptic_annotation with DETR->GroundingDino
+def prepare_coco_panoptic_annotation(
+    image: np.ndarray,
+    target: dict,
+    masks_path: Union[str, pathlib.Path],
+    return_masks: bool = True,
+    input_data_format: Union[ChannelDimension, str] = None,
+) -> dict:
+    """
+    Prepare a coco panoptic annotation for GroundingDino.
+
+    Args:
+        image (`np.ndarray`):
+            The image the annotation belongs to; only its height and width are read.
+        target (`dict`):
+            The panoptic annotation. `"file_name"` names the segmentation image below `masks_path`, the image id
+            is taken from `"image_id"` (or `"id"` when that key is absent) and `"segments_info"`, if present,
+            lists the segments.
+        masks_path (`str` or `pathlib.Path`):
+            Directory containing the segmentation images.
+        return_masks (`bool`, *optional*, defaults to `True`):
+            Whether to include the per-segment binary masks in the result.
+        input_data_format (`ChannelDimension` or `str`, *optional*):
+            The channel dimension format of `image`, forwarded to `get_image_size`.
+
+    Returns:
+        `dict`: A dictionary with `"image_id"`, `"size"` and `"orig_size"`, extended by `"masks"` (optional),
+        `"boxes"`, `"class_labels"`, `"iscrowd"` and `"area"` when `target` has `"segments_info"`.
+    """
+    image_height, image_width = get_image_size(image, channel_dim=input_data_format)
+    annotation_path = pathlib.Path(masks_path) / target["file_name"]
+
+    image_id = target["image_id"] if "image_id" in target else target["id"]
+    new_target = {
+        "image_id": np.asarray([image_id], dtype=np.int64),
+        "size": np.asarray([image_height, image_width], dtype=np.int64),
+        "orig_size": np.asarray([image_height, image_width], dtype=np.int64),
+    }
+
+    if "segments_info" not in target:
+        return new_target
+
+    # Decode the segmentation image into a map of segment ids, then split it into one binary mask per segment.
+    segment_map = rgb_to_id(np.asarray(PIL.Image.open(annotation_path), dtype=np.uint32))
+    segment_ids = np.array([segment["id"] for segment in target["segments_info"]])
+    masks = (segment_map == segment_ids[:, None, None]).astype(np.uint8)
+
+    if return_masks:
+        new_target["masks"] = masks
+    new_target["boxes"] = masks_to_boxes(masks)
+    new_target["class_labels"] = np.array(
+        [segment["category_id"] for segment in target["segments_info"]], dtype=np.int64
+    )
+    new_target["iscrowd"] = np.asarray([segment["iscrowd"] for segment in target["segments_info"]], dtype=np.int64)
+    new_target["area"] = np.asarray([segment["area"] for segment in target["segments_info"]], dtype=np.float32)
+
+    return new_target
+
+
+# Copied from transformers.models.detr.image_processing_detr.get_segmentation_image
+def get_segmentation_image(
+    masks: np.ndarray, input_size: tuple, target_size: tuple, stuff_equiv_classes, deduplicate=False
+):
+    """
+    Turn per-mask scores into a single RGB-encoded segmentation image resized to `target_size`.
+
+    Every pixel is assigned the index of the mask with the highest softmax score at that pixel.
+
+    Args:
+        masks (`np.ndarray`):
+            Flattened mask scores of shape `(num_masks, height * width)`.
+        input_size (`tuple`):
+            The `(height, width)` the flattened masks correspond to.
+        target_size (`tuple`):
+            The `(height, width)` of the returned segmentation image.
+        stuff_equiv_classes (`dict`):
+            Maps a class to the list of mask indices predicted for it; used only when `deduplicate=True`.
+        deduplicate (`bool`, *optional*, defaults to `False`):
+            Whether to merge all mask indices listed for the same class into the first one.
+
+    Returns:
+        The segmentation image produced by `id_to_rgb`, resized with nearest-neighbour resampling.
+    """
+    h, w = input_size
+    final_h, final_w = target_size
+
+    # Bring the mask axis last so that softmax/argmax compete across masks for every pixel. On a 2-D numpy array
+    # `masks.transpose(0, 1)` is an identity permutation (unlike `torch.Tensor.transpose`), so the axes are given
+    # explicitly in swapped order.
+    m_id = scipy.special.softmax(np.transpose(masks, (1, 0)), -1)
+
+    if m_id.shape[-1] == 0:
+        # We didn't detect any mask :(
+        m_id = np.zeros((h, w), dtype=np.int64)
+    else:
+        m_id = m_id.argmax(-1).reshape(h, w)
+
+    if deduplicate:
+        # Merge the masks corresponding to the same stuff class
+        for equiv in stuff_equiv_classes.values():
+            for eq_id in equiv:
+                m_id[m_id == eq_id] = equiv[0]
+
+    seg_img = id_to_rgb(m_id)
+    # `resize` takes its size as (height, width), and `get_mask_area` reshapes the result as
+    # `(final_h, final_w, 3)`.
+    seg_img = resize(seg_img, (final_h, final_w), resample=PILImageResampling.NEAREST)
+    return seg_img
+
+
+# Copied from transformers.models.detr.image_processing_detr.get_mask_area
+def get_mask_area(seg_img: np.ndarray, target_size: tuple[int, int], n_classes: int) -> np.ndarray:
+    """
+    Count, for each segment id in `range(n_classes)`, the number of pixels of `seg_img` carrying that id.
+
+    Args:
+        seg_img (`np.ndarray`):
+            RGB-encoded segmentation image; it is cast to `uint8` and reshaped to `(height, width, 3)`.
+        target_size (`tuple[int, int]`):
+            The `(height, width)` of `seg_img`.
+        n_classes (`int`):
+            Number of segment ids to count.
+
+    Returns:
+        A list with one pixel count per segment id.
+    """
+    final_h, final_w = target_size
+    rgb = seg_img.astype(np.uint8).reshape(final_h, final_w, 3)
+    segment_ids = rgb_to_id(rgb)
+    return [(segment_ids == segment).sum() for segment in range(n_classes)]
+
+
+# Copied from transformers.models.detr.image_processing_detr.score_labels_from_class_probabilities
+def score_labels_from_class_probabilities(logits: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
+    """
+    Apply a softmax over the last axis of `logits` and return, for every entry, the probability and the index of
+    the most likely class.
+
+    Returns:
+        `tuple[np.ndarray, np.ndarray]`: `(scores, labels)`, both with the last axis of `logits` removed.
+    """
+    class_probs = scipy.special.softmax(logits, axis=-1)
+    # Keep the reduced axis so the winning indices can be used to gather their probabilities.
+    best = class_probs.argmax(-1, keepdims=True)
+    best_probs = np.take_along_axis(class_probs, best, axis=-1)
+    return best_probs.squeeze(-1), best.squeeze(-1)
+
+
+# Copied from transformers.models.detr.image_processing_detr.post_process_panoptic_sample
+def post_process_panoptic_sample(
+    out_logits: np.ndarray,
+    masks: np.ndarray,
+    boxes: np.ndarray,
+    processed_size: tuple[int, int],
+    target_size: tuple[int, int],
+    is_thing_map: dict,
+    threshold=0.85,
+) -> dict:
+    """
+    Converts the output of [`DetrForSegmentation`] into panoptic segmentation predictions for a single sample.
+
+    Args:
+        out_logits (`np.ndarray`):
+            The logits for this sample.
+        masks (`np.ndarray`):
+            The predicted segmentation masks for this sample.
+        boxes (`np.ndarray`):
+            The predicted bounding boxes for this sample. The boxes are in the normalized format `(center_x, center_y,
+            width, height)` and values between `[0, 1]`, relative to the size the image (disregarding padding).
+        processed_size (`tuple[int, int]`):
+            The processed size of the image `(height, width)`, as returned by the preprocessing step i.e. the size
+            after data augmentation but before batching.
+        target_size (`tuple[int, int]`):
+            The target size of the image, `(height, width)` corresponding to the requested final size of the
+            prediction.
+        is_thing_map (`Dict`):
+            A dictionary mapping class indices to a boolean value indicating whether the class is a thing or not.
+        threshold (`float`, *optional*, defaults to 0.85):
+            The minimum class score a query needs in order to be kept.
+
+    Returns:
+        `dict`: A dictionary with `"png_string"` (the PNG-encoded segmentation image as bytes) and
+        `"segments_info"` (one dictionary with `"id"`, `"isthing"`, `"category_id"` and `"area"` per segment).
+
+    Raises:
+        `ValueError`: If the number of kept boxes differs from the number of kept classes.
+    """
+    # we filter empty queries and detection below threshold
+    scores, labels = score_labels_from_class_probabilities(out_logits)
+    # The last class index is treated as "no object".
+    keep = (labels != out_logits.shape[-1] - 1) & (scores > threshold)
+
+    cur_scores = scores[keep]
+    cur_classes = labels[keep]
+    cur_boxes = center_to_corners_format(boxes[keep])
+
+    if len(cur_boxes) != len(cur_classes):
+        raise ValueError("Not as many boxes as there are classes")
+
+    cur_masks = masks[keep]
+    cur_masks = resize(cur_masks[:, None], processed_size, resample=PILImageResampling.BILINEAR)
+    cur_masks = safe_squeeze(cur_masks, 1)
+    b, h, w = cur_masks.shape
+
+    # It may be that we have several predicted masks for the same stuff class.
+    # In the following, we track the list of masks ids for each stuff class (they are merged later on)
+    cur_masks = cur_masks.reshape(b, -1)
+    stuff_equiv_classes = defaultdict(list)
+    for k, label in enumerate(cur_classes):
+        if not is_thing_map[label]:
+            stuff_equiv_classes[label].append(k)
+
+    seg_img = get_segmentation_image(cur_masks, processed_size, target_size, stuff_equiv_classes, deduplicate=True)
+    # Areas are measured on the rendered segmentation image (at `target_size`), exactly as in the loop below.
+    area = get_mask_area(seg_img, target_size, n_classes=len(cur_scores))
+
+    # We filter out any mask that is too small
+    # `size` is an integer attribute of numpy arrays, not a method.
+    if cur_classes.size > 0:
+        # We now filter empty masks as long as we find some
+        filtered_small = np.array([a <= 4 for a in area], dtype=bool)
+        while filtered_small.any():
+            cur_masks = cur_masks[~filtered_small]
+            cur_scores = cur_scores[~filtered_small]
+            cur_classes = cur_classes[~filtered_small]
+            seg_img = get_segmentation_image(cur_masks, (h, w), target_size, stuff_equiv_classes, deduplicate=True)
+            area = get_mask_area(seg_img, target_size, n_classes=len(cur_scores))
+            filtered_small = np.array([a <= 4 for a in area], dtype=bool)
+    else:
+        # Nothing was kept: `area` is empty, so this placeholder yields no segment below.
+        cur_classes = np.ones((1, 1), dtype=np.int64)
+
+    segments_info = [
+        {"id": i, "isthing": is_thing_map[cat], "category_id": int(cat), "area": a}
+        for i, (cat, a) in enumerate(zip(cur_classes, area))
+    ]
+    del cur_classes
+
+    with io.BytesIO() as out:
+        PIL.Image.fromarray(seg_img).save(out, format="PNG")
+        predictions = {"png_string": out.getvalue(), "segments_info": segments_info}
+
+    return predictions
+
+
+# Copied from transformers.models.detr.image_processing_detr.resize_annotation
+def resize_annotation(
+    annotation: dict[str, Any],
+    orig_size: tuple[int, int],
+    target_size: tuple[int, int],
+    threshold: float = 0.5,
+    resample: PILImageResampling = PILImageResampling.NEAREST,
+):
+    """
+    Rescale an annotation so that it matches an image resized from `orig_size` to `target_size`.
+
+    Boxes are multiplied by the width/height ratios, areas by their product, masks are resized and re-binarized,
+    and `"size"` is set to `target_size`. All other entries are carried over unchanged.
+
+    Args:
+        annotation (`dict[str, Any]`):
+            The annotation dictionary.
+        orig_size (`tuple[int, int]`):
+            The original size of the input image.
+        target_size (`tuple[int, int]`):
+            The target size of the image, as returned by the preprocessing `resize` step.
+        threshold (`float`, *optional*, defaults to 0.5):
+            The threshold used to binarize the segmentation masks.
+        resample (`PILImageResampling`, defaults to `PILImageResampling.NEAREST`):
+            The resampling filter to use when resizing the masks.
+
+    Returns:
+        `dict[str, Any]`: The resized annotation, always containing a `"size"` entry.
+    """
+    ratio_height, ratio_width = (float(new) / float(old) for new, old in zip(target_size, orig_size))
+
+    resized = {"size": target_size}
+    for name, item in annotation.items():
+        if name == "boxes":
+            box_scale = np.asarray([ratio_width, ratio_height, ratio_width, ratio_height], dtype=np.float32)
+            resized["boxes"] = item * box_scale
+        elif name == "area":
+            resized["area"] = item * (ratio_width * ratio_height)
+        elif name == "masks":
+            # Each mask gets a leading axis for `resize`; that axis is dropped again when thresholding.
+            stacked = np.array([resize(mask, target_size, resample=resample) for mask in item[:, None]])
+            resized["masks"] = stacked.astype(np.float32)[:, 0] > threshold
+        elif name == "size":
+            resized["size"] = target_size
+        else:
+            resized[name] = item
+
+    return resized
+
+
+# Copied from transformers.models.detr.image_processing_detr.binary_mask_to_rle
+def binary_mask_to_rle(mask):
+    """
+    Converts given binary mask of shape `(height, width)` to the run-length encoding (RLE) format.
+
+    Args:
+        mask (`torch.Tensor` or `numpy.array`):
+            A binary mask tensor of shape `(height, width)` where 0 denotes background and 1 denotes the target
+            segment_id or class_id. Torch tensors may live on any device and may track gradients.
+    Returns:
+        `List`: Run-length encoded list of the binary mask. Refer to COCO API for more information about the RLE
+        format.
+    """
+    if is_torch_tensor(mask):
+        # `Tensor.numpy()` alone raises for tensors on an accelerator or tensors that require grad.
+        mask = mask.detach().cpu().numpy()
+
+    pixels = mask.flatten()
+    # Pad with background on both sides so every run has a start and an end transition.
+    pixels = np.concatenate([[0], pixels, [0]])
+    # 1-based positions at which the value changes: run starts at even indices, run ends at odd indices.
+    runs = np.where(pixels[1:] != pixels[:-1])[0] + 1
+    # Replace each run end by the run length, giving (start, length) pairs.
+    runs[1::2] -= runs[::2]
+    return list(runs)
+
+
+# Copied from transformers.models.detr.image_processing_detr.convert_segmentation_to_rle
+def convert_segmentation_to_rle(segmentation):
+    """
+    Converts given segmentation map of shape `(height, width)` to the run-length encoding (RLE) format.
+
+    Args:
+        segmentation (`torch.Tensor` or `numpy.array`):
+            A segmentation map of shape `(height, width)` where each value denotes a segment or class id.
+    Returns:
+        `list[List]`: A list of lists, where each list is the run-length encoding of a segment / class id. The
+        encodings follow the order of the ids returned by `torch.unique`.
+    """
+    return [
+        binary_mask_to_rle(torch.where(segmentation == segment_id, 1, 0))
+        for segment_id in torch.unique(segmentation)
+    ]
+
+
+# Copied from transformers.models.detr.image_processing_detr.remove_low_and_no_objects
+def remove_low_and_no_objects(masks, scores, labels, object_mask_threshold, num_labels):
+    """
+    Drop the queries that are labelled `num_labels` or whose score is not above `object_mask_threshold`, and
+    return the `masks`, `scores` and `labels` of the remaining queries.
+
+    Args:
+        masks (`torch.Tensor`):
+            A tensor of shape `(num_queries, height, width)`.
+        scores (`torch.Tensor`):
+            A tensor of shape `(num_queries)`.
+        labels (`torch.Tensor`):
+            A tensor of shape `(num_queries)`.
+        object_mask_threshold (`float`):
+            The score a query has to exceed in order to be kept.
+        num_labels (`int`):
+            The label value whose queries are discarded.
+    Raises:
+        `ValueError`: Raised when the first dimension doesn't match in all input tensors.
+    Returns:
+        `tuple[`torch.Tensor`, `torch.Tensor`, `torch.Tensor`]`: The `masks`, `scores` and `labels` of the queries
+        that were kept.
+    """
+    if masks.shape[0] != scores.shape[0] or scores.shape[0] != labels.shape[0]:
+        raise ValueError("mask, scores and labels must have the same shape!")
+
+    has_object = labels != num_labels
+    confident = scores > object_mask_threshold
+    selected = has_object & confident
+
+    return masks[selected], scores[selected], labels[selected]
+
+
+# Copied from transformers.models.detr.image_processing_detr.check_segment_validity
+def check_segment_validity(mask_labels, mask_probs, k, mask_threshold=0.5, overlap_mask_area_threshold=0.8):
+    """
+    Decide whether query `k` yields a segment; returns the verdict and the boolean map `mask_labels == k`.
+    """
+    # Pixels assigned to query k, and how many of them there are
+    query_mask = mask_labels == k
+    assigned_area = query_mask.sum()
+
+    # Pixels where query k itself reaches the threshold, regardless of the assignment
+    thresholded_area = (mask_probs[k] >= mask_threshold).sum()
+    is_valid = assigned_area > 0 and thresholded_area > 0
+
+    # Reject the segment unless the assigned share of its thresholded area is strictly above the overlap threshold
+    if is_valid and not (assigned_area / thresholded_area).item() > overlap_mask_area_threshold:
+        is_valid = False
+    return is_valid, query_mask
+
+
+# Copied from transformers.models.detr.image_processing_detr.compute_segments
+def compute_segments(
+    mask_probs,
+    pred_scores,
+    pred_labels,
+    mask_threshold: float = 0.5,
+    overlap_mask_area_threshold: float = 0.8,
+    label_ids_to_fuse: Optional[set[int]] = None,
+    target_size: Optional[tuple[int, int]] = None,
+):
+    """
+    Merge per-query mask probabilities into one segmentation map plus a list describing each segment.
+
+    Args:
+        mask_probs (`torch.Tensor`): Mask probabilities indexed as `(query, height, width)`. Left unmodified.
+        pred_scores (`torch.Tensor`): One score per query, used to weigh that query's mask.
+        pred_labels (`torch.Tensor`): One class id per query.
+        mask_threshold, overlap_mask_area_threshold (`float`): Forwarded to `check_segment_validity`.
+        label_ids_to_fuse (`set[int]`, *optional*): Class ids whose instances share one id; `None` fuses nothing.
+        target_size (`tuple[int, int]`, *optional*): `(height, width)` the masks are bilinearly resized to.
+    Returns:
+        `tuple`: The int32 segmentation map and the list of segment dicts (`id`, `label_id`, `was_fused`, `score`).
+    """
+    # `None` is the default and used to crash the membership test in the loop; treat it as "fuse nothing"
+    label_ids_to_fuse = set() if label_ids_to_fuse is None else label_ids_to_fuse
+    if target_size is not None:
+        mask_probs = nn.functional.interpolate(
+            mask_probs.unsqueeze(0), size=target_size, mode="bilinear", align_corners=False
+        )[0]
+    height, width = mask_probs.shape[1], mask_probs.shape[2]
+    segmentation = torch.zeros((height, width), dtype=torch.int32, device=mask_probs.device)
+    segments: list[dict] = []
+
+    # Weigh each mask by its prediction score, out of place so that the caller's tensor is left untouched
+    mask_probs = mask_probs * pred_scores.view(-1, 1, 1)
+    mask_labels = mask_probs.argmax(0)  # [height, width]
+    # Segment id already assigned to each fused class
+    stuff_memory_list: dict[int, int] = {}
+    # Ids come from a counter of their own: reusing a fused id must not rewind it onto ids already handed out
+    last_new_segment_id = 0
+    for k in range(pred_labels.shape[0]):
+        pred_class = pred_labels[k].item()
+        should_fuse = pred_class in label_ids_to_fuse
+        # Check if mask exists and large enough to be a segment
+        mask_exists, mask_k = check_segment_validity(
+            mask_labels, mask_probs, k, mask_threshold, overlap_mask_area_threshold
+        )
+        if not mask_exists:
+            continue
+        if pred_class not in stuff_memory_list:
+            last_new_segment_id += 1
+        current_segment_id = stuff_memory_list.get(pred_class, last_new_segment_id)
+        # Add current object segment to final segmentation map
+        segmentation[mask_k] = current_segment_id
+        score = round(pred_scores[k].item(), 6)
+        segments.append({"id": current_segment_id, "label_id": pred_class, "was_fused": should_fuse, "score": score})
+        if should_fuse:
+            stuff_memory_list[pred_class] = current_segment_id
+    return segmentation, segments
+
+
+# Copied from transformers.models.owlvit.image_processing_owlvit._scale_boxes
+def _scale_boxes(boxes, target_sizes):
+    """
+    Multiply each box of a batch by the `(width, height, width, height)` of its image's target size.
+
+    Args:
+        boxes (`torch.Tensor` of shape `(batch_size, num_boxes, 4)`):
+            Bounding boxes to scale. Each box is expected to be in (x1, y1, x2, y2) format.
+        target_sizes (`list[tuple[int, int]]` or `torch.Tensor` of shape `(batch_size, 2)`):
+            Target sizes to scale the boxes to. Each target size is expected to be in (height, width) format.
+
+    Returns:
+        `torch.Tensor` of shape `(batch_size, num_boxes, 4)`: Scaled bounding boxes.
+    """
+    if isinstance(target_sizes, torch.Tensor):
+        heights, widths = target_sizes.unbind(1)
+    elif isinstance(target_sizes, (list, tuple)):
+        heights = torch.tensor([target_size[0] for target_size in target_sizes])
+        widths = torch.tensor([target_size[1] for target_size in target_sizes])
+    else:
+        raise TypeError("`target_sizes` must be a list, tuple or torch.Tensor")
+
+    # x coordinates scale with the width, y coordinates with the height
+    per_image_scale = torch.stack([widths, heights, widths, heights], dim=1)
+    # Extra axis so that one scale row broadcasts over all boxes of its image
+    per_image_scale = per_image_scale.unsqueeze(1).to(boxes.device)
+    return boxes * per_image_scale
+
+
+class GroundingDinoImageProcessor(BaseImageProcessor):
+    r"""
+    Constructs a Grounding DINO image processor.
+
+    Args:
+        format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`):
+            Data format of the annotations. One of "coco_detection" or "coco_panoptic".
+        do_resize (`bool`, *optional*, defaults to `True`):
+            Controls whether to resize the image's (height, width) dimensions to the specified `size`. Can be
+            overridden by the `do_resize` parameter in the `preprocess` method.
+        size (`dict[str, int]` *optional*, defaults to `{"shortest_edge": 800, "longest_edge": 1333}`):
+            Size of the image's `(height, width)` dimensions after resizing. Can be overridden by the `size` parameter
+            in the `preprocess` method. Available options are:
+                - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`.
+                    Do NOT keep the aspect ratio.
+                - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting
+                    the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge
+                    less or equal to `longest_edge`.
+                - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the
+                    aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to
+                    `max_width`.
+        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`):
+            Resampling filter to use if resizing the image.
+        do_rescale (`bool`, *optional*, defaults to `True`):
+            Controls whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the
+            `do_rescale` parameter in the `preprocess` method.
+        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
+            Scale factor to use if rescaling the image, i.e. the value the image is multiplied by when
+            `do_rescale` is enabled. Can be overridden by the `rescale_factor` parameter in the `preprocess`
+            method.
+        do_normalize (`bool`, *optional*, defaults to `True`):
+            Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
+            method.
+        image_mean (`float` or `list[float]`, *optional*, defaults to `IMAGENET_DEFAULT_MEAN`):
+            Mean values to use when normalizing the image. Can be a single value or a list of values, one for each
+            channel. Can be overridden by the `image_mean` parameter in the `preprocess` method.
+        image_std (`float` or `list[float]`, *optional*, defaults to `IMAGENET_DEFAULT_STD`):
+            Standard deviation values to use when normalizing the image. Can be a single value or a list of values, one
+            for each channel. Can be overridden by the `image_std` parameter in the `preprocess` method.
+        do_convert_annotations (`bool`, *optional*, defaults to `True`):
+            Controls whether to convert the annotations to the format expected by the DETR model. Converts the
+            bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`.
+            Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
+        do_pad (`bool`, *optional*, defaults to `True`):
+            Controls whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess`
+            method. If `True`, padding will be applied to the bottom and right of the image with zeros.
+            If `pad_size` is provided, the image will be padded to the specified dimensions.
+            Otherwise, the image will be padded to the maximum height and width of the batch.
+        pad_size (`dict[str, int]`, *optional*):
+            The size `{"height": int, "width": int}` to pad the images to. Must be larger than any image size
+            provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
+            height and width in the batch.
+    """
+
+    model_input_names = ["pixel_values", "pixel_mask"]
+
+    # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.__init__
+    def __init__(
+        self,
+        format: Union[str, AnnotationFormat] = AnnotationFormat.COCO_DETECTION,
+        do_resize: bool = True,
+        size: Optional[dict[str, int]] = None,
+        resample: PILImageResampling = PILImageResampling.BILINEAR,
+        do_rescale: bool = True,
+        rescale_factor: Union[int, float] = 1 / 255,
+        do_normalize: bool = True,
+        image_mean: Optional[Union[float, list[float]]] = None,
+        image_std: Optional[Union[float, list[float]]] = None,
+        do_convert_annotations: Optional[bool] = None,
+        do_pad: bool = True,
+        pad_size: Optional[dict[str, int]] = None,
+        **kwargs,
+    ) -> None:
+        """
+        Store the preprocessing defaults, honouring the legacy `pad_and_return_pixel_mask` and `max_size` kwargs.
+        """
+        # Legacy alias of `do_pad`
+        do_pad = kwargs.pop("pad_and_return_pixel_mask", do_pad)
+        # Unless the deprecated kwarg is passed, `max_size` is 1333 when a `size` was given and `None` otherwise
+        max_size = 1333 if size is not None else None
+        if "max_size" in kwargs:
+            logger.warning_once(
+                "The `max_size` parameter is deprecated and will be removed in v4.26. "
+                "Please specify in `size['longest_edge'] instead`.",
+            )
+            max_size = kwargs.pop("max_size")
+
+        if size is None:
+            size = {"shortest_edge": 800, "longest_edge": 1333}
+        size = get_size_dict(size, max_size=max_size, default_to_square=False)
+
+        super().__init__(**kwargs)
+        self.format = format
+        self.do_resize = do_resize
+        self.size = size
+        self.resample = resample
+        self.do_rescale = do_rescale
+        self.rescale_factor = rescale_factor
+        self.do_normalize = do_normalize
+        # Backwards compatibility: an unset `do_convert_annotations` follows `do_normalize`
+        self.do_convert_annotations = do_normalize if do_convert_annotations is None else do_convert_annotations
+        self.image_mean = IMAGENET_DEFAULT_MEAN if image_mean is None else image_mean
+        self.image_std = IMAGENET_DEFAULT_STD if image_std is None else image_std
+        self.do_pad = do_pad
+        self.pad_size = pad_size
+        self._valid_processor_keys = [
+            "images",
+            "annotations",
+            "return_segmentation_masks",
+            "masks_path",
+            "do_resize",
+            "size",
+            "resample",
+            "do_rescale",
+            "rescale_factor",
+            "do_normalize",
+            "do_convert_annotations",
+            "image_mean",
+            "image_std",
+            "do_pad",
+            "pad_size",
+            "format",
+            "return_tensors",
+            "data_format",
+            "input_data_format",
+        ]
+
+    @classmethod
+    # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.from_dict with Detr->GroundingDino
+    def from_dict(cls, image_processor_dict: dict[str, Any], **kwargs):
+        """
+        Build the image processor from a dict, moving the legacy `max_size` and `pad_and_return_pixel_mask`
+        kwargs into (a copy of) that dict first so they are applied when the processor is created, e.g.
+        `GroundingDinoImageProcessor.from_pretrained(checkpoint, size=600, max_size=800)`
+        """
+        # Work on a copy so that the caller's dict gains no keys
+        processor_config = image_processor_dict.copy()
+        for legacy_key in ("max_size", "pad_and_return_pixel_mask"):
+            if legacy_key in kwargs:
+                processor_config[legacy_key] = kwargs.pop(legacy_key)
+        return super().from_dict(processor_config, **kwargs)
+
+    # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.prepare_annotation with DETR->GroundingDino
+    def prepare_annotation(
+        self,
+        image: np.ndarray,
+        target: dict,
+        format: Optional[AnnotationFormat] = None,
+        return_segmentation_masks: Optional[bool] = None,
+        masks_path: Optional[Union[str, pathlib.Path]] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    ) -> dict:
+        """
+        Convert a COCO detection or COCO panoptic `target` into the annotation fed to the GroundingDino model.
+        """
+        format = self.format if format is None else format
+
+        if format == AnnotationFormat.COCO_DETECTION:
+            # Detection annotations come without masks unless they are explicitly requested
+            with_masks = False if return_segmentation_masks is None else return_segmentation_masks
+            return prepare_coco_detection_annotation(
+                image, target, with_masks, input_data_format=input_data_format
+            )
+        if format == AnnotationFormat.COCO_PANOPTIC:
+            # Panoptic annotations include masks unless they are explicitly disabled
+            with_masks = True if return_segmentation_masks is None else return_segmentation_masks
+            return prepare_coco_panoptic_annotation(
+                image,
+                target,
+                masks_path=masks_path,
+                return_masks=with_masks,
+                input_data_format=input_data_format,
+            )
+        raise ValueError(f"Format {format} is not supported.")
+
+    # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.resize
+    def resize(
+        self,
+        image: np.ndarray,
+        size: dict[str, int],
+        resample: PILImageResampling = PILImageResampling.BILINEAR,
+        data_format: Optional[ChannelDimension] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+        **kwargs,
+    ) -> np.ndarray:
+        """
+        Resize the image to the size described by `size`, which either fixes the output `(height, width)` or
+        bounds it while keeping the aspect ratio. The deprecated `max_size` kwarg is still accepted.
+
+        Args:
+            image (`np.ndarray`):
+                Image to resize.
+            size (`dict[str, int]`):
+                Size of the image's `(height, width)` dimensions after resizing. Available options are:
+                    - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`.
+                        Do NOT keep the aspect ratio.
+                    - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting
+                        the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge
+                        less or equal to `longest_edge`.
+                    - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the
+                        aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to
+                        `max_width`.
+            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
+                Resampling filter to use if resizing the image.
+            data_format (`str` or `ChannelDimension`, *optional*):
+                The channel dimension format for the output image. If unset, the channel dimension format of the input
+                image is used.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format of the input image. If not provided, it will be inferred.
+        """
+        max_size = None
+        if "max_size" in kwargs:
+            logger.warning_once(
+                "The `max_size` parameter is deprecated and will be removed in v4.26. "
+                "Please specify in `size['longest_edge'] instead`.",
+            )
+            max_size = kwargs.pop("max_size")
+        size = get_size_dict(size, max_size=max_size, default_to_square=False)
+
+        # Work out the output `(height, width)` from whichever pair of keys `size` carries
+        if size.keys() >= {"shortest_edge", "longest_edge"}:
+            output_size = get_resize_output_image_size(
+                image, size["shortest_edge"], size["longest_edge"], input_data_format=input_data_format
+            )
+        elif size.keys() >= {"max_height", "max_width"}:
+            output_size = get_image_size_for_max_height_width(
+                image, size["max_height"], size["max_width"], input_data_format=input_data_format
+            )
+        elif size.keys() >= {"height", "width"}:
+            output_size = (size["height"], size["width"])
+        else:
+            raise ValueError(
+                "Size must contain 'height' and 'width' keys or 'shortest_edge' and 'longest_edge' keys. Got"
+                f" {size.keys()}."
+            )
+        return resize(
+            image,
+            size=output_size,
+            resample=resample,
+            data_format=data_format,
+            input_data_format=input_data_format,
+            **kwargs,
+        )
+
+    # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.resize_annotation
+    def resize_annotation(
+        self,
+        annotation,
+        orig_size,
+        size,
+        resample: PILImageResampling = PILImageResampling.NEAREST,
+    ) -> dict:
+        """
+        Resize `annotation` from `orig_size` to `size` by delegating to the module-level `resize_annotation`.
+        """
+        resized = resize_annotation(annotation, orig_size=orig_size, target_size=size, resample=resample)
+        return resized
+
+    # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.rescale
+    def rescale(
+        self,
+        image: np.ndarray,
+        rescale_factor: float,
+        data_format: Optional[Union[str, ChannelDimension]] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    ) -> np.ndarray:
+        """
+        Multiply the image by `rescale_factor`, i.e. image = image * rescale_factor.
+
+        Args:
+            image (`np.ndarray`):
+                Image to rescale.
+            rescale_factor (`float`):
+                The factor the image is multiplied by.
+            data_format (`str` or `ChannelDimension`, *optional*):
+                The channel dimension format for the output image; the input image's format when unset. One of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+            input_data_format (`str` or `ChannelDimension`, *optional*):
+                The channel dimension format for the input image. If unset, is inferred from the input image. Can be
+                one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+        """
+        scaled_image = rescale(image, rescale_factor, data_format=data_format, input_data_format=input_data_format)
+        return scaled_image
+
+    # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.normalize_annotation
+    def normalize_annotation(self, annotation: dict, image_size: tuple[int, int]) -> dict:
+        """
+        Convert the annotation's boxes from absolute corner format to relative `[center_x, center_y, width, height]`.
+        """
+        normalized = normalize_annotation(annotation, image_size=image_size)
+        return normalized
+
+    # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._update_annotation_for_padded_image
+    def _update_annotation_for_padded_image(
+        self,
+        annotation: dict,
+        input_image_size: tuple[int, int],
+        output_image_size: tuple[int, int],
+        padding,
+        update_bboxes,
+    ) -> dict:
+        """
+        Return a new annotation adjusted to an image padded from `input_image_size` to `output_image_size`.
+
+        Args:
+            annotation (`dict`): Annotation of the unpadded image. Neither the dict nor its boxes are modified.
+            input_image_size (`tuple[int, int]`): `(height, width)` of the image before padding.
+            output_image_size (`tuple[int, int]`): `(height, width)` after padding; stored under `"size"`.
+            padding: Padding specification forwarded to `pad` for the `"masks"` entry.
+            update_bboxes (`bool`): Whether to rescale `"boxes"`; when falsy they are passed through unchanged.
+
+        Returns:
+            `dict`: The new annotation. Entries other than `"masks"`, `"boxes"` and `"size"` are reused as is.
+        """
+        # `"size"` always reflects the padded image, whatever the incoming annotation says
+        new_annotation = {"size": output_image_size}
+        for key, value in annotation.items():
+            if key == "masks":
+                masks = pad(
+                    value, padding, mode=PaddingMode.CONSTANT, constant_values=0, input_data_format=ChannelDimension.FIRST
+                )
+                new_annotation["masks"] = safe_squeeze(masks, 1)
+            elif key == "boxes" and update_bboxes:
+                boxes = np.asarray(value)
+                width_ratio = input_image_size[1] / output_image_size[1]
+                height_ratio = input_image_size[0] / output_image_size[0]
+                # Out of place: `*=` would overwrite the caller's array and raises on integer boxes
+                scaled = boxes * np.asarray([width_ratio, height_ratio, width_ratio, height_ratio])
+                if np.issubdtype(boxes.dtype, np.floating):
+                    # Keep the dtype the in-place multiplication used to preserve (float32 stays float32)
+                    scaled = scaled.astype(boxes.dtype, copy=False)
+                new_annotation["boxes"] = scaled
+            elif key != "size":
+                new_annotation[key] = value
+        return new_annotation
+
+    # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._pad_image
+    def _pad_image(
+        self,
+        image: np.ndarray,
+        output_size: tuple[int, int],
+        annotation: Optional[dict[str, Any]] = None,
+        constant_values: Union[float, Iterable[float]] = 0,
+        data_format: Optional[ChannelDimension] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+        update_bboxes: bool = True,
+    ) -> np.ndarray:
+        """
+        Pad `image` with `constant_values` on the bottom and right up to `output_size`, adjusting `annotation`
+        (when given) to the padded image. Returns the `(padded_image, annotation)` pair.
+        """
+        image_height, image_width = get_image_size(image, channel_dim=input_data_format)
+        padded_height, padded_width = output_size
+        # Only the bottom and the right edge grow
+        padding = ((0, padded_height - image_height), (0, padded_width - image_width))
+        padded_image = pad(
+            image,
+            padding,
+            mode=PaddingMode.CONSTANT,
+            constant_values=constant_values,
+            data_format=data_format,
+            input_data_format=input_data_format,
+        )
+        if annotation is None:
+            return padded_image, None
+        padded_annotation = self._update_annotation_for_padded_image(
+            annotation, (image_height, image_width), (padded_height, padded_width), padding, update_bboxes
+        )
+        return padded_image, padded_annotation
+
+    # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.pad
+    def pad(
+        self,
+        images: list[np.ndarray],
+        annotations: Optional[Union[AnnotationType, list[AnnotationType]]] = None,
+        constant_values: Union[float, Iterable[float]] = 0,
+        return_pixel_mask: bool = True,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        data_format: Optional[ChannelDimension] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+        update_bboxes: bool = True,
+        pad_size: Optional[dict[str, int]] = None,
+    ) -> BatchFeature:
+        """
+        Pads every image of a batch on the bottom and right to one common size (`pad_size`, or else the largest
+        height and width in the batch) and optionally returns the corresponding pixel masks.
+
+        Args:
+            images (list[`np.ndarray`]):
+                Images to pad.
+            annotations (`AnnotationType` or `list[AnnotationType]`, *optional*):
+                Annotations to transform according to the padding that is applied to the images.
+            constant_values (`float` or `Iterable[float]`, *optional*):
+                The value to use for the padding (the images are padded in constant mode).
+            return_pixel_mask (`bool`, *optional*, defaults to `True`):
+                Whether to return a pixel mask.
+            return_tensors (`str` or `TensorType`, *optional*):
+                The type of tensors to return. Can be one of:
+                    - Unset: Return a list of `np.ndarray`.
+                    - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
+                    - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
+                    - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
+                    - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
+            data_format (`str` or `ChannelDimension`, *optional*):
+                The channel dimension format of the image. If not provided, it will be the same as the input image.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format of the input image. If not provided, it will be inferred.
+            update_bboxes (`bool`, *optional*, defaults to `True`):
+                Whether to update the bounding boxes in the annotations to match the padded images. If the
+                bounding boxes have not been converted to relative coordinates and `(centre_x, centre_y, width, height)`
+                format, the bounding boxes will not be updated.
+            pad_size (`dict[str, int]`, *optional*):
+                The size `{"height": int, "width": int}` to pad the images to. Must be larger than any image size
+                provided for preprocessing. If `pad_size` is not provided, `self.pad_size` is used, and when that
+                is unset too the images are padded to the largest height and width in the batch.
+        """
+        if pad_size is None:
+            pad_size = self.pad_size
+        if pad_size is None:
+            # No explicit size: pad everything to the largest height and width found in the batch
+            padded_size = get_max_height_width(images, input_data_format=input_data_format)
+        else:
+            padded_size = (pad_size["height"], pad_size["width"])
+
+        # Pad each image together with its annotation (`None` when no annotations were given)
+        annotation_list = [None] * len(images) if annotations is None else annotations
+        padded_pairs = [
+            self._pad_image(
+                image,
+                padded_size,
+                annotation,
+                constant_values=constant_values,
+                data_format=data_format,
+                input_data_format=input_data_format,
+                update_bboxes=update_bboxes,
+            )
+            for image, annotation in zip(images, annotation_list)
+        ]
+
+        data = {"pixel_values": [padded_image for padded_image, _ in padded_pairs]}
+        if return_pixel_mask:
+            data["pixel_mask"] = [
+                make_pixel_mask(image=image, output_size=padded_size, input_data_format=input_data_format)
+                for image in images
+            ]
+
+        encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors)
+
+        # Labels are only attached when annotations were passed in
+        if annotations is not None:
+            encoded_inputs["labels"] = [
+                BatchFeature(padded_annotation, tensor_type=return_tensors) for _, padded_annotation in padded_pairs
+            ]
+
+        return encoded_inputs
+
+    # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.preprocess
+    def preprocess(
+        self,
+        images: ImageInput,
+        annotations: Optional[Union[AnnotationType, list[AnnotationType]]] = None,
+        return_segmentation_masks: Optional[bool] = None,
+        masks_path: Optional[Union[str, pathlib.Path]] = None,
+        do_resize: Optional[bool] = None,
+        size: Optional[dict[str, int]] = None,
+        resample=None,  # PILImageResampling
+        do_rescale: Optional[bool] = None,
+        rescale_factor: Optional[Union[int, float]] = None,
+        do_normalize: Optional[bool] = None,
+        do_convert_annotations: Optional[bool] = None,
+        image_mean: Optional[Union[float, list[float]]] = None,
+        image_std: Optional[Union[float, list[float]]] = None,
+        do_pad: Optional[bool] = None,
+        format: Optional[Union[str, AnnotationFormat]] = None,
+        return_tensors: Optional[Union[TensorType, str]] = None,
+        data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+        pad_size: Optional[dict[str, int]] = None,
+        **kwargs,
+    ) -> BatchFeature:
+        """
+        Preprocess an image or a batch of images so that it can be used by the model.
+
+        Args:
+            images (`ImageInput`):
+                Image or batch of images to preprocess. Expects a single or batch of images with pixel values ranging
+                from 0 to 255. If passing in images with pixel values between 0 and 1, set `do_rescale=False`.
+            annotations (`AnnotationType` or `list[AnnotationType]`, *optional*):
+                List of annotations associated with the image or batch of images. If annotation is for object
+                detection, the annotations should be a dictionary with the following keys:
+                - "image_id" (`int`): The image id.
+                - "annotations" (`list[Dict]`): List of annotations for an image. Each annotation should be a
+                  dictionary. An image can have no annotations, in which case the list should be empty.
+                If annotation is for segmentation, the annotations should be a dictionary with the following keys:
+                - "image_id" (`int`): The image id.
+                - "segments_info" (`list[Dict]`): List of segments for an image. Each segment should be a dictionary.
+                  An image can have no segments, in which case the list should be empty.
+                - "file_name" (`str`): The file name of the image.
+            return_segmentation_masks (`bool`, *optional*, defaults to self.return_segmentation_masks):
+                Whether to return segmentation masks.
+            masks_path (`str` or `pathlib.Path`, *optional*):
+                Path to the directory containing the segmentation masks.
+            do_resize (`bool`, *optional*, defaults to self.do_resize):
+                Whether to resize the image.
+            size (`dict[str, int]`, *optional*, defaults to self.size):
+                Size of the image's `(height, width)` dimensions after resizing. Available options are:
+                    - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`.
+                        Do NOT keep the aspect ratio.
+                    - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting
+                        the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge
+                        less or equal to `longest_edge`.
+                    - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the
+                        aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to
+                        `max_width`.
+            resample (`PILImageResampling`, *optional*, defaults to self.resample):
+                Resampling filter to use when resizing the image.
+            do_rescale (`bool`, *optional*, defaults to self.do_rescale):
+                Whether to rescale the image.
+            rescale_factor (`float`, *optional*, defaults to self.rescale_factor):
+                Rescale factor to use when rescaling the image.
+            do_normalize (`bool`, *optional*, defaults to self.do_normalize):
+                Whether to normalize the image.
+            do_convert_annotations (`bool`, *optional*, defaults to self.do_convert_annotations):
+                Whether to convert the annotations to the format expected by the model. Converts the bounding
+                boxes from the format `(top_left_x, top_left_y, width, height)` to `(center_x, center_y, width, height)`
+                and in relative coordinates.
+            image_mean (`float` or `list[float]`, *optional*, defaults to self.image_mean):
+                Mean to use when normalizing the image.
+            image_std (`float` or `list[float]`, *optional*, defaults to self.image_std):
+                Standard deviation to use when normalizing the image.
+            do_pad (`bool`, *optional*, defaults to self.do_pad):
+                Whether to pad the image. If `True`, padding will be applied to the bottom and right of
+                the image with zeros. If `pad_size` is provided, the image will be padded to the specified
+                dimensions. Otherwise, the image will be padded to the maximum height and width of the batch.
+            format (`str` or `AnnotationFormat`, *optional*, defaults to self.format):
+                Format of the annotations.
+            return_tensors (`str` or `TensorType`, *optional*, defaults to self.return_tensors):
+                Type of tensors to return. If `None`, will return the list of images.
+            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
+                The channel dimension format for the output image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - Unset: Use the channel dimension format of the input image.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the input image. If unset, the channel dimension format is inferred
+                from the input image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+            pad_size (`dict[str, int]`, *optional*):
+                The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size
+                provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
+                height and width in the batch.
+        """
+        if "pad_and_return_pixel_mask" in kwargs:
+            logger.warning_once(
+                "The `pad_and_return_pixel_mask` argument is deprecated and will be removed in a future version, "
+                "use `do_pad` instead."
+            )
+            do_pad = kwargs.pop("pad_and_return_pixel_mask")
+
+        if "max_size" in kwargs:
+            logger.warning_once(
+                "The `max_size` argument is deprecated and will be removed in a future version, use"
+                " `size['longest_edge']` instead."
+            )
+            size = kwargs.pop("max_size")
+
+        do_resize = self.do_resize if do_resize is None else do_resize
+        size = self.size if size is None else size
+        size = get_size_dict(size=size, default_to_square=False)
+        resample = self.resample if resample is None else resample
+        do_rescale = self.do_rescale if do_rescale is None else do_rescale
+        rescale_factor = self.rescale_factor if rescale_factor is None else rescale_factor
+        do_normalize = self.do_normalize if do_normalize is None else do_normalize
+        image_mean = self.image_mean if image_mean is None else image_mean
+        image_std = self.image_std if image_std is None else image_std
+        do_convert_annotations = (
+            self.do_convert_annotations if do_convert_annotations is None else do_convert_annotations
+        )
+        do_pad = self.do_pad if do_pad is None else do_pad
+        pad_size = self.pad_size if pad_size is None else pad_size
+        format = self.format if format is None else format
+
+        images = make_list_of_images(images)
+
+        if not valid_images(images):
+            raise ValueError(
+                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
+                "torch.Tensor, tf.Tensor or jax.ndarray."
+            )
+        validate_kwargs(captured_kwargs=kwargs.keys(), valid_processor_keys=self._valid_processor_keys)
+
+        # Here, the pad() method pads to the maximum of (width, height). It does not need to be validated.
+        validate_preprocess_arguments(
+            do_rescale=do_rescale,
+            rescale_factor=rescale_factor,
+            do_normalize=do_normalize,
+            image_mean=image_mean,
+            image_std=image_std,
+            do_resize=do_resize,
+            size=size,
+            resample=resample,
+        )
+
+        if annotations is not None and isinstance(annotations, dict):
+            annotations = [annotations]
+
+        if annotations is not None and len(images) != len(annotations):
+            raise ValueError(
+                f"The number of images ({len(images)}) and annotations ({len(annotations)}) do not match."
+            )
+
+        format = AnnotationFormat(format)
+        if annotations is not None:
+            validate_annotations(format, SUPPORTED_ANNOTATION_FORMATS, annotations)
+
+        if (
+            masks_path is not None
+            and format == AnnotationFormat.COCO_PANOPTIC
+            and not isinstance(masks_path, (pathlib.Path, str))
+        ):
+            raise ValueError(
+                "The path to the directory containing the mask PNG files should be provided as a"
+                f" `pathlib.Path` or string object, but is {type(masks_path)} instead."
+            )
+
+        # All transformations expect numpy arrays
+        images = [to_numpy_array(image) for image in images]
+
+        if do_rescale and is_scaled_image(images[0]):
+            logger.warning_once(
+                "It looks like you are trying to rescale already rescaled images. If the input"
+                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
+            )
+
+        if input_data_format is None:
+            # We assume that all images have the same channel dimension format.
+            input_data_format = infer_channel_dimension_format(images[0])
+
+        # prepare (COCO annotations as a list of Dict -> DETR target as a single Dict per image)
+        if annotations is not None:
+            prepared_images = []
+            prepared_annotations = []
+            for image, target in zip(images, annotations):
+                target = self.prepare_annotation(
+                    image,
+                    target,
+                    format,
+                    return_segmentation_masks=return_segmentation_masks,
+                    masks_path=masks_path,
+                    input_data_format=input_data_format,
+                )
+                prepared_images.append(image)
+                prepared_annotations.append(target)
+            images = prepared_images
+            annotations = prepared_annotations
+            del prepared_images, prepared_annotations
+
+        # transformations
+        if do_resize:
+            if annotations is not None:
+                resized_images, resized_annotations = [], []
+                for image, target in zip(images, annotations):
+                    orig_size = get_image_size(image, input_data_format)
+                    resized_image = self.resize(
+                        image, size=size, resample=resample, input_data_format=input_data_format
+                    )
+                    resized_annotation = self.resize_annotation(
+                        target, orig_size, get_image_size(resized_image, input_data_format)
+                    )
+                    resized_images.append(resized_image)
+                    resized_annotations.append(resized_annotation)
+                images = resized_images
+                annotations = resized_annotations
+                del resized_images, resized_annotations
+            else:
+                images = [
+                    self.resize(image, size=size, resample=resample, input_data_format=input_data_format)
+                    for image in images
+                ]
+
+        if do_rescale:
+            images = [self.rescale(image, rescale_factor, input_data_format=input_data_format) for image in images]
+
+        if do_normalize:
+            images = [
+                self.normalize(image, image_mean, image_std, input_data_format=input_data_format) for image in images
+            ]
+
+        if do_convert_annotations and annotations is not None:
+            annotations = [
+                self.normalize_annotation(annotation, get_image_size(image, input_data_format))
+                for annotation, image in zip(annotations, images)
+            ]
+
+        if do_pad:
+            # Pads images and returns their mask: {'pixel_values': ..., 'pixel_mask': ...}
+            encoded_inputs = self.pad(
+                images,
+                annotations=annotations,
+                return_pixel_mask=True,
+                data_format=data_format,
+                input_data_format=input_data_format,
+                update_bboxes=do_convert_annotations,
+                return_tensors=return_tensors,
+                pad_size=pad_size,
+            )
+        else:
+            images = [
+                to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
+                for image in images
+            ]
+            encoded_inputs = BatchFeature(data={"pixel_values": images}, tensor_type=return_tensors)
+            if annotations is not None:
+                encoded_inputs["labels"] = [
+                    BatchFeature(annotation, tensor_type=return_tensors) for annotation in annotations
+                ]
+
+        return encoded_inputs
+
+    # Copied from transformers.models.owlvit.image_processing_owlvit.OwlViTImageProcessor.post_process_object_detection with OwlViT->GroundingDino
+    def post_process_object_detection(
+        self,
+        outputs: "GroundingDinoObjectDetectionOutput",
+        threshold: float = 0.1,
+        target_sizes: Optional[Union[TensorType, list[tuple]]] = None,
+    ):
+        """
+        Turn the raw output of [`GroundingDinoForObjectDetection`] into per-image detections whose boxes use the
+        (top_left_x, top_left_y, bottom_right_x, bottom_right_y) format.
+
+        Args:
+            outputs ([`GroundingDinoObjectDetectionOutput`]):
+                Raw outputs of the model; `logits` and `pred_boxes` are read from it.
+            threshold (`float`, *optional*, defaults to 0.1):
+                Only predictions whose score is strictly greater than this value are kept.
+            target_sizes (`torch.Tensor` or `list[tuple[int, int]]`, *optional*):
+                Tensor of shape `(batch_size, 2)` or list of tuples (`tuple[int, int]`) holding the target size
+                `(height, width)` of each image in the batch. If unset, predictions will not be resized.
+
+        Returns:
+            `list[Dict]`: One dictionary per image with the following keys:
+            - "scores": The confidence scores for each predicted box on the image.
+            - "labels": Indexes of the classes predicted by the model on the image.
+            - "boxes": Image bounding boxes in (top_left_x, top_left_y, bottom_right_x, bottom_right_y) format.
+
+        Raises:
+            ValueError: If `target_sizes` is given and its length differs from the number of images in `outputs`.
+        """
+        logits, pred_boxes = outputs.logits, outputs.pred_boxes
+        num_images = len(logits)
+
+        if target_sizes is not None and len(target_sizes) != num_images:
+            raise ValueError("Make sure that you pass in as many target sizes as images")
+
+        # logits have shape (batch_size, num_queries, num_classes): keep the best class for every query
+        best_class = torch.max(logits, dim=-1)
+        all_scores = torch.sigmoid(best_class.values)
+        all_labels = best_class.indices
+
+        # (center_x, center_y, width, height) -> [x0, y0, x1, y1]
+        corner_boxes = center_to_corners_format(pred_boxes)
+
+        # Relative [0, 1] coordinates -> absolute pixel coordinates, only when sizes were supplied
+        if target_sizes is not None:
+            corner_boxes = _scale_boxes(corner_boxes, target_sizes)
+
+        detections = []
+        for image_scores, image_labels, image_boxes in zip(all_scores, all_labels, corner_boxes):
+            selected = image_scores > threshold
+            detections.append(
+                {
+                    "scores": image_scores[selected],
+                    "labels": image_labels[selected],
+                    "boxes": image_boxes[selected],
+                }
+            )
+
+        return detections
+
+
+# Names exported when this module is star-imported.
+__all__ = ["GroundingDinoImageProcessor"]
diff --git a/phivenv/Lib/site-packages/transformers/models/grounding_dino/image_processing_grounding_dino_fast.py b/phivenv/Lib/site-packages/transformers/models/grounding_dino/image_processing_grounding_dino_fast.py
new file mode 100644
index 0000000000000000000000000000000000000000..317d1e483342a26c740b9baf12b0090797aef10e
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/grounding_dino/image_processing_grounding_dino_fast.py
@@ -0,0 +1,808 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/grounding_dino/modular_grounding_dino.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_grounding_dino.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+import pathlib
+from typing import TYPE_CHECKING, Any, Optional, Union
+
+from ...image_processing_utils import BatchFeature, get_size_dict
+from ...image_processing_utils_fast import (
+    BaseImageProcessorFast,
+    DefaultFastImageProcessorKwargs,
+    SizeDict,
+    get_image_size_for_max_height_width,
+    get_max_height_width,
+    safe_squeeze,
+)
+from ...image_transforms import center_to_corners_format, corners_to_center_format
+from ...image_utils import (
+    IMAGENET_DEFAULT_MEAN,
+    IMAGENET_DEFAULT_STD,
+    AnnotationFormat,
+    AnnotationType,
+    ChannelDimension,
+    ImageInput,
+    PILImageResampling,
+    get_image_size,
+    validate_annotations,
+)
+from ...processing_utils import Unpack
+from ...utils import (
+    TensorType,
+    auto_docstring,
+    is_torch_available,
+    is_torchvision_available,
+    is_torchvision_v2_available,
+    logging,
+)
+from ...utils.import_utils import requires
+from .image_processing_grounding_dino import get_size_with_aspect_ratio
+
+
+if TYPE_CHECKING:
+    from .modeling_grounding_dino import GroundingDinoObjectDetectionOutput
+
+if is_torch_available():
+    import torch
+
+
+if is_torchvision_v2_available():
+    from torchvision.io import read_image
+    from torchvision.transforms.v2 import functional as F
+
+elif is_torchvision_available():
+    from torchvision.io import read_image
+    from torchvision.transforms import functional as F
+
+
+logger = logging.get_logger(__name__)
+
+
+class GroundingDinoFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
+    r"""
+    format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`):
+        Data format of the annotations. One of "coco_detection" or "coco_panoptic".
+    do_convert_annotations (`bool`, *optional*, defaults to `True`):
+        Controls whether to convert the annotations to the format expected by the GROUNDING_DINO model. Converts the
+        bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`.
+        Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
+    do_pad (`bool`, *optional*, defaults to `True`):
+        Controls whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess`
+        method. If `True`, padding will be applied to the bottom and right of the image with zeros.
+        If `pad_size` is provided, the image will be padded to the specified dimensions.
+        Otherwise, the image will be padded to the maximum height and width of the batch.
+    pad_size (`dict[str, int]`, *optional*):
+        The size `{"height": int, "width": int}` to pad the images to. Must be larger than any image size
+        provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
+        height and width in the batch.
+    return_segmentation_masks (`bool`, *optional*, defaults to `False`):
+        Whether to return segmentation masks.
+    """
+
+    format: Optional[Union[str, AnnotationFormat]]
+    do_convert_annotations: Optional[bool]
+    do_pad: Optional[bool]
+    pad_size: Optional[dict[str, int]]
+    return_segmentation_masks: Optional[bool]
+
+
+# Annotation formats this module supports: COCO detection and COCO panoptic.
+SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC)
+
+
+# inspired by https://github.com/facebookresearch/detr/blob/master/datasets/coco.py#L33
+def convert_coco_poly_to_mask(segmentations, height: int, width: int, device: torch.device) -> torch.Tensor:
+    """
+    Convert a COCO polygon annotation to a mask.
+
+    Args:
+        segmentations (`list[list[float]]`):
+            List of polygons, each polygon represented by a list of x-y coordinates.
+        height (`int`):
+            Height of the mask.
+        width (`int`):
+            Width of the mask.
+        device (`torch.device`):
+            Device on which the mask tensors are created.
+
+    Returns:
+        `torch.Tensor`: One mask per entry of `segmentations`, stacked along a new first dimension. When
+        `segmentations` is empty, an empty `torch.uint8` tensor of shape `(0, height, width)` is returned.
+
+    Raises:
+        ImportError: If `pycocotools` is not installed. The original import failure is chained as the cause.
+    """
+    try:
+        from pycocotools import mask as coco_mask
+    except ImportError as err:
+        # Chain the original failure so the underlying reason stays visible in the traceback.
+        raise ImportError("Pycocotools is not installed in your environment.") from err
+
+    masks = []
+    for polygons in segmentations:
+        rles = coco_mask.frPyObjects(polygons, height, width)
+        mask = coco_mask.decode(rles)
+        if len(mask.shape) < 3:
+            # Add a trailing axis so the reduction over axis 2 below also works for 2D decodes.
+            mask = mask[..., None]
+        mask = torch.as_tensor(mask, dtype=torch.uint8, device=device)
+        # Merge everything along the last axis into a single mask for this object.
+        mask = torch.any(mask, dim=2)
+        masks.append(mask)
+
+    if masks:
+        return torch.stack(masks, dim=0)
+    return torch.zeros((0, height, width), dtype=torch.uint8, device=device)
+
+
+# inspired by https://github.com/facebookresearch/detr/blob/master/datasets/coco.py#L50
+def prepare_coco_detection_annotation(
+    image,
+    target,
+    return_segmentation_masks: bool = False,
+    input_data_format: Optional[Union[ChannelDimension, str]] = None,
+):
+    """
+    Convert the target in COCO format into the format expected by GROUNDING_DINO.
+
+    Args:
+        image (`torch.Tensor`):
+            Image the annotation belongs to. Only its last two dimensions (read as height and width) and its
+            device are used.
+        target (`dict`):
+            COCO-style target with an "image_id" entry and an "annotations" list. Annotations whose "iscrowd"
+            is present and non-zero are ignored.
+        return_segmentation_masks (`bool`, *optional*, defaults to `False`):
+            Whether to decode each kept annotation's "segmentation" into a mask and return it under "masks".
+        input_data_format (`ChannelDimension` or `str`, *optional*):
+            Not used by this function.
+
+    Returns:
+        `dict`: New target with "image_id", "class_labels", "boxes" (corner format, clipped to the image),
+        "area", "iscrowd" and "orig_size", plus "keypoints" and "masks" when available/requested. Boxes with
+        non-positive width or height after clipping are dropped from every per-object entry.
+    """
+    image_height, image_width = image.size()[-2:]
+    device = image.device
+
+    image_id = torch.as_tensor([target["image_id"]], dtype=torch.int64, device=device)
+
+    # Drop crowd annotations once, up front, so that every per-object list built below (boxes, keypoints,
+    # segmentations) has the same length as the `keep` mask. Previously the segmentation list was built from
+    # the unfiltered annotations, so `masks[keep]` mismatched as soon as one crowd annotation was present.
+    annotations = [obj for obj in target["annotations"] if "iscrowd" not in obj or obj["iscrowd"] == 0]
+
+    classes = [obj["category_id"] for obj in annotations]
+    area = [obj["area"] for obj in annotations]
+    boxes = [obj["bbox"] for obj in annotations]
+    keypoints = [obj["keypoints"] for obj in annotations if "keypoints" in obj]
+
+    classes = torch.as_tensor(classes, dtype=torch.int64, device=device)
+    area = torch.as_tensor(area, dtype=torch.float32, device=device)
+    iscrowd = torch.zeros_like(classes, dtype=torch.int64, device=device)
+    # guard against no boxes via resizing
+    boxes = torch.as_tensor(boxes, dtype=torch.float32, device=device).reshape(-1, 4)
+    # (x, y, width, height) -> (x0, y0, x1, y1), then clip to the image bounds
+    boxes[:, 2:] += boxes[:, :2]
+    boxes[:, 0::2] = boxes[:, 0::2].clip(min=0, max=image_width)
+    boxes[:, 1::2] = boxes[:, 1::2].clip(min=0, max=image_height)
+
+    # Only boxes that still have a positive extent in both directions are kept
+    keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0])
+
+    new_target = {
+        "image_id": image_id,
+        "class_labels": classes[keep],
+        "boxes": boxes[keep],
+        "area": area[keep],
+        "iscrowd": iscrowd[keep],
+        "orig_size": torch.as_tensor([int(image_height), int(image_width)], dtype=torch.int64, device=device),
+    }
+
+    if keypoints:
+        # NOTE(review): this assumes every kept annotation carries "keypoints"; if only some do, the
+        # indexing with `keep` below will not line up - confirm against the datasets in use.
+        keypoints = torch.as_tensor(keypoints, dtype=torch.float32, device=device)
+        # Apply the keep mask here to filter the relevant annotations
+        keypoints = keypoints[keep]
+        num_keypoints = keypoints.shape[0]
+        keypoints = keypoints.reshape((-1, 3)) if num_keypoints else keypoints
+        new_target["keypoints"] = keypoints
+
+    if return_segmentation_masks:
+        segmentation_masks = [obj["segmentation"] for obj in annotations]
+        masks = convert_coco_poly_to_mask(segmentation_masks, image_height, image_width, device=device)
+        new_target["masks"] = masks[keep]
+
+    return new_target
+
+
+def masks_to_boxes(masks: torch.Tensor) -> torch.Tensor:
+    """
+    Compute the tight bounding box of every panoptic segmentation mask.
+
+    Args:
+        masks: boolean masks in format `[number_masks, height, width]`
+
+    Returns:
+        boxes: bounding boxes in format `[number_masks, 4]` in xyxy format. An input without any element
+        yields an empty `(0, 4)` tensor.
+    """
+    device = masks.device
+    if masks.numel() == 0:
+        return torch.zeros((0, 4), device=device)
+
+    num_masks = masks.shape[0]
+    height, width = masks.shape[-2:]
+    rows = torch.arange(0, height, dtype=torch.float32, device=device)
+    cols = torch.arange(0, width, dtype=torch.float32, device=device)
+    # see https://github.com/pytorch/pytorch/issues/50276
+    row_grid, col_grid = torch.meshgrid(rows, cols, indexing="ij")
+
+    def _extent(grid):
+        # Largest coordinate: pixels outside the mask contribute 0 to the product.
+        coords = grid.unsqueeze(0)
+        highest = (masks * coords).view(num_masks, -1).max(-1)[0]
+        # Smallest coordinate: pixels outside the mask are replaced by a large sentinel.
+        sentinel = torch.tensor(1e8, device=device)
+        lowest = torch.where(masks, coords, sentinel).view(num_masks, -1).min(-1)[0]
+        return lowest, highest
+
+    x_min, x_max = _extent(col_grid)
+    y_min, y_max = _extent(row_grid)
+
+    return torch.stack([x_min, y_min, x_max, y_max], 1)
+
+
+# 2 functions below adapted from https://github.com/cocodataset/panopticapi/blob/master/panopticapi/utils.py
+# Copyright (c) 2018, Alexander Kirillov
+# All rights reserved.
+def rgb_to_id(color):
+    """
+    Converts RGB color to unique ID.
+
+    Args:
+        color: Either a 3-dimensional `torch.Tensor` whose last axis holds the (R, G, B) channels, or a single
+            color given as an indexable of at least three values (list, tuple, 1-D tensor, ...).
+
+    Returns:
+        For a 3-dimensional tensor, a tensor with one ID per pixel computed as `R + 256 * G + 256 * 256 * B`;
+        otherwise a Python `int` with the same encoding.
+    """
+    if isinstance(color, torch.Tensor) and len(color.shape) == 3:
+        if color.dtype == torch.uint8:
+            # Widen first: the weighted sum does not fit into 8 bits.
+            color = color.to(torch.int32)
+        return color[:, :, 0] + 256 * color[:, :, 1] + 256 * 256 * color[:, :, 2]
+    # Convert every channel to a Python int before combining them, so that narrow element types
+    # (e.g. the entries of a 1-D uint8 tensor) cannot overflow during the arithmetic.
+    red, green, blue = int(color[0]), int(color[1]), int(color[2])
+    return red + 256 * green + 256 * 256 * blue
+
+
+def prepare_coco_panoptic_annotation(
+    image: torch.Tensor,
+    target: dict,
+    masks_path: Union[str, pathlib.Path],
+    return_masks: bool = True,
+    input_data_format: Union[ChannelDimension, str] = None,
+) -> dict:
+    """
+    Build the GROUNDING_DINO target for one COCO panoptic annotation.
+
+    Args:
+        image (`torch.Tensor`):
+            Image the annotation belongs to; provides the size and the device of the created tensors.
+        target (`dict`):
+            Panoptic annotation. "file_name" locates the segmentation image under `masks_path`, the image id
+            is taken from "image_id" (or "id" when absent) and "segments_info", when present, lists the
+            segments.
+        masks_path (`str` or `pathlib.Path`):
+            Directory containing the segmentation images.
+        return_masks (`bool`, *optional*, defaults to `True`):
+            Whether to include the per-segment boolean masks under "masks".
+        input_data_format (`ChannelDimension` or `str`, *optional*):
+            Channel layout forwarded to `get_image_size`.
+
+    Returns:
+        `dict`: Target with "image_id", "size" and "orig_size"; when "segments_info" is present it also holds
+        "masks" (optional), "boxes", "class_labels", "iscrowd" and "area".
+    """
+    height, width = get_image_size(image, channel_dim=input_data_format)
+    segmentation_file = pathlib.Path(masks_path) / target["file_name"]
+    device = image.device
+
+    raw_image_id = target["image_id"] if "image_id" in target else target["id"]
+    new_target = {
+        "image_id": torch.as_tensor([raw_image_id], dtype=torch.int64, device=device),
+        # Two independent tensors on purpose: "size" and "orig_size" must not share storage.
+        "size": torch.as_tensor([height, width], dtype=torch.int64, device=device),
+        "orig_size": torch.as_tensor([height, width], dtype=torch.int64, device=device),
+    }
+
+    if "segments_info" not in target:
+        return new_target
+
+    segments = target["segments_info"]
+
+    # Decode the segmentation image into one integer id per pixel
+    pixel_ids = read_image(segmentation_file).permute(1, 2, 0).to(dtype=torch.int32, device=device)
+    pixel_ids = rgb_to_id(pixel_ids)
+
+    # One boolean mask per segment: pixels whose id equals the segment id
+    segment_ids = torch.as_tensor([segment["id"] for segment in segments], device=device)
+    masks = (pixel_ids == segment_ids[:, None, None]).to(torch.bool)
+
+    if return_masks:
+        new_target["masks"] = masks
+    new_target["boxes"] = masks_to_boxes(masks)
+
+    per_segment_fields = (
+        ("class_labels", "category_id", torch.int64),
+        ("iscrowd", "iscrowd", torch.int64),
+        ("area", "area", torch.float32),
+    )
+    for output_key, segment_key, dtype in per_segment_fields:
+        new_target[output_key] = torch.as_tensor(
+            [segment[segment_key] for segment in segments],
+            dtype=dtype,
+            device=device,
+        )
+
+    return new_target
+
+
+def _scale_boxes(boxes, target_sizes):
+    """
+    Multiply a batch of bounding boxes by the per-image target sizes.
+
+    Args:
+        boxes (`torch.Tensor` of shape `(batch_size, num_boxes, 4)`):
+            Bounding boxes to scale. Each box is expected to be in (x1, y1, x2, y2) format.
+        target_sizes (`list[tuple[int, int]]` or `torch.Tensor` of shape `(batch_size, 2)`):
+            Target sizes to scale the boxes to. Each target size is expected to be in (height, width) format.
+
+    Returns:
+        `torch.Tensor` of shape `(batch_size, num_boxes, 4)`: Scaled bounding boxes.
+
+    Raises:
+        TypeError: If `target_sizes` is neither a list, a tuple nor a `torch.Tensor`.
+    """
+    if isinstance(target_sizes, torch.Tensor):
+        heights, widths = target_sizes.unbind(1)
+    elif isinstance(target_sizes, (list, tuple)):
+        heights = torch.tensor([hw[0] for hw in target_sizes])
+        widths = torch.tensor([hw[1] for hw in target_sizes])
+    else:
+        raise TypeError("`target_sizes` must be a list, tuple or torch.Tensor")
+
+    # x coordinates scale with the width, y coordinates with the height
+    per_image_scale = torch.stack([widths, heights, widths, heights], dim=1)
+    # Insert the boxes axis so one scale row broadcasts over all boxes of its image
+    per_image_scale = per_image_scale.unsqueeze(1).to(boxes.device)
+    return boxes * per_image_scale
+
+
+@auto_docstring
+@requires(backends=("torchvision", "torch"))
+class GroundingDinoImageProcessorFast(BaseImageProcessorFast):
+    resample = PILImageResampling.BILINEAR
+    image_mean = IMAGENET_DEFAULT_MEAN
+    image_std = IMAGENET_DEFAULT_STD
+    format = AnnotationFormat.COCO_DETECTION
+    do_resize = True
+    do_rescale = True
+    do_normalize = True
+    do_pad = True
+    size = {"shortest_edge": 800, "longest_edge": 1333}
+    default_to_square = False
+    model_input_names = ["pixel_values", "pixel_mask"]
+    valid_kwargs = GroundingDinoFastImageProcessorKwargs
+
+    def __init__(self, **kwargs: Unpack[GroundingDinoFastImageProcessorKwargs]) -> None:
+        """
+        Build the processor, translating the legacy `pad_and_return_pixel_mask` and `max_size` keyword
+        arguments and resolving `size` before the remaining kwargs are handed to the parent constructor.
+        """
+        # Legacy alias for `do_pad`
+        if "pad_and_return_pixel_mask" in kwargs:
+            kwargs["do_pad"] = kwargs.pop("pad_and_return_pixel_mask")
+
+        size = kwargs.pop("size", None)
+        if "max_size" not in kwargs:
+            # Without an explicit `max_size`, 1333 is only applied when the caller supplied a `size`.
+            max_size = 1333 if size is not None else None
+        else:
+            logger.warning_once(
+                "The `max_size` parameter is deprecated and will be removed in v4.26. "
+                "Please specify in `size['longest_edge'] instead`.",
+            )
+            max_size = kwargs.pop("max_size")
+
+        if size is None:
+            size = {"shortest_edge": 800, "longest_edge": 1333}
+        self.size = get_size_dict(size, max_size=max_size, default_to_square=False)
+
+        # Backwards compatibility: when `do_convert_annotations` is neither passed nor already set on the
+        # instance, it follows `do_normalize` (the passed value if any, else the current attribute).
+        explicit_convert = kwargs.get("do_convert_annotations")
+        explicit_normalize = kwargs.get("do_normalize")
+        if explicit_convert is None and getattr(self, "do_convert_annotations", None) is None:
+            if explicit_normalize is not None:
+                self.do_convert_annotations = explicit_normalize
+            else:
+                self.do_convert_annotations = self.do_normalize
+
+        super().__init__(**kwargs)
+
+    @classmethod
+    def from_dict(cls, image_processor_dict: dict[str, Any], **kwargs):
+        """
+        Overrides the `from_dict` method from the base class so that the legacy `max_size` and
+        `pad_and_return_pixel_mask` keyword arguments end up in the processor dict, e.g. when the image processor
+        is created with `GroundingDinoImageProcessorFast.from_pretrained(checkpoint, size=600, max_size=800)`.
+        """
+        # Work on a copy so the caller's dictionary is left untouched
+        image_processor_dict = image_processor_dict.copy()
+        for legacy_key in ("max_size", "pad_and_return_pixel_mask"):
+            if legacy_key in kwargs:
+                image_processor_dict[legacy_key] = kwargs.pop(legacy_key)
+        return super().from_dict(image_processor_dict, **kwargs)
+
+    def prepare_annotation(
+        self,
+        image: torch.Tensor,
+        target: dict,
+        format: Optional[AnnotationFormat] = None,
+        return_segmentation_masks: Optional[bool] = None,
+        masks_path: Optional[Union[str, pathlib.Path]] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    ) -> dict:
+        """
+        Convert one raw annotation into the target dictionary fed to the GROUNDING_DINO model.
+
+        The conversion is chosen by `format` (falling back to `self.format` when it is `None`). When
+        `return_segmentation_masks` is `None`, it defaults to `False` for COCO detection and to `True` for
+        COCO panoptic annotations.
+
+        Raises:
+            ValueError: If `format` is neither COCO detection nor COCO panoptic.
+        """
+        if format is None:
+            format = self.format
+
+        if format == AnnotationFormat.COCO_DETECTION:
+            if return_segmentation_masks is None:
+                return_segmentation_masks = False
+            return prepare_coco_detection_annotation(
+                image, target, return_segmentation_masks, input_data_format=input_data_format
+            )
+
+        if format == AnnotationFormat.COCO_PANOPTIC:
+            if return_segmentation_masks is None:
+                return_segmentation_masks = True
+            return prepare_coco_panoptic_annotation(
+                image,
+                target,
+                masks_path=masks_path,
+                return_masks=return_segmentation_masks,
+                input_data_format=input_data_format,
+            )
+
+        raise ValueError(f"Format {format} is not supported.")
+
+    def resize(
+        self,
+        image: torch.Tensor,
+        size: SizeDict,
+        interpolation: "F.InterpolationMode" = None,
+        **kwargs,
+    ) -> torch.Tensor:
+        """
+        Resize `image` to the output size described by `size`.
+
+        Args:
+            image (`torch.Tensor`):
+                Image to resize.
+            size (`SizeDict`):
+                Size of the image's `(height, width)` dimensions after resizing. The options below are checked
+                in this order:
+                    - `{"shortest_edge": int, "longest_edge": int}`: The output size is computed by
+                        `get_size_with_aspect_ratio` from the current `(height, width)` and both edge limits.
+                    - `{"max_height": int, "max_width": int}`: The output size is computed by
+                        `get_image_size_for_max_height_width` from the current `(height, width)` and both limits.
+                    - `{"height": int, "width": int}`: The image will be resized to the exact size
+                        `(height, width)`.
+            interpolation (`InterpolationMode`, *optional*, defaults to `InterpolationMode.BILINEAR`):
+                Resampling filter to use if resizing the image.
+            **kwargs:
+                Extra keyword arguments forwarded to `F.resize`.
+
+        Raises:
+            ValueError: If `size` matches none of the supported key combinations.
+        """
+        if interpolation is None:
+            interpolation = F.InterpolationMode.BILINEAR
+
+        if size.shortest_edge and size.longest_edge:
+            # Resize the image so that the shortest edge or the longest edge is of the given size
+            # while maintaining the aspect ratio of the original image.
+            current_hw = image.size()[-2:]
+            target_hw = get_size_with_aspect_ratio(current_hw, size["shortest_edge"], size["longest_edge"])
+        elif size.max_height and size.max_width:
+            current_hw = image.size()[-2:]
+            target_hw = get_image_size_for_max_height_width(current_hw, size["max_height"], size["max_width"])
+        elif size.height and size.width:
+            target_hw = (size["height"], size["width"])
+        else:
+            raise ValueError(
+                "Size must contain 'height' and 'width' keys or 'shortest_edge' and 'longest_edge' keys. Got"
+                f" {size.keys()}."
+            )
+
+        return F.resize(
+            image,
+            size=target_hw,
+            interpolation=interpolation,
+            **kwargs,
+        )
+
+    def resize_annotation(
+        self,
+        annotation: dict[str, Any],
+        orig_size: tuple[int, int],
+        target_size: tuple[int, int],
+        threshold: float = 0.5,
+        interpolation: "F.InterpolationMode" = None,
+    ):
+        """
+        Rescale the geometric entries of an annotation from `orig_size` to `target_size`.
+
+        Args:
+            annotation (`dict[str, Any]`):
+                The annotation dictionary. Only `"boxes"`, `"area"`, `"masks"` and `"size"` are rewritten; every
+                other entry is carried over as-is.
+            orig_size (`tuple[int, int]`):
+                The size of the input image before resizing.
+            target_size (`tuple[int, int]`):
+                The size of the image after resizing, as returned by the preprocessing `resize` step.
+            threshold (`float`, *optional*, defaults to 0.5):
+                Resized masks are cast to float and binarized: `True` where strictly greater than this value.
+            interpolation (`InterpolationMode`, *optional*):
+                Filter used to resize the masks. When `None`, `NEAREST_EXACT` is used if torchvision v2 is
+                available and `NEAREST` otherwise.
+
+        Returns:
+            `dict`: A new annotation dictionary whose `"size"` is `target_size`.
+        """
+        # Default to a nearest-neighbour filter; which variant depends on the torchvision v2 check.
+        if interpolation is None:
+            if is_torchvision_v2_available():
+                interpolation = F.InterpolationMode.NEAREST_EXACT
+            else:
+                interpolation = F.InterpolationMode.NEAREST
+        scale_h, scale_w = (new / old for new, old in zip(target_size, orig_size))
+
+        resized = {"size": target_size}
+        for name, entry in annotation.items():
+            if name == "boxes":
+                # Width/height factors alternate, which presumably matches boxes laid out as (x, y, x, y).
+                factors = torch.as_tensor(
+                    [scale_w, scale_h, scale_w, scale_h], dtype=torch.float32, device=entry.device
+                )
+                resized["boxes"] = entry * factors
+            elif name == "area":
+                resized["area"] = entry * (scale_w * scale_h)
+            elif name == "masks":
+                # Add a channel axis, resize mask by mask, then drop the axis again while binarizing.
+                per_mask = [F.resize(mask, target_size, interpolation=interpolation) for mask in entry[:, None]]
+                stacked = torch.stack(per_mask).to(torch.float32)
+                resized["masks"] = stacked[:, 0] > threshold
+            elif name == "size":
+                # Any incoming size is superseded by the new target size.
+                resized["size"] = target_size
+            else:
+                resized[name] = entry
+
+        return resized
+
+    def normalize_annotation(self, annotation: dict, image_size: tuple[int, int]) -> dict:
+        """Build a new annotation whose `"boxes"` are in center format and divided by the image width/height."""
+        height, width = image_size
+        normalized = {}
+        for name, entry in annotation.items():
+            if name != "boxes":
+                # Everything except the boxes is passed through untouched.
+                normalized[name] = entry
+                continue
+            center_boxes = corners_to_center_format(entry)
+            divisor = torch.as_tensor([width, height, width, height], dtype=torch.float32, device=center_boxes.device)
+            center_boxes /= divisor
+            normalized[name] = center_boxes
+        return normalized
+
+    def _update_annotation_for_padded_image(
+        self,
+        annotation: dict,
+        input_image_size: tuple[int, int],
+        output_image_size: tuple[int, int],
+        padding,
+        update_bboxes,
+    ) -> dict:
+        """
+        Build the annotation matching an image padded from `input_image_size` to `output_image_size`.
+
+        The input `annotation` and its `"boxes"` tensor are left unmodified. `"masks"` are padded with `padding`
+        (fill 0), `"boxes"` are rescaled by the `input / output` size ratios when `update_bboxes` is set, `"size"`
+        becomes `output_image_size`, and every other entry is carried over unchanged.
+
+        Returns:
+            `dict`: The new annotation dictionary.
+        """
+        new_annotation = {"size": output_image_size}
+        ratio_height, ratio_width = (before / after for after, before in zip(output_image_size, input_image_size))
+
+        for key, value in annotation.items():
+            if key == "masks":
+                padded_masks = F.pad(value, padding, fill=0)
+                new_annotation["masks"] = safe_squeeze(padded_masks, 1)
+            elif key == "boxes" and update_bboxes:
+                # Multiply out of place: an in-place `*=` would silently rescale the caller's tensor as well.
+                scale = torch.as_tensor([ratio_width, ratio_height, ratio_width, ratio_height], device=value.device)
+                new_annotation["boxes"] = value * scale
+            elif key == "size":
+                new_annotation["size"] = output_image_size
+            else:
+                new_annotation[key] = value
+        return new_annotation
+
+    def pad(
+        self,
+        image: torch.Tensor,
+        padded_size: tuple[int, int],
+        annotation: Optional[dict[str, Any]] = None,
+        update_bboxes: bool = True,
+        fill: int = 0,
+    ):
+        """Pad `image` at the bottom/right up to `padded_size` and return `(image, pixel_mask, annotation)`."""
+        original_size = image.size()[-2:]
+        padding_bottom, padding_right = padded_size[0] - original_size[0], padded_size[1] - original_size[1]
+        if min(padding_bottom, padding_right) < 0:
+            raise ValueError(
+                f"Padding dimensions are negative. Please make sure that the padded size is larger than the "
+                f"original size. Got padded size: {padded_size}, original size: {original_size}."
+            )
+        needs_padding = original_size != padded_size
+        if needs_padding:
+            padding = [0, 0, padding_right, padding_bottom]
+            image = F.pad(image, padding, fill=fill)
+        if needs_padding and annotation is not None:
+            annotation = self._update_annotation_for_padded_image(
+                annotation, original_size, padded_size, padding, update_bboxes
+            )
+
+        # 1 marks pixels of the original image, 0 marks the padded area.
+        pixel_mask = torch.zeros(padded_size, dtype=torch.int64, device=image.device)
+        pixel_mask[: original_size[0], : original_size[1]] = 1
+        return image, pixel_mask, annotation
+
+    @auto_docstring
+    def preprocess(
+        self,
+        images: ImageInput,
+        annotations: Optional[Union[AnnotationType, list[AnnotationType]]] = None,
+        masks_path: Optional[Union[str, pathlib.Path]] = None,
+        **kwargs: Unpack[GroundingDinoFastImageProcessorKwargs],
+    ) -> BatchFeature:
+        r"""
+        annotations (`AnnotationType` or `list[AnnotationType]`, *optional*):
+            List of annotations associated with the image or batch of images. If annotation is for object
+            detection, the annotations should be a dictionary with the following keys:
+            - "image_id" (`int`): The image id.
+            - "annotations" (`list[Dict]`): List of annotations for an image. Each annotation should be a
+                dictionary. An image can have no annotations, in which case the list should be empty.
+            If annotation is for segmentation, the annotations should be a dictionary with the following keys:
+            - "image_id" (`int`): The image id.
+            - "segments_info" (`list[Dict]`): List of segments for an image. Each segment should be a dictionary.
+                An image can have no segments, in which case the list should be empty.
+            - "file_name" (`str`): The file name of the image.
+        masks_path (`str` or `pathlib.Path`, *optional*):
+            Path to the directory containing the segmentation masks.
+        """
+        if "pad_and_return_pixel_mask" in kwargs:
+            kwargs["do_pad"] = kwargs.pop("pad_and_return_pixel_mask")
+            logger.warning_once(
+                "The `pad_and_return_pixel_mask` argument is deprecated and will be removed in a future version, "
+                "use `do_pad` instead."
+            )
+        # NOTE(review): `max_size` replaces `size` wholesale although the warning points at `size['longest_edge']`.
+        if "max_size" in kwargs:
+            logger.warning_once(
+                "The `max_size` argument is deprecated and will be removed in a future version, use"
+                " `size['longest_edge']` instead."
+            )
+            kwargs["size"] = kwargs.pop("max_size")
+        # Hand the translated kwargs to the parent class' `preprocess`.
+        return super().preprocess(images, annotations, masks_path, **kwargs)
+
+    def _preprocess(
+        self,
+        images: list["torch.Tensor"],
+        annotations: Optional[Union[AnnotationType, list[AnnotationType]]],
+        masks_path: Optional[Union[str, pathlib.Path]],
+        return_segmentation_masks: bool,
+        do_resize: bool,
+        size: SizeDict,
+        interpolation: Optional["F.InterpolationMode"],
+        do_rescale: bool,
+        rescale_factor: float,
+        do_normalize: bool,
+        do_convert_annotations: bool,
+        image_mean: Optional[Union[float, list[float]]],
+        image_std: Optional[Union[float, list[float]]],
+        do_pad: bool,
+        pad_size: Optional[dict[str, int]],
+        format: Optional[Union[str, AnnotationFormat]],
+        return_tensors: Optional[Union[str, TensorType]],
+        **kwargs,
+    ) -> BatchFeature:
+        """
+        Preprocess a batch of images (and optional annotations): per image, prepare the annotation, resize,
+        rescale/normalize and convert boxes; then optionally pad to a common size and stack into a `BatchFeature`.
+        """
+        if annotations is not None and isinstance(annotations, dict):
+            annotations = [annotations]
+
+        if annotations is not None and len(images) != len(annotations):
+            raise ValueError(
+                f"The number of images ({len(images)}) and annotations ({len(annotations)}) do not match."
+            )
+
+        format = AnnotationFormat(format)
+        if annotations is not None:
+            validate_annotations(format, SUPPORTED_ANNOTATION_FORMATS, annotations)
+
+        if (
+            masks_path is not None
+            and format == AnnotationFormat.COCO_PANOPTIC
+            and not isinstance(masks_path, (pathlib.Path, str))
+        ):
+            raise ValueError(
+                "The path to the directory containing the mask PNG files should be provided as a"
+                f" `pathlib.Path` or string object, but is {type(masks_path)} instead."
+            )
+
+        data = {}
+        processed_images = []
+        processed_annotations = []
+        pixel_masks = []  # only filled (and returned) when `do_pad` is set
+        for image, annotation in zip(images, annotations if annotations is not None else [None] * len(images)):
+            # prepare: turn the raw annotation of this image into a single target dict via `prepare_annotation`
+            if annotations is not None:
+                annotation = self.prepare_annotation(
+                    image,
+                    annotation,
+                    format,
+                    return_segmentation_masks=return_segmentation_masks,
+                    masks_path=masks_path,
+                    input_data_format=ChannelDimension.FIRST,
+                )
+
+            if do_resize:
+                resized_image = self.resize(image, size=size, interpolation=interpolation)
+                if annotations is not None:
+                    annotation = self.resize_annotation(  # uses the method's default threshold/interpolation
+                        annotation,
+                        orig_size=image.size()[-2:],
+                        target_size=resized_image.size()[-2:],
+                    )
+                image = resized_image
+            # Fused rescale and normalize
+            image = self.rescale_and_normalize(image, do_rescale, rescale_factor, do_normalize, image_mean, image_std)
+            if do_convert_annotations and annotations is not None:
+                annotation = self.normalize_annotation(annotation, get_image_size(image, ChannelDimension.FIRST))
+
+            processed_images.append(image)
+            processed_annotations.append(annotation)
+        images = processed_images
+        annotations = processed_annotations if annotations is not None else None
+
+        if do_pad:
+            # the padded size depends on all resized image shapes, so padding needs a second loop
+            if pad_size is not None:
+                padded_size = (pad_size["height"], pad_size["width"])
+            else:
+                padded_size = get_max_height_width(images)
+
+            padded_images = []
+            padded_annotations = []
+            for image, annotation in zip(images, annotations if annotations is not None else [None] * len(images)):
+                # Images already at the padded size skip `self.pad` and get an all-ones pixel mask.
+                if padded_size == image.size()[-2:]:
+                    padded_images.append(image)
+                    pixel_masks.append(torch.ones(padded_size, dtype=torch.int64, device=image.device))
+                    padded_annotations.append(annotation)
+                    continue
+                image, pixel_mask, annotation = self.pad(
+                    image, padded_size, annotation=annotation, update_bboxes=do_convert_annotations
+                )
+                padded_images.append(image)
+                padded_annotations.append(annotation)
+                pixel_masks.append(pixel_mask)
+            images = padded_images
+            annotations = padded_annotations if annotations is not None else None
+            data.update({"pixel_mask": torch.stack(pixel_masks, dim=0)})
+
+        data.update({"pixel_values": torch.stack(images, dim=0)})
+        encoded_inputs = BatchFeature(data, tensor_type=return_tensors)
+        if annotations is not None:
+            encoded_inputs["labels"] = [
+                BatchFeature(annotation, tensor_type=return_tensors) for annotation in annotations
+            ]
+        return encoded_inputs
+
+    def post_process_object_detection(
+        self,
+        outputs: "GroundingDinoObjectDetectionOutput",
+        threshold: float = 0.1,
+        target_sizes: Optional[Union[TensorType, list[tuple]]] = None,
+    ):
+        """
+        Turn the raw output of [`GroundingDinoForObjectDetection`] into per-image detections whose boxes use the
+        (top_left_x, top_left_y, bottom_right_x, bottom_right_y) format.
+
+        Args:
+            outputs ([`GroundingDinoObjectDetectionOutput`]):
+                Raw outputs of the model; `logits` and `pred_boxes` are read.
+            threshold (`float`, *optional*, defaults to 0.1):
+                Only predictions whose score is strictly above this value are kept.
+            target_sizes (`torch.Tensor` or `list[tuple[int, int]]`, *optional*):
+                Tensor of shape `(batch_size, 2)` or list of tuples (`tuple[int, int]`) containing the target size
+                `(height, width)` of each image in the batch. If unset, predictions will not be resized.
+
+        Returns:
+            `list[Dict]`: One dictionary per image with the following keys:
+            - "scores": Sigmoid of the highest logit of each kept prediction.
+            - "labels": Index of that highest logit along the last dimension.
+            - "boxes": Boxes in (top_left_x, top_left_y, bottom_right_x, bottom_right_y) format.
+
+        Raises:
+            ValueError: If `target_sizes` is given but its length differs from the batch size.
+        """
+        logits, pred_boxes = outputs.logits, outputs.pred_boxes
+        num_images = len(logits)
+        if target_sizes is not None and num_images != len(target_sizes):
+            raise ValueError("Make sure that you pass in as many target sizes as images")
+
+        # logits of shape (batch_size, num_queries, num_classes): keep the best class per query
+        top = logits.max(dim=-1)
+        all_scores = torch.sigmoid(top.values)
+        all_labels = top.indices
+
+        # Convert to [x0, y0, x1, y1] format
+        corner_boxes = center_to_corners_format(pred_boxes)
+        # Convert from relative [0, 1] to absolute coordinates, only when target sizes are known
+        if target_sizes is not None:
+            corner_boxes = _scale_boxes(corner_boxes, target_sizes)
+
+        # Filter every image independently, so the number of detections can differ per image.
+        results = []
+        for scores, labels, boxes in zip(all_scores, all_labels, corner_boxes):
+            # Boolean mask of the queries that pass the score threshold.
+            keep = scores > threshold
+            results.append({"scores": scores[keep], "labels": labels[keep], "boxes": boxes[keep]})
+
+        return results
+
+
+__all__ = ["GroundingDinoImageProcessorFast"]
diff --git a/phivenv/Lib/site-packages/transformers/models/grounding_dino/modeling_grounding_dino.py b/phivenv/Lib/site-packages/transformers/models/grounding_dino/modeling_grounding_dino.py
new file mode 100644
index 0000000000000000000000000000000000000000..939213e78d82b3e777dcd8fd3ecae6d0d1b5c453
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/grounding_dino/modeling_grounding_dino.py
@@ -0,0 +1,2639 @@
+# coding=utf-8
+# Copyright 2024 IDEA Research and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch Grounding DINO model."""
+
+import math
+import warnings
+from dataclasses import dataclass
+from typing import Optional, Union
+
+import torch
+import torch.nn.functional as F
+from torch import Tensor, nn
+
+from ...activations import ACT2FN
+from ...file_utils import (
+    ModelOutput,
+    is_timm_available,
+    requires_backends,
+)
+from ...integrations import use_kernel_forward_from_hub
+from ...modeling_utils import PreTrainedModel
+from ...pytorch_utils import meshgrid
+from ...utils import auto_docstring, logging
+from ...utils.backbone_utils import load_backbone
+from ..auto import AutoModel
+from .configuration_grounding_dino import GroundingDinoConfig
+
+
+if is_timm_available():
+    from timm import create_model
+
+
+logger = logging.get_logger(__name__)
+
+
+@use_kernel_forward_from_hub("MultiScaleDeformableAttention")
+# Copied from transformers.models.deformable_detr.modeling_deformable_detr.MultiScaleDeformableAttention
+class MultiScaleDeformableAttention(nn.Module):
+    def forward(
+        self,
+        value: Tensor,
+        value_spatial_shapes: Tensor,
+        value_spatial_shapes_list: list[tuple],
+        level_start_index: Tensor,
+        sampling_locations: Tensor,
+        attention_weights: Tensor,
+        im2col_step: int,
+    ):
+        """Sample `value` at `sampling_locations` on every level (bilinear `grid_sample`) and sum the samples
+        weighted by `attention_weights`; `value_spatial_shapes`, `level_start_index`, `im2col_step` are unused."""
+        batch_size, _, num_heads, hidden_dim = value.shape
+        _, num_queries, num_heads, num_levels, num_points, _ = sampling_locations.shape
+        value_list = value.split([height * width for height, width in value_spatial_shapes_list], dim=1)
+        sampling_grids = 2 * sampling_locations - 1  # to grid_sample's [-1, 1] range (assumes locations in [0, 1])
+        sampling_value_list = []
+        for level_id, (height, width) in enumerate(value_spatial_shapes_list):
+            # batch_size, height*width, num_heads, hidden_dim
+            # -> batch_size, height*width, num_heads*hidden_dim
+            # -> batch_size, num_heads*hidden_dim, height*width
+            # -> batch_size*num_heads, hidden_dim, height, width
+            value_l_ = (
+                value_list[level_id]
+                .flatten(2)
+                .transpose(1, 2)
+                .reshape(batch_size * num_heads, hidden_dim, height, width)
+            )
+            # batch_size, num_queries, num_heads, num_points, 2 -> batch_size, num_heads, num_queries, num_points, 2
+            # -> batch_size*num_heads, num_queries, num_points, 2
+            sampling_grid_l_ = sampling_grids[:, :, :, level_id].transpose(1, 2).flatten(0, 1)
+            # batch_size*num_heads, hidden_dim, num_queries, num_points
+            sampling_value_l_ = nn.functional.grid_sample(
+                value_l_,
+                sampling_grid_l_,
+                mode="bilinear",
+                padding_mode="zeros",
+                align_corners=False,
+            )
+            sampling_value_list.append(sampling_value_l_)
+        # (batch_size, num_queries, num_heads, num_levels, num_points) -> heads and queries swapped by transpose
+        # -> (batch_size*num_heads, 1, num_queries, num_levels*num_points)
+        attention_weights = attention_weights.transpose(1, 2).reshape(
+            batch_size * num_heads, 1, num_queries, num_levels * num_points
+        )
+        output = (
+            (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights)
+            .sum(-1)
+            .view(batch_size, num_heads * hidden_dim, num_queries)
+        )
+        return output.transpose(1, 2).contiguous()
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Base class for outputs of the GroundingDinoDecoder. This class adds two attributes to
+    BaseModelOutputWithCrossAttentions, namely:
+    - a stacked tensor of intermediate decoder hidden states (i.e. the output of each decoder layer)
+    - a stacked tensor of intermediate reference points.
+    """
+)
+class GroundingDinoDecoderOutput(ModelOutput):
+    r"""
+    intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`):
+        Stacked intermediate hidden states (output of each layer of the decoder).
+    intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, sequence_length, hidden_size)`):
+        Stacked intermediate reference points (reference points of each layer of the decoder).
+    """
+    # NOTE(review): `GroundingDinoModelOutput` gives `intermediate_reference_points` a last dim of 4; confirm.
+    last_hidden_state: Optional[torch.FloatTensor] = None  # no entry in the docstring above
+    intermediate_hidden_states: Optional[torch.FloatTensor] = None
+    intermediate_reference_points: Optional[torch.FloatTensor] = None
+    hidden_states: Optional[tuple[torch.FloatTensor]] = None  # no entry in the docstring above
+    attentions: Optional[tuple[tuple[torch.FloatTensor]]] = None  # no entry in the docstring above
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Base class for outputs of the GroundingDinoEncoder. This class extends BaseModelOutput, due to:
+    - vision and text last hidden states
+    - vision and text intermediate hidden states
+    """
+)
+class GroundingDinoEncoderOutput(ModelOutput):
+    r"""
+    last_hidden_state_vision (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+        Sequence of hidden-states at the output of the last layer of the vision encoder.
+    last_hidden_state_text (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+        Sequence of hidden-states at the output of the last layer of the text encoder.
+    vision_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+        Tuple of `torch.FloatTensor` (one for the output of the vision embeddings + one for the output of each
+        layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the vision encoder at the
+        output of each layer plus the initial embedding outputs.
+    text_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+        Tuple of `torch.FloatTensor` (one for the output of the text embeddings + one for the output of each layer)
+        of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the text encoder at the output of
+        each layer plus the initial embedding outputs.
+    """
+    # Every field defaults to None; `attentions` has no entry in the docstring above.
+    last_hidden_state_vision: Optional[torch.FloatTensor] = None
+    last_hidden_state_text: Optional[torch.FloatTensor] = None
+    vision_hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    text_hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    attentions: Optional[tuple[tuple[torch.FloatTensor]]] = None
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Base class for outputs of the Grounding DINO encoder-decoder model.
+    """
+)
+class GroundingDinoModelOutput(ModelOutput):
+    r"""
+    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`):
+        Sequence of hidden-states at the output of the last layer of the decoder of the model.
+    init_reference_points (`torch.FloatTensor` of shape  `(batch_size, num_queries, 4)`):
+        Initial reference points sent through the Transformer decoder.
+    intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`):
+        Stacked intermediate hidden states (output of each layer of the decoder).
+    intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`):
+        Stacked intermediate reference points (reference points of each layer of the decoder).
+    encoder_last_hidden_state_vision (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+        Sequence of hidden-states at the output of the last layer of the encoder of the model.
+    encoder_last_hidden_state_text (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+        Sequence of hidden-states at the output of the last layer of the encoder of the model.
+    encoder_vision_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+        Tuple of `torch.FloatTensor` (one for the output of the vision embeddings + one for the output of each
+        layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the vision encoder at the
+        output of each layer plus the initial embedding outputs.
+    encoder_text_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+        Tuple of `torch.FloatTensor` (one for the output of the text embeddings + one for the output of each layer)
+        of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the text encoder at the output of
+        each layer plus the initial embedding outputs.
+    encoder_attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+        Tuple of tuples of `torch.FloatTensor` (one for attention for each layer) of shape `(batch_size, num_heads,
+        sequence_length, sequence_length)`. Attention weights after the attention softmax, used to compute the
+        weighted average in the text-vision attention, vision-text attention, text-enhancer (self-attention) and
+        multi-scale deformable attention heads, as well as the weighted average in the bi-attention
+        heads.
+    enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.two_stage=True`):
+        Predicted bounding boxes scores where the top `config.num_queries` scoring bounding boxes are picked as
+        region proposals in the first stage. Output of bounding box binary classification (i.e. foreground and
+        background).
+    enc_outputs_coord_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.two_stage=True`):
+        Logits of predicted bounding boxes coordinates in the first stage.
+    encoder_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.two_stage=True`):
+        Logits of top `config.num_queries` scoring bounding boxes in the first stage.
+    encoder_pred_boxes (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.two_stage=True`):
+        Coordinates of top `config.num_queries` scoring bounding boxes in the first stage.
+    """
+    # `decoder_hidden_states` and `decoder_attentions` below have no entry in the docstring above.
+    last_hidden_state: Optional[torch.FloatTensor] = None
+    init_reference_points: Optional[torch.FloatTensor] = None
+    intermediate_hidden_states: Optional[torch.FloatTensor] = None
+    intermediate_reference_points: Optional[torch.FloatTensor] = None
+    decoder_hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    decoder_attentions: Optional[tuple[tuple[torch.FloatTensor]]] = None
+    encoder_last_hidden_state_vision: Optional[torch.FloatTensor] = None
+    encoder_last_hidden_state_text: Optional[torch.FloatTensor] = None
+    encoder_vision_hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    encoder_text_hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    encoder_attentions: Optional[tuple[tuple[torch.FloatTensor]]] = None
+    enc_outputs_class: Optional[torch.FloatTensor] = None
+    enc_outputs_coord_logits: Optional[torch.FloatTensor] = None
+    encoder_logits: Optional[torch.FloatTensor] = None
+    encoder_pred_boxes: Optional[torch.FloatTensor] = None
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Output type of [`GroundingDinoForObjectDetection`].
+    """
+)
+class GroundingDinoObjectDetectionOutput(ModelOutput):
+    r"""
+    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided):
+        Total loss as a linear combination of a negative log-likelihood (cross-entropy) for class prediction and a
+        bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized
+        scale-invariant IoU loss.
+    loss_dict (`Dict`, *optional*):
+        A dictionary containing the individual losses. Useful for logging.
+    logits (`torch.FloatTensor` of shape `(batch_size, num_queries, num_classes + 1)`):
+        Classification logits (including no-object) for all queries.
+    pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
+        Normalized box coordinates for all queries, represented as (center_x, center_y, width, height). These
+        values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding
+        possible padding). You can use [`~GroundingDinoProcessor.post_process_grounded_object_detection`] to retrieve the
+        unnormalized bounding boxes.
+    auxiliary_outputs (`list[Dict]`, *optional*):
+        Optional, only returned when auxiliary losses are activated (i.e. `config.auxiliary_loss` is set to `True`)
+        and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and
+        `pred_boxes`) for each decoder layer.
+    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*):
+        Sequence of hidden-states at the output of the last layer of the decoder of the model.
+    init_reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
+        Initial reference points sent through the Transformer decoder.
+    intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`):
+        Stacked intermediate hidden states (output of each layer of the decoder).
+    intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`):
+        Stacked intermediate reference points (reference points of each layer of the decoder).
+    encoder_last_hidden_state_vision (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+        Sequence of hidden-states at the output of the last layer of the encoder of the model.
+    encoder_last_hidden_state_text (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+        Sequence of hidden-states at the output of the last layer of the encoder of the model.
+    encoder_vision_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+        Tuple of `torch.FloatTensor` (one for the output of the vision embeddings + one for the output of each
+        layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the vision encoder at the
+        output of each layer plus the initial embedding outputs.
+    encoder_text_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+        Tuple of `torch.FloatTensor` (one for the output of the text embeddings + one for the output of each layer)
+        of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the text encoder at the output of
+        each layer plus the initial embedding outputs.
+    enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.two_stage=True`):
+        Predicted bounding boxes scores where the top `config.num_queries` scoring bounding boxes are picked as
+        region proposals in the first stage. Output of bounding box binary classification (i.e. foreground and
+        background).
+    enc_outputs_coord_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.two_stage=True`):
+        Logits of predicted bounding boxes coordinates in the first stage.
+    encoder_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.two_stage=True`):
+        Logits of top `config.num_queries` scoring bounding boxes in the first stage.
+    encoder_pred_boxes (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.two_stage=True`):
+        Coordinates of top `config.num_queries` scoring bounding boxes in the first stage.
+    input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+        Encoded candidate labels sequence. Used in processor to post process object detection result.
+    """
+
+    loss: Optional[torch.FloatTensor] = None
+    loss_dict: Optional[dict] = None
+    logits: Optional[torch.FloatTensor] = None
+    pred_boxes: Optional[torch.FloatTensor] = None
+    auxiliary_outputs: Optional[list[dict]] = None
+    last_hidden_state: Optional[torch.FloatTensor] = None
+    init_reference_points: Optional[torch.FloatTensor] = None
+    intermediate_hidden_states: Optional[torch.FloatTensor] = None
+    intermediate_reference_points: Optional[torch.FloatTensor] = None
+    decoder_hidden_states: Optional[tuple[torch.FloatTensor]] = None  # no entry in the class docstring above
+    decoder_attentions: Optional[tuple[tuple[torch.FloatTensor]]] = None  # no entry in the class docstring above
+    encoder_last_hidden_state_vision: Optional[torch.FloatTensor] = None
+    encoder_last_hidden_state_text: Optional[torch.FloatTensor] = None
+    encoder_vision_hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    encoder_text_hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    encoder_attentions: Optional[tuple[tuple[torch.FloatTensor]]] = None  # no entry in the class docstring above
+    enc_outputs_class: Optional[torch.FloatTensor] = None
+    enc_outputs_coord_logits: Optional[torch.FloatTensor] = None
+    encoder_logits: Optional[torch.FloatTensor] = None
+    encoder_pred_boxes: Optional[torch.FloatTensor] = None
+    input_ids: Optional[torch.LongTensor] = None
+
+
+# Copied from transformers.models.detr.modeling_detr.DetrFrozenBatchNorm2d with Detr->GroundingDino
+class GroundingDinoFrozenBatchNorm2d(nn.Module):
+    """
+    BatchNorm2d where the batch statistics and the affine parameters are fixed.
+
+    Copy-paste from torchvision.misc.ops with added eps before rsqrt, without which any other models than
+    torchvision.models.resnet[18,34,50,101] produce nans.
+    """
+
+    def __init__(self, n):  # n: size of each 1-D buffer registered below
+        super().__init__()
+        self.register_buffer("weight", torch.ones(n))  # all four tensors are buffers rather than nn.Parameter
+        self.register_buffer("bias", torch.zeros(n))
+        self.register_buffer("running_mean", torch.zeros(n))
+        self.register_buffer("running_var", torch.ones(n))
+
+    def _load_from_state_dict(
+        self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
+    ):
+        num_batches_tracked_key = prefix + "num_batches_tracked"
+        if num_batches_tracked_key in state_dict:
+            del state_dict[num_batches_tracked_key]  # this module registers no such buffer, so drop the key first
+
+        super()._load_from_state_dict(
+            state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
+        )
+
+    def forward(self, x):
+        # reshape the 1-D buffers to (1, n, 1, 1) up front so that they
+        # broadcast against a 4-D input along dim 1
+        weight = self.weight.reshape(1, -1, 1, 1)
+        bias = self.bias.reshape(1, -1, 1, 1)
+        running_var = self.running_var.reshape(1, -1, 1, 1)
+        running_mean = self.running_mean.reshape(1, -1, 1, 1)
+        epsilon = 1e-5
+        scale = weight * (running_var + epsilon).rsqrt()
+        bias = bias - running_mean * scale  # fold the mean shift into the bias term
+        return x * scale + bias  # == (x - running_mean) * rsqrt(running_var + epsilon) * self.weight + self.bias
+
+
+# Copied from transformers.models.detr.modeling_detr.replace_batch_norm with Detr->GroundingDino
+def replace_batch_norm(model):
+    r"""
+    Recursively replace all `torch.nn.BatchNorm2d` with `GroundingDinoFrozenBatchNorm2d`.
+
+    Args:
+        model (torch.nn.Module):
+            Module whose children are replaced in place; the function returns nothing.
+    """
+    for name, module in model.named_children():
+        if isinstance(module, nn.BatchNorm2d):
+            new_module = GroundingDinoFrozenBatchNorm2d(module.num_features)
+
+            if module.weight.device != torch.device("meta"):  # the copies are skipped for meta-device weights
+                new_module.weight.data.copy_(module.weight)
+                new_module.bias.data.copy_(module.bias)
+                new_module.running_mean.data.copy_(module.running_mean)
+                new_module.running_var.data.copy_(module.running_var)
+
+            model._modules[name] = new_module  # swap the child in the parent's module table
+
+        if len(list(module.children())) > 0:  # `module` is still the original child here, not `new_module`
+            replace_batch_norm(module)
+
+
+class GroundingDinoConvEncoder(nn.Module):
+    """
+    Convolutional backbone, using either the AutoBackbone API or one from the timm library.
+
+    nn.BatchNorm2d layers are replaced by GroundingDinoFrozenBatchNorm2d as defined above. `forward` returns one
+    `(feature_map, mask)` tuple per feature map produced by the backbone.
+    """
+
+    def __init__(self, config):
+        super().__init__()
+
+        self.config = config
+
+        if config.use_timm_backbone:
+            requires_backends(self, ["timm"])
+            backbone = create_model(
+                config.backbone,
+                pretrained=config.use_pretrained_backbone,
+                features_only=True,
+                **config.backbone_kwargs,  # NOTE(review): `**` needs a mapping here - confirm the config never holds None
+            )
+        else:
+            backbone = load_backbone(config)
+
+        # replace batch norm by frozen batch norm
+        with torch.no_grad():
+            replace_batch_norm(backbone)
+        self.model = backbone
+        self.intermediate_channel_sizes = (  # channel counts are read from a different attribute per backbone kind
+            self.model.feature_info.channels() if config.use_timm_backbone else self.model.channels
+        )
+
+        backbone_model_type = None
+        if config.backbone is not None:
+            backbone_model_type = config.backbone
+        elif config.backbone_config is not None:
+            backbone_model_type = config.backbone_config.model_type
+        else:
+            raise ValueError("Either `backbone` or `backbone_config` should be provided in the config")
+
+        if "resnet" in backbone_model_type:  # substring test on the backbone name / model type
+            for name, parameter in self.model.named_parameters():
+                if config.use_timm_backbone:
+                    if "layer2" not in name and "layer3" not in name and "layer4" not in name:
+                        parameter.requires_grad_(False)  # freeze everything outside layer2/layer3/layer4
+                else:
+                    if "stage.1" not in name and "stage.2" not in name and "stage.3" not in name:
+                        parameter.requires_grad_(False)  # freeze everything outside stage.1/stage.2/stage.3
+
+    # Copied from transformers.models.detr.modeling_detr.DetrConvEncoder.forward with Detr->GroundingDino
+    def forward(self, pixel_values: torch.Tensor, pixel_mask: torch.Tensor):
+        # send pixel_values through the model to get list of feature maps
+        features = self.model(pixel_values) if self.config.use_timm_backbone else self.model(pixel_values).feature_maps
+
+        out = []
+        for feature_map in features:
+            # downsample pixel_mask to match shape of corresponding feature_map
+            mask = nn.functional.interpolate(pixel_mask[None].float(), size=feature_map.shape[-2:]).to(torch.bool)[0]
+            out.append((feature_map, mask))  # `[None]` / `[0]` above add and then strip a leading dim for interpolate
+        return out
+
+
+# Copied from transformers.models.detr.modeling_detr.DetrConvModel with Detr->GroundingDino
+class GroundingDinoConvModel(nn.Module):
+    """
+    Computes a 2D position embedding per feature map of the conv encoder; returns them alongside, not added to, the maps.
+    """
+
+    def __init__(self, conv_encoder, position_embedding):
+        super().__init__()
+        self.conv_encoder = conv_encoder  # called as conv_encoder(pixel_values, pixel_mask) in forward
+        self.position_embedding = position_embedding  # called as position_embedding(feature_map, mask) in forward
+
+    def forward(self, pixel_values, pixel_mask):
+        # send pixel_values and pixel_mask through backbone to get list of (feature_map, pixel_mask) tuples
+        out = self.conv_encoder(pixel_values, pixel_mask)
+        pos = []
+        for feature_map, mask in out:
+            # position encoding, cast to the dtype of the feature map it belongs to
+            pos.append(self.position_embedding(feature_map, mask).to(feature_map.dtype))
+
+        return out, pos  # (list of (feature_map, mask) tuples, list of position embeddings), index-aligned
+
+
+class GroundingDinoSinePositionEmbedding(nn.Module):
+    """
+    This is a more standard version of the position embedding, very similar to the one used by the Attention is all you
+    need paper, generalized to work on images.
+    """
+
+    def __init__(self, config):
+        super().__init__()
+        self.embedding_dim = config.d_model // 2  # channels per axis; the y and x halves are concatenated in forward
+        self.temperature = config.positional_embedding_temperature
+        self.scale = 2 * math.pi
+
+    def forward(self, pixel_values, pixel_mask):  # pixel_values is only read for its device
+        y_embed = pixel_mask.cumsum(1, dtype=torch.float32)  # cumulative sum of the mask along dim 1
+        x_embed = pixel_mask.cumsum(2, dtype=torch.float32)  # cumulative sum of the mask along dim 2
+        eps = 1e-6
+        y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale  # divide by the last cumsum entry, scale by 2*pi
+        x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale
+
+        dim_t = torch.arange(self.embedding_dim, dtype=torch.float32, device=pixel_values.device)
+        dim_t = self.temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / self.embedding_dim)  # indices 2k and 2k+1 share an exponent
+
+        pos_x = x_embed[:, :, :, None] / dim_t
+        pos_y = y_embed[:, :, :, None] / dim_t
+        pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3)  # sin of even / cos of odd channels, interleaved
+        pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3)
+        pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)  # y half first, then the channel axis is moved to dim 1
+        return pos
+
+
+class GroundingDinoLearnedPositionEmbedding(nn.Module):
+    """
+    This module learns positional embeddings up to a fixed maximum size (50 rows and 50 columns, see `__init__`).
+    """
+
+    def __init__(self, config):
+        super().__init__()
+
+        embedding_dim = config.d_model // 2  # row and column halves are concatenated in forward
+        self.row_embeddings = nn.Embedding(50, embedding_dim)
+        self.column_embeddings = nn.Embedding(50, embedding_dim)
+
+    def forward(self, pixel_values, pixel_mask=None):  # pixel_mask is not referenced in this method
+        height, width = pixel_values.shape[-2:]
+        width_values = torch.arange(width, device=pixel_values.device)  # indices >= 50 are outside the embedding table
+        height_values = torch.arange(height, device=pixel_values.device)
+        x_emb = self.column_embeddings(width_values)
+        y_emb = self.row_embeddings(height_values)
+        pos = torch.cat([x_emb.unsqueeze(0).repeat(height, 1, 1), y_emb.unsqueeze(1).repeat(1, width, 1)], dim=-1)  # (height, width, 2 * embedding_dim)
+        pos = pos.permute(2, 0, 1)
+        pos = pos.unsqueeze(0)
+        pos = pos.repeat(pixel_values.shape[0], 1, 1, 1)  # same embedding repeated for every item along dim 0
+        return pos
+
+
+def build_position_encoding(config):  # returns the embedding module selected by `config.position_embedding_type`
+    if config.position_embedding_type == "sine":
+        position_embedding = GroundingDinoSinePositionEmbedding(config)
+    elif config.position_embedding_type == "learned":
+        position_embedding = GroundingDinoLearnedPositionEmbedding(config)
+    else:
+        raise ValueError(f"Not supported {config.position_embedding_type}")  # only "sine" and "learned" are handled
+
+    return position_embedding
+
+
+# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrMultiscaleDeformableAttention with DeformableDetr->GroundingDino, Deformable DETR->Grounding DINO
+class GroundingDinoMultiscaleDeformableAttention(nn.Module):
+    """
+    Multiscale deformable attention as proposed in Deformable DETR.
+    """
+
+    def __init__(self, config: GroundingDinoConfig, num_heads: int, n_points: int):
+        super().__init__()
+
+        self.attn = MultiScaleDeformableAttention()  # sampling/aggregation is delegated to this module in forward
+
+        if config.d_model % num_heads != 0:
+            raise ValueError(
+                f"embed_dim (d_model) must be divisible by num_heads, but got {config.d_model} and {num_heads}"
+            )
+        dim_per_head = config.d_model // num_heads
+        # check if dim_per_head is power of 2
+        if not ((dim_per_head & (dim_per_head - 1) == 0) and dim_per_head != 0):
+            warnings.warn(
+                "You'd better set embed_dim (d_model) in GroundingDinoMultiscaleDeformableAttention to make the"
+                " dimension of each attention head a power of 2 which is more efficient in the authors' CUDA"
+                " implementation."
+            )
+
+        self.im2col_step = 64  # forwarded unchanged as the last argument of self.attn
+
+        self.d_model = config.d_model
+        self.n_levels = config.num_feature_levels
+        self.n_heads = num_heads
+        self.n_points = n_points
+
+        self.sampling_offsets = nn.Linear(config.d_model, num_heads * self.n_levels * n_points * 2)  # 2 values per sampling point
+        self.attention_weights = nn.Linear(config.d_model, num_heads * self.n_levels * n_points)  # one weight per sampling point
+        self.value_proj = nn.Linear(config.d_model, config.d_model)
+        self.output_proj = nn.Linear(config.d_model, config.d_model)
+
+        self.disable_custom_kernels = config.disable_custom_kernels  # stored here; not read by forward below
+
+    def with_pos_embed(self, tensor: torch.Tensor, position_embeddings: Optional[Tensor]):
+        return tensor if position_embeddings is None else tensor + position_embeddings
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        encoder_hidden_states=None,  # defaults to None, but `.shape` is read unconditionally below
+        encoder_attention_mask=None,  # not referenced in this method body
+        position_embeddings: Optional[torch.Tensor] = None,
+        reference_points=None,
+        spatial_shapes=None,
+        spatial_shapes_list=None,
+        level_start_index=None,
+        output_attentions: bool = False,  # not referenced; attention_weights is always returned
+    ):
+        # add position embeddings to the hidden states before projecting to queries and keys
+        if position_embeddings is not None:
+            hidden_states = self.with_pos_embed(hidden_states, position_embeddings)
+
+        batch_size, num_queries, _ = hidden_states.shape
+        batch_size, sequence_length, _ = encoder_hidden_states.shape  # rebinds batch_size from the encoder states
+        # Ignore copy
+        if (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() != sequence_length:
+            raise ValueError(
+                "Make sure to align the spatial shapes with the sequence length of the encoder hidden states"
+            )
+
+        value = self.value_proj(encoder_hidden_states)
+        if attention_mask is not None:
+            # we invert the attention_mask: projected values are zeroed wherever the mask is False
+            value = value.masked_fill(~attention_mask[..., None], float(0))
+        value = value.view(batch_size, sequence_length, self.n_heads, self.d_model // self.n_heads)
+        sampling_offsets = self.sampling_offsets(hidden_states).view(
+            batch_size, num_queries, self.n_heads, self.n_levels, self.n_points, 2
+        )
+        attention_weights = self.attention_weights(hidden_states).view(
+            batch_size, num_queries, self.n_heads, self.n_levels * self.n_points
+        )
+        attention_weights = F.softmax(attention_weights, -1).view(  # softmax over the joint (levels * points) axis
+            batch_size, num_queries, self.n_heads, self.n_levels, self.n_points
+        )
+        # batch_size, num_queries, n_heads, n_levels, n_points, 2
+        num_coordinates = reference_points.shape[-1]
+        if num_coordinates == 2:
+            offset_normalizer = torch.stack([spatial_shapes[..., 1], spatial_shapes[..., 0]], -1)  # the two spatial_shapes columns, swapped
+            sampling_locations = (
+                reference_points[:, :, None, :, None, :]
+                + sampling_offsets / offset_normalizer[None, None, None, :, None, :]
+            )
+        elif num_coordinates == 4:
+            sampling_locations = (  # first two components are the base point; offsets are scaled by the last two
+                reference_points[:, :, None, :, None, :2]
+                + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5
+            )
+        else:
+            raise ValueError(f"Last dim of reference_points must be 2 or 4, but got {reference_points.shape[-1]}")
+
+        output = self.attn(
+            value,
+            spatial_shapes,
+            spatial_shapes_list,
+            level_start_index,
+            sampling_locations,
+            attention_weights,
+            self.im2col_step,
+        )
+
+        output = self.output_proj(output)
+
+        return output, attention_weights  # weights have shape (batch_size, num_queries, n_heads, n_levels, n_points)
+
+
+class GroundingDinoTextEnhancerLayer(nn.Module):
+    """Vanilla Transformer with text embeddings as input"""
+
+    def __init__(self, config):
+        super().__init__()
+        self.self_attn = GroundingDinoMultiheadAttention(
+            config, num_attention_heads=config.encoder_attention_heads // 2
+        )
+
+        # Implementation of Feedforward model
+        self.fc1 = nn.Linear(config.d_model, config.encoder_ffn_dim // 2)
+        self.fc2 = nn.Linear(config.encoder_ffn_dim // 2, config.d_model)
+
+        self.layer_norm_before = nn.LayerNorm(config.d_model, config.layer_norm_eps)
+        self.layer_norm_after = nn.LayerNorm(config.d_model, config.layer_norm_eps)
+
+        self.activation = ACT2FN[config.activation_function]
+        self.num_heads = config.encoder_attention_heads // 2  # must match num_attention_heads given to self_attn above
+        self.dropout = config.text_enhancer_dropout
+
+    def with_pos_embed(self, hidden_state: Tensor, position_embeddings: Optional[Tensor]):
+        return hidden_state if position_embeddings is None else hidden_state + position_embeddings
+
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        attention_masks: Optional[torch.BoolTensor] = None,
+        position_embeddings: Optional[torch.FloatTensor] = None,
+    ) -> tuple[torch.FloatTensor, torch.FloatTensor]:
+        """Text self-attention to enhance projection of text features generated by
+        the text encoder (AutoModel based on text_config) within GroundingDinoEncoderLayer
+
+        Args:
+            hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_dim)`):
+                Text features generated by the text encoder.
+            attention_masks (`torch.BoolTensor`, *optional*):
+                Text self-attention mask, or `None`. A 3-D per-batch mask is made additive below (1 -> 0, 0 -> dtype min).
+            position_embeddings (`torch.FloatTensor`, *optional*):
+                Position embeddings to be added to the hidden states.
+
+        Returns:
+            `tuple(torch.FloatTensor)` comprising two elements:
+            - **hidden_states** (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`) --
+                Output of the text self-attention layer.
+            - **attention_weights** (`torch.FloatTensor` of shape `(batch_size, num_heads, sequence_length,
+              sequence_length)`) --
+                Attention weights of the text self-attention layer.
+        """
+
+        # repeat attn mask; skipped entirely when no mask is given (the parameter defaults to None)
+        if attention_masks is not None and attention_masks.dim() == 3 and attention_masks.shape[0] == hidden_states.shape[0]:
+            # batch_size, num_queries, num_keys
+            attention_masks = attention_masks[:, None, :, :]
+            attention_masks = attention_masks.repeat(1, self.num_heads, 1, 1)
+
+            dtype = hidden_states.dtype
+            attention_masks = attention_masks.to(dtype=dtype)  # fp16 compatibility
+            attention_masks = (1.0 - attention_masks) * torch.finfo(dtype).min  # presumably added to the scores by self_attn - confirm
+
+        queries = keys = self.with_pos_embed(hidden_states, position_embeddings)  # values below get no position embedding
+        attention_output, attention_weights = self.self_attn(
+            queries=queries,
+            keys=keys,
+            values=hidden_states,
+            attention_mask=attention_masks,
+            output_attentions=True,
+        )
+        attention_output = nn.functional.dropout(attention_output, p=self.dropout, training=self.training)
+        hidden_states = hidden_states + attention_output
+        hidden_states = self.layer_norm_before(hidden_states)  # applied after the attention residual, despite the name
+
+        residual = hidden_states
+        hidden_states = self.activation(self.fc1(hidden_states))
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+        hidden_states = self.fc2(hidden_states)
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+        hidden_states = hidden_states + residual
+        hidden_states = self.layer_norm_after(hidden_states)
+
+        return hidden_states, attention_weights
+
+
+class GroundingDinoBiMultiHeadAttention(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+
+        vision_dim = text_dim = config.d_model
+        embed_dim = config.encoder_ffn_dim // 2
+        num_heads = config.encoder_attention_heads // 2
+        dropout = config.fusion_dropout
+
+        self.embed_dim = embed_dim
+        self.num_heads = num_heads
+        self.head_dim = embed_dim // num_heads
+        self.vision_dim = vision_dim
+        self.text_dim = text_dim
+
+        if self.head_dim * self.num_heads != self.embed_dim:
+            raise ValueError(
+                f"`embed_dim` must be divisible by `num_heads` (got `embed_dim`: {self.embed_dim} and `num_heads`: {self.num_heads})."
+            )
+        self.scale = self.head_dim ** (-0.5)
+        self.dropout = dropout
+
+        self.vision_proj = nn.Linear(self.vision_dim, self.embed_dim)  # vision side of the shared score matrix
+        self.text_proj = nn.Linear(self.text_dim, self.embed_dim)  # text side of the shared score matrix
+        self.values_vision_proj = nn.Linear(self.vision_dim, self.embed_dim)
+        self.values_text_proj = nn.Linear(self.text_dim, self.embed_dim)
+
+        self.out_vision_proj = nn.Linear(self.embed_dim, self.vision_dim)
+        self.out_text_proj = nn.Linear(self.embed_dim, self.text_dim)
+
+    def _reshape(self, tensor: torch.Tensor, seq_len: int, batch_size: int):
+        return tensor.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
+
+    def forward(
+        self,
+        vision_features: torch.FloatTensor,
+        text_features: torch.FloatTensor,
+        vision_attention_mask: Optional[torch.BoolTensor] = None,
+        text_attention_mask: Optional[torch.BoolTensor] = None,
+    ) -> tuple[tuple[torch.FloatTensor, torch.FloatTensor], tuple[torch.FloatTensor, torch.FloatTensor]]:
+        """Image-to-text and text-to-image cross-attention
+
+        Args:
+            vision_features (`torch.FloatTensor` of shape `(batch_size, vision_sequence_length, hidden_dim)`):
+                Projected flattened image features generated by the vision backbone.
+            text_features (`torch.FloatTensor` of shape `(batch_size, text_sequence_length, hidden_dim)`):
+                Projected text features generated by the text encoder.
+            vision_attention_mask (`torch.BoolTensor` of shape `(batch_size, vision_sequence_length)`, **optional**):
+                Mask over vision positions, applied to the text-to-image weights. True entries are filled with -inf.
+            text_attention_mask (`torch.BoolTensor` of shape `(batch_size, text_sequence_length)`, **optional**):
+                Mask over text positions, applied to the image-to-text weights. True entries are filled with -inf.
+
+        Returns:
+            `tuple(tuple(torch.FloatTensor), tuple(torch.FloatTensor))` where each inner tuple comprises an attention
+            output and weights:
+            - **vision_attn_output** (`torch.FloatTensor` of shape `(batch_size, vision_sequence_length, hidden_dim)`)
+              --
+                Output of the image-to-text cross-attention layer.
+            - **vision_attn_weights** (`torch.FloatTensor` of shape `(batch_size * num_heads, vision_sequence_length,
+              text_sequence_length)`) --
+                Attention weights of the image-to-text cross-attention layer.
+            - **text_attn_output** (`torch.FloatTensor` of shape `(batch_size, text_sequence_length, hidden_dim)`) --
+                Output of the text-to-image cross-attention layer.
+            - **text_attn_weights** (`torch.FloatTensor` of shape `(batch_size * num_heads, text_sequence_length,
+              vision_sequence_length)`) --
+                Attention weights of the text-to-image cross-attention layer.
+        """
+        batch_size, tgt_len, _ = vision_features.size()
+
+        vision_query_states = self.vision_proj(vision_features) * self.scale
+        vision_query_states = self._reshape(vision_query_states, tgt_len, batch_size)
+
+        text_key_states = self.text_proj(text_features)
+        text_key_states = self._reshape(text_key_states, -1, batch_size)
+
+        vision_value_states = self.values_vision_proj(vision_features)
+        vision_value_states = self._reshape(vision_value_states, -1, batch_size)
+
+        text_value_states = self.values_text_proj(text_features)
+        text_value_states = self._reshape(text_value_states, -1, batch_size)
+
+        proj_shape = (batch_size * self.num_heads, -1, self.head_dim)  # batch and head axes are merged from here on
+
+        vision_query_states = vision_query_states.view(*proj_shape)
+        text_key_states = text_key_states.view(*proj_shape)
+        vision_value_states = vision_value_states.view(*proj_shape)
+        text_value_states = text_value_states.view(*proj_shape)
+
+        src_len = text_key_states.size(1)
+        attn_weights = torch.bmm(vision_query_states, text_key_states.transpose(1, 2))  # bs*nhead, nimg, ntxt
+
+        if attn_weights.size() != (batch_size * self.num_heads, tgt_len, src_len):
+            raise ValueError(
+                f"Attention weights should be of size {(batch_size * self.num_heads, tgt_len, src_len)}, but is {attn_weights.size()}"
+            )
+
+        attn_weights = attn_weights - attn_weights.max()  # subtract the global maximum before clamping
+        # Do not increase -50000/50000, data type half has quite limited range
+        attn_weights = torch.clamp(attn_weights, min=-50000, max=50000)
+
+        attn_weights_transposed = attn_weights.transpose(1, 2)  # the same scores, viewed from the text side
+        text_attn_weights = attn_weights_transposed - torch.max(attn_weights_transposed, dim=-1, keepdim=True)[0]
+
+        # Do not increase -50000/50000, data type half has quite limited range
+        text_attn_weights = torch.clamp(text_attn_weights, min=-50000, max=50000)
+
+        # mask vision for language
+        if vision_attention_mask is not None:
+            vision_attention_mask = (
+                vision_attention_mask[:, None, None, :].repeat(1, self.num_heads, 1, 1).flatten(0, 1)
+            )
+            text_attn_weights.masked_fill_(vision_attention_mask, float("-inf"))
+
+        text_attn_weights = text_attn_weights.softmax(dim=-1)
+
+        # mask language for vision
+        if text_attention_mask is not None:
+            text_attention_mask = text_attention_mask[:, None, None, :].repeat(1, self.num_heads, 1, 1).flatten(0, 1)
+            attn_weights.masked_fill_(text_attention_mask, float("-inf"))
+        vision_attn_weights = attn_weights.softmax(dim=-1)
+
+        vision_attn_probs = F.dropout(vision_attn_weights, p=self.dropout, training=self.training)
+        text_attn_probs = F.dropout(text_attn_weights, p=self.dropout, training=self.training)
+
+        vision_attn_output = torch.bmm(vision_attn_probs, text_value_states)
+        text_attn_output = torch.bmm(text_attn_probs, vision_value_states)
+
+        if vision_attn_output.size() != (batch_size * self.num_heads, tgt_len, self.head_dim):
+            raise ValueError(
+                f"`vision_attn_output` should be of size {(batch_size * self.num_heads, tgt_len, self.head_dim)}, but is {vision_attn_output.size()}"
+            )
+
+        if text_attn_output.size() != (batch_size * self.num_heads, src_len, self.head_dim):
+            raise ValueError(
+                f"`text_attn_output` should be of size {(batch_size * self.num_heads, src_len, self.head_dim)}, but is {text_attn_output.size()}"
+            )
+
+        vision_attn_output = vision_attn_output.view(batch_size, self.num_heads, tgt_len, self.head_dim)
+        vision_attn_output = vision_attn_output.transpose(1, 2)
+        vision_attn_output = vision_attn_output.reshape(batch_size, tgt_len, self.embed_dim)
+
+        text_attn_output = text_attn_output.view(batch_size, self.num_heads, src_len, self.head_dim)
+        text_attn_output = text_attn_output.transpose(1, 2)
+        text_attn_output = text_attn_output.reshape(batch_size, src_len, self.embed_dim)
+
+        vision_attn_output = self.out_vision_proj(vision_attn_output)
+        text_attn_output = self.out_text_proj(text_attn_output)
+
+        return (vision_attn_output, vision_attn_weights), (text_attn_output, text_attn_weights)
+
+
+# Copied from transformers.models.beit.modeling_beit.drop_path
+def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
+    """
+    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
+
+    Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
+    however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
+    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
+    layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
+    argument.
+    """
+    if drop_prob == 0.0 or not training:
+        return input  # no-op path: the very same tensor object is returned
+    keep_prob = 1 - drop_prob
+    shape = (input.shape[0],) + (1,) * (input.ndim - 1)  # work with diff dim tensors, not just 2D ConvNets
+    random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)  # one draw per index of dim 0
+    random_tensor.floor_()  # binarize: 1 with probability keep_prob, otherwise 0
+    output = input.div(keep_prob) * random_tensor  # kept samples are rescaled by 1 / keep_prob
+    return output
+
+
+# Copied from transformers.models.beit.modeling_beit.BeitDropPath with Beit->GroundingDino
+class GroundingDinoDropPath(nn.Module):
+    """Module wrapper around `drop_path` (Stochastic Depth per sample, for the main path of residual blocks)."""
+
+    def __init__(self, drop_prob: Optional[float] = None) -> None:
+        super().__init__()
+        # Probability forwarded to `drop_path` on every call.
+        self.drop_prob = drop_prob
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        # `self.training` is the standard `nn.Module` mode flag, so dropping only happens in train mode.
+        return drop_path(hidden_states, drop_prob=self.drop_prob, training=self.training)
+
+    def extra_repr(self) -> str:
+        # Shown inside the module's `repr`.
+        return f"p={self.drop_prob}"
+
+
+class GroundingDinoFusionLayer(nn.Module):
+    """Pre-norm bidirectional vision/text fusion with learnable per-channel scaling of each attention update."""
+
+    def __init__(self, config):
+        super().__init__()
+        drop_path_rate = config.fusion_droppath
+        layer_scale_init = 1e-4
+
+        # pre layer norm
+        self.layer_norm_vision = nn.LayerNorm(config.d_model, config.layer_norm_eps)
+        self.layer_norm_text = nn.LayerNorm(config.d_model, config.layer_norm_eps)
+        self.attn = GroundingDinoBiMultiHeadAttention(config)
+
+        # stochastic depth is only instantiated when it can actually drop something
+        if drop_path_rate > 0.0:
+            self.drop_path = GroundingDinoDropPath(drop_path_rate)
+        else:
+            self.drop_path = nn.Identity()
+
+        # layer scale for training stability: one small learnable factor per channel and modality
+        self.vision_param = nn.Parameter(layer_scale_init * torch.ones(config.d_model), requires_grad=True)
+        self.text_param = nn.Parameter(layer_scale_init * torch.ones(config.d_model), requires_grad=True)
+
+    def forward(
+        self,
+        vision_features: torch.FloatTensor,
+        text_features: torch.FloatTensor,
+        attention_mask_vision: Optional[torch.BoolTensor] = None,
+        attention_mask_text: Optional[torch.BoolTensor] = None,
+    ) -> tuple[tuple[torch.FloatTensor, torch.FloatTensor], tuple[torch.FloatTensor, torch.FloatTensor]]:
+        """Fuse image and text features through the bidirectional attention module.
+
+        Both inputs are layer-normalized first; the scaled attention updates are then added to the
+        *normalized* features (not to the raw inputs).
+
+        Args:
+            vision_features (`torch.FloatTensor` of shape `(batch_size, vision_sequence_length, hidden_dim)`):
+                Projected flattened image features generated by the vision backbone.
+            text_features (`torch.FloatTensor` of shape `(batch_size, text_sequence_length, hidden_dim)`):
+                Projected text features generated by the text encoder.
+            attention_mask_vision (`torch.BoolTensor`, *optional*):
+                Forwarded to the attention module as `vision_attention_mask`.
+            attention_mask_text (`torch.BoolTensor`, *optional*):
+                Forwarded to the attention module as `text_attention_mask`.
+
+        Returns:
+            `tuple(tuple, tuple)`: `(vision_features, vision_attn_weights), (text_features, text_attn_weights)`,
+            i.e. for each modality the updated features followed by the attention weights returned by the
+            attention module.
+        """
+        normed_vision = self.layer_norm_vision(vision_features)
+        normed_text = self.layer_norm_text(text_features)
+
+        vision_branch, text_branch = self.attn(
+            normed_vision,
+            normed_text,
+            vision_attention_mask=attention_mask_vision,
+            text_attention_mask=attention_mask_text,
+        )
+        delta_v, vision_attn = vision_branch
+        delta_t, text_attn = text_branch
+
+        # Residual update, vision first and text second (the order matters for the drop-path RNG stream).
+        fused_vision = normed_vision + self.drop_path(self.vision_param * delta_v)
+        fused_text = normed_text + self.drop_path(self.text_param * delta_t)
+
+        return (fused_vision, vision_attn), (fused_text, text_attn)
+
+
+class GroundingDinoDeformableLayer(nn.Module):
+    """Post-norm encoder block: multi-scale deformable self-attention followed by a two-layer feed-forward network."""
+
+    def __init__(self, config: GroundingDinoConfig):
+        super().__init__()
+        hidden_dim = config.d_model
+        self.embed_dim = hidden_dim
+        self.self_attn = GroundingDinoMultiscaleDeformableAttention(
+            config, num_heads=config.encoder_attention_heads, n_points=config.encoder_n_points
+        )
+        self.self_attn_layer_norm = nn.LayerNorm(hidden_dim, config.layer_norm_eps)
+        self.dropout = config.dropout
+        self.activation_fn = ACT2FN[config.activation_function]
+        self.activation_dropout = config.activation_dropout
+        self.fc1 = nn.Linear(hidden_dim, config.encoder_ffn_dim)
+        self.fc2 = nn.Linear(config.encoder_ffn_dim, hidden_dim)
+        self.final_layer_norm = nn.LayerNorm(hidden_dim, config.layer_norm_eps)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: torch.Tensor,
+        position_embeddings: Optional[torch.Tensor] = None,
+        reference_points=None,
+        spatial_shapes=None,
+        spatial_shapes_list=None,
+        level_start_index=None,
+        output_attentions: bool = False,
+    ):
+        """
+        Args:
+            hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+                Input to the layer; also used as the attention module's `encoder_hidden_states`.
+            attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
+                Attention mask; also passed as `encoder_attention_mask`.
+            position_embeddings (`torch.FloatTensor`, *optional*):
+                Position embeddings forwarded to the attention module.
+            reference_points (`torch.FloatTensor`, *optional*):
+                Reference points.
+            spatial_shapes (`torch.LongTensor`, *optional*):
+                Spatial shapes of the backbone feature maps.
+            spatial_shapes_list (`list[tuple[int, int]]`, *optional*):
+                Spatial shapes of the backbone feature maps (but as list for export compatibility).
+            level_start_index (`torch.LongTensor`, *optional*):
+                Level start index.
+            output_attentions (`bool`, *optional*):
+                Forwarded to the attention module.
+
+        Returns:
+            `tuple`: `(hidden_states, attn_weights)` where `attn_weights` is whatever the attention module returned.
+        """
+        # Multi-scale deformable self-attention: the sequence attends to itself.
+        attn_output, attn_weights = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            encoder_hidden_states=hidden_states,
+            encoder_attention_mask=attention_mask,
+            position_embeddings=position_embeddings,
+            reference_points=reference_points,
+            spatial_shapes=spatial_shapes,
+            spatial_shapes_list=spatial_shapes_list,
+            level_start_index=level_start_index,
+            output_attentions=output_attentions,
+        )
+        attn_output = nn.functional.dropout(attn_output, p=self.dropout, training=self.training)
+        hidden_states = self.self_attn_layer_norm(hidden_states + attn_output)
+
+        # Feed-forward sub-block with its own residual connection.
+        ffn_output = self.fc1(hidden_states)
+        ffn_output = self.activation_fn(ffn_output)
+        ffn_output = nn.functional.dropout(ffn_output, p=self.activation_dropout, training=self.training)
+        ffn_output = self.fc2(ffn_output)
+        ffn_output = nn.functional.dropout(ffn_output, p=self.dropout, training=self.training)
+        hidden_states = self.final_layer_norm(hidden_states + ffn_output)
+
+        # During training, clamp to just below the dtype's maximum if any inf/NaN value showed up.
+        if self.training and (torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any()):
+            clamp_value = torch.finfo(hidden_states.dtype).max - 1000
+            hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
+
+        return hidden_states, attn_weights
+
+
+# Based on https://github.com/IDEA-Research/GroundingDINO/blob/2b62f419c292ca9c518daae55512fabc3fead4a4/groundingdino/models/GroundingDINO/utils.py#L24
+def get_sine_pos_embed(
+    pos_tensor: torch.Tensor, num_pos_feats: int = 128, temperature: int = 10000, exchange_xy: bool = True
+) -> Tensor:
+    """
+    Generate sine position embeddings from a position tensor.
+
+    Every scalar along the last axis of `pos_tensor` is expanded into `num_pos_feats` features that interleave
+    sine (even feature indices) and cosine (odd feature indices) of the scaled position. Any number of leading
+    dimensions is supported.
+
+    Args:
+        pos_tensor (torch.Tensor):
+            Tensor containing positions. Shape: [..., n].
+        num_pos_feats (`int`, *optional*, defaults to 128):
+            Projected shape for each float in the tensor. Must be even so the sine and cosine halves line up.
+        temperature (`int`, *optional*, defaults to 10000):
+            Temperature in the sine/cosine function.
+        exchange_xy (`bool`, *optional*, defaults to `True`):
+            Exchange pos x and pos y. For example, input tensor is [x,y], the results will be [pos(y), pos(x)].
+            Ignored when the last axis holds fewer than two coordinates.
+
+    Returns:
+        position_embeddings (torch.Tensor): shape: [..., n * num_pos_feats].
+    """
+    scale = 2 * math.pi
+    dim_t = torch.arange(num_pos_feats, dtype=torch.float32, device=pos_tensor.device)
+    # Consecutive feature pairs (2i, 2i + 1) share the same frequency.
+    dim_t = temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / num_pos_feats)
+
+    def sine_func(x: torch.Tensor):
+        sin_x = x * scale / dim_t
+        # Stack/flatten relative to the trailing axes so the input rank is not hard-coded to 3.
+        sin_x = torch.stack((sin_x[..., 0::2].sin(), sin_x[..., 1::2].cos()), dim=-1).flatten(-2)
+        return sin_x
+
+    pos_tensor = pos_tensor.split([1] * pos_tensor.shape[-1], dim=-1)
+    position_embeddings = [sine_func(x) for x in pos_tensor]
+    # Swapping needs at least two coordinates; a single-coordinate input has nothing to exchange.
+    if exchange_xy and len(position_embeddings) >= 2:
+        position_embeddings[0], position_embeddings[1] = position_embeddings[1], position_embeddings[0]
+    position_embeddings = torch.cat(position_embeddings, dim=-1)
+    return position_embeddings
+
+
+class GroundingDinoEncoderLayer(nn.Module):
+    """One encoder stage: vision/text fusion, then text enhancement, then deformable attention on the vision features."""
+
+    def __init__(self, config) -> None:
+        super().__init__()
+
+        self.d_model = config.d_model
+
+        self.text_enhancer_layer = GroundingDinoTextEnhancerLayer(config)
+        self.fusion_layer = GroundingDinoFusionLayer(config)
+        self.deformable_layer = GroundingDinoDeformableLayer(config)
+
+    def get_text_position_embeddings(
+        self,
+        text_features: Tensor,
+        text_position_embedding: Optional[torch.Tensor],
+        text_position_ids: Optional[torch.Tensor],
+    ) -> Tensor:
+        """Resolve the text position embedding.
+
+        Precedence: sine embeddings of `text_position_ids` when given; otherwise the supplied
+        `text_position_embedding`; otherwise sine embeddings of the positions `0..seq_length-1`.
+        """
+        batch_size, seq_length, _ = text_features.shape
+
+        if text_position_ids is not None:
+            # Explicit ids always win, even over a provided embedding.
+            return get_sine_pos_embed(text_position_ids[..., None], num_pos_feats=self.d_model, exchange_xy=False)
+
+        if text_position_embedding is None:
+            # Nothing supplied: use consecutive positions, replicated for every batch element.
+            positions = torch.arange(seq_length, device=text_features.device).float()
+            positions = positions[None, :, None].repeat(batch_size, 1, 1)
+            text_position_embedding = get_sine_pos_embed(positions, num_pos_feats=self.d_model, exchange_xy=False)
+
+        return text_position_embedding
+
+    def forward(
+        self,
+        vision_features: Tensor,
+        vision_position_embedding: Tensor,
+        spatial_shapes: Tensor,
+        spatial_shapes_list: list[tuple[int, int]],
+        level_start_index: Tensor,
+        key_padding_mask: Tensor,
+        reference_points: Tensor,
+        text_features: Optional[Tensor] = None,
+        text_attention_mask: Optional[Tensor] = None,
+        text_position_embedding: Optional[Tensor] = None,
+        text_self_attention_masks: Optional[Tensor] = None,
+        text_position_ids: Optional[Tensor] = None,
+    ):
+        """Run the three sub-layers in order and collect their attention outputs.
+
+        Returns:
+            `tuple`: `((vision_features, text_features), (vision_fused_attn, text_fused_attn, text_enhanced_attn,
+            vision_deformable_attn))`.
+        """
+        text_position_embedding = self.get_text_position_embeddings(
+            text_features, text_position_embedding, text_position_ids
+        )
+
+        # 1) bidirectional fusion between the two modalities
+        fused_vision, fused_text = self.fusion_layer(
+            vision_features=vision_features,
+            text_features=text_features,
+            attention_mask_vision=key_padding_mask,
+            attention_mask_text=text_attention_mask,
+        )
+        vision_features, vision_fused_attn = fused_vision
+        text_features, text_fused_attn = fused_text
+
+        # 2) text enhancement
+        text_features, text_enhanced_attn = self.text_enhancer_layer(
+            hidden_states=text_features,
+            attention_masks=~text_self_attention_masks,  # note we use ~ for mask here
+            position_embeddings=text_position_embedding,
+        )
+
+        # 3) deformable attention over the vision features; the padding mask is inverted before being passed on
+        vision_features, vision_deformable_attn = self.deformable_layer(
+            hidden_states=vision_features,
+            attention_mask=~key_padding_mask,
+            position_embeddings=vision_position_embedding,
+            reference_points=reference_points,
+            spatial_shapes=spatial_shapes,
+            spatial_shapes_list=spatial_shapes_list,
+            level_start_index=level_start_index,
+        )
+
+        features = (vision_features, text_features)
+        attentions = (vision_fused_attn, text_fused_attn, text_enhanced_attn, vision_deformable_attn)
+        return features, attentions
+
+
+class GroundingDinoMultiheadAttention(nn.Module):
+    """Equivalent implementation of nn.MultiheadAttention with `batch_first=True`."""
+
+    def __init__(self, config, num_attention_heads=None):
+        super().__init__()
+        heads_divide_hidden = config.hidden_size % num_attention_heads == 0
+        if not heads_divide_hidden and not hasattr(config, "embedding_size"):
+            raise ValueError(
+                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
+                f"heads ({num_attention_heads})"
+            )
+
+        self.num_attention_heads = num_attention_heads
+        self.attention_head_size = int(config.hidden_size / num_attention_heads)
+        self.all_head_size = self.num_attention_heads * self.attention_head_size
+
+        # Input projections for queries, keys and values.
+        self.query = nn.Linear(config.hidden_size, self.all_head_size)
+        self.key = nn.Linear(config.hidden_size, self.all_head_size)
+        self.value = nn.Linear(config.hidden_size, self.all_head_size)
+
+        self.out_proj = nn.Linear(config.hidden_size, config.hidden_size)
+
+        self.dropout = nn.Dropout(config.attention_dropout)
+
+    def forward(
+        self,
+        queries: torch.Tensor,
+        keys: torch.Tensor,
+        values: torch.Tensor,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = False,
+    ) -> tuple[torch.Tensor]:
+        """Scaled dot-product attention over batch-first inputs.
+
+        Args:
+            queries (`torch.Tensor`): 3-D tensor whose first dimension is the batch.
+            keys (`torch.Tensor`): Input to the key projection.
+            values (`torch.Tensor`): Input to the value projection.
+            attention_mask (`torch.FloatTensor`, *optional*): Added to the scaled attention scores before the softmax.
+            output_attentions (`bool`, *optional*): Whether to also return the attention probabilities.
+
+        Returns:
+            `tuple`: `(context,)`, or `(context, attention_probs)` when `output_attentions` is truthy.
+        """
+        batch_size, _, _ = queries.shape
+
+        def split_heads(projected: torch.Tensor) -> torch.Tensor:
+            # (batch, seq, all_head_size) -> (batch, heads, seq, head_size)
+            per_head = projected.view(batch_size, -1, self.num_attention_heads, self.attention_head_size)
+            return per_head.transpose(1, 2)
+
+        query_layer = split_heads(self.query(queries))
+        key_layer = split_heads(self.key(keys))
+        value_layer = split_heads(self.value(values))
+
+        # Raw attention scores: dot product between "query" and "key", scaled by sqrt(head size).
+        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
+        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
+        if attention_mask is not None:
+            # The mask is additive and applied before the softmax.
+            attention_scores = attention_scores + attention_mask
+
+        # Normalize the attention scores to probabilities.
+        attention_probs = nn.functional.softmax(attention_scores, dim=-1)
+
+        # This is actually dropping out entire tokens to attend to, which might
+        # seem a bit unusual, but is taken from the original Transformer paper.
+        attention_probs = self.dropout(attention_probs)
+
+        context_layer = torch.matmul(attention_probs, value_layer)
+
+        # Merge the heads back: (batch, heads, seq, head_size) -> (batch, seq, all_head_size)
+        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
+        merged_shape = context_layer.size()[:-2] + (self.all_head_size,)
+        context_layer = self.out_proj(context_layer.view(merged_shape))
+
+        if output_attentions:
+            return (context_layer, attention_probs)
+        return (context_layer,)
+
+
+class GroundingDinoDecoderLayer(nn.Module):
+    """Post-norm decoder block: query self-attention, text cross-attention, deformable vision cross-attention, FFN."""
+
+    def __init__(self, config: GroundingDinoConfig):
+        super().__init__()
+        hidden_dim = config.d_model
+        num_heads = config.decoder_attention_heads
+        norm_eps = config.layer_norm_eps
+        self.embed_dim = hidden_dim
+
+        # self-attention
+        self.self_attn = GroundingDinoMultiheadAttention(config, num_attention_heads=num_heads)
+
+        self.dropout = config.dropout
+        self.activation_fn = ACT2FN[config.activation_function]
+        self.activation_dropout = config.activation_dropout
+
+        self.self_attn_layer_norm = nn.LayerNorm(hidden_dim, norm_eps)
+        # cross-attention text
+        self.encoder_attn_text = GroundingDinoMultiheadAttention(config, num_attention_heads=num_heads)
+        self.encoder_attn_text_layer_norm = nn.LayerNorm(hidden_dim, norm_eps)
+        # cross-attention
+        self.encoder_attn = GroundingDinoMultiscaleDeformableAttention(
+            config,
+            num_heads=num_heads,
+            n_points=config.decoder_n_points,
+        )
+        self.encoder_attn_layer_norm = nn.LayerNorm(hidden_dim, norm_eps)
+        # feedforward neural networks
+        self.fc1 = nn.Linear(hidden_dim, config.decoder_ffn_dim)
+        self.fc2 = nn.Linear(config.decoder_ffn_dim, hidden_dim)
+        self.final_layer_norm = nn.LayerNorm(hidden_dim, norm_eps)
+
+    def with_pos_embed(self, tensor: torch.Tensor, position_embeddings: Optional[Tensor]):
+        """Return `tensor` plus `position_embeddings`, or `tensor` unchanged when no embeddings are given."""
+        if position_embeddings is None:
+            return tensor
+        return tensor + position_embeddings
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: Optional[torch.Tensor] = None,
+        reference_points=None,
+        spatial_shapes=None,
+        spatial_shapes_list=None,
+        level_start_index=None,
+        vision_encoder_hidden_states: Optional[torch.Tensor] = None,
+        vision_encoder_attention_mask: Optional[torch.Tensor] = None,
+        text_encoder_hidden_states: Optional[torch.Tensor] = None,
+        text_encoder_attention_mask: Optional[torch.Tensor] = None,
+        self_attn_mask: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = False,
+    ):
+        """Apply the four sub-blocks, each followed by dropout, a residual connection and a layer norm.
+
+        Returns:
+            `tuple`: `(hidden_states,)`, extended with `(self_attn_weights, text_cross_attn_weights,
+            cross_attn_weights)` when `output_attentions` is truthy.
+        """
+        # Self Attention: position embeddings go into queries and keys only, values stay untouched.
+        queries = keys = self.with_pos_embed(hidden_states, position_embeddings)
+        self_attn_output, self_attn_weights = self.self_attn(
+            queries=queries,
+            keys=keys,
+            values=hidden_states,
+            attention_mask=self_attn_mask,
+            output_attentions=True,
+        )
+        self_attn_output = nn.functional.dropout(self_attn_output, p=self.dropout, training=self.training)
+        hidden_states = self.self_attn_layer_norm(hidden_states + self_attn_output)
+
+        # Cross-Attention Text: the text features act as both keys and values.
+        text_attn_output, text_cross_attn_weights = self.encoder_attn_text(
+            queries=self.with_pos_embed(hidden_states, position_embeddings),
+            keys=text_encoder_hidden_states,
+            values=text_encoder_hidden_states,
+            attention_mask=text_encoder_attention_mask,
+            output_attentions=True,
+        )
+        text_attn_output = nn.functional.dropout(text_attn_output, p=self.dropout, training=self.training)
+        hidden_states = self.encoder_attn_text_layer_norm(hidden_states + text_attn_output)
+
+        # Cross-Attention: deformable attention over the vision encoder output.
+        vision_attn_output, cross_attn_weights = self.encoder_attn(
+            hidden_states=hidden_states,
+            attention_mask=vision_encoder_attention_mask,
+            encoder_hidden_states=vision_encoder_hidden_states,
+            encoder_attention_mask=vision_encoder_attention_mask,
+            position_embeddings=position_embeddings,
+            reference_points=reference_points,
+            spatial_shapes=spatial_shapes,
+            spatial_shapes_list=spatial_shapes_list,
+            level_start_index=level_start_index,
+            output_attentions=output_attentions,
+        )
+        vision_attn_output = nn.functional.dropout(vision_attn_output, p=self.dropout, training=self.training)
+        hidden_states = self.encoder_attn_layer_norm(hidden_states + vision_attn_output)
+
+        # Fully Connected
+        ffn_output = self.fc1(hidden_states)
+        ffn_output = self.activation_fn(ffn_output)
+        ffn_output = nn.functional.dropout(ffn_output, p=self.activation_dropout, training=self.training)
+        ffn_output = self.fc2(ffn_output)
+        ffn_output = nn.functional.dropout(ffn_output, p=self.dropout, training=self.training)
+        hidden_states = self.final_layer_norm(hidden_states + ffn_output)
+
+        if output_attentions:
+            return (hidden_states, self_attn_weights, text_cross_attn_weights, cross_attn_weights)
+        return (hidden_states,)
+
+
+class GroundingDinoContrastiveEmbedding(nn.Module):
+    """Scores vision states against text token states and pads the scores to `config.max_text_len` with `-inf`."""
+
+    def __init__(self, config):
+        super().__init__()
+        # Fixed width of the last output dimension.
+        self.max_text_len = config.max_text_len
+
+    def forward(
+        self,
+        vision_hidden_state: torch.FloatTensor,
+        text_hidden_state: torch.FloatTensor,
+        text_token_mask: torch.BoolTensor,
+    ) -> torch.FloatTensor:
+        """
+        Args:
+            vision_hidden_state (`torch.FloatTensor`):
+                Vision states; matrix-multiplied with the transposed text states, so the last dimensions must match.
+            text_hidden_state (`torch.FloatTensor`):
+                Text token states.
+            text_token_mask (`torch.BoolTensor` of shape `(batch_size, text_len)`):
+                `True` for tokens to keep; positions where it is `False` are filled with `-inf`.
+
+        Returns:
+            `torch.FloatTensor`: Similarity scores whose last dimension is `max_text_len`. Entries for masked tokens
+            and for the padding beyond `text_len` are `-inf`. The dtype and device follow the computed scores.
+        """
+        output = vision_hidden_state @ text_hidden_state.transpose(-1, -2)
+        output = output.masked_fill(~text_token_mask[:, None, :], float("-inf"))
+
+        # padding to max_text_len; create the buffer in the scores' own dtype so half/double precision
+        # results are not silently converted to the default dtype
+        new_output = torch.full(
+            (*output.shape[:-1], self.max_text_len), float("-inf"), dtype=output.dtype, device=output.device
+        )
+        new_output[..., : output.shape[-1]] = output
+
+        return new_output
+
+
+@auto_docstring
+class GroundingDinoPreTrainedModel(PreTrainedModel):
+    # Base class for the Grounding DINO models in this file: declares the config type and supplies the
+    # weight-initialization and gradient-checkpointing hooks.
+    config: GroundingDinoConfig
+    base_model_prefix = "model"
+    main_input_name = "pixel_values"
+
+    def _init_weights(self, module):
+        """Initialize the parameters of `module` based on its type.
+
+        The first matching branch of the `isinstance` chain wins, so the model-specific modules are tested
+        before the generic `nn.Linear` / `nn.LayerNorm` / `nn.Embedding` fallbacks. The two trailing `hasattr`
+        checks run for every module regardless of which branch matched.
+        """
+        std = self.config.init_std
+
+        if isinstance(module, GroundingDinoLearnedPositionEmbedding):
+            nn.init.uniform_(module.row_embeddings.weight)
+            nn.init.uniform_(module.column_embeddings.weight)
+        elif isinstance(module, GroundingDinoMultiscaleDeformableAttention):
+            # Sampling offsets: zero weights, and a bias that encodes one fixed direction per head.
+            nn.init.constant_(module.sampling_offsets.weight.data, 0.0)
+            default_dtype = torch.get_default_dtype()
+            # One angle per head, evenly spaced over the full circle.
+            thetas = torch.arange(module.n_heads, dtype=torch.int64).to(default_dtype) * (
+                2.0 * math.pi / module.n_heads
+            )
+            grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)
+            # Divide each (cos, sin) pair by its largest absolute component, then replicate the direction
+            # for every level and sampling point.
+            grid_init = (
+                (grid_init / grid_init.abs().max(-1, keepdim=True)[0])
+                .view(module.n_heads, 1, 1, 2)
+                .repeat(1, module.n_levels, module.n_points, 1)
+            )
+            # The i-th sampling point is scaled by (i + 1) along its head's direction.
+            for i in range(module.n_points):
+                grid_init[:, :, i, :] *= i + 1
+            with torch.no_grad():
+                module.sampling_offsets.bias = nn.Parameter(grid_init.view(-1))
+            nn.init.constant_(module.attention_weights.weight.data, 0.0)
+            nn.init.constant_(module.attention_weights.bias.data, 0.0)
+            nn.init.xavier_uniform_(module.value_proj.weight.data)
+            nn.init.constant_(module.value_proj.bias.data, 0.0)
+            nn.init.xavier_uniform_(module.output_proj.weight.data)
+            nn.init.constant_(module.output_proj.bias.data, 0.0)
+        elif isinstance(module, GroundingDinoBiMultiHeadAttention):
+            # Every projection of the bidirectional attention: Xavier-uniform weights, zero biases.
+            nn.init.xavier_uniform_(module.vision_proj.weight)
+            module.vision_proj.bias.data.fill_(0)
+            nn.init.xavier_uniform_(module.text_proj.weight)
+            module.text_proj.bias.data.fill_(0)
+            nn.init.xavier_uniform_(module.values_vision_proj.weight)
+            module.values_vision_proj.bias.data.fill_(0)
+            nn.init.xavier_uniform_(module.values_text_proj.weight)
+            module.values_text_proj.bias.data.fill_(0)
+            nn.init.xavier_uniform_(module.out_vision_proj.weight)
+            module.out_vision_proj.bias.data.fill_(0)
+            nn.init.xavier_uniform_(module.out_text_proj.weight)
+            module.out_text_proj.bias.data.fill_(0)
+        elif isinstance(module, GroundingDinoFusionLayer):
+            # Same value the fusion layer's constructor uses for its scaling parameters.
+            module.vision_param.data.fill_(1e-4)
+            module.text_param.data.fill_(1e-4)
+        elif isinstance(module, (nn.Linear, nn.Conv2d, nn.BatchNorm2d)):
+            # Slightly different from the TF version which uses truncated_normal for initialization
+            # cf https://github.com/pytorch/pytorch/pull/5617
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, (nn.LayerNorm, nn.GroupNorm)):
+            module.weight.data.fill_(1.0)
+            module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=std)
+            # Keep the padding row at zero after the random initialization.
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+        elif isinstance(module, GroundingDinoMLPPredictionHead):
+            # Zero the last layer of the prediction head.
+            nn.init.constant_(module.layers[-1].weight.data, 0)
+            nn.init.constant_(module.layers[-1].bias.data, 0)
+
+        # Attribute-based initialization, independent of the module type matched above.
+        if hasattr(module, "reference_points") and not self.config.two_stage:
+            nn.init.xavier_uniform_(module.reference_points.weight.data, gain=1.0)
+            nn.init.constant_(module.reference_points.bias.data, 0.0)
+        if hasattr(module, "level_embed"):
+            nn.init.normal_(module.level_embed)
+
+    def _set_gradient_checkpointing(self, module, value=False):
+        """Set `gradient_checkpointing` to `value` on `module`, but only if it is a `GroundingDinoDecoder`."""
+        if isinstance(module, GroundingDinoDecoder):
+            module.gradient_checkpointing = value
+
+
+class GroundingDinoEncoder(GroundingDinoPreTrainedModel):
+    """
+    Transformer encoder consisting of *config.encoder_layers* deformable attention layers. Each layer is a
+    [`GroundingDinoEncoderLayer`].
+
+    The encoder updates the flattened multi-scale feature maps through multiple deformable attention layers.
+
+    Args:
+        config: GroundingDinoConfig
+    """
+
+    def __init__(self, config: GroundingDinoConfig):
+        super().__init__(config)
+
+        self.dropout = config.dropout
+        # One `GroundingDinoEncoderLayer` per configured encoder layer, each built from the same config.
+        encoder_layers = [GroundingDinoEncoderLayer(config) for _ in range(config.encoder_layers)]
+        self.layers = nn.ModuleList(encoder_layers)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @staticmethod
+    def get_reference_points(spatial_shapes, valid_ratios, device):
+        """
+        Build the reference points of every feature map location, for every feature level.
+
+        For each level, the pixel-center coordinates `0.5, 1.5, ...` are divided by the valid extent of that level
+        (`valid_ratio * size`); the points of all levels are concatenated and finally multiplied by the valid ratios
+        of every level.
+
+        Args:
+            spatial_shapes (`torch.LongTensor` of shape `(num_feature_levels, 2)`):
+                Spatial shapes `(height, width)` of each feature map.
+            valid_ratios (`torch.FloatTensor` of shape `(batch_size, num_feature_levels, 2)`):
+                Valid ratios of each feature map; index 0 of the last axis is used for x, index 1 for y.
+            device (`torch.device`):
+                Device on which to create the tensors.
+        Returns:
+            `torch.FloatTensor` of shape `(batch_size, num_queries, num_feature_levels, 2)`
+        """
+        per_level_points = []
+        for level, (height, width) in enumerate(spatial_shapes):
+            row_centers = torch.linspace(0.5, height - 0.5, height, dtype=torch.float32, device=device)
+            col_centers = torch.linspace(0.5, width - 0.5, width, dtype=torch.float32, device=device)
+            ref_y, ref_x = meshgrid(row_centers, col_centers, indexing="ij")
+
+            # TODO: valid_ratios could be useless here. check https://github.com/fundamentalvision/Deformable-DETR/issues/36
+            valid_height = valid_ratios[:, None, level, 1] * height
+            valid_width = valid_ratios[:, None, level, 0] * width
+            ref_y = ref_y.reshape(-1)[None] / valid_height
+            ref_x = ref_x.reshape(-1)[None] / valid_width
+
+            # Points are stored as (x, y).
+            per_level_points.append(torch.stack((ref_x, ref_y), -1))
+
+        reference_points = torch.cat(per_level_points, 1)
+        # Broadcast every point against the valid ratios of all levels.
+        return reference_points[:, :, None] * valid_ratios[:, None]
+
+    def forward(
+        self,
+        vision_features: Tensor,
+        vision_attention_mask: Tensor,
+        vision_position_embedding: Tensor,
+        spatial_shapes: Tensor,
+        spatial_shapes_list: list[tuple[int, int]],
+        level_start_index: Tensor,
+        valid_ratios=None,
+        text_features: Optional[Tensor] = None,
+        text_attention_mask: Optional[Tensor] = None,
+        text_position_embedding: Optional[Tensor] = None,
+        text_self_attention_masks: Optional[Tensor] = None,
+        text_position_ids: Optional[Tensor] = None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+    ):
+        r"""
+        Args:
+            vision_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+                Flattened feature map (output of the backbone + projection layer) that is passed to the encoder.
+            vision_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Mask to avoid performing attention on padding pixel features. Mask values selected in `[0, 1]`:
+                - 0 for pixel features that are real (i.e. **not masked**),
+                - 1 for pixel features that are padding (i.e. **masked**).
+                [What are attention masks?](../glossary#attention-mask)
+            vision_position_embedding (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+                Position embeddings that are added to the queries and keys in each self-attention layer.
+            spatial_shapes (`torch.LongTensor` of shape `(num_feature_levels, 2)`):
+                Spatial shapes of each feature map.
+            spatial_shapes_list (`list[tuple[int, int]]`):
+                Spatial shapes of each feature map (but as list for export compatibility).
+            level_start_index (`torch.LongTensor` of shape `(num_feature_levels)`):
+                Starting index of each feature map.
+            valid_ratios (`torch.FloatTensor` of shape `(batch_size, num_feature_levels, 2)`):
+                Ratio of valid area in each feature level.
+            text_features (`torch.FloatTensor` of shape `(batch_size, text_seq_len, hidden_size)`):
+                Flattened text features that are passed to the encoder.
+            text_attention_mask (`torch.Tensor` of shape `(batch_size, text_seq_len)`, *optional*):
+                Mask to avoid performing attention on padding text features. Mask values selected in `[0, 1]`:
+                - 0 for text features that are real (i.e. **not masked**),
+                - 1 for text features that are padding (i.e. **masked**).
+                [What are attention masks?](../glossary#attention-mask)
+            text_position_embedding (`torch.FloatTensor` of shape `(batch_size, text_seq_len)`):
+                Position embeddings that are added to the queries and keys in each self-attention layer.
+            text_self_attention_masks (`torch.BoolTensor` of shape `(batch_size, text_seq_len, text_seq_len)`):
+                Masks to avoid performing attention between padding text features. Mask values selected in `[0, 1]`:
+                - 1 for text features that are real (i.e. **not masked**),
+                - 0 for text features that are padding (i.e. **masked**).
+            text_position_ids (`torch.LongTensor` of shape `(batch_size, num_queries)`):
+                Position ids for text features.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            output_hidden_states (`bool`, *optional*):
+                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+                for more detail.
+            return_dict (`bool`, *optional*):
+                Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        reference_points = self.get_reference_points(spatial_shapes, valid_ratios, device=vision_features.device)
+
+        encoder_vision_states = () if output_hidden_states else None
+        encoder_text_states = () if output_hidden_states else None
+        all_attns = () if output_attentions else None
+        all_attn_fused_text = () if output_attentions else None
+        all_attn_fused_vision = () if output_attentions else None
+        all_attn_enhanced_text = () if output_attentions else None
+        all_attn_deformable = () if output_attentions else None
+        for i, encoder_layer in enumerate(self.layers):
+            if output_hidden_states:
+                encoder_vision_states += (vision_features,)
+                encoder_text_states += (text_features,)
+
+            (vision_features, text_features), attentions = encoder_layer(
+                vision_features=vision_features,
+                vision_position_embedding=vision_position_embedding,
+                spatial_shapes=spatial_shapes,
+                spatial_shapes_list=spatial_shapes_list,
+                level_start_index=level_start_index,
+                key_padding_mask=vision_attention_mask,
+                reference_points=reference_points,
+                text_features=text_features,
+                text_attention_mask=text_attention_mask,
+                text_position_embedding=text_position_embedding,
+                text_self_attention_masks=text_self_attention_masks,
+                text_position_ids=text_position_ids,
+            )
+
+            if output_attentions:
+                all_attn_fused_vision += (attentions[0],)
+                all_attn_fused_text += (attentions[1],)
+                all_attn_enhanced_text += (attentions[2],)
+                all_attn_deformable += (attentions[3],)
+
+        if output_hidden_states:
+            encoder_vision_states += (vision_features,)
+            encoder_text_states += (text_features,)
+
+        if output_attentions:
+            all_attns = (all_attn_fused_vision, all_attn_fused_text, all_attn_enhanced_text, all_attn_deformable)
+
+        if not return_dict:
+            enc_outputs = [vision_features, text_features, encoder_vision_states, encoder_text_states, all_attns]
+            return tuple(v for v in enc_outputs if v is not None)
+        return GroundingDinoEncoderOutput(
+            last_hidden_state_vision=vision_features,
+            last_hidden_state_text=text_features,
+            vision_hidden_states=encoder_vision_states,
+            text_hidden_states=encoder_text_states,
+            attentions=all_attns,
+        )
+
+
+class GroundingDinoDecoder(GroundingDinoPreTrainedModel):
+    """
+    Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`GroundingDinoDecoderLayer`].
+
+    The decoder updates the query embeddings through multiple self-attention and cross-attention layers.
+
+    Some tweaks for Grounding DINO:
+
+    - `position_embeddings`, `reference_points`, `spatial_shapes` and `valid_ratios` are added to the forward pass.
+    - it also returns a stack of intermediate outputs and reference points from all decoding layers.
+
+    Args:
+        config: GroundingDinoConfig
+    """
+
+    def __init__(self, config: GroundingDinoConfig):
+        super().__init__(config)
+
+        self.dropout = config.dropout
+        self.layer_norm = nn.LayerNorm(config.d_model, config.layer_norm_eps)
+        self.layers = nn.ModuleList([GroundingDinoDecoderLayer(config) for _ in range(config.decoder_layers)])
+        self.reference_points_head = GroundingDinoMLPPredictionHead(
+            config.query_dim // 2 * config.d_model, config.d_model, config.d_model, 2
+        )
+        self.gradient_checkpointing = False
+
+        # hack implementation for iterative bounding box refinement as in two-stage Deformable DETR
+        self.bbox_embed = None
+        self.class_embed = None
+        self.query_scale = None
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def forward(
+        self,
+        inputs_embeds,
+        vision_encoder_hidden_states,
+        vision_encoder_attention_mask=None,
+        text_encoder_hidden_states=None,
+        text_encoder_attention_mask=None,
+        reference_points=None,
+        spatial_shapes=None,
+        spatial_shapes_list=None,
+        level_start_index=None,
+        valid_ratios=None,
+        self_attn_mask=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+    ):
+        r"""
+        Args:
+            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`):
+                The query embeddings that are passed into the decoder.
+            vision_encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+                Last hidden state from encoder related to vision feature map.
+            vision_encoder_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Mask to avoid performing attention on padding pixel features. Mask values selected in `[0, 1]`:
+                - 1 for pixel features that are real (i.e. **not masked**),
+                - 0 for pixel features that are padding (i.e. **masked**).
+            text_encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, text_seq_len, hidden_size)`):
+                Last hidden state from encoder related to text features.
+            text_encoder_attention_mask (`torch.Tensor` of shape `(batch_size, text_seq_len)`, *optional*):
+                Mask to avoid performing attention on padding text features. Mask values selected in `[0, 1]`:
+                - 0 for text features that are real (i.e. **not masked**),
+                - 1 for text features that are padding (i.e. **masked**).
+            reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)` is `as_two_stage` else `(batch_size, num_queries, 2)` or , *optional*):
+                Reference point in range `[0, 1]`, top-left (0,0), bottom-right (1, 1), including padding area.
+            spatial_shapes (`torch.FloatTensor` of shape `(num_feature_levels, 2)`):
+                Spatial shapes of the feature maps.
+            spatial_shapes_list (`list[tuple[int, int]]`):
+                Spatial shapes of the feature maps (but as list for export compatibility).
+            level_start_index (`torch.LongTensor` of shape `(num_feature_levels)`, *optional*):
+                Indexes for the start of each feature level. In range `[0, sequence_length]`.
+            valid_ratios (`torch.FloatTensor` of shape `(batch_size, num_feature_levels, 2)`, *optional*):
+                Ratio of valid area in each feature level.
+            self_attn_mask (`torch.BoolTensor` of shape `(batch_size, text_seq_len)`):
+                Masks to avoid performing self-attention between vision hidden state. Mask values selected in `[0, 1]`:
+                - 1 for queries that are real (i.e. **not masked**),
+                - 0 for queries that are padding (i.e. **masked**).
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            output_hidden_states (`bool`, *optional*):
+                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+                for more detail.
+            return_dict (`bool`, *optional*):
+                Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if inputs_embeds is not None:
+            hidden_states = inputs_embeds
+
+        # decoder layers
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+        all_attns = () if output_attentions else None
+        all_cross_attns_vision = () if (output_attentions and vision_encoder_hidden_states is not None) else None
+        all_cross_attns_text = () if (output_attentions and text_encoder_hidden_states is not None) else None
+        intermediate = ()
+        intermediate_reference_points = ()
+
+        if text_encoder_attention_mask is not None:
+            dtype = text_encoder_hidden_states.dtype
+
+            text_encoder_attention_mask = text_encoder_attention_mask[:, None, None, :]
+            text_encoder_attention_mask = text_encoder_attention_mask.repeat(
+                1, self.config.decoder_attention_heads, self.config.num_queries, 1
+            )
+            text_encoder_attention_mask = text_encoder_attention_mask.to(dtype=dtype)
+            text_encoder_attention_mask = text_encoder_attention_mask * torch.finfo(dtype).min
+
+        for idx, decoder_layer in enumerate(self.layers):
+            num_coordinates = reference_points.shape[-1]
+            if num_coordinates == 4:
+                reference_points_input = (
+                    reference_points[:, :, None] * torch.cat([valid_ratios, valid_ratios], -1)[:, None]
+                )
+            elif num_coordinates == 2:
+                reference_points_input = reference_points[:, :, None] * valid_ratios[:, None]
+            else:
+                raise ValueError("Last dim of reference_points must be 2 or 4, but got {reference_points.shape[-1]}")
+            query_pos = get_sine_pos_embed(reference_points_input[:, :, 0, :], num_pos_feats=self.config.d_model // 2)
+            query_pos = self.reference_points_head(query_pos)
+
+            # In original implementation they apply layer norm before outputting intermediate hidden states
+            # Though that's not through between layers so the layers use as input the output of the previous layer
+            # without layer norm
+            if output_hidden_states:
+                all_hidden_states += (self.layer_norm(hidden_states),)
+
+            if self.gradient_checkpointing and self.training:
+
+                def create_custom_forward(module):
+                    def custom_forward(*inputs):
+                        return module(*inputs, output_attentions)
+
+                    return custom_forward
+
+                layer_outputs = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(decoder_layer),
+                    hidden_states,
+                    query_pos,
+                    reference_points_input,
+                    spatial_shapes,
+                    level_start_index,
+                    vision_encoder_hidden_states,
+                    vision_encoder_attention_mask,
+                    text_encoder_hidden_states,
+                    text_encoder_attention_mask,
+                    self_attn_mask,
+                    None,
+                )
+            else:
+                layer_outputs = decoder_layer(
+                    hidden_states=hidden_states,
+                    position_embeddings=query_pos,
+                    reference_points=reference_points_input,
+                    spatial_shapes=spatial_shapes,
+                    spatial_shapes_list=spatial_shapes_list,
+                    level_start_index=level_start_index,
+                    vision_encoder_hidden_states=vision_encoder_hidden_states,
+                    vision_encoder_attention_mask=vision_encoder_attention_mask,
+                    text_encoder_hidden_states=text_encoder_hidden_states,
+                    text_encoder_attention_mask=text_encoder_attention_mask,
+                    self_attn_mask=self_attn_mask,
+                    output_attentions=output_attentions,
+                )
+
+            hidden_states = layer_outputs[0]
+
+            # hack implementation for iterative bounding box refinement
+            if self.bbox_embed is not None:
+                tmp = self.bbox_embed[idx](hidden_states)
+                num_coordinates = reference_points.shape[-1]
+                if num_coordinates == 4:
+                    new_reference_points = tmp + torch.special.logit(reference_points, eps=1e-5)
+                    new_reference_points = new_reference_points.sigmoid()
+                elif num_coordinates == 2:
+                    new_reference_points = tmp
+                    new_reference_points[..., :2] = tmp[..., :2] + torch.special.logit(reference_points, eps=1e-5)
+                    new_reference_points = new_reference_points.sigmoid()
+                else:
+                    raise ValueError(
+                        f"Last dim of reference_points must be 2 or 4, but got {reference_points.shape[-1]}"
+                    )
+                reference_points = new_reference_points.detach()
+
+            intermediate += (self.layer_norm(hidden_states),)
+            intermediate_reference_points += (reference_points,)
+
+            if output_attentions:
+                all_self_attns += (layer_outputs[1],)
+
+                if text_encoder_hidden_states is not None:
+                    all_cross_attns_text += (layer_outputs[2],)
+
+                if vision_encoder_hidden_states is not None:
+                    all_cross_attns_vision += (layer_outputs[3],)
+
+        # Keep batch_size as first dimension
+        intermediate = torch.stack(intermediate, dim=1)
+        intermediate_reference_points = torch.stack(intermediate_reference_points, dim=1)
+        hidden_states = self.layer_norm(hidden_states)
+
+        # add hidden states from the last decoder layer
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+
+        if output_attentions:
+            all_attns += (all_self_attns, all_cross_attns_text, all_cross_attns_vision)
+
+        if not return_dict:
+            return tuple(
+                v
+                for v in [
+                    hidden_states,
+                    intermediate,
+                    intermediate_reference_points,
+                    all_hidden_states,
+                    all_attns,
+                ]
+                if v is not None
+            )
+        return GroundingDinoDecoderOutput(
+            last_hidden_state=hidden_states,
+            intermediate_hidden_states=intermediate,
+            intermediate_reference_points=intermediate_reference_points,
+            hidden_states=all_hidden_states,
+            attentions=all_attns,
+        )
+
+
+# these correspond to [CLS], [SEP], . and ?
+SPECIAL_TOKENS = [101, 102, 1012, 1029]
+
+
+def generate_masks_with_special_tokens_and_transfer_map(input_ids: torch.LongTensor) -> tuple[Tensor, Tensor]:
+    """Generate attention mask between each pair of special tokens and positional ids.
+    Args:
+        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+            Indices of input sequence tokens in the vocabulary.
+    Returns:
+        `tuple(torch.Tensor)` comprising attention mask between each special tokens and position_ids:
+        - **attention_mask** (`torch.BoolTensor` of shape `(batch_size, sequence_length, sequence_length)`)
+        - **position_ids** (`torch.LongTensor` of shape `(batch_size, sequence_length)`)
+    """
+    batch_size, num_token = input_ids.shape
+    # special_tokens_mask: batch_size, num_token. 1 for special tokens. 0 for normal tokens
+    special_tokens_mask = torch.zeros((batch_size, num_token), device=input_ids.device).bool()
+    for special_token in SPECIAL_TOKENS:
+        special_tokens_mask |= input_ids == special_token
+
+    # idxs: each row is a list of indices of special tokens
+    idxs = torch.nonzero(special_tokens_mask)
+
+    # generate attention mask and positional ids
+    attention_mask = torch.eye(num_token, device=input_ids.device).bool().unsqueeze(0).repeat(batch_size, 1, 1)
+    position_ids = torch.zeros((batch_size, num_token), device=input_ids.device)
+    previous_col = 0
+    for i in range(idxs.shape[0]):
+        row, col = idxs[i]
+        if (col == 0) or (col == num_token - 1):
+            attention_mask[row, col, col] = True
+            position_ids[row, col] = 0
+        else:
+            attention_mask[row, previous_col + 1 : col + 1, previous_col + 1 : col + 1] = True
+            position_ids[row, previous_col + 1 : col + 1] = torch.arange(
+                0, col - previous_col, device=input_ids.device
+            )
+
+        previous_col = col
+
+    return attention_mask, position_ids.to(torch.long)
+
+
+@auto_docstring(
+    custom_intro="""
+    The bare Grounding DINO Model (consisting of a backbone and encoder-decoder Transformer) outputting raw
+    hidden-states without any specific head on top.
+    """
+)
+class GroundingDinoModel(GroundingDinoPreTrainedModel):
+    def __init__(self, config: GroundingDinoConfig):
+        super().__init__(config)
+
+        # Create backbone + positional encoding
+        backbone = GroundingDinoConvEncoder(config)
+        position_embeddings = build_position_encoding(config)
+        self.backbone = GroundingDinoConvModel(backbone, position_embeddings)
+
+        # Create input projection layers
+        if config.num_feature_levels > 1:
+            num_backbone_outs = len(backbone.intermediate_channel_sizes)
+            input_proj_list = []
+            for i in range(num_backbone_outs):
+                in_channels = backbone.intermediate_channel_sizes[i]
+                input_proj_list.append(
+                    nn.Sequential(
+                        nn.Conv2d(in_channels, config.d_model, kernel_size=1),
+                        nn.GroupNorm(32, config.d_model),
+                    )
+                )
+            for _ in range(config.num_feature_levels - num_backbone_outs):
+                input_proj_list.append(
+                    nn.Sequential(
+                        nn.Conv2d(in_channels, config.d_model, kernel_size=3, stride=2, padding=1),
+                        nn.GroupNorm(32, config.d_model),
+                    )
+                )
+                in_channels = config.d_model
+            self.input_proj_vision = nn.ModuleList(input_proj_list)
+        else:
+            self.input_proj_vision = nn.ModuleList(
+                [
+                    nn.Sequential(
+                        nn.Conv2d(backbone.intermediate_channel_sizes[-1], config.d_model, kernel_size=1),
+                        nn.GroupNorm(32, config.d_model),
+                    )
+                ]
+            )
+
+        # Create text backbone
+        self.text_backbone = AutoModel.from_config(config.text_config, add_pooling_layer=False)
+        self.text_projection = nn.Linear(config.text_config.hidden_size, config.d_model)
+
+        if config.embedding_init_target or not config.two_stage:
+            self.query_position_embeddings = nn.Embedding(config.num_queries, config.d_model)
+
+        self.encoder = GroundingDinoEncoder(config)
+        self.decoder = GroundingDinoDecoder(config)
+
+        self.level_embed = nn.Parameter(torch.Tensor(config.num_feature_levels, config.d_model))
+
+        if config.two_stage:
+            self.enc_output = nn.Linear(config.d_model, config.d_model)
+            self.enc_output_norm = nn.LayerNorm(config.d_model, config.layer_norm_eps)
+            if (
+                config.two_stage_bbox_embed_share
+                and config.decoder_bbox_embed_share
+                and self.decoder.bbox_embed is not None
+            ):
+                self.encoder_output_bbox_embed = self.decoder.bbox_embed
+            else:
+                self.encoder_output_bbox_embed = GroundingDinoMLPPredictionHead(
+                    input_dim=config.d_model, hidden_dim=config.d_model, output_dim=4, num_layers=3
+                )
+
+            self.encoder_output_class_embed = GroundingDinoContrastiveEmbedding(config)
+        else:
+            self.reference_points = nn.Embedding(config.num_queries, 4)
+
+        self.post_init()
+
+    def freeze_backbone(self):
+        for name, param in self.backbone.conv_encoder.model.named_parameters():
+            param.requires_grad_(False)
+
+    def unfreeze_backbone(self):
+        for name, param in self.backbone.conv_encoder.model.named_parameters():
+            param.requires_grad_(True)
+
+    def get_valid_ratio(self, mask):
+        """Get the valid ratio of all feature maps."""
+
+        _, height, width = mask.shape
+        valid_height = torch.sum(mask[:, :, 0], 1)
+        valid_width = torch.sum(mask[:, 0, :], 1)
+        valid_ratio_height = valid_height.float() / height
+        valid_ratio_width = valid_width.float() / width
+        valid_ratio = torch.stack([valid_ratio_width, valid_ratio_height], -1)
+        return valid_ratio
+
+    def generate_encoder_output_proposals(self, enc_output, padding_mask, spatial_shapes):
+        """Generate the encoder output proposals from encoded enc_output.
+
+        Args:
+            enc_output (`torch.Tensor[batch_size, sequence_length, hidden_size]`): Output of the encoder.
+            padding_mask (`torch.Tensor[batch_size, sequence_length]`): Padding mask for `enc_output`.
+            spatial_shapes (`torch.Tensor[num_feature_levels, 2]`): Spatial shapes of the feature maps.
+
+        Returns:
+            `tuple(torch.FloatTensor)`: A tuple of feature map and bbox prediction.
+                - object_query (Tensor[batch_size, sequence_length, hidden_size]): Object query features. Later used to
+                  directly predict a bounding box. (without the need of a decoder)
+                - output_proposals (Tensor[batch_size, sequence_length, 4]): Normalized proposals, after an inverse
+                  sigmoid.
+        """
+        batch_size = enc_output.shape[0]
+        proposals = []
+        current_position = 0
+        for level, (height, width) in enumerate(spatial_shapes):
+            mask_flatten_ = padding_mask[:, current_position : (current_position + height * width)]
+            mask_flatten_ = mask_flatten_.view(batch_size, height, width, 1)
+            valid_height = torch.sum(~mask_flatten_[:, :, 0, 0], 1)
+            valid_width = torch.sum(~mask_flatten_[:, 0, :, 0], 1)
+
+            grid_y, grid_x = meshgrid(
+                torch.linspace(0, height - 1, height, dtype=torch.float32, device=enc_output.device),
+                torch.linspace(0, width - 1, width, dtype=torch.float32, device=enc_output.device),
+                indexing="ij",
+            )
+            grid = torch.cat([grid_x.unsqueeze(-1), grid_y.unsqueeze(-1)], -1)
+
+            scale = torch.cat([valid_width.unsqueeze(-1), valid_height.unsqueeze(-1)], 1).view(batch_size, 1, 1, 2)
+            grid = (grid.unsqueeze(0).expand(batch_size, -1, -1, -1) + 0.5) / scale
+            width_height = torch.ones_like(grid) * 0.05 * (2.0**level)
+            proposal = torch.cat((grid, width_height), -1).view(batch_size, -1, 4)
+            proposals.append(proposal)
+            current_position += height * width
+
+        output_proposals = torch.cat(proposals, 1)
+        output_proposals_valid = ((output_proposals > 0.01) & (output_proposals < 0.99)).all(-1, keepdim=True)
+        output_proposals = torch.log(output_proposals / (1 - output_proposals))  # inverse sigmoid
+        output_proposals = output_proposals.masked_fill(padding_mask.unsqueeze(-1), float("inf"))
+        output_proposals = output_proposals.masked_fill(~output_proposals_valid, float("inf"))
+
+        # assign each pixel as an object query
+        object_query = enc_output
+        object_query = object_query.masked_fill(padding_mask.unsqueeze(-1), float(0))
+        object_query = object_query.masked_fill(~output_proposals_valid, float(0))
+        object_query = self.enc_output_norm(self.enc_output(object_query))
+        return object_query, output_proposals
+
+    @auto_docstring
+    def forward(
+        self,
+        pixel_values: Tensor,
+        input_ids: Tensor,
+        token_type_ids: Optional[Tensor] = None,
+        attention_mask: Optional[Tensor] = None,
+        pixel_mask: Optional[Tensor] = None,
+        encoder_outputs=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+    ):
+        r"""
+        input_ids (`torch.LongTensor` of shape `(batch_size, text_sequence_length)`):
+            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
+            it.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`BertTokenizer.__call__`] for details.
+        token_type_ids (`torch.LongTensor` of shape `(batch_size, text_sequence_length)`, *optional*):
+            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
+            1]`: 0 corresponds to a `sentence A` token, 1 corresponds to a `sentence B` token
+
+            [What are token type IDs?](../glossary#token-type-ids)
+
+        Examples:
+
+        ```python
+        >>> from transformers import AutoProcessor, AutoModel
+        >>> from PIL import Image
+        >>> import requests
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+        >>> text = "a cat."
+
+        >>> processor = AutoProcessor.from_pretrained("IDEA-Research/grounding-dino-tiny")
+        >>> model = AutoModel.from_pretrained("IDEA-Research/grounding-dino-tiny")
+
+        >>> inputs = processor(images=image, text=text, return_tensors="pt")
+        >>> outputs = model(**inputs)
+
+        >>> last_hidden_states = outputs.last_hidden_state
+        >>> list(last_hidden_states.shape)
+        [1, 900, 256]
+        ```"""
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        text_self_attention_masks, position_ids = generate_masks_with_special_tokens_and_transfer_map(input_ids)
+
+        if attention_mask is None:
+            attention_mask = torch.ones_like(input_ids)
+
+        if token_type_ids is None:
+            token_type_ids = torch.zeros_like(input_ids)
+
+        text_token_mask = attention_mask.bool()  # just to avoid renaming everywhere
+
+        max_text_len = self.config.max_text_len
+        if text_self_attention_masks.shape[1] > max_text_len:
+            text_self_attention_masks = text_self_attention_masks[:, :max_text_len, :max_text_len]
+            position_ids = position_ids[:, :max_text_len]
+            input_ids = input_ids[:, :max_text_len]
+            token_type_ids = token_type_ids[:, :max_text_len]
+            text_token_mask = text_token_mask[:, :max_text_len]
+
+        # Extract text features from text backbone
+        text_outputs = self.text_backbone(
+            input_ids, text_self_attention_masks, token_type_ids, position_ids, return_dict=return_dict
+        )
+        text_features = text_outputs.last_hidden_state if return_dict else text_outputs[0]
+        text_features = self.text_projection(text_features)
+
+        batch_size, num_channels, height, width = pixel_values.shape
+        device = pixel_values.device
+
+        if pixel_mask is None:
+            pixel_mask = torch.ones(((batch_size, height, width)), dtype=torch.long, device=device)
+
+        # Extract multi-scale feature maps of same resolution `config.d_model` (cf Figure 4 in paper)
+        # First, sent pixel_values + pixel_mask through Backbone to obtain the features
+        # which is a list of tuples
+        vision_features, position_embeddings_list = self.backbone(pixel_values, pixel_mask)
+
+        # Then, apply 1x1 convolution to reduce the channel dimension to d_model (256 by default)
+        feature_maps = []
+        masks = []
+        for level, (source, mask) in enumerate(vision_features):
+            feature_maps.append(self.input_proj_vision[level](source))
+            masks.append(mask)
+
+        # Lowest resolution feature maps are obtained via 3x3 stride 2 convolutions on the final stage
+        if self.config.num_feature_levels > len(feature_maps):
+            _len_sources = len(feature_maps)
+            for level in range(_len_sources, self.config.num_feature_levels):
+                if level == _len_sources:
+                    source = self.input_proj_vision[level](vision_features[-1][0])
+                else:
+                    source = self.input_proj_vision[level](feature_maps[-1])
+                mask = nn.functional.interpolate(pixel_mask[None].float(), size=source.shape[-2:]).to(torch.bool)[0]
+                pos_l = self.backbone.position_embedding(source, mask).to(source.dtype)
+                feature_maps.append(source)
+                masks.append(mask)
+                position_embeddings_list.append(pos_l)
+
+        # Create queries
+        query_embeds = None
+        if self.config.embedding_init_target or self.config.two_stage:
+            query_embeds = self.query_position_embeddings.weight
+
+        # Prepare encoder inputs (by flattening)
+        source_flatten = []
+        mask_flatten = []
+        lvl_pos_embed_flatten = []
+        spatial_shapes_list = []
+        for level, (source, mask, pos_embed) in enumerate(zip(feature_maps, masks, position_embeddings_list)):
+            batch_size, num_channels, height, width = source.shape
+            spatial_shape = (height, width)
+            spatial_shapes_list.append(spatial_shape)
+            source = source.flatten(2).transpose(1, 2)
+            mask = mask.flatten(1)
+            pos_embed = pos_embed.flatten(2).transpose(1, 2)
+            lvl_pos_embed = pos_embed + self.level_embed[level].view(1, 1, -1)
+            lvl_pos_embed_flatten.append(lvl_pos_embed)
+            source_flatten.append(source)
+            mask_flatten.append(mask)
+        source_flatten = torch.cat(source_flatten, 1)
+        mask_flatten = torch.cat(mask_flatten, 1)
+        lvl_pos_embed_flatten = torch.cat(lvl_pos_embed_flatten, 1)
+        spatial_shapes = torch.as_tensor(spatial_shapes_list, dtype=torch.long, device=source_flatten.device)
+        level_start_index = torch.cat((spatial_shapes.new_zeros((1,)), spatial_shapes.prod(1).cumsum(0)[:-1]))
+        valid_ratios = torch.stack([self.get_valid_ratio(m) for m in masks], 1)
+        valid_ratios = valid_ratios.float()
+
+        # Fourth, send source_flatten + mask_flatten + lvl_pos_embed_flatten (backbone + proj layer output) through encoder
+        # Also provide spatial_shapes, level_start_index and valid_ratios
+        if encoder_outputs is None:
+            encoder_outputs = self.encoder(
+                vision_features=source_flatten,
+                vision_attention_mask=~mask_flatten,
+                vision_position_embedding=lvl_pos_embed_flatten,
+                spatial_shapes=spatial_shapes,
+                spatial_shapes_list=spatial_shapes_list,
+                level_start_index=level_start_index,
+                valid_ratios=valid_ratios,
+                text_features=text_features,
+                text_attention_mask=~text_token_mask,
+                text_position_embedding=None,
+                text_self_attention_masks=~text_self_attention_masks,
+                text_position_ids=position_ids,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+            )
+        # If the user passed a tuple for encoder_outputs, we wrap it in a GroundingDinoEncoderOutput when return_dict=True
+        elif return_dict and not isinstance(encoder_outputs, GroundingDinoEncoderOutput):
+            encoder_outputs = GroundingDinoEncoderOutput(
+                last_hidden_state_vision=encoder_outputs[0],
+                last_hidden_state_text=encoder_outputs[1],
+                vision_hidden_states=encoder_outputs[2] if output_hidden_states else None,
+                text_hidden_states=encoder_outputs[3] if output_hidden_states else None,
+                attentions=encoder_outputs[-1] if output_attentions else None,
+            )
+
+        # Fifth, prepare decoder inputs
+        topk_proposals = None
+        enc_outputs_class = None
+        enc_outputs_coord_logits = None
+        encoder_logits = None
+        encoder_pred_boxes = None
+        if self.config.two_stage:
+            object_query_embedding, output_proposals = self.generate_encoder_output_proposals(
+                encoder_outputs[0], ~mask_flatten, spatial_shapes
+            )
+
+            # hack implementation as in two-stage Deformable DETR
+            # apply a detection head to each pixel (A.4 in paper)
+            # linear projection for bounding box binary classification (i.e. foreground and background)
+            enc_outputs_class = self.encoder_output_class_embed(
+                object_query_embedding, encoder_outputs[1], text_token_mask
+            )
+            # 3-layer FFN to predict bounding boxes coordinates (bbox regression branch)
+            delta_bbox = self.encoder_output_bbox_embed(object_query_embedding)
+            enc_outputs_coord_logits = delta_bbox + output_proposals
+
+            # only keep top scoring `config.num_queries` proposals
+            topk = self.config.num_queries
+            topk_logits = enc_outputs_class.max(-1)[0]
+            topk_proposals = torch.topk(topk_logits, topk, dim=1)[1]
+            topk_coords_logits = torch.gather(
+                enc_outputs_coord_logits, 1, topk_proposals.unsqueeze(-1).repeat(1, 1, 4)
+            )
+
+            topk_coords_logits = topk_coords_logits.detach()
+            reference_points = topk_coords_logits.sigmoid()
+            init_reference_points = reference_points
+            if query_embeds is not None:
+                target = query_embeds.unsqueeze(0).repeat(batch_size, 1, 1)
+            else:
+                target = torch.gather(
+                    object_query_embedding, 1, topk_proposals.unsqueeze(-1).repeat(1, 1, self.d_model)
+                ).detach()
+
+            # Set intermediate topk proposals (coords and class) for loss computation
+            encoder_pred_boxes = reference_points
+            encoder_logits = self.encoder_output_class_embed(target, text_features, text_token_mask)
+        else:
+            target = query_embeds.unsqueeze(0).repeat(batch_size, 1, 1)
+            reference_points = self.reference_points.weight.unsqueeze(0).repeat(batch_size, 1, 1).sigmoid()
+            init_reference_points = reference_points
+
+        decoder_outputs = self.decoder(
+            inputs_embeds=target,
+            vision_encoder_hidden_states=encoder_outputs[0],
+            vision_encoder_attention_mask=mask_flatten,
+            text_encoder_hidden_states=encoder_outputs[1],
+            text_encoder_attention_mask=~text_token_mask,
+            reference_points=reference_points,
+            spatial_shapes=spatial_shapes,
+            spatial_shapes_list=spatial_shapes_list,
+            level_start_index=level_start_index,
+            valid_ratios=valid_ratios,
+            self_attn_mask=None,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        if not return_dict:
+            enc_outputs = tuple(
+                value
+                for value in [
+                    enc_outputs_class,
+                    enc_outputs_coord_logits,
+                    encoder_logits,
+                    encoder_pred_boxes,
+                ]
+                if value is not None
+            )
+            tuple_outputs = (
+                (decoder_outputs[0], init_reference_points) + decoder_outputs[1:] + encoder_outputs + enc_outputs
+            )
+
+            return tuple_outputs
+
+        return GroundingDinoModelOutput(
+            last_hidden_state=decoder_outputs.last_hidden_state,
+            init_reference_points=init_reference_points,
+            intermediate_hidden_states=decoder_outputs.intermediate_hidden_states,
+            intermediate_reference_points=decoder_outputs.intermediate_reference_points,
+            decoder_hidden_states=decoder_outputs.hidden_states,
+            decoder_attentions=decoder_outputs.attentions,
+            encoder_last_hidden_state_vision=encoder_outputs.last_hidden_state_vision,
+            encoder_last_hidden_state_text=encoder_outputs.last_hidden_state_text,
+            encoder_vision_hidden_states=encoder_outputs.vision_hidden_states,
+            encoder_text_hidden_states=encoder_outputs.text_hidden_states,
+            encoder_attentions=encoder_outputs.attentions,
+            enc_outputs_class=enc_outputs_class,
+            enc_outputs_coord_logits=enc_outputs_coord_logits,
+            encoder_logits=encoder_logits,
+            encoder_pred_boxes=encoder_pred_boxes,
+        )
+
+
+# Copied from transformers.models.detr.modeling_detr.DetrMLPPredictionHead
+class GroundingDinoMLPPredictionHead(nn.Module):
+    """
+    Very simple multi-layer perceptron (MLP, also called FFN), used to predict the normalized center coordinates,
+    height and width of a bounding box w.r.t. an image.
+
+    Copied from https://github.com/facebookresearch/detr/blob/master/models/detr.py
+
+    """
+
+    def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
+        super().__init__()
+        self.num_layers = num_layers
+        h = [hidden_dim] * (num_layers - 1)
+        self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]))
+
+    def forward(self, x):
+        for i, layer in enumerate(self.layers):
+            x = nn.functional.relu(layer(x)) if i < self.num_layers - 1 else layer(x)
+        return x
+
+
+def build_label_maps(logits: torch.FloatTensor, input_ids: torch.LongTensor) -> tuple[torch.FloatTensor]:
+    """
+    Computes a mapping between tokens and their corresponding labels, where `num_labels` is determined by the number of classes in the input prompt.
+    The function identifies segments of tokens between specific delimiter tokens and generates label maps for those segments.
+    Args:
+        logits (`torch.Tensor` of shape `(batch_size, seq_length, hidden_size)`):
+            The output logits from the model, where `hidden_size` corresponds to the dimension of the model's output features.
+
+        input_ids (`torch.Tensor` of shape `(batch_size, seq_length)`):
+            The input token IDs corresponding to the input prompt. For example, given the prompt "fish. shark.",
+            `input_ids` might look like `[101, 3869, 1012, 11420, 1012, 102]` where each number corresponds to a token including special tokens.
+    Returns:
+        tuple: A tuple containing label maps for each instance in the batch.
+        - label_maps (tuple of `torch.Tensor`):
+            A tuple of tensors, where each tensor in the tuple corresponds to an instance in the batch. Each tensor
+            has shape `(num_labels, hidden_size)` and contains binary values (0 or 1), where `1` indicates the tokens
+            that are associated with a specific label (class) between delimiter tokens, and `0` elsewhere.
+    Example:
+        Given an input prompt "fish. shark." and corresponding `input_ids` as `[101, 3869, 1012, 11420, 1012, 102]`:
+        - The function identifies the tokens for "fish" (IDs `[3869]`) and "shark" (IDs `[11420]`).
+        - The function then constructs label maps for these tokens, where each label map indicates which tokens
+          correspond to which label between the delimiter tokens (e.g., between the period `.`).
+        - The output is a tuple of label maps, one for each instance in the batch.
+    Note:
+        - `SPECIAL_TOKENS` should be a predefined list of tokens that are considered special (e.g., `[CLS]`, `[SEP]`, etc.).
+    """
+    max_seq_len = logits.shape[-1]
+    # Add [PAD] token to the list of special tokens
+    delimiter_tokens = torch.tensor(SPECIAL_TOKENS + [0], device=input_ids.device)
+
+    delimiter_token_masks = torch.isin(input_ids, delimiter_tokens)
+    label_groups = torch.cumsum(delimiter_token_masks, dim=1) * (~delimiter_token_masks).to(torch.int32)
+
+    label_maps = ()
+
+    # Iterate over batch dimension as we can have different number of labels
+    for label_group in label_groups:
+        # `label_group` is a tensor of shape `(seq_len,)` with zeros for non-label tokens and integers for label tokens
+        # label tokens with same integer value are part of the same label group
+
+        # Get unique labels and exclude 0 (i.e. non-label tokens)
+        unique_labels = torch.unique(label_group)[1:, None]
+        num_labels = unique_labels.shape[0]
+
+        # Create one-hot encoding for each label group
+        label_map = label_group.unsqueeze(0).repeat(num_labels, 1)
+        label_map = torch.where(label_map == unique_labels, 1, 0)
+
+        # Pad label_map to match `max_seq_len`
+        label_map = F.pad(label_map, (0, max_seq_len - label_map.shape[1]), value=0)
+
+        label_maps += (label_map,)
+
+    return label_maps
+
+
+def build_text_mask(logits, attention_mask):
+    """
+    Create text_mask based on the matching indices
+    """
+    seq_len = attention_mask.shape[1]
+    text_mask = torch.zeros_like(logits, device=logits.device, dtype=attention_mask.dtype)
+    text_mask[:, :, :seq_len] = attention_mask[:, None, :]
+
+    return text_mask.bool()
+
+
+@auto_docstring(
+    custom_intro="""
+    Grounding DINO Model (consisting of a backbone and encoder-decoder Transformer) with object detection heads on top,
+    for tasks such as COCO detection.
+    """
+)
+class GroundingDinoForObjectDetection(GroundingDinoPreTrainedModel):
+    # When using clones, all layers > 0 will be clones, but layer 0 *is* required
+    # the bbox_embed in the decoder are all clones though
+    _tied_weights_keys = [r"bbox_embed\.[1-9]\d*", r"model\.decoder\.bbox_embed\.[0-9]\d*"]
+
+    def __init__(self, config: GroundingDinoConfig):
+        super().__init__(config)
+
+        # Base model; the detection heads created below are attached on top of it
+        self.model = GroundingDinoModel(config)
+        # A single classification head instance; the very same object is reused for every decoder layer below
+        _class_embed = GroundingDinoContrastiveEmbedding(config)
+
+        if config.decoder_bbox_embed_share:
+            # a single shared instance
+            shared_head = GroundingDinoMLPPredictionHead(
+                input_dim=config.d_model, hidden_dim=config.d_model, output_dim=4, num_layers=3
+            )
+            # List multiplication repeats the same module object, so all entries share parameters
+            self.bbox_embed = nn.ModuleList([shared_head] * config.decoder_layers)
+        else:
+            # each layer has its own head (implicit deep copy through a new instance)
+            self.bbox_embed = nn.ModuleList(
+                [
+                    GroundingDinoMLPPredictionHead(
+                        input_dim=config.d_model,
+                        hidden_dim=config.d_model,
+                        output_dim=4,
+                        num_layers=3,
+                    )
+                    for _ in range(config.decoder_layers)
+                ]
+            )
+
+        # Every entry refers to the one `_class_embed` instance created above
+        self.class_embed = nn.ModuleList([_class_embed for _ in range(config.decoder_layers)])
+        # hack for box-refinement
+        self.model.decoder.bbox_embed = self.bbox_embed
+        # hack implementation for two-stage
+        self.model.decoder.class_embed = self.class_embed
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        pixel_values: torch.FloatTensor,
+        input_ids: torch.LongTensor,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.LongTensor] = None,
+        pixel_mask: Optional[torch.BoolTensor] = None,
+        encoder_outputs: Optional[Union[GroundingDinoEncoderOutput, tuple]] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        labels: Optional[list[dict[str, Union[torch.LongTensor, torch.FloatTensor]]]] = None,
+    ):
+        r"""
+        input_ids (`torch.LongTensor` of shape `(batch_size, text_sequence_length)`):
+            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
+            it.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`BertTokenizer.__call__`] for details.
+        token_type_ids (`torch.LongTensor` of shape `(batch_size, text_sequence_length)`, *optional*):
+            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
+            1]`: 0 corresponds to a `sentence A` token, 1 corresponds to a `sentence B` token
+
+            [What are token type IDs?](../glossary#token-type-ids)
+        labels (`list[Dict]` of len `(batch_size,)`, *optional*):
+            Labels for computing the bipartite matching loss. List of dicts, each dictionary containing at least the
+            following 2 keys: 'class_labels' and 'boxes' (the class labels and bounding boxes of an image in the batch
+            respectively). The class labels themselves should be a `torch.LongTensor` of len `(number of bounding boxes
+            in the image,)` and the boxes a `torch.FloatTensor` of shape `(number of bounding boxes in the image, 4)`.
+
+        Examples:
+
+        ```python
+        >>> import requests
+
+        >>> import torch
+        >>> from PIL import Image
+        >>> from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection
+
+        >>> model_id = "IDEA-Research/grounding-dino-tiny"
+        >>> device = "cuda"
+
+        >>> processor = AutoProcessor.from_pretrained(model_id)
+        >>> model = AutoModelForZeroShotObjectDetection.from_pretrained(model_id).to(device)
+
+        >>> image_url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(image_url, stream=True).raw)
+        >>> # Check for cats and remote controls
+        >>> text_labels = [["a cat", "a remote control"]]
+
+        >>> inputs = processor(images=image, text=text_labels, return_tensors="pt").to(device)
+        >>> with torch.no_grad():
+        ...     outputs = model(**inputs)
+
+        >>> results = processor.post_process_grounded_object_detection(
+        ...     outputs,
+        ...     threshold=0.4,
+        ...     text_threshold=0.3,
+        ...     target_sizes=[(image.height, image.width)]
+        ... )
+        >>> # Retrieve the first image result
+        >>> result = results[0]
+        >>> for box, score, text_label in zip(result["boxes"], result["scores"], result["text_labels"]):
+        ...     box = [round(x, 2) for x in box.tolist()]
+        ...     print(f"Detected {text_label} with confidence {round(score.item(), 3)} at location {box}")
+        Detected a cat with confidence 0.479 at location [344.7, 23.11, 637.18, 374.28]
+        Detected a cat with confidence 0.438 at location [12.27, 51.91, 316.86, 472.44]
+        Detected a remote control with confidence 0.478 at location [38.57, 70.0, 176.78, 118.18]
+        ```"""
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # Without an explicit mask, every text token is treated as valid
+        if attention_mask is None:
+            attention_mask = torch.ones_like(input_ids)
+
+        # First, send images through Grounding DINO base model to obtain encoder + decoder outputs
+        outputs = self.model(
+            pixel_values=pixel_values,
+            input_ids=input_ids,
+            token_type_ids=token_type_ids,
+            attention_mask=attention_mask,
+            pixel_mask=pixel_mask,
+            encoder_outputs=encoder_outputs,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        # In tuple mode the needed entries are fetched by position. `idx` locates the encoder's last text hidden
+        # state and moves by one for each of the optional attentions / hidden-states outputs.
+        # NOTE(review): this depends on the exact tuple layout returned by `GroundingDinoModel` - confirm when
+        # that layout changes.
+        idx = 5 + (1 if output_attentions else 0) + (1 if output_hidden_states else 0)
+        enc_text_hidden_state = outputs.encoder_last_hidden_state_text if return_dict else outputs[idx]
+        hidden_states = outputs.intermediate_hidden_states if return_dict else outputs[2]
+        init_reference_points = outputs.init_reference_points if return_dict else outputs[1]
+        inter_references_points = outputs.intermediate_reference_points if return_dict else outputs[3]
+
+        # class logits + predicted bounding boxes
+        outputs_classes = []
+        outputs_coords = []
+
+        # hidden_states is indexed as `hidden_states[:, level]`, i.e. dim 1 enumerates the decoder stages
+        # NOTE(review): the trailing dims are presumably (num_queries, d_model) - confirm
+        # predict class and bounding box deltas for each stage
+        num_levels = hidden_states.shape[1]
+        for level in range(num_levels):
+            # Stage 0 refines the initial reference points; later stages refine the previous stage's output
+            if level == 0:
+                reference = init_reference_points
+            else:
+                reference = inter_references_points[:, level - 1]
+            # Map the reference back to logit space so the predicted delta can be added before the sigmoid
+            reference = torch.special.logit(reference, eps=1e-5)
+            outputs_class = self.class_embed[level](
+                vision_hidden_state=hidden_states[:, level],
+                text_hidden_state=enc_text_hidden_state,
+                text_token_mask=attention_mask.bool(),
+            )
+            delta_bbox = self.bbox_embed[level](hidden_states[:, level])
+
+            reference_coordinates = reference.shape[-1]
+            if reference_coordinates == 4:
+                outputs_coord_logits = delta_bbox + reference
+            elif reference_coordinates == 2:
+                # Only the first two box components receive the 2-d reference offset (in-place on `delta_bbox`)
+                delta_bbox[..., :2] += reference
+                outputs_coord_logits = delta_bbox
+            else:
+                raise ValueError(f"reference.shape[-1] should be 4 or 2, but got {reference.shape[-1]}")
+            outputs_coord = outputs_coord_logits.sigmoid()
+            outputs_classes.append(outputs_class)
+            outputs_coords.append(outputs_coord)
+        # Stack the per-stage predictions along a new leading dimension
+        outputs_class = torch.stack(outputs_classes)
+        outputs_coord = torch.stack(outputs_coords)
+
+        # The final predictions are those of the last stage
+        logits = outputs_class[-1]
+        pred_boxes = outputs_coord[-1]
+
+        loss, loss_dict, auxiliary_outputs = None, None, None
+        if labels is not None:
+            label_maps = build_label_maps(logits, input_ids)
+            text_mask = build_text_mask(logits, attention_mask)
+            # NOTE(review): `outputs[-2]` / `outputs[-1]` are taken positionally from the end of the base model
+            # output. They are the encoder logits / boxes only when those are the last two non-None entries,
+            # which the base model sets in its two-stage branch - confirm behavior for non-two-stage configs.
+            loss, loss_dict, auxiliary_outputs = self.loss_function(
+                logits,
+                labels,
+                self.device,
+                pred_boxes,
+                self.config,
+                label_maps,
+                text_mask,
+                outputs_class=outputs_class,
+                outputs_coord=outputs_coord,
+                encoder_logits=outputs[-2],
+                encoder_pred_boxes=outputs[-1],
+            )
+
+        if not return_dict:
+            # Tuple output: head results first, then the base model outputs, then `input_ids`;
+            # entries that are None are dropped
+            auxiliary_outputs = auxiliary_outputs if auxiliary_outputs is not None else []
+            output = [loss, loss_dict, logits, pred_boxes, *auxiliary_outputs, *outputs, input_ids]
+            output = tuple(out for out in output if out is not None)
+            return output
+
+        # Structured output: head results plus the fields forwarded from the base model output
+        dict_outputs = GroundingDinoObjectDetectionOutput(
+            loss=loss,
+            loss_dict=loss_dict,
+            logits=logits,
+            pred_boxes=pred_boxes,
+            last_hidden_state=outputs.last_hidden_state,
+            auxiliary_outputs=auxiliary_outputs,
+            decoder_hidden_states=outputs.decoder_hidden_states,
+            decoder_attentions=outputs.decoder_attentions,
+            encoder_last_hidden_state_vision=outputs.encoder_last_hidden_state_vision,
+            encoder_last_hidden_state_text=outputs.encoder_last_hidden_state_text,
+            encoder_vision_hidden_states=outputs.encoder_vision_hidden_states,
+            encoder_text_hidden_states=outputs.encoder_text_hidden_states,
+            encoder_attentions=outputs.encoder_attentions,
+            intermediate_hidden_states=outputs.intermediate_hidden_states,
+            intermediate_reference_points=outputs.intermediate_reference_points,
+            init_reference_points=outputs.init_reference_points,
+            enc_outputs_class=outputs.enc_outputs_class,
+            enc_outputs_coord_logits=outputs.enc_outputs_coord_logits,
+            encoder_logits=outputs.encoder_logits,
+            encoder_pred_boxes=outputs.encoder_pred_boxes,
+            input_ids=input_ids,
+        )
+
+        return dict_outputs
+
+
+# Names exported by a star-import of this module
+__all__ = ["GroundingDinoForObjectDetection", "GroundingDinoModel", "GroundingDinoPreTrainedModel"]
diff --git a/phivenv/Lib/site-packages/transformers/models/grounding_dino/modular_grounding_dino.py b/phivenv/Lib/site-packages/transformers/models/grounding_dino/modular_grounding_dino.py
new file mode 100644
index 0000000000000000000000000000000000000000..e49a79fddee88fa042dc3b219ef6d367bd2cbb1c
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/grounding_dino/modular_grounding_dino.py
@@ -0,0 +1,127 @@
+from typing import TYPE_CHECKING, Optional, Union
+
+from transformers.models.detr.image_processing_detr_fast import DetrImageProcessorFast
+
+from ...image_transforms import center_to_corners_format
+from ...utils import (
+    TensorType,
+    is_torch_available,
+    logging,
+)
+
+
+if TYPE_CHECKING:
+    from .modeling_grounding_dino import GroundingDinoObjectDetectionOutput
+
+if is_torch_available():
+    import torch
+
+
+# Module-level logger, named after this module
+logger = logging.get_logger(__name__)
+
+
+def _scale_boxes(boxes, target_sizes):
+    """
+    Scale batch of bounding boxes to the target sizes.
+
+    Args:
+        boxes (`torch.Tensor` of shape `(batch_size, num_boxes, 4)`):
+            Bounding boxes to scale. Each box is expected to be in (x1, y1, x2, y2) format.
+        target_sizes (`list[tuple[int, int]]` or `torch.Tensor` of shape `(batch_size, 2)`):
+            Target sizes to scale the boxes to. Each target size is expected to be in (height, width) format.
+
+    Returns:
+        `torch.Tensor` of shape `(batch_size, num_boxes, 4)`: Scaled bounding boxes.
+    """
+
+    if isinstance(target_sizes, (list, tuple)):
+        image_height = torch.tensor([i[0] for i in target_sizes])
+        image_width = torch.tensor([i[1] for i in target_sizes])
+    elif isinstance(target_sizes, torch.Tensor):
+        image_height, image_width = target_sizes.unbind(1)
+    else:
+        raise TypeError("`target_sizes` must be a list, tuple or torch.Tensor")
+
+    scale_factor = torch.stack([image_width, image_height, image_width, image_height], dim=1)
+    scale_factor = scale_factor.unsqueeze(1).to(boxes.device)
+    boxes = boxes * scale_factor
+    return boxes
+
+
+class GroundingDinoImageProcessorFast(DetrImageProcessorFast):
+    def post_process_object_detection(
+        self,
+        outputs: "GroundingDinoObjectDetectionOutput",
+        threshold: float = 0.1,
+        target_sizes: Optional[Union[TensorType, list[tuple]]] = None,
+    ):
+        """
+        Converts the raw output of [`GroundingDinoForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y,
+        bottom_right_x, bottom_right_y) format.
+
+        Args:
+            outputs ([`GroundingDinoObjectDetectionOutput`]):
+                Raw outputs of the model.
+            threshold (`float`, *optional*, defaults to 0.1):
+                Score threshold to keep object detection predictions.
+            target_sizes (`torch.Tensor` or `list[tuple[int, int]]`, *optional*):
+                Tensor of shape `(batch_size, 2)` or list of tuples (`tuple[int, int]`) containing the target size
+                `(height, width)` of each image in the batch. If unset, predictions will not be resized.
+
+        Returns:
+            `list[Dict]`: A list of dictionaries, each dictionary containing the following keys:
+            - "scores": The confidence scores for each predicted box on the image.
+            - "labels": Indexes of the classes predicted by the model on the image.
+            - "boxes": Image bounding boxes in (top_left_x, top_left_y, bottom_right_x, bottom_right_y) format.
+        """
+        batch_logits, batch_boxes = outputs.logits, outputs.pred_boxes
+        batch_size = len(batch_logits)
+
+        if target_sizes is not None and len(target_sizes) != batch_size:
+            raise ValueError("Make sure that you pass in as many target sizes as images")
+
+        # batch_logits of shape (batch_size, num_queries, num_classes)
+        batch_class_logits = torch.max(batch_logits, dim=-1)
+        batch_scores = torch.sigmoid(batch_class_logits.values)
+        batch_labels = batch_class_logits.indices
+
+        # Convert to [x0, y0, x1, y1] format
+        batch_boxes = center_to_corners_format(batch_boxes)
+
+        # Convert from relative [0, 1] to absolute [0, height] coordinates
+        if target_sizes is not None:
+            batch_boxes = _scale_boxes(batch_boxes, target_sizes)
+
+        results = []
+        for scores, labels, boxes in zip(batch_scores, batch_labels, batch_boxes):
+            keep = scores > threshold
+            scores = scores[keep]
+            labels = labels[keep]
+            boxes = boxes[keep]
+            results.append({"scores": scores, "labels": labels, "boxes": boxes})
+
+        return results
+
+    def post_process():
+        raise NotImplementedError("Post-processing is not implemented for Grounding-Dino yet.")
+
+    def post_process_segmentation():
+        raise NotImplementedError("Segmentation post-processing is not implemented for Grounding-Dino yet.")
+
+    def post_process_instance():
+        raise NotImplementedError("Instance post-processing is not implemented for Grounding-Dino yet.")
+
+    def post_process_panoptic():
+        raise NotImplementedError("Panoptic post-processing is not implemented for Grounding-Dino yet.")
+
+    def post_process_instance_segmentation():
+        raise NotImplementedError("Segmentation post-processing is not implemented for Grounding-Dino yet.")
+
+    def post_process_semantic_segmentation():
+        raise NotImplementedError("Semantic segmentation post-processing is not implemented for Grounding-Dino yet.")
+
+    def post_process_panoptic_segmentation():
+        raise NotImplementedError("Panoptic segmentation post-processing is not implemented for Grounding-Dino yet.")
+
+
+# Names exported by a star-import of this module
+__all__ = ["GroundingDinoImageProcessorFast"]
diff --git a/phivenv/Lib/site-packages/transformers/models/grounding_dino/processing_grounding_dino.py b/phivenv/Lib/site-packages/transformers/models/grounding_dino/processing_grounding_dino.py
new file mode 100644
index 0000000000000000000000000000000000000000..24f13589f7950feb5ff5dd46cf85c7e857dff465
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/grounding_dino/processing_grounding_dino.py
@@ -0,0 +1,303 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Processor class for Grounding DINO.
+"""
+
+import pathlib
+import warnings
+from typing import TYPE_CHECKING, Optional, Union
+
+from ...image_processing_utils import BatchFeature
+from ...image_transforms import center_to_corners_format
+from ...image_utils import AnnotationFormat, ImageInput
+from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin, Unpack
+from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput
+from ...utils import TensorType, is_torch_available
+
+
+if is_torch_available():
+    import torch
+
+if TYPE_CHECKING:
+    from .modeling_grounding_dino import GroundingDinoObjectDetectionOutput
+
+
+AnnotationType = dict[str, Union[int, str, list[dict]]]
+
+
+def get_phrases_from_posmap(posmaps, input_ids):
+    """Collect, for every row of `posmaps`, the entries of `input_ids` at the positions flagged `True`.
+
+    The first and the last column of `posmaps` are always treated as `False` (presumably the positions of the
+    special start/end tokens -- confirm against the tokenizer). The input tensor is not modified.
+
+    Args:
+        posmaps (`torch.BoolTensor` of shape `(num_boxes, sequence_length)`):
+            A boolean tensor of text-thresholded logits related to the detected bounding boxes. The column
+            indices are used to index `input_ids`.
+        input_ids (`torch.LongTensor` of shape `(sequence_length,)`):
+            A tensor of token ids.
+
+    Returns:
+        `list[list]`: One list per row of `posmaps`, holding the selected elements of `input_ids`.
+    """
+    # Work on a copy so the caller's tensor stays untouched
+    masked = posmaps.clone()
+    last_column = masked.shape[-1] - 1
+
+    # Never select the first or the last position
+    masked[:, 0:1] = False
+    masked[:, last_column:] = False
+
+    return [[input_ids[position] for position in row.nonzero(as_tuple=True)[0].tolist()] for row in masked]
+
+
+def _is_list_of_candidate_labels(text) -> bool:
+    """Tell whether `text` is a list/tuple in which every item is a single candidate label.
+
+    An item counts as a single candidate label when it is a string without a dot; a string containing a dot
+    is considered merged candidate labels text (labels separated by a dot).
+    """
+    if not isinstance(text, (list, tuple)):
+        return False
+    for item in text:
+        if not isinstance(item, str) or "." in item:
+            return False
+    return True
+
+
+def _merge_candidate_labels_text(text: list[str]) -> str:
+    """
+    Build the merged candidate labels string from individual labels.
+
+    Every label is stripped of surrounding whitespace and lowercased, the labels are joined with ". " and a
+    final "." is appended. For example, ["A cat", "a dog"] -> "a cat. a dog."
+    """
+    return ". ".join(label.strip().lower() for label in text) + "."
+
+
+class DictWithDeprecationWarning(dict):
+    """`dict` subclass that emits a `FutureWarning` whenever the `"labels"` key is read.
+
+    Only `__getitem__` and `get` are instrumented; every other access path (iteration, `items()`, `in`, ...)
+    is plain `dict` behavior and does not warn.
+    """
+
+    message = (
+        "The key `labels` is will return integer ids in `GroundingDinoProcessor.post_process_grounded_object_detection` "
+        "output since v4.51.0. Use `text_labels` instead to retrieve string object names."
+    )
+
+    def __getitem__(self, key):
+        """Return `self[key]`, warning first when `key` is `"labels"`."""
+        if key == "labels":
+            # stacklevel=2 attributes the warning to the code reading the key, not to this wrapper
+            warnings.warn(self.message, FutureWarning, stacklevel=2)
+        return super().__getitem__(key)
+
+    def get(self, key, *args, **kwargs):
+        """Same as `dict.get`, warning first when `key` is `"labels"`."""
+        if key == "labels":
+            # stacklevel=2 attributes the warning to the code reading the key, not to this wrapper
+            warnings.warn(self.message, FutureWarning, stacklevel=2)
+        return super().get(key, *args, **kwargs)
+
+
+class GroundingDinoImagesKwargs(ImagesKwargs, total=False):
+    """Typed declaration of the image keyword arguments added for Grounding DINO on top of `ImagesKwargs`.
+
+    NOTE(review): `total=False` suggests typed-dict semantics where every key is optional -- confirm against
+    `ImagesKwargs`. The per-field notes below are read off the names and annotations only.
+    """
+
+    # A single annotation dict or a list of them (see `AnnotationType`)
+    annotations: Optional[Union[AnnotationType, list[AnnotationType]]]
+    # Flag; presumably toggles returning segmentation masks -- confirm in the image processor
+    return_segmentation_masks: Optional[bool]
+    # String or `pathlib.Path`; presumably where mask files are read from -- confirm in the image processor
+    masks_path: Optional[Union[str, pathlib.Path]]
+    # Flag; presumably toggles conversion of the annotations -- confirm in the image processor
+    do_convert_annotations: Optional[bool]
+    # Annotation format, given either as a string or as an `AnnotationFormat`
+    format: Optional[Union[str, AnnotationFormat]]
+
+
+class GroundingDinoProcessorKwargs(ProcessingKwargs, total=False):
+    """Keyword-argument declaration used by `GroundingDinoProcessor.__call__`.
+
+    It is passed to `_merge_kwargs` in `GroundingDinoProcessor.__call__`; the result is read back under the
+    `"images_kwargs"` and `"text_kwargs"` keys.
+    """
+
+    # Image keyword arguments use the Grounding DINO specific declaration
+    images_kwargs: GroundingDinoImagesKwargs
+    # Default values for the text (tokenizer) keyword arguments.
+    # NOTE(review): presumably applied by `_merge_kwargs` when the caller does not override them -- confirm.
+    _defaults = {
+        "text_kwargs": {
+            "add_special_tokens": True,
+            "padding": False,
+            "stride": 0,
+            "return_overflowing_tokens": False,
+            "return_special_tokens_mask": False,
+            "return_offsets_mapping": False,
+            "return_token_type_ids": True,
+            "return_length": False,
+            "verbose": True,
+        }
+    }
+
+
+class GroundingDinoProcessor(ProcessorMixin):
+    r"""
+    Grounding DINO processor that bundles an image processor and a tokenizer behind a single interface.
+
+    [`GroundingDinoProcessor`] forwards images to the image processor and text to the tokenizer and merges both
+    results. See the docstring of [`~GroundingDinoProcessor.__call__`] for more information.
+
+    Args:
+        image_processor (`GroundingDinoImageProcessor`):
+            An instance of [`GroundingDinoImageProcessor`]. The image processor is a required input.
+        tokenizer (`AutoTokenizer`):
+            An instance of [`PreTrainedTokenizer`]. The tokenizer is a required input.
+    """
+
+    attributes = ["image_processor", "tokenizer"]
+    image_processor_class = "GroundingDinoImageProcessor"
+    tokenizer_class = "AutoTokenizer"
+
+    def __init__(self, image_processor, tokenizer):
+        super().__init__(image_processor, tokenizer)
+
+    def __call__(
+        self,
+        images: ImageInput = None,
+        text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
+        audio=None,
+        videos=None,
+        **kwargs: Unpack[GroundingDinoProcessorKwargs],
+    ) -> BatchEncoding:
+        """
+        Prepare image(s) with the image processor and text with the tokenizer, and return both in one encoding.
+
+        Args:
+            images (`ImageInput`, `list[ImageInput]`, *optional*):
+                The image or batch of images to be processed. The image might be either PIL image, numpy array or a torch tensor.
+            text (`TextInput`, `PreTokenizedInput`, `list[TextInput]`, `list[PreTokenizedInput]`, *optional*):
+                Candidate labels to be detected on the image. The text might be one of the following:
+                - A list of candidate labels (strings) to be detected on the image (e.g. ["a cat", "a dog"]).
+                - A batch of candidate labels to be detected on the batch of images (e.g. [["a cat", "a dog"], ["a car", "a person"]]).
+                - A merged candidate labels string to be detected on the image, separated by "." (e.g. "a cat. a dog.").
+                - A batch of merged candidate labels text to be detected on the batch of images (e.g. ["a cat. a dog.", "a car. a person."]).
+
+        Returns:
+            [`BatchEncoding`]: The tokenizer output (empty when `text` is `None`) updated with the image
+            processor output (nothing is added when `images` is `None`).
+
+        Raises:
+            ValueError: If both `images` and `text` are `None`.
+        """
+        if text is None and images is None:
+            raise ValueError("You must specify either text or images.")
+
+        merged_kwargs = self._merge_kwargs(
+            GroundingDinoProcessorKwargs,
+            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
+            **kwargs,
+        )
+
+        # Image features first; an empty container stands in when no image was given
+        if images is None:
+            image_features = BatchFeature()
+        else:
+            image_features = self.image_processor(images, **merged_kwargs["images_kwargs"])
+
+        # Then the text; candidate label lists are merged into the dot-separated form beforehand
+        if text is None:
+            encoding = BatchEncoding()
+        else:
+            prepared_text = self._preprocess_input_text(text)
+            encoding = self.tokenizer(
+                text=prepared_text,
+                **merged_kwargs["text_kwargs"],
+            )
+
+        encoding.update(image_features)
+        return encoding
+
+    def _preprocess_input_text(self, text):
+        """
+        Bring the input text into the merged, dot-separated label format.
+
+        A list of candidate labels is merged into one string, for example, ["a cat", "a dog"] -> "a cat. a dog.".
+        A batch (list of such lists) is merged sample by sample. Anything else, e.g. text that already has the
+        form "a cat. a dog.", is returned as is.
+        """
+        if _is_list_of_candidate_labels(text):
+            return _merge_candidate_labels_text(text)
+
+        # for batched input
+        is_batch_of_label_lists = isinstance(text, (list, tuple)) and all(
+            _is_list_of_candidate_labels(sample) for sample in text
+        )
+        if is_batch_of_label_lists:
+            return [_merge_candidate_labels_text(sample) for sample in text]
+
+        return text
+
+    def post_process_grounded_object_detection(
+        self,
+        outputs: "GroundingDinoObjectDetectionOutput",
+        input_ids: Optional[TensorType] = None,
+        threshold: float = 0.25,
+        text_threshold: float = 0.25,
+        target_sizes: Optional[Union[TensorType, list[tuple]]] = None,
+        text_labels: Optional[list[list[str]]] = None,
+    ):
+        """
+        Turn the raw output of [`GroundingDinoForObjectDetection`] into final bounding boxes in (top_left_x,
+        top_left_y, bottom_right_x, bottom_right_y) format together with the associated text labels.
+
+        Args:
+            outputs ([`GroundingDinoObjectDetectionOutput`]):
+                Raw outputs of the model.
+            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+                The token ids of the input text. If not provided will be taken from the model output.
+            threshold (`float`, *optional*, defaults to 0.25):
+                Threshold to keep object detection predictions based on confidence score.
+            text_threshold (`float`, *optional*, defaults to 0.25):
+                Score threshold to keep text detection predictions.
+            target_sizes (`torch.Tensor` or `list[tuple[int, int]]`, *optional*):
+                Tensor of shape `(batch_size, 2)` or list of tuples (`tuple[int, int]`) containing the target size
+                `(height, width)` of each image in the batch. If unset, predictions will not be resized.
+            text_labels (`list[list[str]]`, *optional*):
+                List of candidate labels to be detected on each image. At the moment it's *NOT used*, but required
+                to be in signature for the zero-shot object detection pipeline. Text labels are instead extracted
+                from the `input_ids` tensor provided in `outputs`.
+
+        Returns:
+            `list[Dict]`: A list of dictionaries, each dictionary containing the
+                - **scores**: tensor of confidence scores for detected objects
+                - **boxes**: tensor of bounding boxes in [x0, y0, x1, y1] format
+                - **labels**: list of text labels for each detected object (will be replaced with integer ids in v4.51.0)
+                - **text_labels**: list of text labels for detected objects
+
+        Raises:
+            ValueError: If `target_sizes` is given and its length differs from the batch dimension of the logits.
+        """
+        batch_logits = outputs.logits
+        batch_boxes = outputs.pred_boxes
+        if input_ids is None:
+            input_ids = outputs.input_ids
+
+        resize = target_sizes is not None
+        if resize and len(target_sizes) != len(batch_logits):
+            raise ValueError("Make sure that you pass in as many target sizes as the batch dimension of the logits")
+
+        batch_probs = torch.sigmoid(batch_logits)  # (batch_size, num_queries, 256)
+        batch_scores = batch_probs.max(dim=-1).values  # (batch_size, num_queries)
+
+        # Convert to [x0, y0, x1, y1] format
+        batch_boxes = center_to_corners_format(batch_boxes)
+
+        # Convert from relative [0, 1] to absolute [0, height] coordinates
+        if resize:
+            if isinstance(target_sizes, list):
+                img_h = torch.Tensor([size[0] for size in target_sizes])
+                img_w = torch.Tensor([size[1] for size in target_sizes])
+            else:
+                img_h, img_w = target_sizes.unbind(1)
+
+            scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(batch_boxes.device)
+            batch_boxes = batch_boxes * scale_fct[:, None, :]
+
+        results = []
+        for idx, (scores, boxes, probs) in enumerate(zip(batch_scores, batch_boxes, batch_probs)):
+            keep = scores > threshold
+
+            # extract text labels
+            token_mask = probs[keep] > text_threshold
+            label_ids = get_phrases_from_posmap(token_mask, input_ids[idx])
+            objects_text_labels = self.batch_decode(label_ids)
+
+            results.append(
+                DictWithDeprecationWarning(
+                    {
+                        "scores": scores[keep],
+                        "boxes": boxes[keep],
+                        "text_labels": objects_text_labels,
+                        # TODO: @pavel, set labels to None since v4.51.0 or find a way to extract ids
+                        "labels": objects_text_labels,
+                    }
+                )
+            )
+
+        return results
+
+
+__all__ = ["GroundingDinoProcessor"]
diff --git a/phivenv/Lib/site-packages/transformers/models/groupvit/__init__.py b/phivenv/Lib/site-packages/transformers/models/groupvit/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..ab7fa27d09d16590d6ba25185c9ef9c4974e2ea1
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/groupvit/__init__.py
@@ -0,0 +1,28 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_groupvit import *
+    from .modeling_groupvit import *
+    from .modeling_tf_groupvit import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/phivenv/Lib/site-packages/transformers/models/groupvit/__pycache__/__init__.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/groupvit/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ef5b69146eb4c4b328881c1b36a96428e198d1ae
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/groupvit/__pycache__/__init__.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/groupvit/__pycache__/configuration_groupvit.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/groupvit/__pycache__/configuration_groupvit.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..56319136a42c8f23872375e39040b4ae8beb9e50
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/groupvit/__pycache__/configuration_groupvit.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/groupvit/__pycache__/modeling_groupvit.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/groupvit/__pycache__/modeling_groupvit.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a00a1877c68ce84f3981e155f0da42e37caa4ba5
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/groupvit/__pycache__/modeling_groupvit.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/groupvit/__pycache__/modeling_tf_groupvit.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/groupvit/__pycache__/modeling_tf_groupvit.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..af0f440b16a1517f3ddd8fb3bb7899084a9b3f9e
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/groupvit/__pycache__/modeling_tf_groupvit.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/groupvit/configuration_groupvit.py b/phivenv/Lib/site-packages/transformers/models/groupvit/configuration_groupvit.py
new file mode 100644
index 0000000000000000000000000000000000000000..d17288ede723c916d7cb1c6fe660ee68b7202788
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/groupvit/configuration_groupvit.py
@@ -0,0 +1,407 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""GroupViT model configuration"""
+
+from collections import OrderedDict
+from collections.abc import Mapping
+from typing import TYPE_CHECKING, Any, Optional
+
+from ...configuration_utils import PretrainedConfig
+from ...onnx import OnnxConfig
+from ...utils import logging
+
+
+if TYPE_CHECKING:
+    from ...processing_utils import ProcessorMixin
+    from ...utils import TensorType
+
+
+logger = logging.get_logger(__name__)
+
+
+class GroupViTTextConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`GroupViTTextModel`]. It is used to instantiate a
+    GroupViT model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the GroupViT
+    [nvidia/groupvit-gcc-yfcc](https://huggingface.co/nvidia/groupvit-gcc-yfcc) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 49408):
+            Vocabulary size of the GroupViT text model. Defines the number of different tokens that can be represented
+            by the `input_ids` passed when calling [`GroupViTModel`].
+        hidden_size (`int`, *optional*, defaults to 256):
+            Dimensionality of the encoder layers and the pooler layer.
+        intermediate_size (`int`, *optional*, defaults to 1024):
+            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+        num_hidden_layers (`int`, *optional*, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 4):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        max_position_embeddings (`int`, *optional*, defaults to 77):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            just in case (e.g., 512 or 1024 or 2048).
+        hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"selu"`, `"gelu_new"` and `"quick_gelu"` are supported.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-5):
+            The epsilon used by the layer normalization layers.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        dropout (`float`, *optional*, defaults to 0.0):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        initializer_factor (`float`, *optional*, defaults to 1.0):
+            A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
+            testing).
+        pad_token_id (`int`, *optional*, defaults to 1):
+            Padding token id, forwarded to [`PretrainedConfig`].
+        bos_token_id (`int`, *optional*, defaults to 49406):
+            Beginning-of-sequence token id, forwarded to [`PretrainedConfig`].
+        eos_token_id (`int`, *optional*, defaults to 49407):
+            End-of-sequence token id, forwarded to [`PretrainedConfig`].
+
+    Example:
+
+    ```python
+    >>> from transformers import GroupViTTextConfig, GroupViTTextModel
+
+    >>> # Initializing a GroupViTTextModel with nvidia/groupvit-gcc-yfcc style configuration
+    >>> configuration = GroupViTTextConfig()
+
+    >>> model = GroupViTTextModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "groupvit_text_model"
+    base_config_key = "text_config"
+
+    def __init__(
+        self,
+        vocab_size=49408,
+        hidden_size=256,
+        intermediate_size=1024,
+        num_hidden_layers=12,
+        num_attention_heads=4,
+        max_position_embeddings=77,
+        hidden_act="quick_gelu",
+        layer_norm_eps=1e-5,
+        dropout=0.0,
+        attention_dropout=0.0,
+        initializer_range=0.02,
+        initializer_factor=1.0,
+        pad_token_id=1,
+        bos_token_id=49406,
+        eos_token_id=49407,
+        **kwargs,
+    ):
+        # The special token ids and any remaining keyword arguments are handled by the base class
+        super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
+
+        # Every architecture argument is stored unchanged under the attribute of the same name
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.dropout = dropout
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.max_position_embeddings = max_position_embeddings
+        self.layer_norm_eps = layer_norm_eps
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.initializer_factor = initializer_factor
+        self.attention_dropout = attention_dropout
+
+
+class GroupViTVisionConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`GroupViTVisionModel`]. It is used to instantiate
+    a GroupViT model according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of the GroupViT
+    [nvidia/groupvit-gcc-yfcc](https://huggingface.co/nvidia/groupvit-gcc-yfcc) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        hidden_size (`int`, *optional*, defaults to 384):
+            Dimensionality of the encoder layers and the pooler layer.
+        intermediate_size (`int`, *optional*, defaults to 1536):
+            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+        depths (`list[int]`, *optional*, defaults to [6, 3, 3]):
+            The number of layers in each encoder block.
+        num_hidden_layers (`int`, *optional*, defaults to 12):
+            Total number of hidden layers. Expected to equal `sum(depths)`; a warning is logged otherwise and the
+            given value is stored unchanged.
+        num_group_tokens (`list[int]`, *optional*, defaults to [64, 8, 0]):
+            The number of group tokens for each stage.
+        num_output_groups (`list[int]`, *optional*, defaults to [64, 8, 8]):
+            The number of output groups for each stage, 0 means no group.
+        num_attention_heads (`int`, *optional*, defaults to 6):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        image_size (`int`, *optional*, defaults to 224):
+            The size (resolution) of each image.
+        patch_size (`int`, *optional*, defaults to 16):
+            The size (resolution) of each patch.
+        num_channels (`int`, *optional*, defaults to 3):
+            Presumably the number of input image channels (verify against the model code).
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"selu"`, `"gelu_new"` and `"quick_gelu"` are supported.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-5):
+            The epsilon used by the layer normalization layers.
+        dropout (`float`, *optional*, defaults to 0.0):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        initializer_factor (`float`, *optional*, defaults to 1.0):
+            A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
+            testing).
+        assign_eps (`float`, *optional*, defaults to 1.0):
+            Stored unchanged; presumably an epsilon used by the token assignment (verify against the model code).
+        assign_mlp_ratio (`list[float]`, *optional*, defaults to [0.5, 4]):
+            Stored as a copy; presumably MLP ratios used by the token assignment (verify against the model code).
+
+    Example:
+
+    ```python
+    >>> from transformers import GroupViTVisionConfig, GroupViTVisionModel
+
+    >>> # Initializing a GroupViTVisionModel with nvidia/groupvit-gcc-yfcc style configuration
+    >>> configuration = GroupViTVisionConfig()
+
+    >>> model = GroupViTVisionModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "groupvit_vision_model"
+    base_config_key = "vision_config"
+
+    def __init__(
+        self,
+        hidden_size=384,
+        intermediate_size=1536,
+        depths=[6, 3, 3],
+        num_hidden_layers=12,
+        num_group_tokens=[64, 8, 0],
+        num_output_groups=[64, 8, 8],
+        num_attention_heads=6,
+        image_size=224,
+        patch_size=16,
+        num_channels=3,
+        hidden_act="gelu",
+        layer_norm_eps=1e-5,
+        dropout=0.0,
+        attention_dropout=0.0,
+        initializer_range=0.02,
+        initializer_factor=1.0,
+        assign_eps=1.0,
+        assign_mlp_ratio=[0.5, 4],
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        # The list-valued arguments are copied: the signature defaults are single shared list objects, so
+        # storing them by reference would let a mutation on one config leak into every later default instance.
+        self.depths = list(depths)
+        if num_hidden_layers != sum(depths):
+            logger.warning(
+                f"Manually setting num_hidden_layers to {num_hidden_layers}, but we expect num_hidden_layers ="
+                f" sum(depth) = {sum(depths)}"
+            )
+        self.num_hidden_layers = num_hidden_layers
+        self.num_group_tokens = list(num_group_tokens)
+        self.num_output_groups = list(num_output_groups)
+        self.num_attention_heads = num_attention_heads
+        self.image_size = image_size
+        self.patch_size = patch_size
+        self.num_channels = num_channels
+        self.hidden_act = hidden_act
+        self.layer_norm_eps = layer_norm_eps
+        self.dropout = dropout
+        self.attention_dropout = attention_dropout
+        self.initializer_range = initializer_range
+        self.initializer_factor = initializer_factor
+        self.assign_eps = assign_eps
+        self.assign_mlp_ratio = list(assign_mlp_ratio)
+
+
+class GroupViTConfig(PretrainedConfig):
+    r"""
+    [`GroupViTConfig`] is the configuration class to store the configuration of a [`GroupViTModel`]. It is used to
+    instantiate a GroupViT model according to the specified arguments, defining the text model and vision model
+    configs. Instantiating a configuration with the defaults will yield a similar configuration to that of the GroupViT
+    [nvidia/groupvit-gcc-yfcc](https://huggingface.co/nvidia/groupvit-gcc-yfcc) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        text_config (`dict`, *optional*):
+            Dictionary of configuration options used to initialize [`GroupViTTextConfig`].
+        vision_config (`dict`, *optional*):
+            Dictionary of configuration options used to initialize [`GroupViTVisionConfig`].
+        projection_dim (`int`, *optional*, defaults to 256):
+            Dimensionality of text and vision projection layers.
+        projection_intermediate_dim (`int`, *optional*, defaults to 4096):
+            Dimensionality of intermediate layer of text and vision projection layers.
+        logit_scale_init_value (`float`, *optional*, defaults to 2.6592):
+            The initial value of the *logit_scale* parameter. Default is used as per the original GroupViT
+            implementation.
+        kwargs (*optional*):
+            Dictionary of keyword arguments.
+    """
+
+    model_type = "groupvit"
+    sub_configs = {"text_config": GroupViTTextConfig, "vision_config": GroupViTVisionConfig}
+
+    def __init__(
+        self,
+        text_config=None,
+        vision_config=None,
+        projection_dim=256,
+        projection_intermediate_dim=4096,
+        logit_scale_init_value=2.6592,
+        **kwargs,
+    ):
+        # If `_config_dict` exist, we use them for the backward compatibility.
+        # We pop out these 2 attributes before calling `super().__init__` to avoid them being saved (which causes a lot
+        # of confusion!).
+        text_config_dict = kwargs.pop("text_config_dict", None)
+        vision_config_dict = kwargs.pop("vision_config_dict", None)
+
+        super().__init__(**kwargs)
+
+        # Instead of simply assigning `[text|vision]_config_dict` to `[text|vision]_config`, we use the values in
+        # `[text|vision]_config_dict` to update the values in `[text|vision]_config`. The values should be same in most
+        # cases, but we don't want to break anything regarding `_config_dict` that existed before commit `8827e1b2`.
+        if text_config_dict is not None:
+            if text_config is None:
+                text_config = {}
+
+            # This is the complete result when using `text_config_dict`.
+            _text_config_dict = GroupViTTextConfig(**text_config_dict).to_dict()
+
+            # Give a warning if the values exist in both `_text_config_dict` and `text_config` but being different.
+            for key, value in _text_config_dict.items():
+                if key in text_config and value != text_config[key] and key not in ["transformers_version"]:
+                    # If specified in `text_config_dict`
+                    if key in text_config_dict:
+                        message = (
+                            f"`{key}` is found in both `text_config_dict` and `text_config` but with different values. "
+                            f'The value `text_config_dict["{key}"]` will be used instead.'
+                        )
+                    # If inferred from default argument values (just to be super careful)
+                    else:
+                        message = (
+                            f"`text_config_dict` is provided which will be used to initialize `GroupViTTextConfig`. "
+                            f'The value `text_config["{key}"]` will be overridden.'
+                        )
+                    logger.info(message)
+
+            # Update all values in `text_config` with the ones in `_text_config_dict`.
+            text_config.update(_text_config_dict)
+
+        if vision_config_dict is not None:
+            if vision_config is None:
+                vision_config = {}
+
+            # This is the complete result when using `vision_config_dict`.
+            _vision_config_dict = GroupViTVisionConfig(**vision_config_dict).to_dict()
+            # convert keys to string instead of integer
+            if "id2label" in _vision_config_dict:
+                _vision_config_dict["id2label"] = {
+                    str(key): value for key, value in _vision_config_dict["id2label"].items()
+                }
+
+            # Give a warning if the values exist in both `_vision_config_dict` and `vision_config` but being different.
+            for key, value in _vision_config_dict.items():
+                if key in vision_config and value != vision_config[key] and key not in ["transformers_version"]:
+                    # If specified in `vision_config_dict`
+                    if key in vision_config_dict:
+                        message = (
+                            f"`{key}` is found in both `vision_config_dict` and `vision_config` but with different "
+                            f'values. The value `vision_config_dict["{key}"]` will be used instead.'
+                        )
+                    # If inferred from default argument values (just to be super careful)
+                    else:
+                        message = (
+                            f"`vision_config_dict` is provided which will be used to initialize `GroupViTVisionConfig`."
+                            f' The value `vision_config["{key}"]` will be overridden.'
+                        )
+                    logger.info(message)
+
+            # Update all values in `vision_config` with the ones in `_vision_config_dict`.
+            vision_config.update(_vision_config_dict)
+
+        if text_config is None:
+            text_config = {}
+            logger.info("`text_config` is `None`. Initializing the `GroupViTTextConfig` with default values.")
+
+        if vision_config is None:
+            vision_config = {}
+            logger.info("`vision_config` is `None`. initializing the `GroupViTVisionConfig` with default values.")
+
+        self.text_config = GroupViTTextConfig(**text_config)
+        self.vision_config = GroupViTVisionConfig(**vision_config)
+
+        self.projection_dim = projection_dim
+        self.projection_intermediate_dim = projection_intermediate_dim
+        self.logit_scale_init_value = logit_scale_init_value
+        self.initializer_range = 0.02
+        self.initializer_factor = 1.0
+        self.output_segmentation = False
+
+
+class GroupViTOnnxConfig(OnnxConfig):
+    """ONNX export description for GroupViT: named input/output axes, validation tolerance and dummy inputs."""
+
+    @property
+    def inputs(self) -> Mapping[str, Mapping[int, str]]:
+        """Ordered mapping from each input name to its `{axis index: axis name}` description."""
+        return OrderedDict(
+            [
+                ("input_ids", {0: "batch", 1: "sequence"}),
+                ("pixel_values", {0: "batch", 1: "num_channels", 2: "height", 3: "width"}),
+                ("attention_mask", {0: "batch", 1: "sequence"}),
+            ]
+        )
+
+    @property
+    def outputs(self) -> Mapping[str, Mapping[int, str]]:
+        """Ordered mapping from each output name to its named axes; only axis 0 (`batch`) is named."""
+        return OrderedDict(
+            [
+                ("logits_per_image", {0: "batch"}),
+                ("logits_per_text", {0: "batch"}),
+                ("text_embeds", {0: "batch"}),
+                ("image_embeds", {0: "batch"}),
+            ]
+        )
+
+    @property
+    def atol_for_validation(self) -> float:
+        """Absolute tolerance value; presumably consumed by the export validation in `OnnxConfig` (confirm there)."""
+        return 1e-4
+
+    def generate_dummy_inputs(
+        self,
+        processor: "ProcessorMixin",
+        batch_size: int = -1,
+        seq_length: int = -1,
+        framework: Optional["TensorType"] = None,
+    ) -> Mapping[str, Any]:
+        """
+        Build dummy inputs by calling the parent implementation once with `processor.tokenizer` and once with
+        `processor.image_processor`, then merging both results into a single dict.
+
+        `seq_length` is only forwarded to the tokenizer call. If both results share a key, the image entry wins
+        because it is unpacked last.
+        """
+        text_input_dict = super().generate_dummy_inputs(
+            processor.tokenizer, batch_size=batch_size, seq_length=seq_length, framework=framework
+        )
+        image_input_dict = super().generate_dummy_inputs(
+            processor.image_processor, batch_size=batch_size, framework=framework
+        )
+        return {**text_input_dict, **image_input_dict}
+
+    @property
+    def default_onnx_opset(self) -> int:
+        """ONNX opset version reported by this config (14)."""
+        return 14
+
+
+__all__ = ["GroupViTConfig", "GroupViTOnnxConfig", "GroupViTTextConfig", "GroupViTVisionConfig"]
diff --git a/phivenv/Lib/site-packages/transformers/models/groupvit/modeling_groupvit.py b/phivenv/Lib/site-packages/transformers/models/groupvit/modeling_groupvit.py
new file mode 100644
index 0000000000000000000000000000000000000000..c9673a128fa8d5a01ea8612d0ebb7fdf7cb93927
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/groupvit/modeling_groupvit.py
@@ -0,0 +1,1464 @@
+# coding=utf-8
+# Copyright 2022 NVIDIA and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch GroupViT model."""
+
+import collections.abc
+from dataclasses import dataclass
+from typing import Any, Optional, Union
+
+import numpy as np
+import torch
+import torch.utils.checkpoint
+from torch import nn
+
+from ...activations import ACT2FN
+from ...modeling_attn_mask_utils import _create_4d_causal_attention_mask, _prepare_4d_attention_mask
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
+from ...modeling_utils import PreTrainedModel
+from ...utils import ModelOutput, auto_docstring, logging, torch_int
+from .configuration_groupvit import GroupViTConfig, GroupViTTextConfig, GroupViTVisionConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+# contrastive loss function, adapted from
+# https://sachinruk.github.io/blog/pytorch/pytorch%20lightning/loss%20function/gpu/2021/03/07/CLIP.html
+def contrastive_loss(logits: torch.Tensor) -> torch.Tensor:
+    return nn.functional.cross_entropy(logits, torch.arange(len(logits), device=logits.device))
+
+
+# Copied from transformers.models.clip.modeling_clip.clip_loss with clip->groupvit
+def groupvit_loss(similarity: torch.Tensor) -> torch.Tensor:
+    caption_loss = contrastive_loss(similarity)
+    image_loss = contrastive_loss(similarity.t())
+    return (caption_loss + image_loss) / 2.0
+
+
+def hard_softmax(logits: torch.Tensor, dim: int):
+    y_soft = logits.softmax(dim)
+    # Straight through.
+    index = y_soft.max(dim, keepdim=True)[1]
+    y_hard = torch.zeros_like(logits, memory_format=torch.legacy_contiguous_format).scatter_(dim, index, 1.0)
+    ret = y_hard - y_soft.detach() + y_soft
+
+    return ret
+
+
+def gumbel_softmax(logits: torch.Tensor, tau: float = 1, hard: bool = False, dim: int = -1) -> torch.Tensor:
+    # more stable https://github.com/pytorch/pytorch/issues/41663
+    gumbel_dist = torch.distributions.gumbel.Gumbel(
+        torch.tensor(0.0, device=logits.device, dtype=logits.dtype),
+        torch.tensor(1.0, device=logits.device, dtype=logits.dtype),
+    )
+    gumbels = gumbel_dist.sample(logits.shape)
+
+    gumbels = (logits + gumbels) / tau  # ~Gumbel(logits,tau)
+    y_soft = gumbels.softmax(dim)
+
+    if hard:
+        # Straight through.
+        index = y_soft.max(dim, keepdim=True)[1]
+        y_hard = torch.zeros_like(logits, memory_format=torch.legacy_contiguous_format).scatter_(dim, index, 1.0)
+        ret = y_hard - y_soft.detach() + y_soft
+    else:
+        # Reparametrization trick.
+        ret = y_soft
+    return ret
+
+
+def resize_attention_map(attentions, height, width, align_corners=False):
+    """
+    Args:
+        attentions (`torch.Tensor`): attention map of shape [batch_size, groups, feat_height*feat_width]
+        height (`int`): height of the output attention map
+        width (`int`): width of the output attention map
+        align_corners (`bool`, *optional*): the `align_corner` argument for `nn.functional.interpolate`.
+
+    Returns:
+        `torch.Tensor`: resized attention map of shape [batch_size, groups, height, width]
+    """
+
+    scale = (height * width // attentions.shape[2]) ** 0.5
+    if height > width:
+        feat_width = int(np.round(width / scale))
+        feat_height = attentions.shape[2] // feat_width
+    else:
+        feat_height = int(np.round(height / scale))
+        feat_width = attentions.shape[2] // feat_height
+
+    batch_size = attentions.shape[0]
+    groups = attentions.shape[1]  # number of group token
+    # [batch_size, groups, height*width, groups] -> [batch_size, groups, height, width]
+    attentions = attentions.reshape(batch_size, groups, feat_height, feat_width)
+    attentions = nn.functional.interpolate(
+        attentions, size=(height, width), mode="bilinear", align_corners=align_corners
+    )
+    return attentions
+
+
+def get_grouping_from_attentions(attentions, hw_shape):
+    """
+    Args:
+        attentions (`tuple(torch.FloatTensor)`: tuple of attention maps returned by `GroupViTVisionTransformer`
+        hw_shape (`tuple(int)`): height and width of the output attention map
+    Returns:
+        `torch.Tensor`: the attention map of shape [batch_size, groups, height, width]
+    """
+
+    attn_maps = []
+    with torch.no_grad():
+        prev_attn_masks = None
+        for attn_masks in attentions:
+            # [batch_size, num_groups, height x width] -> [batch_size, height x width, num_groups]
+            attn_masks = attn_masks.permute(0, 2, 1).contiguous()
+            if prev_attn_masks is None:
+                prev_attn_masks = attn_masks
+            else:
+                prev_attn_masks = prev_attn_masks @ attn_masks
+            # [batch_size, heightxwidth, num_groups] -> [batch_size, num_groups, heightxwidth] -> [batch_size, num_groups, height, width]
+            cur_attn_map = resize_attention_map(prev_attn_masks.permute(0, 2, 1).contiguous(), *hw_shape)
+            attn_maps.append(cur_attn_map)
+
+    # [batch_size, num_groups, height, width]
+    final_grouping = attn_maps[-1]
+
+    return final_grouping
+
+
+class GroupViTCrossAttentionLayer(nn.Module):
+    def __init__(self, config: GroupViTVisionConfig):
+        super().__init__()
+        self.attn = GroupViTAttention(config)
+        self.norm2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.mlp = GroupViTMLP(config)
+        self.norm_post = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+
+    def forward(self, query, key):
+        x = query
+        x = x + self.attn(query, encoder_hidden_states=key)[0]
+        x = x + self.mlp(self.norm2(x))
+        x = self.norm_post(x)
+        return x
+
+
+class GroupViTAssignAttention(nn.Module):
+    def __init__(self, config: GroupViTVisionConfig):
+        super().__init__()
+        self.scale = config.hidden_size**-0.5
+
+        self.q_proj = nn.Linear(config.hidden_size, config.hidden_size)
+        self.k_proj = nn.Linear(config.hidden_size, config.hidden_size)
+        self.v_proj = nn.Linear(config.hidden_size, config.hidden_size)
+        self.proj = nn.Linear(config.hidden_size, config.hidden_size)
+        self.assign_eps = config.assign_eps
+
+    def get_attn(self, attn, gumbel=True, hard=True):
+        if gumbel and self.training:
+            attn = gumbel_softmax(attn, dim=-2, hard=hard)
+        else:
+            if hard:
+                attn = hard_softmax(attn, dim=-2)
+            else:
+                attn = nn.functional.softmax(attn, dim=-2)
+
+        return attn
+
+    def forward(self, query, key):
+        value = key
+        # [batch_size, query_length, channels]
+        query = self.q_proj(query)
+
+        # [batch_size, key_length, channels]
+        key = self.k_proj(key)
+
+        # [batch_size, key_length, channels]
+        value = self.v_proj(value)
+
+        # [batch_size, query_length, key_length]
+        raw_attn = (query @ key.transpose(-2, -1)) * self.scale
+
+        attn = self.get_attn(raw_attn)
+        soft_attn = self.get_attn(raw_attn, gumbel=False, hard=False)
+
+        attn = attn / (attn.sum(dim=-1, keepdim=True) + self.assign_eps)
+
+        out = attn @ value
+
+        out = self.proj(out)
+
+        return out, soft_attn
+
+
+class GroupViTTokenAssign(nn.Module):
+    """
+    Projects the group tokens to `num_output_group` tokens, lets them cross-attend to the image tokens and then
+    aggregates the image tokens into those groups through `GroupViTAssignAttention`.
+    """
+
+    def __init__(self, config: GroupViTVisionConfig, num_group_token, num_output_group):
+        super().__init__()
+        self.num_output_group = num_output_group
+        # norm on group_tokens
+        self.norm_tokens = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        # A scalar `assign_mlp_ratio` is duplicated so it applies to both the token MLP and the channel MLP.
+        assign_mlp_ratio = (
+            config.assign_mlp_ratio
+            if isinstance(config.assign_mlp_ratio, collections.abc.Iterable)
+            else (config.assign_mlp_ratio, config.assign_mlp_ratio)
+        )
+        # Both intermediate widths are expressed as multiples of `hidden_size`.
+        tokens_dim, channels_dim = [int(x * config.hidden_size) for x in assign_mlp_ratio]
+        # Mixer MLP over the token axis: num_group_token -> tokens_dim -> num_output_group.
+        self.mlp_inter = GroupViTMixerMLP(config, num_group_token, tokens_dim, num_output_group)
+        self.norm_post_tokens = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        # norm on x
+        self.norm_x = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.pre_assign_attn = GroupViTCrossAttentionLayer(config)
+
+        self.assign = GroupViTAssignAttention(config)
+        self.norm_new_x = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.mlp_channels = GroupViTMLP(config, config.hidden_size, channels_dim, config.hidden_size)
+
+    def project_group_token(self, group_tokens):
+        """
+        Args:
+            group_tokens (torch.Tensor): group tokens, [batch_size, num_group_tokens, channels]
+
+        Returns:
+            projected_group_tokens (torch.Tensor): [batch_size, num_output_groups, channels]
+        """
+        # [B, num_output_groups, C] <- [B, num_group_tokens, C]
+        projected_group_tokens = self.mlp_inter(group_tokens)
+        projected_group_tokens = self.norm_post_tokens(projected_group_tokens)
+        return projected_group_tokens
+
+    def forward(self, image_tokens, group_tokens):
+        """
+        Args:
+            image_tokens (`torch.Tensor`): image tokens, of shape [batch_size, input_length, channels]
+            group_tokens (`torch.Tensor`): group tokens, [batch_size, num_group_tokens, channels]
+
+        Returns:
+            `tuple`: the aggregated tokens (one per output group) and the second output of `self.assign`
+            (the soft assignment map).
+        """
+
+        group_tokens = self.norm_tokens(group_tokens)
+        image_tokens = self.norm_x(image_tokens)
+        # [batch_size, num_output_groups, channels]
+        projected_group_tokens = self.project_group_token(group_tokens)
+        # Let the projected group tokens gather information from the image tokens before the assignment.
+        projected_group_tokens = self.pre_assign_attn(projected_group_tokens, image_tokens)
+        new_image_tokens, attention = self.assign(projected_group_tokens, image_tokens)
+        # Residual connection from the projected group tokens.
+        new_image_tokens += projected_group_tokens
+
+        # Residual channel MLP on the aggregated tokens.
+        new_image_tokens = new_image_tokens + self.mlp_channels(self.norm_new_x(new_image_tokens))
+
+        return new_image_tokens, attention
+
+
+@dataclass
+@auto_docstring
+class GroupViTModelOutput(ModelOutput):
+    r"""
+    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
+        Contrastive loss for image-text similarity.
+    logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
+        The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
+        similarity scores.
+    logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
+        The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
+        similarity scores.
+    segmentation_logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels, logits_height, logits_width)`):
+        Classification scores for each pixel.
+
+        <Tip warning={true}>
+
+        The logits returned do not necessarily have the same size as the `pixel_values` passed as inputs. This is
+        to avoid doing two interpolations and lose some quality when a user needs to resize the logits to the
+        original image size as post-processing. You should always check your logits shape and resize as needed.
+
+        </Tip>
+    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
+        The text embeddings obtained by applying the projection layer to the pooled output of
+        [`GroupViTTextModel`].
+    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
+        The image embeddings obtained by applying the projection layer to the pooled output of
+        [`GroupViTVisionModel`].
+    text_model_output (`BaseModelOutputWithPooling`):
+        The output of the [`GroupViTTextModel`].
+    vision_model_output (`BaseModelOutputWithPooling`):
+        The output of the [`GroupViTVisionModel`].
+    """
+
+    loss: Optional[torch.FloatTensor] = None
+    logits_per_image: Optional[torch.FloatTensor] = None
+    logits_per_text: Optional[torch.FloatTensor] = None
+    segmentation_logits: Optional[torch.FloatTensor] = None
+    text_embeds: Optional[torch.FloatTensor] = None
+    image_embeds: Optional[torch.FloatTensor] = None
+    text_model_output: Optional[BaseModelOutputWithPooling] = None
+    vision_model_output: Optional[BaseModelOutputWithPooling] = None
+
+    def to_tuple(self) -> tuple[Any, ...]:
+        """Return the populated fields as a tuple, with the two nested model outputs converted via their own `to_tuple()`."""
+        return tuple(
+            self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple()
+            for k in self.keys()
+        )
+
+
+class GroupViTPatchEmbeddings(nn.Module):
+    """
+    Image to Patch Embedding.
+    """
+
+    def __init__(
+        self,
+        image_size: int = 224,
+        patch_size: Union[int, tuple[int, int]] = 16,
+        num_channels: int = 3,
+        embed_dim: int = 768,
+    ):
+        super().__init__()
+        image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
+        patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
+        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
+        self.image_size = image_size
+        self.patch_size = patch_size
+        self.num_patches = num_patches
+
+        self.projection = nn.Conv2d(num_channels, embed_dim, kernel_size=patch_size, stride=patch_size)
+
+    def forward(self, pixel_values: torch.Tensor, interpolate_pos_encoding: bool = False) -> torch.Tensor:
+        batch_size, num_channels, height, width = pixel_values.shape
+        if not interpolate_pos_encoding:
+            if height != self.image_size[0] or width != self.image_size[1]:
+                raise ValueError(
+                    f"Input image size ({height}*{width}) doesn't match model"
+                    f" ({self.image_size[0]}*{self.image_size[1]})."
+                )
+        x = self.projection(pixel_values).flatten(2).transpose(1, 2)
+        return x
+
+
+class GroupViTVisionEmbeddings(nn.Module):
+    """Patch embeddings followed by LayerNorm, learned position embeddings (one per patch) and dropout."""
+
+    def __init__(self, config: GroupViTVisionConfig):
+        super().__init__()
+
+        self.patch_embeddings = GroupViTPatchEmbeddings(
+            image_size=config.image_size,
+            patch_size=config.patch_size,
+            num_channels=config.num_channels,
+            embed_dim=config.hidden_size,
+        )
+        num_patches = self.patch_embeddings.num_patches
+        # One learned position vector per patch; there is no class-token position.
+        self.position_embeddings = nn.Parameter(torch.zeros(1, num_patches, config.hidden_size))
+        self.dropout = nn.Dropout(config.dropout)
+        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.patch_size = config.patch_size
+        self.config = config
+
+    def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
+        """
+        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
+        images. This method is also adapted to support torch.jit tracing and no class embeddings.
+
+        Adapted from:
+        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
+        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
+        """
+
+        num_patches = embeddings.shape[1]
+        num_positions = self.position_embeddings.shape[1]
+
+        # always interpolate when tracing to ensure the exported model works for dynamic input shapes
+        if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
+            return self.position_embeddings
+
+        patch_pos_embed = self.position_embeddings
+
+        dim = embeddings.shape[-1]
+
+        # NOTE(review): the floor divisions below require `self.patch_size` to be an int, not a tuple.
+        new_height = height // self.patch_size
+        new_width = width // self.patch_size
+
+        # The stored positions are laid out as a square grid before resampling, so `num_positions` is assumed
+        # to be a perfect square.
+        sqrt_num_positions = torch_int(num_positions**0.5)
+        patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
+        # Channels first, as expected by `nn.functional.interpolate`.
+        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)
+
+        patch_pos_embed = nn.functional.interpolate(
+            patch_pos_embed,
+            size=(new_height, new_width),
+            mode="bicubic",
+            align_corners=False,
+        )
+
+        # Back to a flat (1, new_height * new_width, dim) sequence.
+        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
+        return patch_pos_embed
+
+    def forward(self, pixel_values: torch.Tensor, interpolate_pos_encoding: bool = False) -> torch.Tensor:
+        """Embed `pixel_values` into normalized patch tokens with position information added, then apply dropout."""
+        batch_size, num_channels, height, width = pixel_values.shape
+        embeddings = self.patch_embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding)
+
+        embeddings = self.layernorm(embeddings)
+
+        # `batch_size` and `seq_len` are unpacked here but not used afterwards.
+        batch_size, seq_len, _ = embeddings.size()
+
+        # add positional encoding to each token
+        if interpolate_pos_encoding:
+            embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
+        else:
+            embeddings = embeddings + self.position_embeddings
+
+        embeddings = self.dropout(embeddings)
+
+        return embeddings
+
+
+# Copied from transformers.models.clip.modeling_clip.CLIPTextEmbeddings with CLIP->GroupViT
+class GroupViTTextEmbeddings(nn.Module):
+    def __init__(self, config: GroupViTTextConfig):
+        super().__init__()
+        embed_dim = config.hidden_size
+
+        self.token_embedding = nn.Embedding(config.vocab_size, embed_dim)
+        self.position_embedding = nn.Embedding(config.max_position_embeddings, embed_dim)
+
+        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
+        self.register_buffer(
+            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
+        )
+
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+    ) -> torch.Tensor:
+        seq_length = input_ids.shape[-1] if input_ids is not None else inputs_embeds.shape[-2]
+        max_position_embedding = self.position_embedding.weight.shape[0]
+
+        if seq_length > max_position_embedding:
+            raise ValueError(
+                f"Sequence length must be less than max_position_embeddings (got `sequence length`: "
+                f"{seq_length} and max_position_embeddings: {max_position_embedding}"
+            )
+
+        if position_ids is None:
+            position_ids = self.position_ids[:, :seq_length]
+
+        if inputs_embeds is None:
+            inputs_embeds = self.token_embedding(input_ids)
+
+        position_embeddings = self.position_embedding(position_ids)
+        embeddings = inputs_embeds + position_embeddings
+
+        return embeddings
+
+
+class GroupViTStage(nn.Module):
+    """This corresponds to the `GroupingLayer` class in the GroupViT implementation."""
+
+    def __init__(
+        self,
+        config: GroupViTVisionConfig,
+        depth: int,
+        num_prev_group_token: int,
+        num_group_token: int,
+        num_output_group: int,
+    ):
+        super().__init__()
+        self.depth = depth
+        self.num_group_token = num_group_token
+        # Learnable group tokens exist only when this stage has at least one group token.
+        if num_group_token > 0:
+            self.group_token = nn.Parameter(torch.zeros(1, num_group_token, config.hidden_size))
+        else:
+            self.group_token = None
+        self.layers = nn.ModuleList([GroupViTEncoderLayer(config) for _ in range(depth)])
+
+        # The token-assignment (grouping) block is only built for stages that own group tokens.
+        if num_group_token > 0:
+            self.downsample = GroupViTTokenAssign(
+                config=config,
+                num_group_token=num_group_token,
+                num_output_group=num_output_group,
+            )
+        else:
+            self.downsample = None
+
+        # Maps the previous stage's group tokens (num_prev_group_token) onto this stage's num_group_token.
+        if num_prev_group_token > 0 and num_group_token > 0:
+            self.group_projector = nn.Sequential(
+                nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps),
+                GroupViTMixerMLP(config, num_prev_group_token, config.hidden_size // 2, num_group_token),
+            )
+        else:
+            self.group_projector = None
+
+    @property
+    def with_group_token(self):
+        """Whether this stage owns learnable group tokens."""
+        return self.group_token is not None
+
+    def split_x(self, x):
+        """Split `x` into `(tokens, group tokens)`; the group tokens are the last `num_group_token` entries."""
+        if self.with_group_token:
+            return x[:, : -self.num_group_token], x[:, -self.num_group_token :]
+        else:
+            return x, None
+
+    def concat_x(self, x: torch.Tensor, group_token: Optional[torch.Tensor] = None) -> torch.Tensor:
+        """Append `group_token` after `x` along the sequence dimension (no-op when it is `None`)."""
+        if group_token is None:
+            return x
+        return torch.cat([x, group_token], dim=1)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        prev_group_token: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = False,
+    ) -> tuple[torch.FloatTensor]:
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+            prev_group_token (`torch.FloatTensor`, *optional*):
+                Group tokens of the previous stage. They are fed to `group_projector` when this stage has one,
+                so they must be provided in that case.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the grouping tensors of Grouping block.
+
+        Returns:
+            `(x, group_token)`, followed by the grouping attention when `output_attentions` is set. The
+            attention entry is `None` for stages without a `downsample` block.
+        """
+        if self.with_group_token:
+            # Broadcast the learned tokens over the batch.
+            group_token = self.group_token.expand(hidden_states.size(0), -1, -1)
+            if self.group_projector is not None:
+                group_token = group_token + self.group_projector(prev_group_token)
+        else:
+            group_token = None
+
+        x = hidden_states
+
+        # Image tokens and group tokens are processed jointly by the encoder layers, without any mask.
+        cat_x = self.concat_x(x, group_token)
+        for layer in self.layers:
+            layer_out = layer(cat_x, attention_mask=None, causal_attention_mask=None)
+            cat_x = layer_out[0]
+
+        x, group_token = self.split_x(cat_x)
+
+        attention = None
+        if self.downsample is not None:
+            # Merge the tokens into the output groups; `x` now holds one token per output group.
+            x, attention = self.downsample(x, group_token)
+
+        outputs = (x, group_token)
+        if output_attentions:
+            outputs = outputs + (attention,)
+
+        return outputs
+
+
+class GroupViTMLP(nn.Module):
+    def __init__(
+        self,
+        config: GroupViTVisionConfig,
+        hidden_size: Optional[int] = None,
+        intermediate_size: Optional[int] = None,
+        output_size: Optional[int] = None,
+    ):
+        super().__init__()
+        self.config = config
+        self.activation_fn = ACT2FN[config.hidden_act]
+        hidden_size = hidden_size if hidden_size is not None else config.hidden_size
+        intermediate_size = intermediate_size if intermediate_size is not None else config.intermediate_size
+        output_size = output_size if output_size is not None else hidden_size
+        self.fc1 = nn.Linear(hidden_size, intermediate_size)
+        self.fc2 = nn.Linear(intermediate_size, output_size)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.fc1(hidden_states)
+        hidden_states = self.activation_fn(hidden_states)
+        hidden_states = self.fc2(hidden_states)
+        return hidden_states
+
+
+class GroupViTMixerMLP(GroupViTMLP):
+    def forward(self, x):
+        x = super().forward(x.transpose(1, 2))
+        return x.transpose(1, 2)
+
+
+class GroupViTAttention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.embed_dim = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = self.embed_dim // self.num_heads
+        if self.head_dim * self.num_heads != self.embed_dim:
+            raise ValueError(
+                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
+                f" {self.num_heads})."
+            )
+        self.scale = self.head_dim**-0.5
+        self.dropout = config.attention_dropout
+
+        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
+        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
+        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
+        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)
+
+    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
+        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        causal_attention_mask: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = False,
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
+        """Input shape: Batch x Time x Channel"""
+
+        bsz, tgt_len, embed_dim = hidden_states.size()
+        is_cross_attention = encoder_hidden_states is not None
+
+        # get query proj
+        query_states = self.q_proj(hidden_states) * self.scale
+        if is_cross_attention:
+            key_states = self._shape(self.k_proj(encoder_hidden_states), -1, bsz)
+            value_states = self._shape(self.v_proj(encoder_hidden_states), -1, bsz)
+        else:
+            key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
+            value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
+
+        proj_shape = (bsz * self.num_heads, -1, self.head_dim)
+        query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
+        key_states = key_states.view(*proj_shape)
+        value_states = value_states.view(*proj_shape)
+
+        src_len = key_states.size(1)
+        attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))
+
+        if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
+            raise ValueError(
+                f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
+                f" {attn_weights.size()}"
+            )
+
+        # apply the causal_attention_mask first
+        if causal_attention_mask is not None:
+            if causal_attention_mask.size() != (bsz, 1, tgt_len, src_len):
+                raise ValueError(
+                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is"
+                    f" {causal_attention_mask.size()}"
+                )
+            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + causal_attention_mask
+            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
+
+        if attention_mask is not None:
+            if attention_mask.size() != (bsz, 1, tgt_len, src_len):
+                raise ValueError(
+                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
+                )
+            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
+            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
+
+        attn_weights = nn.functional.softmax(attn_weights, dim=-1)
+
+        if output_attentions:
+            # this operation is a bit akward, but it's required to
+            # make sure that attn_weights keeps its gradient.
+            # In order to do so, attn_weights have to reshaped
+            # twice and have to be reused in the following
+            attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
+            attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
+        else:
+            attn_weights_reshaped = None
+
+        attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
+
+        attn_output = torch.bmm(attn_probs, value_states)
+
+        if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
+            raise ValueError(
+                f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is"
+                f" {attn_output.size()}"
+            )
+
+        attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
+        attn_output = attn_output.transpose(1, 2)
+        attn_output = attn_output.reshape(bsz, tgt_len, embed_dim)
+
+        attn_output = self.out_proj(attn_output)
+
+        return attn_output, attn_weights_reshaped
+
+
+# Copied from transformers.models.altclip.modeling_altclip.AltCLIPEncoderLayer with AltCLIP->GroupViT
+class GroupViTEncoderLayer(GradientCheckpointingLayer):
+    def __init__(self, config: GroupViTConfig):
+        super().__init__()
+        self.embed_dim = config.hidden_size
+        self.self_attn = GroupViTAttention(config)
+        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+        self.mlp = GroupViTMLP(config)
+        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: torch.Tensor,
+        causal_attention_mask: torch.Tensor,
+        output_attentions: Optional[bool] = False,
+    ) -> tuple[torch.FloatTensor]:
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+            attention_mask (`torch.FloatTensor`): attention mask of size
+                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+                `(config.encoder_attention_heads,)`.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+        """
+        residual = hidden_states
+
+        hidden_states = self.layer_norm1(hidden_states)
+        hidden_states, attn_weights = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            causal_attention_mask=causal_attention_mask,
+            output_attentions=output_attentions,
+        )
+        hidden_states = residual + hidden_states
+
+        residual = hidden_states
+        hidden_states = self.layer_norm2(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states
+
+        outputs = (hidden_states,)
+
+        if output_attentions:
+            outputs += (attn_weights,)
+
+        return outputs
+
+
+@auto_docstring
+class GroupViTPreTrainedModel(PreTrainedModel):
+    config: GroupViTConfig
+    base_model_prefix = "groupvit"
+    supports_gradient_checkpointing = True
+
+    def _init_weights(self, module):
+        """Initialize the weights"""
+
+        init_range = self.config.initializer_range
+        if isinstance(module, (nn.Linear, nn.Conv2d)):
+            # Slightly different from the TF version which uses truncated_normal for initialization
+            # cf https://github.com/pytorch/pytorch/pull/5617
+            module.weight.data.normal_(mean=0.0, std=init_range)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+
+        factor = self.config.initializer_factor
+        if isinstance(module, GroupViTTextEmbeddings):
+            module.token_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02)
+            module.position_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02)
+        elif isinstance(module, GroupViTAttention):
+            factor = self.config.initializer_factor
+            in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
+            out_proj_std = (module.embed_dim**-0.5) * factor
+            nn.init.normal_(module.q_proj.weight, std=in_proj_std)
+            nn.init.normal_(module.k_proj.weight, std=in_proj_std)
+            nn.init.normal_(module.v_proj.weight, std=in_proj_std)
+            nn.init.normal_(module.out_proj.weight, std=out_proj_std)
+        elif isinstance(module, GroupViTMLP):
+            factor = self.config.initializer_factor
+            in_proj_std = (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
+            fc_std = (2 * module.config.hidden_size) ** -0.5 * factor
+            nn.init.normal_(module.fc1.weight, std=fc_std)
+            nn.init.normal_(module.fc2.weight, std=in_proj_std)
+
+
+class GroupViTVisionEncoder(nn.Module):
+    def __init__(self, config: GroupViTVisionConfig) -> None:
+        super().__init__()
+        self.config = config
+        self.stages = nn.ModuleList(
+            [
+                GroupViTStage(
+                    config=config,
+                    depth=config.depths[i],
+                    num_group_token=config.num_group_tokens[i],
+                    num_output_group=config.num_output_groups[i],
+                    num_prev_group_token=config.num_output_groups[i - 1] if i > 0 else 0,
+                )
+                for i in range(len(config.depths))
+            ]
+        )
+        self.gradient_checkpointing = False
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        output_hidden_states: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, BaseModelOutput]:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        all_hidden_states = () if output_hidden_states else None
+        all_groupings = () if output_attentions else None
+
+        group_tokens = None
+
+        for i, stage in enumerate(self.stages):
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)
+
+            layer_outputs = stage(hidden_states, group_tokens, output_attentions)
+
+            hidden_states = layer_outputs[0]
+            group_tokens = layer_outputs[1]
+
+            if output_attentions and layer_outputs[2] is not None:
+                all_groupings = all_groupings + (layer_outputs[2],)
+
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+
+        if not return_dict:
+            return tuple(v for v in [hidden_states, all_hidden_states, all_groupings] if v is not None)
+        return BaseModelOutput(
+            last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_groupings
+        )
+
+
+class GroupViTTextEncoder(nn.Module):
+    """
+    Transformer encoder consisting of `config.num_hidden_layers` self-attention layers. Each layer is a
+    [`GroupViTEncoderLayer`].
+
+    Args:
+        config: GroupViTTextConfig
+    """
+
+    def __init__(self, config: GroupViTTextConfig):
+        super().__init__()
+        self.config = config
+        self.layers = nn.ModuleList([GroupViTEncoderLayer(config) for _ in range(config.num_hidden_layers)])
+        self.gradient_checkpointing = False
+
+    def forward(
+        self,
+        inputs_embeds,
+        attention_mask: Optional[torch.Tensor] = None,
+        causal_attention_mask: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, BaseModelOutput]:
+        r"""
+        Args:
+            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
+                than the model's internal embedding lookup matrix.
+            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
+
+                [What are attention masks?](../glossary#attention-mask)
+            causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Causal mask for the text model. Mask values selected in `[0, 1]`:
+
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
+
+                [What are attention masks?](../glossary#attention-mask)
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            output_hidden_states (`bool`, *optional*):
+                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+                for more detail.
+            return_dict (`bool`, *optional*):
+                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        encoder_states = () if output_hidden_states else None
+        all_attentions = () if output_attentions else None
+
+        hidden_states = inputs_embeds
+        for idx, encoder_layer in enumerate(self.layers):
+            if output_hidden_states:
+                encoder_states = encoder_states + (hidden_states,)
+            layer_outputs = encoder_layer(
+                hidden_states,
+                attention_mask,
+                causal_attention_mask,
+                output_attentions=output_attentions,
+            )
+
+            hidden_states = layer_outputs[0]
+
+            if output_attentions:
+                all_attentions = all_attentions + (layer_outputs[1],)
+
+        if output_hidden_states:
+            encoder_states = encoder_states + (hidden_states,)
+
+        if not return_dict:
+            return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
+        return BaseModelOutput(
+            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
+        )
+
+
+class GroupViTTextTransformer(nn.Module):
+    def __init__(self, config: GroupViTTextConfig):
+        super().__init__()
+        self.config = config
+        embed_dim = config.hidden_size
+        self.embeddings = GroupViTTextEmbeddings(config)
+        self.encoder = GroupViTTextEncoder(config)
+        self.final_layer_norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
+
+        # For `pooled_output` computation
+        self.eos_token_id = config.eos_token_id
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, BaseModelOutputWithPooling]:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if input_ids is None:
+            raise ValueError("You have to specify input_ids")
+
+        input_shape = input_ids.size()
+        input_ids = input_ids.view(-1, input_shape[-1])
+
+        hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids)
+
+        # CLIP's text model uses causal mask, prepare it here.
+        # https://github.com/openai/CLIP/blob/cfcffb90e69f37bf2ff1e988237a0fbe41f33c04/clip/model.py#L324
+        causal_attention_mask = _create_4d_causal_attention_mask(
+            input_shape, hidden_states.dtype, device=hidden_states.device
+        )
+
+        # expand attention_mask
+        if attention_mask is not None:
+            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+            attention_mask = _prepare_4d_attention_mask(attention_mask, hidden_states.dtype)
+
+        encoder_outputs = self.encoder(
+            inputs_embeds=hidden_states,
+            attention_mask=attention_mask,
+            causal_attention_mask=causal_attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        last_hidden_state = encoder_outputs[0]
+        last_hidden_state = self.final_layer_norm(last_hidden_state)
+
+        if self.eos_token_id == 2:
+            # The `eos_token_id` was incorrect before PR #24773: Let's keep what have been done here.
+            # A CLIP model with such `eos_token_id` in the config can't work correctly with extra new tokens added
+            # ------------------------------------------------------------
+            # text_embeds.shape = [batch_size, sequence_length, transformer.width]
+            # take features from the eot embedding (eot_token is the highest number in each sequence)
+            # casting to torch.int for onnx compatibility: argmax doesn't support int64 inputs with opset 14
+            pooled_output = last_hidden_state[
+                torch.arange(last_hidden_state.shape[0], device=last_hidden_state.device),
+                input_ids.to(dtype=torch.int, device=last_hidden_state.device).argmax(dim=-1),
+            ]
+        else:
+            # The config gets updated `eos_token_id` from PR #24773 (so the use of exta new tokens is possible)
+            pooled_output = last_hidden_state[
+                torch.arange(last_hidden_state.shape[0], device=last_hidden_state.device),
+                # We need to get the first position of `eos_token_id` value (`pad_token_ids` might equal to `eos_token_id`)
+                # Note: we assume each sequence (along batch dim.) contains an  `eos_token_id` (e.g. prepared by the tokenizer)
+                (input_ids.to(dtype=torch.int, device=last_hidden_state.device) == self.eos_token_id)
+                .int()
+                .argmax(dim=-1),
+            ]
+
+        if not return_dict:
+            return (last_hidden_state, pooled_output) + encoder_outputs[1:]
+
+        return BaseModelOutputWithPooling(
+            last_hidden_state=last_hidden_state,
+            pooler_output=pooled_output,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+        )
+
+
+class GroupViTTextModel(GroupViTPreTrainedModel):
+    config: GroupViTTextConfig
+
+    def __init__(self, config: GroupViTTextConfig):
+        super().__init__(config)
+        self.text_model = GroupViTTextTransformer(config)
+        self.post_init()  # initialize weights and apply final processing
+
+    def get_input_embeddings(self) -> nn.Module:
+        """Return the token embedding module of the wrapped text transformer."""
+        return self.text_model.embeddings.token_embedding
+
+    def set_input_embeddings(self, value):
+        self.text_model.embeddings.token_embedding = value  # replaces what `get_input_embeddings` returns
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, BaseModelOutputWithPooling]:
+        r"""
+        Examples:
+
+        ```python
+        >>> from transformers import CLIPTokenizer, GroupViTTextModel
+
+        >>> tokenizer = CLIPTokenizer.from_pretrained("nvidia/groupvit-gcc-yfcc")
+        >>> model = GroupViTTextModel.from_pretrained("nvidia/groupvit-gcc-yfcc")
+
+        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
+
+        >>> outputs = model(**inputs)
+        >>> last_hidden_state = outputs.last_hidden_state
+        >>> pooled_output = outputs.pooler_output  # pooled (EOS token) states
+        ```"""
+        return self.text_model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+
+class GroupViTVisionTransformer(nn.Module):
+    def __init__(self, config: GroupViTVisionConfig):
+        super().__init__()
+        self.config = config
+        embed_dim = config.hidden_size
+
+        self.embeddings = GroupViTVisionEmbeddings(config)
+        self.encoder = GroupViTVisionEncoder(config)
+        self.layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
+
+    @auto_docstring
+    def forward(
+        self,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        output_hidden_states: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, BaseModelOutputWithPooling]:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if pixel_values is None:
+            raise ValueError("You have to specify pixel_values")
+
+        hidden_states = self.embeddings(pixel_values)
+
+        encoder_outputs = self.encoder(
+            hidden_states=hidden_states,
+            output_hidden_states=output_hidden_states,
+            output_attentions=output_attentions,
+            return_dict=return_dict,
+        )
+
+        last_hidden_state = encoder_outputs[0]
+
+        # normalize the last hidden state
+        last_hidden_state = self.layernorm(last_hidden_state)
+        pooled_output = last_hidden_state.mean(dim=1)
+
+        if not return_dict:
+            return (last_hidden_state, pooled_output) + encoder_outputs[1:]
+
+        return BaseModelOutputWithPooling(
+            last_hidden_state=last_hidden_state,
+            pooler_output=pooled_output,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+        )
+
+
+class GroupViTVisionModel(GroupViTPreTrainedModel):
+    config: GroupViTVisionConfig
+    main_input_name = "pixel_values"
+
+    def __init__(self, config: GroupViTVisionConfig):
+        super().__init__(config)
+        self.vision_model = GroupViTVisionTransformer(config)
+        self.post_init()  # initialize weights and apply final processing
+
+    def get_input_embeddings(self) -> GroupViTPatchEmbeddings:
+        """Return the patch embedding module of the wrapped vision transformer."""
+        return self.vision_model.embeddings.patch_embeddings
+
+    @auto_docstring
+    def forward(
+        self,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, BaseModelOutputWithPooling]:
+        r"""
+        Examples:
+
+        ```python
+        >>> from PIL import Image
+        >>> import requests
+        >>> from transformers import AutoProcessor, GroupViTVisionModel
+
+        >>> processor = AutoProcessor.from_pretrained("nvidia/groupvit-gcc-yfcc")
+        >>> model = GroupViTVisionModel.from_pretrained("nvidia/groupvit-gcc-yfcc")
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> inputs = processor(images=image, return_tensors="pt")
+
+        >>> outputs = model(**inputs)
+        >>> last_hidden_state = outputs.last_hidden_state
+        >>> pooled_output = outputs.pooler_output  # pooled states (mean of `last_hidden_state` over dim 1)
+        ```"""
+        return self.vision_model(
+            pixel_values=pixel_values,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+
+@auto_docstring
+class GroupViTModel(GroupViTPreTrainedModel):
+    config: GroupViTConfig
+
+    def __init__(self, config: GroupViTConfig):
+        super().__init__(config)
+
+        if not isinstance(config.text_config, GroupViTTextConfig):
+            raise TypeError(
+                "config.text_config is expected to be of type GroupViTTextConfig but is of type"
+                f" {type(config.text_config)}."
+            )
+
+        if not isinstance(config.vision_config, GroupViTVisionConfig):
+            raise TypeError(
+                "config.vision_config is expected to be of type GroupViTVisionConfig but is of type"
+                f" {type(config.vision_config)}."
+            )
+
+        text_config = config.text_config
+        vision_config = config.vision_config
+
+        self.projection_dim = config.projection_dim
+        self.projection_intermediate_dim = config.projection_intermediate_dim
+        self.text_embed_dim = text_config.hidden_size
+        self.vision_embed_dim = vision_config.hidden_size
+
+        self.text_model = GroupViTTextTransformer(text_config)
+        self.vision_model = GroupViTVisionTransformer(vision_config)
+
+        self.visual_projection = nn.Sequential(
+            nn.Linear(self.vision_embed_dim, self.projection_intermediate_dim, bias=True),
+            nn.BatchNorm1d(self.projection_intermediate_dim),
+            nn.ReLU(inplace=True),
+            nn.Linear(self.projection_intermediate_dim, self.projection_dim, bias=True),
+        )
+        self.text_projection = nn.Sequential(
+            nn.Linear(self.text_embed_dim, self.projection_intermediate_dim, bias=True),
+            nn.BatchNorm1d(self.projection_intermediate_dim),
+            nn.ReLU(inplace=True),
+            nn.Linear(self.projection_intermediate_dim, self.projection_dim, bias=True),
+        )
+        self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value))
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def get_text_features(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> torch.FloatTensor:
+        r"""
+        Returns:
+            text_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The text embeddings obtained by
+            applying the projection layer to the pooled output of [`GroupViTTextModel`].
+
+        Examples:
+
+        ```python
+        >>> from transformers import CLIPTokenizer, GroupViTModel
+
+        >>> model = GroupViTModel.from_pretrained("nvidia/groupvit-gcc-yfcc")
+        >>> tokenizer = CLIPTokenizer.from_pretrained("nvidia/groupvit-gcc-yfcc")
+
+        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
+        >>> text_features = model.get_text_features(**inputs)
+        ```"""
+        # Use GROUPVIT model's config for some fields (if specified) instead of those of vision & text components.
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        text_outputs = self.text_model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        pooled_output = text_outputs[1]  # second entry is the pooled output
+        text_features = self.text_projection(pooled_output)
+
+        return text_features
+
+    @auto_docstring
+    def get_image_features(
+        self,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> torch.FloatTensor:
+        r"""
+        Returns:
+            image_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The image embeddings obtained by
+            applying the projection layer to the pooled output of [`GroupViTVisionModel`].
+
+        Examples:
+
+        ```python
+        >>> from PIL import Image
+        >>> import requests
+        >>> from transformers import AutoProcessor, GroupViTModel
+
+        >>> model = GroupViTModel.from_pretrained("nvidia/groupvit-gcc-yfcc")
+        >>> processor = AutoProcessor.from_pretrained("nvidia/groupvit-gcc-yfcc")
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> inputs = processor(images=image, return_tensors="pt")
+
+        >>> image_features = model.get_image_features(**inputs)
+        ```"""
+        # Use GROUPVIT model's config for some fields (if specified) instead of those of vision & text components.
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        vision_outputs = self.vision_model(
+            pixel_values=pixel_values,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        pooled_output = vision_outputs[1]  # second entry is the pooled output
+        image_features = self.visual_projection(pooled_output)
+
+        return image_features
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        return_loss: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        output_segmentation: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, GroupViTModelOutput]:
+        r"""
+        return_loss (`bool`, *optional*):
+            Whether or not to return the contrastive loss.
+        output_segmentation (`bool`, *optional*):
+            Whether or not to return the segmentation logits.
+
+        Examples:
+
+        ```python
+        >>> from PIL import Image
+        >>> import requests
+        >>> from transformers import AutoProcessor, GroupViTModel
+
+        >>> model = GroupViTModel.from_pretrained("nvidia/groupvit-gcc-yfcc")
+        >>> processor = AutoProcessor.from_pretrained("nvidia/groupvit-gcc-yfcc")
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> inputs = processor(
+        ...     text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True
+        ... )
+
+        >>> outputs = model(**inputs)
+        >>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
+        >>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
+        ```"""
+        # Use GROUPVIT model's config for some fields (if specified) instead of those of vision & text components.
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_segmentation = (
+            output_segmentation if output_segmentation is not None else self.config.output_segmentation
+        )
+        if output_segmentation:
+            output_attentions = True  # the segmentation branch below reads the vision attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        vision_outputs = self.vision_model(
+            pixel_values=pixel_values,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        text_outputs = self.text_model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        image_embeds = vision_outputs[1]  # pooled vision output
+        image_embeds = self.visual_projection(image_embeds)
+
+        text_embeds = text_outputs[1]  # pooled text output
+        text_embeds = self.text_projection(text_embeds)
+
+        # normalized features
+        image_embeds = image_embeds / image_embeds.norm(dim=-1, keepdim=True)
+        text_embeds = text_embeds / text_embeds.norm(dim=-1, keepdim=True)
+
+        # cosine similarity as logits
+        logit_scale = self.logit_scale.exp()
+        logits_per_text = torch.matmul(text_embeds, image_embeds.t()) * logit_scale
+        logits_per_image = logits_per_text.t()
+
+        seg_logits = None
+        if output_segmentation:
+            # grouped features
+            # [batch_size_image, num_group, hidden_size]
+            image_group_embeds = vision_outputs[0]
+            # [batch_size_image*num_group, hidden_size]
+            image_group_embeds = self.visual_projection(image_group_embeds.reshape(-1, image_group_embeds.shape[-1]))
+            if output_hidden_states:
+                attentions = vision_outputs[3]  # index 2 holds the hidden states when they are requested
+            else:
+                attentions = vision_outputs[2]
+            # [batch_size_image, num_group, height, width]
+            grouping = get_grouping_from_attentions(attentions, pixel_values.shape[2:])
+
+            # normalized features
+            image_group_embeds = image_group_embeds / image_group_embeds.norm(dim=-1, keepdim=True)
+            # [batch_size_image x num_group, batch_size_text]
+            logits_per_image_group = torch.matmul(image_group_embeds, text_embeds.t()) * logit_scale
+            # [batch_size_image, batch_size_text, num_group]
+            logits_per_image_group = logits_per_image_group.reshape(
+                image_embeds.shape[0], -1, text_embeds.shape[0]
+            ).permute(0, 2, 1)
+
+            # [batch_size_image, num_group, height x width]
+            flatten_grouping = grouping.reshape(grouping.shape[0], grouping.shape[1], -1)
+
+            # [batch_size_image, batch_size_text, height, width]; NOTE(review): 2nd `logit_scale` factor - confirm
+            seg_logits = torch.matmul(logits_per_image_group, flatten_grouping) * logit_scale
+            seg_logits = seg_logits.reshape(
+                seg_logits.shape[0], seg_logits.shape[1], grouping.shape[2], grouping.shape[3]
+            )
+
+        loss = None
+        if return_loss:
+            loss = groupvit_loss(logits_per_text)
+
+        if not return_dict:
+            if seg_logits is not None:
+                output = (
+                    logits_per_image,
+                    logits_per_text,
+                    seg_logits,
+                    text_embeds,
+                    image_embeds,
+                    text_outputs,
+                    vision_outputs,
+                )
+            else:
+                output = (logits_per_image, logits_per_text, text_embeds, image_embeds, text_outputs, vision_outputs)
+            return ((loss,) + output) if loss is not None else output
+
+        return GroupViTModelOutput(
+            loss=loss,
+            logits_per_image=logits_per_image,
+            logits_per_text=logits_per_text,
+            segmentation_logits=seg_logits,
+            text_embeds=text_embeds,
+            image_embeds=image_embeds,
+            text_model_output=text_outputs,
+            vision_model_output=vision_outputs,
+        )
+
+
+__all__ = ["GroupViTModel", "GroupViTPreTrainedModel", "GroupViTTextModel", "GroupViTVisionModel"]
diff --git a/phivenv/Lib/site-packages/transformers/models/groupvit/modeling_tf_groupvit.py b/phivenv/Lib/site-packages/transformers/models/groupvit/modeling_tf_groupvit.py
new file mode 100644
index 0000000000000000000000000000000000000000..19fdfe94e40e00b46ce033a907899bfa937f0dd2
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/groupvit/modeling_tf_groupvit.py
@@ -0,0 +1,2141 @@
+# coding=utf-8
+# Copyright 2022 NVIDIA and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""TF 2.0 GroupViT model."""
+
+from __future__ import annotations
+
+import collections.abc
+import math
+from dataclasses import dataclass
+from typing import Any
+
+import numpy as np
+import tensorflow as tf
+
+from ...activations_tf import get_tf_activation
+from ...modeling_tf_outputs import TFBaseModelOutput, TFBaseModelOutputWithPooling
+from ...modeling_tf_utils import (
+    TFModelInputType,
+    TFPreTrainedModel,
+    get_initializer,
+    keras,
+    keras_serializable,
+    unpack_inputs,
+)
+from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax
+from ...utils import (
+    ModelOutput,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    is_tensorflow_probability_available,
+    logging,
+    replace_return_docstrings,
+)
+from .configuration_groupvit import GroupViTConfig, GroupViTTextConfig, GroupViTVisionConfig
+
+
+logger = logging.get_logger(__name__)
+
+# soft dependency
+if is_tensorflow_probability_available():
+    try:
+        import tensorflow_probability as tfp
+
+        # On the first call, check whether a compatible version of TensorFlow is installed
+        # TensorFlow Probability depends on a recent stable release of TensorFlow
+        _ = tfp.distributions.Normal(loc=0.0, scale=1.0)
+    except ImportError:
+        logger.error(
+            "GroupViT models are not usable since `tensorflow_probability` can't be loaded. "
+            "It seems you have `tensorflow_probability` installed with the wrong tensorflow version."
+            "Please try to reinstall it following the instructions here: https://github.com/tensorflow/probability."
+        )
+else:
+    try:
+        import tensorflow_probability as tfp
+
+        # On the first call, check whether a compatible version of TensorFlow is installed
+        # TensorFlow Probability depends on a recent stable release of TensorFlow
+        _ = tfp.distributions.Normal(loc=0.0, scale=1.0)
+    except ImportError:
+        pass
+
+_CHECKPOINT_FOR_DOC = "nvidia/groupvit-gcc-yfcc"
+
+
+LARGE_NEGATIVE = -1e8
+
+
+# Copied from transformers.models.bart.modeling_tf_bart._expand_mask
+def _expand_mask(mask: tf.Tensor, tgt_len: int | None = None):
+    """
+    Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
+    """
+    src_len = shape_list(mask)[1]
+    tgt_len = tgt_len if tgt_len is not None else src_len
+    one_cst = tf.constant(1.0)
+    mask = tf.cast(mask, dtype=one_cst.dtype)
+    expanded_mask = tf.tile(mask[:, None, None, :], (1, 1, tgt_len, 1))
+
+    return (one_cst - expanded_mask) * LARGE_NEGATIVE
+
+
+# contrastive loss function, adapted from
+# https://sachinruk.github.io/blog/pytorch/pytorch%20lightning/loss%20function/gpu/2021/03/07/CLIP.html
+def contrastive_loss(logits: tf.Tensor) -> tf.Tensor:
+    return tf.math.reduce_mean(
+        keras.metrics.sparse_categorical_crossentropy(
+            y_true=tf.range(shape_list(logits)[0]), y_pred=logits, from_logits=True
+        )
+    )
+
+
+# Copied from transformers.models.clip.modeling_tf_clip.clip_loss with clip->groupvit
+def groupvit_loss(similarity: tf.Tensor) -> tf.Tensor:
+    caption_loss = contrastive_loss(similarity)
+    image_loss = contrastive_loss(tf.transpose(similarity))
+    return (caption_loss + image_loss) / 2.0
+
+
+def hard_softmax(logits: tf.Tensor, dim: int) -> tf.Tensor:
+    y_soft = stable_softmax(logits, dim)
+    # Straight through.
+    index = tf.argmax(y_soft, dim)
+    y_hard = tf.one_hot(
+        index,
+        depth=shape_list(logits)[dim],
+        # TensorFlow expects axis to be -1 or between [0, 3).  But received: -2
+        # This is why the following code snippet is used.
+        axis=range(len(shape_list(logits)))[dim],
+        dtype=y_soft.dtype,
+    )
+    ret = y_hard - tf.stop_gradient(y_soft) + y_soft
+
+    return ret
+
+
+def gumbel_softmax(logits: tf.Tensor, tau: float = 1, hard: bool = False, dim: int = -1) -> tf.Tensor:
+    gumbel_dist = tfp.distributions.Gumbel(0.0, 1.0)
+    gumbels = gumbel_dist.sample(tf.shape(logits), dtype=logits.dtype)
+
+    gumbels = (logits + gumbels) / tau  # ~Gumbel(logits,tau)
+    y_soft = stable_softmax(gumbels, dim)
+
+    if hard:
+        # Straight through.
+        index = tf.argmax(y_soft, dim)
+        y_hard = tf.one_hot(
+            index,
+            depth=shape_list(logits)[dim],
+            # TensorFlow expects axis to be -1 or between [0, 3).  But received: -2
+            # This is why the following code snippet is used.
+            axis=range(len(shape_list(logits)))[dim],
+            dtype=y_soft.dtype,
+        )
+        ret = y_hard - tf.stop_gradient(y_soft) + y_soft
+    else:
+        # Reparametrization trick.
+        ret = y_soft
+    return ret
+
+
+def resize_attention_map(attentions: tf.Tensor, height: int, width: int, align_corners: bool = False) -> tf.Tensor:
+    """
+    Args:
+        attentions (`tf.Tensor`): attention map of shape [batch_size, groups, feat_height*feat_width]
+        height (`int`): height of the output attention map
+        width (`int`): width of the output attention map
+        align_corners (`bool`, *optional*): the `align_corner` argument for `nn.functional.interpolate`.
+
+    Returns:
+        `tf.Tensor`: resized attention map of shape [batch_size, groups, height, width]
+    """
+
+    scale = (height * width // attentions.shape[2]) ** 0.5
+    if height > width:
+        feat_width = int(np.round(width / scale))
+        feat_height = shape_list(attentions)[2] // feat_width
+    else:
+        feat_height = int(np.round(height / scale))
+        feat_width = shape_list(attentions)[2] // feat_height
+
+    batch_size = shape_list(attentions)[0]
+    groups = shape_list(attentions)[1]  # number of group token
+    # [batch_size, groups, height x width, groups] -> [batch_size, groups, height, width]
+    attentions = tf.reshape(attentions, (batch_size, groups, feat_height, feat_width))
+    attentions = tf.transpose(attentions, perm=(0, 2, 3, 1))
+    if align_corners:
+        attentions = tf.compat.v1.image.resize(
+            attentions,
+            size=(height, width),
+            method="bilinear",
+            align_corners=align_corners,
+        )
+    else:
+        attentions = tf.image.resize(attentions, size=(height, width), method="bilinear")
+    attentions = tf.transpose(attentions, perm=(0, 3, 1, 2))
+    return attentions
+
+
+def get_grouping_from_attentions(attentions: tuple[tf.Tensor], hw_shape: tuple[int]) -> tf.Tensor:
+    """
+    Args:
+        attentions (`tuple(tf.Tensor)`: tuple of attention maps returned by `TFGroupViTVisionTransformer`
+        hw_shape (`tuple(int)`): height and width of the output attention map
+    Returns:
+        `tf.Tensor`: the attention map of shape [batch_size, groups, height, width]
+    """
+
+    attn_maps = []
+    prev_attn_masks = None
+    for attn_masks in attentions:
+        # [batch_size, num_groups, height x width] -> [batch_size, height x width, num_groups]
+        attn_masks = tf.transpose(attn_masks, perm=(0, 2, 1))
+        if prev_attn_masks is None:
+            prev_attn_masks = attn_masks
+        else:
+            prev_attn_masks = tf.matmul(prev_attn_masks, attn_masks)
+        # [batch_size, height x width, num_groups] -> [batch_size, num_groups, height x width] -> [batch_size, num_groups, height, width]
+        cur_attn_map = resize_attention_map(tf.transpose(prev_attn_masks, perm=(0, 2, 1)), *hw_shape)
+        attn_maps.append(cur_attn_map)
+
+    # [batch_size, num_groups, height, width]
+    final_grouping = attn_maps[-1]
+
+    return tf.stop_gradient(final_grouping)
+
+
+@dataclass
+class TFGroupViTModelOutput(ModelOutput):
+    """
+    Args:
+        loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
+            Contrastive loss for image-text similarity.
+        logits_per_image (`tf.Tensor` of shape `(image_batch_size, text_batch_size)`):
+            The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
+            similarity scores.
+        logits_per_text (`tf.Tensor` of shape `(text_batch_size, image_batch_size)`):
+            The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
+            similarity scores.
+        segmentation_logits (`tf.Tensor` of shape `(batch_size, config.num_labels, logits_height, logits_width)`):
+            Classification scores for each pixel.
+
+            <Tip warning={true}>
+
+            The logits returned do not necessarily have the same size as the `pixel_values` passed as inputs. This is
+            to avoid doing two interpolations and lose some quality when a user needs to resize the logits to the
+            original image size as post-processing. You should always check your logits shape and resize as needed.
+
+            </Tip>
+
+        text_embeds (`tf.Tensor` of shape `(batch_size, output_dim`):
+            The text embeddings obtained by applying the projection layer to the pooled output of
+            [`TFGroupViTTextModel`].
+        image_embeds (`tf.Tensor` of shape `(batch_size, output_dim`):
+            The image embeddings obtained by applying the projection layer to the pooled output of
+            [`TFGroupViTVisionModel`].
+        text_model_output (`TFBaseModelOutputWithPooling`):
+            The output of the [`TFGroupViTTextModel`].
+        vision_model_output (`TFBaseModelOutputWithPooling`):
+            The output of the [`TFGroupViTVisionModel`].
+    """
+
+    loss: tf.Tensor | None = None
+    logits_per_image: tf.Tensor | None = None
+    logits_per_text: tf.Tensor | None = None
+    segmentation_logits: tf.Tensor | None = None
+    text_embeds: tf.Tensor | None = None
+    image_embeds: tf.Tensor | None = None
+    text_model_output: TFBaseModelOutputWithPooling = None
+    vision_model_output: TFBaseModelOutputWithPooling = None
+
+    def to_tuple(self) -> tuple[Any]:
+        return tuple(
+            self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple()
+            for k in self.keys()
+        )
+
+
+class TFGroupViTCrossAttentionLayer(keras.layers.Layer):
+    def __init__(self, config: GroupViTVisionConfig, **kwargs):
+        super().__init__(**kwargs)
+        self.attn = TFGroupViTAttention(config, name="attn")
+        self.norm2 = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="norm2")
+        self.mlp = TFGroupViTMLP(config, name="mlp")
+        self.norm_post = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="norm_post")
+        self.config = config
+
+    def call(self, query: tf.Tensor, key: tf.Tensor, training: bool = False) -> tf.Tensor:
+        x = query
+        x = x + self.attn(query, encoder_hidden_states=key)[0]
+        x = x + self.mlp(self.norm2(x))
+        x = self.norm_post(x)
+        return x
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "attn", None) is not None:
+            with tf.name_scope(self.attn.name):
+                self.attn.build(None)
+        if getattr(self, "norm2", None) is not None:
+            with tf.name_scope(self.norm2.name):
+                self.norm2.build([None, None, self.config.hidden_size])
+        if getattr(self, "mlp", None) is not None:
+            with tf.name_scope(self.mlp.name):
+                self.mlp.build(None)
+        if getattr(self, "norm_post", None) is not None:
+            with tf.name_scope(self.norm_post.name):
+                self.norm_post.build([None, None, self.config.hidden_size])
+
+
+class TFGroupViTAssignAttention(keras.layers.Layer):
+    def __init__(self, config: GroupViTVisionConfig, **kwargs):
+        super().__init__(**kwargs)
+        self.scale = config.hidden_size**-0.5
+
+        self.q_proj = keras.layers.Dense(config.hidden_size, name="q_proj")
+        self.k_proj = keras.layers.Dense(config.hidden_size, name="k_proj")
+        self.v_proj = keras.layers.Dense(config.hidden_size, name="v_proj")
+        self.proj = keras.layers.Dense(config.hidden_size, name="proj")
+        self.assign_eps = config.assign_eps
+        self.config = config
+
+    def get_attn(self, attn: tf.Tensor, gumbel: bool = True, hard: bool = True, training: bool = False) -> tf.Tensor:
+        if gumbel and training:
+            attn = gumbel_softmax(attn, dim=-2, hard=hard)
+        else:
+            if hard:
+                attn = hard_softmax(attn, dim=-2)
+            else:
+                attn = stable_softmax(attn, axis=-2)
+
+        return attn
+
+    def call(self, query: tf.Tensor, key: tf.Tensor, training: bool = False):
+        value = key
+        # [batch_size, query_length, channels]
+        query = self.q_proj(query)
+
+        # [batch_size, key_length, channels]
+        key = self.k_proj(key)
+
+        # [batch_size, key_length, channels]
+        value = self.v_proj(value)
+
+        # [batch_size, query_length, key_length]
+        raw_attn = tf.matmul(query, key, transpose_b=True) * self.scale
+
+        attn = self.get_attn(raw_attn, training=training)
+        soft_attn = self.get_attn(raw_attn, training=training, gumbel=False, hard=False)
+
+        attn = attn / (tf.math.reduce_sum(attn, axis=-1, keepdims=True) + self.assign_eps)
+
+        out = tf.matmul(attn, value)
+
+        out = self.proj(out)
+
+        return out, soft_attn
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "q_proj", None) is not None:
+            with tf.name_scope(self.q_proj.name):
+                self.q_proj.build([None, None, self.config.hidden_size])
+        if getattr(self, "k_proj", None) is not None:
+            with tf.name_scope(self.k_proj.name):
+                self.k_proj.build([None, None, self.config.hidden_size])
+        if getattr(self, "v_proj", None) is not None:
+            with tf.name_scope(self.v_proj.name):
+                self.v_proj.build([None, None, self.config.hidden_size])
+        if getattr(self, "proj", None) is not None:
+            with tf.name_scope(self.proj.name):
+                self.proj.build([None, None, self.config.hidden_size])
+
+
+class TFGroupViTTokenAssign(keras.layers.Layer):
+    def __init__(self, config: GroupViTVisionConfig, num_group_token: int, num_output_group: int, **kwargs):
+        super().__init__(**kwargs)
+        self.num_output_group = num_output_group
+        # norm on group_tokens
+        self.norm_tokens = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="norm_tokens")
+        assign_mlp_ratio = (
+            config.assign_mlp_ratio
+            if isinstance(config.assign_mlp_ratio, collections.abc.Iterable)
+            else (config.assign_mlp_ratio, config.assign_mlp_ratio)
+        )
+        tokens_dim, channels_dim = [int(x * config.hidden_size) for x in assign_mlp_ratio]
+        self.mlp_inter = TFGroupViTMixerMLP(config, num_group_token, tokens_dim, num_output_group, name="mlp_inter")
+        self.norm_post_tokens = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="norm_post_tokens")
+        # norm on x
+        self.norm_x = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="norm_x")
+        self.pre_assign_attn = TFGroupViTCrossAttentionLayer(config, name="pre_assign_attn")
+
+        self.assign = TFGroupViTAssignAttention(config, name="assign")
+        self.norm_new_x = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="norm_new_x")
+        self.mlp_channels = TFGroupViTMLP(
+            config, config.hidden_size, channels_dim, config.hidden_size, name="mlp_channels"
+        )
+        self.config = config
+
+    def project_group_token(self, group_tokens: tf.Tensor) -> tf.Tensor:
+        """
+        Args:
+            group_tokens (tf.Tensor): group tokens, [batch_size, num_group_tokens, channels]
+
+        Returns:
+            projected_group_tokens (tf.Tensor): [batch_size, num_output_groups, channels]
+        """
+        # [B, num_output_groups, C] <- [B, num_group_tokens, C]
+        projected_group_tokens = self.mlp_inter(group_tokens)
+        projected_group_tokens = self.norm_post_tokens(projected_group_tokens)
+        return projected_group_tokens
+
+    def call(self, image_tokens: tf.Tensor, group_tokens: tf.Tensor, training: bool = False):
+        """
+        Args:
+            image_tokens (`tf.Tensor`): image tokens, of shape [batch_size, input_length, channels]
+            group_tokens (`tf.Tensor`): group tokens, [batch_size, num_group_tokens, channels]
+        """
+
+        group_tokens = self.norm_tokens(group_tokens)
+        image_tokens = self.norm_x(image_tokens)
+        # [batch_size, num_output_groups, channels]
+        projected_group_tokens = self.project_group_token(group_tokens)
+        projected_group_tokens = self.pre_assign_attn(projected_group_tokens, image_tokens)
+        new_image_tokens, attention = self.assign(projected_group_tokens, image_tokens)
+        new_image_tokens += projected_group_tokens
+
+        new_image_tokens = new_image_tokens + self.mlp_channels(self.norm_new_x(new_image_tokens))
+
+        return new_image_tokens, attention
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "norm_tokens", None) is not None:
+            with tf.name_scope(self.norm_tokens.name):
+                self.norm_tokens.build([None, None, self.config.hidden_size])
+        if getattr(self, "mlp_inter", None) is not None:
+            with tf.name_scope(self.mlp_inter.name):
+                self.mlp_inter.build(None)
+        if getattr(self, "norm_post_tokens", None) is not None:
+            with tf.name_scope(self.norm_post_tokens.name):
+                self.norm_post_tokens.build([None, None, self.config.hidden_size])
+        if getattr(self, "norm_x", None) is not None:
+            with tf.name_scope(self.norm_x.name):
+                self.norm_x.build([None, None, self.config.hidden_size])
+        if getattr(self, "pre_assign_attn", None) is not None:
+            with tf.name_scope(self.pre_assign_attn.name):
+                self.pre_assign_attn.build(None)
+        if getattr(self, "assign", None) is not None:
+            with tf.name_scope(self.assign.name):
+                self.assign.build(None)
+        if getattr(self, "norm_new_x", None) is not None:
+            with tf.name_scope(self.norm_new_x.name):
+                self.norm_new_x.build([None, None, self.config.hidden_size])
+        if getattr(self, "mlp_channels", None) is not None:
+            with tf.name_scope(self.mlp_channels.name):
+                self.mlp_channels.build(None)
+
+
+# Adapted from transformers.models.vit.modeling_tf_vit.TFViTPatchEmbeddings with ViT->GroupViT
+class TFGroupViTPatchEmbeddings(keras.layers.Layer):
+    """
+    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
+    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
+    Transformer.
+    """
+
+    def __init__(self, config: GroupViTConfig, **kwargs):
+        super().__init__(**kwargs)
+        image_size, patch_size = config.image_size, config.patch_size
+        num_channels = config.num_channels
+        # hidden_size is a member as it will be required in the call method
+        self.hidden_size = config.hidden_size
+
+        image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
+        patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
+        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
+        self.image_size = image_size
+        self.patch_size = patch_size
+        self.num_patches = num_patches
+        self.num_channels = num_channels
+        self.config = config
+
+        self.projection = keras.layers.Conv2D(
+            filters=self.hidden_size,
+            kernel_size=patch_size,
+            strides=patch_size,
+            padding="valid",
+            data_format="channels_last",
+            use_bias=True,
+            kernel_initializer=get_initializer(self.config.initializer_range),
+            bias_initializer="zeros",
+            name="projection",
+        )
+
+    def call(
+        self, pixel_values: tf.Tensor, interpolate_pos_encoding: bool = False, training: bool = False
+    ) -> tf.Tensor:
+        batch_size, num_channels, height, width = shape_list(pixel_values)
+        if tf.executing_eagerly() and num_channels != self.num_channels:
+            raise ValueError(
+                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
+            )
+        if (
+            not interpolate_pos_encoding
+            and tf.executing_eagerly()
+            and (height != self.image_size[0] or width != self.image_size[1])
+        ):
+            raise ValueError(
+                f"Input image size ({height}*{width}) doesn't match model ({self.image_size[0]}*{self.image_size[1]})."
+            )
+
+        # When running on CPU, `keras.layers.Conv2D` doesn't support `NCHW` format.
+        # So change the input format from `NCHW` to `NHWC`.
+        # shape = (batch_size, in_height, in_width, in_channels=num_channels)
+        pixel_values = tf.transpose(pixel_values, perm=(0, 2, 3, 1))
+
+        projection = self.projection(pixel_values)
+
+        # Change the 2D spatial dimensions to a single temporal dimension.
+        # shape = (batch_size, num_patches, out_channels=embed_dim)
+        num_patches = (width // self.patch_size[1]) * (height // self.patch_size[0])
+        # In the TFGroupViTVisionEmbeddings the embeddings from this layer will be layer normalized
+        # LayerNormalization layer needs to have static last dimension (otherwise the test_keras_save_load fails with symbolic tensors)
+        # This is why we have used the hidden_size in the reshape method
+        embeddings = tf.reshape(tensor=projection, shape=(batch_size, num_patches, self.hidden_size))
+
+        return embeddings
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "projection", None) is not None:
+            with tf.name_scope(self.projection.name):
+                self.projection.build([None, None, None, self.num_channels])
+
+
+# Adapted from transformers.models.vit.modeling_tf_vit.TFViTEmbeddings
+class TFGroupViTVisionEmbeddings(keras.layers.Layer):
+    """
+    Construct the position and patch embeddings.
+
+    """
+
+    def __init__(self, config: GroupViTVisionConfig, **kwargs):
+        super().__init__(**kwargs)
+
+        self.patch_embeddings = TFGroupViTPatchEmbeddings(config, name="patch_embeddings")
+        self.dropout = keras.layers.Dropout(rate=config.dropout, name="dropout")
+        self.layernorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm")
+        self.config = config
+
+    def build(self, input_shape=None):
+        num_patches = self.patch_embeddings.num_patches
+        self.position_embeddings = self.add_weight(
+            shape=(1, num_patches, self.config.hidden_size),
+            initializer="zeros",
+            trainable=True,
+            name="position_embeddings",
+        )
+
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "patch_embeddings", None) is not None:
+            with tf.name_scope(self.patch_embeddings.name):
+                self.patch_embeddings.build(None)
+        if getattr(self, "dropout", None) is not None:
+            with tf.name_scope(self.dropout.name):
+                self.dropout.build(None)
+        if getattr(self, "layernorm", None) is not None:
+            with tf.name_scope(self.layernorm.name):
+                self.layernorm.build([None, None, self.config.hidden_size])
+
+    def interpolate_pos_encoding(self, embeddings, height, width) -> tf.Tensor:
+        """
+        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher
+        resolution images.
+
+        Source:
+        https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174
+        """
+
+        batch_size, num_patches, dim = shape_list(embeddings)
+        num_positions = shape_list(self.position_embeddings)[1]
+
+        if num_patches == num_positions and height == width:
+            return self.position_embeddings
+        patch_pos_embed = self.position_embeddings
+        h0 = height // self.config.patch_size
+        w0 = width // self.config.patch_size
+        patch_pos_embed = tf.image.resize(
+            images=tf.reshape(
+                patch_pos_embed, shape=(1, int(math.sqrt(num_positions)), int(math.sqrt(num_positions)), dim)
+            ),
+            size=(h0, w0),
+            method="bicubic",
+        )
+        patch_pos_embed = tf.reshape(tensor=patch_pos_embed, shape=(1, -1, dim))
+        return patch_pos_embed
+
+    def call(
+        self, pixel_values: tf.Tensor, interpolate_pos_encoding: bool = False, training: bool = False
+    ) -> tf.Tensor:
+        _, _, height, width = shape_list(pixel_values)
+        embeddings = self.patch_embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding)
+        embeddings = self.layernorm(embeddings)
+
+        # add positional encoding to each token
+        if interpolate_pos_encoding:
+            embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
+        else:
+            embeddings = embeddings + self.position_embeddings
+
+        embeddings = self.dropout(embeddings)
+
+        return embeddings
+
+
+# Copied from transformers.models.clip.modeling_tf_clip.TFCLIPTextEmbeddings with CLIP->GroupViT
+class TFGroupViTTextEmbeddings(keras.layers.Layer):
+    def __init__(self, config: GroupViTTextConfig, **kwargs):
+        super().__init__(**kwargs)
+
+        self.embed_dim = config.hidden_size
+
+        self.config = config
+
+    def build(self, input_shape: tf.TensorShape = None):
+        with tf.name_scope("token_embedding"):
+            self.weight = self.add_weight(
+                shape=(self.config.vocab_size, self.embed_dim),
+                initializer=get_initializer(self.config.initializer_factor * self.config.initializer_range),
+                trainable=True,
+                name="weight",
+            )
+
+        with tf.name_scope("position_embedding"):
+            self.position_embedding = self.add_weight(
+                shape=(self.config.max_position_embeddings, self.embed_dim),
+                initializer=get_initializer(self.config.initializer_factor * self.config.initializer_range),
+                trainable=True,
+                name="embeddings",
+            )
+
+        super().build(input_shape)
+
+    def call(
+        self,
+        input_ids: tf.Tensor | None = None,
+        position_ids: tf.Tensor | None = None,
+        inputs_embeds: tf.Tensor | None = None,
+    ) -> tf.Tensor:
+        """
+        Applies embedding based on inputs tensor.
+
+        Returns:
+            final_embeddings (`tf.Tensor`): output embedding tensor.
+        """
+        if input_ids is None and inputs_embeds is None:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+        if inputs_embeds is None:
+            check_embeddings_within_bounds(input_ids, self.config.vocab_size)
+            inputs_embeds = tf.gather(params=self.weight, indices=input_ids)
+
+        input_shape = shape_list(inputs_embeds)[:-1]
+
+        if position_ids is None:
+            position_ids = tf.expand_dims(tf.range(start=0, limit=input_shape[-1]), axis=0)
+
+        position_embeds = tf.gather(params=self.position_embedding, indices=position_ids)
+        position_embeds = tf.tile(input=position_embeds, multiples=(input_shape[0], 1, 1))
+        final_embeddings = inputs_embeds + position_embeds
+
+        return final_embeddings
+
+
+class TFGroupViTStage(keras.layers.Layer):
+    """This corresponds to the `GroupingLayer` class in the GroupViT implementation."""
+
+    def __init__(
+        self,
+        config: GroupViTVisionConfig,
+        depth: int,
+        num_prev_group_token: int,
+        num_group_token: int,
+        num_output_group: int,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.config = config
+        self.depth = depth
+        self.num_group_token = num_group_token
+        self.layers = [TFGroupViTEncoderLayer(config, name=f"layers_._{i}") for i in range(depth)]
+
+        if num_group_token > 0:
+            self.downsample = TFGroupViTTokenAssign(
+                config=config,
+                num_group_token=num_group_token,
+                num_output_group=num_output_group,
+                name="downsample",
+            )
+        else:
+            self.downsample = None
+
+        if num_prev_group_token > 0 and num_group_token > 0:
+            self.group_projector = [
+                keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="group_projector.0"),
+                TFGroupViTMixerMLP(
+                    config, num_prev_group_token, config.hidden_size // 2, num_group_token, name="group_projector.1"
+                ),
+            ]
+        else:
+            self.group_projector = None
+
+    def build(self, input_shape=None):
+        if self.num_group_token > 0:
+            self.group_token = self.add_weight(
+                shape=(1, self.num_group_token, self.config.hidden_size),
+                initializer="zeros",
+                trainable=True,
+                name="group_token",
+            )
+        else:
+            self.group_token = None
+
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "downsample", None) is not None:
+            with tf.name_scope(self.downsample.name):
+                self.downsample.build(None)
+        if getattr(self, "layers", None) is not None:
+            for layer in self.layers:
+                with tf.name_scope(layer.name):
+                    layer.build(None)
+        if getattr(self, "group_projector", None) is not None:
+            with tf.name_scope(self.group_projector[0].name):
+                self.group_projector[0].build([None, None, self.config.hidden_size])
+            with tf.name_scope(self.group_projector[1].name):
+                self.group_projector[1].build(None)
+
+    @property
+    def with_group_token(self):
+        return self.group_token is not None
+
+    def split_x(self, x: tf.Tensor) -> tf.Tensor:
+        if self.with_group_token:
+            return x[:, : -self.num_group_token], x[:, -self.num_group_token :]
+        else:
+            return x, None
+
+    def concat_x(self, x: tf.Tensor, group_token: tf.Tensor | None = None) -> tf.Tensor:
+        if group_token is None:
+            return x
+        return tf.concat([x, group_token], axis=1)
+
+    def call(
+        self,
+        hidden_states: tf.Tensor,
+        prev_group_token: tf.Tensor | None = None,
+        output_attentions: bool = False,
+        training: bool = False,
+    ) -> tuple[tf.Tensor]:
+        """
+        Args:
+            hidden_states (`tf.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+            attention_mask (`tf.Tensor`): attention mask of size
+                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+                `(config.encoder_attention_heads,)`.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the grouping tensors of Grouping block.
+        """
+        if self.with_group_token:
+            group_token = tf.tile(self.group_token, multiples=(shape_list(hidden_states)[0], 1, 1))
+            if self.group_projector is not None:
+                for layer in self.group_projector:
+                    prev_group_token = layer(prev_group_token)
+                group_token = group_token + prev_group_token
+        else:
+            group_token = None
+
+        x = hidden_states
+
+        cat_x = self.concat_x(x, group_token)
+        for layer in self.layers:
+            layer_out = layer(
+                cat_x,
+                attention_mask=None,
+                causal_attention_mask=None,
+                output_attentions=None,
+            )
+            cat_x = layer_out[0]
+
+        x, group_token = self.split_x(cat_x)
+
+        attention = None
+        if self.downsample is not None:
+            x, attention = self.downsample(x, group_token)
+
+        outputs = (x, group_token)
+        if output_attentions:
+            outputs = outputs + (attention,)
+
+        return outputs
+
+
+class TFGroupViTMLP(keras.layers.Layer):
+    def __init__(
+        self,
+        config: GroupViTVisionConfig,
+        hidden_size: int | None = None,
+        intermediate_size: int | None = None,
+        output_size: int | None = None,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.config = config
+        self.activation_fn = get_tf_activation(config.hidden_act)
+        hidden_size = hidden_size if hidden_size is not None else config.hidden_size
+        intermediate_size = intermediate_size if intermediate_size is not None else config.intermediate_size
+        output_size = output_size if output_size is not None else hidden_size
+        self.fc1 = keras.layers.Dense(intermediate_size, name="fc1")
+        self.fc2 = keras.layers.Dense(output_size, name="fc2")
+        self.intermediate_size = intermediate_size
+        self.hidden_size = hidden_size
+
+    def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor:
+        hidden_states = self.fc1(hidden_states)
+        hidden_states = self.activation_fn(hidden_states)
+        hidden_states = self.fc2(hidden_states)
+        return hidden_states
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "fc1", None) is not None:
+            with tf.name_scope(self.fc1.name):
+                self.fc1.build([None, None, self.hidden_size])
+        if getattr(self, "fc2", None) is not None:
+            with tf.name_scope(self.fc2.name):
+                self.fc2.build([None, None, self.intermediate_size])
+
+
+class TFGroupViTMixerMLP(TFGroupViTMLP):
+    def call(self, x, training: bool = False):
+        x = super().call(hidden_states=tf.transpose(x, perm=(0, 2, 1)))
+        return tf.transpose(x, perm=(0, 2, 1))
+
+
+# Adapted from transformers.models.clip.modeling_tf_clip.TFCLIPAttention
+class TFGroupViTAttention(keras.layers.Layer):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+    def __init__(self, config: GroupViTConfig, **kwargs):
+        super().__init__(**kwargs)
+
+        self.embed_dim = config.hidden_size
+        self.num_attention_heads = config.num_attention_heads
+        # Per-head width; the check below rejects configs where the heads do not tile `embed_dim` exactly.
+        self.attention_head_size = self.embed_dim // self.num_attention_heads
+        if self.attention_head_size * self.num_attention_heads != self.embed_dim:
+            raise ValueError(
+                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
+                f" {self.num_attention_heads})."
+            )
+
+        # Standard deviations handed to `get_initializer` for the projection kernels.
+        factor = config.initializer_factor
+        in_proj_std = (self.embed_dim**-0.5) * ((2 * config.num_hidden_layers) ** -0.5) * factor
+        out_proj_std = (self.embed_dim**-0.5) * factor
+
+        # Divisor applied to the raw attention scores in `call()`.
+        self.sqrt_att_head_size = math.sqrt(self.attention_head_size)
+
+        self.q_proj = keras.layers.Dense(
+            units=self.embed_dim, kernel_initializer=get_initializer(in_proj_std), name="q_proj"
+        )
+        self.k_proj = keras.layers.Dense(
+            units=self.embed_dim, kernel_initializer=get_initializer(in_proj_std), name="k_proj"
+        )
+        self.v_proj = keras.layers.Dense(
+            units=self.embed_dim, kernel_initializer=get_initializer(in_proj_std), name="v_proj"
+        )
+
+        self.dropout = keras.layers.Dropout(rate=config.attention_dropout)
+
+        self.out_proj = keras.layers.Dense(
+            units=self.embed_dim, kernel_initializer=get_initializer(out_proj_std), name="out_proj"
+        )
+
+    # Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfAttention.transpose_for_scores
+    def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor:
+        # Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size]
+        tensor = tf.reshape(tensor=tensor, shape=(batch_size, -1, self.num_attention_heads, self.attention_head_size))
+
+        # Transpose the tensor from [batch_size, seq_length, num_attention_heads, attention_head_size] to [batch_size, num_attention_heads, seq_length, attention_head_size]
+        return tf.transpose(tensor, perm=[0, 2, 1, 3])
+
+    def call(
+        self,
+        hidden_states: tf.Tensor,
+        attention_mask: tf.Tensor | None = None,
+        causal_attention_mask: tf.Tensor | None = None,
+        output_attentions: bool | None = None,
+        encoder_hidden_states: tf.Tensor | None = None,
+        training: bool = False,
+    ) -> tuple[tf.Tensor]:
+        """Input shape: Batch x Time x Channel"""
+
+        batch_size = shape_list(hidden_states)[0]
+        # Passing `encoder_hidden_states` switches to cross-attention: keys and values are computed from it,
+        # while queries always come from `hidden_states`.
+        is_cross_attention = encoder_hidden_states is not None
+
+        mixed_query_layer = self.q_proj(inputs=hidden_states)
+        if is_cross_attention:
+            mixed_key_layer = self.k_proj(inputs=encoder_hidden_states)
+            mixed_value_layer = self.v_proj(inputs=encoder_hidden_states)
+        else:
+            mixed_key_layer = self.k_proj(inputs=hidden_states)
+            mixed_value_layer = self.v_proj(inputs=hidden_states)
+
+        # Split the projections into heads.
+        query_layer = self.transpose_for_scores(mixed_query_layer, batch_size)
+        key_layer = self.transpose_for_scores(mixed_key_layer, batch_size)
+        value_layer = self.transpose_for_scores(mixed_value_layer, batch_size)
+
+        # Take the dot product between "query" and "key" to get the raw attention scores.
+        # (batch size, num_heads, seq_len_q, seq_len_k)
+        attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)
+        # Scale by sqrt(head size), cast to the score dtype first.
+        dk = tf.cast(self.sqrt_att_head_size, dtype=attention_scores.dtype)
+        attention_scores = tf.divide(attention_scores, dk)
+
+        # Both masks are additive and are applied before the softmax; the causal mask goes first.
+        if causal_attention_mask is not None:
+            # Apply the causal attention mask (built once per forward pass in TFGroupViTTextTransformer.call())
+            attention_scores = tf.add(attention_scores, causal_attention_mask)
+
+        if attention_mask is not None:
+            # Apply the attention mask (expanded once per forward pass in TFGroupViTTextTransformer.call())
+            attention_scores = tf.add(attention_scores, attention_mask)
+
+        # Normalize the attention scores to probabilities.
+        _attention_probs = stable_softmax(logits=attention_scores, axis=-1)
+
+        # This is actually dropping out entire tokens to attend to, which might
+        # seem a bit unusual, but is taken from the original Transformer paper.
+        attention_probs = self.dropout(inputs=_attention_probs)
+
+        # Weighted sum of the values, then move the head axis back next to the feature axis.
+        attention_output = tf.matmul(attention_probs, value_layer)
+        attention_output = tf.transpose(attention_output, perm=[0, 2, 1, 3])
+
+        # (batch_size, seq_len_q, embed_dim)
+        attention_output = tf.reshape(tensor=attention_output, shape=(batch_size, -1, self.embed_dim))
+
+        attention_output = self.out_proj(attention_output)
+        # In TFBert, attention weights are returned after dropout.
+        # However, in CLIP, they are returned before dropout.
+        # The same is done here: `_attention_probs` is the pre-dropout tensor.
+        outputs = (attention_output, _attention_probs) if output_attentions else (attention_output,)
+
+        return outputs
+
+    def build(self, input_shape=None):
+        # Build the four projections explicitly, each under its own name scope; a no-op once built.
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "q_proj", None) is not None:
+            with tf.name_scope(self.q_proj.name):
+                self.q_proj.build([None, None, self.embed_dim])
+        if getattr(self, "k_proj", None) is not None:
+            with tf.name_scope(self.k_proj.name):
+                self.k_proj.build([None, None, self.embed_dim])
+        if getattr(self, "v_proj", None) is not None:
+            with tf.name_scope(self.v_proj.name):
+                self.v_proj.build([None, None, self.embed_dim])
+        if getattr(self, "out_proj", None) is not None:
+            with tf.name_scope(self.out_proj.name):
+                self.out_proj.build([None, None, self.embed_dim])
+
+
+# Copied from transformers.models.clip.modeling_tf_clip.TFCLIPEncoderLayer with CLIP->GroupViT
+class TFGroupViTEncoderLayer(keras.layers.Layer):
+    def __init__(self, config: GroupViTConfig, **kwargs):
+        super().__init__(**kwargs)
+
+        self.embed_dim = config.hidden_size
+        self.self_attn = TFGroupViTAttention(config, name="self_attn")
+        self.layer_norm1 = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm1")
+        self.mlp = TFGroupViTMLP(config, name="mlp")
+        self.layer_norm2 = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm2")
+
+    def call(
+        self,
+        hidden_states: tf.Tensor,
+        attention_mask: tf.Tensor,
+        causal_attention_mask: tf.Tensor,
+        output_attentions: bool,
+        training: bool = False,
+    ) -> tuple[tf.Tensor]:
+        """
+        Args:
+            hidden_states (`tf.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+            attention_mask (`tf.Tensor`): attention mask of size
+                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+            causal_attention_mask (`tf.Tensor`): causal attention mask of size
+                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+            output_attentions (`bool`):
+                Whether or not to return the attentions tensors of all attention layers. See `outputs` under returned
+                tensors for more detail.
+        """
+        residual = hidden_states
+
+        hidden_states = self.layer_norm1(inputs=hidden_states)
+        attention_outputs = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            causal_attention_mask=causal_attention_mask,
+            output_attentions=output_attentions,
+            training=training,
+        )
+        hidden_states = attention_outputs[0]
+        hidden_states = residual + hidden_states
+
+        residual = hidden_states
+        hidden_states = self.layer_norm2(inputs=hidden_states)
+        hidden_states = self.mlp(hidden_states=hidden_states)
+        hidden_states = residual + hidden_states
+
+        outputs = (hidden_states,) + attention_outputs[1:]  # add attentions if we output them
+
+        return outputs
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "self_attn", None) is not None:
+            with tf.name_scope(self.self_attn.name):
+                self.self_attn.build(None)
+        if getattr(self, "layer_norm1", None) is not None:
+            with tf.name_scope(self.layer_norm1.name):
+                self.layer_norm1.build([None, None, self.embed_dim])
+        if getattr(self, "mlp", None) is not None:
+            with tf.name_scope(self.mlp.name):
+                self.mlp.build(None)
+        if getattr(self, "layer_norm2", None) is not None:
+            with tf.name_scope(self.layer_norm2.name):
+                self.layer_norm2.build([None, None, self.embed_dim])
+
+
+# Adapted from transformers.models.clip.modeling_tf_clip.TFCLIPEncoder
+class TFGroupViTTextEncoder(keras.layers.Layer):
+    def __init__(self, config: GroupViTTextConfig, **kwargs):
+        super().__init__(**kwargs)
+
+        self.layers = [TFGroupViTEncoderLayer(config, name=f"layers_._{i}") for i in range(config.num_hidden_layers)]
+
+    def call(
+        self,
+        hidden_states,
+        attention_mask: tf.Tensor,
+        causal_attention_mask: tf.Tensor,
+        output_attentions: bool,
+        output_hidden_states: bool,
+        return_dict: bool,
+        training: bool = False,
+    ) -> tuple | TFBaseModelOutput:
+        encoder_states = () if output_hidden_states else None
+        all_attentions = () if output_attentions else None
+
+        for idx, encoder_layer in enumerate(self.layers):
+            if output_hidden_states:
+                encoder_states = encoder_states + (hidden_states,)
+
+            layer_outputs = encoder_layer(
+                hidden_states,
+                attention_mask,
+                causal_attention_mask,
+                output_attentions=output_attentions,
+            )
+            hidden_states = layer_outputs[0]
+
+            if output_attentions:
+                all_attentions = all_attentions + (layer_outputs[1],)
+
+        if output_hidden_states:
+            encoder_states = encoder_states + (hidden_states,)
+
+        if not return_dict:
+            return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
+        return TFBaseModelOutput(
+            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
+        )
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "layers", None) is not None:
+            for layer in self.layers:
+                with tf.name_scope(layer.name):
+                    layer.build(None)
+
+
+class TFGroupViTVisionEncoder(keras.layers.Layer):
+    def __init__(self, config: GroupViTVisionConfig, **kwargs) -> None:
+        super().__init__(**kwargs)
+
+        self.stages = [
+            TFGroupViTStage(
+                config=config,
+                depth=config.depths[i],
+                num_group_token=config.num_group_tokens[i],
+                num_output_group=config.num_output_groups[i],
+                num_prev_group_token=config.num_output_groups[i - 1] if i > 0 else 0,
+                name=f"stages_._{i}",
+            )
+            for i in range(len(config.depths))
+        ]
+
+    def call(
+        self,
+        hidden_states: tf.Tensor,
+        output_hidden_states: bool,
+        output_attentions: bool,
+        return_dict: bool,
+        training: bool = False,
+    ) -> tuple | TFBaseModelOutput:
+        all_hidden_states = () if output_hidden_states else None
+        all_groupings = () if output_attentions else None
+
+        group_tokens = None
+
+        for stage in self.stages:
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)
+
+            layer_outputs = stage(hidden_states, group_tokens, output_attentions)
+
+            hidden_states = layer_outputs[0]
+            group_tokens = layer_outputs[1]
+
+            if output_attentions and layer_outputs[2] is not None:
+                all_groupings = all_groupings + (layer_outputs[2],)
+
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+
+        if not return_dict:
+            return tuple(v for v in [hidden_states, all_hidden_states, all_groupings] if v is not None)
+        return TFBaseModelOutput(
+            last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_groupings
+        )
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "stages", None) is not None:
+            for layer in self.stages:
+                with tf.name_scope(layer.name):
+                    layer.build(None)
+
+
+# Copied from transformers.models.clip.modeling_tf_clip.TFCLIPTextTransformer with CLIPText->GroupViTText, CLIPEncoder->GroupViTTextEncoder
+class TFGroupViTTextTransformer(keras.layers.Layer):
+    # Text tower: embeddings -> causal encoder -> final layer norm, plus a pooled vector taken at one position
+    # per sequence (see `call`).
+    def __init__(self, config: GroupViTTextConfig, **kwargs):
+        super().__init__(**kwargs)
+
+        self.embeddings = TFGroupViTTextEmbeddings(config, name="embeddings")
+        self.encoder = TFGroupViTTextEncoder(config, name="encoder")
+        self.final_layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="final_layer_norm")
+
+        # For `pooled_output` computation
+        self.eos_token_id = config.eos_token_id
+        self.embed_dim = config.hidden_size
+
+    def call(
+        self,
+        input_ids: TFModelInputType,
+        attention_mask: tf.Tensor,
+        position_ids: tf.Tensor,
+        output_attentions: bool,
+        output_hidden_states: bool,
+        return_dict: bool,
+        training: bool = False,
+    ) -> TFBaseModelOutputWithPooling | tuple[tf.Tensor]:
+        input_shape = shape_list(input_ids)
+
+        embedding_output = self.embeddings(input_ids=input_ids, position_ids=position_ids)
+
+        # `input_ids` is unpacked as exactly two dimensions here.
+        batch_size, seq_length = input_shape
+        # CLIP's text model uses causal mask, prepare it here.
+        # https://github.com/openai/CLIP/blob/cfcffb90e69f37bf2ff1e988237a0fbe41f33c04/clip/model.py#L324
+        causal_attention_mask = self._build_causal_attention_mask(batch_size, seq_length, dtype=embedding_output.dtype)
+
+        # check attention mask and invert
+        # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+        attention_mask = _expand_mask(attention_mask)
+
+        encoder_outputs = self.encoder(
+            hidden_states=embedding_output,
+            attention_mask=attention_mask,
+            causal_attention_mask=causal_attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+
+        sequence_output = encoder_outputs[0]
+        sequence_output = self.final_layer_norm(inputs=sequence_output)
+
+        # Both branches gather one vector per sequence with (row, column) index pairs; they differ only in
+        # how the column is chosen.
+        if self.eos_token_id == 2:
+            # The `eos_token_id` was incorrect before PR #24773: let's keep what has been done here.
+            # A CLIP model with such `eos_token_id` in the config can't work correctly with extra new tokens added
+            # ------------------------------------------------------------
+            # text_embeds.shape = [batch_size, n_ctx, transformer.width]
+            # take features from the eot embedding (eot_token is the highest number in each sequence)
+            pooled_output = tf.gather_nd(
+                params=sequence_output,
+                indices=tf.stack(
+                    values=(tf.range(input_shape[0], dtype=tf.int64), tf.math.argmax(input_ids, axis=-1)), axis=1
+                ),
+            )
+        else:
+            # The config gets updated `eos_token_id` from PR #24773 (so the use of extra new tokens is possible)
+            # argmax over the 0/1 match mask selects a position where `input_ids == eos_token_id`.
+            pooled_output = tf.gather_nd(
+                params=sequence_output,
+                indices=tf.stack(
+                    values=(
+                        tf.range(input_shape[0], dtype=tf.int64),
+                        tf.math.argmax(tf.cast(input_ids == self.eos_token_id, dtype=tf.int8), axis=-1),
+                    ),
+                    axis=1,
+                ),
+            )
+
+        if not return_dict:
+            return (sequence_output, pooled_output) + encoder_outputs[1:]
+
+        return TFBaseModelOutputWithPooling(
+            last_hidden_state=sequence_output,
+            pooler_output=pooled_output,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+        )
+
+    def _build_causal_attention_mask(self, batch_size, seq_length, dtype=tf.float32):
+        # Returns an additive mask broadcast to (batch_size, 1, seq_length, seq_length): 0.0 on and below the
+        # diagonal, -10000.0 above it.
+        # It is possible with an unspecified sequence length for seq_length to be
+        # a runtime value, which is unsupported by tf.constant. Per the TensorFlow
+        # docs, tf.fill can handle runtime dynamic shapes:
+        # https://www.tensorflow.org/api_docs/python/tf/fill
+        diag = tf.cast(tf.fill((seq_length,), 0.0), dtype)
+
+        # set an additive 2D attention mask with all places being masked
+        to_mask = tf.cast(tf.fill((seq_length, seq_length), -10000.0), dtype)
+
+        # set diagonal & lower triangular parts to 0 (i.e. the places not to be masked)
+        # TIP: think the 2D matrix as the space of (query_seq, key_seq)
+        # band_part(x, 0, -1) keeps the diagonal and everything above it and zeroes the strictly lower
+        # triangle; set_diag then zeroes the diagonal as well.
+        to_mask = tf.linalg.band_part(to_mask, 0, -1)
+        to_mask = tf.linalg.set_diag(to_mask, diagonal=diag)
+
+        return tf.broadcast_to(input=to_mask, shape=(batch_size, 1, seq_length, seq_length))
+
+    def build(self, input_shape=None):
+        # Build the three sub-layers explicitly, each under its own name scope; a no-op once built.
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "embeddings", None) is not None:
+            with tf.name_scope(self.embeddings.name):
+                self.embeddings.build(None)
+        if getattr(self, "encoder", None) is not None:
+            with tf.name_scope(self.encoder.name):
+                self.encoder.build(None)
+        if getattr(self, "final_layer_norm", None) is not None:
+            with tf.name_scope(self.final_layer_norm.name):
+                self.final_layer_norm.build([None, None, self.embed_dim])
+
+
+# Adapted from transformers.models.clip.modeling_tf_clip.TFCLIPVisionTransformer
+class TFGroupViTVisionTransformer(keras.layers.Layer):
+    def __init__(self, config: GroupViTVisionConfig, **kwargs):
+        super().__init__(**kwargs)
+
+        self.embeddings = TFGroupViTVisionEmbeddings(config, name="embeddings")
+        self.encoder = TFGroupViTVisionEncoder(config, name="encoder")
+        self.layernorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm")
+        self.embed_dim = config.hidden_size
+
+    def call(
+        self,
+        pixel_values: TFModelInputType,
+        output_attentions: bool,
+        output_hidden_states: bool,
+        return_dict: bool,
+        training: bool = False,
+    ) -> tuple | TFBaseModelOutputWithPooling:
+        embedding_output = self.embeddings(pixel_values)
+
+        encoder_outputs = self.encoder(
+            hidden_states=embedding_output,
+            output_hidden_states=output_hidden_states,
+            output_attentions=output_attentions,
+            return_dict=return_dict,
+        )
+
+        last_hidden_state = encoder_outputs[0]
+
+        # normalize the last hidden state
+        last_hidden_state = self.layernorm(last_hidden_state)
+        pooled_output = tf.math.reduce_mean(last_hidden_state, axis=1)
+
+        if not return_dict:
+            return (last_hidden_state, pooled_output) + encoder_outputs[1:]
+
+        return TFBaseModelOutputWithPooling(
+            last_hidden_state=last_hidden_state,
+            pooler_output=pooled_output,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+        )
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "embeddings", None) is not None:
+            with tf.name_scope(self.embeddings.name):
+                self.embeddings.build(None)
+        if getattr(self, "encoder", None) is not None:
+            with tf.name_scope(self.encoder.name):
+                self.encoder.build(None)
+        if getattr(self, "layernorm", None) is not None:
+            with tf.name_scope(self.layernorm.name):
+                self.layernorm.build([None, None, self.embed_dim])
+
+
+@keras_serializable
+# Copied from transformers.models.clip.modeling_tf_clip.TFCLIPTextMainLayer with CLIP->GroupViT
+class TFGroupViTTextMainLayer(keras.layers.Layer):
+    config_class = GroupViTTextConfig
+
+    def __init__(self, config: GroupViTTextConfig, **kwargs):
+        super().__init__(**kwargs)
+        self.config = config
+        self.text_model = TFGroupViTTextTransformer(config, name="text_model")
+
+    def get_input_embeddings(self) -> keras.layers.Layer:
+        return self.text_model.embeddings
+
+    def set_input_embeddings(self, value: tf.Variable):
+        self.text_model.embeddings.weight = value
+        self.text_model.embeddings.vocab_size = shape_list(value)[0]
+
+    @unpack_inputs
+    def call(
+        self,
+        input_ids: TFModelInputType | None = None,
+        attention_mask: np.ndarray | tf.Tensor | None = None,
+        position_ids: np.ndarray | tf.Tensor | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        training: bool = False,
+    ) -> TFBaseModelOutputWithPooling | tuple[tf.Tensor]:
+        if input_ids is None:
+            raise ValueError("You have to specify input_ids")
+
+        input_shape = shape_list(input_ids)
+
+        if attention_mask is None:
+            attention_mask = tf.fill(dims=input_shape, value=1)
+
+        text_model_outputs = self.text_model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+
+        return text_model_outputs
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "text_model", None) is not None:
+            with tf.name_scope(self.text_model.name):
+                self.text_model.build(None)
+
+
+@keras_serializable
+# Copied from transformers.models.clip.modeling_tf_clip.TFCLIPVisionMainLayer with CLIP->GroupViT
+class TFGroupViTVisionMainLayer(keras.layers.Layer):
+    config_class = GroupViTVisionConfig
+
+    def __init__(self, config: GroupViTVisionConfig, **kwargs):
+        super().__init__(**kwargs)
+        self.config = config
+        self.vision_model = TFGroupViTVisionTransformer(config, name="vision_model")
+
+    def get_input_embeddings(self) -> keras.layers.Layer:
+        return self.vision_model.embeddings
+
+    @unpack_inputs
+    def call(
+        self,
+        pixel_values: TFModelInputType | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        training: bool = False,
+    ) -> TFBaseModelOutputWithPooling | tuple[tf.Tensor]:
+        if pixel_values is None:
+            raise ValueError("You have to specify pixel_values")
+
+        vision_model_outputs = self.vision_model(
+            pixel_values=pixel_values,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+
+        return vision_model_outputs
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "vision_model", None) is not None:
+            with tf.name_scope(self.vision_model.name):
+                self.vision_model.build(None)
+
+
+@keras_serializable
+# Adapted from transformers.models.clip.modeling_tf_clip.TFCLIPMainLayer
+class TFGroupViTMainLayer(keras.layers.Layer):
+    config_class = GroupViTConfig
+
+    def __init__(self, config: GroupViTConfig, **kwargs):
+        super().__init__(**kwargs)
+
+        if not isinstance(config.text_config, GroupViTTextConfig):
+            raise TypeError(
+                "config.text_config is expected to be of type GroupViTTextConfig but is of type"
+                f" {type(config.text_config)}."
+            )
+
+        if not isinstance(config.vision_config, GroupViTVisionConfig):
+            raise TypeError(
+                "config.vision_config is expected to be of type GroupViTVisionConfig but is of type"
+                f" {type(config.vision_config)}."
+            )
+
+        self.config = config
+
+        text_config = config.text_config
+        vision_config = config.vision_config
+
+        self.projection_dim = config.projection_dim
+        self.projection_intermediate_dim = config.projection_intermediate_dim
+        self.text_embed_dim = text_config.hidden_size
+        self.vision_embed_dim = vision_config.hidden_size
+
+        self.text_model = TFGroupViTTextTransformer(text_config, name="text_model")
+        self.vision_model = TFGroupViTVisionTransformer(vision_config, name="vision_model")
+
+        self.visual_projection = [
+            keras.layers.Dense(self.projection_intermediate_dim, name="visual_projection.0"),
+            keras.layers.BatchNormalization(name="visual_projection.1", momentum=0.9, epsilon=1e-5),
+            keras.layers.ReLU(name="visual_projection.2"),
+            keras.layers.Dense(self.projection_dim, name="visual_projection.3"),
+        ]
+        self.text_projection = [
+            keras.layers.Dense(self.projection_intermediate_dim, name="text_projection.0"),
+            keras.layers.BatchNormalization(name="text_projection.1", momentum=0.9, epsilon=1e-5),
+            keras.layers.ReLU(name="text_projection.2"),
+            keras.layers.Dense(self.projection_dim, name="text_projection.3"),
+        ]
+
+    def build(self, input_shape=None):
+        self.logit_scale = self.add_weight(
+            shape=(1,),
+            initializer=keras.initializers.Constant(self.config.logit_scale_init_value),
+            trainable=True,
+            name="logit_scale",
+        )
+
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "text_model", None) is not None:
+            with tf.name_scope(self.text_model.name):
+                self.text_model.build(None)
+        if getattr(self, "vision_model", None) is not None:
+            with tf.name_scope(self.vision_model.name):
+                self.vision_model.build(None)
+        if getattr(self, "visual_projection", None) is not None:
+            with tf.name_scope(self.visual_projection[0].name):
+                self.visual_projection[0].build([None, None, None, self.vision_embed_dim])
+            with tf.name_scope(self.visual_projection[1].name):
+                self.visual_projection[1].build((None, self.projection_intermediate_dim))
+            with tf.name_scope(self.visual_projection[3].name):
+                self.visual_projection[3].build([None, None, None, self.projection_intermediate_dim])
+        if getattr(self, "text_projection", None) is not None:
+            with tf.name_scope(self.text_projection[0].name):
+                self.text_projection[0].build([None, None, None, self.text_embed_dim])
+            with tf.name_scope(self.text_projection[1].name):
+                self.text_projection[1].build((None, self.projection_intermediate_dim))
+            with tf.name_scope(self.text_projection[3].name):
+                self.text_projection[3].build([None, None, None, self.projection_intermediate_dim])
+
+    @unpack_inputs
+    def get_text_features(
+        self,
+        input_ids: TFModelInputType | None = None,
+        attention_mask: np.ndarray | tf.Tensor | None = None,
+        position_ids: np.ndarray | tf.Tensor | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        training: bool = False,
+    ) -> tf.Tensor:
+        if input_ids is None:
+            raise ValueError("You have to specify either input_ids")
+
+        input_shape = shape_list(input_ids)
+
+        if attention_mask is None:
+            attention_mask = tf.fill(dims=input_shape, value=1)
+
+        text_outputs = self.text_model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+
+        pooled_output = text_outputs[1]
+        for layer in self.text_projection:
+            pooled_output = layer(pooled_output)
+
+        text_features = pooled_output
+        return text_features
+
+    @unpack_inputs
+    def get_image_features(
+        self,
+        pixel_values: TFModelInputType | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        training: bool = False,
+    ) -> tf.Tensor:
+        if pixel_values is None:
+            raise ValueError("You have to specify pixel_values")
+
+        vision_outputs = self.vision_model(
+            pixel_values=pixel_values,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+
+        pooled_output = vision_outputs[1]
+        for layer in self.visual_projection:
+            pooled_output = layer(pooled_output)
+
+        image_features = pooled_output
+        return image_features
+
+    @unpack_inputs
+    def call(
+        self,
+        input_ids: TFModelInputType | None = None,
+        pixel_values: TFModelInputType | None = None,
+        attention_mask: np.ndarray | tf.Tensor | None = None,
+        position_ids: np.ndarray | tf.Tensor | None = None,
+        return_loss: bool | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        output_segmentation: bool | None = None,
+        return_dict: bool | None = None,
+        training: bool = False,
+    ) -> TFGroupViTModelOutput | tuple[tf.Tensor]:
+        """
+        Encode images and text, project both, and compute scaled similarity logits between them.
+
+        Args:
+            input_ids: Token ids; required.
+            pixel_values: Image input; required.
+            attention_mask: Optional mask. When omitted, an all-ones tensor shaped like `input_ids` is used.
+            position_ids: Forwarded to the text model.
+            return_loss: When truthy, `groupvit_loss(logits_per_text)` is computed and returned.
+            output_attentions: Forwarded to both towers; forced to `True` when `output_segmentation` is truthy.
+            output_hidden_states: Forwarded to both towers.
+            output_segmentation: When truthy, segmentation logits are computed from the vision attentions.
+            return_dict: When falsy a plain tuple is returned, otherwise a `TFGroupViTModelOutput`.
+            training: Forwarded to both towers.
+
+        Returns:
+            A `TFGroupViTModelOutput`, or a tuple with the same fields in order (the loss is prepended only when
+            it was computed, and the segmentation logits are included only when they were computed).
+
+        Raises:
+            ValueError: If `input_ids` or `pixel_values` is `None`.
+        """
+        if input_ids is None:
+            raise ValueError("You have to specify either input_ids")
+        if pixel_values is None:
+            raise ValueError("You have to specify pixel_values")
+
+        input_shape = shape_list(input_ids)
+
+        if attention_mask is None:
+            attention_mask = tf.fill(dims=input_shape, value=1)
+        # The segmentation branch below reads the vision attentions, so they must be requested.
+        if output_segmentation:
+            output_attentions = True
+        vision_outputs = self.vision_model(
+            pixel_values=pixel_values,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+
+        text_outputs = self.text_model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+
+        # Element 1 of each tower's output is pushed through the matching projection head, layer by layer.
+        image_embeds = vision_outputs[1]
+        for layer in self.visual_projection:
+            image_embeds = layer(image_embeds)
+
+        text_embeds = text_outputs[1]
+        for layer in self.text_projection:
+            text_embeds = layer(text_embeds)
+
+        # normalized features
+        image_embeds = image_embeds / tf.norm(image_embeds, axis=-1, keepdims=True)
+        text_embeds = text_embeds / tf.norm(text_embeds, axis=-1, keepdims=True)
+
+        # cosine similarity as logits
+        # `self.logit_scale` is exponentiated before use.
+        logit_scale = tf.math.exp(self.logit_scale)
+        logits_per_text = tf.matmul(text_embeds, image_embeds, transpose_b=True) * logit_scale
+        logits_per_image = tf.transpose(logits_per_text)
+
+        seg_logits = None
+        if output_segmentation:
+            # grouped features
+            # [batch_size_image, num_group, hidden_size]
+            image_group_embeds = vision_outputs[0]
+            # [batch_size_image*num_group, hidden_size]
+            image_group_embeds = tf.reshape(image_group_embeds, shape=(-1, shape_list(image_group_embeds)[-1]))
+            for layer in self.visual_projection:
+                image_group_embeds = layer(image_group_embeds)
+            # The attentions are read one index later when hidden states are also returned.
+            if output_hidden_states:
+                attentions = vision_outputs[3]
+            else:
+                attentions = vision_outputs[2]
+            # [batch_size_image, num_group, height, width]
+            grouping = get_grouping_from_attentions(attentions, pixel_values.shape[2:])
+
+            # normalized features
+            image_group_embeds = image_group_embeds / tf.norm(
+                tensor=image_group_embeds, ord="euclidean", axis=-1, keepdims=True
+            )
+            # [batch_size_image x num_group, batch_size_text]
+            logits_per_image_group = tf.matmul(image_group_embeds, text_embeds, transpose_b=True) * logit_scale
+            # reshaped to [batch_size_image, num_group, batch_size_text], then transposed to
+            # [batch_size_image, batch_size_text, num_group]
+            # NOTE(review): the reshape reads static `.shape[0]` values; presumably both batch sizes must be
+            # statically known here - TODO confirm for graph mode.
+            logits_per_image_group = tf.reshape(
+                logits_per_image_group, shape=(image_embeds.shape[0], -1, text_embeds.shape[0])
+            )
+            logits_per_image_group = tf.transpose(logits_per_image_group, perm=(0, 2, 1))
+
+            # [batch_size_image, num_group, height x width]
+            flatten_grouping = tf.reshape(grouping, shape=(shape_list(grouping)[0], shape_list(grouping)[1], -1))
+
+            # [batch_size_image, batch_size_text, height, width]
+            # NOTE(review): `logit_scale` is multiplied in here and also in `logits_per_image_group` above -
+            # confirm the double scaling is intended.
+            seg_logits = tf.matmul(logits_per_image_group, flatten_grouping) * logit_scale
+            seg_logits = tf.reshape(
+                seg_logits, shape=(seg_logits.shape[0], seg_logits.shape[1], grouping.shape[2], grouping.shape[3])
+            )
+
+        loss = None
+        if return_loss:
+            # `[None, ...]` adds a leading axis to the value returned by `groupvit_loss`.
+            loss = groupvit_loss(logits_per_text)[None, ...]
+
+        if not return_dict:
+            # Tuple output: the segmentation logits slot exists only when they were computed.
+            if seg_logits is not None:
+                output = (
+                    logits_per_image,
+                    logits_per_text,
+                    seg_logits,
+                    text_embeds,
+                    image_embeds,
+                    text_outputs,
+                    vision_outputs,
+                )
+            else:
+                output = (logits_per_image, logits_per_text, text_embeds, image_embeds, text_outputs, vision_outputs)
+            return ((loss,) + output) if loss is not None else output
+
+        return TFGroupViTModelOutput(
+            loss=loss,
+            logits_per_image=logits_per_image,
+            logits_per_text=logits_per_text,
+            segmentation_logits=seg_logits,
+            text_embeds=text_embeds,
+            image_embeds=image_embeds,
+            text_model_output=text_outputs,
+            vision_model_output=vision_outputs,
+        )
+
+
+class TFGroupViTPreTrainedModel(TFPreTrainedModel):
+    """
+    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+    models.
+
+    It adds no methods of its own; it only sets the class-level attributes below, which the concrete GroupViT
+    models in this file inherit (and partly override).
+    """
+
+    # Configuration class associated with this model family.
+    config_class = GroupViTConfig
+    # Matches the `name="groupvit"` given to the main layer attribute by the subclasses in this file.
+    base_model_prefix = "groupvit"
+
+
+GROUPVIT_START_DOCSTRING = r"""
+    This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.)
+
+    This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
+    as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
+    behavior.
+
+    <Tip>
+
+    TF 2.0 models accept two formats as inputs:
+
+    - having all inputs as keyword arguments (like PyTorch models), or
+    - having all inputs as a list, tuple or dict in the first positional arguments.
+
+    This second option is useful when using [`keras.Model.fit`] method which currently requires having all the
+    tensors in the first argument of the model call function: `model(inputs)`.
+
+    If you choose this second option, there are three possibilities you can use to gather all the input Tensors in the
+    first positional argument :
+
+    - a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
+    - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
+      `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
+    - a dictionary with one or several input Tensors associated to the input names given in the docstring:
+      `model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
+
+    </Tip>
+
+    Args:
+        config ([`GroupViTConfig`]): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+GROUPVIT_TEXT_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`np.ndarray`, `tf.Tensor`, `list[tf.Tensor]`, `dict[str, tf.Tensor]` or `dict[str, np.ndarray]` and each example must have the shape `({0})`):
+            Indices of input sequence tokens in the vocabulary.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and
+            [`PreTrainedTokenizer.encode`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        attention_mask (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
+        position_ids (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*):
+            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+            config.max_position_embeddings - 1]`.
+
+            [What are position IDs?](../glossary#position-ids)
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the
+            config will be used instead.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
+            used instead.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in
+            eager mode, in graph mode the value will always be set to True.
+        training (`bool`, *optional*, defaults to `False`):
+            Whether or not to use the model in training mode (some modules like dropout modules have different
+            behaviors between training and evaluation).
+"""
+
+GROUPVIT_VISION_INPUTS_DOCSTRING = r"""
+    Args:
+        pixel_values (`np.ndarray`, `tf.Tensor`, `list[tf.Tensor]`, `dict[str, tf.Tensor]` or `dict[str, np.ndarray]` and each example must have the shape `(batch_size, num_channels, height, width)`):
+            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
+            [`CLIPImageProcessor.__call__`] for details.
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the
+            config will be used instead.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
+            used instead.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in
+            eager mode, in graph mode the value will always be set to True.
+        training (`bool`, *optional*, defaults to `False`):
+            Whether or not to use the model in training mode (some modules like dropout modules have different
+            behaviors between training and evaluation).
+"""
+
+GROUPVIT_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`np.ndarray`, `tf.Tensor`, `list[tf.Tensor]`, `dict[str, tf.Tensor]` or `dict[str, np.ndarray]` and each example must have the shape `({0})`):
+            Indices of input sequence tokens in the vocabulary.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and
+            [`PreTrainedTokenizer.encode`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        pixel_values (`np.ndarray`, `tf.Tensor`, `list[tf.Tensor]`, `dict[str, tf.Tensor]` or `dict[str, np.ndarray]` and each example must have the shape `(batch_size, num_channels, height, width)`):
+            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
+            [`CLIPImageProcessor.__call__`] for details.
+        attention_mask (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
+        position_ids (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*):
+            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+            config.max_position_embeddings - 1]`.
+
+            [What are position IDs?](../glossary#position-ids)
+        return_loss (`bool`, *optional*):
+            Whether or not to return the contrastive loss.
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the
+            config will be used instead.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
+            used instead.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in
+            eager mode, in graph mode the value will always be set to True.
+        training (`bool`, *optional*, defaults to `False`):
+            Whether or not to use the model in training mode (some modules like dropout modules have different
+            behaviors between training and evaluation).
+"""
+
+
+class TFGroupViTTextModel(TFGroupViTPreTrainedModel):
+    config_class = GroupViTTextConfig
+    main_input_name = "input_ids"
+
+    def __init__(self, config: GroupViTTextConfig, *inputs, **kwargs):
+        super().__init__(config, *inputs, **kwargs)
+
+        self.groupvit = TFGroupViTTextMainLayer(config, name="groupvit")
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(GROUPVIT_TEXT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @replace_return_docstrings(output_type=TFBaseModelOutputWithPooling, config_class=GroupViTTextConfig)
+    def call(
+        self,
+        input_ids: TFModelInputType | None = None,
+        attention_mask: np.ndarray | tf.Tensor | None = None,
+        position_ids: np.ndarray | tf.Tensor | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        training: bool = False,
+    ) -> TFBaseModelOutputWithPooling | tuple[tf.Tensor]:
+        r"""
+        Returns:
+
+        Examples:
+
+        ```python
+        >>> from transformers import CLIPTokenizer, TFGroupViTTextModel
+
+        >>> tokenizer = CLIPTokenizer.from_pretrained("nvidia/groupvit-gcc-yfcc")
+        >>> model = TFGroupViTTextModel.from_pretrained("nvidia/groupvit-gcc-yfcc")
+
+        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="tf")
+
+        >>> outputs = model(**inputs)
+        >>> last_hidden_state = outputs.last_hidden_state
+        >>> pooled_output = outputs.pooler_output  # pooled (EOS token) states
+        ```"""
+
+        outputs = self.groupvit(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+
+        return outputs
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "groupvit", None) is not None:
+            with tf.name_scope(self.groupvit.name):
+                self.groupvit.build(None)
+
+
+class TFGroupViTVisionModel(TFGroupViTPreTrainedModel):
+    config_class = GroupViTVisionConfig
+    main_input_name = "pixel_values"
+
+    def __init__(self, config: GroupViTVisionConfig, *inputs, **kwargs):
+        super().__init__(config, *inputs, **kwargs)
+
+        self.groupvit = TFGroupViTVisionMainLayer(config, name="groupvit")
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(GROUPVIT_VISION_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=TFBaseModelOutputWithPooling, config_class=GroupViTVisionConfig)
+    def call(
+        self,
+        pixel_values: TFModelInputType | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        training: bool = False,
+    ) -> TFBaseModelOutputWithPooling | tuple[tf.Tensor]:
+        r"""
+        Returns:
+
+        Examples:
+
+        ```python
+        >>> from PIL import Image
+        >>> import requests
+        >>> from transformers import AutoProcessor, TFGroupViTVisionModel
+
+        >>> processor = AutoProcessor.from_pretrained("nvidia/groupvit-gcc-yfcc")
+        >>> model = TFGroupViTVisionModel.from_pretrained("nvidia/groupvit-gcc-yfcc")
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> inputs = processor(images=image, return_tensors="tf")
+
+        >>> outputs = model(**inputs)
+        >>> last_hidden_state = outputs.last_hidden_state
+        >>> pooled_output = outputs.pooler_output  # pooled CLS states
+        ```"""
+
+        outputs = self.groupvit(
+            pixel_values=pixel_values,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+
+        return outputs
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "groupvit", None) is not None:
+            with tf.name_scope(self.groupvit.name):
+                self.groupvit.build(None)
+
+
+@add_start_docstrings(GROUPVIT_START_DOCSTRING)
+class TFGroupViTModel(TFGroupViTPreTrainedModel):
+    config_class = GroupViTConfig
+
+    def __init__(self, config: GroupViTConfig, *inputs, **kwargs):
+        super().__init__(config, *inputs, **kwargs)
+
+        self.groupvit = TFGroupViTMainLayer(config, name="groupvit")
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(GROUPVIT_TEXT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    def get_text_features(
+        self,
+        input_ids: TFModelInputType | None = None,
+        attention_mask: np.ndarray | tf.Tensor | None = None,
+        position_ids: np.ndarray | tf.Tensor | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        training: bool = False,
+    ) -> tf.Tensor:
+        r"""
+        Returns:
+            text_features (`tf.Tensor` of shape `(batch_size, output_dim`): The text embeddings obtained by applying
+            the projection layer to the pooled output of [`TFGroupViTTextModel`].
+
+        Examples:
+
+        ```python
+        >>> from transformers import CLIPTokenizer, TFGroupViTModel
+
+        >>> model = TFGroupViTModel.from_pretrained("nvidia/groupvit-gcc-yfcc")
+        >>> tokenizer = CLIPTokenizer.from_pretrained("nvidia/groupvit-gcc-yfcc")
+
+        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="tf")
+        >>> text_features = model.get_text_features(**inputs)
+        ```"""
+
+        text_features = self.groupvit.get_text_features(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+
+        return text_features
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(GROUPVIT_VISION_INPUTS_DOCSTRING)
+    def get_image_features(
+        self,
+        pixel_values: TFModelInputType | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        training: bool = False,
+    ) -> tf.Tensor:
+        r"""
+        Returns:
+            image_features (`tf.Tensor` of shape `(batch_size, output_dim`): The image embeddings obtained by applying
+            the projection layer to the pooled output of [`TFGroupViTVisionModel`].
+
+        Examples:
+
+        ```python
+        >>> from PIL import Image
+        >>> import requests
+        >>> from transformers import AutoProcessor, TFGroupViTModel
+
+        >>> model = TFGroupViTModel.from_pretrained("nvidia/groupvit-gcc-yfcc")
+        >>> processor = AutoProcessor.from_pretrained("nvidia/groupvit-gcc-yfcc")
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> inputs = processor(images=image, return_tensors="tf")
+
+        >>> image_features = model.get_image_features(**inputs)
+        ```"""
+
+        image_features = self.groupvit.get_image_features(
+            pixel_values=pixel_values,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+
+        return image_features
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(GROUPVIT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @replace_return_docstrings(output_type=TFGroupViTModelOutput, config_class=GroupViTConfig)
+    def call(
+        self,
+        input_ids: TFModelInputType | None = None,
+        pixel_values: TFModelInputType | None = None,
+        attention_mask: np.ndarray | tf.Tensor | None = None,
+        position_ids: np.ndarray | tf.Tensor | None = None,
+        return_loss: bool | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        output_segmentation: bool | None = None,
+        return_dict: bool | None = None,
+        training: bool = False,
+    ) -> TFGroupViTModelOutput | tuple[tf.Tensor]:
+        r"""
+        Returns:
+
+        Examples:
+
+        ```python
+        >>> from PIL import Image
+        >>> import requests
+        >>> from transformers import AutoProcessor, TFGroupViTModel
+        >>> import tensorflow as tf
+
+        >>> model = TFGroupViTModel.from_pretrained("nvidia/groupvit-gcc-yfcc")
+        >>> processor = AutoProcessor.from_pretrained("nvidia/groupvit-gcc-yfcc")
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> inputs = processor(
+        ...     text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="tf", padding=True
+        ... )
+
+        >>> outputs = model(**inputs)
+        >>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
+        >>> probs = tf.math.softmax(logits_per_image, axis=1)  # we can take the softmax to get the label probabilities
+        ```"""
+
+        outputs = self.groupvit(
+            input_ids=input_ids,
+            pixel_values=pixel_values,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            return_loss=return_loss,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            output_segmentation=output_segmentation,
+            return_dict=return_dict,
+            training=training,
+        )
+
+        return outputs
+
+    def serving_output(self, output: TFGroupViTModelOutput) -> TFGroupViTModelOutput:
+        # TODO: As is this currently fails with saved_model=True, because
+        # TensorFlow cannot trace through nested dataclasses. Reference:
+        # https://github.com/huggingface/transformers/pull/16886
+        return output
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "groupvit", None) is not None:
+            with tf.name_scope(self.groupvit.name):
+                self.groupvit.build(None)
+
+
+# Names exported by `from <module> import *`; the main-layer classes are deliberately not listed here.
+__all__ = ["TFGroupViTModel", "TFGroupViTPreTrainedModel", "TFGroupViTTextModel", "TFGroupViTVisionModel"]
diff --git a/phivenv/Lib/site-packages/transformers/models/helium/__init__.py b/phivenv/Lib/site-packages/transformers/models/helium/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..73d0966e5c1c489944c4b539c0cc06384c985c87
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/helium/__init__.py
@@ -0,0 +1,27 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:  # branch taken only by static type checkers, never at runtime
+    from .configuration_helium import *
+    from .modeling_helium import *
+else:  # runtime: swap this module's `sys.modules` entry for a `_LazyModule`
+    import sys
+
+    _file = globals()["__file__"]  # this file's path, passed to `define_import_structure` below
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/phivenv/Lib/site-packages/transformers/models/helium/__pycache__/__init__.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/helium/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0e21a4e68eb57cc10529a6baeb3eda30c53098d0
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/helium/__pycache__/__init__.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/helium/__pycache__/configuration_helium.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/helium/__pycache__/configuration_helium.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5d55a801e7f71b3867e414246ad8a6a17aab5c8f
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/helium/__pycache__/configuration_helium.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/helium/__pycache__/modeling_helium.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/helium/__pycache__/modeling_helium.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..984ddf5c56644129fa867e96eda4a320d9a796e0
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/helium/__pycache__/modeling_helium.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/helium/__pycache__/modular_helium.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/helium/__pycache__/modular_helium.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d35b836bdc3c7e937a6c2f7223d04e6b0692f4a1
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/helium/__pycache__/modular_helium.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/helium/configuration_helium.py b/phivenv/Lib/site-packages/transformers/models/helium/configuration_helium.py
new file mode 100644
index 0000000000000000000000000000000000000000..9bb4d8d88750bc5e7becd8cd20a2ef8db1d5936b
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/helium/configuration_helium.py
@@ -0,0 +1,154 @@
+# coding=utf-8
+# Copyright 2024 The Kyutai and HuggingFace Inc. teams. All rights reserved.
+#
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ...configuration_utils import PretrainedConfig
+
+
+class HeliumConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`HeliumModel`]. It is used to instantiate a Helium
+    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+    defaults will yield a similar configuration to that of the Helium 2b model.
+    e.g. [kyutai/helium-2b](https://huggingface.co/kyutai/helium-2b)
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+    Args:
+        vocab_size (`int`, *optional*, defaults to 48000):
+            Vocabulary size of the Helium model. Defines the number of different tokens that can be represented by the
+            `input_ids` passed when calling [`HeliumModel`]
+        hidden_size (`int`, *optional*, defaults to 2560):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 7040):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 24):
+            Number of hidden layers in the Transformer decoder.
+        num_attention_heads (`int`, *optional*, defaults to 20):
+            Number of attention heads for each attention layer in the Transformer decoder.
+        num_key_value_heads (`int`, *optional*, defaults to 20):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details, check out [this
+            paper](https://huggingface.co/papers/2305.13245). Passing `None` falls back to
+            `num_attention_heads`.
+        head_dim (`int`, *optional*, defaults to 128):
+            The attention head dimension.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function used in the MLP layers.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        max_position_embeddings (`int`, *optional*, defaults to 4096):
+            The maximum sequence length that this model might ever be used with.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-08):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether to tie weight embeddings
+        rope_theta (`float`, *optional*, defaults to 100000.0):
+            The base period of the RoPE embeddings.
+        pad_token_id (`int`, *optional*, defaults to 3):
+            Padding token id.
+        eos_token_id (`int` | `list`, *optional*, defaults to 2):
+            End of stream token id.
+        bos_token_id (`int`, *optional*, defaults to 1):
+            Beginning of stream token id.
+        attention_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use a bias in the query, key and value projection layers during self-attention. The output
+            projection is always created without a bias.
+        mlp_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use a bias in up_proj, down_proj and gate_proj layers in the MLP layers.
+    ```python
+    >>> from transformers import HeliumModel, HeliumConfig
+    >>> # Initializing a Helium 2b style configuration
+    >>> configuration = HeliumConfig()
+    >>> # Initializing a model from the Helium 2b style configuration
+    >>> model = HeliumModel(configuration)
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "helium"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    # Per-module sharding styles, keyed by module-name pattern.
+    base_model_tp_plan = {
+        "layers.*.self_attn.q_proj": "colwise",
+        "layers.*.self_attn.k_proj": "colwise",
+        "layers.*.self_attn.v_proj": "colwise",
+        "layers.*.self_attn.o_proj": "rowwise",
+        "layers.*.mlp.gate_proj": "colwise",
+        "layers.*.mlp.up_proj": "colwise",
+        "layers.*.mlp.down_proj": "rowwise",
+    }
+    # For each top-level sub-module: (names of its inputs, names of its outputs).
+    base_model_pp_plan = {
+        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
+        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
+        "norm": (["hidden_states"], ["hidden_states"]),
+    }
+
+    def __init__(
+        self,
+        vocab_size=48000,
+        hidden_size=2560,
+        intermediate_size=7040,
+        num_hidden_layers=24,
+        num_attention_heads=20,
+        num_key_value_heads=20,
+        head_dim=128,
+        hidden_act="silu",
+        attention_dropout=0.0,
+        max_position_embeddings=4096,
+        initializer_range=0.02,
+        rms_norm_eps=1e-8,
+        use_cache=True,
+        tie_word_embeddings=False,
+        rope_theta=100000.0,
+        pad_token_id=3,
+        eos_token_id=2,
+        bos_token_id=1,
+        attention_bias=False,
+        mlp_bias=False,
+        **kwargs,
+    ):
+        # Honour the documented contract: an unspecified key/value head count means one
+        # key/value head per attention head. Without this, `None` would be stored as-is
+        # and break the integer division done by the attention layer.
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+        self.head_dim = head_dim
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+        self.mlp_bias = mlp_bias
+
+        # Model attributes are assigned before delegating so they already exist while the
+        # base class processes the token ids, the tying flag and any extra keyword arguments.
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+
+# Public names of this module (what `from ... import *` exposes).
+__all__ = ["HeliumConfig"]
diff --git a/phivenv/Lib/site-packages/transformers/models/helium/modeling_helium.py b/phivenv/Lib/site-packages/transformers/models/helium/modeling_helium.py
new file mode 100644
index 0000000000000000000000000000000000000000..9f4a2e73affd76f079ecfafb2b51ba88e0ccf252
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/helium/modeling_helium.py
@@ -0,0 +1,497 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/helium/modular_helium.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_helium.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# coding=utf-8
+# Copyright 2024 The Kyutai and HuggingFace Inc. teams. All rights reserved.
+#
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import math
+from typing import Callable, Optional, Union
+
+import torch
+import torch.nn as nn
+
+from ...activations import ACT2FN
+from ...cache_utils import Cache, DynamicCache
+from ...generation import GenerationMixin
+from ...masking_utils import create_causal_mask
+from ...modeling_layers import (
+    GenericForSequenceClassification,
+    GenericForTokenClassification,
+    GradientCheckpointingLayer,
+)
+from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
+from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from ...processing_utils import Unpack
+from ...utils import TransformersKwargs, auto_docstring, can_return_tuple
+from ...utils.deprecation import deprecate_kwarg
+from ...utils.generic import check_model_inputs
+from .configuration_helium import HeliumConfig
+
+
+class HeliumRMSNorm(nn.Module):
+    """Root-mean-square normalisation over the last dimension with a learnable scale.
+
+    The statistics and the scaling are computed in float32; the result is cast back to
+    the dtype of the input.
+    """
+
+    def __init__(self, hidden_size, eps=1e-6):
+        super().__init__()
+        self.variance_epsilon = eps
+        # The scale starts at one, i.e. the layer initially only normalises.
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+
+    def forward(self, hidden_states):
+        original_dtype = hidden_states.dtype
+        upcast = hidden_states.to(torch.float32)
+        mean_square = upcast.pow(2).mean(-1, keepdim=True)
+        normalized = upcast * torch.rsqrt(mean_square + self.variance_epsilon)
+        scaled = self.weight.to(torch.float32) * normalized
+        return scaled.to(original_dtype)
+
+    def extra_repr(self):
+        weight_shape = tuple(self.weight.shape)
+        return f"{weight_shape}, eps={self.variance_epsilon}"
+
+
+class HeliumRotaryEmbedding(nn.Module):
+    """Produces the cosine/sine tables consumed by the rotary position embedding.
+
+    The inverse frequencies and the attention scaling factor come from the initialiser
+    registered in `ROPE_INIT_FUNCTIONS` under the configured RoPE type.
+    """
+
+    inv_freq: torch.Tensor  # fix linting for `register_buffer`
+
+    def __init__(self, config: HeliumConfig, device=None):
+        super().__init__()
+        # BC: "rope_type" was originally "type"
+        if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict):
+            self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
+        else:
+            # No (dict-valued) `rope_scaling` on the config: use the "default" initialiser.
+            self.rope_type = "default"
+        # Both start at the configured maximum length.
+        # NOTE(review): presumably read/updated by `dynamic_rope_update` - confirm against that helper.
+        self.max_seq_len_cached = config.max_position_embeddings
+        self.original_max_seq_len = config.max_position_embeddings
+
+        self.config = config
+        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
+
+        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
+        # Non-persistent buffer: follows the module across devices but is not saved in the state dict.
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+        # Keeps a reference to the freshly initialised frequencies.
+        self.original_inv_freq = self.inv_freq
+
+    @torch.no_grad()
+    @dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
+    def forward(self, x, position_ids):
+        """Return `(cos, sin)` for `position_ids`, both cast to the dtype of `x`.
+
+        `x` is only used for its device and dtype. `position_ids` is indexed as a 2-D
+        `(batch, seq_len)` tensor and `inv_freq` as a 1-D tensor.
+        """
+        # (n_freq,) -> (batch, n_freq, 1), moved to the device of `x`.
+        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
+        # (batch, seq_len) -> (batch, 1, seq_len)
+        position_ids_expanded = position_ids[:, None, :].float()
+
+        # "mps" (and any non-string device type) is mapped to "cpu" for the autocast context.
+        device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
+        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+            # (batch, n_freq, 1) @ (batch, 1, seq_len) -> transposed to (batch, seq_len, n_freq)
+            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
+            # Frequencies are duplicated along the last dimension: (batch, seq_len, 2 * n_freq).
+            emb = torch.cat((freqs, freqs), dim=-1)
+            cos = emb.cos() * self.attention_scaling
+            sin = emb.sin() * self.attention_scaling
+
+        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
+
+
+class HeliumMLP(nn.Module):
+    """Gated feed-forward block computing `down_proj(act_fn(gate_proj(x)) * up_proj(x))`."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.intermediate_size = config.intermediate_size
+
+        model_width, inner_width = self.hidden_size, self.intermediate_size
+        use_bias = config.mlp_bias
+        # Creation order is kept (gate, up, down) so parameter order and initialisation are unchanged.
+        self.gate_proj = nn.Linear(model_width, inner_width, bias=use_bias)
+        self.up_proj = nn.Linear(model_width, inner_width, bias=use_bias)
+        self.down_proj = nn.Linear(inner_width, model_width, bias=use_bias)
+        self.act_fn = ACT2FN[config.hidden_act]
+
+    def forward(self, x):
+        gate = self.act_fn(self.gate_proj(x))
+        candidate = self.up_proj(x)
+        return self.down_proj(gate * candidate)
+
+
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """
+    Repeat every key/value head `n_rep` times along the head dimension, matching
+    torch.repeat_interleave(x, dim=1, repeats=n_rep): (batch, num_key_value_heads, seqlen, head_dim) becomes
+    (batch, num_key_value_heads * n_rep, seqlen, head_dim). With `n_rep == 1` the input is returned as is.
+    """
+    bsz, kv_heads, seq_len, dim = hidden_states.shape
+    if n_rep != 1:
+        # Insert a repeat axis after the head axis, broadcast it, then fold it into the heads.
+        tiled = hidden_states[:, :, None, :, :].expand(bsz, kv_heads, n_rep, seq_len, dim)
+        return tiled.reshape(bsz, kv_heads * n_rep, seq_len, dim)
+    return hidden_states
+
+
+def eager_attention_forward(
+    module: nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: Optional[torch.Tensor],
+    scaling: float,
+    dropout: float = 0.0,
+    **kwargs: Unpack[TransformersKwargs],
+):
+    """Plain matmul/softmax attention.
+
+    Keys and values are first repeated `module.num_key_value_groups` times along the head
+    axis. Returns the attention output with its head and sequence axes swapped (made
+    contiguous) together with the post-dropout attention weights.
+    """
+    group_count = module.num_key_value_groups
+    keys = repeat_kv(key, group_count)
+    values = repeat_kv(value, group_count)
+
+    scores = torch.matmul(query, keys.transpose(2, 3)) * scaling
+    if attention_mask is not None:
+        # The mask is additive and is cropped to the key length before use.
+        kv_len = keys.shape[-2]
+        scores = scores + attention_mask[:, :, :, :kv_len]
+
+    # Softmax runs in float32 and is cast back to the query dtype.
+    probs = nn.functional.softmax(scores, dim=-1, dtype=torch.float32).to(query.dtype)
+    probs = nn.functional.dropout(probs, p=dropout, training=module.training)
+    context = torch.matmul(probs, values).transpose(1, 2).contiguous()
+
+    return context, probs
+
+
+def rotate_half(x):
+    """Rotate adjacent pairs of the last dimension: `(a, b)` becomes `(-b, a)`, pairs stay interleaved."""
+    even_slots = x[..., 0::2]
+    odd_slots = x[..., 1::2]
+    paired = torch.stack((-odd_slots, even_slots), dim=-1)
+    return paired.flatten(-2)
+
+
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
+    """Rotate the query and key tensors with interleaved rotary position embeddings.
+
+    Args:
+        q (`torch.Tensor`): The query tensor.
+        k (`torch.Tensor`): The key tensor.
+        cos (`torch.Tensor`): The cosine part of the rotary embedding.
+        sin (`torch.Tensor`): The sine part of the rotary embedding.
+        position_ids (`torch.Tensor`, *optional*):
+            Deprecated and unused.
+        unsqueeze_dim (`int`, *optional*, defaults to 1):
+            Dimension along which `cos` and `sin` are unsqueezed so that they broadcast against `q` and `k`. With
+            `cos`/`sin` of shape [batch_size, seq_len, head_dim], use 1 when `q` and `k` are
+            [batch_size, heads, seq_len, head_dim] and 2 when they are [batch_size, seq_len, heads, head_dim].
+    Returns:
+        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
+    """
+
+    def _interleaved(table):
+        # Add the broadcast axis, keep the first half of the last dimension and repeat each
+        # entry twice so it lines up with the adjacent pairs handled by `rotate_half`.
+        table = table.unsqueeze(unsqueeze_dim)
+        half_width = table.shape[-1] // 2
+        return table[..., :half_width].repeat_interleave(2, dim=-1)
+
+    cos = _interleaved(cos)
+    sin = _interleaved(sin)
+
+    rotated_q = (q * cos) + (rotate_half(q) * sin)
+    rotated_k = (k * cos) + (rotate_half(k) * sin)
+    return rotated_q, rotated_k
+
+
+class HeliumAttention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper
+
+    Queries use `config.num_attention_heads` heads while keys and values use
+    `config.num_key_value_heads` heads; every head has width `head_dim`. Rotary position
+    embeddings are applied to queries and keys before the (optional) cache update.
+    """
+
+    def __init__(self, config: HeliumConfig, layer_idx: Optional[int] = None):
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+        self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
+        self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
+        self.scaling = 1 / math.sqrt(self.head_dim)
+        self.attention_dropout = config.attention_dropout
+        self.is_causal = True
+
+        self.q_proj = nn.Linear(
+            config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.k_proj = nn.Linear(
+            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.v_proj = nn.Linear(
+            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
+        )
+        # The attention output fed to `o_proj` has `num_attention_heads * head_dim` features,
+        # which only equals `hidden_size` for some configurations (it does for the defaults).
+        # Sizing the input from the heads keeps the layer usable when the two differ and
+        # leaves the weight shape untouched when they are equal.
+        self.o_proj = nn.Linear(config.num_attention_heads * self.head_dim, config.hidden_size, bias=False)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
+        attention_mask: Optional[torch.Tensor],
+        past_key_values: Optional[Cache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """Run self-attention over `hidden_states`.
+
+        Args:
+            hidden_states: Input whose last dimension is projected to queries, keys and values.
+            position_embeddings: `(cos, sin)` pair passed to `apply_rotary_pos_emb`.
+            attention_mask: Mask forwarded unchanged to the attention backend; may be `None`.
+            past_key_values: Optional cache; when given, the new keys/values are merged into it
+                for `self.layer_idx` and the merged tensors are attended over.
+            cache_position: Forwarded to the cache update.
+            **kwargs: Forwarded to the attention backend.
+
+        Returns:
+            The output of `o_proj` and the attention weights returned by the backend.
+        """
+        input_shape = hidden_states.shape[:-1]
+        # Split the projected features into heads: (*input_shape, n_heads, head_dim).
+        hidden_shape = (*input_shape, -1, self.head_dim)
+
+        query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+        key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+        value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+
+        cos, sin = position_embeddings
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+
+        if past_key_values is not None:
+            # sin and cos are specific to RoPE models; cache_position needed for the static cache
+            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        # The eager implementation lives in this file; any other backend is looked up by name.
+        attention_interface: Callable = eager_attention_forward
+        if self.config._attn_implementation != "eager":
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            dropout=0.0 if not self.training else self.attention_dropout,
+            scaling=self.scaling,
+            **kwargs,
+        )
+
+        # Merge the heads back into one feature axis before the output projection.
+        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
+        attn_output = self.o_proj(attn_output)
+        return attn_output, attn_weights
+
+
+class HeliumDecoderLayer(GradientCheckpointingLayer):
+    """One decoder block: pre-norm self-attention and a pre-norm MLP, each with a residual connection."""
+
+    def __init__(self, config: HeliumConfig, layer_idx: Optional[int] = None):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+
+        # Sub-modules are created in the same order as before so parameter order is unchanged.
+        self.self_attn = HeliumAttention(config=config, layer_idx=layer_idx)
+        self.mlp = HeliumMLP(config)
+
+        norm_width, norm_eps = config.hidden_size, config.rms_norm_eps
+        self.input_layernorm = HeliumRMSNorm(norm_width, eps=norm_eps)
+        self.post_attention_layernorm = HeliumRMSNorm(norm_width, eps=norm_eps)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        use_cache: Optional[bool] = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> torch.Tensor:
+        # Attention sub-block; the attention weights returned by the layer are discarded here.
+        attn_input = self.input_layernorm(hidden_states)
+        attn_output, _ = self.self_attn(
+            hidden_states=attn_input,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            position_embeddings=position_embeddings,
+            **kwargs,
+        )
+        after_attention = hidden_states + attn_output
+
+        # Feed-forward sub-block.
+        mlp_output = self.mlp(self.post_attention_layernorm(after_attention))
+        return after_attention + mlp_output
+
+
+@auto_docstring
+class HeliumPreTrainedModel(PreTrainedModel):
+    # Common base of the Helium model classes in this file. It defines no methods of its
+    # own, only the class-level settings below; everything else is inherited from
+    # `PreTrainedModel`.
+    config: HeliumConfig
+    # Attribute name under which the task-specific classes store the bare `HeliumModel`.
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    # NOTE(review): class names that presumably must stay on a single device when the
+    # model is dispatched across devices - confirm against the base class.
+    _no_split_modules = ["HeliumDecoderLayer"]
+    _skip_keys_device_placement = ["past_key_values"]
+    # Capability flags; their exact semantics are defined by the base class, not in this file.
+    _supports_flash_attn = True
+    _supports_sdpa = True
+    _supports_flex_attn = True
+
+    _can_compile_fullgraph = True
+    _supports_attention_backend = True
+    # Maps an output name to the module class defined in this file that produces it.
+    # NOTE(review): presumably consumed by the output-recording decorators - confirm.
+    _can_record_outputs = {
+        "hidden_states": HeliumDecoderLayer,
+        "attentions": HeliumAttention,
+    }
+
+
+@auto_docstring
+class HeliumModel(HeliumPreTrainedModel):
+    # Bare decoder stack: token embedding, `num_hidden_layers` decoder layers, final RMS norm.
+    def __init__(self, config: HeliumConfig):
+        super().__init__(config)
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+        decoder_layers = [HeliumDecoderLayer(config, idx) for idx in range(config.num_hidden_layers)]
+        self.layers = nn.ModuleList(decoder_layers)
+        self.norm = HeliumRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.rotary_emb = HeliumRotaryEmbedding(config)
+        self.gradient_checkpointing = False
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @check_model_inputs
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> BaseModelOutputWithPast:
+        # Exactly one of the two input forms has to be given: both missing or both present is an error.
+        if (input_ids is None) == (inputs_embeds is None):
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+
+        # A cache is only created on demand, never when the caller supplied one.
+        if past_key_values is None and use_cache:
+            past_key_values = DynamicCache(config=self.config)
+
+        if cache_position is None:
+            # Positions continue after whatever the cache already holds.
+            first_position = 0 if past_key_values is None else past_key_values.get_seq_length()
+            last_position = first_position + inputs_embeds.shape[1]
+            cache_position = torch.arange(first_position, last_position, device=inputs_embeds.device)
+
+        if position_ids is None:
+            position_ids = cache_position.unsqueeze(0)
+
+        causal_mask = create_causal_mask(
+            config=self.config,
+            input_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            cache_position=cache_position,
+            past_key_values=past_key_values,
+            position_ids=position_ids,
+        )
+
+        # The rotary tables are computed once and shared by every layer.
+        position_embeddings = self.rotary_emb(inputs_embeds, position_ids)
+
+        hidden_states = inputs_embeds
+        active_layers = self.layers[: self.config.num_hidden_layers]
+        for layer in active_layers:
+            hidden_states = layer(
+                hidden_states,
+                attention_mask=causal_mask,
+                position_ids=position_ids,
+                past_key_values=past_key_values,
+                cache_position=cache_position,
+                position_embeddings=position_embeddings,
+                **kwargs,
+            )
+
+        return BaseModelOutputWithPast(
+            last_hidden_state=self.norm(hidden_states),
+            past_key_values=past_key_values,
+        )
+
+
+@auto_docstring
+class HeliumForCausalLM(HeliumPreTrainedModel, GenerationMixin):
+    # `HeliumModel` followed by a bias-free linear head projecting hidden states to vocabulary logits.
+    _tied_weights_keys = ["lm_head.weight"]
+    _tp_plan = {"lm_head": "colwise_rep"}
+    _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.model = HeliumModel(config)
+        self.vocab_size = config.vocab_size
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> CausalLMOutputWithPast:
+        r"""
+        Example:
+
+        ```python
+        >>> from transformers import AutoTokenizer, HeliumForCausalLM
+
+        >>> model = HeliumForCausalLM.from_pretrained("kyutai/helium-2b")
+        >>> tokenizer = AutoTokenizer.from_pretrained("kyutai/helium-2b")
+
+        >>> prompt = "What is your favorite condiment?"
+        >>> inputs = tokenizer(prompt, return_tensors="pt")
+
+        >>> # Generate
+        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
+        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "What is your favorite condiment?"
+        ```"""
+        outputs: BaseModelOutputWithPast = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            **kwargs,
+        )
+
+        hidden_states = outputs.last_hidden_state
+        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
+        # An int keeps the last `logits_to_keep` positions (0 keeps all, since slice(-0, None) is the
+        # full range); a tensor is used directly as the index along the sequence dimension.
+        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+        logits = self.lm_head(hidden_states[:, slice_indices, :])
+
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs)
+
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+# Sequence-classification variant: combines `GenericForSequenceClassification` with the
+# Helium base class and adds nothing of its own.
+class HeliumForSequenceClassification(GenericForSequenceClassification, HeliumPreTrainedModel):
+    pass
+
+
+# Token-classification variant: combines `GenericForTokenClassification` with the
+# Helium base class and adds nothing of its own.
+class HeliumForTokenClassification(GenericForTokenClassification, HeliumPreTrainedModel):
+    pass
+
+
+# Public names of this module (what `from ... import *` exposes).
+__all__ = [
+    "HeliumPreTrainedModel",
+    "HeliumModel",
+    "HeliumForCausalLM",
+    "HeliumForSequenceClassification",
+    "HeliumForTokenClassification",
+]
diff --git a/phivenv/Lib/site-packages/transformers/models/helium/modular_helium.py b/phivenv/Lib/site-packages/transformers/models/helium/modular_helium.py
new file mode 100644
index 0000000000000000000000000000000000000000..fe53f7820abb1e36030bbab4e73e2f78983cda0e
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/helium/modular_helium.py
@@ -0,0 +1,150 @@
+# coding=utf-8
+# Copyright 2024 The Kyutai and HuggingFace Inc. teams. All rights reserved.
+#
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import math
+from typing import Optional
+
+import torch
+import torch.nn as nn
+import torch.utils.checkpoint
+
+from ...utils import logging
+from ..gemma.modeling_gemma import GemmaForCausalLM, GemmaForSequenceClassification, GemmaForTokenClassification
+from ..granite.modeling_granite import GraniteAttention
+from ..llama.modeling_llama import LlamaDecoderLayer, LlamaMLP, LlamaModel, LlamaPreTrainedModel, LlamaRotaryEmbedding
+from .configuration_helium import HeliumConfig
+
+
+# Module-level logger, obtained from the library's own `logging` helper and named after this module.
+logger = logging.get_logger(__name__)
+
+
+class HeliumRMSNorm(nn.Module):
+    """Root-mean-square normalization over the last dimension with a learnable per-feature scale.
+
+    The mean square, the normalization and the scaling are all computed in float32; the result is cast back to
+    the dtype of the input.
+    """
+
+    def __init__(self, hidden_size, eps=1e-6):
+        """
+        Args:
+            hidden_size: Length of the learnable `weight` vector (initialised to ones).
+            eps: Constant added to the mean square before taking the reciprocal square root.
+        """
+        super().__init__()
+        self.variance_epsilon = eps
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+
+    def forward(self, hidden_states):
+        """Return `hidden_states` RMS-normalized along the last dimension and scaled by `weight`."""
+        original_dtype = hidden_states.dtype
+        upcast = hidden_states.to(torch.float32)
+        mean_square = upcast.pow(2).mean(-1, keepdim=True)
+        normalized = upcast * torch.rsqrt(mean_square + self.variance_epsilon)
+        scaled = self.weight.to(torch.float32) * normalized
+        return scaled.to(original_dtype)
+
+    def extra_repr(self):
+        """Return the weight shape and epsilon shown in the module's printed representation."""
+        weight_shape = tuple(self.weight.shape)
+        return f"{weight_shape}, eps={self.variance_epsilon}"
+
+
+# Helium's rotary embedding: a plain subclass of `LlamaRotaryEmbedding` with no overrides.
+class HeliumRotaryEmbedding(LlamaRotaryEmbedding):
+    pass
+
+
+# Helium's feed-forward block: a plain subclass of `LlamaMLP` with no overrides.
+class HeliumMLP(LlamaMLP):
+    pass
+
+
+def rotate_half(x):
+    """Rotates each adjacent (even, odd) pair of the last dimension: `(a, b)` becomes `(-b, a)`.
+
+    The pairs are interleaved, i.e. elements `2i` and `2i + 1` form one pair, and the output has the same shape
+    as `x`.
+    """
+    even_part, odd_part = x[..., ::2], x[..., 1::2]
+    rotated_pairs = torch.stack([odd_part.neg(), even_part], dim=-1)
+    return rotated_pairs.flatten(start_dim=-2)
+
+
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
+    """Applies Rotary Position Embedding to the query and key tensors, using the interleaved pair layout.
+
+    Only the first half of the last dimension of `cos` / `sin` is used; each of those values is repeated twice in
+    a row so that it lines up with the adjacent pairs rotated by `rotate_half`.
+
+    Args:
+        q (`torch.Tensor`): The query tensor.
+        k (`torch.Tensor`): The key tensor.
+        cos (`torch.Tensor`): The cosine part of the rotary embedding.
+        sin (`torch.Tensor`): The sine part of the rotary embedding.
+        position_ids (`torch.Tensor`, *optional*):
+            Deprecated and unused.
+        unsqueeze_dim (`int`, *optional*, defaults to 1):
+            Dimension along which `cos` and `sin` are unsqueezed so that they broadcast against `q` and `k`. For
+            example, if `cos` and `sin` have the shape [batch_size, seq_len, head_dim] and `q` and `k` have the
+            shape [batch_size, heads, seq_len, head_dim], use `unsqueeze_dim=1`; if `q` and `k` have the shape
+            [batch_size, seq_len, heads, head_dim], use `unsqueeze_dim=2`.
+    Returns:
+        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
+    """
+
+    def _interleave(table):
+        # Add the broadcast dimension, keep the first half of the features and repeat each one twice in a row.
+        table = table.unsqueeze(unsqueeze_dim)
+        half_width = table.shape[-1] // 2
+        return table[..., :half_width].repeat_interleave(2, dim=-1)
+
+    cos, sin = _interleave(cos), _interleave(sin)
+
+    q_embed = (q * cos) + (rotate_half(q) * sin)
+    k_embed = (k * cos) + (rotate_half(k) * sin)
+    return q_embed, k_embed
+
+
+class HeliumAttention(GraniteAttention):
+    """`GraniteAttention` with Helium's own output projection and attention scaling."""
+
+    def __init__(self, config: HeliumConfig, layer_idx: Optional[int] = None):
+        super().__init__(config, layer_idx)
+        hidden_size = config.hidden_size
+        # Re-assign both attributes after the parent constructor has run.
+        self.o_proj = nn.Linear(hidden_size, hidden_size, bias=False)
+        self.scaling = 1 / math.sqrt(self.head_dim)
+
+
+class HeliumDecoderLayer(LlamaDecoderLayer):
+    """`LlamaDecoderLayer` whose MLP and two layer norms are re-created from the Helium classes."""
+
+    def __init__(self, config: HeliumConfig, layer_idx: Optional[int] = None):
+        super().__init__(config, layer_idx)
+        hidden_size, norm_eps = config.hidden_size, config.rms_norm_eps
+
+        self.mlp = HeliumMLP(config)
+        self.input_layernorm = HeliumRMSNorm(hidden_size, eps=norm_eps)
+        self.post_attention_layernorm = HeliumRMSNorm(hidden_size, eps=norm_eps)
+
+
+# Base class for the Helium models: a plain subclass of `LlamaPreTrainedModel` with no overrides.
+class HeliumPreTrainedModel(LlamaPreTrainedModel):
+    pass
+
+
+class HeliumModel(HeliumPreTrainedModel, LlamaModel):
+    """Helium decoder stack: re-assigns `layers`, `norm` and `rotary_emb` with the Helium classes."""
+
+    def __init__(self, config: HeliumConfig):
+        super().__init__(config)
+        decoder_layers = [HeliumDecoderLayer(config, idx) for idx in range(config.num_hidden_layers)]
+        self.layers = nn.ModuleList(decoder_layers)
+        self.norm = HeliumRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.rotary_emb = HeliumRotaryEmbedding(config)
+        self.gradient_checkpointing = False
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+
+# Causal language-modeling model for Helium: a plain subclass of `GemmaForCausalLM` with no overrides.
+class HeliumForCausalLM(GemmaForCausalLM):
+    pass
+
+
+# Sequence-classification model for Helium: a plain subclass of `GemmaForSequenceClassification` with no overrides.
+class HeliumForSequenceClassification(GemmaForSequenceClassification):
+    pass
+
+
+# Token-classification model for Helium: a plain subclass of `GemmaForTokenClassification` with no overrides.
+class HeliumForTokenClassification(GemmaForTokenClassification):
+    pass
+
+
+# Names exported by this module (this is what a star-import of the module exposes).
+__all__ = [
+    "HeliumPreTrainedModel",
+    "HeliumModel",
+    "HeliumForCausalLM",
+    "HeliumForSequenceClassification",
+    "HeliumForTokenClassification",
+]
diff --git a/phivenv/Lib/site-packages/transformers/models/herbert/__init__.py b/phivenv/Lib/site-packages/transformers/models/herbert/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e0d0794a06e8cfbe4178a72e2b09d5292ddbc4fb
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/herbert/__init__.py
@@ -0,0 +1,27 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+# Static type checkers see the real star-imports; at runtime the `else` branch runs instead.
+if TYPE_CHECKING:
+    from .tokenization_herbert import *
+    from .tokenization_herbert_fast import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    # Replace this module's entry in `sys.modules` with a `_LazyModule` built from the import structure that
+    # `define_import_structure` derives from this file (presumably so submodules load on first use - confirm
+    # against `_LazyModule`).
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/phivenv/Lib/site-packages/transformers/models/herbert/__pycache__/__init__.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/herbert/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..980c3868755b7b4d658444be5d3fd3e45da82e4f
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/herbert/__pycache__/__init__.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/herbert/__pycache__/tokenization_herbert.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/herbert/__pycache__/tokenization_herbert.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1c82d2ae3b10b0f663e0d5e13ec8649f69dbc1e4
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/herbert/__pycache__/tokenization_herbert.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/herbert/__pycache__/tokenization_herbert_fast.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/herbert/__pycache__/tokenization_herbert_fast.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..266974c6ed409e1f0888f7b67b8cafe6f808f429
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/herbert/__pycache__/tokenization_herbert_fast.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/herbert/tokenization_herbert.py b/phivenv/Lib/site-packages/transformers/models/herbert/tokenization_herbert.py
new file mode 100644
index 0000000000000000000000000000000000000000..c1c6bacc87fc68bf5a4d998236e878fdf602a8f0
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/herbert/tokenization_herbert.py
@@ -0,0 +1,617 @@
+# coding=utf-8
+# Copyright 2020 The Google AI Language Team Authors, Allegro.pl, Facebook Inc. and the HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import json
+import os
+import re
+import unicodedata
+from typing import Optional
+
+from ...tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
+from ...utils import logging
+
+
+# Module-level logger, obtained from the library's own `logging` helper and named after this module.
+logger = logging.get_logger(__name__)
+
+# File names of the vocabulary and merges files. Used as `HerbertTokenizer.vocab_files_names` and to build the
+# output paths in `HerbertTokenizer.save_vocabulary`.
+VOCAB_FILES_NAMES = {
+    "vocab_file": "vocab.json",
+    "merges_file": "merges.txt",
+}
+
+
+# Copied from transformers.models.xlm.tokenization_xlm.get_pairs
+def get_pairs(word):
+    """
+    Return the set of adjacent symbol pairs in a word.
+
+    Args:
+        word: A sliceable sequence of symbols, typically a tuple of variable-length strings.
+
+    Returns:
+        `set` of `(left, right)` tuples, one per pair of neighbouring symbols. Empty when `word` has fewer than two
+        symbols (an empty `word` used to raise `IndexError`).
+    """
+    # zip() stops at the shorter input, so empty and single-symbol words naturally produce no pairs.
+    return set(zip(word, word[1:]))
+
+
+# Copied from transformers.models.xlm.tokenization_xlm.replace_unicode_punct
+def replace_unicode_punct(text):
+    """
+    Port of https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/replace-unicode-punctuation.perl
+
+    Replaces a fixed list of Unicode punctuation marks and full-width digits with ASCII equivalents and returns the
+    resulting string.
+    """
+    # The rules run one after another in exactly this order. A plain string is a literal replacement; a compiled
+    # pattern is a regex substitution (used where trailing whitespace must be swallowed as well).
+    rules = (
+        ("，", ","),
+        (re.compile(r"。\s*"), ". "),
+        ("、", ","),
+        ("”", '"'),
+        ("“", '"'),
+        ("∶", ":"),
+        ("：", ":"),
+        ("？", "?"),
+        ("《", '"'),
+        ("》", '"'),
+        ("）", ")"),
+        ("！", "!"),
+        ("（", "("),
+        ("；", ";"),
+        ("１", "1"),
+        ("」", '"'),
+        ("「", '"'),
+        ("０", "0"),
+        ("３", "3"),
+        ("２", "2"),
+        ("５", "5"),
+        ("６", "6"),
+        ("９", "9"),
+        ("７", "7"),
+        ("８", "8"),
+        ("４", "4"),
+        (re.compile(r"．\s*"), ". "),
+        ("～", "~"),
+        ("’", "'"),
+        ("…", "..."),
+        ("━", "-"),
+        ("〈", "<"),
+        ("〉", ">"),
+        ("【", "["),
+        ("】", "]"),
+        ("％", "%"),
+    )
+    for source, target in rules:
+        if isinstance(source, str):
+            text = text.replace(source, target)
+        else:
+            text = source.sub(target, text)
+    return text
+
+
+# Copied from transformers.models.xlm.tokenization_xlm.remove_non_printing_char
+def remove_non_printing_char(text):
+    """
+    Port of https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/remove-non-printing-char.perl
+
+    Returns `text` without the characters whose Unicode general category starts with "C".
+    """
+    return "".join(char for char in text if not unicodedata.category(char).startswith("C"))
+
+
+# Copied from transformers.models.bert.tokenization_bert.whitespace_tokenize
+def whitespace_tokenize(text):
+    """Strips `text` and splits it on runs of whitespace; returns an empty list when nothing is left."""
+    stripped = text.strip()
+    return stripped.split() if stripped else []
+
+
+# Copied from transformers.models.bert.tokenization_bert.BasicTokenizer
+class BasicTokenizer:
+    """
+    Runs basic tokenization: text cleanup, whitespace splitting, optional lower casing / accent stripping and
+    optional punctuation splitting.
+
+    Args:
+        do_lower_case (`bool`, *optional*, defaults to `True`):
+            Whether or not to lowercase each token.
+        never_split (`Iterable`, *optional*):
+            Tokens that are never lower-cased, accent-stripped or split on punctuation.
+        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
+            Whether or not to surround CJK ideographs with spaces so that each one becomes its own token.
+
+            This should likely be deactivated for Japanese (see this
+            [issue](https://github.com/huggingface/transformers/issues/328)).
+        strip_accents (`bool`, *optional*):
+            Whether or not to strip accents. When left as `None`, accents are stripped exactly when
+            `do_lower_case` is true.
+        do_split_on_punc (`bool`, *optional*, defaults to `True`):
+            Whether or not to split tokens on punctuation. Disabling it lets a later tokenization step see whole
+            words, such as contractions.
+    """
+
+    def __init__(
+        self,
+        do_lower_case=True,
+        never_split=None,
+        tokenize_chinese_chars=True,
+        strip_accents=None,
+        do_split_on_punc=True,
+    ):
+        self.do_lower_case = do_lower_case
+        self.never_split = set() if never_split is None else set(never_split)
+        self.tokenize_chinese_chars = tokenize_chinese_chars
+        self.strip_accents = strip_accents
+        self.do_split_on_punc = do_split_on_punc
+
+    def tokenize(self, text, never_split=None):
+        """
+        Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.
+
+        Args:
+            never_split (`List[str]`, *optional*)
+                Kept for backward compatibility purposes. Now implemented directly at the base class level (see
+                [`PreTrainedTokenizer.tokenize`]) List of token not to split.
+        """
+        # Tokens passed here are protected in addition to the ones given to the constructor.
+        protected = self.never_split.union(set(never_split)) if never_split else self.never_split
+        cleaned = self._clean_text(text)
+
+        # CJK handling was added on November 1st, 2018 for the multilingual and Chinese models. It is also
+        # applied to the English models, which is harmless: they were not trained on Chinese data, although
+        # their vocabulary has a few Chinese characters because English Wikipedia contains some Chinese words.
+        if self.tokenize_chinese_chars:
+            cleaned = self._tokenize_chinese_chars(cleaned)
+        # NFC keeps one character with several possible codepoint spellings from looking like different characters
+        normalized = unicodedata.normalize("NFC", cleaned)
+
+        pieces = []
+        for token in whitespace_tokenize(normalized):
+            if token not in protected:
+                if self.do_lower_case:
+                    token = token.lower()
+                    # With lower casing on, accents go unless stripping was explicitly disabled.
+                    wants_strip = self.strip_accents is not False
+                else:
+                    wants_strip = bool(self.strip_accents)
+                if wants_strip:
+                    token = self._run_strip_accents(token)
+            pieces.extend(self._run_split_on_punc(token, protected))
+
+        return whitespace_tokenize(" ".join(pieces))
+
+    def _run_strip_accents(self, text):
+        """Strips accents from a piece of text."""
+        decomposed = unicodedata.normalize("NFD", text)
+        # After NFD decomposition, accents are separate combining marks (category "Mn"); drop those.
+        return "".join(char for char in decomposed if unicodedata.category(char) != "Mn")
+
+    def _run_split_on_punc(self, text, never_split=None):
+        """Splits punctuation on a piece of text."""
+        if not self.do_split_on_punc:
+            return [text]
+        if never_split is not None and text in never_split:
+            return [text]
+
+        groups = []
+        inside_word = False
+        for char in text:
+            if _is_punctuation(char):
+                # Every punctuation character is a group of its own and ends the current word.
+                groups.append([char])
+                inside_word = False
+                continue
+            if not inside_word:
+                groups.append([])
+                inside_word = True
+            groups[-1].append(char)
+
+        return ["".join(group) for group in groups]
+
+    def _tokenize_chinese_chars(self, text):
+        """Adds whitespace around any CJK character."""
+        return "".join(f" {char} " if self._is_chinese_char(ord(char)) else char for char in text)
+
+    def _is_chinese_char(self, cp):
+        """Checks whether CP is the codepoint of a CJK character."""
+        # This defines a "chinese character" as anything in the CJK Unicode block:
+        #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
+        #
+        # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
+        # despite its name. The modern Korean Hangul alphabet is a different block,
+        # as is Japanese Hiragana and Katakana. Those alphabets are used to write
+        # space-separated words, so they are not treated specially and are handled
+        # like all of the other languages.
+        cjk_ranges = (
+            (0x4E00, 0x9FFF),
+            (0x3400, 0x4DBF),
+            (0x20000, 0x2A6DF),
+            (0x2A700, 0x2B73F),
+            (0x2B740, 0x2B81F),
+            (0x2B820, 0x2CEAF),
+            (0xF900, 0xFAFF),
+            (0x2F800, 0x2FA1F),
+        )
+        return any(low <= cp <= high for low, high in cjk_ranges)
+
+    def _clean_text(self, text):
+        """Performs invalid character removal and whitespace cleanup on text."""
+        kept = []
+        for char in text:
+            # Drop NUL, the Unicode replacement character and control characters.
+            if ord(char) in (0, 0xFFFD) or _is_control(char):
+                continue
+            kept.append(" " if _is_whitespace(char) else char)
+        return "".join(kept)
+
+
+class HerbertTokenizer(PreTrainedTokenizer):
+    """
+    Construct a BPE tokenizer for HerBERT.
+
+    Peculiarities:
+
+    - uses BERT's pre-tokenizer: `BasicTokenizer` splits tokens on spaces, and also on punctuation. Each occurrence
+      of a punctuation character will be treated separately.
+
+    - Such pretokenized input is BPE subtokenized
+
+    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the methods. Users should refer to the
+    superclass for more information regarding methods. Several helper methods are copied from `XLMTokenizer`.
+
+    Args:
+        vocab_file (`str`):
+            Path to the JSON vocabulary file mapping tokens to ids.
+        merges_file (`str`):
+            Path to the BPE merges file, one merge per line.
+        tokenizer_file (`str`, *optional*):
+            Accepted but not used: `tokenizer_file=None` is always forwarded to the base class.
+        cls_token, unk_token, pad_token, mask_token, sep_token, bos_token (`str`, *optional*):
+            Special tokens, forwarded to the base class.
+        do_lowercase_and_remove_accent (`bool`, *optional*, defaults to `False`):
+            Stored on the instance and exposed through `do_lower_case`. The pre-tokenizer built in `__init__` is
+            always created with `do_lower_case=False` and `strip_accents=False`.
+        additional_special_tokens (`List[str]`, *optional*, defaults to `<special0>` ... `<special9>`):
+            Additional special tokens, forwarded to the base class.
+        lang2id (`Dict[str, int]`, *optional*):
+            Stored on the instance and forwarded to the base class.
+        id2lang (`Dict[int, str]`, *optional*):
+            Stored on the instance and forwarded to the base class. When both mappings are given they must have
+            the same length.
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+
+    # Immutable default for `additional_special_tokens`; `__init__` hands a fresh list to the base class so that
+    # instances never share (and possibly mutate) a single default list object.
+    _DEFAULT_ADDITIONAL_SPECIAL_TOKENS = tuple(f"<special{index}>" for index in range(10))
+
+    def __init__(
+        self,
+        vocab_file,
+        merges_file,
+        tokenizer_file=None,
+        cls_token="<s>",
+        unk_token="<unk>",
+        pad_token="<pad>",
+        mask_token="<mask>",
+        sep_token="</s>",
+        bos_token="<s>",
+        do_lowercase_and_remove_accent=False,
+        additional_special_tokens=_DEFAULT_ADDITIONAL_SPECIAL_TOKENS,
+        lang2id=None,
+        id2lang=None,
+        **kwargs,
+    ):
+        try:
+            import sacremoses
+        except ImportError as err:
+            raise ImportError(
+                "You need to install sacremoses to use HerbertTokenizer. "
+                "See https://pypi.org/project/sacremoses/ for installation."
+            ) from err
+
+        self.sm = sacremoses
+
+        # Only the shared default is copied; an explicit argument (including `None`) is forwarded untouched.
+        if additional_special_tokens is self._DEFAULT_ADDITIONAL_SPECIAL_TOKENS:
+            additional_special_tokens = list(additional_special_tokens)
+
+        # cache of sm.MosesPunctNormalizer instance
+        self.cache_moses_punct_normalizer = {}
+        # cache of sm.MosesTokenizer instance
+        self.cache_moses_tokenizer = {}
+        self.lang_with_custom_tokenizer = {"zh", "th", "ja"}
+        # True for current supported model (v1.2.0), False for XLM-17 & 100
+        self.do_lowercase_and_remove_accent = do_lowercase_and_remove_accent
+        self.lang2id = lang2id
+        self.id2lang = id2lang
+        if lang2id is not None and id2lang is not None:
+            assert len(lang2id) == len(id2lang)
+
+        self.ja_word_tokenizer = None
+        self.zh_word_tokenizer = None
+
+        with open(vocab_file, encoding="utf-8") as vocab_handle:
+            self.encoder = json.load(vocab_handle)
+        self.decoder = {v: k for k, v in self.encoder.items()}
+        with open(merges_file, encoding="utf-8") as merges_handle:
+            # The element after the final newline is discarded.
+            merges = merges_handle.read().split("\n")[:-1]
+        merges = [tuple(merge.split()[:2]) for merge in merges]
+        # Earlier merges get a lower rank, i.e. a higher priority in `bpe`.
+        self.bpe_ranks = dict(zip(merges, range(len(merges))))
+        self.cache = {}
+
+        super().__init__(
+            unk_token=unk_token,
+            bos_token=bos_token,
+            sep_token=sep_token,
+            pad_token=pad_token,
+            cls_token=cls_token,
+            mask_token=mask_token,
+            additional_special_tokens=additional_special_tokens,
+            lang2id=lang2id,
+            id2lang=id2lang,
+            do_lowercase_and_remove_accent=do_lowercase_and_remove_accent,
+            tokenizer_file=None,
+            **kwargs,
+        )
+
+        # Built after `super().__init__` because it needs `self.all_special_tokens`.
+        self.bert_pre_tokenizer = BasicTokenizer(
+            do_lower_case=False,
+            never_split=self.all_special_tokens,
+            tokenize_chinese_chars=False,
+            strip_accents=False,
+        )
+
+    @property
+    # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.do_lower_case
+    def do_lower_case(self):
+        return self.do_lowercase_and_remove_accent
+
+    # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.moses_punct_norm
+    def moses_punct_norm(self, text, lang):
+        if lang not in self.cache_moses_punct_normalizer:
+            punct_normalizer = self.sm.MosesPunctNormalizer(lang=lang)
+            self.cache_moses_punct_normalizer[lang] = punct_normalizer
+        else:
+            punct_normalizer = self.cache_moses_punct_normalizer[lang]
+        return punct_normalizer.normalize(text)
+
+    # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.moses_tokenize
+    def moses_tokenize(self, text, lang):
+        if lang not in self.cache_moses_tokenizer:
+            moses_tokenizer = self.sm.MosesTokenizer(lang=lang)
+            self.cache_moses_tokenizer[lang] = moses_tokenizer
+        else:
+            moses_tokenizer = self.cache_moses_tokenizer[lang]
+        return moses_tokenizer.tokenize(text, return_str=False, escape=False)
+
+    # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.moses_pipeline
+    def moses_pipeline(self, text, lang):
+        text = replace_unicode_punct(text)
+        text = self.moses_punct_norm(text, lang)
+        text = remove_non_printing_char(text)
+        return text
+
+    # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.ja_tokenize
+    def ja_tokenize(self, text):
+        if self.ja_word_tokenizer is None:
+            try:
+                import Mykytea
+
+                self.ja_word_tokenizer = Mykytea.Mykytea(
+                    f"-model {os.path.expanduser('~')}/local/share/kytea/model.bin"
+                )
+            except (AttributeError, ImportError):
+                logger.error(
+                    "Make sure you install KyTea (https://github.com/neubig/kytea) and it's python wrapper"
+                    " (https://github.com/chezou/Mykytea-python) with the following steps"
+                )
+                logger.error("1. git clone git@github.com:neubig/kytea.git && cd kytea")
+                logger.error("2. autoreconf -i")
+                logger.error("3. ./configure --prefix=$HOME/local")
+                logger.error("4. make && make install")
+                logger.error("5. pip install kytea")
+                raise
+        return list(self.ja_word_tokenizer.getWS(text))
+
+    @property
+    # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.vocab_size
+    def vocab_size(self):
+        return len(self.encoder)
+
+    # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.get_vocab
+    def get_vocab(self):
+        return dict(self.encoder, **self.added_tokens_encoder)
+
+    # Adapted from transformers.models.xlm.tokenization_xlm.XLMTokenizer.bpe (cache is checked first)
+    def bpe(self, token):
+        """
+        Apply the BPE merges to one pre-token and return its sub-tokens joined by single spaces. The last symbol
+        carries the `</w>` end-of-word marker. Results are memoized in `self.cache`.
+        """
+        # Look in the cache before doing any work on the token.
+        if token in self.cache:
+            return self.cache[token]
+
+        word = tuple(token[:-1]) + (token[-1] + "</w>",)
+        pairs = get_pairs(word)
+
+        if not pairs:
+            return token + "</w>"
+
+        while True:
+            # Pick the adjacent pair with the lowest merge rank; unknown pairs rank as infinity.
+            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
+            if bigram not in self.bpe_ranks:
+                break
+            first, second = bigram
+            new_word = []
+            i = 0
+            while i < len(word):
+                try:
+                    j = word.index(first, i)
+                except ValueError:
+                    new_word.extend(word[i:])
+                    break
+                else:
+                    new_word.extend(word[i:j])
+                    i = j
+
+                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
+                    new_word.append(first + second)
+                    i += 2
+                else:
+                    new_word.append(word[i])
+                    i += 1
+            new_word = tuple(new_word)
+            word = new_word
+            if len(word) == 1:
+                break
+            else:
+                pairs = get_pairs(word)
+        word = " ".join(word)
+        if word == "\n  </w>":
+            word = "\n</w>"
+        self.cache[token] = word
+        return word
+
+    def _tokenize(self, text):
+        """Pre-tokenize `text` with the BERT-style pre-tokenizer, then BPE-split every non-empty pre-token."""
+        pre_tokens = self.bert_pre_tokenizer.tokenize(text)
+
+        split_tokens = []
+        for token in pre_tokens:
+            if token:
+                split_tokens.extend(list(self.bpe(token).split(" ")))
+
+        return split_tokens
+
+    # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer._convert_token_to_id
+    def _convert_token_to_id(self, token):
+        """Converts a token (str) in an id using the vocab."""
+        return self.encoder.get(token, self.encoder.get(self.unk_token))
+
+    # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer._convert_id_to_token
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) in a token (str) using the vocab."""
+        return self.decoder.get(index, self.unk_token)
+
+    # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.convert_tokens_to_string
+    def convert_tokens_to_string(self, tokens):
+        """Converts a sequence of tokens (string) in a single string."""
+        out_string = "".join(tokens).replace("</w>", " ").strip()
+        return out_string
+
+    # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.build_inputs_with_special_tokens
+    def build_inputs_with_special_tokens(
+        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
+    ) -> list[int]:
+        """
+        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
+        adding special tokens. An XLM sequence has the following format:
+
+        - single sequence: `<s> X </s>`
+        - pair of sequences: `<s> A </s> B </s>`
+
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs to which the special tokens will be added.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
+
+        """
+        bos = [self.bos_token_id]
+        sep = [self.sep_token_id]
+
+        if token_ids_1 is None:
+            return bos + token_ids_0 + sep
+        return bos + token_ids_0 + sep + token_ids_1 + sep
+
+    # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.get_special_tokens_mask
+    def get_special_tokens_mask(
+        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False
+    ) -> list[int]:
+        """
+        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
+        special tokens using the tokenizer `prepare_for_model` method.
+
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
+                Whether or not the token list is already formatted with special tokens for the model.
+
+        Returns:
+            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+        """
+
+        if already_has_special_tokens:
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+            )
+
+        if token_ids_1 is not None:
+            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
+        return [1] + ([0] * len(token_ids_0)) + [1]
+
+    # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.save_vocabulary
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
+        """
+        Write the vocabulary (JSON) and the BPE merges (one per line, in rank order) into `save_directory`.
+
+        Returns the `(vocab_file, merge_file)` paths, or `None` after logging an error when `save_directory` is
+        not a directory.
+        """
+        if not os.path.isdir(save_directory):
+            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+            return
+        vocab_file = os.path.join(
+            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+        )
+        merge_file = os.path.join(
+            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
+        )
+
+        with open(vocab_file, "w", encoding="utf-8") as f:
+            f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")
+
+        index = 0
+        with open(merge_file, "w", encoding="utf-8") as writer:
+            for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
+                if index != token_index:
+                    logger.warning(
+                        f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
+                        " Please check that the tokenizer is not corrupted!"
+                    )
+                    index = token_index
+                writer.write(" ".join(bpe_tokens) + "\n")
+                index += 1
+
+        return vocab_file, merge_file
+
+    # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.__getstate__
+    def __getstate__(self):
+        # The `sacremoses` module object cannot be pickled; `__setstate__` re-imports it.
+        state = self.__dict__.copy()
+        state["sm"] = None
+        return state
+
+    # Adapted from transformers.models.xlm.tokenization_xlm.XLMTokenizer.__setstate__ (error names this class)
+    def __setstate__(self, d):
+        self.__dict__ = d
+
+        try:
+            import sacremoses
+        except ImportError as err:
+            raise ImportError(
+                "You need to install sacremoses to use HerbertTokenizer. "
+                "See https://pypi.org/project/sacremoses/ for installation."
+            ) from err
+
+        self.sm = sacremoses
+
+
+# Names exported by this module (this is what a star-import of the module exposes).
+__all__ = ["HerbertTokenizer"]
diff --git a/phivenv/Lib/site-packages/transformers/models/herbert/tokenization_herbert_fast.py b/phivenv/Lib/site-packages/transformers/models/herbert/tokenization_herbert_fast.py
new file mode 100644
index 0000000000000000000000000000000000000000..fdc24e3c6a6e20a847043379f898fead1beb819f
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/herbert/tokenization_herbert_fast.py
@@ -0,0 +1,133 @@
+# coding=utf-8
+# Copyright 2020 The Google AI Language Team Authors, Allegro.pl, Facebook Inc. and the HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Optional
+
+from ...tokenization_utils_fast import PreTrainedTokenizerFast
+from ...utils import logging
+from .tokenization_herbert import HerbertTokenizer
+
+
+logger = logging.get_logger(__name__)
+
+VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"}
+
+
+class HerbertTokenizerFast(PreTrainedTokenizerFast):
+    """
+    Construct a "Fast" BPE tokenizer for HerBERT (backed by HuggingFace's *tokenizers* library).
+
+    Peculiarities:
+
+    - uses BERT's pre-tokenizer: BertPreTokenizer splits tokens on spaces, and also on punctuation. Each occurrence of
+      a punctuation character will be treated separately.
+
+    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the methods. Users should refer to
+    the superclass for more information regarding methods.
+
+    Args:
+        vocab_file (`str`):
+            Path to the vocabulary file.
+        merges_file (`str`):
+            Path to the merges file.
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    slow_tokenizer_class = HerbertTokenizer
+
+    def __init__(
+        self,
+        vocab_file=None,
+        merges_file=None,
+        tokenizer_file=None,
+        cls_token="<s>",
+        unk_token="<unk>",
+        pad_token="<pad>",
+        mask_token="<mask>",
+        sep_token="</s>",
+        **kwargs,
+    ):
+        super().__init__(
+            vocab_file,
+            merges_file,
+            tokenizer_file=tokenizer_file,
+            cls_token=cls_token,
+            unk_token=unk_token,
+            pad_token=pad_token,
+            mask_token=mask_token,
+            sep_token=sep_token,
+            **kwargs,
+        )
+
+    def build_inputs_with_special_tokens(
+        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
+    ) -> list[int]:
+        """
+        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating
+        and adding special tokens. A HerBERT sequence, like a BERT sequence, has the following format:
+
+        - single sequence: `<s> X </s>`
+        - pair of sequences: `<s> A </s> B </s>`
+
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs to which the special tokens will be added.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
+        """
+
+        cls = [self.cls_token_id]
+        sep = [self.sep_token_id]
+        if token_ids_1 is None:
+            return cls + token_ids_0 + sep
+
+        return cls + token_ids_0 + sep + token_ids_1 + sep
+
+    def get_special_tokens_mask(
+        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False
+    ) -> list[int]:
+        """
+        Retrieve the special-tokens mask for a token list (by default one that has no special tokens added yet). This
+        method is called when adding special tokens using the tokenizer `prepare_for_model` method.
+
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
+                Whether or not the token list is already formatted with special tokens for the model.
+
+        Returns:
+            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+        """
+        if already_has_special_tokens:
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+            )
+
+        if token_ids_1 is None:
+            return [1] + ([0] * len(token_ids_0)) + [1]
+        return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
+
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
+        files = self._tokenizer.model.save(save_directory, name=filename_prefix)
+        return tuple(files)
+
+
+__all__ = ["HerbertTokenizerFast"]
diff --git a/phivenv/Lib/site-packages/transformers/models/hgnet_v2/__init__.py b/phivenv/Lib/site-packages/transformers/models/hgnet_v2/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..1bf549dccf1d5b6bcb58f9337d4c97ca41536ce4
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/hgnet_v2/__init__.py
@@ -0,0 +1,29 @@
+# Copyright 2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_hgnet_v2 import *
+    from .modeling_hgnet_v2 import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/phivenv/Lib/site-packages/transformers/models/hgnet_v2/__pycache__/__init__.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/hgnet_v2/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bd8bcf7ad8c754203abc1afe4f19434f9ed7c01e
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/hgnet_v2/__pycache__/__init__.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/hgnet_v2/__pycache__/configuration_hgnet_v2.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/hgnet_v2/__pycache__/configuration_hgnet_v2.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9b92b1429931a6bacbbe8ee8ab604475cc84f824
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/hgnet_v2/__pycache__/configuration_hgnet_v2.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/hgnet_v2/__pycache__/modeling_hgnet_v2.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/hgnet_v2/__pycache__/modeling_hgnet_v2.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2bb5597350efe0e8cf21cbfd3df93a8827857a89
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/hgnet_v2/__pycache__/modeling_hgnet_v2.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/hgnet_v2/__pycache__/modular_hgnet_v2.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/hgnet_v2/__pycache__/modular_hgnet_v2.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fa872a8d04adb74d2c244d35d095a740d51c1be4
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/hgnet_v2/__pycache__/modular_hgnet_v2.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/hgnet_v2/configuration_hgnet_v2.py b/phivenv/Lib/site-packages/transformers/models/hgnet_v2/configuration_hgnet_v2.py
new file mode 100644
index 0000000000000000000000000000000000000000..09ce13dec29f1fa8df01edf611ee4566d812c61f
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/hgnet_v2/configuration_hgnet_v2.py
@@ -0,0 +1,152 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/hgnet_v2/modular_hgnet_v2.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_hgnet_v2.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# coding=utf-8
+# Copyright 2025 Baidu Inc and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from ...configuration_utils import PretrainedConfig
+from ...utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices
+
+
+# TODO: The modular conversion for resnet must be fixed, as
+# it generates an incorrect configuration import such as resnet_resnet
+class HGNetV2Config(BackboneConfigMixin, PretrainedConfig):
+    """
+    This is the configuration class to store the configuration of a [`HGNetV2Backbone`]. It is used to instantiate a HGNet-V2
+    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+    defaults will yield a similar configuration to that of D-FINE-X-COCO B4 [ustc-community/dfine_x_coco](https://huggingface.co/ustc-community/dfine_x_coco).
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        num_channels (`int`, *optional*, defaults to 3):
+            The number of input channels.
+        embedding_size (`int`, *optional*, defaults to 64):
+            Dimensionality (hidden size) for the embedding layer.
+        depths (`list[int]`, *optional*, defaults to `[3, 4, 6, 3]`):
+            Depth (number of layers) for each stage.
+        hidden_sizes (`list[int]`, *optional*, defaults to `[256, 512, 1024, 2048]`):
+            Dimensionality (hidden size) at each stage.
+        hidden_act (`str`, *optional*, defaults to `"relu"`):
+            The non-linear activation function in each block. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"`
+            are supported.
+        out_features (`list[str]`, *optional*):
+            If used as backbone, list of features to output. Can be any of `"stem"`, `"stage1"`, `"stage2"`, etc.
+            (depending on how many stages the model has). If unset and `out_indices` is set, will default to the
+            corresponding stages. If unset and `out_indices` is unset, will default to the last stage. Must be in the
+            same order as defined in the `stage_names` attribute.
+        out_indices (`list[int]`, *optional*):
+            If used as backbone, list of indices of features to output. Can be any of 0, 1, 2, etc. (depending on how
+            many stages the model has). If unset and `out_features` is set, will default to the corresponding stages.
+            If unset and `out_features` is unset, will default to the last stage. Must be in the
+            same order as defined in the `stage_names` attribute.
+        stem_channels (`list[int]`, *optional*, defaults to `[3, 32, 48]`):
+            Channel dimensions for the stem layers:
+            - First number (3) is input image channels
+            - Second number (32) is intermediate stem channels
+            - Third number (48) is output stem channels
+        stage_in_channels (`list[int]`, *optional*, defaults to `[48, 128, 512, 1024]`):
+            Input channel dimensions for each stage of the backbone.
+            This defines how many channels the input to each stage will have.
+        stage_mid_channels (`list[int]`, *optional*, defaults to `[48, 96, 192, 384]`):
+            Mid-channel dimensions for each stage of the backbone.
+            This defines the number of channels used in the intermediate layers of each stage.
+        stage_out_channels (`list[int]`, *optional*, defaults to `[128, 512, 1024, 2048]`):
+            Output channel dimensions for each stage of the backbone.
+            This defines how many channels the output of each stage will have.
+        stage_num_blocks (`list[int]`, *optional*, defaults to `[1, 1, 3, 1]`):
+            Number of blocks to be used in each stage of the backbone.
+            This controls the depth of each stage by specifying how many convolutional blocks to stack.
+        stage_downsample (`list[bool]`, *optional*, defaults to `[False, True, True, True]`):
+            Indicates whether to downsample the feature maps at each stage.
+            If `True`, the spatial dimensions of the feature maps will be reduced.
+        stage_light_block (`list[bool]`, *optional*, defaults to `[False, False, True, True]`):
+            Indicates whether to use light blocks in each stage.
+            Light blocks are a variant of convolutional blocks that may have fewer parameters.
+        stage_kernel_size (`list[int]`, *optional*, defaults to `[3, 3, 5, 5]`):
+            Kernel sizes for the convolutional layers in each stage.
+        stage_numb_of_layers (`list[int]`, *optional*, defaults to `[6, 6, 6, 6]`):
+            Number of layers to be used in each block of the stage.
+        use_learnable_affine_block (`bool`, *optional*, defaults to `False`):
+            Whether to use Learnable Affine Blocks (LAB) in the network.
+            LAB adds learnable scale and bias parameters after certain operations.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+    """
+
+    model_type = "hgnet_v2"
+
+    def __init__(
+        self,
+        num_channels=3,
+        embedding_size=64,
+        depths=[3, 4, 6, 3],
+        hidden_sizes=[256, 512, 1024, 2048],
+        hidden_act="relu",
+        out_features=None,
+        out_indices=None,
+        stem_channels=[3, 32, 48],
+        stage_in_channels=[48, 128, 512, 1024],
+        stage_mid_channels=[48, 96, 192, 384],
+        stage_out_channels=[128, 512, 1024, 2048],
+        stage_num_blocks=[1, 1, 3, 1],
+        stage_downsample=[False, True, True, True],
+        stage_light_block=[False, False, True, True],
+        stage_kernel_size=[3, 3, 5, 5],
+        stage_numb_of_layers=[6, 6, 6, 6],
+        use_learnable_affine_block=False,
+        initializer_range=0.02,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.num_channels = num_channels
+        self.embedding_size = embedding_size
+        self.depths = depths
+        self.hidden_sizes = hidden_sizes
+        self.hidden_act = hidden_act
+        self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(depths) + 1)]
+        self._out_features, self._out_indices = get_aligned_output_features_output_indices(
+            out_features=out_features, out_indices=out_indices, stage_names=self.stage_names
+        )
+        self.stem_channels = stem_channels
+        self.stage_in_channels = stage_in_channels
+        self.stage_mid_channels = stage_mid_channels
+        self.stage_out_channels = stage_out_channels
+        self.stage_num_blocks = stage_num_blocks
+        self.stage_downsample = stage_downsample
+        self.stage_light_block = stage_light_block
+        self.stage_kernel_size = stage_kernel_size
+        self.stage_numb_of_layers = stage_numb_of_layers
+        self.use_learnable_affine_block = use_learnable_affine_block
+        self.initializer_range = initializer_range
+
+        if not (
+            len(stage_in_channels)
+            == len(stage_mid_channels)
+            == len(stage_out_channels)
+            == len(stage_num_blocks)
+            == len(stage_downsample)
+            == len(stage_light_block)
+            == len(stage_kernel_size)
+            == len(stage_numb_of_layers)
+        ):
+            raise ValueError("All stage configuration lists must have the same length.")
+
+
+__all__ = ["HGNetV2Config"]
diff --git a/phivenv/Lib/site-packages/transformers/models/hgnet_v2/modeling_hgnet_v2.py b/phivenv/Lib/site-packages/transformers/models/hgnet_v2/modeling_hgnet_v2.py
new file mode 100644
index 0000000000000000000000000000000000000000..c042dadf176d2a142e066e527df2f92e9a5b66b5
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/hgnet_v2/modeling_hgnet_v2.py
@@ -0,0 +1,495 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/hgnet_v2/modular_hgnet_v2.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_hgnet_v2.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# coding=utf-8
+# Copyright 2025 Baidu Inc and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from typing import Optional
+
+import torch
+import torch.nn.functional as F
+from torch import Tensor, nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+
+from ...activations import ACT2FN
+from ...modeling_outputs import BackboneOutput, BaseModelOutputWithNoAttention, ImageClassifierOutputWithNoAttention
+from ...modeling_utils import PreTrainedModel
+from ...utils import auto_docstring
+from ...utils.backbone_utils import BackboneMixin
+from .configuration_hgnet_v2 import HGNetV2Config
+
+
+# General docstring
+
+
+@auto_docstring
+class HGNetV2PreTrainedModel(PreTrainedModel):
+    config: HGNetV2Config
+    base_model_prefix = "hgnetv2"
+    main_input_name = "pixel_values"
+    _no_split_modules = ["HGNetV2BasicLayer"]
+
+
+class HGNetV2LearnableAffineBlock(nn.Module):
+    def __init__(self, scale_value: float = 1.0, bias_value: float = 0.0):
+        super().__init__()
+        self.scale = nn.Parameter(torch.tensor([scale_value]), requires_grad=True)
+        self.bias = nn.Parameter(torch.tensor([bias_value]), requires_grad=True)
+
+    def forward(self, hidden_state: Tensor) -> Tensor:
+        hidden_state = self.scale * hidden_state + self.bias
+        return hidden_state
+
+
+class HGNetV2ConvLayer(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: int,
+        stride: int = 1,
+        groups: int = 1,
+        activation: str = "relu",
+        use_learnable_affine_block: bool = False,
+    ):
+        super().__init__()
+        self.convolution = nn.Conv2d(
+            in_channels,
+            out_channels,
+            kernel_size=kernel_size,
+            stride=stride,
+            groups=groups,
+            padding=(kernel_size - 1) // 2,
+            bias=False,
+        )
+        self.normalization = nn.BatchNorm2d(out_channels)
+        self.activation = ACT2FN[activation] if activation is not None else nn.Identity()
+        if activation and use_learnable_affine_block:
+            self.lab = HGNetV2LearnableAffineBlock()
+        else:
+            self.lab = nn.Identity()
+
+    def forward(self, input: Tensor) -> Tensor:
+        hidden_state = self.convolution(input)
+        hidden_state = self.normalization(hidden_state)
+        hidden_state = self.activation(hidden_state)
+        hidden_state = self.lab(hidden_state)
+        return hidden_state
+
+
+class HGNetV2ConvLayerLight(nn.Module):
+    def __init__(
+        self, in_channels: int, out_channels: int, kernel_size: int, use_learnable_affine_block: bool = False
+    ):
+        super().__init__()
+        self.conv1 = HGNetV2ConvLayer(
+            in_channels,
+            out_channels,
+            kernel_size=1,
+            activation=None,
+            use_learnable_affine_block=use_learnable_affine_block,
+        )
+        self.conv2 = HGNetV2ConvLayer(
+            out_channels,
+            out_channels,
+            kernel_size=kernel_size,
+            groups=out_channels,
+            use_learnable_affine_block=use_learnable_affine_block,
+        )
+
+    def forward(self, hidden_state: Tensor) -> Tensor:
+        hidden_state = self.conv1(hidden_state)
+        hidden_state = self.conv2(hidden_state)
+        return hidden_state
+
+
+class HGNetV2Embeddings(nn.Module):
+    def __init__(self, config: HGNetV2Config):
+        super().__init__()
+
+        self.stem1 = HGNetV2ConvLayer(
+            config.stem_channels[0],
+            config.stem_channels[1],
+            kernel_size=3,
+            stride=2,
+            activation=config.hidden_act,
+            use_learnable_affine_block=config.use_learnable_affine_block,
+        )
+        self.stem2a = HGNetV2ConvLayer(
+            config.stem_channels[1],
+            config.stem_channels[1] // 2,
+            kernel_size=2,
+            stride=1,
+            activation=config.hidden_act,
+            use_learnable_affine_block=config.use_learnable_affine_block,
+        )
+        self.stem2b = HGNetV2ConvLayer(
+            config.stem_channels[1] // 2,
+            config.stem_channels[1],
+            kernel_size=2,
+            stride=1,
+            activation=config.hidden_act,
+            use_learnable_affine_block=config.use_learnable_affine_block,
+        )
+        self.stem3 = HGNetV2ConvLayer(
+            config.stem_channels[1] * 2,
+            config.stem_channels[1],
+            kernel_size=3,
+            stride=2,
+            activation=config.hidden_act,
+            use_learnable_affine_block=config.use_learnable_affine_block,
+        )
+        self.stem4 = HGNetV2ConvLayer(
+            config.stem_channels[1],
+            config.stem_channels[2],
+            kernel_size=1,
+            stride=1,
+            activation=config.hidden_act,
+            use_learnable_affine_block=config.use_learnable_affine_block,
+        )
+
+        self.pool = nn.MaxPool2d(kernel_size=2, stride=1, ceil_mode=True)
+        self.num_channels = config.num_channels
+
+    def forward(self, pixel_values: Tensor) -> Tensor:
+        num_channels = pixel_values.shape[1]
+        if num_channels != self.num_channels:
+            raise ValueError(
+                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
+            )
+        embedding = self.stem1(pixel_values)
+        embedding = F.pad(embedding, (0, 1, 0, 1))
+        emb_stem_2a = self.stem2a(embedding)
+        emb_stem_2a = F.pad(emb_stem_2a, (0, 1, 0, 1))
+        emb_stem_2a = self.stem2b(emb_stem_2a)
+        pooled_emb = self.pool(embedding)
+        embedding = torch.cat([pooled_emb, emb_stem_2a], dim=1)
+        embedding = self.stem3(embedding)
+        embedding = self.stem4(embedding)
+        return embedding
+
+
+class HGNetV2BasicLayer(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        middle_channels: int,
+        out_channels: int,
+        layer_num: int,
+        kernel_size: int = 3,
+        residual: bool = False,
+        light_block: bool = False,
+        drop_path: float = 0.0,
+        use_learnable_affine_block: bool = False,
+    ):
+        super().__init__()
+        self.residual = residual
+
+        self.layers = nn.ModuleList()
+        for i in range(layer_num):
+            temp_in_channels = in_channels if i == 0 else middle_channels
+            if light_block:
+                block = HGNetV2ConvLayerLight(
+                    in_channels=temp_in_channels,
+                    out_channels=middle_channels,
+                    kernel_size=kernel_size,
+                    use_learnable_affine_block=use_learnable_affine_block,
+                )
+            else:
+                block = HGNetV2ConvLayer(
+                    in_channels=temp_in_channels,
+                    out_channels=middle_channels,
+                    kernel_size=kernel_size,
+                    use_learnable_affine_block=use_learnable_affine_block,
+                    stride=1,
+                )
+            self.layers.append(block)
+
+        # feature aggregation
+        total_channels = in_channels + layer_num * middle_channels
+        aggregation_squeeze_conv = HGNetV2ConvLayer(
+            total_channels,
+            out_channels // 2,
+            kernel_size=1,
+            stride=1,
+            use_learnable_affine_block=use_learnable_affine_block,
+        )
+        aggregation_excitation_conv = HGNetV2ConvLayer(
+            out_channels // 2,
+            out_channels,
+            kernel_size=1,
+            stride=1,
+            use_learnable_affine_block=use_learnable_affine_block,
+        )
+        self.aggregation = nn.Sequential(
+            aggregation_squeeze_conv,
+            aggregation_excitation_conv,
+        )
+        self.drop_path = nn.Dropout(drop_path) if drop_path else nn.Identity()
+
+    def forward(self, hidden_state: Tensor) -> Tensor:
+        identity = hidden_state
+        output = [hidden_state]
+        for layer in self.layers:
+            hidden_state = layer(hidden_state)
+            output.append(hidden_state)
+        hidden_state = torch.cat(output, dim=1)
+        hidden_state = self.aggregation(hidden_state)
+        if self.residual:
+            hidden_state = self.drop_path(hidden_state) + identity
+        return hidden_state
+
+
+class HGNetV2Stage(nn.Module):
+    def __init__(self, config: HGNetV2Config, stage_index: int, drop_path: float = 0.0):
+        super().__init__()
+        in_channels = config.stage_in_channels[stage_index]
+        mid_channels = config.stage_mid_channels[stage_index]
+        out_channels = config.stage_out_channels[stage_index]
+        num_blocks = config.stage_num_blocks[stage_index]
+        num_layers = config.stage_numb_of_layers[stage_index]
+        downsample = config.stage_downsample[stage_index]
+        light_block = config.stage_light_block[stage_index]
+        kernel_size = config.stage_kernel_size[stage_index]
+        use_learnable_affine_block = config.use_learnable_affine_block
+
+        if downsample:
+            self.downsample = HGNetV2ConvLayer(
+                in_channels, in_channels, kernel_size=3, stride=2, groups=in_channels, activation=None
+            )
+        else:
+            self.downsample = nn.Identity()
+
+        blocks_list = []
+        for i in range(num_blocks):
+            blocks_list.append(
+                HGNetV2BasicLayer(
+                    in_channels if i == 0 else out_channels,
+                    mid_channels,
+                    out_channels,
+                    num_layers,
+                    residual=(i != 0),
+                    kernel_size=kernel_size,
+                    light_block=light_block,
+                    drop_path=drop_path,
+                    use_learnable_affine_block=use_learnable_affine_block,
+                )
+            )
+        self.blocks = nn.ModuleList(blocks_list)
+
+    def forward(self, hidden_state: Tensor) -> Tensor:
+        hidden_state = self.downsample(hidden_state)
+        for block in self.blocks:
+            hidden_state = block(hidden_state)
+        return hidden_state
+
+
+class HGNetV2Encoder(nn.Module):
+    def __init__(self, config: HGNetV2Config):
+        super().__init__()
+        self.stages = nn.ModuleList([])
+        for stage_index in range(len(config.stage_in_channels)):
+            resnet_stage = HGNetV2Stage(config, stage_index)
+            self.stages.append(resnet_stage)
+
+    def forward(
+        self, hidden_state: Tensor, output_hidden_states: bool = False, return_dict: bool = True
+    ) -> BaseModelOutputWithNoAttention:
+        hidden_states = () if output_hidden_states else None
+
+        for stage in self.stages:
+            if output_hidden_states:
+                hidden_states = hidden_states + (hidden_state,)
+
+            hidden_state = stage(hidden_state)
+
+        if output_hidden_states:
+            hidden_states = hidden_states + (hidden_state,)
+
+        if not return_dict:
+            return tuple(v for v in [hidden_state, hidden_states] if v is not None)
+
+        return BaseModelOutputWithNoAttention(
+            last_hidden_state=hidden_state,
+            hidden_states=hidden_states,
+        )
+
+
+class HGNetV2Backbone(HGNetV2PreTrainedModel, BackboneMixin):
+    has_attentions = False
+
+    def __init__(self, config: HGNetV2Config):
+        super().__init__(config)
+        super()._init_backbone(config)
+        self.depths = config.depths
+        self.num_features = [config.embedding_size] + config.hidden_sizes
+        self.embedder = HGNetV2Embeddings(config)
+        self.encoder = HGNetV2Encoder(config)
+
+        # initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self, pixel_values: Tensor, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None
+    ) -> BackboneOutput:
+        r"""
+        Examples:
+
+        ```python
+        >>> from transformers import HGNetV2Config, HGNetV2Backbone
+        >>> import torch
+
+        >>> config = HGNetV2Config()
+        >>> model = HGNetV2Backbone(config)
+
+        >>> pixel_values = torch.randn(1, 3, 224, 224)
+
+        >>> with torch.no_grad():
+        ...     outputs = model(pixel_values)
+
+        >>> feature_maps = outputs.feature_maps
+        >>> list(feature_maps[-1].shape)
+        [1, 2048, 7, 7]
+        ```"""
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+
+        embedding_output = self.embedder(pixel_values)
+
+        outputs = self.encoder(embedding_output, output_hidden_states=True, return_dict=True)
+
+        hidden_states = outputs.hidden_states
+
+        feature_maps = ()
+        for idx, stage in enumerate(self.stage_names):
+            if stage in self.out_features:
+                feature_maps += (hidden_states[idx],)
+
+        if not return_dict:
+            output = (feature_maps,)
+            if output_hidden_states:
+                output += (outputs.hidden_states,)
+            return output
+
+        return BackboneOutput(
+            feature_maps=feature_maps,
+            hidden_states=outputs.hidden_states if output_hidden_states else None,
+            attentions=None,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    HGNetV2 Model with an image classification head on top (a linear layer on top of the pooled features), e.g. for
+    ImageNet.
+    """
+)
+class HGNetV2ForImageClassification(HGNetV2PreTrainedModel):
+    def __init__(self, config: HGNetV2Config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+        self.embedder = HGNetV2Embeddings(config)
+        self.encoder = HGNetV2Encoder(config)
+        self.avg_pool = nn.AdaptiveAvgPool2d((1, 1))
+        self.flatten = nn.Flatten()
+        self.fc = nn.Linear(config.hidden_sizes[-1], config.num_labels) if config.num_labels > 0 else nn.Identity()
+
+        # classification head
+        self.classifier = nn.ModuleList([self.avg_pool, self.flatten])
+
+        # initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> ImageClassifierOutputWithNoAttention:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+
+        Examples:
+        ```python
+        >>> import torch
+        >>> import requests
+        >>> from transformers import HGNetV2ForImageClassification, AutoImageProcessor
+        >>> from PIL import Image
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> model = HGNetV2ForImageClassification.from_pretrained("ustc-community/hgnet-v2")
+        >>> processor = AutoImageProcessor.from_pretrained("ustc-community/hgnet-v2")
+
+        >>> inputs = processor(images=image, return_tensors="pt")
+        >>> with torch.no_grad():
+        ...     outputs = model(**inputs)
+        >>> outputs.logits.shape
+        torch.Size([1, 2])
+        ```"""
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        embedding_output = self.embedder(pixel_values)
+        outputs = self.encoder(embedding_output, output_hidden_states=output_hidden_states, return_dict=return_dict)
+        last_hidden_state = outputs[0]
+        for layer in self.classifier:
+            last_hidden_state = layer(last_hidden_state)
+        logits = self.fc(last_hidden_state)
+        loss = None
+
+        if labels is not None:
+            if self.config.problem_type is None:
+                if self.num_labels == 1:
+                    self.config.problem_type = "regression"
+                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
+                    self.config.problem_type = "single_label_classification"
+                else:
+                    self.config.problem_type = "multi_label_classification"
+            if self.config.problem_type == "regression":
+                loss_fct = MSELoss()
+                if self.num_labels == 1:
+                    loss = loss_fct(logits.squeeze(), labels.squeeze())
+                else:
+                    loss = loss_fct(logits, labels)
+            elif self.config.problem_type == "single_label_classification":
+                loss_fct = CrossEntropyLoss()
+                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+            elif self.config.problem_type == "multi_label_classification":
+                loss_fct = BCEWithLogitsLoss()
+                loss = loss_fct(logits, labels)
+
+        if not return_dict:
+            output = (logits,) + outputs[2:]
+            return (loss,) + output if loss is not None else output
+
+        return ImageClassifierOutputWithNoAttention(loss=loss, logits=logits, hidden_states=outputs.hidden_states)
+
+
+__all__ = ["HGNetV2Backbone", "HGNetV2PreTrainedModel", "HGNetV2ForImageClassification"]
diff --git a/phivenv/Lib/site-packages/transformers/models/hgnet_v2/modular_hgnet_v2.py b/phivenv/Lib/site-packages/transformers/models/hgnet_v2/modular_hgnet_v2.py
new file mode 100644
index 0000000000000000000000000000000000000000..3bfd2313470149acbb1c01bac553978c3d2a5233
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/hgnet_v2/modular_hgnet_v2.py
@@ -0,0 +1,618 @@
+# coding=utf-8
+# Copyright 2025 Baidu Inc and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from typing import Optional
+
+import torch
+import torch.nn.functional as F
+from torch import Tensor, nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+
+from ...configuration_utils import PretrainedConfig
+from ...modeling_outputs import (
+    BackboneOutput,
+    BaseModelOutputWithNoAttention,
+    ImageClassifierOutputWithNoAttention,
+)
+from ...modeling_utils import PreTrainedModel
+from ...utils import (
+    auto_docstring,
+)
+from ...utils.backbone_utils import BackboneConfigMixin, BackboneMixin, get_aligned_output_features_output_indices
+from ..rt_detr.modeling_rt_detr_resnet import RTDetrResNetConvLayer
+
+
+# TODO: Modular conversion for resnet must be fixed as
+# it provides incorrect import for configuration like resnet_resnet
+class HGNetV2Config(BackboneConfigMixin, PretrainedConfig):
+    """
+    This is the configuration class to store the configuration of a [`HGNetV2Backbone`]. It is used to instantiate a HGNet-V2
+    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+    defaults will yield a similar configuration to that of D-FINE-X-COCO B4 "[ustc-community/dfine_x_coco"](https://huggingface.co/ustc-community/dfine_x_coco").
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        num_channels (`int`, *optional*, defaults to 3):
+            The number of input channels.
+        embedding_size (`int`, *optional*, defaults to 64):
+            Dimensionality (hidden size) for the embedding layer.
+        depths (`list[int]`, *optional*, defaults to `[3, 4, 6, 3]`):
+            Depth (number of layers) for each stage.
+        hidden_sizes (`list[int]`, *optional*, defaults to `[256, 512, 1024, 2048]`):
+            Dimensionality (hidden size) at each stage.
+        hidden_act (`str`, *optional*, defaults to `"relu"`):
+            The non-linear activation function in each block. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"`
+            are supported.
+        out_features (`list[str]`, *optional*):
+            If used as backbone, list of features to output. Can be any of `"stem"`, `"stage1"`, `"stage2"`, etc.
+            (depending on how many stages the model has). If unset and `out_indices` is set, will default to the
+            corresponding stages. If unset and `out_indices` is unset, will default to the last stage. Must be in the
+            same order as defined in the `stage_names` attribute.
+        out_indices (`list[int]`, *optional*):
+            If used as backbone, list of indices of features to output. Can be any of 0, 1, 2, etc. (depending on how
+            many stages the model has). If unset and `out_features` is set, will default to the corresponding stages.
+            If unset and `out_features` is unset, will default to the last stage. Must be in the
+            same order as defined in the `stage_names` attribute.
+        stem_channels (`list[int]`, *optional*, defaults to `[3, 32, 48]`):
+            Channel dimensions for the stem layers:
+            - First number (3) is input image channels
+            - Second number (32) is intermediate stem channels
+            - Third number (48) is output stem channels
+        stage_in_channels (`list[int]`, *optional*, defaults to `[48, 128, 512, 1024]`):
+            Input channel dimensions for each stage of the backbone.
+            This defines how many channels the input to each stage will have.
+        stage_mid_channels (`list[int]`, *optional*, defaults to `[48, 96, 192, 384]`):
+            Mid-channel dimensions for each stage of the backbone.
+            This defines the number of channels used in the intermediate layers of each stage.
+        stage_out_channels (`list[int]`, *optional*, defaults to `[128, 512, 1024, 2048]`):
+            Output channel dimensions for each stage of the backbone.
+            This defines how many channels the output of each stage will have.
+        stage_num_blocks (`list[int]`, *optional*, defaults to `[1, 1, 3, 1]`):
+            Number of blocks to be used in each stage of the backbone.
+            This controls the depth of each stage by specifying how many convolutional blocks to stack.
+        stage_downsample (`list[bool]`, *optional*, defaults to `[False, True, True, True]`):
+            Indicates whether to downsample the feature maps at each stage.
+            If `True`, the spatial dimensions of the feature maps will be reduced.
+        stage_light_block (`list[bool]`, *optional*, defaults to `[False, False, True, True]`):
+            Indicates whether to use light blocks in each stage.
+            Light blocks are a variant of convolutional blocks that may have fewer parameters.
+        stage_kernel_size (`list[int]`, *optional*, defaults to `[3, 3, 5, 5]`):
+            Kernel sizes for the convolutional layers in each stage.
+        stage_numb_of_layers (`list[int]`, *optional*, defaults to `[6, 6, 6, 6]`):
+            Number of layers to be used in each block of the stage.
+        use_learnable_affine_block (`bool`, *optional*, defaults to `False`):
+            Whether to use Learnable Affine Blocks (LAB) in the network.
+            LAB adds learnable scale and bias parameters after certain operations.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+    """
+
+    model_type = "hgnet_v2"
+
+    def __init__(
+        self,
+        num_channels=3,
+        embedding_size=64,
+        depths=[3, 4, 6, 3],
+        hidden_sizes=[256, 512, 1024, 2048],
+        hidden_act="relu",
+        out_features=None,
+        out_indices=None,
+        stem_channels=[3, 32, 48],
+        stage_in_channels=[48, 128, 512, 1024],
+        stage_mid_channels=[48, 96, 192, 384],
+        stage_out_channels=[128, 512, 1024, 2048],
+        stage_num_blocks=[1, 1, 3, 1],
+        stage_downsample=[False, True, True, True],
+        stage_light_block=[False, False, True, True],
+        stage_kernel_size=[3, 3, 5, 5],
+        stage_numb_of_layers=[6, 6, 6, 6],
+        use_learnable_affine_block=False,
+        initializer_range=0.02,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.num_channels = num_channels
+        self.embedding_size = embedding_size
+        self.depths = depths
+        self.hidden_sizes = hidden_sizes
+        self.hidden_act = hidden_act
+        self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(depths) + 1)]
+        self._out_features, self._out_indices = get_aligned_output_features_output_indices(
+            out_features=out_features, out_indices=out_indices, stage_names=self.stage_names
+        )
+        self.stem_channels = stem_channels
+        self.stage_in_channels = stage_in_channels
+        self.stage_mid_channels = stage_mid_channels
+        self.stage_out_channels = stage_out_channels
+        self.stage_num_blocks = stage_num_blocks
+        self.stage_downsample = stage_downsample
+        self.stage_light_block = stage_light_block
+        self.stage_kernel_size = stage_kernel_size
+        self.stage_numb_of_layers = stage_numb_of_layers
+        self.use_learnable_affine_block = use_learnable_affine_block
+        self.initializer_range = initializer_range
+
+        if not (
+            len(stage_in_channels)
+            == len(stage_mid_channels)
+            == len(stage_out_channels)
+            == len(stage_num_blocks)
+            == len(stage_downsample)
+            == len(stage_light_block)
+            == len(stage_kernel_size)
+            == len(stage_numb_of_layers)
+        ):
+            raise ValueError("All stage configuration lists must have the same length.")
+
+
+# General docstring
+
+
+@auto_docstring
+class HGNetV2PreTrainedModel(PreTrainedModel):
+    config: HGNetV2Config  # annotation only; no value is assigned here
+    base_model_prefix = "hgnetv2"  # NOTE(review): no attribute named `hgnetv2` is created in this file -- confirm
+    main_input_name = "pixel_values"  # same name as the first input of the `forward` methods in this file
+    _no_split_modules = ["HGNetV2BasicLayer"]  # refers to the `HGNetV2BasicLayer` class defined below
+
+
+class HGNetV2LearnableAffineBlock(nn.Module):
+    def __init__(self, scale_value: float = 1.0, bias_value: float = 0.0):
+        super().__init__()
+        self.scale = nn.Parameter(torch.tensor([scale_value]), requires_grad=True)
+        self.bias = nn.Parameter(torch.tensor([bias_value]), requires_grad=True)
+
+    def forward(self, hidden_state: Tensor) -> Tensor:
+        hidden_state = self.scale * hidden_state + self.bias
+        return hidden_state
+
+
+class HGNetV2ConvLayer(RTDetrResNetConvLayer):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: int,
+        stride: int = 1,
+        groups: int = 1,
+        activation: str = "relu",
+        use_learnable_affine_block: bool = False,
+    ):
+        super().__init__(in_channels, out_channels, kernel_size, stride, activation)
+        self.convolution = nn.Conv2d(
+            in_channels,
+            out_channels,
+            kernel_size=kernel_size,
+            stride=stride,
+            groups=groups,
+            padding=(kernel_size - 1) // 2,
+            bias=False,
+        )
+        if activation and use_learnable_affine_block:
+            self.lab = HGNetV2LearnableAffineBlock()
+        else:
+            self.lab = nn.Identity()
+
+    def forward(self, input: Tensor) -> Tensor:
+        hidden_state = self.convolution(input)
+        hidden_state = self.normalization(hidden_state)
+        hidden_state = self.activation(hidden_state)
+        hidden_state = self.lab(hidden_state)
+        return hidden_state
+
+
+class HGNetV2ConvLayerLight(nn.Module):
+    def __init__(
+        self, in_channels: int, out_channels: int, kernel_size: int, use_learnable_affine_block: bool = False
+    ):
+        super().__init__()
+        self.conv1 = HGNetV2ConvLayer(
+            in_channels,
+            out_channels,
+            kernel_size=1,
+            activation=None,
+            use_learnable_affine_block=use_learnable_affine_block,
+        )
+        self.conv2 = HGNetV2ConvLayer(
+            out_channels,
+            out_channels,
+            kernel_size=kernel_size,
+            groups=out_channels,
+            use_learnable_affine_block=use_learnable_affine_block,
+        )
+
+    def forward(self, hidden_state: Tensor) -> Tensor:
+        hidden_state = self.conv1(hidden_state)
+        hidden_state = self.conv2(hidden_state)
+        return hidden_state
+
+
+class HGNetV2Embeddings(nn.Module):
+    def __init__(self, config: HGNetV2Config):
+        super().__init__()
+
+        self.stem1 = HGNetV2ConvLayer(
+            config.stem_channels[0],
+            config.stem_channels[1],
+            kernel_size=3,
+            stride=2,
+            activation=config.hidden_act,
+            use_learnable_affine_block=config.use_learnable_affine_block,
+        )
+        self.stem2a = HGNetV2ConvLayer(
+            config.stem_channels[1],
+            config.stem_channels[1] // 2,
+            kernel_size=2,
+            stride=1,
+            activation=config.hidden_act,
+            use_learnable_affine_block=config.use_learnable_affine_block,
+        )
+        self.stem2b = HGNetV2ConvLayer(
+            config.stem_channels[1] // 2,
+            config.stem_channels[1],
+            kernel_size=2,
+            stride=1,
+            activation=config.hidden_act,
+            use_learnable_affine_block=config.use_learnable_affine_block,
+        )
+        self.stem3 = HGNetV2ConvLayer(
+            config.stem_channels[1] * 2,
+            config.stem_channels[1],
+            kernel_size=3,
+            stride=2,
+            activation=config.hidden_act,
+            use_learnable_affine_block=config.use_learnable_affine_block,
+        )
+        self.stem4 = HGNetV2ConvLayer(
+            config.stem_channels[1],
+            config.stem_channels[2],
+            kernel_size=1,
+            stride=1,
+            activation=config.hidden_act,
+            use_learnable_affine_block=config.use_learnable_affine_block,
+        )
+
+        self.pool = nn.MaxPool2d(kernel_size=2, stride=1, ceil_mode=True)
+        self.num_channels = config.num_channels
+
+    def forward(self, pixel_values: Tensor) -> Tensor:
+        num_channels = pixel_values.shape[1]
+        if num_channels != self.num_channels:
+            raise ValueError(
+                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
+            )
+        embedding = self.stem1(pixel_values)
+        embedding = F.pad(embedding, (0, 1, 0, 1))
+        emb_stem_2a = self.stem2a(embedding)
+        emb_stem_2a = F.pad(emb_stem_2a, (0, 1, 0, 1))
+        emb_stem_2a = self.stem2b(emb_stem_2a)
+        pooled_emb = self.pool(embedding)
+        embedding = torch.cat([pooled_emb, emb_stem_2a], dim=1)
+        embedding = self.stem3(embedding)
+        embedding = self.stem4(embedding)
+        return embedding
+
+
+class HGNetV2BasicLayer(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        middle_channels: int,
+        out_channels: int,
+        layer_num: int,
+        kernel_size: int = 3,
+        residual: bool = False,
+        light_block: bool = False,
+        drop_path: float = 0.0,
+        use_learnable_affine_block: bool = False,
+    ):
+        super().__init__()
+        self.residual = residual
+
+        self.layers = nn.ModuleList()
+        for i in range(layer_num):
+            temp_in_channels = in_channels if i == 0 else middle_channels
+            if light_block:
+                block = HGNetV2ConvLayerLight(
+                    in_channels=temp_in_channels,
+                    out_channels=middle_channels,
+                    kernel_size=kernel_size,
+                    use_learnable_affine_block=use_learnable_affine_block,
+                )
+            else:
+                block = HGNetV2ConvLayer(
+                    in_channels=temp_in_channels,
+                    out_channels=middle_channels,
+                    kernel_size=kernel_size,
+                    use_learnable_affine_block=use_learnable_affine_block,
+                    stride=1,
+                )
+            self.layers.append(block)
+
+        # feature aggregation
+        total_channels = in_channels + layer_num * middle_channels
+        aggregation_squeeze_conv = HGNetV2ConvLayer(
+            total_channels,
+            out_channels // 2,
+            kernel_size=1,
+            stride=1,
+            use_learnable_affine_block=use_learnable_affine_block,
+        )
+        aggregation_excitation_conv = HGNetV2ConvLayer(
+            out_channels // 2,
+            out_channels,
+            kernel_size=1,
+            stride=1,
+            use_learnable_affine_block=use_learnable_affine_block,
+        )
+        self.aggregation = nn.Sequential(
+            aggregation_squeeze_conv,
+            aggregation_excitation_conv,
+        )
+        self.drop_path = nn.Dropout(drop_path) if drop_path else nn.Identity()
+
+    def forward(self, hidden_state: Tensor) -> Tensor:
+        identity = hidden_state
+        output = [hidden_state]
+        for layer in self.layers:
+            hidden_state = layer(hidden_state)
+            output.append(hidden_state)
+        hidden_state = torch.cat(output, dim=1)
+        hidden_state = self.aggregation(hidden_state)
+        if self.residual:
+            hidden_state = self.drop_path(hidden_state) + identity
+        return hidden_state
+
+
+class HGNetV2Stage(nn.Module):
+    def __init__(self, config: HGNetV2Config, stage_index: int, drop_path: float = 0.0):
+        super().__init__()
+        in_channels = config.stage_in_channels[stage_index]
+        mid_channels = config.stage_mid_channels[stage_index]
+        out_channels = config.stage_out_channels[stage_index]
+        num_blocks = config.stage_num_blocks[stage_index]
+        num_layers = config.stage_numb_of_layers[stage_index]
+        downsample = config.stage_downsample[stage_index]
+        light_block = config.stage_light_block[stage_index]
+        kernel_size = config.stage_kernel_size[stage_index]
+        use_learnable_affine_block = config.use_learnable_affine_block
+
+        if downsample:
+            self.downsample = HGNetV2ConvLayer(
+                in_channels, in_channels, kernel_size=3, stride=2, groups=in_channels, activation=None
+            )
+        else:
+            self.downsample = nn.Identity()
+
+        blocks_list = []
+        for i in range(num_blocks):
+            blocks_list.append(
+                HGNetV2BasicLayer(
+                    in_channels if i == 0 else out_channels,
+                    mid_channels,
+                    out_channels,
+                    num_layers,
+                    residual=(i != 0),
+                    kernel_size=kernel_size,
+                    light_block=light_block,
+                    drop_path=drop_path,
+                    use_learnable_affine_block=use_learnable_affine_block,
+                )
+            )
+        self.blocks = nn.ModuleList(blocks_list)
+
+    def forward(self, hidden_state: Tensor) -> Tensor:
+        hidden_state = self.downsample(hidden_state)
+        for block in self.blocks:
+            hidden_state = block(hidden_state)
+        return hidden_state
+
+
+class HGNetV2Encoder(nn.Module):
+    def __init__(self, config: HGNetV2Config):
+        super().__init__()
+        self.stages = nn.ModuleList([])
+        for stage_index in range(len(config.stage_in_channels)):
+            resnet_stage = HGNetV2Stage(config, stage_index)
+            self.stages.append(resnet_stage)
+
+    def forward(
+        self, hidden_state: Tensor, output_hidden_states: bool = False, return_dict: bool = True
+    ) -> BaseModelOutputWithNoAttention:
+        hidden_states = () if output_hidden_states else None
+
+        for stage in self.stages:
+            if output_hidden_states:
+                hidden_states = hidden_states + (hidden_state,)
+
+            hidden_state = stage(hidden_state)
+
+        if output_hidden_states:
+            hidden_states = hidden_states + (hidden_state,)
+
+        if not return_dict:
+            return tuple(v for v in [hidden_state, hidden_states] if v is not None)
+
+        return BaseModelOutputWithNoAttention(
+            last_hidden_state=hidden_state,
+            hidden_states=hidden_states,
+        )
+
+
+class HGNetV2Backbone(HGNetV2PreTrainedModel, BackboneMixin):
+    has_attentions = False
+
+    def __init__(self, config: HGNetV2Config):
+        super().__init__(config)
+        super()._init_backbone(config)
+        self.depths = config.depths
+        self.num_features = [config.embedding_size] + config.hidden_sizes
+        self.embedder = HGNetV2Embeddings(config)
+        self.encoder = HGNetV2Encoder(config)
+
+        # initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self, pixel_values: Tensor, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None
+    ) -> BackboneOutput:
+        r"""
+        Examples:
+
+        ```python
+        >>> from transformers import HGNetV2Config, HGNetV2Backbone
+        >>> import torch
+
+        >>> config = HGNetV2Config()
+        >>> model = HGNetV2Backbone(config)
+
+        >>> pixel_values = torch.randn(1, 3, 224, 224)
+
+        >>> with torch.no_grad():
+        ...     outputs = model(pixel_values)
+
+        >>> feature_maps = outputs.feature_maps
+        >>> list(feature_maps[-1].shape)
+        [1, 2048, 7, 7]
+        ```"""
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+
+        embedding_output = self.embedder(pixel_values)
+
+        outputs = self.encoder(embedding_output, output_hidden_states=True, return_dict=True)
+
+        hidden_states = outputs.hidden_states
+
+        feature_maps = ()
+        for idx, stage in enumerate(self.stage_names):
+            if stage in self.out_features:
+                feature_maps += (hidden_states[idx],)
+
+        if not return_dict:
+            output = (feature_maps,)
+            if output_hidden_states:
+                output += (outputs.hidden_states,)
+            return output
+
+        return BackboneOutput(
+            feature_maps=feature_maps,
+            hidden_states=outputs.hidden_states if output_hidden_states else None,
+            attentions=None,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    HGNetV2 Model with an image classification head on top (a linear layer on top of the pooled features), e.g. for
+    ImageNet.
+    """
+)
+class HGNetV2ForImageClassification(HGNetV2PreTrainedModel):
+    def __init__(self, config: HGNetV2Config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+        self.embedder = HGNetV2Embeddings(config)
+        self.encoder = HGNetV2Encoder(config)
+        self.avg_pool = nn.AdaptiveAvgPool2d((1, 1))
+        self.flatten = nn.Flatten()
+        self.fc = nn.Linear(config.hidden_sizes[-1], config.num_labels) if config.num_labels > 0 else nn.Identity()
+
+        # classification head
+        self.classifier = nn.ModuleList([self.avg_pool, self.flatten])
+
+        # initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> ImageClassifierOutputWithNoAttention:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+
+        Examples:
+        ```python
+        >>> import torch
+        >>> import requests
+        >>> from transformers import HGNetV2ForImageClassification, AutoImageProcessor
+        >>> from PIL import Image
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> model = HGNetV2ForImageClassification.from_pretrained("ustc-community/hgnet-v2")
+        >>> processor = AutoImageProcessor.from_pretrained("ustc-community/hgnet-v2")
+
+        >>> inputs = processor(images=image, return_tensors="pt")
+        >>> with torch.no_grad():
+        ...     outputs = model(**inputs)
+        >>> outputs.logits.shape
+        torch.Size([1, 2])
+        ```"""
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        embedding_output = self.embedder(pixel_values)
+        outputs = self.encoder(embedding_output, output_hidden_states=output_hidden_states, return_dict=return_dict)
+        last_hidden_state = outputs[0]
+        for layer in self.classifier:
+            last_hidden_state = layer(last_hidden_state)
+        logits = self.fc(last_hidden_state)
+        loss = None
+
+        if labels is not None:
+            if self.config.problem_type is None:
+                if self.num_labels == 1:
+                    self.config.problem_type = "regression"
+                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
+                    self.config.problem_type = "single_label_classification"
+                else:
+                    self.config.problem_type = "multi_label_classification"
+            if self.config.problem_type == "regression":
+                loss_fct = MSELoss()
+                if self.num_labels == 1:
+                    loss = loss_fct(logits.squeeze(), labels.squeeze())
+                else:
+                    loss = loss_fct(logits, labels)
+            elif self.config.problem_type == "single_label_classification":
+                loss_fct = CrossEntropyLoss()
+                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+            elif self.config.problem_type == "multi_label_classification":
+                loss_fct = BCEWithLogitsLoss()
+                loss = loss_fct(logits, labels)
+
+        if not return_dict:
+            output = (logits,) + outputs[2:]
+            return (loss,) + output if loss is not None else output
+
+        return ImageClassifierOutputWithNoAttention(loss=loss, logits=logits, hidden_states=outputs.hidden_states)
+
+
+__all__ = ["HGNetV2Config", "HGNetV2Backbone", "HGNetV2PreTrainedModel", "HGNetV2ForImageClassification"]
diff --git a/phivenv/Lib/site-packages/transformers/models/hiera/__init__.py b/phivenv/Lib/site-packages/transformers/models/hiera/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..841f13be4c0d2f48f54eecc916acd826395449af
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/hiera/__init__.py
@@ -0,0 +1,27 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:  # static type checkers only: make the real symbols visible through eager star-imports
+    from .configuration_hiera import *
+    from .modeling_hiera import *
+else:  # at runtime: replace this package's entry in sys.modules with a _LazyModule built from this file
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/phivenv/Lib/site-packages/transformers/models/hiera/__pycache__/__init__.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/hiera/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4f5bfe6a154424fd03ccc6f7a4fa9319f5613e1d
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/hiera/__pycache__/__init__.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/hiera/__pycache__/configuration_hiera.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/hiera/__pycache__/configuration_hiera.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..04d36dcaceb70d9d174c7ffcd3483f46d7d9fb60
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/hiera/__pycache__/configuration_hiera.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/hiera/__pycache__/modeling_hiera.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/hiera/__pycache__/modeling_hiera.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..751cf4c182a705943e4687d829d19c299e3936eb
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/hiera/__pycache__/modeling_hiera.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/hiera/configuration_hiera.py b/phivenv/Lib/site-packages/transformers/models/hiera/configuration_hiera.py
new file mode 100644
index 0000000000000000000000000000000000000000..2342d7e562a50de0c0937040a8e8279c7860e931
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/hiera/configuration_hiera.py
@@ -0,0 +1,194 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Hiera model configuration"""
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+from ...utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices
+
+
+logger = logging.get_logger(__name__)
+
+
+class HieraConfig(BackboneConfigMixin, PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`HieraModel`]. It is used to instantiate a Hiera
+    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+    defaults will yield a similar configuration to that of the Hiera
+    [facebook/hiera-base-224](https://huggingface.co/facebook/hiera-base-224) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        embed_dim (`int`, *optional*, defaults to 96):
+            Dimensionality of patch embedding.
+        image_size (`list(int)`, *optional*, defaults to `[224, 224]`):
+            The size (resolution) of input in the format (height, width) for images
+            and (frames, height, width) for videos.
+        patch_size (`list(int)`, *optional*, defaults to `[7, 7]`):
+            The size (resolution) of each patch.
+        patch_stride (`list(int)`, *optional*, defaults to `[4, 4]`):
+            The stride of the patch.
+        patch_padding (`list(int)`, *optional*, defaults to `[3, 3]`):
+            The padding of the patch.
+        mlp_ratio (`float`, *optional*, defaults to 4.0):
+            The ratio of mlp hidden dim to embedding dim.
+        depths (`list(int)`, *optional*, defaults to `[2, 3, 16, 3]`):
+            Depth of each layer in the Transformer encoder.
+        num_heads (`list(int)`, *optional*, defaults to `[1, 2, 4, 8]`):
+            Number of attention heads in each layer of the Transformer encoder.
+        embed_dim_multiplier (`float`, *optional*, defaults to 2.0):
+            The multiplier to the dimensionality of patch embedding in each layer of the Transformer encoder.
+        num_query_pool (`int`, *optional*, defaults to 3):
+            The number of query pool stages.
+        query_stride (`list(int)`, *optional*, defaults to `[2, 2]`):
+            The stride of the query pool.
+        masked_unit_size (`list(int)`, *optional*, defaults to `[8, 8]`):
+            The size of the masked unit.
+        masked_unit_attention (`list(bool)`, *optional*, defaults to `[True, True, False, False]`):
+            Whether to use masked unit attention in each layer of the Transformer encoder.
+        drop_path_rate (`float`, *optional*, defaults to 0.0):
+            The drop path rate.
+        num_channels (`int`, *optional*, defaults to 3):
+            The number of input channels.
+        hidden_act (`str`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder. If string, `"gelu"`, `"relu"`,
+            `"selu"` and `"gelu_new"` are supported.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices and
+            the zero_initializer for initializing all bias vectors.
+        layer_norm_init (`float`, *optional*, defaults to 1.0):
+            The initial weight value for layer normalization layers.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-06):
+            The epsilon used by the layer normalization layers.
+        decoder_hidden_size (`int`, *optional*):
+            Dimensionality of decoder embeddings for MAE pretraining.
+        decoder_depth (`int`, *optional*):
+            Depth of the decoder for MAE pretraining.
+        decoder_num_heads (`int`, *optional*):
+            Number of attention heads in each layer of the decoder for MAE pretraining.
+        normalize_pixel_loss (`bool`, *optional*, defaults to `True`):
+            Whether to normalize the pixel loss by the number of pixels.
+        mask_ratio (`float`, *optional*, defaults to 0.6):
+            The ratio of masked tokens in the input.
+        out_features (`list[str]`, *optional*):
+            If used as backbone, list of features to output. Can be any of `"stem"`, `"stage1"`, `"stage2"`, etc.
+            (depending on how many stages the model has). If unset and `out_indices` is set, will default to the
+            corresponding stages. If unset and `out_indices` is unset, will default to the last stage. Must be in the
+            same order as defined in the `stage_names` attribute.
+        out_indices (`list[int]`, *optional*):
+            If used as backbone, list of indices of features to output. Can be any of 0, 1, 2, etc. (depending on how
+            many stages the model has). If unset and `out_features` is set, will default to the corresponding stages.
+            If unset and `out_features` is unset, will default to the last stage. Must be in the
+            same order as defined in the `stage_names` attribute.
+
+
+    Example:
+
+    ```python
+    >>> from transformers import HieraConfig, HieraModel
+
+    >>> # Initializing a Hiera hiera-base-224 style configuration
+    >>> configuration = HieraConfig()
+
+    >>> # Initializing a model (with random weights) from the hiera-base-224 style configuration
+    >>> model = HieraModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "hiera"
+
+    attribute_map = {"num_hidden_layers": "num_layers"}  # alias: `num_hidden_layers` maps to `num_layers`
+
+    def __init__(
+        self,
+        embed_dim=96,
+        image_size=[224, 224],  # NOTE: list defaults are shared objects and are stored by reference below; never mutate in place
+        patch_size=[7, 7],
+        patch_stride=[4, 4],
+        patch_padding=[3, 3],
+        mlp_ratio=4.0,
+        depths=[2, 3, 16, 3],
+        num_heads=[1, 2, 4, 8],
+        embed_dim_multiplier=2.0,
+        num_query_pool=3,
+        query_stride=[2, 2],
+        masked_unit_size=[8, 8],
+        masked_unit_attention=[True, True, False, False],
+        drop_path_rate=0.0,
+        num_channels=3,
+        hidden_act="gelu",
+        initializer_range=0.02,
+        layer_norm_init=1.0,
+        layer_norm_eps=1e-6,
+        decoder_hidden_size=None,
+        decoder_depth=None,
+        decoder_num_heads=None,
+        normalize_pixel_loss=True,
+        mask_ratio=0.6,
+        out_features=None,
+        out_indices=None,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        if masked_unit_size[0] % query_stride[0] ** (len(depths) - 1) != 0:  # only dimension 0 is validated here
+            raise ValueError(
+                f"masked_unit_size[0] ({masked_unit_size[0]}) must be divisible by query_stride[0] ({query_stride[0]}) "
+                f"raised to the power of the number of layers ({len(depths) - 1})"
+            )
+
+        if num_query_pool >= len(depths):
+            raise ValueError(
+                f"num_query_pool ({num_query_pool}) must be less than the number of layers ({len(depths)})"
+            )
+
+        self.embed_dim = embed_dim
+        self.image_size = image_size
+        self.patch_size = patch_size
+        self.patch_stride = patch_stride
+        self.patch_padding = patch_padding
+        self.mlp_ratio = mlp_ratio
+        self.depths = depths
+        self.num_heads = num_heads
+        self.num_layers = len(depths)  # derived from `depths`; also reachable as `num_hidden_layers` via attribute_map
+        self.embed_dim_multiplier = embed_dim_multiplier
+        self.num_query_pool = num_query_pool
+        self.query_stride = query_stride
+        self.masked_unit_size = masked_unit_size
+        self.masked_unit_attention = masked_unit_attention
+        self.drop_path_rate = drop_path_rate
+        self.num_channels = num_channels
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.layer_norm_init = layer_norm_init
+        self.layer_norm_eps = layer_norm_eps
+        self.decoder_hidden_size = decoder_hidden_size
+        self.decoder_depth = decoder_depth
+        self.decoder_num_heads = decoder_num_heads
+        self.normalize_pixel_loss = normalize_pixel_loss
+        self.mask_ratio = mask_ratio
+        # we set the hidden_size attribute in order to make Hiera work with VisionEncoderDecoderModel
+        # this indicates the channel dimension after the last stage of the model
+        self.hidden_size = int(embed_dim * embed_dim_multiplier ** (len(depths) - 1))
+        self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(depths) + 1)]  # "stem" + one "stageN" per `depths` entry
+        self._out_features, self._out_indices = get_aligned_output_features_output_indices(
+            out_features=out_features, out_indices=out_indices, stage_names=self.stage_names
+        )
+
+
+__all__ = ["HieraConfig"]
diff --git a/phivenv/Lib/site-packages/transformers/models/hiera/modeling_hiera.py b/phivenv/Lib/site-packages/transformers/models/hiera/modeling_hiera.py
new file mode 100644
index 0000000000000000000000000000000000000000..2fcd827e89ce987a8cd8999da84311bef5a6cbe3
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/hiera/modeling_hiera.py
@@ -0,0 +1,1462 @@
+# coding=utf-8
+# Copyright 2024 Meta and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch Hiera model."""
+
+import math
+from dataclasses import dataclass
+from typing import Optional, Union
+
+import torch
+import torch.utils.checkpoint
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+
+from ...activations import ACT2FN
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import (
+    BackboneOutput,
+    BaseModelOutput,
+    BaseModelOutputWithPooling,
+    ImageClassifierOutput,
+    ModelOutput,
+)
+from ...modeling_utils import PreTrainedModel
+from ...utils import auto_docstring, logging, torch_int
+from ...utils.backbone_utils import BackboneMixin
+from .configuration_hiera import HieraConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Hiera encoder's outputs, with potential hidden states and attentions.
+    """
+)
+class HieraEncoderOutput(ModelOutput):
+    r"""
+    reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
+        shape `(batch_size, height, width, hidden_size)`. These are the reshaped and re-rolled hidden states of the model.
+
+        Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
+        include the spatial dimensions.
+    """
+
+    last_hidden_state: Optional[torch.FloatTensor] = None
+    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
+    attentions: Optional[tuple[torch.FloatTensor, ...]] = None
+    reshaped_hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None  # channels-last (batch_size, height, width, hidden_size), per the docstring
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Hiera model's outputs that also contains a pooling of the last hidden states.
+    """
+)
+class HieraModelOutput(ModelOutput):
+    r"""
+    pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`, *optional*, returned when `add_pooling_layer=True` is passed):
+        Average pooling of the last layer hidden-state.
+    bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, sequence_length)`):
+        Tensor indicating which patches are masked (0) and which are not (1).
+    ids_restore (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+        Tensor containing the original index of the (shuffled) masked patches.
+    reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
+        shape `(batch_size, height, width, hidden_size)`. These are the reshaped and re-rolled hidden states of the model.
+
+        Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
+        include the spatial dimensions.
+    """
+
+    last_hidden_state: Optional[torch.FloatTensor] = None
+    pooler_output: Optional[torch.FloatTensor] = None
+    bool_masked_pos: Optional[torch.BoolTensor] = None  # defaults to None, hence Optional like the sibling fields
+    ids_restore: Optional[torch.LongTensor] = None
+    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
+    attentions: Optional[tuple[torch.FloatTensor, ...]] = None
+    reshaped_hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Hiera image classification outputs.
+    """
+)
+class HieraForImageClassificationOutput(ImageClassifierOutput):
+    r"""
+    loss (`torch.FloatTensor` of shape `(1,)`, *optional*):
+        Loss value for the training task.
+    logits (`torch.FloatTensor` of shape `(batch_size, num_labels)`):
+        Prediction scores of the classification head (logits of the output layer).
+    hidden_states (`tuple(torch.FloatTensor)`, *optional*):
+        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
+        shape `(batch_size, sequence_length, hidden_size)`. These are the unrolled hidden states of the model.
+
+        Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+    attentions (`tuple(torch.FloatTensor)`, *optional*):
+        Tuple of `torch.FloatTensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length,
+        sequence_length)`.
+
+        Attention weights after the attention softmax, used to compute the weighted average in the self-attention
+        heads.
+    reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
+        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
+        shape `(batch_size, height, width, hidden_size)`. These are the reshaped and re-rolled hidden states of the model.
+
+        Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
+        include the spatial dimensions.
+    """
+
+    loss: Optional[torch.FloatTensor] = None
+    logits: Optional[torch.FloatTensor] = None
+    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
+    attentions: Optional[tuple[torch.FloatTensor, ...]] = None
+    reshaped_hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Class for HieraForPreTraining's outputs, with potential hidden states and attentions.
+    """
+)
+class HieraForPreTrainingOutput(ModelOutput):
+    r"""
+    loss (`torch.FloatTensor` of shape `(1,)`):
+        Pixel reconstruction loss.
+    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, patch_size ** 2 * num_channels)`):
+        Pixel reconstruction logits.
+    bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, sequence_length)`):
+        Tensor indicating which patches are masked (0) and which are not (1).
+    ids_restore (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+        Tensor containing the original index of the (shuffled) masked patches.
+    reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
+        shape `(batch_size, height, width, hidden_size)`. Hidden-states of the model at the output of each layer
+        plus the initial embedding outputs reshaped to include the spatial dimensions.
+    """
+
+    loss: Optional[torch.FloatTensor] = None
+    logits: Optional[torch.FloatTensor] = None
+    bool_masked_pos: Optional[torch.BoolTensor] = None  # defaults to None, hence Optional like the sibling fields
+    ids_restore: Optional[torch.LongTensor] = None
+    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None  # variable-length tuples, matching the other output classes
+    attentions: Optional[tuple[torch.FloatTensor, ...]] = None
+    reshaped_hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
+
+
+class HieraPatchEmbeddings(nn.Module):
+    """
+    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
+    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
+    Transformer.
+    """
+
+    def __init__(self, config, is_mae: bool = False):
+        super().__init__()
+
+        # Only 2D (image) inputs are supported: any other number of spatial dimensions is rejected below
+        self.spatial_dims = len(config.patch_size)
+        if self.spatial_dims != 2:
+            raise ValueError(f"The number of dimensions of the input image should be 2, but got {self.spatial_dims}.")
+        self.num_channels = config.num_channels
+        self.image_size = config.image_size[-2:]  # keep only the two trailing entries (height, width)
+        self.tokens_spatial_shape = [i // s for i, s in zip(config.image_size, config.patch_stride)]  # image size // patch stride, per dimension
+        self.mask_spatial_shape = [i // s for i, s in zip(self.tokens_spatial_shape, config.masked_unit_size)]  # mask units per dimension
+        self.mask_ratio = config.mask_ratio
+        self.is_mae = is_mae  # when True, forward() draws a random mask before projecting
+        self.projection = nn.Conv2d(
+            self.num_channels,
+            config.embed_dim,
+            kernel_size=config.patch_size,
+            stride=config.patch_stride,
+            padding=config.patch_padding,
+        )
+
+    def masked_conv(
+        self, pixel_values: torch.FloatTensor, bool_masked_pos: Optional[torch.BoolTensor] = None
+    ) -> torch.Tensor:
+        """Zero-out the masked regions of the input before conv.
+        Prevents leakage of masked regions when using overlapping kernels.
+        """
+        if bool_masked_pos is None:
+            return self.projection(pixel_values)
+
+        target_size = pixel_values.shape[2:]
+        # Reshape bool_masked_pos to (batch_size, 1, mask_unit_height, mask_unit_width)
+        bool_masked_pos = bool_masked_pos.view(pixel_values.shape[0], 1, *self.mask_spatial_shape)
+
+        bool_masked_pos = nn.functional.interpolate(bool_masked_pos.float(), size=target_size)  # upsample to pixel resolution (default mode is "nearest")
+
+        return self.projection(pixel_values * bool_masked_pos)  # mask is 1 = keep, 0 = zeroed out
+
+    def random_masking(
+        self, pixel_values: torch.FloatTensor, noise: Optional[torch.FloatTensor] = None
+    ) -> tuple[torch.BoolTensor, torch.LongTensor]:
+        """
+        Perform per-sample random masking by per-sample shuffling. Per-sample shuffling is done by argsort random
+        noise.
+
+        Args:
+            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`)
+            noise (`torch.FloatTensor` of shape `(batch_size, num_mask_units)`, *optional*) which is
+                mainly used for testing purposes to control randomness and maintain the reproducibility
+        """
+        batch_size = pixel_values.shape[0]
+        # Tokens selected for masking at mask unit level
+        num_windows = math.prod(self.mask_spatial_shape)
+        len_keep = int(num_windows * (1 - self.mask_ratio))  # number of mask units kept visible (truncated toward zero)
+
+        if noise is None:
+            noise = torch.rand(batch_size, num_windows, device=pixel_values.device)
+
+        # Sort noise for each sample
+        ids_shuffle = torch.argsort(noise, dim=1)
+        # ascend: small is keep, large is remove
+        ids_restore = torch.argsort(ids_shuffle, dim=1).to(pixel_values.device)
+
+        # Generate the binary bool_masked_pos: 1 is *keep*, 0 is *remove*
+        # Note this is opposite to original MAE
+        bool_masked_pos = torch.zeros([batch_size, num_windows], device=pixel_values.device)
+        bool_masked_pos[:, :len_keep] = 1
+        # Unshuffle to get the binary bool_masked_pos
+        bool_masked_pos = torch.gather(bool_masked_pos, dim=1, index=ids_restore).bool()
+
+        return bool_masked_pos, ids_restore  # (batch_size, num_windows) keep-mask and the indices that undo the shuffle
+
+    def forward(
+        self,
+        pixel_values: torch.FloatTensor,
+        noise: Optional[torch.FloatTensor] = None,
+    ) -> tuple[torch.Tensor, Optional[torch.BoolTensor], Optional[torch.LongTensor]]:
+        (bool_masked_pos, ids_restore) = (
+            self.random_masking(pixel_values, noise=noise) if self.is_mae else (None, None)
+        )
+
+        embeddings = self.masked_conv(pixel_values, bool_masked_pos)
+        embeddings = embeddings.flatten(2).transpose(2, 1)  # (batch, channels, h, w) -> (batch, h * w, channels)
+
+        return embeddings, bool_masked_pos, ids_restore  # mask and restore ids are None unless is_mae
+
+
+class HieraEmbeddings(nn.Module):
+    """
+    Construct position and patch embeddings.
+    """
+
+    def __init__(self, config: HieraConfig, is_mae: bool = False) -> None:
+        super().__init__()
+        self.patch_stride = config.patch_stride
+        tokens_spatial_shape = [i // s for i, s in zip(config.image_size, config.patch_stride)]  # image size // patch stride, per dimension
+        self.mask_spatial_shape = [i // s for i, s in zip(tokens_spatial_shape, config.masked_unit_size)]
+        self.num_tokens = math.prod(tokens_spatial_shape)
+        self.is_mae = is_mae
+
+        self.patch_embeddings = HieraPatchEmbeddings(config, is_mae=is_mae)
+
+        self.position_embeddings = nn.Parameter(torch.zeros(1, self.num_tokens, config.embed_dim))  # one learnable vector per token, created as zeros
+
+    def interpolate_pos_encoding(
+        self, embeddings: torch.Tensor, pos_embeds: torch.Tensor, height: int, width: int
+    ) -> torch.Tensor:
+        """
+        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
+        images. This method is also adapted to support torch.jit tracing, no class embeddings, and different patch strides.
+
+        Adapted from:
+        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
+        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
+        """
+
+        num_patches = embeddings.shape[1]
+        num_positions = pos_embeds.shape[1]
+
+        # always interpolate when tracing to ensure the exported model works for dynamic input shapes
+        if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
+            return pos_embeds
+
+        dim = embeddings.shape[-1]
+
+        new_height = height // self.patch_stride[0]
+        new_width = width // self.patch_stride[1]
+
+        sqrt_num_positions = torch_int(num_positions**0.5)  # NOTE(review): assumes the stored position grid is square - confirm for non-square image_size
+        pos_embeds = pos_embeds.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
+        pos_embeds = pos_embeds.permute(0, 3, 1, 2)  # channels-first layout for interpolate: (1, dim, grid, grid)
+
+        pos_embeds = nn.functional.interpolate(
+            pos_embeds,
+            size=(new_height, new_width),
+            mode="bicubic",
+            align_corners=False,
+        )
+
+        pos_embeds = pos_embeds.permute(0, 2, 3, 1).view(1, -1, dim)  # back to (1, new_height * new_width, dim)
+        return pos_embeds
+
+    def get_position_embedding(
+        self, embeddings: torch.Tensor, height: int, width: int, interpolate_pos_encoding: bool
+    ) -> torch.FloatTensor:
+        return (  # resized table when interpolation is requested, otherwise the stored parameter as-is
+            self.interpolate_pos_encoding(embeddings, self.position_embeddings, height, width)
+            if interpolate_pos_encoding
+            else self.position_embeddings
+        )
+
+    def forward(
+        self,
+        pixel_values: torch.FloatTensor,
+        noise: Optional[torch.FloatTensor] = None,
+        interpolate_pos_encoding: bool = False,
+    ) -> tuple[torch.Tensor, Optional[torch.BoolTensor], Optional[torch.LongTensor]]:
+        height, width = pixel_values.shape[-2:]
+        embeddings, bool_masked_pos, ids_restore = self.patch_embeddings(pixel_values, noise=noise)
+        embeddings = embeddings + self.get_position_embedding(embeddings, height, width, interpolate_pos_encoding)  # leading dim 1 broadcasts over the batch
+        return embeddings, bool_masked_pos, ids_restore
+
+
+class HieraMaskUnitAttention(nn.Module):
+    """
+    Computes either Mask Unit or Global Attention. Also is able to perform query pooling.
+
+    Note: this assumes the tokens have already been flattened and unrolled into mask units.
+    """
+
+    def __init__(
+        self,
+        hidden_size: int,
+        hidden_size_output: int,
+        num_heads: int,
+        query_stride: int = 1,
+        window_size: int = 0,
+        use_mask_unit_attn: bool = False,
+    ) -> None:
+        super().__init__()
+        self.num_heads = num_heads
+        self.query_stride = query_stride
+        self.hidden_size_output = hidden_size_output
+
+        self.head_dim = hidden_size_output // num_heads
+        self.scale = (self.head_dim) ** -0.5  # 1 / sqrt(head_dim), applied to the queries in forward()
+
+        self.qkv = nn.Linear(hidden_size, 3 * hidden_size_output)  # single projection producing query, key and value
+        self.proj = nn.Linear(hidden_size_output, hidden_size_output)
+
+        self.window_size = window_size
+        self.use_mask_unit_attn = use_mask_unit_attn
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        head_mask: Optional[torch.FloatTensor] = None,
+        output_attentions: bool = False,
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        """Input should be of shape [batch, tokens, channels]."""
+        batch_size, seq_len, _ = hidden_states.shape
+
+        num_windows = 1  # global attention: all tokens form a single window
+        if self.use_mask_unit_attn:
+            num_windows = seq_len // (self.query_stride * self.window_size)  # NOTE: window_size must be non-zero here (the default 0 would divide by zero)
+
+        qkv = self.qkv(hidden_states)
+        qkv = qkv.reshape(batch_size, -1, num_windows, 3, self.num_heads, self.head_dim)  # (batch, tokens, windows, 3, heads, head_dim)
+        qkv = qkv.permute(3, 0, 4, 2, 1, 5)  # -> (3, batch, heads, windows, tokens, head_dim)
+
+        query, key, value = qkv.unbind(0)
+
+        if self.query_stride > 1:
+            # Refer to unroll to see how this performs a maxpool-Nd
+            query = query.view(batch_size, self.num_heads, num_windows, self.query_stride, -1, self.head_dim)
+            query = query.max(dim=3).values  # max over the query_stride axis, so there are fewer queries than keys
+
+        attn_weights = (query * self.scale) @ key.transpose(-1, -2)
+        attn_weights = attn_weights.softmax(dim=-1)
+
+        # Mask heads if we want to
+        if head_mask is not None:
+            attn_weights = attn_weights * head_mask
+
+        attn_output = attn_weights @ value
+        attn_output = attn_output.transpose(1, 3).reshape(batch_size, -1, self.hidden_size_output)  # swap heads/tokens axes, then merge back to (batch, tokens, channels)
+        attn_output = self.proj(attn_output)
+
+        return (attn_output, attn_weights) if output_attentions else (attn_output, None)  # weights are only returned on request
+
+
+# Copied from transformers.models.beit.modeling_beit.drop_path
+def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
+    """
+    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
+
+    Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
+    however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
+    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
+    layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
+    argument.
+    """
+    if drop_prob == 0.0 or not training:  # identity when disabled or outside training
+        return input
+    keep_prob = 1 - drop_prob
+    shape = (input.shape[0],) + (1,) * (input.ndim - 1)  # work with diff dim tensors, not just 2D ConvNets
+    random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)  # uniform in [keep_prob, 1 + keep_prob)
+    random_tensor.floor_()  # binarize
+    output = input.div(keep_prob) * random_tensor  # kept samples are scaled by 1 / keep_prob so the expected value is unchanged
+    return output
+
+
+# Copied from transformers.models.beit.modeling_beit.BeitDropPath with Beit->Hiera
+class HieraDropPath(nn.Module):
+    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
+
+    def __init__(self, drop_prob: Optional[float] = None) -> None:
+        super().__init__()
+        self.drop_prob = drop_prob  # NOTE(review): a None value would fail in drop_path (`1 - drop_prob`) while training
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        return drop_path(hidden_states, self.drop_prob, self.training)  # self.training makes this an identity in eval mode
+
+    def extra_repr(self) -> str:
+        return f"p={self.drop_prob}"
+
+
+class HieraMlp(nn.Module):
+    """Feed-forward block: expand `dim` by `config.mlp_ratio`, apply `config.hidden_act`, project back to `dim`."""
+
+    def __init__(self, config, dim: int) -> None:
+        super().__init__()
+        self.activation_fn = ACT2FN[config.hidden_act]
+        # Width of the hidden projection, truncated to an integer.
+        inner_dim = int(dim * config.mlp_ratio)
+        self.fc1 = nn.Linear(dim, inner_dim)
+        self.fc2 = nn.Linear(inner_dim, dim)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        expanded = self.fc1(hidden_states)
+        activated = self.activation_fn(expanded)
+        return self.fc2(activated)
+
+
+class HieraLayer(nn.Module):
+    """
+    One Hiera block: layer-normed attention and a layer-normed MLP, each added back through a residual connection.
+
+    When `hidden_size != hidden_size_output`, the residual branch is replaced by a linear projection of the normed
+    input that is max-reduced over `query_stride` groups, so it matches the attention output.
+    """
+
+    def __init__(
+        self,
+        config,
+        hidden_size: int,
+        hidden_size_output: int,
+        num_heads: int,
+        drop_path: float = 0.0,
+        query_stride: int = 1,
+        window_size: int = 0,
+        use_mask_unit_attn: bool = False,
+    ) -> None:
+        super().__init__()
+
+        self.hidden_size = hidden_size
+        self.hidden_size_output = hidden_size_output
+        self.query_stride = query_stride
+
+        self.layernorm_before = nn.LayerNorm(hidden_size, eps=config.layer_norm_eps)
+        self.attn = HieraMaskUnitAttention(
+            hidden_size=hidden_size,
+            hidden_size_output=hidden_size_output,
+            num_heads=num_heads,
+            query_stride=query_stride,
+            window_size=window_size,
+            use_mask_unit_attn=use_mask_unit_attn,
+        )
+
+        self.layernorm_after = nn.LayerNorm(hidden_size_output, eps=config.layer_norm_eps)
+        self.mlp = HieraMlp(config, hidden_size_output)
+
+        # Stochastic depth is only instantiated when it can actually drop something.
+        if drop_path > 0:
+            self.drop_path = HieraDropPath(drop_path)
+        else:
+            self.drop_path = nn.Identity()
+
+        # The residual projection only exists for layers that change the channel width.
+        if hidden_size != hidden_size_output:
+            self.proj = nn.Linear(hidden_size, hidden_size_output)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        head_mask: Optional[torch.FloatTensor] = None,
+        output_attentions: bool = False,
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        """Run the block and return `(hidden_states, attn_weights)` as produced by the attention module."""
+        batch_size, _, _ = hidden_states.shape
+
+        # Attention + Q Pooling
+        normed = self.layernorm_before(hidden_states)
+        if self.hidden_size != self.hidden_size_output:
+            projected = self.proj(normed)
+            # Refer to unroll to see how this performs a maxpool-Nd
+            grouped = projected.view(batch_size, self.query_stride, -1, self.hidden_size_output)
+            hidden_states = grouped.max(dim=1).values
+
+        attn_output, attn_weights = self.attn(normed, head_mask, output_attentions=output_attentions)
+        hidden_states = hidden_states + self.drop_path(attn_output)
+
+        # MLP sub-block with its own residual connection.
+        mlp_output = self.mlp(self.layernorm_after(hidden_states))
+        hidden_states = hidden_states + self.drop_path(mlp_output)
+
+        return (hidden_states, attn_weights)
+
+
+class HieraStage(GradientCheckpointingLayer):
+    """
+    A sequence of `depth` `HieraLayer` modules applied one after another.
+
+    Only the first layer maps `hidden_size` to `hidden_size_output`; every following layer keeps
+    `hidden_size_output`. `drop_path` and `query_stride` are lists indexed by the layer position inside the stage.
+    """
+
+    def __init__(
+        self,
+        config,
+        depth: int,
+        hidden_size: int,
+        hidden_size_output: int,
+        num_heads: int,
+        drop_path: list[float],
+        query_stride: list[int],
+        window_size: int,
+        use_mask_unit_attn: bool,
+        stage_num: Optional[int] = None,
+    ) -> None:
+        super().__init__()
+        # we need to know if the previous stage used masked attention
+        # mask unit or global attention.
+        # lag by 1 layer, so that global attention,
+        # applied post pooling on lower resolution
+        previous_stage_used_masked_attention = False
+        if stage_num is not None:
+            # Stage 0 has no predecessor, so it looks up its own entry.
+            previous_stage_used_masked_attention = config.masked_unit_attention[stage_num - 1 if stage_num > 0 else 0]
+        self.layers = nn.ModuleList(
+            [
+                HieraLayer(
+                    config=config,
+                    hidden_size=hidden_size if i == 0 else hidden_size_output,
+                    hidden_size_output=hidden_size_output,
+                    num_heads=num_heads,
+                    drop_path=drop_path[i],
+                    query_stride=query_stride[i],
+                    window_size=window_size,
+                    use_mask_unit_attn=use_mask_unit_attn or (previous_stage_used_masked_attention and i == 0),
+                )
+                for i in range(depth)
+            ]
+        )
+
+    def forward(
+        self, hidden_states: torch.Tensor, head_mask: Optional[torch.FloatTensor], output_attentions: bool = False
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        """
+        Apply every layer of the stage in order.
+
+        Args:
+            hidden_states (`torch.Tensor`): Input passed to the first layer.
+            head_mask (`torch.FloatTensor`, *optional*): When given, `head_mask[i]` is handed to layer `i`.
+            output_attentions (`bool`, *optional*, defaults to `False`): Forwarded to every layer.
+
+        Returns:
+            `tuple`: The output of the last layer and the attention weights returned by the last layer. The weights
+            are `None` when the stage contains no layers.
+        """
+        # Defined up front so that a stage built with `depth=0` returns `None` instead of raising
+        # `UnboundLocalError` on the return statement.
+        attn_weights = None
+        for i, layer_module in enumerate(self.layers):
+            layer_head_mask = head_mask[i] if head_mask is not None else None
+            (hidden_states, attn_weights) = layer_module(
+                hidden_states, layer_head_mask, output_attentions=output_attentions
+            )
+
+        return hidden_states, attn_weights
+
+
+def undo_windowing(hidden_states: torch.Tensor, shape: list[int], mask_unit_shape: list[int]) -> torch.Tensor:
+    """
+    Turn a mask-unit (windowed) token layout back into a regular 2D spatial grid.
+
+    Args:
+        hidden_states (`torch.Tensor`):
+            Windowed tensor. Its first dimension is the batch, its last dimension the hidden size, and the
+            dimensions in between must hold `prod(shape)` elements in `(unit row, unit column, row inside unit,
+            column inside unit)` order.
+        shape (`list[int]`): Target spatial size `[height, width]`.
+        mask_unit_shape (`list[int]`): Size `[mask_unit_height, mask_unit_width]` of one mask unit.
+
+    Returns:
+        `torch.Tensor`: Tensor of shape `[batch_size, height, width, hidden_size]`.
+    """
+    batch_size = hidden_states.shape[0]
+    hidden_size = hidden_states.shape[-1]
+
+    # Number of mask units along each spatial axis.
+    units_per_axis = [extent // unit for extent, unit in zip(shape, mask_unit_shape)]
+
+    # [batch, units_h, units_w, unit_h, unit_w, hidden]
+    windowed = hidden_states.view(batch_size, *units_per_axis, *mask_unit_shape, hidden_size)
+
+    # Pair every unit axis with its in-unit axis: [batch, units_h, unit_h, units_w, unit_w, hidden]
+    interleaved = windowed.permute(0, 1, 3, 2, 4, 5)
+
+    # Merging each pair yields the full-resolution grid.
+    return interleaved.reshape(batch_size, *shape, hidden_size)
+
+
+class HieraEncoder(nn.Module):
+    """
+    Stack of `HieraStage` modules, one per entry of `config.depths`, together with the per-stage schedule that
+    `reroll` uses to bring stage outputs back into spatial order.
+    """
+
+    def __init__(self, config: HieraConfig) -> None:
+        super().__init__()
+        total_depth = sum(config.depths)
+        # stochastic depth decay rule
+        dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, total_depth, device="cpu")]
+        # query strides rule
+        # Global layer indices at which a stage ends, i.e. where the next stage's first layer sits.
+        cumulative_depths = torch.tensor(config.depths, device="cpu").cumsum(0).tolist()
+        query_pool_layer = cumulative_depths[: config.num_query_pool]
+        # Layers listed in `query_pool_layer` pool by the full stride area; every other layer uses stride 1.
+        query_strides = [math.prod(config.query_stride) if i in query_pool_layer else 1 for i in range(total_depth)]
+
+        # Transformer blocks
+        self.stages = nn.ModuleList()
+        hidden_size = config.embed_dim
+        # `stage_ends[k]:stage_ends[k + 1]` slices the per-layer lists down to stage `k`.
+        stage_ends = [0] + cumulative_depths
+        masked_unit_area = math.prod(config.masked_unit_size)
+        query_stride_area = math.prod(config.query_stride)
+        for idx_stage, depth in enumerate(config.depths):
+            hidden_size_output = int(config.embed_dim * config.embed_dim_multiplier**idx_stage)
+
+            stage = HieraStage(
+                config=config,
+                depth=depth,
+                hidden_size=hidden_size,
+                hidden_size_output=hidden_size_output,
+                num_heads=config.num_heads[idx_stage],
+                drop_path=dpr[stage_ends[idx_stage] : stage_ends[idx_stage + 1]],
+                query_stride=query_strides[stage_ends[idx_stage] : stage_ends[idx_stage + 1]],
+                # The window shrinks by the query stride area with every stage.
+                window_size=int(masked_unit_area * query_stride_area**-idx_stage),
+                use_mask_unit_attn=config.masked_unit_attention[idx_stage],
+                stage_num=idx_stage,
+            )
+
+            # The next stage starts from this stage's output width.
+            hidden_size = hidden_size_output
+            self.stages.append(stage)
+
+        # Setting reroll schedule
+        # The first stage has to reverse everything
+        # The next stage has to reverse all but the first unroll, etc.
+        stage_size = [i // s for i, s in zip(config.image_size, config.patch_stride)]
+        unroll_schedule = [config.query_stride] * len(config.depths[:-1])
+
+        # Maps stage index -> (strides that still have to be undone, spatial token grid size at that stage).
+        self.schedule = {}
+        for idx_stage in range(len(config.depths)):
+            self.schedule[idx_stage] = unroll_schedule, stage_size
+            if idx_stage < config.num_query_pool:
+                stage_size = [i // s for i, s in zip(stage_size, config.query_stride)]
+                unroll_schedule = unroll_schedule[1:]
+
+        self.gradient_checkpointing = False
+
+    def reroll(
+        self, hidden_states: torch.Tensor, stage_idx: int, bool_masked_pos: Optional[torch.BoolTensor] = None
+    ) -> torch.Tensor:
+        """
+        Roll the given tensor back up to spatial order assuming it's from the given block.
+
+        If no bool_masked_pos is provided returns:
+            - [batch_size, height, width, hidden_size]
+        If a bool_masked_pos is provided returns:
+            - [batch_size, num_mask_units, mask_unit_height, mask_unit_width, hidden_size]
+        """
+        schedule, size = self.schedule[stage_idx]
+        batch_size, seq_len, hidden_size = hidden_states.shape
+
+        num_dim = len(size)
+        # Starts as a 1x...x1 unit and grows by the stride every time one unroll step is undone.
+        mask_unit_shape = [1] * num_dim
+
+        for strides in schedule:
+            # Extract the current patch from seq_len
+            hidden_states = hidden_states.view(
+                batch_size, *strides, seq_len // math.prod(strides), *mask_unit_shape, hidden_size
+            )
+
+            # Move that patch into the current MU
+            # Input: [batch_size, stride, stride, seq_len//(stride*stride), mask_unit_height, mask_unit_width, hidden_size]
+            # Output: [batch_size, seq_len//(stride*stride), stride, mask_unit_height, stride, mask_unit_width, hidden_size]
+            # NOTE: the fixed 7-axis permutation means this only handles two spatial dimensions.
+            hidden_states = hidden_states.permute(0, 3, 1, 4, 2, 5, 6)
+
+            # Reshape to [batch_size, seq_len//(stride*stride), *mask_units, hidden_size]
+            for i in range(num_dim):
+                mask_unit_shape[i] *= strides[i]
+            hidden_states = hidden_states.reshape(batch_size, -1, *mask_unit_shape, hidden_size)
+            seq_len = hidden_states.shape[1]
+
+        # Current shape (e.g., 2d: [batch_size, #num_mask_units_height*#num_mask_units_width, mask_unit_height, mask_unit_width, hidden_size])
+        hidden_states = hidden_states.view(batch_size, seq_len, *mask_unit_shape, hidden_size)
+
+        # If masked, return [batch_size, num_mask_units, mask_unit_height, mask_unit_width, hidden_size]
+        if bool_masked_pos is not None:
+            return hidden_states
+
+        # If not masked, we can return [batch_size, height, width, hidden_size]
+        hidden_states = undo_windowing(hidden_states, size, mask_unit_shape)
+
+        return hidden_states
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        bool_masked_pos: Optional[torch.BoolTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+    ) -> Union[tuple, BaseModelOutput]:
+        """
+        Run all stages in order.
+
+        Args:
+            hidden_states (`torch.Tensor`): Input to the first stage.
+            bool_masked_pos (`torch.BoolTensor`, *optional*): Only forwarded to `reroll`, where it selects the
+                layout of the reshaped hidden states.
+            head_mask (`torch.FloatTensor`, *optional*): When given, `head_mask[i]` is handed to stage `i`.
+            output_attentions (`bool`): Whether to collect the attention weights returned by every stage.
+            output_hidden_states (`bool`): Whether to collect the input and every stage output, both raw and
+                rerolled.
+            return_dict (`bool`): Return a `HieraEncoderOutput` when `True`, otherwise a tuple without `None`
+                entries.
+        """
+        all_hidden_states = () if output_hidden_states else None
+        all_reshaped_hidden_states = () if output_hidden_states else None
+        all_self_attentions = () if output_attentions else None
+
+        # The encoder input is recorded as the first hidden state, rerolled with the schedule of stage 0.
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+            reshaped_hidden_states = self.reroll(hidden_states, stage_idx=0, bool_masked_pos=bool_masked_pos)
+            all_reshaped_hidden_states = all_reshaped_hidden_states + (reshaped_hidden_states,)
+
+        for i, stage_module in enumerate(self.stages):
+            layer_head_mask = head_mask[i] if head_mask is not None else None
+
+            layer_outputs = stage_module(hidden_states, layer_head_mask, output_attentions)
+
+            hidden_states = layer_outputs[0]
+
+            # A stage returns the attention weights of its last layer only.
+            if output_attentions:
+                all_self_attentions = all_self_attentions + (layer_outputs[1],)
+
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)
+                reshaped_hidden_states = self.reroll(hidden_states, stage_idx=i, bool_masked_pos=bool_masked_pos)
+                all_reshaped_hidden_states = all_reshaped_hidden_states + (reshaped_hidden_states,)
+
+        if not return_dict:
+            # Outputs that were not requested are dropped, so tuple positions depend on the flags.
+            return tuple(
+                v
+                for v in [hidden_states, all_hidden_states, all_self_attentions, all_reshaped_hidden_states]
+                if v is not None
+            )
+        return HieraEncoderOutput(
+            last_hidden_state=hidden_states,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attentions,
+            reshaped_hidden_states=all_reshaped_hidden_states,
+        )
+
+
+def unroll(
+    hidden_states: torch.Tensor, image_shape: tuple[int, int], patch_stride: tuple[int, int], schedule: list[list[int]]
+) -> torch.Tensor:
+    """
+    Reorder tokens so that the members of each strided patch end up next to each other in memory.
+
+    Given `[batch_size, (height, width), hidden_size]` and a stride of `(stride, stride)`, the tokens are re-ordered
+    as `[batch_size, (stride, stride, height // stride, width // stride), hidden_size]`.
+
+    With that layout a 2D max-pool is simply `x.view(batch_size, stride*stride, -1, hidden_size).max(dim=1)`, which
+    is fast and also works for inputs of arbitrary dimensionality and for patch-wise sparse inputs.
+
+    Applying several strides in sequence makes whole windows contiguous: three applications of stride (2, 2) put
+    complete 8x8 windows next to each other, which makes mask unit attention cheap to compute while still allowing
+    the max to be applied one stride at a time.
+
+    Note: intermediate values of the model are therefore not in height x width order and have to be re-rolled before
+    they can be used as a height x width feature map. The last block of the network is fine, since by then all
+    strides have been consumed.
+    """
+    batch_size, _, hidden_size = hidden_states.shape
+
+    # Token grid after patch embedding.
+    token_grid = [extent // stride for extent, stride in zip(image_shape, patch_stride)]
+    hidden_states = hidden_states.view(batch_size, *token_grid, hidden_size)
+
+    grid = token_grid
+    for strides in schedule:
+        # Split every spatial axis into (coarse position, offset inside the stride),
+        # e.g. in 2d: [batch_size, height // stride, stride, width // stride, stride, hidden_size]
+        grid = [extent // stride for extent, stride in zip(grid, strides)]
+        split_shape = [batch_size]
+        for extent, stride in zip(grid, strides):
+            split_shape += [extent, stride]
+        split_shape.append(hidden_size)
+        hidden_states = hidden_states.view(split_shape)
+
+        # Bring the stride offsets to the front, right behind the batch axis,
+        # e.g. in 2d: [batch_size, stride, stride, height // stride, width // stride, hidden_size]
+        last_axis = len(split_shape) - 1
+        offset_axes = list(range(2, last_axis, 2))
+        coarse_axes = list(range(1, last_axis, 2))
+        hidden_states = hidden_states.permute([0, *offset_axes, *coarse_axes, last_axis])
+
+        # Fold the stride offsets into the batch axis so that the next round treats them as batch entries.
+        hidden_states = hidden_states.flatten(0, len(strides))
+        batch_size *= math.prod(strides)
+
+    # Restore the original batch size; every sample keeps all of its tokens, only their order changed.
+    return hidden_states.reshape(-1, math.prod(token_grid), hidden_size)
+
+
+@auto_docstring
+class HieraPreTrainedModel(PreTrainedModel):
+    config: HieraConfig
+    base_model_prefix = "hiera"
+    main_input_name = "pixel_values"
+    supports_gradient_checkpointing = True
+
+    def _init_weights(self, module) -> None:
+        """Initialize the parameters of `module` according to its type; other module types are left untouched."""
+        std = self.config.initializer_range
+
+        if isinstance(module, HieraEmbeddings):
+            nn.init.trunc_normal_(module.position_embeddings, std=std)
+            return
+
+        if isinstance(module, HieraDecoder):
+            for parameter in (module.mask_token, module.decoder_position_embeddings):
+                nn.init.trunc_normal_(parameter, std=std)
+            return
+
+        if isinstance(module, (nn.Linear, nn.Conv1d, nn.Conv2d)):
+            nn.init.trunc_normal_(module.weight, std=std)
+            # Biases are filled with the constant `initializer_range`, not with zeros.
+            if module.bias is not None:
+                nn.init.constant_(module.bias, std)
+            return
+
+        if isinstance(module, nn.LayerNorm):
+            # Same constant bias as above; the scale comes from `config.layer_norm_init`.
+            nn.init.constant_(module.bias, std)
+            nn.init.constant_(module.weight, self.config.layer_norm_init)
+
+
+class HieraPooler(nn.Module):
+    """Averages the hidden states over dimension 1 and applies a layer norm to the pooled vector."""
+
+    def __init__(self, config: HieraConfig):
+        super().__init__()
+        # Channel width of the last stage.
+        last_stage = len(config.depths) - 1
+        num_features = int(config.embed_dim * config.embed_dim_multiplier**last_stage)
+        self.layernorm = nn.LayerNorm(num_features, eps=config.layer_norm_eps)
+        self.pooler = nn.AdaptiveAvgPool1d(1)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        # Swap dimensions 1 and 2 so that the 1d pooling runs over the former dimension 1.
+        channels_first = hidden_states.transpose(1, 2)
+        pooled_output = self.pooler(channels_first).flatten(1)
+        return self.layernorm(pooled_output)
+
+
+@auto_docstring
+class HieraModel(HieraPreTrainedModel):
+    # Bare Hiera backbone: embeddings -> `unroll` -> encoder -> optional pooler.
+    def __init__(self, config: HieraConfig, add_pooling_layer: bool = True, is_mae: bool = False):
+        r"""
+        add_pooling_layer (`bool`, *optional*, defaults to `True`):
+            Whether or not to apply pooling layer.
+        is_mae (`bool`, *optional*, defaults to `False`):
+            Whether or not to run the model on MAE mode.
+        """
+        super().__init__(config)
+        # Channel width of the last stage.
+        self.num_features = int(config.embed_dim * config.embed_dim_multiplier ** (len(config.depths) - 1))
+
+        self.embeddings = HieraEmbeddings(config, is_mae=is_mae)
+        self.encoder = HieraEncoder(config)
+
+        # One unroll step per stage transition (all stages except the last one).
+        self.unroll_schedule = [config.query_stride] * len(config.depths[:-1])
+
+        self.pooler = HieraPooler(config) if add_pooling_layer else None
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self) -> HieraPatchEmbeddings:
+        return self.embeddings.patch_embeddings
+
+    def _prune_heads(self, heads_to_prune: dict[int, list[int]]) -> None:
+        """
+        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
+        class PreTrainedModel
+        """
+        # NOTE(review): `HieraEncoder` as defined in this file only exposes `stages` (no `layer` attribute), so
+        # this lookup looks like it raises `AttributeError` for any non-empty `heads_to_prune` - confirm before use.
+        for layer, heads in heads_to_prune.items():
+            self.encoder.layer[layer].attention.prune_heads(heads)
+
+    @auto_docstring
+    def forward(
+        self,
+        pixel_values: Optional[torch.Tensor] = None,
+        noise: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        interpolate_pos_encoding: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, BaseModelOutputWithPooling]:
+        r"""
+        noise (`torch.FloatTensor` of shape `(batch_size, num_mask_units)`, *optional*):
+            Mainly used for testing purposes to control randomness and maintain the reproducibility
+        """
+        # Arguments left as `None` fall back to the values stored on the config.
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if pixel_values is None:
+            raise ValueError("You have to specify pixel_values")
+
+        # Prepare head mask if needed
+        # 1.0 in head_mask indicate we keep the head
+        # attention_probs has shape bsz x n_heads x N x N
+        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
+        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
+        # Note that the mask is prepared with one entry per stage (`len(config.depths)`), not per layer.
+        head_mask = self.get_head_mask(head_mask, len(self.config.depths))
+
+        embedding_output, bool_masked_pos, ids_restore = self.embeddings(
+            pixel_values, interpolate_pos_encoding=interpolate_pos_encoding, noise=noise
+        )
+
+        # The spatial size is read from the actual input rather than from the config.
+        image_shape = (pixel_values.shape[-2], pixel_values.shape[-1])
+        hidden_states = unroll(
+            embedding_output,
+            image_shape=image_shape,
+            patch_stride=self.config.patch_stride,
+            schedule=self.unroll_schedule,
+        )
+
+        # Discard masked tokens if bool_masked_pos is provided
+        if bool_masked_pos is not None:
+            mask_unit_area = math.prod(self.config.masked_unit_size)
+            batch_size, _, hidden_size = hidden_states.shape
+            # Expand the per-mask-unit flags to one flag per token and channel; boolean indexing then keeps
+            # exactly the entries whose flag is True.
+            # assumes bool_masked_pos is (batch_size, num_mask_units) - TODO confirm against HieraEmbeddings
+            positions = bool_masked_pos.unsqueeze(-1).tile(1, mask_unit_area, hidden_size)
+            hidden_states = hidden_states[positions]
+            hidden_states = hidden_states.view(batch_size, -1, hidden_size)
+
+        encoder_outputs = self.encoder(
+            hidden_states,
+            bool_masked_pos=bool_masked_pos,
+            head_mask=head_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        sequence_output = encoder_outputs[0]
+        pooled_output = None
+        if self.pooler is not None:
+            pooled_output = self.pooler(sequence_output)
+
+        if not return_dict:
+            # Tuple layout: sequence output, optional pooled output, optional (bool_masked_pos, ids_restore),
+            # followed by whatever extra outputs the encoder returned.
+            head_outputs = (sequence_output, pooled_output) if pooled_output is not None else (sequence_output,)
+            head_outputs = (
+                head_outputs + (bool_masked_pos, ids_restore) if bool_masked_pos is not None else head_outputs
+            )
+            return head_outputs + encoder_outputs[1:]
+
+        return HieraModelOutput(
+            last_hidden_state=sequence_output,
+            pooler_output=pooled_output,
+            bool_masked_pos=bool_masked_pos,
+            ids_restore=ids_restore,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+            reshaped_hidden_states=encoder_outputs.reshaped_hidden_states,
+        )
+
+
+class HieraDecoder(nn.Module):
+    """
+    Decoder used for pre-training: embeds the encoder output, fills the positions that are not flagged in
+    `bool_masked_pos` with a learned mask token, restores spatial order, runs a `HieraStage` and projects every
+    token to `pred_dim` values.
+    """
+
+    def __init__(self, config: HieraConfig):
+        super().__init__()
+        # Channel width of the last encoder stage.
+        num_features = int(config.embed_dim * config.embed_dim_multiplier ** (len(config.depths) - 1))
+        tokens_spatial_shape = [i // s for i, s in zip(config.image_size, config.patch_stride)]
+        # Token grid and mask unit size after `num_query_pool` rounds of query pooling.
+        self.tokens_spatial_shape_final = [
+            i // s ** (config.num_query_pool) for i, s in zip(tokens_spatial_shape, config.query_stride)
+        ]
+        self.mask_unit_spatial_shape_final = [
+            i // s ** (config.num_query_pool) for i, s in zip(config.masked_unit_size, config.query_stride)
+        ]
+
+        self.decoder_embeddings = nn.Linear(num_features, config.decoder_hidden_size)
+
+        # Created as zeros here; `HieraPreTrainedModel._init_weights` re-initializes both parameters.
+        self.mask_token = nn.Parameter(torch.zeros(1, 1, config.decoder_hidden_size))
+
+        self.decoder_position_embeddings = nn.Parameter(
+            torch.zeros(1, math.prod(self.tokens_spatial_shape_final), config.decoder_hidden_size)
+        )
+
+        # Constant width, global attention, no pooling and no stochastic depth.
+        self.decoder_block = HieraStage(
+            config=config,
+            hidden_size=config.decoder_hidden_size,
+            hidden_size_output=config.decoder_hidden_size,
+            num_heads=config.decoder_num_heads,
+            depth=config.decoder_depth,
+            use_mask_unit_attn=False,
+            drop_path=[0.0] * config.decoder_depth,
+            query_stride=[1] * config.decoder_depth,
+            window_size=0,
+        )
+
+        self.decoder_norm = nn.LayerNorm(config.decoder_hidden_size, eps=config.layer_norm_eps)
+
+        # patch stride of prediction
+        self.pred_stride = config.patch_stride[-1] * (config.query_stride[-1] ** config.num_query_pool)
+        # One value per pixel of a `pred_stride`-sized patch and per channel.
+        pred_dim = (self.pred_stride ** len(config.query_stride)) * config.num_channels
+
+        self.decoder_pred = nn.Linear(config.decoder_hidden_size, pred_dim)
+
+    def forward(
+        self,
+        encoder_hidden_states: torch.Tensor,
+        bool_masked_pos: torch.BoolTensor,
+        head_mask: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+    ) -> tuple[torch.Tensor, torch.BoolTensor]:
+        """
+        Args:
+            encoder_hidden_states (`torch.Tensor`): 5D tensor; the two dimensions before the last are read as the
+                mask unit height and width.
+            bool_masked_pos (`torch.BoolTensor`): 2D tensor `[batch_size, num_mask_units]`; True entries receive
+                the embedded encoder states, False entries receive the mask token.
+            head_mask (`torch.Tensor`, *optional*): Forwarded to the decoder stage.
+            output_attentions (`bool`, *optional*, defaults to `False`): Forwarded to the decoder stage.
+
+        Returns:
+            `tuple`: The per-token predictions and `bool_masked_pos` flattened to one flag per token in spatial
+            order. The attention weights of the decoder stage are not returned.
+        """
+        # Embed tokens
+        hidden_states = self.decoder_embeddings(encoder_hidden_states)
+
+        # Combine visible and bool_masked_pos tokens
+
+        # hidden_states : [batch_size, num_mask_units_visible, *mask_unit_spatial_shape_final, decoder_hidden_size]
+        # bool_masked_pos: [batch_size, num_mask_units]
+        mask_unit_height, mask_unit_width, decoder_hidden_size = hidden_states.shape[2:]
+        batch_size, num_mask_units = bool_masked_pos.shape
+
+        decoder_hidden_states = torch.zeros(
+            batch_size,
+            num_mask_units,
+            mask_unit_height,
+            mask_unit_width,
+            decoder_hidden_size,
+            device=hidden_states.device,
+            dtype=hidden_states.dtype,
+        )
+        mask_tokens = self.mask_token.view(1, 1, 1, 1, -1)
+        bool_masked_pos = bool_masked_pos.reshape(batch_size, num_mask_units, 1, 1, 1)
+        bool_masked_pos = bool_masked_pos.expand(-1, -1, mask_unit_height, mask_unit_width, decoder_hidden_size)
+        # Scatter the embedded states into the True positions; the element counts have to match exactly.
+        decoder_hidden_states[bool_masked_pos] = hidden_states.flatten()
+        # Blend: mask token where the flag is False, scattered states where it is True.
+        decoder_hidden_states = (
+            1 - bool_masked_pos.float()
+        ) * mask_tokens + bool_masked_pos.float() * decoder_hidden_states
+
+        # Get back spatial order
+        hidden_states = undo_windowing(
+            decoder_hidden_states,
+            self.tokens_spatial_shape_final,
+            self.mask_unit_spatial_shape_final,
+        )
+        # Only one channel of the expanded mask is needed to recover the per-token flags.
+        bool_masked_pos = undo_windowing(
+            bool_masked_pos[..., 0:1],
+            self.tokens_spatial_shape_final,
+            self.mask_unit_spatial_shape_final,
+        )
+
+        # Flatten
+        hidden_states = hidden_states.reshape(hidden_states.shape[0], -1, hidden_states.shape[-1])
+        bool_masked_pos = bool_masked_pos.view(hidden_states.shape[0], -1)
+
+        # Add pos embed
+        hidden_states = hidden_states + self.decoder_position_embeddings
+
+        # Apply decoder blocks
+        hidden_states, attn_weights = self.decoder_block(
+            hidden_states, head_mask=head_mask, output_attentions=output_attentions
+        )
+        hidden_states = self.decoder_norm(hidden_states)
+
+        # Predictor projection
+        hidden_states = self.decoder_pred(hidden_states)
+
+        return hidden_states, bool_masked_pos
+
+
+class HieraMultiScaleHead(nn.Module):
+    """
+    Fuses feature maps of several stages: each of the first `config.num_query_pool` maps goes through a strided
+    convolution that outputs the last stage's channel width, the following map passes through unchanged, and all
+    results are summed.
+    """
+
+    def __init__(self, config: HieraConfig):
+        super().__init__()
+        self.mask_unit_spatial_shape_final = [
+            unit // stride**config.num_query_pool for unit, stride in zip(config.masked_unit_size, config.query_stride)
+        ]
+        self.stage_dimensions = [
+            int(config.embed_dim * config.embed_dim_multiplier**stage) for stage in range(len(config.depths))
+        ]
+        self.multi_scale_fusion_heads = nn.ModuleList()
+
+        unit_size = config.masked_unit_size
+        for stage in range(config.num_query_pool):
+            # Kernel (and stride) that reduces the current mask unit to its final size in a single step.
+            kernel = [current // final for current, final in zip(unit_size, self.mask_unit_spatial_shape_final)]
+            unit_size = [current // stride for current, stride in zip(unit_size, config.query_stride)]
+            fusion_conv = nn.Conv2d(
+                self.stage_dimensions[stage],
+                self.stage_dimensions[-1],
+                kernel_size=kernel,
+                stride=kernel,
+            )
+            self.multi_scale_fusion_heads.append(fusion_conv)
+        self.multi_scale_fusion_heads.append(nn.Identity())
+
+    def apply_fusion_head(self, head: nn.Module, hidden_states: torch.Tensor) -> torch.Tensor:
+        """Apply `head` to every mask unit of `hidden_states`; an `nn.Identity` head returns the input as is."""
+        if isinstance(head, nn.Identity):
+            return hidden_states
+
+        # Doing explicit to avoid problems with torch.fx
+        batch_size, num_mask_units, mask_unit_height, mask_unit_width, hidden_size = hidden_states.shape
+
+        # Fold the mask units into the batch axis and move channels first, as expected by the convolution:
+        # [batch_size * num_mask_units, hidden_size, mask_unit_height, mask_unit_width]
+        per_unit = hidden_states.reshape(batch_size * num_mask_units, mask_unit_height, mask_unit_width, hidden_size)
+        fused = head(per_unit.permute(0, 3, 1, 2))
+
+        # Back to channels last, then split the batch axis into (batch, mask unit) again.
+        fused = fused.permute(0, 2, 3, 1)
+        out_height, out_width, out_channels = fused.shape[1:]
+        return fused.reshape(batch_size, num_mask_units, out_height, out_width, out_channels)
+
+    def forward(self, feature_maps: list[torch.Tensor]) -> torch.Tensor:
+        # Multi-scale fusion: running sum over all (head, feature map) pairs, starting from the float 0.0.
+        fused_sum = 0.0
+        for fusion_head, feature_map in zip(self.multi_scale_fusion_heads, feature_maps):
+            fused_sum = fused_sum + self.apply_fusion_head(fusion_head, feature_map)
+
+        return fused_sum
+
+
+@auto_docstring(
+    custom_intro="""
+    The Hiera Model transformer with the decoder on top for self-supervised pre-training.
+
+    <Tip>
+
+    Note that we provide a script to pre-train this model on custom data in our [examples
+    directory](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-pretraining).
+
+    </Tip>
+    """
+)
+class HieraForPreTraining(HieraPreTrainedModel):
+    def __init__(self, config: HieraConfig) -> None:
+        super().__init__(config)
+        # Encoder (built with add_pooling_layer=False and is_mae=True)
+        self.hiera = HieraModel(config, add_pooling_layer=False, is_mae=True)
+        self.encoder_norm = nn.LayerNorm(self.hiera.num_features, eps=config.layer_norm_eps)
+        # Multi-scale fusion heads (combine several encoder feature maps into one tensor)
+        self.multiscale_fusion = HieraMultiScaleHead(config)
+        # Decoder (produces the logits compared against pixel labels in forward_loss)
+        self.decoder = HieraDecoder(config)
+        self.pred_stride = self.decoder.pred_stride  # patch size used by get_pixel_label_2d
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_pixel_label_2d(self, pixel_values: torch.Tensor, bool_masked_pos: torch.BoolTensor) -> torch.Tensor:
+        # bool_masked_pos (boolean tensor): True means *masked*
+        pixel_values = pixel_values.permute(0, 2, 3, 1)  # move dim 1 last; presumably NCHW -> NHWC, confirm
+
+        size = self.pred_stride
+        label = pixel_values.unfold(1, size, size).unfold(2, size, size)  # non-overlapping size x size windows
+        label = label.flatten(1, 2).flatten(2)  # one flat vector per window
+        label = label[bool_masked_pos]  # keep only the windows selected by the mask
+        if self.config.normalize_pixel_loss:
+            mean = label.mean(dim=-1, keepdim=True)
+            var = label.var(dim=-1, keepdim=True)
+            label = (label - mean) / (var + 1.0e-6) ** 0.5  # per-window standardization with a small epsilon
+
+        return label
+
+    def forward_loss(self, pixel_values: torch.Tensor, logits: torch.Tensor, bool_masked_pos: torch.BoolTensor):
+        # Invert bool_masked_pos so that True means *masked*, the convention get_pixel_label_2d expects
+        bool_masked_pos = ~bool_masked_pos
+        label = self.get_pixel_label_2d(pixel_values, bool_masked_pos)
+
+        logits = logits[bool_masked_pos]  # predictions at the same positions as the labels
+        loss = (logits - label) ** 2
+        loss = loss.mean()  # mean squared error over the selected positions
+
+        return loss
+
+    @auto_docstring
+    def forward(
+        self,
+        pixel_values: Optional[torch.Tensor] = None,
+        noise: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        interpolate_pos_encoding: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, HieraForPreTrainingOutput]:
+        r"""
+        noise (`torch.FloatTensor` of shape `(batch_size, num_mask_units)`, *optional*):
+            Mainly used for testing purposes to control randomness and maintain reproducibility.
+
+        Examples:
+        ```python
+        >>> from transformers import AutoImageProcessor, HieraForPreTraining
+        >>> import torch
+        >>> from PIL import Image
+        >>> import requests
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> image_processor = AutoImageProcessor.from_pretrained("facebook/hiera-tiny-224-mae-hf")
+        >>> model = HieraForPreTraining.from_pretrained("facebook/hiera-tiny-224-mae-hf")
+
+        >>> inputs = image_processor(images=image, return_tensors="pt")
+
+        >>> outputs = model(**inputs)
+        >>> logits = outputs.logits
+        >>> loss = outputs.loss
+        >>> print(list(logits.shape))
+        [1, 196, 768]
+        ```"""
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+
+        outputs = self.hiera(
+            pixel_values,
+            noise=noise,
+            head_mask=head_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=True,  # always requested: the fusion heads below consume outputs[-1]
+            interpolate_pos_encoding=interpolate_pos_encoding,
+            return_dict=return_dict,
+        )
+
+        feature_maps = outputs[-1]  # last encoder output; presumably the reshaped hidden states, confirm
+        bool_masked_pos = outputs[1]
+        ids_to_restore = outputs[2]
+        # Take only the query pooled and last hidden states
+        feature_maps = feature_maps[1 : self.hiera.config.num_query_pool + 1] + (feature_maps[-1],)
+        fused_hidden_states = self.multiscale_fusion(feature_maps)
+        fused_hidden_states = self.encoder_norm(fused_hidden_states)
+
+        # Reconstruct pixel values (the decoder also returns the mask that forward_loss indexes with)
+        logits, bool_masked_pos = self.decoder(
+            fused_hidden_states,
+            bool_masked_pos=bool_masked_pos,
+            head_mask=head_mask,
+            output_attentions=output_attentions,
+        )
+
+        loss = self.forward_loss(pixel_values, logits, bool_masked_pos)
+
+        if not return_dict:
+            output = (logits, bool_masked_pos, ids_to_restore)
+            if output_hidden_states:
+                output = output + (outputs[3],)
+            if output_attentions:
+                output = output + (outputs[4],)
+            if output_hidden_states:
+                output = output + (outputs[-1],)
+            return ((loss,) + output) if loss is not None else output  # loss is always computed above
+
+        return HieraForPreTrainingOutput(
+            loss=loss,
+            logits=logits,
+            bool_masked_pos=bool_masked_pos,
+            ids_restore=ids_to_restore,
+            hidden_states=outputs.hidden_states if output_hidden_states else None,  # hide what the caller did not ask for
+            attentions=outputs.attentions,
+            reshaped_hidden_states=outputs.reshaped_hidden_states if output_hidden_states else None,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    Hiera Model transformer with an image classification head on top (a linear layer on top of the final hidden state with
+    average pooling) e.g. for ImageNet.
+
+    <Tip>
+
+        Note that it's possible to fine-tune Hiera on higher resolution images than the ones it has been trained on, by
+        setting `interpolate_pos_encoding` to `True` in the forward of the model. This will interpolate the pre-trained
+        position embeddings to the higher resolution.
+
+    </Tip>
+    """
+)
+class HieraForImageClassification(HieraPreTrainedModel):
+    def __init__(self, config: HieraConfig) -> None:
+        super().__init__(config)
+
+        self.num_labels = config.num_labels
+        self.hiera = HieraModel(config, add_pooling_layer=True, is_mae=False)
+
+        # Classifier head: a linear layer, or an identity pass-through when config.num_labels is not positive
+        self.classifier = (
+            nn.Linear(self.hiera.num_features, config.num_labels) if config.num_labels > 0 else nn.Identity()
+        )
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        pixel_values,
+        head_mask: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        interpolate_pos_encoding: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, HieraForImageClassificationOutput]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss). If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+
+        outputs = self.hiera(
+            pixel_values,
+            head_mask=head_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            interpolate_pos_encoding=interpolate_pos_encoding,
+            return_dict=return_dict,
+        )
+
+        pooled_output = outputs[1]  # second encoder output, used as the pooled representation
+
+        logits = self.classifier(pooled_output)
+
+        loss = None
+        if labels is not None:
+            # move labels to correct device to enable model parallelism
+            labels = labels.to(logits.device)
+            if self.config.problem_type is None:  # infer the problem type once and store it on the config
+                if self.num_labels == 1:
+                    self.config.problem_type = "regression"
+                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
+                    self.config.problem_type = "single_label_classification"
+                else:
+                    self.config.problem_type = "multi_label_classification"
+
+            if self.config.problem_type == "regression":
+                loss_fct = MSELoss()
+                if self.num_labels == 1:
+                    loss = loss_fct(logits.squeeze(), labels.squeeze())
+                else:
+                    loss = loss_fct(logits, labels)
+            elif self.config.problem_type == "single_label_classification":
+                loss_fct = CrossEntropyLoss()
+                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+            elif self.config.problem_type == "multi_label_classification":
+                loss_fct = BCEWithLogitsLoss()
+                loss = loss_fct(logits, labels)
+
+        if not return_dict:
+            output = (logits,) + outputs[2:]  # replace the first two encoder outputs with the logits
+            return ((loss,) + output) if loss is not None else output
+
+        return HieraForImageClassificationOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            reshaped_hidden_states=outputs.reshaped_hidden_states,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    Hiera backbone, to be used with frameworks like DETR and MaskFormer.
+    """
+)
+class HieraBackbone(HieraPreTrainedModel, BackboneMixin):
+    def __init__(self, config: HieraConfig):
+        super().__init__(config)
+        super()._init_backbone(config)
+
+        self.num_features = [config.embed_dim] + [  # embed_dim first, then one width per entry of config.depths
+            int(config.embed_dim * config.embed_dim_multiplier**i) for i in range(len(config.depths))
+        ]
+        self.embeddings = HieraEmbeddings(config, is_mae=False)
+        self.encoder = HieraEncoder(config)
+
+        # Add layer norms to hidden states of out_features (one LayerNorm per requested stage, keyed by stage name)
+        hidden_states_norms = {}
+        for stage, num_channels in zip(self._out_features, self.channels):
+            hidden_states_norms[stage] = nn.LayerNorm(num_channels)
+        self.hidden_states_norms = nn.ModuleDict(hidden_states_norms)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.embeddings.patch_embeddings
+
+    def forward(
+        self,
+        pixel_values: torch.Tensor,
+        output_hidden_states: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> BackboneOutput:
+        """
+        Returns:
+
+        Examples:
+
+        ```python
+        >>> from transformers import AutoImageProcessor, AutoBackbone
+        >>> import torch
+        >>> from PIL import Image
+        >>> import requests
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> processor = AutoImageProcessor.from_pretrained("facebook/hiera-tiny-224-hf")
+        >>> model = AutoBackbone.from_pretrained(
+        ...     "facebook/hiera-tiny-224-hf", out_features=["stage1", "stage2", "stage3", "stage4"]
+        ... )
+
+        >>> inputs = processor(image, return_tensors="pt")
+        >>> outputs = model(**inputs)
+        >>> feature_maps = outputs.feature_maps
+        >>> list(feature_maps[-1].shape)
+        [1, 768, 7, 7]
+        ```"""
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+
+        embedding_output, _, _ = self.embeddings(pixel_values)  # the two extra return values are unused here
+
+        outputs = self.encoder(
+            embedding_output,
+            head_mask=None,
+            output_attentions=output_attentions,
+            output_hidden_states=True,  # always requested: the feature maps are built from the hidden states
+            return_dict=return_dict,
+        )
+
+        hidden_states = outputs[-1]  # last encoder output; unpacked below as (batch, height, width, channels)
+
+        feature_maps = ()
+        for stage, hidden_state in zip(self.stage_names, hidden_states):
+            if stage in self.out_features:
+                batch_size, height, width, num_channels = hidden_state.shape
+                hidden_state = hidden_state.view(batch_size, height * width, num_channels)  # flatten spatial dims
+                hidden_state = self.hidden_states_norms[stage](hidden_state)
+                hidden_state = hidden_state.view(batch_size, height, width, num_channels)  # restore spatial dims
+                hidden_state = hidden_state.permute(0, 3, 1, 2).contiguous()  # -> (batch, channels, height, width)
+                feature_maps += (hidden_state,)
+
+        if not return_dict:
+            output = (feature_maps,)
+            if output_hidden_states:
+                output += (outputs[1],)
+            if output_attentions:
+                output += (outputs[2],)
+            return output
+
+        return BackboneOutput(
+            feature_maps=feature_maps,
+            hidden_states=outputs[1] if output_hidden_states else None,
+            attentions=outputs[2] if output_attentions else None,
+        )
+
+
+__all__ = ["HieraForImageClassification", "HieraForPreTraining", "HieraBackbone", "HieraModel", "HieraPreTrainedModel"]
diff --git a/phivenv/Lib/site-packages/transformers/models/hubert/__init__.py b/phivenv/Lib/site-packages/transformers/models/hubert/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..d975dabc689a73c83818ced8bed5ad86072df9b2
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/hubert/__init__.py
@@ -0,0 +1,28 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:  # static type checkers only: import the submodules' symbols directly
+    from .configuration_hubert import *
+    from .modeling_hubert import *
+    from .modeling_tf_hubert import *
+else:  # at runtime: replace this module's sys.modules entry with a _LazyModule
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/phivenv/Lib/site-packages/transformers/models/hubert/__pycache__/__init__.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/hubert/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..49cb7d9629896d8246c22242e7115e3b1944fb56
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/hubert/__pycache__/__init__.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/hubert/__pycache__/configuration_hubert.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/hubert/__pycache__/configuration_hubert.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a1b08e2ee5cba06bd9e87089219cd21f6910308f
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/hubert/__pycache__/configuration_hubert.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/hubert/__pycache__/modeling_hubert.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/hubert/__pycache__/modeling_hubert.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..57161cd343767601edbc5a5a2922743abe7cd866
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/hubert/__pycache__/modeling_hubert.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/hubert/__pycache__/modeling_tf_hubert.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/hubert/__pycache__/modeling_tf_hubert.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..020bda0df2645b7cd256b4d083f6466a286b0816
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/hubert/__pycache__/modeling_tf_hubert.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/hubert/__pycache__/modular_hubert.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/hubert/__pycache__/modular_hubert.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..01858b090bfe9ff98b878350db66c9e57930dfe7
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/hubert/__pycache__/modular_hubert.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/hubert/configuration_hubert.py b/phivenv/Lib/site-packages/transformers/models/hubert/configuration_hubert.py
new file mode 100644
index 0000000000000000000000000000000000000000..c8af7b5a0f3cb712bb112bb5d7144ea0d9da29e0
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/hubert/configuration_hubert.py
@@ -0,0 +1,265 @@
+# coding=utf-8
+# Copyright 2021 The Fairseq Authors and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Hubert model configuration"""
+
+import functools
+import operator
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class HubertConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`HubertModel`]. It is used to instantiate an
+    Hubert model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the Hubert
+    [facebook/hubert-base-ls960](https://huggingface.co/facebook/hubert-base-ls960) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 32):
+            Vocabulary size of the Hubert model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`HubertModel`]. Vocabulary size of the model. Defines the different
+            tokens that can be represented by the *inputs_ids* passed to the forward method of [`HubertModel`].
+        hidden_size (`int`, *optional*, defaults to 768):
+            Dimensionality of the encoder layers and the pooler layer.
+        num_hidden_layers (`int`, *optional*, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 12):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        intermediate_size (`int`, *optional*, defaults to 3072):
+            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"selu"` and `"gelu_new"` are supported.
+        hidden_dropout (`float`, *optional*, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        activation_dropout (`float`, *optional*, defaults to 0.1):
+            The dropout ratio for activations inside the fully connected layer.
+        attention_dropout (`float`, *optional*, defaults to 0.1):
+            The dropout ratio for the attention probabilities.
+        final_dropout (`float`, *optional*, defaults to 0.1):
+            The dropout probability for the final projection layer of [`HubertForCTC`].
+        layerdrop (`float`, *optional*, defaults to 0.1):
+            The LayerDrop probability. See the [LayerDrop paper](https://huggingface.co/papers/1909.11556) for more
+            details.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the layer normalization layers.
+        feat_extract_norm (`str`, *optional*, defaults to `"group"`):
+            The norm to be applied to 1D convolutional layers in feature encoder. One of `"group"` for group
+            normalization of only the first 1D convolutional layer or `"layer"` for layer normalization of all 1D
+            convolutional layers.
+        feat_proj_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout probability for output of the feature encoder.
+        feat_proj_layer_norm (`bool`, *optional*, defaults to `True`):
+            Whether to apply LayerNorm to the output of the feature encoder.
+        feat_extract_activation (`str`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the 1D convolutional layers of the feature
+            extractor. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
+        conv_dim (`tuple[int]`, *optional*, defaults to `(512, 512, 512, 512, 512, 512, 512)`):
+            A tuple of integers defining the number of input and output channels of each 1D convolutional layer in the
+            feature encoder. The length of *conv_dim* defines the number of 1D convolutional layers.
+        conv_stride (`tuple[int]`, *optional*, defaults to `(5, 2, 2, 2, 2, 2, 2)`):
+            A tuple of integers defining the stride of each 1D convolutional layer in the feature encoder. The length
+            of *conv_stride* defines the number of convolutional layers and has to match the length of *conv_dim*.
+        conv_kernel (`tuple[int]`, *optional*, defaults to `(10, 3, 3, 3, 3, 2, 2)`):
+            A tuple of integers defining the kernel size of each 1D convolutional layer in the feature encoder. The
+            length of *conv_kernel* defines the number of convolutional layers and has to match the length of
+            *conv_dim*.
+        conv_bias (`bool`, *optional*, defaults to `False`):
+            Whether the 1D convolutional layers have a bias.
+        num_conv_pos_embeddings (`int`, *optional*, defaults to 128):
+            Number of convolutional positional embeddings. Defines the kernel size of 1D convolutional positional
+            embeddings layer.
+        num_conv_pos_embedding_groups (`int`, *optional*, defaults to 16):
+            Number of groups of 1D convolutional positional embeddings layer.
+        conv_pos_batch_norm (`bool`, *optional*, defaults to `False`):
+            Whether to use batch norm instead of weight norm in conv_pos
+        do_stable_layer_norm (`bool`, *optional*, defaults to `False`):
+            Whether to apply *stable* layer norm architecture of the Transformer encoder. `do_stable_layer_norm is
+            True` corresponds to applying layer norm before the attention layer, whereas `do_stable_layer_norm is
+            False` corresponds to applying layer norm after the attention layer.
+        apply_spec_augment (`bool`, *optional*, defaults to `True`):
+            Whether to apply *SpecAugment* data augmentation to the outputs of the feature encoder. For reference see
+            [SpecAugment: A Simple Data Augmentation Method for Automatic Speech
+            Recognition](https://huggingface.co/papers/1904.08779).
+        mask_time_prob (`float`, *optional*, defaults to 0.05):
+            Percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking
+            procedure generates ''mask_time_prob*len(time_axis)/mask_time_length'' independent masks over the axis. If
+            reasoning from the probability of each feature vector to be chosen as the start of the vector span to be
+            masked, *mask_time_prob* should be `prob_vector_start*mask_time_length`. Note that overlap may decrease the
+            actual percentage of masked vectors. This is only relevant if `apply_spec_augment is True`.
+        mask_time_length (`int`, *optional*, defaults to 10):
+            Length of vector span along the time axis.
+        mask_time_min_masks (`int`, *optional*, defaults to 2):
+            The minimum number of masks of length `mask_time_length` generated along the time axis, each time step,
+            irrespectively of `mask_time_prob`. Only relevant if ''mask_time_prob*len(time_axis)/mask_time_length <
+            mask_time_min_masks''
+        mask_feature_prob (`float`, *optional*, defaults to 0.0):
+            Percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The
+            masking procedure generates ''mask_feature_prob*len(feature_axis)/mask_feature_length'' independent masks over
+            the axis. If reasoning from the probability of each feature vector to be chosen as the start of the vector
+            span to be masked, *mask_feature_prob* should be `prob_vector_start*mask_feature_length`. Note that overlap
+            may decrease the actual percentage of masked vectors. This is only relevant if `apply_spec_augment is
+            True`.
+        mask_feature_length (`int`, *optional*, defaults to 10):
+            Length of vector span along the feature axis.
+        mask_feature_min_masks (`int`, *optional*, defaults to 0):
+            The minimum number of masks of length `mask_feature_length` generated along the feature axis, each time
+            step, irrespectively of `mask_feature_prob`. Only relevant if
+            ''mask_feature_prob*len(feature_axis)/mask_feature_length < mask_feature_min_masks''
+        ctc_loss_reduction (`str`, *optional*, defaults to `"sum"`):
+            Specifies the reduction to apply to the output of `torch.nn.CTCLoss`. Only relevant when training an
+            instance of [`HubertForCTC`].
+        ctc_zero_infinity (`bool`, *optional*, defaults to `False`):
+            Whether to zero infinite losses and the associated gradients of `torch.nn.CTCLoss`. Infinite losses mainly
+            occur when the inputs are too short to be aligned to the targets. Only relevant when training an instance
+            of [`HubertForCTC`].
+        use_weighted_layer_sum (`bool`, *optional*, defaults to `False`):
+            Whether to use a weighted average of layer outputs with learned weights. Only relevant when using an
+            instance of [`HubertForSequenceClassification`].
+        classifier_proj_size (`int`, *optional*, defaults to 256):
+            Dimensionality of the projection before token mean-pooling for classification.
+
+    Example:
+
+    ```python
+    >>> from transformers import HubertModel, HubertConfig
+
+    >>> # Initializing a Hubert facebook/hubert-base-ls960 style configuration
+    >>> configuration = HubertConfig()
+
+    >>> # Initializing a model from the facebook/hubert-base-ls960 style configuration
+    >>> model = HubertModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "hubert"
+
+    def __init__(
+        self,
+        vocab_size=32,
+        hidden_size=768,
+        num_hidden_layers=12,
+        num_attention_heads=12,
+        intermediate_size=3072,
+        hidden_act="gelu",
+        hidden_dropout=0.1,
+        activation_dropout=0.1,
+        attention_dropout=0.1,
+        feat_proj_layer_norm=True,
+        feat_proj_dropout=0.0,
+        final_dropout=0.1,
+        layerdrop=0.1,
+        initializer_range=0.02,
+        layer_norm_eps=1e-5,
+        feat_extract_norm="group",
+        feat_extract_activation="gelu",
+        conv_dim=(512, 512, 512, 512, 512, 512, 512),
+        conv_stride=(5, 2, 2, 2, 2, 2, 2),
+        conv_kernel=(10, 3, 3, 3, 3, 2, 2),
+        conv_bias=False,
+        num_conv_pos_embeddings=128,
+        num_conv_pos_embedding_groups=16,
+        conv_pos_batch_norm=False,
+        do_stable_layer_norm=False,
+        apply_spec_augment=True,
+        mask_time_prob=0.05,
+        mask_time_length=10,
+        mask_time_min_masks=2,
+        mask_feature_prob=0.0,
+        mask_feature_length=10,
+        mask_feature_min_masks=0,
+        ctc_loss_reduction="sum",
+        ctc_zero_infinity=False,
+        use_weighted_layer_sum=False,
+        classifier_proj_size=256,
+        pad_token_id=0,
+        bos_token_id=1,
+        eos_token_id=2,
+        **kwargs,
+    ):
+        super().__init__(**kwargs, pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id)
+        self.hidden_size = hidden_size
+        self.feat_extract_norm = feat_extract_norm
+        self.feat_extract_activation = feat_extract_activation
+        self.conv_dim = list(conv_dim)  # the three conv_* sequences are stored as lists, whatever type was passed
+        self.conv_stride = list(conv_stride)
+        self.conv_kernel = list(conv_kernel)
+        self.conv_bias = conv_bias
+        self.num_conv_pos_embeddings = num_conv_pos_embeddings
+        self.num_conv_pos_embedding_groups = num_conv_pos_embedding_groups
+        self.conv_pos_batch_norm = conv_pos_batch_norm
+        self.num_feat_extract_layers = len(self.conv_dim)  # derived from conv_dim, not a constructor argument
+        self.num_hidden_layers = num_hidden_layers
+        self.intermediate_size = intermediate_size
+        self.hidden_act = hidden_act
+        self.num_attention_heads = num_attention_heads
+        self.hidden_dropout = hidden_dropout
+        self.attention_dropout = attention_dropout
+        self.activation_dropout = activation_dropout
+        self.feat_proj_layer_norm = feat_proj_layer_norm
+        self.feat_proj_dropout = feat_proj_dropout
+        self.final_dropout = final_dropout
+        self.layerdrop = layerdrop
+        self.layer_norm_eps = layer_norm_eps
+        self.initializer_range = initializer_range
+        self.vocab_size = vocab_size
+        self.do_stable_layer_norm = do_stable_layer_norm
+        self.use_weighted_layer_sum = use_weighted_layer_sum
+        self.classifier_proj_size = classifier_proj_size
+
+        if (
+            (len(self.conv_stride) != self.num_feat_extract_layers)
+            or (len(self.conv_kernel) != self.num_feat_extract_layers)
+            or (len(self.conv_dim) != self.num_feat_extract_layers)  # always False: the count is len(self.conv_dim)
+        ):
+            raise ValueError(
+                "Configuration for convolutional layers is incorrect. It is required that `len(config.conv_dim)` =="
+                " `len(config.conv_stride)` == `len(config.conv_kernel)`, but is `len(config.conv_dim) ="
+                f" {len(self.conv_dim)}`, `len(config.conv_stride) = {len(self.conv_stride)}`,"
+                f" `len(config.conv_kernel) = {len(self.conv_kernel)}`."
+            )
+
+        # fine-tuning config parameters for SpecAugment: https://huggingface.co/papers/1904.08779
+        self.apply_spec_augment = apply_spec_augment
+        self.mask_time_prob = mask_time_prob
+        self.mask_time_length = mask_time_length
+        self.mask_time_min_masks = mask_time_min_masks
+        self.mask_feature_prob = mask_feature_prob
+        self.mask_feature_length = mask_feature_length
+        self.mask_feature_min_masks = mask_feature_min_masks
+
+        # ctc loss
+        self.ctc_loss_reduction = ctc_loss_reduction
+        self.ctc_zero_infinity = ctc_zero_infinity
+
+    @property
+    def inputs_to_logits_ratio(self):
+        return functools.reduce(operator.mul, self.conv_stride, 1)  # product of all conv strides (1 if empty)
+
+
+__all__ = ["HubertConfig"]
diff --git a/phivenv/Lib/site-packages/transformers/models/hubert/modeling_hubert.py b/phivenv/Lib/site-packages/transformers/models/hubert/modeling_hubert.py
new file mode 100644
index 0000000000000000000000000000000000000000..060b715e8d499a13906092133dab2bdddae216df
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/hubert/modeling_hubert.py
@@ -0,0 +1,1285 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/hubert/modular_hubert.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_hubert.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# coding=utf-8
+# Copyright 2021 The Fairseq Authors and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import warnings
+from typing import Callable, Optional, Union
+
+import numpy as np
+import torch
+import torch.nn as nn
+from torch.nn import CrossEntropyLoss
+
+from ...activations import ACT2FN
+from ...integrations.deepspeed import is_deepspeed_zero3_enabled
+from ...integrations.fsdp import is_fsdp_managed_module
+from ...modeling_attn_mask_utils import _prepare_4d_attention_mask, _prepare_4d_attention_mask_for_sdpa
+from ...modeling_flash_attention_utils import FlashAttentionKwargs
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import BaseModelOutput, CausalLMOutput, SequenceClassifierOutput
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from ...processing_utils import Unpack
+from ...utils import auto_docstring, is_torch_flex_attn_available, logging
+from .configuration_hubert import HubertConfig
+
+
+if is_torch_flex_attn_available():
+    from ...integrations.flex_attention import make_flex_block_causal_mask
+
+
+logger = logging.get_logger(__name__)
+
+
+class HubertPositionalConvEmbedding(nn.Module):  # grouped Conv1d + activation used by the encoders as position embedding
+    def __init__(self, config):
+        super().__init__()
+        self.conv = nn.Conv1d(  # hidden_size -> hidden_size, padded by half the kernel size
+            config.hidden_size,
+            config.hidden_size,
+            kernel_size=config.num_conv_pos_embeddings,
+            padding=config.num_conv_pos_embeddings // 2,
+            groups=config.num_conv_pos_embedding_groups,
+        )
+        # Either a BatchNorm1d applied to the conv input (`conv_pos_batch_norm`) or weight norm on the conv itself.
+        self.batch_norm = None
+        if config.conv_pos_batch_norm:
+            self.batch_norm = nn.BatchNorm1d(config.hidden_size)
+        else:
+            weight_norm = nn.utils.weight_norm
+            if hasattr(nn.utils.parametrizations, "weight_norm"):  # use the parametrization-based API when it exists
+                weight_norm = nn.utils.parametrizations.weight_norm
+            # Under DeepSpeed ZeRO-3 the weight is gathered while weight norm is applied, then g/v are registered.
+            if is_deepspeed_zero3_enabled():
+                import deepspeed
+
+                with deepspeed.zero.GatheredParameters(self.conv.weight, modifier_rank=0):
+                    self.conv = weight_norm(self.conv, name="weight", dim=2)
+                if hasattr(self.conv, "parametrizations"):  # parametrization API exposes original0 / original1
+                    weight_g = self.conv.parametrizations.weight.original0
+                    weight_v = self.conv.parametrizations.weight.original1
+                else:  # legacy API exposes weight_g / weight_v
+                    weight_g = self.conv.weight_g
+                    weight_v = self.conv.weight_v
+                deepspeed.zero.register_external_parameter(self, weight_v)
+                deepspeed.zero.register_external_parameter(self, weight_g)
+            else:
+                self.conv = weight_norm(self.conv, name="weight", dim=2)
+        # HubertSamePadLayer removes one trailing element on dim 2 when the kernel size is even.
+        self.padding = HubertSamePadLayer(config.num_conv_pos_embeddings)
+        self.activation = ACT2FN[config.feat_extract_activation]
+
+    def forward(self, hidden_states):
+        hidden_states = hidden_states.transpose(1, 2)  # presumably (batch, time, hidden) -> (batch, hidden, time); confirm with caller
+        if self.batch_norm is not None:
+            hidden_states = self.batch_norm(hidden_states)
+        hidden_states = self.conv(hidden_states)
+        hidden_states = self.padding(hidden_states)
+        hidden_states = self.activation(hidden_states)
+        # swap dims 1 and 2 back so the output layout matches the input layout
+        hidden_states = hidden_states.transpose(1, 2)
+        return hidden_states
+
+
+class HubertSamePadLayer(nn.Module):  # trims the extra trailing element left by an even-sized conv kernel
+    def __init__(self, num_conv_pos_embeddings):
+        super().__init__()
+        self.num_pad_remove = 1 if num_conv_pos_embeddings % 2 == 0 else 0  # 1 for even sizes, 0 for odd sizes
+
+    def forward(self, hidden_states):
+        if self.num_pad_remove > 0:
+            hidden_states = hidden_states[:, :, : -self.num_pad_remove]  # drop the last `num_pad_remove` entries of dim 2
+        return hidden_states
+
+
+class HubertNoLayerNormConvLayer(GradientCheckpointingLayer):  # Conv1d followed by the activation, no normalisation
+    def __init__(self, config, layer_id=0):
+        super().__init__()
+        self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1  # layer 0 has a single input channel
+        self.out_conv_dim = config.conv_dim[layer_id]
+        # kernel size and stride are read from the per-layer config lists at index `layer_id`
+        self.conv = nn.Conv1d(
+            self.in_conv_dim,
+            self.out_conv_dim,
+            kernel_size=config.conv_kernel[layer_id],
+            stride=config.conv_stride[layer_id],
+            bias=config.conv_bias,
+        )
+        self.activation = ACT2FN[config.feat_extract_activation]
+
+    def forward(self, hidden_states):
+        hidden_states = self.conv(hidden_states)
+        hidden_states = self.activation(hidden_states)
+        return hidden_states
+
+
+class HubertLayerNormConvLayer(GradientCheckpointingLayer):  # Conv1d -> LayerNorm over channels -> activation
+    def __init__(self, config, layer_id=0):
+        super().__init__()
+        self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1  # layer 0 has a single input channel
+        self.out_conv_dim = config.conv_dim[layer_id]
+        # kernel size and stride are read from the per-layer config lists at index `layer_id`
+        self.conv = nn.Conv1d(
+            self.in_conv_dim,
+            self.out_conv_dim,
+            kernel_size=config.conv_kernel[layer_id],
+            stride=config.conv_stride[layer_id],
+            bias=config.conv_bias,
+        )
+        self.layer_norm = nn.LayerNorm(self.out_conv_dim, elementwise_affine=True)
+        self.activation = ACT2FN[config.feat_extract_activation]
+
+    def forward(self, hidden_states):
+        hidden_states = self.conv(hidden_states)
+        # LayerNorm normalises the last dim, so the channel dim is moved last for the call and moved back afterwards
+        hidden_states = hidden_states.transpose(-2, -1)
+        hidden_states = self.layer_norm(hidden_states)
+        hidden_states = hidden_states.transpose(-2, -1)
+
+        hidden_states = self.activation(hidden_states)
+        return hidden_states
+
+
+class HubertGroupNormConvLayer(GradientCheckpointingLayer):  # Conv1d -> GroupNorm -> activation
+    def __init__(self, config, layer_id=0):
+        super().__init__()
+        self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1  # layer 0 has a single input channel
+        self.out_conv_dim = config.conv_dim[layer_id]
+        # kernel size and stride are read from the per-layer config lists at index `layer_id`
+        self.conv = nn.Conv1d(
+            self.in_conv_dim,
+            self.out_conv_dim,
+            kernel_size=config.conv_kernel[layer_id],
+            stride=config.conv_stride[layer_id],
+            bias=config.conv_bias,
+        )
+        self.activation = ACT2FN[config.feat_extract_activation]
+        # num_groups == num_channels, i.e. every channel forms its own normalisation group
+        self.layer_norm = nn.GroupNorm(num_groups=self.out_conv_dim, num_channels=self.out_conv_dim, affine=True)
+
+    def forward(self, hidden_states):
+        hidden_states = self.conv(hidden_states)
+        hidden_states = self.layer_norm(hidden_states)
+        hidden_states = self.activation(hidden_states)
+        return hidden_states
+
+
+class HubertFeatureEncoder(nn.Module):
+    """Construct the features from raw audio waveform"""
+
+    def __init__(self, config):
+        super().__init__()
+        # "group": GroupNorm conv layer first, un-normalised conv layers after it; "layer": LayerNorm conv layers only
+        if config.feat_extract_norm == "group":
+            conv_layers = [HubertGroupNormConvLayer(config, layer_id=0)] + [
+                HubertNoLayerNormConvLayer(config, layer_id=i + 1) for i in range(config.num_feat_extract_layers - 1)
+            ]
+        elif config.feat_extract_norm == "layer":
+            conv_layers = [HubertLayerNormConvLayer(config, layer_id=i) for i in range(config.num_feat_extract_layers)]
+        else:
+            raise ValueError(
+                f"`config.feat_extract_norm` is {config.feat_extract_norm}, but has to be one of ['group', 'layer']"
+            )
+        self.conv_layers = nn.ModuleList(conv_layers)
+        self.gradient_checkpointing = False
+        self._requires_grad = True  # cleared by `_freeze_parameters`, read in `forward`
+
+    def _freeze_parameters(self):  # turn off gradients for every parameter of this module
+        for param in self.parameters():
+            param.requires_grad = False
+        self._requires_grad = False
+
+    def forward(self, input_values):
+        hidden_states = input_values[:, None]  # insert a singleton dim at position 1 (layer 0 expects 1 input channel)
+
+        # make sure hidden_states require grad for gradient_checkpointing
+        if self._requires_grad and self.training:
+            hidden_states.requires_grad = True
+        # run the conv stack in order
+        for conv_layer in self.conv_layers:
+            hidden_states = conv_layer(hidden_states)
+
+        return hidden_states
+
+
+class HubertFeatureProjection(nn.Module):  # optional LayerNorm, Linear(conv_dim[-1] -> hidden_size), dropout
+    def __init__(self, config):
+        super().__init__()
+        self.feat_proj_layer_norm = config.feat_proj_layer_norm
+        if self.feat_proj_layer_norm:  # `self.layer_norm` only exists when this flag is set
+            self.layer_norm = nn.LayerNorm(config.conv_dim[-1], eps=config.layer_norm_eps)
+        self.projection = nn.Linear(config.conv_dim[-1], config.hidden_size)
+        self.dropout = nn.Dropout(config.feat_proj_dropout)
+
+    def forward(self, hidden_states):
+        # only the projected hidden states are returned; the layer norm is applied first when it is enabled
+        if self.feat_proj_layer_norm:
+            hidden_states = self.layer_norm(hidden_states)
+        hidden_states = self.projection(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        return hidden_states
+
+
+def eager_attention_forward(  # matmul/softmax attention; returns (attn_output, attn_weights)
+    module: nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: Optional[torch.Tensor],
+    scaling: Optional[float] = None,
+    dropout: float = 0.0,
+    head_mask: Optional[torch.Tensor] = None,
+    **kwargs,  # accepted and ignored by this implementation
+):
+    if scaling is None:
+        scaling = query.size(-1) ** -0.5  # default: 1 / sqrt(size of the last query dim)
+    # scores = scaled query @ key^T (dims 2 and 3 of `key` swapped); the mask, when given, is added to the scores
+    attn_weights = torch.matmul(query, key.transpose(2, 3)) * scaling
+    if attention_mask is not None:
+        attn_weights = attn_weights + attention_mask
+
+    attn_weights = nn.functional.softmax(attn_weights, dim=-1)
+    # `head_mask` is reshaped to (1, -1, 1, 1), i.e. one multiplier per entry along dim 1
+    if head_mask is not None:
+        attn_weights = attn_weights * head_mask.view(1, -1, 1, 1)
+    # dropout is only active when `module.training` is true
+    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
+    attn_output = torch.matmul(attn_weights, value)
+    attn_output = attn_output.transpose(1, 2).contiguous()
+
+    return attn_output, attn_weights
+
+
+class HubertAttention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+    # `config` is optional: without it `forward` always uses `eager_attention_forward`.
+    def __init__(
+        self,
+        embed_dim: int,
+        num_heads: int,
+        dropout: float = 0.0,
+        is_decoder: bool = False,
+        bias: bool = True,
+        is_causal: bool = False,
+        config: Optional[HubertConfig] = None,
+    ):
+        super().__init__()
+        self.embed_dim = embed_dim
+        self.num_heads = num_heads
+        self.dropout = dropout
+        self.head_dim = embed_dim // num_heads
+        self.config = config
+        # integer division above must be exact, otherwise the heads cannot tile `embed_dim`
+        if (self.head_dim * num_heads) != self.embed_dim:
+            raise ValueError(
+                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
+                f" and `num_heads`: {num_heads})."
+            )
+        self.scaling = self.head_dim**-0.5
+        self.is_decoder = is_decoder
+        self.is_causal = is_causal
+
+        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        key_value_states: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        layer_head_mask: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = False,
+        # TODO: we need a refactor so that the different attention modules can get their specific kwargs
+        # ATM, we have mixed things encoder, decoder, and encoder-decoder attn
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
+        """Input shape: Batch x Time x Channel. Returns `(attn_output, attn_weights, None)`."""
+        # The third tuple element is always `None` in this implementation.
+        # if key_value_states are provided this layer is used as a cross-attention layer
+        # for the decoder
+        is_cross_attention = key_value_states is not None
+
+        # determine input shapes
+        bsz, tgt_len = hidden_states.shape[:-1]
+        src_len = key_value_states.shape[1] if is_cross_attention else tgt_len
+        # -1 lets `view` infer the number of heads from the projected width
+        q_input_shape = (bsz, tgt_len, -1, self.head_dim)
+        kv_input_shape = (bsz, src_len, -1, self.head_dim)
+
+        # get query proj
+        query_states = self.q_proj(hidden_states).view(*q_input_shape).transpose(1, 2)
+        # keys/values come from `key_value_states` for cross-attention, otherwise from `hidden_states`
+        current_states = key_value_states if is_cross_attention else hidden_states
+        key_states = self.k_proj(current_states).view(*kv_input_shape).transpose(1, 2)
+        value_states = self.v_proj(current_states).view(*kv_input_shape).transpose(1, 2)
+        # `config` defaults to None in __init__; keep the eager implementation in that case instead of raising
+        attention_interface: Callable = eager_attention_forward
+        if self.config is not None and self.config._attn_implementation != "eager":
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            dropout=0.0 if not self.training else self.dropout,
+            scaling=self.scaling,
+            output_attentions=output_attentions,
+            head_mask=layer_head_mask,
+            **kwargs,
+        )
+        # merge the head dims back into one channel dim before the output projection
+        attn_output = attn_output.reshape(bsz, tgt_len, -1).contiguous()
+        attn_output = self.out_proj(attn_output)
+
+        return attn_output, attn_weights, None
+
+
+class HubertFeedForward(nn.Module):  # Linear -> activation -> dropout -> Linear -> dropout
+    def __init__(self, config):
+        super().__init__()
+        self.intermediate_dropout = nn.Dropout(config.activation_dropout)
+        # hidden_size -> intermediate_size
+        self.intermediate_dense = nn.Linear(config.hidden_size, config.intermediate_size)
+        if isinstance(config.hidden_act, str):  # a string is looked up in ACT2FN, anything else is used as given
+            self.intermediate_act_fn = ACT2FN[config.hidden_act]
+        else:
+            self.intermediate_act_fn = config.hidden_act
+        # intermediate_size -> hidden_size
+        self.output_dense = nn.Linear(config.intermediate_size, config.hidden_size)
+        self.output_dropout = nn.Dropout(config.hidden_dropout)
+
+    def forward(self, hidden_states):
+        hidden_states = self.intermediate_dense(hidden_states)
+        hidden_states = self.intermediate_act_fn(hidden_states)
+        hidden_states = self.intermediate_dropout(hidden_states)
+
+        hidden_states = self.output_dense(hidden_states)
+        hidden_states = self.output_dropout(hidden_states)
+        return hidden_states
+
+
+class HubertEncoderLayer(GradientCheckpointingLayer):  # attention + feed-forward, each normalised after its residual add
+    def __init__(self, config):
+        super().__init__()
+        self.attention = HubertAttention(
+            embed_dim=config.hidden_size,
+            num_heads=config.num_attention_heads,
+            dropout=config.attention_dropout,
+            is_decoder=False,
+            config=config,
+        )
+
+        self.dropout = nn.Dropout(config.hidden_dropout)
+        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.feed_forward = HubertFeedForward(config)
+        self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+
+    def forward(self, hidden_states, attention_mask=None, output_attentions=False):
+        attn_residual = hidden_states
+        hidden_states, attn_weights, _ = self.attention(
+            hidden_states, attention_mask=attention_mask, output_attentions=output_attentions
+        )
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = attn_residual + hidden_states
+        # layer norm runs after the attention residual; the feed-forward residual is normalised by `final_layer_norm`
+        hidden_states = self.layer_norm(hidden_states)
+        hidden_states = hidden_states + self.feed_forward(hidden_states)
+        hidden_states = self.final_layer_norm(hidden_states)
+
+        outputs = (hidden_states,)
+        # attention weights are appended only on request, so the tuple has length 1 or 2
+        if output_attentions:
+            outputs += (attn_weights,)
+
+        return outputs
+
+
+class HubertEncoder(nn.Module):  # positional conv embedding + LayerNorm + dropout, then a stack of HubertEncoderLayer
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.pos_conv_embed = HubertPositionalConvEmbedding(config)
+        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout)
+        self.layers = nn.ModuleList([HubertEncoderLayer(config) for _ in range(config.num_hidden_layers)])
+        self.gradient_checkpointing = False
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+    ):
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attentions = () if output_attentions else None
+        # NOTE: the masking below writes into `hidden_states` in place and uses `~` as a logical not (boolean mask).
+        if attention_mask is not None:
+            # make sure padded tokens output 0
+            expand_attention_mask = attention_mask.unsqueeze(-1).repeat(1, 1, hidden_states.shape[2])
+            hidden_states[~expand_attention_mask] = 0
+        # convert the 2D mask into whatever form the configured attention implementation expects
+        attention_mask = self._update_full_mask(
+            attention_mask,
+            hidden_states,
+        )
+        # positional information is added to the input, then normalised and dropped out before the layer stack
+        position_embeddings = self.pos_conv_embed(hidden_states)
+        hidden_states = hidden_states + position_embeddings
+        hidden_states = self.layer_norm(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+
+        synced_gpus = is_deepspeed_zero3_enabled() or is_fsdp_managed_module(self)
+
+        for layer in self.layers:
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)
+
+            # add LayerDrop (see https://huggingface.co/papers/1909.11556 for description)
+            dropout_probability = torch.rand([])
+            # a layer is only ever skipped in training mode
+            skip_the_layer = self.training and dropout_probability < self.config.layerdrop
+            if not skip_the_layer or synced_gpus:
+                # under fsdp or deepspeed zero3 all gpus must run in sync, so the layer runs even when selected
+                layer_outputs = layer(
+                    hidden_states, attention_mask=attention_mask, output_attentions=output_attentions
+                )
+                hidden_states = layer_outputs[0]
+            # a layer selected for LayerDrop reports `None` as its attention weights
+            if skip_the_layer:
+                layer_outputs = (None, None)
+
+            if output_attentions:
+                all_self_attentions = all_self_attentions + (layer_outputs[1],)
+
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+        # tuple output omits the entries that were not requested (those left as None)
+        if not return_dict:
+            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
+        return BaseModelOutput(
+            last_hidden_state=hidden_states,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attentions,
+        )
+
+    def _update_full_mask(
+        self,
+        attention_mask: Union[torch.Tensor, None],
+        inputs_embeds: torch.Tensor,
+    ):
+        if attention_mask is not None:
+            if self.config._attn_implementation == "flash_attention_2":
+                attention_mask = attention_mask if 0 in attention_mask else None  # keep the mask only if it contains a 0
+            elif self.config._attn_implementation == "sdpa":
+                # SDPA path: the 2D mask is expanded by the SDPA-specific helper, using the dtype of
+                # `inputs_embeds`.
+                # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+                attention_mask = _prepare_4d_attention_mask_for_sdpa(attention_mask, inputs_embeds.dtype)
+            elif self.config._attn_implementation == "flex_attention":
+                if isinstance(attention_mask, torch.Tensor):
+                    attention_mask = make_flex_block_causal_mask(attention_mask, is_causal=False)
+            else:
+                # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+                attention_mask = _prepare_4d_attention_mask(attention_mask, inputs_embeds.dtype)
+
+        return attention_mask
+
+
+class HubertAttnAdapterLayer(nn.Module):
+    def __init__(self, config):
+        """
+        Bottleneck adapter: LayerNorm over `config.hidden_size`, a Linear down to `config.adapter_attn_dim`, ReLU,
+        and a Linear back up to `config.hidden_size`.
+        """
+        super().__init__()
+        self.input_dim = config.adapter_attn_dim  # width of the bottleneck
+        self.hidden_dim = config.hidden_size
+
+        self.norm = nn.LayerNorm(self.hidden_dim)
+        self.linear_1 = nn.Linear(self.hidden_dim, self.input_dim)
+        self.act_fn = nn.ReLU()
+        self.linear_2 = nn.Linear(self.input_dim, self.hidden_dim)
+
+    def forward(self, hidden_states: torch.FloatTensor):
+        hidden_states = self.norm(hidden_states)
+        # down-project, ReLU, up-project; no residual is added here
+        hidden_states = self.linear_1(hidden_states)
+        hidden_states = self.act_fn(hidden_states)
+        hidden_states = self.linear_2(hidden_states)
+
+        return hidden_states
+
+
+class HubertEncoderLayerStableLayerNorm(GradientCheckpointingLayer):  # encoder layer normalising before each sub-block
+    def __init__(self, config):
+        super().__init__()
+        self.attention = HubertAttention(
+            embed_dim=config.hidden_size,
+            num_heads=config.num_attention_heads,
+            dropout=config.attention_dropout,
+            is_decoder=False,
+            config=config,
+        )
+        self.dropout = nn.Dropout(config.hidden_dropout)
+        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.feed_forward = HubertFeedForward(config)
+        self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        # the adapter is only built when the config carries a non-None `adapter_attn_dim`
+        if getattr(config, "adapter_attn_dim", None) is not None:
+            self.adapter_layer = HubertAttnAdapterLayer(config)
+        else:
+            self.adapter_layer = None
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+    ):
+        attn_residual = hidden_states  # residual is taken before the layer norm
+        hidden_states = self.layer_norm(hidden_states)
+        hidden_states, attn_weights, _ = self.attention(
+            hidden_states, attention_mask=attention_mask, output_attentions=output_attentions
+        )
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = attn_residual + hidden_states
+        hidden_states = hidden_states + self.feed_forward(self.final_layer_norm(hidden_states))
+        # optional adapter output is added as a further residual
+        if self.adapter_layer is not None:
+            hidden_states = hidden_states + self.adapter_layer(hidden_states)
+
+        outputs = (hidden_states,)
+        # attention weights are appended only on request, so the tuple has length 1 or 2
+        if output_attentions:
+            outputs += (attn_weights,)
+
+        return outputs
+
+
+class HubertEncoderStableLayerNorm(nn.Module):  # encoder variant whose LayerNorm runs after the layer stack
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.pos_conv_embed = HubertPositionalConvEmbedding(config)
+        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout)
+        self.layers = nn.ModuleList(
+            [HubertEncoderLayerStableLayerNorm(config) for _ in range(config.num_hidden_layers)]
+        )
+        self.gradient_checkpointing = False
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask=None,
+        output_attentions=False,
+        output_hidden_states=False,
+        return_dict=True,
+    ):
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attentions = () if output_attentions else None
+        # NOTE: the masking below writes into `hidden_states` in place and uses `~` as a logical not (boolean mask).
+        if attention_mask is not None:
+            # make sure padded tokens output 0
+            expand_attention_mask = attention_mask.unsqueeze(-1).repeat(1, 1, hidden_states.shape[2])
+            hidden_states[~expand_attention_mask] = 0
+        # convert the 2D mask into whatever form the configured attention implementation expects
+        attention_mask = self._update_full_mask(
+            attention_mask,
+            hidden_states,
+        )
+        # positional information is added and dropped out; no layer norm is applied before the layer stack
+        position_embeddings = self.pos_conv_embed(hidden_states)
+        hidden_states = hidden_states + position_embeddings
+        hidden_states = self.dropout(hidden_states)
+
+        synced_gpus = is_deepspeed_zero3_enabled() or is_fsdp_managed_module(self)
+
+        for layer in self.layers:
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)
+
+            # add LayerDrop (see https://huggingface.co/papers/1909.11556 for description)
+            dropout_probability = torch.rand([])
+            # a layer is only ever skipped in training mode
+            skip_the_layer = self.training and dropout_probability < self.config.layerdrop
+            if not skip_the_layer or synced_gpus:
+                # under fsdp or deepspeed zero3 all gpus must run in sync
+                # XXX: could optimize this like synced_gpus in generate_utils but not sure if it's worth the code complication
+                layer_outputs = layer(
+                    hidden_states, attention_mask=attention_mask, output_attentions=output_attentions
+                )
+                hidden_states = layer_outputs[0]
+            # a layer selected for LayerDrop reports `None` as its attention weights
+            if skip_the_layer:
+                layer_outputs = (None, None)
+
+            if output_attentions:
+                all_self_attentions = all_self_attentions + (layer_outputs[1],)
+        # the single encoder-level layer norm is applied once, after all layers
+        hidden_states = self.layer_norm(hidden_states)
+
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+        # tuple output omits the entries that were not requested (those left as None)
+        if not return_dict:
+            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
+        return BaseModelOutput(
+            last_hidden_state=hidden_states,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attentions,
+        )
+
+    def _update_full_mask(
+        self,
+        attention_mask: Union[torch.Tensor, None],
+        inputs_embeds: torch.Tensor,
+    ):
+        if attention_mask is not None:
+            if self.config._attn_implementation == "flash_attention_2":
+                attention_mask = attention_mask if 0 in attention_mask else None  # keep the mask only if it contains a 0
+            elif self.config._attn_implementation == "sdpa":
+                # SDPA path: the 2D mask is expanded by the SDPA-specific helper, using the dtype of
+                # `inputs_embeds`.
+                # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+                attention_mask = _prepare_4d_attention_mask_for_sdpa(attention_mask, inputs_embeds.dtype)
+            elif self.config._attn_implementation == "flex_attention":
+                if isinstance(attention_mask, torch.Tensor):
+                    attention_mask = make_flex_block_causal_mask(attention_mask, is_causal=False)
+            else:
+                # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+                attention_mask = _prepare_4d_attention_mask(attention_mask, inputs_embeds.dtype)
+
+        return attention_mask
+
+
+@auto_docstring
+class HubertPreTrainedModel(PreTrainedModel):
+    config: HubertConfig
+    base_model_prefix = "hubert"
+    main_input_name = "input_values"
+    supports_gradient_checkpointing = True
+    _supports_flash_attn = True
+    _supports_sdpa = True
+    _supports_flex_attn = True
+
+    def _init_weights(self, module):
+        """Initialize the weights of `module` according to its type"""
+        if isinstance(module, nn.Linear):
+            # Slightly different from the TF version which uses truncated_normal for initialization
+            # cf https://github.com/pytorch/pytorch/pull/5617
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, (nn.LayerNorm, nn.GroupNorm, nn.BatchNorm1d)):  # norms: bias 0, weight 1
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+        elif isinstance(module, nn.Conv1d):  # convs: Kaiming-normal weight, zero bias
+            if is_deepspeed_zero3_enabled():
+                import deepspeed
+
+                if hasattr(module, "weight_v") and hasattr(module, "weight_g"):  # legacy weight-norm attributes
+                    with deepspeed.zero.GatheredParameters([module.weight_v, module.weight_g], modifier_rank=0):
+                        nn.init.kaiming_normal_(module.weight.data)
+                else:
+                    with deepspeed.zero.GatheredParameters(module.weight, modifier_rank=0):
+                        nn.init.kaiming_normal_(module.weight.data)
+            else:
+                nn.init.kaiming_normal_(module.weight.data)
+            # the bias, when present, is zeroed on every path above
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, HubertModel):
+            if hasattr(module, "masked_spec_embed"):
+                module.masked_spec_embed.data.uniform_()
+        elif isinstance(module, HubertForSequenceClassification):
+            if hasattr(module, "layer_weights"):
+                module.layer_weights.data.fill_(1.0 / (self.config.num_hidden_layers + 1))  # uniform weights
+
+    def _get_feat_extract_output_lengths(self, input_lengths: Union[torch.LongTensor, int]):
+        """
+        Computes the output length of the convolutional layers
+        """
+        # one application of the length formula per (kernel, stride) pair of the config
+        def _conv_out_length(input_length, kernel_size, stride):
+            # 1D convolutional layer output length formula taken
+            # from https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html
+            return torch.div(input_length - kernel_size, stride, rounding_mode="floor") + 1
+
+        for kernel_size, stride in zip(self.config.conv_kernel, self.config.conv_stride):
+            input_lengths = _conv_out_length(input_lengths, kernel_size, stride)
+
+        return input_lengths
+
+    def _get_feature_vector_attention_mask(self, feature_vector_length: int, attention_mask: torch.LongTensor):
+        output_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(-1)).to(torch.long)
+        batch_size = attention_mask.shape[0]
+        # start from an all-zero mask of shape (batch_size, feature_vector_length)
+        attention_mask = torch.zeros(
+            (batch_size, feature_vector_length), dtype=attention_mask.dtype, device=attention_mask.device
+        )
+        # these two operations make sure that all values before the output lengths idxs are attended to
+        attention_mask[(torch.arange(attention_mask.shape[0], device=attention_mask.device), output_lengths - 1)] = 1
+        attention_mask = attention_mask.flip([-1]).cumsum(-1).flip([-1]).bool()
+        return attention_mask
+
+
+def _compute_mask_indices(
+    shape: tuple[int, int],
+    mask_prob: float,
+    mask_length: int,
+    attention_mask: Optional[torch.LongTensor] = None,
+    min_masks: int = 0,
+) -> np.ndarray:
+    """
+    Computes random mask spans for a given shape. Used to implement [SpecAugment: A Simple Data Augmentation Method for
+    ASR](https://huggingface.co/papers/1904.08779). Note that this method is not optimized to run on TPU and should be run on
+    CPU as part of the preprocessing during training.
+
+    Args:
+        shape: The shape for which to compute masks. This should be a tuple of size 2 where
+               the first element is the batch size and the second element is the length of the axis to span.
+        mask_prob:  The percentage of the whole axis (between 0 and 1) which will be masked. The number of
+                    independently generated mask spans of length `mask_length` is computed by
+                    `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
+                    actual percentage will be smaller.
+        mask_length: size of the mask
+        min_masks: minimum number of masked spans
+        attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
+                        each batch dimension.
+    """
+    batch_size, sequence_length = shape
+
+    if mask_length < 1:
+        raise ValueError("`mask_length` has to be bigger than 0.")
+
+    if mask_length > sequence_length:
+        raise ValueError(
+            f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length}"
+            f" and `sequence_length`: {sequence_length}`"
+        )
+
+    # epsilon is used for probabilistic rounding
+    epsilon = np.random.rand(1).item()
+
+    def compute_num_masked_span(input_length):
+        """Given input length, compute how many spans should be masked"""
+        num_masked_span = int(mask_prob * input_length / mask_length + epsilon)
+        num_masked_span = max(num_masked_span, min_masks)
+
+        # make sure num masked span <= sequence_length
+        if num_masked_span * mask_length > sequence_length:
+            num_masked_span = sequence_length // mask_length
+
+        # make sure num_masked span is also <= input_length - (mask_length - 1)
+        if input_length - (mask_length - 1) < num_masked_span:
+            num_masked_span = max(input_length - (mask_length - 1), 0)
+
+        return num_masked_span
+
+    # compute number of masked spans in batch
+    input_lengths = (
+        attention_mask.detach().sum(-1).tolist()
+        if attention_mask is not None
+        else [sequence_length for _ in range(batch_size)]
+    )
+
+    # SpecAugment mask to fill
+    spec_aug_mask = np.zeros((batch_size, sequence_length), dtype=bool)
+    spec_aug_mask_idxs = []
+
+    max_num_masked_span = compute_num_masked_span(sequence_length)
+
+    if max_num_masked_span == 0:
+        return spec_aug_mask
+
+    for input_length in input_lengths:
+        # compute num of masked spans for this input
+        num_masked_span = compute_num_masked_span(input_length)
+
+        # get random indices to mask
+        spec_aug_mask_idx = np.random.choice(
+            np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False
+        )
+
+        # pick first sampled index that will serve as a dummy index to pad vector
+        # to ensure same dimension for all batches due to probabilistic rounding
+        # Picking first sample just pads those vectors twice.
+        if len(spec_aug_mask_idx) == 0:
+            # this case can only happen if `input_length` is strictly smaller then
+            # `sequence_length` in which case the last token has to be a padding
+            # token which we can use as a dummy mask id
+            dummy_mask_idx = sequence_length - 1
+        else:
+            dummy_mask_idx = spec_aug_mask_idx[0]
+
+        spec_aug_mask_idx = np.concatenate(
+            [spec_aug_mask_idx, np.ones(max_num_masked_span - num_masked_span, dtype=np.int32) * dummy_mask_idx]
+        )
+        spec_aug_mask_idxs.append(spec_aug_mask_idx)
+
+    spec_aug_mask_idxs = np.array(spec_aug_mask_idxs)
+
+    # expand masked indices to masked spans
+    spec_aug_mask_idxs = np.broadcast_to(
+        spec_aug_mask_idxs[:, :, None], (batch_size, max_num_masked_span, mask_length)
+    )
+    spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length)
+
+    # add offset to the starting indexes so that indexes now create a span
+    offsets = np.arange(mask_length)[None, None, :]
+    offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape(
+        batch_size, max_num_masked_span * mask_length
+    )
+    spec_aug_mask_idxs = spec_aug_mask_idxs + offsets
+
+    # ensure that we cannot have indices larger than sequence_length
+    if spec_aug_mask_idxs.max() > sequence_length - 1:
+        spec_aug_mask_idxs[spec_aug_mask_idxs > sequence_length - 1] = sequence_length - 1
+
+    # scatter indices to mask
+    np.put_along_axis(spec_aug_mask, spec_aug_mask_idxs, 1, -1)
+
+    return spec_aug_mask
+
+
+@auto_docstring
+class HubertModel(HubertPreTrainedModel):
+    def __init__(self, config: HubertConfig):
+        super().__init__(config)
+        self.config = config
+        self.feature_extractor = HubertFeatureEncoder(config)
+        self.feature_projection = HubertFeatureProjection(config)
+
+        # model only needs masking vector if mask prob is > 0.0
+        if config.mask_time_prob > 0.0 or config.mask_feature_prob > 0.0:
+            self.masked_spec_embed = nn.Parameter(torch.Tensor(config.hidden_size).uniform_())
+
+        if config.do_stable_layer_norm:
+            self.encoder = HubertEncoderStableLayerNorm(config)
+        else:
+            self.encoder = HubertEncoder(config)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def _mask_hidden_states(
+        self,
+        hidden_states: torch.FloatTensor,
+        mask_time_indices: Optional[torch.BoolTensor] = None,
+        attention_mask: Optional[torch.LongTensor] = None,
+    ):
+        """
+        Masks extracted features along time axis and/or along feature axis according to
+        [SpecAugment](https://huggingface.co/papers/1904.08779).
+        """
+        # `config.apply_spec_augment` can set masking to False
+        if not getattr(self.config, "apply_spec_augment", True):
+            return hidden_states
+
+        # generate indices & apply SpecAugment along time axis
+        batch_size, sequence_length, hidden_size = hidden_states.size()
+
+        if mask_time_indices is not None:
+            # apply SpecAugment along time axis with given mask_time_indices
+            hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype)
+        elif self.config.mask_time_prob > 0 and self.training:
+            mask_time_indices = _compute_mask_indices(
+                (batch_size, sequence_length),
+                mask_prob=self.config.mask_time_prob,
+                mask_length=self.config.mask_time_length,
+                attention_mask=attention_mask,
+                min_masks=self.config.mask_time_min_masks,
+            )
+            mask_time_indices = torch.tensor(mask_time_indices, device=hidden_states.device, dtype=torch.bool)
+            hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype)
+
+        if self.config.mask_feature_prob > 0 and self.training:
+            # generate indices & apply SpecAugment along feature axis
+            mask_feature_indices = _compute_mask_indices(
+                (batch_size, hidden_size),
+                mask_prob=self.config.mask_feature_prob,
+                mask_length=self.config.mask_feature_length,
+                min_masks=self.config.mask_feature_min_masks,
+            )
+            mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool)
+            mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1)
+            hidden_states[mask_feature_indices] = 0
+
+        return hidden_states
+
+    @auto_docstring
+    def forward(
+        self,
+        input_values: Optional[torch.Tensor],
+        attention_mask: Optional[torch.Tensor] = None,
+        mask_time_indices: Optional[torch.BoolTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, BaseModelOutput]:
+        r"""
+        mask_time_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Positions of the extracted feature sequence to overwrite with the learned `masked_spec_embed` vector before
+            the encoder. If omitted, a random time mask is drawn in training mode when `config.mask_time_prob > 0`.
+
+        Example:
+
+        ```python
+        >>> from transformers import AutoProcessor, HubertModel
+        >>> from datasets import load_dataset
+
+        >>> processor = AutoProcessor.from_pretrained("facebook/hubert-large-ls960-ft")
+        >>> model = HubertModel.from_pretrained("facebook/hubert-large-ls960-ft")
+
+        >>> def map_to_array(example):
+        ...     example["speech"] = example["audio"]["array"]
+        ...     return example
+
+        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+        >>> ds = ds.map(map_to_array)
+
+        >>> input_values = processor(ds["speech"][0], return_tensors="pt").input_values  # Batch size 1
+        >>> hidden_states = model(input_values).last_hidden_state
+        ```"""
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        extract_features = self.feature_extractor(input_values)
+        extract_features = extract_features.transpose(1, 2)
+
+        if attention_mask is not None:
+            # compute reduced attention_mask corresponding to feature vectors
+            attention_mask = self._get_feature_vector_attention_mask(extract_features.shape[1], attention_mask)
+
+        hidden_states = self.feature_projection(extract_features)
+        # pass the reduced attention mask so random SpecAugment spans are sampled from each row's unpadded frames
+        hidden_states = self._mask_hidden_states(
+            hidden_states, mask_time_indices=mask_time_indices, attention_mask=attention_mask
+        )
+
+        encoder_outputs = self.encoder(
+            hidden_states,
+            attention_mask=attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        hidden_states = encoder_outputs[0]
+
+        if not return_dict:
+            return (hidden_states,) + encoder_outputs[1:]
+
+        return BaseModelOutput(
+            last_hidden_state=hidden_states,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+        )
+
+
+_HIDDEN_STATES_START_POSITION = 1  # index into the `HubertModel` outputs where the heads below read hidden states
+
+
+@auto_docstring(
+    custom_intro="""
+    Hubert Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).
+    """
+)
+class HubertForCTC(HubertPreTrainedModel):
+    def __init__(self, config, target_lang: Optional[str] = None):
+        r"""
+        target_lang (`str`, *optional*):
+            Language id of adapter weights. Adapter weights are stored in the format adapter.<lang>.safetensors or
+            adapter.<lang>.bin. Only relevant when using an instance of [`HubertForCTC`] with adapters. Uses 'eng' by
+            default.
+        """
+        super().__init__(config)
+
+        self.hubert = HubertModel(config)
+        self.dropout = nn.Dropout(config.final_dropout)
+
+        self.target_lang = target_lang
+
+        if config.vocab_size is None:
+            raise ValueError(
+                f"You are trying to instantiate {self.__class__} with a configuration that "
+                "does not define the vocabulary size of the language model head. Please "
+                "instantiate the model as follows: `HubertForCTC.from_pretrained(..., vocab_size=vocab_size)`. "
+                "or define `vocab_size` of your model's configuration."
+            )
+        output_hidden_size = (
+            config.output_hidden_size if hasattr(config, "add_adapter") and config.add_adapter else config.hidden_size
+        )
+        self.lm_head = nn.Linear(output_hidden_size, config.vocab_size)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def tie_weights(self):
+        """
+        This method overwrites [`~PreTrainedModel.tie_weights`] so that adapter weights can be correctly loaded when
+        passing `target_lang=...` to `from_pretrained(...)`.
+
+        This method is **not** supposed to be called by the user and is prone to be changed in the future.
+        """
+
+        # Note that `tie_weights` is usually used to tie input and output embedding weights. The method is re-purposed to
+        # correctly load adapter layers for Hubert so that we do not have to introduce a new API to
+        # [`PreTrainedModel`]. While slightly hacky, Hubert never has to tie input and output embeddings, so that it is
+        # ok to repurpose this function here.
+        target_lang = self.target_lang
+
+        if target_lang is not None and getattr(self.config, "adapter_attn_dim", None) is None:
+            raise ValueError(f"Cannot pass `target_lang`: {target_lang} if `config.adapter_attn_dim` is not defined.")
+        elif target_lang is None and getattr(self.config, "adapter_attn_dim", None) is not None:
+            logger.info("By default `target_lang` is set to 'eng'.")
+        elif target_lang is not None:
+            self.load_adapter(target_lang, force_load=True)
+
+    def freeze_feature_extractor(self):
+        """
+        Deprecated alias of `freeze_feature_encoder`: emits a `FutureWarning` and then disables the gradient
+        computation for the feature encoder so that its parameters will not be updated during training.
+        """
+        warnings.warn(
+            "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
+            "Please use the equivalent `freeze_feature_encoder` method instead.",
+            FutureWarning,
+        )
+        self.freeze_feature_encoder()
+
+    def freeze_feature_encoder(self):
+        """
+        Calling this function will disable the gradient computation for the feature encoder so that its parameters will
+        not be updated during training.
+        """
+        self.hubert.feature_extractor._freeze_parameters()
+
+    def freeze_base_model(self):
+        """
+        Calling this function will disable the gradient computation for the base model so that its parameters will not
+        be updated during training. Only the classification head will be updated.
+        """
+        for param in self.hubert.parameters():
+            param.requires_grad = False
+
+    @auto_docstring
+    def forward(
+        self,
+        input_values: Optional[torch.Tensor],
+        attention_mask: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        labels: Optional[torch.Tensor] = None,
+    ) -> Union[tuple, CausalLMOutput]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*):
+            Labels for connectionist temporal classification. Note that `target_length` has to be smaller or equal to
+            the sequence length of the output logits. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`.
+            All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ...,
+            config.vocab_size - 1]`.
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # valid label ids are strictly below vocab_size, so the message must say `<`, not `<=`
+        if labels is not None and labels.max() >= self.config.vocab_size:
+            raise ValueError(f"Label values must be < vocab_size: {self.config.vocab_size}")
+        outputs = self.hubert(
+            input_values,
+            attention_mask=attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        hidden_states = outputs[0]
+        hidden_states = self.dropout(hidden_states)
+
+        logits = self.lm_head(hidden_states)
+
+        loss = None
+        if labels is not None:
+            # retrieve loss input_lengths from attention_mask
+            attention_mask = (
+                attention_mask if attention_mask is not None else torch.ones_like(input_values, dtype=torch.long)
+            )
+            input_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(-1)).to(torch.long)
+
+            # assuming that padded tokens are filled with -100
+            # when not being attended to
+            labels_mask = labels >= 0
+            target_lengths = labels_mask.sum(-1)
+            flattened_targets = labels.masked_select(labels_mask)
+
+            # ctc_loss doesn't support fp16
+            log_probs = nn.functional.log_softmax(logits, dim=-1, dtype=torch.float32).transpose(0, 1)
+
+            with torch.backends.cudnn.flags(enabled=False):
+                loss = nn.functional.ctc_loss(
+                    log_probs,
+                    flattened_targets,
+                    input_lengths,
+                    target_lengths,
+                    blank=self.config.pad_token_id,
+                    reduction=self.config.ctc_loss_reduction,
+                    zero_infinity=self.config.ctc_zero_infinity,
+                )
+
+        if not return_dict:
+            output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:]
+            return ((loss,) + output) if loss is not None else output
+
+        return CausalLMOutput(
+            loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    Hubert Model with a sequence classification head on top (a linear layer over the pooled output) for tasks like
+    SUPERB Keyword Spotting.
+    """
+)
+class HubertForSequenceClassification(HubertPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+
+        if hasattr(config, "add_adapter") and config.add_adapter:
+            raise ValueError(
+                "Sequence classification does not support the use of Hubert adapters (config.add_adapter=True)"
+            )
+        self.hubert = HubertModel(config)
+        num_layers = config.num_hidden_layers + 1  # transformer layers + input embeddings
+        if config.use_weighted_layer_sum:
+            self.layer_weights = nn.Parameter(torch.ones(num_layers) / num_layers)
+        self.projector = nn.Linear(config.hidden_size, config.classifier_proj_size)
+        self.classifier = nn.Linear(config.classifier_proj_size, config.num_labels)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def freeze_feature_extractor(self):
+        """
+        Deprecated alias of `freeze_feature_encoder`: emits a `FutureWarning` and then disables the gradient
+        computation for the feature encoder so that its parameters will not be updated during training.
+        """
+        warnings.warn(
+            "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
+            "Please use the equivalent `freeze_feature_encoder` method instead.",
+            FutureWarning,
+        )
+        self.freeze_feature_encoder()
+
+    def freeze_feature_encoder(self):
+        """
+        Calling this function will disable the gradient computation for the feature encoder so that its parameters will
+        not be updated during training.
+        """
+        self.hubert.feature_extractor._freeze_parameters()
+
+    def freeze_base_model(self):
+        """
+        Calling this function will disable the gradient computation for the base model so that its parameters will not
+        be updated during training. Only the classification head will be updated.
+        """
+        for param in self.hubert.parameters():
+            param.requires_grad = False
+
+    @auto_docstring
+    def forward(
+        self,
+        input_values: Optional[torch.Tensor],
+        attention_mask: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        labels: Optional[torch.Tensor] = None,
+    ) -> Union[tuple, SequenceClassifierOutput]:
+        r"""
+        input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
+            Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
+            into an array of type `list[float]`, a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library
+            (`pip install torchcodec`) or the soundfile library (`pip install soundfile`).
+            To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and conversion
+            into a tensor of type `torch.FloatTensor`. See [`HubertProcessor.__call__`] for details.
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the sequence classification loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. When provided, a Cross-Entropy loss over the `config.num_labels` logits is
+            returned; no regression loss is computed by this head, even if `config.num_labels == 1`.
+        """
+
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states
+
+        outputs = self.hubert(
+            input_values,
+            attention_mask=attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        if self.config.use_weighted_layer_sum:
+            hidden_states = outputs[_HIDDEN_STATES_START_POSITION]
+            hidden_states = torch.stack(hidden_states, dim=1)
+            norm_weights = nn.functional.softmax(self.layer_weights, dim=-1)
+            hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1)
+        else:
+            hidden_states = outputs[0]
+
+        hidden_states = self.projector(hidden_states)
+        if attention_mask is None:
+            pooled_output = hidden_states.mean(dim=1)
+        else:
+            padding_mask = self._get_feature_vector_attention_mask(hidden_states.shape[1], attention_mask)
+            expand_padding_mask = padding_mask.unsqueeze(-1).repeat(1, 1, hidden_states.shape[2])
+            hidden_states[~expand_padding_mask] = 0.0
+            pooled_output = hidden_states.sum(dim=1) / padding_mask.sum(dim=1).view(-1, 1)
+
+        logits = self.classifier(pooled_output)
+
+        loss = None
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()
+            loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1))
+
+        if not return_dict:
+            output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:]
+            return ((loss,) + output) if loss is not None else output
+
+        return SequenceClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+__all__ = ["HubertForCTC", "HubertForSequenceClassification", "HubertModel", "HubertPreTrainedModel"]
diff --git a/phivenv/Lib/site-packages/transformers/models/hubert/modeling_tf_hubert.py b/phivenv/Lib/site-packages/transformers/models/hubert/modeling_tf_hubert.py
new file mode 100644
index 0000000000000000000000000000000000000000..d3b462692a593f487fcc6e1b3f6b67cf91df050c
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/hubert/modeling_tf_hubert.py
@@ -0,0 +1,1671 @@
+# coding=utf-8
+# Copyright 2021 The Fairseq Authors and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""TensorFlow Hubert model."""
+
+from __future__ import annotations
+
+import warnings
+from typing import Any
+
+import numpy as np
+import tensorflow as tf
+
+from ...activations_tf import get_tf_activation
+from ...modeling_tf_outputs import TFBaseModelOutput, TFCausalLMOutput
+from ...modeling_tf_utils import (
+    TFPreTrainedModel,
+    get_initializer,
+    keras,
+    keras_serializable,
+    unpack_inputs,
+)
+from ...tf_utils import shape_list, stable_softmax
+from ...utils import (
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+    replace_return_docstrings,
+)
+from .configuration_hubert import HubertConfig
+
+
+logger = logging.get_logger(__name__)
+
+_CONFIG_FOR_DOC = "HubertConfig"
+
+
+LARGE_NEGATIVE = -1e8  # value `_expand_mask` assigns to positions whose mask entry is 0
+
+
+# Copied from transformers.models.wav2vec2.modeling_tf_wav2vec2._sample_without_replacement
+def _sample_without_replacement(distribution, num_samples):
+    """
+    Stand-in for categorical sampling without replacement: adds `-log(U)` noise, `U ~ Uniform(0, 1)`, to
+    `distribution` and returns the top-`num_samples` indices - see https://github.com/tensorflow/tensorflow/issues/9260
+    """
+    z = -tf.math.log(tf.random.uniform(shape_list(distribution), 0, 1))
+    _, indices = tf.nn.top_k(distribution + z, num_samples)
+    return indices
+
+
+# Copied from transformers.models.wav2vec2.modeling_tf_wav2vec2._scatter_values_on_batch_indices
+def _scatter_values_on_batch_indices(values, batch_indices, output_shape):
+    """
+    Scatter function as in PyTorch with indices in format (batch_dim, indices)
+    """
+    indices_shape = shape_list(batch_indices)
+    # build a flat row of batch ids, one per entry of `batch_indices`
+    broad_casted_batch_dims = tf.reshape(
+        tf.broadcast_to(tf.expand_dims(tf.range(indices_shape[0]), axis=-1), indices_shape), [1, -1]
+    )
+    # pair every batch id with its flattened index to get (batch, position) coordinates
+    pair_indices = tf.transpose(tf.concat([broad_casted_batch_dims, tf.reshape(batch_indices, [1, -1])], 0))
+    # scatter the flattened values to those coordinates in a tensor of `output_shape`
+    return tf.scatter_nd(pair_indices, tf.reshape(values, [-1]), output_shape)
+
+
+# Copied from transformers.models.wav2vec2.modeling_tf_wav2vec2._compute_mask_indices
+def _compute_mask_indices(
+    shape: tuple[int, int],
+    mask_prob: float,
+    mask_length: int,
+    min_masks: int = 0,
+) -> tf.Tensor:
+    """
+    Computes random mask spans for a given shape
+
+    Args:
+        shape: the shape for which to compute masks.
+            should be of size 2 where first element is batch size and 2nd is timesteps
+        mask_prob:
+            probability for each token to be chosen as start of the span to be masked. this will be multiplied by
+            number of timesteps divided by length of mask span to mask approximately this percentage of all elements.
+            however due to overlaps, the actual number will be smaller
+        mask_length: size of the mask, has to be at least 1 and strictly smaller than the number of timesteps
+        min_masks: minimum number of masked spans
+    Returns: integer tensor of shape `shape`, non-zero at masked positions (presumably > 1 where spans overlap)
+
+    Adapted from [fairseq's
+    data_utils.py](https://github.com/pytorch/fairseq/blob/e0788f7007a8473a76db573985031f3c94201e79/fairseq/data/data_utils.py#L376).
+    """
+    batch_size, sequence_length = shape
+
+    if mask_length < 1:
+        raise ValueError("`mask_length` has to be bigger than 0.")
+
+    tf.debugging.assert_less(
+        mask_length,
+        sequence_length,
+        message=(
+            f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length} and"
+            f" `sequence_length`: {sequence_length}`"
+        ),
+    )
+
+    # compute number of masked spans in batch (the uniform sample gives probabilistic rounding on the int cast)
+    num_masked_spans = mask_prob * tf.cast(sequence_length, tf.float32) / mask_length + tf.random.uniform((1,))
+    num_masked_spans = tf.maximum(num_masked_spans, min_masks)
+    num_masked_spans = tf.cast(num_masked_spans, tf.int32)
+
+    # make sure the masked spans together fit into sequence_length
+    num_masked_spans = tf.math.minimum(sequence_length // mask_length, num_masked_spans)
+    num_masked_spans = tf.squeeze(num_masked_spans)
+
+    # placeholder that only supplies the output shape for the scatter at the end
+    spec_aug_mask = tf.zeros((batch_size, sequence_length), dtype=tf.int32)
+
+    # uniform distribution to sample from, make sure that offset samples are < sequence_length
+    uniform_dist = tf.ones((batch_size, sequence_length - (mask_length - 1)))
+
+    # get random indices to mask
+    spec_aug_mask_idxs = _sample_without_replacement(uniform_dist, num_masked_spans)
+
+    # expand masked indices to masked spans
+    spec_aug_mask_idxs = tf.expand_dims(spec_aug_mask_idxs, -1)
+    spec_aug_mask_idxs = tf.tile(spec_aug_mask_idxs, (1, 1, mask_length))
+    spec_aug_mask_idxs = tf.reshape(spec_aug_mask_idxs, (batch_size, num_masked_spans * mask_length))
+
+    offsets = tf.range(mask_length)[tf.newaxis, tf.newaxis, :]
+    offsets = tf.tile(offsets, (batch_size, num_masked_spans, 1))
+    offsets = tf.reshape(offsets, (batch_size, num_masked_spans * mask_length))
+
+    spec_aug_mask_idxs = spec_aug_mask_idxs + offsets
+
+    # scatter indices to mask
+    spec_aug_mask = _scatter_values_on_batch_indices(
+        tf.ones_like(spec_aug_mask_idxs), spec_aug_mask_idxs, tf.shape(spec_aug_mask)
+    )
+
+    return spec_aug_mask
+
+
+# Copied from transformers.models.bart.modeling_tf_bart._expand_mask
+def _expand_mask(mask: tf.Tensor, tgt_len: int | None = None):
+    """
+    Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
+    """
+    src_len = shape_list(mask)[1]
+    tgt_len = tgt_len if tgt_len is not None else src_len
+    one_cst = tf.constant(1.0)
+    mask = tf.cast(mask, dtype=one_cst.dtype)
+    expanded_mask = tf.tile(mask[:, None, None, :], (1, 1, tgt_len, 1))
+    # result is 0 where the mask is 1 and LARGE_NEGATIVE where the mask is 0
+    return (one_cst - expanded_mask) * LARGE_NEGATIVE
+
+
+# Copied from transformers.models.wav2vec2.modeling_tf_wav2vec2.TFWav2Vec2GroupNorm with Wav2Vec2->Hubert
+class TFHubertGroupNorm(keras.layers.Layer):
+    """
+    Group normalization over a channel axis.
+
+    From tensorflow-addons https://www.tensorflow.org/addons/api_docs/python/tfa/layers/GroupNormalization
+
+    The channel axis is split into `groups` groups; mean and variance are computed with `tf.nn.moments` over every
+    axis except the batch axis and the groups axis, and the input is normalized with `tf.nn.batch_normalization`.
+    When each group holds exactly one channel the input is normalized in place without any reshape.
+
+    Args:
+        groups: Number of groups. `-1` is replaced in `build` by the size of the channel axis.
+        axis: Index of the channel axis. `0` is rejected.
+        epsilon: Passed as `variance_epsilon` to `tf.nn.batch_normalization`.
+        center: If `True`, a `beta` weight is created and used as the offset.
+        scale: If `True`, a `gamma` weight is created and used as the scale.
+        beta_initializer: Initializer for `beta`.
+        gamma_initializer: Initializer for `gamma`.
+        beta_regularizer: Regularizer for `beta`.
+        gamma_regularizer: Regularizer for `gamma`.
+        beta_constraint: Constraint for `beta`.
+        gamma_constraint: Constraint for `gamma`.
+
+    Raises:
+        ValueError: From `__init__` if `axis == 0`; from `build` if the channel dimension is undefined, smaller
+            than `groups`, or not divisible by `groups`.
+    """
+
+    def __init__(
+        self,
+        groups: int = 32,
+        axis: int = -1,
+        epsilon: float = 1e-3,
+        center: bool = True,
+        scale: bool = True,
+        beta_initializer: keras.initializers.Initializer = "zeros",
+        gamma_initializer: keras.initializers.Initializer = "ones",
+        beta_regularizer: keras.regularizers.Regularizer = None,
+        gamma_regularizer: keras.regularizers.Regularizer = None,
+        beta_constraint: keras.constraints.Constraint = None,
+        gamma_constraint: keras.constraints.Constraint = None,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.supports_masking = True
+        self.groups = groups
+        self.axis = axis
+        self.epsilon = epsilon
+        self.center = center
+        self.scale = scale
+        self.beta_initializer = keras.initializers.get(beta_initializer)
+        self.gamma_initializer = keras.initializers.get(gamma_initializer)
+        self.beta_regularizer = keras.regularizers.get(beta_regularizer)
+        self.gamma_regularizer = keras.regularizers.get(gamma_regularizer)
+        self.beta_constraint = keras.constraints.get(beta_constraint)
+        self.gamma_constraint = keras.constraints.get(gamma_constraint)
+        self._check_axis()
+
+    def build(self, input_shape):
+        """Validate `input_shape`, resolve `groups == -1` and create the `gamma` / `beta` weights."""
+        self._check_if_input_shape_is_none(input_shape)
+        self._set_number_of_groups_for_instance_norm(input_shape)
+        self._check_size_of_dimensions(input_shape)
+        self._create_input_spec(input_shape)
+
+        self._add_gamma_weight(input_shape)
+        self._add_beta_weight(input_shape)
+        self.built = True
+        super().build(input_shape)
+
+    def call(self, inputs):
+        """Normalize `inputs` group-wise and return a tensor with the same shape as `inputs`."""
+        input_shape = keras.backend.int_shape(inputs)
+        tensor_input_shape = tf.shape(inputs)
+
+        # The grouped shape returned alongside the tensor is not needed here.
+        reshaped_inputs, _ = self._reshape_into_groups(inputs, input_shape, tensor_input_shape)
+
+        normalized_inputs = self._apply_normalization(reshaped_inputs, input_shape)
+
+        is_instance_norm = (input_shape[self.axis] // self.groups) == 1
+        if not is_instance_norm:
+            # Undo the group split performed by `_reshape_into_groups`.
+            outputs = tf.reshape(normalized_inputs, tensor_input_shape)
+        else:
+            outputs = normalized_inputs
+
+        return outputs
+
+    def get_config(self):
+        """Return the layer configuration: the base config merged with this layer's constructor arguments."""
+        config = {
+            "groups": self.groups,
+            "axis": self.axis,
+            "epsilon": self.epsilon,
+            "center": self.center,
+            "scale": self.scale,
+            "beta_initializer": keras.initializers.serialize(self.beta_initializer),
+            "gamma_initializer": keras.initializers.serialize(self.gamma_initializer),
+            "beta_regularizer": keras.regularizers.serialize(self.beta_regularizer),
+            "gamma_regularizer": keras.regularizers.serialize(self.gamma_regularizer),
+            "beta_constraint": keras.constraints.serialize(self.beta_constraint),
+            "gamma_constraint": keras.constraints.serialize(self.gamma_constraint),
+        }
+        base_config = super().get_config()
+        return {**base_config, **config}
+
+    def compute_output_shape(self, input_shape):
+        """The output shape equals the input shape."""
+        return input_shape
+
+    def _reshape_into_groups(self, inputs, input_shape, tensor_input_shape):
+        """
+        Split the channel axis into `(groups, channels // groups)`.
+
+        Returns `(reshaped_inputs, group_shape)`. When each group holds a single channel, `inputs` is returned
+        unchanged together with the list of its dynamic dimensions.
+        """
+        group_shape = [tensor_input_shape[i] for i in range(len(input_shape))]
+        is_instance_norm = (input_shape[self.axis] // self.groups) == 1
+        if not is_instance_norm:
+            group_shape[self.axis] = input_shape[self.axis] // self.groups
+            # The groups axis is inserted immediately before the (now shrunk) channel axis.
+            group_shape.insert(self.axis, self.groups)
+            group_shape = tf.stack(group_shape)
+            reshaped_inputs = tf.reshape(inputs, group_shape)
+            return reshaped_inputs, group_shape
+        else:
+            return inputs, group_shape
+
+    def _apply_normalization(self, reshaped_inputs, input_shape):
+        """Normalize `reshaped_inputs` with per-group statistics and apply the reshaped `gamma` / `beta`."""
+        group_shape = keras.backend.int_shape(reshaped_inputs)
+        # Candidate reduction axes are all axes except the batch axis; entry `k` of this list is axis `k + 1`.
+        group_reduction_axes = list(range(1, len(group_shape)))
+        is_instance_norm = (input_shape[self.axis] // self.groups) == 1
+        if not is_instance_norm:
+            # The groups axis sits just before the channel axis: list index `axis - 1` for positive and negative
+            # `axis` alike (`-1` gives `-2`).
+            axis = self.axis - 1
+        else:
+            # No axis was inserted, so the groups axis is the channel axis itself: a negative `axis` indexes the
+            # list directly, a positive one is shifted by the missing batch entry.
+            axis = self.axis if self.axis < 0 else self.axis - 1
+        # Statistics must not be reduced over the groups axis.
+        group_reduction_axes.pop(axis)
+
+        mean, variance = tf.nn.moments(reshaped_inputs, group_reduction_axes, keepdims=True)
+
+        gamma, beta = self._get_reshaped_weights(input_shape)
+        normalized_inputs = tf.nn.batch_normalization(
+            reshaped_inputs,
+            mean=mean,
+            variance=variance,
+            scale=gamma,
+            offset=beta,
+            variance_epsilon=self.epsilon,
+        )
+        return normalized_inputs
+
+    def _get_reshaped_weights(self, input_shape):
+        """Return `(gamma, beta)` reshaped for broadcasting; an entry is `None` when disabled."""
+        broadcast_shape = self._create_broadcast_shape(input_shape)
+        gamma = None
+        beta = None
+        if self.scale:
+            gamma = tf.reshape(self.gamma, broadcast_shape)
+
+        if self.center:
+            beta = tf.reshape(self.beta, broadcast_shape)
+        return gamma, beta
+
+    def _check_if_input_shape_is_none(self, input_shape):
+        """Raise `ValueError` if the channel dimension of `input_shape` is undefined."""
+        dim = input_shape[self.axis]
+        if dim is None:
+            raise ValueError(
+                f"Axis {self.axis} of input tensor should have a defined dimension but the layer received an input"
+                f" with shape {input_shape}."
+            )
+
+    def _set_number_of_groups_for_instance_norm(self, input_shape):
+        """Replace `groups == -1` with the size of the channel axis (one group per channel)."""
+        dim = input_shape[self.axis]
+
+        if self.groups == -1:
+            self.groups = dim
+
+    def _check_size_of_dimensions(self, input_shape):
+        """Raise `ValueError` unless the channel dimension is at least `groups` and divisible by it."""
+        dim = input_shape[self.axis]
+        if dim < self.groups:
+            raise ValueError(
+                f"Number of groups ({self.groups}) cannot be more than the number of channels ({dim})."
+            )
+
+        if dim % self.groups != 0:
+            # The condition checks that the channels divide evenly into groups, so the message says exactly that.
+            raise ValueError(
+                f"Number of channels ({dim}) must be a multiple of the number of groups ({self.groups})."
+            )
+
+    def _check_axis(self):
+        """Raise `ValueError` if the layer is configured to normalize axis 0."""
+        if self.axis == 0:
+            raise ValueError(
+                "You are trying to normalize your batch axis. Do you want to use tf.layer.batch_normalization instead"
+            )
+
+    def _create_input_spec(self, input_shape):
+        """Pin the input rank and the size of the channel axis through `input_spec`."""
+        dim = input_shape[self.axis]
+        self.input_spec = keras.layers.InputSpec(ndim=len(input_shape), axes={self.axis: dim})
+
+    def _add_gamma_weight(self, input_shape):
+        """Create the `gamma` weight with one entry per channel, or set it to `None` when `scale` is off."""
+        dim = input_shape[self.axis]
+        shape = (dim,)
+
+        if self.scale:
+            self.gamma = self.add_weight(
+                shape=shape,
+                name="gamma",
+                initializer=self.gamma_initializer,
+                regularizer=self.gamma_regularizer,
+                constraint=self.gamma_constraint,
+            )
+        else:
+            self.gamma = None
+
+    def _add_beta_weight(self, input_shape):
+        """Create the `beta` weight with one entry per channel, or set it to `None` when `center` is off."""
+        dim = input_shape[self.axis]
+        shape = (dim,)
+
+        if self.center:
+            self.beta = self.add_weight(
+                shape=shape,
+                name="beta",
+                initializer=self.beta_initializer,
+                regularizer=self.beta_regularizer,
+                constraint=self.beta_constraint,
+            )
+        else:
+            self.beta = None
+
+    def _create_broadcast_shape(self, input_shape):
+        """Return the shape (all ones except the groups/channel axes) used to broadcast `gamma` / `beta`."""
+        broadcast_shape = [1] * len(input_shape)
+        is_instance_norm = (input_shape[self.axis] // self.groups) == 1
+        if not is_instance_norm:
+            broadcast_shape[self.axis] = input_shape[self.axis] // self.groups
+            broadcast_shape.insert(self.axis, self.groups)
+        else:
+            broadcast_shape[self.axis] = self.groups
+        return broadcast_shape
+
+
+# Copied from transformers.models.wav2vec2.modeling_tf_wav2vec2.TFWav2Vec2WeightNormConv1D with Wav2Vec2->Hubert
+class TFHubertWeightNormConv1D(keras.layers.Conv1D):
+    """Adapted from https://www.tensorflow.org/probability/api_docs/python/tfp/layers/weight_norm/WeightNorm
+
+    A `Conv1D` whose kernel is recomputed on every call from two variables: a direction tensor `weight_v` and a
+    magnitude tensor `weight_g`. Padding is applied explicitly in `call` (the parent convolution uses
+    `padding="valid"`).
+    """
+
+    def __init__(self, filters, kernel_size, groups, explicit_padding, **kwargs):
+        # `filters`, `kernel_size`, `groups` and `**kwargs` are forwarded to `Conv1D`; `explicit_padding` is the
+        # number of positions padded on each side of axis 1 in `call`.
+        super().__init__(
+            filters=filters,
+            kernel_size=kernel_size,
+            groups=groups,
+            padding="valid",
+            use_bias=True,
+            bias_initializer="he_normal",
+            **kwargs,
+        )
+        self.explicit_padding = explicit_padding
+        # Axis of `weight_v` whose size determines the first dimension of `weight_g` (see `build`).
+        self.filter_axis = 2
+        # Axes of `weight_v` over which the norm is taken.
+        self.kernel_norm_axes = tf.constant([0, 1])
+
+    def _init_norm(self):
+        """Set the norm of the weight vector."""
+        # Norm of `weight_v` over `kernel_norm_axes`, given two trailing singleton axes to match `weight_g`.
+        kernel_norm = tf.sqrt(tf.reduce_sum(tf.square(self.weight_v), axis=self.kernel_norm_axes))
+        self.weight_g.assign(kernel_norm[:, tf.newaxis, tf.newaxis])
+
+    def _normalize_kernel(self):
+        """Generate normalized weights."""
+        # L2-normalize `weight_v` over `kernel_norm_axes` and rescale by the transposed `weight_g`.
+        kernel = tf.nn.l2_normalize(self.weight_v, axis=self.kernel_norm_axes) * tf.transpose(self.weight_g)
+        # `weight_v` is stored transposed (see `build`); transpose back before handing the kernel to `Conv1D`.
+        self.kernel = tf.transpose(kernel)
+
+    def build(self, input_shape):
+        # Runs once: lets `Conv1D` create its kernel, then replaces it by the weight-norm parametrization.
+        if not self.built:
+            super().build(input_shape)
+
+            # New variable named "weight_v" holding the transposed parent kernel.
+            # NOTE(review): presumably the parent kernel is (kernel_size, in_channels // groups, filters), which
+            # would make `weight_v` (filters, in_channels // groups, kernel_size) — confirm against Keras.
+            self.kernel = tf.Variable(tf.transpose(self.kernel), name="weight_v", trainable=True)
+            self.weight_v = self.kernel
+
+            self.weight_g = self.add_weight(
+                name="weight_g",
+                shape=(int(self.weight_v.shape[self.filter_axis]), 1, 1),
+                initializer="ones",
+                dtype=self.weight_v.dtype,
+                trainable=True,
+            )
+            # Overwrite the "ones" initial value of `weight_g` with the current norm of `weight_v`.
+            self._init_norm()
+            # `self.bias` is (re)assigned to a zero-initialized weight.
+            # NOTE(review): `super().build` presumably already created a bias because `use_bias=True` — confirm.
+            self.bias = self.add_weight(name="bias", shape=(self.filters,), initializer="zeros", trainable=True)
+
+    def call(self, inputs):
+        # TODO Matt: Assigning to attributes in call() is deeply sinful in TensorFlow, as it should be idempotent.
+        #            This whole layer should be replaced by a layer that doesn't inherit from Conv1D, but instead calls
+        #            a functional 1d convolution with normalized weights that it generates (but does not store!)
+        self._normalize_kernel()
+
+        # Pad axis 1 by `explicit_padding` on both sides; axes 0 and 2 are left untouched.
+        padded_inputs = tf.pad(inputs, ((0, 0), (self.explicit_padding, self.explicit_padding), (0, 0)))
+        output = super().call(padded_inputs)
+
+        return output
+
+
+# Copied from transformers.models.wav2vec2.modeling_tf_wav2vec2.TFWav2Vec2NoLayerNormConvLayer with Wav2Vec2->Hubert
+class TFHubertNoLayerNormConvLayer(keras.layers.Layer):
+    """
+    Feature-encoder block made of a 1D convolution followed by the configured activation (no normalization).
+
+    Args:
+        config: Model configuration; `conv_dim`, `conv_kernel`, `conv_stride`, `conv_bias` and
+            `feat_extract_activation` are read.
+        layer_id: Index of this block, used to select the per-layer entries of the config lists.
+    """
+
+    def __init__(self, config: HubertConfig, layer_id: int = 0, **kwargs: Any) -> None:
+        super().__init__(**kwargs)
+        # Block `i > 0` consumes the output channels of block `i - 1`; block 0 is built for a single input
+        # channel. Using `conv_dim[layer_id]` here would build the kernel with the wrong number of input
+        # channels whenever consecutive `conv_dim` entries differ.
+        self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1
+        self.out_conv_dim = config.conv_dim[layer_id]
+
+        self.conv = keras.layers.Conv1D(
+            filters=self.out_conv_dim,
+            kernel_size=config.conv_kernel[layer_id],
+            strides=config.conv_stride[layer_id],
+            use_bias=config.conv_bias,
+            name="conv",
+        )
+        self.activation = get_tf_activation(config.feat_extract_activation)
+
+    def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
+        """Apply the convolution, then the activation."""
+        hidden_states = self.conv(hidden_states)
+        hidden_states = self.activation(hidden_states)
+        return hidden_states
+
+    def build(self, input_shape=None):
+        """Build the convolution once for inputs whose last dimension is `in_conv_dim`; `input_shape` is unused."""
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "conv", None) is not None:
+            with tf.name_scope(self.conv.name):
+                self.conv.build([None, None, self.in_conv_dim])
+
+
+# Copied from transformers.models.wav2vec2.modeling_tf_wav2vec2.TFWav2Vec2LayerNormConvLayer with Wav2Vec2->Hubert
+class TFHubertLayerNormConvLayer(keras.layers.Layer):
+    """
+    Feature-encoder block made of a 1D convolution, a `LayerNormalization` and the configured activation.
+
+    Args:
+        config: Model configuration; `conv_dim`, `conv_kernel`, `conv_stride`, `conv_bias`, `layer_norm_eps` and
+            `feat_extract_activation` are read.
+        layer_id: Index of this block, used to select the per-layer entries of the config lists.
+    """
+
+    def __init__(self, config: HubertConfig, layer_id: int = 0, **kwargs: Any) -> None:
+        super().__init__(**kwargs)
+        # Block `i > 0` consumes the output channels of block `i - 1`; block 0 is built for a single input
+        # channel. Using `conv_dim[layer_id]` here would build the kernel with the wrong number of input
+        # channels whenever consecutive `conv_dim` entries differ.
+        self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1
+        self.out_conv_dim = config.conv_dim[layer_id]
+
+        self.conv = keras.layers.Conv1D(
+            filters=self.out_conv_dim,
+            kernel_size=config.conv_kernel[layer_id],
+            strides=config.conv_stride[layer_id],
+            use_bias=config.conv_bias,
+            name="conv",
+        )
+        self.layer_norm = keras.layers.LayerNormalization(name="layer_norm", epsilon=config.layer_norm_eps)
+        self.activation = get_tf_activation(config.feat_extract_activation)
+
+    def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
+        """Apply the convolution, the layer normalization and the activation, in that order."""
+        hidden_states = self.conv(hidden_states)
+        hidden_states = self.layer_norm(hidden_states)
+        hidden_states = self.activation(hidden_states)
+        return hidden_states
+
+    def build(self, input_shape=None):
+        """
+        Build the sub-layers once: the convolution for a last dimension of `in_conv_dim`, the normalization for
+        a last dimension of `out_conv_dim`. `input_shape` is unused.
+        """
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "conv", None) is not None:
+            with tf.name_scope(self.conv.name):
+                self.conv.build([None, None, self.in_conv_dim])
+        if getattr(self, "layer_norm", None) is not None:
+            with tf.name_scope(self.layer_norm.name):
+                self.layer_norm.build([None, None, self.out_conv_dim])
+
+
+# Copied from transformers.models.wav2vec2.modeling_tf_wav2vec2.TFWav2Vec2GroupNormConvLayer with Wav2Vec2->Hubert
+class TFHubertGroupNormConvLayer(keras.layers.Layer):
+    """
+    Feature-encoder block made of a 1D convolution, a `TFHubertGroupNorm` with one group per output channel, and
+    the configured activation.
+
+    Args:
+        config: Model configuration; `conv_dim`, `conv_kernel`, `conv_stride`, `conv_bias`, `layer_norm_eps` and
+            `feat_extract_activation` are read.
+        layer_id: Index of this block, used to select the per-layer entries of the config lists.
+    """
+
+    def __init__(self, config: HubertConfig, layer_id: int = 0, **kwargs: Any) -> None:
+        super().__init__(**kwargs)
+        # Block `i > 0` consumes the output channels of block `i - 1`; block 0 is built for a single input
+        # channel. Using `conv_dim[layer_id]` here would build the kernel with the wrong number of input
+        # channels whenever consecutive `conv_dim` entries differ.
+        self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1
+        self.out_conv_dim = config.conv_dim[layer_id]
+
+        self.conv = keras.layers.Conv1D(
+            filters=self.out_conv_dim,
+            kernel_size=config.conv_kernel[layer_id],
+            strides=config.conv_stride[layer_id],
+            use_bias=config.conv_bias,
+            name="conv",
+        )
+        self.activation = get_tf_activation(config.feat_extract_activation)
+        # `groups == out_conv_dim` gives every output channel its own group.
+        self.layer_norm = TFHubertGroupNorm(groups=self.out_conv_dim, epsilon=config.layer_norm_eps, name="layer_norm")
+
+    def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
+        """Apply the convolution, the group normalization and the activation, in that order."""
+        hidden_states = self.conv(hidden_states)
+        hidden_states = self.layer_norm(hidden_states)
+        hidden_states = self.activation(hidden_states)
+        return hidden_states
+
+    def build(self, input_shape=None):
+        """
+        Build the sub-layers once: the convolution for a last dimension of `in_conv_dim`, the normalization for
+        a last dimension of `out_conv_dim`. `input_shape` is unused.
+        """
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "conv", None) is not None:
+            with tf.name_scope(self.conv.name):
+                self.conv.build([None, None, self.in_conv_dim])
+        if getattr(self, "layer_norm", None) is not None:
+            with tf.name_scope(self.layer_norm.name):
+                self.layer_norm.build([None, None, self.out_conv_dim])
+
+
+# Copied from transformers.models.wav2vec2.modeling_tf_wav2vec2.TFWav2Vec2PositionalConvEmbedding with Wav2Vec2->Hubert
+class TFHubertPositionalConvEmbedding(keras.layers.Layer):
+    """
+    Convolutional positional embedding: a weight-normalized grouped 1D convolution, followed by
+    `TFHubertSamePadLayer` and the configured feature-extractor activation.
+    """
+
+    def __init__(self, config: HubertConfig, **kwargs: Any) -> None:
+        super().__init__(**kwargs)
+        kernel_width = config.num_conv_pos_embeddings
+        self.conv = TFHubertWeightNormConv1D(
+            filters=config.hidden_size,
+            kernel_size=kernel_width,
+            groups=config.num_conv_pos_embedding_groups,
+            explicit_padding=kernel_width // 2,
+            name="conv",
+        )
+        self.padding = TFHubertSamePadLayer(kernel_width)
+        self.activation = get_tf_activation(config.feat_extract_activation)
+        self.config = config
+
+    def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
+        """Run the convolution, the padding-removal layer and the activation, in that order."""
+        convolved = self.conv(hidden_states)
+        trimmed = self.padding(convolved)
+        return self.activation(trimmed)
+
+    def build(self, input_shape=None):
+        """Build the convolution once for a last dimension of `config.hidden_size`; `input_shape` is unused."""
+        if self.built:
+            return
+        self.built = True
+        conv = getattr(self, "conv", None)
+        if conv is None:
+            return
+        with tf.name_scope(conv.name):
+            conv.build([None, None, self.config.hidden_size])
+
+
+# Copied from transformers.models.wav2vec2.modeling_tf_wav2vec2.TFWav2Vec2SamePadLayer with Wav2Vec2->Hubert
+class TFHubertSamePadLayer(keras.layers.Layer):
+    """Removes the last position along axis 1 when `num_conv_pos_embeddings` is even; otherwise a no-op."""
+
+    def __init__(self, num_conv_pos_embeddings, **kwargs):
+        super().__init__(**kwargs)
+        kernel_is_even = num_conv_pos_embeddings % 2 == 0
+        # Number of trailing positions to drop in `call`.
+        self.num_pad_remove = 1 if kernel_is_even else 0
+
+    def call(self, hidden_states):
+        """Return `hidden_states` with the last `num_pad_remove` positions of axis 1 sliced off."""
+        if self.num_pad_remove <= 0:
+            return hidden_states
+        return hidden_states[:, : -self.num_pad_remove, :]
+
+
+class TFHubertFeatureEncoder(keras.layers.Layer):
+    """
+    Stack of convolutional blocks applied to the raw input values.
+
+    With `config.feat_extract_norm == "group"` the first block is a `TFHubertGroupNormConvLayer` and the
+    remaining ones are `TFHubertNoLayerNormConvLayer`; with `"layer"` every block is a
+    `TFHubertLayerNormConvLayer`. Any other value raises `ValueError`.
+    """
+
+    def __init__(self, config: HubertConfig, **kwargs: Any) -> None:
+        super().__init__(**kwargs)
+
+        norm_type = config.feat_extract_norm
+        if norm_type == "group":
+            conv_layers = [TFHubertGroupNormConvLayer(config, layer_id=0, name=f"conv_layers.{0}")]
+            for layer_id in range(1, config.num_feat_extract_layers):
+                conv_layers.append(
+                    TFHubertNoLayerNormConvLayer(config, layer_id=layer_id, name=f"conv_layers.{layer_id}")
+                )
+        elif norm_type == "layer":
+            conv_layers = []
+            for layer_id in range(config.num_feat_extract_layers):
+                conv_layers.append(
+                    TFHubertLayerNormConvLayer(config, layer_id=layer_id, name=f"conv_layers.{layer_id}")
+                )
+        else:
+            raise ValueError(
+                f"`config.feat_extract_norm` is {config.feat_extract_norm}, but has to be one of ['group', 'layer']"
+            )
+        self.conv_layers = conv_layers
+
+    def call(self, input_values):
+        """Append a trailing axis of size 1 to `input_values` and run it through every block in order."""
+        features = tf.expand_dims(input_values, -1)
+        for block in self.conv_layers:
+            features = block(features)
+        return features
+
+    def build(self, input_shape=None):
+        """Build every block once, each under its own name scope; `input_shape` is unused."""
+        if not self.built:
+            self.built = True
+            for block in self.conv_layers:
+                with tf.name_scope(block.name):
+                    block.build(None)
+
+
+class TFHubertFeatureExtractor(TFHubertFeatureEncoder):
+    """Deprecated alias of its base class; constructing it emits a `FutureWarning` naming the replacement."""
+
+    def __init__(self, config, **kwargs):
+        super().__init__(config, **kwargs)
+        # The names are looked up dynamically so the message always reports the concrete class and its first
+        # base class.
+        warnings.warn(
+            f"The class `{self.__class__.__name__}` has been deprecated "
+            "and will be removed in Transformers v5. "
+            f"Use `{self.__class__.__bases__[0].__name__}` instead.",
+            FutureWarning,
+        )
+
+
+class TFHubertFeatureProjection(keras.layers.Layer):
+    """
+    Layer-normalizes the extracted features, projects them to `config.hidden_size` with a dense layer and applies
+    dropout with rate `config.feat_proj_dropout`.
+    """
+
+    def __init__(self, config: HubertConfig, **kwargs):
+        super().__init__(**kwargs)
+
+        self.config = config
+        self.layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm")
+        self.projection = keras.layers.Dense(
+            units=config.hidden_size,
+            kernel_initializer=get_initializer(config.initializer_range),
+            bias_initializer="zeros",
+            name="projection",
+        )
+        self.dropout = keras.layers.Dropout(rate=config.feat_proj_dropout)
+
+    def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor:
+        """Apply normalization, projection and dropout, in that order."""
+        projected = self.projection(self.layer_norm(hidden_states))
+        return self.dropout(projected, training=training)
+
+    def build(self, input_shape=None):
+        """
+        Build `layer_norm` and `projection` once, both for a last dimension of `config.conv_dim[-1]`;
+        `input_shape` is unused.
+        """
+        if self.built:
+            return
+        self.built = True
+        for attr_name in ("layer_norm", "projection"):
+            sublayer = getattr(self, attr_name, None)
+            if sublayer is None:
+                continue
+            with tf.name_scope(sublayer.name):
+                sublayer.build([None, None, self.config.conv_dim[-1]])
+
+
+# Copied from transformers.models.bart.modeling_tf_bart.TFBartAttention with TFBart->TFHubert
+class TFHubertAttention(keras.layers.Layer):
+    """
+    Multi-headed attention from "Attention Is All You Need"
+
+    Args:
+        embed_dim: Size of the input and output features; must be divisible by `num_heads`.
+        num_heads: Number of attention heads.
+        dropout: Dropout rate applied to the attention probabilities.
+        is_decoder: If `True`, `call` returns the key/value states so that later calls can reuse them.
+        bias: Whether the four projection layers use a bias.
+
+    Raises:
+        ValueError: If `embed_dim` is not divisible by `num_heads`.
+    """
+
+    def __init__(
+        self,
+        embed_dim: int,
+        num_heads: int,
+        dropout: float = 0.0,
+        is_decoder: bool = False,
+        bias: bool = True,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.embed_dim = embed_dim
+
+        self.num_heads = num_heads
+        self.dropout = keras.layers.Dropout(dropout)
+        self.head_dim = embed_dim // num_heads
+        if (self.head_dim * num_heads) != self.embed_dim:
+            raise ValueError(
+                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
+                f" and `num_heads`: {num_heads})."
+            )
+        # Queries are multiplied by this factor before the dot product with the keys.
+        self.scaling = self.head_dim**-0.5
+        self.is_decoder = is_decoder
+
+        self.k_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="k_proj")
+        self.q_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="q_proj")
+        self.v_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="v_proj")
+        self.out_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="out_proj")
+
+    def _shape(self, tensor: tf.Tensor, seq_len: int, bsz: int):
+        """Reshape `tensor` to `(bsz, seq_len, num_heads, head_dim)` and swap axes 1 and 2."""
+        return tf.transpose(tf.reshape(tensor, (bsz, seq_len, self.num_heads, self.head_dim)), (0, 2, 1, 3))
+
+    def call(
+        self,
+        hidden_states: tf.Tensor,
+        key_value_states: tf.Tensor | None = None,
+        past_key_value: tuple[tuple[tf.Tensor]] | None = None,
+        attention_mask: tf.Tensor | None = None,
+        layer_head_mask: tf.Tensor | None = None,
+        training: bool | None = False,
+    ) -> tuple[tf.Tensor, tf.Tensor, tuple[tf.Tensor, tf.Tensor] | None]:
+        """
+        Input shape: Batch x Time x Channel
+
+        Args:
+            hidden_states: Query input of shape `(bsz, tgt_len, embed_dim)`.
+            key_value_states: If given, keys and values are projected from it (cross-attention) instead of from
+                `hidden_states`.
+            past_key_value: Previously computed key/value states to reuse (cross-attention) or to prepend along
+                axis 2 (self-attention).
+            attention_mask: Additive mask of shape `(bsz, 1, tgt_len, src_len)`, added to the attention scores
+                before the softmax.
+            layer_head_mask: Multiplicative mask of shape `(num_heads,)`, applied after the softmax.
+            training: Forwarded to the dropout layer.
+
+        Returns:
+            A 3-tuple `(attn_output, attn_weights, past_key_value)`: `attn_output` has shape
+            `(bsz, tgt_len, embed_dim)`, `attn_weights` has shape `(bsz, num_heads, tgt_len, src_len)`, and
+            `past_key_value` is `(key_states, value_states)` when `is_decoder` is set, otherwise the value that
+            was passed in.
+        """
+
+        # if key_value_states are provided this layer is used as a cross-attention layer
+        # for the decoder
+        is_cross_attention = key_value_states is not None
+        bsz, tgt_len, embed_dim = shape_list(hidden_states)
+
+        # get query proj
+        query_states = self.q_proj(hidden_states) * self.scaling
+        # get key, value proj
+        if is_cross_attention and past_key_value is not None:
+            # reuse k,v, cross_attentions
+            key_states = past_key_value[0]
+            value_states = past_key_value[1]
+        elif is_cross_attention:
+            # cross_attentions
+            key_states = self._shape(self.k_proj(key_value_states), -1, bsz)
+            value_states = self._shape(self.v_proj(key_value_states), -1, bsz)
+        elif past_key_value is not None:
+            # reuse k, v, self_attention
+            key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
+            value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
+            key_states = tf.concat([past_key_value[0], key_states], axis=2)
+            value_states = tf.concat([past_key_value[1], value_states], axis=2)
+        else:
+            # self_attention
+            key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
+            value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
+
+        if self.is_decoder:
+            # if cross_attention save Tuple(tf.Tensor, tf.Tensor) of all cross attention key/value_states.
+            # Further calls to cross_attention layer can then reuse all cross-attention
+            # key/value_states (first "if" case)
+            # if uni-directional self-attention (decoder) save Tuple(tf.Tensor, tf.Tensor) of
+            # all previous decoder key/value_states. Further calls to uni-directional self-attention
+            # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
+            # if encoder bi-directional self-attention `past_key_value` is always `None`
+            past_key_value = (key_states, value_states)
+
+        # Fold the head axis into the batch axis so one batched matmul covers every head.
+        proj_shape = (bsz * self.num_heads, -1, self.head_dim)
+        query_states = tf.reshape(self._shape(query_states, tgt_len, bsz), proj_shape)
+        key_states = tf.reshape(key_states, proj_shape)
+        value_states = tf.reshape(value_states, proj_shape)
+
+        src_len = shape_list(key_states)[1]
+        attn_weights = tf.matmul(query_states, key_states, transpose_b=True)
+
+        tf.debugging.assert_equal(
+            shape_list(attn_weights),
+            [bsz * self.num_heads, tgt_len, src_len],
+            message=(
+                f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
+                f" {shape_list(attn_weights)}"
+            ),
+        )
+
+        if attention_mask is not None:
+            tf.debugging.assert_equal(
+                shape_list(attention_mask),
+                [bsz, 1, tgt_len, src_len],
+                message=(
+                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is"
+                    f" {shape_list(attention_mask)}"
+                ),
+            )
+
+            attention_mask = tf.cast(attention_mask, dtype=attn_weights.dtype)
+            # Unfold the head axis so the mask broadcasts over heads, then fold it back.
+            attn_weights = tf.reshape(attn_weights, (bsz, self.num_heads, tgt_len, src_len)) + attention_mask
+            attn_weights = tf.reshape(attn_weights, (bsz * self.num_heads, tgt_len, src_len))
+
+        attn_weights = stable_softmax(attn_weights, axis=-1)
+
+        if layer_head_mask is not None:
+            tf.debugging.assert_equal(
+                shape_list(layer_head_mask),
+                [self.num_heads],
+                message=(
+                    f"Head mask for a single layer should be of size {(self.num_heads)}, but is"
+                    f" {shape_list(layer_head_mask)}"
+                ),
+            )
+
+            attn_weights = tf.reshape(layer_head_mask, (1, -1, 1, 1)) * tf.reshape(
+                attn_weights, (bsz, self.num_heads, tgt_len, src_len)
+            )
+            attn_weights = tf.reshape(attn_weights, (bsz * self.num_heads, tgt_len, src_len))
+
+        attn_probs = self.dropout(attn_weights, training=training)
+        attn_output = tf.matmul(attn_probs, value_states)
+
+        # The message reports the same 3-D size that the assertion compares against.
+        tf.debugging.assert_equal(
+            shape_list(attn_output),
+            [bsz * self.num_heads, tgt_len, self.head_dim],
+            message=(
+                f"`attn_output` should be of size {(bsz * self.num_heads, tgt_len, self.head_dim)}, but is"
+                f" {shape_list(attn_output)}"
+            ),
+        )
+
+        attn_output = tf.transpose(
+            tf.reshape(attn_output, (bsz, self.num_heads, tgt_len, self.head_dim)), (0, 2, 1, 3)
+        )
+        attn_output = tf.reshape(attn_output, (bsz, tgt_len, embed_dim))
+
+        attn_output = self.out_proj(attn_output)
+        attn_weights: tf.Tensor = tf.reshape(attn_weights, (bsz, self.num_heads, tgt_len, src_len))
+
+        return attn_output, attn_weights, past_key_value
+
+    def build(self, input_shape=None):
+        """Build the four projection layers once for a last dimension of `embed_dim`; `input_shape` is unused."""
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "k_proj", None) is not None:
+            with tf.name_scope(self.k_proj.name):
+                self.k_proj.build([None, None, self.embed_dim])
+        if getattr(self, "q_proj", None) is not None:
+            with tf.name_scope(self.q_proj.name):
+                self.q_proj.build([None, None, self.embed_dim])
+        if getattr(self, "v_proj", None) is not None:
+            with tf.name_scope(self.v_proj.name):
+                self.v_proj.build([None, None, self.embed_dim])
+        if getattr(self, "out_proj", None) is not None:
+            with tf.name_scope(self.out_proj.name):
+                self.out_proj.build([None, None, self.embed_dim])
+
+
+# Copied from transformers.models.wav2vec2.modeling_tf_wav2vec2.TFWav2Vec2FeedForward with Wav2Vec2->Hubert
+class TFHubertFeedForward(keras.layers.Layer):
+    """
+    Two-layer feed-forward block: dense to `config.intermediate_size`, activation and dropout, then dense back to
+    `config.hidden_size` followed by a second dropout.
+    """
+
+    def __init__(self, config: HubertConfig, **kwargs):
+        super().__init__(**kwargs)
+
+        self.config = config
+        self.intermediate_dropout = keras.layers.Dropout(config.activation_dropout)
+
+        self.intermediate_dense = keras.layers.Dense(
+            units=config.intermediate_size,
+            kernel_initializer=get_initializer(config.initializer_range),
+            bias_initializer="zeros",
+            name="intermediate_dense",
+        )
+        self.intermediate_act_fn = get_tf_activation(config.hidden_act)
+
+        self.output_dense = keras.layers.Dense(
+            units=config.hidden_size,
+            kernel_initializer=get_initializer(config.initializer_range),
+            bias_initializer="zeros",
+            name="output_dense",
+        )
+        self.output_dropout = keras.layers.Dropout(config.hidden_dropout)
+
+    def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor:
+        """Apply both dense stages; `training` is forwarded to the two dropout layers."""
+        intermediate = self.intermediate_act_fn(self.intermediate_dense(hidden_states))
+        intermediate = self.intermediate_dropout(intermediate, training=training)
+
+        projected = self.output_dense(intermediate)
+        return self.output_dropout(projected, training=training)
+
+    def build(self, input_shape=None):
+        """
+        Build the dense layers once: `intermediate_dense` for a last dimension of `config.hidden_size` and
+        `output_dense` for a last dimension of `config.intermediate_size`. `input_shape` is unused.
+        """
+        if self.built:
+            return
+        self.built = True
+        # Pairs of (sub-layer attribute, config attribute holding that sub-layer's input width).
+        for layer_attr, width_attr in (("intermediate_dense", "hidden_size"), ("output_dense", "intermediate_size")):
+            dense = getattr(self, layer_attr, None)
+            if dense is None:
+                continue
+            with tf.name_scope(dense.name):
+                dense.build([None, None, getattr(self.config, width_attr)])
+
+
+# Copied from transformers.models.wav2vec2.modeling_tf_wav2vec2.TFWav2Vec2EncoderLayer with Wav2Vec2->Hubert
+class TFHubertEncoderLayer(keras.layers.Layer):
+    """
+    Transformer encoder layer with normalization applied after each residual connection: self-attention, dropout
+    and residual add, `layer_norm`, feed-forward with residual add, then `final_layer_norm`.
+    """
+
+    def __init__(self, config: HubertConfig, **kwargs):
+        super().__init__(**kwargs)
+        self.attention = TFHubertAttention(
+            embed_dim=config.hidden_size,
+            num_heads=config.num_attention_heads,
+            dropout=config.attention_dropout,
+            is_decoder=False,
+            name="attention",
+        )
+        self.dropout = keras.layers.Dropout(config.hidden_dropout)
+        self.layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm")
+        self.feed_forward = TFHubertFeedForward(config, name="feed_forward")
+        self.final_layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="final_layer_norm")
+        self.config = config
+
+    def call(
+        self,
+        hidden_states: tf.Tensor,
+        attention_mask: tf.Tensor | None = None,
+        output_attentions: bool | None = False,
+        training: bool = False,
+    ) -> tuple[tf.Tensor]:
+        """
+        Run the layer.
+
+        Args:
+            hidden_states: Input to the layer.
+            attention_mask: Forwarded unchanged to the attention sub-layer.
+            output_attentions: If truthy, the attention weights are appended to the returned tuple.
+            training: Forwarded to the attention, dropout and feed-forward sub-layers.
+
+        Returns:
+            `(hidden_states,)`, or `(hidden_states, attn_weights)` when `output_attentions` is truthy.
+        """
+        attn_residual = hidden_states
+        hidden_states, attn_weights, _ = self.attention(
+            hidden_states, attention_mask=attention_mask, training=training
+        )
+        hidden_states = self.dropout(hidden_states, training=training)
+        hidden_states = attn_residual + hidden_states
+
+        hidden_states = self.layer_norm(hidden_states)
+        # Pass the flag explicitly, like every other sub-layer call here, rather than relying on Keras
+        # propagating it implicitly to the feed-forward dropouts.
+        hidden_states = hidden_states + self.feed_forward(hidden_states, training=training)
+        hidden_states = self.final_layer_norm(hidden_states)
+
+        outputs = (hidden_states,)
+
+        if output_attentions:
+            outputs += (attn_weights,)
+
+        return outputs
+
+    def build(self, input_shape=None):
+        """
+        Build the sub-layers once, each under its own name scope; both normalizations are built for a last
+        dimension of `config.hidden_size`. `input_shape` is unused.
+        """
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "attention", None) is not None:
+            with tf.name_scope(self.attention.name):
+                self.attention.build(None)
+        if getattr(self, "layer_norm", None) is not None:
+            with tf.name_scope(self.layer_norm.name):
+                self.layer_norm.build([None, None, self.config.hidden_size])
+        if getattr(self, "feed_forward", None) is not None:
+            with tf.name_scope(self.feed_forward.name):
+                self.feed_forward.build(None)
+        if getattr(self, "final_layer_norm", None) is not None:
+            with tf.name_scope(self.final_layer_norm.name):
+                self.final_layer_norm.build([None, None, self.config.hidden_size])
+
+
+# Copied from transformers.models.wav2vec2.modeling_tf_wav2vec2.TFWav2Vec2EncoderLayerStableLayerNorm with Wav2Vec2->Hubert
+class TFHubertEncoderLayerStableLayerNorm(keras.layers.Layer):
+    def __init__(self, config: HubertConfig, **kwargs):
+        super().__init__(**kwargs)
+        self.attention = TFHubertAttention(
+            embed_dim=config.hidden_size,
+            num_heads=config.num_attention_heads,
+            dropout=config.attention_dropout,
+            is_decoder=False,
+            name="attention",
+        )
+        self.dropout = keras.layers.Dropout(config.hidden_dropout)
+        self.layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm")
+        self.feed_forward = TFHubertFeedForward(config, name="feed_forward")
+        self.final_layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="final_layer_norm")
+        self.config = config
+
+    def call(
+        self,
+        hidden_states: tf.Tensor,
+        attention_mask: tf.Tensor | None = None,
+        output_attentions: bool | None = False,
+        training: bool = False,
+    ) -> tuple[tf.Tensor]:
+        attn_residual = hidden_states
+        hidden_states = self.layer_norm(hidden_states)
+        hidden_states, attn_weights, _ = self.attention(
+            hidden_states, attention_mask=attention_mask, training=training
+        )
+        hidden_states = self.dropout(hidden_states, training=training)
+        hidden_states = attn_residual + hidden_states
+        hidden_states = hidden_states + self.feed_forward(self.final_layer_norm(hidden_states))
+
+        outputs = (hidden_states,)
+
+        if output_attentions:
+            outputs += (attn_weights,)
+
+        return outputs
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "attention", None) is not None:
+            with tf.name_scope(self.attention.name):
+                self.attention.build(None)
+        if getattr(self, "layer_norm", None) is not None:
+            with tf.name_scope(self.layer_norm.name):
+                self.layer_norm.build([None, None, self.config.hidden_size])
+        if getattr(self, "feed_forward", None) is not None:
+            with tf.name_scope(self.feed_forward.name):
+                self.feed_forward.build(None)
+        if getattr(self, "final_layer_norm", None) is not None:
+            with tf.name_scope(self.final_layer_norm.name):
+                self.final_layer_norm.build([None, None, self.config.hidden_size])
+
+
+# Copied from transformers.models.wav2vec2.modeling_tf_wav2vec2.TFWav2Vec2Encoder with Wav2Vec2->Hubert
+class TFHubertEncoder(keras.layers.Layer):
+    """
+    Encoder stack: positional convolutional embedding, layer norm and dropout, followed by
+    `config.num_hidden_layers` [`TFHubertEncoderLayer`] blocks.
+    """
+
+    def __init__(self, config: HubertConfig, **kwargs):
+        super().__init__(**kwargs)
+        self.config = config
+        self.pos_conv_embed = TFHubertPositionalConvEmbedding(config, name="pos_conv_embed")
+        self.layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm")
+        self.dropout = keras.layers.Dropout(config.hidden_dropout)
+        # The attribute is called `layer`, but each block is named "layers.{i}".
+        self.layer = [TFHubertEncoderLayer(config, name=f"layers.{i}") for i in range(config.num_hidden_layers)]
+
+    def call(
+        self,
+        hidden_states: tf.Tensor,
+        attention_mask: tf.Tensor | None = None,
+        output_attentions: bool | None = False,
+        output_hidden_states: bool | None = False,
+        return_dict: bool | None = True,
+        training: bool | None = False,
+    ) -> TFBaseModelOutput | tuple[tf.Tensor]:
+        """
+        Run the encoder stack.
+
+        Args:
+            hidden_states: Input features; the layer norms are built for a last dimension of
+                `config.hidden_size`.
+            attention_mask: Optional mask. When given, `hidden_states` is multiplied by it (broadcast over
+                the last axis) and the mask is then passed through `_expand_mask` before reaching the layers.
+            output_attentions: Collect the attention weights of every layer that is actually executed.
+            output_hidden_states: Collect the input of every layer plus the final output.
+            return_dict: Return a [`TFBaseModelOutput`] when truthy, otherwise a plain tuple.
+            training: Enables dropout and LayerDrop.
+
+        Returns:
+            A [`TFBaseModelOutput`], or a tuple holding the last hidden state followed by whichever of the
+            hidden-state / attention tuples were requested.
+        """
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attentions = () if output_attentions else None
+
+        if attention_mask is not None:
+            # Multiplying by the mask zeroes every position whose mask entry is 0.
+            hidden_states = hidden_states * tf.expand_dims(attention_mask, -1)
+            # `_expand_mask` is defined elsewhere in this file; presumably it reshapes the 2-D mask into the
+            # form the attention layers expect - TODO confirm.
+            attention_mask = _expand_mask(attention_mask)
+        else:
+            attention_mask = None
+
+        position_embeddings = self.pos_conv_embed(hidden_states)
+        hidden_states = hidden_states + position_embeddings
+        hidden_states = self.layer_norm(hidden_states)
+        hidden_states = self.dropout(hidden_states, training=training)
+
+        for i, layer_module in enumerate(self.layer):
+            # The layer input is recorded before the LayerDrop decision, so a skipped layer still
+            # contributes an entry to `all_hidden_states`.
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)
+
+            # add LayerDrop (see https://huggingface.co/papers/1909.11556 for description)
+            # NOTE(review): the draw is a Python-level NumPy call, one per layer; inside a traced
+            # `tf.function` it is presumably evaluated only at trace time - confirm.
+            dropout_probability = np.random.uniform(0, 1)
+            if training and (dropout_probability < self.config.layerdrop):  # skip the layer
+                continue
+
+            layer_outputs = layer_module(
+                hidden_states=hidden_states,
+                attention_mask=attention_mask,
+                output_attentions=output_attentions,
+                training=training,
+            )
+            hidden_states = layer_outputs[0]
+
+            # Skipped layers never reach this point, so `all_self_attentions` can hold fewer entries than
+            # there are layers.
+            if output_attentions:
+                all_self_attentions = all_self_attentions + (layer_outputs[1],)
+
+        # Add last layer
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+
+        if not return_dict:
+            # Drop the entries that were not requested (they are None).
+            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
+        return TFBaseModelOutput(
+            last_hidden_state=hidden_states,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attentions,
+        )
+
+    def build(self, input_shape=None):
+        """Build the sub-layers once, each under a name scope named after the sub-layer; `input_shape` is unused."""
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "pos_conv_embed", None) is not None:
+            with tf.name_scope(self.pos_conv_embed.name):
+                self.pos_conv_embed.build(None)
+        if getattr(self, "layer_norm", None) is not None:
+            with tf.name_scope(self.layer_norm.name):
+                self.layer_norm.build([None, None, self.config.hidden_size])
+        if getattr(self, "layer", None) is not None:
+            for layer in self.layer:
+                with tf.name_scope(layer.name):
+                    layer.build(None)
+
+
+# Copied from transformers.models.wav2vec2.modeling_tf_wav2vec2.TFWav2Vec2EncoderStableLayerNorm with Wav2Vec2->Hubert
+class TFHubertEncoderStableLayerNorm(keras.layers.Layer):
+    """
+    Encoder stack built from [`TFHubertEncoderLayerStableLayerNorm`] blocks. Unlike [`TFHubertEncoder`], the
+    encoder-level layer norm is applied after the last block instead of before the first one.
+    """
+
+    def __init__(self, config: HubertConfig, **kwargs):
+        super().__init__(**kwargs)
+        self.config = config
+        self.pos_conv_embed = TFHubertPositionalConvEmbedding(config, name="pos_conv_embed")
+        self.layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm")
+        self.dropout = keras.layers.Dropout(config.hidden_dropout)
+        # The attribute is called `layer`, but each block is named "layers.{i}".
+        self.layer = [
+            TFHubertEncoderLayerStableLayerNorm(config, name=f"layers.{i}") for i in range(config.num_hidden_layers)
+        ]
+
+    def call(
+        self,
+        hidden_states: tf.Tensor,
+        attention_mask: tf.Tensor | None = None,
+        output_attentions: bool | None = False,
+        output_hidden_states: bool | None = False,
+        return_dict: bool | None = True,
+        training: bool | None = False,
+    ) -> TFBaseModelOutput | tuple[tf.Tensor]:
+        """
+        Run the encoder stack.
+
+        Args:
+            hidden_states: Input features; the layer norm is built for a last dimension of
+                `config.hidden_size`.
+            attention_mask: Optional mask. When given, `hidden_states` is multiplied by it (broadcast over
+                the last axis) and the mask is then passed through `_expand_mask` before reaching the layers.
+            output_attentions: Collect the attention weights of every layer that is actually executed.
+            output_hidden_states: Collect the input of every layer plus the final, normalized output.
+            return_dict: Return a [`TFBaseModelOutput`] when truthy, otherwise a plain tuple.
+            training: Enables dropout and LayerDrop.
+
+        Returns:
+            A [`TFBaseModelOutput`], or a tuple holding the last hidden state followed by whichever of the
+            hidden-state / attention tuples were requested.
+        """
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attentions = () if output_attentions else None
+
+        if attention_mask is not None:
+            # Multiplying by the mask zeroes every position whose mask entry is 0.
+            hidden_states = hidden_states * tf.expand_dims(attention_mask, -1)
+            # `_expand_mask` is defined elsewhere in this file; presumably it reshapes the 2-D mask into the
+            # form the attention layers expect - TODO confirm.
+            attention_mask = _expand_mask(attention_mask)
+        else:
+            attention_mask = None
+
+        position_embeddings = self.pos_conv_embed(hidden_states)
+        hidden_states = hidden_states + position_embeddings
+        hidden_states = self.dropout(hidden_states, training=training)
+
+        for i, layer_module in enumerate(self.layer):
+            # The layer input is recorded before the LayerDrop decision, so a skipped layer still
+            # contributes an entry to `all_hidden_states`.
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)
+
+            # add LayerDrop (see https://huggingface.co/papers/1909.11556 for description)
+            # NOTE(review): the draw is a Python-level NumPy call, one per layer; inside a traced
+            # `tf.function` it is presumably evaluated only at trace time - confirm.
+            dropout_probability = np.random.uniform(0, 1)
+            if training and (dropout_probability < self.config.layerdrop):  # skip the layer
+                continue
+
+            layer_outputs = layer_module(
+                hidden_states=hidden_states,
+                attention_mask=attention_mask,
+                output_attentions=output_attentions,
+                training=training,
+            )
+            hidden_states = layer_outputs[0]
+
+            # Skipped layers never reach this point, so `all_self_attentions` can hold fewer entries than
+            # there are layers.
+            if output_attentions:
+                all_self_attentions = all_self_attentions + (layer_outputs[1],)
+
+        # The encoder-level norm runs once, on the output of the last block.
+        hidden_states = self.layer_norm(hidden_states)
+
+        # The final entry of `all_hidden_states` is therefore the normalized output.
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+
+        if not return_dict:
+            # Drop the entries that were not requested (they are None).
+            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
+        return TFBaseModelOutput(
+            last_hidden_state=hidden_states,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attentions,
+        )
+
+    def build(self, input_shape=None):
+        """Build the sub-layers once, each under a name scope named after the sub-layer; `input_shape` is unused."""
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "pos_conv_embed", None) is not None:
+            with tf.name_scope(self.pos_conv_embed.name):
+                self.pos_conv_embed.build(None)
+        if getattr(self, "layer_norm", None) is not None:
+            with tf.name_scope(self.layer_norm.name):
+                self.layer_norm.build([None, None, self.config.hidden_size])
+        if getattr(self, "layer", None) is not None:
+            for layer in self.layer:
+                with tf.name_scope(layer.name):
+                    layer.build(None)
+
+
+@keras_serializable
+class TFHubertMainLayer(keras.layers.Layer):
+    config_class = HubertConfig
+
+    def __init__(self, config: HubertConfig, **kwargs):
+        super().__init__(**kwargs)
+        self.config = config
+        self.feature_extractor = TFHubertFeatureEncoder(config, name="feature_extractor")
+        self.feature_projection = TFHubertFeatureProjection(config, name="feature_projection")
+
+        if config.do_stable_layer_norm:
+            self.encoder = TFHubertEncoderStableLayerNorm(config, name="encoder")
+        else:
+            self.encoder = TFHubertEncoder(config, name="encoder")
+
+    def build(self, input_shape=None):
+        self.masked_spec_embed = self.add_weight(
+            shape=(self.config.hidden_size,), initializer="uniform", trainable=True, name="masked_spec_embed"
+        )
+
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "feature_extractor", None) is not None:
+            with tf.name_scope(self.feature_extractor.name):
+                self.feature_extractor.build(None)
+        if getattr(self, "feature_projection", None) is not None:
+            with tf.name_scope(self.feature_projection.name):
+                self.feature_projection.build(None)
+        if getattr(self, "encoder", None) is not None:
+            with tf.name_scope(self.encoder.name):
+                self.encoder.build(None)
+
+    def _get_feat_extract_output_lengths(self, input_lengths: tf.Tensor):
+        """
+        Computes the output length of the convolutional layers
+        """
+
+        def _conv_out_length(input_length, kernel_size, stride):
+            # 1D convolutional layer output length formula taken
+            # from https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html
+            return (input_length - kernel_size) // stride + 1
+
+        for kernel_size, stride in zip(self.config.conv_kernel, self.config.conv_stride):
+            input_lengths = _conv_out_length(input_lengths, kernel_size, stride)
+
+        return input_lengths
+
+    def _mask_hidden_states(self, hidden_states: tf.Tensor, mask_time_indices: tf.Tensor | None = None):
+        """
+        Masks extracted features along time axis and/or along feature axis according to
+        [SpecAugment](https://huggingface.co/papers/1904.08779).
+        """
+        batch_size, sequence_length, hidden_size = shape_list(hidden_states)
+
+        # `config.apply_spec_augment` can set masking to False
+        if not getattr(self.config, "apply_spec_augment", True):
+            return hidden_states
+
+        if mask_time_indices is not None:
+            # apply SpecAugment along time axis with given mask_time_indices
+            hidden_states = tf.where(
+                tf.cast(mask_time_indices[:, :, tf.newaxis], tf.bool),
+                self.masked_spec_embed[tf.newaxis, tf.newaxis, :],
+                hidden_states,
+            )
+
+        elif self.config.mask_time_prob > 0:
+            # generate indices & apply SpecAugment along time axis
+            mask_time_indices = _compute_mask_indices(
+                (batch_size, sequence_length),
+                mask_prob=self.config.mask_time_prob,
+                mask_length=self.config.mask_time_length,
+                min_masks=2,
+            )
+            hidden_states = tf.where(
+                tf.cast(mask_time_indices[:, :, tf.newaxis], tf.bool),
+                self.masked_spec_embed[tf.newaxis, tf.newaxis, :],
+                hidden_states,
+            )
+
+        # apply SpecAugment along feature axis
+        if self.config.mask_feature_prob > 0:
+            mask_feature_indices = _compute_mask_indices(
+                (batch_size, hidden_size),
+                mask_prob=self.config.mask_feature_prob,
+                mask_length=self.config.mask_feature_length,
+            )
+            hidden_states = tf.where(mask_feature_indices[:, tf.newaxis, :], hidden_states, 0)
+
+        return hidden_states
+
+    @unpack_inputs
+    def call(
+        self,
+        input_values: tf.Tensor,
+        attention_mask: tf.Tensor | None = None,
+        token_type_ids: tf.Tensor | None = None,
+        position_ids: tf.Tensor | None = None,
+        head_mask: tf.Tensor | None = None,
+        inputs_embeds: tf.Tensor | None = None,
+        output_attentions: tf.Tensor | None = None,
+        output_hidden_states: tf.Tensor | None = None,
+        return_dict: bool | None = None,
+        training: bool = False,
+        **kwargs: Any,
+    ):
+        hidden_states = self.feature_extractor(tf.cast(input_values, tf.float32), training=training)
+
+        if attention_mask is not None:
+            # compute real output lengths according to convolution formula
+            output_lengths = self._get_feat_extract_output_lengths(tf.reduce_sum(attention_mask, -1))
+
+            attention_mask = tf.sequence_mask(
+                output_lengths, maxlen=shape_list(hidden_states)[1], dtype=hidden_states.dtype
+            )
+
+        hidden_states = self.feature_projection(hidden_states, training=training)
+
+        mask_time_indices = kwargs.get("mask_time_indices")
+        if training:
+            hidden_states = self._mask_hidden_states(hidden_states, mask_time_indices=mask_time_indices)
+
+        encoder_outputs = self.encoder(
+            hidden_states,
+            attention_mask=attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+        hidden_states = encoder_outputs[0]
+
+        if not return_dict:
+            return (hidden_states,) + encoder_outputs[1:]
+
+        return TFBaseModelOutput(
+            last_hidden_state=hidden_states,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+        )
+
+
+class TFHubertPreTrainedModel(TFPreTrainedModel):
+    """
+    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+    models.
+    """
+
+    config_class = HubertConfig
+    base_model_prefix = "hubert"
+    main_input_name = "input_values"
+
+    @property
+    def input_signature(self):
+        return {
+            "input_values": tf.TensorSpec((None, 16000), tf.float32, name="input_values"),
+            "attention_mask": tf.TensorSpec((None, None), tf.int32, name="attention_mask"),
+            "token_type_ids": tf.TensorSpec((None, None), tf.int32, name="token_type_ids"),
+        }
+
+    def __init__(self, config, *inputs, **kwargs):
+        super().__init__(config, *inputs, **kwargs)
+        logger.warning(
+            f"\n{self.__class__.__name__} has backpropagation operations that are NOT supported on CPU. If you wish "
+            "to train/fine-tune this model, you need a GPU or a TPU"
+        )
+
+
+HUBERT_START_DOCSTRING = r"""
+
+    This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.)
+
+    This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
+    as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matters related to general usage and
+    behavior.
+
+    <Tip>
+
+    TensorFlow models and layers in `transformers` accept two formats as input:
+
+    - having all inputs as keyword arguments (like PyTorch models), or
+    - having all inputs as a list, tuple or dict in the first positional argument.
+
+    The reason the second format is supported is that Keras methods prefer this format when passing inputs to models
+    and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just
+    pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second
+    format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with
+    the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first
+    positional argument:
+
+    - a single Tensor with `input_values` only and nothing else: `model(input_values)`
+    - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
+    `model([input_values, attention_mask])` or `model([input_values, attention_mask, token_type_ids])`
+    - a dictionary with one or several input Tensors associated to the input names given in the docstring:
+    `model({"input_values": input_values, "token_type_ids": token_type_ids})`
+
+    Note that when creating models and layers with
+    [subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry
+    about any of this, as you can just pass inputs like you would to any other Python function!
+
+    </Tip>
+
+    Args:
+        config ([`HubertConfig`]): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+HUBERT_INPUTS_DOCSTRING = r"""
+    Args:
+        input_values (`np.ndarray`, `tf.Tensor`, `list[tf.Tensor]` `dict[str, tf.Tensor]` or `dict[str, np.ndarray]` and each example must have the shape `({0})`):
+            Indices of input sequence tokens in the vocabulary.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and
+            [`PreTrainedTokenizer.encode`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        attention_mask (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
+        token_type_ids (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*):
+            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
+            1]`:
+
+            - 0 corresponds to a *sentence A* token,
+            - 1 corresponds to a *sentence B* token.
+
+            [What are token type IDs?](../glossary#token-type-ids)
+        position_ids (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*):
+            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+            config.max_position_embeddings - 1]`.
+
+            [What are position IDs?](../glossary#position-ids)
+        head_mask (`np.ndarray` or `tf.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+
+        inputs_embeds (`np.ndarray` or `tf.Tensor` of shape `({0}, hidden_size)`, *optional*):
+            Optionally, instead of passing `input_values` you can choose to directly pass an embedded representation.
+            This is useful if you want more control over how to convert `input_values` indices into associated vectors
+            than the model's internal embedding lookup matrix.
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the
+            config will be used instead.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
+            used instead.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in
+            eager mode, in graph mode the value will always be set to True.
+        training (`bool`, *optional*, defaults to `False`):
+            Whether or not to use the model in training mode (some modules like dropout modules have different
+            behaviors between training and evaluation).
+"""
+
+
+@add_start_docstrings(
+    "The bare TFHubert Model transformer outputting raw hidden-states without any specific head on top.",
+    HUBERT_START_DOCSTRING,
+)
+class TFHubertModel(TFHubertPreTrainedModel):
+    def __init__(self, config: HubertConfig, *inputs, **kwargs):
+        super().__init__(config, *inputs, **kwargs)
+        self.config = config
+        self.hubert = TFHubertMainLayer(config, name="hubert")
+
+    @add_start_docstrings_to_model_forward(HUBERT_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=TFBaseModelOutput, config_class=_CONFIG_FOR_DOC)
+    @unpack_inputs
+    def call(
+        self,
+        input_values: tf.Tensor,
+        attention_mask: tf.Tensor | None = None,
+        token_type_ids: tf.Tensor | None = None,
+        position_ids: tf.Tensor | None = None,
+        head_mask: tf.Tensor | None = None,
+        inputs_embeds: tf.Tensor | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        training: bool = False,
+    ) -> TFBaseModelOutput | tuple[tf.Tensor]:
+        """
+
+        Returns:
+
+        Example:
+
+        ```python
+        >>> from transformers import AutoProcessor, TFHubertModel
+        >>> from datasets import load_dataset
+
+        >>> processor = AutoProcessor.from_pretrained("facebook/hubert-large-ls960-ft")
+        >>> model = TFHubertModel.from_pretrained("facebook/hubert-large-ls960-ft")
+
+
+        >>> def map_to_array(example):
+        ...     example["speech"] = example["audio"]["array"]
+        ...     return example
+
+
+        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+        >>> ds = ds.map(map_to_array)
+
+        >>> input_values = processor(ds["speech"][0], return_tensors="tf").input_values  # Batch size 1
+        >>> hidden_states = model(input_values).last_hidden_state
+        ```"""
+
+        output_hidden_states = output_hidden_states if output_hidden_states else self.config.output_hidden_states
+        output_attentions = output_attentions if output_attentions else self.config.output_attentions
+        return_dict = return_dict if return_dict else self.config.return_dict
+
+        outputs = self.hubert(
+            input_values=input_values,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+
+        return outputs
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "hubert", None) is not None:
+            with tf.name_scope(self.hubert.name):
+                self.hubert.build(None)
+
+
+@add_start_docstrings(
+    """TFHubert Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).""",
+    HUBERT_START_DOCSTRING,
+)
+class TFHubertForCTC(TFHubertPreTrainedModel):
+    def __init__(self, config: HubertConfig, *inputs, **kwargs):
+        super().__init__(config, *inputs, **kwargs)
+
+        self.hubert = TFHubertMainLayer(config, name="hubert")
+        self.dropout = keras.layers.Dropout(config.final_dropout)
+        self.lm_head = keras.layers.Dense(config.vocab_size, name="lm_head")
+        self.output_hidden_size = (
+            config.output_hidden_size if hasattr(config, "add_adapter") and config.add_adapter else config.hidden_size
+        )
+
+    def freeze_feature_extractor(self):
+        """
+        Calling this function will disable the gradient computation for the feature encoder so that its parameters will
+        not be updated during training.
+        """
+        warnings.warn(
+            "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
+            "Please use the equivalent `freeze_feature_encoder` method instead.",
+            FutureWarning,
+        )
+        self.freeze_feature_encoder()
+
+    def freeze_feature_encoder(self):
+        """
+        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
+        not be updated during training.
+        """
+        self.hubert.feature_extractor.trainable = False
+
+    @add_start_docstrings_to_model_forward(HUBERT_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=TFCausalLMOutput, config_class=_CONFIG_FOR_DOC)
+    @unpack_inputs
+    def call(
+        self,
+        input_values: tf.Tensor,
+        attention_mask: tf.Tensor | None = None,
+        token_type_ids: tf.Tensor | None = None,
+        position_ids: tf.Tensor | None = None,
+        head_mask: tf.Tensor | None = None,
+        inputs_embeds: tf.Tensor | None = None,
+        output_attentions: bool | None = None,
+        labels: tf.Tensor | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        training: bool | None = False,
+    ) -> TFCausalLMOutput | tuple[tf.Tensor]:
+        r"""
+        labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
+            config.vocab_size]` (see `input_values` docstring) Tokens with indices set to `-100` are ignored (masked),
+            the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
+
+        Returns:
+
+        Example:
+
+        ```python
+        >>> import tensorflow as tf
+        >>> from transformers import AutoProcessor, TFHubertForCTC
+        >>> from datasets import load_dataset
+
+        >>> processor = AutoProcessor.from_pretrained("facebook/hubert-large-ls960-ft")
+        >>> model = TFHubertForCTC.from_pretrained("facebook/hubert-large-ls960-ft")
+
+
+        >>> def map_to_array(example):
+        ...     example["speech"] = example["audio"]["array"]
+        ...     return example
+
+
+        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+        >>> ds = ds.map(map_to_array)
+
+        >>> input_values = processor(ds["speech"][0], return_tensors="tf").input_values  # Batch size 1
+        >>> logits = model(input_values).logits
+        >>> predicted_ids = tf.argmax(logits, axis=-1)
+
+        >>> transcription = processor.decode(predicted_ids[0])
+
+        >>> # compute loss
+        >>> target_transcription = "A MAN SAID TO THE UNIVERSE SIR I EXIST"
+
+        >>> # Pass the transcription as text to encode labels
+        >>> labels = processor(text=transcription, return_tensors="tf").input_values
+
+        >>> loss = model(input_values, labels=labels).loss
+        ```"""
+        if labels is not None and tf.reduce_max(labels) >= self.config.vocab_size:
+            raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}")
+
+        outputs = self.hubert(
+            input_values=input_values,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+        hidden_states = outputs[0]
+        hidden_states = self.dropout(hidden_states, training=training)
+
+        logits = self.lm_head(hidden_states)
+
+        if labels is not None:
+            attention_mask = (
+                attention_mask if attention_mask is not None else tf.ones_like(input_values, dtype=tf.float32)
+            )
+            input_lengths = self.hubert._get_feat_extract_output_lengths(tf.reduce_sum(attention_mask, axis=-1))
+
+            # assuming that padded tokens are filled with -100
+            # when not being attended to
+            labels_mask = tf.cast(labels >= 0, tf.int32)
+            target_lengths = tf.reduce_sum(labels_mask, axis=-1)
+
+            loss = tf.nn.ctc_loss(
+                logits=logits,
+                labels=labels,
+                logit_length=input_lengths,
+                label_length=target_lengths,
+                blank_index=self.config.pad_token_id,
+                logits_time_major=False,
+            )
+
+            if self.config.ctc_loss_reduction == "sum":
+                loss = tf.reduce_sum(loss)
+                loss = tf.reshape(loss, (1,))
+            if self.config.ctc_loss_reduction == "mean":
+                loss = tf.reduce_mean(loss)
+                loss = tf.reshape(loss, (1,))
+        else:
+            loss = None
+
+        if not return_dict:
+            output = (logits,) + outputs[1:]
+            return ((loss,) + output) if loss is not None else output
+
+        return TFCausalLMOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "hubert", None) is not None:
+            with tf.name_scope(self.hubert.name):
+                self.hubert.build(None)
+        if getattr(self, "lm_head", None) is not None:
+            with tf.name_scope(self.lm_head.name):
+                self.lm_head.build([None, None, self.output_hidden_size])
+
+
+__all__ = ["TFHubertForCTC", "TFHubertModel", "TFHubertPreTrainedModel"]
diff --git a/phivenv/Lib/site-packages/transformers/models/hubert/modular_hubert.py b/phivenv/Lib/site-packages/transformers/models/hubert/modular_hubert.py
new file mode 100644
index 0000000000000000000000000000000000000000..facebcf445e6bdfd75c7880aefad84e136f4da88
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/hubert/modular_hubert.py
@@ -0,0 +1,302 @@
+# coding=utf-8
+# Copyright 2021 The Fairseq Authors and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch Hubert model."""
+
+from typing import Optional, Union
+
+import torch
+import torch.nn as nn
+
+from ...activations import ACT2FN
+from ...integrations.deepspeed import is_deepspeed_zero3_enabled
+from ...modeling_outputs import BaseModelOutput
+from ...modeling_utils import PreTrainedModel
+from ...utils import auto_docstring
+from ..wav2vec2.modeling_wav2vec2 import (
+    Wav2Vec2Encoder,
+    Wav2Vec2EncoderStableLayerNorm,
+    Wav2Vec2FeatureEncoder,
+    Wav2Vec2ForCTC,
+    Wav2Vec2ForSequenceClassification,
+    Wav2Vec2Model,
+    Wav2Vec2SamePadLayer,
+)
+from .configuration_hubert import HubertConfig
+
+
+_HIDDEN_STATES_START_POSITION = 1
+
+
+class HubertPositionalConvEmbedding(nn.Module):  # grouped Conv1d, then `HubertSamePadLayer` and an activation
+    def __init__(self, config):
+        super().__init__()
+        self.conv = nn.Conv1d(
+            config.hidden_size,
+            config.hidden_size,
+            kernel_size=config.num_conv_pos_embeddings,
+            padding=config.num_conv_pos_embeddings // 2,
+            groups=config.num_conv_pos_embedding_groups,
+        )
+
+        self.batch_norm = None
+        if config.conv_pos_batch_norm:  # either batch-norm the conv input, or weight-normalize the conv (else branch)
+            self.batch_norm = nn.BatchNorm1d(config.hidden_size)
+        else:
+            weight_norm = nn.utils.weight_norm
+            if hasattr(nn.utils.parametrizations, "weight_norm"):  # prefer the parametrization API when it exists
+                weight_norm = nn.utils.parametrizations.weight_norm
+
+            if is_deepspeed_zero3_enabled():
+                import deepspeed
+
+                with deepspeed.zero.GatheredParameters(self.conv.weight, modifier_rank=0):  # wrap the conv while its weight is gathered
+                    self.conv = weight_norm(self.conv, name="weight", dim=2)
+                if hasattr(self.conv, "parametrizations"):  # parametrization API: tensors live under original0/original1
+                    weight_g = self.conv.parametrizations.weight.original0
+                    weight_v = self.conv.parametrizations.weight.original1
+                else:  # legacy API: tensors are plain `weight_g` / `weight_v` attributes
+                    weight_g = self.conv.weight_g
+                    weight_v = self.conv.weight_v
+                deepspeed.zero.register_external_parameter(self, weight_v)
+                deepspeed.zero.register_external_parameter(self, weight_g)
+            else:
+                self.conv = weight_norm(self.conv, name="weight", dim=2)
+
+        self.padding = HubertSamePadLayer(config.num_conv_pos_embeddings)
+        self.activation = ACT2FN[config.feat_extract_activation]  # activation looked up by its config name
+
+    def forward(self, hidden_states):
+        hidden_states = hidden_states.transpose(1, 2)  # put the `hidden_size` channels on dim 1 for BatchNorm1d/Conv1d
+        if self.batch_norm is not None:  # only set when `config.conv_pos_batch_norm` is truthy
+            hidden_states = self.batch_norm(hidden_states)
+        hidden_states = self.conv(hidden_states)
+        hidden_states = self.padding(hidden_states)
+        hidden_states = self.activation(hidden_states)
+
+        hidden_states = hidden_states.transpose(1, 2)  # restore the layout the caller passed in
+        return hidden_states
+
+
+class HubertSamePadLayer(Wav2Vec2SamePadLayer):  # Hubert-named subclass of the Wav2Vec2 layer
+    pass  # nothing added or overridden
+
+
+class HubertFeatureEncoder(Wav2Vec2FeatureEncoder):  # Hubert-named subclass of the Wav2Vec2 feature encoder
+    pass  # nothing added or overridden
+
+
+class HubertFeatureProjection(nn.Module):  # optional LayerNorm -> Linear(conv_dim[-1] -> hidden_size) -> Dropout
+    def __init__(self, config):
+        super().__init__()
+        self.feat_proj_layer_norm = config.feat_proj_layer_norm
+        if self.feat_proj_layer_norm:  # `self.layer_norm` only exists when the flag is truthy
+            self.layer_norm = nn.LayerNorm(config.conv_dim[-1], eps=config.layer_norm_eps)
+        self.projection = nn.Linear(config.conv_dim[-1], config.hidden_size)
+        self.dropout = nn.Dropout(config.feat_proj_dropout)
+
+    def forward(self, hidden_states):
+        # only the projected states are returned; the un-projected input is not kept or handed back
+        if self.feat_proj_layer_norm:  # same flag that guarded the creation of `self.layer_norm`
+            hidden_states = self.layer_norm(hidden_states)
+        hidden_states = self.projection(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        return hidden_states
+
+
+class HubertEncoder(Wav2Vec2Encoder):  # Hubert-named subclass of the Wav2Vec2 encoder
+    pass  # nothing added or overridden
+
+
+class HubertEncoderStableLayerNorm(Wav2Vec2EncoderStableLayerNorm):  # Hubert-named subclass of the Wav2Vec2 variant
+    pass  # nothing added or overridden
+
+
+@auto_docstring
+class HubertPreTrainedModel(PreTrainedModel):
+    config: HubertConfig
+    base_model_prefix = "hubert"
+    main_input_name = "input_values"
+    supports_gradient_checkpointing = True
+    _supports_flash_attn = True
+    _supports_sdpa = True
+    _supports_flex_attn = True
+
+    def _init_weights(self, module):
+        """Initialize `module` in place according to its type; module types not listed below are left untouched."""
+        if isinstance(module, nn.Linear):
+            # Slightly different from the TF version which uses truncated_normal for initialization
+            # cf https://github.com/pytorch/pytorch/pull/5617
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, (nn.LayerNorm, nn.GroupNorm, nn.BatchNorm1d)):  # norm layers: zero bias, unit weight
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+        elif isinstance(module, nn.Conv1d):  # Kaiming-normal weight, zero bias
+            if is_deepspeed_zero3_enabled():
+                import deepspeed
+
+                if hasattr(module, "weight_v") and hasattr(module, "weight_g"):  # conv carries weight_v / weight_g attributes
+                    with deepspeed.zero.GatheredParameters([module.weight_v, module.weight_g], modifier_rank=0):
+                        nn.init.kaiming_normal_(module.weight.data)
+                else:
+                    with deepspeed.zero.GatheredParameters(module.weight, modifier_rank=0):
+                        nn.init.kaiming_normal_(module.weight.data)
+            else:
+                nn.init.kaiming_normal_(module.weight.data)
+
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, HubertModel):
+            if hasattr(module, "masked_spec_embed"):  # attribute is only created when masking is configured
+                module.masked_spec_embed.data.uniform_()
+        elif isinstance(module, HubertForSequenceClassification):
+            if hasattr(module, "layer_weights"):  # every entry gets the same value 1 / (num_hidden_layers + 1)
+                module.layer_weights.data.fill_(1.0 / (self.config.num_hidden_layers + 1))
+
+    def _get_feat_extract_output_lengths(self, input_lengths: Union[torch.LongTensor, int]):
+        """
+        Apply the Conv1d length formula once per (`conv_kernel`, `conv_stride`) pair of the config; return the result.
+        """
+
+        def _conv_out_length(input_length, kernel_size, stride):
+            # 1D convolutional layer output length formula taken
+            # from https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html
+            return torch.div(input_length - kernel_size, stride, rounding_mode="floor") + 1
+
+        for kernel_size, stride in zip(self.config.conv_kernel, self.config.conv_stride):  # layers applied in config order
+            input_lengths = _conv_out_length(input_lengths, kernel_size, stride)
+
+        return input_lengths
+
+    def _get_feature_vector_attention_mask(self, feature_vector_length: int, attention_mask: torch.LongTensor):
+        output_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(-1)).to(torch.long)  # per-row length
+        batch_size = attention_mask.shape[0]
+
+        attention_mask = torch.zeros(  # fresh all-zero mask; the input mask is only used for its sum, dtype and device
+            (batch_size, feature_vector_length), dtype=attention_mask.dtype, device=attention_mask.device
+        )
+        # these two operations make sure that all positions up to and including index `output_lengths - 1` are attended to
+        attention_mask[(torch.arange(attention_mask.shape[0], device=attention_mask.device), output_lengths - 1)] = 1
+        attention_mask = attention_mask.flip([-1]).cumsum(-1).flip([-1]).bool()  # reverse cumsum spreads the 1 leftwards
+        return attention_mask
+
+
+class HubertModel(Wav2Vec2Model, HubertPreTrainedModel):
+    def __init__(self, config: HubertConfig):
+        super().__init__(config)
+        self.config = config
+        self.feature_extractor = HubertFeatureEncoder(config)
+        self.feature_projection = HubertFeatureProjection(config)
+
+        if config.mask_time_prob > 0.0 or config.mask_feature_prob > 0.0:  # embedding only exists when masking is enabled
+            self.masked_spec_embed = nn.Parameter(torch.Tensor(config.hidden_size).uniform_())
+
+        if config.do_stable_layer_norm:  # the config flag picks the encoder variant
+            self.encoder = HubertEncoderStableLayerNorm(config)
+        else:
+            self.encoder = HubertEncoder(config)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+        del self.adapter  # presumably set by `Wav2Vec2Model.__init__`; Hubert does not keep it - TODO confirm
+
+    def freeze_feature_extractor(self):  # always raises
+        raise AttributeError("Not needed for Hubert")
+
+    def freeze_feature_encoder(self):  # always raises
+        raise AttributeError("Not needed for Hubert")
+
+    def forward(
+        self,
+        input_values: Optional[torch.Tensor],
+        attention_mask: Optional[torch.Tensor] = None,
+        mask_time_indices: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, BaseModelOutput]:
+        r"""
+        mask_time_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Forwarded unchanged to `_mask_hidden_states`, which is applied to the projected features before the
+            encoder. NOTE(review): the annotation says `torch.FloatTensor` while this line says `BoolTensor` - confirm.
+
+        Example:
+
+        ```python
+        >>> from transformers import AutoProcessor, HubertModel
+        >>> from datasets import load_dataset
+
+        >>> processor = AutoProcessor.from_pretrained("facebook/hubert-large-ls960-ft")
+        >>> model = HubertModel.from_pretrained("facebook/hubert-large-ls960-ft")
+
+
+        >>> def map_to_array(example):
+        ...     example["speech"] = example["audio"]["array"]
+        ...     return example
+
+
+        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+        >>> ds = ds.map(map_to_array)
+
+        >>> input_values = processor(ds["speech"][0], return_tensors="pt").input_values  # Batch size 1
+        >>> hidden_states = model(input_values).last_hidden_state
+        ```"""
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict  # None -> config default
+
+        extract_features = self.feature_extractor(input_values)
+        extract_features = extract_features.transpose(1, 2)  # swap dims 1 and 2 so dim 1 indexes feature vectors
+
+        if attention_mask is not None:
+            # compute reduced attention_mask corresponding to feature vectors
+            attention_mask = self._get_feature_vector_attention_mask(extract_features.shape[1], attention_mask)
+
+        hidden_states = self.feature_projection(extract_features)
+        hidden_states = self._mask_hidden_states(hidden_states, mask_time_indices=mask_time_indices)
+
+        encoder_outputs = self.encoder(
+            hidden_states,
+            attention_mask=attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        hidden_states = encoder_outputs[0]  # first element is used as the last hidden state in both return paths
+
+        if not return_dict:  # tuple output: last hidden state followed by the remaining encoder outputs
+            return (hidden_states,) + encoder_outputs[1:]
+
+        return BaseModelOutput(
+            last_hidden_state=hidden_states,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+        )
+
+
+class HubertForCTC(Wav2Vec2ForCTC):  # Hubert-named subclass of the Wav2Vec2 CTC model
+    pass  # nothing added or overridden
+
+
+class HubertForSequenceClassification(Wav2Vec2ForSequenceClassification):  # Hubert-named subclass of the Wav2Vec2 classifier
+    pass  # nothing added or overridden
+
+
+__all__ = ["HubertForCTC", "HubertForSequenceClassification", "HubertModel", "HubertPreTrainedModel"]
diff --git a/phivenv/Lib/site-packages/transformers/models/hunyuan_v1_dense/__init__.py b/phivenv/Lib/site-packages/transformers/models/hunyuan_v1_dense/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..27de691c845369b97805fb53a8e509b7e948b386
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/hunyuan_v1_dense/__init__.py
@@ -0,0 +1,15 @@
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:  # static analysers get the real star-imports; this branch does not run at import time
+    from .configuration_hunyuan_v1_dense import *
+    from .modeling_hunyuan_v1_dense import *
+    from .tokenization_hy import *  # NOTE(review): no `tokenization_hy` module is visible in this part of the diff - confirm it exists
+else:  # at runtime the package entry in `sys.modules` is replaced by a `_LazyModule` instance
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/phivenv/Lib/site-packages/transformers/models/hunyuan_v1_dense/__pycache__/__init__.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/hunyuan_v1_dense/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9011c140b0608e589116807db7282ad01f73eec7
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/hunyuan_v1_dense/__pycache__/__init__.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/hunyuan_v1_dense/__pycache__/configuration_hunyuan_v1_dense.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/hunyuan_v1_dense/__pycache__/configuration_hunyuan_v1_dense.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..62cf0a3344aac6e42d0a134bd87b26d321ff3ad7
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/hunyuan_v1_dense/__pycache__/configuration_hunyuan_v1_dense.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/hunyuan_v1_dense/__pycache__/modeling_hunyuan_v1_dense.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/hunyuan_v1_dense/__pycache__/modeling_hunyuan_v1_dense.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..161b2394b78e9a1c3c3c36dfed0c3333394c8d7a
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/hunyuan_v1_dense/__pycache__/modeling_hunyuan_v1_dense.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/hunyuan_v1_dense/__pycache__/modular_hunyuan_v1_dense.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/hunyuan_v1_dense/__pycache__/modular_hunyuan_v1_dense.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2d8714f343abaefaf36e9d32ee419700a83351f9
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/hunyuan_v1_dense/__pycache__/modular_hunyuan_v1_dense.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/hunyuan_v1_dense/configuration_hunyuan_v1_dense.py b/phivenv/Lib/site-packages/transformers/models/hunyuan_v1_dense/configuration_hunyuan_v1_dense.py
new file mode 100644
index 0000000000000000000000000000000000000000..064b0a9702ccc1a1bd38dfe889bd0eb88291fac0
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/hunyuan_v1_dense/configuration_hunyuan_v1_dense.py
@@ -0,0 +1,189 @@
+# coding=utf-8
+# Copyright (C) 2025 THL A29 Limited, a Tencent company and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""HunYuanDenseV1 model configuration"""
+
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class HunYuanDenseV1Config(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`HunYuanDenseV1Config`]. It is used to instantiate an
+    HunYuan model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the HunYuan-7B.
+    Hunyuan-7B-Instruct [tencent/Hunyuan-7B-Instruct](https://huggingface.co/tencent/Hunyuan-7B-Instruct).
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 290943):
+            Vocabulary size of the HunYuan model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`HunYuanDenseV1Config`]
+        hidden_size (`int`, *optional*, defaults to 4096):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 11008):
+            Dimension of the MLP representations or shared MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 32):
+            Number of hidden layers in the Transformer decoder.
+        num_attention_heads (`int`, *optional*, defaults to 32):
+            Number of attention heads for each attention layer in the Transformer decoder.
+        num_key_value_heads (`int`, *optional*):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details checkout [this
+            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
+            `num_attention_heads`.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the decoder.
+        max_position_embeddings (`int`, *optional*, defaults to 2048):
+            The maximum sequence length that this model might ever be used with.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        pad_token_id (`int`, *optional*, defaults to 0):
+            Padding token id.
+        bos_token_id (`int`, *optional*, defaults to 1):
+            Beginning of stream token id.
+        eos_token_id (`int`, *optional*, defaults to 2):
+            End of stream token id.
+        eod_token_id (`int`, *optional*, defaults to 3):
+            Token ID representing the end-of-document marker. Used to indicate the termination of a text sequence.
+            Example: In multi-document processing, this token helps the model distinguish between separate documents.
+        pretraining_tp (`int`, *optional*, defaults to 1):
+            Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this
+            document](https://huggingface.co/docs/transformers/parallelism) to understand more about it. This value is
+            necessary to ensure exact reproducibility of the pretraining results. Please refer to [this
+            issue](https://github.com/pytorch/pytorch/issues/76232).
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether to tie weight embeddings
+        rope_theta (`float`, *optional*, defaults to 10000.0):
+            The base period of the RoPE embeddings.
+        rope_scaling (`Dict`, *optional*):
+            Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
+            strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is
+            `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
+            `max_position_embeddings` to the expected new maximum. See the following thread for more information on how
+            these scaling strategies behave:
+            https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an
+            experimental feature, subject to breaking API changes in future versions.
+        attention_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use a bias in the query, key, value and output projection layers during self-attention.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        head_dim (`int`, *optional*):
+            The attention head dimension. Stored as given; `None` when not provided.
+    """
+
+    model_type = "hunyuan_v1_dense"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        vocab_size=290943,
+        hidden_size=4096,
+        intermediate_size: int = 11008,
+        num_hidden_layers=32,
+        num_attention_heads=32,
+        num_key_value_heads=None,
+        hidden_act="silu",
+        max_position_embeddings=2048,
+        initializer_range=0.02,
+        rms_norm_eps=1e-5,
+        use_cache=True,
+        pad_token_id=0,
+        bos_token_id=1,
+        eos_token_id=2,
+        eod_token_id=3,
+        pretraining_tp=1,
+        tie_word_embeddings=False,
+        rope_theta=10000.0,
+        rope_scaling=None,
+        attention_bias=False,
+        attention_dropout=0.0,
+        head_dim=None,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.head_dim = head_dim
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.pretraining_tp = pretraining_tp
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        # self._rope_scaling_validation()   # TODO: Need validation?
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+        self.eod_token_id = eod_token_id  # documented argument; it used to be accepted and then silently discarded
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+    def _rope_scaling_validation(self):
+        """
+        Validate the `rope_scaling` configuration; raises `ValueError` on a malformed value, returns `None` otherwise.
+        """
+        if self.rope_scaling is None:
+            return
+
+        if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
+            raise ValueError(
+                "`rope_scaling` must be a dictionary with two fields, `type` and `factor` or `type` and `alpha`, "
+                f"got {self.rope_scaling}"
+            )
+        rope_scaling_type = self.rope_scaling.get("type", None)
+        rope_scaling_factor = self.rope_scaling.get("factor", None)
+        rope_scaling_alpha = self.rope_scaling.get("alpha", None)
+        if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]:
+            raise ValueError(
+                f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
+            )
+        if rope_scaling_factor is None and rope_scaling_alpha is None:
+            raise ValueError("`rope_scaling` must have a `factor` or an `alpha` field, got neither")
+        if rope_scaling_factor is not None:
+            if not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0:
+                raise ValueError(f"`rope_scaling`'s factor field must be a float > 1.0, got {rope_scaling_factor}")
+        if rope_scaling_alpha is not None:
+            if not isinstance(rope_scaling_alpha, float) or rope_scaling_alpha <= 1.0:
+                raise ValueError(f"`rope_scaling`'s alpha field must be a float > 1.0, got {rope_scaling_alpha}")
+
+
+__all__ = ["HunYuanDenseV1Config"]
diff --git a/phivenv/Lib/site-packages/transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py b/phivenv/Lib/site-packages/transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py
new file mode 100644
index 0000000000000000000000000000000000000000..120cc5edbd0c34ecdd201f8fcc42f7a9d2453a1a
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py
@@ -0,0 +1,514 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_hunyuan_v1_dense.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# coding=utf-8
+# Copyright (C) 2025 THL A29 Limited, a Tencent company and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Callable, Optional, Union
+
+import torch
+from torch import nn
+
+from transformers.cache_utils import Cache
+
+from ...activations import ACT2FN
+from ...cache_utils import DynamicCache
+from ...generation import GenerationMixin
+from ...integrations import use_kernel_forward_from_hub
+from ...masking_utils import create_causal_mask
+from ...modeling_layers import GenericForSequenceClassification, GradientCheckpointingLayer
+from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
+from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from ...processing_utils import Unpack
+from ...utils import TransformersKwargs, auto_docstring, can_return_tuple
+from ...utils.deprecation import deprecate_kwarg
+from ...utils.generic import check_model_inputs
+from .configuration_hunyuan_v1_dense import HunYuanDenseV1Config
+
+
+@use_kernel_forward_from_hub("RMSNorm")
+class HunYuanDenseV1RMSNorm(nn.Module):
+    def __init__(self, hidden_size, eps=1e-6):
+        """
+        HunYuanDenseV1RMSNorm is equivalent to T5LayerNorm
+        """
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+
+    def forward(self, hidden_states):
+        input_dtype = hidden_states.dtype
+        hidden_states = hidden_states.to(torch.float32)
+        variance = hidden_states.pow(2).mean(-1, keepdim=True)
+        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+        return self.weight * hidden_states.to(input_dtype)
+
+    def extra_repr(self):
+        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
+
+
+class HunYuanDenseV1MLP(nn.Module):
+    def __init__(self, config: HunYuanDenseV1Config, layer_idx=None, is_shared_mlp=False):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.intermediate_size = config.intermediate_size
+        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
+        self.act_fn = ACT2FN[config.hidden_act]
+        self.layer_idx = layer_idx
+
+    def forward(self, x):
+        down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
+        return down_proj
+
+
+def rotate_half(x):
+    """Rotates half the hidden dims of the input."""
+    x1 = x[..., : x.shape[-1] // 2]
+    x2 = x[..., x.shape[-1] // 2 :]
+    return torch.cat((-x2, x1), dim=-1)
+
+
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
+    """Applies Rotary Position Embedding to the query and key tensors.
+
+    Args:
+        q (`torch.Tensor`): The query tensor.
+        k (`torch.Tensor`): The key tensor.
+        cos (`torch.Tensor`): The cosine part of the rotary embedding.
+        sin (`torch.Tensor`): The sine part of the rotary embedding.
+        position_ids (`torch.Tensor`, *optional*):
+            Deprecated and unused.
+        unsqueeze_dim (`int`, *optional*, defaults to 1):
+            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
+            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
+            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
+            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
+            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
+            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
+    Returns:
+        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
+    """
+    cos = cos.unsqueeze(unsqueeze_dim)
+    sin = sin.unsqueeze(unsqueeze_dim)
+    q_embed = (q * cos) + (rotate_half(q) * sin)
+    k_embed = (k * cos) + (rotate_half(k) * sin)
+    return q_embed, k_embed
+
+
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """
+    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
+    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
+    """
+    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+    if n_rep == 1:
+        return hidden_states
+    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
+    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
+
+
+def eager_attention_forward(
+    module: nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: Optional[torch.Tensor],
+    scaling: float,
+    dropout: float = 0.0,
+    **kwargs: Unpack[TransformersKwargs],
+):
+    key_states = repeat_kv(key, module.num_key_value_groups)
+    value_states = repeat_kv(value, module.num_key_value_groups)
+
+    attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
+    if attention_mask is not None:
+        causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
+        attn_weights = attn_weights + causal_mask
+
+    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
+    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
+    attn_output = torch.matmul(attn_weights, value_states)
+    attn_output = attn_output.transpose(1, 2).contiguous()
+
+    return attn_output, attn_weights
+
+
+class HunYuanDenseV1Attention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+    def __init__(self, config: HunYuanDenseV1Config, layer_idx: int):
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+        self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
+        self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
+        self.scaling = self.head_dim**-0.5
+        self.attention_dropout = config.attention_dropout
+        self.is_causal = True
+
+        self.q_proj = nn.Linear(
+            config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.k_proj = nn.Linear(
+            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.v_proj = nn.Linear(
+            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.o_proj = nn.Linear(
+            config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
+        )
+        self.query_layernorm = HunYuanDenseV1RMSNorm(self.head_dim, eps=config.rms_norm_eps)
+        self.key_layernorm = HunYuanDenseV1RMSNorm(self.head_dim, eps=config.rms_norm_eps)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
+        attention_mask: Optional[torch.Tensor],
+        past_key_values: Optional[Cache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        input_shape = hidden_states.shape[:-1]
+        hidden_shape = (*input_shape, -1, self.head_dim)
+
+        query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+        key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+        value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+
+        cos, sin = position_embeddings
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+        query_states = self.query_layernorm(query_states)
+        key_states = self.key_layernorm(key_states)
+
+        if past_key_values is not None:
+            # sin and cos are specific to RoPE models; cache_position needed for the static cache
+            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        attention_interface: Callable = eager_attention_forward
+        if self.config._attn_implementation != "eager":
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            dropout=0.0 if not self.training else self.attention_dropout,
+            scaling=self.scaling,
+            **kwargs,
+        )
+
+        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
+        attn_output = self.o_proj(attn_output)
+        return attn_output, attn_weights
+
+
+class HunYuanDenseV1DecoderLayer(GradientCheckpointingLayer):
+    def __init__(self, config: HunYuanDenseV1Config, layer_idx: int):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+
+        self.self_attn = HunYuanDenseV1Attention(config=config, layer_idx=layer_idx)
+
+        self.mlp = HunYuanDenseV1MLP(config)
+        self.input_layernorm = HunYuanDenseV1RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = HunYuanDenseV1RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.layer_idx = layer_idx
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        use_cache: Optional[bool] = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> torch.Tensor:
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+        # Self Attention
+        hidden_states, _ = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            position_embeddings=position_embeddings,
+            **kwargs,
+        )
+        hidden_states = residual + hidden_states
+
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states
+        return hidden_states
+
+
+@auto_docstring
+class HunYuanDenseV1PreTrainedModel(PreTrainedModel):
+    config: HunYuanDenseV1Config
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["HunYuanDenseV1DecoderLayer"]
+    _skip_keys_device_placement = ["past_key_values"]
+    _supports_flash_attn = True
+    _supports_sdpa = True
+    _supports_flex_attn = True
+
+    _can_compile_fullgraph = True
+    _supports_attention_backend = True
+    _can_record_outputs = {
+        "hidden_states": HunYuanDenseV1DecoderLayer,
+        "attentions": HunYuanDenseV1Attention,
+    }
+
+    def _init_weights(self, module):
+        std = self.config.initializer_range
+        if isinstance(module, nn.Linear):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+
+
+class HunYuanDenseV1RotaryEmbedding(nn.Module):
+    inv_freq: torch.Tensor  # fix linting for `register_buffer`
+
+    def __init__(self, config: HunYuanDenseV1Config, device=None):
+        super().__init__()
+        # BC: "rope_type" was originally "type"
+        if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict):
+            self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
+        else:
+            self.rope_type = "default"
+        self.max_seq_len_cached = config.max_position_embeddings
+        self.original_max_seq_len = config.max_position_embeddings
+
+        self.config = config
+        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
+        if self.rope_type == "dynamic" and config.rope_scaling.get("alpha"):
+            # DynamicNTKAlphaRotary: only when a truthy "alpha" is set; `.get` tolerates dicts without an "alpha" key
+            self.dim = config.head_dim
+            base = config.rope_theta * config.rope_scaling.get("alpha") ** (self.dim / (self.dim - 2))
+            inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
+            self.attention_scaling = 1.0
+        else:
+            inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
+
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+        self.original_inv_freq = self.inv_freq
+
+    @torch.no_grad()
+    @dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
+    def forward(self, x, position_ids):
+        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
+        position_ids_expanded = position_ids[:, None, :].float()
+
+        device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
+        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
+            emb = torch.cat((freqs, freqs), dim=-1)
+            cos = emb.cos() * self.attention_scaling
+            sin = emb.sin() * self.attention_scaling
+
+        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
+
+
+@auto_docstring
+class HunYuanDenseV1Model(HunYuanDenseV1PreTrainedModel):
+    def __init__(self, config: HunYuanDenseV1Config):
+        super().__init__(config)
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+        self.layers = nn.ModuleList(
+            [HunYuanDenseV1DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+        )
+        self.norm = HunYuanDenseV1RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.rotary_emb = HunYuanDenseV1RotaryEmbedding(config=config)
+        self.gradient_checkpointing = False
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @check_model_inputs
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> BaseModelOutputWithPast:
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+        if inputs_embeds is None:
+            inputs_embeds: torch.Tensor = self.embed_tokens(input_ids)
+
+        if use_cache and past_key_values is None:
+            past_key_values = DynamicCache(config=self.config)
+
+        if cache_position is None:
+            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+            cache_position: torch.Tensor = torch.arange(
+                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
+            )
+
+        if position_ids is None:
+            position_ids = cache_position.unsqueeze(0)
+
+        causal_mask = create_causal_mask(
+            config=self.config,
+            input_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            cache_position=cache_position,
+            past_key_values=past_key_values,
+            position_ids=position_ids,
+        )
+
+        hidden_states = inputs_embeds
+        position_embeddings = self.rotary_emb(hidden_states, position_ids)
+
+        for decoder_layer in self.layers[: self.config.num_hidden_layers]:
+            hidden_states = decoder_layer(
+                hidden_states,
+                attention_mask=causal_mask,
+                position_ids=position_ids,
+                past_key_values=past_key_values,
+                cache_position=cache_position,
+                position_embeddings=position_embeddings,
+                **kwargs,
+            )
+
+        hidden_states = self.norm(hidden_states)
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values,
+        )
+
+
+@auto_docstring
+class HunYuanDenseV1ForCausalLM(HunYuanDenseV1PreTrainedModel, GenerationMixin):
+    _tied_weights_keys = ["lm_head.weight"]
+    _tp_plan = {"lm_head": "colwise_rep"}
+    _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.model = HunYuanDenseV1Model(config)
+        self.vocab_size = config.vocab_size
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> CausalLMOutputWithPast:
+        r"""
+        Example:
+
+        ```python
+        >>> from transformers import AutoTokenizer, HunYuanDenseV1ForCausalLM
+
+        >>> model = HunYuanDenseV1ForCausalLM.from_pretrained("meta-hunyuan_v1_dense/HunYuanDenseV1-2-7b-hf")
+        >>> tokenizer = AutoTokenizer.from_pretrained("meta-hunyuan_v1_dense/HunYuanDenseV1-2-7b-hf")
+
+        >>> prompt = "Hey, are you conscious? Can you talk to me?"
+        >>> inputs = tokenizer(prompt, return_tensors="pt")
+
+        >>> # Generate
+        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
+        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
+        ```"""
+        outputs: BaseModelOutputWithPast = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            **kwargs,
+        )
+
+        hidden_states = outputs.last_hidden_state
+        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
+        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+        logits = self.lm_head(hidden_states[:, slice_indices, :])
+
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs)
+
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+class HunYuanDenseV1ForSequenceClassification(GenericForSequenceClassification, HunYuanDenseV1PreTrainedModel):
+    pass
+
+
+__all__ = [
+    "HunYuanDenseV1ForCausalLM",
+    "HunYuanDenseV1Model",
+    "HunYuanDenseV1PreTrainedModel",
+    "HunYuanDenseV1ForSequenceClassification",
+]
diff --git a/phivenv/Lib/site-packages/transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py b/phivenv/Lib/site-packages/transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py
new file mode 100644
index 0000000000000000000000000000000000000000..c79ccc6a616dd343d5aa11b3aec4ae78320c8830
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py
@@ -0,0 +1,194 @@
+# coding=utf-8
+# Copyright (C) 2025 THL A29 Limited, a Tencent company and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch HunYuanDenseV1 model."""
+
+from typing import Callable, Optional
+
+import torch
+import torch.utils.checkpoint
+from torch import nn
+
+from transformers.cache_utils import Cache
+from transformers.utils import (
+    logging,
+)
+
+from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
+from ...processing_utils import Unpack
+from ...utils import TransformersKwargs
+from ..llama.modeling_llama import (
+    LlamaAttention,
+    LlamaDecoderLayer,
+    LlamaForCausalLM,
+    LlamaForSequenceClassification,
+    LlamaMLP,
+    LlamaModel,
+    LlamaPreTrainedModel,
+    LlamaRMSNorm,
+    apply_rotary_pos_emb,
+    eager_attention_forward,
+)
+from .configuration_hunyuan_v1_dense import HunYuanDenseV1Config
+
+
+logger = logging.get_logger(__name__)
+
+
+class HunYuanDenseV1RMSNorm(LlamaRMSNorm):
+    pass
+
+
+class HunYuanDenseV1MLP(LlamaMLP):
+    def __init__(self, config: HunYuanDenseV1Config, layer_idx=None, is_shared_mlp=False):
+        super().__init__(config)
+        self.layer_idx = layer_idx
+        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
+
+
+class HunYuanDenseV1Attention(LlamaAttention):
+    def __init__(self, config: HunYuanDenseV1Config, layer_idx: int):
+        super().__init__(config, layer_idx)
+        self.query_layernorm = HunYuanDenseV1RMSNorm(self.head_dim, eps=config.rms_norm_eps)
+        self.key_layernorm = HunYuanDenseV1RMSNorm(self.head_dim, eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
+        attention_mask: Optional[torch.Tensor],
+        past_key_values: Optional[Cache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        input_shape = hidden_states.shape[:-1]
+        hidden_shape = (*input_shape, -1, self.head_dim)
+
+        query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+        key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+        value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+
+        cos, sin = position_embeddings
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+        query_states = self.query_layernorm(query_states)
+        key_states = self.key_layernorm(key_states)
+
+        if past_key_values is not None:
+            # sin and cos are specific to RoPE models; cache_position needed for the static cache
+            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        attention_interface: Callable = eager_attention_forward
+        if self.config._attn_implementation != "eager":
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            dropout=0.0 if not self.training else self.attention_dropout,
+            scaling=self.scaling,
+            **kwargs,
+        )
+
+        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
+        attn_output = self.o_proj(attn_output)
+        return attn_output, attn_weights
+
+
+class HunYuanDenseV1DecoderLayer(LlamaDecoderLayer):
+    def __init__(self, config: HunYuanDenseV1Config, layer_idx: int):
+        super().__init__(config, layer_idx)
+        self.layer_idx = layer_idx
+
+
+class HunYuanDenseV1PreTrainedModel(LlamaPreTrainedModel):
+    def _init_weights(self, module):
+        std = self.config.initializer_range
+        if isinstance(module, nn.Linear):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+
+
+class HunYuanDenseV1RotaryEmbedding(nn.Module):
+    inv_freq: torch.Tensor  # fix linting for `register_buffer`
+
+    def __init__(self, config: HunYuanDenseV1Config, device=None):
+        super().__init__()
+        # BC: "rope_type" was originally "type"
+        if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict):
+            self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
+        else:
+            self.rope_type = "default"
+        self.max_seq_len_cached = config.max_position_embeddings
+        self.original_max_seq_len = config.max_position_embeddings
+
+        self.config = config
+        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
+        if self.rope_type == "dynamic" and config.rope_scaling.get("alpha"):
+            # DynamicNTKAlphaRotary: only when a truthy "alpha" is set; `.get` tolerates dicts without an "alpha" key
+            self.dim = config.head_dim
+            base = config.rope_theta * config.rope_scaling.get("alpha") ** (self.dim / (self.dim - 2))
+            inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
+            self.attention_scaling = 1.0
+        else:
+            inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
+
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+        self.original_inv_freq = self.inv_freq
+
+    @torch.no_grad()
+    @dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
+    def forward(self, x, position_ids):
+        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
+        position_ids_expanded = position_ids[:, None, :].float()
+
+        device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
+        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
+            emb = torch.cat((freqs, freqs), dim=-1)
+            cos = emb.cos() * self.attention_scaling
+            sin = emb.sin() * self.attention_scaling
+
+        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
+
+
+class HunYuanDenseV1Model(LlamaModel):
+    pass
+
+
+class HunYuanDenseV1ForCausalLM(LlamaForCausalLM):
+    pass
+
+
+class HunYuanDenseV1ForSequenceClassification(LlamaForSequenceClassification):
+    pass
+
+
+__all__ = [
+    "HunYuanDenseV1ForCausalLM",
+    "HunYuanDenseV1Model",
+    "HunYuanDenseV1PreTrainedModel",
+    "HunYuanDenseV1ForSequenceClassification",
+]
diff --git a/phivenv/Lib/site-packages/transformers/models/hunyuan_v1_moe/__init__.py b/phivenv/Lib/site-packages/transformers/models/hunyuan_v1_moe/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..cd107ee7a3c16d2527806035f67702e9220cae51
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/hunyuan_v1_moe/__init__.py
@@ -0,0 +1,14 @@
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_hunyuan_v1_moe import *
+    from .modeling_hunyuan_v1_moe import *  # must name the sibling modeling_hunyuan_v1_moe.py module
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/phivenv/Lib/site-packages/transformers/models/hunyuan_v1_moe/__pycache__/__init__.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/hunyuan_v1_moe/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4b1699c7d548925d3bab02149a0218ce16b184c2
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/hunyuan_v1_moe/__pycache__/__init__.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/hunyuan_v1_moe/__pycache__/configuration_hunyuan_v1_moe.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/hunyuan_v1_moe/__pycache__/configuration_hunyuan_v1_moe.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fe2c15e99d87c2eadeaabab2cc9205c0068ecd69
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/hunyuan_v1_moe/__pycache__/configuration_hunyuan_v1_moe.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/hunyuan_v1_moe/__pycache__/modeling_hunyuan_v1_moe.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/hunyuan_v1_moe/__pycache__/modeling_hunyuan_v1_moe.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2957cea2c772311fa763ba66db3b6a07e64f0a4a
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/hunyuan_v1_moe/__pycache__/modeling_hunyuan_v1_moe.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/hunyuan_v1_moe/__pycache__/modular_hunyuan_v1_moe.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/hunyuan_v1_moe/__pycache__/modular_hunyuan_v1_moe.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ee626f7e8650fe8a7e8fc5ed482fca2a58344740
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/hunyuan_v1_moe/__pycache__/modular_hunyuan_v1_moe.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/hunyuan_v1_moe/configuration_hunyuan_v1_moe.py b/phivenv/Lib/site-packages/transformers/models/hunyuan_v1_moe/configuration_hunyuan_v1_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..386ddac1d3ebb37de330a3940ad5ac556be5bcf6
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/hunyuan_v1_moe/configuration_hunyuan_v1_moe.py
@@ -0,0 +1,204 @@
+# coding=utf-8
+# Copyright (C) 2025 THL A29 Limited, a Tencent company and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""HunYuanMoEV1 model configuration"""
+
+from typing import Union
+
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class HunYuanMoEV1Config(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`HunYuanMoEV1Model`]. It is used to instantiate an
+    HunYuan model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the HunYuan-7B. See, e.g.,
+    Hunyuan-A13B-Instruct [tencent/Hunyuan-A13B-Instruct](https://huggingface.co/tencent/Hunyuan-A13B-Instruct).
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 290943):
+            Vocabulary size of the HunYuan model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`HunYuanMoEV1Model`]
+        hidden_size (`int`, *optional*, defaults to 4096):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 11008):
+            Dimension of the MLP representations or shared MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 32):
+            Number of hidden layers in the Transformer decoder.
+        num_attention_heads (`int`, *optional*, defaults to 32):
+            Number of attention heads for each attention layer in the Transformer decoder.
+        num_key_value_heads (`int`, *optional*):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details checkout [this
+            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
+            `num_attention_heads`.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the decoder.
+        max_position_embeddings (`int`, *optional*, defaults to 2048):
+            The maximum sequence length that this model might ever be used with.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        pad_token_id (`int`, *optional*, defaults to 0):
+            Padding token id.
+        bos_token_id (`int`, *optional*, defaults to 1):
+            Beginning of stream token id.
+        eos_token_id (`int`, *optional*, defaults to 2):
+            End of stream token id.
+        eod_token_id (`int`, *optional*, defaults to 3):
+            Token ID representing the end-of-document marker. Used to indicate the termination of a text sequence.
+            Example: In multi-document processing, this token helps the model distinguish between separate documents.
+        sep_token_id (`int`, *optional*, defaults to 4):
+            Token ID representing the separator token (`[SEP]`), used to demarcate boundaries between different text segments.
+        pretraining_tp (`int`, *optional*, defaults to 1):
+            Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this
+            document](https://huggingface.co/docs/transformers/parallelism) to understand more about it. This value is
+            necessary to ensure exact reproducibility of the pretraining results. Please refer to [this
+            issue](https://github.com/pytorch/pytorch/issues/76232).
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether to tie weight embeddings
+        rope_theta (`float`, *optional*, defaults to 10000.0):
+            The base period of the RoPE embeddings.
+        rope_scaling (`Dict`, *optional*):
+            Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
+            strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is
+            `{"type": strategy name, "factor": scaling factor}`; `"alpha"` may be given instead of `"factor"`. When
+            using this flag, don't update `max_position_embeddings` to the expected new maximum. See the following
+            thread for more information on how these scaling strategies behave:
+            https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an
+            experimental feature, subject to breaking API changes in future versions.
+        attention_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use a bias in the query, key, value and output projection layers during self-attention.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        num_experts (`int` or `List`, *optional*, defaults to 1):
+            The number of experts for moe. If it is a list, it will be used as the number of experts for each layer.
+        moe_topk (`int` or `List`, *optional*, defaults to 1):
+            Number of experts selected per token (Top-K routing). List form enables layer-wise customization.
+        head_dim (`int`, *optional*):
+            The attention head dimension. If not set, defaults to `hidden_size // num_attention_heads`.
+    """
+
+    model_type = "hunyuan_v1_moe"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        vocab_size=290943,
+        hidden_size=4096,
+        intermediate_size: int = 11008,
+        num_hidden_layers=32,
+        num_attention_heads=32,
+        num_key_value_heads=None,
+        hidden_act="silu",
+        max_position_embeddings=2048,
+        initializer_range=0.02,
+        rms_norm_eps=1e-5,
+        use_cache=True,
+        pad_token_id=0,
+        bos_token_id=1,
+        eos_token_id=2,
+        eod_token_id=3,
+        sep_token_id=4,
+        pretraining_tp=1,
+        tie_word_embeddings=False,
+        rope_theta=10000.0,
+        rope_scaling=None,
+        attention_bias=False,
+        attention_dropout=0.0,
+        num_experts: Union[int, list] = 1,
+        moe_topk: Union[int, list] = 1,
+        head_dim=None,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.num_experts = num_experts
+        self.moe_topk = moe_topk
+        # Fall back to the conventional per-head width so modules reading `config.head_dim` never see `None`.
+        self.head_dim = head_dim if head_dim is not None else hidden_size // num_attention_heads
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.pretraining_tp = pretraining_tp
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self.eod_token_id = eod_token_id  # named argument, so it never reaches **kwargs; store it here
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            sep_token_id=sep_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+    def _rope_scaling_validation(self):
+        """
+        Validate the `rope_scaling` configuration. Not called by `__init__`; invoke it explicitly when needed.
+        """
+        if self.rope_scaling is None:
+            return
+
+        if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
+            raise ValueError(
+                "`rope_scaling` must be a dictionary with two fields, `type` and `factor` or `type` and `alpha`, "
+                f"got {self.rope_scaling}"
+            )
+        rope_scaling_type = self.rope_scaling.get("type", None)
+        rope_scaling_factor = self.rope_scaling.get("factor", None)
+        rope_scaling_alpha = self.rope_scaling.get("alpha", None)
+        if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]:
+            raise ValueError(
+                f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
+            )
+        if rope_scaling_factor is None and rope_scaling_alpha is None:
+            raise ValueError("`rope_scaling` must have a `factor` or an `alpha` field, got neither")
+        if rope_scaling_factor is not None:
+            if not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0:
+                raise ValueError(f"`rope_scaling`'s factor field must be a float > 1.0, got {rope_scaling_factor}")
+        if rope_scaling_alpha is not None:
+            if not isinstance(rope_scaling_alpha, float) or rope_scaling_alpha <= 1.0:
+                raise ValueError(f"`rope_scaling`'s alpha field must be a float > 1.0, got {rope_scaling_alpha}")
+
+__all__ = ["HunYuanMoEV1Config"]
diff --git a/phivenv/Lib/site-packages/transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py b/phivenv/Lib/site-packages/transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..043d1f8243a35638747f3ff44946748dc472fcf1
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py
@@ -0,0 +1,584 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_hunyuan_v1_moe.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# coding=utf-8
+# Copyright (C) 2025 THL A29 Limited, a Tencent company and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Callable, Optional, Union
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from transformers.cache_utils import Cache
+
+from ...activations import ACT2FN
+from ...cache_utils import DynamicCache
+from ...generation import GenerationMixin
+from ...integrations import use_kernel_forward_from_hub
+from ...masking_utils import create_causal_mask
+from ...modeling_layers import GenericForSequenceClassification, GradientCheckpointingLayer
+from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
+from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from ...processing_utils import Unpack
+from ...utils import TransformersKwargs, auto_docstring, can_return_tuple
+from ...utils.deprecation import deprecate_kwarg
+from ...utils.generic import check_model_inputs
+from .configuration_hunyuan_v1_moe import HunYuanMoEV1Config
+
+
+@use_kernel_forward_from_hub("RMSNorm")
+class HunYuanMoEV1RMSNorm(nn.Module):
+    def __init__(self, hidden_size, eps=1e-6):
+        """Root-mean-square layer norm with a learned per-feature scale (equivalent to T5LayerNorm)."""
+        super().__init__()
+        self.variance_epsilon = eps
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+
+    def forward(self, hidden_states):
+        # Statistics are computed in float32; the result is cast back before the learned scale is applied.
+        original_dtype = hidden_states.dtype
+        upcast = hidden_states.to(torch.float32)
+        mean_square = upcast.pow(2).mean(-1, keepdim=True)
+        normalized = upcast * torch.rsqrt(mean_square + self.variance_epsilon)
+        return self.weight * normalized.to(original_dtype)
+
+    def extra_repr(self):
+        shape = tuple(self.weight.shape)
+        return f"{shape}, eps={self.variance_epsilon}"
+
+
+class HunYuanMoEV1MLP(nn.Module):
+    """Gated feed-forward block: `down_proj(act_fn(gate_proj(x)) * up_proj(x))`, all projections bias-free."""
+
+    def __init__(self, config: HunYuanMoEV1Config, layer_idx=None, is_shared_mlp=False):
+        super().__init__()
+        self.config, self.layer_idx = config, layer_idx
+        self.hidden_size, self.intermediate_size = config.hidden_size, config.intermediate_size
+        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
+        self.act_fn = ACT2FN[config.hidden_act]
+
+    def forward(self, x):
+        gated = self.act_fn(self.gate_proj(x)) * self.up_proj(x)
+        return self.down_proj(gated)
+
+
+def rotate_half(x):
+    """Split the last dimension of `x` in two halves and return `cat(-second_half, first_half)`."""
+    half = x.shape[-1] // 2
+    front, back = x[..., :half], x[..., half:]
+    return torch.cat((-back, front), dim=-1)
+
+
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
+    """Rotate the query and key tensors with precomputed rotary position embeddings.
+
+    Args:
+        q (`torch.Tensor`): Query tensor to rotate.
+        k (`torch.Tensor`): Key tensor to rotate.
+        cos (`torch.Tensor`): Cosine part of the rotary embedding.
+        sin (`torch.Tensor`): Sine part of the rotary embedding.
+        position_ids (`torch.Tensor`, *optional*):
+            Deprecated and unused.
+        unsqueeze_dim (`int`, *optional*, defaults to 1):
+            Axis inserted into `cos` and `sin` before they are multiplied with `q` and `k`, so that they broadcast
+            against the query/key layout. With `cos`/`sin` of shape [batch_size, seq_len, head_dim], use
+            `unsqueeze_dim=1` when `q` and `k` are laid out as [batch_size, heads, seq_len, head_dim] and
+            `unsqueeze_dim=2` when they are laid out as [batch_size, seq_len, heads, head_dim].
+
+    Returns:
+        `tuple(torch.Tensor)`: the rotated query and key tensors, in that order.
+    """
+    cos_b, sin_b = cos.unsqueeze(unsqueeze_dim), sin.unsqueeze(unsqueeze_dim)
+
+    def _rotate(t):
+        return (t * cos_b) + (rotate_half(t) * sin_b)
+
+    return _rotate(q), _rotate(k)
+
+
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """
+    Repeat every key/value head `n_rep` times, as torch.repeat_interleave(x, dim=1, repeats=n_rep) would. Maps
+    (batch, num_key_value_heads, seqlen, head_dim) to (batch, num_key_value_heads * n_rep, seqlen, head_dim).
+    """
+    bsz, kv_heads, seq_len, dim = hidden_states.shape
+    if n_rep == 1:
+        return hidden_states
+    tiled = hidden_states.unsqueeze(2).expand(bsz, kv_heads, n_rep, seq_len, dim)
+    return tiled.reshape(bsz, kv_heads * n_rep, seq_len, dim)
+
+
+def eager_attention_forward(
+    module: nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: Optional[torch.Tensor],
+    scaling: float,
+    dropout: float = 0.0,
+    **kwargs: Unpack[TransformersKwargs],
+):
+    """Eager attention: softmax(q @ k^T * scaling + mask) @ v, after expanding the grouped key/value heads."""
+    n_groups = module.num_key_value_groups
+    key_states, value_states = repeat_kv(key, n_groups), repeat_kv(value, n_groups)
+
+    scores = torch.matmul(query, key_states.transpose(2, 3)) * scaling
+    if attention_mask is not None:
+        # Trim the mask's last axis to the number of key positions before adding it to the scores.
+        scores = scores + attention_mask[:, :, :, : key_states.shape[-2]]
+
+    # Softmax runs in float32 and is cast back to the query dtype before dropout.
+    attn_weights = nn.functional.softmax(scores, dim=-1, dtype=torch.float32).to(query.dtype)
+    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
+    attn_output = torch.matmul(attn_weights, value_states).transpose(1, 2).contiguous()
+    return attn_output, attn_weights
+
+
+class HunYuanMoEV1Attention(nn.Module):
+    """Multi-headed attention with rotary position embeddings and RMS-normalized queries and keys."""
+
+    def __init__(self, config: HunYuanMoEV1Config, layer_idx: int):
+        super().__init__()
+        self.config, self.layer_idx = config, layer_idx
+        # `head_dim` may exist on the config but be None, in which case a plain getattr default never applies.
+        self.head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads
+        self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
+        self.scaling = self.head_dim**-0.5
+        self.attention_dropout = config.attention_dropout
+        self.is_causal = True
+
+        self.q_proj = nn.Linear(
+            config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.k_proj = nn.Linear(
+            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.v_proj = nn.Linear(
+            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.o_proj = nn.Linear(
+            config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
+        )
+        self.query_layernorm = HunYuanMoEV1RMSNorm(self.head_dim, eps=config.rms_norm_eps)
+        self.key_layernorm = HunYuanMoEV1RMSNorm(self.head_dim, eps=config.rms_norm_eps)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
+        attention_mask: Optional[torch.Tensor],
+        past_key_values: Optional[Cache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        input_shape = hidden_states.shape[:-1]
+        hidden_shape = (*input_shape, -1, self.head_dim)
+
+        query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+        key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+        value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+        # Rotary embedding is applied first; queries and keys are RMS-normalized per head afterwards.
+        cos, sin = position_embeddings
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+        query_states = self.query_layernorm(query_states)
+        key_states = self.key_layernorm(key_states)
+
+        if past_key_values is not None:
+            # sin and cos are specific to RoPE models; cache_position needed for the static cache
+            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        attention_interface: Callable = eager_attention_forward
+        if self.config._attn_implementation != "eager":
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            dropout=0.0 if not self.training else self.attention_dropout,
+            scaling=self.scaling,
+            **kwargs,
+        )
+
+        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
+        attn_output = self.o_proj(attn_output)
+        return attn_output, attn_weights
+
+
+class HunYuanMoEV1Gate(nn.Module):
+    """Router projection producing one logit per expert for every token."""
+
+    def __init__(self, config: HunYuanMoEV1Config, layer_idx: Optional[int] = None):
+        super().__init__()
+        self.config, self.layer_idx = config, layer_idx
+        per_layer = config.num_experts
+        expert_count = per_layer if isinstance(per_layer, int) else per_layer[layer_idx]
+        self.wg = nn.Linear(config.hidden_size, expert_count, bias=False, dtype=torch.float32)
+
+    def forward(self, hidden_states):
+        _, _, width = hidden_states.shape  # the input must be 3-D
+        tokens = hidden_states.reshape(-1, width)
+        # Match the router weight's precision when it is kept in float32.
+        return self.wg(tokens.float() if self.wg.weight.dtype == torch.float32 else tokens)
+
+
+class HunYuanMoEV1Moe(nn.Module):
+    def __init__(self, config: HunYuanMoEV1Config, layer_idx: Optional[int] = None):
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+        self.num_experts = config.num_experts if isinstance(config.num_experts, int) else config.num_experts[layer_idx]
+        self.top_k = config.moe_topk if isinstance(config.moe_topk, int) else config.moe_topk[layer_idx]
+        self.gate = HunYuanMoEV1Gate(config, layer_idx=layer_idx)
+        # The router projection lives in `self.gate`; every expert and the shared MLP are built from the same config.
+        self.experts = nn.ModuleList(
+            [HunYuanMoEV1MLP(config, layer_idx=layer_idx, is_shared_mlp=False) for _ in range(self.num_experts)]
+        )
+
+        self.shared_mlp = HunYuanMoEV1MLP(config, layer_idx=layer_idx, is_shared_mlp=True)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        batch_size, sequence_length, hidden_dim = hidden_states.shape
+        hidden_states_mlp = self.shared_mlp(hidden_states)
+        router_logits = self.gate(hidden_states)
+        hidden_states = hidden_states.view(-1, hidden_dim)
+        # router_logits: (batch * sequence_length, n_experts)
+        # Softmax over experts in float32, keep the `top_k` largest, then rescale the kept weights to sum to 1.
+        routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)
+        routing_weights, selected_experts = torch.topk(routing_weights, self.top_k, dim=-1)
+        routing_weights /= routing_weights.sum(dim=-1, keepdim=True)
+        # we cast back to the input dtype
+        routing_weights = routing_weights.to(hidden_states.dtype)
+
+        final_hidden_states = torch.zeros(
+            (batch_size * sequence_length, hidden_dim), dtype=hidden_states.dtype, device=hidden_states.device
+        )
+
+        # One hot encode the selected experts to create an expert mask
+        # this will be used to easily index which expert is going to be solicited
+        expert_mask = torch.nn.functional.one_hot(selected_experts, num_classes=self.num_experts).permute(2, 1, 0)
+
+        # Loop only over the experts that were selected for at least one token and run each on its tokens
+        expert_hit = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero()
+        for expert_idx in expert_hit:
+            expert_layer = self.experts[expert_idx]
+            idx, top_x = torch.where(expert_mask[expert_idx].squeeze(0))
+
+            # Index the correct hidden states and compute the expert hidden state for
+            # the current expert. We need to make sure to multiply the output hidden
+            # states by `routing_weights` on the corresponding tokens (`idx` is the top-k slot, `top_x` the token)
+            current_state = hidden_states[None, top_x].reshape(-1, hidden_dim)
+            current_hidden_states = expert_layer(current_state) * routing_weights[top_x, idx, None]
+
+            # `index_add_` accumulates each weighted expert output into the row of its token (`top_x`),
+            # so a token routed to several experts ends up with the sum of their contributions.
+            final_hidden_states.index_add_(0, top_x, current_hidden_states.to(hidden_states.dtype))
+        final_hidden_states = final_hidden_states.reshape(batch_size, sequence_length, hidden_dim)
+        return final_hidden_states + hidden_states_mlp
+
+
+class HunYuanMoEV1DecoderLayer(GradientCheckpointingLayer):
+    def __init__(self, config: HunYuanMoEV1Config, layer_idx: int):
+        super().__init__()
+        self.layer_idx, self.hidden_size = layer_idx, config.hidden_size
+        # Submodules are created in a fixed order: attention, MoE MLP, then the two RMS norms.
+        self.self_attn = HunYuanMoEV1Attention(config=config, layer_idx=layer_idx)
+        self.mlp = HunYuanMoEV1Moe(config, layer_idx=layer_idx)
+        self.input_layernorm = HunYuanMoEV1RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = HunYuanMoEV1RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        use_cache: Optional[bool] = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> torch.Tensor:
+        """
+        Run one pre-norm decoder layer: self-attention followed by the mixture-of-experts MLP, each wrapped in a
+        residual connection. `position_ids` and `use_cache` are forwarded to the attention module as given.
+        Returns the updated hidden states; the attention weights are discarded.
+        """
+        # Attention sub-block: normalize, attend, then add the residual.
+        attn_out, _ = self.self_attn(
+            hidden_states=self.input_layernorm(hidden_states),
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            position_embeddings=position_embeddings,
+            **kwargs,
+        )
+        hidden_states = hidden_states + attn_out
+
+        # Feed-forward sub-block: normalize, run the MoE MLP, then add the residual.
+        moe_out = self.mlp(self.post_attention_layernorm(hidden_states))
+        return hidden_states + moe_out
+
+
+@auto_docstring
+class HunYuanMoEV1PreTrainedModel(PreTrainedModel):
+    config: HunYuanMoEV1Config
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["HunYuanMoEV1DecoderLayer"]
+    _skip_keys_device_placement = ["past_key_values"]
+    _supports_flash_attn = True
+    _supports_sdpa = True
+    _supports_flex_attn = True
+    _can_compile_fullgraph = False
+    _supports_attention_backend = True
+    _can_record_outputs = {
+        "hidden_states": HunYuanMoEV1DecoderLayer,
+        "attentions": HunYuanMoEV1Attention,
+    }
+
+    def _init_weights(self, module):
+        """Normal-initialize Linear/Embedding weights; zero Linear biases and the Embedding padding row."""
+        std = self.config.initializer_range
+        if not isinstance(module, (nn.Linear, nn.Embedding)):
+            return
+        module.weight.data.normal_(mean=0.0, std=std)
+        if isinstance(module, nn.Linear) and module.bias is not None:
+            module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding) and module.padding_idx is not None:
+            module.weight.data[module.padding_idx].zero_()
+
+
+class HunYuanMoEV1RotaryEmbedding(nn.Module):
+    inv_freq: torch.Tensor  # fix linting for `register_buffer`
+
+    def __init__(self, config: HunYuanMoEV1Config, device=None):
+        super().__init__()
+        # BC: "rope_type" was originally "type"
+        if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict):
+            self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
+        else:
+            self.rope_type = "default"
+        self.max_seq_len_cached = config.max_position_embeddings
+        self.original_max_seq_len = config.max_position_embeddings
+
+        self.config = config
+        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
+        if self.rope_type == "dynamic" and config.rope_scaling.get("alpha"):
+            # DynamicNTKAlphaRotary: scale the RoPE base by alpha; "alpha" is optional, a missing key falls through to rope_init_fn
+            self.dim = config.head_dim
+            base = config.rope_theta * config.rope_scaling.get("alpha") ** (self.dim / (self.dim - 2))
+            inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
+            self.attention_scaling = 1.0
+        else:
+            inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
+
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+        self.original_inv_freq = self.inv_freq
+
+    @torch.no_grad()
+    @dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
+    def forward(self, x, position_ids):
+        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
+        position_ids_expanded = position_ids[:, None, :].float()
+
+        device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
+        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
+            emb = torch.cat((freqs, freqs), dim=-1)
+            cos = emb.cos() * self.attention_scaling
+            sin = emb.sin() * self.attention_scaling
+
+        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)  # cast cos/sin back to the dtype of `x`
+
+
+@auto_docstring
+class HunYuanMoEV1Model(HunYuanMoEV1PreTrainedModel):
+    def __init__(self, config: HunYuanMoEV1Config):
+        super().__init__(config)
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+        self.layers = nn.ModuleList(
+            [HunYuanMoEV1DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+        )
+        self.norm = HunYuanMoEV1RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.rotary_emb = HunYuanMoEV1RotaryEmbedding(config=config)
+        self.gradient_checkpointing = False
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @check_model_inputs
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> BaseModelOutputWithPast:
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+        if inputs_embeds is None:
+            inputs_embeds: torch.Tensor = self.embed_tokens(input_ids)
+
+        if use_cache and past_key_values is None:
+            past_key_values = DynamicCache(config=self.config)
+
+        if cache_position is None:
+            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+            cache_position: torch.Tensor = torch.arange(
+                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
+            )
+
+        if position_ids is None:
+            position_ids = cache_position.unsqueeze(0)
+
+        causal_mask = create_causal_mask(
+            config=self.config,
+            input_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            cache_position=cache_position,
+            past_key_values=past_key_values,
+            position_ids=position_ids,
+        )
+
+        hidden_states = inputs_embeds
+        position_embeddings = self.rotary_emb(hidden_states, position_ids)
+
+        for decoder_layer in self.layers[: self.config.num_hidden_layers]:
+            hidden_states = decoder_layer(
+                hidden_states,
+                attention_mask=causal_mask,
+                position_ids=position_ids,
+                past_key_values=past_key_values,
+                cache_position=cache_position,
+                position_embeddings=position_embeddings,
+                **kwargs,
+            )
+
+        hidden_states = self.norm(hidden_states)
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values,
+        )
+
+
+@auto_docstring
+class HunYuanMoEV1ForCausalLM(HunYuanMoEV1PreTrainedModel, GenerationMixin):
+    _tied_weights_keys = ["lm_head.weight"]
+    _tp_plan = {"lm_head": "colwise_rep"}
+    _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.model = HunYuanMoEV1Model(config)
+        self.vocab_size = config.vocab_size
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> CausalLMOutputWithPast:
+        r"""
+        Example:
+
+        ```python
+        >>> from transformers import AutoTokenizer, HunYuanMoEV1ForCausalLM
+
+        >>> model = HunYuanMoEV1ForCausalLM.from_pretrained("meta-hunyuan_v1_moe/HunYuanMoEV1-2-7b-hf")
+        >>> tokenizer = AutoTokenizer.from_pretrained("meta-hunyuan_v1_moe/HunYuanMoEV1-2-7b-hf")
+
+        >>> prompt = "Hey, are you conscious? Can you talk to me?"
+        >>> inputs = tokenizer(prompt, return_tensors="pt")
+
+        >>> # Generate
+        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
+        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
+        ```"""
+        outputs: BaseModelOutputWithPast = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            **kwargs,
+        )
+
+        hidden_states = outputs.last_hidden_state
+        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
+        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+        logits = self.lm_head(hidden_states[:, slice_indices, :])
+
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs)
+
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+class HunYuanMoEV1ForSequenceClassification(GenericForSequenceClassification, HunYuanMoEV1PreTrainedModel):
+    pass
+
+
+__all__ = [
+    "HunYuanMoEV1ForCausalLM",
+    "HunYuanMoEV1Model",
+    "HunYuanMoEV1PreTrainedModel",
+    "HunYuanMoEV1ForSequenceClassification",
+]
diff --git a/phivenv/Lib/site-packages/transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py b/phivenv/Lib/site-packages/transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..645c54ae73afd1b43db807535c0ac36957eac7ff
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py
@@ -0,0 +1,274 @@
+# coding=utf-8
+# Copyright (C) 2025 THL A29 Limited, a Tencent company and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch HunYuanMoEV1 model."""
+
+from typing import Callable, Optional
+
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint
+from torch import nn
+
+from transformers.cache_utils import Cache
+from transformers.utils import (
+    logging,
+)
+
+from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
+from ...processing_utils import Unpack
+from ...utils import TransformersKwargs
+from ..llama.modeling_llama import (
+    LlamaAttention,
+    LlamaDecoderLayer,
+    LlamaForCausalLM,
+    LlamaForSequenceClassification,
+    LlamaMLP,
+    LlamaModel,
+    LlamaPreTrainedModel,
+    LlamaRMSNorm,
+    apply_rotary_pos_emb,
+    eager_attention_forward,
+)
+from .configuration_hunyuan_v1_moe import HunYuanMoEV1Config
+
+
+logger = logging.get_logger(__name__)
+
+
+class HunYuanMoEV1RMSNorm(LlamaRMSNorm):
+    pass
+
+
+class HunYuanMoEV1MLP(LlamaMLP):
+    def __init__(self, config: HunYuanMoEV1Config, layer_idx=None, is_shared_mlp=False):
+        super().__init__(config)
+        self.layer_idx = layer_idx
+        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
+
+
+class HunYuanMoEV1Attention(LlamaAttention):
+    def __init__(self, config: HunYuanMoEV1Config, layer_idx: int):
+        super().__init__(config, layer_idx)
+        self.query_layernorm = HunYuanMoEV1RMSNorm(self.head_dim, eps=config.rms_norm_eps)
+        self.key_layernorm = HunYuanMoEV1RMSNorm(self.head_dim, eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
+        attention_mask: Optional[torch.Tensor],
+        past_key_values: Optional[Cache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        input_shape = hidden_states.shape[:-1]
+        hidden_shape = (*input_shape, -1, self.head_dim)
+
+        query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+        key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+        value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+
+        cos, sin = position_embeddings
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+        query_states = self.query_layernorm(query_states)
+        key_states = self.key_layernorm(key_states)
+
+        if past_key_values is not None:
+            # sin and cos are specific to RoPE models; cache_position needed for the static cache
+            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        attention_interface: Callable = eager_attention_forward
+        if self.config._attn_implementation != "eager":
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            dropout=0.0 if not self.training else self.attention_dropout,
+            scaling=self.scaling,
+            **kwargs,
+        )
+
+        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
+        attn_output = self.o_proj(attn_output)
+        return attn_output, attn_weights
+
+
+class HunYuanMoEV1Gate(nn.Module):
+    def __init__(self, config: HunYuanMoEV1Config, layer_idx: Optional[int] = None):
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+        num_experts = config.num_experts if isinstance(config.num_experts, int) else config.num_experts[layer_idx]
+        self.wg = nn.Linear(config.hidden_size, num_experts, bias=False, dtype=torch.float32)
+
+    def forward(self, hidden_states):
+        bsz, seq_len, hidden_size = hidden_states.shape
+        hidden_states = hidden_states.reshape(-1, hidden_size)
+        if self.wg.weight.dtype == torch.float32:
+            hidden_states = hidden_states.float()
+        logits = self.wg(hidden_states)
+        return logits
+
+
+class HunYuanMoEV1Moe(nn.Module):
+    def __init__(self, config: HunYuanMoEV1Config, layer_idx: Optional[int] = None):
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+        self.num_experts = config.num_experts if isinstance(config.num_experts, int) else config.num_experts[layer_idx]
+        self.top_k = config.moe_topk if isinstance(config.moe_topk, int) else config.moe_topk[layer_idx]
+        self.gate = HunYuanMoEV1Gate(config, layer_idx=layer_idx)
+        # The float32 router projection lives in `self.gate.wg` (HunYuanMoEV1Gate); no separate `wg` layer is created here
+        self.experts = nn.ModuleList(
+            [HunYuanMoEV1MLP(config, layer_idx=layer_idx, is_shared_mlp=False) for _ in range(self.num_experts)]
+        )
+
+        self.shared_mlp = HunYuanMoEV1MLP(config, layer_idx=layer_idx, is_shared_mlp=True)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        batch_size, sequence_length, hidden_dim = hidden_states.shape
+        hidden_states_mlp = self.shared_mlp(hidden_states)
+        router_logits = self.gate(hidden_states)
+        hidden_states = hidden_states.view(-1, hidden_dim)
+        # router_logits: (batch * sequence_length, n_experts)
+
+        routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)
+        routing_weights, selected_experts = torch.topk(routing_weights, self.top_k, dim=-1)
+        routing_weights /= routing_weights.sum(dim=-1, keepdim=True)
+        # we cast back to the input dtype
+        routing_weights = routing_weights.to(hidden_states.dtype)
+
+        final_hidden_states = torch.zeros(
+            (batch_size * sequence_length, hidden_dim), dtype=hidden_states.dtype, device=hidden_states.device
+        )
+
+        # One-hot encode the selected experts to build an expert mask, permuted to (num_experts, top_k, tokens);
+        # it is used to look up which tokens were routed to each expert
+        expert_mask = torch.nn.functional.one_hot(selected_experts, num_classes=self.num_experts).permute(2, 1, 0)
+
+        # Loop only over the experts that were selected for at least one token and run each of them on its tokens
+        expert_hit = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero()
+        for expert_idx in expert_hit:
+            expert_layer = self.experts[expert_idx]
+            idx, top_x = torch.where(expert_mask[expert_idx].squeeze(0))
+
+            # Index the correct hidden states and compute the expert hidden state for
+            # the current expert. We need to make sure to multiply the output hidden
+            # states by `routing_weights` on the corresponding tokens (`idx` is the top-k slot, `top_x` the token)
+            current_state = hidden_states[None, top_x].reshape(-1, hidden_dim)
+            current_hidden_states = expert_layer(current_state) * routing_weights[top_x, idx, None]
+
+            # `index_add_` only supports torch tensors for indexing, so the `top_x` tensor is used here
+            # to accumulate each token's weighted expert output into its row of `final_hidden_states`.
+            final_hidden_states.index_add_(0, top_x, current_hidden_states.to(hidden_states.dtype))
+        final_hidden_states = final_hidden_states.reshape(batch_size, sequence_length, hidden_dim)
+        return final_hidden_states + hidden_states_mlp  # routed expert output plus the shared MLP output
+
+
+class HunYuanMoEV1DecoderLayer(LlamaDecoderLayer):
+    def __init__(self, config: HunYuanMoEV1Config, layer_idx: int):
+        super().__init__(config, layer_idx)
+        self.hidden_size = config.hidden_size
+        self.self_attn = HunYuanMoEV1Attention(config=config, layer_idx=layer_idx)
+        self.mlp = HunYuanMoEV1Moe(config, layer_idx=layer_idx)
+        self.input_layernorm = HunYuanMoEV1RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = HunYuanMoEV1RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.layer_idx = layer_idx
+
+
+class HunYuanMoEV1PreTrainedModel(LlamaPreTrainedModel):
+    _can_compile_fullgraph = False
+
+    def _init_weights(self, module):
+        std = self.config.initializer_range
+        if isinstance(module, nn.Linear):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+
+
+class HunYuanMoEV1RotaryEmbedding(nn.Module):
+    inv_freq: torch.Tensor  # fix linting for `register_buffer`
+
+    def __init__(self, config: HunYuanMoEV1Config, device=None):
+        super().__init__()
+        # BC: "rope_type" was originally "type"
+        if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict):
+            self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
+        else:
+            self.rope_type = "default"
+        self.max_seq_len_cached = config.max_position_embeddings
+        self.original_max_seq_len = config.max_position_embeddings
+
+        self.config = config
+        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
+        if self.rope_type == "dynamic" and config.rope_scaling.get("alpha"):
+            # DynamicNTKAlphaRotary: scale the RoPE base by alpha; "alpha" is optional, a missing key falls through to rope_init_fn
+            self.dim = config.head_dim
+            base = config.rope_theta * config.rope_scaling.get("alpha") ** (self.dim / (self.dim - 2))
+            inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
+            self.attention_scaling = 1.0
+        else:
+            inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
+
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+        self.original_inv_freq = self.inv_freq
+
+    @torch.no_grad()
+    @dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
+    def forward(self, x, position_ids):
+        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
+        position_ids_expanded = position_ids[:, None, :].float()
+
+        device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
+        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
+            emb = torch.cat((freqs, freqs), dim=-1)
+            cos = emb.cos() * self.attention_scaling
+            sin = emb.sin() * self.attention_scaling
+
+        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)  # cast cos/sin back to the dtype of `x`
+
+
+class HunYuanMoEV1Model(LlamaModel):
+    pass
+
+
+class HunYuanMoEV1ForCausalLM(LlamaForCausalLM):
+    pass
+
+
+class HunYuanMoEV1ForSequenceClassification(LlamaForSequenceClassification):
+    pass
+
+
+__all__ = [
+    "HunYuanMoEV1ForCausalLM",
+    "HunYuanMoEV1Model",
+    "HunYuanMoEV1PreTrainedModel",
+    "HunYuanMoEV1ForSequenceClassification",
+]
diff --git a/phivenv/Lib/site-packages/transformers/models/ibert/__init__.py b/phivenv/Lib/site-packages/transformers/models/ibert/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..cf34ec43ac1014d8c153b3aa259e394fc7b73570
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/ibert/__init__.py
@@ -0,0 +1,27 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_ibert import *
+    from .modeling_ibert import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/phivenv/Lib/site-packages/transformers/models/ibert/__pycache__/__init__.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/ibert/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cf37303d970e639407c7a18bc556af8c30280493
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/ibert/__pycache__/__init__.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/ibert/__pycache__/configuration_ibert.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/ibert/__pycache__/configuration_ibert.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..56d52311e6496db5ede349e6b899ffb1cfa56d8b
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/ibert/__pycache__/configuration_ibert.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/ibert/__pycache__/modeling_ibert.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/ibert/__pycache__/modeling_ibert.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..93359b9e438ae2ee46c6f68a65cd94f476505b81
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/ibert/__pycache__/modeling_ibert.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/ibert/__pycache__/quant_modules.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/ibert/__pycache__/quant_modules.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..40feaafe91459ee887b99ba36092f9ad6df41d50
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/ibert/__pycache__/quant_modules.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/ibert/configuration_ibert.py b/phivenv/Lib/site-packages/transformers/models/ibert/configuration_ibert.py
new file mode 100644
index 0000000000000000000000000000000000000000..963e6e6c9ed00bcb40dce7c4354110ee9b487187
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/ibert/configuration_ibert.py
@@ -0,0 +1,142 @@
+# coding=utf-8
+# Copyright 2021 The I-BERT Authors (Sehoon Kim, Amir Gholami, Zhewei Yao,
+# Michael Mahoney, Kurt Keutzer - UC Berkeley) and The HuggingFace Inc. team.
+# Copyright (c) 20121, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""I-BERT configuration"""
+
+from collections import OrderedDict
+from collections.abc import Mapping
+
+from ...configuration_utils import PretrainedConfig
+from ...onnx import OnnxConfig
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class IBertConfig(PretrainedConfig):
+    """
+    This is the configuration class to store the configuration of a [`IBertModel`]. It is used to instantiate a I-BERT
+    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+    defaults will yield a similar configuration to that of the IBERT
+    [kssteven/ibert-roberta-base](https://huggingface.co/kssteven/ibert-roberta-base) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 30522):
+            Vocabulary size of the I-BERT model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`IBertModel`]
+        hidden_size (`int`, *optional*, defaults to 768):
+            Dimensionality of the encoder layers and the pooler layer.
+        num_hidden_layers (`int`, *optional*, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 12):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        intermediate_size (`int`, *optional*, defaults to 3072):
+            Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
+        hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"silu"` and `"gelu_new"` are supported.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout ratio for the attention probabilities.
+        max_position_embeddings (`int`, *optional*, defaults to 512):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            just in case (e.g., 512 or 1024 or 2048).
+        type_vocab_size (`int`, *optional*, defaults to 2):
+            The vocabulary size of the `token_type_ids` passed when calling [`IBertModel`]
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
+            The epsilon used by the layer normalization layers.
+        position_embedding_type (`str`, *optional*, defaults to `"absolute"`):
+            Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For
+            positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to
+            [Self-Attention with Relative Position Representations (Shaw et al.)](https://huggingface.co/papers/1803.02155).
+            For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models
+            with Better Relative Position Embeddings (Huang et al.)](https://huggingface.co/papers/2009.13658).
+        quant_mode (`bool`, *optional*, defaults to `False`):
+            Whether to quantize the model or not.
+        force_dequant (`str`, *optional*, defaults to `"none"`):
+            Force dequantize specific nonlinear layer. Dequantized layers are then executed with full precision.
+            `"none"`, `"gelu"`, `"softmax"`, `"layernorm"` and `"nonlinear"` are supported. As default, it is set as
+            `"none"`, which does not dequantize any layers. Please specify `"gelu"`, `"softmax"`, or `"layernorm"` to
+            dequantize GELU, Softmax, or LayerNorm, respectively. `"nonlinear"` will dequantize all nonlinear layers,
+            i.e., GELU, Softmax, and LayerNorm.
+    """
+
+    model_type = "ibert"
+
+    def __init__(
+        self,
+        vocab_size=30522,
+        hidden_size=768,
+        num_hidden_layers=12,
+        num_attention_heads=12,
+        intermediate_size=3072,
+        hidden_act="gelu",
+        hidden_dropout_prob=0.1,
+        attention_probs_dropout_prob=0.1,
+        max_position_embeddings=512,
+        type_vocab_size=2,
+        initializer_range=0.02,
+        layer_norm_eps=1e-12,
+        pad_token_id=1,
+        bos_token_id=0,
+        eos_token_id=2,
+        position_embedding_type="absolute",
+        quant_mode=False,
+        force_dequant="none",
+        **kwargs,
+    ):
+        super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
+
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.hidden_act = hidden_act
+        self.intermediate_size = intermediate_size
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.max_position_embeddings = max_position_embeddings
+        self.type_vocab_size = type_vocab_size
+        self.initializer_range = initializer_range
+        self.layer_norm_eps = layer_norm_eps
+        self.position_embedding_type = position_embedding_type
+        self.quant_mode = quant_mode
+        self.force_dequant = force_dequant
+
+
+class IBertOnnxConfig(OnnxConfig):
+    @property
+    def inputs(self) -> Mapping[str, Mapping[int, str]]:
+        if self.task == "multiple-choice":
+            dynamic_axis = {0: "batch", 1: "choice", 2: "sequence"}
+        else:
+            dynamic_axis = {0: "batch", 1: "sequence"}
+        return OrderedDict(
+            [
+                ("input_ids", dynamic_axis),
+                ("attention_mask", dynamic_axis),
+            ]
+        )
+
+
+__all__ = ["IBertConfig", "IBertOnnxConfig"]
diff --git a/phivenv/Lib/site-packages/transformers/models/ibert/modeling_ibert.py b/phivenv/Lib/site-packages/transformers/models/ibert/modeling_ibert.py
new file mode 100644
index 0000000000000000000000000000000000000000..6b960148ca9b8cfb416eb6bb82d12c1cb424f695
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/ibert/modeling_ibert.py
@@ -0,0 +1,1254 @@
+# coding=utf-8
+# Copyright 2021 The I-BERT Authors (Sehoon Kim, Amir Gholami, Zhewei Yao,
+# Michael Mahoney, Kurt Keutzer - UC Berkeley) and The HuggingFace Inc. team.
+# Copyright (c) 2021, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""PyTorch I-BERT model."""
+
+import math
+from typing import Optional, Union
+
+import torch
+import torch.utils.checkpoint
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+
+from ...activations import gelu
+from ...modeling_outputs import (
+    BaseModelOutputWithPastAndCrossAttentions,
+    BaseModelOutputWithPoolingAndCrossAttentions,
+    MaskedLMOutput,
+    MultipleChoiceModelOutput,
+    QuestionAnsweringModelOutput,
+    SequenceClassifierOutput,
+    TokenClassifierOutput,
+)
+from ...modeling_utils import PreTrainedModel
+from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
+from ...utils import auto_docstring, logging
+from .configuration_ibert import IBertConfig
+from .quant_modules import IntGELU, IntLayerNorm, IntSoftmax, QuantAct, QuantEmbedding, QuantLinear
+
+
+logger = logging.get_logger(__name__)
+
+
+class IBertEmbeddings(nn.Module):
+    """
+    Same as BertEmbeddings with a tiny tweak for positional embeddings indexing, using quantization-aware sub-modules.
+    """
+
+    def __init__(self, config):
+        super().__init__()
+        self.quant_mode = config.quant_mode
+        self.embedding_bit = 8  # weight_bit of every QuantEmbedding table below
+        self.embedding_act_bit = 16  # bit width of the QuantAct modules that combine the embeddings
+        self.act_bit = 8  # bit width of output_activation
+        self.ln_input_bit = 22  # not referenced anywhere else in this class
+        self.ln_output_bit = 32  # output_bit of the IntLayerNorm below
+
+        self.word_embeddings = QuantEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+            padding_idx=config.pad_token_id,
+            weight_bit=self.embedding_bit,
+            quant_mode=self.quant_mode,
+        )
+        self.token_type_embeddings = QuantEmbedding(
+            config.type_vocab_size, config.hidden_size, weight_bit=self.embedding_bit, quant_mode=self.quant_mode
+        )
+
+        # position_ids has shape (1, max_position_embeddings); persistent=False keeps it out of the state dict
+        self.register_buffer(
+            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
+        )
+        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
+
+        # End copy
+        self.padding_idx = config.pad_token_id
+        self.position_embeddings = QuantEmbedding(
+            config.max_position_embeddings,
+            config.hidden_size,
+            padding_idx=self.padding_idx,
+            weight_bit=self.embedding_bit,
+            quant_mode=self.quant_mode,
+        )
+
+        # Integer-only addition between embeddings. NOTE(review): forward() only ever calls embeddings_act1.
+        self.embeddings_act1 = QuantAct(self.embedding_act_bit, quant_mode=self.quant_mode)
+        self.embeddings_act2 = QuantAct(self.embedding_act_bit, quant_mode=self.quant_mode)
+
+        # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
+        # any TensorFlow checkpoint file
+        self.LayerNorm = IntLayerNorm(
+            config.hidden_size,
+            eps=config.layer_norm_eps,
+            output_bit=self.ln_output_bit,
+            quant_mode=self.quant_mode,
+            force_dequant=config.force_dequant,
+        )
+        self.output_activation = QuantAct(self.act_bit, quant_mode=self.quant_mode)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(
+        self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0
+    ):  # returns (embeddings, embeddings_scaling_factor)
+        if position_ids is None:
+            if input_ids is not None:
+                # Create the position ids from the input token ids. Any padded tokens remain padded.
+                position_ids = create_position_ids_from_input_ids(
+                    input_ids, self.padding_idx, past_key_values_length
+                ).to(input_ids.device)
+            else:
+                position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds)
+
+        if input_ids is not None:
+            input_shape = input_ids.size()
+        else:
+            input_shape = inputs_embeds.size()[:-1]  # drop the trailing embedding dimension
+
+        if token_type_ids is None:
+            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)
+
+        if inputs_embeds is None:
+            inputs_embeds, inputs_embeds_scaling_factor = self.word_embeddings(input_ids)
+        else:
+            inputs_embeds_scaling_factor = None  # embeddings supplied by the caller carry no scaling factor
+        token_type_embeddings, token_type_embeddings_scaling_factor = self.token_type_embeddings(token_type_ids)
+
+        embeddings, embeddings_scaling_factor = self.embeddings_act1(  # combines word and token-type embeddings
+            inputs_embeds,
+            inputs_embeds_scaling_factor,
+            identity=token_type_embeddings,
+            identity_scaling_factor=token_type_embeddings_scaling_factor,
+        )
+
+        if self.position_embedding_type == "absolute":
+            position_embeddings, position_embeddings_scaling_factor = self.position_embeddings(position_ids)
+            embeddings, embeddings_scaling_factor = self.embeddings_act1(  # same module as above, not embeddings_act2
+                embeddings,
+                embeddings_scaling_factor,
+                identity=position_embeddings,
+                identity_scaling_factor=position_embeddings_scaling_factor,
+            )
+
+        embeddings, embeddings_scaling_factor = self.LayerNorm(embeddings, embeddings_scaling_factor)
+        embeddings = self.dropout(embeddings)  # dropout runs before the final requantization
+        embeddings, embeddings_scaling_factor = self.output_activation(embeddings, embeddings_scaling_factor)
+        return embeddings, embeddings_scaling_factor
+
+    def create_position_ids_from_inputs_embeds(self, inputs_embeds):
+        """
+        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.
+
+        Args:
+            inputs_embeds: torch.Tensor
+
+        Returns: torch.Tensor
+        """
+        input_shape = inputs_embeds.size()[:-1]
+        sequence_length = input_shape[1]
+
+        position_ids = torch.arange(  # ids run from padding_idx + 1 to padding_idx + sequence_length
+            self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
+        )
+        return position_ids.unsqueeze(0).expand(input_shape)  # the same id row for every batch entry
+
+
+class IBertSelfAttention(nn.Module):  # multi-head self-attention built on QuantLinear Q/K/V projections
+    def __init__(self, config):
+        super().__init__()
+        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
+            raise ValueError(
+                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
+                f"heads ({config.num_attention_heads})"
+            )
+        self.quant_mode = config.quant_mode
+        self.weight_bit = 8  # weight_bit of the Q/K/V QuantLinear layers
+        self.bias_bit = 32  # bias_bit of the Q/K/V QuantLinear layers
+        self.act_bit = 8  # bit width of the QuantAct requantizers and of IntSoftmax
+
+        self.num_attention_heads = config.num_attention_heads
+        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
+        self.all_head_size = self.num_attention_heads * self.attention_head_size
+
+        # Q, K, V Linear layers
+        self.query = QuantLinear(
+            config.hidden_size,
+            self.all_head_size,
+            bias=True,
+            weight_bit=self.weight_bit,
+            bias_bit=self.bias_bit,
+            quant_mode=self.quant_mode,
+            per_channel=True,
+        )
+        self.key = QuantLinear(
+            config.hidden_size,
+            self.all_head_size,
+            bias=True,
+            weight_bit=self.weight_bit,
+            bias_bit=self.bias_bit,
+            quant_mode=self.quant_mode,
+            per_channel=True,
+        )
+        self.value = QuantLinear(
+            config.hidden_size,
+            self.all_head_size,
+            bias=True,
+            weight_bit=self.weight_bit,
+            bias_bit=self.bias_bit,
+            quant_mode=self.quant_mode,
+            per_channel=True,
+        )
+
+        # Requantization (32bit -> 8bit) for Q, K, V activations
+        self.query_activation = QuantAct(self.act_bit, quant_mode=self.quant_mode)
+        self.key_activation = QuantAct(self.act_bit, quant_mode=self.quant_mode)
+        self.value_activation = QuantAct(self.act_bit, quant_mode=self.quant_mode)
+        self.output_activation = QuantAct(self.act_bit, quant_mode=self.quant_mode)
+
+        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
+        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
+        if self.position_embedding_type != "absolute":
+            raise ValueError("I-BERT only supports 'absolute' for `config.position_embedding_type`")
+
+        self.softmax = IntSoftmax(self.act_bit, quant_mode=self.quant_mode, force_dequant=config.force_dequant)
+
+    def forward(
+        self,
+        hidden_states,
+        hidden_states_scaling_factor,
+        attention_mask=None,
+        head_mask=None,
+        output_attentions=False,
+    ):  # returns (outputs, output_scaling_factor); both are tuples of the same length
+        # Projection
+        mixed_query_layer, mixed_query_layer_scaling_factor = self.query(hidden_states, hidden_states_scaling_factor)
+        mixed_key_layer, mixed_key_layer_scaling_factor = self.key(hidden_states, hidden_states_scaling_factor)
+        mixed_value_layer, mixed_value_layer_scaling_factor = self.value(hidden_states, hidden_states_scaling_factor)
+
+        # Requantization
+        query_layer, query_layer_scaling_factor = self.query_activation(
+            mixed_query_layer, mixed_query_layer_scaling_factor
+        )
+        key_layer, key_layer_scaling_factor = self.key_activation(mixed_key_layer, mixed_key_layer_scaling_factor)
+        value_layer, value_layer_scaling_factor = self.value_activation(
+            mixed_value_layer, mixed_value_layer_scaling_factor
+        )
+
+        # Split into heads: (batch, seq, all_head_size) -> (batch, heads, seq, head_size)
+        batch_size, seq_length, _ = hidden_states.shape  # seq_length is not used below
+        query_layer = query_layer.view(batch_size, -1, self.num_attention_heads, self.attention_head_size).transpose(
+            1, 2
+        )
+        key_layer = key_layer.view(batch_size, -1, self.num_attention_heads, self.attention_head_size).transpose(1, 2)
+        value_layer = value_layer.view(batch_size, -1, self.num_attention_heads, self.attention_head_size).transpose(
+            1, 2
+        )
+
+        # Take the dot product between "query" and "key" to get the raw attention scores.
+        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
+        scale = math.sqrt(self.attention_head_size)
+        attention_scores = attention_scores / scale
+        if self.quant_mode:
+            attention_scores_scaling_factor = query_layer_scaling_factor * key_layer_scaling_factor / scale
+        else:
+            attention_scores_scaling_factor = None  # no scaling factor is tracked outside quant mode
+
+        if attention_mask is not None:
+            # Apply the attention mask (precomputed for all layers in IBertModel forward() function)
+            attention_scores = attention_scores + attention_mask
+
+        # Normalize the attention scores to probabilities.
+        attention_probs, attention_probs_scaling_factor = self.softmax(
+            attention_scores, attention_scores_scaling_factor
+        )
+
+        # This is actually dropping out entire tokens to attend to, which might
+        # seem a bit unusual, but is taken from the original Transformer paper.
+        attention_probs = self.dropout(attention_probs)
+
+        # Mask heads if we want to
+        if head_mask is not None:
+            attention_probs = attention_probs * head_mask
+
+        context_layer = torch.matmul(attention_probs, value_layer)
+        if attention_probs_scaling_factor is not None:
+            context_layer_scaling_factor = attention_probs_scaling_factor * value_layer_scaling_factor
+        else:
+            context_layer_scaling_factor = None
+
+        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()  # back to (batch, seq, heads, head_size)
+        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)  # merge the two head dimensions
+        context_layer = context_layer.view(*new_context_layer_shape)
+
+        # requantization: 32-bit -> 8-bit
+        context_layer, context_layer_scaling_factor = self.output_activation(
+            context_layer, context_layer_scaling_factor
+        )
+
+        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
+        output_scaling_factor = (  # mirrors the layout of `outputs`
+            (context_layer_scaling_factor, attention_probs_scaling_factor)
+            if output_attentions
+            else (context_layer_scaling_factor,)
+        )
+
+        return outputs, output_scaling_factor
+
+
+class IBertSelfOutput(nn.Module):
+    """
+    Output block of the attention sub-layer: dense projection, dropout, residual input, layer norm, requantization.
+    """
+
+    def __init__(self, config):
+        super().__init__()
+        self.quant_mode = config.quant_mode
+        # Bit widths handed to the quantized sub-modules below.
+        self.act_bit, self.weight_bit, self.bias_bit = 8, 8, 32
+        self.ln_input_bit, self.ln_output_bit = 22, 32
+
+        width = config.hidden_size
+        self.dense = QuantLinear(
+            width,
+            width,
+            bias=True,
+            weight_bit=self.weight_bit,
+            bias_bit=self.bias_bit,
+            quant_mode=self.quant_mode,
+            per_channel=True,
+        )
+        self.ln_input_act = QuantAct(self.ln_input_bit, quant_mode=self.quant_mode)
+        self.LayerNorm = IntLayerNorm(
+            width,
+            eps=config.layer_norm_eps,
+            output_bit=self.ln_output_bit,
+            quant_mode=self.quant_mode,
+            force_dequant=config.force_dequant,
+        )
+        self.output_activation = QuantAct(self.act_bit, quant_mode=self.quant_mode)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self, hidden_states, hidden_states_scaling_factor, input_tensor, input_tensor_scaling_factor):
+        """Return the normalized, requantized tensor and its scaling factor; `input_tensor` is the residual."""
+        out, out_scale = self.dense(hidden_states, hidden_states_scaling_factor)
+        out = self.dropout(out)
+        # The residual travels through `ln_input_act` as its `identity` argument.
+        out, out_scale = self.ln_input_act(
+            out, out_scale, identity=input_tensor, identity_scaling_factor=input_tensor_scaling_factor
+        )
+        out, out_scale = self.LayerNorm(out, out_scale)
+
+        out, out_scale = self.output_activation(out, out_scale)
+        return out, out_scale
+
+
+class IBertAttention(nn.Module):
+    """Self-attention followed by its output block (`IBertSelfOutput`), with a head-pruning helper."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.quant_mode = config.quant_mode
+        self.self = IBertSelfAttention(config)
+        self.output = IBertSelfOutput(config)
+        self.pruned_heads = set()
+
+    def prune_heads(self, heads):
+        """Prune the given heads from the Q/K/V projections and from the output projection."""
+        if len(heads) == 0:
+            return
+        attn = self.self
+        heads, index = find_pruneable_heads_and_indices(
+            heads, attn.num_attention_heads, attn.attention_head_size, self.pruned_heads
+        )
+
+        # Prune linear layers
+        attn.query = prune_linear_layer(attn.query, index)
+        attn.key = prune_linear_layer(attn.key, index)
+        attn.value = prune_linear_layer(attn.value, index)
+        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
+
+        # Update hyper params and store pruned heads
+        attn.num_attention_heads = attn.num_attention_heads - len(heads)
+        attn.all_head_size = attn.attention_head_size * attn.num_attention_heads
+        self.pruned_heads = self.pruned_heads.union(heads)
+
+    def forward(
+        self,
+        hidden_states,
+        hidden_states_scaling_factor,
+        attention_mask=None,
+        head_mask=None,
+        output_attentions=False,
+    ):
+        """Return `(outputs, scaling_factors)`: attention output first, then attention probs if requested."""
+        attn_out, attn_scale = self.self(
+            hidden_states, hidden_states_scaling_factor, attention_mask, head_mask, output_attentions
+        )
+        # The layer input doubles as the residual input of the output block.
+        merged, merged_scale = self.output(attn_out[0], attn_scale[0], hidden_states, hidden_states_scaling_factor)
+        outputs = (merged,) + attn_out[1:]  # add attentions if we output them
+        outputs_scaling_factor = (merged_scale,) + attn_scale[1:]
+        return outputs, outputs_scaling_factor
+
+
+class IBertIntermediate(nn.Module):
+    """Quantized dense layer (hidden_size -> intermediate_size), then `IntGELU`, then requantization."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.quant_mode = config.quant_mode
+        self.act_bit, self.weight_bit, self.bias_bit = 8, 8, 32
+        self.dense = QuantLinear(
+            config.hidden_size,
+            config.intermediate_size,
+            bias=True,
+            weight_bit=self.weight_bit,
+            bias_bit=self.bias_bit,
+            quant_mode=self.quant_mode,
+            per_channel=True,
+        )
+        # `IntGELU` is the only activation instantiated here, so any other `hidden_act` is rejected.
+        if config.hidden_act != "gelu":
+            raise ValueError("I-BERT only supports 'gelu' for `config.hidden_act`")
+        self.intermediate_act_fn = IntGELU(quant_mode=self.quant_mode, force_dequant=config.force_dequant)
+        self.output_activation = QuantAct(self.act_bit, quant_mode=self.quant_mode)
+
+    def forward(self, hidden_states, hidden_states_scaling_factor):
+        """
+        Apply the dense layer and the activation, then requantize; returns `(tensor, scaling_factor)`.
+        """
+        expanded, expanded_scale = self.dense(hidden_states, hidden_states_scaling_factor)
+        expanded, expanded_scale = self.intermediate_act_fn(expanded, expanded_scale)
+
+        # Requantization: 32bit -> 8-bit
+        expanded, expanded_scale = self.output_activation(expanded, expanded_scale)
+        return expanded, expanded_scale
+
+
+class IBertOutput(nn.Module):
+    """
+    Output block of the feed-forward sub-layer: dense (intermediate_size -> hidden_size), dropout, residual, norm.
+    """
+
+    def __init__(self, config):
+        super().__init__()
+        self.quant_mode = config.quant_mode
+        # Bit widths handed to the quantized sub-modules below.
+        self.act_bit, self.weight_bit, self.bias_bit = 8, 8, 32
+        self.ln_input_bit, self.ln_output_bit = 22, 32
+
+        width = config.hidden_size
+        self.dense = QuantLinear(
+            config.intermediate_size,
+            width,
+            bias=True,
+            weight_bit=self.weight_bit,
+            bias_bit=self.bias_bit,
+            quant_mode=self.quant_mode,
+            per_channel=True,
+        )
+        self.ln_input_act = QuantAct(self.ln_input_bit, quant_mode=self.quant_mode)
+        self.LayerNorm = IntLayerNorm(
+            width,
+            eps=config.layer_norm_eps,
+            output_bit=self.ln_output_bit,
+            quant_mode=self.quant_mode,
+            force_dequant=config.force_dequant,
+        )
+        self.output_activation = QuantAct(self.act_bit, quant_mode=self.quant_mode)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self, hidden_states, hidden_states_scaling_factor, input_tensor, input_tensor_scaling_factor):
+        """Return the normalized, requantized tensor and its scaling factor; `input_tensor` is the residual."""
+        out, out_scale = self.dense(hidden_states, hidden_states_scaling_factor)
+        out = self.dropout(out)
+        # The residual travels through `ln_input_act` as its `identity` argument.
+        out, out_scale = self.ln_input_act(
+            out, out_scale, identity=input_tensor, identity_scaling_factor=input_tensor_scaling_factor
+        )
+        out, out_scale = self.LayerNorm(out, out_scale)
+
+        out, out_scale = self.output_activation(out, out_scale)
+        return out, out_scale
+
+
+class IBertLayer(nn.Module):
+    """One encoder layer: an `IBertAttention` block followed by the feed-forward blocks."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.quant_mode = config.quant_mode
+        self.act_bit = 8
+
+        self.seq_len_dim = 1
+        self.attention = IBertAttention(config)
+        self.intermediate = IBertIntermediate(config)
+        self.output = IBertOutput(config)
+
+        # Requantizers applied in front of the intermediate and the output block respectively.
+        self.pre_intermediate_act = QuantAct(self.act_bit, quant_mode=self.quant_mode)
+        self.pre_output_act = QuantAct(self.act_bit, quant_mode=self.quant_mode)
+
+    def forward(
+        self,
+        hidden_states,
+        hidden_states_scaling_factor,
+        attention_mask=None,
+        head_mask=None,
+        output_attentions=False,
+    ):
+        """
+        Run attention and the feed-forward chunk.
+
+        Returns a tuple: the layer output first, then the attention probabilities when `output_attentions`
+        is set. The scaling factor of the layer output is computed but is not part of the return value.
+        """
+        attn_outputs, attn_scales = self.attention(
+            hidden_states,
+            hidden_states_scaling_factor,
+            attention_mask,
+            head_mask,
+            output_attentions=output_attentions,
+        )
+
+        layer_output, _ = self.feed_forward_chunk(attn_outputs[0], attn_scales[0])
+        # add self attentions if we output attention weights
+        return (layer_output,) + attn_outputs[1:]
+
+    def feed_forward_chunk(self, attention_output, attention_output_scaling_factor):
+        """Requantize the attention output, apply intermediate and output blocks; return (tensor, scale)."""
+        attention_output, attention_output_scaling_factor = self.pre_intermediate_act(
+            attention_output, attention_output_scaling_factor
+        )
+        ffn_hidden, ffn_scale = self.intermediate(attention_output, attention_output_scaling_factor)
+        ffn_hidden, ffn_scale = self.pre_output_act(ffn_hidden, ffn_scale)
+
+        # The requantized attention output is handed to the output block as its `input_tensor`.
+        layer_output, layer_output_scaling_factor = self.output(
+            ffn_hidden, ffn_scale, attention_output, attention_output_scaling_factor
+        )
+        return layer_output, layer_output_scaling_factor
+
+
+class IBertEncoder(nn.Module):
+    """Stack of `config.num_hidden_layers` `IBertLayer` modules applied one after another."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.quant_mode = config.quant_mode
+        self.layer = nn.ModuleList([IBertLayer(config) for _ in range(config.num_hidden_layers)])
+
+    def forward(
+        self,
+        hidden_states,
+        hidden_states_scaling_factor,
+        attention_mask=None,
+        head_mask=None,
+        output_attentions=False,
+        output_hidden_states=False,
+        return_dict=True,
+    ):
+        """
+        Run the layers in order, optionally collecting per-layer hidden states and attention probabilities.
+
+        Every layer receives the same `hidden_states_scaling_factor`: layers return only their outputs
+        tuple, so no updated scaling factor is available to forward to the next layer.
+
+        Returns a `BaseModelOutputWithPastAndCrossAttentions`, or, when `return_dict` is falsy, a tuple of
+        the non-`None` values among (last hidden state, all hidden states, all attentions).
+        """
+        collected_states = () if output_hidden_states else None
+        collected_attentions = () if output_attentions else None
+
+        for idx, block in enumerate(self.layer):
+            if output_hidden_states:
+                collected_states = collected_states + (hidden_states,)
+
+            per_layer_mask = None if head_mask is None else head_mask[idx]
+            # Arguments are passed positionally, in the order of `IBertLayer.forward`'s parameters.
+            block_outputs = block(
+                hidden_states, hidden_states_scaling_factor, attention_mask, per_layer_mask, output_attentions
+            )
+
+            hidden_states = block_outputs[0]
+            if output_attentions:
+                collected_attentions = collected_attentions + (block_outputs[1],)
+
+        if output_hidden_states:
+            collected_states = collected_states + (hidden_states,)
+
+        if return_dict:
+            return BaseModelOutputWithPastAndCrossAttentions(
+                last_hidden_state=hidden_states,
+                hidden_states=collected_states,
+                attentions=collected_attentions,
+                cross_attentions=None,  # `config.add_cross_attention` is not supported
+            )
+
+        # Tuple form: drop the entries that were not requested (cross-attentions are always None).
+        candidates = (hidden_states, collected_states, collected_attentions)
+        return tuple(entry for entry in candidates if entry is not None)
+
+
+class IBertPooler(nn.Module):
+    """Pool a sequence by passing its first token's hidden state through a linear layer and tanh."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.quant_mode = config.quant_mode
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        self.activation = nn.Tanh()
+
+    def forward(self, hidden_states):
+        """Return `tanh(dense(hidden_states[:, 0]))`."""
+        # "Pooling" here means keeping only the hidden state of the first token.
+        cls_state = hidden_states[:, 0]
+        return self.activation(self.dense(cls_state))
+
+
+@auto_docstring
+class IBertPreTrainedModel(PreTrainedModel):
+    config: IBertConfig
+    base_model_prefix = "ibert"
+
+    def _init_weights(self, module):
+        """Initialize `module` in place according to its type; unlisted module types are left untouched."""
+        std = self.config.initializer_range
+        if isinstance(module, (QuantLinear, nn.Linear)):
+            # Normal init; the TF version uses truncated_normal, cf https://github.com/pytorch/pytorch/pull/5617
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, (QuantEmbedding, nn.Embedding)):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+        elif isinstance(module, (IntLayerNorm, nn.LayerNorm)):
+            module.weight.data.fill_(1.0)
+            module.bias.data.zero_()
+        elif isinstance(module, IBertLMHead):
+            module.bias.data.zero_()
+
+    def resize_token_embeddings(self, new_num_tokens=None):
+        raise NotImplementedError("`resize_token_embeddings` is not supported for I-BERT.")
+
+
+@auto_docstring
+class IBertModel(IBertPreTrainedModel):
+    """
+
+    I-BERT encoder built from `IBertEmbeddings`, `IBertEncoder` and an optional `IBertPooler`.
+
+    Only self-attention is implemented here: the encoder never produces cross-attentions (its
+    `cross_attentions` output is always `None`), so decoder-style cross-attention is not available.
+
+    """
+
+    def __init__(self, config, add_pooling_layer=True):
+        r"""
+        add_pooling_layer (bool, *optional*, defaults to `True`):
+            Whether to add a pooling layer
+        """
+        super().__init__(config)
+        self.config = config
+        self.quant_mode = config.quant_mode
+
+        self.embeddings = IBertEmbeddings(config)
+        self.encoder = IBertEncoder(config)
+
+        self.pooler = IBertPooler(config) if add_pooling_layer else None
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.embeddings.word_embeddings
+
+    def set_input_embeddings(self, value):
+        self.embeddings.word_embeddings = value
+
+    def _prune_heads(self, heads_to_prune):
+        """
+        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
+        class PreTrainedModel
+        """
+        for layer, heads in heads_to_prune.items():
+            self.encoder.layer[layer].attention.prune_heads(heads)
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[BaseModelOutputWithPoolingAndCrossAttentions, tuple[torch.FloatTensor]]:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+        elif input_ids is not None:
+            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
+            input_shape = input_ids.size()
+        elif inputs_embeds is not None:
+            input_shape = inputs_embeds.size()[:-1]  # drop the trailing embedding dimension
+        else:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+        batch_size, seq_length = input_shape
+        device = input_ids.device if input_ids is not None else inputs_embeds.device
+
+        if attention_mask is None:
+            attention_mask = torch.ones(((batch_size, seq_length)), device=device)  # default: attend everywhere
+        if token_type_ids is None:
+            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
+
+        # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
+        # ourselves in which case we just need to make it broadcastable to all heads.
+        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape)
+
+        # Prepare head mask if needed
+        # 1.0 in head_mask indicate we keep the head
+        # attention_probs has shape bsz x n_heads x N x N
+        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
+        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
+        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
+
+        embedding_output, embedding_output_scaling_factor = self.embeddings(
+            input_ids=input_ids,
+            position_ids=position_ids,
+            token_type_ids=token_type_ids,
+            inputs_embeds=inputs_embeds,
+        )
+        encoder_outputs = self.encoder(
+            embedding_output,
+            embedding_output_scaling_factor,
+            attention_mask=extended_attention_mask,
+            head_mask=head_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        sequence_output = encoder_outputs[0]
+        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None
+
+        if not return_dict:
+            return (sequence_output, pooled_output) + encoder_outputs[1:]  # pooled_output may be None here
+
+        return BaseModelOutputWithPoolingAndCrossAttentions(
+            last_hidden_state=sequence_output,
+            pooler_output=pooled_output,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+            cross_attentions=encoder_outputs.cross_attentions,
+        )
+
+
+@auto_docstring
+class IBertForMaskedLM(IBertPreTrainedModel):
+    _tied_weights_keys = ["lm_head.decoder.bias", "lm_head.decoder.weight"]
+
+    def __init__(self, config):
+        super().__init__(config)
+
+        self.ibert = IBertModel(config, add_pooling_layer=False)  # the LM head only needs per-token states
+        self.lm_head = IBertLMHead(config)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_output_embeddings(self):
+        return self.lm_head.decoder  # the hidden_size -> vocab_size projection of the LM head
+
+    def set_output_embeddings(self, new_embeddings):
+        self.lm_head.decoder = new_embeddings
+        self.lm_head.bias = new_embeddings.bias  # keep the head's own `bias` attribute on the new decoder's bias
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[MaskedLMOutput, tuple[torch.FloatTensor]]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
+            config.vocab_size - 1]` (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size - 1]`.
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.ibert(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        sequence_output = outputs[0]  # last hidden state of the base model
+        prediction_scores = self.lm_head(sequence_output)
+
+        masked_lm_loss = None
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()  # default `ignore_index=-100` skips the ignored labels
+            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
+
+        if not return_dict:
+            output = (prediction_scores,) + outputs[2:]  # drop last_hidden_state and the pooled-output slot
+            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
+
+        return MaskedLMOutput(
+            loss=masked_lm_loss,
+            logits=prediction_scores,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+class IBertLMHead(nn.Module):
+    """I-BERT Head for masked language modeling: dense -> gelu -> layer norm -> projection to the vocabulary."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+
+        self.decoder = nn.Linear(config.hidden_size, config.vocab_size)
+        self.bias = nn.Parameter(torch.zeros(config.vocab_size))
+        self.decoder.bias = self.bias  # `self.bias` and the decoder's bias are the same Parameter
+
+    def forward(self, features, **kwargs):  # extra keyword arguments are accepted but not used
+        x = self.dense(features)
+        x = gelu(x)
+        x = self.layer_norm(x)
+
+        # project back to size of vocabulary with bias
+        x = self.decoder(x)
+
+        return x
+
+    def _tie_weights(self) -> None:
+        # For accelerate compatibility and to not break backward compatibility
+        if self.decoder.bias.device.type == "meta":
+            self.decoder.bias = self.bias  # decoder bias is on the meta device: point it at the head's bias
+        else:
+            # To tie those two weights if they get disconnected (on TPU or when the bias is resized)
+            self.bias = self.decoder.bias
+
+
+@auto_docstring(
+    custom_intro="""
+    I-BERT Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
+    output) e.g. for GLUE tasks.
+    """
+)
+class IBertForSequenceClassification(IBertPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+
+        self.ibert = IBertModel(config, add_pooling_layer=False)  # the head below picks the first token itself
+        self.classifier = IBertClassificationHead(config)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[SequenceClassifierOutput, tuple[torch.FloatTensor]]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.ibert(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        sequence_output = outputs[0]  # last hidden state of the base model
+        logits = self.classifier(sequence_output)
+
+        loss = None
+        if labels is not None:
+            if self.config.problem_type is None:  # inferred on first use, then stored on the config
+                if self.num_labels == 1:
+                    self.config.problem_type = "regression"
+                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
+                    self.config.problem_type = "single_label_classification"  # integer labels: one class per example
+                else:
+                    self.config.problem_type = "multi_label_classification"  # any other label dtype
+
+            if self.config.problem_type == "regression":
+                loss_fct = MSELoss()
+                if self.num_labels == 1:
+                    loss = loss_fct(logits.squeeze(), labels.squeeze())  # size-1 dims removed on both sides
+                else:
+                    loss = loss_fct(logits, labels)
+            elif self.config.problem_type == "single_label_classification":
+                loss_fct = CrossEntropyLoss()
+                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+            elif self.config.problem_type == "multi_label_classification":
+                loss_fct = BCEWithLogitsLoss()
+                loss = loss_fct(logits, labels)
+        if not return_dict:
+            output = (logits,) + outputs[2:]  # drop last_hidden_state and the pooled-output slot
+            return ((loss,) + output) if loss is not None else output
+
+        return SequenceClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+@auto_docstring
+class IBertForMultipleChoice(IBertPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+
+        self.ibert = IBertModel(config)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.classifier = nn.Linear(config.hidden_size, 1)  # a single score per (example, choice) pair
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[MultipleChoiceModelOutput, tuple[torch.FloatTensor]]:
+        r"""
+        input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
+            Indices of input sequence tokens in the vocabulary.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
+            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
+            1]`:
+
+            - 0 corresponds to a *sentence A* token,
+            - 1 corresponds to a *sentence B* token.
+
+            [What are token type IDs?](../glossary#token-type-ids)
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
+            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
+            `input_ids` above)
+        position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
+            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+            config.max_position_embeddings - 1]`.
+
+            [What are position IDs?](../glossary#position-ids)
+        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
+            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+            model's internal embedding lookup matrix.
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]  # size of dim 1
+
+        # Merge all leading dimensions into one so that every choice is run through the base model as its own row.
+        flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
+        flat_position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
+        flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
+        flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
+        flat_inputs_embeds = (
+            inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
+            if inputs_embeds is not None
+            else None
+        )
+
+        outputs = self.ibert(
+            flat_input_ids,
+            position_ids=flat_position_ids,
+            token_type_ids=flat_token_type_ids,
+            attention_mask=flat_attention_mask,
+            head_mask=head_mask,
+            inputs_embeds=flat_inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        pooled_output = outputs[1]  # second output of the base model, i.e. its pooler output
+
+        pooled_output = self.dropout(pooled_output)
+        logits = self.classifier(pooled_output)
+        reshaped_logits = logits.view(-1, num_choices)  # regroup the per-row scores into `num_choices` columns
+
+        loss = None
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()
+            loss = loss_fct(reshaped_logits, labels)
+
+        if not return_dict:
+            output = (reshaped_logits,) + outputs[2:]  # drop last_hidden_state and the pooler output
+            return ((loss,) + output) if loss is not None else output
+
+        return MultipleChoiceModelOutput(
+            loss=loss,
+            logits=reshaped_logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+@auto_docstring
+class IBertForTokenClassification(IBertPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+
+        self.ibert = IBertModel(config, add_pooling_layer=False)  # per-token states only
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.classifier = nn.Linear(config.hidden_size, config.num_labels)  # applied to every token position
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[TokenClassifierOutput, tuple[torch.FloatTensor]]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.ibert(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        sequence_output = outputs[0]  # last hidden state of the base model
+
+        sequence_output = self.dropout(sequence_output)
+        logits = self.classifier(sequence_output)
+
+        loss = None
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()
+            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))  # one loss term per flattened token
+
+        if not return_dict:
+            output = (logits,) + outputs[2:]  # drop last_hidden_state and the pooled-output slot
+            return ((loss,) + output) if loss is not None else output
+
+        return TokenClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+class IBertClassificationHead(nn.Module):
+    """Head for sentence-level classification tasks: dropout -> dense -> tanh -> dropout -> output projection."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)  # the same module is applied twice in `forward`
+        self.out_proj = nn.Linear(config.hidden_size, config.num_labels)
+
+    def forward(self, features, **kwargs):  # extra keyword arguments are accepted but not used
+        hidden_states = features[:, 0, :]  # take <s> token (equiv. to [CLS])
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = self.dense(hidden_states)
+        hidden_states = torch.tanh(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = self.out_proj(hidden_states)  # one logit per label
+        return hidden_states
+
+
+@auto_docstring
+class IBertForQuestionAnswering(IBertPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+
+        self.ibert = IBertModel(config, add_pooling_layer=False)
+        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)  # `forward` unpacks exactly two logits
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        start_positions: Optional[torch.LongTensor] = None,
+        end_positions: Optional[torch.LongTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[QuestionAnsweringModelOutput, tuple[torch.FloatTensor]]:
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.ibert(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        sequence_output = outputs[0]  # last hidden state of the base model
+
+        logits = self.qa_outputs(sequence_output)
+        start_logits, end_logits = logits.split(1, dim=-1)  # split the last dim into single-channel slices
+        start_logits = start_logits.squeeze(-1).contiguous()
+        end_logits = end_logits.squeeze(-1).contiguous()
+
+        total_loss = None
+        if start_positions is not None and end_positions is not None:  # the loss needs both targets
+            # If we are on multi-GPU, the split may add a trailing dimension; squeeze it away
+            if len(start_positions.size()) > 1:
+                start_positions = start_positions.squeeze(-1)
+            if len(end_positions.size()) > 1:
+                end_positions = end_positions.squeeze(-1)
+            # sometimes the start/end positions are outside our model inputs, we ignore these terms
+            ignored_index = start_logits.size(1)  # one past the last valid position index
+            start_positions = start_positions.clamp(0, ignored_index)  # too-large -> ignored_index, negative -> 0
+            end_positions = end_positions.clamp(0, ignored_index)
+
+            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
+            start_loss = loss_fct(start_logits, start_positions)
+            end_loss = loss_fct(end_logits, end_positions)
+            total_loss = (start_loss + end_loss) / 2
+
+        if not return_dict:
+            output = (start_logits, end_logits) + outputs[2:]  # drop last_hidden_state and the pooled-output slot
+            return ((total_loss,) + output) if total_loss is not None else output
+
+        return QuestionAnsweringModelOutput(
+            loss=total_loss,
+            start_logits=start_logits,
+            end_logits=end_logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
+    """
+    Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
+    keep the position `padding_idx`. This is modified from fairseq's *utils.make_positions*.
+
+    Args:
+        input_ids (`torch.LongTensor`): Indices of input sequence tokens in the vocabulary.
+        padding_idx (`int`): Index of the padding token; entries equal to it are masked out.
+        past_key_values_length (`int`, *optional*, defaults to 0): Offset added to every non-padding position.
+
+    Returns: torch.Tensor of position ids (converted with `.long()`), same shape as `input_ids`.
+    """
+    # The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA.
+    mask = input_ids.ne(padding_idx).int()  # 1 for real tokens, 0 for padding
+    incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
+    return incremental_indices.long() + padding_idx  # padding entries are 0 above, so they end up at padding_idx
+
+
+__all__ = [  # names this module exports publicly
+    "IBertForMaskedLM",
+    "IBertForMultipleChoice",
+    "IBertForQuestionAnswering",
+    "IBertForSequenceClassification",
+    "IBertForTokenClassification",
+    "IBertModel",
+    "IBertPreTrainedModel",
+]
diff --git a/phivenv/Lib/site-packages/transformers/models/ibert/quant_modules.py b/phivenv/Lib/site-packages/transformers/models/ibert/quant_modules.py
new file mode 100644
index 0000000000000000000000000000000000000000..949702a5af97da779cb6dab842b0029d274417dc
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/ibert/quant_modules.py
@@ -0,0 +1,820 @@
+# coding=utf-8
+# Copyright 2021 The I-BERT Authors (Sehoon Kim, Amir Gholami, Zhewei Yao,
+# Michael Mahoney, Kurt Keutzer - UC Berkeley) and The HuggingFace Inc. team.
+# Copyright (c) 20121, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import decimal
+
+import numpy as np
+import torch
+from torch import nn
+from torch.autograd import Function
+
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class QuantEmbedding(nn.Module):
+    """
+    Quantized version of `torch.nn.Embedding`. Adds quantization-specific arguments on top of `torch.nn.Embedding`.
+
+    Args:
+        weight_bit (`int`, *optional*, defaults to `8`):
+            Bitwidth for the quantized weight.
+        momentum (`float`, *optional*, defaults to `0.95`):
+            Momentum for updating the activation quantization range.
+        quant_mode (`bool`, *optional*, defaults to `False`):
+            Whether or not the layer is quantized.
+    """
+
+    def __init__(
+        self,
+        num_embeddings,
+        embedding_dim,
+        padding_idx=None,
+        max_norm=None,
+        norm_type=2.0,
+        scale_grad_by_freq=False,
+        sparse=False,
+        _weight=None,
+        weight_bit=8,
+        momentum=0.95,
+        quant_mode=False,
+    ):
+        super().__init__()
+        self.num_ = num_embeddings
+        self.dim = embedding_dim
+        self.padding_idx = padding_idx
+        self.max_norm = max_norm
+        self.norm_type = norm_type
+        self.scale_grad_by_freq = scale_grad_by_freq
+        self.sparse = sparse
+
+        self.weight = nn.Parameter(torch.zeros([num_embeddings, embedding_dim]))  # `_weight` is not used here
+        self.register_buffer("weight_scaling_factor", torch.zeros(1))
+        self.register_buffer("weight_integer", torch.zeros_like(self.weight))
+
+        self.weight_bit = weight_bit
+        self.momentum = momentum  # stored only; `forward` below does not read it
+        self.quant_mode = quant_mode
+        self.percentile_mode = False
+        self.weight_function = SymmetricQuantFunction.apply
+
+    def forward(self, x, positions=None, incremental_state=None):  # the two optional arguments are unused
+        if not self.quant_mode:  # float path: plain embedding lookup, returned with `None` as scaling factor
+            return (
+                nn.functional.embedding(
+                    x,
+                    self.weight,
+                    self.padding_idx,
+                    self.max_norm,
+                    self.norm_type,
+                    self.scale_grad_by_freq,
+                    self.sparse,
+                ),
+                None,
+            )
+
+        w = self.weight
+        w_transform = w.data.detach()
+        w_min = w_transform.min().expand(1)  # global minimum as a shape-(1,) tensor
+        w_max = w_transform.max().expand(1)  # global maximum as a shape-(1,) tensor
+
+        self.weight_scaling_factor = symmetric_linear_quantization_params(self.weight_bit, w_min, w_max, False)
+        self.weight_integer = self.weight_function(
+            self.weight, self.weight_bit, self.percentile_mode, self.weight_scaling_factor
+        )
+
+        emb_int = nn.functional.embedding(
+            x,
+            self.weight_integer,  # look up rows of the quantized weight instead of `self.weight`
+            self.padding_idx,
+            self.max_norm,
+            self.norm_type,
+            self.scale_grad_by_freq,
+            self.sparse,
+        )
+        return emb_int * self.weight_scaling_factor, self.weight_scaling_factor  # rescaled lookup and its scale
+
+
+class QuantAct(nn.Module):
+    """
+    Quantizes the given activation.
+
+    Args:
+        activation_bit (`int`):
+            Bitwidth for the quantized activation.
+        act_range_momentum (`float`, *optional*, defaults to `0.95`):
+            Momentum for updating the activation quantization range.
+        per_channel (`bool`, *optional*, defaults to `False`):
+            Whether or not to use channel-wise quantization.
+        channel_len (`int`, *optional*):
+            Specify the channel length when *per_channel* is set to True (not read by the current implementation).
+        quant_mode (`bool`, *optional*, defaults to `False`):
+            Whether or not the layer is quantized.
+    """
+
+    def __init__(self, activation_bit, act_range_momentum=0.95, per_channel=False, channel_len=None, quant_mode=False):
+        super().__init__()
+
+        self.activation_bit = activation_bit
+        self.act_range_momentum = act_range_momentum
+        self.quant_mode = quant_mode
+        self.per_channel = per_channel
+        self.percentile = False
+        self.act_function = SymmetricQuantFunction.apply
+
+        if not self.per_channel:
+            self.register_buffer("x_min", torch.zeros(1))
+            self.register_buffer("x_max", torch.zeros(1))
+            self.register_buffer("act_scaling_factor", torch.zeros(1))
+            self.x_min -= 1e-5  # start from the tiny range [-1e-5, 1e-5]; `forward` treats it as "uninitialized"
+            self.x_max += 1e-5
+        else:
+            raise NotImplementedError("per-channel mode is not currently supported for activation.")
+
+    def __repr__(self):
+        return (
+            f"{self.__class__.__name__}(activation_bit={self.activation_bit}, "
+            f"quant_mode: {self.quant_mode}, Act_min: {self.x_min.item():.2f}, "
+            f"Act_max: {self.x_max.item():.2f})"
+        )
+
+    def forward(
+        self,
+        x,
+        pre_act_scaling_factor=None,
+        identity=None,
+        identity_scaling_factor=None,
+        specified_min=None,
+        specified_max=None,
+    ):
+        x_act = x if identity is None else identity + x  # optional residual branch is added to the input
+        # collect running stats if training
+        if self.training:
+            assert not self.percentile, "percentile mode is not currently supported for activation."
+            assert not self.per_channel, "per-channel mode is not currently supported for activation."
+            x_min = x_act.data.min()
+            x_max = x_act.data.max()
+
+            assert x_max.isnan().sum() == 0 and x_min.isnan().sum() == 0, (
+                "NaN detected when computing min/max of the activation"
+            )
+
+            # Initialization: the buffers still hold their +/-1e-5 start values, so adopt this batch's range
+            if self.x_min.min() > -1.1e-5 and self.x_max.max() < 1.1e-5:
+                self.x_min = self.x_min + x_min
+                self.x_max = self.x_max + x_max
+
+            # a momentum of -1 keeps the widest min/max seen so far; any other value applies an exponential
+            # moving average (EMA) to prevent the quantized values from changing greatly every iteration
+            elif self.act_range_momentum == -1:
+                self.x_min = torch.min(self.x_min, x_min)
+                self.x_max = torch.max(self.x_max, x_max)
+            else:
+                self.x_min = self.x_min * self.act_range_momentum + x_min * (1 - self.act_range_momentum)
+                self.x_max = self.x_max * self.act_range_momentum + x_max * (1 - self.act_range_momentum)
+
+        if not self.quant_mode:  # float path: return the (residual-added) input and no scaling factor
+            return x_act, None
+
+        x_min = self.x_min if specified_min is None else specified_min  # explicit bounds override running stats
+        x_max = self.x_max if specified_max is None else specified_max
+
+        self.act_scaling_factor = symmetric_linear_quantization_params(
+            self.activation_bit, x_min, x_max, per_channel=self.per_channel
+        )
+
+        if pre_act_scaling_factor is None:
+            # this is for the input quantization
+            # NOTE(review): this path quantizes `x`, not `x_act`, so `identity` is not added here - confirm intended
+            quant_act_int = self.act_function(x, self.activation_bit, self.percentile, self.act_scaling_factor)
+        else:
+            quant_act_int = FixedPointMul.apply(
+                x,
+                pre_act_scaling_factor,
+                self.activation_bit,
+                self.act_scaling_factor,
+                identity,
+                identity_scaling_factor,
+            )
+
+        correct_output_scale = self.act_scaling_factor.view(-1)
+
+        return quant_act_int * correct_output_scale, self.act_scaling_factor  # rescaled output and its scale
+
+
+class QuantLinear(nn.Module):
+    """
+    Quantized version of `torch.nn.Linear`. Adds quantization-specific arguments on top of `torch.nn.Linear`.
+
+    Args:
+        bias (`bool`, *optional*, defaults to `True`):
+            Whether the layer has a bias. With `False`, `bias` and `bias_integer` are `None`.
+        weight_bit (`int`, *optional*, defaults to `8`):
+            Bitwidth for the quantized weight.
+        bias_bit (`int`, *optional*, defaults to `32`):
+            Bitwidth for the quantized bias.
+        per_channel (`bool`, *optional*, defaults to `False`):
+            Whether or not to use channel-wise quantization.
+        quant_mode (`bool`, *optional*, defaults to `False`):
+            Whether or not the layer is quantized.
+    """
+
+    def __init__(
+        self, in_features, out_features, bias=True, weight_bit=8, bias_bit=32, per_channel=False, quant_mode=False
+    ):
+        super().__init__()
+        self.in_features = in_features
+        self.out_features = out_features
+
+        self.weight = nn.Parameter(torch.zeros([out_features, in_features]))
+        self.register_buffer("weight_integer", torch.zeros_like(self.weight))
+        self.register_buffer("fc_scaling_factor", torch.zeros(self.out_features))
+        self.register_parameter("bias", nn.Parameter(torch.zeros(out_features)) if bias else None)  # always defined
+        self.register_buffer("bias_integer", torch.zeros_like(self.bias) if bias else None)
+
+        self.weight_bit = weight_bit
+        self.quant_mode = quant_mode
+        self.per_channel = per_channel
+        self.bias_bit = bias_bit
+        self.percentile_mode = False
+        self.weight_function = SymmetricQuantFunction.apply
+
+    def __repr__(self):
+        s = super().__repr__()
+        s = f"({s} weight_bit={self.weight_bit}, quant_mode={self.quant_mode})"
+        return s
+
+    def forward(self, x, prev_act_scaling_factor=None):
+        """Return `(output, output_scaling_factor)`; the scaling factor is `None` when `quant_mode` is off."""
+        if not self.quant_mode:
+            return nn.functional.linear(x, weight=self.weight, bias=self.bias), None
+
+        # the input must come with a single global scale, i.e. a scaling factor of shape (1,)
+        assert prev_act_scaling_factor is not None and prev_act_scaling_factor.shape == (1,), (
+            "Input activation to the QuantLinear layer should be globally (non-channel-wise) quantized. "
+            "Please add a QuantAct layer with `per_channel = False` before this QuantLinear layer"
+        )
+
+        w_transform = self.weight.data.detach()
+        if self.per_channel:
+            w_min, _ = torch.min(w_transform, dim=1, out=None)  # one range per output row
+            w_max, _ = torch.max(w_transform, dim=1, out=None)
+        else:
+            w_min = w_transform.min().expand(1)  # one global range as shape-(1,) tensors
+            w_max = w_transform.max().expand(1)
+
+        self.fc_scaling_factor = symmetric_linear_quantization_params(self.weight_bit, w_min, w_max, self.per_channel)
+        self.weight_integer = self.weight_function(
+            self.weight, self.weight_bit, self.percentile_mode, self.fc_scaling_factor
+        )
+
+        bias_scaling_factor = self.fc_scaling_factor * prev_act_scaling_factor  # also the scale of the output
+
+        if self.bias is not None:  # without a bias, `bias_integer` stays `None` and no bias is added below
+            self.bias_integer = self.weight_function(self.bias, self.bias_bit, False, bias_scaling_factor)
+
+        prev_act_scaling_factor = prev_act_scaling_factor.view(1, -1)
+        x_int = x / prev_act_scaling_factor  # divide out the scale that the previous layer applied to `x`
+
+        return (
+            nn.functional.linear(x_int, weight=self.weight_integer, bias=self.bias_integer) * bias_scaling_factor,
+            bias_scaling_factor,
+        )
+
+
+class IntGELU(nn.Module):
+    """
+    Quantized version of `torch.nn.GELU`. Adds quantization-specific arguments on top of `torch.nn.GELU`.
+
+    Args:
+        quant_mode (`bool`, *optional*, defaults to `True`):
+            Whether or not the layer is quantized.
+        force_dequant (`str`, *optional*, defaults to `"none"`):
+            Force dequantize the layer if either "gelu" or "nonlinear" is given.
+    """
+
+    def __init__(self, quant_mode=True, force_dequant="none"):
+        super().__init__()
+        self.quant_mode = quant_mode
+
+        if force_dequant in ["nonlinear", "gelu"]:
+            logger.info("Force dequantize gelu")
+            self.quant_mode = False
+
+        if not self.quant_mode:
+            self.activation_fn = nn.GELU()  # float fallback; only created when quantization is off
+
+        self.k = 1.4142  # ~sqrt(2); forward() divides the input scaling factor by it before int_erf
+        self.const = 14  # dummy integer constant
+        self.coeff = [-0.2888, -1.769, 1]  # a(x+b)**2 + c
+        self.coeff[2] /= self.coeff[0]  # store c / a; int_erf folds `a` into the returned scaling factor
+
+    def int_erf(self, x_int, scaling_factor):
+        b_int = torch.floor(self.coeff[1] / scaling_factor)
+        c_int = torch.floor(self.coeff[2] / scaling_factor**2)
+        sign = torch.sign(x_int)
+
+        abs_int = torch.min(torch.abs(x_int), -b_int)  # clip |x| at -b
+        y_int = sign * ((abs_int + b_int) ** 2 + c_int)
+        scaling_factor = scaling_factor**2 * self.coeff[0]
+
+        # avoid overflow
+        y_int = floor_ste.apply(y_int / 2**self.const)
+        scaling_factor = scaling_factor * 2**self.const
+
+        return y_int, scaling_factor
+
+    def forward(self, x, scaling_factor=None):
+        if not self.quant_mode:
+            return self.activation_fn(x), None  # no scaling factor on the float path
+
+        x_int = x / scaling_factor
+        sigmoid_int, sigmoid_scaling_factor = self.int_erf(x_int, scaling_factor / self.k)
+
+        shift_int = 1.0 // sigmoid_scaling_factor
+
+        x_int = x_int * (sigmoid_int + shift_int)
+        scaling_factor = scaling_factor * sigmoid_scaling_factor / 2
+
+        return x_int * scaling_factor, scaling_factor
+
+
+class IntSoftmax(nn.Module):
+    """
+    Quantized version of `torch.nn.Softmax`. Adds quantization-specific arguments on top of `torch.nn.Softmax`.
+
+    Args:
+        output_bit (`int`):
+            Bitwidth for the layer output activation.
+        quant_mode (`bool`, *optional*, defaults to `False`):
+            Whether or not the layer is quantized.
+        force_dequant (`str`, *optional*, defaults to `"none"`):
+            Force dequantize the layer if either "softmax" or "nonlinear" is given.
+    """
+
+    def __init__(self, output_bit, quant_mode=False, force_dequant="none"):
+        super().__init__()
+        self.output_bit = output_bit
+        self.max_bit = 32
+        self.quant_mode = quant_mode
+
+        if force_dequant in ["nonlinear", "softmax"]:
+            logger.info("Force dequantize softmax")
+            self.quant_mode = False
+
+        self.act = QuantAct(16, quant_mode=self.quant_mode)  # applied to the integer exponentials in forward()
+        self.x0 = -0.6931  # -ln2
+        self.const = 30  # dummy integer constant
+        self.coef = [0.35815147, 0.96963238, 1.0]  # ax**2 + bx + c
+        self.coef[1] /= self.coef[0]  # b / a
+        self.coef[2] /= self.coef[0]  # c / a; int_polynomial puts `a` back through the scaling factor
+
+    def int_polynomial(self, x_int, scaling_factor):
+        with torch.no_grad():
+            b_int = torch.floor(self.coef[1] / scaling_factor)
+            c_int = torch.floor(self.coef[2] / scaling_factor**2)
+        z = (x_int + b_int) * x_int + c_int
+        scaling_factor = self.coef[0] * scaling_factor**2
+        return z, scaling_factor
+
+    def int_exp(self, x_int, scaling_factor):
+        with torch.no_grad():
+            x0_int = torch.floor(self.x0 / scaling_factor)
+        x_int = torch.max(x_int, self.const * x0_int)  # bound the input from below at const * x0_int
+
+        q = floor_ste.apply(x_int / x0_int)
+        r = x_int - x0_int * q
+        exp_int, exp_scaling_factor = self.int_polynomial(r, scaling_factor)
+        exp_int = torch.clamp(floor_ste.apply(exp_int * 2 ** (self.const - q)), min=0)
+        scaling_factor = exp_scaling_factor / 2**self.const
+        return exp_int, scaling_factor
+
+    def forward(self, x, scaling_factor):
+        if not self.quant_mode:
+            return nn.functional.softmax(x, dim=-1), None  # float softmax over the last dimension
+
+        x_int = x / scaling_factor
+
+        x_int_max, _ = x_int.max(dim=-1, keepdim=True)
+        x_int = x_int - x_int_max  # after subtracting the max over the last dim, every entry is <= 0
+        exp_int, exp_scaling_factor = self.int_exp(x_int, scaling_factor)
+
+        # Avoid overflow
+        exp, exp_scaling_factor = self.act(exp_int, exp_scaling_factor)
+        exp_int = exp / exp_scaling_factor
+
+        exp_int_sum = exp_int.sum(dim=-1, keepdim=True)
+        factor = floor_ste.apply(2**self.max_bit / exp_int_sum)
+        exp_int = floor_ste.apply(exp_int * factor / 2 ** (self.max_bit - self.output_bit))
+        scaling_factor = 1 / 2**self.output_bit  # fixed output scale; a plain Python number if output_bit is an int
+        return exp_int * scaling_factor, scaling_factor
+
+
+class IntLayerNorm(nn.Module):
+    """
+    Quantized version of `torch.nn.LayerNorm`. Adds quantization-specific arguments on top of `torch.nn.LayerNorm`.
+
+    Args:
+        output_bit (`int`, *optional*, defaults to `8`):
+            Bitwidth for the layer output activation.
+        quant_mode (`bool`, *optional*, defaults to `False`):
+            Whether or not the layer is quantized.
+        force_dequant (`str`, *optional*, defaults to `"none"`):
+            Force dequantize the layer if either "layernorm" or "nonlinear" is given.
+    """
+
+    def __init__(self, normalized_shape, eps, output_bit=8, quant_mode=False, force_dequant="none"):
+        super().__init__()
+        self.normalized_shape = normalized_shape  # shape of the learnable `weight` and `bias`
+        self.eps = eps  # only read by the non-quantized path in forward()
+
+        self.weight = nn.Parameter(torch.zeros(normalized_shape))
+        self.bias = nn.Parameter(torch.zeros(normalized_shape))
+
+        self.quant_mode = quant_mode
+        if force_dequant in ["nonlinear", "layernorm"]:
+            logger.info("Force dequantize layernorm")
+            self.quant_mode = False
+
+        self.register_buffer("shift", torch.zeros(1))  # overflow-avoidance shift, raised by set_shift()
+        self.output_bit = output_bit
+        self.max_bit = 32
+        self.dim_sqrt = None  # sqrt of x.shape[2], cached on the first quantized forward()
+        self.activation = QuantAct(self.output_bit, quant_mode=self.quant_mode)
+
+    def set_shift(self, y_int):
+        with torch.no_grad():
+            y_sq_int = y_int**2
+            var_int = torch.sum(y_sq_int, axis=2, keepdim=True)
+            shift = (torch.log2(torch.sqrt(var_int / 2**self.max_bit)).ceil()).max()
+            shift_old = self.shift
+            self.shift = torch.max(self.shift, shift)  # the stored shift never decreases
+            logger.info(f"Dynamic shift adjustment: {int(shift_old)} -> {int(self.shift)}")
+
+    def overflow_fallback(self, y_int):
+        """
+        This fallback function is called when overflow is detected during training time, and adjusts the `self.shift`
+        to avoid overflow in the subsequent runs.
+        """
+        self.set_shift(y_int)  # adjusts `self.shift`
+        y_int_shifted = floor_ste.apply(y_int / 2**self.shift)
+        y_sq_int = y_int_shifted**2
+        var_int = torch.sum(y_sq_int, axis=2, keepdim=True)
+        return var_int
+
+    def forward(self, x, scaling_factor=None):
+        if not self.quant_mode:
+            mean = x.mean(axis=2, keepdim=True)  # statistics are reduced over axis 2 only
+            y = x - mean
+            var = torch.mean(y**2, axis=2, keepdim=True)
+            x = y / torch.sqrt(self.eps + var)
+            x = x * self.weight + self.bias
+            return x, None
+
+        # compute sqrt of the feature dimension if it is the first run
+        if self.dim_sqrt is None:
+            n = torch.tensor(x.shape[2], dtype=torch.float)
+            self.dim_sqrt = torch.sqrt(n).to(x.device)
+
+        # Normalization: computes mean and variance(std)
+        x_int = x / scaling_factor
+        mean_int = round_ste.apply(x_int.mean(axis=2, keepdim=True))
+        y_int = x_int - mean_int
+        y_int_shifted = floor_ste.apply(y_int / 2**self.shift)
+        y_sq_int = y_int_shifted**2
+        var_int = torch.sum(y_sq_int, axis=2, keepdim=True)
+
+        # overflow handling in training time
+        if self.training:
+            # if overflow is detected
+            if var_int.max() >= 2**self.max_bit:
+                var_int = self.overflow_fallback(y_int)
+                assert var_int.max() < 2**self.max_bit + 0.1, (
+                    "Error detected in overflow handling: "
+                    "`var_int` exceeds `self.max_bit` (the maximum possible bit width)"
+                )
+
+        # To be replaced with integer-sqrt kernel that produces the same output
+        std_int = floor_ste.apply(torch.sqrt(var_int)) * 2**self.shift
+        factor = floor_ste.apply(2**31 / std_int)
+        y_int = floor_ste.apply(y_int * factor / 2)
+        scaling_factor = self.dim_sqrt / 2**30
+
+        # scaling and shifting
+        bias = self.bias.data.detach() / (self.weight.data.detach())  # NOTE(review): zero weights give inf/nan here
+        bias_int = floor_ste.apply(bias / scaling_factor)
+
+        y_int = y_int + bias_int
+        scaling_factor = scaling_factor * self.weight
+        x = y_int * scaling_factor
+
+        return x, scaling_factor
+
+
+def get_percentile_min_max(input, lower_percentile, upper_percentile, output_tensor=False):
+    """
+    Return the lower and upper percentile cut-off values of a tensor.
+
+    Args:
+        input (`torch.Tensor`):
+            Tensor to take the percentile bounds of; its first dimension is used as the element count.
+        lower_percentile (`float`):
+            Lower percentile in percent (0.1 bounds the smallest 0.1% of values); with 0 the lower bound is 0.
+        upper_percentile (`float`):
+            Upper percentile in percent (99.9 bounds the largest 0.1% of values).
+        output_tensor (`bool`, *optional*, defaults to `False`):
+            If True, the bounds are returned as tensors, otherwise as Python numbers (via `.item()`).
+
+    Returns:
+        `Tuple`: `(lower_bound, upper_bound)` of *input*, typed according to *output_tensor*.
+    """
+    num_elements = input.shape[0]
+
+    # 1-based ranks handed to torch.kthvalue
+    k_lower = round(num_elements * (1 - lower_percentile * 0.01))
+    k_upper = round(num_elements * upper_percentile * 0.01)
+
+    upper_bound = torch.kthvalue(input, k=k_upper).values
+    if lower_percentile == 0:
+        # zero with the same dtype and device as the upper bound
+        lower_bound = upper_bound * 0
+    else:
+        # k-th smallest of the negated tensor, negated back
+        lower_bound = -torch.kthvalue(-input, k=k_lower).values
+
+    if output_tensor:
+        return lower_bound, upper_bound
+    return lower_bound.item(), upper_bound.item()
+
+
+def linear_quantize(input, scale, zero_point, inplace=False):
+    """
+    Map a floating-point tensor onto the integer grid: `round(input / scale + zero_point)`.
+
+    Args:
+        input (`torch.Tensor`):
+            Floating-point tensor to quantize. It is overwritten and returned when *inplace* is `True`.
+        scale (`torch.Tensor`):
+            Scaling factor; viewed as `(-1, 1, 1, 1)` for 4-D inputs, `(-1, 1)` for 2-D inputs, `(-1)` otherwise.
+        zero_point (`torch.Tensor`):
+            Offset added after scaling; reshaped the same way as *scale*.
+        inplace (`bool`, *optional*, defaults to `False`):
+            Whether to compute inplace or not.
+
+    Returns:
+        `torch.Tensor`: Linearly quantized value of *input* according to *scale* and *zero_point*.
+    """
+    # view shape for scale/zero_point: 4-D (convolutional) inputs, 2-D (linear) inputs, anything else
+    rank = len(input.shape)
+    if rank == 4:
+        broadcast_shape = (-1, 1, 1, 1)
+    elif rank == 2:
+        broadcast_shape = (-1, 1)
+    else:
+        broadcast_shape = (-1,)
+    scale = scale.view(*broadcast_shape)
+    zero_point = zero_point.view(*broadcast_shape)
+    inv_scale = 1.0 / scale
+    # quantized = float / scale + zero_point
+    if not inplace:
+        return torch.round(inv_scale * input + zero_point)
+    input.mul_(inv_scale).add_(zero_point).round_()
+    return input
+
+
+def symmetric_linear_quantization_params(num_bits, saturation_min, saturation_max, per_channel=False):
+    """
+    Compute the scaling factor with the given quantization range for symmetric quantization.
+
+    Args:
+        num_bits (`int`):
+            Quantization bitwidth; the larger bound magnitude is divided by `2 ** (num_bits - 1) - 1`.
+        saturation_min (`torch.Tensor`):
+            Lower bound for quantization range.
+        saturation_max (`torch.Tensor`):
+            Upper bound for quantization range.
+        per_channel (`bool`, *optional*, defaults to `False`):
+            Whether to or not use channel-wise quantization.
+
+    Returns:
+        `torch.Tensor`: Scaling factor that linearly quantizes the given range between *saturation_min* and
+        *saturation_max*.
+    """
+    with torch.no_grad():  # the range statistics need no gradient
+        max_level = 2 ** (num_bits - 1) - 1
+        abs_min = saturation_min.abs()
+        abs_max = saturation_max.abs()
+        if per_channel:
+            largest, _ = torch.max(torch.stack([abs_min, abs_max], dim=1), dim=1)
+        else:
+            # Python's built-in `max` applied to the two bound tensors
+            largest = max(abs_min, abs_max)
+        scale = torch.clamp(largest, min=1e-8) / max_level
+
+    return scale
+
+
+class SymmetricQuantFunction(Function):
+    """
+    Class to quantize the given floating-point values using symmetric quantization with given range and bitwidth.
+    """
+
+    @staticmethod
+    def forward(ctx, x, k, percentile_mode, scale):
+        """
+        Args:
+            x (`torch.Tensor`):
+                Floating point tensor to be quantized.
+            k (`int`):
+                Quantization bitwidth.
+            percentile_mode (`bool`):
+                Whether or not to use percentile calibration. Not read by this method.
+            scale (`torch.Tensor`):
+                Pre-calculated scaling factor for *x*. Note that the current implementation of SymmetricQuantFunction
+                requires pre-calculated scaling factor.
+
+        Returns:
+            `torch.Tensor`: Symmetric-quantized value of *x*.
+        """
+        zero_point = torch.tensor(0.0, device=scale.device)
+
+        n = 2 ** (k - 1) - 1
+        new_quant_x = linear_quantize(x, scale, zero_point, inplace=False)
+        new_quant_x = torch.clamp(new_quant_x, -n, n - 1)  # NOTE(review): upper bound is n - 1, not n; confirm intent
+
+        ctx.scale = scale
+        return new_quant_x
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        # Gradient w.r.t. `x` is grad_output / scale; the other forward() inputs are not differentiated.
+        scale = ctx.scale
+        if len(grad_output.shape) == 4:
+            scale = scale.view(-1, 1, 1, 1)
+        elif len(grad_output.shape) == 2:
+            scale = scale.view(-1, 1)
+        else:
+            scale = scale.view(-1)
+        # exactly one entry per forward() input: (x, k, percentile_mode, scale)
+        return grad_output.clone() / scale, None, None, None
+
+
+class floor_ste(Function):
+    """
+    Straight-through Estimator (STE) for torch.floor(): floors in forward, returns the gradient as-is in backward.
+    """
+
+    @staticmethod
+    def forward(ctx, x):
+        return torch.floor(x)
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        return grad_output.clone()  # identity gradient (a copy of grad_output)
+
+
+class round_ste(Function):
+    """
+    Straight-through Estimator (STE) for torch.round(): rounds in forward, returns the gradient as-is in backward.
+    """
+
+    @staticmethod
+    def forward(ctx, x):
+        return torch.round(x)
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        return grad_output.clone()  # identity gradient (a copy of grad_output)
+
+
+def batch_frexp(inputs, max_bit=31):
+    """
+    Decompose the scaling factor into an integer mantissa (scaled by `2**max_bit`) and a power-of-two exponent.
+
+    Args:
+        inputs (`torch.Tensor`):
+            Target scaling factor to decompose.
+
+    Returns:
+        `Tuple(torch.Tensor, torch.Tensor)`: mantissa and exponent, the exponent being `max_bit - e` of `np.frexp`
+    """
+
+    shape_of_input = inputs.size()
+
+    # flatten the input to a 1-d tensor
+    inputs = inputs.view(-1)
+
+    output_m, output_e = np.frexp(inputs.cpu().numpy())
+    tmp_m = []
+    for m in output_m:
+        int_m_shifted = int(
+            decimal.Decimal(m * (2**max_bit)).quantize(decimal.Decimal("1"), rounding=decimal.ROUND_HALF_UP)
+        )
+        tmp_m.append(int_m_shifted)
+    output_m = np.array(tmp_m)
+
+    output_e = float(max_bit) - output_e
+
+    return (
+        torch.from_numpy(output_m).to(inputs.device).view(shape_of_input),
+        torch.from_numpy(output_e).to(inputs.device).view(shape_of_input),
+    )
+
+
+class FixedPointMul(Function):
+    """
+    Function to perform fixed-point arithmetic that can match integer arithmetic on hardware.
+
+    Args:
+        pre_act (`torch.Tensor`):
+            Input tensor.
+        pre_act_scaling_factor (`torch.Tensor`):
+            Scaling factor of the input tensor *pre_act*.
+        bit_num (`int`):
+            Quantization bitwidth.
+        z_scaling_factor (`torch.Tensor`):
+            Scaling factor of the output tensor.
+        identity (`torch.Tensor`, *optional*):
+            Identity tensor, if exists.
+        identity_scaling_factor (`torch.Tensor`, *optional*):
+            Scaling factor of the identity tensor *identity*, if exists.
+
+    Returns:
+        `torch.Tensor`: Output tensor(*pre_act* if *identity* is not given, otherwise the addition of *pre_act* and
+        *identity*), whose scale is rescaled to *z_scaling_factor*.
+    """
+
+    @staticmethod
+    def forward(
+        ctx,
+        pre_act,
+        pre_act_scaling_factor,
+        bit_num,
+        z_scaling_factor,
+        identity=None,
+        identity_scaling_factor=None,
+    ):
+        if len(pre_act_scaling_factor.shape) == 3:
+            reshape = lambda x: x  # noqa: E731
+        else:
+            reshape = lambda x: x.view(1, 1, -1)  # noqa: E731
+        ctx.identity = identity  # backward() only checks whether this is None
+
+        n = 2 ** (bit_num - 1) - 1
+
+        with torch.no_grad():
+            pre_act_scaling_factor = reshape(pre_act_scaling_factor)
+            if identity is not None:
+                identity_scaling_factor = reshape(identity_scaling_factor)
+
+            ctx.z_scaling_factor = z_scaling_factor
+
+            z_int = torch.round(pre_act / pre_act_scaling_factor)
+            _A = pre_act_scaling_factor.type(torch.double)
+            _B = (z_scaling_factor.type(torch.float)).type(torch.double)
+            new_scale = _A / _B
+            new_scale = reshape(new_scale)
+
+            m, e = batch_frexp(new_scale)  # integer mantissa and exponent of the rescale ratio
+
+            output = z_int.type(torch.double) * m.type(torch.double)
+            output = torch.round(output / (2.0**e))
+
+            if identity is not None:
+                # needs addition of identity activation
+                wx_int = torch.round(identity / identity_scaling_factor)
+
+                _A = identity_scaling_factor.type(torch.double)
+                _B = (z_scaling_factor.type(torch.float)).type(torch.double)
+                new_scale = _A / _B
+                new_scale = reshape(new_scale)
+
+                m1, e1 = batch_frexp(new_scale)
+                output1 = wx_int.type(torch.double) * m1.type(torch.double)
+                output1 = torch.round(output1 / (2.0**e1))
+
+                output = output1 + output
+
+            return torch.clamp(output.type(torch.float), -n - 1, n)
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        # One gradient per forward() input, in order; only `pre_act` (0) and `identity` (4) receive one.
+        pre_act_grad = grad_output.clone() / ctx.z_scaling_factor
+        identity_grad = grad_output.clone() / ctx.z_scaling_factor if ctx.identity is not None else None
+        return pre_act_grad, None, None, None, identity_grad, None
diff --git a/phivenv/Lib/site-packages/transformers/models/idefics/__init__.py b/phivenv/Lib/site-packages/transformers/models/idefics/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..4adb66825445f25a1c34bd7b3b86e60eed7be85f
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/idefics/__init__.py
@@ -0,0 +1,30 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:  # the star-imports below are evaluated by static type checkers only
+    from .configuration_idefics import *
+    from .image_processing_idefics import *
+    from .modeling_idefics import *
+    from .modeling_tf_idefics import *
+    from .processing_idefics import *
+else:  # at runtime, register a `_LazyModule` for this package in `sys.modules` instead
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/phivenv/Lib/site-packages/transformers/models/idefics/__pycache__/__init__.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/idefics/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9df2e34f72e1fcb2b02e8491f9900c243d6819aa
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/idefics/__pycache__/__init__.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/idefics/__pycache__/configuration_idefics.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/idefics/__pycache__/configuration_idefics.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7298457f753fbb3ff5cb409cdad37eb2f4169491
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/idefics/__pycache__/configuration_idefics.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/idefics/__pycache__/image_processing_idefics.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/idefics/__pycache__/image_processing_idefics.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e9ee04ff335c4e68f410d182555bd3e08cda5f69
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/idefics/__pycache__/image_processing_idefics.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/idefics/__pycache__/modeling_idefics.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/idefics/__pycache__/modeling_idefics.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bf66423fe5379910b61f67f4f2beb38315f5bc21
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/idefics/__pycache__/modeling_idefics.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/idefics/__pycache__/modeling_tf_idefics.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/idefics/__pycache__/modeling_tf_idefics.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..446f06c22c9603a6a03a200f0544db08e78895b8
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/idefics/__pycache__/modeling_tf_idefics.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/idefics/__pycache__/perceiver.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/idefics/__pycache__/perceiver.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4124936ac05fdaed6323588ffdde1b07564eb84e
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/idefics/__pycache__/perceiver.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/idefics/__pycache__/perceiver_tf.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/idefics/__pycache__/perceiver_tf.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d72feb9b71b695438741d69474bc2156442be509
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/idefics/__pycache__/perceiver_tf.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/idefics/__pycache__/processing_idefics.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/idefics/__pycache__/processing_idefics.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8b1c036cae89b3fe511175c794ea3bffc288f03e
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/idefics/__pycache__/processing_idefics.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/idefics/__pycache__/vision.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/idefics/__pycache__/vision.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c03c642544c4eb30b803e961798a36965ac5952f
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/idefics/__pycache__/vision.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/idefics/__pycache__/vision_tf.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/idefics/__pycache__/vision_tf.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9fbca5848fd6bc2862c7a4eb39ffab81e7d6ebf8
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/idefics/__pycache__/vision_tf.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/idefics/configuration_idefics.py b/phivenv/Lib/site-packages/transformers/models/idefics/configuration_idefics.py
new file mode 100644
index 0000000000000000000000000000000000000000..e8320b98725d0b02c85783464efe11dbb0e8f0a8
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/idefics/configuration_idefics.py
@@ -0,0 +1,325 @@
+# coding=utf-8
+# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Idefics model configuration"""
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class IdeficsVisionConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the vision-related configuration of an Idefics model; [`IdeficsConfig`]
+    keeps an instance of it in its `vision_config` attribute. Instantiating a configuration with the defaults will
+    yield a similar configuration to that of the Idefics-9B.
+
+    e.g. [HuggingFaceM4/idefics-9b](https://huggingface.co/HuggingFaceM4/idefics-9b)
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        embed_dim (`int`, *optional*, defaults to 768):
+            Dimensionality of the encoder layers and the pooler layer. (elsewhere referred to as `hidden_size`)
+        image_size (`int`, *optional*, defaults to 224):
+            The size (resolution) of each image.
+        intermediate_size (`int`, *optional*, defaults to 5120):
+            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+        patch_size (`int`, *optional*, defaults to 14):
+            The size (resolution) of each patch.
+        num_hidden_layers (`int`, *optional*, defaults to 32):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        num_channels (`int`, *optional*, defaults to 3):
+            Number of image channels.
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"selu"`, `"gelu_new"` and `"quick_gelu"` are supported.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the layer normalization layers.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        initializer_factor (`float`, *optional*, defaults to 1.0):
+            A factor for initializing all weight matrices (should be kept to 1.0, used internally for initialization
+            testing).
+    """
+
+    model_type = "idefics_vision"
+    attribute_map = {
+        "hidden_size": "embed_dim",  # alias: `hidden_size` -> `embed_dim` (see the `embed_dim` entry in Args)
+    }
+
+    def __init__(
+        self,
+        embed_dim=768,
+        image_size=224,
+        intermediate_size=5120,
+        patch_size=14,
+        num_hidden_layers=32,
+        num_attention_heads=16,
+        num_channels=3,
+        hidden_act="gelu",
+        layer_norm_eps=1e-5,
+        attention_dropout=0.0,
+        initializer_range=0.02,
+        initializer_factor=1.0,
+        **kwargs,
+    ):
+        self.embed_dim = embed_dim
+        self.image_size = image_size
+        self.intermediate_size = intermediate_size
+        self.patch_size = patch_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.num_channels = num_channels
+        self.layer_norm_eps = layer_norm_eps
+        self.attention_dropout = attention_dropout
+        self.initializer_range = initializer_range
+        self.initializer_factor = initializer_factor
+        self.hidden_act = hidden_act
+
+        super().__init__(**kwargs)
+
+
+class IdeficsPerceiverConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the Perceiver Resampler settings of an Idefics model; [`IdeficsConfig`]
+    keeps an instance of it in its `perceiver_config` attribute. Instantiating a configuration with the defaults will
+    yield a similar configuration to that of the Idefics-9B.
+
+    e.g. [HuggingFaceM4/idefics-9b](https://huggingface.co/HuggingFaceM4/idefics-9b)
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        use_resampler (`bool`, *optional*, defaults to `False`):
+            Whether or not to use the resampler
+        resampler_n_latents (`int`, *optional*, defaults to 64):
+            Number of latent embeddings to resample ("compress") the input sequence to (usually < 128).
+        resampler_depth (`int`, *optional*, defaults to 6):
+            Depth of the Perceiver Resampler (Transformer w/ cross attention). Should be shallow (< 3).
+        resampler_n_heads (`int`, *optional*, defaults to 16):
+            Number of heads in each Transformer block (for multi-headed self-attention).
+        resampler_head_dim (`int`, *optional*, defaults to 96):
+            Dimensionality of each head projection in the Transformer block.
+        qk_layer_norms_perceiver (`bool`, *optional*, defaults to `False`):
+            Whether or not to use qk layer norms in perceiver
+    """
+
+    model_type = "idefics_perciever"  # NOTE(review): spelled "perciever" (sic); runtime identifier, keep as-is
+
+    def __init__(
+        self,
+        use_resampler=False,
+        resampler_n_latents=64,
+        resampler_depth=6,  # NOTE(review): the docstring advises "< 3" but the default is 6 - confirm intent
+        resampler_n_heads=16,
+        resampler_head_dim=96,
+        qk_layer_norms_perceiver=False,
+        **kwargs,
+    ):
+        self.use_resampler = use_resampler
+        self.resampler_n_latents = resampler_n_latents
+        self.resampler_depth = resampler_depth
+        self.resampler_n_heads = resampler_n_heads
+        self.resampler_head_dim = resampler_head_dim
+        self.qk_layer_norms_perceiver = qk_layer_norms_perceiver
+
+        super().__init__(**kwargs)
+
+
+class IdeficsConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`IdeficsModel`]. It is used to instantiate an
+    Idefics model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the Idefics-9B.
+
+    e.g. [HuggingFaceM4/idefics-9b](https://huggingface.co/HuggingFaceM4/idefics-9b)
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        additional_vocab_size (`int`, *optional*, defaults to 0):
+            Additional vocabulary size of the model, typically for the special "<img>" token. Additional vocab tokens
+            are always trainable whereas regular vocab tokens can be frozen or not.
+        vocab_size (`int`, *optional*, defaults to 32000):
+            Vocabulary size of the Idefics model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`~IdeficsModel`]
+        hidden_size (`int`, *optional*, defaults to 4096):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 11008):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 32):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 32):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        dropout (`float`, *optional*, defaults to 0.0):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the decoder.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        alpha_initializer (`str`, *optional*, defaults to `"zeros"`):
+            Initialization type for the alphas.
+        alphas_initializer_range (`float`, *optional*, defaults to 0.0):
+            The standard deviation of the truncated_normal_initializer for initializing the alphas in the Gated Cross
+            Attention.
+        alpha_type (`str`, *optional*, defaults to `"float"`):
+            Whether the gating alphas should be vectors or single floats.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-6):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        pad_token_id (`int`, *optional*, defaults to 0):
+            Padding token id.
+        bos_token_id (`int`, *optional*, defaults to 1):
+            Beginning of stream token id.
+        eos_token_id (`int`, *optional*, defaults to 2):
+            End of stream token id.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether to tie weight embeddings
+        cross_layer_interval (`int`, *optional*, defaults to 1):
+            Interval for cross attention (from text to image) layers.
+        qk_layer_norms (`bool`, *optional*, defaults to `False`): Whether to add layer norm after q and k
+        freeze_text_layers (`bool`, *optional*, defaults to `True`): Whether to freeze text layers
+        freeze_text_module_exceptions (`list`, *optional*, defaults to `[]`):
+            Exceptions to freezing text layers when `freeze_text_layers` is `True`
+        freeze_lm_head (`bool`, *optional*, defaults to `False`): Whether to freeze lm head
+        freeze_vision_layers (`bool`, *optional*, defaults to `True`): Whether to freeze vision layers
+        freeze_vision_module_exceptions (`list`, *optional*, defaults to `[]`):
+            Exceptions to freezing vision layers when `freeze_vision_layers` is `True`
+        use_resampler (`bool`, *optional*, defaults to `False`): Whether to use the Resampler
+        vision_config (`IdeficsVisionConfig`, *optional*): Custom vision config or dict (default config if `None`)
+        perceiver_config (`IdeficsPerceiverConfig`, *optional*): Custom perceiver config or dict (default if `None`)
+
+    Example:
+
+    ```python
+    >>> from transformers import IdeficsModel, IdeficsConfig
+
+    >>> # Initializing an Idefics idefics-9b style configuration
+    >>> configuration = IdeficsConfig()
+
+    >>> # Initializing a model from the idefics-9b style configuration
+    >>> model = IdeficsModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "idefics"
+    sub_configs = {"perceiver_config": IdeficsPerceiverConfig, "vision_config": IdeficsVisionConfig}
+
+    def __init__(
+        self,
+        vocab_size=32000,
+        additional_vocab_size=0,
+        hidden_size=4096,
+        intermediate_size=11008,
+        num_hidden_layers=32,
+        num_attention_heads=32,
+        dropout=0.0,
+        hidden_act="silu",
+        initializer_range=0.02,
+        alpha_initializer="zeros",
+        alphas_initializer_range=0.0,
+        alpha_type="float",
+        rms_norm_eps=1e-6,
+        use_cache=True,
+        pad_token_id=0,
+        bos_token_id=1,
+        eos_token_id=2,
+        tie_word_embeddings=False,
+        cross_layer_interval=1,
+        qk_layer_norms=False,
+        freeze_text_layers=True,
+        freeze_text_module_exceptions=[],
+        freeze_lm_head=False,
+        freeze_vision_layers=True,
+        freeze_vision_module_exceptions=[],
+        use_resampler=False,
+        vision_config=None,
+        perceiver_config=None,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.additional_vocab_size = additional_vocab_size
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.dropout = dropout
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.alpha_initializer = alpha_initializer
+        self.alphas_initializer_range = alphas_initializer_range
+        self.alpha_type = alpha_type
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+
+        self.cross_layer_interval = cross_layer_interval
+        self.qk_layer_norms = qk_layer_norms
+        self.freeze_vision_layers = freeze_vision_layers
+        self.freeze_text_layers = freeze_text_layers
+        self.freeze_lm_head = freeze_lm_head
+        # Copy the exception lists so instances never alias the shared `[]` defaults (or a caller's own list).
+        self.freeze_text_module_exceptions = list(freeze_text_module_exceptions or [])
+        self.freeze_vision_module_exceptions = list(freeze_vision_module_exceptions or [])
+
+        self.use_resampler = use_resampler
+
+        if perceiver_config is None:
+            self.perceiver_config = IdeficsPerceiverConfig()
+        elif isinstance(perceiver_config, dict):
+            self.perceiver_config = IdeficsPerceiverConfig(**perceiver_config)
+        elif isinstance(perceiver_config, IdeficsPerceiverConfig):
+            self.perceiver_config = perceiver_config
+
+        if vision_config is None:
+            self.vision_config = IdeficsVisionConfig()
+        elif isinstance(vision_config, dict):
+            self.vision_config = IdeficsVisionConfig(**vision_config)
+        elif isinstance(vision_config, IdeficsVisionConfig):
+            self.vision_config = vision_config
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+        # IMPORTANT: Do not do any __init__ args-based checks in the constructor, since
+        # PretrainedConfig.from_dict first instantiates the class with the config dict and only then
+        # updates the config object with `kwargs` from from_pretrained, so during the instantiation
+        # of this object many attributes have default values and haven't yet been overridden.
+        # Do any required checks inside `from_pretrained` once the superclass' `from_pretrained` was run.
+
+
+__all__ = ["IdeficsConfig"]
diff --git a/phivenv/Lib/site-packages/transformers/models/idefics/image_processing_idefics.py b/phivenv/Lib/site-packages/transformers/models/idefics/image_processing_idefics.py
new file mode 100644
index 0000000000000000000000000000000000000000..190e1d31dc7861891d71eda0a87f5b2c51bafbbe
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/idefics/image_processing_idefics.py
@@ -0,0 +1,192 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Image processor class for Idefics."""
+
+from typing import Callable, Optional, Union
+
+from PIL import Image
+
+from ...image_processing_utils import BaseImageProcessor, BatchFeature
+from ...image_transforms import resize, to_channel_dimension_format
+from ...image_utils import (
+    ChannelDimension,
+    ImageInput,
+    PILImageResampling,
+    make_list_of_images,
+    to_numpy_array,
+    valid_images,
+)
+from ...utils import TensorType, is_torch_available
+
+
+IDEFICS_STANDARD_MEAN = [0.48145466, 0.4578275, 0.40821073]  # fallback `image_mean` of IdeficsImageProcessor
+IDEFICS_STANDARD_STD = [0.26862954, 0.26130258, 0.27577711]  # fallback `image_std` of IdeficsImageProcessor
+
+
+def convert_to_rgb(image):
+    """Return `image` in RGB mode, compositing non-RGB input over a white background first."""
+    if image.mode == "RGB":
+        return image
+
+    # A bare `image.convert("RGB")` creates a wrong background for transparent images, so the
+    # image is alpha-composited onto a white RGBA canvas before the final conversion.
+    rgba = image.convert("RGBA")
+    white_canvas = Image.new("RGBA", rgba.size, (255, 255, 255))
+    flattened = Image.alpha_composite(white_canvas, rgba)
+    return flattened.convert("RGB")
+
+
+class IdeficsImageProcessor(BaseImageProcessor):
+    r"""
+    Constructs an Idefics image processor.
+
+    Args:
+        image_size (`int`, *optional*, defaults to 224):
+            Resize to image size
+        image_mean (`float` or `list[float]`, *optional*, defaults to `IDEFICS_STANDARD_MEAN`):
+            Mean to use if normalizing the image. This is a float or list of floats the length of the number of
+            channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess`
+            method.
+        image_std (`float` or `list[float]`, *optional*, defaults to `IDEFICS_STANDARD_STD`):
+            Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
+            number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess`
+            method.
+        image_num_channels (`int`, *optional*, defaults to 3):
+            Number of image channels.
+        do_rescale (`bool`, *optional*, defaults to `True`):
+            Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in
+            the `preprocess` method.
+        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
+            Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess`
+            method.
+    """
+
+    model_input_names = ["pixel_values"]
+
+    def __init__(
+        self,
+        image_size: int = 224,
+        image_mean: Optional[Union[float, list[float]]] = None,
+        image_std: Optional[Union[float, list[float]]] = None,
+        image_num_channels: Optional[int] = 3,
+        do_rescale: bool = True,
+        rescale_factor: Union[int, float] = 1 / 255,
+        **kwargs,
+    ) -> None:
+        super().__init__(**kwargs)
+
+        self.image_size = image_size
+        self.image_num_channels = image_num_channels
+        self.image_mean = image_mean if image_mean is not None else IDEFICS_STANDARD_MEAN
+        self.image_std = image_std if image_std is not None else IDEFICS_STANDARD_STD
+        self.do_rescale = do_rescale
+        self.rescale_factor = rescale_factor
+
+    def preprocess(
+        self,
+        images: ImageInput,
+        image_num_channels: Optional[int] = 3,
+        image_size: Optional[int] = None,
+        image_mean: Optional[Union[float, list[float]]] = None,
+        image_std: Optional[Union[float, list[float]]] = None,
+        transform: Optional[Callable] = None,
+        do_rescale: Optional[bool] = None,
+        rescale_factor: Optional[float] = None,
+        return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH,
+        **kwargs,
+    ) -> TensorType:
+        """
+        Preprocess a batch of images.
+
+        Args:
+            images (`ImageInput`):
+                A list of images to preprocess.
+            image_size (`int`, *optional*, defaults to `self.image_size`):
+                Resize to image size
+            image_num_channels (`int`, *optional*, defaults to 3):
+                Number of image channels. Not used by the transforms applied in this method.
+            image_mean (`float` or `list[float]`, *optional*, defaults to `self.image_mean`):
+                Mean to use if normalizing the image. This is a float or list of floats the length of the number of
+                channels in the image.
+            image_std (`float` or `list[float]`, *optional*, defaults to `self.image_std`):
+                Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
+                number of channels in the image.
+            transform (`Callable`, *optional*, defaults to `None`):
+                A custom transform function that accepts a single image can be passed for training. For example,
+                `torchvision.Compose` can be used to compose multiple transforms. If `None` - an inference mode is
+                assumed - and then a preset of inference-specific transforms will be applied to the images
+            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
+                Whether to rescale the image by the specified scale `rescale_factor`. Only applies to the preset
+                inference transforms (i.e. when `transform` is `None`).
+            rescale_factor (`int` or `float`, *optional*, defaults to `self.rescale_factor`):
+                Scale factor to use if rescaling the image.
+            return_tensors (`str` or `TensorType`, *optional*, defaults to `TensorType.PYTORCH`):
+                Tensor type of the output of the preset inference transforms. Ignored when `transform` is given.
+
+        Returns:
+            The processed images: a tensor of type `return_tensors`, the `torch.stack` of the transformed images
+            when `transform` is given, or an empty list when `images` is an empty list.
+
+        """
+        image_size = image_size if image_size is not None else self.image_size
+        image_num_channels = image_num_channels if image_num_channels is not None else self.image_num_channels
+        image_mean = image_mean if image_mean is not None else self.image_mean
+        image_std = image_std if image_std is not None else self.image_std
+        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
+        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
+        size = (image_size, image_size)
+
+        if isinstance(images, list) and len(images) == 0:
+            return []
+
+        images = self.fetch_images(images)
+        images = make_list_of_images(images)
+
+        if not valid_images(images):
+            raise ValueError(
+                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
+                "torch.Tensor, tf.Tensor or jax.ndarray."
+            )
+
+        # For training a user needs to pass their own set of transforms as a Callable.
+        # For reference this is what was used in the original IDEFICS training:
+        # transform = transforms.Compose([
+        #     convert_to_rgb,
+        #     transforms.RandomResizedCrop((size, size), scale=(0.9, 1.0), interpolation=transforms.InterpolationMode.BICUBIC),
+        #     transforms.ToTensor(),
+        #     transforms.Normalize(mean=image_mean, std=image_std),
+        # ])
+        if transform is not None:
+            if not is_torch_available():
+                raise ImportError("To pass in `transform` torch must be installed")
+            import torch
+
+            images = [transform(x) for x in images]
+            return torch.stack(images)
+
+        # for inference we do the exact transforms that were used to train IDEFICS
+        images = [convert_to_rgb(x) for x in images]
+        images = [to_numpy_array(x) for x in images]  # further transforms expect numpy arrays
+        images = [resize(x, size, resample=PILImageResampling.BICUBIC) for x in images]
+        if do_rescale:  # honour the flag: previously the rescale ran even when `do_rescale` was False
+            images = [self.rescale(image=image, scale=rescale_factor) for image in images]
+        images = [self.normalize(x, mean=image_mean, std=image_std) for x in images]
+        images = [to_channel_dimension_format(x, ChannelDimension.FIRST) for x in images]
+        images = BatchFeature(data={"pixel_values": images}, tensor_type=return_tensors)["pixel_values"]
+
+        return images
+
+
+__all__ = ["IdeficsImageProcessor"]
diff --git a/phivenv/Lib/site-packages/transformers/models/idefics/modeling_idefics.py b/phivenv/Lib/site-packages/transformers/models/idefics/modeling_idefics.py
new file mode 100644
index 0000000000000000000000000000000000000000..00d449b7aaa124f334afac049420138549ee598e
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/idefics/modeling_idefics.py
@@ -0,0 +1,1540 @@
+# coding=utf-8
+# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch Idefics model."""
+
+from dataclasses import dataclass
+from typing import Any, Callable, Optional, Union
+
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint
+from torch import nn
+
+from ...activations import ACT2FN
+from ...cache_utils import Cache, DynamicCache
+from ...generation import GenerationMixin
+from ...modeling_attn_mask_utils import AttentionMaskConverter
+from ...modeling_flash_attention_utils import FlashAttentionKwargs
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import ModelOutput
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PretrainedConfig, PreTrainedModel
+from ...processing_utils import Unpack
+from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, is_torch_flex_attn_available, logging
+from ...utils.deprecation import deprecate_kwarg
+from .configuration_idefics import IdeficsConfig
+from .perceiver import IdeficsPerceiverResampler
+from .vision import IdeficsVisionEmbeddings, IdeficsVisionTransformer
+
+
+if is_torch_flex_attn_available():
+    from torch.nn.attention.flex_attention import BlockMask
+
+    from ...integrations.flex_attention import make_flex_block_causal_mask
+
+
+logger = logging.get_logger(__name__)
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Base class for Idefics model's outputs that may also contain a past key/values (to speed up sequential decoding).
+    """
+)
+class IdeficsBaseModelOutputWithPast(ModelOutput):
+    r"""
+    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+        Sequence of hidden-states at the output of the last layer of the model.
+
+        If `past_key_values` is used, only the last hidden-state of the sequences, of shape `(batch_size, 1,
+        hidden_size)`, is output.
+    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
+        `(batch_size, num_heads, sequence_length, embed_size_per_head)` and optionally, if
+        `config.is_encoder_decoder=True`, 2 additional tensors of shape `(batch_size, num_heads,
+        encoder_sequence_length, embed_size_per_head)`.
+
+        Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally, if
+        `config.is_encoder_decoder=True`, in the cross-attention blocks) that can be used (see `past_key_values`
+        input) to speed up sequential decoding.
+    image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
+        Tuple of `torch.FloatTensor` (one for the output of the image embeddings) of shape `(batch_size,
+        num_images, sequence_length, hidden_size)`.
+
+        Image hidden states of the model produced by the vision encoder, and optionally by the perceiver.
+    """
+
+    last_hidden_state: Optional[torch.FloatTensor] = None
+    past_key_values: Optional[tuple[tuple[torch.FloatTensor]]] = None
+    hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    attentions: Optional[tuple[torch.FloatTensor]] = None
+    image_hidden_states: Optional[tuple[torch.FloatTensor]] = None
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Base class for Idefics causal language model (or autoregressive) outputs.
+    """
+)
+class IdeficsCausalLMOutputWithPast(ModelOutput):
+    r"""
+    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+        Language modeling loss (for next-token prediction).
+    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
+        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
+        `(batch_size, num_heads, sequence_length, embed_size_per_head)`.
+
+        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
+        `past_key_values` input) to speed up sequential decoding.
+    image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
+        Tuple of `torch.FloatTensor` (one for the output of the image embeddings) of shape `(batch_size,
+        num_images, sequence_length, hidden_size)`.
+
+        Image hidden states of the model produced by the vision encoder, and optionally by the perceiver.
+    """
+
+    loss: Optional[torch.FloatTensor] = None
+    logits: Optional[torch.FloatTensor] = None
+    past_key_values: Optional[list[torch.FloatTensor]] = None
+    hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    attentions: Optional[tuple[torch.FloatTensor]] = None
+    image_hidden_states: Optional[tuple[torch.FloatTensor]] = None
+
+
+def expand_inputs_for_generation(
+    input_ids,
+    expand_size=1,
+    is_encoder_decoder=False,
+    attention_mask=None,
+    encoder_outputs=None,
+    **model_kwargs,
+):
+    """
+    Repeat every batch row `expand_size` times, keeping copies adjacent (row 0, row 0, ..., row 1, ...).
+
+    `input_ids` and, when present, `token_type_ids`, `attention_mask` and `image_attention_mask` are expanded
+    along dim 0. Of `pixel_values`, `image_encoder_embeddings` and `perceiver_embeddings`, only the first one
+    that is not `None` is expanded. `is_encoder_decoder` and `encoder_outputs` are accepted but unused.
+
+    Returns:
+        `tuple`: the expanded `input_ids` and `model_kwargs`, which always holds the four image-related keys.
+    """
+    image_keys = ("pixel_values", "image_encoder_embeddings", "perceiver_embeddings")
+    batch_size = input_ids.shape[0]
+    row_index = torch.arange(batch_size).view(-1, 1).repeat(1, expand_size).view(-1).to(input_ids.device)
+
+    def _expand(tensor):
+        return tensor.index_select(0, row_index)
+
+    input_ids = _expand(input_ids)
+
+    # make sure every image-related key exists (None when the caller did not pass it)
+    for key in (*image_keys, "image_attention_mask"):
+        model_kwargs[key] = model_kwargs.get(key)
+
+    if "token_type_ids" in model_kwargs:
+        model_kwargs["token_type_ids"] = _expand(model_kwargs["token_type_ids"])
+    if attention_mask is not None:
+        model_kwargs["attention_mask"] = _expand(attention_mask)
+    if model_kwargs["image_attention_mask"] is not None:
+        model_kwargs["image_attention_mask"] = _expand(model_kwargs["image_attention_mask"])
+
+    # same as an if/elif chain: stop at the first image input that is set
+    first_set = next((key for key in image_keys if model_kwargs[key] is not None), None)
+    if first_set is not None:
+        model_kwargs[first_set] = _expand(model_kwargs[first_set])
+    return input_ids, model_kwargs
+
+
+def freeze_model(model, module_exceptions=None):
+    """Freeze `model` in place, leaving modules whose class is named in `module_exceptions` trainable.
+
+    Supported names are "LayerNorm", "Linear" and "Embedding"; any other name raises `KeyError`.
+    Returns the same `model`. The default is `None` (nothing kept trainable) rather than a shared mutable list.
+    """
+    mapping = {"LayerNorm": nn.LayerNorm, "Linear": nn.Linear, "Embedding": nn.Embedding}
+    # isinstance accepts a tuple of types; an empty tuple matches nothing, so everything gets frozen
+    trainable_types = tuple(mapping[name] for name in (module_exceptions or ()))
+    for module in model.modules():
+        # set the flag explicitly in both directions to avoid any mistakes
+        module.requires_grad_(isinstance(module, trainable_types))
+    return model
+
+
+class IdeficsDecoupledEmbedding(nn.Embedding):
+    # Derived from https://pytorch.org/docs/stable/_modules/torch/nn/modules/sparse.html#Embedding
+    """
+    Implements a decoupling of parameters to allow freezing (or not) a subset of the embeddings. In practice, the
+    regular `weight` can be trained or frozen (i.e. `partially_freeze=True`), and if `num_additional_embeddings` > 0,
+    then it will create `num_additional_embeddings` additional parameters that are always trained. If
+    `num_additional_embeddings=0`, then the module defaults back to the regular behavior of `nn.Embedding`.
+    """
+
+    def __init__(
+        self,
+        num_embeddings,
+        num_additional_embeddings,
+        embedding_dim,
+        partially_freeze: Optional[bool] = False,
+        device=None,
+        dtype=None,
+        padding_idx=None,
+        **kwargs,
+    ) -> None:
+        """
+        Args:
+            num_embeddings (`int`):
+                Size of the dictionary of embeddings
+            num_additional_embeddings (`int`):
+                Number of additional embeddings. Only useful when you `partially_freeze=True`.
+            embedding_dim (`int`):
+                The size of each embedding vector
+            partially_freeze: (`bool`, *optional*, defaults to `False`):
+                If `True`, the regular `weight` will be frozen. The additional embedding is never frozen here.
+            padding_idx (`int`, *optional*):
+                The padding index (needs to be less than num_embeddings)
+            device, dtype, kwargs:
+                Forwarded to `nn.Embedding.__init__`; `device` and `dtype` are also used for the additional
+                embedding.
+
+        Raises:
+            ValueError: If `padding_idx` is greater than or equal to `num_embeddings`.
+
+        Note: other `nn.Embedding` options such as `max_norm` or `norm_type` are accepted through `**kwargs`, but
+        `forward` looks rows up with `F.embedding(input_ids, self.weight)` only, so they are not applied there.
+        """
+        # valid indices are 0..num_embeddings - 1, so `padding_idx == num_embeddings` is already out of range
+        if padding_idx is not None and padding_idx >= num_embeddings:
+            raise ValueError(f"padding_idx must be within num_embeddings. Got {padding_idx} and {num_embeddings}")
+        super().__init__(num_embeddings, embedding_dim, padding_idx=padding_idx, device=device, dtype=dtype, **kwargs)
+        self.num_embeddings = num_embeddings
+        self.padding_idx = padding_idx
+        self.num_additional_embeddings = num_additional_embeddings
+        self.partially_freeze = partially_freeze
+
+        if partially_freeze:
+            self.weight.requires_grad_(False)
+
+        if self.num_additional_embeddings > 0:
+            self.additional_embedding = nn.Embedding(
+                num_embeddings=self.num_additional_embeddings,
+                embedding_dim=embedding_dim,
+                device=device,
+                dtype=dtype,
+            )
+
+    def forward(self, input_ids):
+        """
+        we have 2 embeddings, with different indices - one pretrained self.weight and another
+        self.additional_embedding.weight that is being trained.
+
+        in order to make a lookup of the input ids, we:
+        1. find out the indices of the entries belonging to the 2nd embedding
+        2. extract those values while subtracting the size of the first embedding (num_embeddings), since the 2nd
+           embedding starts from 0 and not num_embeddings
+        3. perform the 2nd embedding lookup
+        4. now we handle the 1st embedding, we overwrite indices belonging to the 2nd embedding with a padding index
+        5. perform the 1st embedding lookup
+        6. now we overwrite the values in the 1st embedding lookup with the values of the 2nd embedding lookup
+
+        note: for the 1st embedding lookup we could have looked up only the low indices and not do the padding, but
+        then we have to create a new tensor and populate it with 2 tensors that are spread out across various indices -
+        i.e. not a simple concat - I haven't benchmarked the complex case if it's any faster, given that seqlens are
+        usually relatively short it's probably not faster or if faster not by much - but might be a good idea to
+        measure.
+
+        """
+        if self.num_additional_embeddings == 0:
+            return F.embedding(input_ids, self.weight)
+
+        # Clone so that we don't modify the original input_ids later on
+        input_ids = input_ids.clone()
+        additional_vocab_indices = torch.where(input_ids >= self.num_embeddings)
+        input_ids_additional_vocab = input_ids[additional_vocab_indices]
+        additional_embeddings = self.additional_embedding(input_ids_additional_vocab - self.num_embeddings)
+
+        # for successful lookup replace input_ids with 0, the results of these will be discarded anyway
+        input_ids[additional_vocab_indices] = 0
+        full_vector = F.embedding(input_ids, self.weight)
+
+        # overwrite the records with high indices
+        full_vector[additional_vocab_indices] = additional_embeddings
+
+        return full_vector
+
+    def extra_repr(self) -> str:
+        return f"num_embeddings={self.num_embeddings}, num_additional_embeddings={self.num_additional_embeddings}, embedding_dim={self.embedding_dim}, partially_freeze={self.partially_freeze}"
+
+
+class IdeficsDecoupledLinear(nn.Linear):
+    # Derived from https://pytorch.org/docs/stable/_modules/torch/nn/modules/linear.html#Linear
+    """
+    `nn.Linear` whose output can be widened by a second, separately parameterized projection.
+
+    The base `weight`/`bias` are frozen when `partially_freeze=True`. When `out_additional_features > 0` an extra
+    `nn.Linear(in_features, out_additional_features)` is created (never frozen here) and its output is concatenated
+    after the base output; with `out_additional_features=0` the forward pass is that of a plain `nn.Linear`.
+    """
+
+    def __init__(
+        self,
+        in_features: int,
+        out_features: int,
+        out_additional_features: int = 0,
+        bias: bool = True,
+        partially_freeze: bool = True,
+        device=None,
+        dtype=None,
+    ) -> None:
+        """
+        Args:
+            in_features (`int`): Size of each input sample.
+            out_features (`int`): Number of output features produced by the base (optionally frozen) projection.
+            out_additional_features (`int`, defaults to 0): Number of extra output features appended last.
+            bias (`bool`, defaults to `True`): Whether the base and the additional projections have a bias.
+            partially_freeze (`bool`, defaults to `True`): If `True`, the base `weight` and `bias` are frozen.
+            device, dtype: Passed to both projections.
+        """
+        super().__init__(in_features, out_features, bias, device, dtype)
+        self.in_features, self.out_features = in_features, out_features
+        self.out_additional_features = out_additional_features
+        self.partially_freeze = partially_freeze
+
+        # only the base projection is frozen; `additional_fc` (if any) is left untouched
+        if partially_freeze:
+            frozen = (self.weight, self.bias) if bias else (self.weight,)
+            for param in frozen:
+                param.requires_grad_(False)
+
+        if out_additional_features > 0:
+            extra_kwargs = {"bias": bias, "device": device, "dtype": dtype}
+            self.additional_fc = nn.Linear(in_features, out_additional_features, **extra_kwargs)
+
+    def forward(self, input: torch.Tensor) -> torch.Tensor:
+        base_output = F.linear(input, self.weight, self.bias)
+        if self.out_additional_features <= 0:
+            return base_output
+
+        # extra features go after the base ones along the last dim
+        return torch.cat((base_output, self.additional_fc(input)), -1)
+
+    def extra_repr(self) -> str:
+        """Overwriting `nn.Linear.extra_repr` to include new parameters."""
+        shape_part = f"in_features={self.in_features}, out_features={self.out_features}"
+        extra_part = f"out_additional_features={self.out_additional_features}, bias={self.bias is not None}"
+        return f"{shape_part}, {extra_part}, partially_freeze={self.partially_freeze}"
+
+
+# this was adapted from LlamaRMSNorm
+class IdeficsRMSNorm(nn.Module):
+    """Root-mean-square layer norm with a learned per-feature scale and no mean-centering or bias."""
+
+    def __init__(self, hidden_size, eps=1e-6):
+        """
+        IdeficsRMSNorm is equivalent to T5LayerNorm
+        """
+        super().__init__()
+        self.variance_epsilon = eps
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+
+    def forward(self, hidden_states):
+        # mean of squares over the last dim, always computed in float32
+        mean_square = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
+        normed = hidden_states * torch.rsqrt(mean_square + self.variance_epsilon)
+        # convert into half-precision if necessary
+        half_weight = self.weight.dtype in (torch.float16, torch.bfloat16)
+        return self.weight * (normed.to(self.weight.dtype) if half_weight else normed)
+
+    def extra_repr(self):
+        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
+
+
+# this was adapted from LlamaRotaryEmbedding
+class IdeficsEmbedding(torch.nn.Module):
+    """Rotary position embedding tables: caches `cos`/`sin` buffers indexed by position, regrown on demand."""
+
+    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
+        super().__init__()
+        self.dim = dim
+        self.max_position_embeddings = max_position_embeddings
+        self.base = base
+        # one inverse frequency per pair of channels: base ** (-2i / dim)
+        exponents = torch.arange(0, self.dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / self.dim
+        self.register_buffer("inv_freq", 1.0 / (self.base**exponents), persistent=False)
+
+        # Build here to make `torch.jit.trace` work.
+        self._set_cos_sin_cache(
+            seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()
+        )
+
+    def _set_cos_sin_cache(self, seq_len, device, dtype):
+        """(Re)build the non-persistent `cos_cached`/`sin_cached` buffers for positions `0..seq_len - 1`."""
+        self.max_seq_len_cached = seq_len
+        t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq)
+
+        freqs = torch.einsum("i,j->ij", t, self.inv_freq)
+        # Different from paper, but it uses a different permutation in order to obtain the same calculation
+        emb = torch.cat((freqs, freqs), dim=-1)
+        self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
+        self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
+
+    def forward(self, x, seq_len=None):
+        """Return `(cos, sin)` for the first `seq_len` positions in `x.dtype`; `seq_len` defaults to `x.shape[-2]`."""
+        # x: [bs, num_attention_heads, seq_len, head_size]
+        if seq_len is None:
+            seq_len = x.shape[-2]  # the `None` default used to fail on the comparison below
+        if seq_len > self.max_seq_len_cached:
+            self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
+
+        return self.cos_cached[:seq_len].to(dtype=x.dtype), self.sin_cached[:seq_len].to(dtype=x.dtype)
+
+
+def rotate_half(x):
+    """Swap the two halves of the last dim, negating the second: `[a, b] -> [-b, a]`."""
+    half = x.shape[-1] // 2
+    front, back = x[..., :half], x[..., half:]
+    return torch.cat((-back, front), dim=-1)
+
+
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
+    """Rotate the query and key tensors with Rotary Position Embedding.
+
+    Args:
+        q (`torch.Tensor`): The query tensor.
+        k (`torch.Tensor`): The key tensor.
+        cos (`torch.Tensor`): Cosine table of the rotary embedding, indexed by position on its first dim.
+        sin (`torch.Tensor`): Sine table of the rotary embedding, indexed by position on its first dim.
+        position_ids (`torch.Tensor`):
+            Positions of the tokens held in `q` and `k`, used to gather rows of `cos` and `sin`. Offset
+            position ids can be passed here when working with a KV-cache.
+        unsqueeze_dim (`int`, *optional*, defaults to 1):
+            Dimension inserted into `cos[position_ids]` and `sin[position_ids]` so that they broadcast against
+            `q` and `k`. E.g. gathered tables of shape `[batch_size, seq_len, head_dim]` need `unsqueeze_dim=1`
+            for `q`/`k` of shape `[batch_size, heads, seq_len, head_dim]`, and `unsqueeze_dim=2` for `q`/`k` of
+            shape `[batch_size, seq_len, heads, head_dim]`.
+
+    Returns:
+        `tuple(torch.Tensor)`: the rotated query and key tensors, in that order.
+    """
+
+    def _rotate(states, cos_sel, sin_sel):
+        return (states * cos_sel) + (rotate_half(states) * sin_sel)
+
+    cos_sel, sin_sel = (table[position_ids].unsqueeze(unsqueeze_dim) for table in (cos, sin))
+    return _rotate(q, cos_sel, sin_sel), _rotate(k, cos_sel, sin_sel)
+
+
+# this was adapted from LlamaMLP
+class IdeficsMLP(nn.Module):
+    """Gated feed-forward block: `down_proj(act_fn(gate_proj(x)) * up_proj(x))`, all projections bias-free."""
+
+    def __init__(self, hidden_size: int, intermediate_size: int, hidden_act: str):
+        super().__init__()
+        # creation order is kept as gate -> down -> up
+        self.gate_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
+        self.down_proj = nn.Linear(intermediate_size, hidden_size, bias=False)
+        self.up_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
+        self.act_fn = ACT2FN[hidden_act]  # activation looked up by name
+
+    def forward(self, x):
+        gate = self.act_fn(self.gate_proj(x))
+        gated = gate * self.up_proj(x)
+        return self.down_proj(gated)
+
+
+# Copied from transformers.models.siglip.modeling_siglip.eager_attention_forward
+def eager_attention_forward(
+    module: nn.Module,  # only `module.training` is read, to switch dropout on or off
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: Optional[torch.Tensor],  # additive mask, summed onto the scaled scores when given
+    scaling: float,
+    dropout: float = 0.0,
+    **kwargs,  # accepted and ignored
+):
+    attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling
+    if attention_mask is not None:
+        attn_weights = attn_weights + attention_mask
+    # softmax over the last axis runs in float32, then the weights are cast back to the query dtype
+    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
+    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
+    # weighted sum of values, then dims 1 and 2 are swapped (presumably heads <-> sequence; confirm with callers)
+    attn_output = torch.matmul(attn_weights, value)
+    attn_output = attn_output.transpose(1, 2).contiguous()
+    # returns the attention output together with the post-dropout attention weights
+    return attn_output, attn_weights
+
+
+# this was adapted from LlamaAttention
+class IdeficsAttention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+    def __init__(
+        self,
+        hidden_size: int,
+        num_heads: int,
+        dropout: float = 0.0,
+        is_cross_attention: bool = False,
+        config: PretrainedConfig = None,
+        qk_layer_norms: bool = False,
+        layer_idx: Optional[int] = None,
+    ):
+        """
+        Args:
+            hidden_size (`int`): Width of the query input and of the output projection.
+            num_heads (`int`): Number of attention heads; must divide `hidden_size`.
+            dropout (`float`, defaults to 0.0): Attention dropout probability, applied in training mode only.
+            is_cross_attention (`bool`, defaults to `False`):
+                If `True`, keys/values are projected from `config.vision_config.embed_dim` features when that
+                attribute exists (from `hidden_size` features otherwise).
+            config (`PretrainedConfig`): Read for `_attn_implementation`, `vision_config` and `rms_norm_eps`.
+            qk_layer_norms (`bool`, defaults to `False`): If `True`, RMS-normalize queries and keys per head.
+            layer_idx (`int`, *optional*): Index handed to the cache when key/value states are stored.
+
+        Raises:
+            ValueError: If `hidden_size` is not divisible by `num_heads`, or if
+                `torch.nn.functional.scaled_dot_product_attention` is missing.
+        """
+        super().__init__()
+        self.config = config
+        self.hidden_size = hidden_size
+        self.num_heads = num_heads
+        self.head_dim = hidden_size // num_heads
+        self.dropout = dropout
+        self.is_causal = True
+        self.scaling = self.head_dim**-0.5
+
+        self.layer_idx = layer_idx
+        if layer_idx is None:
+            logger.warning_once(
+                f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
+                "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
+                "when creating this class."
+            )
+
+        if (self.head_dim * num_heads) != self.hidden_size:
+            raise ValueError(
+                f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
+                f" and `num_heads`: {num_heads})."
+            )
+
+        self.is_cross_attention = is_cross_attention
+
+        if not hasattr(nn.functional, "scaled_dot_product_attention"):
+            raise ValueError("this model requires pytorch 2.0 or higher")
+
+        # cross-attention keys/values may come from inputs of a different width than the queries
+        kv_input_dim = self.hidden_size
+        if self.is_cross_attention and hasattr(config.vision_config, "embed_dim"):
+            kv_input_dim = config.vision_config.embed_dim
+        self.q_proj = nn.Linear(self.hidden_size, num_heads * self.head_dim, bias=False)
+        self.k_proj = nn.Linear(kv_input_dim, num_heads * self.head_dim, bias=False)
+        self.v_proj = nn.Linear(kv_input_dim, num_heads * self.head_dim, bias=False)
+        self.o_proj = nn.Linear(num_heads * self.head_dim, hidden_size, bias=False)
+        self.rotary_emb = IdeficsEmbedding(self.head_dim)
+
+        self.qk_layer_norms = qk_layer_norms
+        if self.qk_layer_norms:
+            self.q_layer_norm = IdeficsRMSNorm(self.head_dim, eps=config.rms_norm_eps)
+            self.k_layer_norm = IdeficsRMSNorm(self.head_dim, eps=config.rms_norm_eps)
+
+    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
+        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        key_value_states: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[tuple[torch.Tensor]] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        """
+        Self-attention over `hidden_states`, or cross-attention onto `key_value_states` (when the layer was built
+        with `is_cross_attention=True` or `key_value_states` is passed).
+
+        Rotary position embeddings are applied in the self-attention case only. When `past_key_values` is
+        given, the new key/value states are stored in it under `self.layer_idx`. `use_cache` is not read here.
+
+        Returns:
+            `tuple`: the projected attention output of shape `(bsz, q_len, hidden_size)` and the attention
+            weights, which are `None` unless `output_attentions=True`.
+        """
+        # if key_value_states are provided this layer is used as a cross-attention layer
+        is_cross_attention = self.is_cross_attention or key_value_states is not None
+
+        bsz, q_len, _ = hidden_states.size()
+
+        query_states = self.q_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        if not is_cross_attention:
+            key_states = self.k_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+            value_states = self.v_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        else:
+            _, kv_len, _ = key_value_states.size()  # Note that, in this case, `kv_len` == `kv_seq_len`
+            key_states = self.k_proj(key_value_states).view(bsz, kv_len, self.num_heads, self.head_dim).transpose(1, 2)
+            value_states = (
+                self.v_proj(key_value_states).view(bsz, kv_len, self.num_heads, self.head_dim).transpose(1, 2)
+            )
+
+        kv_seq_len = key_states.shape[-2]
+        if past_key_values is not None:
+            # presumably `cache_position[0]` is the number of positions already cached; confirm with callers
+            kv_seq_len += cache_position[0]
+
+        if not is_cross_attention:
+            cos, sin = self.rotary_emb(value_states, seq_len=max(kv_seq_len, q_len))
+            query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
+        # [bsz, nh, t, hd]
+
+        if past_key_values is not None:
+            # sin and cos are specific to RoPE models; cache_position needed for the static cache
+            cache_kwargs = {"cache_position": cache_position}
+            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        if self.qk_layer_norms:
+            query_states = self.q_layer_norm(query_states)
+            key_states = self.k_layer_norm(key_states)
+
+        attention_interface: Callable = eager_attention_forward
+
+        if self.config._attn_implementation != "eager":
+            if self.config._attn_implementation == "sdpa" and output_attentions:
+                logger.warning_once(
+                    "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
+                    'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
+                )
+            else:
+                attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            dropout=0.0 if not self.training else self.dropout,
+            scaling=self.scaling,
+            **kwargs,
+        )
+
+        attn_output = attn_output.reshape(bsz, q_len, -1).contiguous()
+        attn_output = self.o_proj(attn_output)
+
+        # only hand the weights back when they were requested (this condition used to be inverted)
+        if not output_attentions:
+            attn_weights = None
+
+        return attn_output, attn_weights
+
+
+# this was adapted from LlamaDecoderLayer
+class IdeficsDecoderLayer(GradientCheckpointingLayer):
+    """Pre-norm decoder block: self-attention then MLP, each followed by dropout and a residual add."""
+
+    def __init__(self, config: IdeficsConfig, layer_idx: Optional[int] = None):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.dropout = config.dropout
+        self.self_attn = IdeficsAttention(
+            hidden_size=self.hidden_size,
+            num_heads=config.num_attention_heads,
+            dropout=config.dropout,
+            config=config,
+            layer_idx=layer_idx,
+        )
+        self.mlp = IdeficsMLP(
+            hidden_size=self.hidden_size,
+            intermediate_size=config.intermediate_size,
+            hidden_act=config.hidden_act,
+        )
+        self.input_layernorm = IdeficsRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = IdeficsRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[tuple[torch.Tensor]] = None,
+        output_attentions: Optional[bool] = False,
+        use_cache: Optional[bool] = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
+    ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
+        """
+        Run the block on `hidden_states`.
+
+        Args:
+            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
+                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+            position_ids (`torch.LongTensor`, *optional*): forwarded to the self-attention module.
+            past_key_values (*optional*): cached past key and value projection states
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            use_cache (`bool`, *optional*): forwarded to the self-attention module.
+            cache_position (`torch.LongTensor`, *optional*): forwarded to the self-attention module.
+            kwargs: extra keyword arguments, forwarded to the self-attention module.
+
+        Returns:
+            `tuple`: `(hidden_states,)`, or `(hidden_states, self_attn_weights)` when `output_attentions` is
+            truthy.
+        """
+
+        attn_kwargs = {
+            "attention_mask": attention_mask,
+            "position_ids": position_ids,
+            "past_key_values": past_key_values,
+            "output_attentions": output_attentions,
+            "use_cache": use_cache,
+            "cache_position": cache_position,
+        }
+        # Self Attention
+        attn_input = self.input_layernorm(hidden_states)
+        attn_output, self_attn_weights = self.self_attn(hidden_states=attn_input, **attn_kwargs, **kwargs)
+        attn_output = nn.functional.dropout(attn_output, p=self.dropout, training=self.training)
+        hidden_states = hidden_states + attn_output
+
+        # Fully Connected
+        mlp_output = self.mlp(self.post_attention_layernorm(hidden_states))
+        mlp_output = nn.functional.dropout(mlp_output, p=self.dropout, training=self.training)
+        hidden_states = hidden_states + mlp_output
+
+        if output_attentions:
+            return (hidden_states, self_attn_weights)
+        return (hidden_states,)
+
+
+class IdeficsGatedCrossAttentionLayer(GradientCheckpointingLayer):
+    def __init__(self, config: IdeficsConfig, layer_idx: Optional[int] = None):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.cross_attn = IdeficsAttention(
+            hidden_size=self.hidden_size,
+            num_heads=config.num_attention_heads,
+            is_cross_attention=True,
+            dropout=config.dropout,
+            config=config,
+            qk_layer_norms=config.qk_layer_norms,
+            layer_idx=layer_idx,
+        )
+        self.mlp = IdeficsMLP(
+            hidden_size=self.hidden_size,
+            intermediate_size=config.intermediate_size,
+            hidden_act=config.hidden_act,
+        )
+        self.input_layernorm = IdeficsRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = IdeficsRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.config = config.dropout
+
+        self.act_cross_attn = nn.Tanh()
+        self.act_dense = nn.Tanh()
+
+        if config.alpha_initializer == "zeros":
+            if config.alpha_type == "vector":
+                self.alpha_cross_attn = nn.Parameter(torch.zeros(1, 1, self.hidden_size))
+                self.alpha_dense = nn.Parameter(torch.zeros(1, 1, self.hidden_size))
+            elif config.alpha_type == "float":
+                self.alpha_cross_attn = nn.Parameter(torch.zeros(1))
+                self.alpha_dense = nn.Parameter(torch.zeros(1))
+            else:
+                raise ValueError(f"Unknown value for `alpha_type` ({config.alpha_type})")
+
+        elif config.alpha_initializer == "ones":
+            if config.alpha_type == "vector":
+                self.alpha_cross_attn = nn.Parameter(torch.ones(1, 1, self.hidden_size))
+                self.alpha_dense = nn.Parameter(torch.ones(1, 1, self.hidden_size))
+            elif config.alpha_type == "float":
+                self.alpha_cross_attn = nn.Parameter(torch.ones(1))
+                self.alpha_dense = nn.Parameter(torch.ones(1))
+            else:
+                raise ValueError(f"Unknown value for `alpha_type` ({config.alpha_type})")
+
+        elif config.alpha_initializer in {"normal", "gaussian", "random"}:
+            if config.alpha_type == "vector":
+                self.alpha_cross_attn = nn.Parameter(
+                    torch.normal(mean=0.0, std=config.alphas_initializer_range, size=(1, 1, self.hidden_size))
+                )
+                self.alpha_dense = nn.Parameter(
+                    torch.normal(mean=0.0, std=config.alphas_initializer_range, size=(1, 1, self.hidden_size))
+                )
+            elif config.alpha_type == "float":
+                self.alpha_cross_attn = nn.Parameter(
+                    torch.normal(mean=0.0, std=config.alphas_initializer_range, size=(1))
+                )
+                self.alpha_dense = nn.Parameter(torch.normal(mean=0.0, std=config.alphas_initializer_range, size=(1)))
+            else:
+                raise ValueError(f"Unknown value for `alpha_type` ({config.alpha_type})")
+
+        else:
+            raise NotImplementedError(f"Alpha initialization scheme {config.alpha_initializer} not yet implemented!")
+
+        if not (hasattr(self, "alpha_cross_attn") and hasattr(self, "alpha_dense")):
+            raise ValueError("Alpha parameters not initialized correctly!")
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        image_hidden_states: Optional[torch.Tensor] = None,
+        image_attention_mask: Optional[torch.Tensor] = None,
+        cross_attention_gate: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = False,
+        use_cache: Optional[bool] = False,
+        past_key_values: Optional[tuple[torch.Tensor]] = None,
+        **kwargs,
+    ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
+                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+            image_attention_mask (`torch.FloatTensor`, *optional*): image attention mask of size
+                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+            cross_attention_gate (`torch.FloatTensor`, *optional*):
+                gate of size `(batch, seq_len)` used to zero-out cross-attention output for tokens attending no images.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            use_cache (`bool`, *optional*):
+                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
+                (see `past_key_values`).
+            past_key_values (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
+        """
+        if image_hidden_states is None:
+            raise ValueError(
+                "`image_hidden_states` is required for Idefics cross attention module which are visual features to be"
+                " conditioned on."
+            )
+
+        if cross_attention_gate is None:
+            raise ValueError(
+                "`cross_attention_gate` is required for Idefics cross attention module to zero-out the cross-attention hidden_states attending to no images."
+            )
+
+        if past_key_values is not None:
+            raise NotImplementedError("Past key value states are not implemented for Idefics cross attention module.")
+
+        residual = hidden_states
+
+        hidden_states = self.input_layernorm(hidden_states)
+
+        # Self Attention
+        hidden_states, self_attn_weights = self.cross_attn(
+            hidden_states=hidden_states,
+            key_value_states=image_hidden_states,
+            attention_mask=image_attention_mask,
+            output_attentions=output_attentions,
+            **kwargs,
+        )
+        hidden_states = nn.functional.dropout(hidden_states, p=self.config, training=self.training)
+        # Fill in zeros for cross_attention hidden_states of tokens attending to no images
+        hidden_states = hidden_states.masked_fill((cross_attention_gate == 0)[:, :, None], 0.0)
+        hidden_states = residual + self.act_cross_attn(self.alpha_cross_attn) * hidden_states
+
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = nn.functional.dropout(hidden_states, p=self.config, training=self.training)
+        hidden_states = residual + self.act_dense(self.alpha_dense) * hidden_states
+
+        outputs = (hidden_states,)
+
+        if output_attentions:
+            outputs += (self_attn_weights,)
+
+        return outputs
+
+
+@auto_docstring
+class IdeficsPreTrainedModel(PreTrainedModel):
+    config: IdeficsConfig
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["IdeficsDecoderLayer", "IdeficsGatedCrossAttentionLayer"]
+    _supports_sdpa = True
+
+    _supports_flash_attn = False  # only eager/sdpa creation is supported
+    _can_compile_fullgraph = False  # IDEFICS cannot compile due to dynamic control flow when checking inputs
+    _supports_attention_backend = True
+
+    def _init_weights(self, module):
+        # important: this ported version of Idefics isn't meant for training from scratch - only
+        # inference and fine-tuning - so the proper init weights code has been removed - the m4 code
+        # base should be used for training from scratch and it contains the correct code.
+        std = self.config.initializer_range
+        if isinstance(module, (nn.Linear, nn.Conv2d)):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+        elif isinstance(module, nn.LayerNorm):
+            module.weight.data.fill_(1.0)
+            module.bias.data.zero_()
+        elif isinstance(module, IdeficsRMSNorm):
+            module.weight.data.fill_(1.0)
+        elif isinstance(module, IdeficsVisionEmbeddings):
+            module.class_embedding.data.normal_()
+        elif isinstance(module, IdeficsGatedCrossAttentionLayer):
+            if self.config.alpha_initializer == "zeros":
+                module.alpha_cross_attn.data.zero_()
+                module.alpha_dense.data.zero_()
+            elif self.config.alpha_initializer == "ones":
+                module.alpha_cross_attn.data.fill_(1.0)
+                module.alpha_dense.data.fill_(1.0)
+            elif self.config.alpha_initializer in {"normal", "gaussian", "random"}:
+                module.alpha_cross_attn.data.normal_(mean=0.0, std=self.config.alphas_initializer_range)
+                module.alpha_dense.data.normal_(mean=0.0, std=self.config.alphas_initializer_range)
+        elif isinstance(module, IdeficsPerceiverResampler):
+            module.latents.data.normal_()
+
+
+@auto_docstring
+class IdeficsModel(IdeficsPreTrainedModel):
+    """
+    Transformer decoder consisting of `config.num_hidden_layers` layers. Each layer is a [`IdeficsDecoderLayer`]
+
+    Args:
+        config: IdeficsConfig
+    """
+
+    def __init__(self, config: IdeficsConfig):
+        super().__init__(config)
+        self.config = config
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+
+        self.embed_tokens = IdeficsDecoupledEmbedding(
+            num_embeddings=config.vocab_size,
+            num_additional_embeddings=config.additional_vocab_size,
+            embedding_dim=config.hidden_size,
+            partially_freeze=config.freeze_text_layers,
+            padding_idx=self.padding_idx,
+        )
+
+        self.image_size = config.vision_config.image_size
+        self.vision_config = config.vision_config
+        # The module using it is not a PreTrainedModel subclass so we need this
+        self.vision_config._attn_implementation = config._attn_implementation
+        self.vision_model = IdeficsVisionTransformer(config.vision_config)
+
+        # Perceiver Resampler
+        if config.use_resampler:
+            perceiver_config = config.perceiver_config
+            self.perceiver_resampler = IdeficsPerceiverResampler(
+                config,
+                config.vision_config.embed_dim,
+                perceiver_config.resampler_depth,
+                perceiver_config.resampler_n_heads,
+                perceiver_config.resampler_head_dim,
+                perceiver_config.resampler_n_latents,
+            )
+
+        self.layers = nn.ModuleList(
+            [IdeficsDecoderLayer(config, layer_idx=i) for i in range(config.num_hidden_layers)]
+        )
+
+        self.cross_layer_interval = config.cross_layer_interval
+        num_cross_layers = config.num_hidden_layers // self.cross_layer_interval
+        self.gated_cross_attn_layers = nn.ModuleList(
+            [IdeficsGatedCrossAttentionLayer(config, layer_idx=i) for i in range(num_cross_layers)]
+        )
+        self.gradient_checkpointing = False
+
+        self.norm = IdeficsRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+        self.freeze_relevant_params(config)
+
+    def freeze_relevant_params(self, config=None):
+        if config is None:
+            config = self.config
+
+        if config.freeze_text_layers:
+            self.freeze_text_layers(config.freeze_text_module_exceptions)
+
+        if config.freeze_vision_layers:
+            freeze_model(self.vision_model, module_exceptions=config.freeze_vision_module_exceptions)
+
+    def freeze_text_layers(self, module_exceptions=[]):
+        for module in [self.layers, self.norm]:
+            freeze_model(module, module_exceptions=module_exceptions)
+
+    def freeze_vision_layers(self, module_exceptions=[]):
+        freeze_model(self.vision_model, module_exceptions=module_exceptions)
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        image_encoder_embeddings: Optional[torch.FloatTensor] = None,
+        perceiver_embeddings: Optional[torch.FloatTensor] = None,
+        image_attention_mask: Optional[torch.Tensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        interpolate_pos_encoding: Optional[bool] = False,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> Union[tuple, IdeficsBaseModelOutputWithPast]:
+        r"""
+        image_encoder_embeddings (`torch.FloatTensor`, *optional*):
+            The output of the image encoder.
+        perceiver_embeddings (`torch.FloatTensor`, *optional*):
+            The output of the perceiver resampler.
+        image_attention_mask (`torch.LongTensor`, *optional*):
+            The attention mask for the image encoder.
+        """
+        device = input_ids.device if input_ids is not None else inputs_embeds.device
+
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+        if self.gradient_checkpointing and self.training and use_cache:
+            logger.warning_once(
+                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
+            )
+            use_cache = False
+
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+
+        # TODO (joao): remove this exception in v4.56 -- it exists for users that try to pass a legacy cache
+        if not isinstance(past_key_values, (type(None), Cache)):
+            raise ValueError("The `past_key_values` should be either a `Cache` object or `None`.")
+
+        if use_cache and past_key_values is None:
+            past_key_values = DynamicCache(config=self.config)
+
+        batch_size, seq_length, _ = inputs_embeds.shape
+        past_key_values_length = past_key_values.get_seq_length() if past_key_values is not None else 0
+        seq_length_with_past = seq_length + past_key_values_length
+
+        if cache_position is None:
+            cache_position = torch.arange(
+                past_key_values_length, past_key_values_length + inputs_embeds.shape[1], device=inputs_embeds.device
+            )
+
+        if attention_mask is not None and position_ids is None:
+            # create position_ids on the fly for batch generation
+            position_ids = attention_mask.long().cumsum(-1) - 1
+            position_ids.masked_fill_(attention_mask == 0, 1)
+            position_ids = position_ids[:, -seq_length:]
+        elif position_ids is None:
+            position_ids = cache_position.unsqueeze(0)
+
+        if sum([x is None for x in [pixel_values, image_encoder_embeddings, perceiver_embeddings]]) != 2:
+            raise ValueError(
+                "Exactly 1 of pixel_values, image_encoder_embeddings or perceiver_embeddings has to be not-None."
+            )
+
+        elif pixel_values is not None:
+            pixel_values = pixel_values.to(dtype=self.dtype, device=device)  # fp16 compatibility
+            batch_size, num_images = pixel_values.shape[:2]
+            pixel_values = pixel_values.contiguous().view(batch_size * num_images, *pixel_values.shape[2:])
+
+            # Get sequence from the vision encoder
+            image_hidden_states = self.vision_model(
+                pixel_values=pixel_values, interpolate_pos_encoding=interpolate_pos_encoding
+            ).last_hidden_state
+
+        elif image_encoder_embeddings is not None:
+            batch_size, num_images, image_seq_len, image_hidden_size = image_encoder_embeddings.size()
+            image_hidden_states = image_encoder_embeddings.to(dtype=self.dtype, device=device)
+            image_hidden_states = image_hidden_states.view(batch_size * num_images, image_seq_len, image_hidden_size)
+
+        if self.config.use_resampler:
+            if perceiver_embeddings is None:
+                perceiver_embeddings = self.perceiver_resampler(image_hidden_states)
+                image_seq_len, image_hidden_size = perceiver_embeddings.size(1), perceiver_embeddings.size(2)
+            else:
+                batch_size, num_images, image_seq_len, image_hidden_size = perceiver_embeddings.size()
+            image_hidden_states = perceiver_embeddings
+        elif perceiver_embeddings is None:
+            image_seq_len, image_hidden_size = image_hidden_states.size(1), image_hidden_states.size(2)
+        else:
+            raise ValueError("If `perceiver_embeddings` are passed, use_resampler should be True")
+
+        image_hidden_states = image_hidden_states.view(batch_size, num_images * image_seq_len, image_hidden_size)
+        # # Hack to use the model in full language modeling mode
+        # image_attention_mask = torch.zeros(batch_size, seq_length, 1, dtype=torch.long, device=image_hidden_states.device)
+        # Make image_attention_mask compatible with hidden states
+        text_seq_len = image_attention_mask.size(1)
+        image_attention_mask = image_attention_mask.unsqueeze(-1)
+        image_attention_mask = image_attention_mask.repeat(1, 1, 1, image_seq_len)
+        image_attention_mask = image_attention_mask.view(batch_size, text_seq_len, num_images * image_seq_len)
+
+        if image_hidden_states is not None:
+            image_batch_size, image_sequence_length, _ = image_hidden_states.size()
+            image_hidden_shape = (image_batch_size, image_sequence_length)
+            if image_attention_mask is None:
+                image_attention_mask = torch.ones(image_hidden_shape, device=device)
+            image_attention_mask = self.invert_attention_mask(image_attention_mask)
+        else:
+            image_attention_mask = None
+
+        # cross_attention_gate:
+        # For any tokens attending to no images, the hidden_states coming out of the cross-attention should be zeroed-out.
+        # `image_attention_mask` has shape [bsz, 1, num_images, hidden_size] with elements equal to either 0.0 or a very negative number.
+        # If any of the elements are 0.0, then the token is attending to at least one image and the gate value is 1. Otherwise the gate value is 0.
+        # `cross_attention_gate` has shape [bsz, seq_len] with elements equal to either 0.0 or 1.0.
+        cross_attention_gate = ((((image_attention_mask == 0.0).any(dim=-1)).to(dtype=self.dtype)).squeeze(dim=1)).to(
+            device
+        )
+
+        # embed positions
+        if attention_mask is None:
+            attention_mask = torch.ones(
+                (batch_size, seq_length_with_past), dtype=torch.bool, device=inputs_embeds.device
+            )
+
+        attention_mask = self._update_causal_mask(
+            attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
+        )
+
+        hidden_states = inputs_embeds
+
+        # decoder layers
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+
+        for idx, decoder_layer in enumerate(self.layers):
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+
+            # TODO(ls): Add cross attention values to respective lists
+            if idx % self.cross_layer_interval == 0:
+                cross_attn_block = self.gated_cross_attn_layers[idx // self.cross_layer_interval]
+                outputs = cross_attn_block(
+                    hidden_states,
+                    attention_mask,
+                    image_hidden_states,
+                    image_attention_mask=image_attention_mask,
+                    cross_attention_gate=cross_attention_gate,
+                    output_attentions=output_attentions,
+                    use_cache=use_cache,
+                    past_key_values=None,  # not implemented
+                    **kwargs,
+                )
+                hidden_states = outputs[0]
+
+            layer_outputs = decoder_layer(
+                hidden_states,
+                attention_mask=attention_mask,
+                position_ids=position_ids,
+                past_key_values=past_key_values,
+                output_attentions=output_attentions,
+                use_cache=use_cache,
+                cache_position=cache_position,
+                **kwargs,
+            )
+            hidden_states = layer_outputs[0]
+
+            if output_attentions:
+                all_self_attns += (layer_outputs[1],)
+
+        hidden_states = self.norm(hidden_states)
+
+        # add hidden states from the last decoder layer
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+
+        image_hidden_states = image_hidden_states.view(batch_size, num_images, image_seq_len, image_hidden_size)
+
+        return IdeficsBaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+            image_hidden_states=image_hidden_states,
+        )
+
+    # Copied from transformers.models.gptj.modeling_gptj.GPTJModel._update_causal_mask
+    def _update_causal_mask(
+        self,
+        attention_mask: Union[torch.Tensor, "BlockMask"],
+        input_tensor: torch.Tensor,
+        cache_position: torch.Tensor,
+        past_key_values: Cache,
+        output_attentions: bool = False,
+    ):
+        """
+        Turn the user-facing attention mask into the form expected by the configured attention implementation.
+
+        Depending on `self.config._attn_implementation` this returns:
+
+        - `flash_attention_2`: `attention_mask` unchanged when it contains at least one zero, otherwise `None`;
+        - `flex_attention`: the result of `make_flex_block_causal_mask` for tensor inputs, or the input as is;
+        - `sdpa` (no compilable cache, no attention outputs): `None` when
+          `AttentionMaskConverter._ignore_causal_mask_sdpa` says the explicit mask can be dropped;
+        - otherwise: the 4D mask built by `_prepare_4d_causal_attention_mask_with_cache_position`.
+
+        Args:
+            attention_mask: mask supplied to `forward` (tensor or flex-attention block mask), may be `None`.
+            input_tensor: input embeddings; supplies the dtype, batch size and query length of the mask.
+            cache_position: positions of the current tokens, forwarded to the 4D mask builder.
+            past_key_values: cache object (or `None`) used to read the number of tokens already seen.
+            output_attentions: when `True`, the SDPA-specific shortcuts are skipped.
+        """
+        if self.config._attn_implementation == "flash_attention_2":
+            if attention_mask is not None and (attention_mask == 0.0).any():
+                return attention_mask
+            return None
+        if self.config._attn_implementation == "flex_attention":
+            if isinstance(attention_mask, torch.Tensor):
+                attention_mask = make_flex_block_causal_mask(attention_mask)
+            return attention_mask
+
+        # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
+        # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
+        # to infer the attention mask.
+        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+        using_compilable_cache = past_key_values.is_compileable if past_key_values is not None else False
+
+        # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
+        if self.config._attn_implementation == "sdpa" and not using_compilable_cache and not output_attentions:
+            if AttentionMaskConverter._ignore_causal_mask_sdpa(
+                attention_mask,
+                inputs_embeds=input_tensor,
+                past_key_values_length=past_seen_tokens,
+                is_training=self.training,
+            ):
+                return None
+
+        dtype = input_tensor.dtype
+        sequence_length = input_tensor.shape[1]
+        # Key/value length of the mask: the full cache capacity for compilable caches, otherwise the width of
+        # the provided mask, or (without a tensor mask) the seen + current tokens plus one.
+        if using_compilable_cache:
+            target_length = past_key_values.get_max_cache_shape()
+        else:
+            target_length = (
+                attention_mask.shape[-1]
+                if isinstance(attention_mask, torch.Tensor)
+                else past_seen_tokens + sequence_length + 1
+            )
+
+        # In case the provided `attention` mask is 2D, we generate a causal mask here (4D).
+        causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
+            attention_mask,
+            sequence_length=sequence_length,
+            target_length=target_length,
+            dtype=dtype,
+            cache_position=cache_position,
+            batch_size=input_tensor.shape[0],
+        )
+
+        if (
+            self.config._attn_implementation == "sdpa"
+            and attention_mask is not None
+            and attention_mask.device.type in ["cuda", "xpu", "npu"]
+            and not output_attentions
+        ):
+            # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
+            # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
+            # Details: https://github.com/pytorch/pytorch/issues/110213
+            min_dtype = torch.finfo(dtype).min
+            causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)
+
+        return causal_mask
+
+    @staticmethod
+    # Copied from transformers.models.gptj.modeling_gptj.GPTJModel._prepare_4d_causal_attention_mask_with_cache_position
+    def _prepare_4d_causal_attention_mask_with_cache_position(
+        attention_mask: torch.Tensor,
+        sequence_length: int,
+        target_length: int,
+        dtype: torch.dtype,
+        cache_position: torch.Tensor,
+        batch_size: int,
+        **kwargs,
+    ):
+        """
+        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
+        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.
+
+        Args:
+            attention_mask (`torch.Tensor`):
+                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
+                `(batch_size, 1, query_length, key_value_length)`.
+            sequence_length (`int`):
+                The sequence length being processed.
+            target_length (`int`):
+                The target length: when generating with static cache, the mask should be as long as the static cache,
+                to account for the 0 padding, the part of the cache that is not filled yet.
+            dtype (`torch.dtype`):
+                The dtype to use for the 4D attention mask.
+            cache_position (`torch.Tensor`):
+                Indices depicting the position of the input sequence tokens in the sequence.
+            batch_size (`int`):
+                Batch size.
+
+        Returns:
+            `torch.Tensor`: the 4D mask, in which blocked positions hold `torch.finfo(dtype).min` and allowed
+            positions hold 0 (a 4D `attention_mask` input is returned unchanged).
+        """
+        if attention_mask is not None and attention_mask.dim() == 4:
+            # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
+            causal_mask = attention_mask
+        else:
+            min_dtype = torch.finfo(dtype).min
+            # Start fully blocked, then open up the positions each query is allowed to see.
+            causal_mask = torch.full(
+                (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device
+            )
+            if sequence_length != 1:
+                causal_mask = torch.triu(causal_mask, diagonal=1)
+            # Keep the block only for key indices strictly greater than the query's cache position;
+            # multiplying by the boolean zeroes (i.e. allows) every key at or before that position.
+            causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1)
+            causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
+            if attention_mask is not None:
+                causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
+                mask_length = attention_mask.shape[-1]
+                # The sum is 0 exactly where the causal mask allows the key (0) and the 2D mask marks it as
+                # padding (0); those positions are then blocked as well.
+                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
+                    causal_mask.device
+                )
+                padding_mask = padding_mask == 0
+                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
+                    padding_mask, min_dtype
+                )
+
+        return causal_mask
+
+
+class IdeficsForVisionText2Text(IdeficsPreTrainedModel, GenerationMixin):  # IdeficsModel plus a decoupled LM head
+    _tied_weights_keys = ["model.embed_tokens.weight", "lm_head.weight"]
+
+    def __init__(self, config, vision_model=None):  # `vision_model` is accepted but not read in this constructor
+        super().__init__(config)
+        self.model = IdeficsModel(config)
+
+        self.lm_head = IdeficsDecoupledLinear(
+            in_features=config.hidden_size,
+            out_features=config.vocab_size,
+            out_additional_features=config.additional_vocab_size,
+            bias=False,
+            partially_freeze=config.freeze_lm_head,
+        )
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def tie_weights(self):
+        """
+        Override `transformers.modeling_utils.PreTrainedModel.tie_weights` to handle the case of
+        IdeficsDecoupledLinear and IdeficsDecoupledEmbedding.
+        """
+        output_embeddings = self.get_output_embeddings()
+        input_embeddings = self.get_input_embeddings()
+
+        if getattr(self.config, "tie_word_embeddings", True):  # tying is on when the config does not say otherwise
+            output_embeddings.weight = input_embeddings.weight
+            if input_embeddings.num_additional_embeddings > 0:
+                assert output_embeddings.out_additional_features == input_embeddings.num_additional_embeddings
+                output_embeddings.additional_fc.weight = input_embeddings.additional_embedding.weight
+
+        # keep the sizes recorded on the head in sync with the input embeddings (runs whether or not weights are tied)
+        if hasattr(output_embeddings, "out_features") and hasattr(input_embeddings, "num_embeddings"):
+            output_embeddings.out_features = input_embeddings.num_embeddings
+            if hasattr(output_embeddings, "out_additional_features") and hasattr(
+                input_embeddings, "num_additional_embeddings"
+            ):
+                output_embeddings.out_additional_features = input_embeddings.num_additional_embeddings
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        image_encoder_embeddings: Optional[torch.FloatTensor] = None,
+        perceiver_embeddings: Optional[torch.FloatTensor] = None,
+        image_attention_mask: Optional[torch.Tensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        interpolate_pos_encoding: Optional[bool] = False,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> Union[tuple, IdeficsCausalLMOutputWithPast]:
+        r"""
+        image_encoder_embeddings (`torch.FloatTensor`, *optional*):
+            The output of the image encoder.
+        perceiver_embeddings (`torch.FloatTensor`, *optional*):
+            The output of the perceiver resampler.
+        image_attention_mask (`torch.LongTensor`, *optional*):
+            The attention mask for the image encoder.
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+        Example:
+
+        ```python
+        >>> from transformers import AutoProcessor, IdeficsForVisionText2Text
+
+        >>> model = IdeficsForVisionText2Text.from_pretrained("HuggingFaceM4/idefics-9b")
+        >>> processor = AutoProcessor.from_pretrained("HuggingFaceM4/idefics-9b")
+
+        >>> dogs_image_url_1 = "https://huggingface.co/datasets/hf-internal-testing/fixtures_nlvr2/raw/main/image1.jpeg"
+        >>> dogs_image_url_2 = "https://huggingface.co/datasets/hf-internal-testing/fixtures_nlvr2/raw/main/image2.jpeg"
+
+        >>> prompts = [
+        ...     [
+        ...         "User:",
+        ...         dogs_image_url_1,
+        ...         "Describe this image.\nAssistant: An image of two dogs.\n",
+        ...         "User:",
+        ...         dogs_image_url_2,
+        ...         "Describe this image.\nAssistant:",
+        ...     ]
+        ... ]
+        >>> inputs = processor(prompts, return_tensors="pt")
+        >>> generate_ids = model.generate(**inputs, max_new_tokens=6)
+        >>> processor.batch_decode(generate_ids, skip_special_tokens=True)
+        ```"""
+
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        # NOTE: `return_dict` is resolved here but not read again below; the inner model is called with return_dict=True
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # decoder outputs consist of (dec_features, layer_state, dec_hidden, dec_attn)
+        outputs = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            pixel_values=pixel_values,
+            image_encoder_embeddings=image_encoder_embeddings,
+            perceiver_embeddings=perceiver_embeddings,
+            image_attention_mask=image_attention_mask,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            interpolate_pos_encoding=interpolate_pos_encoding,
+            return_dict=True,
+            cache_position=cache_position,
+            **kwargs,
+        )
+
+        hidden_states = outputs[0]  # first entry of the inner model's output
+        logits = self.lm_head(hidden_states)
+
+        loss = None  # stays None unless labels are supplied
+        if labels is not None:
+            loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs)
+
+        return IdeficsCausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            image_hidden_states=outputs.image_hidden_states,
+        )
+
+    def prepare_inputs_for_generation(
+        self,
+        input_ids,
+        attention_mask=None,
+        position_ids=None,
+        inputs_embeds=None,
+        past_key_values=None,
+        cache_position=None,
+        pixel_values=None,
+        image_hidden_states=None,
+        image_attention_mask=None,
+        use_cache=None,
+        **kwargs,
+    ):
+        # Overwritten -- custom processing based on `config.use_resampler`
+
+        images_kwargs = {}
+        if image_hidden_states is not None:  # forward precomputed image features under the matching argument name
+            if self.config.use_resampler:
+                images_kwargs["perceiver_embeddings"] = image_hidden_states
+            else:
+                images_kwargs["image_encoder_embeddings"] = image_hidden_states
+        else:
+            images_kwargs["pixel_values"] = pixel_values
+        images_kwargs["interpolate_pos_encoding"] = kwargs.pop("interpolate_pos_encoding", False)
+
+        model_inputs = super().prepare_inputs_for_generation(
+            input_ids,
+            past_key_values=past_key_values,
+            attention_mask=attention_mask,
+            inputs_embeds=inputs_embeds,
+            cache_position=cache_position,
+            position_ids=position_ids,
+            use_cache=use_cache,
+            image_attention_mask=image_attention_mask,
+            **images_kwargs,
+            **kwargs,
+        )
+
+        if image_attention_mask is not None and inputs_embeds is None:
+            seq_length = model_inputs["input_ids"].shape[1]
+            # keep only the trailing `seq_length` entries along dim 1, matching the ids prepared for this step
+            model_inputs["image_attention_mask"] = image_attention_mask[:, -seq_length:]
+
+        return model_inputs
+
+    def _update_model_kwargs_for_generation(
+        self,
+        outputs: ModelOutput,
+        model_kwargs: dict[str, Any],
+        is_encoder_decoder: bool = False,
+        **kwargs,
+    ) -> dict[str, Any]:
+        model_kwargs = super()._update_model_kwargs_for_generation(
+            outputs,
+            model_kwargs,
+            is_encoder_decoder,
+            **kwargs,
+        )
+
+        if "image_attention_mask" in model_kwargs:
+            image_attention_mask = model_kwargs["image_attention_mask"]
+            last_mask = image_attention_mask[:, -1, :].unsqueeze(1)  # last entry along dim 1, dim kept via unsqueeze
+            if model_kwargs.get("use_cache", True):
+                model_kwargs["image_attention_mask"] = last_mask  # with a cache only the last entry is carried forward
+            else:
+                model_kwargs["image_attention_mask"] = torch.cat([image_attention_mask, last_mask], dim=1)
+
+        # Get the precomputed image_hidden_states
+        model_kwargs["image_hidden_states"] = outputs.image_hidden_states
+        return model_kwargs
+
+
+__all__ = ["IdeficsForVisionText2Text", "IdeficsModel", "IdeficsPreTrainedModel"]
diff --git a/phivenv/Lib/site-packages/transformers/models/idefics/modeling_tf_idefics.py b/phivenv/Lib/site-packages/transformers/models/idefics/modeling_tf_idefics.py
new file mode 100644
index 0000000000000000000000000000000000000000..0e8e75be28f89eb70f61e874b52c458a78e41de7
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/idefics/modeling_tf_idefics.py
@@ -0,0 +1,1778 @@
+# coding=utf-8
+# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""TF 2.0 Idefics model."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+
+import tensorflow as tf
+
+from ... import TFPreTrainedModel
+from ...activations_tf import get_tf_activation
+from ...modeling_tf_outputs import ModelOutput
+from ...modeling_tf_utils import (
+    TFCausalLanguageModelingLoss,
+    TFModelInputType,
+    keras_serializable,
+    shape_list,
+    unpack_inputs,
+)
+from ...tf_utils import invert_attention_mask, scaled_dot_product_attention
+from ...utils import (
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+    replace_return_docstrings,
+)
+from .configuration_idefics import IdeficsConfig
+from .perceiver_tf import TFIdeficsPerceiverResampler
+from .vision_tf import TFIdeficsVisionTransformer
+
+
+logger = logging.get_logger(__name__)
+
+_CONFIG_FOR_DOC = "IdeficsConfig"
+
+
+@dataclass
+class TFIdeficsBaseModelOutputWithPast(ModelOutput):
+    """
+    Base class for Idefics model's outputs that may also contain a past key/values (to speed up sequential decoding).
+
+    Args:
+        last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
+            Sequence of hidden-states at the output of the last layer of the model.
+
+            If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
+            hidden_size)` is output.
+        past_key_values (`tuple(tuple(tf.Tensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+            Tuple of `tuple(tf.Tensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
+            `(batch_size, num_heads, sequence_length, embed_size_per_head)` and optionally, if
+            `config.is_encoder_decoder=True`, 2 additional tensors of shape `(batch_size, num_heads,
+            encoder_sequence_length, embed_size_per_head)`.
+
+            Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
+            `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
+            input) to speed up sequential decoding.
+        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `tf.Tensor` (one for the output of the embeddings, if the model has an embedding layer, +
+            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
+
+            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
+        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`.
+
+            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
+            heads.
+        image_hidden_states (`tuple(tf.Tensor)`, *optional*):
+            Tuple of `tf.Tensor` (one for the output of the image embeddings) of shape `(batch_size, num_images,
+            sequence_length, hidden_size)`.
+
+            Image hidden states of the model produced by the vision encoder, and optionally by the perceiver.
+    """
+
+    last_hidden_state: tf.Tensor | None = None
+    past_key_values: tuple[tuple[tf.Tensor]] | None = None
+    hidden_states: tuple[tf.Tensor] | None = None
+    attentions: tuple[tf.Tensor] | None = None
+    image_hidden_states: tuple[tf.Tensor] | None = None
+
+
+@dataclass
+class TFIdeficsCausalLMOutputWithPast(ModelOutput):
+    """
+    Base class for Idefics causal language model (or autoregressive) outputs.
+
+    Args:
+        loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+            Language modeling loss (for next-token prediction).
+        logits (`tf.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
+            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+        past_key_values (`tuple(tuple(tf.Tensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+            Tuple of `tuple(tf.Tensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
+            `(batch_size, num_heads, sequence_length, embed_size_per_head)`.
+
+            Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
+            `past_key_values` input) to speed up sequential decoding.
+        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `tf.Tensor` (one for the output of the embeddings, if the model has an embedding layer, +
+            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
+
+            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
+        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`.
+
+            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
+            heads.
+        image_hidden_states (`tuple(tf.Tensor)`, *optional*):
+            Tuple of `tf.Tensor` (one for the output of the image embeddings) of shape `(batch_size, num_images,
+            sequence_length, hidden_size)`.
+
+            Image hidden states of the model produced by the vision encoder, and optionally by the perceiver.
+    """
+
+    loss: tf.Tensor | None = None
+    logits: tf.Tensor | None = None
+    past_key_values: list[tf.Tensor] | None = None  # NOTE(review): docstring says tuple of tuples, annotation says list - confirm
+    hidden_states: tuple[tf.Tensor] | None = None
+    attentions: tuple[tf.Tensor] | None = None
+    image_hidden_states: tuple[tf.Tensor] | None = None
+
+
+def expand_inputs_for_generation(
+    input_ids,
+    expand_size=1,
+    is_encoder_decoder=False,
+    attention_mask=None,
+    encoder_outputs=None,
+    **model_kwargs,
+):
+    """
+    Repeat every batch row of `input_ids` and of the known model inputs `expand_size` times.
+
+    Rows are repeated consecutively. `is_encoder_decoder` and `encoder_outputs` are accepted but not
+    used. Of `pixel_values`, `image_encoder_embeddings` and `perceiver_embeddings`, only the first one
+    that is not `None` is expanded. Returns `(input_ids, model_kwargs)`.
+    """
+    batch_rows = tf.range(tf.shape(input_ids)[0])
+    repeat_idx = tf.reshape(tf.repeat(batch_rows, expand_size), [-1])
+    image_keys = ("pixel_values", "image_encoder_embeddings", "perceiver_embeddings")
+
+    input_ids = tf.gather(input_ids, repeat_idx)
+    # these keys are always present in the result, with `None` standing in for a missing input
+    for key in (*image_keys, "image_attention_mask"):
+        model_kwargs[key] = model_kwargs.get(key)
+
+    if "token_type_ids" in model_kwargs:
+        model_kwargs["token_type_ids"] = tf.gather(model_kwargs["token_type_ids"], repeat_idx)
+    if attention_mask is not None:
+        model_kwargs["attention_mask"] = tf.gather(attention_mask, repeat_idx)
+    if model_kwargs["image_attention_mask"] is not None:
+        model_kwargs["image_attention_mask"] = tf.gather(model_kwargs["image_attention_mask"], repeat_idx)
+
+    for key in image_keys:
+        if model_kwargs[key] is not None:
+            model_kwargs[key] = tf.gather(model_kwargs[key], repeat_idx)
+            break
+
+    return input_ids, model_kwargs
+
+
+def update_model_kwargs_for_generation(outputs, model_kwargs):
+    """
+    Carry state from one decoding step's `outputs` into the `model_kwargs` used for the next step.
+
+    Mutates and returns `model_kwargs`: stores the new cache and image hidden states, grows
+    `token_type_ids` / `attention_mask` by one position, and keeps only the last entry along
+    axis 1 of `image_attention_mask`.
+    """
+    # the cache key must always exist, even if only as None
+    model_kwargs["past_key_values"] = outputs.past_key_values if "past_key_values" in outputs else None
+
+    # the new position reuses the type id of the position before it
+    if "token_type_ids" in model_kwargs:
+        prev_types = model_kwargs["token_type_ids"]
+        model_kwargs["token_type_ids"] = tf.concat([prev_types, prev_types[:, -1:, ...]], axis=-1)
+
+    # the new position is attended to: append a column of ones
+    if "attention_mask" in model_kwargs:
+        prev_mask = model_kwargs["attention_mask"]
+        model_kwargs["attention_mask"] = tf.concat([prev_mask, tf.ones_like(prev_mask[:, -1:, ...])], axis=-1)
+    if "image_attention_mask" in model_kwargs:
+        model_kwargs["image_attention_mask"] = model_kwargs["image_attention_mask"][:, -1:, ...]
+
+    # Get the precomputed image_hidden_states
+    model_kwargs["image_hidden_states"] = outputs.image_hidden_states
+
+    return model_kwargs
+
+
+def prepare_inputs_for_generation(input_ids, past_key_values=None, **kwargs):
+    """
+    Assemble the dict of inputs for one decoding step.
+
+    With a cache, only the last position of `input_ids`, `token_type_ids` and of derived
+    `position_ids` is kept. `position_ids` are derived from `attention_mask` only when not supplied.
+    """
+    has_cache = past_key_values is not None
+    token_type_ids = kwargs.get("token_type_ids")
+    attention_mask = kwargs.get("attention_mask")
+    position_ids = kwargs.get("position_ids")
+
+    if has_cache:
+        # only the last token is fed when a cache is available
+        input_ids = input_ids[:, -1:]
+        if token_type_ids is not None:
+            token_type_ids = token_type_ids[:, -1:]
+
+    if position_ids is None and attention_mask is not None:
+        # running count of attended tokens minus one; masked slots get the placeholder 1
+        seen_tokens = tf.math.cumsum(tf.cast(attention_mask, dtype=tf.int64), axis=-1)
+        position_ids = tf.where(attention_mask == 0, 1, seen_tokens - 1)
+        if has_cache:
+            position_ids = position_ids[:, -1:]
+
+    model_inputs = {
+        "input_ids": input_ids,
+        "past_key_values": past_key_values,
+        "use_cache": kwargs.get("use_cache"),
+        "position_ids": position_ids,
+        "attention_mask": attention_mask,
+        "token_type_ids": token_type_ids,
+    }
+    for key in ("pixel_values", "image_encoder_embeddings", "perceiver_embeddings", "image_attention_mask"):
+        model_inputs[key] = kwargs.get(key)
+    model_inputs["interpolate_pos_encoding"] = kwargs.get("interpolate_pos_encoding", False)
+    return model_inputs
+
+
+def freeze_model(model, module_exceptions=None):
+    """Freeze `model`; layers whose kind is named in `module_exceptions` ("LayerNorm", "Dense",
+    "Embedding") stay trainable. A model without `.layers` is frozen as one layer. Returns `model`."""
+    mapping = {
+        "LayerNorm": tf.keras.layers.LayerNormalization,
+        "Dense": tf.keras.layers.Dense,
+        "Embedding": tf.keras.layers.Embedding,
+    }
+    # `None` replaces the old shared `[]` default; an empty tuple makes isinstance() match nothing
+    exempt_types = tuple(mapping[m] for m in (module_exceptions or ()))
+    if not hasattr(model, "layers"):
+        model.trainable = False  # It is just a layer
+        return model
+    for layer in model.layers:
+        layer.trainable = isinstance(layer, exempt_types)  # set explicitly either way
+    return model
+
+
+class TFIdeficsDecoupledEmbedding(tf.keras.layers.Embedding):
+    """
+    Implements a decoupling of parameters to allow freezing (or not) a subset of the embeddings. In practice, the
+    regular `weight` can be trained or frozen (i.e. `partially_freeze=True`), and if `num_additional_embeddings` > 0,
+    then it will create `num_additional_embeddings` additional parameters that are always trained. If
+    `num_additional_embeddings=0`, then the module defaults back to the regular behavior of `tf.keras.layers.Embedding`.
+    """
+
+    def __init__(
+        self,
+        num_embeddings,
+        num_additional_embeddings,
+        embedding_dim,
+        partially_freeze: bool | None = False,
+        dtype=None,
+        **kwargs,
+    ) -> None:
+        """
+        Args:
+            num_embeddings (`int`):
+                Size of the dictionary of embeddings
+            num_additional_embeddings (`int`):
+                Number of additional embeddings. Only useful when `partially_freeze=True`.
+            embedding_dim (`int`):
+                The size of each embedding vector
+            partially_freeze: (`bool`, *optional*, defaults to `False`):
+                If `True`, the regular `weight` will be frozen. `additional_weight` is never frozen.
+
+        Note: there are a lot of other parameters to initialize a standard `tf.keras.layers.Embedding` such as `mask_zero`,
+        `input_length` or `embeddings_initializer`. We are not supporting these.
+        """
+        super().__init__(
+            input_dim=num_embeddings,
+            output_dim=embedding_dim,
+            dtype=dtype,
+            **kwargs,
+        )
+        self.num_embeddings = num_embeddings
+        self.num_additional_embeddings = num_additional_embeddings
+        self.partially_freeze = partially_freeze
+
+        if partially_freeze:
+            self.trainable = False
+
+        if self.num_additional_embeddings > 0:  # the second table only exists when extra ids were requested
+            self.additional_embedding = tf.keras.layers.Embedding(
+                input_dim=self.num_additional_embeddings,
+                output_dim=embedding_dim,
+                dtype=dtype,
+                name="additional_embedding",
+            )
+
+    def call(self, input_ids):
+        """
+        we have 2 embeddings, with different indices - one pretrained self.weight and another
+        self.additional_embedding.weight that is being trained.
+
+        in order to make a lookup of the input ids, we:
+        1. find out the indices of the entries belonging to the 2nd embedding
+        2. extract those values while subtracting the size of the first embedding (num_embeddings), since the 2nd
+           embedding starts from 0 and not num_embeddings
+        3. perform the 2nd embedding lookup
+        4. now we handle the 1st embedding, we overwrite indices belonging to the 2nd embedding with 0
+        5. perform the 1st embedding lookup
+        6. now we overwrite the values in the 1st embedding lookup with the values of the 2nd embedding lookup
+
+        note: for the 1st embedding lookup we could have looked up only the low indices and not do the padding, but
+        then we have to create a new tensor and populate it with 2 tensors that are spread out across various indices -
+        i.e. not a simple concat - I haven't benchmarked the complex case if it's any faster, given that seqlens are
+        usually relatively short it's probably not faster or if faster not by much - but might be a good idea to
+        measure.
+
+        """
+        if self.num_additional_embeddings == 0:
+            return super().call(input_ids)
+
+        # Clone so that we don't modify the original input_ids later on
+        input_ids = tf.identity(input_ids)
+        additional_vocab_indices = tf.where(input_ids >= self.num_embeddings)  # coordinates of ids in the extra range
+        input_ids_additional_vocab = tf.gather_nd(input_ids, additional_vocab_indices)
+        additional_embeddings = self.additional_embedding(input_ids_additional_vocab - self.num_embeddings)
+
+        # for successful lookup replace input_ids with 0, the results of these will be discarded anyway
+        input_ids = tf.tensor_scatter_nd_update(
+            input_ids,
+            additional_vocab_indices,
+            # tensor filled with 0, having the same length as additional_vocab_indices
+            tf.zeros(tf.shape(additional_vocab_indices)[0], dtype=input_ids.dtype),
+        )
+        full_vector = super().call(input_ids)
+
+        # overwrite the records with high indices
+        full_vector = tf.tensor_scatter_nd_update(full_vector, additional_vocab_indices, additional_embeddings)
+
+        return full_vector
+
+    def extra_repr(self) -> str:
+        return f"num_embeddings={self.num_embeddings}, num_additional_embeddings={self.num_additional_embeddings}, embedding_dim={self.output_dim}, partially_freeze={self.partially_freeze}"
+
+
+class TFIdeficsDecoupledLinear(tf.keras.layers.Layer):
+    """
+    Implements a decoupling of parameters to allow freezing (or not) a subset of the parameters. In practice, the
+    regular `weight` can be trained or frozen (i.e. `partially_freeze=True`), and if `out_additional_features` > 0,
+    then it will create `out_additional_features * in_features` additional parameters that are always trained. If
+    `out_additional_features=0`, then the module defaults back to the regular behavior of `tf.keras.layers.Dense`.
+    """
+
+    def __init__(
+        self,
+        in_features: int,
+        out_features: int,
+        out_additional_features: int = 0,
+        bias: bool = True,
+        partially_freeze: bool = True,
+        **kwargs,
+    ) -> None:
+        """
+        out_additional_features: int. Number of additional trainable dimensions. Only makes sense when
+        `partially_freeze=True`. partially_freeze: bool. If True, the regular `weight` will be frozen and extra
+        parameters (if any) will be trainable. If False, default to the regular behavior of tf.keras.layers.Dense.
+        """
+        super().__init__(**kwargs)
+        self.out_additional_features = out_additional_features
+        self.partially_freeze = partially_freeze
+
+        self.in_features = in_features
+        self.out_features = out_features
+        self.use_bias = bias  # constructor flag; the `bias` variable itself is only created in build()
+
+        if out_additional_features > 0:
+            self.additional_fc = tf.keras.layers.Dense(
+                units=out_additional_features, use_bias=bias, name="additional_fc"
+            )
+
+    def call(self, inputs: tf.Tensor) -> tf.Tensor:
+        output = tf.linalg.matmul(a=inputs, b=self.weight, transpose_b=True)  # weight is (out_features, in_features)
+        if self.bias is not None:
+            output = tf.nn.bias_add(output, self.bias)
+
+        if self.out_additional_features > 0:  # extra outputs are appended after the regular ones
+            additional_features = self.additional_fc(inputs)
+            output = tf.concat([output, additional_features], axis=-1)
+
+        return output
+
+    def get_config(self):
+        config = super().get_config()
+        config.update(
+            {
+                "in_features": self.in_features,
+                "out_features": self.out_features,
+                "out_additional_features": self.out_additional_features,
+                "bias": bool(self.use_bias),  # not `self.bias`: that attribute is missing until build() has run
+                "partially_freeze": self.partially_freeze,
+            }
+        )
+        return config
+
+    def extra_repr(self) -> str:
+        """Overwriting `nn.Linear.extra_repr` to include new parameters. Safe to call before the layer is built."""
+        return f"in_features={self.in_features}, out_features={self.out_features}, out_additional_features={self.out_additional_features}, bias={bool(self.use_bias)}, partially_freeze={self.partially_freeze}"
+
+    @classmethod
+    def from_config(cls, config):
+        return cls(**config)
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        self.weight = self.add_weight(
+            shape=(self.out_features, self.in_features), trainable=not self.partially_freeze, name="weight"
+        )
+        if self.use_bias:
+            self.bias = self.add_weight(shape=(self.out_features,), trainable=not self.partially_freeze, name="bias")
+        else:
+            self.bias = None  # call() checks for None to skip the bias add
+        if getattr(self, "additional_fc", None) is not None:
+            with tf.name_scope(self.additional_fc.name):
+                self.additional_fc.build(self.in_features)
+
+
+def _make_causal_mask(input_ids_shape, dtype, past_key_values_length=0):
+    """
+    Make the additive causal mask used for left-to-right self-attention, supporting static and dynamic batch sizes.
+
+    Returns `(bsz, 1, tgt_len, tgt_len + past_key_values_length)`, with a leading 1 when `bsz` is `None`.
+    """
+    bsz, tgt_len = input_ids_shape
+
+    # Create a matrix where only the lower triangle and diagonal are filled with zeros (causal mask)
+    mask = tf.fill((tgt_len, tgt_len), tf.dtypes.as_dtype(dtype).min)
+    mask_cond = tf.range(tgt_len)
+    mask = tf.where(mask_cond[:, None] >= mask_cond[None, :], 0.0, mask)
+
+    if past_key_values_length > 0:
+        mask = tf.concat([tf.zeros((tgt_len, past_key_values_length), dtype=dtype), mask], axis=-1)
+
+    mask = mask[None, None, :, :]  # shape: (1, 1, tgt_len, tgt_len + past_key_values_length)
+    if bsz is None:
+        # The batch size is unknown here, so there is no multiple to tile by (tiling by `None`
+        # raises). Keep the batch axis at 1 and rely on broadcasting when the mask is combined
+        # with per-example tensors.
+        return mask
+
+    # When batch size is static, materialize the full batch
+    return tf.broadcast_to(mask, (bsz, 1, tgt_len, tgt_len + past_key_values_length))
+
+
+def _expand_mask(mask, dtype, tgt_len=None):
+    """
+    Expands attention_mask from `[bsz, seq_len]` to an additive `[bsz, 1, tgt_seq_len, src_seq_len]` mask in `dtype`.
+    """
+    bsz, src_len = shape_list(mask)
+    tgt_len = tgt_len if tgt_len is not None else src_len
+
+    expanded_mask = tf.expand_dims(tf.expand_dims(mask, 1), 1)
+    expanded_mask = tf.broadcast_to(expanded_mask, [bsz, 1, tgt_len, src_len])
+
+    inverted_mask = 1.0 - tf.cast(expanded_mask, dtype)  # 1.0 wherever the input mask is 0
+
+    # Fill with the minimum of the requested dtype (not float32's) so both `tf.where` branches share one dtype
+    dtype_min = tf.constant(tf.dtypes.as_dtype(dtype).min, dtype=dtype)
+    return tf.where(tf.cast(inverted_mask, bool), tf.fill(shape_list(inverted_mask), dtype_min), inverted_mask)
+
+
+class TFIdeficsRMSNorm(tf.keras.layers.Layer):
+    def __init__(self, hidden_size, eps=1e-6, **kwargs):
+        """
+        Root-mean-square norm with a learned per-feature scale; equivalent to T5LayerNorm
+        """
+        super().__init__(**kwargs)
+        self.variance_epsilon = eps
+        self.hidden_size = hidden_size
+
+    def build(self, input_shape):
+        # build only once: the flag is raised before the weight is created
+        if not self.built:
+            self.built = True
+            self.weight = self.add_weight(name="weight", shape=[self.hidden_size], initializer="ones")
+            super().build(input_shape)
+
+    def call(self, hidden_states):
+        # mean of squares over the last axis, computed in float32
+        squares = tf.math.square(tf.cast(hidden_states, tf.float32))
+        mean_square = tf.math.reduce_mean(squares, axis=-1, keepdims=True)
+        normed = hidden_states * tf.math.rsqrt(mean_square + self.variance_epsilon)
+
+        # convert into half-precision if necessary
+        if self.weight.dtype in (tf.float16, tf.bfloat16):
+            normed = tf.cast(normed, self.weight.dtype)
+        return self.weight * normed
+
+
+class TFIdeficsEmbedding(tf.keras.layers.Layer):
+    def __init__(self, dim, max_position_embeddings=2048, base=10000, **kwargs):
+        super().__init__(**kwargs)
+
+        self.dim = dim
+        self.max_position_embeddings = max_position_embeddings
+        self.base = base
+        # inverse frequencies base ** -(2i / dim), one per pair of feature dimensions
+        exponents = tf.range(start=0, limit=self.dim, delta=2, dtype=tf.float32) / self.dim
+        self.inv_freq = tf.constant(1.0 / (self.base**exponents))
+
+    def _compute_cos_sin(self, seq_len):
+        """Return the `(cos, sin)` tables for positions `0..seq_len - 1`."""
+        positions = tf.range(seq_len, dtype=self.inv_freq.dtype)
+        angles = tf.einsum("i, j -> ij", positions, self.inv_freq)  # Outer multiplication
+        angles = tf.concat((angles, angles), axis=-1)
+        return tf.cos(angles), tf.sin(angles)
+
+    def call(self, x, seq_len=None):
+        # x: [bs, num_attention_heads, seq_len, head_size]
+        # without an explicit length, fall back to axis 2 of `x`
+        length = shape_list(x)[2] if seq_len is None else seq_len
+        return self._compute_cos_sin(seq_len=length)
+
+
+def rotate_half(x):
+    """Rotates half the hidden dims of the input."""
+    x1 = x[..., : x.shape[-1] // 2]
+    x2 = x[..., x.shape[-1] // 2 :]
+    return tf.concat((-x2, x1), axis=-1)
+
+
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids):
+    cos = tf.gather(cos, position_ids)  # [seq_len, dim] -> [batch_size, 1, seq_len, head_dim]
+    sin = tf.gather(sin, position_ids)
+    cos = tf.expand_dims(cos, 1)
+    sin = tf.expand_dims(sin, 1)
+    q_embed = (q * cos) + (rotate_half(q) * sin)
+    k_embed = (k * cos) + (rotate_half(k) * sin)
+    return q_embed, k_embed
+
+
+class TFIdeficsMLP(tf.keras.layers.Layer):
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        hidden_act: str,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.gate_proj = tf.keras.layers.Dense(intermediate_size, use_bias=False, name="gate_proj")
+        self.down_proj = tf.keras.layers.Dense(hidden_size, use_bias=False, name="down_proj")
+        self.up_proj = tf.keras.layers.Dense(intermediate_size, use_bias=False, name="up_proj")
+        self.act_fn = get_tf_activation(hidden_act)
+        self.intermediate_size = intermediate_size
+        self.hidden_size = hidden_size
+
+    def call(self, x):
+        return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "gate_proj", None) is not None:
+            with tf.name_scope(self.gate_proj.name):
+                self.gate_proj.build(self.hidden_size)
+        if getattr(self, "down_proj", None) is not None:
+            with tf.name_scope(self.down_proj.name):
+                self.down_proj.build(self.intermediate_size)
+        if getattr(self, "up_proj", None) is not None:
+            with tf.name_scope(self.up_proj.name):
+                self.up_proj.build(self.hidden_size)
+
+
+class TFIdeficsAttention(tf.keras.layers.Layer):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+    def __init__(
+        self,
+        hidden_size: int,
+        num_heads: int,
+        dropout: float = 0.0,
+        is_cross_attention: bool = False,
+        config: IdeficsConfig = None,
+        qk_layer_norms: bool = False,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.hidden_size = hidden_size
+        self.num_heads = num_heads
+        self.head_dim = hidden_size // num_heads
+        self.dropout = dropout
+        self.config = config
+        self.is_causal = True
+
+        if (self.head_dim * num_heads) != self.hidden_size:
+            raise ValueError(
+                f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
+                f" and `num_heads`: {num_heads})."
+            )
+
+        self.is_cross_attention = is_cross_attention
+
+        self.q_proj = tf.keras.layers.Dense(
+            num_heads * self.head_dim,
+            use_bias=False,
+            name="q_proj",
+        )
+        self.k_proj = tf.keras.layers.Dense(
+            num_heads * self.head_dim,
+            use_bias=False,
+            name="k_proj",
+        )
+        self.v_proj = tf.keras.layers.Dense(
+            num_heads * self.head_dim,
+            use_bias=False,
+            name="v_proj",
+        )
+        self.o_proj = tf.keras.layers.Dense(
+            hidden_size,
+            use_bias=False,
+            name="o_proj",
+        )
+        self.rotary_emb = TFIdeficsEmbedding(self.head_dim, name="rotary_emb")
+
+        self.qk_layer_norms = qk_layer_norms
+        if self.qk_layer_norms:
+            self.q_layer_norm = TFIdeficsRMSNorm(self.head_dim, eps=config.rms_norm_eps, name="q_layer_norm")
+            self.k_layer_norm = TFIdeficsRMSNorm(self.head_dim, eps=config.rms_norm_eps, name="k_layer_norm")
+
+    def _shape(self, tensor: tf.Tensor, seq_len: int, bsz: int):
+        return tf.transpose(tf.reshape(tensor, (bsz, seq_len, self.num_heads, self.head_dim)), perm=[0, 2, 1, 3])
+
+    def call(
+        self,
+        hidden_states: tf.Tensor,
+        key_value_states: tf.Tensor | None = None,
+        attention_mask: tf.Tensor | None = None,
+        position_ids: tf.Tensor | None = None,
+        past_key_value: tuple[tf.Tensor] | None = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
+    ) -> tuple[tf.Tensor, tf.Tensor | None, tuple[tf.Tensor] | None]:
+        # if key_value_states are provided this layer is used as a cross-attention layer
+        is_cross_attention = self.is_cross_attention or key_value_states is not None
+
+        bsz, q_len, _ = shape_list(hidden_states)
+
+        query_states = self._shape(self.q_proj(hidden_states), q_len, bsz)
+        if not is_cross_attention:
+            key_states = self._shape(self.k_proj(hidden_states), q_len, bsz)
+            value_states = self._shape(self.v_proj(hidden_states), q_len, bsz)
+        else:
+            _, kv_len, _ = shape_list(key_value_states)  # Note that, in this case, `kv_len` == `kv_seq_len`
+            key_states = self._shape(self.k_proj(key_value_states), kv_len, bsz)
+            value_states = self._shape(self.v_proj(key_value_states), kv_len, bsz)
+
+        kv_seq_len = shape_list(key_states)[-2]
+        if past_key_value is not None:
+            kv_seq_len += shape_list(past_key_value[0])[-2]
+        if not is_cross_attention:
+            # Below is to allow symbolic tensors compilation
+            if tf.is_tensor(kv_seq_len):
+                seq_len = tf.reduce_max(kv_seq_len, q_len)
+            else:
+                seq_len = max(kv_seq_len, q_len)
+            cos, sin = self.rotary_emb(value_states, seq_len)
+            query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
+        # [bsz, nh, t, hd]
+
+        if past_key_value is not None:
+            # reuse k, v, self_attention
+            key_states = tf.concat([past_key_value[0], key_states], axis=2)
+            value_states = tf.concat([past_key_value[1], value_states], axis=2)
+
+        past_key_value = (key_states, value_states) if use_cache else None
+
+        if self.qk_layer_norms:
+            query_states = self.q_layer_norm(query_states)
+            key_states = self.k_layer_norm(key_states)
+
+        tf.debugging.assert_equal(
+            tf.shape(attention_mask),
+            [bsz, 1, q_len, kv_seq_len],
+            message=f"Attention weights should be of size {[bsz, 1, q_len, kv_seq_len]}, but is {tf.shape(attention_mask)}",
+        )
+
+        attn_output = scaled_dot_product_attention(
+            query_states,
+            key_states,
+            value_states,
+            attn_mask=attention_mask,
+            # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1.
+            is_causal=self.is_causal and attention_mask is None and q_len > 1,
+        )
+
+        tf.debugging.assert_equal(
+            tf.shape(attn_output),
+            [bsz, self.num_heads, q_len, self.head_dim],
+            message=f"Attention weights should be of size {[bsz, self.num_heads, q_len, self.head_dim]}, but is {tf.shape(attn_output)}",
+        )
+
+        attn_output = tf.reshape(tf.transpose(attn_output, perm=[0, 2, 1, 3]), (bsz, q_len, self.hidden_size))
+
+        attn_output = self.o_proj(attn_output)
+
+        attn_weights = None
+        if output_attentions:
+            logger.warning_once(
+                "attn_weights are not extracted in scaled_dot_product_attention. The model returns None instead"
+            )
+
+        return attn_output, attn_weights, past_key_value
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if self.is_cross_attention:
+            kv_input_dim = (
+                self.hidden_size
+                if not hasattr(self.config.vision_config, "embed_dim")
+                else self.config.vision_config.embed_dim
+            )
+        else:
+            kv_input_dim = self.hidden_size
+        if getattr(self, "o_proj", None) is not None:
+            with tf.name_scope(self.o_proj.name):
+                self.o_proj.build(self.num_heads * self.head_dim)
+        if getattr(self, "q_proj", None) is not None:
+            with tf.name_scope(self.q_proj.name):
+                self.q_proj.build(self.hidden_size)
+        if getattr(self, "k_proj", None) is not None:
+            with tf.name_scope(self.k_proj.name):
+                self.k_proj.build(kv_input_dim)
+        if getattr(self, "v_proj", None) is not None:
+            with tf.name_scope(self.v_proj.name):
+                self.v_proj.build(kv_input_dim)
+        if getattr(self, "rotary_emb", None) is not None:
+            with tf.name_scope(self.rotary_emb.name):
+                self.rotary_emb.build(None)
+
+
+class TFIdeficsDecoderLayer(tf.keras.layers.Layer):
+    def __init__(self, config: IdeficsConfig, **kwargs):
+        super().__init__(**kwargs)
+        self.hidden_size = config.hidden_size
+        self.self_attn = TFIdeficsAttention(
+            hidden_size=self.hidden_size,
+            num_heads=config.num_attention_heads,
+            dropout=config.dropout,
+            config=config,
+            name="self_attn",
+        )
+        self.mlp = TFIdeficsMLP(
+            hidden_size=self.hidden_size,
+            intermediate_size=config.intermediate_size,
+            hidden_act=config.hidden_act,
+            name="mlp",
+        )
+        self.input_layernorm = TFIdeficsRMSNorm(config.hidden_size, eps=config.rms_norm_eps, name="input_layernorm")
+        self.post_attention_layernorm = TFIdeficsRMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps, name="post_attention_layernorm"
+        )
+        self.dropout = config.dropout
+
+    def call(
+        self,
+        hidden_states: tf.Tensor,
+        attention_mask: tf.Tensor | None = None,
+        position_ids: tf.Tensor | None = None,
+        past_key_value: tuple[tf.Tensor] | None = None,
+        output_attentions: bool | None = False,
+        use_cache: bool | None = False,
+        training=False,
+    ) -> tuple[tf.Tensor, tuple[tf.Tensor, tf.Tensor] | None]:
+        """
+        Args:
+            hidden_states (`tf.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+            attention_mask (`tf.Tensor`, *optional*): attention mask of size
+                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            use_cache (`bool`, *optional*):
+                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
+                (see `past_key_values`).
+            past_key_value (`Tuple(tf.Tensor)`, *optional*): cached past key and value projection states
+        """
+
+        residual = hidden_states
+
+        hidden_states = self.input_layernorm(hidden_states)
+
+        # Self Attention
+        hidden_states, self_attn_weights, present_key_value = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_value=past_key_value,
+            output_attentions=output_attentions,
+            use_cache=use_cache,
+        )
+        hidden_states = tf.nn.dropout(hidden_states, rate=self.dropout)
+        hidden_states = residual + hidden_states
+
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = tf.nn.dropout(hidden_states, rate=self.dropout)
+        hidden_states = residual + hidden_states
+
+        outputs = (hidden_states,)
+
+        if output_attentions:
+            outputs += (self_attn_weights,)
+
+        if use_cache:
+            outputs += (present_key_value,)
+
+        return outputs
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "self_attn", None) is not None:
+            with tf.name_scope(self.self_attn.name):
+                self.self_attn.build(None)
+        if getattr(self, "mlp", None) is not None:
+            with tf.name_scope(self.mlp.name):
+                self.mlp.build(None)
+        if getattr(self, "input_layernorm", None) is not None:
+            with tf.name_scope(self.input_layernorm.name):
+                self.input_layernorm.build(None)
+        if getattr(self, "post_attention_layernorm", None) is not None:
+            with tf.name_scope(self.post_attention_layernorm.name):
+                self.post_attention_layernorm.build(None)
+
+
+class TFIdeficsGatedCrossAttentionLayer(tf.keras.layers.Layer):
+    def __init__(self, config: IdeficsConfig, **kwargs):
+        super().__init__(**kwargs)
+        self.hidden_size = config.hidden_size
+        self.cross_attn = TFIdeficsAttention(
+            hidden_size=self.hidden_size,
+            num_heads=config.num_attention_heads,
+            is_cross_attention=True,
+            dropout=config.dropout,
+            config=config,
+            qk_layer_norms=config.qk_layer_norms,
+            name="cross_attn",
+        )
+        self.mlp = TFIdeficsMLP(
+            hidden_size=self.hidden_size,
+            intermediate_size=config.intermediate_size,
+            hidden_act=config.hidden_act,
+            name="mlp",
+        )
+        self.input_layernorm = TFIdeficsRMSNorm(config.hidden_size, eps=config.rms_norm_eps, name="input_layernorm")
+        self.post_attention_layernorm = TFIdeficsRMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps, name="post_attention_layernorm"
+        )
+        self.config = config.dropout
+
+        self.act_cross_attn = tf.keras.activations.tanh
+        self.act_dense = tf.keras.activations.tanh
+
+        self.alpha_initializer = config.alpha_initializer
+        self.alpha_type = config.alpha_type
+        self.alphas_initializer_range = config.alphas_initializer_range
+
+    def build(self, input_shape):
+        if self.built:
+            return
+        self.built = True
+        if self.alpha_initializer == "zeros":
+            if self.alpha_type == "vector":
+                self.alpha_cross_attn = self.add_weight(
+                    shape=(1, 1, self.hidden_size), initializer="zeros", trainable=True, name="alpha_cross_attn"
+                )
+                self.alpha_dense = self.add_weight(
+                    shape=(1, 1, self.hidden_size), initializer="zeros", trainable=True, name="alpha_dense"
+                )
+            elif self.alpha_type == "float":
+                self.alpha_cross_attn = self.add_weight(
+                    shape=(1,), initializer="zeros", trainable=True, name="alpha_cross_attn"
+                )
+                self.alpha_dense = self.add_weight(shape=(1,), initializer="zeros", trainable=True, name="alpha_dense")
+            else:
+                raise ValueError(f"Unknown value for `alpha_type` ({self.alpha_type})")
+
+        elif self.alpha_initializer == "ones":
+            if self.alpha_type == "vector":
+                self.alpha_cross_attn = self.add_weight(
+                    shape=(1, 1, self.hidden_size), initializer="ones", trainable=True, name="alpha_cross_attn"
+                )
+                self.alpha_dense = self.add_weight(
+                    shape=(1, 1, self.hidden_size), initializer="ones", trainable=True, name="alpha_dense"
+                )
+            elif self.alpha_type == "float":
+                self.alpha_cross_attn = self.add_weight(
+                    shape=(1,), initializer="ones", trainable=True, name="alpha_cross_attn"
+                )
+                self.alpha_dense = self.add_weight(shape=(1,), initializer="ones", trainable=True, name="alpha_dense")
+            else:
+                raise ValueError(f"Unknown value for `alpha_type` ({self.alpha_type})")
+
+        elif self.alpha_initializer in {"normal", "gaussian", "random"}:
+            if self.alpha_type == "vector":
+                self.alpha_cross_attn = self.add_weight(
+                    shape=(1, 1, self.hidden_size),
+                    initializer=tf.keras.initializers.RandomNormal(mean=0.0, stddev=self.alphas_initializer_range),
+                    trainable=True,
+                    name="alpha_cross_attn",
+                )
+                self.alpha_dense = self.add_weight(
+                    shape=(1, 1, self.hidden_size),
+                    initializer=tf.keras.initializers.RandomNormal(mean=0.0, stddev=self.alphas_initializer_range),
+                    trainable=True,
+                    name="alpha_dense",
+                )
+            elif self.alpha_type == "float":
+                self.alpha_cross_attn = self.add_weight(
+                    shape=(1,),
+                    initializer=tf.keras.initializers.RandomNormal(mean=0.0, stddev=self.alphas_initializer_range),
+                    trainable=True,
+                    name="alpha_type",
+                )
+                self.alpha_dense = self.add_weight(
+                    shape=(1,),
+                    initializer=tf.keras.initializers.RandomNormal(mean=0.0, stddev=self.alphas_initializer_range),
+                    trainable=True,
+                    name="alpha_dense",
+                )
+            else:
+                raise ValueError(f"Unknown value for `alpha_type` ({self.alpha_type})")
+
+        else:
+            raise NotImplementedError(f"Alpha initialization scheme {self.alpha_initializer} not yet implemented!")
+
+        if not (hasattr(self, "alpha_cross_attn") and hasattr(self, "alpha_dense")):
+            raise ValueError("Alpha parameters not initialized correctly!")
+        with tf.name_scope(self.cross_attn.name):
+            self.cross_attn.build(None)
+        with tf.name_scope(self.mlp.name):
+            self.mlp.build(None)
+        with tf.name_scope(self.input_layernorm.name):
+            self.input_layernorm.build(None)
+        with tf.name_scope(self.post_attention_layernorm.name):
+            self.post_attention_layernorm.build(None)
+        super().build(input_shape)
+
+    def call(
+        self,
+        hidden_states: tf.Tensor,
+        attention_mask: tf.Tensor | None = None,
+        image_hidden_states: tf.Tensor | None = None,
+        image_attention_mask: tf.Tensor | None = None,
+        cross_attention_gate: tf.Tensor | None = None,
+        output_attentions: bool | None = False,
+        use_cache: bool | None = False,
+        past_key_value: tuple[tf.Tensor] | None = None,
+    ) -> tuple[tf.Tensor, tuple[tf.Tensor, tf.Tensor] | None]:
+        """
+        Args:
+            hidden_states (`tf.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+            attention_mask (`tf.Tensor`, *optional*): attention mask of size
+                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            use_cache (`bool`, *optional*):
+                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
+                (see `past_key_values`).
+            past_key_value (`Tuple(tf.Tensor)`, *optional*): cached past key and value projection states
+            no_images (`bool`, *optional*, defaults to `False`): If `True` the vision part is ignored
+        """
+        if image_hidden_states is None:
+            raise ValueError(
+                "`image_hidden_states` is required for Idefics cross attention module which are visual features to be"
+                " conditioned on."
+            )
+
+        if cross_attention_gate is None:
+            raise ValueError(
+                "`cross_attention_gate` is required for Idefics cross attention module to zero-out the cross-attention hidden_states attending to no images."
+            )
+
+        if past_key_value is not None:
+            raise NotImplementedError("Past key value states are not implemented for Idefics cross attention module.")
+
+        residual = hidden_states
+
+        hidden_states = self.input_layernorm(hidden_states)
+
+        # Self Attention
+        hidden_states, self_attn_weights, present_key_value = self.cross_attn(
+            hidden_states=hidden_states,
+            key_value_states=image_hidden_states,
+            attention_mask=image_attention_mask,
+            output_attentions=output_attentions,
+        )
+        hidden_states = tf.nn.dropout(hidden_states, rate=self.config)
+        mask = tf.cast(cross_attention_gate == 0, dtype=hidden_states.dtype)
+        # Expand dimensions of mask to match hidden_states
+        mask = tf.expand_dims(mask, -1)
+        hidden_states = tf.where(
+            tf.broadcast_to(mask, tf.shape(hidden_states)) == 1, tf.zeros_like(hidden_states), hidden_states
+        )
+        # when there are no images the model is used in pure language mode
+        # gate = 0 if no_images else 1
+        hidden_states = residual + self.act_cross_attn(self.alpha_cross_attn) * hidden_states
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = tf.nn.dropout(hidden_states, rate=self.config)
+        hidden_states = residual + self.act_dense(self.alpha_dense) * hidden_states
+
+        outputs = (hidden_states,)
+
+        if output_attentions:
+            outputs += (self_attn_weights,)
+
+        if use_cache:
+            outputs += (present_key_value,)
+
+        return outputs
+
+
+# Class-level documentation text that is handed to `add_start_docstrings` on the model classes in this module.
+LLAMA_START_DOCSTRING = r"""
+    This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.)
+
+    This model is also a TensorFlow [tf.keras.layers.Layer](https://www.tensorflow.org/api_docs/python/tf/keras/layers/Layer) subclass.
+    Use it as a regular TensorFlow Layer and refer to the TensorFlow documentation for all matter related to general usage
+    and behavior.
+
+    Parameters:
+        config ([`IdeficsConfig`]):
+            Model configuration class with all the parameters of the model. Initializing with a config file does not
+            load the weights associated with the model, only the configuration. Check out the
+            [`~TFPreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+
+# Base class for the Idefics TF models: it adds no behaviour of its own and only declares class-level settings
+# on top of `TFPreTrainedModel` (presumably read by that base class and the loading utilities - verify there).
+@add_start_docstrings(
+    "The bare LLaMA Model outputting raw hidden-states without any specific head on top.",
+    LLAMA_START_DOCSTRING,
+)
+class TFIdeficsPreTrainedModel(TFPreTrainedModel):
+    # Configuration class associated with this model family
+    config_class = IdeficsConfig
+    # Attribute / weight-name prefix under which the base model is expected - TODO confirm against TFPreTrainedModel
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    # Names of the layer classes that are listed as not to be split
+    _no_split_modules = ["TFIdeficsDecoderLayer", "TFIdeficsGatedCrossAttentionLayer"]
+
+
+# Argument documentation that is attached to the model's `call` via `add_start_docstrings_to_model_forward`.
+LLAMA_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`):
+            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
+            it.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
+            `past_key_values`).
+
+            If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
+            and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more
+            information on the default strategy.
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+        position_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+            config.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)
+        past_key_values (`tuple(tuple(tf.Tensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+            Tuple of `tuple(tf.Tensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
+            `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
+            `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
+
+            Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
+            blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
+
+            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
+            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
+            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+        inputs_embeds (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+            model's internal embedding lookup matrix.
+        use_cache (`bool`, *optional*):
+            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+            `past_key_values`).
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+@add_start_docstrings(
+    "The bare LLaMA Model outputting raw hidden-states without any specific head on top.",
+    LLAMA_START_DOCSTRING,
+)
+@keras_serializable
+class TFIdeficsMainLayer(tf.keras.layers.Layer):
+    """
+    Transformer decoder consisting of `config.num_hidden_layers` layers. Each layer is a [`IdeficsDecoderLayer`]
+
+    Args:
+        config: IdeficsConfig
+    """
+
+    config_class = IdeficsConfig
+
+    def __init__(self, config: IdeficsConfig, add_pooling_year: bool = True, **kwargs):
+        super().__init__(**kwargs)
+        self.config = config
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+
+        self.embed_tokens = TFIdeficsDecoupledEmbedding(
+            num_embeddings=config.vocab_size,
+            num_additional_embeddings=config.additional_vocab_size,
+            embedding_dim=config.hidden_size,
+            partially_freeze=config.freeze_text_layers,
+            name="embed_tokens",
+        )
+
+        self.image_size = config.vision_config.image_size
+        self.vision_config = config.vision_config
+        self.vision_model = TFIdeficsVisionTransformer(config.vision_config, name="vision_model")
+
+        # Perceiver Resampler
+        if config.use_resampler:
+            perceiver_config = config.perceiver_config
+            self.perceiver_resampler = TFIdeficsPerceiverResampler(
+                config,
+                config.vision_config.embed_dim,
+                perceiver_config.resampler_depth,
+                perceiver_config.resampler_n_heads,
+                perceiver_config.resampler_head_dim,
+                perceiver_config.resampler_n_latents,
+                name="perceiver_resampler",
+            )
+
+        self.decoder_layers = [
+            TFIdeficsDecoderLayer(config, name=f"layers.{i}") for i in range(config.num_hidden_layers)
+        ]
+
+        self.cross_layer_interval = config.cross_layer_interval
+        num_cross_layers = config.num_hidden_layers // self.cross_layer_interval
+        self.gated_cross_attn_layers = [
+            TFIdeficsGatedCrossAttentionLayer(config, name=f"gated_cross_attn_layers.{i}")
+            for i in range(num_cross_layers)
+        ]
+        self.gradient_checkpointing = False
+
+        self.norm = TFIdeficsRMSNorm(config.hidden_size, eps=config.rms_norm_eps, name="norm")
+
+        self.gradient_checkpointing = False
+        self.freeze_relevant_params(config)
+
+    def freeze_relevant_params(self, config=None):
+        if config is None:
+            config = self.config
+
+        if config.freeze_text_layers:
+            self.freeze_text_layers(config.freeze_text_module_exceptions)
+
+        if config.freeze_vision_layers:
+            freeze_model(self.vision_model, module_exceptions=config.freeze_vision_module_exceptions)
+
+    def freeze_text_layers(self, module_exceptions=[]):
+        for module in [self.decoder_layers, self.norm]:
+            freeze_model(module, module_exceptions=module_exceptions)
+
+    def freeze_vision_layers(self, module_exceptions=[]):
+        freeze_model(self.vision_model, module_exceptions=module_exceptions)
+
+    def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length):
+        # create causal mask
+        # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+        combined_attention_mask = None
+        # if input_shape[-1] > 1:
+        combined_attention_mask = _make_causal_mask(
+            input_shape,
+            inputs_embeds.dtype,
+            past_key_values_length=past_key_values_length,
+        )
+
+        if attention_mask is not None:
+            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+            expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1])
+            combined_attention_mask = (
+                expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask
+            )
+
+        return combined_attention_mask
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)
+    def call(
+        self,
+        input_ids: TFModelInputType | None = None,
+        attention_mask: tf.Tensor | None = None,
+        position_ids: tf.Tensor | None = None,
+        past_key_values: list[tf.Tensor] | None = None,
+        inputs_embeds: tf.Tensor | None = None,
+        pixel_values: tf.Tensor | None = None,
+        image_encoder_embeddings: tf.Tensor | None = None,
+        perceiver_embeddings: tf.Tensor | None = None,
+        image_attention_mask: tf.Tensor | None = None,
+        use_cache: bool | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        interpolate_pos_encoding: bool | None = False,
+        return_dict: bool | None = None,
+        training: bool | None = None,
+    ) -> TFIdeficsBaseModelOutputWithPast | tuple[tf.Tensor]:
+        # Forward pass of the main layer. Image features come from exactly one of `pixel_values` (run through
+        # `self.vision_model`), `image_encoder_embeddings` or `perceiver_embeddings`. The text tokens then go
+        # through `self.decoder_layers`, with a gated cross-attention layer applied ahead of each decoder layer
+        # whose index is a multiple of `self.cross_layer_interval`.
+        #
+        # Returns a `TFIdeficsBaseModelOutputWithPast`, or a tuple of its non-None fields when `return_dict` is
+        # falsy.
+        #
+        # Raises:
+        #     ValueError: if both or neither of `input_ids` / `inputs_embeds` are given, if the number of image
+        #         inputs that are not None differs from one, or if `perceiver_embeddings` is passed while
+        #         `config.use_resampler` is false.
+
+        # Flags left as None fall back to the values stored on the config.
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # retrieve input_ids and inputs_embeds
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
+        elif input_ids is not None:
+            batch_size, seq_length = shape_list(input_ids)
+        elif inputs_embeds is not None:
+            batch_size, seq_length, _ = shape_list(inputs_embeds)
+        else:
+            raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
+
+        seq_length_with_past = seq_length
+        past_key_values_length = 0
+
+        if past_key_values is not None:
+            # The cached length is read from axis 2 of the first tensor in the first layer's cache entry.
+            past_key_values_length = shape_list(past_key_values[0][0])[2]
+            seq_length_with_past = seq_length_with_past + past_key_values_length
+
+        if attention_mask is not None and position_ids is None:
+            # create position_ids on the fly for batch generation
+            # Zero-based running count of attended tokens; slots where the mask is 0 are set to position 1.
+            position_ids = tf.math.cumsum(tf.cast(attention_mask, dtype=tf.int32), axis=-1) - 1
+            position_ids = tf.where(attention_mask == 0, 1, position_ids)
+        elif position_ids is None:
+            # No mask given: consecutive positions that continue after the cached tokens, with a leading axis of 1.
+            position_ids = tf.range(past_key_values_length, seq_length + past_key_values_length, dtype=tf.int32)
+            position_ids = tf.expand_dims(position_ids, 0)
+
+        no_images = False
+        # The three "is None" indicators must sum to 2, i.e. exactly one image input is provided.
+        if (
+            sum((int(pixel_values is None), int(image_encoder_embeddings is None), int(perceiver_embeddings is None)))
+            != 2
+        ):
+            raise ValueError(
+                "Exactly 1 of pixel_values, image_encoder_embeddings or perceiver_embeddings has to be not-None."
+            )
+
+        elif pixel_values is not None:
+            # True when the int32-cast pixel values sum to zero.
+            # NOTE(review): `no_images` is only referenced by the gradient-checkpointing call further down.
+            no_images = tf.reduce_sum(tf.cast(pixel_values, dtype=tf.int32)) == 0
+            pixel_values = tf.cast(pixel_values, dtype=self.dtype)  # fp16 compatibility
+            # Below hack is because when cross-loading pytorch weights, there is an
+            # initial forward pass with dummy input and code below is here to handle that
+            if len(pixel_values.shape) == 4:
+                # NOTE(review): batch_size and num_images are both read from dim 0 in this branch -- confirm this
+                # is intended for the dummy forward pass.
+                batch_size = shape_list(pixel_values)[0]
+                num_images = shape_list(pixel_values)[0]
+                # pixel_values = tf.reshape(pixel_values, [batch_size * num_images, *pixel_values.shape[1:]])
+            elif len(pixel_values.shape) == 5:
+                # Fold the first two axes (batch, images) into one before calling the vision encoder.
+                batch_size, num_images = shape_list(pixel_values)[:2]
+                pixel_values = tf.reshape(pixel_values, [batch_size * num_images, *pixel_values.shape[2:]])
+
+            # Get sequence from the vision encoder
+            image_hidden_states = self.vision_model(
+                pixel_values=pixel_values, interpolate_pos_encoding=interpolate_pos_encoding
+            ).last_hidden_state
+
+        elif image_encoder_embeddings is not None:
+            # Pre-computed encoder features: unpack the four axes and fold the first two together.
+            batch_size, num_images, image_seq_len, image_hidden_size = shape_list(image_encoder_embeddings)
+            image_hidden_states = tf.cast(image_encoder_embeddings, dtype=self.dtype)
+            image_hidden_states = tf.reshape(
+                image_hidden_states, (batch_size * num_images, image_seq_len, image_hidden_size)
+            )
+
+        # With the resampler enabled the image features are the perceiver output, either computed here from
+        # `image_hidden_states` or taken as passed in by the caller.
+        if self.config.use_resampler:
+            if perceiver_embeddings is None:
+                perceiver_embeddings = self.perceiver_resampler(image_hidden_states)
+                image_seq_len, image_hidden_size = shape_list(perceiver_embeddings)[1:3]
+            else:
+                batch_size, num_images, image_seq_len, image_hidden_size = shape_list(perceiver_embeddings)
+            image_hidden_states = perceiver_embeddings
+        elif perceiver_embeddings is None:
+            image_seq_len, image_hidden_size = shape_list(image_hidden_states)[1:3]
+        else:
+            raise ValueError("If `perceiver_embeddings` are passed, use_resampler should be True")
+
+        # Regroup per sample: the features of all images of one sample share a single sequence axis.
+        image_hidden_states = tf.reshape(
+            image_hidden_states, (batch_size, num_images * image_seq_len, image_hidden_size)
+        )
+        # # Hack to use the model in full language modeling mode
+        # image_attention_mask = tf.zeros((batch_size, seq_length, 1), dtype=tf.int32)
+
+        # this is to account for the dummy inputs
+        if pixel_values is not None and len(pixel_values.shape) == 4 and image_attention_mask is None:
+            image_attention_mask = tf.zeros((batch_size, seq_length, 1), dtype=tf.int32)
+
+        # Repeat every mask entry `image_seq_len` times. `tf.repeat` is called without `axis`, so the result is
+        # reshaped afterwards to (batch_size, text_seq_len, num_images * image_seq_len).
+        text_seq_len = shape_list(image_attention_mask)[1]
+        image_attention_mask = tf.expand_dims(image_attention_mask, -1)
+        image_attention_mask = tf.repeat(image_attention_mask, repeats=image_seq_len)
+        image_attention_mask = tf.reshape(image_attention_mask, (batch_size, text_seq_len, num_images * image_seq_len))
+
+        # NOTE(review): `image_attention_mask` has already been dereferenced above, so the `is None` fallback in
+        # this block does not look reachable.
+        if image_hidden_states is not None:
+            image_batch_size, image_sequence_length, _ = shape_list(image_hidden_states)
+            image_hidden_shape = (image_batch_size, image_sequence_length)
+            if image_attention_mask is None:
+                image_attention_mask = tf.ones(image_hidden_shape, dtype=tf.int32)
+            image_attention_mask = invert_attention_mask(image_attention_mask)
+        else:
+            image_attention_mask = None
+
+        # Gate per text position: 1 where any entry of the inverted mask equals 0 along the last axis, else 0;
+        # axis 1 is then squeezed away.
+        cross_attention_gate = tf.squeeze(
+            tf.cast(tf.reduce_any(image_attention_mask == 0, axis=-1), dtype=self.dtype), axis=1
+        )
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+        # Default text mask attends to every position, cached ones included; it is then combined with the causal
+        # mask by `_prepare_decoder_attention_mask`.
+        if attention_mask is None:
+            attention_mask = tf.ones((batch_size, seq_length_with_past), dtype=tf.bool)
+        attention_mask = self._prepare_decoder_attention_mask(
+            attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
+        )
+
+        hidden_states = inputs_embeds
+
+        if self.gradient_checkpointing and training:
+            if use_cache:
+                logger.warning_once(
+                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+                )
+                use_cache = False
+
+        # decoder layers
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+        next_decoder_cache = () if use_cache else None
+
+        for idx, decoder_layer in enumerate(self.decoder_layers):
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+
+            past_key_value = past_key_values[idx] if past_key_values is not None else None
+
+            # `vblock` handles one layer index: the optional gated cross-attention layer first, then the decoder
+            # layer (`main_block`). Only the decoder layer's outputs are returned.
+            def vblock(
+                main_block,
+                hidden_states,
+                attention_mask,
+                position_ids,
+                past_key_value,
+                image_hidden_states,
+                image_attention_mask,
+                cross_attention_gate,
+                output_attentions,
+                use_cache,
+                layer_idx,
+                cross_layer_interval,
+                gated_cross_attn_layers,
+            ):
+                # TODO(ls): Add cross attention values to respective lists
+                if layer_idx % cross_layer_interval == 0:
+                    xblock = gated_cross_attn_layers[layer_idx // cross_layer_interval]
+                    outputs = xblock(
+                        hidden_states,
+                        attention_mask=attention_mask,
+                        image_hidden_states=image_hidden_states,
+                        image_attention_mask=image_attention_mask,
+                        cross_attention_gate=cross_attention_gate,
+                        output_attentions=output_attentions,
+                        use_cache=use_cache,
+                        past_key_value=None,  # not implemented
+                    )
+                    # Only the cross-attended hidden states are carried into the decoder layer.
+                    hidden_states = outputs[0]
+
+                layer_outputs = main_block(
+                    hidden_states,
+                    attention_mask=attention_mask,
+                    position_ids=position_ids,
+                    past_key_value=past_key_value,
+                    output_attentions=output_attentions,
+                    use_cache=use_cache,
+                )
+
+                return layer_outputs
+
+            if self.gradient_checkpointing and training:
+                past_key_value = None
+                if use_cache:
+                    logger.warning_once(
+                        "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+                    )
+                    use_cache = False
+
+                # NOTE(review): this positional argument list does not line up with `vblock`'s parameters --
+                # `cross_attention_gate` is absent and `no_images` sits in the slot that `vblock` names
+                # `use_cache`. Also confirm the calling convention against the `tf.recompute_grad` documentation.
+                layer_outputs = tf.recompute_grad(
+                    vblock,
+                    decoder_layer,
+                    hidden_states,
+                    attention_mask,
+                    position_ids,
+                    past_key_value,
+                    image_hidden_states,
+                    image_attention_mask,
+                    output_attentions,
+                    use_cache,
+                    no_images,
+                    idx,
+                    self.cross_layer_interval,
+                    self.gated_cross_attn_layers,
+                )
+            else:
+                layer_outputs = vblock(
+                    decoder_layer,
+                    hidden_states,
+                    attention_mask=attention_mask,
+                    position_ids=position_ids,
+                    past_key_value=past_key_value,
+                    image_hidden_states=image_hidden_states,
+                    image_attention_mask=image_attention_mask,
+                    cross_attention_gate=cross_attention_gate,
+                    output_attentions=output_attentions,
+                    use_cache=use_cache,
+                    layer_idx=idx,
+                    cross_layer_interval=self.cross_layer_interval,
+                    gated_cross_attn_layers=self.gated_cross_attn_layers,
+                )
+
+            hidden_states = layer_outputs[0]
+
+            if use_cache:
+                # The cache entry is read from index 2 when attention weights are also returned, else index 1.
+                next_decoder_cache += (layer_outputs[2 if output_attentions else 1],)
+
+            if output_attentions:
+                all_self_attns += (layer_outputs[1],)
+
+        hidden_states = self.norm(hidden_states)
+
+        # add hidden states from the last decoder layer
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+
+        next_cache = next_decoder_cache if use_cache else None
+        # The image features are returned reshaped to (batch_size, num_images, image_seq_len, image_hidden_size).
+        image_hidden_states = tf.reshape(
+            image_hidden_states, (batch_size, num_images, image_seq_len, image_hidden_size)
+        )
+        if not return_dict:
+            # Tuple output keeps only the entries that are not None, in this fixed order.
+            return tuple(
+                v
+                for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, image_hidden_states]
+                if v is not None
+            )
+        return TFIdeficsBaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=next_cache,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+            image_hidden_states=image_hidden_states,
+        )
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "embed_tokens", None) is not None:
+            with tf.name_scope(self.embed_tokens.name):
+                self.embed_tokens.build(None)
+        if getattr(self, "vision_model", None) is not None:
+            with tf.name_scope(self.vision_model.name):
+                self.vision_model.build(None)
+        if getattr(self, "norm", None) is not None:
+            with tf.name_scope(self.norm.name):
+                self.norm.build(None)
+        if getattr(self, "perceiver_resampler", None) is not None:
+            with tf.name_scope(self.perceiver_resampler.name):
+                self.perceiver_resampler.build(None)
+        if getattr(self, "decoder_layers", None) is not None:
+            for layer in self.decoder_layers:
+                with tf.name_scope(layer.name):
+                    layer.build(None)
+        if getattr(self, "gated_cross_attn_layers", None) is not None:
+            for layer in self.gated_cross_attn_layers:
+                with tf.name_scope(layer.name):
+                    layer.build(None)
+
+
+class TFIdeficsModel(TFIdeficsPreTrainedModel):
+    def __init__(self, config: IdeficsConfig, *inputs, **kwargs):
+        super().__init__(config, *inputs, **kwargs)
+
+        self.model = TFIdeficsMainLayer(config, name="model")
+
+    def call(
+        self,
+        input_ids: TFModelInputType | None = None,
+        attention_mask: tf.Tensor | None = None,
+        position_ids: tf.Tensor | None = None,
+        past_key_values: list[tf.Tensor] | None = None,
+        inputs_embeds: tf.Tensor | None = None,
+        pixel_values: tf.Tensor | None = None,
+        image_encoder_embeddings: tf.Tensor | None = None,
+        perceiver_embeddings: tf.Tensor | None = None,
+        image_attention_mask: tf.Tensor | None = None,
+        use_cache: bool | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        interpolate_pos_encoding: bool | None = False,
+        return_dict: bool | None = None,
+        training: bool | None = None,
+    ) -> TFIdeficsBaseModelOutputWithPast | tuple[tf.Tensor]:
+        outputs = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            pixel_values=pixel_values,
+            image_encoder_embeddings=image_encoder_embeddings,
+            perceiver_embeddings=perceiver_embeddings,
+            image_attention_mask=image_attention_mask,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            interpolate_pos_encoding=interpolate_pos_encoding,
+            return_dict=return_dict,
+            training=training,
+        )
+        return outputs
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "model", None) is not None:
+            with tf.name_scope(self.model.name):
+                self.model.build(None)
+
+
+class TFIdeficsForVisionText2Text(TFPreTrainedModel, TFCausalLanguageModelingLoss):
+    # Main layer (`self.model`) plus a `TFIdeficsDecoupledLinear` head (`self.lm_head`) that maps the final
+    # hidden states to logits. `call` also computes a shifted next-token loss when `labels` are given.
+    _keys_to_ignore_on_load_missing = [r"lm_head.weight"]
+    _tied_weights_keys = ["model.embed_tokens.weight", "lm_head.weight"]
+    config_class = IdeficsConfig
+
+    def __init__(self, config, vision_model=None, **kwargs):
+        # NOTE: `vision_model` is accepted but not used anywhere in this constructor.
+        super().__init__(config, **kwargs)
+        self.model = TFIdeficsMainLayer(config, name="model")
+        self.lm_head = TFIdeficsDecoupledLinear(
+            config.hidden_size,
+            config.vocab_size,
+            config.additional_vocab_size,
+            bias=False,
+            partially_freeze=config.freeze_lm_head,
+            name="lm_head",
+        )
+
+    def tie_weights(self):
+        """
+        Overwrite `transformers.modeling_utils.PreTrainedModel.tie_weights` to handle the case of
+        IdeficsDecoupledLinear and IdeficsDecoupledEmbedding.
+        """
+        output_embeddings = self.get_output_embeddings()
+        input_embeddings = self.get_input_embeddings()
+
+        # Tying is on unless the config explicitly disables it (a missing attribute counts as True). The
+        # additional-vocabulary weights are shared too when the input embeddings have any.
+        if getattr(self.config, "tie_word_embeddings", True):
+            output_embeddings.weight = input_embeddings.weight
+            if input_embeddings.num_additional_embeddings > 0:
+                assert output_embeddings.out_additional_features == input_embeddings.num_additional_embeddings
+                output_embeddings.additional_fc.weight = input_embeddings.additional_embedding.weight
+
+        # Independently of tying, copy the size attributes from the input embeddings onto the output layer when
+        # both objects expose them.
+        if hasattr(output_embeddings, "out_features") and hasattr(input_embeddings, "num_embeddings"):
+            output_embeddings.out_features = input_embeddings.num_embeddings
+            if hasattr(output_embeddings, "out_additional_features") and hasattr(
+                input_embeddings, "num_additional_embeddings"
+            ):
+                output_embeddings.out_additional_features = input_embeddings.num_additional_embeddings
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=TFIdeficsCausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
+    def call(
+        self,
+        input_ids: TFModelInputType | None = None,
+        attention_mask: tf.Tensor | None = None,
+        position_ids: tf.Tensor | None = None,
+        past_key_values: list[tf.Tensor] | None = None,
+        inputs_embeds: tf.Tensor | None = None,
+        pixel_values: tf.Tensor | None = None,
+        image_encoder_embeddings: tf.Tensor | None = None,
+        perceiver_embeddings: tf.Tensor | None = None,
+        image_attention_mask: tf.Tensor | None = None,
+        labels: tf.Tensor | None = None,
+        use_cache: bool | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        interpolate_pos_encoding: bool | None = False,
+        return_dict: bool | None = None,
+        training=False,
+    ) -> TFIdeficsCausalLMOutputWithPast | tuple[tf.Tensor]:
+        r"""
+            labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+        Returns:
+
+        Example:
+
+        ```python
+        >> from transformers import AutoTokenizer, TFIdeficsForVisionText2Text
+
+        >> model = TFIdeficsForVisionText2Text.from_pretrained("HuggingFaceM4/idefics-9b")
+        >> tokenizer = AutoTokenizer.from_pretrained("HuggingFaceM4/idefics-9b")
+
+        >> prompt = "Hey, are you consciours? Can you talk to me?"
+        >> inputs = tokenizer(prompt, return_tensors="tf")
+
+        >> # Generate
+        >> generate_ids = model.generate(inputs.input_ids, max_length=30)
+        >> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "Hey, are you consciours? Can you talk to me?\nI'm not consciours, but I can talk to you."
+        ```"""
+
+        # Flags left as None fall back to the values stored on the config.
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
+        outputs = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            pixel_values=pixel_values,
+            image_encoder_embeddings=image_encoder_embeddings,
+            perceiver_embeddings=perceiver_embeddings,
+            image_attention_mask=image_attention_mask,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            interpolate_pos_encoding=interpolate_pos_encoding,
+            return_dict=return_dict,
+            training=training,
+        )
+
+        # Only the first output (the final hidden states) is fed to the LM head.
+        hidden_states = outputs[0]
+        logits = self.lm_head(hidden_states)
+
+        loss = None
+        if labels is not None:
+            # Shift so that tokens < n predict n
+            if attention_mask is not None:
+                # Keep only the positions whose shifted attention-mask entry is non-zero.
+                shift_attention_mask = attention_mask[..., 1:]
+                shift_logits = logits[..., :-1, :][shift_attention_mask != 0]
+                shift_labels = labels[..., 1:][shift_attention_mask != 0]
+            else:
+                shift_logits = logits[..., :-1, :]
+                shift_labels = labels[..., 1:]
+            # Flatten the tokens
+            loss = self.hf_compute_loss(
+                labels=tf.reshape(shift_labels, [-1]), logits=tf.reshape(shift_logits, [-1, shift_logits.shape[-1]])
+            )
+
+        if not return_dict:
+            # Tuple output: the logits take the place of the hidden states in slot 0, and the loss is prepended
+            # only when it was computed.
+            output = (logits,) + outputs[1:]
+            return (loss,) + output if loss is not None else output
+
+        return TFIdeficsCausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            image_hidden_states=outputs.image_hidden_states,
+        )
+
+    def prepare_inputs_for_generation(self, input_ids, past=None, **kwargs):
+        """
+        Build the model inputs for one generation step.
+
+        When `image_hidden_states` is present in `kwargs` it is re-routed to `perceiver_embeddings` (if
+        `config.use_resampler`) or `image_encoder_embeddings`, and `pixel_values` is set to `None`. The rest is
+        delegated to the module-level `prepare_inputs_for_generation`; `token_type_ids` is removed from its result.
+        """
+        image_hidden_states = kwargs.pop("image_hidden_states", None)
+        if image_hidden_states is not None:
+            if self.config.use_resampler:
+                kwargs["perceiver_embeddings"] = image_hidden_states
+            else:
+                kwargs["image_encoder_embeddings"] = image_hidden_states
+            kwargs["pixel_values"] = None
+        inputs = prepare_inputs_for_generation(input_ids, past=past, **kwargs)
+        unwanted_kwargs = ["token_type_ids"]
+        for kwarg in unwanted_kwargs:
+            inputs.pop(kwarg, None)
+        return inputs
+
+    @staticmethod
+    def _expand_inputs_for_generation(
+        *args,
+        **model_kwargs,
+    ):
+        """Delegate unchanged to the module-level `expand_inputs_for_generation`."""
+        return expand_inputs_for_generation(*args, **model_kwargs)
+
+    @staticmethod
+    def _update_model_kwargs_for_generation(outputs, model_kwargs, is_encoder_decoder):
+        """Delegate to the module-level `update_model_kwargs_for_generation`; `is_encoder_decoder` is not forwarded."""
+        return update_model_kwargs_for_generation(outputs, model_kwargs)
+
+    @staticmethod
+    def _reorder_cache(past, beam_idx):
+        """Return a new cache in which every tensor of every layer entry has been gathered with `beam_idx`."""
+        reordered_past = ()
+        for layer_past in past:
+            reordered_past += (tuple(tf.gather(past_state, beam_idx) for past_state in layer_past),)
+        return reordered_past
+
+    def build(self, input_shape=None):
+        """Build `self.model` and `self.lm_head` once, each inside a name scope carrying its own name."""
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "model", None) is not None:
+            with tf.name_scope(self.model.name):
+                self.model.build(None)
+        if getattr(self, "lm_head", None) is not None:
+            with tf.name_scope(self.lm_head.name):
+                self.lm_head.build(None)
+
+
+# Names exported when this module is star-imported.
+__all__ = ["TFIdeficsForVisionText2Text", "TFIdeficsModel", "TFIdeficsPreTrainedModel"]
diff --git a/phivenv/Lib/site-packages/transformers/models/idefics/perceiver.py b/phivenv/Lib/site-packages/transformers/models/idefics/perceiver.py
new file mode 100644
index 0000000000000000000000000000000000000000..c7372c6e724cf8d0600f58bb7263405e8d608a79
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/idefics/perceiver.py
@@ -0,0 +1,189 @@
+# This code was adapted from https://github.com/lucidrains/flamingo-pytorch licensed under the MIT License.
+#
+# MIT License
+#
+# Copyright (c) 2020  The Google AI Language Team Authors, The HuggingFace Inc. team and github/lonePatient
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+
+"""
+
+Generic interface to various configurations of the Perceiver Resampler, that simply takes in a series of (potentially
+time-indexed) contextual embeddings, and "resamples" (compresses) them down to a pre-specified number of latents! Note
+that the Perceiver in general resamples based solely off the *long-range* context; there's a nice opportunity here to
+prime the Perceiver Resampler with say a single layer's worth of language embeddings (the target domain), and use that
+to softly "retrieve & compress" what we need --> this would be a novel contribution we should explore.
+
+References:
+    - DeepMind's Flamingo: https://www.deepmind.com/blog/tackling-multiple-tasks-with-a-single-visual-language-model
+    - Code borrowed w/ love from: https://github.com/lucidrains/flamingo-pytorch
+
+"""
+
+from typing import Optional
+
+import torch
+import torch.nn as nn
+
+from .configuration_idefics import IdeficsConfig
+
+
+class IdeficsPerceiverResampler(nn.Module):
+    """
+    Resamples a sequence of context embeddings down to `n_latents` learned latent embeddings.
+
+    `forward` expands the learned `latents` to the batch size of `context`, refines them through `depth`
+    (attention, MLP) pairs with residual connections, and applies a final LayerNorm.
+    """
+
+    def __init__(
+        self, config: IdeficsConfig, embed_dim: int, depth: int, n_heads: int, head_dim: int, n_latents: int
+    ) -> None:
+        """
+        Instantiates a Perceiver Resampler that operates over a sequence of embeddings (say from a ResNet or ViT or
+        MAE) of a given dimension, performs `depth` blocks of cross-attention with a fixed `n_latents` inputs, then
+        returns a Tensor of shape [bsz, n_latents, embed_dim].
+
+        Args:
+            config (`IdeficsConfig`): config object
+            embed_dim (`int`):
+                The size of each embedding vector fed to the Perceiver Resampler (also the dimensionality of the
+                latent embeddings *returned* by it). Could be e.g., VIT embed_dim, ResNet pool dim, and so on.
+            depth (`int`): Depth of the Perceiver Resampler (Transformer w/ cross attention). Should be shallow (< 3).
+            n_heads (`int`): Number of heads in each Transformer block (for multi-headed self-attention).
+            head_dim (`int`): Dimensionality of each head projection in the Transformer block.
+            n_latents (`int`):
+                Number of latent embeddings to resample ("compress") the input sequence to (usually < 128).
+
+        """
+        super().__init__()
+        self.embed_dim, self.n_heads, self.head_dim, self.n_latents = embed_dim, n_heads, head_dim, n_latents
+        self.qk_layer_norms = config.perceiver_config.qk_layer_norms_perceiver
+
+        # Create Latents for Perceiver
+        self.latents = nn.Parameter(torch.randn(self.n_latents, self.embed_dim), requires_grad=True)
+
+        # MLP width is 4x the vision config's `embed_dim` when that attribute exists, else 4x `embed_dim`.
+        # NOTE(review): `IdeficsMLP` reads `config.vision_config.embed_dim` unconditionally, so the fallback
+        # branch here presumably cannot be exercised without failing there -- confirm.
+        self.intermediate_dim = (
+            self.embed_dim * 4
+            if not hasattr(config.vision_config, "embed_dim")
+            else config.vision_config.embed_dim * 4
+        )
+        # Create Transformer Blocks
+        # Each block is an (attention, MLP) pair; `forward` unpacks the pairs in this order.
+        self.blocks = nn.ModuleList(
+            [
+                nn.ModuleList(
+                    [
+                        IdeficsPerceiverAttention(self.embed_dim, self.n_heads, self.head_dim, self.qk_layer_norms),
+                        IdeficsMLP(self.intermediate_dim, config),
+                    ]
+                )
+                for _ in range(depth)
+            ]
+        )
+        self.layer_norm = nn.LayerNorm(self.embed_dim)
+
+    def forward(self, context: torch.Tensor) -> torch.Tensor:
+        """Resample arbitrary length context & *compress* down to self.n_latents latent embeddings"""
+        # einsum.repeat(self.latents, "seq embed -> bsz seq embed", bsz=context.shape[0])
+        # One copy of the learned latents per batch element of `context`.
+        latents = self.latents.repeat(context.shape[0], 1, 1)
+
+        # Feed through Perceiver Attention blocks...
+        # Both the attention and the MLP output are added back onto the latents (residual connections).
+        for attn, ff in self.blocks:
+            latents = attn(context, latents) + latents
+            latents = ff(latents) + latents
+
+        return self.layer_norm(latents)
+
+
+class IdeficsPerceiverAttention(nn.Module):
+    """
+    Multi-head attention in which the queries come from `latents` and the keys/values come from the
+    concatenation of `context` and `latents` along the sequence axis.
+    """
+
+    def __init__(self, embed_dim: int, n_heads: int, head_dim: int, qk_layer_norms: bool) -> None:
+        """Perceiver Cross-Attention Module --> let long-form inputs be `context`, resampled embeddings be `latents`"""
+        super().__init__()
+        self.embed_dim, self.n_heads, self.head_dim = embed_dim, n_heads, head_dim
+        self.qk_layer_norms = qk_layer_norms
+        # Normalization & Scaling
+        self.context_layer_norm = nn.LayerNorm(self.embed_dim)
+        self.latents_layer_norm = nn.LayerNorm(self.embed_dim)
+        # Optional per-head LayerNorm on queries and keys; these modules only exist when the flag is set.
+        if self.qk_layer_norms:
+            self.q_layer_norm = nn.LayerNorm(self.head_dim)
+            self.k_layer_norm = nn.LayerNorm(self.head_dim)
+
+        # Queries are multiplied by head_dim ** -0.5 before the dot product with the keys.
+        self.qk_scale = self.head_dim**-0.5
+
+        # Q, K, V Projection (no bias -- detail from Perceiver/Flamingo Papers).
+        self.q_proj = nn.Linear(self.embed_dim, self.n_heads * self.head_dim, bias=False)
+        self.k_proj = nn.Linear(self.embed_dim, self.n_heads * self.head_dim, bias=False)
+        self.v_proj = nn.Linear(self.embed_dim, self.n_heads * self.head_dim, bias=False)
+
+        self.output_proj = nn.Linear(self.n_heads * self.head_dim, embed_dim, bias=False)
+
+    def forward(self, context: torch.Tensor, latents: torch.Tensor) -> torch.Tensor:
+        """
+        Runs Perceiver attention: queries are projected from `latents`, keys and values from `context` and
+        `latents` concatenated along the `seq` dimension.
+
+        Args:
+            context (`torch.Tensor`):
+                Tensor of shape `[bsz, seq, embed_dim]` representing long-form context to resample.
+            latents (`torch.Tensor`):
+                Tensor of shape `[bsz, n_latents, embed_dim]` representing fixed length latents to compress to.
+
+        Returns:
+            `torch.Tensor`: Tensor of shape `[bsz, n_latents, embed_dim]` representing attention over latents w/ cross
+            from context.
+        """
+        context = self.context_layer_norm(context)
+        latents = self.latents_layer_norm(latents)
+        # Only `batch_size` is used below; `seq_length` and `embed_dim` are unpacked but never read.
+        batch_size, seq_length, embed_dim = context.shape[:3]
+
+        # Query, Key, Value Projections --> Note that in Flamingo, latents are *concatenated* with context prior to attn!
+        #   Note: This results in queries w/ `seq = n_latents`, and keys, values with `seq = len(context) + n_latents`
+        q = self.q_proj(latents)
+        k = self.k_proj(torch.cat([context, latents], dim=-2))
+        v = self.v_proj(torch.cat([context, latents], dim=-2))
+
+        # Multiheaded Self-Attention w/ stable softmax (subtract per-row max -- `amax` -- before softmax call)
+        #   =>> `attn` should be a 2D matrix of shape [n_latents x (context + n_latents)]
+        # einsum.rearrange(x, "bsz seq (heads embed) -> bsz heads seq embed", heads=self.n_heads)
+        q, k, v = [x.reshape(batch_size, x.shape[1], self.n_heads, self.head_dim).transpose(1, 2) for x in (q, k, v)]
+
+        if self.qk_layer_norms:
+            q = self.q_layer_norm(q)
+            k = self.k_layer_norm(k)
+
+        # Scaled dot-product scores; the detached row maximum is subtracted before the softmax.
+        scores = torch.einsum("... i d, ... j d -> ... i j", q * self.qk_scale, k)
+        stabilized_scores = scores - (scores.amax(dim=-1, keepdim=True).detach())
+        attn = stabilized_scores.softmax(dim=-1)
+
+        # Attend & project back to output...
+        resampled = torch.einsum("... i j, ... j d -> ... i d", attn, v)
+        # einsum.rearrange(resampled, "bsz heads seq embed -> bsz seq (heads embed)", heads=self.n_heads)
+        return self.output_proj(resampled.transpose(1, 2).flatten(-2))
+
+
+class IdeficsMLP(nn.Module):
+    def __init__(self, intermediate_size, config: IdeficsConfig):
+        """Simple MLP block with intermediate_size and embedding size"""
+        super().__init__()
+        self.embed_dim = config.vision_config.embed_dim
+        self.ln = nn.LayerNorm(self.embed_dim)
+        self.fc = nn.Linear(self.embed_dim, intermediate_size, bias=False)
+        self.act = nn.ReLU()
+        self.c_proj = nn.Linear(intermediate_size, self.embed_dim, bias=False)
+
+    def forward(self, hidden_states: Optional[tuple[torch.FloatTensor]]) -> torch.FloatTensor:
+        hidden_states = self.ln(hidden_states)
+        hidden_states = self.fc(hidden_states)
+        hidden_states = self.act(hidden_states)
+        hidden_states = self.c_proj(hidden_states)
+
+        return hidden_states
diff --git a/phivenv/Lib/site-packages/transformers/models/idefics/perceiver_tf.py b/phivenv/Lib/site-packages/transformers/models/idefics/perceiver_tf.py
new file mode 100644
index 0000000000000000000000000000000000000000..a4de96b68e780828ff8106de800ed8b7d3d1469f
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/idefics/perceiver_tf.py
@@ -0,0 +1,195 @@
+# This code was adapted from https://github.com/lucidrains/flamingo-pytorch licensed under the MIT License.
+#
+# MIT License
+#
+# Copyright (c) 2020  The Google AI Language Team Authors, The HuggingFace Inc. team and github/lonePatient
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+
+"""
+
+Generic interface to various configurations of the Perceiver Resampler, that simply takes in a series of (potentially
+time-indexed) contextual embeddings, and "resamples" (compresses) them down to a pre-specified number of latents! Note
+that the Perceiver in general resamples based solely off the *long-range* context; there's a nice opportunity here to
+prime the Perceiver Resampler with say a single layer's worth of language embeddings (the target domain), and use that
+to softly "retrieve & compress" what we need --> this would be a novel contribution we should explore.
+
+References:
+    - DeepMind's Flamingo: https://www.deepmind.com/blog/tackling-multiple-tasks-with-a-single-visual-language-model
+    - Code borrowed w/ love from: https://github.com/lucidrains/flamingo-pytorch
+
+"""
+
+from typing import Optional
+
+import tensorflow as tf
+
+from ...modeling_tf_utils import shape_list
+from .configuration_idefics import IdeficsConfig
+
+
+class TFIdeficsPerceiverResampler(tf.keras.layers.Layer):
+    def __init__(
+        self, config: IdeficsConfig, embed_dim: int, depth: int, n_heads: int, head_dim: int, n_latents: int, **kwargs
+    ) -> None:
+        """
+        Instantiates a Perceiver Resampler that operates over a sequence of embeddings (say from a ResNet or ViT or
+        MAE) of a given dimension, performs `depth` blocks of cross-attention with a fixed `n_latents` inputs, then
+        returns a Tensor of shape [bsz, n_latents, embed_dim]. `embed_dim` is the dimensionality of the embeddings fed
+        to the Perceiver Resampler and also of the latent embeddings *returned* by it (e.g. ViT embed_dim, ResNet pool
+        dim, and so on).
+
+        Args:
+            config (`IdeficsConfig`): Supplies `perceiver_config.qk_layer_norms_perceiver` and `vision_config`.
+            embed_dim (`int`): The size of each embedding vector.
+            depth (`int`): Depth of the Perceiver Resampler (Transformer w/ cross attention). Should be shallow (< 3).
+            n_heads (`int`): Number of heads in each Transformer block (for multi-headed self-attention).
+            head_dim (`int`): Dimensionality of each head projection in the Transformer block.
+            n_latents (`int`):
+                Number of latent embeddings to resample ("compress") the input sequence to (usually < 128).
+
+        """
+        super().__init__(**kwargs)
+        self.embed_dim, self.n_heads, self.head_dim, self.n_latents = embed_dim, n_heads, head_dim, n_latents
+        self.qk_layer_norms = config.perceiver_config.qk_layer_norms_perceiver
+        # MLP hidden width: 4x `config.vision_config.embed_dim` when that attribute exists, otherwise 4x `embed_dim`.
+        self.intermediate_dim = (
+            self.embed_dim * 4
+            if not hasattr(config.vision_config, "embed_dim")
+            else config.vision_config.embed_dim * 4
+        )
+        # Create Transformer Blocks: `depth` pairs of [cross-attention, MLP], named `blocks.{i}.0` / `blocks.{i}.1`
+        self.blocks = []
+        for i in range(depth):
+            self.blocks.append(
+                [
+                    TFIdeficsPerceiverAttention(
+                        self.embed_dim, self.n_heads, self.head_dim, self.qk_layer_norms, name=f"blocks.{i}.0"
+                    ),
+                    TFIdeficsMLP(self.intermediate_dim, config, name=f"blocks.{i}.1"),
+                ]
+            )
+
+        self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="layer_norm")
+
+    def build(self, input_shape):
+        # Create the trainable latents for the Perceiver, shape (n_latents, embed_dim), "random_normal" initialised
+        self.latents = self.add_weight(
+            shape=(self.n_latents, self.embed_dim), initializer="random_normal", trainable=True, name="latents"
+        )
+        super().build(input_shape)
+
+    def call(self, context: tf.Tensor) -> tf.Tensor:
+        """Resample arbitrary length context & *compress* down to self.n_latents latent embeddings"""
+        # Repeat the latents once per batch element: (n_latents, embed_dim) -> (bsz, n_latents, embed_dim)
+        latents = tf.expand_dims(self.latents, axis=0)
+        latents = tf.tile(latents, [tf.shape(context)[0], 1, 1])
+        # Feed through Perceiver Attention blocks; both sub-layers are wrapped in a residual connection
+        for attn, ff in self.blocks:
+            latents = attn(context, latents) + latents
+            latents = ff(latents) + latents
+        return self.layer_norm(latents)
+
+
+class TFIdeficsPerceiverAttention(tf.keras.layers.Layer):
+    def __init__(self, embed_dim: int, n_heads: int, head_dim: int, qk_layer_norms: bool, **kwargs) -> None:
+        """Perceiver Cross-Attention Module --> let long-form inputs be `context`, resampled embeddings be `latents`"""
+        super().__init__(**kwargs)
+        self.embed_dim, self.n_heads, self.head_dim = embed_dim, n_heads, head_dim
+        self.qk_layer_norms = qk_layer_norms
+        # Normalization & Scaling (the q/k norms are only created when `qk_layer_norms` is truthy)
+        self.context_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="context_layer_norm")
+        self.latents_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="latents_layer_norm")
+        if self.qk_layer_norms:
+            self.q_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="q_layer_norm")
+            self.k_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="k_layer_norm")
+
+        self.qk_scale = self.head_dim**-0.5  # 1 / sqrt(head_dim); multiplied into the queries in `call`
+
+        # Q, K, V Projection (no bias -- detail from Perceiver/Flamingo Papers).
+        self.q_proj = tf.keras.layers.Dense(self.n_heads * self.head_dim, use_bias=False, name="q_proj")
+        self.k_proj = tf.keras.layers.Dense(self.n_heads * self.head_dim, use_bias=False, name="k_proj")
+        self.v_proj = tf.keras.layers.Dense(self.n_heads * self.head_dim, use_bias=False, name="v_proj")
+
+        self.output_proj = tf.keras.layers.Dense(embed_dim, use_bias=False, name="output_proj")
+
+    def call(self, context: tf.Tensor, latents: tf.Tensor) -> tf.Tensor:
+        """
+        Runs Perceiver Self-Attention, with special (context, latents) appended along the `seq` dimension!
+
+        Args:
+            context (`tf.Tensor`):
+                Tensor of shape `[bsz, seq, embed_dim]` representing long-form context to resample.
+            latents (`tf.Tensor`):
+                Tensor of shape `[bsz, n_latents, embed_dim]` representing fixed length latents to compress to.
+
+        Returns:
+            `tf.Tensor`: Tensor of shape `[bsz, n_latents, embed_dim]` representing attention over latents w/ cross
+            from context.
+        """
+        context = self.context_layer_norm(context)
+        latents = self.latents_layer_norm(latents)
+        batch_size, seq_length, embed_dim = shape_list(context)  # only `batch_size` is used below
+
+        # Query, Key, Value Projections --> Note that in Flamingo, latents are *concatenated* with context prior to attn!
+        #   Note: This results in queries w/ `seq = n_latents`, and keys, values with `seq = len(context) + n_latents`
+        q = self.q_proj(latents)
+        k = self.k_proj(tf.concat([context, latents], axis=-2))
+        v = self.v_proj(tf.concat([context, latents], axis=-2))
+
+        # Multiheaded Self-Attention w/ stable softmax (subtract per-row max -- `amax` -- before softmax call)
+        #   =>> `attn` should be a 2D matrix of shape [n_latents x (context + n_latents)]
+        q, k, v = [
+            tf.transpose(tf.reshape(x, (batch_size, x.shape[1], self.n_heads, self.head_dim)), perm=[0, 2, 1, 3])
+            for x in (q, k, v)  # each becomes [bsz, n_heads, seq, head_dim]
+        ]
+
+        if self.qk_layer_norms:
+            q = self.q_layer_norm(q)
+            k = self.k_layer_norm(k)
+
+        scores = tf.einsum("... i d, ... j d -> ... i j", q * self.qk_scale, k)
+        stabilized_scores = scores - tf.reduce_max(scores, axis=-1, keepdims=True)
+        attn = tf.nn.softmax(stabilized_scores, axis=-1)
+
+        # Attend, merge the heads back into [bsz, seq, n_heads * head_dim] & project back to embed_dim...
+        resampled = tf.einsum("... i j, ... j d -> ... i d", attn, v)
+        return self.output_proj(
+            tf.reshape(tf.transpose(resampled, perm=[0, 2, 1, 3]), (batch_size, -1, self.n_heads * self.head_dim))
+        )
+
+
+class TFIdeficsMLP(tf.keras.layers.Layer):
+    def __init__(self, intermediate_size, config: IdeficsConfig, **kwargs):
+        """LayerNormalization followed by a two-layer ReLU MLP (Dense(intermediate_size) -> Dense(embed_dim), no bias)"""
+        super().__init__(**kwargs)
+        self.embed_dim = config.vision_config.embed_dim  # output width comes from the vision config
+        self.ln = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="ln")
+        self.fc = tf.keras.layers.Dense(intermediate_size, use_bias=False, name="fc")
+        self.act = tf.keras.layers.ReLU(name="act")
+        self.c_proj = tf.keras.layers.Dense(self.embed_dim, use_bias=False, name="c_proj")
+
+    def call(self, hidden_states: Optional[tuple[tf.Tensor]]) -> tf.Tensor:
+        hidden_states = self.ln(hidden_states)  # NOTE(review): fed straight to the norm despite the tuple annotation
+        hidden_states = self.fc(hidden_states)
+        hidden_states = self.act(hidden_states)
+        hidden_states = self.c_proj(hidden_states)  # project back to embed_dim
+
+        return hidden_states
diff --git a/phivenv/Lib/site-packages/transformers/models/idefics/processing_idefics.py b/phivenv/Lib/site-packages/transformers/models/idefics/processing_idefics.py
new file mode 100644
index 0000000000000000000000000000000000000000..259f35feb7ec17d65f52037ed59ce364adf2ad3e
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/idefics/processing_idefics.py
@@ -0,0 +1,528 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Processor class for IDEFICS.
+"""
+
+from typing import Callable, Optional, Union
+from urllib.parse import urlparse
+
+from ...feature_extraction_utils import BatchFeature
+from ...image_utils import ImageInput
+from ...processing_utils import (
+    ImagesKwargs,
+    ProcessingKwargs,
+    ProcessorMixin,
+    TextKwargs,
+    Unpack,
+)
+from ...tokenization_utils_base import PreTokenizedInput, TextInput
+from ...utils import is_tf_available, is_torch_available
+from ...utils.deprecation import deprecate_kwarg
+
+
+if is_torch_available():
+    import torch
+
+if is_tf_available():
+    import tensorflow as tf
+
+IMAGE_TOKEN = "<image>"
+
+
+class IdeficsImagesKwargs(ImagesKwargs, total=False):  # image-side kwargs of IdeficsProcessor.__call__; all optional
+    transform: Optional[Callable]  # e.g. the torchvision transform shown in the `__call__` training example
+    image_size: Optional[dict[str, int]]
+    image_mean: Optional[Union[float, list[float]]]
+    image_std: Optional[Union[float, list[float]]]
+
+
+class IdeficsTextKwargs(TextKwargs, total=False):  # text-side kwargs of IdeficsProcessor.__call__; all optional
+    add_eos_token: Optional[bool]  # when truthy, the tokenizer's EOS token is appended to every prompt
+    add_end_of_utterance_token: Optional[bool]  # None -> follows whether the tokenizer has "<end_of_utterance>"
+
+
+class IdeficsProcessorKwargs(ProcessingKwargs, total=False):  # passed to `_merge_kwargs` in IdeficsProcessor.__call__
+    text_kwargs: IdeficsTextKwargs
+    images_kwargs: IdeficsImagesKwargs
+    _defaults = {  # presumably the fallback values `_merge_kwargs` uses when the caller omits a key -- TODO confirm
+        "text_kwargs": {
+            "add_special_tokens": False,
+            "padding": "longest",
+            "add_eos_token": False,
+        },
+        "images_kwargs": {},
+        "common_kwargs": {"return_tensors": "pt"},
+    }
+
+
+# copied from m4.training.packing
+def incremental_to_binary_attention_mask(incremental_mask, return_tensors, num_classes=-1):
+    """One-hot encode integer image indices along a new last axis of size `num_classes`; entries equal to -1 (or
+    >= `num_classes`) become all-zero rows. The input is not modified. Raises `ValueError` for unknown frameworks."""
+    if return_tensors == "pt":
+        incremental_mask = incremental_mask.clone()  # the index assignments below must not touch the caller's tensor
+        if num_classes != -1:
+            incremental_mask[incremental_mask >= num_classes] = -1
+        negatives = incremental_mask == -1
+        incremental_mask[negatives] = 0  # one_hot needs non-negative indices; these rows are zeroed again below
+        attn_mask = torch.nn.functional.one_hot(incremental_mask, num_classes=num_classes)
+        attn_mask[negatives, :] = 0
+    elif return_tensors == "tf":
+        if num_classes != -1:
+            incremental_mask = tf.where(incremental_mask >= num_classes, -1, incremental_mask)
+        negatives = tf.equal(incremental_mask, -1)
+        incremental_mask = tf.where(negatives, 0, incremental_mask)
+        attn_mask = tf.one_hot(incremental_mask, depth=num_classes)
+        # Give `negatives` a trailing axis so that it broadcasts over the class axis of `attn_mask`.
+        attn_mask = tf.where(tf.expand_dims(negatives, -1), tf.zeros_like(attn_mask), attn_mask)
+    else:
+        raise ValueError(f"Unsupported `return_tensors` value {return_tensors!r}; expected 'pt' or 'tf'.")
+
+    return attn_mask
+
+
+# copied from m4.training.packing
+def image_attention_mask_for_packed_input_ids(input_ids, tokenizer, return_tensors):
+    """Dispatch to the "pt" or "tf" implementation; any other `return_tensors` value yields None."""
+    if return_tensors == "pt":
+        return image_attention_mask_for_packed_input_ids_pt(input_ids, tokenizer)
+    return image_attention_mask_for_packed_input_ids_tf(input_ids, tokenizer) if return_tensors == "tf" else None
+
+
+def image_attention_mask_for_packed_input_ids_pt(input_ids, tokenizer):  # -> (previous-image idx, next-image idx)
+    image_attention_mask = torch.full_like(input_ids, fill_value=-1)  # per token: index of the latest image so far
+    next_image_attention_mask = torch.full_like(input_ids, fill_value=-1)  # per token: index of the upcoming image
+    image_token_id = tokenizer.convert_tokens_to_ids(IMAGE_TOKEN)
+    eod_token_id = tokenizer.eos_token_id
+    for batch_idx in range(input_ids.size(0)):
+        count = -1  # stays -1 until the first image token of the row is met
+        seen_eod = False
+        for idx, token_id in enumerate(input_ids[batch_idx]):
+            if token_id == image_token_id:
+                count += 1
+                image_attention_mask[batch_idx][idx] = count
+                seen_eod = False
+            else:
+                image_attention_mask[batch_idx][idx] = count
+            # Tokens that follow an EOS are reset to -1 until the next image token clears the flag.
+            if seen_eod:
+                image_attention_mask[batch_idx][idx] = -1
+            # The flag is raised only after the write above, so the first EOS token keeps its image index.
+            if token_id == eod_token_id:
+                seen_eod = True
+    # Second pass, right to left, to find the image that comes next for each token.
+    for batch_idx in range(input_ids.size(0)):
+        count = -1
+        seen_eod = False
+        for idx in range(input_ids[batch_idx].size(0) - 1, -1, -1):
+            token_id = input_ids[batch_idx][idx]
+            if token_id == image_token_id:
+                count += 1
+                next_image_attention_mask[batch_idx][idx] = count
+                seen_eod = False
+            else:
+                next_image_attention_mask[batch_idx][idx] = count
+            # Here the flag is raised before the write, so the EOS token itself is also set to -1.
+            if token_id == eod_token_id:
+                seen_eod = True
+
+            if seen_eod:
+                next_image_attention_mask[batch_idx][idx] = -1
+        # Convert the right-to-left counts into left-to-right indices: v -> count - v (entries equal to -1 are kept).
+        non_negative_indices = next_image_attention_mask[batch_idx] != -1
+        next_image_attention_mask[batch_idx][non_negative_indices] -= count
+        next_image_attention_mask[batch_idx][non_negative_indices] *= -1
+
+    return image_attention_mask, next_image_attention_mask
+
+
+def image_attention_mask_for_packed_input_ids_tf(input_ids, tokenizer):  # -> (image mask, next-image mask)
+    image_token_id = tokenizer.convert_tokens_to_ids(IMAGE_TOKEN)
+    eod_token_id = tokenizer.eos_token_id
+    batch_size = tf.shape(input_ids)[0]
+    image_attention_mask = tf.fill(tf.shape(input_ids), -1)  # both masks start out as all -1
+    next_image_attention_mask = tf.fill(tf.shape(input_ids), -1)
+    # NOTE(review): unlike the "pt" variant this makes a single right-to-left pass -- confirm the results agree.
+    for batch_idx in range(batch_size):
+        count = -1
+        seen_eod = False
+        seq_length = tf.shape(input_ids)[1]
+        # Walk the row from its last token back to its first.
+        for idx in range(seq_length - 1, -1, -1):
+            token_id = input_ids[batch_idx, idx].numpy()  # presumably needs eager execution -- TODO confirm
+            if token_id == image_token_id:
+                count += 1
+                indices = [[batch_idx, idx]]
+                updates = [count]
+                image_attention_mask = tf.tensor_scatter_nd_update(image_attention_mask, indices, updates)
+                next_image_attention_mask = tf.tensor_scatter_nd_update(next_image_attention_mask, indices, updates)
+            elif token_id == eod_token_id and not seen_eod:
+                seen_eod = True
+                count = 0  # the counter restarts at the first EOS met (the right-most one in the row)
+                indices = [[batch_idx, idx]]
+                updates = [count]
+                next_image_attention_mask = tf.tensor_scatter_nd_update(next_image_attention_mask, indices, updates)
+            if seen_eod and token_id != eod_token_id:  # once an EOS was met, every non-EOS token is written as -1
+                indices = [[batch_idx, idx]]
+                updates = [-1]
+                next_image_attention_mask = tf.tensor_scatter_nd_update(next_image_attention_mask, indices, updates)
+    return image_attention_mask, next_image_attention_mask
+
+
+def is_url(string):
+    """Return True when `string` is a URL and nothing else: it must contain no space and must parse into both a
+    scheme and a network location."""
+    if " " in string:
+        return False
+    scheme, netloc = urlparse(string)[:2]
+    return bool(scheme and netloc)
+
+
+class IdeficsProcessor(ProcessorMixin):
+    r"""
+    Constructs a IDEFICS processor which wraps a LLama tokenizer and IDEFICS image processor into a single processor.
+
+    [`IdeficsProcessor`] offers all the functionalities of [`IdeficsImageProcessor`] and [`LlamaTokenizerFast`]. See
+    the docstring of [`~IdeficsProcessor.__call__`] and [`~IdeficsProcessor.decode`] for more information.
+
+    Args:
+        image_processor (`IdeficsImageProcessor`):
+            An instance of [`IdeficsImageProcessor`]. The image processor is a required input.
+        tokenizer (`LlamaTokenizerFast`):
+            An instance of [`LlamaTokenizerFast`]. The tokenizer is a required input.
+        image_size (`int`, *optional*, defaults to 224):
+            Image size (assuming a square image). Accepted but not read by `__init__`.
+        add_end_of_utterance_token (`str`, *optional*):
+            The string representation of token representing end of utterance. Accepted but not read by `__init__`.
+    """
+
+    attributes = ["image_processor", "tokenizer"]
+    image_processor_class = "IdeficsImageProcessor"
+    tokenizer_class = "LlamaTokenizerFast"
+
+    def __init__(self, image_processor, tokenizer=None, image_size=224, add_end_of_utterance_token=None, **kwargs):
+        if image_processor is None:
+            raise ValueError("You need to specify an `image_processor`.")
+        if tokenizer is None:
+            raise ValueError("You need to specify a `tokenizer`.")
+
+        super().__init__(image_processor, tokenizer)
+        self.current_processor = self.image_processor
+        self.image_token_id = (
+            tokenizer.image_token_id
+            if hasattr(tokenizer, "image_token")
+            else tokenizer.convert_tokens_to_ids(IMAGE_TOKEN)
+        )
+
+        self.default_image_dims = (
+            self.image_processor.image_num_channels,
+            self.image_processor.image_size,
+            self.image_processor.image_size,
+        )
+
+        self.tokenizer_was_trained_with_end_of_utterance_token = (
+            "<end_of_utterance>" in self.tokenizer.special_tokens_map.get("additional_special_tokens", [])
+        )
+
+    @deprecate_kwarg(old_name="prompts", version="5.0.0", new_name="text", raise_if_both_names=True)
+    def __call__(
+        self,
+        images: Union[ImageInput, list[ImageInput], str, list[str], list[list[str]]] = None,
+        text: Union[
+            TextInput,
+            PreTokenizedInput,
+            list[TextInput],
+            list[PreTokenizedInput],
+            list[list[TextInput]],
+            list[list[PreTokenizedInput]],
+        ] = None,
+        audio=None,
+        videos=None,
+        **kwargs: Unpack[IdeficsProcessorKwargs],
+    ) -> BatchFeature:
+        """This method takes batched or non-batched prompts made of text and images and converts them into prompts that
+        the model was trained on and prepares the image pixel values for the model to process.
+
+        Args:
+            images (`Union[ImageInput, list[ImageInput], str, list[str], list[list[str]]]`):
+                either a single image or a batched list of images - can be passed in when text contains only text prompts,
+                in order to use the image-text-to-text behavior.
+            text (`Union[list[TextInput], [list[list[TextInput]]]]`):
+                either a single prompt or a batched list of prompts - see the detailed description immediately after
+                the end of the arguments doc section.
+            return_tensors (`str` or `TensorType`, *optional*, defaults to `TensorType.PYTORCH`):
+                The type of tensors to return. Can be one of:
+                    - `TensorType.PYTORCH` or `'pt'` for `torch.Tensor` batches, or `'tf'` for `tf.Tensor` batches.
+
+        Returns:
+            a dict with entries: `input_ids`, `attention_mask`, `pixel_values`, `image_attention_mask` which can be
+            directly passed to `model.generate`
+
+        Detailed explanation:
+
+        Each entry in `text` is either a text to be passed as is or an image that will be processed.
+
+        An image can be either an image object (`PIL.Image`) or a url from which the image can be retrieved.
+
+        When the processor encounters an image it'll inject `<fake_token_around_image><image><fake_token_around_image>`
+        entry into the prompt.
+
+        Example:
+
+        ```python
+        checkpoint = "HuggingFaceM4/idefics-9b"
+        processor = AutoProcessor.from_pretrained(checkpoint)
+        url = "https://hips.hearstapps.com/hmg-prod/images/cute-photos-of-cats-in-grass-1593184777.jpg"
+        img = processor.image_processor.fetch_images([url])[0]
+
+        prompts = [
+            "User:",
+            img,
+            "Describe this image.\nAssistant: An image of two kittens in grass.\n",
+            "User:",
+            "https://hips.hearstapps.com/hmg-prod/images/dog-puns-1581708208.jpg",
+            "Describe this image.\nAssistant:",
+        ]
+
+        inputs = processor(text=prompts, return_tensors="pt")
+        generated_ids = model.generate(**inputs, max_length=100)
+        generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+        ```
+
+        In this example the `prompts` will be converted into:
+
+        ```
+        <s>User:<fake_token_around_image><image><fake_token_around_image>Describe this image.
+        Assistant: An image of two kittens in grass.
+        User:<fake_token_around_image><image><fake_token_around_image>Describe this image.
+        Assistant:'
+        ```
+
+        and the two images will be massaged using [`IdeficsImageProcessor.__call__`] method and placed inside the
+        `pixel_values` dict entry of the return value.
+
+        This example also exemplifies that images can be passed as objects or as text urls. It can be seen that the
+        first image is passed as object and the second one as a url.
+
+        To do training do:
+
+        ```python
+        image_transform = transforms.Compose(
+            [
+                transforms.RandomResizedCrop(
+                    (w, h), scale=(0.9, 1.0), interpolation=transforms.InterpolationMode.BICUBIC
+                ),
+                transforms.ToTensor(),
+                transforms.Normalize(mean=self.image_mean, std=self.image_std),
+            ]
+        )
+        inputs = processor(text=prompts, transform=image_transform, return_tensors="pt")
+        ```
+
+        `text` is always required: passing `images` without `text` is not supported and raises a `ValueError`.
+
+        """
+        # `text` is mandatory: without it neither branch below assigns `prompts` (images-only is not supported).
+        if text is None:
+            raise ValueError("You need to specify either `text` or `images` and `text`.")
+        if images is None:
+            # assuming the user wants to use the old behavior with prompts as the only argument
+            prompts = text
+        elif text is not None:
+            # Assuming image-text-to-text behavior:
+            # Check if batched images are provided
+            if not isinstance(images, (list, tuple)):
+                images = [images]
+            if isinstance(text, str):
+                text = [text]
+            # Check if batched images and text are in the correct format
+            if isinstance(text, (list, tuple)) and len(text) != len(images):
+                raise ValueError(
+                    "When providing both images and text arguments, the number of text prompts should be the same as the number of images."
+                    "If you want to have several images per prompt, images should be nested as such: images=[[img1, img2], [img3, img4], ...] for text=[prompt1, prompt2, ...]."
+                )
+            # Check that only text is present in the prompts
+            if not all(isinstance(i, str) for i in text):
+                raise ValueError("When using the image-text-to-text behavior, the prompts should only contain text.")
+            if isinstance(images[0], (list, tuple)):
+                # if nested images, nest text as well
+                text = [[i] for i in text]
+            prompts = list(zip(images, text))
+
+        output_kwargs = self._merge_kwargs(
+            IdeficsProcessorKwargs,
+            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
+            **kwargs,
+        )
+
+        add_eos_token = output_kwargs["text_kwargs"].pop("add_eos_token", False)
+        add_end_of_utterance_token = output_kwargs["text_kwargs"].pop("add_end_of_utterance_token", None)
+
+        # if the value isn't overridden by the user, check if the tokenizer was trained with this token and then use it
+        if add_end_of_utterance_token is None:
+            add_end_of_utterance_token = self.tokenizer_was_trained_with_end_of_utterance_token
+        # turn non-batched prompts into batched
+        if not any(isinstance(i, (list, tuple)) for i in prompts):
+            prompts = [prompts]
+
+        fake_token = "<fake_token_around_image>"
+        image_token = "<image>"
+        end_of_utterance_token = "<end_of_utterance>"
+
+        def image_tokens(last_was_image):
+            if last_was_image:
+                return image_token + fake_token
+            else:
+                return fake_token + image_token + fake_token
+
+        all_prompts = []
+        all_images = []
+        for sample in prompts:
+            # the model was trained on samples starting with <s>
+            full_text = f"{self.tokenizer.bos_token}"
+
+            # an image can either be an image object in the item or the url, everything else is a verbatim prompt text
+            image_objects = []
+            last_was_image = False
+            last_was_text = False
+            for i, item in enumerate(sample):
+                if i > 0:
+                    last_was_text = bool(not last_was_image)
+
+                if isinstance(item, str):
+                    item = item.strip(" ")
+                    if is_url(item):
+                        image = self.image_processor.fetch_images(item)
+                        full_text += image_tokens(last_was_image)
+                        image_objects.append(image)
+                        last_was_image = True
+                    else:
+                        # we add end_of_utterance_token between each subsequent text prompts (but not at the last one!)
+                        if add_end_of_utterance_token and last_was_text:
+                            full_text += end_of_utterance_token
+                        full_text += item
+                        last_was_image = False
+                else:
+                    # must be an image obj
+                    full_text += image_tokens(last_was_image)
+                    image_objects.append(item)
+                    last_was_image = True
+
+            if add_eos_token:
+                full_text += self.tokenizer.eos_token
+
+            image_objects = self.image_processor(image_objects, **output_kwargs["images_kwargs"])
+
+            all_prompts.append(full_text)
+            all_images.append(image_objects)
+
+        # For BC
+        return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", "pt")
+        text_encoding = self.tokenizer(all_prompts, **output_kwargs["text_kwargs"])
+        all_texts = text_encoding["input_ids"]
+        all_attention_masks = text_encoding["attention_mask"]
+
+        # max_num_images has to be at least 1 even when there are no images
+        max_num_images = max(len(x) for x in all_images)
+        max_num_images = max(1, max_num_images)
+
+        at_least_one_image = sum(len(x) for x in all_images) > 0
+        output_input_ids = []
+        output_images = []
+        output_attention_masks = []
+
+        for text_single, attention_mask, extracted_images in zip(all_texts, all_attention_masks, all_images):
+            padded_input_ids = text_single
+            image_count = padded_input_ids.count(self.image_token_id)
+            local_max_num_images = min(image_count, max_num_images)
+
+            current_images = extracted_images[:local_max_num_images]
+
+            if len(current_images) > 0:
+                if return_tensors == "pt":
+                    padded_image_tensor = torch.zeros(max_num_images, *current_images.size()[1:])
+                    padded_image_tensor[: current_images.size(0)] = current_images
+                elif return_tensors == "tf":
+                    # Assuming current_images is a TensorFlow tensor
+                    # Get the shape of current_images, excluding the first dimension
+                    image_shape = tf.shape(current_images)[1:]
+                    # Create a shape for the padded_image_tensor
+                    padded_shape = tf.concat([[max_num_images], image_shape], axis=0)
+                    # Create the padded_image_tensor of zeros
+                    padded_image_tensor = tf.zeros(padded_shape, dtype=current_images.dtype)
+                    # Get the number of images (assuming current_images has shape [num_images, height, width, channels])
+                    num_images = tf.shape(current_images)[0]
+                    # Update the padded_image_tensor with the values from current_images
+                    indices = tf.reshape(tf.range(num_images), (-1, 1))
+                    updates = current_images
+                    padded_image_tensor = tf.tensor_scatter_nd_update(padded_image_tensor, indices, updates)
+            else:
+                if return_tensors == "pt":
+                    padded_image_tensor = torch.zeros(max_num_images, *self.default_image_dims)
+                elif return_tensors == "tf":
+                    padded_image_tensor = tf.zeros((max_num_images, *self.default_image_dims))
+
+            output_images.append(padded_image_tensor)
+            if return_tensors == "pt":
+                output_input_ids.append(torch.tensor(padded_input_ids))
+                output_attention_masks.append(torch.tensor(attention_mask))
+            elif return_tensors == "tf":
+                output_input_ids.append(tf.convert_to_tensor(padded_input_ids, dtype=tf.int32))
+                output_attention_masks.append(attention_mask)
+
+        if return_tensors == "pt":
+            output_input_ids = torch.stack(output_input_ids)
+            output_images = torch.stack(output_images)
+            output_attention_masks = torch.stack(output_attention_masks)
+        elif return_tensors == "tf":
+            output_input_ids = tf.stack(output_input_ids)
+            output_images = tf.stack(output_images)
+            output_attention_masks = tf.stack(output_attention_masks)
+
+        if at_least_one_image:
+            image_attention_mask, _ = image_attention_mask_for_packed_input_ids(
+                output_input_ids, self.tokenizer, return_tensors
+            )
+            image_attention_mask = incremental_to_binary_attention_mask(
+                image_attention_mask, return_tensors, num_classes=max_num_images
+            )
+        else:
+            # in full language mode we set the image mask to all-0s
+            if return_tensors == "pt":
+                image_attention_mask = torch.zeros(
+                    output_input_ids.shape[0], output_input_ids.shape[1], 1, dtype=torch.bool
+                )
+            elif return_tensors == "tf":
+                image_attention_mask = tf.zeros(
+                    (output_input_ids.shape[0], output_input_ids.shape[1], 1), dtype=tf.bool
+                )
+        return BatchFeature(
+            data={
+                "input_ids": output_input_ids,
+                "attention_mask": output_attention_masks,
+                "pixel_values": output_images,
+                "image_attention_mask": image_attention_mask,
+            }
+        )
+
+    @property
+    def model_input_names(self):
+        tokenizer_input_names = self.tokenizer.model_input_names
+        image_processor_input_names = self.image_processor.model_input_names
+        return list(tokenizer_input_names + image_processor_input_names + ["image_attention_mask"])
+
+
+__all__ = ["IdeficsProcessor"]
diff --git a/phivenv/Lib/site-packages/transformers/models/idefics/vision.py b/phivenv/Lib/site-packages/transformers/models/idefics/vision.py
new file mode 100644
index 0000000000000000000000000000000000000000..8682ff047a8d6b7ff489c4f9b7b3c49652a35330
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/idefics/vision.py
@@ -0,0 +1,483 @@
+# coding=utf-8
+# Copyright 2021 The OpenAI Team Authors and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch IdeficsVision model: a copy of CLIPVisionModel using a simpler config object"""
+
+import math
+from dataclasses import dataclass
+from typing import Callable, Optional, Union
+
+import torch
+import torch.utils.checkpoint
+from torch import nn
+
+from ...activations import ACT2FN
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
+from ...utils import (
+    ModelOutput,
+    can_return_tuple,
+    logging,
+)
+from .configuration_idefics import IdeficsVisionConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+@dataclass
+class IdeficsVisionModelOutput(ModelOutput):
+    """
+    Base class for the vision model's outputs that also contains image embeddings of the pooling of the last hidden states.
+
+    Args:
+        image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when model is initialized with `with_projection=True`):
+            The image embeddings obtained by applying the projection layer to the pooler_output.
+        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+            Sequence of hidden-states at the output of the last layer of the model.
+        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
+            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
+
+            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
+        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`.
+
+            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
+            heads.
+    """
+
+    image_embeds: Optional[torch.FloatTensor] = None
+    last_hidden_state: Optional[torch.FloatTensor] = None
+    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
+    attentions: Optional[tuple[torch.FloatTensor, ...]] = None
+
+
+# Adapted from transformers.models.clip.modeling_clip.CLIPVisionEmbeddings
+class IdeficsVisionEmbeddings(nn.Module):
+    def __init__(self, config: IdeficsVisionConfig):
+        super().__init__()
+        self.config = config
+        self.embed_dim = config.hidden_size
+        self.image_size = config.image_size
+        self.patch_size = config.patch_size
+        # learnable embedding of the extra class token that `forward` prepends to the patch sequence
+        self.class_embedding = nn.Parameter(torch.randn(self.embed_dim))
+
+        self.patch_embedding = nn.Conv2d(
+            in_channels=config.num_channels,
+            out_channels=self.embed_dim,
+            kernel_size=self.patch_size,
+            stride=self.patch_size,
+            bias=False,
+        )
+        # one position per patch plus one for the class token; `position_ids` is a non-persistent buffer
+        self.num_patches = (self.image_size // self.patch_size) ** 2
+        self.num_positions = self.num_patches + 1
+        self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
+        self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)
+
+    # Heavily inspired from https://github.com/huggingface/transformers/blob/v4.33.0/src/transformers/models/vit/modeling_vit.py#L82
+    def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
+        """
+        Interpolate the pre-trained position encodings so that the model can be used on images whose size
+        differs from the pre-training resolution (e.g. higher resolution images).
+
+        Source:
+        https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174
+        """
+
+        num_patches = embeddings.shape[1] - 1
+        pos_embed = self.position_embedding(self.position_ids)
+        num_positions = pos_embed.shape[1] - 1
+        if num_patches == num_positions and height == width:
+            return pos_embed
+        class_pos_embed = pos_embed[:, 0]
+        patch_pos_embed = pos_embed[:, 1:]
+
+        embed_dim = embeddings.shape[-1]
+        num_h_patches = height // self.config.patch_size
+        num_w_patches = width // self.config.patch_size
+        # we add a small number to avoid floating point error in the interpolation
+        # see discussion at https://github.com/facebookresearch/dino/issues/8
+        num_h_patches, num_w_patches = num_h_patches + 0.1, num_w_patches + 0.1
+        sqrt_num_positions = math.sqrt(num_positions)
+        patch_pos_embed = patch_pos_embed.reshape(1, int(sqrt_num_positions), int(sqrt_num_positions), embed_dim)
+        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)
+        fp32_upcasting = patch_pos_embed.dtype == torch.bfloat16
+        if fp32_upcasting:
+            logger.warning_once(
+                "Upcasting patch_pos_embed to fp32 for interpolation since `upsample_bicubic2d_out_frame` in nn.functional.interpolate "
+                "is not implemented for 'torch.bfloat16' dtype. This will result in a slight overhead."
+            )
+            patch_pos_embed = patch_pos_embed.to(torch.float)
+        patch_pos_embed = nn.functional.interpolate(
+            patch_pos_embed,
+            scale_factor=(num_h_patches / sqrt_num_positions, num_w_patches / sqrt_num_positions),
+            mode="bicubic",
+            align_corners=False,
+        )
+        if fp32_upcasting:
+            patch_pos_embed = patch_pos_embed.to(torch.bfloat16)
+        if int(num_h_patches) != patch_pos_embed.shape[-2] or int(num_w_patches) != patch_pos_embed.shape[-1]:
+            raise ValueError(
+                f"Number of patches for images ({int(num_h_patches), int(num_w_patches)}) don't match the "
+                f"shape of position embedding ({patch_pos_embed.shape[-2], patch_pos_embed.shape[-1]})"
+            )
+        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, embed_dim)
+        return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1)
+
+    def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding: bool = False) -> torch.Tensor:
+        batch_size, num_channels, height, width = pixel_values.shape
+        if not interpolate_pos_encoding:
+            if height != self.image_size or width != self.image_size:
+                raise ValueError(
+                    f"Input image size ({height}*{width}) doesn't match model"
+                    f" ({self.image_size}*{self.image_size}). You should try to set `interpolate_pos_encoding=True`"
+                )
+        # cast the input to the dtype of the patch-embedding weights before the convolution
+        target_dtype = self.patch_embedding.weight.dtype
+        patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype))  # shape = [batch, embed_dim, grid_h, grid_w]
+        # flatten the patch grid and move channels last -> [batch, num_patches, embed_dim]
+        patch_embeds = patch_embeds.flatten(2).transpose(1, 2)
+        # prepend the class token to every sequence in the batch
+        class_embeds = self.class_embedding.expand(batch_size, 1, -1)
+        embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
+
+        # add positional encoding to each token
+        if interpolate_pos_encoding:
+            embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
+        else:
+            embeddings = embeddings + self.position_embedding(self.position_ids)
+
+        return embeddings
+
+
+# Copied from transformers.models.siglip.modeling_siglip.eager_attention_forward
+def eager_attention_forward(
+    module: nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: Optional[torch.Tensor],
+    scaling: float,
+    dropout: float = 0.0,
+    **kwargs,
+):
+    attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling
+    if attention_mask is not None:
+        attn_weights = attn_weights + attention_mask  # the mask is additive: it is summed with the scaled scores
+    # softmax is computed in float32 and the result is cast back to the query dtype
+    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
+    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
+    # weighted sum of the values, then swap dims 1 and 2 and make the result contiguous
+    attn_output = torch.matmul(attn_weights, value)
+    attn_output = attn_output.transpose(1, 2).contiguous()
+
+    return attn_output, attn_weights
+
+
+class IdeficsVisionAttention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+    def __init__(self, config: IdeficsVisionConfig):
+        super().__init__()
+        self.config = config
+        self.embed_dim = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = self.embed_dim // self.num_heads
+        if self.head_dim * self.num_heads != self.embed_dim:
+            raise ValueError(
+                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
+                f" {self.num_heads})."
+            )
+        self.scale = self.head_dim**-0.5
+        self.dropout = config.attention_dropout
+        self.is_causal = False
+
+        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
+        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
+        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
+        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        causal_attention_mask: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = False,
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        """Input shape: Batch x Time x Channel"""
+
+        batch_size, seq_length, embed_dim = hidden_states.shape
+
+        queries = self.q_proj(hidden_states)
+        keys = self.k_proj(hidden_states)
+        values = self.v_proj(hidden_states)
+        # split the embedding dimension into heads -> (batch_size, num_heads, seq_length, head_dim)
+        queries = queries.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
+        keys = keys.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
+        values = values.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
+        # CLIP text model uses both `causal_attention_mask` and `attention_mask`
+        # in case FA2 kernel is called, `is_causal` should be inferred from `causal_attention_mask`
+        if self.config._attn_implementation != "flash_attention_2":
+            if attention_mask is not None and causal_attention_mask is not None:
+                attention_mask = attention_mask + causal_attention_mask
+            elif causal_attention_mask is not None:
+                attention_mask = causal_attention_mask
+        else:
+            self.is_causal = causal_attention_mask is not None  # NOTE(review): overwrites module state on each FA2 call
+        # default to the eager implementation; otherwise look up the configured attention function
+        attention_interface: Callable = eager_attention_forward
+        if self.config._attn_implementation != "eager":
+            if self.config._attn_implementation == "sdpa" and output_attentions:
+                logger.warning_once(
+                    "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
+                    'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
+                )
+            else:
+                attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+        attn_output, attn_weights = attention_interface(
+            self,
+            queries,
+            keys,
+            values,
+            attention_mask,
+            is_causal=self.is_causal,
+            scaling=self.scale,
+            dropout=0.0 if not self.training else self.dropout,
+        )
+        # merge the heads back into the embedding dimension before the output projection
+        attn_output = attn_output.reshape(batch_size, seq_length, embed_dim).contiguous()
+        attn_output = self.out_proj(attn_output)
+        if not output_attentions:
+            attn_weights = None  # attention weights are only returned when explicitly requested
+        return attn_output, attn_weights
+
+
+# Copied from transformers.models.clip.modeling_clip.CLIPMLP with CLIP->IdeficsVision
+class IdeficsVisionMLP(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.activation_fn = ACT2FN[config.hidden_act]
+        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
+        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)
+    # fc1 -> activation -> fc2, i.e. hidden_size -> intermediate_size -> hidden_size
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.fc1(hidden_states)
+        hidden_states = self.activation_fn(hidden_states)
+        hidden_states = self.fc2(hidden_states)
+        return hidden_states
+
+
+# Copied from transformers.models.altclip.modeling_altclip.AltCLIPEncoderLayer with AltCLIP->IdeficsVision
+class IdeficsVisionEncoderLayer(GradientCheckpointingLayer):
+    def __init__(self, config: IdeficsVisionConfig):
+        super().__init__()
+        self.embed_dim = config.hidden_size
+        self.self_attn = IdeficsVisionAttention(config)
+        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+        self.mlp = IdeficsVisionMLP(config)
+        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: torch.Tensor,
+        causal_attention_mask: torch.Tensor,
+        output_attentions: Optional[bool] = False,
+    ) -> tuple[torch.FloatTensor]:
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+            attention_mask (`torch.FloatTensor`): attention mask of size
+                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+            causal_attention_mask (`torch.FloatTensor`): passed through unchanged to the self-attention module.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+        """
+        residual = hidden_states
+        # pre-norm self-attention sub-block with a residual connection
+        hidden_states = self.layer_norm1(hidden_states)
+        hidden_states, attn_weights = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            causal_attention_mask=causal_attention_mask,
+            output_attentions=output_attentions,
+        )
+        hidden_states = residual + hidden_states
+        # pre-norm MLP sub-block with a residual connection
+        residual = hidden_states
+        hidden_states = self.layer_norm2(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states
+
+        outputs = (hidden_states,)
+
+        if output_attentions:
+            outputs += (attn_weights,)
+
+        return outputs
+
+
+# Copied from transformers.models.altclip.modeling_altclip.AltCLIPEncoder with AltCLIP->IdeficsVision
+class IdeficsVisionEncoder(nn.Module):
+    """
+    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
+    [`IdeficsVisionEncoderLayer`].
+
+    Args:
+        config: IdeficsVisionConfig
+    """
+
+    def __init__(self, config: IdeficsVisionConfig):
+        super().__init__()
+        self.config = config
+        self.layers = nn.ModuleList([IdeficsVisionEncoderLayer(config) for _ in range(config.num_hidden_layers)])
+        self.gradient_checkpointing = False
+
+    @can_return_tuple
+    def forward(
+        self,
+        inputs_embeds,
+        attention_mask: Optional[torch.Tensor] = None,
+        causal_attention_mask: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, BaseModelOutput]:
+        r"""
+        Args:
+            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+                Embedded input sequence that is fed to the first encoder layer. Within this file it is the
+                output of the patch/class/position embeddings after the pre-layernorm
+                (see `IdeficsVisionTransformer.forward`).
+            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
+
+                [What are attention masks?](../glossary#attention-mask)
+            causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Causal mask for the text model. Mask values selected in `[0, 1]`:
+
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
+
+                [What are attention masks?](../glossary#attention-mask)
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            output_hidden_states (`bool`, *optional*):
+                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+                for more detail.
+            return_dict (`bool`, *optional*):
+                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        # NOTE(review): `return_dict` is not read again below; tuple conversion is presumably done by `@can_return_tuple` - confirm
+        encoder_states = () if output_hidden_states else None
+        all_attentions = () if output_attentions else None
+
+        hidden_states = inputs_embeds
+        for idx, encoder_layer in enumerate(self.layers):  # NOTE(review): `idx` is not used in the loop body
+            if output_hidden_states:
+                encoder_states = encoder_states + (hidden_states,)
+            layer_outputs = encoder_layer(
+                hidden_states,
+                attention_mask,
+                causal_attention_mask,
+                output_attentions=output_attentions,
+            )
+
+            hidden_states = layer_outputs[0]
+
+            if output_attentions:
+                all_attentions = all_attentions + (layer_outputs[1],)
+
+        if output_hidden_states:
+            encoder_states = encoder_states + (hidden_states,)  # also record the output of the last layer
+
+        return BaseModelOutput(
+            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
+        )
+
+
+# Adapted from transformers.models.clip.modeling_clip.CLIPVisionTransformer
+class IdeficsVisionTransformer(nn.Module):
+    def __init__(self, config: IdeficsVisionConfig):
+        super().__init__()
+        self.config = config
+        embed_dim = config.hidden_size
+
+        self.embeddings = IdeficsVisionEmbeddings(config)
+        self.pre_layrnorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)  # NOTE(review): misspelled name is also the state-dict prefix; presumably kept for checkpoint compatibility
+        self.encoder = IdeficsVisionEncoder(config)
+        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
+
+    # Adapted from transformers.models.clip.modeling_clip.CLIPVisionTransformer.forward
+    def forward(
+        self,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        interpolate_pos_encoding: Optional[bool] = False,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, BaseModelOutputWithPooling]:
+        r"""
+        Embed `pixel_values`, apply the pre-layernorm and the encoder, and pool the class-token (index 0) state
+        through the post-layernorm. Returns a tuple when `return_dict` is false, else `BaseModelOutputWithPooling`.
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if pixel_values is None:
+            raise ValueError("You have to specify pixel_values")
+
+        hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding)
+        hidden_states = self.pre_layrnorm(hidden_states)
+
+        encoder_outputs = self.encoder(
+            inputs_embeds=hidden_states,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        # pooled output = post-layernorm of the first token of the last hidden state
+        last_hidden_state = encoder_outputs[0]
+        pooled_output = last_hidden_state[:, 0, :]
+        pooled_output = self.post_layernorm(pooled_output)
+
+        if not return_dict:
+            return (last_hidden_state, pooled_output) + encoder_outputs[1:]
+
+        return BaseModelOutputWithPooling(
+            last_hidden_state=last_hidden_state,
+            pooler_output=pooled_output,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+        )
diff --git a/phivenv/Lib/site-packages/transformers/models/idefics/vision_tf.py b/phivenv/Lib/site-packages/transformers/models/idefics/vision_tf.py
new file mode 100644
index 0000000000000000000000000000000000000000..9db18f265db5b42cb7296aa40d4600c3e7cbe022
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/idefics/vision_tf.py
@@ -0,0 +1,572 @@
+# coding=utf-8
+# Copyright 2021 The OpenAI Team Authors and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""TF IdeficsVision model: a copy of CLIPVisionModel using a simpler config object"""
+
+import math
+from dataclasses import dataclass
+from typing import Optional, Union
+
+import tensorflow as tf
+
+from ...activations_tf import get_tf_activation
+from ...modeling_tf_outputs import TFBaseModelOutput, TFBaseModelOutputWithPooling
+from ...modeling_tf_utils import TFPreTrainedModel, shape_list
+from ...tf_utils import flatten
+from ...utils import ModelOutput, logging
+from .configuration_idefics import IdeficsVisionConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+@dataclass
+class TFIdeficsVisionModelOutput(ModelOutput):
+    """
+    Base class for the vision model's outputs that also contains image embeddings of the pooling of the last hidden states.
+
+    Args:
+        image_embeds (`tf.Tensor` of shape `(batch_size, output_dim)`, *optional*, returned when model is initialized with `with_projection=True`):
+            The image embeddings obtained by applying the projection layer to the pooler_output.
+        last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
+            Sequence of hidden-states at the output of the last layer of the model.
+        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `tf.Tensor` (one for the output of the embeddings, if the model has an embedding layer, +
+            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
+
+            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
+        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`.
+
+            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
+            heads.
+    """
+
+    image_embeds: Optional[tf.Tensor] = None
+    last_hidden_state: Optional[tf.Tensor] = None
+    hidden_states: Optional[tuple[tf.Tensor]] = None
+    attentions: Optional[tuple[tf.Tensor]] = None
+
+
+class TFIdeficsVisionEmbeddings(tf.keras.layers.Layer):
+    def __init__(self, config: IdeficsVisionConfig, **kwargs):
+        super().__init__(**kwargs)
+        self.config = config
+        self.embed_dim = config.hidden_size
+        self.image_size = config.image_size
+        self.patch_size = config.patch_size
+
+        self.patch_embedding = tf.keras.layers.Conv2D(
+            filters=self.embed_dim,
+            kernel_size=self.patch_size,
+            strides=self.patch_size,
+            use_bias=False,
+            padding="valid",
+            data_format="channels_last",
+            name="patch_embedding",
+        )
+
+        self.num_patches = (self.image_size // self.patch_size) ** 2
+        self.num_positions = self.num_patches + 1
+        self.position_embedding = tf.keras.layers.Embedding(
+            self.num_positions, self.embed_dim, name="position_embedding"
+        )
+        # `position_ids` and `class_embedding` are created in `build()`.
+
+    def interpolate_pos_encoding(self, embeddings: tf.Tensor, height: int, width: int) -> tf.Tensor:
+        num_patches = shape_list(embeddings)[1] - 1
+        pos_embed = self.position_embedding(self.position_ids)
+        num_positions = shape_list(pos_embed)[1] - 1
+        if num_patches == num_positions and height == width:
+            return pos_embed
+        class_pos_embed = pos_embed[:, 0]
+        patch_pos_embed = pos_embed[:, 1:]
+
+        embed_dim = shape_list(embeddings)[-1]
+        num_h_patches = height // self.config.patch_size
+        num_w_patches = width // self.config.patch_size
+        num_h_patches, num_w_patches = num_h_patches + 0.1, num_w_patches + 0.1
+        sqrt_num_positions = math.sqrt(float(num_positions))
+        patch_pos_embed = tf.reshape(patch_pos_embed, (1, int(sqrt_num_positions), int(sqrt_num_positions), embed_dim))
+        # `tf.image.resize` is given a target size below, so turn the scale factors into a concrete size
+        scale_height = num_h_patches / sqrt_num_positions
+        scale_width = num_w_patches / sqrt_num_positions
+        original_height = tf.cast(tf.shape(patch_pos_embed)[1], tf.float32)
+        original_width = tf.cast(tf.shape(patch_pos_embed)[2], tf.float32)
+        # Apply scaling (the cast to int32 drops the fractional part introduced by the +0.1 offset)
+        new_height = tf.cast(original_height * scale_height, tf.int32)
+        new_width = tf.cast(original_width * scale_width, tf.int32)
+
+        patch_pos_embed = tf.image.resize(
+            patch_pos_embed, size=[new_height, new_width], method=tf.image.ResizeMethod.BICUBIC
+        )
+        # the grid is laid out as (1, height, width, embed_dim), so height/width are axes -3 and -2
+        if (
+            int(num_h_patches) != shape_list(patch_pos_embed)[-3]
+            or int(num_w_patches) != shape_list(patch_pos_embed)[-2]
+        ):
+            raise ValueError(
+                f"Number of patches for images ({int(num_h_patches), int(num_w_patches)}) don't match the "
+                f"shape of position embedding ({shape_list(patch_pos_embed)[-3], shape_list(patch_pos_embed)[-2]})"
+            )
+        patch_pos_embed = tf.reshape(patch_pos_embed, (1, -1, embed_dim))
+        return tf.concat((class_pos_embed[tf.newaxis, :], patch_pos_embed), axis=1)
+
+    def call(self, pixel_values: tf.Tensor, interpolate_pos_encoding: bool = False) -> tf.Tensor:
+        # Input `pixel_values` is NCHW format which doesn't run on CPU so first thing we do is
+        # transpose it to change it to NHWC. We don't care to transpose it back because
+        # the Conv2D layer is only hit once for each query
+
+        if isinstance(pixel_values, dict):
+            pixel_values = pixel_values["pixel_values"]
+
+        pixel_values = tf.transpose(pixel_values, perm=(0, 2, 3, 1))
+        batch_size, height, width, num_channels = shape_list(pixel_values)
+        if not interpolate_pos_encoding:
+            if height != self.image_size or width != self.image_size:
+                raise ValueError(
+                    f"Input image size ({height}*{width}) doesn't match model"
+                    f" ({self.image_size}*{self.image_size}). You should try to set `interpolate_pos_encoding=True`"
+                )
+
+        patch_embeds = self.patch_embedding(pixel_values)  # shape = [batch, grid_h, grid_w, embed_dim]
+        # Change the 2D spatial dimensions to a single temporal dimension.
+        # shape = (batch_size, num_patches, out_channels=embed_dim)
+        patch_embeds = flatten(patch_embeds, 1, 2)
+
+        class_embeds = tf.broadcast_to(
+            self.class_embedding[tf.newaxis, tf.newaxis, :], [batch_size, 1, self.embed_dim]
+        )
+        embeddings = tf.concat([class_embeds, patch_embeds], axis=1)
+
+        # add positional encoding to each token
+        if interpolate_pos_encoding:
+            embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
+        else:
+            embeddings = embeddings + self.position_embedding(self.position_ids)
+
+        return embeddings
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        self.position_ids = tf.range(self.num_positions, name="self.position_ids")[tf.newaxis, :]
+        self.class_embedding = self.add_weight(shape=(self.embed_dim,), name="class_embedding")
+        if getattr(self, "patch_embedding", None) is not None:
+            with tf.name_scope(self.patch_embedding.name):
+                self.patch_embedding.build([None, None, None, self.config.num_channels])
+        if getattr(self, "position_embedding", None) is not None:
+            with tf.name_scope(self.position_embedding.name):
+                self.position_embedding.build(None)
+
+class TFIdeficsVisionAttention(tf.keras.layers.Layer):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+    def __init__(self, config, **kwargs):
+        super().__init__(**kwargs)
+        self.config = config
+        self.embed_dim = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = self.embed_dim // self.num_heads
+        if self.head_dim * self.num_heads != self.embed_dim:
+            raise ValueError(
+                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
+                f" {self.num_heads})."
+            )
+        self.scale = self.head_dim**-0.5  # 1 / sqrt(head_dim), multiplied into the queries in call()
+        self.dropout = config.attention_dropout
+
+        self.k_proj = tf.keras.layers.Dense(self.embed_dim, name="k_proj")
+        self.v_proj = tf.keras.layers.Dense(self.embed_dim, name="v_proj")
+        self.q_proj = tf.keras.layers.Dense(self.embed_dim, name="q_proj")
+        self.out_proj = tf.keras.layers.Dense(self.embed_dim, name="out_proj")
+
+    def _shape(self, tensor: tf.Tensor, seq_len: int, bsz: int):  # reshape to (bsz, seq_len, heads, head_dim), then swap axes 1 and 2
+        return tf.transpose(tf.reshape(tensor, (bsz, seq_len, self.num_heads, self.head_dim)), perm=[0, 2, 1, 3])
+
+    def call(
+        self,
+        hidden_states: tf.Tensor,
+        attention_mask: Optional[tf.Tensor] = None,
+        causal_attention_mask: Optional[tf.Tensor] = None,
+        output_attentions: Optional[bool] = False,
+    ) -> tuple[tf.Tensor, Optional[tf.Tensor]]:
+        """Input shape: Batch x Time x Channel. Returns `(attn_output, attn_weights)`; the weights are `None` unless `output_attentions`."""
+
+        bsz, tgt_len, embed_dim = shape_list(hidden_states)
+
+        # get query proj (the 1/sqrt(head_dim) scaling is folded into the queries)
+        query_states = self.q_proj(hidden_states) * self.scale
+        key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
+        value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
+
+        proj_shape = (bsz * self.num_heads, -1, self.head_dim)  # batch and head axes merged for the batched matmuls
+        query_states = tf.reshape(self._shape(query_states, tgt_len, bsz), proj_shape)
+        key_states = tf.reshape(key_states, proj_shape)
+        value_states = tf.reshape(value_states, proj_shape)
+
+        src_len = shape_list(key_states)[1]
+        attn_weights = tf.linalg.matmul(query_states, key_states, transpose_b=True)
+
+        tf.debugging.assert_equal(
+            tf.shape(attn_weights),
+            [bsz * self.num_heads, tgt_len, src_len],
+            message=f"Attention weights should be of size {[bsz * self.num_heads, tgt_len, src_len]}, but is {tf.shape(attn_weights)}",
+        )
+
+        # apply the causal_attention_mask first
+        if causal_attention_mask is not None:
+            if shape_list(causal_attention_mask) != [bsz, 1, tgt_len, src_len]:
+                raise ValueError(
+                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is"
+                    f" {shape_list(causal_attention_mask)}"
+                )
+            attn_weights = tf.reshape(attn_weights, (bsz, self.num_heads, tgt_len, src_len)) + causal_attention_mask
+            attn_weights = tf.reshape(attn_weights, (bsz * self.num_heads, tgt_len, src_len))
+
+        if attention_mask is not None:
+            if shape_list(attention_mask) != [bsz, 1, tgt_len, src_len]:
+                raise ValueError(
+                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {shape_list(attention_mask)}"
+                )
+            attn_weights = tf.reshape(attn_weights, (bsz, self.num_heads, tgt_len, src_len)) + attention_mask
+            attn_weights = tf.reshape(attn_weights, (bsz * self.num_heads, tgt_len, src_len))
+
+        attn_weights = tf.nn.softmax(attn_weights, axis=-1)
+
+        if output_attentions:
+            # this operation is a bit awkward, but it's required to
+            # make sure that attn_weights keeps its gradient.
+            # In order to do so, attn_weights have to be reshaped
+            # twice and have to be reused in the following
+            attn_weights_reshaped = tf.reshape(attn_weights, (bsz, self.num_heads, tgt_len, src_len))
+            attn_weights = tf.reshape(attn_weights_reshaped, (bsz * self.num_heads, tgt_len, src_len))
+        else:
+            attn_weights_reshaped = None
+
+        attn_probs = tf.nn.dropout(attn_weights, rate=self.dropout)  # NOTE(review): runs on every call, there is no `training` gate here
+
+        attn_output = tf.linalg.matmul(attn_probs, value_states)
+
+        tf.debugging.assert_equal(
+            tf.shape(attn_output),
+            [bsz * self.num_heads, tgt_len, self.head_dim],
+            message=f"Attention output should be of size {[bsz * self.num_heads, tgt_len, self.head_dim]}, but is {tf.shape(attn_output)}",
+        )
+
+        attn_output = tf.reshape(attn_output, (bsz, self.num_heads, tgt_len, self.head_dim))
+        attn_output = tf.transpose(attn_output, perm=[0, 2, 1, 3])  # back to (bsz, tgt_len, heads, head_dim)
+        attn_output = tf.reshape(attn_output, (bsz, tgt_len, embed_dim))  # merge the heads again
+
+        attn_output = self.out_proj(attn_output)
+
+        return attn_output, attn_weights_reshaped
+
+    def build(self, input_shape=None):  # input_shape is accepted but never read in this method
+        if self.built:
+            return
+        self.built = True  # set first, so a repeated build() call returns immediately
+        if getattr(self, "k_proj", None) is not None:
+            with tf.name_scope(self.k_proj.name):
+                self.k_proj.build((self.embed_dim, self.embed_dim))
+        if getattr(self, "v_proj", None) is not None:
+            with tf.name_scope(self.v_proj.name):
+                self.v_proj.build((self.embed_dim, self.embed_dim))
+        if getattr(self, "q_proj", None) is not None:
+            with tf.name_scope(self.q_proj.name):
+                self.q_proj.build((self.embed_dim, self.embed_dim))
+        if getattr(self, "out_proj", None) is not None:
+            with tf.name_scope(self.out_proj.name):
+                self.out_proj.build((self.embed_dim, self.embed_dim))
+
+
+class TFIdeficsVisionMLP(tf.keras.layers.Layer):  # feed-forward block: fc1 -> activation -> fc2
+    def __init__(self, config, **kwargs):
+        super().__init__(**kwargs)
+        self.config = config
+        self.activation_fn = get_tf_activation(config.hidden_act)
+        self.fc1 = tf.keras.layers.Dense(config.intermediate_size, name="fc1")  # outputs intermediate_size features
+        self.fc2 = tf.keras.layers.Dense(config.hidden_size, name="fc2")  # outputs hidden_size features
+
+    def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
+        hidden_states = self.fc1(hidden_states)
+        hidden_states = self.activation_fn(hidden_states)
+        hidden_states = self.fc2(hidden_states)
+        return hidden_states
+
+    def build(self, input_shape=None):  # input_shape is accepted but never read in this method
+        if self.built:
+            return
+        self.built = True  # set first, so a repeated build() call returns immediately
+        if getattr(self, "fc1", None) is not None:
+            with tf.name_scope(self.fc1.name):
+                self.fc1.build(self.config.hidden_size)  # build() is handed the bare input feature size, not a shape list
+        if getattr(self, "fc2", None) is not None:
+            with tf.name_scope(self.fc2.name):
+                self.fc2.build(self.config.intermediate_size)  # fc2 consumes what fc1 produces
+
+
+class TFIdeficsVisionEncoderLayer(tf.keras.layers.Layer):  # layer norm -> self-attention -> residual, then layer norm -> MLP -> residual
+    def __init__(self, config: IdeficsVisionConfig, **kwargs):
+        super().__init__(**kwargs)
+        self.embed_dim = config.hidden_size
+        self.self_attn = TFIdeficsVisionAttention(config, name="self_attn")
+        self.layer_norm1 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm1")
+        self.mlp = TFIdeficsVisionMLP(config, name="mlp")
+        self.layer_norm2 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm2")
+
+    def call(
+        self,
+        hidden_states: tf.Tensor,
+        attention_mask: tf.Tensor,
+        causal_attention_mask: tf.Tensor,
+        output_attentions: Optional[bool] = False,
+    ) -> tuple[tf.Tensor]:
+        """
+        Args:
+            hidden_states (`tf.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+            attention_mask (`tf.Tensor`): attention mask of size
+                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+            causal_attention_mask (`tf.Tensor`): second mask, forwarded unchanged to the self-attention layer.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+        """
+        residual = hidden_states
+
+        hidden_states = self.layer_norm1(hidden_states)  # normalization happens before the attention sub-block
+        hidden_states, attn_weights = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            causal_attention_mask=causal_attention_mask,
+            output_attentions=output_attentions,
+        )
+        hidden_states = residual + hidden_states
+
+        residual = hidden_states
+        hidden_states = self.layer_norm2(hidden_states)  # normalization happens before the MLP sub-block
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states
+
+        outputs = (hidden_states,)
+
+        if output_attentions:
+            outputs += (attn_weights,)  # the attention weights are appended only on request
+
+        return outputs
+
+    def build(self, input_shape=None):  # input_shape is accepted but never read in this method
+        if self.built:
+            return
+        self.built = True  # NOTE(review): only the two layer norms are built below; self_attn and mlp are not — confirm intended
+        if getattr(self, "layer_norm1", None) is not None:
+            with tf.name_scope(self.layer_norm1.name):
+                self.layer_norm1.build([None, None, self.embed_dim])
+        if getattr(self, "layer_norm2", None) is not None:
+            with tf.name_scope(self.layer_norm2.name):
+                self.layer_norm2.build([None, None, self.embed_dim])
+
+
+class TFIdeficsVisionEncoder(tf.keras.layers.Layer):
+    """
+    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
+    [`TFIdeficsVisionEncoderLayer`].
+
+    Args:
+        config: IdeficsVisionConfig
+    """
+
+    def __init__(self, config: IdeficsVisionConfig, **kwargs):
+        super().__init__(**kwargs)
+        self.config = config
+        self.layers = [
+            TFIdeficsVisionEncoderLayer(config, name=f"layers.{i}") for i in range(config.num_hidden_layers)
+        ]
+        self.gradient_checkpointing = False
+
+    def call(
+        self,
+        inputs_embeds,
+        attention_mask: Optional[tf.Tensor] = None,
+        causal_attention_mask: Optional[tf.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        training: Optional[bool] = None,
+    ) -> Union[tuple, TFBaseModelOutput]:
+        r"""
+        Args:
+            inputs_embeds (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
+                Embedded inputs that are handed to the first encoder layer. `TFIdeficsVisionTransformer` passes the
+                layer-normalized output of its embeddings layer here; this encoder performs no embedding lookup of
+                its own.
+            attention_mask (`tf.Tensor` of shape `(batch_size, 1, sequence_length, sequence_length)`, *optional*):
+                Additive mask that each attention layer adds to its attention scores before the softmax:
+
+                - 0 for positions that are **not masked**,
+                - a very large negative value for positions that are **masked**.
+
+                [What are attention masks?](../glossary#attention-mask)
+            causal_attention_mask (`tf.Tensor` of shape `(batch_size, 1, sequence_length, sequence_length)`, *optional*):
+                Second additive mask, applied the same way as `attention_mask` and before it:
+
+                - 0 for positions that are **not masked**,
+                - a very large negative value for positions that are **masked**.
+
+                [What are attention masks?](../glossary#attention-mask)
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            output_hidden_states (`bool`, *optional*):
+                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+                for more detail.
+            return_dict (`bool`, *optional*):
+                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        encoder_states = () if output_hidden_states else None
+        all_attentions = () if output_attentions else None
+
+        hidden_states = inputs_embeds
+        for encoder_layer in self.layers:
+            if output_hidden_states:
+                encoder_states = encoder_states + (hidden_states,)  # record the input of every layer
+            if self.gradient_checkpointing and training:
+
+                def create_custom_forward(module):
+                    def custom_forward(*inputs):
+                        return module(*inputs, output_attentions)
+
+                    return custom_forward
+
+                # tf.recompute_grad only wraps the function; the inputs go to the wrapper it returns
+                layer_outputs = tf.recompute_grad(create_custom_forward(encoder_layer))(
+                    hidden_states,
+                    attention_mask,
+                    causal_attention_mask,
+                )
+            else:
+                layer_outputs = encoder_layer(
+                    hidden_states,
+                    attention_mask,
+                    causal_attention_mask,
+                    output_attentions=output_attentions,
+                )
+
+            hidden_states = layer_outputs[0]
+
+            if output_attentions:
+                all_attentions = all_attentions + (layer_outputs[1],)
+
+        if output_hidden_states:
+            encoder_states = encoder_states + (hidden_states,)  # plus the output of the last layer
+
+        if not return_dict:
+            return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
+        return TFBaseModelOutput(
+            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
+        )
+
+    def build(self, input_shape=None):  # input_shape is accepted but never read in this method
+        if self.built:
+            return
+        self.built = True  # set first, so a repeated build() call returns immediately
+        if getattr(self, "layers", None) is not None:
+            for layer in self.layers:
+                with tf.name_scope(layer.name):  # every encoder layer is built under its own name scope
+                    layer.build(None)
+
+
+class TFIdeficsVisionTransformer(TFPreTrainedModel):  # embeddings -> pre layer norm -> encoder -> post layer norm on the first token
+    def __init__(self, config: IdeficsVisionConfig, **kwargs):
+        super().__init__(config, **kwargs)
+        self.config = config
+        self.embed_dim = config.hidden_size
+
+        self.embeddings = TFIdeficsVisionEmbeddings(config, name="embeddings")
+        self.pre_layrnorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="pre_layrnorm")  # NOTE(review): the "layrnorm" spelling is also the layer name — presumably has to match checkpoints, verify before renaming
+        self.encoder = TFIdeficsVisionEncoder(config, name="encoder")
+        self.post_layernorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="post_layernorm")
+
+    # Adapted from transformers.models.clip.modeling_clip.CLIPVisionTransformer.forward
+    def call(
+        self,
+        pixel_values: Optional[tf.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        interpolate_pos_encoding: Optional[bool] = False,
+        return_dict: Optional[bool] = None,
+        training: Optional[bool] = False,
+    ) -> Union[tuple, TFBaseModelOutputWithPooling]:
+        r"""
+        Embeds `pixel_values`, runs the encoder and layer-normalizes the first token as the pooled output. Raises
+        `ValueError` if `pixel_values` is `None`. Returns a tuple if `return_dict` is false, else a model output.
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if pixel_values is None:
+            raise ValueError("You have to specify pixel_values")
+
+        hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding)
+        hidden_states = self.pre_layrnorm(hidden_states)
+        encoder_outputs = self.encoder(
+            inputs_embeds=hidden_states,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+
+        last_hidden_state = encoder_outputs[0]
+        pooled_output = last_hidden_state[:, 0, :]  # first token of every sequence
+        pooled_output = self.post_layernorm(pooled_output)
+
+        if not return_dict:
+            return (last_hidden_state, pooled_output) + encoder_outputs[1:]  # pooled output is inserted at index 1
+
+        return TFBaseModelOutputWithPooling(
+            last_hidden_state=last_hidden_state,
+            pooler_output=pooled_output,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+        )
+
+    def build(self, input_shape=None):  # input_shape is accepted but never read in this method
+        if self.built:
+            return
+        self.built = True  # set first, so a repeated build() call returns immediately
+        if getattr(self, "embeddings", None) is not None:
+            with tf.name_scope(self.embeddings.name):
+                self.embeddings.build(None)
+        if getattr(self, "pre_layrnorm", None) is not None:
+            with tf.name_scope(self.pre_layrnorm.name):
+                self.pre_layrnorm.build([None, None, self.embed_dim])
+        if getattr(self, "encoder", None) is not None:
+            with tf.name_scope(self.encoder.name):
+                self.encoder.build(None)
+        if getattr(self, "post_layernorm", None) is not None:
+            with tf.name_scope(self.post_layernorm.name):
+                self.post_layernorm.build([None, self.embed_dim])  # rank 2: it is applied to the pooled first token only
diff --git a/phivenv/Lib/site-packages/transformers/models/idefics2/__init__.py b/phivenv/Lib/site-packages/transformers/models/idefics2/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..c6715adc9ab8614bbb463b6b700fcdc2ca671d22
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/idefics2/__init__.py
@@ -0,0 +1,30 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_idefics2 import *
+    from .image_processing_idefics2 import *
+    from .image_processing_idefics2_fast import *
+    from .modeling_idefics2 import *
+    from .processing_idefics2 import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/phivenv/Lib/site-packages/transformers/models/idefics2/__pycache__/__init__.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/idefics2/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a62c362cbd7f91460d8121fa4aba0d8a8403e59c
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/idefics2/__pycache__/__init__.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/idefics2/__pycache__/configuration_idefics2.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/idefics2/__pycache__/configuration_idefics2.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b0286a3a6df8e465b7fd60e1e544bdfb2ada7fb4
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/idefics2/__pycache__/configuration_idefics2.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/idefics2/__pycache__/image_processing_idefics2.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/idefics2/__pycache__/image_processing_idefics2.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4fa91b0aa15489e8301af5802fc50c4905e8fa2a
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/idefics2/__pycache__/image_processing_idefics2.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/idefics2/__pycache__/image_processing_idefics2_fast.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/idefics2/__pycache__/image_processing_idefics2_fast.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..70480f37563b8aadd05d774a0bc47d7dd63683d8
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/idefics2/__pycache__/image_processing_idefics2_fast.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/idefics2/__pycache__/modeling_idefics2.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/idefics2/__pycache__/modeling_idefics2.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b4db96f936f44ac89f130f1db01802022814906f
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/idefics2/__pycache__/modeling_idefics2.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/idefics2/__pycache__/processing_idefics2.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/idefics2/__pycache__/processing_idefics2.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fcf807eed07ac38f3ee9f760c1da81096c667d3f
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/idefics2/__pycache__/processing_idefics2.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/idefics2/configuration_idefics2.py b/phivenv/Lib/site-packages/transformers/models/idefics2/configuration_idefics2.py
new file mode 100644
index 0000000000000000000000000000000000000000..a8fa442a1dbc67276864f033623c5526f3fed750
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/idefics2/configuration_idefics2.py
@@ -0,0 +1,268 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Idefics2 model configuration"""
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+from ..auto import CONFIG_MAPPING, AutoConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+class Idefics2VisionConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of an [`Idefics2VisionModel`]. It is used to instantiate an
+    Idefics2 vision encoder according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of the SigLIP checkpoint
+    [google/siglip-base-patch16-224](https://huggingface.co/google/siglip-base-patch16-224) used in the Idefics2 model
+    [HuggingFaceM4/idefics2-8b](https://huggingface.co/HuggingFaceM4/idefics2-8b).
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        hidden_size (`int`, *optional*, defaults to 768):
+            Dimensionality of the encoder layers and the pooler layer.
+        intermediate_size (`int`, *optional*, defaults to 3072):
+            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+        num_hidden_layers (`int`, *optional*, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 12):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        num_channels (`int`, *optional*, defaults to 3):
+            Number of channels in the input images.
+        image_size (`int`, *optional*, defaults to 224):
+            The size (resolution) of each image.
+        patch_size (`int`, *optional*, defaults to 32):
+            The size (resolution) of each patch.
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"selu"`, `"gelu_new"` and `"quick_gelu"` are supported.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-06):
+            The epsilon used by the layer normalization layers.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation for initializing all weight matrices in the model.
+
+    Example:
+
+    ```python
+    >>> from transformers.models.idefics2.modeling_idefics2 import Idefics2VisionTransformer
+    >>> from transformers.models.idefics2.configuration_idefics2 import Idefics2VisionConfig
+
+    >>> # Initializing an Idefics2VisionConfig with google/siglip-base-patch16-224 style configuration
+    >>> configuration = Idefics2VisionConfig()
+
+    >>> # Initializing an Idefics2VisionTransformer (with random weights) from the google/siglip-base-patch16-224 style configuration
+    >>> model = Idefics2VisionTransformer(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "idefics2_vision"
+    base_config_key = "vision_config"  # presumably the key this config is stored under in a parent config — verify against PretrainedConfig
+
+    def __init__(
+        self,
+        hidden_size=768,
+        intermediate_size=3072,
+        num_hidden_layers=12,
+        num_attention_heads=12,
+        num_channels=3,
+        image_size=224,
+        patch_size=32,
+        hidden_act="gelu_pytorch_tanh",
+        layer_norm_eps=1e-6,
+        attention_dropout=0.0,
+        initializer_range=0.02,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)  # every remaining keyword argument goes to PretrainedConfig
+
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.num_channels = num_channels
+        self.patch_size = patch_size
+        self.image_size = image_size
+        self.attention_dropout = attention_dropout
+        self.layer_norm_eps = layer_norm_eps
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+
+
+class Idefics2PerceiverConfig(PretrainedConfig):
+    r"""
+    Configuration of the Idefics2 perceiver resampler. Configuration objects inherit from [`PretrainedConfig`] and can
+    be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the perceiver block.
+        hidden_size (`int`, *optional*, defaults to 4096):
+            Dimension of the hidden representations.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
+            The epsilon used by the rms normalization layers.
+        resampler_n_latents (`int`, *optional*, defaults to 64):
+            Number of latent embeddings to resample ("compress") the input sequence to (usually < 128).
+        resampler_depth (`int`, *optional*, defaults to 3):
+            Depth of the Perceiver Resampler (Transformer w/ cross attention). Should be shallow (<= 3).
+        resampler_n_heads (`int`, *optional*, defaults to 16):
+            Number of heads in each Transformer block (for multi-headed self-attention).
+        resampler_head_dim (`int`, *optional*, defaults to 96):
+            Dimensionality of each head projection in the Transformer block.
+        num_key_value_heads (`int`, *optional*, defaults to 4):
+            Number of key-value heads in the perceiver attention block.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation for initializing all weight matrices in the model.
+    """
+
+    model_type = "idefics2_perceiver"
+
+    def __init__(
+        self,
+        hidden_act="silu",
+        hidden_size=4096,
+        rms_norm_eps=1e-06,
+        resampler_n_latents=64,
+        resampler_depth=3,
+        resampler_n_heads=16,
+        resampler_head_dim=96,
+        num_key_value_heads=4,
+        attention_dropout=0.0,
+        initializer_range=0.02,
+        **kwargs,
+    ):
+        self.hidden_act = hidden_act
+        self.hidden_size = hidden_size
+        self.rms_norm_eps = rms_norm_eps
+        self.resampler_n_latents = resampler_n_latents
+        self.resampler_depth = resampler_depth
+        self.resampler_n_heads = resampler_n_heads
+        self.num_key_value_heads = num_key_value_heads
+        self.resampler_head_dim = resampler_head_dim
+        self.attention_dropout = attention_dropout
+        self.initializer_range = initializer_range
+        if self.num_key_value_heads > self.resampler_n_heads:  # reject more key/value heads than attention heads
+            raise ValueError(
+                f"num_key_value_heads={self.num_key_value_heads} must be less than or equal to"
+                f" resampler_n_heads={self.resampler_n_heads}"
+            )
+        super().__init__(**kwargs)  # runs only after the check above has passed
+
+
+class Idefics2Config(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of an [`Idefics2Model`]. It is used to instantiate an
+    Idefics2 model according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of the model of the Idefics2
+    [HuggingFaceM4/idefics2-8b](https://huggingface.co/HuggingFaceM4/idefics2-8b) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should cache the key/value pairs of the attention mechanism.
+        image_token_id (`int`, *optional*, defaults to 32001):
+            The id of the "image" token.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether or not to tie the word embeddings with the token embeddings.
+        vision_config (`Idefics2VisionConfig` or `dict`, *optional*):
+            Custom vision config or dict
+        perceiver_config (`Idefics2PerceiverConfig` or `dict`, *optional*):
+            Custom perceiver config or dict
+        text_config (`MistralConfig` or `dict`, *optional*):
+            Custom text config or dict for the text model
+
+    Example:
+    ```python
+    >>> from transformers import Idefics2Model, Idefics2Config
+    >>> # Initializing configuration
+    >>> configuration = Idefics2Config()
+    >>> # Initializing a model from the configuration
+    >>> model = Idefics2Model(configuration)
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "idefics2"
+    sub_configs = {
+        "text_config": AutoConfig,
+        "perceiver_config": Idefics2PerceiverConfig,
+        "vision_config": Idefics2VisionConfig,
+    }
+
+    def __init__(
+        self,
+        use_cache=True,
+        image_token_id=32_001,
+        tie_word_embeddings=False,
+        vision_config=None,
+        perceiver_config=None,
+        text_config=None,
+        **kwargs,
+    ):
+        self.image_token_id = image_token_id
+        self.use_cache = use_cache
+        self.tie_word_embeddings = tie_word_embeddings
+
+        if perceiver_config is None:
+            self.perceiver_config = Idefics2PerceiverConfig()
+            logger.info("perceiver_config is None, using default perceiver config")
+        elif isinstance(perceiver_config, dict):
+            self.perceiver_config = Idefics2PerceiverConfig(**perceiver_config)
+        elif isinstance(perceiver_config, Idefics2PerceiverConfig):  # NOTE(review): any other type leaves the attribute unset
+            self.perceiver_config = perceiver_config
+
+        if vision_config is None:
+            self.vision_config = Idefics2VisionConfig()
+            logger.info("vision_config is None, using default vision config")
+        elif isinstance(vision_config, dict):
+            self.vision_config = Idefics2VisionConfig(**vision_config)
+        elif isinstance(vision_config, Idefics2VisionConfig):  # NOTE(review): any other type leaves the attribute unset
+            self.vision_config = vision_config
+
+        if isinstance(text_config, dict):
+            text_config = {"model_type": "mistral", **text_config}  # default the type on a copy; the caller's dict stays untouched
+            text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
+        elif text_config is None:
+            logger.info("text_config is None, using default text config")
+            text_config = CONFIG_MAPPING["mistral"](
+                max_position_embeddings=4096 * 8,
+                rms_norm_eps=1e-5,
+                # None in the original configuration_mistral, we set it to the unk_token_id
+                pad_token_id=0,
+                tie_word_embeddings=False,
+            )
+
+        self.text_config = text_config
+        if self.text_config.hidden_size != self.perceiver_config.hidden_size:  # the text model's sizes win on a mismatch
+            self.perceiver_config.hidden_size = self.text_config.hidden_size
+            self.perceiver_config.rms_norm_eps = self.text_config.rms_norm_eps
+            logger.warning_once(
+                "Perceiver config has a different `hidden_size` than text config, which means default values were used. "
+                "In your model's config on the hub, add `hidden_size` and `rms_norm_eps` keys under the `perceiver_config` dict. "
+            )
+
+        super().__init__(**kwargs, tie_word_embeddings=tie_word_embeddings)
+
+
+__all__ = ["Idefics2Config"]
diff --git a/phivenv/Lib/site-packages/transformers/models/idefics2/image_processing_idefics2.py b/phivenv/Lib/site-packages/transformers/models/idefics2/image_processing_idefics2.py
new file mode 100644
index 0000000000000000000000000000000000000000..2e564708a0788f89be543b83fbe741fcfde1f0af
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/idefics2/image_processing_idefics2.py
@@ -0,0 +1,569 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from collections.abc import Iterable
+from typing import Any, Optional, Union
+
+import numpy as np
+
+from ...image_processing_utils import BaseImageProcessor, BatchFeature
+from ...image_transforms import PaddingMode, pad, resize, to_channel_dimension_format
+from ...image_utils import (
+    IMAGENET_STANDARD_MEAN,
+    IMAGENET_STANDARD_STD,
+    ChannelDimension,
+    ImageInput,
+    PILImageResampling,
+    get_image_size,
+    infer_channel_dimension_format,
+    is_scaled_image,
+    make_nested_list_of_images,
+    to_numpy_array,
+    valid_images,
+    validate_preprocess_arguments,
+)
+from ...utils import TensorType, is_vision_available, logging
+
+
+logger = logging.get_logger(__name__)
+
+
+if is_vision_available():
+    import PIL
+    from PIL import Image
+
+
+def get_resize_output_image_size(image, size, input_data_format) -> tuple[int, int]:
+    """
+    Compute the `(height, width)` an image should be resized to, given the min and max edge lengths in `size`.
+
+    Args:
+        image (`np.ndarray`):
+            Image to resize.
+        size (`dict[str, int]`):
+            Dictionary with the keys "shortest_edge" (lower bound) and "longest_edge" (upper bound).
+        input_data_format (`ChannelDimension` or `str`):
+            The channel dimension format of the input image.
+
+    Returns:
+        `tuple[int, int]`: The `(height, width)` of the image after resizing.
+    """
+    height, width = get_image_size(image, channel_dim=input_data_format)
+
+    min_len = size["shortest_edge"]
+    max_len = size["longest_edge"]
+    aspect_ratio = width / height
+    # Cap the longer side at `max_len` and scale the other side by the aspect ratio (truncated to int).
+    if width >= height and width > max_len:
+        width = max_len
+        height = int(width / aspect_ratio)
+    elif height > width and height > max_len:
+        height = max_len
+        width = int(height * aspect_ratio)
+    height = max(height, min_len)  # each side is raised to `min_len` independently,
+    width = max(width, min_len)  # so the aspect ratio is not necessarily preserved
+    return height, width
+
+
+# Copied from transformers.models.detr.image_processing_detr.max_across_indices
+def max_across_indices(values: Iterable[Any]) -> list[Any]:
+    """
+    Return the position-wise maximum over an iterable of sequences, e.g. `[(1, 5), (3, 2)]` gives `[3, 5]`.
+    """
+    return [max(values_i) for values_i in zip(*values)]
+
+
+def get_max_height_width(
+    images_list: list[list[np.ndarray]], input_data_format: Optional[Union[str, ChannelDimension]] = None
+) -> list[int]:
+    """
+    Get the maximum height and width over all images of a nested batch, as a `(max_height, max_width)` tuple.
+    """
+    if input_data_format is None:
+        input_data_format = infer_channel_dimension_format(images_list[0][0])  # inferred from the first image only
+
+    image_sizes = []
+    for images in images_list:
+        for image in images:
+            image_sizes.append(get_image_size(image, channel_dim=input_data_format))
+
+    max_height, max_width = max_across_indices(image_sizes)
+    return (max_height, max_width)  # NOTE: a tuple, although the return annotation says `list[int]`
+
+
+# Copied from transformers.models.detr.image_processing_detr.make_pixel_mask
+def make_pixel_mask(
+    image: np.ndarray, output_size: tuple[int, int], input_data_format: Optional[Union[str, ChannelDimension]] = None
+) -> np.ndarray:
+    """
+    Make an `int64` pixel mask for the image, where 1 indicates a valid pixel and 0 indicates padding.
+
+    Args:
+        image (`np.ndarray`):
+            Image to make the pixel mask for; its height and width are read using `input_data_format`.
+        output_size (`tuple[int, int]`):
+            Output size of the mask.
+    """
+    input_height, input_width = get_image_size(image, channel_dim=input_data_format)
+    mask = np.zeros(output_size, dtype=np.int64)
+    mask[:input_height, :input_width] = 1  # the valid region starts at index (0, 0); everything else stays 0
+    return mask
+
+
+# FIXME Amy: merge this function with the one in image_transforms.py
+def convert_to_rgb(image: ImageInput) -> ImageInput:
+    """
+    Converts a PIL image to RGB mode by compositing it onto a white background. Inputs that are not a
+    `PIL.Image.Image`, and PIL images that are already in RGB mode, are returned unchanged.
+    Args:
+        image (`ImageInput`):
+            The image to convert.
+    """
+    if not isinstance(image, PIL.Image.Image):
+        return image
+
+    # A plain `image.convert("RGB")` is avoided because it creates a wrong background for transparent
+    # images; compositing over a background with `alpha_composite` (below) handles that case.
+    if image.mode == "RGB":
+        return image
+    # Go through RGBA so that the image can be composited over a white (255, 255, 255) canvas of the same size.
+    image_rgba = image.convert("RGBA")
+    background = Image.new("RGBA", image_rgba.size, (255, 255, 255))
+    alpha_composite = Image.alpha_composite(background, image_rgba)
+    alpha_composite = alpha_composite.convert("RGB")
+    return alpha_composite
+
+
+class Idefics2ImageProcessor(BaseImageProcessor):
+    r"""
+    Constructs an Idefics2 image processor.
+
+    Args:
+        do_convert_rgb (`bool`, *optional*, defaults to `True`):
+            Whether to convert the image to RGB. This is useful if the input image is of a different format e.g. RGBA.
+            Only has an effect if the input image is in the PIL format.
+        do_resize (`bool`, *optional*, defaults to `True`):
+            Whether to resize the image. The longest edge of the image is resized to  be <= `size["longest_edge"]`, with the
+            shortest edge resized to keep the input aspect ratio, with a minimum size of `size["shortest_edge"]`.
+        size (`Dict`, *optional*, defaults to `{"shortest_edge": 378, "longest_edge": 980}`):
+            Controls the size of the output image. This is a dictionary containing the keys "shortest_edge" and "longest_edge".
+        resample (`Resampling`, *optional*, defaults to `Resampling.BILINEAR`):
+            Resampling filter to use when resizing the image.
+        do_rescale (`bool`, *optional*, defaults to `True`):
+            Whether to rescale the image. If set to `True`, the image is rescaled to have pixel values between 0 and 1.
+        rescale_factor (`float`, *optional*, defaults to `1/255`):
+            Rescale factor to rescale the image by if `do_rescale` is set to `True`.
+        do_normalize (`bool`, *optional*, defaults to `True`):
+            Whether to normalize the image. If set to `True`, the image is normalized to have a mean of `image_mean` and
+            a standard deviation of `image_std`.
+        image_mean (`float` or `list[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`):
+            Mean to use if normalizing the image. This is a float or list of floats the length of the number of
+            channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess`
+            method.
+        image_std (`float` or `list[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`):
+            Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
+            number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess`
+            method.
+        do_pad (`bool`, *optional*, defaults to `True`):
+            Whether or not to pad the images to the largest height and width in the batch and number of images per
+            sample in the batch, such that the returned tensor is of shape (batch_size, max_num_images, num_channels, max_height, max_width).
+        do_image_splitting (`bool`, *optional*, defaults to `False`):
+            Whether to split the image into a sequence of 4 equal sub-images concatenated with the original image. That
+            strategy was first introduced in https://huggingface.co/papers/2311.06607.
+    """
+
+    model_input_names = ["pixel_values", "pixel_attention_mask"]
+
+    def __init__(
+        self,
+        do_convert_rgb: bool = True,
+        do_resize: bool = True,
+        size: Optional[dict[str, int]] = None,
+        resample: PILImageResampling = PILImageResampling.BILINEAR,
+        do_rescale: bool = True,
+        rescale_factor: float = 1 / 255,
+        do_normalize: bool = True,
+        image_mean: Optional[Union[float, list[float]]] = None,
+        image_std: Optional[Union[float, list[float]]] = None,
+        do_pad: bool = True,
+        do_image_splitting: bool = False,
+        **kwargs,
+    ) -> None:
+        super().__init__(**kwargs)  # any extra keyword arguments are forwarded to `BaseImageProcessor`
+        self.do_convert_rgb = do_convert_rgb
+        self.do_resize = do_resize
+        self.size = size if size is not None else {"shortest_edge": 378, "longest_edge": 980}  # default edge bounds
+        self.resample = resample
+        self.do_rescale = do_rescale
+        self.rescale_factor = rescale_factor
+        self.do_normalize = do_normalize
+        self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN  # `None` selects the default
+        self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD  # `None` selects the default
+        self.do_pad = do_pad
+        self.do_image_splitting = do_image_splitting
+
+    def resize(
+        self,
+        image: np.ndarray,
+        size: dict[str, int],
+        resample: PILImageResampling = PILImageResampling.BILINEAR,
+        data_format: Optional[Union[str, ChannelDimension]] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+        **kwargs,
+    ) -> np.ndarray:
+        """
+        Resize an image. With "shortest_edge"/"longest_edge" keys the target size is computed by
+        `get_resize_output_image_size`; with "height"/"width" keys the image is resized to exactly that size.
+
+        Args:
+            image (`np.ndarray`):
+                Image to resize.
+            size (`dict[str, int]`):
+                Size of the output image. Any other combination of keys raises a `ValueError`.
+            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
+                Resampling filter to use when resizing the image.
+            data_format (`str` or `ChannelDimension`, *optional*):
+                The channel dimension format of the image. If not provided, it will be the same as the input image.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format of the input image. If not provided, it will be inferred.
+        """
+        if "shortest_edge" in size and "longest_edge" in size:
+            size = get_resize_output_image_size(image, size, input_data_format)
+        elif "height" in size and "width" in size:
+            size = (size["height"], size["width"])
+        else:
+            raise ValueError(
+                "size must be a dictionary with keys 'shortest_edge' and 'longest_edge' or 'height' and 'width'."
+            )
+        return resize(
+            image, size, resample=resample, data_format=data_format, input_data_format=input_data_format, **kwargs
+        )
+
+    # Copied from transformers.models.vilt.image_processing_vilt.ViltImageProcessor._pad_image
+    def _pad_image(
+        self,
+        image: np.ndarray,
+        output_size: tuple[int, int],
+        constant_values: Union[float, Iterable[float]] = 0,
+        data_format: Optional[ChannelDimension] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    ) -> np.ndarray:
+        """
+        Pad an image at the bottom and right with `constant_values` (zeros by default) up to `output_size`.
+        """
+        input_height, input_width = get_image_size(image, channel_dim=input_data_format)
+        output_height, output_width = output_size
+
+        pad_bottom = output_height - input_height
+        pad_right = output_width - input_width
+        padding = ((0, pad_bottom), (0, pad_right))  # nothing is added before the image, only after it
+        padded_image = pad(
+            image,
+            padding,
+            mode=PaddingMode.CONSTANT,
+            constant_values=constant_values,
+            data_format=data_format,
+            input_data_format=input_data_format,
+        )
+        return padded_image
+
+    def pad(
+        self,
+        images: list[np.ndarray],
+        constant_values: Union[float, Iterable[float]] = 0,
+        return_pixel_mask: bool = True,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        data_format: Optional[ChannelDimension] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    ) -> BatchFeature:
+        """
+        Pads every image at the bottom and right with `constant_values` to the largest height and width in the batch.
+        Samples with fewer images than the largest sample are filled up with all-zero images and all-zero masks.
+
+        Args:
+            images (`list[list[np.ndarray]]`):
+                Nested list of images (one inner list per sample) to pad.
+            constant_values (`float` or `Iterable[float]`, *optional*):
+                The value to use for the padding.
+            return_pixel_mask (`bool`, *optional*, defaults to `True`):
+                Whether to return a pixel mask.
+            return_tensors (`str` or `TensorType`, *optional*):
+                Not used by this method; the results are always nested lists of `np.ndarray`.
+            data_format (`str` or `ChannelDimension`, *optional*):
+                The channel dimension format of the image. If not provided, it will be the same as the input image.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format of the input image. If not provided, it will be inferred.
+
+        Returns:
+            `tuple`: `(padded_images, pixel_masks)`, both nested lists indexed as `[sample][image]` with
+            `max_num_images` entries per sample. `pixel_masks` is `None` when `return_pixel_mask=False`.
+            Note that this is a plain tuple, not the `BatchFeature` named in the return annotation.
+        """
+        pad_size = get_max_height_width(images, input_data_format=input_data_format)
+
+        batch_size = len(images)
+        max_num_images = max(len(images_) for images_ in images)
+        input_data_format = (
+            infer_channel_dimension_format(images[0][0]) if input_data_format is None else input_data_format
+        )
+        data_format = input_data_format if data_format is None else data_format
+        # Placeholder used for missing images; note that it is hard-coded to 3 channels and `uint8`.
+        def empty_image(size, input_data_format):
+            if input_data_format == ChannelDimension.FIRST:
+                return np.zeros((3, *size), dtype=np.uint8)
+            elif input_data_format == ChannelDimension.LAST:
+                return np.zeros((*size, 3), dtype=np.uint8)
+            raise ValueError("Invalid channel dimension format.")
+
+        padded_images_list = [
+            [empty_image(pad_size, data_format) for _ in range(max_num_images)] for _ in range(batch_size)
+        ]
+        padded_masks = [[np.zeros(pad_size) for _ in range(max_num_images)] for _ in range(batch_size)]
+        # Overwrite the placeholders with the actual padded images and their pixel masks.
+        for batch_idx in range(batch_size):
+            for sample_idx, image in enumerate(images[batch_idx]):
+                padded_images_list[batch_idx][sample_idx] = self._pad_image(
+                    image,
+                    pad_size,
+                    constant_values=constant_values,
+                    data_format=data_format,
+                    input_data_format=input_data_format,
+                )
+                padded_masks[batch_idx][sample_idx] = make_pixel_mask(
+                    image, output_size=pad_size, input_data_format=input_data_format
+                )
+
+        padded_masks = padded_masks if return_pixel_mask else None
+        return padded_images_list, padded_masks
+
+    def _crop(
+        self,
+        im: np.ndarray,
+        w1: int,
+        h1: int,
+        w2: int,
+        h2: int,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    ) -> np.ndarray:  # crops rows h1:h2 and columns w1:w2, keeping all channels
+        if input_data_format == ChannelDimension.FIRST:
+            return im[:, h1:h2, w1:w2]
+        elif input_data_format == ChannelDimension.LAST:
+            return im[h1:h2, w1:w2, :]  # NOTE: any other format (including `None`) falls through and returns `None`
+
+    def split_image(
+        self,
+        image: np.ndarray,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    ):
+        """
+        Split an image into 4 sub-images (quadrants), and then concatenate that sequence with the original image.
+        That means that a single image becomes a sequence of 5 images.
+        This is a "trick" to spend more compute on each image with no changes in the vision encoder.
+
+        Args:
+            image (`np.ndarray`):
+                Image to split.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format of the input image. `_crop` only handles channels-first and channels-last.
+        """
+        height, width = get_image_size(image, input_data_format)
+        # With odd dimensions the right/bottom crops are one pixel larger because of the floor division.
+        mid_width = width // 2
+        mid_height = height // 2
+        return [
+            self._crop(image, 0, 0, mid_width, mid_height, input_data_format),  # top-left
+            self._crop(image, mid_width, 0, width, mid_height, input_data_format),  # top-right
+            self._crop(image, 0, mid_height, mid_width, height, input_data_format),  # bottom-left
+            self._crop(image, mid_width, mid_height, width, height, input_data_format),  # bottom-right
+            image,
+        ]
+
+    def preprocess(
+        self,
+        images: ImageInput,
+        do_convert_rgb: Optional[bool] = None,
+        do_resize: Optional[bool] = None,
+        size: Optional[dict[str, int]] = None,
+        resample: PILImageResampling = None,
+        do_rescale: Optional[bool] = None,
+        rescale_factor: Optional[float] = None,
+        do_normalize: Optional[bool] = None,
+        image_mean: Optional[Union[float, list[float]]] = None,
+        image_std: Optional[Union[float, list[float]]] = None,
+        do_pad: Optional[bool] = None,
+        do_image_splitting: Optional[bool] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        input_data_format: Optional[ChannelDimension] = None,
+        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
+    ):
+        """
+        Preprocess a batch of images.
+
+        Args:
+            images (`ImageInput`):
+                Images to preprocess; they are arranged into a nested list via `make_nested_list_of_images`.
+            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
+                Whether to convert the image to RGB.
+            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
+                Whether to resize the image.
+            size (`dict[str, int]`, *optional*, defaults to `self.size`):
+                Size of the image after resizing, given either as "shortest_edge"/"longest_edge" bounds or as an
+                exact "height"/"width".
+            resample (`int`, *optional*, defaults to `self.resample`):
+                Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only
+                has an effect if `do_resize` is set to `True`.
+            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
+                Whether to rescale the image.
+            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
+                Rescale factor to rescale the image by if `do_rescale` is set to `True`.
+            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
+                Whether to normalize the image.
+            image_mean (`float` or `list[float]`, *optional*, defaults to `self.image_mean`):
+                Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.
+            image_std (`float` or `list[float]`, *optional*, defaults to `self.image_std`):
+                Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to
+                `True`.
+            do_pad (`bool`, *optional*, defaults to `self.do_pad`):
+                Whether or not to pad the images to the largest height and width in the batch. If `False`, no `pixel_attention_mask` is returned.
+            do_image_splitting (`bool`, *optional*, defaults to `self.do_image_splitting`):
+                Whether to split the image into a sequence 4 equal sub-images concatenated with the original image. That
+                strategy was first introduced in https://huggingface.co/papers/2311.06607.
+            return_tensors (`str` or `TensorType`, *optional*):
+                The type of tensors to return. Can be one of:
+                - Unset: Return a list of `np.ndarray`.
+                - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
+                - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
+                - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
+                - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
+            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
+                The channel dimension format for the output image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - Unset: Use the channel dimension format of the input image.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the input image. If unset, the channel dimension format is inferred
+                from the input image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+        """
+        do_resize = do_resize if do_resize is not None else self.do_resize
+        size = size if size is not None else self.size
+        resample = resample if resample is not None else self.resample
+        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
+        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
+        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
+        image_mean = image_mean if image_mean is not None else self.image_mean
+        image_std = image_std if image_std is not None else self.image_std
+        do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
+        do_pad = do_pad if do_pad is not None else self.do_pad
+        do_image_splitting = do_image_splitting if do_image_splitting is not None else self.do_image_splitting
+
+        images = self.fetch_images(images)
+        images_list = make_nested_list_of_images(images)
+
+        if not valid_images(images_list[0]):  # only the first inner list is validated
+            raise ValueError(
+                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
+                "torch.Tensor, tf.Tensor or jax.ndarray."
+            )
+
+        validate_preprocess_arguments(
+            do_rescale=do_rescale,
+            rescale_factor=rescale_factor,
+            do_normalize=do_normalize,
+            image_mean=image_mean,
+            image_std=image_std,
+            do_resize=do_resize,
+            size=size,
+            resample=resample,
+        )
+
+        if do_convert_rgb:
+            images_list = [[convert_to_rgb(image) for image in images] for images in images_list]
+
+        # All transformations expect numpy arrays.
+        images_list = [[to_numpy_array(image) for image in images] for images in images_list]
+
+        if do_rescale and is_scaled_image(images_list[0][0]):
+            logger.warning_once(
+                "It looks like you are trying to rescale already rescaled images. If the input"
+                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
+            )
+
+        if input_data_format is None:
+            # We assume that all images have the same channel dimension format.
+            input_data_format = infer_channel_dimension_format(images_list[0][0])
+        # Splitting happens before resizing, so each of the 5 resulting images is resized on its own.
+        if do_image_splitting:
+            new_images_list = []
+            for images in images_list:
+                new_images = []
+                for image in images:
+                    new_images.extend(self.split_image(image, input_data_format))
+                new_images_list.append(new_images)
+            images_list = new_images_list
+
+        if do_resize:
+            images_list = [
+                [
+                    self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format)
+                    for image in images
+                ]
+                for images in images_list
+            ]
+
+        if do_rescale:
+            images_list = [
+                [
+                    self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format)
+                    for image in images
+                ]
+                for images in images_list
+            ]
+
+        if do_normalize:
+            images_list = [
+                [
+                    self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format)
+                    for image in images
+                ]
+                for images in images_list
+            ]
+
+        pixel_attention_mask = None
+        if do_pad:
+            images_list, pixel_attention_mask = self.pad(
+                images_list, return_pixel_mask=True, return_tensors=return_tensors, input_data_format=input_data_format
+            )
+        # The layout conversion comes last; every step above is called with `input_data_format`.
+        if data_format is not None:
+            images_list = [
+                [
+                    to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
+                    for image in images
+                ]
+                for images in images_list
+            ]
+
+        data = {"pixel_values": np.array(images_list) if do_pad else images_list}  # Faster tensor conversion
+        if pixel_attention_mask is not None:
+            data["pixel_attention_mask"] = np.array(pixel_attention_mask) if do_pad else pixel_attention_mask
+
+        return BatchFeature(data=data, tensor_type=return_tensors)
+
+
+__all__ = ["Idefics2ImageProcessor"]
diff --git a/phivenv/Lib/site-packages/transformers/models/idefics2/image_processing_idefics2_fast.py b/phivenv/Lib/site-packages/transformers/models/idefics2/image_processing_idefics2_fast.py
new file mode 100644
index 0000000000000000000000000000000000000000..a22b95cfea970924427ec76a13fcada08fb0753d
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/idefics2/image_processing_idefics2_fast.py
@@ -0,0 +1,315 @@
+# coding=utf-8
+# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from typing import Optional, Union
+
+import torch
+
+from ...image_processing_utils_fast import (
+    BaseImageProcessorFast,
+    BatchFeature,
+    DefaultFastImageProcessorKwargs,
+    SizeDict,
+    group_images_by_shape,
+    reorder_images,
+)
+from ...image_utils import (
+    IMAGENET_STANDARD_MEAN,
+    IMAGENET_STANDARD_STD,
+    ImageInput,
+    PILImageResampling,
+    make_nested_list_of_images,
+)
+from ...processing_utils import Unpack
+from ...utils import TensorType, auto_docstring, is_torchvision_available, logging
+from .image_processing_idefics2 import convert_to_rgb
+
+
+if is_torchvision_available():
+    from torchvision.transforms import functional as F
+
+
+logger = logging.get_logger(__name__)
+
+
+def get_resize_output_image_size(image: "torch.Tensor", size: SizeDict) -> tuple[int, int]:
+    """
+    Compute the `(height, width)` an image should be resized to so that it respects the edge limits in `size`.
+
+    The longer side is capped at `size.longest_edge` while keeping the aspect ratio (the other side is truncated
+    to an integer). Afterwards both sides are raised to at least `size.shortest_edge`.
+
+    Args:
+        image (`torch.Tensor`):
+            Image to resize; its last two dimensions are read as height and width.
+        size (`SizeDict`):
+            Size of the output image containing the keys "shortest_edge" and "longest_edge".
+
+    Returns:
+        `tuple[int, int]`: The `(height, width)` of the image after resizing.
+    """
+    in_height, in_width = image.size()[-2:]
+    shortest, longest = size.shortest_edge, size.longest_edge
+    ratio = in_width / in_height
+
+    out_height, out_width = in_height, in_width
+    if in_width >= in_height:
+        # Landscape or square input: the width is the side that may exceed the limit.
+        if in_width > longest:
+            out_width = longest
+            out_height = int(out_width / ratio)
+    elif in_height > longest:
+        # Portrait input: cap the height and derive the width from the aspect ratio.
+        out_height = longest
+        out_width = int(out_height * ratio)
+
+    # The lower bound is applied to each side independently, so the aspect ratio may change here.
+    return max(out_height, shortest), max(out_width, shortest)
+
+
+def get_max_height_width(images_list: list[list["torch.Tensor"]]) -> tuple[int, int]:
+    """
+    Return the largest height and the largest width found among all images of a nested batch.
+
+    Height and width are read from the last two dimensions of every tensor. The two maxima are computed
+    independently, so they may come from different images.
+    """
+    all_images = [image for images in images_list for image in images]
+    tallest = max(image.size()[-2] for image in all_images)
+    widest = max(image.size()[-1] for image in all_images)
+    return (tallest, widest)
+
+
+def make_pixel_mask(image: "torch.Tensor", output_size: tuple[int, int]) -> "torch.Tensor":
+    """
+    Build an integer mask of shape `output_size` that is 1 over the area covered by `image` and 0 elsewhere.
+
+    Args:
+        image (`torch.Tensor`):
+            Image to make the pixel mask for; its last two dimensions give the valid height and width.
+        output_size (`Tuple[int, int]`):
+            Output size of the mask.
+
+    Returns:
+        `torch.Tensor`: An `int64` mask created on the same device as `image`.
+    """
+    valid_rows, valid_cols = image.size()[-2:]
+    pixel_mask = torch.zeros(output_size, dtype=torch.int64, device=image.device)
+    # The valid pixels sit in the top-left corner; everything outside of it stays 0 (padding).
+    valid_region = (slice(None, valid_rows), slice(None, valid_cols))
+    pixel_mask[valid_region] = 1
+    return pixel_mask
+
+
+class Idefics2FastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
+    """
+    do_image_splitting (`bool`, *optional*, defaults to `False`):
+        Whether to split the image into a sequence of 4 equal sub-images concatenated with the original image.
+    do_pad (`bool`, *optional*, defaults to `True`):
+        Whether to pad images to the largest height and width in the batch.
+    """
+
+    # Keyword arguments declared here in addition to those inherited from `DefaultFastImageProcessorKwargs`.
+    do_image_splitting: Optional[bool]
+    do_pad: Optional[bool]
+
+
+@auto_docstring
+class Idefics2ImageProcessorFast(BaseImageProcessorFast):
+    # Class-level settings, presumably picked up by `BaseImageProcessorFast` as the default preprocessing options.
+    resample = PILImageResampling.BILINEAR
+    image_mean = IMAGENET_STANDARD_MEAN
+    image_std = IMAGENET_STANDARD_STD
+    do_resize = True
+    do_rescale = True
+    do_normalize = True
+    do_pad = True
+    do_convert_rgb = True
+    do_image_splitting = False
+    size = {"shortest_edge": 378, "longest_edge": 980}
+    model_input_names = ["pixel_values", "pixel_attention_mask"]
+    valid_kwargs = Idefics2FastImageProcessorKwargs
+
+    def convert_to_rgb(self, image: ImageInput) -> ImageInput:
+        """
+        Convert `image` to RGB by delegating to the module-level `convert_to_rgb` helper. Only images of type
+        PIL.Image.Image are converted; any other input is returned as is.
+        """
+        return convert_to_rgb(image)
+
+    def resize(
+        self, image: torch.Tensor, size: SizeDict, interpolation: Optional["F.InterpolationMode"] = None, **kwargs
+    ) -> torch.Tensor:
+        """
+        Resize an image (or a stacked batch of same-sized images) with torchvision's functional `resize`.
+
+        Args:
+            image (`torch.Tensor`):
+                Image(s) to resize.
+            size (`SizeDict`):
+                Either `shortest_edge` and `longest_edge` (aspect-ratio aware bounds) or an explicit `height` and
+                `width`. The edge-based form takes precedence when both are set.
+            interpolation (`F.InterpolationMode`, *optional*):
+                Interpolation mode; bilinear is used when omitted.
+            **kwargs:
+                Forwarded unchanged to `F.resize`.
+
+        Raises:
+            ValueError: If `size` provides neither of the two supported key pairs.
+        """
+        if interpolation is None:
+            interpolation = F.InterpolationMode.BILINEAR
+
+        if size.shortest_edge and size.longest_edge:
+            target_size = get_resize_output_image_size(image, size)
+        elif size.height and size.width:
+            target_size = (size.height, size.width)
+        else:
+            raise ValueError("Size must contain 'height' and 'width' keys or 'shortest_edge' and 'longest_edge' keys.")
+
+        return F.resize(image, size=target_size, interpolation=interpolation, **kwargs)
+
+    def _prepare_images_structure(self, images: ImageInput, expected_ndims: int = 3) -> ImageInput:
+        """
+        Turn the user-provided images into a nested list of images via `make_nested_list_of_images`.
+        """
+        return make_nested_list_of_images(images, expected_ndims=expected_ndims)
+
+    def split_images(
+        self,
+        images: "torch.Tensor",
+    ) -> list["torch.Tensor"]:
+        """
+        Cut every image of a stacked batch into its four quadrants and append the full image.
+
+        The result holds one list per image, ordered top-left, top-right, bottom-left, bottom-right, original.
+        For odd sizes the bottom/right quadrants receive the extra row/column.
+        """
+        height, width = images.size()[-2:]
+        mid_height, mid_width = height // 2, width // 2
+
+        top, bottom = slice(None, mid_height), slice(mid_height, None)
+        left, right = slice(None, mid_width), slice(mid_width, None)
+
+        crops = [images[..., rows, cols] for rows in (top, bottom) for cols in (left, right)]
+        crops.append(images)
+
+        # Regroup from "one tensor per crop kind" to "one list of crops per image of the batch".
+        return [list(per_image) for per_image in zip(*crops)]
+
+    def pad(
+        self, image: "torch.Tensor", padded_size: tuple[int, int], fill: int = 0
+    ) -> tuple["torch.Tensor", "torch.Tensor"]:
+        """
+        Pad an image at the bottom and right up to `padded_size` and build the matching pixel mask.
+
+        Args:
+            image (`torch.Tensor`):
+                Image to pad; its last two dimensions are height and width.
+            padded_size (`tuple[int, int]`):
+                Target `(height, width)`.
+            fill (`int`, *optional*, defaults to 0):
+                Constant value written into the padded area.
+
+        Returns:
+            The padded image and an `int64` mask of shape `padded_size` (1 = original pixel, 0 = padding).
+
+        Raises:
+            ValueError: If `padded_size` is smaller than the image along any of the two dimensions.
+        """
+        original_size = image.shape[-2:]
+        original_height, original_width = original_size[0], original_size[1]
+        padding_bottom = padded_size[0] - original_height
+        padding_right = padded_size[1] - original_width
+
+        if min(padding_bottom, padding_right) < 0:
+            raise ValueError(
+                f"Padding dimensions are negative. Please make sure that the padded size is larger than the "
+                f"original size. Got padded size: {padded_size}, original size: {original_size}."
+            )
+
+        needs_padding = original_size != padded_size
+        if needs_padding:
+            # torchvision's pad takes a 4-element tuple for 2D padding: (left, top, right, bottom).
+            # Constant padding is used to match the slow implementation.
+            image = F.pad(image, (0, 0, padding_right, padding_bottom), fill=fill, padding_mode="constant")
+
+        # Pixel mask in the same layout as the slow implementation.
+        pixel_mask = torch.zeros(padded_size, dtype=torch.int64, device=image.device)
+        pixel_mask[:original_height, :original_width] = 1
+
+        return image, pixel_mask
+
+    # No docstring on purpose: `auto_docstring` is applied to this method.
+    @auto_docstring
+    def preprocess(self, images: ImageInput, **kwargs: Unpack[Idefics2FastImageProcessorKwargs]) -> BatchFeature:
+        return super().preprocess(images, **kwargs)
+
+    def _preprocess(
+        self,
+        images: list[list["torch.Tensor"]],
+        do_resize: bool,
+        size: SizeDict,
+        interpolation: Optional["F.InterpolationMode"],
+        do_rescale: bool,
+        rescale_factor: float,
+        do_normalize: bool,
+        image_mean: Optional[Union[float, list[float]]],
+        image_std: Optional[Union[float, list[float]]],
+        do_pad: Optional[bool],
+        do_image_splitting: Optional[bool],
+        disable_grouping: Optional[bool],
+        return_tensors: Optional[Union[str, TensorType]],
+        **kwargs,
+    ) -> BatchFeature:
+        """
+        Process a nested batch of images for the model.
+
+        Steps, in order: optional splitting into sub-images, optional resizing, fused rescale/normalize and
+        optional padding. Before each of the first three steps the images are grouped by shape so that same-sized
+        images are handled as one stacked tensor, and the nested order is restored afterwards.
+
+        With `do_pad`, the result holds zero-initialised `pixel_values` and `pixel_attention_mask` tensors whose
+        leading dimensions are `(number of samples, max images per sample)`. Without it, only `pixel_values` is
+        returned (stacked when `return_tensors == "pt"`).
+        """
+        # 1. Optional image splitting.
+        grouped_images, grouped_images_index = group_images_by_shape(
+            images, is_nested=True, disable_grouping=disable_grouping
+        )
+        split_images_grouped = {
+            shape: self.split_images(stacked_images) if do_image_splitting else stacked_images
+            for shape, stacked_images in grouped_images.items()
+        }
+        split_images = reorder_images(split_images_grouped, grouped_images_index, is_nested=True)
+        if do_image_splitting:
+            # Flatten the doubly nested list (sample -> image -> crops) to a nested list (sample -> crops).
+            split_images = [[crop for crops in sample for crop in crops] for sample in split_images]
+
+        # 2. Optional resizing; regroup first because splitting may have produced new shapes.
+        grouped_images, grouped_images_index = group_images_by_shape(
+            split_images, is_nested=True, disable_grouping=disable_grouping
+        )
+        resized_images_grouped = {
+            shape: self.resize(stacked_images, size, interpolation=interpolation) if do_resize else stacked_images
+            for shape, stacked_images in grouped_images.items()
+        }
+        resized_images = reorder_images(resized_images_grouped, grouped_images_index, is_nested=True)
+
+        # 3. Fused rescale and normalize. Regrouping is needed in case do_resize is False, or resize returns
+        # images with different sizes.
+        grouped_images, grouped_images_index = group_images_by_shape(
+            resized_images, is_nested=True, disable_grouping=disable_grouping
+        )
+        processed_images_grouped = {
+            shape: self.rescale_and_normalize(
+                stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std
+            )
+            for shape, stacked_images in grouped_images.items()
+        }
+        processed_images = reorder_images(processed_images_grouped, grouped_images_index, is_nested=True)
+
+        if not do_pad:
+            if return_tensors == "pt":
+                pixel_values = torch.stack([torch.stack(sample) for sample in processed_images])
+            else:
+                pixel_values = processed_images
+            return BatchFeature(data={"pixel_values": pixel_values}, tensor_type=return_tensors)
+
+        # 4. Padding: every sample is brought to the same number of images and every image to the same size.
+        max_num_images = max(len(sample) for sample in processed_images)
+        max_height, max_width = get_max_height_width(processed_images)
+
+        first_image = processed_images[0][0]
+        num_samples = len(processed_images)
+        padded_pixel_values = torch.zeros(
+            num_samples,
+            max_num_images,
+            first_image.shape[0],
+            max_height,
+            max_width,
+            device=first_image.device,
+        )
+        pixel_attention_masks = torch.zeros(
+            num_samples,
+            max_num_images,
+            max_height,
+            max_width,
+            device=first_image.device,
+        )
+        # Slots of samples with fewer than `max_num_images` images keep their zero initialisation.
+        for sample_idx, sample in enumerate(processed_images):
+            for image_idx, image in enumerate(sample):
+                padded_image, pixel_mask = self.pad(image, (max_height, max_width))
+                padded_pixel_values[sample_idx, image_idx] = padded_image
+                pixel_attention_masks[sample_idx, image_idx] = pixel_mask
+
+        data = {"pixel_values": padded_pixel_values, "pixel_attention_mask": pixel_attention_masks}
+        return BatchFeature(data=data, tensor_type=return_tensors)
+
+
+__all__ = ["Idefics2ImageProcessorFast"]
diff --git a/phivenv/Lib/site-packages/transformers/models/idefics2/modeling_idefics2.py b/phivenv/Lib/site-packages/transformers/models/idefics2/modeling_idefics2.py
new file mode 100644
index 0000000000000000000000000000000000000000..293c0c8df6c72c64d125188e48795b684734511c
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/idefics2/modeling_idefics2.py
@@ -0,0 +1,1306 @@
+# coding=utf-8
+# Copyright 2024 the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch Idefics2 model."""
+
+from dataclasses import dataclass
+from typing import Callable, Optional, Union
+
+import torch
+import torch.utils.checkpoint
+from torch import nn
+
+from ...activations import ACT2FN
+from ...cache_utils import Cache, DynamicCache
+from ...generation import GenerationMixin
+from ...modeling_attn_mask_utils import _prepare_4d_attention_mask
+from ...modeling_flash_attention_utils import FlashAttentionKwargs
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import BaseModelOutput, ModelOutput
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from ...processing_utils import Unpack
+from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging
+from ...utils.deprecation import deprecate_kwarg
+from ..auto import AutoModel
+from .configuration_idefics2 import Idefics2Config, Idefics2PerceiverConfig, Idefics2VisionConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Base class for Idefics2 model's outputs that may also contain a past key/values (to speed up sequential decoding).
+    """
+)
+class Idefics2BaseModelOutputWithPast(ModelOutput):
+    r"""
+    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+        Sequence of hidden-states at the output of the last layer of the model.
+        If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
+        hidden_size)` is output.
+    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
+        `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if
+        `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads,
+        encoder_sequence_length, embed_size_per_head)`.
+        Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
+        `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
+        input) to speed up sequential decoding.
+    image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
+        Tuple of `torch.FloatTensor` (one for the output of the image embeddings) of shape `(batch_size,
+        num_images, sequence_length, hidden_size)`.
+        Image hidden states of the model produced by the vision encoder, and optionally by the perceiver.
+    """
+
+    # Every field is optional and defaults to `None`.
+    last_hidden_state: Optional[torch.FloatTensor] = None
+    past_key_values: Optional[tuple[tuple[torch.FloatTensor]]] = None
+    hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    attentions: Optional[tuple[torch.FloatTensor]] = None
+    image_hidden_states: Optional[tuple[torch.FloatTensor]] = None
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Base class for Idefics2 causal language model (or autoregressive) outputs.
+    """
+)
+# Copied from transformers.models.idefics.modeling_idefics.IdeficsCausalLMOutputWithPast with Idefics->Idefics2
+class Idefics2CausalLMOutputWithPast(ModelOutput):
+    r"""
+    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+        Language modeling loss (for next-token prediction).
+    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
+        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
+        `(batch_size, num_heads, sequence_length, embed_size_per_head)`)
+        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
+        `past_key_values` input) to speed up sequential decoding.
+    image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
+        Tuple of `torch.FloatTensor` (one for the output of the image embeddings) of shape `(batch_size,
+        num_images, sequence_length, hidden_size)`.
+        Image hidden states of the model produced by the vision encoder, and optionally by the perceiver.
+    """
+
+    # Every field is optional and defaults to `None`.
+    loss: Optional[torch.FloatTensor] = None
+    logits: Optional[torch.FloatTensor] = None
+    past_key_values: Optional[list[torch.FloatTensor]] = None
+    hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    attentions: Optional[tuple[torch.FloatTensor]] = None
+    image_hidden_states: Optional[tuple[torch.FloatTensor]] = None
+
+
+class Idefics2VisionEmbeddings(nn.Module):
+    """
+    This is a modified version of `siglip.modeling_siglip.SiglipVisionEmbeddings` to enable images of variable
+    resolution.
+
+    The modifications are adapted from [Patch n' Pack: NaViT, a Vision Transformer for any Aspect Ratio and Resolution](https://huggingface.co/papers/2307.06304)
+    which allows treating images in their native aspect ratio and without the need to resize them to the same
+    fixed size. In particular, we start from the original pre-trained SigLIP model
+    (which uses images of fixed-size square images) and adapt it by training on images of variable resolutions.
+    """
+
+    def __init__(self, config: Idefics2VisionConfig):
+        super().__init__()
+        self.embed_dim = config.hidden_size
+        self.image_size = config.image_size
+        self.patch_size = config.patch_size
+
+        # Non-overlapping patches: the stride equals the kernel size and no padding is added.
+        self.patch_embedding = nn.Conv2d(
+            config.num_channels,
+            self.embed_dim,
+            kernel_size=self.patch_size,
+            stride=self.patch_size,
+            padding="valid",
+        )
+
+        # One learned position per cell of the square grid that covers an `image_size` x `image_size` image.
+        self.num_patches_per_side = self.image_size // self.patch_size
+        self.num_patches = self.num_patches_per_side**2
+        self.num_positions = self.num_patches
+        self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
+
+    def _bucketize_axis(self, num_valid: torch.Tensor, boundaries: torch.Tensor, dtype: torch.dtype) -> torch.Tensor:
+        """
+        Map the `num_valid` patch indices of one axis onto bucket indices of the position grid.
+
+        NOTE(review): the fractional coordinates are scaled by `(1 - 1e-6)` before `bucketize(right=True)`, so a
+        coordinate that would otherwise sit exactly on a boundary falls into the lower bucket. Confirm this is
+        the intended assignment.
+        """
+        indices = torch.arange(num_valid, device=boundaries.device, dtype=dtype)
+        fractional_coords = indices / num_valid * (1 - 1e-6)
+        return torch.bucketize(fractional_coords, boundaries, right=True)
+
+    def forward(self, pixel_values: torch.FloatTensor, patch_attention_mask: torch.BoolTensor) -> torch.Tensor:
+        """
+        Embed the patches of `pixel_values` and add a position embedding chosen per sample.
+
+        `pixel_values` is unpacked as `(batch, channels, height, width)`. `patch_attention_mask` is iterated per
+        sample; each entry is read as a 2D patch grid whose first column and first row give the number of valid
+        patches along height and width. Positions of masked-out patches keep id 0.
+        """
+        batch_size, _, max_im_h, max_im_w = pixel_values.shape
+        device = pixel_values.device
+
+        # (batch, embed_dim, h, w) -> (batch, h * w, embed_dim)
+        embeddings = self.patch_embedding(pixel_values).flatten(2).transpose(1, 2)
+
+        max_num_patches = (max_im_h // self.patch_size) * (max_im_w // self.patch_size)
+        step = 1 / self.num_patches_per_side
+        boundaries = torch.arange(step, 1.0, step, device=device)
+        position_ids = torch.full(size=(batch_size, max_num_patches), fill_value=0, device=device)
+
+        for batch_idx, p_attn_mask in enumerate(patch_attention_mask):
+            bucket_coords_h = self._bucketize_axis(p_attn_mask[:, 0].sum(), boundaries, pixel_values.dtype)
+            bucket_coords_w = self._bucketize_axis(p_attn_mask[0].sum(), boundaries, pixel_values.dtype)
+
+            # Row-major index into the `num_patches_per_side` x `num_patches_per_side` position grid.
+            pos_ids = (bucket_coords_h[:, None] * self.num_patches_per_side + bucket_coords_w).flatten()
+            position_ids[batch_idx][p_attn_mask.view(-1)] = pos_ids
+
+        return embeddings + self.position_embedding(position_ids)
+
+
+def eager_attention_forward(
+    module: nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: Optional[torch.Tensor],
+    scaling: float,
+    dropout: float = 0.0,
+    **kwargs,
+):
+    """
+    Plain PyTorch scaled dot-product attention.
+
+    If `module` defines `num_key_value_groups`, keys and values are first expanded with `repeat_kv`. An additive
+    `attention_mask` (cropped to the key length on its last dimension) is applied to the scaled scores. The
+    softmax is computed in float32 and cast back to the query dtype. Dropout follows `module.training`.
+
+    Returns:
+        The attention output with dimensions 1 and 2 swapped (made contiguous), and the attention weights.
+    """
+    if hasattr(module, "num_key_value_groups"):
+        num_groups = module.num_key_value_groups
+        key, value = repeat_kv(key, num_groups), repeat_kv(value, num_groups)
+
+    scores = torch.matmul(query, key.transpose(2, 3)) * scaling
+    if attention_mask is not None:
+        scores = scores + attention_mask[:, :, :, : key.shape[-2]]
+
+    probs = nn.functional.softmax(scores, dim=-1, dtype=torch.float32).to(query.dtype)
+    probs = nn.functional.dropout(probs, p=dropout, training=module.training)
+
+    context = torch.matmul(probs, value).transpose(1, 2).contiguous()
+    return context, probs
+
+
+# Copied from transformers.models.siglip.modeling_siglip.SiglipAttention with Siglip->Idefics2Vision
+class Idefics2VisionAttention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+    # Copied from transformers.models.clip.modeling_clip.CLIPAttention.__init__
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.embed_dim = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim, remainder = divmod(self.embed_dim, self.num_heads)
+        if remainder != 0:
+            raise ValueError(
+                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
+                f" {self.num_heads})."
+            )
+        self.scale = self.head_dim**-0.5
+        self.dropout = config.attention_dropout
+
+        # Creation order (k, v, q, out) is kept so parameter registration and initialisation order are unchanged.
+        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
+        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
+        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
+        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)
+
+        # Ignore copy
+        self.is_causal = False
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        **kwargs,
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        """
+        Self-attention over `hidden_states` of shape Batch x Time x Channel.
+
+        The attention backend is chosen from `config._attn_implementation`. Extra keyword arguments are accepted
+        but not forwarded to the backend.
+        """
+        batch_size, seq_length, embed_dim = hidden_states.shape
+
+        def split_heads(projected: torch.Tensor) -> torch.Tensor:
+            # (batch, seq, embed) -> (batch, heads, seq, head_dim)
+            return projected.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
+
+        queries = split_heads(self.q_proj(hidden_states))
+        keys = split_heads(self.k_proj(hidden_states))
+        values = split_heads(self.v_proj(hidden_states))
+
+        if self.config._attn_implementation == "eager":
+            attention_interface: Callable = eager_attention_forward
+        else:
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+        # Attention dropout is only active in training mode.
+        dropout = self.dropout if self.training else 0.0
+        attn_output, attn_weights = attention_interface(
+            self,
+            queries,
+            keys,
+            values,
+            attention_mask,
+            is_causal=self.is_causal,
+            scaling=self.scale,
+            dropout=dropout,
+        )
+
+        attn_output = attn_output.reshape(batch_size, seq_length, embed_dim).contiguous()
+        return self.out_proj(attn_output), attn_weights
+
+
+# Copied from transformers.models.siglip.modeling_siglip.SiglipMLP with Siglip->Idefics2Vision
+class Idefics2VisionMLP(nn.Module):
+    """Two-layer feed-forward block: `fc1` -> activation -> `fc2`, mapping `hidden_size` back to `hidden_size`."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        # The activation is looked up by name (`config.hidden_act`) in `ACT2FN`.
+        self.activation_fn = ACT2FN[config.hidden_act]
+        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
+        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        """Apply the expansion, the activation and the projection back, in that order."""
+        expanded = self.activation_fn(self.fc1(hidden_states))
+        return self.fc2(expanded)
+
+
+class Idefics2MLP(nn.Module):
+    """Gated feed-forward block without biases: `down_proj(act_fn(gate_proj(x)) * up_proj(x))`."""
+
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        output_size: int,
+        hidden_act: str,
+    ):
+        super().__init__()
+        self.gate_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
+        self.up_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
+        self.down_proj = nn.Linear(intermediate_size, output_size, bias=False)
+        # The activation is looked up by name in `ACT2FN`.
+        self.act_fn = ACT2FN[hidden_act]
+
+    def forward(self, x):
+        """Project `x` to `output_size` through the gated intermediate representation."""
+        gate = self.act_fn(self.gate_proj(x))
+        gated = gate * self.up_proj(x)
+        return self.down_proj(gated)
+
+
+# Copied from transformers.models.siglip.modeling_siglip.SiglipMultiheadAttentionPoolingHead with Siglip->Idefics2
+class Idefics2MultiheadAttentionPoolingHead(nn.Module):
+    """Multihead Attention Pooling."""
+
+    def __init__(self, config: Idefics2VisionConfig):
+        super().__init__()
+
+        hidden_size = config.hidden_size
+        # A single learned query vector that attends over the whole input sequence.
+        self.probe = nn.Parameter(torch.randn(1, 1, hidden_size))
+        self.attention = torch.nn.MultiheadAttention(hidden_size, config.num_attention_heads, batch_first=True)
+        self.layernorm = nn.LayerNorm(hidden_size, eps=config.layer_norm_eps)
+        # Ignore copy
+        self.mlp = Idefics2MLP(
+            hidden_size=hidden_size,
+            intermediate_size=config.intermediate_size,
+            hidden_act=config.hidden_act,
+            output_size=hidden_size,
+        )
+
+    def forward(self, hidden_state):
+        """
+        Pool `hidden_state` (batch first) into one vector per sample.
+
+        The probe is repeated for every sample and used as the query of an attention over `hidden_state`. The
+        result then goes through a residual block (LayerNorm followed by the MLP), and the single query position
+        is returned.
+        """
+        num_samples = hidden_state.shape[0]
+        queries = self.probe.repeat(num_samples, 1, 1)
+
+        pooled, _ = self.attention(queries, hidden_state, hidden_state)
+        pooled = pooled + self.mlp(self.layernorm(pooled))
+
+        return pooled[:, 0]
+
+
+class Idefics2EncoderLayer(GradientCheckpointingLayer):
+    """Pre-norm transformer block of the vision encoder: self-attention and MLP, each with a residual connection."""
+
+    def __init__(self, config: Idefics2VisionConfig):
+        super().__init__()
+        self.embed_dim = config.hidden_size
+        self.self_attn = Idefics2VisionAttention(config)
+        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+        self.mlp = Idefics2VisionMLP(config)
+        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+
+    # Copied from transformers.models.siglip.modeling_siglip.SiglipEncoderLayer.forward
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: torch.Tensor,
+        output_attentions: Optional[bool] = False,
+    ) -> tuple[torch.FloatTensor]:
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`):
+                Input to the layer of shape `(batch, seq_len, embed_dim)`.
+            attention_mask (`torch.FloatTensor`):
+                Attention mask of shape `(batch, 1, q_len, k_v_seq_len)` where padding elements are indicated by very large negative values.
+            output_attentions (`bool`, *optional*, defaults to `False`):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+
+        Returns:
+            `(hidden_states,)`, or `(hidden_states, attn_weights)` when `output_attentions` is truthy.
+        """
+        # Attention sub-block: normalise, attend, add the residual.
+        attn_output, attn_weights = self.self_attn(
+            hidden_states=self.layer_norm1(hidden_states),
+            attention_mask=attention_mask,
+            output_attentions=output_attentions,
+        )
+        hidden_states = hidden_states + attn_output
+
+        # Feed-forward sub-block: normalise, transform, add the residual.
+        hidden_states = hidden_states + self.mlp(self.layer_norm2(hidden_states))
+
+        if output_attentions:
+            return (hidden_states, attn_weights)
+        return (hidden_states,)
+
+
+# Copied from transformers.models.siglip.modeling_siglip.SiglipEncoder with Siglip->Idefics2
+class Idefics2Encoder(nn.Module):
+    """
+    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
+    [`Idefics2EncoderLayer`].
+
+    Args:
+        config: Idefics2Config
+    """
+
+    def __init__(self, config: Idefics2Config):
+        super().__init__()
+        self.config = config
+        self.layers = nn.ModuleList([Idefics2EncoderLayer(config) for _ in range(config.num_hidden_layers)])
+        self.gradient_checkpointing = False
+
+    # Ignore copy
+    def forward(
+        self,
+        inputs_embeds,
+        attention_mask: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, BaseModelOutput]:
+        r"""
+        Args:
+            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
+                than the model's internal embedding lookup matrix.
+            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
+
+                [What are attention masks?](../glossary#attention-mask)
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            output_hidden_states (`bool`, *optional*):
+                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+                for more detail.
+            return_dict (`bool`, *optional*):
+                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+        """
+        # Flags that were not passed explicitly fall back to the values stored in the config.
+        if output_attentions is None:
+            output_attentions = self.config.output_attentions
+        if output_hidden_states is None:
+            output_hidden_states = self.config.output_hidden_states
+        if return_dict is None:
+            return_dict = self.config.use_return_dict
+
+        # The hidden-state trace starts with the input embeddings and gains one entry per layer.
+        state_trace = [inputs_embeds] if output_hidden_states else None
+        attention_trace = [] if output_attentions else None
+
+        hidden_states = inputs_embeds
+        for encoder_layer in self.layers:
+            layer_outputs = encoder_layer(
+                hidden_states,
+                attention_mask,
+                output_attentions=output_attentions,
+            )
+            hidden_states = layer_outputs[0]
+
+            if state_trace is not None:
+                state_trace.append(hidden_states)
+            if attention_trace is not None:
+                attention_trace.append(layer_outputs[1])
+
+        encoder_states = None if state_trace is None else tuple(state_trace)
+        all_attentions = None if attention_trace is None else tuple(attention_trace)
+
+        if not return_dict:
+            # Entries that were not requested are dropped from the tuple.
+            return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
+        return BaseModelOutput(
+            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
+        )
+
+
+@auto_docstring
+class Idefics2PreTrainedModel(PreTrainedModel):
+    config: Idefics2Config
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["Idefics2VisionAttention", "Idefics2MLP", "Idefics2PerceiverLayer", "Idefics2DecoderLayer"]
+    _skip_keys_device_placement = "past_key_values"
+    _supports_flash_attn = True
+    _supports_sdpa = True
+    _supports_flex_attn = True
+
+    _supports_attention_backend = True
+
+    def _init_weights(self, module):  # initialise a single submodule according to its type
+        std = getattr(self.config, "initializer_range", self.config.get_text_config().initializer_range)  # config's own value if present, else the text config's (the fallback expression is evaluated either way)
+
+        if isinstance(module, (nn.Linear, nn.Conv2d)):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()  # the padding row is reset to zeros
+        elif isinstance(module, nn.LayerNorm):
+            module.weight.data.fill_(1.0)
+            module.bias.data.zero_()
+        elif isinstance(module, Idefics2RMSNorm):
+            module.weight.data.fill_(1.0)  # RMSNorm has a scale only, no bias
+        elif isinstance(module, nn.MultiheadAttention):
+            module._reset_parameters()  # native torch init
+        elif isinstance(module, Idefics2MultiheadAttentionPoolingHead):
+            module.probe.data.normal_()  # `normal_()` with its default arguments
+        elif isinstance(module, Idefics2PerceiverResampler):
+            module.latents.data.fill_(1.0)  # same value the resampler's `__init__` uses (`torch.ones`)
+
+
+@auto_docstring(
+    custom_intro="""
+    Idefics2 vision encoder model that returnss raw image embeddings.
+    """
+)
+class Idefics2VisionTransformer(Idefics2PreTrainedModel):
+    config: Idefics2VisionConfig
+    _supports_sdpa = True
+    _supports_flash_attn = True
+    _supports_flex_attn = True
+
+    def __init__(self, config: Idefics2VisionConfig):
+        super().__init__(config)
+        embed_dim = config.hidden_size
+
+        self.config = config
+        self.embeddings = Idefics2VisionEmbeddings(config)
+        self.encoder = Idefics2Encoder(config)
+        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
+        self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
+
+    def get_input_embeddings(self):
+        return self.embeddings
+
+    def set_input_embeddings(self, value):
+        self.embeddings = value
+
+    @auto_docstring
+    def forward(
+        self,
+        pixel_values,
+        patch_attention_mask: Optional[torch.BoolTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, BaseModelOutput]:
+        r"""
+        patch_attention_mask (`torch.BoolTensor` of shape `(batch_size, num_patches_height, num_patches_width)`, *optional*):
+            The attention mask for the patches.
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        batch_size = pixel_values.size(0)
+        if patch_attention_mask is None:
+            patch_size = self.config.patch_size
+            patch_attention_mask = torch.ones(
+                (
+                    batch_size,
+                    pixel_values.size(2) // patch_size,
+                    pixel_values.size(3) // patch_size,
+                )
+            )
+            patch_attention_mask = patch_attention_mask.to(dtype=torch.bool, device=pixel_values.device)
+
+        hidden_states = self.embeddings(pixel_values=pixel_values, patch_attention_mask=patch_attention_mask)
+
+        patch_attention_mask = patch_attention_mask.view(batch_size, -1)
+        # The call to `_upad_input` in `_flash_attention_forward` is expensive
+        # So when the `patch_attention_mask` is full of 1s (i.e. attending to the whole sequence),
+        # avoiding passing the attention_mask, which is equivalent to attending to the full sequence
+        if not torch.any(~patch_attention_mask):
+            patch_attention_mask = None
+        elif not self._use_flash_attention_2:
+            patch_attention_mask = _prepare_4d_attention_mask(patch_attention_mask, hidden_states.dtype)
+
+        encoder_outputs = self.encoder(
+            inputs_embeds=hidden_states,
+            attention_mask=patch_attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        last_hidden_state = encoder_outputs[0]
+        last_hidden_state = self.post_layernorm(last_hidden_state)
+
+        if not return_dict:
+            return (last_hidden_state,) + encoder_outputs[1:]
+
+        return BaseModelOutput(
+            last_hidden_state=last_hidden_state,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+        )
+
+
+# Copied from transformers.models.llama.modeling_llama.repeat_kv
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """
+    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
+    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
+    """
+    batch, num_key_value_heads, slen, head_dim = hidden_states.shape  # unpacking requires a 4D input
+    if n_rep == 1:
+        return hidden_states  # nothing to repeat: the input tensor itself is returned
+    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)  # new axis after the head axis, broadcast to `n_rep`
+    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)  # fold the repeat axis into the head axis
+
+
+# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Idefics2
+class Idefics2RMSNorm(nn.Module):
+    def __init__(self, hidden_size, eps=1e-6):
+        """
+        Idefics2RMSNorm is equivalent to T5LayerNorm
+        """
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))  # learned per-feature scale, initialised to 1
+        self.variance_epsilon = eps  # added to the mean square before `rsqrt`
+
+    def forward(self, hidden_states):
+        input_dtype = hidden_states.dtype
+        hidden_states = hidden_states.to(torch.float32)  # the normalisation itself is computed in float32
+        variance = hidden_states.pow(2).mean(-1, keepdim=True)  # mean of squares over the last dim (no mean subtraction)
+        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+        return self.weight * hidden_states.to(input_dtype)  # cast back to the input dtype, then apply the scale
+
+    def extra_repr(self):
+        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
+
+
+class Idefics2PerceiverAttention(nn.Module):
+    def __init__(self, config, layer_idx: Optional[int] = None) -> None:
+        """Perceiver Cross-Attention Module --> let long-form inputs be `context`, resampled embeddings be `latents`"""
+        super().__init__()
+        self.config = config
+        self.layer_idx = None
+        self.hidden_size = config.hidden_size
+        self.num_heads = config.resampler_n_heads
+        self.head_dim = config.resampler_head_dim
+        self.num_key_value_heads = config.num_key_value_heads
+        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
+        self.attention_dropout = config.attention_dropout
+        self.scaling = self.head_dim**-0.5
+
+        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
+        self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
+        self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
+        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
+
+        self.is_causal = False
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        latents: torch.Tensor,
+        context: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
+        """
+        Runs Perceiver Self-Attention, with special (context, latents) appended along the `seq` dimension!
+
+        Args:
+            latents (`torch.Tensor`): Tensor of shape [bsz, n_latents, embed_dim] representing fixed length latents to compress to.
+            context (`torch.Tensor`): Tensor of shape [bsz, seq, embed_dim] representing long-form context to resample.
+            attention_mask (`torch.Tensor`, *optional*): Tensor of shape [bsz, 1, seq, n_latents] representing attention mask.
+            position_ids (`torch.LongTensor`, *optional*): Tensor of shape [bsz, seq] representing position indices of each input token.
+            past_key_values (`tuple[torch.Tensor]`, *optional*): Tuple of tensors containing cached key and value states.
+            output_attentions (`bool`, *optional*, defaults to `False`): Whether to return attention weights.
+            use_cache (`bool`, *optional*, defaults to `False`): Whether to use past_key_values for caching.
+        """
+        bsz, q_len, _ = latents.size()
+        kv_seq_len = q_len + context.size()[1]
+
+        hidden_states = torch.concat([context, latents], dim=-2)
+
+        queries = self.q_proj(latents)
+        keys = self.k_proj(hidden_states)
+        values = self.v_proj(hidden_states)
+
+        queries = queries.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        keys = keys.view(bsz, kv_seq_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        values = values.view(bsz, kv_seq_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+
+        past_key_values = getattr(self, "past_key_values", past_key_values)
+
+        if past_key_values is not None:
+            keys, values = past_key_values.update(keys, values, self.layer_idx)
+
+        attention_interface: Callable = eager_attention_forward
+        if self.config._attn_implementation != "eager":
+            if self.config._attn_implementation == "sdpa" and output_attentions:
+                logger.warning_once(
+                    "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
+                    'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
+                )
+            else:
+                attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+        attn_output, attn_weights = attention_interface(
+            self,
+            queries,
+            keys,
+            values,
+            attention_mask,
+            is_causal=self.is_causal,
+            scaling=self.scaling,
+            dropout=0.0 if not self.training else self.attention_dropout,
+        )
+
+        attn_output = attn_output.reshape(bsz, q_len, self.num_heads * self.head_dim)
+        attn_output = self.o_proj(attn_output)
+
+        if not output_attentions:
+            attn_weights = None
+
+        return attn_output, attn_weights, past_key_values
+
+
+class Idefics2PerceiverLayer(nn.Module):
+    def __init__(self, config, layer_idx: int):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.n_latents = config.resampler_n_latents
+        self.depth = config.resampler_depth
+        self.rms_norm_eps = config.rms_norm_eps
+
+        self.input_latents_norm = Idefics2RMSNorm(self.hidden_size, eps=self.rms_norm_eps)
+        self.input_context_norm = Idefics2RMSNorm(self.hidden_size, eps=self.rms_norm_eps)
+        self.self_attn = Idefics2PerceiverAttention(config, layer_idx=layer_idx)
+        self.post_attention_layernorm = Idefics2RMSNorm(self.hidden_size, eps=self.rms_norm_eps)
+        self.mlp = Idefics2MLP(
+            hidden_size=config.hidden_size,
+            intermediate_size=config.hidden_size * 4,
+            output_size=config.hidden_size,
+            hidden_act=config.hidden_act,
+        )
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        latents: torch.Tensor,
+        context: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        output_attentions: Optional[bool] = False,
+        use_cache: Optional[bool] = False,
+        **kwargs,
+    ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
+        """
+        Args:
+            latents (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+            context (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
+                `(batch, sequence_length)` where padding elements are indicated by 0.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            use_cache (`bool`, *optional*):
+                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
+                (see `past_key_values`).
+            past_key_values (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
+        """
+        residual = latents
+
+        latents = self.input_latents_norm(latents)
+        context = self.input_context_norm(context)
+
+        latents, self_attn_weights, present_key_value = self.self_attn(
+            latents=latents,
+            context=context,
+            attention_mask=attention_mask,
+        )
+        latents = residual + latents
+        residual = latents
+
+        latents = self.post_attention_layernorm(latents)
+        latents = self.mlp(latents)
+        latents = residual + latents
+
+        outputs = (latents,)
+
+        if output_attentions:
+            outputs += (self_attn_weights,)
+
+        if use_cache:
+            outputs += (present_key_value,)
+
+        return outputs
+
+
+@auto_docstring(
+    custom_intro="""
+    Idefics2 perceiver resampler model that performs `depth` blocks of cross-attention with a fixed
+    """
+)
+class Idefics2PerceiverResampler(Idefics2PreTrainedModel):
+    config: Idefics2PerceiverConfig
+    _supports_sdpa = True
+    _supports_flash_attention_2 = True
+    _supports_flex_attn = True
+
+    def __init__(self, config) -> None:
+        super().__init__(config)
+        self.hidden_size = config.hidden_size
+        self.hidden_act = config.hidden_act
+        self.n_latents = config.resampler_n_latents
+        self.depth = config.resampler_depth
+        self.rms_norm_eps = config.rms_norm_eps
+
+        # Create Latents for Perceiver
+        self.latents = nn.Parameter(torch.ones(self.n_latents, self.hidden_size))
+
+        # Create Transformer Blocks
+        self.layers = nn.ModuleList([Idefics2PerceiverLayer(config, idx) for idx in range(self.depth)])
+        self.norm = Idefics2RMSNorm(self.hidden_size, eps=self.rms_norm_eps)
+
+        self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
+
+    @auto_docstring
+    def forward(
+        self,
+        context: torch.Tensor,
+        attention_mask: torch.Tensor,
+    ) -> torch.Tensor:
+        r"""
+        context (`torch.FloatTensor` of shape `(batch, seq_len, embed_dim)`):
+            Input to the layer.
+        """
+        # seq embed -> bsz seq embed
+        latents = self.latents.unsqueeze(0).expand((context.shape[0], *self.latents.size()))
+
+        latent_attention_mask = torch.ones(
+            (attention_mask.size(0), latents.size(1)), dtype=attention_mask.dtype, device=attention_mask.device
+        )
+        attention_mask = torch.cat([attention_mask, latent_attention_mask], dim=-1)
+        attention_mask = (
+            _prepare_4d_attention_mask(attention_mask, latents.dtype, tgt_len=self.n_latents)
+            if not self._use_flash_attention_2
+            else attention_mask
+        )
+
+        compressed_context = latents
+        for perceiver_layer in self.layers:
+            layer_outputs = perceiver_layer(
+                compressed_context,
+                context,
+                attention_mask=attention_mask,
+                position_ids=None,
+                past_key_values=None,
+                output_attentions=False,
+                use_cache=False,
+            )
+
+            compressed_context = layer_outputs[0]
+
+        compressed_context = self.norm(compressed_context)
+
+        return compressed_context
+
+
+class Idefics2Connector(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.modality_projection = Idefics2MLP(  # maps vision hidden size -> text hidden size
+            hidden_size=config.vision_config.hidden_size,
+            intermediate_size=config.text_config.intermediate_size,
+            output_size=config.text_config.hidden_size,
+            hidden_act=config.text_config.hidden_act,
+        )
+        self.perceiver_resampler = Idefics2PerceiverResampler._from_config(config.perceiver_config)
+
+    def forward(self, image_hidden_states, attention_mask):
+        image_hidden_states = self.modality_projection(image_hidden_states)  # project first ...
+        image_hidden_states = self.perceiver_resampler(context=image_hidden_states, attention_mask=attention_mask)  # ... then resample with the same mask
+        return image_hidden_states
+
+
+@auto_docstring(
+    custom_intro="""
+    Idefics2 model consisting of a SIGLIP vision encoder and Mistral language decoder
+    """
+)
+class Idefics2Model(Idefics2PreTrainedModel):
+    def __init__(self, config: Idefics2Config):
+        super().__init__(config)
+        self.padding_idx = self.config.text_config.pad_token_id
+        self.vocab_size = self.config.text_config.vocab_size
+
+        self.vision_model = Idefics2VisionTransformer._from_config(config.vision_config)  # image encoder
+        self.connector = Idefics2Connector(config)  # projects and resamples the vision outputs
+        self.text_model = AutoModel.from_config(config.text_config)  # language model, built from the text sub-config
+
+        self.image_seq_len = config.perceiver_config.resampler_n_latents  # number of resampler latents
+        self.image_token_id = self.config.image_token_id
+
+        self._use_flash_attention_2 = config.text_config._attn_implementation == "flash_attention_2"
+
+        self.post_init()
+
+    def enable_input_require_grads(self):
+        """
+        Enables the gradients for the input embeddings.
+
+        This is useful for lora when using gradient checkpointing.
+        c.f. https://github.com/huggingface/peft/issues/1402#issuecomment-1913675032
+
+        Override to set output.requires_grad = True for both the decoder's and vision model's embeddings.
+        """
+
+        def get_lowest_module(module):
+            if len(list(module.children())) == 0:
+                # If the module has no children, it is a leaf module (e.g., Linear, Conv2d, etc.)
+                return module
+            else:
+                # Recurse into the FIRST child only, i.e. follow the chain of first children down to a leaf
+                return get_lowest_module(list(module.children())[0])
+
+        def make_inputs_require_grads(module, input, output):
+            output.requires_grad_(True)
+
+        self._text_require_grads_hook = self.get_input_embeddings().register_forward_hook(make_inputs_require_grads)  # handle kept so the hook can be removed later
+        self._vision_require_grads_hook = get_lowest_module(self.vision_model).register_forward_hook(
+            make_inputs_require_grads
+        )
+
+    def disable_input_require_grads(self):  # removes the hooks stored by `enable_input_require_grads`
+        self._text_require_grads_hook.remove()  # attribute only exists after `enable_input_require_grads` has run
+        self._vision_require_grads_hook.remove()
+
+    def get_input_embeddings(self):
+        return self.text_model.get_input_embeddings()  # delegates to the wrapped text model
+
+    def set_input_embeddings(self, value):
+        self.text_model.set_input_embeddings(value)  # delegates to the wrapped text model
+
+    def inputs_merger(
+        self,
+        input_ids: torch.LongTensor,
+        inputs_embeds: Optional[torch.Tensor],
+        image_hidden_states: Optional[torch.Tensor],
+    ):
+        """
+        This method aims at merging the token embeddings with the image hidden states into one single sequence of vectors that are fed to the transformer LM.
+        The merging happens as follows:
+        - The text token sequence is: `tok_1 tok_2 tok_3 <fake_token_around_image> <image> <image> ... <image> <fake_token_around_image> tok_4`.
+        - We get the image hidden states for the image through the vision encoder (and potentially the perceiver), and that hidden state is then projected into the text embedding space.
+        We thus have a sequence of image hidden states of size (1, image_seq_len, hidden_dim), where 1 is for batch_size of 1 image and hidden_dim is the hidden_dim of the LM transformer.
+        - The merging happens so that we obtain the following sequence: `vector_tok_1 vector_tok_2 vector_tok_3 vector_fake_tok_around_image {sequence of image_seq_len image hidden states} vector_fake_toke_around_image vector_tok_4`. That sequence is fed to the LM.
+        - Only `inputs_embeds` is modified and returned: positions holding the image token are overwritten with the image hidden states; `input_ids` is only read to locate them.
+        """
+        if input_ids is None:
+            special_image_mask = inputs_embeds == self.get_input_embeddings()(  # no ids: find image positions by comparing against the image token's embedding
+                torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device)
+            )
+            special_image_mask = special_image_mask.all(-1)  # a position matches only if every feature is equal
+        else:
+            special_image_mask = input_ids == self.config.image_token_id
+
+        special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
+        image_hidden_states = image_hidden_states.to(inputs_embeds.device, inputs_embeds.dtype)
+        inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_hidden_states)  # out-of-place: returns a new tensor
+        return inputs_embeds
+
+    def get_image_features(self, pixel_values: torch.FloatTensor, pixel_attention_mask: torch.LongTensor = None):
+        """
+        Encodes images into continuous embeddings that can be forwarded to the language model.
+
+        Args:
+            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_images, num_channels, height, width)`):
+                The tensors corresponding to the input images.
+            pixel_attention_mask (`torch.LongTensor`, *optional*):
+                The attention mask indicating padded regions in the image.
+        """
+        batch_size, num_images, num_channels, height, width = pixel_values.shape
+        pixel_values = pixel_values.to(dtype=self.dtype)  # fp16 compatibility
+        pixel_values = pixel_values.view(batch_size * num_images, *pixel_values.shape[2:])
+
+        # Remove padding images - padding images are full 0.
+        nb_values_per_image = pixel_values.shape[1:].numel()
+        real_images_inds = (pixel_values == 0.0).sum(dim=(-1, -2, -3)) != nb_values_per_image
+        pixel_values = pixel_values[real_images_inds].contiguous()
+
+        # Handle the vision attention mask
+        if pixel_attention_mask is None:
+            pixel_attention_mask = torch.ones(
+                size=(pixel_values.size(0), pixel_values.size(2), pixel_values.size(3)),
+                dtype=torch.bool,
+                device=pixel_values.device,
+            )
+        else:
+            # Remove padding images from the mask
+            pixel_attention_mask = pixel_attention_mask.view(batch_size * num_images, *pixel_attention_mask.shape[2:])
+            pixel_attention_mask = pixel_attention_mask[real_images_inds].contiguous()
+
+        patch_size = self.config.vision_config.patch_size
+        patches_subgrid = pixel_attention_mask.unfold(dimension=1, size=patch_size, step=patch_size)
+        patches_subgrid = patches_subgrid.unfold(dimension=2, size=patch_size, step=patch_size)
+        patch_attention_mask = (patches_subgrid.sum(dim=(-1, -2)) == patch_size * patch_size).bool()  # a patch is kept only if all of its pixels are unmasked
+        # Get sequence from the vision encoder
+        image_hidden_states = self.vision_model(pixel_values=pixel_values, patch_attention_mask=patch_attention_mask)
+        image_hidden_states = image_hidden_states.last_hidden_state
+
+        # Modality projection & resampling
+        image_hidden_states = self.connector(
+            image_hidden_states, attention_mask=patch_attention_mask.view(pixel_values.size(0), -1)
+        )
+        image_hidden_states = image_hidden_states.view(-1, image_hidden_states.shape[-1])  # flatten all leading dims: the result is 2D
+        return image_hidden_states
+
+    @can_return_tuple
+    @auto_docstring(
+        custom_intro="""
+        Inputs fed to the model can have an arbitrary number of images. To account for this, pixel_values fed to
+        the model have image padding -> (batch_size, max_num_images, 3, max_heights, max_widths) where
+        max_num_images is the maximum number of images among the batch_size samples in the batch.
+
+        Padding images are not needed beyond padding the pixel_values at the entrance of the model.
+        For efficiency, we only pass through the vision_model's forward the real images by
+        discarding the padding images i.e. pixel_values of size (image_batch_size, 3, height, width) where
+        image_batch_size would be 7 when num_images_per_sample=[1, 3, 1, 2] and max_num_images would be 3.
+        """
+    )
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        pixel_attention_mask: Optional[torch.BoolTensor] = None,
+        image_hidden_states: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        return_dict: Optional[bool] = None,
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> Union[tuple, Idefics2BaseModelOutputWithPast]:
+        r"""
+        pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*):
+            Mask to avoid performing attention on padding pixel indices.
+        image_hidden_states (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
+            The hidden states of the image encoder after modality projection and perceiver resampling.
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if self.training and self.text_model.gradient_checkpointing and use_cache:
+            logger.warning_once(
+                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+            )
+            use_cache = False
+
+        # retrieve input_ids and inputs_embeds
+        if input_ids is not None:
+            batch_size, seq_length = input_ids.shape
+        elif inputs_embeds is not None:
+            batch_size, seq_length, _ = inputs_embeds.shape
+        else:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+        # TODO (joao): remove this exception in v4.56 -- it exists for users that try to pass a legacy cache
+        if not isinstance(past_key_values, (type(None), Cache)):
+            raise ValueError("The `past_key_values` should be either a `Cache` object or `None`.")
+
+        if use_cache and past_key_values is None:
+            past_key_values = DynamicCache(config=self.config)
+
+        if inputs_embeds is None:
+            inputs_embeds = self.text_model.get_input_embeddings()(input_ids)
+
+        # START VISUAL INPUTS INTEGRATION
+        if pixel_values is not None and image_hidden_states is not None:
+            raise ValueError("You cannot specify both pixel_values and image_hidden_states at the same time")
+        elif pixel_values is not None:
+            image_hidden_states = self.get_image_features(pixel_values, pixel_attention_mask)
+        elif image_hidden_states is not None:
+            image_hidden_states = image_hidden_states.to(dtype=self.dtype, device=input_ids.device)
+
+        if image_hidden_states is not None:
+            # When we generate, we don't want to replace the potential image_token_id that we generated by images
+            # that simply don't exist
+            inputs_embeds = self.inputs_merger(
+                input_ids=input_ids,
+                inputs_embeds=inputs_embeds,
+                image_hidden_states=image_hidden_states,
+            )
+
+        outputs = self.text_model(
+            inputs_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            cache_position=cache_position,
+            return_dict=True,
+            **kwargs,
+        )
+
+        return Idefics2BaseModelOutputWithPast(
+            last_hidden_state=outputs.last_hidden_state,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            image_hidden_states=image_hidden_states,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    The Idefics2 Model with a language modeling head. It is made up of a SigLIP vision encoder, with a language modeling head on top.
+    """
+)
+class Idefics2ForConditionalGeneration(Idefics2PreTrainedModel, GenerationMixin):
+    _tied_weights_keys = ["lm_head.weight"]
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.model = Idefics2Model(config)
+        self.image_token_id = self.config.image_token_id
+
+        self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)
+        self.vocab_size = config.text_config.vocab_size
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def enable_input_require_grads(self):
+        """
+        Enables the gradients for the input embeddings. This is useful for fine-tuning adapter weights while keeping
+        the model weights fixed.
+        """
+
+        def make_inputs_require_grads(module, input, output):
+            output.requires_grad_(True)
+
+        self._text_require_grads_hook = self.get_input_embeddings().register_forward_hook(make_inputs_require_grads)
+        self._vision_require_grads_hook = self.model.vision_model.get_input_embeddings().register_forward_hook(
+            make_inputs_require_grads
+        )
+
+    def disable_input_require_grads(self):
+        """Removes the hooks installed by `enable_input_require_grads` (the handles only exist once it has run)."""
+        self._text_require_grads_hook.remove()
+        self._vision_require_grads_hook.remove()
+
+    def get_input_embeddings(self):
+        return self.model.text_model.get_input_embeddings()  # delegated to the wrapped text model
+
+    def set_input_embeddings(self, value):
+        self.model.text_model.set_input_embeddings(value)  # delegated to the wrapped text model
+
+    def get_image_features(self, pixel_values: torch.FloatTensor, pixel_attention_mask: torch.LongTensor = None):
+        return self.model.get_image_features(pixel_values=pixel_values, pixel_attention_mask=pixel_attention_mask)
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        pixel_attention_mask: Optional[torch.BoolTensor] = None,
+        image_hidden_states: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> Union[tuple, Idefics2CausalLMOutputWithPast]:
+        r"""
+        pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*):
+            Mask to avoid performing attention on padding pixel indices.
+        image_hidden_states (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
+            The hidden states of the image encoder after modality projection and perceiver resampling.
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+            config.vocab_size]` or `model.image_token_id` (where `model` is your instance of `Idefics2ForConditionalGeneration`).
+            Tokens with indices set to `model.image_token_id` are ignored (masked), the loss is only
+            computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+        Example:
+
+        ```python
+        >>> import requests
+        >>> import torch
+        >>> from PIL import Image
+        >>> from io import BytesIO
+
+        >>> from transformers import AutoProcessor, AutoModelForVision2Seq
+        >>> from transformers.image_utils import load_image
+
+        >>> # Note that passing the image urls (instead of the actual pil images) to the processor is also possible
+        >>> image1 = load_image("https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg")
+        >>> image2 = load_image("https://cdn.britannica.com/59/94459-050-DBA42467/Skyline-Chicago.jpg")
+        >>> image3 = load_image("https://cdn.britannica.com/68/170868-050-8DDE8263/Golden-Gate-Bridge-San-Francisco.jpg")
+
+        >>> processor = AutoProcessor.from_pretrained("HuggingFaceM4/idefics2-8b-base")
+        >>> model = AutoModelForVision2Seq.from_pretrained("HuggingFaceM4/idefics2-8b-base", device_map="auto")
+
+        >>> BAD_WORDS_IDS = processor.tokenizer(["<image>", "<fake_token_around_image>"], add_special_tokens=False).input_ids
+        >>> EOS_WORDS_IDS = [processor.tokenizer.eos_token_id]
+
+        >>> # Create inputs
+        >>> prompts = [
+        ...   "<image>In this image, we can see the city of New York, and more specifically the Statue of Liberty.<image>In this image,",
+        ...   "In which city is that bridge located?<image>",
+        ... ]
+        >>> images = [[image1, image2], [image3]]
+        >>> inputs = processor(images=images, text=prompts, padding=True, return_tensors="pt").to("cuda")
+
+        >>> # Generate
+        >>> generated_ids = model.generate(**inputs, bad_words_ids=BAD_WORDS_IDS, max_new_tokens=20)
+        >>> generated_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)
+
+        >>> print(generated_texts)
+        ['In this image, we can see the city of New York, and more specifically the Statue of Liberty. In this image, we can see the city of New York, and more specifically the Statue of Liberty.\n\n', 'In which city is that bridge located?\n\nThe bridge is located in the city of Pittsburgh, Pennsylvania.\n\n\nThe bridge is']
+        ```"""
+
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+
+        # Run the multimodal backbone; `return_dict=True` is forced so the result can be read by attribute below
+        outputs = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            pixel_values=pixel_values,
+            pixel_attention_mask=pixel_attention_mask,
+            image_hidden_states=image_hidden_states,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            cache_position=cache_position,
+            return_dict=True,
+            **kwargs,
+        )
+
+        hidden_states = outputs[0]
+        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
+        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+        logits = self.lm_head(hidden_states[:, slice_indices, :])
+
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(
+                logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **kwargs
+            )
+
+        return Idefics2CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            image_hidden_states=outputs.image_hidden_states,
+        )
+
+    def prepare_inputs_for_generation(
+        self,
+        input_ids,
+        past_key_values=None,
+        attention_mask=None,
+        inputs_embeds=None,
+        cache_position=None,
+        pixel_values=None,
+        pixel_attention_mask=None,
+        image_hidden_states=None,
+        logits_to_keep=None,
+        **kwargs,
+    ):
+        """Overwritten -- `pixel_values` and `image_hidden_states` are mutually exclusive in the model, so the pixel
+        inputs are dropped when image features are given or when `cache_position` no longer starts at 0."""
+
+        model_inputs = super().prepare_inputs_for_generation(
+            input_ids,
+            past_key_values=past_key_values,
+            attention_mask=attention_mask,
+            inputs_embeds=inputs_embeds,
+            cache_position=cache_position,
+            pixel_values=pixel_values,
+            pixel_attention_mask=pixel_attention_mask,
+            image_hidden_states=image_hidden_states,
+            logits_to_keep=logits_to_keep,
+            **kwargs,
+        )
+        # `cache_position` defaults to None: only index it when it was actually provided
+        if image_hidden_states is not None or (cache_position is not None and cache_position[0] != 0):
+            model_inputs["pixel_values"] = None
+            model_inputs["pixel_attention_mask"] = None
+
+        return model_inputs
+
+
+__all__ = ["Idefics2ForConditionalGeneration", "Idefics2PreTrainedModel", "Idefics2Model"]
diff --git a/phivenv/Lib/site-packages/transformers/models/idefics2/processing_idefics2.py b/phivenv/Lib/site-packages/transformers/models/idefics2/processing_idefics2.py
new file mode 100644
index 0000000000000000000000000000000000000000..751c4693b95ec6404b9ed3b2836cc788bb17ad39
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/idefics2/processing_idefics2.py
@@ -0,0 +1,266 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Processor class for IDEFICS2.
+"""
+
+from itertools import accumulate
+from typing import TYPE_CHECKING, Optional, Union
+
+from ...feature_extraction_utils import BatchFeature
+from ...image_utils import ImageInput, is_valid_image, load_image
+from ...processing_utils import (
+    ImagesKwargs,
+    ProcessingKwargs,
+    ProcessorMixin,
+    Unpack,
+)
+from ...tokenization_utils_base import AddedToken, TextInput
+from ...utils import logging
+
+
+if TYPE_CHECKING:
+    from ...tokenization_utils_base import PreTokenizedInput
+
+
+logger = logging.get_logger(__name__)
+
+
+def is_url(val) -> bool:
+    return val.startswith("http") if isinstance(val, str) else False  # non-strings are never URLs
+
+
+def is_image_or_image_url(elem):
+    return True if is_url(elem) else is_valid_image(elem)  # URL strings short-circuit the image check
+
+
+class Idefics2ImagesKwargs(ImagesKwargs, total=False):
+    image_seq_len: Optional[int]  # number of `<image>` tokens per image; the processor falls back to its own value when None
+
+
+class Idefics2ProcessorKwargs(ProcessingKwargs, total=False):
+    images_kwargs: Idefics2ImagesKwargs  # image kwargs extended with `image_seq_len`
+
+    _defaults = {  # NOTE(review): presumably the baseline that `_merge_kwargs` overlays call-time kwargs on -- confirm
+        "text_kwargs": {
+            "add_special_tokens": True,
+            "padding": False,
+            "is_split_into_words": False,
+        },
+        "images_kwargs": {},
+    }
+
+
+class Idefics2Processor(ProcessorMixin):
+    r"""
+    Constructs an IDEFICS2 processor which wraps a Llama tokenizer and IDEFICS2 image processor into a single processor.
+
+    [`Idefics2Processor`] offers all the functionalities of [`Idefics2ImageProcessor`] and [`LlamaTokenizerFast`]. See
+    the docstring of [`~Idefics2Processor.__call__`] and [`~Idefics2Processor.decode`] for more information.
+
+    Args:
+        image_processor (`Idefics2ImageProcessor`):
+            An instance of [`Idefics2ImageProcessor`]. The image processor is a required input.
+        tokenizer (`PreTrainedTokenizerBase`, *optional*):
+            An instance of [`PreTrainedTokenizerBase`]. This should correspond with the model's text model. The tokenizer is a required input.
+        image_seq_len (`int`, *optional*, defaults to 64):
+            The length of the image sequence i.e. the number of <image> tokens per image in the input.
+            This parameter is used to build the string from the input prompt and image tokens and should match the
+            config.perceiver_config.resampler_n_latents value for the model used.
+        chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
+            in a chat into a tokenizable string.
+    """
+
+    attributes = ["image_processor", "tokenizer"]
+    image_processor_class = "Idefics2ImageProcessor"
+    tokenizer_class = "AutoTokenizer"
+
+    def __init__(
+        self, image_processor, tokenizer=None, image_seq_len: int = 64, chat_template: Optional[str] = None, **kwargs
+    ):
+        if image_processor is None:
+            raise ValueError("You need to specify an `image_processor`.")
+        if tokenizer is None:
+            raise ValueError("You need to specify a `tokenizer`.")
+
+        if not hasattr(tokenizer, "image_token"):
+            self.fake_image_token = AddedToken("<fake_token_around_image>", normalized=False, special=True).content
+            self.image_token = AddedToken("<image>", normalized=False, special=True).content
+            tokens_to_add = {"additional_special_tokens": [self.fake_image_token, self.image_token]}
+            tokenizer.add_special_tokens(tokens_to_add)
+            self.image_token_id = tokenizer.convert_tokens_to_ids(self.image_token)
+        else:
+            self.fake_image_token = tokenizer.image_boundary_token
+            self.image_token = tokenizer.image_token
+            self.image_token_id = tokenizer.image_token_id
+
+        self.end_of_utterance_token = AddedToken("<end_of_utterance>", normalized=False, special=True)
+        tokenizer.add_special_tokens({"additional_special_tokens": [self.end_of_utterance_token]})
+        self.image_seq_len = image_seq_len
+
+        super().__init__(image_processor, tokenizer, chat_template=chat_template)
+
+    def _extract_images_from_prompts(self, prompts):
+        """Returns, per prompt, its valid images plus the result of `load_image` for every URL element."""
+        prompt_images = []
+        for prompt in prompts:
+            images = []
+            for elem in prompt:
+                if is_valid_image(elem):
+                    images.append(elem)
+                elif is_url(elem):
+                    images.append(load_image(elem))
+            prompt_images.append(images)
+        return prompt_images
+
+    def __call__(
+        self,
+        images: Union[ImageInput, list[ImageInput], list[list[ImageInput]]] = None,
+        text: Union[TextInput, "PreTokenizedInput", list[TextInput], list["PreTokenizedInput"]] = None,
+        audio=None,
+        videos=None,
+        **kwargs: Unpack[Idefics2ProcessorKwargs],
+    ) -> BatchFeature:
+        """
+        Processes the input prompts and returns a [`BatchFeature`].
+
+        Example:
+
+        ```python
+        >>> import requests
+        >>> from transformers import Idefics2Processor
+        >>> from transformers.image_utils import load_image
+
+        >>> processor = Idefics2Processor.from_pretrained("HuggingFaceM4/idefics2-8b", image_seq_len=2)
+        >>> processor.image_processor.do_image_splitting = False  # Force as False to simplify the example
+
+        >>> url1 = "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
+        >>> url2 = "https://cdn.britannica.com/59/94459-050-DBA42467/Skyline-Chicago.jpg"
+
+        >>> image1, image2 = load_image(url1), load_image(url2)
+        >>> images = [[image1], [image2]]
+
+        >>> text = [
+        ...     "<image>In this image, we see",
+        ...     "bla bla bla<image>",
+        ... ]
+        >>> outputs = processor(images=images, text=text, return_tensors="pt", padding=True)
+        >>> input_ids = outputs.input_ids
+        >>> input_tokens = processor.tokenizer.batch_decode(input_ids)
+        >>> print(input_tokens)
+        ['<s><fake_token_around_image><image><image><fake_token_around_image> In this image, we see', '<s> bla bla bla<fake_token_around_image><image><image><fake_token_around_image>']
+        ```
+
+        Args:
+            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`, *optional*):
+                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
+                tensor. If is of type `list[ImageInput]`, it's assumed that this is for a single prompt i.e. of batch size 1.
+            text (`Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]`, *optional*):
+                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
+                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
+                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
+
+                Wherever an image token, `<image>` is encountered it is expanded to
+                `<fake_token_around_image>` + `<image>` * `image_seq_len` + `<fake_token_around_image>`.
+            return_tensors (`Union[str, TensorType]`, *optional*):
+                If set, will return tensors of a particular framework. See [`PreTrainedTokenizerFast.__call__`] for more
+                information.
+
+        """
+        if text is None and images is None:
+            raise ValueError("You must provide either `text` or `images`.")
+
+        output_kwargs = self._merge_kwargs(
+            Idefics2ProcessorKwargs,
+            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
+            **kwargs,
+        )
+        image_seq_len = output_kwargs["images_kwargs"].pop("image_seq_len", None)
+        image_seq_len = image_seq_len if image_seq_len is not None else self.image_seq_len
+        return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
+
+        n_images_in_text = []
+        inputs = {}
+
+        if text is not None:
+            if isinstance(text, str):
+                text = [text]
+            elif not (isinstance(text, (list, tuple)) and all(isinstance(sample, str) for sample in text)):
+                raise ValueError("Invalid input text. Please provide a string, or a list of strings")
+
+            # Replace the image token with fake tokens around the expanded image token sequence of length `image_seq_len`
+            fake_image_token = self.fake_image_token
+            image_token = self.image_token
+            image_str = f"{fake_image_token}{image_token * image_seq_len}{fake_image_token}"
+
+            if self.image_processor.do_image_splitting:
+                # A single image token is split into 4 patches + 1 original image
+                image_str = image_str * 5
+
+            prompt_strings = []
+            for sample in text:
+                n_images_in_text.append(sample.count(image_token))
+                sample = sample.replace(image_token, image_str)
+                # Remove any double fake tokens if images are adjacent
+                sample = sample.replace(f"{fake_image_token}{fake_image_token}", f"{fake_image_token}")
+                prompt_strings.append(sample)
+
+            text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"])
+            self._check_special_mm_tokens(prompt_strings, text_inputs, modalities=["image"])
+            inputs.update(text_inputs)
+
+        if images is not None:
+            if is_image_or_image_url(images):
+                images = [[images]]
+            elif isinstance(images, (list, tuple)) and is_image_or_image_url(images[0]):
+                if text is not None:
+                    if sum(n_images_in_text) != len(images):
+                        raise ValueError(
+                            f"The total number of {image_token} tokens in the prompts should be the same as the number of images passed."
+                            f" Found {sum(n_images_in_text)} {image_token} tokens and {len(images)} images."
+                        )
+                    # Reorganize the images to match the prompts
+                    cumsum_images_in_text = [0] + list(accumulate(n_images_in_text))
+                    images = [
+                        images[cumsum_images_in_text[i] : cumsum_images_in_text[i + 1]]
+                        for i in range(len(n_images_in_text))
+                    ]
+                else:
+                    images = [images]
+
+            elif not (
+                isinstance(images, (list, tuple))
+                # every sample must itself be a (possibly empty) list/tuple; its items are resolved by `load_image`
+                and all(isinstance(sample, (list, tuple)) for sample in images)
+            ):
+                raise ValueError(
+                    "Invalid input images. Please provide a single image or a list of images or a list of list of images."
+                )
+
+            n_images_in_images = [len(sample) for sample in images]
+            if text is not None and n_images_in_images != n_images_in_text:
+                raise ValueError(
+                    f"The number of images in the text {n_images_in_text} and images  {n_images_in_images} should be the same."
+                )
+
+            # Load images if they are URLs
+            images = [[load_image(im) for im in sample] for sample in images]
+            image_inputs = self.image_processor(images, **output_kwargs["images_kwargs"])
+            inputs.update(image_inputs)
+
+        return BatchFeature(inputs, tensor_type=return_tensors)
+
+
+__all__ = ["Idefics2Processor"]
diff --git a/phivenv/Lib/site-packages/transformers/models/idefics3/__init__.py b/phivenv/Lib/site-packages/transformers/models/idefics3/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e1dd3bfda7fbce5af140b438ec00f3ff51718ed5
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/idefics3/__init__.py
@@ -0,0 +1,30 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:  # False at runtime: these star-imports are only seen by static type checkers
+    from .configuration_idefics3 import *
+    from .image_processing_idefics3 import *
+    from .image_processing_idefics3_fast import *
+    from .modeling_idefics3 import *
+    from .processing_idefics3 import *
+else:  # at runtime, replace this module's entry in `sys.modules` with a `_LazyModule`
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/phivenv/Lib/site-packages/transformers/models/idefics3/__pycache__/__init__.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/idefics3/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..89a6047bd17d16ef72c96859a9aee99b3a1b4993
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/idefics3/__pycache__/__init__.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/idefics3/__pycache__/configuration_idefics3.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/idefics3/__pycache__/configuration_idefics3.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d79d480b15cf85ad64c5a119291a6da3cc9a1825
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/idefics3/__pycache__/configuration_idefics3.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/idefics3/__pycache__/image_processing_idefics3.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/idefics3/__pycache__/image_processing_idefics3.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..efba4dde11b4952fb193a558c38392794e3963f3
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/idefics3/__pycache__/image_processing_idefics3.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/idefics3/__pycache__/image_processing_idefics3_fast.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/idefics3/__pycache__/image_processing_idefics3_fast.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..79c88fe92113aef19948d23ec487348aa530f512
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/idefics3/__pycache__/image_processing_idefics3_fast.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/idefics3/__pycache__/modeling_idefics3.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/idefics3/__pycache__/modeling_idefics3.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..dcc1df650d9036e2fd2f9b1eee7e1da8711baf0b
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/idefics3/__pycache__/modeling_idefics3.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/idefics3/__pycache__/processing_idefics3.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/idefics3/__pycache__/processing_idefics3.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..88af01554b50dd7a29aaafb45116f58debd6acd9
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/idefics3/__pycache__/processing_idefics3.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/idefics3/configuration_idefics3.py b/phivenv/Lib/site-packages/transformers/models/idefics3/configuration_idefics3.py
new file mode 100644
index 0000000000000000000000000000000000000000..97a2e57f1d8dc4cc2b8c27ebade7ef1a5109f97f
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/idefics3/configuration_idefics3.py
@@ -0,0 +1,190 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Idefics3 model configuration"""
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+from ..auto import CONFIG_MAPPING, AutoConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+class Idefics3VisionConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Idefics3VisionModel`]. It is used to instantiate a
+    Idefics3 vision encoder according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of the SigLIP checkpoint
+    [google/siglip-base-patch16-224](https://huggingface.co/google/siglip-base-patch16-224) used in the Idefics3 model
+    [HuggingFaceM4/Idefics3-8B-Llama3](https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3).
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        hidden_size (`int`, *optional*, defaults to 1152):
+            Dimensionality of the encoder layers and the pooler layer.
+        intermediate_size (`int`, *optional*, defaults to 3072):
+            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+        num_hidden_layers (`int`, *optional*, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        num_channels (`int`, *optional*, defaults to 3):
+            Number of channels in the input images.
+        image_size (`int`, *optional*, defaults to 224):
+            The size (resolution) of each image.
+        patch_size (`int`, *optional*, defaults to 32):
+            The size (resolution) of each patch.
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"selu"`, `"gelu_new"` and `"quick_gelu"` are supported.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-06):
+            The epsilon used by the layer normalization layers.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+
+    Example:
+
+    ```python
+    >>> from transformers.models.idefics3.modeling_idefics3 import Idefics3VisionTransformer
+    >>> from transformers.models.idefics3.configuration_idefics3 import Idefics3VisionConfig
+
+    >>> # Initializing a Idefics3VisionConfig with google/siglip-base-patch16-224 style configuration
+    >>> configuration = Idefics3VisionConfig()
+
+    >>> # Initializing a Idefics3VisionTransformer (with random weights) from the google/siglip-base-patch16-224 style configuration
+    >>> model = Idefics3VisionTransformer(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "idefics3_vision"
+    base_config_key = "vision_config"  # NOTE(review): presumably the key of this sub-config in the parent config -- confirm
+
+    def __init__(
+        self,
+        hidden_size=1152,
+        intermediate_size=3072,
+        num_hidden_layers=12,
+        num_attention_heads=16,
+        num_channels=3,
+        image_size=224,
+        patch_size=32,
+        hidden_act="gelu_pytorch_tanh",
+        layer_norm_eps=1e-6,
+        attention_dropout=0.0,
+        initializer_range=0.02,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)  # remaining keyword arguments are handled by `PretrainedConfig`
+
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.num_channels = num_channels
+        self.patch_size = patch_size
+        self.image_size = image_size
+        self.attention_dropout = attention_dropout
+        self.layer_norm_eps = layer_norm_eps
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+
+
+class Idefics3Config(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Idefics3Model`]. It is used to instantiate a
+    Idefics3 model according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of the model of the Idefics3
+    [HuggingFaceM4/Idefics3-8B-Llama3](https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should cache the key/value pairs of the attention mechanism. Only
+            relevant if `config.is_decoder=True`.
+        image_token_id (`int`, *optional*, defaults to 128257):
+            The id of the "image" token.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether or not to tie the word embeddings with the token embeddings.
+        vision_config (`Idefics3VisionConfig` or `dict`, *optional*, defaults to `Idefics3VisionConfig`):
+            Custom vision config or dict for the vision tower. Any other type raises a `TypeError`.
+        text_config (`PretrainedConfig` or `dict`, *optional*, defaults to `LlamaConfig`):
+            Custom text config or dict for the text model. A dict without `model_type` is treated as `"llama"`.
+        scale_factor (`int`, *optional*, defaults to 2):
+            The scale factor for the image encoder.
+        pad_token_id (`int`, *optional*, defaults to 128002):
+            The id of the padding token.
+
+    Example:
+    ```python
+    >>> from transformers import Idefics3Model, Idefics3Config
+    >>> # Initializing configuration
+    >>> configuration = Idefics3Config()
+    >>> # Initializing a model from the configuration
+    >>> model = Idefics3Model(configuration)
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "idefics3"
+    sub_configs = {"text_config": AutoConfig, "vision_config": Idefics3VisionConfig}
+
+    def __init__(
+        self,
+        use_cache=True,
+        image_token_id=128257,
+        tie_word_embeddings=False,
+        vision_config=None,
+        text_config=None,
+        scale_factor=2,
+        pad_token_id=128_002,
+        **kwargs,
+    ):
+        self.image_token_id = image_token_id
+        self.use_cache = use_cache
+        self.tie_word_embeddings = tie_word_embeddings
+
+        if vision_config is None:
+            logger.info("vision_config is None, using default vision config")
+            vision_config = Idefics3VisionConfig()
+        elif isinstance(vision_config, dict):
+            vision_config = Idefics3VisionConfig(**vision_config)
+        elif not isinstance(vision_config, Idefics3VisionConfig):
+            raise TypeError(f"Unsupported `vision_config` type: {type(vision_config).__name__}")
+        self.vision_config = vision_config
+
+        if isinstance(text_config, dict):
+            # Work on a copy so the caller's dict is not mutated when the default model type is filled in
+            text_config = {"model_type": "llama", **text_config}
+            text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
+        elif text_config is None:
+            logger.info("text_config is None, using default text config")
+            text_config = CONFIG_MAPPING["llama"](
+                rms_norm_eps=1e-5, pad_token_id=pad_token_id, tie_word_embeddings=False
+            )
+
+        self.text_config = text_config
+        self.scale_factor = scale_factor
+
+        super().__init__(**kwargs, pad_token_id=pad_token_id, tie_word_embeddings=tie_word_embeddings)
+
+
+__all__ = ["Idefics3Config", "Idefics3VisionConfig"]
diff --git a/phivenv/Lib/site-packages/transformers/models/idefics3/image_processing_idefics3.py b/phivenv/Lib/site-packages/transformers/models/idefics3/image_processing_idefics3.py
new file mode 100644
index 0000000000000000000000000000000000000000..feae13ab2e45f94de33aa5bcf6950f1fe73b3235
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/idefics3/image_processing_idefics3.py
@@ -0,0 +1,899 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+from collections.abc import Iterable
+from typing import Any, Optional, Union
+
+import numpy as np
+
+from ...image_processing_utils import BaseImageProcessor, BatchFeature
+from ...image_transforms import PaddingMode, pad, to_channel_dimension_format, to_pil_image
+from ...image_utils import (
+    IMAGENET_STANDARD_MEAN,
+    IMAGENET_STANDARD_STD,
+    ChannelDimension,
+    ImageInput,
+    PILImageResampling,
+    get_image_size,
+    infer_channel_dimension_format,
+    is_scaled_image,
+    make_nested_list_of_images,
+    to_numpy_array,
+    valid_images,
+    validate_preprocess_arguments,
+)
+from ...utils import TensorType, is_vision_available, logging
+
+
+# Module-level logger obtained from the library's logging utilities.
+logger = logging.get_logger(__name__)
+# Upper bound that `get_resize_output_image_size` applies to the longest side of a resized image.
+MAX_IMAGE_SIZE = 4096  # 4k resolution as absolute maximum
+
+
+# PIL is imported only when `is_vision_available()` returns a truthy value.
+if is_vision_available():
+    import PIL
+    from PIL import Image
+
+
+def _resize_output_size_rescale_to_max_len(
+    height: int, width: int, min_len: Optional[int] = 1, max_len: Optional[int] = None
+) -> tuple[int, int]:
+    """
+    Compute the `(height, width)` obtained by rescaling the longest side to `max_len` while keeping the aspect ratio.
+
+    The recomputed shorter side is truncated to an integer and bumped up to the next even number when odd. Both
+    sides are finally clamped from below to `min_len`.
+    Args:
+        height (`int`):
+            Height of the input image.
+        width (`int`):
+            Width of the input image.
+        min_len (`int`, *optional*, defaults to 1):
+            Lower bound applied to both output sides.
+        max_len (`int`, *optional*, defaults to the longest side of the input):
+            Length the longest side is rescaled to.
+    Returns:
+        `tuple[int, int]`: The `(height, width)` of the image after resizing.
+    """
+    target_len = max_len if max_len is not None else max(height, width)
+    ratio = width / height
+
+    if height > width:
+        # Portrait: the height is the longest side.
+        new_height = target_len
+        new_width = int(new_height * ratio)
+        new_width += new_width % 2  # round an odd value up to the next even one
+    else:
+        # Landscape or square: the width is the longest side.
+        new_width = target_len
+        new_height = int(new_width / ratio)
+        new_height += new_height % 2  # round an odd value up to the next even one
+
+    # Never go below the requested minimum side length.
+    return max(new_height, min_len), max(new_width, min_len)
+
+
+def _resize_output_size_scale_below_upper_bound(
+    height: int, width: int, max_len: Optional[int] = None
+) -> tuple[int, int]:
+    """
+    Compute the `(height, width)` obtained by shrinking an image so that its longest side does not exceed `max_len`.
+
+    The size is returned unchanged (apart from the lower clamp to 1) when the longest side is already within the
+    bound. Otherwise the longest side becomes `max_len` and the other side is rescaled with the same aspect ratio
+    and truncated to an integer.
+    Args:
+        height (`int`):
+            Height of the input image.
+        width (`int`):
+            Width of the input image.
+        max_len (`int`, *optional*, defaults to the longest side of the input):
+            Upper bound for the longest side of the output. This is a plain integer, not a dictionary.
+    Returns:
+        `tuple[int, int]`: The `(height, width)` of the image after resizing.
+    """
+    if max_len is None:
+        max_len = max(height, width)
+
+    aspect_ratio = width / height
+    if width >= height:
+        if width > max_len:
+            width = max_len
+            height = int(width / aspect_ratio)
+    elif height > max_len:
+        height = max_len
+        width = int(height * aspect_ratio)
+
+    # Avoid resizing to a size smaller than 1
+    return max(height, 1), max(width, 1)
+
+
+def get_resize_output_image_size(
+    image,
+    resolution_max_side: int,
+    input_data_format: Optional[Union[str, ChannelDimension]] = None,
+) -> tuple[int, int]:
+    """
+    Compute the target `(height, width)` for resizing `image` so that its longest side is `resolution_max_side`.
+
+    The result is additionally capped so that the longest side does not exceed `MAX_IMAGE_SIZE`.
+    Args:
+        image (`np.ndarray`):
+            Image to resize.
+        resolution_max_side (`int`):
+            The longest edge of the image will be resized to this value. The shortest edge will be resized to keep the
+            input aspect ratio.
+        input_data_format (`ChannelDimension` or `str`):
+            The channel dimension format of the input image.
+    Returns:
+        The output size of the image after resizing.
+    """
+    original_height, original_width = get_image_size(image, channel_dim=input_data_format)
+
+    # First rescale the longest edge to the requested length, keeping the aspect ratio ...
+    rescaled_height, rescaled_width = _resize_output_size_rescale_to_max_len(
+        original_height, original_width, max_len=resolution_max_side
+    )
+    # ... then make sure the result stays below the absolute maximum size.
+    capped_height, capped_width = _resize_output_size_scale_below_upper_bound(
+        rescaled_height, rescaled_width, max_len=MAX_IMAGE_SIZE
+    )
+    return capped_height, capped_width
+
+
+# Copied from transformers.models.detr.image_processing_detr.max_across_indices
+def max_across_indices(values: Iterable[Any]) -> list[Any]:
+    """
+    Compute the position-wise maximum over an iterable of equally indexed sequences.
+
+    The inner sequences are zipped together, so the result has as many entries as the shortest of them.
+    """
+    columns = zip(*values)
+    return list(map(max, columns))
+
+
+def get_max_height_width(
+    images_list: list[list[np.ndarray]], input_data_format: Optional[Union[str, ChannelDimension]] = None
+) -> tuple[int, int]:
+    """
+    Get the maximum height and width across all images in a batch.
+    Args:
+        images_list (`list[list[np.ndarray]]`):
+            Nested list of images; the outer list is iterated first and each inner list holds the images to measure.
+        input_data_format (`ChannelDimension` or `str`, *optional*):
+            The channel dimension format of the images. If not provided, it is inferred from the first image of the
+            first inner list.
+    Returns:
+        `tuple[int, int]`: The `(max_height, max_width)` pair. If no image is visited, both entries keep their
+        initial value of `float("-inf")`.
+    """
+    if input_data_format is None:
+        input_data_format = infer_channel_dimension_format(images_list[0][0], num_channels=(1, 3, 4))
+
+    max_height = max_width = float("-inf")
+    for images in images_list:
+        for image in images:
+            height, width = get_image_size(image, channel_dim=input_data_format)
+            max_height = max(height, max_height)
+            max_width = max(width, max_width)
+    return (max_height, max_width)
+
+
+# Copied from transformers.models.detr.image_processing_detr.make_pixel_mask
+def make_pixel_mask(
+    image: np.ndarray, output_size: tuple[int, int], input_data_format: Optional[Union[str, ChannelDimension]] = None
+) -> np.ndarray:
+    """
+    Build an `int64` mask of shape `output_size` that is 1 over the image area and 0 over the padding.
+
+    The image is assumed to sit in the top-left corner of the padded canvas.
+    Args:
+        image (`np.ndarray`):
+            Image to make the pixel mask for.
+        output_size (`tuple[int, int]`):
+            Output size of the mask.
+        input_data_format (`ChannelDimension` or `str`, *optional*):
+            The channel dimension format used to read the image's height and width.
+    """
+    valid_height, valid_width = get_image_size(image, channel_dim=input_data_format)
+    pixel_mask = np.zeros(output_size, dtype=np.int64)
+    # Mark the region covered by the original image as valid.
+    pixel_mask[:valid_height, :valid_width] = 1
+    return pixel_mask
+
+
+def convert_to_rgb(
+    image: np.ndarray,
+    palette: Optional[PIL.ImagePalette.ImagePalette] = None,
+    data_format: Optional[Union[str, ChannelDimension]] = None,
+    input_data_format: Optional[Union[str, ChannelDimension]] = None,
+) -> ImageInput:
+    """
+    Convert an image to RGB by compositing it over a white background.
+    Args:
+        image (`np.ndarray`):
+            The image to convert.
+        palette (list[int], *optional*):
+            Palette to attach to the image; when given, the image is opened in PIL mode `"P"`.
+        data_format (ChannelDimension or str, *optional*):
+            The channel dimension format for the output image. If not provided, it will be the same as the input image.
+        input_data_format (ChannelDimension or str, *optional*):
+            The channel dimension format of the input image. If not provided, it is inferred from the image.
+    Returns:
+        The RGB image as an array in `data_format`.
+    """
+    if input_data_format is None:
+        input_data_format = infer_channel_dimension_format(image, num_channels=(1, 3, 4))
+
+    # Unless told otherwise, hand the result back in the same layout as the input.
+    if data_format is None:
+        data_format = input_data_format
+
+    pil_mode = "P" if palette is not None else None
+    pil_image = to_pil_image(image, image_mode=pil_mode, input_data_format=input_data_format)
+    if palette is not None and pil_image.mode == "P":
+        pil_image.putpalette(palette)
+
+    # Flatten any transparency onto a white canvas before dropping the alpha channel.
+    rgba_image = pil_image.convert("RGBA")
+    white_canvas = Image.new("RGBA", rgba_image.size, (255, 255, 255))
+    rgb_image = Image.alpha_composite(white_canvas, rgba_image).convert("RGB")
+
+    # Arrays created from PIL images are always channels-last.
+    rgb_array = np.array(rgb_image)
+    return to_channel_dimension_format(rgb_array, data_format, input_channel_dim=ChannelDimension.LAST)
+
+
+# FIXME Amy: make a more general crop function that isn't just centre crop
+def _crop(
+    image: np.ndarray,
+    w1: int,
+    h1: int,
+    w2: int,
+    h2: int,
+    data_format: Optional[Union[str, ChannelDimension]] = None,
+) -> np.ndarray:
+    """
+    Slice the rectangle spanning rows `h1:h2` and columns `w1:w2` out of `image`.
+
+    `data_format` is the channel layout of `image`; it is inferred when not given. A `ValueError` is raised for any
+    layout other than channels-first or channels-last.
+    """
+    if data_format is None:
+        data_format = infer_channel_dimension_format(image, num_channels=(1, 3, 4))
+
+    rows, cols = slice(h1, h2), slice(w1, w2)
+    if data_format == ChannelDimension.FIRST:
+        return image[:, rows, cols]
+    if data_format == ChannelDimension.LAST:
+        return image[rows, cols, :]
+    raise ValueError("Invalid channel dimension format.")
+
+
+class Idefics3ImageProcessor(BaseImageProcessor):
+    r"""
+    Constructs a Idefics3 image processor.
+    Args:
+        do_convert_rgb (`bool`, *optional*, defaults to `True`):
+            Whether to convert the image to RGB. This is useful if the input image is of a different format e.g. RGBA.
+            Only has an effect if the input image is in the PIL format.
+        do_resize (`bool`, *optional*, defaults to `True`):
+            Whether to resize the image. The longest edge of the image is resized to  be <= `size["longest_edge"]`, with the
+            shortest edge resized to keep the input aspect ratio.
+        size (`Dict`, *optional*, defaults to `{"longest_edge": 4 * 364}`):
+            Controls the size of the output image. This is a dictionary containing the key "longest_edge".
+            The image will be resized such that the longest edge is <= `size["longest_edge"]` and the shortest edge is resized
+            to keep the input aspect ratio.
+        resample (`Resampling`, *optional*, defaults to `Resampling.LANCZOS`):
+            Resampling filter to use when resizing the image.
+        do_image_splitting (`bool`, *optional*, defaults to `True`):
+            Whether to split the image into sub-images concatenated with the original image. They are split into patches
+            such that each patch has a size of `max_image_size["height"]` x `max_image_size["width"]`.
+        max_image_size (`Dict`, *optional*, defaults to `{"longest_edge": 364}`):
+            Maximum resolution of the patches of images accepted by the model. This is a dictionary containing the key "longest_edge".
+        do_rescale (`bool`, *optional*, defaults to `True`):
+            Whether to rescale the image. If set to `True`, the image is rescaled to have pixel values between 0 and 1.
+        rescale_factor (`float`, *optional*, defaults to `1/255`):
+            Rescale factor to rescale the image by if `do_rescale` is set to `True`.
+        do_normalize (`bool`, *optional*, defaults to `True`):
+            Whether to normalize the image. If set to `True`, the image is normalized to have a mean of `image_mean` and
+            a standard deviation of `image_std`.
+        image_mean (`float` or `list[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`):
+            Mean to use if normalizing the image. This is a float or list of floats the length of the number of
+            channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
+        image_std (`float` or `list[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`):
+            Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
+            number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
+        do_pad (`bool`, *optional*, defaults to `True`):
+            Whether or not to pad the images to the largest height and width in the batch and number of images per
+            sample in the batch, such that the returned tensor is of shape (batch_size, max_num_images, num_channels, max_height, max_width).
+    """
+
+    model_input_names = ["pixel_values", "pixel_attention_mask"]
+
+    def __init__(
+        self,
+        do_convert_rgb: bool = True,
+        do_resize: bool = True,
+        size: Optional[dict[str, int]] = None,
+        resample: PILImageResampling = PILImageResampling.LANCZOS,
+        do_image_splitting: bool = True,
+        max_image_size: Optional[dict[str, int]] = None,
+        do_rescale: bool = True,
+        rescale_factor: float = 1 / 255,
+        do_normalize: bool = True,
+        image_mean: Optional[Union[float, list[float]]] = None,
+        image_std: Optional[Union[float, list[float]]] = None,
+        do_pad: bool = True,
+        **kwargs,
+    ) -> None:
+        """
+        Store the preprocessing options on the instance, substituting defaults for the arguments left as `None`.
+
+        Extra keyword arguments are forwarded to the parent initializer.
+        """
+        super().__init__(**kwargs)
+
+        # Resolve the `None` placeholders first; fresh objects are created here so defaults are never shared.
+        if size is None:
+            size = {"longest_edge": 4 * 364}
+        if max_image_size is None:
+            max_image_size = {"longest_edge": 364}
+        if image_mean is None:
+            image_mean = IMAGENET_STANDARD_MEAN
+        if image_std is None:
+            image_std = IMAGENET_STANDARD_STD
+
+        self.do_convert_rgb = do_convert_rgb
+        self.do_resize = do_resize
+        self.size = size
+        self.resample = resample
+        self.do_image_splitting = do_image_splitting
+        self.max_image_size = max_image_size
+        self.do_rescale = do_rescale
+        self.rescale_factor = rescale_factor
+        self.do_normalize = do_normalize
+        self.image_mean = image_mean
+        self.image_std = image_std
+        self.do_pad = do_pad
+
+    def resize(
+        self,
+        image: np.ndarray,
+        size: dict[str, int],
+        resample: PILImageResampling = PILImageResampling.LANCZOS,
+        data_format: Optional[Union[str, ChannelDimension]] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+        **kwargs,
+    ) -> np.ndarray:
+        """
+        Resize an image through PIL. With `size["longest_edge"]` the target size is derived from the image so that
+        the aspect ratio is kept; with `size["height"]` and `size["width"]` the image is resized to exactly that size.
+        Args:
+            image (`np.ndarray`):
+                Image to resize.
+            size (`dict[str, int]`):
+                Size of the output image. Must contain either `"longest_edge"` or both `"height"` and `"width"`.
+            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.LANCZOS`):
+                Resampling filter to use when resizing the image.
+            data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format of the output image. If not provided, it will be the same as the input image.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format of the input image. If not provided, it will be inferred.
+        Raises:
+            ValueError: If `size` contains neither `"longest_edge"` nor both `"height"` and `"width"`.
+        """
+        if input_data_format is None:
+            input_data_format = infer_channel_dimension_format(image, num_channels=(1, 3, 4))
+
+        # Unless told otherwise, hand the result back in the same layout as the input.
+        if data_format is None:
+            data_format = input_data_format
+
+        if "longest_edge" in size:
+            target_height, target_width = get_resize_output_image_size(
+                image, resolution_max_side=size["longest_edge"], input_data_format=input_data_format
+            )
+        elif "height" in size and "width" in size:
+            target_height, target_width = size["height"], size["width"]
+        else:
+            raise ValueError("size must be a dictionary with key 'longest_edge' or 'height' and 'width'.")
+
+        # 2D arrays and arrays whose last axis has length 1 are opened in PIL mode "P".
+        single_channel = image.ndim == 2 or image.shape[-1] == 1
+        pil_image = to_pil_image(
+            image, image_mode="P" if single_channel else None, input_data_format=input_data_format
+        )
+
+        # PIL expects (width, height).
+        resized = np.array(pil_image.resize((target_width, target_height), resample=resample))
+
+        # A channel axis of size 1 is dropped by the PIL round trip, so restore it when the result is 2D.
+        if resized.ndim == 2:
+            resized = np.expand_dims(resized, axis=-1)
+
+        # Arrays created from PIL images are always channels-last.
+        return to_channel_dimension_format(resized, data_format, input_channel_dim=ChannelDimension.LAST)
+
+    def split_image(
+        self,
+        image,
+        max_image_size: dict[str, int],
+        resample: PILImageResampling = PILImageResampling.LANCZOS,
+        data_format: Optional[Union[str, ChannelDimension]] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    ):
+        """
+        Split an image into a grid of crops followed by a downsized copy of the whole image.
+        A single image therefore becomes a sequence of images, which lets the model spend more compute on each image
+        without any change to the vision encoder.
+        1) If neither side exceeds `max_image_size["longest_edge"]`, the image is returned alone with 0 splits.
+        2) Otherwise it is divided into `ceil(height / max_side)` x `ceil(width / max_side)` crops of (nearly) equal
+        size, collected row by row.
+        3) The full image, resized to `max_side` x `max_side`, is appended after the crops.
+        Args:
+            image (`np.ndarray`):
+                Image to split.
+            max_image_size (`dict[str, int]`):
+                Dictionary whose `"longest_edge"` entry gives the maximum side of a crop and the side of the resized
+                global image.
+            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.LANCZOS`):
+                Resampling filter to use when resizing the image.
+            data_format (`ChannelDimension` or `str`, *optional*):
+                Channel layout handed to the cropping helper and, as input layout, to `resize`. Inferred by those
+                callees when not provided.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format used to read the height and width of the input image.
+        Returns:
+            A tuple `(frames, num_splits_h, num_splits_w)` where `frames` is the list of crops followed by the global
+            image, and the two counts are the number of rows and columns of the grid (both 0 when nothing was split).
+        """
+        height, width = get_image_size(image, channel_dim=input_data_format)
+        max_side = max_image_size["longest_edge"]
+
+        # Small enough already: the image is its own (single) frame.
+        if not (height > max_side or width > max_side):
+            return [image], 0, 0
+
+        # Grid dimensions and the crop size that spreads the image evenly over the grid.
+        num_splits_h = math.ceil(height / max_side)
+        num_splits_w = math.ceil(width / max_side)
+        crop_height = math.ceil(height / num_splits_h)
+        crop_width = math.ceil(width / num_splits_w)
+
+        # Row-major walk over the grid; the last crop in each direction is clipped to the image border.
+        frames = [
+            _crop(
+                image,
+                left,
+                top,
+                min(left + crop_width, width),
+                min(top + crop_height, height),
+                data_format=data_format,
+            )
+            for top in (row * crop_height for row in range(num_splits_h))
+            for left in (col * crop_width for col in range(num_splits_w))
+        ]
+
+        # The global image goes last, shrunk to the maximum side for cpu memory efficiency.
+        if height != max_side or width != max_side:
+            image = self.resize(
+                image,
+                {"height": max_side, "width": max_side},
+                resample=resample,
+                input_data_format=data_format,
+            )
+        frames.append(image)
+
+        return frames, num_splits_h, num_splits_w
+
+    def resize_for_vision_encoder(
+        self,
+        image: np.ndarray,
+        vision_encoder_max_size: int,
+        resample: PILImageResampling = PILImageResampling.LANCZOS,
+        data_format: Optional[Union[str, ChannelDimension]] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    ):
+        """
+        Resize an image so that both sides are multiples of `vision_encoder_max_size`.
+
+        The longest side is rounded up to the next multiple first; the other side is then recomputed from the
+        original aspect ratio and rounded up to a multiple as well.
+        Args:
+            image (`np.ndarray`):
+                Image to resize.
+            vision_encoder_max_size (`int`):
+                Both output sides are multiples of this value.
+            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.LANCZOS`):
+                Resampling filter to use when resizing the image.
+            data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format of the output image. If not provided, it will be the same as the input image.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format of the input image. If not provided, it will be inferred
+        Returns:
+            The resized image, as returned by `resize`.
+        """
+
+        def round_up_to_multiple(length):
+            return math.ceil(length / vision_encoder_max_size) * vision_encoder_max_size
+
+        height, width = get_image_size(image, channel_dim=input_data_format)
+        aspect_ratio = width / height
+
+        if height > width:
+            height = round_up_to_multiple(height)
+            width = round_up_to_multiple(int(height * aspect_ratio))
+        else:
+            width = round_up_to_multiple(width)
+            height = round_up_to_multiple(int(width / aspect_ratio))
+
+        return self.resize(
+            image,
+            size={"height": height, "width": width},
+            resample=resample,
+            input_data_format=input_data_format,
+            data_format=data_format,
+        )
+
+    def _pad_image(
+        self,
+        image: np.ndarray,
+        output_size: tuple[int, int],
+        constant_values: Union[float, Iterable[float]] = 0,
+        data_format: Optional[ChannelDimension] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    ) -> np.ndarray:
+        """
+        Pad an image on its bottom and right edges with `constant_values` (zeros by default) up to `output_size`,
+        given as `(height, width)`.
+        """
+        current_height, current_width = get_image_size(image, channel_dim=input_data_format)
+        target_height, target_width = output_size
+
+        # Nothing is added on the top/left; the whole difference goes to the bottom/right.
+        bottom_right_padding = (
+            (0, target_height - current_height),
+            (0, target_width - current_width),
+        )
+        return pad(
+            image,
+            bottom_right_padding,
+            mode=PaddingMode.CONSTANT,
+            constant_values=constant_values,
+            data_format=data_format,
+            input_data_format=input_data_format,
+        )
+
+    def pad(
+        self,
+        images: list[list[np.ndarray]],
+        constant_values: Union[float, Iterable[float]] = 0,
+        return_pixel_mask: bool = True,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        data_format: Optional[ChannelDimension] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    ) -> tuple[list[list[np.ndarray]], Optional[list[list[np.ndarray]]]]:
+        """
+        Pad every image on the bottom and right to the largest height and width found in the batch, and pad every
+        sample with all-zero images up to the largest number of images per sample. Optionally returns pixel masks.
+        Args:
+            images (`list[list[np.ndarray]]`):
+                List of list of images to pad: one inner list per sample. Pads to the largest height and width in
+                the batch.
+            constant_values (`float` or `Iterable[float]`, *optional*):
+                The value used to fill the padded area of each image.
+            return_pixel_mask (`bool`, *optional*, defaults to `True`):
+                Whether to return a pixel mask. When `False`, the second returned element is `None`.
+            return_tensors (`str` or `TensorType`, *optional*):
+                Not used by this method; the padded images are always returned as nested lists of `np.ndarray`.
+            data_format (`str` or `ChannelDimension`, *optional*):
+                The channel dimension format of the image. If not provided, it will be the same as the input image.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format of the input image. If not provided, it will be inferred.
+        Returns:
+            A tuple `(padded_images_list, padded_masks)`. Both are nested lists with one inner list per sample, each
+            of length `max_num_images`. `padded_masks` holds `int64` masks (1 for image pixels, 0 for padding) or is
+            `None` when `return_pixel_mask` is `False`.
+        Raises:
+            ValueError: If the input channel dimension format is neither channels-first nor channels-last.
+        """
+        pad_size = get_max_height_width(images, input_data_format=input_data_format)
+
+        batch_size = len(images)
+        max_num_images = max(len(images_) for images_ in images)
+        input_data_format = (
+            infer_channel_dimension_format(images[0][0], num_channels=(1, 3, 4))
+            if input_data_format is None
+            else input_data_format
+        )
+        data_format = input_data_format if data_format is None else data_format
+
+        # The channel count is read from the first image of the first sample.
+        if input_data_format == ChannelDimension.FIRST:
+            n_channels = images[0][0].shape[0]
+        elif input_data_format == ChannelDimension.LAST:
+            n_channels = images[0][0].shape[-1]
+        else:
+            raise ValueError("Invalid channel dimension format.")
+
+        # Builds an all-zero uint8 placeholder image; returns None for any other layout value.
+        def empty_image(size, input_data_format):
+            if input_data_format == ChannelDimension.FIRST:
+                return np.zeros((n_channels, *size), dtype=np.uint8)
+            elif input_data_format == ChannelDimension.LAST:
+                return np.zeros((*size, n_channels), dtype=np.uint8)
+
+        # Pre-fill every slot with placeholders laid out in the *output* format (`data_format`), so samples with
+        # fewer images than `max_num_images` end up padded with empty images and all-zero masks.
+        padded_images_list = [
+            [empty_image(pad_size, data_format) for _ in range(max_num_images)] for _ in range(batch_size)
+        ]
+        padded_masks = [[np.zeros(pad_size, dtype=np.int64) for _ in range(max_num_images)] for _ in range(batch_size)]
+
+        # Overwrite the placeholders with the actual padded images and their masks.
+        for batch_idx in range(batch_size):
+            for sample_idx, image in enumerate(images[batch_idx]):
+                padded_images_list[batch_idx][sample_idx] = self._pad_image(
+                    image,
+                    pad_size,
+                    constant_values=constant_values,
+                    data_format=data_format,
+                    input_data_format=input_data_format,
+                )
+                padded_masks[batch_idx][sample_idx] = make_pixel_mask(
+                    image, output_size=pad_size, input_data_format=input_data_format
+                )
+
+        padded_masks = padded_masks if return_pixel_mask else None
+        return padded_images_list, padded_masks
+
+    def preprocess(
+        self,
+        images: ImageInput,
+        do_convert_rgb: Optional[bool] = None,
+        do_resize: Optional[bool] = None,
+        size: Optional[dict[str, int]] = None,
+        resample: PILImageResampling = None,
+        do_image_splitting: Optional[bool] = None,
+        do_rescale: Optional[bool] = None,
+        max_image_size: Optional[dict[str, int]] = None,
+        rescale_factor: Optional[float] = None,
+        do_normalize: Optional[bool] = None,
+        image_mean: Optional[Union[float, list[float]]] = None,
+        image_std: Optional[Union[float, list[float]]] = None,
+        do_pad: Optional[bool] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        return_row_col_info: bool = False,
+        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    ):
+        """
+        Preprocess a batch of images.
+        Args:
+            images (`ImageInput`):
+                A list of images to preprocess.
+            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
+                Whether to convert the image to RGB.
+            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
+                Whether to resize the image.
+            size (`dict[str, int]`, *optional*, defaults to `self.size`):
+                Size of the image after resizing. With the longest edge resized to keep the input aspect ratio.
+            resample (`int`, *optional*, defaults to `self.resample`):
+                Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only
+                has an effect if `do_resize` is set to `True`.
+            do_image_splitting (`bool`, *optional*, defaults to `self.do_image_splitting`):
+                Whether to split the image into sub-images concatenated with the original image. They are split into patches
+                such that each patch has a size of `max_image_size["height"]` x `max_image_size["width"]`.
+            max_image_size (`Dict`, *optional*, defaults to `self.max_image_size`):
+                Maximum resolution of the images. If the image is larger than this size, the image is split into patches.
+            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
+                Whether to rescale the image.
+            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
+                Rescale factor to rescale the image by if `do_rescale` is set to `True`.
+            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
+                Whether to normalize the image.
+            image_mean (`float` or `list[float]`, *optional*, defaults to `self.image_mean`):
+                Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.
+            image_std (`float` or `list[float]`, *optional*, defaults to `self.image_std`):
+                Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to
+                `True`.
+            do_pad (`bool`, *optional*, defaults to `self.do_pad`):
+                Whether or not to pad the images to the largest height and width in the batch.
+            return_tensors (`str` or `TensorType`, *optional*):
+                The type of tensors to return. Can be one of:
+                - Unset: Return a list of `np.ndarray`.
+                - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
+                - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
+                - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
+                - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
+            return_row_col_info (`bool`, *optional*, default to `False`):
+                Whether to return the number of rows and columns of the split images. This is used for the
+                `Idefics3Processor` to generate prompt strings based on the number of rows and columns.
+            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
+                The channel dimension format for the output image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - Unset: Use the channel dimension format of the input image.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the input image. If unset, the channel dimension format is inferred
+                from the input image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+        """
+        do_resize = do_resize if do_resize is not None else self.do_resize
+        size = size if size is not None else self.size
+        resample = resample if resample is not None else self.resample
+        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
+        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
+        do_image_splitting = do_image_splitting if do_image_splitting is not None else self.do_image_splitting
+        max_image_size = max_image_size if max_image_size is not None else self.max_image_size
+        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
+        image_mean = image_mean if image_mean is not None else self.image_mean
+        image_std = image_std if image_std is not None else self.image_std
+        do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
+        do_pad = do_pad if do_pad is not None else self.do_pad
+
+        images = self.fetch_images(images)
+        images_list = make_nested_list_of_images(images)
+
+        if not valid_images(images_list[0]):
+            raise ValueError(
+                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
+                "torch.Tensor, tf.Tensor or jax.ndarray."
+            )
+
+        validate_preprocess_arguments(
+            do_rescale=do_rescale,
+            rescale_factor=rescale_factor,
+            do_normalize=do_normalize,
+            image_mean=image_mean,
+            image_std=image_std,
+            do_resize=do_resize,
+            size=size,
+            resample=resample,
+        )
+
+        # save the palettes for conversion to RGB
+        palettes_list = [
+            [im.getpalette() if isinstance(im, Image.Image) and im.mode == "P" else None for im in images]
+            for images in images_list
+        ]
+
+        # All transformations expect numpy arrays.
+        images_list = [[to_numpy_array(image) for image in images] for images in images_list]
+
+        # Extra channel dimension for grayscale images
+        if input_data_format in [ChannelDimension.LAST, None]:
+            images_list = [
+                [np.expand_dims(img, axis=-1) if img.ndim == 2 else img for img in images] for images in images_list
+            ]
+        elif input_data_format == ChannelDimension.FIRST:
+            images_list = [
+                [np.expand_dims(img, axis=0) if img.ndim == 2 else img for img in images] for images in images_list
+            ]
+
+        if do_rescale and is_scaled_image(images_list[0][0]):
+            logger.warning_once(
+                "It looks like you are trying to rescale already rescaled images. If the input"
+                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
+            )
+
+        # We assume that all images have the same channel dimension format.
+        if input_data_format is None:
+            input_data_format = infer_channel_dimension_format(images_list[0][0], num_channels=(1, 3, 4))
+
+        if do_resize:
+            images_list = [
+                [
+                    self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format)
+                    for image in images
+                ]
+                for images in images_list
+            ]
+
+        if do_image_splitting:
+            # We first resize both height and width of each image to the nearest max_image_size multiple, disregarding the aspect ratio
+            # for size=(10, max_image_size) -> rescaled_size=(max_image_size, max_image_size)
+            # for size=(11, max_image_size+1) -> rescaled_size=(max_image_size, max_image_size*2)
+            images_list = [
+                [
+                    self.resize_for_vision_encoder(
+                        image, max_image_size["longest_edge"], resample=resample, input_data_format=input_data_format
+                    )
+                    for image in images
+                ]
+                for images in images_list
+            ]
+            images_list_split_arrays = []
+            palettes_list_split_arrays = []
+            images_list_rows = []
+            images_list_cols = []
+            for images, palettes in zip(images_list, palettes_list):
+                split_image_arrays = []
+                split_palettes_arrays = []
+                image_rows = []
+                image_cols = []
+                for image, palette in zip(images, palettes):
+                    split_image_array, rows, cols = self.split_image(
+                        image,
+                        max_image_size=max_image_size,
+                        resample=resample,
+                        input_data_format=input_data_format,
+                    )
+                    split_image_arrays.extend(split_image_array)
+                    split_palettes_arrays.extend([palette] * len(split_image_array))
+                    image_rows.append(rows)
+                    image_cols.append(cols)
+                images_list_split_arrays.append(split_image_arrays)
+                palettes_list_split_arrays.append(split_palettes_arrays)
+                images_list_rows.append(image_rows)
+                images_list_cols.append(image_cols)
+            images_list = images_list_split_arrays
+            palettes_list = palettes_list_split_arrays
+        else:
+            # We square the images to max_image_size
+            images_list = [
+                [
+                    self.resize(
+                        image=image,
+                        size={"height": max_image_size["longest_edge"], "width": max_image_size["longest_edge"]},
+                        resample=resample,
+                        input_data_format=input_data_format,
+                    )
+                    for image in images
+                ]
+                for images in images_list
+            ]
+            images_list_rows = [[0] * len(images) for images in images_list]
+            images_list_cols = [[0] * len(images) for images in images_list]
+
+        if do_convert_rgb:
+            images_list = [
+                [convert_to_rgb(img, palette) for img, palette in zip(images, palettes)]
+                for images, palettes in zip(images_list, palettes_list)
+            ]
+
+        if do_rescale:
+            images_list = [
+                [self.rescale(image, rescale_factor, input_data_format=input_data_format) for image in images]
+                for images in images_list
+            ]
+
+        if do_normalize:
+            images_list = [
+                [
+                    self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format)
+                    for image in images
+                ]
+                for images in images_list
+            ]
+
+        pixel_attention_mask = None
+        if do_pad:
+            images_list, pixel_attention_mask = self.pad(
+                images_list, return_pixel_mask=True, return_tensors=return_tensors, input_data_format=input_data_format
+            )
+
+        if data_format is not None:
+            images_list = [
+                [
+                    to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
+                    for image in images
+                ]
+                for images in images_list
+            ]
+
+        # Faster tensor conversion
+        data = {"pixel_values": np.array(images_list) if do_pad and return_tensors is not None else images_list}
+        if pixel_attention_mask is not None:
+            data["pixel_attention_mask"] = (
+                np.array(pixel_attention_mask) if do_pad and return_tensors is not None else pixel_attention_mask
+            )
+
+        encoding = BatchFeature(data=data, tensor_type=return_tensors)
+
+        # This is needed for generating correct text inputs in the processor - we don't pad to the max number of images
+        if return_row_col_info:
+            encoding["rows"] = images_list_rows
+            encoding["cols"] = images_list_cols
+
+        return encoding
+
+    def get_number_of_image_patches(self, height: int, width: int, images_kwargs=None):
+        """
+        A utility that returns number of image patches for a given image size.
+
+        Args:
+            height (`int`):
+                Height of the input image.
+            width (`int`):
+                Width of the input image.
+            images_kwargs (`dict`, *optional*):
+                Any kwargs to override defaults of the image processor. Only `do_image_splitting`,
+                `max_image_size` and `size` are read; `None` is treated as "no overrides".
+        Returns:
+            `tuple[int, int, int]`: `(num_patches, num_rows, num_cols)`. All three are 1 when splitting is
+            disabled or the resized image fits in a single patch; otherwise `num_patches` is
+            `num_rows * num_cols + 1`.
+        """
+        # The parameter defaults to `None`, so normalise it before the `.get` lookups below.
+        images_kwargs = {} if images_kwargs is None else images_kwargs
+        do_image_splitting = images_kwargs.get("do_image_splitting", self.do_image_splitting)
+        max_image_size = images_kwargs.get("max_image_size", self.max_image_size)
+        size = images_kwargs.get("size", self.size)
+
+        num_patches = num_rows = num_cols = 1
+        if do_image_splitting:
+            # Scale the longest edge to `size["longest_edge"]`, then cap the result at 4096 pixels.
+            height, width = _resize_output_size_rescale_to_max_len(height, width, max_len=size["longest_edge"])
+            height, width = _resize_output_size_scale_below_upper_bound(height, width, max_len=4096)
+
+            # Round each side up, independently, to the next multiple of the patch edge. The previous
+            # landscape/portrait branches computed exactly these two values (their aspect-ratio
+            # assignments were overwritten before being read), so a single code path is equivalent.
+            max_height = max_width = max_image_size["longest_edge"]
+            resized_height = math.ceil(height / max_height) * max_height
+            resized_width = math.ceil(width / max_width) * max_width
+
+            if resized_height > max_height or resized_width > max_width:
+                # Calculate the number of splits
+                num_rows = math.ceil(resized_height / max_height)
+                num_cols = math.ceil(resized_width / max_width)
+                num_patches = num_rows * num_cols + 1
+
+        return num_patches, num_rows, num_cols
+
+
+__all__ = ["Idefics3ImageProcessor"]
diff --git a/phivenv/Lib/site-packages/transformers/models/idefics3/image_processing_idefics3_fast.py b/phivenv/Lib/site-packages/transformers/models/idefics3/image_processing_idefics3_fast.py
new file mode 100644
index 0000000000000000000000000000000000000000..b70829f5b43f60eb5c76ba078848d4693c1f67b3
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/idefics3/image_processing_idefics3_fast.py
@@ -0,0 +1,546 @@
+# coding=utf-8
+# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import math
+from typing import Optional, Union
+
+import torch
+
+from ...image_processing_utils_fast import (
+    BaseImageProcessorFast,
+    BatchFeature,
+    DefaultFastImageProcessorKwargs,
+    SizeDict,
+    group_images_by_shape,
+    reorder_images,
+)
+from ...image_utils import (
+    IMAGENET_STANDARD_MEAN,
+    IMAGENET_STANDARD_STD,
+    ImageInput,
+    PILImageResampling,
+    make_nested_list_of_images,
+)
+from ...processing_utils import Unpack
+from ...utils import TensorType, auto_docstring, is_torchvision_available, logging
+
+
+if is_torchvision_available():
+    from torchvision.transforms import functional as F
+
+
+logger = logging.get_logger(__name__)
+
+MAX_IMAGE_SIZE = 4096  # 4k resolution as absolute maximum
+
+
+def _resize_output_size_rescale_to_max_len(
+    height: int, width: int, min_len: Optional[int] = 1, max_len: Optional[int] = None
+) -> tuple[int, int]:
+    """
+    Compute the `(height, width)` obtained by scaling the longest edge to `max_len`.
+
+    The shorter edge follows the input aspect ratio, is truncated to an integer and bumped to the next even
+    number when odd. Both edges are finally clamped to be at least `min_len`.
+    Args:
+        height (`int`):
+            Height of the input image.
+        width (`int`):
+            Width of the input image.
+        min_len (`int`, *optional*, defaults to 1):
+            Minimum size of either output edge.
+        max_len (`int`, *optional*, defaults to the longest input edge):
+            Length the longest edge is resized to.
+    Returns:
+        `tuple[int, int]`: The output `(height, width)`.
+    """
+    if max_len is None:
+        max_len = max(height, width)
+    aspect_ratio = width / height
+
+    if height > width:
+        # Portrait: the height is the longest edge.
+        height = max_len
+        width = int(height * aspect_ratio)
+        width += width % 2  # round an odd width up to the next even number
+    elif width >= height:
+        # Landscape or square: the width is the longest edge.
+        width = max_len
+        height = int(width / aspect_ratio)
+        height += height % 2  # round an odd height up to the next even number
+
+    # Avoid resizing to a size smaller than min_len
+    return max(height, min_len), max(width, min_len)
+
+
+def _resize_output_size_scale_below_upper_bound(
+    height: int, width: int, max_len: Optional[int] = None
+) -> tuple[int, int]:
+    """
+    Get the output size of an image whose longest edge must not exceed `max_len`.
+
+    A size already within the bound is returned unchanged (apart from the final clamp to at least 1).
+    Otherwise the longest edge is set to `max_len` and the other edge is scaled by the input aspect ratio
+    and truncated to an integer.
+    Args:
+        height (`int`):
+            Height of the input image.
+        width (`int`):
+            Width of the input image.
+        max_len (`int`, *optional*, defaults to the longest input edge):
+            Upper bound for the longest edge. It is compared directly with `height` and `width`, so it
+            must be a number, not a size dictionary.
+    Returns:
+        `tuple[int, int]`: The output `(height, width)`.
+    """
+    max_len = max(height, width) if max_len is None else max_len
+
+    aspect_ratio = width / height
+    if width >= height and width > max_len:
+        width = max_len
+        height = int(width / aspect_ratio)
+    elif height > width and height > max_len:
+        height = max_len
+        width = int(height * aspect_ratio)
+
+    # Avoid resizing to a size smaller than 1
+    height = max(height, 1)
+    width = max(width, 1)
+    return height, width
+
+
+def get_resize_output_image_size(
+    image,
+    resolution_max_side: int,
+) -> tuple[int, int]:
+    """
+    Compute the `(height, width)` an image should be resized to for a given longest-edge target.
+
+    The size is derived in two steps: the longest edge is scaled to `resolution_max_side` (keeping the
+    aspect ratio), then the result is capped so that no edge exceeds `MAX_IMAGE_SIZE`.
+    Args:
+        image (`torch.Tensor`):
+            Image to resize. Height and width are read from its last two dimensions.
+        resolution_max_side (`int`):
+            The longest edge of the image will be resized to this value. The shortest edge will be resized to keep the
+            input aspect ratio.
+    Returns:
+        `tuple[int, int]`: The output `(height, width)`.
+    """
+    input_height, input_width = image.size()[-2:]
+
+    # Step 1: rescale the longest edge to the requested length, preserving the aspect ratio.
+    rescaled = _resize_output_size_rescale_to_max_len(input_height, input_width, max_len=resolution_max_side)
+    # Step 2: shrink again if the result is still above the absolute MAX_IMAGE_SIZE bound.
+    capped_height, capped_width = _resize_output_size_scale_below_upper_bound(*rescaled, max_len=MAX_IMAGE_SIZE)
+    return capped_height, capped_width
+
+
+def get_max_height_width(images_list: list[list["torch.Tensor"]]) -> tuple[int, int]:
+    """
+    Return `(max_height, max_width)` over every image of a nested batch.
+
+    Height and width are taken from the last two dimensions of each tensor.
+    """
+    # Flatten the nested batch into one list of (height, width) pairs.
+    spatial_dims = [image.size()[-2:] for images in images_list for image in images]
+
+    tallest = max(dims[0] for dims in spatial_dims)
+    widest = max(dims[1] for dims in spatial_dims)
+    return (tallest, widest)
+
+
+def make_pixel_mask(image: "torch.Tensor", output_size: tuple[int, int]) -> "torch.Tensor":
+    """
+    Build a pixel mask of shape `output_size` in which 1 marks a valid pixel and 0 marks padding.
+
+    Args:
+        image (`torch.Tensor`):
+            Image the mask is built for. Its last two dimensions give the valid region, and the mask is
+            created on the same device.
+        output_size (`Tuple[int, int]`):
+            `(height, width)` of the returned mask.
+    Returns:
+        `torch.Tensor`: An int64 mask with the top-left `image_height x image_width` block set to 1.
+    """
+    valid_height, valid_width = image.size()[-2:]
+    valid_region = (slice(None, valid_height), slice(None, valid_width))
+
+    pixel_mask = torch.zeros(output_size, dtype=torch.int64, device=image.device)
+    pixel_mask[valid_region] = 1
+    return pixel_mask
+
+
+class Idefics3FastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
+    """
+    do_pad (`bool`, *optional*):
+        Whether to pad the image. If `True`, will pad the patch dimension of the images in the batch to the largest
+        number of patches in the batch. Padding will be applied to the bottom and right with zeros.
+    do_image_splitting (`bool`, *optional*, defaults to `True`):
+        Whether to split the image into sub-images concatenated with the original image. They are split into square
+        patches whose side is `max_image_size["longest_edge"]`.
+    max_image_size (`Dict`, *optional*, defaults to `{"longest_edge": 364}`):
+        Maximum resolution of the patches of images accepted by the model. This is a dictionary containing the key "longest_edge".
+    return_row_col_info (`bool`, *optional*, defaults to `False`):
+        Whether to return the row and column information of the images.
+    """
+
+    do_pad: Optional[bool]
+    do_image_splitting: Optional[bool]
+    max_image_size: Optional[dict[str, int]]
+    return_row_col_info: Optional[bool]
+
+
+@auto_docstring
+class Idefics3ImageProcessorFast(BaseImageProcessorFast):
+    # Class-level defaults for the preprocessing options.
+    # NOTE(review): `resize` replaces a LANCZOS interpolation with BICUBIC and logs a warning, so this
+    # default presumably triggers that fallback once converted by the base class - confirm.
+    resample = PILImageResampling.LANCZOS
+    image_mean = IMAGENET_STANDARD_MEAN
+    image_std = IMAGENET_STANDARD_STD
+    # Longest edge targeted by the first resize: 4 * 364 = 1456 pixels.
+    size = {"longest_edge": 4 * 364}
+    # Side of the square patches produced by `split_images`.
+    max_image_size = {"longest_edge": 364}
+    do_resize = True
+    do_rescale = True
+    do_normalize = True
+    do_convert_rgb = True
+    do_image_splitting = True
+    do_pad = True
+    return_row_col_info = False
+    # Keyword arguments accepted by `preprocess` (see its `Unpack[...]` annotation).
+    valid_kwargs = Idefics3FastImageProcessorKwargs
+
+    def _prepare_images_structure(self, images: ImageInput, expected_ndims: int = 3) -> ImageInput:
+        """
+        Arrange `images` into a nested list by delegating to `make_nested_list_of_images`.
+
+        `expected_ndims` is forwarded unchanged to that helper.
+        """
+        nested_images = make_nested_list_of_images(images, expected_ndims=expected_ndims)
+        return nested_images
+
+    def resize(
+        self,
+        image: "torch.Tensor",
+        size: SizeDict,
+        interpolation: "F.InterpolationMode" = None,
+        antialias: bool = True,
+        **kwargs,
+    ) -> "torch.Tensor":
+        """
+        Resize an image, either by its longest edge or to an explicit height and width.
+
+        When `size.longest_edge` is set, the target size comes from `get_resize_output_image_size`.
+        Otherwise `size.height` and `size.width` are used as given.
+        Args:
+            image (`torch.Tensor`):
+                Image to resize.
+            size (`SizeDict`):
+                Size of the output image: either `longest_edge`, or both `height` and `width`.
+            interpolation (`InterpolationMode`, *optional*, defaults to `InterpolationMode.BILINEAR`):
+                `InterpolationMode` filter to use when resizing the image e.g. `InterpolationMode.BICUBIC`.
+                `InterpolationMode.LANCZOS` is replaced by `InterpolationMode.BICUBIC` with a one-time warning.
+            antialias (`bool`, *optional*, defaults to `True`):
+                Whether to use antialiasing when resizing the image.
+        Raises:
+            ValueError: If `size` provides neither `longest_edge` nor both `height` and `width`.
+        """
+        if interpolation is None:
+            interpolation = F.InterpolationMode.BILINEAR
+
+        # LANCZOS is not available for tensors here, so fall back to BICUBIC and tell the user once.
+        if interpolation == F.InterpolationMode.LANCZOS:
+            logger.warning_once(
+                "You have used fast image processor with LANCZOS resample which not yet supported for torch.Tensor. "
+                "BICUBIC resample will be used as an alternative. Please fall back to slow image processor if you "
+                "want full consistency with the original model."
+            )
+            interpolation = F.InterpolationMode.BICUBIC
+
+        if size.longest_edge:
+            target_size = get_resize_output_image_size(image, resolution_max_side=size.longest_edge)
+        elif size.height and size.width:
+            target_size = (size.height, size.width)
+        else:
+            raise ValueError("size must be a dictionary with key 'longest_edge' or 'height' and 'width'.")
+
+        return F.resize(image, target_size, interpolation=interpolation, antialias=antialias)
+
+    def split_images(
+        self,
+        images: torch.Tensor,
+        max_image_size: dict[str, int],
+        interpolation: "F.InterpolationMode" = None,
+    ):
+        """
+        Split a batch of same-sized images into square patches plus a downscaled copy of the full image.
+
+        This is a "trick" to spend more compute on each image with no changes in the vision encoder:
+        a single image becomes a sequence of images.
+        1) If neither side exceeds `max_image_size["longest_edge"]`, the image is returned as a single frame
+        and the reported number of rows and columns is 0.
+        2) Otherwise the image is cut into `ceil(height / edge)` x `ceil(width / edge)` patches of
+        `edge x edge` pixels (typically 364x364).
+        3) The full image, resized to `edge x edge`, is appended after the patches.
+        Args:
+            images (`torch.Tensor`):
+                Images to split, as a `(batch_size, num_channels, height, width)` tensor.
+            max_image_size (`Dict[str, int]`):
+                Dictionary whose `"longest_edge"` entry is the side of the square patches.
+            interpolation (`InterpolationMode`, *optional*, defaults to `InterpolationMode.BILINEAR`):
+                `InterpolationMode` filter to use when resizing the image e.g. `InterpolationMode.BICUBIC`.
+        Returns:
+            `tuple`: `(frames, num_splits_h, num_splits_w)` where `frames` has shape
+            `(batch_size, n_frames, num_channels, height, width)` and the two lists hold the number of
+            rows / columns repeated once per image of the batch.
+        """
+        batch_size, num_channels, height, width = images.size()
+        patch_height = patch_width = max_image_size["longest_edge"]
+
+        if height <= patch_height and width <= patch_width:
+            # Nothing to split: expose the image itself as the only frame.
+            rows, cols = 0, 0
+            frames = images.unsqueeze(1)
+        else:
+            # Calculate the number of splits
+            rows = math.ceil(height / patch_height)
+            cols = math.ceil(width / patch_width)
+
+            # Cut along the height (dim 2), then along the width (dim 3). `unfold` only yields full
+            # windows, so this presumably relies on both sides being multiples of the patch edge
+            # (as produced by `resize_for_vision_encoder`) - confirm against callers.
+            tiles = images.unfold(2, size=patch_height, step=patch_height)
+            tiles = tiles.unfold(3, size=patch_width, step=patch_width)
+            tiles = tiles.contiguous().view(batch_size, num_channels, -1, patch_height, patch_width)
+            tiles = tiles.permute(0, 2, 1, 3, 4)  # batch_size x n_frames x num_channels x height x width
+
+            # For the global image at the end, we resize it to match the max_image_size, for cpu memory efficiency
+            global_image = self.resize(
+                images, SizeDict(height=patch_height, width=patch_width), interpolation=interpolation
+            )
+            frames = torch.cat((tiles, global_image.unsqueeze(1)), dim=1)
+
+        return frames, [rows] * batch_size, [cols] * batch_size
+
+    def resize_for_vision_encoder(
+        self,
+        image: torch.Tensor,
+        vision_encoder_max_size: int,
+        interpolation: "F.InterpolationMode" = None,
+    ):
+        """
+        Resize images so that both sides are multiples of `vision_encoder_max_size`.
+
+        The longest side is rounded up to the next multiple first. The other side is then derived from
+        the input aspect ratio and rounded up to the next multiple as well.
+        Args:
+            image (`torch.Tensor`):
+                Images to resize. Height and width are read from the last two dimensions.
+            vision_encoder_max_size (`int`):
+                Patch edge of the vision encoder; both output sides are multiples of this value.
+            interpolation (`InterpolationMode`, *optional*, defaults to `InterpolationMode.BILINEAR`):
+                `InterpolationMode` filter to use when resizing the image e.g. `InterpolationMode.BICUBIC`.
+        """
+
+        def _round_up(length):
+            # Smallest multiple of `vision_encoder_max_size` that is >= `length`.
+            return math.ceil(length / vision_encoder_max_size) * vision_encoder_max_size
+
+        height, width = image.size()[-2:]
+        aspect_ratio = width / height
+
+        if height > width:
+            height = _round_up(height)
+            width = _round_up(int(height * aspect_ratio))
+        elif width >= height:
+            width = _round_up(width)
+            height = _round_up(int(width / aspect_ratio))
+
+        target_size = SizeDict(height=height, width=width)
+        return self.resize(image, size=target_size, interpolation=interpolation)
+
+    def pad(
+        self,
+        image: torch.Tensor,
+        padded_size: tuple[int, int],
+        fill: int = 0,
+        return_pixel_mask: bool = True,
+    ):
+        """
+        Pad an image on the bottom and right so that its last two dimensions equal `padded_size`.
+
+        Args:
+            image (`torch.Tensor`):
+                Image to pad. Height and width are read from its last two dimensions; the dimension
+                before them is treated as the channel dimension when building the mask.
+            padded_size (`tuple[int, int]`):
+                Target `(height, width)`.
+            fill (`int`, *optional*, defaults to 0):
+                Value used for the added pixels.
+            return_pixel_mask (`bool`, *optional*, defaults to `True`):
+                Whether to also build a mask with 1 on original pixels and 0 on padding.
+        Returns:
+            `tuple`: The (possibly padded) image and the `torch.int64` pixel mask, or `None` in place of the
+            mask when `return_pixel_mask` is `False`.
+        Raises:
+            ValueError: If `padded_size` is smaller than the image in either dimension.
+        """
+        original_size = image.shape[-2:]
+        padding_bottom = padded_size[0] - original_size[0]
+        padding_right = padded_size[1] - original_size[1]
+
+        if padding_bottom < 0 or padding_right < 0:
+            raise ValueError(
+                f"Padding dimensions are negative. Please make sure that the padded size is larger than the "
+                f"original size. Got padded size: {padded_size}, original size: {original_size}."
+            )
+
+        # Only pad if necessary
+        if original_size != padded_size:
+            # torchvision order is (left, top, right, bottom): grow to the right and downwards only.
+            padding = (0, 0, padding_right, padding_bottom)
+            image = F.pad(image, padding, fill=fill, padding_mode="constant")
+
+        # Make a pixel mask for the image, where 1 indicates a valid pixel and 0 indicates padding.
+        pixel_mask = None
+        if return_pixel_mask:
+            # Selecting channel 0 drops the channel dimension and keeps any leading batch dimensions.
+            pixel_mask = torch.zeros_like(image[..., 0, :, :], dtype=torch.int64)
+            # Index from the right so that the slices always address height and width, even when the
+            # mask carries leading batch dimensions.
+            pixel_mask[..., : original_size[0], : original_size[1]] = 1
+
+        return image, pixel_mask
+
+    @auto_docstring
+    def preprocess(self, images: ImageInput, **kwargs: Unpack[Idefics3FastImageProcessorKwargs]) -> BatchFeature:
+        # Thin override: it only narrows the accepted keyword arguments to
+        # `Idefics3FastImageProcessorKwargs`; all work is delegated to the parent implementation.
+        return super().preprocess(images, **kwargs)
+
+    def _preprocess(
+        self,
+        images: list[list["torch.Tensor"]],
+        do_resize: bool,
+        size: SizeDict,
+        interpolation: Optional["F.InterpolationMode"],
+        do_rescale: bool,
+        rescale_factor: float,
+        do_normalize: bool,
+        image_mean: Optional[Union[float, list[float]]],
+        image_std: Optional[Union[float, list[float]]],
+        do_pad: Optional[bool],
+        do_image_splitting: Optional[bool],
+        max_image_size: Optional[dict[str, int]],
+        return_row_col_info: Optional[bool],
+        disable_grouping: Optional[bool],
+        return_tensors: Optional[Union[str, TensorType]],
+        **kwargs,
+    ) -> BatchFeature:
+        """
+        Process a batch of images for the model.
+        """
+
+        grouped_images, grouped_images_index = group_images_by_shape(
+            images, is_nested=True, disable_grouping=disable_grouping
+        )
+        resized_images_grouped = {}
+        for shape, stacked_images in grouped_images.items():
+            if do_resize:
+                stacked_images = self.resize(stacked_images, size, interpolation=interpolation)
+            resized_images_grouped[shape] = stacked_images
+        resized_images = reorder_images(resized_images_grouped, grouped_images_index, is_nested=True)
+
+        grouped_images, grouped_images_index = group_images_by_shape(
+            resized_images, is_nested=True, disable_grouping=disable_grouping
+        )
+        split_images_grouped = {}
+        if do_image_splitting:
+            rows_grouped = {}
+            cols_grouped = {}
+            for shape, stacked_images in grouped_images.items():
+                stacked_images = self.resize_for_vision_encoder(
+                    stacked_images, max_image_size["longest_edge"], interpolation=interpolation
+                )
+                stacked_images, rows, cols = self.split_images(
+                    stacked_images, max_image_size=max_image_size, interpolation=interpolation
+                )
+                split_images_grouped[shape] = stacked_images
+                rows_grouped[shape] = rows
+                cols_grouped[shape] = cols
+            processed_images = reorder_images(split_images_grouped, grouped_images_index, is_nested=True)
+            rows = reorder_images(rows_grouped, grouped_images_index, is_nested=True)
+            cols = reorder_images(cols_grouped, grouped_images_index, is_nested=True)
+            # flatten the doubly nested list into a nested list
+            for i, group_images in enumerate(processed_images):
+                processed_images[i] = [image for sublist in group_images for image in sublist]
+        else:
+            for shape, stacked_images in grouped_images.items():
+                # We square the images to max_image_size
+                stacked_images = self.resize(
+                    image=stacked_images,
+                    size=SizeDict(height=max_image_size["longest_edge"], width=max_image_size["longest_edge"]),
+                    interpolation=interpolation,
+                )
+                split_images_grouped[shape] = stacked_images
+            processed_images = reorder_images(split_images_grouped, grouped_images_index, is_nested=True)
+            rows = [[0] * len(images) for images in processed_images]
+            cols = [[0] * len(images) for images in processed_images]
+        # Group images by size for further processing
+        # Needed in case do_resize is False, or resize returns images with different sizes
+        grouped_images, grouped_images_index = group_images_by_shape(
+            processed_images, is_nested=True, disable_grouping=disable_grouping
+        )
+        processed_images_grouped = {}
+        for shape, stacked_images in grouped_images.items():
+            # Fused rescale and normalize
+            stacked_images = self.rescale_and_normalize(
+                stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std
+            )
+            processed_images_grouped[shape] = stacked_images
+        processed_images = reorder_images(processed_images_grouped, grouped_images_index, is_nested=True)
+        if do_pad:
+            # Get max images per batch
+            max_num_images = max(len(images_) for images_ in processed_images)
+            max_height, max_width = get_max_height_width(processed_images)
+
+            processed_images_padded = torch.zeros(
+                len(processed_images),
+                max_num_images,
+                *(processed_images[0][0].shape[0], max_height, max_width),
+                device=processed_images[0][0].device,
+            )
+            pixel_attention_masks = torch.zeros(
+                len(processed_images),
+                max_num_images,
+                *(max_height, max_width),
+                device=processed_images[0][0].device,
+            )
+            for i, images in enumerate(processed_images):
+                for j, image in enumerate(images):
+                    processed_images_padded[i, j], pixel_attention_masks[i, j] = self.pad(
+                        image, (max_height, max_width)
+                    )
+            processed_images = processed_images_padded
+
+        if do_pad:
+            data = {"pixel_values": processed_images, "pixel_attention_mask": pixel_attention_masks}
+        elif return_tensors == "pt":
+            data = {"pixel_values": torch.stack([torch.stack(images) for images in processed_images])}
+        else:
+            data = {"pixel_values": processed_images}
+        # This is needed for generating correct text inputs in the processor - we don't pad to the max number of images
+        encoding = BatchFeature(data=data, tensor_type=return_tensors)
+
+        if return_row_col_info:
+            encoding["rows"] = rows
+            encoding["cols"] = cols
+
+        return encoding
+
+    def to_dict(self):
+        serialized = super().to_dict()  # parent's dict form; the two keys below are stripped from it
+        for dropped_key in ("_valid_processor_keys", "return_row_col_info"):
+            serialized.pop(dropped_key, None)
+        return serialized
+
+    def get_number_of_image_patches(self, height: int, width: int, images_kwargs=None):
+        """
+        A utility that returns the number of image patches for a given image size.
+
+        Args:
+            height (`int`):
+                Height of the input image.
+            width (`int`):
+                Width of the input image.
+            images_kwargs (`dict`, *optional*):
+                Any kwargs to override defaults of the image processor. `None` means "no overrides".
+        Returns:
+            `tuple[int, int, int]`: `(num_patches, num_rows, num_cols)`. All three are 1 when splitting is
+            disabled or the resized image already fits in a single `max_image_size` tile.
+        """
+        # The signature defaults `images_kwargs` to None, so guard before calling `.get` on it.
+        images_kwargs = images_kwargs if images_kwargs is not None else {}
+        do_image_splitting = images_kwargs.get("do_image_splitting", self.do_image_splitting)
+        max_image_size = images_kwargs.get("max_image_size", self.max_image_size)
+        size = images_kwargs.get("size", self.size)
+
+        # Values returned when no splitting takes place.
+        num_patches = num_rows = num_cols = 1
+        if do_image_splitting:
+            height, width = _resize_output_size_rescale_to_max_len(height, width, max_len=size["longest_edge"])
+            height, width = _resize_output_size_scale_below_upper_bound(height, width, max_len=MAX_IMAGE_SIZE)
+
+            # Round each side up to the next multiple of the tile edge. The previous aspect-ratio based
+            # assignments were overwritten immediately, and both orientation branches were identical.
+            tile_edge = max_image_size["longest_edge"]
+            resized_height = math.ceil(height / tile_edge) * tile_edge
+            resized_width = math.ceil(width / tile_edge) * tile_edge
+
+            max_height = max_width = tile_edge
+            if resized_height > max_height or resized_width > max_width:
+                # Calculate the number of splits
+                num_rows = math.ceil(resized_height / max_height)
+                num_cols = math.ceil(resized_width / max_width)
+                num_patches = num_rows * num_cols + 1
+
+        return num_patches, num_rows, num_cols
+
+
+__all__ = ["Idefics3ImageProcessorFast"]
diff --git a/phivenv/Lib/site-packages/transformers/models/idefics3/modeling_idefics3.py b/phivenv/Lib/site-packages/transformers/models/idefics3/modeling_idefics3.py
new file mode 100644
index 0000000000000000000000000000000000000000..5b9b751865fffd074ade0083747b2ba1c38230cc
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/idefics3/modeling_idefics3.py
@@ -0,0 +1,1047 @@
+# coding=utf-8
+# Copyright 2024 the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch Idefics3 model."""
+
+from dataclasses import dataclass
+from typing import Callable, Optional, Union
+
+import torch
+import torch.utils.checkpoint
+from torch import nn
+
+from ...activations import ACT2FN
+from ...cache_utils import Cache, DynamicCache
+from ...generation import GenerationMixin
+from ...modeling_attn_mask_utils import _prepare_4d_attention_mask
+from ...modeling_flash_attention_utils import FlashAttentionKwargs
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import BaseModelOutput, ModelOutput
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from ...processing_utils import Unpack
+from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging
+from ..auto import AutoModel
+from .configuration_idefics3 import Idefics3Config, Idefics3VisionConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Base class for Idefics3 model's outputs that may also contain a past key/values (to speed up sequential decoding).
+    """
+)
+class Idefics3BaseModelOutputWithPast(ModelOutput):
+    r"""
+    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+        Sequence of hidden-states at the output of the last layer of the model.
+        If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
+        hidden_size)` is output.
+    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
+        `(batch_size, num_heads, sequence_length, embed_size_per_head)` and optionally if
+        `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads,
+        encoder_sequence_length, embed_size_per_head)`.
+        Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
+        `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
+        input) to speed up sequential decoding.
+    image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
+        Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images,
+        sequence_length, hidden_size)`).
+        image_hidden_states of the model produced by the vision encoder.
+    """
+
+    last_hidden_state: Optional[torch.FloatTensor] = None
+    past_key_values: Optional[tuple[tuple[torch.FloatTensor]]] = None  # NOTE(review): docstring names `Cache` — confirm
+    hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    attentions: Optional[tuple[torch.FloatTensor]] = None
+    image_hidden_states: Optional[tuple[torch.FloatTensor]] = None
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Base class for Idefics causal language model (or autoregressive) outputs.
+    """
+)
+class Idefics3CausalLMOutputWithPast(ModelOutput):
+    r"""
+    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+        Language modeling loss (for next-token prediction).
+    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
+        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
+        `(batch_size, num_heads, sequence_length, embed_size_per_head)`.
+        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
+        `past_key_values` input) to speed up sequential decoding.
+    image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
+        Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images,
+        sequence_length, hidden_size)`).
+        image_hidden_states of the model produced by the vision encoder.
+    """
+
+    loss: Optional[torch.FloatTensor] = None
+    logits: Optional[torch.FloatTensor] = None
+    past_key_values: Optional[list[torch.FloatTensor]] = None  # NOTE(review): docstring names `Cache` — confirm
+    hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    attentions: Optional[tuple[torch.FloatTensor]] = None
+    image_hidden_states: Optional[tuple[torch.FloatTensor]] = None
+
+
+# Copied from transformers.models.idefics2.modeling_idefics2.Idefics2VisionEmbeddings with Idefics2->Idefics3
+class Idefics3VisionEmbeddings(nn.Module):
+    """
+    This is a modified version of `siglip.modeling_siglip.SiglipVisionEmbeddings` to enable images of variable
+    resolution.
+
+    The modifications are adapted from [Patch n' Pack: NaViT, a Vision Transformer for any Aspect Ratio and Resolution](https://huggingface.co/papers/2307.06304)
+    which allows treating images in their native aspect ratio and without the need to resize them to the same
+    fixed size. In particular, we start from the original pre-trained SigLIP model
+    (which uses fixed-size square images) and adapt it by training on images of variable resolutions.
+    """
+
+    def __init__(self, config: Idefics3VisionConfig):
+        super().__init__()
+        self.embed_dim = config.hidden_size
+        self.image_size = config.image_size
+        self.patch_size = config.patch_size
+
+        self.patch_embedding = nn.Conv2d(  # kernel size == stride == patch_size
+            in_channels=config.num_channels,
+            out_channels=self.embed_dim,
+            kernel_size=self.patch_size,
+            stride=self.patch_size,
+            padding="valid",
+        )
+
+        self.num_patches_per_side = self.image_size // self.patch_size
+        self.num_patches = self.num_patches_per_side**2
+        self.num_positions = self.num_patches
+        self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
+
+    def forward(self, pixel_values: torch.FloatTensor, patch_attention_mask: torch.BoolTensor) -> torch.Tensor:
+        batch_size, _, max_im_h, max_im_w = pixel_values.shape
+
+        patch_embeds = self.patch_embedding(pixel_values)
+        embeddings = patch_embeds.flatten(2).transpose(1, 2)  # merge dims from index 2 on, then swap dims 1 and 2
+
+        max_nb_patches_h, max_nb_patches_w = max_im_h // self.patch_size, max_im_w // self.patch_size
+        boundaries = torch.arange(  # bucket edges: multiples of 1 / num_patches_per_side below 1.0
+            1 / self.num_patches_per_side, 1.0, 1 / self.num_patches_per_side, device=pixel_values.device
+        )
+        position_ids = torch.full(  # one position id per patch slot, initialised to 0
+            size=(batch_size, max_nb_patches_h * max_nb_patches_w), fill_value=0, device=pixel_values.device
+        )
+
+        for batch_idx, p_attn_mask in enumerate(patch_attention_mask):
+            nb_patches_h = p_attn_mask[:, 0].sum()  # sum over the first column of this image's mask
+            nb_patches_w = p_attn_mask[0].sum()  # sum over the first row of this image's mask
+
+            h_indices = torch.arange(nb_patches_h, device=position_ids.device, dtype=pixel_values.dtype)
+            w_indices = torch.arange(nb_patches_w, device=position_ids.device, dtype=pixel_values.dtype)
+
+            fractional_coords_h = h_indices / nb_patches_h * (1 - 1e-6)  # index / count, shrunk by (1 - 1e-6)
+            fractional_coords_w = w_indices / nb_patches_w * (1 - 1e-6)
+
+            bucket_coords_h = torch.bucketize(fractional_coords_h, boundaries, right=True)
+            bucket_coords_w = torch.bucketize(fractional_coords_w, boundaries, right=True)
+
+            pos_ids = (bucket_coords_h[:, None] * self.num_patches_per_side + bucket_coords_w).flatten()
+            position_ids[batch_idx][p_attn_mask.view(-1)] = pos_ids  # slots not selected by the mask keep the initial 0
+
+        embeddings = embeddings + self.position_embedding(position_ids)
+        return embeddings
+
+
+# Copied from transformers.models.siglip.modeling_siglip.eager_attention_forward
+def eager_attention_forward(
+    module: nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: Optional[torch.Tensor],
+    scaling: float,
+    dropout: float = 0.0,
+    **kwargs,
+):
+    """Plain matmul/softmax attention; returns `(attn_output, attn_weights)`. Extra `kwargs` are ignored."""
+    scores = torch.matmul(query, key.transpose(-1, -2)) * scaling
+    if attention_mask is not None:
+        scores = scores + attention_mask  # the mask is added to the scores, not multiplied
+    # Softmax is computed in float32 and cast back to the query dtype before dropout.
+    probs = nn.functional.softmax(scores, dim=-1, dtype=torch.float32).to(query.dtype)
+    attn_weights = nn.functional.dropout(probs, p=dropout, training=module.training)
+    # Weighted sum of the values, then swap dims 1 and 2 and make the result contiguous.
+    context = torch.matmul(attn_weights, value)
+    attn_output = context.transpose(1, 2).contiguous()
+    return attn_output, attn_weights
+
+
+# Copied from transformers.models.siglip.modeling_siglip.SiglipAttention with Siglip->Idefics3Vision
+class Idefics3VisionAttention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+    # Copied from transformers.models.clip.modeling_clip.CLIPAttention.__init__
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.embed_dim = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = self.embed_dim // self.num_heads
+        if self.head_dim * self.num_heads != self.embed_dim:
+            raise ValueError(
+                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
+                f" {self.num_heads})."
+            )
+        self.scale = self.head_dim**-0.5  # 1 / sqrt(head_dim)
+        self.dropout = config.attention_dropout
+
+        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
+        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
+        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
+        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)
+
+        # Ignore copy
+        self.is_causal = False
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        **kwargs,  # accepted so callers may pass extra arguments; not referenced in this method
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        """Input shape: Batch x Time x Channel"""
+
+        batch_size, seq_length, embed_dim = hidden_states.shape
+
+        queries = self.q_proj(hidden_states)
+        keys = self.k_proj(hidden_states)
+        values = self.v_proj(hidden_states)
+
+        # Split the channel dim into heads: (batch, seq, embed) -> (batch, num_heads, seq, head_dim).
+        queries = queries.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
+        keys = keys.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
+        values = values.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
+
+        attention_interface: Callable = eager_attention_forward
+        if self.config._attn_implementation != "eager":  # any other implementation is looked up by name
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+        attn_output, attn_weights = attention_interface(
+            self,
+            queries,
+            keys,
+            values,
+            attention_mask,
+            is_causal=self.is_causal,
+            scaling=self.scale,
+            dropout=0.0 if not self.training else self.dropout,  # dropout probability is zero outside training
+        )
+
+        attn_output = attn_output.reshape(batch_size, seq_length, embed_dim).contiguous()
+        attn_output = self.out_proj(attn_output)
+
+        return attn_output, attn_weights
+
+
+# Copied from transformers.models.siglip.modeling_siglip.SiglipMLP with Siglip->Idefics3Vision
+class Idefics3VisionMLP(nn.Module):
+    """Two linear layers (`fc1`, `fc2`) with the `config.hidden_act` activation in between."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        hidden_dim, inner_dim = config.hidden_size, config.intermediate_size
+        self.activation_fn = ACT2FN[config.hidden_act]
+        self.fc1 = nn.Linear(hidden_dim, inner_dim)
+        self.fc2 = nn.Linear(inner_dim, hidden_dim)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        return self.fc2(self.activation_fn(self.fc1(hidden_states)))
+
+
+class Idefics3SimpleMLP(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        # Bias-free projection: vision hidden size * scale_factor**2 -> text hidden size.
+        in_features = config.vision_config.hidden_size * (config.scale_factor**2)
+        self.proj = nn.Linear(in_features, config.text_config.hidden_size, bias=False)
+
+    def forward(self, x):
+        return self.proj(x)
+
+
+# Copied from transformers.models.idefics2.modeling_idefics2.Idefics2EncoderLayer with Idefics2->Idefics3
+class Idefics3EncoderLayer(GradientCheckpointingLayer):
+    def __init__(self, config: Idefics3VisionConfig):
+        super().__init__()
+        self.embed_dim = config.hidden_size
+        self.self_attn = Idefics3VisionAttention(config)
+        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+        self.mlp = Idefics3VisionMLP(config)
+        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+
+    # Copied from transformers.models.siglip.modeling_siglip.SiglipEncoderLayer.forward
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: torch.Tensor,
+        output_attentions: Optional[bool] = False,
+    ) -> tuple[torch.FloatTensor]:
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`):
+                Input to the layer of shape `(batch, seq_len, embed_dim)`.
+            attention_mask (`torch.FloatTensor`):
+                Attention mask of shape `(batch, 1, q_len, k_v_seq_len)` where padding elements are indicated by very large negative values.
+            output_attentions (`bool`, *optional*, defaults to `False`):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+        """
+        residual = hidden_states  # kept for the first residual connection
+
+        hidden_states = self.layer_norm1(hidden_states)  # normalise first, then attend, then add the residual
+        hidden_states, attn_weights = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            output_attentions=output_attentions,  # lands in the attention module's `**kwargs`
+        )
+        hidden_states = residual + hidden_states
+
+        residual = hidden_states  # kept for the second residual connection
+        hidden_states = self.layer_norm2(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states
+
+        outputs = (hidden_states,)  # 1-tuple, extended with the attention weights only on request
+
+        if output_attentions:
+            outputs += (attn_weights,)
+
+        return outputs
+
+
+# Copied from transformers.models.siglip.modeling_siglip.SiglipEncoder with Siglip->Idefics3
+class Idefics3Encoder(nn.Module):
+    """
+    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
+    [`Idefics3EncoderLayer`].
+
+    Args:
+        config: Idefics3Config
+    """
+
+    def __init__(self, config: Idefics3Config):  # NOTE(review): Idefics3VisionTransformer passes its vision config
+        super().__init__()
+        self.config = config
+        self.layers = nn.ModuleList([Idefics3EncoderLayer(config) for _ in range(config.num_hidden_layers)])
+        self.gradient_checkpointing = False
+
+    # Ignore copy
+    def forward(
+        self,
+        inputs_embeds,
+        attention_mask: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, BaseModelOutput]:
+        r"""
+        Args:
+            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
+                than the model's internal embedding lookup matrix.
+            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
+
+                [What are attention masks?](../glossary#attention-mask)
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            output_hidden_states (`bool`, *optional*):
+                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+                for more detail.
+            return_dict (`bool`, *optional*):
+                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+        """
+        # Arguments left as None fall back to the corresponding config values.
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        encoder_states = () if output_hidden_states else None  # None (not an empty tuple) when not requested
+        all_attentions = () if output_attentions else None
+
+        hidden_states = inputs_embeds
+        for encoder_layer in self.layers:
+            if output_hidden_states:
+                encoder_states = encoder_states + (hidden_states,)  # recorded before the layer runs
+            layer_outputs = encoder_layer(
+                hidden_states,
+                attention_mask,  # forwarded unchanged to every layer
+                output_attentions=output_attentions,
+            )
+
+            hidden_states = layer_outputs[0]
+
+            if output_attentions:
+                all_attentions = all_attentions + (layer_outputs[1],)
+
+        if output_hidden_states:
+            encoder_states = encoder_states + (hidden_states,)  # output of the final layer
+
+        if not return_dict:
+            return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
+        return BaseModelOutput(
+            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
+        )
+
+
+# Copied from transformers.models.llama.modeling_llama.repeat_kv
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """
+    Repeat every key/value head `n_rep` times along dim 1, like torch.repeat_interleave(x, dim=1, repeats=n_rep):
+    (batch, num_key_value_heads, seqlen, head_dim) -> (batch, num_key_value_heads * n_rep, seqlen, head_dim).
+    """
+    bsz, kv_heads, seq_len, dim = hidden_states.shape
+    if n_rep != 1:
+        expanded = hidden_states.unsqueeze(2).expand(bsz, kv_heads, n_rep, seq_len, dim)
+        hidden_states = expanded.reshape(bsz, kv_heads * n_rep, seq_len, dim)
+    return hidden_states
+
+
+# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Idefics3
+class Idefics3RMSNorm(nn.Module):
+    def __init__(self, hidden_size, eps=1e-6):
+        """
+        Idefics3RMSNorm is equivalent to T5LayerNorm: a learned scale with no mean-centering and no bias.
+        """
+        super().__init__()
+        self.variance_epsilon = eps
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+
+    def forward(self, hidden_states):
+        orig_dtype = hidden_states.dtype
+        upcast = hidden_states.to(torch.float32)  # statistics are computed in float32
+        mean_square = upcast.pow(2).mean(-1, keepdim=True)
+        normed = upcast * torch.rsqrt(mean_square + self.variance_epsilon)
+        return self.weight * normed.to(orig_dtype)
+
+    def extra_repr(self):
+        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
+
+
+class Idefics3Connector(nn.Module):
+    """Pixel-shuffles the image hidden states, then applies the `Idefics3SimpleMLP` projection."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.scale_factor = config.scale_factor
+        self.modality_projection = Idefics3SimpleMLP(config)
+
+    def pixel_shuffle(self, x, scale_factor=2):
+        """Fold `scale_factor` x `scale_factor` grid neighbours into the channel dim (seq / sf**2, dim * sf**2)."""
+        bsz, seq, embed_dim = x.size()
+        side = int(seq**0.5)  # the sequence is viewed as a side x side grid
+        side_out, dim_out = int(side / scale_factor), embed_dim * (scale_factor**2)
+        x = x.view(bsz, side, side, embed_dim)
+        x = x.view(bsz, side, side_out, embed_dim * scale_factor).permute(0, 2, 1, 3)
+        x = x.reshape(bsz, side_out, side_out, dim_out).permute(0, 2, 1, 3)
+        return x.reshape(bsz, int(seq / (scale_factor**2)), dim_out)
+
+    def forward(self, image_hidden_states):
+        shuffled = self.pixel_shuffle(image_hidden_states, self.scale_factor)
+        return self.modality_projection(shuffled)
+
+
+@auto_docstring
+class Idefics3PreTrainedModel(PreTrainedModel):
+    config: Idefics3Config
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["Idefics3VisionAttention", "Idefics3DecoderLayer"]
+    _skip_keys_device_placement = "past_key_values"
+    _supports_flash_attn = True
+    _supports_sdpa = True
+    _supports_flex_attn = True
+
+    _supports_attention_backend = True
+
+    def _init_weights(self, module):
+        std = getattr(self.config, "initializer_range", self.config.get_text_config().initializer_range)  # text-config value is the fallback
+
+        if isinstance(module, (nn.Linear, nn.Conv2d)):  # normal(0, std) weights, zero bias
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):  # normal(0, std) weights, zeroed padding row if any
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+        elif isinstance(module, nn.LayerNorm):  # weight 1, bias 0
+            module.weight.data.fill_(1.0)
+            module.bias.data.zero_()
+        elif isinstance(module, Idefics3RMSNorm):  # weight 1 (this module has no bias)
+            module.weight.data.fill_(1.0)
+
+
+@auto_docstring(
+    custom_intro="""
+    The Idefics3 Vision Transformer Model outputting raw image embedding.
+    """
+)
+class Idefics3VisionTransformer(Idefics3PreTrainedModel):
+    config: Idefics3VisionConfig
+    _supports_sdpa = True
+    _supports_flash_attn = True
+    _supports_flex_attn = True
+
+    def __init__(self, config: Idefics3VisionConfig):
+        super().__init__(config)
+        embed_dim = config.hidden_size
+
+        self.embeddings = Idefics3VisionEmbeddings(config)
+        self.encoder = Idefics3Encoder(config)
+        self.patch_size = config.patch_size
+        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
+        self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"  # read once, at construction
+
+    # Copied from transformers.models.idefics2.modeling_idefics2.Idefics2VisionTransformer.get_input_embeddings
+    def get_input_embeddings(self):
+        return self.embeddings
+
+    # Copied from transformers.models.idefics2.modeling_idefics2.Idefics2VisionTransformer.set_input_embeddings
+    def set_input_embeddings(self, value):
+        self.embeddings = value
+
+    def forward(
+        self,
+        pixel_values,
+        patch_attention_mask: Optional[torch.BoolTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, BaseModelOutput]:
+        # Arguments left as None fall back to the corresponding config values.
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        batch_size = pixel_values.size(0)
+        if patch_attention_mask is None:  # default mask: all ones, one entry per patch_size x patch_size patch
+            patch_size = self.patch_size
+            patch_attention_mask = torch.ones(
+                (
+                    batch_size,
+                    pixel_values.size(2) // patch_size,
+                    pixel_values.size(3) // patch_size,
+                )
+            )
+            patch_attention_mask = patch_attention_mask.to(dtype=torch.bool, device=pixel_values.device)
+
+        hidden_states = self.embeddings(pixel_values=pixel_values, patch_attention_mask=patch_attention_mask)
+
+        patch_attention_mask = patch_attention_mask.view(batch_size, -1)  # flatten the mask of each image
+        # Without flash-attention-2 the mask is expanded with `_prepare_4d_attention_mask`.
+        # With it, the call to `_upad_input` in `_flash_attention_forward` is expensive, so when the mask is full
+        # of 1s (i.e. attending to the whole sequence) no mask is passed at all, which is equivalent.
+        if not self._use_flash_attention_2:
+            patch_attention_mask = _prepare_4d_attention_mask(patch_attention_mask, hidden_states.dtype)
+        elif not torch.any(~patch_attention_mask):
+            patch_attention_mask = None
+
+        encoder_outputs = self.encoder(
+            inputs_embeds=hidden_states,
+            attention_mask=patch_attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        last_hidden_state = encoder_outputs[0]
+        last_hidden_state = self.post_layernorm(last_hidden_state)  # layer norm on the final hidden state only
+
+        if not return_dict:
+            return (last_hidden_state,) + encoder_outputs[1:]
+
+        return BaseModelOutput(
+            last_hidden_state=last_hidden_state,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    Idefics3 model consisting of a SIGLIP vision encoder and Llama3 language decoder
+    """
+)
+class Idefics3Model(Idefics3PreTrainedModel):
+    def __init__(self, config: Idefics3Config):
+        super().__init__(config)
+        self.padding_idx = self.config.text_config.pad_token_id
+        self.vocab_size = self.config.text_config.vocab_size
+
+        self.vision_model = Idefics3VisionTransformer._from_config(config.vision_config)
+        self.connector = Idefics3Connector(config)
+        self.text_model = AutoModel.from_config(config.text_config)
+
+        self.image_seq_len = int(
+            ((config.vision_config.image_size // config.vision_config.patch_size) ** 2) / (config.scale_factor**2)
+        )
+        self.image_token_id = self.config.image_token_id
+
+        self._use_flash_attention_2 = config.text_config._attn_implementation == "flash_attention_2"
+
+        self.post_init()
+
+    # Copied from transformers.models.idefics2.modeling_idefics2.Idefics2Model.enable_input_require_grads
+    def enable_input_require_grads(self):
+        """
+        Enables the gradients for the input embeddings.
+
+        This is useful for lora when using gradient checkpointing.
+        c.f. https://github.com/huggingface/peft/issues/1402#issuecomment-1913675032
+
+        Override to set output.requires_grad = True for both the decoder's and vision model's embeddings.
+        """
+
+        def get_lowest_module(module):
+            if len(list(module.children())) == 0:
+                # If the module has no children, it is a leaf module (e.g., Linear, Conv2d, etc.)
+                return module
+            else:
+                # Recursively call the function on each child module
+                return get_lowest_module(list(module.children())[0])
+
+        def make_inputs_require_grads(module, input, output):
+            output.requires_grad_(True)
+
+        self._text_require_grads_hook = self.get_input_embeddings().register_forward_hook(make_inputs_require_grads)
+        self._vision_require_grads_hook = get_lowest_module(self.vision_model).register_forward_hook(
+            make_inputs_require_grads
+        )
+
+    # Copied from transformers.models.idefics2.modeling_idefics2.Idefics2Model.disable_input_require_grads
+    def disable_input_require_grads(self):
+        self._text_require_grads_hook.remove()
+        self._vision_require_grads_hook.remove()
+
+    # Copied from transformers.models.idefics2.modeling_idefics2.Idefics2Model.get_input_embeddings
+    def get_input_embeddings(self):
+        return self.text_model.get_input_embeddings()
+
+    # Copied from transformers.models.idefics2.modeling_idefics2.Idefics2Model.set_input_embeddings
+    def set_input_embeddings(self, value):
+        self.text_model.set_input_embeddings(value)
+
+    def inputs_merger(
+        self,
+        input_ids: torch.LongTensor,
+        inputs_embeds: Optional[torch.Tensor],
+        image_hidden_states: Optional[torch.Tensor],
+    ):
+        """
+        This method merges the token embeddings with the image hidden states into one single sequence of vectors that is fed to the transformer LM.
+        The merging happens as follows:
+        - The text token sequence is: `tok_1 tok_2 tok_3 <fake_token_around_image> <image> <image> ... <image> <fake_token_around_image> tok_4`.
+        - We get the image hidden states for the image through the vision encoder and that hidden state, after a pixel shuffle operation, is then projected into the text embedding space.
+        We thus have a sequence of image hidden states of size (1, image_seq_len, hidden_dim), where 1 is for batch_size of 1 image and hidden_dim is the hidden_dim of the LM transformer.
+        - The merging happens so that we obtain the following sequence: `vector_tok_1 vector_tok_2 vector_tok_3 vector_fake_tok_around_image {sequence of image_seq_len image hidden states} vector_fake_tok_around_image vector_tok_4`. That sequence is fed to the LM.
+        - Only `inputs_embeds` is rewritten: the vectors at `<image>` positions are replaced by the image hidden states through `masked_scatter`, so the sequence length is unchanged. When `input_ids` is None, those positions are located by comparing `inputs_embeds` against the embedding of `image_token_id`.
+        """
+        if input_ids is None:
+            special_image_mask = inputs_embeds == self.get_input_embeddings()(
+                torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device)
+            )
+            special_image_mask = special_image_mask.all(-1)
+        else:
+            special_image_mask = input_ids == self.config.image_token_id
+
+        special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
+        image_hidden_states = image_hidden_states.to(inputs_embeds.device, inputs_embeds.dtype)
+        inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_hidden_states)
+        return inputs_embeds
+
+    def get_image_features(self, pixel_values: torch.FloatTensor, pixel_attention_mask: torch.LongTensor = None):
+        """
+        Encodes images into continuous embeddings that can be forwarded to the language model.
+
+        Args:
+            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_images, num_channels, height, width)`):
+                The tensors corresponding to the input images. Images whose values are all 0 are treated as
+                padding and dropped before the vision encoder is called.
+            pixel_attention_mask (`torch.LongTensor`, *optional*):
+                The attention mask indicating padded regions in the image.
+        """
+        batch_size, num_images, num_channels, height, width = pixel_values.shape
+        pixel_values = pixel_values.to(dtype=self.dtype)  # fp16 compatibility
+        pixel_values = pixel_values.view(batch_size * num_images, *pixel_values.shape[2:])
+
+        # Remove padding images - padding images are full 0.
+        nb_values_per_image = pixel_values.shape[1:].numel()
+        real_images_inds = (pixel_values == 0.0).sum(dim=(-1, -2, -3)) != nb_values_per_image
+        pixel_values = pixel_values[real_images_inds].contiguous()
+
+        # Handle the vision attention mask
+        if pixel_attention_mask is None:
+            pixel_attention_mask = torch.ones(
+                size=(pixel_values.size(0), pixel_values.size(2), pixel_values.size(3)),
+                dtype=torch.bool,
+                device=pixel_values.device,
+            )
+        else:
+            # Remove padding images from the mask
+            pixel_attention_mask = pixel_attention_mask.view(batch_size * num_images, *pixel_attention_mask.shape[2:])
+            pixel_attention_mask = pixel_attention_mask[real_images_inds].contiguous()
+
+        patch_size = self.config.vision_config.patch_size
+        patches_subgrid = pixel_attention_mask.unfold(dimension=1, size=patch_size, step=patch_size)
+        patches_subgrid = patches_subgrid.unfold(dimension=2, size=patch_size, step=patch_size)
+        patch_attention_mask = (patches_subgrid.sum(dim=(-1, -2)) > 0).bool()
+
+        # Get sequence from the vision encoder
+        vision_outputs = self.vision_model(pixel_values=pixel_values, patch_attention_mask=patch_attention_mask)
+
+        # Modality projection & resampling
+        image_hidden_states = self.connector(vision_outputs.last_hidden_state)
+        return image_hidden_states
+
+    @can_return_tuple
+    @auto_docstring(
+        custom_intro="""
+        Inputs fed to the model can have an arbitrary number of images. To account for this, pixel_values fed to
+        the model have image padding -> (batch_size, max_num_images, 3, max_heights, max_widths) where
+        max_num_images is the maximum number of images among the batch_size samples in the batch.
+        Padding images are not needed beyond padding the pixel_values at the entrance of the model.
+        For efficiency, we only pass through the vision_model's forward the real images by
+        discarding the padding images i.e. pixel_values of size (image_batch_size, 3, height, width) where
+        image_batch_size would be 7 when num_images_per_sample=[1, 3, 1, 2] and max_num_images would be 3.
+        """
+    )
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        pixel_attention_mask: Optional[torch.BoolTensor] = None,
+        image_hidden_states: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        return_dict: Optional[bool] = None,
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> Union[tuple, Idefics3BaseModelOutputWithPast]:
+        r"""
+        pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*):
+            Mask to avoid performing attention on padding pixel indices.
+        image_hidden_states (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
+            The hidden states of the image encoder after modality projection.
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if self.training and self.text_model.gradient_checkpointing and use_cache:
+            logger.warning_once(
+                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+            )
+            use_cache = False
+
+        # retrieve input_ids and inputs_embeds
+        if input_ids is not None:
+            batch_size, seq_length = input_ids.shape
+        elif inputs_embeds is not None:
+            batch_size, seq_length, _ = inputs_embeds.shape
+        else:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+        if use_cache and past_key_values is None:
+            past_key_values = DynamicCache(config=self.config)
+
+        if inputs_embeds is None:
+            inputs_embeds = self.text_model.get_input_embeddings()(input_ids).to(self.device)
+
+        # START VISUAL INPUTS INTEGRATION
+        if pixel_values is not None and image_hidden_states is not None:
+            raise ValueError("You cannot specify both pixel_values and image_hidden_states at the same time")
+        elif pixel_values is not None:
+            image_hidden_states = self.get_image_features(pixel_values, pixel_attention_mask)
+        elif image_hidden_states is not None:
+            # `input_ids` is None when the caller passes `inputs_embeds`; the embeddings always exist at this point
+            image_hidden_states = image_hidden_states.to(dtype=self.dtype, device=inputs_embeds.device)
+
+        if image_hidden_states is not None:
+            # When generating, do not replace a generated image_token_id by images that simply don't exist
+            inputs_embeds = self.inputs_merger(
+                input_ids=input_ids,
+                inputs_embeds=inputs_embeds,
+                image_hidden_states=image_hidden_states,
+            )
+
+        outputs = self.text_model(
+            inputs_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            cache_position=cache_position,
+            return_dict=True,
+            **kwargs,
+        )
+
+        return Idefics3BaseModelOutputWithPast(
+            last_hidden_state=outputs.last_hidden_state,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            image_hidden_states=image_hidden_states,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    The Idefics3 Model with a language modeling head. It is made up a SigLIP vision encoder, with a language modeling head on top.
+    """
+)
+class Idefics3ForConditionalGeneration(Idefics3PreTrainedModel, GenerationMixin):
+    _tied_weights_keys = ["lm_head.weight"]
+
+    # Copied from transformers.models.idefics2.modeling_idefics2.Idefics2ForConditionalGeneration.__init__ with Idefics2->Idefics3
+    def __init__(self, config):
+        super().__init__(config)
+        self.model = Idefics3Model(config)
+        self.image_token_id = self.config.image_token_id
+
+        self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)
+        self.vocab_size = config.text_config.vocab_size
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    # Copied from transformers.models.idefics2.modeling_idefics2.Idefics2ForConditionalGeneration.enable_input_require_grads
+    def enable_input_require_grads(self):
+        """
+        Enables the gradients for the input embeddings. This is useful for fine-tuning adapter weights while keeping
+        the model weights fixed.
+        """
+
+        def make_inputs_require_grads(module, input, output):
+            output.requires_grad_(True)
+
+        self._text_require_grads_hook = self.get_input_embeddings().register_forward_hook(make_inputs_require_grads)
+        self._vision_require_grads_hook = self.model.vision_model.get_input_embeddings().register_forward_hook(
+            make_inputs_require_grads
+        )
+
+    # Copied from transformers.models.idefics2.modeling_idefics2.Idefics2ForConditionalGeneration.disable_input_require_grads
+    def disable_input_require_grads(self):
+        self._text_require_grads_hook.remove()
+        self._vision_require_grads_hook.remove()
+
+    # Copied from transformers.models.idefics2.modeling_idefics2.Idefics2ForConditionalGeneration.get_input_embeddings
+    def get_input_embeddings(self):
+        return self.model.text_model.get_input_embeddings()
+
+    # Copied from transformers.models.idefics2.modeling_idefics2.Idefics2ForConditionalGeneration.set_input_embeddings
+    def set_input_embeddings(self, value):
+        self.model.text_model.set_input_embeddings(value)
+
+    def get_image_features(self, pixel_values: torch.FloatTensor, pixel_attention_mask: torch.LongTensor = None):
+        return self.model.get_image_features(pixel_values=pixel_values, pixel_attention_mask=pixel_attention_mask)
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        pixel_attention_mask: Optional[torch.BoolTensor] = None,
+        image_hidden_states: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        return_dict: Optional[bool] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> Union[tuple, Idefics3CausalLMOutputWithPast]:
+        r"""
+        pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*):
+            Mask to avoid performing attention on padding pixel indices.
+        image_hidden_states (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
+            The hidden states of the image encoder after modality projection.
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+            config.vocab_size]` or `model.image_token_id` (where `model` is your instance of `Idefics3ForConditionalGeneration`).
+            Tokens with indices set to `model.image_token_id` are ignored (masked), the loss is only
+            computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+        Example:
+
+        ```python
+        >>> import requests
+        >>> import torch
+        >>> from PIL import Image
+        >>> from io import BytesIO
+
+        >>> from transformers import AutoProcessor, AutoModelForVision2Seq
+        >>> from transformers.image_utils import load_image
+
+        >>> # Note that passing the image urls (instead of the actual pil images) to the processor is also possible
+        >>> image1 = load_image("https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg")
+        >>> image2 = load_image("https://cdn.britannica.com/59/94459-050-DBA42467/Skyline-Chicago.jpg")
+        >>> image3 = load_image("https://cdn.britannica.com/68/170868-050-8DDE8263/Golden-Gate-Bridge-San-Francisco.jpg")
+
+        >>> processor = AutoProcessor.from_pretrained("HuggingFaceM4/Idefics3-8B-Llama3")
+        >>> model = AutoModelForVision2Seq.from_pretrained("HuggingFaceM4/Idefics3-8B-Llama3", dtype=torch.bfloat16, device_map="auto")
+
+        >>> # Create inputs
+        >>> messages = [
+        ...     {
+        ...         "role": "user",
+        ...         "content": [
+        ...             {"type": "image"},
+        ...             {"type": "text", "text": "In this image, we can see the city of New York, and more specifically the Statue of Liberty."},
+        ...             {"type": "image"},
+        ...             {"type": "text", "text": "What can we see in this image?"},
+        ...         ]
+        ...     },
+        ...     {
+        ...         "role": "user",
+        ...         "content": [
+        ...             {"type": "image"},
+        ...             {"type": "text", "text": "In which city is that bridge located?"},
+        ...         ]
+        ...     }
+        ... ]
+
+        >>> prompts = [processor.apply_chat_template([message], add_generation_prompt=True) for message in messages]
+        >>> images = [[image1, image2], [image3]]
+        >>> inputs = processor(text=prompts, images=images, padding=True, return_tensors="pt").to(model.device)
+
+        >>> # Generate
+        >>> generated_ids = model.generate(**inputs, max_new_tokens=256)
+        >>> generated_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)
+
+        >>> print(generated_texts[0])
+        Assistant: There are buildings, trees, lights, and water visible in this image.
+
+        >>> print(generated_texts[1])
+        Assistant: The bridge is in San Francisco.
+        ```"""
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
+        outputs = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            pixel_values=pixel_values,
+            pixel_attention_mask=pixel_attention_mask,
+            image_hidden_states=image_hidden_states,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            cache_position=cache_position,
+            return_dict=True,
+            **kwargs,
+        )
+
+        hidden_states = outputs[0]
+        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
+        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+        logits = self.lm_head(hidden_states[:, slice_indices, :])
+
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(
+                logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **kwargs
+            )
+
+        return Idefics3CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            image_hidden_states=outputs.image_hidden_states,
+        )
+
+    # Copied from transformers.models.idefics2.modeling_idefics2.Idefics2ForConditionalGeneration.prepare_inputs_for_generation
+    def prepare_inputs_for_generation(
+        self,
+        input_ids,
+        past_key_values=None,
+        attention_mask=None,
+        inputs_embeds=None,
+        cache_position=None,
+        pixel_values=None,
+        pixel_attention_mask=None,
+        image_hidden_states=None,
+        logits_to_keep=None,
+        **kwargs,
+    ):
+        # Overwritten -- there are mutually exclusive inputs (if the logic to make `image_hidden_states` take
+        # precedence is moved to the model, we can remove this fn)
+
+        model_inputs = super().prepare_inputs_for_generation(
+            input_ids,
+            past_key_values=past_key_values,
+            attention_mask=attention_mask,
+            inputs_embeds=inputs_embeds,
+            cache_position=cache_position,
+            pixel_values=pixel_values,
+            pixel_attention_mask=pixel_attention_mask,
+            image_hidden_states=image_hidden_states,
+            logits_to_keep=logits_to_keep,
+            **kwargs,
+        )
+
+        if image_hidden_states is not None or cache_position[0] != 0:
+            model_inputs["pixel_values"] = None
+            model_inputs["pixel_attention_mask"] = None
+
+        return model_inputs
+
+
+__all__ = ["Idefics3ForConditionalGeneration", "Idefics3PreTrainedModel", "Idefics3Model", "Idefics3VisionTransformer"]
diff --git a/phivenv/Lib/site-packages/transformers/models/idefics3/processing_idefics3.py b/phivenv/Lib/site-packages/transformers/models/idefics3/processing_idefics3.py
new file mode 100644
index 0000000000000000000000000000000000000000..23e32c5f92443d95f680815883105c584a9a53b7
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/idefics3/processing_idefics3.py
@@ -0,0 +1,412 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Processor class for Idefics3.
+"""
+
+import re
+from itertools import accumulate
+from typing import TYPE_CHECKING, Optional, Union
+
+import numpy as np
+
+from ...feature_extraction_utils import BatchFeature
+from ...image_utils import ImageInput, is_valid_image, load_image
+from ...processing_utils import ImagesKwargs, MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack
+from ...tokenization_utils_base import AddedToken, BatchEncoding, TextInput
+from ...utils import logging
+
+
+if TYPE_CHECKING:
+    from ...tokenization_utils_base import PreTokenizedInput
+
+logger = logging.get_logger(__name__)
+
+
+def is_url(val) -> bool:
+    return isinstance(val, str) and val.startswith("http")
+
+
+def is_image_or_image_url(elem):
+    return is_url(elem) or is_valid_image(elem)
+
+
+def _prompt_split_image(image_seq_len, image_rows, image_cols, fake_token_around_image, image_token, global_img_token):
+    """
+    Build the prompt for an image split into an `image_rows` x `image_cols` grid of patches, followed by the global image.
+    """
+    # Every patch and the global image carry the same run of `image_seq_len` image tokens.
+    image_tokens = f"{image_token}" * image_seq_len
+    pieces = []
+    for row in range(1, image_rows + 1):
+        # One text line per grid row; each patch is introduced by the fake token and its 1-based position tag.
+        for col in range(1, image_cols + 1):
+            pieces.append(f"{fake_token_around_image}<row_{row}_col_{col}>{image_tokens}")
+        pieces.append("\n")
+
+    # An extra newline separates the patches from the global image block, which is the only block
+    # that is also closed by a trailing fake token.
+    pieces.append(f"\n{fake_token_around_image}{global_img_token}{image_tokens}{fake_token_around_image}")
+    return "".join(pieces)
+
+
+def _prompt_single_image(image_seq_len, fake_token_around_image, image_token, global_img_token):
+    """Build the prompt for an image that is not split: one global image block wrapped in fake tokens."""
+    # Layout: <fake token> <global tag> <image token> * image_seq_len <fake token>
+    image_tokens = f"{image_token}" * image_seq_len
+    opening = f"{fake_token_around_image}{global_img_token}"
+    closing = f"{fake_token_around_image}"
+    # Concatenate in reading order.
+    return opening + image_tokens + closing
+
+
+def get_image_prompt_string(
+    image_rows, image_cols, image_seq_len, fake_token_around_image, image_token, global_img_token
+):
+    """Return the expanded prompt text for one image, given the grid it was split into."""
+    # A 0 x 0 grid selects the single-image prompt; any other grid selects the split-image prompt.
+    is_unsplit = image_rows == 0 and image_cols == 0
+    if not is_unsplit:
+        return _prompt_split_image(
+            image_seq_len, image_rows, image_cols, fake_token_around_image, image_token, global_img_token
+        )
+    return _prompt_single_image(
+        image_seq_len, fake_token_around_image, image_token, global_img_token
+    )
+
+
+class Idefics3ImagesKwargs(ImagesKwargs, total=False):
+    return_row_col_info: Optional[bool]
+    max_image_size: Optional[dict[str, int]]
+
+
+class Idefics3ProcessorKwargs(ProcessingKwargs, total=False):
+    images_kwargs: Idefics3ImagesKwargs
+
+    _defaults = {
+        "text_kwargs": {
+            "add_special_tokens": True,
+            "padding": False,
+            "is_split_into_words": False,
+            "return_mm_token_type_ids": False,
+        },
+        "images_kwargs": {
+            "return_row_col_info": True,
+        },
+    }
+
+
+Idefics3ProcessorKwargs.__annotations__["images_kwargs"] = Idefics3ImagesKwargs  # python 3.8 compatibility
+
+
+class Idefics3Processor(ProcessorMixin):
+    r"""
+    Constructs an Idefics3 processor which wraps a LLama tokenizer and Idefics3 image processor into a single processor.
+
+    [`Idefics3Processor`] offers all the functionalities of [`Idefics3ImageProcessor`] and [`Idefics3TokenizerFast`]. See
+    the docstring of [`~Idefics3Processor.__call__`] and [`~Idefics3Processor.decode`] for more information.
+
+    Args:
+        image_processor (`Idefics3ImageProcessor`):
+            An instance of [`Idefics3ImageProcessor`]. The image processor is a required input.
+        tokenizer (`PreTrainedTokenizerBase`, *optional*):
+            An instance of [`PreTrainedTokenizerBase`]. This should correspond with the model's text model. The tokenizer is a required input.
+        image_seq_len (`int`, *optional*, defaults to 169):
+            The length of the image sequence i.e. the number of <image> tokens per image in the input.
+            This parameter is used to build the string from the input prompt and image tokens and should match the
+            value the model used. It is computed as: image_seq_len = int(((image_size // patch_size) ** 2) / (scale_factor**2))
+        chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
+            in a chat into a tokenizable string.
+    """
+
+    attributes = ["image_processor", "tokenizer"]
+    image_processor_class = "Idefics3ImageProcessor"
+    tokenizer_class = "AutoTokenizer"
+
+    def __init__(
+        self, image_processor, tokenizer=None, image_seq_len: int = 169, chat_template: Optional[str] = None, **kwargs
+    ):
+        if image_processor is None:
+            raise ValueError("You need to specify an `image_processor`.")
+        if tokenizer is None:
+            raise ValueError("You need to specify a `tokenizer`.")
+
+        self.fake_image_token = AddedToken("<fake_token_around_image>", normalized=False, special=True).content
+        self.image_token = AddedToken("<image>", normalized=False, special=True).content
+        self.end_of_utterance_token = AddedToken("<end_of_utterance>", normalized=False, special=True).content
+        self.global_image_tag = "<global-img>"  # https://github.com/huggingface/transformers/pull/32473/files/8063e5e17362571b693f1db95167f5443a3be1b2#r1734825341
+        self.image_seq_len = image_seq_len
+
+        # Register the special tokens first so that the ids resolved below refer to tokens the tokenizer knows
+        tokens_to_add = {
+            "additional_special_tokens": [
+                self.fake_image_token,
+                self.image_token,
+                self.end_of_utterance_token,
+            ]
+        }
+        tokenizer.add_special_tokens(tokens_to_add)
+        self.image_token_id = tokenizer.convert_tokens_to_ids(self.image_token)
+        self.fake_image_token_id = tokenizer.convert_tokens_to_ids(self.fake_image_token)
+        self.global_image_token_id = tokenizer.convert_tokens_to_ids(self.global_image_tag)
+        self.row_col_ids = [
+            tokenizer.convert_tokens_to_ids(f"<row_{i + 1}_col_{j + 1}>") for i in range(6) for j in range(6)
+        ]
+
+        # This regex matches one or more occurrences of <global-img> tags (optionally surrounded by newline characters)
+        # or <row_x_col_y> tags (where x and y are digits, also optionally surrounded by newline characters).
+        self._regex_to_remove_extra_special_tokens = re.compile(r"(\n?<global-img>\n?|<row_\d+_col_\d+>\n?)+")
+
+        super().__init__(image_processor, tokenizer, chat_template=chat_template, **kwargs)
+
+    def _extract_images_from_prompts(self, prompts):
+        """Return, for each prompt, the list of images it contains, loading the ones given as URLs."""
+        images_per_prompt = []
+        for prompt in prompts:
+            found = []
+            for item in prompt:  # items that are neither an image nor a URL are skipped
+                if is_valid_image(item):
+                    found.append(item)
+                elif is_url(item):
+                    found.append(load_image(item))
+            images_per_prompt.append(found)
+
+    def __call__(
+        self,
+        images: Union[ImageInput, list[ImageInput], list[list[ImageInput]]] = None,
+        text: Union[TextInput, "PreTokenizedInput", list[TextInput], list["PreTokenizedInput"]] = None,
+        audio=None,
+        videos=None,
+        image_seq_len: Optional[int] = None,
+        **kwargs: Unpack[Idefics3ProcessorKwargs],
+    ) -> BatchEncoding:
+        """
+        Processes the input prompts and returns a BatchEncoding.
+
+        Example:
+
+        ```python
+        >>> import requests
+        >>> from transformers import Idefics3Processor
+        >>> from transformers.image_utils import load_image
+
+        >>> processor = Idefics3Processor.from_pretrained("HuggingFaceM4/Idefics3-8B-Llama3")
+        >>> processor.image_processor.do_image_splitting = False  # Force as False to simplify the example
+
+        >>> url1 = "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
+        >>> url2 = "https://cdn.britannica.com/59/94459-050-DBA42467/Skyline-Chicago.jpg"
+
+        >>> image1, image2 = load_image(url1), load_image(url2)
+        >>> images = [[image1], [image2]]
+
+        >>> text = [
+        ...     "<image>In this image, we see",
+        ...     "bla bla bla<image>",
+        ... ]
+        >>> outputs = processor(images=images, text=text, return_tensors="pt", padding=True)
+        >>> input_ids = outputs.input_ids
+        >>> input_tokens = processor.tokenizer.batch_decode(input_ids)
+        >>> print(input_tokens)
+        ['<|begin_of_text|><fake_token_around_image><global-img>((<image>)*169)<fake_token_around_image> In this image, we see', '<|reserved_special_token_0|><|reserved_special_token_0|><|reserved_special_token_0|><|begin_of_text|>bla bla bla<fake_token_around_image><global-img>((<image>)*169)<fake_token_around_image>']
+        ```
+
+        Args:
+            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`, *optional*):
+                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
+                tensor. If is of type `list[ImageInput]`, it's assumed that this is for a single prompt i.e. of batch size 1.
+            text (`Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]`, *optional*):
+                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
+                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
+                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
+                Wherever an image token, `<image>` is encountered it is expanded to
+                `<fake_token_around_image>` + `<row_x_col_y>` + `<image>` * `image_seq_len` + `<fake_token_around_image>`.
+            image_seq_len (`int`, *optional*):
+                The length of the image sequence. If not provided, the default value of self.image_seq_len is used.
+                image_seq_len should be equal to int(((image_size // patch_size) ** 2) / (scale_factor**2))
+            return_tensors (`Union[str, TensorType]`, *optional*):
+                If set, will return tensors of a particular framework. See [`PreTrainedTokenizerFast.__call__`] for more
+                information.
+        """
+        if text is None and images is None:
+            raise ValueError("You must provide either `text` or `images`.")
+
+        output_kwargs = self._merge_kwargs(
+            Idefics3ProcessorKwargs,
+            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
+            **kwargs,
+        )
+
+        image_seq_len = image_seq_len if image_seq_len is not None else self.image_seq_len
+        return_mm_token_type_ids = output_kwargs["text_kwargs"].pop("return_mm_token_type_ids", False)
+        return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
+
+        n_images_in_text = []
+        n_images_in_images = []
+        inputs = {}
+        # Defined up front so the `return_mm_token_type_ids` branch below also works for text-only calls
+        batch_image_seq_lengths = []
+
+        if text is not None:
+            if isinstance(text, str):
+                text = [text]
+            elif not isinstance(text, list) and not isinstance(text[0], str):
+                raise ValueError("Invalid input text. Please provide a string, or a list of strings")
+            n_images_in_text = [sample.count(self.image_token) for sample in text]
+
+        if images is not None:
+            if is_image_or_image_url(images):
+                images = [[images]]
+            elif isinstance(images, (list, tuple)) and is_image_or_image_url(images[0]):
+                if text is not None:
+                    if sum(n_images_in_text) != len(images):
+                        raise ValueError(
+                            f"The total number of {self.image_token} tokens in the prompts should be the same as the number of images passed."
+                            f" Found {sum(n_images_in_text)} {self.image_token} tokens and {len(images)} images."
+                        )
+                    # Reorganize the images to match the prompts
+                    cumsum_images_in_text = [0] + list(accumulate(n_images_in_text))
+                    images = [
+                        images[cumsum_images_in_text[i] : cumsum_images_in_text[i + 1]]
+                        for i in range(len(n_images_in_text))
+                    ]
+                else:
+                    images = [images]
+            elif (
+                not isinstance(images, (list, tuple))
+                and not isinstance(images[0], (list, tuple))
+                and not is_image_or_image_url(images[0][0])
+            ):
+                raise ValueError(
+                    "Invalid input images. Please provide a single image or a list of images or a list of list of images."
+                )
+            n_images_in_images = [len(sample) for sample in images]
+
+            # Load images if they are URLs
+            images = [[load_image(im) if is_url(im) else im for im in sample] for sample in images]
+
+            image_inputs = self.image_processor(images, **output_kwargs["images_kwargs"])
+            inputs.update(image_inputs)
+
+            if text is not None:
+                if n_images_in_images != n_images_in_text:
+                    raise ValueError(
+                        f"The number of images in the text {n_images_in_text} and images {n_images_in_images} should be the same."
+                    )
+
+                # Without row/col info from the image processor, use one (0, 0) grid entry per image of each sample
+                image_rows = inputs.pop("rows", [[0] * n_images for n_images in n_images_in_text])
+                image_cols = inputs.pop("cols", [[0] * n_images for n_images in n_images_in_text])
+
+                fake_image_token = self.fake_image_token
+                image_token = self.image_token
+                global_img_token = self.global_image_tag
+
+                prompt_strings = []
+                for sample, sample_rows, sample_cols in zip(text, image_rows, image_cols):
+                    # Replace the image token with fake tokens around the expanded image token sequence of length `image_seq_len`
+                    image_prompt_strings = []
+                    image_seq_lengths = []
+                    for n_rows, n_cols in zip(sample_rows, sample_cols):
+                        image_prompt_string = get_image_prompt_string(
+                            n_rows,
+                            n_cols,
+                            image_seq_len,
+                            image_token=image_token,
+                            fake_token_around_image=fake_image_token,
+                            global_img_token=global_img_token,
+                        )
+                        # Add +2 and +3 for special BOI/EOI/fake_image_wrapper tokens
+                        row_length = (image_seq_len + 2) * n_cols + 1
+                        image_seq_lengths.append((image_seq_len + 3) + row_length * n_rows)
+                        image_prompt_strings.append(image_prompt_string)
+
+                    batch_image_seq_lengths.append(image_seq_lengths)
+                    split_sample = sample.split(image_token)
+
+                    # Place in the image prompt strings where the image tokens are
+                    sample = split_sample[0]
+                    for i, image_prompt_string in enumerate(image_prompt_strings):
+                        sample += image_prompt_string + split_sample[i + 1]
+                    prompt_strings.append(sample)
+
+                text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"])
+                self._check_special_mm_tokens(prompt_strings, text_inputs, modalities=["image"])
+                inputs.update(text_inputs)
+
+        elif text is not None:
+            if any(n_images_in_text):
+                raise ValueError(
+                    f"Found {sum(n_images_in_text)} {self.image_token} tokens in the text but no images were passed."
+                )
+            text_inputs = self.tokenizer(text=text, **output_kwargs["text_kwargs"])
+            inputs.update(text_inputs)
+
+        if return_mm_token_type_ids:
+            array_ids = np.array(inputs["input_ids"])
+            mm_token_type_ids = np.zeros_like(array_ids)
+            for i, seq_lengths in enumerate(batch_image_seq_lengths):
+                image_start_positions = np.where(array_ids[i] == self.fake_image_token_id)[0]
+                j = 0
+                for seq_len in seq_lengths:
+                    if j >= len(image_start_positions):
+                        break
+                    start = image_start_positions[j]
+                    end = start + seq_len
+                    mm_token_type_ids[i, start:end] = 1
+                    j = np.searchsorted(image_start_positions, end)
+
+            inputs["mm_token_type_ids"] = mm_token_type_ids.tolist()
+
+        return BatchFeature(data=inputs, tensor_type=return_tensors)
+
+    def _get_num_multimodal_tokens(self, image_sizes=None, **kwargs):
+        """
+        Computes the number of placeholder tokens needed for multimodal inputs with the given sizes.
+
+        Args:
+            image_sizes (`list[list[int]]`, *optional*):
+                The input sizes formatted as (height, width) per each image.
+            **kwargs: Overrides merged on top of the default `images_kwargs` before counting image patches.
+
+        Returns:
+            `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided input modalities, along with other useful data.
+        """
+
+        vision_data = {}
+        if image_sizes is not None:
+            # Merge into a fresh dict: updating the class-level `_defaults` in place would leak overrides across calls
+            images_kwargs = {**Idefics3ProcessorKwargs._defaults.get("images_kwargs", {}), **kwargs}
+
+            num_image_row_cols = [
+                self.image_processor.get_number_of_image_patches(*image_size, images_kwargs)
+                for image_size in image_sizes
+            ]
+
+            base_image_length = self.image_seq_len + 3
+            col_length = self.image_seq_len + 2
+            num_image_tokens = []
+            num_image_patches = []
+
+            for num_patches, num_rows, num_cols in num_image_row_cols:
+                row_length = col_length * num_cols + 1
+                num_image_tokens.append(base_image_length + (row_length * num_rows))
+                num_image_patches.append(num_patches)
+
+            vision_data.update({"num_image_tokens": num_image_tokens, "num_image_patches": num_image_patches})
+
+        return MultiModalData(**vision_data)
+
+
+__all__ = ["Idefics3Processor"]
diff --git a/phivenv/Lib/site-packages/transformers/models/ijepa/__init__.py b/phivenv/Lib/site-packages/transformers/models/ijepa/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..8923af1de116219405577646ae2dcedee5602ccc
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/ijepa/__init__.py
@@ -0,0 +1,27 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:  # False at runtime, so the star-imports below are only seen by static type checkers
+    from .configuration_ijepa import *
+    from .modeling_ijepa import *
+else:
+    import sys
+    # Replace this package's entry in `sys.modules` with a `_LazyModule` built from this file's import structure
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/phivenv/Lib/site-packages/transformers/models/ijepa/__pycache__/__init__.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/ijepa/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..06ecf961fd7dc29a7291f68d2a76c9fd41147273
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/ijepa/__pycache__/__init__.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/ijepa/__pycache__/configuration_ijepa.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/ijepa/__pycache__/configuration_ijepa.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a73c6090557ae5a7de76366726d2ada56d933ebe
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/ijepa/__pycache__/configuration_ijepa.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/ijepa/__pycache__/modeling_ijepa.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/ijepa/__pycache__/modeling_ijepa.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..092fab3c96968aec6e9f8f1eb8eab219779a87b6
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/ijepa/__pycache__/modeling_ijepa.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/ijepa/__pycache__/modular_ijepa.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/ijepa/__pycache__/modular_ijepa.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..93025f268b3b4fb06789a42ceb82b2e0dab72ef1
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/ijepa/__pycache__/modular_ijepa.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/ijepa/configuration_ijepa.py b/phivenv/Lib/site-packages/transformers/models/ijepa/configuration_ijepa.py
new file mode 100644
index 0000000000000000000000000000000000000000..5f528adad0d55711f40ea21f58e1e0196822f449
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/ijepa/configuration_ijepa.py
@@ -0,0 +1,121 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""I-JEPA model configuration"""
+
+from ...configuration_utils import PretrainedConfig
+
+
+class IJepaConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`IJepaModel`]. It is used to instantiate an IJEPA
+    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+    defaults will yield a similar configuration to that of the I-JEPA
+    [facebook/ijepa_vith14_1k](https://huggingface.co/facebook/ijepa_vith14_1k) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        hidden_size (`int`, *optional*, defaults to 768):
+            Dimensionality of the encoder layers and the pooler layer.
+        num_hidden_layers (`int`, *optional*, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 12):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        intermediate_size (`int`, *optional*, defaults to 3072):
+            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"selu"` and `"gelu_new"` are supported.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
+            The epsilon used by the layer normalization layers.
+        image_size (`int`, *optional*, defaults to 224):
+            The size (resolution) of each image.
+        patch_size (`int`, *optional*, defaults to 16):
+            The size (resolution) of each patch.
+        num_channels (`int`, *optional*, defaults to 3):
+            The number of input channels.
+        qkv_bias (`bool`, *optional*, defaults to `True`):
+            Whether to add a bias to the queries, keys and values.
+        pooler_output_size (`int`, *optional*):
+            Dimensionality of the pooler layer. If `None` (or any other falsy value, e.g. 0), defaults to `hidden_size`.
+        pooler_act (`str`, *optional*, defaults to `"tanh"`):
+            The activation function to be used by the pooler. Keys of ACT2FN are supported for Flax and
+            Pytorch, and elements of https://www.tensorflow.org/api_docs/python/tf/keras/activations are
+            supported for Tensorflow.
+
+    Example:
+
+    ```python
+    >>> from transformers import IJepaConfig, IJepaModel
+
+    >>> # Initializing a IJEPA ijepa-base-patch16-224 style configuration
+    >>> configuration = IJepaConfig()
+
+    >>> # Initializing a model (with random weights) from the ijepa-base-patch16-224 style configuration
+    >>> model = IJepaModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "ijepa"
+
+    def __init__(
+        self,
+        hidden_size=768,
+        num_hidden_layers=12,
+        num_attention_heads=12,
+        intermediate_size=3072,
+        hidden_act="gelu",
+        hidden_dropout_prob=0.0,
+        attention_probs_dropout_prob=0.0,
+        initializer_range=0.02,
+        layer_norm_eps=1e-12,
+        image_size=224,
+        patch_size=16,
+        num_channels=3,
+        qkv_bias=True,
+        pooler_output_size=None,
+        pooler_act="tanh",
+        **kwargs,
+    ):
+        super().__init__(**kwargs)  # all remaining keyword arguments are forwarded to PretrainedConfig
+
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.intermediate_size = intermediate_size
+        self.hidden_act = hidden_act
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.initializer_range = initializer_range
+        self.layer_norm_eps = layer_norm_eps
+        self.image_size = image_size
+        self.patch_size = patch_size
+        self.num_channels = num_channels
+        self.qkv_bias = qkv_bias
+        self.pooler_output_size = pooler_output_size if pooler_output_size else hidden_size  # falsy -> hidden_size
+        self.pooler_act = pooler_act
+
+
+__all__ = ["IJepaConfig"]
diff --git a/phivenv/Lib/site-packages/transformers/models/ijepa/modeling_ijepa.py b/phivenv/Lib/site-packages/transformers/models/ijepa/modeling_ijepa.py
new file mode 100644
index 0000000000000000000000000000000000000000..cee6e29f8f9bb30aedf7b3b7e10d3a9d8102b563
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/ijepa/modeling_ijepa.py
@@ -0,0 +1,540 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/ijepa/modular_ijepa.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_ijepa.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+import collections.abc
+from typing import Callable, Optional, Union
+
+import torch
+import torch.nn as nn
+
+from ...activations import ACT2FN
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, ImageClassifierOutput
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from ...processing_utils import Unpack
+from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
+from ...utils import TransformersKwargs, auto_docstring, torch_int
+from ...utils.generic import can_return_tuple, check_model_inputs
+from .configuration_ijepa import IJepaConfig
+
+
+class IJepaPatchEmbeddings(nn.Module):
+    """
+    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
+    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
+    Transformer.
+    """
+
+    def __init__(self, config: IJepaConfig):
+        super().__init__()
+        image_size, patch_size = config.image_size, config.patch_size
+        num_channels, hidden_size = config.num_channels, config.hidden_size
+        # Scalar sizes are expanded to pairs; `forward` checks index 0 against height and index 1 against width
+        image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
+        patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
+        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
+        self.image_size = image_size
+        self.patch_size = patch_size
+        self.num_channels = num_channels
+        self.num_patches = num_patches
+        # Kernel size equals stride, so the convolution windows (patches) do not overlap
+        self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)
+
+    def forward(self, pixel_values: torch.Tensor, interpolate_pos_encoding: bool = False) -> torch.Tensor:
+        batch_size, num_channels, height, width = pixel_values.shape
+        if num_channels != self.num_channels:
+            raise ValueError(
+                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
+                f" Expected {self.num_channels} but got {num_channels}."
+            )
+        if not interpolate_pos_encoding:  # the exact-size check is skipped when `interpolate_pos_encoding` is True
+            if height != self.image_size[0] or width != self.image_size[1]:
+                raise ValueError(
+                    f"Input image size ({height}*{width}) doesn't match model"
+                    f" ({self.image_size[0]}*{self.image_size[1]})."
+                )
+        embeddings = self.projection(pixel_values).flatten(2).transpose(1, 2)  # (B, C', H', W') -> (B, H'*W', C')
+        return embeddings
+
+
+class IJepaEmbeddings(nn.Module):
+    """
+    Construct the position and patch embeddings (this module creates no CLS token). Optionally, also the mask token.
+    """
+
+    def __init__(self, config: IJepaConfig, use_mask_token: bool = False) -> None:
+        super().__init__()
+        self.mask_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size)) if use_mask_token else None
+        self.patch_embeddings = IJepaPatchEmbeddings(config)
+        num_patches = self.patch_embeddings.num_patches
+        self.position_embeddings = nn.Parameter(torch.randn(1, num_patches, config.hidden_size))  # one per patch
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.patch_size = config.patch_size
+        self.config = config
+
+    def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
+        """
+        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
+        images. This method is also adapted to support torch.jit tracing.
+
+        Adapted from:
+        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
+        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
+        """
+
+        num_patches = embeddings.shape[1]
+        num_positions = self.position_embeddings.shape[1]
+
+        # always interpolate when tracing to ensure the exported model works for dynamic input shapes
+        if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
+            return self.position_embeddings
+
+        patch_pos_embed = self.position_embeddings
+
+        dim = embeddings.shape[-1]
+
+        new_height = height // self.patch_size  # NOTE(review): needs an int `config.patch_size`; a tuple fails here
+        new_width = width // self.patch_size
+        # The stored positions are reshaped as a square grid, so `num_positions` is assumed to be a perfect square
+        sqrt_num_positions = torch_int(num_positions**0.5)
+        patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
+        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)  # (1, dim, grid, grid) for `interpolate`
+
+        patch_pos_embed = nn.functional.interpolate(
+            patch_pos_embed,
+            size=(new_height, new_width),
+            mode="bicubic",
+            align_corners=False,
+        )
+
+        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)  # back to (1, positions, dim)
+
+        return patch_pos_embed
+
+    def forward(
+        self,
+        pixel_values: torch.Tensor,
+        bool_masked_pos: Optional[torch.BoolTensor] = None,
+        interpolate_pos_encoding: bool = False,
+    ) -> torch.Tensor:
+        batch_size, _, height, width = pixel_values.shape
+        embeddings = self.patch_embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding)
+
+        if bool_masked_pos is not None:  # NOTE: `self.mask_token` is None unless built with `use_mask_token=True`
+            seq_length = embeddings.shape[1]
+            mask_tokens = self.mask_token.expand(batch_size, seq_length, -1)
+            # replace the masked visual tokens by mask_tokens
+            mask = bool_masked_pos.unsqueeze(-1).type_as(mask_tokens)
+            embeddings = embeddings * (1.0 - mask) + mask_tokens * mask
+
+        # add positional encoding to each token
+        if interpolate_pos_encoding:
+            embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
+        else:
+            embeddings = embeddings + self.position_embeddings
+
+        embeddings = self.dropout(embeddings)
+
+        return embeddings
+
+
+def eager_attention_forward(
+    module: nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: Optional[torch.Tensor],
+    scaling: float,
+    dropout: float = 0.0,
+    **kwargs,
+):
+    """Plain (non-fused) scaled dot-product attention; returns `(output, probabilities)`."""
+    # Scaled query/key similarity scores.
+    scores = (query @ key.transpose(-2, -1)) * scaling
+
+    # Softmax is computed in float32 and then cast back to the query dtype.
+    probs = nn.functional.softmax(scores, dim=-1, dtype=torch.float32)
+    probs = probs.to(query.dtype)
+
+    # Dropout removes whole attended-to tokens, as in the original Transformer paper.
+    probs = nn.functional.dropout(probs, p=dropout, training=module.training)
+
+    # The optional mask is applied multiplicatively (used for head masking).
+    if attention_mask is not None:
+        probs = probs * attention_mask
+
+    # Weighted sum of the values, then swap dims 1 and 2 and make the result contiguous.
+    context = (probs @ value).transpose(1, 2).contiguous()
+    return context, probs
+
+
+class IJepaSelfAttention(nn.Module):  # multi-head self-attention: query/key/value projections + attention call
+    def __init__(self, config: IJepaConfig):
+        super().__init__()
+        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
+            raise ValueError(
+                f"The hidden size {config.hidden_size} is not a multiple of the number of attention "
+                f"heads {config.num_attention_heads}."
+            )
+
+        self.config = config
+        self.num_attention_heads = config.num_attention_heads
+        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
+        self.all_head_size = self.num_attention_heads * self.attention_head_size
+        self.dropout_prob = config.attention_probs_dropout_prob
+        self.scaling = self.attention_head_size**-0.5  # 1 / sqrt(head size)
+        self.is_causal = False
+
+        self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
+        self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
+        self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
+
+    def forward(
+        self, hidden_states: torch.Tensor, head_mask: Optional[torch.Tensor] = None
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        batch_size = hidden_states.shape[0]
+        new_shape = batch_size, -1, self.num_attention_heads, self.attention_head_size  # -1: inferred length
+        # Project, split the feature dim into heads and move the head axis forward: (batch, heads, seq, head_size)
+        key_layer = self.key(hidden_states).view(*new_shape).transpose(1, 2)
+        value_layer = self.value(hidden_states).view(*new_shape).transpose(1, 2)
+        query_layer = self.query(hidden_states).view(*new_shape).transpose(1, 2)
+        # Use the eager implementation unless the config selects another entry of ALL_ATTENTION_FUNCTIONS
+        attention_interface: Callable = eager_attention_forward
+        if self.config._attn_implementation != "eager":
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+        context_layer, attention_probs = attention_interface(
+            self,
+            query_layer,
+            key_layer,
+            value_layer,
+            head_mask,  # fifth positional argument, i.e. `attention_mask` of `eager_attention_forward`
+            is_causal=self.is_causal,
+            scaling=self.scaling,
+            dropout=0.0 if not self.training else self.dropout_prob,
+        )
+        # Merge the last two dims back into a single feature dim of size `all_head_size`
+        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
+        context_layer = context_layer.reshape(new_context_layer_shape)
+
+        return context_layer, attention_probs
+
+
+class IJepaSelfOutput(nn.Module):
+    """
+    Output projection applied after self-attention. No residual is added here: IJepaLayer takes care of it, because
+    layer normalization is applied before each block.
+    """
+
+    def __init__(self, config: IJepaConfig):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
+        # `input_tensor` is not used: the residual connection is not added in this module.
+        projected = self.dense(hidden_states)
+        return self.dropout(projected)
+
+
+class IJepaAttention(nn.Module):  # self-attention followed by its output projection; tracks pruned heads
+    def __init__(self, config: IJepaConfig):
+        super().__init__()
+        self.attention = IJepaSelfAttention(config)
+        self.output = IJepaSelfOutput(config)
+        self.pruned_heads = set()
+
+    def prune_heads(self, heads: set[int]):
+        if len(heads) == 0:
+            return
+        heads, index = find_pruneable_heads_and_indices(
+            heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads
+        )
+
+        # Prune linear layers
+        self.attention.query = prune_linear_layer(self.attention.query, index)
+        self.attention.key = prune_linear_layer(self.attention.key, index)
+        self.attention.value = prune_linear_layer(self.attention.value, index)
+        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)  # presumably the input-feature axis
+
+        # Update hyper params and store pruned heads
+        self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads)
+        self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads
+        self.pruned_heads = self.pruned_heads.union(heads)
+
+    def forward(self, hidden_states: torch.Tensor, head_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
+        self_attn_output, _ = self.attention(hidden_states, head_mask)  # attention probabilities are discarded
+        output = self.output(self_attn_output, hidden_states)
+        return output
+
+
+class IJepaIntermediate(nn.Module):
+    def __init__(self, config: IJepaConfig):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
+        if isinstance(config.hidden_act, str):
+            self.intermediate_act_fn = ACT2FN[config.hidden_act]
+        else:
+            self.intermediate_act_fn = config.hidden_act
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.intermediate_act_fn(hidden_states)
+        return hidden_states
+
+
+class IJepaOutput(nn.Module):
+    def __init__(self, config: IJepaConfig):
+        super().__init__()
+        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = hidden_states + input_tensor
+        return hidden_states
+
+
+class IJepaLayer(GradientCheckpointingLayer):
+    """This corresponds to the Block class in the timm implementation."""
+
+    def __init__(self, config: IJepaConfig):
+        super().__init__()
+        self.chunk_size_feed_forward = config.chunk_size_feed_forward
+        self.seq_len_dim = 1
+        self.attention = IJepaAttention(config)
+        self.intermediate = IJepaIntermediate(config)
+        self.output = IJepaOutput(config)
+        self.layernorm_before = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.layernorm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+
+    def forward(self, hidden_states: torch.Tensor, head_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
+        hidden_states_norm = self.layernorm_before(hidden_states)
+        attention_output = self.attention(hidden_states_norm, head_mask)
+
+        # first residual connection
+        hidden_states = attention_output + hidden_states
+
+        # in IJepa, layernorm is also applied after self-attention
+        layer_output = self.layernorm_after(hidden_states)
+        layer_output = self.intermediate(layer_output)
+
+        # second residual connection is done here
+        layer_output = self.output(layer_output, hidden_states)
+
+        return layer_output
+
+
+@auto_docstring
+class IJepaPreTrainedModel(PreTrainedModel):
+    config: IJepaConfig
+    base_model_prefix = "ijepa"
+    main_input_name = "pixel_values"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["IJepaEmbeddings", "IJepaLayer"]
+    _supports_sdpa = True
+    _supports_flash_attn = True
+    _supports_flex_attn = True
+    _supports_attention_backend = True
+    _can_record_outputs = {
+        "hidden_states": IJepaLayer,
+        "attentions": IJepaSelfAttention,
+    }
+
+    def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None:
+        """Initialize the weights"""
+        if isinstance(module, (nn.Linear, nn.Conv2d)):
+            # Upcast the input in `fp32` and cast it back to desired `dtype` to avoid
+            # `trunc_normal_cpu` not implemented in `half` issues
+            module.weight.data = nn.init.trunc_normal_(
+                module.weight.data.to(torch.float32), mean=0.0, std=self.config.initializer_range
+            ).to(module.weight.dtype)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+        elif isinstance(module, IJepaEmbeddings):
+            module.position_embeddings.data = nn.init.trunc_normal_(
+                module.position_embeddings.data.to(torch.float32),
+                mean=0.0,
+                std=self.config.initializer_range,
+            ).to(module.position_embeddings.dtype)
+            if module.mask_token is not None:
+                module.mask_token.data.zero_()
+
+
+class IJepaEncoder(nn.Module):
+    def __init__(self, config: IJepaConfig):
+        super().__init__()
+        self.config = config
+        self.layer = nn.ModuleList([IJepaLayer(config) for _ in range(config.num_hidden_layers)])
+        self.gradient_checkpointing = False
+
+    def forward(self, hidden_states: torch.Tensor, head_mask: Optional[torch.Tensor] = None) -> BaseModelOutput:
+        for i, layer_module in enumerate(self.layer):
+            layer_head_mask = head_mask[i] if head_mask is not None else None
+            hidden_states = layer_module(hidden_states, layer_head_mask)
+
+        return BaseModelOutput(last_hidden_state=hidden_states)
+
+
+class IJepaPooler(nn.Module):
+    def __init__(self, config: IJepaConfig):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.pooler_output_size)
+        self.activation = ACT2FN[config.pooler_act]
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        # We "pool" the model by simply taking the hidden state corresponding
+        # to the first token.
+        first_token_tensor = hidden_states[:, 0]
+        pooled_output = self.dense(first_token_tensor)
+        pooled_output = self.activation(pooled_output)
+        return pooled_output
+
+
+@auto_docstring
+class IJepaModel(IJepaPreTrainedModel):
+    def __init__(self, config: IJepaConfig, add_pooling_layer: bool = False, use_mask_token: bool = False):
+        r"""
+        add_pooling_layer (`bool`, *optional*, defaults to `False`):
+            Whether to add a pooling layer
+        use_mask_token (`bool`, *optional*, defaults to `False`):
+            Whether to use a mask token for masked image modeling.
+        """
+        super().__init__(config)
+        self.config = config
+        self.embeddings = IJepaEmbeddings(config, use_mask_token=use_mask_token)
+        self.encoder = IJepaEncoder(config)
+
+        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.pooler = IJepaPooler(config) if add_pooling_layer else None
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self) -> IJepaPatchEmbeddings:
+        return self.embeddings.patch_embeddings
+
+    def _prune_heads(self, heads_to_prune: dict[int, list[int]]):
+        """
+        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
+        class PreTrainedModel
+        """
+        for layer, heads in heads_to_prune.items():
+            self.encoder.layer[layer].attention.prune_heads(heads)
+
+    @check_model_inputs
+    @auto_docstring
+    def forward(
+        self,
+        pixel_values: Optional[torch.Tensor] = None,
+        bool_masked_pos: Optional[torch.BoolTensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        interpolate_pos_encoding: Optional[bool] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> BaseModelOutputWithPooling:
+        r"""
+        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
+            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
+        """
+
+        if pixel_values is None:
+            raise ValueError("You have to specify pixel_values")
+
+        # Prepare head mask if needed
+        # 1.0 in head_mask indicate we keep the head
+        # attention_probs has shape bsz x n_heads x N x N
+        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
+        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
+        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
+
+        # TODO: maybe have a cleaner way to cast the input (from `ImageProcessor` side?)
+        expected_dtype = self.embeddings.patch_embeddings.projection.weight.dtype
+        if pixel_values.dtype != expected_dtype:
+            pixel_values = pixel_values.to(expected_dtype)
+
+        embedding_output = self.embeddings(
+            pixel_values, bool_masked_pos=bool_masked_pos, interpolate_pos_encoding=interpolate_pos_encoding
+        )
+
+        encoder_outputs: BaseModelOutput = self.encoder(embedding_output, head_mask=head_mask)
+
+        sequence_output = encoder_outputs.last_hidden_state
+        sequence_output = self.layernorm(sequence_output)
+        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None
+
+        return BaseModelOutputWithPooling(last_hidden_state=sequence_output, pooler_output=pooled_output)
+
+
+@auto_docstring(
+    custom_intro="""
+    IJepa Model transformer with an image classification head on top (a linear layer on top of the final hidden states)
+    e.g. for ImageNet.
+
+    <Tip>
+
+        Note that it's possible to fine-tune IJepa on higher resolution images than the ones it has been trained on, by
+        setting `interpolate_pos_encoding` to `True` in the forward of the model. This will interpolate the pre-trained
+        position embeddings to the higher resolution.
+
+    </Tip>
+    """
+)
+class IJepaForImageClassification(IJepaPreTrainedModel):
+    def __init__(self, config: IJepaConfig):
+        super().__init__(config)
+
+        self.num_labels = config.num_labels
+        self.ijepa = IJepaModel(config, add_pooling_layer=False)
+
+        # Classifier head
+        self.classifier = nn.Linear(config.hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity()
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        pixel_values: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        interpolate_pos_encoding: Optional[bool] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> ImageClassifierOutput:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss); if
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+        """
+
+        outputs: BaseModelOutputWithPooling = self.ijepa(
+            pixel_values,
+            head_mask=head_mask,
+            interpolate_pos_encoding=interpolate_pos_encoding,
+            **kwargs,
+        )
+        sequence_output = outputs.last_hidden_state
+        logits = self.classifier(sequence_output.mean(dim=1))
+
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(labels, logits, self.config, **kwargs)
+
+        return ImageClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+__all__ = ["IJepaPreTrainedModel", "IJepaModel", "IJepaForImageClassification"]
diff --git a/phivenv/Lib/site-packages/transformers/models/ijepa/modular_ijepa.py b/phivenv/Lib/site-packages/transformers/models/ijepa/modular_ijepa.py
new file mode 100644
index 0000000000000000000000000000000000000000..7b8e6e152f3c115db4a1712895239467ea45e7df
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/ijepa/modular_ijepa.py
@@ -0,0 +1,186 @@
+from typing import Optional, Union
+
+import torch
+import torch.nn as nn
+
+from transformers.models.ijepa.configuration_ijepa import IJepaConfig
+
+from ...modeling_outputs import BaseModelOutputWithPooling, ImageClassifierOutput
+from ...processing_utils import Unpack
+from ...utils import TransformersKwargs, auto_docstring, torch_int
+from ..vit.modeling_vit import ViTEmbeddings, ViTForImageClassification, ViTModel, ViTPreTrainedModel
+
+
+class IJepaEmbeddings(ViTEmbeddings):
+    def __init__(self, config: IJepaConfig, use_mask_token: bool = False) -> None:
+        super().__init__(config, use_mask_token)
+        # Remove cls_token from IJepaEmbeddings, as it is not used in the model
+        del self.cls_token
+        num_patches = self.patch_embeddings.num_patches
+        self.position_embeddings = nn.Parameter(torch.randn(1, num_patches, config.hidden_size))
+
+    def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
+        """
+        This method interpolates the pre-trained position encodings so that the model can be used on higher resolution
+        images. This method is also adapted to support torch.jit tracing.
+
+        Adapted from:
+        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
+        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
+        """
+
+        num_patches = embeddings.shape[1]
+        num_positions = self.position_embeddings.shape[1]
+
+        # always interpolate when tracing to ensure the exported model works for dynamic input shapes
+        if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
+            return self.position_embeddings
+
+        patch_pos_embed = self.position_embeddings
+
+        dim = embeddings.shape[-1]
+
+        new_height = height // self.patch_size
+        new_width = width // self.patch_size
+
+        sqrt_num_positions = torch_int(num_positions**0.5)
+        patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
+        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)
+
+        patch_pos_embed = nn.functional.interpolate(
+            patch_pos_embed,
+            size=(new_height, new_width),
+            mode="bicubic",
+            align_corners=False,
+        )
+
+        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
+
+        return patch_pos_embed
+
+    def forward(
+        self,
+        pixel_values: torch.Tensor,
+        bool_masked_pos: Optional[torch.BoolTensor] = None,
+        interpolate_pos_encoding: bool = False,
+    ) -> torch.Tensor:
+        batch_size, _, height, width = pixel_values.shape
+        embeddings = self.patch_embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding)
+
+        if bool_masked_pos is not None:
+            seq_length = embeddings.shape[1]
+            mask_tokens = self.mask_token.expand(batch_size, seq_length, -1)
+            # replace the masked visual tokens by mask_tokens
+            mask = bool_masked_pos.unsqueeze(-1).type_as(mask_tokens)
+            embeddings = embeddings * (1.0 - mask) + mask_tokens * mask
+
+        # add positional encoding to each token
+        if interpolate_pos_encoding:
+            embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
+        else:
+            embeddings = embeddings + self.position_embeddings
+
+        embeddings = self.dropout(embeddings)
+
+        return embeddings
+
+
+@auto_docstring
+class IJepaPreTrainedModel(ViTPreTrainedModel):
+    def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None:
+        """Initialize the weights"""
+        if isinstance(module, (nn.Linear, nn.Conv2d)):
+            # Upcast the input in `fp32` and cast it back to desired `dtype` to avoid
+            # `trunc_normal_cpu` not implemented in `half` issues
+            module.weight.data = nn.init.trunc_normal_(
+                module.weight.data.to(torch.float32), mean=0.0, std=self.config.initializer_range
+            ).to(module.weight.dtype)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+        elif isinstance(module, IJepaEmbeddings):
+            module.position_embeddings.data = nn.init.trunc_normal_(
+                module.position_embeddings.data.to(torch.float32),
+                mean=0.0,
+                std=self.config.initializer_range,
+            ).to(module.position_embeddings.dtype)
+            if module.mask_token is not None:
+                module.mask_token.data.zero_()
+
+
+class IJepaModel(IJepaPreTrainedModel, ViTModel):
+    def __init__(self, config: IJepaConfig, add_pooling_layer: bool = False, use_mask_token: bool = False):
+        r"""
+        add_pooling_layer (`bool`, *optional*, defaults to `False`):
+            Whether to add a pooling layer
+        use_mask_token (`bool`, *optional*, defaults to `False`):
+            Whether to use a mask token for masked image modeling.
+        """
+        super().__init__(config)
+        self.config = config
+        self.embeddings = IJepaEmbeddings(config, use_mask_token=use_mask_token)
+
+
+@auto_docstring(
+    custom_intro="""
+    IJepa Model transformer with an image classification head on top (a linear layer on top of the final hidden states)
+    e.g. for ImageNet.
+
+    <Tip>
+
+        Note that it's possible to fine-tune IJepa on higher resolution images than the ones it has been trained on, by
+        setting `interpolate_pos_encoding` to `True` in the forward of the model. This will interpolate the pre-trained
+        position embeddings to the higher resolution.
+
+    </Tip>
+    """
+)
+class IJepaForImageClassification(IJepaPreTrainedModel, ViTForImageClassification):
+    def __init__(self, config: IJepaConfig):
+        super().__init__(config)
+        self.ijepa = IJepaModel(config, add_pooling_layer=False)
+        self.post_init()
+
+    def forward(
+        self,
+        pixel_values: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        interpolate_pos_encoding: Optional[bool] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> ImageClassifierOutput:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss); if
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+        """
+
+        outputs: BaseModelOutputWithPooling = self.ijepa(
+            pixel_values,
+            head_mask=head_mask,
+            interpolate_pos_encoding=interpolate_pos_encoding,
+            **kwargs,
+        )
+        sequence_output = outputs.last_hidden_state
+        logits = self.classifier(sequence_output.mean(dim=1))
+
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(labels, logits, self.config, **kwargs)
+
+        return ImageClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+__all__ = [
+    "IJepaPreTrainedModel",
+    "IJepaModel",
+    "IJepaForImageClassification",
+]
diff --git a/phivenv/Lib/site-packages/transformers/models/imagegpt/__init__.py b/phivenv/Lib/site-packages/transformers/models/imagegpt/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..cb79cea50d6e4ba0633886027cf826248e485aaa
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/imagegpt/__init__.py
@@ -0,0 +1,29 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_imagegpt import *
+    from .feature_extraction_imagegpt import *
+    from .image_processing_imagegpt import *
+    from .modeling_imagegpt import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/phivenv/Lib/site-packages/transformers/models/imagegpt/__pycache__/__init__.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/imagegpt/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3dbc2d57193b476a3fd076b7db95402925a1e97e
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/imagegpt/__pycache__/__init__.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/imagegpt/__pycache__/configuration_imagegpt.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/imagegpt/__pycache__/configuration_imagegpt.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a636051002212583ba488d9b4780f4ab8cffa170
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/imagegpt/__pycache__/configuration_imagegpt.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/imagegpt/__pycache__/feature_extraction_imagegpt.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/imagegpt/__pycache__/feature_extraction_imagegpt.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c358bd6c22a3d6d62027a7efd8009e5b0f007d72
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/imagegpt/__pycache__/feature_extraction_imagegpt.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/imagegpt/__pycache__/image_processing_imagegpt.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/imagegpt/__pycache__/image_processing_imagegpt.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..23a84e5abd37a91cc65d2b259fd3455d2d96a336
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/imagegpt/__pycache__/image_processing_imagegpt.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/imagegpt/__pycache__/modeling_imagegpt.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/imagegpt/__pycache__/modeling_imagegpt.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a13fdc34ffecc09a35b5f565ab86686f7c56aa17
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/imagegpt/__pycache__/modeling_imagegpt.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/imagegpt/configuration_imagegpt.py b/phivenv/Lib/site-packages/transformers/models/imagegpt/configuration_imagegpt.py
new file mode 100644
index 0000000000000000000000000000000000000000..8cfa8d5e47826e4323f74b33430fa872005c88ea
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/imagegpt/configuration_imagegpt.py
@@ -0,0 +1,200 @@
+# coding=utf-8
+# Copyright 2021 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""OpenAI ImageGPT configuration"""
+
+from collections import OrderedDict
+from collections.abc import Mapping
+from typing import TYPE_CHECKING, Any, Optional
+
+from ...configuration_utils import PretrainedConfig
+from ...onnx import OnnxConfig
+from ...utils import logging
+
+
+if TYPE_CHECKING:
+    from ... import FeatureExtractionMixin, TensorType
+
+logger = logging.get_logger(__name__)
+
+
+class ImageGPTConfig(PretrainedConfig):
+    """
+    This is the configuration class to store the configuration of a [`ImageGPTModel`] or a [`TFImageGPTModel`]. It is
+    used to instantiate a GPT-2 model according to the specified arguments, defining the model architecture.
+    Instantiating a configuration with the defaults will yield a similar configuration to that of the ImageGPT
+    [openai/imagegpt-small](https://huggingface.co/openai/imagegpt-small) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 513):
+            Vocabulary size of the GPT-2 model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`ImageGPTModel`] or [`TFImageGPTModel`].
+        n_positions (`int`, *optional*, defaults to 32*32):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            just in case (e.g., 512 or 1024 or 2048).
+        n_embd (`int`, *optional*, defaults to 512):
+            Dimensionality of the embeddings and hidden states.
+        n_layer (`int`, *optional*, defaults to 24):
+            Number of hidden layers in the Transformer encoder.
+        n_head (`int`, *optional*, defaults to 8):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        n_inner (`int`, *optional*, defaults to None):
+            Dimensionality of the inner feed-forward layers. `None` will set it to 4 times n_embd
+        activation_function (`str`, *optional*, defaults to `"quick_gelu"`):
+            Activation function (can be one of the activation functions defined in src/transformers/activations.py).
+            Defaults to "quick_gelu".
+        resid_pdrop (`float`, *optional*, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        embd_pdrop (`float`, *optional*, defaults to 0.1):
+            The dropout ratio for the embeddings.
+        attn_pdrop (`float`, *optional*, defaults to 0.1):
+            The dropout ratio for the attention.
+        layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
+            The epsilon to use in the layer normalization layers.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        scale_attn_weights (`bool`, *optional*, defaults to `True`):
+            Scale attention weights by dividing by sqrt(hidden_size).
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models).
+        scale_attn_by_inverse_layer_idx (`bool`, *optional*, defaults to `False`):
+            Whether to additionally scale attention weights by `1 / (layer_idx + 1)`.
+        reorder_and_upcast_attn (`bool`, *optional*, defaults to `False`):
+            Whether to scale keys (K) prior to computing attention (dot-product) and upcast attention
+            dot-product/softmax to float() when training with mixed precision.
+
+    Example:
+
+    ```python
+    >>> from transformers import ImageGPTConfig, ImageGPTModel
+
+    >>> # Initializing a ImageGPT configuration
+    >>> configuration = ImageGPTConfig()
+
+    >>> # Initializing a model (with random weights) from the configuration
+    >>> model = ImageGPTModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "imagegpt"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    attribute_map = {
+        "hidden_size": "n_embd",
+        "max_position_embeddings": "n_positions",
+        "num_attention_heads": "n_head",
+        "num_hidden_layers": "n_layer",
+    }
+
+    def __init__(
+        self,
+        vocab_size=512 + 1,  # add one for start of sentence (sos) token
+        n_positions=32 * 32,
+        n_embd=512,
+        n_layer=24,
+        n_head=8,
+        n_inner=None,
+        activation_function="quick_gelu",
+        resid_pdrop=0.1,
+        embd_pdrop=0.1,
+        attn_pdrop=0.1,
+        layer_norm_epsilon=1e-5,
+        initializer_range=0.02,
+        scale_attn_weights=True,
+        use_cache=True,
+        tie_word_embeddings=False,
+        scale_attn_by_inverse_layer_idx=False,
+        reorder_and_upcast_attn=False,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.n_positions = n_positions
+        self.n_embd = n_embd
+        self.n_layer = n_layer
+        self.n_head = n_head
+        self.n_inner = n_inner
+        self.activation_function = activation_function
+        self.resid_pdrop = resid_pdrop
+        self.embd_pdrop = embd_pdrop
+        self.attn_pdrop = attn_pdrop
+        self.layer_norm_epsilon = layer_norm_epsilon
+        self.initializer_range = initializer_range
+        self.scale_attn_weights = scale_attn_weights
+        self.use_cache = use_cache
+        self.scale_attn_by_inverse_layer_idx = scale_attn_by_inverse_layer_idx
+        self.reorder_and_upcast_attn = reorder_and_upcast_attn
+        self.tie_word_embeddings = tie_word_embeddings
+
+        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
+
+
+class ImageGPTOnnxConfig(OnnxConfig):
+    @property
+    def inputs(self) -> Mapping[str, Mapping[int, str]]:
+        return OrderedDict(
+            [
+                ("input_ids", {0: "batch", 1: "sequence"}),
+            ]
+        )
+
+    def generate_dummy_inputs(
+        self,
+        preprocessor: "FeatureExtractionMixin",
+        batch_size: int = 1,
+        seq_length: int = -1,
+        is_pair: bool = False,
+        framework: Optional["TensorType"] = None,
+        num_channels: int = 3,
+        image_width: int = 32,
+        image_height: int = 32,
+    ) -> Mapping[str, Any]:
+        """
+        Generate inputs to provide to the ONNX exporter for the specific framework
+
+        Args:
+            preprocessor ([`PreTrainedTokenizerBase`] or [`FeatureExtractionMixin`]):
+                The preprocessor associated with this model configuration.
+            batch_size (`int`, *optional*, defaults to 1):
+                The batch size to export the model for (-1 means dynamic axis).
+            num_choices (`int`, *optional*, defaults to -1):
+                The number of candidate answers provided for multiple choice task (-1 means dynamic axis).
+            seq_length (`int`, *optional*, defaults to -1):
+                The sequence length to export the model for (-1 means dynamic axis).
+            is_pair (`bool`, *optional*, defaults to `False`):
+                Indicate if the input is a pair (sentence 1, sentence 2)
+            framework (`TensorType`, *optional*, defaults to `None`):
+                The framework (PyTorch or TensorFlow) that the tokenizer will generate tensors for.
+            num_channels (`int`, *optional*, defaults to 3):
+                The number of channels of the generated images.
+            image_width (`int`, *optional*, defaults to 32):
+                The width of the generated images.
+            image_height (`int`, *optional*, defaults to 32):
+                The height of the generated images.
+
+        Returns:
+            Mapping[str, Tensor] holding the kwargs to provide to the model's forward function
+        """
+
+        input_image = self._generate_dummy_images(batch_size, num_channels, image_height, image_width)
+        inputs = dict(preprocessor(images=input_image, return_tensors=framework))
+
+        return inputs
+
+
+__all__ = ["ImageGPTConfig", "ImageGPTOnnxConfig"]
diff --git a/phivenv/Lib/site-packages/transformers/models/imagegpt/feature_extraction_imagegpt.py b/phivenv/Lib/site-packages/transformers/models/imagegpt/feature_extraction_imagegpt.py
new file mode 100644
index 0000000000000000000000000000000000000000..46787f139f10a0c339e4cea9524d34c00a03ceb6
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/imagegpt/feature_extraction_imagegpt.py
@@ -0,0 +1,38 @@
+# coding=utf-8
+# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Feature extractor class for ImageGPT."""
+
+import warnings
+
+from ...utils import logging
+from ...utils.import_utils import requires
+from .image_processing_imagegpt import ImageGPTImageProcessor
+
+
+logger = logging.get_logger(__name__)
+
+
+@requires(backends=("vision",))
+class ImageGPTFeatureExtractor(ImageGPTImageProcessor):
+    def __init__(self, *args, **kwargs) -> None:  # deprecation shim: warn, then defer to the parent class
+        deprecation_notice = (
+            "The class ImageGPTFeatureExtractor is deprecated and will be removed in version 5 of Transformers."
+            " Please use ImageGPTImageProcessor instead."
+        )
+        warnings.warn(deprecation_notice, FutureWarning)
+        super().__init__(*args, **kwargs)
+
+
+__all__ = ["ImageGPTFeatureExtractor"]
diff --git a/phivenv/Lib/site-packages/transformers/models/imagegpt/image_processing_imagegpt.py b/phivenv/Lib/site-packages/transformers/models/imagegpt/image_processing_imagegpt.py
new file mode 100644
index 0000000000000000000000000000000000000000..5d3e207dd3eb7d52f5671c4094aee4e615404647
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/imagegpt/image_processing_imagegpt.py
@@ -0,0 +1,304 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Image processor class for ImageGPT."""
+
+from typing import Optional, Union
+
+import numpy as np
+
+from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
+from ...image_transforms import rescale, resize, to_channel_dimension_format
+from ...image_utils import (
+    ChannelDimension,
+    ImageInput,
+    PILImageResampling,
+    infer_channel_dimension_format,
+    is_scaled_image,
+    make_list_of_images,
+    to_numpy_array,
+    valid_images,
+    validate_preprocess_arguments,
+)
+from ...utils import TensorType, filter_out_non_signature_kwargs, is_vision_available, logging
+from ...utils.import_utils import requires
+
+
+if is_vision_available():
+    import PIL
+
+
+logger = logging.get_logger(__name__)  # module logger; `preprocess` uses it for the already-rescaled warning
+
+
+def squared_euclidean_distance(a, b):
+    # Pairwise squared L2 distances between the rows of `a` and the rows of `b`: |a|^2 - 2*a.b + |b|^2.
+    b_t = b.T
+    a_sq = np.sum(np.square(a), axis=1)
+    b_sq = np.sum(np.square(b_t), axis=0)
+    cross = np.matmul(a, b_t)
+    return a_sq[:, None] - 2 * cross + b_sq[None, :]
+
+
+def color_quantize(x, clusters):
+    # Treat `x` as a flat list of 3-channel pixels and map each one to the index of its closest cluster row.
+    pixels = x.reshape(-1, 3)
+    return np.argmin(squared_euclidean_distance(pixels, clusters), axis=1)
+
+
+@requires(backends=("vision",))
+class ImageGPTImageProcessor(BaseImageProcessor):
+    r"""
+    Constructs an ImageGPT image processor. This image processor can be used to resize images to a smaller resolution
+    (such as 32x32 or 64x64), normalize them and finally color quantize them to obtain sequences of "pixel values"
+    (color clusters).
+
+    Args:
+        clusters (`np.ndarray` or `list[list[int]]`, *optional*):
+            The color clusters to use, of shape `(n_clusters, 3)` when color quantizing. Can be overridden by `clusters`
+            in `preprocess`.
+        do_resize (`bool`, *optional*, defaults to `True`):
+            Whether to resize the image's dimensions to `(size["height"], size["width"])`. Can be overridden by
+            `do_resize` in `preprocess`.
+        size (`dict[str, int]` *optional*, defaults to `{"height": 256, "width": 256}`):
+            Size of the image after resizing. Can be overridden by `size` in `preprocess`.
+        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`):
+            Resampling filter to use if resizing the image. Can be overridden by `resample` in `preprocess`.
+        do_normalize (`bool`, *optional*, defaults to `True`):
+            Whether to normalize the image pixel value to between [-1, 1]. Can be overridden by `do_normalize` in
+            `preprocess`.
+        do_color_quantize (`bool`, *optional*, defaults to `True`):
+            Whether to color quantize the image. Can be overridden by `do_color_quantize` in `preprocess`.
+    """
+
+    model_input_names = ["pixel_values"]
+
+    def __init__(
+        self,
+        # clusters is a first argument to maintain backwards compatibility with the old ImageGPTImageProcessor
+        clusters: Optional[Union[list[list[int]], np.ndarray]] = None,
+        do_resize: bool = True,
+        size: Optional[dict[str, int]] = None,
+        resample: PILImageResampling = PILImageResampling.BILINEAR,
+        do_normalize: bool = True,
+        do_color_quantize: bool = True,
+        **kwargs,
+    ) -> None:
+        super().__init__(**kwargs)
+        size = size if size is not None else {"height": 256, "width": 256}
+        size = get_size_dict(size)
+        self.clusters = np.array(clusters) if clusters is not None else None
+        self.do_resize = do_resize
+        self.size = size
+        self.resample = resample
+        self.do_normalize = do_normalize
+        self.do_color_quantize = do_color_quantize
+
+    # Copied from transformers.models.vit.image_processing_vit.ViTImageProcessor.resize
+    def resize(
+        self,
+        image: np.ndarray,
+        size: dict[str, int],
+        resample: PILImageResampling = PILImageResampling.BILINEAR,
+        data_format: Optional[Union[str, ChannelDimension]] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+        **kwargs,
+    ) -> np.ndarray:
+        """
+        Resize an image to `(size["height"], size["width"])`.
+
+        Args:
+            image (`np.ndarray`):
+                Image to resize.
+            size (`dict[str, int]`):
+                Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image.
+            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
+                `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BILINEAR`.
+            data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the output image. If unset, the channel dimension format of the input
+                image is used. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the input image. If unset, the channel dimension format is inferred
+                from the input image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+
+        Returns:
+            `np.ndarray`: The resized image.
+        """
+        size = get_size_dict(size)
+        if "height" not in size or "width" not in size:
+            raise ValueError(f"The `size` dictionary must contain the keys `height` and `width`. Got {size.keys()}")
+        output_size = (size["height"], size["width"])
+        return resize(
+            image,
+            size=output_size,
+            resample=resample,
+            data_format=data_format,
+            input_data_format=input_data_format,
+            **kwargs,
+        )
+
+    def normalize(
+        self,
+        image: np.ndarray,
+        data_format: Optional[Union[str, ChannelDimension]] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    ) -> np.ndarray:
+        """
+        Normalizes an image's pixel values to between [-1, 1] (`image / 127.5 - 1`) and returns the result.
+
+        Args:
+            image (`np.ndarray`):
+                Image to normalize.
+            data_format (`str` or `ChannelDimension`, *optional*):
+                The channel dimension format of the image. If not provided, it will be the same as the input image.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format of the input image. If not provided, it will be inferred.
+        """
+        image = rescale(image=image, scale=1 / 127.5, data_format=data_format, input_data_format=input_data_format)
+        image = image - 1
+        return image
+
+    @filter_out_non_signature_kwargs()
+    def preprocess(
+        self,
+        images: ImageInput,
+        do_resize: Optional[bool] = None,
+        size: Optional[dict[str, int]] = None,
+        resample: Optional[PILImageResampling] = None,
+        do_normalize: Optional[bool] = None,
+        do_color_quantize: Optional[bool] = None,
+        clusters: Optional[Union[list[list[int]], np.ndarray]] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        data_format: Optional[Union[str, ChannelDimension]] = ChannelDimension.FIRST,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    ) -> BatchFeature:
+        """
+        Preprocess an image or batch of images.
+
+        Args:
+            images (`ImageInput`):
+                Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
+                passing in images with pixel values between 0 and 1, set `do_normalize=False`.
+            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
+                Whether to resize the image.
+            size (`dict[str, int]`, *optional*, defaults to `self.size`):
+                Size of the image after resizing.
+            resample (`int`, *optional*, defaults to `self.resample`):
+                Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only
+                has an effect if `do_resize` is set to `True`.
+            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
+                Whether to normalize the image
+            do_color_quantize (`bool`, *optional*, defaults to `self.do_color_quantize`):
+                Whether to color quantize the image.
+            clusters (`np.ndarray` or `list[list[int]]`, *optional*, defaults to `self.clusters`):
+                Clusters used to quantize the image of shape `(n_clusters, 3)`. Only has an effect if
+                `do_color_quantize` is set to `True`.
+            return_tensors (`str` or `TensorType`, *optional*):
+                The type of tensors to return. Can be one of:
+                    - Unset: Return a list of `np.ndarray`.
+                    - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
+                    - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
+                    - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
+                    - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
+            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
+                The channel dimension format for the output image. Can be one of:
+                    - `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                    - `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                Only has an effect if `do_color_quantize` is set to `False`.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the input image. If unset, the channel dimension format is inferred
+                from the input image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+
+        Returns:
+            `BatchFeature`: holds the processed images under the key `"input_ids"`; when color quantizing, each
+            entry is a flattened `(height * width,)` array of cluster indices.
+        """
+        do_resize = do_resize if do_resize is not None else self.do_resize
+        size = size if size is not None else self.size
+        size = get_size_dict(size)
+        resample = resample if resample is not None else self.resample
+        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
+        do_color_quantize = do_color_quantize if do_color_quantize is not None else self.do_color_quantize
+        clusters = clusters if clusters is not None else self.clusters
+        clusters = np.array(clusters) if clusters is not None else None  # keep None so the check below can fire
+
+        images = make_list_of_images(images)
+
+        if not valid_images(images):
+            raise ValueError(
+                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
+                "torch.Tensor, tf.Tensor or jax.ndarray."
+            )
+
+        # Here, normalize() is using a constant factor to divide pixel values.
+        # hence, the method does not need image_mean and image_std.
+        validate_preprocess_arguments(do_resize=do_resize, size=size, resample=resample)
+
+        if do_color_quantize and clusters is None:
+            raise ValueError("Clusters must be specified if do_color_quantize is True.")
+
+        # All transformations expect numpy arrays.
+        images = [to_numpy_array(image) for image in images]
+
+        if do_normalize and is_scaled_image(images[0]):
+            logger.warning_once(
+                "It looks like you are trying to rescale already rescaled images. If you wish to do this, "
+                "make sure to set `do_normalize` to `False` and that pixel values are between [-1, 1].",
+            )
+
+        if input_data_format is None:
+            # We assume that all images have the same channel dimension format.
+            input_data_format = infer_channel_dimension_format(images[0])
+
+        if do_resize:
+            images = [
+                self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format)
+                for image in images
+            ]
+
+        if do_normalize:
+            images = [self.normalize(image=image, input_data_format=input_data_format) for image in images]
+
+        if do_color_quantize:
+            images = [to_channel_dimension_format(image, ChannelDimension.LAST, input_data_format) for image in images]
+            # color quantize from (batch_size, height, width, 3) to (batch_size, height, width)
+            images = np.array(images)
+            images = color_quantize(images, clusters).reshape(images.shape[:-1])
+
+            # flatten to (batch_size, height*width)
+            batch_size = images.shape[0]
+            images = images.reshape(batch_size, -1)
+
+            # We need to convert back to a list of images to keep consistent behaviour across processors.
+            images = list(images)
+        else:
+            images = [
+                to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
+                for image in images
+            ]
+
+        data = {"input_ids": images}
+        return BatchFeature(data=data, tensor_type=return_tensors)
+
+
+__all__ = ["ImageGPTImageProcessor"]
diff --git a/phivenv/Lib/site-packages/transformers/models/imagegpt/modeling_imagegpt.py b/phivenv/Lib/site-packages/transformers/models/imagegpt/modeling_imagegpt.py
new file mode 100644
index 0000000000000000000000000000000000000000..23fe9b1e194d8bbcbea0324fb0e4a53310849d87
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/imagegpt/modeling_imagegpt.py
@@ -0,0 +1,1043 @@
+# coding=utf-8
+# Copyright 2021 The OpenAI Team Authors and HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch OpenAI ImageGPT model."""
+
+import math
+import os
+from typing import Any, Optional, Union
+
+import torch
+import torch.utils.checkpoint
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+
+from ...activations import ACT2FN
+from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache
+from ...generation import GenerationMixin
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import (
+    BaseModelOutputWithPastAndCrossAttentions,
+    CausalLMOutputWithCrossAttentions,
+    SequenceClassifierOutputWithPast,
+)
+from ...modeling_utils import PreTrainedModel
+from ...pytorch_utils import Conv1D, find_pruneable_heads_and_indices, prune_conv1d_layer
+from ...utils import (
+    auto_docstring,
+    logging,
+    torch_float,
+)
+from .configuration_imagegpt import ImageGPTConfig
+
+
+logger = logging.get_logger(__name__)  # module logger; the TF-checkpoint loader below reports progress through it
+
+
+def load_tf_weights_in_imagegpt(model, config, imagegpt_checkpoint_path):
+    """
+    Load a TensorFlow checkpoint into `model` in place (sizes are read from `config`) and return `model`.
+    """
+    try:
+        import re
+
+        import tensorflow as tf
+    except ImportError:
+        logger.error(
+            "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
+            "https://www.tensorflow.org/install/ for installation instructions."
+        )
+        raise
+    tf_path = os.path.abspath(imagegpt_checkpoint_path)
+    logger.info(f"Converting TensorFlow checkpoint from {tf_path}")
+    # Read every checkpoint variable into memory; `names` and `arrays` are kept as parallel lists
+    init_vars = tf.train.list_variables(tf_path)
+    names = []
+    arrays = []
+
+    for name, shape in init_vars:
+        logger.info(f"Loading TF weight {name} with shape {shape}")
+        array = tf.train.load_variable(tf_path, name)
+        names.append(name)
+        arrays.append(array.squeeze())  # size-1 dimensions are dropped before any shape comparison below
+
+    for name, array in zip(names, arrays):
+        name = name[6:]  # skip "model/"
+        name = name.split("/")  # from here on `name` is a list of scope components
+
+        # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculate m and v
+        # which are not required for using pretrained model
+        if any(
+            n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"]
+            for n in name
+        ) or name[-1] in ["_step"]:
+            logger.info("Skipping {}".format("/".join(name)))
+            continue
+
+        pointer = model
+        if name[-1] not in ["wtet"]:  # every variable except "wtet" is resolved under `model.transformer`
+            pointer = getattr(pointer, "transformer")
+
+        for m_name in name:  # walk the scope components, moving `pointer` one attribute deeper each step
+            if re.fullmatch(r"[A-Za-z]+\d+", m_name):
+                scope_names = re.split(r"(\d+)", m_name)  # letters+digits -> [prefix, digits, ""]
+            else:
+                scope_names = [m_name]
+
+            if scope_names[0] == "w" or scope_names[0] == "g":
+                pointer = getattr(pointer, "weight")
+            elif scope_names[0] == "b":
+                pointer = getattr(pointer, "bias")
+            elif scope_names[0] == "wpe" or scope_names[0] == "wte":
+                pointer = getattr(pointer, scope_names[0])
+                pointer = getattr(pointer, "weight")
+            elif scope_names[0] in ["q_proj", "k_proj", "v_proj"]:  # all three target the fused `c_attn` weight
+                pointer = getattr(pointer, "c_attn")
+                pointer = getattr(pointer, "weight")
+            elif len(name) == 3 and name[1] == "attn" and scope_names[0] == "c_proj":
+                pointer = getattr(pointer, scope_names[0])
+                pointer = getattr(pointer, "weight")
+            elif scope_names[0] == "wtet":
+                pointer = getattr(pointer, "lm_head")
+                pointer = getattr(pointer, "weight")
+            elif scope_names[0] == "sos":  # "sos" is stored inside the `wte` weight (see the last-row write below)
+                pointer = getattr(pointer, "wte")
+                pointer = getattr(pointer, "weight")
+            else:
+                pointer = getattr(pointer, scope_names[0])
+            if len(scope_names) >= 2:  # numbered scope: index into the container reached above
+                num = int(scope_names[1])
+                pointer = pointer[num]
+
+        # NOTE: `and` binds tighter than `or`, so this reads (len(name) > 1 and name[1] == "attn") or ... or ...
+        if len(name) > 1 and name[1] == "attn" or name[-1] == "wtet" or name[-1] == "sos" or name[-1] == "wte":
+            pass  # array is used to initialize only part of the pointer so sizes won't match
+        else:
+            try:
+                assert pointer.shape == array.shape
+            except AssertionError as e:
+                e.args += (pointer.shape, array.shape)  # attach both shapes to the error before re-raising
+                raise
+
+        logger.info(f"Initialize PyTorch weight {name}")
+
+        # q, k and v each fill one n_embd-wide column block of the fused weight, in that order
+        if name[-1] == "q_proj":
+            pointer.data[:, : config.n_embd] = torch.from_numpy(array.reshape(config.n_embd, config.n_embd)).T
+        elif name[-1] == "k_proj":
+            pointer.data[:, config.n_embd : 2 * config.n_embd] = torch.from_numpy(
+                array.reshape(config.n_embd, config.n_embd)
+            ).T
+        elif name[-1] == "v_proj":
+            pointer.data[:, 2 * config.n_embd :] = torch.from_numpy(array.reshape(config.n_embd, config.n_embd)).T
+        elif len(name) == 3 and name[1] == "attn" and name[2] == "c_proj":
+            pointer.data = torch.from_numpy(array.reshape(config.n_embd, config.n_embd))
+        elif name[-1] == "wtet":
+            pointer.data = torch.from_numpy(array)
+        elif name[-1] == "wte":
+            pointer.data[: config.vocab_size - 1, :] = torch.from_numpy(array)  # first vocab_size - 1 rows only
+        elif name[-1] == "sos":
+            pointer.data[-1] = torch.from_numpy(array)  # the last row of the same weight
+        else:
+            pointer.data = torch.from_numpy(array)
+
+    return model
+
+
+class ImageGPTLayerNorm(nn.Module):  # scales by the root mean square of the last dim; learnable gain, no bias
+    def __init__(self, hidden_size: tuple[int], eps: float = 1e-5):
+        super().__init__()
+        self.weight = nn.Parameter(torch.Tensor(hidden_size))  # allocated here but not given initial values
+        self.eps = eps
+
+    def forward(self, tensor: torch.Tensor) -> torch.Tensor:
+        # No mean subtraction: divide by sqrt(mean(x^2) + eps) over the last dim, then apply the gain.
+        mean_square = torch.mean(torch.square(tensor), axis=-1, keepdim=True)
+        rms = torch.sqrt(mean_square + self.eps)
+        return tensor / rms * self.weight
+
+
+class ImageGPTAttention(nn.Module):
+    def __init__(self, config, is_cross_attention: Optional[bool] = False, layer_idx: Optional[int] = None):
+        """
+        Args:
+            config: model configuration; read for sizes, dropout rates and the attention-scaling switches.
+            is_cross_attention: build the cross-attention variant (extra `q_attn`, 2-way fused `c_attn`).
+            layer_idx: index of this layer; used for inverse-layer scaling and for cache lookups.
+        """
+        super().__init__()
+
+        # Lower-triangular boolean mask viewed as (1, 1, n_ctx, n_ctx); both buffers are non-persistent.
+        n_ctx = config.max_position_embeddings
+        lower_triangle = torch.tril(torch.ones((n_ctx, n_ctx), dtype=torch.bool))
+        self.register_buffer("bias", lower_triangle.view(1, 1, n_ctx, n_ctx), persistent=False)
+        self.register_buffer("masked_bias", torch.tensor(-1e4), persistent=False)
+
+        self.embed_dim = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = self.embed_dim // self.num_heads
+        self.split_size = self.embed_dim
+        if self.head_dim * self.num_heads != self.embed_dim:
+            raise ValueError(
+                f"`embed_dim` must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
+                f" {self.num_heads})."
+            )
+
+        self.is_cross_attention = is_cross_attention
+        self.layer_idx = layer_idx
+        self.scale_attn_weights = config.scale_attn_weights
+        self.scale_attn_by_inverse_layer_idx = config.scale_attn_by_inverse_layer_idx
+        self.reorder_and_upcast_attn = config.reorder_and_upcast_attn
+
+        # Cross-attention: fused K/V projection plus a separate Q projection; self-attention: one fused Q/K/V.
+        fused_width = 2 * self.embed_dim if self.is_cross_attention else 3 * self.embed_dim
+        self.c_attn = Conv1D(fused_width, self.embed_dim)
+        if self.is_cross_attention:
+            self.q_attn = Conv1D(self.embed_dim, self.embed_dim)
+        self.c_proj = Conv1D(self.embed_dim, self.embed_dim)
+
+        self.attn_dropout = nn.Dropout(config.attn_pdrop)
+        self.resid_dropout = nn.Dropout(config.resid_pdrop)
+
+        self.pruned_heads = set()
+
+    def prune_heads(self, heads):
+        # Prune the given heads out of `c_attn` / `c_proj` and update the size bookkeeping; no-op when empty.
+        if len(heads) == 0:
+            return
+        heads, index = find_pruneable_heads_and_indices(heads, self.num_heads, self.head_dim, self.pruned_heads)
+        # Apply the helper's index to each of the three split_size-wide blocks of `c_attn`.
+        index_attn = torch.cat([index + k * self.split_size for k in range(3)])
+        self.c_attn = prune_conv1d_layer(self.c_attn, index_attn, dim=1)
+        self.c_proj = prune_conv1d_layer(self.c_proj, index, dim=0)
+        # split_size is derived from the pre-pruning num_heads, so it is updated before num_heads is.
+        remaining_heads = self.num_heads - len(heads)
+        self.split_size = (self.split_size // self.num_heads) * remaining_heads
+        self.num_heads = remaining_heads
+        self.pruned_heads = self.pruned_heads.union(heads)
+
+    def _attn(self, query, key, value, attention_mask=None, head_mask=None):
+        """
+        Plain attention: softmax over (optionally scaled, causally masked) query-key scores, applied to `value`.
+
+        Returns:
+            `(attn_output, attn_weights)`, where the weights are post-dropout and post-`head_mask`.
+        """
+        scores = torch.matmul(query, key.transpose(-1, -2))
+
+        if self.scale_attn_weights:
+            # Divide by the square root of the size of the last dimension of `value`.
+            scores = scores / torch_float(value.size(-1) ** 0.5)
+
+        if self.scale_attn_by_inverse_layer_idx:
+            scores = scores / float(self.layer_idx + 1)
+
+        if not self.is_cross_attention:
+            # Only self-attention is causal: take the rows of the triangular buffer that belong to the queries
+            # (the last `q_len` of `k_len` positions) and overwrite every disallowed score with the dtype minimum.
+            q_len, k_len = query.size(-2), key.size(-2)
+            allowed = self.bias[:, :, k_len - q_len : k_len, :k_len]
+            # The fill value is a tensor on the scores' dtype/device to avoid mismatch errors in `torch.where`.
+            fill = torch.tensor(torch.finfo(scores.dtype).min, dtype=scores.dtype, device=scores.device)
+            scores = torch.where(allowed, scores, fill)
+
+        if attention_mask is not None:
+            # The attention mask is additive.
+            scores = scores + attention_mask
+
+        probs = nn.Softmax(dim=-1)(scores)
+        # Cast back to the dtype of `value` (a no-op unless the dtypes differ), then apply dropout.
+        probs = self.attn_dropout(probs.type(value.dtype))
+
+        if head_mask is not None:
+            probs = probs * head_mask
+
+        return torch.matmul(probs, value), probs
+
+    def _upcast_and_reordered_attn(self, query, key, value, attention_mask=None, head_mask=None):
+        """
+        Attention variant that computes the query-key scores in float32 with autocast disabled, folding all
+        scaling into the `alpha` argument of `torch.baddbmm`.
+
+        Returns:
+            `(attn_output, attn_weights)`, with the weights cast to the dtype of `value`.
+
+        Raises:
+            RuntimeError: If the softmax output is not float32.
+        """
+        batch, heads, q_len, head_features = query.size()
+        _, _, k_len, _ = key.size()
+
+        # Combined multiplier: 1 / sqrt(last dim of value) and/or 1 / (layer_idx + 1), depending on the config.
+        alpha = 1.0
+        if self.scale_attn_weights:
+            alpha /= float(value.size(-1)) ** 0.5
+        if self.scale_attn_by_inverse_layer_idx:
+            alpha /= float(self.layer_idx + 1)
+
+        # Output buffer handed to `baddbmm`; it is passed with beta=0 below.
+        scores = torch.empty(batch * heads, q_len, k_len, dtype=torch.float32, device=query.device)
+
+        # Upcast (turn off autocast) and fold the batch and head dims together for the batched matmul.
+        with torch.autocast(query.device.type, enabled=False):
+            flat_q = query.reshape(-1, q_len, head_features)
+            flat_k_t = key.transpose(-1, -2).reshape(-1, head_features, k_len)
+            scores = torch.baddbmm(scores, flat_q.float(), flat_k_t.float(), beta=0, alpha=alpha)
+            scores = scores.reshape(batch, heads, q_len, k_len)
+
+        if not self.is_cross_attention:
+            # Causal masking is applied to self-attention only.
+            allowed = self.bias[:, :, k_len - q_len : k_len, :k_len]
+            # The fill value is a tensor on the scores' dtype/device to avoid mismatch errors in `torch.where`.
+            fill = torch.tensor(torch.finfo(scores.dtype).min, dtype=scores.dtype, device=scores.device)
+            scores = torch.where(allowed, scores, fill)
+
+        if attention_mask is not None:
+            scores = scores + attention_mask
+
+        probs = nn.Softmax(dim=-1)(scores)
+        if probs.dtype != torch.float32:
+            raise RuntimeError("Error with upcasting, attn_weights does not have dtype torch.float32")
+        # Downcast to the dtype of `value` before dropout and the final matmul.
+        probs = self.attn_dropout(probs.type(value.dtype))
+
+        if head_mask is not None:
+            probs = probs * head_mask
+
+        return torch.matmul(probs, value), probs
+
+    def _split_heads(self, tensor, num_heads, attn_head_size):
+        """
+        Reshape the trailing dim into `(num_heads, attn_head_size)`, then swap axes 1 and 2.
+        """
+        leading_dims = tensor.size()[:-1]
+        # (batch, seq_length, hidden) -> (batch, seq_length, head, head_features) -> (batch, head, seq_length, ...)
+        return tensor.view(*leading_dims, num_heads, attn_head_size).permute(0, 2, 1, 3)
+
+    def _merge_heads(self, tensor, num_heads, attn_head_size):
+        """
+        Swap axes 1 and 2, then flatten the last two dims into one of size `num_heads * attn_head_size`.
+        """
+        heads_last = tensor.permute(0, 2, 1, 3).contiguous()  # contiguous copy so the `view` below is valid
+        merged_shape = heads_last.size()[:-2] + (num_heads * attn_head_size,)
+        return heads_last.view(merged_shape)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        layer_past: Optional[Cache] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        use_cache: Optional[bool] = False,
+        output_attentions: Optional[bool] = False,
+        cache_position: Optional[torch.Tensor] = None,
+    ) -> tuple:
+        is_cross_attention = encoder_hidden_states is not None
+        bsz, seq_len, _ = hidden_states.shape
+
+        if layer_past is not None:
+            if isinstance(layer_past, EncoderDecoderCache):
+                is_updated = layer_past.is_updated.get(self.layer_idx)
+                if is_cross_attention:
+                    # after the first generated id, we can subsequently re-use all key/value_states from cache
+                    curr_past_key_value = layer_past.cross_attention_cache
+                else:
+                    curr_past_key_value = layer_past.self_attention_cache
+            else:
+                curr_past_key_value = layer_past
+
+        current_states = encoder_hidden_states if is_cross_attention else hidden_states
+        if is_cross_attention:
+            if not hasattr(self, "q_attn"):
+                raise ValueError(
+                    "If class is used as cross attention, the weights `q_attn` have to be defined. "
+                    "Please make sure to instantiate class with `ImageGPTAttention(..., is_cross_attention=True)`."
+                )
+
+            if layer_past is not None and is_updated:
+                # reuse k,v, cross_attentions, and compute only q
+                query = self.q_attn(hidden_states)
+                key = curr_past_key_value.layers[self.layer_idx].keys
+                value = curr_past_key_value.layers[self.layer_idx].values
+            else:
+                query = self.q_attn(hidden_states)
+                key, value = self.c_attn(current_states).split(self.split_size, dim=2)
+                key = key.view(bsz, -1, self.num_heads, self.head_dim).transpose(1, 2)
+                value = value.view(bsz, -1, self.num_heads, self.head_dim).transpose(1, 2)
+        else:
+            query, key, value = self.c_attn(current_states).split(self.split_size, dim=2)
+            key = key.view(bsz, -1, self.num_heads, self.head_dim).transpose(1, 2)
+            value = value.view(bsz, -1, self.num_heads, self.head_dim).transpose(1, 2)
+
+        if layer_past is not None:
+            # save all key/value_states to cache to be re-used for fast auto-regressive generation
+            cache_position = cache_position if not is_cross_attention else None
+            key, value = curr_past_key_value.update(key, value, self.layer_idx, {"cache_position": cache_position})
+            # set flag that curr layer for cross-attn is already updated so we can re-use in subsequent calls
+            if is_cross_attention:
+                layer_past.is_updated[self.layer_idx] = True
+
+        query = query.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
+
+        if self.reorder_and_upcast_attn:
+            attn_output, attn_weights = self._upcast_and_reordered_attn(query, key, value, attention_mask, head_mask)
+        else:
+            attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask)
+
+        attn_output = self._merge_heads(attn_output, self.num_heads, self.head_dim)
+        attn_output = self.c_proj(attn_output)
+        attn_output = self.resid_dropout(attn_output)
+
+        return attn_output, attn_weights
+
+
+class ImageGPTMLP(nn.Module):
+    def __init__(self, intermediate_size, config):
+        super().__init__()
+        embed_dim = config.hidden_size
+        self.c_fc = Conv1D(intermediate_size, embed_dim)
+        self.c_proj = Conv1D(embed_dim, intermediate_size)
+        self.act = ACT2FN[config.activation_function]
+        self.dropout = nn.Dropout(config.resid_pdrop)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.c_fc(hidden_states)
+        hidden_states = self.act(hidden_states)
+        hidden_states = self.c_proj(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        return hidden_states
+
+
+class ImageGPTBlock(GradientCheckpointingLayer):
+    def __init__(self, config, layer_idx=None):
+        super().__init__()
+        hidden_size = config.hidden_size
+        inner_dim = config.n_inner if config.n_inner is not None else 4 * hidden_size
+
+        self.ln_1 = ImageGPTLayerNorm(hidden_size, eps=config.layer_norm_epsilon)
+        self.attn = ImageGPTAttention(config, layer_idx=layer_idx)
+        self.ln_2 = ImageGPTLayerNorm(hidden_size, eps=config.layer_norm_epsilon)
+
+        if config.add_cross_attention:
+            self.crossattention = ImageGPTAttention(config, is_cross_attention=True, layer_idx=layer_idx)
+            self.ln_cross_attn = ImageGPTLayerNorm(hidden_size, eps=config.layer_norm_epsilon)
+
+        self.mlp = ImageGPTMLP(inner_dim, config)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        layer_past: Optional[Cache] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        use_cache: Optional[bool] = False,
+        output_attentions: Optional[bool] = False,
+        cache_position: Optional[torch.Tensor] = None,
+    ) -> tuple:
+        residual = hidden_states
+        hidden_states = self.ln_1(hidden_states)
+        attn_outputs = self.attn(
+            hidden_states,
+            layer_past=layer_past,
+            attention_mask=attention_mask,
+            head_mask=head_mask,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            cache_position=cache_position,
+        )
+        attn_output = attn_outputs[0]
+        outputs = attn_outputs[1:]
+        # residual connection
+        hidden_states = attn_output + residual
+
+        if encoder_hidden_states is not None:
+            # add one self-attention block for cross-attention
+            if not hasattr(self, "crossattention"):
+                raise ValueError(
+                    f"If `encoder_hidden_states` are passed, {self} has to be instantiated with "
+                    "cross-attention layers by setting `config.add_cross_attention=True`"
+                )
+            residual = hidden_states
+            hidden_states = self.ln_cross_attn(hidden_states)
+            cross_attn_outputs = self.crossattention(
+                hidden_states,
+                layer_past=layer_past,
+                attention_mask=attention_mask,
+                head_mask=head_mask,
+                encoder_hidden_states=encoder_hidden_states,
+                encoder_attention_mask=encoder_attention_mask,
+                output_attentions=output_attentions,
+                cache_position=cache_position,
+            )
+            attn_output = cross_attn_outputs[0]
+            # residual connection
+            hidden_states = residual + attn_output
+            outputs = outputs + cross_attn_outputs[1:]  # add cross attentions if we output attention weights
+
+        residual = hidden_states
+        hidden_states = self.ln_2(hidden_states)
+        feed_forward_hidden_states = self.mlp(hidden_states)
+        # residual connection
+        hidden_states = residual + feed_forward_hidden_states
+
+        return (hidden_states,) + outputs
+
+
+@auto_docstring
+class ImageGPTPreTrainedModel(PreTrainedModel):
+    config: ImageGPTConfig
+    load_tf_weights = load_tf_weights_in_imagegpt
+    base_model_prefix = "transformer"
+    main_input_name = "input_ids"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["ImageGPTBlock"]
+
+    def __init__(self, *inputs, **kwargs):
+        super().__init__(*inputs, **kwargs)
+
+    def _init_weights(self, module):
+        """Initialize the weights."""
+        if isinstance(module, (nn.Linear, Conv1D)):
+            # Slightly different from the TF version which uses truncated_normal for initialization
+            # cf https://github.com/pytorch/pytorch/pull/5617
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+        elif isinstance(module, ImageGPTLayerNorm):
+            module.weight.data.fill_(1.0)
+
+        # Reinitialize selected weights subject to the OpenAI GPT-2 Paper Scheme:
+        #   > A modified initialization which accounts for the accumulation on the residual path with model depth. Scale
+        #   > the weights of residual layers at initialization by a factor of 1/√N where N is the # of residual layers.
+        #   >   -- GPT-2 :: https://openai.com/blog/better-language-models/
+        #
+        # Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py
+        for name, p in module.named_parameters():
+            if "c_proj" in name and "weight" in name:
+                # Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block
+                p.data.normal_(mean=0.0, std=(self.config.initializer_range / math.sqrt(2 * self.config.n_layer)))
+
+
+@auto_docstring
+class ImageGPTModel(ImageGPTPreTrainedModel):
+    def __init__(self, config: ImageGPTConfig):
+        super().__init__(config)
+
+        self.embed_dim = config.hidden_size
+
+        self.wte = nn.Embedding(config.vocab_size, self.embed_dim)
+        self.wpe = nn.Embedding(config.max_position_embeddings, self.embed_dim)
+
+        self.drop = nn.Dropout(config.embd_pdrop)
+        self.h = nn.ModuleList([ImageGPTBlock(config, layer_idx=i) for i in range(config.num_hidden_layers)])
+        self.ln_f = ImageGPTLayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)
+
+        # Model parallel
+        self.model_parallel = False
+        self.device_map = None
+        self.gradient_checkpointing = False
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.wte
+
+    def set_input_embeddings(self, new_embeddings):
+        self.wte = new_embeddings
+
+    def _prune_heads(self, heads_to_prune):
+        """
+        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
+        """
+        for layer, heads in heads_to_prune.items():
+            self.h[layer].attn.prune_heads(heads)
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        past_key_values: Optional[tuple[tuple[torch.Tensor]]] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.Tensor] = None,
+        **kwargs: Any,
+    ) -> Union[tuple, BaseModelOutputWithPastAndCrossAttentions]:
+        r"""
+        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
+            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
+            `past_key_values.get_seq_length()` (`sequence_length` of input past key value states). Indices of input
+            sequence tokens in the vocabulary.
+
+            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
+            `input_ids`.
+
+            Indices can be obtained using [`AutoImageProcessor`]. See [`ImageGPTImageProcessor.__call__`] for details.
+
+        Examples:
+
+        ```python
+        >>> from transformers import AutoImageProcessor, ImageGPTModel
+        >>> from PIL import Image
+        >>> import requests
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> image_processor = AutoImageProcessor.from_pretrained("openai/imagegpt-small")
+        >>> model = ImageGPTModel.from_pretrained("openai/imagegpt-small")
+
+        >>> inputs = image_processor(images=image, return_tensors="pt")
+        >>> outputs = model(**inputs)
+        >>> last_hidden_states = outputs.last_hidden_state
+        ```"""
+
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+        elif input_ids is not None:
+            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
+            input_shape = input_ids.size()
+            input_ids = input_ids.view(-1, input_shape[-1])
+            batch_size = input_ids.shape[0]
+        elif inputs_embeds is not None:
+            input_shape = inputs_embeds.size()[:-1]
+            batch_size = inputs_embeds.shape[0]
+        else:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+        device = input_ids.device if input_ids is not None else inputs_embeds.device
+
+        if self.gradient_checkpointing and self.training:
+            if use_cache:
+                logger.warning_once(
+                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+                )
+                use_cache = False
+
+        if use_cache and past_key_values is None:
+            past_key_values = EncoderDecoderCache(DynamicCache(config=self.config), DynamicCache(config=self.config))
+        if use_cache and isinstance(past_key_values, tuple):
+            logger.warning_once(
+                "Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. "
+                "You should pass an instance of `EncoderDecoderCache` instead, e.g. "
+                "`past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`."
+            )
+            past_key_values = EncoderDecoderCache.from_legacy_cache(past_key_values)
+
+        past_length = past_key_values.get_seq_length() if past_key_values is not None else past_key_values
+
+        if token_type_ids is not None:
+            token_type_ids = token_type_ids.view(-1, input_shape[-1])
+
+        if position_ids is None:
+            position_ids = torch.arange(past_length, input_shape[-1] + past_length, dtype=torch.long, device=device)
+            position_ids = position_ids.unsqueeze(0)
+
+        # ImageGPTAttention mask.
+        if attention_mask is not None:
+            if batch_size <= 0:
+                raise ValueError("batch_size has to be defined and > 0")
+            attention_mask = attention_mask.view(batch_size, -1)
+            # We create a 3D attention mask from a 2D tensor mask.
+            # Sizes are [batch_size, 1, 1, to_seq_length]
+            # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
+            # this attention mask is more simple than the triangular masking of causal attention
+            # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
+            attention_mask = attention_mask[:, None, None, :]
+
+            # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
+            # masked positions, this operation will create a tensor which is 0.0 for
+            # positions we want to attend and the dtype's smallest value for masked positions.
+            # Since we are adding it to the raw scores before the softmax, this is
+            # effectively the same as removing these entirely.
+            attention_mask = attention_mask.to(dtype=self.dtype)  # fp16 compatibility
+            attention_mask = (1.0 - attention_mask) * torch.finfo(self.dtype).min
+
+        # If a 2D or 3D attention mask is provided for the cross-attention
+        # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
+        if self.config.add_cross_attention and encoder_hidden_states is not None:
+            encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
+            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
+            if encoder_attention_mask is None:
+                encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
+            encoder_attention_mask = self.invert_attention_mask(encoder_attention_mask)
+        else:
+            encoder_attention_mask = None
+
+        # Prepare head mask if needed
+        # 1.0 in head_mask indicate we keep the head
+        # attention_probs has shape bsz x n_heads x N x N
+        # head_mask has shape n_layer x batch x n_heads x N x N
+        head_mask = self.get_head_mask(head_mask, self.config.n_layer)
+
+        if inputs_embeds is None:
+            inputs_embeds = self.wte(input_ids)
+        position_embeds = self.wpe(position_ids)
+        hidden_states = inputs_embeds + position_embeds.to(inputs_embeds.device)
+
+        if token_type_ids is not None:
+            token_type_embeds = self.wte(token_type_ids)
+            hidden_states = hidden_states + token_type_embeds
+
+        hidden_states = self.drop(hidden_states)
+        output_shape = input_shape + (hidden_states.size(-1),)
+
+        all_self_attentions = () if output_attentions else None
+        all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
+        all_hidden_states = () if output_hidden_states else None
+        for i, block in enumerate(self.h):
+            # Model parallel
+            if self.model_parallel:
+                torch.cuda.set_device(hidden_states.device)
+                # Ensure that attention_mask is always on the same device as hidden_states
+                if attention_mask is not None:
+                    attention_mask = attention_mask.to(hidden_states.device)
+                if isinstance(head_mask, torch.Tensor):
+                    head_mask = head_mask.to(hidden_states.device)
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)
+
+            outputs = block(
+                hidden_states,
+                past_key_values,
+                attention_mask,
+                head_mask[i],
+                encoder_hidden_states,  # as a positional argument for gradient checkpointing
+                encoder_attention_mask=encoder_attention_mask,
+                use_cache=use_cache,
+                output_attentions=output_attentions,
+                cache_position=cache_position,
+            )
+
+            hidden_states = outputs[0]
+            if output_attentions:
+                all_self_attentions = all_self_attentions + (outputs[1],)
+                if self.config.add_cross_attention:
+                    all_cross_attentions = all_cross_attentions + (outputs[2],)
+
+            # Model Parallel: If it's the last layer for that device, put things on the next device
+            if self.model_parallel:
+                for k, v in self.device_map.items():
+                    if i == v[-1] and "cuda:" + str(k) != self.last_device:
+                        hidden_states = hidden_states.to("cuda:" + str(k + 1))
+
+        hidden_states = self.ln_f(hidden_states)
+        hidden_states = hidden_states.view(*output_shape)
+
+        # Add last hidden state
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+
+        if not return_dict:
+            return tuple(
+                v
+                for v in [hidden_states, past_key_values, all_hidden_states, all_self_attentions, all_cross_attentions]
+                if v is not None
+            )
+
+        return BaseModelOutputWithPastAndCrossAttentions(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attentions,
+            cross_attentions=all_cross_attentions,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    The ImageGPT Model transformer with a language modeling head on top (linear layer with weights tied to the input
+    embeddings).
+    """
+)
+class ImageGPTForCausalImageModeling(ImageGPTPreTrainedModel, GenerationMixin):
+    _tied_weights_keys = ["lm_head.weight"]
+
+    def __init__(self, config: ImageGPTConfig):
+        super().__init__(config)
+        self.transformer = ImageGPTModel(config)
+        self.lm_head = nn.Linear(config.n_embd, config.vocab_size - 1, bias=False)
+
+        # Model parallel
+        self.model_parallel = False
+        self.device_map = None
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        past_key_values: Optional[tuple[tuple[torch.Tensor]]] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.Tensor] = None,
+        **kwargs: Any,
+    ) -> Union[tuple, CausalLMOutputWithCrossAttentions]:
+        r"""
+        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
+            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
+            `past_key_values.get_seq_length()` (`sequence_length` of input past key value states). Indices of input
+            sequence tokens in the vocabulary.
+
+            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
+            `input_ids`.
+
+            Indices can be obtained using [`AutoImageProcessor`]. See [`ImageGPTImageProcessor.__call__`] for details.
+        labels (`torch.LongTensor` of shape `(batch_size, input_ids_length)`, *optional*):
+            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
+            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
+            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
+
+        Examples:
+
+        ```python
+        >>> from transformers import AutoImageProcessor, ImageGPTForCausalImageModeling
+        >>> import torch
+        >>> import matplotlib.pyplot as plt
+        >>> import numpy as np
+
+        >>> image_processor = AutoImageProcessor.from_pretrained("openai/imagegpt-small")
+        >>> model = ImageGPTForCausalImageModeling.from_pretrained("openai/imagegpt-small")
+        >>> device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        >>> model.to(device)  # doctest: +IGNORE_RESULT
+
+        >>> # unconditional generation of 8 images
+        >>> batch_size = 4
+        >>> context = torch.full((batch_size, 1), model.config.vocab_size - 1)  # initialize with SOS token
+        >>> context = context.to(device)
+        >>> output = model.generate(
+        ...     input_ids=context, max_length=model.config.n_positions + 1, temperature=1.0, do_sample=True, top_k=40
+        ... )
+
+        >>> clusters = image_processor.clusters
+        >>> height = image_processor.size["height"]
+        >>> width = image_processor.size["width"]
+
+        >>> samples = output[:, 1:].detach().cpu().numpy()
+        >>> samples_img = [
+        ...     np.reshape(np.rint(127.5 * (clusters[s] + 1.0)), [height, width, 3]).astype(np.uint8) for s in samples
+        ... ]  # convert color cluster tokens back to pixels
+        >>> f, axes = plt.subplots(1, batch_size, dpi=300)
+
+        >>> for img, ax in zip(samples_img, axes):  # doctest: +IGNORE_RESULT
+        ...     ax.axis("off")
+        ...     ax.imshow(img)
+        ```"""
+
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        transformer_outputs = self.transformer(
+            input_ids,
+            past_key_values=past_key_values,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_attention_mask,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            cache_position=cache_position,
+        )
+        hidden_states = transformer_outputs[0]
+
+        lm_logits = self.lm_head(hidden_states)
+
+        loss = None
+        if labels is not None:
+            # Shift so that tokens < n predict n
+            shift_logits = lm_logits[..., :-1, :].contiguous()
+            shift_labels = labels[..., 1:].contiguous()
+            # Flatten the tokens
+            loss_fct = CrossEntropyLoss()
+            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
+
+        if not return_dict:
+            output = (lm_logits,) + transformer_outputs[1:]
+            return ((loss,) + output) if loss is not None else output
+
+        return CausalLMOutputWithCrossAttentions(
+            loss=loss,
+            logits=lm_logits,
+            past_key_values=transformer_outputs.past_key_values,
+            hidden_states=transformer_outputs.hidden_states,
+            attentions=transformer_outputs.attentions,
+            cross_attentions=transformer_outputs.cross_attentions,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    The ImageGPT Model transformer with an image classification head on top (linear layer).
+    [`ImageGPTForImageClassification`] average-pools the hidden states in order to do the classification.
+    """
+)
+class ImageGPTForImageClassification(ImageGPTPreTrainedModel):
+    def __init__(self, config: ImageGPTConfig):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+        self.transformer = ImageGPTModel(config)
+        self.score = nn.Linear(config.n_embd, self.num_labels, bias=False)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        past_key_values: Optional[tuple[tuple[torch.Tensor]]] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        **kwargs: Any,
+    ) -> Union[tuple, SequenceClassifierOutputWithPast]:
+        r"""
+        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
+            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
+            `past_key_values.get_seq_length()` (`sequence_length` of input past key value states). Indices of input
+            sequence tokens in the vocabulary.
+
+            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
+            `input_ids`.
+
+            Indices can be obtained using [`AutoImageProcessor`]. See [`ImageGPTImageProcessor.__call__`] for details.
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+
+        Examples:
+
+        ```python
+        >>> from transformers import AutoImageProcessor, ImageGPTForImageClassification
+        >>> from PIL import Image
+        >>> import requests
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> image_processor = AutoImageProcessor.from_pretrained("openai/imagegpt-small")
+        >>> model = ImageGPTForImageClassification.from_pretrained("openai/imagegpt-small")
+
+        >>> inputs = image_processor(images=image, return_tensors="pt")
+        >>> outputs = model(**inputs)
+        >>> logits = outputs.logits
+        ```"""
+
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        transformer_outputs = self.transformer(
+            input_ids,
+            past_key_values=past_key_values,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        hidden_states = transformer_outputs[0]
+        # average-pool the hidden states along the sequence dimension
+        pooled_hidden_states = hidden_states.mean(dim=1)
+        # project from (batch_size, hidden_size) to (batch_size, num_labels)
+        logits = self.score(pooled_hidden_states)
+
+        loss = None
+        if labels is not None:
+            if self.config.problem_type is None:
+                if self.num_labels == 1:
+                    self.config.problem_type = "regression"
+                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
+                    self.config.problem_type = "single_label_classification"
+                else:
+                    self.config.problem_type = "multi_label_classification"
+
+            if self.config.problem_type == "regression":
+                loss_fct = MSELoss()
+                if self.num_labels == 1:
+                    loss = loss_fct(logits.squeeze(), labels.squeeze())
+                else:
+                    loss = loss_fct(logits, labels)
+            elif self.config.problem_type == "single_label_classification":
+                loss_fct = CrossEntropyLoss()
+                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+            elif self.config.problem_type == "multi_label_classification":
+                loss_fct = BCEWithLogitsLoss()
+                loss = loss_fct(logits, labels)
+        if not return_dict:
+            output = (logits,) + transformer_outputs[1:]
+            return ((loss,) + output) if loss is not None else output
+
+        return SequenceClassifierOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=transformer_outputs.past_key_values,
+            hidden_states=transformer_outputs.hidden_states,
+            attentions=transformer_outputs.attentions,
+        )
+
+
+__all__ = [
+    "ImageGPTForCausalImageModeling",
+    "ImageGPTForImageClassification",
+    "ImageGPTModel",
+    "ImageGPTPreTrainedModel",
+    "load_tf_weights_in_imagegpt",
+]
diff --git a/phivenv/Lib/site-packages/transformers/models/informer/__init__.py b/phivenv/Lib/site-packages/transformers/models/informer/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..cd7a901eab235ecd14d2127a9eb78022c720c0f3
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/informer/__init__.py
@@ -0,0 +1,27 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_informer import *
+    from .modeling_informer import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/phivenv/Lib/site-packages/transformers/models/informer/__pycache__/__init__.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/informer/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..31cfaf62a2a9317cf0ba2dd1857f14347a8ce4c1
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/informer/__pycache__/__init__.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/informer/__pycache__/configuration_informer.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/informer/__pycache__/configuration_informer.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b33f4dcc042503e13e5f76b6fa6757b10a6fe67f
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/informer/__pycache__/configuration_informer.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/informer/__pycache__/modeling_informer.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/informer/__pycache__/modeling_informer.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..486462d6bea353be95ba7b900987b4bbeb413a33
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/informer/__pycache__/modeling_informer.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/informer/__pycache__/modular_informer.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/informer/__pycache__/modular_informer.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cfe65e0514beabd6a44d6153fc92bf7b8e0f3978
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/informer/__pycache__/modular_informer.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/informer/configuration_informer.py b/phivenv/Lib/site-packages/transformers/models/informer/configuration_informer.py
new file mode 100644
index 0000000000000000000000000000000000000000..f62417358c82ce0412336f86f2c9d32d1c019de9
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/informer/configuration_informer.py
@@ -0,0 +1,250 @@
+# coding=utf-8
+# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Informer model configuration"""
+
+from typing import Optional, Union
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class InformerConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of an [`InformerModel`]. It is used to instantiate an
+    Informer model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the Informer
+    [huggingface/informer-tourism-monthly](https://huggingface.co/huggingface/informer-tourism-monthly) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        prediction_length (`int`):
+            The prediction length for the decoder. In other words, the prediction horizon of the model. This value is
+            typically dictated by the dataset and we recommend to set it appropriately.
+        context_length (`int`, *optional*, defaults to `prediction_length`):
+            The context length for the encoder. If `None`, the context length will be the same as the
+            `prediction_length`.
+        distribution_output (`string`, *optional*, defaults to `"student_t"`):
+            The distribution emission head for the model. Could be either "student_t", "normal" or "negative_binomial".
+        loss (`string`, *optional*, defaults to `"nll"`):
+            The loss function for the model corresponding to the `distribution_output` head. For parametric
+            distributions it is the negative log likelihood (nll) - which currently is the only supported one.
+        input_size (`int`, *optional*, defaults to 1):
+            The size of the target variable which by default is 1 for univariate targets. Would be > 1 in case of
+            multivariate targets.
+        scaling (`string` or `bool`, *optional*, defaults to `"mean"`):
+            Whether to scale the input targets via "mean" scaler, "std" scaler or no scaler if `None`. If `True`, the
+            scaler is set to "mean".
+        lags_sequence (`list[int]`, *optional*, defaults to `[1, 2, 3, 4, 5, 6, 7]`):
+            The lags of the input time series as covariates often dictated by the frequency of the data. Default is
+            `[1, 2, 3, 4, 5, 6, 7]` but we recommend to change it based on the dataset appropriately.
+        num_time_features (`int`, *optional*, defaults to 0):
+            The number of time features in the input time series.
+        num_dynamic_real_features (`int`, *optional*, defaults to 0):
+            The number of dynamic real valued features.
+        num_static_categorical_features (`int`, *optional*, defaults to 0):
+            The number of static categorical features.
+        num_static_real_features (`int`, *optional*, defaults to 0):
+            The number of static real valued features.
+        cardinality (`list[int]`, *optional*):
+            The cardinality (number of different values) for each of the static categorical features. Should be a list
+            of integers, having the same length as `num_static_categorical_features`. Cannot be `None` if
+            `num_static_categorical_features` is > 0.
+        embedding_dimension (`list[int]`, *optional*):
+            The dimension of the embedding for each of the static categorical features. Should be a list of integers,
+            having the same length as `num_static_categorical_features`. Cannot be `None` if
+            `num_static_categorical_features` is > 0.
+        d_model (`int`, *optional*, defaults to 64):
+            Dimensionality of the transformer layers.
+        encoder_layers (`int`, *optional*, defaults to 2):
+            Number of encoder layers.
+        decoder_layers (`int`, *optional*, defaults to 2):
+            Number of decoder layers.
+        encoder_attention_heads (`int`, *optional*, defaults to 2):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        decoder_attention_heads (`int`, *optional*, defaults to 2):
+            Number of attention heads for each attention layer in the Transformer decoder.
+        encoder_ffn_dim (`int`, *optional*, defaults to 32):
+            Dimension of the "intermediate" (often named feed-forward) layer in encoder.
+        decoder_ffn_dim (`int`, *optional*, defaults to 32):
+            Dimension of the "intermediate" (often named feed-forward) layer in decoder.
+        activation_function (`str` or `function`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and decoder. If string, `"gelu"` and
+            `"relu"` are supported.
+        dropout (`float`, *optional*, defaults to 0.05):
+            The dropout probability for all fully connected layers in the encoder, and decoder.
+        encoder_layerdrop (`float`, *optional*, defaults to 0.1):
+            The dropout probability for the attention and fully connected layers for each encoder layer.
+        decoder_layerdrop (`float`, *optional*, defaults to 0.1):
+            The dropout probability for the attention and fully connected layers for each decoder layer.
+        attention_dropout (`float`, *optional*, defaults to 0.1):
+            The dropout probability for the attention probabilities.
+        activation_dropout (`float`, *optional*, defaults to 0.1):
+            The dropout probability used between the two layers of the feed-forward networks.
+        num_parallel_samples (`int`, *optional*, defaults to 100):
+            The number of samples to generate in parallel for each time step of inference.
+        init_std (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated normal weight initialization distribution.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether to use the past key/values attentions (if applicable to the model) to speed up decoding.
+        attention_type (`str`, *optional*, defaults to "prob"):
+            Attention used in encoder. This can be set to "prob" (Informer's ProbAttention) or "full" (vanilla
+            transformer's canonical self-attention).
+        sampling_factor (`int`, *optional*, defaults to 5):
+            ProbSparse sampling factor (only takes effect when `attention_type="prob"`). It is used to control the
+            reduced query matrix (Q_reduce) input length.
+        distil (`bool`, *optional*, defaults to `True`):
+            Whether to use distilling in encoder.
+
+    Example:
+
+    ```python
+    >>> from transformers import InformerConfig, InformerModel
+
+    >>> # Initializing an Informer configuration with 12 time steps for prediction
+    >>> configuration = InformerConfig(prediction_length=12)
+
+    >>> # Randomly initializing a model (with random weights) from the configuration
+    >>> model = InformerModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "informer"
+    attribute_map = {
+        "hidden_size": "d_model",
+        "num_attention_heads": "encoder_attention_heads",
+        "num_hidden_layers": "encoder_layers",
+        "initializer_range": "init_std",
+    }
+
+    def __init__(
+        self,
+        prediction_length: Optional[int] = None,
+        context_length: Optional[int] = None,
+        distribution_output: str = "student_t",
+        loss: str = "nll",
+        input_size: int = 1,
+        lags_sequence: Optional[list[int]] = None,
+        scaling: Optional[Union[str, bool]] = "mean",
+        num_dynamic_real_features: int = 0,
+        num_static_real_features: int = 0,
+        num_static_categorical_features: int = 0,
+        num_time_features: int = 0,
+        cardinality: Optional[list[int]] = None,
+        embedding_dimension: Optional[list[int]] = None,
+        d_model: int = 64,
+        encoder_ffn_dim: int = 32,
+        decoder_ffn_dim: int = 32,
+        encoder_attention_heads: int = 2,
+        decoder_attention_heads: int = 2,
+        encoder_layers: int = 2,
+        decoder_layers: int = 2,
+        is_encoder_decoder: bool = True,
+        activation_function: str = "gelu",
+        dropout: float = 0.05,
+        encoder_layerdrop: float = 0.1,
+        decoder_layerdrop: float = 0.1,
+        attention_dropout: float = 0.1,
+        activation_dropout: float = 0.1,
+        num_parallel_samples: int = 100,
+        init_std: float = 0.02,
+        use_cache=True,
+        # Informer arguments
+        attention_type: str = "prob",
+        sampling_factor: int = 5,
+        distil: bool = True,
+        **kwargs,
+    ):
+        # time series specific configuration
+        self.prediction_length = prediction_length
+        self.context_length = context_length or prediction_length
+        self.distribution_output = distribution_output
+        self.loss = loss
+        self.input_size = input_size
+        self.num_time_features = num_time_features
+        self.lags_sequence = lags_sequence if lags_sequence is not None else [1, 2, 3, 4, 5, 6, 7]
+        self.scaling = scaling
+        self.num_dynamic_real_features = num_dynamic_real_features
+        self.num_static_real_features = num_static_real_features
+        self.num_static_categorical_features = num_static_categorical_features
+
+        # set cardinality
+        if cardinality and num_static_categorical_features > 0:
+            if len(cardinality) != num_static_categorical_features:
+                raise ValueError(
+                    "The cardinality should be a list of the same length as `num_static_categorical_features`"
+                )
+            self.cardinality = cardinality
+        else:
+            self.cardinality = [0]
+
+        # set embedding_dimension
+        if embedding_dimension and num_static_categorical_features > 0:
+            if len(embedding_dimension) != num_static_categorical_features:
+                raise ValueError(
+                    "The embedding dimension should be a list of the same length as `num_static_categorical_features`"
+                )
+            self.embedding_dimension = embedding_dimension
+        else:
+            self.embedding_dimension = [min(50, (cat + 1) // 2) for cat in self.cardinality]
+
+        self.num_parallel_samples = num_parallel_samples
+
+        # Transformer architecture configuration
+        self.feature_size = input_size * len(self.lags_sequence) + self._number_of_features
+        self.d_model = d_model
+        self.encoder_attention_heads = encoder_attention_heads
+        self.decoder_attention_heads = decoder_attention_heads
+        self.encoder_ffn_dim = encoder_ffn_dim
+        self.decoder_ffn_dim = decoder_ffn_dim
+        self.encoder_layers = encoder_layers
+        self.decoder_layers = decoder_layers
+
+        self.dropout = dropout
+        self.attention_dropout = attention_dropout
+        self.activation_dropout = activation_dropout
+        self.encoder_layerdrop = encoder_layerdrop
+        self.decoder_layerdrop = decoder_layerdrop
+
+        self.activation_function = activation_function
+        self.init_std = init_std
+
+        self.use_cache = use_cache
+
+        # Informer
+        self.attention_type = attention_type
+        self.sampling_factor = sampling_factor
+        self.distil = distil
+
+        super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs)
+
+    @property
+    def _number_of_features(self) -> int:
+        return (
+            sum(self.embedding_dimension)
+            + self.num_dynamic_real_features
+            + self.num_time_features
+            + self.num_static_real_features
+            + self.input_size * 2  # the log1p(abs(loc)) and log(scale) features
+        )
+
+
+__all__ = ["InformerConfig"]
diff --git a/phivenv/Lib/site-packages/transformers/models/informer/modeling_informer.py b/phivenv/Lib/site-packages/transformers/models/informer/modeling_informer.py
new file mode 100644
index 0000000000000000000000000000000000000000..7e94a5305fbb4631aca03ffd7532dd6fcb3eb94e
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/informer/modeling_informer.py
@@ -0,0 +1,2160 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/informer/modular_informer.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_informer.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# coding=utf-8
+# Copyright 2023 Amazon and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Callable, Optional, Union
+
+import numpy as np
+import torch
+from torch import nn
+
+from ...activations import ACT2FN
+from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache
+from ...modeling_attn_mask_utils import (
+    _prepare_4d_attention_mask,
+    _prepare_4d_attention_mask_for_sdpa,
+    _prepare_4d_causal_attention_mask,
+    _prepare_4d_causal_attention_mask_for_sdpa,
+)
+from ...modeling_flash_attention_utils import FlashAttentionKwargs
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import (
+    BaseModelOutput,
+    BaseModelOutputWithPastAndCrossAttentions,
+    SampleTSPredictionOutput,
+    Seq2SeqTSModelOutput,
+    Seq2SeqTSPredictionOutput,
+)
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from ...processing_utils import Unpack
+from ...time_series_utils import NegativeBinomialOutput, NormalOutput, StudentTOutput
+from ...utils import auto_docstring, is_torch_flex_attn_available, logging
+from ...utils.deprecation import deprecate_kwarg
+from .configuration_informer import InformerConfig
+
+
+if is_torch_flex_attn_available():
+    from ...integrations.flex_attention import make_flex_block_causal_mask
+
+
+logger = logging.get_logger(__name__)
+
+
+class InformerFeatureEmbedder(nn.Module):
+    """
+    Embed a sequence of categorical features.
+
+    Args:
+        cardinalities (`list[int]`):
+            List of cardinalities of the categorical features.
+        embedding_dims (`list[int]`):
+            List of embedding dimensions of the categorical features.
+    """
+
+    def __init__(self, cardinalities: list[int], embedding_dims: list[int]) -> None:
+        super().__init__()
+
+        self.num_features = len(cardinalities)
+        self.embedders = nn.ModuleList([nn.Embedding(c, d) for c, d in zip(cardinalities, embedding_dims)])
+
+    def forward(self, features: torch.Tensor) -> torch.Tensor:
+        if self.num_features > 1:
+            # we slice the last dimension, giving an array of length
+            # self.num_features with shape (N,T) or (N)
+            cat_feature_slices = torch.chunk(features, self.num_features, dim=-1)
+        else:
+            cat_feature_slices = [features]
+
+        return torch.cat(
+            [
+                embed(cat_feature_slice.squeeze(-1))
+                for embed, cat_feature_slice in zip(self.embedders, cat_feature_slices)
+            ],
+            dim=-1,
+        )
+
+
+class InformerStdScaler(nn.Module):
+    """
+    Standardize features by computing the mean and standard deviation along the scaling dimension (dimension 1 by
+    default), and then normalize them by subtracting the mean and dividing by the standard deviation.
+    """
+
+    def __init__(self, config: InformerConfig):
+        super().__init__()
+        self.dim = config.scaling_dim if hasattr(config, "scaling_dim") else 1
+        self.keepdim = config.keepdim if hasattr(config, "keepdim") else True
+        self.minimum_scale = config.minimum_scale if hasattr(config, "minimum_scale") else 1e-5
+
+    def forward(
+        self, data: torch.Tensor, observed_indicator: torch.Tensor
+    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        """
+        Parameters:
+            data (`torch.Tensor` of shape `(batch_size, sequence_length, num_input_channels)`):
+                input for Batch norm calculation
+            observed_indicator (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`):
+                Calculating the scale on the observed indicator.
+        Returns:
+            tuple of `torch.Tensor` of shapes
+                (`(batch_size, sequence_length, num_input_channels)`,`(batch_size, 1, num_input_channels)`,
+                `(batch_size, 1, num_input_channels)`)
+        """
+        denominator = observed_indicator.sum(self.dim, keepdim=self.keepdim)
+        denominator = denominator.clamp_min(1.0)
+        loc = (data * observed_indicator).sum(self.dim, keepdim=self.keepdim) / denominator
+
+        variance = (((data - loc) * observed_indicator) ** 2).sum(self.dim, keepdim=self.keepdim) / denominator
+        scale = torch.sqrt(variance + self.minimum_scale)
+        return (data - loc) / scale, loc, scale
+
+
+class InformerMeanScaler(nn.Module):
+    """
+    Computes a scaling factor as the weighted average absolute value along the first dimension, and scales the data
+    accordingly.
+    """
+
+    def __init__(self, config: InformerConfig):
+        super().__init__()
+        self.dim = config.scaling_dim if hasattr(config, "scaling_dim") else 1
+        self.keepdim = config.keepdim if hasattr(config, "keepdim") else True
+        self.minimum_scale = config.minimum_scale if hasattr(config, "minimum_scale") else 1e-10
+        self.default_scale = config.default_scale if hasattr(config, "default_scale") else None
+
+    def forward(
+        self, data: torch.Tensor, observed_indicator: torch.Tensor
+    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        """
+        Parameters:
+            data (`torch.Tensor` of shape `(batch_size, sequence_length, num_input_channels)`):
+                input for Batch norm calculation
+            observed_indicator (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`):
+                Calculating the scale on the observed indicator.
+        Returns:
+            tuple of `torch.Tensor` of shapes
+                (`(batch_size, sequence_length, num_input_channels)`,`(batch_size, 1, num_input_channels)`,
+                `(batch_size, 1, num_input_channels)`)
+        """
+        ts_sum = (data * observed_indicator).abs().sum(self.dim, keepdim=True)
+        num_observed = observed_indicator.sum(self.dim, keepdim=True)
+
+        scale = ts_sum / torch.clamp(num_observed, min=1)
+
+        # If `default_scale` is provided, we use it, otherwise we use the scale
+        # of the batch.
+        if self.default_scale is None:
+            batch_sum = ts_sum.sum(dim=0)
+            batch_observations = torch.clamp(num_observed.sum(0), min=1)
+            default_scale = torch.squeeze(batch_sum / batch_observations)
+        else:
+            default_scale = self.default_scale * torch.ones_like(scale)
+
+        # apply default scale where there are no observations
+        scale = torch.where(num_observed > 0, scale, default_scale)
+
+        # ensure the scale is at least `self.minimum_scale`
+        scale = torch.clamp(scale, min=self.minimum_scale)
+        scaled_data = data / scale
+
+        if not self.keepdim:
+            scale = scale.squeeze(dim=self.dim)
+
+        return scaled_data, torch.zeros_like(scale), scale
+
+
+class InformerNOPScaler(nn.Module):
+    """
+    Assigns a scaling factor equal to 1 along the first dimension, and therefore applies no scaling to the input data.
+    """
+
+    def __init__(self, config: InformerConfig):
+        super().__init__()
+        self.dim = config.scaling_dim if hasattr(config, "scaling_dim") else 1
+        self.keepdim = config.keepdim if hasattr(config, "keepdim") else True
+
+    def forward(
+        self, data: torch.Tensor, observed_indicator: Optional[torch.Tensor] = None
+    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        """
+        Parameters:
+            data (`torch.Tensor` of shape `(batch_size, sequence_length, num_input_channels)`):
+                input for Batch norm calculation
+        Returns:
+            tuple of `torch.Tensor` of shapes
+                (`(batch_size, sequence_length, num_input_channels)`,`(batch_size, 1, num_input_channels)`,
+                `(batch_size, 1, num_input_channels)`)
+        """
+        scale = torch.ones_like(data, requires_grad=False).mean(dim=self.dim, keepdim=self.keepdim)
+        loc = torch.zeros_like(data, requires_grad=False).mean(dim=self.dim, keepdim=self.keepdim)
+        return data, loc, scale
+
+
+class InformerSinusoidalPositionalEmbedding(nn.Embedding):
+    """This module produces sinusoidal positional embeddings of any length."""
+
+    def __init__(self, num_positions: int, embedding_dim: int, padding_idx: Optional[int] = None) -> None:
+        super().__init__(num_positions, embedding_dim)
+
+    def _init_weight(self):
+        """
+        Identical to the XLM create_sinusoidal_embeddings except features are not interleaved. The cos features are in
+        the 2nd half of the vector. [dim // 2:]
+        """
+        n_pos, dim = self.weight.shape
+        position_enc = np.array(
+            [[pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)] for pos in range(n_pos)]
+        )
+        out = torch.empty(n_pos, dim, dtype=self.weight.dtype, requires_grad=False)
+        sentinel = dim // 2 if dim % 2 == 0 else (dim // 2) + 1
+        out[:, 0:sentinel] = torch.FloatTensor(np.sin(position_enc[:, 0::2]))
+        out[:, sentinel:] = torch.FloatTensor(np.cos(position_enc[:, 1::2]))
+        self.weight = nn.Parameter(out, requires_grad=False)
+
+    @torch.no_grad()
+    def forward(
+        self, input_ids_shape: torch.Size, past_key_values_length: int = 0, position_ids: Optional[torch.Tensor] = None
+    ) -> torch.Tensor:
+        """`input_ids_shape` is expected to be [bsz x seqlen]."""
+        if position_ids is None:
+            bsz, seq_len = input_ids_shape[:2]
+            position_ids = torch.arange(
+                past_key_values_length, past_key_values_length + seq_len, dtype=torch.long, device=self.weight.device
+            )
+        return super().forward(position_ids)
+
+
+class InformerValueEmbedding(nn.Module):
+    def __init__(self, feature_size, d_model):
+        super().__init__()
+        self.value_projection = nn.Linear(in_features=feature_size, out_features=d_model, bias=False)
+
+    def forward(self, x):
+        return self.value_projection(x)
+
+
+@auto_docstring
+class InformerPreTrainedModel(PreTrainedModel):
+    config: InformerConfig
+    base_model_prefix = "model"
+    main_input_name = "past_values"
+    supports_gradient_checkpointing = True
+
+    def _init_weights(self, module: nn.Module):
+        super()._init_weights(module)
+        if isinstance(module, InformerSinusoidalPositionalEmbedding):
+            module._init_weight()
+
+    # Copied from transformers.models.bart.modeling_bart.BartPreTrainedModel._update_full_mask
+    def _update_full_mask(
+        self,
+        attention_mask: Union[torch.Tensor, None],
+        inputs_embeds: torch.Tensor,
+    ):
+        if attention_mask is not None:
+            if self.config._attn_implementation == "flash_attention_2":
+                attention_mask = attention_mask if 0 in attention_mask else None
+            elif self.config._attn_implementation == "sdpa":
+                # output_attentions=True & head_mask can not be supported when using SDPA, fall back to
+                # the manual implementation that requires a 4D causal mask in all cases.
+                # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+                attention_mask = _prepare_4d_attention_mask_for_sdpa(attention_mask, inputs_embeds.dtype)
+            elif self.config._attn_implementation == "flex_attention":
+                if isinstance(attention_mask, torch.Tensor):
+                    attention_mask = make_flex_block_causal_mask(attention_mask, is_causal=False)
+            else:
+                # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+                attention_mask = _prepare_4d_attention_mask(attention_mask, inputs_embeds.dtype)
+
+        return attention_mask
+
+    # Copied from transformers.models.musicgen.modeling_musicgen.MusicgenDecoder._update_causal_mask
+    def _update_causal_mask(
+        self,
+        attention_mask: Union[torch.Tensor, None],
+        input_shape: torch.Size,
+        inputs_embeds: torch.Tensor,
+        past_key_values_length: int,
+    ):
+        if self.config._attn_implementation == "flash_attention_2":
+            # 2d mask is passed through the layers
+            attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
+        elif self.config._attn_implementation == "sdpa":
+            # output_attentions=True & cross_attn_head_mask can not be supported when using SDPA, and we fall back on
+            # the manual implementation that requires a 4D causal mask in all cases.
+            attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
+                attention_mask,
+                input_shape,
+                inputs_embeds,
+                past_key_values_length,
+            )
+        elif self.config._attn_implementation == "flex_attention":
+            if isinstance(attention_mask, torch.Tensor):
+                attention_mask = make_flex_block_causal_mask(attention_mask)
+            # Other attention flavors support in-built causal (when `mask is None`)
+            # while we need to create our specific block mask regardless
+            elif attention_mask is None:
+                attention_mask = make_flex_block_causal_mask(
+                    torch.ones(
+                        size=(input_shape),
+                        device=inputs_embeds.device,
+                    )
+                )
+        else:
+            # 4d mask is passed through the layers
+            attention_mask = _prepare_4d_causal_attention_mask(
+                attention_mask, input_shape, inputs_embeds, past_key_values_length
+            )
+
+        return attention_mask
+
+    # Copied from transformers.models.musicgen.modeling_musicgen.MusicgenDecoder._update_cross_attn_mask
+    def _update_cross_attn_mask(
+        self,
+        encoder_hidden_states: Union[torch.Tensor, None],
+        encoder_attention_mask: Union[torch.Tensor, None],
+        input_shape: torch.Size,
+        inputs_embeds: torch.Tensor,
+    ):
+        # adapt the encoder attention mask to the configured attention backend; returned unchanged if either input is None
+        if encoder_hidden_states is not None and encoder_attention_mask is not None:
+            if self.config._attn_implementation == "flash_attention_2":  # keep the mask only if it contains a 0
+                encoder_attention_mask = encoder_attention_mask if 0 in encoder_attention_mask else None
+            elif self.config._attn_implementation == "sdpa":
+                # output_attentions=True & cross_attn_head_mask can not be supported when using SDPA, and we fall back on
+                # the manual implementation that requires a 4D mask in all cases.
+                # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+                encoder_attention_mask = _prepare_4d_attention_mask_for_sdpa(
+                    encoder_attention_mask,
+                    inputs_embeds.dtype,
+                    tgt_len=input_shape[-1],
+                )
+            elif self.config._attn_implementation == "flex_attention":
+                if isinstance(encoder_attention_mask, torch.Tensor):  # non-tensor masks are passed through as-is
+                    encoder_attention_mask = make_flex_block_causal_mask(
+                        encoder_attention_mask,
+                        query_length=input_shape[-1],
+                        is_causal=False,
+                    )
+            else:
+                # any other backend: [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+                encoder_attention_mask = _prepare_4d_attention_mask(
+                    encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]
+                )
+
+        return encoder_attention_mask
+
+
+def eager_attention_forward(
+    module: nn.Module,  # only `module.training` is read here, to gate dropout
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: Optional[torch.Tensor],  # added to the raw scores before softmax when not None
+    scaling: Optional[float] = None,  # defaults to query.size(-1) ** -0.5
+    dropout: float = 0.0,
+    head_mask: Optional[torch.Tensor] = None,  # viewed as (1, -1, 1, 1) and multiplied into the probabilities
+    **kwargs,  # accepted but not read in this function
+):
+    if scaling is None:
+        scaling = query.size(-1) ** -0.5
+
+    attn_weights = torch.matmul(query, key.transpose(2, 3)) * scaling  # scaled dot-product scores
+    if attention_mask is not None:
+        attn_weights = attn_weights + attention_mask
+
+    attn_weights = nn.functional.softmax(attn_weights, dim=-1)
+
+    if head_mask is not None:
+        attn_weights = attn_weights * head_mask.view(1, -1, 1, 1)
+
+    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
+    attn_output = torch.matmul(attn_weights, value)
+    attn_output = attn_output.transpose(1, 2).contiguous()  # swap dims 1 and 2 before returning
+
+    return attn_output, attn_weights  # the returned weights are the post-dropout probabilities
+
+
+class InformerAttention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper (cross-attention when `key_value_states` is given)"""
+
+    def __init__(
+        self,
+        embed_dim: int,
+        num_heads: int,
+        dropout: float = 0.0,
+        is_decoder: bool = False,
+        bias: bool = True,
+        is_causal: bool = False,
+        config: Optional[InformerConfig] = None,
+        layer_idx: Optional[int] = None,
+    ):
+        super().__init__()
+        self.embed_dim = embed_dim
+        self.num_heads = num_heads
+        self.dropout = dropout
+        self.head_dim = embed_dim // num_heads
+        self.config = config  # forward() dereferences config._attn_implementation, so None fails there
+
+        if (self.head_dim * num_heads) != self.embed_dim:
+            raise ValueError(
+                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
+                f" and `num_heads`: {num_heads})."
+            )
+        self.scaling = self.head_dim**-0.5
+        self.is_decoder = is_decoder
+        self.is_causal = is_causal  # not read in this class; presumably consumed by the attention backend - TODO confirm
+        self.layer_idx = layer_idx  # index used for the cache lookups/updates in forward()
+        if layer_idx is None and self.is_decoder:
+            logger.warning_once(
+                f"Instantiating a decoder {self.__class__.__name__} without passing `layer_idx` is not recommended and "
+                "will lead to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` "
+                "when creating this class."
+            )
+
+        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        key_value_states: Optional[torch.Tensor] = None,
+        past_key_values: Optional[Cache] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        layer_head_mask: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+        cache_position: Optional[torch.Tensor] = None,
+        # TODO: we need a refactor so that the different attention modules can get their specific kwargs
+        # ATM, we have mixed things encoder, decoder, and encoder-decoder attn
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
+        """Input shape: Batch x Time x Channel. Returns `(attn_output, attn_weights)` (two values)."""
+
+        # if key_value_states are provided this layer is used as a cross-attention layer
+        # for the decoder
+        is_cross_attention = key_value_states is not None
+
+        # determine input shapes
+        bsz, tgt_len = hidden_states.shape[:-1]
+        src_len = key_value_states.shape[1] if is_cross_attention else tgt_len
+
+        q_input_shape = (bsz, tgt_len, -1, self.head_dim)  # -1 lets view() infer the number of heads
+        kv_input_shape = (bsz, src_len, -1, self.head_dim)
+
+        # get query proj
+        query_states = self.q_proj(hidden_states).view(*q_input_shape).transpose(1, 2)
+
+        if past_key_values is not None:
+            if isinstance(past_key_values, EncoderDecoderCache):
+                is_updated = past_key_values.is_updated.get(self.layer_idx)  # only bound on this branch
+                if is_cross_attention:
+                    # after the first generated id, we can subsequently re-use all key/value_states from cache
+                    curr_past_key_value = past_key_values.cross_attention_cache
+                else:
+                    curr_past_key_value = past_key_values.self_attention_cache
+            else:
+                curr_past_key_value = past_key_values
+
+        current_states = key_value_states if is_cross_attention else hidden_states
+        if is_cross_attention and past_key_values is not None and is_updated:  # NOTE(review): needs EncoderDecoderCache
+            # reuse k,v, cross_attentions
+            key_states = curr_past_key_value.layers[self.layer_idx].keys
+            value_states = curr_past_key_value.layers[self.layer_idx].values
+        else:
+            key_states = self.k_proj(current_states)
+            value_states = self.v_proj(current_states)
+            key_states = key_states.view(*kv_input_shape).transpose(1, 2)
+            value_states = value_states.view(*kv_input_shape).transpose(1, 2)
+
+            if past_key_values is not None:
+                # save all key/value_states to cache to be re-used for fast auto-regressive generation
+                cache_position = cache_position if not is_cross_attention else None  # no position for cross-attn
+                key_states, value_states = curr_past_key_value.update(
+                    key_states, value_states, self.layer_idx, {"cache_position": cache_position}
+                )
+                # set flag that curr layer for cross-attn is already updated so we can re-use in subsequent calls
+                if is_cross_attention:
+                    past_key_values.is_updated[self.layer_idx] = True
+
+        attention_interface: Callable = eager_attention_forward
+        if self.config._attn_implementation != "eager":  # look up the configured non-eager backend
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            dropout=0.0 if not self.training else self.dropout,
+            scaling=self.scaling,
+            output_attentions=output_attentions,
+            head_mask=layer_head_mask,
+            **kwargs,
+        )
+
+        attn_output = attn_output.reshape(bsz, tgt_len, -1).contiguous()  # flatten the trailing dims into one
+        attn_output = self.out_proj(attn_output)
+
+        return attn_output, attn_weights
+
+
+class InformerProbSparseAttention(nn.Module):
+    """Probabilistic Attention mechanism to select the "active"
+    queries rather than the "lazy" queries and provides a sparse Transformer thus mitigating the quadratic compute and
+    memory requirements of vanilla attention"""
+
+    def __init__(
+        self,
+        embed_dim: int,
+        num_heads: int,
+        dropout: float = 0.0,
+        is_decoder: bool = False,
+        sampling_factor: int = 5,
+        bias: bool = True,
+        layer_idx: Optional[int] = None,
+    ):
+        super().__init__()
+        self.factor = sampling_factor  # multiplier for both the sampled-key count and the kept-query count in forward()
+        self.embed_dim = embed_dim
+        self.num_heads = num_heads
+        self.dropout = dropout
+        self.head_dim = embed_dim // num_heads
+
+        if (self.head_dim * num_heads) != self.embed_dim:
+            raise ValueError(
+                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
+                f" and `num_heads`: {num_heads})."
+            )
+        self.scaling = self.head_dim**-0.5
+        self.is_decoder = is_decoder  # selects the cumsum (True) or mean (False) context in forward()
+        self.layer_idx = layer_idx
+
+        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+
+    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):  # -> (bsz, num_heads, seq_len, head_dim)
+        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        key_value_states: Optional[torch.Tensor] = None,
+        past_key_values: Optional[tuple[torch.Tensor]] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        layer_head_mask: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+        cache_position: Optional[torch.Tensor] = None,
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
+        """Input shape: Batch x Time x Channel. Returns `(attn_output, attn_weights or None)`."""
+
+        # if key_value_states are provided this layer is used as a cross-attention layer
+        # for the decoder
+        is_cross_attention = key_value_states is not None
+
+        bsz, tgt_len, _ = hidden_states.size()
+        src_len = key_value_states.shape[1] if is_cross_attention else tgt_len
+        kv_input_shape = (bsz, src_len, -1, self.head_dim)
+
+        # get query proj
+        query_states = self.q_proj(hidden_states) * self.scaling  # scaling is folded into the queries
+
+        if past_key_values is not None:
+            if isinstance(past_key_values, EncoderDecoderCache):
+                is_updated = past_key_values.is_updated.get(self.layer_idx)  # only bound on this branch
+                if is_cross_attention:
+                    # after the first generated id, we can subsequently re-use all key/value_states from cache
+                    curr_past_key_value = past_key_values.cross_attention_cache
+                else:
+                    curr_past_key_value = past_key_values.self_attention_cache
+            else:
+                curr_past_key_value = past_key_values
+
+        current_states = key_value_states if is_cross_attention else hidden_states
+        if is_cross_attention and past_key_values is not None and is_updated:
+            # reuse k,v, cross_attentions
+            key_states = curr_past_key_value.layers[self.layer_idx].keys
+            value_states = curr_past_key_value.layers[self.layer_idx].values
+        else:
+            key_states = self.k_proj(current_states)
+            value_states = self.v_proj(current_states)
+            key_states = key_states.view(*kv_input_shape).transpose(1, 2)
+            value_states = value_states.view(*kv_input_shape).transpose(1, 2)
+
+            if past_key_values is not None:
+                # save all key/value_states to cache to be re-used for fast auto-regressive generation
+                cache_position = cache_position if not is_cross_attention else None
+                key_states, value_states = curr_past_key_value.update(
+                    key_states, value_states, self.layer_idx, {"cache_position": cache_position}
+                )
+                # set flag that curr layer for cross-attn is already updated so we can re-use in subsequent calls
+                if is_cross_attention:
+                    past_key_values.is_updated[self.layer_idx] = True
+
+        proj_shape = (bsz * self.num_heads, -1, self.head_dim)  # batch and heads merged into dim 0 for bmm
+        query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
+        key_states = key_states.reshape(*proj_shape)
+        value_states = value_states.reshape(*proj_shape)
+
+        key_states_time_length = key_states.size(1)  # L_K
+        log_key_states_time_length = np.ceil(np.log1p(key_states_time_length)).astype("int").item()  # ceil(ln(1 + L_K))
+
+        query_states_time_length = query_states.size(1)  # L_Q
+        log_query_states_time_length = np.ceil(np.log1p(query_states_time_length)).astype("int").item()  # ceil(ln(1 + L_Q))
+
+        u_part = min(self.factor * query_states_time_length * log_key_states_time_length, key_states_time_length)  # keys sampled
+        u = min(self.factor * log_query_states_time_length, query_states_time_length)  # queries kept
+
+        if key_states_time_length > 0:
+            index_sample = torch.randint(0, key_states_time_length, (u_part,))  # one index set shared by all rows
+            k_sample = key_states[:, index_sample, :]
+        else:
+            k_sample = key_states
+
+        queries_keys_sample = torch.bmm(query_states, k_sample.transpose(1, 2))  # Q_K_sampled
+
+        # find the Top_k query with sparsity measurement
+        if u > 0:
+            sparsity_measurement = queries_keys_sample.max(dim=-1)[0] - torch.div(
+                queries_keys_sample.sum(dim=-1), key_states_time_length
+            )  # M: max over the sampled keys minus (sum over the sampled keys / L_K)
+            top_u_sparsity_measurement = sparsity_measurement.topk(u, sorted=False)[1]  # M_top
+
+            # calculate q_reduce: query_states[:, top_u_sparsity_measurement]
+            dim_for_slice = torch.arange(query_states.size(0)).unsqueeze(-1)
+            q_reduce = query_states[dim_for_slice, top_u_sparsity_measurement]
+        else:
+            q_reduce = query_states  # u == 0: no top-u selection, every query is kept
+            top_u_sparsity_measurement = None
+
+        # Use q_reduce to calculate attention weights
+        attn_weights = torch.bmm(q_reduce, key_states.transpose(1, 2))
+
+        src_len = key_states.size(1)  # recomputed from the (possibly cached) keys
+        if attn_weights.size() != (bsz * self.num_heads, u, src_len):
+            raise ValueError(
+                f"Attention weights should be of size {(bsz * self.num_heads, u, src_len)}, but is"
+                f" {attn_weights.size()}"
+            )
+
+        if attention_mask is not None:
+            if attention_mask.size() != (bsz, 1, tgt_len, src_len):
+                raise ValueError(
+                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
+                )
+            prob_mask = attention_mask.expand(bsz, self.num_heads, tgt_len, src_len).reshape(
+                bsz * self.num_heads, tgt_len, src_len
+            )
+
+            if top_u_sparsity_measurement is not None:  # keep only the mask rows of the selected queries
+                dim_for_slice = torch.arange(prob_mask.size(0)).unsqueeze(-1)
+                prob_mask = prob_mask[dim_for_slice, top_u_sparsity_measurement, :]
+
+            attn_weights = attn_weights.view(bsz, self.num_heads, u, src_len) + prob_mask.view(
+                bsz, self.num_heads, u, src_len
+            )
+            attn_weights = attn_weights.view(bsz * self.num_heads, u, src_len)
+
+        attn_weights = nn.functional.softmax(attn_weights, dim=-1)
+
+        if layer_head_mask is not None:
+            if layer_head_mask.size() != (self.num_heads,):
+                raise ValueError(
+                    f"Head mask for a single layer should be of size {(self.num_heads,)}, but is"
+                    f" {layer_head_mask.size()}"
+                )
+            attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, u, src_len)
+            attn_weights = attn_weights.view(bsz * self.num_heads, u, src_len)
+
+        if output_attentions:
+            # this operation is a bit awkward, but it's required to
+            # make sure that attn_weights keeps its gradient.
+            # In order to do so, attn_weights have to be reshaped
+            # twice and have to be reused in the following
+            attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, u, src_len)
+            attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, u, src_len)
+        else:
+            attn_weights_reshaped = None
+
+        attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
+        attn_output = torch.bmm(attn_probs, value_states)  # one output row per selected query
+
+        # calculate context for updating the attn_output, based on:
+        # https://github.com/zhouhaoyi/Informer2020/blob/ac59c7447135473fb2aafeafe94395f884d5c7a5/models/attn.py#L74
+        if self.is_decoder:
+            # cast to float32 before operation to avoid overflow
+            context = value_states.cumsum(dim=-2, dtype=torch.float32).to(value_states.dtype)
+        else:
+            v_mean_dim_time = value_states.mean(dim=-2)
+            context = (
+                v_mean_dim_time.unsqueeze(dim=1)
+                .expand(bsz * self.num_heads, query_states_time_length, v_mean_dim_time.size(-1))
+                .clone()  # clone so the in-place index assignment below writes into real storage
+            )
+
+        if top_u_sparsity_measurement is not None:
+            # update context: copy the attention output to the context at top_u_sparsity_measurement index
+            dim_for_slice = torch.arange(context.size(0)).unsqueeze(-1)
+            context[dim_for_slice, top_u_sparsity_measurement, :] = attn_output
+            attn_output = context
+
+        if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
+            raise ValueError(
+                f"`attn_output` should be of size {(bsz * self.num_heads, tgt_len, self.head_dim)}, but is"
+                f" {attn_output.size()}"
+            )
+
+        attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
+        attn_output = attn_output.transpose(1, 2)
+
+        # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be
+        # partitioned across GPUs when using tensor-parallelism.
+        attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)
+
+        attn_output = self.out_proj(attn_output)
+
+        return attn_output, attn_weights_reshaped
+
+
+# source: https://github.com/zhouhaoyi/Informer2020/blob/main/models/encoder.py
+class InformerConvLayer(GradientCheckpointingLayer):
+    def __init__(self, c_in):
+        super().__init__()
+        self.downConv = nn.Conv1d(  # channel-preserving conv: kernel 3, circular padding 1
+            in_channels=c_in,
+            out_channels=c_in,
+            kernel_size=3,
+            padding=1,
+            padding_mode="circular",
+        )
+        self.norm = nn.BatchNorm1d(c_in)
+        self.activation = nn.ELU()
+        self.maxPool = nn.MaxPool1d(kernel_size=3, stride=2, padding=1)  # stride 2 shortens the last axis
+
+    def forward(self, x):
+        x = self.downConv(x.permute(0, 2, 1))  # move dim 2 to dim 1 so Conv1d treats it as channels
+        x = self.norm(x)
+        x = self.activation(x)
+        x = self.maxPool(x)
+        x = x.transpose(1, 2)  # swap dims 1 and 2 back to the input layout
+        return x
+
+
+class InformerEncoderLayer(GradientCheckpointingLayer):
+    def __init__(self, config: InformerConfig):
+        super().__init__()
+        self.embed_dim = config.d_model
+        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
+        self.dropout = config.dropout
+        self.activation_fn = ACT2FN[config.activation_function]
+        self.activation_dropout = config.activation_dropout
+        self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim)
+        self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim)
+        self.final_layer_norm = nn.LayerNorm(self.embed_dim)
+
+        if config.attention_type == "prob":  # ProbSparse self-attention; any other value selects InformerAttention
+            self.self_attn = InformerProbSparseAttention(
+                embed_dim=self.embed_dim,
+                num_heads=config.encoder_attention_heads,
+                dropout=config.attention_dropout,
+                sampling_factor=config.sampling_factor,
+            )
+        else:
+            self.self_attn = InformerAttention(
+                embed_dim=self.embed_dim,
+                num_heads=config.encoder_attention_heads,
+                dropout=config.attention_dropout,
+                config=config,
+            )
+
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        attention_mask: torch.FloatTensor,
+        layer_head_mask: torch.FloatTensor,
+        output_attentions: Optional[bool] = False,
+    ) -> tuple[torch.FloatTensor, Optional[torch.FloatTensor]]:
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+            attention_mask (`torch.FloatTensor`): attention mask of size
+                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+            layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
+                `(encoder_attention_heads,)`.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+        """
+        residual = hidden_states
+        hidden_states, attn_weights = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            layer_head_mask=layer_head_mask,
+            output_attentions=output_attentions,
+        )
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+        hidden_states = residual + hidden_states
+        hidden_states = self.self_attn_layer_norm(hidden_states)  # LayerNorm is applied after the residual add
+
+        residual = hidden_states
+        hidden_states = self.activation_fn(self.fc1(hidden_states))
+        hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
+        hidden_states = self.fc2(hidden_states)
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+        hidden_states = residual + hidden_states
+        hidden_states = self.final_layer_norm(hidden_states)
+
+        if hidden_states.dtype == torch.float16 and (  # fp16 only: guard against overflowed activations
+            torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any()
+        ):
+            clamp_value = torch.finfo(hidden_states.dtype).max - 1000  # just below the largest finite fp16 value
+            hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
+
+        outputs = (hidden_states,)
+
+        if output_attentions:
+            outputs += (attn_weights,)  # the tuple has one element otherwise
+
+        return outputs
+
+
+class InformerDecoderLayer(GradientCheckpointingLayer):
+    def __init__(self, config: InformerConfig, layer_idx: Optional[int] = None):
+        super().__init__()
+        self.embed_dim = config.d_model
+        self.dropout = config.dropout
+        self.activation_fn = ACT2FN[config.activation_function]
+        self.activation_dropout = config.activation_dropout
+
+        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
+        self.encoder_attn = InformerAttention(  # cross-attention is always InformerAttention, whatever attention_type is
+            self.embed_dim,
+            config.decoder_attention_heads,
+            dropout=config.attention_dropout,
+            is_decoder=True,
+            config=config,
+            layer_idx=layer_idx,
+        )
+        self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim)
+        self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim)
+        self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim)
+        self.final_layer_norm = nn.LayerNorm(self.embed_dim)
+
+        if config.attention_type == "prob":  # only the self-attention flavour depends on attention_type
+            self.self_attn = InformerProbSparseAttention(
+                embed_dim=self.embed_dim,
+                num_heads=config.decoder_attention_heads,
+                dropout=config.attention_dropout,
+                sampling_factor=config.sampling_factor,
+                is_decoder=True,
+                layer_idx=layer_idx,
+            )
+        else:
+            self.self_attn = InformerAttention(
+                embed_dim=self.embed_dim,
+                num_heads=config.decoder_attention_heads,
+                dropout=config.attention_dropout,
+                is_decoder=True,
+                config=config,
+                layer_idx=layer_idx,
+            )
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        layer_head_mask: Optional[torch.Tensor] = None,
+        cross_attn_layer_head_mask: Optional[torch.Tensor] = None,
+        past_key_values: Optional[Cache] = None,
+        output_attentions: Optional[bool] = False,
+        use_cache: Optional[bool] = True,  # accepted but not read in this method
+        cache_position: Optional[torch.Tensor] = None,
+    ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+            attention_mask (`torch.FloatTensor`): attention mask of size
+                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+            encoder_hidden_states (`torch.FloatTensor`):
+                cross attention input to the layer of shape `(batch, seq_len, embed_dim)`
+            encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size
+                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+            layer_head_mask (`torch.FloatTensor`): mask for self-attention heads in a given layer of size
+                `(decoder_attention_heads,)`.
+            cross_attn_layer_head_mask (`torch.FloatTensor`): mask for cross-attention heads in a given layer of
+                size `(decoder_attention_heads,)`.
+            past_key_values (`Cache`, *optional*): cache object handed to both the self- and the cross-attention module
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
+                Indices depicting the position of the input sequence tokens in the sequence. It is used to update the
+                cache in the correct position and to infer the complete sequence length.
+        """
+        residual = hidden_states
+
+        # Self Attention
+        hidden_states, self_attn_weights = self.self_attn(
+            hidden_states=hidden_states,
+            past_key_values=past_key_values,
+            attention_mask=attention_mask,
+            layer_head_mask=layer_head_mask,
+            output_attentions=output_attentions,
+            cache_position=cache_position,
+        )
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+        hidden_states = residual + hidden_states
+        hidden_states = self.self_attn_layer_norm(hidden_states)  # LayerNorm is applied after the residual add
+
+        # Cross-Attention Block
+        cross_attn_weights = None  # stays None when no encoder_hidden_states are given
+        if encoder_hidden_states is not None:
+            residual = hidden_states
+
+            hidden_states, cross_attn_weights = self.encoder_attn(
+                hidden_states=hidden_states,
+                key_value_states=encoder_hidden_states,
+                attention_mask=encoder_attention_mask,
+                layer_head_mask=cross_attn_layer_head_mask,
+                past_key_values=past_key_values,
+                output_attentions=output_attentions,
+                cache_position=cache_position,
+            )
+            hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+            hidden_states = residual + hidden_states
+            hidden_states = self.encoder_attn_layer_norm(hidden_states)
+
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.activation_fn(self.fc1(hidden_states))
+        hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
+        hidden_states = self.fc2(hidden_states)
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+        hidden_states = residual + hidden_states
+        hidden_states = self.final_layer_norm(hidden_states)
+
+        outputs = (hidden_states,)
+
+        if output_attentions:
+            outputs += (self_attn_weights, cross_attn_weights)  # appended flat, giving a 3-tuple
+
+        return outputs
+
+
+class InformerEncoder(InformerPreTrainedModel):
+    """
+    Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
+    [`InformerEncoderLayer`].
+
+    Args:
+        config: InformerConfig
+    """
+
+    def __init__(self, config: InformerConfig):
+        super().__init__(config)
+
+        self.dropout = config.dropout
+        self.layerdrop = config.encoder_layerdrop
+        if config.prediction_length is None:  # needed below to size the positional embedding
+            raise ValueError("The `prediction_length` config needs to be specified.")
+
+        self.value_embedding = InformerValueEmbedding(feature_size=config.feature_size, d_model=config.d_model)
+        self.embed_positions = InformerSinusoidalPositionalEmbedding(
+            config.context_length + config.prediction_length, config.d_model
+        )
+        self.layers = nn.ModuleList([InformerEncoderLayer(config) for _ in range(config.encoder_layers)])
+        self.layernorm_embedding = nn.LayerNorm(config.d_model)
+        self.gradient_checkpointing = False
+
+        if config.distil:
+            self.conv_layers = nn.ModuleList(  # one conv layer fewer than there are encoder layers
+                [InformerConvLayer(config.d_model) for _ in range(config.encoder_layers - 1)]
+            )
+            self.conv_layers.append(None)  # pad with None so the length equals len(self.layers)
+        else:
+            self.conv_layers = [None] * config.encoder_layers  # plain list of Nones when distilling is off
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def forward(
+        self,
+        attention_mask: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, BaseModelOutput]:
+        r"""
+        Embed `inputs_embeds`, add positional embeddings and run the result through every encoder layer (each
+        optionally followed by its paired conv layer).
+
+        Args:
+            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Padding mask with values in `[0, 1]`:
+
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
+
+                When given, it is expanded with `_prepare_4d_attention_mask` before being handed to the layers.
+
+                [What are attention masks?](../glossary#attention-mask)
+            head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
+                Per-layer mask used to nullify selected attention heads, with values in `[0, 1]`:
+
+                - 1 indicates the head is **not masked**,
+                - 0 indicates the head is **masked**.
+
+                Row `idx` is forwarded to layer `idx` as `layer_head_mask`.
+            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+                Input features. They are passed through `self.value_embedding`, and their size is used to look up
+                the positional embeddings.
+            output_attentions (`bool`, *optional*):
+                Whether to collect the attention tensors of all layers. Falls back to
+                `config.output_attentions` when `None`.
+            output_hidden_states (`bool`, *optional*):
+                Whether to collect the hidden states before every layer and after the last one. Falls back to
+                `config.output_hidden_states` when `None`.
+            return_dict (`bool`, *optional*):
+                Whether to return a [`~utils.ModelOutput`] instead of a plain tuple. Falls back to
+                `config.use_return_dict` when `None`.
+
+        Raises:
+            ValueError: If `head_mask` is given and its first dimension differs from the number of layers.
+        """
+        # Resolve unset flags from the config.
+        if output_attentions is None:
+            output_attentions = self.config.output_attentions
+        if output_hidden_states is None:
+            output_hidden_states = self.config.output_hidden_states
+        if return_dict is None:
+            return_dict = self.config.use_return_dict
+
+        # value embedding + positional embedding -> layer norm -> dropout
+        embedded_values = self.value_embedding(inputs_embeds)
+        positions = self.embed_positions(inputs_embeds.size())
+        hidden_states = self.layernorm_embedding(embedded_values + positions)
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+
+        if attention_mask is not None:
+            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+            attention_mask = _prepare_4d_attention_mask(attention_mask, inputs_embeds.dtype)
+
+        encoder_states = () if output_hidden_states else None
+        all_attentions = () if output_attentions else None
+
+        # A head mask, when supplied, must carry exactly one row per layer.
+        if head_mask is not None and head_mask.size()[0] != (len(self.layers)):
+            raise ValueError(
+                f"The head_mask should be specified for {len(self.layers)} layers, but it is for"
+                f" {head_mask.size()[0]}."
+            )
+
+        for idx, (encoder_layer, conv_layer) in enumerate(zip(self.layers, self.conv_layers)):
+            if output_hidden_states:
+                encoder_states = encoder_states + (hidden_states,)
+
+            # LayerDrop (see https://huggingface.co/papers/1909.11556 for description): a random number is
+            # drawn only in training mode, and the layer is skipped when it falls below `self.layerdrop`.
+            skip_layer = self.training and bool(torch.rand([]) < self.layerdrop)
+
+            if skip_layer:
+                # Hidden states pass through untouched; the attention slot stays `None`.
+                layer_outputs = (None, None)
+            else:
+                layer_head_mask = None if head_mask is None else head_mask[idx]
+                layer_outputs = encoder_layer(
+                    hidden_states,
+                    attention_mask,
+                    layer_head_mask=layer_head_mask,
+                    output_attentions=output_attentions,
+                )
+                if conv_layer is not None:
+                    # Replace the layer's hidden states with the conv layer's output, keep the rest.
+                    layer_outputs = (conv_layer(layer_outputs[0]),) + layer_outputs[1:]
+
+                hidden_states = layer_outputs[0]
+
+            if output_attentions:
+                all_attentions = all_attentions + (layer_outputs[1],)
+
+        if output_hidden_states:
+            encoder_states = encoder_states + (hidden_states,)
+
+        if return_dict:
+            return BaseModelOutput(
+                last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
+            )
+        # Tuple form: entries that were not requested (still `None`) are dropped.
+        return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
+
+
+class InformerDecoder(InformerPreTrainedModel):
+    """
+    Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a
+    [`InformerDecoderLayer`]
+
+    Args:
+        config: InformerConfig
+    """
+
+    def __init__(self, config: InformerConfig):
+        """
+        Build the value embedding, positional embedding, decoder layers and embedding layer norm.
+
+        Raises:
+            ValueError: If `config.prediction_length` is `None`.
+        """
+        super().__init__(config)
+        self.dropout = config.dropout
+        self.layerdrop = config.decoder_layerdrop
+        if config.prediction_length is None:
+            raise ValueError("The `prediction_length` config needs to be specified.")
+
+        self.value_embedding = InformerValueEmbedding(feature_size=config.feature_size, d_model=config.d_model)
+        # One position per step of the context window plus the prediction window.
+        self.embed_positions = InformerSinusoidalPositionalEmbedding(
+            config.context_length + config.prediction_length, config.d_model
+        )
+        self.layers = nn.ModuleList([InformerDecoderLayer(config, layer_idx=i) for i in range(config.decoder_layers)])
+        self.layernorm_embedding = nn.LayerNorm(config.d_model)
+
+        self.gradient_checkpointing = False
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def forward(
+        self,
+        attention_mask: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        encoder_attention_mask: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        cross_attn_head_mask: Optional[torch.Tensor] = None,
+        past_key_values: Optional[list[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+    ) -> Union[tuple, BaseModelOutputWithPastAndCrossAttentions]:
+        r"""
+        Args:
+            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
+
+                [What are attention masks?](../glossary#attention-mask)
+            encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
+                Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
+                of the decoder.
+            encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
+                Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
+                selected in `[0, 1]`:
+
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
+
+                [What are attention masks?](../glossary#attention-mask)
+            head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
+
+                - 1 indicates the head is **not masked**,
+                - 0 indicates the head is **masked**.
+
+            cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+                Mask to nullify selected heads of the cross-attention modules in the decoder to avoid performing
+                cross-attention on hidden heads. Mask values selected in `[0, 1]`:
+
+                - 1 indicates the head is **not masked**,
+                - 0 indicates the head is **masked**.
+
+            past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+                Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
+                shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of
+                shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
+
+                Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
+                cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
+
+                If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
+                that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
+                all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
+                than the model's internal embedding lookup matrix.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            output_hidden_states (`bool`, *optional*):
+                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+                for more detail.
+            return_dict (`bool`, *optional*):
+                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
+                Indices depicting the position of the input sequence tokens in the sequence. It is used to update the
+                cache in the correct position and to infer the complete sequence length.
+
+        Raises:
+            ValueError: If `head_mask` or `cross_attn_head_mask` is given and its first dimension differs from the
+                number of decoder layers.
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        input_shape = inputs_embeds.size()[:-1]
+        # initialize `past_key_values`
+        if use_cache and past_key_values is None:
+            past_key_values = EncoderDecoderCache(DynamicCache(config=self.config), DynamicCache(config=self.config))
+        if use_cache and isinstance(past_key_values, tuple):
+            logger.warning_once(
+                "Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. "
+                "You should pass an instance of `EncoderDecoderCache` instead, e.g. "
+                "`past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`."
+            )
+            past_key_values = EncoderDecoderCache.from_legacy_cache(past_key_values)
+
+        past_key_values_length = past_key_values.get_seq_length() if past_key_values is not None else 0
+        if cache_position is None:
+            # Default positions continue right after whatever is already cached.
+            cache_position = torch.arange(
+                past_key_values_length, past_key_values_length + input_shape[1], device=inputs_embeds.device
+            )
+
+        attention_mask = self._update_causal_mask(
+            attention_mask,
+            input_shape,
+            inputs_embeds,
+            past_key_values_length,
+        )
+        encoder_attention_mask = self._update_cross_attn_mask(
+            encoder_hidden_states,
+            encoder_attention_mask,
+            input_shape,
+            inputs_embeds,
+        )
+
+        hidden_states = self.value_embedding(inputs_embeds)
+        # The positional lookup is offset by the context length (passed as `past_key_values_length`).
+        embed_pos = self.embed_positions(inputs_embeds.size(), past_key_values_length=self.config.context_length)
+        hidden_states = self.layernorm_embedding(hidden_states + embed_pos)
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+
+        if self.gradient_checkpointing and self.training:
+            if use_cache:
+                logger.warning_once(
+                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+                )
+                use_cache = False
+
+        # decoder layers
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+        all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None
+
+        # check if head_mask/cross_attn_head_mask has a correct number of layers specified if desired
+        for attn_mask, mask_name in zip([head_mask, cross_attn_head_mask], ["head_mask", "cross_attn_head_mask"]):
+            if attn_mask is not None:
+                if attn_mask.size()[0] != (len(self.layers)):
+                    # Report the size of the mask being checked. Reading `head_mask` here would fail with an
+                    # AttributeError when only `cross_attn_head_mask` is passed, and would otherwise print the
+                    # wrong size.
+                    raise ValueError(
+                        f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for"
+                        f" {attn_mask.size()[0]}."
+                    )
+
+        for idx, decoder_layer in enumerate(self.layers):
+            # add LayerDrop (see https://huggingface.co/papers/1909.11556 for description)
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+            if self.training:
+                dropout_probability = torch.rand([])
+                if dropout_probability < self.layerdrop:
+                    # A dropped layer is skipped entirely and adds nothing to the attention tuples.
+                    continue
+
+            layer_outputs = decoder_layer(
+                hidden_states,
+                attention_mask,
+                encoder_hidden_states,  # as a positional argument for gradient checkpointing
+                encoder_attention_mask=encoder_attention_mask,
+                layer_head_mask=(head_mask[idx] if head_mask is not None else None),
+                cross_attn_layer_head_mask=(cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None),
+                past_key_values=past_key_values,
+                output_attentions=output_attentions,
+                use_cache=use_cache,
+                cache_position=cache_position,
+            )
+            hidden_states = layer_outputs[0]
+
+            if output_attentions:
+                all_self_attns += (layer_outputs[1],)
+
+                if encoder_hidden_states is not None:
+                    all_cross_attentions += (layer_outputs[2],)
+
+        # add hidden states from the last decoder layer
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+
+        if not return_dict:
+            # Tuple form: entries that are `None` are dropped.
+            return tuple(
+                v
+                for v in [hidden_states, past_key_values, all_hidden_states, all_self_attns, all_cross_attentions]
+                if v is not None
+            )
+        return BaseModelOutputWithPastAndCrossAttentions(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+            cross_attentions=all_cross_attentions,
+        )
+
+
+@auto_docstring
+class InformerModel(InformerPreTrainedModel):
+    def __init__(self, config: InformerConfig):
+        """Create the scaler, the optional static-categorical embedder, and the encoder/decoder pair."""
+        super().__init__(config)
+
+        # `config.scaling` picks the scaler: `True` or "mean" -> mean scaler, "std" -> std scaler,
+        # anything else -> the no-op scaler.
+        scaling = config.scaling
+        if scaling is True or scaling == "mean":
+            scaler_cls = InformerMeanScaler
+        elif scaling == "std":
+            scaler_cls = InformerStdScaler
+        else:
+            scaler_cls = InformerNOPScaler
+        self.scaler = scaler_cls(config)
+
+        # The embedder only exists when the config declares static categorical features.
+        has_static_categoricals = config.num_static_categorical_features > 0
+        if has_static_categoricals:
+            self.embedder = InformerFeatureEmbedder(
+                cardinalities=config.cardinality,
+                embedding_dims=config.embedding_dimension,
+            )
+
+        # transformer encoder-decoder and mask initializer
+        self.encoder = InformerEncoder(config)
+        self.decoder = InformerDecoder(config)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @property
+    def _past_length(self) -> int:
+        """Context length plus the largest entry of `config.lags_sequence`."""
+        context_length = self.config.context_length
+        largest_lag = max(self.config.lags_sequence)
+        return context_length + largest_lag
+
+    def get_lagged_subsequences(
+        self, sequence: torch.Tensor, subsequences_length: int, shift: int = 0
+    ) -> torch.Tensor:
+        """
+        Extract one window of `subsequences_length` steps per configured lag and stack the windows along a new
+        trailing dimension.
+
+        For an input of shape (N, T, C) the result has shape (N, S, C, I), with S = subsequences_length and
+        I = len(config.lags_sequence); lagged[i, j, :, k] = sequence[i, -indices[k]-S+j, :], where
+        indices[k] = config.lags_sequence[k] - shift.
+
+        Args:
+            sequence: Tensor
+                The sequence from which lagged subsequences should be extracted. Shape: (N, T, C).
+            subsequences_length : int
+                Length of the subsequences to be extracted.
+            shift: int
+                Amount subtracted from every lag before slicing.
+
+        Raises:
+            ValueError: If the largest shifted lag plus `subsequences_length` exceeds the sequence length.
+        """
+        sequence_length = sequence.shape[1]
+        indices = [lag - shift for lag in self.config.lags_sequence]
+
+        if max(indices) + subsequences_length > sequence_length:
+            raise ValueError(
+                f"lags cannot go further than history length, found lag {max(indices)} "
+                f"while history length is only {sequence_length}"
+            )
+
+        def _window(lag_index):
+            # Slice relative to the end of the time axis; a non-positive lag runs through the last step.
+            start = -lag_index - subsequences_length
+            stop = -lag_index if lag_index > 0 else None
+            return sequence[:, start:stop, ...]
+
+        return torch.stack([_window(lag_index) for lag_index in indices], dim=-1)
+
+    def create_network_inputs(
+        self,
+        past_values: torch.Tensor,
+        past_time_features: torch.Tensor,
+        static_categorical_features: Optional[torch.Tensor] = None,
+        static_real_features: Optional[torch.Tensor] = None,
+        past_observed_mask: Optional[torch.Tensor] = None,
+        future_values: Optional[torch.Tensor] = None,
+        future_time_features: Optional[torch.Tensor] = None,
+    ):
+        """
+        Assemble the tensor fed to the encoder/decoder from values, time features and static features.
+
+        The values are scaled with `self.scaler` (fitted on the last `context_length` steps), turned into lagged
+        subsequences, and concatenated on the last dimension with the static features (repeated over time) and
+        the time features. When `future_values` is given, the future window is appended to the past one.
+
+        Returns:
+            A tuple `(transformer_inputs, loc, scale, static_feat)`, where `static_feat` is the static feature
+            tensor before it is expanded along the time dimension.
+
+        Raises:
+            ValueError: If the lagged sequence and the time features differ in length along dimension 1.
+        """
+        has_future = future_values is not None
+        context_length = self.config.context_length
+
+        # time feature: drop the leading steps that only exist to serve the lags
+        past_time_feat = past_time_features[:, self._past_length - context_length :, ...]
+        if has_future:
+            time_feat = torch.cat((past_time_feat, future_time_features), dim=1)
+        else:
+            time_feat = past_time_feat
+
+        # target: without a mask, every past value counts as observed
+        if past_observed_mask is None:
+            past_observed_mask = torch.ones_like(past_values)
+
+        context = past_values[:, -context_length:]
+        observed_context = past_observed_mask[:, -context_length:]
+        _, loc, scale = self.scaler(context, observed_context)
+
+        if has_future:
+            target = torch.cat((past_values, future_values), dim=1)
+        else:
+            target = past_values
+        inputs = (target - loc) / scale
+
+        # static features: log-magnitude of loc and log of scale, with dim 1 squeezed out of 3-D statistics
+        if loc.ndim == 3:
+            squeezed_loc, squeezed_scale = loc.squeeze(1), scale.squeeze(1)
+        else:
+            squeezed_loc, squeezed_scale = loc, scale
+        static_feat = torch.cat((squeezed_loc.abs().log1p(), squeezed_scale.log()), dim=1)
+
+        if static_real_features is not None:
+            static_feat = torch.cat((static_real_features, static_feat), dim=1)
+        if static_categorical_features is not None:
+            embedded_cat = self.embedder(static_categorical_features)
+            static_feat = torch.cat((embedded_cat, static_feat), dim=1)
+
+        # all features: repeat the static features for every time step, then append the time features
+        num_time_steps = time_feat.shape[1]
+        expanded_static_feat = static_feat.unsqueeze(1).expand(-1, num_time_steps, -1)
+        features = torch.cat((expanded_static_feat, time_feat), dim=-1)
+
+        # lagged features
+        subsequences_length = context_length
+        if has_future:
+            subsequences_length = context_length + self.config.prediction_length
+        lagged_sequence = self.get_lagged_subsequences(sequence=inputs, subsequences_length=subsequences_length)
+        # flatten everything after the time dimension into a single feature dimension
+        batch_size, lagged_steps = lagged_sequence.shape[0], lagged_sequence.shape[1]
+        reshaped_lagged_sequence = lagged_sequence.reshape(batch_size, lagged_steps, -1)
+
+        if reshaped_lagged_sequence.shape[1] != time_feat.shape[1]:
+            raise ValueError(
+                f"input length {reshaped_lagged_sequence.shape[1]} and time feature lengths {time_feat.shape[1]} does not match"
+            )
+
+        # transformer inputs
+        transformer_inputs = torch.cat((reshaped_lagged_sequence, features), dim=-1)
+
+        return transformer_inputs, loc, scale, static_feat
+
+    def get_encoder(self):
+        """Return the encoder submodule stored on `self.encoder`."""
+        encoder = self.encoder
+        return encoder
+
+    @auto_docstring
+    def forward(
+        self,
+        past_values: torch.Tensor,
+        past_time_features: torch.Tensor,
+        past_observed_mask: torch.Tensor,
+        static_categorical_features: Optional[torch.Tensor] = None,
+        static_real_features: Optional[torch.Tensor] = None,
+        future_values: Optional[torch.Tensor] = None,
+        future_time_features: Optional[torch.Tensor] = None,
+        decoder_attention_mask: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        decoder_head_mask: Optional[torch.Tensor] = None,
+        cross_attn_head_mask: Optional[torch.Tensor] = None,
+        encoder_outputs: Optional[list[torch.FloatTensor]] = None,
+        past_key_values: Optional[list[torch.FloatTensor]] = None,
+        output_hidden_states: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        use_cache: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+    ) -> Union[Seq2SeqTSModelOutput, tuple]:
+        r"""
+        past_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)` or `(batch_size, sequence_length, input_size)`):
+            Past values of the time series, that serve as context in order to predict the future. The sequence size of
+            this tensor must be larger than the `context_length` of the model, since the model will use the larger size
+            to construct lag features, i.e. additional values from the past which are added in order to serve as "extra
+            context".
+
+            The `sequence_length` here is equal to `config.context_length` + `max(config.lags_sequence)`, which if no
+            `lags_sequence` is configured, is equal to `config.context_length` + 7 (as by default, the largest
+            look-back index in `config.lags_sequence` is 7). The property `_past_length` returns the actual length of
+            the past.
+
+            The `past_values` is what the Transformer encoder gets as input (with optional additional features, such as
+            `static_categorical_features`, `static_real_features`, `past_time_features` and lags).
+
+            Optionally, missing values need to be replaced with zeros and indicated via the `past_observed_mask`.
+
+            For multivariate time series, the `input_size` > 1 dimension is required and corresponds to the number of
+            variates in the time series per time step.
+        past_time_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_features)`):
+            Required time features, which the model internally will add to `past_values`. These could be things like
+            "month of year", "day of the month", etc. encoded as vectors (for instance as Fourier features). These
+            could also be so-called "age" features, which basically help the model know "at which point in life" a
+            time-series is. Age features have small values for distant past time steps and increase monotonically the
+            more we approach the current time step. Holiday features are also a good example of time features.
+
+            These features serve as the "positional encodings" of the inputs. So contrary to a model like BERT, where
+            the position encodings are learned from scratch internally as parameters of the model, the Time Series
+            Transformer requires to provide additional time features. The Time Series Transformer only learns
+            additional embeddings for `static_categorical_features`.
+
+            Additional dynamic real covariates can be concatenated to this tensor, with the caveat that these features
+            must be known at prediction time.
+
+            The `num_features` here is equal to `config.num_time_features` + `config.num_dynamic_real_features`.
+        past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length)` or `(batch_size, sequence_length, input_size)`, *optional*):
+            Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected in
+            `[0, 1]`:
+
+            - 1 for values that are **observed**,
+            - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).
+        static_categorical_features (`torch.LongTensor` of shape `(batch_size, number of static categorical features)`, *optional*):
+            Optional static categorical features for which the model will learn an embedding, which it will add to the
+            values of the time series.
+
+            Static categorical features are features which have the same value for all time steps (static over time).
+
+            A typical example of a static categorical feature is a time series ID.
+        static_real_features (`torch.FloatTensor` of shape `(batch_size, number of static real features)`, *optional*):
+            Optional static real features which the model will add to the values of the time series.
+
+            Static real features are features which have the same value for all time steps (static over time).
+
+            A typical example of a static real feature is promotion information.
+        future_values (`torch.FloatTensor` of shape `(batch_size, prediction_length)` or `(batch_size, prediction_length, input_size)`, *optional*):
+            Future values of the time series, that serve as labels for the model. The `future_values` is what the
+            Transformer needs during training to learn to output, given the `past_values`.
+
+            The sequence length here is equal to `prediction_length`.
+
+            See the demo notebook and code snippets for details.
+
+            Optionally, during training any missing values need to be replaced with zeros and indicated via the
+            `future_observed_mask`.
+
+            For multivariate time series, the `input_size` > 1 dimension is required and corresponds to the number of
+            variates in the time series per time step.
+        future_time_features (`torch.FloatTensor` of shape `(batch_size, prediction_length, num_features)`):
+            Required time features for the prediction window, which the model internally will add to `future_values`.
+            These could be things like "month of year", "day of the month", etc. encoded as vectors (for instance as
+            Fourier features). These could also be so-called "age" features, which basically help the model know "at
+            which point in life" a time-series is. Age features have small values for distant past time steps and
+            increase monotonically the more we approach the current time step. Holiday features are also a good example
+            of time features.
+
+            These features serve as the "positional encodings" of the inputs. So contrary to a model like BERT, where
+            the position encodings are learned from scratch internally as parameters of the model, the Time Series
+            Transformer requires to provide additional time features. The Time Series Transformer only learns
+            additional embeddings for `static_categorical_features`.
+
+            Additional dynamic real covariates can be concatenated to this tensor, with the caveat that these features
+            must be known at prediction time.
+
+            The `num_features` here is equal to `config.num_time_features` + `config.num_dynamic_real_features`.
+        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+        encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*):
+            Tuple consists of `last_hidden_state`, `hidden_states` (*optional*) and `attentions` (*optional*)
+            `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)` (*optional*) is a sequence of
+            hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder.
+
+        Examples:
+
+        ```python
+        >>> from huggingface_hub import hf_hub_download
+        >>> import torch
+        >>> from transformers import InformerModel
+
+        >>> file = hf_hub_download(
+        ...     repo_id="hf-internal-testing/tourism-monthly-batch", filename="train-batch.pt", repo_type="dataset"
+        ... )
+        >>> batch = torch.load(file)
+
+        >>> model = InformerModel.from_pretrained("huggingface/informer-tourism-monthly")
+
+        >>> # during training, one provides both past and future values
+        >>> # as well as possible additional features
+        >>> outputs = model(
+        ...     past_values=batch["past_values"],
+        ...     past_time_features=batch["past_time_features"],
+        ...     past_observed_mask=batch["past_observed_mask"],
+        ...     static_categorical_features=batch["static_categorical_features"],
+        ...     static_real_features=batch["static_real_features"],
+        ...     future_values=batch["future_values"],
+        ...     future_time_features=batch["future_time_features"],
+        ... )
+
+        >>> last_hidden_state = outputs.last_hidden_state
+        ```"""
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        transformer_inputs, loc, scale, static_feat = self.create_network_inputs(
+            past_values=past_values,
+            past_time_features=past_time_features,
+            past_observed_mask=past_observed_mask,
+            static_categorical_features=static_categorical_features,
+            static_real_features=static_real_features,
+            future_values=future_values,
+            future_time_features=future_time_features,
+        )
+
+        if encoder_outputs is None:
+            enc_input = transformer_inputs[:, : self.config.context_length, ...]
+            encoder_outputs = self.encoder(
+                inputs_embeds=enc_input,
+                head_mask=head_mask,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+            )
+        # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True
+        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
+            encoder_outputs = BaseModelOutput(
+                last_hidden_state=encoder_outputs[0],
+                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
+                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
+            )
+
+        # Avoid empty tensors and instead create a zeroes tensor which
+        # will be treated the same in torch, i.e. matmul with empty == all 0s
+        if self.config.context_length >= transformer_inputs.shape[1]:
+            bsz, _, dim = transformer_inputs.shape
+            dec_input = torch.zeros(
+                size=(bsz, 1, dim), device=transformer_inputs.device, dtype=transformer_inputs.dtype
+            )
+        else:
+            dec_input = transformer_inputs[:, self.config.context_length :, ...]
+
+        decoder_outputs = self.decoder(
+            inputs_embeds=dec_input,
+            attention_mask=decoder_attention_mask,
+            encoder_hidden_states=encoder_outputs[0],
+            head_mask=decoder_head_mask,
+            cross_attn_head_mask=cross_attn_head_mask,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            cache_position=cache_position,
+        )
+
+        if not return_dict:
+            return decoder_outputs + encoder_outputs + (loc, scale, static_feat)
+
+        return Seq2SeqTSModelOutput(
+            last_hidden_state=decoder_outputs.last_hidden_state,
+            past_key_values=decoder_outputs.past_key_values,
+            decoder_hidden_states=decoder_outputs.hidden_states,
+            decoder_attentions=decoder_outputs.attentions,
+            cross_attentions=decoder_outputs.cross_attentions,
+            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
+            encoder_hidden_states=encoder_outputs.hidden_states,
+            encoder_attentions=encoder_outputs.attentions,
+            loc=loc,
+            scale=scale,
+            static_features=static_feat,
+        )
+
+
+def weighted_average(input_tensor: torch.Tensor, weights: Optional[torch.Tensor] = None, dim=None) -> torch.Tensor:
+    """
+    Computes the weighted average of a given tensor across a given `dim`, masking values associated with weight zero,
+    meaning instead of `nan * 0 = nan` you will get `0 * 0 = 0`.
+
+    Args:
+        input_tensor (`torch.FloatTensor`):
+            Input tensor, of which the average must be computed.
+        weights (`torch.FloatTensor`, *optional*):
+            Weights tensor, of the same shape as `input_tensor`. If `None`, the plain mean is returned.
+        dim (`int`, *optional*):
+            The dim along which to average `input_tensor` (`0` is honored); `None` averages over all elements.
+
+    Returns:
+        `torch.FloatTensor`: The tensor with values averaged along the specified `dim`.
+    """
+    if weights is None:
+        return input_tensor.mean(dim=dim)
+    weighted_tensor = torch.where(weights != 0, input_tensor * weights, torch.zeros_like(input_tensor))
+    # Compare against `None`, not truthiness: `dim=0` must reduce along axis 0 rather than over all elements.
+    reduce_kwargs = {} if dim is None else {"dim": dim}
+    return weighted_tensor.sum(**reduce_kwargs) / torch.clamp(weights.sum(**reduce_kwargs), min=1.0)
+
+
+def nll(input: torch.distributions.Distribution, target: torch.Tensor) -> torch.Tensor:
+    """
+    Returns the negated `log_prob` of `target` under the `input` distribution; no reduction is applied here.
+    """
+    return torch.neg(input.log_prob(target))
+
+
+@auto_docstring
+class InformerForPrediction(InformerPreTrainedModel):
+    def __init__(self, config: InformerConfig):
+        # Build the base model, then attach the distribution head and the loss selected by the config.
+        super().__init__(config)
+        self.model = InformerModel(config)
+
+        # Lookup table from each supported `config.distribution_output` name to its output-head class.
+        known_heads = {
+            "student_t": StudentTOutput,
+            "normal": NormalOutput,
+            "negative_binomial": NegativeBinomialOutput,
+        }
+        if config.distribution_output not in known_heads:
+            raise ValueError(f"Unknown distribution output {config.distribution_output}")
+        self.distribution_output = known_heads[config.distribution_output](dim=config.input_size)
+
+        self.parameter_projection = self.distribution_output.get_parameter_projection(self.model.config.d_model)
+        self.target_shape = self.distribution_output.event_shape
+
+        if config.loss != "nll":
+            raise ValueError(f"Unknown loss function {config.loss}")
+        self.loss = nll
+        # Everything is attached; run the post-construction hook (`post_init`).
+        self.post_init()
+
+    def output_params(self, dec_output):  # apply the distribution head's projection to decoder hidden states
+        return self.parameter_projection(dec_output)
+
+    def get_encoder(self):  # delegates to the wrapped `self.model`
+        return self.model.get_encoder()
+
+    def get_decoder(self):  # delegates to the wrapped `self.model`
+        return self.model.get_decoder()
+
+    @torch.jit.ignore
+    def output_distribution(self, params, loc=None, scale=None, trailing_n=None) -> torch.distributions.Distribution:
+        # If `trailing_n` is set, keep only the last `trailing_n` entries along dim 1 of each parameter tensor.
+        if trailing_n is not None:
+            params = [param[:, -trailing_n:] for param in params]
+        return self.distribution_output.distribution(params, loc=loc, scale=scale)
+
+    @auto_docstring
+    def forward(
+        self,
+        past_values: torch.Tensor,
+        past_time_features: torch.Tensor,
+        past_observed_mask: torch.Tensor,
+        static_categorical_features: Optional[torch.Tensor] = None,
+        static_real_features: Optional[torch.Tensor] = None,
+        future_values: Optional[torch.Tensor] = None,
+        future_time_features: Optional[torch.Tensor] = None,
+        future_observed_mask: Optional[torch.Tensor] = None,
+        decoder_attention_mask: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        decoder_head_mask: Optional[torch.Tensor] = None,
+        cross_attn_head_mask: Optional[torch.Tensor] = None,
+        encoder_outputs: Optional[list[torch.FloatTensor]] = None,
+        past_key_values: Optional[list[torch.FloatTensor]] = None,
+        output_hidden_states: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        use_cache: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+    ) -> Union[Seq2SeqTSModelOutput, tuple]:
+        r"""
+        past_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)` or `(batch_size, sequence_length, input_size)`):
+            Past values of the time series, that serve as context in order to predict the future. The sequence size of
+            this tensor must be larger than the `context_length` of the model, since the model will use the larger size
+            to construct lag features, i.e. additional values from the past which are added in order to serve as "extra
+            context".
+
+            The `sequence_length` here is equal to `config.context_length` + `max(config.lags_sequence)`, which if no
+            `lags_sequence` is configured, is equal to `config.context_length` + 7 (as by default, the largest
+            look-back index in `config.lags_sequence` is 7). The property `_past_length` returns the actual length of
+            the past.
+
+            The `past_values` is what the Transformer encoder gets as input (with optional additional features, such as
+            `static_categorical_features`, `static_real_features`, `past_time_features` and lags).
+
+            Optionally, missing values need to be replaced with zeros and indicated via the `past_observed_mask`.
+
+            For multivariate time series, the `input_size` > 1 dimension is required and corresponds to the number of
+            variates in the time series per time step.
+        past_time_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_features)`):
+            Required time features, which the model internally will add to `past_values`. These could be things like
+            "month of year", "day of the month", etc. encoded as vectors (for instance as Fourier features). These
+            could also be so-called "age" features, which basically help the model know "at which point in life" a
+            time-series is. Age features have small values for distant past time steps and increase monotonically the
+            more we approach the current time step. Holiday features are also a good example of time features.
+
+            These features serve as the "positional encodings" of the inputs. So contrary to a model like BERT, where
+            the position encodings are learned from scratch internally as parameters of the model, the Time Series
+            Transformer requires to provide additional time features. The Time Series Transformer only learns
+            additional embeddings for `static_categorical_features`.
+
+            Additional dynamic real covariates can be concatenated to this tensor, with the caveat that these features
+            must be known at prediction time.
+
+            The `num_features` here is equal to `config.num_time_features` + `config.num_dynamic_real_features`.
+        past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length)` or `(batch_size, sequence_length, input_size)`, *optional*):
+            Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected in
+            `[0, 1]`:
+
+            - 1 for values that are **observed**,
+            - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).
+        static_categorical_features (`torch.LongTensor` of shape `(batch_size, number of static categorical features)`, *optional*):
+            Optional static categorical features for which the model will learn an embedding, which it will add to the
+            values of the time series.
+
+            Static categorical features are features which have the same value for all time steps (static over time).
+
+            A typical example of a static categorical feature is a time series ID.
+        static_real_features (`torch.FloatTensor` of shape `(batch_size, number of static real features)`, *optional*):
+            Optional static real features which the model will add to the values of the time series.
+
+            Static real features are features which have the same value for all time steps (static over time).
+
+            A typical example of a static real feature is promotion information.
+        future_values (`torch.FloatTensor` of shape `(batch_size, prediction_length)` or `(batch_size, prediction_length, input_size)`, *optional*):
+            Future values of the time series, that serve as labels for the model. The `future_values` is what the
+            Transformer needs during training to learn to output, given the `past_values`.
+
+            The sequence length here is equal to `prediction_length`.
+
+            See the demo notebook and code snippets for details.
+
+            Optionally, during training any missing values need to be replaced with zeros and indicated via the
+            `future_observed_mask`.
+
+            For multivariate time series, the `input_size` > 1 dimension is required and corresponds to the number of
+            variates in the time series per time step.
+        future_time_features (`torch.FloatTensor` of shape `(batch_size, prediction_length, num_features)`):
+            Required time features for the prediction window, which the model internally will add to `future_values`.
+            These could be things like "month of year", "day of the month", etc. encoded as vectors (for instance as
+            Fourier features). These could also be so-called "age" features, which basically help the model know "at
+            which point in life" a time-series is. Age features have small values for distant past time steps and
+            increase monotonically the more we approach the current time step. Holiday features are also a good example
+            of time features.
+
+            These features serve as the "positional encodings" of the inputs. So contrary to a model like BERT, where
+            the position encodings are learned from scratch internally as parameters of the model, the Time Series
+            Transformer requires to provide additional time features. The Time Series Transformer only learns
+            additional embeddings for `static_categorical_features`.
+
+            Additional dynamic real covariates can be concatenated to this tensor, with the caveat that these features
+            must be known at prediction time.
+
+            The `num_features` here is equal to `config.num_time_features` + `config.num_dynamic_real_features`.
+        future_observed_mask (`torch.BoolTensor` of shape `(batch_size, prediction_length)` or `(batch_size, prediction_length, input_size)`, *optional*):
+            Boolean mask to indicate which `future_values` were observed and which were missing. Mask values selected
+            in `[0, 1]`:
+
+            - 1 for values that are **observed**,
+            - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).
+
+            This mask is used to filter out missing values for the final loss calculation.
+        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+        encoder_outputs (`tuple(tuple(torch.FloatTensor))`, *optional*):
+            Tuple consists of `last_hidden_state`, `hidden_states` (*optional*) and `attentions` (*optional*).
+            `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)` (*optional*) is a sequence of
+            hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder.
+
+        Examples:
+
+        ```python
+        >>> from huggingface_hub import hf_hub_download
+        >>> import torch
+        >>> from transformers import InformerForPrediction
+
+        >>> file = hf_hub_download(
+        ...     repo_id="hf-internal-testing/tourism-monthly-batch", filename="train-batch.pt", repo_type="dataset"
+        ... )
+        >>> batch = torch.load(file)
+
+        >>> model = InformerForPrediction.from_pretrained(
+        ...     "huggingface/informer-tourism-monthly"
+        ... )
+
+        >>> # during training, one provides both past and future values
+        >>> # as well as possible additional features
+        >>> outputs = model(
+        ...     past_values=batch["past_values"],
+        ...     past_time_features=batch["past_time_features"],
+        ...     past_observed_mask=batch["past_observed_mask"],
+        ...     static_categorical_features=batch["static_categorical_features"],
+        ...     static_real_features=batch["static_real_features"],
+        ...     future_values=batch["future_values"],
+        ...     future_time_features=batch["future_time_features"],
+        ... )
+
+        >>> loss = outputs.loss
+        >>> loss.backward()
+
+        >>> # during inference, one only provides past values
+        >>> # as well as possible additional features
+        >>> # the model autoregressively generates future values
+        >>> outputs = model.generate(
+        ...     past_values=batch["past_values"],
+        ...     past_time_features=batch["past_time_features"],
+        ...     past_observed_mask=batch["past_observed_mask"],
+        ...     static_categorical_features=batch["static_categorical_features"],
+        ...     static_real_features=batch["static_real_features"],
+        ...     future_time_features=batch["future_time_features"],
+        ... )
+
+        >>> mean_prediction = outputs.sequences.mean(dim=1)
+        ```"""
+
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        if future_values is not None:
+            use_cache = False  # labels supplied: the decoder cache is turned off for this call
+
+        outputs = self.model(
+            past_values=past_values,
+            past_time_features=past_time_features,
+            past_observed_mask=past_observed_mask,
+            static_categorical_features=static_categorical_features,
+            static_real_features=static_real_features,
+            future_values=future_values,
+            future_time_features=future_time_features,
+            decoder_attention_mask=decoder_attention_mask,
+            head_mask=head_mask,
+            decoder_head_mask=decoder_head_mask,
+            cross_attn_head_mask=cross_attn_head_mask,
+            encoder_outputs=encoder_outputs,
+            past_key_values=past_key_values,
+            output_hidden_states=output_hidden_states,
+            output_attentions=output_attentions,
+            use_cache=use_cache,
+            return_dict=return_dict,
+            cache_position=cache_position,
+        )
+
+        prediction_loss = None
+        params = None
+        if future_values is not None:
+            params = self.output_params(outputs[0])  # outputs.last_hidden_state
+            # loc is 3rd last and scale is 2nd last output
+            distribution = self.output_distribution(params, loc=outputs[-3], scale=outputs[-2])
+
+            loss = self.loss(distribution, future_values)
+
+            if future_observed_mask is None:
+                future_observed_mask = torch.ones_like(future_values)  # no mask given: count every value
+
+            if len(self.target_shape) == 0:
+                loss_weights = future_observed_mask  # empty event shape: one weight per value
+            else:
+                loss_weights, _ = future_observed_mask.min(dim=-1, keepdim=False)  # 0 if any last-dim entry is 0
+
+            prediction_loss = weighted_average(loss, weights=loss_weights)
+
+        if not return_dict:
+            outputs = ((params,) + outputs[1:]) if params is not None else outputs[1:]
+            return ((prediction_loss,) + outputs) if prediction_loss is not None else outputs
+
+        return Seq2SeqTSPredictionOutput(
+            loss=prediction_loss,
+            params=params,
+            past_key_values=outputs.past_key_values,
+            decoder_hidden_states=outputs.decoder_hidden_states,
+            decoder_attentions=outputs.decoder_attentions,
+            cross_attentions=outputs.cross_attentions,
+            encoder_last_hidden_state=outputs.encoder_last_hidden_state,
+            encoder_hidden_states=outputs.encoder_hidden_states,
+            encoder_attentions=outputs.encoder_attentions,
+            loc=outputs.loc,
+            scale=outputs.scale,
+            static_features=outputs.static_features,
+        )
+
+    @torch.no_grad()
+    def generate(
+        self,
+        past_values: torch.Tensor,
+        past_time_features: torch.Tensor,
+        future_time_features: torch.Tensor,
+        past_observed_mask: Optional[torch.Tensor] = None,
+        static_categorical_features: Optional[torch.Tensor] = None,
+        static_real_features: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+    ) -> SampleTSPredictionOutput:
+        r"""
+        Autoregressively sample sequences of predictions from a model with a probability distribution head.
+
+        Parameters:
+            past_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)` or `(batch_size, sequence_length, input_size)`):
+                Past values of the time series, that serve as context in order to predict the future. The sequence size
+                of this tensor must be larger than the `context_length` of the model, since the model will use the
+                larger size to construct lag features, i.e. additional values from the past which are added in order to
+                serve as "extra context".
+
+                The `sequence_length` here is equal to `config.context_length` + `max(config.lags_sequence)`, which if
+                no `lags_sequence` is configured, is equal to `config.context_length` + 7 (as by default, the largest
+                look-back index in `config.lags_sequence` is 7). The property `_past_length` returns the actual length
+                of the past.
+
+                The `past_values` is what the Transformer encoder gets as input (with optional additional features,
+                such as `static_categorical_features`, `static_real_features`, `past_time_features` and lags).
+
+                Optionally, missing values need to be replaced with zeros and indicated via the `past_observed_mask`.
+
+                For multivariate time series, the `input_size` > 1 dimension is required and corresponds to the number
+                of variates in the time series per time step.
+            past_time_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_features)`):
+                Required time features, which the model internally will add to `past_values`. These could be things
+                like "month of year", "day of the month", etc. encoded as vectors (for instance as Fourier features).
+                These could also be so-called "age" features, which basically help the model know "at which point in
+                life" a time-series is. Age features have small values for distant past time steps and increase
+                monotonically the more we approach the current time step. Holiday features are also a good example of
+                time features.
+
+                These features serve as the "positional encodings" of the inputs. So contrary to a model like BERT,
+                where the position encodings are learned from scratch internally as parameters of the model, the Time
+                Series Transformer requires to provide additional time features. The Time Series Transformer only
+                learns additional embeddings for `static_categorical_features`.
+
+                Additional dynamic real covariates can be concatenated to this tensor, with the caveat that these
+                features must be known at prediction time.
+
+                The `num_features` here is equal to `config.num_time_features` + `config.num_dynamic_real_features`.
+            future_time_features (`torch.FloatTensor` of shape `(batch_size, prediction_length, num_features)`):
+                Required time features for the prediction window, which the model internally will add to sampled
+                predictions. These could be things like "month of year", "day of the month", etc. encoded as vectors
+                (for instance as Fourier features). These could also be so-called "age" features, which basically help
+                the model know "at which point in life" a time-series is. Age features have small values for distant
+                past time steps and increase monotonically the more we approach the current time step. Holiday features
+                are also a good example of time features.
+
+                These features serve as the "positional encodings" of the inputs. So contrary to a model like BERT,
+                where the position encodings are learned from scratch internally as parameters of the model, the Time
+                Series Transformer requires to provide additional time features. The Time Series Transformer only
+                learns additional embeddings for `static_categorical_features`.
+
+                Additional dynamic real covariates can be concatenated to this tensor, with the caveat that these
+                features must be known at prediction time.
+
+                The `num_features` here is equal to `config.num_time_features` + `config.num_dynamic_real_features`.
+            past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length)` or `(batch_size, sequence_length, input_size)`, *optional*):
+                Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected
+                in `[0, 1]`:
+
+                - 1 for values that are **observed**,
+                - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).
+
+            static_categorical_features (`torch.LongTensor` of shape `(batch_size, number of static categorical features)`, *optional*):
+                Optional static categorical features for which the model will learn an embedding, which it will add to
+                the values of the time series.
+
+                Static categorical features are features which have the same value for all time steps (static over
+                time).
+
+                A typical example of a static categorical feature is a time series ID.
+            static_real_features (`torch.FloatTensor` of shape `(batch_size, number of static real features)`, *optional*):
+                Optional static real features which the model will add to the values of the time series.
+
+                Static real features are features which have the same value for all time steps (static over time).
+
+                A typical example of a static real feature is promotion information.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers.
+            output_hidden_states (`bool`, *optional*):
+                Whether or not to return the hidden states of all layers.
+
+        Return:
+            [`SampleTSPredictionOutput`] where the outputs `sequences` tensor will have shape `(batch_size, number of
+            samples, prediction_length)` or `(batch_size, number of samples, prediction_length, input_size)` for
+            multivariate predictions.
+        """
+        outputs = self(
+            static_categorical_features=static_categorical_features,
+            static_real_features=static_real_features,
+            past_time_features=past_time_features,
+            past_values=past_values,
+            past_observed_mask=past_observed_mask,
+            future_time_features=future_time_features,
+            future_values=None,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=True,
+            use_cache=True,
+        )
+
+        decoder = self.model.get_decoder()
+        enc_last_hidden = outputs.encoder_last_hidden_state
+        loc = outputs.loc
+        scale = outputs.scale
+        static_feat = outputs.static_features
+
+        num_parallel_samples = self.config.num_parallel_samples
+        repeated_loc = loc.repeat_interleave(repeats=num_parallel_samples, dim=0)
+        repeated_scale = scale.repeat_interleave(repeats=num_parallel_samples, dim=0)
+
+        repeated_past_values = (
+            past_values.repeat_interleave(repeats=num_parallel_samples, dim=0) - repeated_loc
+        ) / repeated_scale
+
+        expanded_static_feat = static_feat.unsqueeze(1).expand(-1, future_time_features.shape[1], -1)
+        features = torch.cat((expanded_static_feat, future_time_features), dim=-1)
+        repeated_features = features.repeat_interleave(repeats=num_parallel_samples, dim=0)
+
+        repeated_enc_last_hidden = enc_last_hidden.repeat_interleave(repeats=num_parallel_samples, dim=0)
+
+        future_samples = []
+
+        # autoregressive sampling: each drawn step is appended to the history used by the next iteration
+        for k in range(self.config.prediction_length):
+            lagged_sequence = self.model.get_lagged_subsequences(
+                sequence=repeated_past_values,
+                subsequences_length=1 + k,
+                shift=1,
+            )
+
+            lags_shape = lagged_sequence.shape
+            reshaped_lagged_sequence = lagged_sequence.reshape(lags_shape[0], lags_shape[1], -1)
+
+            decoder_input = torch.cat((reshaped_lagged_sequence, repeated_features[:, : k + 1]), dim=-1)
+
+            dec_output = decoder(inputs_embeds=decoder_input, encoder_hidden_states=repeated_enc_last_hidden)
+            dec_last_hidden = dec_output.last_hidden_state
+
+            params = self.parameter_projection(dec_last_hidden[:, -1:])  # newest decoder position only
+            distr = self.output_distribution(params, loc=repeated_loc, scale=repeated_scale)
+            next_sample = distr.sample()
+
+            repeated_past_values = torch.cat(
+                (repeated_past_values, (next_sample - repeated_loc) / repeated_scale), dim=1
+            )
+            future_samples.append(next_sample)
+
+        concat_future_samples = torch.cat(future_samples, dim=1)
+
+        return SampleTSPredictionOutput(
+            sequences=concat_future_samples.reshape(
+                (-1, num_parallel_samples, self.config.prediction_length) + self.target_shape,
+            )
+        )
+
+
+__all__ = ["InformerForPrediction", "InformerModel", "InformerPreTrainedModel"]  # names exported by `import *`
diff --git a/phivenv/Lib/site-packages/transformers/models/informer/modular_informer.py b/phivenv/Lib/site-packages/transformers/models/informer/modular_informer.py
new file mode 100644
index 0000000000000000000000000000000000000000..4fd28bc5282210820ba83d94120b5add3a143f80
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/informer/modular_informer.py
@@ -0,0 +1,980 @@
+# coding=utf-8
+# Copyright 2023 Amazon and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch Informer model."""
+
+from typing import Optional, Union
+
+import numpy as np
+import torch
+from torch import nn
+
+from ...cache_utils import EncoderDecoderCache
+from ...modeling_attn_mask_utils import (
+    _prepare_4d_attention_mask,
+    _prepare_4d_attention_mask_for_sdpa,
+    _prepare_4d_causal_attention_mask,
+    _prepare_4d_causal_attention_mask_for_sdpa,
+)
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import (
+    BaseModelOutput,
+)
+from ...modeling_utils import PreTrainedModel
+from ...time_series_utils import NegativeBinomialOutput, NormalOutput, StudentTOutput
+from ...utils import (
+    auto_docstring,
+    is_torch_flex_attn_available,
+)
+from ...utils.deprecation import deprecate_kwarg
+from ..bart.modeling_bart import BartAttention
+from ..time_series_transformer.modeling_time_series_transformer import (
+    TimeSeriesFeatureEmbedder,
+    TimeSeriesMeanScaler,
+    TimeSeriesNOPScaler,
+    TimeSeriesSinusoidalPositionalEmbedding,
+    TimeSeriesStdScaler,
+    TimeSeriesTransformerDecoder,
+    TimeSeriesTransformerDecoderLayer,
+    TimeSeriesTransformerEncoder,
+    TimeSeriesTransformerEncoderLayer,
+    TimeSeriesTransformerForPrediction,
+    TimeSeriesTransformerModel,
+    TimeSeriesValueEmbedding,
+)
+from .configuration_informer import InformerConfig
+
+
+if is_torch_flex_attn_available():
+    from ...integrations.flex_attention import make_flex_block_causal_mask
+
+
+def nll(input: torch.distributions.Distribution, target: torch.Tensor) -> torch.Tensor:
+    """
+    Computes the negative log likelihood loss from input distribution with respect to target.
+    """
+    return -input.log_prob(target)  # negated log-probability of `target`; no reduction is applied here
+
+
+class InformerFeatureEmbedder(TimeSeriesFeatureEmbedder):
+    pass  # no overrides: TimeSeriesFeatureEmbedder under an Informer-specific name
+
+
+class InformerStdScaler(TimeSeriesStdScaler):
+    pass  # no overrides: TimeSeriesStdScaler under an Informer-specific name
+
+
+class InformerMeanScaler(TimeSeriesMeanScaler):
+    pass  # no overrides: TimeSeriesMeanScaler under an Informer-specific name
+
+
+class InformerNOPScaler(TimeSeriesNOPScaler):
+    pass  # no overrides: TimeSeriesNOPScaler under an Informer-specific name
+
+
+class InformerSinusoidalPositionalEmbedding(TimeSeriesSinusoidalPositionalEmbedding):
+    pass  # no overrides: TimeSeriesSinusoidalPositionalEmbedding under an Informer-specific name
+
+
+class InformerValueEmbedding(TimeSeriesValueEmbedding):
+    pass  # no overrides: TimeSeriesValueEmbedding under an Informer-specific name
+
+
+@auto_docstring
+class InformerPreTrainedModel(PreTrainedModel):  # shared base: class-level settings plus the attention-mask helpers
+    config: InformerConfig
+    base_model_prefix = "model"
+    main_input_name = "past_values"
+    supports_gradient_checkpointing = True
+
+    def _init_weights(self, module: nn.Module):  # generic init first, then the positional-embedding-specific one
+        super()._init_weights(module)
+        if isinstance(module, InformerSinusoidalPositionalEmbedding):
+            module._init_weight()  # the positional embedding brings its own initializer
+
+    # Copied from transformers.models.bart.modeling_bart.BartPreTrainedModel._update_full_mask
+    def _update_full_mask(  # non-causal mask, converted to the form the configured attention backend expects
+        self,
+        attention_mask: Union[torch.Tensor, None],
+        inputs_embeds: torch.Tensor,
+    ):
+        if attention_mask is not None:
+            if self.config._attn_implementation == "flash_attention_2":
+                attention_mask = attention_mask if 0 in attention_mask else None  # drop a mask that masks nothing
+            elif self.config._attn_implementation == "sdpa":
+                # output_attentions=True & head_mask can not be supported when using SDPA, fall back to
+                # the manual implementation that requires a 4D causal mask in all cases.
+                # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+                attention_mask = _prepare_4d_attention_mask_for_sdpa(attention_mask, inputs_embeds.dtype)
+            elif self.config._attn_implementation == "flex_attention":
+                if isinstance(attention_mask, torch.Tensor):
+                    attention_mask = make_flex_block_causal_mask(attention_mask, is_causal=False)
+            else:
+                # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+                attention_mask = _prepare_4d_attention_mask(attention_mask, inputs_embeds.dtype)
+
+        return attention_mask  # still None when no mask was given
+
+    # Copied from transformers.models.musicgen.modeling_musicgen.MusicgenDecoder._update_causal_mask
+    def _update_causal_mask(  # decoder self-attention mask, prepared per attention backend
+        self,
+        attention_mask: Union[torch.Tensor, None],
+        input_shape: torch.Size,
+        inputs_embeds: torch.Tensor,
+        past_key_values_length: int,
+    ):
+        if self.config._attn_implementation == "flash_attention_2":
+            # 2d mask is passed through the layers
+            attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
+        elif self.config._attn_implementation == "sdpa":
+            # output_attentions=True & cross_attn_head_mask can not be supported when using SDPA, and we fall back on
+            # the manual implementation that requires a 4D causal mask in all cases.
+            attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
+                attention_mask,
+                input_shape,
+                inputs_embeds,
+                past_key_values_length,
+            )
+        elif self.config._attn_implementation == "flex_attention":
+            if isinstance(attention_mask, torch.Tensor):
+                attention_mask = make_flex_block_causal_mask(attention_mask)
+            # Other attention flavors support in-built causal (when `mask is None`)
+            # while we need to create our specific block mask regardless
+            elif attention_mask is None:
+                attention_mask = make_flex_block_causal_mask(
+                    torch.ones(
+                        size=(input_shape),
+                        device=inputs_embeds.device,
+                    )
+                )
+        else:
+            # 4d mask is passed through the layers
+            attention_mask = _prepare_4d_causal_attention_mask(
+                attention_mask, input_shape, inputs_embeds, past_key_values_length
+            )
+
+        return attention_mask
+
+    # Copied from transformers.models.musicgen.modeling_musicgen.MusicgenDecoder._update_cross_attn_mask
+    def _update_cross_attn_mask(  # encoder mask expanded for cross-attention; query length is input_shape[-1]
+        self,
+        encoder_hidden_states: Union[torch.Tensor, None],
+        encoder_attention_mask: Union[torch.Tensor, None],
+        input_shape: torch.Size,
+        inputs_embeds: torch.Tensor,
+    ):
+        # expand encoder attention mask
+        if encoder_hidden_states is not None and encoder_attention_mask is not None:
+            if self.config._attn_implementation == "flash_attention_2":
+                encoder_attention_mask = encoder_attention_mask if 0 in encoder_attention_mask else None
+            elif self.config._attn_implementation == "sdpa":
+                # output_attentions=True & cross_attn_head_mask can not be supported when using SDPA, and we fall back on
+                # the manual implementation that requires a 4D causal mask in all cases.
+                # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+                encoder_attention_mask = _prepare_4d_attention_mask_for_sdpa(
+                    encoder_attention_mask,
+                    inputs_embeds.dtype,
+                    tgt_len=input_shape[-1],
+                )
+            elif self.config._attn_implementation == "flex_attention":
+                if isinstance(encoder_attention_mask, torch.Tensor):
+                    encoder_attention_mask = make_flex_block_causal_mask(
+                        encoder_attention_mask,
+                        query_length=input_shape[-1],
+                        is_causal=False,
+                    )
+            else:
+                # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+                encoder_attention_mask = _prepare_4d_attention_mask(
+                    encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]
+                )
+
+        return encoder_attention_mask  # returned untouched when either argument is None
+
+
+class InformerAttention(BartAttention):
+    pass  # no overrides: BartAttention under an Informer-specific name
+
+
+class InformerProbSparseAttention(nn.Module):
+    """ProbSparse attention: every query is scored against a random sample of keys, only the top-`u` "active"
+    queries get real attention, and the remaining "lazy" positions are filled with a cheap context (mean of the
+    values, or their cumulative sum when `is_decoder=True`), mitigating the quadratic cost of vanilla attention."""
+
+    def __init__(
+        self,
+        embed_dim: int,
+        num_heads: int,
+        dropout: float = 0.0,
+        is_decoder: bool = False,  # selects the cumulative-sum context instead of the mean context in forward
+        sampling_factor: int = 5,  # scales how many keys are sampled and how many queries are kept
+        bias: bool = True,
+        layer_idx: Optional[int] = None,  # index of this layer inside the key/value cache
+    ):
+        super().__init__()
+        self.factor = sampling_factor
+        self.embed_dim = embed_dim
+        self.num_heads = num_heads
+        self.dropout = dropout
+        self.head_dim = embed_dim // num_heads
+
+        if (self.head_dim * num_heads) != self.embed_dim:  # embed_dim has to split evenly across the heads
+            raise ValueError(
+                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
+                f" and `num_heads`: {num_heads})."
+            )
+        self.scaling = self.head_dim**-0.5  # 1/sqrt(head_dim), applied to the projected queries in forward
+        self.is_decoder = is_decoder
+        self.layer_idx = layer_idx
+
+        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+
+    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):  # -> (bsz, num_heads, seq_len, head_dim)
+        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        key_value_states: Optional[torch.Tensor] = None,
+        past_key_values: Optional[tuple[torch.Tensor]] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        layer_head_mask: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+        cache_position: Optional[torch.Tensor] = None,
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        """Input shape: Batch x Time x Channel. Returns `(attn_output, attn_weights or None)`; raises ValueError on shape mismatches."""
+
+        # if key_value_states are provided this layer is used as a cross-attention layer
+        # for the decoder
+        is_cross_attention = key_value_states is not None
+
+        bsz, tgt_len, _ = hidden_states.size()
+        src_len = key_value_states.shape[1] if is_cross_attention else tgt_len
+        kv_input_shape = (bsz, src_len, -1, self.head_dim)
+
+        # get query proj
+        query_states = self.q_proj(hidden_states) * self.scaling
+        is_updated = False  # default for plain caches / no cache, so the reuse check below never reads an unbound name
+        if past_key_values is not None:
+            if isinstance(past_key_values, EncoderDecoderCache):
+                is_updated = past_key_values.is_updated.get(self.layer_idx)
+                if is_cross_attention:
+                    # after the first generated id, we can subsequently re-use all key/value_states from cache
+                    curr_past_key_value = past_key_values.cross_attention_cache
+                else:
+                    curr_past_key_value = past_key_values.self_attention_cache
+            else:
+                curr_past_key_value = past_key_values
+
+        current_states = key_value_states if is_cross_attention else hidden_states
+        if is_cross_attention and past_key_values is not None and is_updated:
+            # reuse k,v, cross_attentions
+            key_states = curr_past_key_value.layers[self.layer_idx].keys
+            value_states = curr_past_key_value.layers[self.layer_idx].values
+        else:
+            key_states = self.k_proj(current_states)
+            value_states = self.v_proj(current_states)
+            key_states = key_states.view(*kv_input_shape).transpose(1, 2)
+            value_states = value_states.view(*kv_input_shape).transpose(1, 2)
+
+            if past_key_values is not None:
+                # save all key/value_states to cache to be re-used for fast auto-regressive generation
+                cache_position = cache_position if not is_cross_attention else None
+                key_states, value_states = curr_past_key_value.update(
+                    key_states, value_states, self.layer_idx, {"cache_position": cache_position}
+                )
+                # set flag that curr layer for cross-attn is already updated so we can re-use in subsequent calls
+                if is_cross_attention and isinstance(past_key_values, EncoderDecoderCache):  # only this cache has the flag
+                    past_key_values.is_updated[self.layer_idx] = True
+
+        proj_shape = (bsz * self.num_heads, -1, self.head_dim)  # heads are folded into the batch dimension for bmm
+        query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
+        key_states = key_states.reshape(*proj_shape)
+        value_states = value_states.reshape(*proj_shape)
+
+        key_states_time_length = key_states.size(1)  # L_K
+        log_key_states_time_length = np.ceil(np.log1p(key_states_time_length)).astype("int").item()  # log_L_K
+
+        query_states_time_length = query_states.size(1)  # L_Q
+        log_query_states_time_length = np.ceil(np.log1p(query_states_time_length)).astype("int").item()  # log_L_Q
+
+        u_part = min(self.factor * query_states_time_length * log_key_states_time_length, key_states_time_length)  # keys sampled
+        u = min(self.factor * log_query_states_time_length, query_states_time_length)  # queries kept
+
+        if key_states_time_length > 0:
+            index_sample = torch.randint(0, key_states_time_length, (u_part,))  # with replacement, shared by all rows
+            k_sample = key_states[:, index_sample, :]
+        else:
+            k_sample = key_states
+
+        queries_keys_sample = torch.bmm(query_states, k_sample.transpose(1, 2))  # Q_K_sampled
+
+        # find the Top_k query with sparsity measurement
+        if u > 0:
+            sparsity_measurement = queries_keys_sample.max(dim=-1)[0] - torch.div(
+                queries_keys_sample.sum(dim=-1), key_states_time_length
+            )  # M
+            top_u_sparsity_measurement = sparsity_measurement.topk(u, sorted=False)[1]  # M_top
+
+            # calculate q_reduce: query_states[:, top_u_sparsity_measurement]
+            dim_for_slice = torch.arange(query_states.size(0)).unsqueeze(-1)
+            q_reduce = query_states[dim_for_slice, top_u_sparsity_measurement]
+        else:
+            q_reduce = query_states
+            top_u_sparsity_measurement = None  # nothing selected: the later gather/scatter steps are skipped
+
+        # Use q_reduce to calculate attention weights
+        attn_weights = torch.bmm(q_reduce, key_states.transpose(1, 2))
+
+        src_len = key_states.size(1)  # re-read: the cache may have changed the key length
+        if attn_weights.size() != (bsz * self.num_heads, u, src_len):
+            raise ValueError(
+                f"Attention weights should be of size {(bsz * self.num_heads, u, src_len)}, but is"
+                f" {attn_weights.size()}"
+            )
+
+        if attention_mask is not None:
+            if attention_mask.size() != (bsz, 1, tgt_len, src_len):
+                raise ValueError(
+                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
+                )
+            prob_mask = attention_mask.expand(bsz, self.num_heads, tgt_len, src_len).reshape(
+                bsz * self.num_heads, tgt_len, src_len
+            )
+
+            if top_u_sparsity_measurement is not None:
+                dim_for_slice = torch.arange(prob_mask.size(0)).unsqueeze(-1)
+                prob_mask = prob_mask[dim_for_slice, top_u_sparsity_measurement, :]  # keep mask rows of the kept queries
+
+            attn_weights = attn_weights.view(bsz, self.num_heads, u, src_len) + prob_mask.view(
+                bsz, self.num_heads, u, src_len
+            )
+            attn_weights = attn_weights.view(bsz * self.num_heads, u, src_len)
+
+        attn_weights = nn.functional.softmax(attn_weights, dim=-1)
+
+        if layer_head_mask is not None:
+            if layer_head_mask.size() != (self.num_heads,):
+                raise ValueError(
+                    f"Head mask for a single layer should be of size {(self.num_heads,)}, but is"
+                    f" {layer_head_mask.size()}"
+                )
+            attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, u, src_len)
+            attn_weights = attn_weights.view(bsz * self.num_heads, u, src_len)
+
+        if output_attentions:
+            # this operation is a bit awkward, but it's required to
+            # make sure that attn_weights keeps its gradient.
+            # In order to do so, attn_weights have to be reshaped
+            # twice and have to be reused in the following
+            attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, u, src_len)
+            attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, u, src_len)
+        else:
+            attn_weights_reshaped = None
+
+        attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
+        attn_output = torch.bmm(attn_probs, value_states)
+
+        # calculate context for updating the attn_output, based on:
+        # https://github.com/zhouhaoyi/Informer2020/blob/ac59c7447135473fb2aafeafe94395f884d5c7a5/models/attn.py#L74
+        if self.is_decoder:
+            # cast to float32 before operation to avoid overflow
+            context = value_states.cumsum(dim=-2, dtype=torch.float32).to(value_states.dtype)
+        else:
+            v_mean_dim_time = value_states.mean(dim=-2)
+            context = (
+                v_mean_dim_time.unsqueeze(dim=1)
+                .expand(bsz * self.num_heads, query_states_time_length, v_mean_dim_time.size(-1))
+                .clone()  # expand() returns a view; clone so the in-place write below has its own storage
+            )
+
+        if top_u_sparsity_measurement is not None:
+            # update context: copy the attention output to the context at top_u_sparsity_measurement index
+            dim_for_slice = torch.arange(context.size(0)).unsqueeze(-1)
+            context[dim_for_slice, top_u_sparsity_measurement, :] = attn_output
+            attn_output = context
+
+        if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
+            raise ValueError(
+                f"`attn_output` should be of size {(bsz * self.num_heads, tgt_len, self.head_dim)}, but is"
+                f" {attn_output.size()}"
+            )
+
+        attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
+        attn_output = attn_output.transpose(1, 2)
+
+        # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be
+        # partitioned across GPUs when using tensor-parallelism.
+        attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)
+
+        attn_output = self.out_proj(attn_output)
+
+        return attn_output, attn_weights_reshaped  # the second item is None unless output_attentions=True
+
+
+# source: https://github.com/zhouhaoyi/Informer2020/blob/main/models/encoder.py
+class InformerConvLayer(GradientCheckpointingLayer):
+    """Distilling block: circular Conv1d -> BatchNorm1d -> ELU -> stride-2 MaxPool1d, applied along the time axis."""
+
+    def __init__(self, c_in):
+        super().__init__()
+        # the convolution keeps the channel count (c_in -> c_in); kernel 3 with circular padding 1
+        self.downConv = nn.Conv1d(
+            c_in, c_in, kernel_size=3, padding=1, padding_mode="circular"
+        )
+        self.norm = nn.BatchNorm1d(c_in)
+        self.activation = nn.ELU()
+        self.maxPool = nn.MaxPool1d(kernel_size=3, stride=2, padding=1)
+
+    def forward(self, x):
+        """Run the block on `x` (assumed `(batch, time, c_in)` - TODO confirm) and return the pooled sequence."""
+        channels_first = x.permute(0, 2, 1)  # the 1d layers expect the channel axis before the time axis
+        convolved = self.downConv(channels_first)
+        normalized = self.norm(convolved)
+        activated = self.activation(normalized)
+        pooled = self.maxPool(activated)
+        return pooled.transpose(1, 2)  # restore the original axis order
+
+
+class InformerEncoderLayer(TimeSeriesTransformerEncoderLayer):  # parent layer with a config-selected self-attention
+    def __init__(self, config: InformerConfig):
+        super().__init__(config)
+
+        del self.self_attn  # discard the attention module left by the parent constructor; it is rebuilt below
+
+        if config.attention_type == "prob":  # ProbSparse self-attention
+            self.self_attn = InformerProbSparseAttention(
+                embed_dim=self.embed_dim,
+                num_heads=config.encoder_attention_heads,
+                dropout=config.attention_dropout,
+                sampling_factor=config.sampling_factor,
+            )
+        else:  # every other attention_type value uses the BartAttention-based InformerAttention
+            self.self_attn = InformerAttention(
+                embed_dim=self.embed_dim,
+                num_heads=config.encoder_attention_heads,
+                dropout=config.attention_dropout,
+                config=config,
+            )
+
+
+class InformerDecoderLayer(TimeSeriesTransformerDecoderLayer):  # parent layer with a config-selected self-attention
+    def __init__(self, config: InformerConfig, layer_idx: Optional[int] = None):
+        super().__init__(config)  # NOTE(review): layer_idx is not forwarded to the parent here - confirm intended
+
+        del self.self_attn  # discard the attention module left by the parent constructor; it is rebuilt below
+
+        if config.attention_type == "prob":  # ProbSparse self-attention, built in decoder mode
+            self.self_attn = InformerProbSparseAttention(
+                embed_dim=self.embed_dim,
+                num_heads=config.decoder_attention_heads,
+                dropout=config.attention_dropout,
+                sampling_factor=config.sampling_factor,
+                is_decoder=True,
+                layer_idx=layer_idx,
+            )
+        else:  # every other attention_type value uses the BartAttention-based InformerAttention
+            self.self_attn = InformerAttention(
+                embed_dim=self.embed_dim,
+                num_heads=config.decoder_attention_heads,
+                dropout=config.attention_dropout,
+                is_decoder=True,
+                config=config,
+                layer_idx=layer_idx,
+            )
+
+
+class InformerEncoder(TimeSeriesTransformerEncoder):  # encoder stack with optional distilling conv between layers
+    def __init__(self, config: InformerConfig):
+        super().__init__(config)
+
+        self.dropout = config.dropout
+        self.layerdrop = config.encoder_layerdrop
+        self.gradient_checkpointing = False
+        if config.prediction_length is None:
+            raise ValueError("The `prediction_length` config needs to be specified.")
+
+        self.value_embedding = InformerValueEmbedding(feature_size=config.feature_size, d_model=config.d_model)
+        self.embed_positions = InformerSinusoidalPositionalEmbedding(
+            config.context_length + config.prediction_length, config.d_model
+        )
+        self.layers = nn.ModuleList([InformerEncoderLayer(config) for _ in range(config.encoder_layers)])
+        self.layernorm_embedding = nn.LayerNorm(config.d_model)
+
+        if config.distil:
+            self.conv_layers = nn.ModuleList(
+                [InformerConvLayer(config.d_model) for _ in range(config.encoder_layers - 1)]
+            )
+            self.conv_layers.append(None)  # the last encoder layer is not followed by a distilling conv
+        else:
+            self.conv_layers = [None] * config.encoder_layers  # no distilling: one None placeholder per layer
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def forward(
+        self,
+        attention_mask: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, BaseModelOutput]:
+        r"""
+        Args:
+            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
+
+                [What are attention masks?](../glossary#attention-mask)
+            head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
+                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
+
+                - 1 indicates the head is **not masked**,
+                - 0 indicates the head is **masked**.
+
+            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
+                than the model's internal embedding lookup matrix.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            output_hidden_states (`bool`, *optional*):
+                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+                for more detail.
+            return_dict (`bool`, *optional*):
+                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        hidden_states = self.value_embedding(inputs_embeds)  # inputs_embeds is required here despite its None default
+        embed_pos = self.embed_positions(inputs_embeds.size())
+
+        hidden_states = self.layernorm_embedding(hidden_states + embed_pos)
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+
+        # expand attention_mask
+        if attention_mask is not None:
+            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+            attention_mask = _prepare_4d_attention_mask(attention_mask, inputs_embeds.dtype)
+
+        encoder_states = () if output_hidden_states else None
+        all_attentions = () if output_attentions else None
+
+        # check if head_mask has a correct number of layers specified if desired
+        if head_mask is not None:
+            if head_mask.size()[0] != (len(self.layers)):
+                raise ValueError(
+                    f"The head_mask should be specified for {len(self.layers)} layers, but it is for"
+                    f" {head_mask.size()[0]}."
+                )
+
+        for idx, (encoder_layer, conv_layer) in enumerate(zip(self.layers, self.conv_layers)):
+            if output_hidden_states:
+                encoder_states = encoder_states + (hidden_states,)
+            # add LayerDrop (see https://huggingface.co/papers/1909.11556 for description)
+            to_drop = False
+            if self.training:
+                dropout_probability = torch.rand([])  # one scalar draw per layer, in training mode only
+                if dropout_probability < self.layerdrop:  # skip the layer
+                    to_drop = True
+
+            if to_drop:
+                layer_outputs = (None, None)  # skipped layer: hidden_states stay as they are, no conv is applied
+            else:
+                layer_outputs = encoder_layer(
+                    hidden_states,
+                    attention_mask,
+                    layer_head_mask=(head_mask[idx] if head_mask is not None else None),
+                    output_attentions=output_attentions,
+                )
+                if conv_layer is not None:
+                    output = conv_layer(layer_outputs[0])  # NOTE(review): pooling shortens the sequence but attention_mask keeps its size - confirm
+                    layer_outputs = (output,) + layer_outputs[1:]
+
+                hidden_states = layer_outputs[0]
+
+            if output_attentions:
+                all_attentions = all_attentions + (layer_outputs[1],)  # None is recorded for a dropped layer
+
+        if output_hidden_states:
+            encoder_states = encoder_states + (hidden_states,)
+
+        if not return_dict:
+            return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)  # omits unrequested entries
+        return BaseModelOutput(
+            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
+        )
+
+
+class InformerDecoder(TimeSeriesTransformerDecoder):  # only the constructor is overridden in this class
+    def __init__(self, config: InformerConfig):
+        super().__init__(config)
+        self.dropout = config.dropout
+        self.layerdrop = config.decoder_layerdrop
+        if config.prediction_length is None:
+            raise ValueError("The `prediction_length` config needs to be specified.")
+
+        self.value_embedding = InformerValueEmbedding(feature_size=config.feature_size, d_model=config.d_model)
+        self.embed_positions = InformerSinusoidalPositionalEmbedding(
+            config.context_length + config.prediction_length, config.d_model
+        )  # the first argument covers the context steps plus the prediction steps
+        self.layers = nn.ModuleList([InformerDecoderLayer(config, layer_idx=i) for i in range(config.decoder_layers)])
+        self.layernorm_embedding = nn.LayerNorm(config.d_model)
+
+        self.gradient_checkpointing = False
+        # Initialize weights and apply final processing
+        self.post_init()
+
+
+class InformerModel(TimeSeriesTransformerModel):
+    def __init__(self, config: InformerConfig):
+        PreTrainedModel.__init__(self, config)
+
+        if config.scaling == "mean" or config.scaling is True:
+            self.scaler = InformerMeanScaler(config)
+        elif config.scaling == "std":
+            self.scaler = InformerStdScaler(config)
+        else:
+            self.scaler = InformerNOPScaler(config)
+
+        if config.num_static_categorical_features > 0:
+            self.embedder = InformerFeatureEmbedder(
+                cardinalities=config.cardinality,
+                embedding_dims=config.embedding_dimension,
+            )
+
+        # transformer encoder-decoder and mask initializer
+        self.encoder = InformerEncoder(config)
+        self.decoder = InformerDecoder(config)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def forward(self, **super_kwargs):
+        r"""
+        past_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)` or `(batch_size, sequence_length, input_size)`):
+            Past values of the time series, that serve as context in order to predict the future. The sequence size of
+            this tensor must be larger than the `context_length` of the model, since the model will use the larger size
+            to construct lag features, i.e. additional values from the past which are added in order to serve as "extra
+            context".
+
+            The `sequence_length` here is equal to `config.context_length` + `max(config.lags_sequence)`, which if no
+            `lags_sequence` is configured, is equal to `config.context_length` + 7 (as by default, the largest
+            look-back index in `config.lags_sequence` is 7). The property `_past_length` returns the actual length of
+            the past.
+
+            The `past_values` is what the Transformer encoder gets as input (with optional additional features, such as
+            `static_categorical_features`, `static_real_features`, `past_time_features` and lags).
+
+            Optionally, missing values need to be replaced with zeros and indicated via the `past_observed_mask`.
+
+            For multivariate time series, the `input_size` > 1 dimension is required and corresponds to the number of
+            variates in the time series per time step.
+        past_time_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_features)`):
+            Required time features, which the model internally will add to `past_values`. These could be things like
+            "month of year", "day of the month", etc. encoded as vectors (for instance as Fourier features). These
+            could also be so-called "age" features, which basically help the model know "at which point in life" a
+            time-series is. Age features have small values for distant past time steps and increase monotonically the
+            more we approach the current time step. Holiday features are also a good example of time features.
+
+            These features serve as the "positional encodings" of the inputs. So contrary to a model like BERT, where
+            the position encodings are learned from scratch internally as parameters of the model, the Time Series
+            Transformer requires additional time features to be provided. The Time Series Transformer only learns
+            additional embeddings for `static_categorical_features`.
+
+            Additional dynamic real covariates can be concatenated to this tensor, with the caveat that these features
+            must be known at prediction time.
+
+            The `num_features` here is equal to `config.num_time_features` + `config.num_dynamic_real_features`.
+        past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length)` or `(batch_size, sequence_length, input_size)`, *optional*):
+            Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected in
+            `[0, 1]`:
+
+            - 1 for values that are **observed**,
+            - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).
+        static_categorical_features (`torch.LongTensor` of shape `(batch_size, number of static categorical features)`, *optional*):
+            Optional static categorical features for which the model will learn an embedding, which it will add to the
+            values of the time series.
+
+            Static categorical features are features which have the same value for all time steps (static over time).
+
+            A typical example of a static categorical feature is a time series ID.
+        static_real_features (`torch.FloatTensor` of shape `(batch_size, number of static real features)`, *optional*):
+            Optional static real features which the model will add to the values of the time series.
+
+            Static real features are features which have the same value for all time steps (static over time).
+
+            A typical example of a static real feature is promotion information.
+        future_values (`torch.FloatTensor` of shape `(batch_size, prediction_length)` or `(batch_size, prediction_length, input_size)`, *optional*):
+            Future values of the time series, that serve as labels for the model. The `future_values` is what the
+            Transformer needs during training to learn to output, given the `past_values`.
+
+            The sequence length here is equal to `prediction_length`.
+
+            See the demo notebook and code snippets for details.
+
+            Optionally, during training any missing values need to be replaced with zeros and indicated via the
+            `future_observed_mask`.
+
+            For multivariate time series, the `input_size` > 1 dimension is required and corresponds to the number of
+            variates in the time series per time step.
+        future_time_features (`torch.FloatTensor` of shape `(batch_size, prediction_length, num_features)`):
+            Required time features for the prediction window, which the model internally will add to `future_values`.
+            These could be things like "month of year", "day of the month", etc. encoded as vectors (for instance as
+            Fourier features). These could also be so-called "age" features, which basically help the model know "at
+            which point in life" a time-series is. Age features have small values for distant past time steps and
+            increase monotonically the more we approach the current time step. Holiday features are also a good example
+            of time features.
+
+            These features serve as the "positional encodings" of the inputs. So contrary to a model like BERT, where
+            the position encodings are learned from scratch internally as parameters of the model, the Time Series
+            Transformer requires additional time features to be provided. The Time Series Transformer only learns
+            additional embeddings for `static_categorical_features`.
+
+            Additional dynamic real covariates can be concatenated to this tensor, with the caveat that these features
+            must be known at prediction time.
+
+            The `num_features` here is equal to `config.num_time_features` + `config.num_dynamic_real_features`.
+        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+        encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*):
+            Tuple consists of `last_hidden_state`, `hidden_states` (*optional*) and `attentions` (*optional*).
+            `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)` (*optional*) is a sequence of
+            hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder.
+
+        Examples:
+
+        ```python
+        >>> from huggingface_hub import hf_hub_download
+        >>> import torch
+        >>> from transformers import InformerModel
+
+        >>> file = hf_hub_download(
+        ...     repo_id="hf-internal-testing/tourism-monthly-batch", filename="train-batch.pt", repo_type="dataset"
+        ... )
+        >>> batch = torch.load(file)
+
+        >>> model = InformerModel.from_pretrained("huggingface/informer-tourism-monthly")
+
+        >>> # during training, one provides both past and future values
+        >>> # as well as possible additional features
+        >>> outputs = model(
+        ...     past_values=batch["past_values"],
+        ...     past_time_features=batch["past_time_features"],
+        ...     past_observed_mask=batch["past_observed_mask"],
+        ...     static_categorical_features=batch["static_categorical_features"],
+        ...     static_real_features=batch["static_real_features"],
+        ...     future_values=batch["future_values"],
+        ...     future_time_features=batch["future_time_features"],
+        ... )
+
+        >>> last_hidden_state = outputs.last_hidden_state
+        ```"""
+        super().forward(**super_kwargs)  # NOTE(review): the parent's result is not returned here — confirm this is intentional
+
+
+class InformerForPrediction(TimeSeriesTransformerForPrediction):
+    def __init__(self, config: InformerConfig):
+        PreTrainedModel.__init__(self, config)  # invokes PreTrainedModel's initializer directly, not super().__init__()
+
+        self.model = InformerModel(config)
+        if config.distribution_output == "student_t":  # select the distribution-output object named in the config
+            self.distribution_output = StudentTOutput(dim=config.input_size)
+        elif config.distribution_output == "normal":
+            self.distribution_output = NormalOutput(dim=config.input_size)
+        elif config.distribution_output == "negative_binomial":
+            self.distribution_output = NegativeBinomialOutput(dim=config.input_size)
+        else:
+            raise ValueError(f"Unknown distribution output {config.distribution_output}")
+
+        self.parameter_projection = self.distribution_output.get_parameter_projection(self.model.config.d_model)
+        self.target_shape = self.distribution_output.event_shape
+
+        if config.loss == "nll":  # "nll" is the only loss name accepted; any other value raises below
+            self.loss = nll
+        else:
+            raise ValueError(f"Unknown loss function {config.loss}")
+
+        # Initialize weights of distribution_output and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(self, **super_kwargs):
+        r"""
+        past_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)` or `(batch_size, sequence_length, input_size)`):
+            Past values of the time series, that serve as context in order to predict the future. The sequence size of
+            this tensor must be larger than the `context_length` of the model, since the model will use the larger size
+            to construct lag features, i.e. additional values from the past which are added in order to serve as "extra
+            context".
+
+            The `sequence_length` here is equal to `config.context_length` + `max(config.lags_sequence)`, which if no
+            `lags_sequence` is configured, is equal to `config.context_length` + 7 (as by default, the largest
+            look-back index in `config.lags_sequence` is 7). The property `_past_length` returns the actual length of
+            the past.
+
+            The `past_values` is what the Transformer encoder gets as input (with optional additional features, such as
+            `static_categorical_features`, `static_real_features`, `past_time_features` and lags).
+
+            Optionally, missing values need to be replaced with zeros and indicated via the `past_observed_mask`.
+
+            For multivariate time series, the `input_size` > 1 dimension is required and corresponds to the number of
+            variates in the time series per time step.
+        past_time_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_features)`):
+            Required time features, which the model internally will add to `past_values`. These could be things like
+            "month of year", "day of the month", etc. encoded as vectors (for instance as Fourier features). These
+            could also be so-called "age" features, which basically help the model know "at which point in life" a
+            time-series is. Age features have small values for distant past time steps and increase monotonically the
+            more we approach the current time step. Holiday features are also a good example of time features.
+
+            These features serve as the "positional encodings" of the inputs. So contrary to a model like BERT, where
+            the position encodings are learned from scratch internally as parameters of the model, the Time Series
+            Transformer requires additional time features to be provided. The Time Series Transformer only learns
+            additional embeddings for `static_categorical_features`.
+
+            Additional dynamic real covariates can be concatenated to this tensor, with the caveat that these features
+            must be known at prediction time.
+
+            The `num_features` here is equal to `config.num_time_features` + `config.num_dynamic_real_features`.
+        past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length)` or `(batch_size, sequence_length, input_size)`, *optional*):
+            Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected in
+            `[0, 1]`:
+
+            - 1 for values that are **observed**,
+            - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).
+        static_categorical_features (`torch.LongTensor` of shape `(batch_size, number of static categorical features)`, *optional*):
+            Optional static categorical features for which the model will learn an embedding, which it will add to the
+            values of the time series.
+
+            Static categorical features are features which have the same value for all time steps (static over time).
+
+            A typical example of a static categorical feature is a time series ID.
+        static_real_features (`torch.FloatTensor` of shape `(batch_size, number of static real features)`, *optional*):
+            Optional static real features which the model will add to the values of the time series.
+
+            Static real features are features which have the same value for all time steps (static over time).
+
+            A typical example of a static real feature is promotion information.
+        future_values (`torch.FloatTensor` of shape `(batch_size, prediction_length)` or `(batch_size, prediction_length, input_size)`, *optional*):
+            Future values of the time series, that serve as labels for the model. The `future_values` is what the
+            Transformer needs during training to learn to output, given the `past_values`.
+
+            The sequence length here is equal to `prediction_length`.
+
+            See the demo notebook and code snippets for details.
+
+            Optionally, during training any missing values need to be replaced with zeros and indicated via the
+            `future_observed_mask`.
+
+            For multivariate time series, the `input_size` > 1 dimension is required and corresponds to the number of
+            variates in the time series per time step.
+        future_time_features (`torch.FloatTensor` of shape `(batch_size, prediction_length, num_features)`):
+            Required time features for the prediction window, which the model internally will add to `future_values`.
+            These could be things like "month of year", "day of the month", etc. encoded as vectors (for instance as
+            Fourier features). These could also be so-called "age" features, which basically help the model know "at
+            which point in life" a time-series is. Age features have small values for distant past time steps and
+            increase monotonically the more we approach the current time step. Holiday features are also a good example
+            of time features.
+
+            These features serve as the "positional encodings" of the inputs. So contrary to a model like BERT, where
+            the position encodings are learned from scratch internally as parameters of the model, the Time Series
+            Transformer requires additional time features to be provided. The Time Series Transformer only learns
+            additional embeddings for `static_categorical_features`.
+
+            Additional dynamic real covariates can be concatenated to this tensor, with the caveat that these features
+            must be known at prediction time.
+
+            The `num_features` here is equal to `config.num_time_features` + `config.num_dynamic_real_features`.
+        future_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length)` or `(batch_size, sequence_length, input_size)`, *optional*):
+            Boolean mask to indicate which `future_values` were observed and which were missing. Mask values selected
+            in `[0, 1]`:
+
+            - 1 for values that are **observed**,
+            - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).
+
+            This mask is used to filter out missing values for the final loss calculation.
+        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+        encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*):
+            Tuple consists of `last_hidden_state`, `hidden_states` (*optional*) and `attentions` (*optional*).
+            `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)` (*optional*) is a sequence of
+            hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder.
+
+        Examples:
+
+        ```python
+        >>> from huggingface_hub import hf_hub_download
+        >>> import torch
+        >>> from transformers import InformerForPrediction
+
+        >>> file = hf_hub_download(
+        ...     repo_id="hf-internal-testing/tourism-monthly-batch", filename="train-batch.pt", repo_type="dataset"
+        ... )
+        >>> batch = torch.load(file)
+
+        >>> model = InformerForPrediction.from_pretrained(
+        ...     "huggingface/informer-tourism-monthly"
+        ... )
+
+        >>> # during training, one provides both past and future values
+        >>> # as well as possible additional features
+        >>> outputs = model(
+        ...     past_values=batch["past_values"],
+        ...     past_time_features=batch["past_time_features"],
+        ...     past_observed_mask=batch["past_observed_mask"],
+        ...     static_categorical_features=batch["static_categorical_features"],
+        ...     static_real_features=batch["static_real_features"],
+        ...     future_values=batch["future_values"],
+        ...     future_time_features=batch["future_time_features"],
+        ... )
+
+        >>> loss = outputs.loss
+        >>> loss.backward()
+
+        >>> # during inference, one only provides past values
+        >>> # as well as possible additional features
+        >>> # the model autoregressively generates future values
+        >>> outputs = model.generate(
+        ...     past_values=batch["past_values"],
+        ...     past_time_features=batch["past_time_features"],
+        ...     past_observed_mask=batch["past_observed_mask"],
+        ...     static_categorical_features=batch["static_categorical_features"],
+        ...     static_real_features=batch["static_real_features"],
+        ...     future_time_features=batch["future_time_features"],
+        ... )
+
+        >>> mean_prediction = outputs.sequences.mean(dim=1)
+        ```"""
+        super().forward(**super_kwargs)  # NOTE(review): the parent's result is not returned here — confirm this is intentional
+
+
+__all__ = ["InformerForPrediction", "InformerModel", "InformerPreTrainedModel"]  # names exported by a star-import
diff --git a/phivenv/Lib/site-packages/transformers/models/instructblip/__init__.py b/phivenv/Lib/site-packages/transformers/models/instructblip/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..ed2bd053d2c8b9fdaec4211b2b74e964bb88a5e3
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/instructblip/__init__.py
@@ -0,0 +1,28 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:  # static type checkers only: star-import the submodules eagerly
+    from .configuration_instructblip import *
+    from .modeling_instructblip import *
+    from .processing_instructblip import *
+else:  # at runtime: replace this package's entry in sys.modules with a _LazyModule built from this file
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/phivenv/Lib/site-packages/transformers/models/instructblip/__pycache__/__init__.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/instructblip/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e069798f6ba2ea479783ff43aa72949293064827
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/instructblip/__pycache__/__init__.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/instructblip/__pycache__/configuration_instructblip.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/instructblip/__pycache__/configuration_instructblip.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cf8d34ff2c79cd34564b11953fd3da96d3a21e31
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/instructblip/__pycache__/configuration_instructblip.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/instructblip/__pycache__/modeling_instructblip.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/instructblip/__pycache__/modeling_instructblip.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e426fed735a26d2fdefc222bc9e20a4a0106fa7b
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/instructblip/__pycache__/modeling_instructblip.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/instructblip/__pycache__/processing_instructblip.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/instructblip/__pycache__/processing_instructblip.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..084b1b5e363311d1da35e75cabdef8f6ddc7db0e
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/instructblip/__pycache__/processing_instructblip.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/instructblip/configuration_instructblip.py b/phivenv/Lib/site-packages/transformers/models/instructblip/configuration_instructblip.py
new file mode 100644
index 0000000000000000000000000000000000000000..9b8323f15f0525887d178239b6b221dbb488d952
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/instructblip/configuration_instructblip.py
@@ -0,0 +1,339 @@
+# coding=utf-8
+# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""InstructBLIP model configuration"""
+
+from ...configuration_utils import PretrainedConfig
+from ...models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
+from ...utils import logging
+from ..auto import CONFIG_MAPPING, AutoConfig
+
+
+logger = logging.get_logger(__name__)  # module-level logger named after this module
+
+
+class InstructBlipVisionConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`InstructBlipVisionModel`]. It is used to
+    instantiate an InstructBLIP vision encoder according to the specified arguments, defining the model architecture.
+    Instantiating a configuration with the defaults will yield a similar configuration to that of the InstructBLIP
+    [Salesforce/instruct-blip-flan-t5](https://huggingface.co/Salesforce/instruct-blip-flan-t5) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        hidden_size (`int`, *optional*, defaults to 1408):
+            Dimensionality of the encoder layers and the pooler layer.
+        intermediate_size (`int`, *optional*, defaults to 6144):
+            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+        num_hidden_layers (`int`, *optional*, defaults to 39):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        image_size (`int`, *optional*, defaults to 224):
+            The size (resolution) of each image.
+        patch_size (`int`, *optional*, defaults to 14):
+            The size (resolution) of each patch.
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and
+            pooler. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are
+            supported.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-06):
+            The epsilon used by the layer normalization layers.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        initializer_range (`float`, *optional*, defaults to 1e-10):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        qkv_bias (`bool`, *optional*, defaults to `True`):
+            Whether to add a bias to the queries and values in the self-attention layers.
+
+    Example:
+
+    ```python
+    >>> from transformers import InstructBlipVisionConfig, InstructBlipVisionModel
+
+    >>> # Initializing an InstructBlipVisionConfig with Salesforce/instruct-blip-flan-t5 style configuration
+    >>> configuration = InstructBlipVisionConfig()
+
+    >>> # Initializing an InstructBlipVisionModel (with random weights) from the Salesforce/instruct-blip-flan-t5 style configuration
+    >>> model = InstructBlipVisionModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "instructblip_vision_model"
+    base_config_key = "vision_config"
+
+    def __init__(
+        self,
+        hidden_size=1408,
+        intermediate_size=6144,
+        num_hidden_layers=39,
+        num_attention_heads=16,
+        image_size=224,
+        patch_size=14,
+        hidden_act="gelu",
+        layer_norm_eps=1e-6,
+        attention_dropout=0.0,
+        initializer_range=1e-10,
+        qkv_bias=True,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)  # all remaining keyword arguments are handed to the base PretrainedConfig
+
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.patch_size = patch_size
+        self.image_size = image_size
+        self.initializer_range = initializer_range
+        self.attention_dropout = attention_dropout
+        self.layer_norm_eps = layer_norm_eps
+        self.hidden_act = hidden_act
+        self.qkv_bias = qkv_bias
+
+
+class InstructBlipQFormerConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`InstructBlipQFormerModel`]. It is used to
+    instantiate an InstructBLIP Querying Transformer (Q-Former) model according to the specified arguments, defining the
+    model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
+    the InstructBLIP [Salesforce/instruct-blip-flan-t5](https://huggingface.co/Salesforce/instruct-blip-flan-t5)
+    architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs.
+    Read the documentation from [`PretrainedConfig`] for more information.
+
+    Note that [`InstructBlipQFormerModel`] is very similar to [`BertLMHeadModel`] with interleaved cross-attention.
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 30522):
+            Vocabulary size of the Q-Former model. Defines the number of different tokens that can be represented by
+            the `inputs_ids` passed when calling the model.
+        hidden_size (`int`, *optional*, defaults to 768):
+            Dimensionality of the encoder layers and the pooler layer.
+        num_hidden_layers (`int`, *optional*, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 12):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        intermediate_size (`int`, *optional*, defaults to 3072):
+            Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
+        hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"silu"` and `"gelu_new"` are supported.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout ratio for the attention probabilities.
+        max_position_embeddings (`int`, *optional*, defaults to 512):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            just in case (e.g., 512 or 1024 or 2048).
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
+            The epsilon used by the layer normalization layers.
+        pad_token_id (`int`, *optional*, defaults to 0):
+            Token id used for padding sequences.
+        position_embedding_type (`str`, *optional*, defaults to `"absolute"`):
+            Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For
+            positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to
+            [Self-Attention with Relative Position Representations (Shaw et al.)](https://huggingface.co/papers/1803.02155).
+            For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models
+            with Better Relative Position Embeddings (Huang et al.)](https://huggingface.co/papers/2009.13658).
+        cross_attention_frequency (`int`, *optional*, defaults to 2):
+            The frequency of adding cross-attention to the Transformer layers.
+        encoder_hidden_size (`int`, *optional*, defaults to 1408):
+            The hidden size of the hidden states for cross-attention.
+
+    Examples:
+
+    ```python
+    >>> from transformers import InstructBlipQFormerConfig, InstructBlipQFormerModel
+
+    >>> # Initializing an InstructBLIP Salesforce/instruct-blip-flan-t5 style configuration
+    >>> configuration = InstructBlipQFormerConfig()
+
+    >>> # Initializing a model (with random weights) from the Salesforce/instruct-blip-flan-t5 style configuration
+    >>> model = InstructBlipQFormerModel(configuration)
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "instructblip_qformer"
+    base_config_key = "qformer_config"
+
+    def __init__(
+        self,
+        vocab_size=30522,
+        hidden_size=768,
+        num_hidden_layers=12,
+        num_attention_heads=12,
+        intermediate_size=3072,
+        hidden_act="gelu",
+        hidden_dropout_prob=0.1,
+        attention_probs_dropout_prob=0.1,
+        max_position_embeddings=512,
+        initializer_range=0.02,
+        layer_norm_eps=1e-12,
+        pad_token_id=0,
+        position_embedding_type="absolute",
+        cross_attention_frequency=2,
+        encoder_hidden_size=1408,
+        **kwargs,
+    ):
+        super().__init__(pad_token_id=pad_token_id, **kwargs)  # pad_token_id is forwarded to the base class, not set here
+
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.hidden_act = hidden_act
+        self.intermediate_size = intermediate_size
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.max_position_embeddings = max_position_embeddings
+        self.initializer_range = initializer_range
+        self.layer_norm_eps = layer_norm_eps
+        self.position_embedding_type = position_embedding_type
+        self.cross_attention_frequency = cross_attention_frequency
+        self.encoder_hidden_size = encoder_hidden_size
+
+
+class InstructBlipConfig(PretrainedConfig):
+    r"""
+    [`InstructBlipConfig`] is the configuration class to store the configuration of a
+    [`InstructBlipForConditionalGeneration`]. It is used to instantiate a InstructBLIP model according to the specified
+    arguments, defining the vision model, Q-Former model and language model configs. Instantiating a configuration with
+    the defaults will yield a similar configuration to that of the InstructBLIP
+    [Salesforce/instruct-blip-flan-t5](https://huggingface.co/Salesforce/instruct-blip-flan-t5) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        vision_config (`dict`, *optional*):
+            Dictionary of configuration options used to initialize [`InstructBlipVisionConfig`].
+        qformer_config (`dict`, *optional*):
+            Dictionary of configuration options used to initialize [`InstructBlipQFormerConfig`].
+        text_config (`dict`, *optional*):
+            Dictionary of configuration options used to initialize any [`PretrainedConfig`].
+        num_query_tokens (`int`, *optional*, defaults to 32):
+            The number of query tokens passed through the Transformer.
+
+        image_token_index (`int`, *optional*):
+            Token index of special image token.
+        kwargs (*optional*):
+            Dictionary of keyword arguments.
+
+    Example:
+
+    ```python
+    >>> from transformers import (
+    ...     InstructBlipVisionConfig,
+    ...     InstructBlipQFormerConfig,
+    ...     OPTConfig,
+    ...     InstructBlipConfig,
+    ...     InstructBlipForConditionalGeneration,
+    ... )
+
+    >>> # Initializing a InstructBlipConfig with Salesforce/instruct-blip-flan-t5 style configuration
+    >>> configuration = InstructBlipConfig()
+
+    >>> # Initializing a InstructBlipForConditionalGeneration (with random weights) from the Salesforce/instruct-blip-flan-t5 style configuration
+    >>> model = InstructBlipForConditionalGeneration(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+
+    >>> # We can also initialize a InstructBlipConfig from a InstructBlipVisionConfig, InstructBlipQFormerConfig and any PretrainedConfig
+
+    >>> # Initializing InstructBLIP vision, InstructBLIP Q-Former and language model configurations
+    >>> vision_config = InstructBlipVisionConfig()
+    >>> qformer_config = InstructBlipQFormerConfig()
+    >>> text_config = OPTConfig()
+
+    >>> config = InstructBlipConfig.from_vision_qformer_text_configs(vision_config, qformer_config, text_config)
+    ```"""
+
+    model_type = "instructblip"
+    attribute_map = {
+        "image_token_id": "image_token_index",
+    }
+    sub_configs = {
+        "text_config": AutoConfig,
+        "qformer_config": InstructBlipQFormerConfig,
+        "vision_config": InstructBlipVisionConfig,
+    }
+
+    def __init__(
+        self,
+        vision_config=None,
+        qformer_config=None,
+        text_config=None,
+        num_query_tokens=32,
+        image_token_index=None,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        # A missing sub-config becomes an empty dict, so that sub-config is built from its own defaults.
+        if vision_config is None:
+            vision_config = {}
+            logger.info("vision_config is None. initializing the InstructBlipVisionConfig with default values.")
+
+        if qformer_config is None:
+            qformer_config = {}
+            logger.info("qformer_config is None. Initializing the InstructBlipQFormerConfig with default values.")
+
+        if text_config is None:
+            text_config = {}
+            logger.info("text_config is None. Initializing the text config with default values (`OPTConfig`).")
+
+        self.vision_config = InstructBlipVisionConfig(**vision_config)
+        self.qformer_config = InstructBlipQFormerConfig(**qformer_config)
+        text_model_type = text_config.get("model_type", "opt")  # "opt" when no model_type key is present
+        self.text_config = CONFIG_MAPPING[text_model_type](**text_config)
+
+        self.num_query_tokens = num_query_tokens
+        self.image_token_index = image_token_index
+        self.qformer_config.encoder_hidden_size = self.vision_config.hidden_size  # overrides any passed value
+        self.use_decoder_only_language_model = self.text_config.model_type in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
+        self.initializer_factor = 1.0
+        self.initializer_range = 0.02
+
+    @classmethod
+    def from_vision_qformer_text_configs(
+        cls,
+        vision_config: InstructBlipVisionConfig,
+        qformer_config: InstructBlipQFormerConfig,
+        text_config: PretrainedConfig,
+        **kwargs,
+    ):
+        r"""
+        Instantiate a [`InstructBlipConfig`] (or a derived class) from a InstructBLIP vision model, Q-Former and
+        language model configurations.
+
+        Returns:
+            [`InstructBlipConfig`]: An instance of a configuration object
+        """
+
+        return cls(
+            vision_config=vision_config.to_dict(),
+            qformer_config=qformer_config.to_dict(),
+            text_config=text_config.to_dict(),
+            **kwargs,
+        )
+
+
+__all__ = ["InstructBlipConfig", "InstructBlipQFormerConfig", "InstructBlipVisionConfig"]
diff --git a/phivenv/Lib/site-packages/transformers/models/instructblip/modeling_instructblip.py b/phivenv/Lib/site-packages/transformers/models/instructblip/modeling_instructblip.py
new file mode 100644
index 0000000000000000000000000000000000000000..c01e082ab1691afefeef4b434954e557ca6d0bb5
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/instructblip/modeling_instructblip.py
@@ -0,0 +1,1716 @@
+# coding=utf-8
+# Copyright 2023 The Salesforce Authors and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch InstructBLIP model."""
+
+import math
+from dataclasses import dataclass
+from typing import Any, Callable, Optional, Union
+
+import torch
+import torch.utils.checkpoint
+from torch import nn
+
+from ...activations import ACT2FN
+from ...generation import GenerationMixin
+from ...modeling_flash_attention_utils import FlashAttentionKwargs
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import (
+    BaseModelOutput,
+    BaseModelOutputWithPastAndCrossAttentions,
+    BaseModelOutputWithPooling,
+    BaseModelOutputWithPoolingAndCrossAttentions,
+)
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from ...processing_utils import Unpack
+from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
+from ...utils import ModelOutput, TransformersKwargs, auto_docstring, can_return_tuple, logging, torch_int
+from ..auto import AutoModel, AutoModelForCausalLM, AutoModelForSeq2SeqLM
+from .configuration_instructblip import InstructBlipConfig, InstructBlipQFormerConfig, InstructBlipVisionConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Class defining the outputs of [`InstructBlipForConditionalGeneration`].
+    """
+)
+# Copied from transformers.models.blip_2.modeling_blip_2.Blip2ForConditionalGenerationModelOutput with Blip2->InstructBlip
+class InstructBlipForConditionalGenerationModelOutput(ModelOutput):
+    r"""
+    loss (`torch.FloatTensor`, *optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
+        Language modeling loss from the language model.
+    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
+        Prediction scores of the language modeling head of the language model.
+    vision_outputs (`BaseModelOutputWithPooling`):
+        Outputs of the vision encoder.
+    qformer_outputs (`BaseModelOutputWithPoolingAndCrossAttentions`):
+        Outputs of the Q-Former (Querying Transformer).
+    language_model_outputs (`CausalLMOutputWithPast` or `Seq2SeqLMOutput`):
+        Outputs of the language model.
+    """
+
+    loss: Optional[tuple[torch.FloatTensor]] = None
+    logits: Optional[tuple[torch.FloatTensor]] = None
+    vision_outputs: Optional[torch.FloatTensor] = None
+    qformer_outputs: Optional[tuple[torch.FloatTensor]] = None
+    language_model_outputs: Optional[tuple[torch.FloatTensor]] = None
+
+    def to_tuple(self) -> tuple[Any]:
+        """Return a tuple with one entry per key in `self.keys()`.
+
+        The three sub-model outputs are themselves converted with their own `to_tuple()`.
+        """
+        nested = ("vision_outputs", "qformer_outputs", "language_model_outputs")
+        return tuple(getattr(self, k).to_tuple() if k in nested else self[k] for k in self.keys())
+
+
+# Copied from transformers.models.blip.modeling_blip.BlipVisionEmbeddings with Blip->InstructBlip
+class InstructBlipVisionEmbeddings(nn.Module):
+    def __init__(self, config: InstructBlipVisionConfig):
+        super().__init__()
+        self.config = config
+        self.embed_dim = config.hidden_size
+        self.image_size = config.image_size
+        self.patch_size = config.patch_size
+        # Learned class embedding; forward() prepends it to the patch sequence.
+        self.class_embedding = nn.Parameter(torch.randn(1, 1, self.embed_dim))
+        # Non-overlapping patch projection: kernel size and stride are both `patch_size`.
+        self.patch_embedding = nn.Conv2d(
+            in_channels=3, out_channels=self.embed_dim, kernel_size=self.patch_size, stride=self.patch_size
+        )
+
+        self.num_patches = (self.image_size // self.patch_size) ** 2
+        self.num_positions = self.num_patches + 1
+        # Learned position table: one row per patch plus one for the class embedding.
+        self.position_embedding = nn.Parameter(torch.randn(1, self.num_positions, self.embed_dim))
+
+    def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
+        """
+        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
+        images. This method is also adapted to support torch.jit tracing.
+
+        Adapted from:
+        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
+        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
+        """
+
+        num_patches = embeddings.shape[1] - 1
+        num_positions = self.position_embedding.shape[1] - 1
+
+        # always interpolate when tracing to ensure the exported model works for dynamic input shapes
+        if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
+            return self.position_embedding
+        # The class position is kept as-is; only the patch positions are interpolated.
+        class_pos_embed = self.position_embedding[:, :1]
+        patch_pos_embed = self.position_embedding[:, 1:]
+
+        dim = embeddings.shape[-1]
+
+        new_height = height // self.patch_size
+        new_width = width // self.patch_size
+        # Lay the patch positions out on a square grid so they can be resized like an image.
+        sqrt_num_positions = torch_int(num_positions**0.5)
+        patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
+        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)
+
+        patch_pos_embed = nn.functional.interpolate(
+            patch_pos_embed,
+            size=(new_height, new_width),
+            mode="bicubic",
+            align_corners=False,
+        )
+        # Back to a flat (1, new_height * new_width, dim) sequence before the class position is put in front.
+        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
+
+        return torch.cat((class_pos_embed, patch_pos_embed), dim=1)
+
+    def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding: bool = False) -> torch.Tensor:
+        batch_size, _, height, width = pixel_values.shape
+        target_dtype = self.patch_embedding.weight.dtype
+        patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype))  # shape = [*, width, grid, grid]
+        patch_embeds = patch_embeds.flatten(2).transpose(1, 2)
+        class_embeds = self.class_embedding.expand(batch_size, 1, -1).to(target_dtype)
+        embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
+        if interpolate_pos_encoding:
+            position_embedding = self.interpolate_pos_encoding(embeddings, height, width)
+        else:
+            position_embedding = self.position_embedding
+        embeddings = embeddings + position_embedding[:, : embeddings.size(1), :].to(target_dtype)
+        return embeddings
+
+
+# Adapted from transformers.models.siglip.modeling_siglip.eager_attention_forward -> InstructBLIP doesn't cast attn weights to fp32
+def eager_attention_forward(
+    module: nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: Optional[torch.Tensor],
+    scaling: float,
+    dropout: float = 0.0,
+    **kwargs,
+):
+    """Plain matmul/softmax attention; returns `(attn_output, attn_weights)`.
+
+    `attn_output` has axes 1 and 2 swapped and is contiguous; `attn_weights` are the post-dropout
+    probabilities. Dropout is active only while `module.training` is true; extra `kwargs` are ignored.
+    """
+    scores = torch.matmul(query, key.transpose(-1, -2)) * scaling
+    if attention_mask is not None:
+        scores = scores + attention_mask  # the mask is additive
+    probs = nn.functional.softmax(scores, dim=-1)
+    probs = nn.functional.dropout(probs, p=dropout, training=module.training)
+    return torch.matmul(probs, value).transpose(1, 2).contiguous(), probs
+
+
+# Copied from transformers.models.blip_2.modeling_blip_2.Blip2Attention with Blip2->InstructBlip
+class InstructBlipAttention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.embed_dim = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = self.embed_dim // self.num_heads
+        if self.head_dim * self.num_heads != self.embed_dim:
+            raise ValueError(
+                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
+                f" {self.num_heads})."
+            )
+        self.scale = self.head_dim**-0.5
+        self.is_causal = False
+        self.attention_dropout = config.attention_dropout
+
+        # small tweak here compared to CLIP, no bias here
+        self.qkv = nn.Linear(self.embed_dim, 3 * self.embed_dim, bias=False)
+        # With `config.qkv_bias`, the fused projection gets a single zero-initialised bias Parameter (3 * embed_dim).
+        if config.qkv_bias:
+            q_bias = nn.Parameter(torch.zeros(self.embed_dim))
+            v_bias = nn.Parameter(torch.zeros(self.embed_dim))
+        else:
+            q_bias = None
+            v_bias = None
+
+        if q_bias is not None:
+            qkv_bias = torch.cat((q_bias, torch.zeros_like(v_bias, requires_grad=False), v_bias))
+            self.qkv.bias = nn.Parameter(qkv_bias)
+
+        self.projection = nn.Linear(self.embed_dim, self.embed_dim)
+
+    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
+        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        head_mask: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = False,
+        **kwargs,
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
+        """Input shape: Batch x Time x Channel"""
+
+        bsz, tgt_len, embed_dim = hidden_states.size()
+
+        mixed_qkv = self.qkv(hidden_states)
+        # Split the fused projection into q, k, v: result is (3, bsz, num_heads, tgt_len, head_dim).
+        mixed_qkv = mixed_qkv.reshape(bsz, tgt_len, 3, self.num_heads, embed_dim // self.num_heads).permute(
+            2, 0, 3, 1, 4
+        )
+        query_states, key_states, value_states = mixed_qkv[0], mixed_qkv[1], mixed_qkv[2]
+
+        attention_interface: Callable = eager_attention_forward
+
+        if self.config._attn_implementation != "eager":
+            if self.config._attn_implementation == "sdpa" and output_attentions:
+                logger.warning_once(
+                    "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
+                    'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
+                )
+            else:
+                attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+        # NOTE: no attention mask is applied here, and the `head_mask` argument is accepted but never used.
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask=None,
+            dropout=0.0 if not self.training else self.attention_dropout,
+            scaling=self.scale,
+            **kwargs,
+        )
+
+        attn_output = attn_output.reshape(bsz, tgt_len, -1).contiguous()
+        attn_output = self.projection(attn_output)
+        # Always a 2-tuple; the weights slot is None unless `output_attentions` is set.
+        outputs = (attn_output, attn_weights) if output_attentions else (attn_output, None)
+        return outputs
+
+
+# Copied from transformers.models.blip.modeling_blip.BlipMLP
+class InstructBlipMLP(nn.Module):
+    """Two-layer feed-forward block: `fc1` -> configured activation -> `fc2`."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.activation_fn = ACT2FN[config.hidden_act]
+        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
+        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        # Expand to intermediate_size, apply the activation, then project back to hidden_size.
+        return self.fc2(self.activation_fn(self.fc1(hidden_states)))
+
+
+# Copied from transformers.models.blip.modeling_blip.BlipEncoderLayer with Blip->InstructBlip
+class InstructBlipEncoderLayer(GradientCheckpointingLayer):
+    def __init__(self, config: InstructBlipConfig):
+        super().__init__()
+        self.embed_dim = config.hidden_size
+        self.self_attn = InstructBlipAttention(config)
+        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+        self.mlp = InstructBlipMLP(config)
+        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: torch.Tensor,
+        output_attentions: Optional[bool] = False,
+    ) -> tuple[torch.FloatTensor]:
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+            attention_mask (`torch.FloatTensor`): attention mask of size
+                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+                This layer forwards it to the self-attention module as that module's `head_mask` argument.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+        """
+        residual = hidden_states
+        # Self-attention sub-block: LayerNorm first, residual added afterwards.
+        hidden_states = self.layer_norm1(hidden_states)
+        hidden_states, attn_weights = self.self_attn(
+            hidden_states=hidden_states,
+            head_mask=attention_mask,
+            output_attentions=output_attentions,
+        )
+        hidden_states = hidden_states + residual
+        residual = hidden_states
+        hidden_states = self.layer_norm2(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+
+        hidden_states = hidden_states + residual
+
+        outputs = (hidden_states,)
+
+        if output_attentions:
+            outputs += (attn_weights,)
+
+        return outputs
+
+
+@auto_docstring
+class InstructBlipPreTrainedModel(PreTrainedModel):
+    config: InstructBlipConfig
+    base_model_prefix = "blip"
+    supports_gradient_checkpointing = True
+    _supports_attention_backend = True
+    _supports_flash_attn = True
+    _supports_sdpa = True
+    _supports_flex_attn = True
+
+    _can_compile_fullgraph = True
+
+    _no_split_modules = [
+        "InstructBlipQFormerEmbeddings",
+        "InstructBlipAttention",
+        "InstructBlipQFormerMultiHeadAttention",
+        "InstructBlipQFormerSelfOutput",
+    ]
+
+    def _init_weights(self, module):
+        """Initialize `module` in place according to its type; module types not matched below are left untouched."""
+        factor = self.config.initializer_range
+        # `factor` is the standard deviation passed to every normal / truncated-normal init below.
+        if isinstance(module, (nn.Linear, nn.Conv2d)):
+            module.weight.data.normal_(mean=0.0, std=factor)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=factor)
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+        elif isinstance(module, InstructBlipVisionEmbeddings):
+            nn.init.trunc_normal_(module.position_embedding, mean=0.0, std=factor)
+            nn.init.trunc_normal_(module.class_embedding, mean=0.0, std=factor)
+        elif isinstance(module, (InstructBlipForConditionalGeneration, InstructBlipModel)):
+            module.query_tokens.data.zero_()
+
+
+# Copied from transformers.models.blip.modeling_blip.BlipEncoder with Blip->InstructBlip
+class InstructBlipEncoder(nn.Module):
+    """
+    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
+    [`InstructBlipEncoderLayer`].
+
+    Args:
+        config (`InstructBlipConfig`):
+            The corresponding vision configuration for the `InstructBlipEncoder`.
+    """
+
+    def __init__(self, config: InstructBlipConfig):
+        super().__init__()
+        self.config = config
+        self.layers = nn.ModuleList([InstructBlipEncoderLayer(config) for _ in range(config.num_hidden_layers)])
+        self.gradient_checkpointing = False
+
+    def forward(
+        self,
+        inputs_embeds,
+        attention_mask: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, BaseModelOutput]:
+        r"""
+        Args:
+            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+                Embedded representation of the inputs. Should be float, not int tokens.
+            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
+
+                [What are attention masks?](../glossary#attention-mask)
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            output_hidden_states (`bool`, *optional*):
+                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+                for more detail.
+            return_dict (`bool`, *optional*):
+                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+        """
+        # Explicit per-call flags take precedence; the config value is read only when
+        # the corresponding argument is left as None.
+        if output_attentions is None:
+            output_attentions = self.config.output_attentions
+        if output_hidden_states is None:
+            output_hidden_states = self.config.output_hidden_states
+        if return_dict is None:
+            return_dict = self.config.use_return_dict
+
+        # The input embeddings are the first recorded state; each layer then appends its output.
+        collected_states = (inputs_embeds,) if output_hidden_states else None
+        collected_attentions = () if output_attentions else None
+
+        hidden_states = inputs_embeds
+        for encoder_layer in self.layers:
+            layer_outputs = encoder_layer(
+                hidden_states,
+                attention_mask=attention_mask,
+                output_attentions=output_attentions,
+            )
+            hidden_states = layer_outputs[0]
+            if output_hidden_states:
+                collected_states += (hidden_states,)
+            if output_attentions:
+                collected_attentions += (layer_outputs[1],)
+
+        if return_dict:
+            return BaseModelOutput(
+                last_hidden_state=hidden_states, hidden_states=collected_states, attentions=collected_attentions
+            )
+
+        # Tuple form: entries that were not requested are dropped instead of being returned as None.
+        return tuple(v for v in (hidden_states, collected_states, collected_attentions) if v is not None)
+
+
+# Copied from transformers.models.blip.modeling_blip.BlipVisionModel with Blip->InstructBlip, BLIP->INSTRUCTBLIP
+class InstructBlipVisionModel(InstructBlipPreTrainedModel):
+    main_input_name = "pixel_values"
+    config: InstructBlipVisionConfig
+
+    def __init__(self, config: InstructBlipVisionConfig):
+        super().__init__(config)
+        self.config = config
+        embed_dim = config.hidden_size
+
+        self.embeddings = InstructBlipVisionEmbeddings(config)
+        self.encoder = InstructBlipEncoder(config)
+        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
+
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        interpolate_pos_encoding: bool = False,
+    ) -> Union[tuple, BaseModelOutputWithPooling]:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if pixel_values is None:
+            raise ValueError("You have to specify pixel_values")
+
+        hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding)
+
+        encoder_outputs = self.encoder(
+            inputs_embeds=hidden_states,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        last_hidden_state = encoder_outputs[0]
+        last_hidden_state = self.post_layernorm(last_hidden_state)
+        # NOTE(review): token 0 is taken from the already-normalised states and `post_layernorm` is applied to it again.
+        pooled_output = last_hidden_state[:, 0, :]
+        pooled_output = self.post_layernorm(pooled_output)
+
+        if not return_dict:
+            return (last_hidden_state, pooled_output) + encoder_outputs[1:]
+
+        return BaseModelOutputWithPooling(
+            last_hidden_state=last_hidden_state,
+            pooler_output=pooled_output,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+        )
+
+    def get_input_embeddings(self):
+        return self.embeddings
+
+
+class InstructBlipQFormerMultiHeadAttention(nn.Module):
+    def __init__(self, config, is_cross_attention=False):
+        super().__init__()
+        self.config = config
+        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
+            raise ValueError(
+                "The hidden size (%d) is not a multiple of the number of attention heads (%d)"
+                % (config.hidden_size, config.num_attention_heads)
+            )
+
+        self.num_attention_heads = config.num_attention_heads
+        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
+        self.all_head_size = self.num_attention_heads * self.attention_head_size
+
+        self.query = nn.Linear(config.hidden_size, self.all_head_size)
+        if is_cross_attention:
+            self.key = nn.Linear(config.encoder_hidden_size, self.all_head_size)
+            self.value = nn.Linear(config.encoder_hidden_size, self.all_head_size)
+        else:
+            self.key = nn.Linear(config.hidden_size, self.all_head_size)
+            self.value = nn.Linear(config.hidden_size, self.all_head_size)
+
+        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
+        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
+        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
+            self.max_position_embeddings = config.max_position_embeddings
+            self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)
+        self.save_attention = False
+
+    def save_attn_gradients(self, attn_gradients):
+        self.attn_gradients = attn_gradients
+
+    def get_attn_gradients(self):
+        return self.attn_gradients
+
+    def save_attention_map(self, attention_map):
+        self.attention_map = attention_map
+
+    def get_attention_map(self):
+        return self.attention_map
+
+    def transpose_for_scores(self, x):
+        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
+        x = x.view(*new_x_shape)
+        return x.permute(0, 2, 1, 3)
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask=None,
+        head_mask=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        output_attentions=False,
+    ):
+        # If this is instantiated as a cross-attention module, the keys
+        # and values come from an encoder; the attention mask needs to be
+        # such that the encoder's padding tokens are not attended to.
+        is_cross_attention = encoder_hidden_states is not None
+
+        if is_cross_attention:
+            key_layer = self.transpose_for_scores(self.key(encoder_hidden_states))
+            value_layer = self.transpose_for_scores(self.value(encoder_hidden_states))
+            attention_mask = encoder_attention_mask
+        else:
+            key_layer = self.transpose_for_scores(self.key(hidden_states))
+            value_layer = self.transpose_for_scores(self.value(hidden_states))
+
+        mixed_query_layer = self.query(hidden_states)
+
+        query_layer = self.transpose_for_scores(mixed_query_layer)
+
+        # Take the dot product between "query" and "key" to get the raw attention scores.
+        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
+        # NOTE(review): both position ranges below use the query length; presumably assumes keys have that length too.
+        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
+            seq_length = hidden_states.size()[1]
+            position_ids_l = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(-1, 1)
+            position_ids_r = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(1, -1)
+            distance = position_ids_l - position_ids_r
+            positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1)
+            positional_embedding = positional_embedding.to(dtype=query_layer.dtype)  # fp16 compatibility
+
+            if self.position_embedding_type == "relative_key":
+                relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
+                attention_scores = attention_scores + relative_position_scores
+            elif self.position_embedding_type == "relative_key_query":
+                relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
+                relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding)
+                attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key
+
+        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
+        attention_scores_dtype = attention_scores.dtype
+
+        if attention_mask is not None:
+            # Add the attention mask to the scores (it is used additively, so it must already be in additive form).
+            attention_scores = attention_scores + attention_mask
+
+        # Normalize the attention scores to probabilities.
+        attention_probs = nn.Softmax(dim=-1)(attention_scores).to(attention_scores_dtype)
+
+        if is_cross_attention and self.save_attention:
+            self.save_attention_map(attention_probs)
+            attention_probs.register_hook(self.save_attn_gradients)
+
+        # This is actually dropping out entire tokens to attend to, which might
+        # seem a bit unusual, but is taken from the original Transformer paper.
+        attention_probs_dropped = self.dropout(attention_probs)
+
+        # Mask heads if we want to
+        if head_mask is not None:
+            attention_probs_dropped = attention_probs_dropped * head_mask
+
+        context_layer = torch.matmul(attention_probs_dropped, value_layer)
+
+        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
+        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
+        context_layer = context_layer.view(*new_context_layer_shape)
+
+        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
+
+        return outputs
+
+
+# Copied from transformers.models.bert.modeling_bert.BertSelfOutput with Bert->InstructBlipQFormer
+class InstructBlipQFormerSelfOutput(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = self.LayerNorm(hidden_states + input_tensor)
+        return hidden_states
+
+
+# Copied from transformers.models.blip_2.modeling_blip_2.Blip2QFormerAttention with Blip2->InstructBlip
+class InstructBlipQFormerAttention(nn.Module):
+    def __init__(self, config, is_cross_attention=False):
+        super().__init__()
+        self.attention = InstructBlipQFormerMultiHeadAttention(config, is_cross_attention)
+        self.output = InstructBlipQFormerSelfOutput(config)
+        self.pruned_heads = set()
+
+    def prune_heads(self, heads):
+        if len(heads) == 0:
+            return
+        heads, index = find_pruneable_heads_and_indices(
+            heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads
+        )
+
+        # Prune linear layers
+        self.attention.query = prune_linear_layer(self.attention.query, index)
+        self.attention.key = prune_linear_layer(self.attention.key, index)
+        self.attention.value = prune_linear_layer(self.attention.value, index)
+        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
+
+        # Update hyper params and store pruned heads
+        self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads)
+        self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads
+        self.pruned_heads = self.pruned_heads.union(heads)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = False,
+    ) -> tuple[torch.Tensor]:
+        self_outputs = self.attention(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            head_mask=head_mask,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_attention_mask,
+            output_attentions=output_attentions,
+        )
+        attention_output = self.output(self_outputs[0], hidden_states)
+        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
+        return outputs
+
+
+# Copied from transformers.models.bert.modeling_bert.BertIntermediate with Bert->InstructBlipQFormer
+class InstructBlipQFormerIntermediate(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
+        if isinstance(config.hidden_act, str):
+            self.intermediate_act_fn = ACT2FN[config.hidden_act]
+        else:
+            self.intermediate_act_fn = config.hidden_act
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.intermediate_act_fn(hidden_states)
+        return hidden_states
+
+
+# Copied from transformers.models.bert.modeling_bert.BertOutput with Bert->InstructBlipQFormer
+class InstructBlipQFormerOutput(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
+        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = self.LayerNorm(hidden_states + input_tensor)
+        return hidden_states
+
+
+class InstructBlipQFormerLayer(GradientCheckpointingLayer):
+    """
+    One Q-Former layer: self-attention over the whole sequence, optional cross-attention for the query positions,
+    and separate feed-forward blocks for the query positions and for the remaining positions.
+    """
+
+    def __init__(self, config, layer_idx):
+        super().__init__()
+        self.chunk_size_feed_forward = config.chunk_size_feed_forward
+        # Dimension along which `apply_chunking_to_forward` is asked to chunk the feed-forward input.
+        self.seq_len_dim = 1
+        self.attention = InstructBlipQFormerAttention(config)
+
+        self.layer_idx = layer_idx
+
+        # Only layers whose index is a multiple of `cross_attention_frequency` get a cross-attention block.
+        if layer_idx % config.cross_attention_frequency == 0:
+            self.crossattention = InstructBlipQFormerAttention(config, is_cross_attention=True)
+            self.has_cross_attention = True
+        else:
+            self.has_cross_attention = False
+
+        # Feed-forward block used for the non-query positions (and for everything when `query_length == 0`).
+        self.intermediate = InstructBlipQFormerIntermediate(config)
+        self.output = InstructBlipQFormerOutput(config)
+
+        # Separate feed-forward block used for the query positions.
+        self.intermediate_query = InstructBlipQFormerIntermediate(config)
+        self.output_query = InstructBlipQFormerOutput(config)
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask=None,
+        head_mask=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        output_attentions=False,
+        query_length=0,
+    ):
+        """
+        Run the layer on `hidden_states`.
+
+        The first `query_length` positions are treated as query tokens: they optionally go through cross-attention
+        over `encoder_hidden_states` (only if this layer has a cross-attention block) and then through the query
+        feed-forward block. Any remaining positions go through the regular feed-forward block and the two parts
+        are concatenated along dimension 1. With `query_length == 0` the whole sequence uses the regular
+        feed-forward block and cross-attention is skipped.
+
+        Returns:
+            `tuple`: `(layer_output, ...)` where the trailing items are whatever the self-attention module returned
+            after its first output, followed by the same for the cross-attention module when it ran.
+
+        Raises:
+            ValueError: If this layer has cross-attention, `query_length > 0` and `encoder_hidden_states` is
+                `None`.
+        """
+        self_attention_outputs = self.attention(
+            hidden_states,
+            attention_mask=attention_mask,
+            head_mask=head_mask,
+            output_attentions=output_attentions,
+        )
+        attention_output = self_attention_outputs[0]
+        # Extra self-attention outputs (attention weights, if requested).
+        outputs = self_attention_outputs[1:]
+
+        if query_length > 0:
+            # The query tokens occupy the first `query_length` positions of the sequence.
+            query_attention_output = attention_output[:, :query_length, :]
+
+            if self.has_cross_attention:
+                if encoder_hidden_states is None:
+                    raise ValueError("encoder_hidden_states must be given for cross-attention layers")
+                # NOTE(review): `attention_mask` is still the self-attention mask here; the multi-head attention
+                # forward in this file appears to replace it with `encoder_attention_mask` whenever
+                # `encoder_hidden_states` is given - confirm.
+                cross_attention_outputs = self.crossattention(
+                    query_attention_output,
+                    attention_mask=attention_mask,
+                    head_mask=head_mask,
+                    encoder_hidden_states=encoder_hidden_states,
+                    encoder_attention_mask=encoder_attention_mask,
+                    output_attentions=output_attentions,
+                )
+                query_attention_output = cross_attention_outputs[0]
+                # add cross attentions if we output attention weights
+                outputs = outputs + cross_attention_outputs[1:]
+
+            # Query positions use the dedicated query feed-forward block.
+            layer_output = apply_chunking_to_forward(
+                self.feed_forward_chunk_query,
+                self.chunk_size_feed_forward,
+                self.seq_len_dim,
+                query_attention_output,
+            )
+
+            # Positions after the queries, if any, use the regular feed-forward block.
+            if attention_output.shape[1] > query_length:
+                layer_output_text = apply_chunking_to_forward(
+                    self.feed_forward_chunk,
+                    self.chunk_size_feed_forward,
+                    self.seq_len_dim,
+                    attention_output[:, query_length:, :],
+                ).to(layer_output.device)  # match the query output's device before concatenating
+                layer_output = torch.cat([layer_output, layer_output_text], dim=1)
+        else:
+            # No query tokens: the whole sequence uses the regular feed-forward block.
+            layer_output = apply_chunking_to_forward(
+                self.feed_forward_chunk,
+                self.chunk_size_feed_forward,
+                self.seq_len_dim,
+                attention_output,
+            )
+        outputs = (layer_output,) + outputs
+
+        return outputs
+
+    def feed_forward_chunk(self, attention_output):
+        """Apply the regular feed-forward block, using `attention_output` as the residual input."""
+        intermediate_output = self.intermediate(attention_output)
+        layer_output = self.output(intermediate_output, attention_output)
+        return layer_output
+
+    def feed_forward_chunk_query(self, attention_output):
+        """Apply the query feed-forward block, using `attention_output` as the residual input."""
+        intermediate_output = self.intermediate_query(attention_output)
+        layer_output = self.output_query(intermediate_output, attention_output)
+        return layer_output
+
+
+# Copied from transformers.models.blip_2.modeling_blip_2.Blip2QFormerEncoder with Blip2->InstructBlip
+class InstructBlipQFormerEncoder(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.layer = nn.ModuleList(
+            [InstructBlipQFormerLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+        )
+        self.gradient_checkpointing = False
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask=None,
+        head_mask=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        output_attentions=False,
+        output_hidden_states=False,
+        return_dict=True,
+        query_length=0,
+    ):
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attentions = () if output_attentions else None
+        all_cross_attentions = () if output_attentions else None
+
+        for i in range(self.config.num_hidden_layers):
+            layer_module = self.layer[i]
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)
+
+            layer_head_mask = head_mask[i] if head_mask is not None else None
+
+            layer_outputs = layer_module(
+                hidden_states,
+                attention_mask,
+                layer_head_mask,
+                encoder_hidden_states,  # as a positional argument for gradient checkpointing
+                encoder_attention_mask=encoder_attention_mask,
+                output_attentions=output_attentions,
+                query_length=query_length,
+            )
+
+            hidden_states = layer_outputs[0]
+            if output_attentions:
+                all_self_attentions = all_self_attentions + (layer_outputs[1],)
+                if query_length > 0 and layer_module.has_cross_attention:
+                    all_cross_attentions = all_cross_attentions + (layer_outputs[2],)
+
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+
+        if not return_dict:
+            return tuple(
+                v
+                for v in [
+                    hidden_states,
+                    all_hidden_states,
+                    all_self_attentions,
+                    all_cross_attentions,
+                ]
+                if v is not None
+            )
+        return BaseModelOutputWithPastAndCrossAttentions(
+            last_hidden_state=hidden_states,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attentions,
+            cross_attentions=all_cross_attentions,
+        )
+
+
+class InstructBlipQFormerEmbeddings(nn.Module):
+    """Construct the embeddings from word and position embeddings."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
+        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
+
+        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
+        self.register_buffer(
+            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
+        )
+        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
+
+        self.config = config
+
+    def forward(
+        self,
+        input_ids=None,
+        position_ids=None,
+        query_embeds=None,
+        past_key_values_length=0,
+    ):
+        if input_ids is not None:
+            seq_length = input_ids.size()[1]
+        else:
+            seq_length = 0
+
+        if position_ids is None:
+            position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length].clone()
+
+        if input_ids is not None:
+            embeddings = self.word_embeddings(input_ids)
+            if self.position_embedding_type == "absolute":
+                position_embeddings = self.position_embeddings(position_ids.to(embeddings.device))
+                embeddings = embeddings + position_embeddings
+
+            if query_embeds is not None:
+                embeddings = torch.cat((query_embeds, embeddings), dim=1)
+        else:
+            embeddings = query_embeds
+
+        embeddings = embeddings.to(self.layernorm.weight.dtype)
+        embeddings = self.layernorm(embeddings)
+        embeddings = self.dropout(embeddings)
+        return embeddings
+
+
+class InstructBlipQFormerModel(InstructBlipPreTrainedModel):
+    """
+    Querying Transformer (Q-Former), used in InstructBLIP. Slightly modified from BLIP-2 as it also takes the
+    instruction as input.
+    """
+
+    _supports_attention_backend = False  # adds position on attn weights before last matmul
+    _supports_flash_attn = False
+    _supports_sdpa = False
+    _supports_flex_attn = False
+
+    def __init__(self, config: InstructBlipQFormerConfig):
+        super().__init__(config)
+        self.config = config
+
+        self.embeddings = InstructBlipQFormerEmbeddings(config)
+
+        self.encoder = InstructBlipQFormerEncoder(config)
+
+        self.post_init()
+
+    def get_input_embeddings(self):
+        """Return the word embedding layer of the Q-Former."""
+        return self.embeddings.word_embeddings
+
+    def set_input_embeddings(self, value):
+        """Replace the word embedding layer of the Q-Former with `value`."""
+        self.embeddings.word_embeddings = value
+
+    def _prune_heads(self, heads_to_prune):
+        """
+        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
+        class PreTrainedModel
+
+        Only the self-attention module (`.attention`) of each listed layer is pruned here.
+        """
+        for layer, heads in heads_to_prune.items():
+            self.encoder.layer[layer].attention.prune_heads(heads)
+
+    def get_extended_attention_mask(
+        self,
+        attention_mask: torch.Tensor,
+        input_shape: tuple[int],
+        device: torch.device,
+        has_query: bool = False,
+    ) -> torch.Tensor:
+        """
+        Makes a broadcastable additive attention mask so that masked tokens are ignored. No causal mask is built.
+
+        Arguments:
+            attention_mask (`torch.Tensor`):
+                Mask with ones indicating tokens to attend to, zeros for tokens to ignore. Must be 2D
+                `[batch_size, seq_length]` or 3D `[batch_size, from_seq_length, to_seq_length]`.
+            input_shape (`tuple[int]`):
+                The shape of the input to the model. Only used in the error message.
+            device: (`torch.device`):
+                The device of the input to the model. Not used by this implementation.
+            has_query (`bool`, *optional*, defaults to `False`):
+                Not used by this implementation.
+
+        Returns:
+            `torch.Tensor` The extended attention mask, cast to `self.dtype`: 0.0 at positions to attend to and
+            -10000.0 at masked positions.
+
+        Raises:
+            ValueError: If `attention_mask` is neither 2D nor 3D.
+        """
+        # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
+        # ourselves in which case we just need to make it broadcastable to all heads.
+        if attention_mask.dim() == 3:
+            extended_attention_mask = attention_mask[:, None, :, :]
+        elif attention_mask.dim() == 2:
+            # Provided a padding mask of dimensions [batch_size, seq_length]
+            # - the model is an encoder, so make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length]
+            extended_attention_mask = attention_mask[:, None, None, :]
+        else:
+            raise ValueError(
+                f"Wrong shape for input_ids (shape {input_shape}) or attention_mask (shape {attention_mask.shape})",
+            )
+
+        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
+        # masked positions, this operation will create a tensor which is 0.0 for
+        # positions we want to attend and -10000.0 for masked positions.
+        # Since we are adding it to the raw scores before the softmax, this is
+        # effectively the same as removing these entirely.
+        extended_attention_mask = extended_attention_mask.to(dtype=self.dtype)  # fp16 compatibility
+        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
+        return extended_attention_mask
+
+    def forward(
+        self,
+        input_ids: torch.LongTensor,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        query_embeds: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple[torch.FloatTensor], BaseModelOutputWithPoolingAndCrossAttentions]:
+        r"""
+        input_ids (`torch.LongTensor`):
+            Token ids passed to the embedding layer. May be `None` as long as `query_embeds` is given; passing
+            neither raises a `ValueError`.
+        query_embeds (`torch.Tensor`, *optional*):
+            Query embeddings. When given together with `input_ids`, the embedding layer places them in front of the
+            token embeddings. `query_embeds.shape[1]` is passed to the encoder as `query_length`.
+        encoder_hidden_states  (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Hidden states forwarded to the encoder layers for cross-attention. A list is also accepted, in which
+            case the first element is used to read the batch size and sequence length.
+        encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on the padding token indices of the encoder input. Defaults to all
+            ones when `encoder_hidden_states` is given without a mask. A list of masks is also accepted. Mask
+            values selected in `[0, 1]`:
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+        The returned `pooler_output` is the hidden state at position 0 of the last layer's output.
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if input_ids is None and query_embeds is None:
+            raise ValueError("You have to specify query_embeds when input_ids is None")
+
+        # Number of leading positions that the encoder layers treat as query tokens.
+        query_length = query_embeds.shape[1] if query_embeds is not None else 0
+
+        embedding_output = self.embeddings(
+            input_ids=input_ids,
+            position_ids=position_ids,
+            query_embeds=query_embeds,
+        )
+
+        # Shape of the combined (query + token) sequence, without the hidden dimension.
+        input_shape = embedding_output.size()[:-1]
+        batch_size, seq_length = input_shape
+        device = embedding_output.device
+
+        # Without an explicit mask, attend to every position of the combined sequence.
+        if attention_mask is None:
+            attention_mask = torch.ones(((batch_size, seq_length)), device=device)
+
+        # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
+        # ourselves in which case we just need to make it broadcastable to all heads.
+        extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape, device)
+
+        # If a 2D or 3D attention mask is provided for the cross-attention
+        # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
+        if encoder_hidden_states is not None:
+            if isinstance(encoder_hidden_states, list):
+                encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states[0].size()
+            else:
+                encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
+            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
+
+            # `invert_attention_mask` is not defined in this class; presumably inherited from the base model class.
+            if isinstance(encoder_attention_mask, list):
+                encoder_extended_attention_mask = [self.invert_attention_mask(mask) for mask in encoder_attention_mask]
+            elif encoder_attention_mask is None:
+                encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
+                encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
+            else:
+                encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
+        else:
+            encoder_extended_attention_mask = None
+
+        # Prepare head mask if needed
+        # 1.0 in head_mask indicate we keep the head
+        # attention_probs has shape bsz x n_heads x N x N
+        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
+        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
+        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
+
+        encoder_outputs = self.encoder(
+            embedding_output,
+            attention_mask=extended_attention_mask,
+            head_mask=head_mask,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_extended_attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            query_length=query_length,
+        )
+        sequence_output = encoder_outputs[0]
+        # "Pooling" here is simply taking the hidden state at the first position.
+        pooled_output = sequence_output[:, 0, :]
+
+        if not return_dict:
+            return (sequence_output, pooled_output) + encoder_outputs[1:]
+
+        # NOTE(review): the encoder in this file never sets `past_key_values` on its output, so this field is
+        # presumably always `None` - confirm.
+        return BaseModelOutputWithPoolingAndCrossAttentions(
+            last_hidden_state=sequence_output,
+            pooler_output=pooled_output,
+            past_key_values=encoder_outputs.past_key_values,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+            cross_attentions=encoder_outputs.cross_attentions,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    InstructBLIP base Model consisting of language model, qformer and vision encoder.
+    """
+)
+class InstructBlipModel(InstructBlipPreTrainedModel):
+    main_input_name = "pixel_values"
+    _keep_in_fp32_modules = ["query_tokens"]  # TODO @ArthurZucker I don't know why this is required for FP8
+
+    def __init__(self, config: InstructBlipConfig):
+        super().__init__(config)
+
+        self.vision_model = InstructBlipVisionModel(config.vision_config)
+        self.query_tokens = nn.Parameter(torch.zeros(1, config.num_query_tokens, config.qformer_config.hidden_size))
+        self.qformer = InstructBlipQFormerModel(config.qformer_config)
+
+        self.language_projection = nn.Linear(config.qformer_config.hidden_size, config.text_config.hidden_size)
+        self.language_model = AutoModel.from_config(config.text_config)
+
+        if self.language_model._no_split_modules is not None:
+            self._no_split_modules.extend(self.language_model._no_split_modules)
+
+        if self.language_model._keep_in_fp32_modules is not None:
+            self._keep_in_fp32_modules.extend(self.language_model._keep_in_fp32_modules)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.language_model.get_input_embeddings()
+
+    def set_input_embeddings(self, value):
+        self.language_model.set_input_embeddings(value)
+
+    def _tie_weights(self):
+        if not self.config.use_decoder_only_language_model:
+            self.language_model.encoder.embed_tokens = self.language_model.shared
+            self.language_model.decoder.embed_tokens = self.language_model.shared
+
+    def _preprocess_accelerate(self):
+        r"""
+        Some pre-processing hacks to make the model `accelerate` compatible. Check
+        https://github.com/huggingface/transformers/pull/21707 for more details.
+
+        Reads `self.hf_device_map`, logs a warning when the map has several entries, none of them keyed
+        `language_model`, and more than one CUDA device is visible, and sets `io_same_device = True` on the
+        language model's `_hf_hook` when such a hook is attached.
+        """
+        hf_device_map = self.hf_device_map
+
+        # The CUDA device count is only queried when the first two conditions already hold.
+        if len(hf_device_map) > 1 and "language_model" not in hf_device_map and torch.cuda.device_count() > 1:
+            # warn users about unexpected behavior when using multi-GPU + InstructBLIP + `accelerate`.
+            logger.warning(
+                "The `language_model` is not in the `hf_device_map` dictionary and you are running your script"
+                " in a multi-GPU environment. this may lead to unexpected behavior when using `accelerate`."
+                " Please pass a `device_map` that contains `language_model` to remove this warning."
+                " Please refer to https://github.com/huggingface/blog/blob/main/accelerate-large-models.md for"
+                " more details on creating a `device_map` for large models.",
+            )
+
+        # `_hf_hook` is only present when a hook has been attached to the language model.
+        if hasattr(self.language_model, "_hf_hook"):
+            self.language_model._hf_hook.io_same_device = True  # For `generate` compatibility
+
+    def get_placeholder_mask(self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor):
+        """
+        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`.
+        """
+        if input_ids is None:
+            special_image_mask = inputs_embeds == self.get_input_embeddings()(
+                torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device)
+            )
+            special_image_mask = special_image_mask.all(-1)
+        else:
+            special_image_mask = input_ids == self.config.image_token_id
+
+        special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
+        return special_image_mask
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        pixel_values: torch.FloatTensor,
+        qformer_input_ids: torch.FloatTensor,
+        qformer_attention_mask: Optional[torch.LongTensor] = None,
+        input_ids: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.LongTensor] = None,
+        decoder_input_ids: Optional[torch.LongTensor] = None,
+        decoder_attention_mask: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        interpolate_pos_encoding: bool = False,
+        use_cache: Optional[bool] = None,
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> Union[tuple, InstructBlipForConditionalGenerationModelOutput]:
+        r"""
+        qformer_input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Indices of input sequence tokens in the vocabulary of the Q-Former. Input tokens can optionally be provided
+            to serve as text prompt, which the Q-Former model will encode.
+
+            Indices can be obtained using [`InstructBlipProcessor`]. See [`InstructBlipProcessor.__call__`] for
+            details.
+
+            [What are input IDs?](../glossary#input-ids)
+        qformer_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
+        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Indices of input sequence tokens in the vocabulary of the language model. Input tokens can optionally be
+            provided to serve as text prompt, which the language model can continue.
+
+            Indices can be obtained using [`InstructBlipProcessor`]. See [`InstructBlipProcessor.__call__`] for
+            details.
+
+            [What are input IDs?](../glossary#input-ids)
+        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
+            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
+            be used by default.
+
+            Only relevant in case an encoder-decoder language model (like T5) is used.
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # step 1: forward the images through the vision encoder,
+        # to get image embeddings of shape (batch_size, seq_len, hidden_size)
+        vision_outputs = self.vision_model(
+            pixel_values=pixel_values,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            interpolate_pos_encoding=interpolate_pos_encoding,
+        )
+        image_embeds = vision_outputs[0]
+
+        # step 2: forward the query tokens through the QFormer, using the image embeddings for cross-attention
+        image_attention_mask = torch.ones(image_embeds.size()[:-1], dtype=torch.long, device=image_embeds.device)
+
+        # difference with BLIP-2 here: we also feed the instruction prompt to the Q-Former
+        query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1)
+        query_attention_mask = torch.ones(query_tokens.size()[:-1], dtype=torch.long, device=image_embeds.device)
+        if qformer_attention_mask is None:
+            qformer_attention_mask = torch.ones_like(qformer_input_ids)
+        qformer_attention_mask = torch.cat([query_attention_mask, qformer_attention_mask], dim=1)
+        query_outputs = self.qformer(
+            input_ids=qformer_input_ids,
+            attention_mask=qformer_attention_mask,
+            query_embeds=query_tokens,
+            encoder_hidden_states=image_embeds,
+            encoder_attention_mask=image_attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        query_output = query_outputs[0][:, : query_tokens.size(1), :]
+
+        if inputs_embeds is None:
+            inputs_embeds = self.language_model.get_input_embeddings()(input_ids)
+            if attention_mask is None:
+                attention_mask = torch.ones_like(input_ids)
+
+        # step 3: use the language model, conditioned on the query outputs and the prompt
+        language_model_inputs = self.language_projection(query_output)
+        language_model_inputs = language_model_inputs.to(inputs_embeds.device, inputs_embeds.dtype)
+        special_image_mask = self.get_placeholder_mask(input_ids, inputs_embeds=inputs_embeds)
+        inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, language_model_inputs)
+
+        if self.config.use_decoder_only_language_model:
+            outputs = self.language_model(
+                inputs_embeds=inputs_embeds,
+                attention_mask=attention_mask,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+                use_cache=use_cache,
+                **kwargs,
+            )
+        else:
+            outputs = self.language_model(
+                inputs_embeds=inputs_embeds,
+                attention_mask=attention_mask,
+                decoder_input_ids=decoder_input_ids,
+                decoder_attention_mask=decoder_attention_mask,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+                use_cache=use_cache,
+                **kwargs,
+            )
+
+        return InstructBlipForConditionalGenerationModelOutput(
+            vision_outputs=vision_outputs,
+            qformer_outputs=query_outputs,
+            language_model_outputs=outputs,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    InstructBLIP Model for generating text given an image and an optional text prompt. The model consists of a vision
+    encoder, Querying Transformer (Q-Former) and a language model.
+
+    One can optionally pass `input_ids` to the model, which serve as a text prompt, to make the language model continue
+    the prompt. Otherwise, the language model starts generating text from the [BOS] (beginning-of-sequence) token.
+    """
+)
+class InstructBlipForConditionalGeneration(InstructBlipPreTrainedModel, GenerationMixin):
+    config: InstructBlipConfig
+    main_input_name = "pixel_values"
+
+    _can_compile_fullgraph = True
+    _keep_in_fp32_modules = ["query_tokens"]  # TODO @ArthurZucker I don't know why this is required for FP8
+
+    def __init__(self, config: InstructBlipConfig):
+        super().__init__(config)
+
+        self.vision_model = InstructBlipVisionModel._from_config(config.vision_config)
+
+        self.query_tokens = nn.Parameter(torch.zeros(1, config.num_query_tokens, config.qformer_config.hidden_size))
+        self.qformer = InstructBlipQFormerModel._from_config(config.qformer_config)
+
+        self.language_projection = nn.Linear(config.qformer_config.hidden_size, config.text_config.hidden_size)
+
+        if config.use_decoder_only_language_model:
+            language_model = AutoModelForCausalLM.from_config(config.text_config)
+        else:
+            language_model = AutoModelForSeq2SeqLM.from_config(config.text_config)
+
+        if language_model._no_split_modules is not None:
+            self._no_split_modules.extend(language_model._no_split_modules)
+
+        if language_model._keep_in_fp32_modules is not None:
+            self._keep_in_fp32_modules.extend(language_model._keep_in_fp32_modules)
+
+        self.language_model = language_model
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.language_model.get_input_embeddings()
+
+    def set_input_embeddings(self, value):
+        self.language_model.set_input_embeddings(value)
+
+    def set_output_embeddings(self, new_embeddings):
+        self.language_model.set_output_embeddings(new_embeddings)
+
+    def get_output_embeddings(self) -> nn.Module:
+        return self.language_model.get_output_embeddings()
+
+    def get_encoder(self):
+        return self.language_model.get_encoder()
+
+    def get_decoder(self):
+        return self.language_model.get_decoder()
+
+    # Copied from transformers.models.instructblip.modeling_instructblip.InstructBlipModel._tie_weights
+    def _tie_weights(self):
+        """
+        Point the encoder and decoder `embed_tokens` of an encoder-decoder language model at its `shared`
+        embedding. Does nothing when `config.use_decoder_only_language_model` is set.
+        """
+        if not self.config.use_decoder_only_language_model:
+            self.language_model.encoder.embed_tokens = self.language_model.shared
+            self.language_model.decoder.embed_tokens = self.language_model.shared
+
+    # Copied from transformers.models.instructblip.modeling_instructblip.InstructBlipModel._preprocess_accelerate
+    def _preprocess_accelerate(self):
+        r"""
+        Some pre-processing hacks to make the model `accelerate` compatible. Check
+        https://github.com/huggingface/transformers/pull/21707 for more details.
+
+        Reads `self.hf_device_map`, so it must only be called when that attribute exists. It may log a warning
+        and may set `io_same_device` on the language model's `_hf_hook`; it returns nothing.
+        """
+        hf_device_map = self.hf_device_map
+
+        # Warn only when the map has several entries, none of them is keyed exactly "language_model", and more
+        # than one CUDA device is visible.
+        if len(hf_device_map) > 1 and "language_model" not in hf_device_map and torch.cuda.device_count() > 1:
+            # warn users about unexpected behavior when using multi-GPU + InstructBLIP + `accelerate`.
+            logger.warning(
+                "The `language_model` is not in the `hf_device_map` dictionary and you are running your script"
+                " in a multi-GPU environment. this may lead to unexpected behavior when using `accelerate`."
+                " Please pass a `device_map` that contains `language_model` to remove this warning."
+                " Please refer to https://github.com/huggingface/blog/blob/main/accelerate-large-models.md for"
+                " more details on creating a `device_map` for large models.",
+            )
+
+        # Only touch the hook if one is attached to the language model.
+        if hasattr(self.language_model, "_hf_hook"):
+            self.language_model._hf_hook.io_same_device = True  # For `generate` compatibility
+
+    def get_image_features(
+        self,
+        pixel_values: torch.FloatTensor,
+        qformer_input_ids: torch.LongTensor,
+        qformer_attention_mask: Optional[torch.LongTensor] = None,
+        interpolate_pos_encoding: Optional[bool] = False,
+        return_dict: Optional[bool] = False,
+    ):
+        """
+        Encodes images into continuous embeddings that can be forwarded to the language model.
+
+        Args:
+            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
+                The tensors corresponding to the input images.
+        """
+        # step 1: forward the images through the vision encoder,
+        # to get image embeddings of shape (batch_size, seq_len, hidden_size)
+        vision_outputs = self.vision_model(
+            pixel_values=pixel_values,
+            interpolate_pos_encoding=interpolate_pos_encoding,
+            return_dict=True,
+        )
+        image_embeds = vision_outputs[0]
+
+        # step 2: forward the query tokens through the QFormer, using the image embeddings for cross-attention
+        image_attention_mask = torch.ones(image_embeds.size()[:-1], dtype=torch.long, device=image_embeds.device)
+
+        # difference with BLIP-2 here: we also feed the instruction prompt to the Q-Former
+        query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1)
+        query_attention_mask = torch.ones(query_tokens.size()[:-1], dtype=torch.long, device=image_embeds.device)
+        if qformer_attention_mask is None:
+            qformer_attention_mask = torch.ones_like(qformer_input_ids)
+        qformer_attention_mask = torch.cat([query_attention_mask, qformer_attention_mask], dim=1)
+        query_outputs = self.qformer(
+            input_ids=qformer_input_ids,
+            attention_mask=qformer_attention_mask,
+            query_embeds=query_tokens,
+            encoder_hidden_states=image_embeds,
+            encoder_attention_mask=image_attention_mask,
+            return_dict=True,
+        )
+        query_output = query_outputs[0][:, : query_tokens.size(1), :]
+
+        # step 3: use the language model, conditioned on the query outputs and the prompt
+        language_model_inputs = self.language_projection(query_output)
+        if return_dict:
+            return language_model_inputs, vision_outputs, query_outputs
+        return language_model_inputs
+
+    def get_placeholder_mask(self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor):
+        """
+        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`.
+        """
+        if input_ids is None:
+            special_image_mask = inputs_embeds == self.get_input_embeddings()(
+                torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device)
+            )
+            special_image_mask = special_image_mask.all(-1)
+        else:
+            special_image_mask = input_ids == self.config.image_token_id
+
+        special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
+        return special_image_mask
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        pixel_values: torch.FloatTensor,
+        qformer_input_ids: torch.FloatTensor,
+        qformer_attention_mask: Optional[torch.LongTensor] = None,
+        input_ids: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.LongTensor] = None,
+        decoder_input_ids: Optional[torch.LongTensor] = None,
+        decoder_attention_mask: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        labels: Optional[torch.LongTensor] = None,
+        return_dict: Optional[bool] = None,
+        interpolate_pos_encoding: bool = False,
+        use_cache: Optional[bool] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> Union[tuple, InstructBlipForConditionalGenerationModelOutput]:
+        r"""
+        qformer_input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Indices of input sequence tokens in the vocabulary of the Q-Former. Input tokens can optionally be provided
+            to serve as text prompt, which the Q-Former model will encode.
+
+            Indices can be obtained using [`InstructBlipProcessor`]. See [`InstructBlipProcessor.__call__`] for
+            details.
+
+            [What are input IDs?](../glossary#input-ids)
+        qformer_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
+        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Indices of input sequence tokens in the vocabulary of the language model. Input tokens can optionally be
+            provided to serve as text prompt, which the language model can continue.
+
+            Indices can be obtained using [`InstructBlipProcessor`]. See [`InstructBlipProcessor.__call__`] for
+            details.
+
+            [What are input IDs?](../glossary#input-ids)
+        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
+            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
+            be used by default.
+
+            Only relevant in case an encoder-decoder language model (like T5) is used.
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the language modeling loss. Indices should be in `[-100, 0, ..., config.vocab_size -
+            1]`. All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ...,
+            config.vocab_size]`
+
+        Examples:
+
+        ```python
+        >>> from transformers import InstructBlipProcessor, InstructBlipForConditionalGeneration
+        >>> import torch
+        >>> from PIL import Image
+        >>> import requests
+
+        >>> model = InstructBlipForConditionalGeneration.from_pretrained("Salesforce/instructblip-vicuna-7b")
+        >>> processor = InstructBlipProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b")
+
+        >>> device = "cuda" if torch.cuda.is_available() else "cpu"
+        >>> model.to(device)  # doctest: +IGNORE_RESULT
+
+        >>> url = "https://raw.githubusercontent.com/salesforce/LAVIS/main/docs/_static/Confusing-Pictures.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw).convert("RGB")
+        >>> prompt = "What is unusual about this image?"
+        >>> inputs = processor(images=image, text=prompt, return_tensors="pt").to(device)
+
+        >>> outputs = model.generate(
+        ...     **inputs,
+        ...     do_sample=False,
+        ...     num_beams=5,
+        ...     max_length=256,
+        ...     min_length=1,
+        ...     top_p=0.9,
+        ...     repetition_penalty=1.5,
+        ...     length_penalty=1.0,
+        ...     temperature=1,
+        ... )
+        >>> generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0].strip()
+        >>> print(generated_text)
+        The unusual aspect of this image is that a man is ironing clothes on the back of a yellow SUV, which is parked in the middle of a busy city street. This is an unconventional approach to ironing clothes, as it requires the man to balance himself and his ironing equipment on top of the vehicle while navigating through traffic. Additionally, the presence of taxis and other vehicles in the scene further emphasizes the unusual nature of this situation.
+        ```"""
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        language_model_inputs, vision_outputs, query_outputs = self.get_image_features(
+            pixel_values,
+            qformer_input_ids=qformer_input_ids,
+            qformer_attention_mask=qformer_attention_mask,
+            interpolate_pos_encoding=interpolate_pos_encoding,
+            return_dict=True,
+        )
+        vision_outputs = vision_outputs.to_tuple() if not return_dict else vision_outputs
+        query_outputs = query_outputs.to_tuple() if not return_dict else query_outputs
+
+        if inputs_embeds is None:
+            inputs_embeds = self.get_input_embeddings()(input_ids)
+
+        if attention_mask is None:
+            attention_mask = torch.ones_like(input_ids)
+
+        language_model_inputs = language_model_inputs.to(inputs_embeds.device, inputs_embeds.dtype)
+        special_image_mask = self.get_placeholder_mask(input_ids, inputs_embeds=inputs_embeds)
+        inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, language_model_inputs)
+
+        if self.config.use_decoder_only_language_model:
+            outputs = self.language_model(
+                inputs_embeds=inputs_embeds,
+                attention_mask=attention_mask,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+                use_cache=use_cache,
+                **kwargs,
+            )
+            logits = outputs.logits if return_dict else outputs[0]
+            loss = None
+            if labels is not None:
+                loss = self.loss_function(
+                    logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **kwargs
+                )
+
+        else:
+            outputs = self.language_model(
+                inputs_embeds=inputs_embeds,
+                attention_mask=attention_mask,
+                decoder_input_ids=decoder_input_ids,
+                decoder_attention_mask=decoder_attention_mask,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+                labels=labels,
+                use_cache=use_cache,
+                **kwargs,
+            )
+            loss = outputs.loss if return_dict else outputs[0]
+            logits = outputs.logits if return_dict else outputs[1]
+
+        return InstructBlipForConditionalGenerationModelOutput(
+            loss=loss,
+            logits=logits,
+            vision_outputs=vision_outputs,
+            qformer_outputs=query_outputs,
+            language_model_outputs=outputs,
+        )
+
+    @torch.no_grad()
+    def generate(
+        self,
+        pixel_values: torch.FloatTensor,
+        qformer_input_ids: Optional[torch.LongTensor] = None,
+        qformer_attention_mask: Optional[torch.LongTensor] = None,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        interpolate_pos_encoding: bool = False,
+        **generate_kwargs,
+    ) -> torch.LongTensor:
+        """
+        Overrides `generate` function to be able to use the model as a conditional generator.
+
+        Args:
+            pixel_values (`torch.FloatTensor` of shape (batch_size, num_channels, height, width)):
+                Input images to be processed.
+            qformer_input_ids (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
+                The sequence used as a prompt to be fed to the Q-Former module.
+            qformer_attention_mask (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
+                Mask to avoid performing attention on padding token indices.
+            input_ids (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
+                The sequence used as a prompt for the generation.
+            attention_mask (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
+                Mask to avoid performing attention on padding token indices.
+            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+                Embedded representation of the inputs. Should be float, not int tokens.
+            interpolate_pos_encoding (`bool`, *optional*, defaults to `False`):
+                Whether to interpolate the positional encoding of the image embeddings.
+
+        Returns:
+            captions (list): A list of strings of length batch_size * num_captions.
+        """
+        if hasattr(self, "hf_device_map"):
+            # preprocess for `accelerate`
+            self._preprocess_accelerate()
+
+        batch_size = pixel_values.shape[0]
+        language_model_inputs, vision_outputs, query_outputs = self.get_image_features(
+            pixel_values,
+            qformer_input_ids=qformer_input_ids,
+            qformer_attention_mask=qformer_attention_mask,
+            interpolate_pos_encoding=interpolate_pos_encoding,
+            return_dict=True,
+        )
+
+        if inputs_embeds is None:
+            if input_ids is None:
+                image_tokens = [self.config.image_token_index] * self.config.num_query_tokens
+                start_tokens = image_tokens + [self.config.text_config.bos_token_id]
+                input_ids = torch.tensor([start_tokens], dtype=torch.long, device=pixel_values.device)
+                input_ids = input_ids.repeat(batch_size, 1)
+            inputs_embeds = self.get_input_embeddings()(input_ids)
+
+        if attention_mask is None:
+            attention_mask = torch.ones_like(input_ids)
+
+        language_model_inputs = language_model_inputs.to(inputs_embeds.device, inputs_embeds.dtype)
+        special_image_mask = self.get_placeholder_mask(input_ids, inputs_embeds=inputs_embeds)
+        inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, language_model_inputs)
+
+        inputs = {"inputs_embeds": inputs_embeds, "attention_mask": attention_mask}
+        if not self.language_model.config.is_encoder_decoder:
+            inputs["input_ids"] = input_ids
+
+        outputs = self.language_model.generate(**inputs, **generate_kwargs)
+
+        return outputs
+
+
+# Names exported by `from <this module> import *`.
+__all__ = [
+    "InstructBlipQFormerModel",
+    "InstructBlipPreTrainedModel",
+    "InstructBlipModel",
+    "InstructBlipForConditionalGeneration",
+    "InstructBlipVisionModel",
+]
diff --git a/phivenv/Lib/site-packages/transformers/models/instructblip/processing_instructblip.py b/phivenv/Lib/site-packages/transformers/models/instructblip/processing_instructblip.py
new file mode 100644
index 0000000000000000000000000000000000000000..eee860e45c836a293baa69d0f8acf10d6a94fd56
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/instructblip/processing_instructblip.py
@@ -0,0 +1,190 @@
+# coding=utf-8
+# Copyright 2023 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Processor class for InstructBLIP. Largely copy of Blip2Processor with addition of a tokenizer for the Q-Former.
+"""
+
+import os
+from typing import Union
+
+from ...image_processing_utils import BatchFeature
+from ...image_utils import ImageInput
+from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
+from ...tokenization_utils_base import AddedToken, PreTokenizedInput, TextInput
+from ...utils import logging
+from ..auto import AutoTokenizer
+
+
+# Module-level logger keyed by this module's name.
+logger = logging.get_logger(__name__)
+
+
+class InstructBlipProcessorKwargs(ProcessingKwargs, total=False):
+    # Default keyword arguments for `InstructBlipProcessor.__call__`, which passes this class to
+    # `_merge_kwargs` and reads the merged result by group ("text_kwargs", "images_kwargs").
+    _defaults = {
+        # Defaults handed to both the Q-Former tokenizer and the language-model tokenizer.
+        "text_kwargs": {
+            "add_special_tokens": True,
+            "padding": False,
+            "stride": 0,
+            "return_overflowing_tokens": False,
+            "return_special_tokens_mask": False,
+            "return_offsets_mapping": False,
+            "return_token_type_ids": False,
+            "return_length": False,
+            "verbose": True,
+        },
+        # No image-processor defaults are overridden here.
+        "images_kwargs": {},
+    }
+
+
+class InstructBlipProcessor(ProcessorMixin):
+    r"""
+    Constructs an InstructBLIP processor which wraps a BLIP image processor and a LLaMa/T5 tokenizer into a single
+    processor.
+
+    [`InstructBlipProcessor`] offers all the functionalities of [`BlipImageProcessor`] and [`AutoTokenizer`]. See the
+    docstring of [`~BlipProcessor.__call__`] and [`~BlipProcessor.decode`] for more information.
+
+    Args:
+        image_processor (`BlipImageProcessor`):
+            An instance of [`BlipImageProcessor`]. The image processor is a required input.
+        tokenizer (`AutoTokenizer`):
+            An instance of [`PreTrainedTokenizer`]. The tokenizer is a required input.
+        qformer_tokenizer (`AutoTokenizer`):
+            An instance of [`PreTrainedTokenizer`]. The Q-Former tokenizer is a required input.
+        num_query_tokens (`int`, *optional*):
+            Number of tokens used by the Q-Former as queries; should be the same as in the model's config.
+    """
+
+    attributes = ["image_processor", "tokenizer", "qformer_tokenizer"]
+    image_processor_class = ("BlipImageProcessor", "BlipImageProcessorFast")
+    tokenizer_class = "AutoTokenizer"
+    qformer_tokenizer_class = "AutoTokenizer"
+
+    def __init__(self, image_processor, tokenizer, qformer_tokenizer, num_query_tokens=None, **kwargs):
+        if not hasattr(tokenizer, "image_token"):
+            self.image_token = AddedToken("<image>", normalized=False, special=True)
+            tokenizer.add_tokens([self.image_token], special_tokens=True)
+        else:
+            self.image_token = tokenizer.image_token
+        self.num_query_tokens = num_query_tokens
+
+        super().__init__(image_processor, tokenizer, qformer_tokenizer)
+
+    def __call__(
+        self,
+        images: ImageInput = None,
+        text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
+        audio=None,
+        videos=None,
+        **kwargs: Unpack[InstructBlipProcessorKwargs],
+    ) -> BatchFeature:
+        """
+        This method uses [`BlipImageProcessor.__call__`] method to prepare image(s) for the model, and
+        [`BertTokenizerFast.__call__`] to prepare text for the model.
+
+        Please refer to the docstring of the above two methods for more information.
+        Args:
+            images (`ImageInput`):
+                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
+                tensor. Both channels-first and channels-last formats are supported.
+            text (`TextInput`, `PreTokenizedInput`, `list[TextInput]`, `list[PreTokenizedInput]`):
+                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
+                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
+                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
+        """
+        if images is None and text is None:
+            raise ValueError("You have to specify at least images or text.")
+
+        output_kwargs = self._merge_kwargs(
+            InstructBlipProcessorKwargs,
+            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
+            **kwargs,
+        )
+
+        return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
+        encoding = {}
+        if text is not None:
+            if isinstance(text, str):
+                text = [text]
+            elif not isinstance(text, list) and not isinstance(text[0], str):
+                raise ValueError("Invalid input text. Please provide a string, or a list of strings")
+
+            qformer_text_encoding = self.qformer_tokenizer(text, **output_kwargs["text_kwargs"])
+            encoding["qformer_input_ids"] = qformer_text_encoding.pop("input_ids")
+            encoding["qformer_attention_mask"] = qformer_text_encoding.pop("attention_mask")
+
+            # We need this hacky manipulation because BLIP expects image tokens to be at the beginning even before BOS token
+            if output_kwargs["text_kwargs"].get("max_length") is not None:
+                output_kwargs["text_kwargs"]["max_length"] -= self.num_query_tokens
+            text_encoding = self.tokenizer(text, **output_kwargs["text_kwargs"])
+
+            if images is not None:
+                # Image tokens should not be padded/truncated or prepended with special BOS token
+                image_tokens = self.image_token.content * self.num_query_tokens
+                output_kwargs["text_kwargs"]["add_special_tokens"] = False
+                output_kwargs["text_kwargs"]["padding"] = False
+                output_kwargs["text_kwargs"]["truncation"] = False
+                image_text_encoding = self.tokenizer(image_tokens, **output_kwargs["text_kwargs"])
+                for k in text_encoding:
+                    text_encoding[k] = [image_text_encoding[k] + sample for sample in text_encoding[k]]
+            encoding.update(text_encoding)
+
+        if images is not None:
+            image_encoding = self.image_processor(images, **output_kwargs["images_kwargs"])
+            encoding.update(image_encoding)
+
+        # Cast to desired return tensors type
+        encoding = BatchFeature(encoding, tensor_type=return_tensors)
+        return encoding
+
+    @property
+    def model_input_names(self):
+        tokenizer_input_names = self.tokenizer.model_input_names
+        image_processor_input_names = self.image_processor.model_input_names
+        qformer_input_names = ["qformer_input_ids", "qformer_attention_mask"]
+        return tokenizer_input_names + image_processor_input_names + qformer_input_names
+
+    # overwrite to save the Q-Former tokenizer in a separate folder
+    def save_pretrained(self, save_directory, **kwargs):
+        if os.path.isfile(save_directory):
+            raise ValueError(f"Provided path ({save_directory}) should be a directory, not a file")
+        os.makedirs(save_directory, exist_ok=True)
+        qformer_tokenizer_path = os.path.join(save_directory, "qformer_tokenizer")
+        self.qformer_tokenizer.save_pretrained(qformer_tokenizer_path)
+
+        # We modify the attributes so that only the tokenizer and image processor are saved in the main folder
+        qformer_present = "qformer_tokenizer" in self.attributes
+        if qformer_present:
+            self.attributes.remove("qformer_tokenizer")
+
+        outputs = super().save_pretrained(save_directory, **kwargs)
+
+        if qformer_present:
+            self.attributes += ["qformer_tokenizer"]
+        return outputs
+
+    # overwrite to load the Q-Former tokenizer from a separate folder
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
+        processor = super().from_pretrained(pretrained_model_name_or_path, **kwargs)
+
+        # if return_unused_kwargs a tuple is returned where the second element is 'unused_kwargs'
+        if isinstance(processor, tuple):
+            processor = processor[0]
+        qformer_tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path, subfolder="qformer_tokenizer")
+        processor.qformer_tokenizer = qformer_tokenizer
+        return processor
+
+
+# Names exported by `from <this module> import *`.
+__all__ = ["InstructBlipProcessor"]
diff --git a/phivenv/Lib/site-packages/transformers/models/instructblipvideo/__init__.py b/phivenv/Lib/site-packages/transformers/models/instructblipvideo/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..2eb06450487cbea467b3c7be4be07ad524b47042
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/instructblipvideo/__init__.py
@@ -0,0 +1,30 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:  # star imports below are only evaluated by static type checkers, never at runtime
+    from .configuration_instructblipvideo import *
+    from .image_processing_instructblipvideo import *
+    from .modeling_instructblipvideo import *
+    from .processing_instructblipvideo import *
+    from .video_processing_instructblipvideo import *
+else:
+    import sys
+    # At runtime, replace this module's `sys.modules` entry with a `_LazyModule` built from this file's path.
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/phivenv/Lib/site-packages/transformers/models/instructblipvideo/__pycache__/__init__.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/instructblipvideo/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f805317e9a5d3b1d0b54e8a76e03be13eeebb99b
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/instructblipvideo/__pycache__/__init__.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/instructblipvideo/__pycache__/configuration_instructblipvideo.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/instructblipvideo/__pycache__/configuration_instructblipvideo.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..45490432da5d61a7d67b9e6f198fb95d34a05fab
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/instructblipvideo/__pycache__/configuration_instructblipvideo.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/instructblipvideo/__pycache__/image_processing_instructblipvideo.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/instructblipvideo/__pycache__/image_processing_instructblipvideo.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e0583a4b2e52869d98e7f9fa2eec613f397ae154
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/instructblipvideo/__pycache__/image_processing_instructblipvideo.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/instructblipvideo/__pycache__/modeling_instructblipvideo.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/instructblipvideo/__pycache__/modeling_instructblipvideo.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7ebe20a42daa0ae888050b36810f552e53315dfe
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/instructblipvideo/__pycache__/modeling_instructblipvideo.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/instructblipvideo/__pycache__/modular_instructblipvideo.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/instructblipvideo/__pycache__/modular_instructblipvideo.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..eb047a6d21a5ebe22be253f1bfc47d25e7831e61
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/instructblipvideo/__pycache__/modular_instructblipvideo.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/instructblipvideo/__pycache__/processing_instructblipvideo.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/instructblipvideo/__pycache__/processing_instructblipvideo.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8dc606dee0eb19a14a0209567aad0cd9e4a8c4de
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/instructblipvideo/__pycache__/processing_instructblipvideo.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/instructblipvideo/__pycache__/video_processing_instructblipvideo.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/instructblipvideo/__pycache__/video_processing_instructblipvideo.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fd759f4ae14eab4b14a8b6a4016f9a73cd9bb309
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/instructblipvideo/__pycache__/video_processing_instructblipvideo.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/instructblipvideo/configuration_instructblipvideo.py b/phivenv/Lib/site-packages/transformers/models/instructblipvideo/configuration_instructblipvideo.py
new file mode 100644
index 0000000000000000000000000000000000000000..af2acc83387675e5bac3fcfa7c6ffe5c793838a0
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/instructblipvideo/configuration_instructblipvideo.py
@@ -0,0 +1,345 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/instructblipvideo/modular_instructblipvideo.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_instructblipvideo.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from ...configuration_utils import PretrainedConfig
+from ...models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
+from ...utils import logging
+from ..auto import CONFIG_MAPPING, AutoConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+class InstructBlipVideoVisionConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`InstructBlipVideoVisionModel`]. It is used to
+    instantiate an InstructBlipVideo vision encoder according to the specified arguments, defining the model architecture.
+    Instantiating a configuration with the defaults will yield a similar configuration to that of the InstructBlipVideo
+    [Salesforce/instruct-blip-flan-t5](https://huggingface.co/Salesforce/instruct-blip-flan-t5) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        hidden_size (`int`, *optional*, defaults to 1408):
+            Dimensionality of the encoder layers and the pooler layer.
+        intermediate_size (`int`, *optional*, defaults to 6144):
+            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+        num_hidden_layers (`int`, *optional*, defaults to 39):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        image_size (`int`, *optional*, defaults to 224):
+            The size (resolution) of each image.
+        patch_size (`int`, *optional*, defaults to 14):
+            The size (resolution) of each patch.
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"selu"` and `"gelu_new"` are supported. The layer-normalization epsilon is configured
+            separately through `layer_norm_eps`.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-06):
+            The epsilon used by the layer normalization layers.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        initializer_range (`float`, *optional*, defaults to 1e-10):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        qkv_bias (`bool`, *optional*, defaults to `True`):
+            Whether to add a bias to the queries and values in the self-attention layers.
+
+    Example:
+
+    ```python
+    >>> from transformers import InstructBlipVideoVisionConfig, InstructBlipVideoVisionModel
+
+    >>> # Initializing an InstructBlipVideoVisionConfig with Salesforce/instruct-blip-flan-t5 style configuration
+    >>> configuration = InstructBlipVideoVisionConfig()
+
+    >>> # Initializing an InstructBlipVideoVisionModel (with random weights) from the Salesforce/instruct-blip-flan-t5 style configuration
+    >>> model = InstructBlipVideoVisionModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "instructblipvideo_vision_model"
+    base_config_key = "vision_config"
+
+    def __init__(
+        self,
+        hidden_size=1408,
+        intermediate_size=6144,
+        num_hidden_layers=39,
+        num_attention_heads=16,
+        image_size=224,
+        patch_size=14,
+        hidden_act="gelu",
+        layer_norm_eps=1e-6,
+        attention_dropout=0.0,
+        initializer_range=1e-10,
+        qkv_bias=True,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.patch_size = patch_size
+        self.image_size = image_size
+        self.initializer_range = initializer_range
+        self.attention_dropout = attention_dropout
+        self.layer_norm_eps = layer_norm_eps
+        self.hidden_act = hidden_act
+        self.qkv_bias = qkv_bias
+
+
+class InstructBlipVideoQFormerConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`InstructBlipVideoQFormerModel`]. It is used to
+    instantiate an InstructBlipVideo Querying Transformer (Q-Former) model according to the specified arguments, defining the
+    model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
+    the InstructBlipVideo [Salesforce/instruct-blip-flan-t5](https://huggingface.co/Salesforce/instruct-blip-flan-t5)
+    architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs.
+    Read the documentation from [`PretrainedConfig`] for more information.
+
+    Note that [`InstructBlipVideoQFormerModel`] is very similar to [`BertLMHeadModel`] with interleaved cross-attention.
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 30522):
+            Vocabulary size of the Q-Former model. Defines the number of different tokens that can be represented by
+            the `input_ids` passed when calling the model.
+        hidden_size (`int`, *optional*, defaults to 768):
+            Dimensionality of the encoder layers and the pooler layer.
+        num_hidden_layers (`int`, *optional*, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 12):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        intermediate_size (`int`, *optional*, defaults to 3072):
+            Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
+        hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"silu"` and `"gelu_new"` are supported.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout ratio for the attention probabilities.
+        max_position_embeddings (`int`, *optional*, defaults to 512):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            just in case (e.g., 512 or 1024 or 2048).
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
+            The epsilon used by the layer normalization layers.
+        pad_token_id (`int`, *optional*, defaults to 0):
+            Token id used for padding sequences.
+        position_embedding_type (`str`, *optional*, defaults to `"absolute"`):
+            Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For
+            positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to
+            [Self-Attention with Relative Position Representations (Shaw et al.)](https://huggingface.co/papers/1803.02155).
+            For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models
+            with Better Relative Position Embeddings (Huang et al.)](https://huggingface.co/papers/2009.13658).
+        cross_attention_frequency (`int`, *optional*, defaults to 2):
+            The frequency of adding cross-attention to the Transformer layers.
+        encoder_hidden_size (`int`, *optional*, defaults to 1408):
+            The hidden size of the hidden states for cross-attention.
+
+    Examples:
+
+    ```python
+    >>> from transformers import InstructBlipVideoQFormerConfig, InstructBlipVideoQFormerModel
+
+    >>> # Initializing an InstructBlipVideo Salesforce/instruct-blip-flan-t5 style configuration
+    >>> configuration = InstructBlipVideoQFormerConfig()
+
+    >>> # Initializing a model (with random weights) from the Salesforce/instruct-blip-flan-t5 style configuration
+    >>> model = InstructBlipVideoQFormerModel(configuration)
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "instructblipvideo_qformer"
+    base_config_key = "qformer_config"
+
+    def __init__(
+        self,
+        vocab_size=30522,
+        hidden_size=768,
+        num_hidden_layers=12,
+        num_attention_heads=12,
+        intermediate_size=3072,
+        hidden_act="gelu",
+        hidden_dropout_prob=0.1,
+        attention_probs_dropout_prob=0.1,
+        max_position_embeddings=512,
+        initializer_range=0.02,
+        layer_norm_eps=1e-12,
+        pad_token_id=0,
+        position_embedding_type="absolute",
+        cross_attention_frequency=2,
+        encoder_hidden_size=1408,
+        **kwargs,
+    ):
+        super().__init__(pad_token_id=pad_token_id, **kwargs)
+        # `pad_token_id` is forwarded to the base class above; every other argument is stored on the instance here.
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.hidden_act = hidden_act
+        self.intermediate_size = intermediate_size
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.max_position_embeddings = max_position_embeddings
+        self.initializer_range = initializer_range
+        self.layer_norm_eps = layer_norm_eps
+        self.position_embedding_type = position_embedding_type
+        self.cross_attention_frequency = cross_attention_frequency
+        self.encoder_hidden_size = encoder_hidden_size
+
+
+class InstructBlipVideoConfig(PretrainedConfig):
+    r"""
+    [`InstructBlipVideoConfig`] is the configuration class to store the configuration of a
+    [`InstructBlipVideoForConditionalGeneration`]. It is used to instantiate an Instructblipvideo model according to the specified
+    arguments, defining the vision model, Q-Former model and language model configs. Instantiating a configuration with
+    the defaults will yield a similar configuration to that of the Instructblipvideo
+    [Salesforce/instruct-blip-flan-t5](https://huggingface.co/Salesforce/instruct-blip-flan-t5) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        vision_config (`dict`, *optional*):
+            Dictionary of configuration options used to initialize [`InstructBlipVideoVisionConfig`].
+        qformer_config (`dict`, *optional*):
+            Dictionary of configuration options used to initialize [`InstructBlipVideoQFormerConfig`].
+        text_config (`dict`, *optional*):
+            Dictionary of configuration options used to initialize any [`PretrainedConfig`].
+        num_query_tokens (`int`, *optional*, defaults to 32):
+            The number of query tokens passed through the Transformer.
+
+        video_token_index (`int`, *optional*):
+            Token index of special video token.
+        kwargs (*optional*):
+            Dictionary of keyword arguments.
+
+    Example:
+
+    ```python
+    >>> from transformers import (
+    ...     InstructBlipVideoVisionConfig,
+    ...     InstructBlipVideoQFormerConfig,
+    ...     OPTConfig,
+    ...     InstructBlipVideoConfig,
+    ...     InstructBlipVideoForConditionalGeneration,
+    ... )
+
+    >>> # Initializing an InstructBlipVideoConfig with Salesforce/instruct-blip-flan-t5 style configuration
+    >>> configuration = InstructBlipVideoConfig()
+
+    >>> # Initializing an InstructBlipVideoForConditionalGeneration (with random weights) from the Salesforce/instruct-blip-flan-t5 style configuration
+    >>> model = InstructBlipVideoForConditionalGeneration(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+
+    >>> # We can also initialize an InstructBlipVideoConfig from an InstructBlipVideoVisionConfig, InstructBlipVideoQFormerConfig and any PretrainedConfig
+
+    >>> # Initializing Instructblipvideo vision, Instructblipvideo Q-Former and language model configurations
+    >>> vision_config = InstructBlipVideoVisionConfig()
+    >>> qformer_config = InstructBlipVideoQFormerConfig()
+    >>> text_config = OPTConfig()
+
+    >>> config = InstructBlipVideoConfig.from_vision_qformer_text_configs(vision_config, qformer_config, text_config)
+    ```"""
+
+    model_type = "instructblipvideo"
+    attribute_map = {
+        "video_token_id": "video_token_index",
+    }
+    sub_configs = {
+        "text_config": AutoConfig,
+        "qformer_config": InstructBlipVideoQFormerConfig,
+        "vision_config": InstructBlipVideoVisionConfig,
+    }
+
+    def __init__(
+        self,
+        vision_config=None,
+        qformer_config=None,
+        text_config=None,
+        num_query_tokens=32,
+        video_token_index=None,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        if vision_config is None:
+            vision_config = {}
+            logger.info("vision_config is None. initializing the InstructBlipVideoVisionConfig with default values.")
+
+        if qformer_config is None:
+            qformer_config = {}
+            logger.info("qformer_config is None. Initializing the InstructBlipVideoQFormerConfig with default values.")
+
+        if text_config is None:
+            text_config = {}
+            logger.info("text_config is None. Initializing the text config with default values (`OPTConfig`).")
+
+        self.vision_config = InstructBlipVideoVisionConfig(**vision_config)
+        self.qformer_config = InstructBlipVideoQFormerConfig(**qformer_config)
+        text_model_type = text_config.get("model_type", "opt")
+        self.text_config = CONFIG_MAPPING[text_model_type](**text_config)
+        # Top-level settings; the Q-Former's `encoder_hidden_size` is overwritten with the vision hidden size.
+        self.num_query_tokens = num_query_tokens
+        self.video_token_index = video_token_index
+        self.qformer_config.encoder_hidden_size = self.vision_config.hidden_size
+        self.use_decoder_only_language_model = self.text_config.model_type in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
+        self.initializer_factor = 1.0
+        self.initializer_range = 0.02
+
+    @classmethod
+    def from_vision_qformer_text_configs(
+        cls,
+        vision_config: InstructBlipVideoVisionConfig,
+        qformer_config: InstructBlipVideoQFormerConfig,
+        text_config: PretrainedConfig,
+        **kwargs,
+    ):
+        r"""
+        Instantiate a [`InstructBlipVideoConfig`] (or a derived class) from an InstructBlipVideo vision model, Q-Former and
+        language model configurations.
+
+        Returns:
+            [`InstructBlipVideoConfig`]: An instance of a configuration object
+        """
+        # Sub-configs are passed as plain dicts because `__init__` rebuilds each one via `**config` unpacking.
+        return cls(
+            vision_config=vision_config.to_dict(),
+            qformer_config=qformer_config.to_dict(),
+            text_config=text_config.to_dict(),
+            **kwargs,
+        )
+
+
+__all__ = ["InstructBlipVideoConfig", "InstructBlipVideoQFormerConfig", "InstructBlipVideoVisionConfig"]
diff --git a/phivenv/Lib/site-packages/transformers/models/instructblipvideo/image_processing_instructblipvideo.py b/phivenv/Lib/site-packages/transformers/models/instructblipvideo/image_processing_instructblipvideo.py
new file mode 100644
index 0000000000000000000000000000000000000000..68c812422af241b777ef1b25a3f7a4eb73ff78a9
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/instructblipvideo/image_processing_instructblipvideo.py
@@ -0,0 +1,332 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Image processor class for InstructBLIPVideo. Largely a copy of Blip2Processor with the addition of video processing abilities
+"""
+
+from typing import Optional, Union
+
+import numpy as np
+
+from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
+from ...image_transforms import convert_to_rgb, resize, to_channel_dimension_format
+from ...image_utils import (
+    OPENAI_CLIP_MEAN,
+    OPENAI_CLIP_STD,
+    ChannelDimension,
+    ImageInput,
+    PILImageResampling,
+    infer_channel_dimension_format,
+    is_scaled_image,
+    to_numpy_array,
+    valid_images,
+    validate_preprocess_arguments,
+)
+from ...utils import TensorType, filter_out_non_signature_kwargs, logging
+from ...video_utils import VideoInput, make_batched_videos
+
+
+logger = logging.get_logger(__name__)
+
+
+# TODO (raushan): processor can be removed after v5 release. Kept for backwards compatibility
+# Copied from transformers.models.blip.image_processing_blip.BlipImageProcessor with Blip->InstructBlipVideo, BLIP->InstructBLIPVideo
+class InstructBlipVideoImageProcessor(BaseImageProcessor):
+    r"""
+    Constructs a InstructBLIPVideo image processor.
+
+    Args:
+        do_resize (`bool`, *optional*, defaults to `True`):
+            Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by the
+            `do_resize` parameter in the `preprocess` method.
+        size (`dict`, *optional*, defaults to `{"height": 384, "width": 384}`):
+            Size of the output image after resizing. Can be overridden by the `size` parameter in the `preprocess`
+            method.
+        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
+            Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. Can be
+            overridden by the `resample` parameter in the `preprocess` method.
+        do_rescale (`bool`, *optional*, defaults to `True`):
+            Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the
+            `do_rescale` parameter in the `preprocess` method.
+        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
+            Scale factor to use if rescaling the image. Only has an effect if `do_rescale` is set to `True`. Can be
+            overridden by the `rescale_factor` parameter in the `preprocess` method.
+        do_normalize (`bool`, *optional*, defaults to `True`):
+            Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
+            method.
+        image_mean (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_MEAN`):
+            Mean to use if normalizing the image. This is a float or list of floats the length of the number of
+            channels in the image. When left as `None`, `OPENAI_CLIP_MEAN` is used. Can be overridden by the
+            `image_mean` parameter in the `preprocess` method.
+        image_std (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_STD`):
+            Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
+            number of channels in the image. When left as `None`, `OPENAI_CLIP_STD` is used. Can be overridden by
+            the `image_std` parameter in the `preprocess` method.
+        do_convert_rgb (`bool`, *optional*, defaults to `True`):
+            Whether to convert the image to RGB.
+    """
+
+    model_input_names = ["pixel_values"]
+
+    def __init__(
+        self,
+        do_resize: bool = True,
+        size: Optional[dict[str, int]] = None,
+        resample: PILImageResampling = PILImageResampling.BICUBIC,
+        do_rescale: bool = True,
+        rescale_factor: Union[int, float] = 1 / 255,
+        do_normalize: bool = True,
+        image_mean: Optional[Union[float, list[float]]] = None,
+        image_std: Optional[Union[float, list[float]]] = None,
+        do_convert_rgb: bool = True,
+        **kwargs,
+    ) -> None:
+        super().__init__(**kwargs)
+        size = size if size is not None else {"height": 384, "width": 384}
+        size = get_size_dict(size, default_to_square=True)
+        # Store the defaults on the instance; mean/std fall back to the OpenAI CLIP statistics when not given.
+        self.do_resize = do_resize
+        self.size = size
+        self.resample = resample
+        self.do_rescale = do_rescale
+        self.rescale_factor = rescale_factor
+        self.do_normalize = do_normalize
+        self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
+        self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
+        self.do_convert_rgb = do_convert_rgb
+
+    # Copied from transformers.models.vit.image_processing_vit.ViTImageProcessor.resize with PILImageResampling.BILINEAR->PILImageResampling.BICUBIC
+    def resize(
+        self,
+        image: np.ndarray,
+        size: dict[str, int],
+        resample: PILImageResampling = PILImageResampling.BICUBIC,
+        data_format: Optional[Union[str, ChannelDimension]] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+        **kwargs,
+    ) -> np.ndarray:
+        """
+        Resize an image to `(size["height"], size["width"])`.
+
+        Args:
+            image (`np.ndarray`):
+                Image to resize.
+            size (`dict[str, int]`):
+                Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image.
+            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
+                `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BICUBIC`.
+            data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the output image. If unset, the channel dimension format of the input
+                image is used. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the input image. If unset, the channel dimension format is inferred
+                from the input image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+
+        Returns:
+            `np.ndarray`: The resized image.
+        """
+        size = get_size_dict(size)
+        if "height" not in size or "width" not in size:
+            raise ValueError(f"The `size` dictionary must contain the keys `height` and `width`. Got {size.keys()}")
+        # The module-level `resize` transform is given the target size as a (height, width) tuple, not the dict.
+        output_size = (size["height"], size["width"])
+        return resize(
+            image,
+            size=output_size,
+            resample=resample,
+            data_format=data_format,
+            input_data_format=input_data_format,
+            **kwargs,
+        )
+
+    # Ignore copy
+    @filter_out_non_signature_kwargs()
+    def preprocess(
+        self,
+        images: VideoInput = None,
+        do_resize: Optional[bool] = None,
+        size: Optional[dict[str, int]] = None,
+        resample: Optional[PILImageResampling] = None,
+        do_rescale: Optional[bool] = None,
+        rescale_factor: Optional[float] = None,
+        do_normalize: Optional[bool] = None,
+        image_mean: Optional[Union[float, list[float]]] = None,
+        image_std: Optional[Union[float, list[float]]] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        do_convert_rgb: Optional[bool] = None,
+        data_format: ChannelDimension = ChannelDimension.FIRST,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    ) -> BatchFeature:
+        """
+        Preprocess a video or a batch of videos, one frame at a time.
+
+        Every argument left as `None` falls back to the value stored on the processor instance.
+
+        Args:
+            images (`VideoInput`):
+                Video frames to preprocess. Despite its name, this argument holds videos: it is batched with
+                `make_batched_videos` and every frame of every video is processed individually. Expects pixel
+                values ranging from 0 to 255. If passing in frames with pixel values between 0 and 1, set
+                `do_rescale=False`.
+            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
+                Whether to resize the frames.
+            size (`dict[str, int]`, *optional*, defaults to `self.size`):
+                Target size of the frames after resizing. The value is normalized with
+                `get_size_dict(size, default_to_square=False)` and handed to `self.resize`, which requires the
+                resulting dictionary to contain the keys `height` and `width`.
+            resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
+                Resampling filter to use if resizing the frames. Only has an effect if `do_resize` is set to `True`.
+            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
+                Whether to rescale the frame values between [0 - 1].
+            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
+                Rescale factor to rescale the frames by if `do_rescale` is set to `True`.
+            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
+                Whether to normalize the frames.
+            image_mean (`float` or `list[float]`, *optional*, defaults to `self.image_mean`):
+                Mean to normalize the frames by if `do_normalize` is set to `True`.
+            image_std (`float` or `list[float]`, *optional*, defaults to `self.image_std`):
+                Standard deviation to normalize the frames by if `do_normalize` is set to `True`.
+            return_tensors (`str` or `TensorType`, *optional*):
+                The type of tensors to return. Can be one of:
+                    - Unset: Return a list of `np.ndarray`.
+                    - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
+                    - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
+                    - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
+                    - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
+            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
+                Whether to convert the frames to RGB.
+            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
+                The channel dimension format for the output frames. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the input frames. If unset, the channel dimension format is
+                inferred from each frame. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+
+        Returns:
+            `BatchFeature`: A batch with a single `pixel_values` entry holding, for every video, the list of its
+            processed frames, converted according to `return_tensors`.
+
+        Raises:
+            ValueError: If the batched input is rejected by `valid_images`.
+        """
+        do_resize = do_resize if do_resize is not None else self.do_resize
+        resample = resample if resample is not None else self.resample
+        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
+        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
+        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
+        image_mean = image_mean if image_mean is not None else self.image_mean
+        image_std = image_std if image_std is not None else self.image_std
+        do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
+
+        size = size if size is not None else self.size
+        size = get_size_dict(size, default_to_square=False)
+
+        videos = make_batched_videos(images)
+        # Emit the deprecation notice a single time instead of on every call, matching the
+        # `warning_once` usage in `_preprocess_image`.
+        logger.warning_once(
+            "`InstructBlipVideoImageProcessor` is deprecated and will be removed in v5.0. "
+            "We recommend to load an instance of `InstructBlipVideoVideoProcessor` to process videos for the model. "
+        )
+
+        validate_preprocess_arguments(
+            do_rescale=do_rescale,
+            rescale_factor=rescale_factor,
+            do_normalize=do_normalize,
+            image_mean=image_mean,
+            image_std=image_std,
+            do_resize=do_resize,
+            size=size,
+            resample=resample,
+        )
+
+        if not valid_images(videos):
+            raise ValueError(
+                "Invalid input type. Must be of type PIL.Image.Image, numpy.ndarray, "
+                "torch.Tensor, tf.Tensor or jax.ndarray."
+            )
+
+        # Nested layout: outer list over videos, inner list over the frames of each video.
+        pixel_values = [
+            [
+                self._preprocess_image(
+                    image=frame,
+                    do_resize=do_resize,
+                    size=size,
+                    resample=resample,
+                    do_rescale=do_rescale,
+                    rescale_factor=rescale_factor,
+                    do_normalize=do_normalize,
+                    image_mean=image_mean,
+                    image_std=image_std,
+                    do_convert_rgb=do_convert_rgb,
+                    data_format=data_format,
+                    input_data_format=input_data_format,
+                )
+                for frame in video
+            ]
+            for video in videos
+        ]
+
+        return BatchFeature(data={"pixel_values": pixel_values}, tensor_type=return_tensors)
+
+    # Ignore copy
+    def _preprocess_image(
+        self,
+        image: ImageInput = None,
+        do_resize: Optional[bool] = None,
+        size: Optional[dict[str, int]] = None,
+        resample: PILImageResampling = None,
+        do_rescale: Optional[bool] = None,
+        rescale_factor: Optional[float] = None,
+        do_normalize: Optional[bool] = None,
+        image_mean: Optional[Union[float, list[float]]] = None,
+        image_std: Optional[Union[float, list[float]]] = None,
+        do_convert_rgb: Optional[bool] = None,
+        data_format: ChannelDimension = ChannelDimension.FIRST,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    ) -> np.ndarray:
+        """
+        Run the preprocessing pipeline on a single frame.
+
+        The steps are applied in a fixed order: optional RGB conversion, conversion to a numpy array, optional
+        resize, optional rescale, optional normalization and, last, a move to the `data_format` channel layout.
+        Unlike `preprocess`, no argument falls back to the processor defaults here; values are used as given.
+
+        Returns:
+            `np.ndarray`: The processed frame laid out according to `data_format`.
+        """
+        if do_convert_rgb:
+            # Happens before the numpy conversion (e.g. PIL RGBA inputs become RGB).
+            image = convert_to_rgb(image)
+
+        # Every transformation below works on numpy arrays.
+        frame = to_numpy_array(image)
+
+        if do_rescale:
+            if is_scaled_image(frame):
+                logger.warning_once(
+                    "It looks like you are trying to rescale already rescaled video frames. If the input"
+                    " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
+                )
+
+        # Without an explicit layout, infer it from the frame itself.
+        channel_layout = (
+            infer_channel_dimension_format(frame) if input_data_format is None else input_data_format
+        )
+
+        if do_resize:
+            frame = self.resize(image=frame, size=size, resample=resample, input_data_format=channel_layout)
+
+        if do_rescale:
+            frame = self.rescale(image=frame, scale=rescale_factor, input_data_format=channel_layout)
+
+        if do_normalize:
+            frame = self.normalize(image=frame, mean=image_mean, std=image_std, input_data_format=channel_layout)
+
+        return to_channel_dimension_format(frame, data_format, input_channel_dim=channel_layout)
+
+
+# Public API of this module: only the image processor class is exported.
+__all__ = ["InstructBlipVideoImageProcessor"]
diff --git a/phivenv/Lib/site-packages/transformers/models/instructblipvideo/modeling_instructblipvideo.py b/phivenv/Lib/site-packages/transformers/models/instructblipvideo/modeling_instructblipvideo.py
new file mode 100644
index 0000000000000000000000000000000000000000..b554e69af9e8efdaf495fa83647752d4ac327ea9
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/instructblipvideo/modeling_instructblipvideo.py
@@ -0,0 +1,1748 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/instructblipvideo/modular_instructblipvideo.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_instructblipvideo.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+from dataclasses import dataclass
+from typing import Any, Callable, Optional, Union
+
+import torch
+from torch import nn
+
+from ...activations import ACT2FN
+from ...generation import GenerationMixin
+from ...modeling_flash_attention_utils import FlashAttentionKwargs
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import (
+    BaseModelOutput,
+    BaseModelOutputWithPastAndCrossAttentions,
+    BaseModelOutputWithPooling,
+    BaseModelOutputWithPoolingAndCrossAttentions,
+)
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from ...processing_utils import Unpack
+from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
+from ...utils import ModelOutput, TransformersKwargs, auto_docstring, can_return_tuple, logging, torch_int
+from ..auto import AutoModel, AutoModelForCausalLM, AutoModelForSeq2SeqLM
+from .configuration_instructblipvideo import (
+    InstructBlipVideoConfig,
+    InstructBlipVideoQFormerConfig,
+    InstructBlipVideoVisionConfig,
+)
+
+
+# Module-level logger, named after this module.
+logger = logging.get_logger(__name__)
+
+
+class InstructBlipVideoVisionEmbeddings(nn.Module):
+    """
+    Turns pixel values into a token sequence: a learned class token followed by one token per image patch, with
+    learned position embeddings added on top.
+    """
+
+    def __init__(self, config: InstructBlipVideoVisionConfig):
+        super().__init__()
+        self.config = config
+        self.embed_dim = config.hidden_size
+        self.image_size = config.image_size
+        self.patch_size = config.patch_size
+
+        # Parameters are created in this exact order so random initialisation stays reproducible.
+        self.class_embedding = nn.Parameter(torch.randn(1, 1, self.embed_dim))
+
+        # Non-overlapping patches: kernel size and stride are both the patch size.
+        self.patch_embedding = nn.Conv2d(
+            in_channels=3,
+            out_channels=self.embed_dim,
+            kernel_size=self.patch_size,
+            stride=self.patch_size,
+        )
+
+        patches_per_side = self.image_size // self.patch_size
+        self.num_patches = patches_per_side**2
+        # One extra position for the class token.
+        self.num_positions = self.num_patches + 1
+
+        self.position_embedding = nn.Parameter(torch.randn(1, self.num_positions, self.embed_dim))
+
+    def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
+        """
+        Resample the stored position embeddings so they fit an input of `height` x `width` pixels, which lets the
+        model run on resolutions other than the one it was configured with. Written to stay compatible with
+        torch.jit tracing.
+
+        Adapted from:
+        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
+        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
+        """
+
+        # Both counts exclude the class token.
+        seq_patches = embeddings.shape[1] - 1
+        stored_patches = self.position_embedding.shape[1] - 1
+
+        # While tracing we always interpolate so that the exported model copes with dynamic input shapes.
+        must_interpolate = torch.jit.is_tracing() or seq_patches != stored_patches or height != width
+        if not must_interpolate:
+            return self.position_embedding
+
+        cls_pos = self.position_embedding[:, :1]
+        grid_pos = self.position_embedding[:, 1:]
+
+        hidden_dim = embeddings.shape[-1]
+
+        target_rows = height // self.patch_size
+        target_cols = width // self.patch_size
+
+        # The stored patch positions are laid out as a square grid before resampling.
+        side = torch_int(stored_patches**0.5)
+        grid_pos = grid_pos.reshape(1, side, side, hidden_dim).permute(0, 3, 1, 2)
+
+        grid_pos = nn.functional.interpolate(
+            grid_pos,
+            size=(target_rows, target_cols),
+            mode="bicubic",
+            align_corners=False,
+        )
+
+        # Back to a flat (1, positions, dim) sequence.
+        grid_pos = grid_pos.permute(0, 2, 3, 1).view(1, -1, hidden_dim)
+
+        return torch.cat((cls_pos, grid_pos), dim=1)
+
+    def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding: bool = False) -> torch.Tensor:
+        """
+        Embed `pixel_values` (batch, channels, height, width) into a (batch, 1 + patches, embed_dim) sequence.
+        """
+        batch_size, _, height, width = pixel_values.shape
+        target_dtype = self.patch_embedding.weight.dtype
+
+        # Conv output is (batch, embed_dim, rows, cols); flatten the grid and move channels last.
+        patch_tokens = self.patch_embedding(pixel_values.to(dtype=target_dtype))
+        patch_tokens = patch_tokens.flatten(2).transpose(1, 2)
+
+        cls_tokens = self.class_embedding.expand(batch_size, 1, -1).to(target_dtype)
+        embeddings = torch.cat([cls_tokens, patch_tokens], dim=1)
+
+        position_embedding = (
+            self.interpolate_pos_encoding(embeddings, height, width)
+            if interpolate_pos_encoding
+            else self.position_embedding
+        )
+        return embeddings + position_embedding[:, : embeddings.size(1), :].to(target_dtype)
+
+
+@auto_docstring
+class InstructBlipVideoPreTrainedModel(PreTrainedModel):
+    # Shared base class for the InstructBlipVideo models: declares the config type, the capability flags read by
+    # the `PreTrainedModel` machinery, and the weight initialisation scheme.
+    config: InstructBlipVideoConfig
+    base_model_prefix = "blip"
+    supports_gradient_checkpointing = True
+    _supports_attention_backend = True
+    _supports_flash_attn = True
+    _supports_sdpa = True
+    _supports_flex_attn = True
+
+    _can_compile_fullgraph = True
+
+    _no_split_modules = [
+        "InstructBlipVideoQFormerEmbeddings",
+        "InstructBlipVideoAttention",
+        "InstructBlipVideoQFormerMultiHeadAttention",
+        "InstructBlipVideoQFormerSelfOutput",
+    ]
+
+    def _init_weights(self, module):
+        """Initialise `module` in place; every random draw uses `config.initializer_range` as its std."""
+        std = self.config.initializer_range
+
+        # The checks run in a fixed order and the first matching module type wins.
+        if isinstance(module, (nn.Linear, nn.Conv2d)):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.bias is not None:
+                module.bias.data.zero_()
+            return
+
+        if isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=std)
+            return
+
+        if isinstance(module, nn.LayerNorm):
+            # Identity affine transform: zero shift, unit scale.
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+            return
+
+        if isinstance(module, InstructBlipVideoVisionEmbeddings):
+            nn.init.trunc_normal_(module.position_embedding, mean=0.0, std=std)
+            nn.init.trunc_normal_(module.class_embedding, mean=0.0, std=std)
+            return
+
+        if isinstance(module, (InstructBlipVideoForConditionalGeneration, InstructBlipVideoModel)):
+            # The learned query tokens start out as zeros.
+            module.query_tokens.data.zero_()
+
+
+# Adapted from transformers.models.siglip.modeling_siglip.eager_attention_forward -> InstructBlipVideo doesn't cast attn weights to fp32
+def eager_attention_forward(
+    module: nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: Optional[torch.Tensor],
+    scaling: float,
+    dropout: float = 0.0,
+    **kwargs,
+):
+    """
+    Reference (non-fused) scaled dot-product attention.
+
+    Args:
+        module: Module on whose behalf attention runs; only `module.training` is read, to gate dropout.
+        query, key, value: Attention inputs; the last two dimensions of `key` are swapped for the score matmul.
+        attention_mask: Optional additive mask, summed onto the scaled scores before the softmax.
+        scaling: Factor the raw `query @ key^T` scores are multiplied by.
+        dropout: Dropout probability applied to the attention probabilities.
+        **kwargs: Accepted for interface compatibility and ignored.
+
+    Returns:
+        A `(attn_output, attn_weights)` tuple, where `attn_output` has dimensions 1 and 2 swapped relative to
+        `attn_weights @ value` and is made contiguous.
+    """
+    scores = torch.matmul(query, key.transpose(-1, -2)) * scaling
+    if attention_mask is not None:
+        scores = scores + attention_mask
+
+    # Softmax runs in the dtype of the scores (no fp32 upcast, see the note above the function).
+    probs = nn.functional.softmax(scores, dim=-1)
+    probs = nn.functional.dropout(probs, p=dropout, training=module.training)
+
+    context = torch.matmul(probs, value).transpose(1, 2).contiguous()
+    return context, probs
+
+
+class InstructBlipVideoAttention(nn.Module):
+    """
+    Multi-headed attention from 'Attention Is All You Need' paper.
+
+    Queries, keys and values come from one fused `qkv` projection. No attention mask is applied: `forward` always
+    calls the attention backend with `attention_mask=None`.
+    """
+
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.embed_dim = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = self.embed_dim // self.num_heads
+        if self.head_dim * self.num_heads != self.embed_dim:
+            raise ValueError(
+                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
+                f" {self.num_heads})."
+            )
+        self.scale = self.head_dim**-0.5
+        self.is_causal = False
+        self.attention_dropout = config.attention_dropout
+
+        # small tweak here compared to CLIP, no bias here
+        self.qkv = nn.Linear(self.embed_dim, 3 * self.embed_dim, bias=False)
+
+        if config.qkv_bias:
+            # Zero-initialised bias for the fused projection, laid out as [q | k | v]. This is the same
+            # parameter the previous concatenation of three zero vectors produced, without the temporaries.
+            self.qkv.bias = nn.Parameter(torch.zeros(3 * self.embed_dim))
+
+        self.projection = nn.Linear(self.embed_dim, self.embed_dim)
+
+    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
+        """Reshape `tensor` to `(bsz, num_heads, seq_len, head_dim)`."""
+        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        head_mask: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = False,
+        **kwargs,
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        """
+        Input shape: Batch x Time x Channel
+
+        Args:
+            hidden_states: Input of shape `(batch, seq_len, embed_dim)`.
+            head_mask: Accepted for interface compatibility; it is not used by this implementation.
+            output_attentions: Whether to return the attention weights as the second tuple element.
+            **kwargs: Forwarded untouched to the selected attention backend.
+
+        Returns:
+            `(attn_output, attn_weights)`; `attn_weights` is `None` unless `output_attentions` is truthy.
+        """
+
+        bsz, tgt_len, embed_dim = hidden_states.size()
+
+        mixed_qkv = self.qkv(hidden_states)
+
+        # (bsz, tgt_len, 3 * embed_dim) -> (3, bsz, num_heads, tgt_len, head_dim)
+        mixed_qkv = mixed_qkv.reshape(bsz, tgt_len, 3, self.num_heads, embed_dim // self.num_heads).permute(
+            2, 0, 3, 1, 4
+        )
+        query_states, key_states, value_states = mixed_qkv[0], mixed_qkv[1], mixed_qkv[2]
+
+        attention_interface: Callable = eager_attention_forward
+
+        if self.config._attn_implementation != "eager":
+            if self.config._attn_implementation == "sdpa" and output_attentions:
+                # SDPA cannot return attention weights, so stay on the eager implementation.
+                logger.warning_once(
+                    "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
+                    'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
+                )
+            else:
+                attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask=None,
+            dropout=0.0 if not self.training else self.attention_dropout,
+            scaling=self.scale,
+            **kwargs,
+        )
+
+        # Merge the heads back into a single channel dimension before the output projection.
+        attn_output = attn_output.reshape(bsz, tgt_len, -1).contiguous()
+        attn_output = self.projection(attn_output)
+
+        outputs = (attn_output, attn_weights) if output_attentions else (attn_output, None)
+        return outputs
+
+
+class InstructBlipVideoMLP(nn.Module):
+    """Two-layer feed-forward block: `hidden_size -> intermediate_size -> hidden_size`."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        # The activation is looked up by name from the config.
+        self.activation_fn = ACT2FN[config.hidden_act]
+        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
+        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        """Apply `fc1`, the activation and `fc2`, in that order."""
+        expanded = self.activation_fn(self.fc1(hidden_states))
+        return self.fc2(expanded)
+
+
+class InstructBlipVideoEncoderLayer(GradientCheckpointingLayer):
+    """Pre-norm transformer layer: self-attention and an MLP, each wrapped in a residual connection."""
+
+    def __init__(self, config: InstructBlipVideoConfig):
+        super().__init__()
+        self.embed_dim = config.hidden_size
+        self.self_attn = InstructBlipVideoAttention(config)
+        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+        self.mlp = InstructBlipVideoMLP(config)
+        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: torch.Tensor,
+        output_attentions: Optional[bool] = False,
+    ) -> tuple[torch.FloatTensor]:
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+            attention_mask (`torch.FloatTensor`): handed to the self-attention module as its `head_mask`
+                argument.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+
+        Returns:
+            `(hidden_states,)`, or `(hidden_states, attn_weights)` when `output_attentions` is truthy.
+        """
+        # Attention sub-block: normalise, attend, then add the untouched input back.
+        attn_input = self.layer_norm1(hidden_states)
+        attn_output, attn_weights = self.self_attn(
+            hidden_states=attn_input,
+            head_mask=attention_mask,
+            output_attentions=output_attentions,
+        )
+        hidden_states = attn_output + hidden_states
+
+        # Feed-forward sub-block with its own residual connection.
+        mlp_output = self.mlp(self.layer_norm2(hidden_states))
+        hidden_states = mlp_output + hidden_states
+
+        if output_attentions:
+            return (hidden_states, attn_weights)
+        return (hidden_states,)
+
+
+class InstructBlipVideoEncoder(nn.Module):
+    """
+    Stack of `config.num_hidden_layers` [`InstructBlipVideoEncoderLayer`] modules applied one after another.
+
+    Args:
+        config (`InstructBlipVideoConfig`):
+            The corresponding vision configuration for the `InstructBlipVideoEncoder`.
+    """
+
+    def __init__(self, config: InstructBlipVideoConfig):
+        super().__init__()
+        self.config = config
+        self.layers = nn.ModuleList([InstructBlipVideoEncoderLayer(config) for _ in range(config.num_hidden_layers)])
+        self.gradient_checkpointing = False
+
+    def forward(
+        self,
+        inputs_embeds,
+        attention_mask: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, BaseModelOutput]:
+        r"""
+        Args:
+            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+                Embedded representation of the inputs. Should be float, not int tokens.
+            attention_mask (`torch.Tensor`, *optional*):
+                Passed unchanged to every encoder layer.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail. Defaults to `config.output_attentions`.
+            output_hidden_states (`bool`, *optional*):
+                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+                for more detail. Defaults to `config.output_hidden_states`.
+            return_dict (`bool`, *optional*):
+                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. Defaults to
+                `config.use_return_dict`.
+        """
+        if output_attentions is None:
+            output_attentions = self.config.output_attentions
+        if output_hidden_states is None:
+            output_hidden_states = self.config.output_hidden_states
+        if return_dict is None:
+            return_dict = self.config.use_return_dict
+
+        # Collected lazily: each stays `None` when the corresponding output was not requested.
+        collected_states = () if output_hidden_states else None
+        collected_attentions = () if output_attentions else None
+
+        hidden_states = inputs_embeds
+        for layer in self.layers:
+            if output_hidden_states:
+                # Record the input of each layer; the final output is appended after the loop.
+                collected_states += (hidden_states,)
+
+            layer_outputs = layer(
+                hidden_states,
+                attention_mask=attention_mask,
+                output_attentions=output_attentions,
+            )
+            hidden_states = layer_outputs[0]
+
+            if output_attentions:
+                collected_attentions += (layer_outputs[1],)
+
+        if output_hidden_states:
+            collected_states += (hidden_states,)
+
+        if return_dict:
+            return BaseModelOutput(
+                last_hidden_state=hidden_states, hidden_states=collected_states, attentions=collected_attentions
+            )
+        # Tuple form drops the outputs that were not requested.
+        return tuple(v for v in [hidden_states, collected_states, collected_attentions] if v is not None)
+
+
+class InstructBlipVideoVisionModel(InstructBlipVideoPreTrainedModel):
+    # Vision tower: patch/position embeddings -> transformer encoder -> final layer norm.
+    main_input_name = "pixel_values"
+    config: InstructBlipVideoVisionConfig
+
+    def __init__(self, config: InstructBlipVideoVisionConfig):
+        super().__init__(config)
+        self.config = config
+        embed_dim = config.hidden_size
+
+        self.embeddings = InstructBlipVideoVisionEmbeddings(config)
+        self.encoder = InstructBlipVideoEncoder(config)
+        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
+
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        interpolate_pos_encoding: bool = False,
+    ) -> Union[tuple, BaseModelOutputWithPooling]:
+        # Unset flags fall back to the values stored in the config.
+        if output_attentions is None:
+            output_attentions = self.config.output_attentions
+        if output_hidden_states is None:
+            output_hidden_states = self.config.output_hidden_states
+        if return_dict is None:
+            return_dict = self.config.use_return_dict
+
+        if pixel_values is None:
+            raise ValueError("You have to specify pixel_values")
+
+        embedded = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding)
+
+        encoder_outputs = self.encoder(
+            inputs_embeds=embedded,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        last_hidden_state = self.post_layernorm(encoder_outputs[0])
+
+        # The pooled output is the first (class) token.
+        # NOTE(review): it goes through `post_layernorm` a second time here; kept as-is to preserve outputs.
+        pooled_output = self.post_layernorm(last_hidden_state[:, 0, :])
+
+        if return_dict:
+            return BaseModelOutputWithPooling(
+                last_hidden_state=last_hidden_state,
+                pooler_output=pooled_output,
+                hidden_states=encoder_outputs.hidden_states,
+                attentions=encoder_outputs.attentions,
+            )
+        return (last_hidden_state, pooled_output) + encoder_outputs[1:]
+
+    def get_input_embeddings(self):
+        # Returns the whole embeddings module.
+        return self.embeddings
+
+
+class InstructBlipVideoQFormerMultiHeadAttention(nn.Module):
+    def __init__(self, config, is_cross_attention=False):
+        super().__init__()
+        self.config = config
+        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
+            raise ValueError(
+                "The hidden size (%d) is not a multiple of the number of attention heads (%d)"
+                % (config.hidden_size, config.num_attention_heads)
+            )
+
+        self.num_attention_heads = config.num_attention_heads
+        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
+        self.all_head_size = self.num_attention_heads * self.attention_head_size
+
+        self.query = nn.Linear(config.hidden_size, self.all_head_size)
+        if is_cross_attention:
+            self.key = nn.Linear(config.encoder_hidden_size, self.all_head_size)
+            self.value = nn.Linear(config.encoder_hidden_size, self.all_head_size)
+        else:
+            self.key = nn.Linear(config.hidden_size, self.all_head_size)
+            self.value = nn.Linear(config.hidden_size, self.all_head_size)
+
+        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
+        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
+        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
+            self.max_position_embeddings = config.max_position_embeddings
+            self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)
+        self.save_attention = False
+
+    def save_attn_gradients(self, attn_gradients):
+        self.attn_gradients = attn_gradients
+
+    def get_attn_gradients(self):
+        return self.attn_gradients
+
+    def save_attention_map(self, attention_map):
+        self.attention_map = attention_map
+
+    def get_attention_map(self):
+        return self.attention_map
+
+    def transpose_for_scores(self, x):
+        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
+        x = x.view(*new_x_shape)
+        return x.permute(0, 2, 1, 3)
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask=None,
+        head_mask=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        output_attentions=False,
+    ):
+        """Multi-head attention over `hidden_states`. If `encoder_hidden_states` is given (cross-attention), keys
+        and values come from it and `encoder_attention_mask` replaces `attention_mask`. Returns `(context_layer,)`,
+        plus the attention probabilities (before dropout and head masking) when `output_attentions` is truthy."""
+        is_cross_attention = encoder_hidden_states is not None
+
+        if is_cross_attention:
+            key_layer = self.transpose_for_scores(self.key(encoder_hidden_states))
+            value_layer = self.transpose_for_scores(self.value(encoder_hidden_states))
+            attention_mask = encoder_attention_mask  # the self-attention mask is not used for cross-attention
+        else:
+            key_layer = self.transpose_for_scores(self.key(hidden_states))
+            value_layer = self.transpose_for_scores(self.value(hidden_states))
+
+        mixed_query_layer = self.query(hidden_states)
+
+        query_layer = self.transpose_for_scores(mixed_query_layer)
+
+        # Take the dot product between "query" and "key" to get the raw attention scores.
+        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
+
+        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
+            seq_length = hidden_states.size()[1]  # NOTE(review): query length also used for the key axis; check cross-attn
+            position_ids_l = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(-1, 1)
+            position_ids_r = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(1, -1)
+            distance = position_ids_l - position_ids_r  # signed offsets; shifted below to index the embedding table
+            positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1)
+            positional_embedding = positional_embedding.to(dtype=query_layer.dtype)  # fp16 compatibility
+
+            if self.position_embedding_type == "relative_key":
+                relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
+                attention_scores = attention_scores + relative_position_scores
+            elif self.position_embedding_type == "relative_key_query":
+                relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
+                relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding)
+                attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key
+
+        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
+        attention_scores_dtype = attention_scores.dtype
+
+        if attention_mask is not None:
+            # Add the attention mask supplied by the caller; it is summed with the scores before the softmax.
+            attention_scores = attention_scores + attention_mask
+
+        # Normalize the attention scores to probabilities.
+        attention_probs = nn.Softmax(dim=-1)(attention_scores).to(attention_scores_dtype)
+
+        if is_cross_attention and self.save_attention:
+            self.save_attention_map(attention_probs)
+            attention_probs.register_hook(self.save_attn_gradients)  # stores attention_probs' gradient on backward
+
+        # This is actually dropping out entire tokens to attend to, which might
+        # seem a bit unusual, but is taken from the original Transformer paper.
+        attention_probs_dropped = self.dropout(attention_probs)
+
+        # Mask heads if we want to
+        if head_mask is not None:
+            attention_probs_dropped = attention_probs_dropped * head_mask
+
+        context_layer = torch.matmul(attention_probs_dropped, value_layer)
+
+        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
+        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
+        context_layer = context_layer.view(*new_context_layer_shape)
+
+        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
+
+        return outputs
+
+
+class InstructBlipVideoQFormerSelfOutput(nn.Module):
+    """Project the attention context, apply dropout, then LayerNorm the sum with the residual input."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
+        projected = self.dropout(self.dense(hidden_states))
+        return self.LayerNorm(projected + input_tensor)
+
+
+class InstructBlipVideoQFormerAttention(nn.Module):
+    """Multi-head attention followed by its residual output block, with support for pruning heads."""
+
+    def __init__(self, config, is_cross_attention=False):
+        super().__init__()
+        self.attention = InstructBlipVideoQFormerMultiHeadAttention(config, is_cross_attention)
+        self.output = InstructBlipVideoQFormerSelfOutput(config)
+        self.pruned_heads = set()
+
+    def prune_heads(self, heads):
+        """Prune `heads` from the q/k/v and output projections and record them in `self.pruned_heads`."""
+        if len(heads) == 0:
+            return
+        attn = self.attention
+        heads, index = find_pruneable_heads_and_indices(
+            heads, attn.num_attention_heads, attn.attention_head_size, self.pruned_heads
+        )
+
+        # Prune the three input projections, then the output projection (which is pruned with `dim=1`).
+        for proj_name in ("query", "key", "value"):
+            setattr(attn, proj_name, prune_linear_layer(getattr(attn, proj_name), index))
+        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
+
+        attn.num_attention_heads -= len(heads)
+        attn.all_head_size = attn.attention_head_size * attn.num_attention_heads
+        self.pruned_heads = self.pruned_heads.union(heads)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = False,
+    ) -> tuple[torch.Tensor]:
+        attn_outputs = self.attention(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            head_mask=head_mask,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_attention_mask,
+            output_attentions=output_attentions,
+        )
+        return (self.output(attn_outputs[0], hidden_states),) + attn_outputs[1:]
+
+
+class InstructBlipVideoQFormerIntermediate(nn.Module):
+    """Linear expansion from `hidden_size` to `intermediate_size` followed by the configured activation."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
+        # `hidden_act` is either a string key into ACT2FN or an object used directly as the activation.
+        activation = config.hidden_act
+        self.intermediate_act_fn = ACT2FN[activation] if isinstance(activation, str) else activation
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        expanded = self.dense(hidden_states)
+        return self.intermediate_act_fn(expanded)
+
+
+class InstructBlipVideoQFormerOutput(nn.Module):
+    """Project from `intermediate_size` back to `hidden_size`, apply dropout, then LayerNorm the residual sum."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
+        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
+        contracted = self.dropout(self.dense(hidden_states))
+        return self.LayerNorm(contracted + input_tensor)
+
+
+class InstructBlipVideoQFormerLayer(GradientCheckpointingLayer):
+    """
+    One Q-Former block: self-attention over the full sequence, optional cross-attention for the query positions, and
+    separate feed-forward sub-layers for the query part and for the remaining (text) part.
+    """
+
+    def __init__(self, config, layer_idx):
+        super().__init__()
+        self.chunk_size_feed_forward = config.chunk_size_feed_forward
+        self.seq_len_dim = 1
+        self.layer_idx = layer_idx
+        self.attention = InstructBlipVideoQFormerAttention(config)
+
+        # Cross-attention is only built on layers whose index is a multiple of `cross_attention_frequency`.
+        self.has_cross_attention = layer_idx % config.cross_attention_frequency == 0
+        if self.has_cross_attention:
+            self.crossattention = InstructBlipVideoQFormerAttention(config, is_cross_attention=True)
+
+        self.intermediate = InstructBlipVideoQFormerIntermediate(config)
+        self.output = InstructBlipVideoQFormerOutput(config)
+        self.intermediate_query = InstructBlipVideoQFormerIntermediate(config)
+        self.output_query = InstructBlipVideoQFormerOutput(config)
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask=None,
+        head_mask=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        output_attentions=False,
+        query_length=0,
+    ):
+        """
+        Run self-attention on `hidden_states`, then (for the first `query_length` positions) optional
+        cross-attention and the query feed-forward; remaining positions use the text feed-forward.
+        Returns a tuple `(layer_output, *attention_weights)`.
+        """
+        self_attention_outputs = self.attention(
+            hidden_states,
+            attention_mask=attention_mask,
+            head_mask=head_mask,
+            output_attentions=output_attentions,
+        )
+        attention_output = self_attention_outputs[0]
+        extra_outputs = self_attention_outputs[1:]
+
+        if query_length <= 0:
+            # No query tokens: the whole sequence goes through the text feed-forward.
+            layer_output = apply_chunking_to_forward(
+                self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
+            )
+            return (layer_output,) + extra_outputs
+
+        query_states = attention_output[:, :query_length, :]
+        if self.has_cross_attention:
+            if encoder_hidden_states is None:
+                raise ValueError("encoder_hidden_states must be given for cross-attention layers")
+            cross_attention_outputs = self.crossattention(
+                query_states,
+                attention_mask=attention_mask,
+                head_mask=head_mask,
+                encoder_hidden_states=encoder_hidden_states,
+                encoder_attention_mask=encoder_attention_mask,
+                output_attentions=output_attentions,
+            )
+            query_states = cross_attention_outputs[0]
+            # Cross-attention weights (if requested) follow the self-attention weights.
+            extra_outputs = extra_outputs + cross_attention_outputs[1:]
+
+        layer_output = apply_chunking_to_forward(
+            self.feed_forward_chunk_query, self.chunk_size_feed_forward, self.seq_len_dim, query_states
+        )
+
+        if attention_output.shape[1] > query_length:
+            # Positions after the queries use the text feed-forward and are concatenated back on.
+            text_output = apply_chunking_to_forward(
+                self.feed_forward_chunk,
+                self.chunk_size_feed_forward,
+                self.seq_len_dim,
+                attention_output[:, query_length:, :],
+            ).to(layer_output.device)
+            layer_output = torch.cat([layer_output, text_output], dim=1)
+
+        return (layer_output,) + extra_outputs
+
+    def feed_forward_chunk(self, attention_output):
+        """Feed-forward for non-query positions (`intermediate` then `output`)."""
+        return self.output(self.intermediate(attention_output), attention_output)
+
+    def feed_forward_chunk_query(self, attention_output):
+        """Feed-forward for query positions (`intermediate_query` then `output_query`)."""
+        return self.output_query(self.intermediate_query(attention_output), attention_output)
+
+
+class InstructBlipVideoQFormerEncoder(nn.Module):
+    """Stack of `config.num_hidden_layers` Q-Former layers applied in sequence."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        layers = [InstructBlipVideoQFormerLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+        self.layer = nn.ModuleList(layers)
+        self.gradient_checkpointing = False
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask=None,
+        head_mask=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        output_attentions=False,
+        output_hidden_states=False,
+        return_dict=True,
+        query_length=0,
+    ):
+        """
+        Run every layer and optionally collect per-layer hidden states and attention weights.
+
+        Returns a `BaseModelOutputWithPastAndCrossAttentions`, or (when `return_dict` is falsy) a tuple of the
+        non-`None` entries among: last hidden state, all hidden states, self-attentions, cross-attentions.
+        """
+        collected_hidden = () if output_hidden_states else None
+        self_attn_weights = () if output_attentions else None
+        cross_attn_weights = () if output_attentions else None
+
+        for idx in range(self.config.num_hidden_layers):
+            layer_module = self.layer[idx]
+            if output_hidden_states:
+                collected_hidden += (hidden_states,)
+
+            layer_outputs = layer_module(
+                hidden_states,
+                attention_mask,
+                None if head_mask is None else head_mask[idx],
+                encoder_hidden_states,  # as a positional argument for gradient checkpointing
+                encoder_attention_mask=encoder_attention_mask,
+                output_attentions=output_attentions,
+                query_length=query_length,
+            )
+            hidden_states = layer_outputs[0]
+
+            if not output_attentions:
+                continue
+            self_attn_weights += (layer_outputs[1],)
+            # Cross-attention weights are only read from layers that have cross-attention, with query tokens present.
+            if query_length > 0 and layer_module.has_cross_attention:
+                cross_attn_weights += (layer_outputs[2],)
+
+        if output_hidden_states:
+            collected_hidden += (hidden_states,)
+
+        if return_dict:
+            return BaseModelOutputWithPastAndCrossAttentions(
+                last_hidden_state=hidden_states,
+                hidden_states=collected_hidden,
+                attentions=self_attn_weights,
+                cross_attentions=cross_attn_weights,
+            )
+
+        candidates = (hidden_states, collected_hidden, self_attn_weights, cross_attn_weights)
+        return tuple(v for v in candidates if v is not None)
+
+
+class InstructBlipVideoQFormerEmbeddings(nn.Module):
+    """Embed instruction tokens (word + optional absolute position) and prepend the query embeddings."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
+
+        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
+        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
+        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+        # Row vector [0, 1, ..., max_position_embeddings - 1]; non-persistent, so it is not part of the state dict.
+        all_positions = torch.arange(config.max_position_embeddings).expand((1, -1))
+        self.register_buffer("position_ids", all_positions, persistent=False)
+
+    def forward(
+        self,
+        input_ids=None,
+        position_ids=None,
+        query_embeds=None,
+        past_key_values_length=0,
+    ):
+        """
+        Return LayerNorm'd, dropped-out embeddings: token embeddings with `query_embeds` prepended when given,
+        or `query_embeds` alone when `input_ids` is None.
+
+        `position_ids` defaults to a slice of the registered buffer starting at `past_key_values_length`.
+        """
+        # With no tokens the default position slice below is empty.
+        seq_length = 0 if input_ids is None else input_ids.size()[1]
+        if position_ids is None:
+            window = slice(past_key_values_length, seq_length + past_key_values_length)
+            position_ids = self.position_ids[:, window].clone()
+
+        if input_ids is None:
+            embeddings = query_embeds
+        else:
+            embeddings = self.word_embeddings(input_ids)
+            if self.position_embedding_type == "absolute":
+                embeddings = embeddings + self.position_embeddings(position_ids.to(embeddings.device))
+            if query_embeds is not None:
+                # Query embeddings come first along the sequence dimension.
+                embeddings = torch.cat((query_embeds, embeddings), dim=1)
+
+        embeddings = embeddings.to(self.layernorm.weight.dtype)
+        embeddings = self.layernorm(embeddings)
+        return self.dropout(embeddings)
+
+
+class InstructBlipVideoQFormerModel(InstructBlipVideoPreTrainedModel):
+    """
+    Querying Transformer (Q-Former), used in InstructBlipVideo. Slightly modified from BLIP-2 as it also takes the
+    instruction as input.
+    """
+
+    _supports_attention_backend = False  # adds position on attn weights before last matmul
+    _supports_flash_attn = False
+    _supports_sdpa = False
+    _supports_flex_attn = False
+
+    def __init__(self, config: InstructBlipVideoQFormerConfig):
+        super().__init__(config)
+        self.config = config
+
+        self.embeddings = InstructBlipVideoQFormerEmbeddings(config)
+
+        self.encoder = InstructBlipVideoQFormerEncoder(config)
+
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.embeddings.word_embeddings
+
+    def set_input_embeddings(self, value):
+        self.embeddings.word_embeddings = value
+
+    def _prune_heads(self, heads_to_prune):
+        """
+        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
+        class PreTrainedModel
+        """
+        for layer, heads in heads_to_prune.items():
+            self.encoder.layer[layer].attention.prune_heads(heads)
+
+    def get_extended_attention_mask(
+        self,
+        attention_mask: torch.Tensor,
+        input_shape: tuple[int],
+        device: torch.device,
+        has_query: bool = False,
+    ) -> torch.Tensor:
+        """
+        Makes a broadcastable additive attention mask so that masked tokens are ignored.
+
+        Arguments:
+            attention_mask (`torch.Tensor`):
+                2D or 3D mask with ones indicating tokens to attend to, zeros for tokens to ignore.
+            input_shape (`tuple[int]`):
+                The shape of the input to the model. Only used in the error message.
+            device (`torch.device`): The device of the input to the model. Not used by this implementation.
+            has_query (`bool`, *optional*, defaults to `False`): Not used by this implementation.
+
+        Returns:
+            `torch.Tensor` The extended attention mask, cast to `self.dtype`.
+        """
+        # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
+        # ourselves in which case we just need to make it broadcastable to all heads.
+        if attention_mask.dim() == 3:
+            extended_attention_mask = attention_mask[:, None, :, :]
+        elif attention_mask.dim() == 2:
+            # Provided a padding mask of dimensions [batch_size, seq_length]
+            # - the model is an encoder, so make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length]
+            extended_attention_mask = attention_mask[:, None, None, :]
+        else:
+            raise ValueError(
+                f"Wrong shape for input_ids (shape {input_shape}) or attention_mask (shape {attention_mask.shape})",
+            )
+
+        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
+        # masked positions, this operation will create a tensor which is 0.0 for
+        # positions we want to attend and -10000.0 for masked positions.
+        # Since we are adding it to the raw scores before the softmax, this is
+        # effectively the same as removing these entirely.
+        extended_attention_mask = extended_attention_mask.to(dtype=self.dtype)  # fp16 compatibility
+        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
+        return extended_attention_mask
+
+    def forward(
+        self,
+        input_ids: torch.LongTensor,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        query_embeds: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple[torch.FloatTensor], BaseModelOutputWithPoolingAndCrossAttentions]:
+        r"""
+        encoder_hidden_states  (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
+            the model is configured as a decoder.
+        encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
+            the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+        query_embeds (`torch.Tensor`, *optional*):
+            Query embeddings that are prepended to the embedded `input_ids` along the sequence dimension. Their
+            length (`query_embeds.shape[1]`) is passed to the encoder as `query_length`, which selects the positions
+            routed through cross-attention and the query feed-forward. Required when `input_ids` is `None`.
+        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+            Token ids of the instruction. May be `None`, in which case only `query_embeds` are encoded.
+        attention_mask (`torch.FloatTensor`, *optional*):
+            2D padding mask or 3D mask over the concatenated sequence (queries followed by tokens). Defaults to
+            all ones.
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if input_ids is None and query_embeds is None:
+            raise ValueError("You have to specify query_embeds when input_ids is None")
+
+        query_length = query_embeds.shape[1] if query_embeds is not None else 0
+
+        embedding_output = self.embeddings(
+            input_ids=input_ids,
+            position_ids=position_ids,
+            query_embeds=query_embeds,
+        )
+
+        input_shape = embedding_output.size()[:-1]
+        batch_size, seq_length = input_shape
+        device = embedding_output.device
+
+        if attention_mask is None:
+            attention_mask = torch.ones(((batch_size, seq_length)), device=device)
+
+        # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
+        # ourselves in which case we just need to make it broadcastable to all heads.
+        extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape, device)
+
+        # If a 2D or 3D attention mask is provided for the cross-attention
+        # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
+        if encoder_hidden_states is not None:
+            if isinstance(encoder_hidden_states, list):
+                encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states[0].size()
+            else:
+                encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
+            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
+
+            if isinstance(encoder_attention_mask, list):
+                encoder_extended_attention_mask = [self.invert_attention_mask(mask) for mask in encoder_attention_mask]
+            elif encoder_attention_mask is None:
+                encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
+                encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
+            else:
+                encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
+        else:
+            encoder_extended_attention_mask = None
+
+        # Prepare head mask if needed
+        # 1.0 in head_mask indicate we keep the head
+        # attention_probs has shape bsz x n_heads x N x N
+        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
+        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
+        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
+
+        encoder_outputs = self.encoder(
+            embedding_output,
+            attention_mask=extended_attention_mask,
+            head_mask=head_mask,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_extended_attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            query_length=query_length,
+        )
+        sequence_output = encoder_outputs[0]
+        pooled_output = sequence_output[:, 0, :]  # the first sequence position serves as the pooled output
+
+        if not return_dict:
+            return (sequence_output, pooled_output) + encoder_outputs[1:]
+
+        return BaseModelOutputWithPoolingAndCrossAttentions(
+            last_hidden_state=sequence_output,
+            pooler_output=pooled_output,
+            past_key_values=encoder_outputs.past_key_values,  # the encoder in this file never populates this field
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+            cross_attentions=encoder_outputs.cross_attentions,
+        )
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Class defining the outputs of [`InstructBlipVideoForConditionalGeneration`].
+    """
+)
+class InstructBlipVideoForConditionalGenerationModelOutput(ModelOutput):
+    r"""
+    loss (`torch.FloatTensor`, *optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
+        Language modeling loss from the language model.
+    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
+        Prediction scores of the language modeling head of the language model.
+    vision_outputs (`BaseModelOutputWithPooling`):
+        Outputs of the vision encoder.
+    qformer_outputs (`BaseModelOutputWithPoolingAndCrossAttentions`):
+        Outputs of the Q-Former (Querying Transformer).
+    language_model_outputs (`CausalLMOutputWithPast` or `Seq2SeqLMOutput`):
+        Outputs of the language model.
+    """
+
+    loss: Optional[tuple[torch.FloatTensor]] = None
+    logits: Optional[tuple[torch.FloatTensor]] = None
+    vision_outputs: Optional[torch.FloatTensor] = None
+    qformer_outputs: Optional[tuple[torch.FloatTensor]] = None
+    language_model_outputs: Optional[tuple[torch.FloatTensor]] = None
+
+    def to_tuple(self) -> tuple[Any]:
+        """Return the present fields as a tuple, converting nested sub-model outputs with their own `to_tuple()`."""
+        nested_keys = ("vision_outputs", "qformer_outputs", "language_model_outputs")
+        values = []
+        for key in self.keys():
+            values.append(getattr(self, key).to_tuple() if key in nested_keys else self[key])
+        return tuple(values)
+
+
+@auto_docstring(
+    custom_intro="""
+    InstructBlipVideo base Model consisting of language model, qformer and vision encoder.
+    """
+)
+class InstructBlipVideoModel(InstructBlipVideoPreTrainedModel):
+    main_input_name = "pixel_values"
+    _keep_in_fp32_modules = ["query_tokens"]  # TODO @ArthurZucker I don't know why this is required for FP8
+
+    def __init__(self, config: InstructBlipVideoConfig):
+        super().__init__(config)
+        qformer_config = config.qformer_config
+
+        self.vision_model = InstructBlipVideoVisionModel(config.vision_config)
+        self.query_tokens = nn.Parameter(torch.zeros(1, config.num_query_tokens, qformer_config.hidden_size))
+        self.qformer = InstructBlipVideoQFormerModel(qformer_config)
+        self.language_projection = nn.Linear(qformer_config.hidden_size, config.text_config.hidden_size)
+        self.language_model = AutoModel.from_config(config.text_config)
+
+        # Extend this model's module lists with the language model's, when the language model defines them.
+        text_no_split = self.language_model._no_split_modules
+        if text_no_split is not None:
+            self._no_split_modules.extend(text_no_split)
+        text_keep_fp32 = self.language_model._keep_in_fp32_modules
+        if text_keep_fp32 is not None:
+            self._keep_in_fp32_modules.extend(text_keep_fp32)
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.language_model.get_input_embeddings()  # delegates to the wrapped language model
+
+    def set_input_embeddings(self, value):
+        self.language_model.set_input_embeddings(value)  # delegates to the wrapped language model
+
+    def _tie_weights(self):
+        lm = self.language_model  # for non-decoder-only LMs, point both stacks' `embed_tokens` at `lm.shared`
+        if not self.config.use_decoder_only_language_model:
+            lm.encoder.embed_tokens = lm.decoder.embed_tokens = lm.shared
+
+    def _preprocess_accelerate(self):
+        r"""
+        Make the model `accelerate`-friendly: warn when a multi-GPU device map omits `language_model`, and keep the
+        language model's hook I/O on one device. See https://github.com/huggingface/transformers/pull/21707.
+        """
+        device_map = self.hf_device_map
+        lm_missing_from_map = len(device_map) > 1 and "language_model" not in device_map
+
+        if lm_missing_from_map and torch.cuda.device_count() > 1:
+            logger.warning(
+                "The `language_model` is not in the `hf_device_map` dictionary and you are running your script"
+                " in a multi-GPU environment. this may lead to unexpected behavior when using `accelerate`."
+                " Please pass a `device_map` that contains `language_model` to remove this warning."
+                " Please refer to https://github.com/huggingface/blog/blob/main/accelerate-large-models.md for"
+                " more details on creating a `device_map` for large models.",
+            )
+
+        if hasattr(self.language_model, "_hf_hook"):
+            self.language_model._hf_hook.io_same_device = True  # For `generate` compatibility
+
+    def get_placeholder_mask(self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor):
+        """
+        Build a boolean mask, expanded to the shape of `inputs_embeds`, marking image placeholder positions.
+        Uses `input_ids` when available; otherwise compares `inputs_embeds` to the placeholder token's embedding.
+        """
+        image_token_id = self.config.image_token_id
+        if input_ids is not None:
+            token_mask = input_ids == image_token_id
+        else:
+            token_tensor = torch.tensor(image_token_id, dtype=torch.long, device=inputs_embeds.device)
+            placeholder_embed = self.get_input_embeddings()(token_tensor)
+            token_mask = (inputs_embeds == placeholder_embed).all(-1)
+
+        return token_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        pixel_values: torch.FloatTensor,
+        qformer_input_ids: torch.FloatTensor,
+        qformer_attention_mask: Optional[torch.LongTensor] = None,
+        input_ids: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.LongTensor] = None,
+        decoder_input_ids: Optional[torch.LongTensor] = None,
+        decoder_attention_mask: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        interpolate_pos_encoding: bool = False,
+        use_cache: Optional[bool] = None,
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> Union[tuple, InstructBlipVideoForConditionalGenerationModelOutput]:
+        r"""
+        qformer_input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Indices of input sequence tokens in the vocabulary of the Q-Former. Input tokens can optionally be provided
+            to serve as text prompt, which the Q-Former model will encode.
+
+            Indices can be obtained using [`InstructBlipVideoProcessor`]. See [`InstructBlipVideoProcessor.__call__`] for
+            details.
+
+            [What are input IDs?](../glossary#input-ids)
+        qformer_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
+        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Indices of input sequence tokens in the vocabulary of the language model. Input tokens can optionally be
+            provided to serve as text prompt, which the language model can continue.
+
+            Indices can be obtained using [`InstructBlipVideoProcessor`]. See [`InstructBlipVideoProcessor.__call__`] for
+            details.
+
+            [What are input IDs?](../glossary#input-ids)
+        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
+            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
+            be used by default.
+
+            Only relevant in case an encoder-decoder language model (like T5) is used.
+        """
+        # NOTE(review): the signature annotates `qformer_input_ids` / `input_ids` as `FloatTensor`, while the
+        # docstring above describes them as `LongTensor` token indices - the annotations look stale.
+        # Fall back to the config default when the caller does not choose a return format.
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # step 1: forward the images through the vision encoder,
+        # we process in a batched way, later unbatch it back (video has frames=4 always)
+        # `pixel_values` must be 5-D here; the frame axis is folded into the batch axis for the vision model.
+        batch_size, frames, channel, height, width = pixel_values.shape
+        pixel_values = pixel_values.reshape(batch_size * frames, channel, height, width)
+
+        vision_outputs = self.vision_model(
+            pixel_values=pixel_values,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            interpolate_pos_encoding=interpolate_pos_encoding,
+        )
+        image_embeds = vision_outputs[0]
+
+        # step 2: forward the query tokens through the QFormer, using the image embeddings for cross-attention
+        # All image positions are attended to (mask of ones over every dim of `image_embeds` but the last).
+        image_attention_mask = torch.ones(image_embeds.size()[:-1], dtype=torch.long, device=image_embeds.device)
+
+        # difference with BLIP-2 here: we also feed the instruction prompt to the Q-Former
+        # One copy of the learned query tokens per (flattened) frame.
+        query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1)
+        query_attention_mask = torch.ones(query_tokens.size()[:-1], dtype=torch.long, device=image_embeds.device)
+
+        if qformer_attention_mask is None:
+            qformer_attention_mask = torch.ones_like(qformer_input_ids)
+
+        # Repeat the per-video prompt once per frame so it lines up with the flattened frame batch,
+        # then prepend the query-token mask to the prompt mask along the sequence axis.
+        qformer_input_ids = qformer_input_ids.repeat_interleave(frames, dim=0)
+        qformer_attention_mask = qformer_attention_mask.repeat_interleave(frames, dim=0)
+        qformer_attention_mask = torch.cat([query_attention_mask, qformer_attention_mask], dim=1)
+        query_outputs = self.qformer(
+            input_ids=qformer_input_ids,
+            attention_mask=qformer_attention_mask,
+            query_embeds=query_tokens,
+            encoder_hidden_states=image_embeds,
+            encoder_attention_mask=image_attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        # Keep only the leading positions that correspond to the query tokens.
+        query_output = query_outputs[0][:, : query_tokens.size(1), :]
+
+        # step 3: use the language model, conditioned on the query outputs and the prompt
+        language_model_inputs = self.language_projection(query_output)
+
+        # unbatch inputs back, each video-frame gets `num_query_tokens` seq length
+        language_model_inputs = language_model_inputs.reshape(batch_size, self.config.num_query_tokens * frames, -1)
+        if inputs_embeds is None:
+            inputs_embeds = self.language_model.get_input_embeddings()(input_ids)
+            special_image_mask = input_ids == self.config.video_token_id
+            # The default attention mask is only created on this branch; when `inputs_embeds` is supplied,
+            # `attention_mask` is forwarded exactly as given (possibly `None`).
+            if attention_mask is None:
+                attention_mask = torch.ones_like(input_ids)
+        else:
+            # No token ids to compare against: a position is a video placeholder when its full embedding
+            # vector equals the embedding of the video token.
+            special_image_mask = inputs_embeds == self.get_input_embeddings()(
+                torch.tensor(self.config.video_token_id, dtype=torch.long, device=inputs_embeds.device)
+            )
+            special_image_mask = special_image_mask.all(-1)
+
+        # Expand the token-level mask over the hidden dim and write the projected query outputs into the
+        # placeholder positions of the text embeddings.
+        special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
+        language_model_inputs = language_model_inputs.to(inputs_embeds.device, inputs_embeds.dtype)
+        inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, language_model_inputs)
+
+        # Decoder-only language models take no decoder_* arguments; the encoder-decoder branch passes them.
+        if self.config.use_decoder_only_language_model:
+            outputs = self.language_model(
+                inputs_embeds=inputs_embeds,
+                attention_mask=attention_mask,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+                use_cache=use_cache,
+                **kwargs,
+            )
+        else:
+            outputs = self.language_model(
+                inputs_embeds=inputs_embeds,
+                attention_mask=attention_mask,
+                decoder_input_ids=decoder_input_ids,
+                decoder_attention_mask=decoder_attention_mask,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+                use_cache=use_cache,
+                **kwargs,
+            )
+
+        return InstructBlipVideoForConditionalGenerationModelOutput(
+            vision_outputs=vision_outputs,
+            qformer_outputs=query_outputs,
+            language_model_outputs=outputs,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    InstructBlipVideo Model for generating text given an image and an optional text prompt. The model consists of a vision
+    encoder, Querying Transformer (Q-Former) and a language model.
+
+    One can optionally pass `input_ids` to the model, which serve as a text prompt, to make the language model continue
+    the prompt. Otherwise, the language model starts generating text from the [BOS] (beginning-of-sequence) token.
+    """
+)
+class InstructBlipVideoForConditionalGeneration(InstructBlipVideoPreTrainedModel, GenerationMixin):
+    config: InstructBlipVideoConfig
+    main_input_name = "pixel_values"
+
+    _can_compile_fullgraph = True
+    _keep_in_fp32_modules = ["query_tokens"]  # TODO @ArthurZucker I don't know why this is required for FP8
+
+    def __init__(self, config: InstructBlipVideoConfig):
+        """
+        Builds the vision encoder, the Q-Former with its learned query tokens, the projection into the
+        language model's hidden size, and the language model itself from the sub-configs of `config`.
+        """
+        super().__init__(config)
+
+        self.vision_model = InstructBlipVideoVisionModel._from_config(config.vision_config)
+
+        # Learned query tokens, created as zeros with shape (1, num_query_tokens, qformer hidden size).
+        self.query_tokens = nn.Parameter(torch.zeros(1, config.num_query_tokens, config.qformer_config.hidden_size))
+        self.qformer = InstructBlipVideoQFormerModel._from_config(config.qformer_config)
+
+        # Maps Q-Former hidden states to the hidden size expected by the language model.
+        self.language_projection = nn.Linear(config.qformer_config.hidden_size, config.text_config.hidden_size)
+
+        # Pick a causal-LM or a seq2seq language model depending on the config flag.
+        if config.use_decoder_only_language_model:
+            language_model = AutoModelForCausalLM.from_config(config.text_config)
+        else:
+            language_model = AutoModelForSeq2SeqLM.from_config(config.text_config)
+
+        # Inherit the language model's module lists.
+        # NOTE(review): `extend` mutates the lists in place; if they are still the class-level lists at this
+        # point, entries would accumulate across instantiations - confirm the base class copies them per instance.
+        if language_model._no_split_modules is not None:
+            self._no_split_modules.extend(language_model._no_split_modules)
+
+        if language_model._keep_in_fp32_modules is not None:
+            self._keep_in_fp32_modules.extend(language_model._keep_in_fp32_modules)
+
+        self.language_model = language_model
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.language_model.get_input_embeddings()
+
+    def set_input_embeddings(self, value):
+        self.language_model.set_input_embeddings(value)
+
+    def set_output_embeddings(self, new_embeddings):
+        self.language_model.set_output_embeddings(new_embeddings)
+
+    def get_output_embeddings(self) -> nn.Module:
+        return self.language_model.get_output_embeddings()
+
+    def get_encoder(self):
+        return self.language_model.get_encoder()
+
+    def get_decoder(self):
+        return self.language_model.get_decoder()
+
+    def _tie_weights(self):
+        if not self.config.use_decoder_only_language_model:
+            self.language_model.encoder.embed_tokens = self.language_model.shared
+            self.language_model.decoder.embed_tokens = self.language_model.shared
+
+    def _preprocess_accelerate(self):
+        r"""
+        Some pre-processing hacks to make the model `accelerate` compatible. Check
+        https://github.com/huggingface/transformers/pull/21707 for more details.
+        """
+        hf_device_map = self.hf_device_map
+
+        if len(hf_device_map) > 1 and "language_model" not in hf_device_map and torch.cuda.device_count() > 1:
+            # warn users about unexpected behavior when using multi-GPU + InstructBlipVideo + `accelerate`.
+            logger.warning(
+                "The `language_model` is not in the `hf_device_map` dictionary and you are running your script"
+                " in a multi-GPU environment. this may lead to unexpected behavior when using `accelerate`."
+                " Please pass a `device_map` that contains `language_model` to remove this warning."
+                " Please refer to https://github.com/huggingface/blog/blob/main/accelerate-large-models.md for"
+                " more details on creating a `device_map` for large models.",
+            )
+
+        if hasattr(self.language_model, "_hf_hook"):
+            self.language_model._hf_hook.io_same_device = True  # For `generate` compatibility
+
+    def get_image_features(
+        self,
+        pixel_values: torch.FloatTensor,
+        qformer_input_ids: torch.LongTensor,
+        qformer_attention_mask: Optional[torch.LongTensor] = None,
+        interpolate_pos_encoding: Optional[bool] = False,
+        return_dict: Optional[bool] = False,
+    ):
+        """
+        Placeholder that performs no work and returns `None`; every argument is ignored.
+
+        NOTE(review): the signature mirrors `get_video_features`, which is the method `forward` and
+        `generate` of this class actually call - presumably this stub only exists to keep the image API
+        name present on the video model; confirm before relying on it.
+
+        Args:
+            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
+                The tensors corresponding to the input images (unused).
+        """
+        pass
+
+    def get_placeholder_mask(self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor):
+        """
+        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`.
+        """
+        if input_ids is None:
+            special_image_mask = inputs_embeds == self.get_input_embeddings()(
+                torch.tensor(self.config.video_token_id, dtype=torch.long, device=inputs_embeds.device)
+            )
+            special_image_mask = special_image_mask.all(-1)
+        else:
+            special_image_mask = input_ids == self.config.video_token_id
+
+        special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
+        return special_image_mask
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        pixel_values: torch.FloatTensor,
+        qformer_input_ids: torch.FloatTensor,
+        qformer_attention_mask: Optional[torch.LongTensor] = None,
+        input_ids: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.LongTensor] = None,
+        decoder_input_ids: Optional[torch.LongTensor] = None,
+        decoder_attention_mask: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        labels: Optional[torch.LongTensor] = None,
+        return_dict: Optional[bool] = None,
+        interpolate_pos_encoding: bool = False,
+        use_cache: Optional[bool] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> Union[tuple, InstructBlipVideoForConditionalGenerationModelOutput]:
+        r"""
+        qformer_input_ids (`torch.LongTensor` of shape (batch_size, sequence_length)):
+            The sequence used as a prompt to be fed to the Q-Former module.
+        qformer_attention_mask (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
+            Mask to avoid performing attention on padding token indices.
+
+        Examples:
+
+        ```python
+        >>> from transformers import InstructBlipVideoProcessor, InstructBlipVideoForConditionalGeneration
+        >>> import torch
+        >>> from huggingface_hub import hf_hub_download
+        >>> import av
+        >>> import numpy as np
+
+        >>> def read_video_pyav(container, indices):
+        ...     '''
+        ...     Decode the video with PyAV decoder.
+        ...     Args:
+        ...         container (`av.container.input.InputContainer`): PyAV container.
+        ...         indices (`list[int]`): List of frame indices to decode.
+        ...     Returns:
+        ...         result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
+        ...     '''
+        ...     frames = []
+        ...     container.seek(0)
+        ...     start_index = indices[0]
+        ...     end_index = indices[-1]
+        ...     for i, frame in enumerate(container.decode(video=0)):
+        ...         if i > end_index:
+        ...             break
+        ...         if i >= start_index and i in indices:
+        ...             frames.append(frame)
+        ...     return np.stack([x.to_ndarray(format="rgb24") for x in frames])
+
+        >>> model = InstructBlipVideoForConditionalGeneration.from_pretrained("Salesforce/instructblip-vicuna-7b", device_map="auto")
+        >>> processor = InstructBlipVideoProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b")
+
+        >>> file_path = hf_hub_download(
+        ...       repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
+        ... )
+        >>> container = av.open(file_path)
+
+        >>> # sample uniformly 4 frames from the videWhy is this video funny?o
+        >>> total_frames = container.streams.video[0].frames
+        >>> indices = np.arange(0, total_frames, total_frames / 4).astype(int)
+        >>> clip = read_video_pyav(container, indices)
+
+        >>> prompt = "What is happening in the video?"
+        >>> inputs = processor(text=prompt, images=clip, return_tensors="pt").to(model.device)
+
+        >>> outputs = model.generate(
+        ...     **inputs,
+        ...     do_sample=False,
+        ...     num_beams=5,
+        ...     max_length=256,
+        ...     repetition_penalty=1.5,
+        ...     length_penalty=1.0,
+        ... )
+        >>> generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0].strip()
+        >>> print(generated_text)
+        "A person is eating a bowl of pasta, and they are using a fork to eat it. The person is sitting at a table, and the plate of pasta is on the table in front"
+        ```"""
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        language_model_inputs, vision_outputs, query_outputs = self.get_video_features(
+            pixel_values,
+            qformer_input_ids=qformer_input_ids,
+            qformer_attention_mask=qformer_attention_mask,
+            interpolate_pos_encoding=interpolate_pos_encoding,
+            return_dict=True,
+        )
+        vision_outputs = vision_outputs.to_tuple() if not return_dict else vision_outputs
+        query_outputs = query_outputs.to_tuple() if not return_dict else query_outputs
+
+        if inputs_embeds is None:
+            inputs_embeds = self.get_input_embeddings()(input_ids)
+
+        if attention_mask is None:
+            attention_mask = torch.ones_like(input_ids)
+
+        language_model_inputs = language_model_inputs.to(inputs_embeds.device, inputs_embeds.dtype)
+        special_image_mask = self.get_placeholder_mask(input_ids, inputs_embeds=inputs_embeds)
+        inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, language_model_inputs)
+
+        if self.config.use_decoder_only_language_model:
+            outputs = self.language_model(
+                inputs_embeds=inputs_embeds,
+                attention_mask=attention_mask,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+                use_cache=use_cache,
+                **kwargs,
+            )
+            logits = outputs.logits if return_dict else outputs[0]
+            loss = None
+            if labels is not None:
+                loss = self.loss_function(
+                    logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **kwargs
+                )
+
+        else:
+            outputs = self.language_model(
+                inputs_embeds=inputs_embeds,
+                attention_mask=attention_mask,
+                decoder_input_ids=decoder_input_ids,
+                decoder_attention_mask=decoder_attention_mask,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+                labels=labels,
+                use_cache=use_cache,
+                **kwargs,
+            )
+            loss = outputs.loss if return_dict else outputs[0]
+            logits = outputs.logits if return_dict else outputs[1]
+
+        return InstructBlipVideoForConditionalGenerationModelOutput(
+            loss=loss,
+            logits=logits,
+            vision_outputs=vision_outputs,
+            qformer_outputs=query_outputs,
+            language_model_outputs=outputs,
+        )
+
+    @torch.no_grad()
+    def generate(
+        self,
+        pixel_values: torch.FloatTensor,
+        qformer_input_ids: Optional[torch.LongTensor] = None,
+        qformer_attention_mask: Optional[torch.LongTensor] = None,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        interpolate_pos_encoding: bool = False,
+        **generate_kwargs,
+    ) -> torch.LongTensor:
+        r"""
+        Overrides `generate` function to be able to use the model as a conditional generator.
+
+        Args:
+            pixel_values (`torch.FloatTensor` of shape (batch_size, num_channels, height, width) or
+                (batch_size, num_frames, num_channels, height, width)): Input images or videos to be processed.
+            qformer_input_ids (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
+                The sequence used as a prompt to be fed to the Q-Former module.
+            qformer_attention_mask (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
+                Mask to avoid performing attention on padding token indices.
+            input_ids (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
+                The sequence used as a prompt for the generation.
+            attention_mask (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
+                Mask to avoid performing attention on padding token indices.
+            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+                Embedded representation of the inputs. Should be float, not int tokens.
+            interpolate_pos_encoding (`bool`, *optional*, defaults to `False`):
+                Whether to interpolate the positional encoding of the image embeddings.
+
+        Returns:
+            captions (list): A list of strings of length batch_size * num_captions.
+        """
+        if hasattr(self, "hf_device_map"):
+            # preprocess for `accelerate`
+            self._preprocess_accelerate()
+
+        batch_size = pixel_values.shape[0]
+        language_model_inputs, vision_outputs, query_outputs = self.get_video_features(
+            pixel_values,
+            qformer_input_ids=qformer_input_ids,
+            qformer_attention_mask=qformer_attention_mask,
+            interpolate_pos_encoding=interpolate_pos_encoding,
+            return_dict=True,
+        )
+
+        if inputs_embeds is None:
+            if input_ids is None:
+                video_tokens = [self.config.video_token_index] * self.config.num_query_tokens * 4
+                start_tokens = video_tokens + [self.config.text_config.bos_token_id]
+                input_ids = torch.tensor([start_tokens], dtype=torch.long, device=pixel_values.device)
+                input_ids = input_ids.repeat(batch_size, 1)
+            inputs_embeds = self.get_input_embeddings()(input_ids)
+
+        if attention_mask is None:
+            attention_mask = torch.ones_like(input_ids)
+
+        language_model_inputs = language_model_inputs.to(inputs_embeds.device, inputs_embeds.dtype)
+        special_image_mask = self.get_placeholder_mask(input_ids, inputs_embeds=inputs_embeds)
+        inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, language_model_inputs)
+
+        inputs = {"inputs_embeds": inputs_embeds, "attention_mask": attention_mask}
+        if not self.language_model.config.is_encoder_decoder:
+            inputs["input_ids"] = input_ids
+
+        outputs = self.language_model.generate(**inputs, **generate_kwargs)
+
+        return outputs
+
+    def get_video_features(
+        self,
+        pixel_values: torch.FloatTensor,
+        qformer_input_ids: torch.LongTensor,
+        qformer_attention_mask: Optional[torch.LongTensor] = None,
+        interpolate_pos_encoding: Optional[bool] = False,
+        return_dict: Optional[bool] = False,
+    ):
+        """
+        Encodes images into continuous embeddings that can be forwarded to the language model.
+
+        Args:
+            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
+                The tensors corresponding to the input images.
+        """
+        # step 1: forward the images through the vision encoder,
+        # we process in a batched way, later unbatch it back (video has frames=4 always)
+        batch_size, frames, channel, height, width = pixel_values.shape
+        pixel_values = pixel_values.reshape(batch_size * frames, channel, height, width)
+
+        vision_outputs = self.vision_model(
+            pixel_values=pixel_values,
+            interpolate_pos_encoding=interpolate_pos_encoding,
+            return_dict=True,
+        )
+        image_embeds = vision_outputs[0]
+
+        # step 2: forward the query tokens through the QFormer, using the image embeddings for cross-attention
+        image_attention_mask = torch.ones(image_embeds.size()[:-1], dtype=torch.long, device=image_embeds.device)
+
+        # difference with BLIP-2 here: we also feed the instruction prompt to the Q-Former
+        query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1)
+        query_attention_mask = torch.ones(query_tokens.size()[:-1], dtype=torch.long, device=image_embeds.device)
+
+        if qformer_attention_mask is None:
+            qformer_attention_mask = torch.ones_like(qformer_input_ids)
+
+        qformer_input_ids = qformer_input_ids.repeat_interleave(frames, dim=0)
+        qformer_attention_mask = qformer_attention_mask.repeat_interleave(frames, dim=0)
+        qformer_attention_mask = torch.cat([query_attention_mask, qformer_attention_mask], dim=1)
+        query_outputs = self.qformer(
+            input_ids=qformer_input_ids,
+            attention_mask=qformer_attention_mask,
+            query_embeds=query_tokens,
+            encoder_hidden_states=image_embeds,
+            encoder_attention_mask=image_attention_mask,
+            return_dict=True,
+        )
+        query_output = query_outputs[0][:, : query_tokens.size(1), :]
+
+        # step 3: use the language model, conditioned on the query outputs and the prompt
+        language_model_inputs = self.language_projection(query_output)
+
+        # unbatch inputs back, each video-frame gets `num_query_tokens` seq length
+        language_model_inputs = language_model_inputs.reshape(batch_size, self.config.num_query_tokens * frames, -1)
+        if return_dict:
+            return language_model_inputs, vision_outputs, query_outputs
+        return language_model_inputs
+
+
+# Names exported by `from ... import *` for this module.
+__all__ = [
+    "InstructBlipVideoVisionModel",
+    "InstructBlipVideoPreTrainedModel",
+    "InstructBlipVideoQFormerModel",
+    "InstructBlipVideoModel",
+    "InstructBlipVideoForConditionalGeneration",
+]
diff --git a/phivenv/Lib/site-packages/transformers/models/instructblipvideo/modular_instructblipvideo.py b/phivenv/Lib/site-packages/transformers/models/instructblipvideo/modular_instructblipvideo.py
new file mode 100644
index 0000000000000000000000000000000000000000..ff30263700cf34502ec4b79da8e85eec356d9868
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/instructblipvideo/modular_instructblipvideo.py
@@ -0,0 +1,614 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Optional, Union
+
+import torch
+import torch.utils.checkpoint
+
+from transformers.models.instructblip.configuration_instructblip import (
+    InstructBlipQFormerConfig,
+    InstructBlipVisionConfig,
+)
+from transformers.models.instructblip.modeling_instructblip import (
+    InstructBlipForConditionalGeneration,
+    InstructBlipForConditionalGenerationModelOutput,
+    InstructBlipModel,
+    InstructBlipPreTrainedModel,
+    InstructBlipQFormerModel,
+    InstructBlipVisionModel,
+    TransformersKwargs,
+)
+
+from ...configuration_utils import PretrainedConfig
+from ...modeling_flash_attention_utils import FlashAttentionKwargs
+from ...models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
+from ...processing_utils import Unpack
+from ...utils import logging
+from ..auto import CONFIG_MAPPING, AutoConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+class InstructBlipVideoVisionConfig(InstructBlipVisionConfig):
+    # Video-named subclass of `InstructBlipVisionConfig`; adds and overrides nothing.
+    pass
+
+
+class InstructBlipVideoQFormerConfig(InstructBlipQFormerConfig):
+    # Video-named subclass of `InstructBlipQFormerConfig`; adds and overrides nothing.
+    pass
+
+
+class InstructBlipVideoConfig(PretrainedConfig):
+    r"""
+    [`InstructBlipVideoConfig`] is the configuration class to store the configuration of a
+    [`InstructBlipVideoForConditionalGeneration`]. It is used to instantiate a Instructblipvideo model according to the specified
+    arguments, defining the vision model, Q-Former model and language model configs. Instantiating a configuration with
+    the defaults will yield a similar configuration to that of the Instructblipvideo
+    [Salesforce/instruct-blip-flan-t5](https://huggingface.co/Salesforce/instruct-blip-flan-t5) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        vision_config (`dict`, *optional*):
+            Dictionary of configuration options used to initialize [`InstructBlipVideoVisionConfig`].
+        qformer_config (`dict`, *optional*):
+            Dictionary of configuration options used to initialize [`InstructBlipVideoQFormerConfig`].
+        text_config (`dict`, *optional*):
+            Dictionary of configuration options used to initialize any [`PretrainedConfig`]. Its `model_type` key
+            selects the config class; `"opt"` is used when the key is missing.
+        num_query_tokens (`int`, *optional*, defaults to 32):
+            The number of query tokens passed through the Transformer.
+
+        video_token_index (`int`, *optional*):
+            Token index of special video token. Also reachable as `video_token_id` through `attribute_map`.
+        kwargs (*optional*):
+            Dictionary of keyword arguments.
+
+    Example:
+
+    ```python
+    >>> from transformers import (
+    ...     InstructBlipVideoVisionConfig,
+    ...     InstructBlipVideoQFormerConfig,
+    ...     OPTConfig,
+    ...     InstructBlipVideoConfig,
+    ...     InstructBlipVideoForConditionalGeneration,
+    ... )
+
+    >>> # Initializing a InstructBlipVideoConfig with Salesforce/instruct-blip-flan-t5 style configuration
+    >>> configuration = InstructBlipVideoConfig()
+
+    >>> # Initializing a InstructBlipVideoForConditionalGeneration (with random weights) from the Salesforce/instruct-blip-flan-t5 style configuration
+    >>> model = InstructBlipVideoForConditionalGeneration(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+
+    >>> # We can also initialize a InstructBlipVideoConfig from a InstructBlipVideoVisionConfig, InstructBlipVideoQFormerConfig and any PretrainedConfig
+
+    >>> # Initializing Instructblipvideo vision, Instructblipvideo Q-Former and language model configurations
+    >>> vision_config = InstructBlipVideoVisionConfig()
+    >>> qformer_config = InstructBlipVideoQFormerConfig()
+    >>> text_config = OPTConfig()
+
+    >>> config = InstructBlipVideoConfig.from_vision_qformer_text_configs(vision_config, qformer_config, text_config)
+    ```"""
+
+    model_type = "instructblipvideo"
+    # Alias: `video_token_id` maps to the stored `video_token_index` attribute.
+    attribute_map = {
+        "video_token_id": "video_token_index",
+    }
+    # Config class used for each nested sub-configuration.
+    sub_configs = {
+        "text_config": AutoConfig,
+        "qformer_config": InstructBlipVideoQFormerConfig,
+        "vision_config": InstructBlipVideoVisionConfig,
+    }
+
+    def __init__(
+        self,
+        vision_config=None,
+        qformer_config=None,
+        text_config=None,
+        num_query_tokens=32,
+        video_token_index=None,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        # Missing sub-configs are replaced by empty dicts, i.e. the defaults of each config class.
+        if vision_config is None:
+            vision_config = {}
+            logger.info("vision_config is None. initializing the InstructBlipVideoVisionConfig with default values.")
+
+        if qformer_config is None:
+            qformer_config = {}
+            logger.info("qformer_config is None. Initializing the InstructBlipVideoQFormerConfig with default values.")
+
+        if text_config is None:
+            text_config = {}
+            logger.info("text_config is None. Initializing the text config with default values (`OPTConfig`).")
+
+        self.vision_config = InstructBlipVideoVisionConfig(**vision_config)
+        self.qformer_config = InstructBlipVideoQFormerConfig(**qformer_config)
+        # The text config class is looked up by `model_type`, falling back to "opt".
+        text_model_type = text_config.get("model_type", "opt")
+        self.text_config = CONFIG_MAPPING[text_model_type](**text_config)
+
+        self.num_query_tokens = num_query_tokens
+        self.video_token_index = video_token_index
+        # The Q-Former cross-attends to vision features, so its encoder width follows the vision hidden size.
+        self.qformer_config.encoder_hidden_size = self.vision_config.hidden_size
+        # True when the text model type is registered as a causal LM.
+        self.use_decoder_only_language_model = self.text_config.model_type in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
+        self.initializer_factor = 1.0
+        self.initializer_range = 0.02
+
+    @classmethod
+    def from_vision_qformer_text_configs(
+        cls,
+        vision_config: InstructBlipVideoVisionConfig,
+        qformer_config: InstructBlipVideoQFormerConfig,
+        text_config: PretrainedConfig,
+        **kwargs,
+    ):
+        r"""
+        Instantiate a [`InstructBlipVideoConfig`] (or a derived class) from a InstructBlipVideo vision model, Q-Former and
+        language model configurations.
+
+        Each config object is converted with `to_dict()` and passed to the constructor together with `kwargs`.
+
+        Returns:
+            [`InstructBlipVideoConfig`]: An instance of a configuration object
+        """
+
+        return cls(
+            vision_config=vision_config.to_dict(),
+            qformer_config=qformer_config.to_dict(),
+            text_config=text_config.to_dict(),
+            **kwargs,
+        )
+
+
+class InstructBlipVideoPreTrainedModel(InstructBlipPreTrainedModel):
+    pass  # no overrides; everything is inherited from InstructBlipPreTrainedModel
+
+
+class InstructBlipVideoVisionModel(InstructBlipVisionModel):
+    pass  # no overrides; everything is inherited from InstructBlipVisionModel
+
+
+class InstructBlipVideoQFormerModel(InstructBlipQFormerModel):
+    pass  # no overrides; everything is inherited from InstructBlipQFormerModel
+
+
+class InstructBlipVideoForConditionalGenerationModelOutput(InstructBlipForConditionalGenerationModelOutput):
+    pass  # no overrides; everything is inherited from InstructBlipForConditionalGenerationModelOutput
+
+
+class InstructBlipVideoModel(InstructBlipModel):
+    def forward(
+        self,
+        pixel_values: torch.FloatTensor,
+        qformer_input_ids: torch.LongTensor,
+        qformer_attention_mask: Optional[torch.LongTensor] = None,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.LongTensor] = None,
+        decoder_input_ids: Optional[torch.LongTensor] = None,
+        decoder_attention_mask: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        interpolate_pos_encoding: bool = False,
+        use_cache: Optional[bool] = None,
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> Union[tuple, InstructBlipVideoForConditionalGenerationModelOutput]:
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # step 1: forward the video frames through the vision encoder;
+        # frames are folded into the batch dimension here and unfolded again after the projection (see reshape below)
+        batch_size, frames, channel, height, width = pixel_values.shape
+        pixel_values = pixel_values.reshape(batch_size * frames, channel, height, width)
+
+        vision_outputs = self.vision_model(
+            pixel_values=pixel_values,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            interpolate_pos_encoding=interpolate_pos_encoding,
+        )
+        image_embeds = vision_outputs[0]
+
+        # step 2: forward the query tokens through the QFormer, using the image embeddings for cross-attention
+        image_attention_mask = torch.ones(image_embeds.size()[:-1], dtype=torch.long, device=image_embeds.device)
+
+        # difference with BLIP-2 here: we also feed the instruction prompt to the Q-Former
+        query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1)
+        query_attention_mask = torch.ones(query_tokens.size()[:-1], dtype=torch.long, device=image_embeds.device)
+
+        if qformer_attention_mask is None:
+            qformer_attention_mask = torch.ones_like(qformer_input_ids)
+        # the prompt is repeated once per frame so that it lines up with the frame-flattened batch
+        qformer_input_ids = qformer_input_ids.repeat_interleave(frames, dim=0)
+        qformer_attention_mask = qformer_attention_mask.repeat_interleave(frames, dim=0)
+        qformer_attention_mask = torch.cat([query_attention_mask, qformer_attention_mask], dim=1)
+        query_outputs = self.qformer(
+            input_ids=qformer_input_ids,
+            attention_mask=qformer_attention_mask,
+            query_embeds=query_tokens,
+            encoder_hidden_states=image_embeds,
+            encoder_attention_mask=image_attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        query_output = query_outputs[0][:, : query_tokens.size(1), :]  # keep only the query-token positions
+
+        # step 3: use the language model, conditioned on the query outputs and the prompt
+        language_model_inputs = self.language_projection(query_output)
+
+        # unbatch inputs back, each video-frame gets `num_query_tokens` seq length
+        language_model_inputs = language_model_inputs.reshape(batch_size, self.config.num_query_tokens * frames, -1)
+        if inputs_embeds is None:
+            inputs_embeds = self.language_model.get_input_embeddings()(input_ids)
+            special_image_mask = input_ids == self.config.video_token_id
+            if attention_mask is None:
+                attention_mask = torch.ones_like(input_ids)
+        else:
+            special_image_mask = inputs_embeds == self.get_input_embeddings()(
+                torch.tensor(self.config.video_token_id, dtype=torch.long, device=inputs_embeds.device)
+            )
+            special_image_mask = special_image_mask.all(-1)
+        # expand the per-token mask over the hidden dimension, then overwrite the placeholder embeddings
+        special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
+        language_model_inputs = language_model_inputs.to(inputs_embeds.device, inputs_embeds.dtype)
+        inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, language_model_inputs)
+
+        if self.config.use_decoder_only_language_model:
+            outputs = self.language_model(
+                inputs_embeds=inputs_embeds,
+                attention_mask=attention_mask,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+                use_cache=use_cache,
+                **kwargs,
+            )
+        else:
+            outputs = self.language_model(
+                inputs_embeds=inputs_embeds,
+                attention_mask=attention_mask,
+                decoder_input_ids=decoder_input_ids,
+                decoder_attention_mask=decoder_attention_mask,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+                use_cache=use_cache,
+                **kwargs,
+            )
+
+        return InstructBlipVideoForConditionalGenerationModelOutput(
+            vision_outputs=vision_outputs,
+            qformer_outputs=query_outputs,
+            language_model_outputs=outputs,
+        )
+
+
+class InstructBlipVideoForConditionalGeneration(InstructBlipForConditionalGeneration):
+    def get_video_features(
+        self,
+        pixel_values: torch.FloatTensor,
+        qformer_input_ids: torch.LongTensor,
+        qformer_attention_mask: Optional[torch.LongTensor] = None,
+        interpolate_pos_encoding: Optional[bool] = False,
+        return_dict: Optional[bool] = False,
+    ):
+        """
+        Encodes video frames into continuous embeddings that can be forwarded to the language model.
+
+        Args:
+            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_frames, num_channels, height, width)`):
+                The tensors corresponding to the input videos.
+        """
+        # step 1: forward the video frames through the vision encoder;
+        # frames are folded into the batch dimension here and unfolded again after the projection (see reshape below)
+        batch_size, frames, channel, height, width = pixel_values.shape
+        pixel_values = pixel_values.reshape(batch_size * frames, channel, height, width)
+
+        vision_outputs = self.vision_model(
+            pixel_values=pixel_values,
+            interpolate_pos_encoding=interpolate_pos_encoding,
+            return_dict=True,
+        )
+        image_embeds = vision_outputs[0]
+
+        # step 2: forward the query tokens through the QFormer, using the image embeddings for cross-attention
+        image_attention_mask = torch.ones(image_embeds.size()[:-1], dtype=torch.long, device=image_embeds.device)
+
+        # difference with BLIP-2 here: we also feed the instruction prompt to the Q-Former
+        query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1)
+        query_attention_mask = torch.ones(query_tokens.size()[:-1], dtype=torch.long, device=image_embeds.device)
+
+        if qformer_attention_mask is None:
+            qformer_attention_mask = torch.ones_like(qformer_input_ids)
+        # the prompt is repeated once per frame so that it lines up with the frame-flattened batch
+        qformer_input_ids = qformer_input_ids.repeat_interleave(frames, dim=0)
+        qformer_attention_mask = qformer_attention_mask.repeat_interleave(frames, dim=0)
+        qformer_attention_mask = torch.cat([query_attention_mask, qformer_attention_mask], dim=1)
+        query_outputs = self.qformer(
+            input_ids=qformer_input_ids,
+            attention_mask=qformer_attention_mask,
+            query_embeds=query_tokens,
+            encoder_hidden_states=image_embeds,
+            encoder_attention_mask=image_attention_mask,
+            return_dict=True,
+        )
+        query_output = query_outputs[0][:, : query_tokens.size(1), :]
+
+        # step 3: use the language model, conditioned on the query outputs and the prompt
+        language_model_inputs = self.language_projection(query_output)
+
+        # unbatch inputs back, each video-frame gets `num_query_tokens` seq length
+        language_model_inputs = language_model_inputs.reshape(batch_size, self.config.num_query_tokens * frames, -1)
+        if return_dict:  # despite the name, this branch returns a 3-tuple, not a dict
+            return language_model_inputs, vision_outputs, query_outputs
+        return language_model_inputs
+
+    # Model supports only videos
+    def get_image_features(
+        self,
+        pixel_values: torch.FloatTensor,
+        qformer_input_ids: torch.LongTensor,
+        qformer_attention_mask: Optional[torch.LongTensor] = None,
+        interpolate_pos_encoding: Optional[bool] = False,
+        return_dict: Optional[bool] = False,
+    ):
+        pass  # intentionally a no-op that returns None; use `get_video_features` instead
+
+    def get_placeholder_mask(self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor):
+        """
+        Build a mask, shaped like `inputs_embeds`, that marks the video placeholder positions.
+        """
+        if input_ids is not None:
+            placeholder_positions = input_ids == self.config.video_token_id
+        else:
+            # No ids available: compare every embedding against the embedding of the video token.
+            video_token = torch.tensor(self.config.video_token_id, dtype=torch.long, device=inputs_embeds.device)
+            video_embedding = self.get_input_embeddings()(video_token)
+            placeholder_positions = (inputs_embeds == video_embedding).all(-1)
+
+        expanded_mask = placeholder_positions.unsqueeze(-1).expand_as(inputs_embeds)
+        return expanded_mask.to(inputs_embeds.device)
+
+    def forward(
+        self,
+        pixel_values: torch.FloatTensor,
+        qformer_input_ids: torch.LongTensor,
+        qformer_attention_mask: Optional[torch.LongTensor] = None,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.LongTensor] = None,
+        decoder_input_ids: Optional[torch.LongTensor] = None,
+        decoder_attention_mask: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        labels: Optional[torch.LongTensor] = None,
+        return_dict: Optional[bool] = None,
+        interpolate_pos_encoding: bool = False,
+        use_cache: Optional[bool] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> Union[tuple, InstructBlipVideoForConditionalGenerationModelOutput]:
+        r"""
+        qformer_input_ids (`torch.LongTensor` of shape (batch_size, sequence_length)):
+            The sequence used as a prompt to be fed to the Q-Former module.
+        qformer_attention_mask (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
+            Mask to avoid performing attention on padding token indices.
+
+        Examples:
+
+        ```python
+        >>> from transformers import InstructBlipVideoProcessor, InstructBlipVideoForConditionalGeneration
+        >>> import torch
+        >>> from huggingface_hub import hf_hub_download
+        >>> import av
+        >>> import numpy as np
+
+        >>> def read_video_pyav(container, indices):
+        ...     '''
+        ...     Decode the video with PyAV decoder.
+        ...     Args:
+        ...         container (`av.container.input.InputContainer`): PyAV container.
+        ...         indices (`list[int]`): List of frame indices to decode.
+        ...     Returns:
+        ...         result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
+        ...     '''
+        ...     frames = []
+        ...     container.seek(0)
+        ...     start_index = indices[0]
+        ...     end_index = indices[-1]
+        ...     for i, frame in enumerate(container.decode(video=0)):
+        ...         if i > end_index:
+        ...             break
+        ...         if i >= start_index and i in indices:
+        ...             frames.append(frame)
+        ...     return np.stack([x.to_ndarray(format="rgb24") for x in frames])
+
+        >>> model = InstructBlipVideoForConditionalGeneration.from_pretrained("Salesforce/instructblip-vicuna-7b", device_map="auto")
+        >>> processor = InstructBlipVideoProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b")
+
+        >>> file_path = hf_hub_download(
+        ...       repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
+        ... )
+        >>> container = av.open(file_path)
+
+        >>> # sample uniformly 4 frames from the video
+        >>> total_frames = container.streams.video[0].frames
+        >>> indices = np.arange(0, total_frames, total_frames / 4).astype(int)
+        >>> clip = read_video_pyav(container, indices)
+
+        >>> prompt = "What is happening in the video?"
+        >>> inputs = processor(text=prompt, images=clip, return_tensors="pt").to(model.device)
+
+        >>> outputs = model.generate(
+        ...     **inputs,
+        ...     do_sample=False,
+        ...     num_beams=5,
+        ...     max_length=256,
+        ...     repetition_penalty=1.5,
+        ...     length_penalty=1.0,
+        ... )
+        >>> generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0].strip()
+        >>> print(generated_text)
+        "A person is eating a bowl of pasta, and they are using a fork to eat it. The person is sitting at a table, and the plate of pasta is on the table in front"
+        ```"""
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        language_model_inputs, vision_outputs, query_outputs = self.get_video_features(
+            pixel_values,
+            qformer_input_ids=qformer_input_ids,
+            qformer_attention_mask=qformer_attention_mask,
+            interpolate_pos_encoding=interpolate_pos_encoding,
+            return_dict=True,
+        )
+        vision_outputs = vision_outputs.to_tuple() if not return_dict else vision_outputs
+        query_outputs = query_outputs.to_tuple() if not return_dict else query_outputs
+
+        if inputs_embeds is None:
+            inputs_embeds = self.get_input_embeddings()(input_ids)
+        if attention_mask is None and input_ids is not None:
+            attention_mask = torch.ones_like(input_ids)
+        elif attention_mask is None:  # only `inputs_embeds` was given: derive the mask shape from the embeddings
+            attention_mask = torch.ones(inputs_embeds.shape[:-1], dtype=torch.long, device=inputs_embeds.device)
+        language_model_inputs = language_model_inputs.to(inputs_embeds.device, inputs_embeds.dtype)
+        special_image_mask = self.get_placeholder_mask(input_ids, inputs_embeds=inputs_embeds)
+        inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, language_model_inputs)
+
+        if self.config.use_decoder_only_language_model:
+            outputs = self.language_model(
+                inputs_embeds=inputs_embeds,
+                attention_mask=attention_mask,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+                use_cache=use_cache,
+                **kwargs,
+            )
+            logits = outputs.logits if return_dict else outputs[0]
+            loss = None
+            if labels is not None:
+                loss = self.loss_function(
+                    logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **kwargs
+                )
+
+        else:
+            outputs = self.language_model(
+                inputs_embeds=inputs_embeds,
+                attention_mask=attention_mask,
+                decoder_input_ids=decoder_input_ids,
+                decoder_attention_mask=decoder_attention_mask,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+                labels=labels,
+                use_cache=use_cache,
+                **kwargs,
+            )
+            loss = outputs.loss if return_dict else outputs[0]
+            logits = outputs.logits if return_dict else outputs[1]
+
+        return InstructBlipVideoForConditionalGenerationModelOutput(
+            loss=loss,
+            logits=logits,
+            vision_outputs=vision_outputs,
+            qformer_outputs=query_outputs,
+            language_model_outputs=outputs,
+        )
+
+    @torch.no_grad()
+    def generate(
+        self,
+        pixel_values: torch.FloatTensor,
+        qformer_input_ids: Optional[torch.LongTensor] = None,
+        qformer_attention_mask: Optional[torch.LongTensor] = None,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        interpolate_pos_encoding: bool = False,
+        **generate_kwargs,
+    ) -> torch.LongTensor:
+        r"""
+        Overrides `generate` function to be able to use the model as a conditional generator.
+
+        Args:
+            pixel_values (`torch.FloatTensor` of shape (batch_size, num_frames, num_channels, height, width)):
+                Input videos to be processed.
+            qformer_input_ids (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
+                The sequence used as a prompt to be fed to the Q-Former module.
+            qformer_attention_mask (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
+                Mask to avoid performing attention on padding token indices.
+            input_ids (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
+                The sequence used as a prompt for the generation.
+            attention_mask (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
+                Mask to avoid performing attention on padding token indices.
+            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+                Embedded representation of the inputs. Should be float, not int tokens.
+            interpolate_pos_encoding (`bool`, *optional*, defaults to `False`):
+                Whether to interpolate the positional encoding of the image embeddings.
+
+        Returns:
+            The output of `self.language_model.generate` for the merged inputs (not decoded strings).
+        """
+        if hasattr(self, "hf_device_map"):
+            # preprocess for `accelerate`
+            self._preprocess_accelerate()
+
+        batch_size = pixel_values.shape[0]
+        language_model_inputs, vision_outputs, query_outputs = self.get_video_features(
+            pixel_values,
+            qformer_input_ids=qformer_input_ids,
+            qformer_attention_mask=qformer_attention_mask,
+            interpolate_pos_encoding=interpolate_pos_encoding,
+            return_dict=True,
+        )
+
+        if inputs_embeds is None:
+            if input_ids is None:
+                # one placeholder per projected video embedding, i.e. `num_query_tokens * num_frames`, then BOS
+                video_tokens = [self.config.video_token_index] * language_model_inputs.shape[1]
+                start_tokens = video_tokens + [self.config.text_config.bos_token_id]
+                input_ids = torch.tensor([start_tokens], dtype=torch.long, device=pixel_values.device)
+                input_ids = input_ids.repeat(batch_size, 1)
+            inputs_embeds = self.get_input_embeddings()(input_ids)
+        if attention_mask is None and input_ids is not None:
+            attention_mask = torch.ones_like(input_ids)
+        elif attention_mask is None:  # only `inputs_embeds` was given: derive the mask shape from the embeddings
+            attention_mask = torch.ones(inputs_embeds.shape[:-1], dtype=torch.long, device=inputs_embeds.device)
+        language_model_inputs = language_model_inputs.to(inputs_embeds.device, inputs_embeds.dtype)
+        special_image_mask = self.get_placeholder_mask(input_ids, inputs_embeds=inputs_embeds)
+        inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, language_model_inputs)
+
+        inputs = {"inputs_embeds": inputs_embeds, "attention_mask": attention_mask}
+        if not self.language_model.config.is_encoder_decoder:
+            inputs["input_ids"] = input_ids
+
+        outputs = self.language_model.generate(**inputs, **generate_kwargs)
+
+        return outputs
+
+
+__all__ = [
+    "InstructBlipVideoConfig",
+    "InstructBlipVideoQFormerConfig",
+    "InstructBlipVideoVisionConfig",
+    "InstructBlipVideoVisionModel",
+    "InstructBlipVideoPreTrainedModel",
+    "InstructBlipVideoQFormerModel",
+    "InstructBlipVideoModel",
+    "InstructBlipVideoForConditionalGeneration",
+]
diff --git a/phivenv/Lib/site-packages/transformers/models/instructblipvideo/processing_instructblipvideo.py b/phivenv/Lib/site-packages/transformers/models/instructblipvideo/processing_instructblipvideo.py
new file mode 100644
index 0000000000000000000000000000000000000000..a518c3a1a19cfb2982144fd56c4fd3e31dd121c5
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/instructblipvideo/processing_instructblipvideo.py
@@ -0,0 +1,215 @@
+# coding=utf-8
+# Copyright 2023 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Processor class for InstructBLIPVideo. Largely copy of Blip2Processor with addition of a tokenizer for the Q-Former.
+"""
+
+import os
+from typing import Optional, Union
+
+from ...image_processing_utils import BatchFeature
+from ...processing_utils import ProcessorMixin
+from ...tokenization_utils_base import (
+    AddedToken,
+    PaddingStrategy,
+    PreTokenizedInput,
+    TextInput,
+    TruncationStrategy,
+)
+from ...utils import TensorType, logging
+from ...video_utils import VideoInput
+from ..auto import AutoTokenizer
+
+
+logger = logging.get_logger(__name__)
+
+
+class InstructBlipVideoProcessor(ProcessorMixin):
+    r"""
+    Constructs an InstructBLIPVideo processor which wraps an InstructBLIPVideo video processor, a LLaMa/T5 tokenizer and a
+    Q-Former tokenizer into a single processor.
+
+    [`InstructBlipVideoProcessor`] offers all the functionalities of [`InstructBlipVideoVideoProcessor`] and [`AutoTokenizer`]. See the
+    docstring of [`~InstructBlipVideoProcessor.__call__`] and [`~InstructBlipVideoProcessor.decode`] for more information.
+
+    Args:
+        video_processor (`InstructBlipVideoVideoProcessor`):
+            An instance of [`InstructBlipVideoVideoProcessor`]. The video processor is a required input.
+        tokenizer (`AutoTokenizer`):
+            An instance of [`PreTrainedTokenizer`]. The tokenizer is a required input.
+        qformer_tokenizer (`AutoTokenizer`):
+            An instance of [`PreTrainedTokenizer`]. The Q-Former tokenizer is a required input.
+        num_query_tokens (`int`, *optional*):
+            Number of tokens used by the Qformer as queries, should be same as in model's config.
+    """
+
+    attributes = ["video_processor", "tokenizer", "qformer_tokenizer"]
+    video_processor_class = "AutoVideoProcessor"
+    tokenizer_class = "AutoTokenizer"
+    qformer_tokenizer_class = "AutoTokenizer"
+
+    def __init__(self, video_processor, tokenizer, qformer_tokenizer, num_query_tokens=None, **kwargs):
+        if not hasattr(tokenizer, "video_token"):  # tokenizer exposes no video token: register "<video>" on it
+            self.video_token = AddedToken("<video>", normalized=False, special=True)
+            tokenizer.add_tokens([self.video_token], special_tokens=True)
+        else:  # reuse whatever the tokenizer already exposes as its video token
+            self.video_token = tokenizer.video_token
+        self.num_query_tokens = num_query_tokens
+        super().__init__(video_processor, tokenizer, qformer_tokenizer)  # note: `**kwargs` is not forwarded
+
+    def __call__(
+        self,
+        images: VideoInput = None,
+        text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
+        add_special_tokens: bool = True,
+        padding: Union[bool, str, PaddingStrategy] = False,
+        truncation: Union[bool, str, TruncationStrategy] = None,
+        max_length: Optional[int] = None,
+        stride: int = 0,
+        pad_to_multiple_of: Optional[int] = None,
+        return_attention_mask: Optional[bool] = None,
+        return_overflowing_tokens: bool = False,
+        return_special_tokens_mask: bool = False,
+        return_offsets_mapping: bool = False,
+        return_token_type_ids: bool = False,
+        return_length: bool = False,
+        verbose: bool = True,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        **kwargs,
+    ) -> BatchFeature:
+        """
+        Prepare video(s) with the video processor (passed via `images`) and text with both the language-model
+        tokenizer and the Q-Former tokenizer, returning everything in a single [`BatchFeature`].
+
+        Raises `ValueError` if neither `images` nor `text` is given, or if `text` is not a string, a list, or a tuple of strings.
+        """
+        if images is None and text is None:
+            raise ValueError("You have to specify at least one of images or text.")
+
+        encoding = {}
+        if text is not None:
+            if isinstance(text, str):
+                text = [text]
+            elif not isinstance(text, list) and not (isinstance(text, tuple) and text and isinstance(text[0], str)):
+                raise ValueError("Invalid input text. Please provide a string, or a list of strings")
+
+            qformer_text_encoding = self.qformer_tokenizer(
+                text=text,
+                add_special_tokens=add_special_tokens,
+                padding=padding,
+                truncation=truncation,
+                max_length=max_length,
+                stride=stride,
+                pad_to_multiple_of=pad_to_multiple_of,
+                return_attention_mask=return_attention_mask,
+                return_overflowing_tokens=return_overflowing_tokens,
+                return_special_tokens_mask=return_special_tokens_mask,
+                return_offsets_mapping=return_offsets_mapping,
+                return_token_type_ids=return_token_type_ids,
+                return_length=return_length,
+                verbose=verbose,
+                return_tensors=return_tensors,
+                **kwargs,
+            )
+            encoding["qformer_input_ids"] = qformer_text_encoding.pop("input_ids")
+            encoding["qformer_attention_mask"] = qformer_text_encoding.pop("attention_mask")
+
+            # We need this hacky manipulation because BLIP expects image tokens to be at the beginning even before BOS token
+            # InstructBLIP works with 4 frames only. NOTE(review): only `num_query_tokens` (not x4) is reserved below - confirm
+            if max_length is not None:
+                max_length -= self.num_query_tokens
+            text_encoding = self.tokenizer(
+                text=text,
+                add_special_tokens=add_special_tokens,
+                padding=padding,
+                truncation=truncation,
+                max_length=max_length,
+                stride=stride,
+                pad_to_multiple_of=pad_to_multiple_of,
+                return_attention_mask=return_attention_mask,
+                return_overflowing_tokens=return_overflowing_tokens,
+                return_special_tokens_mask=return_special_tokens_mask,
+                return_offsets_mapping=return_offsets_mapping,
+                return_token_type_ids=return_token_type_ids,
+                return_length=return_length,
+                verbose=verbose,
+                return_tensors=None,  # required to concatenate below
+                **kwargs,
+            )
+            # `video_token` is an `AddedToken` when created in `__init__`; otherwise presumably a plain string (confirm)
+            if images is not None:
+                video_tokens = getattr(self.video_token, "content", self.video_token) * self.num_query_tokens * 4
+                video_text_encoding = self.tokenizer(
+                    video_tokens,
+                    add_special_tokens=False,  # required to concatenate below
+                    return_attention_mask=return_attention_mask,
+                    return_overflowing_tokens=return_overflowing_tokens,
+                    return_special_tokens_mask=return_special_tokens_mask,
+                    return_offsets_mapping=return_offsets_mapping,
+                    return_token_type_ids=return_token_type_ids,
+                    return_length=return_length,
+                    return_tensors=None,
+                )
+                for k in text_encoding:
+                    text_encoding[k] = [video_text_encoding[k] + sample for sample in text_encoding[k]]
+            encoding.update(text_encoding)
+
+        if images is not None:
+            image_encoding = self.video_processor(images, return_tensors=return_tensors)
+            encoding.update(image_encoding)
+
+        encoding = BatchFeature(encoding, tensor_type=return_tensors)
+        return encoding
+
+    @property
+    def model_input_names(self):
+        # Tokenizer inputs first, then video-processor inputs, then the two Q-Former inputs.
+        names = self.tokenizer.model_input_names + self.video_processor.model_input_names
+        names = names + ["qformer_input_ids", "qformer_attention_mask"]
+        return names
+
+    # overwrite to save the Q-Former tokenizer in a separate folder
+    def save_pretrained(self, save_directory, **kwargs):
+        """Save the Q-Former tokenizer to a `qformer_tokenizer` subfolder and everything else to `save_directory`."""
+        if os.path.isfile(save_directory):
+            raise ValueError(f"Provided path ({save_directory}) should be a directory, not a file")
+        os.makedirs(save_directory, exist_ok=True)
+        qformer_tokenizer_path = os.path.join(save_directory, "qformer_tokenizer")
+        self.qformer_tokenizer.save_pretrained(qformer_tokenizer_path)
+        # We modify the attributes so that only the tokenizer and video processor are saved in the main folder
+        qformer_present = "qformer_tokenizer" in self.attributes
+        if qformer_present:
+            self.attributes.remove("qformer_tokenizer")
+        try:
+            outputs = super().save_pretrained(save_directory, **kwargs)
+        finally:  # restore even when saving fails; `remove` mutated the class-level list in place
+            if qformer_present:
+                self.attributes += ["qformer_tokenizer"]
+        return outputs
+
+    # overwrite to load the Q-Former tokenizer from a separate folder
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
+        """Load the processor, then attach the Q-Former tokenizer read from the `qformer_tokenizer` subfolder."""
+        loaded = super().from_pretrained(pretrained_model_name_or_path, **kwargs)
+        # if return_unused_kwargs a tuple is returned where the second element is 'unused_kwargs'; keep the processor
+        processor = loaded[0] if isinstance(loaded, tuple) else loaded
+        processor.qformer_tokenizer = AutoTokenizer.from_pretrained(
+            pretrained_model_name_or_path, subfolder="qformer_tokenizer"
+        )
+        return processor
+
+
+__all__ = ["InstructBlipVideoProcessor"]
diff --git a/phivenv/Lib/site-packages/transformers/models/instructblipvideo/video_processing_instructblipvideo.py b/phivenv/Lib/site-packages/transformers/models/instructblipvideo/video_processing_instructblipvideo.py
new file mode 100644
index 0000000000000000000000000000000000000000..805ecda064974df70bd90eb2eff9a1bd7bcf94af
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/instructblipvideo/video_processing_instructblipvideo.py
@@ -0,0 +1,127 @@
+# coding=utf-8
+# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Video processor class for InstructBLIPVideo
+"""
+
+from typing import Optional, Union
+
+from ...image_processing_utils import BatchFeature
+from ...image_utils import (
+    OPENAI_CLIP_MEAN,
+    OPENAI_CLIP_STD,
+    SizeDict,
+)
+from ...processing_utils import Unpack, VideosKwargs
+from ...utils import (
+    TensorType,
+    is_torch_available,
+    is_torchvision_available,
+    is_torchvision_v2_available,
+    is_vision_available,
+)
+from ...utils.import_utils import requires
+from ...video_processing_utils import BaseVideoProcessor
+from ...video_utils import group_videos_by_shape, reorder_videos
+
+
+if is_vision_available():
+    from ...image_utils import PILImageResampling
+
+if is_torchvision_available():
+    if is_torchvision_v2_available():
+        from torchvision.transforms.v2 import functional as F
+    else:
+        from torchvision.transforms import functional as F
+
+
+if is_torch_available():
+    import torch
+
+
+# Keyword arguments accepted by `InstructBlipVideoVideoProcessor.__init__`; declares no keys beyond `VideosKwargs`.
+class InstructBlipVideoVideoProcessorInitKwargs(VideosKwargs): ...
+
+
+@requires(backends=("torchvision",))
+class InstructBlipVideoVideoProcessor(BaseVideoProcessor):
+    """
+    Video processor for InstructBLIPVideo.
+
+    The class attributes are the default preprocessing settings. `_preprocess` converts, resizes, optionally
+    center-crops, rescales and normalizes the videos and returns them under the `pixel_values` key.
+    """
+
+    resample = PILImageResampling.BICUBIC
+    image_mean = OPENAI_CLIP_MEAN
+    image_std = OPENAI_CLIP_STD
+    size = {"height": 384, "width": 384}
+    default_to_square = True
+    do_resize = True
+    do_rescale = True
+    do_normalize = True
+    do_convert_rgb = True
+    do_sample_frames = False  # Set to False for BC, recommended to set `True` in new models
+    valid_kwargs = InstructBlipVideoVideoProcessorInitKwargs
+    model_input_names = ["pixel_values"]
+
+    def __init__(self, **kwargs: Unpack[InstructBlipVideoVideoProcessorInitKwargs]):
+        """Forward every keyword argument to the base video processor."""
+        super().__init__(**kwargs)
+
+    def _preprocess(
+        self,
+        videos: list["torch.Tensor"],
+        do_convert_rgb: bool,
+        do_resize: bool,
+        size: SizeDict,
+        size_divisor: Optional[int],
+        interpolation: Optional["F.InterpolationMode"],
+        do_center_crop: bool,
+        crop_size: SizeDict,
+        do_rescale: bool,
+        do_pad: bool,
+        rescale_factor: float,
+        do_normalize: bool,
+        image_mean: Optional[Union[float, list[float]]],
+        image_std: Optional[Union[float, list[float]]],
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        **kwargs,
+    ) -> BatchFeature:
+        """
+        Run the preprocessing pipeline on `videos` and wrap the result in a `BatchFeature`.
+
+        Videos are grouped by shape so that each group is handled as one stacked batch, and are put back
+        in their original order after each pass. `do_pad` and `**kwargs` are accepted but not used here.
+
+        Returns:
+            `BatchFeature` with a single `pixel_values` entry; the videos are stacked into one tensor only
+            when `return_tensors` is truthy.
+        """
+        # Pass 1: RGB conversion and resizing, batched per input shape
+        by_shape, shape_index = group_videos_by_shape(videos)
+        resized_by_shape = {}
+        for shape, batch in by_shape.items():
+            batch = self.convert_to_rgb(batch) if do_convert_rgb else batch
+            if do_resize:
+                batch = self.resize(batch, size=size, size_divisor=size_divisor, interpolation=interpolation)
+            resized_by_shape[shape] = batch
+        resized_videos = reorder_videos(resized_by_shape, shape_index)
+
+        # Pass 2: regroup, because shapes may still differ when do_resize is False or resize returns
+        # videos with different sizes
+        by_shape, shape_index = group_videos_by_shape(resized_videos)
+        processed_by_shape = {}
+        for shape, batch in by_shape.items():
+            if do_center_crop:
+                batch = self.center_crop(batch, crop_size)
+            # Fused rescale and normalize
+            processed_by_shape[shape] = self.rescale_and_normalize(
+                batch, do_rescale, rescale_factor, do_normalize, image_mean, image_std
+            )
+
+        processed_videos = reorder_videos(processed_by_shape, shape_index)
+        if return_tensors:
+            processed_videos = torch.stack(processed_videos, dim=0)
+
+        return BatchFeature(data={"pixel_values": processed_videos}, tensor_type=return_tensors)
+
+
+__all__ = ["InstructBlipVideoVideoProcessor"]
diff --git a/phivenv/Lib/site-packages/transformers/models/internvl/__init__.py b/phivenv/Lib/site-packages/transformers/models/internvl/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d4ffe7befafddc423a58d41934a260137da0a13
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/internvl/__init__.py
@@ -0,0 +1,29 @@
+# Copyright 2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+# Static type checkers see the real star-imports below; at runtime this module's entry in `sys.modules` is
+# replaced by a `_LazyModule` built from what `define_import_structure` returns for this file's path.
+# NOTE(review): presumably `_LazyModule` defers the submodule imports until first attribute access - confirm.
+if TYPE_CHECKING:
+    from .configuration_internvl import *
+    from .modeling_internvl import *
+    from .processing_internvl import *
+    from .video_processing_internvl import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/phivenv/Lib/site-packages/transformers/models/internvl/__pycache__/__init__.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/internvl/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9792086cc4fab082b18705d2ef5996af0db6ff51
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/internvl/__pycache__/__init__.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/internvl/__pycache__/configuration_internvl.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/internvl/__pycache__/configuration_internvl.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..37e422c9bcd1844a43e42ba0af5feb67a9f7aec7
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/internvl/__pycache__/configuration_internvl.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/internvl/__pycache__/modeling_internvl.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/internvl/__pycache__/modeling_internvl.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..69853f88eac4d3b00abc0f571bda80fdea92c861
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/internvl/__pycache__/modeling_internvl.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/internvl/__pycache__/modular_internvl.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/internvl/__pycache__/modular_internvl.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4bacb4b157faa89fe12d56d165ba69e2be110836
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/internvl/__pycache__/modular_internvl.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/internvl/__pycache__/processing_internvl.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/internvl/__pycache__/processing_internvl.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9d1bf1710d396687ebdc1e6b8f142889ab0242ee
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/internvl/__pycache__/processing_internvl.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/internvl/__pycache__/video_processing_internvl.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/internvl/__pycache__/video_processing_internvl.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d6c4a44f29690a0b56b7b4897dd9cb89be33f344
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/internvl/__pycache__/video_processing_internvl.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/internvl/configuration_internvl.py b/phivenv/Lib/site-packages/transformers/models/internvl/configuration_internvl.py
new file mode 100644
index 0000000000000000000000000000000000000000..17be5388b6ab7dcbf596c2deee69b4a467755fe2
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/internvl/configuration_internvl.py
@@ -0,0 +1,225 @@
+# coding=utf-8
+# Copyright 2025 HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from ...configuration_utils import PretrainedConfig
+from ..auto import CONFIG_MAPPING, AutoConfig
+
+
+class InternVLVisionConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of an [`InternVLVisionModel`]. It is used to instantiate an InternVLVisionModel
+    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield
+    a similar configuration to that of the InternVL3-1B.
+    e.g. [OpenGVLab/InternVL3-1B-hf](https://huggingface.co/OpenGVLab/InternVL3-1B-hf)
+
+    Args:
+        hidden_size (`int`, *optional*, defaults to 1024):
+            Dimensionality of the encoder layers and the pooler layer.
+        num_hidden_layers (`int`, *optional*, defaults to 24):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        attention_bias (`bool`, *optional*, defaults to `False`):
+            Whether to add a bias to the queries, keys and values.
+        use_qk_norm (`bool`, *optional*, defaults to `False`):
+            Whether to apply normalization to the queries and keys before the attention operation.
+        intermediate_size (`int`, *optional*, defaults to 4096):
+            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"selu"` and `"gelu_new"` are supported.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            Dropout probability for attention weights.
+        projection_dropout (`float`, *optional*, defaults to 0.0):
+            Dropout probability for the projection layer.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        norm_type (`str`, *optional*, defaults to `"layer_norm"`):
+            The type of normalization to use in the encoder. Can be `"layer_norm"` or `"rms_norm"`.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-06):
+            The epsilon used by the layer normalization layers.
+        image_size (`int` or `list[int]`, *optional*, defaults to `[448, 448]`):
+            The size (resolution) of each image. A single `int` is expanded to a `(size, size)` pair.
+        patch_size (`int` or `list[int]`, *optional*, defaults to `[14, 14]`):
+            The size (resolution) of each patch. A single `int` is expanded to a `(size, size)` pair.
+        num_channels (`int`, *optional*, defaults to 3):
+            The number of input channels.
+        use_mask_token (`bool`, *optional*, defaults to `False`):
+            Whether to use a mask token for masked image modeling.
+        use_absolute_position_embeddings (`bool`, *optional*, defaults to `True`):
+            Whether to use BERT-style absolute position embeddings.
+        layer_scale_init_value (`float`, *optional*, defaults to 0.1):
+            Scale to use in the self-attention layers. 0.1 for base, 1e-5 for large. Set 0 to disable layer scale.
+        use_mean_pooling (`bool`, *optional*, defaults to `True`):
+            Whether to mean pool the final hidden states of the patches instead of using the final hidden state of the
+            CLS token, before applying the classification head.
+
+    Example:
+
+    ```python
+    >>> from transformers import InternVLVisionConfig, InternVLVisionModel
+
+    >>> # Initializing an InternVLVisionModel OpenGVLab/InternVL3-1B-hf style configuration
+    >>> configuration = InternVLVisionConfig()
+
+    >>> # Initializing a model (with random weights) from the OpenGVLab/InternVL3-1B-hf configuration
+    >>> model = InternVLVisionModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "internvl_vision"
+    base_config_key = "vision_config"
+
+    @staticmethod
+    def _to_pair(value):
+        """Return `value` as a 2-element size: lists are copied, tuples are kept, anything else is duplicated."""
+        if isinstance(value, list):
+            # Copy so the config never aliases a list owned by the caller or by the default arguments
+            return list(value)
+        if isinstance(value, tuple):
+            return value
+        return (value, value)
+
+    def __init__(
+        self,
+        hidden_size=1024,
+        num_hidden_layers=24,
+        num_attention_heads=16,
+        attention_bias=False,
+        use_qk_norm=False,
+        intermediate_size=4096,
+        hidden_act="gelu",
+        hidden_dropout_prob=0.0,
+        attention_dropout=0.0,
+        projection_dropout=0.0,
+        initializer_range=0.02,
+        norm_type="layer_norm",
+        layer_norm_eps=1e-06,
+        image_size=[448, 448],
+        patch_size=[14, 14],
+        num_channels=3,
+        use_mask_token=False,
+        use_absolute_position_embeddings=True,
+        layer_scale_init_value=0.1,
+        use_mean_pooling=True,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.attention_bias = attention_bias
+        self.use_qk_norm = use_qk_norm
+        self.intermediate_size = intermediate_size
+        self.hidden_act = hidden_act
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_dropout = attention_dropout
+        self.projection_dropout = projection_dropout
+        self.initializer_range = initializer_range
+        self.norm_type = norm_type
+        self.layer_norm_eps = layer_norm_eps
+
+        # The `[448, 448]` / `[14, 14]` defaults are created once, when the function is defined. Storing them
+        # as-is would let an in-place change to one config's `image_size` leak into every later default config,
+        # so list inputs are copied here.
+        self.image_size = self._to_pair(image_size)
+        self.patch_size = self._to_pair(patch_size)
+
+        self.num_channels = num_channels
+        self.use_mask_token = use_mask_token
+        self.use_absolute_position_embeddings = use_absolute_position_embeddings
+        self.layer_scale_init_value = layer_scale_init_value
+        self.use_mean_pooling = use_mean_pooling
+
+
+class InternVLConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of an [`InternVLForConditionalGeneration`]. It is used to instantiate an
+    InternVL model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of InternVL3-1B.
+    e.g. [OpenGVLab/InternVL3-1B-hf](https://huggingface.co/OpenGVLab/InternVL3-1B-hf)
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vision_config (`Union[AutoConfig, dict]`,  *optional*, defaults to `InternVLVisionConfig`):
+            The config object or dictionary of the vision backbone.
+        text_config (`Union[AutoConfig, dict]`, *optional*, defaults to `Qwen2Config`):
+            The config object or dictionary of the text backbone. A dictionary without a `model_type` key is
+            treated as a `"qwen2"` configuration; the dictionary itself is not modified.
+        image_token_id (`int`, *optional*, defaults to 151667):
+            The image token index to encode the image prompt.
+        image_seq_length (`int`, *optional*, defaults to 256):
+            Number of image tokens to use per image patch.
+        downsample_ratio (`float`, *optional*, defaults to 0.5):
+            Factor by which to downsample the image.
+        projector_hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the projector.
+        vision_feature_layer (`int`, *optional*, defaults to -1):
+            The index of the layer to use as the image features.
+        vision_feature_select_strategy (`str`, *optional*, defaults to `"default"`):
+            The feature selection strategy used to select the vision feature from the vision backbone.
+            Can be one of `"default"` or `"full"`.
+
+    ```python
+    >>> from transformers import InternVLForConditionalGeneration, InternVLConfig
+
+    >>> # Initializing an InternVL style configuration
+    >>> configuration = InternVLConfig()
+
+    >>> # Initializing a model (with random weights) from the OpenGVLab/InternVL3-1B-hf configuration
+    >>> model = InternVLForConditionalGeneration(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "internvl"
+    sub_configs = {"text_config": AutoConfig, "vision_config": InternVLVisionConfig}
+
+    def __init__(
+        self,
+        vision_config=None,
+        text_config=None,
+        image_token_id=151667,
+        image_seq_length=256,
+        downsample_ratio=0.5,
+        projector_hidden_act="gelu",
+        vision_feature_layer=-1,
+        vision_feature_select_strategy="default",
+        **kwargs,
+    ):
+        self.image_token_id = image_token_id
+        self.image_seq_length = image_seq_length
+        self.downsample_ratio = downsample_ratio
+        self.projector_hidden_act = projector_hidden_act
+        self.vision_feature_layer = vision_feature_layer
+        self.vision_feature_select_strategy = vision_feature_select_strategy
+
+        # NOTE(review): a `vision_config` of any other type falls through all three branches and leaves
+        # `self.vision_config` unset - confirm whether that should raise instead.
+        if isinstance(vision_config, dict):
+            self.vision_config = InternVLVisionConfig(**vision_config)
+        elif isinstance(vision_config, InternVLVisionConfig):
+            self.vision_config = vision_config
+        elif vision_config is None:
+            self.vision_config = InternVLVisionConfig()
+
+        if isinstance(text_config, dict):
+            # Fill in the default `model_type` on a copy so the caller's dictionary is left untouched;
+            # a `model_type` already present in `text_config` takes precedence over the default.
+            text_config = {"model_type": "qwen2", **text_config}
+            text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
+        elif text_config is None:
+            text_config = CONFIG_MAPPING["qwen2"]()
+
+        self.text_config = text_config
+
+        super().__init__(**kwargs)
+
+
+__all__ = ["InternVLVisionConfig", "InternVLConfig"]
diff --git a/phivenv/Lib/site-packages/transformers/models/internvl/modeling_internvl.py b/phivenv/Lib/site-packages/transformers/models/internvl/modeling_internvl.py
new file mode 100644
index 0000000000000000000000000000000000000000..1bd47c73622404ca642444c1e2c8ea8df0cc1e4a
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/internvl/modeling_internvl.py
@@ -0,0 +1,1006 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/internvl/modular_internvl.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_internvl.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# coding=utf-8
+# Copyright 2025 HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import collections.abc
+from dataclasses import dataclass
+from typing import Callable, Optional, Union
+
+import torch
+import torch.nn as nn
+
+from ...activations import ACT2FN
+from ...cache_utils import Cache
+from ...generation import GenerationMixin
+from ...integrations import use_kernel_forward_from_hub
+from ...modeling_flash_attention_utils import FlashAttentionKwargs
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPast, BaseModelOutputWithPooling
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from ...processing_utils import Unpack
+from ...utils import ModelOutput, TransformersKwargs, auto_docstring, can_return_tuple, torch_int
+from ..auto import AutoModel
+from .configuration_internvl import InternVLConfig, InternVLVisionConfig
+
+
+@use_kernel_forward_from_hub("RMSNorm")
+class InternVLVisionRMSNorm(nn.Module):
+    def __init__(self, hidden_size, eps=1e-6):
+        """
+        RMS normalization with a learnable per-feature weight (equivalent to T5LayerNorm).
+
+        `hidden_size` is the length of the weight vector and `eps` is added to the mean square before the
+        reciprocal square root is taken.
+        """
+        super().__init__()
+        self.variance_epsilon = eps
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+
+    def forward(self, hidden_states):
+        """Normalize `hidden_states` over the last dimension in float32, then scale by `weight`."""
+        source_dtype = hidden_states.dtype
+        upcast = hidden_states.to(torch.float32)
+        mean_square = upcast.pow(2).mean(-1, keepdim=True)
+        normalized = upcast * torch.rsqrt(mean_square + self.variance_epsilon)
+        # Cast back to the input dtype before applying the weight
+        return self.weight * normalized.to(source_dtype)
+
+    def extra_repr(self):
+        """Show the weight shape and epsilon in the module's printed representation."""
+        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
+
+
+def eager_attention_forward(
+    module: nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: Optional[torch.Tensor],
+    scaling: float,
+    dropout: float = 0.0,
+    **kwargs,
+):
+    """
+    Plain scaled dot-product attention: `softmax(query @ key^T * scaling + mask) @ value`.
+
+    Args:
+        module: Only its `training` flag is read, to decide whether dropout is active.
+        query, key, value: 4-D tensors; the attention class in this file passes them as
+            `(batch, num_heads, seq_len, head_dim)`.
+        attention_mask: Optional additive mask; its last dimension is cut to the number of key positions.
+        scaling: Factor applied to the raw attention scores.
+        dropout: Dropout probability applied to the attention weights.
+        **kwargs: Accepted and ignored.
+
+    Returns:
+        `(attn_output, attn_weights)`, where `attn_output` has dimensions 1 and 2 swapped back and is
+        contiguous, and `attn_weights` are the weights after dropout.
+    """
+    scores = torch.matmul(query, key.transpose(2, 3)) * scaling
+    if attention_mask is not None:
+        num_keys = key.shape[-2]
+        scores = scores + attention_mask[:, :, :, :num_keys]
+
+    # No upcasting of the attention weights to float32 in this implementation
+    probs = nn.functional.softmax(scores, dim=-1)
+    probs = nn.functional.dropout(probs, p=dropout, training=module.training)
+    context = torch.matmul(probs, value).transpose(1, 2).contiguous()
+
+    return context, probs
+
+
+class InternVLVisionAttention(nn.Module):
+    """Attention Class for InternVL Vision Encoder"""
+
+    def __init__(self, config: InternVLVisionConfig):
+        """
+        Build the query/key/value projections, the optional query/key RMS norms and the output projection.
+
+        Raises:
+            ValueError: If `config.hidden_size` is not divisible by `config.num_attention_heads`.
+        """
+        super().__init__()
+        self.config = config
+        self.embed_dim = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = self.embed_dim // self.num_heads
+        if self.head_dim * self.num_heads != self.embed_dim:
+            raise ValueError(
+                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
+                f" {self.num_heads})."
+            )
+        self.scale = self.head_dim**-0.5
+        self.attention_dropout = config.attention_dropout
+        proj_dropout = config.projection_dropout
+        qk_norm = config.use_qk_norm
+
+        # Needed for flash attention
+        self.is_causal = False
+
+        self.q_proj = nn.Linear(self.embed_dim, self.num_heads * self.head_dim, bias=config.attention_bias)
+        self.k_proj = nn.Linear(self.embed_dim, self.num_heads * self.head_dim, bias=config.attention_bias)
+        self.v_proj = nn.Linear(self.embed_dim, self.num_heads * self.head_dim, bias=config.attention_bias)
+        self.projection_layer = nn.Linear(self.embed_dim, self.embed_dim)
+        self.projection_dropout = nn.Dropout(proj_dropout) if proj_dropout > 0 else nn.Identity()
+
+        # The norms span the full embedding dimension: they are applied before the split into heads
+        self.q_norm = InternVLVisionRMSNorm(self.embed_dim) if qk_norm else nn.Identity()
+        self.k_norm = InternVLVisionRMSNorm(self.embed_dim) if qk_norm else nn.Identity()
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ):
+        """
+        Apply self-attention to `hidden_states` of shape `(batch_size, seq_len, embed_dim)`.
+
+        Args:
+            hidden_states: Input sequence.
+            attention_mask: Optional mask handed unchanged to the attention implementation.
+            output_attentions: When truthy, the attention weights are returned as the second element.
+            **kwargs: Forwarded to the attention implementation.
+
+        Returns:
+            `(output, attn_weights)` if `output_attentions` is truthy, otherwise `(output, None)`.
+        """
+        batch_size, seq_len, _ = hidden_states.size()
+
+        query_states = self.q_proj(hidden_states)
+        key_states = self.k_proj(hidden_states)
+        value_states = self.v_proj(hidden_states)
+
+        query_states = self.q_norm(query_states)
+        key_states = self.k_norm(key_states)
+
+        # Split into heads: (batch, seq, embed) -> (batch, heads, seq, head_dim)
+        query_states = query_states.reshape(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = key_states.reshape(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
+        value_states = value_states.reshape(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
+
+        # Use the eager implementation unless the config selects another attention backend
+        attention_interface: Callable = eager_attention_forward
+        if self.config._attn_implementation != "eager":
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            dropout=0.0 if not self.training else self.attention_dropout,
+            scaling=self.scale,
+            is_causal=False,
+            **kwargs,
+        )
+        # Merge the heads back into a single embedding dimension
+        attn_output = attn_output.reshape(batch_size, seq_len, self.embed_dim)
+
+        output = self.projection_layer(attn_output)
+        output = self.projection_dropout(output)
+
+        outputs = (output, attn_weights) if output_attentions else (output, None)
+        return outputs
+
+
+@auto_docstring
+class InternVLVisionPreTrainedModel(PreTrainedModel):
+    # Class-level settings: config type, main input name and the supported attention/checkpointing features.
+    config: InternVLVisionConfig
+    base_model_prefix = "internvl_vision"
+    main_input_name = "pixel_values"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["InternVLVisionLayer"]
+    _supports_sdpa = True
+    _supports_flash_attn = True
+    _supports_flex_attn = True
+    _supports_attention_backend = True
+
+    def _init_weights(self, module):
+        """Run the parent initialization, then reset the InternVL-specific parameters of `module`."""
+        super()._init_weights(module)
+        if isinstance(module, InternVLVisionEmbeddings):
+            module.cls_token.data.zero_()
+            # The mask token and the position embeddings are optional and may be `None`
+            for optional_parameter in (module.mask_token, module.position_embeddings):
+                if optional_parameter is not None:
+                    optional_parameter.data.zero_()
+        elif isinstance(module, InternVLVisionLayer):
+            initial_scale = self.config.layer_scale_init_value
+            for layer_scale in (module.lambda_1, module.lambda_2):
+                layer_scale.data.fill_(initial_scale)
+
+
+# Declares no fields of its own: the class body is only a docstring that describes how `pooler_output`
+# is to be read for the InternVL vision model.
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Class for outputs of [`InternVLVisionModel`].
+    """
+)
+class InternVLVisionModelOutputWithPooling(BaseModelOutputWithPooling):
+    r"""
+    pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
+        Average of the last layer hidden states of the patch tokens (excluding the *[CLS]* token) if
+        *config.use_mean_pooling* is set to True. If set to False, then the final hidden state of the *[CLS]* token
+        will be returned.
+    """
+
+
+class InternVLVisionPatchEmbeddings(nn.Module):
+    """
+    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
+    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
+    Transformer.
+    """
+
+    def __init__(self, config):
+        """
+        Read `image_size`, `patch_size` (both indexable pairs), `num_channels` and `hidden_size` from `config`
+        and create the patch projection, a convolution whose kernel size and stride both equal `patch_size`.
+        """
+        super().__init__()
+        image_size, patch_size = config.image_size, config.patch_size
+        num_channels, hidden_size = config.num_channels, config.hidden_size
+
+        # Number of whole patches along each image axis for the configured image size
+        patch_shape = (image_size[0] // patch_size[0], image_size[1] // patch_size[1])
+        num_patches = patch_shape[1] * patch_shape[0]
+        self.image_size = image_size
+        self.patch_size = patch_size
+        self.num_channels = num_channels
+        self.num_patches = num_patches
+        self.patch_shape = patch_shape
+
+        self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)
+
+    def forward(self, pixel_values: torch.Tensor) -> tuple[torch.Tensor, tuple[int, int]]:
+        """
+        Project `pixel_values` into a sequence of patch embeddings.
+
+        Returns:
+            `(embeddings, (patch_height, patch_width))`: embeddings of shape
+            `(batch_size, patch_height * patch_width, hidden_size)` and the patch grid of this input.
+
+        Raises:
+            ValueError: If `pixel_values` is not 4-dimensional or its channel count differs from the
+                configured `num_channels`.
+        """
+        # The 4-way unpacking also rejects inputs that are not 4-dimensional
+        _, num_channels, _, _ = pixel_values.shape
+        if num_channels != self.num_channels:
+            raise ValueError(
+                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
+            )
+
+        embeddings = self.projection(pixel_values)
+        patch_height, patch_width = embeddings.shape[2], embeddings.shape[3]
+        # (batch, hidden, grid_h, grid_w) -> (batch, grid_h * grid_w, hidden)
+        embeddings = embeddings.flatten(2).transpose(1, 2)
+
+        return embeddings, (patch_height, patch_width)
+
+
+# Based on timm implementation, which can be found here:
+# https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py
+class InternVLVisionEmbeddings(nn.Module):
+    """
+    Construct the CLS token, position and patch embeddings. Optionally, also the mask token.
+
+    """
+
+    def __init__(self, config: InternVLVisionConfig) -> None:
+        super().__init__()
+
+        self.cls_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
+        if config.use_mask_token:
+            self.mask_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
+        else:
+            self.mask_token = None
+        self.patch_embeddings = InternVLVisionPatchEmbeddings(config)
+        self.patch_size = config.patch_size
+        self.image_size = (
+            config.image_size
+            if isinstance(config.image_size, collections.abc.Iterable)
+            else (config.image_size, config.image_size)
+        )
+        num_patches = self.patch_embeddings.num_patches
+        if config.use_absolute_position_embeddings:
+            self.position_embeddings = nn.Parameter(torch.zeros(1, num_patches + 1, config.hidden_size))
+        else:
+            self.position_embeddings = None
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
+        """
+        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
+        images. This method is also adapted to support torch.jit tracing.
+
+        Adapted from:
+        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
+        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
+        """
+
+        num_patches = embeddings.shape[1] - 1
+        num_positions = self.position_embeddings.shape[1] - 1
+
+        # always interpolate when tracing to ensure the exported model works for dynamic input shapes
+        if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
+            return self.position_embeddings
+
+        class_pos_embed = self.position_embeddings[:, :1]
+        patch_pos_embed = self.position_embeddings[:, 1:]
+
+        dim = embeddings.shape[-1]
+
+        new_height = height // self.patch_size[0]
+        new_width = width // self.patch_size[1]
+
+        sqrt_num_positions = torch_int(num_positions**0.5)
+        patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
+        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)
+
+        patch_pos_embed = nn.functional.interpolate(
+            patch_pos_embed,
+            size=(new_height, new_width),
+            mode="bicubic",
+            align_corners=False,
+        )
+
+        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
+
+        return torch.cat((class_pos_embed, patch_pos_embed), dim=1)
+
+    def forward(
+        self,
+        pixel_values: torch.Tensor,
+        bool_masked_pos: Optional[torch.BoolTensor] = None,
+    ) -> torch.Tensor:
+        _, _, height, width = pixel_values.shape
+        embeddings, (patch_height, patch_width) = self.patch_embeddings(pixel_values)
+        batch_size, seq_len, _ = embeddings.size()
+
+        if bool_masked_pos is not None:
+            mask_tokens = self.mask_token.expand(batch_size, seq_len, -1)
+            # replace the masked visual tokens by mask_tokens
+            w = bool_masked_pos.unsqueeze(-1).type_as(mask_tokens)
+            embeddings = embeddings * (1 - w) + mask_tokens * w
+
+        cls_tokens = self.cls_token.expand(batch_size, -1, -1)
+        embeddings = torch.cat((cls_tokens, embeddings), dim=1)
+
+        if self.position_embeddings is not None:
+            embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
+
+        embeddings = self.dropout(embeddings)
+
+        return embeddings, (patch_height, patch_width)
+
+
+class InternVLVisionMLP(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.activation_fn = ACT2FN[config.hidden_act]
+        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
+        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.fc1(hidden_states)
+        hidden_states = self.activation_fn(hidden_states)
+        hidden_states = self.fc2(hidden_states)
+        return hidden_states
+
+
+NORM2FN = {"layer_norm": nn.LayerNorm, "rms_norm": InternVLVisionRMSNorm}
+
+
+class InternVLVisionLayer(GradientCheckpointingLayer):
+    """This corresponds to the Block class in the timm implementation."""
+
+    def __init__(self, config: InternVLVisionConfig) -> None:
+        super().__init__()
+        self.chunk_size_feed_forward = config.chunk_size_feed_forward
+        self.seq_len_dim = 1
+        self.attention = InternVLVisionAttention(config)
+        self.mlp = InternVLVisionMLP(config)
+        # InternVL uses different layernorm implementations for different models
+        self.layernorm_before = NORM2FN[config.norm_type](config.hidden_size, eps=config.layer_norm_eps)
+        self.layernorm_after = NORM2FN[config.norm_type](config.hidden_size, eps=config.layer_norm_eps)
+
+        init_values = config.layer_scale_init_value
+        self.lambda_1 = nn.Parameter(init_values * torch.ones(config.hidden_size), requires_grad=True)
+        self.lambda_2 = nn.Parameter(init_values * torch.ones(config.hidden_size), requires_grad=True)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        output_attentions: bool = False,
+    ) -> Union[tuple[torch.Tensor], tuple[torch.Tensor, torch.Tensor]]:
+        attention_output, attention_weights = self.attention(
+            self.layernorm_before(hidden_states),  # in InternVLVision, layernorm is applied before self-attention
+            output_attentions=output_attentions,
+        )
+
+        attention_output = self.lambda_1 * attention_output
+
+        # first residual connection
+        hidden_states = attention_output + hidden_states
+
+        # in InternVLVision, layernorm is also applied after self-attention
+        layer_output = self.layernorm_after(hidden_states)
+
+        layer_output = self.mlp(layer_output)
+        layer_output = self.dropout(layer_output)
+
+        if self.lambda_2 is not None:
+            layer_output = self.lambda_2 * layer_output
+
+        # second residual connection
+        layer_output = layer_output + hidden_states
+
+        return layer_output, attention_weights
+
+
+class InternVLVisionEncoder(nn.Module):
+    def __init__(self, config: InternVLVisionConfig) -> None:
+        super().__init__()
+        self.config = config
+        self.layer = nn.ModuleList([InternVLVisionLayer(config) for i in range(config.num_hidden_layers)])
+        self.gradient_checkpointing = False
+
+    @can_return_tuple
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+    ) -> Union[tuple, BaseModelOutput]:
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attentions = () if output_attentions else None
+
+        for i, layer_module in enumerate(self.layer):
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)
+
+            layer_outputs = layer_module(hidden_states, output_attentions)
+
+            hidden_states = layer_outputs[0]
+
+            if output_attentions:
+                all_self_attentions = all_self_attentions + (layer_outputs[1],)
+
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+
+        return BaseModelOutput(
+            last_hidden_state=hidden_states,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attentions,
+        )
+
+
+@auto_docstring
+class InternVLVisionModel(InternVLVisionPreTrainedModel):
+    def __init__(self, config: InternVLVisionConfig) -> None:
+        super().__init__(config)
+        self.config = config
+
+        self.embeddings = InternVLVisionEmbeddings(config)
+        self.encoder = InternVLVisionEncoder(config)
+
+        self.layernorm = (
+            nn.Identity() if config.use_mean_pooling else nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        )
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.embeddings.patch_embeddings
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        pixel_values: torch.Tensor,
+        bool_masked_pos: Optional[torch.BoolTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+    ) -> Union[tuple, InternVLVisionModelOutputWithPooling]:
+        r"""
+        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
+            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+
+        embedding_output, _ = self.embeddings(pixel_values, bool_masked_pos=bool_masked_pos)
+
+        encoder_outputs = self.encoder(
+            embedding_output,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+        )
+        sequence_output = encoder_outputs[0]
+        sequence_output = self.layernorm(sequence_output)
+
+        return InternVLVisionModelOutputWithPooling(
+            last_hidden_state=sequence_output,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+        )
+
+
+@auto_docstring
+class InternVLPreTrainedModel(PreTrainedModel):
+    config: InternVLConfig
+    base_model_prefix = ""
+    supports_gradient_checkpointing = True
+    _skip_keys_device_placement = "past_key_values"
+
+    _supports_flash_attn = True
+    _supports_sdpa = True
+
+    _can_compile_fullgraph = True
+    _supports_flex_attn = True
+    _supports_attention_backend = True
+
+
+class InternVLMultiModalProjector(nn.Module):
+    def __init__(self, config: InternVLConfig):
+        super().__init__()
+        self.layer_norm = nn.LayerNorm(config.vision_config.hidden_size * int(1 / config.downsample_ratio) ** 2)
+        self.linear_1 = nn.Linear(
+            config.vision_config.hidden_size * int(1 / config.downsample_ratio) ** 2, config.text_config.hidden_size
+        )
+        self.act = ACT2FN[config.projector_hidden_act]
+        self.linear_2 = nn.Linear(config.text_config.hidden_size, config.text_config.hidden_size)
+
+    def forward(self, image_features):
+        hidden_states = self.layer_norm(image_features)
+        hidden_states = self.linear_1(hidden_states)
+        hidden_states = self.act(hidden_states)
+        hidden_states = self.linear_2(hidden_states)
+        return hidden_states
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Base class for InternVL outputs, with hidden states and attentions.
+    """
+)
+class InternVLModelOutputWithPast(BaseModelOutputWithPast):
+    r"""
+    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
+        `(batch_size, num_heads, sequence_length, embed_size_per_head)`.
+
+        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
+        `past_key_values` input) to speed up sequential decoding.
+    image_hidden_states (`torch.FloatTensor`, *optional*):
+        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
+        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
+    """
+
+    # The only field added on top of those inherited from `BaseModelOutputWithPast`.
+    image_hidden_states: Optional[torch.FloatTensor] = None
+
+
+@auto_docstring(
+    custom_intro="""
+    The InternVL model which consists of a vision backbone and a language model, without a language modeling head.
+    """
+)
+class InternVLModel(InternVLPreTrainedModel):
+    _checkpoint_conversion_mapping = {"language_model.model": "language_model"}
+
+    def __init__(self, config: InternVLConfig):
+        super().__init__(config)
+        self.vision_tower = AutoModel.from_config(config.vision_config)
+
+        self.multi_modal_projector = InternVLMultiModalProjector(config)
+        self.language_model = AutoModel.from_config(config.text_config)
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.language_model.get_input_embeddings()
+
+    def set_input_embeddings(self, value):
+        self.language_model.set_input_embeddings(value)
+
+    def set_decoder(self, decoder):
+        self.language_model = decoder
+
+    def get_decoder(self):
+        return self.language_model
+
+    def get_image_features(
+        self,
+        pixel_values: torch.FloatTensor,
+        vision_feature_layer: Optional[Union[int, list[int]]] = None,
+        vision_feature_select_strategy: Optional[str] = None,
+        **kwargs,
+    ):
+        """
+        Obtains image last hidden states from the vision tower and apply multimodal projection.
+
+        Args:
+            pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`)
+               The tensors corresponding to the input images.
+            vision_feature_layer (`int` or `list[int]`):
+                Layer index or list of layer indices to extract features from.
+        Returns:
+            vision_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`.
+        """
+        vision_feature_layer = (
+            vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer
+        )
+        vision_feature_select_strategy = (
+            vision_feature_select_strategy
+            if vision_feature_select_strategy is not None
+            else self.config.vision_feature_select_strategy
+        )
+        pixel_values = pixel_values.to(dtype=self.dtype)  # fp16 compatibility
+
+        downsample_ratio = self.config.downsample_ratio
+        if vision_feature_layer == -1:
+            vision_features = self.vision_tower(pixel_values=pixel_values).last_hidden_state
+        else:
+            vision_features = self.vision_model(pixel_values=pixel_values).hidden_states[vision_feature_layer]
+        if vision_feature_select_strategy == "default":
+            vision_features = vision_features[:, 1:, :]
+
+        # Calculate dimensions based on vision features
+        channels = vision_features.shape[1]
+        feature_size = int(channels**0.5)
+        batch_size = vision_features.shape[0]
+
+        # Reshape tensor to spatial dimensions
+        vision_features = vision_features.reshape(batch_size, feature_size, feature_size, -1)
+
+        # Apply downsampling using pixel shuffle
+        vision_features = self.pixel_shuffle(vision_features, scale_factor=downsample_ratio)
+
+        # Reshape tensor to prepare for projection
+        vision_features = vision_features.reshape(batch_size, -1, vision_features.shape[-1])
+
+        # Project features through multi-modal projector
+        vision_features = self.multi_modal_projector(vision_features)
+        return vision_features
+
+    def get_placeholder_mask(
+        self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor, image_features: torch.FloatTensor
+    ):
+        """
+        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
+        equal to the length of multimodal features. If the lengths are different, an error is raised.
+        """
+        if input_ids is None:
+            special_image_mask = inputs_embeds == self.get_input_embeddings()(
+                torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device)
+            )
+            special_image_mask = special_image_mask.all(-1)
+        else:
+            special_image_mask = input_ids == self.config.image_token_id
+
+        n_image_tokens = special_image_mask.sum()
+        special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
+        n_image_features = image_features.shape[0] * image_features.shape[1]
+        if inputs_embeds[special_image_mask].numel() != image_features.numel():
+            raise ValueError(
+                f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
+            )
+        return special_image_mask
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        pixel_values: torch.FloatTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        vision_feature_layer: Optional[Union[int, list[int]]] = None,
+        vision_feature_select_strategy: Optional[str] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> Union[tuple, InternVLModelOutputWithPast]:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        vision_feature_layer = (
+            vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer
+        )
+        vision_feature_select_strategy = (
+            vision_feature_select_strategy
+            if vision_feature_select_strategy is not None
+            else self.config.vision_feature_select_strategy
+        )
+
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+        if inputs_embeds is None:
+            inputs_embeds = self.get_input_embeddings()(input_ids)
+
+        if pixel_values is not None:
+            image_features = self.get_image_features(
+                pixel_values=pixel_values,
+                vision_feature_layer=vision_feature_layer,
+                vision_feature_select_strategy=vision_feature_select_strategy,
+            )
+            image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
+            special_image_mask = self.get_placeholder_mask(
+                input_ids, inputs_embeds=inputs_embeds, image_features=image_features
+            )
+            inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
+
+        outputs = self.language_model(
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=True,
+            cache_position=cache_position,
+            **kwargs,
+        )
+
+        return InternVLModelOutputWithPast(
+            last_hidden_state=outputs.last_hidden_state,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            image_hidden_states=image_features if pixel_values is not None else None,
+        )
+
+    def pixel_shuffle(self, vision_features: torch.Tensor, scale_factor: float = 0.5):
+        """Perform pixel shuffle downsampling on vision features.
+
+        Args:
+            vision_features (`torch.Tensor`):
+                Input tensor of shape (batch_size, width, height, channels).
+            scale_factor (`float`, *optional*, defaults to `0.5`):
+                Factor by which to downsample. Default is 0.5, which halves the dimensions.
+
+        Returns:
+            vision_features (`torch.Tensor`):
+                Downsampled tensor of shape (batch_size, height*scale_factor, width*scale_factor, channels/(scale_factor^2)).
+        """
+        batch_size, width, height, channels = vision_features.size()
+
+        if height % scale_factor != 0 or width % scale_factor != 0:
+            raise ValueError("Height and width must be divisible by scale_factor for proper downsampling.")
+
+        # Reshape to allow downsampling
+        vision_features = vision_features.view(
+            batch_size, width, int(height * scale_factor), int(channels / scale_factor)
+        )
+        # Permute dimensions to align downsampled axis correctly
+        vision_features = vision_features.permute(0, 2, 1, 3).contiguous()
+
+        # Reshape to achieve final downsampled dimensions
+        vision_features = vision_features.view(
+            batch_size, int(height * scale_factor), int(width * scale_factor), int(channels / (scale_factor**2))
+        )
+
+        # Swap height and width back for proper orientation
+        vision_features = vision_features.permute(0, 2, 1, 3).contiguous()
+
+        return vision_features
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Base class for InternVL causal language model (or autoregressive) outputs.
+    """
+)
+class InternVLCausalLMOutputWithPast(ModelOutput):
+    r"""
+    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+        Language modeling loss (for next-token prediction).
+    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
+        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
+        `(batch_size, num_heads, sequence_length, embed_size_per_head)`.
+
+        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
+        `past_key_values` input) to speed up sequential decoding.
+    image_hidden_states (`torch.FloatTensor`, *optional*):
+        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
+        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
+    """
+
+    # All fields default to None, so an instance can be built from any subset of them.
+    loss: Optional[torch.FloatTensor] = None
+    logits: Optional[torch.FloatTensor] = None
+    past_key_values: Optional[list[torch.FloatTensor]] = None
+    hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    attentions: Optional[tuple[torch.FloatTensor]] = None
+    image_hidden_states: Optional[torch.FloatTensor] = None
+
+
+@auto_docstring(
+    custom_intro="""
+    The INTERNVL model which consists of a vision backbone and a language model.
+    """
+)
+class InternVLForConditionalGeneration(InternVLPreTrainedModel, GenerationMixin):
+    _checkpoint_conversion_mapping = {
+        "^language_model.model": "model.language_model",
+        "^vision_tower": "model.vision_tower",
+        "^multi_modal_projector": "model.multi_modal_projector",
+        "^language_model.lm_head": "lm_head",
+    }
+    _tied_weights_keys = ["lm_head.weight"]
+
+    def __init__(self, config: InternVLConfig):
+        super().__init__(config)
+        self.model = InternVLModel(config)
+        self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.model.get_input_embeddings()
+
+    def set_input_embeddings(self, value):
+        self.model.set_input_embeddings(value)
+
+    def get_output_embeddings(self) -> nn.Module:
+        return self.lm_head
+
+    def set_decoder(self, decoder):
+        self.model.set_decoder(decoder)
+
+    def get_decoder(self):
+        return self.model.get_decoder()
+
+    def get_image_features(
+        self,
+        pixel_values: torch.FloatTensor,
+        vision_feature_layer: Optional[Union[int, list[int]]] = None,
+        vision_feature_select_strategy: Optional[str] = None,
+        **kwargs,
+    ):
+        return self.model.get_image_features(
+            pixel_values=pixel_values,
+            vision_feature_layer=vision_feature_layer,
+            vision_feature_select_strategy=vision_feature_select_strategy,
+            **kwargs,
+        )
+
+    # Make modules available through conditional class for BC
+    @property
+    def language_model(self):
+        return self.model.language_model
+
+    @property
+    def vision_tower(self):
+        return self.model.vision_tower
+
+    @property
+    def multi_modal_projector(self):
+        return self.model.multi_modal_projector
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        pixel_values: torch.FloatTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        vision_feature_layer: Optional[Union[int, list[int]]] = None,
+        vision_feature_select_strategy: Optional[str] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
+        image_sizes: Optional[torch.Tensor] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> Union[tuple, InternVLCausalLMOutputWithPast]:
+        r"""
+        Example:
+
+        ```python
+        >>> import torch
+        >>> from transformers import AutoProcessor, AutoModelForImageTextToText
+
+        >>> torch_device = "cuda"
+        >>> processor = AutoProcessor.from_pretrained("OpenGVLab/InternVL3-1B-hf")
+        >>> model = AutoModelForImageTextToText.from_pretrained(
+        ...     "OpenGVLab/InternVL3-1B-hf", dtype=torch.bfloat16, device_map=torch_device
+        ... )
+
+        >>> messages = [
+        ...     {
+        ...         "role": "user",
+        ...         "content": [
+        ...             {
+        ...                 "type": "image",
+        ...                 "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg",
+        ...             },
+        ...             {
+        ...                 "type": "image",
+        ...                 "url": "https://thumbs.dreamstime.com/b/golden-gate-bridge-san-francisco-purple-flowers-california-echium-candicans-36805947.jpg",
+        ...             },
+        ...             {"type": "text", "text": "These images depict two different landmarks. Can you identify them?"},
+        ...         ],
+        ...     },
+        ... ]
+
+        >>> inputs = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt").to(torch_device)
+        >>> generate_ids = model.generate(**inputs, max_new_tokens=200)
+        >>> print(processor.decode(generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True))
+        The images depict the Statue of Liberty and the Golden Gate Bridge.
+        ```"""
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        vision_feature_layer = (
+            vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer
+        )
+        vision_feature_select_strategy = (
+            vision_feature_select_strategy
+            if vision_feature_select_strategy is not None
+            else self.config.vision_feature_select_strategy
+        )
+
+        outputs = self.model(
+            input_ids=input_ids,
+            pixel_values=pixel_values,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            vision_feature_layer=vision_feature_layer,
+            vision_feature_select_strategy=vision_feature_select_strategy,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=True,
+            cache_position=cache_position,
+            image_sizes=image_sizes,
+            **kwargs,
+        )
+
+        hidden_states = outputs[0]
+        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
+        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+        logits = self.lm_head(hidden_states[:, slice_indices, :])
+
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(
+                logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **kwargs
+            )
+
+        return InternVLCausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            image_hidden_states=outputs.image_hidden_states,
+        )
+
+    def prepare_inputs_for_generation(
+        self,
+        input_ids,
+        past_key_values=None,
+        inputs_embeds=None,
+        pixel_values=None,
+        attention_mask=None,
+        cache_position=None,
+        logits_to_keep=None,
+        **kwargs,
+    ):
+        # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
+
+        model_inputs = super().prepare_inputs_for_generation(
+            input_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            cache_position=cache_position,
+            logits_to_keep=logits_to_keep,
+            **kwargs,
+        )
+
+        if cache_position[0] == 0:
+            # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore
+            # Otherwise we need pixel values to be passed to model
+            model_inputs["pixel_values"] = pixel_values
+
+        return model_inputs
+
+
+# Public names exported by this module.
+__all__ = [
+    "InternVLVisionPreTrainedModel",
+    "InternVLVisionModel",
+    "InternVLPreTrainedModel",
+    "InternVLModel",
+    "InternVLForConditionalGeneration",
+]
diff --git a/phivenv/Lib/site-packages/transformers/models/internvl/modular_internvl.py b/phivenv/Lib/site-packages/transformers/models/internvl/modular_internvl.py
new file mode 100644
index 0000000000000000000000000000000000000000..1e0757d6cf0cd6060493870261e11f1e185cbe48
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/internvl/modular_internvl.py
@@ -0,0 +1,698 @@
+# coding=utf-8
+# Copyright 2025 HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import collections.abc
+from dataclasses import dataclass
+from typing import Callable, Optional, Union
+
+import torch
+import torch.nn as nn
+import torch.utils.checkpoint
+
+from ...activations import ACT2FN
+from ...cache_utils import Cache
+from ...modeling_flash_attention_utils import FlashAttentionKwargs
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from ...processing_utils import Unpack
+from ...utils import auto_docstring, can_return_tuple, logging, torch_int
+from ..clip.modeling_clip import CLIPMLP
+from ..janus.modeling_janus import JanusVisionAttention
+from ..llama.modeling_llama import LlamaRMSNorm
+from ..llava.modeling_llava import (
+    LlavaCausalLMOutputWithPast,
+    LlavaForConditionalGeneration,
+    LlavaModel,
+    LlavaModelOutputWithPast,
+    LlavaPreTrainedModel,
+)
+from .configuration_internvl import InternVLConfig, InternVLVisionConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+def eager_attention_forward(
+    module: nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: Optional[torch.Tensor],
+    scaling: float,
+    dropout: float = 0.0,
+    **kwargs,
+):
+    """Plain matmul/softmax attention; returns `(attn_output, attn_weights)`.
+
+    `attn_output` has axes 1 and 2 swapped and is contiguous; the weights are returned after dropout.
+    """
+    scores = torch.matmul(query, key.transpose(2, 3)) * scaling
+    if attention_mask is not None:
+        # The mask is additive and is cropped along its last axis to the key length.
+        scores = scores + attention_mask[:, :, :, : key.shape[-2]]
+
+    # No upcasting of the attention weights to float32 in this implementation
+    probs = nn.functional.softmax(scores, dim=-1)
+    probs = nn.functional.dropout(probs, p=dropout, training=module.training)
+    context = torch.matmul(probs, value).transpose(1, 2).contiguous()
+
+    return context, probs
+
+
+class InternVLVisionRMSNorm(LlamaRMSNorm):
+    pass  # behaviour inherited unchanged; used for the optional q/k norms and the "rms_norm" entry of NORM2FN
+
+
+class InternVLVisionAttention(JanusVisionAttention):
+    """Non-causal multi-head self-attention for the vision tower, with optional RMS norm on queries and keys."""
+
+    def __init__(self, config: InternVLVisionConfig):
+        super().__init__(config)
+        del self.num_key_value_groups
+        # Needed for flash attention
+        self.is_causal = False
+        qk_norm = config.use_qk_norm
+
+        self.q_norm = InternVLVisionRMSNorm(self.embed_dim) if qk_norm else nn.Identity()
+        self.k_norm = InternVLVisionRMSNorm(self.embed_dim) if qk_norm else nn.Identity()
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ):
+        """Return `(output, attn_weights)`; `attn_weights` is `None` unless `output_attentions` is truthy."""
+        batch_size, seq_len, _ = hidden_states.size()
+
+        query_states = self.q_proj(hidden_states)
+        key_states = self.k_proj(hidden_states)
+        value_states = self.v_proj(hidden_states)
+        # Normalisation (when enabled) runs on the full projection, before the split into heads.
+        query_states = self.q_norm(query_states)
+        key_states = self.k_norm(key_states)
+
+        query_states = query_states.reshape(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = key_states.reshape(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
+        value_states = value_states.reshape(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
+
+        attention_interface: Callable = eager_attention_forward
+        if self.config._attn_implementation != "eager":
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            dropout=0.0 if not self.training else self.attention_dropout,
+            scaling=self.scale,
+            is_causal=False,
+            **kwargs,
+        )
+        attn_output = attn_output.reshape(batch_size, seq_len, self.embed_dim)
+
+        output = self.projection_layer(attn_output)
+        output = self.projection_dropout(output)
+        return (output, attn_weights) if output_attentions else (output, None)
+
+
+@auto_docstring
+class InternVLVisionPreTrainedModel(PreTrainedModel):
+    config: InternVLVisionConfig
+    base_model_prefix = "internvl_vision"
+    main_input_name = "pixel_values"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["InternVLVisionLayer"]
+    _supports_sdpa = True
+    _supports_flash_attn = True
+    _supports_flex_attn = True
+    _supports_attention_backend = True
+
+    def _init_weights(self, module):
+        """Apply the parent initialisation, then zero the embedding parameters and reset the layer-scale vectors."""
+        super()._init_weights(module)
+        if isinstance(module, InternVLVisionEmbeddings):
+            module.cls_token.data.zero_()
+            if module.mask_token is not None:  # only created when `config.use_mask_token` is set
+                module.mask_token.data.zero_()
+            if module.position_embeddings is not None:  # only created with absolute position embeddings
+                module.position_embeddings.data.zero_()
+        elif isinstance(module, InternVLVisionLayer):
+            module.lambda_1.data.fill_(self.config.layer_scale_init_value)  # layer-scale vectors
+            module.lambda_2.data.fill_(self.config.layer_scale_init_value)
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Class for outputs of [`InternVLVisionModel`].
+    """
+)
+class InternVLVisionModelOutputWithPooling(BaseModelOutputWithPooling):  # NOTE(review): see pooler_output below
+    r"""
+    pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
+        Average of the last layer hidden states of the patch tokens (excluding the *[CLS]* token) if
+        *config.use_mean_pooling* is set to True. If set to False, then the final hidden state of the *[CLS]* token
+        will be returned. NOTE(review): `InternVLVisionModel.forward` in this file never sets this field - confirm.
+    """
+
+
+class InternVLVisionPatchEmbeddings(nn.Module):
+    """
+    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
+    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
+    Transformer. `forward` also returns the `(patch_height, patch_width)` grid produced by the projection.
+    """
+
+    def __init__(self, config):
+        super().__init__()
+        image_size, patch_size = config.image_size, config.patch_size
+        num_channels, hidden_size = config.num_channels, config.hidden_size
+
+        # Both sizes are indexed as 2-element pairs; the nominal grid is the per-axis floor division.
+        patch_shape = (image_size[0] // patch_size[0], image_size[1] // patch_size[1])
+        self.image_size = image_size
+        self.patch_size = patch_size
+        self.num_channels = num_channels
+        self.num_patches = patch_shape[0] * patch_shape[1]
+        self.patch_shape = patch_shape
+
+        self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)
+
+    def forward(self, pixel_values: torch.Tensor) -> tuple[torch.Tensor, tuple[int, int]]:
+        _, num_channels, _, _ = pixel_values.shape
+        if num_channels != self.num_channels:
+            raise ValueError(
+                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
+            )
+
+        embeddings = self.projection(pixel_values)
+        patch_height, patch_width = embeddings.shape[2], embeddings.shape[3]
+        embeddings = embeddings.flatten(2).transpose(1, 2)  # (batch, hidden, h, w) -> (batch, h * w, hidden)
+
+        return embeddings, (patch_height, patch_width)
+
+
+# Based on timm implementation, which can be found here:
+# https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py
+class InternVLVisionEmbeddings(nn.Module):
+    """
+    Construct the CLS token, position and patch embeddings. Optionally, also the mask token.
+    `forward` returns the embeddings together with the `(patch_height, patch_width)` grid of the input.
+    """
+
+    def __init__(self, config: InternVLVisionConfig) -> None:
+        super().__init__()
+
+        self.cls_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
+        if config.use_mask_token:
+            self.mask_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
+        else:
+            self.mask_token = None
+        self.patch_embeddings = InternVLVisionPatchEmbeddings(config)
+        self.patch_size = config.patch_size
+        self.image_size = (
+            config.image_size
+            if isinstance(config.image_size, collections.abc.Iterable)
+            else (config.image_size, config.image_size)
+        )
+        num_patches = self.patch_embeddings.num_patches
+        if config.use_absolute_position_embeddings:
+            self.position_embeddings = nn.Parameter(torch.zeros(1, num_patches + 1, config.hidden_size))
+        else:
+            self.position_embeddings = None
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
+        """
+        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
+        images. This method is also adapted to support torch.jit tracing.
+
+        Adapted from:
+        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
+        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
+        """
+
+        num_patches = embeddings.shape[1] - 1
+        num_positions = self.position_embeddings.shape[1] - 1
+
+        # always interpolate when tracing to ensure the exported model works for dynamic input shapes
+        if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
+            return self.position_embeddings
+
+        class_pos_embed = self.position_embeddings[:, :1]
+        patch_pos_embed = self.position_embeddings[:, 1:]
+
+        dim = embeddings.shape[-1]
+
+        new_height = height // self.patch_size[0]
+        new_width = width // self.patch_size[1]
+        # The stored table is reshaped as a square grid, hence the integer square root.
+        sqrt_num_positions = torch_int(num_positions**0.5)
+        patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
+        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)
+
+        patch_pos_embed = nn.functional.interpolate(
+            patch_pos_embed,
+            size=(new_height, new_width),
+            mode="bicubic",
+            align_corners=False,
+        )
+
+        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
+
+        return torch.cat((class_pos_embed, patch_pos_embed), dim=1)
+
+    def forward(
+        self,
+        pixel_values: torch.Tensor,
+        bool_masked_pos: Optional[torch.BoolTensor] = None,
+    ) -> tuple[torch.Tensor, tuple[int, int]]:
+        """Embed `pixel_values`; returns the token embeddings (CLS first) and the `(patch_height, patch_width)` grid."""
+        _, _, height, width = pixel_values.shape
+        embeddings, (patch_height, patch_width) = self.patch_embeddings(pixel_values)
+        batch_size, seq_len, _ = embeddings.size()
+        if bool_masked_pos is not None:
+            mask_tokens = self.mask_token.expand(batch_size, seq_len, -1)  # needs `config.use_mask_token`
+            # replace the masked visual tokens by mask_tokens
+            w = bool_masked_pos.unsqueeze(-1).type_as(mask_tokens)
+            embeddings = embeddings * (1 - w) + mask_tokens * w
+
+        cls_tokens = self.cls_token.expand(batch_size, -1, -1)
+        embeddings = torch.cat((cls_tokens, embeddings), dim=1)
+
+        if self.position_embeddings is not None:
+            embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
+
+        embeddings = self.dropout(embeddings)
+
+        return embeddings, (patch_height, patch_width)
+
+
+class InternVLVisionMLP(CLIPMLP):
+    pass  # feed-forward implementation reused unchanged from CLIPMLP
+
+
+NORM2FN = {"layer_norm": nn.LayerNorm, "rms_norm": InternVLVisionRMSNorm}  # keyed by `config.norm_type`
+
+
+class InternVLVisionLayer(GradientCheckpointingLayer):
+    """This corresponds to the Block class in the timm implementation."""
+
+    def __init__(self, config: InternVLVisionConfig) -> None:
+        super().__init__()
+        self.chunk_size_feed_forward = config.chunk_size_feed_forward
+        self.seq_len_dim = 1
+        self.attention = InternVLVisionAttention(config)
+        self.mlp = InternVLVisionMLP(config)
+        # InternVL uses different layernorm implementations for different models
+        self.layernorm_before = NORM2FN[config.norm_type](config.hidden_size, eps=config.layer_norm_eps)
+        self.layernorm_after = NORM2FN[config.norm_type](config.hidden_size, eps=config.layer_norm_eps)
+        # Learnable per-channel scales for the attention branch (lambda_1) and the MLP branch (lambda_2).
+        init_values = config.layer_scale_init_value
+        self.lambda_1 = nn.Parameter(init_values * torch.ones(config.hidden_size), requires_grad=True)
+        self.lambda_2 = nn.Parameter(init_values * torch.ones(config.hidden_size), requires_grad=True)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        output_attentions: bool = False,
+    ) -> Union[tuple[torch.Tensor], tuple[torch.Tensor, torch.Tensor]]:
+        """Pre-norm attention and MLP branches, each scaled and added residually; returns `(output, attn_weights)`."""
+        attention_output, attention_weights = self.attention(
+            self.layernorm_before(hidden_states),  # in InternVLVision, layernorm is applied before self-attention
+            output_attentions=output_attentions,
+        )
+        attention_output = self.lambda_1 * attention_output
+
+        # first residual connection
+        hidden_states = attention_output + hidden_states
+
+        # in InternVLVision, layernorm is also applied after self-attention
+        layer_output = self.layernorm_after(hidden_states)
+
+        layer_output = self.mlp(layer_output)
+        layer_output = self.dropout(layer_output)
+
+        if self.lambda_2 is not None:  # always true for the Parameter created in __init__
+            layer_output = self.lambda_2 * layer_output
+
+        # second residual connection
+        layer_output = layer_output + hidden_states
+
+        return layer_output, attention_weights
+
+
+class InternVLVisionEncoder(nn.Module):
+    """Sequential stack of `config.num_hidden_layers` `InternVLVisionLayer` blocks."""
+
+    def __init__(self, config: InternVLVisionConfig) -> None:
+        super().__init__()
+        self.config = config
+        self.layer = nn.ModuleList([InternVLVisionLayer(config) for _ in range(config.num_hidden_layers)])
+        self.gradient_checkpointing = False
+
+    @can_return_tuple
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+    ) -> Union[tuple, BaseModelOutput]:
+        """Run the layers in order, optionally collecting each layer's input, the final output and the attentions."""
+        collected_states = () if output_hidden_states else None
+        collected_attentions = () if output_attentions else None
+
+        for block in self.layer:
+            if output_hidden_states:
+                collected_states += (hidden_states,)
+            layer_outputs = block(hidden_states, output_attentions)
+            hidden_states = layer_outputs[0]
+            if output_attentions:
+                collected_attentions += (layer_outputs[1],)
+
+        if output_hidden_states:
+            collected_states += (hidden_states,)
+
+        return BaseModelOutput(
+            last_hidden_state=hidden_states,
+            hidden_states=collected_states,
+            attentions=collected_attentions,
+        )
+
+
+@auto_docstring
+class InternVLVisionModel(InternVLVisionPreTrainedModel):
+    def __init__(self, config: InternVLVisionConfig) -> None:
+        super().__init__(config)
+        self.config = config
+
+        self.embeddings = InternVLVisionEmbeddings(config)
+        self.encoder = InternVLVisionEncoder(config)
+        # The final norm is replaced by an identity when `config.use_mean_pooling` is set.
+        self.layernorm = (
+            nn.Identity() if config.use_mean_pooling else nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        )
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.embeddings.patch_embeddings
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        pixel_values: torch.Tensor,
+        bool_masked_pos: Optional[torch.BoolTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+    ) -> Union[tuple, InternVLVisionModelOutputWithPooling]:
+        r"""
+        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
+            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        # The patch-grid size returned alongside the embeddings is not needed here.
+        embedding_output, _ = self.embeddings(pixel_values, bool_masked_pos=bool_masked_pos)
+
+        encoder_outputs = self.encoder(
+            embedding_output,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+        )
+        sequence_output = encoder_outputs[0]
+        sequence_output = self.layernorm(sequence_output)
+        # `pooler_output` is not passed below, so that field is left at its default.
+        return InternVLVisionModelOutputWithPooling(
+            last_hidden_state=sequence_output,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+        )
+
+
+class InternVLPreTrainedModel(LlavaPreTrainedModel):
+    pass  # no overrides: everything is inherited from LlavaPreTrainedModel
+
+
+INTERNVL_INPUTS_DOCSTRING = None  # NOTE(review): unused in the visible code; presumably read by the modular converter
+
+
+class InternVLMultiModalProjector(nn.Module):
+    """LayerNorm -> Linear -> activation -> Linear mapping from pixel-shuffled vision features to the text width."""
+
+    def __init__(self, config: InternVLConfig):
+        super().__init__()
+        text_dim = config.text_config.hidden_size
+        # Input width: the vision hidden size times int(1 / downsample_ratio) squared.
+        vision_dim = config.vision_config.hidden_size * int(1 / config.downsample_ratio) ** 2
+        self.layer_norm = nn.LayerNorm(vision_dim)
+        self.linear_1 = nn.Linear(vision_dim, text_dim)
+        self.act = ACT2FN[config.projector_hidden_act]
+        self.linear_2 = nn.Linear(text_dim, text_dim)
+
+    def forward(self, image_features):
+        normed = self.layer_norm(image_features)
+        return self.linear_2(self.act(self.linear_1(normed)))
+
+
+class InternVLModelOutputWithPast(LlavaModelOutputWithPast):
+    pass  # same fields as LlavaModelOutputWithPast; returned by InternVLModel.forward
+
+
+class InternVLModel(LlavaModel):  # adds pixel-shuffle downsampling of the vision features before projection
+    def pixel_shuffle(self, vision_features: torch.Tensor, scale_factor: float = 0.5):
+        """Perform pixel shuffle downsampling on vision features.
+
+        Args:
+            vision_features (`torch.Tensor`):
+                Input tensor of shape (batch_size, width, height, channels).
+            scale_factor (`float`, *optional*, defaults to `0.5`):
+                Factor by which to downsample. Default is 0.5, which halves the dimensions.
+
+        Returns:
+            vision_features (`torch.Tensor`):
+                Downsampled tensor of shape (batch_size, width*scale_factor, height*scale_factor, channels/(scale_factor^2)).
+
+        Raises:
+            ValueError: If `height * scale_factor` or `width * scale_factor` is not a whole number.
+        """
+        batch_size, width, height, channels = vision_features.size()
+        # `x % 0.5` is 0 for every integer `x`, so check that the scaled sizes are whole numbers instead.
+        if (height * scale_factor) % 1 != 0 or (width * scale_factor) % 1 != 0:
+            raise ValueError("Height and width must be divisible by scale_factor for proper downsampling.")
+
+        # Reshape to allow downsampling
+        vision_features = vision_features.view(
+            batch_size, width, int(height * scale_factor), int(channels / scale_factor)
+        )
+        # Permute dimensions to align downsampled axis correctly
+        vision_features = vision_features.permute(0, 2, 1, 3).contiguous()
+        # Reshape to achieve final downsampled dimensions
+        vision_features = vision_features.view(
+            batch_size, int(height * scale_factor), int(width * scale_factor), int(channels / (scale_factor**2))
+        )
+        # Swap height and width back for proper orientation
+        vision_features = vision_features.permute(0, 2, 1, 3).contiguous()
+        return vision_features
+
+    def get_image_features(
+        self,
+        pixel_values: torch.FloatTensor,
+        vision_feature_layer: Optional[Union[int, list[int]]] = None,
+        vision_feature_select_strategy: Optional[str] = None,
+        **kwargs,
+    ):
+        """
+        Obtains image last hidden states from the vision tower and apply multimodal projection.
+
+        Args:
+            pixel_values (`torch.FloatTensor` of shape `(batch_size, channels, height, width)`):
+               The tensors corresponding to the input images.
+            vision_feature_layer (`int` or `list[int]`):
+                Layer index or list of layer indices to extract features from.
+            vision_feature_select_strategy (`str`, *optional*):
+                `"default"` drops the first token of every image; any other value keeps all tokens.
+        Returns:
+            vision_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`.
+        """
+        vision_feature_layer = (
+            vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer
+        )
+        if vision_feature_select_strategy is None:
+            vision_feature_select_strategy = self.config.vision_feature_select_strategy
+        pixel_values = pixel_values.to(dtype=self.dtype)  # fp16 compatibility
+
+        if vision_feature_layer == -1:
+            vision_features = self.vision_tower(pixel_values=pixel_values).last_hidden_state
+        else:
+            # Same tower as above; an intermediate layer requires the per-layer hidden states to be returned.
+            vision_outputs = self.vision_tower(pixel_values=pixel_values, output_hidden_states=True)
+            vision_features = vision_outputs.hidden_states[vision_feature_layer]
+        if vision_feature_select_strategy == "default":
+            vision_features = vision_features[:, 1:, :]
+
+        # Calculate dimensions based on vision features
+        num_tokens = vision_features.shape[1]
+        feature_size = int(num_tokens**0.5)
+        batch_size = vision_features.shape[0]
+
+        # Reshape tensor to spatial dimensions
+        vision_features = vision_features.reshape(batch_size, feature_size, feature_size, -1)
+
+        # Apply downsampling using pixel shuffle
+        vision_features = self.pixel_shuffle(vision_features, scale_factor=self.config.downsample_ratio)
+
+        # Reshape tensor to prepare for projection
+        vision_features = vision_features.reshape(batch_size, -1, vision_features.shape[-1])
+
+        # Project features through multi-modal projector
+        vision_features = self.multi_modal_projector(vision_features)
+        return vision_features
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        pixel_values: torch.FloatTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        vision_feature_layer: Optional[Union[int, list[int]]] = None,
+        vision_feature_select_strategy: Optional[str] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> Union[tuple, InternVLModelOutputWithPast]:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        vision_feature_layer = (
+            vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer
+        )
+        vision_feature_select_strategy = (
+            vision_feature_select_strategy
+            if vision_feature_select_strategy is not None
+            else self.config.vision_feature_select_strategy
+        )
+
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+        if inputs_embeds is None:
+            inputs_embeds = self.get_input_embeddings()(input_ids)
+        # Projected image features are scattered into the positions selected by the placeholder mask.
+        if pixel_values is not None:
+            image_features = self.get_image_features(
+                pixel_values=pixel_values,
+                vision_feature_layer=vision_feature_layer,
+                vision_feature_select_strategy=vision_feature_select_strategy,
+            )
+            image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
+            special_image_mask = self.get_placeholder_mask(
+                input_ids, inputs_embeds=inputs_embeds, image_features=image_features
+            )
+            inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
+
+        outputs = self.language_model(
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=True,
+            cache_position=cache_position,
+            **kwargs,
+        )
+
+        return InternVLModelOutputWithPast(
+            last_hidden_state=outputs.last_hidden_state,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            image_hidden_states=image_features if pixel_values is not None else None,
+        )
+
+
+class InternVLCausalLMOutputWithPast(LlavaCausalLMOutputWithPast):
+    pass  # same fields as LlavaCausalLMOutputWithPast, re-exposed under the InternVL name
+
+
+class InternVLForConditionalGeneration(LlavaForConditionalGeneration):
+    def forward(**super_kwargs):  # NOTE(review): placeholder signature, presumably expanded by the modular converter
+        r"""
+        Example:
+
+        ```python
+        >>> import torch
+        >>> from transformers import AutoProcessor, AutoModelForImageTextToText
+
+        >>> torch_device = "cuda"
+        >>> processor = AutoProcessor.from_pretrained("OpenGVLab/InternVL3-1B-hf")
+        >>> model = AutoModelForImageTextToText.from_pretrained(
+        ...     "OpenGVLab/InternVL3-1B-hf", dtype=torch.bfloat16, device_map=torch_device
+        ... )
+
+        >>> messages = [
+        ...     {
+        ...         "role": "user",
+        ...         "content": [
+        ...             {
+        ...                 "type": "image",
+        ...                 "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg",
+        ...             },
+        ...             {
+        ...                 "type": "image",
+        ...                 "url": "https://thumbs.dreamstime.com/b/golden-gate-bridge-san-francisco-purple-flowers-california-echium-candicans-36805947.jpg",
+        ...             },
+        ...             {"type": "text", "text": "These images depict two different landmarks. Can you identify them?"},
+        ...         ],
+        ...     },
+        ... ]
+
+        >>> inputs = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt").to(torch_device)
+        >>> generate_ids = model.generate(**inputs, max_new_tokens=200)
+        >>> print(processor.decode(generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True))
+        The images depict the Statue of Liberty and the Golden Gate Bridge.
+        ```"""
+        super().forward(**super_kwargs)  # NOTE(review): result is not returned here - confirm this is intended
+
+
+__all__ = [  # names exported by a star-import of this module
+    "InternVLVisionPreTrainedModel",
+    "InternVLVisionModel",
+    "InternVLPreTrainedModel",
+    "InternVLModel",
+    "InternVLForConditionalGeneration",
+]
diff --git a/phivenv/Lib/site-packages/transformers/models/internvl/processing_internvl.py b/phivenv/Lib/site-packages/transformers/models/internvl/processing_internvl.py
new file mode 100644
index 0000000000000000000000000000000000000000..179dccb63eb3edee5412d879660ddef78d5c6d61
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/internvl/processing_internvl.py
@@ -0,0 +1,304 @@
+# coding=utf-8
+# Copyright 2025 HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Optional, Union
+
+import numpy as np
+
+from ...image_processing_utils import BatchFeature
+from ...image_utils import ImageInput, concatenate_list, make_flat_list_of_images
+from ...processing_utils import ImagesKwargs, MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack
+from ...tokenization_utils_base import PreTokenizedInput, TextInput
+from ...video_utils import VideoInput
+
+
+class InternVLImagesKwargs(ImagesKwargs, total=False):  # image kwargs InternVL adds on top of `ImagesKwargs`
+    crop_to_patches: Optional[bool]  # defaulted to True by `InternVLProcessorKwargs._defaults`
+    min_patches: Optional[int]  # presumably the lower bound on patches per image -- TODO confirm in the image processor
+    max_patches: Optional[int]  # presumably the upper bound on patches per image -- TODO confirm in the image processor
+
+
+class InternVLProcessorKwargs(ProcessingKwargs, total=False):  # kwargs schema merged by `InternVLProcessor.__call__`
+    images_kwargs: InternVLImagesKwargs
+    _defaults = {  # NOTE: class-level dict shared by all callers; copy it before modifying
+        "text_kwargs": {
+            "padding_side": "left",
+            "return_mm_token_type_ids": False,  # opt-in flag; popped in `InternVLProcessor.__call__` before tokenizing
+        },
+        "images_kwargs": {
+            "crop_to_patches": True,
+        },
+        "videos_kwargs": {},
+    }
+
+
+class InternVLProcessor(ProcessorMixin):
+    r"""
+    Constructs an InternVL processor which wraps an [`AutoImageProcessor`], an [`AutoVideoProcessor`] and a
+    [`PreTrainedTokenizerFast`] tokenizer into a single processor that inherits the image processor, video processor and
+    tokenizer functionalities. See the [`~InternVLProcessor.__call__`] and [`~InternVLProcessor.decode`] for more information.
+    Args:
+        image_processor ([`AutoImageProcessor`], *optional*):
+            The image processor is a required input.
+        tokenizer ([`PreTrainedTokenizer`, `PreTrainedTokenizerFast`], *optional*):
+            The tokenizer is a required input. It must expose the image/video special-token attributes read in `__init__`.
+        video_processor ([`AutoVideoProcessor`], *optional*):
+            The video processor is a required input.
+        image_seq_length (`int`, *optional*, defaults to 256):
+            The number of image tokens to use per image patch. It should be set so that:
+            image_seq_length = (config.image_size // config.patch_size) ** 2 * (config.scale_factor**2)
+        chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
+            in a chat into a tokenizable string.
+    """
+
+    attributes = ["image_processor", "tokenizer", "video_processor"]
+    image_processor_class = "AutoImageProcessor"
+    video_processor_class = "AutoVideoProcessor"
+    tokenizer_class = "AutoTokenizer"
+
+    def __init__(
+        self,
+        image_processor=None,
+        tokenizer=None,
+        video_processor=None,
+        image_seq_length: int = 256,
+        chat_template=None,
+        **kwargs,
+    ):
+        self.image_seq_length = image_seq_length
+        self.start_image_token = tokenizer.start_image_token
+        self.end_image_token = tokenizer.end_image_token
+        self.start_image_token_id = tokenizer.start_image_token_id
+        self.end_image_token_id = tokenizer.end_image_token_id
+        self.image_token = tokenizer.context_image_token
+        self.video_token = tokenizer.video_token
+        self.image_token_id = tokenizer.context_image_token_id
+        self.image_ids = [self.image_token_id, self.start_image_token_id, self.end_image_token_id]  # ids marked 1 in `mm_token_type_ids`
+
+        super().__init__(image_processor, tokenizer, video_processor, chat_template=chat_template, **kwargs)
+
+    def _insert_media_placeholders(
+        self,
+        text: list[str],
+        image_pixel_values,
+        video_pixel_values,
+        image_num_patches: list[int],
+        video_num_patches: list[int],
+        image_num_patches_indices: np.ndarray,
+        video_num_patches_indices: np.ndarray,
+        video_patch_indices: np.ndarray,
+    ):
+        """
+        Replaces every image/video placeholder in `text` with its expanded token string and collects the matching pixel
+        patches in prompt order. Returns `(processed_text, image_video_patches, image_index, video_index)`.
+        """
+        image_index = 0
+        video_index = 0
+        processed_text = []
+        image_video_patches = []
+        replace_strings = []
+        # Support interleaved image and video in prompts:
+        # Processed patches of images and videos are inserted in `image_video_patches` in the order they appear in the prompts
+        for prompt in text:
+            new_prompt = prompt
+            while self.image_token in new_prompt or self.video_token in new_prompt:
+                if self.image_token in new_prompt and (
+                    self.video_token not in new_prompt
+                    or new_prompt.index(self.image_token) < new_prompt.index(self.video_token)
+                ):
+                    # Get the slice of patches corresponding to the current image
+                    start_index = image_num_patches_indices[image_index - 1] if image_index > 0 else 0
+                    end_index = image_num_patches_indices[image_index]
+                    image_video_patches.append(image_pixel_values[start_index:end_index])
+                    # Replace the corresponding image placeholder with the correct number of image tokens
+                    new_prompt = new_prompt.replace(self.image_token, "<placeholder>", 1)
+                    replace_strings.append(
+                        f"{self.start_image_token}{self.image_token * self.image_seq_length * image_num_patches[image_index]}{self.end_image_token}"
+                    )
+                    image_index += 1
+                else:
+                    # Get the slice of patches corresponding to the current video, accounting for the multiple frames per
+                    # video and the potential multiple patches per frame. Both index arrays are cumulative sums without a
+                    # leading 0, so the patches located before frame `k` add up to `video_num_patches_indices[k - 1]`.
+                    current_patch_index = video_patch_indices[video_index - 1] if video_index > 0 else 0
+                    end_patch_index = video_patch_indices[video_index]
+                    start_index = video_num_patches_indices[current_patch_index - 1] if current_patch_index > 0 else 0
+                    end_index = video_num_patches_indices[end_patch_index - 1]
+                    image_video_patches.append(video_pixel_values[start_index:end_index])
+                    # Get the number of patches per frame and replace the video placeholder with the correct number of image tokens
+                    num_patches = list(video_num_patches[current_patch_index:end_patch_index])
+                    video_prompt = "\n".join(
+                        f"Frame{i + 1}: {self.start_image_token}{self.image_token * self.image_seq_length * num_patches[i]}{self.end_image_token}"
+                        for i in range(len(num_patches))
+                    )
+                    replace_strings.append(video_prompt)
+                    new_prompt = new_prompt.replace(self.video_token, "<placeholder>", 1)
+                    video_index += 1
+            while "<placeholder>" in new_prompt:
+                replace_str = replace_strings.pop(0)
+                new_prompt = new_prompt.replace("<placeholder>", replace_str, 1)
+            processed_text.append(new_prompt)
+
+        return processed_text, image_video_patches, image_index, video_index
+
+    def __call__(
+        self,
+        images: Optional[ImageInput] = None,
+        text: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] = None,
+        audio=None,
+        videos: Optional[VideoInput] = None,
+        **kwargs: Unpack[InternVLProcessorKwargs],
+    ) -> BatchFeature:
+        """
+        Main method to prepare for the model one or several sequence(s), image(s) and video(s). This method expands the
+        image and video placeholders found in `text` and forwards the result, along with the text `kwargs`, to the
+        tokenizer's `__call__`. `text` is required: a `ValueError` is raised when it is `None`. To prepare the vision inputs,
+        this method forwards `images` to the image processor and `videos` to the video processor with their respective
+        `kwargs`, then concatenates the resulting patches into a single `pixel_values` entry following the prompt order.
+
+        Args:
+            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`):
+                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
+                tensor. Both channels-first and channels-last formats are supported.
+            text (`str`, `list[str]`, `list[list[str]]`):
+                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
+                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
+                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
+            videos (`np.ndarray`, `torch.Tensor`, `list[np.ndarray]`, `list[torch.Tensor]`):
+                The video or batch of videos to be prepared. Each video can be a 4D NumPy array or PyTorch tensor.
+            return_tensors (`str` or [`~utils.TensorType`], *optional*):
+                If set, will return tensors of a particular framework. Acceptable values are:
+                - `'tf'`: Return TensorFlow `tf.constant` objects.
+                - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                - `'np'`: Return NumPy `np.ndarray` objects.
+                - `'jax'`: Return JAX `jnp.ndarray` objects.
+
+        Returns:
+            [`BatchFeature`]: A [`BatchFeature`] with the following fields:
+
+            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
+            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
+              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
+              `None`).
+            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` or `videos` is not `None`.
+        """
+        if text is None:
+            raise ValueError("You have to specify text.")
+
+        output_kwargs = self._merge_kwargs(
+            InternVLProcessorKwargs,
+            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
+            **kwargs,
+        )
+
+        if not isinstance(text, (list, tuple)):
+            text = [text]
+
+        # Process images and videos separately, as videos don't support crop_to_patches
+        image_num_patches = []
+        video_num_patches = []
+        image_videos_inputs = {}
+        image_pixel_values = None
+        video_pixel_values = None
+        image_num_patches_indices = np.array([0])
+        video_patch_indices = np.array([0])
+        video_num_patches_indices = np.array([0])
+        if images is not None:
+            images = self.image_processor.fetch_images(images)
+            images = make_flat_list_of_images(images)
+            image_inputs = self.image_processor(images=images, **output_kwargs["images_kwargs"])
+            image_num_patches = image_inputs.pop("num_patches")
+            image_pixel_values = image_inputs.pop("pixel_values")
+            image_num_patches_indices = np.cumsum(image_num_patches)
+        if videos is not None:
+            video_inputs = self.video_processor(videos=videos, **output_kwargs["videos_kwargs"])
+            video_pixel_values = video_inputs.pop("pixel_values_videos")
+
+            # Obtain per frame information first and then flatten to (BS * T, ...)
+            num_frames_per_video = [len(video) for video in video_pixel_values]
+            video_num_patches = [1 for frames in num_frames_per_video for _ in range(frames)]
+            video_patch_indices = np.cumsum(num_frames_per_video)
+            video_num_patches_indices = np.cumsum(video_num_patches)
+            video_pixel_values = video_pixel_values.flatten(0, 1)
+
+        if images is not None or videos is not None:
+            text, image_video_patches, image_index, video_index = self._insert_media_placeholders(
+                text,
+                image_pixel_values,
+                video_pixel_values,
+                image_num_patches,
+                video_num_patches,
+                image_num_patches_indices,
+                video_num_patches_indices,
+                video_patch_indices,
+            )
+            if images is not None and image_index != len(images):
+                raise ValueError("Number of image placeholders in the prompt does not match the number of images.")
+            if videos is not None and video_index != len(num_frames_per_video):
+                raise ValueError("Number of video placeholders in the prompt does not match the number of videos.")
+
+            # Concatenate the interleaved image and video patches (function agnostic to the patches type (list, numpy array, torch tensor))
+            image_videos_inputs = {"pixel_values": concatenate_list(image_video_patches)}
+
+        return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
+        return_mm_token_type_ids = output_kwargs["text_kwargs"].pop("return_mm_token_type_ids", None)
+        text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
+        self._check_special_mm_tokens(text, text_inputs, modalities=["image"])
+
+        if return_mm_token_type_ids:
+            array_ids = np.array(text_inputs["input_ids"])
+            mm_token_type_ids = np.zeros_like(text_inputs["input_ids"])
+            mm_token_type_ids[np.isin(array_ids, self.image_ids)] = 1
+            text_inputs["mm_token_type_ids"] = mm_token_type_ids.tolist()
+
+        return BatchFeature(data={**text_inputs, **image_videos_inputs}, tensor_type=return_tensors)
+
+    def _get_num_multimodal_tokens(self, image_sizes=None, **kwargs):
+        """
+        Computes the number of placeholder tokens needed for multimodal inputs with the given sizes.
+
+        Args:
+            image_sizes (`list[list[int]]`, *optional*):
+                The input sizes formatted as (height, width) per each image.
+
+        Returns:
+            `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided
+            input modalities, along with other useful data.
+        """
+
+        vision_data = {}
+        if image_sizes is not None:
+            # Merge into a new dict: calling `.update()` on the class-level defaults would leak kwargs into later calls
+            images_kwargs = {**InternVLProcessorKwargs._defaults.get("images_kwargs", {}), **kwargs}
+
+            num_image_patches = [
+                self.image_processor.get_number_of_image_patches(*image_size, images_kwargs)
+                for image_size in image_sizes
+            ]
+            # Add 2 for BOI and EOI tokens
+            num_image_tokens = [2 + (self.image_seq_length * num_patches) for num_patches in num_image_patches]
+            vision_data.update({"num_image_tokens": num_image_tokens, "num_image_patches": num_image_patches})
+
+        return MultiModalData(**vision_data)
+
+    @property
+    def model_input_names(self):
+        # Overwritten because InternVL renames video inputs to `pixel_values` before returning
+        tokenizer_input_names = self.tokenizer.model_input_names
+        image_processor_input_names = self.image_processor.model_input_names
+        return tokenizer_input_names + image_processor_input_names
+
+
+__all__ = ["InternVLProcessor"]
diff --git a/phivenv/Lib/site-packages/transformers/models/internvl/video_processing_internvl.py b/phivenv/Lib/site-packages/transformers/models/internvl/video_processing_internvl.py
new file mode 100644
index 0000000000000000000000000000000000000000..2fc5729119e96e99e90e0130a563670be35739eb
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/internvl/video_processing_internvl.py
@@ -0,0 +1,176 @@
+# coding=utf-8
+# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Fast Video processor class for InternVL."""
+
+from typing import Optional, Union
+
+from ...image_processing_utils import BatchFeature
+from ...image_utils import (
+    OPENAI_CLIP_MEAN,
+    OPENAI_CLIP_STD,
+    SizeDict,
+)
+from ...processing_utils import Unpack, VideosKwargs
+from ...utils import (
+    TensorType,
+    is_torch_available,
+    is_torchvision_available,
+    is_torchvision_v2_available,
+    is_vision_available,
+)
+from ...utils.import_utils import requires
+from ...video_processing_utils import BaseVideoProcessor
+from ...video_utils import VideoMetadata, group_videos_by_shape, reorder_videos
+
+
+if is_torchvision_available():
+    if is_torchvision_v2_available():
+        from torchvision.transforms.v2 import functional as F
+    else:
+        from torchvision.transforms import functional as F
+
+
+if is_torch_available():
+    import torch
+
+if is_vision_available():
+    from ...image_utils import PILImageResampling
+
+
+class InternVLVideoProcessorInitKwargs(VideosKwargs):  # used as `valid_kwargs` of `InternVLVideoProcessor`
+    initial_shift: Union[bool, float, int]  # start offset for frame sampling; `True` means half a sampling step
+
+
+@requires(backends=("torchvision",))
+class InternVLVideoProcessor(BaseVideoProcessor):
+    resample = PILImageResampling.BICUBIC
+    image_mean = OPENAI_CLIP_MEAN
+    image_std = OPENAI_CLIP_STD
+    size = {"height": 384, "width": 384}
+    do_resize = True
+    do_rescale = True
+    do_normalize = True
+    do_convert_rgb = True
+    initial_shift = True
+    do_sample_frames = False  # Set to False for BC, recommended to set `True` in new models
+    valid_kwargs = InternVLVideoProcessorInitKwargs
+    model_input_names = ["pixel_values_videos"]
+
+    def __init__(self, **kwargs: Unpack[InternVLVideoProcessorInitKwargs]):
+        super().__init__(**kwargs)
+
+    def sample_frames(
+        self,
+        metadata: VideoMetadata,
+        num_frames: Optional[int] = None,
+        fps: Optional[Union[int, float]] = None,
+        initial_shift: Optional[Union[bool, float, int]] = None,
+        **kwargs,
+    ):
+        """
+        Default sampling function which uniformly samples the desired number of frames between 0 and total number of frames.
+        If `fps` is passed along with metadata, `fps` frames per second are sampled uniformly. `fps` is only used when no
+        `num_frames` is available. Raises `ValueError` if the frame count cannot be derived or exceeds the video length.
+
+        Args:
+            metadata (`VideoMetadata`):
+                Metadata of the video containing information about total duration, fps and total number of frames.
+            num_frames (`int`, *optional*):
+                Maximum number of frames to sample. Defaults to `self.num_frames`.
+            fps (`int` or `float`, *optional*):
+                Target frames to sample per second. Requires `metadata.fps`; ignored when `num_frames` is available.
+            initial_shift (`bool`, `float` or `int`, defaults to `self.initial_shift`):
+                The initial shift to apply when sampling frames. If `True`, the shift is set so that frames are sampled from the middle of the video.
+
+        Returns:
+            torch.Tensor:
+                Integer indices to sample video frames.
+        """
+        num_frames = num_frames if num_frames is not None else self.num_frames
+        initial_shift = initial_shift if initial_shift is not None else self.initial_shift
+
+        # If num_frames is not given but fps is, calculate num_frames from fps
+        if num_frames is None and fps is not None:
+            # Validate `metadata` before dereferencing it, otherwise this explicit error can never be reached
+            if metadata is None or metadata.fps is None:
+                raise ValueError(
+                    "Asked to sample `fps` frames per second but no video metadata was provided which is required when sampling with `fps`. "
+                    "Please pass in `VideoMetadata` object or use a fixed `num_frames` per input video"
+                )
+            num_frames = int(metadata.total_num_frames / metadata.fps * fps)
+        if num_frames is None:
+            raise ValueError("Video can't be sampled. Either `num_frames` or `fps` must be provided.")
+        total_num_frames = metadata.total_num_frames
+
+        if num_frames > total_num_frames:
+            raise ValueError(
+                f"Video can't be sampled. The `num_frames={num_frames}` exceeds `total_num_frames={total_num_frames}`. "
+            )
+        if initial_shift is True:
+            initial_shift = total_num_frames / num_frames / 2
+        return torch.arange(initial_shift, total_num_frames, total_num_frames / num_frames).int()
+
+    def _preprocess(
+        self,
+        videos: list["torch.Tensor"],
+        do_convert_rgb: bool,
+        do_resize: bool,
+        size: SizeDict,
+        size_divisor: Optional[int],
+        interpolation: Optional["F.InterpolationMode"],
+        do_center_crop: bool,
+        crop_size: SizeDict,
+        do_rescale: bool,
+        do_pad: bool,
+        rescale_factor: float,
+        do_normalize: bool,
+        image_mean: Optional[Union[float, list[float]]],
+        image_std: Optional[Union[float, list[float]]],
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        **kwargs,
+    ) -> BatchFeature:
+        # Group videos by size for batched resizing
+        grouped_videos, grouped_videos_index = group_videos_by_shape(videos)
+        resized_videos_grouped = {}
+        for shape, stacked_videos in grouped_videos.items():
+            if do_convert_rgb:
+                stacked_videos = self.convert_to_rgb(stacked_videos)
+            if do_resize:
+                stacked_videos = self.resize(
+                    stacked_videos, size=size, size_divisor=size_divisor, interpolation=interpolation
+                )
+            resized_videos_grouped[shape] = stacked_videos
+        resized_videos = reorder_videos(resized_videos_grouped, grouped_videos_index)
+
+        # Group videos by size for further processing
+        # Needed in case do_resize is False, or resize returns videos with different sizes
+        grouped_videos, grouped_videos_index = group_videos_by_shape(resized_videos)
+        processed_videos_grouped = {}
+        for shape, stacked_videos in grouped_videos.items():
+            if do_center_crop:
+                stacked_videos = self.center_crop(stacked_videos, crop_size)
+            # Fused rescale and normalize
+            stacked_videos = self.rescale_and_normalize(
+                stacked_videos, do_rescale, rescale_factor, do_normalize, image_mean, image_std
+            )
+            processed_videos_grouped[shape] = stacked_videos
+
+        processed_videos = reorder_videos(processed_videos_grouped, grouped_videos_index)
+        processed_videos = torch.stack(processed_videos, dim=0) if return_tensors else processed_videos
+
+        return BatchFeature(data={"pixel_values_videos": processed_videos}, tensor_type=return_tensors)
+
+
+__all__ = ["InternVLVideoProcessor"]
diff --git a/phivenv/Lib/site-packages/transformers/models/jamba/__init__.py b/phivenv/Lib/site-packages/transformers/models/jamba/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..8789007ad0bc8f2a3ee1318370323a054474857b
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/jamba/__init__.py
@@ -0,0 +1,27 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:  # type-checking only: import the public names of both submodules directly
+    from .configuration_jamba import *
+    from .modeling_jamba import *
+else:  # at runtime: replace this module's `sys.modules` entry with a `_LazyModule` built from this file
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/phivenv/Lib/site-packages/transformers/models/jamba/__pycache__/__init__.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/jamba/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..dacd6f260ed4fe9dd354daed9e4e9e0dc6d01ea0
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/jamba/__pycache__/__init__.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/jamba/__pycache__/configuration_jamba.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/jamba/__pycache__/configuration_jamba.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4cb71e243c885d52220756310dffb031a5cac95a
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/jamba/__pycache__/configuration_jamba.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/jamba/__pycache__/modeling_jamba.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/jamba/__pycache__/modeling_jamba.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2a24359728725cd1154f02776d8a6fa684ea7196
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/jamba/__pycache__/modeling_jamba.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/jamba/configuration_jamba.py b/phivenv/Lib/site-packages/transformers/models/jamba/configuration_jamba.py
new file mode 100644
index 0000000000000000000000000000000000000000..a557562ea01887bba88af450715914797e0ee34b
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/jamba/configuration_jamba.py
@@ -0,0 +1,236 @@
+# coding=utf-8
+# Copyright 2024 AI21 Labs Ltd. and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Jamba model configuration"""
+
+import math
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class JambaConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`JambaModel`]. It is used to instantiate a
+    Jamba model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the Jamba-v0.1 model.
+
+    [ai21labs/Jamba-v0.1](https://huggingface.co/ai21labs/Jamba-v0.1)
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 65536):
+            Vocabulary size of the Jamba model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`JambaModel`]
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether the model's input and output word embeddings should be tied. Note that this is only relevant if the
+            model has an output word embedding layer.
+        hidden_size (`int`, *optional*, defaults to 4096):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 14336):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 32):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 32):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        num_key_value_heads (`int`, *optional*, defaults to 8):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details, check out [this
+            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to `8`.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the decoder.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        num_logits_to_keep (`int` or `None`, *optional*, defaults to 1):
+            Number of prompt logits to calculate during generation. If `None`, all logits will be calculated. If an
+            integer value, only last `num_logits_to_keep` logits will be calculated. Default is 1 because only the
+            logits of the last prompt token are needed for generation. For long sequences, the logits for the entire
+            sequence may use a lot of memory so, setting `num_logits_to_keep=1` will reduce memory footprint
+            significantly.
+        output_router_logits (`bool`, *optional*, defaults to `False`):
+            Whether or not the router logits should be returned by the model. Enabling this will also
+            allow the model to output the auxiliary loss. See [here]() for more details
+        router_aux_loss_coef (`float`, *optional*, defaults to 0.001):
+            The aux loss factor for the total loss.
+        pad_token_id (`int`, *optional*, defaults to 0):
+            The id of the padding token.
+        bos_token_id (`int`, *optional*, defaults to 1):
+            The id of the "beginning-of-sequence" token.
+        eos_token_id (`int`, *optional*, defaults to 2):
+            The id of the "end-of-sequence" token.
+        sliding_window (`int`, *optional*):
+            Sliding window attention window size. If not specified, will default to `None`.
+        max_position_embeddings (`int`, *optional*, defaults to 262144):
+            This value doesn't have any real effect. The maximum sequence length that this model is intended to be
+            used with. It can be used with longer sequences, but performance may degrade.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        num_experts_per_tok (`int`, *optional*, defaults to 2):
+            The number of experts to route per-token, can be also interpreted as the `top-k` routing
+            parameter
+        num_experts (`int`, *optional*, defaults to 16):
+            Number of experts per Sparse MLP layer.
+        expert_layer_period (`int`, *optional*, defaults to 2):
+            Once in this many layers, we will have an expert layer
+        expert_layer_offset (`int`, *optional*, defaults to 1):
+            The first layer index that contains an expert mlp layer
+        attn_layer_period (`int`, *optional*, defaults to 8):
+            Once in this many layers, we will have a vanilla attention layer
+        attn_layer_offset (`int`, *optional*, defaults to 4):
+            The first layer index that contains a vanilla attention layer
+        use_mamba_kernels (`bool`, *optional*, defaults to `True`):
+            Flag indicating whether or not to use the fast mamba kernels. These are available only if `mamba-ssm` and
+            `causal-conv1d` are installed, and the mamba modules are running on a CUDA device. Raises ValueError if
+            `True` and kernels are not available
+        mamba_d_state (`int`, *optional*, defaults to 16):
+            The dimension of the mamba state space latents
+        mamba_d_conv (`int`, *optional*, defaults to 4):
+            The size of the mamba convolution kernel
+        mamba_expand (`int`, *optional*, defaults to 2):
+            Expanding factor (relative to hidden_size) used to determine the mamba intermediate size
+        mamba_dt_rank (`Union[int,str]`, *optional*, defaults to `"auto"`):
+            Rank of the mamba discretization projection matrix. `"auto"` means that it will default to `math.ceil(self.hidden_size / 16)`
+        mamba_conv_bias (`bool`, *optional*, defaults to `True`):
+            Flag indicating whether or not to use bias in the convolution layer of the mamba mixer block.
+        mamba_proj_bias (`bool`, *optional*, defaults to `False`):
+            Flag indicating whether or not to use bias in the input and output projections (["in_proj", "out_proj"]) of the mamba mixer block
+
+    """
+
+    model_type = "jamba"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        vocab_size=65536,
+        tie_word_embeddings=False,
+        hidden_size=4096,
+        intermediate_size=14336,
+        num_hidden_layers=32,
+        num_attention_heads=32,
+        num_key_value_heads=8,
+        hidden_act="silu",
+        initializer_range=0.02,
+        rms_norm_eps=1e-6,
+        use_cache=True,
+        num_logits_to_keep=1,
+        output_router_logits=False,
+        router_aux_loss_coef=0.001,
+        pad_token_id=0,
+        bos_token_id=1,
+        eos_token_id=2,
+        sliding_window=None,
+        max_position_embeddings=262144,
+        attention_dropout=0.0,
+        num_experts_per_tok=2,
+        num_experts=16,
+        expert_layer_period=2,
+        expert_layer_offset=1,
+        attn_layer_period=8,
+        attn_layer_offset=4,
+        use_mamba_kernels=True,
+        mamba_d_state=16,
+        mamba_d_conv=4,
+        mamba_expand=2,
+        mamba_dt_rank="auto",
+        mamba_conv_bias=True,
+        mamba_proj_bias=False,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.tie_word_embeddings = tie_word_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.sliding_window = sliding_window
+        self.max_position_embeddings = max_position_embeddings
+        self.attention_dropout = attention_dropout
+
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+
+        self.use_cache = use_cache
+        self.num_logits_to_keep = num_logits_to_keep
+        self.output_router_logits = output_router_logits
+        self.router_aux_loss_coef = router_aux_loss_coef
+
+        self.num_experts_per_tok = num_experts_per_tok
+        self.num_experts = num_experts
+        self.expert_layer_period = expert_layer_period
+        self.expert_layer_offset = expert_layer_offset
+        self.attn_layer_period = attn_layer_period
+        self.attn_layer_offset = attn_layer_offset
+
+        self._check_supported_offset("attention", self.attn_layer_period, self.attn_layer_offset)
+        self._check_supported_offset("expert", self.expert_layer_period, self.expert_layer_offset)
+
+        self.use_mamba_kernels = use_mamba_kernels
+        self.mamba_d_state = mamba_d_state
+        self.mamba_d_conv = mamba_d_conv
+        self.mamba_expand = mamba_expand
+        self.mamba_dt_rank = math.ceil(self.hidden_size / 16) if mamba_dt_rank == "auto" else mamba_dt_rank
+        self.mamba_conv_bias = mamba_conv_bias
+        self.mamba_proj_bias = mamba_proj_bias
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+    @property
+    def layers_block_type(self):
+        return [
+            "attention" if i % self.attn_layer_period == self.attn_layer_offset else "mamba"
+            for i in range(self.num_hidden_layers)
+        ]
+
+    @property
+    def layers_num_experts(self):
+        return [
+            self.num_experts if i % self.expert_layer_period == self.expert_layer_offset else 1
+            for i in range(self.num_hidden_layers)
+        ]
+
+    def _check_supported_offset(self, property_: str, period: int, offset: int):
+        if offset >= period:
+            raise ValueError(
+                f"{property_} layer offset ({offset}) must be smaller than {property_} layer period ({period})"
+            )
+
+
+__all__ = ["JambaConfig"]
diff --git a/phivenv/Lib/site-packages/transformers/models/jamba/modeling_jamba.py b/phivenv/Lib/site-packages/transformers/models/jamba/modeling_jamba.py
new file mode 100644
index 0000000000000000000000000000000000000000..db7461af73d77aead7f62bdc66a0a48d62504e99
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/jamba/modeling_jamba.py
@@ -0,0 +1,1457 @@
+# coding=utf-8
+# Copyright 2024 AI21 Labs Ltd. and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch Jamba model."""
+
+import math
+from typing import Any, Optional, Union
+
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint
+from torch import nn
+
+from ...activations import ACT2FN
+from ...generation import GenerationMixin
+from ...modeling_attn_mask_utils import AttentionMaskConverter
+from ...modeling_flash_attention_utils import flash_attn_supports_top_left_mask, is_flash_attn_available
+from ...modeling_layers import (
+    GenericForSequenceClassification,
+    GradientCheckpointingLayer,
+)
+from ...modeling_outputs import MoeCausalLMOutputWithPast, MoeModelOutputWithPast
+from ...modeling_utils import PreTrainedModel
+from ...processing_utils import Unpack
+from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging
+from ...utils.deprecation import deprecate_kwarg
+from ...utils.import_utils import is_causal_conv1d_available, is_mamba_ssm_available
+from .configuration_jamba import JambaConfig
+
+
+if is_flash_attn_available():
+    from ...modeling_flash_attention_utils import _flash_attention_forward
+
+
+if is_mamba_ssm_available():
+    from mamba_ssm.ops.selective_scan_interface import mamba_inner_fn, selective_scan_fn
+    from mamba_ssm.ops.triton.selective_state_update import selective_state_update
+else:
+    selective_state_update, selective_scan_fn, mamba_inner_fn = None, None, None
+
+if is_causal_conv1d_available():
+    from causal_conv1d import causal_conv1d_fn, causal_conv1d_update
+else:
+    causal_conv1d_update, causal_conv1d_fn = None, None
+
+is_fast_path_available = all(
+    (selective_state_update, selective_scan_fn, causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn)
+)
+
+
+logger = logging.get_logger(__name__)
+
+
+# Copied from transformers.models.qwen2_moe.modeling_qwen2_moe.load_balancing_loss_func with gate->router
+def load_balancing_loss_func(
+    router_logits: Union[torch.Tensor, tuple[torch.Tensor], None],
+    num_experts: Optional[int] = None,
+    top_k=2,
+    attention_mask: Optional[torch.Tensor] = None,
+) -> Union[torch.Tensor, int]:
+    r"""
+    Computes auxiliary load balancing loss as in Switch Transformer - implemented in Pytorch.
+
+    See Switch Transformer (https://huggingface.co/papers/2101.03961) for more details. This function implements the loss
+    function presented in equations (4) - (6) of the paper. It aims at penalizing cases where the routing between
+    experts is too unbalanced.
+
+    Args:
+        router_logits:
+            Logits from the `router`, should be a tuple of model.config.num_hidden_layers tensors of
+            shape [batch_size X sequence_length, num_experts].
+        num_experts:
+            Number of experts
+        top_k:
+            The number of experts to route per-token, can be also interpreted as the `top-k` routing
+            parameter.
+        attention_mask (`torch.Tensor`, *optional*):
+            The attention_mask used in forward function
+            shape [batch_size X sequence_length] if not None.
+
+    Returns:
+        The auxiliary loss.
+    """
+    if router_logits is None or not isinstance(router_logits, tuple):
+        return 0
+
+    if isinstance(router_logits, tuple):
+        compute_device = router_logits[0].device
+        concatenated_router_logits = torch.cat(
+            [layer_router.to(compute_device) for layer_router in router_logits], dim=0
+        )
+
+    routing_weights = torch.nn.functional.softmax(concatenated_router_logits, dim=-1)
+
+    _, selected_experts = torch.topk(routing_weights, top_k, dim=-1)
+
+    expert_mask = torch.nn.functional.one_hot(selected_experts, num_experts)
+
+    if attention_mask is None:
+        # Compute the percentage of tokens routed to each experts
+        tokens_per_expert = torch.mean(expert_mask.float(), dim=0)
+
+        # Compute the average probability of routing to these experts
+        router_prob_per_expert = torch.mean(routing_weights, dim=0)
+    else:
+        batch_size, sequence_length = attention_mask.shape
+        num_hidden_layers = concatenated_router_logits.shape[0] // (batch_size * sequence_length)
+
+        # Compute the mask that masks all padding tokens as 0 with the same shape of expert_mask
+        expert_attention_mask = (
+            attention_mask[None, :, :, None, None]
+            .expand((num_hidden_layers, batch_size, sequence_length, top_k, num_experts))
+            .reshape(-1, top_k, num_experts)
+            .to(compute_device)
+        )
+
+        # Compute the percentage of tokens routed to each experts
+        tokens_per_expert = torch.sum(expert_mask.float() * expert_attention_mask, dim=0) / torch.sum(
+            expert_attention_mask, dim=0
+        )
+
+        # Compute the mask that masks all padding tokens as 0 with the same shape of tokens_per_expert
+        router_per_expert_attention_mask = (
+            attention_mask[None, :, :, None]
+            .expand((num_hidden_layers, batch_size, sequence_length, routing_weights.shape[1]))
+            .reshape(-1, routing_weights.shape[1])
+            .to(compute_device)
+        )
+
+        # Compute the average probability of routing to these experts
+        router_prob_per_expert = torch.sum(routing_weights * router_per_expert_attention_mask, dim=0) / torch.sum(
+            router_per_expert_attention_mask, dim=0
+        )
+
+    device_index = routing_weights.device.index if routing_weights.device.index is not None else 0
+    rank = routing_weights.shape[1] * int(device_index)
+    overall_loss = torch.sum(
+        tokens_per_expert[:, rank : rank + routing_weights.shape[1]] * router_prob_per_expert.unsqueeze(0)
+    )
+    return overall_loss * num_experts
+
+
+# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Jamba
+class JambaRMSNorm(nn.Module):
+    def __init__(self, hidden_size, eps=1e-6):
+        """
+        JambaRMSNorm is equivalent to T5LayerNorm
+        """
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+
+    def forward(self, hidden_states):
+        input_dtype = hidden_states.dtype
+        hidden_states = hidden_states.to(torch.float32)
+        variance = hidden_states.pow(2).mean(-1, keepdim=True)
+        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+        return self.weight * hidden_states.to(input_dtype)
+
+    def extra_repr(self):
+        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
+
+
+# Copied from transformers.models.llama.modeling_llama.repeat_kv
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """
+    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
+    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
+    """
+    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+    if n_rep == 1:
+        return hidden_states
+    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
+    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
+
+
+class HybridMambaAttentionDynamicCache:
+    """
+    A dynamic cache that can handle both the attention cache (which has a seq_len dimension) and the mamba cache
+    (which has a constant shape regardless of seq_len).
+
+    This cache has two sets of lists of tensors: `key_cache` and `value_cache` for attention cache and `conv_states`
+    and `ssm_states` for mamba cache. Each of these lists has `num_layers` tensors, whose shape depends on the layer type.
+    For attention layers, `key_cache` and `value_cache` have a shape of `(batch_size, num_heads, seq_len, head_dim)`,
+    while `conv_states` and `ssm_states` have a shape of `(batch_size, 0)` (empty tensors).
+    For mamba layers, `key_cache` and `value_cache` have a shape of `(batch_size, 0)` (empty tensors),
+    while `conv_states` represents the convolution state and has a shape of `(batch_size, d_inner, d_conv)`,
+    and `ssm_states` represents the ssm state and has a shape of `(batch_size, d_inner, d_state)`.
+    """
+
+    is_compileable = False
+
+    def __init__(self, config, batch_size, dtype=torch.float16, device=None):
+        self.dtype = dtype
+        self.layers_block_type = config.layers_block_type
+        self.has_previous_state = False  # only used by mamba
+        intermediate_size = config.mamba_expand * config.hidden_size
+        ssm_state_size = config.mamba_d_state
+        conv_kernel_size = config.mamba_d_conv
+        self.conv_states = []
+        self.ssm_states = []
+        self.transformer_layers = []
+        for i in range(config.num_hidden_layers):
+            if self.layers_block_type[i] == "mamba":
+                self.conv_states += [
+                    torch.zeros(batch_size, intermediate_size, conv_kernel_size, device=device, dtype=dtype)
+                ]
+                self.ssm_states += [
+                    torch.zeros(batch_size, intermediate_size, ssm_state_size, device=device, dtype=dtype)
+                ]
+            else:
+                self.conv_states += [torch.tensor([[]] * batch_size, device=device)]
+                self.ssm_states += [torch.tensor([[]] * batch_size, device=device)]
+                self.transformer_layers.append(i)
+
+        self.key_cache = [torch.tensor([[]] * batch_size, device=device) for _ in range(config.num_hidden_layers)]
+        self.value_cache = [torch.tensor([[]] * batch_size, device=device) for _ in range(config.num_hidden_layers)]
+
+    def update(
+        self,
+        key_states: torch.Tensor,
+        value_states: torch.Tensor,
+        layer_idx: int,
+        cache_kwargs: Optional[dict[str, Any]] = None,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        # Update the cache
+        if self.key_cache[layer_idx].shape[-1] == 0:
+            self.key_cache[layer_idx] = key_states
+            self.value_cache[layer_idx] = value_states
+        else:
+            self.key_cache[layer_idx] = torch.cat([self.key_cache[layer_idx], key_states], dim=2)
+            self.value_cache[layer_idx] = torch.cat([self.value_cache[layer_idx], value_states], dim=2)
+
+        return self.key_cache[layer_idx], self.value_cache[layer_idx]
+
+    def reorder_cache(self, beam_idx: torch.LongTensor):
+        """Reorders the cache for beam search, given the selected beam indices."""
+        for layer_idx in range(len(self.key_cache)):
+            device = self.key_cache[layer_idx].device
+            self.key_cache[layer_idx] = self.key_cache[layer_idx].index_select(0, beam_idx.to(device))
+            device = self.value_cache[layer_idx].device
+            self.value_cache[layer_idx] = self.value_cache[layer_idx].index_select(0, beam_idx.to(device))
+
+            device = self.conv_states[layer_idx].device
+            self.conv_states[layer_idx] = self.conv_states[layer_idx].index_select(0, beam_idx.to(device))
+            device = self.ssm_states[layer_idx].device
+            self.ssm_states[layer_idx] = self.ssm_states[layer_idx].index_select(0, beam_idx.to(device))
+
+    def get_seq_length(self, layer_idx: Optional[int] = 0) -> int:
+        """Returns the sequence length of the cached states (0 if empty). A layer index can be optionally passed."""
+        # use an attention layer; an untouched slot is a (batch_size, 0) placeholder whose shape[-2] is batch_size
+        layer_idx = self.transformer_layers[0] if layer_idx not in self.transformer_layers else layer_idx
+        if len(self.key_cache) <= layer_idx or self.key_cache[layer_idx].numel() == 0:
+            return 0
+        return self.key_cache[layer_idx].shape[-2]
+
+
+# Adapted from transformers.models.mistral.modeling_mistral.MistralAttention with Mistral->Jamba
+class JambaAttention(nn.Module):
+    """
+    Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer
+    and "Generating Long Sequences with Sparse Transformers".
+    """
+
+    def __init__(self, config: JambaConfig, layer_idx: Optional[int] = None):
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+        if layer_idx is None:
+            logger.warning_once(
+                f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
+                "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
+                "when creating this class."
+            )
+
+        self.hidden_size = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = self.hidden_size // self.num_heads
+        self.num_key_value_heads = config.num_key_value_heads
+        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
+        self.is_causal = True
+        self.attention_dropout = config.attention_dropout
+
+        if (self.head_dim * self.num_heads) != self.hidden_size:
+            raise ValueError(
+                f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
+                f" and `num_heads`: {self.num_heads})."
+            )
+        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
+        self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
+        self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
+        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[HybridMambaAttentionDynamicCache] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
+        cache_position: Optional[torch.LongTensor] = None,
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
+        bsz, q_len, _ = hidden_states.size()
+
+        query_states = self.q_proj(hidden_states)
+        key_states = self.k_proj(hidden_states)
+        value_states = self.v_proj(hidden_states)
+
+        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+
+        if past_key_values is not None:
+            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx)
+
+        # repeat k/v heads if n_kv_heads < n_heads
+        key_states = repeat_kv(key_states, self.num_key_value_groups)
+        value_states = repeat_kv(value_states, self.num_key_value_groups)
+
+        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
+
+        if attention_mask is not None:  # no matter the length, we just slice it
+            causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
+            attn_weights = attn_weights + causal_mask
+
+        # upcast attention to fp32
+        attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
+        attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
+        attn_output = torch.matmul(attn_weights, value_states)
+
+        if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
+            raise ValueError(
+                f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
+                f" {attn_output.size()}"
+            )
+
+        attn_output = attn_output.transpose(1, 2).contiguous()
+        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
+
+        attn_output = self.o_proj(attn_output)
+
+        if not output_attentions:
+            attn_weights = None
+
+        return attn_output, attn_weights, past_key_values
+
+
+# Adapted from transformers.models.mistral.modeling_mistral.MistralFlashAttention2 with Mistral->Jamba
+class JambaFlashAttention2(JambaAttention):
+    """
+    Jamba flash attention module. This module inherits from `JambaAttention` as the weights of the module stays
+    untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
+    flash attention and deal with padding tokens in case the input contains any of them.
+    """
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
+        # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignment, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
+        # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
+        self._flash_attn_uses_top_left_mask = flash_attn_supports_top_left_mask()
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[HybridMambaAttentionDynamicCache] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
+    ):
+        bsz, q_len, _ = hidden_states.size()
+
+        query_states = self.q_proj(hidden_states)
+        key_states = self.k_proj(hidden_states)
+        value_states = self.v_proj(hidden_states)
+
+        # Flash attention requires the input to have the shape
+        # batch_size x seq_length x num_heads x head_dim
+        # therefore we just need to keep the original shape
+        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim)
+        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+
+        if past_key_values is not None:
+            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx)
+
+        # repeat k/v heads if n_kv_heads < n_heads
+        key_states = repeat_kv(key_states, self.num_key_value_groups)
+        value_states = repeat_kv(value_states, self.num_key_value_groups)
+        dropout_rate = 0.0 if not self.training else self.attention_dropout
+
+        # In PEFT, usually we cast the layer norms in float32 for training stability reasons
+        # therefore the input hidden states gets silently casted in float32. Hence, we need
+        # cast them back in float16 just to be sure everything works as expected.
+        input_dtype = query_states.dtype
+        device_type = query_states.device.type if query_states.device.type != "mps" else "cpu"
+        if input_dtype == torch.float32:
+            if torch.is_autocast_enabled():
+                target_dtype = (
+                    torch.get_autocast_dtype(device_type)
+                    if hasattr(torch, "get_autocast_dtype")
+                    else torch.get_autocast_gpu_dtype()
+                )
+            # Handle the case where the model is quantized
+            elif hasattr(self.config, "_pre_quantization_dtype"):
+                target_dtype = self.config._pre_quantization_dtype
+            else:
+                target_dtype = self.q_proj.weight.dtype
+
+            logger.warning_once(
+                f"The input hidden states seems to be silently casted in float32, this might be related to"
+                f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
+                f" {target_dtype}."
+            )
+
+            query_states = query_states.to(target_dtype)
+            key_states = key_states.to(target_dtype)
+            value_states = value_states.to(target_dtype)
+
+        # Reashape to the expected shape for Flash Attention
+        key_states = key_states.transpose(1, 2)
+        value_states = value_states.transpose(1, 2)
+
+        attn_output = _flash_attention_forward(
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            q_len,
+            dropout=dropout_rate,
+            sliding_window=getattr(self.config, "sliding_window", None),
+            is_causal=self.is_causal,
+            use_top_left_mask=self._flash_attn_uses_top_left_mask,
+        )
+
+        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous()
+        attn_output = self.o_proj(attn_output)
+
+        if not output_attentions:
+            attn_weights = None
+
+        return attn_output, attn_weights, past_key_values
+
+
+# Adapted from transformers.models.mistral.modeling_mistral.MistralSdpaAttention with Mistral->Jamba
+class JambaSdpaAttention(JambaAttention):
+    """
+    Jamba attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
+    `JambaAttention` as the weights of the module stay untouched. The only changes are on the forward pass to adapt to
+    the SDPA API.
+    """
+
+    # Adapted from JambaAttention.forward
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[HybridMambaAttentionDynamicCache] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
+        cache_position: Optional[torch.LongTensor] = None,
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
+        if output_attentions:
+            # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
+            logger.warning_once(
+                "JambaModel is using JambaSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
+                'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
+            )
+            return super().forward(
+                hidden_states=hidden_states,
+                attention_mask=attention_mask,
+                position_ids=position_ids,
+                past_key_values=past_key_values,
+                output_attentions=output_attentions,
+                use_cache=use_cache,
+                cache_position=cache_position,
+            )
+
+        bsz, q_len, _ = hidden_states.size()
+
+        query_states = self.q_proj(hidden_states)
+        key_states = self.k_proj(hidden_states)
+        value_states = self.v_proj(hidden_states)
+
+        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+
+        if past_key_values is not None:
+            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx)
+
+        key_states = repeat_kv(key_states, self.num_key_value_groups)
+        value_states = repeat_kv(value_states, self.num_key_value_groups)
+
+        causal_mask = attention_mask
+        if attention_mask is not None:
+            causal_mask = causal_mask[:, :, :, : key_states.shape[-2]]
+
+        # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
+        # Reference: https://github.com/pytorch/pytorch/issues/112577.
+        if query_states.device.type == "cuda" and attention_mask is not None:
+            query_states = query_states.contiguous()
+            key_states = key_states.contiguous()
+            value_states = value_states.contiguous()
+
+        # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
+        # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
+        # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1.
+        is_causal = self.is_causal and causal_mask is None and q_len > 1
+
+        attn_output = torch.nn.functional.scaled_dot_product_attention(
+            query_states,
+            key_states,
+            value_states,
+            attn_mask=causal_mask,
+            dropout_p=self.attention_dropout if self.training else 0.0,
+            is_causal=is_causal,
+        )
+
+        attn_output = attn_output.transpose(1, 2).contiguous()
+        attn_output = attn_output.view(bsz, q_len, self.hidden_size)
+        attn_output = self.o_proj(attn_output)
+
+        return attn_output, None, past_key_values
+
+
+JAMBA_ATTENTION_CLASSES = {  # looked up with `config._attn_implementation` when an attention decoder layer is built
+    "eager": JambaAttention,
+    "flash_attention_2": JambaFlashAttention2,
+    "sdpa": JambaSdpaAttention,
+}
+
+
+# Adapted from transformers.models.mamba.modeling_mamba.MambaMixer
+class JambaMambaMixer(nn.Module):
+    """
+    Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`.
+    A, D are input independent (see Mamba paper [1] Section 3.5.2 "Interpretation of A" for why A isn't selective)
+    ∆, B, C are input-dependent (this is a key difference between Mamba and the linear time invariant S4,
+    and is why Mamba is called **selective** state spaces)
+    """
+
+    def __init__(self, config: JambaConfig, layer_idx):
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+        self.hidden_size = config.hidden_size
+        self.ssm_state_size = config.mamba_d_state
+        self.conv_kernel_size = config.mamba_d_conv
+        self.intermediate_size = config.mamba_expand * config.hidden_size
+        self.time_step_rank = config.mamba_dt_rank
+        self.use_conv_bias = config.mamba_conv_bias
+        self.use_bias = config.mamba_proj_bias
+        self.conv1d = nn.Conv1d(
+            in_channels=self.intermediate_size,
+            out_channels=self.intermediate_size,
+            bias=self.use_conv_bias,
+            kernel_size=self.conv_kernel_size,
+            groups=self.intermediate_size,
+            padding=self.conv_kernel_size - 1,
+        )
+
+        self.activation = config.hidden_act
+        self.act = ACT2FN[config.hidden_act]
+
+        self.use_fast_kernels = config.use_mamba_kernels
+
+        # projection of the input hidden states
+        self.in_proj = nn.Linear(self.hidden_size, self.intermediate_size * 2, bias=self.use_bias)
+        # selective projection used to make dt, B and C input dependent
+        self.x_proj = nn.Linear(self.intermediate_size, self.time_step_rank + self.ssm_state_size * 2, bias=False)
+        # time step projection (discretization)
+        self.dt_proj = nn.Linear(self.time_step_rank, self.intermediate_size, bias=True)
+
+        # S4D real initialization. These are not discretized!
+        # The core is to load them, compute the discrete states, then write the updated state. Keeps the memory bounded
+        A = torch.arange(1, self.ssm_state_size + 1)[None, :]
+        A = A.expand(self.intermediate_size, -1).contiguous()
+
+        self.A_log = nn.Parameter(torch.log(A))
+        self.D = nn.Parameter(torch.ones(self.intermediate_size))
+        self.out_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=self.use_bias)
+
+        self.dt_layernorm = JambaRMSNorm(self.time_step_rank, eps=config.rms_norm_eps)
+        self.b_layernorm = JambaRMSNorm(self.ssm_state_size, eps=config.rms_norm_eps)
+        self.c_layernorm = JambaRMSNorm(self.ssm_state_size, eps=config.rms_norm_eps)
+
+        if not is_fast_path_available:
+            logger.warning_once(
+                "The fast path is not available because on of `(selective_state_update, selective_scan_fn, causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn)`"
+                " is None. To install follow https://github.com/state-spaces/mamba/#installation and"
+                " https://github.com/Dao-AILab/causal-conv1d. If you want to use the naive implementation, set `use_mamba_kernels=False` in the model config"
+            )
+
+    def cuda_kernels_forward(
+        self,
+        hidden_states: torch.Tensor,
+        cache_params: HybridMambaAttentionDynamicCache = None,
+        attention_mask: Optional[torch.LongTensor] = None,
+    ):
+        batch_size, seq_len, _ = hidden_states.shape
+        use_precomputed_states = (
+            cache_params is not None
+            and cache_params.has_previous_state
+            and seq_len == 1
+            and cache_params.conv_states[self.layer_idx].shape[0]
+            == cache_params.ssm_states[self.layer_idx].shape[0]
+            == batch_size
+        )
+        # 1. Gated MLP's linear projection
+        projected_states = self.in_proj(hidden_states).transpose(1, 2)
+
+        # We can't use `mamba_inner_fn` even if in training and without cache params because we have the
+        # inner layernorms which isn't supported by this fused kernel
+        hidden_states, gate = projected_states.chunk(2, dim=1)
+
+        if attention_mask is not None:
+            hidden_states = hidden_states * attention_mask.unsqueeze(1)
+
+        # 2. Convolution sequence transformation
+        conv_weights = self.conv1d.weight.view(self.conv1d.weight.size(0), self.conv1d.weight.size(2))
+        if use_precomputed_states:
+            hidden_states = causal_conv1d_update(
+                hidden_states.squeeze(-1),
+                cache_params.conv_states[self.layer_idx],
+                conv_weights,
+                self.conv1d.bias,
+                self.activation,
+            )
+            hidden_states = hidden_states.unsqueeze(-1)
+        else:
+            if cache_params is not None:
+                conv_states = nn.functional.pad(hidden_states, (self.conv_kernel_size - hidden_states.shape[-1], 0))
+                cache_params.conv_states[self.layer_idx].copy_(conv_states)
+            hidden_states = causal_conv1d_fn(hidden_states, conv_weights, self.conv1d.bias, activation=self.activation)
+
+        if attention_mask is not None:
+            hidden_states = hidden_states * attention_mask.unsqueeze(1)
+
+        # 3. State Space Model sequence transformation
+        # 3.a. input varying initialization of time_step, B and C
+        ssm_parameters = self.x_proj(hidden_states.transpose(1, 2))
+        time_step, B, C = torch.split(
+            ssm_parameters, [self.time_step_rank, self.ssm_state_size, self.ssm_state_size], dim=-1
+        )
+
+        time_step = self.dt_layernorm(time_step)
+        B = self.b_layernorm(B)
+        C = self.c_layernorm(C)
+
+        # dt_proj has to be applied without its bias, as the bias is added inside the selective scan kernel.
+        # Quantization replaces `torch.nn.Linear` with quantized layers whose forward pass must be called directly,
+        # so `self.dt_proj.weight @ time_step.transpose(1, 2)` cannot be used; the bias is zeroed temporarily instead.
+        # The `finally` restores the real bias even when the projection raises, so a failed call cannot corrupt it.
+        time_proj_bias = self.dt_proj.bias.data
+        try:
+            with torch.no_grad():
+                self.dt_proj.bias.data = torch.zeros_like(self.dt_proj.bias.data)
+            discrete_time_step = self.dt_proj(time_step).transpose(1, 2)
+        finally:
+            with torch.no_grad():
+                self.dt_proj.bias.data = time_proj_bias
+
+        A = -torch.exp(self.A_log.float())
+        # 3.c perform the recurrence y ← SSM(A, B, C)(x)
+        time_proj_bias = time_proj_bias.float() if time_proj_bias is not None else None
+        if use_precomputed_states:
+            scan_outputs = selective_state_update(
+                cache_params.ssm_states[self.layer_idx],
+                hidden_states[..., 0],
+                discrete_time_step[..., 0],
+                A,
+                B[:, 0],
+                C[:, 0],
+                self.D,
+                gate[..., 0],
+                time_proj_bias,
+                dt_softplus=True,
+            ).unsqueeze(-1)
+        else:
+            scan_outputs, ssm_state = selective_scan_fn(
+                hidden_states,
+                discrete_time_step,
+                A,
+                B.transpose(1, 2),
+                C.transpose(1, 2),
+                self.D.float(),
+                gate,
+                time_proj_bias,
+                delta_softplus=True,
+                return_last_state=True,
+            )
+            if ssm_state is not None and cache_params is not None:
+                cache_params.ssm_states[self.layer_idx].copy_(ssm_state)
+
+        # 4. Final linear projection
+        contextualized_states = self.out_proj(scan_outputs.transpose(1, 2))
+
+        return contextualized_states
+
+    # fmt: off
+    def slow_forward(self, input_states, cache_params: HybridMambaAttentionDynamicCache = None, attention_mask: Optional[torch.LongTensor] = None):
+        batch_size, seq_len, _ = input_states.shape
+        dtype = input_states.dtype
+        # 1. Gated MLP's linear projection
+        projected_states = self.in_proj(input_states).transpose(1, 2)                   # [batch, 2 * intermediate_size, seq_len]
+        hidden_states, gate = projected_states.chunk(2, dim=1)
+
+        if attention_mask is not None:
+            hidden_states = hidden_states * attention_mask.unsqueeze(1)
+
+        use_cache = isinstance(cache_params, HybridMambaAttentionDynamicCache)
+        # 2. Convolution sequence transformation
+        if use_cache and cache_params.ssm_states[self.layer_idx].shape[0] == batch_size:
+            if self.training:
+                # In training mode, we don't want to perform in-place operations on ssm_state so we can compute the backwards pass
+                ssm_state = cache_params.ssm_states[self.layer_idx].clone()
+            else:
+                ssm_state = cache_params.ssm_states[self.layer_idx]
+
+            ssm_state = ssm_state.to(hidden_states.device)
+
+            if cache_params.has_previous_state and seq_len == 1 and \
+                    cache_params.conv_states[self.layer_idx].shape[0] == batch_size:
+                conv_state = cache_params.conv_states[self.layer_idx]                   # [batch, intermediate_size, conv_kernel_size]
+                conv_state = torch.roll(conv_state, shifts=-1, dims=-1)
+                conv_state[:, :, -1] = hidden_states[:, :, 0]
+                cache_params.conv_states[self.layer_idx] = conv_state
+                hidden_states = torch.sum(conv_state * self.conv1d.weight[:, 0, :], dim=-1)
+                if self.use_conv_bias:
+                    hidden_states += self.conv1d.bias
+                hidden_states = self.act(hidden_states).to(dtype).unsqueeze(-1)         # [batch, intermediate_size, 1] : decoding
+            else:
+                conv_state = nn.functional.pad(
+                    hidden_states,
+                    (self.conv_kernel_size - hidden_states.shape[-1], 0)
+                )
+                cache_params.conv_states[self.layer_idx] = conv_state
+                hidden_states = self.act(self.conv1d(hidden_states)[..., :seq_len])     # [batch, intermediate_size, seq_len]
+        else:
+            ssm_state = torch.zeros(
+                (batch_size, self.intermediate_size, self.ssm_state_size),
+                device=hidden_states.device, dtype=dtype
+            )
+            hidden_states = self.act(self.conv1d(hidden_states)[..., :seq_len])         # [batch, intermediate_size, seq_len]
+
+        if attention_mask is not None:
+            hidden_states = hidden_states * attention_mask.unsqueeze(1)
+
+        # 3. State Space Model sequence transformation
+        # 3.a. Selection:  [batch, seq_len, self.time_step_rank + self.ssm_state_size * 2]
+        ssm_parameters = self.x_proj(hidden_states.transpose(1, 2))
+        time_step, B, C = torch.split(
+            ssm_parameters, [self.time_step_rank, self.ssm_state_size, self.ssm_state_size], dim=-1
+        )
+
+        time_step = self.dt_layernorm(time_step)
+        B = self.b_layernorm(B)
+        C = self.c_layernorm(C)
+
+        discrete_time_step = self.dt_proj(time_step)                                    # [batch, seq_len, intermediate_size]
+        discrete_time_step = nn.functional.softplus(discrete_time_step).transpose(1, 2) # [batch, intermediate_size, seq_len]
+
+        # 3.b. Discretization: B and C to [batch, seq_len, intermediate_size, ssm_state_size] (SRAM)
+        A = -torch.exp(self.A_log.float())                                              # [intermediate_size, ssm_state_size]
+        discrete_A = torch.exp(A[None, :, None, :] * discrete_time_step[:, :, :, None]) # [batch, intermediate_size, seq_len, ssm_state_size]
+        discrete_B = discrete_time_step[:, :, :, None] * B[:, None, :, :].float()       # [batch, intermediate_size, seq_len, ssm_state_size]
+        deltaB_u = discrete_B * hidden_states[:, :, :, None].float()
+        # 3.c perform the recurrence y ← SSM(A, B, C)(x)
+        scan_outputs = []
+        for i in range(seq_len):
+            ssm_state = discrete_A[:, :, i, :] * ssm_state + deltaB_u[:, :, i, :]      # [batch, intermediate_size, ssm_state]
+            scan_output = torch.matmul(ssm_state.to(dtype), C[:, i, :].unsqueeze(-1))  # [batch, intermediate_size, 1]
+            scan_outputs.append(scan_output[:, :, 0])
+        scan_output = torch.stack(scan_outputs, dim=-1)                                # [batch, intermediate_size, seq_len]
+        scan_output = scan_output + (hidden_states * self.D[None, :, None])
+        scan_output = (scan_output * self.act(gate))
+
+        if use_cache:
+            cache_params.ssm_states[self.layer_idx] = ssm_state
+
+        # 4. Final linear projection
+        contextualized_states = self.out_proj(scan_output.transpose(1, 2))  # [batch, seq_len, hidden_size]
+        return contextualized_states
+    # fmt: on
+
+    def forward(
+        self,
+        hidden_states,
+        cache_params: HybridMambaAttentionDynamicCache = None,
+        attention_mask: Optional[torch.LongTensor] = None,
+    ):
+        if self.use_fast_kernels:
+            if not is_fast_path_available or "cuda" not in self.x_proj.weight.device.type:
+                raise ValueError(
+                    "Fast Mamba kernels are not available. Make sure to they are installed and that the mamba module is on a CUDA device"
+                )
+            return self.cuda_kernels_forward(hidden_states, cache_params, attention_mask)
+        return self.slow_forward(hidden_states, cache_params, attention_mask)
+
+
+# Copied from transformers.models.mistral.modeling_mistral.MistralMLP with Mistral->Jamba
+class JambaMLP(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        hidden, inner = config.hidden_size, config.intermediate_size
+        self.hidden_size, self.intermediate_size = hidden, inner
+        self.gate_proj = nn.Linear(hidden, inner, bias=False)
+        self.up_proj = nn.Linear(hidden, inner, bias=False)
+        self.down_proj = nn.Linear(inner, hidden, bias=False)
+        self.act_fn = ACT2FN[config.hidden_act]
+
+    def forward(self, x):
+        """Gated MLP: `down_proj(act_fn(gate_proj(x)) * up_proj(x))`."""
+        return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
+
+
+# Adapted from transformers.models.mixtral.modeling_mixtral.MixtralSparseMoeBlock with Mistral->Jamba
+class JambaSparseMoeBlock(nn.Module):
+    """
+    Sparse mixture-of-experts feed-forward block.
+
+    A linear router scores every token against all experts, the `top_k` best
+    experts per token are evaluated, and their outputs are summed after being
+    scaled by the corresponding softmax routing weights. No token is dropped:
+    each expert processes exactly the tokens routed to it, however unbalanced
+    the assignment is.
+    """
+
+    def __init__(self, config: JambaConfig):
+        super().__init__()
+        self.hidden_dim = config.hidden_size
+        self.ffn_dim = config.intermediate_size
+        self.num_experts = config.num_experts
+        self.top_k = config.num_experts_per_tok
+
+        self.router = nn.Linear(self.hidden_dim, self.num_experts, bias=False)
+        self.experts = nn.ModuleList([JambaMLP(config) for _ in range(self.num_experts)])
+
+    def forward(self, hidden_states: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+        """
+        Route every token to its `top_k` experts and mix the expert outputs.
+
+        Returns the mixed hidden states, shaped like the input, together with the
+        raw router logits of shape `(batch * sequence_length, n_experts)`.
+        """
+        batch_size, sequence_length, hidden_dim = hidden_states.shape
+        num_tokens = batch_size * sequence_length
+
+        # Flatten batch and sequence so that every row is one token.
+        hidden_states = hidden_states.view(-1, hidden_dim)
+        router_logits = self.router(hidden_states)
+
+        # Softmax in float32, keep the top-k probabilities, then return to the input dtype.
+        routing_probs = F.softmax(router_logits, dim=1, dtype=torch.float)
+        routing_weights, selected_experts = torch.topk(routing_probs, self.top_k, dim=-1)
+        routing_weights = routing_weights.to(hidden_states.dtype)
+
+        # expert_mask[e, k, t] is 1 when expert `e` is the k-th choice of token `t`.
+        expert_mask = torch.nn.functional.one_hot(selected_experts, num_classes=self.num_experts)
+        expert_mask = expert_mask.permute(2, 1, 0)
+
+        # Accumulator for the weighted expert outputs, one row per token.
+        final_hidden_states = hidden_states.new_zeros((num_tokens, hidden_dim))
+        for expert_idx, expert_layer in enumerate(self.experts):
+            # `slot` is the top-k position, `token_ids` the rows routed to this expert.
+            slot, token_ids = torch.where(expert_mask[expert_idx])
+            if token_ids.shape[0] == 0:
+                continue
+
+            # Run the expert on its tokens and scale by the matching routing weight.
+            expert_out = expert_layer(hidden_states[token_ids])
+            expert_out = expert_out * routing_weights[token_ids, slot, None]
+
+            # Scatter-add the contribution back to the rows it came from.
+            final_hidden_states.index_add_(0, token_ids, expert_out.to(hidden_states.dtype))
+
+        final_hidden_states = final_hidden_states.reshape(batch_size, sequence_length, hidden_dim)
+        return final_hidden_states, router_logits
+
+
+class JambaAttentionDecoderLayer(GradientCheckpointingLayer):
+    def __init__(self, config: JambaConfig, layer_idx: int):
+        super().__init__()
+        num_experts = config.layers_num_experts[layer_idx]
+        self.self_attn = JAMBA_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx)
+
+        # More than one expert configured for this layer -> sparse MoE block, otherwise a dense MLP.
+        self.feed_forward = (JambaSparseMoeBlock if num_experts > 1 else JambaMLP)(config)
+        self.input_layernorm = JambaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.pre_ff_layernorm = JambaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[HybridMambaAttentionDynamicCache] = None,
+        output_attentions: Optional[bool] = False,
+        output_router_logits: Optional[bool] = False,
+        use_cache: Optional[bool] = False,
+        cache_position: Optional[torch.LongTensor] = None,
+    ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
+        """
+        Run the attention sub-layer and then the feed-forward sub-layer, each as norm -> module -> residual.
+
+        Args:
+            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+            attention_mask (`torch.Tensor`, *optional*):
+                Mask handed unchanged to the attention module.
+            position_ids (`torch.LongTensor`, *optional*):
+                Position indices handed unchanged to the attention module.
+            past_key_values (`HybridMambaAttentionDynamicCache`, *optional*): cached past key and value projection states
+            output_attentions (`bool`, *optional*):
+                Whether to append the attention weights produced by the attention module to the returned tuple.
+            output_router_logits (`bool`, *optional*):
+                Whether to append the router logits to the returned tuple (`None` when the layer uses a dense MLP).
+            use_cache (`bool`, *optional*):
+                Whether to append the cache object returned by the attention module to the returned tuple.
+            cache_position (`torch.LongTensor`, *optional*):
+                Cache positions handed unchanged to the attention module.
+
+        Returns:
+            `tuple`: `(hidden_states,)` followed, in this order and only when requested, by the attention
+            weights, the cache object and the router logits.
+        """
+
+        # Attention sub-layer: pre-norm, attention, residual connection.
+        attn_input = self.input_layernorm(hidden_states)
+        attn_output, self_attn_weights, present_key_value = self.self_attn(
+            hidden_states=attn_input,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            output_attentions=output_attentions,
+            use_cache=use_cache,
+            cache_position=cache_position,
+        )
+        hidden_states = hidden_states + attn_output
+
+        # Feed-forward sub-layer (experts/MLP): pre-norm, feed-forward, residual connection.
+        ff_outputs = self.feed_forward(self.pre_ff_layernorm(hidden_states))
+        # The MoE block returns `(hidden_states, router_logits)`; the dense MLP returns a bare tensor.
+        router_logits = None
+        if isinstance(ff_outputs, tuple):
+            ff_outputs, router_logits = ff_outputs
+        hidden_states = hidden_states + ff_outputs
+
+        # Optional extras follow in a fixed order: attentions, cache, router logits.
+        outputs = [hidden_states]
+
+        if output_attentions:
+            outputs.append(self_attn_weights)
+
+        if use_cache:
+            outputs.append(present_key_value)
+
+        if output_router_logits:
+            outputs.append(router_logits)
+
+        return tuple(outputs)
+
+
+class JambaMambaDecoderLayer(GradientCheckpointingLayer):
+    def __init__(self, config: JambaConfig, layer_idx: int):
+        super().__init__()
+        num_experts = config.layers_num_experts[layer_idx]
+        self.mamba = JambaMambaMixer(config=config, layer_idx=layer_idx)
+
+        # More than one expert configured for this layer -> sparse MoE block, otherwise a dense MLP.
+        self.feed_forward = (JambaSparseMoeBlock if num_experts > 1 else JambaMLP)(config)
+        self.input_layernorm = JambaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.pre_ff_layernorm = JambaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[HybridMambaAttentionDynamicCache] = None,
+        output_attentions: Optional[bool] = False,
+        output_router_logits: Optional[bool] = False,
+        use_cache: Optional[bool] = False,
+        cache_position: Optional[torch.LongTensor] = None,
+    ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
+        """
+        Run the Mamba sub-layer and then the feed-forward sub-layer, each as norm -> module -> residual.
+
+        Args:
+            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+            attention_mask (`torch.Tensor`, *optional*):
+                Mask handed unchanged to the Mamba mixer.
+            position_ids (`torch.LongTensor`, *optional*):
+                Accepted for signature parity with the attention layer; not read by this layer.
+            past_key_values (`HybridMambaAttentionDynamicCache`, *optional*):
+                Cache object handed to the Mamba mixer as `cache_params`.
+            output_attentions (`bool`, *optional*):
+                Whether to append an attention-weights slot to the returned tuple; it is always `None` here.
+            output_router_logits (`bool`, *optional*):
+                Whether to append the router logits to the returned tuple (`None` when the layer uses a dense MLP).
+            use_cache (`bool`, *optional*):
+                Whether to append `past_key_values` to the returned tuple.
+            cache_position (`torch.LongTensor`, *optional*):
+                Accepted for signature parity with the attention layer; not read by this layer.
+
+        Returns:
+            `tuple`: `(hidden_states,)` followed, in this order and only when requested, by the (`None`)
+            attention weights, the cache object and the router logits.
+        """
+
+        # Mamba sub-layer: pre-norm, mixer, residual connection.
+        mixer_output = self.mamba(
+            hidden_states=self.input_layernorm(hidden_states),
+            cache_params=past_key_values,
+            attention_mask=attention_mask,
+        )
+        self_attn_weights = None  # a Mamba layer has no attention weights to report
+        hidden_states = hidden_states + mixer_output
+
+        # Feed-forward sub-layer (experts/MLP): pre-norm, feed-forward, residual connection.
+        ff_outputs = self.feed_forward(self.pre_ff_layernorm(hidden_states))
+        # The MoE block returns `(hidden_states, router_logits)`; the dense MLP returns a bare tensor.
+        router_logits = None
+        if isinstance(ff_outputs, tuple):
+            ff_outputs, router_logits = ff_outputs
+        hidden_states = hidden_states + ff_outputs
+
+        # Optional extras follow in a fixed order: attentions, cache, router logits.
+        outputs = [hidden_states]
+
+        if output_attentions:
+            outputs.append(self_attn_weights)
+
+        if use_cache:
+            outputs.append(past_key_values)
+
+        if output_router_logits:
+            outputs.append(router_logits)
+
+        return tuple(outputs)
+
+
+@auto_docstring
+class JambaPreTrainedModel(PreTrainedModel):
+    config: JambaConfig
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["JambaAttentionDecoderLayer", "JambaMambaDecoderLayer"]
+    _skip_keys_device_placement = "past_key_values"
+    _supports_flash_attn = True
+    _supports_sdpa = True
+    # Note: only supports HybridMambaAttentionDynamicCache
+    _is_stateful = True
+
+    def _init_weights(self, module):
+        std = self.config.initializer_range
+        if isinstance(module, (nn.Linear, nn.Conv1d)):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+        elif isinstance(module, JambaRMSNorm):
+            module.weight.data.fill_(1.0)
+        elif isinstance(module, JambaMambaMixer):
+            A = torch.arange(1, module.ssm_state_size + 1)[None, :]
+            A = A.expand(module.intermediate_size, -1).contiguous()
+            module.A_log.data.copy_(torch.log(A))
+            module.D.data.fill_(1.0)
+
+
+ALL_DECODER_LAYER_TYPES = {"attention": JambaAttentionDecoderLayer, "mamba": JambaMambaDecoderLayer}
+
+
+# Adapted from transformers.models.mistral.modeling_mistral.MistralModel with MISTRAL->JAMBA, Mistral->Jamba
+@auto_docstring
+class JambaModel(JambaPreTrainedModel):
+    """
+    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is either a `JambaAttentionDecoderLayer` or a `JambaMambaDecoderLayer`.
+
+    Args:
+        config: JambaConfig
+    """
+
+    def __init__(self, config: JambaConfig):
+        super().__init__(config)
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+        decoder_layers = []
+        for i in range(config.num_hidden_layers):
+            layer_class = ALL_DECODER_LAYER_TYPES[config.layers_block_type[i]]
+            decoder_layers.append(layer_class(config, layer_idx=i))
+        self.layers = nn.ModuleList(decoder_layers)
+
+        self._attn_implementation = config._attn_implementation
+        self.final_layernorm = JambaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+        self.gradient_checkpointing = False
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[HybridMambaAttentionDynamicCache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        output_router_logits: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> MoeModelOutputWithPast:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_router_logits = (
+            output_router_logits if output_router_logits is not None else self.config.output_router_logits
+        )
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+        if self.gradient_checkpointing and self.training and use_cache:
+            logger.warning_once(
+                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
+            )
+            use_cache = False
+
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+        hidden_states = inputs_embeds
+
+        if use_cache and past_key_values is None:
+            logger.warning_once(
+                "Jamba requires an initialized `HybridMambaAttentionDynamicCache` to return a cache. None was "
+                "provided, so no cache will be returned."
+            )
+
+        if cache_position is None:
+            cache_position = torch.arange(hidden_states.shape[1], device=hidden_states.device)
+        if position_ids is None:
+            position_ids = cache_position.unsqueeze(0)
+
+        causal_mask = self._update_causal_mask(attention_mask, inputs_embeds, cache_position)
+        mamba_mask = self._update_mamba_mask(attention_mask, cache_position)
+
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+        all_router_logits = () if output_router_logits else None
+
+        for decoder_layer in self.layers:
+            # Depending on the layer type we opt for 2D base attention mask (Mamba) or 4D causal mask (Attention)
+            layer_mask = mamba_mask if isinstance(decoder_layer, JambaMambaDecoderLayer) else causal_mask
+
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+
+            layer_outputs = decoder_layer(
+                hidden_states,
+                attention_mask=layer_mask,
+                position_ids=position_ids,
+                past_key_values=past_key_values,
+                output_attentions=output_attentions,
+                output_router_logits=output_router_logits,
+                use_cache=use_cache,
+                cache_position=cache_position,
+            )
+
+            hidden_states = layer_outputs[0]
+
+            if output_attentions:
+                if layer_outputs[1] is not None:
+                    # append attentions only of attention layers. Mamba layers return `None` as the attention weights
+                    all_self_attns += (layer_outputs[1],)
+
+            if output_router_logits:
+                if layer_outputs[-1] is not None:
+                    # append router logits only of expert layers. Regular MLP layers return `None` as the router logits
+                    all_router_logits += (layer_outputs[-1],)
+
+        hidden_states = self.final_layernorm(hidden_states)
+
+        # add hidden states from the last decoder layer
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+
+        if past_key_values and not past_key_values.has_previous_state:
+            past_key_values.has_previous_state = True
+
+        next_cache = None if not use_cache else past_key_values
+
+        return MoeModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=next_cache,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+            router_logits=all_router_logits,
+        )
+
+    def _update_causal_mask(self, attention_mask, input_tensor, cache_position):
+        if self.config._attn_implementation == "flash_attention_2":
+            if attention_mask is not None and 0.0 in attention_mask:
+                return attention_mask
+            return None
+
+        dtype, device = input_tensor.dtype, input_tensor.device
+        min_dtype = torch.finfo(dtype).min
+        sequence_length = input_tensor.shape[1]
+        target_length = cache_position[-1] + 1
+
+        causal_mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device)
+        if sequence_length != 1:
+            causal_mask = torch.triu(causal_mask, diagonal=1)
+        causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
+        causal_mask = causal_mask[None, None, :, :].expand(input_tensor.shape[0], 1, -1, -1)
+        if attention_mask is not None:
+            causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
+            if attention_mask.dim() == 2:
+                mask_length = attention_mask.shape[-1]
+                padding_mask = causal_mask[..., :mask_length].eq(0.0) * attention_mask[:, None, None, :].eq(0.0)
+                causal_mask[..., :mask_length] = causal_mask[..., :mask_length].masked_fill(padding_mask, min_dtype)
+
+        if (
+            self.config._attn_implementation == "sdpa"
+            and attention_mask is not None
+            and attention_mask.device.type in ["cuda", "xpu", "npu"]
+        ):
+            # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
+            # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
+            # Details: https://github.com/pytorch/pytorch/issues/110213
+            causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)
+
+        return causal_mask
+
+    def _update_mamba_mask(self, attention_mask, cache_position):
+        """
+        No need for zeroing states when
+            1. Cached forward
+            2. Attending to all inputs
+        """
+        mamba_mask = attention_mask
+        if cache_position[0] > 0 or (attention_mask is not None and torch.all(attention_mask == 1)):
+            mamba_mask = None
+        return mamba_mask
+
+
+# Adapted from transformers.models.mixtral.modeling_mixtral.MixtralForCausalLM with MIXTRAL->JAMBA, Mixtral->Jamba
+class JambaForCausalLM(JambaPreTrainedModel, GenerationMixin):
+    _tied_weights_keys = ["lm_head.weight"]
+
+    def __init__(self, config: JambaConfig):
+        super().__init__(config)
+        self.model = JambaModel(config)
+        self.vocab_size = config.vocab_size
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+        self.router_aux_loss_coef = config.router_aux_loss_coef
+        self.num_experts = config.num_experts
+        self.num_experts_per_tok = config.num_experts_per_tok
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[HybridMambaAttentionDynamicCache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        output_router_logits: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> MoeCausalLMOutputWithPast:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+        Example:
+
+        ```python
+        >>> from transformers import AutoTokenizer, JambaForCausalLM
+
+        >>> model = JambaForCausalLM.from_pretrained("ai21labs/Jamba-v0.1")
+        >>> tokenizer = AutoTokenizer.from_pretrained("ai21labs/Jamba-v0.1")
+
+        >>> prompt = "Hey, are you conscious? Can you talk to me?"
+        >>> inputs = tokenizer(prompt, return_tensors="pt")
+
+        >>> # Generate
+        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
+        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
+        ```"""
+
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_router_logits = (
+            output_router_logits if output_router_logits is not None else self.config.output_router_logits
+        )
+
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+
+        # decoder outputs consist of (dec_features, layer_state, dec_hidden, dec_attn)
+        outputs: MoeModelOutputWithPast = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            output_router_logits=output_router_logits,
+            cache_position=cache_position,
+        )
+
+        hidden_states = outputs.last_hidden_state
+        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+        logits = self.lm_head(hidden_states[:, slice_indices, :])
+
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(logits, labels, self.vocab_size, **kwargs)
+
+        aux_loss = None
+        if output_router_logits:
+            aux_loss = load_balancing_loss_func(
+                outputs.router_logits,
+                self.num_experts,
+                self.num_experts_per_tok,
+                attention_mask,
+            )
+            if labels is not None:
+                loss += self.router_aux_loss_coef * aux_loss.to(loss.device)  # make sure to reside in the same device
+
+        return MoeCausalLMOutputWithPast(
+            loss=loss,
+            aux_loss=aux_loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            router_logits=outputs.router_logits,
+        )
+
+    def prepare_inputs_for_generation(
+        self,
+        input_ids,
+        past_key_values=None,
+        attention_mask=None,
+        inputs_embeds=None,
+        output_router_logits=False,
+        cache_position=None,
+        position_ids=None,
+        use_cache=True,
+        **kwargs,
+    ):
+        # Overwritten -- has a unique cache type, `HybridMambaAttentionDynamicCache`
+
+        empty_past_kv = past_key_values is None
+
+        # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens
+        # Exception 1: when passing inputs_embeds, input_ids may be missing entries
+        # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here
+        # Exception 3: with synced GPUs cache_position may go out of bounds, but we only want dummy token in that case.
+        #              (we can't check exception 3 while compiling)
+        if not empty_past_kv:
+            if (
+                inputs_embeds is not None  # Exception 1
+                or cache_position[-1] >= input_ids.shape[1]  # Exception 3
+            ):
+                input_ids = input_ids[:, -cache_position.shape[0] :]
+            elif input_ids.shape[1] != cache_position.shape[0]:  # Default case (the "else", a no op, is Exception 2)
+                input_ids = input_ids[:, cache_position]
+        else:
+            past_key_values = HybridMambaAttentionDynamicCache(
+                self.config, input_ids.shape[0], self.dtype, device=self.device
+            )
+
+        if attention_mask is not None and position_ids is None:
+            # create position_ids on the fly for batch generation
+            position_ids = attention_mask.long().cumsum(-1) - 1
+            position_ids.masked_fill_(attention_mask == 0, 1)
+            if not empty_past_kv:
+                position_ids = position_ids[:, -input_ids.shape[1] :]
+
+        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
+        if inputs_embeds is not None and empty_past_kv:
+            model_inputs = {"inputs_embeds": inputs_embeds}
+        else:
+            model_inputs = {"input_ids": input_ids.contiguous()}  # `contiguous()` needed for compilation use cases
+
+        model_inputs.update(
+            {
+                "position_ids": position_ids,
+                "past_key_values": past_key_values,
+                "use_cache": use_cache,
+                "attention_mask": attention_mask,
+                "output_router_logits": output_router_logits,
+                "logits_to_keep": self.config.num_logits_to_keep,
+                "cache_position": cache_position,
+            }
+        )
+        return model_inputs
+
+
+class JambaForSequenceClassification(GenericForSequenceClassification, JambaPreTrainedModel): ...
+
+
+__all__ = ["JambaForCausalLM", "JambaForSequenceClassification", "JambaModel", "JambaPreTrainedModel"]
diff --git a/phivenv/Lib/site-packages/transformers/models/janus/__init__.py b/phivenv/Lib/site-packages/transformers/models/janus/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..8aacc2ed6fdbc936a0ed5b84c5833335a442793d
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/janus/__init__.py
@@ -0,0 +1,30 @@
+# Copyright 2025 Deepseek AI and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_janus import *
+    from .image_processing_janus import *
+    from .image_processing_janus_fast import *
+    from .modeling_janus import *
+    from .processing_janus import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/phivenv/Lib/site-packages/transformers/models/janus/__pycache__/__init__.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/janus/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fd1ae3a2b27f8ac65544bf11324b54762e7264ef
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/janus/__pycache__/__init__.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/janus/__pycache__/configuration_janus.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/janus/__pycache__/configuration_janus.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..55b28b36e6fc26330392261658f63049660a89c9
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/janus/__pycache__/configuration_janus.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/janus/__pycache__/image_processing_janus.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/janus/__pycache__/image_processing_janus.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..99077d4c6b71305b636b6238e6ebfefb579b3c85
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/janus/__pycache__/image_processing_janus.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/janus/__pycache__/image_processing_janus_fast.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/janus/__pycache__/image_processing_janus_fast.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..942254ae1a27dffec75699e8379451d5826a83ee
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/janus/__pycache__/image_processing_janus_fast.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/janus/__pycache__/modeling_janus.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/janus/__pycache__/modeling_janus.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1a24de7bfabb18aa79f5d5d5b4f09c49b6fdcde0
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/janus/__pycache__/modeling_janus.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/janus/__pycache__/modular_janus.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/janus/__pycache__/modular_janus.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ceff85c7f8cdaff448230b243e66960291ede739
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/janus/__pycache__/modular_janus.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/janus/__pycache__/processing_janus.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/janus/__pycache__/processing_janus.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..267815e40d2f653407a47a622d5d7b0f544fad61
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/janus/__pycache__/processing_janus.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/janus/configuration_janus.py b/phivenv/Lib/site-packages/transformers/models/janus/configuration_janus.py
new file mode 100644
index 0000000000000000000000000000000000000000..a0e758fd9ed471089065e8502f99d084a5a09b57
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/janus/configuration_janus.py
@@ -0,0 +1,322 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/janus/modular_janus.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_janus.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# coding=utf-8
+# Copyright 2025 Deepseek AI and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+from ..auto import CONFIG_MAPPING, AutoConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+class JanusVisionConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`JanusVisionModel`]. It is used to instantiate a
+    `JanusVisionModel` according to the specified arguments, defining the model architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+    Args:
+        hidden_size (`int`, *optional*, defaults to 1024):
+            Dimensionality of the encoder layers and the pooler layer.
+        num_hidden_layers (`int`, *optional*, defaults to 24):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        num_channels (`int`, *optional*, defaults to 3):
+            The number of input channels.
+        patch_size (`int`, *optional*, defaults to 16):
+            The size (resolution) of each patch.
+        image_size (`int`, *optional*, defaults to 384):
+            The size (resolution) of each image.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            Dropout probability for attention weights.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-06):
+            The epsilon used by the layer normalization layers.
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"selu"`, and `"gelu_new"` are supported.
+        mlp_ratio (`float`, *optional*, defaults to 4.0):
+            Ratio of MLP hidden dimensionality to embedding dimensionality.
+        attention_bias (`bool`, *optional*, defaults to `True`):
+            Whether to add a bias to the queries, keys, and values in the attention layers.
+        hidden_dropout_rate (`float`, *optional*, defaults to 0.0):
+            The dropout probability for fully connected layers in the encoder.
+        projection_dim (`int`, *optional*, defaults to 2048):
+            Dimensionality of the MLP projection head.
+        projection_dropout (`float`, *optional*, defaults to 0.0):
+            Dropout probability for the projection layer.
+        use_qk_norm (`bool`, *optional*, defaults to `False`):
+            Whether to normalize the query and key matrices.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated normal initializer for initializing all weight matrices.
+        depth (`int`, *optional*, defaults to 2):
+            Number of hidden layers in the aligner module.
+        num_image_tokens (`int`, *optional*, defaults to 576):
+            Number of image tokens.
+    """
+
+    model_type = "janus_vision_model"
+    base_config_key = "vision_config"
+
+    def __init__(
+        self,
+        hidden_size=1024,
+        num_hidden_layers=24,
+        num_attention_heads=16,
+        num_channels=3,
+        patch_size=16,
+        image_size=384,
+        attention_dropout=0.0,
+        layer_norm_eps=1e-6,
+        hidden_act="gelu",
+        mlp_ratio=4.0,
+        attention_bias=True,
+        hidden_dropout_rate=0.0,
+        projection_dim=2048,
+        projection_dropout=0.0,
+        use_qk_norm=False,
+        initializer_range=0.02,
+        depth=2,
+        num_image_tokens=576,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.num_channels = num_channels
+        self.patch_size = patch_size
+        self.image_size = image_size
+        self.attention_dropout = attention_dropout
+        self.layer_norm_eps = layer_norm_eps
+        self.hidden_act = hidden_act
+
+        self.mlp_ratio = mlp_ratio
+        self.attention_bias = attention_bias
+        self.hidden_dropout_rate = hidden_dropout_rate
+        self.projection_dim = projection_dim
+        self.projection_dropout = projection_dropout
+        self.use_qk_norm = use_qk_norm
+        self.initializer_range = initializer_range
+        self.depth = depth
+        self.num_image_tokens = num_image_tokens
+
+
+class JanusVQVAEConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`JanusVQVAEModel`]. It is used to instantiate a
+    `JanusVQVAEModel` according to the specified arguments, defining the model architecture.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information. Instantiating a
+    configuration with the defaults will yield a similar configuration to the VQModel of the
+    [deepseek-community/Janus-Pro-1B](https://huggingface.co/deepseek-community/Janus-Pro-1B).
+
+    Args:
+        embed_dim (`int`, *optional*, defaults to 8):
+            Dimensionality of each embedding vector.
+        num_embeddings (`int`, *optional*, defaults to 16384):
+            Number of codebook embeddings.
+        double_latent (`bool`, *optional*, defaults to `False`):
+            Whether to use double z channels.
+        latent_channels (`int`, *optional*, defaults to 256):
+            Number of channels for the latent space.
+        num_patches (`int`, *optional*, defaults to 32):
+            Number of patches the input images can be divided into.
+        in_channels (`int`, *optional*, defaults to 3):
+            Number of input channels.
+        out_channels (`int`, *optional*, defaults to 3):
+            Number of output channels.
+        base_channels (`int`, *optional*, defaults to 128):
+            Base channel count.
+        channel_multiplier (`list[int]`, *optional*, defaults to `[1, 1, 2, 2, 4]`):
+            Channel multipliers for each resolution.
+        num_res_blocks (`int`, *optional*, defaults to 2):
+            Number of residual blocks.
+        dropout (`float`, *optional*, defaults to 0.0):
+            Dropout rate.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        projection_dim (`int`, *optional*, defaults to 2048):
+            Dimensionality of the MLP projection head.
+        num_hidden_layers (`int`, *optional*, defaults to 2):
+            Number of hidden layers in the VQVAE MLP connector module.
+        hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"silu"` and `"gelu_new"` are supported.
+        image_token_embed_dim (`int`, *optional*, defaults to 2048):
+            Dimension of image embeddings. It should be same as the dimensionality of text embeddings.
+    """
+
+    model_type = "janus_vqgan"
+    base_config_key = "vq_config"
+
+    def __init__(
+        self,
+        embed_dim: int = 8,
+        num_embeddings: int = 16384,
+        double_latent: bool = False,
+        latent_channels: int = 256,
+        num_patches: int = 32,
+        in_channels: int = 3,
+        out_channels: int = 3,
+        base_channels: int = 128,
+        channel_multiplier: list[int] = [1, 1, 2, 2, 4],
+        num_res_blocks: int = 2,
+        dropout: float = 0.0,
+        initializer_range=0.02,
+        projection_dim=2048,
+        num_hidden_layers=2,
+        hidden_act="gelu",
+        image_token_embed_dim=2048,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.embed_dim = embed_dim
+        self.num_embeddings = num_embeddings
+        self.double_latent = double_latent
+        self.latent_channels = latent_channels
+        self.in_channels = in_channels
+        self.base_channels = base_channels
+        self.channel_multiplier = channel_multiplier
+        self.num_res_blocks = num_res_blocks
+        self.dropout = dropout
+        self.initializer_range = initializer_range
+        self.num_patches = num_patches
+        self.out_channels = out_channels
+        self.projection_dim = projection_dim
+        self.num_hidden_layers = num_hidden_layers
+        self.hidden_act = hidden_act
+        self.image_token_embed_dim = image_token_embed_dim
+
+
+class JanusConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`JanusModel`]. It is used to instantiate an
+    Janus model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the Janus-1B or Janus-7B models.
+
+    e.g. [deepseek-community/Janus-Pro-1B](https://huggingface.co/deepseek-community/Janus-Pro-1B) or
+    [deepseek-community/Janus-Pro-7B](https://huggingface.co/deepseek-community/Janus-Pro-7B)
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        text_config (`Union[AutoConfig, dict]`, *optional*, defaults to `LlamaConfig`):
+            The config object or dictionary of the text backbone.
+        vision_config (`Union[AutoConfig, dict]`,  *optional*, defaults to `JanusVisionConfig`):
+            The config object or dictionary of the vision backbone.
+        vq_config (`Union[AutoConfig, dict]`,  *optional*, defaults to `JanusVQVAEConfig`):
+            The config object or dictionary of the VQVAE backbone.
+        image_token_id (`int`, *optional*, defaults to 100581):
+            Token index of a placeholder image token.
+
+    Example:
+
+    ```python
+    >>> from transformers import JanusForConditionalGeneration, JanusConfig, JanusVisionConfig, JanusVQVAEConfig, LlamaConfig
+
+    >>> # Initializing a Janus vision config
+    >>> vision_config = JanusVisionConfig()
+
+    >>> # Initializing a Llama config
+    >>> text_config = LlamaConfig()
+
+    >>> # Initializing a VQ config
+    >>> vq_config = JanusVQVAEConfig()
+
+    >>> # Initializing a Janus Pro 1B style configuration
+    >>> configuration = JanusConfig(vision_config=vision_config, text_config=text_config, vq_config=vq_config)
+
+    >>> # Initializing a model from the Janus Pro 1B style configuration
+    >>> model = JanusForConditionalGeneration(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "janus"
+    sub_configs = {
+        "text_config": AutoConfig,
+        "vision_config": JanusVisionConfig,
+        "vq_config": JanusVQVAEConfig,
+    }
+
+    def __init__(
+        self,
+        text_config=None,
+        vision_config=None,
+        vq_config=None,
+        image_token_id=100581,
+        **kwargs,
+    ):
+        if isinstance(text_config, dict):
+            text_config["model_type"] = text_config.get("model_type", "llama")
+            self.text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
+
+        elif text_config is None:
+            logger.info("`text_config` is None. Initializing with default values")
+            self.text_config = CONFIG_MAPPING["llama"]()
+        elif isinstance(text_config, PretrainedConfig):
+            self.text_config = text_config
+        else:
+            raise ValueError(
+                f"Invalid type for `text_config`. Must be either `dict` or `LlamaConfig`."
+                f" Type found: {type(text_config)}"
+            )
+
+        if vision_config is None:
+            logger.info("`vision_config` is None. Initializing with default JanusVisionConfig values")
+            self.vision_config = JanusVisionConfig()
+        elif isinstance(vision_config, dict):
+            self.vision_config = JanusVisionConfig(**vision_config)
+        elif isinstance(vision_config, JanusVisionConfig):
+            self.vision_config = vision_config
+        else:
+            raise ValueError(
+                f"Invalid type for `vision_config`. Must be either `dict` or `JanusVisionConfig`."
+                f" Type found: {type(vision_config)}"
+            )
+
+        if vq_config is None:
+            logger.info("`vq_config` is None. Initializing with default JanusVQVAEConfig values")
+            self.vq_config = JanusVQVAEConfig()
+        elif isinstance(vq_config, dict):
+            self.vq_config = JanusVQVAEConfig(**vq_config)
+        elif isinstance(vq_config, JanusVQVAEConfig):
+            self.vq_config = vq_config
+        else:
+            raise ValueError(
+                f"Invalid type for `vq_config`. Must be either `dict` or `JanusVQVAEConfig`."
+                f" Type found: {type(vq_config)}"
+            )
+
+        self.initializer_range = self.vision_config.initializer_range
+        # This dimension is required when decoding discrete image tokens to continuous input.
+        self.vq_config.num_patches = self.vision_config.image_size // self.vision_config.patch_size
+        # The default is only the index for the 1B model, 7B uses a different one
+        self.image_token_id = image_token_id
+        super().__init__(**kwargs)
+
+
+__all__ = ["JanusVQVAEConfig", "JanusVisionConfig", "JanusConfig"]
diff --git a/phivenv/Lib/site-packages/transformers/models/janus/image_processing_janus.py b/phivenv/Lib/site-packages/transformers/models/janus/image_processing_janus.py
new file mode 100644
index 0000000000000000000000000000000000000000..ac2012c62b041863d67c1d320cc2ad304a46694f
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/janus/image_processing_janus.py
@@ -0,0 +1,516 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/janus/modular_janus.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_janus.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# coding=utf-8
+# Copyright 2025 Deepseek AI and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from collections.abc import Iterable
+from typing import Optional, Union
+
+import numpy as np
+
+from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
+from ...image_transforms import convert_to_rgb, resize, to_channel_dimension_format
+from ...image_utils import (
+    OPENAI_CLIP_MEAN,
+    OPENAI_CLIP_STD,
+    ChannelDimension,
+    ImageInput,
+    PILImageResampling,
+    get_image_size,
+    infer_channel_dimension_format,
+    is_scaled_image,
+    make_flat_list_of_images,
+    make_list_of_images,
+    to_numpy_array,
+    valid_images,
+    validate_preprocess_arguments,
+)
+from ...utils import (
+    TensorType,
+    filter_out_non_signature_kwargs,
+    is_vision_available,
+    logging,
+)
+
+
+if is_vision_available():
+    import PIL
+
+
+logger = logging.get_logger(__name__)
+
+
+class JanusImageProcessor(BaseImageProcessor):
+    r"""
+    Constructs a JANUS image processor.
+
+    Args:
+        do_resize (`bool`, *optional*, defaults to `True`):
+            Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by the
+            `do_resize` parameter in the `preprocess` method.
+        size (`dict`, *optional*, defaults to `{"height": 384, "width": 384}`):
+            Size of the output image after resizing. Can be overridden by the `size` parameter in the `preprocess`
+            method.
+        min_size (`int`, *optional*, defaults to 14):
+            The minimum allowed size for the resized image. Ensures that neither the height nor width
+            falls below this value after resizing.
+        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
+            Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. Can be
+            overridden by the `resample` parameter in the `preprocess` method.
+        do_rescale (`bool`, *optional*, defaults to `True`):
+            Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the
+            `do_rescale` parameter in the `preprocess` method.
+        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
+            Scale factor to use if rescaling the image. Only has an effect if `do_rescale` is set to `True`. Can be
+            overridden by the `rescale_factor` parameter in the `preprocess` method.
+        do_normalize (`bool`, *optional*, defaults to `True`):
+            Whether to normalize the image with `image_mean` and `image_std`. Can be overridden by the
+            `do_normalize` parameter in the `preprocess` method.
+        image_mean (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_MEAN`):
+            Mean to use if normalizing the image. This is a float or list of floats the length of the number of
+            channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. When
+            given, it also sets the padding color (`image_mean * 255`); otherwise gray `(127, 127, 127)` is used.
+        image_std (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_STD`):
+            Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
+            number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess`
+            method.
+        do_convert_rgb (`bool`, *optional*):
+            Whether to convert the image to RGB. No conversion is done when left unset.
+    """
+
+    model_input_names = ["pixel_values"]
+
+    def __init__(
+        self,
+        do_resize: bool = True,
+        size: Optional[dict[str, int]] = None,
+        min_size: int = 14,
+        resample: PILImageResampling = PILImageResampling.BICUBIC,
+        do_rescale: bool = True,
+        rescale_factor: Union[int, float] = 1 / 255,
+        do_normalize: bool = True,
+        image_mean: Optional[Union[float, list[float]]] = None,
+        image_std: Optional[Union[float, list[float]]] = None,
+        do_convert_rgb: Optional[bool] = None,
+        **kwargs,
+    ) -> None:
+        super().__init__(**kwargs)
+        size = size if size is not None else {"height": 384, "width": 384}
+        size = get_size_dict(size, default_to_square=True)
+
+        self.do_resize = do_resize
+        self.size = size
+        self.resample = resample
+        self.do_rescale = do_rescale
+        self.rescale_factor = rescale_factor
+        self.do_normalize = do_normalize
+        self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
+        self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
+        self.do_convert_rgb = do_convert_rgb
+
+        self.min_size = min_size
+        if image_mean is None:
+            self.background_color = (127, 127, 127)
+        else:
+            self.background_color = tuple(int(x * 255) for x in image_mean)
+
+    def resize(
+        self,
+        image: np.ndarray,
+        size: Union[dict[str, int], int],
+        background_color: Optional[tuple[int, int, int]] = None,
+        resample: PILImageResampling = PILImageResampling.BICUBIC,
+        data_format: Optional[Union[str, ChannelDimension]] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+        **kwargs,
+    ) -> np.ndarray:
+        """
+        Resize an image to dynamically calculated size.
+
+        Args:
+            image (`np.ndarray`):
+                Image to resize.
+            size (`dict[str, int]` or `int`):
+                The size to resize the image to. If a dictionary, it should have the keys `"height"` and `"width"`.
+            background_color (`tuple[int, int, int]`):
+                The background color to use for the padding.
+            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
+                `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BICUBIC`.
+            data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the output image. If unset, the channel dimension format of the input
+                image is used. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `None`: will be inferred from input
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the input image. If unset, the channel dimension format is inferred
+                from the input image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+
+        Returns:
+            `np.ndarray`: The resized image.
+        """
+        background_color = background_color if background_color is not None else self.background_color
+        if input_data_format is None:
+            input_data_format = infer_channel_dimension_format(image)
+
+        height, width = get_image_size(image, input_data_format)
+        max_size = max(height, width)
+
+        size = get_size_dict(size, default_to_square=True)
+        if size["height"] != size["width"]:
+            raise ValueError(
+                f"Output height and width must be the same. Got height={size['height']} and width={size['width']}"
+            )
+        size = size["height"]
+
+        delta = size / max_size
+        # Largest side becomes `size` and the other side is scaled according to the aspect ratio.
+        output_size_nonpadded = [
+            max(int(height * delta), self.min_size),
+            max(int(width * delta), self.min_size),
+        ]
+
+        image = resize(
+            image,
+            size=output_size_nonpadded,
+            resample=resample,
+            data_format=data_format,
+            input_data_format=input_data_format,
+            **kwargs,
+        )
+        # Expand and pad the images to obtain a square image of dimensions `size x size`
+        image = self.pad_to_square(
+            image=image,
+            background_color=background_color,
+            input_data_format=input_data_format,
+        )
+        return image
+
+    @filter_out_non_signature_kwargs()
+    def preprocess(
+        self,
+        images: ImageInput,
+        do_resize: Optional[bool] = None,
+        size: Optional[dict[str, int]] = None,
+        resample: Optional[PILImageResampling] = None,
+        do_rescale: Optional[bool] = None,
+        rescale_factor: Optional[float] = None,
+        do_normalize: Optional[bool] = None,
+        image_mean: Optional[Union[float, list[float]]] = None,
+        image_std: Optional[Union[float, list[float]]] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        do_convert_rgb: Optional[bool] = None,
+        data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    ) -> BatchFeature:
+        """
+        Preprocess an image or batch of images.
+
+        Args:
+            images (`ImageInput`):
+                Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
+                passing in images with pixel values between 0 and 1, set `do_rescale=False`.
+            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
+                Whether to resize the image.
+            size (`dict[str, int]`, *optional*, defaults to `self.size`):
+                Size of the output image as a dictionary with `"height"` and `"width"` keys; `resize` requires
+                the two values to be equal. The longest edge of the image is resized to that value while
+                preserving the aspect ratio, and the shorter edge is then padded with the background color so
+                that the result is square.
+            resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
+                Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`.
+            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
+                Whether to rescale the image values between [0 - 1].
+            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
+                Rescale factor to rescale the image by if `do_rescale` is set to `True`.
+            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
+                Whether to normalize the image.
+            image_mean (`float` or `list[float]`, *optional*, defaults to `self.image_mean`):
+                Image mean to normalize the image by if `do_normalize` is set to `True`.
+            image_std (`float` or `list[float]`, *optional*, defaults to `self.image_std`):
+                Image standard deviation to normalize the image by if `do_normalize` is set to `True`.
+            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
+                Whether to convert the image to RGB.
+            return_tensors (`str` or `TensorType`, *optional*):
+                The type of tensors to return. Can be one of:
+                    - Unset: Return a list of `np.ndarray`.
+                    - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
+                    - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
+                    - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
+                    - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
+            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
+                The channel dimension format for the output image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - Unset: Use the channel dimension format of the input image.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the input image. If unset, the channel dimension format is inferred
+                from the input image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+        """
+        do_resize = do_resize if do_resize is not None else self.do_resize
+        resample = resample if resample is not None else self.resample
+        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
+        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
+        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
+        image_mean = image_mean if image_mean is not None else self.image_mean
+        image_std = image_std if image_std is not None else self.image_std
+        do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
+
+        size = size if size is not None else self.size
+        size = get_size_dict(size, default_to_square=False)
+        images = self.fetch_images(images)
+        images = make_flat_list_of_images(images)
+
+        if not valid_images(images):
+            raise ValueError(
+                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
+                "torch.Tensor, tf.Tensor or jax.ndarray."
+            )
+
+        validate_preprocess_arguments(
+            do_rescale=do_rescale,
+            rescale_factor=rescale_factor,
+            do_normalize=do_normalize,
+            image_mean=image_mean,
+            image_std=image_std,
+            do_resize=do_resize,
+            size=size,
+            resample=resample,
+        )
+        # PIL RGBA images are converted to RGB
+        if do_convert_rgb:
+            images = [convert_to_rgb(image) for image in images]
+
+        # All transformations expect numpy arrays.
+        images = [to_numpy_array(image) for image in images]
+
+        if do_rescale and is_scaled_image(images[0]):
+            logger.warning_once(
+                "It looks like you are trying to rescale already rescaled images. If the input"
+                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
+            )
+
+        if input_data_format is None:
+            # We assume that all images have the same channel dimension format.
+            input_data_format = infer_channel_dimension_format(images[0])
+
+        if do_resize:
+            images = [
+                self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format)
+                for image in images
+            ]
+
+        if do_rescale:
+            images = [
+                self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format)
+                for image in images
+            ]
+
+        if do_normalize:
+            images = [
+                self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format)
+                for image in images
+            ]
+
+        images = [
+            to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images
+        ]
+
+        encoded_outputs = BatchFeature(data={"pixel_values": images}, tensor_type=return_tensors)
+
+        return encoded_outputs
+
+    def pad_to_square(
+        self,
+        image: np.ndarray,
+        background_color: Union[int, tuple[int, int, int]] = 0,
+        data_format: Optional[Union[str, ChannelDimension]] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    ) -> np.array:
+        """
+        Pads an image to a square based on the longest edge.
+
+        Args:
+            image (`np.ndarray`):
+                The image to pad.
+            background_color (`int` or `tuple[int, int, int]`, *optional*, defaults to 0):
+                The color to use for the padding. Can be an integer for single channel or a
+                tuple of integers representing for multi-channel images. If passed as integer
+                in mutli-channel mode, it will default to `0` in subsequent channels.
+            data_format (`str` or `ChannelDimension`, *optional*):
+                The channel dimension format for the output image. Can be one of:
+                    - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                    - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                If unset, will use same as the input image.
+            input_data_format (`str` or `ChannelDimension`, *optional*):
+                The channel dimension format for the input image. Can be one of:
+                    - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                    - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+
+        Returns:
+            `np.ndarray`: The padded image.
+        """
+        height, width = get_image_size(image, input_data_format)
+        num_channels = image.shape[0] if input_data_format == ChannelDimension.FIRST else image.shape[-1]
+
+        if height == width:
+            image = (
+                to_channel_dimension_format(image, data_format, input_data_format)
+                if data_format is not None
+                else image
+            )
+            return image
+
+        max_dim = max(height, width)
+
+        # Ensure background_color is the correct shape
+        if isinstance(background_color, int):
+            background_color = [background_color]
+        elif len(background_color) != num_channels:
+            raise ValueError(
+                f"background_color must have no more than {num_channels} elements to match the number of channels"
+            )
+
+        if input_data_format == ChannelDimension.FIRST:
+            result = np.zeros((num_channels, max_dim, max_dim), dtype=image.dtype)
+            for i, color in enumerate(background_color):
+                result[i, :, :] = color
+            if width > height:
+                start = (max_dim - height) // 2
+                result[:, start : start + height, :] = image
+            else:
+                start = (max_dim - width) // 2
+                result[:, :, start : start + width] = image
+        else:
+            result = np.zeros((max_dim, max_dim, num_channels), dtype=image.dtype)
+            for i, color in enumerate(background_color):
+                result[:, :, i] = color
+            if width > height:
+                start = (max_dim - height) // 2
+                result[start : start + height, :, :] = image
+            else:
+                start = (max_dim - width) // 2
+                result[:, start : start + width, :] = image
+
+        return result
+
+    def postprocess(
+        self,
+        images: ImageInput,
+        do_rescale: Optional[bool] = None,
+        rescale_factor: Optional[float] = None,
+        do_normalize: Optional[bool] = None,
+        image_mean: Optional[list[float]] = None,
+        image_std: Optional[list[float]] = None,
+        input_data_format: Optional[str] = None,
+        return_tensors: Optional[str] = None,
+    ):
+        """Applies post-processing to the decoded image tokens by reversing transformations applied during preprocessing."""
+        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
+        rescale_factor = 1.0 / self.rescale_factor if rescale_factor is None else rescale_factor
+        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
+        image_mean = image_mean if image_mean is not None else self.image_mean
+        image_std = image_std if image_std is not None else self.image_std
+
+        images = make_list_of_images(images)  # Ensures input is a list
+
+        if isinstance(images[0], PIL.Image.Image):
+            return images if len(images) > 1 else images[0]
+
+        if input_data_format is None:
+            input_data_format = infer_channel_dimension_format(images[0])  # Determine format dynamically
+
+        pixel_values = []
+
+        for image in images:
+            image = to_numpy_array(image)  # Ensure NumPy format
+
+            if do_normalize:
+                image = self.unnormalize(
+                    image=image, image_mean=image_mean, image_std=image_std, input_data_format=input_data_format
+                )
+
+            if do_rescale:
+                image = self.rescale(image, scale=rescale_factor, input_data_format=input_data_format)
+                image = image.clip(0, 255).astype(np.uint8)
+
+            if do_normalize and do_rescale and return_tensors == "PIL.Image.Image":
+                image = to_channel_dimension_format(image, ChannelDimension.LAST, input_channel_dim=input_data_format)
+                image = PIL.Image.fromarray(image)
+
+            pixel_values.append(image)
+
+        data = {"pixel_values": pixel_values}
+        return_tensors = return_tensors if return_tensors != "PIL.Image.Image" else None
+
+        return BatchFeature(data=data, tensor_type=return_tensors)
+
+    def unnormalize(
+        self,
+        image: np.array,
+        image_mean: Union[float, Iterable[float]],
+        image_std: Union[float, Iterable[float]],
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    ) -> np.array:
+        """
+        Unnormalizes `image` using the mean and standard deviation specified by `mean` and `std`.
+        image = (image * image_std) + image_mean
+        Args:
+            image (`torch.Tensor` of shape `(batch_size, num_channels, image_size, image_size)` or `(num_channels, image_size, image_size)`):
+                Batch of pixel values to postprocess.
+            image_mean (`float` or `Iterable[float]`):
+                The mean to use for unnormalization.
+            image_std (`float` or `Iterable[float]`):
+                The standard deviation to use for unnormalization.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the input image. If unset, the channel dimension format is inferred
+                from the input image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+        """
+        num_channels = 3
+
+        if isinstance(image_mean, Iterable):
+            if len(image_mean) != num_channels:
+                raise ValueError(f"mean must have {num_channels} elements if it is an iterable, got {len(image_mean)}")
+        else:
+            image_mean = [image_mean] * num_channels
+
+        if isinstance(image_std, Iterable):
+            if len(image_std) != num_channels:
+                raise ValueError(f"std must have {num_channels} elements if it is an iterable, got {len(image_std)}")
+        else:
+            image_std = [image_std] * num_channels
+
+        rev_image_mean = tuple(-mean / std for mean, std in zip(image_mean, image_std))
+        rev_image_std = tuple(1 / std for std in image_std)
+        image = self.normalize(
+            image=image, mean=rev_image_mean, std=rev_image_std, input_data_format=input_data_format
+        )
+        return image
+
+
+__all__ = ["JanusImageProcessor"]
diff --git a/phivenv/Lib/site-packages/transformers/models/janus/image_processing_janus_fast.py b/phivenv/Lib/site-packages/transformers/models/janus/image_processing_janus_fast.py
new file mode 100644
index 0000000000000000000000000000000000000000..12f0d0f394fe4ce4251c7cfdb215a50a187e8867
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/janus/image_processing_janus_fast.py
@@ -0,0 +1,245 @@
+# coding=utf-8
+# Copyright 2025 Deepseek AI and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from typing import Optional, Union
+
+from ...image_processing_utils import BatchFeature
+from ...image_processing_utils_fast import (
+    BaseImageProcessorFast,
+    DefaultFastImageProcessorKwargs,
+    group_images_by_shape,
+    reorder_images,
+)
+from ...image_utils import (
+    OPENAI_CLIP_MEAN,
+    OPENAI_CLIP_STD,
+    ImageInput,
+    PILImageResampling,
+    SizeDict,
+)
+from ...processing_utils import Unpack
+from ...utils import (
+    TensorType,
+    auto_docstring,
+    is_torch_available,
+    is_torchvision_available,
+    is_torchvision_v2_available,
+)
+
+
+if is_torch_available():
+    import torch
+if is_torchvision_v2_available():
+    from torchvision.transforms.v2 import functional as F
+elif is_torchvision_available():
+    from torchvision.transforms import functional as F
+
+
+# Keyword arguments accepted by `JanusImageProcessorFast` (it is set as that class's `valid_kwargs`).
+# Adds `min_size` on top of the inherited defaults; `JanusImageProcessorFast.resize` uses it as the
+# lower bound for each resized side.
+class JanusFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
+    r"""
+    min_size (`int`, *optional*, defaults to 14):
+        The minimum allowed size for the resized image. Ensures that neither the height nor width
+        falls below this value after resizing.
+    """
+
+    min_size: int
+
+
+@auto_docstring
+class JanusImageProcessorFast(BaseImageProcessorFast):
+    # Class-level processing defaults.
+    resample = PILImageResampling.BICUBIC
+    image_mean = OPENAI_CLIP_MEAN
+    image_std = OPENAI_CLIP_STD
+    size = {"height": 384, "width": 384}
+    min_size = 14
+    do_resize = True
+    do_rescale = True
+    do_normalize = True
+    valid_kwargs = JanusFastImageProcessorKwargs
+
+    def __init__(self, **kwargs: Unpack[JanusFastImageProcessorKwargs]):
+        # The padding colour follows an explicitly passed `image_mean` (each component multiplied by
+        # 255 and truncated to int); when none is passed, (127, 127, 127) is used.
+        image_mean = kwargs.get("image_mean")
+        if image_mean is None:
+            background_color = (127, 127, 127)
+        else:
+            background_color = tuple(int(x * 255) for x in image_mean)
+        super().__init__(**kwargs)
+        self.background_color = background_color
+
+    def resize(
+        self,
+        image: "torch.Tensor",
+        size: SizeDict,
+        min_size: int,
+        interpolation: Optional["F.InterpolationMode"] = None,
+        antialias: bool = True,
+        **kwargs,
+    ) -> "torch.Tensor":
+        """
+        Resize `image` so that its longest side matches the requested square size, keeping the aspect ratio.
+
+        Args:
+            image (`torch.Tensor`):
+                Image(s) to resize. The last two dimensions are read as `(height, width)`.
+            size (`SizeDict`):
+                Target size. `height` and `width` must both be set and must be equal.
+            min_size (`int`):
+                Lower bound applied to each output side after scaling.
+            interpolation (`F.InterpolationMode`, *optional*):
+                Interpolation mode, forwarded to the parent class `resize`.
+            antialias (`bool`, *optional*, defaults to `True`):
+                Forwarded to the parent class `resize`.
+
+        Returns:
+            `torch.Tensor`: The resized image(s). No padding is applied here, so the result is only square
+            when the input is.
+
+        Raises:
+            ValueError: If `size.height` or `size.width` is `None`, or if they differ.
+        """
+        if size.height is None or size.width is None or size.height != size.width:
+            raise ValueError(
+                f"Output height and width must be the same. Got height={size['height']} and width={size['width']}"
+            )
+        target_side = size.height
+
+        height, width = image.shape[-2:]
+        max_size = max(height, width)
+
+        delta = target_side / max_size
+        # Largest side becomes `size` and the other side is scaled according to the aspect ratio.
+        output_size_nonpadded = SizeDict(
+            height=max(int(height * delta), min_size),
+            width=max(int(width * delta), min_size),
+        )
+
+        return super().resize(image, size=output_size_nonpadded, interpolation=interpolation, antialias=antialias)
+
+    def pad_to_square(
+        self,
+        images: "torch.Tensor",
+        background_color: Union[int, tuple[int, int, int]] = 0,
+    ) -> "torch.Tensor":
+        """
+        Pads a batch of images to a square based on the longest edge, centering the original content.
+
+        Args:
+            images (`torch.Tensor`):
+                The images to pad, indexed as `(batch_size, num_channels, height, width)`.
+            background_color (`int` or `tuple[int, int, int]`, *optional*, defaults to 0):
+                The color to use for the padding. Can be an integer for single-channel images or a
+                tuple of integers with one value per channel for multi-channel images. If passed as
+                an integer in multi-channel mode, only the first channel uses it and the remaining
+                channels are padded with `0`.
+
+        Returns:
+            `torch.Tensor`: The padded images. The input is returned unchanged when it is already square.
+
+        Raises:
+            ValueError: If `background_color` is a sequence whose length differs from the number of channels.
+        """
+        height, width = images.shape[-2:]
+        num_channels = images.shape[1]
+        batch_size = images.shape[0]
+
+        if height == width:
+            return images
+
+        max_dim = max(height, width)
+
+        # Ensure background_color is the correct shape
+        if isinstance(background_color, int):
+            background_color = [background_color]
+        elif len(background_color) != num_channels:
+            raise ValueError(
+                f"background_color must have exactly {num_channels} elements to match the number of channels"
+            )
+
+        # Start from an all-zero canvas, then paint each channel that has a color assigned.
+        padded_images = torch.zeros(
+            (batch_size, num_channels, max_dim, max_dim), dtype=images.dtype, device=images.device
+        )
+        for i, color in enumerate(background_color):
+            padded_images[:, i, :, :] = color
+        # Paste the original content in the middle of the shorter axis.
+        if width > height:
+            start = (max_dim - height) // 2
+            padded_images[:, :, start : start + height, :] = images
+        else:
+            start = (max_dim - width) // 2
+            padded_images[:, :, :, start : start + width] = images
+
+        return padded_images
+
+    def _preprocess(
+        self,
+        images: list["torch.Tensor"],
+        do_resize: bool,
+        size: SizeDict,
+        min_size: int,
+        interpolation: Optional["F.InterpolationMode"],
+        do_rescale: bool,
+        rescale_factor: float,
+        do_normalize: bool,
+        image_mean: Optional[Union[float, list[float]]],
+        image_std: Optional[Union[float, list[float]]],
+        disable_grouping: Optional[bool],
+        return_tensors: Optional[Union[str, TensorType]],
+        do_pad: bool = True,
+        **kwargs,
+    ) -> BatchFeature:
+        """
+        Run the pipeline on a list of image tensors: optional resize, optional pad-to-square with
+        `self.background_color`, then rescale/normalize.
+
+        Images are grouped by shape so that each step runs on stacked batches, and are put back in
+        their original order afterwards.
+
+        Returns:
+            `BatchFeature`: Holds the processed images under `"pixel_values"`. They are stacked into a
+            single tensor when `return_tensors` is truthy, otherwise kept as a list.
+        """
+        # Group images by size for batched resizing
+        grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping)
+        resized_images_grouped = {}
+        for shape, stacked_images in grouped_images.items():
+            if do_resize:
+                stacked_images = self.resize(
+                    image=stacked_images, size=size, min_size=min_size, interpolation=interpolation
+                )
+            resized_images_grouped[shape] = stacked_images
+        resized_images = reorder_images(resized_images_grouped, grouped_images_index)
+
+        # Group images by size for further processing
+        # Needed in case do_resize is False, or resize returns images with different sizes
+        grouped_images, grouped_images_index = group_images_by_shape(resized_images, disable_grouping=disable_grouping)
+        processed_images_grouped = {}
+        for shape, stacked_images in grouped_images.items():
+            if do_pad:
+                stacked_images = self.pad_to_square(stacked_images, background_color=self.background_color)
+            # Fused rescale and normalize
+            stacked_images = self.rescale_and_normalize(
+                stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std
+            )
+            processed_images_grouped[shape] = stacked_images
+
+        processed_images = reorder_images(processed_images_grouped, grouped_images_index)
+        processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images
+
+        return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors)
+
+    def postprocess(
+        self,
+        images: ImageInput,
+        do_rescale: Optional[bool] = None,
+        rescale_factor: Optional[float] = None,
+        do_normalize: Optional[bool] = None,
+        image_mean: Optional[list[float]] = None,
+        image_std: Optional[list[float]] = None,
+        return_tensors: Optional[str] = None,
+    ) -> BatchFeature:
+        """
+        Map model-space images back towards displayable images by re-running `preprocess` with
+        inverted statistics and with resizing and padding disabled.
+
+        Args:
+            images (`ImageInput`):
+                Images to post-process.
+            do_rescale (`bool`, *optional*):
+                Whether to rescale. Falls back to `self.do_rescale`. When enabled, the result is clipped
+                to `[0, 255]` and cast to `torch.uint8`.
+            rescale_factor (`float`, *optional*):
+                Rescale factor. Defaults to `1.0 / self.rescale_factor`.
+            do_normalize (`bool`, *optional*):
+                Whether to apply the (inverted) normalization. Falls back to `self.do_normalize`.
+            image_mean (`list[float]`, *optional*):
+                Mean that was used for normalization. Falls back to `self.image_mean`.
+            image_std (`list[float]`, *optional*):
+                Standard deviation that was used for normalization. Falls back to `self.image_std`.
+            return_tensors (`str`, *optional*):
+                Tensor type of the returned batch. The special value `"PIL.Image.Image"` converts the
+                images to PIL images, provided both `do_normalize` and `do_rescale` are enabled.
+
+        Returns:
+            `BatchFeature`: Holds the post-processed images under `"pixel_values"`.
+        """
+        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
+        rescale_factor = 1.0 / self.rescale_factor if rescale_factor is None else rescale_factor
+        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
+        image_mean = image_mean if image_mean is not None else self.image_mean
+        image_std = image_std if image_std is not None else self.image_std
+        # Statistics handed to `preprocess` in place of the forward ones: -rescale_factor * mean / std
+        # and 1 / std.
+        image_mean = tuple(-rescale_factor * mean / std for mean, std in zip(image_mean, image_std))
+        image_std = tuple(1 / std for std in image_std)
+
+        images = self.preprocess(
+            images,
+            do_rescale=do_rescale,
+            rescale_factor=rescale_factor,
+            do_normalize=do_normalize,
+            image_mean=image_mean,
+            image_std=image_std,
+            do_resize=False,
+            do_pad=False,
+            return_tensors=return_tensors,
+        ).pixel_values
+        if do_rescale:
+            images = [image.clip(0, 255).to(torch.uint8) for image in images]
+
+        if do_normalize and do_rescale and return_tensors == "PIL.Image.Image":
+            images = [F.to_pil_image(image) for image in images]
+
+        data = {"pixel_values": images}
+        # "PIL.Image.Image" is handled above, so it is not passed on as a tensor type.
+        return_tensors = return_tensors if return_tensors != "PIL.Image.Image" else None
+
+        return BatchFeature(data=data, tensor_type=return_tensors)
+
+
+# Names exported by a star-import of this module.
+__all__ = ["JanusImageProcessorFast"]
diff --git a/phivenv/Lib/site-packages/transformers/models/janus/modeling_janus.py b/phivenv/Lib/site-packages/transformers/models/janus/modeling_janus.py
new file mode 100644
index 0000000000000000000000000000000000000000..c29c98596658315a6964d42954e9342e38f886de
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/janus/modeling_janus.py
@@ -0,0 +1,1435 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/janus/modular_janus.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_janus.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# coding=utf-8
+# Copyright 2025 Deepseek AI and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import copy
+from dataclasses import dataclass
+from typing import Callable, Optional, Union
+
+import torch
+from torch import nn
+
+from ...activations import ACT2FN
+from ...cache_utils import Cache
+from ...generation import ClassifierFreeGuidanceLogitsProcessor, GenerationMixin, GenerationMode, LogitsProcessorList
+from ...generation.utils import GenerateDecoderOnlyOutput
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, ModelOutput
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from ...processing_utils import Unpack
+from ...utils import (
+    TransformersKwargs,
+    auto_docstring,
+    can_return_tuple,
+    is_torch_available,
+    logging,
+    torch_int,
+)
+from ..auto import AutoModel
+from .configuration_janus import JanusConfig, JanusVisionConfig, JanusVQVAEConfig
+
+
+if is_torch_available():
+    import torch.nn.functional as F
+
+
+logger = logging.get_logger(__name__)
+
+
+# Common base class for the Janus models in this file (`JanusVisionModel` below derives from it).
+# The body only sets class-level flags; presumably they are read by `PreTrainedModel` — see that
+# class for their exact meaning.
+@auto_docstring
+class JanusPreTrainedModel(PreTrainedModel):
+    config: JanusConfig
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["LlamaDecoderLayer", "JanusVisionEncoderLayer"]
+    _skip_keys_device_placement = ["past_key_values", "causal_mask"]
+    _supports_flash_attn = True
+    _supports_sdpa = True
+
+    _can_compile_fullgraph = True
+    _supports_param_buffer_assignment = False
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Base class for Janus VQ-VAE mode model outputs.
+    """
+)
+class JanusVQVAEOutput(ModelOutput):
+    r"""
+    decoded_pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
+        Reconstructed pixel values after encoding and decoding the input.
+    embedding_loss (`torch.FloatTensor`):
+        Embedding loss.
+    """
+
+    # Both fields default to `None`, so both are annotated as optional.
+    decoded_pixel_values: Optional[torch.FloatTensor] = None
+    embedding_loss: Optional[torch.FloatTensor] = None
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Base class for Janus model's outputs that may also contain a past key/values (to speed up sequential decoding).
+    """
+)
+class JanusBaseModelOutputWithPast(ModelOutput):
+    r"""
+    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+        Sequence of hidden-states at the output of the last layer of the model.
+
+        If `past_key_values` is used, only the last hidden-state of the sequences, of shape `(batch_size, 1,
+        hidden_size)`, is output.
+    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
+        `(batch_size, num_heads, sequence_length, embed_size_per_head)` and optionally, if
+        `config.is_encoder_decoder=True`, 2 additional tensors of shape `(batch_size, num_heads,
+        encoder_sequence_length, embed_size_per_head)`.
+
+        Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
+        `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
+        input) to speed up sequential decoding.
+    image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
+        Tuple of `torch.FloatTensor` (one for the output of the image embeddings), of shape `(batch_size,
+        num_images, sequence_length, hidden_size)`.
+
+        image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver
+    """
+
+    # `hidden_states` and `attentions` have no entry in the docstring above.
+    last_hidden_state: Optional[torch.FloatTensor] = None
+    past_key_values: Optional[tuple[tuple[torch.FloatTensor]]] = None
+    hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    attentions: Optional[tuple[torch.FloatTensor]] = None
+    image_hidden_states: Optional[tuple[torch.FloatTensor]] = None
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Base class for Janus causal language model (or autoregressive) outputs.
+    """
+)
+class JanusCausalLMOutputWithPast(ModelOutput):
+    r"""
+    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+        Language modeling loss (for next-token prediction).
+    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
+        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
+        `(batch_size, num_heads, sequence_length, embed_size_per_head)`.
+
+        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
+        `past_key_values` input) to speed up sequential decoding.
+    image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
+        Tuple of `torch.FloatTensor` (one for the output of the image embeddings), of shape `(batch_size,
+        num_images, sequence_length, hidden_size)`.
+
+        image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver
+    """
+
+    # `hidden_states` and `attentions` have no entry in the docstring above.
+    loss: Optional[torch.FloatTensor] = None
+    logits: Optional[torch.FloatTensor] = None
+    past_key_values: Optional[list[torch.FloatTensor]] = None
+    hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    attentions: Optional[tuple[torch.FloatTensor]] = None
+    image_hidden_states: Optional[tuple[torch.FloatTensor]] = None
+
+
+class JanusVisionEmbeddings(nn.Module):
+    """Converts pixel values into a sequence of patch embeddings and adds learned position embeddings."""
+
+    def __init__(self, config: JanusVisionConfig):
+        super().__init__()
+        self.config = config
+        self.embed_dim = config.hidden_size
+        self.image_size = config.image_size
+        self.patch_size = config.patch_size
+
+        # Kernel size and stride are both `patch_size`, so patches do not overlap.
+        self.patch_embedding = nn.Conv2d(
+            in_channels=config.num_channels,
+            out_channels=self.embed_dim,
+            kernel_size=self.patch_size,
+            stride=self.patch_size,
+            padding="valid",
+        )
+
+        patches_per_side = self.image_size // self.patch_size
+        self.num_patches = patches_per_side**2
+        self.num_positions = self.num_patches
+        self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
+        position_ids = torch.arange(self.num_positions).expand((1, -1))
+        self.register_buffer("position_ids", position_ids, persistent=False)
+
+    def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
+        """
+        Resample the learned position embeddings to the patch grid of a `height` x `width` input, so the model
+        can be used at resolutions other than the one it was built for. Also supports torch.jit tracing; there
+        is no class embedding to handle.
+
+        Adapted from:
+        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
+        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
+        """
+
+        num_patches = embeddings.shape[1]
+        num_positions = self.position_embedding.weight.shape[0]
+
+        # Tracing always takes the interpolation path so the exported model handles dynamic input shapes.
+        needs_interpolation = torch.jit.is_tracing() or num_patches != num_positions or height != width
+        if not needs_interpolation:
+            return self.position_embedding(self.position_ids)
+
+        dim = embeddings.shape[-1]
+        grid_height = height // self.patch_size
+        grid_width = width // self.patch_size
+        side = torch_int(num_positions**0.5)
+
+        # (1, positions, dim) -> (1, dim, side, side), the layout `interpolate` expects.
+        pos_grid = self.position_embedding.weight.unsqueeze(0)
+        pos_grid = pos_grid.reshape(1, side, side, dim).permute(0, 3, 1, 2)
+        pos_grid = nn.functional.interpolate(
+            pos_grid,
+            size=(grid_height, grid_width),
+            mode="bicubic",
+            align_corners=False,
+        )
+
+        # Back to (1, grid_height * grid_width, dim).
+        return pos_grid.permute(0, 2, 3, 1).view(1, -1, dim)
+
+    def forward(self, pixel_values: torch.Tensor, interpolate_pos_encoding: bool = False) -> torch.Tensor:
+        """Embed `pixel_values` patch by patch and add (optionally interpolated) position embeddings."""
+        _, _, height, width = pixel_values.shape
+        weight_dtype = self.patch_embedding.weight.dtype
+        patch_embeds = self.patch_embedding(pixel_values.to(dtype=weight_dtype))
+        # Collapse the spatial grid into a sequence axis and move channels last.
+        embeddings = patch_embeds.flatten(2).transpose(1, 2)
+
+        pos_embeds = (
+            self.interpolate_pos_encoding(embeddings, height, width)
+            if interpolate_pos_encoding
+            else self.position_embedding(self.position_ids)
+        )
+        return embeddings + pos_embeds
+
+
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """
+    Repeat every key/value head `n_rep` times along the head axis; equivalent to
+    torch.repeat_interleave(x, dim=1, repeats=n_rep). Takes (batch, num_key_value_heads, seqlen, head_dim)
+    and returns (batch, num_key_value_heads * n_rep, seqlen, head_dim). The input itself is returned
+    when `n_rep` is 1.
+    """
+    batch, kv_heads, seq_len, head_dim = hidden_states.shape
+    if n_rep == 1:
+        return hidden_states
+    # Insert a repeat axis right after the head axis, broadcast it, then merge it into the heads.
+    expanded = hidden_states.unsqueeze(2).expand(batch, kv_heads, n_rep, seq_len, head_dim)
+    return expanded.reshape(batch, kv_heads * n_rep, seq_len, head_dim)
+
+
+def eager_attention_forward(
+    module: nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: Optional[torch.Tensor],
+    scaling: float,
+    dropout: float = 0.0,
+    **kwargs: Unpack[TransformersKwargs],
+):
+    """
+    Plain matmul/softmax attention.
+
+    `key` and `value` heads are first repeated `module.num_key_value_groups` times. If `attention_mask`
+    is given it is added to the scaled scores, sliced on its last axis to the key length. The softmax
+    runs in float32 and is cast back to the dtype of `query`; dropout is active only while
+    `module.training` is true.
+
+    Returns:
+        A tuple `(attn_output, attn_weights)`; `attn_output` has axes 1 and 2 swapped relative to the
+        attention result and is contiguous.
+    """
+    num_groups = module.num_key_value_groups
+    key_states = repeat_kv(key, num_groups)
+    value_states = repeat_kv(value, num_groups)
+
+    scores = torch.matmul(query, key_states.transpose(2, 3)) * scaling
+    if attention_mask is not None:
+        scores = scores + attention_mask[:, :, :, : key_states.shape[-2]]
+
+    attn_weights = nn.functional.softmax(scores, dim=-1, dtype=torch.float32).to(query.dtype)
+    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
+    attn_output = torch.matmul(attn_weights, value_states).transpose(1, 2).contiguous()
+
+    return attn_output, attn_weights
+
+
+class JanusVisionAttention(nn.Module):
+    """Attention Class for Janus Vision Encoder"""
+
+    def __init__(self, config: JanusVisionConfig):
+        super().__init__()
+        self.config = config
+        self.embed_dim = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = self.embed_dim // self.num_heads
+        if self.head_dim * self.num_heads != self.embed_dim:
+            raise ValueError(
+                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
+                f" {self.num_heads})."
+            )
+        self.scale = self.head_dim**-0.5
+        self.attention_dropout = config.attention_dropout
+        proj_dropout = config.projection_dropout
+        qk_norm = config.use_qk_norm
+        self.is_causal = False
+
+        # Queries, keys and values all use `num_heads` heads here, so `eager_attention_forward` is told
+        # to repeat key/value heads once (i.e. not at all) via `num_key_value_groups = 1`.
+        self.num_key_value_groups = 1
+
+        self.q_proj = nn.Linear(self.embed_dim, self.num_heads * self.head_dim, bias=config.attention_bias)
+        self.k_proj = nn.Linear(self.embed_dim, self.num_heads * self.head_dim, bias=config.attention_bias)
+        self.v_proj = nn.Linear(self.embed_dim, self.num_heads * self.head_dim, bias=config.attention_bias)
+        self.projection_layer = nn.Linear(self.embed_dim, self.embed_dim)
+        # Dropout after the output projection is only instantiated when its rate is positive.
+        self.projection_dropout = nn.Dropout(proj_dropout) if proj_dropout > 0 else nn.Identity()
+
+        # NOTE(review): these norms are built over `embed_dim`, but `forward` applies them to tensors
+        # whose last dimension is `head_dim`. With `use_qk_norm=True` that looks like it only matches
+        # when `num_heads == 1` — confirm before enabling.
+        self.q_norm = nn.LayerNorm(self.embed_dim) if qk_norm else nn.Identity()
+        self.k_norm = nn.LayerNorm(self.embed_dim) if qk_norm else nn.Identity()
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ):
+        """
+        Self-attention over `hidden_states`, read as `(batch_size, seq_len, embed_dim)`.
+
+        Returns:
+            A tuple `(output, attn_weights)`: `output` is the projected attention result reshaped back to
+            `(batch_size, seq_len, embed_dim)`; `attn_weights` is whatever the selected attention
+            implementation returns as its second value.
+        """
+        batch_size, seq_len, _ = hidden_states.size()
+
+        query_states = self.q_proj(hidden_states)
+        key_states = self.k_proj(hidden_states)
+        value_states = self.v_proj(hidden_states)
+
+        # Queries and keys are flattened to (batch * seq, heads, head_dim) for the optional norm.
+        query_states = query_states.reshape(-1, self.num_heads, self.head_dim)
+        query_states = self.q_norm(query_states)
+
+        key_states = key_states.reshape(-1, self.num_heads, self.head_dim)
+        key_states = self.k_norm(key_states)
+
+        # Bring all three to (batch, heads, seq, head_dim).
+        query_states = query_states.reshape(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = key_states.reshape(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
+
+        # Use the eager implementation unless the config selects another registered one.
+        attention_interface: Callable = eager_attention_forward
+        if self.config._attn_implementation != "eager":
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            dropout=0.0 if not self.training else self.attention_dropout,
+            scaling=self.scale,
+            is_causal=self.is_causal,
+            **kwargs,
+        )
+        attn_output = attn_output.reshape(batch_size, seq_len, self.embed_dim)
+
+        output = self.projection_layer(attn_output)
+        output = self.projection_dropout(output)
+        return output, attn_weights
+
+
+class JanusVisionMLP(nn.Module):
+    """Two-layer feed-forward block: linear, activation, dropout, linear, dropout."""
+
+    def __init__(self, config: JanusVisionConfig):
+        super().__init__()
+        self.config = config
+        hidden_size = config.hidden_size
+        # The inner width is the hidden size scaled by `mlp_ratio`, truncated to an int.
+        self.intermediate_size = int(hidden_size * config.mlp_ratio)
+        # Activation is looked up by the name stored in the config.
+        self.activation_fn = ACT2FN[config.hidden_act]
+        self.fc1 = nn.Linear(hidden_size, self.intermediate_size)
+        self.fc2 = nn.Linear(self.intermediate_size, hidden_size)
+        self.dropout1 = nn.Dropout(config.hidden_dropout_rate)
+        self.dropout2 = nn.Dropout(config.hidden_dropout_rate)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        """Apply the feed-forward block to `hidden_states`."""
+        expanded = self.dropout1(self.activation_fn(self.fc1(hidden_states)))
+        return self.dropout2(self.fc2(expanded))
+
+
+class JanusVisionEncoderLayer(GradientCheckpointingLayer):
+    """One encoder layer: layer norm + self-attention, then layer norm + MLP, each with a residual add."""
+
+    def __init__(self, config: JanusVisionConfig):
+        super().__init__()
+        self.embed_dim = config.hidden_size
+        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+        self.self_attn = JanusVisionAttention(config)
+        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+        self.mlp = JanusVisionMLP(config)
+        self.config = config
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: torch.Tensor,
+        output_attentions: Optional[bool] = False,
+    ) -> tuple[torch.FloatTensor]:
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`):
+                Input to the layer of shape `(batch, seq_len, embed_dim)`.
+            attention_mask (`torch.FloatTensor`):
+                Attention mask of shape `(batch, 1, q_len, k_v_seq_len)` where padding elements are indicated by very large negative values.
+            output_attentions (`bool`, *optional*, defaults to `False`):
+                Whether to append the attention weights of this layer to the returned tuple.
+
+        Returns:
+            `(hidden_states,)`, or `(hidden_states, attn_weights)` when `output_attentions` is truthy.
+        """
+        # Attention sub-block: the norm is applied before attention, the residual is added after.
+        attn_output, attn_weights = self.self_attn(
+            hidden_states=self.layer_norm1(hidden_states),
+            attention_mask=attention_mask,
+            output_attentions=output_attentions,
+        )
+        hidden_states = hidden_states + attn_output
+
+        # Feed-forward sub-block, same pattern.
+        hidden_states = hidden_states + self.mlp(self.layer_norm2(hidden_states))
+
+        return (hidden_states, attn_weights) if output_attentions else (hidden_states,)
+
+
+class JanusVisionEncoder(nn.Module):
+    """
+    Stack of `config.num_hidden_layers` [`JanusVisionEncoderLayer`] modules applied one after another.
+
+    Args:
+        config: JanusVisionConfig
+    """
+
+    def __init__(self, config: JanusVisionConfig):
+        super().__init__()
+        self.config = config
+        self.layers = nn.ModuleList([JanusVisionEncoderLayer(config) for _ in range(config.num_hidden_layers)])
+        self.gradient_checkpointing = False
+
+    # Ignore copy
+    @can_return_tuple
+    def forward(
+        self,
+        inputs_embeds,
+        attention_mask: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+    ) -> BaseModelOutput:
+        r"""
+        Args:
+            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+                Embedded input sequence that is fed to the first layer.
+            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
+
+                [What are attention masks?](../glossary#attention-mask)
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. Falls back to
+                `config.output_attentions` when `None`.
+            output_hidden_states (`bool`, *optional*):
+                Whether or not to return the hidden states of all layers. Falls back to
+                `config.output_hidden_states` when `None`.
+            return_dict (`bool`, *optional*):
+                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. Not a named
+                parameter of this function; presumably consumed by the `@can_return_tuple` decorator.
+        """
+        if output_attentions is None:
+            output_attentions = self.config.output_attentions
+        if output_hidden_states is None:
+            output_hidden_states = self.config.output_hidden_states
+
+        encoder_states = () if output_hidden_states else None
+        all_attentions = () if output_attentions else None
+
+        hidden_states = inputs_embeds
+        for layer in self.layers:
+            # Record the input of every layer; the final output is appended after the loop.
+            if output_hidden_states:
+                encoder_states += (hidden_states,)
+
+            layer_outputs = layer(hidden_states, attention_mask, output_attentions=output_attentions)
+            hidden_states = layer_outputs[0]
+
+            if output_attentions:
+                all_attentions += (layer_outputs[1],)
+
+        if output_hidden_states:
+            encoder_states += (hidden_states,)
+
+        return BaseModelOutput(
+            last_hidden_state=hidden_states,
+            hidden_states=encoder_states,
+            attentions=all_attentions,
+        )
+
+
+# Vision tower: patch/position embeddings -> transformer encoder -> final layer norm.
+@auto_docstring
+class JanusVisionModel(JanusPreTrainedModel):
+    main_input_name = "pixel_values"
+    config: JanusVisionConfig
+
+    def __init__(self, config: JanusVisionConfig):
+        super().__init__(config)
+        self.config = config
+        embed_dim = config.hidden_size
+
+        self.embeddings = JanusVisionEmbeddings(config)
+        self.encoder = JanusVisionEncoder(config)
+        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
+
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        interpolate_pos_encoding: bool = False,
+    ) -> Union[tuple, BaseModelOutputWithPooling]:
+        # Unset flags fall back to the values stored on the config.
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if pixel_values is None:
+            raise ValueError("You have to specify pixel_values")
+
+        hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding)
+
+        # `return_dict` is not a named parameter of `JanusVisionEncoder.forward`; presumably its
+        # `@can_return_tuple` decorator consumes it — confirm.
+        encoder_outputs = self.encoder(
+            inputs_embeds=hidden_states,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        last_hidden_state = encoder_outputs[0]
+        last_hidden_state = self.post_layernorm(last_hidden_state)
+
+        # The pooled output is the first sequence position of the already-normalized hidden state.
+        # NOTE(review): `post_layernorm` is then applied to it a second time — confirm this is
+        # intended before changing it, since it affects the values returned as `pooler_output`.
+        pooled_output = last_hidden_state[:, 0, :]
+        pooled_output = self.post_layernorm(pooled_output)
+
+        if not return_dict:
+            return (last_hidden_state, pooled_output) + encoder_outputs[1:]
+
+        return BaseModelOutputWithPooling(
+            last_hidden_state=last_hidden_state,
+            pooler_output=pooled_output,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+        )
+
+    def get_input_embeddings(self):
+        # Returns the whole embeddings module (patch + position embeddings).
+        return self.embeddings
+
+
+class JanusVisionAlignerMLP(nn.Module):
+    """Linear stack mapping `config.hidden_size` features to `config.projection_dim` features.
+
+    `fc1` performs the change of width; it is followed by `config.depth - 1` square linear layers. The
+    activation `ACT2FN[config.hidden_act]` is applied before each of those extra layers and never after
+    the last one.
+    """
+
+    def __init__(self, config: JanusVisionConfig):
+        super().__init__()
+
+        width = config.projection_dim
+        self.fc1 = nn.Linear(config.hidden_size, width)
+        extra_layers = [nn.Linear(width, width) for _ in range(config.depth - 1)]
+        self.hidden_layers = nn.ModuleList(extra_layers)
+        self.activation_fn = ACT2FN[config.hidden_act]
+
+    def forward(self, hidden_states):
+        """Apply `fc1`, then (activation -> linear) once per entry of `hidden_layers`."""
+        projected = self.fc1(hidden_states)
+        for hidden_layer in self.hidden_layers:
+            projected = hidden_layer(self.activation_fn(projected))
+        return projected
+
+
+class JanusVQVAEVectorQuantizer(nn.Module):
+    """
+    Vector quantizer backed by a learned codebook stored in an `nn.Embedding`.
+
+    The quantization is similar to the one described in the VQ-VAE (Vector Quantized Variational
+    AutoEncoder) paper: every continuous input vector is replaced by the codebook vector closest to it
+    (squared Euclidean distance).
+    """
+
+    def __init__(self, config: JanusVQVAEConfig):
+        super().__init__()
+        self.num_embeddings = config.num_embeddings
+        self.embedding_dim = config.embed_dim
+        # `beta` is optional on the config; 0.25 is used when it is absent.
+        self.beta = getattr(config, "beta", 0.25)
+
+        self.embedding = nn.Embedding(self.num_embeddings, self.embedding_dim)
+        # Spatial grid used by `get_codebook_entry` to un-flatten a token sequence.
+        self.quant_state_dims = [config.num_patches] * 2
+
+    def forward(self, hidden_state: torch.Tensor):
+        """Quantize a 4-D, channels-in-dim-1 feature map.
+
+        Returns:
+            A 3-tuple `(quantized, loss, indices)`: the quantized map in the input layout (built so that
+            gradients pass straight through to `hidden_state`), the scalar quantization loss, and the flat
+            tensor of selected codebook indices (one per spatial position).
+        """
+        # Move channels last so each spatial position becomes one row of size `embedding_dim`.
+        channels_last = hidden_state.permute(0, 2, 3, 1).contiguous()
+        flat_vectors = channels_last.view(-1, self.embedding_dim)
+        codebook = self.embedding.weight
+
+        # Squared distance expanded as (z - e)^2 = z^2 + e^2 - 2 e * z
+        vector_sq_norms = torch.sum(flat_vectors**2, dim=1, keepdim=True)
+        code_sq_norms = torch.sum(codebook**2, dim=1)
+        cross_terms = torch.einsum("bd,dn->bn", flat_vectors, codebook.transpose(0, 1))
+        distances = vector_sq_norms + code_sq_norms - 2 * cross_terms
+
+        min_encoding_indices = torch.argmin(distances, dim=1)
+        quantized = self.embedding(min_encoding_indices).view(channels_last.shape)
+
+        # First term: codebook output detached. Second term: input detached, weighted by `beta`.
+        input_side_term = torch.mean((quantized.detach() - channels_last) ** 2)
+        codebook_side_term = torch.mean((quantized - channels_last.detach()) ** 2)
+        loss = input_side_term + self.beta * codebook_side_term
+
+        # Forward value is `quantized`; the detached difference lets gradients reach the input unchanged.
+        quantized = channels_last + (quantized - channels_last).detach()
+
+        # Restore the channels-in-dim-1 layout of the input.
+        quantized = quantized.permute(0, 3, 1, 2).contiguous()
+
+        return quantized, loss, min_encoding_indices
+
+    def get_codebook_entry(self, image_tokens: torch.LongTensor) -> torch.FloatTensor:
+        """Look up codebook vectors for `image_tokens` and arrange them as a feature map.
+
+        The looked-up vectors are L2-normalized along their last dimension, reshaped to
+        `(batch, *quant_state_dims, embedding_dim)` and finally permuted so the embedding dimension is dim 1.
+        """
+        code_vectors = F.normalize(self.embedding(image_tokens), p=2, dim=-1)
+
+        grid_shape = (image_tokens.shape[0], *self.quant_state_dims, self.embedding.weight.shape[-1])
+        return code_vectors.view(grid_shape).permute(0, 3, 1, 2).contiguous()
+
+
+class JanusVQVAEResnetBlock(nn.Module):
+    """Residual block: (GroupNorm -> x * sigmoid(x) -> 3x3 conv) twice, added to a skip connection.
+
+    Args:
+        config: Object providing `dropout`, the dropout probability applied before the second convolution.
+        in_channels: Number of input channels.
+        out_channels: Number of output channels. Defaults to `in_channels` when `None`.
+        conv_shortcut: When the channel count changes, selects a 3x3 convolution (`True`) or a 1x1
+            convolution (`False`) to project the skip connection.
+    """
+
+    def __init__(
+        self,
+        config,
+        in_channels,
+        out_channels=None,
+        conv_shortcut=False,
+    ):
+        super().__init__()
+        self.in_channels = in_channels
+        self.out_channels = in_channels if out_channels is None else out_channels
+        self.use_conv_shortcut = conv_shortcut
+
+        # Build every layer from the resolved width so that the documented `out_channels=None` default
+        # works instead of handing `None` to Conv2d / GroupNorm.
+        out_channels = self.out_channels
+
+        self.norm1 = torch.nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True)
+        self.conv1 = torch.nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
+        self.norm2 = torch.nn.GroupNorm(num_groups=32, num_channels=out_channels, eps=1e-6, affine=True)
+        self.dropout = torch.nn.Dropout(config.dropout)
+        self.conv2 = torch.nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)
+        # A projection on the skip path is only needed when the channel count changes.
+        if self.in_channels != self.out_channels:
+            if self.use_conv_shortcut:
+                self.conv_shortcut = torch.nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
+            else:
+                self.nin_shortcut = torch.nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0)
+
+    def forward(self, hidden_states):
+        """Return `shortcut(hidden_states) + block(hidden_states)`."""
+        residual = hidden_states
+        hidden_states = self.norm1(hidden_states)
+        # In-place on the fresh GroupNorm output, so `residual` is not modified.
+        hidden_states *= torch.sigmoid(hidden_states)
+        hidden_states = self.conv1(hidden_states)
+
+        hidden_states = self.norm2(hidden_states)
+        hidden_states *= torch.sigmoid(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = self.conv2(hidden_states)
+
+        if self.in_channels != self.out_channels:
+            if self.use_conv_shortcut:
+                residual = self.conv_shortcut(residual)
+            else:
+                residual = self.nin_shortcut(residual)
+
+        return residual + hidden_states
+
+
+class JanusVQVAEAttnBlock(nn.Module):
+    """Self-attention across the spatial positions of a 4-D feature map, with a residual connection.
+
+    Queries, keys, values and the output projection are all 1x1 convolutions that keep `in_channels`.
+    """
+
+    def __init__(self, in_channels):
+        super().__init__()
+        self.in_channels = in_channels
+
+        def pointwise_conv():
+            # 1x1 convolution that keeps the channel count.
+            return torch.nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)
+
+        self.norm = torch.nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True)
+        self.q = pointwise_conv()
+        self.k = pointwise_conv()
+        self.v = pointwise_conv()
+        self.proj_out = pointwise_conv()
+
+    def forward(self, hidden_states):
+        """Return `hidden_states + proj_out(attention(norm(hidden_states)))`."""
+        normed = self.norm(hidden_states)
+        query_states = self.q(normed)
+        key_states = self.k(normed)
+        value_states = self.v(normed)
+
+        # Flatten the two spatial dims into one axis of `height * width` positions.
+        batch_size, channels, height, width = query_states.shape
+        flat_shape = (batch_size, channels, height * width)
+        query_states = query_states.reshape(flat_shape).permute(0, 2, 1)
+        key_states = key_states.reshape(flat_shape)
+        value_states = value_states.reshape(flat_shape)
+
+        # (positions x positions) scores, scaled by channels ** -0.5 and normalized over the key axis.
+        scores = torch.bmm(query_states, key_states) * (int(channels) ** (-0.5))
+        attn_weights = F.softmax(scores, dim=2)
+
+        # Weighted sum of values, then back to the (batch, channels, height, width) layout.
+        attn_output = torch.bmm(value_states, attn_weights.permute(0, 2, 1))
+        attn_output = attn_output.reshape(batch_size, channels, height, width)
+
+        return hidden_states + self.proj_out(attn_output)
+
+
+class JanusVQVAEConvDownsample(nn.Module):
+    """Stride-2 3x3 convolution preceded by one zero of padding on the right and bottom edges only."""
+
+    def __init__(self, in_channels):
+        super().__init__()
+        self.conv = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=2, padding=0)
+
+    def forward(self, hidden_states):
+        # The convolution itself is unpadded; the asymmetric padding is applied by hand beforehand.
+        padded = F.pad(hidden_states, pad=(0, 1, 0, 1), mode="constant", value=0)
+        return self.conv(padded)
+
+
+class JanusVQVAEConvUpsample(nn.Module):
+    """Nearest-neighbour 2x upsampling followed by a channel-preserving 3x3 convolution."""
+
+    def __init__(self, in_channels):
+        super().__init__()
+        self.conv = torch.nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1)
+
+    def forward(self, hidden_states):
+        upsampled = F.interpolate(hidden_states, scale_factor=2.0, mode="nearest")
+        return self.conv(upsampled)
+
+
+class JanusVQVAEMidBlock(nn.Module):
+    """ResNet block -> attention block -> ResNet block, all at a constant channel width."""
+
+    def __init__(self, config: JanusVQVAEConfig, channels: int):
+        super().__init__()
+        resnet_kwargs = {"config": config, "in_channels": channels, "out_channels": channels}
+        self.block_1 = JanusVQVAEResnetBlock(**resnet_kwargs)
+        self.attn_1 = JanusVQVAEAttnBlock(channels)
+        self.block_2 = JanusVQVAEResnetBlock(**resnet_kwargs)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        """Apply `block_1`, `attn_1` and `block_2` in that order."""
+        for stage in (self.block_1, self.attn_1, self.block_2):
+            hidden_states = stage(hidden_states)
+        return hidden_states
+
+
+class JanusVQVAEEncoder(nn.Module):
+    """Convolutional encoder: input conv, a stack of resolution levels, a middle block and an output conv.
+
+    Each of the `len(config.channel_multiplier)` levels holds `config.num_res_blocks` ResNet blocks. The
+    last level pairs every ResNet block with an attention block; every other level ends with a downsampling
+    layer. The output convolution produces `latent_channels` channels, or twice that when
+    `config.double_latent` is set.
+    """
+
+    def __init__(self, config):
+        super().__init__()
+
+        self.num_resolutions = len(config.channel_multiplier)
+        self.num_res_blocks = config.num_res_blocks
+        base_channels = config.base_channels
+        in_channels = config.in_channels
+        double_latent = config.double_latent
+        latent_channels = config.latent_channels
+        channel_multiplier = config.channel_multiplier
+
+        self.conv_in = torch.nn.Conv2d(in_channels, base_channels, kernel_size=3, stride=1, padding=1)
+
+        # Level i maps `base * in_channel_multiplier[i]` channels to `base * channel_multiplier[i]` channels;
+        # the leading 1 makes the first level start from the width produced by `conv_in`.
+        in_channel_multiplier = (1,) + tuple(channel_multiplier)
+        self.in_channel_multiplier = in_channel_multiplier
+        self.down = nn.ModuleList()
+        for i_level in range(self.num_resolutions):
+            block = nn.ModuleList()
+            attn = nn.ModuleList()
+            block_in = base_channels * in_channel_multiplier[i_level]
+            block_out = base_channels * channel_multiplier[i_level]
+            for _ in range(self.num_res_blocks):
+                block.append(
+                    JanusVQVAEResnetBlock(
+                        config=config,
+                        in_channels=block_in,
+                        out_channels=block_out,
+                    )
+                )
+                block_in = block_out
+                # Attention is only attached to the blocks of the last level.
+                if i_level == self.num_resolutions - 1:
+                    attn.append(JanusVQVAEAttnBlock(block_in))
+
+            # Plain `nn.Module` used as a named container for this level's sub-modules.
+            down = nn.Module()
+            down.block = block
+            down.attn = attn
+            if i_level != self.num_resolutions - 1:
+                down.downsample = JanusVQVAEConvDownsample(block_in)
+            self.down.append(down)
+
+        self.mid = JanusVQVAEMidBlock(config, block_in)
+
+        self.norm_out = torch.nn.GroupNorm(num_groups=32, num_channels=block_in, eps=1e-6, affine=True)
+        self.conv_out = torch.nn.Conv2d(
+            block_in,
+            2 * latent_channels if double_latent else latent_channels,
+            kernel_size=3,
+            stride=1,
+            padding=1,
+        )
+
+    def forward(self, pixel_values: torch.FloatTensor):
+        """Encode `pixel_values` into a latent feature map.
+
+        Only the most recent feature map is kept while walking the levels; earlier ones are never read
+        again, so holding on to them would only increase peak memory.
+        """
+        # downsampling
+        hidden_state = self.conv_in(pixel_values)
+        for i_level in range(self.num_resolutions):
+            level = self.down[i_level]
+            for i_block in range(self.num_res_blocks):
+                hidden_state = level.block[i_block](hidden_state)
+                if len(level.attn) > 0:
+                    hidden_state = level.attn[i_block](hidden_state)
+            if i_level != self.num_resolutions - 1:
+                hidden_state = level.downsample(hidden_state)
+
+        # middle
+        last_hidden_state = self.mid(hidden_state)
+
+        # end: GroupNorm -> x * sigmoid(x) -> output conv
+        last_hidden_state = self.norm_out(last_hidden_state)
+        last_hidden_state *= torch.sigmoid(last_hidden_state)
+        last_hidden_state = self.conv_out(last_hidden_state)
+        return last_hidden_state
+
+
+class JanusVQVAEDecoder(nn.Module):
+    """Convolutional decoder: input conv, middle block, a stack of upsampling levels and an output conv.
+
+    `self.up` is built from the last entry of `config.channel_multiplier` down to the first, so `self.up[0]`
+    is the level that carries attention blocks and `self.up[-1]` is the only level without an upsampler.
+    Every level holds `config.num_res_blocks + 1` ResNet blocks.
+    """
+
+    def __init__(self, config):
+        super().__init__()
+
+        self.num_resolutions = len(config.channel_multiplier)
+        self.num_res_blocks = config.num_res_blocks
+        base_channels = config.base_channels
+        latent_channels = config.latent_channels
+        out_channels = config.out_channels
+
+        # Channel width at the first decoder level (last multiplier).
+        top_level = self.num_resolutions - 1
+        channels_in = base_channels * config.channel_multiplier[top_level]
+
+        # latent channels -> decoder width
+        self.conv_in = torch.nn.Conv2d(latent_channels, channels_in, kernel_size=3, stride=1, padding=1)
+
+        # middle
+        self.mid = JanusVQVAEMidBlock(config, channels_in)
+
+        # upsampling levels, created from the last multiplier to the first
+        self.up = nn.ModuleList()
+        for i_level in range(top_level, -1, -1):
+            resnet_blocks = nn.ModuleList()
+            attn_blocks = nn.ModuleList()
+            channels_out = base_channels * config.channel_multiplier[i_level]
+            for _ in range(self.num_res_blocks + 1):
+                resnet_blocks.append(
+                    JanusVQVAEResnetBlock(
+                        config=config,
+                        in_channels=channels_in,
+                        out_channels=channels_out,
+                    )
+                )
+                channels_in = channels_out
+                if i_level == top_level:
+                    attn_blocks.append(JanusVQVAEAttnBlock(channels_in))
+
+            # Plain `nn.Module` used as a named container for this level's sub-modules.
+            stage = nn.Module()
+            stage.block = resnet_blocks
+            stage.attn = attn_blocks
+            if i_level != 0:
+                stage.upsample = JanusVQVAEConvUpsample(channels_in)
+            self.up.append(stage)
+
+        # end
+        self.norm_out = torch.nn.GroupNorm(num_groups=32, num_channels=channels_in, eps=1e-6, affine=True)
+        self.conv_out = torch.nn.Conv2d(channels_in, out_channels, kernel_size=3, stride=1, padding=1)
+
+    def forward(self, hidden_state: torch.FloatTensor) -> torch.FloatTensor:
+        """Decode a latent feature map by running it through every stage in `self.up` order."""
+        hidden_state = self.mid(self.conv_in(hidden_state))
+
+        # upsampling
+        final_index = self.num_resolutions - 1
+        for i_level in range(self.num_resolutions):
+            stage = self.up[i_level]
+            has_attention = len(stage.attn) > 0
+            for i_block in range(self.num_res_blocks + 1):
+                hidden_state = stage.block[i_block](hidden_state)
+                if has_attention:
+                    hidden_state = stage.attn[i_block](hidden_state)
+            # The stage stored last was built without an `upsample` layer.
+            if i_level != final_index:
+                hidden_state = stage.upsample(hidden_state)
+
+        # end: GroupNorm -> x * sigmoid(x) -> output conv
+        hidden_state = self.norm_out(hidden_state)
+        hidden_state *= torch.sigmoid(hidden_state)
+        return self.conv_out(hidden_state)
+
+
+@auto_docstring(
+    custom_intro="""
+    The VQ-VAE model used in Janus for encoding/decoding images into discrete tokens.
+    This model follows the "Make-a-scene: Scene-based text-to-image generation with human priors" paper from
+    [ Oran Gafni, Adam Polyak, Oron Ashual, Shelly Sheynin, Devi Parikh, and Yaniv
+    Taigman](https://huggingface.co/papers/2203.13131).
+    """
+)
+class JanusVQVAE(JanusPreTrainedModel):
+    config: JanusVQVAEConfig
+    _no_split_modules = [
+        "JanusVQVAEAttnBlock",
+        "JanusVQVAEResnetBlock",
+        "JanusVQVAEVectorQuantizer",
+    ]
+    main_input_name = "pixel_values"
+
+    def __init__(self, config: JanusVQVAEConfig):
+        super().__init__(config)
+
+        self.encoder = JanusVQVAEEncoder(config)
+        self.quantize = JanusVQVAEVectorQuantizer(config)
+        self.quant_conv = torch.nn.Conv2d(config.latent_channels, config.embed_dim, 1)
+        self.post_quant_conv = torch.nn.Conv2d(config.embed_dim, config.latent_channels, 1)
+        self.decoder = JanusVQVAEDecoder(config)
+        # `eval()` only reaches sub-modules that already exist, so it must run after the decoder is
+        # created; otherwise the decoder (which contains dropout layers) would stay in training mode.
+        self.eval()  # Janus's VQ model is frozen
+        self.gradient_checkpointing = False
+
+        # Initialize the VQVAE model.
+        self.post_init()
+
+    def encode(self, pixel_values: torch.FloatTensor):
+        """
+        Encodes pixel values into quantized latents.
+        Args:
+            pixel_values (`torch.FloatTensor`): Batch of images fed to the encoder.
+        Returns:
+            The 3-tuple returned by the quantizer: the quantized feature map, the quantization loss and the
+            flat tensor of selected codebook indices.
+        """
+        hidden_states = self.encoder(pixel_values)
+        hidden_states = self.quant_conv(hidden_states)
+        quant, emb_loss, indices = self.quantize(hidden_states)
+        return quant, emb_loss, indices
+
+    def decode(self, image_tokens: torch.LongTensor) -> torch.FloatTensor:
+        """
+        Decodes quantized token IDs into pixel values.
+        Args:
+            image_tokens (torch.LongTensor): Batch of token IDs.
+        Returns:
+            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
+                Pixel values decoded from the token IDs.
+        Raises:
+            ValueError: If the second dimension of `image_tokens` does not equal the product of the
+                quantizer's `quant_state_dims`.
+        """
+        expected_tokens = self.quantize.quant_state_dims[0] * self.quantize.quant_state_dims[1]
+        if image_tokens.shape[1] != expected_tokens:
+            raise ValueError(
+                f"Expected `image_tokens` to have shape `(batch_size, {expected_tokens})`, "
+                f"but got shape `{image_tokens.shape}`."
+            )
+        codebook_entry = self.quantize.get_codebook_entry(image_tokens)
+        hidden_states = self.post_quant_conv(codebook_entry)
+        pixel_values = self.decoder(hidden_states)
+        return pixel_values
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        pixel_values: torch.FloatTensor,
+    ) -> tuple[torch.FloatTensor, torch.FloatTensor]:
+        batch_size = pixel_values.shape[0]
+        # Round trip: encode to codebook indices, then decode those indices (one row per image) back
+        # to pixel values. The quantized feature map itself is not used here.
+        quant, embedding_loss, indices = self.encode(pixel_values)
+        decoded_pixel_values = self.decode(indices.view(batch_size, -1))
+
+        return JanusVQVAEOutput(decoded_pixel_values, embedding_loss)
+
+
+class JanusVQVAEAlignerMLP(nn.Module):
+    """Linear stack mapping `config.embed_dim` features to `config.projection_dim` features.
+
+    `fc1` performs the change of width; it is followed by `config.num_hidden_layers - 1` square linear
+    layers. The activation `ACT2FN[config.hidden_act]` is applied before each of those extra layers and
+    never after the last one.
+    """
+
+    def __init__(self, config: JanusVQVAEConfig):
+        super().__init__()
+
+        width = config.projection_dim
+        self.fc1 = nn.Linear(config.embed_dim, width)
+        extra_layers = [nn.Linear(width, width) for _ in range(config.num_hidden_layers - 1)]
+        self.hidden_layers = nn.ModuleList(extra_layers)
+        self.activation_fn = ACT2FN[config.hidden_act]
+
+    def forward(self, hidden_states):
+        """Apply `fc1`, then (activation -> linear) once per entry of `hidden_layers`."""
+        projected = self.fc1(hidden_states)
+        for hidden_layer in self.hidden_layers:
+            projected = hidden_layer(self.activation_fn(projected))
+        return projected
+
+
+class JanusVQVAEHead(nn.Module):
+    """Head used for sampling tokens in image generation, replacing the usual lm head.
+
+    Computes `vision_head(activation(proj_out(x)))`; the last layer has `config.num_embeddings` outputs.
+    """
+
+    def __init__(self, config: JanusVQVAEConfig):
+        super().__init__()
+        projection_dim = config.projection_dim
+        self.proj_out = nn.Linear(config.image_token_embed_dim, projection_dim)
+        self.activation_fn = ACT2FN[config.hidden_act]
+        self.vision_head = nn.Linear(projection_dim, config.num_embeddings)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.tensor:
+        activated = self.activation_fn(self.proj_out(hidden_states))
+        return self.vision_head(activated)
+
+
+@auto_docstring(
+    custom_intro="""
+    The Janus model which consists of a siglip vision backbone, a Llama language model and a VQ model.
+    """
+)
+class JanusModel(JanusPreTrainedModel):
+    def __init__(self, config: JanusConfig):
+        super().__init__(config)
+        self.config = config
+        # This is necessary for backward compatibility, see SiglipModel initialization
+        self.vision_model = JanusVisionModel._from_config(config.vision_config)
+        self.aligner = JanusVisionAlignerMLP(self.vision_model.config)
+
+        self.vqmodel = JanusVQVAE._from_config(config.vq_config)
+
+        # Below generation_* modules are used for Image generation.
+        # Embeddings used for image generation, instead of Janus vision embeddings.
+        self.generation_embeddings = nn.Embedding(self.vqmodel.config.num_embeddings, self.vqmodel.config.embed_dim)
+        self.generation_aligner = JanusVQVAEAlignerMLP(self.vqmodel.config)
+        self.generation_head = JanusVQVAEHead(self.vqmodel.config)
+
+        self.language_model = AutoModel.from_config(config=config.text_config)
+
+        self.gradient_checkpointing = False
+        # Initialize weights and apply final processing.
+        self.post_init()
+
+    def get_input_embeddings(self):
+        """Return the input embedding module of the language model."""
+        return self.language_model.get_input_embeddings()
+
+    def set_input_embeddings(self, value):
+        """Replace the input embedding module of the language model with `value`."""
+        self.language_model.set_input_embeddings(value)
+
+    def get_image_features(self, pixel_values):
+        """Run the vision model on `pixel_values` and pass its `last_hidden_state` through the aligner."""
+        image_embeds = self.vision_model(pixel_values)
+        image_embeds = self.aligner(image_embeds.last_hidden_state)
+        return image_embeds
+
+    def get_placeholder_mask(
+        self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor, image_features: torch.FloatTensor
+    ):
+        """
+        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
+        equal to the length of multimodal features. If the lengths are different, an error is raised.
+
+        Returns:
+            A boolean mask expanded to the shape of `inputs_embeds`, true at every placeholder position.
+        Raises:
+            ValueError: If the number of masked embedding elements differs from `image_features.numel()`.
+        """
+        if input_ids is None:
+            # Without ids, a position counts as a placeholder when its whole embedding vector equals the
+            # embedding of the image token.
+            special_image_mask = inputs_embeds == self.get_input_embeddings()(
+                torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device)
+            )
+            special_image_mask = special_image_mask.all(-1)
+        else:
+            special_image_mask = input_ids == self.config.image_token_id
+
+        n_image_tokens = special_image_mask.sum()
+        special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
+        if inputs_embeds[special_image_mask].numel() != image_features.numel():
+            # Count feature *vectors* (every dim except the last) so the number is comparable to the token
+            # count whether `image_features` arrives flattened to 2-D or with more leading dims.
+            n_image_features = image_features.numel() // max(image_features.shape[-1], 1)
+            raise ValueError(
+                f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
+            )
+        return special_image_mask
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        pixel_values: torch.FloatTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
+        **kwargs,
+    ):
+        # Exactly one of `input_ids` / `inputs_embeds` must be provided.
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError(
+                "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"
+            )
+        if inputs_embeds is None:
+            inputs_embeds = self.get_input_embeddings()(input_ids)
+
+        image_embeds = None
+        if pixel_values is not None:
+            image_embeds = self.get_image_features(pixel_values)
+            # Flatten to one row per feature vector and match the text embeddings' device and dtype.
+            image_features = image_embeds.reshape(-1, inputs_embeds.shape[-1])
+            image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
+            image_attention_mask = self.get_placeholder_mask(
+                input_ids, inputs_embeds=inputs_embeds, image_features=image_features
+            )
+            # Overwrite the placeholder positions with the image features, in order.
+            inputs_embeds = inputs_embeds.masked_scatter(image_attention_mask, image_features)
+
+        lm_output = self.language_model(
+            inputs_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            logits_to_keep=logits_to_keep,
+            **kwargs,
+        )
+
+        return JanusBaseModelOutputWithPast(
+            last_hidden_state=lm_output.last_hidden_state,
+            past_key_values=lm_output.past_key_values,
+            hidden_states=lm_output.hidden_states,
+            attentions=lm_output.attentions,
+            image_hidden_states=image_embeds,
+        )
+
+
+class JanusForConditionalGeneration(JanusPreTrainedModel, GenerationMixin):
+    _tied_weights_keys = ["model.language_model.embed_tokens.weight", "lm_head.weight"]
+    _can_compile_fullgraph = True
+
+    def __init__(self, config: JanusConfig):
+        """Build the wrapped `JanusModel` and a bias-free linear head over the text vocabulary."""
+        super().__init__(config)
+        self.config = config
+        self.model = JanusModel(config)
+
+        text_config = config.text_config
+        self.lm_head = nn.Linear(text_config.hidden_size, text_config.vocab_size, bias=False)
+
+        # Initialize weights and apply final processing.
+        self.post_init()
+
+    def get_input_embeddings(self):
+        """Return the input embedding module of the wrapped language model."""
+        language_model = self.model.language_model
+        return language_model.get_input_embeddings()
+
+    def set_input_embeddings(self, value):
+        """Replace the input embedding module of the wrapped language model with `value`."""
+        language_model = self.model.language_model
+        language_model.set_input_embeddings(value)
+
+    def prepare_embeddings_for_image_generation(self, inputs: torch.Tensor) -> torch.Tensor:
+        """Embed `inputs` with `generation_embeddings`, then project the result with `generation_aligner`."""
+        generation_embeds = self.model.generation_embeddings(inputs)
+        return self.model.generation_aligner(generation_embeds)
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        pixel_values: torch.FloatTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
+        **kwargs: Unpack[TransformersKwargs],
+    ):
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+        """
+        model_outputs = self.model(
+            input_ids=input_ids,
+            pixel_values=pixel_values,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            **kwargs,
+        )
+
+        # An int keeps that many trailing positions (0 yields `slice(0, None)`, i.e. every position);
+        # any other value is used directly as the index along the sequence dimension.
+        if isinstance(logits_to_keep, int):
+            slice_indices = slice(-logits_to_keep, None)
+        else:
+            slice_indices = logits_to_keep
+        logits = self.lm_head(model_outputs.last_hidden_state[:, slice_indices, :])
+
+        # The loss is only computed when labels are supplied.
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(
+                logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **kwargs
+            )
+
+        return JanusCausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=model_outputs.past_key_values,
+            hidden_states=model_outputs.hidden_states,
+            attentions=model_outputs.attentions,
+            image_hidden_states=model_outputs.image_hidden_states,
+        )
+
+    def prepare_inputs_for_generation(
+        self,
+        input_ids,
+        pixel_values=None,
+        past_key_values=None,
+        attention_mask=None,
+        inputs_embeds=None,
+        cache_position=None,
+        logits_to_keep=None,
+        **kwargs,
+    ):
+        """Extend the parent's generation inputs with `pixel_values` on the step where `cache_position[0] == 0`."""
+        # Overwritten -- extra custom processing
+
+        model_inputs = super().prepare_inputs_for_generation(
+            input_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            cache_position=cache_position,
+            logits_to_keep=logits_to_keep,
+            **kwargs,
+        )
+
+        # During cached decoding the input ids no longer contain the special image token, so pixel values
+        # are only attached while the cache position still starts at 0.
+        starts_at_zero = cache_position[0] == 0
+        if starts_at_zero:
+            model_inputs["pixel_values"] = pixel_values
+
+        return model_inputs
+
+    def decode_image_tokens(self, image_tokens: torch.Tensor):
+        """
+        Decodes generated image tokens into continuous pixel values with the VQ model's `decode`,
+        then moves dim 1 of the decoded tensor to the last position.
+        Args:
+            image_tokens (`torch.LongTensor` of shape `(batch_size, num_of_tokens)`):
+                The tensors corresponding to the input images.
+        """
+        channels_first = self.model.vqmodel.decode(image_tokens)
+        return channels_first.permute(0, 2, 3, 1)
+
+    @torch.no_grad
+    def generate(
+        self,
+        inputs: torch.Tensor = None,
+        attention_mask: Optional[torch.LongTensor] = None,
+        logits_processor: Optional[LogitsProcessorList] = None,
+        **kwargs,
+    ):
+        # 1. Handle generation config and model kwargs (work on a private copy so the caller's config is never mutated)
+        generation_config = kwargs.pop("generation_config", self.generation_config)
+        generation_config = copy.deepcopy(generation_config)
+
+        # Default to "text" generation if mode isn't provided; any other value takes the image-token path below.
+        generation_mode = kwargs.pop("generation_mode", "text")
+        if generation_mode == "text":
+            # Set guidance_scale=None to prevent running UnbatchedCFG processor.
+            return super().generate(
+                inputs=inputs,
+                attention_mask=attention_mask,
+                generation_config=generation_config,
+                guidance_scale=None,
+                **kwargs,
+            )
+
+        model_kwargs = generation_config.update(**kwargs)  # All unused kwargs must be model kwargs
+
+        # Validate generation mode
+        if generation_config.get_generation_mode() not in (GenerationMode.SAMPLE, GenerationMode.GREEDY_SEARCH):
+            raise ValueError(
+                "Got incompatible mode for Image Generation, should be one of greedy or sampling. "
+                "Ensure that beam search is de-activated by setting `num_beams=1` and `num_beam_groups=1`."
+            )
+
+        # Validate the configuration and model kwargs
+        generation_config.validate()
+        self._validate_model_kwargs(model_kwargs.copy())
+
+        # 2. Initialize logit processors
+        logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList()
+
+        # Set `use_cache=True` as we will be using input embeds for generation.
+        model_kwargs["use_cache"] = True
+
+        if generation_config.guidance_scale is None:
+            logger.warning("`guidance_scale` is required for CFG but not provided. Setting to default value of 5.")
+            generation_config.guidance_scale = 5
+        model_kwargs["guidance_scale"] = generation_config.guidance_scale
+
+        # 3. Prepare model inputs
+        input_ids, model_input_name, model_kwargs = self._prepare_model_inputs(
+            inputs, generation_config.bos_token_id, model_kwargs
+        )
+        dtype, device = input_ids.dtype, input_ids.device
+
+        if len(input_ids.shape) != 2:
+            raise ValueError(
+                f"Expected input ids of shape (batch_size, seq_len), but got {input_ids.shape}. "
+                "Passing `inputs embeds` is not supported currently."
+            )
+
+        # Prepare special tokens which will be used generate internally.
+        kwargs_has_attention_mask = attention_mask is not None
+        self._prepare_special_tokens(generation_config, kwargs_has_attention_mask, device=input_ids.device)
+
+        # 4. Add CFG processor along with user passed logit processor.
+        if generation_config.guidance_scale and generation_config.guidance_scale > 1:
+            logits_processor.append(ClassifierFreeGuidanceLogitsProcessor(generation_config.guidance_scale))
+            generation_config.guidance_scale = None  # Reset to prevent processor duplication.
+
+        # 5. Prepare logits processor
+        logits_processor = self._get_logits_processor(
+            generation_config=generation_config,
+            input_ids_seq_length=input_ids.shape[1],
+            encoder_input_ids=input_ids,
+            prefix_allowed_tokens_fn=None,
+            logits_processor=logits_processor,
+            device=device,
+        )
+
+        # 6. Expand inputs for multiple image generations per prompt.
+        input_ids, model_kwargs = self._expand_inputs_for_generation(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            expand_size=generation_config.num_return_sequences,
+            **model_kwargs,
+        )
+
+        # 7. Prepare input and model caches
+        num_image_tokens = self.model.vision_model.config.num_image_tokens
+        batch_size, seq_len = input_ids.shape
+
+        input_tokens = input_ids.repeat(2, 1)  # Double batch size for conditional/unconditional logits
+        attention_mask = model_kwargs.pop("attention_mask", None)
+        attention_mask = attention_mask.repeat(2, 1)  # NOTE(review): appears to raise AttributeError when no attention_mask was passed - confirm
+        model_kwargs["attention_mask"] = attention_mask
+
+        # Mask all the tokens that are neither BOS nor BOI with pad token in the unconditional logits.
+        mask = (input_tokens[batch_size:, :] != generation_config.bos_token_id) & (
+            input_tokens[batch_size:, :] != generation_config.generation_kwargs["boi_token_id"]
+        )
+        input_tokens[batch_size:, :].masked_fill_(mask, generation_config.pad_token_id)
+
+        inputs_embeds = self.get_input_embeddings()(input_tokens)
+
+        model_kwargs = self._get_initial_cache_position(seq_len, device, model_kwargs)
+
+        if model_kwargs.get("past_key_values", None) is None:
+            # Prepare cache if not provided.
+            model_kwargs["past_key_values"] = self._get_cache(
+                cache_implementation=generation_config.cache_implementation or "static",
+                # batch_size should account for both conditional/unconditional input; hence multiplied by 2.
+                batch_size=batch_size * 2,
+                # we should have at least a cache len of seq_len + num_image_tokens.
+                max_cache_len=max(generation_config.max_length, num_image_tokens + seq_len),
+                model_kwargs=model_kwargs,
+            )
+
+        # Placeholder for generated tokens.
+        generated_tokens = torch.zeros((batch_size, num_image_tokens), dtype=dtype, device=device)
+
+        # 8. init attention / hidden states / scores tuples
+        output_attentions = generation_config.output_attentions
+        output_hidden_states = generation_config.output_hidden_states
+        output_scores = generation_config.output_scores
+        output_logits = generation_config.output_logits
+        return_dict_in_generate = generation_config.return_dict_in_generate
+
+        raw_scores = () if (return_dict_in_generate and output_scores) else None
+        raw_logits = () if (return_dict_in_generate and output_logits) else None
+        decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None
+        decoder_attentions = () if (return_dict_in_generate and output_attentions) else None
+
+        for i in range(num_image_tokens):  # exactly one image token is produced per iteration
+            model_inputs = self.prepare_inputs_for_generation(
+                inputs_embeds=inputs_embeds, input_ids=input_tokens, **model_kwargs
+            )
+
+            model_inputs["attention_mask"] = model_inputs["attention_mask"].to(inputs_embeds.device)
+            model_inputs["cache_position"] = model_inputs["cache_position"].to(inputs_embeds.device)
+
+            outputs = self.model.language_model(
+                **model_inputs,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+            )
+
+            # Update model_kwargs like cache_position for next generation.
+            model_kwargs = self._update_model_kwargs_for_generation(outputs, model_kwargs)
+            hidden_state = outputs.last_hidden_state[:, -1, :].clone()
+
+            # Generate scores using the generation head (Not using above defined LM Head)
+            scores = self.model.generation_head(hidden_state)
+            next_token_scores = logits_processor(input_ids, scores)  # presumably the CFG processor folds the doubled batch back to batch_size - confirm
+
+            # Sample next token.
+            if generation_config.do_sample:
+                probs = torch.softmax(next_token_scores, dim=-1)
+                next_token = torch.multinomial(probs, num_samples=1).squeeze(-1)
+            else:
+                next_token = torch.argmax(next_token_scores, dim=-1)
+
+            generated_tokens[:, i] = next_token
+
+            # Prepare embeddings for the next step.
+            next_token = torch.cat([next_token, next_token])
+            next_token = next_token.unsqueeze(-1)
+
+            inputs_embeds = self.prepare_embeddings_for_image_generation(next_token)
+
+            if return_dict_in_generate:  # record this step's outputs: one tuple entry per generated token
+                if output_scores:
+                    raw_scores += (scores,)
+                if output_logits:
+                    raw_logits += (hidden_state.float(),)  # NOTE(review): stores the last hidden state, not head logits - confirm intent
+                if output_attentions:
+                    decoder_attentions += (outputs.attentions,)
+                if output_hidden_states:
+                    decoder_hidden_states += (outputs.hidden_states,)
+
+        if return_dict_in_generate:
+            return GenerateDecoderOnlyOutput(
+                sequences=generated_tokens,
+                scores=raw_scores,
+                logits=raw_logits,
+                attentions=decoder_attentions,
+                hidden_states=decoder_hidden_states,
+                past_key_values=outputs.past_key_values,
+            )
+        else:
+            return generated_tokens
+
+
+__all__ = ["JanusPreTrainedModel", "JanusForConditionalGeneration", "JanusModel", "JanusVQVAE", "JanusVisionModel"]
diff --git a/phivenv/Lib/site-packages/transformers/models/janus/modular_janus.py b/phivenv/Lib/site-packages/transformers/models/janus/modular_janus.py
new file mode 100644
index 0000000000000000000000000000000000000000..d959b7e7b04d4276df4bafb094dd4cc6f4f201dc
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/janus/modular_janus.py
@@ -0,0 +1,1611 @@
+# coding=utf-8
+# Copyright 2025 Deepseek AI and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import copy
+from collections.abc import Iterable
+from dataclasses import dataclass
+from typing import Callable, Optional, Union
+
+import numpy as np
+import torch
+from torch import nn
+
+from transformers.models.blip.image_processing_blip import BlipImageProcessor
+
+from ...activations import ACT2FN
+from ...cache_utils import Cache
+from ...generation import ClassifierFreeGuidanceLogitsProcessor, GenerationMixin, GenerationMode, LogitsProcessorList
+from ...generation.utils import GenerateDecoderOnlyOutput
+from ...image_processing_utils import BatchFeature, get_size_dict
+from ...image_transforms import resize, to_channel_dimension_format
+from ...image_utils import (
+    ChannelDimension,
+    ImageInput,
+    PILImageResampling,
+    get_image_size,
+    infer_channel_dimension_format,
+    make_list_of_images,
+    to_numpy_array,
+)
+from ...modeling_outputs import ModelOutput
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from ...processing_utils import Unpack
+from ...utils import (
+    TransformersKwargs,
+    auto_docstring,
+    can_return_tuple,
+    is_torch_available,
+    is_vision_available,
+    logging,
+)
+from ..auto import AutoModel
+from ..blip_2.modeling_blip_2 import Blip2VisionModel
+from ..chameleon.configuration_chameleon import ChameleonVQVAEConfig
+from ..chameleon.modeling_chameleon import (
+    ChameleonVQVAE,
+    ChameleonVQVAEEncoderAttnBlock,
+    ChameleonVQVAEEncoderConvDownsample,
+    ChameleonVQVAEEncoderResnetBlock,
+    ChameleonVQVAEVectorQuantizer,
+)
+from ..idefics.modeling_idefics import IdeficsBaseModelOutputWithPast, IdeficsCausalLMOutputWithPast
+from ..llama.modeling_llama import eager_attention_forward
+from ..siglip.configuration_siglip import SiglipVisionConfig
+from ..siglip.modeling_siglip import SiglipEncoder, SiglipEncoderLayer, SiglipVisionEmbeddings
+
+
+if is_torch_available():
+    import torch
+    import torch.nn as nn
+    import torch.nn.functional as F
+    import torch.utils.checkpoint
+
+if is_vision_available():
+    import PIL
+
+from ...configuration_utils import PretrainedConfig
+from ..auto import CONFIG_MAPPING, AutoConfig
+
+
+logger = logging.get_logger(__name__)
+
+# General docstring
+
+
+class JanusVisionConfig(SiglipVisionConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`JanusVisionModel`]. It is used to instantiate a
+    `JanusVisionModel` according to the specified arguments, defining the model architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+    Args:
+        hidden_size (`int`, *optional*, defaults to 1024):
+            Dimensionality of the encoder layers and the pooler layer.
+        num_hidden_layers (`int`, *optional*, defaults to 24):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        num_channels (`int`, *optional*, defaults to 3):
+            The number of input channels.
+        patch_size (`int`, *optional*, defaults to 16):
+            The size (resolution) of each patch.
+        image_size (`int`, *optional*, defaults to 384):
+            The size (resolution) of each image.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            Dropout probability for attention weights.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-06):
+            The epsilon used by the layer normalization layers.
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"selu"`, and `"gelu_new"` are supported.
+        mlp_ratio (`float`, *optional*, defaults to 4.0):
+            Ratio of MLP hidden dimensionality to embedding dimensionality.
+        attention_bias (`bool`, *optional*, defaults to `True`):
+            Whether to add a bias to the queries, keys, and values in the attention layers.
+        hidden_dropout_rate (`float`, *optional*, defaults to 0.0):
+            The dropout probability for fully connected layers in the encoder.
+        projection_dim (`int`, *optional*, defaults to 2048):
+            Dimensionality of the MLP projection head.
+        projection_dropout (`float`, *optional*, defaults to 0.0):
+            Dropout probability for the projection layer.
+        use_qk_norm (`bool`, *optional*, defaults to `False`):
+            Whether to normalize the query and key matrices.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated normal initializer for initializing all weight matrices.
+        depth (`int`, *optional*, defaults to 2):
+            Number of hidden layers in the aligner module.
+        num_image_tokens (`int`, *optional*, defaults to 576):
+            Number of image tokens.
+    """
+
+    model_type = "janus_vision_model"
+    base_config_key = "vision_config"
+
+    def __init__(
+        self,
+        hidden_size=1024,
+        num_hidden_layers=24,
+        num_attention_heads=16,
+        num_channels=3,
+        patch_size=16,
+        image_size=384,
+        attention_dropout=0.0,
+        layer_norm_eps=1e-6,
+        hidden_act="gelu",
+        mlp_ratio=4.0,
+        attention_bias=True,
+        hidden_dropout_rate=0.0,
+        projection_dim=2048,
+        projection_dropout=0.0,
+        use_qk_norm=False,
+        initializer_range=0.02,
+        depth=2,
+        num_image_tokens=576,
+        **kwargs,
+    ):
+        super().__init__(  # only the arguments shared with the parent vision config are forwarded
+            hidden_size=hidden_size,
+            num_hidden_layers=num_hidden_layers,
+            num_attention_heads=num_attention_heads,
+            num_channels=num_channels,
+            patch_size=patch_size,
+            image_size=image_size,
+            attention_dropout=attention_dropout,
+            layer_norm_eps=layer_norm_eps,
+            hidden_act=hidden_act,
+            **kwargs,
+        )
+        del self.intermediate_size  # not a Janus setting: the vision MLP width is derived from hidden_size * mlp_ratio
+
+        self.mlp_ratio = mlp_ratio
+        self.attention_bias = attention_bias
+        self.hidden_dropout_rate = hidden_dropout_rate
+        self.projection_dim = projection_dim
+        self.projection_dropout = projection_dropout
+        self.use_qk_norm = use_qk_norm
+        self.initializer_range = initializer_range
+        self.depth = depth
+        self.num_image_tokens = num_image_tokens
+
+
+class JanusVQVAEConfig(ChameleonVQVAEConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`JanusVQVAE`]. It is used to instantiate a
+    `JanusVQVAE` according to the specified arguments, defining the model architecture.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information. Instantiating a
+    configuration with the defaults will yield a similar configuration to the VQModel of the
+    [deepseek-community/Janus-Pro-1B](https://huggingface.co/deepseek-community/Janus-Pro-1B).
+
+    Args:
+        embed_dim (`int`, *optional*, defaults to 8):
+            Dimensionality of each embedding vector.
+        num_embeddings (`int`, *optional*, defaults to 16384):
+            Number of codebook embeddings.
+        double_latent (`bool`, *optional*, defaults to `False`):
+            Whether to use double z channels.
+        latent_channels (`int`, *optional*, defaults to 256):
+            Number of channels for the latent space.
+        num_patches (`int`, *optional*, defaults to 32):
+            Num of patches the input images can be divided into.
+        in_channels (`int`, *optional*, defaults to 3):
+            Number of input channels.
+        out_channels (`int`, *optional*, defaults to 3):
+            Number of out channels.
+        base_channels (`int`, *optional*, defaults to 128):
+            Base channel count.
+        channel_multiplier (`list[int]`, *optional*, defaults to `[1, 1, 2, 2, 4]`):
+            Channel multipliers for each resolution.
+        num_res_blocks (`int`, *optional*, defaults to 2):
+            Number of residual blocks.
+        dropout (`float`, *optional*, defaults to 0.0):
+            Dropout rate.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        projection_dim (`int`, *optional*, defaults to 2048):
+            Dimensionality of the MLP projection head.
+        num_hidden_layers (`int`, *optional*, defaults to 2):
+            Number of hidden layers in the VQVAE MLP connector module.
+        hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"silu"` and `"gelu_new"` are supported.
+        image_token_embed_dim (`int`, *optional*, defaults to 2048):
+            Dimension of image embeddings. It should be same as the dimensionality of text embeddings.
+    """
+
+    def __init__(
+        self,
+        embed_dim: int = 8,
+        num_embeddings: int = 16384,
+        double_latent: bool = False,
+        latent_channels: int = 256,
+        num_patches: int = 32,
+        in_channels: int = 3,
+        out_channels: int = 3,
+        base_channels: int = 128,
+        channel_multiplier: list[int] = [1, 1, 2, 2, 4],
+        num_res_blocks: int = 2,
+        dropout: float = 0.0,
+        initializer_range=0.02,
+        projection_dim=2048,
+        num_hidden_layers=2,
+        hidden_act="gelu",
+        image_token_embed_dim=2048,
+        **kwargs,
+    ):
+        super().__init__(
+            embed_dim=embed_dim,
+            num_embeddings=num_embeddings,
+            double_latent=double_latent,
+            latent_channels=latent_channels,
+            in_channels=in_channels,
+            base_channels=base_channels,
+            channel_multiplier=None if channel_multiplier is None else list(channel_multiplier),  # copy: never share the default list
+            num_res_blocks=num_res_blocks,
+            dropout=dropout,
+            initializer_range=initializer_range,
+            **kwargs,
+        )
+        self.num_patches = num_patches
+        self.out_channels = out_channels
+        self.projection_dim = projection_dim
+        self.num_hidden_layers = num_hidden_layers
+        self.hidden_act = hidden_act
+        self.image_token_embed_dim = image_token_embed_dim
+
+        del self.resolution  # drop attributes left by the parent constructor that this config does not expose
+        del self.attn_resolutions
+        del self.attn_type
+
+
+class JanusConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`JanusModel`]. It is used to instantiate a
+    Janus model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the Janus-1B or Janus-7B models.
+
+    e.g. [deepseek-community/Janus-Pro-1B](https://huggingface.co/deepseek-community/Janus-Pro-1B) or
+    [deepseek-community/Janus-Pro-7B](https://huggingface.co/deepseek-community/Janus-Pro-7B)
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        text_config (`Union[AutoConfig, dict]`, *optional*, defaults to `LlamaConfig`):
+            The config object or dictionary of the text backbone.
+        vision_config (`Union[AutoConfig, dict]`,  *optional*, defaults to `JanusVisionConfig`):
+            The config object or dictionary of the vision backbone.
+        vq_config (`Union[AutoConfig, dict]`,  *optional*, defaults to `JanusVQVAEConfig`):
+            The config object or dictionary of the VQVAE backbone.
+        image_token_id (`int`, *optional*, defaults to 100581):
+            Token index of a placeholder image token.
+
+    Example:
+
+    ```python
+    >>> from transformers import JanusForConditionalGeneration, JanusConfig, JanusVisionConfig, JanusVQVAEConfig, LlamaConfig
+
+    >>> # Initializing a Janus vision config
+    >>> vision_config = JanusVisionConfig()
+
+    >>> # Initializing a Llama config
+    >>> text_config = LlamaConfig()
+
+    >>> # Initializing a VQ config
+    >>> vq_config = JanusVQVAEConfig()
+
+    >>> # Initializing a Janus Pro 1B style configuration
+    >>> configuration = JanusConfig(vision_config=vision_config, text_config=text_config, vq_config=vq_config)
+
+    >>> # Initializing a model from the Janus Pro 1B style configuration
+    >>> model = JanusForConditionalGeneration(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "janus"
+    sub_configs = {
+        "text_config": AutoConfig,
+        "vision_config": JanusVisionConfig,
+        "vq_config": JanusVQVAEConfig,
+    }
+
+    def __init__(
+        self,
+        text_config=None,
+        vision_config=None,
+        vq_config=None,
+        image_token_id=100581,
+        **kwargs,
+    ):
+        if isinstance(text_config, dict):
+            text_config = {**text_config, "model_type": text_config.get("model_type", "llama")}  # copy: leave the caller's dict untouched
+            self.text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
+
+        elif text_config is None:
+            logger.info("`text_config` is None. Initializing with default values")
+            self.text_config = CONFIG_MAPPING["llama"]()
+        elif isinstance(text_config, PretrainedConfig):
+            self.text_config = text_config
+        else:
+            raise ValueError(
+                f"Invalid type for `text_config`. Must be either `dict` or `PretrainedConfig`."
+                f" Type found: {type(text_config)}"
+            )
+
+        if vision_config is None:
+            logger.info("`vision_config` is None. Initializing with default JanusVisionConfig values")
+            self.vision_config = JanusVisionConfig()
+        elif isinstance(vision_config, dict):
+            self.vision_config = JanusVisionConfig(**vision_config)
+        elif isinstance(vision_config, JanusVisionConfig):
+            self.vision_config = vision_config
+        else:
+            raise ValueError(
+                f"Invalid type for `vision_config`. Must be either `dict` or `JanusVisionConfig`."
+                f" Type found: {type(vision_config)}"
+            )
+
+        if vq_config is None:
+            logger.info("`vq_config` is None. Initializing with default JanusVQVAEConfig values")
+            self.vq_config = JanusVQVAEConfig()
+        elif isinstance(vq_config, dict):
+            self.vq_config = JanusVQVAEConfig(**vq_config)
+        elif isinstance(vq_config, JanusVQVAEConfig):
+            self.vq_config = vq_config
+        else:
+            raise ValueError(
+                f"Invalid type for `vq_config`. Must be either `dict` or `JanusVQVAEConfig`."
+                f" Type found: {type(vq_config)}"
+            )
+
+        self.initializer_range = self.vision_config.initializer_range
+        # This dimension is required when decoding discrete image tokens to continuous input (overrides any value set on vq_config).
+        self.vq_config.num_patches = self.vision_config.image_size // self.vision_config.patch_size
+        # The default is only the index for the 1B model, 7B uses a different one
+        self.image_token_id = image_token_id
+        super().__init__(**kwargs)
+
+
+@auto_docstring
+class JanusPreTrainedModel(PreTrainedModel):
+    config: JanusConfig
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["LlamaDecoderLayer", "JanusVisionEncoderLayer"]  # NOTE(review): presumably layer classes kept whole when sharding across devices - confirm
+    _skip_keys_device_placement = ["past_key_values", "causal_mask"]
+    _supports_flash_attn = True
+    _supports_sdpa = True
+
+    _can_compile_fullgraph = True
+    _supports_param_buffer_assignment = False  # NOTE(review): reason for opting out is not visible here - confirm before changing
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Base class for Janus VQ-VAE model outputs.
+    """
+)
+class JanusVQVAEOutput(ModelOutput):
+    r"""
+    decoded_pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
+        Reconstructed pixel values after encoding and decoding the input.
+    embedding_loss (`torch.FloatTensor`):
+        Embedding loss.
+    """
+
+    decoded_pixel_values: Optional[torch.FloatTensor] = None
+    embedding_loss: Optional[torch.FloatTensor] = None  # Optional because the field defaults to None
+
+
+class JanusBaseModelOutputWithPast(IdeficsBaseModelOutputWithPast):
+    pass  # adds nothing: the Idefics output class under a Janus name
+
+
+class JanusCausalLMOutputWithPast(IdeficsCausalLMOutputWithPast):
+    pass  # adds nothing: the Idefics output class under a Janus name
+
+
+class JanusVisionEmbeddings(SiglipVisionEmbeddings):
+    def forward(self, pixel_values: torch.Tensor, interpolate_pos_encoding: bool = False) -> torch.Tensor:
+        _, _, height, width = pixel_values.shape  # input must be 4-D; the last two dims are read as height and width
+        target_dtype = self.patch_embedding.weight.dtype
+        patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype))  # shape = [*, width, grid, grid]
+        embeddings = patch_embeds.flatten(2).transpose(1, 2)  # -> [*, grid * grid, width]: one row per patch
+
+        if interpolate_pos_encoding:
+            pos_embeds = self.interpolate_pos_encoding(embeddings, height, width)  # resized to the actual input height/width
+        else:
+            pos_embeds = self.position_embedding(self.position_ids)  # plain lookup over the stored position ids
+
+        embeddings = embeddings + pos_embeds
+
+        return embeddings
+
+
+class JanusVisionAttention(nn.Module):
+    """Attention Class for Janus Vision Encoder"""
+
+    def __init__(self, config: JanusVisionConfig):
+        super().__init__()
+        self.config = config
+        self.embed_dim = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = self.embed_dim // self.num_heads
+        if self.head_dim * self.num_heads != self.embed_dim:
+            raise ValueError(
+                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
+                f" {self.num_heads})."
+            )
+        self.scale = self.head_dim**-0.5  # 1 / sqrt(head_dim), passed to the attention kernel as `scaling`
+        self.attention_dropout = config.attention_dropout
+        proj_dropout = config.projection_dropout
+        qk_norm = config.use_qk_norm
+        self.is_causal = False
+
+        # Janus vision attention has no grouped K/V heads, hence for `eager_attention_forward` call setting `num_key_value_groups` to 1.
+        self.num_key_value_groups = 1
+
+        self.q_proj = nn.Linear(self.embed_dim, self.num_heads * self.head_dim, bias=config.attention_bias)
+        self.k_proj = nn.Linear(self.embed_dim, self.num_heads * self.head_dim, bias=config.attention_bias)
+        self.v_proj = nn.Linear(self.embed_dim, self.num_heads * self.head_dim, bias=config.attention_bias)
+        self.projection_layer = nn.Linear(self.embed_dim, self.embed_dim)
+        self.projection_dropout = nn.Dropout(proj_dropout) if proj_dropout > 0 else nn.Identity()
+
+        self.q_norm = nn.LayerNorm(self.head_dim) if qk_norm else nn.Identity()  # per-head norm: forward() applies it to (..., head_dim)
+        self.k_norm = nn.LayerNorm(self.head_dim) if qk_norm else nn.Identity()  # sized to head_dim; embed_dim would fail for num_heads > 1
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ):
+        batch_size, seq_len, _ = hidden_states.size()
+
+        query_states = self.q_proj(hidden_states)
+        key_states = self.k_proj(hidden_states)
+        value_states = self.v_proj(hidden_states)
+
+        query_states = query_states.reshape(-1, self.num_heads, self.head_dim)  # split heads so the norm sees head_dim last
+        query_states = self.q_norm(query_states)
+
+        key_states = key_states.reshape(-1, self.num_heads, self.head_dim)
+        key_states = self.k_norm(key_states)
+
+        query_states = query_states.reshape(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = key_states.reshape(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
+
+        attention_interface: Callable = eager_attention_forward
+        if self.config._attn_implementation != "eager":
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            dropout=0.0 if not self.training else self.attention_dropout,
+            scaling=self.scale,
+            is_causal=self.is_causal,
+            **kwargs,
+        )
+        attn_output = attn_output.reshape(batch_size, seq_len, self.embed_dim)  # merge heads back into embed_dim
+
+        output = self.projection_layer(attn_output)
+        output = self.projection_dropout(output)
+        return output, attn_weights
+
+
+class JanusVisionMLP(nn.Module):
+    def __init__(self, config: JanusVisionConfig):
+        super().__init__()
+        self.config = config
+        self.intermediate_size = int(config.hidden_size * config.mlp_ratio)  # hidden width derived from the ratio, truncated to int
+        self.activation_fn = ACT2FN[config.hidden_act]  # looked up by name; "gelu" is the JanusVisionConfig default
+        self.fc1 = nn.Linear(config.hidden_size, self.intermediate_size)
+        self.fc2 = nn.Linear(self.intermediate_size, config.hidden_size)
+        self.dropout1 = nn.Dropout(config.hidden_dropout_rate)
+        self.dropout2 = nn.Dropout(config.hidden_dropout_rate)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.fc1(hidden_states)  # expand: hidden_size -> intermediate_size
+        hidden_states = self.activation_fn(hidden_states)
+        hidden_states = self.dropout1(hidden_states)
+        hidden_states = self.fc2(hidden_states)  # project back: intermediate_size -> hidden_size
+        hidden_states = self.dropout2(hidden_states)
+        return hidden_states
+
+
+class JanusVisionEncoderLayer(SiglipEncoderLayer):
+    def __init__(self, config: JanusVisionConfig):
+        super().__init__(config)
+        self.config = config
+        self.embed_dim = config.hidden_size
+        self.self_attn = JanusVisionAttention(config)  # assigned after super().__init__, so it overrides whatever the parent set
+        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+        self.mlp = JanusVisionMLP(config)  # Janus MLP sized by mlp_ratio; forward() is inherited unchanged
+
+
+class JanusVisionEncoder(SiglipEncoder):
+    def __init__(self, config: JanusVisionConfig):
+        super().__init__(config)
+        self.layers = nn.ModuleList([JanusVisionEncoderLayer(config) for _ in range(config.num_hidden_layers)])  # Janus layers override the parent's stack
+
+
+class JanusVisionModel(Blip2VisionModel):
+    """`Blip2VisionModel` subclass that uses a `JanusVisionEncoder` as its `encoder`."""
+
+    def __init__(self, config: JanusVisionConfig):
+        super().__init__(config)
+        # Assigned after the parent constructor so this encoder is the one stored on the instance.
+        self.encoder = JanusVisionEncoder(config)
+
+
+class JanusVisionAlignerMLP(nn.Module):
+    """MLP that maps hidden states from `config.hidden_size` to `config.projection_dim`.
+
+    An input projection is followed by `config.depth - 1` square linear layers, each preceded by the
+    `config.hidden_act` activation.
+    """
+
+    def __init__(self, config: JanusVisionConfig):
+        super().__init__()
+        width = config.projection_dim
+
+        self.fc1 = nn.Linear(config.hidden_size, width)
+        extra_layers = [nn.Linear(width, width) for _ in range(config.depth - 1)]
+        self.hidden_layers = nn.ModuleList(extra_layers)
+        self.activation_fn = ACT2FN[config.hidden_act]
+
+    def forward(self, hidden_states):
+        projected = self.fc1(hidden_states)
+        for linear in self.hidden_layers:
+            # The activation precedes every hidden layer, so the final output is left un-activated.
+            projected = linear(self.activation_fn(projected))
+        return projected
+
+
+class JanusVQVAEVectorQuantizer(ChameleonVQVAEVectorQuantizer):
+    """Vector quantizer that records a square `num_patches` x `num_patches` latent grid for code lookup."""
+
+    def __init__(self, config: JanusVQVAEConfig):
+        super().__init__(config)
+        # Same extent along both spatial axes of the latent grid.
+        self.quant_state_dims = [config.num_patches, config.num_patches]
+
+    def get_codebook_entry(self, image_tokens: torch.LongTensor) -> torch.FloatTensor:
+        """Look up the code vectors for `image_tokens` and return them in channels-first layout."""
+        num_images = image_tokens.shape[0]
+        code_dim: int = self.embedding.weight.shape[-1]
+
+        # Fetch the code vectors and L2-normalise each one along its feature axis.
+        codes = F.normalize(self.embedding(image_tokens), p=2, dim=-1)
+
+        # (batch, *grid, dim) -> (batch, dim, *grid)
+        target_shape = (num_images, *self.quant_state_dims, code_dim)
+        return codes.view(target_shape).permute(0, 3, 1, 2).contiguous()
+
+
+class JanusVQVAEResnetBlock(ChameleonVQVAEEncoderResnetBlock):
+    """Janus-named subclass of `ChameleonVQVAEEncoderResnetBlock`; adds and overrides nothing."""
+
+    pass
+
+
+class JanusVQVAEAttnBlock(ChameleonVQVAEEncoderAttnBlock):
+    """Janus-named subclass of `ChameleonVQVAEEncoderAttnBlock`; adds and overrides nothing."""
+
+    pass
+
+
+class JanusVQVAEConvDownsample(ChameleonVQVAEEncoderConvDownsample):
+    """Janus-named subclass of `ChameleonVQVAEEncoderConvDownsample`; adds and overrides nothing."""
+
+    pass
+
+
+class JanusVQVAEConvUpsample(nn.Module):
+    """Doubles the spatial size with nearest-neighbour interpolation, then applies a 3x3 convolution.
+
+    The convolution keeps the channel count (`in_channels` -> `in_channels`) and, with stride 1 and
+    padding 1, the spatial size produced by the interpolation.
+    """
+
+    def __init__(self, in_channels):
+        super().__init__()
+        self.conv = torch.nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1)
+
+    def forward(self, hidden_states):
+        upsampled = F.interpolate(hidden_states, scale_factor=2.0, mode="nearest")
+        return self.conv(upsampled)
+
+
+class JanusVQVAEMidBlock(nn.Module):
+    """Channel-preserving bottleneck: resnet block -> attention block -> resnet block."""
+
+    def __init__(self, config: JanusVQVAEConfig, channels: int):
+        super().__init__()
+        # Both resnet blocks share the same configuration and keep the channel count.
+        resnet_kwargs = {"config": config, "in_channels": channels, "out_channels": channels}
+        self.block_1 = JanusVQVAEResnetBlock(**resnet_kwargs)
+        self.attn_1 = JanusVQVAEAttnBlock(channels)
+        self.block_2 = JanusVQVAEResnetBlock(**resnet_kwargs)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        for stage in (self.block_1, self.attn_1, self.block_2):
+            hidden_states = stage(hidden_states)
+        return hidden_states
+
+
+class JanusVQVAEEncoder(nn.Module):
+    """Convolutional encoder that maps pixel values to latent feature maps.
+
+    Layout: input convolution -> one stage per entry of `config.channel_multiplier` (each stage holds
+    `config.num_res_blocks` resnet blocks; only the last stage pairs every resnet block with an attention
+    block; every stage but the last ends with a downsample) -> mid block -> group norm -> `x * sigmoid(x)`
+    -> output convolution. The output has `2 * config.latent_channels` channels when
+    `config.double_latent` is set, otherwise `config.latent_channels`.
+    """
+
+    def __init__(self, config):
+        super().__init__()
+
+        self.num_resolutions = len(config.channel_multiplier)
+        self.num_res_blocks = config.num_res_blocks
+        base_channels = config.base_channels
+        in_channels = config.in_channels
+        double_latent = config.double_latent
+        latent_channels = config.latent_channels
+        channel_multiplier = config.channel_multiplier
+
+        self.conv_in = torch.nn.Conv2d(in_channels, base_channels, kernel_size=3, stride=1, padding=1)
+
+        # Stage `i` reads `base_channels * in_channel_multiplier[i]` channels, i.e. the output width of
+        # stage `i - 1` (or `base_channels` for the first stage).
+        in_channel_multiplier = (1,) + tuple(channel_multiplier)
+        self.in_channel_multiplier = in_channel_multiplier
+        self.down = nn.ModuleList()
+        for i_level in range(self.num_resolutions):
+            block = nn.ModuleList()
+            attn = nn.ModuleList()
+            block_in = base_channels * in_channel_multiplier[i_level]
+            block_out = base_channels * channel_multiplier[i_level]
+            for _ in range(self.num_res_blocks):
+                block.append(
+                    JanusVQVAEResnetBlock(
+                        config=config,
+                        in_channels=block_in,
+                        out_channels=block_out,
+                    )
+                )
+                block_in = block_out
+                # Attention is only used at the last (lowest-resolution) stage.
+                if i_level == self.num_resolutions - 1:
+                    attn.append(JanusVQVAEAttnBlock(block_in))
+
+            down = nn.Module()
+            down.block = block
+            down.attn = attn
+            if i_level != self.num_resolutions - 1:
+                down.downsample = JanusVQVAEConvDownsample(block_in)
+            self.down.append(down)
+
+        self.mid = JanusVQVAEMidBlock(config, block_in)
+
+        self.norm_out = torch.nn.GroupNorm(num_groups=32, num_channels=block_in, eps=1e-6, affine=True)
+        self.conv_out = torch.nn.Conv2d(
+            block_in,
+            2 * latent_channels if double_latent else latent_channels,
+            kernel_size=3,
+            stride=1,
+            padding=1,
+        )
+
+    def forward(self, pixel_values: torch.FloatTensor) -> torch.FloatTensor:
+        """Encode `pixel_values` and return the output of the final convolution."""
+        # downsampling
+        # Only the most recent activation is ever consumed, so it is carried in a single variable
+        # instead of keeping every intermediate tensor alive in a list.
+        hidden_state = self.conv_in(pixel_values)
+        for i_level, down in enumerate(self.down):
+            for i_block in range(self.num_res_blocks):
+                hidden_state = down.block[i_block](hidden_state)
+                if len(down.attn) > 0:
+                    hidden_state = down.attn[i_block](hidden_state)
+            if i_level != self.num_resolutions - 1:
+                hidden_state = down.downsample(hidden_state)
+
+        # middle
+        last_hidden_state = self.mid(hidden_state)
+
+        # end
+        last_hidden_state = self.norm_out(last_hidden_state)
+        # x * sigmoid(x), applied in place on the normalised tensor.
+        last_hidden_state *= torch.sigmoid(last_hidden_state)
+        last_hidden_state = self.conv_out(last_hidden_state)
+        return last_hidden_state
+
+
+class JanusVQVAEDecoder(nn.Module):
+    # Convolutional decoder: input convolution -> mid block -> one stage per entry of
+    # `config.channel_multiplier` (each with `config.num_res_blocks + 1` resnet blocks) -> group norm ->
+    # `x * sigmoid(x)` -> output convolution to `config.out_channels` channels.
+    #
+    # NOTE: the stages are built while iterating the levels in reverse and appended in that order, so
+    # `self.up[0]` is the stage built for level `num_resolutions - 1` and `self.up[-1]` the one built for
+    # level 0. `forward` walks `self.up` by list position, not by level number.
+    def __init__(self, config):
+        super().__init__()
+
+        self.num_resolutions = len(config.channel_multiplier)
+        self.num_res_blocks = config.num_res_blocks
+        base_channels = config.base_channels
+        latent_channels = config.latent_channels
+        out_channels = config.out_channels
+
+        # channel width at the last level of `channel_multiplier`; the decoder starts from this width
+        block_in = base_channels * config.channel_multiplier[self.num_resolutions - 1]
+
+        # z to block_in
+        self.conv_in = torch.nn.Conv2d(latent_channels, block_in, kernel_size=3, stride=1, padding=1)
+
+        # middle
+        self.mid = JanusVQVAEMidBlock(config, block_in)
+
+        # upsampling
+        self.up = nn.ModuleList()
+        for i_level in reversed(range(self.num_resolutions)):
+            block = nn.ModuleList()
+            attn = nn.ModuleList()
+            block_out = base_channels * config.channel_multiplier[i_level]
+            # one more resnet block per stage than `num_res_blocks`
+            for i_block in range(self.num_res_blocks + 1):
+                block.append(
+                    JanusVQVAEResnetBlock(
+                        config=config,
+                        in_channels=block_in,
+                        out_channels=block_out,
+                    )
+                )
+                block_in = block_out
+                # attention blocks exist only in the first stage built (level `num_resolutions - 1`)
+                if i_level == self.num_resolutions - 1:
+                    attn.append(JanusVQVAEAttnBlock(block_in))
+            up = nn.Module()
+            up.block = block
+            up.attn = attn
+            # every stage except the one built for level 0 (the last one appended) gets an upsampler
+            if i_level != 0:
+                up.upsample = JanusVQVAEConvUpsample(block_in)
+            self.up.append(up)
+
+        # end
+        self.norm_out = torch.nn.GroupNorm(num_groups=32, num_channels=block_in, eps=1e-6, affine=True)
+        self.conv_out = torch.nn.Conv2d(block_in, out_channels, kernel_size=3, stride=1, padding=1)
+
+    def forward(self, hidden_state: torch.FloatTensor) -> torch.FloatTensor:
+        hidden_state = self.conv_in(hidden_state)
+
+        # middle
+        hidden_state = self.mid(hidden_state)
+
+        # upsampling
+        # here `i_level` is a position in `self.up` (construction order), so position
+        # `num_resolutions - 1` is the stage built for level 0, the only one without `upsample`
+        for i_level in range(self.num_resolutions):
+            for i_block in range(self.num_res_blocks + 1):
+                hidden_state = self.up[i_level].block[i_block](hidden_state)
+                if len(self.up[i_level].attn) > 0:
+                    hidden_state = self.up[i_level].attn[i_block](hidden_state)
+            if i_level != self.num_resolutions - 1:
+                hidden_state = self.up[i_level].upsample(hidden_state)
+
+        hidden_state = self.norm_out(hidden_state)
+        # x * sigmoid(x), applied in place on the normalised tensor
+        hidden_state *= torch.sigmoid(hidden_state)
+        hidden_state = self.conv_out(hidden_state)
+        return hidden_state
+
+
+class JanusVQVAE(ChameleonVQVAE):
+    _no_split_modules = [
+        "JanusVQVAEAttnBlock",
+        "JanusVQVAEResnetBlock",
+        "JanusVQVAEVectorQuantizer",
+    ]
+    main_input_name = "pixel_values"
+
+    def __init__(self, config: JanusVQVAEConfig):
+        super().__init__(config)
+        self.decoder = JanusVQVAEDecoder(config)
+        self.gradient_checkpointing = False
+
+        # Run weight initialisation and final processing for the VQVAE.
+        self.post_init()
+
+    def decode(self, image_tokens: torch.LongTensor) -> torch.FloatTensor:
+        """
+        Turns a batch of codebook token IDs back into pixel values.
+
+        Args:
+            image_tokens (`torch.LongTensor`):
+                Token IDs; the second dimension must equal the product of the quantizer's two
+                `quant_state_dims`.
+
+        Returns:
+            `torch.FloatTensor`: output of the decoder applied to the post-quantization convolution of the
+            looked-up codebook entries.
+
+        Raises:
+            ValueError: if the second dimension of `image_tokens` does not match the expected token count.
+        """
+        grid_dims = self.quantize.quant_state_dims
+        if image_tokens.shape[1] != grid_dims[0] * grid_dims[1]:
+            raise ValueError(
+                f"Expected `image_tokens` to have shape `(batch_size, {self.quantize.quant_state_dims[0] * self.quantize.quant_state_dims[1]})`, "
+                f"but got shape `{image_tokens.shape}`."
+            )
+        latents = self.post_quant_conv(self.quantize.get_codebook_entry(image_tokens))
+        return self.decoder(latents)
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        pixel_values: torch.FloatTensor,
+    ) -> tuple[torch.FloatTensor, torch.FloatTensor]:
+        # Encode to token indices, then reconstruct from those indices; the quantized latents returned
+        # by `encode` are not used here.
+        _, embedding_loss, indices = self.encode(pixel_values)
+        flat_indices = indices.view(pixel_values.shape[0], -1)
+        reconstruction = self.decode(flat_indices)
+
+        return JanusVQVAEOutput(reconstruction, embedding_loss)
+
+
+class JanusVQVAEAlignerMLP(nn.Module):
+    """MLP that maps hidden states from `config.embed_dim` to `config.projection_dim`.
+
+    An input projection is followed by `config.num_hidden_layers - 1` square linear layers, each preceded
+    by the `config.hidden_act` activation.
+    """
+
+    def __init__(self, config: JanusVQVAEConfig):
+        super().__init__()
+        proj_dim = config.projection_dim
+
+        self.fc1 = nn.Linear(config.embed_dim, proj_dim)
+        stacked = [nn.Linear(proj_dim, proj_dim) for _ in range(config.num_hidden_layers - 1)]
+        self.hidden_layers = nn.ModuleList(stacked)
+        self.activation_fn = ACT2FN[config.hidden_act]
+
+    def forward(self, hidden_states):
+        out = self.fc1(hidden_states)
+        for hidden_layer in self.hidden_layers:
+            # Activation first, linear second: the last linear output is returned without activation.
+            out = hidden_layer(self.activation_fn(out))
+        return out
+
+
+class JanusVQVAEHead(nn.Module):
+    """Head used for sampling tokens in image generation, replacing the usual lm head.
+
+    Projects hidden states from `config.image_token_embed_dim` to `config.projection_dim`, applies the
+    `config.hidden_act` activation, then maps to `config.num_embeddings` scores.
+    """
+
+    def __init__(self, config: JanusVQVAEConfig):
+        super().__init__()
+        self.proj_out = nn.Linear(config.image_token_embed_dim, config.projection_dim)
+        self.activation_fn = ACT2FN[config.hidden_act]
+        self.vision_head = nn.Linear(config.projection_dim, config.num_embeddings)
+
+    # The return annotation is the `torch.Tensor` class; `torch.tensor` is a factory function, not a type.
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        """Return one score per codebook entry for every input hidden state."""
+        hidden_states = self.proj_out(hidden_states)
+        hidden_states = self.activation_fn(hidden_states)
+        hidden_states = self.vision_head(hidden_states)
+        return hidden_states
+
+
+@auto_docstring(
+    custom_intro="""
+    The Janus model which consists of a siglip vision backbone, a Llama language model and a VQ model.
+    """
+)
+class JanusModel(JanusPreTrainedModel):
+    def __init__(self, config: JanusConfig):
+        super().__init__(config)
+        self.config = config
+        # This is necessary for backward compatibility, see SiglipModel initialization
+        self.vision_model = JanusVisionModel._from_config(config.vision_config)
+        self.aligner = JanusVisionAlignerMLP(self.vision_model.config)
+
+        self.vqmodel = JanusVQVAE._from_config(config.vq_config)
+
+        # Below generation_* modules are used for Image generation.
+        # Embeddings used for image generation, instead of Janus vision embeddings.
+        self.generation_embeddings = nn.Embedding(self.vqmodel.config.num_embeddings, self.vqmodel.config.embed_dim)
+        self.generation_aligner = JanusVQVAEAlignerMLP(self.vqmodel.config)
+        self.generation_head = JanusVQVAEHead(self.vqmodel.config)
+
+        self.language_model = AutoModel.from_config(config=config.text_config)
+
+        self.gradient_checkpointing = False
+        # Initialize weights and apply final processing.
+        self.post_init()
+
+    def get_input_embeddings(self):
+        """Return the input embedding module of the wrapped language model."""
+        return self.language_model.get_input_embeddings()
+
+    def set_input_embeddings(self, value):
+        """Replace the input embedding module of the wrapped language model."""
+        self.language_model.set_input_embeddings(value)
+
+    def get_image_features(self, pixel_values):
+        """Run the vision model on `pixel_values` and pass its `last_hidden_state` through the aligner."""
+        image_embeds = self.vision_model(pixel_values)
+        image_embeds = self.aligner(image_embeds.last_hidden_state)
+        return image_embeds
+
+    def get_placeholder_mask(
+        self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor, image_features: torch.FloatTensor
+    ):
+        """
+        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
+        equal to the length of multimodal features. If the lengths are different, an error is raised.
+
+        Returns:
+            A boolean mask with the same shape as `inputs_embeds`, true at every element of an image
+            placeholder position.
+
+        Raises:
+            ValueError: if the number of masked embedding elements differs from `image_features.numel()`.
+        """
+        if input_ids is None:
+            # Without token IDs, locate placeholders by comparing against the image token's embedding.
+            special_image_mask = inputs_embeds == self.get_input_embeddings()(
+                torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device)
+            )
+            special_image_mask = special_image_mask.all(-1)
+        else:
+            special_image_mask = input_ids == self.config.image_token_id
+
+        n_image_tokens = special_image_mask.sum()
+        special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
+        if inputs_embeds[special_image_mask].numel() != image_features.numel():
+            # Number of feature vectors, whether `image_features` arrives flattened to 2-D (as `forward`
+            # passes it) or still batched; `shape[0] * shape[1]` would report the element count for the
+            # 2-D case.
+            n_image_features = image_features.numel() // image_features.shape[-1]
+            raise ValueError(
+                f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
+            )
+        return special_image_mask
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
+        **kwargs,
+    ):
+        # Exactly one of `input_ids` / `inputs_embeds` must be given.
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError(
+                "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"
+            )
+        if inputs_embeds is None:
+            inputs_embeds = self.get_input_embeddings()(input_ids)
+
+        # Stays `None` when no image is supplied, so the output field is well-defined on every path.
+        image_embeds = None
+        if pixel_values is not None:
+            image_embeds = self.get_image_features(pixel_values)
+            # Flatten to one row per image feature vector, matching the text embedding width.
+            image_features = image_embeds.reshape(-1, inputs_embeds.shape[-1])
+            image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
+            image_attention_mask = self.get_placeholder_mask(
+                input_ids, inputs_embeds=inputs_embeds, image_features=image_features
+            )
+            # Write the image features into the placeholder positions of the text embeddings.
+            inputs_embeds = inputs_embeds.masked_scatter(image_attention_mask, image_features)
+
+        lm_output = self.language_model(
+            inputs_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            logits_to_keep=logits_to_keep,
+            **kwargs,
+        )
+
+        return JanusBaseModelOutputWithPast(
+            last_hidden_state=lm_output.last_hidden_state,
+            past_key_values=lm_output.past_key_values,
+            hidden_states=lm_output.hidden_states,
+            attentions=lm_output.attentions,
+            image_hidden_states=image_embeds,
+        )
+
+
+class JanusForConditionalGeneration(JanusPreTrainedModel, GenerationMixin):
+    """Janus model with a language-modeling head for text and a separate generation head for image tokens.
+
+    `forward` and text-mode `generate` score the vocabulary with `lm_head`; image-mode `generate` samples
+    codebook tokens with `model.generation_head`.
+    """
+
+    _tied_weights_keys = ["model.language_model.embed_tokens.weight", "lm_head.weight"]
+    _can_compile_fullgraph = True
+
+    def __init__(self, config: JanusConfig):
+        super().__init__(config)
+        self.config = config
+        self.model = JanusModel(config)
+        self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)
+
+        # Initialize weights and apply final processing.
+        self.post_init()
+
+    def get_input_embeddings(self):
+        """Return the input embedding module of the wrapped language model."""
+        return self.model.language_model.get_input_embeddings()
+
+    def set_input_embeddings(self, value):
+        """Replace the input embedding module of the wrapped language model."""
+        self.model.language_model.set_input_embeddings(value)
+
+    def prepare_embeddings_for_image_generation(self, inputs: torch.Tensor) -> torch.Tensor:
+        """Embed generated image token IDs with `generation_embeddings` and pass them through `generation_aligner`."""
+        hidden_state = self.model.generation_embeddings(inputs)
+        hidden_state = self.model.generation_aligner(hidden_state)
+        return hidden_state
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        pixel_values: torch.FloatTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
+        **kwargs: Unpack[TransformersKwargs],
+    ):
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+        """
+        outputs = self.model(
+            input_ids=input_ids,
+            pixel_values=pixel_values,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            **kwargs,
+        )
+        hidden_states = outputs.last_hidden_state
+        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
+        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+        logits = self.lm_head(hidden_states[:, slice_indices, :])
+
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(
+                logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **kwargs
+            )
+
+        return JanusCausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            image_hidden_states=outputs.image_hidden_states,
+        )
+
+    def prepare_inputs_for_generation(
+        self,
+        input_ids,
+        pixel_values=None,
+        past_key_values=None,
+        attention_mask=None,
+        inputs_embeds=None,
+        cache_position=None,
+        logits_to_keep=None,
+        **kwargs,
+    ):
+        # Overwritten -- extra custom processing
+
+        model_inputs = super().prepare_inputs_for_generation(
+            input_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            cache_position=cache_position,
+            logits_to_keep=logits_to_keep,
+            **kwargs,
+        )
+
+        # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore
+        # Otherwise we need pixel values to be passed to model
+        if cache_position[0] == 0:
+            model_inputs["pixel_values"] = pixel_values
+
+        return model_inputs
+
+    def decode_image_tokens(self, image_tokens: torch.Tensor):
+        """
+        Decodes generated image tokens from language model to continuous pixel values
+        with VQGAN module via upsampling.
+        Args:
+            image_tokens (`torch.LongTensor` of shape `(batch_size, num_of_tokens)`):
+                The tensors corresponding to the input images.
+        Returns:
+            The decoded images with the channel axis moved last (dims permuted as `(0, 2, 3, 1)`).
+        """
+        decoded_image = self.model.vqmodel.decode(image_tokens)
+        decoded_image = decoded_image.permute(0, 2, 3, 1)
+        return decoded_image
+
+    @torch.no_grad
+    def generate(
+        self,
+        inputs: torch.Tensor = None,
+        attention_mask: Optional[torch.LongTensor] = None,
+        logits_processor: Optional[LogitsProcessorList] = None,
+        **kwargs,
+    ):
+        """
+        Generates text (default) or image tokens.
+
+        With `generation_mode="text"` (the default) the call is delegated to the parent `generate` with
+        `guidance_scale=None`. Any other `generation_mode` runs the image-token loop below, which samples
+        `vision_config.num_image_tokens` tokens with classifier-free guidance: the batch is doubled into a
+        conditional half and an unconditional half whose prompt tokens (other than BOS/BOI) are replaced
+        by the pad token.
+
+        Args:
+            inputs (`torch.Tensor`): Prompt token IDs of shape `(batch_size, seq_len)`.
+            attention_mask (`torch.LongTensor`, *optional*): Mask for `inputs`. When omitted in image
+                mode, an all-ones mask is used and a warning is logged.
+            logits_processor (`LogitsProcessorList`, *optional*): Extra processors applied to the scores.
+            **kwargs: `generation_config`, `generation_mode`, generation-config overrides and model kwargs.
+
+        Returns:
+            In image mode, the generated tokens of shape `(batch_size, num_image_tokens)`, or a
+            `GenerateDecoderOnlyOutput` when `return_dict_in_generate` is set. In that output, `scores`
+            holds the processed scores of every step and `logits` the raw generation-head outputs of
+            every step, each as a tuple with one entry per generated token.
+
+        Raises:
+            ValueError: if the generation config requests anything but greedy search or sampling, or if
+                the prepared input IDs are not 2-D.
+        """
+        # 1. Handle generation config and model kwargs
+        generation_config = kwargs.pop("generation_config", self.generation_config)
+        generation_config = copy.deepcopy(generation_config)
+
+        # Default to "text" generation if mode isn't provided
+        generation_mode = kwargs.pop("generation_mode", "text")
+        if generation_mode == "text":
+            # Set guidance_scale=None to prevent running UnbatchedCFG processor.
+            # `logits_processor` is a named parameter here, so it has to be forwarded explicitly;
+            # otherwise a user-supplied processor list would be silently ignored in text mode.
+            return super().generate(
+                inputs=inputs,
+                attention_mask=attention_mask,
+                generation_config=generation_config,
+                logits_processor=logits_processor,
+                guidance_scale=None,
+                **kwargs,
+            )
+
+        model_kwargs = generation_config.update(**kwargs)  # All unused kwargs must be model kwargs
+
+        # Validate generation mode
+        if generation_config.get_generation_mode() not in (GenerationMode.SAMPLE, GenerationMode.GREEDY_SEARCH):
+            raise ValueError(
+                "Got incompatible mode for Image Generation, should be one of greedy or sampling. "
+                "Ensure that beam search is de-activated by setting `num_beams=1` and `num_beam_groups=1`."
+            )
+
+        # Validate the configuration and model kwargs
+        generation_config.validate()
+        self._validate_model_kwargs(model_kwargs.copy())
+
+        # 2. Initialize logit processors
+        logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList()
+
+        # Set `use_cache=True` as we will be using input embeds for generation.
+        model_kwargs["use_cache"] = True
+
+        if generation_config.guidance_scale is None:
+            logger.warning("`guidance_scale` is required for CFG but not provided. Setting to default value of 5.")
+            generation_config.guidance_scale = 5
+        model_kwargs["guidance_scale"] = generation_config.guidance_scale
+
+        # 3. Prepare model inputs
+        input_ids, _, model_kwargs = self._prepare_model_inputs(
+            inputs, generation_config.bos_token_id, model_kwargs
+        )
+        dtype, device = input_ids.dtype, input_ids.device
+
+        if len(input_ids.shape) != 2:
+            raise ValueError(
+                f"Expected input ids of shape (batch_size, seq_len), but got {input_ids.shape}. "
+                "Passing `inputs embeds` is not supported currently."
+            )
+
+        # Prepare special tokens which will be used generate internally.
+        kwargs_has_attention_mask = attention_mask is not None
+        self._prepare_special_tokens(generation_config, kwargs_has_attention_mask, device=input_ids.device)
+
+        # 4. Add CFG processor along with user passed logit processor.
+        if generation_config.guidance_scale and generation_config.guidance_scale > 1:
+            logits_processor.append(ClassifierFreeGuidanceLogitsProcessor(generation_config.guidance_scale))
+            generation_config.guidance_scale = None  # Reset to prevent processor duplication.
+
+        # 5. Prepare logits processor
+        logits_processor = self._get_logits_processor(
+            generation_config=generation_config,
+            input_ids_seq_length=input_ids.shape[1],
+            encoder_input_ids=input_ids,
+            prefix_allowed_tokens_fn=None,
+            logits_processor=logits_processor,
+            device=device,
+        )
+
+        # 6. Expand inputs for multiple image generations per prompt.
+        input_ids, model_kwargs = self._expand_inputs_for_generation(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            expand_size=generation_config.num_return_sequences,
+            **model_kwargs,
+        )
+
+        # 7. Prepare input and model caches
+        num_image_tokens = self.model.vision_model.config.num_image_tokens
+        batch_size, seq_len = input_ids.shape
+
+        input_tokens = input_ids.repeat(2, 1)  # Double batch size for conditional/unconditional logits
+        attention_mask = model_kwargs.pop("attention_mask", None)
+        if attention_mask is None:
+            # No mask was supplied: attend to every prompt position instead of failing on `None.repeat`.
+            logger.warning(
+                "`attention_mask` was not provided for image generation; assuming every prompt token should be "
+                "attended to. Pass `attention_mask` explicitly if the prompts are padded."
+            )
+            attention_mask = torch.ones_like(input_ids)
+        attention_mask = attention_mask.repeat(2, 1)
+        model_kwargs["attention_mask"] = attention_mask
+
+        # Mask all the tokens that are neither BOS nor BOI with pad token in the unconditional logits.
+        mask = (input_tokens[batch_size:, :] != generation_config.bos_token_id) & (
+            input_tokens[batch_size:, :] != generation_config.generation_kwargs["boi_token_id"]
+        )
+        input_tokens[batch_size:, :].masked_fill_(mask, generation_config.pad_token_id)
+
+        inputs_embeds = self.get_input_embeddings()(input_tokens)
+
+        model_kwargs = self._get_initial_cache_position(seq_len, device, model_kwargs)
+
+        if model_kwargs.get("past_key_values", None) is None:
+            # Prepare cache if not provided.
+            model_kwargs["past_key_values"] = self._get_cache(
+                cache_implementation=generation_config.cache_implementation or "static",
+                # batch_size should account for both conditional/unconditional input; hence multiplied by 2.
+                batch_size=batch_size * 2,
+                # we should have at least a cache len of seq_len + num_image_tokens.
+                max_cache_len=max(generation_config.max_length, num_image_tokens + seq_len),
+                model_kwargs=model_kwargs,
+            )
+
+        # Placeholder for generated tokens.
+        generated_tokens = torch.zeros((batch_size, num_image_tokens), dtype=dtype, device=device)
+
+        # 8. init attention / hidden states / scores tuples
+        output_attentions = generation_config.output_attentions
+        output_hidden_states = generation_config.output_hidden_states
+        output_scores = generation_config.output_scores
+        output_logits = generation_config.output_logits
+        return_dict_in_generate = generation_config.return_dict_in_generate
+
+        raw_scores = () if (return_dict_in_generate and output_scores) else None
+        raw_logits = () if (return_dict_in_generate and output_logits) else None
+        decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None
+        decoder_attentions = () if (return_dict_in_generate and output_attentions) else None
+
+        for i in range(num_image_tokens):
+            model_inputs = self.prepare_inputs_for_generation(
+                inputs_embeds=inputs_embeds, input_ids=input_tokens, **model_kwargs
+            )
+
+            model_inputs["attention_mask"] = model_inputs["attention_mask"].to(inputs_embeds.device)
+            model_inputs["cache_position"] = model_inputs["cache_position"].to(inputs_embeds.device)
+
+            outputs = self.model.language_model(
+                **model_inputs,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+            )
+
+            # Update model_kwargs like cache_position for next generation.
+            model_kwargs = self._update_model_kwargs_for_generation(outputs, model_kwargs)
+            hidden_state = outputs.last_hidden_state[:, -1, :].clone()
+
+            # Generate scores using the generation head (Not using above defined LM Head)
+            scores = self.model.generation_head(hidden_state)
+            next_token_scores = logits_processor(input_ids, scores)
+
+            # Sample next token.
+            if generation_config.do_sample:
+                probs = torch.softmax(next_token_scores, dim=-1)
+                next_token = torch.multinomial(probs, num_samples=1).squeeze(-1)
+            else:
+                next_token = torch.argmax(next_token_scores, dim=-1)
+
+            generated_tokens[:, i] = next_token
+
+            # Record this step's outputs. Collecting inside the loop keeps one entry per generated token;
+            # collecting after the loop would only capture the final step.
+            if return_dict_in_generate:
+                if output_scores:
+                    raw_scores += (next_token_scores,)
+                if output_logits:
+                    raw_logits += (scores,)
+                if output_attentions:
+                    decoder_attentions += (outputs.attentions,)
+                if output_hidden_states:
+                    decoder_hidden_states += (outputs.hidden_states,)
+
+            # Prepare embeddings for the next step.
+            # The sampled token is fed to both the conditional and the unconditional half of the batch.
+            next_token = torch.cat([next_token, next_token])
+            next_token = next_token.unsqueeze(-1)
+
+            inputs_embeds = self.prepare_embeddings_for_image_generation(next_token)
+
+        if return_dict_in_generate:
+            return GenerateDecoderOnlyOutput(
+                sequences=generated_tokens,
+                scores=raw_scores,
+                logits=raw_logits,
+                attentions=decoder_attentions,
+                hidden_states=decoder_hidden_states,
+                past_key_values=outputs.past_key_values,
+            )
+        else:
+            return generated_tokens
+
+
+class JanusImageProcessor(BlipImageProcessor):
+    r"""
+    Constructs a JANUS image processor.
+
+    Args:
+        do_resize (`bool`, *optional*, defaults to `True`):
+            Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by the
+            `do_resize` parameter in the `preprocess` method.
+        size (`dict`, *optional*, defaults to `{"height": 384, "width": 384}`):
+            Size of the output image after resizing. Can be overridden by the `size` parameter in the `preprocess`
+            method.
+        min_size (`int`, *optional*, defaults to 14):
+            The minimum allowed size for the resized image. Ensures that neither the height nor width
+            falls below this value after resizing.
+        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
+            Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. Can be
+            overridden by the `resample` parameter in the `preprocess` method.
+        do_rescale (`bool`, *optional*, defaults to `True`):
+            Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the
+            `do_rescale` parameter in the `preprocess` method.
+        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
+            Scale factor to use if rescaling the image. Only has an effect if `do_rescale` is set to `True`. Can be
+            overridden by the `rescale_factor` parameter in the `preprocess` method.
+        do_normalize (`bool`, *optional*, defaults to `True`):
+            Whether to normalize the image with `image_mean` and `image_std`. Can be overridden by the
+            `do_normalize` parameter in the `preprocess` method.
+        image_mean (`float` or `list[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`):
+            Mean to use if normalizing the image. This is a float or list of floats the length of the number of
+            channels in the image. When given, it is also scaled by 255 to derive the padding background color.
+            Can be overridden by the `image_mean` parameter in the `preprocess` method.
+        image_std (`float` or `list[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`):
+            Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
+            number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess`
+            method.
+        do_convert_rgb (`bool`, *optional*, defaults to `True`):
+            Whether to convert the image to RGB.
+    """
+
+    def __init__(
+        self,
+        do_resize: bool = True,
+        size: Optional[dict[str, int]] = None,
+        min_size: int = 14,
+        resample: PILImageResampling = PILImageResampling.BICUBIC,
+        do_rescale: bool = True,
+        rescale_factor: Union[int, float] = 1 / 255,
+        do_normalize: bool = True,
+        image_mean: Optional[Union[float, list[float]]] = None,
+        image_std: Optional[Union[float, list[float]]] = None,
+        do_convert_rgb: Optional[bool] = None,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.min_size = min_size
+        if image_mean is None:
+            self.background_color = (127, 127, 127)  # default padding color (mid-gray)
+        else:  # scale the mean to 0-255; a scalar mean (allowed by the annotation) is applied to 3 channels
+            mean = [image_mean] * 3 if isinstance(image_mean, (int, float)) else image_mean
+            self.background_color = tuple(int(x * 255) for x in mean)
+
+    def pad_to_square(
+        self,
+        image: np.ndarray,
+        background_color: Union[int, tuple[int, int, int]] = 0,
+        data_format: Optional[Union[str, ChannelDimension]] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    ) -> np.array:
+        """
+        Pads an image to a square based on the longest edge, centering the original content.
+
+        Args:
+            image (`np.ndarray`):
+                The image to pad.
+            background_color (`int` or `tuple[int, int, int]`, *optional*, defaults to 0):
+                The color to use for the padding. Can be an integer for single channel or a
+                tuple of integers for multi-channel images. If passed as an integer
+                in multi-channel mode, it will default to `0` in subsequent channels.
+            data_format (`str` or `ChannelDimension`, *optional*):
+                The channel dimension format for the output image. Can be one of:
+                    - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                    - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                If unset, will use same as the input image.
+            input_data_format (`str` or `ChannelDimension`, *optional*):
+                The channel dimension format for the input image. Inferred from `image` if unset. Can be one of:
+                    - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                    - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+
+        Returns:
+            `np.ndarray`: The padded image.
+
+        Raises:
+            ValueError: If `image` is not square and `background_color` is a sequence whose length differs
+                from the number of channels.
+        """
+        if input_data_format is None:
+            input_data_format = infer_channel_dimension_format(image)
+        channels_first = input_data_format == ChannelDimension.FIRST
+        height, width = get_image_size(image, input_data_format)
+        num_channels = image.shape[0] if channels_first else image.shape[-1]
+
+        if height != width:
+            # A lone integer only colors the first channel; a sequence needs one value per channel.
+            if isinstance(background_color, int):
+                background_color = [background_color]
+            elif len(background_color) != num_channels:
+                raise ValueError(
+                    f"background_color must have exactly {num_channels} elements to match the number of channels"
+                )
+
+            max_dim = max(height, width)
+            if channels_first:
+                result = np.zeros((num_channels, max_dim, max_dim), dtype=image.dtype)
+            else:
+                result = np.zeros((max_dim, max_dim, num_channels), dtype=image.dtype)
+            for i, color in enumerate(background_color):
+                if channels_first:
+                    result[i, :, :] = color
+                else:
+                    result[:, :, i] = color
+
+            # Center the image; the offset along the longest edge is 0, so only the short side is padded.
+            top = (max_dim - height) // 2
+            left = (max_dim - width) // 2
+            if channels_first:
+                result[:, top : top + height, left : left + width] = image
+            else:
+                result[top : top + height, left : left + width, :] = image
+            image = result
+
+        # Honor the requested output layout for padded and already-square images alike.
+        if data_format is not None:
+            image = to_channel_dimension_format(image, data_format, input_data_format)
+        return image
+
+    def resize(
+        self,
+        image: np.ndarray,
+        size: Union[dict[str, int], int],
+        background_color: Optional[tuple[int, int, int]] = None,
+        resample: PILImageResampling = PILImageResampling.BICUBIC,
+        data_format: Optional[Union[str, ChannelDimension]] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+        **kwargs,
+    ) -> np.ndarray:
+        """
+        Resize an image so that its longest edge equals `size`, then pad it to a `size x size` square.
+
+        Args:
+            image (`np.ndarray`):
+                Image to resize.
+            size (`dict[str, int]` or `int`):
+                The size to resize the image to. If a dictionary, it should have the keys `"height"` and `"width"`.
+            background_color (`tuple[int, int, int]`, *optional*):
+                The background color to use for the padding. Defaults to `self.background_color`.
+            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
+                `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BICUBIC`.
+            data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the output image. If unset, the channel dimension format of the input
+                image is used. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `None`: will be inferred from input
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the input image. If unset, the channel dimension format is inferred
+                from the input image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+
+        Returns:
+            `np.ndarray`: The resized and padded image. Raises `ValueError` if `size` is not square.
+        """
+        background_color = background_color if background_color is not None else self.background_color
+        if input_data_format is None:
+            input_data_format = infer_channel_dimension_format(image)
+
+        height, width = get_image_size(image, input_data_format)
+        max_size = max(height, width)
+
+        size = get_size_dict(size, default_to_square=True)
+        if size["height"] != size["width"]:
+            raise ValueError(
+                f"Output height and width must be the same. Got height={size['height']} and width={size['width']}"
+            )
+        size = size["height"]
+
+        delta = size / max_size
+        # Largest side becomes `size` and the other side is scaled according to the aspect ratio.
+        output_size_nonpadded = [
+            max(int(height * delta), self.min_size),
+            max(int(width * delta), self.min_size),
+        ]
+
+        image = resize(
+            image,
+            size=output_size_nonpadded,
+            resample=resample,
+            data_format=data_format,
+            input_data_format=input_data_format,
+            **kwargs,
+        )
+        # Pad to `size x size`. `resize` above already emitted `data_format` when given, so pad in that layout.
+        image = self.pad_to_square(
+            image=image,
+            background_color=background_color,
+            input_data_format=data_format if data_format is not None else input_data_format,
+        )
+        return image
+
+    def postprocess(
+        self,
+        images: ImageInput,
+        do_rescale: Optional[bool] = None,
+        rescale_factor: Optional[float] = None,
+        do_normalize: Optional[bool] = None,
+        image_mean: Optional[list[float]] = None,
+        image_std: Optional[list[float]] = None,
+        input_data_format: Optional[str] = None,
+        return_tensors: Optional[str] = None,
+    ):
+        """Reverses preprocessing on decoded images (unnormalize, then rescale and clip to `uint8`); PIL inputs pass through."""
+        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
+        rescale_factor = 1.0 / self.rescale_factor if rescale_factor is None else rescale_factor
+        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
+        image_mean = image_mean if image_mean is not None else self.image_mean
+        image_std = image_std if image_std is not None else self.image_std
+
+        images = make_list_of_images(images)  # Ensures input is a list
+
+        if isinstance(images[0], PIL.Image.Image):
+            return images if len(images) > 1 else images[0]  # PIL inputs are returned untouched
+
+        if input_data_format is None:
+            input_data_format = infer_channel_dimension_format(images[0])  # Inferred once, from the first image
+
+        pixel_values = []
+
+        for image in images:
+            image = to_numpy_array(image)  # Ensure NumPy format
+
+            if do_normalize:
+                image = self.unnormalize(
+                    image=image, image_mean=image_mean, image_std=image_std, input_data_format=input_data_format
+                )
+
+            if do_rescale:
+                image = self.rescale(image, scale=rescale_factor, input_data_format=input_data_format)
+                image = image.clip(0, 255).astype(np.uint8)  # clamp to the 8-bit pixel range
+
+            if do_normalize and do_rescale and return_tensors == "PIL.Image.Image":
+                image = to_channel_dimension_format(image, ChannelDimension.LAST, input_channel_dim=input_data_format)
+                image = PIL.Image.fromarray(image)
+
+            pixel_values.append(image)
+
+        data = {"pixel_values": pixel_values}
+        return_tensors = return_tensors if return_tensors != "PIL.Image.Image" else None
+
+        return BatchFeature(data=data, tensor_type=return_tensors)
+
+    def unnormalize(
+        self,
+        image: np.array,
+        image_mean: Union[float, Iterable[float]],
+        image_std: Union[float, Iterable[float]],
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    ) -> np.array:
+        """
+        Unnormalizes `image` using the mean and standard deviation specified by `image_mean` and `image_std`.
+        Intended result: image = (image * image_std) + image_mean, obtained via `self.normalize(-mean/std, 1/std)`.
+        Args:
+            image (`np.ndarray`):
+                Pixel values to unnormalize.
+            image_mean (`float` or `Iterable[float]`):
+                The mean to use for unnormalization.
+            image_std (`float` or `Iterable[float]`):
+                The standard deviation to use for unnormalization.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the input image. If unset, the channel dimension format is inferred
+                from the input image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+        """
+        num_channels = 3  # fixed channel count: iterable mean/std must have exactly 3 entries
+
+        if isinstance(image_mean, Iterable):
+            if len(image_mean) != num_channels:
+                raise ValueError(f"mean must have {num_channels} elements if it is an iterable, got {len(image_mean)}")
+        else:
+            image_mean = [image_mean] * num_channels
+
+        if isinstance(image_std, Iterable):
+            if len(image_std) != num_channels:
+                raise ValueError(f"std must have {num_channels} elements if it is an iterable, got {len(image_std)}")
+        else:
+            image_std = [image_std] * num_channels
+
+        rev_image_mean = tuple(-mean / std for mean, std in zip(image_mean, image_std))
+        rev_image_std = tuple(1 / std for std in image_std)
+        image = self.normalize(
+            image=image, mean=rev_image_mean, std=rev_image_std, input_data_format=input_data_format
+        )
+        return image
+
+
+__all__ = [
+    "JanusImageProcessor",
+    "JanusPreTrainedModel",
+    "JanusForConditionalGeneration",
+    "JanusModel",
+    "JanusVQVAE",
+    "JanusVisionModel",
+    "JanusVQVAEConfig",
+    "JanusVisionConfig",
+    "JanusConfig",
+]
diff --git a/phivenv/Lib/site-packages/transformers/models/janus/processing_janus.py b/phivenv/Lib/site-packages/transformers/models/janus/processing_janus.py
new file mode 100644
index 0000000000000000000000000000000000000000..2c106002e42e441c55226736039e0a8e92411a1f
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/janus/processing_janus.py
@@ -0,0 +1,164 @@
+# coding=utf-8
+# Copyright 2025 Deepseek AI and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Processor class for Janus.
+"""
+
+from typing import Union
+
+from ...feature_extraction_utils import BatchFeature
+from ...image_utils import ImageInput
+from ...processing_utils import ProcessingKwargs, ProcessorMixin, TextKwargs, Unpack
+from ...tokenization_utils_base import PreTokenizedInput, TextInput
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+DEFAULT_SYSTEM_PROMPT = (  # prepended to each prompt by `JanusProcessor.__call__` in text mode when enabled
+    "You are a helpful language and vision assistant. "
+    "You are able to understand the visual content that the user provides, "
+    "and assist the user with a variety of tasks using natural language.\n\n"
+)
+
+
+class JanusTextKwargs(TextKwargs, total=False):
+    generation_mode: str  # "text" or "image"; popped by `JanusProcessor.__call__` before the tokenizer is called
+
+
+class JanusProcessorKwargs(ProcessingKwargs, total=False):
+    text_kwargs: JanusTextKwargs
+    _defaults = {  # default kwargs; presumably applied by `_merge_kwargs` (called in `JanusProcessor.__call__`)
+        "text_kwargs": {"padding": False, "generation_mode": "text"},
+        "common_kwargs": {"return_tensors": "pt"},
+    }
+
+
+class JanusProcessor(ProcessorMixin):
+    r"""
+    Constructs a Janus processor which wraps a Janus Image Processor and a Llama tokenizer into a single processor.
+
+    [`JanusProcessor`] offers all the functionalities of [`JanusImageProcessor`] and [`LlamaTokenizerFast`]. See the
+    [`~JanusProcessor.__call__`] and [`~JanusProcessor.decode`] for more information.
+
+    Args:
+        image_processor ([`JanusImageProcessor`]):
+            The image processor is a required input.
+        tokenizer ([`LlamaTokenizerFast`]):
+            The tokenizer is a required input.
+        chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
+            in a chat into a tokenizable string.
+        use_default_system_prompt (`bool`, *optional*, defaults to `False`):
+            Use default system prompt for Text Generation.
+    """
+
+    attributes = ["image_processor", "tokenizer"]
+    image_processor_class = "JanusImageProcessor"
+    tokenizer_class = "LlamaTokenizerFast"
+
+    def __init__(self, image_processor, tokenizer, chat_template=None, use_default_system_prompt=False, **kwargs):
+        self.num_image_tokens = 576  # how many times each image token is repeated when prompts are expanded
+        self.image_token = tokenizer.image_token
+        self.image_start_token = tokenizer.boi_token  # placed before each expanded image-token run
+        self.image_end_token = tokenizer.eoi_token  # placed after each expanded image-token run
+        self.use_default_system_prompt = use_default_system_prompt
+        # NOTE(review): extra `**kwargs` are accepted but not forwarded to the parent constructor.
+        super().__init__(image_processor, tokenizer, chat_template=chat_template)
+
+    def __call__(
+        self,
+        text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
+        images: ImageInput = None,
+        videos=None,
+        audio=None,
+        **kwargs: Unpack[JanusProcessorKwargs],
+    ) -> BatchFeature:
+        """
+        Main method to prepare for the model one or several sequence(s) and image(s). This method forwards the `text`
+        and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode
+        the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
+        JanusImageProcessor's [`~JanusImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
+        of the above two methods for more information.
+
+        Args:
+            text (`str`, `list[str]`, `list[list[str]]`):
+                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
+                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
+                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
+            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`):
+                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
+                tensor. Both channels-first and channels-last formats are supported.
+            return_tensors (`str` or [`~utils.TensorType`], *optional*):
+                If set, will return tensors of a particular framework. Acceptable values are:
+                - `'tf'`: Return TensorFlow `tf.constant` objects.
+                - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                - `'np'`: Return NumPy `np.ndarray` objects.
+                - `'jax'`: Return JAX `jnp.ndarray` objects.
+
+        Returns:
+            [`BatchFeature`]: A [`BatchFeature`] with the following fields:
+
+            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
+            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
+              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
+              `None`).
+            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
+        """
+
+        output_kwargs = self._merge_kwargs(
+            JanusProcessorKwargs, tokenizer_init_kwargs=self.tokenizer.init_kwargs, **kwargs
+        )
+        generation_mode = output_kwargs["text_kwargs"].pop("generation_mode")
+
+        if text is None and images is None:
+            raise ValueError("You must specify either text or images.")
+
+        # `text` is optional: with images only, tokenization is skipped instead of iterating over `None`.
+        data = {}
+        if text is not None:
+            if isinstance(text, str):
+                text = [text]
+            elif not (isinstance(text, (list, tuple)) and all(isinstance(t, str) for t in text)):
+                raise ValueError("Invalid input text. Please provide a string, or a list of strings")
+
+            # Replace the image token with expanded image tokens.
+            prompt_strings = []
+            one_img_tokens = self.image_start_token + (self.image_token * self.num_image_tokens) + self.image_end_token
+            for prompt in text:
+                prompt = prompt.replace(self.image_token, one_img_tokens)
+                if self.use_default_system_prompt and generation_mode == "text":
+                    prompt = DEFAULT_SYSTEM_PROMPT + prompt
+                if generation_mode == "image":
+                    prompt += self.image_start_token
+                prompt_strings.append(prompt)
+            data = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"])
+
+        # Process images if pixel values are provided.
+        if images is not None and generation_mode != "image":
+            data["pixel_values"] = self.image_processor(images=images, **output_kwargs["images_kwargs"])[
+                "pixel_values"
+            ]
+
+        return BatchFeature(data=data)
+
+    def postprocess(self, images: ImageInput, **kwargs):
+        """
+        Forwards `images` and all keyword arguments to the image processor's `postprocess` method
+        and returns its result unchanged. Refer to that method's docstring for more details.
+        """
+        return self.image_processor.postprocess(images, **kwargs)
+
+
+__all__ = ["JanusProcessor"]
diff --git a/phivenv/Lib/site-packages/transformers/models/jetmoe/__init__.py b/phivenv/Lib/site-packages/transformers/models/jetmoe/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..7058590acc8859f934330ced9556d6ca66b50a51
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/jetmoe/__init__.py
@@ -0,0 +1,27 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_jetmoe import *  # real imports, only evaluated by static type checkers
+    from .modeling_jetmoe import *
+else:
+    import sys
+    # At runtime, replace this module's `sys.modules` entry with a `_LazyModule` built from this file's structure.
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/phivenv/Lib/site-packages/transformers/models/jetmoe/__pycache__/__init__.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/jetmoe/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a2197904410bddf40e9d4c9e6d82dfcfde881d7f
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/jetmoe/__pycache__/__init__.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/jetmoe/__pycache__/configuration_jetmoe.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/jetmoe/__pycache__/configuration_jetmoe.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..678bd6390693c7def146cf95afb2cf9d81d4954c
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/jetmoe/__pycache__/configuration_jetmoe.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/jetmoe/__pycache__/modeling_jetmoe.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/jetmoe/__pycache__/modeling_jetmoe.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4f1f52134acd1e96ab4fddf53c882220f9ef64d3
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/jetmoe/__pycache__/modeling_jetmoe.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/jetmoe/configuration_jetmoe.py b/phivenv/Lib/site-packages/transformers/models/jetmoe/configuration_jetmoe.py
new file mode 100644
index 0000000000000000000000000000000000000000..5846ea369da7adf7f1a96410ca05808583cc478b
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/jetmoe/configuration_jetmoe.py
@@ -0,0 +1,152 @@
+# coding=utf-8
+# Copyright 2024 JetMoe AI and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""JetMoe model configuration"""
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class JetMoeConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`JetMoeModel`]. It is used to instantiate a
+    JetMoe model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a configuration of the JetMoe-4B.
+
+    [jetmoe/jetmoe-8b](https://huggingface.co/jetmoe/jetmoe-8b)
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 32000):
+            Vocabulary size of the JetMoe model. Defines the number of different tokens that can be represented by the
+            `input_ids` passed when calling [`JetMoeModel`].
+        hidden_size (`int`, *optional*, defaults to 2048):
+            Dimension of the hidden representations.
+        num_hidden_layers (`int`, *optional*, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        num_key_value_heads (`int`, *optional*, defaults to 16):
+            Number of attention heads for each key and value in the Transformer encoder.
+        kv_channels (`int`, *optional*, defaults to 128):
+            Defines the number of channels for the key and value tensors.
+        intermediate_size (`int`, *optional*, defaults to 5632):
+            Dimension of the MLP representations.
+        max_position_embeddings (`int`, *optional*, defaults to 4096):
+            The maximum sequence length that this model might ever be used with. JetMoe's attention allows sequence of
+            up to 4096 tokens.
+        activation_function (`string`, *optional*, defaults to `"silu"`):
+            Defines the activation function for MLP experts.
+        num_local_experts (`int`, *optional*, defaults to 8):
+            Defines the number of experts in the MoE and MoA.
+        num_experts_per_tok (`int`, *optional*, defaults to 2):
+            The number of experts each token is routed to, for both MoE and MoA.
+        output_router_logits (`bool`, *optional*, defaults to `False`):
+            Whether or not the router logits should be returned by the model. Enabling this will also
+            allow the model to output the auxiliary loss.
+        aux_loss_coef (`float`, *optional*, defaults to 0.01):
+            The coefficient for the auxiliary loss.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        bos_token_id (`int`, *optional*, defaults to 1):
+            The id of the "beginning-of-sequence" token.
+        eos_token_id (`int`, *optional*, defaults to 2):
+            The id of the "end-of-sequence" token.
+        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
+            Whether the model's input and output word embeddings should be tied.
+        rope_theta (`float`, *optional*, defaults to 10000.0):
+            The base period of the RoPE embeddings.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
+            The epsilon used by the rms normalization layers.
+        initializer_range (`float`, *optional*, defaults to 0.01):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+
+    ```python
+    >>> from transformers import JetMoeModel, JetMoeConfig
+
+    >>> # Initializing a JetMoe 4B style configuration
+    >>> configuration = JetMoeConfig()
+
+    >>> # Initializing a model from the JetMoe 4B style configuration
+    >>> model = JetMoeModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "jetmoe"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        vocab_size=32000,
+        hidden_size=2048,
+        num_hidden_layers=12,
+        num_key_value_heads=16,
+        kv_channels=128,
+        intermediate_size=5632,
+        max_position_embeddings=4096,
+        activation_function="silu",
+        num_local_experts=8,
+        num_experts_per_tok=2,
+        output_router_logits=False,
+        aux_loss_coef=0.01,
+        use_cache=True,
+        bos_token_id=1,
+        eos_token_id=2,
+        tie_word_embeddings=True,
+        rope_theta=10000.0,
+        rms_norm_eps=1e-6,
+        initializer_range=0.01,
+        attention_dropout=0.0,
+        **kwargs,
+    ):
+        if num_experts_per_tok > num_local_experts:  # cannot route a token to more experts than exist
+            raise ValueError("`num_experts_per_tok` must be less than or equal to `num_local_experts`")
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_key_value_heads * num_experts_per_tok  # derived; not a constructor argument
+        self.num_key_value_heads = num_key_value_heads
+        self.kv_channels = kv_channels
+        self.intermediate_size = intermediate_size
+        self.max_position_embeddings = max_position_embeddings
+        self.activation_function = activation_function
+        self.num_local_experts = num_local_experts
+        self.num_experts_per_tok = num_experts_per_tok
+        self.output_router_logits = output_router_logits
+        self.aux_loss_coef = aux_loss_coef
+        self.use_cache = use_cache
+        self.initializer_range = initializer_range
+        self.attention_dropout = attention_dropout
+
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
+
+        self.rope_theta = rope_theta
+        self.rms_norm_eps = rms_norm_eps
+
+        super().__init__(
+            bos_token_id=bos_token_id, eos_token_id=eos_token_id, tie_word_embeddings=tie_word_embeddings, **kwargs
+        )
+
+
+__all__ = ["JetMoeConfig"]
diff --git a/phivenv/Lib/site-packages/transformers/models/jetmoe/modeling_jetmoe.py b/phivenv/Lib/site-packages/transformers/models/jetmoe/modeling_jetmoe.py
new file mode 100644
index 0000000000000000000000000000000000000000..64a93b71c9cda3711b9f7190fcab98fbd18c8dcd
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/jetmoe/modeling_jetmoe.py
@@ -0,0 +1,1217 @@
+# coding=utf-8
+# Copyright 2024 JetMoe AI and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch JetMoe model."""
+
+import math
+from typing import Optional, Union
+
+import torch
+import torch.utils.checkpoint
+from torch import nn
+from torch.nn import functional as F
+
+from ...activations import ACT2FN
+from ...cache_utils import Cache, DynamicCache
+from ...generation import GenerationMixin
+from ...modeling_attn_mask_utils import AttentionMaskConverter
+from ...modeling_flash_attention_utils import flash_attn_supports_top_left_mask, is_flash_attn_available
+from ...modeling_layers import (
+    GenericForSequenceClassification,
+    GradientCheckpointingLayer,
+)
+from ...modeling_outputs import MoeCausalLMOutputWithPast, MoeModelOutputWithPast
+from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
+from ...modeling_utils import PreTrainedModel
+from ...utils import auto_docstring, can_return_tuple, is_torch_flex_attn_available, logging
+from ...utils.deprecation import deprecate_kwarg
+from .configuration_jetmoe import JetMoeConfig
+
+
+if is_torch_flex_attn_available():
+    from torch.nn.attention.flex_attention import BlockMask
+
+    from ...integrations.flex_attention import make_flex_block_causal_mask
+
+
+if is_flash_attn_available():
+    from ...modeling_flash_attention_utils import _flash_attention_forward
+
+logger = logging.get_logger(__name__)
+
+
+# Adapted from transformers.models.qwen2_moe.modeling_qwen2_moe.load_balancing_loss_func
+def load_balancing_loss_func(
+    gate_logits: Union[torch.Tensor, tuple[torch.Tensor], None],
+    num_experts: Optional[int] = None,
+    top_k=2,
+    attention_mask: Optional[torch.Tensor] = None,
+) -> Union[torch.Tensor, int]:
+    r"""
+    Computes auxiliary load balancing loss as in Switch Transformer - implemented in Pytorch.
+
+    See Switch Transformer (https://huggingface.co/papers/2101.03961) for more details. This function implements the loss
+    function presented in equations (4) - (6) of the paper. It aims at penalizing cases where the routing between
+    experts is too unbalanced.
+
+    Args:
+        gate_logits:
+            Logits from the `gate`, should be a tuple of model.config.num_hidden_layers tensors of
+            shape [batch_size X sequence_length, num_experts].
+        num_experts:
+            Number of experts. When `None`, it is inferred from the last dimension of the gate logits.
+        top_k:
+            The number of experts to route per-token, can be also interpreted as the `top-k` routing
+            parameter.
+        attention_mask (`torch.Tensor`, *optional*):
+            The attention_mask used in forward function
+            shape [batch_size X sequence_length] if not None.
+
+    Returns:
+        The auxiliary loss, or the integer `0` when `gate_logits` is not a non-empty tuple.
+    """
+    if not isinstance(gate_logits, tuple) or len(gate_logits) == 0:
+        return 0
+
+    # Layers may live on different devices; gather every layer's logits on the first layer's device.
+    compute_device = gate_logits[0].device
+    concatenated_gate_logits = torch.cat([layer_gate.to(compute_device) for layer_gate in gate_logits], dim=0)
+    if num_experts is None:
+        num_experts = concatenated_gate_logits.shape[-1]
+
+    routing_weights = torch.nn.functional.softmax(concatenated_gate_logits, dim=-1)
+
+    _, selected_experts = torch.topk(routing_weights, top_k, dim=-1)
+
+    expert_mask = torch.nn.functional.one_hot(selected_experts, num_experts)
+
+    if attention_mask is None:
+        # Compute the percentage of tokens routed to each experts
+        tokens_per_expert = torch.mean(expert_mask.float(), dim=0)
+
+        # Compute the average probability of routing to these experts
+        router_prob_per_expert = torch.mean(routing_weights, dim=0)
+    else:
+        batch_size, sequence_length = attention_mask.shape
+        num_hidden_layers = concatenated_gate_logits.shape[0] // (batch_size * sequence_length)
+
+        # Compute the mask that masks all padding tokens as 0 with the same shape of expert_mask
+        expert_attention_mask = (
+            attention_mask[None, :, :, None, None]
+            .expand((num_hidden_layers, batch_size, sequence_length, top_k, num_experts))
+            .reshape(-1, top_k, num_experts)
+            .to(compute_device)
+        )
+
+        # Compute the percentage of tokens routed to each experts
+        tokens_per_expert = torch.sum(expert_mask.float() * expert_attention_mask, dim=0) / torch.sum(
+            expert_attention_mask, dim=0
+        )
+
+        # Compute the mask that masks all padding tokens as 0 with the same shape of tokens_per_expert
+        router_per_expert_attention_mask = (
+            attention_mask[None, :, :, None]
+            .expand((num_hidden_layers, batch_size, sequence_length, routing_weights.shape[1]))
+            .reshape(-1, routing_weights.shape[1])
+            .to(compute_device)
+        )
+
+        # Compute the average probability of routing to these experts
+        router_prob_per_expert = torch.sum(routing_weights * router_per_expert_attention_mask, dim=0) / torch.sum(
+            router_per_expert_attention_mask, dim=0
+        )
+
+    # Both statistics already span every expert, so no device-dependent slice is taken here: offsetting the
+    # expert axis by `num_experts * device.index` selects an empty range on any device whose index is not 0.
+    overall_loss = torch.sum(tokens_per_expert * router_prob_per_expert.unsqueeze(0))
+    return overall_loss * num_experts
+
+
+class JetMoeParallelExperts(nn.Module):
+    def __init__(self, num_experts: int, input_size: int, output_size: int) -> None:
+        """
+        Build a bank of `num_experts` bias-free linear maps stored in a single weight tensor.
+        The weight layout is [num_experts, output_size, input_size], which is compatible with MoE libraries such as
+        [Megablock](https://github.com/databricks/megablocks), [ScatterMoE](https://github.com/shawntan/scattermoe)
+        and the
+        [MoE kernel](https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/fused_moe/fused_moe.py)
+        used in vllm. The tensor comes from `torch.empty` and must be initialized or loaded before use.
+
+        Args:
+            num_experts (int):
+                Number of experts.
+            input_size (int):
+                Size of the input.
+            output_size (int):
+                Size of the output.
+        """
+        super().__init__()
+        self.num_experts = num_experts
+        self.input_size = input_size
+        self.output_size = output_size
+        self.weight = nn.Parameter(torch.empty(num_experts, output_size, input_size))
+
+    def forward(self, inputs, expert_size):
+        """
+        Apply each expert's linear map to its own contiguous slice of `inputs`.
+
+        Args:
+            inputs (Tensor):
+                Rows already grouped by expert along dim 0.
+            expert_size:
+                Per-expert row counts, passed to `Tensor.split` along dim 0.
+
+        Returns:
+            Tensor: The per-expert outputs concatenated along dim 0.
+        """
+        per_expert_inputs = inputs.split(expert_size, dim=0)
+        # Index by expert id (rather than zipping) so that a size list shorter than `num_experts` still raises.
+        per_expert_outputs = [
+            F.linear(per_expert_inputs[idx], self.weight[idx]) for idx in range(self.num_experts)
+        ]
+        return torch.cat(per_expert_outputs, dim=0)
+
+
+class JetMoeTopKGating(nn.Module):
+    def __init__(self, input_size: int, num_experts: int, top_k: int):
+        """
+        Router that scores every token against all experts with a bias-free linear layer.
+
+        Args:
+            input_size (`int`):
+                Size of the input.
+            num_experts (`int`):
+                Number of experts.
+            top_k (`int`):
+                Number of top experts to select.
+        """
+        super().__init__()
+
+        self.input_size = input_size
+        self.num_experts = num_experts
+        self.top_k = top_k
+
+        self.layer = nn.Linear(input_size, num_experts, bias=False)
+
+    def forward(self, hidden_states):
+        # Score the experts in float32 and keep the `top_k` best per token; gates are a softmax over those only.
+        logits = self.layer(hidden_states).float()  # [num_tokens, num_experts]
+        selected_logits, selected_experts = logits.topk(self.top_k, dim=1)  # [num_tokens, top_k]
+        selected_gates = torch.softmax(selected_logits, dim=1).type_as(hidden_states)  # [num_tokens, top_k]
+
+        # Mark the chosen experts of each token with ones, then count the assignments every expert receives.
+        one_hot_shape = [selected_gates.size(0), self.num_experts]  # [num_tokens, num_experts]
+        assignment = torch.zeros(one_hot_shape, dtype=selected_gates.dtype, device=selected_gates.device)
+        assignment = assignment.scatter(1, selected_experts, 1)
+        expert_counts = assignment.long().sum(0)  # [num_experts,]
+        # The original authors note that the `.tolist()` below makes torch.compile fail with
+        # `torch._dynamo.exc.Unsupported: Backend compiler failed with a fake tensor exception at`
+        # and `DataDependentOutputException`.
+        expert_size = expert_counts.tolist()
+
+        # Order the flattened (token, slot) assignments by expert id so each expert's rows end up contiguous.
+        flat_experts = selected_experts.flatten()  # [num_tokens * top_k]
+        index_sorted_experts = flat_experts.sort(0)[1]  # [num_tokens * top_k]
+        batch_index = index_sorted_experts.div(self.top_k, rounding_mode="trunc")  # token id of each sorted entry
+
+        # Reorder the gate values the same way so they line up with the grouped tokens.
+        flat_gates = selected_gates.flatten()  # [num_tokens * top_k]
+        batch_gates = flat_gates[index_sorted_experts]  # [num_tokens * top_k]
+
+        return index_sorted_experts, batch_index, batch_gates, expert_size, logits
+
+
+class JetMoeMoE(nn.Module):
+    """
+    Sparsely gated mixture-of-experts feed-forward layer; each expert computes `output(act(a) * b)` on its tokens.
+
+    Args:
+        config:
+            Configuration object with model hyperparameters.
+    """
+
+    def __init__(self, config: JetMoeConfig):
+        super().__init__()
+
+        num_experts = config.num_local_experts
+        self.input_size = config.hidden_size
+        self.hidden_size = config.intermediate_size
+        self.activation = ACT2FN[config.activation_function]
+        self.bias = torch.nn.Parameter(torch.empty(self.input_size))
+        # The input projection is twice as wide because its output is split into an activated and a linear half.
+        self.input_linear = JetMoeParallelExperts(num_experts, self.input_size, self.hidden_size * 2)
+        self.output_linear = JetMoeParallelExperts(num_experts, self.hidden_size, self.input_size)
+
+        self.router = JetMoeTopKGating(
+            input_size=self.input_size, num_experts=num_experts, top_k=config.num_experts_per_tok
+        )
+
+    def forward(self, layer_input):
+        """
+        Route every token through its selected experts and sum the gate-weighted expert outputs.
+
+        Args:
+            layer_input (Tensor):
+                Input tensor; its size is unpacked as (batch, length, embedding size).
+
+        Returns:
+            Tensor:
+                Output tensor, viewed as (batch, length, `self.input_size`) with the shared bias added.
+            Tensor:
+                Router logits.
+        """
+        bsz, length, emb_size = layer_input.size()
+        tokens = layer_input.reshape(-1, emb_size)
+        _, batch_index, batch_gates, expert_size, router_logits = self.router(tokens)
+
+        # Gather one row per (token, expert) assignment, grouped by expert, then run the gated feed-forward.
+        projected = self.input_linear(tokens[batch_index], expert_size)
+        activated_half, linear_half = projected.chunk(2, dim=-1)
+        expert_outputs = self.output_linear(self.activation(activated_half) * linear_half, expert_size)
+        expert_outputs = expert_outputs * batch_gates[:, None]
+
+        # Add the weighted expert rows back onto their source tokens.
+        layer_output = torch.zeros(
+            (bsz * length, self.input_size), dtype=expert_outputs.dtype, device=expert_outputs.device
+        ).index_add(0, batch_index, expert_outputs)
+        layer_output = layer_output.view(bsz, length, self.input_size) + self.bias
+
+        return layer_output, router_logits
+
+
+class JetMoeMoA(nn.Module):
+    """
+    Sparsely gated mixture-of-attention layer whose experts are pairs of query- and output-projections.
+
+    Args:
+        config:
+            Configuration object with model hyperparameters.
+    """
+
+    def __init__(self, config: JetMoeConfig):
+        super().__init__()
+
+        self.num_experts = config.num_local_experts
+        self.input_size = config.hidden_size
+        self.hidden_size = config.kv_channels * config.num_key_value_heads
+        self.top_k = config.num_experts_per_tok
+        self.bias = torch.nn.Parameter(torch.empty(self.input_size))
+
+        # Each expert owns a query projection (input_size -> hidden_size) and the matching output
+        # projection (hidden_size -> input_size).
+        self.input_linear = JetMoeParallelExperts(self.num_experts, self.input_size, self.hidden_size)
+        self.output_linear = JetMoeParallelExperts(self.num_experts, self.hidden_size, self.input_size)
+
+        self.router = JetMoeTopKGating(
+            input_size=self.input_size, num_experts=self.num_experts, top_k=self.top_k
+        )
+
+    def map(self, layer_input):
+        """
+        Route tokens to attention experts and compute the query projection of every selected expert.
+        """
+
+        # Flatten to one row per token and ask the router for the token -> expert topology.
+        bsz, length, emb_size = layer_input.size()
+        tokens = layer_input.reshape(-1, emb_size)  # [bsz * length, emb_size]
+        index_sorted_experts, batch_index, batch_gates, expert_size, router_logits = self.router(tokens)
+        topo_info = (index_sorted_experts, batch_index, batch_gates, expert_size)
+
+        # Gather rows grouped by expert and project them to queries.
+        grouped_tokens = tokens[batch_index]  # [bsz * length * top_k, emb_size]
+        queries = self.input_linear(grouped_tokens, expert_size)  # [bsz * length * top_k, hidden_size]
+
+        # Put the queries back in the original (token, slot) order.
+        buffer = torch.zeros(
+            (bsz * length * self.top_k, self.hidden_size), dtype=queries.dtype, device=queries.device
+        )
+        restored = buffer.index_add(0, index_sorted_experts, queries)
+        layer_output = restored.view(bsz, length, self.top_k, -1)  # [bsz, length, top_k, hidden_size]
+        return layer_output, router_logits, topo_info
+
+    def reduce(self, layer_input, topo_info):
+        """
+        Apply the output projection of each attention expert and merge the gate-weighted results per token.
+        """
+        bsz, length, _, hidden_size = layer_input.size()
+        flat_input = layer_input.reshape(-1, hidden_size)  # [bsz * length * k, hidden_size]
+        index_sorted_experts, batch_index, batch_gates, expert_size = topo_info
+
+        # Regroup rows by expert using the given topology, then project back to `input_size`.
+        grouped = flat_input[index_sorted_experts]  # [bsz * length * top_k, hidden_size]
+        projected = self.output_linear(grouped, expert_size)  # [bsz * length * top_k, emb_size]
+
+        # Weight every row by its routing gate.
+        weighted = projected * batch_gates[:, None]
+
+        # Sum the rows that belong to the same token, restore the batch layout and add the shared bias.
+        merged = torch.zeros((bsz * length, self.input_size), dtype=weighted.dtype, device=weighted.device)
+        merged = merged.index_add(0, batch_index, weighted)
+        merged = merged.view(bsz, length, self.input_size)
+        layer_output = merged + self.bias
+        return layer_output
+
+    def forward(self, layer_input):
+        raise NotImplementedError("This module doesn't support call and forward.")
+
+
+# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->JetMoe
+class JetMoeRMSNorm(nn.Module):
+    def __init__(self, hidden_size, eps=1e-6):
+        """
+        Root-mean-square layer norm with a learned per-channel scale (equivalent to T5LayerNorm).
+        """
+        super().__init__()
+        self.variance_epsilon = eps
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+
+    def forward(self, hidden_states):
+        # Normalize in float32, then cast back to the input dtype before applying the learned scale.
+        upcast = hidden_states.to(torch.float32)
+        mean_square = upcast.pow(2).mean(-1, keepdim=True)
+        normalized = upcast * torch.rsqrt(mean_square + self.variance_epsilon)
+        return self.weight * normalized.to(hidden_states.dtype)
+
+    def extra_repr(self):
+        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
+
+
+# Copied from transformers.models.gemma.modeling_gemma.GemmaRotaryEmbedding with Gemma->JetMoe
+class JetMoeRotaryEmbedding(nn.Module):
+    inv_freq: torch.Tensor  # fix linting for `register_buffer`
+
+    def __init__(self, config: JetMoeConfig, device=None):
+        super().__init__()
+        # BC: the "rope_type" key was originally called "type"
+        rope_scaling = getattr(config, "rope_scaling", None)
+        if isinstance(rope_scaling, dict):
+            self.rope_type = rope_scaling.get("rope_type", rope_scaling.get("type"))
+        else:
+            self.rope_type = "default"
+        self.max_seq_len_cached = config.max_position_embeddings
+        self.original_max_seq_len = config.max_position_embeddings
+        self.config = config
+        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
+
+        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+        self.original_inv_freq = self.inv_freq
+
+    @torch.no_grad()
+    @dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
+    def forward(self, x, position_ids):
+        batch = position_ids.shape[0]
+        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(batch, -1, 1).to(x.device)
+        position_ids_expanded = position_ids[:, None, :].float()
+        # autocast is disabled below to force float32; "mps" and non-string device types fall back to "cpu"
+        fallback_to_cpu = not isinstance(x.device.type, str) or x.device.type == "mps"
+        with torch.autocast(device_type="cpu" if fallback_to_cpu else x.device.type, enabled=False):
+            angles = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
+            emb = torch.cat((angles, angles), dim=-1)
+            cos, sin = emb.cos() * self.attention_scaling, emb.sin() * self.attention_scaling
+
+        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
+
+
+# Copied from transformers.models.llama.modeling_llama.rotate_half
+def rotate_half(x):
+    """Rotates half the hidden dims of the input: a last dim `[a, b]` becomes `[-b, a]`."""
+    split_at = x.shape[-1] // 2
+    first, second = x[..., :split_at], x[..., split_at:]
+    return torch.cat((-second, first), dim=-1)
+
+
+# Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
+    """Rotate the query and key tensors with Rotary Position Embedding.
+
+    Args:
+        q (`torch.Tensor`): The query tensor.
+        k (`torch.Tensor`): The key tensor.
+        cos (`torch.Tensor`): The cosine part of the rotary embedding.
+        sin (`torch.Tensor`): The sine part of the rotary embedding.
+        position_ids (`torch.Tensor`, *optional*):
+            Deprecated and unused.
+        unsqueeze_dim (`int`, *optional*, defaults to 1):
+            Dimension inserted into `cos` and `sin` so that they broadcast against `q` and `k`. With `cos`/`sin`
+            of shape [batch_size, seq_len, head_dim], use `unsqueeze_dim=1` when `q` and `k` are laid out as
+            [batch_size, heads, seq_len, head_dim] and `unsqueeze_dim=2` when they are laid out as
+            [batch_size, seq_len, heads, head_dim].
+    Returns:
+        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
+    """
+    cos, sin = cos.unsqueeze(unsqueeze_dim), sin.unsqueeze(unsqueeze_dim)
+
+    def _rotate(states):
+        # states * cos + rotate_half(states) * sin, with cos/sin broadcast through the inserted dimension
+        return (states * cos) + (rotate_half(states) * sin)
+
+    return _rotate(q), _rotate(k)
+
+
+class JetMoeAttention(nn.Module):
+    """
+    Multi-headed attention computed with explicit matmul/softmax; queries and outputs go through `JetMoeMoA` experts.
+    """
+
+    def __init__(self, config: JetMoeConfig, layer_idx: Optional[int] = None):
+        """
+        Set up the attention experts, the shared key/value projection and the rotary embedding.
+
+        Args:
+            config:
+                Configuration object with model hyperparameters.
+            layer_idx:
+                Index of the layer in the model.
+        """
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+        self.is_causal = True
+        if layer_idx is None:
+            logger.warning_once(
+                f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
+                "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
+                "when creating this class."
+            )
+
+        self.top_k = config.num_experts_per_tok
+        self.attention_dropout = config.attention_dropout
+        self.head_dim = config.kv_channels
+        self.num_heads = config.num_attention_heads
+        self.num_key_value_heads = config.num_key_value_heads
+        self.kv_projection_size = config.kv_channels * config.num_key_value_heads
+
+        self.experts = JetMoeMoA(config)  # routed query / output projections
+
+        self.kv_proj = torch.nn.Linear(config.hidden_size, self.kv_projection_size * 2, bias=False)  # fused K and V
+
+        self.rotary_emb = JetMoeRotaryEmbedding(config)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
+        cache_position: Optional[torch.LongTensor] = None,
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
+        bsz, q_len, _ = hidden_states.size()
+        # Queries come from the routed experts; keys and values share one dense projection.
+        query_states, router_logits, topo_info = self.experts.map(hidden_states)
+        key_states, value_states = self.kv_proj(hidden_states).chunk(2, dim=-1)
+
+        kv_shape = (bsz, q_len, self.num_key_value_heads, self.head_dim)
+        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = key_states.view(kv_shape).transpose(1, 2)
+        value_states = value_states.view(kv_shape).transpose(1, 2)
+        cos, sin = self.rotary_emb(value_states, position_ids)
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+
+        if past_key_values is not None:
+            # sin and cos are specific to RoPE models; cache_position is needed for the static cache
+            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        # Tile the k/v heads `top_k` times along the head dim, once per selected attention expert
+        key_states = key_states.repeat(1, self.top_k, 1, 1)
+        value_states = value_states.repeat(1, self.top_k, 1, 1)
+
+        scores = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
+
+        if attention_mask is not None:  # no matter the length, we just slice it
+            # the mask is added to the raw scores after trimming its last dim to the key length
+            scores = scores + attention_mask[:, :, :, : key_states.shape[-2]]
+
+        # Softmax in float32, then cast back to the query dtype
+        attn_weights = nn.functional.softmax(scores, dim=-1, dtype=torch.float32).to(query_states.dtype)
+        attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
+        attn_output = torch.matmul(attn_weights, value_states)
+
+        if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
+            raise ValueError(
+                f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
+                f" {attn_output.size()}"
+            )
+
+        attn_output = attn_output.transpose(1, 2).contiguous()
+        attn_output = attn_output.reshape(bsz, q_len, self.top_k, self.kv_projection_size)
+
+        # Per-expert output projection followed by the gate-weighted merge
+        attn_output = self.experts.reduce(attn_output, topo_info).view(bsz, q_len, -1)
+
+        # The attention probabilities are only handed back on request
+        attn_weights = attn_weights if output_attentions else None
+
+        return attn_output, attn_weights, router_logits
+
+
+class JetMoeSdpaAttention(JetMoeAttention):
+    """
+    JetMoe attention computed with `torch.nn.functional.scaled_dot_product_attention`. It inherits from
+    `JetMoeAttention` and keeps its weights untouched; only the forward pass differs, to fit the SDPA API. When
+    attention weights are requested, the call is delegated to the parent's forward.
+    """
+
+    # Adapted from JetMoeAttention.forward
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
+        cache_position: Optional[torch.LongTensor] = None,
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]], Optional[torch.Tensor]]:
+        if output_attentions:
+            # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
+            logger.warning_once(
+                "JetMoeModel is using JetMoeSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
+                'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
+            )
+            return super().forward(
+                hidden_states=hidden_states,
+                attention_mask=attention_mask,
+                position_ids=position_ids,
+                past_key_values=past_key_values,
+                output_attentions=output_attentions,
+                use_cache=use_cache,
+                cache_position=cache_position,
+            )
+
+        bsz, q_len, _ = hidden_states.size()
+        # Queries come from the routed experts; keys and values share one dense projection.
+        query_states, router_logits, topo_info = self.experts.map(hidden_states)
+        key_states, value_states = self.kv_proj(hidden_states).chunk(2, dim=-1)
+
+        kv_shape = (bsz, q_len, self.num_key_value_heads, self.head_dim)
+        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = key_states.view(kv_shape).transpose(1, 2)
+        value_states = value_states.view(kv_shape).transpose(1, 2)
+        cos, sin = self.rotary_emb(value_states, position_ids)
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+
+        if past_key_values is not None:
+            # sin and cos are specific to RoPE models; cache_position is needed for the static cache
+            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        # Tile the k/v heads `top_k` times along the head dim, once per selected attention expert
+        key_states = key_states.repeat(1, self.top_k, 1, 1)
+        value_states = value_states.repeat(1, self.top_k, 1, 1)
+
+        causal_mask = None
+        if attention_mask is not None:
+            causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
+
+        # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom
+        # attn_mask, so the inputs are made contiguous. Reference: https://github.com/pytorch/pytorch/issues/112577.
+        if causal_mask is not None and query_states.device.type == "cuda":
+            query_states, key_states, value_states = (
+                states.contiguous() for states in (query_states, key_states, value_states)
+            )
+
+        # `is_causal` is computed as its own statement (not inline in the SDPA call) to support both torch.compile's
+        # dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
+        is_causal = causal_mask is None and q_len > 1
+        dropout_p = self.attention_dropout if self.training else 0.0
+        attn_output = torch.nn.functional.scaled_dot_product_attention(
+            query_states,
+            key_states,
+            value_states,
+            attn_mask=causal_mask,
+            dropout_p=dropout_p,
+            is_causal=is_causal,
+        )
+
+        attn_output = attn_output.transpose(1, 2).contiguous()
+        attn_output = attn_output.reshape(bsz, q_len, self.top_k, self.kv_projection_size)
+
+        # Per-expert output projection followed by the gate-weighted merge
+        attn_output = self.experts.reduce(attn_output, topo_info).view(bsz, q_len, -1)
+
+        return attn_output, None, router_logits
+
+
+class JetMoeFlashAttention2(JetMoeAttention):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
+        # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignment, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
+        # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
+        self._flash_attn_uses_top_left_mask = flash_attn_supports_top_left_mask()
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: Optional[torch.FloatTensor],
+        attention_mask: Optional[torch.FloatTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        use_cache: Optional[bool] = False,
+        output_attentions: Optional[bool] = False,
+        cache_position: Optional[torch.LongTensor] = None,
+    ) -> Union[
+        tuple[torch.Tensor, tuple[torch.Tensor]],
+        Optional[tuple[torch.Tensor, tuple[torch.Tensor], tuple[torch.Tensor, ...]]],
+    ]:
+        """
+        Forward pass of the JetMoeAttention module.
+
+        Args:
+            hidden_states (Optional[torch.FloatTensor]): Input hidden states.
+            attention_mask (Optional[torch.FloatTensor]): Attention mask.
+            layer_past (Optional[tuple[torch.Tensor]]): Past layer state.
+            use_cache (Optional[bool]): Whether to use cached states.
+            output_attentions (Optional[bool]): Whether to output attention weights.
+            cache_position (Optional[torch.LongTensor]): Position of the cache.
+
+        Returns:
+            Union[tuple[torch.Tensor, tuple[torch.Tensor]], Optional[tuple[...]]]: Tuple containing outputs.
+        """
+        output_attentions = False
+        bsz, q_len, hidden_size = hidden_states.size()
+
+        # calculate query, key, values
+        query_states, router_logits, topo_info = self.experts.map(hidden_states)
+        key_states, value_states = self.kv_proj(hidden_states).chunk(2, dim=-1)
+
+        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+
+        cos, sin = self.rotary_emb(value_states, position_ids)
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+
+        if past_key_values is not None:
+            # sin and cos are specific to RoPE models; cache_position needed for the static cache
+            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        # repeat k/v heads for top-k attention experts
+        key_states = key_states.repeat(1, self.top_k, 1, 1)
+        value_states = value_states.repeat(1, self.top_k, 1, 1)
+
+        # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache
+        # to be able to avoid many of these transpose/reshape/view.
+        query_states = query_states.transpose(1, 2)
+        key_states = key_states.transpose(1, 2)
+        value_states = value_states.transpose(1, 2)
+
+        dropout_rate = self.attention_dropout if self.training else 0.0
+
+        # In PEFT, usually we cast the layer norms in float32 for training stability reasons
+        # therefore the input hidden states gets silently casted in float32. Hence, we need
+        # cast them back in the correct dtype just to be sure everything works as expected.
+        # This might slowdown training & inference so it is recommended to not cast the LayerNorms
+        # in fp32. (LlamaRMSNorm handles it correctly)
+
+        input_dtype = query_states.dtype
+        device_type = query_states.device.type if query_states.device.type != "mps" else "cpu"
+        if input_dtype == torch.float32:
+            if torch.is_autocast_enabled():
+                target_dtype = (
+                    torch.get_autocast_dtype(device_type)
+                    if hasattr(torch, "get_autocast_dtype")
+                    else torch.get_autocast_gpu_dtype()
+                )
+            # Handle the case where the model is quantized
+            elif hasattr(self.config, "_pre_quantization_dtype"):
+                target_dtype = self.config._pre_quantization_dtype
+            else:
+                target_dtype = self.kv_proj.weight.dtype
+
+            logger.warning_once(
+                f"The input hidden states seems to be silently casted in float32, this might be related to"
+                f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
+                f" {target_dtype}."
+            )
+
+            query_states = query_states.to(target_dtype)
+            key_states = key_states.to(target_dtype)
+            value_states = value_states.to(target_dtype)
+
+        attn_output = _flash_attention_forward(
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            q_len,
+            dropout=dropout_rate,
+            use_top_left_mask=self._flash_attn_uses_top_left_mask,
+            is_causal=self.is_causal,
+        ).to(input_dtype)
+
+        # output projection
+        attn_output = attn_output.reshape(bsz, q_len, self.top_k, self.kv_projection_size)
+        attn_output = self.experts.reduce(attn_output, topo_info)
+        attn_output = attn_output.view(bsz, q_len, hidden_size)  # re-assemble all head outputs side by side
+
+        if not output_attentions:
+            attn_weights = None
+
+        return attn_output, attn_weights, router_logits
+
+
+# Maps an attention-implementation name to the attention class that implements it.
+# `JetMoeBlock.__init__` indexes this mapping with `config._attn_implementation`.
+JETMOE_ATTENTION_CLASSES = {
+    "eager": JetMoeAttention,
+    "flash_attention_2": JetMoeFlashAttention2,
+    "sdpa": JetMoeSdpaAttention,
+}
+
+
+class JetMoeBlock(GradientCheckpointingLayer):
+    def __init__(self, config: JetMoeConfig, layer_idx: Optional[int] = None):
+        """
+        Initialize the JetMoeBlock module.
+
+        Args:
+            config:
+                Configuration object with model hyperparameters.
+        """
+        super().__init__()
+        self.input_layernorm = JetMoeRMSNorm(config.hidden_size)
+        self.self_attention = JETMOE_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx)
+        self.post_attention_layernorm = JetMoeRMSNorm(config.hidden_size)
+
+        self.mlp = JetMoeMoE(config)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: Optional[torch.FloatTensor],
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[tuple[torch.Tensor]] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = False,
+        output_router_logits: Optional[bool] = False,
+        use_cache: Optional[bool] = False,
+        cache_position: Optional[torch.LongTensor] = None,
+    ) -> Union[tuple[torch.Tensor], Optional[tuple[torch.Tensor, tuple[torch.FloatTensor, ...]]]]:
+        # Self Attention
+        attn_output, self_attn_weights, attn_router_logits = self.self_attention(
+            hidden_states=self.input_layernorm(hidden_states),
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            output_attentions=output_attentions,
+            use_cache=use_cache,
+            cache_position=cache_position,
+        )
+
+        hidden_states = hidden_states + attn_output
+        x_mlp, mlp_router_logits = self.mlp(self.post_attention_layernorm(hidden_states))
+        hidden_states = hidden_states + x_mlp
+
+        outputs = (hidden_states,)
+
+        if output_attentions:
+            outputs += (self_attn_weights,)
+
+        if output_router_logits:
+            outputs += attn_router_logits, mlp_router_logits
+
+        return outputs
+
+
+@auto_docstring
+class JetMoePreTrainedModel(PreTrainedModel):
+    config: JetMoeConfig
+    base_model_prefix = "transformer"
+    supports_gradient_checkpointing = False
+    _no_split_modules = ["JetMoeBlock"]
+    _skip_keys_device_placement = ["past_key_values"]
+    _supports_flash_attn = True
+    _supports_sdpa = True
+
+    def _init_weights(self, module):
+        """Initialize the weights."""
+        if isinstance(module, (nn.Linear,)):
+            # Slightly different from Mesh Transformer JAX which uses truncated_normal for initialization
+            # cf https://github.com/pytorch/pytorch/pull/5617
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+        elif isinstance(module, JetMoeRMSNorm):
+            module.weight.data.fill_(1.0)
+        elif isinstance(module, JetMoeParallelExperts):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+        elif isinstance(module, (JetMoeMoA, JetMoeMoE)):
+            module.bias.data.zero_()
+
+
+@auto_docstring
+class JetMoeModel(JetMoePreTrainedModel):
+    """
+    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`JetMoeBlock`]
+
+    Args:
+        config:
+            JetMoeConfig
+    """
+
+    def __init__(self, config: JetMoeConfig):
+        super().__init__(config)
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+        self.layers = nn.ModuleList([JetMoeBlock(config, layer_idx) for layer_idx in range(config.num_hidden_layers)])
+        self._attn_implementation = config._attn_implementation
+        self.norm = JetMoeRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+        self.gradient_checkpointing = False
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Union[Cache, list[torch.FloatTensor]]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        output_router_logits: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+    ) -> MoeModelOutputWithPast:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        output_router_logits = (
+            output_router_logits if output_router_logits is not None else self.config.output_router_logits
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+        if self.gradient_checkpointing and self.training and use_cache:
+            logger.warning_once(
+                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
+            )
+            use_cache = False
+
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+
+        # TODO (joao): remove this exception in v4.56 -- it exists for users that try to pass a legacy cache
+        if not isinstance(past_key_values, (type(None), Cache)):
+            raise ValueError("The `past_key_values` should be either a `Cache` object or `None`.")
+
+        if use_cache and past_key_values is None:
+            past_key_values = DynamicCache(config=self.config)
+
+        if cache_position is None:
+            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+            cache_position = torch.arange(
+                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
+            )
+        if position_ids is None:
+            position_ids = cache_position.unsqueeze(0)
+
+        if attention_mask is not None and self._attn_implementation == "flash_attention_2" and use_cache:
+            batch_size = inputs_embeds.shape[0]
+            is_padding_right = attention_mask[:, -1].sum().item() != batch_size
+            if is_padding_right:
+                raise ValueError(
+                    "You are attempting to perform batched generation with padding_side='right'"
+                    " this may lead to unexpected behaviour for Flash Attention version of JetMoe. Make sure to "
+                    " call `tokenizer.padding_side  = 'left'` before tokenizing the input. "
+                )
+        causal_mask = self._update_causal_mask(
+            attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
+        )
+
+        hidden_states = inputs_embeds
+
+        # decoder layers
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+        all_router_logits = () if output_router_logits else None
+
+        for decoder_layer in self.layers:
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+
+            layer_outputs = decoder_layer(
+                hidden_states,
+                attention_mask=causal_mask,
+                position_ids=position_ids,
+                past_key_values=past_key_values,
+                output_attentions=output_attentions,
+                output_router_logits=output_router_logits,
+                use_cache=use_cache,
+            )
+
+            hidden_states = layer_outputs[0]
+
+            if output_attentions:
+                all_self_attns += (layer_outputs[1],)
+
+            if output_router_logits:
+                all_router_logits += (layer_outputs[-2], layer_outputs[-1])
+
+        hidden_states = self.norm(hidden_states)
+
+        # add hidden states from the last decoder layer
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+
+        return MoeModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+            router_logits=all_router_logits,
+        )
+
+    # Copied from transformers.models.gptj.modeling_gptj.GPTJModel._update_causal_mask
+    def _update_causal_mask(
+        self,
+        attention_mask: Union[torch.Tensor, "BlockMask"],
+        input_tensor: torch.Tensor,
+        cache_position: torch.Tensor,
+        past_key_values: Cache,
+        output_attentions: bool = False,
+    ):
+        if self.config._attn_implementation == "flash_attention_2":
+            if attention_mask is not None and (attention_mask == 0.0).any():
+                return attention_mask
+            return None
+        if self.config._attn_implementation == "flex_attention":
+            if isinstance(attention_mask, torch.Tensor):
+                attention_mask = make_flex_block_causal_mask(attention_mask)
+            return attention_mask
+
+        # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
+        # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
+        # to infer the attention mask.
+        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+        using_compilable_cache = past_key_values.is_compileable if past_key_values is not None else False
+
+        # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
+        if self.config._attn_implementation == "sdpa" and not using_compilable_cache and not output_attentions:
+            if AttentionMaskConverter._ignore_causal_mask_sdpa(
+                attention_mask,
+                inputs_embeds=input_tensor,
+                past_key_values_length=past_seen_tokens,
+                is_training=self.training,
+            ):
+                return None
+
+        dtype = input_tensor.dtype
+        sequence_length = input_tensor.shape[1]
+        if using_compilable_cache:
+            target_length = past_key_values.get_max_cache_shape()
+        else:
+            target_length = (
+                attention_mask.shape[-1]
+                if isinstance(attention_mask, torch.Tensor)
+                else past_seen_tokens + sequence_length + 1
+            )
+
+        # In case the provided `attention` mask is 2D, we generate a causal mask here (4D).
+        causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
+            attention_mask,
+            sequence_length=sequence_length,
+            target_length=target_length,
+            dtype=dtype,
+            cache_position=cache_position,
+            batch_size=input_tensor.shape[0],
+        )
+
+        if (
+            self.config._attn_implementation == "sdpa"
+            and attention_mask is not None
+            and attention_mask.device.type in ["cuda", "xpu", "npu"]
+            and not output_attentions
+        ):
+            # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
+            # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
+            # Details: https://github.com/pytorch/pytorch/issues/110213
+            min_dtype = torch.finfo(dtype).min
+            causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)
+
+        return causal_mask
+
+    @staticmethod
+    # Copied from transformers.models.gptj.modeling_gptj.GPTJModel._prepare_4d_causal_attention_mask_with_cache_position
+    def _prepare_4d_causal_attention_mask_with_cache_position(
+        attention_mask: torch.Tensor,
+        sequence_length: int,
+        target_length: int,
+        dtype: torch.dtype,
+        cache_position: torch.Tensor,
+        batch_size: int,
+        **kwargs,
+    ):
+        """
+        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
+        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.
+
+        Args:
+            attention_mask (`torch.Tensor`):
+                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
+                `(batch_size, 1, query_length, key_value_length)`.
+            sequence_length (`int`):
+                The sequence length being processed.
+            target_length (`int`):
+                The target length: when generating with static cache, the mask should be as long as the static cache,
+                to account for the 0 padding, the part of the cache that is not filled yet.
+            dtype (`torch.dtype`):
+                The dtype to use for the 4D attention mask.
+            cache_position (`torch.Tensor`):
+                Indices depicting the position of the input sequence tokens in the sequence.
+            batch_size (`torch.Tensor`):
+                Batch size.
+        """
+        if attention_mask is not None and attention_mask.dim() == 4:
+            # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
+            causal_mask = attention_mask
+        else:
+            min_dtype = torch.finfo(dtype).min
+            causal_mask = torch.full(
+                (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device
+            )
+            if sequence_length != 1:
+                causal_mask = torch.triu(causal_mask, diagonal=1)
+            causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1)
+            causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
+            if attention_mask is not None:
+                causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
+                mask_length = attention_mask.shape[-1]
+                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
+                    causal_mask.device
+                )
+                padding_mask = padding_mask == 0
+                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
+                    padding_mask, min_dtype
+                )
+
+        return causal_mask
+
+
+class JetMoeForCausalLM(JetMoePreTrainedModel, GenerationMixin):
+    _tied_weights_keys = ["lm_head.weight"]
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.model = JetMoeModel(config)
+        self.vocab_size = config.vocab_size
+        self.aux_loss_coef = config.aux_loss_coef
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+        self.tie_word_embeddings = config.tie_word_embeddings
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        output_router_logits: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
+        **kwargs,
+    ) -> MoeCausalLMOutputWithPast:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+        """
+
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+
+        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
+        outputs: MoeModelOutputWithPast = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            cache_position=cache_position,
+        )
+
+        hidden_states = outputs.last_hidden_state
+        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
+        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+        logits = self.lm_head(hidden_states[:, slice_indices, :])
+
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(
+                logits,
+                labels,
+                vocab_size=self.config.vocab_size,
+                **kwargs,
+            )
+
+        aux_loss = None
+        if output_router_logits:
+            aux_loss = load_balancing_loss_func(
+                outputs.router_logits,
+                self.num_experts,
+                self.num_experts_per_tok,
+                attention_mask,
+            )
+            if labels is not None:
+                loss += self.aux_loss_coef * aux_loss.to(loss.device)  # make sure to reside in the same device
+
+        return MoeCausalLMOutputWithPast(
+            loss=loss,
+            aux_loss=aux_loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            router_logits=outputs.router_logits,
+        )
+
+
+# Sequence-classification model: defines no members of its own, everything is inherited from
+# `GenericForSequenceClassification` and `JetMoePreTrainedModel`.
+class JetMoeForSequenceClassification(GenericForSequenceClassification, JetMoePreTrainedModel): ...
+
+
+# Names exported by `from <this module> import *`.
+__all__ = ["JetMoeForCausalLM", "JetMoeModel", "JetMoePreTrainedModel", "JetMoeForSequenceClassification"]
diff --git a/phivenv/Lib/site-packages/transformers/models/kosmos2/__init__.py b/phivenv/Lib/site-packages/transformers/models/kosmos2/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..51dbdaf40a1050b8ff624d86ed58cd07fcd9174f
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/kosmos2/__init__.py
@@ -0,0 +1,28 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+# `TYPE_CHECKING` is False at runtime, so the star imports below are only seen by static type checkers.
+if TYPE_CHECKING:
+    from .configuration_kosmos2 import *
+    from .modeling_kosmos2 import *
+    from .processing_kosmos2 import *
+else:
+    import sys
+
+    # At runtime, replace this package's entry in `sys.modules` with a `_LazyModule` built from the import
+    # structure derived from this file's path.
+    # NOTE(review): presumably this defers importing the submodules until first attribute access -- confirm
+    # against `_LazyModule`.
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/phivenv/Lib/site-packages/transformers/models/kosmos2/__pycache__/__init__.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/kosmos2/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1ae102884148ef710bca034aef3846cbedfe06c5
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/kosmos2/__pycache__/__init__.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/kosmos2/__pycache__/configuration_kosmos2.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/kosmos2/__pycache__/configuration_kosmos2.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..56b195f9a477c96a2a4c7f29f9b482c9377f3bb1
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/kosmos2/__pycache__/configuration_kosmos2.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/kosmos2/__pycache__/modeling_kosmos2.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/kosmos2/__pycache__/modeling_kosmos2.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..99916281cd50cf454de8301ac5b7d242c889e386
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/kosmos2/__pycache__/modeling_kosmos2.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/kosmos2/__pycache__/processing_kosmos2.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/kosmos2/__pycache__/processing_kosmos2.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b8b9666032017dbfe2df486d741e784914fc1088
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/kosmos2/__pycache__/processing_kosmos2.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/kosmos2/configuration_kosmos2.py b/phivenv/Lib/site-packages/transformers/models/kosmos2/configuration_kosmos2.py
new file mode 100644
index 0000000000000000000000000000000000000000..56b26eb171780f5a2f1a022ad991f726773e8bb7
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/kosmos2/configuration_kosmos2.py
@@ -0,0 +1,264 @@
+# coding=utf-8
+# Copyright 2023 Microsoft Research and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""KOSMOS-2 model configuration"""
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class Kosmos2TextConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Kosmos2TextModel`]. It is used to instantiate a
+    KOSMOS-2 text decoder according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of the text decoder of the KOSMOS-2
+    [microsoft/kosmos-2-patch14-224](https://huggingface.co/microsoft/kosmos-2-patch14-224) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 65037):
+            Vocabulary size of the Kosmos2 model. Defines the number of different tokens that can be represented by the
+            `input_ids` passed when calling [`Kosmos2Model`].
+        max_position_embeddings (`int`, *optional*, defaults to 2048):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            just in case (e.g., 512 or 1024 or 2048).
+        embed_dim (`int`, *optional*, defaults to 2048):
+            Dimensionality of the layers and the pooler layer.
+        layers (`int`, *optional*, defaults to 24):
+            Number of hidden layers in the Transformer encoder.
+        ffn_dim (`int`, *optional*, defaults to 8192):
+            Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
+        attention_heads (`int`, *optional*, defaults to 32):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        activation_function (`str` or `function`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"silu"` and `"gelu_new"` are supported.
+        dropout (`float`, *optional*, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_dropout (`float`, *optional*, defaults to 0.1):
+            The dropout ratio for the attention probabilities.
+        activation_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for activations inside the fully connected layer.
+        layerdrop (`float`, *optional*, defaults to 0.0):
+            The LayerDrop probability for the decoder. See the [LayerDrop paper](https://huggingface.co/papers/1909.11556)
+            for more details.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the layer normalization layers.
+        init_std (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        scale_embedding (`bool`, *optional*, defaults to `True`):
+            Whether to scale the embeddings by a factor of sqrt(embed_dim).
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models).
+        pad_token_id (`int`, *optional*, defaults to 1):
+            Token id used for padding.
+        bos_token_id (`int`, *optional*, defaults to 0):
+            Token id used for beginning of string.
+        eos_token_id (`int`, *optional*, defaults to 2):
+            Token id used for end of string.
+    """
+
+    model_type = "kosmos_2_text_model"
+    base_config_key = "text_config"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    attribute_map = {
+        "num_attention_heads": "attention_heads",
+        "hidden_size": "embed_dim",
+        "num_hidden_layers": "layers",
+    }
+
+    def __init__(
+        self,
+        vocab_size=65037,
+        max_position_embeddings=2048,
+        embed_dim=2048,
+        layers=24,
+        ffn_dim=8192,
+        attention_heads=32,
+        activation_function="gelu",
+        dropout=0.1,
+        attention_dropout=0.1,
+        activation_dropout=0.0,
+        layerdrop=0.0,
+        layer_norm_eps=1e-5,
+        init_std=0.02,
+        scale_embedding=True,
+        use_cache=True,
+        pad_token_id=1,
+        bos_token_id=0,
+        eos_token_id=2,
+        **kwargs,
+    ):
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            **kwargs,
+        )
+
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.embed_dim = embed_dim
+        self.layers = layers
+        self.ffn_dim = ffn_dim
+        self.attention_heads = attention_heads
+        self.activation_function = activation_function
+        self.dropout = dropout
+        self.attention_dropout = attention_dropout
+        self.activation_dropout = activation_dropout
+        self.layerdrop = layerdrop
+        self.layer_norm_eps = layer_norm_eps
+        self.init_std = init_std
+        self.scale_embedding = scale_embedding
+        self.use_cache = use_cache
+
+
+class Kosmos2VisionConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Kosmos2VisionModel`]. It is used to instantiate a
+    KOSMOS-2 vision encoder according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of the vision encoder of the KOSMOS-2
+    [microsoft/kosmos-2-patch14-224](https://huggingface.co/microsoft/kosmos-2-patch14-224) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        hidden_size (`int`, *optional*, defaults to 1024):
+            Dimensionality of the encoder layers and the pooler layer.
+        intermediate_size (`int`, *optional*, defaults to 4096):
+            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+        num_hidden_layers (`int`, *optional*, defaults to 24):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        num_channels (`int`, *optional*, defaults to 3):
+            The number of input channels.
+        image_size (`int`, *optional*, defaults to 224):
+            The size (resolution) of each image.
+        patch_size (`int`, *optional*, defaults to 14):
+            The size (resolution) of each patch.
+        hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"selu"`, `"gelu_new"` and `"quick_gelu"` are supported.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the layer normalization layers.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        initializer_factor (`float`, *optional*, defaults to 1.0):
+            A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
+            testing).
+    """
+
+    model_type = "kosmos_2_vision_model"
+    base_config_key = "vision_config"
+
+    def __init__(
+        self,
+        hidden_size=1024,
+        intermediate_size=4096,
+        num_hidden_layers=24,
+        num_attention_heads=16,
+        num_channels=3,
+        image_size=224,
+        patch_size=14,
+        hidden_act="quick_gelu",
+        layer_norm_eps=1e-5,
+        attention_dropout=0.0,
+        initializer_range=0.02,
+        initializer_factor=1.0,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.num_channels = num_channels
+        self.patch_size = patch_size
+        self.image_size = image_size
+        self.initializer_range = initializer_range
+        self.initializer_factor = initializer_factor
+        self.attention_dropout = attention_dropout
+        self.layer_norm_eps = layer_norm_eps
+        self.hidden_act = hidden_act
+
+
+class Kosmos2Config(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Kosmos2Model`]. It is used to instantiate a
+    KOSMOS-2 model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the KOSMOS-2
+    [microsoft/kosmos-2-patch14-224](https://huggingface.co/microsoft/kosmos-2-patch14-224) architecture.
+
+    Args:
+        text_config (`dict`, *optional*):
+            Dictionary of configuration options used to initialize [`Kosmos2TextConfig`].
+        vision_config (`dict`, *optional*):
+            Dictionary of configuration options used to initialize [`Kosmos2VisionConfig`].
+        latent_query_num (`int`, *optional*, defaults to 64):
+            The number of latent query tokens that represent the image features used in the text decoder component.
+        kwargs (*optional*):
+            Dictionary of keyword arguments.
+
+    Example:
+
+    ```python
+    >>> from transformers import Kosmos2Config, Kosmos2Model
+
+    >>> # Initializing a Kosmos-2 kosmos-2-patch14-224 style configuration
+    >>> configuration = Kosmos2Config()
+
+    >>> # Initializing a model (with random weights) from the kosmos-2-patch14-224 style configuration
+    >>> model = Kosmos2Model(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "kosmos-2"
+    sub_configs = {"text_config": Kosmos2TextConfig, "vision_config": Kosmos2VisionConfig}
+
+    def __init__(
+        self,
+        text_config=None,
+        vision_config=None,
+        latent_query_num=64,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        if text_config is None:
+            text_config = {}
+            logger.info("`text_config` is `None`. Initializing the `Kosmos2TextConfig` with default values.")
+
+        if vision_config is None:
+            vision_config = {}
+            logger.info("`vision_config` is `None`. Initializing the `Kosmos2VisionConfig` with default values.")
+
+        self.text_config = Kosmos2TextConfig(**text_config)
+        self.vision_config = Kosmos2VisionConfig(**vision_config)
+
+        self.latent_query_num = latent_query_num
+
+
+__all__ = ["Kosmos2Config"]
diff --git a/phivenv/Lib/site-packages/transformers/models/kosmos2/modeling_kosmos2.py b/phivenv/Lib/site-packages/transformers/models/kosmos2/modeling_kosmos2.py
new file mode 100644
index 0000000000000000000000000000000000000000..6739022a3977cea7f4d12f30f50f97357f7df85a
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/kosmos2/modeling_kosmos2.py
@@ -0,0 +1,1864 @@
+# coding=utf-8
+# Copyright 2023 Microsoft Research and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch KOSMOS-2 model."""
+
+import math
+from dataclasses import dataclass
+from typing import Any, Callable, Optional, Union
+
+import torch
+import torch.utils.checkpoint
+from torch import nn
+
+from ...activations import ACT2FN
+from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache
+from ...generation import GenerationMixin
+from ...modeling_flash_attention_utils import FlashAttentionKwargs
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import (
+    BaseModelOutput,
+    BaseModelOutputWithPastAndCrossAttentions,
+    BaseModelOutputWithPooling,
+    CausalLMOutputWithCrossAttentions,
+)
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from ...processing_utils import Unpack
+from ...utils import ModelOutput, TransformersKwargs, auto_docstring, can_return_tuple, logging, torch_int
+from ...utils.deprecation import deprecate_kwarg
+from .configuration_kosmos2 import Kosmos2Config, Kosmos2TextConfig, Kosmos2VisionConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
+    """
+    Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
+    """
+    bsz, src_len = mask.size()
+    tgt_len = tgt_len if tgt_len is not None else src_len
+
+    expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
+
+    inverted_mask = 1.0 - expanded_mask
+
+    return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
+
+
+def _make_causal_mask(
+    input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0
+):
+    """
+    Make causal mask used for uni-directional (autoregressive) self-attention.
+    """
+    bsz, tgt_len = input_ids_shape
+    mask = torch.full((tgt_len, tgt_len), torch.finfo(dtype).min, device=device)
+    mask_cond = torch.arange(mask.size(-1), device=device)
+    mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
+    mask = mask.to(dtype)
+
+    if past_key_values_length > 0:
+        mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1)
+    return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)
+
+
+# Copied from transformers.models.roberta.modeling_roberta.create_position_ids_from_input_ids
+def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
+    """
+    Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
+    are ignored. This is modified from fairseq's `utils.make_positions`.
+
+    Args:
+        input_ids: torch.Tensor of token ids; entries equal to `padding_idx` are treated as padding.
+
+    Returns: torch.Tensor
+    """
+    # The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA.
+    mask = input_ids.ne(padding_idx).int()
+    incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
+    return incremental_indices.long() + padding_idx
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Base class for text model's outputs that also contains a pooling of the last hidden states.
+    """
+)
+class Kosmos2ModelOutput(ModelOutput):
+    r"""
+    past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
+        `(batch_size, num_heads, sequence_length, embed_size_per_head)` and optionally if
+        `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads,
+        encoder_sequence_length, embed_size_per_head)`.
+
+        Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
+        `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
+        input) to speed up sequential decoding.
+    image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
+        Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.
+    projection_attentions (`tuple(torch.FloatTensor)`, *optional*):
+        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+        sequence_length)`.
+
+        Attentions weights given by `Kosmos2ImageToTextProjection`, after the attention softmax, used to compute
+        the weighted average in the self-attention heads.
+    vision_model_output (`BaseModelOutputWithPooling`, *optional*):
+        The output of the [`Kosmos2VisionModel`].
+    """
+
+    last_hidden_state: Optional[torch.FloatTensor] = None
+    past_key_values: Optional[tuple[tuple[torch.FloatTensor]]] = None
+    hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    attentions: Optional[tuple[torch.FloatTensor]] = None
+    image_embeds: Optional[torch.FloatTensor] = None
+    projection_attentions: Optional[tuple[torch.FloatTensor]] = None
+    vision_model_output: BaseModelOutputWithPooling = None
+
+    def to_tuple(self) -> tuple[Any]:
+        return tuple(
+            self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple()
+            for k in self.keys()
+        )
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Model output class for `Kosmos2ForConditionalGeneration`.
+    """
+)
+class Kosmos2ForConditionalGenerationModelOutput(ModelOutput):
+    r"""
+    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+        Language modeling loss (for next-token prediction).
+    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
+        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+    past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
+        `(batch_size, num_heads, sequence_length, embed_size_per_head)` and optionally if
+        `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads,
+        encoder_sequence_length, embed_size_per_head)`.
+
+        Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
+        `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
+        input) to speed up sequential decoding.
+    image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
+        Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.
+    projection_attentions (`tuple(torch.FloatTensor)`, *optional*):
+        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+        sequence_length)`.
+
+        Attentions weights given by `Kosmos2ImageToTextProjection`, after the attention softmax, used to compute
+        the weighted average in the self-attention heads.
+    vision_model_output (`BaseModelOutputWithPooling`, *optional*):
+        The output of the [`Kosmos2VisionModel`].
+    """
+
+    loss: Optional[torch.FloatTensor] = None
+    logits: Optional[torch.FloatTensor] = None
+    past_key_values: Optional[tuple[tuple[torch.FloatTensor]]] = None
+    hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    attentions: Optional[tuple[torch.FloatTensor]] = None
+    image_embeds: Optional[torch.FloatTensor] = None
+    projection_attentions: Optional[tuple[torch.FloatTensor]] = None
+    vision_model_output: BaseModelOutputWithPooling = None
+
+    def to_tuple(self) -> tuple[Any]:
+        return tuple(
+            self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple()
+            for k in self.keys()
+        )
+
+
+# Copied from transformers.models.clip.modeling_clip.CLIPVisionEmbeddings with CLIP->Kosmos2
+class Kosmos2VisionEmbeddings(nn.Module):
+    def __init__(self, config: Kosmos2VisionConfig):
+        super().__init__()
+        self.config = config
+        self.embed_dim = config.hidden_size
+        self.image_size = config.image_size
+        self.patch_size = config.patch_size
+
+        self.class_embedding = nn.Parameter(torch.randn(self.embed_dim))
+
+        self.patch_embedding = nn.Conv2d(
+            in_channels=config.num_channels,
+            out_channels=self.embed_dim,
+            kernel_size=self.patch_size,
+            stride=self.patch_size,
+            bias=False,
+        )
+
+        self.num_patches = (self.image_size // self.patch_size) ** 2
+        self.num_positions = self.num_patches + 1
+        self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
+        self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)
+
+    def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
+        """
+        This method interpolates the pre-trained position encodings so that the model can be used on higher-resolution
+        images. This method is also adapted to support torch.jit tracing.
+
+        Adapted from:
+        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
+        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
+        """
+
+        num_patches = embeddings.shape[1] - 1
+        position_embedding = self.position_embedding.weight.unsqueeze(0)
+        num_positions = position_embedding.shape[1] - 1
+
+        # always interpolate when tracing to ensure the exported model works for dynamic input shapes
+        if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
+            return self.position_embedding(self.position_ids)
+
+        class_pos_embed = position_embedding[:, :1]
+        patch_pos_embed = position_embedding[:, 1:]
+
+        dim = embeddings.shape[-1]
+
+        new_height = height // self.patch_size
+        new_width = width // self.patch_size
+
+        sqrt_num_positions = torch_int(num_positions**0.5)
+        patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
+        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)
+
+        patch_pos_embed = nn.functional.interpolate(
+            patch_pos_embed,
+            size=(new_height, new_width),
+            mode="bicubic",
+            align_corners=False,
+        )
+
+        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
+
+        return torch.cat((class_pos_embed, patch_pos_embed), dim=1)
+
+    def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding=False) -> torch.Tensor:
+        batch_size, _, height, width = pixel_values.shape
+        if not interpolate_pos_encoding and (height != self.image_size or width != self.image_size):
+            raise ValueError(
+                f"Input image size ({height}*{width}) doesn't match model ({self.image_size}*{self.image_size})."
+            )
+        target_dtype = self.patch_embedding.weight.dtype
+        patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype))  # shape = [*, width, grid, grid]
+        patch_embeds = patch_embeds.flatten(2).transpose(1, 2)
+
+        class_embeds = self.class_embedding.expand(batch_size, 1, -1)
+        embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
+        if interpolate_pos_encoding:
+            embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
+        else:
+            embeddings = embeddings + self.position_embedding(self.position_ids)
+        return embeddings
+
+
+# Adapted from transformers.models.siglip.modeling_siglip.eager_attention_forward -> Kosmos2 doesn't cast attn weights to fp32
+def eager_attention_forward(
+    module: nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: Optional[torch.Tensor],
+    scaling: float,
+    dropout: float = 0.0,
+    **kwargs,
+):
+    attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling
+    if attention_mask is not None:
+        attn_weights = attn_weights + attention_mask
+
+    attn_weights = nn.functional.softmax(attn_weights, dim=-1)
+    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
+
+    attn_output = torch.matmul(attn_weights, value)
+    attn_output = attn_output.transpose(1, 2).contiguous()
+    return attn_output, attn_weights
+
+
+class Kosmos2VisionAttention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.embed_dim = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = self.embed_dim // self.num_heads
+        if self.head_dim * self.num_heads != self.embed_dim:
+            raise ValueError(
+                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
+                f" {self.num_heads})."
+            )
+        self.scale = self.head_dim**-0.5
+        self.dropout = config.attention_dropout
+        self.is_causal = False
+
+        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
+        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
+        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
+        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        causal_attention_mask: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = False,
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        """Input shape: Batch x Time x Channel"""
+
+        batch_size, seq_length, embed_dim = hidden_states.shape
+
+        queries = self.q_proj(hidden_states)
+        keys = self.k_proj(hidden_states)
+        values = self.v_proj(hidden_states)
+
+        queries = queries.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
+        keys = keys.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
+        values = values.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
+        # CLIP text model uses both `causal_attention_mask` and `attention_mask`
+        # in case FA2 kernel is called, `is_causal` should be inferred from `causal_attention_mask`
+        if self.config._attn_implementation != "flash_attention_2":
+            if attention_mask is not None and causal_attention_mask is not None:
+                attention_mask = attention_mask + causal_attention_mask
+            elif causal_attention_mask is not None:
+                attention_mask = causal_attention_mask
+        else:
+            self.is_causal = causal_attention_mask is not None
+
+        attention_interface: Callable = eager_attention_forward
+        if self.config._attn_implementation != "eager":
+            if self.config._attn_implementation == "sdpa" and output_attentions:
+                logger.warning_once(
+                    "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
+                    'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
+                )
+            else:
+                attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+        attn_output, attn_weights = attention_interface(
+            self,
+            queries,
+            keys,
+            values,
+            attention_mask,
+            is_causal=self.is_causal,
+            scaling=self.scale,
+            dropout=0.0 if not self.training else self.dropout,
+        )
+
+        attn_output = attn_output.reshape(batch_size, seq_length, embed_dim).contiguous()
+        attn_output = self.out_proj(attn_output)
+        if not output_attentions:
+            attn_weights = None
+        return attn_output, attn_weights
+
+
+# Copied from transformers.models.clip.modeling_clip.CLIPMLP with CLIP->Kosmos2Vision
+class Kosmos2VisionMLP(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.activation_fn = ACT2FN[config.hidden_act]
+        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
+        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.fc1(hidden_states)
+        hidden_states = self.activation_fn(hidden_states)
+        hidden_states = self.fc2(hidden_states)
+        return hidden_states
+
+
+# Copied from transformers.models.altclip.modeling_altclip.AltCLIPEncoderLayer with AltCLIP->Kosmos2Vision
+class Kosmos2VisionEncoderLayer(GradientCheckpointingLayer):
+    def __init__(self, config: Kosmos2VisionConfig):
+        super().__init__()
+        self.embed_dim = config.hidden_size
+        self.self_attn = Kosmos2VisionAttention(config)
+        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+        self.mlp = Kosmos2VisionMLP(config)
+        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: torch.Tensor,
+        causal_attention_mask: torch.Tensor,
+        output_attentions: Optional[bool] = False,
+    ) -> tuple[torch.FloatTensor]:
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+            attention_mask (`torch.FloatTensor`): attention mask of size
+                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+            causal_attention_mask (`torch.Tensor`): causal mask passed to the self-attention layer alongside `attention_mask`.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+        """
+        residual = hidden_states
+
+        hidden_states = self.layer_norm1(hidden_states)
+        hidden_states, attn_weights = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            causal_attention_mask=causal_attention_mask,
+            output_attentions=output_attentions,
+        )
+        hidden_states = residual + hidden_states
+
+        residual = hidden_states
+        hidden_states = self.layer_norm2(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states
+
+        outputs = (hidden_states,)
+
+        if output_attentions:
+            outputs += (attn_weights,)
+
+        return outputs
+
+
+# Copied from transformers.models.altclip.modeling_altclip.AltCLIPEncoder with AltCLIP->Kosmos2Vision
+class Kosmos2VisionEncoder(nn.Module):
+    """
+    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
+    [`Kosmos2VisionEncoderLayer`].
+
+    Args:
+        config: Kosmos2VisionConfig
+    """
+
+    def __init__(self, config: Kosmos2VisionConfig):
+        super().__init__()
+        self.config = config
+        self.layers = nn.ModuleList([Kosmos2VisionEncoderLayer(config) for _ in range(config.num_hidden_layers)])
+        self.gradient_checkpointing = False
+
+    @can_return_tuple
+    def forward(
+        self,
+        inputs_embeds,
+        attention_mask: Optional[torch.Tensor] = None,
+        causal_attention_mask: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, BaseModelOutput]:
+        r"""
+        Args:
+            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
+                than the model's internal embedding lookup matrix.
+            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
+
+                [What are attention masks?](../glossary#attention-mask)
+            causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Causal mask for the text model. Mask values selected in `[0, 1]`:
+
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
+
+                [What are attention masks?](../glossary#attention-mask)
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            output_hidden_states (`bool`, *optional*):
+                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+                for more detail.
+            return_dict (`bool`, *optional*):
+                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        encoder_states = () if output_hidden_states else None
+        all_attentions = () if output_attentions else None
+
+        hidden_states = inputs_embeds
+        for idx, encoder_layer in enumerate(self.layers):
+            if output_hidden_states:
+                encoder_states = encoder_states + (hidden_states,)
+            layer_outputs = encoder_layer(
+                hidden_states,
+                attention_mask,
+                causal_attention_mask,
+                output_attentions=output_attentions,
+            )
+
+            hidden_states = layer_outputs[0]
+
+            if output_attentions:
+                all_attentions = all_attentions + (layer_outputs[1],)
+
+        if output_hidden_states:
+            encoder_states = encoder_states + (hidden_states,)
+
+        return BaseModelOutput(
+            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
+        )
+
+
+# Similar to `transformers.models.clip.modeling_clip.CLIPVisionTransformer` but without docstring for `forward`
+class Kosmos2VisionTransformer(nn.Module):
+    # Copied from transformers.models.altclip.modeling_altclip.AltCLIPVisionTransformer.__init__ with AltCLIPVision->Kosmos2Vision,ALTCLIP_VISION->KOSMOS2_VISION,AltCLIP->Kosmos2Vision
+    def __init__(self, config: Kosmos2VisionConfig):
+        super().__init__()
+        self.config = config
+        embed_dim = config.hidden_size
+
+        self.embeddings = Kosmos2VisionEmbeddings(config)
+        self.pre_layrnorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
+        self.encoder = Kosmos2VisionEncoder(config)
+        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
+
+    def forward(
+        self,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        interpolate_pos_encoding: bool = False,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, BaseModelOutputWithPooling]:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if pixel_values is None:
+            raise ValueError("You have to specify pixel_values")
+
+        hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding)
+        hidden_states = self.pre_layrnorm(hidden_states)
+
+        encoder_outputs = self.encoder(
+            inputs_embeds=hidden_states,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        last_hidden_state = encoder_outputs[0]
+        pooled_output = last_hidden_state[:, 0, :]
+        pooled_output = self.post_layernorm(pooled_output)
+
+        if not return_dict:
+            return (last_hidden_state, pooled_output) + encoder_outputs[1:]
+
+        return BaseModelOutputWithPooling(
+            last_hidden_state=last_hidden_state,
+            pooler_output=pooled_output,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+        )
+
+
+# Similar to `transformers.models.m2m_100.modeling_m2m_100.M2M100SinusoidalPositionalEmbedding` but allowing to pass `position_ids`
+class Kosmos2TextSinusoidalPositionalEmbedding(nn.Module):
+    """This module produces sinusoidal positional embeddings of any length."""
+
+    # Copied from transformers.models.m2m_100.modeling_m2m_100.M2M100SinusoidalPositionalEmbedding.__init__
+    def __init__(self, num_positions: int, embedding_dim: int, padding_idx: Optional[int] = None):
+        super().__init__()
+        self.offset = 2
+        self.embedding_dim = embedding_dim
+        self.padding_idx = padding_idx
+        self.make_weights(num_positions + self.offset, embedding_dim, padding_idx)
+
+    # Copied from transformers.models.m2m_100.modeling_m2m_100.M2M100SinusoidalPositionalEmbedding.make_weights
+    def make_weights(self, num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None):
+        emb_weights = self.get_embedding(num_embeddings, embedding_dim, padding_idx)
+        if hasattr(self, "weights"):
+            # in forward put the weights on the correct dtype and device of the param
+            emb_weights = emb_weights.to(dtype=self.weights.dtype, device=self.weights.device)
+
+        self.register_buffer("weights", emb_weights, persistent=False)
+
+    @staticmethod
+    # Copied from transformers.models.m2m_100.modeling_m2m_100.M2M100SinusoidalPositionalEmbedding.get_embedding
+    def get_embedding(num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None):
+        """
+        Build sinusoidal embeddings.
+
+        This matches the implementation in tensor2tensor, but differs slightly from the description in Section 3.5 of
+        "Attention Is All You Need".
+        """
+        half_dim = embedding_dim // 2
+        emb = math.log(10000) / (half_dim - 1)
+        emb = torch.exp(torch.arange(half_dim, dtype=torch.int64).float() * -emb)
+        emb = torch.arange(num_embeddings, dtype=torch.int64).float().unsqueeze(1) * emb.unsqueeze(0)
+        emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1).view(num_embeddings, -1)
+        if embedding_dim % 2 == 1:
+            # zero pad
+            emb = torch.cat([emb, torch.zeros(num_embeddings, 1)], dim=1)
+        if padding_idx is not None:
+            emb[padding_idx, :] = 0
+
+        return emb.to(torch.get_default_dtype())
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        past_key_values_length: int = 0,
+        position_ids: Optional[torch.Tensor] = None,
+    ):
+        if input_ids is not None:
+            bsz, seq_len = input_ids.size()
+            if position_ids is None:
+                # Create the position ids from the input token ids. Any padded tokens remain padded.
+                position_ids = create_position_ids_from_input_ids(
+                    input_ids, self.padding_idx, past_key_values_length
+                ).to(input_ids.device)
+        else:
+            bsz, seq_len = inputs_embeds.size()[:-1]
+            if position_ids is None:
+                position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds, past_key_values_length)
+
+        # expand embeddings if needed
+        max_pos = self.padding_idx + 1 + seq_len + past_key_values_length
+        if max_pos > self.weights.size(0):
+            self.make_weights(max_pos + self.offset, self.embedding_dim, self.padding_idx)
+
+        return self.weights.index_select(0, position_ids.view(-1)).view(bsz, seq_len, self.weights.shape[-1]).detach()
+
+    # Copied from transformers.models.m2m_100.modeling_m2m_100.M2M100SinusoidalPositionalEmbedding.create_position_ids_from_inputs_embeds
+    def create_position_ids_from_inputs_embeds(self, inputs_embeds, past_key_values_length):
+        """
+        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.
+
+        Args:
+            inputs_embeds: torch.Tensor
+
+        Returns: torch.Tensor
+        """
+        input_shape = inputs_embeds.size()[:-1]
+        sequence_length = input_shape[1]
+
+        position_ids = torch.arange(
+            self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
+        )
+        return position_ids.unsqueeze(0).expand(input_shape).contiguous() + past_key_values_length
+
+
+class KosmosTextAttention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+    # Similar to transformers.models.bart.modeling_bart.BartAttention.__init__ except an additional `inner_attn_ln`.
+    def __init__(
+        self,
+        config,
+        embed_dim: int,
+        num_heads: int,
+        dropout: float = 0.0,
+        is_decoder: Optional[bool] = False,
+        add_inner_attn_layernorm: Optional[bool] = False,
+        bias: Optional[bool] = True,
+        layer_idx: Optional[bool] = None,
+    ):
+        super().__init__()
+        self.config = config
+        self.embed_dim = embed_dim
+        self.num_heads = num_heads
+        self.dropout = dropout
+        self.head_dim = embed_dim // num_heads
+
+        if (self.head_dim * num_heads) != self.embed_dim:
+            raise ValueError(
+                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
+                f" and `num_heads`: {num_heads})."
+            )
+        self.scaling = self.head_dim**-0.5
+        self.is_decoder = is_decoder
+        self.layer_idx = layer_idx
+
+        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+
+        # End opy
+        self.inner_attn_ln = None
+        if add_inner_attn_layernorm:
+            self.inner_attn_ln = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        past_key_values: Optional[Cache] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        layer_head_mask: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+        cache_position: Optional[torch.Tensor] = None,
+        **kwargs,
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[Cache]]:
+        """Input shape: Batch x Time x Channel"""
+
+        # if key_value_states are provided this layer is used as a cross-attention layer
+        # for the decoder
+        is_cross_attention = encoder_hidden_states is not None
+        batch_size, seq_length = hidden_states.shape[:2]
+
+        query_states = self.q_proj(hidden_states)
+        query_states = query_states.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
+
+        if past_key_values is not None:
+            if isinstance(past_key_values, EncoderDecoderCache):
+                is_updated = past_key_values.is_updated.get(self.layer_idx)
+                if is_cross_attention:
+                    # after the first generated id, we can subsequently re-use all key/value_states from cache
+                    curr_past_key_value = past_key_values.cross_attention_cache
+                else:
+                    curr_past_key_value = past_key_values.self_attention_cache
+            else:
+                curr_past_key_value = past_key_values
+
+        current_states = encoder_hidden_states if is_cross_attention else hidden_states
+        if is_cross_attention and past_key_values is not None and is_updated:
+            # reuse k,v, cross_attentions
+            key_states = curr_past_key_value.layers[self.layer_idx].keys
+            value_states = curr_past_key_value.layers[self.layer_idx].values
+        else:
+            key_states = self.k_proj(current_states)
+            value_states = self.v_proj(current_states)
+            key_states = key_states.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
+            value_states = value_states.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
+
+            if past_key_values is not None:
+                # save all key/value_states to cache to be re-used for fast auto-regressive generation
+                cache_position = cache_position if not is_cross_attention else None
+                key_states, value_states = curr_past_key_value.update(
+                    key_states, value_states, self.layer_idx, {"cache_position": cache_position}
+                )
+                # set flag that curr layer for cross-attn is already updated so we can re-use in subsequent calls
+                if is_cross_attention:
+                    past_key_values.is_updated[self.layer_idx] = True
+
+        attention_interface: Callable = eager_attention_forward
+
+        if self.config._attn_implementation != "eager":
+            if self.config._attn_implementation == "sdpa" and output_attentions:
+                logger.warning_once(
+                    "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
+                    'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
+                )
+            else:
+                attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            dropout=0.0 if not self.training else self.dropout,
+            scaling=self.scaling,
+            **kwargs,
+        )
+
+        attn_output = attn_output.reshape(batch_size, seq_length, -1).contiguous()
+        if self.inner_attn_ln is not None:
+            attn_output = self.inner_attn_ln(attn_output)
+
+        attn_output = self.out_proj(attn_output)
+
+        return attn_output, attn_weights
+
+
+class Kosmos2TextFFN(nn.Module):
+    def __init__(self, config: Kosmos2TextConfig):
+        super().__init__()
+
+        self.dropout = config.dropout
+        self.activation_fn = ACT2FN[config.activation_function]
+        self.activation_dropout = config.activation_dropout
+
+        self.fc1 = nn.Linear(config.embed_dim, config.ffn_dim)
+        self.fc2 = nn.Linear(config.ffn_dim, config.embed_dim)
+
+        self.ffn_layernorm = nn.LayerNorm(config.ffn_dim, eps=config.layer_norm_eps)
+
+    def forward(self, hidden_states):
+        hidden_states = self.activation_fn(self.fc1(hidden_states))
+        hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
+        hidden_states = self.ffn_layernorm(hidden_states)
+        hidden_states = self.fc2(hidden_states)
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+
+        return hidden_states
+
+
+class Kosmos2TextBlock(GradientCheckpointingLayer):
+    def __init__(self, config: Kosmos2TextConfig, layer_idx=None):
+        super().__init__()
+        self.embed_dim = config.embed_dim
+
+        self.self_attn = KosmosTextAttention(
+            config,
+            embed_dim=self.embed_dim,
+            num_heads=config.attention_heads,
+            dropout=config.attention_dropout,
+            is_decoder=True,
+            add_inner_attn_layernorm=True,
+            layer_idx=layer_idx,
+        )
+        self.dropout = config.dropout
+        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+
+        if config.add_cross_attention:
+            self.encoder_attn = KosmosTextAttention(
+                config,
+                embed_dim=self.embed_dim,
+                num_heads=config.attention_heads,
+                dropout=config.attention_dropout,
+                is_decoder=True,
+                add_inner_attn_layernorm=False,
+                layer_idx=layer_idx,
+            )
+            self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+
+        self.ffn = Kosmos2TextFFN(config)
+        self.final_layer_norm = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        layer_head_mask: Optional[torch.Tensor] = None,
+        cross_attn_layer_head_mask: Optional[torch.Tensor] = None,
+        past_key_values: Optional[Cache] = None,
+        output_attentions: Optional[bool] = False,
+        use_cache: Optional[bool] = True,
+        cache_position: Optional[torch.Tensor] = None,
+        **kwargs,
+    ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
+        residual = hidden_states
+        hidden_states = self.self_attn_layer_norm(hidden_states)
+
+        hidden_states, self_attn_weights = self.self_attn(
+            hidden_states=hidden_states,
+            past_key_values=past_key_values,
+            attention_mask=attention_mask,
+            layer_head_mask=layer_head_mask,
+            output_attentions=output_attentions,
+            cache_position=cache_position,
+            **kwargs,
+        )
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+        hidden_states = residual + hidden_states
+
+        # Cross-Attention Block
+        cross_attn_weights = None
+        if encoder_hidden_states is not None:
+            if not hasattr(self, "encoder_attn"):
+                raise ValueError(
+                    f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers"
+                    " by setting `config.add_cross_attention=True`"
+                )
+
+            residual = hidden_states
+            hidden_states = self.encoder_attn_layer_norm(hidden_states)
+
+            hidden_states, cross_attn_weights = self.encoder_attn(
+                hidden_states=hidden_states,
+                encoder_hidden_states=encoder_hidden_states,
+                attention_mask=encoder_attention_mask,
+                layer_head_mask=cross_attn_layer_head_mask,
+                past_key_values=past_key_values,
+                output_attentions=output_attentions,
+                cache_position=cache_position,
+                **kwargs,
+            )
+            hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+            hidden_states = residual + hidden_states
+
+        # Fully Connected
+        residual = hidden_states
+
+        hidden_states = self.final_layer_norm(hidden_states)
+
+        # FFN
+        hidden_states = self.ffn(hidden_states)
+        hidden_states = residual + hidden_states
+
+        outputs = (hidden_states,)
+
+        if output_attentions:
+            outputs += (self_attn_weights, cross_attn_weights)
+        return outputs
+
+
+class Kosmos2TextTransformer(nn.Module):
+    """
+    Transformer decoder consisting of `config.layers` layers. Each layer is a [`Kosmos2TextBlock`].
+
+    Args:
+        config: Kosmos2TextConfig
+    """
+
+    def __init__(self, config: Kosmos2TextConfig):
+        super().__init__()
+        self.config = config
+        self.dropout = config.dropout
+        self.layerdrop = config.layerdrop
+
+        self.embed_scale = math.sqrt(config.embed_dim) if config.scale_embedding else 1.0
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.embed_dim, padding_idx=config.pad_token_id)
+
+        self.embed_positions = Kosmos2TextSinusoidalPositionalEmbedding(
+            num_positions=config.max_position_embeddings,
+            embedding_dim=config.embed_dim,
+            padding_idx=config.pad_token_id,
+        )
+
+        self.layers = nn.ModuleList([Kosmos2TextBlock(config, layer_idx=i) for i in range(config.layers)])
+        self.layer_norm = nn.LayerNorm(config.embed_dim, config.layer_norm_eps)
+
+        self.gradient_checkpointing = False
+
+    def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length):
+        # create causal mask
+        # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+        combined_attention_mask = None
+        if input_shape[-1] > 1:
+            combined_attention_mask = _make_causal_mask(
+                input_shape,
+                inputs_embeds.dtype,
+                device=inputs_embeds.device,
+                past_key_values_length=past_key_values_length,
+            )
+
+        if attention_mask is not None:
+            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+            expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to(
+                inputs_embeds.device
+            )
+            combined_attention_mask = (
+                expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask
+            )
+
+        return combined_attention_mask
+
+    def forward_embedding(
+        self,
+        input_ids,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        image_embeds: Optional[torch.Tensor] = None,
+        img_input_mask: Optional[torch.Tensor] = None,
+        past_key_values_length: int = 0,
+        position_ids: Optional[torch.Tensor] = None,
+    ):
+        # The argument `inputs_embeds` should be the one without being multiplied by `self.embed_scale`.
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+
+        if image_embeds is not None:
+            inputs_embeds[img_input_mask.to(dtype=torch.bool)] = image_embeds.to(inputs_embeds.device).view(
+                -1, image_embeds.size(-1)
+            )
+
+        inputs_embeds = inputs_embeds * self.embed_scale
+
+        # embed positions
+        positions = self.embed_positions(
+            input_ids=input_ids,
+            inputs_embeds=inputs_embeds,
+            past_key_values_length=past_key_values_length,
+            position_ids=position_ids,
+        )
+        positions = positions.to(inputs_embeds.device)
+
+        hidden_states = inputs_embeds + positions
+
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+
+        return hidden_states
+
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        image_embeds: Optional[torch.Tensor] = None,
+        image_embeds_position_mask: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        cross_attn_head_mask: Optional[torch.Tensor] = None,
+        past_key_values: Optional[list[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.Tensor] = None,
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> Union[tuple, BaseModelOutputWithPastAndCrossAttentions]:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+        elif input_ids is not None:
+            input_shape = input_ids.shape
+            input_ids = input_ids.view(-1, input_shape[-1])
+        elif inputs_embeds is not None:
+            input_shape = inputs_embeds.size()[:-1]
+        else:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+        if self.gradient_checkpointing and self.training:
+            if use_cache:
+                logger.warning_once(
+                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+                )
+                use_cache = False
+
+        if use_cache and past_key_values is None:
+            past_key_values = (
+                EncoderDecoderCache(DynamicCache(config=self.config), DynamicCache(config=self.config))
+                if encoder_hidden_states is not None
+                else DynamicCache(config=self.config)
+            )
+        if use_cache and isinstance(past_key_values, tuple):
+            logger.warning_once(
+                "Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. "
+                "You should pass an instance of `EncoderDecoderCache` instead, e.g. "
+                "`past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`."
+            )
+            past_key_values = EncoderDecoderCache.from_legacy_cache(past_key_values)
+
+        past_key_values_length = past_key_values.get_seq_length() if past_key_values is not None else 0
+
+        # We don't need img info. when `past_key_values_length` > 0
+        if past_key_values_length > 0:
+            image_embeds = None
+            image_embeds_position_mask = None
+
+        hidden_states = self.forward_embedding(
+            input_ids=input_ids,
+            inputs_embeds=inputs_embeds,
+            image_embeds=image_embeds,
+            img_input_mask=image_embeds_position_mask,
+            past_key_values_length=past_key_values_length,
+            position_ids=position_ids,
+        )
+
+        attention_mask = self._prepare_decoder_attention_mask(
+            attention_mask, input_shape, hidden_states, past_key_values_length
+        )
+
+        # expand encoder attention mask
+        if encoder_hidden_states is not None and encoder_attention_mask is not None:
+            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+            encoder_attention_mask = _expand_mask(encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1])
+
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+
+        # decoder layers
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+        all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None
+
+        # check if head_mask/cross_attn_head_mask has a correct number of layers specified if desired
+        for attn_mask, mask_name in zip([head_mask, cross_attn_head_mask], ["head_mask", "cross_attn_head_mask"]):
+            if attn_mask is not None:
+                if attn_mask.size()[0] != (len(self.layers)):
+                    raise ValueError(
+                        f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for"
+                        f" {head_mask.size()[0]}."
+                    )
+
+        for idx, decoder_layer in enumerate(self.layers):
+            # add LayerDrop (see https://huggingface.co/papers/1909.11556 for description)
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+            if self.training:
+                dropout_probability = torch.rand([])
+                if dropout_probability < self.layerdrop:
+                    continue
+
+            layer_outputs = decoder_layer(
+                hidden_states,
+                attention_mask,
+                encoder_hidden_states,
+                encoder_attention_mask=encoder_attention_mask,
+                layer_head_mask=(head_mask[idx] if head_mask is not None else None),
+                cross_attn_layer_head_mask=(cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None),
+                past_key_values=past_key_values,
+                output_attentions=output_attentions,
+                use_cache=use_cache,
+                cache_position=cache_position,
+                **kwargs,
+            )
+            hidden_states = layer_outputs[0]
+
+            if output_attentions:
+                all_self_attns += (layer_outputs[1],)
+
+                if encoder_hidden_states is not None:
+                    all_cross_attentions += (layer_outputs[2],)
+
+        # add final layer norm
+        hidden_states = self.layer_norm(hidden_states)
+
+        # add hidden states from the last decoder layer
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+
+        return BaseModelOutputWithPastAndCrossAttentions(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+            cross_attentions=all_cross_attentions,
+        )
+
+
+@auto_docstring
+class Kosmos2PreTrainedModel(PreTrainedModel):
+    config: Kosmos2Config
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["Kosmos2VisionEncoderLayer", "Kosmos2TextBlock"]
+    _supports_attention_backend = True
+    _supports_flash_attn = True
+    _supports_sdpa = True
+
+    def _init_weights(self, module: nn.Module):
+        """Initialize the weights of `module` in place; scale and std are read from the config of `self`."""
+        if isinstance(self, Kosmos2VisionModel):  # `factor` scales the stds of the vision modules below
+            factor = self.config.initializer_factor
+        elif isinstance(self, (Kosmos2Model, Kosmos2ForConditionalGeneration)):
+            factor = self.config.vision_config.initializer_factor
+        # NOTE(review): `factor` (above) and `std` (below) stay unbound for any `self` type not listed here.
+        if isinstance(self, (Kosmos2TextModel, Kosmos2TextForCausalLM)):
+            std = self.config.init_std
+        elif isinstance(self, (Kosmos2Model, Kosmos2ForConditionalGeneration)):
+            std = self.config.text_config.init_std
+
+        if isinstance(module, Kosmos2VisionEmbeddings):
+            nn.init.normal_(module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * factor)
+            nn.init.normal_(module.patch_embedding.weight, std=module.config.initializer_range * factor)
+            nn.init.normal_(module.position_embedding.weight, std=module.config.initializer_range * factor)
+        elif isinstance(module, Kosmos2VisionAttention):
+            in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
+            out_proj_std = (module.embed_dim**-0.5) * factor
+            nn.init.normal_(module.q_proj.weight, std=in_proj_std)
+            nn.init.normal_(module.k_proj.weight, std=in_proj_std)
+            nn.init.normal_(module.v_proj.weight, std=in_proj_std)
+            nn.init.normal_(module.out_proj.weight, std=out_proj_std)
+        elif isinstance(module, Kosmos2VisionMLP):
+            in_proj_std = (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
+            fc_std = (2 * module.config.hidden_size) ** -0.5 * factor
+            nn.init.normal_(module.fc1.weight, std=fc_std)
+            nn.init.normal_(module.fc2.weight, std=in_proj_std)
+        elif isinstance(module, KosmosTextAttention):  # text-side modules from here on use the single `std`
+            nn.init.normal_(module.q_proj.weight, std=std)
+            nn.init.normal_(module.k_proj.weight, std=std)
+            nn.init.normal_(module.v_proj.weight, std=std)
+            nn.init.normal_(module.out_proj.weight, std=std)
+        elif isinstance(module, Kosmos2TextFFN):
+            nn.init.normal_(module.fc1.weight, std=std)
+            nn.init.normal_(module.fc2.weight, std=std)
+        elif isinstance(module, Kosmos2TextForCausalLM):
+            nn.init.normal_(module.lm_head.weight, std=std)
+        elif isinstance(module, Kosmos2ImageToTextProjection):
+            nn.init.normal_(module.dense.weight, std=std)
+            nn.init.normal_(module.latent_query)  # default arguments of `nn.init.normal_`, not `std`
+        elif isinstance(module, Kosmos2TextTransformer):
+            module.embed_tokens.weight.data.normal_(mean=0.0, std=std)
+            if module.embed_tokens.padding_idx is not None:  # keep the padding row at zero
+                module.embed_tokens.weight.data[module.embed_tokens.padding_idx].zero_()
+        elif isinstance(module, nn.LayerNorm):  # weight 1, bias 0
+            module.weight.data.fill_(1.0)
+            module.bias.data.zero_()
+
+        if isinstance(module, nn.Linear) and module.bias is not None:  # separate `if`, not part of the chain above
+            module.bias.data.zero_()
+
+
+class Kosmos2VisionModel(Kosmos2PreTrainedModel):
+    config: Kosmos2VisionConfig
+    main_input_name = "pixel_values"
+
+    # Copied from transformers.models.clip.modeling_clip.CLIPVisionModel.__init__ with CLIP_VISION->KOSMOS2_VISION,CLIP->Kosmos2,self.vision_model->self.model
+    def __init__(self, config: Kosmos2VisionConfig):
+        super().__init__(config)
+        self.model = Kosmos2VisionTransformer(config)  # the wrapped transformer; `forward` below only delegates to it
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    # Copied from transformers.models.clip.modeling_clip.CLIPVisionModel.get_input_embeddings with CLIP_VISION->KOSMOS2_VISION,CLIP->Kosmos2,self.vision_model->self.model
+    def get_input_embeddings(self) -> nn.Module:
+        return self.model.embeddings.patch_embedding
+
+    @auto_docstring
+    def forward(
+        self,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        interpolate_pos_encoding: bool = False,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, BaseModelOutputWithPooling]:
+        return self.model(  # pure pass-through: every argument is forwarded unchanged
+            pixel_values=pixel_values,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            interpolate_pos_encoding=interpolate_pos_encoding,
+            return_dict=return_dict,
+        )
+
+
+class Kosmos2TextModel(Kosmos2PreTrainedModel):
+    config: Kosmos2TextConfig
+
+    def __init__(self, config: Kosmos2TextConfig):
+        super().__init__(config)
+        self.model = Kosmos2TextTransformer(config)  # the wrapped transformer; `forward` below only delegates to it
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self) -> nn.Module:
+        return self.model.embed_tokens
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        image_embeds: Optional[torch.Tensor] = None,
+        image_embeds_position_mask: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        cross_attn_head_mask: Optional[torch.Tensor] = None,
+        past_key_values: Optional[list[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.Tensor] = None,
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> Union[tuple, BaseModelOutputWithPastAndCrossAttentions]:
+        r"""
+        image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
+            Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.
+        image_embeds_position_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to indicate the location in a sequence to insert the image features. Mask values selected in `[0,
+            1]`:
+
+            - 1 for places where to put the image features,
+            - 0 for places that are not for image features (i.e. for text tokens).
+        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+        """
+        return self.model(  # pure pass-through: every argument is forwarded unchanged
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            image_embeds=image_embeds,
+            image_embeds_position_mask=image_embeds_position_mask,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_attention_mask,
+            head_mask=head_mask,
+            cross_attn_head_mask=cross_attn_head_mask,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            position_ids=position_ids,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            cache_position=cache_position,
+            **kwargs,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    The text model from KOSMOS-2 with a language modeling head on top (linear layer with weights tied to the input
+    embeddings).
+    """
+)
+class Kosmos2TextForCausalLM(Kosmos2PreTrainedModel, GenerationMixin):
+    config: Kosmos2TextConfig
+    _tied_weights_keys = ["lm_head.weight"]
+
+    def __init__(self, config: Kosmos2TextConfig):
+        super().__init__(config)
+
+        self.model = Kosmos2TextTransformer(config)
+        self.lm_head = nn.Linear(in_features=config.embed_dim, out_features=config.vocab_size, bias=False)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self) -> nn.Module:
+        return self.model.embed_tokens
+
+    def get_output_embeddings(self) -> nn.Module:
+        return self.lm_head
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        image_embeds: Optional[torch.Tensor] = None,
+        image_embeds_position_mask: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        cross_attn_head_mask: Optional[torch.Tensor] = None,
+        past_key_values: Optional[list[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.Tensor] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> Union[tuple, CausalLMOutputWithCrossAttentions]:
+        r"""
+        image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
+            Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.
+        image_embeds_position_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to indicate the location in a sequence to insert the image features. Mask values selected in `[0,
+            1]`:
+
+            - 1 for places where to put the image features,
+            - 0 for places that are not for image features (i.e. for text tokens).
+        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
+            `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
+            ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if labels is not None:  # computing a loss: caching is switched off (with a warning if it was requested)
+            if use_cache:
+                logger.warning("The `use_cache` argument is changed to `False` since `labels` is provided.")
+            use_cache = False
+
+        outputs = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            image_embeds=image_embeds,
+            image_embeds_position_mask=image_embeds_position_mask,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_attention_mask,
+            head_mask=head_mask,
+            cross_attn_head_mask=cross_attn_head_mask,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            position_ids=position_ids,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=True,
+            cache_position=cache_position,
+            **kwargs,
+        )
+        lm_logits = self.lm_head(outputs[0])  # `lm_head` maps `embed_dim` -> `vocab_size`
+
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(logits=lm_logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs)
+
+        return CausalLMOutputWithCrossAttentions(
+            loss=loss,
+            logits=lm_logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            cross_attentions=outputs.cross_attentions,
+        )
+
+    def prepare_inputs_for_generation(
+        self,
+        input_ids,
+        image_embeds=None,
+        image_embeds_position_mask=None,
+        past_key_values=None,
+        attention_mask=None,
+        inputs_embeds=None,
+        use_cache=None,
+        cache_position=None,
+        **model_kwargs,
+    ):
+        """Prepare per-step generation inputs: drop image inputs on cached steps, else pad the image position mask."""
+        # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
+
+        # Cached decoding stage (first cache position != 0): the image inputs are dropped for this step.
+        # A missing `cache_position` is treated as the first step instead of failing on `None[0]`.
+        if cache_position is not None and cache_position[0] != 0:
+            image_embeds = None
+            image_embeds_position_mask = None
+
+        # appending `False` to `image_embeds_position_mask` (because `input_ids` grows during generation)
+        elif image_embeds_position_mask is not None:
+            batch_size, seq_len = inputs_embeds.size()[:-1] if inputs_embeds is not None else input_ids.size()
+            mask_len = image_embeds_position_mask.size()[-1]
+            # Pad on the mask's own device (not `input_ids.device`): the padding is concatenated with the mask.
+            padding = torch.zeros(
+                size=(batch_size, seq_len - mask_len), dtype=torch.bool, device=image_embeds_position_mask.device
+            )
+            image_embeds_position_mask = torch.cat((image_embeds_position_mask, padding), dim=1)
+
+        model_inputs = super().prepare_inputs_for_generation(
+            input_ids,
+            past_key_values=past_key_values,
+            attention_mask=attention_mask,
+            image_embeds=image_embeds,
+            image_embeds_position_mask=image_embeds_position_mask,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            **model_kwargs,
+        )
+        # Kosmos2 has offset for position ids, so we need to create them correctly in PositionEmbedding layer
+        model_inputs.pop("position_ids", None)
+
+        return model_inputs
+
+
+class Kosmos2ImageToTextProjection(nn.Module):
+    """The layer that transforms the image model's output to part of the text model's input (namely, image features)"""
+
+    def __init__(self, config: Kosmos2Config):
+        super().__init__()
+        self.dense = nn.Linear(config.vision_config.hidden_size, config.text_config.embed_dim)
+        self.latent_query = nn.Parameter(torch.randn(config.latent_query_num, config.text_config.embed_dim))
+        # Attention in which the latent queries above attend over the projected image features (see `forward`).
+        self.x_attn = KosmosTextAttention(
+            config.text_config,
+            config.text_config.embed_dim,
+            config.text_config.attention_heads,
+            dropout=config.text_config.attention_dropout,
+            is_decoder=False,
+            add_inner_attn_layernorm=False,
+        )
+
+    def forward(self, features):  # returns the `(hidden_states, attn_weights)` pair produced by `x_attn`
+        hidden_states = self.dense(features)
+
+        # shape = [batch, latent_query_num, h_dim]
+        latent_query = self.latent_query.unsqueeze(0).expand(hidden_states.size(0), -1, -1)
+        key_value_states = torch.cat([hidden_states, latent_query], dim=1)
+        # Queries are the latent queries; keys/values are the projected features followed by those same queries.
+        hidden_states, attn_weights = self.x_attn(
+            hidden_states=latent_query,
+            encoder_hidden_states=key_value_states,
+            past_key_values=None,
+            attention_mask=None,
+            output_attentions=None,
+        )
+
+        return hidden_states, attn_weights
+
+
+@auto_docstring(
+    custom_intro="""
+    KOSMOS-2 Model for generating text and image features. The model consists of a vision encoder and a language model.
+    """
+)
+class Kosmos2Model(Kosmos2PreTrainedModel):
+    config: Kosmos2Config
+    main_input_name = "pixel_values"
+
+    def __init__(self, config: Kosmos2Config):
+        super().__init__(config)
+
+        self.text_model = Kosmos2TextModel(config.text_config)
+        self.vision_model = Kosmos2VisionModel(config.vision_config)
+        self.image_to_text_projection = Kosmos2ImageToTextProjection(config)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self) -> nn.Module:
+        return self.text_model.model.embed_tokens
+
+    def set_input_embeddings(self, value):
+        self.text_model.model.embed_tokens = value
+
+    def get_image_features(
+        self,
+        pixel_values: torch.FloatTensor,
+        return_attentions: Optional[bool] = False,
+        interpolate_pos_encoding: Optional[bool] = False,
+    ):
+        """
+        Encodes images into continuous embeddings that can be forwarded to the language model.
+
+        Args:
+            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
+                The tensors corresponding to the input images.
+            return_attentions (`bool`, *optional*, defaults to `False`):
+                Whether to also return `projection_attentions`; if set, returns `(image_embeds, projection_attentions)`.
+            interpolate_pos_encoding (`bool`, *optional*, defaults to `False`):
+                Whether to interpolate positional embeddings or not.
+        """
+        vision_model_output = self.vision_model(
+            pixel_values=pixel_values,
+            interpolate_pos_encoding=interpolate_pos_encoding,
+        )
+        # The whole `last_hidden_state` through `post_layernorm` instead of just `pooled_output`.
+        image_embeds = self.vision_model.model.post_layernorm(vision_model_output[0])
+        # normalized features
+        image_embeds = nn.functional.normalize(image_embeds, dim=-1)
+        image_embeds, projection_attentions = self.image_to_text_projection(image_embeds)
+
+        if return_attentions:
+            return image_embeds, projection_attentions
+        return image_embeds
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        pixel_values: Optional[torch.Tensor] = None,
+        input_ids: Optional[torch.Tensor] = None,
+        image_embeds_position_mask: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        past_key_values: Optional[list[torch.FloatTensor]] = None,
+        image_embeds: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        interpolate_pos_encoding: bool = False,
+        return_dict: Optional[bool] = None,
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> Union[tuple, Kosmos2ModelOutput]:
+        r"""
+        image_embeds_position_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to indicate the location in a sequence to insert the image features. Mask values selected in `[0,
+            1]`:
+
+            - 1 for places where to put the image features,
+            - 0 for places that are not for image features (i.e. for text tokens).
+        image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
+            Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.
+
+        Examples:
+
+        ```python
+        >>> from PIL import Image
+        >>> import requests
+        >>> from transformers import AutoProcessor, Kosmos2Model
+
+        >>> model = Kosmos2Model.from_pretrained("microsoft/kosmos-2-patch14-224")
+        >>> processor = AutoProcessor.from_pretrained("microsoft/kosmos-2-patch14-224")
+
+        >>> url = "https://huggingface.co/microsoft/kosmos-2-patch14-224/resolve/main/snowman.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> text = (
+        ...     "<grounding> An image of<phrase> a snowman</phrase><object><patch_index_0044><patch_index_0863>"
+        ...     "</object> warming himself by<phrase> a fire</phrase><object><patch_index_0005><patch_index_0911>"
+        ...     "</object>"
+        ... )
+
+        >>> inputs = processor(text=text, images=image, return_tensors="pt", add_eos_token=True)
+
+        >>> last_hidden_state = model(
+        ...     pixel_values=inputs["pixel_values"],
+        ...     input_ids=inputs["input_ids"],
+        ...     attention_mask=inputs["attention_mask"],
+        ...     image_embeds_position_mask=inputs["image_embeds_position_mask"],
+        ... ).last_hidden_state
+        >>> list(last_hidden_state.shape)
+        [1, 91, 2048]
+        ```"""
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        # NOTE(review): `vision_model_output` is never reassigned below, so the returned field is always `None`.
+        vision_model_output = None
+        projection_attentions = None
+        if image_embeds is None:  # only run the vision encoder when no precomputed embeddings were passed
+            if pixel_values is None:
+                raise ValueError("You have to specify either `pixel_values` or `image_embeds`.")
+            image_embeds, projection_attentions = self.get_image_features(
+                pixel_values, return_attentions=True, interpolate_pos_encoding=interpolate_pos_encoding
+            )
+
+        outputs = self.text_model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            image_embeds=image_embeds,
+            image_embeds_position_mask=image_embeds_position_mask,
+            head_mask=head_mask,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            position_ids=position_ids,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=True,
+            **kwargs,
+        )
+
+        return Kosmos2ModelOutput(
+            last_hidden_state=outputs.last_hidden_state,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            image_embeds=image_embeds,
+            projection_attentions=projection_attentions,
+            vision_model_output=vision_model_output,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    KOSMOS-2 Model for generating text and bounding boxes given an image. The model consists of a vision encoder and a
+    language model.
+    """
+)
+class Kosmos2ForConditionalGeneration(Kosmos2PreTrainedModel, GenerationMixin):
+    config: Kosmos2Config
+    main_input_name = "pixel_values"
+    _tied_weights_keys = ["text_model.lm_head.weight"]
+
+    def __init__(self, config: Kosmos2Config):
+        super().__init__(config)
+
+        self.text_model = Kosmos2TextForCausalLM(config.text_config)
+        self.vision_model = Kosmos2VisionModel(config.vision_config)
+
+        self.image_to_text_projection = Kosmos2ImageToTextProjection(config)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self) -> nn.Module:
+        return self.text_model.model.embed_tokens
+
+    def set_input_embeddings(self, value):
+        self.text_model.model.embed_tokens = value
+
+    def get_output_embeddings(self) -> nn.Module:
+        return self.text_model.get_output_embeddings()
+
+    def set_output_embeddings(self, new_embeddings):
+        self.text_model.set_output_embeddings(new_embeddings)
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        pixel_values: Optional[torch.Tensor] = None,
+        input_ids: Optional[torch.Tensor] = None,
+        image_embeds_position_mask: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        past_key_values: Optional[list[torch.FloatTensor]] = None,
+        image_embeds: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> Union[tuple, Kosmos2ForConditionalGenerationModelOutput]:
+        r"""
+        image_embeds_position_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to indicate the location in a sequence to insert the image features. Mask values selected in `[0,
+            1]`:
+
+            - 1 for places where to put the image features,
+            - 0 for places that are not for image features (i.e. for text tokens).
+        image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
+            Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
+            `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
+            ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+        Examples:
+
+        ```python
+        >>> from PIL import Image
+        >>> import requests
+        >>> from transformers import AutoProcessor, Kosmos2ForConditionalGeneration
+
+        >>> model = Kosmos2ForConditionalGeneration.from_pretrained("microsoft/kosmos-2-patch14-224")
+        >>> processor = AutoProcessor.from_pretrained("microsoft/kosmos-2-patch14-224")
+
+        >>> url = "https://huggingface.co/microsoft/kosmos-2-patch14-224/resolve/main/snowman.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> prompt = "<grounding> An image of"
+
+        >>> inputs = processor(text=prompt, images=image, return_tensors="pt")
+
+        >>> generated_ids = model.generate(
+        ...     pixel_values=inputs["pixel_values"],
+        ...     input_ids=inputs["input_ids"],
+        ...     attention_mask=inputs["attention_mask"],
+        ...     image_embeds=None,
+        ...     image_embeds_position_mask=inputs["image_embeds_position_mask"],
+        ...     use_cache=True,
+        ...     max_new_tokens=64,
+        ... )
+        >>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+        >>> processed_text = processor.post_process_generation(generated_text, cleanup_and_extract=False)
+        >>> processed_text
+        '<grounding> An image of<phrase> a snowman</phrase><object><patch_index_0044><patch_index_0863></object> warming himself by<phrase> a fire</phrase><object><patch_index_0005><patch_index_0911></object>.'
+
+        >>> caption, entities = processor.post_process_generation(generated_text)
+        >>> caption
+        'An image of a snowman warming himself by a fire.'
+
+        >>> entities
+        [('a snowman', (12, 21), [(0.390625, 0.046875, 0.984375, 0.828125)]), ('a fire', (41, 47), [(0.171875, 0.015625, 0.484375, 0.890625)])]
+        ```"""
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+
+        vision_model_output = None
+        projection_attentions = None
+        if image_embeds is None:  # only run the vision encoder when no precomputed embeddings were passed
+            if pixel_values is None:
+                raise ValueError("You have to specify either `pixel_values` or `image_embeds`.")
+
+            vision_model_output = self.vision_model(
+                pixel_values=pixel_values,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+            )
+            # The whole `last_hidden_state` through `post_layernorm` instead of just `pooled_output`.
+            image_embeds = self.vision_model.model.post_layernorm(vision_model_output[0])
+            # normalized features
+            image_embeds = nn.functional.normalize(image_embeds, dim=-1)
+            image_embeds, projection_attentions = self.image_to_text_projection(image_embeds)
+
+        lm_outputs = self.text_model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            image_embeds=image_embeds,
+            image_embeds_position_mask=image_embeds_position_mask,
+            head_mask=head_mask,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            position_ids=position_ids,
+            labels=labels,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=True,
+            **kwargs,
+        )
+
+        return Kosmos2ForConditionalGenerationModelOutput(
+            loss=lm_outputs.loss,
+            logits=lm_outputs.logits,
+            past_key_values=lm_outputs.past_key_values,
+            hidden_states=lm_outputs.hidden_states,
+            attentions=lm_outputs.attentions,
+            image_embeds=image_embeds,
+            projection_attentions=projection_attentions,
+            vision_model_output=vision_model_output,
+        )
+
+    @torch.no_grad()
+    def generate(
+        self,
+        pixel_values: Optional[torch.Tensor] = None,
+        image_embeds_position_mask: Optional[torch.Tensor] = None,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        image_embeds: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        **kwargs,
+    ):
+        """Encode `pixel_values` into `image_embeds` when those are not supplied, then generate with the text model."""
+        # in order to allow `inputs` argument (as in `GenerationMixin`)
+        inputs = kwargs.pop("inputs", None)
+        if pixel_values is not None and inputs is not None:
+            raise ValueError(
+                f"`inputs`: {inputs} were passed alongside `pixel_values` which is not allowed. "
+                "Make sure to either pass `inputs` or pixel_values=..."
+            )
+        if pixel_values is None and inputs is not None:
+            pixel_values = inputs
+
+        if image_embeds is None:
+            if pixel_values is None:  # same check and message as `forward`, instead of encoding `None`
+                raise ValueError("You have to specify either `pixel_values` or `image_embeds`.")
+            # The whole `last_hidden_state` through `post_layernorm` instead of just `pooled_output`.
+            image_embeds = self.vision_model.model.post_layernorm(self.vision_model(pixel_values)[0])
+            # normalized features
+            image_embeds = nn.functional.normalize(image_embeds, dim=-1)
+            image_embeds, _ = self.image_to_text_projection(image_embeds)  # attention weights are not used here
+
+        return self.text_model.generate(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            image_embeds=image_embeds,
+            image_embeds_position_mask=image_embeds_position_mask,
+            inputs_embeds=inputs_embeds,
+            **kwargs,
+        )
+
+
+__all__ = ["Kosmos2ForConditionalGeneration", "Kosmos2Model", "Kosmos2PreTrainedModel"]  # names exported by this module
diff --git a/phivenv/Lib/site-packages/transformers/models/kosmos2/processing_kosmos2.py b/phivenv/Lib/site-packages/transformers/models/kosmos2/processing_kosmos2.py
new file mode 100644
index 0000000000000000000000000000000000000000..cb6f63e2e6181242f624d69f5de7a4e55d321d50
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/kosmos2/processing_kosmos2.py
@@ -0,0 +1,696 @@
+# coding=utf-8
+# Copyright 2023 Microsoft Research and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Processor class for KOSMOS-2."""
+
+import copy
+import math
+import re
+from typing import Optional, Union
+
+from ...image_processing_utils import BatchFeature
+from ...image_utils import ImageInput, is_batched
+from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin, TextKwargs, Unpack
+from ...tokenization_utils import AddedToken
+from ...tokenization_utils_base import BatchEncoding, TextInput
+
+
+BboxInput = Union[
+    list[tuple[int, int]],  # 2-int boxes: patch indices
+    list[tuple[float, float, float, float]],  # 4-float boxes: (normalized) coordinates (x1, y1, x2, y2)
+    list[list[tuple[int, int]]],
+    list[list[tuple[float, float, float, float]]],  # coordinate boxes always carry 4 floats
+]
+
+
+class Kosmos2ImagesKwargs(ImagesKwargs, total=False):
+    bboxes: Optional[list[float]]  # NOTE(review): `__call__` hands this to `preprocess_examples` (typed `BboxInput`)
+    num_image_tokens: Optional[int]  # popped by `Kosmos2Processor.__call__` (fallback 64)
+    first_image_token_id: Optional[int]  # popped by `Kosmos2Processor.__call__`; `None` -> `unk_token_id + 1`
+
+
+class Kosmos2TextKwargs(TextKwargs, total=False):
+    add_eos_token: Optional[bool]  # popped by `Kosmos2Processor.__call__` before the tokenizer is called
+
+
+class Kosmos2ProcessorKwargs(ProcessingKwargs, total=False):
+    text_kwargs: Kosmos2TextKwargs
+    images_kwargs: Kosmos2ImagesKwargs
+    _defaults = {  # NOTE(review): presumably the fallbacks `_merge_kwargs` applies in `__call__` - confirm
+        "text_kwargs": {
+            "add_special_tokens": True,
+            "padding": False,
+            "stride": 0,
+            "return_overflowing_tokens": False,
+            "return_special_tokens_mask": False,
+            "return_offsets_mapping": False,
+            "return_token_type_ids": False,
+            "verbose": True,
+            "add_eos_token": False,  # same fallback `__call__` uses when popping this key
+        },
+        "images_kwargs": {
+            "num_image_tokens": 64,  # same value as the `preprocess_examples` default
+        },
+    }
+
+
+class Kosmos2Processor(ProcessorMixin):
+    r"""
+    Constructs a KOSMOS-2 processor which wraps a KOSMOS-2 image processor and a KOSMOS-2 tokenizer into a single
+    processor.
+
+    [`Kosmos2Processor`] offers all the functionalities of [`CLIPImageProcessor`] and some functionalities of
+    [`XLMRobertaTokenizerFast`]. See the docstring of [`~Kosmos2Processor.__call__`] and [`~Kosmos2Processor.decode`]
+    for more information.
+
+    Args:
+        image_processor (`CLIPImageProcessor`):
+            An instance of [`CLIPImageProcessor`]. The image processor is a required input.
+        tokenizer (`XLMRobertaTokenizerFast`):
+            An instance of [`XLMRobertaTokenizerFast`]. The tokenizer is a required input.
+        num_patch_index_tokens (`int`, *optional*, defaults to 1024):
+            The number of tokens that represent patch indices.
+    """
+
+    attributes = ["image_processor", "tokenizer"]
+    image_processor_class = ("CLIPImageProcessor", "CLIPImageProcessorFast")
+    tokenizer_class = "AutoTokenizer"
+
+    def __init__(self, image_processor, tokenizer, num_patch_index_tokens=1024, *args, **kwargs):
+        """Register the KOSMOS-2 tag and patch index tokens on `tokenizer`, then initialize the processor.
+
+        Extra positional and keyword arguments are accepted and ignored, so that unexpected keyword options
+        do not make construction fail with a `TypeError`.
+        """
+        tokenizer.return_token_type_ids = False
+
+        # boundary tags for documents, images, chunks, lines, phrases and objects
+        self.eod_token = "</doc>"
+        self.boi_token = "<image>"
+        self.eoi_token = "</image>"
+        self.eoc_token = "</chunk>"
+        self.eol_token = "</line>"
+        self.bop_token = "<phrase>"
+        self.eop_token = "</phrase>"
+        self.boo_token = "<object>"
+        self.eoo_token = "</object>"
+        # joins several boxes inside one `<object> ... </object>` span
+        self.dom_token = "</delimiter_of_multi_objects/>"
+        self.grd_token = "<grounding>"
+
+        self.tag_tokens = [
+            self.eod_token,
+            self.boi_token,
+            self.eoi_token,
+            self.eoc_token,
+            self.eol_token,
+            self.bop_token,
+            self.eop_token,
+            self.boo_token,
+            self.eoo_token,
+            self.dom_token,
+            self.grd_token,
+        ]
+
+        self.num_patch_index_tokens = num_patch_index_tokens
+        # one `<patch_index_XXXX>` token per patch index, zero-padded to 4 digits
+        patch_index_tokens = [f"<patch_index_{str(x).zfill(4)}>" for x in range(self.num_patch_index_tokens)]
+
+        all_tokens = self.tag_tokens + patch_index_tokens
+        tokenizer.add_tokens([AddedToken(t, lstrip=True, rstrip=False, normalized=False) for t in all_tokens])
+
+        super().__init__(image_processor, tokenizer)
+
+    def __call__(
+        self,
+        images: ImageInput = None,
+        text: Union[TextInput, list[TextInput]] = None,
+        audio=None,  # not used by this method
+        videos=None,  # not used by this method
+        **kwargs: Unpack[Kosmos2ProcessorKwargs],
+    ) -> BatchFeature:
+        """
+        This method uses [`CLIPImageProcessor.__call__`] method to prepare image(s) for the model, and
+        [`XLMRobertaTokenizerFast.__call__`] to prepare text for the model.
+
+        Please refer to the docstring of the above two methods for more information.
+
+        The rest of this documentation shows the arguments specific to `Kosmos2Processor`.
+
+        Args:
+            bboxes (`Union[list[tuple[int]], list[tuple[float]], list[list[tuple[int]]], list[list[tuple[float]]]]`, *optional*):
+                The bounding boxes associated to `text`.
+            num_image_tokens (`int`, *optional*, defaults to 64):
+                The number of (consecutive) places that are used to mark the placeholders to store image information.
+                This should be the same as `latent_query_num` in the instance of `Kosmos2Config` you are using.
+            first_image_token_id (`int`, *optional*):
+                The token id that will be used for the first place of the subsequence that is reserved to store image
+                information. If unset, will default to `self.tokenizer.unk_token_id + 1`.
+            add_eos_token (`bool`, *optional*, defaults to `False`):
+                Whether or not to include `EOS` token id in the encoding when `add_special_tokens=True`.
+        """
+        if images is None and text is None:
+            raise ValueError("You have to specify either images or text.")
+
+        output_kwargs = self._merge_kwargs(
+            Kosmos2ProcessorKwargs,
+            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
+            **kwargs,
+        )
+
+        bboxes = output_kwargs["images_kwargs"].pop("bboxes", None)
+        num_image_tokens = output_kwargs["images_kwargs"].pop("num_image_tokens", 64)
+        first_image_token_id = output_kwargs["images_kwargs"].pop("first_image_token_id", None)
+        add_eos_token = output_kwargs["text_kwargs"].pop("add_eos_token", False)
+
+        add_special_tokens = output_kwargs["text_kwargs"]["add_special_tokens"]
+        padding = output_kwargs["text_kwargs"]["padding"]
+        return_tensors = output_kwargs["text_kwargs"].setdefault("return_tensors", None)
+
+        encoding = BatchFeature()
+
+        if images is not None:
+            image_encoding = self.image_processor(images, **output_kwargs["images_kwargs"])
+            encoding.update(image_encoding)
+
+        if text is not None:
+            text = self.preprocess_examples(text, images, bboxes, num_image_tokens=num_image_tokens)
+
+            if add_special_tokens and not add_eos_token:
+                if isinstance(text, str):
+                    text = f"{self.tokenizer.bos_token}{text}"
+                elif isinstance(text, list):
+                    text = [f"{self.tokenizer.bos_token}{s}" for s in text]
+            output_kwargs["text_kwargs"]["add_special_tokens"] = (
+                output_kwargs["text_kwargs"]["add_special_tokens"] and add_eos_token
+            )
+            output_kwargs["text_kwargs"]["padding"] = padding if images is None else False  # padded by hand below
+            output_kwargs["text_kwargs"]["return_tensors"] = return_tensors if images is None else None  # converted last
+            text_encoding = self.tokenizer(text=text, **output_kwargs["text_kwargs"])
+            encoding.update(text_encoding)
+        # put the caller's settings back: the tokenizer is called once more below for batched text with images
+        output_kwargs["text_kwargs"]["add_special_tokens"] = add_special_tokens
+        output_kwargs["text_kwargs"]["padding"] = padding
+        output_kwargs["text_kwargs"]["return_tensors"] = return_tensors
+
+        if text is not None and images is not None:
+            # Use the id of the first token after <unk>
+            if first_image_token_id is None:
+                first_image_token_id = self.tokenizer.unk_token_id + 1
+
+            # To see if we need one more `0` (for `<s>`) at the beginning of `image_embeds_position_mask`.
+            with_bos = add_special_tokens
+
+            # The first (actual) `<image>` token is always at the 1st or 2nd place (after `<s>` if any). Here we look
+            # for the second `<image>` token (which indicates the first image token).
+            start_index = int(with_bos) + 1
+
+            # Add `image_embeds_position_mask`: the leading and trailing `0` are for `boi` and `eoi` tokens. The `1` indicates
+            # the places of image tokens.
+            image_token_ids = list(range(first_image_token_id, first_image_token_id + num_image_tokens))
+            base_image_embeds_position_mask = [0] + [1] * num_image_tokens + [0]
+
+            # loop over `encoding["input_ids"]`
+            input_ids = []
+            image_embeds_position_mask = []
+            all_input_ids = encoding["input_ids"]
+            # not batched -> (changed to) batch of size 1
+            if isinstance(text, str):
+                all_input_ids = [all_input_ids]
+                encoding["attention_mask"] = [encoding["attention_mask"]]
+            for text_ids in all_input_ids:
+                # change the ids for the fake `<image>` tokens in `input_ids`
+                text_ids = text_ids[:start_index] + image_token_ids + text_ids[start_index + num_image_tokens :]
+                input_ids.append(text_ids)
+
+                mask = copy.copy(base_image_embeds_position_mask)
+                if with_bos:
+                    # for `<s>`
+                    mask = [0] + mask
+                # trailing part (which are not related to the image)
+                mask += [0] * (len(text_ids) - len(mask))
+                image_embeds_position_mask.append(mask)
+
+            if isinstance(text, list):  # batched: pad each example by hand to the padded length of the longest text
+                sorted_length = sorted(
+                    [(idx, len(x)) for idx, x in enumerate(text_encoding.input_ids)], key=lambda x: x[-1]
+                )
+                _, min_len_not_padded = sorted_length[0]
+                idx, _ = sorted_length[-1]
+                output_kwargs["text_kwargs"]["add_special_tokens"] = (
+                    output_kwargs["text_kwargs"]["add_special_tokens"] and add_eos_token
+                )
+                output_kwargs["text_kwargs"]["return_tensors"] = None
+
+                text_encoding = self.tokenizer(text=[text[idx]], **output_kwargs["text_kwargs"])
+                max_len_padded = len(text_encoding.input_ids[0])
+
+                if min_len_not_padded != max_len_padded:
+                    if self.tokenizer.padding_side == "right":
+                        input_ids = [x + [self.tokenizer.pad_token_id] * (max_len_padded - len(x)) for x in input_ids]
+                        image_embeds_position_mask = [
+                            x + [0] * (max_len_padded - len(x)) for x in image_embeds_position_mask
+                        ]
+                        encoding["attention_mask"] = [
+                            x + [0] * (max_len_padded - len(x)) for x in encoding["attention_mask"]
+                        ]
+                    elif self.tokenizer.padding_side == "left":
+                        input_ids = [[self.tokenizer.pad_token_id] * (max_len_padded - len(x)) + x for x in input_ids]
+                        image_embeds_position_mask = [
+                            [0] * (max_len_padded - len(x)) + x for x in image_embeds_position_mask
+                        ]
+                        encoding["attention_mask"] = [
+                            [0] * (max_len_padded - len(x)) + x for x in encoding["attention_mask"]
+                        ]
+
+            # un-batch if necessary
+            if isinstance(text, str) and return_tensors is None:
+                input_ids = input_ids[0]
+                encoding["attention_mask"] = encoding["attention_mask"][0]
+                image_embeds_position_mask = image_embeds_position_mask[0]
+
+            # update (with the target tensor type if specified)
+            encoding.update(
+                BatchEncoding(
+                    data={
+                        "input_ids": input_ids,
+                        "attention_mask": encoding["attention_mask"],
+                        "image_embeds_position_mask": image_embeds_position_mask,
+                    },
+                    tensor_type=return_tensors,
+                )
+            )
+
+        return encoding
+
+    def _check_bboxes_for_single_text(self, bboxes):
+        """
+        Check `bboxes` for a single text example and raise `ValueError` if it is malformed. It could be
+            - `None`: no bounding box associated to a text.
+            - A list with each element being the bounding boxes associated to one `<phrase> ... </phrase>` pair found
+              in a text. This could be:
+                  - `None`: no bounding box associated to a `<phrase> ... </phrase>` pair.
+                  - A tuple of 2 integers: A single bounding box specified by patch indices.
+                  - A tuple of 4 floating point numbers: A single bounding box specified by (normalized) coordinates.
+                  - A list containing the above 2 tuple types: Multiple bounding boxes for a
+                    `<phrase> ... </phrase>` pair.
+        """
+        if bboxes is None:
+            return
+        if not isinstance(bboxes, list):
+            raise ValueError("`bboxes` (for a single text example) should be `None` or a list.")
+
+        # `bbox` is the bounding boxes for a single <phrase> </phrase> pair
+        for bbox in bboxes:
+            if bbox is None:
+                continue
+            # wrap a lone value so that single boxes and lists of boxes are validated the same way
+            elements = bbox if isinstance(bbox, list) else [bbox]
+            for element in elements:
+                if not isinstance(element, tuple) or not (
+                    (len(element) == 2 and all(isinstance(x, int) for x in element))
+                    or (len(element) == 4 and all(isinstance(x, float) for x in element))
+                ):
+                    raise ValueError(
+                        "Each element in `bboxes` (for a single text example) should be either `None`, a tuple containing "
+                        "2 integers or 4 float point numbers, or a list containing such tuples. Also "
+                        "make sure the arguments `texts` and `bboxes` passed to `preprocess_examples` are both in "
+                        "batches or both for a single example."
+                    )
+
+    def _preprocess_single_example(self, text, image, bboxes, img_info_tokens):
+        """Strip `text`, prepend the image placeholder tokens if `image` is given, then insert patch index tokens."""
+        stripped = text.strip()
+
+        # `<image> ... (fake) image tokens ... </image>` is put in front of the text, separated by one space
+        with_image = stripped if image is None else f"{img_info_tokens} {stripped}"
+
+        # `<object> <patch_index_xxxx> <patch_index_yyyy> </object>` goes after each `<phrase> phrase text </phrase>`
+        return self._insert_patch_index_tokens(with_image, bboxes)
+
+    def preprocess_examples(
+        self,
+        texts: Union[TextInput, list[TextInput]],
+        images: ImageInput = None,
+        bboxes: BboxInput = None,
+        num_image_tokens: Optional[int] = 64,
+    ) -> Union[str, list[str]]:
+        """Add image and bounding box information to `texts` as image and patch index tokens.
+
+        Args:
+            texts (`Union[TextInput, list[TextInput]]`): The texts to be processed.
+            images (`ImageInput`, *optional*): The images associated to `texts`.
+            bboxes (`Union[list[tuple[int]], list[tuple[float]], list[list[tuple[int]]], list[list[tuple[float]]]]`, *optional*):
+                The bounding boxes associated to `texts`.
+            num_image_tokens (`int`, *optional*, defaults to 64):
+                The number of image tokens (used as latent queries). This should correspond to the `latent_query_num`
+                attribute in `Kosmos2Config`.
+
+        Returns:
+            `Union[TextInput, list[TextInput]]`: The processed texts with image and patch index tokens.
+        """
+        # (the actual) `<image>`, then `num_image_tokens` fake `<image>` placeholders, then `</image>`
+        placeholder_tokens = [self.boi_token] + [self.boi_token] * num_image_tokens + [self.eoi_token]
+        img_info_tokens = " ".join(placeholder_tokens)
+
+        # a single string is wrapped into a batch of one and unwrapped again before returning
+        batched = not isinstance(texts, str)
+        if not batched:
+            texts = [texts]
+
+        if images is None:
+            images = [None] * len(texts)
+        elif not is_batched(images):
+            images = [images]
+
+        if len(texts) != len(images):
+            raise ValueError(
+                f"The number of examples in `texts` and `images` should be the same. Got {len(texts)} v.s. {len(images)} instead."
+            )
+
+        if batched:
+            if bboxes is None:
+                bboxes = [None] * len(texts)
+            elif not isinstance(bboxes, list):
+                raise ValueError("`bboxes` should be `None` or a list (as a batch) when `texts` is passed as a batch.")
+            else:
+                for per_text_bboxes in bboxes:
+                    self._check_bboxes_for_single_text(per_text_bboxes)
+        else:
+            self._check_bboxes_for_single_text(bboxes)
+            bboxes = [bboxes]
+
+        if len(bboxes) != len(texts):
+            raise ValueError(
+                f"The number of examples in `texts` and `bboxes` should be the same. Got {len(texts)} v.s. {len(bboxes)} instead."
+            )
+
+        processed = []
+        for text, image, bbox in zip(texts, images, bboxes):
+            processed.append(self._preprocess_single_example(text, image, bbox, img_info_tokens))
+
+        # un-batch if necessary
+        if not batched:
+            return processed[0]
+        return processed
+
+    def post_process_generation(self, text, cleanup_and_extract=True):
+        """Return the text after the last `</image>`; by default also clean it and extract its entities."""
+        # `str.split` always yields at least one piece, so the last piece is the whole text when no tag is present
+        *_, caption = text.split(self.eoi_token)
+        return clean_text_and_extract_entities_with_bboxes(caption) if cleanup_and_extract else caption
+
+    def post_process_image_text_to_text(self, generated_outputs, skip_special_tokens=True, **kwargs):
+        """
+        Decode generated token ids and keep, for each sequence, only the text after the last `</image>` token.
+
+        Args:
+            generated_outputs (`torch.Tensor` or `np.ndarray`):
+                The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)`
+                or `(sequence_length,)`.
+            skip_special_tokens (`bool`, *optional*, defaults to `True`):
+                Whether or not to remove special tokens in the output. Argument passed to the tokenizer's `batch_decode` method.
+            **kwargs:
+                Additional arguments to be passed to the tokenizer's `batch_decode` method.
+
+        Returns:
+            `list[str]`: The decoded text.
+        """
+        decoded = self.batch_decode(generated_outputs, skip_special_tokens=skip_special_tokens, **kwargs)
+        return [self.post_process_generation(sequence, cleanup_and_extract=False) for sequence in decoded]
+
+    @property
+    def model_input_names(self):
+        """Tokenizer input names, then image processor input names, then `image_embeds_position_mask`."""
+        names = self.tokenizer.model_input_names + self.image_processor.model_input_names
+        return names + ["image_embeds_position_mask"]
+
+    def _insert_patch_index_tokens(self, text: str, bboxes: Union[list[tuple[int]], list[tuple[float]]]) -> str:
+        """Append an `<object> ... </object>` span of patch index tokens after each `<phrase> ... </phrase>` pair.
+
+        `bboxes` holds one entry per pair found in `text`: `None`, a single box (a tuple) or a list of boxes. `text` is
+        returned unchanged when `bboxes` is `None` or empty.
+        """
+        if bboxes is None or len(bboxes) == 0:
+            return text
+
+        phrase_matches = list(re.finditer(r"<phrase>.+?</phrase>", string=text))
+        if len(phrase_matches) != len(bboxes):
+            raise ValueError(
+                f"The number of elements in `bboxes` should be the same as the number of `<phrase> ... </phrase>` pairs in `text`. Got {len(phrase_matches)} v.s. {len(bboxes)} instead."
+            )
+
+        # walk the text phrase by phrase, copying it through and adding the object span right after each phrase
+        pieces = []
+        cursor = 0
+        for phrase_match, boxes in zip(phrase_matches, bboxes):
+            phrase_end = phrase_match.end()
+            pieces.append(text[cursor:phrase_end])
+            cursor = phrase_end
+            # a phrase without bbox
+            if boxes is None:
+                continue
+            # a phrase with a single bbox
+            if isinstance(boxes, tuple):
+                boxes = [boxes]
+            # a phrase with multiple bboxes: none of them may be missing
+            if any(box is None for box in boxes):
+                raise ValueError(
+                    "The multiple bounding boxes for a single phrase should not contain any `None` value."
+                )
+            token_pairs = [
+                f"{first} {second}" for first, second in map(self._convert_bbox_to_patch_index_tokens, boxes)
+            ]
+            # `boxes` being an empty list
+            if not token_pairs:
+                continue
+            position_str = " </delimiter_of_multi_objects/> ".join(token_pairs)
+            pieces.append(f"<object> {position_str} </object>")
+
+        # whatever follows the last phrase (possibly nothing)
+        return "".join(pieces) + text[cursor:]
+
+    def _convert_bbox_to_patch_index_tokens(
+        self, bbox: Union[tuple[int, int], tuple[float, float, float, float]]
+    ) -> tuple[str, str]:
+        """Turn one box into its pair of `<patch_index_XXXX>` token strings (indices zero-padded to 4 digits)."""
+        if len(bbox) == 2:
+            # already computed patch indices
+            patch_indices = bbox
+        else:
+            # bbox specified with (normalized) coordinates; the grid side is derived from
+            # `self.num_patch_index_tokens`
+            num_patches_per_side = int(math.sqrt(self.num_patch_index_tokens))
+            patch_indices = coordinate_to_patch_index(bbox, num_patches_per_side)
+
+        first_idx, second_idx = patch_indices
+
+        return f"<patch_index_{str(first_idx).zfill(4)}>", f"<patch_index_{str(second_idx).zfill(4)}>"
+
+
+def coordinate_to_patch_index(bbox: tuple[float, float, float, float], num_patches_per_side: int) -> tuple[int, int]:
+    """Convert a bounding box given in (normalized) coordinates to a pair of patch indices.
+
+    Args:
+        bbox (`tuple[float, float, float, float]`):
+            The box as (x1, y1, x2, y2), i.e. its upper-left and lower-right corners. It should have x2 > x1 and
+            y2 > y1.
+        num_patches_per_side (`int`): the number of patches along each side.
+
+    Returns:
+        `tuple[int, int]`: The indices of the upper-left patch and of the lower-right patch, counted row by row.
+    """
+    left, top, right, bottom = bbox
+    if not right > left or not bottom > top:
+        raise ValueError("The coordinates in `bbox` should be `(x1, y1, x2, y2)` with `x2 > x1` and `y2 > y1`.")
+
+    # column / row of the patch holding the upper-left corner
+    first_col = math.floor(left * num_patches_per_side)
+    first_row = math.floor(top * num_patches_per_side)
+
+    # column / row of the patch used for the lower-right corner
+    last_col = math.ceil(right * num_patches_per_side - 1)
+    last_row = math.ceil(bottom * num_patches_per_side - 1)
+
+    upper_left_idx = first_row * num_patches_per_side + first_col
+    lower_right_idx = last_row * num_patches_per_side + last_col
+    return upper_left_idx, lower_right_idx
+
+
+# copied from https://github.com/microsoft/unilm/blob/97e4923e97d3ee10b57e97013556e3fd0d207a9b/kosmos-2/demo/decode_string.py#L35C1-L75C38
+# (with format modifications)
+def patch_index_to_coordinate(ul_idx: int, lr_idx: int, num_patches_per_side: int):
+    """
+    Given a grid of length `num_patches_per_side` and the indices of the upper-left and lower-right corners of a
+    bounding box, returns the normalized coordinates of the bounding box, in the form (x1, y1, x2, y2).
+
+    When both indices fall in the same cell, row or column, the box covers whole cells; otherwise its corners are
+    the centers of the two cells.
+
+    Args:
+        ul_idx (`int`): the index of the grid cell that corresponds to the upper-left corner of the bounding box.
+        lr_idx (`int`): the index of the grid cell that corresponds to the lower-right corner of the bounding box.
+        num_patches_per_side (`int`): the number of patches along each side.
+
+    Returns:
+        `tuple[float]`: the normalized coordinates of the bounding box, in the form (x1, y1, x2, y2).
+    """
+    # side length of one grid cell in normalized coordinates
+    cell_size = 1.0 / num_patches_per_side
+
+    # row (y) and column (x) of the cell at each corner; cells are numbered row by row
+    ul_y, ul_x = divmod(ul_idx, num_patches_per_side)
+    lr_y, lr_x = divmod(lr_idx, num_patches_per_side)
+
+    same_cell = ul_idx == lr_idx
+    same_row_or_column = ul_x == lr_x or ul_y == lr_y
+
+    if same_cell or same_row_or_column:
+        # the box runs from the upper-left corner of the first cell to the lower-right corner of the last cell
+        x1 = ul_x * cell_size
+        y1 = ul_y * cell_size
+        x2 = lr_x * cell_size + cell_size
+        y2 = lr_y * cell_size + cell_size
+    else:
+        # the box runs from the center of the first cell to the center of the last cell
+        half_cell = cell_size / 2
+        x1 = ul_x * cell_size + half_cell
+        y1 = ul_y * cell_size + half_cell
+        x2 = lr_x * cell_size + half_cell
+        y2 = lr_y * cell_size + half_cell
+
+    return x1, y1, x2, y2
+
+
+# copied from https://github.com/microsoft/unilm/blob/97e4923e97d3ee10b57e97013556e3fd0d207a9b/kosmos-2/demo/decode_string.py#L4-L33
+# (with format modifications)
+def extract_entities_with_patch_indices(text):
+    """Extract entities contained in `text`. The bounding boxes are given in the form of patch indices.
+
+    This function is only intended to be used within `clean_text_and_extract_entities_with_bboxes` where further
+    processing happens, including converting to normalized coordinates and whitespace character cleaning up.
+
+    Examples:
+
+    ```python
+    >>> text = "<grounding> An image of<phrase> a snowman</phrase><object><patch_index_0044><patch_index_0863></object> warming himself by<phrase> a fire</phrase><object><patch_index_0005><patch_index_0911></object>."
+    >>> entities = extract_entities_with_patch_indices(text)
+    >>> entities
+    [(' a snowman', (31, 41), [(44, 863)]), (' a fire', (130, 137), [(5, 911)])]
+    ```"""
+    # An optional `<phrase> ... </phrase>` followed by an `<object> ... </object>` that holds one or more pairs of
+    # patch index tokens, the pairs being separated by `</delimiter_of_multi_objects/>`
+    pattern = r"(?:(<phrase>([^<]+)</phrase>))?<object>((?:<patch_index_\d+><patch_index_\d+></delimiter_of_multi_objects/>)*<patch_index_\d+><patch_index_\d+>)</object>"
+    # A single patch index token, capturing its digits
+    patch_index_pattern = r"<patch_index_(\d+)>"
+
+    # Each entry is `(entity name, span of the name in `text`, list of patch index pairs)`
+    entities_with_patch_indices = []
+
+    for match in re.finditer(pattern, text):
+        phrase_tag, phrase, match_content = match.groups()
+        if phrase_tag:
+            # span of a `phrase` that is between <phrase> and </phrase>
+            span = match.span(2)
+        else:
+            phrase = None
+            # no phrase: use an empty span at the starting position of `<object>`
+            object_start = match.start(0)
+            span = (object_start, object_start)
+
+        entity_bboxes = []
+        # Split the match content by the delimiter to get the individual patch index pairs
+        for pair in match_content.split("</delimiter_of_multi_objects/>"):
+            # The xxxx value comes from the first token of the pair
+            x = re.search(patch_index_pattern, pair)
+            # Dropping the first character keeps the same search from matching the first token again (yyyy)
+            y = re.search(patch_index_pattern, pair[1:])
+
+            if x and y:
+                entity_bboxes.append((int(x.group(1)), int(y.group(1))))
+
+        # A phrase keeps all of its boxes together in one entity
+        if phrase:
+            entities_with_patch_indices.append((phrase, span, entity_bboxes))
+            continue
+
+        # Without a phrase, every box becomes its own entity under a fake name built from its two indices
+        for bbox in entity_bboxes:
+            entity = f"<patch_index_{bbox[0]}><patch_index_{bbox[1]}>"
+            entities_with_patch_indices.append((entity, span, [bbox]))
+
+    return entities_with_patch_indices
+
+
+def adjust_entity_positions(entity, text):
+    """Adjust the span of `entity` to be relative to `text` with the special fields removed."""
+    entity_name, (start, end) = entity
+
+    # lengths of the prefixes ending at `start` / `end` once special fields (tag tokens, patch index tokens, etc.)
+    # are removed
+    adjusted_span = tuple(len(re.sub("<.*?>", "", text[:position])) for position in (start, end))
+    return entity_name, adjusted_span
+
+
+def _cleanup_spaces(text, entities):
+    """Strip `text` and every entity name, moving each entity span so that it still matches the stripped text."""
+    # whitespace removed from the front of `text` shifts every span to the left by the same amount
+    text_offset = len(text) - len(text.lstrip())
+
+    cleaned_entities = []
+    for entity_name, (start, end), bboxes in entities:
+        stripped_name = entity_name.strip()
+        # whitespace around the name itself pulls the start forward and the end backward
+        name_lead = len(entity_name) - len(entity_name.lstrip())
+        name_trail = len(entity_name) - len(entity_name.rstrip())
+        new_span = (start - text_offset + name_lead, end - text_offset - name_trail)
+
+        cleaned_entities.append((stripped_name, new_span, bboxes))
+
+    cleaned_text = text.strip()
+    return cleaned_text, cleaned_entities
+
+
+# copied from https://github.com/microsoft/unilm/blob/97e4923e97d3ee10b57e97013556e3fd0d207a9b/kosmos-2/demo/decode_string.py#L77-L87
+# (with format modifications)
+def clean_text_and_extract_entities_with_bboxes(text, num_patches_per_side=32):
+    """Remove the tag tokens from `text`, extract entities in it with some cleaning up of white characters.
+
+    Examples:
+
+    ```python
+    >>> text = "<grounding> An image of<phrase> a snowman</phrase><object><patch_index_0044><patch_index_0863></object> warming himself by<phrase> a fire</phrase><object><patch_index_0005><patch_index_0911></object>."
+    >>> clean_text, entities = clean_text_and_extract_entities_with_bboxes(text)
+    >>> clean_text
+    'An image of a snowman warming himself by a fire.'
+
+    >>> entities
+    [('a snowman', (12, 21), [(0.390625, 0.046875, 0.984375, 0.828125)]), ('a fire', (41, 47), [(0.171875, 0.015625, 0.484375, 0.890625)])]
+    ```"""
+    # remove special fields (tag tokens, patch index tokens, etc.)
+    processed_text = re.sub("<.*?>", "", text)
+
+    entities = []
+    for name, span, patch_index_pairs in extract_entities_with_patch_indices(text):
+        # spans are re-expressed relative to the text with special fields removed
+        adjusted_entity = adjust_entity_positions((name, span), text)
+        bboxes_in_coords = [
+            patch_index_to_coordinate(ul_idx, lr_idx, num_patches_per_side) for ul_idx, lr_idx in patch_index_pairs
+        ]
+        entities.append(adjusted_entity + (bboxes_in_coords,))
+
+    return _cleanup_spaces(processed_text, entities)
+
+
+__all__ = ["Kosmos2Processor"]
diff --git a/phivenv/Lib/site-packages/transformers/models/kosmos2_5/__init__.py b/phivenv/Lib/site-packages/transformers/models/kosmos2_5/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..69ff528783bc11b0f00df15eeee2bb474ffa5a49
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/kosmos2_5/__init__.py
@@ -0,0 +1,31 @@
+# coding=utf-8
+# Copyright 2024 Microsoft Research and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_kosmos2_5 import *
+    from .image_processing_kosmos2_5 import *
+    from .image_processing_kosmos2_5_fast import *
+    from .modeling_kosmos2_5 import *
+    from .processing_kosmos2_5 import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/phivenv/Lib/site-packages/transformers/models/kosmos2_5/__pycache__/__init__.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/kosmos2_5/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3d40511d96dd4fcd361999becc1c9ab3e2cf5709
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/kosmos2_5/__pycache__/__init__.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/kosmos2_5/__pycache__/configuration_kosmos2_5.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/kosmos2_5/__pycache__/configuration_kosmos2_5.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..aa39cd562f2bf637036ddbc8601a28a62245ac9f
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/kosmos2_5/__pycache__/configuration_kosmos2_5.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/kosmos2_5/__pycache__/image_processing_kosmos2_5.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/kosmos2_5/__pycache__/image_processing_kosmos2_5.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..158bc005869974ae562ea13d501005e24619dcf4
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/kosmos2_5/__pycache__/image_processing_kosmos2_5.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/kosmos2_5/__pycache__/image_processing_kosmos2_5_fast.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/kosmos2_5/__pycache__/image_processing_kosmos2_5_fast.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..94c6dff5eebac5111b4a40ce098b3fbb610f514c
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/kosmos2_5/__pycache__/image_processing_kosmos2_5_fast.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/kosmos2_5/__pycache__/modeling_kosmos2_5.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/kosmos2_5/__pycache__/modeling_kosmos2_5.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..69655fec423f6d2546cdb5162ca46ec39163e0fb
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/kosmos2_5/__pycache__/modeling_kosmos2_5.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/kosmos2_5/__pycache__/processing_kosmos2_5.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/kosmos2_5/__pycache__/processing_kosmos2_5.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4329880051ea6ba644b02de336cc19e6233eab17
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/kosmos2_5/__pycache__/processing_kosmos2_5.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/kosmos2_5/configuration_kosmos2_5.py b/phivenv/Lib/site-packages/transformers/models/kosmos2_5/configuration_kosmos2_5.py
new file mode 100644
index 0000000000000000000000000000000000000000..9fc3bd586959a0cb4d9c147c0623f55d58d09037
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/kosmos2_5/configuration_kosmos2_5.py
@@ -0,0 +1,254 @@
+# coding=utf-8
+# Copyright 2024 Microsoft Research and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""KOSMOS-2.5 model configuration"""
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class Kosmos2_5TextConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Kosmos2_5TextModel`]. It is used to instantiate a
+    KOSMOS-2.5 text decoder according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of the text decoder of the KOSMOS-2.5
+    [microsoft/kosmos-2.5](https://huggingface.co/microsoft/kosmos-2.5) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 108481):
+            Vocabulary size of the Kosmos2_5 model. Defines the number of different tokens that can be represented by the
+            `input_ids` passed when calling [`Kosmos2_5Model`].
+        max_position_embeddings (`int`, *optional*, defaults to 4096):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            just in case (e.g., 512 or 1024 or 2048).
+        embed_dim (`int`, *optional*, defaults to 1536):
+            Dimensionality of the layers and the pooler layer.
+        layers (`int`, *optional*, defaults to 24):
+            Number of hidden layers in the Transformer encoder.
+        ffn_dim (`int`, *optional*, defaults to 6144):
+            Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
+        attention_heads (`int`, *optional*, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        activation_function (`str` or `function`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"silu"` and `"gelu_new"` are supported.
+        dropout (`float`, *optional*, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        activation_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for activations inside the fully connected layer.
+        layerdrop (`float`, *optional*, defaults to 0.0):
+            The LayerDrop probability for the decoder. See the
+            [LayerDrop paper](https://huggingface.co/papers/1909.11556) for more details.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-5):
+            The epsilon used by the layer normalization layers.
+        init_std (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        scale_embedding (`bool`, *optional*, defaults to `True`):
+            Whether to scale the embeddings by sqrt(embed_dim).
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models).
+    """
+
+    model_type = "kosmos_2_5_text_model"
+    base_config_key = "text_config"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    attribute_map = {
+        "num_attention_heads": "attention_heads",
+        "hidden_size": "embed_dim",
+        "num_hidden_layers": "layers",
+    }
+
+    def __init__(
+        self,
+        vocab_size=108481,
+        max_position_embeddings=4096,
+        embed_dim=1536,
+        layers=24,
+        ffn_dim=6144,
+        attention_heads=16,
+        activation_function="gelu",
+        dropout=0.1,
+        attention_dropout=0.0,
+        activation_dropout=0.0,
+        layerdrop=0.0,
+        layer_norm_eps=1e-5,
+        init_std=0.02,
+        scale_embedding=True,
+        use_cache=True,
+        pad_token_id=1,
+        bos_token_id=0,
+        eos_token_id=2,
+        **kwargs,
+    ):
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            **kwargs,
+        )
+
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.embed_dim = embed_dim
+        self.layers = layers
+        self.ffn_dim = ffn_dim
+        self.attention_heads = attention_heads
+        self.activation_function = activation_function
+        self.dropout = dropout
+        self.attention_dropout = attention_dropout
+        self.activation_dropout = activation_dropout
+        self.layerdrop = layerdrop
+        self.layer_norm_eps = layer_norm_eps
+        self.init_std = init_std
+        self.scale_embedding = scale_embedding
+        self.use_cache = use_cache
+
+
+class Kosmos2_5VisionConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Kosmos2_5VisionModel`]. It is used to
+    instantiate a KOSMOS-2.5 vision encoder according to the specified arguments, defining the model architecture.
+    Instantiating a configuration with the defaults will yield a similar configuration to that of the vision encoder of the KOSMOS-2.5
+    [microsoft/kosmos-2.5](https://huggingface.co/microsoft/kosmos-2.5) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        hidden_size (`int`, *optional*, defaults to 1536):
+            Dimensionality of the encoder layers and the pooler layer.
+        patch_embed_hidden_size (`int`, *optional*, defaults to 768):
+            Dimensionality of the input patch_embedding layer in the Transformer encoder.
+        intermediate_size (`int`, *optional*, defaults to 3968):
+            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+        head_dim (`int`, *optional*, defaults to 64):
+            Dimensionality of the key, query, value projections per attention head.
+        num_hidden_layers (`int`, *optional*, defaults to 18):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 24):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        dense_act_fn (`str` or `function`, *optional*, defaults to `"gelu_new"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"selu"` and `"gelu_new"` are supported.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-06):
+            The epsilon used by the layer normalization layers.
+        dropout_rate (`float`, *optional*, defaults to 0.0):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        max_num_patches (`int`, *optional*, defaults to 4096):
+            Maximum sequence length (here number of patches) supported by the model.
+    Example:
+
+    ```python
+    >>> from transformers import Kosmos2_5VisionConfig, Kosmos2_5VisionModel
+
+    >>> # Initializing a Kosmos2_5VisionConfig with microsoft/kosmos-2.5 style configuration
+    >>> configuration = Kosmos2_5VisionConfig()
+
+    >>> # Initializing a Kosmos2_5VisionModel (with random weights) from the microsoft/kosmos-2.5 style configuration
+    >>> model = Kosmos2_5VisionModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "kosmos_2_5_vision_model"
+    base_config_key = "vision_config"
+
+    def __init__(
+        self,
+        hidden_size=1536,
+        patch_embed_hidden_size=768,
+        intermediate_size=3968,
+        head_dim=64,
+        num_hidden_layers=18,
+        num_attention_heads=24,
+        dense_act_fn="gelu_new",
+        layer_norm_eps=1e-6,
+        dropout_rate=0.0,
+        attention_dropout=0.0,
+        max_num_patches=4096,
+        initializer_factor=1.0,
+        initializer_range=0.02,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        self.hidden_size = hidden_size
+        self.patch_embed_hidden_size = patch_embed_hidden_size
+        self.intermediate_size = intermediate_size
+        self.dropout_rate = dropout_rate
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.attention_dropout = attention_dropout
+        self.layer_norm_eps = layer_norm_eps
+        self.dense_act_fn = dense_act_fn
+        self.max_num_patches = max_num_patches
+        self.head_dim = head_dim
+        self.initializer_factor = initializer_factor
+        self.initializer_range = initializer_range
+
+
+class Kosmos2_5Config(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Kosmos2_5Model`]. It is used to instantiate a
+    KOSMOS-2.5 model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the KOSMOS-2.5
+    [microsoft/kosmos-2.5](https://huggingface.co/microsoft/kosmos-2.5) architecture.
+
+    Args:
+        text_config (`dict`, *optional*):
+            Dictionary of configuration options used to initialize [`Kosmos2_5TextConfig`].
+        vision_config (`dict`, *optional*):
+            Dictionary of configuration options used to initialize [`Kosmos2_5VisionConfig`].
+        latent_query_num (`int`, *optional*, defaults to 2048):
+            The number of latent query tokens that represent the image features used in the text decoder component.
+        kwargs (*optional*):
+            Dictionary of keyword arguments.
+    """
+
+    model_type = "kosmos-2.5"
+    sub_configs = {"text_config": Kosmos2_5TextConfig, "vision_config": Kosmos2_5VisionConfig}
+
+    def __init__(
+        self,
+        text_config=None,
+        vision_config=None,
+        latent_query_num=2048,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        if text_config is None:
+            text_config = {}
+            logger.info("text_config is None. Initializing the Kosmos2_5TextConfig with default values.")
+        if vision_config is None:
+            vision_config = {}
+            logger.info("vision_config is None. Initializing the Kosmos2_5VisionConfig with default values.")
+
+        self.text_config = Kosmos2_5TextConfig(**text_config)
+        self.vision_config = Kosmos2_5VisionConfig(**vision_config)
+
+        self.latent_query_num = latent_query_num
+
+
+__all__ = ["Kosmos2_5Config"]
diff --git a/phivenv/Lib/site-packages/transformers/models/kosmos2_5/image_processing_kosmos2_5.py b/phivenv/Lib/site-packages/transformers/models/kosmos2_5/image_processing_kosmos2_5.py
new file mode 100644
index 0000000000000000000000000000000000000000..e679050d337f251d4cdd7db199f1d53c6af686c7
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/kosmos2_5/image_processing_kosmos2_5.py
@@ -0,0 +1,353 @@
+# coding=utf-8
+# Copyright 2024 Microsoft Research and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Image processor class for Kosmos2_5."""
+
+import math
+from typing import Optional, Union
+
+import numpy as np
+
+from ...image_processing_utils import BaseImageProcessor, BatchFeature
+from ...image_transforms import (
+    convert_to_rgb,
+    normalize,
+    to_channel_dimension_format,
+)
+from ...image_utils import (
+    ChannelDimension,
+    ImageInput,
+    get_image_size,
+    infer_channel_dimension_format,
+    make_list_of_images,
+    to_numpy_array,
+    valid_images,
+)
+from ...utils import TensorType, is_torch_available, logging
+from ...utils.import_utils import requires_backends
+
+
+if is_torch_available():
+    import torch
+
+logger = logging.get_logger(__name__)
+DEFAULT_FONT_PATH = "ybelkada/fonts"
+
+
+# Copied from transformers.models.pix2struct.image_processing_pix2struct.torch_extract_patches
+def torch_extract_patches(image_tensor, patch_height, patch_width):
+    """
+    Utility function to extract patches from a given image tensor. Returns a tensor of shape
+    (1, `rows`, `columns`, `num_channels` x `patch_height` x `patch_width`).
+
+    Args:
+        image_tensor (torch.Tensor):
+            The image tensor to extract patches from.
+        patch_height (int):
+            The height of the patches to extract.
+        patch_width (int):
+            The width of the patches to extract.
+    """
+    requires_backends(torch_extract_patches, ["torch"])
+
+    image_tensor = image_tensor.unsqueeze(0)
+    patches = torch.nn.functional.unfold(image_tensor, (patch_height, patch_width), stride=(patch_height, patch_width))
+    patches = patches.reshape(image_tensor.size(0), image_tensor.size(1), patch_height, patch_width, -1)
+    patches = patches.permute(0, 4, 2, 3, 1).reshape(
+        image_tensor.size(2) // patch_height,
+        image_tensor.size(3) // patch_width,
+        image_tensor.size(1) * patch_height * patch_width,
+    )
+    return patches.unsqueeze(0)
+
+
+# Similar to transformers.models.pix2struct.image_processing_pix2struct.Pix2StructImageProcessor, but without `is_vqa` and additionally returning the width and height after resizing
+class Kosmos2_5ImageProcessor(BaseImageProcessor):
+    r"""
+    Constructs a Kosmos2_5 image processor.
+
+    Args:
+        do_convert_rgb (`bool`, *optional*, defaults to `True`):
+            Whether to convert the image to RGB.
+        do_normalize (`bool`, *optional*, defaults to `True`):
+            Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
+            method. According to Kosmos2_5 paper and code, the image is normalized with its own mean and standard
+            deviation.
+        patch_size (`Dict[str, int]`, *optional*, defaults to `{"height": 16, "width": 16}`):
+            The patch size to use for the image. According to Kosmos2_5 paper and code, the patch size is 16x16.
+        max_patches (`int`, *optional*, defaults to 4096):
+            The maximum number of patches to extract from the image as per the
+            [KOSMOS 2.5 paper](https://huggingface.co/papers/2309.11419).
+    """
+
+    model_input_names = ["flattened_patches"]
+
+    def __init__(
+        self,
+        do_convert_rgb: bool = True,
+        do_normalize: bool = True,
+        patch_size: Optional[dict[str, int]] = None,
+        max_patches: int = 4096,
+        **kwargs,
+    ) -> None:
+        super().__init__(**kwargs)
+        self.patch_size = patch_size if patch_size is not None else {"height": 16, "width": 16}
+        self.do_normalize = do_normalize
+        self.do_convert_rgb = do_convert_rgb
+        self.max_patches = max_patches
+
+    def extract_flattened_patches(
+        self,
+        image: np.ndarray,
+        max_patches: int,
+        patch_size: dict,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+        **kwargs,
+    ) -> np.ndarray:
+        """
+        Extract flattened patches from an image.
+
+        Args:
+            image (`np.ndarray`):
+                Image to extract flattened patches from.
+            max_patches (`int`):
+                Maximum number of patches to extract.
+            patch_size (`dict`):
+                Dictionary containing the patch height and width.
+
+        Returns:
+            `tuple`: `(result, resized_width, resized_height, rows, columns)`, where `result` (`np.ndarray`) is
+                a sequence of `max_patches` flattened patches.
+        """
+        requires_backends(self.extract_flattened_patches, "torch")
+
+        # convert to torch
+        image = to_channel_dimension_format(image, ChannelDimension.FIRST, input_data_format)
+        image = torch.from_numpy(image)
+
+        patch_height, patch_width = patch_size["height"], patch_size["width"]
+        image_height, image_width = get_image_size(image, ChannelDimension.FIRST)
+
+        # maximize scale s.t. rows * columns <= max_patches
+        scale = math.sqrt(max_patches * (patch_height / image_height) * (patch_width / image_width))
+        num_feasible_rows = max(min(math.floor(scale * image_height / patch_height), max_patches), 1)
+        num_feasible_cols = max(min(math.floor(scale * image_width / patch_width), max_patches), 1)
+        resized_height = max(num_feasible_rows * patch_height, 1)
+        resized_width = max(num_feasible_cols * patch_width, 1)
+
+        image = torch.nn.functional.interpolate(
+            image.unsqueeze(0),
+            size=(resized_height, resized_width),
+            mode="bilinear",
+            align_corners=False,
+            antialias=True,
+        ).squeeze(0)
+
+        # [1, rows, columns, patch_height * patch_width * image_channels]
+        patches = torch_extract_patches(image, patch_height, patch_width)
+
+        patches_shape = patches.shape
+        rows = patches_shape[1]
+        columns = patches_shape[2]
+        depth = patches_shape[3]
+
+        # [rows * columns, patch_height * patch_width * image_channels]
+        patches = patches.reshape([rows * columns, depth])
+
+        # [rows * columns, 1]
+        row_ids = (
+            torch.arange(rows, device=patches.device)
+            .reshape([rows, 1])
+            .repeat(1, columns)
+            .reshape([rows * columns, 1])
+        )
+        col_ids = (
+            torch.arange(columns, device=patches.device)
+            .reshape([1, columns])
+            .repeat(rows, 1)
+            .reshape([rows * columns, 1])
+        )
+
+        # Offset by 1 so the ids do not contain zeros, which represent padding.
+        row_ids += 1
+        col_ids += 1
+
+        # Prepare additional patch features.
+        # [rows * columns, 1]
+        row_ids = row_ids.to(torch.float32)
+        col_ids = col_ids.to(torch.float32)
+
+        # [rows * columns, 2 + patch_height * patch_width * image_channels]
+        result = torch.cat([row_ids, col_ids, patches], -1)
+
+        # [max_patches, 2 + patch_height * patch_width * image_channels]
+        result = torch.nn.functional.pad(result, [0, 0, 0, max_patches - (rows * columns)]).float()
+
+        result = to_numpy_array(result)
+
+        return result, resized_width, resized_height, rows, columns
+
+    # Copied from transformers.models.pix2struct.image_processing_pix2struct.Pix2StructImageProcessor.normalize
+    def normalize(
+        self,
+        image: np.ndarray,
+        data_format: Optional[Union[str, ChannelDimension]] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+        **kwargs,
+    ) -> np.ndarray:
+        """
+        Normalize an image. image = (image - image_mean) / image_std.
+
+        The image std is to mimic the tensorflow implementation of the `per_image_standardization`:
+        https://www.tensorflow.org/api_docs/python/tf/image/per_image_standardization
+
+        Args:
+            image (`np.ndarray`):
+                Image to normalize.
+            data_format (`str` or `ChannelDimension`, *optional*):
+                The channel dimension format for the output image. If unset, the channel dimension format of the input
+                image is used.
+            input_data_format (`str` or `ChannelDimension`, *optional*):
+                The channel dimension format of the input image. If not provided, it will be inferred.
+        """
+        if image.dtype == np.uint8:
+            image = image.astype(np.float32)
+
+        # take mean across the whole `image`
+        mean = np.mean(image)
+        std = np.std(image)
+        adjusted_stddev = max(std, 1.0 / math.sqrt(np.prod(image.shape)))
+
+        return normalize(
+            image,
+            mean=mean,
+            std=adjusted_stddev,
+            data_format=data_format,
+            input_data_format=input_data_format,
+            **kwargs,
+        )
+
+    def preprocess(
+        self,
+        images: ImageInput,
+        do_convert_rgb: Optional[bool] = None,
+        do_normalize: Optional[bool] = None,
+        max_patches: Optional[int] = None,
+        patch_size: Optional[dict[str, int]] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        data_format: ChannelDimension = ChannelDimension.FIRST,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+        **kwargs,
+    ) -> ImageInput:
+        """
+        Preprocess an image or batch of images. The processor first computes the maximum possible number of
+        aspect-ratio preserving patches of size `patch_size` that can be extracted from the image. It then pads the
+        patch sequence with zeros so that it has exactly `max_patches` entries. Before extracting the patches the
+        images are standardized following the tensorflow implementation of `per_image_standardization`
+        (https://www.tensorflow.org/api_docs/python/tf/image/per_image_standardization).
+
+
+        Args:
+            images (`ImageInput`):
+                Image to preprocess. Expects a single or batch of images.
+            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
+                Whether to convert the image to RGB.
+            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
+                Whether to normalize the image.
+            max_patches (`int`, *optional*, defaults to `self.max_patches`):
+                Maximum number of patches to extract.
+            patch_size (`dict`, *optional*, defaults to `self.patch_size`):
+                Dictionary containing the patch height and width.
+            return_tensors (`str` or `TensorType`, *optional*):
+                The type of tensors to return. Can be one of:
+                    - Unset: Return a list of `np.ndarray`.
+                    - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
+                    - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
+                    - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
+                    - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
+            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
+                The channel dimension format for the output image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - Unset: Use the channel dimension format of the input image.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the input image. If unset, the channel dimension format is inferred
+                from the input image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+        """
+        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
+        do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
+        patch_size = patch_size if patch_size is not None else self.patch_size
+        max_patches = max_patches if max_patches is not None else self.max_patches
+
+        if kwargs.get("data_format") is not None:
+            raise ValueError("data_format is not an accepted input as the outputs are ")
+
+        images = make_list_of_images(images)
+
+        if not valid_images(images):
+            raise ValueError(
+                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
+                "torch.Tensor, tf.Tensor or jax.ndarray."
+            )
+
+        # PIL RGBA images are converted to RGB
+        if do_convert_rgb:
+            images = [convert_to_rgb(image) for image in images]
+
+        # All transformations expect numpy arrays.
+        images = [to_numpy_array(image) for image in images]
+
+        if input_data_format is None:
+            # We assume that all images have the same channel dimension format.
+            input_data_format = infer_channel_dimension_format(images[0])
+
+        flattened_patches, width, height, rows, cols, attention_masks = [], [], [], [], [], []
+        for image in images:
+            if do_normalize:
+                image = self.normalize(image=image, input_data_format=input_data_format)
+
+            # convert to torch tensor and permute
+            patches, resized_width, resized_height, n_rows, n_columns = self.extract_flattened_patches(
+                image=image,
+                max_patches=max_patches,
+                patch_size=patch_size,
+                input_data_format=input_data_format,
+            )
+            flattened_patches.append(patches)
+            width.append(resized_width)
+            height.append(resized_height)
+            rows.append(n_rows)
+            cols.append(n_columns)
+            # create attention mask in numpy
+            attention_masks.append((patches.sum(axis=-1) != 0).astype(np.float32))
+
+        encoded_outputs = BatchFeature(
+            data={
+                "flattened_patches": flattened_patches,
+                "attention_mask": attention_masks,
+                "width": width,
+                "height": height,
+                "rows": rows,
+                "cols": cols,
+            },
+            tensor_type=return_tensors,
+        )
+
+        return encoded_outputs
+
+
+__all__ = ["Kosmos2_5ImageProcessor"]
diff --git a/phivenv/Lib/site-packages/transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py b/phivenv/Lib/site-packages/transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py
new file mode 100644
index 0000000000000000000000000000000000000000..7b9613ed0074f52d27ee6f437feab39b666b6ffe
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py
@@ -0,0 +1,299 @@
+# coding=utf-8
+# Copyright 2025 Microsoft Research and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Fast Image processor class for Kosmos2_5."""
+
+import math
+from typing import Optional, Union
+
+from ...image_processing_utils import BatchFeature
+from ...image_processing_utils_fast import (
+    BaseImageProcessorFast,
+    DefaultFastImageProcessorKwargs,
+    group_images_by_shape,
+    reorder_images,
+)
+from ...image_utils import ChannelDimension, ImageInput, get_image_size
+from ...processing_utils import Unpack
+from ...utils import TensorType, auto_docstring, is_torch_available
+
+
+if is_torch_available():
+    import torch
+
+
+# Similar to transformers.models.pix2struct.image_processing_pix2struct.torch_extract_patches but dealing with a batch of images directly.
+def torch_extract_patches(image_tensor, patch_height, patch_width):
+    """
+    Utiliy function to extract patches from a given tensor representing a batch of images. Returns a tensor of shape
+    (batch_size, `rows`, `columns`, `num_channels` x `patch_height` x `patch_width`).
+
+    Args:
+        image_tensor (torch.Tensor):
+            The image tensor to extract patches from.
+        patch_height (int):
+            The height of the patches to extract.
+        patch_width (int):
+            The width of the patches to extract.
+    """
+    image_tensor = image_tensor
+    patches = torch.nn.functional.unfold(image_tensor, (patch_height, patch_width), stride=(patch_height, patch_width))
+    patches = patches.reshape(image_tensor.size(0), image_tensor.size(1), patch_height, patch_width, -1)
+    patches = patches.permute(0, 4, 2, 3, 1).reshape(
+        image_tensor.size(0),
+        image_tensor.size(2) // patch_height,
+        image_tensor.size(3) // patch_width,
+        image_tensor.size(1) * patch_height * patch_width,
+    )
+    return patches
+
+
+class Kosmos2_5FastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
+    # Keyword arguments accepted by the Kosmos2_5 fast image processor on top of the defaults declared by
+    # `DefaultFastImageProcessorKwargs`.
+    # NOTE(review): the docstring below has no summary line and only lists the extra arguments; presumably it is
+    # consumed by the `auto_docstring` tooling in that exact form - confirm before reformatting it.
+    r"""
+    patch_size (`Dict[str, int]`, *optional*, defaults to `{"height": 16, "width": 16}`):
+        The patch size to use for the image. According to Kosmos2_5 paper and code, the patch size is 16x16.
+    max_patches (`int`, *optional*, defaults to 4096):
+        The maximum number of patches to extract from the image as per the
+        [KOSMOS 2.5 paper](https://huggingface.co/papers/2309.11419).
+    """
+
+    # Patch dimensions; read elsewhere in this file through the "height" and "width" keys.
+    patch_size: Optional[dict[str, int]]
+    # Length every per-image patch sequence is padded to.
+    max_patches: Optional[int]
+
+
+@auto_docstring
+class Kosmos2_5ImageProcessorFast(BaseImageProcessorFast):
+    # To be checked against the slow image processor
+    # None values left after checking can be removed
+    do_normalize = True
+    do_convert_rgb = True
+    patch_size = {"height": 16, "width": 16}
+    max_patches = 4096
+    rescale_factor = None
+    valid_kwargs = Kosmos2_5FastImageProcessorKwargs
+
+    def __init__(self, **kwargs: Unpack[Kosmos2_5FastImageProcessorKwargs]):
+        # Pure pass-through: every keyword argument is forwarded unchanged to the base-class initializer.
+        super().__init__(**kwargs)
+
+    @auto_docstring
+    def preprocess(self, images: ImageInput, **kwargs: Unpack[Kosmos2_5FastImageProcessorKwargs]) -> BatchFeature:
+        r"""
+        patch_size (`Dict[str, int]`, *optional*, defaults to `{"height": 16, "width": 16}`):
+            The patch size to use for the image. According to Kosmos2_5 paper and code, the patch size is 16x16.
+        max_patches (`int`, *optional*, defaults to 4096):
+            The maximum number of patches to extract from the image as per the
+            [KOSMOS 2.5 paper](https://huggingface.co/papers/2309.11419).
+        """
+        # `image_mean` / `image_std` are passed as placeholder zeros: `normalize` in this class computes its own
+        # per-image mean and standard deviation and overrides them.
+        # NOTE(review): presumably the base-class `preprocess` requires some value for both when normalization is
+        # enabled - confirm against the linked issue.
+        # Because both are passed explicitly, a caller that also supplies `image_mean` or `image_std` in `kwargs`
+        # gets a `TypeError` (duplicate keyword argument).
+        # return super().preprocess(images, **kwargs)
+        # TODO: revert once the issue is fixed: https://huggingface.slack.com/archives/C02TXKQQLE5/p1743411133979019
+        return super().preprocess(images, image_mean=0.0, image_std=0.0, **kwargs)
+
+    def normalize(
+        self,
+        image: "torch.Tensor",
+        **kwargs,
+    ) -> "torch.Tensor":
+        """
+        Normalize an image. image = (image - image_mean) / image_std.
+
+        The image std is to mimic the tensorflow implementation of the `per_image_standardization`:
+        https://www.tensorflow.org/api_docs/python/tf/image/per_image_standardization
+
+        Args:
+            image (`torch.Tensor`):
+                Image to normalize.
+        """
+        # Q: should we keep this?
+        if image.dtype == torch.uint8:
+            image = image.to(dtype=torch.float32)
+
+        # take mean across the whole `image` except the batch dim (= 0).
+        dim = list(range(1, image.ndim))
+        mean = torch.mean(image, dim=dim)
+        std = torch.std(image, dim=dim)
+        # num_elements in a single image
+        num_elements = torch.tensor(torch.numel(image[0]))
+        adjusted_stddev = torch.max(std, 1.0 / torch.sqrt(num_elements))
+
+        # change `image` from [batch_size, n_channels, width, height] to [n_channels, batch_size, width, height]
+        image = torch.transpose(image, 0, 1)
+
+        # 'torchvision.transforms.Normalize` works on the usual channel dimension (dim=1) which is the batch
+        # dimension before we use `transpose`.
+        image = super().normalize(
+            image,
+            mean=mean,
+            std=adjusted_stddev,
+            **kwargs,
+        )
+        # back to [batch_size, n_channels, width, height]
+        normalized_image = torch.transpose(image, 0, 1)
+
+        return normalized_image
+
+    def extract_flattened_patches(
+        self,
+        image: "torch.Tensor",
+        max_patches: int,
+        patch_size: dict,
+        # TODO: correct this return type, and the docstring
+    ) -> "torch.Tensor":
+        """
+        Extract flattened patches from an image.
+
+        Args:
+            image (`np.ndarray`):
+                Image to extract flattened patches from.
+            max_patches (`int`):
+                Maximum number of patches to extract.
+            patch_size (`dict`):
+                Dictionary containing the patch height and width.
+
+        Returns:
+            result (`np.ndarray`):
+                A sequence of `max_patches` flattened patches.
+        """
+        patch_height, patch_width = patch_size["height"], patch_size["width"]
+        image_height, image_width = get_image_size(image, ChannelDimension.FIRST)
+
+        # maximize scale s.t.
+        scale = math.sqrt(max_patches * (patch_height / image_height) * (patch_width / image_width))
+        num_feasible_rows = max(min(math.floor(scale * image_height / patch_height), max_patches), 1)
+        num_feasible_cols = max(min(math.floor(scale * image_width / patch_width), max_patches), 1)
+        resized_height = max(num_feasible_rows * patch_height, 1)
+        resized_width = max(num_feasible_cols * patch_width, 1)
+
+        image = torch.nn.functional.interpolate(
+            image,
+            size=(resized_height, resized_width),
+            mode="bilinear",
+            align_corners=False,
+            antialias=True,
+        )
+
+        # [batch_size, rows, columns, patch_height * patch_width * image_channels]
+        patches = torch_extract_patches(image, patch_height, patch_width)
+
+        patches_shape = patches.shape
+        batch_size = patches_shape[0]
+        rows = patches_shape[1]
+        columns = patches_shape[2]
+        depth = patches_shape[3]
+
+        # [batch_size, rows * columns, patch_height * patch_width * image_channels]
+        patches = patches.reshape([batch_size, rows * columns, depth])
+
+        # [rows * columns, 1]
+        row_ids = (
+            torch.arange(rows, device=patches.device)
+            .reshape([rows, 1])
+            .repeat(1, columns)
+            .reshape([rows * columns, 1])
+        )
+        col_ids = (
+            torch.arange(columns, device=patches.device)
+            .reshape([1, columns])
+            .repeat(rows, 1)
+            .reshape([rows * columns, 1])
+        )
+
+        # Offset by 1 so the ids do not contain zeros, which represent padding.
+        row_ids += 1
+        col_ids += 1
+
+        # Prepare additional patch features.
+        # [batch_size, rows * columns, 1]
+        row_ids = row_ids.unsqueeze(0).repeat(batch_size, 1, 1).to(torch.float32)
+        col_ids = col_ids.unsqueeze(0).repeat(batch_size, 1, 1).to(torch.float32)
+
+        # [rows * columns, 2 + patch_height * patch_width * image_channels]
+        result = torch.cat([row_ids, col_ids, patches], -1)
+
+        # [batch_size, max_patches, 2 + patch_height * patch_width * image_channels]
+        result = torch.nn.functional.pad(result, [0, 0, 0, max_patches - (rows * columns)]).float()
+
+        return result, resized_width, resized_height, rows, columns
+
+    def _preprocess(
+        self,
+        images: list["torch.Tensor"],
+        do_normalize: bool,
+        max_patches: int,
+        patch_size: dict[str, int],
+        disable_grouping: Optional[bool],
+        return_tensors: Optional[Union[str, TensorType]],
+        **kwargs,
+    ) -> BatchFeature:
+        # Q: should we have this?
+        if kwargs.get("data_format") is not None:
+            raise ValueError("data_format is not an accepted input as the outputs are ")
+
+        width, height, rows, cols, attention_masks = [], [], [], [], []
+        obj_idx_to_new_index_map = {}
+        current_index = -1
+
+        # Group images by size for batched resizing
+        processed_image_patches_grouped = {}
+        grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping)
+        for shape, stacked_images in grouped_images.items():
+            # TODO: if it's possible to do in batch mode
+            if do_normalize:
+                stacked_images = self.normalize(stacked_images, **kwargs)
+
+            # TODO: we need this to be in batch from
+            # convert to torch tensor and permute
+            patches, resized_width, resized_height, n_rows, n_columns = self.extract_flattened_patches(
+                image=stacked_images,
+                max_patches=max_patches,
+                patch_size=patch_size,
+            )
+            n_of_stacked_images = stacked_images.size()[0]
+            width.extend([resized_width] * n_of_stacked_images)
+            height.extend([resized_height] * n_of_stacked_images)
+            rows.extend([n_rows] * n_of_stacked_images)
+            cols.extend([n_columns] * n_of_stacked_images)
+            # create attention mask in numpy
+            attention_masks.extend(list((patches.sum(axis=-1) != 0).to(dtype=torch.float32)))
+            processed_image_patches_grouped[shape] = list(patches)
+            for x in processed_image_patches_grouped[shape]:
+                current_index += 1
+                obj_idx_to_new_index_map[id(x)] = current_index
+
+        processed_images = reorder_images(processed_image_patches_grouped, grouped_images_index)
+        orig_idx_to_new_idx_map = {
+            orig_idx: obj_idx_to_new_index_map[id(image)] for orig_idx, image in enumerate(processed_images)
+        }
+
+        flattened_patches = processed_images
+        width = [width[orig_idx_to_new_idx_map[orig_idx]] for orig_idx in orig_idx_to_new_idx_map]
+        height = [height[orig_idx_to_new_idx_map[orig_idx]] for orig_idx in orig_idx_to_new_idx_map]
+        rows = [rows[orig_idx_to_new_idx_map[orig_idx]] for orig_idx in orig_idx_to_new_idx_map]
+        cols = [cols[orig_idx_to_new_idx_map[orig_idx]] for orig_idx in orig_idx_to_new_idx_map]
+
+        encoded_outputs = BatchFeature(
+            data={
+                "flattened_patches": torch.stack(flattened_patches, dim=0) if return_tensors else flattened_patches,
+                "attention_mask": torch.stack(attention_masks, dim=0) if return_tensors else attention_masks,
+                "width": width,
+                "height": height,
+                "rows": rows,
+                "cols": cols,
+            },
+            tensor_type=return_tensors,
+        )
+
+        return encoded_outputs
+
+
+__all__ = ["Kosmos2_5ImageProcessorFast"]
diff --git a/phivenv/Lib/site-packages/transformers/models/kosmos2_5/modeling_kosmos2_5.py b/phivenv/Lib/site-packages/transformers/models/kosmos2_5/modeling_kosmos2_5.py
new file mode 100644
index 0000000000000000000000000000000000000000..27e692273c7168a81da3a9b0ef008513c56700f6
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/kosmos2_5/modeling_kosmos2_5.py
@@ -0,0 +1,1841 @@
+# coding=utf-8
+# Copyright 2024 Microsoft Research and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch KOSMOS-2.5 model."""
+
+import math
+from dataclasses import dataclass
+from typing import Any, Callable, Optional, Union
+
+import torch
+import torch.utils.checkpoint
+from torch import nn
+
+from ...activations import ACT2FN
+from ...cache_utils import Cache, DynamicCache
+from ...generation import GenerationMixin
+from ...modeling_attn_mask_utils import AttentionMaskConverter
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import (
+    BaseModelOutput,
+    BaseModelOutputWithPast,
+    BaseModelOutputWithPastAndCrossAttentions,
+    BaseModelOutputWithPooling,
+    CausalLMOutputWithCrossAttentions,
+)
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from ...processing_utils import Unpack
+from ...utils import (
+    ModelOutput,
+    TransformersKwargs,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    can_return_tuple,
+    is_torch_flex_attn_available,
+    logging,
+    replace_return_docstrings,
+)
+from .configuration_kosmos2_5 import (
+    Kosmos2_5Config,
+    Kosmos2_5TextConfig,
+    Kosmos2_5VisionConfig,
+)
+
+
+if is_torch_flex_attn_available():
+    from torch.nn.attention.flex_attention import BlockMask
+
+    from ...integrations.flex_attention import make_flex_block_causal_mask
+
+
+logger = logging.get_logger(__name__)
+
+_CONFIG_FOR_DOC = Kosmos2_5Config
+
+
+# Copied from transformers.models.kosmos2.modeling_kosmos2._expand_mask
+def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
+    """
+    Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
+    """
+    bsz, src_len = mask.size()
+    tgt_len = tgt_len if tgt_len is not None else src_len
+
+    expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
+
+    inverted_mask = 1.0 - expanded_mask
+
+    return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
+
+
+# Copied from transformers.models.roberta.modeling_roberta.create_position_ids_from_input_ids
+def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
+    """
+    Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
+    are ignored. This is modified from fairseq's `utils.make_positions`.
+
+    Args:
+        x: torch.Tensor x:
+
+    Returns: torch.Tensor
+    """
+    # The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA.
+    mask = input_ids.ne(padding_idx).int()
+    incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
+    return incremental_indices.long() + padding_idx
+
+
+KOSMOS2_5_START_DOCSTRING = r"""
+    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.)
+
+    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
+    and behavior.
+
+    Parameters:
+        config ([`Kosmos2_5Config`]): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+KOSMOS2_5_VISION_INPUTS_DOCSTRING = r"""
+    Args:
+        flattened_patches (`torch.FloatTensor` of shape `(batch_size, max_patches, 2 + patch_height * patch_width * image_channels)`):
+            Flattened patches of the images. `flattened_patches` can be obtained using [`AutoImageProcessor`]. See
+            [`Kosmos2_5ImageProcessor.__call__`] for details.
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+"""
+
+KOSMOS2_5_TEXT_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
+            it.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
+        image_embeds: (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
+            Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.
+        image_embeds_position_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to indicate the location in a sequence to insert the image features . Mask values selected in `[0,
+            1]`:
+
+            - 1 for places where to put the image features,
+            - 0 for places that are not for image features (i.e. for text tokens).
+
+        past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
+            Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
+
+            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
+            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
+            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+            model's internal embedding lookup matrix.
+        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+            config.max_position_embeddings - 1]`.
+
+            [What are position IDs?](../glossary#position-ids)
+        use_cache (`bool`, *optional*):
+            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+            `past_key_values`).
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+"""
+
+KOSMOS2_5_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
+            it.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        flattened_patches (`torch.FloatTensor` of shape `(batch_size, max_patches, 2 + patch_height * patch_width * image_channels)`):
+            Flattened patches of the images. `flattened_patches` can be obtained using [`AutoImageProcessor`]. See
+            [`Kosmos2_5ImageProcessor.__call__`] for details.
+        width (`torch.FloatTensor` of shape `(batch_size,)`):
+            The original width (before resizing) of each image in the batch. This can be obtained using
+            [`AutoImageProcessor`]. See [`Kosmos2_5ImageProcessor.__call__`] for details.
+        height (`torch.FloatTensor` of shape `(batch_size,)`):
+            The original height (before resizing) of each image in the batch. This can be obtained using
+            [`AutoImageProcessor`]. See [`Kosmos2_5ImageProcessor.__call__`] for details.
+        image_embeds_position_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to indicate the location in a sequence to insert the image features . Mask values selected in `[0,
+            1]`:
+
+            - 1 for places where to put the image features,
+            - 0 for places that are not for image features (i.e. for text tokens).
+
+        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
+
+        past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
+            Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
+
+            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
+            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
+            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+        image_embeds: (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
+            Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.
+        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+            model's internal embedding lookup matrix.
+        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+            config.max_position_embeddings - 1]`.
+
+            [What are position IDs?](../glossary#position-ids)
+        use_cache (`bool`, *optional*):
+            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+            `past_key_values`).
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+"""
+
+
+@dataclass
+class Kosmos2_5ModelOutput(ModelOutput):
+    """
+    Base class for text model's outputs that also contains a pooling of the last hidden states.
+
+    Args:
+        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+            Sequence of hidden-states at the output of the last layer of the model.
+        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
+            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
+
+            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
+        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`.
+
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+            heads.
+        width (`torch.FloatTensor` of shape `(batch_size,)`):
+            The original width (before resizing) of each image in the batch.
+        height (`torch.FloatTensor` of shape `(batch_size,)`):
+            The original height (before resizing) of each image in the batch.
+        image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
+            Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.
+        projection_attentions (`tuple(torch.FloatTensor)`, *optional*):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`.
+
+            Attentions weights given by `Kosmos2ImageToTextProjection`, after the attention softmax, used to compute
+            the weighted average in the self-attention heads.
+        vision_model_output(`BaseModelOutputWithPooling`, *optional*):
+            The output of the [`Kosmos2VisionModel`].
+        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
+            `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if
+            `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads,
+            encoder_sequence_length, embed_size_per_head)`.
+
+            Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
+            `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
+            input) to speed up sequential decoding.
+    """
+
+    last_hidden_state: torch.FloatTensor = None
+    past_key_values: Optional[tuple[tuple[torch.FloatTensor]]] = None
+    hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    attentions: Optional[tuple[torch.FloatTensor]] = None
+    width: Optional[torch.FloatTensor] = None
+    height: Optional[torch.FloatTensor] = None
+    image_embeds: Optional[torch.FloatTensor] = None
+    projection_attentions: Optional[tuple[torch.FloatTensor]] = None
+    vision_model_output: BaseModelOutputWithPooling = None
+
+    def to_tuple(self) -> tuple[Any]:
+        return tuple(
+            (self[k] if k not in ["vision_model_output"] else getattr(self, k).to_tuple()) for k in self.keys()
+        )
+
+
+@dataclass
+class Kosmos2_5ForConditionalGenerationModelOutput(ModelOutput):
+    """
+    Model output class for `Kosmos2_5ForConditionalGeneration`.
+
+    Args:
+        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+            Language modeling loss (for next-token prediction).
+        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
+            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
+            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
+
+            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
+        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`.
+
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+            heads.
+        width (`torch.FloatTensor` of shape `(batch_size,)`):
+            The original width (before resizing) of each image in the batch.
+        height (`torch.FloatTensor` of shape `(batch_size,)`):
+            The original height (before resizing) of each image in the batch.
+        image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
+            Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.
+        projection_attentions (`tuple(torch.FloatTensor)`, *optional*):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`.
+
+            Attentions weights given by `Kosmos2ImageToTextProjection`, after the attention softmax, used to compute
+            the weighted average in the self-attention heads.
+        vision_model_output(`BaseModelOutputWithPooling`, *optional*):
+            The output of the [`Kosmos2VisionModel`].
+        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
+            `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if
+            `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads,
+            encoder_sequence_length, embed_size_per_head)`.
+
+            Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
+            `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
+            input) to speed up sequential decoding.
+    """
+
+    loss: Optional[torch.FloatTensor] = None
+    logits: torch.FloatTensor = None
+    past_key_values: Optional[Union[Cache, list[torch.FloatTensor]]] = None
+    hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    attentions: Optional[tuple[torch.FloatTensor]] = None
+    width: Optional[torch.FloatTensor] = None
+    height: Optional[torch.FloatTensor] = None
+    image_embeds: Optional[torch.FloatTensor] = None
+    projection_attentions: Optional[tuple[torch.FloatTensor]] = None
+    vision_model_output: BaseModelOutputWithPooling = None
+
+    def to_tuple(self) -> tuple[Any]:
+        """
+        Convert this output into a plain tuple, in the order given by `self.keys()`.
+
+        The nested `vision_model_output` entry is itself converted through its own `to_tuple()`; every other entry
+        is taken as-is.
+        """
+        flattened = []
+        for key in self.keys():
+            if key == "vision_model_output":
+                flattened.append(getattr(self, key).to_tuple())
+            else:
+                flattened.append(self[key])
+        return tuple(flattened)
+
+
+# Copied from transformers.models.pix2struct.modeling_pix2struct.Pix2StructLayerNorm with Pix2Struct->Kosmos2_5
+class Kosmos2_5LayerNorm(nn.Module):
+    """
+    T5-style layer normalization (RMSNorm): the input is rescaled by the root mean square of its last dimension and
+    multiplied by a learned per-feature weight. There is no bias and no mean subtraction.
+    """
+
+    def __init__(self, hidden_size, eps=1e-6):
+        """
+        Args:
+            hidden_size: size of the normalized (last) dimension; also the length of the learned `weight`.
+            eps: constant added to the mean of squares before the reciprocal square root is taken.
+        """
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+
+    def forward(self, hidden_states):
+        # Root Mean Square Layer Normalization (https://huggingface.co/papers/1910.07467): only a rescaling, so the
+        # "variance" is the plain mean of squares. It is accumulated in fp32 to stay accurate for half-precision
+        # inputs.
+        mean_square = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
+        inverse_rms = torch.rsqrt(mean_square + self.variance_epsilon)
+        normalized = hidden_states * inverse_rms
+
+        # Cast back down only when the learned weight itself is stored in half precision.
+        weight_dtype = self.weight.dtype
+        if weight_dtype in (torch.float16, torch.bfloat16):
+            normalized = normalized.to(weight_dtype)
+
+        return self.weight * normalized
+
+
+# similar to transformers.models.pix2struct.modeling_pix2struct.Pix2StructVisionEmbeddings but with `inplace=False`
+# TODO: check with krip
+class Kosmos2_5VisionEmbeddings(nn.Module):
+    """
+    Embeds flattened image patches: a linear projection of the patch features plus learned row and column
+    embeddings, followed by (out-of-place) dropout.
+    """
+
+    def __init__(self, config: Kosmos2_5VisionConfig) -> None:
+        super().__init__()
+        self.config = config
+        hidden_size = config.hidden_size
+        self.patch_projection = nn.Linear(config.patch_embed_hidden_size, hidden_size)
+
+        self.row_embedder = nn.Embedding(config.max_num_patches, hidden_size)
+        self.column_embedder = nn.Embedding(config.max_num_patches, hidden_size)
+
+        self.dropout = nn.Dropout(config.dropout_rate, inplace=False)
+
+    def forward(self, flattened_patches: torch.Tensor) -> torch.Tensor:
+        # flattened_patches: `batch_size`, `seq_len`, `hidden_size` + 2
+        # Entries 0 and 1 of the last axis hold the row / column index of each patch; the remainder is the patch
+        # content that gets projected.
+        row_ids = flattened_patches[:, :, 0].long()
+        col_ids = flattened_patches[:, :, 1].long()
+        patch_features = flattened_patches[:, :, 2:]
+
+        patch_embeds = self.patch_projection(patch_features)
+        target_device = patch_embeds.device
+        row_embeds = self.row_embedder(row_ids).to(target_device)
+        col_embeds = self.column_embedder(col_ids).to(target_device)
+
+        # patch content + row position + column position, then dropout
+        return self.dropout(patch_embeds + row_embeds + col_embeds)
+
+
+# Copied from transformers.models.t5.modeling_t5.T5DenseGatedActDense with T5DenseGatedActDense->Pix2StructVisionMlp,T5Config->Pix2StructVisionConfig,config.d_model->config.hidden_size,dropout_rate->dropout_rate
+class Kosmos2_5VisionMlp(nn.Module):
+    """Gated feed-forward block: `wo(dropout(act(wi_0(x)) * wi_1(x)))`, with all three projections bias-free."""
+
+    def __init__(self, config: Kosmos2_5VisionConfig):
+        super().__init__()
+        hidden_size, intermediate_size = config.hidden_size, config.intermediate_size
+        self.wi_0 = nn.Linear(hidden_size, intermediate_size, bias=False)
+        self.wi_1 = nn.Linear(hidden_size, intermediate_size, bias=False)
+        self.wo = nn.Linear(intermediate_size, hidden_size, bias=False)
+        self.dropout = nn.Dropout(config.dropout_rate)
+        self.act = ACT2FN[config.dense_act_fn]
+
+        # Ignore copy
+        self.config = config
+
+    def forward(self, hidden_states):
+        gate = self.act(self.wi_0(hidden_states))
+        gated = self.dropout(gate * self.wi_1(hidden_states))
+
+        # `self.wo` can live in a different dtype than the activations (it is kept in float32 to make 8bit
+        # quantization work for google/flan-t5-xxl, see https://github.com/huggingface/transformers/issues/20287).
+        # Align the activations with it, unless the weights are `int8`, which can happen when users force
+        # `_keep_in_fp32_modules` to be `None`.
+        out_weight = self.wo.weight
+        needs_cast = (
+            isinstance(out_weight, torch.Tensor)
+            and gated.dtype != out_weight.dtype
+            and out_weight.dtype != torch.int8
+        )
+        if needs_cast:
+            gated = gated.to(out_weight.dtype)
+
+        return self.wo(gated)
+
+
+def eager_attention_forward(
+    module: nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: Optional[torch.Tensor],
+    scaling: float,
+    dropout: float = 0.0,
+    **kwargs,
+):
+    """
+    Reference ("eager") attention: scaled `query @ key^T`, optional additive mask, softmax, dropout, then a
+    weighted sum of `value`.
+
+    Args:
+        module: only `module.training` is read, to decide whether dropout is active.
+        query, key, value: 4D tensors; the score matrix is built from the last two axes of `query` and `key`.
+        attention_mask: optional 4D additive mask; its last axis is cropped to the number of key positions.
+        scaling: factor applied to the raw scores.
+        dropout: dropout probability applied to the attention probabilities.
+        **kwargs: accepted and ignored.
+
+    Returns:
+        `(attn_output, attn_weights)`, where `attn_output` has axes 1 and 2 swapped relative to the
+        `attn_weights @ value` product and is contiguous.
+    """
+    # NOTE: the raw scores may overflow in fp16
+    scores = torch.matmul(query, key.transpose(2, 3)) * scaling
+
+    if attention_mask is not None:
+        # additive mask, limited to the key positions that are actually present
+        scores = scores + attention_mask[:, :, :, : key.shape[-2]]
+
+    # softmax runs in fp32 and the result is cast back to the query dtype
+    probs = nn.functional.softmax(scores, dim=-1, dtype=torch.float32).to(query.dtype)
+    probs = nn.functional.dropout(probs, p=dropout, training=module.training)
+
+    context = torch.matmul(probs, value).transpose(1, 2).contiguous()
+    return context, probs
+
+
+class Kosmos2_5VisionAttention(nn.Module):
+    """Bias-free multi-head self-attention used inside the Kosmos-2.5 vision layers."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.head_dim = config.head_dim
+        self.n_heads = config.num_attention_heads
+        self.dropout = config.attention_dropout
+        self.inner_dim = self.n_heads * self.head_dim
+        self.is_causal = False
+        self.scaling = self.head_dim**-0.5
+
+        # All four projections are bias-free.
+        model_dim, heads_dim = self.hidden_size, self.inner_dim
+        self.query = nn.Linear(model_dim, heads_dim, bias=False)
+        self.key = nn.Linear(model_dim, heads_dim, bias=False)
+        self.value = nn.Linear(model_dim, heads_dim, bias=False)
+        self.output = nn.Linear(heads_dim, model_dim, bias=False)
+
+        self.gradient_checkpointing = False
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask=None,
+        **kwargs: Unpack[TransformersKwargs],
+    ):
+        """
+        Self-attention over `hidden_states`.
+
+        Returns `(attn_output, attn_weights)`; `attn_weights` is whatever the selected attention implementation
+        hands back as its second value.
+        """
+        leading_shape = hidden_states.shape[:-1]
+        per_head_shape = (*leading_shape, -1, self.head_dim)
+
+        def split_heads(projected):
+            # split the last axis into (heads, head_dim), then swap axes 1 and 2
+            return projected.view(per_head_shape).transpose(1, 2)
+
+        query_states = split_heads(self.query(hidden_states))
+        key_states = split_heads(self.key(hidden_states))
+        value_states = split_heads(self.value(hidden_states))
+
+        # Pick the attention backend. SDPA cannot return attention weights, so that combination falls back to eager.
+        implementation = self.config._attn_implementation
+        if implementation == "eager":
+            attention_interface: Callable = eager_attention_forward
+        elif implementation == "sdpa" and kwargs.get("output_attentions", False):
+            logger.warning_once(
+                "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
+                'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
+            )
+            attention_interface = eager_attention_forward
+        else:
+            attention_interface = ALL_ATTENTION_FUNCTIONS[implementation]
+
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            dropout=self.dropout if self.training else 0.0,
+            scaling=self.scaling,
+            **kwargs,
+        )
+
+        # merge the heads back into one feature axis and project to the model dimension
+        merged = attn_output.reshape(*leading_shape, -1)
+        return self.output(merged), attn_weights
+
+
+class Kosmos2_5VisionLayer(GradientCheckpointingLayer):
+    """
+    One pre-norm vision transformer layer: layer norm -> self-attention -> residual, then layer norm -> MLP ->
+    residual.
+    """
+
+    def __init__(self, config: Kosmos2_5VisionConfig) -> None:
+        super().__init__()
+        self.config = config
+
+        self.attention = Kosmos2_5VisionAttention(config)
+        self.mlp = Kosmos2_5VisionMlp(config)
+        norm_kwargs = {"eps": config.layer_norm_eps}
+        self.pre_mlp_layer_norm = Kosmos2_5LayerNorm(config.hidden_size, **norm_kwargs)
+        self.pre_attention_layer_norm = Kosmos2_5LayerNorm(config.hidden_size, **norm_kwargs)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> Union[tuple[torch.Tensor, torch.Tensor], tuple[torch.Tensor]]:
+        """
+        Returns `(layer_output,)`, or `(layer_output, attention_weights)` when `output_attentions` is truthy.
+        """
+        # attention sub-block: normalize first, then add the un-normalized input back (first residual)
+        normed_input = self.pre_attention_layer_norm(hidden_states)
+        attention_output, self_attn_weights = self.attention(
+            normed_input,
+            attention_mask=attention_mask,
+            output_attentions=output_attentions,
+            **kwargs,
+        )
+        hidden_states = attention_output + hidden_states
+
+        # MLP sub-block: again normalize first, then add the residual (second residual)
+        mlp_output = self.mlp(self.pre_mlp_layer_norm(hidden_states))
+        layer_output = mlp_output + hidden_states
+
+        if output_attentions:
+            return (layer_output, self_attn_weights)
+        return (layer_output,)
+
+
+# Adapted from transformers.models.pix2struct.modeling_pix2struct.Pix2StructVisionEncoder with Pix2Struct->Kosmos2_5
+class Kosmos2_5VisionEncoder(nn.Module):
+    """
+    Vision encoder: `config.num_hidden_layers` stacked [`Kosmos2_5VisionLayer`] modules applied one after another.
+    """
+
+    def __init__(self, config: Kosmos2_5VisionConfig) -> None:
+        super().__init__()
+        self.config = config
+        self.layer = nn.ModuleList([Kosmos2_5VisionLayer(config) for _ in range(config.num_hidden_layers)])
+        self.gradient_checkpointing = False
+
+    def _prepare_attention_mask(self, attention_mask, input_shape, inputs_embeds):
+        """
+        Convert the incoming mask into the form handed to the layers.
+
+        Args:
+            attention_mask: the mask passed to `forward`, or `None`.
+            input_shape: leading shape of the hidden states; its last entry is used as the target length.
+            inputs_embeds: tensor whose dtype and device the expanded mask is aligned with.
+
+        Returns:
+            `None` when no mask was given. For `flash_attention_2`, the mask itself if it contains at least one
+            `0.0` entry and `None` otherwise. For every other implementation, the mask expanded by `_expand_mask`.
+        """
+        # `forward` allows `attention_mask=None`; there is nothing to expand in that case, whatever the backend.
+        if attention_mask is None:
+            return None
+
+        if self.config._attn_implementation == "flash_attention_2":
+            return attention_mask if 0.0 in attention_mask else None
+
+        # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+        expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1])
+        return expanded_attn_mask.to(inputs_embeds.device)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> BaseModelOutput:
+        """
+        Run `hidden_states` through every layer.
+
+        Returns a `BaseModelOutput` with the final hidden state, plus, when requested, the hidden states before
+        each layer and after the last one (`output_hidden_states`) and the per-layer attention weights
+        (`output_attentions`). Fields that were not requested are `None`.
+        """
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attentions = () if output_attentions else None
+
+        attention_mask = self._prepare_attention_mask(attention_mask, hidden_states.shape[:2], hidden_states)
+
+        for layer_module in self.layer:
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)
+
+            layer_outputs = layer_module(hidden_states, attention_mask, output_attentions, **kwargs)
+            hidden_states = layer_outputs[0]
+
+            if output_attentions:
+                all_self_attentions = all_self_attentions + (layer_outputs[1],)
+
+        # also record the output of the last layer
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+
+        return BaseModelOutput(
+            last_hidden_state=hidden_states,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attentions,
+        )
+
+
+# Copied from transformers.models.kosmos2.modeling_kosmos2.Kosmos2TextSinusoidalPositionalEmbedding with Kosmos2->Kosmos2_5
+class Kosmos2_5TextSinusoidalPositionalEmbedding(nn.Module):
+    """This module produces sinusoidal positional embeddings of any length."""
+
+    # Copied from transformers.models.m2m_100.modeling_m2m_100.M2M100SinusoidalPositionalEmbedding.__init__
+    def __init__(self, num_positions: int, embedding_dim: int, padding_idx: Optional[int] = None):
+        super().__init__()
+        self.offset = 2
+        self.embedding_dim = embedding_dim
+        self.padding_idx = padding_idx
+        # the table holds `offset` more rows than `num_positions`
+        self.make_weights(num_positions + self.offset, embedding_dim, padding_idx)
+
+    # Copied from transformers.models.m2m_100.modeling_m2m_100.M2M100SinusoidalPositionalEmbedding.make_weights
+    def make_weights(self, num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None):
+        """(Re)build the embedding table and store it as the non-persistent buffer `weights`."""
+        table = self.get_embedding(num_embeddings, embedding_dim, padding_idx)
+        if hasattr(self, "weights"):
+            # when rebuilding, stay on the dtype and device of the table being replaced
+            previous = self.weights
+            table = table.to(dtype=previous.dtype, device=previous.device)
+
+        self.register_buffer("weights", table, persistent=False)
+
+    @staticmethod
+    # Copied from transformers.models.m2m_100.modeling_m2m_100.M2M100SinusoidalPositionalEmbedding.get_embedding
+    def get_embedding(num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None):
+        """
+        Build sinusoidal embeddings.
+
+        This matches the implementation in tensor2tensor, but differs slightly from the description in Section 3.5 of
+        "Attention Is All You Need".
+
+        Returns a `(num_embeddings, embedding_dim)` tensor in the default dtype: all sines first, then all cosines,
+        a trailing zero column when `embedding_dim` is odd, and an all-zero row at `padding_idx` if one is given.
+        """
+        half_dim = embedding_dim // 2
+        log_step = math.log(10000) / (half_dim - 1)
+        frequencies = torch.exp(torch.arange(half_dim, dtype=torch.int64).float() * -log_step)
+        positions = torch.arange(num_embeddings, dtype=torch.int64).float()
+        angles = positions.unsqueeze(1) * frequencies.unsqueeze(0)
+        table = torch.cat([torch.sin(angles), torch.cos(angles)], dim=1).view(num_embeddings, -1)
+
+        if embedding_dim % 2 == 1:
+            # zero pad
+            zero_column = torch.zeros(num_embeddings, 1)
+            table = torch.cat([table, zero_column], dim=1)
+        if padding_idx is not None:
+            table[padding_idx, :] = 0
+
+        return table.to(torch.get_default_dtype())
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        past_key_values_length: int = 0,
+        position_ids: Optional[torch.Tensor] = None,
+    ):
+        """
+        Look up the positional embedding of every token.
+
+        Batch size and sequence length come from `input_ids` when given, otherwise from `inputs_embeds`. Missing
+        `position_ids` are derived from whichever of the two is used. Returns a detached
+        `(batch_size, sequence_length, embedding_dim)` tensor.
+        """
+        from_ids = input_ids is not None
+        if from_ids:
+            bsz, seq_len = input_ids.size()
+        else:
+            bsz, seq_len = inputs_embeds.size()[:-1]
+
+        if position_ids is None:
+            if from_ids:
+                # Create the position ids from the input token ids. Any padded tokens remain padded.
+                position_ids = create_position_ids_from_input_ids(
+                    input_ids, self.padding_idx, past_key_values_length
+                ).to(input_ids.device)
+            else:
+                position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds, past_key_values_length)
+
+        # expand embeddings if needed
+        required_rows = self.padding_idx + 1 + seq_len + past_key_values_length
+        if required_rows > self.weights.size(0):
+            self.make_weights(required_rows + self.offset, self.embedding_dim, self.padding_idx)
+
+        gathered = self.weights.index_select(0, position_ids.view(-1))
+        return gathered.view(bsz, seq_len, self.weights.shape[-1]).detach()
+
+    # Copied from transformers.models.m2m_100.modeling_m2m_100.M2M100SinusoidalPositionalEmbedding.create_position_ids_from_inputs_embeds
+    def create_position_ids_from_inputs_embeds(self, inputs_embeds, past_key_values_length):
+        """
+        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.
+
+        Args:
+            inputs_embeds: torch.Tensor
+            past_key_values_length: offset added to every generated position id
+
+        Returns: torch.Tensor
+        """
+        batch_and_seq = inputs_embeds.size()[:-1]
+        sequence_length = batch_and_seq[1]
+
+        # positions start right after the padding index
+        first_position = self.padding_idx + 1
+        position_ids = torch.arange(
+            first_position, first_position + sequence_length, dtype=torch.long, device=inputs_embeds.device
+        )
+        return position_ids.unsqueeze(0).expand(batch_and_seq).contiguous() + past_key_values_length
+
+
+# Copied from transformers.models.kosmos2.modeling_kosmos2.Kosmos2TextFFN with Kosmos2->Kosmos2_5
+class Kosmos2_5TextFFN(nn.Module):
+    """
+    Feed-forward block of the text decoder: `fc1` -> activation -> dropout -> layer norm -> `fc2` -> dropout.
+    """
+
+    def __init__(self, config: Kosmos2_5TextConfig):
+        super().__init__()
+
+        self.dropout = config.dropout
+        self.activation_fn = ACT2FN[config.activation_function]
+        self.activation_dropout = config.activation_dropout
+
+        embed_dim, ffn_dim = config.embed_dim, config.ffn_dim
+        self.fc1 = nn.Linear(embed_dim, ffn_dim)
+        self.fc2 = nn.Linear(ffn_dim, embed_dim)
+
+        self.ffn_layernorm = nn.LayerNorm(ffn_dim, eps=config.layer_norm_eps)
+
+    def forward(self, hidden_states):
+        apply_dropout = nn.functional.dropout
+
+        # expand, activate and regularize in the wider `ffn_dim` space
+        inner = self.activation_fn(self.fc1(hidden_states))
+        inner = apply_dropout(inner, p=self.activation_dropout, training=self.training)
+
+        # normalize before projecting back down to `embed_dim`
+        projected = self.fc2(self.ffn_layernorm(inner))
+        return apply_dropout(projected, p=self.dropout, training=self.training)
+
+
+class Kosmos2_5TextAttention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+    def __init__(
+        self,
+        config,
+        embed_dim: int,
+        num_heads: int,
+        dropout: float = 0.0,
+        is_decoder: bool = False,
+        bias: bool = True,
+        is_causal=True,
+        layer_idx: Optional[int] = None,
+    ):
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+        self.embed_dim = embed_dim
+        self.num_heads = num_heads
+        self.dropout = dropout
+        self.is_decoder = is_decoder
+        self.is_causal = is_causal
+
+        # every head gets an equal slice of the embedding, so the split has to be exact
+        self.head_dim = embed_dim // num_heads
+        if (self.head_dim * num_heads) != self.embed_dim:
+            raise ValueError(
+                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
+                f" and `num_heads`: {num_heads})."
+            )
+        self.scaling = self.head_dim**-0.5
+
+        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,  # text part
+        encoder_hidden_states: Optional[torch.Tensor] = None,  # image part
+        attention_mask: Optional[torch.Tensor] = None,
+        past_key_value: Optional[Cache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
+        """
+        Attend from `hidden_states` to `encoder_hidden_states` when those are given (cross-attention), otherwise to
+        `hidden_states` itself. Returns the two-element tuple `(attn_output, attn_weights)`.
+        """
+        input_shape = hidden_states.shape[:-1]
+        query_shape = (*input_shape, -1, self.head_dim)
+
+        # keys and values come from the encoder states in cross-attention, from the input itself otherwise
+        kv_source = hidden_states if encoder_hidden_states is None else encoder_hidden_states
+        kv_shape = (*kv_source.shape[:-1], -1, self.head_dim)
+
+        key_states = self.k_proj(kv_source).view(kv_shape).transpose(1, 2)
+        value_states = self.v_proj(kv_source).view(kv_shape).transpose(1, 2)
+        query_states = self.q_proj(hidden_states).view(query_shape).transpose(1, 2)
+
+        # The scale is folded into the queries here, which is why `scaling=1.0` is passed further down.
+        query_states = self.scaling * query_states
+
+        if past_key_value is not None:
+            # sin and cos are specific to RoPE models; cache_position needed for the static cache
+            key_states, value_states = past_key_value.update(
+                key_states, value_states, self.layer_idx, {"cache_position": cache_position}
+            )
+
+        # Pick the attention backend. SDPA cannot return attention weights, so that combination falls back to eager.
+        implementation = self.config._attn_implementation
+        if implementation == "eager":
+            attention_interface: Callable = eager_attention_forward
+        elif implementation == "sdpa" and kwargs.get("output_attentions", False):
+            logger.warning_once(
+                "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
+                'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
+            )
+            attention_interface = eager_attention_forward
+        else:
+            attention_interface = ALL_ATTENTION_FUNCTIONS[implementation]
+
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            dropout=self.dropout if self.training else 0.0,
+            scaling=1.0,  # We don't use `self.scaling` as it's already applied to `query_states` above .
+            **kwargs,
+        )
+
+        # merge the heads back into one feature axis before the output projection
+        merged = attn_output.reshape(*input_shape, -1).contiguous()
+        return self.out_proj(merged), attn_weights
+
+
+class Kosmos2_5TextBlock(GradientCheckpointingLayer):
+    """
+    One pre-norm decoder layer of the text model: causal self-attention followed by the feed-forward network, each
+    wrapped in a residual connection.
+    """
+
+    def __init__(self, config: Kosmos2_5TextConfig, layer_idx: int):
+        super().__init__()
+        self.embed_dim = config.embed_dim
+        self.layer_idx = layer_idx
+        self.dropout = config.dropout
+
+        attention_kwargs = {
+            "embed_dim": self.embed_dim,
+            "num_heads": config.attention_heads,
+            "dropout": config.attention_dropout,
+            "is_decoder": True,
+            "is_causal": True,
+            "layer_idx": layer_idx,
+        }
+        self.self_attn = Kosmos2_5TextAttention(config, **attention_kwargs)
+        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+        self.ffn = Kosmos2_5TextFFN(config)
+        self.final_layer_norm = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+
+    # Adapted from transformers.models.kosmos2.modeling_kosmos2.Kosmos2TextBlock.forward
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        past_key_value: Optional[Cache] = None,
+        output_attentions: Optional[bool] = False,
+        use_cache: Optional[bool] = True,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
+        """
+        Returns `(hidden_states,)`, or `(hidden_states, self_attn_weights)` when `output_attentions` is truthy.
+        """
+        # --- self-attention sub-block (normalize, attend, dropout, residual) ---
+        attn_input = self.self_attn_layer_norm(hidden_states)
+        attn_output, self_attn_weights = self.self_attn(
+            hidden_states=attn_input,
+            attention_mask=attention_mask,
+            past_key_value=past_key_value,
+            output_attentions=output_attentions,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            **kwargs,
+        )
+        attn_output = nn.functional.dropout(attn_output, p=self.dropout, training=self.training)
+        hidden_states = hidden_states + attn_output
+
+        # --- feed-forward sub-block (normalize, FFN, residual) ---
+        ffn_output = self.ffn(self.final_layer_norm(hidden_states))
+        hidden_states = hidden_states + ffn_output
+
+        if output_attentions:
+            return (hidden_states, self_attn_weights)
+        return (hidden_states,)
+
+
+# Adapted from transformers.models.kosmos2.modeling_kosmos2.Kosmos2TextTransformer with Kosmos2->Kosmos2_5
+class Kosmos2_5TextTransformer(nn.Module):
+    """
+    Transformer decoder consisting of `config.layers` layers. Each layer is a [`Kosmos2_5TextBlock`].
+    There is no cross-attention here.
+    Args:
+        config: Kosmos2_5TextConfig
+    """
+
+    def __init__(self, config: Kosmos2_5TextConfig):
+        super().__init__()
+        self.config = config
+        self.dropout = config.dropout
+        self.layerdrop = config.layerdrop
+
+        embed_dim = config.embed_dim
+        pad_token_id = config.pad_token_id
+
+        # the scale factor is sqrt(embed_dim) only when `config.scale_embedding` is set, otherwise it is neutral
+        if config.scale_embedding:
+            self.embed_scale = math.sqrt(embed_dim)
+        else:
+            self.embed_scale = 1.0
+        self.embed_tokens = nn.Embedding(config.vocab_size, embed_dim, padding_idx=pad_token_id)
+
+        self.embed_positions = Kosmos2_5TextSinusoidalPositionalEmbedding(
+            num_positions=config.max_position_embeddings,
+            embedding_dim=embed_dim,
+            padding_idx=pad_token_id,
+        )
+
+        # Ignore copy
+        self.segment_emb = nn.Embedding(2, embed_dim)
+        decoder_blocks = [Kosmos2_5TextBlock(config, layer_idx) for layer_idx in range(config.layers)]
+        self.layers = nn.ModuleList(decoder_blocks)
+        self.layer_norm = nn.LayerNorm(embed_dim, config.layer_norm_eps)
+        self.gradient_checkpointing = False
+
+    # TODO (ydshieh): Remove this (to match Llama's code)
+    def _update_causal_mask(
+        self,
+        attention_mask: Union[torch.Tensor, "BlockMask"],
+        input_tensor: torch.Tensor,
+        cache_position: torch.Tensor,
+        past_key_values: Cache,
+        output_attentions: bool = False,
+    ):
+        """
+        Build the attention mask that is handed to the decoder layers, depending on `config._attn_implementation`.
+
+        - `flash_attention_2`: the given mask is returned unchanged if it contains at least one `0.0` entry,
+          otherwise `None` is returned.
+        - `flex_attention`: a tensor mask is converted with `make_flex_block_causal_mask`; anything else is returned
+          as is.
+        - `sdpa` (without a compilable cache and without `output_attentions`): `None` is returned when
+          `AttentionMaskConverter._ignore_causal_mask_sdpa` reports that the mask can be dropped.
+        - otherwise: a 4D mask is built with `_prepare_4d_causal_attention_mask_with_cache_position`.
+
+        Args:
+            attention_mask: the mask given by the caller, or `None`.
+            input_tensor: tensor whose dtype, batch size (axis 0) and sequence length (axis 1) are used for the mask.
+            cache_position: forwarded to `_prepare_4d_causal_attention_mask_with_cache_position`.
+            past_key_values: cache that is queried for the number of already seen tokens, or `None`.
+            output_attentions: whether attention weights were requested; this disables the SDPA-specific paths.
+
+        Returns:
+            The mask to use, or `None` when no mask is needed.
+        """
+        if self.config._attn_implementation == "flash_attention_2":
+            if attention_mask is not None and (attention_mask == 0.0).any():
+                return attention_mask
+            return None
+        if self.config._attn_implementation == "flex_attention":
+            if isinstance(attention_mask, torch.Tensor):
+                attention_mask = make_flex_block_causal_mask(attention_mask)
+            return attention_mask
+
+        # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
+        # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
+        # to infer the attention mask.
+        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+        using_compilable_cache = past_key_values.is_compileable if past_key_values is not None else False
+
+        # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
+        if self.config._attn_implementation == "sdpa" and not using_compilable_cache and not output_attentions:
+            if AttentionMaskConverter._ignore_causal_mask_sdpa(
+                attention_mask,
+                inputs_embeds=input_tensor,
+                past_key_values_length=past_seen_tokens,
+                is_training=self.training,
+            ):
+                return None
+
+        dtype = input_tensor.dtype
+        sequence_length = input_tensor.shape[1]
+        # Width (key axis) of the mask: the cache's maximum shape for a compilable cache, else the width of the given
+        # mask, else the number of seen plus current tokens plus one.
+        if using_compilable_cache:
+            target_length = past_key_values.get_max_cache_shape()
+        else:
+            target_length = (
+                attention_mask.shape[-1]
+                if isinstance(attention_mask, torch.Tensor)
+                else past_seen_tokens + sequence_length + 1
+            )
+
+        # In case the provided `attention` mask is 2D, we generate a causal mask here (4D).
+        causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
+            attention_mask,
+            sequence_length=sequence_length,
+            target_length=target_length,
+            dtype=dtype,
+            cache_position=cache_position,
+            batch_size=input_tensor.shape[0],
+        )
+
+        if (
+            self.config._attn_implementation == "sdpa"
+            and attention_mask is not None
+            and attention_mask.device.type in ["cuda", "xpu", "npu"]
+            and not output_attentions
+        ):
+            # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
+            # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
+            # Details: https://github.com/pytorch/pytorch/issues/110213
+            min_dtype = torch.finfo(dtype).min
+            causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)
+
+        return causal_mask
+
+    @staticmethod
+    # TODO (ydshieh): Remove this (to match Llama's code)
+    def _prepare_4d_causal_attention_mask_with_cache_position(
+        attention_mask: torch.Tensor,
+        sequence_length: int,
+        target_length: int,
+        dtype: torch.dtype,
+        cache_position: torch.Tensor,
+        batch_size: int,
+        **kwargs,
+    ):
+        """
+        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
+        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.
+
+        Args:
+            attention_mask (`torch.Tensor`):
+                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
+                `(batch_size, 1, query_length, key_value_length)`. May also be `None`, in which case a purely
+                causal mask is built.
+            sequence_length (`int`):
+                The sequence length being processed.
+            target_length (`int`):
+                The target length: when generating with static cache, the mask should be as long as the static cache,
+                to account for the 0 padding, the part of the cache that is not filled yet.
+            dtype (`torch.dtype`):
+                The dtype to use for the 4D attention mask.
+            cache_position (`torch.Tensor`):
+                Indices depicting the position of the input sequence tokens in the sequence.
+            batch_size (`int`):
+                Batch size.
+
+        Returns:
+            `torch.Tensor`: the 4D additive mask, holding `torch.finfo(dtype).min` at masked positions and `0`
+            elsewhere when it is built here, or `attention_mask` itself when that is already 4D.
+        """
+        if attention_mask is not None and attention_mask.dim() == 4:
+            # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
+            causal_mask = attention_mask
+        else:
+            min_dtype = torch.finfo(dtype).min
+            # Start from a fully masked (sequence_length, target_length) grid.
+            causal_mask = torch.full(
+                (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device
+            )
+            if sequence_length != 1:
+                # keep the mask value only strictly above the main diagonal
+                causal_mask = torch.triu(causal_mask, diagonal=1)
+            # Zero out (i.e. allow) every key index that is not greater than the cache position of its query row.
+            causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1)
+            causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
+            if attention_mask is not None:
+                causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
+                mask_length = attention_mask.shape[-1]
+                # The sum is 0 only where the causal mask allows attention (0) and the 2D mask entry is 0 as well;
+                # presumably a 0 entry marks padding — those positions are masked out below.
+                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
+                    causal_mask.device
+                )
+                padding_mask = padding_mask == 0
+                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
+                    padding_mask, min_dtype
+                )
+
+        return causal_mask
+
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        image_embeds: Optional[torch.Tensor] = None,
+        image_embeds_position_mask: Optional[torch.Tensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> BaseModelOutputWithPast:
+        """Embed the inputs, add position and segment embeddings, then run every decoder layer and the final layer norm."""
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+
+        if (input_ids is None) ^ (inputs_embeds is not None):  # true when both or neither of the two are given
+            raise ValueError(
+                "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"
+            )
+
+        if self.gradient_checkpointing and self.training and use_cache:
+            logger.warning_once(
+                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
+            )
+            use_cache = False
+
+        # The argument `inputs_embeds` should be the one without being multiplied by `self.embed_scale`.
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+
+        # Ignore copy
+        if image_embeds is not None:
+            inputs_embeds[image_embeds_position_mask == 1] = image_embeds.to(inputs_embeds.device).view(
+                -1, image_embeds.size(-1)
+            )  # written in place, so a caller-supplied `inputs_embeds` tensor is modified as well
+
+        inputs_embeds = inputs_embeds * self.embed_scale
+
+        # embed positions
+        positions = self.embed_positions(
+            input_ids=input_ids,
+            inputs_embeds=inputs_embeds,
+            past_key_values_length=0,  # NOTE(review): always 0, even with a filled cache — presumably `position_ids` carries the offset; confirm
+            position_ids=position_ids,
+        )
+        positions = positions.to(inputs_embeds.device)
+
+        # Ignore copy
+        if image_embeds_position_mask is not None:
+            # make every not equal 0 be 1
+            image_embeds_position_mask = image_embeds_position_mask.ne(0).long()
+            segment_embeds = self.segment_emb(image_embeds_position_mask).to(positions.device)
+            positions += segment_embeds
+        else:
+            # add zero embedding for padding tokens
+            bsz, seq_len, dim = positions.size()
+            zero_emb = self.segment_emb(
+                torch.zeros((bsz, 1), dtype=torch.long, device=self.segment_emb.weight.device)
+            ).to(positions.device)
+            positions += zero_emb
+
+        hidden_states = inputs_embeds + positions
+
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+
+        if use_cache and past_key_values is None:
+            past_key_values = DynamicCache(config=self.config)
+
+        if cache_position is None:
+            # default positions continue right after whatever the cache already holds
+            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+            cache_position = torch.arange(
+                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
+            )
+
+        causal_mask = self._update_causal_mask(
+            attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
+        )
+
+        # decoder layers (dropout was applied once to the summed embeddings above and is not repeated here)
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+
+        for decoder_layer in self.layers:
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+
+            layer_outputs = decoder_layer(
+                hidden_states,
+                attention_mask=causal_mask,
+                past_key_value=past_key_values,
+                output_attentions=output_attentions,
+                use_cache=use_cache,
+                cache_position=cache_position,
+                **kwargs,
+            )
+            hidden_states = layer_outputs[0]
+
+            if output_attentions:
+                all_self_attns += (layer_outputs[1],)
+
+        # add final layer norm
+        hidden_states = self.layer_norm(hidden_states)
+
+        # add hidden states from the last decoder layer
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+
+        output = BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values if use_cache else None,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+        )
+        return output
+
+
+class Kosmos2_5ImageToTextProjection(nn.Module):
+    """The layer that transforms the image model's output to part of the text model's input (namely, image features)"""
+
+    def __init__(self, config: Kosmos2_5Config):
+        super().__init__()
+        self.dense = nn.Linear(config.vision_config.hidden_size, config.text_config.embed_dim)  # vision width -> text width
+        self.latent_query = nn.Parameter(torch.randn(config.latent_query_num, config.text_config.embed_dim))  # trainable queries
+
+        # Ignore copy
+        self.x_attn = Kosmos2_5TextAttention(
+            config.text_config,
+            config.text_config.embed_dim,
+            config.text_config.attention_heads,
+            dropout=config.text_config.attention_dropout,
+            is_decoder=False,
+            is_causal=False,
+        )
+
+    def forward(self, features):
+        hidden_states = self.dense(features)  # project the image features to the text embedding width
+
+        # shape = [batch, latent_query_num, h_dim]
+        latent_query = self.latent_query.unsqueeze(0).expand(hidden_states.size(0), -1, -1)
+        key_value_states = torch.cat([hidden_states, latent_query], dim=1)  # projected features followed by the latent queries
+
+        hidden_states, attn_weights = self.x_attn(
+            hidden_states=latent_query,
+            encoder_hidden_states=key_value_states,
+            past_key_value=None,
+            attention_mask=None,
+            output_attentions=None,
+            is_causal=False,
+        )  # latent queries go in as the query input, `key_value_states` as `encoder_hidden_states`
+
+        return hidden_states, attn_weights  # both values are returned exactly as produced by `x_attn`
+
+
+class Kosmos2_5PreTrainedModel(PreTrainedModel):
+    """
+    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+    models.
+    """
+
+    config_class = Kosmos2_5Config
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["Kosmos2_5VisionLayer", "Kosmos2_5TextBlock"]
+    _supports_flash_attn_2 = True
+    _supports_cache_class = True
+    _supports_sdpa = True
+    _supports_attention_backend = True
+
+    def _init_weights(self, module):
+        """Initialize the weights of `module`, with a std taken from the config of the concrete model class of `self`."""
+        if isinstance(self, Kosmos2_5VisionModel):
+            init_factor = self.config.initializer_factor
+            std = self.config.initializer_range * init_factor
+        elif isinstance(self, (Kosmos2_5TextModel, Kosmos2_5TextForCausalLM)):
+            std = self.config.init_std
+        elif isinstance(self, (Kosmos2_5Model, Kosmos2_5ForConditionalGeneration)):
+            std = self.config.text_config.init_std  # NOTE(review): no `else` branch — `std` stays unbound for any other subclass
+        if isinstance(module, nn.Linear):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()  # the padding row is reset to zeros
+        elif isinstance(module, (nn.LayerNorm, Kosmos2_5LayerNorm)):
+            module.weight.data.fill_(1.0)
+            if getattr(module, "bias", None) is not None:  # `getattr` tolerates norm modules without a `bias` attribute
+                module.bias.data.zero_()
+        elif isinstance(module, Kosmos2_5ImageToTextProjection):
+            module.latent_query.data.normal_(mean=0.0, std=1.0)  # unit normal, independent of `std`
+
+
+class Kosmos2_5VisionModel(Kosmos2_5PreTrainedModel):
+    config_class = Kosmos2_5VisionConfig
+
+    # Copied from transformers.models.pix2struct.modeling_pix2struct.Pix2StructVisionModel.__init__ with Pix2Struct->Kosmos2_5
+    def __init__(self, config: Kosmos2_5VisionConfig):
+        super().__init__(config)
+        self.config = config
+
+        self.embeddings = Kosmos2_5VisionEmbeddings(config)
+        self.encoder = Kosmos2_5VisionEncoder(config)
+
+        self.layernorm = Kosmos2_5LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    # Copied from transformers.models.pix2struct.modeling_pix2struct.Pix2StructVisionModel.get_input_embeddings
+    def get_input_embeddings(self):
+        return self.embeddings.patch_projection
+
+    # Copied from transformers.models.pix2struct.modeling_pix2struct.Pix2StructVisionModel._prune_heads
+    def _prune_heads(self, heads_to_prune: dict[int, list[int]]) -> None:
+        """
+        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
+        class PreTrainedModel
+        """
+        for layer, heads in heads_to_prune.items():
+            self.encoder.layer[layer].attention.prune_heads(heads)
+
+    # Similar to transformers.models.pix2struct.modeling_pix2struct.Pix2StructVisionModel.forward without docstring
+    def forward(
+        self,
+        flattened_patches: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> BaseModelOutput:
+        # explicit arguments win; otherwise fall back to the values stored on the config
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        if flattened_patches is None:
+            raise ValueError("You have to specify flattened_patches")
+
+        if attention_mask is None:
+            # check where `flattened_patches` is not 0
+            attention_mask = (flattened_patches.sum(dim=-1) != 0).float()  # 1.0 where the last-dim sum is non-zero, else 0.0
+
+        embedding_output = self.embeddings(flattened_patches)
+
+        encoder_outputs = self.encoder(
+            embedding_output,
+            attention_mask=attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            **kwargs,
+        )
+        sequence_output = encoder_outputs.last_hidden_state
+        sequence_output = self.layernorm(sequence_output)  # final norm on the encoder's last hidden state
+
+        # No pooled output is computed here, hence the plain `BaseModelOutput` container.
+        return BaseModelOutput(
+            last_hidden_state=sequence_output,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+        )
+
+
+# Adapted from transformers.models.kosmos2.modeling_kosmos2.Kosmos2TextModel with KOSMOS2->KOSMOS2_5
+class Kosmos2_5TextModel(Kosmos2_5PreTrainedModel):
+    config_class = Kosmos2_5TextConfig
+
+    def __init__(self, config: Kosmos2_5TextConfig):
+        super().__init__(config)
+        self.model = Kosmos2_5TextTransformer(config)  # all computation is delegated to this wrapped module
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self) -> nn.Module:
+        return self.model.embed_tokens
+
+    def set_input_embeddings(self, value):
+        self.model.embed_tokens = value
+
+    @add_start_docstrings_to_model_forward(KOSMOS2_5_TEXT_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=BaseModelOutputWithPastAndCrossAttentions, config_class=Kosmos2_5TextConfig)
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        image_embeds: Optional[torch.Tensor] = None,
+        image_embeds_position_mask: Optional[torch.Tensor] = None,
+        past_key_values: Optional[Union[Cache, list[torch.FloatTensor]]] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> BaseModelOutputWithPastAndCrossAttentions:
+        r"""
+        Returns:
+
+        """
+        # NOTE(review): the wrapped transformer builds a `BaseModelOutputWithPast`; the annotation/decorator name another type.
+        return self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            image_embeds=image_embeds,
+            image_embeds_position_mask=image_embeds_position_mask,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            position_ids=position_ids,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            cache_position=cache_position,
+            **kwargs,
+        )
+
+
+@add_start_docstrings(
+    """
+    KOSMOS-2.5 Model for generating text and image features. The model consists of a vision encoder and a language model.
+    """,
+    KOSMOS2_5_START_DOCSTRING,
+)
+class Kosmos2_5Model(Kosmos2_5PreTrainedModel):
+    config_class = Kosmos2_5Config
+
+    def __init__(self, config: Kosmos2_5Config):
+        super().__init__(config)
+
+        self.text_model = Kosmos2_5TextModel._from_config(config.text_config)
+        self.vision_model = Kosmos2_5VisionModel._from_config(config.vision_config)
+        self.image_to_text_projection = Kosmos2_5ImageToTextProjection(config)  # maps vision features into the text embedding space
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self) -> nn.Module:
+        return self.text_model.model.embed_tokens
+
+    def set_input_embeddings(self, value):
+        self.text_model.model.embed_tokens = value
+
+    @can_return_tuple
+    @add_start_docstrings_to_model_forward(KOSMOS2_5_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=Kosmos2_5ModelOutput, config_class=_CONFIG_FOR_DOC)
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        flattened_patches: Optional[torch.Tensor] = None,
+        width: Optional[torch.Tensor] = None,
+        height: Optional[torch.Tensor] = None,
+        image_embeds_position_mask: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        past_key_values: Optional[Union[Cache, list[torch.FloatTensor]]] = None,
+        image_embeds: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> Kosmos2_5ModelOutput:
+        r"""
+        Returns:
+
+        Examples:
+
+        ```python
+        >>> from PIL import Image
+        >>> import requests
+        >>> from transformers import AutoProcessor, Kosmos2_5Model
+
+        >>> model = Kosmos2_5Model.from_pretrained("microsoft/kosmos2.5")
+        >>> processor = AutoProcessor.from_pretrained("microsoft/kosmos2.5")
+
+        >>> url = "https://huggingface.co/microsoft/kosmos2.5/resolve/main/snowman.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> text = (
+        ...     "<grounding> An image of<phrase> a snowman</phrase><object><patch_index_0044><patch_index_0863>"
+        ...     "</object> warming himself by<phrase> a fire</phrase><object><patch_index_0005><patch_index_0911>"
+        ...     "</object>"
+        ... )
+
+        >>> inputs = processor(text=text, images=image, return_tensors="pt", add_eos_token=True)
+
+        >>> last_hidden_state = model(
+        ...     flattened_patches=inputs["flattened_patches"],
+        ...     input_ids=inputs["input_ids"],
+        ...     attention_mask=inputs["attention_mask"],
+        ...     image_embeds_position_mask=inputs["image_embeds_position_mask"],
+        ... ).last_hidden_state
+        >>> list(last_hidden_state.shape)
+        [1, 91, 2048]
+        ```"""  # NOTE(review): example looks adapted from Kosmos-2 (repo id, snowman image, prompt, shape) — verify it runs
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+
+        vision_model_output = None
+        projection_attentions = None
+        if image_embeds is None:  # the vision tower only runs when no precomputed `image_embeds` are supplied
+            if flattened_patches is not None:
+                vision_model_output = self.vision_model(
+                    flattened_patches=flattened_patches,
+                    output_attentions=output_attentions,
+                    output_hidden_states=output_hidden_states,
+                    **kwargs,
+                )
+                # normalized features
+                image_embeds = nn.functional.normalize(vision_model_output.last_hidden_state, dim=-1)
+                image_embeds, projection_attentions = self.image_to_text_projection(image_embeds)
+
+        outputs = self.text_model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            image_embeds=image_embeds,
+            image_embeds_position_mask=image_embeds_position_mask,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            position_ids=position_ids,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            cache_position=cache_position,
+            **kwargs,
+        )
+
+        return Kosmos2_5ModelOutput(
+            last_hidden_state=outputs.last_hidden_state,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            width=width,  # `width` / `height` are passed through to the output untouched
+            height=height,
+            image_embeds=image_embeds,
+            projection_attentions=projection_attentions,
+            vision_model_output=vision_model_output,
+        )
+
+
+@add_start_docstrings(
+    """
+    The text model from KOSMOS-2.5 with a language modeling head on top (linear layer with weights tied to the input
+    embeddings).
+    """,
+    KOSMOS2_5_START_DOCSTRING,
+)
+class Kosmos2_5TextForCausalLM(Kosmos2_5PreTrainedModel):
+    config_class = Kosmos2_5TextConfig
+    _tied_weights_keys = ["lm_head.weight"]
+
+    def __init__(self, config: Kosmos2_5TextConfig):
+        super().__init__(config)
+
+        self.model = Kosmos2_5TextTransformer(config)
+        self.lm_head = nn.Linear(in_features=config.embed_dim, out_features=config.vocab_size, bias=False)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self) -> nn.Module:
+        return self.model.embed_tokens
+
+    def set_input_embeddings(self, value):
+        self.model.embed_tokens = value
+
+    def get_output_embeddings(self) -> nn.Module:
+        return self.lm_head
+
+    def set_output_embeddings(self, new_embeddings):
+        self.lm_head = new_embeddings
+
+    @add_start_docstrings_to_model_forward(KOSMOS2_5_TEXT_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=Kosmos2_5TextConfig)
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        image_embeds: Optional[torch.Tensor] = None,
+        image_embeds_position_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        past_key_values: Optional[Union[Cache, list[torch.FloatTensor]]] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> CausalLMOutputWithCrossAttentions:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
+            `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
+            ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
+
+        Returns:
+
+        """
+        if labels is not None:
+            if use_cache:
+                logger.warning("The `use_cache` argument is changed to `False` since `labels` is provided.")
+            use_cache = False  # caching is switched off whenever `labels` are given
+
+        outputs = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            image_embeds=image_embeds,
+            image_embeds_position_mask=image_embeds_position_mask,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            position_ids=position_ids,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            **kwargs,
+        )
+        lm_logits = self.lm_head(outputs.last_hidden_state)
+
+        lm_loss = None
+        if labels is not None:
+            # move labels to correct device to enable model parallelism
+            labels = labels.to(lm_logits.device)
+            lm_loss = self.loss_function(
+                lm_logits,
+                labels,
+                vocab_size=self.config.vocab_size,
+                **kwargs,
+            )
+
+        return CausalLMOutputWithCrossAttentions(
+            loss=lm_loss,
+            logits=lm_logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+    def prepare_inputs_for_generation(
+        self,
+        input_ids,
+        image_embeds=None,
+        image_embeds_position_mask=None,
+        past_key_values=None,
+        attention_mask=None,
+        use_cache=None,
+        cache_position=None,
+        position_ids=None,
+        **model_kwargs,
+    ):
+        input_shape = input_ids.shape
+        # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly
+        if attention_mask is None:
+            attention_mask = input_ids.new_ones(input_shape)
+
+        position_ids = None  # NOTE(review): discards any `position_ids` passed by the caller — confirm this is intended
+
+        # cut input_ids if past_key_values is used
+        if past_key_values is not None:
+            position_ids = create_position_ids_from_input_ids(
+                input_ids,
+                padding_idx=self.config.pad_token_id,
+                past_key_values_length=0,
+            )[:, -cache_position.shape[0] :]  # NOTE(review): needs `cache_position`, although the parameter defaults to `None`
+
+            input_ids = input_ids[:, -cache_position.shape[0] :]  # keep only the last `len(cache_position)` tokens
+            # the image info. is already encoded into the past keys/values
+            if past_key_values.get_seq_length() > 0:
+                image_embeds = None
+                image_embeds_position_mask = None
+        elif image_embeds_position_mask is not None:
+            # appending `False` to `image_embeds_position_mask` (because `input_ids` grows during generation)
+            batch_size, seq_len = input_ids.size()
+            mask_len = image_embeds_position_mask.size()[-1]
+            image_embeds_position_mask = torch.cat(
+                (
+                    image_embeds_position_mask,
+                    torch.zeros(size=(batch_size, seq_len - mask_len), dtype=torch.bool, device=input_ids.device),
+                ),
+                dim=1,
+            )
+
+        return {  # `cache_position` and `model_kwargs` are not part of the returned dict
+            "input_ids": input_ids,
+            "image_embeds": image_embeds,
+            "image_embeds_position_mask": image_embeds_position_mask,
+            "past_key_values": past_key_values,
+            "attention_mask": attention_mask,
+            "position_ids": position_ids,
+            "use_cache": use_cache,
+        }
+
+
+@add_start_docstrings(
+    """
+    KOSMOS-2.5 Model for generating text and bounding boxes given an image. The model consists of a vision encoder and a
+    language model.
+    """,
+    KOSMOS2_5_START_DOCSTRING,
+)
+class Kosmos2_5ForConditionalGeneration(Kosmos2_5PreTrainedModel, GenerationMixin):
+    config_class = Kosmos2_5Config
+    _tied_weights_keys = ["text_model.lm_head.weight"]
+
+    def __init__(self, config: Kosmos2_5Config):
+        super().__init__(config)
+        self.text_model = Kosmos2_5TextForCausalLM(config.text_config)
+        self.vision_model = Kosmos2_5VisionModel(config.vision_config)
+        self.image_to_text_projection = Kosmos2_5ImageToTextProjection(config)
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self) -> nn.Module:
+        return self.text_model.model.embed_tokens
+
+    def set_input_embeddings(self, value):
+        self.text_model.model.embed_tokens = value
+
+    def get_output_embeddings(self) -> nn.Module:
+        return self.text_model.get_output_embeddings()
+
+    def set_output_embeddings(self, new_embeddings):
+        self.text_model.set_output_embeddings(new_embeddings)
+
+    @can_return_tuple
+    @add_start_docstrings_to_model_forward(KOSMOS2_5_INPUTS_DOCSTRING)
+    @replace_return_docstrings(
+        output_type=Kosmos2_5ForConditionalGenerationModelOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        flattened_patches: Optional[torch.Tensor] = None,
+        width: Optional[torch.Tensor] = None,
+        height: Optional[torch.Tensor] = None,
+        image_embeds_position_mask: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        past_key_values: Optional[Union[Cache, list[torch.FloatTensor]]] = None,
+        image_embeds: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> Kosmos2_5ForConditionalGenerationModelOutput:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
+            `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
+            ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
+
+        Returns:
+
+        Examples:
+
+        ```python
+        >>> from PIL import Image
+        >>> import requests
+        >>> import torch
+        >>> from transformers import AutoProcessor, Kosmos2_5ForConditionalGeneration
+
+        >>> repo = "microsoft/kosmos-2.5"
+        >>> device = "cuda:0"
+        >>> dtype = torch.bfloat16 # torch.float16
+        >>> model = Kosmos2_5ForConditionalGeneration.from_pretrained(repo, device_map=device, dtype=dtype)
+        >>> processor = AutoProcessor.from_pretrained(repo)
+
+        >>> url = "https://huggingface.co/microsoft/kosmos-2.5/resolve/main/receipt_00008.png"
+
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> prompt = "<ocr>" # <md>
+
+        >>> inputs = processor(text=prompt, images=image, return_tensors="pt")
+        >>> height, width = inputs.pop("height"), inputs.pop("width")
+        >>> inputs = {k: v.to(device) if v is not None else None for k, v in inputs.items()}
+        >>> inputs["flattened_patches"] = inputs["flattened_patches"].to(dtype)
+
+        >>> generated_ids = model.generate(**inputs, max_new_tokens=1024)
+        >>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+        >>> generated_text
+        '<ocr><bbox><x_53><y_573><x_69><y_606></bbox>1\n<bbox><x_79><y_573><x_464><y_612></bbox>[REG] BLACK SAKURA\n<bbox><x_690><y_569><x_810><y_606></bbox>45,455\n<bbox><x_53><y_614><x_69><y_648></bbox>1\n<bbox><x_79><y_614><x_468><y_650></bbox>COOKIE DOH SAUCES\n<bbox><x_788><y_609><x_812><y_644></bbox>0\n<bbox><x_50><y_658><x_69><y_693></bbox>1\n<bbox><x_79><y_658><x_358><y_693></bbox>NATA DE COCO\n<bbox><x_790><y_652><x_814><y_687></bbox>0\n<bbox><x_31><y_742><x_820><y_781></bbox>Sub Total 45,455\n<bbox><x_27><y_781><x_822><y_827></bbox>PB1 (10%) 4,545\n<bbox><x_27><y_826><x_824><y_872></bbox>Rounding 0\n<bbox><x_24><y_872><x_827><y_921></bbox>Total 50,000\n<bbox><x_17><y_1056><x_836><y_1108></bbox>Card Payment 50,000\n'
+        ```"""
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+
+        vision_model_output = None
+        projection_attentions = None
+
+        if image_embeds is None:  # the vision tower only runs when no precomputed `image_embeds` are supplied
+            if flattened_patches is not None:
+                vision_model_output = self.vision_model(
+                    flattened_patches=flattened_patches,
+                    output_attentions=output_attentions,
+                    output_hidden_states=output_hidden_states,
+                    **kwargs,
+                )
+                image_embeds = nn.functional.normalize(vision_model_output.last_hidden_state, dim=-1)
+                image_embeds, projection_attentions = self.image_to_text_projection(image_embeds)
+
+        lm_outputs = self.text_model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            image_embeds=image_embeds,
+            image_embeds_position_mask=image_embeds_position_mask,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            position_ids=position_ids,
+            labels=labels,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            **kwargs,
+        )
+
+        return Kosmos2_5ForConditionalGenerationModelOutput(
+            loss=lm_outputs.loss,
+            logits=lm_outputs.logits,
+            past_key_values=lm_outputs.past_key_values,
+            hidden_states=lm_outputs.hidden_states,
+            attentions=lm_outputs.attentions,
+            width=width,  # `width` / `height` are passed through to the output untouched
+            height=height,
+            image_embeds=image_embeds,
+            projection_attentions=projection_attentions,
+            vision_model_output=vision_model_output,
+        )
+
+    def prepare_inputs_for_generation(
+        self,
+        input_ids,
+        flattened_patches=None,
+        image_embeds=None,
+        image_embeds_position_mask=None,
+        past_key_values=None,
+        attention_mask=None,
+        use_cache=None,
+        cache_position=None,
+        position_ids=None,
+        **model_kwargs,
+    ):
+        # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
+
+        model_inputs = self.text_model.prepare_inputs_for_generation(
+            input_ids,
+            image_embeds=image_embeds,
+            image_embeds_position_mask=image_embeds_position_mask,
+            past_key_values=past_key_values,
+            attention_mask=attention_mask,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            position_ids=position_ids,
+            **model_kwargs,
+        )
+
+        # `cache_position` defaults to `None`; without it there is no cached prefix, so the image input is still needed.
+        if cache_position is None or cache_position[0] == 0:
+            # If we're in cached decoding stage, `flattened_patches` should be `None` because `input_ids` do not contain special image token anymore
+            # Otherwise we need `flattened_patches` to be passed to model
+            model_inputs["flattened_patches"] = flattened_patches
+        return model_inputs
+
+
+__all__ = [  # names exported by a star-import of this module
+    "Kosmos2_5ForConditionalGeneration",
+    "Kosmos2_5Model",
+    "Kosmos2_5PreTrainedModel",
+]
diff --git a/phivenv/Lib/site-packages/transformers/models/kosmos2_5/processing_kosmos2_5.py b/phivenv/Lib/site-packages/transformers/models/kosmos2_5/processing_kosmos2_5.py
new file mode 100644
index 0000000000000000000000000000000000000000..5e780ca2db9aeea301c164c47f3308100dd5a1fb
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/kosmos2_5/processing_kosmos2_5.py
@@ -0,0 +1,164 @@
+# coding=utf-8
+# Copyright 2024 Microsoft Research and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Processor class for Kosmos2_5.
+"""
+
+from typing import Optional, Union
+
+from ...image_processing_utils import BatchFeature
+from ...image_utils import ImageInput
+from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin, TextKwargs, Unpack
+from ...tokenization_utils_base import TextInput
+from ...utils import is_torch_available
+
+
+if is_torch_available():
+    import torch
+
+
+class Kosmos2_5ImagesKwargs(ImagesKwargs, total=False):  # image kwargs accepted by `Kosmos2_5Processor.__call__`
+    max_patches: Optional[int]  # presumably the image processor's patch budget -- TODO confirm
+    num_image_tokens: Optional[int]  # number of image placeholder tokens repeated inside the text prompt
+
+
+class Kosmos2_5ProcessorKwargs(ProcessingKwargs, total=False):  # typed kwargs merged in `Kosmos2_5Processor.__call__`
+    text_kwargs: TextKwargs
+    images_kwargs: Kosmos2_5ImagesKwargs
+    _defaults = {  # presumably applied by `_merge_kwargs` when the caller does not override them -- TODO confirm
+        "text_kwargs": {
+            "padding": True,
+            "return_token_type_ids": False,
+            "stride": 0,
+            "truncation": True,
+        },
+        "images_kwargs": {
+            "max_patches": 4096,
+            "num_image_tokens": 2048,
+        },
+        "common_kwargs": {"return_tensors": "pt"},  # `__call__` builds its position mask with torch
+    }
+
+
+class Kosmos2_5Processor(ProcessorMixin):
+    r"""
+    Constructs a Kosmos2_5 processor which wraps a PreTrainedTokenizerFast and Kosmos2_5 image processor into a single
+    processor.
+
+    [`Kosmos2_5Processor`] offers all the functionalities of [`Kosmos2_5ImageProcessor`] and [`PreTrainedTokenizerFast`]. See
+    the docstring of [`~Kosmos2_5Processor.__call__`] and [`~Kosmos2_5Processor.decode`] for more information.
+
+    Args:
+        image_processor (`Kosmos2_5ImageProcessor`):
+            An instance of [`Kosmos2_5ImageProcessor`]. The image processor is a required input.
+        tokenizer (`PreTrainedTokenizerFast`):
+            An instance of [`PreTrainedTokenizerFast`] exposing `boi_token`, `eoi_token` and `image_token`. Required.
+    """
+
+    attributes = ["image_processor", "tokenizer"]
+    image_processor_class = "AutoImageProcessor"
+    tokenizer_class = "PreTrainedTokenizerFast"
+
+    def __init__(self, image_processor, tokenizer):
+        self.image_start_token = tokenizer.boi_token  # "<image>" : fixed token for the start of image
+        self.image_end_token = tokenizer.eoi_token  # "</image>" : fixed token for the end of image
+        self.image_token = tokenizer.image_token  # "<s>" : within a <image> ... </image> pair, these <s> tokens indicate they are positions reserved for an image
+        super().__init__(image_processor, tokenizer)
+
+    def __call__(
+        self,
+        images: ImageInput = None,
+        text: Union[TextInput, list[TextInput]] = None,
+        audio=None,
+        videos=None,
+        **kwargs: Unpack[Kosmos2_5ProcessorKwargs],
+    ) -> BatchFeature:
+        """
+        This method uses [`Kosmos2_5ImageProcessor.preprocess`] method to prepare image(s) for the model, and
+        [`PreTrainedTokenizerFast.__call__`] to prepare text for the model.
+
+        `images` is mandatory. When `text` is given, every text is prefixed with the image prompt
+        (`bos`, image start token, `num_image_tokens` image tokens, image end token) before tokenization, and an
+        `image_embeds_position_mask` marking the image-token positions with `1` is returned. `audio` and `videos`
+        are accepted but unused.
+
+        Raises:
+            ValueError: If `images` is `None`.
+        """
+        # Images are mandatory, so a single check covers the "nothing was passed" case as well.
+        if images is None:
+            raise ValueError("Kosmos2_5Processor requires images to be passed.")
+
+        output_kwargs = self._merge_kwargs(
+            Kosmos2_5ProcessorKwargs,
+            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
+            **kwargs,
+        )
+        num_image_tokens = output_kwargs["images_kwargs"].setdefault("num_image_tokens", None)
+
+        encoding = BatchFeature()
+
+        image_encoding = self.image_processor(images, **output_kwargs["images_kwargs"])
+        # `rows` and `cols` are dropped from the image features before they are returned.
+        image_encoding.pop("rows")
+        image_encoding.pop("cols")
+        encoding.update(image_encoding)
+
+        if text is None:
+            return encoding
+
+        prompt = f"{self.tokenizer.bos_token}{self.image_start_token}{self.image_token * num_image_tokens}{self.image_end_token}"
+        texts = [text] if isinstance(text, str) else text
+        text_encoding = self.tokenizer([prompt + t for t in texts], **output_kwargs["text_kwargs"])
+
+        batch_size, seq_len = text_encoding.input_ids.shape
+        # Mask mirrors the prompt (assuming one id per special token): 0 bos, -1 image start/end, 1 image tokens.
+        image_embeds_position_mask = [0, -1] + [1] * num_image_tokens + [-1]
+        image_embeds_position_mask += [0] * (seq_len - len(image_embeds_position_mask))
+        image_embeds_position_mask = (
+            torch.LongTensor(image_embeds_position_mask).unsqueeze(0).repeat(batch_size, 1)
+        )
+
+        encoding.update(
+            {
+                "input_ids": text_encoding.input_ids,
+                "attention_mask": text_encoding.attention_mask,
+                "image_embeds_position_mask": image_embeds_position_mask,
+            }
+        )
+        return encoding
+
+    def batch_decode(self, *args, **kwargs):
+        """
+        This method forwards all its arguments to the wrapped tokenizer's [`~PreTrainedTokenizer.batch_decode`].
+        Please refer to the docstring of this method for more information.
+        """
+        return self.tokenizer.batch_decode(*args, **kwargs)
+
+    def decode(self, *args, **kwargs):
+        """
+        This method forwards all its arguments to the wrapped tokenizer's [`~PreTrainedTokenizer.decode`]. Please
+        refer to the docstring of this method for more information.
+        """
+        return self.tokenizer.decode(*args, **kwargs)
+
+    @property
+    def model_input_names(self):
+        """Tokenizer input names followed by image-processor input names, de-duplicated in first-seen order."""
+        names = self.tokenizer.model_input_names + self.image_processor.model_input_names
+        return list(dict.fromkeys(names))
+
+
+__all__ = ["Kosmos2_5Processor"]
diff --git a/phivenv/Lib/site-packages/transformers/models/kyutai_speech_to_text/__init__.py b/phivenv/Lib/site-packages/transformers/models/kyutai_speech_to_text/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..5823883c6cb8ba35c0964d62d4f32e5ca3d24d64
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/kyutai_speech_to_text/__init__.py
@@ -0,0 +1,29 @@
+# Copyright 2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:  # type checkers only: import the submodules' public names eagerly
+    from .configuration_kyutai_speech_to_text import *
+    from .feature_extraction_kyutai_speech_to_text import *
+    from .modeling_kyutai_speech_to_text import *
+    from .processing_kyutai_speech_to_text import *
+else:  # at runtime: replace this module's `sys.modules` entry with a `_LazyModule` instance
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/phivenv/Lib/site-packages/transformers/models/kyutai_speech_to_text/__pycache__/__init__.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/kyutai_speech_to_text/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bae8929587b3e0e1cc3e569bb3cb9cb43f65d352
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/kyutai_speech_to_text/__pycache__/__init__.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/kyutai_speech_to_text/__pycache__/configuration_kyutai_speech_to_text.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/kyutai_speech_to_text/__pycache__/configuration_kyutai_speech_to_text.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1c19a7247d1fd3e9eaf9a54331a71053c797e1bd
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/kyutai_speech_to_text/__pycache__/configuration_kyutai_speech_to_text.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/kyutai_speech_to_text/__pycache__/feature_extraction_kyutai_speech_to_text.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/kyutai_speech_to_text/__pycache__/feature_extraction_kyutai_speech_to_text.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ea47bcbb6c50f7368e56fd7f9d62ad7d03905935
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/kyutai_speech_to_text/__pycache__/feature_extraction_kyutai_speech_to_text.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/kyutai_speech_to_text/__pycache__/modeling_kyutai_speech_to_text.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/kyutai_speech_to_text/__pycache__/modeling_kyutai_speech_to_text.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bd28b18a9b56313647e47bea21ea039073f79fe2
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/kyutai_speech_to_text/__pycache__/modeling_kyutai_speech_to_text.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/kyutai_speech_to_text/__pycache__/modular_kyutai_speech_to_text.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/kyutai_speech_to_text/__pycache__/modular_kyutai_speech_to_text.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e853332c5ac92f235209dc97bb718c0e98ad1166
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/kyutai_speech_to_text/__pycache__/modular_kyutai_speech_to_text.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/kyutai_speech_to_text/__pycache__/processing_kyutai_speech_to_text.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/kyutai_speech_to_text/__pycache__/processing_kyutai_speech_to_text.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4702c93873f8b59758edad66ff943a46461d4b0c
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/kyutai_speech_to_text/__pycache__/processing_kyutai_speech_to_text.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py b/phivenv/Lib/site-packages/transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py
new file mode 100644
index 0000000000000000000000000000000000000000..8693fd66679d6e6a747eaf30627d65d8f48a0ca7
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py
@@ -0,0 +1,188 @@
+# coding=utf-8
+# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+from ..auto.configuration_auto import AutoConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+class KyutaiSpeechToTextConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`KyutaiSpeechToTextForConditionalGeneration`].
+    It is used to instantiate a Kyutai Speech-to-Text model according to the specified arguments, defining the model
+    architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the
+    2.6b-en model.
+
+    e.g. [kyutai/stt-2.6b-en-trfs](https://huggingface.co/kyutai/stt-2.6b-en-trfs)
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        codebook_vocab_size (`int`, *optional*, defaults to 2049):
+            Vocabulary size of the codebook. Defines the number of different audio tokens that can be represented by each codebook.
+        vocab_size (`int`, *optional*, defaults to 4001):
+            Vocabulary size of the model. Defines the number of different tokens that can be represented by the
+            `input_ids` passed when calling the model.
+        hidden_size (`int`, *optional*, defaults to 2048):
+            Dimensionality of the layers and the pooler layer of the main decoder.
+        num_hidden_layers (`int`, *optional*, defaults to 48):
+            Number of decoder layers.
+        num_attention_heads (`int`, *optional*, defaults to 32):
+            Number of attention heads for each attention layer in the main decoder block.
+        num_key_value_heads (`int`, *optional*):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details checkout [this
+            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
+            `num_attention_heads`.
+        max_position_embeddings (`int`, *optional*, defaults to 750):
+            The maximum sequence length that this model might ever be used with. Typically, set this to something large
+            just in case (e.g., 512 or 1024 or 2048).
+        rope_theta (`float`, *optional*, defaults to 100000.0):
+            The base period of the RoPE embeddings.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the decoder.
+        head_dim (`int`, *optional*, defaults to `hidden_size // num_attention_heads`):
+            The attention head dimension.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        sliding_window (`int`, *optional*, defaults to 375):
+            Sliding window attention window size.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        ffn_dim (`int`, *optional*, defaults to 11264):
+            Dimensionality of the "intermediate" (often named feed-forward) layer in the main decoder block. Must be even.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-08):
+            The epsilon used by the rms normalization layers.
+        num_codebooks (`int`, *optional*, defaults to 32):
+            The number of audio codebooks for each audio channel.
+        audio_bos_token_id (`int`, *optional*, defaults to 2048):
+            Beginning of stream token id for codebook tokens.
+        audio_pad_token_id (`int`, *optional*, defaults to 69569):
+            Padding token id for codebook tokens.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether to tie weight embeddings.
+        pad_token_id (`int`, *optional*, defaults to 3):
+            Padding token id.
+        bos_token_id (`int`, *optional*, defaults to 48000):
+            Beginning of stream token id for text tokens.
+        codec_config (`PretrainedConfig` or `dict`, *optional*):
+            Configuration for the codec. A `dict` is expanded into `AutoConfig.for_model(**codec_config)`; when
+            `None`, the default `mimi` configuration is used.
+        kwargs (*optional*):
+            Additional keyword arguments forwarded to [`PretrainedConfig`].
+
+
+    Example:
+    ```python
+    >>> from transformers import KyutaiSpeechToTextConfig, KyutaiSpeechToTextForConditionalGeneration
+
+    >>> # Initializing a KyutaiSpeechToTextConfig
+    >>> configuration = KyutaiSpeechToTextConfig()
+
+    >>> # Initializing a model
+    >>> model = KyutaiSpeechToTextForConditionalGeneration(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "kyutai_speech_to_text"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    sub_configs = {"codec_config": AutoConfig}
+
+    def __init__(
+        self,
+        codebook_vocab_size=2049,
+        vocab_size=4001,
+        hidden_size=2048,
+        num_hidden_layers=48,
+        num_attention_heads=32,
+        num_key_value_heads=None,
+        max_position_embeddings=750,
+        rope_theta=100000.0,
+        hidden_act="silu",
+        head_dim=None,
+        initializer_range=0.02,
+        use_cache=True,
+        sliding_window=375,
+        attention_dropout=0.0,
+        ffn_dim=11264,
+        rms_norm_eps=1e-8,
+        num_codebooks=32,
+        audio_bos_token_id=2048,
+        audio_pad_token_id=69569,
+        tie_word_embeddings=False,
+        pad_token_id=3,
+        bos_token_id=48000,
+        codec_config=None,
+        **kwargs,
+    ):
+        super().__init__(
+            pad_token_id=pad_token_id, bos_token_id=bos_token_id, tie_word_embeddings=tie_word_embeddings, **kwargs
+        )
+
+        # Resolve the codec sub-config up front: `frame_size` is read from it right below.
+        if codec_config is None:
+            self.codec_config = AutoConfig.for_model("mimi")
+            logger.info("codec_config is None, using default audio encoder config.")
+        elif isinstance(codec_config, dict):
+            self.codec_config = AutoConfig.for_model(**codec_config)
+        elif isinstance(codec_config, PretrainedConfig):
+            self.codec_config = codec_config
+        else:
+            raise ValueError(f"`codec_config` must be a dict or a `PretrainedConfig`, got {type(codec_config)}.")
+
+        self.num_codebooks = num_codebooks
+        self.frame_size = self.codec_config.frame_size
+
+        self.audio_bos_token_id = audio_bos_token_id
+        self.audio_pad_token_id = audio_pad_token_id
+        self.codebook_vocab_size = codebook_vocab_size
+
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        if ffn_dim % 2 == 1:
+            raise ValueError(f"`ffn_dim={ffn_dim}` must be even.")
+        self.ffn_dim = ffn_dim
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.attention_dropout = attention_dropout
+        self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads
+        self.sliding_window = sliding_window
+
+
+__all__ = ["KyutaiSpeechToTextConfig"]
diff --git a/phivenv/Lib/site-packages/transformers/models/kyutai_speech_to_text/feature_extraction_kyutai_speech_to_text.py b/phivenv/Lib/site-packages/transformers/models/kyutai_speech_to_text/feature_extraction_kyutai_speech_to_text.py
new file mode 100644
index 0000000000000000000000000000000000000000..bde1736f9da859ea1f40a5a2728399194542c023
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/kyutai_speech_to_text/feature_extraction_kyutai_speech_to_text.py
@@ -0,0 +1,237 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_kyutai_speech_to_text.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# coding=utf-8
+# Copyright 2025 Kyutai and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Optional, Union
+
+import numpy as np
+
+from ...feature_extraction_sequence_utils import SequenceFeatureExtractor
+from ...feature_extraction_utils import BatchFeature
+from ...utils import PaddingStrategy, TensorType, logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class KyutaiSpeechToTextFeatureExtractor(SequenceFeatureExtractor):
+    r"""
+    Constructs a KyutaiSpeechToText feature extractor.
+
+    This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains
+    most of the main methods. Users should refer to this superclass for more information regarding those methods.
+
+    Args:
+        feature_size (`int`, *optional*, defaults to 1):
+            The feature dimension of the extracted features. Use 1 for mono, 2 for stereo.
+        sampling_rate (`int`, *optional*, defaults to 24000):
+            The sampling rate at which the audio waveform should be digitized, expressed in hertz (Hz).
+        padding_value (`float`, *optional*, defaults to 0.0):
+            The value that is used to fill the padding values.
+        chunk_length_s (`float`, *optional*):
+            If defined the audio is pre-processed into chunks of lengths `chunk_length_s` and then encoded.
+        overlap (`float`, *optional*):
+            Defines the overlap between each chunk. It is used to compute the `chunk_stride` using the following
+            formula: `max(1, int((1.0 - self.overlap) * self.chunk_length))`.
+        audio_delay_seconds (`float`, *optional*, defaults to 0.0):
+            The delay in seconds to add after the audio (right padding), on top of one extra second always added.
+        audio_silence_prefix_seconds (`float`, *optional*, defaults to 0.0):
+            The silence prefix in seconds to add before the audio (left padding).
+    """
+
+    model_input_names = ["input_values", "padding_mask"]
+
+    def __init__(
+        self,
+        feature_size: int = 1,
+        sampling_rate: int = 24000,
+        padding_value: float = 0.0,
+        chunk_length_s: Optional[float] = None,
+        overlap: Optional[float] = None,
+        audio_delay_seconds: Optional[float] = 0.0,
+        audio_silence_prefix_seconds: Optional[float] = 0.0,
+        **kwargs,
+    ):
+        super().__init__(feature_size=feature_size, sampling_rate=sampling_rate, padding_value=padding_value, **kwargs)
+        self.chunk_length_s = chunk_length_s
+        self.overlap = overlap
+        self.audio_delay_seconds = audio_delay_seconds
+        self.audio_silence_prefix_seconds = audio_silence_prefix_seconds
+
+    # This is a property because you might want to change the chunk_length_s on the fly
+    @property
+    def chunk_length(self) -> Optional[int]:
+        if self.chunk_length_s is None:
+            return None
+        else:
+            return int(self.chunk_length_s * self.sampling_rate)
+
+    # This is a property because you might want to change the chunk_length_s on the fly
+    @property
+    def chunk_stride(self) -> Optional[int]:
+        if self.chunk_length_s is None or self.overlap is None:
+            return None
+        else:
+            return max(1, int((1.0 - self.overlap) * self.chunk_length))
+
+    def __call__(
+        self,
+        raw_audio: Union[np.ndarray, list[float], list[np.ndarray], list[list[float]]],
+        padding: Optional[Union[bool, str, PaddingStrategy]] = None,
+        truncation: Optional[bool] = False,
+        max_length: Optional[int] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        sampling_rate: Optional[int] = None,
+    ) -> BatchFeature:
+        """
+        Main method to featurize and prepare for the model one or several sequence(s).
+
+        Args:
+            raw_audio (`np.ndarray`, `list[float]`, `list[np.ndarray]`, `list[list[float]]`):
+                The sequence or batch of sequences to be processed. Each sequence can be a numpy array, a list of float
+                values, a list of numpy arrays or a list of list of float values. The numpy array must be of shape
+                `(num_samples,)` for mono audio (`feature_size = 1`), or `(2, num_samples)` for stereo audio
+                (`feature_size = 2`).
+            padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`):
+                Select a strategy to pad the returned sequences (according to the model's padding side and padding
+                index) among:
+
+                - `True` or `'longest'` (default): Pad to the longest sequence in the batch (or no padding if only a
+                  single sequence is provided).
+                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
+                  acceptable input length for the model if that argument is not provided.
+                - `False` or `'do_not_pad'`: No padding (i.e., can output a batch with sequences of different
+                  lengths).
+            truncation (`bool`, *optional*, defaults to `False`):
+                Activates truncation to cut input sequences longer than `max_length` to `max_length`.
+            max_length (`int`, *optional*):
+                Maximum length of the returned list and optionally padding length (see above).
+            return_tensors (`str` or [`~utils.TensorType`], *optional*):
+                If set, will return tensors instead of list of python integers. Acceptable values are:
+
+                - `'tf'`: Return TensorFlow `tf.constant` objects.
+                - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                - `'np'`: Return Numpy `np.ndarray` objects.
+            sampling_rate (`int`, *optional*):
+                The sampling rate at which the `raw_audio` input was sampled. It is strongly recommended to pass
+                `sampling_rate` at the forward call to prevent silent errors.
+        """
+        if sampling_rate is not None:
+            if sampling_rate != self.sampling_rate:
+                raise ValueError(
+                    f"The model corresponding to this feature extractor: {self} was trained using a sampling rate of"
+                    f" {self.sampling_rate}. Please make sure that the provided audio input was sampled with"
+                    f" {self.sampling_rate} and not {sampling_rate}."
+                )
+        else:
+            logger.warning(
+                f"It is strongly recommended to pass the `sampling_rate` argument to `{self.__class__.__name__}()`. "
+                "Failing to do so can result in silent errors that might be hard to debug."
+            )
+
+        if padding and truncation:
+            raise ValueError("Both padding and truncation were set. Make sure you only set one.")
+        elif padding is None:
+            # by default let's pad the inputs
+            padding = True
+
+        is_batched = bool(
+            isinstance(raw_audio, (list, tuple)) and (isinstance(raw_audio[0], (np.ndarray, tuple, list)))
+        )
+
+        if is_batched:
+            raw_audio = [np.asarray(audio, dtype=np.float32).T for audio in raw_audio]
+        elif not is_batched and not isinstance(raw_audio, np.ndarray):
+            raw_audio = np.asarray(raw_audio, dtype=np.float32)
+        elif isinstance(raw_audio, np.ndarray) and raw_audio.dtype is np.dtype(np.float64):
+            raw_audio = raw_audio.astype(np.float32)
+
+        # always return batch
+        if not is_batched:
+            raw_audio = [np.asarray(raw_audio).T]
+
+        # verify inputs are valid
+        for idx, example in enumerate(raw_audio):
+            if example.ndim > 2:
+                raise ValueError(f"Expected input shape (channels, length) but got shape {example.shape}")
+            if self.feature_size == 1 and example.ndim != 1:
+                raise ValueError(f"Expected mono audio but example has {example.shape[-1]} channels")
+            if self.feature_size == 2 and example.shape[-1] != 2:
+                raise ValueError(f"Expected stereo audio but example has {example.shape[-1]} channels")
+
+        padded_inputs = None
+        input_values = BatchFeature({"input_values": raw_audio})
+        if self.chunk_stride is not None and self.chunk_length is not None and max_length is None:
+            if truncation:
+                max_length = min(array.shape[0] for array in raw_audio)
+                nb_step = int(np.floor(max_length / self.chunk_stride))
+                max_length = (nb_step - 1) * self.chunk_stride + self.chunk_length
+            elif padding:
+                max_length = max(array.shape[0] for array in raw_audio)
+                nb_step = int(np.ceil(max_length / self.chunk_stride))
+                max_length = (nb_step - 1) * self.chunk_stride + self.chunk_length
+                padding = "max_length"
+            else:
+                padded_inputs = input_values
+
+        # normal padding on batch (skipped only when chunking is configured with neither padding nor truncation)
+        if padded_inputs is None:
+            padded_inputs = self.pad(
+                input_values,
+                max_length=max_length,
+                truncation=truncation,
+                padding=padding,
+                return_attention_mask=padding,
+            )
+
+            if padding:
+                padded_inputs["padding_mask"] = padded_inputs.pop("attention_mask")  # name from `model_input_names`
+
+        # now let's pad left (silence prefix) and right (audio delay plus one extra second)
+        pad_left = int(self.audio_silence_prefix_seconds * self.sampling_rate)
+        pad_right = int((self.audio_delay_seconds + 1.0) * self.sampling_rate)
+        padded_inputs["input_values"] = np.pad(
+            padded_inputs["input_values"],
+            ((0, 0), (pad_left, pad_right)),
+            mode="constant",
+            constant_values=0.0,
+        )
+        if padding:
+            padded_inputs["padding_mask"] = np.pad(
+                padded_inputs["padding_mask"],
+                ((0, 0), (pad_left, pad_right)),
+                mode="constant",
+                constant_values=0,
+            )
+
+        input_values = []
+        for example in padded_inputs.pop("input_values"):
+            if self.feature_size == 1:
+                example = example[..., None]  # presumably so mono ends up as (1, num_samples) after `.T` -- TODO confirm
+            input_values.append(example.T)
+
+        padded_inputs["input_values"] = input_values
+        if return_tensors is not None:
+            padded_inputs = padded_inputs.convert_to_tensors(return_tensors)
+
+        return padded_inputs
+
+
+__all__ = ["KyutaiSpeechToTextFeatureExtractor"]
diff --git a/phivenv/Lib/site-packages/transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py b/phivenv/Lib/site-packages/transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py
new file mode 100644
index 0000000000000000000000000000000000000000..641eec0634d8644ccb6d616de101a30ba48a05af
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py
@@ -0,0 +1,1377 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_kyutai_speech_to_text.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# coding=utf-8
+# Copyright 2025 Kyutai and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+import types
+from typing import Optional, Union
+
+import torch
+import torch.nn as nn
+
+from ...activations import ACT2FN
+from ...cache_utils import Cache, DynamicCache, StaticCache
+from ...generation import GenerationConfig, GenerationMixin
+from ...modeling_attn_mask_utils import AttentionMaskConverter
+from ...modeling_flash_attention_utils import flash_attn_supports_top_left_mask, is_flash_attn_available
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
+from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
+from ...modeling_utils import PreTrainedModel
+from ...processing_utils import Unpack
+from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, is_torch_flex_attn_available, logging
+from ...utils.deprecation import deprecate_kwarg
+from ..auto import AutoModel
+from .configuration_kyutai_speech_to_text import KyutaiSpeechToTextConfig
+
+
+if is_flash_attn_available():
+    from ...modeling_flash_attention_utils import _flash_attention_forward
+
+if is_torch_flex_attn_available():
+    from torch.nn.attention.flex_attention import BlockMask
+
+    from ...integrations.flex_attention import make_flex_block_causal_mask
+
+
+logger = logging.get_logger(__name__)
+
+
+class KyutaiSpeechToTextRMSNorm(nn.Module):  # RMS normalization over the last dim with a learnable scale
+    def __init__(self, dim: int, eps: float = 1e-6):
+        super().__init__()
+        self.eps = eps  # added to the mean square before `rsqrt`
+        self.weight = nn.Parameter(torch.ones(dim))  # Ignore copy
+
+    def _norm(self, x):  # x * rsqrt(mean(x**2 over the last dim) + eps)
+        return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
+
+    # Ignore copy
+    def forward(self, x):
+        output = self._norm(x.float())  # normalize in float32 regardless of the input dtype
+        output = output * self.weight.float()
+        return output.type_as(x)  # cast the result back to the dtype of the input
+
+    def extra_repr(self):  # extra text for the module repr: weight shape and eps
+        return f"{tuple(self.weight.shape)}, eps={self.eps}"
+
+
+class KyutaiSpeechToTextFlexibleLinear(nn.Module):  # `num_layers` bias-free linear maps stored in one stacked weight
+    def __init__(self, input_size, output_size, num_layers):
+        super().__init__()
+        # Stack the weights for N layers into a single tensor (num_layers, output_size, input_size)
+        self.weight = nn.Parameter(torch.randn(num_layers, output_size, input_size))
+
+    def forward(self, x, layer_idx=None):
+        """
+        `KyutaiSpeechToTextFlexibleLinear` creates one linear layer per codebook. There are multiple ways to use it.
+        In the default case (`layer_idx=None`), `seq_length == num_layers`, so the i-th element of the sequence is multiplied by `weight[i]`.
+
+        For more advanced cases, one can specify which codebook's layer(s) to use with `layer_idx`.
+        If `layer_idx` holds a single index, every element of the sequence is multiplied by that single codebook's layer.
+        If `layer_idx` is a tensor of shape `(seq_length,)`, the i-th element of the input sequence is multiplied by `weight[layer_idx[i]]`.
+
+
+        Args:
+            x (`torch.FloatTensor`): input to the layer of shape `(batch, num_layers, embed_dim)` or of shape `(batch, seq_length, embed_dim)`
+            layer_idx (`torch.Tensor`, *optional*):
+                Index tensor given to `torch.index_select` to pick which codebook's layer(s) to use; all layers are used when `None`.
+                With a single index, that layer is broadcast over the whole sequence.
+                With shape `(seq_length,)`, each element of the sequence uses the layer named by the entry at the same position.
+        """
+
+        # Select the requested layers along dim 0 with `torch.index_select` (all layers when `layer_idx` is None)
+        # (codebooks, output_size, hidden_size)
+        selected_weights = torch.index_select(self.weight, 0, layer_idx) if layer_idx is not None else self.weight
+
+        # (1, codebooks, hidden_size, output_size)
+        selected_weights = selected_weights.transpose(1, 2)[None, :, :, :]
+
+        # (batch_size, codebooks, 1, hidden_size) x (1, codebooks, hidden_size, output_size)
+        # -> (batch_size, codebooks, 1, output_size)
+        x = torch.matmul(x[:, :, None, :], selected_weights)
+
+        # (batch_size, codebooks, output_size)
+        return x.squeeze(2)
+
+
+@auto_docstring
+class KyutaiSpeechToTextPreTrainedModel(PreTrainedModel):  # shared base: class-level settings plus weight initialization
+    config: KyutaiSpeechToTextConfig
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["KyutaiSpeechToTextDecoderLayer", "MimiTransformerLayer"]
+    _supports_flash_attn = True
+    _supports_sdpa = True
+
+    main_input_name = "input_ids"
+
+    def _init_weights(self, module):  # initialize a single submodule according to its type
+        std = self.config.initializer_range
+
+        if isinstance(module, nn.Linear):  # normal(0, initializer_range) weights, zero bias
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, KyutaiSpeechToTextFlexibleLinear):
+            module.weight.data.normal_()  # default arguments (mean 0, std 1); `initializer_range` is not used here
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()  # the padding row is reset to zeros
+        elif isinstance(module, KyutaiSpeechToTextRMSNorm):
+            module.weight.data.fill_(1.0)  # scale starts at one
+
+
+class KyutaiSpeechToTextConv1dPaddingCache:
+    """
+    Padding cache for KyutaiSpeechToTextConv1d causal convolutions in order to support streaming via cache padding.
+    See: https://huggingface.co/papers/2005.06720 & https://huggingface.co/papers/2204.07064
+
+    A padding cache is a list of cached partial hidden states for each convolution layer.
+    Hidden states are cached from the previous call to the KyutaiSpeechToTextConv1d forward pass, given the padding size.
+    """
+
+    def __init__(
+        self,
+        num_layers: int,
+        per_layer_padding: list[int],
+        per_layer_padding_mode: list[str],
+        per_layer_in_channels: list[int],
+    ):
+        # ensure correct number of layers for each arg
+        from_args_num_layers = {len(per_layer_padding), len(per_layer_padding_mode), len(per_layer_in_channels)}
+
+        if len(from_args_num_layers) != 1 or from_args_num_layers.pop() != num_layers:  # lengths differ, or differ from num_layers
+            raise ValueError(
+                f"Expected `num_layers` ({num_layers}) values in `per_layer_padding`, `per_layer_padding_mode` and `per_layer_in_channels`"
+            )
+        elif not all(mode in ["constant", "replicate"] for mode in per_layer_padding_mode):  # only modes `update` can initialize
+            raise NotImplementedError(
+                "`padding_cache` is not supported for convolutions using other than `constant` or `replicate` padding mode"
+            )
+
+        self.per_layer_padding = per_layer_padding
+        self.per_layer_padding_mode = per_layer_padding_mode
+        self.per_layer_in_channels = per_layer_in_channels
+        self.per_layer_is_init = [True] * num_layers  # not read anywhere in this class
+
+        self.padding_cache = [None] * num_layers  # `None` until the first `update` call for that layer
+
+    def update(self, hidden_states: torch.Tensor, layer_idx: int):
+        """
+        Stores the last `padding` time steps of `hidden_states` for layer `layer_idx` and returns the cache held before this update.
+
+        Parameters:
+            hidden_states (`torch.Tensor`):
+                The hidden states to be partially cached.
+            layer_idx (`int`):
+                The index of the layer to cache the states for.
+        Returns:
+            `torch.Tensor`, the states stored by the previous call, or the initial padding on the first call for this layer.
+        """
+        batch_size, dtype, device = hidden_states.shape[0], hidden_states.dtype, hidden_states.device
+        padding = self.per_layer_padding[layer_idx]
+        padding_mode = self.per_layer_padding_mode[layer_idx]
+        in_channels = self.per_layer_in_channels[layer_idx]
+
+        if self.padding_cache[layer_idx] is None:  # first call for this layer: build the initial padding
+            if padding_mode == "constant":  # zeros of shape (batch_size, in_channels, padding)
+                current_cache = torch.zeros(
+                    batch_size,
+                    in_channels,
+                    padding,
+                    device=device,
+                    dtype=dtype,
+                )
+            elif padding_mode == "replicate":  # first time step of `hidden_states`, repeated `padding` times by broadcasting
+                current_cache = (
+                    torch.ones(
+                        batch_size,
+                        in_channels,
+                        padding,
+                        device=device,
+                        dtype=dtype,
+                    )
+                    * hidden_states[..., :1]
+                )
+        else:
+            current_cache = self.padding_cache[layer_idx]
+
+        # update the cache with the last `padding` time steps, to be returned by the next call
+        if padding > 0:
+            padding_states = hidden_states[:, :, -padding:]
+        else:  # `-0:` would select the whole tensor, so an explicit zero-length tensor is built instead
+            padding_states = torch.empty(batch_size, in_channels, padding, dtype=dtype, device=device)
+        self.padding_cache[layer_idx] = padding_states
+
+        return current_cache
+
+
+class KyutaiSpeechToTextEmbeddings(nn.Module):  # one embedding table shared by the text ids and every audio codebook
+    def __init__(self, config):
+        super().__init__()
+        self.embed_tokens = nn.Embedding(
+            config.vocab_size + (config.num_codebooks * config.codebook_vocab_size) + 1,
+            config.hidden_size,
+            padding_idx=config.audio_pad_token_id,
+        )
+        audio_tokens_offsets = torch.arange(config.num_codebooks) * config.codebook_vocab_size  # 0, cvs, 2 * cvs, ...
+        audio_tokens_offsets += config.vocab_size  # shift every codebook past the first `vocab_size` rows
+        audio_tokens_offsets = nn.functional.pad(
+            audio_tokens_offsets, (1, 0)
+        )  # pad one 0 to the left for the text token
+        self.register_buffer("audio_tokens_offsets", audio_tokens_offsets, persistent=False)
+
+    def forward(self, input_ids):  # assumes input_ids is (batch, seq_len, 1 + num_codebooks) — TODO confirm against caller
+        input_ids = torch.where(  # padding ids stay as they are; all other ids are shifted by the broadcast offsets
+            input_ids == self.embed_tokens.padding_idx, input_ids, input_ids + self.audio_tokens_offsets
+        )
+        inputs_embeds = self.embed_tokens(input_ids)
+        inputs_embeds = inputs_embeds.sum(dim=2)  # add up the embeddings along dim 2
+        return inputs_embeds
+
+
+class KyutaiSpeechToTextLinear(nn.Module):  # wrapper that holds either a plain `nn.Linear` or a per-codebook flexible linear
+    def __init__(self, input_dim, output_dim, num_codebooks, use_flexible_linear=False):
+        super().__init__()
+
+        self.use_flexible_linear = use_flexible_linear
+
+        if not use_flexible_linear:  # `num_codebooks` is ignored in this branch
+            self.linear = nn.Linear(input_dim, output_dim, bias=False)
+        else:
+            self.linear = KyutaiSpeechToTextFlexibleLinear(input_dim, output_dim, num_layers=num_codebooks)
+
+    def forward(self, x, layer_idx=None):  # `layer_idx` is only forwarded to the flexible variant
+        if self.use_flexible_linear:
+            return self.linear(x, layer_idx)
+        else:
+            return self.linear(x)
+
+
+class KyutaiSpeechToTextRotaryEmbedding(nn.Module):  # produces the cos/sin tables used for rotary position embeddings
+    inv_freq: torch.Tensor  # fix linting for `register_buffer`
+
+    def __init__(self, config: KyutaiSpeechToTextConfig, device=None):
+        super().__init__()
+        # BC: "rope_type" was originally "type"
+        if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict):
+            self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
+        else:
+            self.rope_type = "default"
+        self.max_seq_len_cached = config.max_position_embeddings
+        self.original_max_seq_len = config.max_position_embeddings
+
+        self.config = config
+        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]  # raises KeyError for an unregistered rope type
+
+        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+        self.original_inv_freq = self.inv_freq  # second reference to the initial frequencies; not read in this class
+
+    @torch.no_grad()
+    @dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
+    def forward(self, x, position_ids):  # `x` only supplies the device and dtype of the returned tensors
+        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
+        position_ids_expanded = position_ids[:, None, :].float()
+
+        device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
+        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)  # (batch, seq, len(inv_freq))
+            emb = torch.cat((freqs, freqs), dim=-1)  # frequencies duplicated along the last dim
+            cos = emb.cos() * self.attention_scaling
+            sin = emb.sin() * self.attention_scaling
+
+        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
+
+
+class KyutaiSpeechToTextGatingMLP(nn.Module):  # gated feed-forward block: fc1 -> act(first half) * second half -> fc2
+    def __init__(self, config, use_flexible_linear=False):
+        super().__init__()
+
+        self.activation_fn = ACT2FN[config.hidden_act]
+        ffn_dim = config.ffn_dim
+        hidden_size = config.hidden_size
+        num_layers = config.num_codebooks if use_flexible_linear else 1
+        if num_layers == 1:  # also taken when `use_flexible_linear` is set but `num_codebooks == 1`
+            self.fc1 = nn.Linear(hidden_size, ffn_dim, bias=False)
+            self.fc2 = nn.Linear(ffn_dim // 2, hidden_size, bias=False)  # input is half of fc1's output, see `forward`
+        else:
+            self.fc1 = KyutaiSpeechToTextFlexibleLinear(hidden_size, ffn_dim, num_layers)
+            self.fc2 = KyutaiSpeechToTextFlexibleLinear(ffn_dim // 2, hidden_size, num_layers)
+
+    def forward(self, hidden_states: torch.Tensor, layer_idx: Optional[int] = None) -> torch.Tensor:
+        hidden_states = self.fc1(hidden_states) if layer_idx is None else self.fc1(hidden_states, layer_idx)
+
+        batch_size, sequence_length, _ = hidden_states.shape
+        hidden_states = hidden_states.view(batch_size, sequence_length, 2, -1)  # split fc1's output into two equal halves
+        hidden_states = self.activation_fn(hidden_states[..., 0, :]) * hidden_states[..., 1, :]  # activated half gates the other
+        hidden_states = self.fc2(hidden_states) if layer_idx is None else self.fc2(hidden_states, layer_idx)
+        return hidden_states
+
+
+def rotate_half(x):
+    """Rotates half the hidden dims of the input: `(x1, x2)` along the last dim becomes `(-x2, x1)`."""
+    x1 = x[..., : x.shape[-1] // 2]  # first half of the last dimension
+    x2 = x[..., x.shape[-1] // 2 :]  # second half (one element longer when the size is odd)
+    return torch.cat((-x2, x1), dim=-1)
+
+
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
+    """Applies Rotary Position Embedding to the query and key tensors.
+
+    Args:
+        q (`torch.Tensor`): The query tensor.
+        k (`torch.Tensor`): The key tensor.
+        cos (`torch.Tensor`): The cosine part of the rotary embedding.
+        sin (`torch.Tensor`): The sine part of the rotary embedding.
+        position_ids (`torch.Tensor`, *optional*):
+            Deprecated and unused.
+        unsqueeze_dim (`int`, *optional*, defaults to 1):
+            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
+            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
+            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
+            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
+            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
+            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
+    Returns:
+        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
+    """
+    cos = cos.unsqueeze(unsqueeze_dim)  # insert a size-1 dim so cos broadcasts against q and k
+    sin = sin.unsqueeze(unsqueeze_dim)
+    q_embed = (q * cos) + (rotate_half(q) * sin)  # x * cos + rotate_half(x) * sin, for the query ...
+    k_embed = (k * cos) + (rotate_half(k) * sin)  # ... and with the same tables for the key
+    return q_embed, k_embed
+
+
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """
+    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
+    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
+    """
+    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+    if n_rep == 1:  # nothing to repeat: the input tensor itself is returned
+        return hidden_states
+    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)  # new repeat dim
+    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)  # fold the repeat dim into the head dim
+
+
+class KyutaiSpeechToTextAttention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+    def __init__(
+        self,
+        config: KyutaiSpeechToTextConfig,
+        layer_idx: Optional[int] = None,
+        use_flexible_linear=False,
+        use_rope=True,
+    ):
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx  # passed to `past_key_values.update` in `forward`
+        if layer_idx is None:
+            logger.warning_once(
+                f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
+                "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
+                "when creating this class."
+            )
+
+        self.attention_dropout = config.attention_dropout
+        self.hidden_size = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = config.head_dim
+        self.num_key_value_heads = config.num_key_value_heads
+        self.num_key_value_groups = self.num_heads // self.num_key_value_heads  # repeat factor given to `repeat_kv`
+        self.max_position_embeddings = config.max_position_embeddings
+        self.is_causal = True
+        self.scaling = 1 / math.sqrt(self.head_dim)  # multiplies the query-key scores in `forward`
+
+        if self.hidden_size % self.num_heads != 0:
+            raise ValueError(
+                f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
+                f" and `num_heads`: {self.num_heads})."
+            )
+
+        self.q_proj = KyutaiSpeechToTextLinear(  # each projection is a wrapper; its parameter lives at `.linear.weight`
+            self.hidden_size, self.num_heads * self.head_dim, config.num_codebooks, use_flexible_linear
+        )
+        self.k_proj = KyutaiSpeechToTextLinear(
+            self.hidden_size, self.num_key_value_heads * self.head_dim, config.num_codebooks, use_flexible_linear
+        )
+        self.v_proj = KyutaiSpeechToTextLinear(
+            self.hidden_size, self.num_key_value_heads * self.head_dim, config.num_codebooks, use_flexible_linear
+        )
+        self.o_proj = KyutaiSpeechToTextLinear(
+            self.num_heads * self.head_dim, self.hidden_size, config.num_codebooks, use_flexible_linear
+        )
+
+        # rotary embeddings are not used in the depth decoder
+        self.rotary_emb = None
+        if use_rope:
+            self.rope_theta = config.rope_theta
+            self.rotary_emb = KyutaiSpeechToTextRotaryEmbedding(config)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(  # eager attention; returns the 2-tuple `(attn_output, attn_weights)`
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,  # added to the scores, so an additive mask is expected
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,  # not read in this method
+        cache_position: Optional[torch.LongTensor] = None,
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
+        bsz, q_len, _ = hidden_states.size()
+        # `cache_position` is handed to the projections as their `layer_idx`; only flexible linears make use of it
+        query_states = self.q_proj(hidden_states, cache_position)  # Ignore copy
+        key_states = self.k_proj(hidden_states, cache_position)  # Ignore copy
+        value_states = self.v_proj(hidden_states, cache_position)  # Ignore copy
+        # split the projection output into heads: (bsz, q_len, heads * head_dim) -> (bsz, heads, q_len, head_dim)
+        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+
+        if self.rotary_emb is not None:  # Ignore copy
+            cos, sin = self.rotary_emb(value_states, position_ids)  # Ignore copy
+            query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)  # Ignore copy
+
+        if past_key_values is not None:
+            # sin and cos are specific to RoPE models; cache_position needed for the static cache
+            cache_kwargs = (
+                {"sin": sin, "cos": cos, "cache_position": cache_position}
+                if self.rotary_emb is not None
+                else {"cache_position": cache_position}
+            )  # Ignore copy
+            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        key_states = repeat_kv(key_states, self.num_key_value_groups)  # expand key/value heads to `num_heads`
+        value_states = repeat_kv(value_states, self.num_key_value_groups)
+
+        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) * self.scaling
+
+        if attention_mask is not None:  # no matter the length, we just slice it
+            causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
+            attn_weights = attn_weights + causal_mask
+
+        # upcast attention to fp32
+        attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
+        attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
+        attn_output = torch.matmul(attn_weights, value_states)
+
+        if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
+            raise ValueError(
+                f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
+                f" {attn_output.size()}"
+            )
+
+        attn_output = attn_output.transpose(1, 2).contiguous()  # back to (bsz, q_len, heads, head_dim)
+
+        attn_output = attn_output.view(bsz, q_len, -1)  # merge the heads again
+        attn_output = self.o_proj(attn_output, cache_position)  # Ignore copy
+
+        if not output_attentions:
+            attn_weights = None
+
+        return attn_output, attn_weights
+
+
+# NO LONGER EXIST Copied from transformers.models.gemma.modeling_gemma.GemmaFlashAttention2 with Gemma->KyutaiSpeechToText
+# TODO cyril: modular
+class KyutaiSpeechToTextFlashAttention2(KyutaiSpeechToTextAttention):
+    """
+    KyutaiSpeechToText flash attention module. This module inherits from `KyutaiSpeechToTextAttention` as the weights of the module stays
+    untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
+    flash attention and deal with padding tokens in case the input contains any of them.
+    """
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
+        # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignment, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
+        # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
+        self._flash_attn_uses_top_left_mask = flash_attn_supports_top_left_mask()
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
+        cache_position: Optional[torch.LongTensor] = None,
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
+        if isinstance(past_key_values, StaticCache):
+            raise ValueError(
+                "`static` cache implementation is not compatible with `attn_implementation==flash_attention_2` "
+                "make sure to use `sdpa` in the mean time, and open an issue at https://github.com/huggingface/transformers"
+            )
+
+        output_attentions = False
+
+        bsz, q_len, _ = hidden_states.size()
+
+        query_states = self.q_proj(hidden_states, cache_position)  # Ignore copy
+        key_states = self.k_proj(hidden_states, cache_position)  # Ignore copy
+        value_states = self.v_proj(hidden_states, cache_position)  # Ignore copy
+
+        # Flash attention requires the input to have the shape
+        # batch_size x seq_length x head_dim x hidden_dim
+        # therefore we just need to keep the original shape
+        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+
+        if self.rotary_emb is not None:  # Ignore copy
+            cos, sin = self.rotary_emb(value_states, position_ids)  # Ignore copy
+            query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)  # Ignore copy
+
+        if past_key_values is not None:
+            # sin and cos are specific to RoPE models; cache_position needed for the static cache
+            cache_kwargs = (
+                {"sin": sin, "cos": cos, "cache_position": cache_position}
+                if self.rotary_emb is not None
+                else {"cache_position": cache_position}
+            )  # Ignore copy
+            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache
+        # to be able to avoid many of these transpose/reshape/view.
+        query_states = query_states.transpose(1, 2)
+        key_states = key_states.transpose(1, 2)
+        value_states = value_states.transpose(1, 2)
+
+        dropout_rate = self.attention_dropout if self.training else 0.0
+
+        # In PEFT, usually we cast the layer norms in float32 for training stability reasons
+        # therefore the input hidden states gets silently casted in float32. Hence, we need
+        # cast them back in the correct dtype just to be sure everything works as expected.
+        # This might slowdown training & inference so it is recommended to not cast the LayerNorms
+        # in fp32. (KyutaiSpeechToTextRMSNorm handles it correctly)
+
+        input_dtype = query_states.dtype
+        device_type = query_states.device.type if query_states.device.type != "mps" else "cpu"
+        if input_dtype == torch.float32:
+            if torch.is_autocast_enabled():
+                target_dtype = (
+                    torch.get_autocast_dtype(device_type)
+                    if hasattr(torch, "get_autocast_dtype")
+                    else torch.get_autocast_gpu_dtype()
+                )
+            # Handle the case where the model is quantized
+            elif hasattr(self.config, "_pre_quantization_dtype"):
+                target_dtype = self.config._pre_quantization_dtype
+            else:
+                target_dtype = self.q_proj.weight.dtype
+
+            logger.warning_once(
+                f"The input hidden states seems to be silently casted in float32, this might be related to"
+                f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
+                f" {target_dtype}."
+            )
+
+            query_states = query_states.to(target_dtype)
+            key_states = key_states.to(target_dtype)
+            value_states = value_states.to(target_dtype)
+
+        attn_output = _flash_attention_forward(
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            q_len,
+            position_ids=position_ids,
+            dropout=dropout_rate,
+            sliding_window=getattr(self, "sliding_window", None),
+            is_causal=self.is_causal,
+            use_top_left_mask=self._flash_attn_uses_top_left_mask,
+        )
+
+        attn_output = attn_output.reshape(bsz, q_len, -1).contiguous()
+        attn_output = self.o_proj(attn_output, cache_position)  # Ignore copy
+
+        if not output_attentions:
+            attn_weights = None
+
+        return attn_output, attn_weights
+
+
+# NO LONGER EXIST Copied from transformers.models.gemma.modeling_gemma.GemmaSdpaAttention with Gemma->KyutaiSpeechToText
+# TODO cyril: modular
+class KyutaiSpeechToTextSdpaAttention(KyutaiSpeechToTextAttention):
+    """
+    KyutaiSpeechToText attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
+    `KyutaiSpeechToTextAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to
+    SDPA API.
+    """
+
+    # Adapted from KyutaiSpeechToTextAttention.forward
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
+        """
+        Run self-attention through `torch.nn.functional.scaled_dot_product_attention`.
+
+        When `output_attentions=True` the call is delegated to the parent class `forward` (a warning is logged
+        once). On that path `**kwargs` are NOT forwarded to the parent.
+
+        Args:
+            hidden_states (`torch.Tensor`):
+                Input unpacked as `(batch, query_length, hidden)` via `hidden_states.size()`.
+            attention_mask (`torch.Tensor`, *optional*):
+                Additive mask indexed as a 4D tensor; its last dimension is cropped to the key length. When
+                `None` and `query_length > 1`, SDPA is called with `is_causal=True`.
+            position_ids (`torch.LongTensor`, *optional*):
+                Passed to `self.rotary_emb` when rotary embeddings are enabled.
+            past_key_values (`Cache`, *optional*):
+                Cache updated in place with this layer's key/value states (`self.layer_idx`).
+            output_attentions (`bool`):
+                If `True`, fall back to the parent implementation.
+            use_cache (`bool`):
+                Only forwarded to the parent on the fallback path; not read on the SDPA path.
+            cache_position (`torch.LongTensor`, *optional*):
+                Forwarded to the q/k/v/o projections and to the cache update.
+            kwargs:
+                Accepted for interface compatibility; not used in this method.
+
+        Returns:
+            On the SDPA path a 2-tuple `(attn_output, None)`.
+            NOTE(review): the return annotation advertises three elements, but this path returns two.
+        """
+        if output_attentions:
+            # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
+            logger.warning_once(
+                "KyutaiSpeechToTextModel is using KyutaiSpeechToTextSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
+                'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
+            )
+            return super().forward(
+                hidden_states=hidden_states,
+                attention_mask=attention_mask,
+                position_ids=position_ids,
+                past_key_values=past_key_values,
+                output_attentions=output_attentions,
+                use_cache=use_cache,
+                cache_position=cache_position,
+            )
+
+        bsz, q_len, _ = hidden_states.size()
+
+        # The projections take `cache_position` as a second argument in this model.
+        query_states = self.q_proj(hidden_states, cache_position)  # Ignore copy
+        key_states = self.k_proj(hidden_states, cache_position)  # Ignore copy
+        value_states = self.v_proj(hidden_states, cache_position)  # Ignore copy
+
+        # Split the hidden dimension into heads: (bsz, q_len, heads, head_dim) -> (bsz, heads, q_len, head_dim).
+        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+
+        if self.rotary_emb is not None:  # Ignore copy
+            cos, sin = self.rotary_emb(value_states, position_ids)  # Ignore copy
+            query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)  # Ignore copy
+
+        if past_key_values is not None:
+            # sin and cos are specific to RoPE models; cache_position needed for the static cache
+            # (`sin`/`cos` only exist when `self.rotary_emb` is set, hence the conditional dict below)
+            cache_kwargs = (
+                {"sin": sin, "cos": cos, "cache_position": cache_position}
+                if self.rotary_emb is not None
+                else {"cache_position": cache_position}
+            )  # Ignore copy
+            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        # Expand key/value heads so they line up with the query heads.
+        key_states = repeat_kv(key_states, self.num_key_value_groups)
+        value_states = repeat_kv(value_states, self.num_key_value_groups)
+
+        causal_mask = attention_mask
+        if attention_mask is not None:
+            # Crop the mask's key axis to the actual key length (which may include cached positions).
+            causal_mask = causal_mask[:, :, :, : key_states.shape[-2]]
+
+        # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
+        # Reference: https://github.com/pytorch/pytorch/issues/112577.
+        if query_states.device.type == "cuda" and causal_mask is not None:
+            query_states = query_states.contiguous()
+            key_states = key_states.contiguous()
+            value_states = value_states.contiguous()
+
+        # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
+        # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
+        is_causal = causal_mask is None and q_len > 1
+
+        attn_output = torch.nn.functional.scaled_dot_product_attention(
+            query_states,
+            key_states,
+            value_states,
+            attn_mask=causal_mask,
+            dropout_p=self.attention_dropout if self.training else 0.0,
+            is_causal=is_causal,
+        )
+
+        # Merge the heads back: (bsz, heads, q_len, head_dim) -> (bsz, q_len, heads * head_dim).
+        attn_output = attn_output.transpose(1, 2).contiguous()
+        attn_output = attn_output.view(bsz, q_len, -1)
+
+        attn_output = self.o_proj(attn_output, cache_position)  # Ignore copy
+
+        # SDPA does not expose attention weights, so the second element is always `None` here.
+        return attn_output, None
+
+
+# Registry mapping `config._attn_implementation` to the attention class built by each decoder layer.
+# NOTE(review): there is no "flex_attention" entry although `_update_causal_mask` has a branch for it;
+# a direct lookup with that key would raise `KeyError` -- confirm whether that is intended.
+KYUTAI_SPEECH_TO_TEXT_ATTENTION_CLASSES = {
+    "eager": KyutaiSpeechToTextAttention,
+    "flash_attention_2": KyutaiSpeechToTextFlashAttention2,
+    "sdpa": KyutaiSpeechToTextSdpaAttention,
+}
+
+
+class KyutaiSpeechToTextDecoderLayer(GradientCheckpointingLayer):
+    def __init__(self, config: KyutaiSpeechToTextConfig, layer_idx: int, use_flexible_linear: bool, use_rope=True):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.use_flexible_linear = use_flexible_linear
+
+        self.self_attn = KYUTAI_SPEECH_TO_TEXT_ATTENTION_CLASSES[config._attn_implementation](
+            config=config, layer_idx=layer_idx, use_flexible_linear=use_flexible_linear, use_rope=use_rope
+        )
+
+        self.mlp = KyutaiSpeechToTextGatingMLP(config, use_flexible_linear)
+        self.input_layernorm = KyutaiSpeechToTextRMSNorm(self.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = KyutaiSpeechToTextRMSNorm(self.hidden_size, eps=config.rms_norm_eps)
+        self.sliding_window = config.sliding_window
+
+        self._attn_implementation = config._attn_implementation
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        output_attentions: Optional[bool] = False,
+        use_cache: Optional[bool] = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
+    ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+            attention_mask (`torch.FloatTensor`, *optional*):
+                attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
+                query_sequence_length, key_sequence_length)` if default attention is used.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            use_cache (`bool`, *optional*):
+                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
+                (see `past_key_values`).
+            past_key_values (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
+            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
+                Indices depicting the position of the input sequence tokens in the sequence
+            kwargs (`dict`, *optional*):
+                Arbitrary kwargs to be ignored, used for FSDP and other methods that injects code
+                into the model
+        """
+        residual = hidden_states
+
+        hidden_states = self.input_layernorm(hidden_states)
+
+        # Self Attention
+        hidden_states, self_attn_weights = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            output_attentions=output_attentions,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            **kwargs,
+        )
+        hidden_states = residual + hidden_states
+
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = (
+            self.mlp(hidden_states) if not self.use_flexible_linear else self.mlp(hidden_states, cache_position)
+        )
+        hidden_states = residual + hidden_states
+
+        outputs = (hidden_states,)
+
+        if output_attentions:
+            outputs += (self_attn_weights,)
+
+        return outputs
+
+
+@auto_docstring
+class KyutaiSpeechToTextModel(KyutaiSpeechToTextPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+        self.embed_tokens = KyutaiSpeechToTextEmbeddings(config)
+        self.layers = nn.ModuleList(
+            [
+                KyutaiSpeechToTextDecoderLayer(config, layer_idx, use_flexible_linear=False)
+                for layer_idx in range(config.num_hidden_layers)
+            ]
+        )
+        self.norm = KyutaiSpeechToTextRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.gradient_checkpointing = False
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Union[Cache, list[torch.FloatTensor]]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+    ) -> Union[tuple, BaseModelOutputWithPast]:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if self.gradient_checkpointing and self.training and use_cache:
+            logger.warning_once(
+                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
+            )
+            use_cache = False
+
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+
+        if cache_position is None:
+            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+            cache_position = torch.arange(
+                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
+            )
+
+        if position_ids is None:
+            position_ids = cache_position.unsqueeze(0)
+
+        causal_mask = None
+        if attention_mask is not None:
+            causal_mask = self._update_causal_mask(
+                attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
+            )
+
+        # embed positions
+        hidden_states = inputs_embeds
+
+        # TODO (joao): remove this exception in v4.56 -- it exists for users that try to pass a legacy cache
+        if not isinstance(past_key_values, (type(None), Cache)):
+            raise ValueError("The `past_key_values` should be either a `Cache` object or `None`.")
+
+        if use_cache and past_key_values is None:
+            past_key_values = DynamicCache(config=self.config)
+
+        # decoder layers
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+
+        for decoder_layer in self.layers:
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+
+            layer_outputs = decoder_layer(
+                hidden_states,
+                attention_mask=causal_mask,
+                position_ids=position_ids,
+                past_key_values=past_key_values,
+                output_attentions=output_attentions,
+                use_cache=use_cache,
+                cache_position=cache_position,
+            )
+
+            hidden_states = layer_outputs[0]
+
+            if output_attentions:
+                all_self_attns += (layer_outputs[1],)
+
+        hidden_states = self.norm(hidden_states)
+
+        # add hidden states from the last decoder layer
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+
+        if not return_dict:
+            return tuple(
+                v for v in [hidden_states, past_key_values, all_hidden_states, all_self_attns] if v is not None
+            )
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+        )
+
+    def _update_causal_mask(
+        self,
+        attention_mask: Union[torch.Tensor, "BlockMask"],
+        input_tensor: torch.Tensor,
+        cache_position: torch.Tensor,
+        past_key_values: Cache,
+        output_attentions: bool = False,
+    ):
+        """
+        Turn the user-provided `attention_mask` into the mask expected by the configured attention backend.
+
+        Args:
+            attention_mask: Mask supplied by the caller (tensor, or a block mask for flex attention).
+            input_tensor: Input embeddings; used for dtype, batch size and sequence length.
+            cache_position: Positions of the current tokens in the sequence.
+            past_key_values: Cache in use, or `None`.
+            output_attentions: Whether attention weights were requested (disables the SDPA shortcuts).
+
+        Returns:
+            - flash_attention_2: the mask unchanged if it contains a `0.0`, otherwise `None`.
+            - flex_attention: a block mask (tensor masks are converted with `make_flex_block_causal_mask`).
+            - otherwise: `None` when SDPA can rely on `is_causal`, else a 4D causal mask.
+
+        Raises:
+            ValueError: with flash_attention_2, when a cache is in use and the last mask column does not sum to
+                the batch size (treated as right padding).
+        """
+        if self.config._attn_implementation == "flash_attention_2":
+            if attention_mask is not None and past_key_values is not None:
+                # If every sequence attends to its last position, the last column sums to the batch size.
+                is_padding_right = attention_mask[:, -1].sum().item() != input_tensor.size()[0]
+                if is_padding_right:
+                    raise ValueError(
+                        "You are attempting to perform batched generation with padding_side='right'"
+                        " this may lead to unexpected behaviour for Flash Attention version of KyutaiSpeechToText. Make sure to "
+                        " call `tokenizer.padding_side  = 'left'` before tokenizing the input. "
+                    )
+            # Only keep the mask when it actually masks something.
+            if attention_mask is not None and 0.0 in attention_mask:
+                return attention_mask
+            return None
+        if self.config._attn_implementation == "flex_attention":
+            if isinstance(attention_mask, torch.Tensor):
+                attention_mask = make_flex_block_causal_mask(attention_mask)
+            return attention_mask
+
+        # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
+        # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
+        # to infer the attention mask.
+        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+        using_static_cache = isinstance(past_key_values, StaticCache)
+
+        # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
+        if self.config._attn_implementation == "sdpa" and not using_static_cache and not output_attentions:
+            if AttentionMaskConverter._ignore_causal_mask_sdpa(
+                attention_mask,
+                inputs_embeds=input_tensor,
+                past_key_values_length=past_seen_tokens,
+                sliding_window=self.config.sliding_window,
+                is_training=self.training,
+            ):
+                return None
+
+        dtype = input_tensor.dtype
+        min_dtype = torch.finfo(dtype).min
+        sequence_length = input_tensor.shape[1]
+        # StaticCache
+        if using_static_cache:
+            target_length = past_key_values.get_max_cache_shape()
+        # DynamicCache or no cache
+        else:
+            target_length = (
+                attention_mask.shape[-1]
+                if isinstance(attention_mask, torch.Tensor)
+                else past_seen_tokens + sequence_length + 1
+            )
+
+        # In case the provided `attention_mask` is 2D, we generate a causal mask here (4D).
+        causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
+            attention_mask,
+            sequence_length=sequence_length,
+            target_length=target_length,
+            dtype=dtype,
+            cache_position=cache_position,
+            batch_size=input_tensor.shape[0],
+            config=self.config,
+            past_key_values=past_key_values,
+        )
+
+        if (
+            self.config._attn_implementation == "sdpa"
+            and attention_mask is not None
+            and attention_mask.device.type in ["cuda", "xpu", "npu"]
+            and not output_attentions
+        ):
+            # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
+            # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
+            # Details: https://github.com/pytorch/pytorch/issues/110213
+            causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)
+
+        return causal_mask
+
+    @staticmethod
+    def _prepare_4d_causal_attention_mask_with_cache_position(
+        attention_mask: torch.Tensor,
+        sequence_length: int,
+        target_length: int,
+        dtype: torch.dtype,
+        cache_position: torch.Tensor,
+        batch_size: int,
+        config: KyutaiSpeechToTextConfig,
+        past_key_values: Cache,
+    ):
+        """
+        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
+        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.
+
+        Args:
+            attention_mask (`torch.Tensor`):
+                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape `(batch_size, 1, query_length, key_value_length)`.
+            sequence_length (`int`):
+                The sequence length being processed.
+            target_length (`int`):
+                The target length: when generating with static cache, the mask should be as long as the static cache, to account for the 0 padding, the part of the cache that is not filled yet.
+            dtype (`torch.dtype`):
+                The dtype to use for the 4D attention mask.
+            cache_position (`torch.Tensor`):
+                Indices depicting the position of the input sequence tokens in the sequence.
+            batch_size (`int`):
+                Batch size.
+            config (`KyutaiSpeechToTextConfig`):
+                The model's configuration class
+            past_key_values (`Cache`):
+                The cache class that is being used currently to generate
+
+        Returns:
+            `torch.Tensor`: the 4D additive mask. Masked positions hold `torch.finfo(dtype).min`, attended
+            positions hold `0`. A 4D input mask is returned as is.
+        """
+        if attention_mask is not None and attention_mask.dim() == 4:
+            # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
+            causal_mask = attention_mask
+        else:
+            min_dtype = torch.finfo(dtype).min
+            # Start fully masked; the boolean masks below decide which entries stay masked.
+            causal_mask = torch.full(
+                (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device
+            )
+            # True where the key index lies strictly after the query's position (i.e. a future token).
+            diagonal_attend_mask = torch.arange(target_length, device=cache_position.device) > cache_position.reshape(
+                -1, 1
+            )
+            text_config = config.get_text_config()
+            if getattr(text_config, "use_sliding_window", True) and text_config.sliding_window is not None:
+                # if we have sliding window, we should not attend to tokens beyond sliding window length, so we mask them out also
+                # the check is needed to verify is current checkpoint was trained with sliding window or not
+                is_static_sliding_cache = isinstance(past_key_values, StaticCache) and all(past_key_values.is_sliding)
+                if not is_static_sliding_cache or sequence_length > target_length:
+                    # True where the key index is at least `sliding_window` positions behind the query.
+                    sliding_attend_mask = torch.arange(target_length, device=cache_position.device) <= (
+                        cache_position.reshape(-1, 1) - text_config.sliding_window
+                    )
+                    diagonal_attend_mask.bitwise_or_(sliding_attend_mask)
+            # Multiplying by the boolean mask keeps `min_dtype` where True and yields 0 where False.
+            causal_mask *= diagonal_attend_mask
+            causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
+            if attention_mask is not None:
+                causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
+                if attention_mask.shape[-1] > target_length:
+                    attention_mask = attention_mask[:, :target_length]
+                mask_length = attention_mask.shape[-1]
+                # The sum is 0 exactly where the causal mask allows attention (0) but the 2D mask marks padding (0).
+                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
+                    causal_mask.device
+                )
+                padding_mask = padding_mask == 0
+                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
+                    padding_mask, min_dtype
+                )
+        return causal_mask
+
+
+@auto_docstring
+class KyutaiSpeechToTextForConditionalGeneration(KyutaiSpeechToTextPreTrainedModel, GenerationMixin):
+    _tied_weights_keys = ["lm_head.weight"]
+    _tp_plan = {"lm_head": "colwise_rep"}
+    _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
+    _keep_in_fp32_modules_strict = ["codec_model"]
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.model = KyutaiSpeechToTextModel(config)
+        self.vocab_size = config.vocab_size
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+        self.codec_model = AutoModel.from_config(config.codec_config)
+
+        # we are in an edge case where for the codec_model self.can_generate is False, setting self.codec_model.generation_config to None
+        # yet the codec_model needs a generation config to initalize it's cache for streaming inference
+        # we therefore initialize a generation config for the codec model
+        self.codec_model.generation_config = GenerationConfig.from_model_config(config.codec_config)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> CausalLMOutputWithPast:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+        Example:
+
+        ```python
+        >>> import torch
+        >>> from datasets import load_dataset, Audio
+        >>> from transformers import KyutaiSpeechToTextProcessor, KyutaiSpeechToTextForConditionalGeneration
+
+        >>> torch_device = "cuda" if torch.cuda.is_available() else "cpu"
+        >>> model_id = "kyutai/stt-2.6b-en-trfs"
+
+        >>> processor = KyutaiSpeechToTextProcessor.from_pretrained(model_id)
+        >>> model = KyutaiSpeechToTextForConditionalGeneration.from_pretrained(model_id, device_map=torch_device)
+
+        >>> ds = load_dataset(
+        ...     "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation"
+        ... )
+
+        >>> ds = ds.cast_column("audio", Audio(sampling_rate=24000))
+        >>> inputs = processor(
+        ...     ds[0]["audio"]["array"],
+        ... )
+        >>> inputs.to(torch_device)
+
+        >>> output_tokens = model.generate(**inputs)
+        >>> print(processor.batch_decode(output_tokens, skip_special_tokens=True))
+        ```"""
+        # Run the decoder backbone; extra keyword arguments are passed straight through to it.
+        outputs: BaseModelOutputWithPast = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            **kwargs,
+        )
+
+        hidden_states = outputs.last_hidden_state
+        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
+        # An int `k` keeps the last `k` positions; the default 0 gives `slice(0, None)`, i.e. every position.
+        # A tensor is used directly as an index along the sequence axis.
+        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+        logits = self.lm_head(hidden_states[:, slice_indices, :])
+
+        # The loss is only computed when labels are supplied.
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs)
+
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+    def _prepare_generation_config(self, *args, **kwargs):
+        generation_config, model_kwargs = super()._prepare_generation_config(*args, **kwargs)
+        # this should be passed to the model kwargs for the input preparation
+        model_kwargs["audio_window_size"] = (
+            generation_config.audio_window_size if hasattr(generation_config, "audio_window_size") else None
+        )
+        return generation_config, model_kwargs
+
+    def _prepare_model_inputs(
+        self,
+        inputs: Optional[torch.Tensor] = None,
+        bos_token_id: Optional[torch.Tensor] = None,
+        model_kwargs: Optional[dict[str, torch.Tensor]] = None,
+    ) -> tuple[torch.Tensor, Optional[str], dict[str, torch.Tensor]]:
+        inputs, input_name, model_kwargs = super()._prepare_model_inputs(
+            inputs=inputs,
+            bos_token_id=bos_token_id,
+            model_kwargs=model_kwargs,
+        )
+
+        audio_window_size = model_kwargs.get("audio_window_size", None)
+        if audio_window_size is None:
+            audio_window_size = self.codec_model.get_encoded_length(model_kwargs["input_values"].shape[-1]).item()
+            model_kwargs["audio_window_size"] = audio_window_size
+
+        batch_size = inputs.shape[0]
+        device = inputs.device
+
+        # initialize audio tokens
+        model_kwargs["audio_tokens"] = torch.zeros(
+            (batch_size, audio_window_size, self.config.num_codebooks),
+            device=device,
+            dtype=torch.long,
+        )
+        model_kwargs["current_window"] = (
+            torch.tensor([0, 0], device=device, dtype=torch.long).expand(batch_size, -1).contiguous()
+        )
+
+        # let's use generate's cache preparation to prepare the cache for the codec model
+        temporary_model_kwargs = {}
+
+        # monkey patching the codec model with cache preparation methods since we don't want it to inherit fully from GenerationMixin
+        # Add cache-related methods from GenerationMixin to codec model
+        cache_methods = [
+            "_prepare_cache_for_generation",
+            "_get_cache",
+        ]
+        for method in cache_methods:
+            setattr(self.codec_model, method, types.MethodType(getattr(self, method).__func__, self.codec_model))
+
+        setattr(
+            self.codec_model, "_supports_default_dynamic_cache", types.MethodType(lambda x: True, self.codec_model)
+        )
+
+        self.codec_model.generation_config.cache_implementation = "dynamic"
+        self.codec_model._prepare_cache_for_generation(
+            generation_config=self.codec_model.generation_config,
+            model_kwargs=temporary_model_kwargs,
+            assistant_model=None,
+            batch_size=batch_size,
+            max_cache_length=self.config.codec_config.sliding_window,
+        )
+
+        if "past_key_values" in temporary_model_kwargs:
+            model_kwargs["encoder_past_key_values"] = temporary_model_kwargs["past_key_values"]
+
+        # initialize the padding cache for the codec model
+        per_layer_padding, per_layer_padding_mode, per_layer_in_channels = [], [], []
+        for layer_name in self.codec_model.encoder._mimiconv1d_layer_names:
+            per_layer_padding.append(self.codec_model.encoder.get_submodule(layer_name).padding_total)
+            per_layer_padding_mode.append(self.codec_model.encoder.get_submodule(layer_name).pad_mode)
+            per_layer_in_channels.append(self.codec_model.encoder.get_submodule(layer_name).in_channels)
+
+        # downsample layer
+        per_layer_padding.append(self.codec_model.downsample.padding_total)
+        per_layer_padding_mode.append(self.codec_model.downsample.pad_mode)
+        per_layer_in_channels.append(self.codec_model.downsample.in_channels)
+
+        model_kwargs["padding_cache"] = KyutaiSpeechToTextConv1dPaddingCache(
+            num_layers=len(self.codec_model.encoder._mimiconv1d_layer_names) + 1,
+            per_layer_padding=per_layer_padding,
+            per_layer_padding_mode=per_layer_padding_mode,
+            per_layer_in_channels=per_layer_in_channels,
+        )
+
+        return inputs, input_name, model_kwargs
+
+    def prepare_inputs_for_generation(
+        self,
+        *args,
+        audio_tokens: Optional[torch.LongTensor] = None,
+        input_values: Optional[torch.FloatTensor] = None,
+        padding_mask: Optional[torch.Tensor] = None,
+        audio_window_size: Optional[int] = None,
+        current_window: Optional[tuple[int, int]] = None,
+        encoder_past_key_values: Optional[Cache] = None,
+        padding_cache: Optional[KyutaiSpeechToTextConv1dPaddingCache] = None,
+        **kwargs,
+    ):
+        """
+        Build the model inputs for one generation step and append the matching audio tokens to `input_ids`.
+
+        When `input_values` is given, the audio is encoded with the codec model whenever generation moves past
+        the end of the current window. The `audio_tokens` and `current_window` buffers are updated IN PLACE so
+        the state survives between steps.
+
+        Args:
+            audio_tokens: Buffer holding the codec tokens of the current window (overwritten in place).
+            input_values: Raw audio; when `None` the parent's inputs are returned untouched.
+            padding_mask: Accepted but not used in this method.
+            audio_window_size: Number of audio frames encoded per window.
+            current_window: Tensor whose first row holds `(start, end)` of the current window (updated in
+                place). NOTE(review): annotated as a tuple of ints, but used as a tensor (`copy_`, `.device`).
+            encoder_past_key_values: Cache forwarded to `codec_model.encode`.
+            padding_cache: Conv padding cache forwarded to `codec_model.encode`.
+
+        Returns:
+            The dict from the parent `prepare_inputs_for_generation`; with audio, `input_ids` is replaced by
+            the text ids concatenated with the audio tokens along a new last dimension.
+        """
+        model_inputs = super().prepare_inputs_for_generation(*args, **kwargs)
+
+        if input_values is not None:
+            cache_position = model_inputs["cache_position"]
+            # Window bounds are read from the first batch row only.
+            start, end = current_window[0]
+
+            # first cache position is for bos token, so we need to offset by -1
+            if cache_position[-1] - 1 >= end:
+                # we need to encode the new audio tokens
+                with torch.no_grad():
+                    # NOTE(review): the slice starts at `start` (beginning of the window that was just
+                    # consumed), not at `end`. With `audio_window_size` smaller than the audio length this
+                    # looks like it re-encodes the previous window -- confirm against the streaming use case.
+                    input_values_start_idx = start * self.config.frame_size
+                    input_values_end_idx = (start + audio_window_size) * self.config.frame_size
+                    current_input_values = input_values[..., input_values_start_idx:input_values_end_idx]
+                    codec_model_output = self.codec_model.encode(
+                        current_input_values,
+                        encoder_past_key_values=encoder_past_key_values,
+                        padding_cache=padding_cache,
+                    )
+                    new_audio_tokens = codec_model_output.audio_codes.transpose(1, 2)
+
+                # In-place copy so the buffer held in the generation kwargs sees the new tokens.
+                audio_tokens.copy_(new_audio_tokens)
+
+                # Slide the window forward by one window size and persist it in place.
+                start = end.clone()
+                end = end + audio_window_size
+                current_window.copy_(
+                    torch.tensor([start, end], device=current_window.device).expand(current_window.shape[0], -1)
+                )
+
+            # first cache position is for bos token, so we need to offset by -1
+            current_audio_tokens_idxs = (cache_position - start - 1).clamp(min=0)
+            # Indexing with a tensor yields a copy, so the assignment below does not modify `audio_tokens`.
+            current_audio_tokens = audio_tokens[:, current_audio_tokens_idxs, :]
+
+            # Position 0 carries the audio BOS token instead of real audio tokens.
+            current_audio_tokens[:, cache_position == 0, :] = self.config.audio_bos_token_id
+
+            # Stack the text ids and the audio tokens along a new last dimension.
+            input_ids = model_inputs.pop("input_ids")
+            input_ids = torch.cat(
+                [input_ids.unsqueeze(2), current_audio_tokens],
+                dim=2,
+            )
+            model_inputs["input_ids"] = input_ids
+
+        return model_inputs
+
+    # TODO: @eustlb, this should be standardized
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        if kwargs.get("output_loading_info", False):
+            model, loading_info = super().from_pretrained(*args, **kwargs)
+        else:
+            model = super().from_pretrained(*args, **kwargs)
+
+        # copy depth decoder generation conf attr to the depth decoder generation config
+        prefix = "codec_"
+        prefix_len = len(prefix)
+        codec_model_attrs = {
+            attr[prefix_len:]: value
+            for attr, value in vars(model.generation_config).items()
+            if attr.startswith(prefix)
+        }
+
+        vars(model.codec_model.generation_config).update({"_from_model_config": False, **codec_model_attrs})
+
+        # remove the depth decoder generation conf attr from the model generation config
+        for attr in codec_model_attrs:
+            delattr(model.generation_config, prefix + attr)
+
+        if "output_loading_info" in kwargs:
+            return model, loading_info
+        else:
+            return model
+
+    # TODO: @eustlb, this should be standardized
+    def save_pretrained(self, *args, **kwargs):
+        prefix = "codec_"
+        codec_model_attrs = self.codec_model.generation_config.to_diff_dict()
+        codec_model_attrs.pop("transformers_version", None)
+        for attr, value in codec_model_attrs.items():
+            setattr(self.generation_config, prefix + attr, value)
+
+        super().save_pretrained(*args, **kwargs)
+
+    def generate(self, *args, **kwargs):
+        r"""
+        This method forwards all its arguments to GenerationMixin's [`~GenerationMixin.generate`]. Please refer to the docstring of this method for more information.
+        """
+        max_new_tokens = kwargs.pop("max_new_tokens", None)
+        input_values = kwargs.get("input_values")
+
+        # TODO: @eustlb, we should have per-batch-idx values
+        # here we do not use padding_mask to be aligned to what's done in the original codebase
+        max_audio_frames = input_values.shape[-1] // self.config.codec_config.frame_size
+
+        if max_new_tokens is None or max_new_tokens > max_audio_frames:
+            if max_new_tokens is not None:
+                logger.warning(
+                    f"`max_new_tokens` ({max_new_tokens}) is greater than the maximum number of audio frames ({max_audio_frames})."
+                    f"Setting `max_new_tokens` to {max_audio_frames}."
+                )
+            max_new_tokens = max_audio_frames
+
+        return super().generate(
+            *args,
+            max_new_tokens=max_new_tokens,
+            **kwargs,
+        )
+
+
+# Names exported by `from <this module> import *`.
+__all__ = [
+    "KyutaiSpeechToTextPreTrainedModel",
+    "KyutaiSpeechToTextModel",
+    "KyutaiSpeechToTextForConditionalGeneration",
+]
diff --git a/phivenv/Lib/site-packages/transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py b/phivenv/Lib/site-packages/transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py
new file mode 100644
index 0000000000000000000000000000000000000000..03b442b2edbd8b94058f804c33a91ea8e3d3abc5
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py
@@ -0,0 +1,513 @@
+# coding=utf-8
+# Copyright 2025 Kyutai and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import types
+from typing import Optional, Union
+
+import numpy as np
+import torch
+import torch.nn as nn
+
+from ...cache_utils import Cache
+from ...feature_extraction_utils import BatchFeature
+from ...generation import GenerationConfig, GenerationMixin
+from ...modeling_utils import PreTrainedModel
+from ...utils import PaddingStrategy, TensorType, logging
+from ..auto import AutoModel
+from ..encodec.feature_extraction_encodec import EncodecFeatureExtractor
+from ..llama.modeling_llama import LlamaForCausalLM
+from ..mimi.modeling_mimi import MimiConv1dPaddingCache
+from ..moshi.modeling_moshi import MoshiModel, MoshiPreTrainedModel
+
+
+logger = logging.get_logger(__name__)
+
+
+class KyutaiSpeechToTextFeatureExtractor(EncodecFeatureExtractor):
+    r"""
+    Constructs a KyutaiSpeechToText feature extractor.
+
+    This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains
+    most of the main methods. Users should refer to this superclass for more information regarding those methods.
+
+    Args:
+        feature_size (`int`, *optional*, defaults to 1):
+            The feature dimension of the extracted features. Use 1 for mono, 2 for stereo.
+        sampling_rate (`int`, *optional*, defaults to 24000):
+            The sampling rate at which the audio waveform should be digitalized expressed in hertz (Hz).
+        padding_value (`float`, *optional*, defaults to 0.0):
+            The value that is used to fill the padding values.
+        chunk_length_s (`float`, *optional*):
+            If defined the audio is pre-processed into chunks of lengths `chunk_length_s` and then encoded.
+        overlap (`float`, *optional*):
+            Defines the overlap between each chunk. It is used to compute the `chunk_stride` using the following
+            formulae : `int((1.0 - self.overlap) * self.chunk_length)`.
+        audio_delay_seconds (`float`, *optional*, defaults to 0.0):
+            The delay in seconds to add after the audio (right padding).
+        audio_silence_prefix_seconds (`float`, *optional*, defaults to 0.0):
+            The silence prefix in seconds to add before the audio (left padding).
+    """
+
+    def __init__(
+        self,
+        audio_delay_seconds: Optional[float] = 0.0,
+        audio_silence_prefix_seconds: Optional[float] = 0.0,
+        **super_kwargs,
+    ):
+        # every other keyword argument is handed to the parent feature extractor
+        super().__init__(**super_kwargs)
+        self.audio_delay_seconds = audio_delay_seconds
+        self.audio_silence_prefix_seconds = audio_silence_prefix_seconds
+
+    def __call__(
+        self,
+        raw_audio: Union[np.ndarray, list[float], list[np.ndarray], list[list[float]]],
+        padding: Optional[Union[bool, str, PaddingStrategy]] = None,
+        truncation: Optional[bool] = False,
+        max_length: Optional[int] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        sampling_rate: Optional[int] = None,
+    ) -> BatchFeature:
+        """
+        Main method to featurize and prepare for the model one or several sequence(s).
+
+        Args:
+            raw_audio (`np.ndarray`, `list[float]`, `list[np.ndarray]`, `list[list[float]]`):
+                The sequence or batch of sequences to be processed. Each sequence can be a numpy array, a list of float
+                values, a list of numpy arrays or a list of list of float values. The numpy array must be of shape
+                `(num_samples,)` for mono audio (`feature_size = 1`), or `(2, num_samples)` for stereo audio
+                (`feature_size = 2`).
+            padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`):
+                Select a strategy to pad the returned sequences (according to the model's padding side and padding
+                index) among:
+
+                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
+                  sequence if provided). This is what is used when `padding` is left to `None`.
+                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
+                  acceptable input length for the model if that argument is not provided.
+                - `False` or `'do_not_pad'`: No padding (i.e., can output a batch with sequences of different
+                  lengths).
+            truncation (`bool`, *optional*, defaults to `False`):
+                Activates truncation to cut input sequences longer than `max_length` to `max_length`.
+            max_length (`int`, *optional*):
+                Maximum length of the returned list and optionally padding length (see above).
+            return_tensors (`str` or [`~utils.TensorType`], *optional*):
+                If set, will return tensors instead of list of python integers. Acceptable values are:
+
+                - `'tf'`: Return TensorFlow `tf.constant` objects.
+                - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                - `'np'`: Return Numpy `np.ndarray` objects.
+            sampling_rate (`int`, *optional*):
+                The sampling rate at which the `audio` input was sampled. It is strongly recommended to pass
+                `sampling_rate` at the forward call to prevent silent errors.
+
+        Returns:
+            [`BatchFeature`]: A batch holding `input_values` and, when padding is enabled, `padding_mask`. Both are
+            extended on the left by `audio_silence_prefix_seconds` and on the right by `audio_delay_seconds` plus one
+            second, expressed in samples.
+
+        Raises:
+            ValueError: If `sampling_rate` differs from the extractor's sampling rate, if both `padding` and
+                `truncation` are set, or if an example does not match the expected mono/stereo layout.
+        """
+        if sampling_rate is not None:
+            if sampling_rate != self.sampling_rate:
+                raise ValueError(
+                    f"The model corresponding to this feature extractor: {self} was trained using a sampling rate of"
+                    f" {self.sampling_rate}. Please make sure that the provided audio input was sampled with"
+                    f" {self.sampling_rate} and not {sampling_rate}."
+                )
+        else:
+            logger.warning(
+                f"It is strongly recommended to pass the `sampling_rate` argument to `{self.__class__.__name__}()`. "
+                "Failing to do so can result in silent errors that might be hard to debug."
+            )
+
+        if padding and truncation:
+            raise ValueError("Both padding and truncation were set. Make sure you only set one.")
+        elif padding is None:
+            # by default let's pad the inputs
+            padding = True
+
+        # a list/tuple whose first element is itself array-like is treated as a batch
+        is_batched = bool(
+            isinstance(raw_audio, (list, tuple)) and (isinstance(raw_audio[0], (np.ndarray, tuple, list)))
+        )
+
+        # convert to float32 numpy arrays, batched examples are transposed so that the checks below can read
+        # the channel count from the last axis
+        if is_batched:
+            raw_audio = [np.asarray(audio, dtype=np.float32).T for audio in raw_audio]
+        elif not is_batched and not isinstance(raw_audio, np.ndarray):
+            raw_audio = np.asarray(raw_audio, dtype=np.float32)
+        elif isinstance(raw_audio, np.ndarray) and raw_audio.dtype is np.dtype(np.float64):
+            raw_audio = raw_audio.astype(np.float32)
+
+        # always return batch
+        if not is_batched:
+            raw_audio = [np.asarray(raw_audio).T]
+
+        # verify inputs are valid
+        for idx, example in enumerate(raw_audio):
+            if example.ndim > 2:
+                raise ValueError(f"Expected input shape (channels, length) but got shape {example.shape}")
+            if self.feature_size == 1 and example.ndim != 1:
+                raise ValueError(f"Expected mono audio but example has {example.shape[-1]} channels")
+            if self.feature_size == 2 and example.shape[-1] != 2:
+                raise ValueError(f"Expected stereo audio but example has {example.shape[-1]} channels")
+
+        padded_inputs = None
+        input_values = BatchFeature({"input_values": raw_audio})
+        # when chunking is configured and no explicit `max_length` is given, derive `max_length` from the batch so
+        # that it covers a whole number of chunk strides plus one chunk length
+        if self.chunk_stride is not None and self.chunk_length is not None and max_length is None:
+            if truncation:
+                # round down from the shortest example
+                max_length = min(array.shape[0] for array in raw_audio)
+                nb_step = int(np.floor(max_length / self.chunk_stride))
+                max_length = (nb_step - 1) * self.chunk_stride + self.chunk_length
+            elif padding:
+                # round up from the longest example
+                max_length = max(array.shape[0] for array in raw_audio)
+                nb_step = int(np.ceil(max_length / self.chunk_stride))
+                max_length = (nb_step - 1) * self.chunk_stride + self.chunk_length
+                padding = "max_length"
+            else:
+                # neither padding nor truncation: keep the inputs untouched and skip `self.pad`
+                padded_inputs = input_values
+
+        # normal padding on batch
+        if padded_inputs is None:
+            padded_inputs = self.pad(
+                input_values,
+                max_length=max_length,
+                truncation=truncation,
+                padding=padding,
+                return_attention_mask=padding,
+            )
+
+            if padding:
+                padded_inputs["padding_mask"] = padded_inputs.pop("attention_mask")
+
+        # now let's pad left (silence prefix) and right (audio delay plus one extra second), in samples
+        # NOTE(review): the two-axis pad widths below assume `input_values` is (batch, length), i.e. mono audio
+        # of equal lengths; confirm whether stereo / unpadded ragged batches are expected to reach this point
+        pad_left = int(self.audio_silence_prefix_seconds * self.sampling_rate)
+        pad_right = int((self.audio_delay_seconds + 1.0) * self.sampling_rate)
+        padded_inputs["input_values"] = np.pad(
+            padded_inputs["input_values"],
+            ((0, 0), (pad_left, pad_right)),
+            mode="constant",
+            constant_values=0.0,
+        )
+        if padding:
+            # the added samples are marked with 0 in the padding mask
+            padded_inputs["padding_mask"] = np.pad(
+                padded_inputs["padding_mask"],
+                ((0, 0), (pad_left, pad_right)),
+                mode="constant",
+                constant_values=0,
+            )
+
+        # add a trailing axis to mono examples, then transpose each example before returning it
+        input_values = []
+        for example in padded_inputs.pop("input_values"):
+            if self.feature_size == 1:
+                example = example[..., None]
+            input_values.append(example.T)
+
+        padded_inputs["input_values"] = input_values
+        if return_tensors is not None:
+            padded_inputs = padded_inputs.convert_to_tensors(return_tensors)
+
+        return padded_inputs
+
+
+# same behaviour as `MoshiPreTrainedModel`, exposed under the KyutaiSpeechToText name
+class KyutaiSpeechToTextPreTrainedModel(MoshiPreTrainedModel):
+    pass
+
+
+# same behaviour as `MimiConv1dPaddingCache`, exposed under the KyutaiSpeechToText name
+class KyutaiSpeechToTextConv1dPaddingCache(MimiConv1dPaddingCache):
+    pass
+
+
+class KyutaiSpeechToTextEmbeddings(nn.Module):
+    """
+    Single embedding table shared by the text token and the audio codebook tokens.
+
+    The table holds `vocab_size + num_codebooks * codebook_vocab_size + 1` rows. In `forward`, every id that is
+    not the padding index is shifted by a per-position offset (0 for the first position of the last axis, then
+    `vocab_size + k * codebook_vocab_size` for codebook `k`) and the resulting embeddings are summed over that axis.
+    """
+
+    def __init__(self, config):
+        super().__init__()
+        num_embeddings = config.vocab_size + (config.num_codebooks * config.codebook_vocab_size) + 1
+        self.embed_tokens = nn.Embedding(
+            num_embeddings,
+            config.hidden_size,
+            padding_idx=config.audio_pad_token_id,
+        )
+
+        # codebook k starts right after the text vocabulary and the k previous codebooks
+        codebook_offsets = config.vocab_size + torch.arange(config.num_codebooks) * config.codebook_vocab_size
+        # the leading 0 is the offset of the text token
+        offsets_with_text = nn.functional.pad(codebook_offsets, (1, 0))
+        self.register_buffer("audio_tokens_offsets", offsets_with_text, persistent=False)
+
+    def forward(self, input_ids):
+        # padding ids are looked up as-is, every other id is moved into its own slice of the table
+        is_padding = input_ids == self.embed_tokens.padding_idx
+        shifted_ids = torch.where(is_padding, input_ids, input_ids + self.audio_tokens_offsets)
+        # sum the text and codebook embeddings of each position
+        return self.embed_tokens(shifted_ids).sum(dim=2)
+
+
+class KyutaiSpeechToTextModel(MoshiModel):
+    def __init__(self, config):
+        super().__init__(config)
+        # replace the parent's `embed_tokens` with the joint text/audio-codebook embedding module
+        self.embed_tokens = KyutaiSpeechToTextEmbeddings(config)
+
+
+class KyutaiSpeechToTextForConditionalGeneration(LlamaForCausalLM, GenerationMixin):
+    _keep_in_fp32_modules_strict = ["codec_model"]
+
+    def __init__(self, config):
+        super().__init__(config)
+        # the audio codec is built from its own sub-config
+        self.codec_model = AutoModel.from_config(config.codec_config)
+
+        # we are in an edge case where for the codec_model self.can_generate is False, setting self.codec_model.generation_config to None
+        # yet the codec_model needs a generation config to initialize its cache for streaming inference
+        # we therefore initialize a generation config for the codec model
+        self.codec_model.generation_config = GenerationConfig.from_model_config(config.codec_config)
+
+    def forward(self, **super_kwargs):
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+        Example:
+
+        ```python
+        >>> import torch
+        >>> from datasets import load_dataset, Audio
+        >>> from transformers import KyutaiSpeechToTextProcessor, KyutaiSpeechToTextForConditionalGeneration
+
+        >>> torch_device = "cuda" if torch.cuda.is_available() else "cpu"
+        >>> model_id = "kyutai/stt-2.6b-en-trfs"
+
+        >>> processor = KyutaiSpeechToTextProcessor.from_pretrained(model_id)
+        >>> model = KyutaiSpeechToTextForConditionalGeneration.from_pretrained(model_id, device_map=torch_device)
+
+        >>> ds = load_dataset(
+        ...     "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation"
+        ... )
+
+        >>> ds = ds.cast_column("audio", Audio(sampling_rate=24000))
+        >>> inputs = processor(
+        ...     ds[0]["audio"]["array"],
+        ... )
+        >>> inputs.to(torch_device)
+
+        >>> output_tokens = model.generate(**inputs)
+        >>> print(processor.batch_decode(output_tokens, skip_special_tokens=True))
+        ```"""
+        # NOTE(review): the result of the parent call is not returned, so this method returns `None` as written;
+        # presumably this stub only exists to override the docstring and is rewritten by the modular converter,
+        # confirm before calling this class directly
+        super().forward(**super_kwargs)
+
+    def _prepare_generation_config(self, *args, **kwargs):
+        """Prepare the generation config and expose its `audio_window_size` through the model kwargs."""
+        generation_config, model_kwargs = GenerationMixin._prepare_generation_config(self, *args, **kwargs)
+        # input preparation reads the window size from the model kwargs, `None` when the config does not define it
+        model_kwargs["audio_window_size"] = getattr(generation_config, "audio_window_size", None)
+        return generation_config, model_kwargs
+
+    def _prepare_model_inputs(
+        self,
+        inputs: Optional[torch.Tensor] = None,
+        bos_token_id: Optional[torch.Tensor] = None,
+        model_kwargs: Optional[dict[str, torch.Tensor]] = None,
+    ) -> tuple[torch.Tensor, Optional[str], dict[str, torch.Tensor]]:
+        """
+        Prepare the model inputs through `GenerationMixin`, then add the audio streaming state to `model_kwargs`.
+
+        The following entries are written to `model_kwargs`:
+
+        - `audio_window_size`: kept when already set, otherwise the codec's encoded length of the full `input_values`.
+        - `audio_tokens`: a zero-initialized `(batch_size, audio_window_size, num_codebooks)` long tensor.
+        - `current_window`: a `(batch_size, 2)` long tensor initialized to `[0, 0]` for every row.
+        - `encoder_past_key_values`: the cache prepared for the codec model, when one was created.
+        - `padding_cache`: a `KyutaiSpeechToTextConv1dPaddingCache` covering the codec encoder conv layers and the
+          downsample layer.
+
+        Returns:
+            The `(inputs, input_name, model_kwargs)` tuple returned by `GenerationMixin._prepare_model_inputs`, with
+            `model_kwargs` extended as described above.
+        """
+        inputs, input_name, model_kwargs = GenerationMixin._prepare_model_inputs(
+            self,
+            inputs=inputs,
+            bos_token_id=bos_token_id,
+            model_kwargs=model_kwargs,
+        )
+
+        # without an explicit window size, use the number of frames the codec produces for the whole audio
+        audio_window_size = model_kwargs.get("audio_window_size", None)
+        if audio_window_size is None:
+            audio_window_size = self.codec_model.get_encoded_length(model_kwargs["input_values"].shape[-1]).item()
+            model_kwargs["audio_window_size"] = audio_window_size
+
+        batch_size = inputs.shape[0]
+        device = inputs.device
+
+        # initialize audio tokens
+        model_kwargs["audio_tokens"] = torch.zeros(
+            (batch_size, audio_window_size, self.config.num_codebooks),
+            device=device,
+            dtype=torch.long,
+        )
+        # [start, end] of the window, one row per batch element; `.contiguous()` materializes the expanded view
+        model_kwargs["current_window"] = (
+            torch.tensor([0, 0], device=device, dtype=torch.long).expand(batch_size, -1).contiguous()
+        )
+
+        # let's use generate's cache preparation to prepare the cache for the codec model
+        temporary_model_kwargs = {}
+
+        # monkey patching the codec model with cache preparation methods since we don't want it to inherit fully from GenerationMixin
+        # Add cache-related methods from GenerationMixin to codec model
+        cache_methods = [
+            "_prepare_cache_for_generation",
+            "_get_cache",
+        ]
+        for method in cache_methods:
+            # rebind the underlying function of this model's bound method onto the codec model instance
+            setattr(self.codec_model, method, types.MethodType(getattr(self, method).__func__, self.codec_model))
+
+        # make the codec model report that it supports the default dynamic cache
+        setattr(
+            self.codec_model, "_supports_default_dynamic_cache", types.MethodType(lambda x: True, self.codec_model)
+        )
+
+        self.codec_model.generation_config.cache_implementation = "dynamic"
+        self.codec_model._prepare_cache_for_generation(
+            generation_config=self.codec_model.generation_config,
+            model_kwargs=temporary_model_kwargs,
+            assistant_model=None,
+            batch_size=batch_size,
+            max_cache_length=self.config.codec_config.sliding_window,
+        )
+
+        # the cache is written under `past_key_values` in the temporary kwargs, keep it under a codec-specific
+        # name so that it does not clash with this model's own cache entry
+        if "past_key_values" in temporary_model_kwargs:
+            model_kwargs["encoder_past_key_values"] = temporary_model_kwargs["past_key_values"]
+
+        # initialize the padding cache for the codec model
+        per_layer_padding, per_layer_padding_mode, per_layer_in_channels = [], [], []
+        for layer_name in self.codec_model.encoder._mimiconv1d_layer_names:
+            per_layer_padding.append(self.codec_model.encoder.get_submodule(layer_name).padding_total)
+            per_layer_padding_mode.append(self.codec_model.encoder.get_submodule(layer_name).pad_mode)
+            per_layer_in_channels.append(self.codec_model.encoder.get_submodule(layer_name).in_channels)
+
+        # downsample layer
+        per_layer_padding.append(self.codec_model.downsample.padding_total)
+        per_layer_padding_mode.append(self.codec_model.downsample.pad_mode)
+        per_layer_in_channels.append(self.codec_model.downsample.in_channels)
+
+        # one entry per encoder conv layer, plus one for the downsample layer
+        model_kwargs["padding_cache"] = KyutaiSpeechToTextConv1dPaddingCache(
+            num_layers=len(self.codec_model.encoder._mimiconv1d_layer_names) + 1,
+            per_layer_padding=per_layer_padding,
+            per_layer_padding_mode=per_layer_padding_mode,
+            per_layer_in_channels=per_layer_in_channels,
+        )
+
+        return inputs, input_name, model_kwargs
+
+    def prepare_inputs_for_generation(
+        self,
+        *args,
+        audio_tokens: Optional[torch.LongTensor] = None,
+        input_values: Optional[torch.FloatTensor] = None,
+        padding_mask: Optional[torch.Tensor] = None,
+        audio_window_size: Optional[int] = None,
+        current_window: Optional[tuple[int, int]] = None,
+        encoder_past_key_values: Optional[Cache] = None,
+        padding_cache: Optional[KyutaiSpeechToTextConv1dPaddingCache] = None,
+        **kwargs,
+    ):
+        """
+        Build the inputs of a generation step and append the matching audio tokens to `input_ids`.
+
+        The base inputs come from `GenerationMixin.prepare_inputs_for_generation`. When `input_values` is given,
+        the audio tokens for the current cache positions are concatenated to `input_ids` on a new last axis. A new
+        window of audio is encoded with the codec model whenever the last cache position reaches the end of the
+        current window; `audio_tokens` and `current_window` are updated in place in that case.
+
+        `padding_mask` is accepted but not read in this method.
+
+        Returns:
+            The `model_inputs` dict, with `input_ids` extended by the audio tokens when `input_values` is given.
+        """
+        model_inputs = GenerationMixin.prepare_inputs_for_generation(self, *args, **kwargs)
+
+        if input_values is not None:
+            cache_position = model_inputs["cache_position"]
+            # the window bounds are read from the first batch row
+            start, end = current_window[0]
+
+            # first cache position is for bos token, so we need to offset by -1
+            if cache_position[-1] - 1 >= end:
+                # we need to encode the new audio tokens
+                with torch.no_grad():
+                    # convert frame indices to sample indices
+                    input_values_start_idx = start * self.config.frame_size
+                    input_values_end_idx = (start + audio_window_size) * self.config.frame_size
+                    current_input_values = input_values[..., input_values_start_idx:input_values_end_idx]
+                    codec_model_output = self.codec_model.encode(
+                        current_input_values,
+                        encoder_past_key_values=encoder_past_key_values,
+                        padding_cache=padding_cache,
+                    )
+                    # swap axes 1 and 2 so that the codes match the layout of `audio_tokens`
+                    new_audio_tokens = codec_model_output.audio_codes.transpose(1, 2)
+
+                # in-place copy: the caller's `audio_tokens` tensor holds the new window afterwards
+                audio_tokens.copy_(new_audio_tokens)
+
+                # slide the window forward and persist it in place for the next steps
+                start = end.clone()
+                end = end + audio_window_size
+                current_window.copy_(
+                    torch.tensor([start, end], device=current_window.device).expand(current_window.shape[0], -1)
+                )
+
+            # first cache position is for bos token, so we need to offset by -1
+            current_audio_tokens_idxs = (cache_position - start - 1).clamp(min=0)
+            current_audio_tokens = audio_tokens[:, current_audio_tokens_idxs, :]
+
+            # the very first position gets the audio bos token instead of encoded audio
+            current_audio_tokens[:, cache_position == 0, :] = self.config.audio_bos_token_id
+
+            # stack the text token and the audio tokens of each position on a new last axis
+            input_ids = model_inputs.pop("input_ids")
+            input_ids = torch.cat(
+                [input_ids.unsqueeze(2), current_audio_tokens],
+                dim=2,
+            )
+            model_inputs["input_ids"] = input_ids
+
+        return model_inputs
+
+    # TODO: @eustlb, this should be standardized
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        """
+        Load the model, then move the codec generation settings onto the codec model's generation config.
+
+        All arguments are forwarded to `PreTrainedModel.from_pretrained`. Attributes of the loaded
+        `generation_config` whose name starts with `codec_` are copied, without the prefix, onto
+        `model.codec_model.generation_config` and removed from `model.generation_config`.
+
+        Returns:
+            The loaded model, or a `(model, loading_info)` tuple when a truthy `output_loading_info` is passed.
+        """
+        # evaluate the flag once so that loading and returning agree: an explicit
+        # `output_loading_info=False` must not try to return an unbound `loading_info`
+        output_loading_info = kwargs.get("output_loading_info", False)
+        if output_loading_info:
+            model, loading_info = PreTrainedModel.from_pretrained(*args, **kwargs)
+        else:
+            model = PreTrainedModel.from_pretrained(*args, **kwargs)
+
+        # copy the codec generation conf attr to the codec model generation config
+        prefix = "codec_"
+        prefix_len = len(prefix)
+        codec_model_attrs = {
+            attr[prefix_len:]: value
+            for attr, value in vars(model.generation_config).items()
+            if attr.startswith(prefix)
+        }
+
+        vars(model.codec_model.generation_config).update({"_from_model_config": False, **codec_model_attrs})
+
+        # remove the codec generation conf attr from the model generation config
+        for attr in codec_model_attrs:
+            delattr(model.generation_config, prefix + attr)
+
+        if output_loading_info:
+            return model, loading_info
+        return model
+
+    # TODO: @eustlb, this should be standardized
+    def save_pretrained(self, *args, **kwargs):
+        """Mirror the codec generation settings onto the main generation config, then save the model."""
+        prefix = "codec_"
+        # values reported by `to_diff_dict()` are stored on the main generation config under prefixed names,
+        # the library version entry is dropped first
+        codec_generation_attrs = self.codec_model.generation_config.to_diff_dict()
+        codec_generation_attrs.pop("transformers_version", None)
+        for name, value in codec_generation_attrs.items():
+            setattr(self.generation_config, f"{prefix}{name}", value)
+
+        PreTrainedModel.save_pretrained(self, *args, **kwargs)
+
+    def generate(self, *args, **kwargs):
+        r"""
+        Generate tokens for the given audio, capping the generation length to the number of audio frames.
+
+        This method forwards all its arguments to GenerationMixin's [`~GenerationMixin.generate`]. Please refer to the docstring of this method for more information.
+
+        `max_new_tokens` is popped from the keyword arguments and clamped to
+        `input_values.shape[-1] // config.codec_config.frame_size`. When it is not provided, that frame count is used.
+
+        Raises:
+            ValueError: If `input_values` is not passed as a keyword argument.
+        """
+        max_new_tokens = kwargs.pop("max_new_tokens", None)
+        input_values = kwargs.get("input_values")
+        if input_values is None:
+            # fail with an explicit message instead of an AttributeError on `None.shape` below
+            raise ValueError("`input_values` must be passed as a keyword argument to `generate`.")
+
+        # TODO: @eustlb, we should have per-batch-idx values
+        # here we do not use padding_mask to be aligned to what's done in the original codebase
+        max_audio_frames = input_values.shape[-1] // self.config.codec_config.frame_size
+
+        if max_new_tokens is None:
+            max_new_tokens = max_audio_frames
+        elif max_new_tokens > max_audio_frames:
+            logger.warning(
+                f"`max_new_tokens` ({max_new_tokens}) is greater than the maximum number of audio frames ({max_audio_frames}). "
+                f"Setting `max_new_tokens` to {max_audio_frames}."
+            )
+            max_new_tokens = max_audio_frames
+
+        return GenerationMixin.generate(
+            *args,
+            max_new_tokens=max_new_tokens,
+            **kwargs,
+        )
+
+
+# names exported by `from <this module> import *`
+__all__ = [
+    "KyutaiSpeechToTextPreTrainedModel",
+    "KyutaiSpeechToTextModel",
+    "KyutaiSpeechToTextForConditionalGeneration",
+    "KyutaiSpeechToTextFeatureExtractor",
+]
diff --git a/phivenv/Lib/site-packages/transformers/models/kyutai_speech_to_text/processing_kyutai_speech_to_text.py b/phivenv/Lib/site-packages/transformers/models/kyutai_speech_to_text/processing_kyutai_speech_to_text.py
new file mode 100644
index 0000000000000000000000000000000000000000..2641c749483e03c379edef368fde953eac121447
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/kyutai_speech_to_text/processing_kyutai_speech_to_text.py
@@ -0,0 +1,90 @@
+# coding=utf-8
+# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Optional
+
+from ...audio_utils import AudioInput, make_list_of_audio
+from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
+
+
+class KyutaiSpeechToTextProcessorKwargs(ProcessingKwargs, total=False):
+    # default keyword arguments per group: audio is declared as sampled at 24 kHz
+    # and outputs are returned as PyTorch tensors ("pt")
+    _defaults = {
+        "audio_kwargs": {
+            "sampling_rate": 24000,
+        },
+        "common_kwargs": {"return_tensors": "pt"},
+    }
+
+
+class KyutaiSpeechToTextProcessor(ProcessorMixin):
+    r"""
+    Constructs a Kyutai Speech-To-Text processor which wraps [`KyutaiSpeechToTextFeatureExtractor`] and
+    [`PreTrainedTokenizerFast`] into a single processor that inherits both the audio feature extraction and
+    tokenizer functionalities. See the [`~KyutaiSpeechToTextProcessor.__call__`] for more
+    information.
+    """
+
+    feature_extractor_class = "KyutaiSpeechToTextFeatureExtractor"
+    tokenizer_class = "PreTrainedTokenizerFast"
+
+    def __call__(
+        self,
+        audio: Optional[AudioInput] = None,
+        **kwargs: Unpack[KyutaiSpeechToTextProcessorKwargs],
+    ):
+        r"""
+        Main method to prepare audio to be fed as input to the model. This method forwards the `audio`
+        arguments to KyutaiSpeechToTextFeatureExtractor's [`~KyutaiSpeechToTextFeatureExtractor.__call__`]. Please refer
+        to the docstring of the above method for more information.
+
+        Args:
+            audio (`np.ndarray`, `torch.Tensor`, `list[np.ndarray]`, `list[torch.Tensor]`):
+                The audio or batch of audio to be prepared. Each audio can be a NumPy array or PyTorch
+                tensor. Required: although the parameter defaults to `None`, passing `None` raises an error.
+            return_tensors (`str` or [`~utils.TensorType`], *optional*):
+                If set, will return tensors of a particular framework. Acceptable values are:
+                    - `'tf'`: Return TensorFlow `tf.constant` objects.
+                    - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                    - `'np'`: Return NumPy `np.ndarray` objects.
+                    - `'jax'`: Return JAX `jnp.ndarray` objects.
+        Returns:
+            [`BatchFeature`]: A [`BatchFeature`] with the following fields:
+
+            - **input_values** -- List of audio values to be fed to a model.
+            - **padding_mask** -- List of indices specifying which input values should be ignored by the model.
+
+        Raises:
+            ValueError: If `audio` is `None`.
+        """
+
+        if audio is None:
+            raise ValueError("`audio` is required.")
+
+        # merge the call-time kwargs with the class defaults and the tokenizer init kwargs,
+        # only the audio group is used below
+        output_kwargs = self._merge_kwargs(
+            KyutaiSpeechToTextProcessorKwargs,
+            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
+            **kwargs,
+        )
+        audio_kwargs = output_kwargs["audio_kwargs"]
+
+        # ensure audio in correct format
+        audio = make_list_of_audio(audio)
+
+        inputs = self.feature_extractor(
+            audio,
+            **audio_kwargs,
+        )
+
+        return inputs
+
+
+# names exported by `from <this module> import *`
+__all__ = ["KyutaiSpeechToTextProcessor"]
diff --git a/phivenv/Lib/site-packages/transformers/models/layoutlm/__init__.py b/phivenv/Lib/site-packages/transformers/models/layoutlm/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..0f079c33c7157898807c5f405086804cc1533ff9
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/layoutlm/__init__.py
@@ -0,0 +1,30 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    # only evaluated by static type checkers: expose the public names of every submodule
+    from .configuration_layoutlm import *
+    from .modeling_layoutlm import *
+    from .modeling_tf_layoutlm import *
+    from .tokenization_layoutlm import *
+    from .tokenization_layoutlm_fast import *
+else:
+    import sys
+
+    # at runtime, replace this package's entry in `sys.modules` with a `_LazyModule`
+    # built from the import structure derived from this file
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/phivenv/Lib/site-packages/transformers/models/layoutlm/__pycache__/__init__.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/layoutlm/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..05bd8e579288eb6703482f2f948d864647b00b2b
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/layoutlm/__pycache__/__init__.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/layoutlm/__pycache__/configuration_layoutlm.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/layoutlm/__pycache__/configuration_layoutlm.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5440ade0ec2f1881a6ac4a3fa00b0d05c88f8d31
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/layoutlm/__pycache__/configuration_layoutlm.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/layoutlm/__pycache__/modeling_layoutlm.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/layoutlm/__pycache__/modeling_layoutlm.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b98ddd7261f50a2a8ce2c8e5a5b291ab5e7b8fed
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/layoutlm/__pycache__/modeling_layoutlm.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/layoutlm/__pycache__/modeling_tf_layoutlm.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/layoutlm/__pycache__/modeling_tf_layoutlm.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..be081886df001a3e4e97bcb0649009216dac43bc
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/layoutlm/__pycache__/modeling_tf_layoutlm.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/layoutlm/__pycache__/tokenization_layoutlm.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/layoutlm/__pycache__/tokenization_layoutlm.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..067d3287aef8c72f79047c25e1037710540525f2
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/layoutlm/__pycache__/tokenization_layoutlm.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/layoutlm/__pycache__/tokenization_layoutlm_fast.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/layoutlm/__pycache__/tokenization_layoutlm_fast.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fb0b741086f5eb936fbaadac5477349d794ee367
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/layoutlm/__pycache__/tokenization_layoutlm_fast.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/layoutlm/configuration_layoutlm.py b/phivenv/Lib/site-packages/transformers/models/layoutlm/configuration_layoutlm.py
new file mode 100644
index 0000000000000000000000000000000000000000..95bc2eda6faba3d0f507344d53d553f818648f8b
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/layoutlm/configuration_layoutlm.py
@@ -0,0 +1,213 @@
+# coding=utf-8
+# Copyright 2010, The Microsoft Research Asia LayoutLM Team authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""LayoutLM model configuration"""
+
+import warnings
+from collections import OrderedDict
+from collections.abc import Mapping
+from typing import Any, Optional
+
+from ... import PretrainedConfig, PreTrainedTokenizer
+from ...onnx import OnnxConfig, PatchingSpec
+from ...utils import TensorType, is_torch_available, logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class LayoutLMConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`LayoutLMModel`]. It is used to instantiate a
+    LayoutLM model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the LayoutLM
+    [microsoft/layoutlm-base-uncased](https://huggingface.co/microsoft/layoutlm-base-uncased) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 30522):
+            Vocabulary size of the LayoutLM model. Defines the different tokens that can be represented by the
+            `input_ids` passed to the forward method of [`LayoutLMModel`].
+        hidden_size (`int`, *optional*, defaults to 768):
+            Dimensionality of the encoder layers and the pooler layer.
+        num_hidden_layers (`int`, *optional*, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 12):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        intermediate_size (`int`, *optional*, defaults to 3072):
+            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"silu"` and `"gelu_new"` are supported.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout ratio for the attention probabilities.
+        max_position_embeddings (`int`, *optional*, defaults to 512):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            just in case (e.g., 512 or 1024 or 2048).
+        type_vocab_size (`int`, *optional*, defaults to 2):
+            The vocabulary size of the `token_type_ids` passed into [`LayoutLMModel`].
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
+            The epsilon used by the layer normalization layers.
+        pad_token_id (`int`, *optional*, defaults to 0):
+            The value used to pad input_ids.
+        position_embedding_type (`str`, *optional*, defaults to `"absolute"`):
+            Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For
+            positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to
+            [Self-Attention with Relative Position Representations (Shaw et al.)](https://huggingface.co/papers/1803.02155).
+            For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models
+            with Better Relative Position Embeddings (Huang et al.)](https://huggingface.co/papers/2009.13658).
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        max_2d_position_embeddings (`int`, *optional*, defaults to 1024):
+            The maximum value that the 2D position embedding might ever be used with. Typically set this to something
+            large just in case (e.g., 1024).
+
+    Examples:
+
+    ```python
+    >>> from transformers import LayoutLMConfig, LayoutLMModel
+
+    >>> # Initializing a LayoutLM configuration
+    >>> configuration = LayoutLMConfig()
+
+    >>> # Initializing a model (with random weights) from the configuration
+    >>> model = LayoutLMModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "layoutlm"
+
+    def __init__(
+        self,
+        vocab_size=30522,
+        hidden_size=768,
+        num_hidden_layers=12,
+        num_attention_heads=12,
+        intermediate_size=3072,
+        hidden_act="gelu",
+        hidden_dropout_prob=0.1,
+        attention_probs_dropout_prob=0.1,
+        max_position_embeddings=512,
+        type_vocab_size=2,
+        initializer_range=0.02,
+        layer_norm_eps=1e-12,
+        pad_token_id=0,
+        position_embedding_type="absolute",
+        use_cache=True,
+        max_2d_position_embeddings=1024,
+        **kwargs,
+    ):
+        super().__init__(pad_token_id=pad_token_id, **kwargs)
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.hidden_act = hidden_act
+        self.intermediate_size = intermediate_size
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.max_position_embeddings = max_position_embeddings
+        self.type_vocab_size = type_vocab_size
+        self.initializer_range = initializer_range
+        self.layer_norm_eps = layer_norm_eps
+        self._position_embedding_type = position_embedding_type
+        self.use_cache = use_cache
+        self.max_2d_position_embeddings = max_2d_position_embeddings
+
+    @property
+    def position_embedding_type(self):
+        warnings.warn(
+            "The `position_embedding_type` attribute is deprecated and will be removed in v4.55.",
+            FutureWarning,
+        )
+        return self._position_embedding_type
+
+    @position_embedding_type.setter
+    def position_embedding_type(self, value):
+        self._position_embedding_type = value
+
+
+class LayoutLMOnnxConfig(OnnxConfig):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        task: str = "default",
+        patching_specs: Optional[list[PatchingSpec]] = None,
+    ):
+        super().__init__(config, task=task, patching_specs=patching_specs)
+        self.max_2d_positions = config.max_2d_position_embeddings - 1
+
+    @property
+    def inputs(self) -> Mapping[str, Mapping[int, str]]:
+        return OrderedDict(
+            [
+                ("input_ids", {0: "batch", 1: "sequence"}),
+                ("bbox", {0: "batch", 1: "sequence"}),
+                ("attention_mask", {0: "batch", 1: "sequence"}),
+                ("token_type_ids", {0: "batch", 1: "sequence"}),
+            ]
+        )
+
+    def generate_dummy_inputs(
+        self,
+        tokenizer: PreTrainedTokenizer,
+        batch_size: int = -1,
+        seq_length: int = -1,
+        is_pair: bool = False,
+        framework: Optional[TensorType] = None,
+    ) -> Mapping[str, Any]:
+        """
+        Generate inputs to provide to the ONNX exporter for the specific framework
+
+        Args:
+            tokenizer: The tokenizer associated with this model configuration
+            batch_size: The batch size (int) to export the model for (-1 means dynamic axis)
+            seq_length: The sequence length (int) to export the model for (-1 means dynamic axis)
+            is_pair: Indicate if the input is a pair (sentence 1, sentence 2)
+            framework: The framework (optional) the tokenizer will generate tensors for
+
+        Returns:
+            Mapping[str, Tensor] holding the kwargs to provide to the model's forward function
+        """
+
+        input_dict = super().generate_dummy_inputs(
+            tokenizer, batch_size=batch_size, seq_length=seq_length, is_pair=is_pair, framework=framework
+        )
+
+        # Generate a dummy bbox
+        box = [48, 84, 73, 128]
+
+        if not framework == TensorType.PYTORCH:
+            raise NotImplementedError("Exporting LayoutLM to ONNX is currently only supported for PyTorch.")
+
+        if not is_torch_available():
+            raise ValueError("Cannot generate dummy inputs without PyTorch installed.")
+        import torch
+
+        batch_size, seq_length = input_dict["input_ids"].shape
+        input_dict["bbox"] = torch.tensor([*[box] * seq_length]).tile(batch_size, 1, 1)
+        return input_dict
+
+
+__all__ = ["LayoutLMConfig", "LayoutLMOnnxConfig"]
diff --git a/phivenv/Lib/site-packages/transformers/models/layoutlm/modeling_layoutlm.py b/phivenv/Lib/site-packages/transformers/models/layoutlm/modeling_layoutlm.py
new file mode 100644
index 0000000000000000000000000000000000000000..b3b79ef99d38cc1e2932e4d759f8719fd739cfc3
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/layoutlm/modeling_layoutlm.py
@@ -0,0 +1,1145 @@
+# coding=utf-8
+# Copyright 2018 The Microsoft Research Asia LayoutLM Team Authors and the HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch LayoutLM model."""
+
+from typing import Callable, Optional, Union
+
+import torch
+import torch.utils.checkpoint
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+
+from ...activations import ACT2FN
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import (
+    BaseModelOutput,
+    BaseModelOutputWithPooling,
+    MaskedLMOutput,
+    QuestionAnsweringModelOutput,
+    SequenceClassifierOutput,
+    TokenClassifierOutput,
+)
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
+from ...utils import auto_docstring, can_return_tuple, logging
+from .configuration_layoutlm import LayoutLMConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+LayoutLMLayerNorm = nn.LayerNorm
+
+
+class LayoutLMEmbeddings(nn.Module):
+    """Construct the embeddings from word, position, 2D bounding-box position and token_type embeddings."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
+        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
+        self.x_position_embeddings = nn.Embedding(config.max_2d_position_embeddings, config.hidden_size)
+        self.y_position_embeddings = nn.Embedding(config.max_2d_position_embeddings, config.hidden_size)
+        self.h_position_embeddings = nn.Embedding(config.max_2d_position_embeddings, config.hidden_size)
+        self.w_position_embeddings = nn.Embedding(config.max_2d_position_embeddings, config.hidden_size)
+        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
+
+        self.LayerNorm = LayoutLMLayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+        self.register_buffer(
+            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
+        )
+
+    def forward(
+        self,
+        input_ids=None,
+        bbox=None,
+        token_type_ids=None,
+        position_ids=None,
+        inputs_embeds=None,
+    ):
+        if input_ids is not None:
+            input_shape = input_ids.size()
+        else:
+            input_shape = inputs_embeds.size()[:-1]
+
+        seq_length = input_shape[1]
+
+        device = input_ids.device if input_ids is not None else inputs_embeds.device
+
+        if position_ids is None:
+            position_ids = self.position_ids[:, :seq_length]
+
+        if token_type_ids is None:
+            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
+
+        if inputs_embeds is None:
+            inputs_embeds = self.word_embeddings(input_ids)
+
+        words_embeddings = inputs_embeds
+        position_embeddings = self.position_embeddings(position_ids)
+        try:
+            left_position_embeddings = self.x_position_embeddings(bbox[:, :, 0])
+            upper_position_embeddings = self.y_position_embeddings(bbox[:, :, 1])
+            right_position_embeddings = self.x_position_embeddings(bbox[:, :, 2])
+            lower_position_embeddings = self.y_position_embeddings(bbox[:, :, 3])
+        except IndexError as e:
+            raise IndexError("The `bbox`coordinate values should be within 0-1000 range.") from e
+
+        h_position_embeddings = self.h_position_embeddings(bbox[:, :, 3] - bbox[:, :, 1])
+        w_position_embeddings = self.w_position_embeddings(bbox[:, :, 2] - bbox[:, :, 0])
+        token_type_embeddings = self.token_type_embeddings(token_type_ids)
+
+        embeddings = (
+            words_embeddings
+            + position_embeddings
+            + left_position_embeddings
+            + upper_position_embeddings
+            + right_position_embeddings
+            + lower_position_embeddings
+            + h_position_embeddings
+            + w_position_embeddings
+            + token_type_embeddings
+        )
+        embeddings = self.LayerNorm(embeddings)
+        embeddings = self.dropout(embeddings)
+        return embeddings
+
+
+# Copied from transformers.models.align.modeling_align.eager_attention_forward
+def eager_attention_forward(
+    module: nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: Optional[torch.Tensor],
+    scaling: float,
+    dropout: float = 0.0,
+    head_mask: Optional[torch.Tensor] = None,
+    **kwargs,
+):
+    attn_weights = torch.matmul(query, key.transpose(2, 3)) * scaling
+    if attention_mask is not None:
+        causal_mask = attention_mask[:, :, :, : key.shape[-2]]
+        attn_weights = attn_weights + causal_mask
+
+    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
+    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
+
+    if head_mask is not None:
+        attn_weights = attn_weights * head_mask.view(1, -1, 1, 1)
+
+    attn_output = torch.matmul(attn_weights, value)
+    attn_output = attn_output.transpose(1, 2).contiguous()
+    return attn_output, attn_weights
+
+
+# Copied from transformers.models.align.modeling_align.AlignTextSelfAttention with AlignText->LayoutLM
+class LayoutLMSelfAttention(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
+            raise ValueError(
+                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
+                f"heads ({config.num_attention_heads})"
+            )
+
+        self.config = config
+        self.num_attention_heads = config.num_attention_heads
+        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
+        self.all_head_size = self.num_attention_heads * self.attention_head_size
+
+        self.query = nn.Linear(config.hidden_size, self.all_head_size)
+        self.key = nn.Linear(config.hidden_size, self.all_head_size)
+        self.value = nn.Linear(config.hidden_size, self.all_head_size)
+
+        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
+        self.attention_dropout = config.attention_probs_dropout_prob
+        self.scaling = self.attention_head_size**-0.5
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = False,
+        **kwargs,
+    ) -> tuple[torch.Tensor]:
+        input_shape = hidden_states.shape[:-1]
+        hidden_shape = (*input_shape, -1, self.attention_head_size)
+
+        query_states = self.query(hidden_states).view(hidden_shape).transpose(1, 2)
+        key_states = self.key(hidden_states).view(hidden_shape).transpose(1, 2)
+        value_states = self.value(hidden_states).view(hidden_shape).transpose(1, 2)
+
+        attention_interface: Callable = eager_attention_forward
+        if self.config._attn_implementation != "eager":
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            dropout=0.0 if not self.training else self.attention_dropout,
+            scaling=self.scaling,
+            head_mask=head_mask,
+            **kwargs,
+        )
+
+        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
+        outputs = (attn_output, attn_weights) if output_attentions else (attn_output,)
+        return outputs
+
+
+# Copied from transformers.models.bert.modeling_bert.BertSelfOutput with Bert->LayoutLM
+class LayoutLMSelfOutput(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = self.LayerNorm(hidden_states + input_tensor)
+        return hidden_states
+
+
+# Copied from transformers.models.align.modeling_align.AlignTextAttention with AlignText->LayoutLM
+class LayoutLMAttention(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.self = LayoutLMSelfAttention(config)
+        self.output = LayoutLMSelfOutput(config)
+        self.pruned_heads = set()
+
+    def prune_heads(self, heads):
+        if len(heads) == 0:
+            return
+        heads, index = find_pruneable_heads_and_indices(
+            heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
+        )
+
+        # Prune linear layers
+        self.self.query = prune_linear_layer(self.self.query, index)
+        self.self.key = prune_linear_layer(self.self.key, index)
+        self.self.value = prune_linear_layer(self.self.value, index)
+        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
+
+        # Update hyper params and store pruned heads
+        self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
+        self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
+        self.pruned_heads = self.pruned_heads.union(heads)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = False,
+        **kwargs,
+    ) -> tuple[torch.Tensor]:
+        self_outputs = self.self(
+            hidden_states,
+            attention_mask=attention_mask,
+            head_mask=head_mask,
+            output_attentions=output_attentions,
+            **kwargs,
+        )
+        attention_output = self.output(self_outputs[0], hidden_states)
+        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
+        return outputs
+
+
+# Copied from transformers.models.bert.modeling_bert.BertIntermediate
+class LayoutLMIntermediate(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
+        if isinstance(config.hidden_act, str):
+            self.intermediate_act_fn = ACT2FN[config.hidden_act]
+        else:
+            self.intermediate_act_fn = config.hidden_act
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.intermediate_act_fn(hidden_states)
+        return hidden_states
+
+
+# Copied from transformers.models.bert.modeling_bert.BertOutput with Bert->LayoutLM
+class LayoutLMOutput(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
+        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = self.LayerNorm(hidden_states + input_tensor)
+        return hidden_states
+
+
+# Copied from transformers.models.align.modeling_align.AlignTextLayer with AlignText->LayoutLM
+class LayoutLMLayer(GradientCheckpointingLayer):
+    def __init__(self, config):
+        super().__init__()
+        self.chunk_size_feed_forward = config.chunk_size_feed_forward
+        self.seq_len_dim = 1
+        self.attention = LayoutLMAttention(config)
+        self.intermediate = LayoutLMIntermediate(config)
+        self.output = LayoutLMOutput(config)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = False,
+        **kwargs,
+    ) -> tuple[torch.Tensor]:
+        self_attention_outputs = self.attention(
+            hidden_states,
+            attention_mask=attention_mask,
+            head_mask=head_mask,
+            output_attentions=output_attentions,
+            **kwargs,
+        )
+        attention_output = self_attention_outputs[0]
+
+        outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights
+        layer_output = apply_chunking_to_forward(
+            self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
+        )
+        outputs = (layer_output,) + outputs
+
+        return outputs
+
+    def feed_forward_chunk(self, attention_output):
+        intermediate_output = self.intermediate(attention_output)
+        layer_output = self.output(intermediate_output, attention_output)
+        return layer_output
+
+
+# Copied from transformers.models.align.modeling_align.AlignTextEncoder with AlignText->LayoutLM
+class LayoutLMEncoder(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.layer = nn.ModuleList([LayoutLMLayer(config) for i in range(config.num_hidden_layers)])
+        self.gradient_checkpointing = False
+
+    @can_return_tuple
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = False,
+        output_hidden_states: Optional[bool] = False,
+        return_dict: Optional[bool] = True,
+        **kwargs,
+    ) -> Union[tuple[torch.Tensor], BaseModelOutput]:
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attentions = () if output_attentions else None
+
+        for i, layer_module in enumerate(self.layer):
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)
+
+            layer_head_mask = head_mask[i] if head_mask is not None else None
+
+            layer_outputs = layer_module(
+                hidden_states=hidden_states,
+                attention_mask=attention_mask,
+                head_mask=layer_head_mask,
+                output_attentions=output_attentions,
+                **kwargs,
+            )
+
+            hidden_states = layer_outputs[0]
+            if output_attentions:
+                all_self_attentions = all_self_attentions + (layer_outputs[1],)
+
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+
+        return BaseModelOutput(
+            last_hidden_state=hidden_states,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attentions,
+        )
+
+
+# Copied from transformers.models.bert.modeling_bert.BertPooler
+class LayoutLMPooler(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        self.activation = nn.Tanh()
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        # We "pool" the model by simply taking the hidden state corresponding
+        # to the first token.
+        first_token_tensor = hidden_states[:, 0]
+        pooled_output = self.dense(first_token_tensor)
+        pooled_output = self.activation(pooled_output)
+        return pooled_output
+
+
+# Copied from transformers.models.bert.modeling_bert.BertPredictionHeadTransform with Bert->LayoutLM
+class LayoutLMPredictionHeadTransform(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        if isinstance(config.hidden_act, str):
+            self.transform_act_fn = ACT2FN[config.hidden_act]
+        else:
+            self.transform_act_fn = config.hidden_act
+        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.transform_act_fn(hidden_states)
+        hidden_states = self.LayerNorm(hidden_states)
+        return hidden_states
+
+
+# Copied from transformers.models.bert.modeling_bert.BertLMPredictionHead with Bert->LayoutLM
+class LayoutLMLMPredictionHead(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.transform = LayoutLMPredictionHeadTransform(config)
+
+        # The output weights are the same as the input embeddings, but there is
+        # an output-only bias for each token.
+        self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+        self.bias = nn.Parameter(torch.zeros(config.vocab_size))
+
+        # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
+        self.decoder.bias = self.bias
+
+    def _tie_weights(self):
+        self.decoder.bias = self.bias
+
+    def forward(self, hidden_states):
+        hidden_states = self.transform(hidden_states)
+        hidden_states = self.decoder(hidden_states)
+        return hidden_states
+
+
+# Copied from transformers.models.bert.modeling_bert.BertOnlyMLMHead with Bert->LayoutLM
+class LayoutLMOnlyMLMHead(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.predictions = LayoutLMLMPredictionHead(config)
+
+    def forward(self, sequence_output: torch.Tensor) -> torch.Tensor:
+        prediction_scores = self.predictions(sequence_output)
+        return prediction_scores
+
+
+@auto_docstring
+class LayoutLMPreTrainedModel(PreTrainedModel):
+    config: LayoutLMConfig
+    base_model_prefix = "layoutlm"
+    supports_gradient_checkpointing = True
+
+    def _init_weights(self, module):
+        """Initialize the weights"""
+        if isinstance(module, nn.Linear):
+            # Slightly different from the TF version which uses truncated_normal for initialization
+            # cf https://github.com/pytorch/pytorch/pull/5617
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+        elif isinstance(module, LayoutLMLayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+        elif isinstance(module, LayoutLMLMPredictionHead):
+            module.bias.data.zero_()
+
+
+@auto_docstring
+class LayoutLMModel(LayoutLMPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.config = config
+        # Backbone stages, applied in this order by forward(): embeddings -> encoder -> pooler.
+        self.embeddings = LayoutLMEmbeddings(config)
+        self.encoder = LayoutLMEncoder(config)
+        self.pooler = LayoutLMPooler(config)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.embeddings.word_embeddings
+
+    def set_input_embeddings(self, value):
+        self.embeddings.word_embeddings = value
+
+    def _prune_heads(self, heads_to_prune):
+        """
+        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
+        class PreTrainedModel
+        """
+        for layer, heads in heads_to_prune.items():
+            self.encoder.layer[layer].attention.prune_heads(heads)
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        bbox: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, BaseModelOutputWithPooling]:
+        r"""
+        bbox (`torch.LongTensor` of shape `(batch_size, sequence_length, 4)`, *optional*):
+            Bounding boxes of each input sequence tokens. Selected in the range `[0,
+            config.max_2d_position_embeddings-1]`. Each bounding box should be a normalized version in (x0, y0, x1, y1)
+            format, where (x0, y0) corresponds to the position of the upper left corner in the bounding box, and (x1,
+            y1) represents the position of the lower right corner. See [Overview](#Overview) for normalization.
+
+        Examples:
+
+        ```python
+        >>> from transformers import AutoTokenizer, LayoutLMModel
+        >>> import torch
+
+        >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/layoutlm-base-uncased")
+        >>> model = LayoutLMModel.from_pretrained("microsoft/layoutlm-base-uncased")
+
+        >>> words = ["Hello", "world"]
+        >>> normalized_word_boxes = [637, 773, 693, 782], [698, 773, 733, 782]
+
+        >>> token_boxes = []
+        >>> for word, box in zip(words, normalized_word_boxes):
+        ...     word_tokens = tokenizer.tokenize(word)
+        ...     token_boxes.extend([box] * len(word_tokens))
+        >>> # add bounding boxes of cls + sep tokens
+        >>> token_boxes = [[0, 0, 0, 0]] + token_boxes + [[1000, 1000, 1000, 1000]]
+
+        >>> encoding = tokenizer(" ".join(words), return_tensors="pt")
+        >>> input_ids = encoding["input_ids"]
+        >>> attention_mask = encoding["attention_mask"]
+        >>> token_type_ids = encoding["token_type_ids"]
+        >>> bbox = torch.tensor([token_boxes])
+
+        >>> outputs = model(
+        ...     input_ids=input_ids, bbox=bbox, attention_mask=attention_mask, token_type_ids=token_type_ids
+        ... )
+
+        >>> last_hidden_states = outputs.last_hidden_state
+        ```"""
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        # NOTE(review): `return_dict` is not read again below; presumably @can_return_tuple handles it (confirm).
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+        elif input_ids is not None:
+            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
+            input_shape = input_ids.size()
+        elif inputs_embeds is not None:
+            input_shape = inputs_embeds.size()[:-1]
+        else:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+        device = input_ids.device if input_ids is not None else inputs_embeds.device
+        # Defaults for omitted inputs: all-ones attention mask, all-zero token type ids, all-zero boxes.
+        if attention_mask is None:
+            attention_mask = torch.ones(input_shape, device=device)
+        if token_type_ids is None:
+            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
+
+        if bbox is None:
+            bbox = torch.zeros(input_shape + (4,), dtype=torch.long, device=device)
+        # Insert singleton dims at positions 1 and 2 of the attention mask.
+        extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
+        # Make it additive in the model dtype: 1 maps to 0.0, 0 maps to torch.finfo(self.dtype).min.
+        extended_attention_mask = extended_attention_mask.to(dtype=self.dtype)
+        extended_attention_mask = (1.0 - extended_attention_mask) * torch.finfo(self.dtype).min
+        # Bring a 1-D or 2-D head_mask to 5-D in the parameter dtype; with no mask, use one None per layer.
+        if head_mask is not None:
+            if head_mask.dim() == 1:
+                head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
+                head_mask = head_mask.expand(self.config.num_hidden_layers, -1, -1, -1, -1)
+            elif head_mask.dim() == 2:
+                head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)
+            head_mask = head_mask.to(dtype=next(self.parameters()).dtype)
+        else:
+            head_mask = [None] * self.config.num_hidden_layers
+
+        embedding_output = self.embeddings(
+            input_ids=input_ids,
+            bbox=bbox,
+            position_ids=position_ids,
+            token_type_ids=token_type_ids,
+            inputs_embeds=inputs_embeds,
+        )
+        encoder_outputs = self.encoder(
+            embedding_output,
+            extended_attention_mask,
+            head_mask=head_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=True,
+        )
+        sequence_output = encoder_outputs[0]
+        pooled_output = self.pooler(sequence_output)
+
+        return BaseModelOutputWithPooling(
+            last_hidden_state=sequence_output,
+            pooler_output=pooled_output,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+        )
+
+
+@auto_docstring
+class LayoutLMForMaskedLM(LayoutLMPreTrainedModel):
+    _tied_weights_keys = ["cls.predictions.decoder.bias", "cls.predictions.decoder.weight"]
+
+    def __init__(self, config):
+        super().__init__(config)
+        # Backbone plus an MLM-only head; forward() feeds the backbone's first output to the head.
+        self.layoutlm = LayoutLMModel(config)
+        self.cls = LayoutLMOnlyMLMHead(config)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.layoutlm.embeddings.word_embeddings
+
+    def get_output_embeddings(self):
+        return self.cls.predictions.decoder
+
+    def set_output_embeddings(self, new_embeddings):
+        self.cls.predictions.decoder = new_embeddings
+        self.cls.predictions.bias = new_embeddings.bias  # keep the head's own bias attribute on the new decoder's bias
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        bbox: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, MaskedLMOutput]:
+        r"""
+        bbox (`torch.LongTensor` of shape `(batch_size, sequence_length, 4)`, *optional*):
+            Bounding boxes of each input sequence tokens. Selected in the range `[0,
+            config.max_2d_position_embeddings-1]`. Each bounding box should be a normalized version in (x0, y0, x1, y1)
+            format, where (x0, y0) corresponds to the position of the upper left corner in the bounding box, and (x1,
+            y1) represents the position of the lower right corner. See [Overview](#Overview) for normalization.
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
+            config.vocab_size - 1]` (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+            (masked); the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size - 1]`.
+
+        Examples:
+
+        ```python
+        >>> from transformers import AutoTokenizer, LayoutLMForMaskedLM
+        >>> import torch
+
+        >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/layoutlm-base-uncased")
+        >>> model = LayoutLMForMaskedLM.from_pretrained("microsoft/layoutlm-base-uncased")
+
+        >>> words = ["Hello", "[MASK]"]
+        >>> normalized_word_boxes = [637, 773, 693, 782], [698, 773, 733, 782]
+
+        >>> token_boxes = []
+        >>> for word, box in zip(words, normalized_word_boxes):
+        ...     word_tokens = tokenizer.tokenize(word)
+        ...     token_boxes.extend([box] * len(word_tokens))
+        >>> # add bounding boxes of cls + sep tokens
+        >>> token_boxes = [[0, 0, 0, 0]] + token_boxes + [[1000, 1000, 1000, 1000]]
+
+        >>> encoding = tokenizer(" ".join(words), return_tensors="pt")
+        >>> input_ids = encoding["input_ids"]
+        >>> attention_mask = encoding["attention_mask"]
+        >>> token_type_ids = encoding["token_type_ids"]
+        >>> bbox = torch.tensor([token_boxes])
+
+        >>> labels = tokenizer("Hello world", return_tensors="pt")["input_ids"]
+
+        >>> outputs = model(
+        ...     input_ids=input_ids,
+        ...     bbox=bbox,
+        ...     attention_mask=attention_mask,
+        ...     token_type_ids=token_type_ids,
+        ...     labels=labels,
+        ... )
+
+        >>> loss = outputs.loss
+        ```"""
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        # NOTE(review): `return_dict` is not read again below; presumably @can_return_tuple handles it (confirm).
+        outputs = self.layoutlm(
+            input_ids,
+            bbox,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=True,
+        )
+
+        sequence_output = outputs[0]
+        prediction_scores = self.cls(sequence_output)
+        # Cross-entropy over scores flattened to (-1, vocab_size) against labels flattened to (-1).
+        masked_lm_loss = None
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()
+            masked_lm_loss = loss_fct(
+                prediction_scores.view(-1, self.config.vocab_size),
+                labels.view(-1),
+            )
+
+        return MaskedLMOutput(
+            loss=masked_lm_loss,
+            logits=prediction_scores,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    LayoutLM Model with a sequence classification head on top (a linear layer on top of the pooled output) e.g. for
+    document image classification tasks such as the [RVL-CDIP](https://www.cs.cmu.edu/~aharley/rvl-cdip/) dataset.
+    """
+)
+class LayoutLMForSequenceClassification(LayoutLMPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+        self.layoutlm = LayoutLMModel(config)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.layoutlm.embeddings.word_embeddings
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        bbox: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, SequenceClassifierOutput]:
+        r"""
+        bbox (`torch.LongTensor` of shape `(batch_size, sequence_length, 4)`, *optional*):
+            Bounding boxes of each input sequence tokens. Selected in the range `[0,
+            config.max_2d_position_embeddings-1]`. Each bounding box should be a normalized version in (x0, y0, x1, y1)
+            format, where (x0, y0) corresponds to the position of the upper left corner in the bounding box, and (x1,
+            y1) represents the position of the lower right corner. See [Overview](#Overview) for normalization.
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+
+        Examples:
+
+        ```python
+        >>> from transformers import AutoTokenizer, LayoutLMForSequenceClassification
+        >>> import torch
+
+        >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/layoutlm-base-uncased")
+        >>> model = LayoutLMForSequenceClassification.from_pretrained("microsoft/layoutlm-base-uncased")
+
+        >>> words = ["Hello", "world"]
+        >>> normalized_word_boxes = [637, 773, 693, 782], [698, 773, 733, 782]
+
+        >>> token_boxes = []
+        >>> for word, box in zip(words, normalized_word_boxes):
+        ...     word_tokens = tokenizer.tokenize(word)
+        ...     token_boxes.extend([box] * len(word_tokens))
+        >>> # add bounding boxes of cls + sep tokens
+        >>> token_boxes = [[0, 0, 0, 0]] + token_boxes + [[1000, 1000, 1000, 1000]]
+
+        >>> encoding = tokenizer(" ".join(words), return_tensors="pt")
+        >>> input_ids = encoding["input_ids"]
+        >>> attention_mask = encoding["attention_mask"]
+        >>> token_type_ids = encoding["token_type_ids"]
+        >>> bbox = torch.tensor([token_boxes])
+        >>> sequence_label = torch.tensor([1])
+
+        >>> outputs = model(
+        ...     input_ids=input_ids,
+        ...     bbox=bbox,
+        ...     attention_mask=attention_mask,
+        ...     token_type_ids=token_type_ids,
+        ...     labels=sequence_label,
+        ... )
+
+        >>> loss = outputs.loss
+        >>> logits = outputs.logits
+        ```"""
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        # NOTE(review): `return_dict` is not read again below; presumably @can_return_tuple handles it (confirm).
+        outputs = self.layoutlm(
+            input_ids=input_ids,
+            bbox=bbox,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=True,
+        )
+        # Index 1 of the backbone output is presumably its pooler_output field (confirm against ModelOutput).
+        pooled_output = outputs[1]
+
+        pooled_output = self.dropout(pooled_output)
+        logits = self.classifier(pooled_output)
+        # When unset, problem_type is inferred from num_labels and the label dtype and stored on the config.
+        loss = None
+        if labels is not None:
+            if self.config.problem_type is None:
+                if self.num_labels == 1:
+                    self.config.problem_type = "regression"
+                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
+                    self.config.problem_type = "single_label_classification"
+                else:
+                    self.config.problem_type = "multi_label_classification"
+
+            if self.config.problem_type == "regression":
+                loss_fct = MSELoss()
+                if self.num_labels == 1:
+                    loss = loss_fct(logits.squeeze(), labels.squeeze())
+                else:
+                    loss = loss_fct(logits, labels)
+            elif self.config.problem_type == "single_label_classification":
+                loss_fct = CrossEntropyLoss()
+                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+            elif self.config.problem_type == "multi_label_classification":
+                loss_fct = BCEWithLogitsLoss()
+                loss = loss_fct(logits, labels)
+
+        return SequenceClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    LayoutLM Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
+    sequence labeling (information extraction) tasks such as the [FUNSD](https://guillaumejaume.github.io/FUNSD/)
+    dataset and the [SROIE](https://rrc.cvc.uab.es/?ch=13) dataset.
+    """
+)
+class LayoutLMForTokenClassification(LayoutLMPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+        self.layoutlm = LayoutLMModel(config)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.layoutlm.embeddings.word_embeddings
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        bbox: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, TokenClassifierOutput]:
+        r"""
+        bbox (`torch.LongTensor` of shape `(batch_size, sequence_length, 4)`, *optional*):
+            Bounding boxes of each input sequence tokens. Selected in the range `[0,
+            config.max_2d_position_embeddings-1]`. Each bounding box should be a normalized version in (x0, y0, x1, y1)
+            format, where (x0, y0) corresponds to the position of the upper left corner in the bounding box, and (x1,
+            y1) represents the position of the lower right corner. See [Overview](#Overview) for normalization.
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
+
+        Examples:
+
+        ```python
+        >>> from transformers import AutoTokenizer, LayoutLMForTokenClassification
+        >>> import torch
+
+        >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/layoutlm-base-uncased")
+        >>> model = LayoutLMForTokenClassification.from_pretrained("microsoft/layoutlm-base-uncased")
+
+        >>> words = ["Hello", "world"]
+        >>> normalized_word_boxes = [637, 773, 693, 782], [698, 773, 733, 782]
+
+        >>> token_boxes = []
+        >>> for word, box in zip(words, normalized_word_boxes):
+        ...     word_tokens = tokenizer.tokenize(word)
+        ...     token_boxes.extend([box] * len(word_tokens))
+        >>> # add bounding boxes of cls + sep tokens
+        >>> token_boxes = [[0, 0, 0, 0]] + token_boxes + [[1000, 1000, 1000, 1000]]
+
+        >>> encoding = tokenizer(" ".join(words), return_tensors="pt")
+        >>> input_ids = encoding["input_ids"]
+        >>> attention_mask = encoding["attention_mask"]
+        >>> token_type_ids = encoding["token_type_ids"]
+        >>> bbox = torch.tensor([token_boxes])
+        >>> token_labels = torch.tensor([1, 1, 0, 0]).unsqueeze(0)  # batch size of 1
+
+        >>> outputs = model(
+        ...     input_ids=input_ids,
+        ...     bbox=bbox,
+        ...     attention_mask=attention_mask,
+        ...     token_type_ids=token_type_ids,
+        ...     labels=token_labels,
+        ... )
+
+        >>> loss = outputs.loss
+        >>> logits = outputs.logits
+        ```"""
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        # NOTE(review): `return_dict` is not read again below; presumably @can_return_tuple handles it (confirm).
+        outputs = self.layoutlm(
+            input_ids=input_ids,
+            bbox=bbox,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=True,
+        )
+
+        sequence_output = outputs[0]
+
+        sequence_output = self.dropout(sequence_output)
+        logits = self.classifier(sequence_output)
+        # Cross-entropy over logits flattened to (-1, num_labels) against labels flattened to (-1).
+        loss = None
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()
+            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+
+        return TokenClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+@auto_docstring
+class LayoutLMForQuestionAnswering(LayoutLMPreTrainedModel):
+    def __init__(self, config, has_visual_segment_embedding=True):
+        r"""
+        has_visual_segment_embedding (`bool`, *optional*, defaults to `True`):
+            Whether or not to add visual segment embeddings.
+        """
+        super().__init__(config)
+        self.num_labels = config.num_labels
+        # NOTE(review): `has_visual_segment_embedding` is accepted but never referenced in this constructor.
+        self.layoutlm = LayoutLMModel(config)
+        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.layoutlm.embeddings.word_embeddings
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        bbox: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        start_positions: Optional[torch.LongTensor] = None,
+        end_positions: Optional[torch.LongTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, QuestionAnsweringModelOutput]:
+        r"""
+        bbox (`torch.LongTensor` of shape `(batch_size, sequence_length, 4)`, *optional*):
+            Bounding boxes of each input sequence tokens. Selected in the range `[0,
+            config.max_2d_position_embeddings-1]`. Each bounding box should be a normalized version in (x0, y0, x1, y1)
+            format, where (x0, y0) corresponds to the position of the upper left corner in the bounding box, and (x1,
+            y1) represents the position of the lower right corner. See [Overview](#Overview) for normalization.
+
+        Example:
+
+        In the example below, we prepare a question + context pair for the LayoutLM model. It will give us a prediction
+        of what it thinks the answer is (the span of the answer within the texts parsed from the image).
+
+        ```python
+        >>> from transformers import AutoTokenizer, LayoutLMForQuestionAnswering
+        >>> from datasets import load_dataset
+        >>> import torch
+
+        >>> tokenizer = AutoTokenizer.from_pretrained("impira/layoutlm-document-qa", add_prefix_space=True)
+        >>> model = LayoutLMForQuestionAnswering.from_pretrained("impira/layoutlm-document-qa", revision="1e3ebac")
+
+        >>> dataset = load_dataset("nielsr/funsd", split="train")
+        >>> example = dataset[0]
+        >>> question = "what's his name?"
+        >>> words = example["words"]
+        >>> boxes = example["bboxes"]
+
+        >>> encoding = tokenizer(
+        ...     question.split(), words, is_split_into_words=True, return_token_type_ids=True, return_tensors="pt"
+        ... )
+        >>> bbox = []
+        >>> for i, s, w in zip(encoding.input_ids[0], encoding.sequence_ids(0), encoding.word_ids(0)):
+        ...     if s == 1:
+        ...         bbox.append(boxes[w])
+        ...     elif i == tokenizer.sep_token_id:
+        ...         bbox.append([1000] * 4)
+        ...     else:
+        ...         bbox.append([0] * 4)
+        >>> encoding["bbox"] = torch.tensor([bbox])
+
+        >>> word_ids = encoding.word_ids(0)
+        >>> outputs = model(**encoding)
+        >>> loss = outputs.loss
+        >>> start_scores = outputs.start_logits
+        >>> end_scores = outputs.end_logits
+        >>> start, end = word_ids[start_scores.argmax(-1)], word_ids[end_scores.argmax(-1)]
+        >>> print(" ".join(words[start : end + 1]))
+        M. Hamann P. Harper, P. Martinez
+        ```"""
+
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        # NOTE(review): `return_dict` is not read again below; presumably @can_return_tuple handles it (confirm).
+        outputs = self.layoutlm(
+            input_ids=input_ids,
+            bbox=bbox,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=True,
+        )
+
+        sequence_output = outputs[0]
+        # Split the last logits dim into start/end columns; the two-way unpack requires config.num_labels == 2.
+        logits = self.qa_outputs(sequence_output)
+        start_logits, end_logits = logits.split(1, dim=-1)
+        start_logits = start_logits.squeeze(-1).contiguous()
+        end_logits = end_logits.squeeze(-1).contiguous()
+
+        total_loss = None
+        if start_positions is not None and end_positions is not None:
+            # On multi-GPU, splitting the batch can add a dimension; squeeze a trailing size-1 dimension away
+            if len(start_positions.size()) > 1:
+                start_positions = start_positions.squeeze(-1)
+            if len(end_positions.size()) > 1:
+                end_positions = end_positions.squeeze(-1)
+            # positions past the end clamp to start_logits.size(1), the index the loss ignores; negatives clamp to 0
+            ignored_index = start_logits.size(1)
+            start_positions = start_positions.clamp(0, ignored_index)
+            end_positions = end_positions.clamp(0, ignored_index)
+
+            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
+            start_loss = loss_fct(start_logits, start_positions)
+            end_loss = loss_fct(end_logits, end_positions)
+            total_loss = (start_loss + end_loss) / 2
+
+        return QuestionAnsweringModelOutput(
+            loss=total_loss,
+            start_logits=start_logits,
+            end_logits=end_logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+__all__ = [  # names exported by a star-import of this module
+    "LayoutLMForMaskedLM",
+    "LayoutLMForSequenceClassification",
+    "LayoutLMForTokenClassification",
+    "LayoutLMForQuestionAnswering",
+    "LayoutLMModel",
+    "LayoutLMPreTrainedModel",
+]
diff --git a/phivenv/Lib/site-packages/transformers/models/layoutlm/modeling_tf_layoutlm.py b/phivenv/Lib/site-packages/transformers/models/layoutlm/modeling_tf_layoutlm.py
new file mode 100644
index 0000000000000000000000000000000000000000..f6738693843be0fe9af32cfc4fe96f3e6fbceb59
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/layoutlm/modeling_tf_layoutlm.py
@@ -0,0 +1,1691 @@
+# coding=utf-8
+# Copyright 2018 The Microsoft Research Asia LayoutLM Team Authors and the HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""TF 2.0 LayoutLM model."""
+
+from __future__ import annotations
+
+import math
+import warnings
+
+import numpy as np
+import tensorflow as tf
+
+from ...activations_tf import get_tf_activation
+from ...modeling_tf_outputs import (
+    TFBaseModelOutputWithPastAndCrossAttentions,
+    TFBaseModelOutputWithPoolingAndCrossAttentions,
+    TFMaskedLMOutput,
+    TFQuestionAnsweringModelOutput,
+    TFSequenceClassifierOutput,
+    TFTokenClassifierOutput,
+)
+from ...modeling_tf_utils import (
+    TFMaskedLanguageModelingLoss,
+    TFModelInputType,
+    TFPreTrainedModel,
+    TFQuestionAnsweringLoss,
+    TFSequenceClassificationLoss,
+    TFTokenClassificationLoss,
+    get_initializer,
+    keras,
+    keras_serializable,
+    unpack_inputs,
+)
+from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax
+from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
+from .configuration_layoutlm import LayoutLMConfig
+
+
+logger = logging.get_logger(__name__)  # module-level logger from the package's own `logging` utility
+
+_CONFIG_FOR_DOC = "LayoutLMConfig"  # config class name as a string; presumably consumed by the docstring decorators -- usage is outside this chunk
+
+
+class TFLayoutLMEmbeddings(keras.layers.Layer):
+    """Construct the embeddings from word, position and token_type embeddings."""
+
+    def __init__(self, config: LayoutLMConfig, **kwargs):
+        super().__init__(**kwargs)
+
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.max_position_embeddings = config.max_position_embeddings
+        self.max_2d_position_embeddings = config.max_2d_position_embeddings
+        self.initializer_range = config.initializer_range
+        self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
+        self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
+
+    def build(self, input_shape=None):
+        with tf.name_scope("word_embeddings"):
+            self.weight = self.add_weight(
+                name="weight",
+                shape=[self.config.vocab_size, self.hidden_size],
+                initializer=get_initializer(self.initializer_range),
+            )
+
+        with tf.name_scope("token_type_embeddings"):
+            self.token_type_embeddings = self.add_weight(
+                name="embeddings",
+                shape=[self.config.type_vocab_size, self.hidden_size],
+                initializer=get_initializer(self.initializer_range),
+            )
+
+        with tf.name_scope("position_embeddings"):
+            self.position_embeddings = self.add_weight(
+                name="embeddings",
+                shape=[self.max_position_embeddings, self.hidden_size],
+                initializer=get_initializer(self.initializer_range),
+            )
+
+        with tf.name_scope("x_position_embeddings"):
+            self.x_position_embeddings = self.add_weight(
+                name="embeddings",
+                shape=[self.max_2d_position_embeddings, self.hidden_size],
+                initializer=get_initializer(self.initializer_range),
+            )
+
+        with tf.name_scope("y_position_embeddings"):
+            self.y_position_embeddings = self.add_weight(
+                name="embeddings",
+                shape=[self.max_2d_position_embeddings, self.hidden_size],
+                initializer=get_initializer(self.initializer_range),
+            )
+
+        with tf.name_scope("h_position_embeddings"):
+            self.h_position_embeddings = self.add_weight(
+                name="embeddings",
+                shape=[self.max_2d_position_embeddings, self.hidden_size],
+                initializer=get_initializer(self.initializer_range),
+            )
+
+        with tf.name_scope("w_position_embeddings"):
+            self.w_position_embeddings = self.add_weight(
+                name="embeddings",
+                shape=[self.max_2d_position_embeddings, self.hidden_size],
+                initializer=get_initializer(self.initializer_range),
+            )
+
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "LayerNorm", None) is not None:
+            with tf.name_scope(self.LayerNorm.name):
+                self.LayerNorm.build([None, None, self.config.hidden_size])
+
+    def call(
+        self,
+        input_ids: tf.Tensor | None = None,
+        bbox: tf.Tensor | None = None,
+        position_ids: tf.Tensor | None = None,
+        token_type_ids: tf.Tensor | None = None,
+        inputs_embeds: tf.Tensor | None = None,
+        training: bool = False,
+    ) -> tf.Tensor:
+        """
+        Applies embedding based on inputs tensor.
+
+        Returns:
+            final_embeddings (`tf.Tensor`): output embedding tensor.
+        """
+        assert not (input_ids is None and inputs_embeds is None)
+
+        if input_ids is not None:
+            check_embeddings_within_bounds(input_ids, self.config.vocab_size)
+            inputs_embeds = tf.gather(params=self.weight, indices=input_ids)
+
+        input_shape = shape_list(inputs_embeds)[:-1]
+
+        if token_type_ids is None:
+            token_type_ids = tf.fill(dims=input_shape, value=0)
+
+        if position_ids is None:
+            position_ids = tf.expand_dims(tf.range(start=0, limit=input_shape[-1]), axis=0)
+
+        if position_ids is None:
+            position_ids = tf.expand_dims(tf.range(start=0, limit=input_shape[-1]), axis=0)
+
+        if bbox is None:
+            bbox = tf.fill(input_shape + [4], value=0)
+        try:
+            left_position_embeddings = tf.gather(self.x_position_embeddings, bbox[:, :, 0])
+            upper_position_embeddings = tf.gather(self.y_position_embeddings, bbox[:, :, 1])
+            right_position_embeddings = tf.gather(self.x_position_embeddings, bbox[:, :, 2])
+            lower_position_embeddings = tf.gather(self.y_position_embeddings, bbox[:, :, 3])
+        except IndexError as e:
+            raise IndexError("The `bbox`coordinate values should be within 0-1000 range.") from e
+        h_position_embeddings = tf.gather(self.h_position_embeddings, bbox[:, :, 3] - bbox[:, :, 1])
+        w_position_embeddings = tf.gather(self.w_position_embeddings, bbox[:, :, 2] - bbox[:, :, 0])
+
+        position_embeds = tf.gather(params=self.position_embeddings, indices=position_ids)
+        token_type_embeds = tf.gather(params=self.token_type_embeddings, indices=token_type_ids)
+        final_embeddings = (
+            inputs_embeds
+            + position_embeds
+            + token_type_embeds
+            + left_position_embeddings
+            + upper_position_embeddings
+            + right_position_embeddings
+            + lower_position_embeddings
+            + h_position_embeddings
+            + w_position_embeddings
+        )
+        final_embeddings = self.LayerNorm(inputs=final_embeddings)
+        final_embeddings = self.dropout(inputs=final_embeddings, training=training)
+
+        return final_embeddings
+
+
+# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfAttention with Bert->LayoutLM
+class TFLayoutLMSelfAttention(keras.layers.Layer):  # multi-head dot-product attention; handles self- and cross-attention plus an optional key/value cache
+    def __init__(self, config: LayoutLMConfig, **kwargs):
+        super().__init__(**kwargs)
+
+        if config.hidden_size % config.num_attention_heads != 0:  # the heads must split hidden_size evenly
+            raise ValueError(
+                f"The hidden size ({config.hidden_size}) is not a multiple of the number "
+                f"of attention heads ({config.num_attention_heads})"
+            )
+
+        self.num_attention_heads = config.num_attention_heads
+        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)  # divisibility was checked above
+        self.all_head_size = self.num_attention_heads * self.attention_head_size
+        self.sqrt_att_head_size = math.sqrt(self.attention_head_size)  # divisor applied to the raw scores in call()
+
+        self.query = keras.layers.Dense(
+            units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query"
+        )
+        self.key = keras.layers.Dense(
+            units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key"
+        )
+        self.value = keras.layers.Dense(
+            units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value"
+        )
+        self.dropout = keras.layers.Dropout(rate=config.attention_probs_dropout_prob)
+
+        self.is_decoder = config.is_decoder  # when true, call() also returns the (key, value) pair
+        self.config = config
+
+    def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor:  # split the feature axis into heads and move heads ahead of the sequence axis
+        # Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size]
+        tensor = tf.reshape(tensor=tensor, shape=(batch_size, -1, self.num_attention_heads, self.attention_head_size))
+
+        # Transpose the tensor from [batch_size, seq_length, num_attention_heads, attention_head_size] to [batch_size, num_attention_heads, seq_length, attention_head_size]
+        return tf.transpose(tensor, perm=[0, 2, 1, 3])
+
+    def call(
+        self,
+        hidden_states: tf.Tensor,
+        attention_mask: tf.Tensor,
+        head_mask: tf.Tensor,
+        encoder_hidden_states: tf.Tensor,
+        encoder_attention_mask: tf.Tensor,
+        past_key_value: tuple[tf.Tensor],
+        output_attentions: bool,
+        training: bool = False,
+    ) -> tuple[tf.Tensor]:  # returns (attention_output[, attention_probs][, (key_layer, value_layer)])
+        batch_size = shape_list(hidden_states)[0]
+        mixed_query_layer = self.query(inputs=hidden_states)
+
+        # If this is instantiated as a cross-attention module, the keys
+        # and values come from an encoder; the attention mask needs to be
+        # such that the encoder's padding tokens are not attended to.
+        is_cross_attention = encoder_hidden_states is not None
+
+        if is_cross_attention and past_key_value is not None:
+            # Cross-attention with a cache: reuse the cached keys/values as they are.
+            key_layer = past_key_value[0]
+            value_layer = past_key_value[1]
+            attention_mask = encoder_attention_mask
+        elif is_cross_attention:  # cross-attention without a cache: project the encoder states
+            key_layer = self.transpose_for_scores(self.key(inputs=encoder_hidden_states), batch_size)
+            value_layer = self.transpose_for_scores(self.value(inputs=encoder_hidden_states), batch_size)
+            attention_mask = encoder_attention_mask
+        elif past_key_value is not None:  # self-attention with a cache: append the new keys/values along axis 2
+            key_layer = self.transpose_for_scores(self.key(inputs=hidden_states), batch_size)
+            value_layer = self.transpose_for_scores(self.value(inputs=hidden_states), batch_size)
+            key_layer = tf.concat([past_key_value[0], key_layer], axis=2)
+            value_layer = tf.concat([past_key_value[1], value_layer], axis=2)
+        else:  # plain self-attention
+            key_layer = self.transpose_for_scores(self.key(inputs=hidden_states), batch_size)
+            value_layer = self.transpose_for_scores(self.value(inputs=hidden_states), batch_size)
+
+        query_layer = self.transpose_for_scores(mixed_query_layer, batch_size)
+
+        if self.is_decoder:
+            # if cross_attention save Tuple(tf.Tensor, tf.Tensor) of all cross attention key/value_states.
+            # Further calls to cross_attention layer can then reuse all cross-attention
+            # key/value_states (first "if" case)
+            # if uni-directional self-attention (decoder) save Tuple(tf.Tensor, tf.Tensor) of
+            # all previous decoder key/value_states. Further calls to uni-directional self-attention
+            # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
+            # if encoder bi-directional self-attention `past_key_value` is always `None`
+            past_key_value = (key_layer, value_layer)
+
+        # Take the dot product between "query" and "key" to get the raw attention scores.
+        # (batch size, num_heads, seq_len_q, seq_len_k)
+        attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)
+        dk = tf.cast(self.sqrt_att_head_size, dtype=attention_scores.dtype)  # match the scores' dtype before dividing
+        attention_scores = tf.divide(attention_scores, dk)
+
+        if attention_mask is not None:
+            # The mask is additive; per the original note it is precomputed for all layers in the TFLayoutLMModel call() function.
+            attention_scores = tf.add(attention_scores, attention_mask)
+
+        # Normalize the attention scores to probabilities.
+        attention_probs = stable_softmax(logits=attention_scores, axis=-1)
+
+        # This is actually dropping out entire tokens to attend to, which might
+        # seem a bit unusual, but is taken from the original Transformer paper.
+        attention_probs = self.dropout(inputs=attention_probs, training=training)
+
+        # Mask heads if we want to
+        if head_mask is not None:
+            attention_probs = tf.multiply(attention_probs, head_mask)
+
+        attention_output = tf.matmul(attention_probs, value_layer)
+        attention_output = tf.transpose(attention_output, perm=[0, 2, 1, 3])  # undo the head/sequence swap from transpose_for_scores
+
+        # (batch_size, seq_len_q, all_head_size)
+        attention_output = tf.reshape(tensor=attention_output, shape=(batch_size, -1, self.all_head_size))
+        outputs = (attention_output, attention_probs) if output_attentions else (attention_output,)
+
+        if self.is_decoder:
+            outputs = outputs + (past_key_value,)  # the cache pair is always the last element
+        return outputs
+
+    def build(self, input_shape=None):  # builds the three projections once, each under its own name scope
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "query", None) is not None:
+            with tf.name_scope(self.query.name):
+                self.query.build([None, None, self.config.hidden_size])
+        if getattr(self, "key", None) is not None:
+            with tf.name_scope(self.key.name):
+                self.key.build([None, None, self.config.hidden_size])
+        if getattr(self, "value", None) is not None:
+            with tf.name_scope(self.value.name):
+                self.value.build([None, None, self.config.hidden_size])
+
+
+# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfOutput with Bert->LayoutLM
+class TFLayoutLMSelfOutput(keras.layers.Layer):
+    def __init__(self, config: LayoutLMConfig, **kwargs):
+        super().__init__(**kwargs)
+
+        self.dense = keras.layers.Dense(
+            units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
+        )
+        self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
+        self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
+        self.config = config
+
+    def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
+        hidden_states = self.dense(inputs=hidden_states)
+        hidden_states = self.dropout(inputs=hidden_states, training=training)
+        hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor)
+
+        return hidden_states
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "dense", None) is not None:
+            with tf.name_scope(self.dense.name):
+                self.dense.build([None, None, self.config.hidden_size])
+        if getattr(self, "LayerNorm", None) is not None:
+            with tf.name_scope(self.LayerNorm.name):
+                self.LayerNorm.build([None, None, self.config.hidden_size])
+
+
+# Copied from transformers.models.bert.modeling_tf_bert.TFBertAttention with Bert->LayoutLM
+class TFLayoutLMAttention(keras.layers.Layer):
+    def __init__(self, config: LayoutLMConfig, **kwargs):
+        super().__init__(**kwargs)
+
+        self.self_attention = TFLayoutLMSelfAttention(config, name="self")
+        self.dense_output = TFLayoutLMSelfOutput(config, name="output")
+
+    def prune_heads(self, heads):
+        raise NotImplementedError
+
+    def call(
+        self,
+        input_tensor: tf.Tensor,
+        attention_mask: tf.Tensor,
+        head_mask: tf.Tensor,
+        encoder_hidden_states: tf.Tensor,
+        encoder_attention_mask: tf.Tensor,
+        past_key_value: tuple[tf.Tensor],
+        output_attentions: bool,
+        training: bool = False,
+    ) -> tuple[tf.Tensor]:
+        self_outputs = self.self_attention(
+            hidden_states=input_tensor,
+            attention_mask=attention_mask,
+            head_mask=head_mask,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_attention_mask,
+            past_key_value=past_key_value,
+            output_attentions=output_attentions,
+            training=training,
+        )
+        attention_output = self.dense_output(
+            hidden_states=self_outputs[0], input_tensor=input_tensor, training=training
+        )
+        # add attentions (possibly with past_key_value) if we output them
+        outputs = (attention_output,) + self_outputs[1:]
+
+        return outputs
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "self_attention", None) is not None:
+            with tf.name_scope(self.self_attention.name):
+                self.self_attention.build(None)
+        if getattr(self, "dense_output", None) is not None:
+            with tf.name_scope(self.dense_output.name):
+                self.dense_output.build(None)
+
+
+# Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate with Bert->LayoutLM
+class TFLayoutLMIntermediate(keras.layers.Layer):
+    def __init__(self, config: LayoutLMConfig, **kwargs):
+        super().__init__(**kwargs)
+
+        self.dense = keras.layers.Dense(
+            units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
+        )
+
+        if isinstance(config.hidden_act, str):
+            self.intermediate_act_fn = get_tf_activation(config.hidden_act)
+        else:
+            self.intermediate_act_fn = config.hidden_act
+        self.config = config
+
+    def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
+        hidden_states = self.dense(inputs=hidden_states)
+        hidden_states = self.intermediate_act_fn(hidden_states)
+
+        return hidden_states
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "dense", None) is not None:
+            with tf.name_scope(self.dense.name):
+                self.dense.build([None, None, self.config.hidden_size])
+
+
+# Copied from transformers.models.bert.modeling_tf_bert.TFBertOutput with Bert->LayoutLM
+class TFLayoutLMOutput(keras.layers.Layer):
+    def __init__(self, config: LayoutLMConfig, **kwargs):
+        super().__init__(**kwargs)
+
+        self.dense = keras.layers.Dense(
+            units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
+        )
+        self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
+        self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
+        self.config = config
+
+    def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
+        hidden_states = self.dense(inputs=hidden_states)
+        hidden_states = self.dropout(inputs=hidden_states, training=training)
+        hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor)
+
+        return hidden_states
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "dense", None) is not None:
+            with tf.name_scope(self.dense.name):
+                self.dense.build([None, None, self.config.intermediate_size])
+        if getattr(self, "LayerNorm", None) is not None:
+            with tf.name_scope(self.LayerNorm.name):
+                self.LayerNorm.build([None, None, self.config.hidden_size])
+
+
+# Copied from transformers.models.bert.modeling_tf_bert.TFBertLayer with Bert->LayoutLM
+class TFLayoutLMLayer(keras.layers.Layer):  # one transformer block: self-attention, optional cross-attention, then the feed-forward pair
+    def __init__(self, config: LayoutLMConfig, **kwargs):
+        super().__init__(**kwargs)
+
+        self.attention = TFLayoutLMAttention(config, name="attention")
+        self.is_decoder = config.is_decoder
+        self.add_cross_attention = config.add_cross_attention
+        if self.add_cross_attention:  # `crossattention` only exists when this flag is set
+            if not self.is_decoder:
+                raise ValueError(f"{self} should be used as a decoder model if cross attention is added")
+            self.crossattention = TFLayoutLMAttention(config, name="crossattention")
+        self.intermediate = TFLayoutLMIntermediate(config, name="intermediate")
+        self.bert_output = TFLayoutLMOutput(config, name="output")
+
+    def call(
+        self,
+        hidden_states: tf.Tensor,
+        attention_mask: tf.Tensor,
+        head_mask: tf.Tensor,
+        encoder_hidden_states: tf.Tensor | None,
+        encoder_attention_mask: tf.Tensor | None,
+        past_key_value: tuple[tf.Tensor] | None,
+        output_attentions: bool,
+        training: bool = False,
+    ) -> tuple[tf.Tensor]:  # returns (layer_output, *attention weights if requested[, present_key_value when is_decoder])
+        # The self-attention cache is the first two entries of `past_key_value`.
+        self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
+        self_attention_outputs = self.attention(
+            input_tensor=hidden_states,
+            attention_mask=attention_mask,
+            head_mask=head_mask,
+            encoder_hidden_states=None,  # the encoder inputs are withheld so this call is pure self-attention
+            encoder_attention_mask=None,
+            past_key_value=self_attn_past_key_value,
+            output_attentions=output_attentions,
+            training=training,
+        )
+        attention_output = self_attention_outputs[0]
+
+        # if decoder, the last output is tuple of self-attn cache
+        if self.is_decoder:
+            outputs = self_attention_outputs[1:-1]
+            present_key_value = self_attention_outputs[-1]  # only bound on the decoder path; later uses are guarded by is_decoder
+        else:
+            outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights
+
+        cross_attn_present_key_value = None
+        if self.is_decoder and encoder_hidden_states is not None:
+            if not hasattr(self, "crossattention"):
+                raise ValueError(
+                    f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers"
+                    " by setting `config.add_cross_attention=True`"
+                )
+
+            # The cross-attention cache is the last two entries of `past_key_value`.
+            cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
+            cross_attention_outputs = self.crossattention(
+                input_tensor=attention_output,
+                attention_mask=attention_mask,  # NOTE(review): TFLayoutLMSelfAttention replaces this with encoder_attention_mask when encoder states are given
+                head_mask=head_mask,
+                encoder_hidden_states=encoder_hidden_states,
+                encoder_attention_mask=encoder_attention_mask,
+                past_key_value=cross_attn_past_key_value,
+                output_attentions=output_attentions,
+                training=training,
+            )
+            attention_output = cross_attention_outputs[0]
+            outputs = outputs + cross_attention_outputs[1:-1]  # add cross attentions if we output attention weights
+
+            # Append the cross-attention cache after the self-attention cache in `present_key_value`.
+            cross_attn_present_key_value = cross_attention_outputs[-1]
+            present_key_value = present_key_value + cross_attn_present_key_value
+
+        intermediate_output = self.intermediate(hidden_states=attention_output)
+        layer_output = self.bert_output(
+            hidden_states=intermediate_output, input_tensor=attention_output, training=training
+        )
+        outputs = (layer_output,) + outputs  # add attentions if we output them
+
+        # if decoder, return the attn key/values as the last output
+        if self.is_decoder:
+            outputs = outputs + (present_key_value,)
+
+        return outputs
+
+    def build(self, input_shape=None):  # builds every sub-layer that exists, once, each under its own name scope
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "attention", None) is not None:
+            with tf.name_scope(self.attention.name):
+                self.attention.build(None)
+        if getattr(self, "intermediate", None) is not None:
+            with tf.name_scope(self.intermediate.name):
+                self.intermediate.build(None)
+        if getattr(self, "bert_output", None) is not None:
+            with tf.name_scope(self.bert_output.name):
+                self.bert_output.build(None)
+        if getattr(self, "crossattention", None) is not None:  # absent unless config.add_cross_attention was set
+            with tf.name_scope(self.crossattention.name):
+                self.crossattention.build(None)
+
+
+# Copied from transformers.models.bert.modeling_tf_bert.TFBertEncoder with Bert->LayoutLM
+class TFLayoutLMEncoder(keras.layers.Layer):
+    def __init__(self, config: LayoutLMConfig, **kwargs):
+        super().__init__(**kwargs)
+        self.config = config
+        self.layer = [TFLayoutLMLayer(config, name=f"layer_._{i}") for i in range(config.num_hidden_layers)]
+
+    def call(
+        self,
+        hidden_states: tf.Tensor,
+        attention_mask: tf.Tensor,
+        head_mask: tf.Tensor,
+        encoder_hidden_states: tf.Tensor | None,
+        encoder_attention_mask: tf.Tensor | None,
+        past_key_values: tuple[tuple[tf.Tensor]] | None,
+        use_cache: bool | None,
+        output_attentions: bool,
+        output_hidden_states: bool,
+        return_dict: bool,
+        training: bool = False,
+    ) -> TFBaseModelOutputWithPastAndCrossAttentions | tuple[tf.Tensor]:
+        all_hidden_states = () if output_hidden_states else None
+        all_attentions = () if output_attentions else None
+        all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
+
+        next_decoder_cache = () if use_cache else None
+        for i, layer_module in enumerate(self.layer):
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)
+
+            past_key_value = past_key_values[i] if past_key_values is not None else None
+
+            layer_outputs = layer_module(
+                hidden_states=hidden_states,
+                attention_mask=attention_mask,
+                head_mask=head_mask[i],
+                encoder_hidden_states=encoder_hidden_states,
+                encoder_attention_mask=encoder_attention_mask,
+                past_key_value=past_key_value,
+                output_attentions=output_attentions,
+                training=training,
+            )
+            hidden_states = layer_outputs[0]
+
+            if use_cache:
+                next_decoder_cache += (layer_outputs[-1],)
+
+            if output_attentions:
+                all_attentions = all_attentions + (layer_outputs[1],)
+                if self.config.add_cross_attention and encoder_hidden_states is not None:
+                    all_cross_attentions = all_cross_attentions + (layer_outputs[2],)
+
+        # Add last layer
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+
+        if not return_dict:
+            return tuple(
+                v for v in [hidden_states, all_hidden_states, all_attentions, all_cross_attentions] if v is not None
+            )
+
+        return TFBaseModelOutputWithPastAndCrossAttentions(
+            last_hidden_state=hidden_states,
+            past_key_values=next_decoder_cache,
+            hidden_states=all_hidden_states,
+            attentions=all_attentions,
+            cross_attentions=all_cross_attentions,
+        )
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "layer", None) is not None:
+            for layer in self.layer:
+                with tf.name_scope(layer.name):
+                    layer.build(None)
+
+
+# Copied from transformers.models.bert.modeling_tf_bert.TFBertPooler with Bert->LayoutLM
+class TFLayoutLMPooler(keras.layers.Layer):
+    def __init__(self, config: LayoutLMConfig, **kwargs):
+        super().__init__(**kwargs)
+
+        self.dense = keras.layers.Dense(
+            units=config.hidden_size,
+            kernel_initializer=get_initializer(config.initializer_range),
+            activation="tanh",
+            name="dense",
+        )
+        self.config = config
+
+    def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
+        # We "pool" the model by simply taking the hidden state corresponding
+        # to the first token.
+        first_token_tensor = hidden_states[:, 0]
+        pooled_output = self.dense(inputs=first_token_tensor)
+
+        return pooled_output
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "dense", None) is not None:
+            with tf.name_scope(self.dense.name):
+                self.dense.build([None, None, self.config.hidden_size])
+
+
+# Copied from transformers.models.bert.modeling_tf_bert.TFBertPredictionHeadTransform with Bert->LayoutLM
+class TFLayoutLMPredictionHeadTransform(keras.layers.Layer):
+    def __init__(self, config: LayoutLMConfig, **kwargs):
+        super().__init__(**kwargs)
+
+        self.dense = keras.layers.Dense(
+            units=config.hidden_size,
+            kernel_initializer=get_initializer(config.initializer_range),
+            name="dense",
+        )
+
+        if isinstance(config.hidden_act, str):
+            self.transform_act_fn = get_tf_activation(config.hidden_act)
+        else:
+            self.transform_act_fn = config.hidden_act
+
+        self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
+        self.config = config
+
+    def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
+        hidden_states = self.dense(inputs=hidden_states)
+        hidden_states = self.transform_act_fn(hidden_states)
+        hidden_states = self.LayerNorm(inputs=hidden_states)
+
+        return hidden_states
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "dense", None) is not None:
+            with tf.name_scope(self.dense.name):
+                self.dense.build([None, None, self.config.hidden_size])
+        if getattr(self, "LayerNorm", None) is not None:
+            with tf.name_scope(self.LayerNorm.name):
+                self.LayerNorm.build([None, None, self.config.hidden_size])
+
+
+# Copied from transformers.models.bert.modeling_tf_bert.TFBertLMPredictionHead with Bert->LayoutLM
+class TFLayoutLMLMPredictionHead(keras.layers.Layer):
+    def __init__(self, config: LayoutLMConfig, input_embeddings: keras.layers.Layer, **kwargs):
+        super().__init__(**kwargs)
+
+        self.config = config
+        self.hidden_size = config.hidden_size
+
+        self.transform = TFLayoutLMPredictionHeadTransform(config, name="transform")
+
+        # The output weights are the same as the input embeddings, but there is
+        # an output-only bias for each token.
+        self.input_embeddings = input_embeddings
+
+    def build(self, input_shape=None):
+        self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias")
+
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "transform", None) is not None:
+            with tf.name_scope(self.transform.name):
+                self.transform.build(None)
+
+    def get_output_embeddings(self) -> keras.layers.Layer:
+        return self.input_embeddings
+
+    def set_output_embeddings(self, value: tf.Variable):
+        self.input_embeddings.weight = value
+        self.input_embeddings.vocab_size = shape_list(value)[0]
+
+    def get_bias(self) -> dict[str, tf.Variable]:
+        return {"bias": self.bias}
+
+    def set_bias(self, value: tf.Variable):
+        self.bias = value["bias"]
+        self.config.vocab_size = shape_list(value["bias"])[0]
+
+    def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
+        hidden_states = self.transform(hidden_states=hidden_states)
+        seq_length = shape_list(hidden_states)[1]
+        hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.hidden_size])
+        hidden_states = tf.matmul(a=hidden_states, b=self.input_embeddings.weight, transpose_b=True)
+        hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.config.vocab_size])
+        hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.bias)
+
+        return hidden_states
+
+
+# Copied from transformers.models.bert.modeling_tf_bert.TFBertMLMHead with Bert->LayoutLM
+class TFLayoutLMMLMHead(keras.layers.Layer):
+    def __init__(self, config: LayoutLMConfig, input_embeddings: keras.layers.Layer, **kwargs):
+        super().__init__(**kwargs)
+
+        self.predictions = TFLayoutLMLMPredictionHead(config, input_embeddings, name="predictions")
+
+    def call(self, sequence_output: tf.Tensor) -> tf.Tensor:
+        prediction_scores = self.predictions(hidden_states=sequence_output)
+
+        return prediction_scores
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "predictions", None) is not None:
+            with tf.name_scope(self.predictions.name):
+                self.predictions.build(None)
+
+
+@keras_serializable
+class TFLayoutLMMainLayer(keras.layers.Layer):
+    config_class = LayoutLMConfig
+
+    def __init__(self, config: LayoutLMConfig, add_pooling_layer: bool = True, **kwargs):
+        super().__init__(**kwargs)
+
+        self.config = config
+
+        self.embeddings = TFLayoutLMEmbeddings(config, name="embeddings")
+        self.encoder = TFLayoutLMEncoder(config, name="encoder")
+        self.pooler = TFLayoutLMPooler(config, name="pooler") if add_pooling_layer else None
+
+    def get_input_embeddings(self) -> keras.layers.Layer:
+        return self.embeddings
+
+    def set_input_embeddings(self, value: tf.Variable):
+        self.embeddings.weight = value
+        self.embeddings.vocab_size = shape_list(value)[0]
+
+    def _prune_heads(self, heads_to_prune):
+        """
+        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
+        class PreTrainedModel
+        """
+        raise NotImplementedError
+
+    @unpack_inputs
+    def call(
+        self,
+        input_ids: TFModelInputType | None = None,
+        bbox: np.ndarray | tf.Tensor | None = None,
+        attention_mask: np.ndarray | tf.Tensor | None = None,
+        token_type_ids: np.ndarray | tf.Tensor | None = None,
+        position_ids: np.ndarray | tf.Tensor | None = None,
+        head_mask: np.ndarray | tf.Tensor | None = None,
+        inputs_embeds: np.ndarray | tf.Tensor | None = None,
+        encoder_hidden_states: np.ndarray | tf.Tensor | None = None,
+        encoder_attention_mask: np.ndarray | tf.Tensor | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        training: bool = False,
+    ) -> TFBaseModelOutputWithPoolingAndCrossAttentions | tuple[tf.Tensor]:
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+        elif input_ids is not None:
+            input_shape = shape_list(input_ids)
+        elif inputs_embeds is not None:
+            input_shape = shape_list(inputs_embeds)[:-1]
+        else:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+        if attention_mask is None:
+            attention_mask = tf.fill(dims=input_shape, value=1)
+
+        if token_type_ids is None:
+            token_type_ids = tf.fill(dims=input_shape, value=0)
+        if bbox is None:
+            bbox = tf.fill(dims=input_shape + [4], value=0)
+
+        embedding_output = self.embeddings(
+            input_ids=input_ids,
+            bbox=bbox,
+            position_ids=position_ids,
+            token_type_ids=token_type_ids,
+            inputs_embeds=inputs_embeds,
+            training=training,
+        )
+
+        # We create a 3D attention mask from a 2D tensor mask.
+        # Sizes are [batch_size, 1, 1, to_seq_length]
+        # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
+        # this attention mask is more simple than the triangular masking of causal attention
+        # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
+        extended_attention_mask = tf.reshape(attention_mask, (input_shape[0], 1, 1, input_shape[1]))
+
+        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
+        # masked positions, this operation will create a tensor which is 0.0 for
+        # positions we want to attend and -10000.0 for masked positions.
+        # Since we are adding it to the raw scores before the softmax, this is
+        # effectively the same as removing these entirely.
+        extended_attention_mask = tf.cast(extended_attention_mask, dtype=embedding_output.dtype)
+        one_cst = tf.constant(1.0, dtype=embedding_output.dtype)
+        ten_thousand_cst = tf.constant(-10000.0, dtype=embedding_output.dtype)
+        extended_attention_mask = tf.multiply(tf.subtract(one_cst, extended_attention_mask), ten_thousand_cst)
+
+        # Prepare head mask if needed
+        # 1.0 in head_mask indicate we keep the head
+        # attention_probs has shape bsz x n_heads x N x N
+        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
+        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
+        if head_mask is not None:
+            raise NotImplementedError
+        else:
+            head_mask = [None] * self.config.num_hidden_layers
+
+        encoder_outputs = self.encoder(
+            hidden_states=embedding_output,
+            attention_mask=extended_attention_mask,
+            head_mask=head_mask,
+            # Need to pass these required positional arguments to `Encoder`
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=None,
+            past_key_values=None,
+            use_cache=False,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+
+        sequence_output = encoder_outputs[0]
+        pooled_output = self.pooler(hidden_states=sequence_output) if self.pooler is not None else None
+
+        if not return_dict:
+            return (
+                sequence_output,
+                pooled_output,
+            ) + encoder_outputs[1:]
+
+        return TFBaseModelOutputWithPoolingAndCrossAttentions(
+            last_hidden_state=sequence_output,
+            pooler_output=pooled_output,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+            cross_attentions=encoder_outputs.cross_attentions,
+        )
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "embeddings", None) is not None:
+            with tf.name_scope(self.embeddings.name):
+                self.embeddings.build(None)
+        if getattr(self, "encoder", None) is not None:
+            with tf.name_scope(self.encoder.name):
+                self.encoder.build(None)
+        if getattr(self, "pooler", None) is not None:
+            with tf.name_scope(self.pooler.name):
+                self.pooler.build(None)
+
+
+class TFLayoutLMPreTrainedModel(TFPreTrainedModel):
+    """
+    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+    models.
+    """
+
+    config_class = LayoutLMConfig
+    base_model_prefix = "layoutlm"
+
+    @property
+    def input_signature(self):
+        signature = super().input_signature
+        signature["bbox"] = tf.TensorSpec(shape=(None, None, 4), dtype=tf.int32, name="bbox")
+        return signature
+
+
+# Shared class-level documentation text. It is passed to `add_start_docstrings` when decorating the
+# TFLayoutLM model classes in this file, so its content is runtime text and must stay as written.
+LAYOUTLM_START_DOCSTRING = r"""
+
+    This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.)
+
+    This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
+    as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
+    behavior.
+
+    <Tip>
+
+    TensorFlow models and layers in `transformers` accept two formats as input:
+
+    - having all inputs as keyword arguments (like PyTorch models), or
+    - having all inputs as a list, tuple or dict in the first positional argument.
+
+    The reason the second format is supported is that Keras methods prefer this format when passing inputs to models
+    and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just
+    pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second
+    format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with
+    the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first
+    positional argument:
+
+    - a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
+    - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
+    `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
+    - a dictionary with one or several input Tensors associated to the input names given in the docstring:
+    `model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
+
+    Note that when creating models and layers with
+    [subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry
+    about any of this, as you can just pass inputs like you would to any other Python function!
+
+    </Tip>
+
+    Args:
+        config ([`LayoutLMConfig`]): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+# Template for the `call` argument documentation. The `{0}` placeholders are filled with the input
+# shape through `.format(...)` before the text is handed to `add_start_docstrings_to_model_forward`,
+# so any literal brace added here would have to be escaped.
+LAYOUTLM_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`Numpy array` or `tf.Tensor` of shape `({0})`):
+            Indices of input sequence tokens in the vocabulary.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and
+            [`PreTrainedTokenizer.encode`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        bbox (`Numpy array` or `tf.Tensor` of shape `({0}, 4)`, *optional*):
+            Bounding Boxes of each input sequence tokens. Selected in the range `[0, config.max_2d_position_embeddings-
+            1]`.
+        attention_mask (`Numpy array` or `tf.Tensor` of shape `({0})`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
+        token_type_ids (`Numpy array` or `tf.Tensor` of shape `({0})`, *optional*):
+            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
+            1]`:
+
+            - 0 corresponds to a *sentence A* token,
+            - 1 corresponds to a *sentence B* token.
+
+            [What are token type IDs?](../glossary#token-type-ids)
+        position_ids (`Numpy array` or `tf.Tensor` of shape `({0})`, *optional*):
+            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+            config.max_position_embeddings - 1]`.
+
+            [What are position IDs?](../glossary#position-ids)
+        head_mask (`Numpy array` or `tf.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+
+        inputs_embeds (`tf.Tensor` of shape `({0}, hidden_size)`, *optional*):
+            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+            model's internal embedding lookup matrix.
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+        training (`bool`, *optional*, defaults to `False`):
+            Whether or not to use the model in training mode (some modules like dropout modules have different
+            behaviors between training and evaluation).
+"""
+
+
+@add_start_docstrings(
+    "The bare LayoutLM Model transformer outputting raw hidden-states without any specific head on top.",
+    LAYOUTLM_START_DOCSTRING,
+)
+class TFLayoutLMModel(TFLayoutLMPreTrainedModel):
+    def __init__(self, config: LayoutLMConfig, *inputs, **kwargs):
+        super().__init__(config, *inputs, **kwargs)
+
+        self.layoutlm = TFLayoutLMMainLayer(config, name="layoutlm")
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(LAYOUTLM_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @replace_return_docstrings(
+        output_type=TFBaseModelOutputWithPoolingAndCrossAttentions, config_class=_CONFIG_FOR_DOC
+    )
+    def call(
+        self,
+        input_ids: TFModelInputType | None = None,
+        bbox: np.ndarray | tf.Tensor | None = None,
+        attention_mask: np.ndarray | tf.Tensor | None = None,
+        token_type_ids: np.ndarray | tf.Tensor | None = None,
+        position_ids: np.ndarray | tf.Tensor | None = None,
+        head_mask: np.ndarray | tf.Tensor | None = None,
+        inputs_embeds: np.ndarray | tf.Tensor | None = None,
+        encoder_hidden_states: np.ndarray | tf.Tensor | None = None,
+        encoder_attention_mask: np.ndarray | tf.Tensor | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        training: bool | None = False,
+    ) -> TFBaseModelOutputWithPoolingAndCrossAttentions | tuple[tf.Tensor]:
+        r"""
+        Returns:
+
+        Examples:
+
+        ```python
+        >>> from transformers import AutoTokenizer, TFLayoutLMModel
+        >>> import tensorflow as tf
+
+        >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/layoutlm-base-uncased")
+        >>> model = TFLayoutLMModel.from_pretrained("microsoft/layoutlm-base-uncased")
+
+        >>> words = ["Hello", "world"]
+        >>> normalized_word_boxes = [637, 773, 693, 782], [698, 773, 733, 782]
+
+        >>> token_boxes = []
+        >>> for word, box in zip(words, normalized_word_boxes):
+        ...     word_tokens = tokenizer.tokenize(word)
+        ...     token_boxes.extend([box] * len(word_tokens))
+        >>> # add bounding boxes of cls + sep tokens
+        >>> token_boxes = [[0, 0, 0, 0]] + token_boxes + [[1000, 1000, 1000, 1000]]
+
+        >>> encoding = tokenizer(" ".join(words), return_tensors="tf")
+        >>> input_ids = encoding["input_ids"]
+        >>> attention_mask = encoding["attention_mask"]
+        >>> token_type_ids = encoding["token_type_ids"]
+        >>> bbox = tf.convert_to_tensor([token_boxes])
+
+        >>> outputs = model(
+        ...     input_ids=input_ids, bbox=bbox, attention_mask=attention_mask, token_type_ids=token_type_ids
+        ... )
+
+        >>> last_hidden_states = outputs.last_hidden_state
+        ```"""
+        outputs = self.layoutlm(
+            input_ids=input_ids,
+            bbox=bbox,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+
+        return outputs
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "layoutlm", None) is not None:
+            with tf.name_scope(self.layoutlm.name):
+                self.layoutlm.build(None)
+
+
+@add_start_docstrings("""LayoutLM Model with a `language modeling` head on top.""", LAYOUTLM_START_DOCSTRING)
+class TFLayoutLMForMaskedLM(TFLayoutLMPreTrainedModel, TFMaskedLanguageModelingLoss):
+    # names with a '.' represents the authorized unexpected/missing layers when a TF model is loaded from a PT model
+    _keys_to_ignore_on_load_unexpected = [
+        r"pooler",
+        r"cls.seq_relationship",
+        r"cls.predictions.decoder.weight",
+        r"nsp___cls",
+    ]
+
+    def __init__(self, config: LayoutLMConfig, *inputs, **kwargs):
+        super().__init__(config, *inputs, **kwargs)
+
+        if config.is_decoder:
+            logger.warning(
+                "If you want to use `TFLayoutLMForMaskedLM` make sure `config.is_decoder=False` for "
+                "bi-directional self-attention."
+            )
+
+        self.layoutlm = TFLayoutLMMainLayer(config, add_pooling_layer=True, name="layoutlm")
+        self.mlm = TFLayoutLMMLMHead(config, input_embeddings=self.layoutlm.embeddings, name="mlm___cls")
+
+    def get_lm_head(self) -> keras.layers.Layer:
+        return self.mlm.predictions
+
+    def get_prefix_bias_name(self) -> str:
+        warnings.warn("The method get_prefix_bias_name is deprecated. Please use `get_bias` instead.", FutureWarning)
+        return self.name + "/" + self.mlm.name + "/" + self.mlm.predictions.name
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(LAYOUTLM_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @replace_return_docstrings(output_type=TFMaskedLMOutput, config_class=_CONFIG_FOR_DOC)
+    def call(
+        self,
+        input_ids: TFModelInputType | None = None,
+        bbox: np.ndarray | tf.Tensor | None = None,
+        attention_mask: np.ndarray | tf.Tensor | None = None,
+        token_type_ids: np.ndarray | tf.Tensor | None = None,
+        position_ids: np.ndarray | tf.Tensor | None = None,
+        head_mask: np.ndarray | tf.Tensor | None = None,
+        inputs_embeds: np.ndarray | tf.Tensor | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        labels: np.ndarray | tf.Tensor | None = None,
+        training: bool | None = False,
+    ) -> TFMaskedLMOutput | tuple[tf.Tensor]:
+        r"""
+        labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
+            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
+            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
+
+        Returns:
+
+        Examples:
+
+        ```python
+        >>> from transformers import AutoTokenizer, TFLayoutLMForMaskedLM
+        >>> import tensorflow as tf
+
+        >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/layoutlm-base-uncased")
+        >>> model = TFLayoutLMForMaskedLM.from_pretrained("microsoft/layoutlm-base-uncased")
+
+        >>> words = ["Hello", "[MASK]"]
+        >>> normalized_word_boxes = [637, 773, 693, 782], [698, 773, 733, 782]
+
+        >>> token_boxes = []
+        >>> for word, box in zip(words, normalized_word_boxes):
+        ...     word_tokens = tokenizer.tokenize(word)
+        ...     token_boxes.extend([box] * len(word_tokens))
+        >>> # add bounding boxes of cls + sep tokens
+        >>> token_boxes = [[0, 0, 0, 0]] + token_boxes + [[1000, 1000, 1000, 1000]]
+
+        >>> encoding = tokenizer(" ".join(words), return_tensors="tf")
+        >>> input_ids = encoding["input_ids"]
+        >>> attention_mask = encoding["attention_mask"]
+        >>> token_type_ids = encoding["token_type_ids"]
+        >>> bbox = tf.convert_to_tensor([token_boxes])
+
+        >>> labels = tokenizer("Hello world", return_tensors="tf")["input_ids"]
+
+        >>> outputs = model(
+        ...     input_ids=input_ids,
+        ...     bbox=bbox,
+        ...     attention_mask=attention_mask,
+        ...     token_type_ids=token_type_ids,
+        ...     labels=labels,
+        ... )
+
+        >>> loss = outputs.loss
+        ```"""
+        outputs = self.layoutlm(
+            input_ids=input_ids,
+            bbox=bbox,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+        sequence_output = outputs[0]
+        prediction_scores = self.mlm(sequence_output=sequence_output, training=training)
+        loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=prediction_scores)
+
+        if not return_dict:
+            output = (prediction_scores,) + outputs[2:]
+            return ((loss,) + output) if loss is not None else output
+
+        return TFMaskedLMOutput(
+            loss=loss,
+            logits=prediction_scores,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "layoutlm", None) is not None:
+            with tf.name_scope(self.layoutlm.name):
+                self.layoutlm.build(None)
+        if getattr(self, "mlm", None) is not None:
+            with tf.name_scope(self.mlm.name):
+                self.mlm.build(None)
+
+
+@add_start_docstrings(
+    """
+    LayoutLM Model transformer with a sequence classification/regression head on top (a linear layer on top of the
+    pooled output) e.g. for GLUE tasks.
+    """,
+    LAYOUTLM_START_DOCSTRING,
+)
+class TFLayoutLMForSequenceClassification(TFLayoutLMPreTrainedModel, TFSequenceClassificationLoss):
+    # names with a '.' represents the authorized unexpected/missing layers when a TF model is loaded from a PT model
+    _keys_to_ignore_on_load_unexpected = [r"mlm___cls", r"nsp___cls", r"cls.predictions", r"cls.seq_relationship"]
+    _keys_to_ignore_on_load_missing = [r"dropout"]
+
+    def __init__(self, config: LayoutLMConfig, *inputs, **kwargs):
+        super().__init__(config, *inputs, **kwargs)
+
+        self.num_labels = config.num_labels
+
+        self.layoutlm = TFLayoutLMMainLayer(config, name="layoutlm")
+        self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
+        self.classifier = keras.layers.Dense(
+            units=config.num_labels,
+            kernel_initializer=get_initializer(config.initializer_range),
+            name="classifier",
+        )
+        self.config = config
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(LAYOUTLM_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @replace_return_docstrings(output_type=TFSequenceClassifierOutput, config_class=_CONFIG_FOR_DOC)
+    def call(
+        self,
+        input_ids: TFModelInputType | None = None,
+        bbox: np.ndarray | tf.Tensor | None = None,
+        attention_mask: np.ndarray | tf.Tensor | None = None,
+        token_type_ids: np.ndarray | tf.Tensor | None = None,
+        position_ids: np.ndarray | tf.Tensor | None = None,
+        head_mask: np.ndarray | tf.Tensor | None = None,
+        inputs_embeds: np.ndarray | tf.Tensor | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        labels: np.ndarray | tf.Tensor | None = None,
+        training: bool | None = False,
+    ) -> TFSequenceClassifierOutput | tuple[tf.Tensor]:
+        r"""
+        labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
+            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+
+        Returns:
+
+        Examples:
+
+        ```python
+        >>> from transformers import AutoTokenizer, TFLayoutLMForSequenceClassification
+        >>> import tensorflow as tf
+
+        >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/layoutlm-base-uncased")
+        >>> model = TFLayoutLMForSequenceClassification.from_pretrained("microsoft/layoutlm-base-uncased")
+
+        >>> words = ["Hello", "world"]
+        >>> normalized_word_boxes = [637, 773, 693, 782], [698, 773, 733, 782]
+
+        >>> token_boxes = []
+        >>> for word, box in zip(words, normalized_word_boxes):
+        ...     word_tokens = tokenizer.tokenize(word)
+        ...     token_boxes.extend([box] * len(word_tokens))
+        >>> # add bounding boxes of cls + sep tokens
+        >>> token_boxes = [[0, 0, 0, 0]] + token_boxes + [[1000, 1000, 1000, 1000]]
+
+        >>> encoding = tokenizer(" ".join(words), return_tensors="tf")
+        >>> input_ids = encoding["input_ids"]
+        >>> attention_mask = encoding["attention_mask"]
+        >>> token_type_ids = encoding["token_type_ids"]
+        >>> bbox = tf.convert_to_tensor([token_boxes])
+        >>> sequence_label = tf.convert_to_tensor([1])
+
+        >>> outputs = model(
+        ...     input_ids=input_ids,
+        ...     bbox=bbox,
+        ...     attention_mask=attention_mask,
+        ...     token_type_ids=token_type_ids,
+        ...     labels=sequence_label,
+        ... )
+
+        >>> loss = outputs.loss
+        >>> logits = outputs.logits
+        ```"""
+        outputs = self.layoutlm(
+            input_ids=input_ids,
+            bbox=bbox,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+        pooled_output = outputs[1]
+        pooled_output = self.dropout(inputs=pooled_output, training=training)
+        logits = self.classifier(inputs=pooled_output)
+        loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=logits)
+
+        if not return_dict:
+            output = (logits,) + outputs[2:]
+            return ((loss,) + output) if loss is not None else output
+
+        return TFSequenceClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "layoutlm", None) is not None:
+            with tf.name_scope(self.layoutlm.name):
+                self.layoutlm.build(None)
+        if getattr(self, "classifier", None) is not None:
+            with tf.name_scope(self.classifier.name):
+                self.classifier.build([None, None, self.config.hidden_size])
+
+
+@add_start_docstrings(
+    """
+    LayoutLM Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
+    Named-Entity-Recognition (NER) tasks.
+    """,
+    LAYOUTLM_START_DOCSTRING,
+)
+class TFLayoutLMForTokenClassification(TFLayoutLMPreTrainedModel, TFTokenClassificationLoss):
+    # names with a '.' represent the authorized unexpected/missing layers when a TF model is loaded from a PT model
+    _keys_to_ignore_on_load_unexpected = [
+        r"pooler",
+        r"mlm___cls",
+        r"nsp___cls",
+        r"cls.predictions",
+        r"cls.seq_relationship",
+    ]
+    _keys_to_ignore_on_load_missing = [r"dropout"]
+
+    def __init__(self, config: LayoutLMConfig, *inputs, **kwargs):
+        super().__init__(config, *inputs, **kwargs)
+
+        self.num_labels = config.num_labels
+
+        self.layoutlm = TFLayoutLMMainLayer(config, add_pooling_layer=True, name="layoutlm")  # NOTE(review): pooling layer is requested although call() below never reads outputs[1] - confirm it is needed
+        self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
+        self.classifier = keras.layers.Dense(
+            units=config.num_labels,
+            kernel_initializer=get_initializer(config.initializer_range),
+            name="classifier",
+        )
+        self.config = config
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(LAYOUTLM_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @replace_return_docstrings(output_type=TFTokenClassifierOutput, config_class=_CONFIG_FOR_DOC)
+    def call(
+        self,
+        input_ids: TFModelInputType | None = None,
+        bbox: np.ndarray | tf.Tensor | None = None,
+        attention_mask: np.ndarray | tf.Tensor | None = None,
+        token_type_ids: np.ndarray | tf.Tensor | None = None,
+        position_ids: np.ndarray | tf.Tensor | None = None,
+        head_mask: np.ndarray | tf.Tensor | None = None,
+        inputs_embeds: np.ndarray | tf.Tensor | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        labels: np.ndarray | tf.Tensor | None = None,
+        training: bool | None = False,
+    ) -> TFTokenClassifierOutput | tuple[tf.Tensor]:
+        r"""
+        labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
+
+        Returns:
+
+        Examples:
+
+        ```python
+        >>> import tensorflow as tf
+        >>> from transformers import AutoTokenizer, TFLayoutLMForTokenClassification
+
+        >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/layoutlm-base-uncased")
+        >>> model = TFLayoutLMForTokenClassification.from_pretrained("microsoft/layoutlm-base-uncased")
+
+        >>> words = ["Hello", "world"]
+        >>> normalized_word_boxes = [637, 773, 693, 782], [698, 773, 733, 782]
+
+        >>> token_boxes = []
+        >>> for word, box in zip(words, normalized_word_boxes):
+        ...     word_tokens = tokenizer.tokenize(word)
+        ...     token_boxes.extend([box] * len(word_tokens))
+        >>> # add bounding boxes of cls + sep tokens
+        >>> token_boxes = [[0, 0, 0, 0]] + token_boxes + [[1000, 1000, 1000, 1000]]
+
+        >>> encoding = tokenizer(" ".join(words), return_tensors="tf")
+        >>> input_ids = encoding["input_ids"]
+        >>> attention_mask = encoding["attention_mask"]
+        >>> token_type_ids = encoding["token_type_ids"]
+        >>> bbox = tf.convert_to_tensor([token_boxes])
+        >>> token_labels = tf.convert_to_tensor([1, 1, 0, 0])
+
+        >>> outputs = model(
+        ...     input_ids=input_ids,
+        ...     bbox=bbox,
+        ...     attention_mask=attention_mask,
+        ...     token_type_ids=token_type_ids,
+        ...     labels=token_labels,
+        ... )
+
+        >>> loss = outputs.loss
+        >>> logits = outputs.logits
+        ```"""
+        outputs = self.layoutlm(
+            input_ids=input_ids,
+            bbox=bbox,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+        sequence_output = outputs[0]  # first main-layer output; presumably the per-token hidden states - TODO confirm
+        sequence_output = self.dropout(inputs=sequence_output, training=training)
+        logits = self.classifier(inputs=sequence_output)
+        loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=logits)  # loss only when labels are supplied
+
+        if not return_dict:
+            output = (logits,) + outputs[2:]  # tuple form: logits followed by everything after outputs[1]
+            return ((loss,) + output) if loss is not None else output
+
+        return TFTokenClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True  # flagged first, so any further build() call hits the early return above
+        if getattr(self, "layoutlm", None) is not None:
+            with tf.name_scope(self.layoutlm.name):
+                self.layoutlm.build(None)
+        if getattr(self, "classifier", None) is not None:
+            with tf.name_scope(self.classifier.name):
+                self.classifier.build([None, None, self.config.hidden_size])  # classifier input: last dimension is hidden_size
+
+
+@add_start_docstrings(
+    """
+    LayoutLM Model with a span classification head on top for extractive question-answering tasks such as
+    [DocVQA](https://rrc.cvc.uab.es/?ch=17) (a linear layer on top of the final hidden-states output to compute `span
+    start logits` and `span end logits`).
+    """,
+    LAYOUTLM_START_DOCSTRING,
+)
+class TFLayoutLMForQuestionAnswering(TFLayoutLMPreTrainedModel, TFQuestionAnsweringLoss):
+    # names with a '.' represent the authorized unexpected/missing layers when a TF model is loaded from a PT model
+    _keys_to_ignore_on_load_unexpected = [
+        r"pooler",
+        r"mlm___cls",
+        r"nsp___cls",
+        r"cls.predictions",
+        r"cls.seq_relationship",
+    ]
+
+    def __init__(self, config: LayoutLMConfig, *inputs, **kwargs):
+        super().__init__(config, *inputs, **kwargs)
+        self.num_labels = config.num_labels
+
+        self.layoutlm = TFLayoutLMMainLayer(config, add_pooling_layer=True, name="layoutlm")  # NOTE(review): pooling layer is requested although call() below never reads outputs[1] - confirm it is needed
+        self.qa_outputs = keras.layers.Dense(
+            units=config.num_labels,
+            kernel_initializer=get_initializer(config.initializer_range),
+            name="qa_outputs",
+        )
+        self.config = config
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(LAYOUTLM_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @replace_return_docstrings(output_type=TFQuestionAnsweringModelOutput, config_class=_CONFIG_FOR_DOC)
+    def call(
+        self,
+        input_ids: TFModelInputType | None = None,
+        bbox: np.ndarray | tf.Tensor | None = None,
+        attention_mask: np.ndarray | tf.Tensor | None = None,
+        token_type_ids: np.ndarray | tf.Tensor | None = None,
+        position_ids: np.ndarray | tf.Tensor | None = None,
+        head_mask: np.ndarray | tf.Tensor | None = None,
+        inputs_embeds: np.ndarray | tf.Tensor | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        start_positions: np.ndarray | tf.Tensor | None = None,
+        end_positions: np.ndarray | tf.Tensor | None = None,
+        training: bool | None = False,
+    ) -> TFQuestionAnsweringModelOutput | tuple[tf.Tensor]:
+        r"""
+        start_positions (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
+            Labels for position (index) of the start of the labelled span for computing the token classification loss.
+            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
+            are not taken into account for computing the loss.
+        end_positions (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
+            Labels for position (index) of the end of the labelled span for computing the token classification loss.
+            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
+            are not taken into account for computing the loss.
+
+        Returns:
+
+        Examples:
+
+        ```python
+        >>> import tensorflow as tf
+        >>> from transformers import AutoTokenizer, TFLayoutLMForQuestionAnswering
+        >>> from datasets import load_dataset
+
+        >>> tokenizer = AutoTokenizer.from_pretrained("impira/layoutlm-document-qa", add_prefix_space=True)
+        >>> model = TFLayoutLMForQuestionAnswering.from_pretrained("impira/layoutlm-document-qa", revision="1e3ebac")
+
+        >>> dataset = load_dataset("nielsr/funsd", split="train")
+        >>> example = dataset[0]
+        >>> question = "what's his name?"
+        >>> words = example["words"]
+        >>> boxes = example["bboxes"]
+
+        >>> encoding = tokenizer(
+        ...     question.split(), words, is_split_into_words=True, return_token_type_ids=True, return_tensors="tf"
+        ... )
+        >>> bbox = []
+        >>> for i, s, w in zip(encoding.input_ids[0], encoding.sequence_ids(0), encoding.word_ids(0)):
+        ...     if s == 1:
+        ...         bbox.append(boxes[w])
+        ...     elif i == tokenizer.sep_token_id:
+        ...         bbox.append([1000] * 4)
+        ...     else:
+        ...         bbox.append([0] * 4)
+        >>> encoding["bbox"] = tf.convert_to_tensor([bbox])
+
+        >>> word_ids = encoding.word_ids(0)
+        >>> outputs = model(**encoding)
+        >>> loss = outputs.loss
+        >>> start_scores = outputs.start_logits
+        >>> end_scores = outputs.end_logits
+        >>> start, end = word_ids[tf.math.argmax(start_scores, -1)[0]], word_ids[tf.math.argmax(end_scores, -1)[0]]
+        >>> print(" ".join(words[start : end + 1]))
+        M. Hamann P. Harper, P. Martinez
+        ```"""
+
+        outputs = self.layoutlm(
+            input_ids=input_ids,
+            bbox=bbox,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+
+        sequence_output = outputs[0]  # first main-layer output; presumably the per-token hidden states - TODO confirm
+
+        logits = self.qa_outputs(inputs=sequence_output)
+        start_logits, end_logits = tf.split(value=logits, num_or_size_splits=2, axis=-1)  # two equal halves of the last axis; the squeezes below need each half to be width 1, i.e. config.num_labels == 2
+        start_logits = tf.squeeze(input=start_logits, axis=-1)
+        end_logits = tf.squeeze(input=end_logits, axis=-1)
+        loss = None
+
+        if start_positions is not None and end_positions is not None:  # both are required; with only one supplied the loss stays None
+            labels = {"start_position": start_positions}
+            labels["end_position"] = end_positions
+            loss = self.hf_compute_loss(labels=labels, logits=(start_logits, end_logits))
+
+        if not return_dict:
+            output = (start_logits, end_logits) + outputs[2:]  # tuple form: both logits followed by everything after outputs[1]
+            return ((loss,) + output) if loss is not None else output
+
+        return TFQuestionAnsweringModelOutput(
+            loss=loss,
+            start_logits=start_logits,
+            end_logits=end_logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True  # flagged first, so any further build() call hits the early return above
+        if getattr(self, "layoutlm", None) is not None:
+            with tf.name_scope(self.layoutlm.name):
+                self.layoutlm.build(None)
+        if getattr(self, "qa_outputs", None) is not None:
+            with tf.name_scope(self.qa_outputs.name):
+                self.qa_outputs.build([None, None, self.config.hidden_size])  # head input: last dimension is hidden_size
+
+
+__all__ = [  # names exported by a star-import of this module
+    "TFLayoutLMForMaskedLM",
+    "TFLayoutLMForSequenceClassification",
+    "TFLayoutLMForTokenClassification",
+    "TFLayoutLMForQuestionAnswering",
+    "TFLayoutLMMainLayer",
+    "TFLayoutLMModel",
+    "TFLayoutLMPreTrainedModel",
+]
diff --git a/phivenv/Lib/site-packages/transformers/models/layoutlm/tokenization_layoutlm.py b/phivenv/Lib/site-packages/transformers/models/layoutlm/tokenization_layoutlm.py
new file mode 100644
index 0000000000000000000000000000000000000000..4caccd691d0e5fcb64637f72d5c2860f6f096e9e
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/layoutlm/tokenization_layoutlm.py
@@ -0,0 +1,483 @@
+# coding=utf-8
+# Copyright 2018 The Microsoft Research Asia LayoutLM Team Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization class for model LayoutLM."""
+
+import collections
+import os
+import unicodedata
+from typing import Optional
+
+from ...tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)  # module-level logger; used by save_vocabulary for the non-consecutive-index warning
+
+VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}  # file name save_vocabulary writes when given a directory
+
+
+# Copied from transformers.models.bert.tokenization_bert.load_vocab
+def load_vocab(vocab_file):
+    """Loads a vocabulary file into a dictionary."""
+    vocab = collections.OrderedDict()
+    with open(vocab_file, "r", encoding="utf-8") as reader:
+        tokens = reader.readlines()
+    for index, token in enumerate(tokens):  # one token per line; its id is the zero-based line number
+        token = token.rstrip("\n")  # only the newline is removed; any other whitespace stays part of the token
+        vocab[token] = index  # a token repeated in the file keeps the id of its last occurrence
+    return vocab
+
+
+# Copied from transformers.models.bert.tokenization_bert.whitespace_tokenize
+def whitespace_tokenize(text):
+    """Runs basic whitespace cleaning and splitting on a piece of text."""
+    text = text.strip()
+    if not text:  # empty or whitespace-only input yields no tokens
+        return []
+    tokens = text.split()  # argument-less split(): any run of whitespace is one separator
+    return tokens
+
+
+# Copied from transformers.models.bert.tokenization_bert.BertTokenizer with Bert->LayoutLM,BERT->LayoutLM
+class LayoutLMTokenizer(PreTrainedTokenizer):
+    r"""
+    Construct a LayoutLM tokenizer. Based on WordPiece.
+
+    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
+    this superclass for more information regarding those methods.
+
+    Args:
+        vocab_file (`str`):
+            File containing the vocabulary.
+        do_lower_case (`bool`, *optional*, defaults to `True`):
+            Whether or not to lowercase the input when tokenizing.
+        do_basic_tokenize (`bool`, *optional*, defaults to `True`):
+            Whether or not to do basic tokenization before WordPiece.
+        never_split (`Iterable`, *optional*):
+            Collection of tokens which will never be split during tokenization. Only has an effect when
+            `do_basic_tokenize=True`
+        unk_token (`str`, *optional*, defaults to `"[UNK]"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        sep_token (`str`, *optional*, defaults to `"[SEP]"`):
+            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
+            sequence classification or for a text and a question for question answering. It is also used as the last
+            token of a sequence built with special tokens.
+        pad_token (`str`, *optional*, defaults to `"[PAD]"`):
+            The token used for padding, for example when batching sequences of different lengths.
+        cls_token (`str`, *optional*, defaults to `"[CLS]"`):
+            The classifier token which is used when doing sequence classification (classification of the whole sequence
+            instead of per-token classification). It is the first token of the sequence when built with special tokens.
+        mask_token (`str`, *optional*, defaults to `"[MASK]"`):
+            The token used for masking values. This is the token used when training this model with masked language
+            modeling. This is the token which the model will try to predict.
+        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
+            Whether or not to tokenize Chinese characters.
+
+            This should likely be deactivated for Japanese (see this
+            [issue](https://github.com/huggingface/transformers/issues/328)).
+        strip_accents (`bool`, *optional*):
+            Whether or not to strip all accents. If this option is not specified, then it will be determined by the
+            value for `lowercase` (as in the original LayoutLM).
+        clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):
+            Whether or not to cleanup spaces after decoding, cleanup consists in removing potential artifacts like
+            extra spaces.
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+
+    def __init__(
+        self,
+        vocab_file,
+        do_lower_case=True,
+        do_basic_tokenize=True,
+        never_split=None,
+        unk_token="[UNK]",
+        sep_token="[SEP]",
+        pad_token="[PAD]",
+        cls_token="[CLS]",
+        mask_token="[MASK]",
+        tokenize_chinese_chars=True,
+        strip_accents=None,
+        clean_up_tokenization_spaces=True,
+        **kwargs,
+    ):
+        if not os.path.isfile(vocab_file):
+            raise ValueError(
+                f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
+                " model use `tokenizer = LayoutLMTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
+            )
+        self.vocab = load_vocab(vocab_file)
+        self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])  # reverse lookup: id -> token
+        self.do_basic_tokenize = do_basic_tokenize
+        if do_basic_tokenize:  # NOTE(review): basic_tokenizer only exists on this path, yet the do_lower_case property reads it unconditionally
+            self.basic_tokenizer = BasicTokenizer(
+                do_lower_case=do_lower_case,
+                never_split=never_split,
+                tokenize_chinese_chars=tokenize_chinese_chars,
+                strip_accents=strip_accents,
+            )
+
+        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token))
+
+        super().__init__(
+            do_lower_case=do_lower_case,
+            do_basic_tokenize=do_basic_tokenize,
+            never_split=never_split,
+            unk_token=unk_token,
+            sep_token=sep_token,
+            pad_token=pad_token,
+            cls_token=cls_token,
+            mask_token=mask_token,
+            tokenize_chinese_chars=tokenize_chinese_chars,
+            strip_accents=strip_accents,
+            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+            **kwargs,
+        )
+
+    @property
+    def do_lower_case(self):
+        return self.basic_tokenizer.do_lower_case  # NOTE(review): __init__ sets no basic_tokenizer when do_basic_tokenize=False - confirm this cannot raise AttributeError
+
+    @property
+    def vocab_size(self):
+        return len(self.vocab)  # size of the base vocabulary only; added tokens are not counted here
+
+    def get_vocab(self):
+        return dict(self.vocab, **self.added_tokens_encoder)  # base vocab merged with added tokens; added tokens win on a key clash
+
+    def _tokenize(self, text, split_special_tokens=False):
+        split_tokens = []
+        if self.do_basic_tokenize:
+            for token in self.basic_tokenizer.tokenize(
+                text, never_split=self.all_special_tokens if not split_special_tokens else None  # special tokens are protected unless splitting them was requested
+            ):
+                # Tokens in the tokenizer-level never_split set bypass WordPiece (the per-call list above is not consulted here)
+                if token in self.basic_tokenizer.never_split:
+                    split_tokens.append(token)
+                else:
+                    split_tokens += self.wordpiece_tokenizer.tokenize(token)
+        else:
+            split_tokens = self.wordpiece_tokenizer.tokenize(text)  # no basic pass: WordPiece only whitespace-splits the raw text
+        return split_tokens
+
+    def _convert_token_to_id(self, token):
+        """Converts a token (str) in an id using the vocab."""
+        return self.vocab.get(token, self.vocab.get(self.unk_token))  # unknown tokens get unk_token's id (None if unk_token itself is absent)
+
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) in a token (str) using the vocab."""
+        return self.ids_to_tokens.get(index, self.unk_token)  # ids outside the vocab map to the unk_token string
+
+    def convert_tokens_to_string(self, tokens):
+        """Converts a sequence of tokens (string) in a single string."""
+        out_string = " ".join(tokens).replace(" ##", "").strip()  # glue continuation pieces back onto the preceding piece
+        return out_string
+
+    def build_inputs_with_special_tokens(
+        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
+    ) -> list[int]:
+        """
+        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
+        adding special tokens. A LayoutLM sequence has the following format:
+
+        - single sequence: `[CLS] X [SEP]`
+        - pair of sequences: `[CLS] A [SEP] B [SEP]`
+
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs to which the special tokens will be added.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
+        """
+        if token_ids_1 is None:
+            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
+        cls = [self.cls_token_id]
+        sep = [self.sep_token_id]
+        return cls + token_ids_0 + sep + token_ids_1 + sep  # the same separator id closes both segments
+
+    def get_special_tokens_mask(
+        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False
+    ) -> list[int]:
+        """
+        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
+        special tokens using the tokenizer `prepare_for_model` method.
+
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
+                Whether or not the token list is already formatted with special tokens for the model.
+
+        Returns:
+            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+        """
+
+        if already_has_special_tokens:
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+            )
+
+        if token_ids_1 is not None:
+            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]  # same layout as build_inputs_with_special_tokens for a pair
+        return [1] + ([0] * len(token_ids_0)) + [1]
+
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
+        index = 0  # next id expected while walking the vocabulary in id order
+        if os.path.isdir(save_directory):
+            vocab_file = os.path.join(
+                save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+            )
+        else:
+            vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory  # not an existing directory: used as the file path itself, prefix prepended to the whole string
+        with open(vocab_file, "w", encoding="utf-8") as writer:
+            for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):  # write tokens in ascending id order
+                if index != token_index:
+                    logger.warning(
+                        f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive."
+                        " Please check that the vocabulary is not corrupted!"
+                    )
+                    index = token_index  # resync so the warning fires once per gap rather than for every later token
+                writer.write(token + "\n")
+                index += 1
+        return (vocab_file,)
+
+
+# Copied from transformers.models.bert.tokenization_bert.BasicTokenizer
+class BasicTokenizer:
+    """
+    Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.).
+
+    Args:
+        do_lower_case (`bool`, *optional*, defaults to `True`):
+            Whether or not to lowercase the input when tokenizing.
+        never_split (`Iterable`, *optional*):
+            Collection of tokens which will never be split during tokenization. Only has an effect when
+            `do_basic_tokenize=True`
+        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
+            Whether or not to tokenize Chinese characters.
+
+            This should likely be deactivated for Japanese (see this
+            [issue](https://github.com/huggingface/transformers/issues/328)).
+        strip_accents (`bool`, *optional*):
+            Whether or not to strip all accents. If this option is not specified, then it will be determined by the
+            value for `lowercase` (as in the original BERT).
+        do_split_on_punc (`bool`, *optional*, defaults to `True`):
+            In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
+            the full context of the words, such as contractions.
+    """
+
+    def __init__(
+        self,
+        do_lower_case=True,
+        never_split=None,
+        tokenize_chinese_chars=True,
+        strip_accents=None,
+        do_split_on_punc=True,
+    ):
+        if never_split is None:
+            never_split = []
+        self.do_lower_case = do_lower_case
+        self.never_split = set(never_split)  # stored as a set for membership tests
+        self.tokenize_chinese_chars = tokenize_chinese_chars
+        self.strip_accents = strip_accents
+        self.do_split_on_punc = do_split_on_punc
+
+    def tokenize(self, text, never_split=None):
+        """
+        Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.
+
+        Args:
+            never_split (`List[str]`, *optional*)
+                Kept for backward compatibility purposes. Now implemented directly at the base class level (see
+                [`PreTrainedTokenizer.tokenize`]) List of token not to split.
+        """
+        # union() returns a new set, so self.never_split itself is left unmodified by the per-call list.
+        never_split = self.never_split.union(set(never_split)) if never_split else self.never_split
+        text = self._clean_text(text)
+
+        # This was added on November 1st, 2018 for the multilingual and Chinese
+        # models. This is also applied to the English models now, but it doesn't
+        # matter since the English models were not trained on any Chinese data
+        # and generally don't have any Chinese data in them (there are Chinese
+        # characters in the vocabulary because Wikipedia does have some Chinese
+        # words in the English Wikipedia.).
+        if self.tokenize_chinese_chars:
+            text = self._tokenize_chinese_chars(text)
+        # prevents treating the same character with different unicode codepoints as different characters
+        unicode_normalized_text = unicodedata.normalize("NFC", text)
+        orig_tokens = whitespace_tokenize(unicode_normalized_text)
+        split_tokens = []
+        for token in orig_tokens:
+            if token not in never_split:  # protected tokens skip case/accent normalization (and splitting, see _run_split_on_punc)
+                if self.do_lower_case:
+                    token = token.lower()
+                    if self.strip_accents is not False:  # None (unset) follows lowercasing; only an explicit False keeps accents
+                        token = self._run_strip_accents(token)
+                elif self.strip_accents:  # no lowercasing: accents are stripped only when explicitly enabled
+                    token = self._run_strip_accents(token)
+            split_tokens.extend(self._run_split_on_punc(token, never_split))
+
+        output_tokens = whitespace_tokenize(" ".join(split_tokens))  # join and re-split on whitespace to get the final flat list
+        return output_tokens
+
+    def _run_strip_accents(self, text):
+        """Strips accents from a piece of text."""
+        text = unicodedata.normalize("NFD", text)  # decompose so accents become separate combining characters
+        output = []
+        for char in text:
+            cat = unicodedata.category(char)
+            if cat == "Mn":  # Unicode category "Mark, nonspacing": dropped
+                continue
+            output.append(char)
+        return "".join(output)
+
+    def _run_split_on_punc(self, text, never_split=None):
+        """Splits punctuation on a piece of text."""
+        if not self.do_split_on_punc or (never_split is not None and text in never_split):  # splitting disabled or token protected: return it whole
+            return [text]
+        chars = list(text)
+        i = 0
+        start_new_word = True
+        output = []  # list of character lists, one per output piece
+        while i < len(chars):
+            char = chars[i]
+            if _is_punctuation(char):
+                output.append([char])  # each punctuation character becomes its own piece
+                start_new_word = True
+            else:
+                if start_new_word:
+                    output.append([])  # open a new piece for this run of non-punctuation characters
+                start_new_word = False
+                output[-1].append(char)
+            i += 1
+
+        return ["".join(x) for x in output]
+
+    def _tokenize_chinese_chars(self, text):
+        """Adds whitespace around any CJK character."""
+        output = []
+        for char in text:
+            cp = ord(char)
+            if self._is_chinese_char(cp):
+                output.append(" ")
+                output.append(char)
+                output.append(" ")
+            else:
+                output.append(char)
+        return "".join(output)
+
+    def _is_chinese_char(self, cp):
+        """Checks whether CP is the codepoint of a CJK character."""
+        # This defines a "chinese character" as anything in the CJK Unicode block:
+        #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
+        #
+        # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
+        # despite its name. The modern Korean Hangul alphabet is a different block,
+        # as is Japanese Hiragana and Katakana. Those alphabets are used to write
+        # space-separated words, so they are not treated specially and handled
+        # like all of the other languages.
+        if (
+            (cp >= 0x4E00 and cp <= 0x9FFF)
+            or (cp >= 0x3400 and cp <= 0x4DBF)
+            or (cp >= 0x20000 and cp <= 0x2A6DF)
+            or (cp >= 0x2A700 and cp <= 0x2B73F)
+            or (cp >= 0x2B740 and cp <= 0x2B81F)
+            or (cp >= 0x2B820 and cp <= 0x2CEAF)
+            or (cp >= 0xF900 and cp <= 0xFAFF)
+            or (cp >= 0x2F800 and cp <= 0x2FA1F)
+        ):
+            return True
+
+        return False
+
+    def _clean_text(self, text):
+        """Performs invalid character removal and whitespace cleanup on text."""
+        output = []
+        for char in text:
+            cp = ord(char)
+            if cp == 0 or cp == 0xFFFD or _is_control(char):  # drop NUL, U+FFFD (replacement character) and control characters
+                continue
+            if _is_whitespace(char):
+                output.append(" ")  # every whitespace character is normalized to a plain space
+            else:
+                output.append(char)
+        return "".join(output)
+
+
+# Copied from transformers.models.bert.tokenization_bert.WordpieceTokenizer
+class WordpieceTokenizer:
+    """Runs WordPiece tokenization."""
+
+    def __init__(self, vocab, unk_token, max_input_chars_per_word=100):
+        self.vocab = vocab
+        self.unk_token = unk_token
+        self.max_input_chars_per_word = max_input_chars_per_word  # words longer than this are emitted as unk_token without being split
+
+    def tokenize(self, text):
+        """
+        Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform
+        tokenization using the given vocabulary.
+
+        For example, `input = "unaffable"` will return as output `["un", "##aff", "##able"]`.
+
+        Args:
+            text: A single token or whitespace separated tokens. This should have
+                already been passed through *BasicTokenizer*.
+
+        Returns:
+            A list of wordpiece tokens.
+        """
+
+        output_tokens = []
+        for token in whitespace_tokenize(text):
+            chars = list(token)
+            if len(chars) > self.max_input_chars_per_word:  # over-long word: a single unk_token, no splitting attempted
+                output_tokens.append(self.unk_token)
+                continue
+
+            is_bad = False
+            start = 0  # index of the first character not yet covered by a piece
+            sub_tokens = []
+            while start < len(chars):
+                end = len(chars)  # try the longest remaining substring first
+                cur_substr = None
+                while start < end:
+                    substr = "".join(chars[start:end])
+                    if start > 0:
+                        substr = "##" + substr  # pieces that do not start the word carry the continuation prefix
+                    if substr in self.vocab:
+                        cur_substr = substr
+                        break
+                    end -= 1  # no match: shorten the candidate by one character from the right
+                if cur_substr is None:
+                    is_bad = True  # nothing starting at this position is in the vocab: the whole word becomes unk_token
+                    break
+                sub_tokens.append(cur_substr)
+                start = end  # continue right after the matched piece
+
+            if is_bad:
+                output_tokens.append(self.unk_token)  # pieces already matched for this word are discarded
+            else:
+                output_tokens.extend(sub_tokens)
+        return output_tokens
+
+
+__all__ = ["LayoutLMTokenizer"]  # only the tokenizer class is exported by a star-import; the helper classes and functions are not
diff --git a/phivenv/Lib/site-packages/transformers/models/layoutlm/tokenization_layoutlm_fast.py b/phivenv/Lib/site-packages/transformers/models/layoutlm/tokenization_layoutlm_fast.py
new file mode 100644
index 0000000000000000000000000000000000000000..c7ade6e0b8cdf04f2d4f06b6191b93a8ed7ee2a6
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/layoutlm/tokenization_layoutlm_fast.py
@@ -0,0 +1,147 @@
+# coding=utf-8
+# Copyright 2018 The Microsoft Research Asia LayoutLM Team Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization class for model LayoutLM."""
+
+import json
+from typing import Optional
+
+from tokenizers import normalizers
+
+from ...tokenization_utils_fast import PreTrainedTokenizerFast
+from ...utils import logging
+from .tokenization_layoutlm import LayoutLMTokenizer
+
+
+logger = logging.get_logger(__name__)
+
+VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"}  # used as `vocab_files_names`
+
+
+# Copied from transformers.models.bert.tokenization_bert_fast.BertTokenizerFast with Bert->LayoutLM,BERT->LayoutLM
+class LayoutLMTokenizerFast(PreTrainedTokenizerFast):
+    r"""
+    Construct a "fast" LayoutLM tokenizer (backed by HuggingFace's *tokenizers* library). Based on WordPiece.
+
+    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
+    refer to this superclass for more information regarding those methods.
+
+    Args:
+        vocab_file (`str`, *optional*):
+            File containing the vocabulary. `tokenizer_file` (also *optional*) is forwarded as-is to the superclass.
+        do_lower_case (`bool`, *optional*, defaults to `True`):
+            Whether or not to lowercase the input when tokenizing.
+        unk_token (`str`, *optional*, defaults to `"[UNK]"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        sep_token (`str`, *optional*, defaults to `"[SEP]"`):
+            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
+            sequence classification or for a text and a question for question answering. It is also used as the last
+            token of a sequence built with special tokens.
+        pad_token (`str`, *optional*, defaults to `"[PAD]"`):
+            The token used for padding, for example when batching sequences of different lengths.
+        cls_token (`str`, *optional*, defaults to `"[CLS]"`):
+            The classifier token which is used when doing sequence classification (classification of the whole sequence
+            instead of per-token classification). It is the first token of the sequence when built with special tokens.
+        mask_token (`str`, *optional*, defaults to `"[MASK]"`):
+            The token used for masking values. This is the token used when training this model with masked language
+            modeling. This is the token which the model will try to predict.
+        clean_text (`bool`, *optional*, defaults to `True`):
+            Whether or not to clean the text before tokenization by removing any control characters and replacing all
+            whitespaces by the classic one.
+        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
+            Whether or not to tokenize Chinese characters. This should likely be deactivated for Japanese (see [this
+            issue](https://github.com/huggingface/transformers/issues/328)).
+        strip_accents (`bool`, *optional*):
+            Whether or not to strip all accents. If this option is not specified, then it will be determined by the
+            value for `lowercase` (as in the original LayoutLM).
+        wordpieces_prefix (`str`, *optional*, defaults to `"##"`):
+            The prefix for subwords.
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    slow_tokenizer_class = LayoutLMTokenizer
+
+    def __init__(
+        self,
+        vocab_file=None,
+        tokenizer_file=None,
+        do_lower_case=True,
+        unk_token="[UNK]",
+        sep_token="[SEP]",
+        pad_token="[PAD]",
+        cls_token="[CLS]",
+        mask_token="[MASK]",
+        tokenize_chinese_chars=True,
+        strip_accents=None,
+        **kwargs,
+    ):
+        super().__init__(
+            vocab_file,
+            tokenizer_file=tokenizer_file,
+            do_lower_case=do_lower_case,
+            unk_token=unk_token,
+            sep_token=sep_token,
+            pad_token=pad_token,
+            cls_token=cls_token,
+            mask_token=mask_token,
+            tokenize_chinese_chars=tokenize_chinese_chars,
+            strip_accents=strip_accents,
+            **kwargs,
+        )
+
+        normalizer_state = json.loads(self.backend_tokenizer.normalizer.__getstate__())
+        if (  # a key missing from the saved state counts as "already matching" (the argument is the default)
+            normalizer_state.get("lowercase", do_lower_case) != do_lower_case
+            or normalizer_state.get("strip_accents", strip_accents) != strip_accents
+            or normalizer_state.get("handle_chinese_chars", tokenize_chinese_chars) != tokenize_chinese_chars
+        ):  # rebuild the normalizer: "type" names the class, the remaining keys become its kwargs
+            normalizer_class = getattr(normalizers, normalizer_state.pop("type"))
+            normalizer_state["lowercase"] = do_lower_case
+            normalizer_state["strip_accents"] = strip_accents
+            normalizer_state["handle_chinese_chars"] = tokenize_chinese_chars
+            self.backend_tokenizer.normalizer = normalizer_class(**normalizer_state)
+
+        self.do_lower_case = do_lower_case
+
+    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
+        """
+        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
+        adding special tokens. A LayoutLM sequence has the following format:
+
+        - single sequence: `[CLS] X [SEP]`
+        - pair of sequences: `[CLS] A [SEP] B [SEP]`
+
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs to which the special tokens will be added.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
+        """
+        output = [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
+
+        if token_ids_1 is not None:
+            output += token_ids_1 + [self.sep_token_id]
+
+        return output
+
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
+        files = self._tokenizer.model.save(save_directory, name=filename_prefix)  # delegate to the backend model
+        return tuple(files)  # whatever the backend's save() returned, as a tuple
+
+
+__all__ = ["LayoutLMTokenizerFast"]
diff --git a/phivenv/Lib/site-packages/transformers/models/layoutlmv2/__init__.py b/phivenv/Lib/site-packages/transformers/models/layoutlmv2/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b68a523c0b0c362d9930f5bee492cea73f3937f0
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/layoutlmv2/__init__.py
@@ -0,0 +1,33 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:  # static type checkers resolve the public names through these star imports
+    from .configuration_layoutlmv2 import *
+    from .feature_extraction_layoutlmv2 import *
+    from .image_processing_layoutlmv2 import *
+    from .image_processing_layoutlmv2_fast import *
+    from .modeling_layoutlmv2 import *
+    from .processing_layoutlmv2 import *
+    from .tokenization_layoutlmv2 import *
+    from .tokenization_layoutlmv2_fast import *
+else:  # at runtime this module's entry in sys.modules is replaced by a _LazyModule instance
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/phivenv/Lib/site-packages/transformers/models/layoutlmv2/__pycache__/__init__.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/layoutlmv2/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3dbea556c357c51f1e5aee47dbc2342ff3f36432
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/layoutlmv2/__pycache__/__init__.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/layoutlmv2/__pycache__/configuration_layoutlmv2.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/layoutlmv2/__pycache__/configuration_layoutlmv2.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fb84aa0ec7c0893e88a575f106603218e2288dcd
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/layoutlmv2/__pycache__/configuration_layoutlmv2.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/layoutlmv2/__pycache__/feature_extraction_layoutlmv2.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/layoutlmv2/__pycache__/feature_extraction_layoutlmv2.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..59557bbb559f7f69b9271caf6871460b0a4f2e03
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/layoutlmv2/__pycache__/feature_extraction_layoutlmv2.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/layoutlmv2/__pycache__/image_processing_layoutlmv2.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/layoutlmv2/__pycache__/image_processing_layoutlmv2.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..48c3cb6bfa12913020e64654ea3aaf95b8de1a40
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/layoutlmv2/__pycache__/image_processing_layoutlmv2.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/layoutlmv2/__pycache__/image_processing_layoutlmv2_fast.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/layoutlmv2/__pycache__/image_processing_layoutlmv2_fast.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3752aa04b631eefa86d8266d87c2f1a42a79d74e
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/layoutlmv2/__pycache__/image_processing_layoutlmv2_fast.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/layoutlmv2/__pycache__/modeling_layoutlmv2.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/layoutlmv2/__pycache__/modeling_layoutlmv2.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..19ec8e501c29ab388cd4171b3815412ca435f62e
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/layoutlmv2/__pycache__/modeling_layoutlmv2.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/layoutlmv2/__pycache__/processing_layoutlmv2.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/layoutlmv2/__pycache__/processing_layoutlmv2.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..76d7bb90df94154b3a3fd41d237df74a7c647ead
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/layoutlmv2/__pycache__/processing_layoutlmv2.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/layoutlmv2/__pycache__/tokenization_layoutlmv2.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/layoutlmv2/__pycache__/tokenization_layoutlmv2.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a51d9b0a5f56610a2cd32bcbd616744f947a4a1e
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/layoutlmv2/__pycache__/tokenization_layoutlmv2.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/layoutlmv2/__pycache__/tokenization_layoutlmv2_fast.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/layoutlmv2/__pycache__/tokenization_layoutlmv2_fast.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..aec73237cf7566e6c4723c13ebdc0860203f2133
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/layoutlmv2/__pycache__/tokenization_layoutlmv2_fast.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/layoutlmv2/configuration_layoutlmv2.py b/phivenv/Lib/site-packages/transformers/models/layoutlmv2/configuration_layoutlmv2.py
new file mode 100644
index 0000000000000000000000000000000000000000..b729ddbb1d429f3cfe464f303ec55ae391d498c3
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/layoutlmv2/configuration_layoutlmv2.py
@@ -0,0 +1,222 @@
+# coding=utf-8
+# Copyright Microsoft Research and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""LayoutLMv2 model configuration"""
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import is_detectron2_available, logging
+
+
+logger = logging.get_logger(__name__)
+
+
+# soft dependency: imported only when is_detectron2_available() is true; get_detectron2_config() needs this name
+if is_detectron2_available():
+    import detectron2
+
+
+class LayoutLMv2Config(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`LayoutLMv2Model`]. It is used to instantiate an
+    LayoutLMv2 model according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of the LayoutLMv2
+    [microsoft/layoutlmv2-base-uncased](https://huggingface.co/microsoft/layoutlmv2-base-uncased) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 30522):
+            Vocabulary size of the LayoutLMv2 model. Defines the number of different tokens that can be represented by
+            the `input_ids` passed when calling [`LayoutLMv2Model`] or [`TFLayoutLMv2Model`].
+        hidden_size (`int`, *optional*, defaults to 768):
+            Dimension of the encoder layers and the pooler layer.
+        num_hidden_layers (`int`, *optional*, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 12):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        intermediate_size (`int`, *optional*, defaults to 3072):
+            Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"selu"` and `"gelu_new"` are supported.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout ratio for the attention probabilities.
+        max_position_embeddings (`int`, *optional*, defaults to 512):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            just in case (e.g., 512 or 1024 or 2048).
+        type_vocab_size (`int`, *optional*, defaults to 2):
+            The vocabulary size of the `token_type_ids` passed when calling [`LayoutLMv2Model`] or
+            [`TFLayoutLMv2Model`].
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
+            The epsilon used by the layer normalization layers.
+        max_2d_position_embeddings (`int`, *optional*, defaults to 1024):
+            The maximum value that the 2D position embedding might ever be used with. Typically set this to something
+            large just in case (e.g., 1024).
+        max_rel_pos (`int`, *optional*, defaults to 128):
+            The maximum number of relative positions to be used in the self-attention mechanism.
+        rel_pos_bins (`int`, *optional*, defaults to 32):
+            The number of relative position bins to be used in the self-attention mechanism.
+        fast_qkv (`bool`, *optional*, defaults to `True`):
+            Whether or not to use a single matrix for the queries, keys, values in the self-attention layers.
+        max_rel_2d_pos (`int`, *optional*, defaults to 256):
+            The maximum number of relative 2D positions in the self-attention mechanism.
+        rel_2d_pos_bins (`int`, *optional*, defaults to 64):
+            The number of 2D relative position bins in the self-attention mechanism.
+        image_feature_pool_shape (`list[int]`, *optional*, defaults to [7, 7, 256]):
+            The shape of the average-pooled feature map.
+        coordinate_size (`int`, *optional*, defaults to 128):
+            Dimension of the coordinate embeddings.
+        shape_size (`int`, *optional*, defaults to 128):
+            Dimension of the width and height embeddings.
+        has_relative_attention_bias (`bool`, *optional*, defaults to `True`):
+            Whether or not to use a relative attention bias in the self-attention mechanism.
+        has_spatial_attention_bias (`bool`, *optional*, defaults to `True`):
+            Whether or not to use a spatial attention bias in the self-attention mechanism.
+        has_visual_segment_embedding (`bool`, *optional*, defaults to `False`):
+            Whether or not to add visual segment embeddings.
+        detectron2_config_args (`dict`, *optional*):
+            Dictionary containing the configuration arguments of the Detectron2 visual backbone. Refer to [this
+            file](https://github.com/microsoft/unilm/blob/master/layoutlmft/layoutlmft/models/layoutlmv2/detectron2_config.py)
+            for details regarding default values.
+
+    Example:
+
+    ```python
+    >>> from transformers import LayoutLMv2Config, LayoutLMv2Model
+
+    >>> # Initializing a LayoutLMv2 microsoft/layoutlmv2-base-uncased style configuration
+    >>> configuration = LayoutLMv2Config()
+
+    >>> # Initializing a model (with random weights) from the microsoft/layoutlmv2-base-uncased style configuration
+    >>> model = LayoutLMv2Model(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "layoutlmv2"
+
+    def __init__(
+        self,
+        vocab_size=30522,
+        hidden_size=768,
+        num_hidden_layers=12,
+        num_attention_heads=12,
+        intermediate_size=3072,
+        hidden_act="gelu",
+        hidden_dropout_prob=0.1,
+        attention_probs_dropout_prob=0.1,
+        max_position_embeddings=512,
+        type_vocab_size=2,
+        initializer_range=0.02,
+        layer_norm_eps=1e-12,
+        pad_token_id=0,
+        max_2d_position_embeddings=1024,
+        max_rel_pos=128,
+        rel_pos_bins=32,
+        fast_qkv=True,
+        max_rel_2d_pos=256,
+        rel_2d_pos_bins=64,
+        convert_sync_batchnorm=True,  # accepted and stored, but not described in the Args section above
+        image_feature_pool_shape=[7, 7, 256],  # NOTE(review): mutable default, stored uncopied; avoid in-place edits
+        coordinate_size=128,
+        shape_size=128,
+        has_relative_attention_bias=True,
+        has_spatial_attention_bias=True,
+        has_visual_segment_embedding=False,
+        detectron2_config_args=None,
+        **kwargs,
+    ):
+        super().__init__(
+            vocab_size=vocab_size,
+            hidden_size=hidden_size,
+            num_hidden_layers=num_hidden_layers,
+            num_attention_heads=num_attention_heads,
+            intermediate_size=intermediate_size,
+            hidden_act=hidden_act,
+            hidden_dropout_prob=hidden_dropout_prob,
+            attention_probs_dropout_prob=attention_probs_dropout_prob,
+            max_position_embeddings=max_position_embeddings,
+            type_vocab_size=type_vocab_size,
+            initializer_range=initializer_range,
+            layer_norm_eps=layer_norm_eps,
+            pad_token_id=pad_token_id,
+            **kwargs,
+        )
+        self.max_2d_position_embeddings = max_2d_position_embeddings
+        self.max_rel_pos = max_rel_pos
+        self.rel_pos_bins = rel_pos_bins
+        self.fast_qkv = fast_qkv
+        self.max_rel_2d_pos = max_rel_2d_pos
+        self.rel_2d_pos_bins = rel_2d_pos_bins
+        self.convert_sync_batchnorm = convert_sync_batchnorm
+        self.image_feature_pool_shape = image_feature_pool_shape
+        self.coordinate_size = coordinate_size
+        self.shape_size = shape_size
+        self.has_relative_attention_bias = has_relative_attention_bias
+        self.has_spatial_attention_bias = has_spatial_attention_bias
+        self.has_visual_segment_embedding = has_visual_segment_embedding
+        self.detectron2_config_args = (  # fall back to the built-in defaults when no dict is supplied
+            detectron2_config_args if detectron2_config_args is not None else self.get_default_detectron2_config()
+        )
+
+    @classmethod
+    def get_default_detectron2_config(cls):
+        return {  # a fresh dict per call; dotted keys are attribute paths resolved by get_detectron2_config()
+            "MODEL.MASK_ON": True,
+            "MODEL.PIXEL_STD": [57.375, 57.120, 58.395],
+            "MODEL.BACKBONE.NAME": "build_resnet_fpn_backbone",
+            "MODEL.FPN.IN_FEATURES": ["res2", "res3", "res4", "res5"],
+            "MODEL.ANCHOR_GENERATOR.SIZES": [[32], [64], [128], [256], [512]],
+            "MODEL.RPN.IN_FEATURES": ["p2", "p3", "p4", "p5", "p6"],
+            "MODEL.RPN.PRE_NMS_TOPK_TRAIN": 2000,
+            "MODEL.RPN.PRE_NMS_TOPK_TEST": 1000,
+            "MODEL.RPN.POST_NMS_TOPK_TRAIN": 1000,
+            "MODEL.POST_NMS_TOPK_TEST": 1000,  # NOTE(review): not under MODEL.RPN like its siblings; confirm intended
+            "MODEL.ROI_HEADS.NAME": "StandardROIHeads",
+            "MODEL.ROI_HEADS.NUM_CLASSES": 5,
+            "MODEL.ROI_HEADS.IN_FEATURES": ["p2", "p3", "p4", "p5"],
+            "MODEL.ROI_BOX_HEAD.NAME": "FastRCNNConvFCHead",
+            "MODEL.ROI_BOX_HEAD.NUM_FC": 2,
+            "MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION": 14,
+            "MODEL.ROI_MASK_HEAD.NAME": "MaskRCNNConvUpsampleHead",
+            "MODEL.ROI_MASK_HEAD.NUM_CONV": 4,
+            "MODEL.ROI_MASK_HEAD.POOLER_RESOLUTION": 7,
+            "MODEL.RESNETS.DEPTH": 101,
+            "MODEL.RESNETS.SIZES": [[32], [64], [128], [256], [512]],
+            "MODEL.RESNETS.ASPECT_RATIOS": [[0.5, 1.0, 2.0]],
+            "MODEL.RESNETS.OUT_FEATURES": ["res2", "res3", "res4", "res5"],
+            "MODEL.RESNETS.NUM_GROUPS": 32,
+            "MODEL.RESNETS.WIDTH_PER_GROUP": 8,
+            "MODEL.RESNETS.STRIDE_IN_1X1": False,
+        }
+
+    def get_detectron2_config(self):
+        detectron2_config = detectron2.config.get_cfg()  # `detectron2` exists only if the guarded import ran
+        for k, v in self.detectron2_config_args.items():
+            attributes = k.split(".")
+            to_set = detectron2_config
+            for attribute in attributes[:-1]:  # walk down to the node that owns the final attribute
+                to_set = getattr(to_set, attribute)
+            setattr(to_set, attributes[-1], v)
+
+        return detectron2_config
+
+
+__all__ = ["LayoutLMv2Config"]
diff --git a/phivenv/Lib/site-packages/transformers/models/layoutlmv2/feature_extraction_layoutlmv2.py b/phivenv/Lib/site-packages/transformers/models/layoutlmv2/feature_extraction_layoutlmv2.py
new file mode 100644
index 0000000000000000000000000000000000000000..8c70e1ed643101401373ae29637d4d597873485e
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/layoutlmv2/feature_extraction_layoutlmv2.py
@@ -0,0 +1,40 @@
+# coding=utf-8
+# Copyright 2021 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Feature extractor class for LayoutLMv2.
+"""
+
+import warnings
+
+from ...utils import logging
+from ...utils.import_utils import requires
+from .image_processing_layoutlmv2 import LayoutLMv2ImageProcessor
+
+
+logger = logging.get_logger(__name__)
+
+
+@requires(backends=("vision",))
+class LayoutLMv2FeatureExtractor(LayoutLMv2ImageProcessor):
+    def __init__(self, *args, **kwargs) -> None:  # deprecated alias: warn, then defer to LayoutLMv2ImageProcessor
+        warnings.warn(
+            "The class LayoutLMv2FeatureExtractor is deprecated and will be removed in version 5 of Transformers."
+            " Please use LayoutLMv2ImageProcessor instead.",
+            FutureWarning,
+        )
+        super().__init__(*args, **kwargs)  # arguments are passed through unchanged
+
+
+__all__ = ["LayoutLMv2FeatureExtractor"]
diff --git a/phivenv/Lib/site-packages/transformers/models/layoutlmv2/image_processing_layoutlmv2.py b/phivenv/Lib/site-packages/transformers/models/layoutlmv2/image_processing_layoutlmv2.py
new file mode 100644
index 0000000000000000000000000000000000000000..dfb61c352dc33ac8a5ef10270384bce5ece77386
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/layoutlmv2/image_processing_layoutlmv2.py
@@ -0,0 +1,303 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Image processor class for LayoutLMv2."""
+
+from typing import Optional, Union
+
+import numpy as np
+
+from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
+from ...image_transforms import flip_channel_order, resize, to_channel_dimension_format, to_pil_image
+from ...image_utils import (
+    ChannelDimension,
+    ImageInput,
+    PILImageResampling,
+    infer_channel_dimension_format,
+    make_list_of_images,
+    to_numpy_array,
+    valid_images,
+    validate_preprocess_arguments,
+)
+from ...utils import (
+    TensorType,
+    filter_out_non_signature_kwargs,
+    is_pytesseract_available,
+    is_vision_available,
+    logging,
+    requires_backends,
+)
+from ...utils.import_utils import requires
+
+
+if is_vision_available():
+    import PIL
+
+# soft dependency
+if is_pytesseract_available():
+    import pytesseract
+
+logger = logging.get_logger(__name__)
+
+
+def normalize_box(box, width, height):
+    return [
+        int(1000 * (box[0] / width)),
+        int(1000 * (box[1] / height)),
+        int(1000 * (box[2] / width)),
+        int(1000 * (box[3] / height)),
+    ]
+
+
+def apply_tesseract(
+    image: np.ndarray,
+    lang: Optional[str],
+    tesseract_config: Optional[str] = None,
+    input_data_format: Optional[Union[str, ChannelDimension]] = None,
+):
+    """Applies Tesseract OCR on a document image, and returns recognized words + normalized bounding boxes."""
+    tesseract_config = tesseract_config if tesseract_config is not None else ""
+
+    # apply OCR
+    pil_image = to_pil_image(image, input_data_format=input_data_format)
+    image_width, image_height = pil_image.size
+    data = pytesseract.image_to_data(pil_image, lang=lang, output_type="dict", config=tesseract_config)
+    words, left, top, width, height = data["text"], data["left"], data["top"], data["width"], data["height"]
+
+    # filter empty words and corresponding coordinates
+    irrelevant_indices = [idx for idx, word in enumerate(words) if not word.strip()]
+    words = [word for idx, word in enumerate(words) if idx not in irrelevant_indices]
+    left = [coord for idx, coord in enumerate(left) if idx not in irrelevant_indices]
+    top = [coord for idx, coord in enumerate(top) if idx not in irrelevant_indices]
+    width = [coord for idx, coord in enumerate(width) if idx not in irrelevant_indices]
+    height = [coord for idx, coord in enumerate(height) if idx not in irrelevant_indices]
+
+    # turn coordinates into (left, top, left+width, top+height) format
+    actual_boxes = []
+    for x, y, w, h in zip(left, top, width, height):
+        actual_box = [x, y, x + w, y + h]
+        actual_boxes.append(actual_box)
+
+    # finally, normalize the bounding boxes
+    normalized_boxes = []
+    for box in actual_boxes:
+        normalized_boxes.append(normalize_box(box, image_width, image_height))
+
+    assert len(words) == len(normalized_boxes), "Not as many words as there are bounding boxes"
+
+    return words, normalized_boxes
+
+
+@requires(backends=("vision",))
+class LayoutLMv2ImageProcessor(BaseImageProcessor):
+    r"""
+    Constructs a LayoutLMv2 image processor.
+
+    Args:
+        do_resize (`bool`, *optional*, defaults to `True`):
+            Whether to resize the image's (height, width) dimensions to `(size["height"], size["width"])`. Can be
+            overridden by `do_resize` in `preprocess`.
+        size (`dict[str, int]` *optional*, defaults to `{"height": 224, "width": 224}`):
+            Size of the image after resizing. Can be overridden by `size` in `preprocess`.
+        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`):
+            Resampling filter to use if resizing the image. Can be overridden by the `resample` parameter in the
+            `preprocess` method.
+        apply_ocr (`bool`, *optional*, defaults to `True`):
+            Whether to apply the Tesseract OCR engine to get words + normalized bounding boxes. Can be overridden by
+            `apply_ocr` in `preprocess`.
+        ocr_lang (`str`, *optional*):
+            The language, specified by its ISO code, to be used by the Tesseract OCR engine. By default, English is
+            used. Can be overridden by `ocr_lang` in `preprocess`.
+        tesseract_config (`str`, *optional*, defaults to `""`):
+            Any additional custom configuration flags that are forwarded to the `config` parameter when calling
+            Tesseract. For example: '--psm 6'. Can be overridden by `tesseract_config` in `preprocess`.
+    """
+
+    model_input_names = ["pixel_values"]
+
+    def __init__(
+        self,
+        do_resize: bool = True,
+        size: Optional[dict[str, int]] = None,
+        resample: PILImageResampling = PILImageResampling.BILINEAR,
+        apply_ocr: bool = True,
+        ocr_lang: Optional[str] = None,
+        tesseract_config: Optional[str] = "",
+        **kwargs,
+    ) -> None:
+        super().__init__(**kwargs)
+        size = size if size is not None else {"height": 224, "width": 224}
+        size = get_size_dict(size)
+
+        self.do_resize = do_resize
+        self.size = size
+        self.resample = resample
+        self.apply_ocr = apply_ocr
+        self.ocr_lang = ocr_lang
+        self.tesseract_config = tesseract_config
+
+    # Copied from transformers.models.vit.image_processing_vit.ViTImageProcessor.resize
+    def resize(
+        self,
+        image: np.ndarray,
+        size: dict[str, int],
+        resample: PILImageResampling = PILImageResampling.BILINEAR,
+        data_format: Optional[Union[str, ChannelDimension]] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+        **kwargs,
+    ) -> np.ndarray:
+        """
+        Resize an image to `(size["height"], size["width"])`.
+
+        Args:
+            image (`np.ndarray`):
+                Image to resize.
+            size (`dict[str, int]`):
+                Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image.
+            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
+                `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BILINEAR`.
+            data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the output image. If unset, the channel dimension format of the input
+                image is used. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the input image. If unset, the channel dimension format is inferred
+                from the input image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+
+        Returns:
+            `np.ndarray`: The resized image.
+        """
+        size = get_size_dict(size)
+        if "height" not in size or "width" not in size:
+            raise ValueError(f"The `size` dictionary must contain the keys `height` and `width`. Got {size.keys()}")
+        output_size = (size["height"], size["width"])
+        return resize(
+            image,
+            size=output_size,
+            resample=resample,
+            data_format=data_format,
+            input_data_format=input_data_format,
+            **kwargs,
+        )
+
+    @filter_out_non_signature_kwargs()
+    def preprocess(
+        self,
+        images: ImageInput,
+        do_resize: Optional[bool] = None,
+        size: Optional[dict[str, int]] = None,
+        resample: PILImageResampling = None,
+        apply_ocr: Optional[bool] = None,
+        ocr_lang: Optional[str] = None,
+        tesseract_config: Optional[str] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        data_format: ChannelDimension = ChannelDimension.FIRST,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    ) -> PIL.Image.Image:
+        """
+        Preprocess an image or batch of images.
+
+        Args:
+            images (`ImageInput`):
+                Image to preprocess.
+            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
+                Whether to resize the image.
+            size (`dict[str, int]`, *optional*, defaults to `self.size`):
+                Desired size of the output image after resizing.
+            resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
+                Resampling filter to use if resizing the image. This can be one of the enum `PIL.Image` resampling
+                filter. Only has an effect if `do_resize` is set to `True`.
+            apply_ocr (`bool`, *optional*, defaults to `self.apply_ocr`):
+                Whether to apply the Tesseract OCR engine to get words + normalized bounding boxes.
+            ocr_lang (`str`, *optional*, defaults to `self.ocr_lang`):
+                The language, specified by its ISO code, to be used by the Tesseract OCR engine. By default, English is
+                used.
+            tesseract_config (`str`, *optional*, defaults to `self.tesseract_config`):
+                Any additional custom configuration flags that are forwarded to the `config` parameter when calling
+                Tesseract.
+            return_tensors (`str` or `TensorType`, *optional*):
+                The type of tensors to return. Can be one of:
+                    - Unset: Return a list of `np.ndarray`.
+                    - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
+                    - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
+                    - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
+                    - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
+            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
+                The channel dimension format for the output image. Can be one of:
+                    - `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                    - `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+        """
+        do_resize = do_resize if do_resize is not None else self.do_resize
+        size = size if size is not None else self.size
+        size = get_size_dict(size)
+        resample = resample if resample is not None else self.resample
+        apply_ocr = apply_ocr if apply_ocr is not None else self.apply_ocr
+        ocr_lang = ocr_lang if ocr_lang is not None else self.ocr_lang
+        tesseract_config = tesseract_config if tesseract_config is not None else self.tesseract_config
+
+        images = make_list_of_images(images)
+
+        if not valid_images(images):
+            raise ValueError(
+                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
+                "torch.Tensor, tf.Tensor or jax.ndarray."
+            )
+        validate_preprocess_arguments(
+            do_resize=do_resize,
+            size=size,
+            resample=resample,
+        )
+
+        # All transformations expect numpy arrays.
+        images = [to_numpy_array(image) for image in images]
+
+        if input_data_format is None:
+            # We assume that all images have the same channel dimension format.
+            input_data_format = infer_channel_dimension_format(images[0])
+
+        if apply_ocr:
+            requires_backends(self, "pytesseract")
+            words_batch = []
+            boxes_batch = []
+            for image in images:
+                words, boxes = apply_tesseract(image, ocr_lang, tesseract_config, input_data_format=input_data_format)
+                words_batch.append(words)
+                boxes_batch.append(boxes)
+
+        if do_resize:
+            images = [
+                self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format)
+                for image in images
+            ]
+
+        # flip color channels from RGB to BGR (as Detectron2 requires this)
+        images = [flip_channel_order(image, input_data_format=input_data_format) for image in images]
+        images = [
+            to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images
+        ]
+
+        data = BatchFeature(data={"pixel_values": images}, tensor_type=return_tensors)
+
+        if apply_ocr:
+            data["words"] = words_batch
+            data["boxes"] = boxes_batch
+        return data
+
+
+__all__ = ["LayoutLMv2ImageProcessor"]
diff --git a/phivenv/Lib/site-packages/transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py b/phivenv/Lib/site-packages/transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py
new file mode 100644
index 0000000000000000000000000000000000000000..c22612da5858bffc76e8d0bc2ea5b4aa3dd245fc
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py
@@ -0,0 +1,144 @@
+# coding=utf-8
+# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Fast Image processor class for LayoutLMv2."""
+
+from typing import Optional, Union
+
+from ...image_processing_utils_fast import BaseImageProcessorFast, BatchFeature, DefaultFastImageProcessorKwargs
+from ...image_transforms import ChannelDimension, group_images_by_shape, reorder_images
+from ...image_utils import ImageInput, PILImageResampling, SizeDict
+from ...processing_utils import Unpack
+from ...utils import (
+    TensorType,
+    auto_docstring,
+    is_torch_available,
+    is_torchvision_available,
+    is_torchvision_v2_available,
+    logging,
+    requires_backends,
+)
+from .image_processing_layoutlmv2 import apply_tesseract
+
+
+logger = logging.get_logger(__name__)
+
+if is_torch_available():
+    import torch
+
+if is_torchvision_available():
+    if is_torchvision_v2_available():
+        from torchvision.transforms.v2 import functional as F
+    else:
+        from torchvision.transforms import functional as F
+
+
+class LayoutLMv2FastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
+    """
+    Args:
+        apply_ocr (`bool`, *optional*, defaults to `True`):
+            Whether to apply the Tesseract OCR engine to get words + normalized bounding boxes. Can be overridden by
+            the `apply_ocr` parameter in the `preprocess` method.
+        ocr_lang (`str`, *optional*):
+            The language, specified by its ISO code, to be used by the Tesseract OCR engine. By default, English is
+            used. Can be overridden by the `ocr_lang` parameter in the `preprocess` method.
+        tesseract_config (`str`, *optional*):
+            Any additional custom configuration flags that are forwarded to the `config` parameter when calling
+            Tesseract. For example: '--psm 6'. Can be overridden by the `tesseract_config` parameter in the
+            `preprocess` method.
+    """
+
+    # OCR-specific keyword arguments declared on top of those inherited from `DefaultFastImageProcessorKwargs`.
+    # `LayoutLMv2ImageProcessorFast` references this class through `valid_kwargs` and `Unpack[...]` annotations.
+    apply_ocr: Optional[bool]
+    ocr_lang: Optional[str]
+    tesseract_config: Optional[str]
+
+
+@auto_docstring
+class LayoutLMv2ImageProcessorFast(BaseImageProcessorFast):
+    resample = PILImageResampling.BILINEAR
+    size = {"height": 224, "width": 224}
+    rescale_factor = None
+    do_resize = True
+    apply_ocr = True
+    ocr_lang = None
+    tesseract_config = ""
+    valid_kwargs = LayoutLMv2FastImageProcessorKwargs
+
+    def __init__(self, **kwargs: Unpack[LayoutLMv2FastImageProcessorKwargs]):
+        super().__init__(**kwargs)
+
+    @auto_docstring
+    def preprocess(self, images: ImageInput, **kwargs: Unpack[LayoutLMv2FastImageProcessorKwargs]) -> BatchFeature:
+        return super().preprocess(images, **kwargs)
+
+    def _preprocess(
+        self,
+        images: list["torch.Tensor"],
+        do_resize: bool,
+        size: SizeDict,
+        interpolation: Optional["F.InterpolationMode"],
+        apply_ocr: bool,
+        ocr_lang: Optional[str],
+        tesseract_config: Optional[str],
+        disable_grouping: Optional[bool],
+        return_tensors: Optional[Union[str, TensorType]],
+        **kwargs,
+    ) -> BatchFeature:
+        # Tesseract OCR to get words + normalized bounding boxes
+        if apply_ocr:
+            requires_backends(self, "pytesseract")
+            words_batch = []
+            boxes_batch = []
+            for image in images:
+                if image.is_cuda:
+                    logger.warning_once(
+                        "apply_ocr can only be performed on cpu. Tensors will be transferred to cpu before processing."
+                    )
+                words, boxes = apply_tesseract(
+                    image.cpu(), ocr_lang, tesseract_config, input_data_format=ChannelDimension.FIRST
+                )
+                words_batch.append(words)
+                boxes_batch.append(boxes)
+
+        # Group images by size for batched resizing
+        grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping)
+        resized_images_grouped = {}
+        for shape, stacked_images in grouped_images.items():
+            if do_resize:
+                stacked_images = self.resize(image=stacked_images, size=size, interpolation=interpolation)
+            resized_images_grouped[shape] = stacked_images
+        resized_images = reorder_images(resized_images_grouped, grouped_images_index)
+
+        # Group images by size for further processing
+        # Needed in case do_resize is False, or resize returns images with different sizes
+        grouped_images, grouped_images_index = group_images_by_shape(resized_images, disable_grouping=disable_grouping)
+        processed_images_grouped = {}
+        for shape, stacked_images in grouped_images.items():
+            # flip color channels from RGB to BGR (as Detectron2 requires this)
+            stacked_images = stacked_images.flip(1)
+            processed_images_grouped[shape] = stacked_images
+
+        processed_images = reorder_images(processed_images_grouped, grouped_images_index)
+        processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images
+
+        data = BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors)
+
+        if apply_ocr:
+            data["words"] = words_batch
+            data["boxes"] = boxes_batch
+
+        return data
+
+
+__all__ = ["LayoutLMv2ImageProcessorFast"]
diff --git a/phivenv/Lib/site-packages/transformers/models/layoutlmv2/modeling_layoutlmv2.py b/phivenv/Lib/site-packages/transformers/models/layoutlmv2/modeling_layoutlmv2.py
new file mode 100644
index 0000000000000000000000000000000000000000..11d8127ef6c2e15f54492659cb8b70f4360576b7
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/layoutlmv2/modeling_layoutlmv2.py
@@ -0,0 +1,1395 @@
+# coding=utf-8
+# Copyright 2021 Microsoft Research The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch LayoutLMv2 model."""
+
+import math
+from typing import Optional, Union
+
+import torch
+import torch.utils.checkpoint
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+
+from ...activations import ACT2FN
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import (
+    BaseModelOutput,
+    BaseModelOutputWithPooling,
+    QuestionAnsweringModelOutput,
+    SequenceClassifierOutput,
+    TokenClassifierOutput,
+)
+from ...modeling_utils import PreTrainedModel
+from ...pytorch_utils import apply_chunking_to_forward
+from ...utils import auto_docstring, is_detectron2_available, logging, requires_backends
+from .configuration_layoutlmv2 import LayoutLMv2Config
+
+
+# soft dependency
+if is_detectron2_available():
+    import detectron2
+    from detectron2.modeling import META_ARCH_REGISTRY
+
+    # This is needed as otherwise their overload will break sequential loading by overwriting buffer over and over. See
+    # https://github.com/facebookresearch/detectron2/blob/9604f5995cc628619f0e4fd913453b4d7d61db3f/detectron2/layers/batch_norm.py#L83-L86
+    detectron2.layers.batch_norm.FrozenBatchNorm2d._load_from_state_dict = torch.nn.Module._load_from_state_dict
+
+logger = logging.get_logger(__name__)
+
+
+class LayoutLMv2Embeddings(nn.Module):
+    """Construct the embeddings from word, position and token_type embeddings."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
+        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
+
+        self.x_position_embeddings = nn.Embedding(config.max_2d_position_embeddings, config.coordinate_size)
+        self.y_position_embeddings = nn.Embedding(config.max_2d_position_embeddings, config.coordinate_size)
+        self.h_position_embeddings = nn.Embedding(config.max_2d_position_embeddings, config.shape_size)
+        self.w_position_embeddings = nn.Embedding(config.max_2d_position_embeddings, config.shape_size)
+        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
+
+        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+        self.register_buffer(
+            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
+        )
+
+    def _calc_spatial_position_embeddings(self, bbox):
+        try:
+            left_position_embeddings = self.x_position_embeddings(bbox[:, :, 0])
+            upper_position_embeddings = self.y_position_embeddings(bbox[:, :, 1])
+            right_position_embeddings = self.x_position_embeddings(bbox[:, :, 2])
+            lower_position_embeddings = self.y_position_embeddings(bbox[:, :, 3])
+        except IndexError as e:
+            raise IndexError("The `bbox` coordinate values should be within 0-1000 range.") from e
+
+        h_position_embeddings = self.h_position_embeddings(bbox[:, :, 3] - bbox[:, :, 1])
+        w_position_embeddings = self.w_position_embeddings(bbox[:, :, 2] - bbox[:, :, 0])
+
+        spatial_position_embeddings = torch.cat(
+            [
+                left_position_embeddings,
+                upper_position_embeddings,
+                right_position_embeddings,
+                lower_position_embeddings,
+                h_position_embeddings,
+                w_position_embeddings,
+            ],
+            dim=-1,
+        )
+        return spatial_position_embeddings
+
+
+class LayoutLMv2SelfAttention(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
+            raise ValueError(
+                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
+                f"heads ({config.num_attention_heads})"
+            )
+        self.fast_qkv = config.fast_qkv
+        self.num_attention_heads = config.num_attention_heads
+        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
+        self.all_head_size = self.num_attention_heads * self.attention_head_size
+
+        self.has_relative_attention_bias = config.has_relative_attention_bias
+        self.has_spatial_attention_bias = config.has_spatial_attention_bias
+
+        if config.fast_qkv:
+            self.qkv_linear = nn.Linear(config.hidden_size, 3 * self.all_head_size, bias=False)
+            self.q_bias = nn.Parameter(torch.zeros(1, 1, self.all_head_size))
+            self.v_bias = nn.Parameter(torch.zeros(1, 1, self.all_head_size))
+        else:
+            self.query = nn.Linear(config.hidden_size, self.all_head_size)
+            self.key = nn.Linear(config.hidden_size, self.all_head_size)
+            self.value = nn.Linear(config.hidden_size, self.all_head_size)
+
+        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
+
+    def compute_qkv(self, hidden_states):
+        if self.fast_qkv:
+            qkv = self.qkv_linear(hidden_states)
+            q, k, v = torch.chunk(qkv, 3, dim=-1)
+            if q.ndimension() == self.q_bias.ndimension():
+                q = q + self.q_bias
+                v = v + self.v_bias
+            else:
+                _sz = (1,) * (q.ndimension() - 1) + (-1,)
+                q = q + self.q_bias.view(*_sz)
+                v = v + self.v_bias.view(*_sz)
+        else:
+            q = self.query(hidden_states)
+            k = self.key(hidden_states)
+            v = self.value(hidden_states)
+        return q, k, v
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask=None,
+        head_mask=None,
+        output_attentions=False,
+        rel_pos=None,
+        rel_2d_pos=None,
+    ):
+        batch_size, seq_length, _ = hidden_states.shape
+        query, key, value = self.compute_qkv(hidden_states)
+
+        # (B, L, H*D) -> (B, H, L, D)
+        query_layer = query.view(batch_size, -1, self.num_attention_heads, self.attention_head_size).transpose(1, 2)
+        key_layer = key.view(batch_size, -1, self.num_attention_heads, self.attention_head_size).transpose(1, 2)
+        value_layer = value.view(batch_size, -1, self.num_attention_heads, self.attention_head_size).transpose(1, 2)
+
+        query_layer = query_layer / math.sqrt(self.attention_head_size)
+        # [BSZ, NAT, L, L]
+        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
+        if self.has_relative_attention_bias:
+            attention_scores += rel_pos
+        if self.has_spatial_attention_bias:
+            attention_scores += rel_2d_pos
+        attention_scores = attention_scores.float().masked_fill_(
+            attention_mask.to(torch.bool), torch.finfo(attention_scores.dtype).min
+        )
+        attention_probs = nn.functional.softmax(attention_scores, dim=-1, dtype=torch.float32).type_as(value_layer)
+        # This is actually dropping out entire tokens to attend to, which might
+        # seem a bit unusual, but is taken from the original Transformer paper.
+        attention_probs = self.dropout(attention_probs)
+
+        # Mask heads if we want to
+        if head_mask is not None:
+            attention_probs = attention_probs * head_mask
+
+        context_layer = torch.matmul(attention_probs, value_layer)
+        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
+        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
+        context_layer = context_layer.view(*new_context_layer_shape)
+
+        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
+        return outputs
+
+
+class LayoutLMv2Attention(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.self = LayoutLMv2SelfAttention(config)
+        self.output = LayoutLMv2SelfOutput(config)
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask=None,
+        head_mask=None,
+        output_attentions=False,
+        rel_pos=None,
+        rel_2d_pos=None,
+    ):
+        self_outputs = self.self(
+            hidden_states,
+            attention_mask,
+            head_mask,
+            output_attentions,
+            rel_pos=rel_pos,
+            rel_2d_pos=rel_2d_pos,
+        )
+        attention_output = self.output(self_outputs[0], hidden_states)
+        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
+        return outputs
+
+
+class LayoutLMv2SelfOutput(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self, hidden_states, input_tensor):
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = self.LayerNorm(hidden_states + input_tensor)
+        return hidden_states
+
+
+# Copied from transformers.models.bert.modeling_bert.BertIntermediate with Bert->LayoutLMv2
+class LayoutLMv2Intermediate(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
+        if isinstance(config.hidden_act, str):
+            self.intermediate_act_fn = ACT2FN[config.hidden_act]
+        else:
+            self.intermediate_act_fn = config.hidden_act
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.intermediate_act_fn(hidden_states)
+        return hidden_states
+
+
+# Copied from transformers.models.bert.modeling_bert.BertOutput with Bert->LayoutLM
+class LayoutLMv2Output(nn.Module):
+    """Feed-forward contraction: projects `intermediate_size` back to `hidden_size`, then residual add + LayerNorm."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
+        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
+        """Project `hidden_states`, apply dropout, add the residual `input_tensor` and layer-normalize the sum."""
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        # residual connection is added before normalization
+        hidden_states = self.LayerNorm(hidden_states + input_tensor)
+        return hidden_states
+
+
+class LayoutLMv2Layer(GradientCheckpointingLayer):
+    """One encoder layer: a `LayoutLMv2Attention` block followed by a feed-forward block that may run in chunks."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.chunk_size_feed_forward = config.chunk_size_feed_forward
+        # dimension along which `apply_chunking_to_forward` splits the attention output
+        self.seq_len_dim = 1
+        self.attention = LayoutLMv2Attention(config)
+        self.intermediate = LayoutLMv2Intermediate(config)
+        self.output = LayoutLMv2Output(config)
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask=None,
+        head_mask=None,
+        output_attentions=False,
+        rel_pos=None,
+        rel_2d_pos=None,
+    ):
+        """
+        Run attention and the feed-forward block.
+
+        Returns a tuple whose first element is the layer output; whatever the attention module returned after its
+        first element (the attention weights, when they are output) is appended unchanged.
+        """
+        attn_results = self.attention(
+            hidden_states,
+            attention_mask,
+            head_mask,
+            output_attentions=output_attentions,
+            rel_pos=rel_pos,
+            rel_2d_pos=rel_2d_pos,
+        )
+
+        ffn_output = apply_chunking_to_forward(
+            self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attn_results[0]
+        )
+
+        # layer output first, then any extra attention outputs
+        return (ffn_output,) + attn_results[1:]
+
+    def feed_forward_chunk(self, attention_output):
+        """Feed-forward block for one chunk; `attention_output` is also used as the residual input."""
+        return self.output(self.intermediate(attention_output), attention_output)
+
+
+def relative_position_bucket(relative_position, bidirectional=True, num_buckets=32, max_distance=128):
+    """
+    Adapted from Mesh Tensorflow:
+    https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593
+    Translate relative position to a bucket number for relative attention. The relative position is defined as
+    memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to
+    position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for small
+    absolute relative_position and larger buckets for larger absolute relative_positions. All relative positions
+    >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket. This should
+    allow for more graceful generalization to longer sequences than the model has been trained on.
+
+    Args:
+        relative_position: an integer Tensor of relative positions
+        bidirectional: a boolean - whether the attention is bidirectional. When True, half of the buckets are used
+            for positive relative positions and the other half for zero and negative ones.
+        num_buckets: an integer
+        max_distance: an integer
+
+    Returns:
+        a Tensor with the same shape as relative_position, containing `torch.long` bucket indices in the range
+        [0, num_buckets)
+    """
+
+    ret = 0
+    if bidirectional:
+        num_buckets //= 2
+        # positive offsets use the upper half of the bucket range
+        ret += (relative_position > 0).long() * num_buckets
+        n = torch.abs(relative_position)
+    else:
+        n = torch.max(-relative_position, torch.zeros_like(relative_position))
+    # now n is in the range [0, inf)
+
+    # half of the buckets are for exact increments in positions
+    max_exact = num_buckets // 2
+    is_small = n < max_exact
+
+    # The other half of the buckets are for logarithmically bigger bins in positions up to max_distance.
+    # Entries with n < max_exact are discarded by `torch.where` below, so they are clamped up to max_exact first:
+    # this keeps log(0) = -inf (and its ill-defined cast to an integer) out of the computation without changing
+    # any value that is actually returned.
+    n_for_log = torch.clamp(n, min=max_exact)
+    val_if_large = max_exact + (
+        torch.log(n_for_log.float() / max_exact) / math.log(max_distance / max_exact) * (num_buckets - max_exact)
+    ).to(torch.long)
+    val_if_large = torch.min(val_if_large, torch.full_like(val_if_large, num_buckets - 1))
+
+    ret += torch.where(is_small, n, val_if_large)
+    return ret
+
+
+class LayoutLMv2Encoder(nn.Module):
+    """
+    Stack of `config.num_hidden_layers` `LayoutLMv2Layer` modules.
+
+    Depending on the configuration, the encoder also owns the tables for the 1D relative-position attention bias
+    (`rel_pos_bias`) and the 2D spatial attention bias (`rel_pos_x_bias`, `rel_pos_y_bias`). The biases are computed
+    once per forward pass and handed to every layer.
+    """
+
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.layer = nn.ModuleList([LayoutLMv2Layer(config) for _ in range(config.num_hidden_layers)])
+
+        self.has_relative_attention_bias = config.has_relative_attention_bias
+        self.has_spatial_attention_bias = config.has_spatial_attention_bias
+
+        def bias_table(num_bins):
+            # bias-free linear layer used purely as a lookup table: bucket index -> one value per attention head
+            return nn.Linear(num_bins, config.num_attention_heads, bias=False)
+
+        if self.has_relative_attention_bias:
+            self.rel_pos_bins = config.rel_pos_bins
+            self.max_rel_pos = config.max_rel_pos
+            self.rel_pos_bias = bias_table(self.rel_pos_bins)
+
+        if self.has_spatial_attention_bias:
+            self.max_rel_2d_pos = config.max_rel_2d_pos
+            self.rel_2d_pos_bins = config.rel_2d_pos_bins
+            self.rel_pos_x_bias = bias_table(self.rel_2d_pos_bins)
+            self.rel_pos_y_bias = bias_table(self.rel_2d_pos_bins)
+
+        self.gradient_checkpointing = False
+
+    def _calculate_1d_position_embeddings(self, position_ids):
+        """Return the per-head attention bias looked up from the bucketed pairwise differences of `position_ids`."""
+        # entry [b, i, j] holds position_ids[b, j] - position_ids[b, i]
+        pairwise_offsets = position_ids.unsqueeze(-2) - position_ids.unsqueeze(-1)
+        buckets = relative_position_bucket(
+            pairwise_offsets,
+            num_buckets=self.rel_pos_bins,
+            max_distance=self.max_rel_pos,
+        )
+        # Since this is a simple indexing operation that is independent of the input,
+        # no need to track gradients for this operation
+        #
+        # Without this no_grad context, training speed slows down significantly
+        with torch.no_grad():
+            # move the head dimension in front of the two sequence dimensions
+            bias = self.rel_pos_bias.weight.t()[buckets].permute(0, 3, 1, 2)
+        return bias.contiguous()
+
+    def _calculate_2d_position_embeddings(self, bbox):
+        """Return the summed x/y attention bias looked up from bucketed pairwise differences of box coordinates."""
+        # only coordinate 0 (x) and coordinate 3 (y) of every box are used
+        x_coord = bbox[:, :, 0]
+        y_coord = bbox[:, :, 3]
+        x_offsets = x_coord.unsqueeze(-2) - x_coord.unsqueeze(-1)
+        y_offsets = y_coord.unsqueeze(-2) - y_coord.unsqueeze(-1)
+        x_buckets = relative_position_bucket(
+            x_offsets,
+            num_buckets=self.rel_2d_pos_bins,
+            max_distance=self.max_rel_2d_pos,
+        )
+        y_buckets = relative_position_bucket(
+            y_offsets,
+            num_buckets=self.rel_2d_pos_bins,
+            max_distance=self.max_rel_2d_pos,
+        )
+        # Since this is a simple indexing operation that is independent of the input,
+        # no need to track gradients for this operation
+        #
+        # Without this no_grad context, training speed slows down significantly
+        with torch.no_grad():
+            x_bias = self.rel_pos_x_bias.weight.t()[x_buckets].permute(0, 3, 1, 2)
+            y_bias = self.rel_pos_y_bias.weight.t()[y_buckets].permute(0, 3, 1, 2)
+        return x_bias.contiguous() + y_bias.contiguous()
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask=None,
+        head_mask=None,
+        output_attentions=False,
+        output_hidden_states=False,
+        return_dict=True,
+        bbox=None,
+        position_ids=None,
+    ):
+        """
+        Run all layers in order.
+
+        Returns a `BaseModelOutput` when `return_dict` is truthy, otherwise a tuple of the last hidden state, the
+        collected hidden states and the collected attentions with the entries that are `None` left out.
+        """
+        collected_states = () if output_hidden_states else None
+        collected_attentions = () if output_attentions else None
+
+        rel_pos = None
+        if self.has_relative_attention_bias:
+            rel_pos = self._calculate_1d_position_embeddings(position_ids)
+        rel_2d_pos = None
+        if self.has_spatial_attention_bias:
+            rel_2d_pos = self._calculate_2d_position_embeddings(bbox)
+
+        for layer_idx, layer_module in enumerate(self.layer):
+            if output_hidden_states:
+                # record the input of every layer; the final output is appended after the loop
+                collected_states = collected_states + (hidden_states,)
+
+            mask_for_layer = None if head_mask is None else head_mask[layer_idx]
+
+            layer_result = layer_module(
+                hidden_states,
+                attention_mask,
+                mask_for_layer,
+                output_attentions,
+                rel_pos=rel_pos,
+                rel_2d_pos=rel_2d_pos,
+            )
+
+            hidden_states = layer_result[0]
+            if output_attentions:
+                collected_attentions = collected_attentions + (layer_result[1],)
+
+        if output_hidden_states:
+            collected_states = collected_states + (hidden_states,)
+
+        if return_dict:
+            return BaseModelOutput(
+                last_hidden_state=hidden_states,
+                hidden_states=collected_states,
+                attentions=collected_attentions,
+            )
+
+        candidates = (hidden_states, collected_states, collected_attentions)
+        return tuple(item for item in candidates if item is not None)
+
+
+@auto_docstring
+class LayoutLMv2PreTrainedModel(PreTrainedModel):
+    config: LayoutLMv2Config
+    base_model_prefix = "layoutlmv2"
+
+    def _init_weights(self, module):
+        """Initialize the parameters of `module` according to its type; unknown module types are left untouched."""
+        if isinstance(module, nn.Linear):
+            # Slightly different from the TF version which uses truncated_normal for initialization
+            # cf https://github.com/pytorch/pytorch/pull/5617
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.bias is not None:
+                module.bias.data.zero_()
+            return
+
+        if isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            # the padding row, when there is one, is zeroed
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+            return
+
+        if isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+            return
+
+        if isinstance(module, LayoutLMv2SelfAttention):
+            # the q/v bias parameters are only reset when the fused qkv projection is configured
+            if self.config.fast_qkv:
+                module.q_bias.data.zero_()
+                module.v_bias.data.zero_()
+            return
+
+        if isinstance(module, LayoutLMv2Model) and hasattr(module, "visual_segment_embedding"):
+            module.visual_segment_embedding.data.normal_(mean=0.0, std=self.config.initializer_range)
+
+
+def my_convert_sync_batchnorm(module, process_group=None):
+    """
+    Recursively replace batch-norm layers in `module` by `torch.nn.SyncBatchNorm`.
+
+    Same as `nn.modules.SyncBatchNorm.convert_sync_batchnorm` but allowing converting from
+    `detectron2.layers.FrozenBatchNorm2d`.
+
+    Args:
+        module: the module (tree) to convert
+        process_group: process group passed on to every created `SyncBatchNorm`
+
+    Returns:
+        the converted module; the input module itself (with converted children) unless the input is a batch-norm
+        layer, in which case a new `SyncBatchNorm` is returned
+    """
+    # regular batch norm layers are handled by the stock PyTorch converter
+    if isinstance(module, torch.nn.modules.batchnorm._BatchNorm):
+        return nn.modules.SyncBatchNorm.convert_sync_batchnorm(module, process_group)
+
+    converted = module
+    if isinstance(module, detectron2.layers.FrozenBatchNorm2d):
+        converted = torch.nn.SyncBatchNorm(
+            num_features=module.num_features,
+            eps=module.eps,
+            affine=True,
+            track_running_stats=True,
+            process_group=process_group,
+        )
+        # carry the frozen affine parameters and statistics over to the new layer
+        converted.weight = torch.nn.Parameter(module.weight)
+        converted.bias = torch.nn.Parameter(module.bias)
+        converted.running_mean = module.running_mean
+        converted.running_var = module.running_var
+        converted.num_batches_tracked = torch.tensor(0, dtype=torch.long, device=module.running_mean.device)
+
+    for child_name, child_module in module.named_children():
+        converted.add_module(child_name, my_convert_sync_batchnorm(child_module, process_group))
+    return converted
+
+
+class LayoutLMv2VisualBackbone(nn.Module):
+    """
+    Visual feature extractor built on the FPN backbone of a detectron2 model.
+
+    Images are normalized with the pixel mean/std of the detectron2 config and passed through the backbone; the
+    `"p2"` feature map is then average-pooled and flattened into a sequence of visual tokens.
+    """
+
+    def __init__(self, config):
+        super().__init__()
+        self.cfg = config.get_detectron2_config()
+        meta_arch = self.cfg.MODEL.META_ARCHITECTURE
+        # build the full detectron2 model but keep only its backbone
+        model = META_ARCH_REGISTRY.get(meta_arch)(self.cfg)
+        assert isinstance(model.backbone, detectron2.modeling.backbone.FPN)
+        self.backbone = model.backbone
+
+        assert len(self.cfg.MODEL.PIXEL_MEAN) == len(self.cfg.MODEL.PIXEL_STD)
+        num_channels = len(self.cfg.MODEL.PIXEL_MEAN)
+        # normalization constants are derived from the config, so they are not stored in the state dict
+        self.register_buffer(
+            "pixel_mean",
+            torch.Tensor(self.cfg.MODEL.PIXEL_MEAN).view(num_channels, 1, 1),
+            persistent=False,
+        )
+        self.register_buffer(
+            "pixel_std", torch.Tensor(self.cfg.MODEL.PIXEL_STD).view(num_channels, 1, 1), persistent=False
+        )
+        self.out_feature_key = "p2"
+        if torch.are_deterministic_algorithms_enabled():
+            logger.warning("using `AvgPool2d` instead of `AdaptiveAvgPool2d`")
+            # the fixed kernel size below is derived for a 224x224 input
+            input_shape = (224, 224)
+            backbone_stride = self.backbone.output_shape()[self.out_feature_key].stride
+            self.pool = nn.AvgPool2d(
+                (
+                    math.ceil(math.ceil(input_shape[0] / backbone_stride) / config.image_feature_pool_shape[0]),
+                    math.ceil(math.ceil(input_shape[1] / backbone_stride) / config.image_feature_pool_shape[1]),
+                )
+            )
+        else:
+            self.pool = nn.AdaptiveAvgPool2d(config.image_feature_pool_shape[:2])
+        # NOTE: this mutates the config in place: a two-element pool shape gets the channel count appended
+        if len(config.image_feature_pool_shape) == 2:
+            config.image_feature_pool_shape.append(self.backbone.output_shape()[self.out_feature_key].channels)
+        assert self.backbone.output_shape()[self.out_feature_key].channels == config.image_feature_pool_shape[2]
+
+    def forward(self, images):
+        """
+        Compute pooled visual features.
+
+        Args:
+            images: a tensor, or an object exposing the image batch as `.tensor`
+
+        Returns:
+            the pooled `"p2"` feature map with its spatial dimensions flattened and moved in front of the channel
+            dimension (assumes a 4D feature map from the backbone)
+        """
+        images_input = ((images if torch.is_tensor(images) else images.tensor) - self.pixel_mean) / self.pixel_std
+        features = self.backbone(images_input)
+        features = features[self.out_feature_key]
+        features = self.pool(features).flatten(start_dim=2).transpose(1, 2).contiguous()
+        return features
+
+    def synchronize_batch_norm(self):
+        """
+        Convert the batch-norm layers of the backbone to `SyncBatchNorm`, synchronized within groups of
+        `torch.cuda.device_count()` consecutive ranks.
+
+        Raises:
+            RuntimeError: if `torch.distributed` is not initialized, if no CUDA device is visible, or if the world
+                size is not a multiple of the local CUDA device count.
+        """
+        if not (
+            torch.distributed.is_available()
+            and torch.distributed.is_initialized()
+            and torch.distributed.get_rank() > -1
+        ):
+            raise RuntimeError("Make sure torch.distributed is set up properly.")
+
+        self_rank = torch.distributed.get_rank()
+        node_size = torch.cuda.device_count()
+        # Without this guard the modulo below raises a bare ZeroDivisionError on machines without CUDA devices.
+        if node_size == 0:
+            raise RuntimeError("Make sure at least one CUDA device is available on every node.")
+        world_size = torch.distributed.get_world_size()
+        if not (world_size % node_size == 0):
+            raise RuntimeError("Make sure the number of processes can be divided by the number of nodes")
+
+        # every process creates every group, not only the one it belongs to
+        node_global_ranks = [list(range(i * node_size, (i + 1) * node_size)) for i in range(world_size // node_size)]
+        sync_bn_groups = [
+            torch.distributed.new_group(ranks=node_global_ranks[i]) for i in range(world_size // node_size)
+        ]
+        node_rank = self_rank // node_size
+
+        self.backbone = my_convert_sync_batchnorm(self.backbone, process_group=sync_bn_groups[node_rank])
+
+
+class LayoutLMv2Pooler(nn.Module):
+    """Produce one vector per sequence: the first token's hidden state passed through a dense layer and tanh."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        self.activation = nn.Tanh()
+
+    def forward(self, hidden_states):
+        # "Pooling" here simply means selecting the hidden state of the first token of every sequence.
+        first_token_state = hidden_states[:, 0]
+        return self.activation(self.dense(first_token_state))
+
+
+@auto_docstring
+class LayoutLMv2Model(LayoutLMv2PreTrainedModel):
+    def __init__(self, config):
+        # The `detectron2` backend requirement is declared before any submodule is built.
+        requires_backends(self, "detectron2")
+        super().__init__(config)
+        self.config = config
+        self.has_visual_segment_embedding = config.has_visual_segment_embedding
+        self.embeddings = LayoutLMv2Embeddings(config)
+
+        self.visual = LayoutLMv2VisualBackbone(config)
+        # `LayoutLMv2VisualBackbone` appends the backbone channel count to a two-element
+        # `config.image_feature_pool_shape`, so its last entry is the channel dimension of the visual features.
+        self.visual_proj = nn.Linear(config.image_feature_pool_shape[-1], config.hidden_size)
+        if self.has_visual_segment_embedding:
+            # a single learned vector of size `hidden_size`, taken from a one-row embedding table
+            self.visual_segment_embedding = nn.Parameter(nn.Embedding(1, config.hidden_size).weight[0])
+        self.visual_LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.visual_dropout = nn.Dropout(config.hidden_dropout_prob)
+
+        self.encoder = LayoutLMv2Encoder(config)
+        self.pooler = LayoutLMv2Pooler(config)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        """Return the word-embedding module held by `self.embeddings`."""
+        text_embeddings = self.embeddings
+        return text_embeddings.word_embeddings
+
+    def set_input_embeddings(self, value):
+        """Replace the word-embedding module held by `self.embeddings` with `value`."""
+        setattr(self.embeddings, "word_embeddings", value)
+
+    def _calc_text_embeddings(self, input_ids, bbox, position_ids, token_type_ids, inputs_embeds=None):
+        """
+        Build the text-side embeddings: word + 1D position + spatial (bbox) + token-type embeddings, followed by
+        LayerNorm and dropout.
+
+        Args:
+            input_ids: token ids, or `None` when `inputs_embeds` is given
+            bbox: bounding boxes passed to `self.embeddings._calc_spatial_position_embeddings`
+            position_ids: position indices; defaults to `0..seq_length-1` for every sequence
+            token_type_ids: token type indices; defaults to all zeros
+            inputs_embeds: precomputed word embeddings, used instead of looking up `input_ids`
+
+        Returns:
+            the normalized sum of the four embeddings
+        """
+        if input_ids is not None:
+            input_shape = input_ids.size()
+            device = input_ids.device
+        else:
+            input_shape = inputs_embeds.size()[:-1]
+            device = inputs_embeds.device
+
+        seq_length = input_shape[1]
+
+        # The defaults are derived from `input_shape`/`device` rather than from `input_ids`, which is `None` when
+        # the caller supplies `inputs_embeds` only.
+        if position_ids is None:
+            position_ids = torch.arange(seq_length, dtype=torch.long, device=device)
+            position_ids = position_ids.unsqueeze(0).expand(input_shape)
+        if token_type_ids is None:
+            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
+
+        if inputs_embeds is None:
+            inputs_embeds = self.embeddings.word_embeddings(input_ids)
+        position_embeddings = self.embeddings.position_embeddings(position_ids)
+        spatial_position_embeddings = self.embeddings._calc_spatial_position_embeddings(bbox)
+        token_type_embeddings = self.embeddings.token_type_embeddings(token_type_ids)
+
+        embeddings = inputs_embeds + position_embeddings + spatial_position_embeddings + token_type_embeddings
+        embeddings = self.embeddings.LayerNorm(embeddings)
+        embeddings = self.embeddings.dropout(embeddings)
+        return embeddings
+
+    def _calc_img_embeddings(self, image, bbox, position_ids):
+        """
+        Build the visual-side embeddings: projected backbone features + 1D position + spatial (bbox) embeddings,
+        plus the visual segment embedding when enabled, followed by the visual LayerNorm and dropout.
+        """
+        embeddings = (
+            self.visual_proj(self.visual(image))
+            + self.embeddings.position_embeddings(position_ids)
+            + self.embeddings._calc_spatial_position_embeddings(bbox)
+        )
+        if self.has_visual_segment_embedding:
+            # in-place add on the freshly created sum
+            embeddings += self.visual_segment_embedding
+        return self.visual_dropout(self.visual_LayerNorm(embeddings))
+
+    def _calc_visual_bbox(self, image_feature_pool_shape, bbox, device, final_shape):
+        """
+        Build the bounding boxes of the visual tokens: a regular grid of `image_feature_pool_shape[0]` rows by
+        `image_feature_pool_shape[1]` columns covering the `[0, 1000]` coordinate range.
+
+        Args:
+            image_feature_pool_shape: pool shape; only its first two entries (rows, columns) are read
+            bbox: text bounding boxes, used only for their dtype and last-dimension size. May be `None`, in which
+                case `torch.long` boxes with 4 coordinates are produced.
+            device: device on which the boxes are created
+            final_shape: shape whose first entry is the batch size
+
+        Returns:
+            a tensor of shape `(final_shape[0], rows * columns, 4)` holding `(x0, y0, x1, y1)` per grid cell
+        """
+        # Use the shape passed by the caller throughout instead of mixing it with `self.config`.
+        pool_rows, pool_cols = image_feature_pool_shape[0], image_feature_pool_shape[1]
+        # `bbox` is optional for callers, so fall back to the defaults used for text boxes.
+        dtype = bbox.dtype if bbox is not None else torch.long
+        coord_dim = bbox.size(-1) if bbox is not None else 4
+
+        # cell boundaries along each axis: pool_cols + 1 (resp. pool_rows + 1) values from 0 to 1000
+        visual_bbox_x = torch.div(
+            torch.arange(
+                0,
+                1000 * (pool_cols + 1),
+                1000,
+                device=device,
+                dtype=dtype,
+            ),
+            pool_cols,
+            rounding_mode="floor",
+        )
+        visual_bbox_y = torch.div(
+            torch.arange(
+                0,
+                1000 * (pool_rows + 1),
+                1000,
+                device=device,
+                dtype=dtype,
+            ),
+            pool_rows,
+            rounding_mode="floor",
+        )
+        visual_bbox = torch.stack(
+            [
+                visual_bbox_x[:-1].repeat(pool_rows, 1),
+                visual_bbox_y[:-1].repeat(pool_cols, 1).transpose(0, 1),
+                visual_bbox_x[1:].repeat(pool_rows, 1),
+                visual_bbox_y[1:].repeat(pool_cols, 1).transpose(0, 1),
+            ],
+            dim=-1,
+        ).view(-1, coord_dim)
+
+        # same grid for every sample in the batch
+        visual_bbox = visual_bbox.repeat(final_shape[0], 1, 1)
+
+        return visual_bbox
+
+    def _get_input_shape(self, input_ids=None, inputs_embeds=None):
+        """
+        Return the `(batch, sequence)` shape of whichever of `input_ids` / `inputs_embeds` is given.
+
+        Raises:
+            ValueError: if both or neither of the two inputs are provided.
+        """
+        has_ids = input_ids is not None
+        has_embeds = inputs_embeds is not None
+
+        if has_ids and has_embeds:
+            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+        if not (has_ids or has_embeds):
+            raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+        if has_ids:
+            return input_ids.size()
+        # drop the trailing embedding dimension
+        return inputs_embeds.size()[:-1]
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        bbox: Optional[torch.LongTensor] = None,
+        image: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, BaseModelOutputWithPooling]:
+        r"""
+        bbox (`torch.LongTensor` of shape `(batch_size, sequence_length, 4)`, *optional*):
+            Bounding boxes of each input sequence tokens. Selected in the range `[0,
+            config.max_2d_position_embeddings-1]`. Each bounding box should be a normalized version in (x0, y0, x1, y1)
+            format, where (x0, y0) corresponds to the position of the upper left corner in the bounding box, and (x1,
+            y1) represents the position of the lower right corner. When omitted, all-zero boxes are used.
+        image (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` or `detectron2.structures.ImageList` whose `tensor` is of shape `(batch_size, num_channels, height, width)`):
+            Batch of document images.
+
+        Examples:
+
+        ```python
+        >>> from transformers import AutoProcessor, LayoutLMv2Model, set_seed
+        >>> from PIL import Image
+        >>> import torch
+        >>> from datasets import load_dataset
+
+        >>> set_seed(0)
+
+        >>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv2-base-uncased")
+        >>> model = LayoutLMv2Model.from_pretrained("microsoft/layoutlmv2-base-uncased")
+
+
+        >>> dataset = load_dataset("hf-internal-testing/fixtures_docvqa")
+        >>> image = dataset["test"][0]["image"]
+
+        >>> encoding = processor(image, return_tensors="pt")
+
+        >>> outputs = model(**encoding)
+        >>> last_hidden_states = outputs.last_hidden_state
+
+        >>> last_hidden_states.shape
+        torch.Size([1, 342, 768])
+        ```
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        input_shape = self._get_input_shape(input_ids, inputs_embeds)
+        device = input_ids.device if input_ids is not None else inputs_embeds.device
+
+        # Default the text boxes before their first use: both `_calc_visual_bbox` and the concatenation with the
+        # visual boxes below need a real tensor.
+        if bbox is None:
+            bbox = torch.zeros(tuple(list(input_shape) + [4]), dtype=torch.long, device=device)
+
+        # the visual part of the sequence has one token per cell of the pooled feature map
+        visual_shape = list(input_shape)
+        visual_shape[1] = self.config.image_feature_pool_shape[0] * self.config.image_feature_pool_shape[1]
+        visual_shape = torch.Size(visual_shape)
+        # needs a new copy of input_shape for tracing. Otherwise wrong dimensions will occur
+        final_shape = list(self._get_input_shape(input_ids, inputs_embeds))
+        final_shape[1] += visual_shape[1]
+        final_shape = torch.Size(final_shape)
+
+        visual_bbox = self._calc_visual_bbox(self.config.image_feature_pool_shape, bbox, device, final_shape)
+        final_bbox = torch.cat([bbox, visual_bbox], dim=1)
+
+        if attention_mask is None:
+            attention_mask = torch.ones(input_shape, device=device)
+
+        # visual tokens are always attended to
+        visual_attention_mask = torch.ones(visual_shape, device=device)
+        final_attention_mask = torch.cat([attention_mask, visual_attention_mask], dim=1)
+
+        if token_type_ids is None:
+            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
+
+        if position_ids is None:
+            seq_length = input_shape[1]
+            position_ids = self.embeddings.position_ids[:, :seq_length]
+            position_ids = position_ids.expand(input_shape)
+
+        # visual tokens get their own position ids, starting again from 0
+        visual_position_ids = torch.arange(0, visual_shape[1], dtype=torch.long, device=device).repeat(
+            input_shape[0], 1
+        )
+        final_position_ids = torch.cat([position_ids, visual_position_ids], dim=1)
+
+        text_layout_emb = self._calc_text_embeddings(
+            input_ids=input_ids,
+            bbox=bbox,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            inputs_embeds=inputs_embeds,
+        )
+
+        visual_emb = self._calc_img_embeddings(
+            image=image,
+            bbox=visual_bbox,
+            position_ids=visual_position_ids,
+        )
+        # text tokens first, visual tokens second
+        final_emb = torch.cat([text_layout_emb, visual_emb], dim=1)
+
+        # turn the 0/1 mask into an additive mask: 0 where attended, the most negative value where masked
+        extended_attention_mask = final_attention_mask.unsqueeze(1).unsqueeze(2)
+
+        extended_attention_mask = extended_attention_mask.to(dtype=self.dtype)
+        extended_attention_mask = (1.0 - extended_attention_mask) * torch.finfo(self.dtype).min
+
+        if head_mask is not None:
+            # bring a 1D or 2D head mask to five dimensions, with the layer dimension first
+            if head_mask.dim() == 1:
+                head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
+                head_mask = head_mask.expand(self.config.num_hidden_layers, -1, -1, -1, -1)
+            elif head_mask.dim() == 2:
+                head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)
+            head_mask = head_mask.to(dtype=next(self.parameters()).dtype)
+        else:
+            head_mask = [None] * self.config.num_hidden_layers
+
+        encoder_outputs = self.encoder(
+            final_emb,
+            extended_attention_mask,
+            bbox=final_bbox,
+            position_ids=final_position_ids,
+            head_mask=head_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        sequence_output = encoder_outputs[0]
+        pooled_output = self.pooler(sequence_output)
+
+        if not return_dict:
+            return (sequence_output, pooled_output) + encoder_outputs[1:]
+
+        return BaseModelOutputWithPooling(
+            last_hidden_state=sequence_output,
+            pooler_output=pooled_output,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    LayoutLMv2 Model with a sequence classification head on top (a linear layer on top of the concatenation of the
+    final hidden state of the [CLS] token, average-pooled initial visual embeddings and average-pooled final visual
+    embeddings, e.g. for document image classification tasks such as the
+    [RVL-CDIP](https://www.cs.cmu.edu/~aharley/rvl-cdip/) dataset.
+    """
+)
+class LayoutLMv2ForSequenceClassification(LayoutLMv2PreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+        self.layoutlmv2 = LayoutLMv2Model(config)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        # The classifier consumes three `hidden_size` vectors concatenated, hence `hidden_size * 3` input features.
+        self.classifier = nn.Linear(config.hidden_size * 3, config.num_labels)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        """Return the word-embedding module of the wrapped `layoutlmv2` model."""
+        base_model = self.layoutlmv2
+        return base_model.embeddings.word_embeddings
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        bbox: Optional[torch.LongTensor] = None,
+        image: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, SequenceClassifierOutput]:
+        r"""
+        input_ids (`torch.LongTensor` of shape `batch_size, sequence_length`):
+            Indices of input sequence tokens in the vocabulary.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        bbox (`torch.LongTensor` of shape `(batch_size, sequence_length, 4)`, *optional*):
+            Bounding boxes of each input sequence tokens. Selected in the range `[0,
+            config.max_2d_position_embeddings-1]`. Each bounding box should be a normalized version in (x0, y0, x1, y1)
+            format, where (x0, y0) corresponds to the position of the upper left corner in the bounding box, and (x1,
+            y1) represents the position of the lower right corner.
+        image (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` or `detectron.structures.ImageList` whose `tensors` is of shape `(batch_size, num_channels, height, width)`):
+            Batch of document images.
+        token_type_ids (`torch.LongTensor` of shape `batch_size, sequence_length`, *optional*):
+            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
+            1]`:
+
+            - 0 corresponds to a *sentence A* token,
+            - 1 corresponds to a *sentence B* token.
+
+            [What are token type IDs?](../glossary#token-type-ids)
+        position_ids (`torch.LongTensor` of shape `batch_size, sequence_length`, *optional*):
+            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+            config.max_position_embeddings - 1]`.
+
+            [What are position IDs?](../glossary#position-ids)
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+
+        Example:
+
+        ```python
+        >>> from transformers import AutoProcessor, LayoutLMv2ForSequenceClassification, set_seed
+        >>> from PIL import Image
+        >>> import torch
+        >>> from datasets import load_dataset
+
+        >>> set_seed(0)
+
+        >>> dataset = load_dataset("aharley/rvl_cdip", split="train", streaming=True)
+        >>> data = next(iter(dataset))
+        >>> image = data["image"].convert("RGB")
+
+        >>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv2-base-uncased")
+        >>> model = LayoutLMv2ForSequenceClassification.from_pretrained(
+        ...     "microsoft/layoutlmv2-base-uncased", num_labels=dataset.info.features["label"].num_classes
+        ... )
+
+        >>> encoding = processor(image, return_tensors="pt")
+        >>> sequence_label = torch.tensor([data["label"]])
+
+        >>> outputs = model(**encoding, labels=sequence_label)
+
+        >>> loss, logits = outputs.loss, outputs.logits
+        >>> predicted_idx = logits.argmax(dim=-1).item()
+        >>> predicted_answer = dataset.info.features["label"].names[4]
+        >>> predicted_idx, predicted_answer  # results are not good without further fine-tuning
+        (7, 'advertisement')
+        ```
+        """
+
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+        elif input_ids is not None:
+            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
+            input_shape = input_ids.size()
+        elif inputs_embeds is not None:
+            input_shape = inputs_embeds.size()[:-1]
+        else:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+        device = input_ids.device if input_ids is not None else inputs_embeds.device
+
+        visual_shape = list(input_shape)
+        visual_shape[1] = self.config.image_feature_pool_shape[0] * self.config.image_feature_pool_shape[1]
+        visual_shape = torch.Size(visual_shape)
+        final_shape = list(input_shape)
+        final_shape[1] += visual_shape[1]
+        final_shape = torch.Size(final_shape)
+
+        visual_bbox = self.layoutlmv2._calc_visual_bbox(
+            self.config.image_feature_pool_shape, bbox, device, final_shape
+        )
+
+        visual_position_ids = torch.arange(0, visual_shape[1], dtype=torch.long, device=device).repeat(
+            input_shape[0], 1
+        )
+
+        initial_image_embeddings = self.layoutlmv2._calc_img_embeddings(
+            image=image,
+            bbox=visual_bbox,
+            position_ids=visual_position_ids,
+        )
+
+        outputs = self.layoutlmv2(
+            input_ids=input_ids,
+            bbox=bbox,
+            image=image,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        if input_ids is not None:
+            input_shape = input_ids.size()
+        else:
+            input_shape = inputs_embeds.size()[:-1]
+
+        seq_length = input_shape[1]
+        sequence_output, final_image_embeddings = outputs[0][:, :seq_length], outputs[0][:, seq_length:]
+
+        cls_final_output = sequence_output[:, 0, :]
+
+        # average-pool the visual embeddings
+        pooled_initial_image_embeddings = initial_image_embeddings.mean(dim=1)
+        pooled_final_image_embeddings = final_image_embeddings.mean(dim=1)
+        # concatenate with cls_final_output
+        sequence_output = torch.cat(
+            [cls_final_output, pooled_initial_image_embeddings, pooled_final_image_embeddings], dim=1
+        )
+        sequence_output = self.dropout(sequence_output)
+        logits = self.classifier(sequence_output)
+
+        loss = None
+        if labels is not None:
+            if self.config.problem_type is None:
+                if self.num_labels == 1:
+                    self.config.problem_type = "regression"
+                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
+                    self.config.problem_type = "single_label_classification"
+                else:
+                    self.config.problem_type = "multi_label_classification"
+
+            if self.config.problem_type == "regression":
+                loss_fct = MSELoss()
+                if self.num_labels == 1:
+                    loss = loss_fct(logits.squeeze(), labels.squeeze())
+                else:
+                    loss = loss_fct(logits, labels)
+            elif self.config.problem_type == "single_label_classification":
+                loss_fct = CrossEntropyLoss()
+                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+            elif self.config.problem_type == "multi_label_classification":
+                loss_fct = BCEWithLogitsLoss()
+                loss = loss_fct(logits, labels)
+        if not return_dict:
+            output = (logits,) + outputs[2:]
+            return ((loss,) + output) if loss is not None else output
+
+        return SequenceClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    LayoutLMv2 Model with a token classification head on top (a linear layer on top of the text part of the hidden
+    states) e.g. for sequence labeling (information extraction) tasks such as
+    [FUNSD](https://guillaumejaume.github.io/FUNSD/), [SROIE](https://rrc.cvc.uab.es/?ch=13),
+    [CORD](https://github.com/clovaai/cord) and [Kleister-NDA](https://github.com/applicaai/kleister-nda).
+    """
+)
+class LayoutLMv2ForTokenClassification(LayoutLMv2PreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+        self.layoutlmv2 = LayoutLMv2Model(config)  # backbone whose output the head consumes
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.classifier = nn.Linear(config.hidden_size, config.num_labels)  # hidden_size -> num_labels
+
+        # Weight initialization and any final processing are delegated to `post_init`
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.layoutlmv2.embeddings.word_embeddings
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        bbox: Optional[torch.LongTensor] = None,
+        image: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, TokenClassifierOutput]:
+        r"""
+        input_ids (`torch.LongTensor` of shape `batch_size, sequence_length`):
+            Indices of input sequence tokens in the vocabulary.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        bbox (`torch.LongTensor` of shape `(batch_size, sequence_length, 4)`, *optional*):
+            Bounding boxes of each input sequence tokens. Selected in the range `[0,
+            config.max_2d_position_embeddings-1]`. Each bounding box should be a normalized version in (x0, y0, x1, y1)
+            format, where (x0, y0) corresponds to the position of the upper left corner in the bounding box, and (x1,
+            y1) represents the position of the lower right corner.
+        image (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` or `detectron.structures.ImageList` whose `tensors` is of shape `(batch_size, num_channels, height, width)`):
+            Batch of document images.
+        token_type_ids (`torch.LongTensor` of shape `batch_size, sequence_length`, *optional*):
+            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
+            1]`:
+
+            - 0 corresponds to a *sentence A* token,
+            - 1 corresponds to a *sentence B* token.
+
+            [What are token type IDs?](../glossary#token-type-ids)
+        position_ids (`torch.LongTensor` of shape `batch_size, sequence_length`, *optional*):
+            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+            config.max_position_embeddings - 1]`.
+
+            [What are position IDs?](../glossary#position-ids)
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
+
+        Example:
+
+        ```python
+        >>> from transformers import AutoProcessor, LayoutLMv2ForTokenClassification, set_seed
+        >>> from PIL import Image
+        >>> from datasets import load_dataset
+
+        >>> set_seed(0)
+
+        >>> datasets = load_dataset("nielsr/funsd", split="test")
+        >>> labels = datasets.features["ner_tags"].feature.names
+        >>> id2label = {v: k for v, k in enumerate(labels)}
+
+        >>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv2-base-uncased", revision="no_ocr")
+        >>> model = LayoutLMv2ForTokenClassification.from_pretrained(
+        ...     "microsoft/layoutlmv2-base-uncased", num_labels=len(labels)
+        ... )
+
+        >>> data = datasets[0]
+        >>> image = Image.open(data["image_path"]).convert("RGB")
+        >>> words = data["words"]
+        >>> boxes = data["bboxes"]  # make sure to normalize your bounding boxes
+        >>> word_labels = data["ner_tags"]
+        >>> encoding = processor(
+        ...     image,
+        ...     words,
+        ...     boxes=boxes,
+        ...     word_labels=word_labels,
+        ...     padding="max_length",
+        ...     truncation=True,
+        ...     return_tensors="pt",
+        ... )
+
+        >>> outputs = model(**encoding)
+        >>> logits, loss = outputs.logits, outputs.loss
+
+        >>> predicted_token_class_ids = logits.argmax(-1)
+        >>> predicted_tokens_classes = [id2label[t.item()] for t in predicted_token_class_ids[0]]
+        >>> predicted_tokens_classes[:5]  # results are not good without further fine-tuning
+        ['I-HEADER', 'I-HEADER', 'I-QUESTION', 'I-HEADER', 'I-QUESTION']
+        ```
+        """
+
+        if return_dict is None:
+            return_dict = self.config.use_return_dict
+
+        encoder_outputs = self.layoutlmv2(
+            input_ids=input_ids,
+            bbox=bbox,
+            image=image,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        if input_ids is None:
+            text_shape = inputs_embeds.size()[:-1]
+        else:
+            text_shape = input_ids.size()
+
+        # keep only the leading text positions of the backbone output, then classify each of them
+        text_states = encoder_outputs[0][:, : text_shape[1]]
+        logits = self.classifier(self.dropout(text_states))
+
+        loss = None
+        if labels is not None:
+            flat_logits = logits.view(-1, self.num_labels)
+            loss = CrossEntropyLoss()(flat_logits, labels.view(-1))
+
+        if return_dict:
+            return TokenClassifierOutput(
+                loss=loss,
+                logits=logits,
+                hidden_states=encoder_outputs.hidden_states,
+                attentions=encoder_outputs.attentions,
+            )
+
+        # tuple form: optional loss, then logits, then every backbone output after the first two
+        tuple_output = (logits,) + encoder_outputs[2:]
+        return tuple_output if loss is None else (loss,) + tuple_output
+
+
+@auto_docstring
+class LayoutLMv2ForQuestionAnswering(LayoutLMv2PreTrainedModel):
+    def __init__(self, config, has_visual_segment_embedding=True):
+        r"""
+        has_visual_segment_embedding (`bool`, *optional*, defaults to `True`):
+            Whether or not to add visual segment embeddings.
+        """
+        super().__init__(config)
+        self.num_labels = config.num_labels
+        config.has_visual_segment_embedding = has_visual_segment_embedding  # written onto the caller's config
+        self.layoutlmv2 = LayoutLMv2Model(config)
+        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)  # hidden_size -> num_labels
+
+        # Weight initialization and any final processing are delegated to `post_init`
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.layoutlmv2.embeddings.word_embeddings
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        bbox: Optional[torch.LongTensor] = None,
+        image: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        start_positions: Optional[torch.LongTensor] = None,
+        end_positions: Optional[torch.LongTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, QuestionAnsweringModelOutput]:
+        r"""
+        input_ids (`torch.LongTensor` of shape `batch_size, sequence_length`):
+            Indices of input sequence tokens in the vocabulary.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        bbox (`torch.LongTensor` of shape `(batch_size, sequence_length, 4)`, *optional*):
+            Bounding boxes of each input sequence tokens. Selected in the range `[0,
+            config.max_2d_position_embeddings-1]`. Each bounding box should be a normalized version in (x0, y0, x1, y1)
+            format, where (x0, y0) corresponds to the position of the upper left corner in the bounding box, and (x1,
+            y1) represents the position of the lower right corner.
+        image (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` or `detectron.structures.ImageList` whose `tensors` is of shape `(batch_size, num_channels, height, width)`):
+            Batch of document images.
+        token_type_ids (`torch.LongTensor` of shape `batch_size, sequence_length`, *optional*):
+            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
+            1]`:
+
+            - 0 corresponds to a *sentence A* token,
+            - 1 corresponds to a *sentence B* token.
+
+            [What are token type IDs?](../glossary#token-type-ids)
+        position_ids (`torch.LongTensor` of shape `batch_size, sequence_length`, *optional*):
+            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+            config.max_position_embeddings - 1]`.
+
+            [What are position IDs?](../glossary#position-ids)
+
+        Example:
+
+        In this example below, we give the LayoutLMv2 model an image (of texts) and ask it a question. It will give us
+        a prediction of what it thinks the answer is (the span of the answer within the texts parsed from the image).
+
+        ```python
+        >>> from transformers import AutoProcessor, LayoutLMv2ForQuestionAnswering, set_seed
+        >>> import torch
+        >>> from PIL import Image
+        >>> from datasets import load_dataset
+
+        >>> set_seed(0)
+        >>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv2-base-uncased")
+        >>> model = LayoutLMv2ForQuestionAnswering.from_pretrained("microsoft/layoutlmv2-base-uncased")
+
+        >>> dataset = load_dataset("hf-internal-testing/fixtures_docvqa")
+        >>> image = dataset["test"][0]["image"]
+        >>> question = "When is coffee break?"
+        >>> encoding = processor(image, question, return_tensors="pt")
+
+        >>> outputs = model(**encoding)
+        >>> predicted_start_idx = outputs.start_logits.argmax(-1).item()
+        >>> predicted_end_idx = outputs.end_logits.argmax(-1).item()
+        >>> predicted_start_idx, predicted_end_idx
+        (30, 191)
+
+        >>> predicted_answer_tokens = encoding.input_ids.squeeze()[predicted_start_idx : predicted_end_idx + 1]
+        >>> predicted_answer = processor.tokenizer.decode(predicted_answer_tokens)
+        >>> predicted_answer  # results are not good without further fine-tuning
+        '44 a. m. to 12 : 25 p. m. 12 : 25 to 12 : 58 p. m. 12 : 58 to 4 : 00 p. m. 2 : 00 to 5 : 00 p. m. coffee break coffee will be served for men and women in the lobby adjacent to exhibit area. please move into exhibit area. ( exhibits open ) trrf general session ( part | ) presiding : lee a. waller trrf vice president “ introductory remarks ” lee a. waller, trrf vice presi - dent individual interviews with trrf public board members and sci - entific advisory council mem - bers conducted by trrf treasurer philip g. kuehn to get answers which the public refrigerated warehousing industry is looking for. plus questions from'
+        ```
+
+        ```python
+        >>> target_start_index = torch.tensor([7])
+        >>> target_end_index = torch.tensor([14])
+        >>> outputs = model(**encoding, start_positions=target_start_index, end_positions=target_end_index)
+        >>> predicted_answer_span_start = outputs.start_logits.argmax(-1).item()
+        >>> predicted_answer_span_end = outputs.end_logits.argmax(-1).item()
+        >>> predicted_answer_span_start, predicted_answer_span_end
+        (30, 191)
+        ```
+        """
+
+        if return_dict is None:
+            return_dict = self.config.use_return_dict
+
+        encoder_outputs = self.layoutlmv2(
+            input_ids=input_ids,
+            bbox=bbox,
+            image=image,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        if input_ids is None:
+            text_shape = inputs_embeds.size()[:-1]
+        else:
+            text_shape = input_ids.size()
+
+        # score only the leading text positions of the backbone output; the last dimension of the
+        # scores is then split into one start column and one end column
+        span_scores = self.qa_outputs(encoder_outputs[0][:, : text_shape[1]])
+        raw_start, raw_end = span_scores.split(1, dim=-1)
+        start_logits = raw_start.squeeze(-1).contiguous()
+        end_logits = raw_end.squeeze(-1).contiguous()
+
+        total_loss = None
+        if start_positions is not None and end_positions is not None:
+            # targets may carry an extra trailing dimension (e.g. after a multi-GPU split); drop it
+            if start_positions.dim() > 1:
+                start_positions = start_positions.squeeze(-1)
+            if end_positions.dim() > 1:
+                end_positions = end_positions.squeeze(-1)
+            # positions past the input are clamped to `ignored_index`, which the loss skips; negatives clamp to 0
+            ignored_index = start_logits.size(1)
+            start_positions = start_positions.clamp(0, ignored_index)
+            end_positions = end_positions.clamp(0, ignored_index)
+
+            span_loss = CrossEntropyLoss(ignore_index=ignored_index)
+            start_loss = span_loss(start_logits, start_positions)
+            end_loss = span_loss(end_logits, end_positions)
+            total_loss = (start_loss + end_loss) / 2
+
+        if return_dict:
+            return QuestionAnsweringModelOutput(
+                loss=total_loss,
+                start_logits=start_logits,
+                end_logits=end_logits,
+                hidden_states=encoder_outputs.hidden_states,
+                attentions=encoder_outputs.attentions,
+            )
+
+        # tuple form: optional loss, then both logits, then every backbone output after the first two
+        tuple_output = (start_logits, end_logits) + encoder_outputs[2:]
+        return tuple_output if total_loss is None else (total_loss,) + tuple_output
+
+
+__all__ = [
+    "LayoutLMv2ForQuestionAnswering",
+    "LayoutLMv2ForSequenceClassification",
+    "LayoutLMv2ForTokenClassification",
+    "LayoutLMv2Layer",
+    "LayoutLMv2Model",
+    "LayoutLMv2PreTrainedModel",
+]
diff --git a/phivenv/Lib/site-packages/transformers/models/layoutlmv2/processing_layoutlmv2.py b/phivenv/Lib/site-packages/transformers/models/layoutlmv2/processing_layoutlmv2.py
new file mode 100644
index 0000000000000000000000000000000000000000..27fc874f7293a872b63d100f604a26c06caa41eb
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/layoutlmv2/processing_layoutlmv2.py
@@ -0,0 +1,190 @@
+# coding=utf-8
+# Copyright 2021 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Processor class for LayoutLMv2.
+"""
+
+import warnings
+from typing import Optional, Union
+
+from ...processing_utils import ProcessorMixin
+from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy
+from ...utils import TensorType
+
+
+class LayoutLMv2Processor(ProcessorMixin):
+    r"""
+    Constructs a LayoutLMv2 processor which combines a LayoutLMv2 image processor and a LayoutLMv2 tokenizer into a
+    single processor.
+
+    [`LayoutLMv2Processor`] offers all the functionalities you need to prepare data for the model.
+
+    It first uses [`LayoutLMv2ImageProcessor`] to resize document images to a fixed size, and optionally applies OCR to
+    get words and normalized bounding boxes. These are then provided to [`LayoutLMv2Tokenizer`] or
+    [`LayoutLMv2TokenizerFast`], which turns the words and bounding boxes into token-level `input_ids`,
+    `attention_mask`, `token_type_ids`, `bbox`. Optionally, one can provide integer `word_labels`, which are turned
+    into token-level `labels` for token classification tasks (such as FUNSD, CORD).
+
+    Args:
+        image_processor (`LayoutLMv2ImageProcessor`, *optional*):
+            An instance of [`LayoutLMv2ImageProcessor`]. The image processor is a required input.
+        tokenizer (`LayoutLMv2Tokenizer` or `LayoutLMv2TokenizerFast`, *optional*):
+            An instance of [`LayoutLMv2Tokenizer`] or [`LayoutLMv2TokenizerFast`]. The tokenizer is a required input.
+    """
+
+    attributes = ["image_processor", "tokenizer"]
+    image_processor_class = "LayoutLMv2ImageProcessor"
+    tokenizer_class = ("LayoutLMv2Tokenizer", "LayoutLMv2TokenizerFast")
+
+    def __init__(self, image_processor=None, tokenizer=None, **kwargs):
+        feature_extractor = None
+        if "feature_extractor" in kwargs:
+            warnings.warn(
+                "The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`"
+                " instead.",
+                FutureWarning,
+            )
+            feature_extractor = kwargs.pop("feature_extractor")
+        # an explicit `image_processor` wins; the deprecated argument is only used as a fallback
+        image_processor = image_processor if image_processor is not None else feature_extractor
+        if image_processor is None:
+            raise ValueError("You need to specify an `image_processor`.")
+        if tokenizer is None:
+            raise ValueError("You need to specify a `tokenizer`.")
+
+        super().__init__(image_processor, tokenizer)
+
+    def __call__(
+        self,
+        images,
+        text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
+        text_pair: Optional[Union[PreTokenizedInput, list[PreTokenizedInput]]] = None,
+        boxes: Optional[Union[list[list[int]], list[list[list[int]]]]] = None,
+        word_labels: Optional[Union[list[int], list[list[int]]]] = None,
+        add_special_tokens: bool = True,
+        padding: Union[bool, str, PaddingStrategy] = False,
+        truncation: Union[bool, str, TruncationStrategy] = False,
+        max_length: Optional[int] = None,
+        stride: int = 0,
+        pad_to_multiple_of: Optional[int] = None,
+        return_token_type_ids: Optional[bool] = None,
+        return_attention_mask: Optional[bool] = None,
+        return_overflowing_tokens: bool = False,
+        return_special_tokens_mask: bool = False,
+        return_offsets_mapping: bool = False,
+        return_length: bool = False,
+        verbose: bool = True,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        **kwargs,
+    ) -> BatchEncoding:
+        """
+        This method first forwards the `images` argument to [`~LayoutLMv2ImageProcessor.__call__`]. In case
+        [`LayoutLMv2ImageProcessor`] was initialized with `apply_ocr` set to `True`, it passes the obtained words and
+        bounding boxes along with the additional arguments to [`~LayoutLMv2Tokenizer.__call__`] and returns the output,
+        together with resized `images`. In case [`LayoutLMv2ImageProcessor`] was initialized with `apply_ocr` set to
+        `False`, it passes the words (`text`/`text_pair`) and `boxes` specified by the user along with the additional
+        arguments to [`~LayoutLMv2Tokenizer.__call__`] and returns the output, together with resized `images`.
+
+        Please refer to the docstring of the above two methods for more information.
+        """
+        # verify input
+        if self.image_processor.apply_ocr and (boxes is not None):
+            raise ValueError(
+                "You cannot provide bounding boxes if you initialized the image processor with apply_ocr set to True."
+            )
+
+        if self.image_processor.apply_ocr and (word_labels is not None):
+            raise ValueError(
+                "You cannot provide word labels if you initialized the image processor with apply_ocr set to True."
+            )
+
+        if return_overflowing_tokens is True and return_offsets_mapping is False:
+            raise ValueError("You cannot return overflowing tokens without returning the offsets mapping.")
+
+        # first, apply the image processor
+        features = self.image_processor(images=images, return_tensors=return_tensors)
+
+        # second, apply the tokenizer; with OCR on, a lone `text` is paired with the OCR'd words as `text_pair`
+        if text is not None and self.image_processor.apply_ocr and text_pair is None:
+            if isinstance(text, str):
+                text = [text]  # add batch dimension (as the image processor always adds a batch dimension)
+            text_pair = features["words"]
+        # NOTE(review): the `features["words"]` / `features["boxes"]` fallbacks assume OCR output exists - confirm
+        encoded_inputs = self.tokenizer(
+            text=text if text is not None else features["words"],
+            text_pair=text_pair,
+            boxes=boxes if boxes is not None else features["boxes"],
+            word_labels=word_labels,
+            add_special_tokens=add_special_tokens,
+            padding=padding,
+            truncation=truncation,
+            max_length=max_length,
+            stride=stride,
+            pad_to_multiple_of=pad_to_multiple_of,
+            return_token_type_ids=return_token_type_ids,
+            return_attention_mask=return_attention_mask,
+            return_overflowing_tokens=return_overflowing_tokens,
+            return_special_tokens_mask=return_special_tokens_mask,
+            return_offsets_mapping=return_offsets_mapping,
+            return_length=return_length,
+            verbose=verbose,
+            return_tensors=return_tensors,
+            **kwargs,
+        )
+
+        # add pixel values
+        images = features.pop("pixel_values")
+        if return_overflowing_tokens is True:
+            images = self.get_overflowing_images(images, encoded_inputs["overflow_to_sample_mapping"])
+        encoded_inputs["image"] = images
+
+        return encoded_inputs
+
+    def get_overflowing_images(self, images, overflow_to_sample_mapping):
+        """
+        Repeat images so that every overflowing `input_ids` chunk is paired with the image it came from.
+
+        Args:
+            images: Indexable batch of images.
+            overflow_to_sample_mapping: Iterable of indices into `images`, one per encoded chunk.
+
+        Returns:
+            `list`: `images[sample_idx]` for each `sample_idx` in `overflow_to_sample_mapping`, in order.
+        """
+        # built one-to-one from the mapping, so the result always holds exactly one image per chunk
+        return [images[sample_idx] for sample_idx in overflow_to_sample_mapping]
+
+    @property
+    def model_input_names(self):
+        return ["input_ids", "bbox", "token_type_ids", "attention_mask", "image"]
+
+    @property
+    def feature_extractor_class(self):
+        warnings.warn(
+            "`feature_extractor_class` is deprecated and will be removed in v5. Use `image_processor_class` instead.",
+            FutureWarning,
+        )
+        return self.image_processor_class
+
+    @property
+    def feature_extractor(self):
+        warnings.warn(
+            "`feature_extractor` is deprecated and will be removed in v5. Use `image_processor` instead.",
+            FutureWarning,
+        )
+        return self.image_processor
+
+
+__all__ = ["LayoutLMv2Processor"]
diff --git a/phivenv/Lib/site-packages/transformers/models/layoutlmv2/tokenization_layoutlmv2.py b/phivenv/Lib/site-packages/transformers/models/layoutlmv2/tokenization_layoutlmv2.py
new file mode 100644
index 0000000000000000000000000000000000000000..7d82b5cf41041b0024134c0f1a6294c0cace824c
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/layoutlmv2/tokenization_layoutlmv2.py
@@ -0,0 +1,1545 @@
+# coding=utf-8
+# Copyright Microsoft Research and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization class for LayoutLMv2."""
+
+import collections
+import os
+import sys
+import unicodedata
+from typing import Optional, Union
+
+from ...tokenization_utils import AddedToken, PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
+from ...tokenization_utils_base import (
+    BatchEncoding,
+    EncodedInput,
+    PreTokenizedInput,
+    TextInput,
+    TextInputPair,
+    TruncationStrategy,
+)
+from ...utils import PaddingStrategy, TensorType, add_end_docstrings, logging
+
+
+# Module-level logger, obtained through the library's own `logging` utilities.
+logger = logging.get_logger(__name__)
+
+# Maps the tokenizer's `vocab_file` argument to the file name used when the vocabulary is saved into a
+# directory (see `LayoutLMv2Tokenizer.save_vocabulary`).
+VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}
+
+
+LAYOUTLMV2_ENCODE_KWARGS_DOCSTRING = r"""
+            add_special_tokens (`bool`, *optional*, defaults to `True`):
+                Whether or not to encode the sequences with the special tokens relative to their model.
+            padding (`bool`, `str` or [`~file_utils.PaddingStrategy`], *optional*, defaults to `False`):
+                Activates and controls padding. Accepts the following values:
+
+                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
+                  sequence is provided).
+                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
+                  acceptable input length for the model if that argument is not provided.
+                - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
+                  lengths).
+            truncation (`bool`, `str` or [`~tokenization_utils_base.TruncationStrategy`], *optional*, defaults to `False`):
+                Activates and controls truncation. Accepts the following values:
+
+                - `True` or `'longest_first'`: Truncate to a maximum length specified with the argument `max_length` or
+                  to the maximum acceptable input length for the model if that argument is not provided. This will
+                  truncate token by token, removing a token from the longest sequence in the pair if a pair of
+                  sequences (or a batch of pairs) is provided.
+                - `'only_first'`: Truncate to a maximum length specified with the argument `max_length` or to the
+                  maximum acceptable input length for the model if that argument is not provided. This will only
+                  truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
+                - `'only_second'`: Truncate to a maximum length specified with the argument `max_length` or to the
+                  maximum acceptable input length for the model if that argument is not provided. This will only
+                  truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
+                - `False` or `'do_not_truncate'` (default): No truncation (i.e., can output batch with sequence lengths
+                  greater than the model maximum admissible input size).
+            max_length (`int`, *optional*):
+                Controls the maximum length to use by one of the truncation/padding parameters.
+
+                If left unset or set to `None`, this will use the predefined model maximum length if a maximum length
+                is required by one of the truncation/padding parameters. If the model has no specific maximum input
+                length (like XLNet) truncation/padding to a maximum length will be deactivated.
+            stride (`int`, *optional*, defaults to 0):
+                If set to a number along with `max_length`, the overflowing tokens returned when
+                `return_overflowing_tokens=True` will contain some tokens from the end of the truncated sequence
+                returned to provide some overlap between truncated and overflowing sequences. The value of this
+                argument defines the number of overlapping tokens.
+            pad_to_multiple_of (`int`, *optional*):
+                If set will pad the sequence to a multiple of the provided value. This is especially useful to enable
+                the use of Tensor Cores on NVIDIA hardware with compute capability `>= 7.5` (Volta).
+            return_tensors (`str` or [`~file_utils.TensorType`], *optional*):
+                If set, will return tensors instead of list of python integers. Acceptable values are:
+
+                - `'tf'`: Return TensorFlow `tf.constant` objects.
+                - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                - `'np'`: Return Numpy `np.ndarray` objects.
+"""
+
+LAYOUTLMV2_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING = r"""
+            return_token_type_ids (`bool`, *optional*):
+                Whether to return token type IDs. If left to the default, will return the token type IDs according to
+                the specific tokenizer's default, defined by the `return_outputs` attribute.
+
+                [What are token type IDs?](../glossary#token-type-ids)
+            return_attention_mask (`bool`, *optional*):
+                Whether to return the attention mask. If left to the default, will return the attention mask according
+                to the specific tokenizer's default, defined by the `return_outputs` attribute.
+
+                [What are attention masks?](../glossary#attention-mask)
+            return_overflowing_tokens (`bool`, *optional*, defaults to `False`):
+                Whether or not to return overflowing token sequences. If a pair of sequences of input ids (or a batch
+                of pairs) is provided with `truncation_strategy = longest_first` or `True`, an error is raised instead
+                of returning overflowing tokens.
+            return_special_tokens_mask (`bool`, *optional*, defaults to `False`):
+                Whether or not to return special tokens mask information.
+            return_offsets_mapping (`bool`, *optional*, defaults to `False`):
+                Whether or not to return `(char_start, char_end)` for each token.
+
+                This is only available on fast tokenizers inheriting from [`PreTrainedTokenizerFast`], if using
+                Python's tokenizer, this method will raise `NotImplementedError`.
+            return_length  (`bool`, *optional*, defaults to `False`):
+                Whether or not to return the lengths of the encoded inputs.
+            verbose (`bool`, *optional*, defaults to `True`):
+                Whether or not to print more information and warnings.
+            **kwargs: passed to the `self.tokenize()` method
+
+        Return:
+            [`BatchEncoding`]: A [`BatchEncoding`] with the following fields:
+
+            - **input_ids** -- List of token ids to be fed to a model.
+
+              [What are input IDs?](../glossary#input-ids)
+
+            - **bbox** -- List of bounding boxes to be fed to a model.
+
+            - **token_type_ids** -- List of token type ids to be fed to a model (when `return_token_type_ids=True` or
+              if *"token_type_ids"* is in `self.model_input_names`).
+
+              [What are token type IDs?](../glossary#token-type-ids)
+
+            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
+              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names`).
+
+              [What are attention masks?](../glossary#attention-mask)
+
+            - **labels** -- List of labels to be fed to a model. (when `word_labels` is specified).
+            - **overflowing_tokens** -- List of overflowing tokens sequences (when a `max_length` is specified and
+              `return_overflowing_tokens=True`).
+            - **num_truncated_tokens** -- Number of tokens truncated (when a `max_length` is specified and
+              `return_overflowing_tokens=True`).
+            - **special_tokens_mask** -- List of 0s and 1s, with 1 specifying added special tokens and 0 specifying
+              regular sequence tokens (when `add_special_tokens=True` and `return_special_tokens_mask=True`).
+            - **length** -- The length of the inputs (when `return_length=True`).
+"""
+
+
+def load_vocab(vocab_file):
+    """Loads a vocabulary file into a dictionary."""
+    vocab = collections.OrderedDict()
+    with open(vocab_file, "r", encoding="utf-8") as reader:
+        tokens = reader.readlines()
+    for index, token in enumerate(tokens):
+        token = token.rstrip("\n")
+        vocab[token] = index
+    return vocab
+
+
+def whitespace_tokenize(text):
+    """Runs basic whitespace cleaning and splitting on a piece of text."""
+    text = text.strip()
+    if not text:
+        return []
+    tokens = text.split()
+    return tokens
+
+
+# Dictionary with one key per Unicode code point whose general category starts with "P" (punctuation), every value
+# being `None` -- the shape `str.translate` accepts for deleting characters. It is built eagerly at import time by
+# scanning `range(sys.maxunicode)`.
+table = dict.fromkeys(i for i in range(sys.maxunicode) if unicodedata.category(chr(i)).startswith("P"))
+
+
+def subfinder(mylist, pattern):
+    matches = []
+    indices = []
+    for idx, i in enumerate(range(len(mylist))):
+        if mylist[i] == pattern[0] and mylist[i : i + len(pattern)] == pattern:
+            matches.append(pattern)
+            indices.append(idx)
+    if matches:
+        return matches[0], indices[0]
+    else:
+        return None, 0
+
+
+class LayoutLMv2Tokenizer(PreTrainedTokenizer):
+    r"""
+    Construct a LayoutLMv2 tokenizer. Based on WordPiece. [`LayoutLMv2Tokenizer`] can be used to turn words, word-level
+    bounding boxes and optional word labels to token-level `input_ids`, `attention_mask`, `token_type_ids`, `bbox`, and
+    optional `labels` (for token classification).
+
+    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
+    this superclass for more information regarding those methods.
+
+    [`LayoutLMv2Tokenizer`] runs end-to-end tokenization: punctuation splitting and wordpiece. It also turns the
+    word-level bounding boxes into token-level bounding boxes.
+
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+
+    def __init__(
+        self,
+        vocab_file,
+        do_lower_case=True,
+        do_basic_tokenize=True,
+        never_split=None,
+        unk_token="[UNK]",
+        sep_token="[SEP]",
+        pad_token="[PAD]",
+        cls_token="[CLS]",
+        mask_token="[MASK]",
+        cls_token_box=[0, 0, 0, 0],
+        sep_token_box=[1000, 1000, 1000, 1000],
+        pad_token_box=[0, 0, 0, 0],
+        pad_token_label=-100,
+        only_label_first_subword=True,
+        tokenize_chinese_chars=True,
+        strip_accents=None,
+        model_max_length: int = 512,
+        additional_special_tokens: Optional[list[str]] = None,
+        **kwargs,
+    ):
+        sep_token = AddedToken(sep_token, special=True) if isinstance(sep_token, str) else sep_token
+        unk_token = AddedToken(unk_token, special=True) if isinstance(unk_token, str) else unk_token
+        pad_token = AddedToken(pad_token, special=True) if isinstance(pad_token, str) else pad_token
+        cls_token = AddedToken(cls_token, special=True) if isinstance(cls_token, str) else cls_token
+        mask_token = AddedToken(mask_token, special=True) if isinstance(mask_token, str) else mask_token
+
+        if not os.path.isfile(vocab_file):
+            raise ValueError(
+                f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
+                " model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
+            )
+        self.vocab = load_vocab(vocab_file)
+        self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
+        self.do_basic_tokenize = do_basic_tokenize
+        if do_basic_tokenize:
+            self.basic_tokenizer = BasicTokenizer(
+                do_lower_case=do_lower_case,
+                never_split=never_split,
+                tokenize_chinese_chars=tokenize_chinese_chars,
+                strip_accents=strip_accents,
+            )
+        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token))
+
+        # additional properties
+        self.cls_token_box = cls_token_box
+        self.sep_token_box = sep_token_box
+        self.pad_token_box = pad_token_box
+        self.pad_token_label = pad_token_label
+        self.only_label_first_subword = only_label_first_subword
+        super().__init__(
+            do_lower_case=do_lower_case,
+            do_basic_tokenize=do_basic_tokenize,
+            never_split=never_split,
+            unk_token=unk_token,
+            sep_token=sep_token,
+            pad_token=pad_token,
+            cls_token=cls_token,
+            mask_token=mask_token,
+            cls_token_box=cls_token_box,
+            sep_token_box=sep_token_box,
+            pad_token_box=pad_token_box,
+            pad_token_label=pad_token_label,
+            only_label_first_subword=only_label_first_subword,
+            tokenize_chinese_chars=tokenize_chinese_chars,
+            strip_accents=strip_accents,
+            model_max_length=model_max_length,
+            additional_special_tokens=additional_special_tokens,
+            **kwargs,
+        )
+
+    @property
+    def do_lower_case(self):
+        return self.basic_tokenizer.do_lower_case
+
+    @property
+    def vocab_size(self):
+        return len(self.vocab)
+
+    def get_vocab(self):
+        return dict(self.vocab, **self.added_tokens_encoder)
+
+    def _tokenize(self, text):
+        split_tokens = []
+        if self.do_basic_tokenize:
+            for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens):
+                # If the token is part of the never_split set
+                if token in self.basic_tokenizer.never_split:
+                    split_tokens.append(token)
+                else:
+                    split_tokens += self.wordpiece_tokenizer.tokenize(token)
+        else:
+            split_tokens = self.wordpiece_tokenizer.tokenize(text)
+        return split_tokens
+
+    def _convert_token_to_id(self, token):
+        """Converts a token (str) in an id using the vocab."""
+        return self.vocab.get(token, self.vocab.get(self.unk_token))
+
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) in a token (str) using the vocab."""
+        return self.ids_to_tokens.get(index, self.unk_token)
+
+    def convert_tokens_to_string(self, tokens):
+        """Converts a sequence of tokens (string) in a single string."""
+        out_string = " ".join(tokens).replace(" ##", "").strip()
+        return out_string
+
+    def build_inputs_with_special_tokens(
+        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
+    ) -> list[int]:
+        """
+        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
+        adding special tokens. A BERT sequence has the following format:
+
+        - single sequence: `[CLS] X [SEP]`
+        - pair of sequences: `[CLS] A [SEP] B [SEP]`
+
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs to which the special tokens will be added.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
+        """
+        if token_ids_1 is None:
+            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
+        cls = [self.cls_token_id]
+        sep = [self.sep_token_id]
+        return cls + token_ids_0 + sep + token_ids_1 + sep
+
+    def get_special_tokens_mask(
+        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False
+    ) -> list[int]:
+        """
+        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
+        special tokens using the tokenizer `prepare_for_model` method.
+
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
+                Whether or not the token list is already formatted with special tokens for the model.
+
+        Returns:
+            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+        """
+
+        if already_has_special_tokens:
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+            )
+
+        if token_ids_1 is not None:
+            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
+        return [1] + ([0] * len(token_ids_0)) + [1]
+
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
+        index = 0
+        if os.path.isdir(save_directory):
+            vocab_file = os.path.join(
+                save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+            )
+        else:
+            vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory
+        with open(vocab_file, "w", encoding="utf-8") as writer:
+            for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
+                if index != token_index:
+                    logger.warning(
+                        f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive."
+                        " Please check that the vocabulary is not corrupted!"
+                    )
+                    index = token_index
+                writer.write(token + "\n")
+                index += 1
+        return (vocab_file,)
+
+    @add_end_docstrings(LAYOUTLMV2_ENCODE_KWARGS_DOCSTRING, LAYOUTLMV2_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
+    def __call__(
+        self,
+        text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]],
+        text_pair: Optional[Union[PreTokenizedInput, list[PreTokenizedInput]]] = None,
+        boxes: Optional[Union[list[list[int]], list[list[list[int]]]]] = None,
+        word_labels: Optional[Union[list[int], list[list[int]]]] = None,
+        add_special_tokens: bool = True,
+        padding: Union[bool, str, PaddingStrategy] = False,
+        truncation: Union[bool, str, TruncationStrategy] = None,
+        max_length: Optional[int] = None,
+        stride: int = 0,
+        pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[str] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        return_token_type_ids: Optional[bool] = None,
+        return_attention_mask: Optional[bool] = None,
+        return_overflowing_tokens: bool = False,
+        return_special_tokens_mask: bool = False,
+        return_offsets_mapping: bool = False,
+        return_length: bool = False,
+        verbose: bool = True,
+        **kwargs,
+    ) -> BatchEncoding:
+        """
+        Main method to tokenize and prepare for the model one or several sequence(s) or one or several pair(s) of
+        sequences with word-level normalized bounding boxes and optional labels.
+
+        The inputs are validated first (a `ValueError` is raised for a wrong input type, for missing `boxes`, or
+        when the number of words and boxes differ); the call is then dispatched to `batch_encode_plus` for batched
+        input and to `encode_plus` otherwise.
+
+        Args:
+            text (`str`, `List[str]`, `List[List[str]]`):
+                The sequence or batch of sequences to be encoded. Each sequence can be a string, a list of strings
+                (words of a single example or questions of a batch of examples) or a list of list of strings (batch of
+                words).
+            text_pair (`List[str]`, `List[List[str]]`):
+                The sequence or batch of sequences to be encoded. Each sequence should be a list of strings
+                (pretokenized string).
+            boxes (`List[List[int]]`, `List[List[List[int]]]`):
+                Word-level bounding boxes. Each bounding box should be normalized to be on a 0-1000 scale.
+            word_labels (`List[int]`, `List[List[int]]`, *optional*):
+                Word-level integer labels (for token classification tasks such as FUNSD, CORD).
+        """
+
+        # Input type checking for clearer error
+        def _is_valid_text_input(t):
+            # Only the first element (and the first element of that) is inspected, not every item.
+            if isinstance(t, str):
+                # Strings are fine
+                return True
+            elif isinstance(t, (list, tuple)):
+                # List are fine as long as they are...
+                if len(t) == 0:
+                    # ... empty
+                    return True
+                elif isinstance(t[0], str):
+                    # ... list of strings
+                    return True
+                elif isinstance(t[0], (list, tuple)):
+                    # ... list with an empty list or with a list of strings
+                    return len(t[0]) == 0 or isinstance(t[0][0], str)
+                else:
+                    return False
+            else:
+                return False
+
+        if text_pair is not None:
+            # in case text + text_pair are provided, text = questions, text_pair = words
+            if not _is_valid_text_input(text):
+                raise ValueError("text input must of type `str` (single example) or `List[str]` (batch of examples). ")
+            if not isinstance(text_pair, (list, tuple)):
+                raise ValueError(
+                    "Words must be of type `List[str]` (single pretokenized example), "
+                    "or `List[List[str]]` (batch of pretokenized examples)."
+                )
+        else:
+            # in case only text is provided => must be words
+            if not isinstance(text, (list, tuple)):
+                raise ValueError(
+                    "Words must be of type `List[str]` (single pretokenized example), "
+                    "or `List[List[str]]` (batch of pretokenized examples)."
+                )
+
+        if text_pair is not None:
+            # with a pair, `text` holds the question(s): a list/tuple of them means a batch
+            is_batched = isinstance(text, (list, tuple))
+        else:
+            # words only: a batch is a non-empty list whose first item is itself a list of words
+            # (for an empty `text` this expression evaluates to the empty sequence itself, which is falsy)
+            is_batched = isinstance(text, (list, tuple)) and text and isinstance(text[0], (list, tuple))
+
+        # the boxes always describe the words, which live in `text_pair` when a question is given
+        words = text if text_pair is None else text_pair
+        if boxes is None:
+            raise ValueError("You must provide corresponding bounding boxes")
+        if is_batched:
+            if len(words) != len(boxes):
+                raise ValueError("You must provide words and boxes for an equal amount of examples")
+            for words_example, boxes_example in zip(words, boxes):
+                if len(words_example) != len(boxes_example):
+                    raise ValueError("You must provide as many words as there are bounding boxes")
+        else:
+            if len(words) != len(boxes):
+                raise ValueError("You must provide as many words as there are bounding boxes")
+
+        if is_batched:
+            if text_pair is not None and len(text) != len(text_pair):
+                raise ValueError(
+                    f"batch length of `text`: {len(text)} does not match batch length of `text_pair`:"
+                    f" {len(text_pair)}."
+                )
+            # pairs are passed on as a list of (question, words) tuples, flagged by `is_pair`
+            batch_text_or_text_pairs = list(zip(text, text_pair)) if text_pair is not None else text
+            is_pair = bool(text_pair is not None)
+            return self.batch_encode_plus(
+                batch_text_or_text_pairs=batch_text_or_text_pairs,
+                is_pair=is_pair,
+                boxes=boxes,
+                word_labels=word_labels,
+                add_special_tokens=add_special_tokens,
+                padding=padding,
+                truncation=truncation,
+                max_length=max_length,
+                stride=stride,
+                pad_to_multiple_of=pad_to_multiple_of,
+                padding_side=padding_side,
+                return_tensors=return_tensors,
+                return_token_type_ids=return_token_type_ids,
+                return_attention_mask=return_attention_mask,
+                return_overflowing_tokens=return_overflowing_tokens,
+                return_special_tokens_mask=return_special_tokens_mask,
+                return_offsets_mapping=return_offsets_mapping,
+                return_length=return_length,
+                verbose=verbose,
+                **kwargs,
+            )
+        else:
+            return self.encode_plus(
+                text=text,
+                text_pair=text_pair,
+                boxes=boxes,
+                word_labels=word_labels,
+                add_special_tokens=add_special_tokens,
+                padding=padding,
+                truncation=truncation,
+                max_length=max_length,
+                stride=stride,
+                pad_to_multiple_of=pad_to_multiple_of,
+                padding_side=padding_side,
+                return_tensors=return_tensors,
+                return_token_type_ids=return_token_type_ids,
+                return_attention_mask=return_attention_mask,
+                return_overflowing_tokens=return_overflowing_tokens,
+                return_special_tokens_mask=return_special_tokens_mask,
+                return_offsets_mapping=return_offsets_mapping,
+                return_length=return_length,
+                verbose=verbose,
+                **kwargs,
+            )
+
+    @add_end_docstrings(LAYOUTLMV2_ENCODE_KWARGS_DOCSTRING, LAYOUTLMV2_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
+    def batch_encode_plus(
+        self,
+        batch_text_or_text_pairs: Union[
+            list[TextInput],
+            list[TextInputPair],
+            list[PreTokenizedInput],
+        ],
+        is_pair: Optional[bool] = None,
+        boxes: Optional[list[list[list[int]]]] = None,
+        word_labels: Optional[Union[list[int], list[list[int]]]] = None,
+        add_special_tokens: bool = True,
+        padding: Union[bool, str, PaddingStrategy] = False,
+        truncation: Union[bool, str, TruncationStrategy] = None,
+        max_length: Optional[int] = None,
+        stride: int = 0,
+        pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[str] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        return_token_type_ids: Optional[bool] = None,
+        return_attention_mask: Optional[bool] = None,
+        return_overflowing_tokens: bool = False,
+        return_special_tokens_mask: bool = False,
+        return_offsets_mapping: bool = False,
+        return_length: bool = False,
+        verbose: bool = True,
+        **kwargs,
+    ) -> BatchEncoding:
+        """
+        Tokenize and prepare for the model a batch of examples (or of pairs) together with their bounding boxes.
+
+        `padding`, `truncation` and `max_length` are first resolved into strategies by
+        `_get_padding_truncation_strategies`; everything is then forwarded to `_batch_encode_plus`.
+
+        Args:
+            batch_text_or_text_pairs (`list`):
+                The batch to encode. When `is_pair` is true, each item is a pair whose first element is the text and
+                whose second element is the text pair.
+            is_pair (`bool`, *optional*):
+                Whether the items of `batch_text_or_text_pairs` are pairs.
+            boxes (`List[List[List[int]]]`):
+                Bounding boxes, one list per example of the batch.
+            word_labels (`List[List[int]]`, *optional*):
+                Labels, one list per example of the batch.
+        """
+        # Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
+        padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
+            padding=padding,
+            truncation=truncation,
+            max_length=max_length,
+            pad_to_multiple_of=pad_to_multiple_of,
+            verbose=verbose,
+            **kwargs,
+        )
+
+        return self._batch_encode_plus(
+            batch_text_or_text_pairs=batch_text_or_text_pairs,
+            is_pair=is_pair,
+            boxes=boxes,
+            word_labels=word_labels,
+            add_special_tokens=add_special_tokens,
+            padding_strategy=padding_strategy,
+            truncation_strategy=truncation_strategy,
+            max_length=max_length,
+            stride=stride,
+            pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
+            return_tensors=return_tensors,
+            return_token_type_ids=return_token_type_ids,
+            return_attention_mask=return_attention_mask,
+            return_overflowing_tokens=return_overflowing_tokens,
+            return_special_tokens_mask=return_special_tokens_mask,
+            return_offsets_mapping=return_offsets_mapping,
+            return_length=return_length,
+            verbose=verbose,
+            **kwargs,
+        )
+
+    def _batch_encode_plus(
+        self,
+        batch_text_or_text_pairs: Union[
+            list[TextInput],
+            list[TextInputPair],
+            list[PreTokenizedInput],
+        ],
+        is_pair: Optional[bool] = None,
+        boxes: Optional[list[list[list[int]]]] = None,
+        word_labels: Optional[list[list[int]]] = None,
+        add_special_tokens: bool = True,
+        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
+        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
+        max_length: Optional[int] = None,
+        stride: int = 0,
+        pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[str] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        return_token_type_ids: Optional[bool] = None,
+        return_attention_mask: Optional[bool] = None,
+        return_overflowing_tokens: bool = False,
+        return_special_tokens_mask: bool = False,
+        return_offsets_mapping: bool = False,
+        return_length: bool = False,
+        verbose: bool = True,
+        **kwargs,
+    ) -> BatchEncoding:
+        """
+        Encode a batch with already-resolved padding and truncation strategies.
+
+        Delegates to `_batch_prepare_for_model` and wraps its result in a [`BatchEncoding`]. Extra `**kwargs` are
+        accepted but not forwarded.
+
+        Raises:
+            NotImplementedError: If `return_offsets_mapping` is truthy, since this Python tokenizer cannot return
+                offset mappings.
+        """
+        if return_offsets_mapping:
+            raise NotImplementedError(
+                "return_offset_mapping is not available when using Python tokenizers. "
+                "To use this feature, change your tokenizer to one deriving from "
+                "transformers.PreTrainedTokenizerFast."
+            )
+
+        batch_outputs = self._batch_prepare_for_model(
+            batch_text_or_text_pairs=batch_text_or_text_pairs,
+            is_pair=is_pair,
+            boxes=boxes,
+            word_labels=word_labels,
+            add_special_tokens=add_special_tokens,
+            padding_strategy=padding_strategy,
+            truncation_strategy=truncation_strategy,
+            max_length=max_length,
+            stride=stride,
+            pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
+            return_attention_mask=return_attention_mask,
+            return_token_type_ids=return_token_type_ids,
+            return_overflowing_tokens=return_overflowing_tokens,
+            return_special_tokens_mask=return_special_tokens_mask,
+            return_length=return_length,
+            return_tensors=return_tensors,
+            verbose=verbose,
+        )
+
+        return BatchEncoding(batch_outputs)
+
+    @add_end_docstrings(LAYOUTLMV2_ENCODE_KWARGS_DOCSTRING, LAYOUTLMV2_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
+    def _batch_prepare_for_model(
+        self,
+        batch_text_or_text_pairs,
+        is_pair: Optional[bool] = None,
+        boxes: Optional[list[list[int]]] = None,
+        word_labels: Optional[list[list[int]]] = None,
+        add_special_tokens: bool = True,
+        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
+        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
+        max_length: Optional[int] = None,
+        stride: int = 0,
+        pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[str] = None,
+        return_tensors: Optional[str] = None,
+        return_token_type_ids: Optional[bool] = None,
+        return_attention_mask: Optional[bool] = None,
+        return_overflowing_tokens: bool = False,
+        return_special_tokens_mask: bool = False,
+        return_length: bool = False,
+        verbose: bool = True,
+    ) -> BatchEncoding:
+        """
+        Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model. It
+        adds special tokens, truncates sequences if overflowing while taking into account the special tokens and
+        manages a moving window (with user defined stride) for overflowing tokens.
+
+        Args:
+            batch_ids_pairs: list of tokenized input ids or input ids pairs
+        """
+
+        batch_outputs = {}
+        for idx, example in enumerate(zip(batch_text_or_text_pairs, boxes)):
+            batch_text_or_text_pair, boxes_example = example
+            outputs = self.prepare_for_model(
+                batch_text_or_text_pair[0] if is_pair else batch_text_or_text_pair,
+                batch_text_or_text_pair[1] if is_pair else None,
+                boxes_example,
+                word_labels=word_labels[idx] if word_labels is not None else None,
+                add_special_tokens=add_special_tokens,
+                padding=PaddingStrategy.DO_NOT_PAD.value,  # we pad in batch afterward
+                truncation=truncation_strategy.value,
+                max_length=max_length,
+                stride=stride,
+                pad_to_multiple_of=None,  # we pad in batch afterward
+                padding_side=None,  # we pad in batch afterward
+                return_attention_mask=False,  # we pad in batch afterward
+                return_token_type_ids=return_token_type_ids,
+                return_overflowing_tokens=return_overflowing_tokens,
+                return_special_tokens_mask=return_special_tokens_mask,
+                return_length=return_length,
+                return_tensors=None,  # We convert the whole batch to tensors at the end
+                prepend_batch_axis=False,
+                verbose=verbose,
+            )
+
+            for key, value in outputs.items():
+                if key not in batch_outputs:
+                    batch_outputs[key] = []
+                batch_outputs[key].append(value)
+
+        batch_outputs = self.pad(
+            batch_outputs,
+            padding=padding_strategy.value,
+            max_length=max_length,
+            pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
+            return_attention_mask=return_attention_mask,
+        )
+
+        batch_outputs = BatchEncoding(batch_outputs, tensor_type=return_tensors)
+
+        return batch_outputs
+
+    @add_end_docstrings(LAYOUTLMV2_ENCODE_KWARGS_DOCSTRING)
+    def encode(
+        self,
+        text: Union[TextInput, PreTokenizedInput],
+        text_pair: Optional[PreTokenizedInput] = None,
+        boxes: Optional[list[list[int]]] = None,
+        word_labels: Optional[list[int]] = None,
+        add_special_tokens: bool = True,
+        padding: Union[bool, str, PaddingStrategy] = False,
+        truncation: Union[bool, str, TruncationStrategy] = None,
+        max_length: Optional[int] = None,
+        stride: int = 0,
+        pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[str] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        return_token_type_ids: Optional[bool] = None,
+        return_attention_mask: Optional[bool] = None,
+        return_overflowing_tokens: bool = False,
+        return_special_tokens_mask: bool = False,
+        return_offsets_mapping: bool = False,
+        return_length: bool = False,
+        verbose: bool = True,
+        **kwargs,
+    ) -> list[int]:
+        encoded_inputs = self.encode_plus(
+            text=text,
+            text_pair=text_pair,
+            boxes=boxes,
+            word_labels=word_labels,
+            add_special_tokens=add_special_tokens,
+            padding=padding,
+            truncation=truncation,
+            max_length=max_length,
+            stride=stride,
+            pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
+            return_tensors=return_tensors,
+            return_token_type_ids=return_token_type_ids,
+            return_attention_mask=return_attention_mask,
+            return_overflowing_tokens=return_overflowing_tokens,
+            return_special_tokens_mask=return_special_tokens_mask,
+            return_offsets_mapping=return_offsets_mapping,
+            return_length=return_length,
+            verbose=verbose,
+            **kwargs,
+        )
+
+        return encoded_inputs["input_ids"]
+
+    @add_end_docstrings(LAYOUTLMV2_ENCODE_KWARGS_DOCSTRING, LAYOUTLMV2_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
+    def encode_plus(
+        self,
+        text: Union[TextInput, PreTokenizedInput],
+        text_pair: Optional[PreTokenizedInput] = None,
+        boxes: Optional[list[list[int]]] = None,
+        word_labels: Optional[list[int]] = None,
+        add_special_tokens: bool = True,
+        padding: Union[bool, str, PaddingStrategy] = False,
+        truncation: Union[bool, str, TruncationStrategy] = None,
+        max_length: Optional[int] = None,
+        stride: int = 0,
+        pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[str] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        return_token_type_ids: Optional[bool] = None,
+        return_attention_mask: Optional[bool] = None,
+        return_overflowing_tokens: bool = False,
+        return_special_tokens_mask: bool = False,
+        return_offsets_mapping: bool = False,
+        return_length: bool = False,
+        verbose: bool = True,
+        **kwargs,
+    ) -> BatchEncoding:
+        """
+        Tokenize and prepare for the model a sequence or a pair of sequences. .. warning:: This method is deprecated,
+        `__call__` should be used instead.
+
+        Args:
+            text (`str`, `List[str]`, `List[List[str]]`):
+                The first sequence to be encoded. This can be a string, a list of strings or a list of list of strings.
+            text_pair (`List[str]` or `List[int]`, *optional*):
+                Optional second sequence to be encoded. This can be a list of strings (words of a single example) or a
+                list of list of strings (words of a batch of examples).
+        """
+
+        # Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
+        padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
+            padding=padding,
+            truncation=truncation,
+            max_length=max_length,
+            pad_to_multiple_of=pad_to_multiple_of,
+            verbose=verbose,
+            **kwargs,
+        )
+
+        return self._encode_plus(
+            text=text,
+            boxes=boxes,
+            text_pair=text_pair,
+            word_labels=word_labels,
+            add_special_tokens=add_special_tokens,
+            padding_strategy=padding_strategy,
+            truncation_strategy=truncation_strategy,
+            max_length=max_length,
+            stride=stride,
+            pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
+            return_tensors=return_tensors,
+            return_token_type_ids=return_token_type_ids,
+            return_attention_mask=return_attention_mask,
+            return_overflowing_tokens=return_overflowing_tokens,
+            return_special_tokens_mask=return_special_tokens_mask,
+            return_offsets_mapping=return_offsets_mapping,
+            return_length=return_length,
+            verbose=verbose,
+            **kwargs,
+        )
+
+    def _encode_plus(
+        self,
+        text: Union[TextInput, PreTokenizedInput],
+        text_pair: Optional[PreTokenizedInput] = None,
+        boxes: Optional[list[list[int]]] = None,
+        word_labels: Optional[list[int]] = None,
+        add_special_tokens: bool = True,
+        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
+        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
+        max_length: Optional[int] = None,
+        stride: int = 0,
+        pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[str] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        return_token_type_ids: Optional[bool] = None,
+        return_attention_mask: Optional[bool] = None,
+        return_overflowing_tokens: bool = False,
+        return_special_tokens_mask: bool = False,
+        return_offsets_mapping: bool = False,
+        return_length: bool = False,
+        verbose: bool = True,
+        **kwargs,
+    ) -> BatchEncoding:
+        if return_offsets_mapping:
+            raise NotImplementedError(
+                "return_offset_mapping is not available when using Python tokenizers. "
+                "To use this feature, change your tokenizer to one deriving from "
+                "transformers.PreTrainedTokenizerFast. "
+                "More information on available tokenizers at "
+                "https://github.com/huggingface/transformers/pull/2674"
+            )
+
+        return self.prepare_for_model(
+            text=text,
+            text_pair=text_pair,
+            boxes=boxes,
+            word_labels=word_labels,
+            add_special_tokens=add_special_tokens,
+            padding=padding_strategy.value,
+            truncation=truncation_strategy.value,
+            max_length=max_length,
+            stride=stride,
+            pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
+            return_tensors=return_tensors,
+            prepend_batch_axis=True,
+            return_attention_mask=return_attention_mask,
+            return_token_type_ids=return_token_type_ids,
+            return_overflowing_tokens=return_overflowing_tokens,
+            return_special_tokens_mask=return_special_tokens_mask,
+            return_length=return_length,
+            verbose=verbose,
+        )
+
+    @add_end_docstrings(LAYOUTLMV2_ENCODE_KWARGS_DOCSTRING, LAYOUTLMV2_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
+    def prepare_for_model(
+        self,
+        text: Union[TextInput, PreTokenizedInput],
+        text_pair: Optional[PreTokenizedInput] = None,
+        boxes: Optional[list[list[int]]] = None,
+        word_labels: Optional[list[int]] = None,
+        add_special_tokens: bool = True,
+        padding: Union[bool, str, PaddingStrategy] = False,
+        truncation: Union[bool, str, TruncationStrategy] = None,
+        max_length: Optional[int] = None,
+        stride: int = 0,
+        pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[str] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        return_token_type_ids: Optional[bool] = None,
+        return_attention_mask: Optional[bool] = None,
+        return_overflowing_tokens: bool = False,
+        return_special_tokens_mask: bool = False,
+        return_offsets_mapping: bool = False,
+        return_length: bool = False,
+        verbose: bool = True,
+        prepend_batch_axis: bool = False,
+        **kwargs,
+    ) -> BatchEncoding:
+        """
+        Prepares a sequence or a pair of sequences so that it can be used by the model. It adds special tokens,
+        truncates sequences if overflowing while taking into account the special tokens and manages a moving window
+        (with user defined stride) for overflowing tokens. Please Note, for *text_pair* different than `None` and
+        *truncation_strategy = longest_first* or `True`, it is not possible to return overflowing tokens. Such a
+        combination of arguments will raise an error.
+
+        Word-level `boxes` are turned into token-level `bbox`. If provided, word-level `word_labels` are turned into
+        token-level `labels`. The word label is used for the first token of the word, while remaining tokens are
+        labeled with -100, such that they will be ignored by the loss function.
+
+        Args:
+            text (`str`, `List[str]`, `List[List[str]]`):
+                The first sequence to be encoded. This can be a string, a list of strings or a list of list of strings.
+            text_pair (`List[str]` or `List[int]`, *optional*):
+                Optional second sequence to be encoded. This can be a list of strings (words of a single example) or a
+                list of list of strings (words of a batch of examples).
+        """
+
+        # Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
+        padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
+            padding=padding,
+            truncation=truncation,
+            max_length=max_length,
+            pad_to_multiple_of=pad_to_multiple_of,
+            verbose=verbose,
+            **kwargs,
+        )
+
+        tokens = []
+        pair_tokens = []
+        token_boxes = []
+        pair_token_boxes = []
+        labels = []
+
+        if text_pair is None:
+            if word_labels is None:
+                # CASE 1: document image classification (training + inference) + CASE 2: token classification (inference)
+                for word, box in zip(text, boxes):
+                    if len(word) < 1:  # skip empty words
+                        continue
+                    word_tokens = self.tokenize(word)
+                    tokens.extend(word_tokens)
+                    token_boxes.extend([box] * len(word_tokens))
+            else:
+                # CASE 2: token classification (training)
+                for word, box, label in zip(text, boxes, word_labels):
+                    if len(word) < 1:  # skip empty words
+                        continue
+                    word_tokens = self.tokenize(word)
+                    tokens.extend(word_tokens)
+                    token_boxes.extend([box] * len(word_tokens))
+                    if self.only_label_first_subword:
+                        # Use the real label id for the first token of the word, and padding ids for the remaining tokens
+                        labels.extend([label] + [self.pad_token_label] * (len(word_tokens) - 1))
+                    else:
+                        labels.extend([label] * len(word_tokens))
+        else:
+            # CASE 3: document visual question answering (inference)
+            # text = question
+            # text_pair = words
+            tokens = self.tokenize(text)
+            token_boxes = [self.pad_token_box for _ in range(len(tokens))]
+
+            for word, box in zip(text_pair, boxes):
+                if len(word) < 1:  # skip empty words
+                    continue
+                word_tokens = self.tokenize(word)
+                pair_tokens.extend(word_tokens)
+                pair_token_boxes.extend([box] * len(word_tokens))
+
+        # Create ids + pair_ids
+        ids = self.convert_tokens_to_ids(tokens)
+        pair_ids = self.convert_tokens_to_ids(pair_tokens) if pair_tokens else None
+
+        if (
+            return_overflowing_tokens
+            and truncation_strategy == TruncationStrategy.LONGEST_FIRST
+            and pair_ids is not None
+        ):
+            raise ValueError(
+                "Not possible to return overflowing tokens for pair of sequences with the "
+                "`longest_first`. Please select another truncation strategy than `longest_first`, "
+                "for instance `only_second` or `only_first`."
+            )
+
+        # Compute the total size of the returned encodings
+        pair = bool(pair_ids is not None)
+        len_ids = len(ids)
+        len_pair_ids = len(pair_ids) if pair else 0
+        total_len = len_ids + len_pair_ids + (self.num_special_tokens_to_add(pair=pair) if add_special_tokens else 0)
+
+        # Truncation: Handle max sequence length
+        overflowing_tokens = []
+        overflowing_token_boxes = []
+        overflowing_labels = []
+        if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE and max_length and total_len > max_length:
+            (
+                ids,
+                token_boxes,
+                pair_ids,
+                pair_token_boxes,
+                labels,
+                overflowing_tokens,
+                overflowing_token_boxes,
+                overflowing_labels,
+            ) = self.truncate_sequences(
+                ids,
+                token_boxes,
+                pair_ids=pair_ids,
+                pair_token_boxes=pair_token_boxes,
+                labels=labels,
+                num_tokens_to_remove=total_len - max_length,
+                truncation_strategy=truncation_strategy,
+                stride=stride,
+            )
+
+        if return_token_type_ids and not add_special_tokens:
+            raise ValueError(
+                "Asking to return token_type_ids while setting add_special_tokens to False "
+                "results in an undefined behavior. Please set add_special_tokens to True or "
+                "set return_token_type_ids to None."
+            )
+
+        # Load from model defaults
+        if return_token_type_ids is None:
+            return_token_type_ids = "token_type_ids" in self.model_input_names
+        if return_attention_mask is None:
+            return_attention_mask = "attention_mask" in self.model_input_names
+
+        encoded_inputs = {}
+
+        if return_overflowing_tokens:
+            encoded_inputs["overflowing_tokens"] = overflowing_tokens
+            encoded_inputs["overflowing_token_boxes"] = overflowing_token_boxes
+            encoded_inputs["overflowing_labels"] = overflowing_labels
+            encoded_inputs["num_truncated_tokens"] = total_len - max_length
+
+        # Add special tokens
+        if add_special_tokens:
+            sequence = self.build_inputs_with_special_tokens(ids, pair_ids)
+            token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids)
+            token_boxes = [self.cls_token_box] + token_boxes + [self.sep_token_box]
+            if pair_token_boxes:
+                pair_token_boxes = pair_token_boxes + [self.sep_token_box]
+            if labels:
+                labels = [self.pad_token_label] + labels + [self.pad_token_label]
+        else:
+            sequence = ids + pair_ids if pair else ids
+            token_type_ids = [0] * len(ids) + ([0] * len(pair_ids) if pair else [])
+
+        # Build output dictionary
+        encoded_inputs["input_ids"] = sequence
+        encoded_inputs["bbox"] = token_boxes + pair_token_boxes
+        if return_token_type_ids:
+            encoded_inputs["token_type_ids"] = token_type_ids
+        if return_special_tokens_mask:
+            if add_special_tokens:
+                encoded_inputs["special_tokens_mask"] = self.get_special_tokens_mask(ids, pair_ids)
+            else:
+                encoded_inputs["special_tokens_mask"] = [0] * len(sequence)
+
+        if labels:
+            encoded_inputs["labels"] = labels
+
+        # Check lengths
+        self._eventual_warn_about_too_long_sequence(encoded_inputs["input_ids"], max_length, verbose)
+
+        # Padding
+        if padding_strategy != PaddingStrategy.DO_NOT_PAD or return_attention_mask:
+            encoded_inputs = self.pad(
+                encoded_inputs,
+                max_length=max_length,
+                padding=padding_strategy.value,
+                pad_to_multiple_of=pad_to_multiple_of,
+                padding_side=padding_side,
+                return_attention_mask=return_attention_mask,
+            )
+
+        if return_length:
+            encoded_inputs["length"] = len(encoded_inputs["input_ids"])
+
+        batch_outputs = BatchEncoding(
+            encoded_inputs, tensor_type=return_tensors, prepend_batch_axis=prepend_batch_axis
+        )
+
+        return batch_outputs
+
+    def truncate_sequences(
+        self,
+        ids: list[int],
+        token_boxes: list[list[int]],
+        pair_ids: Optional[list[int]] = None,
+        pair_token_boxes: Optional[list[list[int]]] = None,
+        labels: Optional[list[int]] = None,
+        num_tokens_to_remove: int = 0,
+        truncation_strategy: Union[str, TruncationStrategy] = "longest_first",
+        stride: int = 0,
+    ) -> tuple[list[int], list[int], list[int]]:
+        """
+        Truncates a sequence pair in-place following the strategy.
+
+        Args:
+            ids (`List[int]`):
+                Tokenized input ids of the first sequence. Can be obtained from a string by chaining the `tokenize` and
+                `convert_tokens_to_ids` methods.
+            token_boxes (`List[List[int]]`):
+                Bounding boxes of the first sequence.
+            pair_ids (`List[int]`, *optional*):
+                Tokenized input ids of the second sequence. Can be obtained from a string by chaining the `tokenize`
+                and `convert_tokens_to_ids` methods.
+            pair_token_boxes (`List[List[int]]`, *optional*):
+                Bounding boxes of the second sequence.
+            labels (`List[int]`, *optional*):
+                Labels of the first sequence (for token classification tasks).
+            num_tokens_to_remove (`int`, *optional*, defaults to 0):
+                Number of tokens to remove using the truncation strategy.
+            truncation_strategy (`str` or [`~tokenization_utils_base.TruncationStrategy`], *optional*, defaults to `False`):
+                The strategy to follow for truncation. Can be:
+
+                - `'longest_first'`: Truncate to a maximum length specified with the argument `max_length` or to the
+                  maximum acceptable input length for the model if that argument is not provided. This will truncate
+                  token by token, removing a token from the longest sequence in the pair if a pair of sequences (or a
+                  batch of pairs) is provided.
+                - `'only_first'`: Truncate to a maximum length specified with the argument `max_length` or to the
+                  maximum acceptable input length for the model if that argument is not provided. This will only
+                  truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
+                - `'only_second'`: Truncate to a maximum length specified with the argument `max_length` or to the
+                  maximum acceptable input length for the model if that argument is not provided. This will only
+                  truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
+                - `'do_not_truncate'` (default): No truncation (i.e., can output batch with sequence lengths greater
+                  than the model maximum admissible input size).
+            stride (`int`, *optional*, defaults to 0):
+                If set to a positive number, the overflowing tokens returned will contain some tokens from the main
+                sequence returned. The value of this argument defines the number of additional tokens.
+
+        Returns:
+            `Tuple[List[int], List[int], List[int]]`: The truncated `ids`, the truncated `pair_ids` and the list of
+            overflowing tokens. Note: The *longest_first* strategy returns empty list of overflowing tokens if a pair
+            of sequences (or a batch of pairs) is provided.
+        """
+        if num_tokens_to_remove <= 0:
+            return ids, token_boxes, pair_ids, pair_token_boxes, labels, [], [], []
+
+        if not isinstance(truncation_strategy, TruncationStrategy):
+            truncation_strategy = TruncationStrategy(truncation_strategy)
+
+        overflowing_tokens = []
+        overflowing_token_boxes = []
+        overflowing_labels = []
+        if truncation_strategy == TruncationStrategy.ONLY_FIRST or (
+            truncation_strategy == TruncationStrategy.LONGEST_FIRST and pair_ids is None
+        ):
+            if len(ids) > num_tokens_to_remove:
+                window_len = min(len(ids), stride + num_tokens_to_remove)
+                overflowing_tokens = ids[-window_len:]
+                overflowing_token_boxes = token_boxes[-window_len:]
+                overflowing_labels = labels[-window_len:]
+                ids = ids[:-num_tokens_to_remove]
+                token_boxes = token_boxes[:-num_tokens_to_remove]
+                labels = labels[:-num_tokens_to_remove]
+            else:
+                error_msg = (
+                    f"We need to remove {num_tokens_to_remove} to truncate the input "
+                    f"but the first sequence has a length {len(ids)}. "
+                )
+                if truncation_strategy == TruncationStrategy.ONLY_FIRST:
+                    error_msg = (
+                        error_msg + "Please select another truncation strategy than "
+                        f"{truncation_strategy}, for instance 'longest_first' or 'only_second'."
+                    )
+                logger.error(error_msg)
+        elif truncation_strategy == TruncationStrategy.LONGEST_FIRST:
+            logger.warning(
+                "Be aware, overflowing tokens are not returned for the setting you have chosen,"
+                f" i.e. sequence pairs with the '{TruncationStrategy.LONGEST_FIRST.value}' "
+                "truncation strategy. So the returned list will always be empty even if some "
+                "tokens have been removed."
+            )
+            for _ in range(num_tokens_to_remove):
+                if pair_ids is None or len(ids) > len(pair_ids):
+                    ids = ids[:-1]
+                    token_boxes = token_boxes[:-1]
+                    labels = labels[:-1]
+                else:
+                    pair_ids = pair_ids[:-1]
+                    pair_token_boxes = pair_token_boxes[:-1]
+        elif truncation_strategy == TruncationStrategy.ONLY_SECOND and pair_ids is not None:
+            if len(pair_ids) > num_tokens_to_remove:
+                window_len = min(len(pair_ids), stride + num_tokens_to_remove)
+                overflowing_tokens = pair_ids[-window_len:]
+                overflowing_token_boxes = pair_token_boxes[-window_len:]
+                pair_ids = pair_ids[:-num_tokens_to_remove]
+                pair_token_boxes = pair_token_boxes[:-num_tokens_to_remove]
+            else:
+                logger.error(
+                    f"We need to remove {num_tokens_to_remove} to truncate the input "
+                    f"but the second sequence has a length {len(pair_ids)}. "
+                    f"Please select another truncation strategy than {truncation_strategy}, "
+                    "for instance 'longest_first' or 'only_first'."
+                )
+
+        return (
+            ids,
+            token_boxes,
+            pair_ids,
+            pair_token_boxes,
+            labels,
+            overflowing_tokens,
+            overflowing_token_boxes,
+            overflowing_labels,
+        )
+
+    def _pad(
+        self,
+        encoded_inputs: Union[dict[str, EncodedInput], BatchEncoding],
+        max_length: Optional[int] = None,
+        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
+        pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[str] = None,
+        return_attention_mask: Optional[bool] = None,
+    ) -> dict:
+        """
+        Pad encoded inputs (on left/right and up to predefined length or max length in the batch)
+
+        Args:
+            encoded_inputs:
+                Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`).
+            max_length: maximum length of the returned list and optionally padding length (see below).
+                Will truncate by taking into account the special tokens.
+            padding_strategy: PaddingStrategy to use for padding.
+
+                - PaddingStrategy.LONGEST Pad to the longest sequence in the batch
+                - PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
+                - PaddingStrategy.DO_NOT_PAD: Do not pad
+                The tokenizer padding sides are defined in self.padding_side:
+
+                    - 'left': pads on the left of the sequences
+                    - 'right': pads on the right of the sequences
+            pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
+                This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
+                `>= 7.5` (Volta).
+            padding_side:
+                The side on which the model should have padding applied. Should be selected between ['right', 'left'].
+                Default value is picked from the class attribute of the same name.
+            return_attention_mask:
+                (optional) Set to False to avoid returning attention mask (default: set to model specifics)
+        """
+        # Load from model defaults
+        if return_attention_mask is None:
+            return_attention_mask = "attention_mask" in self.model_input_names
+
+        required_input = encoded_inputs[self.model_input_names[0]]
+
+        if padding_strategy == PaddingStrategy.LONGEST:
+            max_length = len(required_input)
+
+        if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
+            max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
+
+        needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length
+
+        # Initialize attention mask if not present.
+        if return_attention_mask and "attention_mask" not in encoded_inputs:
+            encoded_inputs["attention_mask"] = [1] * len(required_input)
+
+        if needs_to_be_padded:
+            difference = max_length - len(required_input)
+            padding_side = padding_side if padding_side is not None else self.padding_side
+            if padding_side == "right":
+                if return_attention_mask:
+                    encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference
+                if "token_type_ids" in encoded_inputs:
+                    encoded_inputs["token_type_ids"] = (
+                        encoded_inputs["token_type_ids"] + [self.pad_token_type_id] * difference
+                    )
+                if "bbox" in encoded_inputs:
+                    encoded_inputs["bbox"] = encoded_inputs["bbox"] + [self.pad_token_box] * difference
+                if "labels" in encoded_inputs:
+                    encoded_inputs["labels"] = encoded_inputs["labels"] + [self.pad_token_label] * difference
+                if "special_tokens_mask" in encoded_inputs:
+                    encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference
+                encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference
+            elif padding_side == "left":
+                if return_attention_mask:
+                    encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
+                if "token_type_ids" in encoded_inputs:
+                    encoded_inputs["token_type_ids"] = [self.pad_token_type_id] * difference + encoded_inputs[
+                        "token_type_ids"
+                    ]
+                if "bbox" in encoded_inputs:
+                    encoded_inputs["bbox"] = [self.pad_token_box] * difference + encoded_inputs["bbox"]
+                if "labels" in encoded_inputs:
+                    encoded_inputs["labels"] = [self.pad_token_label] * difference + encoded_inputs["labels"]
+                if "special_tokens_mask" in encoded_inputs:
+                    encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
+                encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
+            else:
+                raise ValueError("Invalid padding strategy:" + str(padding_side))
+
+        return encoded_inputs
+
+
+# Copied from transformers.models.bert.tokenization_bert.BasicTokenizer
+class BasicTokenizer:
+    """
+    Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.).
+
+    Args:
+        do_lower_case (`bool`, *optional*, defaults to `True`):
+            Whether or not to lowercase the input when tokenizing.
+        never_split (`Iterable`, *optional*):
+            Collection of tokens which will never be split during tokenization. Only has an effect when
+            `do_basic_tokenize=True`
+        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
+            Whether or not to tokenize Chinese characters.
+
+            This should likely be deactivated for Japanese (see this
+            [issue](https://github.com/huggingface/transformers/issues/328)).
+        strip_accents (`bool`, *optional*):
+            Whether or not to strip all accents. If this option is not specified, then it will be determined by the
+            value for `do_lower_case` (as in the original BERT).
+        do_split_on_punc (`bool`, *optional*, defaults to `True`):
+            In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
+            the full context of the words, such as contractions.
+    """
+
+    def __init__(
+        self,
+        do_lower_case=True,
+        never_split=None,
+        tokenize_chinese_chars=True,
+        strip_accents=None,
+        do_split_on_punc=True,
+    ):
+        if never_split is None:
+            never_split = []
+        self.do_lower_case = do_lower_case
+        self.never_split = set(never_split)
+        self.tokenize_chinese_chars = tokenize_chinese_chars
+        self.strip_accents = strip_accents
+        self.do_split_on_punc = do_split_on_punc
+
+    def tokenize(self, text, never_split=None):
+        """
+        Basic Tokenization of a piece of text. For sub-word tokenization, see WordpieceTokenizer.
+
+        Args:
+            never_split (`List[str]`, *optional*):
+                Kept for backward compatibility purposes. Now implemented directly at the base class level (see
+                [`PreTrainedTokenizer.tokenize`]). List of tokens not to split.
+        """
+        # union() returns a new set, so the per-call tokens never modify self.never_split.
+        never_split = self.never_split.union(set(never_split)) if never_split else self.never_split
+        text = self._clean_text(text)
+
+        # This was added on November 1st, 2018 for the multilingual and Chinese
+        # models. This is also applied to the English models now, but it doesn't
+        # matter since the English models were not trained on any Chinese data
+        # and generally don't have any Chinese data in them (there are Chinese
+        # characters in the vocabulary because Wikipedia does have some Chinese
+        # words in the English Wikipedia.).
+        if self.tokenize_chinese_chars:
+            text = self._tokenize_chinese_chars(text)
+        # prevents treating the same character with different unicode codepoints as different characters
+        unicode_normalized_text = unicodedata.normalize("NFC", text)
+        orig_tokens = whitespace_tokenize(unicode_normalized_text)
+        split_tokens = []
+        for token in orig_tokens:
+            if token not in never_split:
+                if self.do_lower_case:
+                    token = token.lower()
+                    if self.strip_accents is not False:  # None (unset) strips too when lowercasing
+                        token = self._run_strip_accents(token)
+                elif self.strip_accents:
+                    token = self._run_strip_accents(token)
+            split_tokens.extend(self._run_split_on_punc(token, never_split))
+
+        output_tokens = whitespace_tokenize(" ".join(split_tokens))
+        return output_tokens
+
+    def _run_strip_accents(self, text):
+        """Strips accents from a piece of text."""
+        text = unicodedata.normalize("NFD", text)
+        output = []
+        for char in text:
+            cat = unicodedata.category(char)
+            if cat == "Mn":  # Unicode category "Mark, nonspacing", i.e. the decomposed accents
+                continue
+            output.append(char)
+        return "".join(output)
+
+    def _run_split_on_punc(self, text, never_split=None):
+        """Splits punctuation on a piece of text."""
+        if not self.do_split_on_punc or (never_split is not None and text in never_split):
+            return [text]
+        chars = list(text)
+        i = 0
+        start_new_word = True
+        output = []
+        while i < len(chars):
+            char = chars[i]
+            if _is_punctuation(char):
+                output.append([char])  # each punctuation char becomes its own piece
+                start_new_word = True
+            else:
+                if start_new_word:
+                    output.append([])  # open a new piece at the start or right after punctuation
+                start_new_word = False
+                output[-1].append(char)
+            i += 1
+
+        return ["".join(x) for x in output]
+
+    def _tokenize_chinese_chars(self, text):
+        """Adds whitespace around any CJK character."""
+        output = []
+        for char in text:
+            cp = ord(char)
+            if self._is_chinese_char(cp):
+                output.append(" ")
+                output.append(char)
+                output.append(" ")
+            else:
+                output.append(char)
+        return "".join(output)
+
+    def _is_chinese_char(self, cp):
+        """Checks whether CP is the codepoint of a CJK character."""
+        # This defines a "chinese character" as anything in the CJK Unicode block:
+        #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
+        #
+        # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
+        # despite its name. The modern Korean Hangul alphabet is a different block,
+        # as is Japanese Hiragana and Katakana. Those alphabets are used to write
+        # space-separated words, so they are not treated specially and are handled
+        # like all of the other languages.
+        if (
+            (cp >= 0x4E00 and cp <= 0x9FFF)
+            or (cp >= 0x3400 and cp <= 0x4DBF)
+            or (cp >= 0x20000 and cp <= 0x2A6DF)
+            or (cp >= 0x2A700 and cp <= 0x2B73F)
+            or (cp >= 0x2B740 and cp <= 0x2B81F)
+            or (cp >= 0x2B820 and cp <= 0x2CEAF)
+            or (cp >= 0xF900 and cp <= 0xFAFF)
+            or (cp >= 0x2F800 and cp <= 0x2FA1F)
+        ):
+            return True
+
+        return False
+
+    def _clean_text(self, text):
+        """Performs invalid character removal and whitespace cleanup on text."""
+        output = []
+        for char in text:
+            cp = ord(char)
+            if cp == 0 or cp == 0xFFFD or _is_control(char):  # NUL, U+FFFD, or flagged by _is_control
+                continue
+            if _is_whitespace(char):
+                output.append(" ")
+            else:
+                output.append(char)
+        return "".join(output)
+
+
+# Copied from transformers.models.bert.tokenization_bert.WordpieceTokenizer
+class WordpieceTokenizer:
+    """Runs greedy longest-match-first WordPiece tokenization against a given vocabulary."""
+
+    def __init__(self, vocab, unk_token, max_input_chars_per_word=100):
+        self.vocab = vocab
+        self.unk_token = unk_token
+        self.max_input_chars_per_word = max_input_chars_per_word
+
+    def tokenize(self, text):
+        """
+        Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform
+        tokenization using the given vocabulary.
+
+        For example, `input = "unaffable"` will return as output `["un", "##aff", "##able"]`.
+
+        Args:
+            text: A single token or whitespace separated tokens. This should have
+                already been passed through *BasicTokenizer*.
+
+        Returns:
+            A list of wordpiece tokens.
+        """
+
+        output_tokens = []
+        for token in whitespace_tokenize(text):
+            chars = list(token)
+            if len(chars) > self.max_input_chars_per_word:  # over-long words map straight to unk_token
+                output_tokens.append(self.unk_token)
+                continue
+
+            is_bad = False
+            start = 0
+            sub_tokens = []
+            while start < len(chars):
+                end = len(chars)
+                cur_substr = None
+                while start < end:
+                    substr = "".join(chars[start:end])
+                    if start > 0:
+                        substr = "##" + substr  # non-initial pieces are looked up with the "##" prefix
+                    if substr in self.vocab:
+                        cur_substr = substr
+                        break
+                    end -= 1  # no match: retry with a candidate one character shorter
+                if cur_substr is None:  # not even a single character at `start` is in the vocab
+                    is_bad = True
+                    break
+                sub_tokens.append(cur_substr)
+                start = end  # continue right after the matched piece
+
+            if is_bad:  # one unmatched span turns the whole word into unk_token
+                output_tokens.append(self.unk_token)
+            else:
+                output_tokens.extend(sub_tokens)
+        return output_tokens
+
+
+__all__ = ["LayoutLMv2Tokenizer"]  # the only name exported by a star-import of this module
diff --git a/phivenv/Lib/site-packages/transformers/models/layoutlmv2/tokenization_layoutlmv2_fast.py b/phivenv/Lib/site-packages/transformers/models/layoutlmv2/tokenization_layoutlmv2_fast.py
new file mode 100644
index 0000000000000000000000000000000000000000..8e324ee0b8fe971b06cd102eaa9930990a1f5b99
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/layoutlmv2/tokenization_layoutlmv2_fast.py
@@ -0,0 +1,789 @@
+# coding=utf-8
+# Copyright 2021 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Fast tokenization class for LayoutLMv2. It overrides two methods of the slow tokenizer class, namely _batch_encode_plus
+and _encode_plus, in which the Rust tokenizer is used.
+"""
+
+import json
+from typing import Optional, Union
+
+from tokenizers import normalizers
+
+from ...tokenization_utils_base import (
+    BatchEncoding,
+    EncodedInput,
+    PaddingStrategy,
+    PreTokenizedInput,
+    TensorType,
+    TextInput,
+    TextInputPair,
+    TruncationStrategy,
+)
+from ...tokenization_utils_fast import PreTrainedTokenizerFast
+from ...utils import add_end_docstrings, logging
+from .tokenization_layoutlmv2 import (
+    LAYOUTLMV2_ENCODE_KWARGS_DOCSTRING,
+    LAYOUTLMV2_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING,
+    LayoutLMv2Tokenizer,
+)
+
+
+logger = logging.get_logger(__name__)  # module-level logger
+
+VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"}  # see `vocab_files_names`
+
+
+class LayoutLMv2TokenizerFast(PreTrainedTokenizerFast):
+    r"""
+    Construct a "fast" LayoutLMv2 tokenizer (backed by HuggingFace's *tokenizers* library). Based on WordPiece.
+
+    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
+    refer to this superclass for more information regarding those methods.
+
+    Args:
+        vocab_file (`str`):
+            File containing the vocabulary.
+        do_lower_case (`bool`, *optional*, defaults to `True`):
+            Whether or not to lowercase the input when tokenizing.
+        unk_token (`str`, *optional*, defaults to `"[UNK]"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        sep_token (`str`, *optional*, defaults to `"[SEP]"`):
+            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
+            sequence classification or for a text and a question for question answering. It is also used as the last
+            token of a sequence built with special tokens.
+        pad_token (`str`, *optional*, defaults to `"[PAD]"`):
+            The token used for padding, for example when batching sequences of different lengths.
+        cls_token (`str`, *optional*, defaults to `"[CLS]"`):
+            The classifier token which is used when doing sequence classification (classification of the whole sequence
+            instead of per-token classification). It is the first token of the sequence when built with special tokens.
+        mask_token (`str`, *optional*, defaults to `"[MASK]"`):
+            The token used for masking values. This is the token used when training this model with masked language
+            modeling. This is the token which the model will try to predict.
+        cls_token_box (`List[int]`, *optional*, defaults to `[0, 0, 0, 0]`):
+            The bounding box to use for the special [CLS] token.
+        sep_token_box (`List[int]`, *optional*, defaults to `[1000, 1000, 1000, 1000]`):
+            The bounding box to use for the special [SEP] token.
+        pad_token_box (`List[int]`, *optional*, defaults to `[0, 0, 0, 0]`):
+            The bounding box to use for the special [PAD] token.
+        pad_token_label (`int`, *optional*, defaults to -100):
+            The label to use for padding tokens. Defaults to -100, which is the `ignore_index` of PyTorch's
+            CrossEntropyLoss.
+        only_label_first_subword (`bool`, *optional*, defaults to `True`):
+            Whether or not to only label the first subword, in case word labels are provided.
+        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
+            Whether or not to tokenize Chinese characters. This should likely be deactivated for Japanese (see [this
+            issue](https://github.com/huggingface/transformers/issues/328)).
+        strip_accents (`bool`, *optional*):
+            Whether or not to strip all accents. If this option is not specified, then it will be determined by the
+            value for `do_lower_case` (as in the original LayoutLMv2).
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    slow_tokenizer_class = LayoutLMv2Tokenizer
+
+    def __init__(
+        self,
+        vocab_file=None,
+        tokenizer_file=None,
+        do_lower_case=True,
+        unk_token="[UNK]",
+        sep_token="[SEP]",
+        pad_token="[PAD]",
+        cls_token="[CLS]",
+        mask_token="[MASK]",
+        cls_token_box=[0, 0, 0, 0],
+        sep_token_box=[1000, 1000, 1000, 1000],
+        pad_token_box=[0, 0, 0, 0],
+        pad_token_label=-100,
+        only_label_first_subword=True,
+        tokenize_chinese_chars=True,
+        strip_accents=None,
+        **kwargs,
+    ):
+        # Copy the box lists so this instance never aliases the shared mutable default arguments.
+        cls_token_box, sep_token_box, pad_token_box = list(cls_token_box), list(sep_token_box), list(pad_token_box)
+        super().__init__(
+            vocab_file,
+            tokenizer_file=tokenizer_file,
+            do_lower_case=do_lower_case,
+            unk_token=unk_token,
+            sep_token=sep_token,
+            pad_token=pad_token,
+            cls_token=cls_token,
+            mask_token=mask_token,
+            cls_token_box=cls_token_box,
+            sep_token_box=sep_token_box,
+            pad_token_box=pad_token_box,
+            pad_token_label=pad_token_label,
+            only_label_first_subword=only_label_first_subword,
+            tokenize_chinese_chars=tokenize_chinese_chars,
+            strip_accents=strip_accents,
+            **kwargs,
+        )
+        # Rebuild the backend normalizer when its serialized options disagree with the arguments given here.
+        pre_tok_state = json.loads(self.backend_tokenizer.normalizer.__getstate__())
+        if (
+            pre_tok_state.get("lowercase", do_lower_case) != do_lower_case
+            or pre_tok_state.get("strip_accents", strip_accents) != strip_accents
+        ):
+            pre_tok_class = getattr(normalizers, pre_tok_state.pop("type"))
+            pre_tok_state["lowercase"] = do_lower_case
+            pre_tok_state["strip_accents"] = strip_accents
+            self.backend_tokenizer.normalizer = pre_tok_class(**pre_tok_state)
+
+        self.do_lower_case = do_lower_case
+        self.cls_token_box = cls_token_box  # LayoutLMv2-specific attributes; boxes are the copies made above
+        self.sep_token_box = sep_token_box
+        self.pad_token_box = pad_token_box
+        self.pad_token_label = pad_token_label
+        self.only_label_first_subword = only_label_first_subword
+
+    @add_end_docstrings(LAYOUTLMV2_ENCODE_KWARGS_DOCSTRING, LAYOUTLMV2_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
+    def __call__(
+        self,
+        text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]],
+        text_pair: Optional[Union[PreTokenizedInput, list[PreTokenizedInput]]] = None,
+        boxes: Optional[Union[list[list[int]], list[list[list[int]]]]] = None,
+        word_labels: Optional[Union[list[int], list[list[int]]]] = None,
+        add_special_tokens: bool = True,
+        padding: Union[bool, str, PaddingStrategy] = False,
+        truncation: Union[bool, str, TruncationStrategy] = None,
+        max_length: Optional[int] = None,
+        stride: int = 0,
+        pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[str] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        return_token_type_ids: Optional[bool] = None,
+        return_attention_mask: Optional[bool] = None,
+        return_overflowing_tokens: bool = False,
+        return_special_tokens_mask: bool = False,
+        return_offsets_mapping: bool = False,
+        return_length: bool = False,
+        verbose: bool = True,
+        **kwargs,
+    ) -> BatchEncoding:
+        """
+        Main method to tokenize and prepare for the model one or several sequence(s) or one or several pair(s) of
+        sequences with word-level normalized bounding boxes and optional labels.
+
+        Args:
+            text (`str`, `List[str]`, `List[List[str]]`):
+                The sequence or batch of sequences to be encoded. Each sequence can be a string, a list of strings
+                (words of a single example or questions of a batch of examples) or a list of list of strings (batch of
+                words).
+            text_pair (`List[str]`, `List[List[str]]`):
+                The sequence or batch of sequences to be encoded. Each sequence should be a list of strings
+                (pretokenized string).
+            boxes (`List[List[int]]`, `List[List[List[int]]]`):
+                Word-level bounding boxes. Each bounding box should be normalized to be on a 0-1000 scale.
+            word_labels (`List[int]`, `List[List[int]]`, *optional*):
+                Word-level integer labels (for token classification tasks such as FUNSD, CORD).
+        """
+
+        # Input type checking, so that malformed inputs fail early with a clear error
+        def _is_valid_text_input(t):
+            if isinstance(t, str):
+                # Strings are fine
+                return True
+            elif isinstance(t, (list, tuple)):
+                # Lists are fine as long as they are...
+                if len(t) == 0:
+                    # ... empty
+                    return True
+                elif isinstance(t[0], str):
+                    # ... list of strings
+                    return True
+                elif isinstance(t[0], (list, tuple)):
+                    # ... list with an empty list or with a list of strings
+                    return len(t[0]) == 0 or isinstance(t[0][0], str)
+                else:
+                    return False
+            else:
+                return False
+
+        if text_pair is not None:
+            # in case text + text_pair are provided, text = questions, text_pair = words
+            if not _is_valid_text_input(text):
+                raise ValueError("text input must of type `str` (single example) or `List[str]` (batch of examples). ")
+            if not isinstance(text_pair, (list, tuple)):
+                raise ValueError(
+                    "Words must be of type `List[str]` (single pretokenized example), "
+                    "or `List[List[str]]` (batch of pretokenized examples)."
+                )
+        else:
+            # in case only text is provided => must be words
+            if not isinstance(text, (list, tuple)):
+                raise ValueError(
+                    "Words must be of type `List[str]` (single pretokenized example), "
+                    "or `List[List[str]]` (batch of pretokenized examples)."
+                )
+
+        if text_pair is not None:
+            is_batched = isinstance(text, (list, tuple))  # with a pair, a list of questions means a batch
+        else:
+            is_batched = isinstance(text, (list, tuple)) and text and isinstance(text[0], (list, tuple))
+
+        words = text if text_pair is None else text_pair  # boxes are checked against the words
+        if boxes is None:
+            raise ValueError("You must provide corresponding bounding boxes")
+        if is_batched:
+            if len(words) != len(boxes):
+                raise ValueError("You must provide words and boxes for an equal amount of examples")
+            for words_example, boxes_example in zip(words, boxes):
+                if len(words_example) != len(boxes_example):
+                    raise ValueError("You must provide as many words as there are bounding boxes")
+        else:
+            if len(words) != len(boxes):
+                raise ValueError("You must provide as many words as there are bounding boxes")
+
+        if is_batched:
+            if text_pair is not None and len(text) != len(text_pair):
+                raise ValueError(
+                    f"batch length of `text`: {len(text)} does not match batch length of `text_pair`:"
+                    f" {len(text_pair)}."
+                )
+            batch_text_or_text_pairs = list(zip(text, text_pair)) if text_pair is not None else text
+            is_pair = bool(text_pair is not None)
+            return self.batch_encode_plus(
+                batch_text_or_text_pairs=batch_text_or_text_pairs,
+                is_pair=is_pair,
+                boxes=boxes,
+                word_labels=word_labels,
+                add_special_tokens=add_special_tokens,
+                padding=padding,
+                truncation=truncation,
+                max_length=max_length,
+                stride=stride,
+                pad_to_multiple_of=pad_to_multiple_of,
+                padding_side=padding_side,
+                return_tensors=return_tensors,
+                return_token_type_ids=return_token_type_ids,
+                return_attention_mask=return_attention_mask,
+                return_overflowing_tokens=return_overflowing_tokens,
+                return_special_tokens_mask=return_special_tokens_mask,
+                return_offsets_mapping=return_offsets_mapping,
+                return_length=return_length,
+                verbose=verbose,
+                **kwargs,
+            )
+        else:
+            return self.encode_plus(
+                text=text,
+                text_pair=text_pair,
+                boxes=boxes,
+                word_labels=word_labels,
+                add_special_tokens=add_special_tokens,
+                padding=padding,
+                truncation=truncation,
+                max_length=max_length,
+                stride=stride,
+                pad_to_multiple_of=pad_to_multiple_of,
+                padding_side=padding_side,
+                return_tensors=return_tensors,
+                return_token_type_ids=return_token_type_ids,
+                return_attention_mask=return_attention_mask,
+                return_overflowing_tokens=return_overflowing_tokens,
+                return_special_tokens_mask=return_special_tokens_mask,
+                return_offsets_mapping=return_offsets_mapping,
+                return_length=return_length,
+                verbose=verbose,
+                **kwargs,
+            )
+
+    @add_end_docstrings(LAYOUTLMV2_ENCODE_KWARGS_DOCSTRING, LAYOUTLMV2_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
+    def batch_encode_plus(
+        self,
+        batch_text_or_text_pairs: Union[
+            list[TextInput],
+            list[TextInputPair],
+            list[PreTokenizedInput],
+        ],
+        is_pair: Optional[bool] = None,
+        boxes: Optional[list[list[list[int]]]] = None,
+        word_labels: Optional[Union[list[int], list[list[int]]]] = None,
+        add_special_tokens: bool = True,
+        padding: Union[bool, str, PaddingStrategy] = False,
+        truncation: Union[bool, str, TruncationStrategy] = None,
+        max_length: Optional[int] = None,
+        stride: int = 0,
+        pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[str] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        return_token_type_ids: Optional[bool] = None,
+        return_attention_mask: Optional[bool] = None,
+        return_overflowing_tokens: bool = False,
+        return_special_tokens_mask: bool = False,
+        return_offsets_mapping: bool = False,
+        return_length: bool = False,
+        verbose: bool = True,
+        **kwargs,
+    ) -> BatchEncoding:
+        """Resolve the padding/truncation strategies (incl. legacy kwargs) and delegate to `_batch_encode_plus`."""
+        padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
+            padding=padding,
+            truncation=truncation,
+            max_length=max_length,
+            pad_to_multiple_of=pad_to_multiple_of,
+            verbose=verbose,
+            **kwargs,
+        )
+
+        return self._batch_encode_plus(
+            batch_text_or_text_pairs=batch_text_or_text_pairs,
+            is_pair=is_pair,
+            boxes=boxes,
+            word_labels=word_labels,
+            add_special_tokens=add_special_tokens,
+            padding_strategy=padding_strategy,
+            truncation_strategy=truncation_strategy,
+            max_length=max_length,
+            stride=stride,
+            pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
+            return_tensors=return_tensors,
+            return_token_type_ids=return_token_type_ids,
+            return_attention_mask=return_attention_mask,
+            return_overflowing_tokens=return_overflowing_tokens,
+            return_special_tokens_mask=return_special_tokens_mask,
+            return_offsets_mapping=return_offsets_mapping,
+            return_length=return_length,
+            verbose=verbose,
+            **kwargs,
+        )
+
+    def tokenize(self, text: str, pair: Optional[str] = None, add_special_tokens: bool = False, **kwargs) -> list[str]:
+        """Return the token strings the backend tokenizer produces for `text`, optionally paired with `pair`."""
+        # A falsy `pair` (None or an empty string) is encoded as a single sequence.
+        inputs = [text] if not pair else [(text, pair)]
+        encode = self._tokenizer.encode_batch
+        batch = encode(inputs, add_special_tokens=add_special_tokens, is_pretokenized=False, **kwargs)
+        return batch[0].tokens
+
+    @add_end_docstrings(LAYOUTLMV2_ENCODE_KWARGS_DOCSTRING, LAYOUTLMV2_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
+    def encode_plus(
+        self,
+        text: Union[TextInput, PreTokenizedInput],
+        text_pair: Optional[PreTokenizedInput] = None,
+        boxes: Optional[list[list[int]]] = None,
+        word_labels: Optional[list[int]] = None,
+        add_special_tokens: bool = True,
+        padding: Union[bool, str, PaddingStrategy] = False,
+        truncation: Union[bool, str, TruncationStrategy] = None,
+        max_length: Optional[int] = None,
+        stride: int = 0,
+        pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[str] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        return_token_type_ids: Optional[bool] = None,
+        return_attention_mask: Optional[bool] = None,
+        return_overflowing_tokens: bool = False,
+        return_special_tokens_mask: bool = False,
+        return_offsets_mapping: bool = False,
+        return_length: bool = False,
+        verbose: bool = True,
+        **kwargs,
+    ) -> BatchEncoding:
+        """
+        Tokenize and prepare for the model a sequence or a pair of sequences. Warning: this method is deprecated;
+        `__call__` should be used instead.
+
+        Args:
+            text (`str`, `List[str]`, `List[List[str]]`):
+                The first sequence to be encoded. This can be a string, a list of strings or a list of list of strings.
+            text_pair (`List[str]` or `List[int]`, *optional*):
+                Optional second sequence to be encoded. This can be a list of strings (words of a single example) or a
+                list of list of strings (words of a batch of examples).
+        """
+
+        # Resolve the strategies first; this also handles the legacy 'truncation_strategy' / 'pad_to_max_length'
+        padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
+            padding=padding,
+            truncation=truncation,
+            max_length=max_length,
+            pad_to_multiple_of=pad_to_multiple_of,
+            verbose=verbose,
+            **kwargs,
+        )
+
+        return self._encode_plus(
+            text=text,
+            boxes=boxes,
+            text_pair=text_pair,
+            word_labels=word_labels,
+            add_special_tokens=add_special_tokens,
+            padding_strategy=padding_strategy,
+            truncation_strategy=truncation_strategy,
+            max_length=max_length,
+            stride=stride,
+            pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
+            return_tensors=return_tensors,
+            return_token_type_ids=return_token_type_ids,
+            return_attention_mask=return_attention_mask,
+            return_overflowing_tokens=return_overflowing_tokens,
+            return_special_tokens_mask=return_special_tokens_mask,
+            return_offsets_mapping=return_offsets_mapping,
+            return_length=return_length,
+            verbose=verbose,
+            **kwargs,
+        )
+
+    def _batch_encode_plus(
+        self,
+        batch_text_or_text_pairs: Union[
+            list[TextInput],
+            list[TextInputPair],
+            list[PreTokenizedInput],
+        ],
+        is_pair: Optional[bool] = None,
+        boxes: Optional[list[list[list[int]]]] = None,
+        word_labels: Optional[list[list[int]]] = None,
+        add_special_tokens: bool = True,
+        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
+        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
+        max_length: Optional[int] = None,
+        stride: int = 0,
+        pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[str] = None,
+        return_tensors: Optional[str] = None,
+        return_token_type_ids: Optional[bool] = None,
+        return_attention_mask: Optional[bool] = None,
+        return_overflowing_tokens: bool = False,
+        return_special_tokens_mask: bool = False,
+        return_offsets_mapping: bool = False,
+        return_length: bool = False,
+        verbose: bool = True,
+    ) -> BatchEncoding:
+        """
+        Tokenize a batch of pretokenized examples and attach a bounding box (and optionally a label) to every token.
+
+        Args:
+            batch_text_or_text_pairs:
+                The batch to encode. Must be a `list`. When `is_pair` is truthy, every element is unpacked as a
+                `(text, text_pair)` tuple and `text` is split on whitespace before being handed to the backend
+                tokenizer.
+            is_pair:
+                Whether the batch elements are `(text, text_pair)` tuples.
+            boxes:
+                Per-example list of word-level boxes, indexed as `boxes[example][word_id]`.
+            word_labels:
+                Per-example list of word-level labels, indexed as `word_labels[example][word_id]`. When given, a
+                `"labels"` entry is added to the output.
+            add_special_tokens:
+                Forwarded to the backend tokenizer's `encode_batch`.
+            padding_strategy, truncation_strategy, max_length, stride, pad_to_multiple_of, padding_side:
+                Forwarded to `self.set_truncation_and_padding`.
+            return_tensors:
+                Forwarded to the returned [`BatchEncoding`] as `tensor_type`.
+            return_token_type_ids, return_attention_mask, return_special_tokens_mask, return_length, verbose:
+                Forwarded to `self._convert_encoding`.
+            return_overflowing_tokens:
+                Forwarded to `self._convert_encoding`; additionally adds an `"overflow_to_sample_mapping"` entry
+                that maps every output row back to the index of the example it came from.
+            return_offsets_mapping:
+                Whether `"offset_mapping"` is kept in the output. Offsets are always computed when `word_labels` is
+                given, and removed again afterwards if they were not requested.
+
+        Returns:
+            [`BatchEncoding`]: built from the flattened per-key lists (including `"bbox"` and, optionally,
+            `"labels"`) and the flattened list of backend encodings.
+
+        Raises:
+            TypeError: if `batch_text_or_text_pairs` is not a `list`.
+            ValueError: if a token without a word id is neither the CLS, SEP nor PAD token.
+        """
+        if not isinstance(batch_text_or_text_pairs, list):
+            raise TypeError(f"batch_text_or_text_pairs has to be a list (got {type(batch_text_or_text_pairs)})")
+
+        # Set the truncation and padding strategy and restore the initial configuration
+        self.set_truncation_and_padding(
+            padding_strategy=padding_strategy,
+            truncation_strategy=truncation_strategy,
+            max_length=max_length,
+            stride=stride,
+            pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
+        )
+
+        # The backend is called with `is_pretokenized=True`, so the first element of each pair
+        # (a plain string) is split on whitespace; the second element is passed through unchanged.
+        if is_pair:
+            batch_text_or_text_pairs = [(text.split(), text_pair) for text, text_pair in batch_text_or_text_pairs]
+
+        encodings = self._tokenizer.encode_batch(
+            batch_text_or_text_pairs,
+            add_special_tokens=add_special_tokens,
+            is_pretokenized=True,  # we set this to True as LayoutLMv2 always expects pretokenized inputs
+        )
+
+        # Convert encoding to dict
+        # `Tokens` has type: Tuple[
+        #                       List[Dict[str, List[List[int]]]] or List[Dict[str, 2D-Tensor]],
+        #                       List[EncodingFast]
+        #                    ]
+        # with nested dimensions corresponding to batch, overflows, sequence length
+        tokens_and_encodings = [
+            self._convert_encoding(
+                encoding=encoding,
+                return_token_type_ids=return_token_type_ids,
+                return_attention_mask=return_attention_mask,
+                return_overflowing_tokens=return_overflowing_tokens,
+                return_special_tokens_mask=return_special_tokens_mask,
+                return_offsets_mapping=True
+                if word_labels is not None
+                else return_offsets_mapping,  # we use offsets to create the labels
+                return_length=return_length,
+                verbose=verbose,
+            )
+            for encoding in encodings
+        ]
+
+        # Convert the output to have dict[list] from list[dict] and remove the additional overflows dimension
+        # From (variable) shape (batch, overflows, sequence length) to ~ (batch * overflows, sequence length)
+        # (we say ~ because the number of overflow varies with the example in the batch)
+        #
+        # To match each overflowing sample with the original sample in the batch
+        # we add an overflow_to_sample_mapping array (see below)
+        sanitized_tokens = {}
+        # The key set is taken from the first example, so the batch must contain at least one element.
+        for key in tokens_and_encodings[0][0]:
+            stack = [e for item, _ in tokens_and_encodings for e in item[key]]
+            sanitized_tokens[key] = stack
+        sanitized_encodings = [e for _, item in tokens_and_encodings for e in item]
+
+        # If returning overflowing tokens, we need to return a mapping
+        # from the batch idx to the original sample
+        if return_overflowing_tokens:
+            overflow_to_sample_mapping = []
+            for i, (toks, _) in enumerate(tokens_and_encodings):
+                overflow_to_sample_mapping += [i] * len(toks["input_ids"])
+            sanitized_tokens["overflow_to_sample_mapping"] = overflow_to_sample_mapping
+
+        for input_ids in sanitized_tokens["input_ids"]:
+            self._eventual_warn_about_too_long_sequence(input_ids, max_length, verbose)
+
+        # create the token boxes
+        token_boxes = []
+        for batch_index in range(len(sanitized_tokens["input_ids"])):
+            # With overflow, several output rows can stem from one example: look up the source example.
+            if return_overflowing_tokens:
+                original_index = sanitized_tokens["overflow_to_sample_mapping"][batch_index]
+            else:
+                original_index = batch_index
+            token_boxes_example = []
+            for id, sequence_id, word_id in zip(
+                sanitized_tokens["input_ids"][batch_index],
+                sanitized_encodings[batch_index].sequence_ids,
+                sanitized_encodings[batch_index].word_ids,
+            ):
+                if word_id is not None:
+                    # In pair mode, tokens of the first sequence get the padding box; only the
+                    # remaining tokens are looked up in `boxes`.
+                    if is_pair and sequence_id == 0:
+                        token_boxes_example.append(self.pad_token_box)
+                    else:
+                        token_boxes_example.append(boxes[original_index][word_id])
+                else:
+                    # Tokens without a word id must be one of the known special tokens.
+                    if id == self.cls_token_id:
+                        token_boxes_example.append(self.cls_token_box)
+                    elif id == self.sep_token_id:
+                        token_boxes_example.append(self.sep_token_box)
+                    elif id == self.pad_token_id:
+                        token_boxes_example.append(self.pad_token_box)
+                    else:
+                        raise ValueError("Id not recognized")
+            token_boxes.append(token_boxes_example)
+
+        sanitized_tokens["bbox"] = token_boxes
+
+        # optionally, create the labels
+        if word_labels is not None:
+            labels = []
+            for batch_index in range(len(sanitized_tokens["input_ids"])):
+                if return_overflowing_tokens:
+                    original_index = sanitized_tokens["overflow_to_sample_mapping"][batch_index]
+                else:
+                    original_index = batch_index
+                labels_example = []
+                # NOTE(review): unlike the box loop above, this loop does not look at `sequence_id`, so in
+                # pair mode tokens of the first sequence also index into `word_labels` — confirm this is intended.
+                for id, offset, word_id in zip(
+                    sanitized_tokens["input_ids"][batch_index],
+                    sanitized_tokens["offset_mapping"][batch_index],
+                    sanitized_encodings[batch_index].word_ids,
+                ):
+                    if word_id is not None:
+                        if self.only_label_first_subword:
+                            # A start offset of 0 is used as the marker for "first token of the word".
+                            if offset[0] == 0:
+                                # Use the real label id for the first token of the word, and padding ids for the remaining tokens
+                                labels_example.append(word_labels[original_index][word_id])
+                            else:
+                                labels_example.append(self.pad_token_label)
+                        else:
+                            labels_example.append(word_labels[original_index][word_id])
+                    else:
+                        labels_example.append(self.pad_token_label)
+                labels.append(labels_example)
+
+            sanitized_tokens["labels"] = labels
+            # finally, remove offsets if the user didn't want them
+            if not return_offsets_mapping:
+                del sanitized_tokens["offset_mapping"]
+
+        return BatchEncoding(sanitized_tokens, sanitized_encodings, tensor_type=return_tensors)
+
+    def _encode_plus(
+        self,
+        text: Union[TextInput, PreTokenizedInput],
+        text_pair: Optional[PreTokenizedInput] = None,
+        boxes: Optional[list[list[int]]] = None,
+        word_labels: Optional[list[int]] = None,
+        add_special_tokens: bool = True,
+        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
+        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
+        max_length: Optional[int] = None,
+        stride: int = 0,
+        pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[str] = None,
+        return_tensors: Optional[str] = None,
+        return_token_type_ids: Optional[bool] = None,
+        return_attention_mask: Optional[bool] = None,
+        return_overflowing_tokens: bool = False,
+        return_special_tokens_mask: bool = False,
+        return_offsets_mapping: bool = False,
+        return_length: bool = False,
+        verbose: bool = True,
+        **kwargs,
+    ) -> BatchEncoding:
+        """
+        Encode a single example by wrapping it into a batch of one and delegating to `_batch_encode_plus`.
+
+        Args:
+            text:
+                The only sequence when `text_pair` is `None`; otherwise the first element of the pair.
+            text_pair:
+                Optional second sequence. Pair mode is selected whenever this is not `None`, including when it
+                is empty, so the batch layout always matches the `is_pair` flag handed to `_batch_encode_plus`.
+            boxes:
+                Word-level boxes of the example; wrapped into a one-element batch.
+            word_labels:
+                Optional word-level labels of the example; wrapped into a one-element batch when given.
+            **kwargs:
+                Forwarded unchanged to `_batch_encode_plus`.
+            (all remaining arguments are forwarded unchanged to `_batch_encode_plus`)
+
+        Returns:
+            [`BatchEncoding`]: the batched result. When `return_tensors` is `None` and
+            `return_overflowing_tokens` is falsy, the leading batch axis is removed from every list-of-lists value.
+        """
+        # make it a batched input
+        # 2 options:
+        # 1) only text, in case text must be a list of str
+        # 2) text + text_pair, in which case text = str and text_pair a list of str
+        #
+        # The batch layout and the `is_pair` flag are derived from the same test. Using truthiness for one
+        # and `is not None` for the other would send a bare `text` together with `is_pair=True` whenever
+        # `text_pair` is empty, and the callee would then try to unpack `text` itself as a pair.
+        is_pair = text_pair is not None
+        batched_input = [(text, text_pair)] if is_pair else [text]
+        batched_boxes = [boxes]
+        batched_word_labels = [word_labels] if word_labels is not None else None
+        batched_output = self._batch_encode_plus(
+            batched_input,
+            is_pair=is_pair,
+            boxes=batched_boxes,
+            word_labels=batched_word_labels,
+            add_special_tokens=add_special_tokens,
+            padding_strategy=padding_strategy,
+            truncation_strategy=truncation_strategy,
+            max_length=max_length,
+            stride=stride,
+            pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
+            return_tensors=return_tensors,
+            return_token_type_ids=return_token_type_ids,
+            return_attention_mask=return_attention_mask,
+            return_overflowing_tokens=return_overflowing_tokens,
+            return_special_tokens_mask=return_special_tokens_mask,
+            return_offsets_mapping=return_offsets_mapping,
+            return_length=return_length,
+            verbose=verbose,
+            **kwargs,
+        )
+
+        # Return tensor is None, then we can remove the leading batch axis
+        # Overflowing tokens are returned as a batch of output so we keep them in this case
+        if return_tensors is None and not return_overflowing_tokens:
+            batched_output = BatchEncoding(
+                {
+                    # Only non-empty list-of-lists values are unwrapped; everything else is kept as-is.
+                    key: value[0] if len(value) > 0 and isinstance(value[0], list) else value
+                    for key, value in batched_output.items()
+                },
+                batched_output.encodings,
+            )
+
+        self._eventual_warn_about_too_long_sequence(batched_output["input_ids"], max_length, verbose)
+
+        return batched_output
+
+    def _pad(
+        self,
+        encoded_inputs: Union[dict[str, EncodedInput], BatchEncoding],
+        max_length: Optional[int] = None,
+        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
+        pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[str] = None,
+        return_attention_mask: Optional[bool] = None,
+    ) -> dict:
+        """
+        Pad encoded inputs (on left/right and up to predefined length or max length in the batch)
+
+        Args:
+            encoded_inputs:
+                Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`).
+                It is modified in place and also returned.
+            max_length: target length of the padded lists. Replaced by the length of the main input when
+                `padding_strategy` is `PaddingStrategy.LONGEST`, and rounded up to the next multiple of
+                `pad_to_multiple_of` when that is set. Inputs that are already longer are left untouched
+                (nothing is truncated here).
+            padding_strategy: PaddingStrategy to use for padding.
+
+                - PaddingStrategy.LONGEST Pad to the longest sequence in the batch
+                - PaddingStrategy.MAX_LENGTH: Pad to the max length
+                - PaddingStrategy.DO_NOT_PAD: Do not pad (default)
+                The tokenizer padding sides are defined in self.padding_side:
+
+                    - 'left': pads on the left of the sequences
+                    - 'right': pads on the right of the sequences
+            pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
+                This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
+                `>= 7.5` (Volta).
+            padding_side:
+                The side on which the model should have padding applied. Should be selected between ['right', 'left'].
+                Default value is picked from the class attribute of the same name.
+            return_attention_mask:
+                (optional) Set to False to avoid returning attention mask (default: set to model specifics)
+
+        Returns:
+            `dict`: the same `encoded_inputs` object, with `attention_mask`, `token_type_ids`, `bbox`, `labels`,
+            `special_tokens_mask` (each only when applicable) and the main model input extended with padding.
+
+        Raises:
+            ValueError: if padding is needed and the resolved padding side is neither `"right"` nor `"left"`.
+        """
+        # Load from model defaults
+        if return_attention_mask is None:
+            return_attention_mask = "attention_mask" in self.model_input_names
+
+        required_input = encoded_inputs[self.model_input_names[0]]
+
+        if padding_strategy == PaddingStrategy.LONGEST:
+            max_length = len(required_input)
+
+        if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
+            max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
+
+        needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length
+
+        # Initialize attention mask if not present.
+        if return_attention_mask and "attention_mask" not in encoded_inputs:
+            encoded_inputs["attention_mask"] = [1] * len(required_input)
+
+        if not needs_to_be_padded:
+            return encoded_inputs
+
+        difference = max_length - len(required_input)
+        padding_side = padding_side if padding_side is not None else self.padding_side
+        # Reject an unknown side before touching any field; the offending value is a padding *side*,
+        # so the message names it as such.
+        if padding_side not in ("right", "left"):
+            raise ValueError("Invalid padding side: " + str(padding_side))
+
+        def _extend(values, filler):
+            # Attach `difference` copies of `filler` on the configured side of `values`.
+            padding = [filler] * difference
+            return values + padding if padding_side == "right" else padding + values
+
+        if return_attention_mask:
+            encoded_inputs["attention_mask"] = _extend(encoded_inputs["attention_mask"], 0)
+        if "token_type_ids" in encoded_inputs:
+            encoded_inputs["token_type_ids"] = _extend(encoded_inputs["token_type_ids"], self.pad_token_type_id)
+        if "bbox" in encoded_inputs:
+            encoded_inputs["bbox"] = _extend(encoded_inputs["bbox"], self.pad_token_box)
+        if "labels" in encoded_inputs:
+            encoded_inputs["labels"] = _extend(encoded_inputs["labels"], self.pad_token_label)
+        if "special_tokens_mask" in encoded_inputs:
+            # Padding positions are flagged with 1 in the special-tokens mask.
+            encoded_inputs["special_tokens_mask"] = _extend(encoded_inputs["special_tokens_mask"], 1)
+        encoded_inputs[self.model_input_names[0]] = _extend(required_input, self.pad_token_id)
+
+        return encoded_inputs
+
+    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
+        """
+        Wrap one sequence, or a pair of sequences, in the CLS/SEP special tokens.
+
+        The resulting layouts are:
+
+        - single sequence: `[CLS] X [SEP]`
+        - pair of sequences: `[CLS] A [SEP] B [SEP]`
+
+        Args:
+            token_ids_0 (`List[int]`):
+                IDs of the first (or only) sequence.
+            token_ids_1 (`List[int]`, *optional*):
+                IDs of the second sequence. A missing or empty value yields the single-sequence layout.
+
+        Returns:
+            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
+        """
+        with_special_tokens = [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
+
+        # No (or an empty) second sequence: the single-sequence layout is already complete.
+        if not token_ids_1:
+            return with_special_tokens
+
+        with_special_tokens.extend(token_ids_1 + [self.sep_token_id])
+        return with_special_tokens
+
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
+        """
+        Ask the backend tokenizer's model to save itself and return what it reports as a tuple.
+
+        Args:
+            save_directory (`str`):
+                Passed as the first argument to the backend model's `save`.
+            filename_prefix (`str`, *optional*):
+                Passed to the backend model's `save` as `name`.
+
+        Returns:
+            `tuple[str]`: the value returned by the backend model's `save`, converted to a tuple.
+        """
+        return tuple(self._tokenizer.model.save(save_directory, name=filename_prefix))
+
+
+__all__ = ["LayoutLMv2TokenizerFast"]
diff --git a/phivenv/Lib/site-packages/transformers/models/layoutlmv3/__init__.py b/phivenv/Lib/site-packages/transformers/models/layoutlmv3/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..c776b676f603f9c2d6a2938181e55980e4941048
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/layoutlmv3/__init__.py
@@ -0,0 +1,34 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+# `TYPE_CHECKING` is False at runtime, so the star-imports below are only seen by static type checkers.
+if TYPE_CHECKING:
+    from .configuration_layoutlmv3 import *
+    from .feature_extraction_layoutlmv3 import *
+    from .image_processing_layoutlmv3 import *
+    from .image_processing_layoutlmv3_fast import *
+    from .modeling_layoutlmv3 import *
+    from .modeling_tf_layoutlmv3 import *
+    from .processing_layoutlmv3 import *
+    from .tokenization_layoutlmv3 import *
+    from .tokenization_layoutlmv3_fast import *
+else:
+    import sys
+
+    # At runtime, replace this package's entry in `sys.modules` with a `_LazyModule` built from the
+    # import structure that `define_import_structure` derives from this file's path.
+    # NOTE(review): presumably this defers the submodule imports until first attribute access — confirm in `utils`.
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/phivenv/Lib/site-packages/transformers/models/layoutlmv3/__pycache__/__init__.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/layoutlmv3/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..21715e584840d1afa3fe23f912ea28588afd4aa7
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/layoutlmv3/__pycache__/__init__.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/layoutlmv3/__pycache__/configuration_layoutlmv3.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/layoutlmv3/__pycache__/configuration_layoutlmv3.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..999db6176eb5cd3af01ca7117c7f3a7874f0dbfb
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/layoutlmv3/__pycache__/configuration_layoutlmv3.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/layoutlmv3/__pycache__/feature_extraction_layoutlmv3.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/layoutlmv3/__pycache__/feature_extraction_layoutlmv3.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d9970f6db4cf7d1b582802ac3d7f043e53c3b2bb
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/layoutlmv3/__pycache__/feature_extraction_layoutlmv3.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/layoutlmv3/__pycache__/image_processing_layoutlmv3.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/layoutlmv3/__pycache__/image_processing_layoutlmv3.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a33eb4ca5bf33949cf58f8a48f14967442e24947
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/layoutlmv3/__pycache__/image_processing_layoutlmv3.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/layoutlmv3/__pycache__/image_processing_layoutlmv3_fast.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/layoutlmv3/__pycache__/image_processing_layoutlmv3_fast.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..015d141ddf286d00eeddc9abce1f23a031e8d31e
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/layoutlmv3/__pycache__/image_processing_layoutlmv3_fast.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/layoutlmv3/__pycache__/modeling_layoutlmv3.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/layoutlmv3/__pycache__/modeling_layoutlmv3.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..32954da2875c84f621314785f311133695af3be7
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/layoutlmv3/__pycache__/modeling_layoutlmv3.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/layoutlmv3/__pycache__/modeling_tf_layoutlmv3.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/layoutlmv3/__pycache__/modeling_tf_layoutlmv3.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1bf8533ab8a528445799bddeab7abd58e9f6415a
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/layoutlmv3/__pycache__/modeling_tf_layoutlmv3.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/layoutlmv3/__pycache__/processing_layoutlmv3.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/layoutlmv3/__pycache__/processing_layoutlmv3.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..64058be24860daed9f2625ab3eaa8deb53260800
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/layoutlmv3/__pycache__/processing_layoutlmv3.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/layoutlmv3/__pycache__/tokenization_layoutlmv3.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/layoutlmv3/__pycache__/tokenization_layoutlmv3.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2cd9e88260b5a6b5b0ec0785944c31b302930a42
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/layoutlmv3/__pycache__/tokenization_layoutlmv3.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/layoutlmv3/__pycache__/tokenization_layoutlmv3_fast.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/layoutlmv3/__pycache__/tokenization_layoutlmv3_fast.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2c34eeb4be7dbeeabae77a4b0ab750b1a00507f7
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/layoutlmv3/__pycache__/tokenization_layoutlmv3_fast.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/layoutlmv3/configuration_layoutlmv3.py b/phivenv/Lib/site-packages/transformers/models/layoutlmv3/configuration_layoutlmv3.py
new file mode 100644
index 0000000000000000000000000000000000000000..c845bb43b346a2d00a93193125f8b3d6d72c1ebc
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/layoutlmv3/configuration_layoutlmv3.py
@@ -0,0 +1,294 @@
+# coding=utf-8
+# Copyright 2022 Microsoft Research and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""LayoutLMv3 model configuration"""
+
+from collections import OrderedDict
+from collections.abc import Mapping
+from typing import TYPE_CHECKING, Any, Optional
+
+from packaging import version
+
+from ...configuration_utils import PretrainedConfig
+from ...onnx import OnnxConfig
+from ...onnx.utils import compute_effective_axis_dimension
+from ...utils import logging
+
+
+if TYPE_CHECKING:
+    from ...processing_utils import ProcessorMixin
+    from ...utils import TensorType
+
+
+logger = logging.get_logger(__name__)
+
+
+class LayoutLMv3Config(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`LayoutLMv3Model`]. It is used to instantiate a
+    LayoutLMv3 model according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of the LayoutLMv3
+    [microsoft/layoutlmv3-base](https://huggingface.co/microsoft/layoutlmv3-base) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 50265):
+            Vocabulary size of the LayoutLMv3 model. Defines the number of different tokens that can be represented by
+            the `input_ids` passed when calling [`LayoutLMv3Model`].
+        hidden_size (`int`, *optional*, defaults to 768):
+            Dimension of the encoder layers and the pooler layer.
+        num_hidden_layers (`int`, *optional*, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 12):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        intermediate_size (`int`, *optional*, defaults to 3072):
+            Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"selu"` and `"gelu_new"` are supported.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout ratio for the attention probabilities.
+        max_position_embeddings (`int`, *optional*, defaults to 512):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            just in case (e.g., 512 or 1024 or 2048).
+        type_vocab_size (`int`, *optional*, defaults to 2):
+            The vocabulary size of the `token_type_ids` passed when calling [`LayoutLMv3Model`].
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-5):
+            The epsilon used by the layer normalization layers.
+        pad_token_id (`int`, *optional*, defaults to 1):
+            Id of the padding token. Forwarded to the [`PretrainedConfig`] constructor.
+        bos_token_id (`int`, *optional*, defaults to 0):
+            Id of the beginning-of-sequence token. Forwarded to the [`PretrainedConfig`] constructor.
+        eos_token_id (`int`, *optional*, defaults to 2):
+            Id of the end-of-sequence token. Forwarded to the [`PretrainedConfig`] constructor.
+        max_2d_position_embeddings (`int`, *optional*, defaults to 1024):
+            The maximum value that the 2D position embedding might ever be used with. Typically set this to something
+            large just in case (e.g., 1024).
+        coordinate_size (`int`, *optional*, defaults to `128`):
+            Dimension of the coordinate embeddings.
+        shape_size (`int`, *optional*, defaults to `128`):
+            Dimension of the width and height embeddings.
+        has_relative_attention_bias (`bool`, *optional*, defaults to `True`):
+            Whether or not to use a relative attention bias in the self-attention mechanism.
+        rel_pos_bins (`int`, *optional*, defaults to 32):
+            The number of relative position bins to be used in the self-attention mechanism.
+        max_rel_pos (`int`, *optional*, defaults to 128):
+            The maximum number of relative positions to be used in the self-attention mechanism.
+        max_rel_2d_pos (`int`, *optional*, defaults to 256):
+            The maximum number of relative 2D positions in the self-attention mechanism.
+        rel_2d_pos_bins (`int`, *optional*, defaults to 64):
+            The number of 2D relative position bins in the self-attention mechanism.
+        has_spatial_attention_bias (`bool`, *optional*, defaults to `True`):
+            Whether or not to use a spatial attention bias in the self-attention mechanism.
+        text_embed (`bool`, *optional*, defaults to `True`):
+            Stored on the configuration as `text_embed`. Presumably toggles the text embeddings in the same way
+            `visual_embed` toggles the patch embeddings (to be confirmed against the modeling code).
+        visual_embed (`bool`, *optional*, defaults to `True`):
+            Whether or not to add patch embeddings.
+        input_size (`int`, *optional*, defaults to `224`):
+            The size (resolution) of the images.
+        num_channels (`int`, *optional*, defaults to `3`):
+            The number of channels of the images.
+        patch_size (`int`, *optional*, defaults to `16`):
+            The size (resolution) of the patches.
+        classifier_dropout (`float`, *optional*):
+            The dropout ratio for the classification head.
+
+    Example:
+
+    ```python
+    >>> from transformers import LayoutLMv3Config, LayoutLMv3Model
+
+    >>> # Initializing a LayoutLMv3 microsoft/layoutlmv3-base style configuration
+    >>> configuration = LayoutLMv3Config()
+
+    >>> # Initializing a model (with random weights) from the microsoft/layoutlmv3-base style configuration
+    >>> model = LayoutLMv3Model(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    # Identifier of this configuration family.
+    model_type = "layoutlmv3"
+
+    def __init__(
+        self,
+        vocab_size=50265,
+        hidden_size=768,
+        num_hidden_layers=12,
+        num_attention_heads=12,
+        intermediate_size=3072,
+        hidden_act="gelu",
+        hidden_dropout_prob=0.1,
+        attention_probs_dropout_prob=0.1,
+        max_position_embeddings=512,
+        type_vocab_size=2,
+        initializer_range=0.02,
+        layer_norm_eps=1e-5,
+        pad_token_id=1,
+        bos_token_id=0,
+        eos_token_id=2,
+        max_2d_position_embeddings=1024,
+        coordinate_size=128,
+        shape_size=128,
+        has_relative_attention_bias=True,
+        rel_pos_bins=32,
+        max_rel_pos=128,
+        rel_2d_pos_bins=64,
+        max_rel_2d_pos=256,
+        has_spatial_attention_bias=True,
+        text_embed=True,
+        visual_embed=True,
+        input_size=224,
+        num_channels=3,
+        patch_size=16,
+        classifier_dropout=None,
+        **kwargs,
+    ):
+        # The generic transformer hyper-parameters, the special-token ids and any extra keyword
+        # arguments are handed to the `PretrainedConfig` constructor rather than stored here.
+        super().__init__(
+            vocab_size=vocab_size,
+            hidden_size=hidden_size,
+            num_hidden_layers=num_hidden_layers,
+            num_attention_heads=num_attention_heads,
+            intermediate_size=intermediate_size,
+            hidden_act=hidden_act,
+            hidden_dropout_prob=hidden_dropout_prob,
+            attention_probs_dropout_prob=attention_probs_dropout_prob,
+            max_position_embeddings=max_position_embeddings,
+            type_vocab_size=type_vocab_size,
+            initializer_range=initializer_range,
+            layer_norm_eps=layer_norm_eps,
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            **kwargs,
+        )
+        # The remaining (layout, relative-position and image) settings are stored directly on the instance.
+        self.max_2d_position_embeddings = max_2d_position_embeddings
+        self.coordinate_size = coordinate_size
+        self.shape_size = shape_size
+        self.has_relative_attention_bias = has_relative_attention_bias
+        self.rel_pos_bins = rel_pos_bins
+        self.max_rel_pos = max_rel_pos
+        self.has_spatial_attention_bias = has_spatial_attention_bias
+        self.rel_2d_pos_bins = rel_2d_pos_bins
+        self.max_rel_2d_pos = max_rel_2d_pos
+        self.text_embed = text_embed
+        self.visual_embed = visual_embed
+        self.input_size = input_size
+        self.num_channels = num_channels
+        self.patch_size = patch_size
+        self.classifier_dropout = classifier_dropout
+
+
+class LayoutLMv3OnnxConfig(OnnxConfig):
+    torch_onnx_minimum_version = version.parse("1.12")
+
+    @property
+    def inputs(self) -> Mapping[str, Mapping[int, str]]:
+        """
+        Ordered mapping from each model input name to its axis-index -> axis-name mapping.
+
+        For the `"question-answering"` and `"sequence-classification"` tasks `attention_mask` precedes `bbox` and
+        `pixel_values` names four axes; for every other task `bbox` precedes `attention_mask` and `pixel_values`
+        names only its first two axes.
+        """
+        # The order of inputs is different for question answering and sequence classification
+        if self.task in ("question-answering", "sequence-classification"):
+            sequence_inputs = ("input_ids", "attention_mask", "bbox")
+            pixel_axes = {0: "batch", 1: "num_channels", 2: "height", 3: "width"}
+        else:
+            sequence_inputs = ("input_ids", "bbox", "attention_mask")
+            pixel_axes = {0: "batch", 1: "num_channels"}
+
+        # Every sequence-like input gets its own fresh axis dict, so entries never alias each other.
+        axes_by_input = OrderedDict((name, {0: "batch", 1: "sequence"}) for name in sequence_inputs)
+        axes_by_input["pixel_values"] = pixel_axes
+        return axes_by_input
+
+    @property
+    def atol_for_validation(self) -> float:
+        return 1e-5
+
+    @property
+    def default_onnx_opset(self) -> int:
+        return 12
+
+    def generate_dummy_inputs(
+        self,
+        processor: "ProcessorMixin",
+        batch_size: int = -1,
+        seq_length: int = -1,
+        is_pair: bool = False,
+        framework: Optional["TensorType"] = None,
+        num_channels: int = 3,
+        image_width: int = 40,
+        image_height: int = 40,
+    ) -> Mapping[str, Any]:
+        """
+        Generate inputs to provide to the ONNX exporter for the specific framework
+
+        Args:
+            processor ([`ProcessorMixin`]):
+                The processor associated with this model configuration.
+            batch_size (`int`, *optional*, defaults to -1):
+                The batch size to export the model for (-1 means dynamic axis).
+            seq_length (`int`, *optional*, defaults to -1):
+                The sequence length to export the model for (-1 means dynamic axis).
+            is_pair (`bool`, *optional*, defaults to `False`):
+                Indicate if the input is a pair (sentence 1, sentence 2).
+            framework (`TensorType`, *optional*, defaults to `None`):
+                The framework (PyTorch or TensorFlow) that the processor will generate tensors for.
+            num_channels (`int`, *optional*, defaults to 3):
+                The number of channels of the generated images.
+            image_width (`int`, *optional*, defaults to 40):
+                The width of the generated images.
+            image_height (`int`, *optional*, defaults to 40):
+                The height of the generated images.
+
+        Returns:
+            Mapping[str, Any]: holding the kwargs to provide to the model's forward function
+        """
+
+        # A dummy image is used so OCR should not be applied
+        setattr(processor.image_processor, "apply_ocr", False)
+
+        # If dynamic axis (-1) we forward with a fixed dimension of 2 samples to avoid optimizations made by ONNX
+        batch_size = compute_effective_axis_dimension(
+            batch_size, fixed_dimension=OnnxConfig.default_fixed_batch, num_token_to_add=0
+        )
+        # If dynamic axis (-1) we forward with a fixed dimension of 8 tokens to avoid optimizations made by ONNX
+        token_to_add = processor.tokenizer.num_special_tokens_to_add(is_pair)
+        seq_length = compute_effective_axis_dimension(
+            seq_length, fixed_dimension=OnnxConfig.default_fixed_sequence, num_token_to_add=token_to_add
+        )
+        # Generate dummy inputs according to compute batch and sequence
+        dummy_text = [[" ".join([processor.tokenizer.unk_token]) * seq_length]] * batch_size
+
+        # Generate dummy bounding boxes
+        dummy_bboxes = [[[48, 84, 73, 128]]] * batch_size
+
+        # If dynamic axis (-1) we forward with a fixed dimension of 2 samples to avoid optimizations made by ONNX
+        # batch_size = compute_effective_axis_dimension(batch_size, fixed_dimension=OnnxConfig.default_fixed_batch)
+        dummy_image = self._generate_dummy_images(batch_size, num_channels, image_height, image_width)
+
+        inputs = dict(
+            processor(
+                dummy_image,
+                text=dummy_text,
+                boxes=dummy_bboxes,
+                return_tensors=framework,
+            )
+        )
+
+        return inputs
+
+
+__all__ = ["LayoutLMv3Config", "LayoutLMv3OnnxConfig"]
diff --git a/phivenv/Lib/site-packages/transformers/models/layoutlmv3/feature_extraction_layoutlmv3.py b/phivenv/Lib/site-packages/transformers/models/layoutlmv3/feature_extraction_layoutlmv3.py
new file mode 100644
index 0000000000000000000000000000000000000000..5ea779a48f12bf1f0036a3a02cd018adabebc2fc
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/layoutlmv3/feature_extraction_layoutlmv3.py
@@ -0,0 +1,40 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Feature extractor class for LayoutLMv3.
+"""
+
+import warnings
+
+from ...utils import logging
+from ...utils.import_utils import requires
+from .image_processing_layoutlmv3 import LayoutLMv3ImageProcessor
+
+
+logger = logging.get_logger(__name__)
+
+
+@requires(backends=("vision",))
+class LayoutLMv3FeatureExtractor(LayoutLMv3ImageProcessor):
+    """Deprecated subclass of `LayoutLMv3ImageProcessor`; constructing it emits a `FutureWarning` first."""
+
+    def __init__(self, *args, **kwargs) -> None:
+        notice = "The class LayoutLMv3FeatureExtractor is deprecated and will be removed in version 5 of Transformers."
+        notice += " Please use LayoutLMv3ImageProcessor instead."
+        warnings.warn(notice, FutureWarning)
+        super().__init__(*args, **kwargs)
+
+
+__all__ = ["LayoutLMv3FeatureExtractor"]
diff --git a/phivenv/Lib/site-packages/transformers/models/layoutlmv3/image_processing_layoutlmv3.py b/phivenv/Lib/site-packages/transformers/models/layoutlmv3/image_processing_layoutlmv3.py
new file mode 100644
index 0000000000000000000000000000000000000000..5d6b690bb619db4435be428998f9ffc8e50bbb8c
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/layoutlmv3/image_processing_layoutlmv3.py
@@ -0,0 +1,380 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Image processor class for LayoutLMv3."""
+
+from collections.abc import Iterable
+from typing import Optional, Union
+
+import numpy as np
+
+from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
+from ...image_transforms import resize, to_channel_dimension_format, to_pil_image
+from ...image_utils import (
+    IMAGENET_STANDARD_MEAN,
+    IMAGENET_STANDARD_STD,
+    ChannelDimension,
+    ImageInput,
+    PILImageResampling,
+    infer_channel_dimension_format,
+    is_scaled_image,
+    make_list_of_images,
+    to_numpy_array,
+    valid_images,
+    validate_preprocess_arguments,
+)
+from ...utils import (
+    TensorType,
+    filter_out_non_signature_kwargs,
+    is_pytesseract_available,
+    is_vision_available,
+    logging,
+    requires_backends,
+)
+from ...utils.import_utils import requires
+
+
+if is_vision_available():
+    import PIL
+
+# soft dependency
+if is_pytesseract_available():
+    import pytesseract
+
+logger = logging.get_logger(__name__)
+
+
+def normalize_box(box, width, height):
+    """Map an absolute `(left, top, right, bottom)` box onto a 0-1000 scale, truncating to `int`.
+
+    x coordinates (indices 0 and 2) are divided by `width`, y coordinates (indices 1 and 3) by `height`.
+    """
+    extents = (width, height, width, height)
+    return [int(1000 * (box[axis] / extent)) for axis, extent in enumerate(extents)]
+
+
+def apply_tesseract(
+    image: np.ndarray,
+    lang: Optional[str],
+    tesseract_config: Optional[str],
+    input_data_format: Optional[Union[ChannelDimension, str]] = None,
+):
+    """
+    Applies Tesseract OCR on a document image, and returns recognized words + normalized bounding boxes.
+
+    Args:
+        image (`np.ndarray`):
+            Document image; it is converted to a PIL image before being handed to Tesseract.
+        lang (`str`, *optional*):
+            Forwarded as `lang` to `pytesseract.image_to_data`.
+        tesseract_config (`str`, *optional*):
+            Forwarded as `config` to `pytesseract.image_to_data`.
+        input_data_format (`ChannelDimension` or `str`, *optional*):
+            Channel dimension format of `image`, forwarded to `to_pil_image`.
+
+    Returns:
+        `tuple`: `(words, normalized_boxes)`, two lists of equal length. Every box is
+        `[left, top, left + width, top + height]` passed through `normalize_box` with the PIL image size.
+    """
+    pil_image = to_pil_image(image, input_data_format=input_data_format)
+    image_width, image_height = pil_image.size
+    data = pytesseract.image_to_data(pil_image, lang=lang, output_type="dict", config=tesseract_config)
+
+    # Filter blank words and their coordinates in a single pass; this avoids repeated membership tests
+    # against a list of rejected indices, which would be quadratic in the number of OCR entries.
+    columns = zip(data["text"], data["left"], data["top"], data["width"], data["height"])
+    kept = [(word, x, y, w, h) for word, x, y, w, h in columns if word.strip()]
+
+    words = [word for word, *_ in kept]
+    # Turn (left, top, width, height) into corner format, then normalize with the image dimensions.
+    normalized_boxes = [normalize_box([x, y, x + w, y + h], image_width, image_height) for _, x, y, w, h in kept]
+    return words, normalized_boxes
+
+
+@requires(backends=("vision",))
+class LayoutLMv3ImageProcessor(BaseImageProcessor):
+    r"""
+    Constructs a LayoutLMv3 image processor.
+
+    Args:
+        do_resize (`bool`, *optional*, defaults to `True`):
+            Whether to resize the image's (height, width) dimensions to `(size["height"], size["width"])`. Can be
+            overridden by `do_resize` in `preprocess`.
+        size (`dict[str, int]` *optional*, defaults to `{"height": 224, "width": 224}`):
+            Size of the image after resizing. Can be overridden by `size` in `preprocess`.
+        resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
+            Resampling filter to use if resizing the image. Can be overridden by `resample` in `preprocess`.
+        do_rescale (`bool`, *optional*, defaults to `True`):
+            Whether to rescale the image's pixel values by the specified `rescale_value`. Can be overridden by
+            `do_rescale` in `preprocess`.
+        rescale_value (`float`, *optional*, defaults to 1 / 255):
+            Value by which the image's pixel values are rescaled, stored as `self.rescale_factor`. Can be
+            overridden by `rescale_factor` in `preprocess`.
+        do_normalize (`bool`, *optional*, defaults to `True`):
+            Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
+            method.
+        image_mean (`Iterable[float]` or `float`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`):
+            Mean to use if normalizing the image. This is a float or list of floats the length of the number of
+            channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
+        image_std (`Iterable[float]` or `float`, *optional*, defaults to `IMAGENET_STANDARD_STD`):
+            Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
+            number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
+        apply_ocr (`bool`, *optional*, defaults to `True`):
+            Whether to apply the Tesseract OCR engine to get words + normalized bounding boxes. Can be overridden by
+            the `apply_ocr` parameter in the `preprocess` method.
+        ocr_lang (`str`, *optional*):
+            The language, specified by its ISO code, to be used by the Tesseract OCR engine. By default, English is
+            used. Can be overridden by the `ocr_lang` parameter in the `preprocess` method.
+        tesseract_config (`str`, *optional*):
+            Any additional custom configuration flags that are forwarded to the `config` parameter when calling
+            Tesseract. For example: '--psm 6'. Can be overridden by the `tesseract_config` parameter in the
+            `preprocess` method.
+    """
+
+    model_input_names = ["pixel_values"]
+
+    def __init__(
+        self,
+        do_resize: bool = True,
+        size: Optional[dict[str, int]] = None,
+        resample: PILImageResampling = PILImageResampling.BILINEAR,
+        do_rescale: bool = True,
+        rescale_value: float = 1 / 255,
+        do_normalize: bool = True,
+        image_mean: Optional[Union[float, Iterable[float]]] = None,
+        image_std: Optional[Union[float, Iterable[float]]] = None,
+        apply_ocr: bool = True,
+        ocr_lang: Optional[str] = None,
+        tesseract_config: Optional[str] = "",
+        **kwargs,
+    ) -> None:
+        super().__init__(**kwargs)
+        size = size if size is not None else {"height": 224, "width": 224}
+        size = get_size_dict(size)
+
+        self.do_resize = do_resize
+        self.size = size
+        self.resample = resample
+        self.do_rescale = do_rescale
+        self.rescale_factor = rescale_value  # constructor argument is `rescale_value`, attribute is `rescale_factor`
+        self.do_normalize = do_normalize
+        self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN
+        self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD
+        self.apply_ocr = apply_ocr
+        self.ocr_lang = ocr_lang
+        self.tesseract_config = tesseract_config
+
+    # Copied from transformers.models.vit.image_processing_vit.ViTImageProcessor.resize
+    def resize(
+        self,
+        image: np.ndarray,
+        size: dict[str, int],
+        resample: PILImageResampling = PILImageResampling.BILINEAR,
+        data_format: Optional[Union[str, ChannelDimension]] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+        **kwargs,
+    ) -> np.ndarray:
+        """
+        Resize an image to `(size["height"], size["width"])`.
+
+        Args:
+            image (`np.ndarray`):
+                Image to resize.
+            size (`dict[str, int]`):
+                Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image.
+            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
+                `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BILINEAR`.
+            data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the output image. If unset, the channel dimension format of the input
+                image is used. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the input image. If unset, the channel dimension format is inferred
+                from the input image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+
+        Returns:
+            `np.ndarray`: The resized image.
+        """
+        size = get_size_dict(size)
+        if "height" not in size or "width" not in size:
+            raise ValueError(f"The `size` dictionary must contain the keys `height` and `width`. Got {size.keys()}")
+        output_size = (size["height"], size["width"])
+        return resize(
+            image,
+            size=output_size,
+            resample=resample,
+            data_format=data_format,
+            input_data_format=input_data_format,
+            **kwargs,
+        )
+
+    @filter_out_non_signature_kwargs()
+    def preprocess(
+        self,
+        images: ImageInput,
+        do_resize: Optional[bool] = None,
+        size: Optional[dict[str, int]] = None,
+        resample=None,
+        do_rescale: Optional[bool] = None,
+        rescale_factor: Optional[float] = None,
+        do_normalize: Optional[bool] = None,
+        image_mean: Optional[Union[float, Iterable[float]]] = None,
+        image_std: Optional[Union[float, Iterable[float]]] = None,
+        apply_ocr: Optional[bool] = None,
+        ocr_lang: Optional[str] = None,
+        tesseract_config: Optional[str] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        data_format: ChannelDimension = ChannelDimension.FIRST,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    ) -> BatchFeature:
+        """
+        Preprocess an image or batch of images and return the result as a `BatchFeature`.
+
+        Args:
+            images (`ImageInput`):
+                Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
+                passing in images with pixel values between 0 and 1, set `do_rescale=False`.
+            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
+                Whether to resize the image.
+            size (`dict[str, int]`, *optional*, defaults to `self.size`):
+                Desired size of the output image after applying `resize`.
+            resample (`int`, *optional*, defaults to `self.resample`):
+                Resampling filter to use if resizing the image. This can be one of the `PILImageResampling` filters.
+                Only has an effect if `do_resize` is set to `True`.
+            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
+                Whether to rescale the image pixel values between [0, 1].
+            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
+                Rescale factor to apply to the image pixel values. Only has an effect if `do_rescale` is set to `True`.
+            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
+                Whether to normalize the image.
+            image_mean (`float` or `Iterable[float]`, *optional*, defaults to `self.image_mean`):
+                Mean values to be used for normalization. Only has an effect if `do_normalize` is set to `True`.
+            image_std (`float` or `Iterable[float]`, *optional*, defaults to `self.image_std`):
+                Standard deviation values to be used for normalization. Only has an effect if `do_normalize` is set to
+                `True`.
+            apply_ocr (`bool`, *optional*, defaults to `self.apply_ocr`):
+                Whether to apply the Tesseract OCR engine to get words + normalized bounding boxes.
+            ocr_lang (`str`, *optional*, defaults to `self.ocr_lang`):
+                The language, specified by its ISO code, to be used by the Tesseract OCR engine. By default, English is
+                used.
+            tesseract_config (`str`, *optional*, defaults to `self.tesseract_config`):
+                Any additional custom configuration flags that are forwarded to the `config` parameter when calling
+                Tesseract.
+            return_tensors (`str` or `TensorType`, *optional*):
+                The type of tensors to return. Can be one of:
+                    - Unset: Return a list of `np.ndarray`.
+                    - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
+                    - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
+                    - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
+                    - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
+            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
+                The channel dimension format for the output image. Can be one of:
+                    - `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                    - `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the input image. If unset, the channel dimension format is inferred
+                from the input image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+        """
+        do_resize = do_resize if do_resize is not None else self.do_resize
+        size = size if size is not None else self.size
+        size = get_size_dict(size)
+        resample = resample if resample is not None else self.resample
+        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
+        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
+        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
+        image_mean = image_mean if image_mean is not None else self.image_mean
+        image_std = image_std if image_std is not None else self.image_std
+        apply_ocr = apply_ocr if apply_ocr is not None else self.apply_ocr
+        ocr_lang = ocr_lang if ocr_lang is not None else self.ocr_lang
+        tesseract_config = tesseract_config if tesseract_config is not None else self.tesseract_config
+        images = make_list_of_images(images)
+
+        if not valid_images(images):
+            raise ValueError(
+                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
+                "torch.Tensor, tf.Tensor or jax.ndarray."
+            )
+        validate_preprocess_arguments(
+            do_rescale=do_rescale,
+            rescale_factor=rescale_factor,
+            do_normalize=do_normalize,
+            image_mean=image_mean,
+            image_std=image_std,
+            do_resize=do_resize,
+            size=size,
+            resample=resample,
+        )
+
+        # All transformations expect numpy arrays.
+        images = [to_numpy_array(image) for image in images]
+
+        if do_rescale and is_scaled_image(images[0]):
+            logger.warning_once(
+                "It looks like you are trying to rescale already rescaled images. If the input"
+                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
+            )
+
+        if input_data_format is None:
+            # We assume that all images have the same channel dimension format.
+            input_data_format = infer_channel_dimension_format(images[0])
+
+        # Tesseract OCR to get words + normalized bounding boxes (run on the images before any resizing)
+        if apply_ocr:
+            requires_backends(self, "pytesseract")
+            words_batch = []
+            boxes_batch = []
+            for image in images:
+                words, boxes = apply_tesseract(image, ocr_lang, tesseract_config, input_data_format=input_data_format)
+                words_batch.append(words)
+                boxes_batch.append(boxes)
+
+        if do_resize:
+            images = [
+                self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format)
+                for image in images
+            ]
+
+        if do_rescale:
+            images = [
+                self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format)
+                for image in images
+            ]
+
+        if do_normalize:
+            images = [
+                self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format)
+                for image in images
+            ]
+
+        images = [
+            to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images
+        ]
+
+        data = BatchFeature(data={"pixel_values": images}, tensor_type=return_tensors)
+        # `words` and `boxes` are only part of the result when OCR was applied
+        if apply_ocr:
+            data["words"] = words_batch
+            data["boxes"] = boxes_batch
+        return data
+
+
+__all__ = ["LayoutLMv3ImageProcessor"]
diff --git a/phivenv/Lib/site-packages/transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py b/phivenv/Lib/site-packages/transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py
new file mode 100644
index 0000000000000000000000000000000000000000..c7580bb528da3ac8b0558990643753f0862be605
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py
@@ -0,0 +1,158 @@
+# coding=utf-8
+# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Fast Image processor class for LayoutLMv3."""
+
+from typing import Optional, Union
+
+from ...image_processing_utils_fast import BaseImageProcessorFast, BatchFeature, DefaultFastImageProcessorKwargs
+from ...image_transforms import ChannelDimension, group_images_by_shape, reorder_images
+from ...image_utils import IMAGENET_STANDARD_MEAN, IMAGENET_STANDARD_STD, ImageInput, PILImageResampling, SizeDict
+from ...processing_utils import Unpack
+from ...utils import (
+    TensorType,
+    auto_docstring,
+    is_torch_available,
+    is_torchvision_available,
+    is_torchvision_v2_available,
+    logging,
+    requires_backends,
+)
+from .image_processing_layoutlmv3 import apply_tesseract
+
+
+logger = logging.get_logger(__name__)
+
+if is_torch_available():
+    import torch
+
+if is_torchvision_available():
+    if is_torchvision_v2_available():
+        from torchvision.transforms.v2 import functional as F
+    else:
+        from torchvision.transforms import functional as F
+
+
+class LayoutLMv3FastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
+    """
+    Args:
+        apply_ocr (`bool`, *optional*, defaults to `True`):
+            Whether to apply the Tesseract OCR engine to get words + normalized bounding boxes. Can be overridden by
+            the `apply_ocr` parameter in the `preprocess` method.
+        ocr_lang (`str`, *optional*):
+            The language, specified by its ISO code, to be used by the Tesseract OCR engine. By default, English is
+            used. Can be overridden by the `ocr_lang` parameter in the `preprocess` method.
+        tesseract_config (`str`, *optional*):
+            Any additional custom configuration flags that are forwarded to the `config` parameter when calling
+            Tesseract. For example: '--psm 6'. Can be overridden by the `tesseract_config` parameter in the
+            `preprocess` method.
+    """
+
+    apply_ocr: Optional[bool]  # whether Tesseract OCR is run to get words and normalized boxes
+    ocr_lang: Optional[str]  # ISO language code handed to Tesseract
+    tesseract_config: Optional[str]  # extra flags forwarded to Tesseract's `config` parameter, e.g. '--psm 6'
+
+
+@auto_docstring
+class LayoutLMv3ImageProcessorFast(BaseImageProcessorFast):
+    # Class-level defaults for the preprocessing options
+    resample = PILImageResampling.BILINEAR
+    image_mean = IMAGENET_STANDARD_MEAN
+    image_std = IMAGENET_STANDARD_STD
+    size = {"height": 224, "width": 224}
+    do_resize = True
+    do_rescale = True
+    do_normalize = True
+    apply_ocr = True
+    ocr_lang = None
+    tesseract_config = ""
+    valid_kwargs = LayoutLMv3FastImageProcessorKwargs
+
+    def __init__(self, **kwargs: Unpack[LayoutLMv3FastImageProcessorKwargs]):
+        super().__init__(**kwargs)
+
+    @auto_docstring
+    def preprocess(self, images: ImageInput, **kwargs: Unpack[LayoutLMv3FastImageProcessorKwargs]) -> BatchFeature:
+        return super().preprocess(images, **kwargs)
+
+    def _preprocess(
+        self,
+        images: list["torch.Tensor"],
+        do_resize: bool,
+        size: SizeDict,
+        interpolation: Optional["F.InterpolationMode"],
+        do_center_crop: bool,
+        crop_size: SizeDict,
+        do_rescale: bool,
+        rescale_factor: float,
+        do_normalize: bool,
+        image_mean: Optional[Union[float, list[float]]],
+        image_std: Optional[Union[float, list[float]]],
+        apply_ocr: bool,
+        ocr_lang: Optional[str],
+        tesseract_config: Optional[str],
+        return_tensors: Optional[Union[str, TensorType]],
+        disable_grouping: Optional[bool],
+        **kwargs,
+    ) -> BatchFeature:
+        """OCR the images if requested, then resize, crop, rescale and normalize them into a `BatchFeature`."""
+        # Tesseract OCR to get words + normalized bounding boxes
+        ocr_words, ocr_boxes = [], []
+        if apply_ocr:
+            requires_backends(self, "pytesseract")
+            for tensor in images:
+                if tensor.is_cuda:
+                    logger.warning_once(
+                        "apply_ocr can only be performed on cpu. Tensors will be transferred to cpu before processing."
+                    )
+                # OCR sees the image as given, i.e. before any resizing or normalization below
+                words, boxes = apply_tesseract(
+                    tensor.cpu(), ocr_lang, tesseract_config, input_data_format=ChannelDimension.FIRST
+                )
+                ocr_words.append(words)
+                ocr_boxes.append(boxes)
+
+        # Stage 1: optional resize, batched over images that share a shape
+        by_shape, original_order = group_images_by_shape(images, disable_grouping=disable_grouping)
+        resized_by_shape = {
+            shape: self.resize(image=batch, size=size, interpolation=interpolation) if do_resize else batch
+            for shape, batch in by_shape.items()
+        }
+        resized_images = reorder_images(resized_by_shape, original_order)
+
+        # Stage 2: regroup, since shapes can still differ when do_resize is False or resize yields mixed sizes
+        by_shape, original_order = group_images_by_shape(resized_images, disable_grouping=disable_grouping)
+        finished_by_shape = {}
+        for shape, batch in by_shape.items():
+            cropped = self.center_crop(batch, crop_size) if do_center_crop else batch
+            # Fused rescale and normalize
+            finished_by_shape[shape] = self.rescale_and_normalize(
+                cropped, do_rescale, rescale_factor, do_normalize, image_mean, image_std
+            )
+        processed_images = reorder_images(finished_by_shape, original_order)
+
+        # Only stack into one tensor when a tensor type was requested; otherwise keep the list
+        if return_tensors:
+            processed_images = torch.stack(processed_images, dim=0)
+
+        data = BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors)
+
+        if apply_ocr:
+            data["words"] = ocr_words
+            data["boxes"] = ocr_boxes
+
+        return data
+
+
+__all__ = ["LayoutLMv3ImageProcessorFast"]
diff --git a/phivenv/Lib/site-packages/transformers/models/layoutlmv3/modeling_layoutlmv3.py b/phivenv/Lib/site-packages/transformers/models/layoutlmv3/modeling_layoutlmv3.py
new file mode 100644
index 0000000000000000000000000000000000000000..bd8b525bb4275f19a4e28578b61971475e8bc4be
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/layoutlmv3/modeling_layoutlmv3.py
@@ -0,0 +1,1254 @@
+# coding=utf-8
+# Copyright 2022 Microsoft Research and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch LayoutLMv3 model."""
+
+import collections
+import math
+from typing import Optional, Union
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.utils.checkpoint
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+
+from ...activations import ACT2FN
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import (
+    BaseModelOutput,
+    QuestionAnsweringModelOutput,
+    SequenceClassifierOutput,
+    TokenClassifierOutput,
+)
+from ...modeling_utils import PreTrainedModel
+from ...pytorch_utils import apply_chunking_to_forward
+from ...utils import (
+    auto_docstring,
+    logging,
+    torch_int,
+)
+from .configuration_layoutlmv3 import LayoutLMv3Config
+
+
+logger = logging.get_logger(__name__)
+
+
+class LayoutLMv3PatchEmbeddings(nn.Module):
+    """LayoutLMv3 image (patch) embeddings. This class also automatically interpolates the position embeddings for varying
+    image sizes."""
+
+    def __init__(self, config):
+        super().__init__()
+
+        image_size = (
+            config.input_size
+            if isinstance(config.input_size, collections.abc.Iterable)
+            else (config.input_size, config.input_size)
+        )
+        patch_size = (
+            config.patch_size
+            if isinstance(config.patch_size, collections.abc.Iterable)
+            else (config.patch_size, config.patch_size)
+        )
+        self.patch_shape = (image_size[0] // patch_size[0], image_size[1] // patch_size[1])
+        self.proj = nn.Conv2d(config.num_channels, config.hidden_size, kernel_size=patch_size, stride=patch_size)
+
+    def forward(self, pixel_values, position_embedding=None):
+        embeddings = self.proj(pixel_values)
+
+        if position_embedding is not None:
+            # interpolate the position embedding to the corresponding size
+            position_embedding = position_embedding.view(1, self.patch_shape[0], self.patch_shape[1], -1)
+            position_embedding = position_embedding.permute(0, 3, 1, 2)
+            patch_height, patch_width = embeddings.shape[2], embeddings.shape[3]
+            position_embedding = F.interpolate(position_embedding, size=(patch_height, patch_width), mode="bicubic")
+            embeddings = embeddings + position_embedding
+
+        embeddings = embeddings.flatten(2).transpose(1, 2)
+        return embeddings
+
+
+class LayoutLMv3TextEmbeddings(nn.Module):
+    """
+    LayoutLMv3 text embeddings. Same as `RobertaEmbeddings` but with added spatial (layout) embeddings.
+    """
+
+    def __init__(self, config):
+        super().__init__()
+        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
+        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
+
+        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
+        self.register_buffer(
+            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
+        )
+
+        self.padding_idx = config.pad_token_id
+        self.position_embeddings = nn.Embedding(
+            config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx
+        )
+
+        self.x_position_embeddings = nn.Embedding(config.max_2d_position_embeddings, config.coordinate_size)
+        self.y_position_embeddings = nn.Embedding(config.max_2d_position_embeddings, config.coordinate_size)
+        self.h_position_embeddings = nn.Embedding(config.max_2d_position_embeddings, config.shape_size)
+        self.w_position_embeddings = nn.Embedding(config.max_2d_position_embeddings, config.shape_size)
+
+    def calculate_spatial_position_embeddings(self, bbox):
+        try:
+            left_position_embeddings = self.x_position_embeddings(bbox[:, :, 0])
+            upper_position_embeddings = self.y_position_embeddings(bbox[:, :, 1])
+            right_position_embeddings = self.x_position_embeddings(bbox[:, :, 2])
+            lower_position_embeddings = self.y_position_embeddings(bbox[:, :, 3])
+        except IndexError as e:
+            raise IndexError("The `bbox` coordinate values should be within 0-1000 range.") from e
+
+        h_position_embeddings = self.h_position_embeddings(torch.clip(bbox[:, :, 3] - bbox[:, :, 1], 0, 1023))
+        w_position_embeddings = self.w_position_embeddings(torch.clip(bbox[:, :, 2] - bbox[:, :, 0], 0, 1023))
+
+        # below is the difference between LayoutLMEmbeddingsV2 (torch.cat) and LayoutLMEmbeddingsV1 (add)
+        spatial_position_embeddings = torch.cat(
+            [
+                left_position_embeddings,
+                upper_position_embeddings,
+                right_position_embeddings,
+                lower_position_embeddings,
+                h_position_embeddings,
+                w_position_embeddings,
+            ],
+            dim=-1,
+        )
+        return spatial_position_embeddings
+
+    def create_position_ids_from_input_ids(self, input_ids, padding_idx):
+        """
+        Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding
+        symbols are ignored. This is modified from fairseq's `utils.make_positions`.
+        """
+        # The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA.
+        mask = input_ids.ne(padding_idx).int()
+        incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask)) * mask
+        return incremental_indices.long() + padding_idx
+
+    def create_position_ids_from_inputs_embeds(self, inputs_embeds):
+        """
+        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.
+        """
+        input_shape = inputs_embeds.size()[:-1]
+        sequence_length = input_shape[1]
+
+        position_ids = torch.arange(
+            self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
+        )
+        return position_ids.unsqueeze(0).expand(input_shape)
+
+    def forward(
+        self,
+        input_ids=None,
+        bbox=None,
+        token_type_ids=None,
+        position_ids=None,
+        inputs_embeds=None,
+    ):
+        if position_ids is None:
+            if input_ids is not None:
+                # Create the position ids from the input token ids. Any padded tokens remain padded.
+                position_ids = self.create_position_ids_from_input_ids(input_ids, self.padding_idx).to(
+                    input_ids.device
+                )
+            else:
+                position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds)
+
+        if input_ids is not None:
+            input_shape = input_ids.size()
+        else:
+            input_shape = inputs_embeds.size()[:-1]
+
+        if token_type_ids is None:
+            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)
+
+        if inputs_embeds is None:
+            inputs_embeds = self.word_embeddings(input_ids)
+        token_type_embeddings = self.token_type_embeddings(token_type_ids)
+
+        embeddings = inputs_embeds + token_type_embeddings
+        position_embeddings = self.position_embeddings(position_ids)
+        embeddings += position_embeddings
+
+        spatial_position_embeddings = self.calculate_spatial_position_embeddings(bbox)
+
+        embeddings = embeddings + spatial_position_embeddings
+
+        embeddings = self.LayerNorm(embeddings)
+        embeddings = self.dropout(embeddings)
+        return embeddings
+
+
+@auto_docstring
+class LayoutLMv3PreTrainedModel(PreTrainedModel):
+    config: LayoutLMv3Config
+    base_model_prefix = "layoutlmv3"
+
+    def _init_weights(self, module):
+        """Initialize the weights"""
+        if isinstance(module, (nn.Linear, nn.Conv2d)):
+            # Slightly different from the TF version which uses truncated_normal for initialization
+            # cf https://github.com/pytorch/pytorch/pull/5617
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+        elif isinstance(module, LayoutLMv3Model):
+            if self.config.visual_embed:
+                module.cls_token.data.zero_()
+                module.pos_embed.data.zero_()
+
+
+class LayoutLMv3SelfAttention(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
+            raise ValueError(
+                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
+                f"heads ({config.num_attention_heads})"
+            )
+
+        self.num_attention_heads = config.num_attention_heads
+        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
+        self.all_head_size = self.num_attention_heads * self.attention_head_size
+
+        self.query = nn.Linear(config.hidden_size, self.all_head_size)
+        self.key = nn.Linear(config.hidden_size, self.all_head_size)
+        self.value = nn.Linear(config.hidden_size, self.all_head_size)
+
+        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
+        self.has_relative_attention_bias = config.has_relative_attention_bias
+        self.has_spatial_attention_bias = config.has_spatial_attention_bias
+
+    def cogview_attention(self, attention_scores, alpha=32):
+        """
+        https://huggingface.co/papers/2105.13290 Section 2.4 Stabilization of training: Precision Bottleneck Relaxation
+        (PB-Relax). A replacement of the original nn.Softmax(dim=-1)(attention_scores). Seems the new attention_probs
+        will result in a slower speed and a little bias. Can use torch.allclose(standard_attention_probs,
+        cogview_attention_probs, atol=1e-08) for comparison. The smaller atol (e.g., 1e-08), the better.
+        """
+        scaled_attention_scores = attention_scores / alpha
+        max_value = scaled_attention_scores.amax(dim=(-1)).unsqueeze(-1)
+        new_attention_scores = (scaled_attention_scores - max_value) * alpha
+        return nn.Softmax(dim=-1)(new_attention_scores)
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask=None,
+        head_mask=None,
+        output_attentions=False,
+        rel_pos=None,
+        rel_2d_pos=None,
+    ):
+        batch_size, seq_length, _ = hidden_states.shape
+        query_layer = (
+            self.query(hidden_states)
+            .view(batch_size, -1, self.num_attention_heads, self.attention_head_size)
+            .transpose(1, 2)
+        )
+        key_layer = (
+            self.key(hidden_states)
+            .view(batch_size, -1, self.num_attention_heads, self.attention_head_size)
+            .transpose(1, 2)
+        )
+        value_layer = (
+            self.value(hidden_states)
+            .view(batch_size, -1, self.num_attention_heads, self.attention_head_size)
+            .transpose(1, 2)
+        )
+
+        # Take the dot product between "query" and "key" to get the raw attention scores.
+        # The attention scores QT K/√d could be significantly larger than input elements, and result in overflow.
+        # Changing the computational order into QT(K/√d) alleviates the problem. (https://huggingface.co/papers/2105.13290)
+        attention_scores = torch.matmul(query_layer / math.sqrt(self.attention_head_size), key_layer.transpose(-1, -2))
+
+        if self.has_relative_attention_bias and self.has_spatial_attention_bias:
+            attention_scores += (rel_pos + rel_2d_pos) / math.sqrt(self.attention_head_size)
+        elif self.has_relative_attention_bias:
+            attention_scores += rel_pos / math.sqrt(self.attention_head_size)
+
+        if attention_mask is not None:
+            # Apply the attention mask is (precomputed for all layers in RobertaModel forward() function)
+            attention_scores = attention_scores + attention_mask
+
+        # Normalize the attention scores to probabilities.
+        # Use the trick of the CogView paper to stabilize training
+        attention_probs = self.cogview_attention(attention_scores)
+
+        # This is actually dropping out entire tokens to attend to, which might
+        # seem a bit unusual, but is taken from the original Transformer paper.
+        attention_probs = self.dropout(attention_probs)
+
+        # Mask heads if we want to
+        if head_mask is not None:
+            attention_probs = attention_probs * head_mask
+
+        context_layer = torch.matmul(attention_probs, value_layer)
+
+        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
+        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
+        context_layer = context_layer.view(*new_context_layer_shape)
+
+        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
+
+        return outputs
+
+
+# Copied from transformers.models.roberta.modeling_roberta.RobertaSelfOutput
+class LayoutLMv3SelfOutput(nn.Module):  # post-attention block: dense -> dropout -> residual add -> LayerNorm
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)  # projection that keeps the hidden width
+        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = self.LayerNorm(hidden_states + input_tensor)  # `input_tensor` is added as a residual
+        return hidden_states
+
+
+# Copied from transformers.models.layoutlmv2.modeling_layoutlmv2.LayoutLMv2Attention with LayoutLMv2->LayoutLMv3
+class LayoutLMv3Attention(nn.Module):  # self-attention followed by its output (residual + LayerNorm) block
+    def __init__(self, config):
+        super().__init__()
+        self.self = LayoutLMv3SelfAttention(config)
+        self.output = LayoutLMv3SelfOutput(config)
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask=None,
+        head_mask=None,
+        output_attentions=False,
+        rel_pos=None,  # forwarded unchanged to the self-attention module
+        rel_2d_pos=None,  # forwarded unchanged to the self-attention module
+    ):
+        self_outputs = self.self(
+            hidden_states,
+            attention_mask,
+            head_mask,
+            output_attentions,
+            rel_pos=rel_pos,
+            rel_2d_pos=rel_2d_pos,
+        )
+        attention_output = self.output(self_outputs[0], hidden_states)  # `hidden_states` is the residual input
+        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
+        return outputs
+
+
+# Copied from transformers.models.layoutlmv2.modeling_layoutlmv2.LayoutLMv2Layer with LayoutLMv2->LayoutLMv3
+class LayoutLMv3Layer(GradientCheckpointingLayer):  # one encoder block: attention followed by feed-forward
+    def __init__(self, config):
+        super().__init__()
+        self.chunk_size_feed_forward = config.chunk_size_feed_forward
+        self.seq_len_dim = 1  # handed to `apply_chunking_to_forward` below (presumably the axis to chunk along)
+        self.attention = LayoutLMv3Attention(config)
+        self.intermediate = LayoutLMv3Intermediate(config)
+        self.output = LayoutLMv3Output(config)
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask=None,
+        head_mask=None,
+        output_attentions=False,
+        rel_pos=None,
+        rel_2d_pos=None,
+    ):
+        self_attention_outputs = self.attention(
+            hidden_states,
+            attention_mask,
+            head_mask,
+            output_attentions=output_attentions,
+            rel_pos=rel_pos,
+            rel_2d_pos=rel_2d_pos,
+        )
+        attention_output = self_attention_outputs[0]
+
+        outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights
+
+        layer_output = apply_chunking_to_forward(
+            self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
+        )
+        outputs = (layer_output,) + outputs  # layer output first, attention weights (if any) after it
+
+        return outputs
+
+    def feed_forward_chunk(self, attention_output):  # intermediate + output sub-layers applied to the given input
+        intermediate_output = self.intermediate(attention_output)
+        layer_output = self.output(intermediate_output, attention_output)  # `attention_output` is the residual
+        return layer_output
+
+
+class LayoutLMv3Encoder(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.layer = nn.ModuleList([LayoutLMv3Layer(config) for _ in range(config.num_hidden_layers)])
+        self.gradient_checkpointing = False
+
+        self.has_relative_attention_bias = config.has_relative_attention_bias
+        self.has_spatial_attention_bias = config.has_spatial_attention_bias
+
+        if self.has_relative_attention_bias:
+            self.rel_pos_bins = config.rel_pos_bins
+            self.max_rel_pos = config.max_rel_pos
+            self.rel_pos_bias = nn.Linear(self.rel_pos_bins, config.num_attention_heads, bias=False)
+
+        if self.has_spatial_attention_bias:
+            self.max_rel_2d_pos = config.max_rel_2d_pos
+            self.rel_2d_pos_bins = config.rel_2d_pos_bins
+            self.rel_pos_x_bias = nn.Linear(self.rel_2d_pos_bins, config.num_attention_heads, bias=False)
+            self.rel_pos_y_bias = nn.Linear(self.rel_2d_pos_bins, config.num_attention_heads, bias=False)
+
+    def relative_position_bucket(self, relative_position, bidirectional=True, num_buckets=32, max_distance=128):
+        ret = 0
+        if bidirectional:
+            num_buckets //= 2
+            ret += (relative_position > 0).long() * num_buckets
+            n = torch.abs(relative_position)
+        else:
+            n = torch.max(-relative_position, torch.zeros_like(relative_position))
+        # now n is in the range [0, inf)
+
+        # half of the buckets are for exact increments in positions
+        max_exact = num_buckets // 2
+        is_small = n < max_exact
+
+        # The other half of the buckets are for logarithmically bigger bins in positions up to max_distance
+        val_if_large = max_exact + (
+            torch.log(n.float() / max_exact) / math.log(max_distance / max_exact) * (num_buckets - max_exact)
+        ).to(torch.long)
+        val_if_large = torch.min(val_if_large, torch.full_like(val_if_large, num_buckets - 1))
+
+        ret += torch.where(is_small, n, val_if_large)
+        return ret
+
+    def _cal_1d_pos_emb(self, position_ids):
+        rel_pos_mat = position_ids.unsqueeze(-2) - position_ids.unsqueeze(-1)
+
+        rel_pos = self.relative_position_bucket(
+            rel_pos_mat,
+            num_buckets=self.rel_pos_bins,
+            max_distance=self.max_rel_pos,
+        )
+        # Since this is a simple indexing operation that is independent of the input,
+        # no need to track gradients for this operation
+        #
+        # Without this no_grad context, training speed slows down significantly
+        with torch.no_grad():
+            rel_pos = self.rel_pos_bias.weight.t()[rel_pos].permute(0, 3, 1, 2)
+        rel_pos = rel_pos.contiguous()
+        return rel_pos
+
+    def _cal_2d_pos_emb(self, bbox):
+        position_coord_x = bbox[:, :, 0]
+        position_coord_y = bbox[:, :, 3]
+        rel_pos_x_2d_mat = position_coord_x.unsqueeze(-2) - position_coord_x.unsqueeze(-1)
+        rel_pos_y_2d_mat = position_coord_y.unsqueeze(-2) - position_coord_y.unsqueeze(-1)
+        rel_pos_x = self.relative_position_bucket(
+            rel_pos_x_2d_mat,
+            num_buckets=self.rel_2d_pos_bins,
+            max_distance=self.max_rel_2d_pos,
+        )
+        rel_pos_y = self.relative_position_bucket(
+            rel_pos_y_2d_mat,
+            num_buckets=self.rel_2d_pos_bins,
+            max_distance=self.max_rel_2d_pos,
+        )
+        # Since this is a simple indexing operation that is independent of the input,
+        # no need to track gradients for this operation
+        #
+        # Without this no_grad context, training speed slows down significantly
+        with torch.no_grad():
+            rel_pos_x = self.rel_pos_x_bias.weight.t()[rel_pos_x].permute(0, 3, 1, 2)
+            rel_pos_y = self.rel_pos_y_bias.weight.t()[rel_pos_y].permute(0, 3, 1, 2)
+        rel_pos_x = rel_pos_x.contiguous()
+        rel_pos_y = rel_pos_y.contiguous()
+        rel_2d_pos = rel_pos_x + rel_pos_y
+        return rel_2d_pos
+
+    def forward(
+        self,
+        hidden_states,
+        bbox=None,
+        attention_mask=None,
+        head_mask=None,
+        output_attentions=False,
+        output_hidden_states=False,
+        return_dict=True,
+        position_ids=None,
+        patch_height=None,
+        patch_width=None,
+    ):
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attentions = () if output_attentions else None
+
+        rel_pos = self._cal_1d_pos_emb(position_ids) if self.has_relative_attention_bias else None
+        rel_2d_pos = self._cal_2d_pos_emb(bbox) if self.has_spatial_attention_bias else None
+
+        for i, layer_module in enumerate(self.layer):
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)
+
+            layer_head_mask = head_mask[i] if head_mask is not None else None
+
+            layer_outputs = layer_module(
+                hidden_states,
+                attention_mask,
+                layer_head_mask,
+                output_attentions,
+                rel_pos=rel_pos,
+                rel_2d_pos=rel_2d_pos,
+            )
+
+            hidden_states = layer_outputs[0]
+            if output_attentions:
+                all_self_attentions = all_self_attentions + (layer_outputs[1],)
+
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+
+        if not return_dict:
+            return tuple(
+                v
+                for v in [
+                    hidden_states,
+                    all_hidden_states,
+                    all_self_attentions,
+                ]
+                if v is not None
+            )
+        return BaseModelOutput(
+            last_hidden_state=hidden_states,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attentions,
+        )
+
+
+# Copied from transformers.models.roberta.modeling_roberta.RobertaIntermediate
+class LayoutLMv3Intermediate(nn.Module):  # feed-forward expansion: dense to `intermediate_size` + activation
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
+        if isinstance(config.hidden_act, str):
+            self.intermediate_act_fn = ACT2FN[config.hidden_act]  # activation looked up by name
+        else:
+            self.intermediate_act_fn = config.hidden_act  # otherwise the configured object is used as is
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.intermediate_act_fn(hidden_states)
+        return hidden_states
+
+
+# Copied from transformers.models.roberta.modeling_roberta.RobertaOutput
+class LayoutLMv3Output(nn.Module):  # feed-forward contraction: dense -> dropout -> residual add -> LayerNorm
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)  # back to the hidden width
+        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = self.LayerNorm(hidden_states + input_tensor)  # `input_tensor` is added as a residual
+        return hidden_states
+
+
+@auto_docstring
+class LayoutLMv3Model(LayoutLMv3PreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.config = config
+
+        if config.text_embed:
+            self.embeddings = LayoutLMv3TextEmbeddings(config)
+
+        if config.visual_embed:
+            # use the default pre-training parameters for fine-tuning (e.g., input_size)
+            # when the input_size is larger in fine-tuning, we will interpolate the position embeddings in forward
+            self.patch_embed = LayoutLMv3PatchEmbeddings(config)
+
+            size = int(config.input_size / config.patch_size)
+            self.cls_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
+            self.pos_embed = nn.Parameter(torch.zeros(1, size * size + 1, config.hidden_size))
+            self.pos_drop = nn.Dropout(p=0.0)
+
+            self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+            self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+            if self.config.has_relative_attention_bias or self.config.has_spatial_attention_bias:
+                self.init_visual_bbox(image_size=(size, size))
+
+            self.norm = nn.LayerNorm(config.hidden_size, eps=1e-6)
+
+        self.encoder = LayoutLMv3Encoder(config)
+
+        self.init_weights()
+
+    def get_input_embeddings(self):  # `self.embeddings` is only created when `config.text_embed` is set
+        return self.embeddings.word_embeddings
+
+    def set_input_embeddings(self, value):  # replaces the word embedding table of the text embeddings
+        self.embeddings.word_embeddings = value
+
+    def _prune_heads(self, heads_to_prune):
+        """
+        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer}. See base
+        class PreTrainedModel. NOTE(review): `LayoutLMv3Attention` in this file defines no `prune_heads` — confirm.
+        """
+        for layer, heads in heads_to_prune.items():
+            self.encoder.layer[layer].attention.prune_heads(heads)
+
+    def init_visual_bbox(self, image_size=(14, 14), max_len=1000):
+        """
+        Create the bounding boxes for the visual (patch) tokens.
+        """
+        visual_bbox_x = torch.div(
+            torch.arange(0, max_len * (image_size[1] + 1), max_len), image_size[1], rounding_mode="trunc"
+        )
+        visual_bbox_y = torch.div(
+            torch.arange(0, max_len * (image_size[0] + 1), max_len), image_size[0], rounding_mode="trunc"
+        )
+        visual_bbox = torch.stack(
+            [
+                visual_bbox_x[:-1].repeat(image_size[0], 1),
+                visual_bbox_y[:-1].repeat(image_size[1], 1).transpose(0, 1),
+                visual_bbox_x[1:].repeat(image_size[0], 1),
+                visual_bbox_y[1:].repeat(image_size[1], 1).transpose(0, 1),
+            ],
+            dim=-1,
+        ).view(-1, 4)
+
+        cls_token_box = torch.tensor([[0 + 1, 0 + 1, max_len - 1, max_len - 1]])
+        self.visual_bbox = torch.cat([cls_token_box, visual_bbox], dim=0)
+
+    def calculate_visual_bbox(self, device, dtype, batch_size):
+        visual_bbox = self.visual_bbox.repeat(batch_size, 1, 1)
+        visual_bbox = visual_bbox.to(device).type(dtype)
+        return visual_bbox
+
+    def forward_image(self, pixel_values):
+        embeddings = self.patch_embed(pixel_values)
+
+        # add [CLS] token
+        batch_size, seq_len, _ = embeddings.size()
+        cls_tokens = self.cls_token.expand(batch_size, -1, -1)
+        embeddings = torch.cat((cls_tokens, embeddings), dim=1)
+
+        # add position embeddings
+        if self.pos_embed is not None:
+            embeddings = embeddings + self.pos_embed
+
+        embeddings = self.pos_drop(embeddings)
+        embeddings = self.norm(embeddings)
+
+        return embeddings
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        bbox: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, BaseModelOutput]:
+        r"""
+        input_ids (`torch.LongTensor` of shape `(batch_size, token_sequence_length)`):
+            Indices of input sequence tokens in the vocabulary.
+
+            Note that `sequence_length = token_sequence_length + patch_sequence_length + 1` where `1` is for [CLS]
+            token. See `pixel_values` for `patch_sequence_length`.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        bbox (`torch.LongTensor` of shape `(batch_size, token_sequence_length, 4)`, *optional*):
+            Bounding boxes of each input sequence tokens. Selected in the range `[0,
+            config.max_2d_position_embeddings-1]`. Each bounding box should be a normalized version in (x0, y0, x1, y1)
+            format, where (x0, y0) corresponds to the position of the upper left corner in the bounding box, and (x1,
+            y1) represents the position of the lower right corner.
+
+            Note that `sequence_length = token_sequence_length + patch_sequence_length + 1` where `1` is for [CLS]
+            token. See `pixel_values` for `patch_sequence_length`.
+        token_type_ids (`torch.LongTensor` of shape `(batch_size, token_sequence_length)`, *optional*):
+            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
+            1]`:
+
+            - 0 corresponds to a *sentence A* token,
+            - 1 corresponds to a *sentence B* token.
+
+            Note that `sequence_length = token_sequence_length + patch_sequence_length + 1` where `1` is for [CLS]
+            token. See `pixel_values` for `patch_sequence_length`.
+
+            [What are token type IDs?](../glossary#token-type-ids)
+        position_ids (`torch.LongTensor` of shape `(batch_size, token_sequence_length)`, *optional*):
+            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+            config.max_position_embeddings - 1]`.
+
+            Note that `sequence_length = token_sequence_length + patch_sequence_length + 1` where `1` is for [CLS]
+            token. See `pixel_values` for `patch_sequence_length`.
+
+            [What are position IDs?](../glossary#position-ids)
+        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, token_sequence_length, hidden_size)`, *optional*):
+            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+            is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
+            model's internal embedding lookup matrix.
+
+        Examples:
+
+        ```python
+        >>> from transformers import AutoProcessor, AutoModel
+        >>> from datasets import load_dataset
+
+        >>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)
+        >>> model = AutoModel.from_pretrained("microsoft/layoutlmv3-base")
+
+        >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
+        >>> example = dataset[0]
+        >>> image = example["image"]
+        >>> words = example["tokens"]
+        >>> boxes = example["bboxes"]
+
+        >>> encoding = processor(image, words, boxes=boxes, return_tensors="pt")
+
+        >>> outputs = model(**encoding)
+        >>> last_hidden_states = outputs.last_hidden_state
+        ```"""
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if input_ids is not None:
+            input_shape = input_ids.size()
+            batch_size, seq_length = input_shape
+            device = input_ids.device
+        elif inputs_embeds is not None:
+            input_shape = inputs_embeds.size()[:-1]
+            batch_size, seq_length = input_shape
+            device = inputs_embeds.device
+        elif pixel_values is not None:
+            batch_size = len(pixel_values)
+            device = pixel_values.device
+        else:
+            raise ValueError("You have to specify either input_ids or inputs_embeds or pixel_values")
+
+        if input_ids is not None or inputs_embeds is not None:
+            if attention_mask is None:
+                attention_mask = torch.ones(((batch_size, seq_length)), device=device)
+            if token_type_ids is None:
+                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
+            if bbox is None:
+                bbox = torch.zeros(tuple(list(input_shape) + [4]), dtype=torch.long, device=device)
+
+            embedding_output = self.embeddings(
+                input_ids=input_ids,
+                bbox=bbox,
+                position_ids=position_ids,
+                token_type_ids=token_type_ids,
+                inputs_embeds=inputs_embeds,
+            )
+
+        final_bbox = final_position_ids = None
+        patch_height = patch_width = None
+        if pixel_values is not None:
+            patch_height, patch_width = (
+                torch_int(pixel_values.shape[2] / self.config.patch_size),
+                torch_int(pixel_values.shape[3] / self.config.patch_size),
+            )
+            visual_embeddings = self.forward_image(pixel_values)
+            visual_attention_mask = torch.ones(
+                (batch_size, visual_embeddings.shape[1]), dtype=torch.long, device=device
+            )
+            if attention_mask is not None:
+                attention_mask = torch.cat([attention_mask, visual_attention_mask], dim=1)
+            else:
+                attention_mask = visual_attention_mask
+
+            if self.config.has_relative_attention_bias or self.config.has_spatial_attention_bias:
+                if self.config.has_spatial_attention_bias:
+                    visual_bbox = self.calculate_visual_bbox(device, dtype=torch.long, batch_size=batch_size)
+                    if bbox is not None:
+                        final_bbox = torch.cat([bbox, visual_bbox], dim=1)
+                    else:
+                        final_bbox = visual_bbox
+
+                visual_position_ids = torch.arange(
+                    0, visual_embeddings.shape[1], dtype=torch.long, device=device
+                ).repeat(batch_size, 1)
+                if input_ids is not None or inputs_embeds is not None:
+                    position_ids = torch.arange(0, input_shape[1], device=device).unsqueeze(0)
+                    position_ids = position_ids.expand(input_shape)
+                    final_position_ids = torch.cat([position_ids, visual_position_ids], dim=1)
+                else:
+                    final_position_ids = visual_position_ids
+
+            if input_ids is not None or inputs_embeds is not None:
+                embedding_output = torch.cat([embedding_output, visual_embeddings], dim=1)
+            else:
+                embedding_output = visual_embeddings
+
+            embedding_output = self.LayerNorm(embedding_output)
+            embedding_output = self.dropout(embedding_output)
+        elif self.config.has_relative_attention_bias or self.config.has_spatial_attention_bias:
+            if self.config.has_spatial_attention_bias:
+                final_bbox = bbox
+            if self.config.has_relative_attention_bias:
+                position_ids = self.embeddings.position_ids[:, : input_shape[1]]
+                position_ids = position_ids.expand_as(input_ids)
+                final_position_ids = position_ids
+
+        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(
+            attention_mask, None, device, dtype=embedding_output.dtype
+        )
+
+        # Prepare head mask if needed
+        # 1.0 in head_mask indicate we keep the head
+        # attention_probs has shape bsz x n_heads x N x N
+        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
+        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
+        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
+
+        encoder_outputs = self.encoder(
+            embedding_output,
+            bbox=final_bbox,
+            position_ids=final_position_ids,
+            attention_mask=extended_attention_mask,
+            head_mask=head_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            patch_height=patch_height,
+            patch_width=patch_width,
+        )
+
+        sequence_output = encoder_outputs[0]
+
+        if not return_dict:
+            return (sequence_output,) + encoder_outputs[1:]
+
+        return BaseModelOutput(
+            last_hidden_state=sequence_output,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+        )
+
+
+class LayoutLMv3ClassificationHead(nn.Module):
+    """
+    Sentence-level classification head (reference: RobertaClassificationHead).
+
+    Applies dropout -> dense -> tanh -> dropout -> projection to `config.num_labels` logits; with
+    `pool_feature=True` the dense layer expects inputs of width `3 * config.hidden_size`.
+    """
+
+    def __init__(self, config, pool_feature=False):
+        super().__init__()
+        self.pool_feature = pool_feature
+        dense_in_features = config.hidden_size * 3 if pool_feature else config.hidden_size
+        self.dense = nn.Linear(dense_in_features, config.hidden_size)
+        # An unset `classifier_dropout` falls back to the hidden-state dropout probability.
+        dropout_prob = config.classifier_dropout
+        if dropout_prob is None:
+            dropout_prob = config.hidden_dropout_prob
+        self.dropout = nn.Dropout(dropout_prob)
+        self.out_proj = nn.Linear(config.hidden_size, config.num_labels)
+
+    def forward(self, x):
+        # The same dropout module is applied before the dense layer and again after the tanh.
+        hidden = torch.tanh(self.dense(self.dropout(x)))
+        logits = self.out_proj(self.dropout(hidden))
+        return logits
+
+
+@auto_docstring(
+    custom_intro="""
+    LayoutLMv3 Model with a token classification head on top (a linear layer on top of the final hidden states) e.g.
+    for sequence labeling (information extraction) tasks such as [FUNSD](https://guillaumejaume.github.io/FUNSD/),
+    [SROIE](https://rrc.cvc.uab.es/?ch=13), [CORD](https://github.com/clovaai/cord) and
+    [Kleister-NDA](https://github.com/applicaai/kleister-nda).
+    """
+)
+class LayoutLMv3ForTokenClassification(LayoutLMv3PreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+        self.layoutlmv3 = LayoutLMv3Model(config)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        # Fewer than 10 labels: a single linear layer; otherwise the dense + tanh classification head.
+        if config.num_labels < 10:
+            self.classifier = nn.Linear(config.hidden_size, config.num_labels)
+        else:
+            self.classifier = LayoutLMv3ClassificationHead(config, pool_feature=False)
+
+        self.init_weights()
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        bbox: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        pixel_values: Optional[torch.LongTensor] = None,
+    ) -> Union[tuple, TokenClassifierOutput]:
+        r"""
+        bbox (`torch.LongTensor` of shape `(batch_size, sequence_length, 4)`, *optional*):
+            Bounding boxes of each input sequence tokens. Selected in the range `[0,
+            config.max_2d_position_embeddings-1]`. Each bounding box should be a normalized version in (x0, y0, x1, y1)
+            format, where (x0, y0) corresponds to the position of the upper left corner in the bounding box, and (x1,
+            y1) represents the position of the lower right corner.
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
+
+        Examples:
+
+        ```python
+        >>> from transformers import AutoProcessor, AutoModelForTokenClassification
+        >>> from datasets import load_dataset
+
+        >>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)
+        >>> model = AutoModelForTokenClassification.from_pretrained("microsoft/layoutlmv3-base", num_labels=7)
+
+        >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
+        >>> example = dataset[0]
+        >>> image = example["image"]
+        >>> words = example["tokens"]
+        >>> boxes = example["bboxes"]
+        >>> word_labels = example["ner_tags"]
+
+        >>> encoding = processor(image, words, boxes=boxes, word_labels=word_labels, return_tensors="pt")
+
+        >>> outputs = model(**encoding)
+        >>> loss = outputs.loss
+        >>> logits = outputs.logits
+        ```"""
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # The text length decides which positions are classified; fail early when no text input is given.
+        if input_ids is not None:
+            seq_length = input_ids.size(1)
+        elif inputs_embeds is not None:
+            seq_length = inputs_embeds.size(1)
+        else:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+        outputs = self.layoutlmv3(
+            input_ids,
+            bbox=bbox,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            pixel_values=pixel_values,
+        )
+        # only take the text part of the output representations
+        logits = self.classifier(self.dropout(outputs[0][:, :seq_length]))
+
+        loss = None
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()
+            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+
+        if not return_dict:
+            output = (logits,) + outputs[1:]
+            return ((loss,) + output) if loss is not None else output
+
+        return TokenClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+@auto_docstring
+class LayoutLMv3ForQuestionAnswering(LayoutLMv3PreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+        self.layoutlmv3 = LayoutLMv3Model(config)
+        # `forward` unpacks the head output into exactly two tensors (start/end), so `num_labels` must be 2.
+        self.qa_outputs = LayoutLMv3ClassificationHead(config, pool_feature=False)
+
+        self.init_weights()
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        start_positions: Optional[torch.LongTensor] = None,
+        end_positions: Optional[torch.LongTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        bbox: Optional[torch.LongTensor] = None,
+        pixel_values: Optional[torch.LongTensor] = None,
+    ) -> Union[tuple, QuestionAnsweringModelOutput]:
+        r"""
+        bbox (`torch.LongTensor` of shape `(batch_size, sequence_length, 4)`, *optional*):
+            Bounding boxes of each input sequence tokens. Selected in the range `[0,
+            config.max_2d_position_embeddings-1]`. Each bounding box should be a normalized version in (x0, y0, x1, y1)
+            format, where (x0, y0) corresponds to the position of the upper left corner in the bounding box, and (x1,
+            y1) represents the position of the lower right corner.
+
+        Examples:
+
+        ```python
+        >>> from transformers import AutoProcessor, AutoModelForQuestionAnswering
+        >>> from datasets import load_dataset
+        >>> import torch
+
+        >>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)
+        >>> model = AutoModelForQuestionAnswering.from_pretrained("microsoft/layoutlmv3-base")
+
+        >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
+        >>> example = dataset[0]
+        >>> image = example["image"]
+        >>> question = "what's his name?"
+        >>> words = example["tokens"]
+        >>> boxes = example["bboxes"]
+
+        >>> encoding = processor(image, question, words, boxes=boxes, return_tensors="pt")
+        >>> start_positions = torch.tensor([1])
+        >>> end_positions = torch.tensor([3])
+
+        >>> outputs = model(**encoding, start_positions=start_positions, end_positions=end_positions)
+        >>> loss = outputs.loss
+        >>> start_scores = outputs.start_logits
+        >>> end_scores = outputs.end_logits
+        ```"""
+        if return_dict is None:
+            return_dict = self.config.use_return_dict
+
+        outputs = self.layoutlmv3(
+            input_ids,
+            bbox=bbox,
+            pixel_values=pixel_values,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        # Per-position logits; the last dimension holds the (start, end) scores.
+        logits = self.qa_outputs(outputs[0])
+        start_logits, end_logits = logits.split(1, dim=-1)
+        start_logits = start_logits.squeeze(-1).contiguous()
+        end_logits = end_logits.squeeze(-1).contiguous()
+
+        total_loss = None
+        if start_positions is not None and end_positions is not None:
+            # On multi-GPU the targets may carry an extra trailing dimension; drop it.
+            if start_positions.dim() > 1:
+                start_positions = start_positions.squeeze(-1)
+            if end_positions.dim() > 1:
+                end_positions = end_positions.squeeze(-1)
+            # Targets past the end of the sequence are clamped to the sequence length, which doubles as
+            # `ignore_index`, so those terms are left out of the loss.
+            seq_len = start_logits.size(1)
+            start_positions = start_positions.clamp(0, seq_len)
+            end_positions = end_positions.clamp(0, seq_len)
+
+            criterion = CrossEntropyLoss(ignore_index=seq_len)
+            start_loss = criterion(start_logits, start_positions)
+            end_loss = criterion(end_logits, end_positions)
+            total_loss = (start_loss + end_loss) / 2
+
+        if not return_dict:
+            output = (start_logits, end_logits, *outputs[1:])
+            return output if total_loss is None else (total_loss, *output)
+
+        return QuestionAnsweringModelOutput(
+            loss=total_loss,
+            start_logits=start_logits,
+            end_logits=end_logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    LayoutLMv3 Model with a sequence classification head on top (a linear layer on top of the final hidden state of the
+    [CLS] token) e.g. for document image classification tasks such as the
+    [RVL-CDIP](https://www.cs.cmu.edu/~aharley/rvl-cdip/) dataset.
+    """
+)
+class LayoutLMv3ForSequenceClassification(LayoutLMv3PreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+        self.config = config
+        self.layoutlmv3 = LayoutLMv3Model(config)
+        self.classifier = LayoutLMv3ClassificationHead(config, pool_feature=False)
+
+        self.init_weights()
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        bbox: Optional[torch.LongTensor] = None,
+        pixel_values: Optional[torch.LongTensor] = None,
+    ) -> Union[tuple, SequenceClassifierOutput]:
+        r"""
+        bbox (`torch.LongTensor` of shape `(batch_size, sequence_length, 4)`, *optional*):
+            Bounding boxes of each input sequence tokens. Selected in the range `[0,
+            config.max_2d_position_embeddings-1]`. Each bounding box should be a normalized version in (x0, y0, x1, y1)
+            format, where (x0, y0) corresponds to the position of the upper left corner in the bounding box, and (x1,
+            y1) represents the position of the lower right corner.
+
+        Examples:
+
+        ```python
+        >>> from transformers import AutoProcessor, AutoModelForSequenceClassification
+        >>> from datasets import load_dataset
+        >>> import torch
+
+        >>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)
+        >>> model = AutoModelForSequenceClassification.from_pretrained("microsoft/layoutlmv3-base")
+
+        >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
+        >>> example = dataset[0]
+        >>> image = example["image"]
+        >>> words = example["tokens"]
+        >>> boxes = example["bboxes"]
+
+        >>> encoding = processor(image, words, boxes=boxes, return_tensors="pt")
+        >>> sequence_label = torch.tensor([1])
+
+        >>> outputs = model(**encoding, labels=sequence_label)
+        >>> loss = outputs.loss
+        >>> logits = outputs.logits
+        ```"""
+        return_dict = self.config.use_return_dict if return_dict is None else return_dict
+
+        outputs = self.layoutlmv3(
+            input_ids,
+            bbox=bbox,
+            pixel_values=pixel_values,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        # Classify from the hidden state at sequence position 0 (the [CLS] token).
+        logits = self.classifier(outputs[0][:, 0, :])
+
+        loss = None
+        if labels is not None:
+            # No configured problem type: infer it from the label count and dtype and store it on the config.
+            if self.config.problem_type is None:
+                if self.num_labels == 1:
+                    self.config.problem_type = "regression"
+                elif self.num_labels > 1 and labels.dtype in (torch.long, torch.int):
+                    self.config.problem_type = "single_label_classification"
+                else:
+                    self.config.problem_type = "multi_label_classification"
+
+            problem_type = self.config.problem_type
+            if problem_type == "regression":
+                # Single-target regression compares the squeezed logits and labels.
+                if self.num_labels == 1:
+                    loss = MSELoss()(logits.squeeze(), labels.squeeze())
+                else:
+                    loss = MSELoss()(logits, labels)
+            elif problem_type == "single_label_classification":
+                loss = CrossEntropyLoss()(logits.view(-1, self.num_labels), labels.view(-1))
+            elif problem_type == "multi_label_classification":
+                loss = BCEWithLogitsLoss()(logits, labels)
+
+        if not return_dict:
+            output = (logits, *outputs[1:])
+            return output if loss is None else (loss, *output)
+
+        return SequenceClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+__all__ = [  # public names of this module, i.e. what `from <module> import *` exposes
+    "LayoutLMv3ForQuestionAnswering",
+    "LayoutLMv3ForSequenceClassification",
+    "LayoutLMv3ForTokenClassification",
+    "LayoutLMv3Model",
+    "LayoutLMv3PreTrainedModel",
+]
diff --git a/phivenv/Lib/site-packages/transformers/models/layoutlmv3/modeling_tf_layoutlmv3.py b/phivenv/Lib/site-packages/transformers/models/layoutlmv3/modeling_tf_layoutlmv3.py
new file mode 100644
index 0000000000000000000000000000000000000000..c0586d58835ee372bf2c307564fc8593c3c3b57e
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/layoutlmv3/modeling_tf_layoutlmv3.py
@@ -0,0 +1,1767 @@
+# coding=utf-8
+# Copyright 2022 Microsoft Research and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""TF 2.0 LayoutLMv3 model."""
+
+from __future__ import annotations
+
+import collections
+import math
+
+import tensorflow as tf
+
+from ...activations_tf import get_tf_activation
+from ...modeling_tf_outputs import (
+    TFBaseModelOutput,
+    TFQuestionAnsweringModelOutput,
+    TFSequenceClassifierOutput,
+    TFTokenClassifierOutput,
+)
+from ...modeling_tf_utils import (
+    TFPreTrainedModel,
+    TFQuestionAnsweringLoss,
+    TFSequenceClassificationLoss,
+    TFTokenClassificationLoss,
+    get_initializer,
+    keras,
+    keras_serializable,
+    unpack_inputs,
+)
+from ...tf_utils import check_embeddings_within_bounds
+from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, replace_return_docstrings
+from .configuration_layoutlmv3 import LayoutLMv3Config
+
+
+_CONFIG_FOR_DOC = "LayoutLMv3Config"  # config class name; presumably used by docstring helpers (not shown here)
+
+_DUMMY_INPUT_IDS = [  # 2 rows x 3 token ids; presumably dummy model inputs (usage not visible in this chunk)
+    [7, 6, 1],
+    [1, 2, 0],
+]
+
+_DUMMY_BBOX = [  # 2 x 3 x 4: four values per position, shape-compatible with `_DUMMY_INPUT_IDS`
+    [[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]],
+    [[13, 14, 15, 16], [17, 18, 19, 20], [21, 22, 23, 24]],
+]
+
+
+LARGE_NEGATIVE = -1e8  # NOTE(review): looks like an additive masking value; usage not visible here - confirm
+
+
+class TFLayoutLMv3PatchEmbeddings(keras.layers.Layer):
+    """Turns pixel values into a sequence of patch embeddings using a strided convolution."""
+
+    def __init__(self, config: LayoutLMv3Config, **kwargs):
+        super().__init__(**kwargs)
+        # A non-iterable `patch_size` is used for both kernel dimensions; an iterable is taken as given.
+        if isinstance(config.patch_size, collections.abc.Iterable):
+            patch_sizes = config.patch_size
+        else:
+            patch_sizes = (config.patch_size, config.patch_size)
+        self.proj = keras.layers.Conv2D(
+            filters=config.hidden_size,
+            kernel_size=patch_sizes,
+            strides=patch_sizes,
+            padding="valid",
+            data_format="channels_last",
+            use_bias=True,
+            kernel_initializer=get_initializer(config.initializer_range),
+            name="proj",
+        )
+        self.hidden_size = config.hidden_size
+        self.num_patches = (config.input_size**2) // (patch_sizes[0] * patch_sizes[1])
+        self.config = config
+
+    def call(self, pixel_values: tf.Tensor) -> tf.Tensor:
+        # `keras.layers.Conv2D` doesn't support the `NCHW` format when running on CPU,
+        # so move the channel axis last (`NCHW` -> `NHWC`) before projecting.
+        channels_last = tf.transpose(pixel_values, perm=[0, 2, 3, 1])
+        patch_grid = self.proj(channels_last)
+        # Flatten the projected patches into one sequence axis of `num_patches` entries.
+        target_shape = (-1, self.num_patches, self.hidden_size)
+        return tf.reshape(patch_grid, target_shape)
+
+    def build(self, input_shape=None):
+        # Build only once; later calls are no-ops.
+        if not self.built:
+            self.built = True
+            if getattr(self, "proj", None) is not None:
+                with tf.name_scope(self.proj.name):
+                    self.proj.build([None, None, None, self.config.num_channels])
+
+
+class TFLayoutLMv3TextEmbeddings(keras.layers.Layer):
+    """
+    LayoutLMv3 text embeddings. Same as `RobertaEmbeddings` but with added spatial (layout) embeddings.
+    """
+
+    def __init__(self, config: LayoutLMv3Config, **kwargs):
+        super().__init__(**kwargs)
+
+        def embedding_table(num_rows, width, name):
+            # Each table gets its own initializer instance built from `config.initializer_range`.
+            return keras.layers.Embedding(
+                num_rows,
+                width,
+                embeddings_initializer=get_initializer(config.initializer_range),
+                name=name,
+            )
+
+        # Token, segment and 1D-position tables are all `hidden_size` wide.
+        self.word_embeddings = embedding_table(
+            config.vocab_size,
+            config.hidden_size,
+            "word_embeddings",
+        )
+        self.token_type_embeddings = embedding_table(
+            config.type_vocab_size,
+            config.hidden_size,
+            "token_type_embeddings",
+        )
+        self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
+        self.dropout = keras.layers.Dropout(config.hidden_dropout_prob)
+        self.padding_token_index = config.pad_token_id
+        self.position_embeddings = embedding_table(
+            config.max_position_embeddings,
+            config.hidden_size,
+            "position_embeddings",
+        )
+
+        # Layout tables all have `max_2d_position_embeddings` rows: x/y are `coordinate_size` wide,
+        # h/w are `shape_size` wide.
+        max_2d = config.max_2d_position_embeddings
+        self.x_position_embeddings = embedding_table(
+            max_2d, config.coordinate_size, "x_position_embeddings"
+        )
+        self.y_position_embeddings = embedding_table(
+            max_2d, config.coordinate_size, "y_position_embeddings"
+        )
+        self.h_position_embeddings = embedding_table(
+            max_2d, config.shape_size, "h_position_embeddings"
+        )
+        self.w_position_embeddings = embedding_table(
+            max_2d, config.shape_size, "w_position_embeddings"
+        )
+        self.max_2d_positions = max_2d
+        self.config = config
+
+    def calculate_spatial_position_embeddings(self, bbox: tf.Tensor) -> tf.Tensor:
+        """
+        Embed bounding boxes as the concatenation (last axis) of their left/upper/right/lower coordinate
+        embeddings followed by their height and width embeddings.
+
+        `bbox` is read as `bbox[:, :, k]` with `k` = 0..3 for (left, upper, right, lower).
+        """
+        try:
+            left_ids = bbox[:, :, 0]
+            upper_ids = bbox[:, :, 1]
+            right_ids = bbox[:, :, 2]
+            lower_ids = bbox[:, :, 3]
+        except IndexError as exception:
+            raise IndexError("Bounding box is not of shape (batch_size, seq_length, 4).") from exception
+
+        # The 2D tables are built with `max_2d_positions` rows, so the largest valid id is one less.
+        max_position_id = self.max_2d_positions - 1
+        try:
+            left_embeds = self.x_position_embeddings(left_ids)
+            upper_embeds = self.y_position_embeddings(upper_ids)
+            right_embeds = self.x_position_embeddings(right_ids)
+            lower_embeds = self.y_position_embeddings(lower_ids)
+        except IndexError as exception:
+            raise IndexError(
+                f"The `bbox` coordinate values should be within 0-{max_position_id} range."
+            ) from exception
+
+        # Height and width are clipped to the valid id range before lookup.
+        height_embeds = self.h_position_embeddings(
+            tf.clip_by_value(lower_ids - upper_ids, 0, max_position_id)
+        )
+        width_embeds = self.w_position_embeddings(
+            tf.clip_by_value(right_ids - left_ids, 0, max_position_id)
+        )
+
+        # LayoutLMv1 sums the spatial embeddings, but LayoutLMv3 concatenates them.
+        return tf.concat(
+            [left_embeds, upper_embeds, right_embeds, lower_embeds, height_embeds, width_embeds],
+            axis=-1,
+        )
+
+    def create_position_ids_from_inputs_embeds(self, inputs_embds: tf.Tensor) -> tf.Tensor:
+        """
+        Generate sequential position ids for directly supplied embeddings, where padding cannot be inferred.
+
+        Ids start at `padding_token_index + 1`; the same row of ids is tiled across the batch.
+        """
+        embeds_shape = tf.shape(inputs_embds)
+        batch_size, sequence_length = embeds_shape[0], embeds_shape[1]
+        first_id = self.padding_token_index + 1
+        stop_id = self.padding_token_index + sequence_length + 1
+        # One row of ids, then one copy of that row per batch entry.
+        id_row = tf.range(first_id, stop_id, dtype=tf.int32)
+        id_row = tf.reshape(id_row, (1, sequence_length))
+        return tf.tile(id_row, (batch_size, 1))
+
+    def create_position_ids_from_input_ids(self, input_ids: tf.Tensor) -> tf.Tensor:
+        """
+        Number the non-padding tokens of each row starting at `padding_token_index + 1`; padding tokens keep
+        `padding_token_index` itself.
+        """
+        is_token = tf.cast(tf.not_equal(input_ids, self.padding_token_index), input_ids.dtype)
+        # Running count of real tokens, zeroed again at padding positions, then shifted by the padding index.
+        return tf.cumsum(is_token, axis=1) * is_token + self.padding_token_index
+
+    def create_position_ids(self, input_ids: tf.Tensor, inputs_embeds: tf.Tensor) -> tf.Tensor:
+        if input_ids is None:
+            return self.create_position_ids_from_inputs_embeds(inputs_embeds)
+        else:
+            return self.create_position_ids_from_input_ids(input_ids)
+
+    def call(
+        self,
+        input_ids: tf.Tensor | None = None,
+        bbox: tf.Tensor | None = None,
+        token_type_ids: tf.Tensor | None = None,
+        position_ids: tf.Tensor | None = None,
+        inputs_embeds: tf.Tensor | None = None,
+        training: bool = False,
+    ) -> tf.Tensor:
+        if position_ids is None:
+            position_ids = self.create_position_ids(input_ids, inputs_embeds)
+
+        if input_ids is not None:
+            input_shape = tf.shape(input_ids)
+        else:
+            input_shape = tf.shape(inputs_embeds)[:-1]
+
+        if token_type_ids is None:
+            token_type_ids = tf.zeros(input_shape, dtype=position_ids.dtype)
+
+        if inputs_embeds is None:
+            check_embeddings_within_bounds(input_ids, self.word_embeddings.input_dim)
+            inputs_embeds = self.word_embeddings(input_ids)
+        token_type_embeddings = self.token_type_embeddings(token_type_ids)
+
+        embeddings = inputs_embeds + token_type_embeddings
+        position_embeddings = self.position_embeddings(position_ids)
+        embeddings += position_embeddings
+
+        spatial_position_embeddings = self.calculate_spatial_position_embeddings(bbox)
+
+        embeddings += spatial_position_embeddings
+
+        embeddings = self.LayerNorm(embeddings)
+        embeddings = self.dropout(embeddings, training=training)
+        return embeddings
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "word_embeddings", None) is not None:
+            with tf.name_scope(self.word_embeddings.name):
+                self.word_embeddings.build(None)
+        if getattr(self, "token_type_embeddings", None) is not None:
+            with tf.name_scope(self.token_type_embeddings.name):
+                self.token_type_embeddings.build(None)
+        if getattr(self, "LayerNorm", None) is not None:
+            with tf.name_scope(self.LayerNorm.name):
+                self.LayerNorm.build([None, None, self.config.hidden_size])
+        if getattr(self, "position_embeddings", None) is not None:
+            with tf.name_scope(self.position_embeddings.name):
+                self.position_embeddings.build(None)
+        if getattr(self, "x_position_embeddings", None) is not None:
+            with tf.name_scope(self.x_position_embeddings.name):
+                self.x_position_embeddings.build(None)
+        if getattr(self, "y_position_embeddings", None) is not None:
+            with tf.name_scope(self.y_position_embeddings.name):
+                self.y_position_embeddings.build(None)
+        if getattr(self, "h_position_embeddings", None) is not None:
+            with tf.name_scope(self.h_position_embeddings.name):
+                self.h_position_embeddings.build(None)
+        if getattr(self, "w_position_embeddings", None) is not None:
+            with tf.name_scope(self.w_position_embeddings.name):
+                self.w_position_embeddings.build(None)
+
+
+# Multi-head self-attention layer. Besides the usual query/key/value attention, `call` can add precomputed
+# relative-position biases (`rel_pos`, `rel_2d_pos`) to the scores, and the softmax is computed through
+# `cogview_attention`.
+class TFLayoutLMv3SelfAttention(keras.layers.Layer):
+    def __init__(self, config: LayoutLMv3Config, **kwargs):
+        super().__init__(**kwargs)
+        # The hidden size is split evenly across the heads, so it has to be divisible by the head count.
+        if config.hidden_size % config.num_attention_heads != 0:
+            raise ValueError(
+                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
+                f"heads ({config.num_attention_heads})"
+            )
+
+        self.num_attention_heads = config.num_attention_heads
+        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
+        self.all_head_size = self.num_attention_heads * self.attention_head_size
+        # sqrt(head size); `call` divides both the query and the position biases by this value.
+        self.attention_score_normaliser = math.sqrt(self.attention_head_size)
+
+        self.query = keras.layers.Dense(
+            self.all_head_size,
+            kernel_initializer=get_initializer(config.initializer_range),
+            name="query",
+        )
+        self.key = keras.layers.Dense(
+            self.all_head_size,
+            kernel_initializer=get_initializer(config.initializer_range),
+            name="key",
+        )
+        self.value = keras.layers.Dense(
+            self.all_head_size,
+            kernel_initializer=get_initializer(config.initializer_range),
+            name="value",
+        )
+
+        self.dropout = keras.layers.Dropout(config.attention_probs_dropout_prob)
+        self.has_relative_attention_bias = config.has_relative_attention_bias
+        self.has_spatial_attention_bias = config.has_spatial_attention_bias
+        self.config = config
+
+    # Split the last axis of `x` into (num_attention_heads, attention_head_size) and move the head axis in front
+    # of the sequence axis.
+    def transpose_for_scores(self, x: tf.Tensor):
+        shape = tf.shape(x)
+        new_shape = (
+            shape[0],  # batch_size
+            shape[1],  # seq_length
+            self.num_attention_heads,
+            self.attention_head_size,
+        )
+        x = tf.reshape(x, new_shape)
+        return tf.transpose(x, perm=[0, 2, 1, 3])  # batch_size, num_heads, seq_length, attention_head_size
+
+    def cogview_attention(self, attention_scores: tf.Tensor, alpha: float | int = 32):
+        """
+        https://huggingface.co/papers/2105.13290 Section 2.4 Stabilization of training: Precision Bottleneck Relaxation
+        (PB-Relax). A replacement of the original keras.layers.Softmax(axis=-1)(attention_scores). Seems the new
+        attention_probs will result in a slower speed and a little bias. Can use
+        tf.debugging.assert_near(standard_attention_probs, cogview_attention_probs, atol=1e-08) for comparison. The
+        smaller atol (e.g., 1e-08), the better.
+        """
+        # The row maximum is taken on the scores divided by alpha and subtracted before scaling back up by alpha,
+        # so the values handed to softmax are <= 0 along the last axis.
+        scaled_attention_scores = attention_scores / alpha
+        max_value = tf.expand_dims(tf.reduce_max(scaled_attention_scores, axis=-1), axis=-1)
+        new_attention_scores = (scaled_attention_scores - max_value) * alpha
+        return tf.math.softmax(new_attention_scores, axis=-1)
+
+    # Returns `(context_layer,)`, or `(context_layer, attention_probs)` when `output_attentions` is truthy.
+    def call(
+        self,
+        hidden_states: tf.Tensor,
+        attention_mask: tf.Tensor | None,
+        head_mask: tf.Tensor | None,
+        output_attentions: bool,
+        rel_pos: tf.Tensor | None = None,
+        rel_2d_pos: tf.Tensor | None = None,
+        training: bool = False,
+    ) -> tuple[tf.Tensor] | tuple[tf.Tensor, tf.Tensor]:
+        key_layer = self.transpose_for_scores(self.key(hidden_states))
+        value_layer = self.transpose_for_scores(self.value(hidden_states))
+        query_layer = self.transpose_for_scores(self.query(hidden_states))
+
+        # Take the dot product between "query" and "key" to get the raw attention scores.
+        # The query is scaled by sqrt(head size) before the matmul instead of scaling the scores afterwards.
+        normalised_query_layer = query_layer / self.attention_score_normaliser
+        transposed_key_layer = tf.transpose(
+            key_layer, perm=[0, 1, 3, 2]
+        )  # batch_size, num_heads, attention_head_size, seq_length
+        attention_scores = tf.matmul(normalised_query_layer, transposed_key_layer)
+
+        # The position biases are scaled by the same normaliser as the query. Note that `rel_2d_pos` is only used
+        # when both flags are set: `has_spatial_attention_bias` on its own adds no bias here.
+        if self.has_relative_attention_bias and self.has_spatial_attention_bias:
+            attention_scores += (rel_pos + rel_2d_pos) / self.attention_score_normaliser
+        elif self.has_relative_attention_bias:
+            attention_scores += rel_pos / self.attention_score_normaliser
+
+        if attention_mask is not None:
+            # Apply the attention mask (is precomputed for all layers in TFLayoutLMv3Model call() function)
+            attention_scores += attention_mask
+
+        # Normalize the attention scores to probabilities.
+        # Use the trick of CogView paper to stabilize training.
+        attention_probs = self.cogview_attention(attention_scores)
+
+        attention_probs = self.dropout(attention_probs, training=training)
+
+        # Mask heads if we want to.
+        if head_mask is not None:
+            attention_probs = attention_probs * head_mask
+
+        context_layer = tf.matmul(attention_probs, value_layer)
+        context_layer = tf.transpose(
+            context_layer, perm=[0, 2, 1, 3]
+        )  # batch_size, seq_length, num_heads, attention_head_size
+        shape = tf.shape(context_layer)
+        context_layer = tf.reshape(
+            context_layer, (shape[0], shape[1], self.all_head_size)
+        )  # batch_size, seq_length, num_heads * attention_head_size
+
+        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
+
+        return outputs
+
+    # Build the three projections, each under its own name scope, with `config.hidden_size` as the size of the
+    # last input dimension. Does nothing when the layer is already marked as built.
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "query", None) is not None:
+            with tf.name_scope(self.query.name):
+                self.query.build([None, None, self.config.hidden_size])
+        if getattr(self, "key", None) is not None:
+            with tf.name_scope(self.key.name):
+                self.key.build([None, None, self.config.hidden_size])
+        if getattr(self, "value", None) is not None:
+            with tf.name_scope(self.value.name):
+                self.value.build([None, None, self.config.hidden_size])
+
+
+# Copied from models.roberta.modeling_tf_roberta.TFRobertaSelfOutput
+class TFLayoutLMv3SelfOutput(keras.layers.Layer):
+    def __init__(self, config: LayoutLMv3Config, **kwargs):
+        super().__init__(**kwargs)
+
+        self.dense = keras.layers.Dense(
+            units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
+        )
+        self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
+        self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
+        self.config = config
+
+    def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
+        hidden_states = self.dense(inputs=hidden_states)
+        hidden_states = self.dropout(inputs=hidden_states, training=training)
+        hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor)
+
+        return hidden_states
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "dense", None) is not None:
+            with tf.name_scope(self.dense.name):
+                self.dense.build([None, None, self.config.hidden_size])
+        if getattr(self, "LayerNorm", None) is not None:
+            with tf.name_scope(self.LayerNorm.name):
+                self.LayerNorm.build([None, None, self.config.hidden_size])
+
+
+class TFLayoutLMv3Attention(keras.layers.Layer):
+    def __init__(self, config: LayoutLMv3Config, **kwargs):
+        super().__init__(**kwargs)
+        self.self_attention = TFLayoutLMv3SelfAttention(config, name="self")
+        self.self_output = TFLayoutLMv3SelfOutput(config, name="output")
+
+    def call(
+        self,
+        hidden_states: tf.Tensor,
+        attention_mask: tf.Tensor | None,
+        head_mask: tf.Tensor | None,
+        output_attentions: bool,
+        rel_pos: tf.Tensor | None = None,
+        rel_2d_pos: tf.Tensor | None = None,
+        training: bool = False,
+    ) -> tuple[tf.Tensor] | tuple[tf.Tensor, tf.Tensor]:
+        self_outputs = self.self_attention(
+            hidden_states,
+            attention_mask,
+            head_mask,
+            output_attentions,
+            rel_pos,
+            rel_2d_pos,
+            training=training,
+        )
+        attention_output = self.self_output(self_outputs[0], hidden_states, training=training)
+        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
+        return outputs
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "self_attention", None) is not None:
+            with tf.name_scope(self.self_attention.name):
+                self.self_attention.build(None)
+        if getattr(self, "self_output", None) is not None:
+            with tf.name_scope(self.self_output.name):
+                self.self_output.build(None)
+
+
+# Copied from models.roberta.modeling_tf_bert.TFRobertaIntermediate
+class TFLayoutLMv3Intermediate(keras.layers.Layer):
+    def __init__(self, config: LayoutLMv3Config, **kwargs):
+        super().__init__(**kwargs)
+
+        self.dense = keras.layers.Dense(
+            units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
+        )
+
+        if isinstance(config.hidden_act, str):
+            self.intermediate_act_fn = get_tf_activation(config.hidden_act)
+        else:
+            self.intermediate_act_fn = config.hidden_act
+        self.config = config
+
+    def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
+        hidden_states = self.dense(inputs=hidden_states)
+        hidden_states = self.intermediate_act_fn(hidden_states)
+
+        return hidden_states
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "dense", None) is not None:
+            with tf.name_scope(self.dense.name):
+                self.dense.build([None, None, self.config.hidden_size])
+
+
+# Copied from models.roberta.modeling_tf_bert.TFRobertaOutput
+class TFLayoutLMv3Output(keras.layers.Layer):
+    def __init__(self, config: LayoutLMv3Config, **kwargs):
+        super().__init__(**kwargs)
+
+        self.dense = keras.layers.Dense(
+            units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
+        )
+        self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
+        self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
+        self.config = config
+
+    def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
+        hidden_states = self.dense(inputs=hidden_states)
+        hidden_states = self.dropout(inputs=hidden_states, training=training)
+        hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor)
+
+        return hidden_states
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "dense", None) is not None:
+            with tf.name_scope(self.dense.name):
+                self.dense.build([None, None, self.config.intermediate_size])
+        if getattr(self, "LayerNorm", None) is not None:
+            with tf.name_scope(self.LayerNorm.name):
+                self.LayerNorm.build([None, None, self.config.hidden_size])
+
+
+class TFLayoutLMv3Layer(keras.layers.Layer):
+    def __init__(self, config: LayoutLMv3Config, **kwargs):
+        super().__init__(**kwargs)
+        self.attention = TFLayoutLMv3Attention(config, name="attention")
+        self.intermediate = TFLayoutLMv3Intermediate(config, name="intermediate")
+        self.bert_output = TFLayoutLMv3Output(config, name="output")
+
+    def call(
+        self,
+        hidden_states: tf.Tensor,
+        attention_mask: tf.Tensor | None,
+        head_mask: tf.Tensor | None,
+        output_attentions: bool,
+        rel_pos: tf.Tensor | None = None,
+        rel_2d_pos: tf.Tensor | None = None,
+        training: bool = False,
+    ) -> tuple[tf.Tensor] | tuple[tf.Tensor, tf.Tensor]:
+        self_attention_outputs = self.attention(
+            hidden_states,
+            attention_mask,
+            head_mask,
+            output_attentions=output_attentions,
+            rel_pos=rel_pos,
+            rel_2d_pos=rel_2d_pos,
+            training=training,
+        )
+        attention_output = self_attention_outputs[0]
+        outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights
+        intermediate_output = self.intermediate(attention_output)
+        layer_output = self.bert_output(intermediate_output, attention_output, training=training)
+        outputs = (layer_output,) + outputs
+        return outputs
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "attention", None) is not None:
+            with tf.name_scope(self.attention.name):
+                self.attention.build(None)
+        if getattr(self, "intermediate", None) is not None:
+            with tf.name_scope(self.intermediate.name):
+                self.intermediate.build(None)
+        if getattr(self, "bert_output", None) is not None:
+            with tf.name_scope(self.bert_output.name):
+                self.bert_output.build(None)
+
+
+# Stack of `config.num_hidden_layers` TFLayoutLMv3Layer blocks. When enabled in the config, it also computes the
+# 1D (`rel_pos`) and 2D (`rel_2d_pos`) relative-position attention biases once per call and passes them to every
+# block.
+class TFLayoutLMv3Encoder(keras.layers.Layer):
+    def __init__(self, config: LayoutLMv3Config, **kwargs):
+        super().__init__(**kwargs)
+        self.config = config
+        self.layer = [TFLayoutLMv3Layer(config, name=f"layer.{i}") for i in range(config.num_hidden_layers)]
+
+        self.has_relative_attention_bias = config.has_relative_attention_bias
+        self.has_spatial_attention_bias = config.has_spatial_attention_bias
+
+        # The bias projections are only created for the bias kinds that the config enables. Each one is a
+        # bias-free Dense with one output unit per attention head.
+        if self.has_relative_attention_bias:
+            self.rel_pos_bins = config.rel_pos_bins
+            self.max_rel_pos = config.max_rel_pos
+            self.rel_pos_bias = keras.layers.Dense(
+                units=config.num_attention_heads,
+                kernel_initializer=get_initializer(config.initializer_range),
+                use_bias=False,
+                name="rel_pos_bias",
+            )
+
+        if self.has_spatial_attention_bias:
+            self.max_rel_2d_pos = config.max_rel_2d_pos
+            self.rel_2d_pos_bins = config.rel_2d_pos_bins
+            self.rel_pos_x_bias = keras.layers.Dense(
+                units=config.num_attention_heads,
+                kernel_initializer=get_initializer(config.initializer_range),
+                use_bias=False,
+                name="rel_pos_x_bias",
+            )
+            self.rel_pos_y_bias = keras.layers.Dense(
+                units=config.num_attention_heads,
+                kernel_initializer=get_initializer(config.initializer_range),
+                use_bias=False,
+                name="rel_pos_y_bias",
+            )
+
+    # Map signed relative positions to bucket ids. With half = num_buckets // 2, non-positive offsets land in
+    # [0, half) and positive offsets in [half, 2 * half).
+    def relative_position_bucket(self, relative_positions: tf.Tensor, num_buckets: int, max_distance: int):
+        # the negative relative positions are assigned to the interval [0, num_buckets / 2]
+        # we deal with this by assigning absolute relative positions to the interval [0, num_buckets / 2]
+        # and then offsetting the positive relative positions by num_buckets / 2 at the end
+        num_buckets = num_buckets // 2
+        buckets = tf.abs(relative_positions)
+
+        # half of the buckets are for exact increments in positions
+        max_exact_buckets = num_buckets // 2
+        is_small = buckets < max_exact_buckets
+
+        # the other half of the buckets are for logarithmically bigger bins in positions up to max_distance
+        # (for a distance of 0 the logarithm is not finite; those entries are replaced by the exact bucket in
+        # the tf.where below as long as max_exact_buckets > 0)
+        buckets_log_ratio = tf.math.log(tf.cast(buckets, tf.float32) / max_exact_buckets)
+        distance_log_ratio = math.log(max_distance / max_exact_buckets)
+        buckets_big_offset = (
+            buckets_log_ratio / distance_log_ratio * (num_buckets - max_exact_buckets)
+        )  # scale is [0, num_buckets - max_exact_buckets]
+        buckets_big = max_exact_buckets + buckets_big_offset  # scale is [max_exact_buckets, num_buckets]
+        buckets_big = tf.cast(buckets_big, buckets.dtype)
+        # distances beyond max_distance are clamped into the last bucket
+        buckets_big = tf.minimum(buckets_big, num_buckets - 1)
+
+        return (tf.cast(relative_positions > 0, buckets.dtype) * num_buckets) + tf.where(
+            is_small, buckets, buckets_big
+        )
+
+    # Turn positions into a per-head attention bias: pairwise differences -> bucket ids -> one-hot -> `dense_layer`.
+    def _cal_pos_emb(
+        self,
+        dense_layer: keras.layers.Dense,
+        position_ids: tf.Tensor,
+        num_buckets: int,
+        max_distance: int,
+    ):
+        # Entry [..., i, j] is position j minus position i.
+        rel_pos_matrix = tf.expand_dims(position_ids, axis=-2) - tf.expand_dims(position_ids, axis=-1)
+        rel_pos = self.relative_position_bucket(rel_pos_matrix, num_buckets, max_distance)
+        rel_pos_one_hot = tf.one_hot(rel_pos, depth=num_buckets, dtype=self.compute_dtype)
+        embedding = dense_layer(rel_pos_one_hot)
+        # batch_size, seq_length, seq_length, num_heads --> batch_size, num_heads, seq_length, seq_length
+        embedding = tf.transpose(embedding, [0, 3, 1, 2])
+        embedding = tf.cast(embedding, dtype=self.compute_dtype)
+        return embedding
+
+    # 1D bias from token position ids.
+    def _cal_1d_pos_emb(self, position_ids: tf.Tensor):
+        return self._cal_pos_emb(self.rel_pos_bias, position_ids, self.rel_pos_bins, self.max_rel_pos)
+
+    # 2D bias: the sum of an x bias (from coordinate 0 of each box) and a y bias (from coordinate 3 of each box).
+    def _cal_2d_pos_emb(self, bbox: tf.Tensor):
+        position_coord_x = bbox[:, :, 0]  # left
+        position_coord_y = bbox[:, :, 3]  # bottom
+        rel_pos_x = self._cal_pos_emb(
+            self.rel_pos_x_bias,
+            position_coord_x,
+            self.rel_2d_pos_bins,
+            self.max_rel_2d_pos,
+        )
+        rel_pos_y = self._cal_pos_emb(
+            self.rel_pos_y_bias,
+            position_coord_y,
+            self.rel_2d_pos_bins,
+            self.max_rel_2d_pos,
+        )
+        rel_2d_pos = rel_pos_x + rel_pos_y
+        return rel_2d_pos
+
+    # Run all blocks in order. Returns a TFBaseModelOutput when `return_dict` is truthy, otherwise a tuple of the
+    # non-None values among (last hidden state, all hidden states, all attentions).
+    def call(
+        self,
+        hidden_states: tf.Tensor,
+        bbox: tf.Tensor | None = None,
+        attention_mask: tf.Tensor | None = None,
+        head_mask: tf.Tensor | None = None,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+        position_ids: tf.Tensor | None = None,
+        training: bool = False,
+    ) -> TFBaseModelOutput | tuple[tf.Tensor] | tuple[tf.Tensor, tf.Tensor] | tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attentions = () if output_attentions else None
+
+        # The biases are computed once and shared by every block.
+        # NOTE(review): `position_ids` / `bbox` default to None but are used unconditionally when the matching
+        # bias flag is set - presumably callers always provide them in that case; confirm.
+        rel_pos = self._cal_1d_pos_emb(position_ids) if self.has_relative_attention_bias else None
+        rel_2d_pos = self._cal_2d_pos_emb(bbox) if self.has_spatial_attention_bias else None
+
+        for i, layer_module in enumerate(self.layer):
+            # Hidden states are recorded before each block, so the first entry is the encoder input.
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)
+
+            layer_head_mask = head_mask[i] if head_mask is not None else None
+
+            layer_outputs = layer_module(
+                hidden_states,
+                attention_mask,
+                layer_head_mask,
+                output_attentions,
+                rel_pos=rel_pos,
+                rel_2d_pos=rel_2d_pos,
+                training=training,
+            )
+
+            hidden_states = layer_outputs[0]
+            if output_attentions:
+                all_self_attentions = all_self_attentions + (layer_outputs[1],)
+
+        # The output of the last block is appended after the loop.
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+
+        if return_dict:
+            return TFBaseModelOutput(
+                last_hidden_state=hidden_states,
+                hidden_states=all_hidden_states,
+                attentions=all_self_attentions,
+            )
+        else:
+            return tuple(
+                value for value in [hidden_states, all_hidden_states, all_self_attentions] if value is not None
+            )
+
+    # Build the bias projections that exist (their input feature size is the number of buckets) and then every
+    # block, each under its own name scope. Does nothing when the layer is already marked as built.
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "rel_pos_bias", None) is not None:
+            with tf.name_scope(self.rel_pos_bias.name):
+                self.rel_pos_bias.build([None, None, self.rel_pos_bins])
+        if getattr(self, "rel_pos_x_bias", None) is not None:
+            with tf.name_scope(self.rel_pos_x_bias.name):
+                self.rel_pos_x_bias.build([None, None, self.rel_2d_pos_bins])
+        if getattr(self, "rel_pos_y_bias", None) is not None:
+            with tf.name_scope(self.rel_pos_y_bias.name):
+                self.rel_pos_y_bias.build([None, None, self.rel_2d_pos_bins])
+        if getattr(self, "layer", None) is not None:
+            for layer in self.layer:
+                with tf.name_scope(layer.name):
+                    layer.build(None)
+
+
+@keras_serializable
+class TFLayoutLMv3MainLayer(keras.layers.Layer):
+    config_class = LayoutLMv3Config
+
+    # Create the sub-layers selected by the config: text embeddings (`config.text_embed`), the visual branch
+    # (`config.visual_embed`) and, always, the encoder.
+    def __init__(self, config: LayoutLMv3Config, **kwargs):
+        super().__init__(**kwargs)
+
+        self.config = config
+
+        if config.text_embed:
+            self.embeddings = TFLayoutLMv3TextEmbeddings(config, name="embeddings")
+
+        if config.visual_embed:
+            self.patch_embed = TFLayoutLMv3PatchEmbeddings(config, name="patch_embed")
+            self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
+            self.dropout = keras.layers.Dropout(config.hidden_dropout_prob, name="dropout")
+
+            # The patch bounding boxes are only precomputed when one of the attention biases is enabled.
+            if config.has_relative_attention_bias or config.has_spatial_attention_bias:
+                # Number of patches per side, not a size in pixels.
+                image_size = config.input_size // config.patch_size
+                self.init_visual_bbox(image_size=(image_size, image_size))
+
+            self.norm = keras.layers.LayerNormalization(epsilon=1e-6, name="norm")
+
+        self.encoder = TFLayoutLMv3Encoder(config, name="encoder")
+
+    def build(self, input_shape=None):
+        if self.config.visual_embed:
+            image_size = self.config.input_size // self.config.patch_size
+            self.cls_token = self.add_weight(
+                shape=(1, 1, self.config.hidden_size),
+                initializer="zeros",
+                trainable=True,
+                dtype=tf.float32,
+                name="cls_token",
+            )
+            self.pos_embed = self.add_weight(
+                shape=(1, image_size * image_size + 1, self.config.hidden_size),
+                initializer="zeros",
+                trainable=True,
+                dtype=tf.float32,
+                name="pos_embed",
+            )
+
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "encoder", None) is not None:
+            with tf.name_scope(self.encoder.name):
+                self.encoder.build(None)
+        if getattr(self, "embeddings", None) is not None:
+            with tf.name_scope(self.embeddings.name):
+                self.embeddings.build(None)
+        if getattr(self, "patch_embed", None) is not None:
+            with tf.name_scope(self.patch_embed.name):
+                self.patch_embed.build(None)
+        if getattr(self, "LayerNorm", None) is not None:
+            with tf.name_scope(self.LayerNorm.name):
+                self.LayerNorm.build([None, None, self.config.hidden_size])
+        if getattr(self, "dropout", None) is not None:
+            with tf.name_scope(self.dropout.name):
+                self.dropout.build(None)
+        if getattr(self, "norm", None) is not None:
+            with tf.name_scope(self.norm.name):
+                self.norm.build([None, None, self.config.hidden_size])
+
+    def get_input_embeddings(self) -> keras.layers.Layer:
+        return self.embeddings.word_embeddings
+
+    def set_input_embeddings(self, value: tf.Variable):
+        self.embeddings.word_embeddings.weight = value
+
+    # Head pruning is not implemented for this layer: the method below unconditionally raises NotImplementedError.
+    # Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer._prune_heads
+    def _prune_heads(self, heads_to_prune):
+        """
+        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
+        class PreTrainedModel
+        """
+        raise NotImplementedError
+
+    def init_visual_bbox(self, image_size: tuple[int, int], max_len: int = 1000):
+        # We should not hardcode max_len to 1000, but it is done by the reference implementation,
+        # so we keep it for compatibility with the pretrained weights. The more correct approach
+        # would have been to pass on max_len=config.max_2d_position_embeddings - 1.
+        height, width = image_size
+
+        visual_bbox_x = tf.range(0, max_len * (width + 1), max_len) // width
+        visual_bbox_x = tf.expand_dims(visual_bbox_x, axis=0)
+        visual_bbox_x = tf.tile(visual_bbox_x, [width, 1])  # (width, width + 1)
+
+        visual_bbox_y = tf.range(0, max_len * (height + 1), max_len) // height
+        visual_bbox_y = tf.expand_dims(visual_bbox_y, axis=1)
+        visual_bbox_y = tf.tile(visual_bbox_y, [1, height])  # (height + 1, height)
+
+        visual_bbox = tf.stack(
+            [visual_bbox_x[:, :-1], visual_bbox_y[:-1], visual_bbox_x[:, 1:], visual_bbox_y[1:]],
+            axis=-1,
+        )
+        visual_bbox = tf.reshape(visual_bbox, [-1, 4])
+
+        cls_token_box = tf.constant([[1, 1, max_len - 1, max_len - 1]], dtype=tf.int32)
+        self.visual_bbox = tf.concat([cls_token_box, visual_bbox], axis=0)
+
+    def calculate_visual_bbox(self, batch_size: int, dtype: tf.DType):
+        visual_bbox = tf.expand_dims(self.visual_bbox, axis=0)
+        visual_bbox = tf.tile(visual_bbox, [batch_size, 1, 1])
+        visual_bbox = tf.cast(visual_bbox, dtype=dtype)
+        return visual_bbox
+
+    def embed_image(self, pixel_values: tf.Tensor) -> tf.Tensor:
+        embeddings = self.patch_embed(pixel_values)
+
+        # add [CLS] token
+        batch_size = tf.shape(embeddings)[0]
+        cls_tokens = tf.tile(self.cls_token, [batch_size, 1, 1])
+        embeddings = tf.concat([cls_tokens, embeddings], axis=1)
+
+        # add position embeddings
+        if getattr(self, "pos_embed", None) is not None:
+            embeddings += self.pos_embed
+
+        embeddings = self.norm(embeddings)
+        return embeddings
+
+    def get_extended_attention_mask(self, attention_mask: tf.Tensor) -> tf.Tensor:
+        # Adapted from transformers.modelling_utils.ModuleUtilsMixin.get_extended_attention_mask
+
+        n_dims = len(attention_mask.shape)
+
+        # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
+        # ourselves in which case we just need to make it broadcastable to all heads.
+        if n_dims == 3:
+            extended_attention_mask = tf.expand_dims(attention_mask, axis=1)
+        elif n_dims == 2:
+            # Provided a padding mask of dimensions [batch_size, seq_length].
+            # Make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length].
+            extended_attention_mask = tf.expand_dims(attention_mask, axis=1)  # (batch_size, 1, seq_length)
+            extended_attention_mask = tf.expand_dims(extended_attention_mask, axis=1)  # (batch_size, 1, 1, seq_length)
+        else:
+            raise ValueError(f"Wrong shape for attention_mask (shape {attention_mask.shape}).")
+
+        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
+        # masked positions, this operation will create a tensor which is 0.0 for
+        # positions we want to attend and -10000.0 for masked positions.
+        # Since we are adding it to the raw scores before the softmax, this is
+        # effectively the same as removing these entirely.
+        extended_attention_mask = tf.cast(extended_attention_mask, self.compute_dtype)
+        extended_attention_mask = (1.0 - extended_attention_mask) * LARGE_NEGATIVE
+
+        return extended_attention_mask
+
+    # Normalise `head_mask` to a rank-5 tensor in the compute dtype so that indexing its first axis yields one
+    # mask per hidden layer. Returns a list with one `None` per hidden layer when no mask is given.
+    def get_head_mask(self, head_mask: tf.Tensor | None) -> tf.Tensor | list[tf.Tensor | None]:
+        if head_mask is None:
+            return [None] * self.config.num_hidden_layers
+
+        # NOTE(review): `n_dims` is a tensor (tf.rank), so the comparisons below presumably rely on eager
+        # execution - confirm.
+        n_dims = tf.rank(head_mask)
+        if n_dims == 1:
+            # Gets a tensor with masks for each head (H).
+            head_mask = tf.expand_dims(head_mask, axis=0)  # 1, num_heads
+            head_mask = tf.expand_dims(head_mask, axis=0)  # 1, 1, num_heads
+            head_mask = tf.expand_dims(head_mask, axis=-1)  # 1, 1, num_heads, 1
+            head_mask = tf.expand_dims(head_mask, axis=-1)  # 1, 1, num_heads, 1, 1
+            # The same per-head mask is repeated for every hidden layer.
+            head_mask = tf.tile(
+                head_mask, [self.config.num_hidden_layers, 1, 1, 1, 1]
+            )  # num_hidden_layers, 1, num_heads, 1, 1
+        elif n_dims == 2:
+            # Gets a tensor with masks for each layer (L) and head (H).
+            head_mask = tf.expand_dims(head_mask, axis=1)  # num_layers, 1, num_heads
+            head_mask = tf.expand_dims(head_mask, axis=-1)  # num_layers, 1, num_heads, 1
+            head_mask = tf.expand_dims(head_mask, axis=-1)  # num_layers, 1, num_heads, 1, 1
+        elif n_dims != 5:
+            raise ValueError(f"Wrong shape for head_mask (shape {head_mask.shape}).")
+        assert tf.rank(head_mask) == 5, f"Got head_mask rank of {tf.rank(head_mask)}, but require 5."
+        head_mask = tf.cast(head_mask, self.compute_dtype)
+        return head_mask
+
+    @unpack_inputs
+    def call(
+        self,
+        input_ids: tf.Tensor | None = None,
+        bbox: tf.Tensor | None = None,
+        attention_mask: tf.Tensor | None = None,
+        token_type_ids: tf.Tensor | None = None,
+        position_ids: tf.Tensor | None = None,
+        head_mask: tf.Tensor | None = None,
+        inputs_embeds: tf.Tensor | None = None,
+        pixel_values: tf.Tensor | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        training: bool = False,
+    ) -> TFBaseModelOutput | tuple[tf.Tensor] | tuple[tf.Tensor, tf.Tensor] | tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
+        """
+        Embed whichever of the text, layout and image inputs are given and run them through the encoder.
+
+        At least one of `input_ids`, `inputs_embeds` or `pixel_values` must be provided. When text inputs are present,
+        a missing `attention_mask` is filled with ones and missing `token_type_ids` / `bbox` are filled with zeros.
+        When `pixel_values` is present, the image embeddings (and the matching attention mask, bounding boxes and
+        position ids) are concatenated after the text ones along the sequence axis.
+
+        Returns:
+            A `TFBaseModelOutput` when `return_dict` is true, otherwise a tuple that starts with the last hidden state
+            and continues with the remaining encoder outputs.
+
+        Raises:
+            ValueError: If none of `input_ids`, `inputs_embeds` and `pixel_values` is given.
+        """
+        # This method can be called with a variety of modalities:
+        # 1. text + layout
+        # 2. text + layout + image
+        # 3. image
+        # The complexity of this method is mostly just due to handling of these different modalities.
+
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.return_dict
+
+        if input_ids is not None:
+            input_shape = tf.shape(input_ids)
+            batch_size = input_shape[0]
+            seq_length = input_shape[1]
+        elif inputs_embeds is not None:
+            input_shape = tf.shape(inputs_embeds)
+            batch_size = input_shape[0]
+            seq_length = input_shape[1]
+        elif pixel_values is not None:
+            # Image-only input: there is no text sequence, so `seq_length` is intentionally left undefined.
+            batch_size = tf.shape(pixel_values)[0]
+        else:
+            raise ValueError("You have to specify either input_ids or inputs_embeds or pixel_values")
+
+        # Determine which integer dtype to use.
+        if input_ids is not None:
+            int_dtype = input_ids.dtype
+        elif bbox is not None:
+            int_dtype = bbox.dtype
+        elif attention_mask is not None:
+            int_dtype = attention_mask.dtype
+        elif token_type_ids is not None:
+            int_dtype = token_type_ids.dtype
+        else:
+            int_dtype = tf.int32
+
+        if input_ids is not None or inputs_embeds is not None:
+            if attention_mask is None:
+                attention_mask = tf.ones((batch_size, seq_length), dtype=int_dtype)
+            if token_type_ids is None:
+                token_type_ids = tf.zeros((batch_size, seq_length), dtype=int_dtype)
+            if bbox is None:
+                bbox = tf.zeros((batch_size, seq_length, 4), dtype=int_dtype)
+
+            embedding_output = self.embeddings(
+                input_ids=input_ids,
+                bbox=bbox,
+                position_ids=position_ids,
+                token_type_ids=token_type_ids,
+                inputs_embeds=inputs_embeds,
+                training=training,
+            )
+
+        final_bbox = None
+        final_position_ids = None
+        if pixel_values is not None:
+            # embed image
+            visual_embeddings = self.embed_image(pixel_values)
+
+            # calculate attention mask
+            visual_attention_mask = tf.ones((batch_size, tf.shape(visual_embeddings)[1]), dtype=int_dtype)
+            if attention_mask is None:
+                attention_mask = visual_attention_mask
+            else:
+                attention_mask = tf.concat([attention_mask, visual_attention_mask], axis=1)
+
+            # calculate bounding boxes
+            if self.config.has_spatial_attention_bias:
+                visual_bbox = self.calculate_visual_bbox(batch_size, int_dtype)
+                if bbox is None:
+                    final_bbox = visual_bbox
+                else:
+                    final_bbox = tf.concat([bbox, visual_bbox], axis=1)
+
+            # calculate position IDs
+            if self.config.has_relative_attention_bias or self.config.has_spatial_attention_bias:
+                visual_position_ids = tf.range(0, tf.shape(visual_embeddings)[1], dtype=int_dtype)
+                visual_position_ids = tf.expand_dims(visual_position_ids, axis=0)
+                visual_position_ids = tf.tile(visual_position_ids, [batch_size, 1])
+
+                if input_ids is not None or inputs_embeds is not None:
+                    position_ids = tf.expand_dims(tf.range(0, seq_length, dtype=int_dtype), axis=0)
+                    position_ids = tf.tile(position_ids, [batch_size, 1])
+                    final_position_ids = tf.concat([position_ids, visual_position_ids], axis=1)
+                else:
+                    final_position_ids = visual_position_ids
+
+            # calculate embeddings
+            if input_ids is None and inputs_embeds is None:
+                embedding_output = visual_embeddings
+            else:
+                embedding_output = tf.concat([embedding_output, visual_embeddings], axis=1)
+            embedding_output = self.LayerNorm(embedding_output)
+            embedding_output = self.dropout(embedding_output, training=training)
+
+        elif self.config.has_relative_attention_bias or self.config.has_spatial_attention_bias:
+            if self.config.has_relative_attention_bias:
+                position_ids = tf.expand_dims(tf.range(0, seq_length, dtype=int_dtype), axis=0)
+                position_ids = tf.tile(position_ids, [batch_size, 1])
+                final_position_ids = position_ids
+
+            if self.config.has_spatial_attention_bias:
+                final_bbox = bbox
+
+        extended_attention_mask = self.get_extended_attention_mask(attention_mask)
+
+        # Prepare head mask if needed
+        # 1.0 in head_mask indicate we keep the head
+        # attention_probs has shape batch_size x num_heads x seq_length x seq_length
+        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
+        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
+        head_mask = self.get_head_mask(head_mask)
+
+        encoder_outputs = self.encoder(
+            embedding_output,
+            bbox=final_bbox,
+            position_ids=final_position_ids,
+            attention_mask=extended_attention_mask,
+            head_mask=head_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        sequence_output = encoder_outputs[0]
+
+        if not return_dict:
+            return (sequence_output,) + encoder_outputs[1:]
+
+        return TFBaseModelOutput(
+            last_hidden_state=sequence_output,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+        )
+
+
+class TFLayoutLMv3PreTrainedModel(TFPreTrainedModel):
+    """
+    Abstract base for the LayoutLMv3 TF models: it takes care of weights initialization and offers a simple interface
+    for downloading and loading pretrained models.
+    """
+
+    config_class = LayoutLMv3Config
+    base_model_prefix = "layoutlmv3"
+
+    @property
+    def input_signature(self):
+        """Return the parent class's input signature with an int32 `bbox` spec of shape `(None, None, 4)` added."""
+        signature = super().input_signature
+        signature["bbox"] = tf.TensorSpec((None, None, 4), tf.int32, name="bbox")
+        return signature
+
+
+# Class-level documentation text shared by the LayoutLMv3 TF model classes in this file; it is handed to
+# `add_start_docstrings` on each of them.
+LAYOUTLMV3_START_DOCSTRING = r"""
+    This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.)
+
+    This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
+    as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
+    behavior.
+
+    <Tip>
+
+    TensorFlow models and layers in `transformers` accept two formats as input:
+
+    - having all inputs as keyword arguments (like PyTorch models), or
+    - having all inputs as a list, tuple or dict in the first positional argument.
+
+    The reason the second format is supported is that Keras methods prefer this format when passing inputs to models
+    and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just
+    pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second
+    format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with
+    the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first
+    positional argument:
+
+    - a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
+    - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
+    `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
+    - a dictionary with one or several input Tensors associated to the input names given in the docstring:
+    `model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
+
+    Note that when creating models and layers with
+    [subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry
+    about any of this, as you can just pass inputs like you would to any other Python function!
+
+    </Tip>
+
+    Parameters:
+        config ([`LayoutLMv3Config`]): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+# Argument documentation text for the models' `call` methods; it is handed to
+# `add_start_docstrings_to_model_forward` on each of them.
+LAYOUTLMV3_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`Numpy array` or `tf.Tensor` of shape `(batch_size, sequence_length)`):
+            Indices of input sequence tokens in the vocabulary.
+
+            Note that `sequence_length = token_sequence_length + patch_sequence_length + 1` where `1` is for [CLS]
+            token. See `pixel_values` for `patch_sequence_length`.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+
+        bbox (`Numpy array` or `tf.Tensor` of shape `(batch_size, sequence_length, 4)`, *optional*):
+            Bounding boxes of each input sequence tokens. Selected in the range `[0,
+            config.max_2d_position_embeddings-1]`. Each bounding box should be a normalized version in (x0, y0, x1, y1)
+            format, where (x0, y0) corresponds to the position of the upper left corner in the bounding box, and (x1,
+            y1) represents the position of the lower right corner.
+
+            Note that `sequence_length = token_sequence_length + patch_sequence_length + 1` where `1` is for [CLS]
+            token. See `pixel_values` for `patch_sequence_length`.
+
+        pixel_values (`tf.Tensor` of shape `(batch_size, num_channels, height, width)`):
+            Batch of document images. Each image is divided into patches of shape `(num_channels, config.patch_size,
+            config.patch_size)` and the total number of patches (=`patch_sequence_length`) equals to `((height /
+            config.patch_size) * (width / config.patch_size))`.
+
+        attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            Note that `sequence_length = token_sequence_length + patch_sequence_length + 1` where `1` is for [CLS]
+            token. See `pixel_values` for `patch_sequence_length`.
+
+            [What are attention masks?](../glossary#attention-mask)
+        token_type_ids (`Numpy array` or `tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
+            1]`:
+
+            - 0 corresponds to a *sentence A* token,
+            - 1 corresponds to a *sentence B* token.
+
+            Note that `sequence_length = token_sequence_length + patch_sequence_length + 1` where `1` is for [CLS]
+            token. See `pixel_values` for `patch_sequence_length`.
+
+            [What are token type IDs?](../glossary#token-type-ids)
+        position_ids (`Numpy array` or `tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+            config.max_position_embeddings - 1]`.
+
+            Note that `sequence_length = token_sequence_length + patch_sequence_length + 1` where `1` is for [CLS]
+            token. See `pixel_values` for `patch_sequence_length`.
+
+            [What are position IDs?](../glossary#position-ids)
+        head_mask (`tf.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+
+        inputs_embeds (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+            is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
+            model's internal embedding lookup matrix.
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+@add_start_docstrings(
+    "The bare LayoutLMv3 Model transformer outputting raw hidden-states without any specific head on top.",
+    LAYOUTLMV3_START_DOCSTRING,
+)
+class TFLayoutLMv3Model(TFLayoutLMv3PreTrainedModel):
+    # Keys that may legitimately show up as unexpected when a TF model is loaded from a PT checkpoint.
+    _keys_to_ignore_on_load_unexpected = [r"position_ids"]
+
+    def __init__(self, config, *inputs, **kwargs):
+        """Create the model as a thin wrapper around a single `TFLayoutLMv3MainLayer` named "layoutlmv3"."""
+        super().__init__(config, *inputs, **kwargs)
+        self.layoutlmv3 = TFLayoutLMv3MainLayer(config, name="layoutlmv3")
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(LAYOUTLMV3_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=TFBaseModelOutput, config_class=_CONFIG_FOR_DOC)
+    def call(
+        self,
+        input_ids: tf.Tensor | None = None,
+        bbox: tf.Tensor | None = None,
+        attention_mask: tf.Tensor | None = None,
+        token_type_ids: tf.Tensor | None = None,
+        position_ids: tf.Tensor | None = None,
+        head_mask: tf.Tensor | None = None,
+        inputs_embeds: tf.Tensor | None = None,
+        pixel_values: tf.Tensor | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        training: bool = False,
+    ) -> TFBaseModelOutput | tuple[tf.Tensor] | tuple[tf.Tensor, tf.Tensor] | tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
+        r"""
+        Returns:
+
+        Examples:
+
+        ```python
+        >>> from transformers import AutoProcessor, TFAutoModel
+        >>> from datasets import load_dataset
+
+        >>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)
+        >>> model = TFAutoModel.from_pretrained("microsoft/layoutlmv3-base")
+
+        >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
+        >>> example = dataset[0]
+        >>> image = example["image"]
+        >>> words = example["tokens"]
+        >>> boxes = example["bboxes"]
+
+        >>> encoding = processor(image, words, boxes=boxes, return_tensors="tf")
+
+        >>> outputs = model(**encoding)
+        >>> last_hidden_states = outputs.last_hidden_state
+        ```"""
+
+        # Pure delegation: every argument is forwarded unchanged to the main layer and its result is returned as is.
+        return self.layoutlmv3(
+            input_ids=input_ids,
+            bbox=bbox,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            pixel_values=pixel_values,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+
+    def build(self, input_shape=None):
+        """Build the wrapped main layer under its own name scope; does nothing if the model is already built."""
+        if self.built:
+            return
+        self.built = True
+
+        main_layer = getattr(self, "layoutlmv3", None)
+        if main_layer is None:
+            return
+        with tf.name_scope(main_layer.name):
+            main_layer.build(None)
+
+
+class TFLayoutLMv3ClassificationHead(keras.layers.Layer):
+    """
+    Classification head for sentence-level tasks: dropout, a tanh dense layer, dropout again and a projection to
+    `config.num_labels` outputs. Reference: RobertaClassificationHead
+    """
+
+    def __init__(self, config: LayoutLMv3Config, **kwargs):
+        """Create the dense, dropout and output projection sub-layers from `config`."""
+        super().__init__(**kwargs)
+        self.config = config
+
+        # `classifier_dropout` takes precedence; the hidden dropout probability is only the fallback.
+        if config.classifier_dropout is not None:
+            dropout_rate = config.classifier_dropout
+        else:
+            dropout_rate = config.hidden_dropout_prob
+
+        self.dense = keras.layers.Dense(
+            config.hidden_size,
+            activation="tanh",
+            kernel_initializer=get_initializer(config.initializer_range),
+            name="dense",
+        )
+        self.dropout = keras.layers.Dropout(dropout_rate, name="dropout")
+        self.out_proj = keras.layers.Dense(
+            config.num_labels,
+            kernel_initializer=get_initializer(config.initializer_range),
+            name="out_proj",
+        )
+
+    def call(self, inputs: tf.Tensor, training: bool = False) -> tf.Tensor:
+        """Apply dropout -> dense(tanh) -> dropout -> output projection to `inputs` and return the result."""
+        hidden = self.dropout(inputs, training=training)
+        hidden = self.dense(hidden)
+        hidden = self.dropout(hidden, training=training)
+        return self.out_proj(hidden)
+
+    def build(self, input_shape=None):
+        """Build each sub-layer under its own name scope; does nothing if the layer is already built."""
+        if self.built:
+            return
+        self.built = True
+
+        # Both dense layers are built for inputs whose last dimension is the hidden size; dropout gets no shape.
+        for attr_name, takes_hidden_input in (("dense", True), ("dropout", False), ("out_proj", True)):
+            sublayer = getattr(self, attr_name, None)
+            if sublayer is None:
+                continue
+            build_shape = [None, None, self.config.hidden_size] if takes_hidden_input else None
+            with tf.name_scope(sublayer.name):
+                sublayer.build(build_shape)
+
+
+@add_start_docstrings(
+    """
+    LayoutLMv3 Model with a sequence classification head on top (a linear layer on top of the final hidden state of the
+    [CLS] token) e.g. for document image classification tasks such as the
+    [RVL-CDIP](https://www.cs.cmu.edu/~aharley/rvl-cdip/) dataset.
+    """,
+    LAYOUTLMV3_START_DOCSTRING,
+)
+class TFLayoutLMv3ForSequenceClassification(TFLayoutLMv3PreTrainedModel, TFSequenceClassificationLoss):
+    # Keys that may legitimately show up as unexpected when a TF model is loaded from a PT checkpoint.
+    _keys_to_ignore_on_load_unexpected = [r"position_ids"]
+
+    def __init__(self, config: LayoutLMv3Config, **kwargs):
+        """Create the LayoutLMv3 main layer and the classification head that consumes its first token."""
+        super().__init__(config, **kwargs)
+        self.config = config
+        self.layoutlmv3 = TFLayoutLMv3MainLayer(config, name="layoutlmv3")
+        self.classifier = TFLayoutLMv3ClassificationHead(config, name="classifier")
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(LAYOUTLMV3_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=TFSequenceClassifierOutput, config_class=_CONFIG_FOR_DOC)
+    def call(
+        self,
+        input_ids: tf.Tensor | None = None,
+        attention_mask: tf.Tensor | None = None,
+        token_type_ids: tf.Tensor | None = None,
+        position_ids: tf.Tensor | None = None,
+        head_mask: tf.Tensor | None = None,
+        inputs_embeds: tf.Tensor | None = None,
+        labels: tf.Tensor | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        bbox: tf.Tensor | None = None,
+        pixel_values: tf.Tensor | None = None,
+        training: bool | None = False,
+    ) -> (
+        TFSequenceClassifierOutput
+        | tuple[tf.Tensor]
+        | tuple[tf.Tensor, tf.Tensor]
+        | tuple[tf.Tensor, tf.Tensor, tf.Tensor]
+        | tuple[tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor]
+    ):
+        """
+        Returns:
+
+        Examples:
+
+        ```python
+        >>> from transformers import AutoProcessor, TFAutoModelForSequenceClassification
+        >>> from datasets import load_dataset
+        >>> import tensorflow as tf
+
+        >>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)
+        >>> model = TFAutoModelForSequenceClassification.from_pretrained("microsoft/layoutlmv3-base")
+
+        >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
+        >>> example = dataset[0]
+        >>> image = example["image"]
+        >>> words = example["tokens"]
+        >>> boxes = example["bboxes"]
+
+        >>> encoding = processor(image, words, boxes=boxes, return_tensors="tf")
+        >>> sequence_label = tf.convert_to_tensor([1])
+
+        >>> outputs = model(**encoding, labels=sequence_label)
+        >>> loss = outputs.loss
+        >>> logits = outputs.logits
+        ```"""
+
+        if return_dict is None:
+            return_dict = self.config.use_return_dict
+
+        outputs = self.layoutlmv3(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            bbox=bbox,
+            pixel_values=pixel_values,
+            training=training,
+        )
+
+        # Only the hidden state at sequence position 0 is fed to the classification head.
+        first_token_state = outputs[0][:, 0, :]
+        logits = self.classifier(first_token_state, training=training)
+
+        # The loss is computed only when labels are supplied.
+        loss = self.hf_compute_loss(labels, logits) if labels is not None else None
+
+        if return_dict:
+            return TFSequenceClassifierOutput(
+                loss=loss,
+                logits=logits,
+                hidden_states=outputs.hidden_states,
+                attentions=outputs.attentions,
+            )
+
+        # Tuple form: logits followed by the remaining backbone outputs, with the loss in front when there is one.
+        output = (logits,) + outputs[1:]
+        return output if loss is None else (loss,) + output
+
+    def build(self, input_shape=None):
+        """Build the main layer and the classifier under their own name scopes; does nothing if already built."""
+        if self.built:
+            return
+        self.built = True
+
+        for attr_name in ("layoutlmv3", "classifier"):
+            sublayer = getattr(self, attr_name, None)
+            if sublayer is None:
+                continue
+            with tf.name_scope(sublayer.name):
+                sublayer.build(None)
+
+
+@add_start_docstrings(
+    """
+    LayoutLMv3 Model with a token classification head on top (a linear layer on top of the final hidden states) e.g.
+    for sequence labeling (information extraction) tasks such as [FUNSD](https://guillaumejaume.github.io/FUNSD/),
+    [SROIE](https://rrc.cvc.uab.es/?ch=13), [CORD](https://github.com/clovaai/cord) and
+    [Kleister-NDA](https://github.com/applicaai/kleister-nda).
+    """,
+    LAYOUTLMV3_START_DOCSTRING,
+)
+class TFLayoutLMv3ForTokenClassification(TFLayoutLMv3PreTrainedModel, TFTokenClassificationLoss):
+    # Keys that may legitimately show up as unexpected when a TF model is loaded from a PT checkpoint.
+    _keys_to_ignore_on_load_unexpected = [r"position_ids"]
+
+    def __init__(self, config: LayoutLMv3Config, **kwargs):
+        """Create the LayoutLMv3 main layer, a dropout layer and a classifier sized by `config.num_labels`."""
+        super().__init__(config, **kwargs)
+        self.num_labels = config.num_labels
+
+        self.layoutlmv3 = TFLayoutLMv3MainLayer(config, name="layoutlmv3")
+        self.dropout = keras.layers.Dropout(config.hidden_dropout_prob, name="dropout")
+        # Ten or more labels get the full classification head; fewer labels get a single dense layer.
+        if config.num_labels >= 10:
+            self.classifier = TFLayoutLMv3ClassificationHead(config, name="classifier")
+        else:
+            self.classifier = keras.layers.Dense(
+                config.num_labels,
+                kernel_initializer=get_initializer(config.initializer_range),
+                name="classifier",
+            )
+        self.config = config
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(LAYOUTLMV3_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=TFTokenClassifierOutput, config_class=_CONFIG_FOR_DOC)
+    def call(
+        self,
+        input_ids: tf.Tensor | None = None,
+        bbox: tf.Tensor | None = None,
+        attention_mask: tf.Tensor | None = None,
+        token_type_ids: tf.Tensor | None = None,
+        position_ids: tf.Tensor | None = None,
+        head_mask: tf.Tensor | None = None,
+        inputs_embeds: tf.Tensor | None = None,
+        labels: tf.Tensor | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        pixel_values: tf.Tensor | None = None,
+        training: bool | None = False,
+    ) -> (
+        TFTokenClassifierOutput
+        | tuple[tf.Tensor]
+        | tuple[tf.Tensor, tf.Tensor]
+        | tuple[tf.Tensor, tf.Tensor, tf.Tensor]
+        | tuple[tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor]
+    ):
+        r"""
+        labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
+
+        Returns:
+
+        Examples:
+
+        ```python
+        >>> from transformers import AutoProcessor, TFAutoModelForTokenClassification
+        >>> from datasets import load_dataset
+
+        >>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)
+        >>> model = TFAutoModelForTokenClassification.from_pretrained("microsoft/layoutlmv3-base", num_labels=7)
+
+        >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
+        >>> example = dataset[0]
+        >>> image = example["image"]
+        >>> words = example["tokens"]
+        >>> boxes = example["bboxes"]
+        >>> word_labels = example["ner_tags"]
+
+        >>> encoding = processor(image, words, boxes=boxes, word_labels=word_labels, return_tensors="tf")
+
+        >>> outputs = model(**encoding)
+        >>> loss = outputs.loss
+        >>> logits = outputs.logits
+        ```"""
+
+        if return_dict is None:
+            return_dict = self.config.use_return_dict
+
+        outputs = self.layoutlmv3(
+            input_ids,
+            bbox=bbox,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            pixel_values=pixel_values,
+            training=training,
+        )
+
+        # The text length comes from `input_ids` when given, otherwise from `inputs_embeds` minus its last axis.
+        text_shape = tf.shape(input_ids) if input_ids is not None else tf.shape(inputs_embeds)[:-1]
+        text_length = text_shape[1]
+
+        # only take the text part of the output representations
+        text_states = outputs[0][:, :text_length]
+        text_states = self.dropout(text_states, training=training)
+        logits = self.classifier(text_states)
+
+        # The loss is computed only when labels are supplied.
+        loss = self.hf_compute_loss(labels, logits) if labels is not None else None
+
+        if return_dict:
+            return TFTokenClassifierOutput(
+                loss=loss,
+                logits=logits,
+                hidden_states=outputs.hidden_states,
+                attentions=outputs.attentions,
+            )
+
+        # Tuple form: logits followed by the remaining backbone outputs, with the loss in front when there is one.
+        output = (logits,) + outputs[1:]
+        return output if loss is None else (loss,) + output
+
+    def build(self, input_shape=None):
+        """Build the main layer, dropout and classifier under their own name scopes; does nothing if already built."""
+        if self.built:
+            return
+        self.built = True
+
+        backbone = getattr(self, "layoutlmv3", None)
+        if backbone is not None:
+            with tf.name_scope(backbone.name):
+                backbone.build(None)
+
+        dropout_layer = getattr(self, "dropout", None)
+        if dropout_layer is not None:
+            with tf.name_scope(dropout_layer.name):
+                dropout_layer.build(None)
+
+        classifier_layer = getattr(self, "classifier", None)
+        if classifier_layer is not None:
+            with tf.name_scope(classifier_layer.name):
+                # The classifier is built for inputs whose last dimension is the hidden size.
+                classifier_layer.build([None, None, self.config.hidden_size])
+
+
+
+@add_start_docstrings(
+    """
+    LayoutLMv3 Model with a span classification head on top for extractive question-answering tasks such as
+    [DocVQA](https://rrc.cvc.uab.es/?ch=17) (a linear layer on top of the text part of the hidden-states output to
+    compute `span start logits` and `span end logits`).
+    """,
+    LAYOUTLMV3_START_DOCSTRING,
+)
+class TFLayoutLMv3ForQuestionAnswering(TFLayoutLMv3PreTrainedModel, TFQuestionAnsweringLoss):
+    # names with a '.' represents the authorized unexpected/missing layers when a TF model is loaded from a PT model
+    _keys_to_ignore_on_load_unexpected = [r"position_ids"]
+
+    def __init__(self, config: LayoutLMv3Config, **kwargs):
+        super().__init__(config, **kwargs)
+
+        self.num_labels = config.num_labels
+
+        self.layoutlmv3 = TFLayoutLMv3MainLayer(config, name="layoutlmv3")
+        self.qa_outputs = TFLayoutLMv3ClassificationHead(config, name="qa_outputs")
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(LAYOUTLMV3_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=TFQuestionAnsweringModelOutput, config_class=_CONFIG_FOR_DOC)
+    def call(
+        self,
+        input_ids: tf.Tensor | None = None,
+        attention_mask: tf.Tensor | None = None,
+        token_type_ids: tf.Tensor | None = None,
+        position_ids: tf.Tensor | None = None,
+        head_mask: tf.Tensor | None = None,
+        inputs_embeds: tf.Tensor | None = None,
+        start_positions: tf.Tensor | None = None,
+        end_positions: tf.Tensor | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        bbox: tf.Tensor | None = None,
+        pixel_values: tf.Tensor | None = None,
+        return_dict: bool | None = None,
+        training: bool = False,
+    ) -> (
+        TFQuestionAnsweringModelOutput
+        | tuple[tf.Tensor]
+        | tuple[tf.Tensor, tf.Tensor]
+        | tuple[tf.Tensor, tf.Tensor, tf.Tensor]
+        | tuple[tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor]
+    ):
+        r"""
+        start_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*):
+            Labels for position (index) of the start of the labelled span for computing the token classification loss.
+            Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the
+            sequence are not taken into account for computing the loss.
+        end_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*):
+            Labels for position (index) of the end of the labelled span for computing the token classification loss.
+            Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the
+            sequence are not taken into account for computing the loss.
+
+        Returns:
+
+        Examples:
+
+        ```python
+        >>> from transformers import AutoProcessor, TFAutoModelForQuestionAnswering
+        >>> from datasets import load_dataset
+        >>> import tensorflow as tf
+
+        >>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)
+        >>> model = TFAutoModelForQuestionAnswering.from_pretrained("microsoft/layoutlmv3-base")
+
+        >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
+        >>> example = dataset[0]
+        >>> image = example["image"]
+        >>> question = "what's his name?"
+        >>> words = example["tokens"]
+        >>> boxes = example["bboxes"]
+
+        >>> encoding = processor(image, question, words, boxes=boxes, return_tensors="tf")
+        >>> start_positions = tf.convert_to_tensor([1])
+        >>> end_positions = tf.convert_to_tensor([3])
+
+        >>> outputs = model(**encoding, start_positions=start_positions, end_positions=end_positions)
+        >>> loss = outputs.loss
+        >>> start_scores = outputs.start_logits
+        >>> end_scores = outputs.end_logits
+        ```"""
+
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.layoutlmv3(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            bbox=bbox,
+            pixel_values=pixel_values,
+            training=training,
+        )
+
+        sequence_output = outputs[0]
+
+        logits = self.qa_outputs(sequence_output, training=training)
+        start_logits, end_logits = tf.split(value=logits, num_or_size_splits=2, axis=-1)
+        start_logits = tf.squeeze(input=start_logits, axis=-1)
+        end_logits = tf.squeeze(input=end_logits, axis=-1)
+
+        loss = None
+
+        if start_positions is not None and end_positions is not None:
+            labels = {"start_position": start_positions, "end_position": end_positions}
+            loss = self.hf_compute_loss(labels, logits=(start_logits, end_logits))
+
+        if not return_dict:
+            output = (start_logits, end_logits) + outputs[1:]
+            return ((loss,) + output) if loss is not None else output
+
+        return TFQuestionAnsweringModelOutput(
+            loss=loss,
+            start_logits=start_logits,
+            end_logits=end_logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "layoutlmv3", None) is not None:
+            with tf.name_scope(self.layoutlmv3.name):
+                self.layoutlmv3.build(None)
+        if getattr(self, "qa_outputs", None) is not None:
+            with tf.name_scope(self.qa_outputs.name):
+                self.qa_outputs.build(None)
+
+
+__all__ = [
+    "TFLayoutLMv3ForQuestionAnswering",
+    "TFLayoutLMv3ForSequenceClassification",
+    "TFLayoutLMv3ForTokenClassification",
+    "TFLayoutLMv3Model",
+    "TFLayoutLMv3PreTrainedModel",
+]
diff --git a/phivenv/Lib/site-packages/transformers/models/layoutlmv3/processing_layoutlmv3.py b/phivenv/Lib/site-packages/transformers/models/layoutlmv3/processing_layoutlmv3.py
new file mode 100644
index 0000000000000000000000000000000000000000..2e34e3a4655a8147c538b0b5738e8770737e9750
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/layoutlmv3/processing_layoutlmv3.py
@@ -0,0 +1,188 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Processor class for LayoutLMv3.
+"""
+
+import warnings
+from typing import Optional, Union
+
+from ...processing_utils import ProcessorMixin
+from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy
+from ...utils import TensorType
+
+
+class LayoutLMv3Processor(ProcessorMixin):
+    r"""
+    Constructs a LayoutLMv3 processor which combines a LayoutLMv3 image processor and a LayoutLMv3 tokenizer into a
+    single processor.
+
+    [`LayoutLMv3Processor`] offers all the functionalities you need to prepare data for the model.
+
+    It first uses [`LayoutLMv3ImageProcessor`] to resize and normalize document images, and optionally applies OCR to
+    get words and normalized bounding boxes. These are then provided to [`LayoutLMv3Tokenizer`] or
+    [`LayoutLMv3TokenizerFast`], which turns the words and bounding boxes into token-level `input_ids`,
+    `attention_mask`, `token_type_ids`, `bbox`. Optionally, one can provide integer `word_labels`, which are turned
+    into token-level `labels` for token classification tasks (such as FUNSD, CORD).
+
+    Args:
+        image_processor (`LayoutLMv3ImageProcessor`, *optional*):
+            An instance of [`LayoutLMv3ImageProcessor`]. The image processor is a required input.
+        tokenizer (`LayoutLMv3Tokenizer` or `LayoutLMv3TokenizerFast`, *optional*):
+            An instance of [`LayoutLMv3Tokenizer`] or [`LayoutLMv3TokenizerFast`]. The tokenizer is a required input.
+    """
+
+    attributes = ["image_processor", "tokenizer"]
+    image_processor_class = "LayoutLMv3ImageProcessor"
+    tokenizer_class = ("LayoutLMv3Tokenizer", "LayoutLMv3TokenizerFast")
+
+    def __init__(self, image_processor=None, tokenizer=None, **kwargs):
+        feature_extractor = None
+        if "feature_extractor" in kwargs:
+            warnings.warn(
+                "The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`"
+                " instead.",
+                FutureWarning,
+            )
+            feature_extractor = kwargs.pop("feature_extractor")
+
+        image_processor = image_processor if image_processor is not None else feature_extractor
+        if image_processor is None:
+            raise ValueError("You need to specify an `image_processor`.")
+        if tokenizer is None:
+            raise ValueError("You need to specify a `tokenizer`.")
+
+        super().__init__(image_processor, tokenizer)
+
+    def __call__(
+        self,
+        images,
+        text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
+        text_pair: Optional[Union[PreTokenizedInput, list[PreTokenizedInput]]] = None,
+        boxes: Optional[Union[list[list[int]], list[list[list[int]]]]] = None,
+        word_labels: Optional[Union[list[int], list[list[int]]]] = None,
+        add_special_tokens: bool = True,
+        padding: Union[bool, str, PaddingStrategy] = False,
+        truncation: Union[bool, str, TruncationStrategy] = None,
+        max_length: Optional[int] = None,
+        stride: int = 0,
+        pad_to_multiple_of: Optional[int] = None,
+        return_token_type_ids: Optional[bool] = None,
+        return_attention_mask: Optional[bool] = None,
+        return_overflowing_tokens: bool = False,
+        return_special_tokens_mask: bool = False,
+        return_offsets_mapping: bool = False,
+        return_length: bool = False,
+        verbose: bool = True,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        **kwargs,
+    ) -> BatchEncoding:
+        """
+        This method first forwards the `images` argument to [`~LayoutLMv3ImageProcessor.__call__`]. In case
+        [`LayoutLMv3ImageProcessor`] was initialized with `apply_ocr` set to `True`, it passes the obtained words and
+        bounding boxes along with the additional arguments to [`~LayoutLMv3Tokenizer.__call__`] and returns the output,
+        together with resized and normalized `pixel_values`. In case [`LayoutLMv3ImageProcessor`] was initialized with
+        `apply_ocr` set to `False`, it passes the words (`text`/`text_pair`) and `boxes` specified by the user along
+        with the additional arguments to [`~LayoutLMv3Tokenizer.__call__`] and returns the output, together with
+        resized and normalized `pixel_values`.
+
+        Please refer to the docstring of the above two methods for more information.
+        """
+        # verify input
+        if self.image_processor.apply_ocr and (boxes is not None):
+            raise ValueError(
+                "You cannot provide bounding boxes if you initialized the image processor with apply_ocr set to True."
+            )
+
+        if self.image_processor.apply_ocr and (word_labels is not None):
+            raise ValueError(
+                "You cannot provide word labels if you initialized the image processor with apply_ocr set to True."
+            )
+
+        # first, apply the image processor
+        features = self.image_processor(images=images, return_tensors=return_tensors)
+
+        # second, apply the tokenizer
+        if text is not None and self.image_processor.apply_ocr and text_pair is None:
+            if isinstance(text, str):
+                text = [text]  # add batch dimension (as the image processor always adds a batch dimension)
+            text_pair = features["words"]
+
+        encoded_inputs = self.tokenizer(
+            text=text if text is not None else features["words"],
+            text_pair=text_pair if text_pair is not None else None,
+            boxes=boxes if boxes is not None else features["boxes"],
+            word_labels=word_labels,
+            add_special_tokens=add_special_tokens,
+            padding=padding,
+            truncation=truncation,
+            max_length=max_length,
+            stride=stride,
+            pad_to_multiple_of=pad_to_multiple_of,
+            return_token_type_ids=return_token_type_ids,
+            return_attention_mask=return_attention_mask,
+            return_overflowing_tokens=return_overflowing_tokens,
+            return_special_tokens_mask=return_special_tokens_mask,
+            return_offsets_mapping=return_offsets_mapping,
+            return_length=return_length,
+            verbose=verbose,
+            return_tensors=return_tensors,
+            **kwargs,
+        )
+
+        # add pixel values
+        images = features.pop("pixel_values")
+        if return_overflowing_tokens is True:
+            images = self.get_overflowing_images(images, encoded_inputs["overflow_to_sample_mapping"])
+        encoded_inputs["pixel_values"] = images
+
+        return encoded_inputs
+
+    def get_overflowing_images(self, images, overflow_to_sample_mapping):
+        # in case there's an overflow, ensure each `input_ids` sample is mapped to its corresponding image
+        images_with_overflow = []
+        for sample_idx in overflow_to_sample_mapping:
+            images_with_overflow.append(images[sample_idx])
+
+        if len(images_with_overflow) != len(overflow_to_sample_mapping):
+            raise ValueError(
+                "Expected length of images to be the same as the length of `overflow_to_sample_mapping`, but got"
+                f" {len(images_with_overflow)} and {len(overflow_to_sample_mapping)}"
+            )
+
+        return images_with_overflow
+
+    @property
+    def model_input_names(self):
+        return ["input_ids", "bbox", "attention_mask", "pixel_values"]
+
+    @property
+    def feature_extractor_class(self):
+        warnings.warn(
+            "`feature_extractor_class` is deprecated and will be removed in v5. Use `image_processor_class` instead.",
+            FutureWarning,
+        )
+        return self.image_processor_class
+
+    @property
+    def feature_extractor(self):
+        warnings.warn(
+            "`feature_extractor` is deprecated and will be removed in v5. Use `image_processor` instead.",
+            FutureWarning,
+        )
+        return self.image_processor
+
+
+__all__ = ["LayoutLMv3Processor"]
diff --git a/phivenv/Lib/site-packages/transformers/models/layoutlmv3/tokenization_layoutlmv3.py b/phivenv/Lib/site-packages/transformers/models/layoutlmv3/tokenization_layoutlmv3.py
new file mode 100644
index 0000000000000000000000000000000000000000..b69fc57b1743c4b0349f33b3bc1f4a3d35120df3
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/layoutlmv3/tokenization_layoutlmv3.py
@@ -0,0 +1,1487 @@
+# coding=utf-8
+# Copyright The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization class for LayoutLMv3. Same as LayoutLMv2, but RoBERTa-like BPE tokenization instead of WordPiece."""
+
+import json
+import os
+from functools import lru_cache
+from typing import Optional, Union
+
+import regex as re
+
+from ...tokenization_utils import AddedToken, PreTrainedTokenizer
+from ...tokenization_utils_base import (
+    BatchEncoding,
+    EncodedInput,
+    PreTokenizedInput,
+    TextInput,
+    TextInputPair,
+    TruncationStrategy,
+)
+from ...utils import PaddingStrategy, TensorType, add_end_docstrings, logging
+
+
+logger = logging.get_logger(__name__)
+
+VOCAB_FILES_NAMES = {
+    "vocab_file": "vocab.json",
+    "merges_file": "merges.txt",
+}
+
+
+LAYOUTLMV3_ENCODE_KWARGS_DOCSTRING = r"""
+            add_special_tokens (`bool`, *optional*, defaults to `True`):
+                Whether or not to encode the sequences with the special tokens relative to their model.
+            padding (`bool`, `str` or [`~file_utils.PaddingStrategy`], *optional*, defaults to `False`):
+                Activates and controls padding. Accepts the following values:
+
+                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
+                  sequence if provided).
+                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
+                  acceptable input length for the model if that argument is not provided.
+                - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
+                  lengths).
+            truncation (`bool`, `str` or [`~tokenization_utils_base.TruncationStrategy`], *optional*, defaults to `False`):
+                Activates and controls truncation. Accepts the following values:
+
+                - `True` or `'longest_first'`: Truncate to a maximum length specified with the argument `max_length` or
+                  to the maximum acceptable input length for the model if that argument is not provided. This will
+                  truncate token by token, removing a token from the longest sequence in the pair if a pair of
+                  sequences (or a batch of pairs) is provided.
+                - `'only_first'`: Truncate to a maximum length specified with the argument `max_length` or to the
+                  maximum acceptable input length for the model if that argument is not provided. This will only
+                  truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
+                - `'only_second'`: Truncate to a maximum length specified with the argument `max_length` or to the
+                  maximum acceptable input length for the model if that argument is not provided. This will only
+                  truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
+                - `False` or `'do_not_truncate'` (default): No truncation (i.e., can output batch with sequence lengths
+                  greater than the model maximum admissible input size).
+            max_length (`int`, *optional*):
+                Controls the maximum length to use by one of the truncation/padding parameters.
+
+                If left unset or set to `None`, this will use the predefined model maximum length if a maximum length
+                is required by one of the truncation/padding parameters. If the model has no specific maximum input
+                length (like XLNet) truncation/padding to a maximum length will be deactivated.
+            stride (`int`, *optional*, defaults to 0):
+                If set to a number along with `max_length`, the overflowing tokens returned when
+                `return_overflowing_tokens=True` will contain some tokens from the end of the truncated sequence
+                returned to provide some overlap between truncated and overflowing sequences. The value of this
+                argument defines the number of overlapping tokens.
+            pad_to_multiple_of (`int`, *optional*):
+                If set will pad the sequence to a multiple of the provided value. This is especially useful to enable
+                the use of Tensor Cores on NVIDIA hardware with compute capability `>= 7.5` (Volta).
+            return_tensors (`str` or [`~file_utils.TensorType`], *optional*):
+                If set, will return tensors instead of list of python integers. Acceptable values are:
+
+                - `'tf'`: Return TensorFlow `tf.constant` objects.
+                - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                - `'np'`: Return Numpy `np.ndarray` objects.
+"""
+
+
+LAYOUTLMV3_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING = r"""
+            add_special_tokens (`bool`, *optional*, defaults to `True`):
+                Whether or not to encode the sequences with the special tokens relative to their model.
+            padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`):
+                Activates and controls padding. Accepts the following values:
+
+                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
+                  sequence if provided).
+                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
+                  acceptable input length for the model if that argument is not provided.
+                - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
+                  lengths).
+            truncation (`bool`, `str` or [`~tokenization_utils_base.TruncationStrategy`], *optional*, defaults to `False`):
+                Activates and controls truncation. Accepts the following values:
+
+                - `True` or `'longest_first'`: Truncate to a maximum length specified with the argument `max_length` or
+                  to the maximum acceptable input length for the model if that argument is not provided. This will
+                  truncate token by token, removing a token from the longest sequence in the pair if a pair of
+                  sequences (or a batch of pairs) is provided.
+                - `'only_first'`: Truncate to a maximum length specified with the argument `max_length` or to the
+                  maximum acceptable input length for the model if that argument is not provided. This will only
+                  truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
+                - `'only_second'`: Truncate to a maximum length specified with the argument `max_length` or to the
+                  maximum acceptable input length for the model if that argument is not provided. This will only
+                  truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
+                - `False` or `'do_not_truncate'` (default): No truncation (i.e., can output batch with sequence lengths
+                  greater than the model maximum admissible input size).
+            max_length (`int`, *optional*):
+                Controls the maximum length to use by one of the truncation/padding parameters. If left unset or set to
+                `None`, this will use the predefined model maximum length if a maximum length is required by one of the
+                truncation/padding parameters. If the model has no specific maximum input length (like XLNet)
+                truncation/padding to a maximum length will be deactivated.
+            stride (`int`, *optional*, defaults to 0):
+                If set to a number along with `max_length`, the overflowing tokens returned when
+                `return_overflowing_tokens=True` will contain some tokens from the end of the truncated sequence
+                returned to provide some overlap between truncated and overflowing sequences. The value of this
+                argument defines the number of overlapping tokens.
+            pad_to_multiple_of (`int`, *optional*):
+                If set will pad the sequence to a multiple of the provided value. This is especially useful to enable
+                the use of Tensor Cores on NVIDIA hardware with compute capability `>= 7.5` (Volta).
+            return_tensors (`str` or [`~utils.TensorType`], *optional*):
+                If set, will return tensors instead of list of python integers. Acceptable values are:
+
+                - `'tf'`: Return TensorFlow `tf.constant` objects.
+                - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                - `'np'`: Return Numpy `np.ndarray` objects.
+"""
+
+
+@lru_cache
+# Copied from transformers.models.roberta.tokenization_roberta.bytes_to_unicode
+def bytes_to_unicode():
+    """
+    Returns a dict mapping each utf-8 byte value to a unicode string. We specifically avoid mapping to
+    whitespace/control characters, which the bpe code cannot handle.
+
+    The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab
+    if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for
+    decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup
+    tables between utf-8 bytes and unicode strings.
+    """
+    bs = (
+        list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1))
+    )
+    cs = bs[:]
+    n = 0
+    for b in range(2**8):
+        if b not in bs:
+            bs.append(b)
+            cs.append(2**8 + n)
+            n += 1
+    cs = [chr(n) for n in cs]
+    return dict(zip(bs, cs))
+
+
+# Copied from transformers.models.roberta.tokenization_roberta.get_pairs
+def get_pairs(word):
+    """
+    Return set of symbol pairs in a word.
+
+    Word is represented as tuple of symbols (symbols being variable-length strings).
+    """
+    pairs = set()
+    prev_char = word[0]
+    for char in word[1:]:
+        pairs.add((prev_char, char))
+        prev_char = char
+    return pairs
+
+
+class LayoutLMv3Tokenizer(PreTrainedTokenizer):
+    r"""
+    Construct a LayoutLMv3 tokenizer. Based on [`RobertaTokenizer`] (Byte Pair Encoding or BPE).
+    [`LayoutLMv3Tokenizer`] can be used to turn words, word-level bounding boxes and optional word labels to
+    token-level `input_ids`, `attention_mask`, `token_type_ids`, `bbox`, and optional `labels` (for token
+    classification).
+
+    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
+    this superclass for more information regarding those methods.
+
+    [`LayoutLMv3Tokenizer`] runs end-to-end tokenization using RoBERTa-like byte-level BPE. It also turns the
+    word-level bounding boxes into token-level bounding boxes.
+
+    Args:
+        vocab_file (`str`):
+            Path to the vocabulary file.
+        merges_file (`str`):
+            Path to the merges file.
+        errors (`str`, *optional*, defaults to `"replace"`):
+            Paradigm to follow when decoding bytes to UTF-8. See
+            [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
+        bos_token (`str`, *optional*, defaults to `"<s>"`):
+            The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token.
+
+            <Tip>
+
+            When building a sequence using special tokens, this is not the token that is used for the beginning of
+            sequence. The token used is the `cls_token`.
+
+            </Tip>
+
+        eos_token (`str`, *optional*, defaults to `"</s>"`):
+            The end of sequence token.
+
+            <Tip>
+
+            When building a sequence using special tokens, this is not the token that is used for the end of sequence.
+            The token used is the `sep_token`.
+
+            </Tip>
+
+        sep_token (`str`, *optional*, defaults to `"</s>"`):
+            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
+            sequence classification or for a text and a question for question answering. It is also used as the last
+            token of a sequence built with special tokens.
+        cls_token (`str`, *optional*, defaults to `"<s>"`):
+            The classifier token which is used when doing sequence classification (classification of the whole sequence
+            instead of per-token classification). It is the first token of the sequence when built with special tokens.
+        unk_token (`str`, *optional*, defaults to `"<unk>"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        pad_token (`str`, *optional*, defaults to `"<pad>"`):
+            The token used for padding, for example when batching sequences of different lengths.
+        mask_token (`str`, *optional*, defaults to `"<mask>"`):
+            The token used for masking values. This is the token used when training this model with masked language
+            modeling. This is the token which the model will try to predict.
+        add_prefix_space (`bool`, *optional*, defaults to `True`):
+            Whether or not to add an initial space to the input. This allows the leading word to be treated just like any
+            other word. (The RoBERTa tokenizer detects the beginning of words by the preceding space.)
+        cls_token_box (`list[int]`, *optional*, defaults to `[0, 0, 0, 0]`):
+            The bounding box to use for the special [CLS] token.
+        sep_token_box (`list[int]`, *optional*, defaults to `[0, 0, 0, 0]`):
+            The bounding box to use for the special [SEP] token.
+        pad_token_box (`list[int]`, *optional*, defaults to `[0, 0, 0, 0]`):
+            The bounding box to use for the special [PAD] token.
+        pad_token_label (`int`, *optional*, defaults to -100):
+            The label to use for padding tokens. Defaults to -100, which is the `ignore_index` of PyTorch's
+            CrossEntropyLoss.
+        only_label_first_subword (`bool`, *optional*, defaults to `True`):
+            Whether or not to only label the first subword, in case word labels are provided.
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    model_input_names = ["input_ids", "attention_mask", "bbox"]
+
+    def __init__(
+        self,
+        vocab_file,
+        merges_file,
+        errors="replace",
+        bos_token="<s>",
+        eos_token="</s>",
+        sep_token="</s>",
+        cls_token="<s>",
+        unk_token="<unk>",
+        pad_token="<pad>",
+        mask_token="<mask>",
+        add_prefix_space=True,
+        cls_token_box=[0, 0, 0, 0],
+        sep_token_box=[0, 0, 0, 0],
+        pad_token_box=[0, 0, 0, 0],
+        pad_token_label=-100,
+        only_label_first_subword=True,
+        **kwargs,
+    ):
+        bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
+        eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
+        sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token
+        cls_token = AddedToken(cls_token, lstrip=False, rstrip=False) if isinstance(cls_token, str) else cls_token
+        unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
+        pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
+
+        # The mask token behaves like a normal word, i.e. it includes the space before it
+        mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
+
+        with open(vocab_file, encoding="utf-8") as vocab_handle:
+            self.encoder = json.load(vocab_handle)
+        self.decoder = {v: k for k, v in self.encoder.items()}
+        self.errors = errors  # how to handle errors in decoding
+        self.byte_encoder = bytes_to_unicode()
+        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
+        with open(merges_file, encoding="utf-8") as merges_handle:
+            bpe_merges = merges_handle.read().split("\n")[1:-1]
+        bpe_merges = [tuple(merge.split()) for merge in bpe_merges]
+        self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
+        self.cache = {}
+        self.add_prefix_space = add_prefix_space
+
+        # Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
+        self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
+
+        # additional properties
+        self.cls_token_box = cls_token_box
+        self.sep_token_box = sep_token_box
+        self.pad_token_box = pad_token_box
+        self.pad_token_label = pad_token_label
+        self.only_label_first_subword = only_label_first_subword
+
+        super().__init__(
+            errors=errors,
+            bos_token=bos_token,
+            eos_token=eos_token,
+            unk_token=unk_token,
+            sep_token=sep_token,
+            cls_token=cls_token,
+            pad_token=pad_token,
+            mask_token=mask_token,
+            add_prefix_space=add_prefix_space,
+            cls_token_box=cls_token_box,
+            sep_token_box=sep_token_box,
+            pad_token_box=pad_token_box,
+            pad_token_label=pad_token_label,
+            only_label_first_subword=only_label_first_subword,
+            **kwargs,
+        )
+
+    @property
+    # Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer.vocab_size
+    def vocab_size(self):
+        return len(self.encoder)
+
+    # Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer.get_vocab
+    def get_vocab(self):
+        vocab = dict(self.encoder).copy()
+        vocab.update(self.added_tokens_encoder)
+        return vocab
+
+    # Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer.bpe
+    def bpe(self, token):
+        """
+        Apply the learned BPE merges to `token` and return the resulting symbols joined by single spaces.
+
+        Results are memoized in `self.cache`. A token for which `get_pairs` yields nothing is returned unchanged
+        and is not cached.
+
+        Args:
+            token (`str`): The string to split; it is first broken into its individual characters.
+
+        Returns:
+            `str`: The merged symbols separated by `" "`.
+        """
+        if token in self.cache:
+            return self.cache[token]
+        # Start from one symbol per character.
+        word = tuple(token)
+        # NOTE(review): `get_pairs` is defined elsewhere; presumably it returns the adjacent symbol pairs of `word`.
+        pairs = get_pairs(word)
+
+        if not pairs:
+            return token
+
+        while True:
+            # Pick the pair with the lowest merge rank; pairs without a rank compare as +inf.
+            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
+            if bigram not in self.bpe_ranks:
+                # No remaining pair is a known merge.
+                break
+            first, second = bigram
+            new_word = []
+            i = 0
+            # Rebuild `word`, fusing every occurrence of `first` immediately followed by `second`.
+            while i < len(word):
+                try:
+                    j = word.index(first, i)
+                except ValueError:
+                    # `first` does not occur again: copy the remainder and stop scanning.
+                    new_word.extend(word[i:])
+                    break
+                else:
+                    # Copy everything before the next occurrence of `first` unchanged.
+                    new_word.extend(word[i:j])
+                    i = j
+
+                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
+                    new_word.append(first + second)
+                    i += 2
+                else:
+                    new_word.append(word[i])
+                    i += 1
+            new_word = tuple(new_word)
+            word = new_word
+            if len(word) == 1:
+                # Everything collapsed into a single symbol; nothing left to merge.
+                break
+            else:
+                pairs = get_pairs(word)
+        word = " ".join(word)
+        self.cache[token] = word
+        return word
+
+    # Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer._tokenize
+    def _tokenize(self, text):
+        """Tokenize a string."""
+        bpe_tokens = []
+        for token in re.findall(self.pat, text):
+            token = "".join(
+                self.byte_encoder[b] for b in token.encode("utf-8")
+            )  # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case)
+            bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" "))
+        return bpe_tokens
+
+    # Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer._convert_token_to_id
+    def _convert_token_to_id(self, token):
+        """Converts a token (str) in an id using the vocab."""
+        return self.encoder.get(token, self.encoder.get(self.unk_token))
+
+    # Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer._convert_id_to_token
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) in a token (str) using the vocab."""
+        return self.decoder.get(index)
+
+    # Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer.convert_tokens_to_string
+    def convert_tokens_to_string(self, tokens):
+        """Converts a sequence of tokens (string) in a single string."""
+        text = "".join(tokens)
+        text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors)
+        return text
+
+    # Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer.save_vocabulary
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
+        if not os.path.isdir(save_directory):
+            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+            return
+        vocab_file = os.path.join(
+            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+        )
+        merge_file = os.path.join(
+            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
+        )
+
+        with open(vocab_file, "w", encoding="utf-8") as f:
+            f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")
+
+        index = 0
+        with open(merge_file, "w", encoding="utf-8") as writer:
+            writer.write("#version: 0.2\n")
+            for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
+                if index != token_index:
+                    logger.warning(
+                        f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
+                        " Please check that the tokenizer is not corrupted!"
+                    )
+                    index = token_index
+                writer.write(" ".join(bpe_tokens) + "\n")
+                index += 1
+
+        return vocab_file, merge_file
+
+    # Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer.build_inputs_with_special_tokens
+    def build_inputs_with_special_tokens(
+        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
+    ) -> list[int]:
+        """
+        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
+        adding special tokens. A RoBERTa sequence has the following format:
+
+        - single sequence: `<s> X </s>`
+        - pair of sequences: `<s> A </s></s> B </s>`
+
+        Args:
+            token_ids_0 (`list[int]`):
+                List of IDs to which the special tokens will be added.
+            token_ids_1 (`list[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            `list[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
+        """
+        if token_ids_1 is None:
+            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
+        cls = [self.cls_token_id]
+        sep = [self.sep_token_id]
+        return cls + token_ids_0 + sep + sep + token_ids_1 + sep
+
+    # Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer.get_special_tokens_mask
+    def get_special_tokens_mask(
+        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False
+    ) -> list[int]:
+        """
+        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
+        special tokens using the tokenizer `prepare_for_model` method.
+
+        Args:
+            token_ids_0 (`list[int]`):
+                List of IDs.
+            token_ids_1 (`list[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
+                Whether or not the token list is already formatted with special tokens for the model.
+
+        Returns:
+            `list[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+        """
+        if already_has_special_tokens:
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+            )
+
+        if token_ids_1 is None:
+            return [1] + ([0] * len(token_ids_0)) + [1]
+        return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
+
+    # Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer.create_token_type_ids_from_sequences
+    def create_token_type_ids_from_sequences(
+        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
+    ) -> list[int]:
+        """
+        Create a mask from the two sequences passed to be used in a sequence-pair classification task. RoBERTa does not
+        make use of token type ids, therefore a list of zeros is returned.
+
+        Args:
+            token_ids_0 (`list[int]`):
+                List of IDs.
+            token_ids_1 (`list[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            `list[int]`: List of zeros.
+        """
+        sep = [self.sep_token_id]
+        cls = [self.cls_token_id]
+
+        if token_ids_1 is None:
+            return len(cls + token_ids_0 + sep) * [0]
+        return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
+
+    def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
+        add_prefix_space = kwargs.pop("add_prefix_space", self.add_prefix_space)
+        # If the text starts with a token that should not be split, no space is added before the text in any case.
+        # It's necessary to match the fast tokenization
+        if (
+            (is_split_into_words or add_prefix_space)
+            and (len(text) > 0 and not text[0].isspace())
+            and sum([text.startswith(no_split_token) for no_split_token in self.added_tokens_encoder]) == 0
+        ):
+            text = " " + text
+        return (text, kwargs)
+
+    @add_end_docstrings(LAYOUTLMV3_ENCODE_KWARGS_DOCSTRING, LAYOUTLMV3_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
+    # Copied from transformers.models.layoutlmv2.tokenization_layoutlmv2.LayoutLMv2Tokenizer.__call__
+    def __call__(
+        self,
+        text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]],
+        text_pair: Optional[Union[PreTokenizedInput, list[PreTokenizedInput]]] = None,
+        boxes: Optional[Union[list[list[int]], list[list[list[int]]]]] = None,
+        word_labels: Optional[Union[list[int], list[list[int]]]] = None,
+        add_special_tokens: bool = True,
+        padding: Union[bool, str, PaddingStrategy] = False,
+        truncation: Union[bool, str, TruncationStrategy] = None,
+        max_length: Optional[int] = None,
+        stride: int = 0,
+        pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[str] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        return_token_type_ids: Optional[bool] = None,
+        return_attention_mask: Optional[bool] = None,
+        return_overflowing_tokens: bool = False,
+        return_special_tokens_mask: bool = False,
+        return_offsets_mapping: bool = False,
+        return_length: bool = False,
+        verbose: bool = True,
+        **kwargs,
+    ) -> BatchEncoding:
+        """
+        Main method to tokenize and prepare for the model one or several sequence(s) or one or several pair(s) of
+        sequences with word-level normalized bounding boxes and optional labels.
+
+        Args:
+            text (`str`, `List[str]`, `List[List[str]]`):
+                The sequence or batch of sequences to be encoded. Each sequence can be a string, a list of strings
+                (words of a single example or questions of a batch of examples) or a list of list of strings (batch of
+                words).
+            text_pair (`List[str]`, `List[List[str]]`):
+                The sequence or batch of sequences to be encoded. Each sequence should be a list of strings
+                (pretokenized string).
+            boxes (`List[List[int]]`, `List[List[List[int]]]`):
+                Word-level bounding boxes. Each bounding box should be normalized to be on a 0-1000 scale.
+            word_labels (`List[int]`, `List[List[int]]`, *optional*):
+                Word-level integer labels (for token classification tasks such as FUNSD, CORD).
+        """
+
+        # Input type checking for clearer error
+        def _is_valid_text_input(t):
+            if isinstance(t, str):
+                # Strings are fine
+                return True
+            elif isinstance(t, (list, tuple)):
+                # List are fine as long as they are...
+                if len(t) == 0:
+                    # ... empty
+                    return True
+                elif isinstance(t[0], str):
+                    # ... list of strings
+                    return True
+                elif isinstance(t[0], (list, tuple)):
+                    # ... list with an empty list or with a list of strings
+                    return len(t[0]) == 0 or isinstance(t[0][0], str)
+                else:
+                    return False
+            else:
+                return False
+
+        if text_pair is not None:
+            # in case text + text_pair are provided, text = questions, text_pair = words
+            if not _is_valid_text_input(text):
+                raise ValueError("text input must of type `str` (single example) or `List[str]` (batch of examples). ")
+            if not isinstance(text_pair, (list, tuple)):
+                raise ValueError(
+                    "Words must be of type `List[str]` (single pretokenized example), "
+                    "or `List[List[str]]` (batch of pretokenized examples)."
+                )
+        else:
+            # in case only text is provided => must be words
+            if not isinstance(text, (list, tuple)):
+                raise ValueError(
+                    "Words must be of type `List[str]` (single pretokenized example), "
+                    "or `List[List[str]]` (batch of pretokenized examples)."
+                )
+
+        if text_pair is not None:
+            is_batched = isinstance(text, (list, tuple))
+        else:
+            is_batched = isinstance(text, (list, tuple)) and text and isinstance(text[0], (list, tuple))
+
+        words = text if text_pair is None else text_pair
+        if boxes is None:
+            raise ValueError("You must provide corresponding bounding boxes")
+        if is_batched:
+            if len(words) != len(boxes):
+                raise ValueError("You must provide words and boxes for an equal amount of examples")
+            for words_example, boxes_example in zip(words, boxes):
+                if len(words_example) != len(boxes_example):
+                    raise ValueError("You must provide as many words as there are bounding boxes")
+        else:
+            if len(words) != len(boxes):
+                raise ValueError("You must provide as many words as there are bounding boxes")
+
+        if is_batched:
+            if text_pair is not None and len(text) != len(text_pair):
+                raise ValueError(
+                    f"batch length of `text`: {len(text)} does not match batch length of `text_pair`:"
+                    f" {len(text_pair)}."
+                )
+            batch_text_or_text_pairs = list(zip(text, text_pair)) if text_pair is not None else text
+            is_pair = bool(text_pair is not None)
+            return self.batch_encode_plus(
+                batch_text_or_text_pairs=batch_text_or_text_pairs,
+                is_pair=is_pair,
+                boxes=boxes,
+                word_labels=word_labels,
+                add_special_tokens=add_special_tokens,
+                padding=padding,
+                truncation=truncation,
+                max_length=max_length,
+                stride=stride,
+                pad_to_multiple_of=pad_to_multiple_of,
+                padding_side=padding_side,
+                return_tensors=return_tensors,
+                return_token_type_ids=return_token_type_ids,
+                return_attention_mask=return_attention_mask,
+                return_overflowing_tokens=return_overflowing_tokens,
+                return_special_tokens_mask=return_special_tokens_mask,
+                return_offsets_mapping=return_offsets_mapping,
+                return_length=return_length,
+                verbose=verbose,
+                **kwargs,
+            )
+        else:
+            return self.encode_plus(
+                text=text,
+                text_pair=text_pair,
+                boxes=boxes,
+                word_labels=word_labels,
+                add_special_tokens=add_special_tokens,
+                padding=padding,
+                truncation=truncation,
+                max_length=max_length,
+                stride=stride,
+                pad_to_multiple_of=pad_to_multiple_of,
+                padding_side=padding_side,
+                return_tensors=return_tensors,
+                return_token_type_ids=return_token_type_ids,
+                return_attention_mask=return_attention_mask,
+                return_overflowing_tokens=return_overflowing_tokens,
+                return_special_tokens_mask=return_special_tokens_mask,
+                return_offsets_mapping=return_offsets_mapping,
+                return_length=return_length,
+                verbose=verbose,
+                **kwargs,
+            )
+
+    @add_end_docstrings(LAYOUTLMV3_ENCODE_KWARGS_DOCSTRING, LAYOUTLMV3_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
+    # Copied from transformers.models.layoutlmv2.tokenization_layoutlmv2.LayoutLMv2Tokenizer.batch_encode_plus
+    def batch_encode_plus(
+        self,
+        batch_text_or_text_pairs: Union[
+            list[TextInput],
+            list[TextInputPair],
+            list[PreTokenizedInput],
+        ],
+        is_pair: Optional[bool] = None,
+        boxes: Optional[list[list[list[int]]]] = None,
+        word_labels: Optional[Union[list[int], list[list[int]]]] = None,
+        add_special_tokens: bool = True,
+        padding: Union[bool, str, PaddingStrategy] = False,
+        truncation: Union[bool, str, TruncationStrategy] = None,
+        max_length: Optional[int] = None,
+        stride: int = 0,
+        pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[str] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        return_token_type_ids: Optional[bool] = None,
+        return_attention_mask: Optional[bool] = None,
+        return_overflowing_tokens: bool = False,
+        return_special_tokens_mask: bool = False,
+        return_offsets_mapping: bool = False,
+        return_length: bool = False,
+        verbose: bool = True,
+        **kwargs,
+    ) -> BatchEncoding:
+        # Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
+        padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
+            padding=padding,
+            truncation=truncation,
+            max_length=max_length,
+            pad_to_multiple_of=pad_to_multiple_of,
+            verbose=verbose,
+            **kwargs,
+        )
+
+        return self._batch_encode_plus(
+            batch_text_or_text_pairs=batch_text_or_text_pairs,
+            is_pair=is_pair,
+            boxes=boxes,
+            word_labels=word_labels,
+            add_special_tokens=add_special_tokens,
+            padding_strategy=padding_strategy,
+            truncation_strategy=truncation_strategy,
+            max_length=max_length,
+            stride=stride,
+            pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
+            return_tensors=return_tensors,
+            return_token_type_ids=return_token_type_ids,
+            return_attention_mask=return_attention_mask,
+            return_overflowing_tokens=return_overflowing_tokens,
+            return_special_tokens_mask=return_special_tokens_mask,
+            return_offsets_mapping=return_offsets_mapping,
+            return_length=return_length,
+            verbose=verbose,
+            **kwargs,
+        )
+
+    # Copied from transformers.models.layoutlmv2.tokenization_layoutlmv2.LayoutLMv2Tokenizer._batch_encode_plus
+    def _batch_encode_plus(
+        self,
+        batch_text_or_text_pairs: Union[
+            list[TextInput],
+            list[TextInputPair],
+            list[PreTokenizedInput],
+        ],
+        is_pair: Optional[bool] = None,
+        boxes: Optional[list[list[list[int]]]] = None,
+        word_labels: Optional[list[list[int]]] = None,
+        add_special_tokens: bool = True,
+        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
+        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
+        max_length: Optional[int] = None,
+        stride: int = 0,
+        pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[str] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        return_token_type_ids: Optional[bool] = None,
+        return_attention_mask: Optional[bool] = None,
+        return_overflowing_tokens: bool = False,
+        return_special_tokens_mask: bool = False,
+        return_offsets_mapping: bool = False,
+        return_length: bool = False,
+        verbose: bool = True,
+        **kwargs,
+    ) -> BatchEncoding:
+        if return_offsets_mapping:
+            raise NotImplementedError(
+                "return_offset_mapping is not available when using Python tokenizers. "
+                "To use this feature, change your tokenizer to one deriving from "
+                "transformers.PreTrainedTokenizerFast."
+            )
+
+        batch_outputs = self._batch_prepare_for_model(
+            batch_text_or_text_pairs=batch_text_or_text_pairs,
+            is_pair=is_pair,
+            boxes=boxes,
+            word_labels=word_labels,
+            add_special_tokens=add_special_tokens,
+            padding_strategy=padding_strategy,
+            truncation_strategy=truncation_strategy,
+            max_length=max_length,
+            stride=stride,
+            pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
+            return_attention_mask=return_attention_mask,
+            return_token_type_ids=return_token_type_ids,
+            return_overflowing_tokens=return_overflowing_tokens,
+            return_special_tokens_mask=return_special_tokens_mask,
+            return_length=return_length,
+            return_tensors=return_tensors,
+            verbose=verbose,
+        )
+
+        return BatchEncoding(batch_outputs)
+
+    @add_end_docstrings(LAYOUTLMV3_ENCODE_KWARGS_DOCSTRING, LAYOUTLMV3_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
+    # Copied from transformers.models.layoutlmv2.tokenization_layoutlmv2.LayoutLMv2Tokenizer._batch_prepare_for_model
+    def _batch_prepare_for_model(
+        self,
+        batch_text_or_text_pairs,
+        is_pair: Optional[bool] = None,
+        boxes: Optional[list[list[int]]] = None,
+        word_labels: Optional[list[list[int]]] = None,
+        add_special_tokens: bool = True,
+        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
+        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
+        max_length: Optional[int] = None,
+        stride: int = 0,
+        pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[str] = None,
+        return_tensors: Optional[str] = None,
+        return_token_type_ids: Optional[bool] = None,
+        return_attention_mask: Optional[bool] = None,
+        return_overflowing_tokens: bool = False,
+        return_special_tokens_mask: bool = False,
+        return_length: bool = False,
+        verbose: bool = True,
+    ) -> BatchEncoding:
+        """
+        Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model. It
+        adds special tokens, truncates sequences if overflowing while taking into account the special tokens and
+        manages a moving window (with user defined stride) for overflowing tokens.
+
+        Args:
+            batch_ids_pairs: list of tokenized input ids or input ids pairs
+        """
+
+        batch_outputs = {}
+        for idx, example in enumerate(zip(batch_text_or_text_pairs, boxes)):
+            batch_text_or_text_pair, boxes_example = example
+            outputs = self.prepare_for_model(
+                batch_text_or_text_pair[0] if is_pair else batch_text_or_text_pair,
+                batch_text_or_text_pair[1] if is_pair else None,
+                boxes_example,
+                word_labels=word_labels[idx] if word_labels is not None else None,
+                add_special_tokens=add_special_tokens,
+                padding=PaddingStrategy.DO_NOT_PAD.value,  # we pad in batch afterward
+                truncation=truncation_strategy.value,
+                max_length=max_length,
+                stride=stride,
+                pad_to_multiple_of=None,  # we pad in batch afterward
+                padding_side=None,  # we pad in batch afterward
+                return_attention_mask=False,  # we pad in batch afterward
+                return_token_type_ids=return_token_type_ids,
+                return_overflowing_tokens=return_overflowing_tokens,
+                return_special_tokens_mask=return_special_tokens_mask,
+                return_length=return_length,
+                return_tensors=None,  # We convert the whole batch to tensors at the end
+                prepend_batch_axis=False,
+                verbose=verbose,
+            )
+
+            for key, value in outputs.items():
+                if key not in batch_outputs:
+                    batch_outputs[key] = []
+                batch_outputs[key].append(value)
+
+        batch_outputs = self.pad(
+            batch_outputs,
+            padding=padding_strategy.value,
+            max_length=max_length,
+            pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
+            return_attention_mask=return_attention_mask,
+        )
+
+        batch_outputs = BatchEncoding(batch_outputs, tensor_type=return_tensors)
+
+        return batch_outputs
+
+    @add_end_docstrings(LAYOUTLMV3_ENCODE_KWARGS_DOCSTRING)
+    # Copied from transformers.models.layoutlmv2.tokenization_layoutlmv2.LayoutLMv2Tokenizer.encode
+    def encode(
+        self,
+        text: Union[TextInput, PreTokenizedInput],
+        text_pair: Optional[PreTokenizedInput] = None,
+        boxes: Optional[list[list[int]]] = None,
+        word_labels: Optional[list[int]] = None,
+        add_special_tokens: bool = True,
+        padding: Union[bool, str, PaddingStrategy] = False,
+        truncation: Union[bool, str, TruncationStrategy] = None,
+        max_length: Optional[int] = None,
+        stride: int = 0,
+        pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[str] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        return_token_type_ids: Optional[bool] = None,
+        return_attention_mask: Optional[bool] = None,
+        return_overflowing_tokens: bool = False,
+        return_special_tokens_mask: bool = False,
+        return_offsets_mapping: bool = False,
+        return_length: bool = False,
+        verbose: bool = True,
+        **kwargs,
+    ) -> list[int]:
+        encoded_inputs = self.encode_plus(
+            text=text,
+            text_pair=text_pair,
+            boxes=boxes,
+            word_labels=word_labels,
+            add_special_tokens=add_special_tokens,
+            padding=padding,
+            truncation=truncation,
+            max_length=max_length,
+            stride=stride,
+            pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
+            return_tensors=return_tensors,
+            return_token_type_ids=return_token_type_ids,
+            return_attention_mask=return_attention_mask,
+            return_overflowing_tokens=return_overflowing_tokens,
+            return_special_tokens_mask=return_special_tokens_mask,
+            return_offsets_mapping=return_offsets_mapping,
+            return_length=return_length,
+            verbose=verbose,
+            **kwargs,
+        )
+
+        return encoded_inputs["input_ids"]
+
+    @add_end_docstrings(LAYOUTLMV3_ENCODE_KWARGS_DOCSTRING, LAYOUTLMV3_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
+    # Copied from transformers.models.layoutlmv2.tokenization_layoutlmv2.LayoutLMv2Tokenizer.encode_plus
+    def encode_plus(
+        self,
+        text: Union[TextInput, PreTokenizedInput],
+        text_pair: Optional[PreTokenizedInput] = None,
+        boxes: Optional[list[list[int]]] = None,
+        word_labels: Optional[list[int]] = None,
+        add_special_tokens: bool = True,
+        padding: Union[bool, str, PaddingStrategy] = False,
+        truncation: Union[bool, str, TruncationStrategy] = None,
+        max_length: Optional[int] = None,
+        stride: int = 0,
+        pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[str] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        return_token_type_ids: Optional[bool] = None,
+        return_attention_mask: Optional[bool] = None,
+        return_overflowing_tokens: bool = False,
+        return_special_tokens_mask: bool = False,
+        return_offsets_mapping: bool = False,
+        return_length: bool = False,
+        verbose: bool = True,
+        **kwargs,
+    ) -> BatchEncoding:
+        """
+        Tokenize and prepare for the model a sequence or a pair of sequences. .. warning:: This method is deprecated,
+        `__call__` should be used instead.
+
+        Args:
+            text (`str`, `List[str]`, `List[List[str]]`):
+                The first sequence to be encoded. This can be a string, a list of strings or a list of list of strings.
+            text_pair (`List[str]` or `List[int]`, *optional*):
+                Optional second sequence to be encoded. This can be a list of strings (words of a single example) or a
+                list of list of strings (words of a batch of examples).
+        """
+
+        # Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
+        padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
+            padding=padding,
+            truncation=truncation,
+            max_length=max_length,
+            pad_to_multiple_of=pad_to_multiple_of,
+            verbose=verbose,
+            **kwargs,
+        )
+
+        return self._encode_plus(
+            text=text,
+            boxes=boxes,
+            text_pair=text_pair,
+            word_labels=word_labels,
+            add_special_tokens=add_special_tokens,
+            padding_strategy=padding_strategy,
+            truncation_strategy=truncation_strategy,
+            max_length=max_length,
+            stride=stride,
+            pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
+            return_tensors=return_tensors,
+            return_token_type_ids=return_token_type_ids,
+            return_attention_mask=return_attention_mask,
+            return_overflowing_tokens=return_overflowing_tokens,
+            return_special_tokens_mask=return_special_tokens_mask,
+            return_offsets_mapping=return_offsets_mapping,
+            return_length=return_length,
+            verbose=verbose,
+            **kwargs,
+        )
+
+    # Copied from transformers.models.layoutlmv2.tokenization_layoutlmv2.LayoutLMv2Tokenizer._encode_plus
+    def _encode_plus(
+        self,
+        text: Union[TextInput, PreTokenizedInput],
+        text_pair: Optional[PreTokenizedInput] = None,
+        boxes: Optional[list[list[int]]] = None,
+        word_labels: Optional[list[int]] = None,
+        add_special_tokens: bool = True,
+        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
+        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
+        max_length: Optional[int] = None,
+        stride: int = 0,
+        pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[str] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        return_token_type_ids: Optional[bool] = None,
+        return_attention_mask: Optional[bool] = None,
+        return_overflowing_tokens: bool = False,
+        return_special_tokens_mask: bool = False,
+        return_offsets_mapping: bool = False,
+        return_length: bool = False,
+        verbose: bool = True,
+        **kwargs,  # NOTE: accepted but not forwarded to `prepare_for_model` below
+    ) -> BatchEncoding:
+        if return_offsets_mapping:  # unsupported here: fail loudly rather than silently ignore the flag
+            raise NotImplementedError(
+                "return_offset_mapping is not available when using Python tokenizers. "
+                "To use this feature, change your tokenizer to one deriving from "
+                "transformers.PreTrainedTokenizerFast. "
+                "More information on available tokenizers at "
+                "https://github.com/huggingface/transformers/pull/2674"
+            )
+
+        return self.prepare_for_model(  # all encoding work for the single example is delegated
+            text=text,
+            text_pair=text_pair,
+            boxes=boxes,
+            word_labels=word_labels,
+            add_special_tokens=add_special_tokens,
+            padding=padding_strategy.value,  # strategies are passed as their enum values
+            truncation=truncation_strategy.value,
+            max_length=max_length,
+            stride=stride,
+            pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
+            return_tensors=return_tensors,
+            prepend_batch_axis=True,  # ends up as the `prepend_batch_axis` argument of `BatchEncoding`
+            return_attention_mask=return_attention_mask,
+            return_token_type_ids=return_token_type_ids,
+            return_overflowing_tokens=return_overflowing_tokens,
+            return_special_tokens_mask=return_special_tokens_mask,
+            return_length=return_length,
+            verbose=verbose,
+        )
+
+    @add_end_docstrings(LAYOUTLMV3_ENCODE_KWARGS_DOCSTRING, LAYOUTLMV3_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
+    def prepare_for_model(
+        self,
+        text: Union[TextInput, PreTokenizedInput],
+        text_pair: Optional[PreTokenizedInput] = None,
+        boxes: Optional[list[list[int]]] = None,
+        word_labels: Optional[list[int]] = None,
+        add_special_tokens: bool = True,
+        padding: Union[bool, str, PaddingStrategy] = False,
+        truncation: Union[bool, str, TruncationStrategy] = None,
+        max_length: Optional[int] = None,
+        stride: int = 0,
+        pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[str] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        return_token_type_ids: Optional[bool] = None,
+        return_attention_mask: Optional[bool] = None,
+        return_overflowing_tokens: bool = False,
+        return_special_tokens_mask: bool = False,
+        return_offsets_mapping: bool = False,  # NOTE: accepted but never read by this method
+        return_length: bool = False,
+        verbose: bool = True,
+        prepend_batch_axis: bool = False,
+        **kwargs,
+    ) -> BatchEncoding:
+        """
+        Prepares a sequence or a pair of sequences so that it can be used by the model. It adds special tokens,
+        truncates sequences if overflowing while taking into account the special tokens and manages a moving window
+        (with user defined stride) for overflowing tokens. Note that when *text_pair* is not `None` and the truncation
+        strategy is `longest_first`, overflowing tokens cannot be returned: asking for them with that combination
+        raises a `ValueError`.
+
+        Word-level `boxes` are turned into token-level `bbox`. If provided, word-level `word_labels` are turned into
+        token-level `labels`: with `only_label_first_subword` set, the first token of a word gets the word label and
+        the remaining tokens get `pad_token_label`; otherwise every token of the word gets the word label.
+
+        Args:
+            text (`str`, `list[str]`, `list[list[str]]`):
+                The first sequence. When `text_pair` is `None`, its words are paired one-to-one with `boxes`.
+            text_pair (`list[str]` or `list[int]`, *optional*):
+                Optional second sequence. When given, `text` is tokenized as a whole (the question) and the words of
+                `text_pair` are paired one-to-one with `boxes`.
+        """
+
+        # Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
+        padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
+            padding=padding,
+            truncation=truncation,
+            max_length=max_length,
+            pad_to_multiple_of=pad_to_multiple_of,
+            verbose=verbose,
+            **kwargs,
+        )
+
+        tokens = []
+        pair_tokens = []
+        token_boxes = []
+        pair_token_boxes = []
+        labels = []
+
+        if text_pair is None:
+            if word_labels is None:
+                # CASE 1: document image classification (training + inference) + CASE 2: token classification (inference)
+                for word, box in zip(text, boxes):
+                    if len(word) < 1:  # skip empty words
+                        continue
+                    word_tokens = self.tokenize(word)
+                    tokens.extend(word_tokens)
+                    token_boxes.extend([box] * len(word_tokens))  # every sub-token inherits its word's box
+            else:
+                # CASE 2: token classification (training)
+                for word, box, label in zip(text, boxes, word_labels):
+                    if len(word) < 1:  # skip empty words
+                        continue
+                    word_tokens = self.tokenize(word)
+                    tokens.extend(word_tokens)
+                    token_boxes.extend([box] * len(word_tokens))
+                    if self.only_label_first_subword:
+                        # Use the real label id for the first token of the word, and padding ids for the remaining tokens
+                        labels.extend([label] + [self.pad_token_label] * (len(word_tokens) - 1))
+                    else:
+                        labels.extend([label] * len(word_tokens))
+        else:
+            # CASE 3: document visual question answering (inference)
+            # text = question
+            # text_pair = words
+            tokens = self.tokenize(text)
+            token_boxes = [self.pad_token_box] * len(tokens)  # question tokens carry the padding box
+
+            for word, box in zip(text_pair, boxes):
+                if len(word) < 1:  # skip empty words
+                    continue
+                word_tokens = self.tokenize(word)
+                pair_tokens.extend(word_tokens)
+                pair_token_boxes.extend([box] * len(word_tokens))
+
+        # Create ids + pair_ids
+        ids = self.convert_tokens_to_ids(tokens)
+        pair_ids = self.convert_tokens_to_ids(pair_tokens) if pair_tokens else None  # None (not []) without a pair
+
+        if (
+            return_overflowing_tokens
+            and truncation_strategy == TruncationStrategy.LONGEST_FIRST
+            and pair_ids is not None
+        ):
+            raise ValueError(
+                "Not possible to return overflowing tokens for pair of sequences with the "
+                "`longest_first`. Please select another truncation strategy than `longest_first`, "
+                "for instance `only_second` or `only_first`."
+            )
+
+        # Compute the total size of the returned encodings
+        pair = pair_ids is not None
+        len_ids = len(ids)
+        len_pair_ids = len(pair_ids) if pair else 0
+        total_len = len_ids + len_pair_ids + (self.num_special_tokens_to_add(pair=pair) if add_special_tokens else 0)
+
+        # Truncation: Handle max sequence length
+        overflowing_tokens = []
+        overflowing_token_boxes = []
+        overflowing_labels = []
+        if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE and max_length and total_len > max_length:
+            (
+                ids,
+                token_boxes,
+                pair_ids,
+                pair_token_boxes,
+                labels,
+                overflowing_tokens,
+                overflowing_token_boxes,
+                overflowing_labels,
+            ) = self.truncate_sequences(
+                ids,
+                token_boxes,
+                pair_ids=pair_ids,
+                pair_token_boxes=pair_token_boxes,
+                labels=labels,
+                num_tokens_to_remove=total_len - max_length,
+                truncation_strategy=truncation_strategy,
+                stride=stride,
+            )
+
+        if return_token_type_ids and not add_special_tokens:
+            raise ValueError(
+                "Asking to return token_type_ids while setting add_special_tokens to False "
+                "results in an undefined behavior. Please set add_special_tokens to True or "
+                "set return_token_type_ids to None."
+            )
+
+        # Load from model defaults
+        if return_token_type_ids is None:
+            return_token_type_ids = "token_type_ids" in self.model_input_names
+        if return_attention_mask is None:
+            return_attention_mask = "attention_mask" in self.model_input_names
+
+        encoded_inputs = {}
+
+        if return_overflowing_tokens:  # `max_length` can still be None here when no length limit is in effect
+            encoded_inputs["overflowing_tokens"] = overflowing_tokens
+            encoded_inputs["overflowing_token_boxes"] = overflowing_token_boxes
+            encoded_inputs["overflowing_labels"] = overflowing_labels
+            encoded_inputs["num_truncated_tokens"] = (total_len - max_length) if max_length is not None else 0
+
+        # Add special tokens
+        if add_special_tokens:
+            sequence = self.build_inputs_with_special_tokens(ids, pair_ids)
+            token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids)
+            token_boxes = [self.cls_token_box] + token_boxes + [self.sep_token_box]
+            if pair_token_boxes:
+                pair_token_boxes = [self.sep_token_box] + pair_token_boxes + [self.sep_token_box]
+            token_boxes = token_boxes + pair_token_boxes if pair else token_boxes
+            if labels:  # keep labels aligned with the two special tokens wrapped around the first sequence
+                labels = [self.pad_token_label] + labels + [self.pad_token_label]
+        else:
+            sequence = ids + pair_ids if pair else ids
+            token_type_ids = [0] * len(ids) + ([0] * len(pair_ids) if pair else [])
+            token_boxes = token_boxes + pair_token_boxes if pair else token_boxes
+
+        # Build output dictionary
+        encoded_inputs["input_ids"] = sequence
+        encoded_inputs["bbox"] = token_boxes
+        if return_token_type_ids:
+            encoded_inputs["token_type_ids"] = token_type_ids
+        if return_special_tokens_mask:
+            if add_special_tokens:
+                encoded_inputs["special_tokens_mask"] = self.get_special_tokens_mask(ids, pair_ids)
+            else:
+                encoded_inputs["special_tokens_mask"] = [0] * len(sequence)
+
+        if labels:  # the "labels" key is only emitted when word labels were provided
+            encoded_inputs["labels"] = labels
+
+        # Check lengths
+        self._eventual_warn_about_too_long_sequence(encoded_inputs["input_ids"], max_length, verbose)
+
+        # Padding
+        if padding_strategy != PaddingStrategy.DO_NOT_PAD or return_attention_mask:
+            encoded_inputs = self.pad(
+                encoded_inputs,
+                max_length=max_length,
+                padding=padding_strategy.value,
+                pad_to_multiple_of=pad_to_multiple_of,
+                padding_side=padding_side,
+                return_attention_mask=return_attention_mask,
+            )
+
+        if return_length:
+            encoded_inputs["length"] = len(encoded_inputs["input_ids"])  # measured after padding
+
+        batch_outputs = BatchEncoding(
+            encoded_inputs, tensor_type=return_tensors, prepend_batch_axis=prepend_batch_axis
+        )
+
+        return batch_outputs
+
+    # Copied from transformers.models.layoutlmv2.tokenization_layoutlmv2.LayoutLMv2Tokenizer.truncate_sequences
+    def truncate_sequences(
+        self,
+        ids: list[int],
+        token_boxes: list[list[int]],
+        pair_ids: Optional[list[int]] = None,
+        pair_token_boxes: Optional[list[list[int]]] = None,
+        labels: Optional[list[int]] = None,
+        num_tokens_to_remove: int = 0,
+        truncation_strategy: Union[str, TruncationStrategy] = "longest_first",
+        stride: int = 0,
+    ) -> tuple:
+        """
+        Truncates a sequence pair following the strategy and returns the truncated lists (inputs are not mutated).
+
+        Args:
+            ids (`List[int]`):
+                Tokenized input ids of the first sequence. Can be obtained from a string by chaining the `tokenize` and
+                `convert_tokens_to_ids` methods.
+            token_boxes (`List[List[int]]`):
+                Bounding boxes of the first sequence.
+            pair_ids (`List[int]`, *optional*):
+                Tokenized input ids of the second sequence. Can be obtained from a string by chaining the `tokenize`
+                and `convert_tokens_to_ids` methods.
+            pair_token_boxes (`List[List[int]]`, *optional*):
+                Bounding boxes of the second sequence.
+            labels (`List[int]`, *optional*):
+                Labels of the first sequence (for token classification tasks). `None` is returned unchanged.
+            num_tokens_to_remove (`int`, *optional*, defaults to 0):
+                Number of tokens to remove using the truncation strategy.
+            truncation_strategy (`str` or [`~tokenization_utils_base.TruncationStrategy`], *optional*, defaults to `'longest_first'`):
+                The strategy to follow for truncation. Can be:
+
+                - `'longest_first'`: Truncate to a maximum length specified with the argument `max_length` or to the
+                  maximum acceptable input length for the model if that argument is not provided. This will truncate
+                  token by token, removing a token from the longest sequence in the pair if a pair of sequences (or a
+                  batch of pairs) is provided.
+                - `'only_first'`: Truncate to a maximum length specified with the argument `max_length` or to the
+                  maximum acceptable input length for the model if that argument is not provided. This will only
+                  truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
+                - `'only_second'`: Truncate to a maximum length specified with the argument `max_length` or to the
+                  maximum acceptable input length for the model if that argument is not provided. This will only
+                  truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
+                - `'do_not_truncate'`: No truncation (i.e., can output batch with sequence lengths greater
+                  than the model maximum admissible input size).
+            stride (`int`, *optional*, defaults to 0):
+                If set to a positive number, the overflowing tokens returned will contain some tokens from the main
+                sequence returned. The value of this argument defines the number of additional tokens.
+
+        Returns:
+            `tuple`: `(ids, token_boxes, pair_ids, pair_token_boxes, labels, overflowing_tokens,
+            overflowing_token_boxes, overflowing_labels)`. Note: the *longest_first* strategy returns empty
+            overflowing lists if a pair of sequences is provided.
+        """
+        if num_tokens_to_remove <= 0:
+            return ids, token_boxes, pair_ids, pair_token_boxes, labels, [], [], []
+
+        if not isinstance(truncation_strategy, TruncationStrategy):
+            truncation_strategy = TruncationStrategy(truncation_strategy)
+
+        overflowing_tokens = []
+        overflowing_token_boxes = []
+        overflowing_labels = []
+        if truncation_strategy == TruncationStrategy.ONLY_FIRST or (
+            truncation_strategy == TruncationStrategy.LONGEST_FIRST and pair_ids is None
+        ):
+            if len(ids) > num_tokens_to_remove:
+                window_len = min(len(ids), stride + num_tokens_to_remove)  # removed tokens plus `stride` of overlap
+                overflowing_tokens = ids[-window_len:]
+                overflowing_token_boxes = token_boxes[-window_len:]
+                overflowing_labels = labels[-window_len:] if labels is not None else []
+                ids = ids[:-num_tokens_to_remove]
+                token_boxes = token_boxes[:-num_tokens_to_remove]
+                labels = labels[:-num_tokens_to_remove] if labels is not None else None
+            else:
+                error_msg = (
+                    f"We need to remove {num_tokens_to_remove} to truncate the input "
+                    f"but the first sequence has a length {len(ids)}. "
+                )
+                if truncation_strategy == TruncationStrategy.ONLY_FIRST:
+                    error_msg = (
+                        error_msg + "Please select another truncation strategy than "
+                        f"{truncation_strategy}, for instance 'longest_first' or 'only_second'."
+                    )
+                logger.error(error_msg)  # logged only: the inputs are returned untruncated
+        elif truncation_strategy == TruncationStrategy.LONGEST_FIRST:
+            logger.warning(
+                "Be aware, overflowing tokens are not returned for the setting you have chosen,"
+                f" i.e. sequence pairs with the '{TruncationStrategy.LONGEST_FIRST.value}' "
+                "truncation strategy. So the returned list will always be empty even if some "
+                "tokens have been removed."
+            )
+            for _ in range(num_tokens_to_remove):
+                if pair_ids is None or len(ids) > len(pair_ids):
+                    ids = ids[:-1]
+                    token_boxes = token_boxes[:-1]
+                    labels = labels[:-1] if labels is not None else None
+                else:
+                    pair_ids = pair_ids[:-1]
+                    pair_token_boxes = pair_token_boxes[:-1]
+        elif truncation_strategy == TruncationStrategy.ONLY_SECOND and pair_ids is not None:
+            if len(pair_ids) > num_tokens_to_remove:
+                window_len = min(len(pair_ids), stride + num_tokens_to_remove)
+                overflowing_tokens = pair_ids[-window_len:]
+                overflowing_token_boxes = pair_token_boxes[-window_len:]
+                pair_ids = pair_ids[:-num_tokens_to_remove]
+                pair_token_boxes = pair_token_boxes[:-num_tokens_to_remove]
+            else:
+                logger.error(
+                    f"We need to remove {num_tokens_to_remove} to truncate the input "
+                    f"but the second sequence has a length {len(pair_ids)}. "
+                    f"Please select another truncation strategy than {truncation_strategy}, "
+                    "for instance 'longest_first' or 'only_first'."
+                )
+
+        return (
+            ids,
+            token_boxes,
+            pair_ids,
+            pair_token_boxes,
+            labels,
+            overflowing_tokens,
+            overflowing_token_boxes,
+            overflowing_labels,
+        )
+
+    # Copied from transformers.models.layoutlmv2.tokenization_layoutlmv2.LayoutLMv2Tokenizer._pad
+    def _pad(
+        self,
+        encoded_inputs: Union[dict[str, EncodedInput], BatchEncoding],
+        max_length: Optional[int] = None,
+        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
+        pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[str] = None,
+        return_attention_mask: Optional[bool] = None,
+    ) -> dict:
+        """
+        Pad encoded inputs (on left/right, up to a target length), including their `bbox` and `labels` entries.
+
+        Args:
+            encoded_inputs:
+                Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`).
+            max_length: maximum length of the returned list and optionally padding length (see below).
+                This method only pads; it never truncates.
+            padding_strategy: PaddingStrategy to use for padding.
+
+                - PaddingStrategy.LONGEST: Use the current length of the input as the target length
+                - PaddingStrategy.MAX_LENGTH: Pad to `max_length`
+                - PaddingStrategy.DO_NOT_PAD: Do not pad (default)
+                The padding side is `padding_side` if given, otherwise `self.padding_side`:
+
+                    - 'left': pads on the left of the sequences
+                    - 'right': pads on the right of the sequences
+            pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
+                This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
+                `>= 7.5` (Volta).
+            padding_side:
+                The side on which the model should have padding applied. Should be selected between ['right', 'left'].
+                Default value is picked from the class attribute of the same name.
+            return_attention_mask:
+                (optional) Set to False to avoid returning attention mask (default: set to model specifics)
+        """
+        # Load from model defaults
+        if return_attention_mask is None:
+            return_attention_mask = "attention_mask" in self.model_input_names
+
+        required_input = encoded_inputs[self.model_input_names[0]]  # its length drives all padding below
+
+        if padding_strategy == PaddingStrategy.LONGEST:
+            max_length = len(required_input)
+
+        if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
+            max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of  # round up to next multiple
+
+        needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length
+
+        # Initialize attention mask if not present.
+        if return_attention_mask and "attention_mask" not in encoded_inputs:
+            encoded_inputs["attention_mask"] = [1] * len(required_input)
+
+        if needs_to_be_padded:
+            difference = max_length - len(required_input)
+            padding_side = padding_side if padding_side is not None else self.padding_side
+            if padding_side == "right":
+                if return_attention_mask:
+                    encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference
+                if "token_type_ids" in encoded_inputs:
+                    encoded_inputs["token_type_ids"] = (
+                        encoded_inputs["token_type_ids"] + [self.pad_token_type_id] * difference
+                    )
+                if "bbox" in encoded_inputs:
+                    encoded_inputs["bbox"] = encoded_inputs["bbox"] + [self.pad_token_box] * difference
+                if "labels" in encoded_inputs:
+                    encoded_inputs["labels"] = encoded_inputs["labels"] + [self.pad_token_label] * difference
+                if "special_tokens_mask" in encoded_inputs:  # padding positions are flagged with 1
+                    encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference
+                encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference
+            elif padding_side == "left":
+                if return_attention_mask:
+                    encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
+                if "token_type_ids" in encoded_inputs:
+                    encoded_inputs["token_type_ids"] = [self.pad_token_type_id] * difference + encoded_inputs[
+                        "token_type_ids"
+                    ]
+                if "bbox" in encoded_inputs:
+                    encoded_inputs["bbox"] = [self.pad_token_box] * difference + encoded_inputs["bbox"]
+                if "labels" in encoded_inputs:
+                    encoded_inputs["labels"] = [self.pad_token_label] * difference + encoded_inputs["labels"]
+                if "special_tokens_mask" in encoded_inputs:  # padding positions are flagged with 1
+                    encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
+                encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
+            else:
+                raise ValueError("Invalid padding side: " + str(padding_side))  # the side, not the strategy, is bad
+
+        return encoded_inputs
+
+
+__all__ = ["LayoutLMv3Tokenizer"]  # explicit public API: the only name exported by a star-import
diff --git a/phivenv/Lib/site-packages/transformers/models/layoutlmv3/tokenization_layoutlmv3_fast.py b/phivenv/Lib/site-packages/transformers/models/layoutlmv3/tokenization_layoutlmv3_fast.py
new file mode 100644
index 0000000000000000000000000000000000000000..d0407638595d8a127cef02e5bf687b34663e9bfa
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/layoutlmv3/tokenization_layoutlmv3_fast.py
@@ -0,0 +1,848 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Fast tokenization class for LayoutLMv3. It overrides 2 methods of the slow tokenizer class, namely _batch_encode_plus
+and _encode_plus, in which the Rust tokenizer is used.
+"""
+
+import json
+from typing import Optional, Union
+
+from tokenizers import processors
+
+from ...tokenization_utils_base import (
+    BatchEncoding,
+    EncodedInput,
+    PaddingStrategy,
+    PreTokenizedInput,
+    TensorType,
+    TextInput,
+    TextInputPair,
+    TruncationStrategy,
+)
+from ...tokenization_utils_fast import PreTrainedTokenizerFast
+from ...utils import add_end_docstrings, logging
+from .tokenization_layoutlmv3 import (
+    LAYOUTLMV3_ENCODE_KWARGS_DOCSTRING,
+    LAYOUTLMV3_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING,
+    LayoutLMv3Tokenizer,
+)
+
+
+logger = logging.get_logger(__name__)
+
+VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"}
+
+
+class LayoutLMv3TokenizerFast(PreTrainedTokenizerFast):
+    r"""
+    Construct a "fast" LayoutLMv3 tokenizer (backed by HuggingFace's *tokenizers* library). Based on BPE.
+
+    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
+    refer to this superclass for more information regarding those methods.
+
+    Args:
+        vocab_file (`str`):
+            Path to the vocabulary file.
+        merges_file (`str`):
+            Path to the merges file.
+        errors (`str`, *optional*, defaults to `"replace"`):
+            Paradigm to follow when decoding bytes to UTF-8. See
+            [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
+        bos_token (`str`, *optional*, defaults to `"<s>"`):
+            The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token.
+
+            <Tip>
+
+            When building a sequence using special tokens, this is not the token that is used for the beginning of
+            sequence. The token used is the `cls_token`.
+
+            </Tip>
+
+        eos_token (`str`, *optional*, defaults to `"</s>"`):
+            The end of sequence token.
+
+            <Tip>
+
+            When building a sequence using special tokens, this is not the token that is used for the end of sequence.
+            The token used is the `sep_token`.
+
+            </Tip>
+
+        sep_token (`str`, *optional*, defaults to `"</s>"`):
+            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
+            sequence classification or for a text and a question for question answering. It is also used as the last
+            token of a sequence built with special tokens.
+        cls_token (`str`, *optional*, defaults to `"<s>"`):
+            The classifier token which is used when doing sequence classification (classification of the whole sequence
+            instead of per-token classification). It is the first token of the sequence when built with special tokens.
+        unk_token (`str`, *optional*, defaults to `"<unk>"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        pad_token (`str`, *optional*, defaults to `"<pad>"`):
+            The token used for padding, for example when batching sequences of different lengths.
+        mask_token (`str`, *optional*, defaults to `"<mask>"`):
+            The token used for masking values. This is the token used when training this model with masked language
+            modeling. This is the token which the model will try to predict.
+        add_prefix_space (`bool`, *optional*, defaults to `True`):
+            Whether or not to add an initial space to the input. This allows the leading word to be treated just like
+            any other word. (The RoBERTa tokenizer detects the beginning of words by the preceding space.)
+        trim_offsets (`bool`, *optional*, defaults to `True`):
+            Whether the post processing step should trim offsets to avoid including whitespaces.
+        cls_token_box (`list[int]`, *optional*, defaults to `[0, 0, 0, 0]`):
+            The bounding box to use for the special [CLS] token.
+        sep_token_box (`list[int]`, *optional*, defaults to `[0, 0, 0, 0]`):
+            The bounding box to use for the special [SEP] token.
+        pad_token_box (`list[int]`, *optional*, defaults to `[0, 0, 0, 0]`):
+            The bounding box to use for the special [PAD] token.
+        pad_token_label (`int`, *optional*, defaults to -100):
+            The label to use for padding tokens. Defaults to -100, which is the `ignore_index` of PyTorch's
+            CrossEntropyLoss.
+        only_label_first_subword (`bool`, *optional*, defaults to `True`):
+            Whether or not to only label the first subword, in case word labels are provided.
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    model_input_names = ["input_ids", "attention_mask"]
+    slow_tokenizer_class = LayoutLMv3Tokenizer
+
+    def __init__(
+        self,
+        vocab_file=None,
+        merges_file=None,
+        tokenizer_file=None,
+        errors="replace",
+        bos_token="<s>",
+        eos_token="</s>",
+        sep_token="</s>",
+        cls_token="<s>",
+        unk_token="<unk>",
+        pad_token="<pad>",
+        mask_token="<mask>",
+        add_prefix_space=True,
+        trim_offsets=True,
+        cls_token_box=[0, 0, 0, 0],
+        sep_token_box=[0, 0, 0, 0],
+        pad_token_box=[0, 0, 0, 0],
+        pad_token_label=-100,
+        only_label_first_subword=True,
+        **kwargs,
+    ):
+        super().__init__(
+            vocab_file,
+            merges_file,
+            tokenizer_file=tokenizer_file,
+            errors=errors,
+            bos_token=bos_token,
+            eos_token=eos_token,
+            sep_token=sep_token,
+            cls_token=cls_token,
+            unk_token=unk_token,
+            pad_token=pad_token,
+            mask_token=mask_token,
+            add_prefix_space=add_prefix_space,
+            trim_offsets=trim_offsets,
+            cls_token_box=cls_token_box,
+            sep_token_box=sep_token_box,
+            pad_token_box=pad_token_box,
+            pad_token_label=pad_token_label,
+            only_label_first_subword=only_label_first_subword,
+            **kwargs,
+        )
+
+        tokenizer_component = "post_processor"
+        tokenizer_component_instance = getattr(self.backend_tokenizer, tokenizer_component, None)
+        if tokenizer_component_instance:
+            state = json.loads(tokenizer_component_instance.__getstate__())
+
+            # The lists 'sep' and 'cls' must be cast to tuples before being passed to `component_class`
+            if "sep" in state:
+                state["sep"] = tuple(state["sep"])
+            if "cls" in state:
+                state["cls"] = tuple(state["cls"])
+
+            changes_to_apply = False
+
+            if state.get("add_prefix_space", add_prefix_space) != add_prefix_space:
+                state["add_prefix_space"] = add_prefix_space
+                changes_to_apply = True
+
+            if state.get("trim_offsets", trim_offsets) != trim_offsets:
+                state["trim_offsets"] = trim_offsets
+                changes_to_apply = True
+
+            if changes_to_apply:
+                component_class = getattr(processors, state.pop("type"))
+                new_value = component_class(**state)
+                setattr(self.backend_tokenizer, tokenizer_component, new_value)
+
+        # additional properties
+        self.cls_token_box = cls_token_box
+        self.sep_token_box = sep_token_box
+        self.pad_token_box = pad_token_box
+        self.pad_token_label = pad_token_label
+        self.only_label_first_subword = only_label_first_subword
+
+    @add_end_docstrings(LAYOUTLMV3_ENCODE_KWARGS_DOCSTRING, LAYOUTLMV3_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
+    # Copied from transformers.models.layoutlmv2.tokenization_layoutlmv2_fast.LayoutLMv2TokenizerFast.__call__
+    def __call__(
+        self,
+        text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]],
+        text_pair: Optional[Union[PreTokenizedInput, list[PreTokenizedInput]]] = None,
+        boxes: Optional[Union[list[list[int]], list[list[list[int]]]]] = None,
+        word_labels: Optional[Union[list[int], list[list[int]]]] = None,
+        add_special_tokens: bool = True,
+        padding: Union[bool, str, PaddingStrategy] = False,
+        truncation: Union[bool, str, TruncationStrategy] = None,
+        max_length: Optional[int] = None,
+        stride: int = 0,
+        pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[str] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        return_token_type_ids: Optional[bool] = None,
+        return_attention_mask: Optional[bool] = None,
+        return_overflowing_tokens: bool = False,
+        return_special_tokens_mask: bool = False,
+        return_offsets_mapping: bool = False,
+        return_length: bool = False,
+        verbose: bool = True,
+        **kwargs,
+    ) -> BatchEncoding:
+        """
+        Main method to tokenize and prepare for the model one or several sequence(s) or one or several pair(s) of
+        sequences with word-level normalized bounding boxes and optional labels.
+
+        Args:
+            text (`str`, `List[str]`, `List[List[str]]`):
+                The sequence or batch of sequences to be encoded. Each sequence can be a string, a list of strings
+                (words of a single example or questions of a batch of examples) or a list of list of strings (batch of
+                words).
+            text_pair (`List[str]`, `List[List[str]]`):
+                The sequence or batch of sequences to be encoded. Each sequence should be a list of strings
+                (pretokenized string).
+            boxes (`List[List[int]]`, `List[List[List[int]]]`):
+                Word-level bounding boxes. Each bounding box should be normalized to be on a 0-1000 scale.
+            word_labels (`List[int]`, `List[List[int]]`, *optional*):
+                Word-level integer labels (for token classification tasks such as FUNSD, CORD).
+        """
+
+        # Input type checking for clearer error
+        def _is_valid_text_input(t):
+            if isinstance(t, str):
+                # Strings are fine
+                return True
+            elif isinstance(t, (list, tuple)):
+                # Lists are fine as long as they are...
+                if len(t) == 0:
+                    # ... empty
+                    return True
+                elif isinstance(t[0], str):
+                    # ... list of strings
+                    return True
+                elif isinstance(t[0], (list, tuple)):
+                    # ... list with an empty list or with a list of strings
+                    return len(t[0]) == 0 or isinstance(t[0][0], str)
+                else:
+                    return False
+            else:
+                return False
+
+        if text_pair is not None:
+            # in case text + text_pair are provided, text = questions, text_pair = words
+            if not _is_valid_text_input(text):
+                raise ValueError("text input must of type `str` (single example) or `List[str]` (batch of examples). ")
+            if not isinstance(text_pair, (list, tuple)):
+                raise ValueError(
+                    "Words must be of type `List[str]` (single pretokenized example), "
+                    "or `List[List[str]]` (batch of pretokenized examples)."
+                )
+        else:
+            # in case only text is provided => must be words
+            if not isinstance(text, (list, tuple)):
+                raise ValueError(
+                    "Words must be of type `List[str]` (single pretokenized example), "
+                    "or `List[List[str]]` (batch of pretokenized examples)."
+                )
+
+        if text_pair is not None:
+            is_batched = isinstance(text, (list, tuple))
+        else:
+            is_batched = isinstance(text, (list, tuple)) and text and isinstance(text[0], (list, tuple))
+
+        words = text if text_pair is None else text_pair
+        if boxes is None:
+            raise ValueError("You must provide corresponding bounding boxes")
+        if is_batched:
+            if len(words) != len(boxes):
+                raise ValueError("You must provide words and boxes for an equal amount of examples")
+            for words_example, boxes_example in zip(words, boxes):
+                if len(words_example) != len(boxes_example):
+                    raise ValueError("You must provide as many words as there are bounding boxes")
+        else:
+            if len(words) != len(boxes):
+                raise ValueError("You must provide as many words as there are bounding boxes")
+
+        if is_batched:
+            if text_pair is not None and len(text) != len(text_pair):
+                raise ValueError(
+                    f"batch length of `text`: {len(text)} does not match batch length of `text_pair`:"
+                    f" {len(text_pair)}."
+                )
+            batch_text_or_text_pairs = list(zip(text, text_pair)) if text_pair is not None else text
+            is_pair = bool(text_pair is not None)
+            return self.batch_encode_plus(
+                batch_text_or_text_pairs=batch_text_or_text_pairs,
+                is_pair=is_pair,
+                boxes=boxes,
+                word_labels=word_labels,
+                add_special_tokens=add_special_tokens,
+                padding=padding,
+                truncation=truncation,
+                max_length=max_length,
+                stride=stride,
+                pad_to_multiple_of=pad_to_multiple_of,
+                padding_side=padding_side,
+                return_tensors=return_tensors,
+                return_token_type_ids=return_token_type_ids,
+                return_attention_mask=return_attention_mask,
+                return_overflowing_tokens=return_overflowing_tokens,
+                return_special_tokens_mask=return_special_tokens_mask,
+                return_offsets_mapping=return_offsets_mapping,
+                return_length=return_length,
+                verbose=verbose,
+                **kwargs,
+            )
+        else:
+            return self.encode_plus(
+                text=text,
+                text_pair=text_pair,
+                boxes=boxes,
+                word_labels=word_labels,
+                add_special_tokens=add_special_tokens,
+                padding=padding,
+                truncation=truncation,
+                max_length=max_length,
+                stride=stride,
+                pad_to_multiple_of=pad_to_multiple_of,
+                padding_side=padding_side,
+                return_tensors=return_tensors,
+                return_token_type_ids=return_token_type_ids,
+                return_attention_mask=return_attention_mask,
+                return_overflowing_tokens=return_overflowing_tokens,
+                return_special_tokens_mask=return_special_tokens_mask,
+                return_offsets_mapping=return_offsets_mapping,
+                return_length=return_length,
+                verbose=verbose,
+                **kwargs,
+            )
+
+    @add_end_docstrings(LAYOUTLMV3_ENCODE_KWARGS_DOCSTRING, LAYOUTLMV3_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
+    # Copied from transformers.models.layoutlmv2.tokenization_layoutlmv2_fast.LayoutLMv2TokenizerFast.batch_encode_plus
+    def batch_encode_plus(
+        self,
+        batch_text_or_text_pairs: Union[
+            list[TextInput],
+            list[TextInputPair],
+            list[PreTokenizedInput],
+        ],
+        is_pair: Optional[bool] = None,
+        boxes: Optional[list[list[list[int]]]] = None,
+        word_labels: Optional[Union[list[int], list[list[int]]]] = None,
+        add_special_tokens: bool = True,
+        padding: Union[bool, str, PaddingStrategy] = False,
+        truncation: Union[bool, str, TruncationStrategy] = None,
+        max_length: Optional[int] = None,
+        stride: int = 0,
+        pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[str] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        return_token_type_ids: Optional[bool] = None,
+        return_attention_mask: Optional[bool] = None,
+        return_overflowing_tokens: bool = False,
+        return_special_tokens_mask: bool = False,
+        return_offsets_mapping: bool = False,
+        return_length: bool = False,
+        verbose: bool = True,
+        **kwargs,
+    ) -> BatchEncoding:
+        # Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
+        padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
+            padding=padding,
+            truncation=truncation,
+            max_length=max_length,
+            pad_to_multiple_of=pad_to_multiple_of,
+            verbose=verbose,
+            **kwargs,
+        )
+
+        return self._batch_encode_plus(
+            batch_text_or_text_pairs=batch_text_or_text_pairs,
+            is_pair=is_pair,
+            boxes=boxes,
+            word_labels=word_labels,
+            add_special_tokens=add_special_tokens,
+            padding_strategy=padding_strategy,
+            truncation_strategy=truncation_strategy,
+            max_length=max_length,
+            stride=stride,
+            pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
+            return_tensors=return_tensors,
+            return_token_type_ids=return_token_type_ids,
+            return_attention_mask=return_attention_mask,
+            return_overflowing_tokens=return_overflowing_tokens,
+            return_special_tokens_mask=return_special_tokens_mask,
+            return_offsets_mapping=return_offsets_mapping,
+            return_length=return_length,
+            verbose=verbose,
+            **kwargs,
+        )
+
+    # Copied from transformers.models.layoutlmv2.tokenization_layoutlmv2_fast.LayoutLMv2TokenizerFast.tokenize
+    def tokenize(self, text: str, pair: Optional[str] = None, add_special_tokens: bool = False, **kwargs) -> list[str]:
+        batched_input = [(text, pair)] if pair else [text]
+        encodings = self._tokenizer.encode_batch(
+            batched_input, add_special_tokens=add_special_tokens, is_pretokenized=False, **kwargs
+        )
+
+        return encodings[0].tokens
+
+    @add_end_docstrings(LAYOUTLMV3_ENCODE_KWARGS_DOCSTRING, LAYOUTLMV3_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
+    # Copied from transformers.models.layoutlmv2.tokenization_layoutlmv2_fast.LayoutLMv2TokenizerFast.encode_plus
+    def encode_plus(
+        self,
+        text: Union[TextInput, PreTokenizedInput],
+        text_pair: Optional[PreTokenizedInput] = None,
+        boxes: Optional[list[list[int]]] = None,
+        word_labels: Optional[list[int]] = None,
+        add_special_tokens: bool = True,
+        padding: Union[bool, str, PaddingStrategy] = False,
+        truncation: Union[bool, str, TruncationStrategy] = None,
+        max_length: Optional[int] = None,
+        stride: int = 0,
+        pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[str] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        return_token_type_ids: Optional[bool] = None,
+        return_attention_mask: Optional[bool] = None,
+        return_overflowing_tokens: bool = False,
+        return_special_tokens_mask: bool = False,
+        return_offsets_mapping: bool = False,
+        return_length: bool = False,
+        verbose: bool = True,
+        **kwargs,
+    ) -> BatchEncoding:
+        """
+        Tokenize and prepare a sequence or a pair of sequences for the model. Warning: this method is deprecated;
+        `__call__` should be used instead.
+
+        Args:
+            text (`str`, `List[str]`, `List[List[str]]`):
+                The first sequence to be encoded. This can be a string, a list of strings or a list of list of strings.
+            text_pair (`List[str]` or `List[int]`, *optional*):
+                Optional second sequence to be encoded. This can be a list of strings (words of a single example) or a
+                list of list of strings (words of a batch of examples).
+        """
+
+        # Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
+        padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
+            padding=padding,
+            truncation=truncation,
+            max_length=max_length,
+            pad_to_multiple_of=pad_to_multiple_of,
+            verbose=verbose,
+            **kwargs,
+        )
+
+        return self._encode_plus(
+            text=text,
+            boxes=boxes,
+            text_pair=text_pair,
+            word_labels=word_labels,
+            add_special_tokens=add_special_tokens,
+            padding_strategy=padding_strategy,
+            truncation_strategy=truncation_strategy,
+            max_length=max_length,
+            stride=stride,
+            pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
+            return_tensors=return_tensors,
+            return_token_type_ids=return_token_type_ids,
+            return_attention_mask=return_attention_mask,
+            return_overflowing_tokens=return_overflowing_tokens,
+            return_special_tokens_mask=return_special_tokens_mask,
+            return_offsets_mapping=return_offsets_mapping,
+            return_length=return_length,
+            verbose=verbose,
+            **kwargs,
+        )
+
+    def _batch_encode_plus(
+        self,
+        batch_text_or_text_pairs: Union[
+            list[TextInput],
+            list[TextInputPair],
+            list[PreTokenizedInput],
+        ],
+        is_pair: Optional[bool] = None,
+        boxes: Optional[list[list[list[int]]]] = None,
+        word_labels: Optional[list[list[int]]] = None,
+        add_special_tokens: bool = True,
+        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
+        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
+        max_length: Optional[int] = None,
+        stride: int = 0,
+        pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[str] = None,
+        return_tensors: Optional[str] = None,
+        return_token_type_ids: Optional[bool] = None,
+        return_attention_mask: Optional[bool] = None,
+        return_overflowing_tokens: bool = False,
+        return_special_tokens_mask: bool = False,
+        return_offsets_mapping: bool = False,
+        return_length: bool = False,
+        verbose: bool = True,
+    ) -> BatchEncoding:
+        if not isinstance(batch_text_or_text_pairs, list):
+            raise TypeError(f"batch_text_or_text_pairs has to be a list (got {type(batch_text_or_text_pairs)})")
+
+        # Set the truncation and padding strategy and restore the initial configuration
+        self.set_truncation_and_padding(
+            padding_strategy=padding_strategy,
+            truncation_strategy=truncation_strategy,
+            max_length=max_length,
+            stride=stride,
+            pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
+        )
+
+        if is_pair:
+            batch_text_or_text_pairs = [(text.split(), text_pair) for text, text_pair in batch_text_or_text_pairs]
+
+        encodings = self._tokenizer.encode_batch(
+            batch_text_or_text_pairs,
+            add_special_tokens=add_special_tokens,
+            is_pretokenized=True,  # we set this to True as LayoutLMv3 always expects pretokenized inputs
+        )
+
+        # Convert encoding to dict
+        # `Tokens` has type: tuple[
+        #                       list[dict[str, list[list[int]]]] or list[dict[str, 2D-Tensor]],
+        #                       list[EncodingFast]
+        #                    ]
+        # with nested dimensions corresponding to batch, overflows, sequence length
+        tokens_and_encodings = [
+            self._convert_encoding(
+                encoding=encoding,
+                return_token_type_ids=return_token_type_ids,
+                return_attention_mask=return_attention_mask,
+                return_overflowing_tokens=return_overflowing_tokens,
+                return_special_tokens_mask=return_special_tokens_mask,
+                return_offsets_mapping=True
+                if word_labels is not None
+                else return_offsets_mapping,  # we use offsets to create the labels
+                return_length=return_length,
+                verbose=verbose,
+            )
+            for encoding in encodings
+        ]
+
+        # Convert the output to have dict[list] from list[dict] and remove the additional overflows dimension
+        # From (variable) shape (batch, overflows, sequence length) to ~ (batch * overflows, sequence length)
+        # (we say ~ because the number of overflow varies with the example in the batch)
+        #
+        # To match each overflowing sample with the original sample in the batch
+        # we add an overflow_to_sample_mapping array (see below)
+        sanitized_tokens = {}
+        for key in tokens_and_encodings[0][0]:
+            stack = [e for item, _ in tokens_and_encodings for e in item[key]]
+            sanitized_tokens[key] = stack
+        sanitized_encodings = [e for _, item in tokens_and_encodings for e in item]
+
+        # If returning overflowing tokens, we need to return a mapping
+        # from the batch idx to the original sample
+        if return_overflowing_tokens:
+            overflow_to_sample_mapping = []
+            for i, (toks, _) in enumerate(tokens_and_encodings):
+                overflow_to_sample_mapping += [i] * len(toks["input_ids"])
+            sanitized_tokens["overflow_to_sample_mapping"] = overflow_to_sample_mapping
+
+        for input_ids in sanitized_tokens["input_ids"]:
+            self._eventual_warn_about_too_long_sequence(input_ids, max_length, verbose)
+
+        # create the token boxes
+        token_boxes = []
+        for batch_index in range(len(sanitized_tokens["input_ids"])):
+            if return_overflowing_tokens:
+                original_index = sanitized_tokens["overflow_to_sample_mapping"][batch_index]
+            else:
+                original_index = batch_index
+            token_boxes_example = []
+            for id, sequence_id, word_id in zip(
+                sanitized_tokens["input_ids"][batch_index],
+                sanitized_encodings[batch_index].sequence_ids,
+                sanitized_encodings[batch_index].word_ids,
+            ):
+                if word_id is not None:
+                    if is_pair and sequence_id == 0:
+                        token_boxes_example.append(self.pad_token_box)
+                    else:
+                        token_boxes_example.append(boxes[original_index][word_id])
+                else:
+                    if id == self.cls_token_id:
+                        token_boxes_example.append(self.cls_token_box)
+                    elif id == self.sep_token_id:
+                        token_boxes_example.append(self.sep_token_box)
+                    elif id == self.pad_token_id:
+                        token_boxes_example.append(self.pad_token_box)
+                    else:
+                        raise ValueError("Id not recognized")
+            token_boxes.append(token_boxes_example)
+
+        sanitized_tokens["bbox"] = token_boxes
+
+        # optionally, create the labels
+        if word_labels is not None:
+            labels = []
+            for batch_index in range(len(sanitized_tokens["input_ids"])):
+                if return_overflowing_tokens:
+                    original_index = sanitized_tokens["overflow_to_sample_mapping"][batch_index]
+                else:
+                    original_index = batch_index
+                labels_example = []
+                previous_token_empty = False
+                for id, offset, word_id in zip(
+                    sanitized_tokens["input_ids"][batch_index],
+                    sanitized_tokens["offset_mapping"][batch_index],
+                    sanitized_encodings[batch_index].word_ids,
+                ):
+                    if word_id is not None:
+                        if self.only_label_first_subword:
+                            if offset[0] == 0 and not previous_token_empty:
+                                # Use the real label id for the first token of the word, and padding ids for the remaining tokens
+                                labels_example.append(word_labels[original_index][word_id])
+                            else:
+                                labels_example.append(self.pad_token_label)
+                            if offset == (0, 0):
+                                previous_token_empty = True
+                            else:
+                                previous_token_empty = False
+                        else:
+                            labels_example.append(word_labels[original_index][word_id])
+                    else:
+                        labels_example.append(self.pad_token_label)
+                labels.append(labels_example)
+
+            sanitized_tokens["labels"] = labels
+            # finally, remove offsets if the user didn't want them
+            if not return_offsets_mapping:
+                del sanitized_tokens["offset_mapping"]
+
+        return BatchEncoding(sanitized_tokens, sanitized_encodings, tensor_type=return_tensors)
+
+    # Copied from transformers.models.layoutlmv2.tokenization_layoutlmv2_fast.LayoutLMv2TokenizerFast._encode_plus
+    def _encode_plus(
+        self,
+        text: Union[TextInput, PreTokenizedInput],
+        text_pair: Optional[PreTokenizedInput] = None,
+        boxes: Optional[list[list[int]]] = None,
+        word_labels: Optional[list[int]] = None,
+        add_special_tokens: bool = True,
+        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
+        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
+        max_length: Optional[int] = None,
+        stride: int = 0,
+        pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[str] = None,
+        return_tensors: Optional[bool] = None,
+        return_token_type_ids: Optional[bool] = None,
+        return_attention_mask: Optional[bool] = None,
+        return_overflowing_tokens: bool = False,
+        return_special_tokens_mask: bool = False,
+        return_offsets_mapping: bool = False,
+        return_length: bool = False,
+        verbose: bool = True,
+        **kwargs,
+    ) -> BatchEncoding:
+        # make it a batched input
+        # 2 options:
+        # 1) only text, in which case text must be a list of str
+        # 2) text + text_pair, in which case text = str and text_pair a list of str
+        batched_input = [(text, text_pair)] if text_pair else [text]
+        batched_boxes = [boxes]
+        batched_word_labels = [word_labels] if word_labels is not None else None
+        batched_output = self._batch_encode_plus(
+            batched_input,
+            is_pair=bool(text_pair is not None),
+            boxes=batched_boxes,
+            word_labels=batched_word_labels,
+            add_special_tokens=add_special_tokens,
+            padding_strategy=padding_strategy,
+            truncation_strategy=truncation_strategy,
+            max_length=max_length,
+            stride=stride,
+            pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
+            return_tensors=return_tensors,
+            return_token_type_ids=return_token_type_ids,
+            return_attention_mask=return_attention_mask,
+            return_overflowing_tokens=return_overflowing_tokens,
+            return_special_tokens_mask=return_special_tokens_mask,
+            return_offsets_mapping=return_offsets_mapping,
+            return_length=return_length,
+            verbose=verbose,
+            **kwargs,
+        )
+
+        # Return tensor is None, then we can remove the leading batch axis
+        # Overflowing tokens are returned as a batch of output so we keep them in this case
+        if return_tensors is None and not return_overflowing_tokens:
+            batched_output = BatchEncoding(
+                {
+                    key: value[0] if len(value) > 0 and isinstance(value[0], list) else value
+                    for key, value in batched_output.items()
+                },
+                batched_output.encodings,
+            )
+
+        self._eventual_warn_about_too_long_sequence(batched_output["input_ids"], max_length, verbose)
+
+        return batched_output
+
+    # Copied from transformers.models.layoutlmv2.tokenization_layoutlmv2_fast.LayoutLMv2TokenizerFast._pad
+    def _pad(
+        self,
+        encoded_inputs: Union[dict[str, EncodedInput], BatchEncoding],
+        max_length: Optional[int] = None,
+        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
+        pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[str] = None,
+        return_attention_mask: Optional[bool] = None,
+    ) -> dict:
+        """
+        Pad encoded inputs (on left/right and up to predefined length or max length in the batch)
+
+        Besides the main model input, the `attention_mask`, `token_type_ids`, `bbox`, `labels` and
+        `special_tokens_mask` entries are padded as well whenever they are present in `encoded_inputs`.
+
+        Args:
+            encoded_inputs:
+                Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`).
+                It is modified in place and also returned.
+            max_length: maximum length of the returned list and optionally padding length (see below).
+                Ignored (recomputed from the input) when `padding_strategy` is `PaddingStrategy.LONGEST`.
+            padding_strategy: PaddingStrategy to use for padding.
+
+                - PaddingStrategy.LONGEST: Pad to the longest sequence in the batch
+                - PaddingStrategy.MAX_LENGTH: Pad to the max length
+                - PaddingStrategy.DO_NOT_PAD: Do not pad (default)
+            pad_to_multiple_of: (optional) Integer if set will round `max_length` up to the next multiple of the
+                provided value before padding.
+            padding_side:
+                The side on which the model should have padding applied. Should be selected between ['right', 'left'].
+                Default value is picked from the class attribute of the same name.
+            return_attention_mask:
+                (optional) Set to False to avoid returning attention mask (default: set to model specifics)
+
+        Returns:
+            `dict`: The same `encoded_inputs` object, padded where required.
+
+        Raises:
+            ValueError: If padding is needed and the resolved padding side is neither `'right'` nor `'left'`.
+        """
+        # Load from model defaults
+        if return_attention_mask is None:
+            return_attention_mask = "attention_mask" in self.model_input_names
+
+        required_input = encoded_inputs[self.model_input_names[0]]
+
+        if padding_strategy == PaddingStrategy.LONGEST:
+            max_length = len(required_input)
+
+        # Round the target length up to the next multiple when it is not already aligned.
+        if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
+            max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
+
+        needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length
+
+        # Initialize attention mask if not present.
+        if return_attention_mask and "attention_mask" not in encoded_inputs:
+            encoded_inputs["attention_mask"] = [1] * len(required_input)
+
+        if not needs_to_be_padded:
+            return encoded_inputs
+
+        difference = max_length - len(required_input)
+        padding_side = padding_side if padding_side is not None else self.padding_side
+        if padding_side not in ("right", "left"):
+            # The offending value is the padding *side*, so name it correctly in the message.
+            raise ValueError("Invalid padding side: " + str(padding_side))
+        pad_on_right = padding_side == "right"
+
+        def _padded(values, fill_value):
+            # Attach `difference` copies of `fill_value` on the configured side of `values`.
+            filler = [fill_value] * difference
+            return values + filler if pad_on_right else filler + values
+
+        if return_attention_mask:
+            encoded_inputs["attention_mask"] = _padded(encoded_inputs["attention_mask"], 0)
+        if "token_type_ids" in encoded_inputs:
+            encoded_inputs["token_type_ids"] = _padded(encoded_inputs["token_type_ids"], self.pad_token_type_id)
+        if "bbox" in encoded_inputs:
+            encoded_inputs["bbox"] = _padded(encoded_inputs["bbox"], self.pad_token_box)
+        if "labels" in encoded_inputs:
+            encoded_inputs["labels"] = _padded(encoded_inputs["labels"], self.pad_token_label)
+        if "special_tokens_mask" in encoded_inputs:
+            # Padding positions are flagged with 1, like special tokens.
+            encoded_inputs["special_tokens_mask"] = _padded(encoded_inputs["special_tokens_mask"], 1)
+        encoded_inputs[self.model_input_names[0]] = _padded(required_input, self.pad_token_id)
+
+        return encoded_inputs
+
+    # Copied from transformers.models.layoutlmv2.tokenization_layoutlmv2_fast.LayoutLMv2TokenizerFast.save_vocabulary
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
+        """
+        Ask the backend tokenizer model to save itself into `save_directory`.
+
+        `filename_prefix` is passed to the backend model's `save` as `name`. Whatever `save` returns is converted
+        to a tuple and handed back to the caller.
+        """
+        backend_model = self._tokenizer.model
+        return tuple(backend_model.save(save_directory, name=filename_prefix))
+
+    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
+        """
+        Surround one or two lists of token ids with the BOS/EOS special token ids.
+
+        - single sequence: `[bos] + token_ids_0 + [eos]`
+        - pair of sequences: `[bos] + token_ids_0 + [eos, eos] + token_ids_1 + [eos]`
+        """
+        eos = [self.eos_token_id]
+        first_segment = [self.bos_token_id] + token_ids_0 + eos
+        if token_ids_1 is not None:
+            return first_segment + eos + token_ids_1 + eos
+        return first_segment
+
+    def create_token_type_ids_from_sequences(
+        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
+    ) -> list[int]:
+        """
+        Create the token type ids for a sequence or a pair of sequences. Token type ids are not differentiated
+        here, so the result is a list of zeros as long as the sequence(s) with special tokens added.
+
+        Args:
+            token_ids_0 (`list[int]`):
+                List of IDs.
+            token_ids_1 (`list[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            `list[int]`: List of zeros.
+        """
+        cls_part = [self.cls_token_id]
+        sep_part = [self.sep_token_id]
+
+        # Lay out the full sequence with its special tokens; only its length is used.
+        layout = cls_part + token_ids_0 + sep_part
+        if token_ids_1 is not None:
+            layout = layout + sep_part + token_ids_1 + sep_part
+        return [0] * len(layout)
+
+
+# Names exported by a star-import of this module.
+__all__ = ["LayoutLMv3TokenizerFast"]
diff --git a/phivenv/Lib/site-packages/transformers/models/layoutxlm/__init__.py b/phivenv/Lib/site-packages/transformers/models/layoutxlm/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..6f5a662e6afd2c83768e9731d896da83b0b89193
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/layoutxlm/__init__.py
@@ -0,0 +1,28 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+# Static type checkers take this branch and see the public names of every submodule directly.
+if TYPE_CHECKING:
+    from .processing_layoutxlm import *
+    from .tokenization_layoutxlm import *
+    from .tokenization_layoutxlm_fast import *
+else:
+    import sys
+
+    # At runtime, replace this package's entry in `sys.modules` with a `_LazyModule` built from the import
+    # structure that `define_import_structure` derives from this file's path.
+    # NOTE(review): presumably this defers importing the submodules until first attribute access; confirm
+    # against `_LazyModule`.
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/phivenv/Lib/site-packages/transformers/models/layoutxlm/__pycache__/__init__.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/layoutxlm/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..83307598f2c965f0ca7bf0d3b203df90144fc6a4
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/layoutxlm/__pycache__/__init__.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/layoutxlm/__pycache__/processing_layoutxlm.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/layoutxlm/__pycache__/processing_layoutxlm.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7ac99358af591dc50ddc6de52e5524292bd441b0
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/layoutxlm/__pycache__/processing_layoutxlm.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/layoutxlm/__pycache__/tokenization_layoutxlm.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/layoutxlm/__pycache__/tokenization_layoutxlm.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3103b532c69b442d569a9e1e0655fbec2ff2658d
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/layoutxlm/__pycache__/tokenization_layoutxlm.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/layoutxlm/__pycache__/tokenization_layoutxlm_fast.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/layoutxlm/__pycache__/tokenization_layoutxlm_fast.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c3f60365e0bc42510ea8800c700a6eed1860917d
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/layoutxlm/__pycache__/tokenization_layoutxlm_fast.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/layoutxlm/processing_layoutxlm.py b/phivenv/Lib/site-packages/transformers/models/layoutxlm/processing_layoutxlm.py
new file mode 100644
index 0000000000000000000000000000000000000000..c3d9189397eb986a999dbb65b52a910d34bcc8c8
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/layoutxlm/processing_layoutxlm.py
@@ -0,0 +1,189 @@
+# coding=utf-8
+# Copyright 2021 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Processor class for LayoutXLM.
+"""
+
+import warnings
+from typing import Optional, Union
+
+from ...processing_utils import ProcessorMixin
+from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy
+from ...utils import TensorType
+
+
+class LayoutXLMProcessor(ProcessorMixin):
+    r"""
+    Constructs a LayoutXLM processor which combines a LayoutXLM image processor and a LayoutXLM tokenizer into a single
+    processor.
+
+    [`LayoutXLMProcessor`] offers all the functionalities you need to prepare data for the model.
+
+    It first uses [`LayoutLMv2ImageProcessor`] to resize document images to a fixed size, and optionally applies OCR to
+    get words and normalized bounding boxes. These are then provided to [`LayoutXLMTokenizer`] or
+    [`LayoutXLMTokenizerFast`], which turns the words and bounding boxes into token-level `input_ids`,
+    `attention_mask`, `token_type_ids`, `bbox`. Optionally, one can provide integer `word_labels`, which are turned
+    into token-level `labels` for token classification tasks (such as FUNSD, CORD).
+
+    Args:
+        image_processor (`LayoutLMv2ImageProcessor`, *optional*):
+            An instance of [`LayoutLMv2ImageProcessor`]. The image processor is a required input.
+        tokenizer (`LayoutXLMTokenizer` or `LayoutXLMTokenizerFast`, *optional*):
+            An instance of [`LayoutXLMTokenizer`] or [`LayoutXLMTokenizerFast`]. The tokenizer is a required input.
+    """
+
+    attributes = ["image_processor", "tokenizer"]
+    image_processor_class = "LayoutLMv2ImageProcessor"
+    tokenizer_class = ("LayoutXLMTokenizer", "LayoutXLMTokenizerFast")
+
+    def __init__(self, image_processor=None, tokenizer=None, **kwargs):
+        """
+        Store the image processor and tokenizer, accepting the deprecated `feature_extractor` keyword as a
+        fallback for `image_processor`.
+
+        Raises:
+            ValueError: If no image processor (nor deprecated feature extractor) or no tokenizer is given.
+        """
+        # Bind the name up front so that omitting both `image_processor` and `feature_extractor` reaches the
+        # explicit ValueError below instead of failing with an UnboundLocalError.
+        feature_extractor = None
+        if "feature_extractor" in kwargs:
+            warnings.warn(
+                "The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`"
+                " instead.",
+                FutureWarning,
+            )
+            feature_extractor = kwargs.pop("feature_extractor")
+
+        image_processor = image_processor if image_processor is not None else feature_extractor
+        if image_processor is None:
+            raise ValueError("You need to specify an `image_processor`.")
+        if tokenizer is None:
+            raise ValueError("You need to specify a `tokenizer`.")
+
+        super().__init__(image_processor, tokenizer)
+
+    def __call__(
+        self,
+        images,
+        text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
+        text_pair: Optional[Union[PreTokenizedInput, list[PreTokenizedInput]]] = None,
+        boxes: Optional[Union[list[list[int]], list[list[list[int]]]]] = None,
+        word_labels: Optional[Union[list[int], list[list[int]]]] = None,
+        add_special_tokens: bool = True,
+        padding: Union[bool, str, PaddingStrategy] = False,
+        truncation: Union[bool, str, TruncationStrategy] = None,
+        max_length: Optional[int] = None,
+        stride: int = 0,
+        pad_to_multiple_of: Optional[int] = None,
+        return_token_type_ids: Optional[bool] = None,
+        return_attention_mask: Optional[bool] = None,
+        return_overflowing_tokens: bool = False,
+        return_special_tokens_mask: bool = False,
+        return_offsets_mapping: bool = False,
+        return_length: bool = False,
+        verbose: bool = True,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        **kwargs,
+    ) -> BatchEncoding:
+        """
+        This method first forwards the `images` argument to [`~LayoutLMv2ImageProcessor.__call__`]. In case
+        [`LayoutLMv2ImageProcessor`] was initialized with `apply_ocr` set to `True`, it passes the obtained words and
+        bounding boxes along with the additional arguments to [`~LayoutXLMTokenizer.__call__`] and returns the output,
+        together with resized `images`. In case [`LayoutLMv2ImageProcessor`] was initialized with `apply_ocr` set to
+        `False`, it passes the words (`text`/`text_pair`) and `boxes` specified by the user along with the additional
+        arguments to [`~LayoutXLMTokenizer.__call__`] and returns the output, together with resized `images`.
+
+        Please refer to the docstring of the above two methods for more information.
+
+        Returns:
+            [`BatchEncoding`]: The tokenizer output with the image processor's `pixel_values` stored under the
+            `"image"` key.
+
+        Raises:
+            ValueError: If `boxes` or `word_labels` are given while the image processor applies OCR, or if
+                `return_overflowing_tokens` is `True` while `return_offsets_mapping` is `False`.
+        """
+        # verify input
+        if self.image_processor.apply_ocr and (boxes is not None):
+            raise ValueError(
+                "You cannot provide bounding boxes if you initialized the image processor with apply_ocr set to True."
+            )
+
+        if self.image_processor.apply_ocr and (word_labels is not None):
+            raise ValueError(
+                "You cannot provide word labels if you initialized the image processor with apply_ocr set to True."
+            )
+
+        if return_overflowing_tokens is True and return_offsets_mapping is False:
+            raise ValueError("You cannot return overflowing tokens without returning the offsets mapping.")
+
+        # first, apply the image processor
+        features = self.image_processor(images=images, return_tensors=return_tensors)
+
+        # second, apply the tokenizer
+        # With OCR enabled and only `text` supplied, `text` is kept as the first sequence and the OCR'd words
+        # become the second sequence.
+        if text is not None and self.image_processor.apply_ocr and text_pair is None:
+            if isinstance(text, str):
+                text = [text]  # add batch dimension (as the image processor always adds a batch dimension)
+            text_pair = features["words"]
+
+        encoded_inputs = self.tokenizer(
+            text=text if text is not None else features["words"],
+            text_pair=text_pair,
+            boxes=boxes if boxes is not None else features["boxes"],
+            word_labels=word_labels,
+            add_special_tokens=add_special_tokens,
+            padding=padding,
+            truncation=truncation,
+            max_length=max_length,
+            stride=stride,
+            pad_to_multiple_of=pad_to_multiple_of,
+            return_token_type_ids=return_token_type_ids,
+            return_attention_mask=return_attention_mask,
+            return_overflowing_tokens=return_overflowing_tokens,
+            return_special_tokens_mask=return_special_tokens_mask,
+            return_offsets_mapping=return_offsets_mapping,
+            return_length=return_length,
+            verbose=verbose,
+            return_tensors=return_tensors,
+            **kwargs,
+        )
+
+        # add pixel values
+        images = features.pop("pixel_values")
+        if return_overflowing_tokens is True:
+            images = self.get_overflowing_images(images, encoded_inputs["overflow_to_sample_mapping"])
+        encoded_inputs["image"] = images
+
+        return encoded_inputs
+
+    def get_overflowing_images(self, images, overflow_to_sample_mapping):
+        """
+        Repeat images so that every encoded row listed in `overflow_to_sample_mapping` gets the image of the
+        sample it came from. Returns a list with one image per entry of the mapping.
+        """
+        # in case there's an overflow, ensure each `input_ids` sample is mapped to its corresponding image
+        images_with_overflow = [images[sample_idx] for sample_idx in overflow_to_sample_mapping]
+
+        if len(images_with_overflow) != len(overflow_to_sample_mapping):
+            raise ValueError(
+                "Expected length of images to be the same as the length of `overflow_to_sample_mapping`, but got"
+                f" {len(images_with_overflow)} and {len(overflow_to_sample_mapping)}"
+            )
+
+        return images_with_overflow
+
+    @property
+    def model_input_names(self):
+        """Names of the inputs this processor reports for the model."""
+        return ["input_ids", "bbox", "attention_mask", "image"]
+
+    @property
+    def feature_extractor_class(self):
+        """Deprecated alias of `image_processor_class`; emits a `FutureWarning` on access."""
+        warnings.warn(
+            "`feature_extractor_class` is deprecated and will be removed in v5. Use `image_processor_class` instead.",
+            FutureWarning,
+        )
+        return self.image_processor_class
+
+    @property
+    def feature_extractor(self):
+        """Deprecated alias of `image_processor`; emits a `FutureWarning` on access."""
+        warnings.warn(
+            "`feature_extractor` is deprecated and will be removed in v5. Use `image_processor` instead.",
+            FutureWarning,
+        )
+        return self.image_processor
+
+
+# Names exported by a star-import of this module.
+__all__ = ["LayoutXLMProcessor"]
diff --git a/phivenv/Lib/site-packages/transformers/models/layoutxlm/tokenization_layoutxlm.py b/phivenv/Lib/site-packages/transformers/models/layoutxlm/tokenization_layoutxlm.py
new file mode 100644
index 0000000000000000000000000000000000000000..3dbe3c21a6365b5a20988626cc57520ea37d5fc3
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/layoutxlm/tokenization_layoutxlm.py
@@ -0,0 +1,1191 @@
+# coding=utf-8
+# Copyright 2021 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License
+"""Tokenization classes for LayoutXLM model."""
+
+import os
+from shutil import copyfile
+from typing import Any, Optional, Union
+
+import sentencepiece as spm
+
+from ...tokenization_utils import AddedToken, PreTrainedTokenizer
+from ...tokenization_utils_base import (
+    BatchEncoding,
+    EncodedInput,
+    PreTokenizedInput,
+    TextInput,
+    TextInputPair,
+    TruncationStrategy,
+)
+from ...utils import PaddingStrategy, TensorType, add_end_docstrings, logging
+from ...utils.import_utils import requires
+from ..xlm_roberta.tokenization_xlm_roberta import (
+    SPIECE_UNDERLINE,
+    VOCAB_FILES_NAMES,
+)
+
+
+logger = logging.get_logger(__name__)
+
+
+LAYOUTXLM_ENCODE_KWARGS_DOCSTRING = r"""
+            add_special_tokens (`bool`, *optional*, defaults to `True`):
+                Whether or not to encode the sequences with the special tokens relative to their model.
+            padding (`bool`, `str` or [`~file_utils.PaddingStrategy`], *optional*, defaults to `False`):
+                Activates and controls padding. Accepts the following values:
+
+                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
+                  sequence if provided).
+                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
+                  acceptable input length for the model if that argument is not provided.
+                - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
+                  lengths).
+            truncation (`bool`, `str` or [`~tokenization_utils_base.TruncationStrategy`], *optional*, defaults to `False`):
+                Activates and controls truncation. Accepts the following values:
+
+                - `True` or `'longest_first'`: Truncate to a maximum length specified with the argument `max_length` or
+                  to the maximum acceptable input length for the model if that argument is not provided. This will
+                  truncate token by token, removing a token from the longest sequence in the pair if a pair of
+                  sequences (or a batch of pairs) is provided.
+                - `'only_first'`: Truncate to a maximum length specified with the argument `max_length` or to the
+                  maximum acceptable input length for the model if that argument is not provided. This will only
+                  truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
+                - `'only_second'`: Truncate to a maximum length specified with the argument `max_length` or to the
+                  maximum acceptable input length for the model if that argument is not provided. This will only
+                  truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
+                - `False` or `'do_not_truncate'` (default): No truncation (i.e., can output batch with sequence lengths
+                  greater than the model maximum admissible input size).
+            max_length (`int`, *optional*):
+                Controls the maximum length to use by one of the truncation/padding parameters.
+
+                If left unset or set to `None`, this will use the predefined model maximum length if a maximum length
+                is required by one of the truncation/padding parameters. If the model has no specific maximum input
+                length (like XLNet) truncation/padding to a maximum length will be deactivated.
+            stride (`int`, *optional*, defaults to 0):
+                If set to a number along with `max_length`, the overflowing tokens returned when
+                `return_overflowing_tokens=True` will contain some tokens from the end of the truncated sequence
+                returned to provide some overlap between truncated and overflowing sequences. The value of this
+                argument defines the number of overlapping tokens.
+            pad_to_multiple_of (`int`, *optional*):
+                If set will pad the sequence to a multiple of the provided value. This is especially useful to enable
+                the use of Tensor Cores on NVIDIA hardware with compute capability `>= 7.5` (Volta).
+            return_tensors (`str` or [`~file_utils.TensorType`], *optional*):
+                If set, will return tensors instead of list of python integers. Acceptable values are:
+
+                - `'tf'`: Return TensorFlow `tf.constant` objects.
+                - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                - `'np'`: Return Numpy `np.ndarray` objects.
+            return_token_type_ids (`bool`, *optional*):
+                Whether to return token type IDs. If left to the default, will return the token type IDs according to
+                the specific tokenizer's default, defined by the `return_outputs` attribute.
+
+                [What are token type IDs?](../glossary#token-type-ids)
+            return_attention_mask (`bool`, *optional*):
+                Whether to return the attention mask. If left to the default, will return the attention mask according
+                to the specific tokenizer's default, defined by the `return_outputs` attribute.
+
+                [What are attention masks?](../glossary#attention-mask)
+            return_overflowing_tokens (`bool`, *optional*, defaults to `False`):
+                Whether or not to return overflowing token sequences. If a pair of sequences of input ids (or a batch
+                of pairs) is provided with `truncation_strategy = longest_first` or `True`, an error is raised instead
+                of returning overflowing tokens.
+            return_special_tokens_mask (`bool`, *optional*, defaults to `False`):
+                Whether or not to return special tokens mask information.
+            return_offsets_mapping (`bool`, *optional*, defaults to `False`):
+                Whether or not to return `(char_start, char_end)` for each token.
+
+                This is only available on fast tokenizers inheriting from [`PreTrainedTokenizerFast`], if using
+                Python's tokenizer, this method will raise `NotImplementedError`.
+            return_length  (`bool`, *optional*, defaults to `False`):
+                Whether or not to return the lengths of the encoded inputs.
+            verbose (`bool`, *optional*, defaults to `True`):
+                Whether or not to print more information and warnings.
+            **kwargs: passed to the `self.tokenize()` method
+
+        Return:
+            [`BatchEncoding`]: A [`BatchEncoding`] with the following fields:
+
+            - **input_ids** -- List of token ids to be fed to a model.
+
+              [What are input IDs?](../glossary#input-ids)
+
+            - **bbox** -- List of bounding boxes to be fed to a model.
+
+            - **token_type_ids** -- List of token type ids to be fed to a model (when `return_token_type_ids=True` or
+              if *"token_type_ids"* is in `self.model_input_names`).
+
+              [What are token type IDs?](../glossary#token-type-ids)
+
+            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
+              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names`).
+
+              [What are attention masks?](../glossary#attention-mask)
+
+            - **labels** -- List of labels to be fed to a model. (when `word_labels` is specified).
+            - **overflowing_tokens** -- List of overflowing tokens sequences (when a `max_length` is specified and
+              `return_overflowing_tokens=True`).
+            - **num_truncated_tokens** -- Number of tokens truncated (when a `max_length` is specified and
+              `return_overflowing_tokens=True`).
+            - **special_tokens_mask** -- List of 0s and 1s, with 1 specifying added special tokens and 0 specifying
+              regular sequence tokens (when `add_special_tokens=True` and `return_special_tokens_mask=True`).
+            - **length** -- The length of the inputs (when `return_length=True`).
+"""
+
+
+@requires(backends=("sentencepiece",))
+class LayoutXLMTokenizer(PreTrainedTokenizer):
+    """
+    Adapted from [`RobertaTokenizer`] and [`XLNetTokenizer`]. Based on
+    [SentencePiece](https://github.com/google/sentencepiece).
+
+    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
+    this superclass for more information regarding those methods.
+
+    Args:
+        vocab_file (`str`):
+            Path to the vocabulary file.
+        bos_token (`str`, *optional*, defaults to `"<s>"`):
+            The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
+
+            <Tip>
+
+            When building a sequence using special tokens, this is not the token that is used for the beginning of
+            sequence. The token used is the `cls_token`.
+
+            </Tip>
+
+        eos_token (`str`, *optional*, defaults to `"</s>"`):
+            The end of sequence token.
+
+            <Tip>
+
+            When building a sequence using special tokens, this is not the token that is used for the end of sequence.
+            The token used is the `sep_token`.
+
+            </Tip>
+
+        sep_token (`str`, *optional*, defaults to `"</s>"`):
+            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
+            sequence classification or for a text and a question for question answering. It is also used as the last
+            token of a sequence built with special tokens.
+        cls_token (`str`, *optional*, defaults to `"<s>"`):
+            The classifier token which is used when doing sequence classification (classification of the whole sequence
+            instead of per-token classification). It is the first token of the sequence when built with special tokens.
+        unk_token (`str`, *optional*, defaults to `"<unk>"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        pad_token (`str`, *optional*, defaults to `"<pad>"`):
+            The token used for padding, for example when batching sequences of different lengths.
+        mask_token (`str`, *optional*, defaults to `"<mask>"`):
+            The token used for masking values. This is the token used when training this model with masked language
+            modeling. This is the token which the model will try to predict.
+        cls_token_box (`list[int]`, *optional*, defaults to `[0, 0, 0, 0]`):
+            The bounding box to use for the special [CLS] token.
+        sep_token_box (`list[int]`, *optional*, defaults to `[1000, 1000, 1000, 1000]`):
+            The bounding box to use for the special [SEP] token.
+        pad_token_box (`list[int]`, *optional*, defaults to `[0, 0, 0, 0]`):
+            The bounding box to use for the special [PAD] token.
+        pad_token_label (`int`, *optional*, defaults to -100):
+            The label to use for padding tokens. Defaults to -100, which is the `ignore_index` of PyTorch's
+            CrossEntropyLoss.
+        only_label_first_subword (`bool`, *optional*, defaults to `True`):
+            Whether or not to only label the first subword, in case word labels are provided.
+        sp_model_kwargs (`dict`, *optional*):
+            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
+            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
+            to set:
+
+            - `enable_sampling`: Enable subword regularization.
+            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
+
+              - `nbest_size = {0,1}`: No sampling is performed.
+              - `nbest_size > 1`: samples from the nbest_size results.
+              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from all hypotheses (lattice)
+                using forward-filtering-and-backward-sampling algorithm.
+
+            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
+              BPE-dropout.
+
+    Attributes:
+        sp_model (`SentencePieceProcessor`):
+            The *SentencePiece* processor that is used for every conversion (string, tokens and IDs).
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    model_input_names = ["input_ids", "attention_mask"]
+
+    def __init__(
+        self,
+        vocab_file,
+        bos_token="<s>",
+        eos_token="</s>",
+        sep_token="</s>",
+        cls_token="<s>",
+        unk_token="<unk>",
+        pad_token="<pad>",
+        mask_token="<mask>",
+        cls_token_box=[0, 0, 0, 0],
+        sep_token_box=[1000, 1000, 1000, 1000],
+        pad_token_box=[0, 0, 0, 0],
+        pad_token_label=-100,
+        only_label_first_subword=True,
+        sp_model_kwargs: Optional[dict[str, Any]] = None,
+        **kwargs,
+    ) -> None:
+        # The mask token behaves like a normal word, i.e. it includes the space before it
+        mask_token = AddedToken(mask_token, lstrip=True, special=True) if isinstance(mask_token, str) else mask_token
+
+        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
+
+        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+        self.sp_model.Load(str(vocab_file))
+        self.vocab_file = vocab_file
+
+        # Original fairseq vocab and spm vocab must be "aligned":
+        # Vocab    |    0    |    1    |   2    |    3    |  4  |  5  |  6  |   7   |   8   |  9
+        # -------- | ------- | ------- | ------ | ------- | --- | --- | --- | ----- | ----- | ----
+        # fairseq  | '<s>'   | '<pad>' | '</s>' | '<unk>' | ',' | '.' | '▁' | 's'   | '▁de' | '-'
+        # spm      | '<unk>' | '<s>'   | '</s>' | ','     | '.' | '▁' | 's' | '▁de' | '-'   | '▁a'
+
+        # Mimic fairseq token-to-id alignment for the first 4 tokens
+        self.fairseq_tokens_to_ids = {"<s>": 0, "<pad>": 1, "</s>": 2, "<unk>": 3}
+
+        # The first "real" token "," has position 4 in the original fairseq vocab and position 3 in the spm vocab
+        self.fairseq_offset = 1
+
+        self.fairseq_tokens_to_ids["<mask>"] = len(self.sp_model) + self.fairseq_offset
+        self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}
+
+        # additional properties
+        self.cls_token_box = cls_token_box
+        self.sep_token_box = sep_token_box
+        self.pad_token_box = pad_token_box
+        self.pad_token_label = pad_token_label
+        self.only_label_first_subword = only_label_first_subword
+
+        super().__init__(
+            bos_token=bos_token,
+            eos_token=eos_token,
+            unk_token=unk_token,
+            sep_token=sep_token,
+            cls_token=cls_token,
+            pad_token=pad_token,
+            mask_token=mask_token,
+            cls_token_box=cls_token_box,
+            sep_token_box=sep_token_box,
+            pad_token_box=pad_token_box,
+            pad_token_label=pad_token_label,
+            only_label_first_subword=only_label_first_subword,
+            sp_model_kwargs=self.sp_model_kwargs,
+            **kwargs,
+        )
+
+    def __getstate__(self):
+        state = self.__dict__.copy()
+        state["sp_model"] = None
+        state["sp_model_proto"] = self.sp_model.serialized_model_proto()
+        return state
+
+    def __setstate__(self, d):
+        self.__dict__.update(d)
+
+        # for backward compatibility
+        if not hasattr(self, "sp_model_kwargs"):
+            self.sp_model_kwargs = {}
+
+        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+        self.sp_model.LoadFromSerializedProto(self.sp_model_proto)
+
+    def build_inputs_with_special_tokens(
+        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
+    ) -> list[int]:
+        """
+        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
+        adding special tokens. An XLM-RoBERTa sequence has the following format:
+
+        - single sequence: `<s> X </s>`
+        - pair of sequences: `<s> A </s></s> B </s>`
+
+        Args:
+            token_ids_0 (`list[int]`):
+                List of IDs to which the special tokens will be added.
+            token_ids_1 (`list[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            `list[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
+        """
+
+        if token_ids_1 is None:
+            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
+        cls = [self.cls_token_id]
+        sep = [self.sep_token_id]
+        return cls + token_ids_0 + sep + sep + token_ids_1 + sep
+
+    def get_special_tokens_mask(
+        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False
+    ) -> list[int]:
+        """
+        Retrieve the special tokens mask for a token list that has no special tokens added. This method is called
+        when adding special tokens using the tokenizer `prepare_for_model` method.
+
+        Args:
+            token_ids_0 (`list[int]`):
+                List of IDs.
+            token_ids_1 (`list[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
+                Whether or not the token list is already formatted with special tokens for the model.
+
+        Returns:
+            `list[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+        """
+
+        if already_has_special_tokens:
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+            )
+
+        if token_ids_1 is None:
+            return [1] + ([0] * len(token_ids_0)) + [1]
+        return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
+
+    def create_token_type_ids_from_sequences(
+        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
+    ) -> list[int]:
+        """
+        Create a mask from the two sequences passed to be used in a sequence-pair classification task. XLM-RoBERTa does
+        not make use of token type ids, therefore a list of zeros is returned.
+
+        Args:
+            token_ids_0 (`list[int]`):
+                List of IDs.
+            token_ids_1 (`list[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            `list[int]`: List of zeros.
+
+        """
+
+        sep = [self.sep_token_id]
+        cls = [self.cls_token_id]
+
+        if token_ids_1 is None:
+            return len(cls + token_ids_0 + sep) * [0]
+        return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
+
+    @property
+    def vocab_size(self):
+        return len(self.sp_model) + self.fairseq_offset + 1  # Add the <mask> token
+
+    def get_vocab(self):
+        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
+        vocab.update(self.added_tokens_encoder)
+        return vocab
+
+    def _tokenize(self, text: str) -> list[str]:
+        return self.sp_model.encode(text, out_type=str)
+
+    def _convert_token_to_id(self, token):
+        """Converts a token (str) to an id using the vocab."""
+        if token in self.fairseq_tokens_to_ids:
+            return self.fairseq_tokens_to_ids[token]
+        spm_id = self.sp_model.PieceToId(token)
+
+        # Need to return unknown token if the SP model returned 0
+        return spm_id + self.fairseq_offset if spm_id else self.unk_token_id
+
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) to a token (str) using the vocab."""
+        if index in self.fairseq_ids_to_tokens:
+            return self.fairseq_ids_to_tokens[index]
+        return self.sp_model.IdToPiece(index - self.fairseq_offset)
+
+    def convert_tokens_to_string(self, tokens):
+        """Converts a sequence of tokens (strings for sub-words) into a single string."""
+        out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
+        return out_string
+
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
+        if not os.path.isdir(save_directory):
+            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+            return
+        out_vocab_file = os.path.join(
+            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+        )
+
+        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
+            copyfile(self.vocab_file, out_vocab_file)
+        elif not os.path.isfile(self.vocab_file):
+            with open(out_vocab_file, "wb") as fi:
+                content_spiece_model = self.sp_model.serialized_model_proto()
+                fi.write(content_spiece_model)
+
+        return (out_vocab_file,)
+
+    @add_end_docstrings(LAYOUTXLM_ENCODE_KWARGS_DOCSTRING)
+    def __call__(
+        self,
+        text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]],
+        text_pair: Optional[Union[PreTokenizedInput, list[PreTokenizedInput]]] = None,
+        boxes: Optional[Union[list[list[int]], list[list[list[int]]]]] = None,
+        word_labels: Optional[Union[list[int], list[list[int]]]] = None,
+        add_special_tokens: bool = True,
+        padding: Union[bool, str, PaddingStrategy] = False,
+        truncation: Union[bool, str, TruncationStrategy] = None,
+        max_length: Optional[int] = None,
+        stride: int = 0,
+        pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[str] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        return_token_type_ids: Optional[bool] = None,
+        return_attention_mask: Optional[bool] = None,
+        return_overflowing_tokens: bool = False,
+        return_special_tokens_mask: bool = False,
+        return_offsets_mapping: bool = False,
+        return_length: bool = False,
+        verbose: bool = True,
+        **kwargs,
+    ) -> BatchEncoding:
+        """
+        Main method to tokenize and prepare for the model one or several sequence(s) or one or several pair(s) of
+        sequences with word-level normalized bounding boxes and optional labels.
+
+        Args:
+            text (`str`, `list[str]`, `list[list[str]]`):
+                The sequence or batch of sequences to be encoded. Each sequence can be a string, a list of strings
+                (words of a single example or questions of a batch of examples) or a list of list of strings (batch of
+                words).
+            text_pair (`list[str]`, `list[list[str]]`):
+                The sequence or batch of sequences to be encoded. Each sequence should be a list of strings
+                (pretokenized string).
+            boxes (`list[list[int]]`, `list[list[list[int]]]`):
+                Word-level bounding boxes. Each bounding box should be normalized to be on a 0-1000 scale.
+            word_labels (`list[int]`, `list[list[int]]`, *optional*):
+                Word-level integer labels (for token classification tasks such as FUNSD, CORD).
+        """
+
+        # Input type checking for clearer error
+        def _is_valid_text_input(t):
+            if isinstance(t, str):
+                # Strings are fine
+                return True
+            elif isinstance(t, (list, tuple)):
+                # Lists are fine as long as they are...
+                if len(t) == 0:
+                    # ... empty
+                    return True
+                elif isinstance(t[0], str):
+                    # ... list of strings
+                    return True
+                elif isinstance(t[0], (list, tuple)):
+                    # ... list with an empty list or with a list of strings
+                    return len(t[0]) == 0 or isinstance(t[0][0], str)
+                else:
+                    return False
+            else:
+                return False
+
+        if text_pair is not None:
+            # in case text + text_pair are provided, text = questions, text_pair = words
+            if not _is_valid_text_input(text):
+                raise ValueError("text input must of type `str` (single example) or `list[str]` (batch of examples). ")
+            if not isinstance(text_pair, (list, tuple)):
+                raise ValueError(
+                    "words must of type `list[str]` (single pretokenized example), "
+                    "or `list[list[str]]` (batch of pretokenized examples)."
+                )
+        else:
+            # in case only text is provided => must be words
+            if not isinstance(text, (list, tuple)):
+                raise ValueError(
+                    "Words must of type `list[str]` (single pretokenized example), "
+                    "or `list[list[str]]` (batch of pretokenized examples)."
+                )
+
+        if text_pair is not None:
+            is_batched = isinstance(text, (list, tuple))
+        else:
+            is_batched = isinstance(text, (list, tuple)) and text and isinstance(text[0], (list, tuple))
+
+        words = text if text_pair is None else text_pair
+        if boxes is None:
+            raise ValueError("You must provide corresponding bounding boxes")
+        if is_batched:
+            if len(words) != len(boxes):
+                raise ValueError("You must provide words and boxes for an equal amount of examples")
+            for words_example, boxes_example in zip(words, boxes):
+                if len(words_example) != len(boxes_example):
+                    raise ValueError("You must provide as many words as there are bounding boxes")
+        else:
+            if len(words) != len(boxes):
+                raise ValueError("You must provide as many words as there are bounding boxes")
+
+        if is_batched:
+            if text_pair is not None and len(text) != len(text_pair):
+                raise ValueError(
+                    f"batch length of `text`: {len(text)} does not match batch length of `text_pair`:"
+                    f" {len(text_pair)}."
+                )
+            batch_text_or_text_pairs = list(zip(text, text_pair)) if text_pair is not None else text
+            is_pair = bool(text_pair is not None)
+            return self.batch_encode_plus(
+                batch_text_or_text_pairs=batch_text_or_text_pairs,
+                is_pair=is_pair,
+                boxes=boxes,
+                word_labels=word_labels,
+                add_special_tokens=add_special_tokens,
+                padding=padding,
+                truncation=truncation,
+                max_length=max_length,
+                stride=stride,
+                pad_to_multiple_of=pad_to_multiple_of,
+                padding_side=padding_side,
+                return_tensors=return_tensors,
+                return_token_type_ids=return_token_type_ids,
+                return_attention_mask=return_attention_mask,
+                return_overflowing_tokens=return_overflowing_tokens,
+                return_special_tokens_mask=return_special_tokens_mask,
+                return_offsets_mapping=return_offsets_mapping,
+                return_length=return_length,
+                verbose=verbose,
+                **kwargs,
+            )
+        else:
+            return self.encode_plus(
+                text=text,
+                text_pair=text_pair,
+                boxes=boxes,
+                word_labels=word_labels,
+                add_special_tokens=add_special_tokens,
+                padding=padding,
+                truncation=truncation,
+                max_length=max_length,
+                stride=stride,
+                pad_to_multiple_of=pad_to_multiple_of,
+                padding_side=padding_side,
+                return_tensors=return_tensors,
+                return_token_type_ids=return_token_type_ids,
+                return_attention_mask=return_attention_mask,
+                return_overflowing_tokens=return_overflowing_tokens,
+                return_special_tokens_mask=return_special_tokens_mask,
+                return_offsets_mapping=return_offsets_mapping,
+                return_length=return_length,
+                verbose=verbose,
+                **kwargs,
+            )
+
+    def _batch_encode_plus(
+        self,
+        batch_text_or_text_pairs: Union[
+            list[TextInput],
+            list[TextInputPair],
+            list[PreTokenizedInput],
+        ],
+        is_pair: Optional[bool] = None,
+        boxes: Optional[list[list[list[int]]]] = None,
+        word_labels: Optional[list[list[int]]] = None,
+        add_special_tokens: bool = True,
+        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
+        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
+        max_length: Optional[int] = None,
+        stride: int = 0,
+        pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[str] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        return_token_type_ids: Optional[bool] = None,
+        return_attention_mask: Optional[bool] = None,
+        return_overflowing_tokens: bool = False,
+        return_special_tokens_mask: bool = False,
+        return_offsets_mapping: bool = False,
+        return_length: bool = False,
+        verbose: bool = True,
+        **kwargs,
+    ) -> BatchEncoding:
+        if return_offsets_mapping:
+            raise NotImplementedError(
+                "return_offset_mapping is not available when using Python tokenizers. "
+                "To use this feature, change your tokenizer to one deriving from "
+                "transformers.PreTrainedTokenizerFast."
+            )
+
+        batch_outputs = self._batch_prepare_for_model(
+            batch_text_or_text_pairs=batch_text_or_text_pairs,
+            is_pair=is_pair,
+            boxes=boxes,
+            word_labels=word_labels,
+            add_special_tokens=add_special_tokens,
+            padding_strategy=padding_strategy,
+            truncation_strategy=truncation_strategy,
+            max_length=max_length,
+            stride=stride,
+            pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
+            return_attention_mask=return_attention_mask,
+            return_token_type_ids=return_token_type_ids,
+            return_overflowing_tokens=return_overflowing_tokens,
+            return_special_tokens_mask=return_special_tokens_mask,
+            return_length=return_length,
+            return_tensors=return_tensors,
+            verbose=verbose,
+        )
+
+        return BatchEncoding(batch_outputs)
+
+    @add_end_docstrings(LAYOUTXLM_ENCODE_KWARGS_DOCSTRING)
+    def _batch_prepare_for_model(
+        self,
+        batch_text_or_text_pairs,
+        is_pair: Optional[bool] = None,
+        boxes: Optional[list[list[int]]] = None,
+        word_labels: Optional[list[list[int]]] = None,
+        add_special_tokens: bool = True,
+        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
+        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
+        max_length: Optional[int] = None,
+        stride: int = 0,
+        pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[str] = None,
+        return_tensors: Optional[str] = None,
+        return_token_type_ids: Optional[bool] = None,
+        return_attention_mask: Optional[bool] = None,
+        return_overflowing_tokens: bool = False,
+        return_special_tokens_mask: bool = False,
+        return_length: bool = False,
+        verbose: bool = True,
+    ) -> BatchEncoding:
+        """
+        Prepares a sequence of input ids, or a pair of sequences of input ids, so that it can be used by the model. It
+        adds special tokens, truncates sequences if overflowing while taking into account the special tokens and
+        manages a moving window (with user defined stride) for overflowing tokens.
+
+        Args:
+            batch_text_or_text_pairs: batch of word lists, or of (question, words) pairs when `is_pair` is set
+        """
+
+        batch_outputs = {}
+        for idx, example in enumerate(zip(batch_text_or_text_pairs, boxes)):
+            batch_text_or_text_pair, boxes_example = example
+            outputs = self.prepare_for_model(
+                batch_text_or_text_pair[0] if is_pair else batch_text_or_text_pair,
+                batch_text_or_text_pair[1] if is_pair else None,
+                boxes_example,
+                word_labels=word_labels[idx] if word_labels is not None else None,
+                add_special_tokens=add_special_tokens,
+                padding=PaddingStrategy.DO_NOT_PAD.value,  # we pad in batch afterward
+                truncation=truncation_strategy.value,
+                max_length=max_length,
+                stride=stride,
+                pad_to_multiple_of=None,  # we pad in batch afterward
+                padding_side=None,  # we pad in batch afterward
+                return_attention_mask=False,  # we pad in batch afterward
+                return_token_type_ids=return_token_type_ids,
+                return_overflowing_tokens=return_overflowing_tokens,
+                return_special_tokens_mask=return_special_tokens_mask,
+                return_length=return_length,
+                return_tensors=None,  # We convert the whole batch to tensors at the end
+                prepend_batch_axis=False,
+                verbose=verbose,
+            )
+
+            for key, value in outputs.items():
+                if key not in batch_outputs:
+                    batch_outputs[key] = []
+                batch_outputs[key].append(value)
+
+        batch_outputs = self.pad(
+            batch_outputs,
+            padding=padding_strategy.value,
+            max_length=max_length,
+            pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
+            return_attention_mask=return_attention_mask,
+        )
+
+        batch_outputs = BatchEncoding(batch_outputs, tensor_type=return_tensors)
+
+        return batch_outputs
+
+    def _encode_plus(
+        self,
+        text: Union[TextInput, PreTokenizedInput],
+        text_pair: Optional[PreTokenizedInput] = None,
+        boxes: Optional[list[list[int]]] = None,
+        word_labels: Optional[list[int]] = None,
+        add_special_tokens: bool = True,
+        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
+        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
+        max_length: Optional[int] = None,
+        stride: int = 0,
+        pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[str] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        return_token_type_ids: Optional[bool] = None,
+        return_attention_mask: Optional[bool] = None,
+        return_overflowing_tokens: bool = False,
+        return_special_tokens_mask: bool = False,
+        return_offsets_mapping: bool = False,
+        return_length: bool = False,
+        verbose: bool = True,
+        **kwargs,
+    ) -> BatchEncoding:
+        if return_offsets_mapping:
+            raise NotImplementedError(
+                "return_offset_mapping is not available when using Python tokenizers. "
+                "To use this feature, change your tokenizer to one deriving from "
+                "transformers.PreTrainedTokenizerFast. "
+                "More information on available tokenizers at "
+                "https://github.com/huggingface/transformers/pull/2674"
+            )
+
+        return self.prepare_for_model(
+            text=text,
+            text_pair=text_pair,
+            boxes=boxes,
+            word_labels=word_labels,
+            add_special_tokens=add_special_tokens,
+            padding=padding_strategy.value,
+            truncation=truncation_strategy.value,
+            max_length=max_length,
+            stride=stride,
+            pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
+            return_tensors=return_tensors,
+            prepend_batch_axis=True,
+            return_attention_mask=return_attention_mask,
+            return_token_type_ids=return_token_type_ids,
+            return_overflowing_tokens=return_overflowing_tokens,
+            return_special_tokens_mask=return_special_tokens_mask,
+            return_length=return_length,
+            verbose=verbose,
+        )
+
+    @add_end_docstrings(LAYOUTXLM_ENCODE_KWARGS_DOCSTRING)
+    def prepare_for_model(
+        self,
+        text: Union[TextInput, PreTokenizedInput],
+        text_pair: Optional[PreTokenizedInput] = None,
+        boxes: Optional[list[list[int]]] = None,
+        word_labels: Optional[list[int]] = None,
+        add_special_tokens: bool = True,
+        padding: Union[bool, str, PaddingStrategy] = False,
+        truncation: Union[bool, str, TruncationStrategy] = None,
+        max_length: Optional[int] = None,
+        stride: int = 0,
+        pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[str] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        return_token_type_ids: Optional[bool] = None,
+        return_attention_mask: Optional[bool] = None,
+        return_overflowing_tokens: bool = False,
+        return_special_tokens_mask: bool = False,
+        return_offsets_mapping: bool = False,
+        return_length: bool = False,
+        verbose: bool = True,
+        prepend_batch_axis: bool = False,
+        **kwargs,
+    ) -> BatchEncoding:
+        """
+        Prepares a sequence or a pair of sequences so that it can be used by the model. It adds special tokens,
+        truncates sequences if overflowing while taking into account the special tokens and manages a moving window
+        (with user defined stride) for overflowing tokens.
+
+        Word-level `boxes` are turned into token-level `bbox`. If provided, word-level `word_labels` are turned into
+        token-level `labels`. The word label is used for the first token of the word, while remaining tokens are
+        labeled with -100, such that they will be ignored by the loss function.
+
+        Args:
+            text (`str`, `list[str]`, `list[list[str]]`):
+                The first sequence to be encoded. This can be a string, a list of strings or a list of list of strings.
+            text_pair (`list[str]` or `list[int]`, *optional*):
+                Optional second sequence to be encoded. This can be a list of strings (words of a single example) or a
+                list of list of strings (words of a batch of examples).
+        """
+
+        # Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
+        padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
+            padding=padding,
+            truncation=truncation,
+            max_length=max_length,
+            pad_to_multiple_of=pad_to_multiple_of,
+            verbose=verbose,
+            **kwargs,
+        )
+
+        tokens = []
+        pair_tokens = []
+        token_boxes = []
+        pair_token_boxes = []
+        labels = []
+
+        if text_pair is None:
+            if word_labels is None:
+                # CASE 1: document image classification (training + inference) + CASE 2: token classification (inference)
+                for word, box in zip(text, boxes):
+                    if len(word) < 1:  # skip empty words
+                        continue
+                    word_tokens = self.tokenize(word)
+                    tokens.extend(word_tokens)
+                    token_boxes.extend([box] * len(word_tokens))
+            else:
+                # CASE 2: token classification (training)
+                for word, box, label in zip(text, boxes, word_labels):
+                    if len(word) < 1:  # skip empty words
+                        continue
+                    word_tokens = self.tokenize(word)
+                    tokens.extend(word_tokens)
+                    token_boxes.extend([box] * len(word_tokens))
+                    if self.only_label_first_subword:
+                        # Use the real label id for the first token of the word, and padding ids for the remaining tokens
+                        labels.extend([label] + [self.pad_token_label] * (len(word_tokens) - 1))
+                    else:
+                        labels.extend([label] * len(word_tokens))
+        else:
+            # CASE 3: document visual question answering (inference)
+            # text = question
+            # text_pair = words
+            tokens = self.tokenize(text)
+            token_boxes = [self.pad_token_box for _ in range(len(tokens))] + [self.sep_token_box]
+
+            for word, box in zip(text_pair, boxes):
+                if len(word) < 1:  # skip empty words
+                    continue
+                word_tokens = self.tokenize(word)
+                pair_tokens.extend(word_tokens)
+                pair_token_boxes.extend([box] * len(word_tokens))
+
+        # Create ids + pair_ids
+        ids = self.convert_tokens_to_ids(tokens)
+        pair_ids = self.convert_tokens_to_ids(pair_tokens) if pair_tokens else None
+
+        # Compute the total size of the returned encodings
+        pair = bool(pair_ids is not None)
+        len_ids = len(ids)
+        len_pair_ids = len(pair_ids) if pair else 0
+        total_len = len_ids + len_pair_ids + (self.num_special_tokens_to_add(pair=pair) if add_special_tokens else 0)
+
+        # Truncation: Handle max sequence length
+        overflowing_tokens = []
+        overflowing_token_boxes = []
+        overflowing_labels = []
+        if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE and max_length and total_len > max_length:
+            (
+                ids,
+                token_boxes,
+                pair_ids,
+                pair_token_boxes,
+                labels,
+                overflowing_tokens,
+                overflowing_token_boxes,
+                overflowing_labels,
+            ) = self.truncate_sequences(
+                ids,
+                token_boxes,
+                pair_ids=pair_ids,
+                pair_token_boxes=pair_token_boxes,
+                labels=labels,
+                num_tokens_to_remove=total_len - max_length,
+                truncation_strategy=truncation_strategy,
+                stride=stride,
+            )
+
+        if return_token_type_ids and not add_special_tokens:
+            raise ValueError(
+                "Asking to return token_type_ids while setting add_special_tokens to False "
+                "results in an undefined behavior. Please set add_special_tokens to True or "
+                "set return_token_type_ids to None."
+            )
+
+        # Load from model defaults
+        if return_token_type_ids is None:
+            return_token_type_ids = "token_type_ids" in self.model_input_names
+        if return_attention_mask is None:
+            return_attention_mask = "attention_mask" in self.model_input_names
+
+        encoded_inputs = {}
+
+        if return_overflowing_tokens:
+            encoded_inputs["overflowing_tokens"] = overflowing_tokens
+            encoded_inputs["overflowing_token_boxes"] = overflowing_token_boxes
+            encoded_inputs["overflowing_labels"] = overflowing_labels
+            encoded_inputs["num_truncated_tokens"] = total_len - max_length
+
+        # Add special tokens
+        if add_special_tokens:
+            sequence = self.build_inputs_with_special_tokens(ids, pair_ids)
+            token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids)
+            token_boxes = [self.cls_token_box] + token_boxes + [self.sep_token_box]
+            if pair_token_boxes:
+                pair_token_boxes = pair_token_boxes + [self.sep_token_box]
+            if labels:
+                labels = [self.pad_token_label] + labels + [self.pad_token_label]
+        else:
+            sequence = ids + pair_ids if pair else ids
+            token_type_ids = [0] * len(ids) + ([0] * len(pair_ids) if pair else [])
+
+        # Build output dictionary
+        encoded_inputs["input_ids"] = sequence
+        encoded_inputs["bbox"] = token_boxes + pair_token_boxes
+        if return_token_type_ids:
+            encoded_inputs["token_type_ids"] = token_type_ids
+        if return_special_tokens_mask:
+            if add_special_tokens:
+                encoded_inputs["special_tokens_mask"] = self.get_special_tokens_mask(ids, pair_ids)
+            else:
+                encoded_inputs["special_tokens_mask"] = [0] * len(sequence)
+
+        if labels:
+            encoded_inputs["labels"] = labels
+
+        # Check lengths
+        self._eventual_warn_about_too_long_sequence(encoded_inputs["input_ids"], max_length, verbose)
+
+        # Padding
+        if padding_strategy != PaddingStrategy.DO_NOT_PAD or return_attention_mask:
+            encoded_inputs = self.pad(
+                encoded_inputs,
+                max_length=max_length,
+                padding=padding_strategy.value,
+                pad_to_multiple_of=pad_to_multiple_of,
+                padding_side=padding_side,
+                return_attention_mask=return_attention_mask,
+            )
+
+        if return_length:
+            encoded_inputs["length"] = len(encoded_inputs["input_ids"])
+
+        batch_outputs = BatchEncoding(
+            encoded_inputs, tensor_type=return_tensors, prepend_batch_axis=prepend_batch_axis
+        )
+
+        return batch_outputs
+
+    def truncate_sequences(
+        self,
+        ids: list[int],
+        token_boxes: list[list[int]],
+        pair_ids: Optional[list[int]] = None,
+        pair_token_boxes: Optional[list[list[int]]] = None,
+        labels: Optional[list[int]] = None,
+        num_tokens_to_remove: int = 0,
+        truncation_strategy: Union[str, TruncationStrategy] = "longest_first",
+        stride: int = 0,
+    ) -> tuple[Optional[list], ...]:
+        """
+        Truncate a sequence (or a pair of sequences) together with its bounding boxes and labels.
+
+        The inputs are not modified: truncation is done by slicing, so new lists are returned.
+
+        Args:
+            ids (`list[int]`):
+                Tokenized input ids of the first sequence. Can be obtained from a string by chaining the `tokenize` and
+                `convert_tokens_to_ids` methods.
+            token_boxes (`list[list[int]]`):
+                Bounding boxes of the first sequence.
+            pair_ids (`list[int]`, *optional*):
+                Tokenized input ids of the second sequence. Can be obtained from a string by chaining the `tokenize`
+                and `convert_tokens_to_ids` methods.
+            pair_token_boxes (`list[list[int]]`, *optional*):
+                Bounding boxes of the second sequence.
+            labels (`list[int]`, *optional*):
+                Labels of the first sequence (for token classification tasks). May be `None` if there are no labels.
+            num_tokens_to_remove (`int`, *optional*, defaults to 0):
+                Number of tokens to remove using the truncation strategy.
+            truncation_strategy (`str` or [`~tokenization_utils_base.TruncationStrategy`], *optional*, defaults to `"longest_first"`):
+                The strategy to follow for truncation. Can be:
+
+                - `'longest_first'` (default): Remove tokens one at a time, each time from the end of whichever
+                  sequence is currently the longest (always the first sequence when `pair_ids` is `None`).
+                - `'only_first'`: Only remove tokens from the end of the first sequence. If it is not longer than
+                  `num_tokens_to_remove`, an error is logged and nothing is truncated.
+                - `'only_second'`: Only remove tokens from the end of the second sequence. If it is not longer than
+                  `num_tokens_to_remove`, an error is logged and nothing is truncated. No-op without `pair_ids`.
+                - `'do_not_truncate'`: No truncation.
+            stride (`int`, *optional*, defaults to 0):
+                If set to a positive number, the overflowing tokens returned will contain some tokens from the main
+                sequence returned. The value of this argument defines the number of additional tokens.
+
+        Returns:
+            `tuple`: An 8-tuple `(ids, token_boxes, pair_ids, pair_token_boxes, labels, overflowing_tokens,
+            overflowing_token_boxes, overflowing_labels)`: the truncated inputs followed by the removed tokens,
+            their bounding boxes and their labels.
+        """
+        if num_tokens_to_remove <= 0:
+            return ids, token_boxes, pair_ids, pair_token_boxes, labels, [], [], []
+
+        if not isinstance(truncation_strategy, TruncationStrategy):
+            truncation_strategy = TruncationStrategy(truncation_strategy)
+
+        # `labels` is optional: slice an empty stand-in so label-free inputs can be truncated as well, and give
+        # `None` back to the caller at the end.
+        labels_given = labels is not None
+        if not labels_given:
+            labels = []
+
+        overflowing_tokens = []
+        overflowing_token_boxes = []
+        overflowing_labels = []
+        if truncation_strategy == TruncationStrategy.LONGEST_FIRST:
+            for _ in range(num_tokens_to_remove):
+                # Only the first removal carries the `stride` overlap (a window of `stride + 1` tokens); every later
+                # removal appends just the token being dropped, so overflowing tokens are collected last-to-first.
+                if pair_ids is None or len(ids) > len(pair_ids):
+                    window_len = 1 if overflowing_tokens else min(len(ids), stride + 1)
+                    overflowing_tokens.extend(ids[-window_len:])
+                    overflowing_token_boxes.extend(token_boxes[-window_len:])
+                    overflowing_labels.extend(labels[-window_len:])
+                    ids = ids[:-1]
+                    token_boxes = token_boxes[:-1]
+                    labels = labels[:-1]
+                else:
+                    window_len = 1 if overflowing_tokens else min(len(pair_ids), stride + 1)
+                    overflowing_tokens.extend(pair_ids[-window_len:])
+                    overflowing_token_boxes.extend(pair_token_boxes[-window_len:])
+                    pair_ids = pair_ids[:-1]
+                    pair_token_boxes = pair_token_boxes[:-1]
+        elif truncation_strategy == TruncationStrategy.ONLY_FIRST:
+            if len(ids) > num_tokens_to_remove:
+                window_len = min(len(ids), stride + num_tokens_to_remove)
+                overflowing_tokens = ids[-window_len:]
+                overflowing_token_boxes = token_boxes[-window_len:]
+                overflowing_labels = labels[-window_len:]
+                ids = ids[:-num_tokens_to_remove]
+                token_boxes = token_boxes[:-num_tokens_to_remove]
+                labels = labels[:-num_tokens_to_remove]
+            else:
+                logger.error(
+                    f"We need to remove {num_tokens_to_remove} to truncate the input "
+                    f"but the first sequence has a length {len(ids)}. "
+                    f"Please select another truncation strategy than {truncation_strategy}, "
+                    "for instance 'longest_first' or 'only_second'."
+                )
+        elif truncation_strategy == TruncationStrategy.ONLY_SECOND and pair_ids is not None:
+            if len(pair_ids) > num_tokens_to_remove:
+                window_len = min(len(pair_ids), stride + num_tokens_to_remove)
+                overflowing_tokens = pair_ids[-window_len:]
+                overflowing_token_boxes = pair_token_boxes[-window_len:]
+                pair_ids = pair_ids[:-num_tokens_to_remove]
+                pair_token_boxes = pair_token_boxes[:-num_tokens_to_remove]
+            else:
+                logger.error(
+                    f"We need to remove {num_tokens_to_remove} to truncate the input "
+                    f"but the second sequence has a length {len(pair_ids)}. "
+                    f"Please select another truncation strategy than {truncation_strategy}, "
+                    "for instance 'longest_first' or 'only_first'."
+                )
+
+        return (
+            ids,
+            token_boxes,
+            pair_ids,
+            pair_token_boxes,
+            labels if labels_given else None,
+            overflowing_tokens,
+            overflowing_token_boxes,
+            overflowing_labels,
+        )
+
+    def _pad(
+        self,
+        encoded_inputs: Union[dict[str, EncodedInput], BatchEncoding],
+        max_length: Optional[int] = None,
+        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
+        pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[str] = None,
+        return_attention_mask: Optional[bool] = None,
+    ) -> dict:
+        """
+        Pad encoded inputs (on left/right) up to a predefined length, keeping every per-token field aligned.
+
+        Args:
+            encoded_inputs:
+                Dictionary of tokenized inputs (`list[int]`). It is updated in place and also returned.
+            max_length: Length to pad to; replaced by the current input length for `PaddingStrategy.LONGEST`.
+                This method never truncates.
+            padding_strategy: PaddingStrategy to use for padding.
+
+                - PaddingStrategy.LONGEST: Use the current length of the input as `max_length`
+                - PaddingStrategy.MAX_LENGTH: Pad to the max length
+                - PaddingStrategy.DO_NOT_PAD: Do not pad (default)
+            pad_to_multiple_of: (optional) If set, `max_length` is rounded up to a multiple of this value before padding.
+            padding_side (`str`, *optional*):
+                The side on which the model should have padding applied. Should be selected between ['right', 'left'].
+                Default value is picked from the class attribute of the same name.
+            return_attention_mask:
+                (optional) Set to False to avoid returning attention mask (default: set to model specifics)
+
+        Returns:
+            `dict`: `encoded_inputs`, with an `attention_mask` added when requested and every present field padded.
+
+        Raises:
+            `ValueError`: If padding is needed and the padding side is neither `'right'` nor `'left'`.
+        """
+        # Load from model defaults
+        if return_attention_mask is None:
+            return_attention_mask = "attention_mask" in self.model_input_names
+
+        required_input = encoded_inputs[self.model_input_names[0]]
+
+        if padding_strategy == PaddingStrategy.LONGEST:
+            max_length = len(required_input)
+        # Round `max_length` up to the next multiple of `pad_to_multiple_of` when it is not one already.
+        if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
+            max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
+
+        needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length
+
+        # Initialize attention mask if not present.
+        if return_attention_mask and "attention_mask" not in encoded_inputs:
+            encoded_inputs["attention_mask"] = [1] * len(required_input)
+
+        if needs_to_be_padded:
+            difference = max_length - len(required_input)
+            padding_side = padding_side if padding_side is not None else self.padding_side
+            if padding_side == "right":
+                if return_attention_mask:
+                    encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference
+                if "token_type_ids" in encoded_inputs:
+                    encoded_inputs["token_type_ids"] = (
+                        encoded_inputs["token_type_ids"] + [self.pad_token_type_id] * difference
+                    )
+                if "bbox" in encoded_inputs:
+                    encoded_inputs["bbox"] = encoded_inputs["bbox"] + [self.pad_token_box] * difference
+                if "labels" in encoded_inputs:
+                    encoded_inputs["labels"] = encoded_inputs["labels"] + [self.pad_token_label] * difference
+                if "special_tokens_mask" in encoded_inputs:
+                    encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference
+                encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference
+            elif padding_side == "left":
+                if return_attention_mask:
+                    encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
+                if "token_type_ids" in encoded_inputs:
+                    encoded_inputs["token_type_ids"] = [self.pad_token_type_id] * difference + encoded_inputs[
+                        "token_type_ids"
+                    ]
+                if "bbox" in encoded_inputs:
+                    encoded_inputs["bbox"] = [self.pad_token_box] * difference + encoded_inputs["bbox"]
+                if "labels" in encoded_inputs:
+                    encoded_inputs["labels"] = [self.pad_token_label] * difference + encoded_inputs["labels"]
+                if "special_tokens_mask" in encoded_inputs:
+                    encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
+                encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
+            else:
+                raise ValueError("Invalid padding side: " + str(padding_side))
+
+        return encoded_inputs
+
+
+__all__ = ["LayoutXLMTokenizer"]
diff --git a/phivenv/Lib/site-packages/transformers/models/layoutxlm/tokenization_layoutxlm_fast.py b/phivenv/Lib/site-packages/transformers/models/layoutxlm/tokenization_layoutxlm_fast.py
new file mode 100644
index 0000000000000000000000000000000000000000..6710c6c8cb66ed08da2df391c28ad1db2e6cf81d
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/layoutxlm/tokenization_layoutxlm_fast.py
@@ -0,0 +1,815 @@
+# coding=utf-8
+# Copyright 2021 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License
+"""Tokenization classes for LayoutXLM model."""
+
+import os
+from shutil import copyfile
+from typing import Optional, Union
+
+from ...tokenization_utils import AddedToken
+from ...tokenization_utils_base import (
+    BatchEncoding,
+    EncodedInput,
+    PreTokenizedInput,
+    TextInput,
+    TextInputPair,
+    TruncationStrategy,
+)
+from ...tokenization_utils_fast import PreTrainedTokenizerFast
+from ...utils import PaddingStrategy, TensorType, add_end_docstrings, is_sentencepiece_available, logging
+from ..xlm_roberta.tokenization_xlm_roberta_fast import (
+    VOCAB_FILES_NAMES,
+)
+
+
+if is_sentencepiece_available():
+    from .tokenization_layoutxlm import LayoutXLMTokenizer
+else:
+    LayoutXLMTokenizer = None
+
+
+logger = logging.get_logger(__name__)
+
+LAYOUTXLM_ENCODE_KWARGS_DOCSTRING = r"""
+            add_special_tokens (`bool`, *optional*, defaults to `True`):
+                Whether or not to encode the sequences with the special tokens relative to their model.
+            padding (`bool`, `str` or [`~file_utils.PaddingStrategy`], *optional*, defaults to `False`):
+                Activates and controls padding. Accepts the following values:
+
+                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
+                  sequence is provided).
+                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
+                  acceptable input length for the model if that argument is not provided.
+                - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
+                  lengths).
+            truncation (`bool`, `str` or [`~tokenization_utils_base.TruncationStrategy`], *optional*, defaults to `False`):
+                Activates and controls truncation. Accepts the following values:
+
+                - `True` or `'longest_first'`: Truncate to a maximum length specified with the argument `max_length` or
+                  to the maximum acceptable input length for the model if that argument is not provided. This will
+                  truncate token by token, removing a token from the longest sequence in the pair if a pair of
+                  sequences (or a batch of pairs) is provided.
+                - `'only_first'`: Truncate to a maximum length specified with the argument `max_length` or to the
+                  maximum acceptable input length for the model if that argument is not provided. This will only
+                  truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
+                - `'only_second'`: Truncate to a maximum length specified with the argument `max_length` or to the
+                  maximum acceptable input length for the model if that argument is not provided. This will only
+                  truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
+                - `False` or `'do_not_truncate'` (default): No truncation (i.e., can output batch with sequence lengths
+                  greater than the model maximum admissible input size).
+            max_length (`int`, *optional*):
+                Controls the maximum length to use by one of the truncation/padding parameters.
+
+                If left unset or set to `None`, this will use the predefined model maximum length if a maximum length
+                is required by one of the truncation/padding parameters. If the model has no specific maximum input
+                length (like XLNet) truncation/padding to a maximum length will be deactivated.
+            stride (`int`, *optional*, defaults to 0):
+                If set to a number along with `max_length`, the overflowing tokens returned when
+                `return_overflowing_tokens=True` will contain some tokens from the end of the truncated sequence
+                returned to provide some overlap between truncated and overflowing sequences. The value of this
+                argument defines the number of overlapping tokens.
+            pad_to_multiple_of (`int`, *optional*):
+                If set will pad the sequence to a multiple of the provided value. This is especially useful to enable
+                the use of Tensor Cores on NVIDIA hardware with compute capability `>= 7.5` (Volta).
+            return_tensors (`str` or [`~file_utils.TensorType`], *optional*):
+                If set, will return tensors instead of list of python integers. Acceptable values are:
+
+                - `'tf'`: Return TensorFlow `tf.constant` objects.
+                - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                - `'np'`: Return Numpy `np.ndarray` objects.
+            return_token_type_ids (`bool`, *optional*):
+                Whether to return token type IDs. If left to the default, will return the token type IDs according to
+                the specific tokenizer's default, defined by the `return_outputs` attribute.
+
+                [What are token type IDs?](../glossary#token-type-ids)
+            return_attention_mask (`bool`, *optional*):
+                Whether to return the attention mask. If left to the default, will return the attention mask according
+                to the specific tokenizer's default, defined by the `return_outputs` attribute.
+
+                [What are attention masks?](../glossary#attention-mask)
+            return_overflowing_tokens (`bool`, *optional*, defaults to `False`):
+                Whether or not to return overflowing token sequences. If a pair of sequences of input ids (or a batch
+                of pairs) is provided with `truncation_strategy = longest_first` or `True`, an error is raised instead
+                of returning overflowing tokens.
+            return_special_tokens_mask (`bool`, *optional*, defaults to `False`):
+                Whether or not to return special tokens mask information.
+            return_offsets_mapping (`bool`, *optional*, defaults to `False`):
+                Whether or not to return `(char_start, char_end)` for each token.
+
+                This is only available on fast tokenizers inheriting from [`PreTrainedTokenizerFast`], if using
+                Python's tokenizer, this method will raise `NotImplementedError`.
+            return_length  (`bool`, *optional*, defaults to `False`):
+                Whether or not to return the lengths of the encoded inputs.
+            verbose (`bool`, *optional*, defaults to `True`):
+                Whether or not to print more information and warnings.
+            **kwargs: passed to the `self.tokenize()` method
+
+        Return:
+            [`BatchEncoding`]: A [`BatchEncoding`] with the following fields:
+
+            - **input_ids** -- List of token ids to be fed to a model.
+
+              [What are input IDs?](../glossary#input-ids)
+
+            - **bbox** -- List of bounding boxes to be fed to a model.
+
+            - **token_type_ids** -- List of token type ids to be fed to a model (when `return_token_type_ids=True` or
+              if *"token_type_ids"* is in `self.model_input_names`).
+
+              [What are token type IDs?](../glossary#token-type-ids)
+
+            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
+              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names`).
+
+              [What are attention masks?](../glossary#attention-mask)
+
+            - **labels** -- List of labels to be fed to a model. (when `word_labels` is specified).
+            - **overflowing_tokens** -- List of overflowing tokens sequences (when a `max_length` is specified and
+              `return_overflowing_tokens=True`).
+            - **num_truncated_tokens** -- Number of tokens truncated (when a `max_length` is specified and
+              `return_overflowing_tokens=True`).
+            - **special_tokens_mask** -- List of 0s and 1s, with 1 specifying added special tokens and 0 specifying
+              regular sequence tokens (when `add_special_tokens=True` and `return_special_tokens_mask=True`).
+            - **length** -- The length of the inputs (when `return_length=True`).
+"""
+
+
+class LayoutXLMTokenizerFast(PreTrainedTokenizerFast):
+    """
+    Construct a "fast" LayoutXLM tokenizer (backed by HuggingFace's *tokenizers* library). Adapted from
+    [`RobertaTokenizer`] and [`XLNetTokenizer`]. Based on
+    [BPE](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=BPE#models).
+
+    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
+    refer to this superclass for more information regarding those methods.
+
+    Args:
+        vocab_file (`str`):
+            Path to the vocabulary file.
+        bos_token (`str`, *optional*, defaults to `"<s>"`):
+            The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token.
+
+            <Tip>
+
+            When building a sequence using special tokens, this is not the token that is used for the beginning of
+            sequence. The token used is the `cls_token`.
+
+            </Tip>
+
+        eos_token (`str`, *optional*, defaults to `"</s>"`):
+            The end of sequence token.
+
+            <Tip>
+
+            When building a sequence using special tokens, this is not the token that is used for the end of sequence.
+            The token used is the `sep_token`.
+
+            </Tip>
+
+        sep_token (`str`, *optional*, defaults to `"</s>"`):
+            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
+            sequence classification or for a text and a question for question answering. It is also used as the last
+            token of a sequence built with special tokens.
+        cls_token (`str`, *optional*, defaults to `"<s>"`):
+            The classifier token which is used when doing sequence classification (classification of the whole sequence
+            instead of per-token classification). It is the first token of the sequence when built with special tokens.
+        unk_token (`str`, *optional*, defaults to `"<unk>"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        pad_token (`str`, *optional*, defaults to `"<pad>"`):
+            The token used for padding, for example when batching sequences of different lengths.
+        mask_token (`str`, *optional*, defaults to `"<mask>"`):
+            The token used for masking values. This is the token used when training this model with masked language
+            modeling. This is the token which the model will try to predict.
+        cls_token_box (`list[int]`, *optional*, defaults to `[0, 0, 0, 0]`):
+            The bounding box to use for the special [CLS] token.
+        sep_token_box (`list[int]`, *optional*, defaults to `[1000, 1000, 1000, 1000]`):
+            The bounding box to use for the special [SEP] token.
+        pad_token_box (`list[int]`, *optional*, defaults to `[0, 0, 0, 0]`):
+            The bounding box to use for the special [PAD] token.
+        pad_token_label (`int`, *optional*, defaults to -100):
+            The label to use for padding tokens. Defaults to -100, which is the `ignore_index` of PyTorch's
+            CrossEntropyLoss.
+        only_label_first_subword (`bool`, *optional*, defaults to `True`):
+            Whether or not to only label the first subword, in case word labels are provided.
+        additional_special_tokens (`list[str]`, *optional*, defaults to `["<s>NOTUSED", "</s>NOTUSED"]`):
+            Additional special tokens used by the tokenizer.
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    model_input_names = ["input_ids", "attention_mask"]
+    slow_tokenizer_class = LayoutXLMTokenizer
+
+    def __init__(
+        self,
+        vocab_file=None,
+        tokenizer_file=None,
+        bos_token="<s>",
+        eos_token="</s>",
+        sep_token="</s>",
+        cls_token="<s>",
+        unk_token="<unk>",
+        pad_token="<pad>",
+        mask_token="<mask>",
+        cls_token_box=[0, 0, 0, 0],
+        sep_token_box=[1000, 1000, 1000, 1000],
+        pad_token_box=[0, 0, 0, 0],
+        pad_token_label=-100,
+        only_label_first_subword=True,
+        **kwargs,
+    ):
+        """Initialize the fast tokenizer and store the layout-specific box/label settings on the instance."""
+        # Mask token behave like a normal word, i.e. include the space before it
+        mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
+
+        super().__init__(
+            vocab_file,
+            tokenizer_file=tokenizer_file,
+            bos_token=bos_token,
+            eos_token=eos_token,
+            sep_token=sep_token,
+            cls_token=cls_token,
+            unk_token=unk_token,
+            pad_token=pad_token,
+            mask_token=mask_token,
+            cls_token_box=cls_token_box,
+            sep_token_box=sep_token_box,
+            pad_token_box=pad_token_box,
+            pad_token_label=pad_token_label,
+            only_label_first_subword=only_label_first_subword,
+            **kwargs,
+        )
+
+        self.vocab_file = vocab_file
+        # Store copies of the boxes: the signature defaults are shared lists that must not be mutated via an instance.
+        self.cls_token_box = list(cls_token_box) if cls_token_box is not None else None
+        self.sep_token_box = list(sep_token_box) if sep_token_box is not None else None
+        self.pad_token_box = list(pad_token_box) if pad_token_box is not None else None
+        self.pad_token_label = pad_token_label
+        self.only_label_first_subword = only_label_first_subword
+
+    @add_end_docstrings(LAYOUTXLM_ENCODE_KWARGS_DOCSTRING)
+    def __call__(
+        self,
+        text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]],
+        text_pair: Optional[Union[PreTokenizedInput, list[PreTokenizedInput]]] = None,
+        boxes: Optional[Union[list[list[int]], list[list[list[int]]]]] = None,
+        word_labels: Optional[Union[list[int], list[list[int]]]] = None,
+        add_special_tokens: bool = True,
+        padding: Union[bool, str, PaddingStrategy] = False,
+        truncation: Union[bool, str, TruncationStrategy] = None,
+        max_length: Optional[int] = None,
+        stride: int = 0,
+        pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[str] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        return_token_type_ids: Optional[bool] = None,
+        return_attention_mask: Optional[bool] = None,
+        return_overflowing_tokens: bool = False,
+        return_special_tokens_mask: bool = False,
+        return_offsets_mapping: bool = False,
+        return_length: bool = False,
+        verbose: bool = True,
+        **kwargs,
+    ) -> BatchEncoding:
+        """
+        Main method to tokenize and prepare for the model one or several sequence(s) or one or several pair(s) of
+        sequences with word-level normalized bounding boxes and optional labels.
+
+        Args:
+            text (`str`, `list[str]`, `list[list[str]]`):
+                The sequence or batch of sequences to be encoded. Each sequence can be a string, a list of strings
+                (words of a single example or questions of a batch of examples) or a list of list of strings (batch of
+                words).
+            text_pair (`list[str]`, `list[list[str]]`):
+                The sequence or batch of sequences to be encoded. Each sequence should be a list of strings
+                (pretokenized string).
+            boxes (`list[list[int]]`, `list[list[list[int]]]`):
+                Word-level bounding boxes. Each bounding box should be normalized to be on a 0-1000 scale.
+            word_labels (`list[int]`, `list[list[int]]`, *optional*):
+                Word-level integer labels (for token classification tasks such as FUNSD, CORD).
+        """
+
+        # Input type checking for clearer error
+        def _is_valid_text_input(t):
+            if isinstance(t, str):
+                # Strings are fine
+                return True
+            elif isinstance(t, (list, tuple)):
+                # List are fine as long as they are...
+                if len(t) == 0:
+                    # ... empty
+                    return True
+                elif isinstance(t[0], str):
+                    # ... list of strings
+                    return True
+                elif isinstance(t[0], (list, tuple)):
+                    # ... list with an empty list or with a list of strings
+                    return len(t[0]) == 0 or isinstance(t[0][0], str)
+                else:
+                    return False
+            else:
+                return False
+
+        if text_pair is not None:
+            # in case text + text_pair are provided, text = questions, text_pair = words
+            if not _is_valid_text_input(text):
+                raise ValueError("text input must of type `str` (single example) or `list[str]` (batch of examples). ")
+            if not isinstance(text_pair, (list, tuple)):
+                raise ValueError(
+                    "words must of type `list[str]` (single pretokenized example), "
+                    "or `list[list[str]]` (batch of pretokenized examples)."
+                )
+        else:
+            # in case only text is provided => must be words
+            if not isinstance(text, (list, tuple)):
+                raise ValueError(
+                    "Words must of type `list[str]` (single pretokenized example), "
+                    "or `list[list[str]]` (batch of pretokenized examples)."
+                )
+
+        if text_pair is not None:
+            is_batched = isinstance(text, (list, tuple))
+        else:
+            is_batched = isinstance(text, (list, tuple)) and text and isinstance(text[0], (list, tuple))
+
+        words = text if text_pair is None else text_pair
+        if boxes is None:
+            raise ValueError("You must provide corresponding bounding boxes")
+        if is_batched:
+            if len(words) != len(boxes):
+                raise ValueError("You must provide words and boxes for an equal amount of examples")
+            for words_example, boxes_example in zip(words, boxes):
+                if len(words_example) != len(boxes_example):
+                    raise ValueError("You must provide as many words as there are bounding boxes")
+        else:
+            if len(words) != len(boxes):
+                raise ValueError("You must provide as many words as there are bounding boxes")
+
+        if is_batched:
+            if text_pair is not None and len(text) != len(text_pair):
+                raise ValueError(
+                    f"batch length of `text`: {len(text)} does not match batch length of `text_pair`:"
+                    f" {len(text_pair)}."
+                )
+            batch_text_or_text_pairs = list(zip(text, text_pair)) if text_pair is not None else text
+            is_pair = bool(text_pair is not None)
+            return self.batch_encode_plus(
+                batch_text_or_text_pairs=batch_text_or_text_pairs,
+                is_pair=is_pair,
+                boxes=boxes,
+                word_labels=word_labels,
+                add_special_tokens=add_special_tokens,
+                padding=padding,
+                truncation=truncation,
+                max_length=max_length,
+                stride=stride,
+                pad_to_multiple_of=pad_to_multiple_of,
+                padding_side=padding_side,
+                return_tensors=return_tensors,
+                return_token_type_ids=return_token_type_ids,
+                return_attention_mask=return_attention_mask,
+                return_overflowing_tokens=return_overflowing_tokens,
+                return_special_tokens_mask=return_special_tokens_mask,
+                return_offsets_mapping=return_offsets_mapping,
+                return_length=return_length,
+                verbose=verbose,
+                **kwargs,
+            )
+        else:
+            return self.encode_plus(
+                text=text,
+                text_pair=text_pair,
+                boxes=boxes,
+                word_labels=word_labels,
+                add_special_tokens=add_special_tokens,
+                padding=padding,
+                truncation=truncation,
+                max_length=max_length,
+                stride=stride,
+                pad_to_multiple_of=pad_to_multiple_of,
+                padding_side=padding_side,
+                return_tensors=return_tensors,
+                return_token_type_ids=return_token_type_ids,
+                return_attention_mask=return_attention_mask,
+                return_overflowing_tokens=return_overflowing_tokens,
+                return_special_tokens_mask=return_special_tokens_mask,
+                return_offsets_mapping=return_offsets_mapping,
+                return_length=return_length,
+                verbose=verbose,
+                **kwargs,
+            )
+
+    def tokenize(self, text: str, pair: Optional[str] = None, add_special_tokens: bool = False, **kwargs) -> list[str]:
+        batched_input = [(text, pair)] if pair else [text]
+
+        self._tokenizer.encode_special_tokens = kwargs.pop(
+            "split_special_tokens", self._tokenizer.encode_special_tokens
+        )
+
+        encodings = self._tokenizer.encode_batch(
+            batched_input, add_special_tokens=add_special_tokens, is_pretokenized=False, **kwargs
+        )
+
+        return encodings[0].tokens
+
+    def _batch_encode_plus(
+        self,
+        batch_text_or_text_pairs: Union[
+            list[TextInput],
+            list[TextInputPair],
+            list[PreTokenizedInput],
+        ],
+        is_pair: Optional[bool] = None,
+        boxes: Optional[list[list[list[int]]]] = None,
+        word_labels: Optional[list[list[int]]] = None,
+        add_special_tokens: bool = True,
+        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
+        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
+        max_length: Optional[int] = None,
+        stride: int = 0,
+        pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[str] = None,
+        return_tensors: Optional[str] = None,
+        return_token_type_ids: Optional[bool] = None,
+        return_attention_mask: Optional[bool] = None,
+        return_overflowing_tokens: bool = False,
+        return_special_tokens_mask: bool = False,
+        return_offsets_mapping: bool = False,
+        return_length: bool = False,
+        verbose: bool = True,
+        **kwargs,
+    ) -> BatchEncoding:
+        if not isinstance(batch_text_or_text_pairs, list):
+            raise TypeError(f"batch_text_or_text_pairs has to be a list (got {type(batch_text_or_text_pairs)})")
+
+        # Set the truncation and padding strategy and restore the initial configuration
+        self.set_truncation_and_padding(
+            padding_strategy=padding_strategy,
+            truncation_strategy=truncation_strategy,
+            max_length=max_length,
+            stride=stride,
+            pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
+        )
+
+        if is_pair:
+            batch_text_or_text_pairs = [(text.split(), text_pair) for text, text_pair in batch_text_or_text_pairs]
+
+        encodings = self._tokenizer.encode_batch(
+            batch_text_or_text_pairs,
+            add_special_tokens=add_special_tokens,
+            is_pretokenized=True,  # we set this to True as LayoutLMv2 always expects pretokenized inputs
+        )
+
+        # Convert encoding to dict
+        # `Tokens` has type: tuple[
+        #                       list[dict[str, list[list[int]]]] or list[dict[str, 2D-Tensor]],
+        #                       list[EncodingFast]
+        #                    ]
+        # with nested dimensions corresponding to batch, overflows, sequence length
+        tokens_and_encodings = [
+            self._convert_encoding(
+                encoding=encoding,
+                return_token_type_ids=return_token_type_ids,
+                return_attention_mask=return_attention_mask,
+                return_overflowing_tokens=return_overflowing_tokens,
+                return_special_tokens_mask=return_special_tokens_mask,
+                return_offsets_mapping=True
+                if word_labels is not None
+                else return_offsets_mapping,  # we use offsets to create the labels
+                return_length=return_length,
+                verbose=verbose,
+            )
+            for encoding in encodings
+        ]
+
+        # Convert the output to have dict[list] from list[dict] and remove the additional overflows dimension
+        # From (variable) shape (batch, overflows, sequence length) to ~ (batch * overflows, sequence length)
+        # (we say ~ because the number of overflow varies with the example in the batch)
+        #
+        # To match each overflowing sample with the original sample in the batch
+        # we add an overflow_to_sample_mapping array (see below)
+        sanitized_tokens = {}
+        for key in tokens_and_encodings[0][0]:
+            stack = [e for item, _ in tokens_and_encodings for e in item[key]]
+            sanitized_tokens[key] = stack
+        sanitized_encodings = [e for _, item in tokens_and_encodings for e in item]
+
+        # If returning overflowing tokens, we need to return a mapping
+        # from the batch idx to the original sample
+        if return_overflowing_tokens:
+            overflow_to_sample_mapping = []
+            for i, (toks, _) in enumerate(tokens_and_encodings):
+                overflow_to_sample_mapping += [i] * len(toks["input_ids"])
+            sanitized_tokens["overflow_to_sample_mapping"] = overflow_to_sample_mapping
+
+        for input_ids in sanitized_tokens["input_ids"]:
+            self._eventual_warn_about_too_long_sequence(input_ids, max_length, verbose)
+
+        # create the token boxes
+        token_boxes = []
+        for batch_index in range(len(sanitized_tokens["input_ids"])):
+            if return_overflowing_tokens:
+                original_index = sanitized_tokens["overflow_to_sample_mapping"][batch_index]
+            else:
+                original_index = batch_index
+            token_boxes_example = []
+            for id, sequence_id, word_id in zip(
+                sanitized_tokens["input_ids"][batch_index],
+                sanitized_encodings[batch_index].sequence_ids,
+                sanitized_encodings[batch_index].word_ids,
+            ):
+                if word_id is not None:
+                    if is_pair and sequence_id == 0:
+                        token_boxes_example.append(self.pad_token_box)
+                    else:
+                        token_boxes_example.append(boxes[original_index][word_id])
+                else:
+                    if id == self.cls_token_id:
+                        token_boxes_example.append(self.cls_token_box)
+                    elif id == self.sep_token_id:
+                        token_boxes_example.append(self.sep_token_box)
+                    elif id == self.pad_token_id:
+                        token_boxes_example.append(self.pad_token_box)
+                    else:
+                        raise ValueError("Id not recognized")
+            token_boxes.append(token_boxes_example)
+
+        sanitized_tokens["bbox"] = token_boxes
+
+        # optionally, create the labels
+        if word_labels is not None:
+            labels = []
+            for batch_index in range(len(sanitized_tokens["input_ids"])):
+                if return_overflowing_tokens:
+                    original_index = sanitized_tokens["overflow_to_sample_mapping"][batch_index]
+                else:
+                    original_index = batch_index
+                labels_example = []
+                for id, offset, word_id in zip(
+                    sanitized_tokens["input_ids"][batch_index],
+                    sanitized_tokens["offset_mapping"][batch_index],
+                    sanitized_encodings[batch_index].word_ids,
+                ):
+                    if word_id is not None:
+                        if self.only_label_first_subword:
+                            if offset[0] == 0:
+                                # Use the real label id for the first token of the word, and padding ids for the remaining tokens
+                                labels_example.append(word_labels[original_index][word_id])
+                            else:
+                                labels_example.append(self.pad_token_label)
+                        else:
+                            labels_example.append(word_labels[original_index][word_id])
+                    else:
+                        labels_example.append(self.pad_token_label)
+                labels.append(labels_example)
+
+            sanitized_tokens["labels"] = labels
+            # finally, remove offsets if the user didn't want them
+            if not return_offsets_mapping:
+                del sanitized_tokens["offset_mapping"]
+
+        return BatchEncoding(sanitized_tokens, sanitized_encodings, tensor_type=return_tensors)
+
+    def _encode_plus(
+        self,
+        text: Union[TextInput, PreTokenizedInput],
+        text_pair: Optional[PreTokenizedInput] = None,
+        boxes: Optional[list[list[int]]] = None,
+        word_labels: Optional[list[int]] = None,
+        add_special_tokens: bool = True,
+        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
+        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
+        max_length: Optional[int] = None,
+        stride: int = 0,
+        pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[str] = None,
+        return_tensors: Optional[bool] = None,
+        return_token_type_ids: Optional[bool] = None,
+        return_attention_mask: Optional[bool] = None,
+        return_overflowing_tokens: bool = False,
+        return_special_tokens_mask: bool = False,
+        return_offsets_mapping: bool = False,
+        return_length: bool = False,
+        verbose: bool = True,
+        **kwargs,
+    ) -> BatchEncoding:
+        # make it a batched input
+        # 2 options:
+        # 1) only text, in case text must be a list of str
+        # 2) text + text_pair, in which case text = str and text_pair a list of str
+        batched_input = [(text, text_pair)] if text_pair else [text]
+        batched_boxes = [boxes]
+        batched_word_labels = [word_labels] if word_labels is not None else None
+        batched_output = self._batch_encode_plus(
+            batched_input,
+            is_pair=bool(text_pair is not None),
+            boxes=batched_boxes,
+            word_labels=batched_word_labels,
+            add_special_tokens=add_special_tokens,
+            padding_strategy=padding_strategy,
+            truncation_strategy=truncation_strategy,
+            max_length=max_length,
+            stride=stride,
+            pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
+            return_tensors=return_tensors,
+            return_token_type_ids=return_token_type_ids,
+            return_attention_mask=return_attention_mask,
+            return_overflowing_tokens=return_overflowing_tokens,
+            return_special_tokens_mask=return_special_tokens_mask,
+            return_offsets_mapping=return_offsets_mapping,
+            return_length=return_length,
+            verbose=verbose,
+            **kwargs,
+        )
+
+        # Return tensor is None, then we can remove the leading batch axis
+        # Overflowing tokens are returned as a batch of output so we keep them in this case
+        if return_tensors is None and not return_overflowing_tokens:
+            batched_output = BatchEncoding(
+                {
+                    key: value[0] if len(value) > 0 and isinstance(value[0], list) else value
+                    for key, value in batched_output.items()
+                },
+                batched_output.encodings,
+            )
+
+        self._eventual_warn_about_too_long_sequence(batched_output["input_ids"], max_length, verbose)
+
+        return batched_output
+
+    def _pad(
+        self,
+        encoded_inputs: Union[dict[str, EncodedInput], BatchEncoding],
+        max_length: Optional[int] = None,
+        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
+        pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[str] = None,
+        return_attention_mask: Optional[bool] = None,
+    ) -> dict:
+        """
+        Pad encoded inputs (on left/right and up to predefined length or max length in the batch)
+
+        Args:
+            encoded_inputs:
+                Dictionary of tokenized inputs (`list[int]`) or batch of tokenized inputs (`list[list[int]]`).
+            max_length: maximum length of the returned list and optionally padding length (see below).
+                Will truncate by taking into account the special tokens.
+            padding_strategy: PaddingStrategy to use for padding.
+
+                - PaddingStrategy.LONGEST Pad to the longest sequence in the batch
+                - PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
+                - PaddingStrategy.DO_NOT_PAD: Do not pad
+                The tokenizer padding sides are defined in self.padding_side:
+
+                    - 'left': pads on the left of the sequences
+                    - 'right': pads on the right of the sequences
+            pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
+                This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
+                `>= 7.5` (Volta).
+            padding_side (`str`, *optional*):
+                The side on which the model should have padding applied. Should be selected between ['right', 'left'].
+                Default value is picked from the class attribute of the same name.
+            return_attention_mask:
+                (optional) Set to False to avoid returning attention mask (default: set to model specifics)
+        """
+        # Load from model defaults
+        if return_attention_mask is None:
+            return_attention_mask = "attention_mask" in self.model_input_names
+
+        required_input = encoded_inputs[self.model_input_names[0]]
+
+        if padding_strategy == PaddingStrategy.LONGEST:
+            max_length = len(required_input)
+
+        if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
+            max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
+
+        needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length
+
+        # Initialize attention mask if not present.
+        if return_attention_mask and "attention_mask" not in encoded_inputs:
+            encoded_inputs["attention_mask"] = [1] * len(required_input)
+
+        if needs_to_be_padded:
+            difference = max_length - len(required_input)
+            padding_side = padding_side if padding_side is not None else self.padding_side
+            if padding_side == "right":
+                if return_attention_mask:
+                    encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference
+                if "token_type_ids" in encoded_inputs:
+                    encoded_inputs["token_type_ids"] = (
+                        encoded_inputs["token_type_ids"] + [self.pad_token_type_id] * difference
+                    )
+                if "bbox" in encoded_inputs:
+                    encoded_inputs["bbox"] = encoded_inputs["bbox"] + [self.pad_token_box] * difference
+                if "labels" in encoded_inputs:
+                    encoded_inputs["labels"] = encoded_inputs["labels"] + [self.pad_token_label] * difference
+                if "special_tokens_mask" in encoded_inputs:
+                    encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference
+                encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference
+            elif padding_side == "left":
+                if return_attention_mask:
+                    encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
+                if "token_type_ids" in encoded_inputs:
+                    encoded_inputs["token_type_ids"] = [self.pad_token_type_id] * difference + encoded_inputs[
+                        "token_type_ids"
+                    ]
+                if "bbox" in encoded_inputs:
+                    encoded_inputs["bbox"] = [self.pad_token_box] * difference + encoded_inputs["bbox"]
+                if "labels" in encoded_inputs:
+                    encoded_inputs["labels"] = [self.pad_token_label] * difference + encoded_inputs["labels"]
+                if "special_tokens_mask" in encoded_inputs:
+                    encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
+                encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
+            else:
+                raise ValueError("Invalid padding strategy:" + str(padding_side))
+
+        return encoded_inputs
+
+    def build_inputs_with_special_tokens(
+        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
+    ) -> list[int]:
+        """
+        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
+        adding special tokens. An XLM-RoBERTa sequence has the following format:
+
+        - single sequence: `<s> X </s>`
+        - pair of sequences: `<s> A </s></s> B </s>`
+
+        Args:
+            token_ids_0 (`list[int]`):
+                List of IDs to which the special tokens will be added.
+            token_ids_1 (`list[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            `list[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
+        """
+
+        if token_ids_1 is None:
+            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
+        cls = [self.cls_token_id]
+        sep = [self.sep_token_id]
+        return cls + token_ids_0 + sep + sep + token_ids_1 + sep
+
+    def create_token_type_ids_from_sequences(
+        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
+    ) -> list[int]:
+        """
+        Create a mask from the two sequences passed to be used in a sequence-pair classification task. XLM-RoBERTa does
+        not make use of token type ids, therefore a list of zeros is returned.
+
+        Args:
+            token_ids_0 (`list[int]`):
+                List of IDs.
+            token_ids_1 (`list[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            `list[int]`: List of zeros.
+
+        """
+
+        sep = [self.sep_token_id]
+        cls = [self.cls_token_id]
+
+        if token_ids_1 is None:
+            return len(cls + token_ids_0 + sep) * [0]
+        return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
+
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
+        if not self.can_save_slow_tokenizer:
+            raise ValueError(
+                "Your fast tokenizer does not have the necessary information to save the vocabulary for a slow "
+                "tokenizer."
+            )
+
+        if not os.path.isdir(save_directory):
+            logger.error(f"Vocabulary path ({save_directory}) should be a directory.")
+            return
+        out_vocab_file = os.path.join(
+            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+        )
+
+        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
+            copyfile(self.vocab_file, out_vocab_file)
+
+        return (out_vocab_file,)
+
+
+__all__ = ["LayoutXLMTokenizerFast"]
diff --git a/phivenv/Lib/site-packages/transformers/models/led/__init__.py b/phivenv/Lib/site-packages/transformers/models/led/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..786ebd36d7b8cc3c7508d5282889a5dd941c41fe
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/led/__init__.py
@@ -0,0 +1,30 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_led import *
+    from .modeling_led import *
+    from .modeling_tf_led import *
+    from .tokenization_led import *
+    from .tokenization_led_fast import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/phivenv/Lib/site-packages/transformers/models/led/__pycache__/__init__.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/led/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..340f706dbab431960652c7fd4aae1d6abe9e9fce
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/led/__pycache__/__init__.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/led/__pycache__/configuration_led.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/led/__pycache__/configuration_led.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fec5b9ca0d90533df77b86db2b8052edb6d21cc3
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/led/__pycache__/configuration_led.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/led/__pycache__/modeling_led.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/led/__pycache__/modeling_led.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f55377783376a66baa2382bd1f5c51eb10a547f6
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/led/__pycache__/modeling_led.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/led/__pycache__/modeling_tf_led.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/led/__pycache__/modeling_tf_led.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..63b9414d39d6172b53df6b1090aaad3995f23940
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/led/__pycache__/modeling_tf_led.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/led/__pycache__/tokenization_led.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/led/__pycache__/tokenization_led.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..870c43e4b525496ec2c2f6becba2ee1756fa7da9
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/led/__pycache__/tokenization_led.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/led/__pycache__/tokenization_led_fast.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/led/__pycache__/tokenization_led_fast.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..620e015bf951c5fae4a04672cf33d460ca059c63
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/led/__pycache__/tokenization_led_fast.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/led/configuration_led.py b/phivenv/Lib/site-packages/transformers/models/led/configuration_led.py
new file mode 100644
index 0000000000000000000000000000000000000000..57809df4aa881f9a4622ce0d1b110e93ff654c4f
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/led/configuration_led.py
@@ -0,0 +1,165 @@
+# coding=utf-8
+# Copyright 2021 Iz Beltagy, Matthew E. Peters, Arman Cohan and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""LED model configuration"""
+
+from typing import Union
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class LEDConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`LEDModel`]. It is used to instantiate an LED
+    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+    defaults will yield a similar configuration to that of the LED
+    [allenai/led-base-16384](https://huggingface.co/allenai/led-base-16384) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 50265):
+            Vocabulary size of the LED model. Defines the number of different tokens that can be represented by the
+            `input_ids` passed when calling [`LEDModel`] or [`TFLEDModel`].
+        d_model (`int`, *optional*, defaults to 1024):
+            Dimensionality of the layers and the pooler layer.
+        encoder_layers (`int`, *optional*, defaults to 12):
+            Number of encoder layers.
+        decoder_layers (`int`, *optional*, defaults to 12):
+            Number of decoder layers.
+        encoder_attention_heads (`int`, *optional*, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        decoder_attention_heads (`int`, *optional*, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer decoder.
+        decoder_ffn_dim (`int`, *optional*, defaults to 4096):
+            Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
+        encoder_ffn_dim (`int`, *optional*, defaults to 4096):
+            Dimensionality of the "intermediate" (often named feed-forward) layer in encoder.
+        activation_function (`str` or `function`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"silu"` and `"gelu_new"` are supported.
+        dropout (`float`, *optional*, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        activation_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for activations inside the fully connected layer.
+        classifier_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for classifier.
+        max_encoder_position_embeddings (`int`, *optional*, defaults to 16384):
+            The maximum sequence length that the encoder might ever be used with.
+        max_decoder_position_embeddings (`int`, *optional*, defaults to 1024):
+            The maximum sequence length that the decoder might ever be used with.
+        init_std (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        encoder_layerdrop (`float`, *optional*, defaults to 0.0):
+            The LayerDrop probability for the encoder. See the [LayerDrop paper](https://huggingface.co/papers/1909.11556)
+            for more details.
+        decoder_layerdrop (`float`, *optional*, defaults to 0.0):
+            The LayerDrop probability for the decoder. See the [LayerDrop paper](https://huggingface.co/papers/1909.11556)
+            for more details.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models).
+
+    Example:
+
+    ```python
+    >>> from transformers import LEDModel, LEDConfig
+
+    >>> # Initializing a LED allenai/led-base-16384 style configuration
+    >>> configuration = LEDConfig()
+
+    >>> # Initializing a model from the allenai/led-base-16384 style configuration
+    >>> model = LEDModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "led"
+    attribute_map = {
+        "num_attention_heads": "encoder_attention_heads",
+        "hidden_size": "d_model",
+        "attention_probs_dropout_prob": "attention_dropout",
+        "initializer_range": "init_std",
+    }
+
+    def __init__(
+        self,
+        vocab_size=50265,
+        max_encoder_position_embeddings=16384,
+        max_decoder_position_embeddings=1024,
+        encoder_layers=12,
+        encoder_ffn_dim=4096,
+        encoder_attention_heads=16,
+        decoder_layers=12,
+        decoder_ffn_dim=4096,
+        decoder_attention_heads=16,
+        encoder_layerdrop=0.0,
+        decoder_layerdrop=0.0,
+        use_cache=True,
+        is_encoder_decoder=True,
+        activation_function="gelu",
+        d_model=1024,
+        dropout=0.1,
+        attention_dropout=0.0,
+        activation_dropout=0.0,
+        init_std=0.02,
+        decoder_start_token_id=2,
+        classifier_dropout=0.0,
+        pad_token_id=1,
+        bos_token_id=0,
+        eos_token_id=2,
+        attention_window: Union[list[int], int] = 512,  # a single int, or a list of ints (stored unchanged below)
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_encoder_position_embeddings = max_encoder_position_embeddings
+        self.max_decoder_position_embeddings = max_decoder_position_embeddings
+        self.d_model = d_model
+        self.encoder_ffn_dim = encoder_ffn_dim
+        self.encoder_layers = encoder_layers
+        self.encoder_attention_heads = encoder_attention_heads
+        self.decoder_ffn_dim = decoder_ffn_dim
+        self.decoder_layers = decoder_layers
+        self.decoder_attention_heads = decoder_attention_heads
+        self.dropout = dropout
+        self.attention_dropout = attention_dropout
+        self.activation_dropout = activation_dropout
+        self.activation_function = activation_function
+        self.init_std = init_std
+        self.encoder_layerdrop = encoder_layerdrop
+        self.decoder_layerdrop = decoder_layerdrop
+        self.classifier_dropout = classifier_dropout
+        self.use_cache = use_cache
+        self.num_hidden_layers = encoder_layers  # set from `encoder_layers`, not a separate argument
+        self.attention_window = attention_window
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            is_encoder_decoder=is_encoder_decoder,
+            decoder_start_token_id=decoder_start_token_id,
+            **kwargs,
+        )
+
+
+__all__ = ["LEDConfig"]
diff --git a/phivenv/Lib/site-packages/transformers/models/led/modeling_led.py b/phivenv/Lib/site-packages/transformers/models/led/modeling_led.py
new file mode 100644
index 0000000000000000000000000000000000000000..d9944e53f5d2c340e1a3b3f2b5283b2d4ddb9e3c
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/led/modeling_led.py
@@ -0,0 +1,2533 @@
+# coding=utf-8
+# Copyright 2021 Iz Beltagy, Matthew E. Peters, Arman Cohan and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch LED model."""
+
+import math
+import warnings
+from dataclasses import dataclass
+from typing import Optional, Union
+
+import torch
+import torch.utils.checkpoint
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+
+from ...activations import ACT2FN
+from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache
+from ...generation import GenerationMixin
+from ...modeling_attn_mask_utils import _create_4d_causal_attention_mask
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import BaseModelOutputWithPastAndCrossAttentions
+from ...modeling_utils import PreTrainedModel
+from ...utils import ModelOutput, auto_docstring, logging
+from ...utils.deprecation import deprecate_kwarg
+from .configuration_led import LEDConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int):
+    """
+    Shift input ids one token to the right, put `decoder_start_token_id` first and replace -100 with `pad_token_id`.
+    """
+    shifted_input_ids = input_ids.new_zeros(input_ids.shape)
+    shifted_input_ids[:, 1:] = input_ids[:, :-1].clone()
+    shifted_input_ids[:, 0] = decoder_start_token_id
+
+    if pad_token_id is None:
+        raise ValueError("config.pad_token_id has to be defined.")
+    # replace possible -100 values in labels by `pad_token_id`
+    shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)
+
+    return shifted_input_ids
+
+
+def _prepare_4d_attention_mask_inverted(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
+    """
+    Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]` and inverts it (`1 - mask`).
+    """
+    bsz, src_len = mask.size()
+    tgt_len = tgt_len if tgt_len is not None else src_len
+
+    expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
+
+    inverted_mask = 1.0 - expanded_mask
+    expanded_attention_mask = inverted_mask.masked_fill(inverted_mask.bool(), torch.finfo(dtype).min)
+
+    # make sure that global_attn_mask is positive: where mask > 1, inverted_mask < 0 flips the sign of finfo.min
+    expanded_attention_mask = expanded_attention_mask * inverted_mask
+
+    return expanded_attention_mask
+
+
+class LEDLearnedPositionalEmbedding(nn.Embedding):
+    """
+    This module learns positional embeddings up to a fixed maximum size.
+    """
+
+    def __init__(self, num_embeddings: int, embedding_dim: int):
+        super().__init__(num_embeddings, embedding_dim)
+
+    def forward(self, input_ids_shape: torch.Size, past_key_values_length: int = 0):
+        """Embed `seq_len` positions starting at `past_key_values_length`; `input_ids_shape` is [bsz x seqlen]."""
+        bsz, seq_len = input_ids_shape[:2]
+        positions = torch.arange(
+            past_key_values_length, past_key_values_length + seq_len, dtype=torch.long, device=self.weight.device
+        )
+        return super().forward(positions)
+
+
+# Copied from transformers.models.longformer.modeling_longformer.LongformerSelfAttention with Longformer->LEDEncoder
+class LEDEncoderSelfAttention(nn.Module):
+    def __init__(self, config, layer_id):
+        super().__init__()
+        if config.hidden_size % config.num_attention_heads != 0:
+            raise ValueError(
+                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
+                f"heads ({config.num_attention_heads})"
+            )
+        self.num_heads = config.num_attention_heads
+        self.head_dim = int(config.hidden_size / config.num_attention_heads)
+        self.embed_dim = config.hidden_size
+
+        # projections used for the local (sliding window) attention
+        self.query = nn.Linear(config.hidden_size, self.embed_dim)
+        self.key = nn.Linear(config.hidden_size, self.embed_dim)
+        self.value = nn.Linear(config.hidden_size, self.embed_dim)
+
+        # separate projection layers for tokens with global attention
+        self.query_global = nn.Linear(config.hidden_size, self.embed_dim)
+        self.key_global = nn.Linear(config.hidden_size, self.embed_dim)
+        self.value_global = nn.Linear(config.hidden_size, self.embed_dim)
+
+        self.dropout = config.attention_probs_dropout_prob
+
+        self.layer_id = layer_id
+        attention_window = config.attention_window[self.layer_id]  # indexed per layer: must be a sequence here
+        assert attention_window % 2 == 0, (
+            f"`attention_window` for layer {self.layer_id} has to be an even value. Given {attention_window}"
+        )
+        assert attention_window > 0, (
+            f"`attention_window` for layer {self.layer_id} has to be positive. Given {attention_window}"
+        )
+
+        self.one_sided_attn_window_size = attention_window // 2  # half of the (even) window
+
+        self.config = config
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask=None,
+        layer_head_mask=None,
+        is_index_masked=None,
+        is_index_global_attn=None,
+        is_global_attn=None,
+        output_attentions=False,
+    ):
+        """
+        [`LEDEncoderSelfAttention`] expects *len(hidden_states)* to be multiple of *attention_window*. Padding to
+        *attention_window* happens in [`LEDEncoderModel.forward`] to avoid redoing the padding on each layer.
+
+        The *attention_mask* is changed in [`LEDEncoderModel.forward`] from 0, 1, 2 to:
+
+            - -10000: no attention
+            - 0: local attention
+            - +10000: global attention
+        """
+        hidden_states = hidden_states.transpose(0, 1)  # -> (seq_len, batch_size, embed_dim), see unpacking below
+
+        # project hidden states
+        query_vectors = self.query(hidden_states)
+        key_vectors = self.key(hidden_states)
+        value_vectors = self.value(hidden_states)
+
+        seq_len, batch_size, embed_dim = hidden_states.size()
+        assert embed_dim == self.embed_dim, (
+            f"hidden_states should have embed_dim = {self.embed_dim}, but has {embed_dim}"
+        )
+
+        # scale query by 1/sqrt(head_dim) (in place)
+        query_vectors /= math.sqrt(self.head_dim)
+
+        query_vectors = query_vectors.view(seq_len, batch_size, self.num_heads, self.head_dim).transpose(0, 1)
+        key_vectors = key_vectors.view(seq_len, batch_size, self.num_heads, self.head_dim).transpose(0, 1)
+
+        attn_scores = self._sliding_chunks_query_key_matmul(
+            query_vectors, key_vectors, self.one_sided_attn_window_size
+        )
+
+        # values to pad for attention probs
+        remove_from_windowed_attention_mask = (attention_mask != 0)[:, :, None, None]
+
+        # cast to the query dtype, then replace 1's with the most negative finite value of that dtype
+        float_mask = remove_from_windowed_attention_mask.type_as(query_vectors).masked_fill(
+            remove_from_windowed_attention_mask, torch.finfo(query_vectors.dtype).min
+        )
+        # diagonal mask: zeros for positions that stay, very negative values in place of padding
+        diagonal_mask = self._sliding_chunks_query_key_matmul(
+            float_mask.new_ones(size=float_mask.size()), float_mask, self.one_sided_attn_window_size
+        )
+
+        # pad local attention probs
+        attn_scores += diagonal_mask
+
+        assert list(attn_scores.size()) == [
+            batch_size,
+            seq_len,
+            self.num_heads,
+            self.one_sided_attn_window_size * 2 + 1,
+        ], (
+            f"local_attn_probs should be of size ({batch_size}, {seq_len}, {self.num_heads},"
+            f" {self.one_sided_attn_window_size * 2 + 1}), but is of size {attn_scores.size()}"
+        )
+
+        # compute local attention probs from global attention keys and concat over window dim
+        if is_global_attn:
+            # compute global attn indices required throughout forward fn
+            (
+                max_num_global_attn_indices,
+                is_index_global_attn_nonzero,
+                is_local_index_global_attn_nonzero,
+                is_local_index_no_global_attn_nonzero,
+            ) = self._get_global_attn_indices(is_index_global_attn)
+            # calculate global attn probs from global key
+
+            global_key_attn_scores = self._concat_with_global_key_attn_probs(
+                query_vectors=query_vectors,
+                key_vectors=key_vectors,
+                max_num_global_attn_indices=max_num_global_attn_indices,
+                is_index_global_attn_nonzero=is_index_global_attn_nonzero,
+                is_local_index_global_attn_nonzero=is_local_index_global_attn_nonzero,
+                is_local_index_no_global_attn_nonzero=is_local_index_no_global_attn_nonzero,
+            )
+            # concat to local_attn_probs
+            # (batch_size, seq_len, num_heads, extra attention count + 2*window+1)
+            attn_scores = torch.cat((global_key_attn_scores, attn_scores), dim=-1)
+
+            # free memory
+            del global_key_attn_scores
+
+        attn_probs = nn.functional.softmax(
+            attn_scores, dim=-1, dtype=torch.float32
+        )  # use fp32 for numerical stability
+
+        if layer_head_mask is not None:
+            assert layer_head_mask.size() == (self.num_heads,), (
+                f"Head mask for a single layer should be of size {(self.num_heads,)}, but is {layer_head_mask.size()}"
+            )
+            attn_probs = layer_head_mask.view(1, 1, -1, 1) * attn_probs
+
+        # softmax sometimes inserts NaN if all positions are masked, replace them with 0
+        attn_probs = torch.masked_fill(attn_probs, is_index_masked[:, :, None, None], 0.0)
+        attn_probs = attn_probs.type_as(attn_scores)
+
+        # free memory
+        del attn_scores
+
+        # apply dropout
+        attn_probs = nn.functional.dropout(attn_probs, p=self.dropout, training=self.training)
+
+        value_vectors = value_vectors.view(seq_len, batch_size, self.num_heads, self.head_dim).transpose(0, 1)
+
+        # compute local attention output with global attention value and add
+        if is_global_attn:
+            # compute sum of global and local attn
+            attn_output = self._compute_attn_output_with_global_indices(
+                value_vectors=value_vectors,
+                attn_probs=attn_probs,
+                max_num_global_attn_indices=max_num_global_attn_indices,
+                is_index_global_attn_nonzero=is_index_global_attn_nonzero,
+                is_local_index_global_attn_nonzero=is_local_index_global_attn_nonzero,
+            )
+        else:
+            # compute local attn only
+            attn_output = self._sliding_chunks_matmul_attn_probs_value(
+                attn_probs, value_vectors, self.one_sided_attn_window_size
+            )
+
+        assert attn_output.size() == (batch_size, seq_len, self.num_heads, self.head_dim), "Unexpected size"
+        attn_output = attn_output.transpose(0, 1).reshape(seq_len, batch_size, embed_dim).contiguous()
+
+        # compute value for global attention and overwrite to attention output
+        # TODO: remove the redundant computation
+        if is_global_attn:
+            global_attn_output, global_attn_probs = self._compute_global_attn_output_from_hidden(
+                hidden_states=hidden_states,
+                max_num_global_attn_indices=max_num_global_attn_indices,
+                layer_head_mask=layer_head_mask,
+                is_local_index_global_attn_nonzero=is_local_index_global_attn_nonzero,
+                is_index_global_attn_nonzero=is_index_global_attn_nonzero,
+                is_local_index_no_global_attn_nonzero=is_local_index_no_global_attn_nonzero,
+                is_index_masked=is_index_masked,
+            )
+
+            # get only non zero global attn output
+            nonzero_global_attn_output = global_attn_output[
+                is_local_index_global_attn_nonzero[0], :, is_local_index_global_attn_nonzero[1]
+            ]
+
+            # overwrite values with global attention
+            attn_output[is_index_global_attn_nonzero[::-1]] = nonzero_global_attn_output.view(  # index tuple reversed: attn_output is (seq_len, batch_size, embed_dim) here
+                len(is_local_index_global_attn_nonzero[0]), -1
+            )
+            # The attention weights for tokens with global attention are
+            # just filler values, they were never used to compute the output.
+            # Fill with 0 now, the correct values are in 'global_attn_probs'.
+            attn_probs[is_index_global_attn_nonzero] = 0
+
+        outputs = (attn_output.transpose(0, 1),)
+
+        if output_attentions:
+            outputs += (attn_probs,)
+
+        return outputs + (global_attn_probs,) if (is_global_attn and output_attentions) else outputs
+
+    @staticmethod
+    def _pad_and_transpose_last_two_dims(hidden_states_padded, padding):
+        """pads with `padding`, then swaps the sizes of the last two dims via `view` (a reshape, not a transpose)"""
+        hidden_states_padded = nn.functional.pad(
+            hidden_states_padded, padding
+        )  # padding value is not important because it will be overwritten
+        hidden_states_padded = hidden_states_padded.view(
+            *hidden_states_padded.size()[:-2], hidden_states_padded.size(-1), hidden_states_padded.size(-2)
+        )
+        return hidden_states_padded
+
+    @staticmethod
+    def _pad_and_diagonalize(chunked_hidden_states):
+        """
+        shift every row 1 step right, converting columns into diagonals.
+
+        Example:
+
+        ```python
+        chunked_hidden_states: [
+            0.4983,
+            2.6918,
+            -0.0071,
+            1.0492,
+            -1.8348,
+            0.7672,
+            0.2986,
+            0.0285,
+            -0.7584,
+            0.4206,
+            -0.0405,
+            0.1599,
+            2.0514,
+            -1.1600,
+            0.5372,
+            0.2629,
+        ]
+        window_overlap = num_rows = 4
+        ```
+
+                     (pad & diagonalize) => [ 0.4983, 2.6918, -0.0071, 1.0492, 0.0000, 0.0000, 0.0000
+                       0.0000, -1.8348, 0.7672, 0.2986, 0.0285, 0.0000, 0.0000 0.0000, 0.0000, -0.7584, 0.4206,
+                       -0.0405, 0.1599, 0.0000 0.0000, 0.0000, 0.0000, 2.0514, -1.1600, 0.5372, 0.2629 ]
+        """
+        total_num_heads, num_chunks, window_overlap, hidden_dim = chunked_hidden_states.size()
+        chunked_hidden_states = nn.functional.pad(
+            chunked_hidden_states, (0, window_overlap + 1)
+        )  # total_num_heads x num_chunks x window_overlap x (hidden_dim+window_overlap+1). Padding value is not important because it'll be overwritten
+        chunked_hidden_states = chunked_hidden_states.view(
+            total_num_heads, num_chunks, -1
+        )  # total_num_heads x num_chunks x window_overlap*(hidden_dim+window_overlap+1)
+        chunked_hidden_states = chunked_hidden_states[
+            :, :, :-window_overlap
+        ]  # total_num_heads x num_chunks x window_overlap*(hidden_dim+window_overlap)
+        chunked_hidden_states = chunked_hidden_states.view(
+            total_num_heads, num_chunks, window_overlap, window_overlap + hidden_dim
+        )
+        chunked_hidden_states = chunked_hidden_states[:, :, :, :-1]
+        return chunked_hidden_states
+
+    @staticmethod
+    def _chunk(hidden_states, window_overlap, onnx_export: bool = False):
+        """convert into overlapping chunks. Chunk size = 2w, overlap size = w"""
+        if not onnx_export:
+            # non-overlapping chunks of size = 2w
+            hidden_states = hidden_states.view(
+                hidden_states.size(0),
+                torch.div(hidden_states.size(1), (window_overlap * 2), rounding_mode="trunc"),
+                window_overlap * 2,
+                hidden_states.size(2),
+            )
+            # use `as_strided` to make the chunks overlap with an overlap size = window_overlap
+            chunk_size = list(hidden_states.size())
+            chunk_size[1] = chunk_size[1] * 2 - 1
+
+            chunk_stride = list(hidden_states.stride())
+            chunk_stride[1] = chunk_stride[1] // 2
+            return hidden_states.as_strided(size=chunk_size, stride=chunk_stride)
+
+        # When exporting to ONNX, use this separate logic
+        # have to use slow implementation since as_strided, unfold and 2d-tensor indexing aren't supported (yet) in ONNX export
+
+        # TODO replace this with
+        # > return hidden_states.unfold(dimension=1, size=window_overlap * 2, step=window_overlap).transpose(2, 3)
+        # once `unfold` is supported
+        # the case hidden_states.size(1) == window_overlap * 2 can also simply return hidden_states.unsqueeze(1), but that's control flow
+
+        chunk_size = [
+            hidden_states.size(0),
+            torch.div(hidden_states.size(1), window_overlap, rounding_mode="trunc") - 1,
+            window_overlap * 2,
+            hidden_states.size(2),
+        ]
+
+        overlapping_chunks = torch.empty(chunk_size, device=hidden_states.device)  # NOTE(review): no dtype given, so torch's default dtype is used rather than hidden_states.dtype - confirm intended
+        for chunk in range(chunk_size[1]):
+            overlapping_chunks[:, chunk, :, :] = hidden_states[
+                :, chunk * window_overlap : chunk * window_overlap + 2 * window_overlap, :
+            ]
+        return overlapping_chunks
+
+    @staticmethod
+    def _mask_invalid_locations(input_tensor, affected_seq_len) -> torch.Tensor:  # NOTE: fills input_tensor in place; there is no return statement despite the annotation
+        beginning_mask_2d = input_tensor.new_ones(affected_seq_len, affected_seq_len + 1).tril().flip(dims=[0])
+        beginning_mask = beginning_mask_2d[None, :, None, :]
+        ending_mask = beginning_mask.flip(dims=(1, 3))  # same mask mirrored along dims 1 and 3, applied to the tail slice below
+        beginning_input = input_tensor[:, :affected_seq_len, :, : affected_seq_len + 1]
+        beginning_mask = beginning_mask.expand(beginning_input.size())
+        input_tensor[:, :affected_seq_len, :, : affected_seq_len + 1] = torch.full_like(
+            beginning_input, -float("inf")
+        ).where(beginning_mask.bool(), beginning_input)  # -inf where the mask is set, original value elsewhere
+        ending_input = input_tensor[:, -affected_seq_len:, :, -(affected_seq_len + 1) :]
+        ending_mask = ending_mask.expand(ending_input.size())
+        input_tensor[:, -affected_seq_len:, :, -(affected_seq_len + 1) :] = torch.full_like(
+            ending_input, -float("inf")
+        ).where(ending_mask.bool(), ending_input)  # -inf where the mask is set, original value elsewhere
+
+    def _sliding_chunks_query_key_matmul(self, query: torch.Tensor, key: torch.Tensor, window_overlap: int):
+        """
+        Matrix multiplication of query and key tensors with a sliding window attention pattern. This implementation
+        splits the input into overlapping chunks of size 2w (e.g. 512 for pretrained LEDEncoder) with an
+        overlap of size window_overlap. Returns (batch_size, seq_len, num_heads, 2 * window_overlap + 1).
+        """
+        batch_size, seq_len, num_heads, head_dim = query.size()
+        assert seq_len % (window_overlap * 2) == 0, (
+            f"Sequence length should be multiple of {window_overlap * 2}. Given {seq_len}"
+        )
+        assert query.size() == key.size()
+
+        chunks_count = torch.div(seq_len, window_overlap, rounding_mode="trunc") - 1
+
+        # group batch_size and num_heads dimensions into one, then chunk seq_len into chunks of size window_overlap * 2
+        query = query.transpose(1, 2).reshape(batch_size * num_heads, seq_len, head_dim)
+        key = key.transpose(1, 2).reshape(batch_size * num_heads, seq_len, head_dim)
+
+        query = self._chunk(query, window_overlap, getattr(self.config, "onnx_export", False))
+        key = self._chunk(key, window_overlap, getattr(self.config, "onnx_export", False))
+
+        # matrix multiplication
+        # bcxd: batch_size * num_heads x chunks x 2window_overlap x head_dim
+        # bcyd: batch_size * num_heads x chunks x 2window_overlap x head_dim
+        # bcxy: batch_size * num_heads x chunks x 2window_overlap x 2window_overlap
+        diagonal_chunked_attention_scores = torch.einsum("bcxd,bcyd->bcxy", (query, key))  # multiply
+
+        # convert diagonals into columns
+        diagonal_chunked_attention_scores = self._pad_and_transpose_last_two_dims(
+            diagonal_chunked_attention_scores, padding=(0, 0, 0, 1)
+        )
+
+        # allocate space for the overall attention matrix where the chunks are combined. The last dimension
+        # has (window_overlap * 2 + 1) columns. The first (window_overlap) columns are the window_overlap lower triangles (attention from a word to
+        # window_overlap previous words). The following column is attention score from each word to itself, then
+        # followed by window_overlap columns for the upper triangle.
+
+        diagonal_attention_scores = diagonal_chunked_attention_scores.new_zeros(
+            (batch_size * num_heads, chunks_count + 1, window_overlap, window_overlap * 2 + 1)
+        )
+
+        # copy parts from diagonal_chunked_attention_scores into the combined matrix of attentions
+        # - copying the main diagonal and the upper triangle
+        diagonal_attention_scores[:, :-1, :, window_overlap:] = diagonal_chunked_attention_scores[
+            :, :, :window_overlap, : window_overlap + 1
+        ]
+        diagonal_attention_scores[:, -1, :, window_overlap:] = diagonal_chunked_attention_scores[
+            :, -1, window_overlap:, : window_overlap + 1
+        ]
+        # - copying the lower triangle
+        diagonal_attention_scores[:, 1:, :, :window_overlap] = diagonal_chunked_attention_scores[
+            :, :, -(window_overlap + 1) : -1, window_overlap + 1 :
+        ]
+
+        diagonal_attention_scores[:, 0, 1:window_overlap, 1:window_overlap] = diagonal_chunked_attention_scores[
+            :, 0, : window_overlap - 1, 1 - window_overlap :
+        ]
+
+        # separate batch_size and num_heads dimensions again
+        diagonal_attention_scores = diagonal_attention_scores.view(
+            batch_size, num_heads, seq_len, 2 * window_overlap + 1
+        ).transpose(2, 1)
+
+        self._mask_invalid_locations(diagonal_attention_scores, window_overlap)  # modifies the tensor in place
+        return diagonal_attention_scores
+
+    def _sliding_chunks_matmul_attn_probs_value(
+        self, attn_probs: torch.Tensor, value: torch.Tensor, window_overlap: int
+    ):
+        """
+        Same as _sliding_chunks_query_key_matmul but for attn_probs and value tensors. Returned tensor has the same
+        shape as `value`: (batch_size, seq_len, num_heads, head_dim)
+        """
+        batch_size, seq_len, num_heads, head_dim = value.size()
+
+        assert seq_len % (window_overlap * 2) == 0
+        assert attn_probs.size()[:3] == value.size()[:3]
+        assert attn_probs.size(3) == 2 * window_overlap + 1
+        chunks_count = torch.div(seq_len, window_overlap, rounding_mode="trunc") - 1
+        # group batch_size and num_heads dimensions into one, then chunk seq_len into chunks of size window overlap
+
+        chunked_attn_probs = attn_probs.transpose(1, 2).reshape(
+            batch_size * num_heads,
+            torch.div(seq_len, window_overlap, rounding_mode="trunc"),
+            window_overlap,
+            2 * window_overlap + 1,
+        )
+
+        # group batch_size and num_heads dimensions into one
+        value = value.transpose(1, 2).reshape(batch_size * num_heads, seq_len, head_dim)
+
+        # pad seq_len with w at the beginning of the sequence and another window overlap at the end
+        padded_value = nn.functional.pad(value, (0, 0, window_overlap, window_overlap), value=-1)
+
+        # chunk padded_value into chunks of size 3 window overlap and an overlap of size window overlap
+        chunked_value_size = (batch_size * num_heads, chunks_count + 1, 3 * window_overlap, head_dim)
+        chunked_value_stride = padded_value.stride()
+        chunked_value_stride = (
+            chunked_value_stride[0],
+            window_overlap * chunked_value_stride[1],
+            chunked_value_stride[1],
+            chunked_value_stride[2],
+        )
+        chunked_value = padded_value.as_strided(size=chunked_value_size, stride=chunked_value_stride)
+
+        chunked_attn_probs = self._pad_and_diagonalize(chunked_attn_probs)
+
+        context = torch.einsum("bcwd,bcdh->bcwh", (chunked_attn_probs, chunked_value))
+        return context.view(batch_size, num_heads, seq_len, head_dim).transpose(1, 2)
+
+    @staticmethod
+    def _get_global_attn_indices(is_index_global_attn):
+        """
+        Derive the index bookkeeping for global attention from a boolean `(batch, seq_len)`-style mask.
+
+        Returns a 4-tuple:
+            - the largest number of global-attention positions found in any row of the mask (0-dim tensor),
+            - the `nonzero(as_tuple=True)` indices of the mask itself,
+            - the indices of the used slots in a `(rows, largest_count)` table,
+            - the indices of the unused (padding) slots in that same table.
+        """
+        # how many positions are flagged in each row, and the maximum over all rows
+        per_row_count = is_index_global_attn.long().sum(dim=1)
+        largest_count = per_row_count.max()
+
+        # slot j of row i is "used" iff j < per_row_count[i]
+        slot_ids = torch.arange(largest_count, device=is_index_global_attn.device)
+        slot_is_used = slot_ids < per_row_count.unsqueeze(dim=-1)
+
+        flagged_positions = is_index_global_attn.nonzero(as_tuple=True)
+        used_slots = slot_is_used.nonzero(as_tuple=True)
+        unused_slots = (slot_is_used == 0).nonzero(as_tuple=True)
+
+        return largest_count, flagged_positions, used_slots, unused_slots
+
+    def _concat_with_global_key_attn_probs(
+        self,
+        key_vectors,
+        query_vectors,
+        max_num_global_attn_indices,
+        is_index_global_attn_nonzero,
+        is_local_index_global_attn_nonzero,
+        is_local_index_no_global_attn_nonzero,
+    ):
+        """
+        Compute the dot products of every query with the keys at global-attention positions.
+
+        The global keys of each batch entry are gathered into a zero-initialised tensor with
+        `max_num_global_attn_indices` slots; slots that stay unused (padding) are overwritten with the most negative
+        finite value of the dtype in the result.
+
+        Returns:
+            Tensor of shape `(batch_size, seq_len, num_heads, max_num_global_attn_indices)`.
+        """
+        batch_size = key_vectors.shape[0]
+
+        # create only global key vectors
+        key_vectors_only_global = key_vectors.new_zeros(
+            batch_size, max_num_global_attn_indices, self.num_heads, self.head_dim
+        )
+
+        # copy the keys found at global positions into the used slots; unused slots keep their zeros
+        key_vectors_only_global[is_local_index_global_attn_nonzero] = key_vectors[is_index_global_attn_nonzero]
+
+        # (batch_size, seq_len, num_heads, max_num_global_attn_indices)
+        attn_probs_from_global_key = torch.einsum("blhd,bshd->blhs", (query_vectors, key_vectors_only_global))
+
+        # need to transpose since ONNX export only supports consecutive indexing: https://pytorch.org/docs/stable/onnx.html#writes-sets
+        # The masking write goes through the transposed view, i.e. it modifies the einsum result in place; the
+        # second transpose only restores the original dim order.
+        attn_probs_from_global_key = attn_probs_from_global_key.transpose(1, 3)
+        attn_probs_from_global_key[
+            is_local_index_no_global_attn_nonzero[0], is_local_index_no_global_attn_nonzero[1], :, :
+        ] = torch.finfo(attn_probs_from_global_key.dtype).min
+        attn_probs_from_global_key = attn_probs_from_global_key.transpose(1, 3)
+
+        return attn_probs_from_global_key
+
+    def _compute_attn_output_with_global_indices(
+        self,
+        value_vectors,
+        attn_probs,
+        max_num_global_attn_indices,
+        is_index_global_attn_nonzero,
+        is_local_index_global_attn_nonzero,
+    ):
+        """
+        Combine the attention output of the global-key columns with that of the sliding-window columns.
+
+        The first `max_num_global_attn_indices` entries of the last dim of `attn_probs` are used as probabilities over
+        the global keys; the remaining entries are handed to `_sliding_chunks_matmul_attn_probs_value`.
+
+        Args:
+            value_vectors: value tensor; it is passed on as `value` to `_sliding_chunks_matmul_attn_probs_value`,
+                which unpacks it as `(batch_size, seq_len, num_heads, head_dim)`.
+            attn_probs: attention probabilities whose last dim holds the global columns first, then the window
+                columns.
+            max_num_global_attn_indices: number of global columns at the start of the last dim of `attn_probs`.
+            is_index_global_attn_nonzero: indices into `value_vectors` selecting the global positions.
+            is_local_index_global_attn_nonzero: indices of the used slots in the gathered global-value tensor.
+
+        Returns:
+            Sum of the global-only and the sliding-window attention outputs.
+        """
+        batch_size = attn_probs.shape[0]
+
+        # cut local attn probs to global only
+        attn_probs_only_global = attn_probs.narrow(-1, 0, max_num_global_attn_indices)
+        # get value vectors for global only
+        value_vectors_only_global = value_vectors.new_zeros(
+            batch_size, max_num_global_attn_indices, self.num_heads, self.head_dim
+        )
+        value_vectors_only_global[is_local_index_global_attn_nonzero] = value_vectors[is_index_global_attn_nonzero]
+
+        # use `matmul` because `einsum` crashes sometimes with fp16
+        # attn = torch.einsum('blhs,bshd->blhd', (selected_attn_probs, selected_v))
+        # compute attn output only global
+        attn_output_only_global = torch.matmul(
+            attn_probs_only_global.transpose(1, 2).clone(), value_vectors_only_global.transpose(1, 2).clone()
+        ).transpose(1, 2)
+
+        # keep only the remaining (non-global, sliding-window) columns of attn_probs
+        attn_probs_without_global = attn_probs.narrow(
+            -1, max_num_global_attn_indices, attn_probs.size(-1) - max_num_global_attn_indices
+        ).contiguous()
+
+        # compute attn output for the sliding-window columns (i.e. without the global keys)
+        attn_output_without_global = self._sliding_chunks_matmul_attn_probs_value(
+            attn_probs_without_global, value_vectors, self.one_sided_attn_window_size
+        )
+        return attn_output_only_global + attn_output_without_global
+
+    def _compute_global_attn_output_from_hidden(
+        self,
+        hidden_states,
+        max_num_global_attn_indices,
+        layer_head_mask,
+        is_local_index_global_attn_nonzero,
+        is_index_global_attn_nonzero,
+        is_local_index_no_global_attn_nonzero,
+        is_index_masked,
+    ):
+        """
+        Compute full attention from the global-attention tokens (as queries) to every token of the sequence.
+
+        Args:
+            hidden_states: tensor whose first two dims are `(seq_len, batch_size)`; the last dim is `self.embed_dim`.
+            max_num_global_attn_indices: number of global-query slots allocated per batch entry.
+            layer_head_mask: optional tensor of size `(self.num_heads,)` multiplied onto the attention
+                probabilities of each head.
+            is_local_index_global_attn_nonzero: indices of the used slots in the gathered global tensor.
+            is_index_global_attn_nonzero: indices of the global positions in the input.
+            is_local_index_no_global_attn_nonzero: indices of the unused (padding) slots.
+            is_index_masked: boolean mask, broadcast over heads and queries, of key positions to exclude.
+
+        Returns:
+            `(global_attn_output, global_attn_probs)` with shapes
+            `(batch_size, num_heads, max_num_global_attn_indices, head_dim)` and
+            `(batch_size, num_heads, max_num_global_attn_indices, seq_len)`.
+        """
+        seq_len, batch_size = hidden_states.shape[:2]
+
+        # prepare global hidden states
+        # NOTE(review): the index tuples appear to be ordered (batch, position); `[::-1]` reverses them so they can
+        # index these (position, batch, ...) tensors — confirm against the caller.
+        global_attn_hidden_states = hidden_states.new_zeros(max_num_global_attn_indices, batch_size, self.embed_dim)
+        global_attn_hidden_states[is_local_index_global_attn_nonzero[::-1]] = hidden_states[
+            is_index_global_attn_nonzero[::-1]
+        ]
+
+        # global key, query, value
+        # queries come from the gathered global tokens only; keys and values come from the whole sequence
+        global_query_vectors_only_global = self.query_global(global_attn_hidden_states)
+        global_key_vectors = self.key_global(hidden_states)
+        global_value_vectors = self.value_global(hidden_states)
+
+        # normalize
+        global_query_vectors_only_global /= math.sqrt(self.head_dim)
+
+        # reshape
+        global_query_vectors_only_global = (
+            global_query_vectors_only_global.contiguous()
+            .view(max_num_global_attn_indices, batch_size * self.num_heads, self.head_dim)
+            .transpose(0, 1)
+        )  # (batch_size * self.num_heads, max_num_global_attn_indices, head_dim)
+        global_key_vectors = (
+            global_key_vectors.contiguous().view(-1, batch_size * self.num_heads, self.head_dim).transpose(0, 1)
+        )  # (batch_size * self.num_heads, seq_len, head_dim)
+        global_value_vectors = (
+            global_value_vectors.contiguous().view(-1, batch_size * self.num_heads, self.head_dim).transpose(0, 1)
+        )  # (batch_size * self.num_heads, seq_len, head_dim)
+
+        # compute attn scores
+        global_attn_scores = torch.bmm(global_query_vectors_only_global, global_key_vectors.transpose(1, 2))
+
+        assert list(global_attn_scores.size()) == [
+            batch_size * self.num_heads,
+            max_num_global_attn_indices,
+            seq_len,
+        ], (
+            "global_attn_scores have the wrong size. Size should be"
+            f" {(batch_size * self.num_heads, max_num_global_attn_indices, seq_len)}, but is"
+            f" {global_attn_scores.size()}."
+        )
+
+        # split the fused batch/head dim again so that batch entries and slots can be indexed separately
+        global_attn_scores = global_attn_scores.view(batch_size, self.num_heads, max_num_global_attn_indices, seq_len)
+
+        # need to transpose since ONNX export only supports consecutive indexing: https://pytorch.org/docs/stable/onnx.html#writes-sets
+        # rows belonging to unused (padding) query slots are overwritten in place, through the transposed view,
+        # with the most negative finite value of the dtype
+        global_attn_scores = global_attn_scores.transpose(1, 2)
+        global_attn_scores[
+            is_local_index_no_global_attn_nonzero[0], is_local_index_no_global_attn_nonzero[1], :, :
+        ] = torch.finfo(global_attn_scores.dtype).min
+        global_attn_scores = global_attn_scores.transpose(1, 2)
+
+        # exclude masked key positions for every head and every query slot
+        global_attn_scores = global_attn_scores.masked_fill(
+            is_index_masked[:, None, None, :],
+            torch.finfo(global_attn_scores.dtype).min,
+        )
+
+        global_attn_scores = global_attn_scores.view(batch_size * self.num_heads, max_num_global_attn_indices, seq_len)
+
+        # compute global attn probs
+        global_attn_probs_float = nn.functional.softmax(
+            global_attn_scores, dim=-1, dtype=torch.float32
+        )  # use fp32 for numerical stability
+
+        # apply layer head masking
+        if layer_head_mask is not None:
+            assert layer_head_mask.size() == (self.num_heads,), (
+                f"Head mask for a single layer should be of size {(self.num_heads,)}, but is {layer_head_mask.size()}"
+            )
+            global_attn_probs_float = layer_head_mask.view(1, -1, 1, 1) * global_attn_probs_float.view(
+                batch_size, self.num_heads, max_num_global_attn_indices, seq_len
+            )
+            global_attn_probs_float = global_attn_probs_float.view(
+                batch_size * self.num_heads, max_num_global_attn_indices, seq_len
+            )
+
+        # cast back to the dtype of the scores before applying dropout
+        global_attn_probs = nn.functional.dropout(
+            global_attn_probs_float.type_as(global_attn_scores), p=self.dropout, training=self.training
+        )
+
+        # global attn output
+        global_attn_output = torch.bmm(global_attn_probs, global_value_vectors)
+
+        assert list(global_attn_output.size()) == [
+            batch_size * self.num_heads,
+            max_num_global_attn_indices,
+            self.head_dim,
+        ], (
+            "global_attn_output tensor has the wrong size. Size should be"
+            f" {(batch_size * self.num_heads, max_num_global_attn_indices, self.head_dim)}, but is"
+            f" {global_attn_output.size()}."
+        )
+
+        # un-fuse the batch and head dims of both results before returning them
+        global_attn_probs = global_attn_probs.view(batch_size, self.num_heads, max_num_global_attn_indices, seq_len)
+        global_attn_output = global_attn_output.view(
+            batch_size, self.num_heads, max_num_global_attn_indices, self.head_dim
+        )
+        return global_attn_output, global_attn_probs
+
+
+class LEDEncoderAttention(nn.Module):
+    """Encoder attention block: `LEDEncoderSelfAttention` followed by a linear output projection."""
+
+    def __init__(self, config, layer_id):
+        super().__init__()
+        # Sub-modules are created in this order on purpose (parameter registration / init order).
+        self.longformer_self_attn = LEDEncoderSelfAttention(config, layer_id=layer_id)
+        self.output = nn.Linear(config.d_model, config.d_model)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        layer_head_mask: Optional[torch.Tensor] = None,
+        is_index_masked: Optional[torch.Tensor] = None,
+        is_index_global_attn: Optional[torch.Tensor] = None,
+        is_global_attn: Optional[bool] = None,
+        output_attentions: bool = False,
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
+        """
+        Input shape: Batch x Time x Channel
+
+        Forwards every argument unchanged to the wrapped self-attention module, projects its first output with
+        `self.output` and returns that projection followed by the remaining self-attention outputs.
+        """
+        attn_kwargs = {
+            "hidden_states": hidden_states,
+            "attention_mask": attention_mask,
+            "layer_head_mask": layer_head_mask,
+            "is_index_masked": is_index_masked,
+            "is_index_global_attn": is_index_global_attn,
+            "is_global_attn": is_global_attn,
+            "output_attentions": output_attentions,
+        }
+        raw_outputs = self.longformer_self_attn(**attn_kwargs)
+
+        # Only the first entry is projected; everything after it is passed through untouched.
+        projected = self.output(raw_outputs[0])
+        passthrough = raw_outputs[1:]
+        return (projected,) + passthrough
+
+
+class LEDDecoderAttention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper
+
+    Acts as self-attention when `forward` is called without `key_value_states`, and as cross-attention over
+    `key_value_states` otherwise.
+    """
+
+    def __init__(
+        self,
+        embed_dim: int,
+        num_heads: int,
+        dropout: Optional[float] = 0.0,
+        is_decoder: Optional[bool] = False,
+        bias: Optional[bool] = True,
+        layer_idx: Optional[int] = None,
+    ):
+        """
+        Args:
+            embed_dim: model width; must be divisible by `num_heads`.
+            num_heads: number of attention heads.
+            dropout: dropout probability applied to the attention probabilities.
+            is_decoder: stored on the module as `self.is_decoder`; not read inside this class.
+            bias: whether the q/k/v/out projections carry a bias term.
+            layer_idx: index of this layer, used to address its slot in the key/value cache.
+
+        Raises:
+            ValueError: if `embed_dim` is not divisible by `num_heads`.
+        """
+        super().__init__()
+        self.embed_dim = embed_dim
+        self.num_heads = num_heads
+        self.dropout = dropout
+        self.head_dim = embed_dim // num_heads
+        if self.head_dim * num_heads != self.embed_dim:
+            raise ValueError(
+                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
+                f" {num_heads})."
+            )
+        self.scaling = self.head_dim**-0.5
+        self.is_decoder = is_decoder
+        self.layer_idx = layer_idx
+
+        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        key_value_states: Optional[torch.Tensor] = None,
+        past_key_values: Optional[Cache] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        layer_head_mask: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+        cache_position: Optional[torch.Tensor] = None,
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[Cache]]:
+        """
+        Input shape: Batch x Time x Channel
+
+        Args:
+            hidden_states: query input of shape `(bsz, tgt_len, embed_dim)`.
+            key_value_states: if given, keys/values are projected from it (cross-attention); otherwise they are
+                projected from `hidden_states`.
+            past_key_values: optional key/value cache that is read from and updated in place.
+            attention_mask: optional additive mask of shape `(bsz, 1, tgt_len, src_len)`.
+            layer_head_mask: optional per-head multiplier of shape `(num_heads,)`.
+            output_attentions: whether to also return the attention weights.
+            cache_position: forwarded to the cache update for self-attention; ignored for cross-attention.
+
+        Returns:
+            `(attn_output, attn_weights or None, past_key_values)`; `attn_output` has shape
+            `(bsz, tgt_len, embed_dim)` and the weights, when requested, `(bsz, num_heads, tgt_len, src_len)`.
+
+        Raises:
+            ValueError: if the attention weights, the attention mask, the head mask or the attention output have an
+                unexpected size.
+        """
+
+        # if key_value_states are provided this layer is used as a cross-attention layer
+        # for the decoder
+        is_cross_attention = key_value_states is not None
+        bsz, tgt_len, embed_dim = hidden_states.size()
+
+        # get query proj
+        query_states = self.q_proj(hidden_states) * self.scaling
+
+        # `is_updated` is only tracked by `EncoderDecoderCache`. Default it to False so that a plain cache used in
+        # cross-attention recomputes the projections instead of failing on an unbound local below.
+        is_updated = False
+        if past_key_values is not None:
+            if isinstance(past_key_values, EncoderDecoderCache):
+                is_updated = past_key_values.is_updated.get(self.layer_idx)
+                if is_cross_attention:
+                    # after the first generated id, we can subsequently re-use all key/value_states from cache
+                    curr_past_key_value = past_key_values.cross_attention_cache
+                else:
+                    curr_past_key_value = past_key_values.self_attention_cache
+            else:
+                curr_past_key_value = past_key_values
+
+        current_states = key_value_states if is_cross_attention else hidden_states
+        if is_cross_attention and past_key_values is not None and is_updated:
+            # reuse k,v, cross_attentions
+            key_states = curr_past_key_value.layers[self.layer_idx].keys
+            value_states = curr_past_key_value.layers[self.layer_idx].values
+        else:
+            key_states = self.k_proj(current_states)
+            value_states = self.v_proj(current_states)
+            key_states = key_states.view(bsz, -1, self.num_heads, self.head_dim).transpose(1, 2)
+            value_states = value_states.view(bsz, -1, self.num_heads, self.head_dim).transpose(1, 2)
+
+            if past_key_values is not None:
+                # save all key/value_states to cache to be re-used for fast auto-regressive generation
+                cache_position = cache_position if not is_cross_attention else None
+                key_states, value_states = curr_past_key_value.update(
+                    key_states, value_states, self.layer_idx, {"cache_position": cache_position}
+                )
+                # set flag that curr layer for cross-attn is already updated so we can re-use in subsequent calls
+                # (only `EncoderDecoderCache` carries the `is_updated` mapping)
+                if is_cross_attention and isinstance(past_key_values, EncoderDecoderCache):
+                    past_key_values.is_updated[self.layer_idx] = True
+
+        # fuse batch and head dims so that attention can be computed with plain batched matmuls
+        proj_shape = (bsz * self.num_heads, -1, self.head_dim)
+        query_states = query_states.view(bsz, tgt_len, self.num_heads, self.head_dim).transpose(1, 2)
+        query_states = query_states.reshape(*proj_shape)
+        key_states = key_states.reshape(*proj_shape)
+        value_states = value_states.reshape(*proj_shape)
+
+        src_len = key_states.size(1)
+        attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))
+
+        if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
+            raise ValueError(
+                f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
+                f" {attn_weights.size()}"
+            )
+
+        if attention_mask is not None:
+            if attention_mask.size() != (bsz, 1, tgt_len, src_len):
+                raise ValueError(
+                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
+                )
+            # the mask is additive and broadcast over the head dim
+            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
+            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
+
+        attn_weights = nn.functional.softmax(attn_weights, dim=-1)
+        if layer_head_mask is not None:
+            if layer_head_mask.size() != (self.num_heads,):
+                raise ValueError(
+                    f"Head mask for a single layer should be of size {(self.num_heads,)}, but is"
+                    f" {layer_head_mask.size()}"
+                )
+            attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
+            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
+
+        if output_attentions:
+            # this operation is a bit awkward, but it's required to
+            # make sure that attn_weights keeps its gradient.
+            # In order to do so, attn_weights have to be reshaped
+            # twice and have to be reused in the following
+            attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
+            attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
+        else:
+            attn_weights_reshaped = None
+
+        attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
+
+        attn_output = torch.bmm(attn_probs, value_states)
+
+        if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
+            # report the shape that is actually checked (batch and head dims are still fused here)
+            raise ValueError(
+                f"`attn_output` should be of size {(bsz * self.num_heads, tgt_len, self.head_dim)}, but is"
+                f" {attn_output.size()}"
+            )
+
+        # un-fuse batch/head dims and merge the heads back into the embedding dim
+        attn_output = (
+            attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
+            .transpose(1, 2)
+            .reshape(bsz, tgt_len, embed_dim)
+        )
+
+        attn_output = self.out_proj(attn_output)
+
+        return attn_output, attn_weights_reshaped, past_key_values
+
+
+class LEDEncoderLayer(GradientCheckpointingLayer):
+    """One LED encoder block: self-attention followed by a two-layer feed-forward network.
+
+    Both sub-blocks are wrapped as `LayerNorm(residual + dropout(sub_block(x)))`.
+    """
+
+    def __init__(self, config: LEDConfig, layer_id: int):
+        super().__init__()
+        # NOTE: attribute order is kept as-is because it fixes the parameter registration order.
+        self.embed_dim = config.d_model
+        self.self_attn = LEDEncoderAttention(config, layer_id)
+        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
+        self.dropout = config.dropout
+        self.activation_fn = ACT2FN[config.activation_function]
+        self.activation_dropout = config.activation_dropout
+        self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim)
+        self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim)
+        self.final_layer_norm = nn.LayerNorm(self.embed_dim)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: torch.Tensor,
+        layer_head_mask: torch.Tensor,
+        is_index_masked=None,
+        is_index_global_attn=None,
+        is_global_attn=None,
+        output_attentions=False,
+    ):
+        """
+        Run the encoder block.
+
+        Args:
+            hidden_states (`torch.FloatTensor`): input to the layer of shape *(batch, seq_len, embed_dim)*
+            attention_mask (`torch.FloatTensor`): attention mask of size
+                *(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values.
+            layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
+                *(encoder_attention_heads,)*.
+
+        The remaining arguments are handed to the attention module unchanged.
+
+        Returns:
+            A tuple whose first element is the new hidden states, followed by every extra output of the attention
+            module.
+        """
+        # ---- self-attention sub-block ----
+        attn_outputs = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            layer_head_mask=layer_head_mask,
+            is_index_masked=is_index_masked,
+            is_index_global_attn=is_index_global_attn,
+            is_global_attn=is_global_attn,
+            output_attentions=output_attentions,
+        )
+        attn_branch = nn.functional.dropout(attn_outputs[0], p=self.dropout, training=self.training)
+        hidden_states = self.self_attn_layer_norm(hidden_states + attn_branch)
+
+        # ---- feed-forward sub-block ----
+        ffn_branch = self.activation_fn(self.fc1(hidden_states))
+        ffn_branch = nn.functional.dropout(ffn_branch, p=self.activation_dropout, training=self.training)
+        ffn_branch = nn.functional.dropout(self.fc2(ffn_branch), p=self.dropout, training=self.training)
+        hidden_states = self.final_layer_norm(hidden_states + ffn_branch)
+
+        # In half precision, clamp the activations whenever an inf or a NaN shows up.
+        if hidden_states.dtype == torch.float16:
+            if torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any():
+                limit = torch.finfo(hidden_states.dtype).max - 1000
+                hidden_states = torch.clamp(hidden_states, min=-limit, max=limit)
+
+        return (hidden_states,) + attn_outputs[1:]
+
+
+class LEDDecoderLayer(GradientCheckpointingLayer):
+    """One LED decoder block: self-attention, optional cross-attention, then a feed-forward network.
+
+    Every sub-block is wrapped as `LayerNorm(residual + dropout(sub_block(x)))`.
+    """
+
+    def __init__(self, config: LEDConfig, layer_idx=None):
+        super().__init__()
+        # NOTE: attribute order is kept as-is because it fixes the parameter registration order.
+        self.embed_dim = config.d_model
+
+        # attention over the decoder's own states
+        self.self_attn = LEDDecoderAttention(
+            embed_dim=self.embed_dim,
+            num_heads=config.decoder_attention_heads,
+            dropout=config.attention_dropout,
+            is_decoder=True,
+            layer_idx=layer_idx,
+        )
+        self.dropout = config.dropout
+        self.activation_fn = ACT2FN[config.activation_function]
+        self.activation_dropout = config.activation_dropout
+
+        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
+        # attention over the encoder output
+        self.encoder_attn = LEDDecoderAttention(
+            embed_dim=self.embed_dim,
+            num_heads=config.decoder_attention_heads,
+            dropout=config.attention_dropout,
+            is_decoder=True,
+            layer_idx=layer_idx,
+        )
+        self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim)
+        self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim)
+        self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim)
+        self.final_layer_norm = nn.LayerNorm(self.embed_dim)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        layer_head_mask: Optional[torch.Tensor] = None,
+        cross_attn_layer_head_mask: Optional[torch.Tensor] = None,
+        past_key_values: Optional[Cache] = None,
+        output_attentions: Optional[bool] = False,
+        use_cache: Optional[bool] = True,
+        cache_position: Optional[torch.Tensor] = None,
+    ):
+        """
+        Run the decoder block.
+
+        Args:
+            hidden_states (`torch.FloatTensor`): input to the layer of shape *(batch, seq_len, embed_dim)*
+            attention_mask (`torch.FloatTensor`): attention mask of size
+                *(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values.
+            encoder_hidden_states (`torch.FloatTensor`):
+                cross attention input to the layer of shape *(batch, seq_len, embed_dim)*; the cross-attention
+                sub-block is skipped when this is `None`.
+            encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size
+                *(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values.
+            layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
+                *(decoder_attention_heads,)*.
+            cross_attn_layer_head_mask (`torch.FloatTensor`): mask for encoder attention heads in a given layer of
+                size *(decoder_attention_heads,)*.
+            past_key_values (`Cache`): key/value cache shared by the self- and cross-attention modules.
+            output_attentions (`bool`): Whether to append the self- and cross-attention weights to the output.
+            use_cache (`bool`): Whether to append `past_key_values` to the output.
+            cache_position (`torch.Tensor`): forwarded unchanged to both attention modules.
+
+        Returns:
+            `(hidden_states,)`, extended by `(self_attn_weights, cross_attn_weights)` when `output_attentions` is
+            truthy and by `(past_key_values,)` when `use_cache` is truthy.
+        """
+        # ---- self-attention sub-block ----
+        self_attn_out, self_attn_weights, _ = self.self_attn(
+            hidden_states=hidden_states,
+            past_key_values=past_key_values,
+            attention_mask=attention_mask,
+            layer_head_mask=layer_head_mask,
+            output_attentions=output_attentions,
+            cache_position=cache_position,
+        )
+        self_attn_out = nn.functional.dropout(self_attn_out, p=self.dropout, training=self.training)
+        hidden_states = self.self_attn_layer_norm(hidden_states + self_attn_out)
+
+        # ---- cross-attention sub-block (only when encoder states are available) ----
+        cross_attn_weights = None
+        if encoder_hidden_states is not None:
+            cross_attn_out, cross_attn_weights, _ = self.encoder_attn(
+                hidden_states=hidden_states,
+                key_value_states=encoder_hidden_states,
+                attention_mask=encoder_attention_mask,
+                layer_head_mask=cross_attn_layer_head_mask,
+                past_key_values=past_key_values,
+                output_attentions=output_attentions,
+                cache_position=cache_position,
+            )
+            cross_attn_out = nn.functional.dropout(cross_attn_out, p=self.dropout, training=self.training)
+            hidden_states = self.encoder_attn_layer_norm(hidden_states + cross_attn_out)
+
+        # ---- feed-forward sub-block ----
+        ffn_out = self.activation_fn(self.fc1(hidden_states))
+        ffn_out = nn.functional.dropout(ffn_out, p=self.activation_dropout, training=self.training)
+        ffn_out = nn.functional.dropout(self.fc2(ffn_out), p=self.dropout, training=self.training)
+        hidden_states = self.final_layer_norm(hidden_states + ffn_out)
+
+        # ---- assemble the variable-length output tuple ----
+        attention_part = (self_attn_weights, cross_attn_weights) if output_attentions else ()
+        cache_part = (past_key_values,) if use_cache else ()
+        return (hidden_states,) + attention_part + cache_part
+
+
+class LEDClassificationHead(nn.Module):
+    """Head for sentence-level classification tasks.
+
+    Pipeline: dropout -> linear (`input_dim` -> `inner_dim`) -> tanh -> dropout -> linear (`inner_dim` ->
+    `num_classes`). The same dropout module is used at both places.
+    """
+
+    def __init__(
+        self,
+        input_dim: int,
+        inner_dim: int,
+        num_classes: int,
+        pooler_dropout: float,
+    ):
+        super().__init__()
+        self.dense = nn.Linear(input_dim, inner_dim)
+        self.dropout = nn.Dropout(p=pooler_dropout)
+        self.out_proj = nn.Linear(inner_dim, num_classes)
+
+    def forward(self, hidden_states: torch.Tensor):
+        """Map `hidden_states` (last dim `input_dim`) to unnormalised class scores (last dim `num_classes`)."""
+        projected = self.dense(self.dropout(hidden_states))
+        activated = self.dropout(torch.tanh(projected))
+        return self.out_proj(activated)
+
+
+@auto_docstring
+class LEDPreTrainedModel(PreTrainedModel):
+    # Shared base of the LED model classes: declares the config type and checkpoint prefix, and provides weight
+    # initialisation plus a small dummy batch.
+    config: LEDConfig
+    base_model_prefix = "led"
+    supports_gradient_checkpointing = True
+
+    def _init_weights(self, module):
+        """Initialise `nn.Linear` / `nn.Embedding` weights from N(0, `config.init_std`); leave other modules alone."""
+        std = self.config.init_std
+        if not isinstance(module, (nn.Linear, nn.Embedding)):
+            return
+
+        module.weight.data.normal_(mean=0.0, std=std)
+        if isinstance(module, nn.Linear):
+            # linear layers: biases start at zero
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif module.padding_idx is not None:
+            # embeddings: the padding row starts at zero
+            module.weight.data[module.padding_idx].zero_()
+
+    @property
+    def dummy_inputs(self):
+        """Two fixed 5-token sequences (the second ends with the pad token) and the matching attention mask."""
+        pad_token = self.config.pad_token_id
+        token_rows = [[0, 6, 10, 4, 2], [0, 8, 12, 2, pad_token]]
+        input_ids = torch.tensor(token_rows, device=self.device)
+        return {
+            "attention_mask": input_ids.ne(pad_token),
+            "input_ids": input_ids,
+        }
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Base class for LEDEncoder's outputs, with potential hidden states, local and global attentions.
+    """
+)
+# Copied from transformers.models.longformer.modeling_longformer.LongformerBaseModelOutput with Longformer->LEDEncoder
+class LEDEncoderBaseModelOutput(ModelOutput):
+    r"""
+    attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x +
+        attention_window + 1)`, where `x` is the number of tokens with global attention mask.
+
+        Local attentions weights after the attention softmax, used to compute the weighted average in the
+        self-attention heads. Those are the attention weights from every token in the sequence to every token with
+        global attention (first `x` values) and to every token in the attention window (remaining `attention_window
+        + 1` values). Note that the first `x` values refer to tokens with fixed positions in the text, but the
+        remaining `attention_window + 1` values refer to tokens with relative positions: the attention weight of a
+        token to itself is located at index `x + attention_window / 2` and the `attention_window / 2` preceding
+        (succeeding) values are the attention weights to the `attention_window / 2` preceding (succeeding) tokens.
+        If the attention window contains a token with global attention, the attention weight at the corresponding
+        index is set to 0; the value should be accessed from the first `x` attention weights. If a token has global
+        attention, the attention weights to all other tokens in `attentions` is set to 0, the values should be
+        accessed from `global_attentions`.
+    global_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x)`,
+        where `x` is the number of tokens with global attention mask.
+
+        Global attentions weights after the attention softmax, used to compute the weighted average in the
+        self-attention heads. Those are the attention weights from every token with global attention to every token
+        in the sequence.
+    """
+
+    # The only field without a default, so it must always be supplied.
+    last_hidden_state: torch.FloatTensor
+    # `last_hidden_state` and `hidden_states` are not described in the docstring above; presumably
+    # `auto_docstring` supplies their descriptions — TODO confirm.
+    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
+    # Local (sliding-window plus global-key) attention weights; see `attentions` in the docstring.
+    attentions: Optional[tuple[torch.FloatTensor, ...]] = None
+    # Attention weights of the global-attention tokens; see `global_attentions` in the docstring.
+    global_attentions: Optional[tuple[torch.FloatTensor, ...]] = None
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Base class for model encoder's outputs that also contains : pre-computed hidden states that can speed up sequential
+    decoding.
+    """
+)
+class LEDSeq2SeqModelOutput(ModelOutput):
+    r"""
+    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+        Sequence of hidden-states at the output of the last layer of the decoder of the model.
+
+        If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
+        hidden_size)` is output.
+    past_key_values (`list[torch.FloatTensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+        List of `torch.FloatTensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size,
+        num_heads, sequence_length, embed_size_per_head)`).
+
+        Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
+        used (see `past_key_values` input) to speed up sequential decoding.
+    encoder_global_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x)`,
+        where `x` is the number of tokens with global attention mask.
+
+        Global attentions weights after the attention softmax, used to compute the weighted average in the
+        self-attention heads. Those are the attention weights from every token with global attention to every token
+        in the sequence.
+    """
+
+    # Every field is optional and defaults to None.
+    # Decoder-side results; `last_hidden_state` and `past_key_values` are described in the docstring above.
+    last_hidden_state: Optional[torch.FloatTensor] = None
+    past_key_values: Optional[list[torch.FloatTensor]] = None
+    # The decoder_*/cross_*/encoder_* fields below (except `encoder_global_attentions`) are not described in the
+    # docstring above; presumably `auto_docstring` supplies their descriptions — TODO confirm.
+    decoder_hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
+    decoder_attentions: Optional[tuple[torch.FloatTensor, ...]] = None
+    cross_attentions: Optional[tuple[torch.FloatTensor, ...]] = None
+    # Encoder-side results.
+    encoder_last_hidden_state: Optional[torch.FloatTensor] = None
+    encoder_hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
+    encoder_attentions: Optional[tuple[torch.FloatTensor, ...]] = None
+    # Attention weights of the encoder's global-attention tokens; see the docstring above.
+    encoder_global_attentions: Optional[tuple[torch.FloatTensor, ...]] = None
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Base class for sequence-to-sequence language models outputs.
+    """
+)
+class LEDSeq2SeqLMOutput(ModelOutput):
+    r"""
+    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+        Language modeling loss.
+    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
+        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+    past_key_values (`list[torch.FloatTensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+        List of `torch.FloatTensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size,
+        num_heads, sequence_length, embed_size_per_head)`.
+
+        Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
+        used (see `past_key_values` input) to speed up sequential decoding.
+    encoder_global_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x)`,
+        where `x` is the number of tokens with global attention mask.
+
+        Global attention weights after the attention softmax, used to compute the weighted average in the
+        self-attention heads. Those are the attention weights from every token with global attention to every token
+        in the sequence.
+    """
+
+    # Every field defaults to `None`; only the LED-specific fields are described in the docstring above.
+    # Language-modeling head outputs.
+    loss: Optional[torch.FloatTensor] = None
+    logits: Optional[torch.FloatTensor] = None
+    # Decoder-side outputs.
+    past_key_values: Optional[list[torch.FloatTensor]] = None
+    decoder_hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
+    decoder_attentions: Optional[tuple[torch.FloatTensor, ...]] = None
+    cross_attentions: Optional[tuple[torch.FloatTensor, ...]] = None
+    # Encoder-side outputs.
+    encoder_last_hidden_state: Optional[torch.FloatTensor] = None
+    encoder_hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
+    encoder_attentions: Optional[tuple[torch.FloatTensor, ...]] = None
+    encoder_global_attentions: Optional[tuple[torch.FloatTensor, ...]] = None
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Base class for outputs of sequence-to-sequence sentence classification models.
+    """
+)
+class LEDSeq2SeqSequenceClassifierOutput(ModelOutput):
+    r"""
+    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+        Classification (or regression if config.num_labels==1) loss.
+    logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
+        Classification (or regression if config.num_labels==1) scores (before SoftMax).
+    past_key_values (`list[torch.FloatTensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+        List of `torch.FloatTensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size,
+        num_heads, sequence_length, embed_size_per_head)`.
+
+        Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
+        used (see `past_key_values` input) to speed up sequential decoding.
+    encoder_global_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x)`,
+        where `x` is the number of tokens with global attention mask.
+
+        Global attention weights after the attention softmax, used to compute the weighted average in the
+        self-attention heads. Those are the attention weights from every token with global attention to every token
+        in the sequence.
+    """
+
+    # Every field defaults to `None`; only the LED-specific fields are described in the docstring above.
+    # Classification head outputs.
+    loss: Optional[torch.FloatTensor] = None
+    logits: Optional[torch.FloatTensor] = None
+    # Decoder-side outputs.
+    past_key_values: Optional[list[torch.FloatTensor]] = None
+    decoder_hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
+    decoder_attentions: Optional[tuple[torch.FloatTensor, ...]] = None
+    cross_attentions: Optional[tuple[torch.FloatTensor, ...]] = None
+    # Encoder-side outputs.
+    encoder_last_hidden_state: Optional[torch.FloatTensor] = None
+    encoder_hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
+    encoder_attentions: Optional[tuple[torch.FloatTensor, ...]] = None
+    encoder_global_attentions: Optional[tuple[torch.FloatTensor, ...]] = None
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Base class for outputs of sequence-to-sequence question answering models.
+    """
+)
+class LEDSeq2SeqQuestionAnsweringModelOutput(ModelOutput):
+    r"""
+    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+        Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
+    past_key_values (`list[torch.FloatTensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+        List of `torch.FloatTensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size,
+        num_heads, sequence_length, embed_size_per_head)`.
+
+        Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
+        used (see `past_key_values` input) to speed up sequential decoding.
+    encoder_global_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x)`,
+        where `x` is the number of tokens with global attention mask.
+
+        Global attention weights after the attention softmax, used to compute the weighted average in the
+        self-attention heads. Those are the attention weights from every token with global attention to every token
+        in the sequence.
+    """
+
+    # Every field defaults to `None`; only the LED-specific fields are described in the docstring above.
+    # Span-extraction head outputs.
+    loss: Optional[torch.FloatTensor] = None
+    start_logits: Optional[torch.FloatTensor] = None
+    end_logits: Optional[torch.FloatTensor] = None
+    # Decoder-side outputs.
+    past_key_values: Optional[list[torch.FloatTensor]] = None
+    decoder_hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
+    decoder_attentions: Optional[tuple[torch.FloatTensor, ...]] = None
+    cross_attentions: Optional[tuple[torch.FloatTensor, ...]] = None
+    # Encoder-side outputs.
+    encoder_last_hidden_state: Optional[torch.FloatTensor] = None
+    encoder_hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
+    encoder_attentions: Optional[tuple[torch.FloatTensor, ...]] = None
+    encoder_global_attentions: Optional[tuple[torch.FloatTensor, ...]] = None
+
+
+class LEDEncoder(LEDPreTrainedModel):
+    """
+    Transformer encoder consisting of *config.encoder_layers* self-attention layers. Each layer is a
+    [`LEDEncoderLayer`].
+
+    Args:
+        config: LEDConfig
+        embed_tokens (nn.Embedding): output embedding
+    """
+
+    def __init__(self, config: LEDConfig, embed_tokens: Optional[nn.Embedding] = None):
+        super().__init__(config)
+
+        self.dropout = config.dropout
+        self.layerdrop = config.encoder_layerdrop
+
+        embed_dim = config.d_model
+        self.padding_idx = config.pad_token_id
+        self.max_source_positions = config.max_encoder_position_embeddings
+
+        if isinstance(config.attention_window, int):
+            if config.attention_window % 2 != 0:
+                raise ValueError("`config.attention_window` has to be an even value")
+            if config.attention_window <= 0:
+                raise ValueError("`config.attention_window` has to be positive")
+            config.attention_window = [config.attention_window] * config.num_hidden_layers  # one value per layer
+        else:
+            if len(config.attention_window) != config.num_hidden_layers:
+                raise ValueError(
+                    "`len(config.attention_window)` should equal `config.num_hidden_layers`. "
+                    f"Expected {config.num_hidden_layers}, given {len(config.attention_window)}"
+                )
+
+        if embed_tokens is not None:
+            self.embed_tokens = embed_tokens
+        else:
+            self.embed_tokens = nn.Embedding(config.vocab_size, embed_dim, self.padding_idx)
+
+        self.embed_positions = LEDLearnedPositionalEmbedding(
+            self.max_source_positions,
+            embed_dim,
+        )
+        self.layers = nn.ModuleList([LEDEncoderLayer(config, i) for i in range(config.encoder_layers)])
+        self.layernorm_embedding = nn.LayerNorm(embed_dim)
+
+        self.gradient_checkpointing = False
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def _merge_to_attention_mask(self, attention_mask: torch.Tensor, global_attention_mask: torch.Tensor):
+        # longformer self-attention expects attention mask to have 0 (no attn), 1 (local attn), 2 (global attn)
+        # (global_attention_mask + 1) => 1 for local attention, 2 for global attention
+        # => final attention_mask => 0 for no attention, 1 for local attention 2 for global attention
+        if attention_mask is not None:
+            attention_mask = attention_mask * (global_attention_mask + 1)
+        else:
+            # simply use `global_attention_mask` as `attention_mask`
+            # if no `attention_mask` is given
+            attention_mask = global_attention_mask + 1
+        return attention_mask
+
+    def _pad_to_window_size(
+        self,
+        input_ids: torch.Tensor,
+        attention_mask: torch.Tensor,
+        inputs_embeds: torch.Tensor,
+        pad_token_id: int,
+    ):
+        """A helper function to pad tokens and mask to work with implementation of Longformer self-attention."""
+        # padding
+        attention_window = (
+            self.config.attention_window
+            if isinstance(self.config.attention_window, int)
+            else max(self.config.attention_window)
+        )
+
+        if attention_window % 2 != 0:
+            raise ValueError(f"`attention_window` should be an even value. Given {attention_window}")
+        input_shape = input_ids.shape if input_ids is not None else inputs_embeds.shape
+        batch_size, seq_len = input_shape[:2]
+
+        padding_len = (attention_window - seq_len % attention_window) % attention_window
+        if padding_len > 0:
+            logger.warning_once(
+                f"Input ids are automatically padded from {seq_len} to {seq_len + padding_len} to be a multiple of "
+                f"`config.attention_window`: {attention_window}"
+            )
+            if input_ids is not None:
+                input_ids = nn.functional.pad(input_ids, (0, padding_len), value=pad_token_id)
+            if inputs_embeds is not None:
+                input_ids_padding = inputs_embeds.new_full(
+                    (batch_size, padding_len),
+                    self.config.pad_token_id,
+                    dtype=torch.long,
+                )
+                inputs_embeds_padding = self.embed_tokens(input_ids_padding)
+                inputs_embeds = torch.cat([inputs_embeds, inputs_embeds_padding], dim=-2)
+
+            attention_mask = nn.functional.pad(
+                attention_mask, (0, padding_len), value=False
+            )  # no attention on the padding tokens
+
+        return padding_len, input_ids, attention_mask, inputs_embeds
+
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        global_attention_mask=None,
+        head_mask=None,
+        inputs_embeds=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+    ):
+        r"""
+        Args:
+            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
+                provide it.
+
+                Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+                [`PreTrainedTokenizer.__call__`] for details.
+
+                [What are input IDs?](../glossary#input-ids)
+            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
+
+                [What are attention masks?](../glossary#attention-mask)
+            global_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Mask to decide the attention given on each token, local attention or global attention for the encoder.
+                Tokens with global attention attends to all other tokens, and all other tokens attend to them. This is
+                important for task-specific finetuning because it makes the model more flexible at representing the
+                task. For example, for classification, the <s> token should be given global attention. For QA, all
+                question tokens should also have global attention. Please refer to the [Longformer
+                paper](https://huggingface.co/papers/2004.05150) for more details. Mask values selected in `[0, 1]`:
+
+                - 0 for local attention (a sliding window attention),
+                - 1 for global attention (tokens that attend to all other tokens, and all other tokens attend to them).
+            head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
+                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
+
+                - 1 indicates the head is **not masked**,
+                - 0 indicates the head is **masked**.
+            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
+                than the model's internal embedding lookup matrix.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            output_hidden_states (`bool`, *optional*):
+                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+                for more detail.
+            return_dict (`bool`, *optional*):
+                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # check input_ids and inputs_embeds
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+        elif input_ids is None and inputs_embeds is None:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+
+        # create default attention_mask
+        if attention_mask is None:
+            attention_mask = torch.ones(inputs_embeds.size()[:-1], device=inputs_embeds.device, dtype=torch.long)
+
+        # merge `global_attention_mask` and `attention_mask`
+        if global_attention_mask is not None:
+            attention_mask = self._merge_to_attention_mask(attention_mask, global_attention_mask)
+
+        # pad input if necessary
+        padding_len, input_ids, attention_mask, inputs_embeds = self._pad_to_window_size(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            inputs_embeds=inputs_embeds,
+            pad_token_id=self.config.pad_token_id,
+        )
+
+        # retrieve input_shape
+        if input_ids is not None:
+            input_shape = input_ids.size()
+            input_ids = input_ids.view(-1, input_shape[-1])
+        elif inputs_embeds is not None:
+            input_shape = inputs_embeds.size()[:-1]
+
+        # convert attention_mask to float
+        if attention_mask is not None:
+            # [bsz, seq_len] -> [bsz, seq_len]; 1 -> 0.0; 0 -> "-inf"
+            attention_mask = _prepare_4d_attention_mask_inverted(attention_mask, inputs_embeds.dtype)[:, 0, 0, :]
+
+        # get masking tensors
+        is_index_masked = attention_mask < 0
+        is_index_global_attn = attention_mask > 0
+        is_global_attn = is_index_global_attn.flatten().any().item()
+
+        embed_pos = self.embed_positions(input_shape)
+
+        hidden_states = inputs_embeds + embed_pos
+        hidden_states = self.layernorm_embedding(hidden_states)
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+
+        encoder_states = () if output_hidden_states else None
+        all_attentions = () if output_attentions else None
+        all_global_attentions = () if (output_attentions and is_global_attn) else None
+
+        # check if head_mask has a correct number of layers specified if desired
+        if head_mask is not None:
+            if head_mask.size()[0] != len(self.layers):
+                raise ValueError(
+                    f"The head_mask should be specified for {len(self.layers)} layers, but it is for"
+                    f" {head_mask.size()[0]}."
+                )
+        for idx, encoder_layer in enumerate(self.layers):
+            if output_hidden_states:
+                encoder_states = encoder_states + (hidden_states,)
+            # add LayerDrop (see https://huggingface.co/papers/1909.11556 for description)
+            dropout_probability = torch.rand([])
+
+            if self.training and (dropout_probability < self.layerdrop):  # skip the layer
+                layer_outputs = (None, None, None)
+            else:
+                layer_outputs = encoder_layer(
+                    hidden_states,
+                    attention_mask=attention_mask,
+                    layer_head_mask=(head_mask[idx] if head_mask is not None else None),
+                    is_index_masked=is_index_masked,
+                    is_index_global_attn=is_index_global_attn,
+                    is_global_attn=is_global_attn,
+                    output_attentions=output_attentions,
+                )
+                hidden_states = layer_outputs[0]
+
+            if output_attentions:
+                # bzs x seq_len x num_attn_heads x (num_global_attn + attention_window_len + 1) => bzs x num_attn_heads x seq_len x (num_global_attn + attention_window_len + 1)
+                all_attentions = all_attentions + (layer_outputs[1].transpose(1, 2),)
+
+                if is_global_attn:
+                    # bzs x num_attn_heads x num_global_attn x seq_len => bzs x num_attn_heads x seq_len x num_global_attn
+                    all_global_attentions = all_global_attentions + (layer_outputs[2].transpose(2, 3),)
+
+        if output_hidden_states:
+            encoder_states = encoder_states + (hidden_states,)
+
+        # undo padding
+        if padding_len > 0:
+            # unpad `hidden_states` because the calling function is expecting a length == input_ids.size(1)
+            hidden_states = hidden_states[:, :-padding_len]
+            if output_hidden_states:
+                encoder_states = tuple(state[:, :-padding_len] for state in encoder_states)
+
+            if output_attentions:
+                all_attentions = tuple(state[:, :, :-padding_len, :] for state in all_attentions)
+
+        if not return_dict:
+            return tuple(
+                v for v in [hidden_states, encoder_states, all_attentions, all_global_attentions] if v is not None
+            )
+        return LEDEncoderBaseModelOutput(
+            last_hidden_state=hidden_states,
+            hidden_states=encoder_states,
+            attentions=all_attentions,
+            global_attentions=all_global_attentions,
+        )
+
+
+class LEDDecoder(LEDPreTrainedModel):
+    """
+    Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`LEDDecoderLayer`]
+
+    Args:
+        config: LEDConfig
+        embed_tokens (nn.Embedding): output embedding
+    """
+
+    def __init__(self, config: LEDConfig, embed_tokens: Optional[nn.Embedding] = None):
+        super().__init__(config)
+        self.dropout = config.dropout
+        self.layerdrop = config.decoder_layerdrop
+        self.padding_idx = config.pad_token_id
+        self.max_target_positions = config.max_decoder_position_embeddings
+
+        if embed_tokens is not None:
+            self.embed_tokens = embed_tokens
+        else:
+            self.embed_tokens = nn.Embedding(config.vocab_size, config.d_model, self.padding_idx)
+
+        self.embed_positions = LEDLearnedPositionalEmbedding(
+            self.max_target_positions,
+            config.d_model,
+        )
+        self.layers = nn.ModuleList([LEDDecoderLayer(config, layer_idx=i) for i in range(config.decoder_layers)])
+        self.layernorm_embedding = nn.LayerNorm(config.d_model)
+
+        self.gradient_checkpointing = False
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        global_attention_mask=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        head_mask=None,
+        cross_attn_head_mask=None,
+        past_key_values=None,
+        inputs_embeds=None,
+        use_cache=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+        cache_position=None,
+    ):
+        r"""
+        Args:
+            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
+                provide it.
+
+                Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+                [`PreTrainedTokenizer.__call__`] for details.
+
+                [What are input IDs?](../glossary#input-ids)
+            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
+
+                [What are attention masks?](../glossary#attention-mask)
+            global_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Mask to decide the attention given on each token, local attention or global attention. Tokens with
+                global attention attends to all other tokens, and all other tokens attend to them. This is important
+                for task-specific finetuning because it makes the model more flexible at representing the task. For
+                example, for classification, the <s> token should be given global attention. For QA, all question
+                tokens should also have global attention. Please refer to the [Longformer
+                paper](https://huggingface.co/papers/2004.05150) for more details. Mask values selected in `[0, 1]`:
+
+                - 0 for local attention (a sliding window attention),
+                - 1 for global attention (tokens that attend to all other tokens, and all other tokens attend to them).
+            encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
+                Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
+                of the decoder.
+            encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
+                Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
+                selected in `[0, 1]`:
+
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
+
+                [What are attention masks?](../glossary#attention-mask)
+            head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
+
+                - 1 indicates the head is **not masked**,
+                - 0 indicates the head is **masked**.
+
+            cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+                Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:
+
+                - 1 indicates the head is **not masked**,
+                - 0 indicates the head is **masked**.
+
+            past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+                Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
+                shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of
+                shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
+
+                Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
+                cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
+
+                If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
+                that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
+                all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
+                than the model's internal embedding lookup matrix.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            output_hidden_states (`bool`, *optional*):
+                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+                for more detail.
+            return_dict (`bool`, *optional*):
+                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # retrieve input_ids and inputs_embeds
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
+        elif input_ids is not None:
+            input_shape = input_ids.size()
+            input_ids = input_ids.view(-1, input_shape[-1])
+        elif inputs_embeds is not None:
+            input_shape = inputs_embeds.size()[:-1]
+        else:
+            raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
+
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+
+        if self.gradient_checkpointing and self.training:
+            if use_cache:
+                logger.warning_once(
+                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+                )
+                use_cache = False
+
+        if use_cache and past_key_values is None:
+            past_key_values = EncoderDecoderCache(DynamicCache(config=self.config), DynamicCache(config=self.config))
+        if use_cache and isinstance(past_key_values, tuple):
+            logger.warning_once(
+                "Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. "
+                "You should pass an instance of `EncoderDecoderCache` instead, e.g. "
+                "`past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`."
+            )
+            past_key_values = EncoderDecoderCache.from_legacy_cache(past_key_values)
+
+        past_key_values_length = past_key_values.get_seq_length() if past_key_values is not None else 0
+        # create causal mask
+        # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+        combined_attention_mask = None
+        if input_shape[-1] > 1:
+            combined_attention_mask = _create_4d_causal_attention_mask(
+                input_shape, inputs_embeds.dtype, inputs_embeds.device, past_key_values_length=past_key_values_length
+            )
+
+        if attention_mask is not None and combined_attention_mask is not None:
+            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+            combined_attention_mask = combined_attention_mask + _prepare_4d_attention_mask_inverted(
+                attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]
+            )
+
+        # expand encoder attention mask
+        if encoder_hidden_states is not None and encoder_attention_mask is not None:
+            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+            encoder_attention_mask = _prepare_4d_attention_mask_inverted(
+                encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]
+            )
+
+        # embed positions
+        positions = self.embed_positions(input_shape, past_key_values_length)
+
+        hidden_states = inputs_embeds + positions
+        hidden_states = self.layernorm_embedding(hidden_states)
+
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+
+        # decoder layers
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+        all_cross_attentions = () if output_attentions else None
+
+        # check if head_mask/cross_attn_head_mask has a correct number of layers specified if desired
+        for attn_mask, mask_name in zip([head_mask, cross_attn_head_mask], ["head_mask", "cross_attn_head_mask"]):
+            if attn_mask is not None:
+                if attn_mask.size()[0] != len(self.layers):
+                    raise ValueError(
+                        f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for"
+                        f" {head_mask.size()[0]}."
+                    )
+        for idx, decoder_layer in enumerate(self.layers):
+            # add LayerDrop (see https://huggingface.co/papers/1909.11556 for description)
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+            if self.training:
+                dropout_probability = torch.rand([])
+                if dropout_probability < self.layerdrop:
+                    continue
+
+            layer_outputs = decoder_layer(
+                hidden_states,
+                combined_attention_mask,
+                encoder_hidden_states,  # as a positional argument for gradient checkpointing
+                encoder_attention_mask=encoder_attention_mask,
+                layer_head_mask=(head_mask[idx] if head_mask is not None else None),
+                cross_attn_layer_head_mask=(cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None),
+                past_key_values=past_key_values,
+                output_attentions=output_attentions,
+                use_cache=use_cache,
+                cache_position=cache_position,
+            )
+
+            hidden_states = layer_outputs[0]
+            if output_attentions:
+                all_self_attns += (layer_outputs[1],)
+                all_cross_attentions += (layer_outputs[2],)
+
+        # add hidden states from the last decoder layer
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+
+        if not return_dict:
+            return tuple(
+                v
+                for v in [hidden_states, past_key_values, all_hidden_states, all_self_attns, all_cross_attentions]
+                if v is not None
+            )
+        return BaseModelOutputWithPastAndCrossAttentions(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+            cross_attentions=all_cross_attentions,
+        )
+
+
+@auto_docstring
+class LEDModel(LEDPreTrainedModel):
+    # Both sub-modules are built around the same `self.shared` embedding (see `__init__`).
+    _tied_weights_keys = ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight"]
+
+    def __init__(self, config: LEDConfig):
+        super().__init__(config)
+
+        padding_idx, vocab_size = config.pad_token_id, config.vocab_size
+        # One embedding table, handed to both the encoder and the decoder below.
+        self.shared = nn.Embedding(vocab_size, config.d_model, padding_idx)
+
+        self.encoder = LEDEncoder(config, self.shared)
+        self.decoder = LEDDecoder(config, self.shared)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        # The shared table is the model's input embedding.
+        return self.shared
+
+    def set_input_embeddings(self, value):
+        # Replace the shared table and re-point both sub-modules at the new module.
+        self.shared = value
+        self.encoder.embed_tokens = self.shared
+        self.decoder.embed_tokens = self.shared
+
+    def get_encoder(self):
+        return self.encoder
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        decoder_input_ids: Optional[torch.LongTensor] = None,
+        decoder_attention_mask: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        decoder_head_mask: Optional[torch.Tensor] = None,
+        cross_attn_head_mask: Optional[torch.Tensor] = None,
+        encoder_outputs: Optional[tuple[tuple[torch.FloatTensor]]] = None,
+        global_attention_mask: Optional[torch.FloatTensor] = None,
+        past_key_values: Optional[tuple[tuple[torch.FloatTensor]]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.Tensor] = None,
+    ) -> Union[tuple[torch.Tensor], LEDSeq2SeqModelOutput]:
+        r"""
+        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
+            Indices of decoder input sequence tokens in the vocabulary.
+
+            Indices can be obtained using [`LedTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+
+            LED uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
+            is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).
+        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
+            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
+            be used by default.
+
+            If you want to change padding behavior, you should read [`modeling_led._prepare_decoder_inputs`] and modify
+            to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more information on the
+            default strategy.
+        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in `[0,
+            1]`:
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+        global_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to decide the attention given on each token, local attention or global attention for the encoder.
+            Tokens with global attention attends to all other tokens, and all other tokens attend to them. This is
+            important for task-specific finetuning because it makes the model more flexible at representing the task.
+            For example, for classification, the <s> token should be given global attention. For QA, all question
+            tokens should also have global attention. Please refer to the [Longformer
+            paper](https://huggingface.co/papers/2004.05150) for more details. Mask values selected in `[0, 1]`:
+
+            - 0 for local attention (a sliding window attention),
+            - 1 for global attention (tokens that attend to all other tokens, and all other tokens attend to them).
+        """
+        # Explicit arguments win; anything left as `None` falls back to the config.
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # Using this like Bart, as LED is derived from it. So far
+        # No checkpoint on the hub exists that uses that in practice.
+        # https://github.com/huggingface/transformers/blob/ac3cb660cad283163f7c73cad511124e845ca388/src/transformers/models/bart/modeling_bart.py#L1153
+        if decoder_input_ids is None and decoder_inputs_embeds is None:
+            # The default decoder input is derived from `input_ids`; without them (e.g. only
+            # `inputs_embeds` or `encoder_outputs` were given) there is nothing to shift, so fail
+            # with a clear message instead of an obscure error inside `shift_tokens_right`.
+            if input_ids is None:
+                raise ValueError(
+                    "If no `decoder_input_ids` or `decoder_inputs_embeds` are "
+                    "passed, `input_ids` cannot be `None`. Please pass either "
+                    "`input_ids` or `decoder_input_ids` or `decoder_inputs_embeds`."
+                )
+            decoder_input_ids = shift_tokens_right(
+                input_ids, self.config.pad_token_id, self.config.decoder_start_token_id
+            )
+
+        if encoder_outputs is None:
+            encoder_outputs = self.encoder(
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+                global_attention_mask=global_attention_mask,
+                head_mask=head_mask,
+                inputs_embeds=inputs_embeds,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+            )
+        # If the user passed a tuple for encoder_outputs, we wrap it in a LEDEncoderBaseModelOutput when return_dict=True
+        elif return_dict and not isinstance(encoder_outputs, LEDEncoderBaseModelOutput):
+            encoder_outputs = LEDEncoderBaseModelOutput(
+                last_hidden_state=encoder_outputs[0],
+                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
+                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
+                global_attentions=encoder_outputs[3] if len(encoder_outputs) > 3 else None,
+            )
+
+        # decoder outputs consists of (dec_features, past_key_values, dec_hidden, dec_attn)
+        decoder_outputs = self.decoder(
+            input_ids=decoder_input_ids,
+            attention_mask=decoder_attention_mask,
+            encoder_hidden_states=encoder_outputs[0],
+            # The encoder-side padding mask is reused as the cross-attention mask.
+            encoder_attention_mask=attention_mask,
+            head_mask=decoder_head_mask,
+            cross_attn_head_mask=cross_attn_head_mask,
+            past_key_values=past_key_values,
+            inputs_embeds=decoder_inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            cache_position=cache_position,
+        )
+
+        if not return_dict:
+            # Tuple mode: decoder entries first, encoder entries appended after them.
+            return decoder_outputs + encoder_outputs
+
+        return LEDSeq2SeqModelOutput(
+            last_hidden_state=decoder_outputs.last_hidden_state,
+            past_key_values=decoder_outputs.past_key_values,
+            decoder_hidden_states=decoder_outputs.hidden_states,
+            decoder_attentions=decoder_outputs.attentions,
+            cross_attentions=decoder_outputs.cross_attentions,
+            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
+            encoder_hidden_states=encoder_outputs.hidden_states,
+            encoder_attentions=encoder_outputs.attentions,
+            encoder_global_attentions=encoder_outputs.global_attentions,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    The LED Model with a language modeling head. Can be used for summarization.
+    """
+)
+class LEDForConditionalGeneration(LEDPreTrainedModel, GenerationMixin):
+    base_model_prefix = "led"
+    _keys_to_ignore_on_load_missing = ["final_logits_bias"]
+    _tied_weights_keys = ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight", "lm_head.weight"]
+
+    def __init__(self, config: LEDConfig):
+        super().__init__(config)
+        self.led = LEDModel(config)
+        # Per-token bias that `forward` adds to the LM head output; kept as a buffer, one entry per
+        # row of the shared embedding.
+        self.register_buffer("final_logits_bias", torch.zeros((1, self.led.shared.num_embeddings)))
+        self.lm_head = nn.Linear(config.d_model, self.led.shared.num_embeddings, bias=False)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_encoder(self):
+        return self.led.get_encoder()
+
+    def get_decoder(self):
+        return self.led.get_decoder()
+
+    def resize_token_embeddings(
+        self, new_num_tokens: int, pad_to_multiple_of: Optional[int] = None, mean_resizing: bool = True
+    ) -> nn.Embedding:
+        """Resize the token embeddings via the parent class, then resize `final_logits_bias` to match."""
+        new_embeddings = super().resize_token_embeddings(new_num_tokens, pad_to_multiple_of, mean_resizing)
+        # Use the actual row count of the returned embedding rather than `new_num_tokens`.
+        self._resize_final_logits_bias(new_embeddings.weight.shape[0])
+        return new_embeddings
+
+    def _resize_final_logits_bias(self, new_num_tokens: int) -> None:
+        """Truncate or zero-pad `final_logits_bias` so its last dimension equals `new_num_tokens`."""
+        old_num_tokens = self.final_logits_bias.shape[-1]
+        if new_num_tokens <= old_num_tokens:
+            new_bias = self.final_logits_bias[:, :new_num_tokens]
+        else:
+            # `new_zeros` inherits dtype and device from the existing buffer, so the padding
+            # matches the buffer's dtype when it is not float32.
+            extra_bias = self.final_logits_bias.new_zeros((1, new_num_tokens - old_num_tokens))
+            new_bias = torch.cat([self.final_logits_bias, extra_bias], dim=1)
+        self.register_buffer("final_logits_bias", new_bias)
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        decoder_input_ids: Optional[torch.LongTensor] = None,
+        decoder_attention_mask: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        decoder_head_mask: Optional[torch.Tensor] = None,
+        cross_attn_head_mask: Optional[torch.Tensor] = None,
+        encoder_outputs: Optional[tuple[tuple[torch.FloatTensor]]] = None,
+        global_attention_mask: Optional[torch.FloatTensor] = None,
+        past_key_values: Optional[tuple[tuple[torch.FloatTensor]]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.Tensor] = None,
+    ) -> Union[tuple[torch.Tensor], LEDSeq2SeqLMOutput]:
+        r"""
+        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
+            Indices of decoder input sequence tokens in the vocabulary.
+
+            Indices can be obtained using [`LedTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+
+            LED uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
+            is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).
+        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
+            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
+            be used by default.
+
+            If you want to change padding behavior, you should read [`modeling_led._prepare_decoder_inputs`] and modify
+            to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more information on the
+            default strategy.
+        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in `[0,
+            1]`:
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+        global_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to decide the attention given on each token, local attention or global attention for the encoder.
+            Tokens with global attention attends to all other tokens, and all other tokens attend to them. This is
+            important for task-specific finetuning because it makes the model more flexible at representing the task.
+            For example, for classification, the <s> token should be given global attention. For QA, all question
+            tokens should also have global attention. Please refer to the [Longformer
+            paper](https://huggingface.co/papers/2004.05150) for more details. Mask values selected in `[0, 1]`:
+
+            - 0 for local attention (a sliding window attention),
+            - 1 for global attention (tokens that attend to all other tokens, and all other tokens attend to them).
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+        Example Summarization:
+
+        ```python
+        >>> import torch
+        >>> from transformers import AutoTokenizer, LEDForConditionalGeneration
+
+        >>> model = LEDForConditionalGeneration.from_pretrained("allenai/led-large-16384-arxiv")
+        >>> tokenizer = AutoTokenizer.from_pretrained("allenai/led-large-16384-arxiv")
+
+        >>> ARTICLE_TO_SUMMARIZE = '''Transformers (Vaswani et al., 2017) have achieved state-of-the-art
+        ...     results in a wide range of natural language tasks including generative language modeling
+        ...     (Dai et al., 2019; Radford et al., 2019) and discriminative ... language understanding (Devlin et al., 2019).
+        ...     This success is partly due to the self-attention component which enables the network to capture contextual
+        ...     information from the entire sequence. While powerful, the memory and computational requirements of
+        ...     self-attention grow quadratically with sequence length, making it infeasible (or very expensive) to
+        ...     process long sequences. To address this limitation, we present Longformer, a modified Transformer
+        ...     architecture with a self-attention operation that scales linearly with the sequence length, making it
+        ...     versatile for processing long documents (Fig 1). This is an advantage for natural language tasks such as
+        ...     long document classification, question answering (QA), and coreference resolution, where existing approaches
+        ...     partition or shorten the long context into smaller sequences that fall within the typical 512 token limit
+        ...     of BERT-style pretrained models. Such partitioning could potentially result in loss of important
+        ...     cross-partition information, and to mitigate this problem, existing methods often rely on complex
+        ...     architectures to address such interactions. On the other hand, our proposed Longformer is able to build
+        ...     contextual representations of the entire context using multiple layers of attention, reducing the need for
+        ...     task-specific architectures.'''
+        >>> inputs = tokenizer.encode(ARTICLE_TO_SUMMARIZE, return_tensors="pt")
+
+        >>> # Global attention on the first token (cf. Beltagy et al. 2020)
+        >>> global_attention_mask = torch.zeros_like(inputs)
+        >>> global_attention_mask[:, 0] = 1
+
+        >>> # Generate Summary
+        >>> summary_ids = model.generate(inputs, global_attention_mask=global_attention_mask, num_beams=3, max_length=32)
+        >>> print(tokenizer.decode(summary_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=True))
+        ```
+
+        Example Conditional generation :
+
+        ```python
+        >>> from transformers import AutoTokenizer, LEDForConditionalGeneration
+
+        >>> tokenizer = AutoTokenizer.from_pretrained("allenai/led-base-16384")
+        >>> TXT = "My friends are <mask> but they eat too many carbs."
+
+        >>> model = LEDForConditionalGeneration.from_pretrained("allenai/led-base-16384")
+        >>> input_ids = tokenizer([TXT], return_tensors="pt")["input_ids"]
+
+        >>> prediction = model.generate(input_ids)[0]
+        >>> print(tokenizer.decode(prediction, skip_special_tokens=True))
+        ```
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if labels is not None:
+            # Caching is always switched off when labels are given; warn only if the caller
+            # explicitly asked for it.
+            if use_cache:
+                logger.warning("The `use_cache` argument is changed to `False` since `labels` is provided.")
+            use_cache = False
+            # With no explicit decoder inputs, derive them from the labels.
+            if decoder_input_ids is None and decoder_inputs_embeds is None:
+                decoder_input_ids = shift_tokens_right(
+                    labels, self.config.pad_token_id, self.config.decoder_start_token_id
+                )
+
+        outputs = self.led(
+            input_ids,
+            attention_mask=attention_mask,
+            decoder_input_ids=decoder_input_ids,
+            decoder_attention_mask=decoder_attention_mask,
+            encoder_outputs=encoder_outputs,
+            global_attention_mask=global_attention_mask,
+            head_mask=head_mask,
+            decoder_head_mask=decoder_head_mask,
+            cross_attn_head_mask=cross_attn_head_mask,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            decoder_inputs_embeds=decoder_inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            cache_position=cache_position,
+        )
+        # `outputs[0]` is the decoder's last hidden state in both tuple and dataclass mode.
+        lm_logits = self.lm_head(outputs[0]) + self.final_logits_bias
+
+        masked_lm_loss = None
+        if labels is not None:
+            # Put the labels on the logits' device before computing the loss; this is a no-op
+            # when they already live on the same device.
+            labels = labels.to(lm_logits.device)
+            loss_fct = CrossEntropyLoss()
+            masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1))
+
+        if not return_dict:
+            # Tuple mode: swap the hidden state for the logits and prepend the loss when present.
+            output = (lm_logits,) + outputs[1:]
+            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
+
+        return LEDSeq2SeqLMOutput(
+            loss=masked_lm_loss,
+            logits=lm_logits,
+            past_key_values=outputs.past_key_values,
+            decoder_hidden_states=outputs.decoder_hidden_states,
+            decoder_attentions=outputs.decoder_attentions,
+            cross_attentions=outputs.cross_attentions,
+            encoder_last_hidden_state=outputs.encoder_last_hidden_state,
+            encoder_hidden_states=outputs.encoder_hidden_states,
+            encoder_attentions=outputs.encoder_attentions,
+            encoder_global_attentions=outputs.encoder_global_attentions,
+        )
+
+    def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
+        """Build decoder input ids from `labels` with `shift_tokens_right`, as `forward` does."""
+        return shift_tokens_right(labels, self.config.pad_token_id, self.config.decoder_start_token_id)
+
+
+@auto_docstring(
+    custom_intro="""
+    LED model with a sequence classification/head on top (a linear layer on top of the pooled output) e.g. for GLUE
+    tasks.
+    """
+)
+class LEDForSequenceClassification(LEDPreTrainedModel):
+    _tied_weights_keys = ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight"]
+
+    def __init__(self, config: LEDConfig, **kwargs):
+        # Emit the deprecation notice before any module is built.
+        deprecation_message = (
+            "The `transformers.LEDForSequenceClassification` class is deprecated and will be removed in version 5 of"
+            " Transformers. No actual method were provided in the original paper on how to perform"
+            " sequence classification."
+        )
+        warnings.warn(deprecation_message, FutureWarning)
+        super().__init__(config, **kwargs)
+
+        self.led = LEDModel(config)
+        # Head arguments, in order: input dim, inner dim, number of classes, dropout.
+        self.classification_head = LEDClassificationHead(
+            config.d_model,
+            config.d_model,
+            config.num_labels,
+            config.classifier_dropout,
+        )
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        decoder_input_ids: Optional[torch.LongTensor] = None,
+        decoder_attention_mask: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        decoder_head_mask: Optional[torch.Tensor] = None,
+        cross_attn_head_mask: Optional[torch.Tensor] = None,
+        encoder_outputs: Optional[tuple[tuple[torch.FloatTensor]]] = None,
+        global_attention_mask: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple[torch.Tensor], LEDSeq2SeqSequenceClassifierOutput]:
+        r"""
+        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
+            Indices of decoder input sequence tokens in the vocabulary.
+
+            Indices can be obtained using [`LedTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+
+            LED uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
+            is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).
+        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
+            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
+            be used by default.
+
+            If you want to change padding behavior, you should read [`modeling_led._prepare_decoder_inputs`] and modify
+            to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more information on the
+            default strategy.
+        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in `[0,
+            1]`:
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+        global_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to decide the attention given on each token, local attention or global attention for the encoder.
+            Tokens with global attention attends to all other tokens, and all other tokens attend to them. This is
+            important for task-specific finetuning because it makes the model more flexible at representing the task.
+            For example, for classification, the <s> token should be given global attention. For QA, all question
+            tokens should also have global attention. Please refer to the [Longformer
+            paper](https://huggingface.co/papers/2004.05150) for more details. Mask values selected in `[0, 1]`:
+
+            - 0 for local attention (a sliding window attention),
+            - 1 for global attention (tokens that attend to all other tokens, and all other tokens attend to them).
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+        """
+        if return_dict is None:
+            return_dict = self.config.use_return_dict
+        # Caching is always switched off when labels are supplied.
+        if labels is not None:
+            use_cache = False
+
+        # Pooling below indexes `input_ids`, so an embeddings-only call cannot be served.
+        if input_ids is None and inputs_embeds is not None:
+            raise NotImplementedError(
+                f"Passing input embeddings is currently not supported for {self.__class__.__name__}"
+            )
+
+        outputs = self.led(
+            input_ids,
+            attention_mask=attention_mask,
+            decoder_input_ids=decoder_input_ids,
+            decoder_attention_mask=decoder_attention_mask,
+            global_attention_mask=global_attention_mask,
+            head_mask=head_mask,
+            decoder_head_mask=decoder_head_mask,
+            cross_attn_head_mask=cross_attn_head_mask,
+            encoder_outputs=encoder_outputs,
+            inputs_embeds=inputs_embeds,
+            decoder_inputs_embeds=decoder_inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        last_hidden = outputs[0]  # decoder's last hidden state
+
+        # Boolean mask of <eos> positions in the input, moved to the hidden states' device.
+        eos_mask = input_ids.eq(self.config.eos_token_id).to(last_hidden.device)
+
+        # The reshape below needs every row to contain the same number of <eos> tokens.
+        eos_per_example = eos_mask.sum(1)
+        if len(torch.unique_consecutive(eos_per_example)) > 1:
+            raise ValueError("All examples must have the same number of <eos> tokens.")
+
+        # Gather the <eos> hidden states per example and keep the last one as the pooled vector.
+        eos_states = last_hidden[eos_mask, :].view(last_hidden.size(0), -1, last_hidden.size(-1))
+        sentence_representation = eos_states[:, -1, :]
+        logits = self.classification_head(sentence_representation)
+
+        loss = None
+        if labels is not None:
+            cfg = self.config
+            # Infer the problem type once from the label count and dtype, and store it on the config.
+            if cfg.problem_type is None:
+                if cfg.num_labels == 1:
+                    cfg.problem_type = "regression"
+                elif cfg.num_labels > 1 and labels.dtype in (torch.long, torch.int):
+                    cfg.problem_type = "single_label_classification"
+                else:
+                    cfg.problem_type = "multi_label_classification"
+
+            problem_type = cfg.problem_type
+            if problem_type == "regression":
+                criterion = MSELoss()
+                if cfg.num_labels == 1:
+                    loss = criterion(logits.squeeze(), labels.squeeze())
+                else:
+                    loss = criterion(logits, labels)
+            elif problem_type == "single_label_classification":
+                criterion = CrossEntropyLoss()
+                loss = criterion(logits.view(-1, cfg.num_labels), labels.view(-1))
+            elif problem_type == "multi_label_classification":
+                criterion = BCEWithLogitsLoss()
+                loss = criterion(logits, labels)
+
+        if not return_dict:
+            # Tuple mode: logits replace the hidden state; the loss is prepended only when computed.
+            output = (logits,) + outputs[1:]
+            if loss is None:
+                return output
+            return (loss,) + output
+
+        return LEDSeq2SeqSequenceClassifierOutput(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            decoder_hidden_states=outputs.decoder_hidden_states,
+            decoder_attentions=outputs.decoder_attentions,
+            cross_attentions=outputs.cross_attentions,
+            encoder_last_hidden_state=outputs.encoder_last_hidden_state,
+            encoder_hidden_states=outputs.encoder_hidden_states,
+            encoder_attentions=outputs.encoder_attentions,
+            encoder_global_attentions=outputs.encoder_global_attentions,
+        )
+
+
+@auto_docstring
+class LEDForQuestionAnswering(LEDPreTrainedModel):  # LEDModel followed by a linear head that emits span start/end logits
+    _tied_weights_keys = ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight"]
+
+    def __init__(self, config):
+        super().__init__(config)
+
+        config.num_labels = 2  # one logit for span start, one for span end; note this overwrites the caller's config
+        self.num_labels = config.num_labels
+
+        self.led = LEDModel(config)
+        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        decoder_input_ids: Optional[torch.LongTensor] = None,
+        decoder_attention_mask: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        decoder_head_mask: Optional[torch.Tensor] = None,
+        cross_attn_head_mask: Optional[torch.Tensor] = None,
+        encoder_outputs: Optional[tuple[tuple[torch.FloatTensor]]] = None,
+        global_attention_mask: Optional[torch.FloatTensor] = None,
+        start_positions: Optional[torch.LongTensor] = None,
+        end_positions: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple[torch.Tensor], LEDSeq2SeqQuestionAnsweringModelOutput]:
+        r"""
+        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
+            Indices of decoder input sequence tokens in the vocabulary.
+
+            Indices can be obtained using [`LedTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+
+            LED uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
+            is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).
+        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
+            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
+            be used by default.
+
+            If you want to change padding behavior, you should read [`modeling_led._prepare_decoder_inputs`] and modify
+            to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more information on the
+            default strategy.
+        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in `[0,
+            1]`:
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+        global_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to decide the attention given on each token, local attention or global attention for the encoder.
+            Tokens with global attention attends to all other tokens, and all other tokens attend to them. This is
+            important for task-specific finetuning because it makes the model more flexible at representing the task.
+            For example, for classification, the <s> token should be given global attention. For QA, all question
+            tokens should also have global attention. Please refer to the [Longformer
+            paper](https://huggingface.co/papers/2004.05150) for more details. Mask values selected in `[0, 1]`:
+
+            - 0 for local attention (a sliding window attention),
+            - 1 for global attention (tokens that attend to all other tokens, and all other tokens attend to them).
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        if start_positions is not None and end_positions is not None:
+            use_cache = False  # both labels given (a loss will be computed): force caching off for this call
+
+        outputs = self.led(
+            input_ids,
+            attention_mask=attention_mask,
+            decoder_input_ids=decoder_input_ids,
+            decoder_attention_mask=decoder_attention_mask,
+            global_attention_mask=global_attention_mask,
+            head_mask=head_mask,
+            decoder_head_mask=decoder_head_mask,
+            cross_attn_head_mask=cross_attn_head_mask,
+            encoder_outputs=encoder_outputs,
+            inputs_embeds=inputs_embeds,
+            decoder_inputs_embeds=decoder_inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        sequence_output = outputs[0]  # first LEDModel output (presumably the decoder's last hidden state — confirm)
+
+        logits = self.qa_outputs(sequence_output)
+        start_logits, end_logits = logits.split(1, dim=-1)  # the two output channels become start / end scores
+        start_logits = start_logits.squeeze(-1).contiguous()
+        end_logits = end_logits.squeeze(-1).contiguous()
+
+        total_loss = None
+        if start_positions is not None and end_positions is not None:
+            # Labels may arrive with an extra trailing dimension (the original note blames multi-GPU splitting); drop it
+            if len(start_positions.size()) > 1:
+                start_positions = start_positions.squeeze(-1)
+            if len(end_positions.size()) > 1:
+                end_positions = end_positions.squeeze(-1)
+            # Positions past the logits' dim 1 are clamped to `ignored_index`, which the loss skips; negatives clamp to 0
+            ignored_index = start_logits.size(1)
+            start_positions = start_positions.clamp(0, ignored_index)
+            end_positions = end_positions.clamp(0, ignored_index)
+
+            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
+            start_loss = loss_fct(start_logits, start_positions)
+            end_loss = loss_fct(end_logits, end_positions)
+            total_loss = (start_loss + end_loss) / 2  # mean of the start and end cross-entropy terms
+
+        if not return_dict:
+            output = (
+                start_logits,
+                end_logits,
+            ) + outputs[1:]
+            return ((total_loss,) + output) if total_loss is not None else output
+
+        return LEDSeq2SeqQuestionAnsweringModelOutput(
+            loss=total_loss,
+            start_logits=start_logits,
+            end_logits=end_logits,
+            past_key_values=outputs.past_key_values,
+            decoder_hidden_states=outputs.decoder_hidden_states,
+            decoder_attentions=outputs.decoder_attentions,
+            cross_attentions=outputs.cross_attentions,
+            encoder_last_hidden_state=outputs.encoder_last_hidden_state,
+            encoder_hidden_states=outputs.encoder_hidden_states,
+            encoder_attentions=outputs.encoder_attentions,
+            encoder_global_attentions=outputs.encoder_global_attentions,
+        )
+
+
+__all__ = [  # names this module exports via `from ... import *`
+    "LEDForConditionalGeneration",
+    "LEDForQuestionAnswering",
+    "LEDForSequenceClassification",
+    "LEDModel",
+    "LEDPreTrainedModel",
+]
diff --git a/phivenv/Lib/site-packages/transformers/models/led/modeling_tf_led.py b/phivenv/Lib/site-packages/transformers/models/led/modeling_tf_led.py
new file mode 100644
index 0000000000000000000000000000000000000000..f499ffac30c9f8324a3d16c1bb4a0ad9638337bc
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/led/modeling_tf_led.py
@@ -0,0 +1,2663 @@
+# coding=utf-8
+# Copyright 2021 Iz Beltagy, Matthew E. Peters, Arman Cohan and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""TF 2.0 LED model."""
+
+from __future__ import annotations
+
+import random
+from dataclasses import dataclass
+
+import numpy as np
+import tensorflow as tf
+
+from ...activations_tf import get_tf_activation
+from ...modeling_tf_outputs import TFBaseModelOutputWithPastAndCrossAttentions
+
+# Public API
+from ...modeling_tf_utils import (
+    TFModelInputType,
+    TFPreTrainedModel,
+    get_initializer,
+    keras,
+    keras_serializable,
+    unpack_inputs,
+)
+from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax
+from ...utils import (
+    ModelOutput,
+    add_code_sample_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+    replace_return_docstrings,
+)
+from .configuration_led import LEDConfig
+
+
+logger = logging.get_logger(__name__)  # module-level logger from the package's own `logging` helper
+
+_CHECKPOINT_FOR_DOC = "allenai/led-base-16384"  # presumably the checkpoint shown in generated doc samples — confirm
+_CONFIG_FOR_DOC = "LEDConfig"  # presumably the config class name shown in generated docs — confirm
+
+
+LARGE_NEGATIVE = -1e8  # additive score for masked attention positions; a finite stand-in for -inf
+
+
+# Copied from transformers.models.bart.modeling_tf_bart.shift_tokens_right
+def shift_tokens_right(input_ids: tf.Tensor, pad_token_id: int, decoder_start_token_id: int):
+    pad_token_id = tf.cast(pad_token_id, input_ids.dtype)
+    decoder_start_token_id = tf.cast(decoder_start_token_id, input_ids.dtype)
+    start_tokens = tf.fill(  # one column of start tokens, shape (batch, 1)
+        (shape_list(input_ids)[0], 1), tf.convert_to_tensor(decoder_start_token_id, input_ids.dtype)
+    )
+    shifted_input_ids = tf.concat([start_tokens, input_ids[:, :-1]], -1)  # prepend start column, drop last input column
+    # replace possible -100 values in labels by `pad_token_id`
+    shifted_input_ids = tf.where(
+        shifted_input_ids == -100,
+        tf.fill(shape_list(shifted_input_ids), tf.convert_to_tensor(pad_token_id, input_ids.dtype)),
+        shifted_input_ids,
+    )
+
+    # After the -100 replacement every id must be >= 0, i.e. `labels` may only hold non-negative ids and -100
+    assert_gte0 = tf.debugging.assert_greater_equal(shifted_input_ids, tf.constant(0, dtype=input_ids.dtype))
+
+    # Make sure the assertion op is called by wrapping the result in an identity no-op
+    with tf.control_dependencies([assert_gte0]):
+        shifted_input_ids = tf.identity(shifted_input_ids)
+
+    return shifted_input_ids
+
+
+# Copied from transformers.models.bart.modeling_tf_bart._make_causal_mask
+def _make_causal_mask(input_ids_shape: tf.TensorShape, past_key_values_length: int = 0):
+    """
+    Build an additive causal mask of shape `[bsz, 1, tgt_len, past_key_values_length + tgt_len]`.
+    """
+    batch, target_len = input_ids_shape[0], input_ids_shape[1]
+    # start fully blocked, then open every key position j <= i for query row i
+    blocked = tf.ones((target_len, target_len)) * LARGE_NEGATIVE
+    positions = tf.range(shape_list(blocked)[-1])
+    next_positions = tf.reshape(positions + 1, (shape_list(blocked)[-1], 1))
+    causal = tf.where(positions < next_positions, 0.0, blocked)
+
+    if past_key_values_length > 0:
+        # the `past_key_values_length` leading columns are left fully open (all zeros)
+        causal = tf.concat([tf.zeros((target_len, past_key_values_length)), causal], axis=-1)
+    return tf.tile(causal[None, None, :, :], (batch, 1, 1, 1))
+
+
+# Copied from transformers.models.bart.modeling_tf_bart._expand_mask
+def _expand_mask(mask: tf.Tensor, tgt_len: int | None = None):
+    """
+    Turn a `[bsz, src_len]` keep-mask (1 = attend) into an additive `[bsz, 1, tgt_len, src_len]` bias.
+    """
+    src_len = shape_list(mask)[1]
+    if tgt_len is None:
+        tgt_len = src_len  # default to one query row per source position
+    unit = tf.constant(1.0)
+    keep = tf.tile(tf.cast(mask, dtype=unit.dtype)[:, None, None, :], (1, 1, tgt_len, 1))
+    # kept positions contribute 0, dropped positions contribute LARGE_NEGATIVE
+    return (unit - keep) * LARGE_NEGATIVE
+
+
+class TFLEDLearnedPositionalEmbedding(keras.layers.Embedding):
+    """
+    Embedding table indexed by absolute position, holding `num_embeddings` learned rows.
+    """
+
+    def __init__(self, num_embeddings: int, embedding_dim: int, **kwargs):
+        super().__init__(num_embeddings, embedding_dim, **kwargs)
+
+    def call(self, input_shape: tf.TensorShape, past_key_values_length: int = 0):
+        """Embed positions for a `[bsz, seq_len]` input shape, offset by `past_key_values_length`."""
+        seq_len = input_shape[1]
+        # positions come from the shape alone: 0 .. seq_len - 1, each shifted by `past_key_values_length`
+        offsets = tf.range(seq_len, delta=1, name="range") + past_key_values_length
+
+        return super().call(tf.cast(offsets, dtype=tf.int32))
+
+
+# Copied from transformers.models.longformer.modeling_tf_longformer.TFLongformerSelfAttention with TFLongformer->TFLEDEncoder
+class TFLEDEncoderSelfAttention(keras.layers.Layer):
+    def __init__(self, config, layer_id, **kwargs):  # `layer_id` selects this layer's entry in `config.attention_window`
+        super().__init__(**kwargs)
+        self.config = config
+
+        if config.hidden_size % config.num_attention_heads != 0:
+            raise ValueError(
+                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
+                f"heads ({config.num_attention_heads})"
+            )
+
+        self.num_heads = config.num_attention_heads
+        self.head_dim = int(config.hidden_size / config.num_attention_heads)
+        self.embed_dim = config.hidden_size
+        self.query = keras.layers.Dense(
+            self.embed_dim,
+            kernel_initializer=get_initializer(config.initializer_range),
+            name="query",
+        )
+        self.key = keras.layers.Dense(
+            self.embed_dim,
+            kernel_initializer=get_initializer(config.initializer_range),
+            name="key",
+        )
+        self.value = keras.layers.Dense(
+            self.embed_dim,
+            kernel_initializer=get_initializer(config.initializer_range),
+            name="value",
+        )
+
+        # separate projection layers for tokens with global attention
+        self.query_global = keras.layers.Dense(
+            self.embed_dim,
+            kernel_initializer=get_initializer(config.initializer_range),
+            name="query_global",
+        )
+        self.key_global = keras.layers.Dense(
+            self.embed_dim,
+            kernel_initializer=get_initializer(config.initializer_range),
+            name="key_global",
+        )
+        self.value_global = keras.layers.Dense(
+            self.embed_dim,
+            kernel_initializer=get_initializer(config.initializer_range),
+            name="value_global",
+        )
+        self.dropout = keras.layers.Dropout(config.attention_probs_dropout_prob)
+        self.global_dropout = keras.layers.Dropout(config.attention_probs_dropout_prob)
+        self.layer_id = layer_id
+        attention_window = config.attention_window[self.layer_id]
+        # NOTE: the window checks stay as asserts so the exception type callers may see (AssertionError) is unchanged
+        assert attention_window % 2 == 0, (
+            f"`attention_window` for layer {self.layer_id} has to be an even value. Given {attention_window}"
+        )
+        assert attention_window > 0, (
+            f"`attention_window` for layer {self.layer_id} has to be positive. Given {attention_window}"
+        )
+
+        self.one_sided_attn_window_size = attention_window // 2  # half the window: the reach on each side of a token
+
+    def build(self, input_shape=None):
+        if not self.built:  # first call only: build the three global projections under fixed, literal name scopes
+            with tf.name_scope("query_global"):
+                self.query_global.build((self.config.hidden_size,))
+            with tf.name_scope("key_global"):
+                self.key_global.build((self.config.hidden_size,))
+            with tf.name_scope("value_global"):
+                self.value_global.build((self.config.hidden_size,))
+
+        if self.built:  # repeated calls stop here
+            return
+        self.built = True
+        if getattr(self, "query", None) is not None:  # same pattern for every sub-layer: build inside its own name scope
+            with tf.name_scope(self.query.name):
+                self.query.build([None, None, self.config.hidden_size])
+        if getattr(self, "key", None) is not None:
+            with tf.name_scope(self.key.name):
+                self.key.build([None, None, self.config.hidden_size])
+        if getattr(self, "value", None) is not None:
+            with tf.name_scope(self.value.name):
+                self.value.build([None, None, self.config.hidden_size])
+        if getattr(self, "query_global", None) is not None:  # NOTE(review): *_global layers get a second build() here, now with a rank-3 shape — confirm intended
+            with tf.name_scope(self.query_global.name):
+                self.query_global.build([None, None, self.config.hidden_size])
+        if getattr(self, "key_global", None) is not None:
+            with tf.name_scope(self.key_global.name):
+                self.key_global.build([None, None, self.config.hidden_size])
+        if getattr(self, "value_global", None) is not None:
+            with tf.name_scope(self.value_global.name):
+                self.value_global.build([None, None, self.config.hidden_size])
+
+    def call(
+        self,
+        inputs,
+        training=False,
+    ):
+        """
+        Sliding-window self-attention, optionally combined with global attention.
+
+        `inputs` is the 6-tuple `(hidden_states, attention_mask, layer_head_mask, is_index_masked,
+        is_index_global_attn, is_global_attn)`. `seq_len` must be a multiple of `2 * one_sided_attn_window_size`
+        (asserted in `_sliding_chunks_query_key_matmul`); per the Longformer docstring this was copied from, the
+        caller pads for that and encodes `attention_mask` as -10000 (no attention), 0 (local), +10000 (global).
+
+        Returns `(attn_output, attn_probs, global_attn_probs)`.
+        """
+        # retrieve input args
+        (
+            hidden_states,
+            attention_mask,
+            layer_head_mask,
+            is_index_masked,
+            is_index_global_attn,
+            is_global_attn,
+        ) = inputs
+
+        # project hidden states
+        query_vectors = self.query(hidden_states)
+        key_vectors = self.key(hidden_states)
+        value_vectors = self.value(hidden_states)
+        batch_size, seq_len, embed_dim = shape_list(hidden_states)
+
+        tf.debugging.assert_equal(
+            embed_dim,
+            self.embed_dim,
+            message=f"hidden_states should have embed_dim = {self.embed_dim}, but has {embed_dim}",
+        )
+
+        # normalize query
+        query_vectors /= tf.math.sqrt(tf.cast(self.head_dim, dtype=query_vectors.dtype))
+        query_vectors = tf.reshape(query_vectors, (batch_size, seq_len, self.num_heads, self.head_dim))
+        key_vectors = tf.reshape(key_vectors, (batch_size, seq_len, self.num_heads, self.head_dim))
+
+        # attn_probs = (batch_size, seq_len, num_heads, window*2+1)
+        attn_scores = self._sliding_chunks_query_key_matmul(
+            query_vectors, key_vectors, self.one_sided_attn_window_size
+        )
+
+        # values to pad for attention probs
+        remove_from_windowed_attention_mask = attention_mask != 0
+        # cast to the query dtype and scale, so flagged positions carry LARGE_NEGATIVE (a finite stand-in for -inf)
+        float_mask = tf.cast(remove_from_windowed_attention_mask, dtype=query_vectors.dtype) * LARGE_NEGATIVE
+
+        # diagonal mask with zeros everywhere and large negative values in place of padding
+        diagonal_mask = self._sliding_chunks_query_key_matmul(
+            tf.ones(shape_list(attention_mask)),
+            float_mask,
+            self.one_sided_attn_window_size,
+        )
+
+        # add the mask to the local attention scores
+        attn_scores += diagonal_mask
+
+        tf.debugging.assert_equal(
+            shape_list(attn_scores),
+            [batch_size, seq_len, self.num_heads, self.one_sided_attn_window_size * 2 + 1],
+            message=(
+                f"attn_probs should be of size ({batch_size}, {seq_len}, {self.num_heads},"
+                f" {self.one_sided_attn_window_size * 2 + 1}), but is of size {shape_list(attn_scores)}"
+            ),
+        )
+
+        # compute global attn indices required through out forward fn
+        (
+            max_num_global_attn_indices,
+            is_index_global_attn_nonzero,
+            is_local_index_global_attn_nonzero,
+            is_local_index_no_global_attn_nonzero,
+        ) = self._get_global_attn_indices(is_index_global_attn)
+
+        # this function is only relevant for global attention
+        if is_global_attn:
+            attn_scores = self._concat_with_global_key_attn_probs(
+                attn_scores=attn_scores,
+                query_vectors=query_vectors,
+                key_vectors=key_vectors,
+                max_num_global_attn_indices=max_num_global_attn_indices,
+                is_index_global_attn_nonzero=is_index_global_attn_nonzero,
+                is_local_index_global_attn_nonzero=is_local_index_global_attn_nonzero,
+                is_local_index_no_global_attn_nonzero=is_local_index_no_global_attn_nonzero,
+            )
+
+        attn_probs = stable_softmax(attn_scores, axis=-1)
+
+        # softmax sometimes inserts NaN if all positions are masked, replace them with 0
+        # Make sure to create a mask with the proper shape:
+        # if is_global_attn==True => [batch_size, seq_len, self.num_heads, self.one_sided_attn_window_size * 2 + max_num_global_attn_indices + 1]
+        # if is_global_attn==False => [batch_size, seq_len, self.num_heads, self.one_sided_attn_window_size * 2 + 1]
+        if is_global_attn:
+            masked_index = tf.tile(
+                is_index_masked[:, :, None, None],
+                (1, 1, self.num_heads, self.one_sided_attn_window_size * 2 + max_num_global_attn_indices + 1),
+            )
+        else:
+            masked_index = tf.tile(
+                is_index_masked[:, :, None, None],
+                (1, 1, self.num_heads, self.one_sided_attn_window_size * 2 + 1),
+            )
+        attn_probs = tf.where(
+            masked_index,
+            tf.zeros(shape_list(masked_index), dtype=attn_probs.dtype),
+            attn_probs,
+        )
+
+        if layer_head_mask is not None:
+            tf.debugging.assert_equal(
+                shape_list(layer_head_mask),
+                [self.num_heads],
+                message=(
+                    f"Head mask for a single layer should be of size {(self.num_heads)}, but is"
+                    f" {shape_list(layer_head_mask)}"
+                ),
+            )
+
+            attn_probs = tf.reshape(layer_head_mask, (1, 1, -1, 1)) * attn_probs
+
+        # apply dropout
+        attn_probs = self.dropout(attn_probs, training=training)
+        value_vectors = tf.reshape(value_vectors, (batch_size, seq_len, self.num_heads, self.head_dim))
+
+        # if global attention, compute sum of global and local attn
+
+        if is_global_attn:
+            attn_output = self._compute_attn_output_with_global_indices(
+                value_vectors=value_vectors,
+                attn_probs=attn_probs,
+                max_num_global_attn_indices=max_num_global_attn_indices,
+                is_index_global_attn_nonzero=is_index_global_attn_nonzero,
+                is_local_index_global_attn_nonzero=is_local_index_global_attn_nonzero,
+            )
+        else:
+            attn_output = self._sliding_chunks_matmul_attn_probs_value(
+                attn_probs, value_vectors, self.one_sided_attn_window_size
+            )
+
+        tf.debugging.assert_equal(
+            shape_list(attn_output), [batch_size, seq_len, self.num_heads, self.head_dim], message="Unexpected size"
+        )
+
+        attn_output = tf.reshape(attn_output, (batch_size, seq_len, embed_dim))
+
+        # compute value for global attention and overwrite to attention output
+        if is_global_attn:
+            attn_output, global_attn_probs = self._compute_global_attn_output_from_hidden(
+                attn_output=attn_output,
+                hidden_states=hidden_states,
+                max_num_global_attn_indices=max_num_global_attn_indices,
+                layer_head_mask=layer_head_mask,
+                is_local_index_global_attn_nonzero=is_local_index_global_attn_nonzero,
+                is_index_global_attn_nonzero=is_index_global_attn_nonzero,
+                is_local_index_no_global_attn_nonzero=is_local_index_no_global_attn_nonzero,
+                is_index_masked=is_index_masked,
+                training=training,
+            )
+        else:
+            # Leave attn_output unchanged; the global probabilities are returned as an all-zero placeholder
+            global_attn_probs = tf.zeros((batch_size, self.num_heads, max_num_global_attn_indices, seq_len))
+
+        # make sure that local attention probabilities are set to 0 for indices of global attn
+        # Make sure to create a mask with the proper shape:
+        # if is_global_attn==True => [batch_size, seq_len, self.num_heads, self.one_sided_attn_window_size * 2 + max_num_global_attn_indices + 1]
+        # if is_global_attn==False => [batch_size, seq_len, self.num_heads, self.one_sided_attn_window_size * 2 + 1]
+        if is_global_attn:
+            masked_global_attn_index = tf.tile(
+                is_index_global_attn[:, :, None, None],
+                (1, 1, self.num_heads, self.one_sided_attn_window_size * 2 + max_num_global_attn_indices + 1),
+            )
+        else:
+            masked_global_attn_index = tf.tile(
+                is_index_global_attn[:, :, None, None],
+                (1, 1, self.num_heads, self.one_sided_attn_window_size * 2 + 1),
+            )
+        attn_probs = tf.where(
+            masked_global_attn_index,
+            tf.zeros(shape_list(masked_global_attn_index), dtype=attn_probs.dtype),
+            attn_probs,
+        )
+
+        outputs = (attn_output, attn_probs, global_attn_probs)
+
+        return outputs
+
+    def _sliding_chunks_query_key_matmul(self, query, key, window_overlap):
+        """
+        Matrix multiplication of query and key tensors with a sliding window attention pattern. The input is split
+        into overlapping chunks of size 2w (e.g. 512 for pretrained Longformer) with an overlap of size window_overlap.
+        Returns a tensor reshaped to (batch_size, seq_len, num_heads, 2 * window_overlap + 1).
+        """
+        batch_size, seq_len, num_heads, head_dim = shape_list(query)
+
+        tf.debugging.assert_equal(
+            seq_len % (window_overlap * 2),
+            0,
+            message=f"Sequence length should be multiple of {window_overlap * 2}. Given {seq_len}",
+        )
+        tf.debugging.assert_equal(
+            shape_list(query),
+            shape_list(key),
+            message=(
+                f"Shape of query and key should be equal, but got query: {shape_list(query)} and key:"
+                f" {shape_list(key)}"
+            ),
+        )
+
+        chunks_count = seq_len // window_overlap - 1
+
+        # group batch_size and num_heads dimensions into one, then chunk seq_len into chunks of size window_overlap * 2
+        query = tf.reshape(
+            tf.transpose(query, (0, 2, 1, 3)),
+            (batch_size * num_heads, seq_len, head_dim),
+        )
+        key = tf.reshape(tf.transpose(key, (0, 2, 1, 3)), (batch_size * num_heads, seq_len, head_dim))
+        chunked_query = self._chunk(query, window_overlap)
+        chunked_key = self._chunk(key, window_overlap)
+
+        # matrix multiplication
+        # bcxd: batch_size * num_heads x chunks x 2window_overlap x head_dim
+        # bcyd: batch_size * num_heads x chunks x 2window_overlap x head_dim
+        # bcxy: batch_size * num_heads x chunks x 2window_overlap x 2window_overlap
+        chunked_query = tf.cast(chunked_query, dtype=chunked_key.dtype)
+        chunked_attention_scores = tf.einsum("bcxd,bcyd->bcxy", chunked_query, chunked_key)  # multiply
+
+        # convert diagonals into columns
+        paddings = tf.convert_to_tensor([[0, 0], [0, 0], [0, 1], [0, 0]])
+        diagonal_chunked_attention_scores = self._pad_and_transpose_last_two_dims(chunked_attention_scores, paddings)
+
+        # allocate space for the overall attention matrix where the chunks are combined. The last dimension
+        # has (window_overlap * 2 + 1) columns. The first (window_overlap) columns are the window_overlap lower triangles (attention from a word to
+        # window_overlap previous words). The following column is attention score from each word to itself, then
+        # followed by window_overlap columns for the upper triangle.
+
+        # copy parts from diagonal_chunked_attention_scores into the combined matrix of attentions
+        # - copying the main diagonal and the upper triangle
+        # TODO: This code is most likely not very efficient and should be improved
+        diagonal_attn_scores_up_triang = tf.concat(
+            [
+                diagonal_chunked_attention_scores[:, :, :window_overlap, : window_overlap + 1],
+                diagonal_chunked_attention_scores[:, -1:, window_overlap:, : window_overlap + 1],
+            ],
+            axis=1,
+        )
+
+        # - copying the lower triangle
+        diagonal_attn_scores_low_triang = tf.concat(
+            [
+                tf.zeros(
+                    (batch_size * num_heads, 1, window_overlap, window_overlap),
+                    dtype=diagonal_chunked_attention_scores.dtype,
+                ),
+                diagonal_chunked_attention_scores[:, :, -(window_overlap + 1) : -1, window_overlap + 1 :],
+            ],
+            axis=1,
+        )
+        diagonal_attn_scores_first_chunk = tf.concat(
+            [
+                tf.roll(
+                    diagonal_chunked_attention_scores,
+                    shift=[1, window_overlap],
+                    axis=[2, 3],
+                )[:, :, :window_overlap, :window_overlap],
+                tf.zeros(
+                    (batch_size * num_heads, 1, window_overlap, window_overlap),
+                    dtype=diagonal_chunked_attention_scores.dtype,
+                ),
+            ],
+            axis=1,
+        )
+        first_chunk_mask = (  # True only where the chunk index (axis 1) is 0
+            tf.tile(
+                tf.range(chunks_count + 1, dtype=tf.int64)[None, :, None, None],
+                (batch_size * num_heads, 1, window_overlap, window_overlap),
+            )
+            < 1
+        )
+        diagonal_attn_scores_low_triang = tf.where(
+            first_chunk_mask,
+            diagonal_attn_scores_first_chunk,
+            diagonal_attn_scores_low_triang,
+        )
+
+        # merging upper and lower triangle
+        diagonal_attention_scores = tf.concat(
+            [diagonal_attn_scores_low_triang, diagonal_attn_scores_up_triang], axis=-1
+        )
+
+        # separate batch_size and num_heads dimensions again
+        diagonal_attention_scores = tf.transpose(
+            tf.reshape(
+                diagonal_attention_scores,
+                (batch_size, num_heads, seq_len, 2 * window_overlap + 1),
+            ),
+            (0, 2, 1, 3),
+        )
+
+        diagonal_attention_scores = self._mask_invalid_locations(diagonal_attention_scores, window_overlap)
+
+        return diagonal_attention_scores
+
+    @staticmethod
+    def _mask_invalid_locations(input_tensor, window_overlap):
+        # triangular 0/1 mask of shape (window_overlap, window_overlap + 1), flipped along its first axis
+        mask_2d_upper = tf.reverse(
+            tf.linalg.band_part(tf.ones(shape=(window_overlap, window_overlap + 1)), -1, 0),
+            axis=[0],
+        )
+
+        # pad amounts that grow the triangle to (input dim 1, input dim 3)
+        padding = tf.convert_to_tensor(
+            [[0, shape_list(input_tensor)[1] - window_overlap], [0, shape_list(input_tensor)[3] - window_overlap - 1]]
+        )
+
+        # zero-pad after the data, so the triangle sits in the top-left corner
+        mask_2d = tf.pad(mask_2d_upper, padding)
+
+        # add the mirror image (both axes reversed) so the opposite corner is flagged too
+        mask_2d = mask_2d + tf.reverse(mask_2d, axis=[0, 1])
+
+        # insert axes 0 and 2 and tile over axis 0; axis 2 stays size 1 and broadcasts in tf.where below
+        mask_4d = tf.tile(mask_2d[None, :, None, :], (shape_list(input_tensor)[0], 1, 1, 1))
+
+        # inf tensor used for masking
+        inf_tensor = -float("inf") * tf.ones_like(input_tensor)
+
+        # flagged entries become -inf (presumably window slots that fall outside the sequence — confirm); the rest pass through
+        input_tensor = tf.where(tf.math.greater(mask_4d, 0), inf_tensor, input_tensor)
+
+        return input_tensor
+
+    def _sliding_chunks_matmul_attn_probs_value(self, attn_probs, value, window_overlap):
+        """
+        Same as _sliding_chunks_query_key_matmul but for attn_probs and value tensors.
+
+        Args:
+            attn_probs: tensor whose first three dims must equal those of `value` and whose last dim must be
+                `2 * window_overlap + 1` (both are enforced by the assertions below).
+            value: tensor of shape (batch_size, seq_len, num_heads, head_dim); `seq_len` has to be a multiple of
+                `2 * window_overlap`.
+            window_overlap: one-sided window size.
+
+        Returns:
+            Tensor of shape (batch_size, seq_len, num_heads, head_dim), i.e. the same shape as `value`.
+        """
+
+        batch_size, seq_len, num_heads, head_dim = shape_list(value)
+
+        tf.debugging.assert_equal(
+            seq_len % (window_overlap * 2), 0, message="Seq_len has to be multiple of 2 * window_overlap"
+        )
+        tf.debugging.assert_equal(
+            shape_list(attn_probs)[:3],
+            shape_list(value)[:3],
+            message="value and attn_probs must have same dims (except head_dim)",
+        )
+        tf.debugging.assert_equal(
+            shape_list(attn_probs)[3],
+            2 * window_overlap + 1,
+            message="attn_probs last dim has to be 2 * window_overlap + 1",
+        )
+
+        # number of frame hops; `chunks_count + 1` frames of `value` are produced below
+        chunks_count = seq_len // window_overlap - 1
+
+        # group batch_size and num_heads dimensions into one, then split seq_len into
+        # `seq_len // window_overlap` chunks of `window_overlap` rows each
+        chunked_attn_probs = tf.reshape(
+            tf.transpose(attn_probs, (0, 2, 1, 3)),
+            (
+                batch_size * num_heads,
+                seq_len // window_overlap,
+                window_overlap,
+                2 * window_overlap + 1,
+            ),
+        )
+
+        # group batch_size and num_heads dimensions into one
+        value = tf.reshape(
+            tf.transpose(value, (0, 2, 1, 3)),
+            (batch_size * num_heads, seq_len, head_dim),
+        )
+
+        # pad seq_len with window_overlap positions at the beginning of the sequence and another window_overlap
+        # at the end; the padded positions are filled with the constant -1
+        paddings = tf.convert_to_tensor([[0, 0], [window_overlap, window_overlap], [0, 0]])
+        padded_value = tf.pad(value, paddings, constant_values=-1)
+
+        # chunk padded_value into chunks of size 3 window overlap and an overlap of size window overlap.
+        # Framing is done on the flattened (position * head_dim) axis, hence the sizes are scaled by head_dim.
+        # With a padded length of seq_len + 2 * window_overlap the hop works out to window_overlap * head_dim.
+        frame_size = 3 * window_overlap * head_dim
+        frame_hop_size = (shape_list(padded_value)[1] * head_dim - frame_size) // chunks_count
+        chunked_value = tf.signal.frame(
+            tf.reshape(padded_value, (batch_size * num_heads, -1)),
+            frame_size,
+            frame_hop_size,
+        )
+        # un-flatten every frame back into (3 * window_overlap, head_dim)
+        chunked_value = tf.reshape(
+            chunked_value,
+            (batch_size * num_heads, chunks_count + 1, 3 * window_overlap, head_dim),
+        )
+
+        tf.debugging.assert_equal(
+            shape_list(chunked_value),
+            [batch_size * num_heads, chunks_count + 1, 3 * window_overlap, head_dim],
+            message="Chunked value has the wrong shape",
+        )
+
+        # skew the probabilities so that their last dim (2 * window_overlap + 1 -> 3 * window_overlap) lines up
+        # with the 3 * window_overlap positions of the matching value chunk
+        chunked_attn_probs = self._pad_and_diagonalize(chunked_attn_probs)
+        context = tf.einsum("bcwd,bcdh->bcwh", chunked_attn_probs, chunked_value)
+        # separate batch_size and num_heads again and move num_heads behind seq_len
+        context = tf.transpose(
+            tf.reshape(context, (batch_size, num_heads, seq_len, head_dim)),
+            (0, 2, 1, 3),
+        )
+
+        return context
+
+    @staticmethod
+    def _pad_and_transpose_last_two_dims(hidden_states_padded, paddings):
+        """
+        Pad a 4-D tensor and reshape it so that the sizes of its last two axes are swapped.
+
+        The swap is done with `tf.reshape`, not `tf.transpose`: the elements keep their flat order and are only
+        re-interpreted under the swapped axis sizes.
+
+        Args:
+            hidden_states_padded: 4-D tensor to pad.
+            paddings: paddings argument forwarded to `tf.pad`.
+
+        Returns:
+            The padded tensor reshaped to (dim0, dim1, padded_dim3, padded_dim2).
+        """
+        # per the original note, the padding value is not important because it will be overwritten
+        padded = tf.pad(hidden_states_padded, paddings)
+        dim0, dim1, rows, cols = shape_list(padded)
+
+        return tf.reshape(padded, (dim0, dim1, cols, rows))
+
+    @staticmethod
+    def _pad_and_diagonalize(chunked_hidden_states):
+        """
+        Shift every row one step further to the right than the row above it, converting columns into diagonals.
+
+        Example with `window_overlap = num_rows = 4` and `hidden_dim = 4`:
+
+        ```python
+        chunked_hidden_states: [
+             0.4983,  2.6918, -0.0071,  1.0492,
+            -1.8348,  0.7672,  0.2986,  0.0285,
+            -0.7584,  0.4206, -0.0405,  0.1599,
+             2.0514, -1.1600,  0.5372,  0.2629,
+        ]
+        ```
+
+        (pad & diagonalize) =>
+
+        ```python
+        [
+             0.4983,  2.6918, -0.0071,  1.0492,  0.0000,  0.0000,  0.0000,
+             0.0000, -1.8348,  0.7672,  0.2986,  0.0285,  0.0000,  0.0000,
+             0.0000,  0.0000, -0.7584,  0.4206, -0.0405,  0.1599,  0.0000,
+             0.0000,  0.0000,  0.0000,  2.0514, -1.1600,  0.5372,  0.2629,
+        ]
+        ```
+
+        Args:
+            chunked_hidden_states: tensor of shape (total_num_heads, num_chunks, window_overlap, hidden_dim).
+
+        Returns:
+            Tensor of shape (total_num_heads, num_chunks, window_overlap, window_overlap + hidden_dim - 1).
+        """
+        total_num_heads, num_chunks, window_overlap, hidden_dim = shape_list(chunked_hidden_states)
+        # append window_overlap + 1 columns to every row; after flattening, each row therefore starts one
+        # position later (relative to a row width of window_overlap + hidden_dim) than the previous one
+        paddings = tf.convert_to_tensor([[0, 0], [0, 0], [0, 0], [0, window_overlap + 1]])
+        chunked_hidden_states = tf.pad(
+            chunked_hidden_states, paddings
+        )  # total_num_heads x num_chunks x window_overlap x (hidden_dim + window_overlap + 1). Padding value is not important because it'll be overwritten
+        chunked_hidden_states = tf.reshape(
+            chunked_hidden_states, (total_num_heads, num_chunks, -1)
+        )  # total_num_heads x num_chunks x (window_overlap * hidden_dim + window_overlap * window_overlap + window_overlap)
+        chunked_hidden_states = chunked_hidden_states[
+            :, :, :-window_overlap
+        ]  # total_num_heads x num_chunks x (window_overlap * hidden_dim + window_overlap * window_overlap)
+        chunked_hidden_states = tf.reshape(
+            chunked_hidden_states,
+            (total_num_heads, num_chunks, window_overlap, window_overlap + hidden_dim),
+        )  # total_num_heads x num_chunks x window_overlap x (hidden_dim + window_overlap)
+        # drop the last column: total_num_heads x num_chunks x window_overlap x (hidden_dim + window_overlap - 1)
+        chunked_hidden_states = chunked_hidden_states[:, :, :, :-1]
+
+        return chunked_hidden_states
+
+    @staticmethod
+    def _chunk(hidden_states, window_overlap):
+        """
+        Convert `hidden_states` into overlapping chunks. Chunk size = 2w, overlap size = w.
+
+        Args:
+            hidden_states: tensor of shape (batch_size, seq_length, hidden_dim).
+            window_overlap: one-sided window size `w`.
+
+        Returns:
+            Tensor of shape (batch_size, num_output_chunks, 2 * window_overlap, hidden_dim) with
+            `num_output_chunks = 2 * (seq_length // (2 * window_overlap)) - 1`.
+        """
+        batch_size, seq_length, hidden_dim = shape_list(hidden_states)
+        num_output_chunks = 2 * (seq_length // (2 * window_overlap)) - 1
+
+        # define frame size and frame stride (similar to convolution); framing happens on the flattened
+        # (position * hidden_dim) axis, so both values are scaled by hidden_dim
+        frame_hop_size = window_overlap * hidden_dim
+        frame_size = 2 * frame_hop_size
+        hidden_states = tf.reshape(hidden_states, (batch_size, seq_length * hidden_dim))
+
+        # chunk with overlap
+        chunked_hidden_states = tf.signal.frame(hidden_states, frame_size, frame_hop_size)
+
+        # the reported expected shape is the very list that is asserted, so message and check cannot diverge
+        expected_shape = [batch_size, num_output_chunks, frame_size]
+        tf.debugging.assert_equal(
+            shape_list(chunked_hidden_states),
+            expected_shape,
+            message=(
+                "Make sure chunking is correctly applied. Chunked hidden states should have output dimension"
+                f" {expected_shape}, but got {shape_list(chunked_hidden_states)}."
+            ),
+        )
+
+        # un-flatten every frame back into (2 * window_overlap, hidden_dim)
+        chunked_hidden_states = tf.reshape(
+            chunked_hidden_states,
+            (batch_size, num_output_chunks, 2 * window_overlap, hidden_dim),
+        )
+
+        return chunked_hidden_states
+
+    @staticmethod
+    def _get_global_attn_indices(is_index_global_attn):
+        """
+        Derive the index tensors that describe where global attention is used.
+
+        Args:
+            is_index_global_attn: mask that is counted along axis 1 and passed to `tf.where` (presumably a boolean
+                (batch_size, seq_len) tensor -- confirm against the caller).
+
+        Returns:
+            A 4-tuple of
+            - the largest per-row count of global positions,
+            - the coordinates of all non-zero entries of `is_index_global_attn`,
+            - the coordinates of the used slots in a (rows, max count) layout,
+            - the coordinates of the unused (padding) slots in that same layout.
+        """
+        # global positions per row, cast to the dtype of a default integer constant
+        global_count_per_row = tf.cast(
+            tf.math.count_nonzero(is_index_global_attn, axis=1),
+            dtype=tf.constant(1).dtype,
+        )
+        largest_global_count = tf.reduce_max(global_count_per_row)
+
+        # coordinates of the global positions inside the input mask
+        global_positions = tf.where(is_index_global_attn)
+
+        # slot j of row i is in use iff j < (number of global positions of row i)
+        slot_in_use = tf.math.less(
+            tf.range(largest_global_count),
+            tf.expand_dims(global_count_per_row, axis=-1),
+        )
+        used_slots = tf.where(slot_in_use)
+        unused_slots = tf.where(tf.math.logical_not(slot_in_use))
+
+        return (largest_global_count, global_positions, used_slots, unused_slots)
+
+    def _concat_with_global_key_attn_probs(
+        self,
+        attn_scores,
+        key_vectors,
+        query_vectors,
+        max_num_global_attn_indices,
+        is_index_global_attn_nonzero,
+        is_local_index_global_attn_nonzero,
+        is_local_index_no_global_attn_nonzero,
+    ):
+        """
+        Prepend the scores of every query against the global keys to `attn_scores`.
+
+        Args:
+            attn_scores: scores the global columns are concatenated to along the last axis.
+            key_vectors: keys from which the global keys are gathered; axis 0 is used as batch_size.
+            query_vectors: queries, consumed by the einsum as (batch, seq_len, num_heads, head_dim).
+            max_num_global_attn_indices: number of global slots per batch entry.
+            is_index_global_attn_nonzero: gather indices selecting the global keys from `key_vectors`.
+            is_local_index_global_attn_nonzero: scatter indices of the used (batch, slot) pairs.
+            is_local_index_no_global_attn_nonzero: indices of the unused (batch, slot) pairs; their scores are
+                overwritten with -10000.0.
+
+        Returns:
+            `attn_scores` with `max_num_global_attn_indices` extra leading columns on the last axis.
+        """
+        batch_size = shape_list(key_vectors)[0]
+
+        # select global key vectors
+        global_key_vectors = tf.gather_nd(key_vectors, is_index_global_attn_nonzero)
+
+        # create only global key vectors: a dense tensor with one slot per possible global key;
+        # slots that are not scattered into stay zero
+        key_vectors_only_global = tf.scatter_nd(
+            is_local_index_global_attn_nonzero,
+            global_key_vectors,
+            shape=(
+                batch_size,
+                max_num_global_attn_indices,
+                self.num_heads,
+                self.head_dim,
+            ),
+        )
+
+        # (batch_size, seq_len, num_heads, max_num_global_attn_indices)
+        attn_probs_from_global_key = tf.einsum("blhd,bshd->blhs", query_vectors, key_vectors_only_global)
+
+        # (batch_size, max_num_global_attn_indices, seq_len, num_heads)
+        # the slot axis is moved next to the batch axis so that the 2-D (batch, slot) indices can address it
+        attn_probs_from_global_key_trans = tf.transpose(attn_probs_from_global_key, (0, 3, 1, 2))
+        # one (seq_len, num_heads) block of -10000.0 per unused slot
+        mask_shape = (shape_list(is_local_index_no_global_attn_nonzero)[0],) + tuple(
+            shape_list(attn_probs_from_global_key_trans)[-2:]
+        )
+        mask = tf.ones(mask_shape) * -10000.0
+        mask = tf.cast(mask, dtype=attn_probs_from_global_key_trans.dtype)
+
+        # scatter mask
+        attn_probs_from_global_key_trans = tf.tensor_scatter_nd_update(
+            attn_probs_from_global_key_trans,
+            is_local_index_no_global_attn_nonzero,
+            mask,
+        )
+
+        # (batch_size, seq_len, num_heads, max_num_global_attn_indices)
+        attn_probs_from_global_key = tf.transpose(attn_probs_from_global_key_trans, (0, 2, 3, 1))
+
+        # concat to attn_probs; the global columns come first
+        # (batch_size, seq_len, num_heads, extra attention count + 2*window+1)
+        attn_scores = tf.concat((attn_probs_from_global_key, attn_scores), axis=-1)
+
+        return attn_scores
+
+    def _compute_attn_output_with_global_indices(
+        self,
+        value_vectors,
+        attn_probs,
+        max_num_global_attn_indices,
+        is_index_global_attn_nonzero,
+        is_local_index_global_attn_nonzero,
+    ):
+        """
+        Combine the contribution of the global values with the sliding-window contribution.
+
+        Args:
+            value_vectors: values; the global ones are gathered via `is_index_global_attn_nonzero`.
+            attn_probs: probabilities whose first `max_num_global_attn_indices` entries on the last axis belong
+                to the global slots and whose remaining entries belong to the local window.
+            max_num_global_attn_indices: number of global slots per batch entry.
+            is_index_global_attn_nonzero: gather indices of the global positions in `value_vectors`.
+            is_local_index_global_attn_nonzero: scatter indices of the used (batch, slot) pairs.
+
+        Returns:
+            The sum of the global-only output and the sliding-window output.
+        """
+        # split the last axis into its global and its local part
+        global_probs = attn_probs[:, :, :, :max_num_global_attn_indices]
+        local_probs = attn_probs[:, :, :, max_num_global_attn_indices:]
+
+        # dense (batch, slot, head, head_dim) tensor that holds only the global values
+        dense_shape = (
+            shape_list(attn_probs)[0],
+            max_num_global_attn_indices,
+            self.num_heads,
+            self.head_dim,
+        )
+        global_values = tf.scatter_nd(
+            is_local_index_global_attn_nonzero,
+            tf.gather_nd(value_vectors, is_index_global_attn_nonzero),
+            shape=dense_shape,
+        )
+
+        global_part = tf.einsum("blhs,bshd->blhd", global_probs, global_values)
+        local_part = self._sliding_chunks_matmul_attn_probs_value(
+            local_probs, value_vectors, self.one_sided_attn_window_size
+        )
+
+        return global_part + local_part
+
+    def _compute_global_attn_output_from_hidden(
+        self,
+        attn_output,
+        hidden_states,
+        max_num_global_attn_indices,
+        layer_head_mask,
+        is_local_index_global_attn_nonzero,
+        is_index_global_attn_nonzero,
+        is_local_index_no_global_attn_nonzero,
+        is_index_masked,
+        training,
+    ):
+        """
+        Let the global tokens attend to the whole sequence and write their outputs into `attn_output`.
+
+        Args:
+            attn_output: tensor whose entries at `is_index_global_attn_nonzero` are overwritten.
+            hidden_states: layer input; its first two dims are used as (batch_size, seq_len).
+            max_num_global_attn_indices: number of global slots per batch entry.
+            layer_head_mask: optional per-head multiplier of shape (num_heads,), or `None`.
+            is_local_index_global_attn_nonzero: indices of the used (batch, slot) pairs.
+            is_index_global_attn_nonzero: indices of the global positions inside the sequence.
+            is_local_index_no_global_attn_nonzero: indices of the unused (batch, slot) pairs.
+            is_index_masked: boolean mask, indexed as (batch, seq_len), of key positions to suppress.
+            training: forwarded to the dropout layer.
+
+        Returns:
+            A tuple `(attn_output, global_attn_probs)` where `global_attn_probs` has shape
+            (batch_size, num_heads, max_num_global_attn_indices, seq_len).
+        """
+        batch_size, seq_len = shape_list(hidden_states)[:2]
+
+        # prepare global hidden states: gather the global tokens and lay them out densely as
+        # (batch_size, max_num_global_attn_indices, embed_dim); unused slots stay zero
+        global_attn_hidden_states = tf.gather_nd(hidden_states, is_index_global_attn_nonzero)
+        global_attn_hidden_states = tf.scatter_nd(
+            is_local_index_global_attn_nonzero,
+            global_attn_hidden_states,
+            shape=(batch_size, max_num_global_attn_indices, self.embed_dim),
+        )
+
+        # global key, query, value: queries come from the global tokens only, keys/values from every position
+        global_query_vectors_only_global = self.query_global(global_attn_hidden_states)
+        global_key_vectors = self.key_global(hidden_states)
+        global_value_vectors = self.value_global(hidden_states)
+
+        # normalize
+        global_query_vectors_only_global /= tf.math.sqrt(
+            tf.cast(self.head_dim, dtype=global_query_vectors_only_global.dtype)
+        )
+        # fold the heads into the batch axis: (batch_size * num_heads, length, head_dim)
+        global_query_vectors_only_global = self.reshape_and_transpose(global_query_vectors_only_global, batch_size)
+        global_key_vectors = self.reshape_and_transpose(global_key_vectors, batch_size)
+        global_value_vectors = self.reshape_and_transpose(global_value_vectors, batch_size)
+
+        # compute attn scores
+        global_attn_scores = tf.matmul(global_query_vectors_only_global, global_key_vectors, transpose_b=True)
+
+        tf.debugging.assert_equal(
+            shape_list(global_attn_scores),
+            [batch_size * self.num_heads, max_num_global_attn_indices, seq_len],
+            message=(
+                "global_attn_scores have the wrong size. Size should be"
+                f" {(batch_size * self.num_heads, max_num_global_attn_indices, seq_len)}, but is"
+                f" {shape_list(global_attn_scores)}."
+            ),
+        )
+
+        global_attn_scores = tf.reshape(
+            global_attn_scores,
+            (batch_size, self.num_heads, max_num_global_attn_indices, seq_len),
+        )
+        # (batch_size, max_num_global_attn_indices, num_heads, seq_len): the slot axis is moved next to the
+        # batch axis so that the 2-D (batch, slot) indices can address it
+        global_attn_scores_trans = tf.transpose(global_attn_scores, (0, 2, 1, 3))
+        # one (num_heads, seq_len) block of -10000.0 per unused slot
+        mask_shape = (shape_list(is_local_index_no_global_attn_nonzero)[0],) + tuple(
+            shape_list(global_attn_scores_trans)[-2:]
+        )
+        global_attn_mask = tf.ones(mask_shape) * -10000.0
+        global_attn_mask = tf.cast(global_attn_mask, dtype=global_attn_scores_trans.dtype)
+
+        # scatter mask
+        global_attn_scores_trans = tf.tensor_scatter_nd_update(
+            global_attn_scores_trans,
+            is_local_index_no_global_attn_nonzero,
+            global_attn_mask,
+        )
+        # back to (batch_size, num_heads, max_num_global_attn_indices, seq_len)
+        global_attn_scores = tf.transpose(global_attn_scores_trans, (0, 2, 1, 3))
+
+        # mask global attn scores: key positions flagged in `is_index_masked` get -10000.0 for every head
+        attn_mask = tf.tile(is_index_masked[:, None, None, :], (1, shape_list(global_attn_scores)[1], 1, 1))
+        global_attn_scores = tf.where(attn_mask, -10000.0, global_attn_scores)
+        global_attn_scores = tf.reshape(
+            global_attn_scores,
+            (batch_size * self.num_heads, max_num_global_attn_indices, seq_len),
+        )
+
+        # compute global attn probs
+        global_attn_probs_float = stable_softmax(global_attn_scores, axis=-1)
+
+        # apply layer head masking
+        if layer_head_mask is not None:
+            tf.debugging.assert_equal(
+                shape_list(layer_head_mask),
+                [self.num_heads],
+                message=(
+                    f"Head mask for a single layer should be of size {(self.num_heads)}, but is"
+                    f" {shape_list(layer_head_mask)}"
+                ),
+            )
+            # un-fold the heads so that the (1, num_heads, 1, 1) mask broadcasts over them, then fold back
+            global_attn_probs_float = tf.reshape(layer_head_mask, (1, -1, 1, 1)) * tf.reshape(
+                global_attn_probs_float, (batch_size, self.num_heads, max_num_global_attn_indices, seq_len)
+            )
+            global_attn_probs_float = tf.reshape(
+                global_attn_probs_float, (batch_size * self.num_heads, max_num_global_attn_indices, seq_len)
+            )
+
+        # dropout
+        global_attn_probs = self.global_dropout(global_attn_probs_float, training=training)
+
+        # global attn output
+        global_attn_output = tf.matmul(global_attn_probs, global_value_vectors)
+
+        tf.debugging.assert_equal(
+            shape_list(global_attn_output),
+            [batch_size * self.num_heads, max_num_global_attn_indices, self.head_dim],
+            message=(
+                "global_attn_output tensor has the wrong size. Size should be"
+                f" {(batch_size * self.num_heads, max_num_global_attn_indices, self.head_dim)}, but is"
+                f" {shape_list(global_attn_output)}."
+            ),
+        )
+
+        global_attn_output = tf.reshape(
+            global_attn_output,
+            (batch_size, self.num_heads, max_num_global_attn_indices, self.head_dim),
+        )
+
+        # get only non zero global attn output: pick the used (batch, slot) pairs and merge the
+        # (num_heads, head_dim) axes of each picked entry into one
+        nonzero_global_attn_output = tf.gather_nd(
+            tf.transpose(global_attn_output, (0, 2, 1, 3)),
+            is_local_index_global_attn_nonzero,
+        )
+        nonzero_global_attn_output = tf.reshape(
+            nonzero_global_attn_output,
+            (shape_list(is_local_index_global_attn_nonzero)[0], -1),
+        )
+
+        # overwrite values with global attention
+        attn_output = tf.tensor_scatter_nd_update(
+            attn_output, is_index_global_attn_nonzero, nonzero_global_attn_output
+        )
+
+        # expose the (post-dropout) probabilities with a separate head axis
+        global_attn_probs = tf.reshape(
+            global_attn_probs, (batch_size, self.num_heads, max_num_global_attn_indices, seq_len)
+        )
+
+        return attn_output, global_attn_probs
+
+    def reshape_and_transpose(self, vector, batch_size):
+        """
+        Split the last axis of `vector` into heads and fold the heads into the batch axis.
+
+        Args:
+            vector: tensor that can be reshaped to (batch_size, -1, num_heads, head_dim).
+            batch_size: size of the leading axis.
+
+        Returns:
+            Tensor of shape (batch_size * num_heads, -1, head_dim).
+        """
+        per_head = tf.reshape(vector, (batch_size, -1, self.num_heads, self.head_dim))
+        heads_first = tf.transpose(per_head, (0, 2, 1, 3))
+
+        return tf.reshape(heads_first, (batch_size * self.num_heads, -1, self.head_dim))
+
+
+class TFLEDEncoderAttention(keras.layers.Layer):
+    """Encoder self-attention (`TFLEDEncoderSelfAttention`) followed by a dense output projection."""
+
+    def __init__(self, config, layer_id, **kwargs):
+        super().__init__(**kwargs)
+        self.config = config
+        self.longformer_self_attn = TFLEDEncoderSelfAttention(config, layer_id=layer_id, name="longformer_self_attn")
+        self.output_dense = keras.layers.Dense(config.d_model, use_bias=True, name="output")
+
+    def call(self, inputs, training=False):
+        """
+        Run self-attention on `inputs` and project its first output.
+
+        Args:
+            inputs: sequence of exactly six items: hidden_states, attention_mask, layer_head_mask,
+                is_index_masked, is_index_global_attn and is_global_attn.
+            training: forwarded to the sub-layers.
+
+        Returns:
+            A tuple whose first element is the projected attention output, followed by the remaining outputs of
+            the self-attention layer.
+        """
+        # unpacking enforces that exactly six inputs were passed
+        hidden_states, attention_mask, layer_head_mask, is_index_masked, is_index_global_attn, is_global_attn = inputs
+
+        attn_inputs = [
+            hidden_states,
+            attention_mask,
+            layer_head_mask,
+            is_index_masked,
+            is_index_global_attn,
+            is_global_attn,
+        ]
+        self_outputs = self.longformer_self_attn(attn_inputs, training=training)
+        projected = self.output_dense(self_outputs[0], training=training)
+
+        return (projected,) + self_outputs[1:]
+
+    def build(self, input_shape=None):
+        """Build both sub-layers, each under its own name scope; does nothing when already built."""
+        if self.built:
+            return
+        self.built = True
+
+        self_attn = getattr(self, "longformer_self_attn", None)
+        if self_attn is not None:
+            with tf.name_scope(self_attn.name):
+                self_attn.build(None)
+
+        projection = getattr(self, "output_dense", None)
+        if projection is not None:
+            with tf.name_scope(projection.name):
+                projection.build([None, None, self.config.d_model])
+
+
+class TFLEDDecoderAttention(keras.layers.Layer):
+    """
+    Multi-headed attention from "Attention Is All You Need".
+
+    The layer is used for self-attention, and for cross-attention when `key_value_states` is passed to `call`.
+    When constructed with `is_decoder=True` it also returns the projected key/value states so that they can be
+    fed back in as `past_key_value` on a later call.
+    """
+
+    def __init__(
+        self,
+        embed_dim: int,
+        num_heads: int,
+        dropout: float = 0.0,
+        is_decoder: bool = False,
+        bias: bool = True,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.embed_dim = embed_dim
+
+        self.num_heads = num_heads
+        self.dropout = keras.layers.Dropout(dropout)
+        self.head_dim = embed_dim // num_heads
+        assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"
+        # queries are multiplied by 1 / sqrt(head_dim) before the dot product
+        self.scaling = self.head_dim**-0.5
+        self.is_decoder = is_decoder
+
+        self.k_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="k_proj")
+        self.q_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="q_proj")
+        self.v_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="v_proj")
+        self.out_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="out_proj")
+
+    def _shape(self, tensor: tf.Tensor, seq_len: int, bsz: int):
+        """Reshape `tensor` to (bsz, seq_len, num_heads, head_dim) and move the head axis in front of seq_len."""
+        return tf.transpose(tf.reshape(tensor, (bsz, seq_len, self.num_heads, self.head_dim)), (0, 2, 1, 3))
+
+    def call(
+        self,
+        hidden_states: tf.Tensor,
+        key_value_states: tf.Tensor | None = None,
+        past_key_value: tuple[tuple[tf.Tensor]] | None = None,
+        attention_mask: tf.Tensor | None = None,
+        layer_head_mask: tf.Tensor | None = None,
+        training=False,
+    ) -> tuple[tf.Tensor, tf.Tensor, tuple[tf.Tensor, tf.Tensor] | None]:
+        """
+        Input shape: Batch x Time x Channel
+
+        Args:
+            hidden_states: query input of shape (bsz, tgt_len, embed_dim).
+            key_value_states: when given, keys and values are projected from it (cross-attention).
+            past_key_value: cached `(key_states, value_states)` from an earlier call, or `None`.
+            attention_mask: optional additive mask of shape (bsz, 1, tgt_len, src_len).
+            layer_head_mask: optional per-head multiplier of shape (num_heads,).
+            training: forwarded to the dropout layer.
+
+        Returns:
+            `(attn_output, attn_weights, past_key_value)`: the projected output of shape
+            (bsz, tgt_len, embed_dim), the attention weights of shape (bsz, num_heads, tgt_len, src_len) and,
+            when `is_decoder` is set, the key/value states used by this call (otherwise the `past_key_value`
+            argument is returned unchanged).
+        """
+
+        # if key_value_states are provided this layer is used as a cross-attention layer
+        # for the decoder
+        is_cross_attention = key_value_states is not None
+        bsz, tgt_len, embed_dim = shape_list(hidden_states)
+
+        # get query proj
+        query_states = self.q_proj(hidden_states) * self.scaling
+        # get key, value proj
+        if is_cross_attention and past_key_value is not None:
+            # reuse k,v, cross_attentions
+            key_states = past_key_value[0]
+            value_states = past_key_value[1]
+        elif is_cross_attention:
+            # cross_attentions
+            key_states = self._shape(self.k_proj(key_value_states), -1, bsz)
+            value_states = self._shape(self.v_proj(key_value_states), -1, bsz)
+        elif past_key_value is not None:
+            # reuse k, v, self_attention
+            key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
+            value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
+            # append the new positions to the cached ones along the sequence axis
+            key_states = tf.concat([past_key_value[0], key_states], axis=2)
+            value_states = tf.concat([past_key_value[1], value_states], axis=2)
+        else:
+            # self_attention
+            key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
+            value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
+
+        if self.is_decoder:
+            # if cross_attention save Tuple(tf.Tensor, tf.Tensor) of all cross attention key/value_states.
+            # Further calls to cross_attention layer can then reuse all cross-attention
+            # key/value_states (first "if" case)
+            # if uni-directional self-attention (decoder) save Tuple(tf.Tensor, tf.Tensor) of
+            # all previous decoder key/value_states. Further calls to uni-directional self-attention
+            # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
+            # if encoder bi-directional self-attention `past_key_value` is always `None`
+            past_key_value = (key_states, value_states)
+
+        # fold the heads into the batch axis: (bsz * num_heads, length, head_dim)
+        proj_shape = (bsz * self.num_heads, -1, self.head_dim)
+        query_states = tf.reshape(self._shape(query_states, tgt_len, bsz), proj_shape)
+        key_states = tf.reshape(key_states, proj_shape)
+        value_states = tf.reshape(value_states, proj_shape)
+
+        src_len = shape_list(key_states)[1]
+        attn_weights = tf.matmul(query_states, key_states, transpose_b=True)
+
+        tf.debugging.assert_equal(
+            shape_list(attn_weights),
+            [bsz * self.num_heads, tgt_len, src_len],
+            message=(
+                f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
+                f" {shape_list(attn_weights)}"
+            ),
+        )
+
+        if attention_mask is not None:
+            tf.debugging.assert_equal(
+                shape_list(attention_mask),
+                [bsz, 1, tgt_len, src_len],
+                message=(
+                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is"
+                    f" {shape_list(attention_mask)}"
+                ),
+            )
+
+            # un-fold the heads so that the (bsz, 1, tgt_len, src_len) mask broadcasts over them, then fold back
+            attn_weights = tf.reshape(attn_weights, (bsz, self.num_heads, tgt_len, src_len)) + tf.cast(
+                attention_mask, dtype=attn_weights.dtype
+            )
+            attn_weights = tf.reshape(attn_weights, (bsz * self.num_heads, tgt_len, src_len))
+
+        attn_weights = stable_softmax(attn_weights, axis=-1)
+
+        if layer_head_mask is not None:
+            tf.debugging.assert_equal(
+                shape_list(layer_head_mask),
+                [self.num_heads],
+                message=(
+                    f"Head mask for a single layer should be of size {(self.num_heads)}, but is"
+                    f" {shape_list(layer_head_mask)}"
+                ),
+            )
+
+            attn_weights = tf.reshape(layer_head_mask, (1, -1, 1, 1)) * tf.reshape(
+                attn_weights, (bsz, self.num_heads, tgt_len, src_len)
+            )
+            attn_weights = tf.reshape(attn_weights, (bsz * self.num_heads, tgt_len, src_len))
+
+        # dropout is applied to a copy; the weights returned to the caller are the pre-dropout ones
+        attn_probs = self.dropout(attn_weights, training=training)
+
+        attn_output = tf.matmul(attn_probs, value_states)
+
+        # the message reports the same 3-D shape that is asserted
+        tf.debugging.assert_equal(
+            shape_list(attn_output),
+            [bsz * self.num_heads, tgt_len, self.head_dim],
+            message=(
+                f"`attn_output` should be of size {(bsz * self.num_heads, tgt_len, self.head_dim)}, but is"
+                f" {shape_list(attn_output)}"
+            ),
+        )
+
+        # un-fold the heads and merge them back into the channel axis
+        attn_output = tf.transpose(
+            tf.reshape(attn_output, (bsz, self.num_heads, tgt_len, self.head_dim)), (0, 2, 1, 3)
+        )
+        attn_output = tf.reshape(attn_output, (bsz, tgt_len, embed_dim))
+
+        attn_output = self.out_proj(attn_output)
+        attn_weights: tf.Tensor = tf.reshape(attn_weights, (bsz, self.num_heads, tgt_len, src_len))
+
+        return attn_output, attn_weights, past_key_value
+
+    def build(self, input_shape=None):
+        """Build the four projection layers, each under its own name scope; does nothing when already built."""
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "k_proj", None) is not None:
+            with tf.name_scope(self.k_proj.name):
+                self.k_proj.build([None, None, self.embed_dim])
+        if getattr(self, "q_proj", None) is not None:
+            with tf.name_scope(self.q_proj.name):
+                self.q_proj.build([None, None, self.embed_dim])
+        if getattr(self, "v_proj", None) is not None:
+            with tf.name_scope(self.v_proj.name):
+                self.v_proj.build([None, None, self.embed_dim])
+        if getattr(self, "out_proj", None) is not None:
+            with tf.name_scope(self.out_proj.name):
+                self.out_proj.build([None, None, self.embed_dim])
+
+
+class TFLEDEncoderLayer(keras.layers.Layer):
+    """One encoder block: self-attention and a two-layer feed-forward net, each followed by add & layer norm."""
+
+    def __init__(self, config: LEDConfig, layer_id: int, **kwargs):
+        super().__init__(**kwargs)
+        self.config = config
+        self.embed_dim = config.d_model
+        self.self_attn = TFLEDEncoderAttention(config, layer_id, name="self_attn")
+        self.self_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm")
+        self.dropout = keras.layers.Dropout(config.dropout)
+        self.activation_fn = get_tf_activation(config.activation_function)
+        self.activation_dropout = keras.layers.Dropout(config.activation_dropout)
+        self.fc1 = keras.layers.Dense(config.encoder_ffn_dim, name="fc1")
+        self.fc2 = keras.layers.Dense(self.embed_dim, name="fc2")
+        self.final_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm")
+
+    def call(
+        self,
+        hidden_states: tf.Tensor,
+        attention_mask: tf.Tensor,
+        layer_head_mask: tf.Tensor,
+        is_index_masked: tf.Tensor,
+        is_index_global_attn: tf.Tensor,
+        is_global_attn: bool,
+        training=False,
+    ):
+        """
+        Args:
+            hidden_states (`tf.Tensor`): input to the layer of shape *(batch, seq_len, embed_dim)*
+            attention_mask (`tf.Tensor`): attention mask of size
+                *(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values.
+            layer_head_mask (`tf.Tensor`): mask for attention heads in a given layer of size
+                *(config.encoder_attention_heads,)*.
+            is_index_masked, is_index_global_attn, is_global_attn: forwarded unchanged to the self-attention
+                layer.
+            training: forwarded to the attention and dropout layers.
+
+        Returns:
+            A tuple whose first element is the layer output, followed by the extra outputs of the self-attention
+            layer.
+        """
+        attn_inputs = [
+            hidden_states,
+            attention_mask,
+            layer_head_mask,
+            is_index_masked,
+            is_index_global_attn,
+            is_global_attn,
+        ]
+        layer_outputs = self.self_attn(attn_inputs, training=training)
+        attn_out = layer_outputs[0]
+
+        # the residual connection requires the attention output to keep the input shape
+        tf.debugging.assert_equal(
+            shape_list(attn_out),
+            shape_list(hidden_states),
+            message=f"Self attn modified the shape of query {shape_list(hidden_states)} to {shape_list(attn_out)}",
+        )
+
+        # attention sub-block: dropout -> residual add -> layer norm
+        attn_out = self.dropout(attn_out, training=training)
+        attn_out = self.self_attn_layer_norm(hidden_states + attn_out)
+
+        # feed-forward sub-block: fc1 -> activation -> dropout -> fc2 -> dropout -> residual add -> layer norm
+        ffn_out = self.activation_fn(self.fc1(attn_out))
+        ffn_out = self.activation_dropout(ffn_out, training=training)
+        ffn_out = self.dropout(self.fc2(ffn_out), training=training)
+        block_out = self.final_layer_norm(attn_out + ffn_out)
+
+        return (block_out,) + layer_outputs[1:]
+
+    def build(self, input_shape=None):
+        """Build every sub-layer under its own name scope; does nothing when already built."""
+        if self.built:
+            return
+        self.built = True
+
+        attn = getattr(self, "self_attn", None)
+        if attn is not None:
+            with tf.name_scope(attn.name):
+                attn.build(None)
+
+        attn_norm = getattr(self, "self_attn_layer_norm", None)
+        if attn_norm is not None:
+            with tf.name_scope(attn_norm.name):
+                attn_norm.build([None, None, self.embed_dim])
+
+        first_dense = getattr(self, "fc1", None)
+        if first_dense is not None:
+            with tf.name_scope(first_dense.name):
+                first_dense.build([None, None, self.embed_dim])
+
+        # fc2 consumes the output of fc1, hence the feed-forward width as input size
+        second_dense = getattr(self, "fc2", None)
+        if second_dense is not None:
+            with tf.name_scope(second_dense.name):
+                second_dense.build([None, None, self.config.encoder_ffn_dim])
+
+        final_norm = getattr(self, "final_layer_norm", None)
+        if final_norm is not None:
+            with tf.name_scope(final_norm.name):
+                final_norm.build([None, None, self.embed_dim])
+
+
+class TFLEDDecoderLayer(keras.layers.Layer):  # one decoder block: self-attn, optional cross-attn, feed-forward
+    def __init__(self, config: LEDConfig, **kwargs):
+        super().__init__(**kwargs)
+        self.embed_dim = config.d_model
+        self.self_attn = TFLEDDecoderAttention(
+            embed_dim=self.embed_dim,
+            num_heads=config.decoder_attention_heads,
+            dropout=config.attention_dropout,
+            name="self_attn",
+            is_decoder=True,
+        )
+        self.dropout = keras.layers.Dropout(config.dropout)
+        self.activation_fn = get_tf_activation(config.activation_function)
+        self.activation_dropout = keras.layers.Dropout(config.activation_dropout)
+
+        self.self_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm")
+        self.encoder_attn = TFLEDDecoderAttention(  # cross-attention: `call` feeds it `encoder_hidden_states`
+            self.embed_dim,
+            config.decoder_attention_heads,
+            dropout=config.attention_dropout,
+            name="encoder_attn",
+            is_decoder=True,
+        )
+        self.encoder_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="encoder_attn_layer_norm")
+        self.fc1 = keras.layers.Dense(config.decoder_ffn_dim, name="fc1")
+        self.fc2 = keras.layers.Dense(self.embed_dim, name="fc2")
+        self.final_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm")
+        self.config = config
+
+    def call(
+        self,
+        hidden_states,
+        attention_mask: tf.Tensor | None = None,
+        encoder_hidden_states: tf.Tensor | None = None,
+        encoder_attention_mask: tf.Tensor | None = None,
+        layer_head_mask: tf.Tensor | None = None,
+        encoder_layer_head_mask: tf.Tensor | None = None,
+        past_key_value: tuple[tf.Tensor] | None = None,
+        training=False,
+    ) -> tuple[tf.Tensor, tf.Tensor, tf.Tensor, tuple[tuple[tf.Tensor]]]:
+        """
+        Args:
+            hidden_states (`tf.Tensor`): input to the layer of shape *(batch, seq_len, embed_dim)*
+            attention_mask (`tf.Tensor`): attention mask of size
+                *(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values.
+            encoder_hidden_states (`tf.Tensor`): cross-attention input of shape *(batch, seq_len, embed_dim)*
+            encoder_attention_mask (`tf.Tensor`): encoder attention mask of size
+                *(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values.
+            layer_head_mask (`tf.Tensor`): self-attention head mask of size *(config.decoder_attention_heads,)*.
+            encoder_layer_head_mask (`tf.Tensor`): cross-attn head mask of size *(config.decoder_attention_heads,)*.
+            past_key_value (`Tuple(tf.Tensor)`): cached key/value states: first two for self-attn, last two for cross-attn
+            training (`bool`): passed explicitly to the dropout layers of this block
+        Returns:
+            tuple `(hidden_states, self_attn_weights, cross_attn_weights, present_key_value)`
+        """
+        residual = hidden_states
+
+        # Self-Attention
+        # cached self-attention key/value states are the first two entries of `past_key_value`
+        self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
+        # `present_key_value` starts out as the cache returned by `self_attn`
+        hidden_states, self_attn_weights, present_key_value = self.self_attn(
+            hidden_states=hidden_states,
+            past_key_value=self_attn_past_key_value,
+            attention_mask=attention_mask,
+            layer_head_mask=layer_head_mask,
+        )
+        hidden_states = self.dropout(hidden_states, training=training)
+        hidden_states = residual + hidden_states
+        hidden_states = self.self_attn_layer_norm(hidden_states)
+
+        # Cross-Attention Block
+        cross_attn_present_key_value = None
+        cross_attn_weights = None  # stays None when no `encoder_hidden_states` is given
+        if encoder_hidden_states is not None:
+            residual = hidden_states
+
+            # cached cross-attention key/value states are the last two entries of `past_key_value`
+            cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
+            hidden_states, cross_attn_weights, cross_attn_present_key_value = self.encoder_attn(
+                hidden_states=hidden_states,
+                key_value_states=encoder_hidden_states,
+                attention_mask=encoder_attention_mask,
+                layer_head_mask=encoder_layer_head_mask,
+                past_key_value=cross_attn_past_key_value,
+            )
+            hidden_states = self.dropout(hidden_states, training=training)
+            hidden_states = residual + hidden_states
+            hidden_states = self.encoder_attn_layer_norm(hidden_states)
+
+            # append the cross-attention cache after the self-attention entries
+            present_key_value = present_key_value + cross_attn_present_key_value
+
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.activation_fn(self.fc1(hidden_states))
+        hidden_states = self.activation_dropout(hidden_states, training=training)
+        hidden_states = self.fc2(hidden_states)
+        hidden_states = self.dropout(hidden_states, training=training)
+        hidden_states = residual + hidden_states
+        hidden_states = self.final_layer_norm(hidden_states)
+
+        return (
+            hidden_states,
+            self_attn_weights,
+            cross_attn_weights,
+            present_key_value,
+        )
+
+    def build(self, input_shape=None):  # builds each sublayer once, inside that sublayer's name scope
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "self_attn", None) is not None:
+            with tf.name_scope(self.self_attn.name):
+                self.self_attn.build(None)
+        if getattr(self, "self_attn_layer_norm", None) is not None:
+            with tf.name_scope(self.self_attn_layer_norm.name):
+                self.self_attn_layer_norm.build([None, None, self.embed_dim])
+        if getattr(self, "encoder_attn", None) is not None:
+            with tf.name_scope(self.encoder_attn.name):
+                self.encoder_attn.build(None)
+        if getattr(self, "encoder_attn_layer_norm", None) is not None:
+            with tf.name_scope(self.encoder_attn_layer_norm.name):
+                self.encoder_attn_layer_norm.build([None, None, self.embed_dim])
+        if getattr(self, "fc1", None) is not None:
+            with tf.name_scope(self.fc1.name):
+                self.fc1.build([None, None, self.embed_dim])
+        if getattr(self, "fc2", None) is not None:
+            with tf.name_scope(self.fc2.name):
+                self.fc2.build([None, None, self.config.decoder_ffn_dim])  # fc2 takes fc1's output width
+        if getattr(self, "final_layer_norm", None) is not None:
+            with tf.name_scope(self.final_layer_norm.name):
+                self.final_layer_norm.build([None, None, self.embed_dim])
+
+
+class TFLEDPreTrainedModel(TFPreTrainedModel):
+    base_model_prefix = "led"
+    config_class = LEDConfig
+
+    @property
+    def input_signature(self):
+        signature = super().input_signature  # inherited spec mapping, extended in place below
+        signature.update(global_attention_mask=tf.TensorSpec((None, None), tf.int32, name="global_attention_mask"))
+        return signature
+
+
+@dataclass
+# Copied from transformers.models.longformer.modeling_tf_longformer.TFLongformerBaseModelOutput with TFLongformer->TFLEDEncoder
+class TFLEDEncoderBaseModelOutput(ModelOutput):
+    """
+    Base class for the LED encoder's outputs, with potential hidden states, local and global attentions.
+
+    Args:
+        last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
+            Sequence of hidden-states at the output of the last layer of the model.
+        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
+            `(batch_size, sequence_length, hidden_size)`.
+
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x +
+            attention_window + 1)`, where `x` is the number of tokens with global attention mask.
+
+            Local attention weights after the attention softmax, used to compute the weighted average in the
+            self-attention heads. Those are the attention weights from every token in the sequence to every token with
+            global attention (first `x` values) and to every token in the attention window (remaining `attention_window
+            + 1` values). Note that the first `x` values refer to tokens with fixed positions in the text, but the
+            remaining `attention_window + 1` values refer to tokens with relative positions: the attention weight of a
+            token to itself is located at index `x + attention_window / 2` and the `attention_window / 2` preceding
+            (succeeding) values are the attention weights to the `attention_window / 2` preceding (succeeding) tokens.
+            If the attention window contains a token with global attention, the attention weight at the corresponding
+            index is set to 0; the value should be accessed from the first `x` attention weights. If a token has global
+            attention, the attention weights to all other tokens in `attentions` are set to 0; the values should be
+            accessed from `global_attentions`.
+        global_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x)`, where `x`
+            is the number of tokens with global attention mask.
+
+            Global attention weights after the attention softmax, used to compute the weighted average in the
+            self-attention heads. Those are the attention weights from every token with global attention to every token
+            in the sequence.
+    """
+
+    last_hidden_state: tf.Tensor | None = None
+    hidden_states: tuple[tf.Tensor, ...] | None = None
+    attentions: tuple[tf.Tensor, ...] | None = None
+    global_attentions: tuple[tf.Tensor, ...] | None = None
+
+
+@dataclass
+class TFLEDSeq2SeqModelOutput(ModelOutput):
+    """
+    Base class for sequence-to-sequence model outputs that also contain pre-computed hidden states which can speed up
+    sequential decoding.
+
+    Args:
+        last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
+            Sequence of hidden-states at the output of the last layer of the decoder of the model.
+
+            If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
+            hidden_size)` is output.
+        past_key_values (`list[tf.Tensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+            List of `tf.Tensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size, num_heads,
+            sequence_length, embed_size_per_head)`.
+
+            Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
+            used (see `past_key_values` input) to speed up sequential decoding.
+        decoder_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
+            `(batch_size, sequence_length, hidden_size)`.
+
+            Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
+        decoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`.
+
+            Attention weights of the decoder, after the attention softmax, used to compute the weighted average in the
+            self-attention heads.
+        cross_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`.
+
+            Attention weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
+            weighted average in the cross-attention heads.
+        encoder_last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Sequence of hidden-states at the output of the last layer of the encoder of the model.
+        encoder_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
+            `(batch_size, sequence_length, hidden_size)`.
+
+            Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
+        encoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`.
+
+            Attention weights of the encoder, after the attention softmax, used to compute the weighted average in the
+            self-attention heads.
+        encoder_global_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x)`, where `x`
+            is the number of tokens with global attention mask.
+
+            Global attention weights after the attention softmax, used to compute the weighted average in the
+            self-attention heads. Those are the attention weights from every token with global attention to every token
+            in the sequence.
+    """
+
+    last_hidden_state: tf.Tensor | None = None
+    past_key_values: list[tf.Tensor] | None = None
+    decoder_hidden_states: tuple[tf.Tensor, ...] | None = None
+    decoder_attentions: tuple[tf.Tensor, ...] | None = None
+    cross_attentions: tuple[tf.Tensor, ...] | None = None
+    encoder_last_hidden_state: tf.Tensor | None = None
+    encoder_hidden_states: tuple[tf.Tensor, ...] | None = None
+    encoder_attentions: tuple[tf.Tensor, ...] | None = None
+    encoder_global_attentions: tuple[tf.Tensor, ...] | None = None
+
+
+@dataclass
+class TFLEDSeq2SeqLMOutput(ModelOutput):
+    """
+    Base class for sequence-to-sequence language model outputs.
+
+    Args:
+        loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+            Language modeling loss.
+        logits (`tf.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
+            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+        past_key_values (`list[tf.Tensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+            List of `tf.Tensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size, num_heads,
+            sequence_length, embed_size_per_head)`.
+
+            Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
+            used (see `past_key_values` input) to speed up sequential decoding.
+        decoder_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
+            `(batch_size, sequence_length, hidden_size)`.
+
+            Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
+        decoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`.
+
+            Attention weights of the decoder, after the attention softmax, used to compute the weighted average in the
+            self-attention heads.
+        cross_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`.
+
+            Attention weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
+            weighted average in the cross-attention heads.
+        encoder_last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Sequence of hidden-states at the output of the last layer of the encoder of the model.
+        encoder_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
+            `(batch_size, sequence_length, hidden_size)`.
+
+            Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
+        encoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`.
+
+            Attention weights of the encoder, after the attention softmax, used to compute the weighted average in the
+            self-attention heads.
+        encoder_global_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x)`, where `x`
+            is the number of tokens with global attention mask.
+
+            Global attention weights after the attention softmax, used to compute the weighted average in the
+            self-attention heads. Those are the attention weights from every token with global attention to every token
+            in the sequence.
+    """
+
+    loss: tf.Tensor | None = None
+    logits: tf.Tensor | None = None
+    past_key_values: list[tf.Tensor] | None = None
+    decoder_hidden_states: tuple[tf.Tensor, ...] | None = None
+    decoder_attentions: tuple[tf.Tensor, ...] | None = None
+    cross_attentions: tuple[tf.Tensor, ...] | None = None
+    encoder_last_hidden_state: tf.Tensor | None = None
+    encoder_hidden_states: tuple[tf.Tensor, ...] | None = None
+    encoder_attentions: tuple[tf.Tensor, ...] | None = None
+    encoder_global_attentions: tuple[tf.Tensor, ...] | None = None
+
+
+LED_START_DOCSTRING = r"""
+    This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.)
+
+    This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
+    as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
+    behavior.
+
+    <Tip>
+
+    TensorFlow models and layers in `transformers` accept two formats as input:
+
+    - having all inputs as keyword arguments (like PyTorch models), or
+    - having all inputs as a list, tuple or dict in the first positional argument.
+
+    The reason the second format is supported is that Keras methods prefer this format when passing inputs to models
+    and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just
+    pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second
+    format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with
+    the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first
+    positional argument:
+
+    - a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
+    - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
+    `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
+    - a dictionary with one or several input Tensors associated to the input names given in the docstring:
+    `model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
+
+    Note that when creating models and layers with
+    [subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry
+    about any of this, as you can just pass inputs like you would to any other Python function!
+
+    </Tip>
+
+    Args:
+        config ([`LEDConfig`]): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+LED_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`tf.Tensor` of shape `({0})`):
+            Indices of input sequence tokens in the vocabulary.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        attention_mask (`tf.Tensor` of shape `({0})`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
+        decoder_input_ids (`tf.Tensor` of shape `(batch_size, target_sequence_length)`, *optional*):
+            Indices of decoder input sequence tokens in the vocabulary.
+
+            Indices can be obtained using [`LEDTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+
+            LED uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
+            is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).
+        decoder_attention_mask (`tf.Tensor` of shape `(batch_size, target_sequence_length)`, *optional*):
+            A mask that ignores pad tokens is created by default. It is not recommended to set this for most use cases.
+        head_mask (`tf.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
+            Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`:
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+
+        decoder_head_mask (`tf.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+            Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in `[0, 1]`:
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+
+        encoder_outputs (`tf.Tensor`, *optional*):
+            of shape `(batch_size, sequence_length, hidden_size)`; a sequence of hidden states at the output of the
+            last layer of the encoder. Used in the cross-attention of the decoder.
+        past_key_values (`tuple[tuple[tf.Tensor]]` of length `config.n_layers`):
+            contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
+            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
+            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
+            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+            `past_key_values`). Set to `False` during training, `True` during generation
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the
+            config will be used instead.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
+            used instead.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in
+            eager mode, in graph mode the value will always be set to True.
+        training (`bool`, *optional*, defaults to `False`):
+            Whether or not to use the model in training mode (some modules like dropout modules have different
+            behaviors between training and evaluation).
+"""
+
+
+@keras_serializable
+class TFLEDEncoder(keras.layers.Layer):
+    """
+    Transformer encoder consisting of *config.encoder_layers* self-attention layers. Each layer is a
+    [`TFLEDEncoderLayer`].
+
+    Args:
+        config: LEDConfig
+    """
+    config_class = LEDConfig
+
+    def __init__(self, config: LEDConfig, embed_tokens: keras.layers.Embedding | None = None, **kwargs):
+        super().__init__(**kwargs)
+        self.config = config
+        self.dropout = keras.layers.Dropout(config.dropout)
+        if config.encoder_layerdrop > 0:
+            logger.warning("Layerdrop is currently disabled in TFLED models.")
+        self.layerdrop = 0.0  # fixed at 0.0 regardless of `config.encoder_layerdrop`
+        self.padding_idx = config.pad_token_id
+
+        if isinstance(config.attention_window, int):  # an int window is validated, then expanded on `config` itself
+            assert config.attention_window % 2 == 0, "`config.attention_window` has to be an even value"
+            assert config.attention_window > 0, "`config.attention_window` has to be positive"
+            config.attention_window = [config.attention_window] * config.num_hidden_layers  # one value per layer
+        else:
+            assert len(config.attention_window) == config.num_hidden_layers, (
+                "`len(config.attention_window)` should equal `config.num_hidden_layers`. "
+                f"Expected {config.num_hidden_layers}, given {len(config.attention_window)}"
+            )
+
+        self.attention_window = config.attention_window  # by this point a sequence with one entry per hidden layer
+        self.embed_tokens = embed_tokens
+        self.embed_positions = TFLEDLearnedPositionalEmbedding(
+            config.max_encoder_position_embeddings,
+            config.d_model,
+            name="embed_positions",
+        )
+        self.layers = [TFLEDEncoderLayer(config, i, name=f"layers.{i}") for i in range(config.encoder_layers)]
+        self.layernorm_embedding = keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_embedding")
+        self.embed_dim = config.d_model
+
+    def get_embed_tokens(self):  # returns the embedding layer that `call` applies to `input_ids`
+        return self.embed_tokens
+
+    def set_embed_tokens(self, embed_tokens):  # replaces the embedding layer that `call` applies to `input_ids`
+        self.embed_tokens = embed_tokens
+
+    @unpack_inputs
+    def call(
+        self,
+        input_ids=None,
+        inputs_embeds=None,
+        attention_mask=None,
+        global_attention_mask=None,
+        head_mask=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+        training=False,
+    ):
+        """
+        Args:
+            input_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`):
+                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
+                provide it.
+
+                Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+                [`PreTrainedTokenizer.__call__`] for details.
+
+                [What are input IDs?](../glossary#input-ids)
+            attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
+
+                [What are attention masks?](../glossary#attention-mask)
+            head_mask (`tf.Tensor` of shape `(num_layers, num_heads)`, *optional*):
+                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
+
+                - 1 indicates the head is **not masked**,
+                - 0 indicates the head is **masked**.
+
+            inputs_embeds (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
+                than the model's internal embedding lookup matrix.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            output_hidden_states (`bool`, *optional*):
+                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+                for more detail.
+            return_dict (`bool`, *optional*):
+                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+        """
+
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+        elif input_ids is not None:
+            input_shape = shape_list(input_ids)
+            check_embeddings_within_bounds(input_ids, self.embed_tokens.input_dim)
+            inputs_embeds = self.embed_tokens(input_ids)
+        elif inputs_embeds is not None:
+            input_shape = shape_list(inputs_embeds)[:-1]
+        else:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+        if attention_mask is None:
+            attention_mask = tf.fill(input_shape, 1)
+
+        # merge `global_attention_mask` and `attention_mask`
+        if global_attention_mask is not None:
+            attention_mask = attention_mask * tf.cast((global_attention_mask + 1), dtype=attention_mask.dtype)
+
+        padding_len, input_ids, attention_mask, inputs_embeds = self._pad_to_window_size(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            inputs_embeds=inputs_embeds,
+            pad_token_id=self.padding_idx,
+        )
+
+        input_shape = shape_list(attention_mask)
+        # is index masked or global attention
+        is_index_masked = tf.math.less(tf.cast(attention_mask, tf.int8), 1)
+        is_index_global_attn = tf.math.greater(tf.cast(attention_mask, tf.int8), 1)
+        is_global_attn = tf.math.reduce_any(is_index_global_attn)
+
+        embed_pos = self.embed_positions(input_shape)
+        hidden_states = inputs_embeds + embed_pos
+        hidden_states = self.layernorm_embedding(hidden_states)
+        hidden_states = self.dropout(hidden_states, training=training)
+
+        # check attention mask and invert
+        if attention_mask is not None:
+            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+            attention_mask = _expand_mask(attention_mask)[:, 0, 0, :]
+            attention_mask = attention_mask[:, :, None, None]
+
+        encoder_states = () if output_hidden_states else None
+        all_attentions = all_global_attentions = () if output_attentions else None
+
+        # check if head_mask has a correct number of layers specified if desired
+        if head_mask is not None:
+            tf.debugging.assert_equal(
+                shape_list(head_mask)[0],
+                len(self.layers),
+                message=(
+                    f"The head_mask should be specified for {len(self.layers)} layers, but it is for"
+                    f" {shape_list(head_mask)[0]}."
+                ),
+            )
+
+        # encoder layers
+        for idx, encoder_layer in enumerate(self.layers):
+            if output_hidden_states:
+                hidden_states_to_add = self.compute_hidden_states(hidden_states, padding_len)
+                encoder_states = encoder_states + (hidden_states_to_add,)
+            # add LayerDrop (see https://huggingface.co/papers/1909.11556 for description)
+            dropout_probability = random.uniform(0, 1)
+            if training and (dropout_probability < self.layerdrop):  # skip the layer
+                continue
+
+            layer_outputs = encoder_layer(
+                hidden_states=hidden_states,
+                attention_mask=attention_mask,
+                layer_head_mask=head_mask[idx] if head_mask is not None else None,
+                is_index_masked=is_index_masked,
+                is_index_global_attn=is_index_global_attn,
+                is_global_attn=is_global_attn,
+            )
+
+            hidden_states = layer_outputs[0]
+
+            if output_attentions:
+                # bzs x seq_len x num_attn_heads x (num_global_attn + attention_window_len + 1) => bzs x num_attn_heads x seq_len x (num_global_attn + attention_window_len + 1)
+                all_attentions = all_attentions + (tf.transpose(layer_outputs[1], (0, 2, 1, 3)),)
+
+                # bzs x num_attn_heads x num_global_attn x seq_len => bzs x num_attn_heads x seq_len x num_global_attn
+                all_global_attentions = all_global_attentions + (tf.transpose(layer_outputs[2], (0, 1, 3, 2)),)
+
+        # undo padding
+        # unpad `hidden_states` because the calling function is expecting a length == input_ids.size(1)
+        hidden_states = self.compute_hidden_states(hidden_states, padding_len)
+
+        # undo padding
+        if output_attentions:
+            all_attentions = (
+                tuple(state[:, :, :-padding_len, :] for state in all_attentions) if padding_len > 0 else all_attentions
+            )
+
+        if output_hidden_states:
+            encoder_states = encoder_states + (hidden_states,)
+
+        if not return_dict:
+            return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
+        return TFLEDEncoderBaseModelOutput(
+            last_hidden_state=hidden_states,
+            hidden_states=encoder_states,
+            attentions=all_attentions,
+            global_attentions=all_global_attentions,
+        )
+
+    @tf.function
+    def compute_hidden_states(self, hidden_states, padding_len):  # strips the padding added for the window
+        return hidden_states[:, :-padding_len] if padding_len > 0 else hidden_states  # axis 1, tail only
+
+    def _pad_to_window_size(
+        self,
+        input_ids,
+        attention_mask,
+        inputs_embeds,
+        pad_token_id,
+    ):
+        """Right-pad ids, embeddings and mask so the sequence length is a multiple of the attention window."""
+        # a non-int `attention_window` is reduced to its maximum value
+        attention_window = (
+            self.attention_window if isinstance(self.attention_window, int) else max(self.attention_window)
+        )
+
+        assert attention_window % 2 == 0, f"`attention_window` should be an even value. Given {attention_window}"
+
+        input_shape = shape_list(input_ids) if input_ids is not None else shape_list(inputs_embeds)
+        batch_size, seq_len = input_shape[:2]
+        padding_len = (attention_window - seq_len % attention_window) % attention_window  # 0 if already a multiple
+
+        if padding_len > 0:
+            logger.warning_once(
+                f"Input ids are automatically padded from {seq_len} to {seq_len + padding_len} to be a multiple of "
+                f"`config.attention_window`: {attention_window}"
+            )
+
+        paddings = tf.convert_to_tensor([[0, 0], [0, padding_len]])  # pad only the end of axis 1
+
+        if input_ids is not None:
+            input_ids = tf.pad(input_ids, paddings, constant_values=pad_token_id)
+
+        if inputs_embeds is not None:
+            if padding_len > 0:  # embed a block of pad tokens and append it on axis -2
+                input_ids_padding = tf.fill((batch_size, padding_len), pad_token_id)
+                inputs_embeds_padding = self.embed_tokens(input_ids_padding)
+                inputs_embeds = tf.concat([inputs_embeds, inputs_embeds_padding], axis=-2)
+
+        attention_mask = tf.pad(attention_mask, paddings, constant_values=False)  # no attention on the padding tokens
+
+        return (  # `padding_len` comes first so the caller can undo the padding later
+            padding_len,
+            input_ids,
+            attention_mask,
+            inputs_embeds,
+        )
+
+    def build(self, input_shape=None):  # builds each sublayer once, inside that sublayer's own name scope
+        if self.built:
+            return
+        self.built = True  # flagged before the sublayers are built, so a repeated call returns early
+        if getattr(self, "embed_positions", None) is not None:
+            with tf.name_scope(self.embed_positions.name):
+                self.embed_positions.build(None)
+        if getattr(self, "layernorm_embedding", None) is not None:
+            with tf.name_scope(self.layernorm_embedding.name):
+                self.layernorm_embedding.build([None, None, self.embed_dim])  # only the last dim is fixed
+        if getattr(self, "layers", None) is not None:
+            for layer in self.layers:
+                with tf.name_scope(layer.name):
+                    layer.build(None)
+
+
+@keras_serializable
+class TFLEDDecoder(keras.layers.Layer):
+    """Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`TFLEDDecoderLayer`].
+
+    Args:
+        config: LEDConfig
+        embed_tokens: embedding layer applied to `input_ids` in `call` (can be replaced via `set_embed_tokens`)
+    """
+
+    config_class = LEDConfig
+
+    def __init__(self, config: LEDConfig, embed_tokens: keras.layers.Embedding | None = None, **kwargs):
+        super().__init__(**kwargs)
+        self.config = config
+        self.padding_idx = config.pad_token_id
+        self.embed_tokens = embed_tokens
+        if config.decoder_layerdrop > 0:
+            logger.warning("Layerdrop is currently disabled in TFLED models.")
+        self.layerdrop = 0.0  # forced off: the LayerDrop test in `call` can never be true with 0.0
+        self.embed_positions = TFLEDLearnedPositionalEmbedding(
+            config.max_decoder_position_embeddings,
+            config.d_model,
+            name="embed_positions",
+        )
+        self.layers = [TFLEDDecoderLayer(config, name=f"layers.{i}") for i in range(config.decoder_layers)]
+        self.layernorm_embedding = keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_embedding")
+
+        self.dropout = keras.layers.Dropout(config.dropout)
+
+    def set_embed_tokens(self, embed_tokens):  # swaps the layer used to embed `input_ids`
+        self.embed_tokens = embed_tokens
+
+    @unpack_inputs
+    def call(
+        self,
+        input_ids=None,
+        inputs_embeds=None,
+        attention_mask=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        head_mask=None,
+        encoder_head_mask=None,
+        past_key_values=None,
+        use_cache=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+        training=False,
+    ):
+        r"""
+        Args:
+            input_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`):
+                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
+                provide it. Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+                [`PreTrainedTokenizer.__call__`] for details. [What are input IDs?](../glossary#input-ids)
+            attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
+                [What are attention masks?](../glossary#attention-mask)
+            encoder_hidden_states (`tf.Tensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
+                Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
+                of the decoder.
+            encoder_attention_mask (`tf.Tensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
+                Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
+                selected in `[0, 1]`:
+
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
+                [What are attention masks?](../glossary#attention-mask)
+            head_mask (`tf.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
+
+                - 1 indicates the head is **not masked**,
+                - 0 indicates the head is **masked**.
+
+            encoder_head_mask (`tf.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
+                Mask to nullify selected heads of the attention modules in encoder to avoid performing cross-attention
+                on hidden heads. Mask values selected in `[0, 1]`:
+
+                - 1 indicates the head is **not masked**,
+                - 0 indicates the head is **masked**.
+
+            past_key_values (`tuple[tuple[tf.Tensor]]` of length `config.n_layers` with each tuple having 2 tuples each of which has 2 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
+                Contains precomputed key and value hidden-states of the attention blocks. Can be used to speed up
+                decoding. If `past_key_values` are used, the user can optionally input only the last
+                `decoder_input_ids` (those that don't have their past key value states given to this model) of shape
+                `(batch_size, 1)` instead of all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+            inputs_embeds (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
+                than the model's internal embedding lookup matrix.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            output_hidden_states (`bool`, *optional*):
+                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+                for more detail.
+            return_dict (`bool`, *optional*):
+                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+        """
+
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
+        elif input_ids is not None:
+            input_shape = shape_list(input_ids)
+        elif inputs_embeds is not None:
+            input_shape = shape_list(inputs_embeds)[:-1]
+        else:
+            raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
+
+        past_key_values_length = shape_list(past_key_values[0][0])[2] if past_key_values is not None else 0
+
+        # embed positions, offset by the number of positions already held in the cache
+        positions = self.embed_positions(input_shape, past_key_values_length)
+
+        if inputs_embeds is None:
+            check_embeddings_within_bounds(input_ids, self.embed_tokens.input_dim)
+            inputs_embeds = self.embed_tokens(input_ids)
+
+        hidden_states = inputs_embeds
+
+        # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+        if input_shape[-1] > 1:
+            combined_attention_mask = _make_causal_mask(input_shape, past_key_values_length=past_key_values_length)
+        else:
+            combined_attention_mask = _expand_mask(
+                tf.ones((input_shape[0], input_shape[1] + past_key_values_length)), tgt_len=input_shape[-1]
+            )
+
+        if attention_mask is not None and input_shape[-1] > 1:
+            combined_attention_mask = combined_attention_mask + _expand_mask(attention_mask, tgt_len=input_shape[-1])
+
+        if encoder_hidden_states is not None and encoder_attention_mask is not None:
+            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+            encoder_attention_mask = _expand_mask(encoder_attention_mask, tgt_len=input_shape[-1])
+
+        hidden_states = self.layernorm_embedding(hidden_states + positions)
+        hidden_states = self.dropout(hidden_states, training=training)
+
+        # per-layer outputs; each is reset to None below when it was not requested
+        all_hidden_states = ()
+        all_self_attns = ()
+        all_cross_attentions = ()
+        present_key_values = ()
+
+        # check if head_mask has a correct number of layers specified if desired
+        if head_mask is not None:
+            tf.debugging.assert_equal(
+                shape_list(head_mask)[0],
+                len(self.layers),
+                message=(
+                    f"The head_mask should be specified for {len(self.layers)} layers, but it is for"
+                    f" {shape_list(head_mask)[0]}."
+                ),
+            )
+
+        for idx, decoder_layer in enumerate(self.layers):
+            # record the input of this layer before it is (possibly) skipped by LayerDrop
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+            dropout_probability = random.uniform(0, 1)
+
+            if training and (dropout_probability < self.layerdrop):  # see https://huggingface.co/papers/1909.11556
+                continue
+
+            past_key_value = past_key_values[idx] if past_key_values is not None else None
+
+            hidden_states, layer_self_attn, layer_cross_attn, present_key_value = decoder_layer(
+                hidden_states,
+                attention_mask=combined_attention_mask,
+                encoder_hidden_states=encoder_hidden_states,
+                encoder_attention_mask=encoder_attention_mask,
+                layer_head_mask=head_mask[idx] if head_mask is not None else None,
+                encoder_layer_head_mask=encoder_head_mask[idx] if encoder_head_mask is not None else None,
+                past_key_value=past_key_value,
+            )
+
+            if use_cache:
+                present_key_values += (present_key_value,)
+
+            if output_attentions:
+                all_self_attns += (layer_self_attn,)
+                all_cross_attentions += (layer_cross_attn,)
+
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)  # output of the last layer
+        else:
+            all_hidden_states = None
+
+        all_self_attns = all_self_attns if output_attentions else None
+        all_cross_attentions = all_cross_attentions if output_attentions else None
+
+        present_key_values = present_key_values if use_cache else None
+
+        if not return_dict:
+            return tuple(  # entries that are None are dropped, so tuple positions shift with the flags
+                v
+                for v in [hidden_states, present_key_values, all_hidden_states, all_self_attns, all_cross_attentions]
+                if v is not None
+            )
+        else:
+            return TFBaseModelOutputWithPastAndCrossAttentions(
+                last_hidden_state=hidden_states,
+                past_key_values=present_key_values,
+                hidden_states=all_hidden_states,
+                attentions=all_self_attns,
+                cross_attentions=all_cross_attentions,
+            )
+
+    def build(self, input_shape=None):  # builds each sublayer once, inside that sublayer's own name scope
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "embed_positions", None) is not None:
+            with tf.name_scope(self.embed_positions.name):
+                self.embed_positions.build(None)
+        if getattr(self, "layernorm_embedding", None) is not None:
+            with tf.name_scope(self.layernorm_embedding.name):
+                self.layernorm_embedding.build([None, None, self.config.d_model])
+        if getattr(self, "layers", None) is not None:
+            for layer in self.layers:
+                with tf.name_scope(layer.name):
+                    layer.build(None)
+
+
+@keras_serializable
+class TFLEDMainLayer(keras.layers.Layer):  # encoder + decoder that share one token-embedding layer
+    config_class = LEDConfig
+
+    def __init__(self, config: LEDConfig, **kwargs):
+        super().__init__(**kwargs)
+        self.config = config
+        self.shared = keras.layers.Embedding(
+            input_dim=config.vocab_size,
+            output_dim=config.d_model,
+            embeddings_initializer=keras.initializers.TruncatedNormal(stddev=self.config.init_std),
+            name="led.shared",
+        )
+        # Additional attribute to specify the expected name scope of the layer (for loading/storing weights)
+        self.shared.load_weight_prefix = "led.shared"
+
+        self.encoder = TFLEDEncoder(config, self.shared, name="encoder")
+        self.decoder = TFLEDDecoder(config, self.shared, name="decoder")
+
+    def get_input_embeddings(self):
+        return self.shared
+
+    def set_input_embeddings(self, new_embeddings):  # keeps encoder and decoder pointing at the same layer
+        self.shared = new_embeddings
+        self.encoder.embed_tokens = self.shared
+        self.decoder.embed_tokens = self.shared
+
+    @unpack_inputs
+    def call(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        decoder_input_ids=None,
+        decoder_attention_mask=None,
+        head_mask=None,
+        decoder_head_mask=None,
+        encoder_outputs: tuple | TFLEDEncoderBaseModelOutput | None = None,
+        global_attention_mask=None,
+        past_key_values=None,
+        inputs_embeds=None,
+        decoder_inputs_embeds=None,
+        use_cache=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+        training=False,
+        **kwargs,
+    ):
+        if decoder_input_ids is None and decoder_inputs_embeds is None:
+            use_cache = False  # caching is switched off when no decoder inputs were given
+
+        if encoder_outputs is None:  # run the encoder only when the caller did not supply its outputs
+            encoder_outputs = self.encoder(
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+                global_attention_mask=global_attention_mask,
+                head_mask=head_mask,
+                inputs_embeds=inputs_embeds,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+                training=training,
+            )
+        # If the user passed a tuple for encoder_outputs, we wrap it in a TFLEDEncoderBaseModelOutput when return_dict=True
+        elif return_dict and not isinstance(encoder_outputs, TFLEDEncoderBaseModelOutput):
+            encoder_outputs = TFLEDEncoderBaseModelOutput(
+                last_hidden_state=encoder_outputs[0],
+                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
+                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
+            )
+        # If the user passed a TFLEDEncoderBaseModelOutput for encoder_outputs, we convert it to a tuple when return_dict=False
+        elif not return_dict and not isinstance(encoder_outputs, tuple):
+            encoder_outputs = encoder_outputs.to_tuple()
+
+        decoder_outputs = self.decoder(
+            decoder_input_ids,
+            attention_mask=decoder_attention_mask,
+            encoder_hidden_states=encoder_outputs[0],
+            encoder_attention_mask=attention_mask,  # the encoder-side mask is reused as the cross-attention mask
+            head_mask=decoder_head_mask,
+            encoder_head_mask=head_mask,
+            past_key_values=past_key_values,
+            inputs_embeds=decoder_inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+
+        if not return_dict:
+            return decoder_outputs + encoder_outputs  # concatenation: decoder items first, then encoder items
+
+        return TFLEDSeq2SeqModelOutput(
+            last_hidden_state=decoder_outputs.last_hidden_state,
+            past_key_values=decoder_outputs.past_key_values,
+            decoder_hidden_states=decoder_outputs.hidden_states,
+            decoder_attentions=decoder_outputs.attentions,
+            cross_attentions=decoder_outputs.cross_attentions,
+            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
+            encoder_hidden_states=encoder_outputs.hidden_states,
+            encoder_attentions=encoder_outputs.attentions,
+            encoder_global_attentions=encoder_outputs.global_attentions,
+        )
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        # The shared/tied weights expect to be in the model base namespace
+        # Adding "/" to the end (not the start!) of a tf.name_scope puts it in the root namespace rather than
+        # the current one.
+        with tf.name_scope(self.shared.load_weight_prefix + "/" + self.shared.name + "/"):
+            self.shared.build(None)
+        if getattr(self, "encoder", None) is not None:
+            with tf.name_scope(self.encoder.name):
+                self.encoder.build(None)
+        if getattr(self, "decoder", None) is not None:
+            with tf.name_scope(self.decoder.name):
+                self.decoder.build(None)
+
+
+@add_start_docstrings(
+    "The bare LED Model outputting raw hidden-states without any specific head on top.",
+    LED_START_DOCSTRING,
+)
+class TFLEDModel(TFLEDPreTrainedModel):  # thin wrapper: `call` forwards everything to `self.led`
+    def __init__(self, config, *inputs, **kwargs):
+        super().__init__(config, *inputs, **kwargs)
+
+        self.led = TFLEDMainLayer(config, name="led")
+
+    def get_encoder(self):
+        return self.led.encoder
+
+    def get_decoder(self):
+        return self.led.decoder
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(LED_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_code_sample_docstrings(
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=TFLEDSeq2SeqModelOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def call(
+        self,
+        input_ids: TFModelInputType | None = None,
+        attention_mask: tf.Tensor | None = None,
+        decoder_input_ids: tf.Tensor | None = None,
+        decoder_attention_mask: tf.Tensor | None = None,
+        head_mask: tf.Tensor | None = None,
+        decoder_head_mask: tf.Tensor | None = None,
+        encoder_outputs: tf.Tensor | None = None,
+        global_attention_mask: tf.Tensor | None = None,
+        past_key_values: tuple[tuple[tf.Tensor]] | None = None,
+        inputs_embeds: tf.Tensor | None = None,
+        decoder_inputs_embeds: tf.Tensor | None = None,
+        use_cache: bool | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        training: bool = False,
+        **kwargs,
+    ) -> tuple[tf.Tensor] | TFLEDSeq2SeqModelOutput:
+        outputs = self.led(  # note: `**kwargs` is accepted above but not forwarded
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            decoder_input_ids=decoder_input_ids,
+            decoder_attention_mask=decoder_attention_mask,
+            encoder_outputs=encoder_outputs,
+            global_attention_mask=global_attention_mask,
+            head_mask=head_mask,
+            decoder_head_mask=decoder_head_mask,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            decoder_inputs_embeds=decoder_inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+
+        return outputs
+
+    def serving_output(self, output):  # optional fields are kept or set to None based on `self.config` flags
+        pkv = tf.tuple(output.past_key_values)[1] if self.config.use_cache else None
+        dec_hs = tf.convert_to_tensor(output.decoder_hidden_states) if self.config.output_hidden_states else None
+        dec_attns = tf.convert_to_tensor(output.decoder_attentions) if self.config.output_attentions else None
+        cross_attns = tf.convert_to_tensor(output.cross_attentions) if self.config.output_attentions else None
+        enc_hs = tf.convert_to_tensor(output.encoder_hidden_states) if self.config.output_hidden_states else None
+        enc_attns = tf.convert_to_tensor(output.encoder_attentions) if self.config.output_attentions else None
+        enc_g_attns = tf.convert_to_tensor(output.encoder_global_attentions) if self.config.output_attentions else None
+
+        return TFLEDSeq2SeqModelOutput(
+            last_hidden_state=output.last_hidden_state,
+            past_key_values=pkv,
+            decoder_hidden_states=dec_hs,
+            decoder_attentions=dec_attns,
+            cross_attentions=cross_attns,
+            encoder_last_hidden_state=output.encoder_last_hidden_state,
+            encoder_hidden_states=enc_hs,
+            encoder_attentions=enc_attns,
+            encoder_global_attentions=enc_g_attns,
+        )
+
+    def build(self, input_shape=None):  # builds `self.led` once, inside its own name scope
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "led", None) is not None:
+            with tf.name_scope(self.led.name):
+                self.led.build(None)
+
+
+# Copied from transformers.models.bart.modeling_tf_bart.BiasLayer
+class BiasLayer(keras.layers.Layer):
+    """
+    Bias as a layer. It is used for serialization purposes: `keras.Model.save_weights` stores on a per-layer basis,
+    so all weights have to be registered in a layer.
+    """
+
+    def __init__(self, shape, initializer, trainable, name, **kwargs):  # `name` is used for the layer and the weight
+        super().__init__(name=name, **kwargs)
+        # Note: the name of this variable will NOT be scoped when serialized, i.e. it will not be in the format of
+        # "outer_layer/inner_layer/.../name:0". Instead, it will be "name:0". For further details, see:
+        # https://github.com/huggingface/transformers/pull/18833#issuecomment-1233090214
+        self.bias = self.add_weight(name=name, shape=shape, initializer=initializer, trainable=trainable)
+
+    def call(self, x):  # adds the stored bias to the input
+        return x + self.bias
+
+
+@add_start_docstrings(
+    "The LED Model with a language modeling head. Can be used for summarization.",
+    LED_START_DOCSTRING,
+)
+class TFLEDForConditionalGeneration(TFLEDPreTrainedModel):
+    _keys_to_ignore_on_load_unexpected = [
+        r"led.encoder.embed_tokens.weight",
+        r"led.decoder.embed_tokens.weight",
+    ]
+
+    def __init__(self, config, *inputs, **kwargs):
+        super().__init__(config, *inputs, **kwargs)
+        self.led = TFLEDMainLayer(config, name="led")
+        self.use_cache = config.use_cache
+        # final_bias_logits is registered as a buffer in pytorch, so not trainable for the sake of consistency.
+        self.bias_layer = BiasLayer(
+            name="final_logits_bias", shape=[1, config.vocab_size], initializer="zeros", trainable=False
+        )
+
+        # TODO (Joao): investigate why LED has numerical issues in XLA generate
+        self.supports_xla_generation = False
+
+    def get_decoder(self):
+        return self.led.decoder
+
+    def get_encoder(self):
+        return self.led.encoder
+
+    def get_bias(self):
+        return {"final_logits_bias": self.bias_layer.bias}
+
+    def set_bias(self, value):
+        # Replaces the existing layers containing bias for correct (de)serialization.
+        vocab_size = value["final_logits_bias"].shape[-1]
+        self.bias_layer = BiasLayer(
+            name="final_logits_bias", shape=[1, vocab_size], initializer="zeros", trainable=False
+        )
+        self.bias_layer.bias.assign(value["final_logits_bias"])
+
+    def get_output_embeddings(self):
+        return self.get_input_embeddings()
+
+    def set_output_embeddings(self, value):
+        self.set_input_embeddings(value)
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(LED_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=TFLEDSeq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
+    def call(
+        self,
+        input_ids: TFModelInputType | None = None,
+        attention_mask: np.ndarray | tf.Tensor | None = None,
+        decoder_input_ids: np.ndarray | tf.Tensor | None = None,
+        decoder_attention_mask: np.ndarray | tf.Tensor | None = None,
+        head_mask: np.ndarray | tf.Tensor | None = None,
+        decoder_head_mask: np.ndarray | tf.Tensor | None = None,
+        encoder_outputs: TFLEDEncoderBaseModelOutput | None = None,
+        global_attention_mask: np.ndarray | tf.Tensor | None = None,
+        past_key_values: tuple[tuple[np.ndarray | tf.Tensor]] | None = None,
+        inputs_embeds: np.ndarray | tf.Tensor | None = None,
+        decoder_inputs_embeds: np.ndarray | tf.Tensor | None = None,
+        use_cache: bool | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        labels: tf.Tensor | None = None,
+        training: bool = False,
+    ) -> tuple[tf.Tensor] | TFLEDSeq2SeqLMOutput:
+        """
+        Returns:
+
+        Examples:
+
+        ```python
+        >>> from transformers import AutoTokenizer, TFLEDForConditionalGeneration
+        >>> import tensorflow as tf
+
+        >>> mname = "allenai/led-base-16384"
+        >>> tokenizer = AutoTokenizer.from_pretrained(mname)
+        >>> TXT = "My friends are <mask> but they eat too many carbs."
+        >>> model = TFLEDForConditionalGeneration.from_pretrained(mname)
+        >>> batch = tokenizer([TXT], return_tensors="tf")
+        >>> logits = model(inputs=batch.input_ids).logits
+        >>> probs = tf.nn.softmax(logits[0])
+        >>> # probs[5] is associated with the mask token
+        ```"""
+
+        if labels is not None:
+            use_cache = False
+            if decoder_input_ids is None and decoder_inputs_embeds is None:
+                decoder_input_ids = shift_tokens_right(
+                    labels, self.config.pad_token_id, self.config.decoder_start_token_id
+                )
+
+        outputs = self.led(
+            input_ids,
+            attention_mask=attention_mask,
+            decoder_input_ids=decoder_input_ids,
+            decoder_attention_mask=decoder_attention_mask,
+            encoder_outputs=encoder_outputs,
+            global_attention_mask=global_attention_mask,
+            head_mask=head_mask,
+            decoder_head_mask=decoder_head_mask,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            decoder_inputs_embeds=decoder_inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+        lm_logits = tf.matmul(outputs[0], self.led.shared.weights, transpose_b=True)
+        lm_logits = self.bias_layer(lm_logits)
+        masked_lm_loss = None if labels is None else self.hf_compute_loss(labels, lm_logits)
+
+        if not return_dict:
+            output = (lm_logits,) + outputs[1:]
+            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
+        return TFLEDSeq2SeqLMOutput(
+            loss=masked_lm_loss,
+            logits=lm_logits,
+            past_key_values=outputs.past_key_values,  # index 1 of d outputs
+            decoder_hidden_states=outputs.decoder_hidden_states,  # index 2 of d outputs
+            decoder_attentions=outputs.decoder_attentions,  # index 3 of d outputs
+            cross_attentions=outputs.cross_attentions,  # index 4 of d outputs
+            encoder_last_hidden_state=outputs.encoder_last_hidden_state,  # index 0 of encoder outputs
+            encoder_hidden_states=outputs.encoder_hidden_states,  # 1 of e out
+            encoder_attentions=outputs.encoder_attentions,  # 2 of e out
+            encoder_global_attentions=outputs.encoder_global_attentions,
+        )
+
+    def serving_output(self, output):
+        pkv = tf.tuple(output.past_key_values)[1] if self.config.use_cache else None
+        dec_hs = tf.convert_to_tensor(output.decoder_hidden_states) if self.config.output_hidden_states else None
+        dec_attns = tf.convert_to_tensor(output.decoder_attentions) if self.config.output_attentions else None
+        cross_attns = tf.convert_to_tensor(output.cross_attentions) if self.config.output_attentions else None
+        enc_hs = tf.convert_to_tensor(output.encoder_hidden_states) if self.config.output_hidden_states else None
+        enc_attns = tf.convert_to_tensor(output.encoder_attentions) if self.config.output_attentions else None
+        enc_g_attns = tf.convert_to_tensor(output.encoder_global_attentions) if self.config.output_attentions else None
+
+        return TFLEDSeq2SeqLMOutput(
+            logits=output.logits,
+            past_key_values=pkv,
+            decoder_hidden_states=dec_hs,
+            decoder_attentions=dec_attns,
+            cross_attentions=cross_attns,
+            encoder_last_hidden_state=output.encoder_last_hidden_state,
+            encoder_hidden_states=enc_hs,
+            encoder_attentions=enc_attns,
+            encoder_global_attentions=enc_g_attns,
+        )
+
+    def prepare_inputs_for_generation(
+        self,
+        decoder_input_ids,
+        past_key_values=None,
+        attention_mask=None,
+        head_mask=None,
+        decoder_head_mask=None,
+        use_cache=None,
+        encoder_outputs=None,
+        **kwargs,
+    ):
+        # cut decoder_input_ids if past is used
+        if past_key_values is not None:
+            decoder_input_ids = decoder_input_ids[:, -1:]
+
+        return {
+            "input_ids": None,  # encoder_outputs is defined. input_ids not needed
+            "encoder_outputs": encoder_outputs,
+            "past_key_values": past_key_values,
+            "decoder_input_ids": decoder_input_ids,
+            "attention_mask": attention_mask,
+            "head_mask": head_mask,
+            "decoder_head_mask": decoder_head_mask,
+            "use_cache": use_cache,  # change this to avoid caching (presumably for debugging)
+        }
+
+    def prepare_decoder_input_ids_from_labels(self, labels: tf.Tensor):  # decoder inputs = `shift_tokens_right` of labels
+        return shift_tokens_right(labels, self.config.pad_token_id, self.config.decoder_start_token_id)
+
+    def hf_compute_loss(self, labels, logits):
+        """CrossEntropyLoss (sparse, computed from logits) that ignores positions whose label is `config.pad_token_id`."""
+        loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction=keras.losses.Reduction.NONE)
+        if self.config.tf_legacy_loss:  # legacy path: unreduced loss over the non-pad positions only
+            melted_labels = tf.reshape(labels, (-1,))
+            active_loss = tf.not_equal(melted_labels, self.config.pad_token_id)
+            reduced_logits = tf.boolean_mask(tf.reshape(logits, (-1, shape_list(logits)[2])), active_loss)
+            labels = tf.boolean_mask(melted_labels, active_loss)
+            return loss_fn(labels, reduced_logits)
+        # Default path: masked loss summed and divided by the number of non-pad positions, reshaped to (1,).
+        # Clip negative labels to zero here to avoid NaNs and errors - those positions will get masked later anyway
+        unmasked_loss = loss_fn(tf.nn.relu(labels), logits)
+        # make sure only non-padding labels affect the loss (NOTE(review): only `pad_token_id` is masked - confirm)
+        loss_mask = tf.cast(labels != self.config.pad_token_id, dtype=unmasked_loss.dtype)
+        masked_loss = unmasked_loss * loss_mask
+        reduced_masked_loss = tf.reduce_sum(masked_loss) / tf.reduce_sum(loss_mask)  # NOTE(review): 0/0 if all labels are pad
+        return tf.reshape(reduced_masked_loss, (1,))
+
+    def build(self, input_shape=None):  # `input_shape` is not read; the sub-layers are built with `None`
+        if self.built:
+            return
+        self.built = True  # set before the sub-layers are built, so a repeated call returns early
+        if getattr(self, "led", None) is not None:
+            with tf.name_scope(self.led.name):  # each sub-layer is built inside a scope named after it
+                self.led.build(None)
+        if getattr(self, "bias_layer", None) is not None:
+            with tf.name_scope(self.bias_layer.name):
+                self.bias_layer.build(None)
+
+
+__all__ = ["TFLEDForConditionalGeneration", "TFLEDModel", "TFLEDPreTrainedModel"]
diff --git a/phivenv/Lib/site-packages/transformers/models/led/tokenization_led.py b/phivenv/Lib/site-packages/transformers/models/led/tokenization_led.py
new file mode 100644
index 0000000000000000000000000000000000000000..d110ac30d969e1958a872e8bdc186df0822a25b5
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/led/tokenization_led.py
@@ -0,0 +1,454 @@
+# coding=utf-8
+# Copyright 2021 Iz Beltagy, Matthew E. Peters, Arman Cohan and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization classes for LED."""
+
+import json
+import os
+from functools import lru_cache
+from typing import Optional, Union
+
+import regex as re
+
+from ...tokenization_utils import AddedToken, PreTrainedTokenizer
+from ...tokenization_utils_base import BatchEncoding, EncodedInput
+from ...utils import PaddingStrategy, logging
+
+
+logger = logging.get_logger(__name__)
+
+
+VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt"}
+
+# See all LED models at https://huggingface.co/models?filter=LED
+
+
+@lru_cache
+# Copied from transformers.models.bart.tokenization_bart.bytes_to_unicode
+def bytes_to_unicode():
+    """
+    Returns list of utf-8 byte and a mapping to unicode strings. We specifically avoids mapping to whitespace/control
+    characters the bpe code barfs on.
+
+    The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab
+    if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for
+    decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup
+    tables between utf-8 bytes and unicode strings.
+    """
+    bs = (
+        list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1))
+    )
+    cs = bs[:]
+    n = 0
+    for b in range(2**8):
+        if b not in bs:
+            bs.append(b)
+            cs.append(2**8 + n)
+            n += 1
+    cs = [chr(n) for n in cs]
+    return dict(zip(bs, cs))
+
+
+# Copied from transformers.models.bart.tokenization_bart.get_pairs
+def get_pairs(word):
+    """
+    Return set of symbol pairs in a word.
+
+    Word is represented as tuple of symbols (symbols being variable-length strings).
+    """
+    pairs = set()
+    prev_char = word[0]
+    for char in word[1:]:
+        pairs.add((prev_char, char))
+        prev_char = char
+    return pairs
+
+
+class LEDTokenizer(PreTrainedTokenizer):
+    """
+    Constructs a LED tokenizer, which is smilar to the ROBERTa tokenizer, using byte-level Byte-Pair-Encoding.
+
+    This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
+    be encoded differently whether it is at the beginning of the sentence (without space) or not:
+
+    ```python
+    >>> from transformers import LEDTokenizer
+
+    >>> tokenizer = LEDTokenizer.from_pretrained("allenai/led-base-16384")
+    >>> tokenizer("Hello world")["input_ids"]
+    [0, 31414, 232, 2]
+
+    >>> tokenizer(" Hello world")["input_ids"]
+    [0, 20920, 232, 2]
+    ```
+
+    You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer or when you
+    call it on some text, but since the model was not pretrained this way, it might yield a decrease in performance.
+
+    <Tip>
+
+    When used with `is_split_into_words=True`, this tokenizer will add a space before each word (even the first one).
+
+    </Tip>
+
+    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
+    this superclass for more information regarding those methods.
+
+    Args:
+        vocab_file (`str`):
+            Path to the vocabulary file.
+        merges_file (`str`):
+            Path to the merges file.
+        errors (`str`, *optional*, defaults to `"replace"`):
+            Paradigm to follow when decoding bytes to UTF-8. See
+            [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
+        bos_token (`str`, *optional*, defaults to `"<s>"`):
+            The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
+
+            <Tip>
+
+            When building a sequence using special tokens, this is not the token that is used for the beginning of
+            sequence. The token used is the `cls_token`.
+
+            </Tip>
+
+        eos_token (`str`, *optional*, defaults to `"</s>"`):
+            The end of sequence token.
+
+            <Tip>
+
+            When building a sequence using special tokens, this is not the token that is used for the end of sequence.
+            The token used is the `sep_token`.
+
+            </Tip>
+
+        sep_token (`str`, *optional*, defaults to `"</s>"`):
+            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
+            sequence classification or for a text and a question for question answering. It is also used as the last
+            token of a sequence built with special tokens.
+        cls_token (`str`, *optional*, defaults to `"<s>"`):
+            The classifier token which is used when doing sequence classification (classification of the whole sequence
+            instead of per-token classification). It is the first token of the sequence when built with special tokens.
+        unk_token (`str`, *optional*, defaults to `"<unk>"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        pad_token (`str`, *optional*, defaults to `"<pad>"`):
+            The token used for padding, for example when batching sequences of different lengths.
+        mask_token (`str`, *optional*, defaults to `"<mask>"`):
+            The token used for masking values. This is the token used when training this model with masked language
+            modeling. This is the token which the model will try to predict.
+        add_prefix_space (`bool`, *optional*, defaults to `False`):
+            Whether or not to add an initial space to the input. This allows to treat the leading word just as any
+            other word. (BART tokenizer detect beginning of words by the preceding space).
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    model_input_names = ["input_ids", "attention_mask"]
+
+    # Copied from transformers.models.bart.tokenization_bart.BartTokenizer.__init__
+    def __init__(
+        self,
+        vocab_file,
+        merges_file,
+        errors="replace",
+        bos_token="<s>",
+        eos_token="</s>",
+        sep_token="</s>",
+        cls_token="<s>",
+        unk_token="<unk>",
+        pad_token="<pad>",
+        mask_token="<mask>",
+        add_prefix_space=False,
+        **kwargs,
+    ):
+        bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
+        eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
+        sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token
+        cls_token = AddedToken(cls_token, lstrip=False, rstrip=False) if isinstance(cls_token, str) else cls_token
+        unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
+        pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
+
+        # Mask token behave like a normal word, i.e. include the space before it
+        mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
+
+        with open(vocab_file, encoding="utf-8") as vocab_handle:
+            self.encoder = json.load(vocab_handle)
+        self.decoder = {v: k for k, v in self.encoder.items()}
+        self.errors = errors  # how to handle errors in decoding
+        self.byte_encoder = bytes_to_unicode()
+        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
+        with open(merges_file, encoding="utf-8") as merges_handle:
+            bpe_merges = merges_handle.read().split("\n")[1:-1]
+        bpe_merges = [tuple(merge.split()) for merge in bpe_merges]
+        self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
+        self.cache = {}
+        self.add_prefix_space = add_prefix_space
+
+        # Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
+        self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
+
+        super().__init__(
+            errors=errors,
+            bos_token=bos_token,
+            eos_token=eos_token,
+            unk_token=unk_token,
+            sep_token=sep_token,
+            cls_token=cls_token,
+            pad_token=pad_token,
+            mask_token=mask_token,
+            add_prefix_space=add_prefix_space,
+            **kwargs,
+        )
+
+    @property
+    # Copied from transformers.models.bart.tokenization_bart.BartTokenizer.vocab_size
+    def vocab_size(self):
+        return len(self.encoder)
+
+    # Copied from transformers.models.bart.tokenization_bart.BartTokenizer.get_vocab
+    def get_vocab(self):
+        return dict(self.encoder, **self.added_tokens_encoder)
+
+    # Copied from transformers.models.bart.tokenization_bart.BartTokenizer.bpe
+    def bpe(self, token):
+        if token in self.cache:
+            return self.cache[token]
+        word = tuple(token)
+        pairs = get_pairs(word)
+
+        if not pairs:
+            return token
+
+        while True:
+            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
+            if bigram not in self.bpe_ranks:
+                break
+            first, second = bigram
+            new_word = []
+            i = 0
+            while i < len(word):
+                try:
+                    j = word.index(first, i)
+                except ValueError:
+                    new_word.extend(word[i:])
+                    break
+                else:
+                    new_word.extend(word[i:j])
+                    i = j
+
+                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
+                    new_word.append(first + second)
+                    i += 2
+                else:
+                    new_word.append(word[i])
+                    i += 1
+            new_word = tuple(new_word)
+            word = new_word
+            if len(word) == 1:
+                break
+            else:
+                pairs = get_pairs(word)
+        word = " ".join(word)
+        self.cache[token] = word
+        return word
+
+    # Copied from transformers.models.bart.tokenization_bart.BartTokenizer._tokenize
+    def _tokenize(self, text):
+        """Tokenize a string."""
+        bpe_tokens = []
+        for token in re.findall(self.pat, text):
+            token = "".join(
+                self.byte_encoder[b] for b in token.encode("utf-8")
+            )  # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case)
+            bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" "))
+        return bpe_tokens
+
+    # Copied from transformers.models.bart.tokenization_bart.BartTokenizer._convert_token_to_id
+    def _convert_token_to_id(self, token):
+        """Converts a token (str) in an id using the vocab."""
+        return self.encoder.get(token, self.encoder.get(self.unk_token))
+
+    # Copied from transformers.models.bart.tokenization_bart.BartTokenizer._convert_id_to_token
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) in a token (str) using the vocab."""
+        return self.decoder.get(index)
+
+    # Copied from transformers.models.bart.tokenization_bart.BartTokenizer.convert_tokens_to_string
+    def convert_tokens_to_string(self, tokens):
+        """Converts a sequence of tokens (string) in a single string."""
+        text = "".join(tokens)
+        text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors)
+        return text
+
+    # Copied from transformers.models.bart.tokenization_bart.BartTokenizer.save_vocabulary
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
+        if not os.path.isdir(save_directory):
+            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+            return
+        vocab_file = os.path.join(
+            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+        )
+        merge_file = os.path.join(
+            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
+        )
+
+        with open(vocab_file, "w", encoding="utf-8") as f:
+            f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")
+
+        index = 0
+        with open(merge_file, "w", encoding="utf-8") as writer:
+            writer.write("#version: 0.2\n")
+            for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
+                if index != token_index:
+                    logger.warning(
+                        f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
+                        " Please check that the tokenizer is not corrupted!"
+                    )
+                    index = token_index
+                writer.write(" ".join(bpe_tokens) + "\n")
+                index += 1
+
+        return vocab_file, merge_file
+
+    # Copied from transformers.models.bart.tokenization_bart.BartTokenizer.build_inputs_with_special_tokens with BART->LED
+    def build_inputs_with_special_tokens(
+        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
+    ) -> list[int]:
+        """
+        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
+        adding special tokens. A LED sequence has the following format:
+
+        - single sequence: `<s> X </s>`
+        - pair of sequences: `<s> A </s></s> B </s>`
+
+        Args:
+            token_ids_0 (`list[int]`):
+                List of IDs to which the special tokens will be added.
+            token_ids_1 (`list[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            `list[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
+        """
+        if token_ids_1 is None:
+            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
+        cls = [self.cls_token_id]
+        sep = [self.sep_token_id]
+        return cls + token_ids_0 + sep + sep + token_ids_1 + sep
+
+    # Copied from transformers.models.bart.tokenization_bart.BartTokenizer.get_special_tokens_mask
+    def get_special_tokens_mask(
+        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False
+    ) -> list[int]:
+        """
+        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
+        special tokens using the tokenizer `prepare_for_model` method.
+
+        Args:
+            token_ids_0 (`list[int]`):
+                List of IDs.
+            token_ids_1 (`list[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
+                Whether or not the token list is already formatted with special tokens for the model.
+
+        Returns:
+            `list[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+        """
+        if already_has_special_tokens:
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+            )
+
+        if token_ids_1 is None:
+            return [1] + ([0] * len(token_ids_0)) + [1]
+        return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
+
+    # Copied from transformers.models.bart.tokenization_bart.BartTokenizer.create_token_type_ids_from_sequences with BART->LED
+    def create_token_type_ids_from_sequences(
+        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
+    ) -> list[int]:
+        """
+        Create a mask from the two sequences passed to be used in a sequence-pair classification task. LED does not
+        make use of token type ids, therefore a list of zeros is returned.
+
+        Args:
+            token_ids_0 (`list[int]`):
+                List of IDs.
+            token_ids_1 (`list[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            `list[int]`: List of zeros.
+        """
+        sep = [self.sep_token_id]
+        cls = [self.cls_token_id]
+
+        if token_ids_1 is None:
+            return len(cls + token_ids_0 + sep) * [0]
+        return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
+
+    # Copied from transformers.models.bart.tokenization_bart.BartTokenizer.prepare_for_tokenization
+    def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
+        add_prefix_space = kwargs.pop("add_prefix_space", self.add_prefix_space)
+        if (is_split_into_words or add_prefix_space) and (len(text) > 0 and not text[0].isspace()):
+            text = " " + text
+        return (text, kwargs)
+
+    def _pad(
+        self,
+        encoded_inputs: Union[dict[str, EncodedInput], BatchEncoding],
+        max_length: Optional[int] = None,
+        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
+        pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[str] = None,
+        return_attention_mask: Optional[bool] = None,
+    ) -> dict:
+        encoded_inputs = super()._pad(
+            encoded_inputs=encoded_inputs,
+            max_length=max_length,
+            padding_strategy=padding_strategy,
+            pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
+            return_attention_mask=return_attention_mask,
+        )
+
+        # Load from model defaults
+        if return_attention_mask is None:
+            return_attention_mask = "attention_mask" in self.model_input_names
+
+        if return_attention_mask and "global_attention_mask" in encoded_inputs:
+            required_input = encoded_inputs[self.model_input_names[0]]
+            # `global_attention_mask` need to have the same length as other (sequential) inputs.
+            needs_to_be_padded = len(encoded_inputs["global_attention_mask"]) != len(required_input)
+
+            if needs_to_be_padded:
+                difference = len(required_input) - len(encoded_inputs["global_attention_mask"])
+
+                if self.padding_side == "right":
+                    # Use `-1` since `0` in `global_attention_mask` means `local attention` instead of `not to attend`
+                    encoded_inputs["global_attention_mask"] = (
+                        encoded_inputs["global_attention_mask"] + [-1] * difference
+                    )
+                elif self.padding_side == "left":
+                    encoded_inputs["global_attention_mask"] = [-1] * difference + encoded_inputs[
+                        "global_attention_mask"
+                    ]
+                else:
+                    raise ValueError("Invalid padding strategy:" + str(self.padding_side))
+
+        return encoded_inputs
+
+
+__all__ = ["LEDTokenizer"]
diff --git a/phivenv/Lib/site-packages/transformers/models/led/tokenization_led_fast.py b/phivenv/Lib/site-packages/transformers/models/led/tokenization_led_fast.py
new file mode 100644
index 0000000000000000000000000000000000000000..baea10f23516fef914d70fef87bd3d17bce19baa
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/led/tokenization_led_fast.py
@@ -0,0 +1,322 @@
+# coding=utf-8
+# Copyright 2021 Iz Beltagy, Matthew E. Peters, Arman Cohan and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization classes for LED."""
+
+import json
+from typing import Optional, Union
+
+from tokenizers import processors
+
+from ...tokenization_utils_base import AddedToken, BatchEncoding, EncodedInput
+from ...tokenization_utils_fast import PreTrainedTokenizerFast
+from ...utils import PaddingStrategy, logging
+from .tokenization_led import LEDTokenizer
+
+
+logger = logging.get_logger(__name__)
+
+
+VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"}
+
+
+class LEDTokenizerFast(PreTrainedTokenizerFast):
+    r"""
+    Construct a "fast" LED tokenizer (backed by HuggingFace's *tokenizers* library), derived from the GPT-2 tokenizer,
+    using byte-level Byte-Pair-Encoding.
+
+    This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
+    be encoded differently whether it is at the beginning of the sentence (without space) or not:
+
+    ```python
+    >>> from transformers import LEDTokenizerFast
+
+    >>> tokenizer = LEDTokenizerFast.from_pretrained("allenai/led-base-16384")
+    >>> tokenizer("Hello world")["input_ids"]
+    [0, 31414, 232, 2]
+
+    >>> tokenizer(" Hello world")["input_ids"]
+    [0, 20920, 232, 2]
+    ```
+
+    You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer or when you
+    call it on some text, but since the model was not pretrained this way, it might yield a decrease in performance.
+
+    <Tip>
+
+    When used with `is_split_into_words=True`, this tokenizer needs to be instantiated with `add_prefix_space=True`.
+
+    </Tip>
+
+    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
+    refer to this superclass for more information regarding those methods.
+
+    Args:
+        vocab_file (`str`):
+            Path to the vocabulary file.
+        merges_file (`str`):
+            Path to the merges file.
+        errors (`str`, *optional*, defaults to `"replace"`):
+            Paradigm to follow when decoding bytes to UTF-8. See
+            [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
+        bos_token (`str`, *optional*, defaults to `"<s>"`):
+            The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token.
+
+            <Tip>
+
+            When building a sequence using special tokens, this is not the token that is used for the beginning of
+            sequence. The token used is the `cls_token`.
+
+            </Tip>
+
+        eos_token (`str`, *optional*, defaults to `"</s>"`):
+            The end of sequence token.
+
+            <Tip>
+
+            When building a sequence using special tokens, this is not the token that is used for the end of sequence.
+            The token used is the `sep_token`.
+
+            </Tip>
+
+        sep_token (`str`, *optional*, defaults to `"</s>"`):
+            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
+            sequence classification or for a text and a question for question answering. It is also used as the last
+            token of a sequence built with special tokens.
+        cls_token (`str`, *optional*, defaults to `"<s>"`):
+            The classifier token which is used when doing sequence classification (classification of the whole sequence
+            instead of per-token classification). It is the first token of the sequence when built with special tokens.
+        unk_token (`str`, *optional*, defaults to `"<unk>"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        pad_token (`str`, *optional*, defaults to `"<pad>"`):
+            The token used for padding, for example when batching sequences of different lengths.
+        mask_token (`str`, *optional*, defaults to `"<mask>"`):
+            The token used for masking values. This is the token used when training this model with masked language
+            modeling. This is the token which the model will try to predict.
+        add_prefix_space (`bool`, *optional*, defaults to `False`):
+            Whether or not to add an initial space to the input. This allows to treat the leading word just as any
+            other word. (The LED tokenizer detects the beginning of words by the preceding space.)
+        trim_offsets (`bool`, *optional*, defaults to `True`):
+            Whether the post processing step should trim offsets to avoid including whitespaces.
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    slow_tokenizer_class = LEDTokenizer
+    model_input_names = ["input_ids", "attention_mask"]
+
+    # Copied from transformers.models.bart.tokenization_bart_fast.BartTokenizerFast.__init__
+    def __init__(
+        self,
+        vocab_file=None,
+        merges_file=None,
+        tokenizer_file=None,
+        errors="replace",
+        bos_token="<s>",
+        eos_token="</s>",
+        sep_token="</s>",
+        cls_token="<s>",
+        unk_token="<unk>",
+        pad_token="<pad>",
+        mask_token="<mask>",
+        add_prefix_space=False,
+        trim_offsets=True,
+        **kwargs,
+    ):
+        # we have to specify that this token is special, otherwise adding it will reset the normalized flag to `False` in `add_special_tokens`
+        mask_token = (
+            AddedToken(mask_token, lstrip=True, normalized=True, special=True)
+            if isinstance(mask_token, str)
+            else mask_token
+        )
+        super().__init__(
+            vocab_file,
+            merges_file,
+            tokenizer_file=tokenizer_file,
+            errors=errors,
+            bos_token=bos_token,
+            eos_token=eos_token,
+            sep_token=sep_token,
+            cls_token=cls_token,
+            unk_token=unk_token,
+            pad_token=pad_token,
+            mask_token=mask_token,
+            add_prefix_space=add_prefix_space,
+            trim_offsets=trim_offsets,
+            **kwargs,
+        )
+
+        # the pre_tokenizer is already updated in the GPT2TokenizerFast `__init__`
+        tokenizer_component = "post_processor"
+        tokenizer_component_instance = getattr(self.backend_tokenizer, tokenizer_component, None)
+        if tokenizer_component_instance:
+            state = json.loads(tokenizer_component_instance.__getstate__())
+
+            # The lists 'sep' and 'cls' must be cast to tuples for the object `post_processor_class`
+            if "sep" in state:
+                state["sep"] = tuple(state["sep"])
+            if "cls" in state:
+                state["cls"] = tuple(state["cls"])
+
+            changes_to_apply = False
+
+            if state.get("add_prefix_space", add_prefix_space) != add_prefix_space:
+                state["add_prefix_space"] = add_prefix_space
+                changes_to_apply = True
+
+            if state.get("trim_offsets", trim_offsets) != trim_offsets:
+                state["trim_offsets"] = trim_offsets
+                changes_to_apply = True
+
+            if changes_to_apply:
+                component_class = getattr(processors, state.pop("type"))
+                new_value = component_class(**state)
+                setattr(self.backend_tokenizer, tokenizer_component, new_value)
+
+    @property
+    # Copied from transformers.models.bart.tokenization_bart_fast.BartTokenizerFast.mask_token with BART->LED
+    def mask_token(self) -> str:
+        """
+        `str`: Mask token, to use when training a model with masked-language modeling. Log an error if used while not
+        having been set.
+
+        LED tokenizer has a special mask token to be usable in the fill-mask pipeline. The mask token will greedily
+        comprise the space before the *<mask>*.
+        """
+        if self._mask_token is None:
+            if self.verbose:
+                logger.error("Using mask_token, but it is not set yet.")
+            return None
+        return str(self._mask_token)
+
+    @mask_token.setter
+    def mask_token(self, value):
+        """
+        Overriding the default behavior of the mask token to have it eat the space before it.
+
+        This is needed to preserve backward compatibility with all the previously used models based on LED.
+        """
+        # The mask token behaves like a normal word, i.e. it includes the space before it,
+        # so we set lstrip to True.
+        value = AddedToken(value, lstrip=True, rstrip=False) if isinstance(value, str) else value
+        self._mask_token = value
+
+    # Copied from transformers.models.bart.tokenization_bart_fast.BartTokenizerFast._batch_encode_plus
+    def _batch_encode_plus(self, *args, **kwargs) -> BatchEncoding:
+        is_split_into_words = kwargs.get("is_split_into_words", False)
+
+        if is_split_into_words and not self.add_prefix_space:
+            raise ValueError(
+                f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True "
+                "to use it with pretokenized inputs."
+            )
+
+        return super()._batch_encode_plus(*args, **kwargs)
+
+    # Copied from transformers.models.bart.tokenization_bart_fast.BartTokenizerFast._encode_plus
+    def _encode_plus(self, *args, **kwargs) -> BatchEncoding:
+        is_split_into_words = kwargs.get("is_split_into_words", False)
+
+        if is_split_into_words and not self.add_prefix_space:
+            raise ValueError(
+                f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True "
+                "to use it with pretokenized inputs."
+            )
+
+        return super()._encode_plus(*args, **kwargs)
+
+    # Copied from transformers.models.bart.tokenization_bart_fast.BartTokenizerFast.save_vocabulary
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
+        files = self._tokenizer.model.save(save_directory, name=filename_prefix)
+        return tuple(files)
+
+    # Copied from transformers.models.bart.tokenization_bart_fast.BartTokenizerFast.build_inputs_with_special_tokens
+    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
+        output = [self.bos_token_id] + token_ids_0 + [self.eos_token_id]
+        if token_ids_1 is None:
+            return output
+
+        return output + [self.eos_token_id] + token_ids_1 + [self.eos_token_id]
+
+    # Copied from transformers.models.bart.tokenization_bart_fast.BartTokenizerFast.create_token_type_ids_from_sequences with BART->LED
+    def create_token_type_ids_from_sequences(
+        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
+    ) -> list[int]:
+        """
+        Create a mask from the two sequences passed to be used in a sequence-pair classification task. LED does not
+        make use of token type ids, therefore a list of zeros is returned.
+
+        Args:
+            token_ids_0 (`list[int]`):
+                List of IDs.
+            token_ids_1 (`list[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            `list[int]`: List of zeros.
+        """
+        sep = [self.sep_token_id]
+        cls = [self.cls_token_id]
+
+        if token_ids_1 is None:
+            return len(cls + token_ids_0 + sep) * [0]
+        return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
+
+    # Copied from transformers.models.led.tokenization_led.LEDTokenizer._pad
+    def _pad(
+        self,
+        encoded_inputs: Union[dict[str, EncodedInput], BatchEncoding],
+        max_length: Optional[int] = None,
+        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
+        pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[str] = None,
+        return_attention_mask: Optional[bool] = None,
+    ) -> dict:
+        encoded_inputs = super()._pad(
+            encoded_inputs=encoded_inputs,
+            max_length=max_length,
+            padding_strategy=padding_strategy,
+            pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
+            return_attention_mask=return_attention_mask,
+        )
+
+        # Load from model defaults
+        if return_attention_mask is None:
+            return_attention_mask = "attention_mask" in self.model_input_names
+
+        if return_attention_mask and "global_attention_mask" in encoded_inputs:
+            required_input = encoded_inputs[self.model_input_names[0]]
+            # `global_attention_mask` needs to have the same length as other (sequential) inputs.
+            needs_to_be_padded = len(encoded_inputs["global_attention_mask"]) != len(required_input)
+
+            if needs_to_be_padded:
+                difference = len(required_input) - len(encoded_inputs["global_attention_mask"])
+
+                if self.padding_side == "right":
+                    # Use `-1` since `0` in `global_attention_mask` means `local attention` instead of `not to attend`
+                    encoded_inputs["global_attention_mask"] = (
+                        encoded_inputs["global_attention_mask"] + [-1] * difference
+                    )
+                elif self.padding_side == "left":
+                    encoded_inputs["global_attention_mask"] = [-1] * difference + encoded_inputs[
+                        "global_attention_mask"
+                    ]
+                else:
+                    raise ValueError("Invalid padding strategy:" + str(self.padding_side))
+
+        return encoded_inputs
+
+
+__all__ = ["LEDTokenizerFast"]
diff --git a/phivenv/Lib/site-packages/transformers/models/levit/__init__.py b/phivenv/Lib/site-packages/transformers/models/levit/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..d3ae097b66a7f23a97cc24eda5ae80051bcd475c
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/levit/__init__.py
@@ -0,0 +1,30 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_levit import *
+    from .feature_extraction_levit import *
+    from .image_processing_levit import *
+    from .image_processing_levit_fast import *
+    from .modeling_levit import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/phivenv/Lib/site-packages/transformers/models/levit/__pycache__/__init__.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/levit/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..97a952476c126d5e46dc2e00d9082efd24672b28
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/levit/__pycache__/__init__.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/levit/__pycache__/configuration_levit.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/levit/__pycache__/configuration_levit.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c45610f1f36f78a551fdde5f89699b848e2d313e
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/levit/__pycache__/configuration_levit.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/levit/__pycache__/feature_extraction_levit.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/levit/__pycache__/feature_extraction_levit.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1adee06bb844118d8776b6fe345acd7baa3b21c3
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/levit/__pycache__/feature_extraction_levit.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/levit/__pycache__/image_processing_levit.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/levit/__pycache__/image_processing_levit.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..01b038f9d7acfd8aced8b8aa6911a05461481095
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/levit/__pycache__/image_processing_levit.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/levit/__pycache__/image_processing_levit_fast.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/levit/__pycache__/image_processing_levit_fast.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..789323c684613cada90fc155bb994a03ecc2069f
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/levit/__pycache__/image_processing_levit_fast.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/levit/__pycache__/modeling_levit.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/levit/__pycache__/modeling_levit.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f918c3a5004218d92d7a80dc57029f729d239115
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/levit/__pycache__/modeling_levit.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/levit/configuration_levit.py b/phivenv/Lib/site-packages/transformers/models/levit/configuration_levit.py
new file mode 100644
index 0000000000000000000000000000000000000000..b0d63ed8e37177e0779fe3d22b027a69f1f80838
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/levit/configuration_levit.py
@@ -0,0 +1,144 @@
+# coding=utf-8
+# Copyright 2022 Meta Platforms, Inc. and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""LeViT model configuration"""
+
+from collections import OrderedDict
+from collections.abc import Mapping
+
+from packaging import version
+
+from ...configuration_utils import PretrainedConfig
+from ...onnx import OnnxConfig
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class LevitConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`LevitModel`]. It is used to instantiate a LeViT
+    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+    defaults will yield a similar configuration to that of the LeViT
+    [facebook/levit-128S](https://huggingface.co/facebook/levit-128S) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        image_size (`int`, *optional*, defaults to 224):
+            The size of the input image.
+        num_channels (`int`, *optional*, defaults to 3):
+            Number of channels in the input image.
+        kernel_size (`int`, *optional*, defaults to 3):
+            The kernel size for the initial convolution layers of patch embedding.
+        stride (`int`, *optional*, defaults to 2):
+            The stride size for the initial convolution layers of patch embedding.
+        padding (`int`, *optional*, defaults to 1):
+            The padding size for the initial convolution layers of patch embedding.
+        patch_size (`int`, *optional*, defaults to 16):
+            The patch size for embeddings.
+        hidden_sizes (`list[int]`, *optional*, defaults to `[128, 256, 384]`):
+            Dimension of each of the encoder blocks.
+        num_attention_heads (`list[int]`, *optional*, defaults to `[4, 8, 12]`):
+            Number of attention heads for each attention layer in each block of the Transformer encoder.
+        depths (`list[int]`, *optional*, defaults to `[4, 4, 4]`):
+            The number of layers in each encoder block.
+        key_dim (`list[int]`, *optional*, defaults to `[16, 16, 16]`):
+            The size of key in each of the encoder blocks.
+        drop_path_rate (`int`, *optional*, defaults to 0):
+            The dropout probability for stochastic depths, used in the blocks of the Transformer encoder.
+        mlp_ratio (`list[int]`, *optional*, defaults to `[2, 2, 2]`):
+            Ratio of the size of the hidden layer compared to the size of the input layer of the Mix FFNs in the
+            encoder blocks.
+        attention_ratio (`list[int]`, *optional*, defaults to `[2, 2, 2]`):
+            Ratio of the size of the output dimension compared to input dimension of attention layers.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+
+    Example:
+
+    ```python
+    >>> from transformers import LevitConfig, LevitModel
+
+    >>> # Initializing a LeViT levit-128S style configuration
+    >>> configuration = LevitConfig()
+
+    >>> # Initializing a model (with random weights) from the levit-128S style configuration
+    >>> model = LevitModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "levit"
+
+    def __init__(
+        self,
+        image_size=224,
+        num_channels=3,
+        kernel_size=3,
+        stride=2,
+        padding=1,
+        patch_size=16,
+        hidden_sizes=[128, 256, 384],
+        num_attention_heads=[4, 8, 12],
+        depths=[4, 4, 4],
+        key_dim=[16, 16, 16],
+        drop_path_rate=0,
+        mlp_ratio=[2, 2, 2],
+        attention_ratio=[2, 2, 2],
+        initializer_range=0.02,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.image_size = image_size
+        self.num_channels = num_channels
+        self.kernel_size = kernel_size
+        self.stride = stride
+        self.padding = padding
+        self.hidden_sizes = hidden_sizes
+        self.num_attention_heads = num_attention_heads
+        self.depths = depths
+        self.key_dim = key_dim
+        self.drop_path_rate = drop_path_rate
+        self.patch_size = patch_size
+        self.attention_ratio = attention_ratio
+        self.mlp_ratio = mlp_ratio
+        self.initializer_range = initializer_range
+        self.down_ops = [
+            ["Subsample", key_dim[0], hidden_sizes[0] // key_dim[0], 4, 2, 2],
+            ["Subsample", key_dim[0], hidden_sizes[1] // key_dim[0], 4, 2, 2],
+        ]
+
+
+# Copied from transformers.models.vit.configuration_vit.ViTOnnxConfig
+class LevitOnnxConfig(OnnxConfig):
+    torch_onnx_minimum_version = version.parse("1.11")
+
+    @property
+    def inputs(self) -> Mapping[str, Mapping[int, str]]:
+        return OrderedDict(
+            [
+                ("pixel_values", {0: "batch", 1: "num_channels", 2: "height", 3: "width"}),
+            ]
+        )
+
+    @property
+    def atol_for_validation(self) -> float:
+        return 1e-4
+
+
+__all__ = ["LevitConfig", "LevitOnnxConfig"]
diff --git a/phivenv/Lib/site-packages/transformers/models/levit/feature_extraction_levit.py b/phivenv/Lib/site-packages/transformers/models/levit/feature_extraction_levit.py
new file mode 100644
index 0000000000000000000000000000000000000000..d634239b24500fd279d41101188937548df4bfa2
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/levit/feature_extraction_levit.py
@@ -0,0 +1,38 @@
+# coding=utf-8
+# Copyright 2022 Meta Platforms, Inc. and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Feature extractor class for LeViT."""
+
+import warnings
+
+from ...utils import logging
+from ...utils.import_utils import requires
+from .image_processing_levit import LevitImageProcessor
+
+
+logger = logging.get_logger(__name__)
+
+
+@requires(backends=("vision",))
+class LevitFeatureExtractor(LevitImageProcessor):
+    def __init__(self, *args, **kwargs) -> None:
+        warnings.warn(
+            "The class LevitFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please"
+            " use LevitImageProcessor instead.",
+            FutureWarning,
+        )
+        super().__init__(*args, **kwargs)
+
+
+__all__ = ["LevitFeatureExtractor"]
diff --git a/phivenv/Lib/site-packages/transformers/models/levit/image_processing_levit.py b/phivenv/Lib/site-packages/transformers/models/levit/image_processing_levit.py
new file mode 100644
index 0000000000000000000000000000000000000000..7832dbe5f43b4985017337ebed3b85c0fc3df5c2
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/levit/image_processing_levit.py
@@ -0,0 +1,312 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Image processor class for LeViT."""
+
+from collections.abc import Iterable
+from typing import Optional, Union
+
+import numpy as np
+
+from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
+from ...image_transforms import (
+    get_resize_output_image_size,
+    resize,
+    to_channel_dimension_format,
+)
+from ...image_utils import (
+    IMAGENET_DEFAULT_MEAN,
+    IMAGENET_DEFAULT_STD,
+    ChannelDimension,
+    ImageInput,
+    PILImageResampling,
+    infer_channel_dimension_format,
+    is_scaled_image,
+    make_list_of_images,
+    to_numpy_array,
+    valid_images,
+    validate_preprocess_arguments,
+)
+from ...utils import TensorType, filter_out_non_signature_kwargs, logging
+from ...utils.import_utils import requires
+
+
+logger = logging.get_logger(__name__)
+
+
+@requires(backends=("vision",))
+class LevitImageProcessor(BaseImageProcessor):
+    r"""
+    Constructs a LeViT image processor.
+
+    Args:
+        do_resize (`bool`, *optional*, defaults to `True`):
+            Whether to resize the shortest edge of the input to int(256/224 *`size`). Can be overridden by the
+            `do_resize` parameter in the `preprocess` method.
+        size (`dict[str, int]`, *optional*, defaults to `{"shortest_edge": 224}`):
+            Size of the output image after resizing. If size is a dict with keys "width" and "height", the image will
+            be resized to `(size["height"], size["width"])`. If size is a dict with key "shortest_edge", the shortest
+            edge value `c` is rescaled to `int(c * (256/224))`. The smaller edge of the image will be matched to this
+            value i.e, if height > width, then image will be rescaled to `(size["shortest_edge"] * height / width,
+            size["shortest_edge"])`. Can be overridden by the `size` parameter in the `preprocess` method.
+        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
+            Resampling filter to use if resizing the image. Can be overridden by the `resample` parameter in the
+            `preprocess` method.
+        do_center_crop (`bool`, *optional*, defaults to `True`):
+            Whether or not to center crop the input to `(crop_size["height"], crop_size["width"])`. Can be overridden
+            by the `do_center_crop` parameter in the `preprocess` method.
+        crop_size (`Dict`, *optional*, defaults to `{"height": 224, "width": 224}`):
+            Desired image size after `center_crop`. Can be overridden by the `crop_size` parameter in the `preprocess`
+            method.
+        do_rescale (`bool`, *optional*, defaults to `True`):
+            Controls whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the
+            `do_rescale` parameter in the `preprocess` method.
+        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
+            Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the
+            `preprocess` method.
+        do_normalize (`bool`, *optional*, defaults to `True`):
+            Controls whether to normalize the image. Can be overridden by the `do_normalize` parameter in the
+            `preprocess` method.
+        image_mean (`float` or `list[float]`, *optional*, defaults to `[0.485, 0.456, 0.406]`):
+            Mean to use if normalizing the image. This is a float or list of floats the length of the number of
+            channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
+        image_std (`float` or `list[float]`, *optional*, defaults to `[0.229, 0.224, 0.225]`):
+            Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
+            number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
+    """
+
+    model_input_names = ["pixel_values"]
+
+    def __init__(
+        self,
+        do_resize: bool = True,
+        size: Optional[dict[str, int]] = None,
+        resample: PILImageResampling = PILImageResampling.BICUBIC,
+        do_center_crop: bool = True,
+        crop_size: Optional[dict[str, int]] = None,
+        do_rescale: bool = True,
+        rescale_factor: Union[int, float] = 1 / 255,
+        do_normalize: bool = True,
+        image_mean: Optional[Union[float, Iterable[float]]] = IMAGENET_DEFAULT_MEAN,
+        image_std: Optional[Union[float, Iterable[float]]] = IMAGENET_DEFAULT_STD,
+        **kwargs,
+    ) -> None:
+        super().__init__(**kwargs)
+        size = size if size is not None else {"shortest_edge": 224}
+        size = get_size_dict(size, default_to_square=False)
+        crop_size = crop_size if crop_size is not None else {"height": 224, "width": 224}
+        crop_size = get_size_dict(crop_size, param_name="crop_size")
+
+        self.do_resize = do_resize
+        self.size = size
+        self.resample = resample
+        self.do_center_crop = do_center_crop
+        self.crop_size = crop_size
+        self.do_rescale = do_rescale
+        self.rescale_factor = rescale_factor
+        self.do_normalize = do_normalize
+        self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN
+        self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD
+
+    def resize(
+        self,
+        image: np.ndarray,
+        size: dict[str, int],
+        resample: PILImageResampling = PILImageResampling.BICUBIC,
+        data_format: Optional[Union[str, ChannelDimension]] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+        **kwargs,
+    ) -> np.ndarray:
+        """
+        Resize an image.
+
+        If size is a dict with keys "width" and "height", the image will be resized to `(size["height"],
+        size["width"])`.
+
+        If size is a dict with key "shortest_edge", the shortest edge value `c` is rescaled to `int(c * (256/224))`.
+        The smaller edge of the image will be matched to this value i.e, if height > width, then image will be rescaled
+        to `(size["shortest_edge"] * height / width, size["shortest_edge"])`.
+
+        Args:
+            image (`np.ndarray`):
+                Image to resize.
+            size (`dict[str, int]`):
+                Size of the output image after resizing. If size is a dict with keys "width" and "height", the image
+                will be resized to (height, width). If size is a dict with key "shortest_edge", the shortest edge value
+                `c` is rescaled to int(`c` * (256/224)). The smaller edge of the image will be matched to this value
+                i.e, if height > width, then image will be rescaled to (size * height / width, size).
+            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
+                Resampling filter to use when resizing the image.
+            data_format (`str` or `ChannelDimension`, *optional*):
+                The channel dimension format of the image. If not provided, it will be the same as the input image.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format of the input image. If not provided, it will be inferred.
+        """
+        size_dict = get_size_dict(size, default_to_square=False)
+        # Inspect the normalized size_dict (not the raw `size`) so every accepted `size` form takes the same path.
+        if "shortest_edge" in size_dict:
+            shortest_edge = int((256 / 224) * size_dict["shortest_edge"])
+            output_size = get_resize_output_image_size(
+                image, size=shortest_edge, default_to_square=False, input_data_format=input_data_format
+            )
+            size_dict = {"height": output_size[0], "width": output_size[1]}
+        if "height" not in size_dict or "width" not in size_dict:
+            raise ValueError(
+                f"Size dict must have keys 'height' and 'width' or 'shortest_edge'. Got {size_dict.keys()}"
+            )
+        return resize(
+            image,
+            size=(size_dict["height"], size_dict["width"]),
+            resample=resample,
+            data_format=data_format,
+            input_data_format=input_data_format,
+            **kwargs,
+        )
+
+    @filter_out_non_signature_kwargs()
+    def preprocess(
+        self,
+        images: ImageInput,
+        do_resize: Optional[bool] = None,
+        size: Optional[dict[str, int]] = None,
+        resample: Optional[PILImageResampling] = None,
+        do_center_crop: Optional[bool] = None,
+        crop_size: Optional[dict[str, int]] = None,
+        do_rescale: Optional[bool] = None,
+        rescale_factor: Optional[float] = None,
+        do_normalize: Optional[bool] = None,
+        image_mean: Optional[Union[float, Iterable[float]]] = None,
+        image_std: Optional[Union[float, Iterable[float]]] = None,
+        return_tensors: Optional[TensorType] = None,
+        data_format: ChannelDimension = ChannelDimension.FIRST,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    ) -> BatchFeature:
+        """
+        Preprocess an image or batch of images to be used as input to a LeViT model.
+
+        Args:
+            images (`ImageInput`):
+                Image or batch of images to preprocess. Expects a single or batch of images with pixel values ranging
+                from 0 to 255. If passing in images with pixel values between 0 and 1, set `do_rescale=False`.
+            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
+                Whether to resize the image.
+            size (`dict[str, int]`, *optional*, defaults to `self.size`):
+                Size of the output image after resizing. If size is a dict with keys "width" and "height", the image
+                will be resized to (height, width). If size is a dict with key "shortest_edge", the shortest edge value
+                `c` is rescaled to int(`c` * (256/224)). The smaller edge of the image will be matched to this value
+                i.e, if height > width, then image will be rescaled to (size * height / width, size).
+            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
+                Resampling filter to use when resizing the image.
+            do_center_crop (`bool`, *optional*, defaults to `self.do_center_crop`):
+                Whether to center crop the image.
+            crop_size (`dict[str, int]`, *optional*, defaults to `self.crop_size`):
+                Size of the output image after center cropping. Crops images to (crop_size["height"],
+                crop_size["width"]).
+            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
+                Whether to rescale the image pixel values by `rescale_factor` - typical to values between 0 and 1.
+            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
+                Factor to rescale the image pixel values by.
+            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
+                Whether to normalize the image pixel values by `image_mean` and `image_std`.
+            image_mean (`float` or `list[float]`, *optional*, defaults to `self.image_mean`):
+                Mean to normalize the image pixel values by.
+            image_std (`float` or `list[float]`, *optional*, defaults to `self.image_std`):
+                Standard deviation to normalize the image pixel values by.
+            return_tensors (`str` or `TensorType`, *optional*):
+                The type of tensors to return. Can be one of:
+                    - Unset: Return a list of `np.ndarray`.
+                    - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
+                    - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
+                    - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
+                    - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
+            data_format (`str` or `ChannelDimension`, *optional*, defaults to `ChannelDimension.FIRST`):
+                The channel dimension format for the output image. If unset, the channel dimension format of the input
+                image is used. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the input image. If unset, the channel dimension format is inferred
+                from the input image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+        """
+        do_resize = do_resize if do_resize is not None else self.do_resize
+        resample = resample if resample is not None else self.resample
+        do_center_crop = do_center_crop if do_center_crop is not None else self.do_center_crop
+        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
+        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
+        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
+        image_mean = image_mean if image_mean is not None else self.image_mean
+        image_std = image_std if image_std is not None else self.image_std
+
+        size = size if size is not None else self.size
+        size = get_size_dict(size, default_to_square=False)
+        crop_size = crop_size if crop_size is not None else self.crop_size
+        crop_size = get_size_dict(crop_size, param_name="crop_size")
+        images = make_list_of_images(images)
+
+        if not valid_images(images):
+            raise ValueError(
+                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
+                "torch.Tensor, tf.Tensor or jax.ndarray."
+            )
+        validate_preprocess_arguments(
+            do_rescale=do_rescale,
+            rescale_factor=rescale_factor,
+            do_normalize=do_normalize,
+            image_mean=image_mean,
+            image_std=image_std,
+            do_center_crop=do_center_crop,
+            crop_size=crop_size,
+            do_resize=do_resize,
+            size=size,
+            resample=resample,
+        )
+        # All transformations expect numpy arrays.
+        images = [to_numpy_array(image) for image in images]
+
+        if do_rescale and is_scaled_image(images[0]):
+            logger.warning_once(
+                "It looks like you are trying to rescale already rescaled images. If the input"
+                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
+            )
+
+        if input_data_format is None:
+            # We assume that all images have the same channel dimension format.
+            input_data_format = infer_channel_dimension_format(images[0])
+
+        if do_resize:
+            images = [self.resize(image, size, resample, input_data_format=input_data_format) for image in images]
+
+        if do_center_crop:
+            images = [self.center_crop(image, crop_size, input_data_format=input_data_format) for image in images]
+
+        if do_rescale:
+            images = [self.rescale(image, rescale_factor, input_data_format=input_data_format) for image in images]
+
+        if do_normalize:
+            images = [
+                self.normalize(image, image_mean, image_std, input_data_format=input_data_format) for image in images
+            ]
+
+        images = [
+            to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images
+        ]
+
+        data = {"pixel_values": images}
+        return BatchFeature(data=data, tensor_type=return_tensors)
+
+
+__all__ = ["LevitImageProcessor"]
diff --git a/phivenv/Lib/site-packages/transformers/models/levit/image_processing_levit_fast.py b/phivenv/Lib/site-packages/transformers/models/levit/image_processing_levit_fast.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d8774ade8f3c22cf40960bdf2116d2e45cfccf7
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/levit/image_processing_levit_fast.py
@@ -0,0 +1,98 @@
+# coding=utf-8
+# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Fast Image processor class for LeViT."""
+
+from ...image_processing_utils_fast import BaseImageProcessorFast, SizeDict
+from ...image_transforms import (
+    ChannelDimension,
+    get_resize_output_image_size,
+)
+from ...image_utils import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, PILImageResampling
+from ...utils import auto_docstring, is_torch_available, is_torchvision_available, is_torchvision_v2_available
+
+
+if is_torch_available():
+    import torch
+
+if is_torchvision_available():
+    if is_torchvision_v2_available():
+        from torchvision.transforms.v2 import functional as F
+    else:
+        from torchvision.transforms import functional as F
+
+
+@auto_docstring
+class LevitImageProcessorFast(BaseImageProcessorFast):
+    resample = PILImageResampling.BICUBIC
+    image_mean = IMAGENET_DEFAULT_MEAN
+    image_std = IMAGENET_DEFAULT_STD
+    size = {"shortest_edge": 224}
+    default_to_square = False
+    crop_size = {"height": 224, "width": 224}
+    do_resize = True
+    do_center_crop = True
+    do_rescale = True
+    do_normalize = True
+    do_convert_rgb = None
+
+    def resize(
+        self,
+        image: "torch.Tensor",
+        size: SizeDict,
+        interpolation: "F.InterpolationMode" = None,
+        **kwargs,
+    ) -> "torch.Tensor":
+        """
+        Resize an image.
+
+        If size is a dict with keys "width" and "height", the image will be resized to `(size["height"],
+        size["width"])`.
+
+        If size is a dict with key "shortest_edge", the shortest edge value `c` is rescaled to `int(c * (256/224))`.
+        The smaller edge of the image will be matched to this value i.e, if height > width, then image will be rescaled
+        to `(size["shortest_edge"] * height / width, size["shortest_edge"])`.
+
+        Args:
+            image (`torch.Tensor`):
+                Image to resize.
+            size (`SizeDict`):
+                Size of the output image after resizing. If size is a dict with keys "width" and "height", the image
+                will be resized to (height, width). If size is a dict with key "shortest_edge", the shortest edge value
+                `c` is rescaled to int(`c` * (256/224)). The smaller edge of the image will be matched to this value
+                i.e, if height > width, then image will be rescaled to (size * height / width, size).
+            interpolation (`InterpolationMode`, *optional*, defaults to `InterpolationMode.BICUBIC`):
+                Resampling filter to use when resizing the image.
+        """
+        interpolation = interpolation if interpolation is not None else F.InterpolationMode.BICUBIC
+        if size.shortest_edge:
+            shortest_edge = int((256 / 224) * size.shortest_edge)
+            new_size = get_resize_output_image_size(
+                image, size=shortest_edge, default_to_square=False, input_data_format=ChannelDimension.FIRST
+            )
+        elif size.height and size.width:
+            new_size = (size.height, size.width)
+        else:
+            raise ValueError(
+                f"Size dict must have keys 'height' and 'width' or 'shortest_edge'. Got {size}."
+            )
+        return F.resize(
+            image,
+            size=new_size,
+            interpolation=interpolation,
+            **kwargs,
+        )
+
+
+__all__ = ["LevitImageProcessorFast"]
diff --git a/phivenv/Lib/site-packages/transformers/models/levit/modeling_levit.py b/phivenv/Lib/site-packages/transformers/models/levit/modeling_levit.py
new file mode 100644
index 0000000000000000000000000000000000000000..fc275a1c4c40418641fc041416223f630fa8188c
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/levit/modeling_levit.py
@@ -0,0 +1,677 @@
+# coding=utf-8
+# Copyright 2022 Meta Platforms, Inc. and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch LeViT model."""
+
+import itertools
+from dataclasses import dataclass
+from typing import Optional, Union
+
+import torch
+import torch.utils.checkpoint
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+
+from ...modeling_outputs import (
+    BaseModelOutputWithNoAttention,
+    BaseModelOutputWithPoolingAndNoAttention,
+    ImageClassifierOutputWithNoAttention,
+    ModelOutput,
+)
+from ...modeling_utils import PreTrainedModel
+from ...utils import auto_docstring, logging
+from .configuration_levit import LevitConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Output type of [`LevitForImageClassificationWithTeacher`].
+    """
+)
+class LevitForImageClassificationWithTeacherOutput(ModelOutput):
+    r"""
+    logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
+        Prediction scores as the average of the `cls_logits` and `distillation_logits`.
+    cls_logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
+        Prediction scores of the classification head (i.e. the linear layer on top of the final hidden state of the
+        class token).
+    distillation_logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
+        Prediction scores of the distillation head (i.e. the linear layer on top of the final hidden state of the
+        distillation token).
+    """
+
+    logits: Optional[torch.FloatTensor] = None  # every field defaults to None, so each one is optional
+    cls_logits: Optional[torch.FloatTensor] = None
+    distillation_logits: Optional[torch.FloatTensor] = None
+    hidden_states: Optional[tuple[torch.FloatTensor]] = None  # NOTE(review): not described in the docstring above
+
+
+class LevitConvEmbeddings(nn.Module):
+    """
+    LeViT Conv Embeddings with Batch Norm, used in the initial patch embedding layer.
+    """
+
+    def __init__(
+        self, in_channels, out_channels, kernel_size, stride, padding, dilation=1, groups=1, bn_weight_init=1
+    ):  # NOTE(review): `bn_weight_init` is accepted but never read in this class
+        super().__init__()
+        self.convolution = nn.Conv2d(  # bias-free convolution; BatchNorm2d is applied right after it in forward
+            in_channels, out_channels, kernel_size, stride, padding, dilation=dilation, groups=groups, bias=False
+        )
+        self.batch_norm = nn.BatchNorm2d(out_channels)
+
+    def forward(self, embeddings):
+        embeddings = self.convolution(embeddings)  # convolution first ...
+        embeddings = self.batch_norm(embeddings)  # ... then batch norm; no activation is applied here
+        return embeddings
+
+
+class LevitPatchEmbeddings(nn.Module):
+    """
+    LeViT patch embeddings, for final embeddings to be passed to transformer blocks. It consists of multiple
+    `LevitConvEmbeddings`.
+    """
+
+    def __init__(self, config):  # reads num_channels, hidden_sizes[0], kernel_size, stride and padding from config
+        super().__init__()
+        self.embedding_layer_1 = LevitConvEmbeddings(  # num_channels -> hidden_sizes[0] // 8
+            config.num_channels, config.hidden_sizes[0] // 8, config.kernel_size, config.stride, config.padding
+        )
+        self.activation_layer_1 = nn.Hardswish()
+
+        self.embedding_layer_2 = LevitConvEmbeddings(  # hidden_sizes[0] // 8 -> hidden_sizes[0] // 4
+            config.hidden_sizes[0] // 8, config.hidden_sizes[0] // 4, config.kernel_size, config.stride, config.padding
+        )
+        self.activation_layer_2 = nn.Hardswish()
+
+        self.embedding_layer_3 = LevitConvEmbeddings(  # hidden_sizes[0] // 4 -> hidden_sizes[0] // 2
+            config.hidden_sizes[0] // 4, config.hidden_sizes[0] // 2, config.kernel_size, config.stride, config.padding
+        )
+        self.activation_layer_3 = nn.Hardswish()
+
+        self.embedding_layer_4 = LevitConvEmbeddings(  # hidden_sizes[0] // 2 -> hidden_sizes[0]; no activation after
+            config.hidden_sizes[0] // 2, config.hidden_sizes[0], config.kernel_size, config.stride, config.padding
+        )
+        self.num_channels = config.num_channels  # kept to validate the input channel count in forward
+
+    def forward(self, pixel_values):
+        num_channels = pixel_values.shape[1]  # dim 1 is treated as the channel dimension
+        if num_channels != self.num_channels:
+            raise ValueError(
+                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
+            )
+        embeddings = self.embedding_layer_1(pixel_values)
+        embeddings = self.activation_layer_1(embeddings)
+        embeddings = self.embedding_layer_2(embeddings)
+        embeddings = self.activation_layer_2(embeddings)
+        embeddings = self.embedding_layer_3(embeddings)
+        embeddings = self.activation_layer_3(embeddings)
+        embeddings = self.embedding_layer_4(embeddings)
+        return embeddings.flatten(2).transpose(1, 2)  # merge dims 2+ into one axis, then swap it with dim 1
+
+
+class MLPLayerWithBN(nn.Module):  # bias-free linear layer followed by BatchNorm1d
+    def __init__(self, input_dim, output_dim, bn_weight_init=1):  # NOTE(review): `bn_weight_init` is never read here
+        super().__init__()
+        self.linear = nn.Linear(in_features=input_dim, out_features=output_dim, bias=False)
+        self.batch_norm = nn.BatchNorm1d(output_dim)
+
+    def forward(self, hidden_state):
+        hidden_state = self.linear(hidden_state)
+        hidden_state = self.batch_norm(hidden_state.flatten(0, 1)).reshape_as(hidden_state)  # norm on merged dims 0-1
+        return hidden_state
+
+
+class LevitSubsample(nn.Module):  # keeps every `stride`-th position of a square grid of tokens
+    def __init__(self, stride, resolution):
+        super().__init__()
+        self.stride = stride  # step used when slicing both grid axes in forward
+        self.resolution = resolution  # side length of the grid that forward reshapes the sequence into
+
+    def forward(self, hidden_state):
+        batch_size, _, channels = hidden_state.shape  # the sequence dim must hold resolution * resolution entries
+        hidden_state = hidden_state.view(batch_size, self.resolution, self.resolution, channels)[
+            :, :: self.stride, :: self.stride
+        ].reshape(batch_size, -1, channels)  # flatten the subsampled grid back into a sequence
+        return hidden_state
+
+
+class LevitAttention(nn.Module):  # multi-head self-attention with a learned bias per (|dx|, |dy|) grid offset
+    def __init__(self, hidden_sizes, key_dim, num_attention_heads, attention_ratio, resolution):
+        super().__init__()
+        self.num_attention_heads = num_attention_heads
+        self.scale = key_dim**-0.5
+        self.key_dim = key_dim
+        self.attention_ratio = attention_ratio
+        self.out_dim_keys_values = attention_ratio * key_dim * num_attention_heads + key_dim * num_attention_heads * 2
+        self.out_dim_projection = attention_ratio * key_dim * num_attention_heads
+
+        self.queries_keys_values = MLPLayerWithBN(hidden_sizes, self.out_dim_keys_values)
+        self.activation = nn.Hardswish()
+        self.projection = MLPLayerWithBN(self.out_dim_projection, hidden_sizes, bn_weight_init=0)
+
+        points = list(itertools.product(range(resolution), range(resolution)))
+        len_points = len(points)
+        attention_offsets, indices = {}, []
+        for p1 in points:  # assign one bias slot per distinct absolute offset between two grid points
+            for p2 in points:
+                offset = (abs(p1[0] - p2[0]), abs(p1[1] - p2[1]))
+                if offset not in attention_offsets:
+                    attention_offsets[offset] = len(attention_offsets)
+                indices.append(attention_offsets[offset])
+
+        self.attention_bias_cache = {}  # eval-mode memo of gathered biases, keyed by str(device)
+        self.attention_biases = torch.nn.Parameter(torch.zeros(num_attention_heads, len(attention_offsets)))
+        self.register_buffer(
+            "attention_bias_idxs", torch.LongTensor(indices).view(len_points, len_points), persistent=False
+        )
+
+    @torch.no_grad()
+    def train(self, mode=True):  # returns self, matching the nn.Module.train contract (enables call chaining)
+        if mode and self.attention_bias_cache:
+            self.attention_bias_cache = {}  # clear ab cache
+        return super().train(mode)
+
+    def get_attention_biases(self, device):  # gathers per-pair biases; memoized per device only in eval mode
+        if self.training:
+            return self.attention_biases[:, self.attention_bias_idxs]
+        else:
+            device_key = str(device)
+            if device_key not in self.attention_bias_cache:
+                self.attention_bias_cache[device_key] = self.attention_biases[:, self.attention_bias_idxs]
+            return self.attention_bias_cache[device_key]
+
+    def forward(self, hidden_state):  # presumably seq_length == resolution**2 so the bias matches — TODO confirm
+        batch_size, seq_length, _ = hidden_state.shape
+        queries_keys_values = self.queries_keys_values(hidden_state)
+        query, key, value = queries_keys_values.view(batch_size, seq_length, self.num_attention_heads, -1).split(
+            [self.key_dim, self.key_dim, self.attention_ratio * self.key_dim], dim=3
+        )
+        query = query.permute(0, 2, 1, 3)
+        key = key.permute(0, 2, 1, 3)
+        value = value.permute(0, 2, 1, 3)
+
+        attention = query @ key.transpose(-2, -1) * self.scale + self.get_attention_biases(hidden_state.device)
+        attention = attention.softmax(dim=-1)
+        hidden_state = (attention @ value).transpose(1, 2).reshape(batch_size, seq_length, self.out_dim_projection)
+        hidden_state = self.projection(self.activation(hidden_state))
+        return hidden_state
+
+
+class LevitAttentionSubsample(nn.Module):
+    """
+    Attention layer that reduces the number of tokens: queries are produced for `resolution_out**2` positions,
+    keys and values for the whole input sequence, and every head adds a bias looked up by query/key grid offset.
+    """
+
+    def __init__(
+        self,
+        input_dim,
+        output_dim,
+        key_dim,
+        num_attention_heads,
+        attention_ratio,
+        stride,
+        resolution_in,
+        resolution_out,
+    ):
+        super().__init__()
+        self.num_attention_heads = num_attention_heads
+        self.scale = key_dim**-0.5
+        self.key_dim = key_dim
+        self.attention_ratio = attention_ratio
+        self.out_dim_keys_values = attention_ratio * key_dim * num_attention_heads + key_dim * num_attention_heads
+        self.out_dim_projection = attention_ratio * key_dim * num_attention_heads
+        self.resolution_out = resolution_out
+        # resolution_in is the side of the input token grid, resolution_out the side of the grid after downsampling
+        self.keys_values = MLPLayerWithBN(input_dim, self.out_dim_keys_values)
+        self.queries_subsample = LevitSubsample(stride, resolution_in)
+        self.queries = MLPLayerWithBN(input_dim, key_dim * num_attention_heads)
+        self.activation = nn.Hardswish()
+        self.projection = MLPLayerWithBN(self.out_dim_projection, output_dim)
+
+        self.attention_bias_cache = {}  # eval-mode cache of gathered biases, keyed by str(device)
+
+        points = list(itertools.product(range(resolution_in), range(resolution_in)))
+        points_ = list(itertools.product(range(resolution_out), range(resolution_out)))
+        len_points, len_points_ = len(points), len(points_)
+        attention_offsets, indices = {}, []
+        for p1 in points_:
+            for p2 in points:
+                size = 1
+                offset = (abs(p1[0] * stride - p2[0] + (size - 1) / 2), abs(p1[1] * stride - p2[1] + (size - 1) / 2))
+                if offset not in attention_offsets:
+                    attention_offsets[offset] = len(attention_offsets)  # first occurrence gets the next free id
+                indices.append(attention_offsets[offset])
+
+        self.attention_biases = torch.nn.Parameter(torch.zeros(num_attention_heads, len(attention_offsets)))
+        bias_idxs = torch.LongTensor(indices).view(len_points_, len_points)  # (query, key) pair -> offset id
+        self.register_buffer("attention_bias_idxs", bias_idxs, persistent=False)
+
+    @torch.no_grad()
+    def train(self, mode=True):
+        if mode and self.attention_bias_cache:
+            self.attention_bias_cache = {}  # biases cached during eval are stale once training resumes
+        return super().train(mode)  # hand back `self` like `nn.Module.train`, so `.train()` / `.eval()` chain
+
+    def get_attention_biases(self, device):
+        if self.training:
+            return self.attention_biases[:, self.attention_bias_idxs]  # always re-gather while parameters may change
+        else:
+            device_key = str(device)
+            if device_key not in self.attention_bias_cache:
+                self.attention_bias_cache[device_key] = self.attention_biases[:, self.attention_bias_idxs]
+            return self.attention_bias_cache[device_key]  # reused until train(True) empties the cache
+
+    def forward(self, hidden_state):
+        batch_size, seq_length, _ = hidden_state.shape  # input must be 3-D
+        kv = self.keys_values(hidden_state).view(batch_size, seq_length, self.num_attention_heads, -1)
+        key, value = kv.split([self.key_dim, self.attention_ratio * self.key_dim], dim=3)
+        key = key.permute(0, 2, 1, 3)  # (batch, seq, heads, dim) -> (batch, heads, seq, dim)
+        value = value.permute(0, 2, 1, 3)
+
+        query = self.queries(self.queries_subsample(hidden_state))  # queries only for the subsampled positions
+        query = query.view(batch_size, self.resolution_out**2, self.num_attention_heads, self.key_dim)
+        query = query.permute(0, 2, 1, 3)
+
+        attention = query @ key.transpose(-2, -1) * self.scale + self.get_attention_biases(hidden_state.device)
+        attention = attention.softmax(dim=-1)  # normalise (q @ k^T) * scale + bias over the key axis
+        hidden_state = (attention @ value).transpose(1, 2).reshape(batch_size, -1, self.out_dim_projection)
+        hidden_state = self.projection(self.activation(hidden_state))  # activation first, then output projection
+        return hidden_state
+
+
+class LevitMLPLayer(nn.Module):
+    """
+    MLP Layer (`linear_up` -> Hardswish -> `linear_down`) with `2X` expansion in contrast to ViT with `4X`.
+    """
+
+    def __init__(self, input_dim, hidden_dim):
+        super().__init__()
+        self.linear_up = MLPLayerWithBN(input_dim, hidden_dim)  # built for input_dim -> hidden_dim
+        self.activation = nn.Hardswish()
+        self.linear_down = MLPLayerWithBN(hidden_dim, input_dim)  # built for hidden_dim -> input_dim
+
+    def forward(self, hidden_state):
+        hidden_state = self.linear_up(hidden_state)
+        hidden_state = self.activation(hidden_state)
+        hidden_state = self.linear_down(hidden_state)
+        return hidden_state  # no residual here; LevitStage wraps this layer in LevitResidualLayer
+
+
+class LevitResidualLayer(nn.Module):
+    """
+    Residual Block for LeViT: adds `module(x)` to `x`, randomly dropping the branch per sample while training.
+    """
+
+    def __init__(self, module, drop_rate):
+        super().__init__()
+        self.module = module  # wrapped sub-layer whose output is added to its input
+        self.drop_rate = drop_rate  # probability of zeroing the branch for a batch item during training
+
+    def forward(self, hidden_state):
+        if self.training and self.drop_rate > 0:
+            rnd = torch.rand(hidden_state.size(0), 1, 1, device=hidden_state.device)  # one draw per batch item
+            rnd = rnd.ge_(self.drop_rate).div(1 - self.drop_rate).detach()  # 0/1 keep mask scaled by 1 / (1 - p)
+            hidden_state = hidden_state + self.module(hidden_state) * rnd  # (batch, 1, 1) mask broadcasts
+            return hidden_state
+        else:
+            hidden_state = hidden_state + self.module(hidden_state)  # not training, or drop_rate <= 0
+            return hidden_state
+
+
+class LevitStage(nn.Module):
+    """
+    LeViT Stage consisting of `LevitMLPLayer` and `LevitAttention` layers.
+    """
+
+    def __init__(
+        self,
+        config,
+        idx,  # stage index; used to slice config.hidden_sizes for the subsample layer
+        hidden_sizes,  # a single width for this stage (it is multiplied by mlp_ratio below)
+        key_dim,
+        depths,  # number of attention (+ optional MLP) residual layers to stack
+        num_attention_heads,
+        attention_ratio,
+        mlp_ratio,  # MLP hidden width as a multiple of hidden_sizes; <= 0 disables the MLP layers
+        down_ops,  # [0] kind, [1] key_dim, [2] heads, [3] attention_ratio, [4] MLP ratio, [5] stride
+        resolution_in,
+    ):
+        super().__init__()
+        self.layers = []
+        self.config = config
+        self.resolution_in = resolution_in
+        # resolution_in is the initial resolution, resolution_out is final resolution after downsampling
+        for _ in range(depths):
+            self.layers.append(
+                LevitResidualLayer(
+                    LevitAttention(hidden_sizes, key_dim, num_attention_heads, attention_ratio, resolution_in),
+                    self.config.drop_path_rate,
+                )
+            )
+            if mlp_ratio > 0:
+                hidden_dim = hidden_sizes * mlp_ratio
+                self.layers.append(
+                    LevitResidualLayer(LevitMLPLayer(hidden_sizes, hidden_dim), self.config.drop_path_rate)
+                )
+
+        if down_ops[0] == "Subsample":  # any other value (e.g. "") leaves the stage without downsampling
+            self.resolution_out = (self.resolution_in - 1) // down_ops[5] + 1  # ceil(resolution_in / stride)
+            self.layers.append(
+                LevitAttentionSubsample(
+                    *self.config.hidden_sizes[idx : idx + 2],  # (input_dim, output_dim): this and the next width
+                    key_dim=down_ops[1],
+                    num_attention_heads=down_ops[2],
+                    attention_ratio=down_ops[3],
+                    stride=down_ops[5],
+                    resolution_in=resolution_in,
+                    resolution_out=self.resolution_out,
+                )
+            )
+            self.resolution_in = self.resolution_out  # get_resolution() now reports the downsampled size
+            if down_ops[4] > 0:
+                hidden_dim = self.config.hidden_sizes[idx + 1] * down_ops[4]
+                self.layers.append(
+                    LevitResidualLayer(
+                        LevitMLPLayer(self.config.hidden_sizes[idx + 1], hidden_dim), self.config.drop_path_rate
+                    )
+                )
+
+        self.layers = nn.ModuleList(self.layers)  # swap the plain list for a ModuleList of the same layers
+
+    def get_resolution(self):
+        return self.resolution_in  # resolution after this stage; LevitEncoder feeds it to the next stage
+
+    def forward(self, hidden_state):
+        for layer in self.layers:  # layers run strictly in construction order
+            hidden_state = layer(hidden_state)
+        return hidden_state
+
+
+class LevitEncoder(nn.Module):
+    """
+    LeViT Encoder: runs `len(config.depths)` `LevitStage` stages in sequence, threading the grid resolution.
+    """
+
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        resolution = self.config.image_size // self.config.patch_size
+        self.stages = []
+        down_ops = [*config.down_ops, [""]]  # padded copy (last stage: no down op); `config` is left untouched
+
+        for stage_idx in range(len(config.depths)):
+            stage = LevitStage(
+                config,
+                stage_idx,
+                config.hidden_sizes[stage_idx],
+                config.key_dim[stage_idx],
+                config.depths[stage_idx],
+                config.num_attention_heads[stage_idx],
+                config.attention_ratio[stage_idx],
+                config.mlp_ratio[stage_idx],
+                down_ops[stage_idx],
+                resolution,
+            )
+            resolution = stage.get_resolution()  # the next stage starts from this stage's output resolution
+            self.stages.append(stage)
+
+        self.stages = nn.ModuleList(self.stages)
+
+    def forward(self, hidden_state, output_hidden_states=False, return_dict=True):
+        all_hidden_states = () if output_hidden_states else None
+
+        for stage in self.stages:
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_state,)  # record the input of every stage
+            hidden_state = stage(hidden_state)
+
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_state,)  # ... plus the final output
+        if not return_dict:
+            return tuple(v for v in [hidden_state, all_hidden_states] if v is not None)  # None entries are dropped
+
+        return BaseModelOutputWithNoAttention(last_hidden_state=hidden_state, hidden_states=all_hidden_states)
+
+
+class LevitClassificationLayer(nn.Module):
+    """
+    LeViT Classification Layer: `BatchNorm1d` over the input features followed by a `Linear` projection.
+    """
+
+    def __init__(self, input_dim, output_dim):
+        super().__init__()
+        self.batch_norm = nn.BatchNorm1d(input_dim)
+        self.linear = nn.Linear(input_dim, output_dim)  # input_dim features -> output_dim logits
+
+    def forward(self, hidden_state):
+        hidden_state = self.batch_norm(hidden_state)  # normalise first, then project
+        logits = self.linear(hidden_state)
+        return logits
+
+
+@auto_docstring
+class LevitPreTrainedModel(PreTrainedModel):
+    config: LevitConfig
+    base_model_prefix = "levit"  # same name as the `self.levit` attribute of the classification models
+    main_input_name = "pixel_values"  # same name as the first `forward` argument of the models below
+    _no_split_modules = ["LevitResidualLayer"]
+
+    def _init_weights(self, module):
+        """Initialize `module` in place (Linear/Conv2d: normal weights, zero bias; BatchNorm: weight 1, bias 0)."""
+        if isinstance(module, (nn.Linear, nn.Conv2d)):
+            # Slightly different from the TF version which uses truncated_normal for initialization
+            # cf https://github.com/pytorch/pytorch/pull/5617
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.bias is not None:  # layers built without a bias have `bias` set to None
+                module.bias.data.zero_()
+        elif isinstance(module, (nn.BatchNorm1d, nn.BatchNorm2d)):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)  # weight 1 / bias 0: the affine part starts as the identity
+
+
+@auto_docstring
+class LevitModel(LevitPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.config = config
+        self.patch_embeddings = LevitPatchEmbeddings(config)
+        self.encoder = LevitEncoder(config)
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, BaseModelOutputWithPoolingAndNoAttention]:
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )  # an explicit argument wins; None falls back to the config
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if pixel_values is None:  # the default exists only for the signature; the input is mandatory
+            raise ValueError("You have to specify pixel_values")
+
+        embeddings = self.patch_embeddings(pixel_values)
+        encoder_outputs = self.encoder(
+            embeddings,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        last_hidden_state = encoder_outputs[0]  # index 0 is read in both the tuple and the dict-style case
+
+        # global average pooling, (batch_size, seq_length, hidden_sizes) -> (batch_size, hidden_sizes)
+        pooled_output = last_hidden_state.mean(dim=1)
+
+        if not return_dict:
+            return (last_hidden_state, pooled_output) + encoder_outputs[1:]  # [1:] holds hidden states, if any
+
+        return BaseModelOutputWithPoolingAndNoAttention(
+            last_hidden_state=last_hidden_state,
+            pooler_output=pooled_output,
+            hidden_states=encoder_outputs.hidden_states,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    Levit Model with an image classification head on top (a linear layer on top of the pooled features), e.g. for
+    ImageNet.
+    """
+)
+class LevitForImageClassification(LevitPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.config = config
+        self.num_labels = config.num_labels
+        self.levit = LevitModel(config)
+
+        # Classifier head
+        self.classifier = (
+            LevitClassificationLayer(config.hidden_sizes[-1], config.num_labels)
+            if config.num_labels > 0
+            else torch.nn.Identity()  # no labels configured: the pooled features are returned as "logits"
+        )
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, ImageClassifierOutputWithNoAttention]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.levit(pixel_values, output_hidden_states=output_hidden_states, return_dict=return_dict)
+
+        sequence_output = outputs[0]  # last hidden state
+        sequence_output = sequence_output.mean(1)  # mean over dim 1, the same pooling LevitModel applies
+        logits = self.classifier(sequence_output)
+
+        loss = None
+        if labels is not None:
+            if self.config.problem_type is None:  # inferred once, then stored on the config
+                if self.num_labels == 1:
+                    self.config.problem_type = "regression"
+                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
+                    self.config.problem_type = "single_label_classification"
+                else:
+                    self.config.problem_type = "multi_label_classification"
+
+            if self.config.problem_type == "regression":
+                loss_fct = MSELoss()
+                if self.num_labels == 1:
+                    loss = loss_fct(logits.squeeze(), labels.squeeze())  # squeeze both so the shapes line up
+                else:
+                    loss = loss_fct(logits, labels)
+            elif self.config.problem_type == "single_label_classification":
+                loss_fct = CrossEntropyLoss()
+                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+            elif self.config.problem_type == "multi_label_classification":
+                loss_fct = BCEWithLogitsLoss()
+                loss = loss_fct(logits, labels)
+        if not return_dict:
+            output = (logits,) + outputs[2:]  # skip last_hidden_state and the pooled output
+            return ((loss,) + output) if loss is not None else output
+
+        return ImageClassifierOutputWithNoAttention(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    LeViT Model transformer with image classification heads on top (a linear layer on top of the final hidden state and
+    a linear layer on top of the final hidden state of the distillation token) e.g. for ImageNet. .. warning::
+           This model supports inference-only. Fine-tuning with distillation (i.e. with a teacher) is not yet
+           supported.
+    """
+)
+class LevitForImageClassificationWithTeacher(LevitPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.config = config
+        self.num_labels = config.num_labels
+        self.levit = LevitModel(config)
+
+        # Classifier head
+        self.classifier = (
+            LevitClassificationLayer(config.hidden_sizes[-1], config.num_labels)
+            if config.num_labels > 0
+            else torch.nn.Identity()
+        )
+        self.classifier_distill = (  # second head, built exactly like `classifier`
+            LevitClassificationLayer(config.hidden_sizes[-1], config.num_labels)
+            if config.num_labels > 0
+            else torch.nn.Identity()
+        )
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, LevitForImageClassificationWithTeacherOutput]:
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.levit(pixel_values, output_hidden_states=output_hidden_states, return_dict=return_dict)
+
+        sequence_output = outputs[0]  # last hidden state
+        sequence_output = sequence_output.mean(1)  # both heads read the same mean-pooled features
+        cls_logits, distill_logits = self.classifier(sequence_output), self.classifier_distill(sequence_output)
+        logits = (cls_logits + distill_logits) / 2  # final prediction: plain average of the two heads
+
+        if not return_dict:
+            output = (logits, cls_logits, distill_logits) + outputs[2:]  # outputs[2:]: hidden states, if returned
+            return output  # no labels are accepted and no loss is computed in this model
+
+        return LevitForImageClassificationWithTeacherOutput(
+            logits=logits,
+            cls_logits=cls_logits,
+            distillation_logits=distill_logits,
+            hidden_states=outputs.hidden_states,
+        )
+
+
+__all__ = [  # public names of this module; the helper layers above are intentionally not listed
+    "LevitForImageClassification",
+    "LevitForImageClassificationWithTeacher",
+    "LevitModel",
+    "LevitPreTrainedModel",
+]
diff --git a/phivenv/Lib/site-packages/transformers/models/lfm2/__init__.py b/phivenv/Lib/site-packages/transformers/models/lfm2/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..239ab879832c57465eb85f910cadd0f373abe46a
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/lfm2/__init__.py
@@ -0,0 +1,27 @@
+# Copyright 2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:  # branch seen by static type checkers only
+    from .configuration_lfm2 import *
+    from .modeling_lfm2 import *
+else:  # at runtime: swap this module's entry in sys.modules for a _LazyModule built from this file's path
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/phivenv/Lib/site-packages/transformers/models/lfm2/__pycache__/__init__.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/lfm2/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4b7b8532740bdf1cfa8c0e3866c7e1f95c48505b
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/lfm2/__pycache__/__init__.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/lfm2/__pycache__/configuration_lfm2.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/lfm2/__pycache__/configuration_lfm2.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..076db787d64b37de8e645414fd0b2522b2b07a5b
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/lfm2/__pycache__/configuration_lfm2.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/lfm2/__pycache__/modeling_lfm2.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/lfm2/__pycache__/modeling_lfm2.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b1dac8cc587e002a9c2dbd1505060ca936507944
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/lfm2/__pycache__/modeling_lfm2.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/lfm2/__pycache__/modular_lfm2.cpython-39.pyc b/phivenv/Lib/site-packages/transformers/models/lfm2/__pycache__/modular_lfm2.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c9ae9a8212e353326664ec3aa50d04fc5693c20b
Binary files /dev/null and b/phivenv/Lib/site-packages/transformers/models/lfm2/__pycache__/modular_lfm2.cpython-39.pyc differ
diff --git a/phivenv/Lib/site-packages/transformers/models/lfm2/configuration_lfm2.py b/phivenv/Lib/site-packages/transformers/models/lfm2/configuration_lfm2.py
new file mode 100644
index 0000000000000000000000000000000000000000..ce331a311a7ebd8c3b4719b35a4a7992b4f60a7f
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/lfm2/configuration_lfm2.py
@@ -0,0 +1,165 @@
+# Copyright 2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Optional
+
+from ...configuration_utils import PretrainedConfig
+
+
+class Lfm2Config(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Lfm2Model`]. It is used to instantiate a LFM2
+    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+    defaults will yield a similar configuration to that of the LFM2-1.2B model.
+    e.g. [LiquidAI/LFM2-1.2B](https://huggingface.co/LiquidAI/LFM2-1.2B)
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 65536):
+            Vocabulary size of the LFM2 model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`Lfm2Model`]
+        hidden_size (`int`, *optional*, defaults to 2560):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 12288):
+            Dimension of the MLP representations. A `block_ff_dim` entry in `kwargs` takes precedence.
+        num_hidden_layers (`int`, *optional*, defaults to 32):
+            Number of hidden layers in the Transformer decoder.
+        num_attention_heads (`int`, *optional*, defaults to 32):
+            Number of attention heads for each attention layer in the Transformer decoder.
+        num_key_value_heads (`int`, *optional*, defaults to 8):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details, check out [this
+            paper](https://huggingface.co/papers/2305.13245). The value is stored as given; there is no fallback
+            to `num_attention_heads`.
+        max_position_embeddings (`int`, *optional*, defaults to 128000):
+            The maximum sequence length that this model might ever be used with. The value is only stored by this
+            class; it is not validated here.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        pad_token_id (`int`, *optional*, defaults to 0):
+            Padding token id.
+        bos_token_id (`int`, *optional*, defaults to 1):
+            Beginning of stream token id.
+        eos_token_id (`int`, *optional*, defaults to 2):
+            End of stream token id.
+        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
+            Whether to tie weight embeddings. A `tie_embedding` entry in `kwargs` takes precedence.
+        rope_theta (`float`, *optional*, defaults to 1000000.0):
+            The base period of the RoPE embeddings. A `theta` entry in `kwargs` takes precedence.
+        conv_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use bias in the conv layers.
+        conv_L_cache (`int`, *optional*, defaults to 3):
+            L_cache dim in the conv layers.
+        block_multiple_of (`int`, *optional*, defaults to 256):
+            Multiple for the `intermediate_size`.
+        block_ffn_dim_multiplier (`float`, *optional*, defaults to 1.0):
+            Multiplier for the `intermediate_size`.
+        block_auto_adjust_ff_dim (`bool`, *optional*, defaults to `True`):
+            Whether to adjust the dim of the `intermediate_size`.
+        full_attn_idxs (`list[int]`, *optional*):
+            Indices of the layers that use full attention; all layers when `None`. Ignored if `layer_types` is given.
+        layer_types (`list[str]`, *optional*):
+            Type of each layer (`"full_attention"` or `"conv"`); derived from `full_attn_idxs` when `None`.
+
+    ```python
+    >>> from transformers import Lfm2Model, Lfm2Config
+
+    >>> # Initializing a LFM2 configuration
+    >>> configuration = Lfm2Config()
+
+    >>> # Initializing a model from the LFM2-1.2B style configuration
+    >>> model = Lfm2Model(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "lfm2"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        vocab_size: int = 65536,
+        hidden_size: int = 2560,
+        intermediate_size: int = 12288,
+        num_hidden_layers: int = 32,
+        num_attention_heads: int = 32,
+        num_key_value_heads: int = 8,
+        max_position_embeddings: int = 128_000,
+        initializer_range: float = 0.02,
+        norm_eps: float = 0.00001,
+        use_cache: bool = True,
+        pad_token_id: int = 0,
+        bos_token_id: int = 1,
+        eos_token_id: int = 2,
+        tie_word_embeddings: bool = True,
+        rope_theta: float = 1000000.0,
+        conv_bias: bool = False,
+        conv_L_cache: int = 3,
+        block_multiple_of: int = 256,
+        block_ffn_dim_multiplier: float = 1.0,
+        block_auto_adjust_ff_dim: bool = True,
+        full_attn_idxs: Optional[list[int]] = None,
+        layer_types: Optional[list[str]] = None,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.rope_theta = kwargs.get("theta", rope_theta)  # original-config key `theta` takes precedence if present
+        self.max_position_embeddings = max_position_embeddings
+        self.use_cache = use_cache
+        self.norm_eps = norm_eps
+        self.initializer_range = initializer_range
+
+        # attn operator config
+        self.num_attention_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+
+        # custom operator config
+        self.conv_bias = conv_bias
+        self.conv_L_cache = conv_L_cache
+
+        # MLP config
+        self.intermediate_size = kwargs.get("block_ff_dim", intermediate_size)  # original key `block_ff_dim` wins
+        self.block_multiple_of = block_multiple_of
+        self.block_ffn_dim_multiplier = block_ffn_dim_multiplier
+        self.block_auto_adjust_ff_dim = block_auto_adjust_ff_dim
+
+        self.layer_types = layer_types
+        if self.layer_types is None:  # derive the per-layer types from full_attn_idxs (default: every layer)
+            full_attn_idxs = full_attn_idxs if full_attn_idxs is not None else list(range(num_hidden_layers))
+            self.layer_types = ["full_attention" if i in full_attn_idxs else "conv" for i in range(num_hidden_layers)]
+
+        tie_word_embeddings = kwargs.get("tie_embedding", tie_word_embeddings)  # original key `tie_embedding` wins
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,  # .get() above does not remove the original-config keys, so they are forwarded too
+        )
+
+
+__all__ = ["Lfm2Config"]  # the only public name of this module
diff --git a/phivenv/Lib/site-packages/transformers/models/lfm2/modeling_lfm2.py b/phivenv/Lib/site-packages/transformers/models/lfm2/modeling_lfm2.py
new file mode 100644
index 0000000000000000000000000000000000000000..d802bd8e81a9a4be729255cbd947338951935887
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/lfm2/modeling_lfm2.py
@@ -0,0 +1,754 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/lfm2/modular_lfm2.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_lfm2.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# Copyright 2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Any, Callable, Optional, Union
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from ...cache_utils import Cache
+from ...generation import GenerationMixin
+from ...integrations import use_kernel_forward_from_hub
+from ...masking_utils import create_causal_mask
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
+from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from ...processing_utils import Unpack
+from ...utils import TransformersKwargs, auto_docstring, can_return_tuple
+from ...utils.deprecation import deprecate_kwarg
+from ...utils.generic import check_model_inputs
+from ...utils.import_utils import is_causal_conv1d_available
+from .configuration_lfm2 import Lfm2Config
+
+
+if is_causal_conv1d_available():  # the `causal_conv1d` package is optional
+    from causal_conv1d import causal_conv1d_fn, causal_conv1d_update
+else:
+    causal_conv1d_fn, causal_conv1d_update = None, None  # falsy sentinels, tested by `is_fast_path_available`
+
+
+@use_kernel_forward_from_hub("RMSNorm")
+class Lfm2RMSNorm(nn.Module):
+    def __init__(self, hidden_size, eps=1e-6):
+        """
+        Lfm2RMSNorm is equivalent to T5LayerNorm
+        """
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+
+    def forward(self, hidden_states):
+        input_dtype = hidden_states.dtype
+        hidden_states = hidden_states.to(torch.float32)
+        variance = hidden_states.pow(2).mean(-1, keepdim=True)
+        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+        return self.weight * hidden_states.to(input_dtype)
+
+    def extra_repr(self):
+        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
+
+
+class Lfm2RotaryEmbedding(nn.Module):
+    inv_freq: torch.Tensor  # fix linting for `register_buffer`
+
+    def __init__(self, config: Lfm2Config, device=None):
+        super().__init__()
+        # BC: "rope_type" was originally "type"
+        if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict):
+            self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
+        else:
+            self.rope_type = "default"
+        self.max_seq_len_cached = config.max_position_embeddings
+        self.original_max_seq_len = config.max_position_embeddings
+
+        self.config = config
+        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
+
+        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+        self.original_inv_freq = self.inv_freq
+
+    @torch.no_grad()
+    @dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
+    def forward(self, x, position_ids):
+        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
+        position_ids_expanded = position_ids[:, None, :].float()
+
+        device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
+        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
+            emb = torch.cat((freqs, freqs), dim=-1)
+            cos = emb.cos() * self.attention_scaling
+            sin = emb.sin() * self.attention_scaling
+
+        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
+
+
+class Lfm2MLP(nn.Module):
+    def __init__(self, config: Lfm2Config):
+        super().__init__()
+        intermediate_size = config.intermediate_size
+        if config.block_auto_adjust_ff_dim:
+            intermediate_size = int(2 * intermediate_size / 3)
+            # custom dim factor multiplier
+            if config.block_ffn_dim_multiplier is not None:
+                intermediate_size = int(config.block_ffn_dim_multiplier * intermediate_size)
+                intermediate_size = config.block_multiple_of * (
+                    (intermediate_size + config.block_multiple_of - 1) // config.block_multiple_of
+                )
+        self.w1 = nn.Linear(config.hidden_size, intermediate_size, bias=False)
+        self.w3 = nn.Linear(config.hidden_size, intermediate_size, bias=False)
+        self.w2 = nn.Linear(intermediate_size, config.hidden_size, bias=False)
+
+    def forward(self, x):
+        return self.w2(F.silu(self.w1(x)) * self.w3(x))
+
+
+class Lfm2HybridConvCache:
+    """
+    Attention and conv cache for Lfm2.
+
+    It stores the Key and Value states as a list of tensors, one for each layer.
+    Attention layer cache shape: `[batch_size, num_heads, seq_len, head_dim]`.
+    Conv layer cache shape: `[batch_size, hidden_size, L_cache-1]`.
+    """
+
+    # Override @property existing in Cache
+    max_batch_size = None
+    is_compileable = False
+    key_cache = None
+    value_cache = None
+
+    def __init__(
+        self,
+        config: Lfm2Config,
+        max_batch_size: int,
+        dtype: torch.dtype = torch.float32,
+        device: Union[torch.device, str, None] = None,
+    ):
+        self.key_cache = []
+        self.value_cache = []
+        self.max_batch_size = max_batch_size
+        self.layer_types = config.layer_types
+        self.first_attention_layer = self.layer_types.index("full_attention")
+        self.conv_L_cache = config.conv_L_cache
+        self._dtype = dtype
+
+        self.conv_cache: list[torch.Tensor] = []
+        device = torch.device(device) if device is not None else None
+
+        for _ in range(config.num_hidden_layers):
+            conv_state = torch.zeros(
+                self.max_batch_size,
+                config.hidden_size,
+                self.conv_L_cache,
+                dtype=self._dtype,
+                device=device,
+            )
+            torch._dynamo.mark_static_address(conv_state)
+            self.conv_cache.append(conv_state)
+
+    def update(
+        self,
+        key_states: torch.Tensor,
+        value_states: torch.Tensor,
+        layer_idx: int,
+        cache_kwargs: Optional[dict[str, Any]] = None,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """
+        Updates the cache with the new `key_states` and `value_states` for the layer `layer_idx`.
+
+        Parameters:
+            key_states (`torch.Tensor`):
+                The new key states to cache.
+            value_states (`torch.Tensor`):
+                The new value states to cache.
+            layer_idx (`int`):
+                The index of the layer to cache the states for.
+            cache_kwargs (`Dict[str, Any]`, `optional`):
+                Additional arguments for the cache subclass. No additional arguments are used in `DynamicCache`.
+
+        Return:
+            A tuple containing the updated key and value states.
+        """
+        # Update the cache
+        if key_states is not None:
+            if len(self.key_cache) <= layer_idx:
+                # There may be skipped layers, fill them with empty lists
+                for _ in range(len(self.key_cache), layer_idx):
+                    self.key_cache.append(torch.tensor([]))
+                    self.value_cache.append(torch.tensor([]))
+                self.key_cache.append(key_states)
+                self.value_cache.append(value_states)
+            elif (
+                not self.key_cache[layer_idx].numel()  # prefers not t.numel() to len(t) == 0 to export the model
+            ):  # fills previously skipped layers; checking for tensor causes errors
+                self.key_cache[layer_idx] = key_states
+                self.value_cache[layer_idx] = value_states
+            else:
+                self.key_cache[layer_idx] = torch.cat([self.key_cache[layer_idx], key_states], dim=-2)
+                self.value_cache[layer_idx] = torch.cat([self.value_cache[layer_idx], value_states], dim=-2)
+
+        return self.key_cache[layer_idx], self.value_cache[layer_idx]
+
+    def reorder_cache(self, beam_idx: torch.LongTensor):
+        """Reorders the cache for beam search, given the selected beam indices."""
+        for layer_idx in range(len(self.key_cache)):
+            device = self.key_cache[layer_idx].device
+            self.key_cache[layer_idx] = self.key_cache[layer_idx].index_select(0, beam_idx.to(device))
+            device = self.value_cache[layer_idx].device
+            self.value_cache[layer_idx] = self.value_cache[layer_idx].index_select(0, beam_idx.to(device))
+
+            device = self.conv_cache[layer_idx].device
+            self.conv_cache[layer_idx] = self.conv_cache[layer_idx].index_select(0, beam_idx.to(device))
+
+    def get_seq_length(self, layer_idx: Optional[int] = 0) -> int:
+        """Returns the sequence length of the cached states. A layer index can be optionally passed."""
+        # take any layer that contains cache and not empty tensor
+        layer_idx = self.first_attention_layer if self.layer_types[layer_idx] != "full_attention" else layer_idx
+        if len(self.key_cache) <= layer_idx or self.key_cache[layer_idx].numel() == 0:
+            return 0
+        return self.key_cache[layer_idx].shape[-2]
+
+    def get_mask_sizes(self, cache_position: torch.Tensor, layer_idx: int) -> tuple[int, int]:
+        """
+        Return a tuple (kv_length, kv_offset) corresponding to the length and offset that will be returned for
+        the given layer at `layer_idx`.
+        The masks are then prepared according to the given lengths (kv_length, kv_offset) and patterns (i.e. sliding_window, chunk_size),
+        for each layer.
+        """
+        full_mask_kv_offset = 0
+        query_length = cache_position.shape[0]
+        past_seen_tokens = self.get_seq_length()
+        kv_length = query_length + past_seen_tokens
+        return kv_length, full_mask_kv_offset
+
+    def crop(self, max_length: int):
+        """Crop the cache to the given length"""
+        if max_length < 0:
+            max_length = self.get_seq_length() - abs(max_length)
+
+        if self.get_seq_length() <= max_length:
+            return
+
+        for idx in range(len(self.key_cache)):
+            if self.key_cache[idx].numel():
+                self.key_cache[idx] = self.key_cache[idx][..., :max_length, :]
+                self.value_cache[idx] = self.value_cache[idx][..., :max_length, :]
+
+    def __len__(self) -> int:
+        return len(self.key_cache)
+
+    def __getitem__(self, layer_idx: int) -> tuple[torch.Tensor, torch.Tensor]:
+        return self.key_cache[layer_idx], self.value_cache[layer_idx]
+
+    def reset(self):
+        for layer_idx in range(len(self.conv_cache)):
+            # In-place ops prevent breaking the static address
+            self.conv_cache[layer_idx].zero_()
+
+
+def rotate_half(x):
+    """Rotates half the hidden dims of the input."""
+    x1 = x[..., : x.shape[-1] // 2]
+    x2 = x[..., x.shape[-1] // 2 :]
+    return torch.cat((-x2, x1), dim=-1)
+
+
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
+    """Applies Rotary Position Embedding to the query and key tensors.
+
+    Args:
+        q (`torch.Tensor`): The query tensor.
+        k (`torch.Tensor`): The key tensor.
+        cos (`torch.Tensor`): The cosine part of the rotary embedding.
+        sin (`torch.Tensor`): The sine part of the rotary embedding.
+        position_ids (`torch.Tensor`, *optional*):
+            Deprecated and unused.
+        unsqueeze_dim (`int`, *optional*, defaults to 1):
+            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
+            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
+            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
+            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
+            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
+            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
+    Returns:
+        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
+    """
+    cos = cos.unsqueeze(unsqueeze_dim)
+    sin = sin.unsqueeze(unsqueeze_dim)
+    q_embed = (q * cos) + (rotate_half(q) * sin)
+    k_embed = (k * cos) + (rotate_half(k) * sin)
+    return q_embed, k_embed
+
+
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """
+    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
+    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
+    """
+    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+    if n_rep == 1:
+        return hidden_states
+    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
+    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
+
+
+def eager_attention_forward(
+    module: nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: Optional[torch.Tensor],
+    scaling: float,
+    dropout: float = 0.0,
+    **kwargs: Unpack[TransformersKwargs],
+):
+    key_states = repeat_kv(key, module.num_key_value_groups)
+    value_states = repeat_kv(value, module.num_key_value_groups)
+
+    attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
+    if attention_mask is not None:
+        causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
+        attn_weights = attn_weights + causal_mask
+
+    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
+    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
+    attn_output = torch.matmul(attn_weights, value_states)
+    attn_output = attn_output.transpose(1, 2).contiguous()
+
+    return attn_output, attn_weights
+
+
+class Lfm2Attention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+    def __init__(self, config: Lfm2Config, layer_idx: int):
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+        self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
+        self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
+        self.scaling = self.head_dim**-0.5
+        self.is_causal = True
+        self.q_proj = nn.Linear(config.hidden_size, config.num_attention_heads * self.head_dim, bias=False)
+        self.k_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=False)
+        self.v_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=False)
+        self.out_proj = nn.Linear(config.num_attention_heads * self.head_dim, config.hidden_size, bias=False)
+        self.q_layernorm = Lfm2RMSNorm(self.head_dim, eps=config.norm_eps)
+        self.k_layernorm = Lfm2RMSNorm(self.head_dim, eps=config.norm_eps)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
+        attention_mask: Optional[torch.Tensor],
+        past_key_values: Optional[Lfm2HybridConvCache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        input_shape = hidden_states.shape[:-1]
+        hidden_shape = (*input_shape, -1, self.head_dim)
+
+        query_states = self.q_layernorm(self.q_proj(hidden_states).view(*hidden_shape)).transpose(1, 2)
+        key_states = self.k_layernorm(self.k_proj(hidden_states).view(*hidden_shape)).transpose(1, 2)
+        value_states = self.v_proj(hidden_states).view(*hidden_shape).transpose(1, 2)
+
+        cos, sin = position_embeddings
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+
+        if past_key_values is not None:
+            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        attention_interface: Callable = eager_attention_forward
+        if self.config._attn_implementation != "eager":
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            dropout=0.0,
+            scaling=self.scaling,
+            **kwargs,
+        )
+        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
+        output = self.out_proj(attn_output)
+        return output, attn_weights
+
+
+def apply_mask_to_padding_states(hidden_states, attention_mask):
+    """
+    Tunes out the hidden states for padding tokens, see https://github.com/state-spaces/mamba/issues/66
+    """
+    if attention_mask is not None and attention_mask.shape[1] > 1 and attention_mask.shape[0] > 1:
+        dtype = hidden_states.dtype
+        hidden_states = (hidden_states * attention_mask[:, :, None]).to(dtype)
+
+    return hidden_states
+
+
+kernel_modules = (causal_conv1d_fn, causal_conv1d_update)  # both None when `is_causal_conv1d_available()` is False
+is_fast_path_available = all(kernel_modules)  # checked by `Lfm2ShortConv.forward` before using the kernel path
+
+
+class Lfm2ShortConv(nn.Module):  # gated depthwise short convolution: out_proj(C * conv(B * x))
+    def __init__(
+        self,
+        config: Lfm2Config,
+        layer_idx: int,
+    ):
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx  # index of this layer's slot in `Lfm2HybridConvCache.conv_cache`
+        self.L_cache = config.conv_L_cache  # kernel size, also the width the cached conv state is padded to
+        self.bias = config.conv_bias  # shared by the conv and both linear projections
+        # Depthwise conv (groups == channels); outputs beyond `seqlen` are cropped in `slow_forward`
+        self.conv = nn.Conv1d(
+            in_channels=config.hidden_size,
+            out_channels=config.hidden_size,
+            kernel_size=self.L_cache,
+            groups=config.hidden_size,
+            bias=self.bias,
+            padding=self.L_cache - 1,
+        )
+        self.in_proj = nn.Linear(config.hidden_size, 3 * config.hidden_size, bias=self.bias)  # yields B, C and x
+        self.out_proj = nn.Linear(config.hidden_size, config.hidden_size, bias=self.bias)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def cuda_kernels_forward(  # path backed by the `causal_conv1d` kernels
+        self,
+        x: torch.Tensor,
+        past_key_values: Optional[Lfm2HybridConvCache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+    ):
+        x = apply_mask_to_padding_states(x, attention_mask)
+        BCx = self.in_proj(x).transpose(-1, -2)  # last two axes swapped so the split below is along dim -2
+        B, C, x = BCx.chunk(3, dim=-2)
+        # Input gate applied before the convolution
+        Bx = B * x
+        # Conv weight viewed without its middle axis: (size(0), size(2))
+        conv_weights = self.conv.weight.view(self.conv.weight.size(0), self.conv.weight.size(2))
+        if past_key_values is not None and cache_position[0] > 0:  # not the first chunk: reuse the cached state
+            conv_out = causal_conv1d_update(
+                Bx.squeeze(-1),
+                past_key_values.conv_cache[self.layer_idx],
+                conv_weights,
+                self.conv.bias,
+                None,
+            )
+            conv_out = conv_out.unsqueeze(-1)
+        else:
+            if past_key_values is not None:  # seed the cached state from the last `L_cache` inputs
+                conv_state = nn.functional.pad(Bx, (self.L_cache - Bx.shape[-1], 0))  # left-pad; negative pad crops
+                past_key_values.conv_cache[self.layer_idx].copy_(conv_state)
+
+            conv_out = causal_conv1d_fn(Bx, conv_weights, self.conv.bias, activation=None)
+
+        y = C * conv_out  # output gate
+        y = self.out_proj(y.transpose(-1, -2).contiguous())
+        return y
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def slow_forward(  # pure-PyTorch path, same gating as `cuda_kernels_forward`
+        self,
+        x: torch.Tensor,
+        past_key_values: Optional[Lfm2HybridConvCache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+    ):
+        seqlen = x.shape[1]  # read before `x` is rebound to one of the projected streams
+
+        x = apply_mask_to_padding_states(x, attention_mask)
+        BCx = self.in_proj(x).transpose(-1, -2)
+        B, C, x = BCx.chunk(3, dim=-2)
+        # Input gate applied before the convolution
+        Bx = B * x
+
+        if past_key_values is not None and cache_position[0] > 0:
+            conv_state = past_key_values.conv_cache[self.layer_idx]
+            cache_position = cache_position.clamp(0, self.L_cache - 1)  # indices past the window hit the last slot
+            conv_state = conv_state.roll(shifts=-1, dims=-1)  # shift left by one; `roll` returns a new tensor
+            conv_state[:, :, cache_position] = Bx.to(device=conv_state.device, dtype=conv_state.dtype)
+            past_key_values.conv_cache[self.layer_idx].copy_(conv_state)  # write back in place
+            conv_out = torch.sum(conv_state.to(Bx.device) * self.conv.weight[:, 0, :], dim=-1)
+            if self.bias:
+                conv_out += self.conv.bias
+            # NOTE(review): this branch looks written for a single new token per call — confirm
+            conv_out = conv_out.unsqueeze(-1)
+        else:
+            if past_key_values is not None:  # seed the cached state from the last `L_cache` inputs
+                conv_state = nn.functional.pad(Bx, (self.L_cache - Bx.shape[-1], 0))  # left-pad; negative pad crops
+                past_key_values.conv_cache[self.layer_idx].copy_(conv_state)
+
+            conv_out = self.conv(Bx)[..., :seqlen]  # drop the extra outputs produced by the padding
+
+        y = C * conv_out  # output gate
+        y = y.transpose(-1, -2).contiguous()
+        y = self.out_proj(y)
+        return y
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(  # kernel path only on CUDA, with the kernels importable and outside torch.compile
+        self,
+        hidden_states: torch.Tensor,
+        past_key_values: Optional[Lfm2HybridConvCache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+    ):
+        if is_fast_path_available and "cuda" in hidden_states.device.type and not torch._dynamo.is_compiling():
+            return self.cuda_kernels_forward(hidden_states, past_key_values, cache_position, attention_mask)
+        return self.slow_forward(hidden_states, past_key_values, cache_position, attention_mask)
+
+
+class Lfm2DecoderLayer(GradientCheckpointingLayer):
+    def __init__(self, config: Lfm2Config, layer_idx: int):
+        super().__init__()
+        self.is_attention_layer = config.layer_types[layer_idx] == "full_attention"
+
+        if self.is_attention_layer:
+            self.self_attn = Lfm2Attention(config, layer_idx)
+        else:
+            self.conv = Lfm2ShortConv(config, layer_idx)
+        self.feed_forward = Lfm2MLP(config)
+        self.operator_norm = Lfm2RMSNorm(config.hidden_size, eps=config.norm_eps)
+        self.ffn_norm = Lfm2RMSNorm(config.hidden_size, eps=config.norm_eps)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[tuple[torch.Tensor]] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
+    ) -> torch.Tensor:
+        residual = hidden_states
+        if self.is_attention_layer:
+            hidden_states, _ = self.self_attn(
+                hidden_states=self.operator_norm(hidden_states),
+                position_embeddings=position_embeddings,
+                attention_mask=attention_mask,
+                position_ids=position_ids,
+                past_key_values=past_key_values,
+                cache_position=cache_position,
+                **kwargs,
+            )
+        else:
+            hidden_states = self.conv(
+                hidden_states=self.operator_norm(hidden_states),
+                past_key_values=past_key_values,
+                cache_position=cache_position,
+                attention_mask=attention_mask,
+            )
+        hidden_states = hidden_states + residual
+        hidden_states = hidden_states + self.feed_forward(self.ffn_norm(hidden_states))
+
+        return hidden_states
+
+
+@auto_docstring
+class Lfm2PreTrainedModel(PreTrainedModel):  # base class carrying the flags shared by the Lfm2 models below
+    config: Lfm2Config
+    base_model_prefix = "model"  # same name as the `self.model` attribute set in `Lfm2ForCausalLM.__init__`
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["Lfm2DecoderLayer"]
+    _skip_keys_device_placement = ["past_key_values"]
+    _supports_flash_attn = True
+    _supports_sdpa = True
+    _supports_flex_attn = True
+    _can_compile_fullgraph = False  # NOTE(review): presumably tied to `Lfm2HybridConvCache.is_compileable` — confirm
+    _supports_attention_backend = True
+    _can_record_outputs = {  # output name -> module class; presumably consumed by `check_model_inputs` — confirm
+        "hidden_states": Lfm2DecoderLayer,
+        "attentions": Lfm2Attention,
+    }
+
+
+@auto_docstring
+class Lfm2Model(Lfm2PreTrainedModel):
+    def __init__(self, config: Lfm2Config):
+        super().__init__(config)
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+        self.layers = nn.ModuleList(
+            [Lfm2DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+        )
+        self.rotary_emb = Lfm2RotaryEmbedding(config=config)
+        self.gradient_checkpointing = False
+        self.pos_emb = Lfm2RotaryEmbedding(config)
+        self.embedding_norm = Lfm2RMSNorm(config.hidden_size, eps=config.norm_eps)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @check_model_inputs
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Lfm2HybridConvCache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> BaseModelOutputWithPast:
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+
+        if use_cache and past_key_values is None:
+            batch_size = inputs_embeds.shape[0]
+            past_key_values = Lfm2HybridConvCache(
+                config=self.config, max_batch_size=batch_size, dtype=self.dtype, device=self.device
+            )
+
+        if cache_position is None:
+            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+            cache_position = torch.arange(
+                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
+            )
+
+        if position_ids is None:
+            position_ids = cache_position.unsqueeze(0)
+
+        causal_mask = create_causal_mask(
+            config=self.config,
+            input_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            cache_position=cache_position,
+            past_key_values=past_key_values,
+            position_ids=position_ids,
+        )
+
+        hidden_states = inputs_embeds
+        position_embeddings = self.pos_emb(hidden_states, position_ids)
+
+        # decoder layers
+        for decoder_layer in self.layers[: self.config.num_hidden_layers]:
+            hidden_states = decoder_layer(
+                hidden_states,
+                attention_mask=causal_mask,
+                position_ids=position_ids,
+                past_key_values=past_key_values,
+                cache_position=cache_position,
+                position_embeddings=position_embeddings,
+                **kwargs,
+            )
+
+        hidden_states = self.embedding_norm(hidden_states)
+
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values,
+        )
+
+
+@auto_docstring
+class Lfm2ForCausalLM(Lfm2PreTrainedModel, GenerationMixin):  # `Lfm2Model` backbone plus a bias-free LM head
+    _tied_weights_keys = ["lm_head.weight"]
+    _tp_plan = {"lm_head": "colwise_rep"}
+    _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.model = Lfm2Model(config)
+        self.vocab_size = config.vocab_size
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> CausalLMOutputWithPast:
+        r"""
+        Example:
+
+        ```python
+        >>> from transformers import AutoTokenizer, Lfm2ForCausalLM
+
+        >>> model = Lfm2ForCausalLM.from_pretrained("LiquidAI/LFM2-1.2B")
+        >>> tokenizer = AutoTokenizer.from_pretrained("LiquidAI/LFM2-1.2B")
+
+        >>> prompt = "Hey, are you conscious? Can you talk to me?"
+        >>> inputs = tokenizer(prompt, return_tensors="pt")
+
+        >>> # Generate
+        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
+        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
+        ```"""
+        outputs: BaseModelOutputWithPast = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            **kwargs,
+        )
+
+        hidden_states = outputs.last_hidden_state
+        # Only project the kept positions; an int `logits_to_keep` of 0 gives slice(0, None), i.e. all positions
+        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+        logits = self.lm_head(hidden_states[:, slice_indices, :])
+
+        loss = None
+        if labels is not None:  # the loss is only computed when labels are supplied
+            loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs)
+
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+__all__ = ["Lfm2ForCausalLM", "Lfm2Model", "Lfm2PreTrainedModel"]
diff --git a/phivenv/Lib/site-packages/transformers/models/lfm2/modular_lfm2.py b/phivenv/Lib/site-packages/transformers/models/lfm2/modular_lfm2.py
new file mode 100644
index 0000000000000000000000000000000000000000..5d3791cbe3b1f0f15afb9a9bd1c3534780c3e6c3
--- /dev/null
+++ b/phivenv/Lib/site-packages/transformers/models/lfm2/modular_lfm2.py
@@ -0,0 +1,520 @@
+# Copyright 2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Any, Callable, Optional, Union
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from ...masking_utils import create_causal_mask
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import BaseModelOutputWithPast
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
+from ...processing_utils import Unpack
+from ...utils import TransformersKwargs, logging
+from ...utils.deprecation import deprecate_kwarg
+from ...utils.import_utils import is_causal_conv1d_available
+from ..bamba.modeling_bamba import apply_mask_to_padding_states
+from ..llama.modeling_llama import (
+    LlamaAttention,
+    LlamaForCausalLM,
+    LlamaModel,
+    LlamaPreTrainedModel,
+    LlamaRMSNorm,
+    LlamaRotaryEmbedding,
+    apply_rotary_pos_emb,
+    eager_attention_forward,
+)
+from .configuration_lfm2 import Lfm2Config
+
+
+# Optional fused kernels from the `causal_conv1d` package; both names are None when it is unavailable.
+if is_causal_conv1d_available():
+    from causal_conv1d import causal_conv1d_fn, causal_conv1d_update
+else:
+    causal_conv1d_fn, causal_conv1d_update = None, None
+
+
+# The fast path is only usable when every kernel above was imported (i.e. none of them is None).
+kernel_modules = (causal_conv1d_fn, causal_conv1d_update)
+is_fast_path_available = all(kernel_modules)
+
+
+# Module-level logger named after this module.
+logger = logging.get_logger(__name__)
+
+
+class Lfm2RMSNorm(LlamaRMSNorm):
+    # Inherits `LlamaRMSNorm` without any override.
+    pass
+
+
+class Lfm2RotaryEmbedding(LlamaRotaryEmbedding):
+    # Inherits `LlamaRotaryEmbedding` without any override.
+    pass
+
+
+class Lfm2MLP(nn.Module):
+    def __init__(self, config: Lfm2Config):
+        super().__init__()
+        intermediate_size = config.intermediate_size
+        if config.block_auto_adjust_ff_dim:
+            intermediate_size = int(2 * intermediate_size / 3)
+            # custom dim factor multiplier
+            if config.block_ffn_dim_multiplier is not None:
+                intermediate_size = int(config.block_ffn_dim_multiplier * intermediate_size)
+                intermediate_size = config.block_multiple_of * (
+                    (intermediate_size + config.block_multiple_of - 1) // config.block_multiple_of
+                )
+        self.w1 = nn.Linear(config.hidden_size, intermediate_size, bias=False)
+        self.w3 = nn.Linear(config.hidden_size, intermediate_size, bias=False)
+        self.w2 = nn.Linear(intermediate_size, config.hidden_size, bias=False)
+
+    def forward(self, x):
+        return self.w2(F.silu(self.w1(x)) * self.w3(x))
+
+
+class Lfm2HybridConvCache:
+    """
+    Attention and conv cache for Lfm2.
+
+    It stores the Key and Value states as a list of tensors, one for each layer.
+    Attention layer cache shape: `[batch_size, num_heads, seq_len, head_dim]`.
+    Conv layer cache shape: `[batch_size, hidden_size, L_cache-1]`.
+    """
+
+    # Override @property existing in Cache
+    max_batch_size = None
+    is_compileable = False
+    key_cache = None
+    value_cache = None
+
+    def __init__(
+        self,
+        config: Lfm2Config,
+        max_batch_size: int,
+        dtype: torch.dtype = torch.float32,
+        device: Union[torch.device, str, None] = None,
+    ):
+        self.key_cache = []
+        self.value_cache = []
+        self.max_batch_size = max_batch_size
+        self.layer_types = config.layer_types
+        self.first_attention_layer = self.layer_types.index("full_attention")
+        self.conv_L_cache = config.conv_L_cache
+        self._dtype = dtype
+
+        self.conv_cache: list[torch.Tensor] = []
+        device = torch.device(device) if device is not None else None
+
+        for _ in range(config.num_hidden_layers):
+            conv_state = torch.zeros(
+                self.max_batch_size,
+                config.hidden_size,
+                self.conv_L_cache,
+                dtype=self._dtype,
+                device=device,
+            )
+            torch._dynamo.mark_static_address(conv_state)
+            self.conv_cache.append(conv_state)
+
+    def update(
+        self,
+        key_states: torch.Tensor,
+        value_states: torch.Tensor,
+        layer_idx: int,
+        cache_kwargs: Optional[dict[str, Any]] = None,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """
+        Updates the cache with the new `key_states` and `value_states` for the layer `layer_idx`.
+
+        Parameters:
+            key_states (`torch.Tensor`):
+                The new key states to cache.
+            value_states (`torch.Tensor`):
+                The new value states to cache.
+            layer_idx (`int`):
+                The index of the layer to cache the states for.
+            cache_kwargs (`Dict[str, Any]`, `optional`):
+                Additional arguments for the cache subclass. No additional arguments are used in `DynamicCache`.
+
+        Return:
+            A tuple containing the updated key and value states.
+        """
+        # Update the cache
+        if key_states is not None:
+            if len(self.key_cache) <= layer_idx:
+                # There may be skipped layers, fill them with empty lists
+                for _ in range(len(self.key_cache), layer_idx):
+                    self.key_cache.append(torch.tensor([]))
+                    self.value_cache.append(torch.tensor([]))
+                self.key_cache.append(key_states)
+                self.value_cache.append(value_states)
+            elif (
+                not self.key_cache[layer_idx].numel()  # prefers not t.numel() to len(t) == 0 to export the model
+            ):  # fills previously skipped layers; checking for tensor causes errors
+                self.key_cache[layer_idx] = key_states
+                self.value_cache[layer_idx] = value_states
+            else:
+                self.key_cache[layer_idx] = torch.cat([self.key_cache[layer_idx], key_states], dim=-2)
+                self.value_cache[layer_idx] = torch.cat([self.value_cache[layer_idx], value_states], dim=-2)
+
+        return self.key_cache[layer_idx], self.value_cache[layer_idx]
+
+    def reorder_cache(self, beam_idx: torch.LongTensor):
+        """Reorders the cache for beam search, given the selected beam indices."""
+        for layer_idx in range(len(self.key_cache)):
+            device = self.key_cache[layer_idx].device
+            self.key_cache[layer_idx] = self.key_cache[layer_idx].index_select(0, beam_idx.to(device))
+            device = self.value_cache[layer_idx].device
+            self.value_cache[layer_idx] = self.value_cache[layer_idx].index_select(0, beam_idx.to(device))
+
+            device = self.conv_cache[layer_idx].device
+            self.conv_cache[layer_idx] = self.conv_cache[layer_idx].index_select(0, beam_idx.to(device))
+
+    def get_seq_length(self, layer_idx: Optional[int] = 0) -> int:
+        """Returns the sequence length of the cached states. A layer index can be optionally passed."""
+        # take any layer that contains cache and not empty tensor
+        layer_idx = self.first_attention_layer if self.layer_types[layer_idx] != "full_attention" else layer_idx
+        if len(self.key_cache) <= layer_idx or self.key_cache[layer_idx].numel() == 0:
+            return 0
+        return self.key_cache[layer_idx].shape[-2]
+
+    def get_mask_sizes(self, cache_position: torch.Tensor, layer_idx: int) -> tuple[int, int]:
+        """
+        Return a tuple (kv_length, kv_offset) corresponding to the length and offset that will be returned for
+        the given layer at `layer_idx`.
+        The masks are then prepared according to the given lengths (kv_length, kv_offset) and patterns (i.e. sliding_window, chunk_size),
+        for each layer.
+        """
+        full_mask_kv_offset = 0
+        query_length = cache_position.shape[0]
+        past_seen_tokens = self.get_seq_length()
+        kv_length = query_length + past_seen_tokens
+        return kv_length, full_mask_kv_offset
+
+    def crop(self, max_length: int):
+        """Crop the cache to the given length"""
+        if max_length < 0:
+            max_length = self.get_seq_length() - abs(max_length)
+
+        if self.get_seq_length() <= max_length:
+            return
+
+        for idx in range(len(self.key_cache)):
+            if self.key_cache[idx].numel():
+                self.key_cache[idx] = self.key_cache[idx][..., :max_length, :]
+                self.value_cache[idx] = self.value_cache[idx][..., :max_length, :]
+
+    def __len__(self) -> int:
+        return len(self.key_cache)
+
+    def __getitem__(self, layer_idx: int) -> tuple[torch.Tensor, torch.Tensor]:
+        return self.key_cache[layer_idx], self.value_cache[layer_idx]
+
+    def reset(self):
+        for layer_idx in range(len(self.conv_cache)):
+            # In-place ops prevent breaking the static address
+            self.conv_cache[layer_idx].zero_()
+
+
+class Lfm2Attention(LlamaAttention):
+    def __init__(self, config: Lfm2Config, layer_idx: int):
+        super().__init__(config, layer_idx)
+        self.q_proj = nn.Linear(config.hidden_size, config.num_attention_heads * self.head_dim, bias=False)
+        self.k_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=False)
+        self.v_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=False)
+        self.out_proj = nn.Linear(config.num_attention_heads * self.head_dim, config.hidden_size, bias=False)
+        self.q_layernorm = Lfm2RMSNorm(self.head_dim, eps=config.norm_eps)
+        self.k_layernorm = Lfm2RMSNorm(self.head_dim, eps=config.norm_eps)
+        del self.o_proj
+        del self.attention_dropout
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
+        attention_mask: Optional[torch.Tensor],
+        past_key_values: Optional[Lfm2HybridConvCache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        input_shape = hidden_states.shape[:-1]
+        hidden_shape = (*input_shape, -1, self.head_dim)
+
+        query_states = self.q_layernorm(self.q_proj(hidden_states).view(*hidden_shape)).transpose(1, 2)
+        key_states = self.k_layernorm(self.k_proj(hidden_states).view(*hidden_shape)).transpose(1, 2)
+        value_states = self.v_proj(hidden_states).view(*hidden_shape).transpose(1, 2)
+
+        cos, sin = position_embeddings
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+
+        if past_key_values is not None:
+            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        attention_interface: Callable = eager_attention_forward
+        if self.config._attn_implementation != "eager":
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            dropout=0.0,
+            scaling=self.scaling,
+            **kwargs,
+        )
+        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
+        output = self.out_proj(attn_output)
+        return output, attn_weights
+
+
+class Lfm2ShortConv(nn.Module):
+    # Gated short convolution: `in_proj` yields three chunks (B, C, x); `B * x` goes through a per-channel
+    # 1D convolution, the result is gated by `C` and mapped back with `out_proj`.
+    def __init__(
+        self,
+        config: Lfm2Config,
+        layer_idx: int,
+    ):
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+        self.L_cache = config.conv_L_cache
+        self.bias = config.conv_bias
+
+        # groups == channels: every channel is convolved independently with its own kernel of size L_cache
+        self.conv = nn.Conv1d(
+            in_channels=config.hidden_size,
+            out_channels=config.hidden_size,
+            kernel_size=self.L_cache,
+            groups=config.hidden_size,
+            bias=self.bias,
+            padding=self.L_cache - 1,
+        )
+        self.in_proj = nn.Linear(config.hidden_size, 3 * config.hidden_size, bias=self.bias)
+        self.out_proj = nn.Linear(config.hidden_size, config.hidden_size, bias=self.bias)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def cuda_kernels_forward(
+        self,
+        x: torch.Tensor,
+        past_key_values: Optional[Lfm2HybridConvCache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+    ):
+        # Variant of the forward pass that relies on the `causal_conv1d` kernels.
+        # NOTE(review): `apply_mask_to_padding_states` comes from the bamba model; presumably it masks out
+        # padded positions — confirm against that helper.
+        x = apply_mask_to_padding_states(x, attention_mask)
+        # Swap the last two dims, then split the projection into three equal chunks along dim -2
+        BCx = self.in_proj(x).transpose(-1, -2)
+        B, C, x = BCx.chunk(3, dim=-2)
+
+        Bx = B * x
+
+        # Drop the middle dim of the conv weight, keeping (weight.size(0), weight.size(2))
+        conv_weights = self.conv.weight.view(self.conv.weight.size(0), self.conv.weight.size(2))
+        if past_key_values is not None and cache_position[0] > 0:
+            # A cache is present and the first cache position is past 0: update using the stored conv state.
+            # NOTE(review): `squeeze(-1)` assumes the last dim of `Bx` has size 1 here — confirm.
+            conv_out = causal_conv1d_update(
+                Bx.squeeze(-1),
+                past_key_values.conv_cache[self.layer_idx],
+                conv_weights,
+                self.conv.bias,
+                None,
+            )
+            conv_out = conv_out.unsqueeze(-1)
+        else:
+            if past_key_values is not None:
+                # Left-pad (or, for a negative amount, left-trim) `Bx` to length L_cache and store it in place
+                conv_state = nn.functional.pad(Bx, (self.L_cache - Bx.shape[-1], 0))
+                past_key_values.conv_cache[self.layer_idx].copy_(conv_state)
+
+            conv_out = causal_conv1d_fn(Bx, conv_weights, self.conv.bias, activation=None)
+
+        y = C * conv_out
+        y = self.out_proj(y.transpose(-1, -2).contiguous())
+        return y
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def slow_forward(
+        self,
+        x: torch.Tensor,
+        past_key_values: Optional[Lfm2HybridConvCache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+    ):
+        # Variant of the forward pass that only uses regular torch ops.
+        seqlen = x.shape[1]
+
+        x = apply_mask_to_padding_states(x, attention_mask)
+        # Swap the last two dims, then split the projection into three equal chunks along dim -2
+        BCx = self.in_proj(x).transpose(-1, -2)
+        B, C, x = BCx.chunk(3, dim=-2)
+
+        Bx = B * x
+
+        if past_key_values is not None and cache_position[0] > 0:
+            conv_state = past_key_values.conv_cache[self.layer_idx]
+            # Keep the write index inside the window of L_cache slots
+            cache_position = cache_position.clamp(0, self.L_cache - 1)
+            # Shift the window by one slot, write the new values, then persist the state in place
+            conv_state = conv_state.roll(shifts=-1, dims=-1)
+            conv_state[:, :, cache_position] = Bx.to(device=conv_state.device, dtype=conv_state.dtype)
+            past_key_values.conv_cache[self.layer_idx].copy_(conv_state)
+            # Per-channel weighted sum over the window using the conv kernel
+            conv_out = torch.sum(conv_state.to(Bx.device) * self.conv.weight[:, 0, :], dim=-1)
+            if self.bias:
+                conv_out += self.conv.bias
+
+            conv_out = conv_out.unsqueeze(-1)
+        else:
+            if past_key_values is not None:
+                # Left-pad (or, for a negative amount, left-trim) `Bx` to length L_cache and store it in place
+                conv_state = nn.functional.pad(Bx, (self.L_cache - Bx.shape[-1], 0))
+                past_key_values.conv_cache[self.layer_idx].copy_(conv_state)
+
+            # The conv pads both sides by L_cache - 1; keep only the first `seqlen` outputs
+            conv_out = self.conv(Bx)[..., :seqlen]
+
+        y = C * conv_out
+        y = y.transpose(-1, -2).contiguous()
+        y = self.out_proj(y)
+        return y
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        past_key_values: Optional[Lfm2HybridConvCache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+    ):
+        # Use the kernel path only when the kernels were imported, the input lives on a CUDA device and
+        # torch dynamo is not compiling; otherwise fall back to the pure torch implementation.
+        if is_fast_path_available and "cuda" in hidden_states.device.type and not torch._dynamo.is_compiling():
+            return self.cuda_kernels_forward(hidden_states, past_key_values, cache_position, attention_mask)
+        return self.slow_forward(hidden_states, past_key_values, cache_position, attention_mask)
+
+
+class Lfm2DecoderLayer(GradientCheckpointingLayer):
+    def __init__(self, config: Lfm2Config, layer_idx: int):
+        super().__init__()
+        self.is_attention_layer = config.layer_types[layer_idx] == "full_attention"
+
+        if self.is_attention_layer:
+            self.self_attn = Lfm2Attention(config, layer_idx)
+        else:
+            self.conv = Lfm2ShortConv(config, layer_idx)
+        self.feed_forward = Lfm2MLP(config)
+        self.operator_norm = Lfm2RMSNorm(config.hidden_size, eps=config.norm_eps)
+        self.ffn_norm = Lfm2RMSNorm(config.hidden_size, eps=config.norm_eps)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[tuple[torch.Tensor]] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
+    ) -> torch.Tensor:
+        residual = hidden_states
+        if self.is_attention_layer:
+            hidden_states, _ = self.self_attn(
+                hidden_states=self.operator_norm(hidden_states),
+                position_embeddings=position_embeddings,
+                attention_mask=attention_mask,
+                position_ids=position_ids,
+                past_key_values=past_key_values,
+                cache_position=cache_position,
+                **kwargs,
+            )
+        else:
+            hidden_states = self.conv(
+                hidden_states=self.operator_norm(hidden_states),
+                past_key_values=past_key_values,
+                cache_position=cache_position,
+                attention_mask=attention_mask,
+            )
+        hidden_states = hidden_states + residual
+        hidden_states = hidden_states + self.feed_forward(self.ffn_norm(hidden_states))
+
+        return hidden_states
+
+
+class Lfm2PreTrainedModel(LlamaPreTrainedModel):
+    # Overrides the inherited class flag; everything else comes from `LlamaPreTrainedModel`.
+    _can_compile_fullgraph = False
+
+
+class Lfm2Model(LlamaModel):
+    def __init__(self, config: Lfm2Config):
+        super().__init__(config)
+        self.pos_emb = Lfm2RotaryEmbedding(config)
+        self.embedding_norm = Lfm2RMSNorm(config.hidden_size, eps=config.norm_eps)
+        del self.norm
+        del self.rotary_emv
+
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Lfm2HybridConvCache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> BaseModelOutputWithPast:
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+
+        if use_cache and past_key_values is None:
+            batch_size = inputs_embeds.shape[0]
+            past_key_values = Lfm2HybridConvCache(
+                config=self.config, max_batch_size=batch_size, dtype=self.dtype, device=self.device
+            )
+
+        if cache_position is None:
+            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+            cache_position = torch.arange(
+                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
+            )
+
+        if position_ids is None:
+            position_ids = cache_position.unsqueeze(0)
+
+        causal_mask = create_causal_mask(
+            config=self.config,
+            input_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            cache_position=cache_position,
+            past_key_values=past_key_values,
+            position_ids=position_ids,
+        )
+
+        hidden_states = inputs_embeds
+        position_embeddings = self.pos_emb(hidden_states, position_ids)
+
+        # decoder layers
+        for decoder_layer in self.layers[: self.config.num_hidden_layers]:
+            hidden_states = decoder_layer(
+                hidden_states,
+                attention_mask=causal_mask,
+                position_ids=position_ids,
+                past_key_values=past_key_values,
+                cache_position=cache_position,
+                position_embeddings=position_embeddings,
+                **kwargs,
+            )
+
+        hidden_states = self.embedding_norm(hidden_states)
+
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values,
+        )
+
+
+class Lfm2ForCausalLM(LlamaForCausalLM):
+    # Inherits `LlamaForCausalLM` without any override.
+    pass
+
+
+# Names exported by this module.
+__all__ = ["Lfm2ForCausalLM", "Lfm2Model", "Lfm2PreTrainedModel"]